xref: /netbsd-src/sys/arch/x86/x86/pmap.c (revision b757af438b42b93f8c6571f026d8b8ef3eaf5fc9)
1 /*	$NetBSD: pmap.c,v 1.176 2012/02/25 20:03:58 cherry Exp $	*/
2 
3 /*-
4  * Copyright (c) 2008, 2010 The NetBSD Foundation, Inc.
5  * All rights reserved.
6  *
7  * This code is derived from software contributed to The NetBSD Foundation
8  * by Andrew Doran.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29  * POSSIBILITY OF SUCH DAMAGE.
30  */
31 
32 /*
33  * Copyright (c) 2007 Manuel Bouyer.
34  *
35  * Redistribution and use in source and binary forms, with or without
36  * modification, are permitted provided that the following conditions
37  * are met:
38  * 1. Redistributions of source code must retain the above copyright
39  *    notice, this list of conditions and the following disclaimer.
40  * 2. Redistributions in binary form must reproduce the above copyright
41  *    notice, this list of conditions and the following disclaimer in the
42  *    documentation and/or other materials provided with the distribution.
43  *
44  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
45  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
46  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
47  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
48  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
49  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
50  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
51  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
52  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
53  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
54  *
55  */
56 
57 /*
58  * Copyright (c) 2006 Mathieu Ropert <mro@adviseo.fr>
59  *
60  * Permission to use, copy, modify, and distribute this software for any
61  * purpose with or without fee is hereby granted, provided that the above
62  * copyright notice and this permission notice appear in all copies.
63  *
64  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
65  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
66  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
67  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
68  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
69  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
70  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
71  */
72 
73 /*
74  * Copyright (c) 1997 Charles D. Cranor and Washington University.
75  * All rights reserved.
76  *
77  * Redistribution and use in source and binary forms, with or without
78  * modification, are permitted provided that the following conditions
79  * are met:
80  * 1. Redistributions of source code must retain the above copyright
81  *    notice, this list of conditions and the following disclaimer.
82  * 2. Redistributions in binary form must reproduce the above copyright
83  *    notice, this list of conditions and the following disclaimer in the
84  *    documentation and/or other materials provided with the distribution.
85  *
86  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
87  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
88  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
89  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
90  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
91  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
92  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
93  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
94  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
95  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
96  */
97 
98 /*
99  * Copyright 2001 (c) Wasabi Systems, Inc.
100  * All rights reserved.
101  *
102  * Written by Frank van der Linden for Wasabi Systems, Inc.
103  *
104  * Redistribution and use in source and binary forms, with or without
105  * modification, are permitted provided that the following conditions
106  * are met:
107  * 1. Redistributions of source code must retain the above copyright
108  *    notice, this list of conditions and the following disclaimer.
109  * 2. Redistributions in binary form must reproduce the above copyright
110  *    notice, this list of conditions and the following disclaimer in the
111  *    documentation and/or other materials provided with the distribution.
112  * 3. All advertising materials mentioning features or use of this software
113  *    must display the following acknowledgement:
114  *      This product includes software developed for the NetBSD Project by
115  *      Wasabi Systems, Inc.
116  * 4. The name of Wasabi Systems, Inc. may not be used to endorse
117  *    or promote products derived from this software without specific prior
118  *    written permission.
119  *
120  * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
121  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
122  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
123  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL WASABI SYSTEMS, INC
124  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
125  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
126  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
127  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
128  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
129  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
130  * POSSIBILITY OF SUCH DAMAGE.
131  */
132 
133 /*
134  * This is the i386 pmap modified and generalized to support x86-64
135  * as well. The idea is to hide the upper N levels of the page tables
136  * inside pmap_get_ptp, pmap_free_ptp and pmap_growkernel. The rest
137  * is mostly untouched, except that it uses some more generalized
138  * macros and interfaces.
139  *
140  * This pmap has been tested on the i386 as well, and it can be easily
141  * adapted to PAE.
142  *
143  * fvdl@wasabisystems.com 18-Jun-2001
144  */
145 
146 /*
147  * pmap.c: i386 pmap module rewrite
148  * Chuck Cranor <chuck@netbsd>
149  * 11-Aug-97
150  *
151  * history of this pmap module: in addition to my own input, i used
152  *    the following references for this rewrite of the i386 pmap:
153  *
154  * [1] the NetBSD i386 pmap.   this pmap appears to be based on the
155  *     BSD hp300 pmap done by Mike Hibler at University of Utah.
156  *     it was then ported to the i386 by William Jolitz of UUNET
157  *     Technologies, Inc.   Then Charles M. Hannum of the NetBSD
158  *     project fixed some bugs and provided some speed ups.
159  *
160  * [2] the FreeBSD i386 pmap.   this pmap seems to be the
161  *     Hibler/Jolitz pmap, as modified for FreeBSD by John S. Dyson
162  *     and David Greenman.
163  *
164  * [3] the Mach pmap.   this pmap, from CMU, seems to have migrated
165  *     between several processors.   the VAX version was done by
166  *     Avadis Tevanian, Jr., and Michael Wayne Young.    the i386
167  *     version was done by Lance Berc, Mike Kupfer, Bob Baron,
168  *     David Golub, and Richard Draves.    the alpha version was
169  *     done by Alessandro Forin (CMU/Mach) and Chris Demetriou
170  *     (NetBSD/alpha).
171  */
172 
173 #include <sys/cdefs.h>
174 __KERNEL_RCSID(0, "$NetBSD: pmap.c,v 1.176 2012/02/25 20:03:58 cherry Exp $");
175 
176 #include "opt_user_ldt.h"
177 #include "opt_lockdebug.h"
178 #include "opt_multiprocessor.h"
179 #include "opt_xen.h"
180 #if !defined(__x86_64__)
181 #include "opt_kstack_dr0.h"
182 #endif /* !defined(__x86_64__) */
183 
184 #include <sys/param.h>
185 #include <sys/systm.h>
186 #include <sys/proc.h>
187 #include <sys/pool.h>
188 #include <sys/kernel.h>
189 #include <sys/atomic.h>
190 #include <sys/cpu.h>
191 #include <sys/intr.h>
192 #include <sys/xcall.h>
193 #include <sys/kcore.h>
194 
195 #include <uvm/uvm.h>
196 
197 #include <dev/isa/isareg.h>
198 
199 #include <machine/specialreg.h>
200 #include <machine/gdt.h>
201 #include <machine/isa_machdep.h>
202 #include <machine/cpuvar.h>
203 #include <machine/cputypes.h>
204 
205 #include <x86/pmap.h>
206 #include <x86/pmap_pv.h>
207 
208 #include <x86/i82489reg.h>
209 #include <x86/i82489var.h>
210 
211 #ifdef XEN
212 #include <xen/xen-public/xen.h>
213 #include <xen/hypervisor.h>
214 #endif
215 
216 /*
217  * general info:
218  *
219  *  - for an explanation of how the i386 MMU hardware works see
220  *    the comments in <machine/pte.h>.
221  *
222  *  - for an explanation of the general memory structure used by
223  *    this pmap (including the recursive mapping), see the comments
224  *    in <machine/pmap.h>.
225  *
226  * this file contains the code for the "pmap module."   the module's
227  * job is to manage the hardware's virtual to physical address mappings.
228  * note that there are two levels of mapping in the VM system:
229  *
230  *  [1] the upper layer of the VM system uses vm_map's and vm_map_entry's
231  *      to map ranges of virtual address space to objects/files.  for
232  *      example, the vm_map may say: "map VA 0x1000 to 0x22000 read-only
233  *      to the file /bin/ls starting at offset zero."   note that
234  *      the upper layer mapping is not concerned with how individual
235  *      vm_pages are mapped.
236  *
237  *  [2] the lower layer of the VM system (the pmap) maintains the mappings
238  *      from virtual addresses.   it is concerned with which vm_page is
239  *      mapped where.   for example, when you run /bin/ls and start
240  *      at page 0x1000 the fault routine may lookup the correct page
241  *      of the /bin/ls file and then ask the pmap layer to establish
242  *      a mapping for it.
243  *
244  * note that information in the lower layer of the VM system can be
245  * thrown away since it can easily be reconstructed from the info
246  * in the upper layer.
247  *
248  * data structures we use include:
249  *
250  *  - struct pmap: describes the address space of one thread
251  *  - struct pv_entry: describes one <PMAP,VA> mapping of a PA
252  *  - struct pv_head: there is one pv_head per managed page of
253  *	physical memory.   the pv_head points to a list of pv_entry
254  *	structures which describe all the <PMAP,VA> pairs that this
255  *      page is mapped in.    this is critical for page based operations
256  *      such as pmap_page_protect() [change protection on _all_ mappings
257  *      of a page]
258  */
259 
260 /*
261  * memory allocation
262  *
263  *  - there are three data structures that we must dynamically allocate:
264  *
265  * [A] new process' page directory page (PDP)
266  *	- plan 1: done at pmap_create() we use
267  *	  uvm_km_alloc(kernel_map, PAGE_SIZE)  [fka kmem_alloc] to do this
268  *	  allocation.
269  *
270  * if we are low in free physical memory then we sleep in
271  * uvm_km_alloc -- in this case this is ok since we are creating
272  * a new pmap and should not be holding any locks.
273  *
274  * if the kernel is totally out of virtual space
275  * (i.e. uvm_km_alloc returns NULL), then we panic.
276  *
277  * [B] new page tables pages (PTP)
278  * 	- call uvm_pagealloc()
279  * 		=> success: zero page, add to pm_pdir
280  * 		=> failure: we are out of free vm_pages, let pmap_enter()
281  *		   tell UVM about it.
282  *
283  * note: for kernel PTPs, we start with NKPTP of them.   as we map
284  * kernel memory (at uvm_map time) we check to see if we've grown
285  * the kernel pmap.   if so, we call the optional function
286  * pmap_growkernel() to grow the kernel PTPs in advance.
287  *
288  * [C] pv_entry structures
289  */
290 
291 /*
292  * locking
293  *
294  * we have the following locks that we must contend with:
295  *
296  * mutexes:
297  *
298  * - pmap lock (per pmap, part of uvm_object)
299  *   this lock protects the fields in the pmap structure including
300  *   the non-kernel PDEs in the PDP, and the PTEs.  it also locks
301  *   in the alternate PTE space (since that is determined by the
302  *   entry in the PDP).
303  *
304  * - pvh_lock (per pv_head)
305  *   this lock protects the pv_entry list which is chained off the
306  *   pv_head structure for a specific managed PA.   it is locked
307  *   when traversing the list (e.g. adding/removing mappings,
308  *   syncing R/M bits, etc.)
309  *
310  * - pmaps_lock
311  *   this lock protects the list of active pmaps (headed by "pmaps").
312  *   we lock it when adding or removing pmaps from this list.
313  */
314 
315 const vaddr_t ptp_masks[] = PTP_MASK_INITIALIZER;
316 const int ptp_shifts[] = PTP_SHIFT_INITIALIZER;
317 const long nkptpmax[] = NKPTPMAX_INITIALIZER;
318 const long nbpd[] = NBPD_INITIALIZER;
319 pd_entry_t * const normal_pdes[] = PDES_INITIALIZER;
320 
321 long nkptp[] = NKPTP_INITIALIZER;
322 
323 struct pmap_head pmaps;
324 kmutex_t pmaps_lock;
325 
326 static vaddr_t pmap_maxkvaddr;
327 
328 /*
329  * XXX kludge: dummy locking to make KASSERTs in uvm_page.c comfortable.
330  * actual locking is done by pm_lock.
331  */
332 #if defined(DIAGNOSTIC)
333 #define	PMAP_SUBOBJ_LOCK(pm, idx) \
334 	KASSERT(mutex_owned((pm)->pm_lock)); \
335 	if ((idx) != 0) \
336 		mutex_enter((pm)->pm_obj[(idx)].vmobjlock)
337 #define	PMAP_SUBOBJ_UNLOCK(pm, idx) \
338 	KASSERT(mutex_owned((pm)->pm_lock)); \
339 	if ((idx) != 0) \
340 		mutex_exit((pm)->pm_obj[(idx)].vmobjlock)
341 #else /* defined(DIAGNOSTIC) */
342 #define	PMAP_SUBOBJ_LOCK(pm, idx)	/* nothing */
343 #define	PMAP_SUBOBJ_UNLOCK(pm, idx)	/* nothing */
344 #endif /* defined(DIAGNOSTIC) */
345 
346 /*
347  * Misc. event counters.
348  */
349 struct evcnt pmap_iobmp_evcnt;
350 struct evcnt pmap_ldt_evcnt;
351 
352 /*
353  * PAT
354  */
355 #define	PATENTRY(n, type)	(type << ((n) * 8))
356 #define	PAT_UC		0x0ULL
357 #define	PAT_WC		0x1ULL
358 #define	PAT_WT		0x4ULL
359 #define	PAT_WP		0x5ULL
360 #define	PAT_WB		0x6ULL
361 #define	PAT_UCMINUS	0x7ULL
362 
363 static bool cpu_pat_enabled __read_mostly = false;
364 
365 /*
366  * global data structures
367  */
368 
369 static struct pmap kernel_pmap_store;	/* the kernel's pmap (proc0) */
370 struct pmap *const kernel_pmap_ptr = &kernel_pmap_store;
371 
372 /*
373  * pmap_pg_g: if our processor supports PG_G in the PTE then we
374  * set pmap_pg_g to PG_G (otherwise it is zero).
375  */
376 
377 int pmap_pg_g __read_mostly = 0;
378 
379 /*
380  * pmap_largepages: if our processor supports PG_PS and we are
381  * using it, this is set to true.
382  */
383 
384 int pmap_largepages __read_mostly;
385 
386 /*
387  * i386 physical memory comes in a big contig chunk with a small
388  * hole toward the front of it...  the following two paddr_t's
389  * (shared with machdep.c) describe the physical address space
390  * of this machine.
391  */
392 paddr_t avail_start __read_mostly; /* PA of first available physical page */
393 paddr_t avail_end __read_mostly; /* PA of last available physical page */
394 
395 #ifdef XEN
396 #ifdef __x86_64__
397 /* Dummy PGD for user cr3, used between pmap_deactivate() and pmap_activate() */
398 static paddr_t xen_dummy_user_pgd;
399 #endif /* __x86_64__ */
400 paddr_t pmap_pa_start; /* PA of first physical page for this domain */
401 paddr_t pmap_pa_end;   /* PA of last physical page for this domain */
402 #endif /* XEN */
403 
404 #define	VM_PAGE_TO_PP(pg)	(&(pg)->mdpage.mp_pp)
405 
406 #define	PV_HASH_SIZE		32768
407 #define	PV_HASH_LOCK_CNT	32
408 
409 struct pv_hash_lock {
410 	kmutex_t lock;
411 } __aligned(CACHE_LINE_SIZE) pv_hash_locks[PV_HASH_LOCK_CNT]
412     __aligned(CACHE_LINE_SIZE);
413 
414 struct pv_hash_head {
415 	SLIST_HEAD(, pv_entry) hh_list;
416 } pv_hash_heads[PV_HASH_SIZE];
417 
418 static u_int
419 pvhash_hash(struct vm_page *ptp, vaddr_t va)
420 {
421 
422 	return (uintptr_t)ptp / sizeof(*ptp) + (va >> PAGE_SHIFT);
423 }
424 
425 static struct pv_hash_head *
426 pvhash_head(u_int hash)
427 {
428 
429 	return &pv_hash_heads[hash % PV_HASH_SIZE];
430 }
431 
432 static kmutex_t *
433 pvhash_lock(u_int hash)
434 {
435 
436 	return &pv_hash_locks[hash % PV_HASH_LOCK_CNT].lock;
437 }
438 
439 static struct pv_entry *
440 pvhash_remove(struct pv_hash_head *hh, struct vm_page *ptp, vaddr_t va)
441 {
442 	struct pv_entry *pve;
443 	struct pv_entry *prev;
444 
445 	prev = NULL;
446 	SLIST_FOREACH(pve, &hh->hh_list, pve_hash) {
447 		if (pve->pve_pte.pte_ptp == ptp &&
448 		    pve->pve_pte.pte_va == va) {
449 			if (prev != NULL) {
450 				SLIST_REMOVE_AFTER(prev, pve_hash);
451 			} else {
452 				SLIST_REMOVE_HEAD(&hh->hh_list, pve_hash);
453 			}
454 			break;
455 		}
456 		prev = pve;
457 	}
458 	return pve;
459 }
460 
461 /*
462  * other data structures
463  */
464 
465 static pt_entry_t protection_codes[8] __read_mostly; /* maps MI prot to i386
466 							prot code */
467 static bool pmap_initialized __read_mostly = false; /* pmap_init done yet? */
468 
469 /*
470  * the following two vaddr_t's are used during system startup
471  * to keep track of how much of the kernel's VM space we have used.
472  * once the system is started, the management of the remaining kernel
473  * VM space is turned over to the kernel_map vm_map.
474  */
475 
476 static vaddr_t virtual_avail __read_mostly;	/* VA of first free KVA */
477 static vaddr_t virtual_end __read_mostly;	/* VA of last free KVA */
478 
479 /*
480  * pool that pmap structures are allocated from
481  */
482 
483 static struct pool_cache pmap_cache;
484 
485 /*
486  * pv_entry cache
487  */
488 
489 static struct pool_cache pmap_pv_cache;
490 
491 #ifdef __HAVE_DIRECT_MAP
492 
493 extern phys_ram_seg_t mem_clusters[];
494 extern int mem_cluster_cnt;
495 
496 #else
497 
498 /*
499  * MULTIPROCESSOR: special VA's/ PTE's are actually allocated inside a
500  * maxcpus*NPTECL array of PTE's, to avoid cache line thrashing
501  * due to false sharing.
502  */
503 
504 #ifdef MULTIPROCESSOR
505 #define PTESLEW(pte, id) ((pte)+(id)*NPTECL)
506 #define VASLEW(va,id) ((va)+(id)*NPTECL*PAGE_SIZE)
507 #else
508 #define PTESLEW(pte, id) (pte)
509 #define VASLEW(va,id) (va)
510 #endif
511 
512 /*
513  * special VAs and the PTEs that map them
514  */
515 static pt_entry_t *csrc_pte, *cdst_pte, *zero_pte, *ptp_pte, *early_zero_pte;
516 static char *csrcp, *cdstp, *zerop, *ptpp;
517 #ifdef XEN
518 char *early_zerop; /* also referenced from xen_pmap_bootstrap() */
519 #else
520 static char *early_zerop;
521 #endif
522 
523 #endif
524 
525 int pmap_enter_default(pmap_t, vaddr_t, paddr_t, vm_prot_t, u_int);
526 
527 /* PDP pool_cache(9) and its callbacks */
528 struct pool_cache pmap_pdp_cache;
529 static int  pmap_pdp_ctor(void *, void *, int);
530 static void pmap_pdp_dtor(void *, void *);
531 #ifdef PAE
532 /* need to allocate items of 4 pages */
533 static void *pmap_pdp_alloc(struct pool *, int);
534 static void pmap_pdp_free(struct pool *, void *);
535 static struct pool_allocator pmap_pdp_allocator = {
536 	.pa_alloc = pmap_pdp_alloc,
537 	.pa_free = pmap_pdp_free,
538 	.pa_pagesz = PAGE_SIZE * PDP_SIZE,
539 };
540 #endif /* PAE */
541 
542 extern vaddr_t idt_vaddr;			/* we allocate IDT early */
543 extern paddr_t idt_paddr;
544 
545 #ifdef _LP64
546 extern vaddr_t lo32_vaddr;
547 extern vaddr_t lo32_paddr;
548 #endif
549 
550 extern int end;
551 
552 #ifdef i386
553 /* stuff to fix the pentium f00f bug */
554 extern vaddr_t pentium_idt_vaddr;
555 #endif
556 
557 
558 /*
559  * local prototypes
560  */
561 
562 static struct vm_page	*pmap_get_ptp(struct pmap *, vaddr_t,
563 				      pd_entry_t * const *);
564 static struct vm_page	*pmap_find_ptp(struct pmap *, vaddr_t, paddr_t, int);
565 static void		 pmap_freepage(struct pmap *, struct vm_page *, int);
566 static void		 pmap_free_ptp(struct pmap *, struct vm_page *,
567 				       vaddr_t, pt_entry_t *,
568 				       pd_entry_t * const *);
569 static bool		 pmap_remove_pte(struct pmap *, struct vm_page *,
570 					 pt_entry_t *, vaddr_t,
571 					 struct pv_entry **);
572 static void		 pmap_remove_ptes(struct pmap *, struct vm_page *,
573 					  vaddr_t, vaddr_t, vaddr_t,
574 					  struct pv_entry **);
575 
576 static bool		 pmap_get_physpage(vaddr_t, int, paddr_t *);
577 static void		 pmap_alloc_level(pd_entry_t * const *, vaddr_t, int,
578 					  long *);
579 
580 static bool		 pmap_reactivate(struct pmap *);
581 
582 /*
583  * p m a p   h e l p e r   f u n c t i o n s
584  */
585 
586 static inline void
587 pmap_stats_update(struct pmap *pmap, int resid_diff, int wired_diff)
588 {
589 
590 	if (pmap == pmap_kernel()) {
591 		atomic_add_long(&pmap->pm_stats.resident_count, resid_diff);
592 		atomic_add_long(&pmap->pm_stats.wired_count, wired_diff);
593 	} else {
594 		KASSERT(mutex_owned(pmap->pm_lock));
595 		pmap->pm_stats.resident_count += resid_diff;
596 		pmap->pm_stats.wired_count += wired_diff;
597 	}
598 }
599 
600 static inline void
601 pmap_stats_update_bypte(struct pmap *pmap, pt_entry_t npte, pt_entry_t opte)
602 {
603 	int resid_diff = ((npte & PG_V) ? 1 : 0) - ((opte & PG_V) ? 1 : 0);
604 	int wired_diff = ((npte & PG_W) ? 1 : 0) - ((opte & PG_W) ? 1 : 0);
605 
606 	KASSERT((npte & (PG_V | PG_W)) != PG_W);
607 	KASSERT((opte & (PG_V | PG_W)) != PG_W);
608 
609 	pmap_stats_update(pmap, resid_diff, wired_diff);
610 }
611 
612 /*
613  * ptp_to_pmap: lookup pmap by ptp
614  */
615 
616 static struct pmap *
617 ptp_to_pmap(struct vm_page *ptp)
618 {
619 	struct pmap *pmap;
620 
621 	if (ptp == NULL) {
622 		return pmap_kernel();
623 	}
624 	pmap = (struct pmap *)ptp->uobject;
625 	KASSERT(pmap != NULL);
626 	KASSERT(&pmap->pm_obj[0] == ptp->uobject);
627 	return pmap;
628 }
629 
630 static inline struct pv_pte *
631 pve_to_pvpte(struct pv_entry *pve)
632 {
633 
634 	KASSERT((void *)&pve->pve_pte == (void *)pve);
635 	return &pve->pve_pte;
636 }
637 
638 static inline struct pv_entry *
639 pvpte_to_pve(struct pv_pte *pvpte)
640 {
641 	struct pv_entry *pve = (void *)pvpte;
642 
643 	KASSERT(pve_to_pvpte(pve) == pvpte);
644 	return pve;
645 }
646 
647 /*
648  * pv_pte_first, pv_pte_next: PV list iterator.
649  */
650 
651 static struct pv_pte *
652 pv_pte_first(struct pmap_page *pp)
653 {
654 
655 	if ((pp->pp_flags & PP_EMBEDDED) != 0) {
656 		return &pp->pp_pte;
657 	}
658 	return pve_to_pvpte(LIST_FIRST(&pp->pp_head.pvh_list));
659 }
660 
661 static struct pv_pte *
662 pv_pte_next(struct pmap_page *pp, struct pv_pte *pvpte)
663 {
664 
665 	KASSERT(pvpte != NULL);
666 	if (pvpte == &pp->pp_pte) {
667 		KASSERT((pp->pp_flags & PP_EMBEDDED) != 0);
668 		return NULL;
669 	}
670 	KASSERT((pp->pp_flags & PP_EMBEDDED) == 0);
671 	return pve_to_pvpte(LIST_NEXT(pvpte_to_pve(pvpte), pve_list));
672 }
673 
674 /*
675  * pmap_is_curpmap: is this pmap the one currently loaded [in %cr3]?
676  *		of course the kernel is always loaded
677  */
678 
679 bool
680 pmap_is_curpmap(struct pmap *pmap)
681 {
682 	return((pmap == pmap_kernel()) ||
683 	       (pmap == curcpu()->ci_pmap));
684 }
685 
686 /*
687  *	Add a reference to the specified pmap.
688  */
689 
690 void
691 pmap_reference(struct pmap *pmap)
692 {
693 
694 	atomic_inc_uint(&pmap->pm_obj[0].uo_refs);
695 }
696 
697 /*
698  * pmap_map_ptes: map a pmap's PTEs into KVM and lock them in
699  *
700  * there are several pmaps involved.  some or all of them might be same.
701  *
702  *	- the pmap given by the first argument
703  *		our caller wants to access this pmap's PTEs.
704  *
705  *	- pmap_kernel()
706  *		the kernel pmap.  note that it only contains the kernel part
707  *		of the address space which is shared by any pmap.  ie. any
708  *		pmap can be used instead of pmap_kernel() for our purpose.
709  *
710  *	- ci->ci_pmap
711  *		pmap currently loaded on the cpu.
712  *
713  *	- vm_map_pmap(&curproc->p_vmspace->vm_map)
714  *		current process' pmap.
715  *
716  * => we lock enough pmaps to keep things locked in
717  * => must be undone with pmap_unmap_ptes before returning
718  */
719 
720 void
721 pmap_map_ptes(struct pmap *pmap, struct pmap **pmap2,
722 	      pd_entry_t **ptepp, pd_entry_t * const **pdeppp)
723 {
724 	struct pmap *curpmap;
725 	struct cpu_info *ci;
726 	uint32_t cpumask;
727 	lwp_t *l;
728 
729 	/* The kernel's pmap is always accessible. */
730 	if (pmap == pmap_kernel()) {
731 		*pmap2 = NULL;
732 		*ptepp = PTE_BASE;
733 		*pdeppp = normal_pdes;
734 		return;
735 	}
736 	KASSERT(kpreempt_disabled());
737 
738 	l = curlwp;
739  retry:
740 	mutex_enter(pmap->pm_lock);
741 	ci = curcpu();
742 	curpmap = ci->ci_pmap;
743 	if (vm_map_pmap(&l->l_proc->p_vmspace->vm_map) == pmap) {
744 		/* Our own pmap so just load it: easy. */
745 		if (__predict_false(ci->ci_want_pmapload)) {
746 			mutex_exit(pmap->pm_lock);
747 			pmap_load();
748 			goto retry;
749 		}
750 		KASSERT(pmap == curpmap);
751 	} else if (pmap == curpmap) {
752 		/*
753 		 * Already on the CPU: make it valid.  This is very
754 		 * often the case during exit(), when we have switched
755 		 * to the kernel pmap in order to destroy a user pmap.
756 		 */
757 		if (!pmap_reactivate(pmap)) {
758 			u_int gen = uvm_emap_gen_return();
759 			tlbflush();
760 			uvm_emap_update(gen);
761 		}
762 	} else {
763 		/*
764 		 * Toss current pmap from CPU, but keep a reference to it.
765 		 * The reference will be dropped by pmap_unmap_ptes().
766 		 * Can happen if we block during exit().
767 		 */
768 		cpumask = ci->ci_cpumask;
769 		atomic_and_32(&curpmap->pm_cpus, ~cpumask);
770 		atomic_and_32(&curpmap->pm_kernel_cpus, ~cpumask);
771 		ci->ci_pmap = pmap;
772 		ci->ci_tlbstate = TLBSTATE_VALID;
773 		atomic_or_32(&pmap->pm_cpus, cpumask);
774 		atomic_or_32(&pmap->pm_kernel_cpus, cpumask);
775 		cpu_load_pmap(pmap, curpmap);
776 	}
777 	pmap->pm_ncsw = l->l_ncsw;
778 	*pmap2 = curpmap;
779 	*ptepp = PTE_BASE;
780 #if defined(XEN) && defined(__x86_64__)
781 	KASSERT(ci->ci_normal_pdes[PTP_LEVELS - 2] == L4_BASE);
782 	ci->ci_normal_pdes[PTP_LEVELS - 2] = pmap->pm_pdir;
783 	*pdeppp = ci->ci_normal_pdes;
784 #else /* XEN && __x86_64__ */
785 	*pdeppp = normal_pdes;
786 #endif /* XEN && __x86_64__ */
787 }
788 
789 /*
790  * pmap_unmap_ptes: unlock the PTE mapping of "pmap"
791  */
792 
793 void
794 pmap_unmap_ptes(struct pmap *pmap, struct pmap *pmap2)
795 {
796 	struct cpu_info *ci;
797 	struct pmap *mypmap;
798 
799 	KASSERT(kpreempt_disabled());
800 
801 	/* The kernel's pmap is always accessible. */
802 	if (pmap == pmap_kernel()) {
803 		return;
804 	}
805 
806 	ci = curcpu();
807 #if defined(XEN) && defined(__x86_64__)
808 	/* Reset per-cpu normal_pdes */
809 	KASSERT(ci->ci_normal_pdes[PTP_LEVELS - 2] != L4_BASE);
810 	ci->ci_normal_pdes[PTP_LEVELS - 2] = L4_BASE;
811 #endif /* XEN && __x86_64__ */
812 	/*
813 	 * We cannot tolerate context switches while mapped in.
814 	 * If it is our own pmap all we have to do is unlock.
815 	 */
816 	KASSERT(pmap->pm_ncsw == curlwp->l_ncsw);
817 	mypmap = vm_map_pmap(&curproc->p_vmspace->vm_map);
818 	if (pmap == mypmap) {
819 		mutex_exit(pmap->pm_lock);
820 		return;
821 	}
822 
823 	/*
824 	 * Mark whatever's on the CPU now as lazy and unlock.
825 	 * If the pmap was already installed, we are done.
826 	 */
827 	ci->ci_tlbstate = TLBSTATE_LAZY;
828 	ci->ci_want_pmapload = (mypmap != pmap_kernel());
829 	mutex_exit(pmap->pm_lock);
830 	if (pmap == pmap2) {
831 		return;
832 	}
833 
834 	/*
835 	 * We installed another pmap on the CPU.  Grab a reference to
836 	 * it and leave in place.  Toss the evicted pmap (can block).
837 	 */
838 	pmap_reference(pmap);
839 	pmap_destroy(pmap2);
840 }
841 
842 
843 inline static void
844 pmap_exec_account(struct pmap *pm, vaddr_t va, pt_entry_t opte, pt_entry_t npte)
845 {
846 
847 #if !defined(__x86_64__)
848 	if (curproc == NULL || curproc->p_vmspace == NULL ||
849 	    pm != vm_map_pmap(&curproc->p_vmspace->vm_map))
850 		return;
851 
852 	if ((opte ^ npte) & PG_X)
853 		pmap_update_pg(va);
854 
855 	/*
856 	 * Executability was removed on the last executable change.
857 	 * Reset the code segment to something conservative and
858 	 * let the trap handler deal with setting the right limit.
859 	 * We can't do that because of locking constraints on the vm map.
860 	 */
861 
862 	if ((opte & PG_X) && (npte & PG_X) == 0 && va == pm->pm_hiexec) {
863 		struct trapframe *tf = curlwp->l_md.md_regs;
864 
865 		tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL);
866 		pm->pm_hiexec = I386_MAX_EXE_ADDR;
867 	}
868 #endif /* !defined(__x86_64__) */
869 }
870 
871 #if !defined(__x86_64__)
872 /*
873  * Fixup the code segment to cover all potential executable mappings.
874  * returns 0 if no changes to the code segment were made.
875  */
876 
877 int
878 pmap_exec_fixup(struct vm_map *map, struct trapframe *tf, struct pcb *pcb)
879 {
880 	struct vm_map_entry *ent;
881 	struct pmap *pm = vm_map_pmap(map);
882 	vaddr_t va = 0;
883 
884 	vm_map_lock_read(map);
885 	for (ent = (&map->header)->next; ent != &map->header; ent = ent->next) {
886 
887 		/*
888 		 * This entry has greater va than the entries before.
889 		 * We need to make it point to the last page, not past it.
890 		 */
891 
892 		if (ent->protection & VM_PROT_EXECUTE)
893 			va = trunc_page(ent->end) - PAGE_SIZE;
894 	}
895 	vm_map_unlock_read(map);
896 	if (va == pm->pm_hiexec && tf->tf_cs == GSEL(GUCODEBIG_SEL, SEL_UPL))
897 		return (0);
898 
899 	pm->pm_hiexec = va;
900 	if (pm->pm_hiexec > I386_MAX_EXE_ADDR) {
901 		tf->tf_cs = GSEL(GUCODEBIG_SEL, SEL_UPL);
902 	} else {
903 		tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL);
904 		return (0);
905 	}
906 	return (1);
907 }
908 #endif /* !defined(__x86_64__) */
909 
910 void
911 pat_init(struct cpu_info *ci)
912 {
913 	uint64_t pat;
914 
915 	if (!(ci->ci_feat_val[0] & CPUID_PAT))
916 		return;
917 
918 	/* We change WT to WC. Leave all other entries the default values. */
919 	pat = PATENTRY(0, PAT_WB) | PATENTRY(1, PAT_WC) |
920 	      PATENTRY(2, PAT_UCMINUS) | PATENTRY(3, PAT_UC) |
921 	      PATENTRY(4, PAT_WB) | PATENTRY(5, PAT_WC) |
922 	      PATENTRY(6, PAT_UCMINUS) | PATENTRY(7, PAT_UC);
923 
924 	wrmsr(MSR_CR_PAT, pat);
925 	cpu_pat_enabled = true;
926 	aprint_debug_dev(ci->ci_dev, "PAT enabled\n");
927 }
928 
929 static pt_entry_t
930 pmap_pat_flags(u_int flags)
931 {
932 	u_int cacheflags = (flags & PMAP_CACHE_MASK);
933 
934 	if (!cpu_pat_enabled) {
935 		switch (cacheflags) {
936 		case PMAP_NOCACHE:
937 		case PMAP_NOCACHE_OVR:
938 			/* results in PGC_UCMINUS on cpus which have
939 			 * the cpuid PAT but PAT "disabled"
940 			 */
941 			return PG_N;
942 		default:
943 			return 0;
944 		}
945 	}
946 
947 	switch (cacheflags) {
948 	case PMAP_NOCACHE:
949 		return PGC_UC;
950 	case PMAP_WRITE_COMBINE:
951 		return PGC_WC;
952 	case PMAP_WRITE_BACK:
953 		return PGC_WB;
954 	case PMAP_NOCACHE_OVR:
955 		return PGC_UCMINUS;
956 	}
957 
958 	return 0;
959 }
960 
961 /*
962  * p m a p   k e n t e r   f u n c t i o n s
963  *
964  * functions to quickly enter/remove pages from the kernel address
965  * space.   pmap_kremove is exported to MI kernel.  we make use of
966  * the recursive PTE mappings.
967  */
968 
969 /*
970  * pmap_kenter_pa: enter a kernel mapping without R/M (pv_entry) tracking
971  *
972  * => no need to lock anything, assume va is already allocated
973  * => should be faster than normal pmap enter function
974  */
975 
976 void
977 pmap_kenter_pa(vaddr_t va, paddr_t pa, vm_prot_t prot, u_int flags)
978 {
979 	pt_entry_t *pte, opte, npte;
980 
981 	KASSERT(!(prot & ~VM_PROT_ALL));
982 
983 	if (va < VM_MIN_KERNEL_ADDRESS)
984 		pte = vtopte(va);
985 	else
986 		pte = kvtopte(va);
987 #ifdef DOM0OPS
988 	if (pa < pmap_pa_start || pa >= pmap_pa_end) {
989 #ifdef DEBUG
990 		printf_nolog("%s: pa 0x%" PRIx64 " for va 0x%" PRIx64
991 		    " outside range\n", __func__, (int64_t)pa, (int64_t)va);
992 #endif /* DEBUG */
993 		npte = pa;
994 	} else
995 #endif /* DOM0OPS */
996 		npte = pmap_pa2pte(pa);
997 	npte |= protection_codes[prot] | PG_k | PG_V | pmap_pg_g;
998 	npte |= pmap_pat_flags(flags);
999 	opte = pmap_pte_testset(pte, npte); /* zap! */
1000 #if defined(DIAGNOSTIC)
1001 	/* XXX For now... */
1002 	if (opte & PG_PS)
1003 		panic("%s: PG_PS", __func__);
1004 #endif
1005 	if ((opte & (PG_V | PG_U)) == (PG_V | PG_U)) {
1006 		/* This should not happen. */
1007 		printf_nolog("%s: mapping already present\n", __func__);
1008 		kpreempt_disable();
1009 		pmap_tlb_shootdown(pmap_kernel(), va, opte, TLBSHOOT_KENTER);
1010 		kpreempt_enable();
1011 	}
1012 }
1013 
1014 void
1015 pmap_emap_enter(vaddr_t va, paddr_t pa, vm_prot_t prot)
1016 {
1017 	pt_entry_t *pte, opte, npte;
1018 
1019 	KASSERT((prot & ~VM_PROT_ALL) == 0);
1020 	pte = (va < VM_MIN_KERNEL_ADDRESS) ? vtopte(va) : kvtopte(va);
1021 
1022 #ifdef DOM0OPS
1023 	if (pa < pmap_pa_start || pa >= pmap_pa_end) {
1024 		npte = pa;
1025 	} else
1026 #endif
1027 		npte = pmap_pa2pte(pa);
1028 
1029 	npte = pmap_pa2pte(pa);
1030 	npte |= protection_codes[prot] | PG_k | PG_V;
1031 	opte = pmap_pte_testset(pte, npte);
1032 }
1033 
1034 /*
1035  * pmap_emap_sync: perform TLB flush or pmap load, if it was deferred.
1036  */
1037 void
1038 pmap_emap_sync(bool canload)
1039 {
1040 	struct cpu_info *ci = curcpu();
1041 	struct pmap *pmap;
1042 
1043 	KASSERT(kpreempt_disabled());
1044 	if (__predict_true(ci->ci_want_pmapload && canload)) {
1045 		/*
1046 		 * XXX: Hint for pmap_reactivate(), which might suggest to
1047 		 * not perform TLB flush, if state has not changed.
1048 		 */
1049 		pmap = vm_map_pmap(&curlwp->l_proc->p_vmspace->vm_map);
1050 		if (__predict_false(pmap == ci->ci_pmap)) {
1051 			const uint32_t cpumask = ci->ci_cpumask;
1052 			atomic_and_32(&pmap->pm_cpus, ~cpumask);
1053 		}
1054 		pmap_load();
1055 		KASSERT(ci->ci_want_pmapload == 0);
1056 	} else {
1057 		tlbflush();
1058 	}
1059 
1060 }
1061 
1062 void
1063 pmap_emap_remove(vaddr_t sva, vsize_t len)
1064 {
1065 	pt_entry_t *pte, xpte;
1066 	vaddr_t va, eva = sva + len;
1067 
1068 	for (va = sva; va < eva; va += PAGE_SIZE) {
1069 		pte = (va < VM_MIN_KERNEL_ADDRESS) ? vtopte(va) : kvtopte(va);
1070 		xpte |= pmap_pte_testset(pte, 0);
1071 	}
1072 }
1073 
1074 __strict_weak_alias(pmap_kenter_ma, pmap_kenter_pa);
1075 
1076 #if defined(__x86_64__)
1077 /*
1078  * Change protection for a virtual address. Local for a CPU only, don't
1079  * care about TLB shootdowns.
1080  *
1081  * => must be called with preemption disabled
1082  */
1083 void
1084 pmap_changeprot_local(vaddr_t va, vm_prot_t prot)
1085 {
1086 	pt_entry_t *pte, opte, npte;
1087 
1088 	KASSERT(kpreempt_disabled());
1089 
1090 	if (va < VM_MIN_KERNEL_ADDRESS)
1091 		pte = vtopte(va);
1092 	else
1093 		pte = kvtopte(va);
1094 
1095 	npte = opte = *pte;
1096 
1097 	if ((prot & VM_PROT_WRITE) != 0)
1098 		npte |= PG_RW;
1099 	else
1100 		npte &= ~PG_RW;
1101 
1102 	if (opte != npte) {
1103 		pmap_pte_set(pte, npte);
1104 		pmap_pte_flush();
1105 		invlpg(va);
1106 	}
1107 }
1108 #endif /* defined(__x86_64__) */
1109 
1110 /*
1111  * pmap_kremove: remove a kernel mapping(s) without R/M (pv_entry) tracking
1112  *
1113  * => no need to lock anything
1114  * => caller must dispose of any vm_page mapped in the va range
1115  * => note: not an inline function
1116  * => we assume the va is page aligned and the len is a multiple of PAGE_SIZE
1117  * => we assume kernel only unmaps valid addresses and thus don't bother
1118  *    checking the valid bit before doing TLB flushing
1119  * => must be followed by call to pmap_update() before reuse of page
1120  */
1121 
1122 void
1123 pmap_kremove(vaddr_t sva, vsize_t len)
1124 {
1125 	pt_entry_t *pte, opte;
1126 	vaddr_t va, eva;
1127 
1128 	eva = sva + len;
1129 
1130 	kpreempt_disable();
1131 	for (va = sva; va < eva; va += PAGE_SIZE) {
1132 		if (va < VM_MIN_KERNEL_ADDRESS)
1133 			pte = vtopte(va);
1134 		else
1135 			pte = kvtopte(va);
1136 		opte = pmap_pte_testset(pte, 0); /* zap! */
1137 		if ((opte & (PG_V | PG_U)) == (PG_V | PG_U)) {
1138 			pmap_tlb_shootdown(pmap_kernel(), va, opte,
1139 			    TLBSHOOT_KREMOVE);
1140 		}
1141 		KASSERT((opte & PG_PS) == 0);
1142 		KASSERT((opte & PG_PVLIST) == 0);
1143 	}
1144 	kpreempt_enable();
1145 }
1146 
1147 /*
1148  * p m a p   i n i t   f u n c t i o n s
1149  *
1150  * pmap_bootstrap and pmap_init are called during system startup
1151  * to init the pmap module.   pmap_bootstrap() does a low level
1152  * init just to get things rolling.   pmap_init() finishes the job.
1153  */
1154 
1155 /*
1156  * pmap_bootstrap: get the system in a state where it can run with VM
1157  *	properly enabled (called before main()).   the VM system is
1158  *      fully init'd later...
1159  *
1160  * => on i386, locore.s has already enabled the MMU by allocating
1161  *	a PDP for the kernel, and nkpde PTP's for the kernel.
1162  * => kva_start is the first free virtual address in kernel space
1163  */
1164 
1165 void
1166 pmap_bootstrap(vaddr_t kva_start)
1167 {
1168 	struct pmap *kpm;
1169 	pt_entry_t *pte;
1170 	int i;
1171 	vaddr_t kva;
1172 #ifndef XEN
1173 	pd_entry_t *pde;
1174 	unsigned long p1i;
1175 	vaddr_t kva_end;
1176 #endif
1177 #ifdef __HAVE_DIRECT_MAP
1178 	phys_ram_seg_t *mc;
1179 	long ndmpdp;
1180 	paddr_t lastpa, dmpd, dmpdp, pdp;
1181 	vaddr_t tmpva;
1182 #endif
1183 
1184 	pt_entry_t pg_nx = (cpu_feature[2] & CPUID_NOX ? PG_NX : 0);
1185 
1186 	/*
1187 	 * set up our local static global vars that keep track of the
1188 	 * usage of KVM before kernel_map is set up
1189 	 */
1190 
1191 	virtual_avail = kva_start;		/* first free KVA */
1192 	virtual_end = VM_MAX_KERNEL_ADDRESS;	/* last KVA */
1193 
1194 	/*
1195 	 * set up protection_codes: we need to be able to convert from
1196 	 * a MI protection code (some combo of VM_PROT...) to something
1197 	 * we can jam into a i386 PTE.
1198 	 */
1199 
1200 	protection_codes[VM_PROT_NONE] = pg_nx;			/* --- */
1201 	protection_codes[VM_PROT_EXECUTE] = PG_RO | PG_X;	/* --x */
1202 	protection_codes[VM_PROT_READ] = PG_RO | pg_nx;		/* -r- */
1203 	protection_codes[VM_PROT_READ|VM_PROT_EXECUTE] = PG_RO | PG_X;/* -rx */
1204 	protection_codes[VM_PROT_WRITE] = PG_RW | pg_nx;	/* w-- */
1205 	protection_codes[VM_PROT_WRITE|VM_PROT_EXECUTE] = PG_RW | PG_X;/* w-x */
1206 	protection_codes[VM_PROT_WRITE|VM_PROT_READ] = PG_RW | pg_nx;
1207 								/* wr- */
1208 	protection_codes[VM_PROT_ALL] = PG_RW | PG_X;		/* wrx */
1209 
1210 	/*
1211 	 * now we init the kernel's pmap
1212 	 *
1213 	 * the kernel pmap's pm_obj is not used for much.   however, in
1214 	 * user pmaps the pm_obj contains the list of active PTPs.
1215 	 * the pm_obj currently does not have a pager.   it might be possible
1216 	 * to add a pager that would allow a process to read-only mmap its
1217 	 * own page tables (fast user level vtophys?).   this may or may not
1218 	 * be useful.
1219 	 */
1220 
1221 	kpm = pmap_kernel();
1222 	for (i = 0; i < PTP_LEVELS - 1; i++) {
1223 		mutex_init(&kpm->pm_obj_lock[i], MUTEX_DEFAULT, IPL_NONE);
1224 		uvm_obj_init(&kpm->pm_obj[i], NULL, false, 1);
1225 		uvm_obj_setlock(&kpm->pm_obj[i], &kpm->pm_obj_lock[i]);
1226 		kpm->pm_ptphint[i] = NULL;
1227 	}
1228 	memset(&kpm->pm_list, 0, sizeof(kpm->pm_list));  /* pm_list not used */
1229 
1230 	kpm->pm_pdir = (pd_entry_t *)(PDPpaddr + KERNBASE);
1231 	for (i = 0; i < PDP_SIZE; i++)
1232 		kpm->pm_pdirpa[i] = PDPpaddr + PAGE_SIZE * i;
1233 
1234 	kpm->pm_stats.wired_count = kpm->pm_stats.resident_count =
1235 		x86_btop(kva_start - VM_MIN_KERNEL_ADDRESS);
1236 
1237 	/*
1238 	 * the above is just a rough estimate and not critical to the proper
1239 	 * operation of the system.
1240 	 */
1241 
1242 #ifndef XEN
1243 	/*
1244 	 * Begin to enable global TLB entries if they are supported.
1245 	 * The G bit has no effect until the CR4_PGE bit is set in CR4,
1246 	 * which happens in cpu_init(), which is run on each cpu
1247 	 * (and happens later)
1248 	 */
1249 
1250 	if (cpu_feature[0] & CPUID_PGE) {
1251 		pmap_pg_g = PG_G;		/* enable software */
1252 
1253 		/* add PG_G attribute to already mapped kernel pages */
1254 		if (KERNBASE == VM_MIN_KERNEL_ADDRESS) {
1255 			kva_end = virtual_avail;
1256 		} else {
1257 			extern vaddr_t eblob, esym;
1258 			kva_end = (vaddr_t)&end;
1259 			if (esym > kva_end)
1260 				kva_end = esym;
1261 			if (eblob > kva_end)
1262 				kva_end = eblob;
1263 			kva_end = roundup(kva_end, PAGE_SIZE);
1264 		}
1265 		for (kva = KERNBASE; kva < kva_end; kva += PAGE_SIZE) {
1266 			p1i = pl1_i(kva);
1267 			if (pmap_valid_entry(PTE_BASE[p1i]))
1268 				PTE_BASE[p1i] |= PG_G;
1269 		}
1270 	}
1271 
1272 	/*
1273 	 * enable large pages if they are supported.
1274 	 */
1275 
1276 	if (cpu_feature[0] & CPUID_PSE) {
1277 		paddr_t pa;
1278 		extern char __data_start;
1279 
1280 		lcr4(rcr4() | CR4_PSE);	/* enable hardware (via %cr4) */
1281 		pmap_largepages = 1;	/* enable software */
1282 
1283 		/*
1284 		 * the TLB must be flushed after enabling large pages
1285 		 * on Pentium CPUs, according to section 3.6.2.2 of
1286 		 * "Intel Architecture Software Developer's Manual,
1287 		 * Volume 3: System Programming".
1288 		 */
1289 		tlbflushg();
1290 
1291 		/*
1292 		 * now, remap the kernel text using large pages.  we
1293 		 * assume that the linker has properly aligned the
1294 		 * .data segment to a NBPD_L2 boundary.
1295 		 */
1296 		kva_end = rounddown((vaddr_t)&__data_start, NBPD_L1);
1297 		for (pa = 0, kva = KERNBASE; kva + NBPD_L2 <= kva_end;
1298 		     kva += NBPD_L2, pa += NBPD_L2) {
1299 			pde = &L2_BASE[pl2_i(kva)];
1300 			*pde = pa | pmap_pg_g | PG_PS |
1301 			    PG_KR | PG_V;	/* zap! */
1302 			tlbflushg();
1303 		}
1304 #if defined(DEBUG)
1305 		aprint_normal("kernel text is mapped with %" PRIuPSIZE " large "
1306 		    "pages and %" PRIuPSIZE " normal pages\n",
1307 		    howmany(kva - KERNBASE, NBPD_L2),
1308 		    howmany((vaddr_t)&__data_start - kva, NBPD_L1));
1309 #endif /* defined(DEBUG) */
1310 	}
1311 #endif /* !XEN */
1312 
1313 #ifdef __HAVE_DIRECT_MAP
1314 
1315 	tmpva = (KERNBASE + NKL2_KIMG_ENTRIES * NBPD_L2);
1316 	pte = PTE_BASE + pl1_i(tmpva);
1317 
1318 	/*
1319 	 * Map the direct map.  Use 1GB pages if they are available,
1320 	 * otherwise use 2MB pages.  Note that the unused parts of
1321 	 * PTPs * must be zero outed, as they might be accessed due
1322 	 * to speculative execution.  Also, PG_G is not allowed on
1323 	 * non-leaf PTPs.
1324 	 */
1325 
1326 	lastpa = 0;
1327 	for (i = 0; i < mem_cluster_cnt; i++) {
1328 		mc = &mem_clusters[i];
1329 		lastpa = MAX(lastpa, mc->start + mc->size);
1330 	}
1331 
1332 	ndmpdp = (lastpa + NBPD_L3 - 1) >> L3_SHIFT;
1333 	dmpdp = avail_start;	avail_start += PAGE_SIZE;
1334 
1335 	*pte = dmpdp | PG_V | PG_RW;
1336 	pmap_update_pg(tmpva);
1337 	memset((void *)tmpva, 0, PAGE_SIZE);
1338 
1339 	if (cpu_feature[2] & CPUID_P1GB) {
1340 		for (i = 0; i < ndmpdp; i++) {
1341 			pdp = (paddr_t)&(((pd_entry_t *)dmpdp)[i]);
1342 			*pte = (pdp & PG_FRAME) | PG_V | PG_RW;
1343 			pmap_update_pg(tmpva);
1344 
1345 			pde = (pd_entry_t *)(tmpva + (pdp & ~PG_FRAME));
1346 			*pde = ((paddr_t)i << L3_SHIFT) |
1347 				PG_RW | PG_V | PG_U | PG_PS | PG_G;
1348 		}
1349 	} else {
1350 		dmpd = avail_start;	avail_start += ndmpdp * PAGE_SIZE;
1351 
1352 		for (i = 0; i < ndmpdp; i++) {
1353 			pdp = dmpd + i * PAGE_SIZE;
1354 			*pte = (pdp & PG_FRAME) | PG_V | PG_RW;
1355 			pmap_update_pg(tmpva);
1356 
1357 			memset((void *)tmpva, 0, PAGE_SIZE);
1358 		}
1359 		for (i = 0; i < NPDPG * ndmpdp; i++) {
1360 			pdp = (paddr_t)&(((pd_entry_t *)dmpd)[i]);
1361 			*pte = (pdp & PG_FRAME) | PG_V | PG_RW;
1362 			pmap_update_pg(tmpva);
1363 
1364 			pde = (pd_entry_t *)(tmpva + (pdp & ~PG_FRAME));
1365 			*pde = ((paddr_t)i << L2_SHIFT) |
1366 				PG_RW | PG_V | PG_U | PG_PS | PG_G;
1367 		}
1368 		for (i = 0; i < ndmpdp; i++) {
1369 			pdp = (paddr_t)&(((pd_entry_t *)dmpdp)[i]);
1370 			*pte = (pdp & PG_FRAME) | PG_V | PG_RW;
1371 			pmap_update_pg((vaddr_t)tmpva);
1372 
1373 			pde = (pd_entry_t *)(tmpva + (pdp & ~PG_FRAME));
1374 			*pde = (dmpd + (i << PAGE_SHIFT)) |
1375 				PG_RW | PG_V | PG_U;
1376 		}
1377 	}
1378 
1379 	kpm->pm_pdir[PDIR_SLOT_DIRECT] = dmpdp | PG_KW | PG_V | PG_U;
1380 
1381 	tlbflush();
1382 
1383 #else
1384 	if (VM_MIN_KERNEL_ADDRESS != KERNBASE) {
1385 		/*
1386 		 * zero_pte is stuck at the end of mapped space for the kernel
1387 		 * image (disjunct from kva space). This is done so that it
1388 		 * can safely be used in pmap_growkernel (pmap_get_physpage),
1389 		 * when it's called for the first time.
1390 		 * XXXfvdl fix this for MULTIPROCESSOR later.
1391 		 */
1392 #ifdef XEN
1393 		/* early_zerop initialized in xen_pmap_bootstrap() */
1394 #else
1395 		early_zerop = (void *)(KERNBASE + NKL2_KIMG_ENTRIES * NBPD_L2);
1396 #endif
1397 		early_zero_pte = PTE_BASE + pl1_i((vaddr_t)early_zerop);
1398 	}
1399 
1400 	/*
1401 	 * now we allocate the "special" VAs which are used for tmp mappings
1402 	 * by the pmap (and other modules).    we allocate the VAs by advancing
1403 	 * virtual_avail (note that there are no pages mapped at these VAs).
1404 	 * we find the PTE that maps the allocated VA via the linear PTE
1405 	 * mapping.
1406 	 */
1407 
1408 	pte = PTE_BASE + pl1_i(virtual_avail);
1409 
1410 #ifdef MULTIPROCESSOR
1411 	/*
1412 	 * Waste some VA space to avoid false sharing of cache lines
1413 	 * for page table pages: Give each possible CPU a cache line
1414 	 * of PTE's (8) to play with, though we only need 4.  We could
1415 	 * recycle some of this waste by putting the idle stacks here
1416 	 * as well; we could waste less space if we knew the largest
1417 	 * CPU ID beforehand.
1418 	 */
1419 	csrcp = (char *) virtual_avail;  csrc_pte = pte;
1420 
1421 	cdstp = (char *) virtual_avail+PAGE_SIZE;  cdst_pte = pte+1;
1422 
1423 	zerop = (char *) virtual_avail+PAGE_SIZE*2;  zero_pte = pte+2;
1424 
1425 	ptpp = (char *) virtual_avail+PAGE_SIZE*3;  ptp_pte = pte+3;
1426 
1427 	virtual_avail += PAGE_SIZE * maxcpus * NPTECL;
1428 	pte += maxcpus * NPTECL;
1429 #else
1430 	csrcp = (void *) virtual_avail;  csrc_pte = pte;	/* allocate */
1431 	virtual_avail += PAGE_SIZE; pte++;			/* advance */
1432 
1433 	cdstp = (void *) virtual_avail;  cdst_pte = pte;
1434 	virtual_avail += PAGE_SIZE; pte++;
1435 
1436 	zerop = (void *) virtual_avail;  zero_pte = pte;
1437 	virtual_avail += PAGE_SIZE; pte++;
1438 
1439 	ptpp = (void *) virtual_avail;  ptp_pte = pte;
1440 	virtual_avail += PAGE_SIZE; pte++;
1441 #endif
1442 
1443 	if (VM_MIN_KERNEL_ADDRESS == KERNBASE) {
1444 		early_zerop = zerop;
1445 		early_zero_pte = zero_pte;
1446 	}
1447 #endif
1448 
1449 	/*
1450 	 * Nothing after this point actually needs pte.
1451 	 */
1452 	pte = (void *)0xdeadbeef;
1453 
1454 #ifdef XEN
1455 #ifdef __x86_64__
1456 	/*
1457 	 * We want a dummy page directory for Xen:
1458 	 * when deactivate a pmap, Xen will still consider it active.
1459 	 * So we set user PGD to this one to lift all protection on
1460 	 * the now inactive page tables set.
1461 	 */
1462 	xen_dummy_user_pgd = avail_start;
1463 	avail_start += PAGE_SIZE;
1464 
1465 	/* Zero fill it, the less checks in Xen it requires the better */
1466 	memset((void *) (xen_dummy_user_pgd + KERNBASE), 0, PAGE_SIZE);
1467 	/* Mark read-only */
1468 	HYPERVISOR_update_va_mapping(xen_dummy_user_pgd + KERNBASE,
1469 	    pmap_pa2pte(xen_dummy_user_pgd) | PG_u | PG_V, UVMF_INVLPG);
1470 	/* Pin as L4 */
1471 	xpq_queue_pin_l4_table(xpmap_ptom_masked(xen_dummy_user_pgd));
1472 #endif /* __x86_64__ */
1473 	idt_vaddr = virtual_avail;                      /* don't need pte */
1474 	idt_paddr = avail_start;                        /* steal a page */
1475 	/*
1476 	 * Xen require one more page as we can't store
1477 	 * GDT and LDT on the same page
1478 	 */
1479 	virtual_avail += 3 * PAGE_SIZE;
1480 	avail_start += 3 * PAGE_SIZE;
1481 #else /* XEN */
1482 	idt_vaddr = virtual_avail;			/* don't need pte */
1483 	idt_paddr = avail_start;			/* steal a page */
1484 #if defined(__x86_64__)
1485 	virtual_avail += 2 * PAGE_SIZE;
1486 	avail_start += 2 * PAGE_SIZE;
1487 #else /* defined(__x86_64__) */
1488 	virtual_avail += PAGE_SIZE;
1489 	avail_start += PAGE_SIZE;
1490 	/* pentium f00f bug stuff */
1491 	pentium_idt_vaddr = virtual_avail;		/* don't need pte */
1492 	virtual_avail += PAGE_SIZE;
1493 #endif /* defined(__x86_64__) */
1494 #endif /* XEN */
1495 
1496 #ifdef _LP64
1497 	/*
1498 	 * Grab a page below 4G for things that need it (i.e.
1499 	 * having an initial %cr3 for the MP trampoline).
1500 	 */
1501 	lo32_vaddr = virtual_avail;
1502 	virtual_avail += PAGE_SIZE;
1503 	lo32_paddr = avail_start;
1504 	avail_start += PAGE_SIZE;
1505 #endif
1506 
1507 	/*
1508 	 * now we reserve some VM for mapping pages when doing a crash dump
1509 	 */
1510 
1511 	virtual_avail = reserve_dumppages(virtual_avail);
1512 
1513 	/*
1514 	 * init the static-global locks and global lists.
1515 	 *
1516 	 * => pventry::pvh_lock (initialized elsewhere) must also be
1517 	 *      a spin lock, again at IPL_VM to prevent deadlock, and
1518 	 *	again is never taken from interrupt context.
1519 	 */
1520 
1521 	mutex_init(&pmaps_lock, MUTEX_DEFAULT, IPL_NONE);
1522 	LIST_INIT(&pmaps);
1523 
1524 	/*
1525 	 * ensure the TLB is sync'd with reality by flushing it...
1526 	 */
1527 
1528 	tlbflushg();
1529 
1530 	/*
1531 	 * calculate pmap_maxkvaddr from nkptp[].
1532 	 */
1533 
1534 	kva = VM_MIN_KERNEL_ADDRESS;
1535 	for (i = PTP_LEVELS - 1; i >= 1; i--) {
1536 		kva += nkptp[i] * nbpd[i];
1537 	}
1538 	pmap_maxkvaddr = kva;
1539 }
1540 
1541 #if defined(__x86_64__)
1542 /*
1543  * Pre-allocate PTPs for low memory, so that 1:1 mappings for various
1544  * trampoline code can be entered.
1545  */
1546 void
1547 pmap_prealloc_lowmem_ptps(void)
1548 {
1549 	int level;
1550 	paddr_t newp;
1551 	pd_entry_t *pdes;
1552 
1553 	const pd_entry_t pteflags = PG_k | PG_V | PG_RW;
1554 
1555 	pdes = pmap_kernel()->pm_pdir;
1556 	level = PTP_LEVELS;
1557 	for (;;) {
1558 		newp = avail_start;
1559 		avail_start += PAGE_SIZE;
1560 #ifdef __HAVE_DIRECT_MAP
1561 		memset((void *)PMAP_DIRECT_MAP(newp), 0, PAGE_SIZE);
1562 #else
1563 		pmap_pte_set(early_zero_pte, pmap_pa2pte(newp) | pteflags);
1564 		pmap_pte_flush();
1565 		pmap_update_pg((vaddr_t)early_zerop);
1566 		memset(early_zerop, 0, PAGE_SIZE);
1567 #endif
1568 
1569 #ifdef XEN
1570 		/* Mark R/O before installing */
1571 		HYPERVISOR_update_va_mapping ((vaddr_t)early_zerop,
1572 		    xpmap_ptom_masked(newp) | PG_u | PG_V, UVMF_INVLPG);
1573 		if (newp < (NKL2_KIMG_ENTRIES * NBPD_L2))
1574 			HYPERVISOR_update_va_mapping (newp + KERNBASE,
1575 			    xpmap_ptom_masked(newp) | PG_u | PG_V, UVMF_INVLPG);
1576 
1577 
1578 		if (level == PTP_LEVELS) { /* Top level pde is per-cpu */
1579 			pd_entry_t *kpm_pdir;
1580 			/* Reach it via recursive mapping */
1581 			kpm_pdir = normal_pdes[PTP_LEVELS - 2];
1582 
1583 			/* Set it as usual. We can't defer this
1584 			 * outside the loop since recursive
1585 			 * pte entries won't be accessible during
1586 			 * further iterations at lower levels
1587 			 * otherwise.
1588 			 */
1589 			pmap_pte_set(&kpm_pdir[pl_i(0, PTP_LEVELS)],
1590 			    pmap_pa2pte(newp) | pteflags);
1591 		}
1592 
1593 #endif /* XEN */
1594 		pmap_pte_set(&pdes[pl_i(0, level)],
1595 		    pmap_pa2pte(newp) | pteflags);
1596 
1597 		pmap_pte_flush();
1598 
1599 		level--;
1600 		if (level <= 1)
1601 			break;
1602 		pdes = normal_pdes[level - 2];
1603 	}
1604 }
1605 #endif /* defined(__x86_64__) */
1606 
1607 /*
1608  * pmap_init: called from uvm_init, our job is to get the pmap
1609  * system ready to manage mappings...
1610  */
1611 
1612 void
1613 pmap_init(void)
1614 {
1615 	int i, flags;
1616 
1617 	for (i = 0; i < PV_HASH_SIZE; i++) {
1618 		SLIST_INIT(&pv_hash_heads[i].hh_list);
1619 	}
1620 	for (i = 0; i < PV_HASH_LOCK_CNT; i++) {
1621 		mutex_init(&pv_hash_locks[i].lock, MUTEX_NODEBUG, IPL_VM);
1622 	}
1623 
1624 	/*
1625 	 * initialize caches.
1626 	 */
1627 
1628 	pool_cache_bootstrap(&pmap_cache, sizeof(struct pmap), 0, 0, 0,
1629 	    "pmappl", NULL, IPL_NONE, NULL, NULL, NULL);
1630 
1631 #ifdef XEN
1632 	/*
1633 	 * pool_cache(9) should not touch cached objects, since they
1634 	 * are pinned on xen and R/O for the domU
1635 	 */
1636 	flags = PR_NOTOUCH;
1637 #else /* XEN */
1638 	flags = 0;
1639 #endif /* XEN */
1640 #ifdef PAE
1641 	pool_cache_bootstrap(&pmap_pdp_cache, PAGE_SIZE * PDP_SIZE, 0, 0, flags,
1642 	    "pdppl", &pmap_pdp_allocator, IPL_NONE,
1643 	    pmap_pdp_ctor, pmap_pdp_dtor, NULL);
1644 #else /* PAE */
1645 	pool_cache_bootstrap(&pmap_pdp_cache, PAGE_SIZE, 0, 0, flags,
1646 	    "pdppl", NULL, IPL_NONE, pmap_pdp_ctor, pmap_pdp_dtor, NULL);
1647 #endif /* PAE */
1648 	pool_cache_bootstrap(&pmap_pv_cache, sizeof(struct pv_entry), 0, 0,
1649 	    PR_LARGECACHE, "pvpl", &pool_allocator_kmem, IPL_NONE, NULL,
1650 	    NULL, NULL);
1651 
1652 	pmap_tlb_init();
1653 
1654 	evcnt_attach_dynamic(&pmap_iobmp_evcnt, EVCNT_TYPE_MISC,
1655 	    NULL, "x86", "io bitmap copy");
1656 	evcnt_attach_dynamic(&pmap_ldt_evcnt, EVCNT_TYPE_MISC,
1657 	    NULL, "x86", "ldt sync");
1658 
1659 	/*
1660 	 * done: pmap module is up (and ready for business)
1661 	 */
1662 
1663 	pmap_initialized = true;
1664 }
1665 
1666 /*
1667  * pmap_cpu_init_late: perform late per-CPU initialization.
1668  */
1669 
1670 #ifndef XEN
1671 void
1672 pmap_cpu_init_late(struct cpu_info *ci)
1673 {
1674 	/*
1675 	 * The BP has already its own PD page allocated during early
1676 	 * MD startup.
1677 	 */
1678 	if (ci == &cpu_info_primary)
1679 		return;
1680 
1681 #ifdef PAE
1682 	cpu_alloc_l3_page(ci);
1683 #endif
1684 }
1685 #endif
1686 
1687 /*
1688  * p v _ e n t r y   f u n c t i o n s
1689  */
1690 
1691 /*
1692  * pmap_free_pvs: free a list of pv_entrys
1693  */
1694 
1695 static void
1696 pmap_free_pvs(struct pv_entry *pve)
1697 {
1698 	struct pv_entry *next;
1699 
1700 	for ( /* null */ ; pve != NULL ; pve = next) {
1701 		next = pve->pve_next;
1702 		pool_cache_put(&pmap_pv_cache, pve);
1703 	}
1704 }
1705 
1706 /*
1707  * main pv_entry manipulation functions:
1708  *   pmap_enter_pv: enter a mapping onto a pv_head list
1709  *   pmap_remove_pv: remove a mapping from a pv_head list
1710  *
1711  * NOTE: Both pmap_enter_pv and pmap_remove_pv expect the caller to lock
1712  *       the pvh before calling
1713  */
1714 
1715 /*
1716  * insert_pv: a helper of pmap_enter_pv
1717  */
1718 
1719 static void
1720 insert_pv(struct pmap_page *pp, struct pv_entry *pve)
1721 {
1722 	struct pv_hash_head *hh;
1723 	kmutex_t *lock;
1724 	u_int hash;
1725 
1726 	hash = pvhash_hash(pve->pve_pte.pte_ptp, pve->pve_pte.pte_va);
1727 	lock = pvhash_lock(hash);
1728 	hh = pvhash_head(hash);
1729 	mutex_spin_enter(lock);
1730 	SLIST_INSERT_HEAD(&hh->hh_list, pve, pve_hash);
1731 	mutex_spin_exit(lock);
1732 
1733 	LIST_INSERT_HEAD(&pp->pp_head.pvh_list, pve, pve_list);
1734 }
1735 
1736 /*
1737  * pmap_enter_pv: enter a mapping onto a pv_head lst
1738  *
1739  * => caller should adjust ptp's wire_count before calling
1740  */
1741 
1742 static struct pv_entry *
1743 pmap_enter_pv(struct pmap_page *pp,
1744 	      struct pv_entry *pve,	/* preallocated pve for us to use */
1745 	      struct pv_entry **sparepve,
1746 	      struct vm_page *ptp,
1747 	      vaddr_t va)
1748 {
1749 
1750 	KASSERT(ptp == NULL || ptp->wire_count >= 2);
1751 	KASSERT(ptp == NULL || ptp->uobject != NULL);
1752 	KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset);
1753 
1754 	if ((pp->pp_flags & PP_EMBEDDED) == 0) {
1755 		if (LIST_EMPTY(&pp->pp_head.pvh_list)) {
1756 			pp->pp_flags |= PP_EMBEDDED;
1757 			pp->pp_pte.pte_ptp = ptp;
1758 			pp->pp_pte.pte_va = va;
1759 
1760 			return pve;
1761 		}
1762 	} else {
1763 		struct pv_entry *pve2;
1764 
1765 		pve2 = *sparepve;
1766 		*sparepve = NULL;
1767 
1768 		pve2->pve_pte = pp->pp_pte;
1769 		pp->pp_flags &= ~PP_EMBEDDED;
1770 		LIST_INIT(&pp->pp_head.pvh_list);
1771 		insert_pv(pp, pve2);
1772 	}
1773 
1774 	pve->pve_pte.pte_ptp = ptp;
1775 	pve->pve_pte.pte_va = va;
1776 	insert_pv(pp, pve);
1777 
1778 	return NULL;
1779 }
1780 
1781 /*
1782  * pmap_remove_pv: try to remove a mapping from a pv_list
1783  *
1784  * => caller should adjust ptp's wire_count and free PTP if needed
1785  * => we return the removed pve
1786  */
1787 
1788 static struct pv_entry *
1789 pmap_remove_pv(struct pmap_page *pp, struct vm_page *ptp, vaddr_t va)
1790 {
1791 	struct pv_hash_head *hh;
1792 	struct pv_entry *pve;
1793 	kmutex_t *lock;
1794 	u_int hash;
1795 
1796 	KASSERT(ptp == NULL || ptp->uobject != NULL);
1797 	KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset);
1798 
1799 	if ((pp->pp_flags & PP_EMBEDDED) != 0) {
1800 		KASSERT(pp->pp_pte.pte_ptp == ptp);
1801 		KASSERT(pp->pp_pte.pte_va == va);
1802 
1803 		pp->pp_flags &= ~PP_EMBEDDED;
1804 		LIST_INIT(&pp->pp_head.pvh_list);
1805 
1806 		return NULL;
1807 	}
1808 
1809 	hash = pvhash_hash(ptp, va);
1810 	lock = pvhash_lock(hash);
1811 	hh = pvhash_head(hash);
1812 	mutex_spin_enter(lock);
1813 	pve = pvhash_remove(hh, ptp, va);
1814 	mutex_spin_exit(lock);
1815 
1816 	LIST_REMOVE(pve, pve_list);
1817 
1818 	return pve;
1819 }
1820 
1821 /*
1822  * p t p   f u n c t i o n s
1823  */
1824 
1825 static inline struct vm_page *
1826 pmap_find_ptp(struct pmap *pmap, vaddr_t va, paddr_t pa, int level)
1827 {
1828 	int lidx = level - 1;
1829 	struct vm_page *pg;
1830 
1831 	KASSERT(mutex_owned(pmap->pm_lock));
1832 
1833 	if (pa != (paddr_t)-1 && pmap->pm_ptphint[lidx] &&
1834 	    pa == VM_PAGE_TO_PHYS(pmap->pm_ptphint[lidx])) {
1835 		return (pmap->pm_ptphint[lidx]);
1836 	}
1837 	PMAP_SUBOBJ_LOCK(pmap, lidx);
1838 	pg = uvm_pagelookup(&pmap->pm_obj[lidx], ptp_va2o(va, level));
1839 	PMAP_SUBOBJ_UNLOCK(pmap, lidx);
1840 
1841 	KASSERT(pg == NULL || pg->wire_count >= 1);
1842 	return pg;
1843 }
1844 
1845 static inline void
1846 pmap_freepage(struct pmap *pmap, struct vm_page *ptp, int level)
1847 {
1848 	lwp_t *l;
1849 	int lidx;
1850 	struct uvm_object *obj;
1851 
1852 	KASSERT(ptp->wire_count == 1);
1853 
1854 	lidx = level - 1;
1855 
1856 	obj = &pmap->pm_obj[lidx];
1857 	pmap_stats_update(pmap, -1, 0);
1858 	if (lidx != 0)
1859 		mutex_enter(obj->vmobjlock);
1860 	if (pmap->pm_ptphint[lidx] == ptp)
1861 		pmap->pm_ptphint[lidx] = TAILQ_FIRST(&obj->memq);
1862 	ptp->wire_count = 0;
1863 	uvm_pagerealloc(ptp, NULL, 0);
1864 	l = curlwp;
1865 	KASSERT((l->l_pflag & LP_INTR) == 0);
1866 	VM_PAGE_TO_PP(ptp)->pp_link = l->l_md.md_gc_ptp;
1867 	l->l_md.md_gc_ptp = ptp;
1868 	if (lidx != 0)
1869 		mutex_exit(obj->vmobjlock);
1870 }
1871 
1872 static void
1873 pmap_free_ptp(struct pmap *pmap, struct vm_page *ptp, vaddr_t va,
1874 	      pt_entry_t *ptes, pd_entry_t * const *pdes)
1875 {
1876 	unsigned long index;
1877 	int level;
1878 	vaddr_t invaladdr;
1879 	pd_entry_t opde;
1880 
1881 	KASSERT(pmap != pmap_kernel());
1882 	KASSERT(mutex_owned(pmap->pm_lock));
1883 	KASSERT(kpreempt_disabled());
1884 
1885 	level = 1;
1886 	do {
1887 		index = pl_i(va, level + 1);
1888 		opde = pmap_pte_testset(&pdes[level - 1][index], 0);
1889 #if defined(XEN)
1890 #  if defined(__x86_64__)
1891 		/*
1892 		 * If ptp is a L3 currently mapped in kernel space,
1893 		 * on any cpu, clear it before freeing
1894 		 */
1895 		if (level == PTP_LEVELS - 1) {
1896 			/*
1897 			 * Update the per-cpu PD on all cpus the current
1898 			 * pmap is active on
1899 			 */
1900 			xen_kpm_sync(pmap, index);
1901 
1902 		}
1903 #  endif /*__x86_64__ */
1904 		invaladdr = level == 1 ? (vaddr_t)ptes :
1905 		    (vaddr_t)pdes[level - 2];
1906 		pmap_tlb_shootdown(pmap, invaladdr + index * PAGE_SIZE,
1907 		    opde, TLBSHOOT_FREE_PTP1);
1908 		pmap_tlb_shootnow();
1909 #else	/* XEN */
1910 		invaladdr = level == 1 ? (vaddr_t)ptes :
1911 		    (vaddr_t)pdes[level - 2];
1912 		pmap_tlb_shootdown(pmap, invaladdr + index * PAGE_SIZE,
1913 		    opde, TLBSHOOT_FREE_PTP1);
1914 #endif	/* XEN */
1915 		pmap_freepage(pmap, ptp, level);
1916 		if (level < PTP_LEVELS - 1) {
1917 			ptp = pmap_find_ptp(pmap, va, (paddr_t)-1, level + 1);
1918 			ptp->wire_count--;
1919 			if (ptp->wire_count > 1)
1920 				break;
1921 		}
1922 	} while (++level < PTP_LEVELS);
1923 	pmap_pte_flush();
1924 }
1925 
1926 /*
1927  * pmap_get_ptp: get a PTP (if there isn't one, allocate a new one)
1928  *
1929  * => pmap should NOT be pmap_kernel()
1930  * => pmap should be locked
1931  * => preemption should be disabled
1932  */
1933 
1934 static struct vm_page *
1935 pmap_get_ptp(struct pmap *pmap, vaddr_t va, pd_entry_t * const *pdes)
1936 {
1937 	struct vm_page *ptp, *pptp;
1938 	int i;
1939 	unsigned long index;
1940 	pd_entry_t *pva;
1941 	paddr_t ppa, pa;
1942 	struct uvm_object *obj;
1943 
1944 	KASSERT(pmap != pmap_kernel());
1945 	KASSERT(mutex_owned(pmap->pm_lock));
1946 	KASSERT(kpreempt_disabled());
1947 
1948 	ptp = NULL;
1949 	pa = (paddr_t)-1;
1950 
1951 	/*
1952 	 * Loop through all page table levels seeing if we need to
1953 	 * add a new page to that level.
1954 	 */
1955 	for (i = PTP_LEVELS; i > 1; i--) {
1956 		/*
1957 		 * Save values from previous round.
1958 		 */
1959 		pptp = ptp;
1960 		ppa = pa;
1961 
1962 		index = pl_i(va, i);
1963 		pva = pdes[i - 2];
1964 
1965 		if (pmap_valid_entry(pva[index])) {
1966 			ppa = pmap_pte2pa(pva[index]);
1967 			ptp = NULL;
1968 			continue;
1969 		}
1970 
1971 		obj = &pmap->pm_obj[i-2];
1972 		PMAP_SUBOBJ_LOCK(pmap, i - 2);
1973 		ptp = uvm_pagealloc(obj, ptp_va2o(va, i - 1), NULL,
1974 		    UVM_PGA_USERESERVE|UVM_PGA_ZERO);
1975 		PMAP_SUBOBJ_UNLOCK(pmap, i - 2);
1976 
1977 		if (ptp == NULL)
1978 			return NULL;
1979 
1980 		ptp->flags &= ~PG_BUSY; /* never busy */
1981 		ptp->wire_count = 1;
1982 		pmap->pm_ptphint[i - 2] = ptp;
1983 		pa = VM_PAGE_TO_PHYS(ptp);
1984 		pmap_pte_set(&pva[index], (pd_entry_t)
1985 		        (pmap_pa2pte(pa) | PG_u | PG_RW | PG_V));
1986 #if defined(XEN) && defined(__x86_64__)
1987 		if(i == PTP_LEVELS) {
1988 			/*
1989 			 * Update the per-cpu PD on all cpus the current
1990 			 * pmap is active on
1991 			 */
1992 			xen_kpm_sync(pmap, index);
1993 		}
1994 #endif /* XEN && __x86_64__ */
1995 		pmap_pte_flush();
1996 		pmap_stats_update(pmap, 1, 0);
1997 		/*
1998 		 * If we're not in the top level, increase the
1999 		 * wire count of the parent page.
2000 		 */
2001 		if (i < PTP_LEVELS) {
2002 			if (pptp == NULL)
2003 				pptp = pmap_find_ptp(pmap, va, ppa, i);
2004 #ifdef DIAGNOSTIC
2005 			if (pptp == NULL)
2006 				panic("pde page disappeared");
2007 #endif
2008 			pptp->wire_count++;
2009 		}
2010 	}
2011 
2012 	/*
2013 	 * ptp is not NULL if we just allocated a new ptp. If it's
2014 	 * still NULL, we must look up the existing one.
2015 	 */
2016 	if (ptp == NULL) {
2017 		ptp = pmap_find_ptp(pmap, va, ppa, 1);
2018 #ifdef DIAGNOSTIC
2019 		if (ptp == NULL) {
2020 			printf("va %" PRIxVADDR " ppa %" PRIxPADDR "\n",
2021 			    va, ppa);
2022 			panic("pmap_get_ptp: unmanaged user PTP");
2023 		}
2024 #endif
2025 	}
2026 
2027 	pmap->pm_ptphint[0] = ptp;
2028 	return(ptp);
2029 }
2030 
2031 /*
2032  * p m a p  l i f e c y c l e   f u n c t i o n s
2033  */
2034 
2035 /*
2036  * pmap_pdp_ctor: constructor for the PDP cache.
2037  */
2038 static int
2039 pmap_pdp_ctor(void *arg, void *v, int flags)
2040 {
2041 	pd_entry_t *pdir = v;
2042 	paddr_t pdirpa = 0;	/* XXX: GCC */
2043 	vaddr_t object;
2044 	int i;
2045 
2046 #if !defined(XEN) || !defined(__x86_64__)
2047 	int npde;
2048 #endif
2049 #ifdef XEN
2050 	int s;
2051 #endif
2052 
2053 	/*
2054 	 * NOTE: The `pmaps_lock' is held when the PDP is allocated.
2055 	 */
2056 
2057 #if defined(XEN) && defined(__x86_64__)
2058 	/* fetch the physical address of the page directory. */
2059 	(void) pmap_extract(pmap_kernel(), (vaddr_t) pdir, &pdirpa);
2060 
2061 	/* zero init area */
2062 	memset (pdir, 0, PAGE_SIZE); /* Xen wants a clean page */
2063 	/*
2064 	 * this pdir will NEVER be active in kernel mode
2065 	 * so mark recursive entry invalid
2066 	 */
2067 	pdir[PDIR_SLOT_PTE] = pmap_pa2pte(pdirpa) | PG_u;
2068 	/*
2069 	 * PDP constructed this way won't be for kernel,
2070 	 * hence we don't put kernel mappings on Xen.
2071 	 * But we need to make pmap_create() happy, so put a dummy (without
2072 	 * PG_V) value at the right place.
2073 	 */
2074 	pdir[PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1] - 1] =
2075 	     (pd_entry_t)-1 & PG_FRAME;
2076 #else /* XEN && __x86_64__*/
2077 	/* zero init area */
2078 	memset(pdir, 0, PDIR_SLOT_PTE * sizeof(pd_entry_t));
2079 
2080 	object = (vaddr_t)v;
2081 	for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) {
2082 		/* fetch the physical address of the page directory. */
2083 		(void) pmap_extract(pmap_kernel(), object, &pdirpa);
2084 		/* put in recursive PDE to map the PTEs */
2085 		pdir[PDIR_SLOT_PTE + i] = pmap_pa2pte(pdirpa) | PG_V;
2086 #ifndef XEN
2087 		pdir[PDIR_SLOT_PTE + i] |= PG_KW;
2088 #endif
2089 	}
2090 
2091 	/* copy kernel's PDE */
2092 	npde = nkptp[PTP_LEVELS - 1];
2093 
2094 	memcpy(&pdir[PDIR_SLOT_KERN], &PDP_BASE[PDIR_SLOT_KERN],
2095 	    npde * sizeof(pd_entry_t));
2096 
2097 	/* zero the rest */
2098 	memset(&pdir[PDIR_SLOT_KERN + npde], 0, (PAGE_SIZE * PDP_SIZE) -
2099 	    (PDIR_SLOT_KERN + npde) * sizeof(pd_entry_t));
2100 
2101 	if (VM_MIN_KERNEL_ADDRESS != KERNBASE) {
2102 		int idx = pl_i(KERNBASE, PTP_LEVELS);
2103 
2104 		pdir[idx] = PDP_BASE[idx];
2105 	}
2106 
2107 #ifdef __HAVE_DIRECT_MAP
2108 	pdir[PDIR_SLOT_DIRECT] = PDP_BASE[PDIR_SLOT_DIRECT];
2109 #endif
2110 
2111 #endif /* XEN  && __x86_64__*/
2112 #ifdef XEN
2113 	s = splvm();
2114 	object = (vaddr_t)v;
2115 	pmap_protect(pmap_kernel(), object, object + (PAGE_SIZE * PDP_SIZE),
2116 	    VM_PROT_READ);
2117 	pmap_update(pmap_kernel());
2118 	for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) {
2119 		/*
2120 		 * pin as L2/L4 page, we have to do the page with the
2121 		 * PDIR_SLOT_PTE entries last
2122 		 */
2123 #ifdef PAE
2124 		if (i == l2tol3(PDIR_SLOT_PTE))
2125 			continue;
2126 #endif
2127 
2128 		(void) pmap_extract(pmap_kernel(), object, &pdirpa);
2129 #ifdef __x86_64__
2130 		xpq_queue_pin_l4_table(xpmap_ptom_masked(pdirpa));
2131 #else
2132 		xpq_queue_pin_l2_table(xpmap_ptom_masked(pdirpa));
2133 #endif
2134 	}
2135 #ifdef PAE
2136 	object = ((vaddr_t)pdir) + PAGE_SIZE  * l2tol3(PDIR_SLOT_PTE);
2137 	(void)pmap_extract(pmap_kernel(), object, &pdirpa);
2138 	xpq_queue_pin_l2_table(xpmap_ptom_masked(pdirpa));
2139 #endif
2140 	splx(s);
2141 #endif /* XEN */
2142 
2143 	return (0);
2144 }
2145 
2146 /*
2147  * pmap_pdp_dtor: destructor for the PDP cache.
2148  */
2149 
2150 static void
2151 pmap_pdp_dtor(void *arg, void *v)
2152 {
2153 #ifdef XEN
2154 	paddr_t pdirpa = 0;	/* XXX: GCC */
2155 	vaddr_t object = (vaddr_t)v;
2156 	int i;
2157 	int s = splvm();
2158 	pt_entry_t *pte;
2159 
2160 	for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) {
2161 		/* fetch the physical address of the page directory. */
2162 		(void) pmap_extract(pmap_kernel(), object, &pdirpa);
2163 		/* unpin page table */
2164 		xpq_queue_unpin_table(xpmap_ptom_masked(pdirpa));
2165 	}
2166 	object = (vaddr_t)v;
2167 	for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) {
2168 		/* Set page RW again */
2169 		pte = kvtopte(object);
2170 		pmap_pte_set(pte, *pte | PG_RW);
2171 		xen_bcast_invlpg((vaddr_t)object);
2172 	}
2173 	splx(s);
2174 #endif  /* XEN */
2175 }
2176 
2177 #ifdef PAE
2178 
2179 /* pmap_pdp_alloc: Allocate a page for the pdp memory pool. */
2180 
2181 static void *
2182 pmap_pdp_alloc(struct pool *pp, int flags)
2183 {
2184 	return (void *)uvm_km_alloc(kernel_map,
2185 	    PAGE_SIZE * PDP_SIZE, PAGE_SIZE * PDP_SIZE,
2186 	    ((flags & PR_WAITOK) ? 0 : UVM_KMF_NOWAIT | UVM_KMF_TRYLOCK)
2187 	    | UVM_KMF_WIRED);
2188 }
2189 
2190 /*
2191  * pmap_pdp_free: free a PDP
2192  */
2193 
2194 static void
2195 pmap_pdp_free(struct pool *pp, void *v)
2196 {
2197 	uvm_km_free(kernel_map, (vaddr_t)v, PAGE_SIZE * PDP_SIZE,
2198 	    UVM_KMF_WIRED);
2199 }
2200 #endif /* PAE */
2201 
2202 /*
2203  * pmap_create: create a pmap
2204  *
2205  * => note: old pmap interface took a "size" args which allowed for
2206  *	the creation of "software only" pmaps (not in bsd).
2207  */
2208 
2209 struct pmap *
2210 pmap_create(void)
2211 {
2212 	struct pmap *pmap;
2213 	int i;
2214 
2215 	pmap = pool_cache_get(&pmap_cache, PR_WAITOK);
2216 
2217 	/* init uvm_object */
2218 	for (i = 0; i < PTP_LEVELS - 1; i++) {
2219 		mutex_init(&pmap->pm_obj_lock[i], MUTEX_DEFAULT, IPL_NONE);
2220 		uvm_obj_init(&pmap->pm_obj[i], NULL, false, 1);
2221 		uvm_obj_setlock(&pmap->pm_obj[i], &pmap->pm_obj_lock[i]);
2222 		pmap->pm_ptphint[i] = NULL;
2223 	}
2224 	pmap->pm_stats.wired_count = 0;
2225 	/* count the PDP allocd below */
2226 	pmap->pm_stats.resident_count = PDP_SIZE;
2227 #if !defined(__x86_64__)
2228 	pmap->pm_hiexec = 0;
2229 #endif /* !defined(__x86_64__) */
2230 	pmap->pm_flags = 0;
2231 	pmap->pm_cpus = 0;
2232 	pmap->pm_kernel_cpus = 0;
2233 	pmap->pm_xen_ptp_cpus = 0;
2234 	pmap->pm_gc_ptp = NULL;
2235 
2236 	/* init the LDT */
2237 	pmap->pm_ldt = NULL;
2238 	pmap->pm_ldt_len = 0;
2239 	pmap->pm_ldt_sel = GSYSSEL(GLDT_SEL, SEL_KPL);
2240 
2241 	/* allocate PDP */
2242  try_again:
2243 	pmap->pm_pdir = pool_cache_get(&pmap_pdp_cache, PR_WAITOK);
2244 
2245 	mutex_enter(&pmaps_lock);
2246 
2247 	if (pmap->pm_pdir[PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1] - 1] == 0) {
2248 		mutex_exit(&pmaps_lock);
2249 		pool_cache_destruct_object(&pmap_pdp_cache, pmap->pm_pdir);
2250 		goto try_again;
2251 	}
2252 
2253 	for (i = 0; i < PDP_SIZE; i++)
2254 		pmap->pm_pdirpa[i] =
2255 		    pmap_pte2pa(pmap->pm_pdir[PDIR_SLOT_PTE + i]);
2256 
2257 	LIST_INSERT_HEAD(&pmaps, pmap, pm_list);
2258 
2259 	mutex_exit(&pmaps_lock);
2260 
2261 	return (pmap);
2262 }
2263 
2264 /*
2265  * pmap_free_ptps: put a list of ptps back to the freelist.
2266  */
2267 
2268 static void
2269 pmap_free_ptps(struct vm_page *empty_ptps)
2270 {
2271 	struct vm_page *ptp;
2272 	struct pmap_page *pp;
2273 
2274 	while ((ptp = empty_ptps) != NULL) {
2275 		pp = VM_PAGE_TO_PP(ptp);
2276 		empty_ptps = pp->pp_link;
2277 		LIST_INIT(&pp->pp_head.pvh_list);
2278 		uvm_pagefree(ptp);
2279 	}
2280 }
2281 
2282 /*
2283  * pmap_destroy: drop reference count on pmap.   free pmap if
2284  *	reference count goes to zero.
2285  */
2286 
2287 void
2288 pmap_destroy(struct pmap *pmap)
2289 {
2290 	int i;
2291 #ifdef DIAGNOSTIC
2292 	struct cpu_info *ci;
2293 	CPU_INFO_ITERATOR cii;
2294 #endif /* DIAGNOSTIC */
2295 	lwp_t *l;
2296 
2297 	/*
2298 	 * If we have torn down this pmap, process deferred frees and
2299 	 * invalidations.  Free now if the system is low on memory.
2300 	 * Otherwise, free when the pmap is destroyed thus avoiding a
2301 	 * TLB shootdown.
2302 	 */
2303 	l = curlwp;
2304 	if (__predict_false(l->l_md.md_gc_pmap == pmap)) {
2305 		if (uvmexp.free < uvmexp.freetarg) {
2306 			pmap_update(pmap);
2307 		} else {
2308 			KASSERT(pmap->pm_gc_ptp == NULL);
2309 			pmap->pm_gc_ptp = l->l_md.md_gc_ptp;
2310 			l->l_md.md_gc_ptp = NULL;
2311 			l->l_md.md_gc_pmap = NULL;
2312 		}
2313 	}
2314 
2315 	/*
2316 	 * drop reference count
2317 	 */
2318 
2319 	if (atomic_dec_uint_nv(&pmap->pm_obj[0].uo_refs) > 0) {
2320 		return;
2321 	}
2322 
2323 #ifdef DIAGNOSTIC
2324 	for (CPU_INFO_FOREACH(cii, ci)) {
2325 		if (ci->ci_pmap == pmap)
2326 			panic("destroying pmap being used");
2327 #if defined(XEN) && defined(__x86_64__)
2328 		for (i = 0; i < PDIR_SLOT_PTE; i++) {
2329 			if (pmap->pm_pdir[i] != 0 &&
2330 			    ci->ci_kpm_pdir[i] == pmap->pm_pdir[i]) {
2331 				printf("pmap_destroy(%p) pmap_kernel %p "
2332 				    "curcpu %d cpu %d ci_pmap %p "
2333 				    "ci->ci_kpm_pdir[%d]=%" PRIx64
2334 				    " pmap->pm_pdir[%d]=%" PRIx64 "\n",
2335 				    pmap, pmap_kernel(), curcpu()->ci_index,
2336 				    ci->ci_index, ci->ci_pmap,
2337 				    i, ci->ci_kpm_pdir[i],
2338 				    i, pmap->pm_pdir[i]);
2339 				panic("pmap_destroy: used pmap");
2340 			}
2341 		}
2342 #endif
2343 	}
2344 #endif /* DIAGNOSTIC */
2345 
2346 	/*
2347 	 * reference count is zero, free pmap resources and then free pmap.
2348 	 */
2349 
2350 	/*
2351 	 * remove it from global list of pmaps
2352 	 */
2353 
2354 	mutex_enter(&pmaps_lock);
2355 	LIST_REMOVE(pmap, pm_list);
2356 	mutex_exit(&pmaps_lock);
2357 
2358 	/*
2359 	 * Process deferred PTP frees.  No TLB shootdown required, as the
2360 	 * PTP pages are no longer visible to any CPU.
2361 	 */
2362 
2363 	pmap_free_ptps(pmap->pm_gc_ptp);
2364 
2365 	/*
2366 	 * destroyed pmap shouldn't have remaining PTPs
2367 	 */
2368 
2369 	for (i = 0; i < PTP_LEVELS - 1; i++) {
2370 		KASSERT(pmap->pm_obj[i].uo_npages == 0);
2371 		KASSERT(TAILQ_EMPTY(&pmap->pm_obj[i].memq));
2372 	}
2373 
2374 	pool_cache_put(&pmap_pdp_cache, pmap->pm_pdir);
2375 
2376 #ifdef USER_LDT
2377 	if (pmap->pm_ldt != NULL) {
2378 		/*
2379 		 * no need to switch the LDT; this address space is gone,
2380 		 * nothing is using it.
2381 		 *
2382 		 * No need to lock the pmap for ldt_free (or anything else),
2383 		 * we're the last one to use it.
2384 		 */
2385 		mutex_enter(&cpu_lock);
2386 		ldt_free(pmap->pm_ldt_sel);
2387 		mutex_exit(&cpu_lock);
2388 		uvm_km_free(kernel_map, (vaddr_t)pmap->pm_ldt,
2389 		    pmap->pm_ldt_len, UVM_KMF_WIRED);
2390 	}
2391 #endif
2392 
2393 	for (i = 0; i < PTP_LEVELS - 1; i++) {
2394 		uvm_obj_destroy(&pmap->pm_obj[i], false);
2395 		mutex_destroy(&pmap->pm_obj_lock[i]);
2396 	}
2397 	pool_cache_put(&pmap_cache, pmap);
2398 }
2399 
2400 /*
2401  * pmap_remove_all: pmap is being torn down by the current thread.
2402  * avoid unnecessary invalidations.
2403  */
2404 
2405 void
2406 pmap_remove_all(struct pmap *pmap)
2407 {
2408 	lwp_t *l = curlwp;
2409 
2410 	KASSERT(l->l_md.md_gc_pmap == NULL);
2411 
2412 	l->l_md.md_gc_pmap = pmap;
2413 }
2414 
2415 #if defined(PMAP_FORK)
2416 /*
2417  * pmap_fork: perform any necessary data structure manipulation when
2418  * a VM space is forked.
2419  */
2420 
2421 void
2422 pmap_fork(struct pmap *pmap1, struct pmap *pmap2)
2423 {
2424 #ifdef USER_LDT
2425 	union descriptor *new_ldt;
2426 	size_t len;
2427 	int sel;
2428 
2429 	if (__predict_true(pmap1->pm_ldt == NULL)) {
2430 		return;
2431 	}
2432 
2433  retry:
2434 	if (pmap1->pm_ldt != NULL) {
2435 		len = pmap1->pm_ldt_len;
2436 		new_ldt = (union descriptor *)uvm_km_alloc(kernel_map, len, 0,
2437 		    UVM_KMF_WIRED);
2438 		mutex_enter(&cpu_lock);
2439 		sel = ldt_alloc(new_ldt, len);
2440 		if (sel == -1) {
2441 			mutex_exit(&cpu_lock);
2442 			uvm_km_free(kernel_map, (vaddr_t)new_ldt, len,
2443 			    UVM_KMF_WIRED);
2444 			printf("WARNING: pmap_fork: unable to allocate LDT\n");
2445 			return;
2446 		}
2447 	} else {
2448 		len = -1;
2449 		new_ldt = NULL;
2450 		sel = -1;
2451 		mutex_enter(&cpu_lock);
2452 	}
2453 
2454  	/* Copy the LDT, if necessary. */
2455  	if (pmap1->pm_ldt != NULL) {
2456 		if (len != pmap1->pm_ldt_len) {
2457 			if (len != -1) {
2458 				ldt_free(sel);
2459 				uvm_km_free(kernel_map, (vaddr_t)new_ldt,
2460 				    len, UVM_KMF_WIRED);
2461 			}
2462 			mutex_exit(&cpu_lock);
2463 			goto retry;
2464 		}
2465 
2466 		memcpy(new_ldt, pmap1->pm_ldt, len);
2467 		pmap2->pm_ldt = new_ldt;
2468 		pmap2->pm_ldt_len = pmap1->pm_ldt_len;
2469 		pmap2->pm_ldt_sel = sel;
2470 		len = -1;
2471 	}
2472 
2473 	if (len != -1) {
2474 		ldt_free(sel);
2475 		uvm_km_free(kernel_map, (vaddr_t)new_ldt, len,
2476 		    UVM_KMF_WIRED);
2477 	}
2478 	mutex_exit(&cpu_lock);
2479 #endif /* USER_LDT */
2480 }
2481 #endif /* PMAP_FORK */
2482 
2483 #ifdef USER_LDT
2484 
2485 /*
2486  * pmap_ldt_xcall: cross call used by pmap_ldt_sync.  if the named pmap
2487  * is active, reload LDTR.
2488  */
2489 static void
2490 pmap_ldt_xcall(void *arg1, void *arg2)
2491 {
2492 	struct pmap *pm;
2493 
2494 	kpreempt_disable();
2495 	pm = arg1;
2496 	if (curcpu()->ci_pmap == pm) {
2497 		lldt(pm->pm_ldt_sel);
2498 	}
2499 	kpreempt_enable();
2500 }
2501 
2502 /*
2503  * pmap_ldt_sync: LDT selector for the named pmap is changing.  swap
2504  * in the new selector on all CPUs.
2505  */
2506 void
2507 pmap_ldt_sync(struct pmap *pm)
2508 {
2509 	uint64_t where;
2510 
2511 	KASSERT(mutex_owned(&cpu_lock));
2512 
2513 	pmap_ldt_evcnt.ev_count++;
2514 	where = xc_broadcast(0, pmap_ldt_xcall, pm, NULL);
2515 	xc_wait(where);
2516 }
2517 
2518 /*
2519  * pmap_ldt_cleanup: if the pmap has a local LDT, deallocate it, and
2520  * restore the default.
2521  */
2522 
2523 void
2524 pmap_ldt_cleanup(struct lwp *l)
2525 {
2526 	pmap_t pmap = l->l_proc->p_vmspace->vm_map.pmap;
2527 	union descriptor *dp = NULL;
2528 	size_t len = 0;
2529 	int sel = -1;
2530 
2531 	if (__predict_true(pmap->pm_ldt == NULL)) {
2532 		return;
2533 	}
2534 
2535 	mutex_enter(&cpu_lock);
2536 	if (pmap->pm_ldt != NULL) {
2537 		sel = pmap->pm_ldt_sel;
2538 		dp = pmap->pm_ldt;
2539 		len = pmap->pm_ldt_len;
2540 		pmap->pm_ldt_sel = GSYSSEL(GLDT_SEL, SEL_KPL);
2541 		pmap->pm_ldt = NULL;
2542 		pmap->pm_ldt_len = 0;
2543 		pmap_ldt_sync(pmap);
2544 		ldt_free(sel);
2545 		uvm_km_free(kernel_map, (vaddr_t)dp, len, UVM_KMF_WIRED);
2546 	}
2547 	mutex_exit(&cpu_lock);
2548 }
2549 #endif /* USER_LDT */
2550 
2551 /*
2552  * pmap_activate: activate a process' pmap
2553  *
2554  * => must be called with kernel preemption disabled
2555  * => if lwp is the curlwp, then set ci_want_pmapload so that
2556  *    actual MMU context switch will be done by pmap_load() later
2557  */
2558 
2559 void
2560 pmap_activate(struct lwp *l)
2561 {
2562 	struct cpu_info *ci;
2563 	struct pmap *pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map);
2564 
2565 	KASSERT(kpreempt_disabled());
2566 
2567 	ci = curcpu();
2568 
2569 	if (l == ci->ci_curlwp) {
2570 		KASSERT(ci->ci_want_pmapload == 0);
2571 		KASSERT(ci->ci_tlbstate != TLBSTATE_VALID);
2572 #ifdef KSTACK_CHECK_DR0
2573 		/*
2574 		 * setup breakpoint on the top of stack
2575 		 */
2576 		if (l == &lwp0)
2577 			dr0(0, 0, 0, 0);
2578 		else
2579 			dr0(KSTACK_LOWEST_ADDR(l), 1, 3, 1);
2580 #endif
2581 
2582 		/*
2583 		 * no need to switch to kernel vmspace because
2584 		 * it's a subset of any vmspace.
2585 		 */
2586 
2587 		if (pmap == pmap_kernel()) {
2588 			ci->ci_want_pmapload = 0;
2589 			return;
2590 		}
2591 
2592 		ci->ci_want_pmapload = 1;
2593 	}
2594 }
2595 
2596 /*
2597  * pmap_reactivate: try to regain reference to the pmap.
2598  *
2599  * => must be called with kernel preemption disabled
2600  */
2601 
2602 static bool
2603 pmap_reactivate(struct pmap *pmap)
2604 {
2605 	struct cpu_info *ci;
2606 	uint32_t cpumask;
2607 	bool result;
2608 	uint32_t oldcpus;
2609 
2610 	ci = curcpu();
2611 	cpumask = ci->ci_cpumask;
2612 
2613 	KASSERT(kpreempt_disabled());
2614 #if defined(XEN) && defined(__x86_64__)
2615 	KASSERT(pmap_pdirpa(pmap, 0) == ci->ci_xen_current_user_pgd);
2616 #elif defined(PAE)
2617 	KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(ci->ci_pae_l3_pdir[0]));
2618 #elif !defined(XEN)
2619 	KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(rcr3()));
2620 #endif
2621 
2622 	/*
2623 	 * if we still have a lazy reference to this pmap,
2624 	 * we can assume that there was no tlb shootdown
2625 	 * for this pmap in the meantime.
2626 	 *
2627 	 * the order of events here is important as we must
2628 	 * synchronize with TLB shootdown interrupts.  declare
2629 	 * interest in invalidations (TLBSTATE_VALID) and then
2630 	 * check the cpumask, which the IPIs can change only
2631 	 * when the state is TLBSTATE_LAZY.
2632 	 */
2633 
2634 	ci->ci_tlbstate = TLBSTATE_VALID;
2635 	oldcpus = pmap->pm_cpus;
2636 	KASSERT((pmap->pm_kernel_cpus & cpumask) != 0);
2637 	if (oldcpus & cpumask) {
2638 		/* got it */
2639 		result = true;
2640 	} else {
2641 		/* must reload */
2642 		atomic_or_32(&pmap->pm_cpus, cpumask);
2643 		result = false;
2644 	}
2645 
2646 	return result;
2647 }
2648 
2649 /*
2650  * pmap_load: actually switch pmap.  (fill in %cr3 and LDT info)
2651  *
2652  * ensures that the current process' pmap is loaded on the current cpu's MMU
2653  * and there's no stale TLB entries.
2654  *
2655  * the caller should disable preemption or do check-and-retry to prevent
2656  * a preemption from undoing our efforts.
2657  *
2658  * this function can block.
2659  */
2660 
2661 void
2662 pmap_load(void)
2663 {
2664 	struct cpu_info *ci;
2665 	uint32_t cpumask;
2666 	struct pmap *pmap;
2667 	struct pmap *oldpmap;
2668 	struct lwp *l;
2669 	struct pcb *pcb;
2670 	uint64_t ncsw;
2671 
2672 	kpreempt_disable();
2673  retry:
2674 	ci = curcpu();
2675 	if (!ci->ci_want_pmapload) {
2676 		kpreempt_enable();
2677 		return;
2678 	}
2679 	cpumask = ci->ci_cpumask;
2680 	l = ci->ci_curlwp;
2681 	ncsw = l->l_ncsw;
2682 
2683 	/* should be able to take ipis. */
2684 	KASSERT(ci->ci_ilevel < IPL_HIGH);
2685 #ifdef XEN
2686 	/* Check to see if interrupts are enabled (ie; no events are masked) */
2687 	KASSERT(x86_read_psl() == 0);
2688 #else
2689 	KASSERT((x86_read_psl() & PSL_I) != 0);
2690 #endif
2691 
2692 	KASSERT(l != NULL);
2693 	pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map);
2694 	KASSERT(pmap != pmap_kernel());
2695 	oldpmap = ci->ci_pmap;
2696 	pcb = lwp_getpcb(l);
2697 
2698 	if (pmap == oldpmap) {
2699 		if (!pmap_reactivate(pmap)) {
2700 			u_int gen = uvm_emap_gen_return();
2701 
2702 			/*
2703 			 * pmap has been changed during deactivated.
2704 			 * our tlb may be stale.
2705 			 */
2706 
2707 			tlbflush();
2708 			uvm_emap_update(gen);
2709 		}
2710 
2711 		ci->ci_want_pmapload = 0;
2712 		kpreempt_enable();
2713 		return;
2714 	}
2715 
2716 	/*
2717 	 * grab a reference to the new pmap.
2718 	 */
2719 
2720 	pmap_reference(pmap);
2721 
2722 	/*
2723 	 * actually switch pmap.
2724 	 */
2725 
2726 	atomic_and_32(&oldpmap->pm_cpus, ~cpumask);
2727 	atomic_and_32(&oldpmap->pm_kernel_cpus, ~cpumask);
2728 
2729 #if defined(XEN) && defined(__x86_64__)
2730 	KASSERT(pmap_pdirpa(oldpmap, 0) == ci->ci_xen_current_user_pgd ||
2731 	    oldpmap == pmap_kernel());
2732 #elif defined(PAE)
2733 	KASSERT(pmap_pdirpa(oldpmap, 0) == pmap_pte2pa(ci->ci_pae_l3_pdir[0]));
2734 #elif !defined(XEN)
2735 	KASSERT(pmap_pdirpa(oldpmap, 0) == pmap_pte2pa(rcr3()));
2736 #endif
2737 	KASSERT((pmap->pm_cpus & cpumask) == 0);
2738 	KASSERT((pmap->pm_kernel_cpus & cpumask) == 0);
2739 
2740 	/*
2741 	 * mark the pmap in use by this processor.  again we must
2742 	 * synchronize with TLB shootdown interrupts, so set the
2743 	 * state VALID first, then register us for shootdown events
2744 	 * on this pmap.
2745 	 */
2746 
2747 	ci->ci_tlbstate = TLBSTATE_VALID;
2748 	atomic_or_32(&pmap->pm_cpus, cpumask);
2749 	atomic_or_32(&pmap->pm_kernel_cpus, cpumask);
2750 	ci->ci_pmap = pmap;
2751 
2752 	/*
2753 	 * update tss.  now that we have registered for invalidations
2754 	 * from other CPUs, we're good to load the page tables.
2755 	 */
2756 #ifdef PAE
2757 	pcb->pcb_cr3 = ci->ci_pae_l3_pdirpa;
2758 #else
2759 	pcb->pcb_cr3 = pmap_pdirpa(pmap, 0);
2760 #endif
2761 
2762 #ifdef i386
2763 #ifndef XEN
2764 	ci->ci_tss.tss_ldt = pmap->pm_ldt_sel;
2765 	ci->ci_tss.tss_cr3 = pcb->pcb_cr3;
2766 #endif /* !XEN */
2767 #endif /* i386 */
2768 
2769 	lldt(pmap->pm_ldt_sel);
2770 
2771 	u_int gen = uvm_emap_gen_return();
2772 	cpu_load_pmap(pmap, oldpmap);
2773 	uvm_emap_update(gen);
2774 
2775 	ci->ci_want_pmapload = 0;
2776 
2777 	/*
2778 	 * we're now running with the new pmap.  drop the reference
2779 	 * to the old pmap.  if we block, we need to go around again.
2780 	 */
2781 
2782 	pmap_destroy(oldpmap);
2783 	if (l->l_ncsw != ncsw) {
2784 		goto retry;
2785 	}
2786 
2787 	kpreempt_enable();
2788 }
2789 
2790 /*
2791  * pmap_deactivate: deactivate a process' pmap.
2792  *
2793  * => Must be called with kernel preemption disabled (high IPL is enough).
2794  */
2795 void
2796 pmap_deactivate(struct lwp *l)
2797 {
2798 	struct pmap *pmap;
2799 	struct cpu_info *ci;
2800 
2801 	KASSERT(kpreempt_disabled());
2802 
2803 	if (l != curlwp) {
2804 		return;
2805 	}
2806 
2807 	/*
2808 	 * Wait for pending TLB shootdowns to complete.  Necessary because
2809 	 * TLB shootdown state is per-CPU, and the LWP may be coming off
2810 	 * the CPU before it has a chance to call pmap_update(), e.g. due
2811 	 * to kernel preemption or blocking routine in between.
2812 	 */
2813 	pmap_tlb_shootnow();
2814 
2815 	ci = curcpu();
2816 
2817 	if (ci->ci_want_pmapload) {
2818 		/*
2819 		 * ci_want_pmapload means that our pmap is not loaded on
2820 		 * the CPU or TLB might be stale.  note that pmap_kernel()
2821 		 * is always considered loaded.
2822 		 */
2823 		KASSERT(vm_map_pmap(&l->l_proc->p_vmspace->vm_map)
2824 		    != pmap_kernel());
2825 		KASSERT(vm_map_pmap(&l->l_proc->p_vmspace->vm_map)
2826 		    != ci->ci_pmap || ci->ci_tlbstate != TLBSTATE_VALID);
2827 
2828 		/*
2829 		 * userspace has not been touched.
2830 		 * nothing to do here.
2831 		 */
2832 
2833 		ci->ci_want_pmapload = 0;
2834 		return;
2835 	}
2836 
2837 	pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map);
2838 
2839 	if (pmap == pmap_kernel()) {
2840 		return;
2841 	}
2842 
2843 #if defined(XEN) && defined(__x86_64__)
2844 	KASSERT(pmap_pdirpa(pmap, 0) == ci->ci_xen_current_user_pgd);
2845 #elif defined(PAE)
2846 	KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(ci->ci_pae_l3_pdir[0]));
2847 #elif !defined(XEN)
2848 	KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(rcr3()));
2849 #endif
2850 	KASSERT(ci->ci_pmap == pmap);
2851 
2852 	/*
2853 	 * we aren't interested in TLB invalidations for this pmap,
2854 	 * at least for the time being.
2855 	 */
2856 
2857 	KASSERT(ci->ci_tlbstate == TLBSTATE_VALID);
2858 	ci->ci_tlbstate = TLBSTATE_LAZY;
2859 }
2860 
2861 /*
2862  * end of lifecycle functions
2863  */
2864 
2865 /*
2866  * some misc. functions
2867  */
2868 
2869 int
2870 pmap_pdes_invalid(vaddr_t va, pd_entry_t * const *pdes, pd_entry_t *lastpde)
2871 {
2872 	int i;
2873 	unsigned long index;
2874 	pd_entry_t pde;
2875 
2876 	for (i = PTP_LEVELS; i > 1; i--) {
2877 		index = pl_i(va, i);
2878 		pde = pdes[i - 2][index];
2879 		if ((pde & PG_V) == 0)
2880 			return i;
2881 	}
2882 	if (lastpde != NULL)
2883 		*lastpde = pde;
2884 	return 0;
2885 }
2886 
2887 /*
2888  * pmap_extract: extract a PA for the given VA
2889  */
2890 
2891 bool
2892 pmap_extract(struct pmap *pmap, vaddr_t va, paddr_t *pap)
2893 {
2894 	pt_entry_t *ptes, pte;
2895 	pd_entry_t pde;
2896 	pd_entry_t * const *pdes;
2897 	struct pmap *pmap2;
2898 	struct cpu_info *ci;
2899 	paddr_t pa;
2900 	lwp_t *l;
2901 	bool hard, rv;
2902 
2903 #ifdef __HAVE_DIRECT_MAP
2904 	if (va >= PMAP_DIRECT_BASE && va < PMAP_DIRECT_END) {
2905 		if (pap != NULL) {
2906 			*pap = va - PMAP_DIRECT_BASE;
2907 		}
2908 		return true;
2909 	}
2910 #endif
2911 
2912 	rv = false;
2913 	pa = 0;
2914 	l = curlwp;
2915 
2916 	KPREEMPT_DISABLE(l);
2917 	ci = l->l_cpu;
2918 	if (__predict_true(!ci->ci_want_pmapload && ci->ci_pmap == pmap) ||
2919 	    pmap == pmap_kernel()) {
2920 		/*
2921 		 * no need to lock, because it's pmap_kernel() or our
2922 		 * own pmap and is active.  if a user pmap, the caller
2923 		 * will hold the vm_map write/read locked and so prevent
2924 		 * entries from disappearing while we are here.  ptps
2925 		 * can disappear via pmap_remove() and pmap_protect(),
2926 		 * but they are called with the vm_map write locked.
2927 		 */
2928 		hard = false;
2929 		ptes = PTE_BASE;
2930 		pdes = normal_pdes;
2931 	} else {
2932 		/* we lose, do it the hard way. */
2933 		hard = true;
2934 		pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
2935 	}
2936 	if (pmap_pdes_valid(va, pdes, &pde)) {
2937 		pte = ptes[pl1_i(va)];
2938 		if (pde & PG_PS) {
2939 			pa = (pde & PG_LGFRAME) | (va & (NBPD_L2 - 1));
2940 			rv = true;
2941 		} else if (__predict_true((pte & PG_V) != 0)) {
2942 			pa = pmap_pte2pa(pte) | (va & (NBPD_L1 - 1));
2943 			rv = true;
2944 		}
2945 	}
2946 	if (__predict_false(hard)) {
2947 		pmap_unmap_ptes(pmap, pmap2);
2948 	}
2949 	KPREEMPT_ENABLE(l);
2950 	if (pap != NULL) {
2951 		*pap = pa;
2952 	}
2953 	return rv;
2954 }
2955 
2956 
2957 /*
2958  * vtophys: virtual address to physical address.  For use by
2959  * machine-dependent code only.
2960  */
2961 
2962 paddr_t
2963 vtophys(vaddr_t va)
2964 {
2965 	paddr_t pa;
2966 
2967 	if (pmap_extract(pmap_kernel(), va, &pa) == true)
2968 		return (pa);
2969 	return (0);
2970 }
2971 
2972 __strict_weak_alias(pmap_extract_ma, pmap_extract);
2973 
2974 #ifdef XEN
2975 
2976 /*
2977  * vtomach: virtual address to machine address.  For use by
2978  * machine-dependent code only.
2979  */
2980 
2981 paddr_t
2982 vtomach(vaddr_t va)
2983 {
2984 	paddr_t pa;
2985 
2986 	if (pmap_extract_ma(pmap_kernel(), va, &pa) == true)
2987 		return (pa);
2988 	return (0);
2989 }
2990 
2991 #endif /* XEN */
2992 
2993 /*
2994  * pmap_virtual_space: used during bootup [pmap_steal_memory] to
2995  *	determine the bounds of the kernel virtual addess space.
2996  */
2997 
2998 void
2999 pmap_virtual_space(vaddr_t *startp, vaddr_t *endp)
3000 {
3001 	*startp = virtual_avail;
3002 	*endp = virtual_end;
3003 }
3004 
3005 /*
3006  * pmap_map: map a range of PAs into kvm.
3007  *
3008  * => used during crash dump
3009  * => XXX: pmap_map() should be phased out?
3010  */
3011 
3012 vaddr_t
3013 pmap_map(vaddr_t va, paddr_t spa, paddr_t epa, vm_prot_t prot)
3014 {
3015 	while (spa < epa) {
3016 		pmap_kenter_pa(va, spa, prot, 0);
3017 		va += PAGE_SIZE;
3018 		spa += PAGE_SIZE;
3019 	}
3020 	pmap_update(pmap_kernel());
3021 	return va;
3022 }
3023 
3024 /*
3025  * pmap_zero_page: zero a page
3026  */
3027 
3028 void
3029 pmap_zero_page(paddr_t pa)
3030 {
3031 #ifdef __HAVE_DIRECT_MAP
3032 	pagezero(PMAP_DIRECT_MAP(pa));
3033 #else
3034 	pt_entry_t *zpte;
3035 	void *zerova;
3036 	int id;
3037 
3038 	kpreempt_disable();
3039 	id = cpu_number();
3040 	zpte = PTESLEW(zero_pte, id);
3041 	zerova = VASLEW(zerop, id);
3042 
3043 #ifdef DIAGNOSTIC
3044 	if (*zpte)
3045 		panic("pmap_zero_page: lock botch");
3046 #endif
3047 
3048 	pmap_pte_set(zpte, pmap_pa2pte(pa) | PG_V | PG_RW | PG_M | PG_U | PG_k);
3049 	pmap_pte_flush();
3050 	pmap_update_pg((vaddr_t)zerova);		/* flush TLB */
3051 
3052 	memset(zerova, 0, PAGE_SIZE);
3053 
3054 #if defined(DIAGNOSTIC) || defined(XEN)
3055 	pmap_pte_set(zpte, 0);				/* zap ! */
3056 	pmap_pte_flush();
3057 #endif
3058 	kpreempt_enable();
3059 #endif
3060 }
3061 
3062 /*
3063  * pmap_pagezeroidle: the same, for the idle loop page zero'er.
3064  * Returns true if the page was zero'd, false if we aborted for
3065  * some reason.
3066  */
3067 
3068 bool
3069 pmap_pageidlezero(paddr_t pa)
3070 {
3071 #ifdef __HAVE_DIRECT_MAP
3072 	KASSERT(cpu_feature[0] & CPUID_SSE2);
3073 	return sse2_idlezero_page((void *)PMAP_DIRECT_MAP(pa));
3074 #else
3075 	pt_entry_t *zpte;
3076 	void *zerova;
3077 	bool rv;
3078 	int id;
3079 
3080 	id = cpu_number();
3081 	zpte = PTESLEW(zero_pte, id);
3082 	zerova = VASLEW(zerop, id);
3083 
3084 	KASSERT(cpu_feature[0] & CPUID_SSE2);
3085 	KASSERT(*zpte == 0);
3086 
3087 	pmap_pte_set(zpte, pmap_pa2pte(pa) | PG_V | PG_RW | PG_M | PG_U | PG_k);
3088 	pmap_pte_flush();
3089 	pmap_update_pg((vaddr_t)zerova);		/* flush TLB */
3090 
3091 	rv = sse2_idlezero_page(zerova);
3092 
3093 #if defined(DIAGNOSTIC) || defined(XEN)
3094 	pmap_pte_set(zpte, 0);				/* zap ! */
3095 	pmap_pte_flush();
3096 #endif
3097 
3098 	return rv;
3099 #endif
3100 }
3101 
3102 /*
3103  * pmap_copy_page: copy a page
3104  */
3105 
3106 void
3107 pmap_copy_page(paddr_t srcpa, paddr_t dstpa)
3108 {
3109 #ifdef __HAVE_DIRECT_MAP
3110 	vaddr_t srcva = PMAP_DIRECT_MAP(srcpa);
3111 	vaddr_t dstva = PMAP_DIRECT_MAP(dstpa);
3112 
3113 	memcpy((void *)dstva, (void *)srcva, PAGE_SIZE);
3114 #else
3115 	pt_entry_t *spte;
3116 	pt_entry_t *dpte;
3117 	void *csrcva;
3118 	void *cdstva;
3119 	int id;
3120 
3121 	kpreempt_disable();
3122 	id = cpu_number();
3123 	spte = PTESLEW(csrc_pte,id);
3124 	dpte = PTESLEW(cdst_pte,id);
3125 	csrcva = VASLEW(csrcp, id);
3126 	cdstva = VASLEW(cdstp, id);
3127 
3128 	KASSERT(*spte == 0 && *dpte == 0);
3129 
3130 	pmap_pte_set(spte, pmap_pa2pte(srcpa) | PG_V | PG_RW | PG_U | PG_k);
3131 	pmap_pte_set(dpte,
3132 	    pmap_pa2pte(dstpa) | PG_V | PG_RW | PG_M | PG_U | PG_k);
3133 	pmap_pte_flush();
3134 	pmap_update_2pg((vaddr_t)csrcva, (vaddr_t)cdstva);
3135 
3136 	memcpy(cdstva, csrcva, PAGE_SIZE);
3137 
3138 #if defined(DIAGNOSTIC) || defined(XEN)
3139 	pmap_pte_set(spte, 0);
3140 	pmap_pte_set(dpte, 0);
3141 	pmap_pte_flush();
3142 #endif
3143 	kpreempt_enable();
3144 #endif
3145 }
3146 
3147 static pt_entry_t *
3148 pmap_map_ptp(struct vm_page *ptp)
3149 {
3150 #ifdef __HAVE_DIRECT_MAP
3151 	return (void *)PMAP_DIRECT_MAP(VM_PAGE_TO_PHYS(ptp));
3152 #else
3153 	pt_entry_t *ptppte;
3154 	void *ptpva;
3155 	int id;
3156 
3157 	KASSERT(kpreempt_disabled());
3158 
3159 	id = cpu_number();
3160 	ptppte = PTESLEW(ptp_pte, id);
3161 	ptpva = VASLEW(ptpp, id);
3162 #if !defined(XEN)
3163 	pmap_pte_set(ptppte, pmap_pa2pte(VM_PAGE_TO_PHYS(ptp)) | PG_V | PG_M |
3164 	    PG_RW | PG_U | PG_k);
3165 #else
3166 	pmap_pte_set(ptppte, pmap_pa2pte(VM_PAGE_TO_PHYS(ptp)) | PG_V | PG_M |
3167 	    PG_U | PG_k);
3168 #endif
3169 	pmap_pte_flush();
3170 	pmap_update_pg((vaddr_t)ptpva);
3171 
3172 	return (pt_entry_t *)ptpva;
3173 #endif
3174 }
3175 
3176 static void
3177 pmap_unmap_ptp(void)
3178 {
3179 #ifndef __HAVE_DIRECT_MAP
3180 #if defined(DIAGNOSTIC) || defined(XEN)
3181 	pt_entry_t *pte;
3182 
3183 	KASSERT(kpreempt_disabled());
3184 
3185 	pte = PTESLEW(ptp_pte, cpu_number());
3186 	if (*pte != 0) {
3187 		pmap_pte_set(pte, 0);
3188 		pmap_pte_flush();
3189 	}
3190 #endif
3191 #endif
3192 }
3193 
3194 static pt_entry_t *
3195 pmap_map_pte(struct pmap *pmap, struct vm_page *ptp, vaddr_t va)
3196 {
3197 
3198 	KASSERT(kpreempt_disabled());
3199 	if (pmap_is_curpmap(pmap)) {
3200 		return &PTE_BASE[pl1_i(va)]; /* (k)vtopte */
3201 	}
3202 	KASSERT(ptp != NULL);
3203 	return pmap_map_ptp(ptp) + pl1_pi(va);
3204 }
3205 
3206 static void
3207 pmap_unmap_pte(void)
3208 {
3209 
3210 	KASSERT(kpreempt_disabled());
3211 
3212 	pmap_unmap_ptp();
3213 }
3214 
3215 /*
3216  * p m a p   r e m o v e   f u n c t i o n s
3217  *
3218  * functions that remove mappings
3219  */
3220 
3221 /*
3222  * pmap_remove_ptes: remove PTEs from a PTP
3223  *
3224  * => caller must hold pmap's lock
3225  * => PTP must be mapped into KVA
3226  * => PTP should be null if pmap == pmap_kernel()
3227  * => must be called with kernel preemption disabled
3228  * => returns composite pte if at least one page should be shot down
3229  */
3230 
3231 static void
3232 pmap_remove_ptes(struct pmap *pmap, struct vm_page *ptp, vaddr_t ptpva,
3233 		 vaddr_t startva, vaddr_t endva, struct pv_entry **pv_tofree)
3234 {
3235 	pt_entry_t *pte = (pt_entry_t *)ptpva;
3236 
3237 	KASSERT(pmap == pmap_kernel() || mutex_owned(pmap->pm_lock));
3238 	KASSERT(kpreempt_disabled());
3239 
3240 	/*
3241 	 * note that ptpva points to the PTE that maps startva.   this may
3242 	 * or may not be the first PTE in the PTP.
3243 	 *
3244 	 * we loop through the PTP while there are still PTEs to look at
3245 	 * and the wire_count is greater than 1 (because we use the wire_count
3246 	 * to keep track of the number of real PTEs in the PTP).
3247 	 */
3248 	while (startva < endva && (ptp == NULL || ptp->wire_count > 1)) {
3249 		(void)pmap_remove_pte(pmap, ptp, pte, startva, pv_tofree);
3250 		startva += PAGE_SIZE;
3251 		pte++;
3252 	}
3253 }
3254 
3255 
3256 /*
3257  * pmap_remove_pte: remove a single PTE from a PTP.
3258  *
3259  * => caller must hold pmap's lock
3260  * => PTP must be mapped into KVA
3261  * => PTP should be null if pmap == pmap_kernel()
3262  * => returns true if we removed a mapping
3263  * => must be called with kernel preemption disabled
3264  */
3265 static bool
3266 pmap_remove_pte(struct pmap *pmap, struct vm_page *ptp, pt_entry_t *pte,
3267 		vaddr_t va, struct pv_entry **pv_tofree)
3268 {
3269 	struct pv_entry *pve;
3270 	struct vm_page *pg;
3271 	struct pmap_page *pp;
3272 	pt_entry_t opte;
3273 
3274 	KASSERT(pmap == pmap_kernel() || mutex_owned(pmap->pm_lock));
3275 	KASSERT(kpreempt_disabled());
3276 
3277 	if (!pmap_valid_entry(*pte)) {
3278 		/* VA not mapped. */
3279 		return false;
3280 	}
3281 
3282 	/* Atomically save the old PTE and zap it. */
3283 	opte = pmap_pte_testset(pte, 0);
3284 	if (!pmap_valid_entry(opte)) {
3285 		return false;
3286 	}
3287 
3288 	pmap_exec_account(pmap, va, opte, 0);
3289 	pmap_stats_update_bypte(pmap, 0, opte);
3290 
3291 	if (ptp) {
3292 		/*
3293 		 * Dropping a PTE.  Make sure that the PDE is flushed.
3294 		 */
3295 		ptp->wire_count--;
3296 		if (ptp->wire_count <= 1) {
3297 			opte |= PG_U;
3298 		}
3299 	}
3300 
3301 	if ((opte & PG_U) != 0) {
3302 		pmap_tlb_shootdown(pmap, va, opte, TLBSHOOT_REMOVE_PTE);
3303 	}
3304 
3305 	/*
3306 	 * If we are not on a pv_head list - we are done.
3307 	 */
3308 	if ((opte & PG_PVLIST) == 0) {
3309 #if defined(DIAGNOSTIC) && !defined(DOM0OPS)
3310 		if (PHYS_TO_VM_PAGE(pmap_pte2pa(opte)) != NULL)
3311 			panic("pmap_remove_pte: managed page without "
3312 			      "PG_PVLIST for %#" PRIxVADDR, va);
3313 #endif
3314 		return true;
3315 	}
3316 
3317 	pg = PHYS_TO_VM_PAGE(pmap_pte2pa(opte));
3318 
3319 	KASSERTMSG(pg != NULL, "pmap_remove_pte: unmanaged page marked "
3320 	    "PG_PVLIST, va = %#" PRIxVADDR ", pa = %#" PRIxPADDR,
3321 	    va, (paddr_t)pmap_pte2pa(opte));
3322 
3323 	KASSERT(uvm_page_locked_p(pg));
3324 
3325 	/* Sync R/M bits. */
3326 	pp = VM_PAGE_TO_PP(pg);
3327 	pp->pp_attrs |= opte;
3328 	pve = pmap_remove_pv(pp, ptp, va);
3329 
3330 	if (pve) {
3331 		pve->pve_next = *pv_tofree;
3332 		*pv_tofree = pve;
3333 	}
3334 	return true;
3335 }
3336 
3337 /*
3338  * pmap_remove: mapping removal function.
3339  *
3340  * => caller should not be holding any pmap locks
3341  */
3342 
3343 void
3344 pmap_remove(struct pmap *pmap, vaddr_t sva, vaddr_t eva)
3345 {
3346 	pt_entry_t *ptes;
3347 	pd_entry_t pde;
3348 	pd_entry_t * const *pdes;
3349 	struct pv_entry *pv_tofree = NULL;
3350 	bool result;
3351 	int i;
3352 	paddr_t ptppa;
3353 	vaddr_t blkendva, va = sva;
3354 	struct vm_page *ptp;
3355 	struct pmap *pmap2;
3356 
3357 	kpreempt_disable();
3358 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);	/* locks pmap */
3359 
3360 	/*
3361 	 * removing one page?  take shortcut function.
3362 	 */
3363 
3364 	if (va + PAGE_SIZE == eva) {
3365 		if (pmap_pdes_valid(va, pdes, &pde)) {
3366 
3367 			/* PA of the PTP */
3368 			ptppa = pmap_pte2pa(pde);
3369 
3370 			/* Get PTP if non-kernel mapping. */
3371 			if (pmap != pmap_kernel()) {
3372 				ptp = pmap_find_ptp(pmap, va, ptppa, 1);
3373 				KASSERTMSG(ptp != NULL,
3374 				    "pmap_remove: unmanaged PTP detected");
3375 			} else {
3376 				/* Never free kernel PTPs. */
3377 				ptp = NULL;
3378 			}
3379 
3380 			result = pmap_remove_pte(pmap, ptp,
3381 			    &ptes[pl1_i(va)], va, &pv_tofree);
3382 
3383 			/*
3384 			 * if mapping removed and the PTP is no longer
3385 			 * being used, free it!
3386 			 */
3387 
3388 			if (result && ptp && ptp->wire_count <= 1)
3389 				pmap_free_ptp(pmap, ptp, va, ptes, pdes);
3390 		}
3391 	} else for (/* null */ ; va < eva ; va = blkendva) {
3392 		int lvl;
3393 
3394 		/* determine range of block */
3395 		blkendva = x86_round_pdr(va+1);
3396 		if (blkendva > eva)
3397 			blkendva = eva;
3398 
3399 		/*
3400 		 * XXXCDC: our PTE mappings should never be removed
3401 		 * with pmap_remove!  if we allow this (and why would
3402 		 * we?) then we end up freeing the pmap's page
3403 		 * directory page (PDP) before we are finished using
3404 		 * it when we hit in in the recursive mapping.  this
3405 		 * is BAD.
3406 		 *
3407 		 * long term solution is to move the PTEs out of user
3408 		 * address space.  and into kernel address space (up
3409 		 * with APTE).  then we can set VM_MAXUSER_ADDRESS to
3410 		 * be VM_MAX_ADDRESS.
3411 		 */
3412 
3413 		/* XXXCDC: ugly hack to avoid freeing PDP here */
3414 		for (i = 0; i < PDP_SIZE; i++) {
3415 			if (pl_i(va, PTP_LEVELS) == PDIR_SLOT_PTE+i)
3416 				continue;
3417 		}
3418 
3419 		lvl = pmap_pdes_invalid(va, pdes, &pde);
3420 		if (lvl != 0) {
3421 			/*
3422 			 * skip a range corresponding to an invalid pde.
3423 			 */
3424 			blkendva = (va & ptp_masks[lvl - 1]) + nbpd[lvl - 1];
3425  			continue;
3426 		}
3427 
3428 		/* PA of the PTP */
3429 		ptppa = pmap_pte2pa(pde);
3430 
3431 		/* Get PTP if non-kernel mapping. */
3432 		if (pmap != pmap_kernel()) {
3433 			ptp = pmap_find_ptp(pmap, va, ptppa, 1);
3434 			KASSERTMSG(ptp != NULL,
3435 			    "pmap_remove: unmanaged PTP detected");
3436 		} else {
3437 			/* Never free kernel PTPs. */
3438 			ptp = NULL;
3439 		}
3440 
3441 		pmap_remove_ptes(pmap, ptp, (vaddr_t)&ptes[pl1_i(va)], va,
3442 		    blkendva, &pv_tofree);
3443 
3444 		/* if PTP is no longer being used, free it! */
3445 		if (ptp && ptp->wire_count <= 1) {
3446 			pmap_free_ptp(pmap, ptp, va, ptes, pdes);
3447 		}
3448 	}
3449 	pmap_unmap_ptes(pmap, pmap2);		/* unlock pmap */
3450 	kpreempt_enable();
3451 
3452 	/* Now we free unused PVs */
3453 	if (pv_tofree)
3454 		pmap_free_pvs(pv_tofree);
3455 }
3456 
3457 /*
3458  * pmap_sync_pv: clear pte bits and return the old value of the pte.
3459  *
3460  * => Caller should disable kernel preemption.
3461  * => issues tlb shootdowns if necessary.
3462  */
3463 
3464 static int
3465 pmap_sync_pv(struct pv_pte *pvpte, pt_entry_t expect, int clearbits,
3466     pt_entry_t *optep)
3467 {
3468 	struct pmap *pmap;
3469 	struct vm_page *ptp;
3470 	vaddr_t va;
3471 	pt_entry_t *ptep;
3472 	pt_entry_t opte;
3473 	pt_entry_t npte;
3474 	bool need_shootdown;
3475 
3476 	ptp = pvpte->pte_ptp;
3477 	va = pvpte->pte_va;
3478 	KASSERT(ptp == NULL || ptp->uobject != NULL);
3479 	KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset);
3480 	pmap = ptp_to_pmap(ptp);
3481 
3482 	KASSERT((expect & ~(PG_FRAME | PG_V)) == 0);
3483 	KASSERT((expect & PG_V) != 0);
3484 	KASSERT(clearbits == ~0 || (clearbits & ~(PG_M | PG_U | PG_RW)) == 0);
3485 	KASSERT(kpreempt_disabled());
3486 
3487 	ptep = pmap_map_pte(pmap, ptp, va);
3488 	do {
3489 		opte = *ptep;
3490 		KASSERT((opte & (PG_M | PG_U)) != PG_M);
3491 		KASSERT((opte & (PG_U | PG_V)) != PG_U);
3492 		KASSERT(opte == 0 || (opte & PG_V) != 0);
3493 		if ((opte & (PG_FRAME | PG_V)) != expect) {
3494 
3495 			/*
3496 			 * we lost a race with a V->P operation like
3497 			 * pmap_remove().  wait for the competitor
3498 			 * reflecting pte bits into mp_attrs.
3499 			 *
3500 			 * issue a redundant TLB shootdown so that
3501 			 * we can wait for its completion.
3502 			 */
3503 
3504 			pmap_unmap_pte();
3505 			if (clearbits != 0) {
3506 				pmap_tlb_shootdown(pmap, va,
3507 				    (pmap == pmap_kernel() ? PG_G : 0),
3508 				    TLBSHOOT_SYNC_PV1);
3509 			}
3510 			return EAGAIN;
3511 		}
3512 
3513 		/*
3514 		 * check if there's anything to do on this pte.
3515 		 */
3516 
3517 		if ((opte & clearbits) == 0) {
3518 			need_shootdown = false;
3519 			break;
3520 		}
3521 
3522 		/*
3523 		 * we need a shootdown if the pte is cached. (PG_U)
3524 		 *
3525 		 * ...unless we are clearing only the PG_RW bit and
3526 		 * it isn't cached as RW. (PG_M)
3527 		 */
3528 
3529 		need_shootdown = (opte & PG_U) != 0 &&
3530 		    !(clearbits == PG_RW && (opte & PG_M) == 0);
3531 
3532 		npte = opte & ~clearbits;
3533 
3534 		/*
3535 		 * if we need a shootdown anyway, clear PG_U and PG_M.
3536 		 */
3537 
3538 		if (need_shootdown) {
3539 			npte &= ~(PG_U | PG_M);
3540 		}
3541 		KASSERT((npte & (PG_M | PG_U)) != PG_M);
3542 		KASSERT((npte & (PG_U | PG_V)) != PG_U);
3543 		KASSERT(npte == 0 || (opte & PG_V) != 0);
3544 	} while (pmap_pte_cas(ptep, opte, npte) != opte);
3545 
3546 	if (need_shootdown) {
3547 		pmap_tlb_shootdown(pmap, va, opte, TLBSHOOT_SYNC_PV2);
3548 	}
3549 	pmap_unmap_pte();
3550 
3551 	*optep = opte;
3552 	return 0;
3553 }
3554 
3555 /*
3556  * pmap_page_remove: remove a managed vm_page from all pmaps that map it
3557  *
3558  * => R/M bits are sync'd back to attrs
3559  */
3560 
3561 void
3562 pmap_page_remove(struct vm_page *pg)
3563 {
3564 	struct pmap_page *pp;
3565 	struct pv_pte *pvpte;
3566 	struct pv_entry *killlist = NULL;
3567 	struct vm_page *ptp;
3568 	pt_entry_t expect;
3569 	lwp_t *l;
3570 	int count;
3571 
3572 	KASSERT(uvm_page_locked_p(pg));
3573 
3574 	l = curlwp;
3575 	pp = VM_PAGE_TO_PP(pg);
3576 	expect = pmap_pa2pte(VM_PAGE_TO_PHYS(pg)) | PG_V;
3577 	count = SPINLOCK_BACKOFF_MIN;
3578 	kpreempt_disable();
3579 startover:
3580 	while ((pvpte = pv_pte_first(pp)) != NULL) {
3581 		struct pmap *pmap;
3582 		struct pv_entry *pve;
3583 		pt_entry_t opte;
3584 		vaddr_t va;
3585 		int error;
3586 
3587 		/*
3588 		 * add a reference to the pmap before clearing the pte.
3589 		 * otherwise the pmap can disappear behind us.
3590 		 */
3591 
3592 		ptp = pvpte->pte_ptp;
3593 		pmap = ptp_to_pmap(ptp);
3594 		if (ptp != NULL) {
3595 			pmap_reference(pmap);
3596 		}
3597 
3598 		error = pmap_sync_pv(pvpte, expect, ~0, &opte);
3599 		if (error == EAGAIN) {
3600 			int hold_count;
3601 			KERNEL_UNLOCK_ALL(curlwp, &hold_count);
3602 			if (ptp != NULL) {
3603 				pmap_destroy(pmap);
3604 			}
3605 			SPINLOCK_BACKOFF(count);
3606 			KERNEL_LOCK(hold_count, curlwp);
3607 			goto startover;
3608 		}
3609 
3610 		pp->pp_attrs |= opte;
3611 		va = pvpte->pte_va;
3612 		pve = pmap_remove_pv(pp, ptp, va);
3613 
3614 		/* update the PTP reference count.  free if last reference. */
3615 		if (ptp != NULL) {
3616 			struct pmap *pmap2;
3617 			pt_entry_t *ptes;
3618 			pd_entry_t * const *pdes;
3619 
3620 			KASSERT(pmap != pmap_kernel());
3621 
3622 			pmap_tlb_shootnow();
3623 			pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
3624 			pmap_stats_update_bypte(pmap, 0, opte);
3625 			ptp->wire_count--;
3626 			if (ptp->wire_count <= 1) {
3627 				pmap_free_ptp(pmap, ptp, va, ptes, pdes);
3628 			}
3629 			pmap_unmap_ptes(pmap, pmap2);
3630 			pmap_destroy(pmap);
3631 		} else {
3632 			KASSERT(pmap == pmap_kernel());
3633 			pmap_stats_update_bypte(pmap, 0, opte);
3634 		}
3635 
3636 		if (pve != NULL) {
3637 			pve->pve_next = killlist;	/* mark it for death */
3638 			killlist = pve;
3639 		}
3640 	}
3641 	pmap_tlb_shootnow();
3642 	kpreempt_enable();
3643 
3644 	/* Now free unused pvs. */
3645 	pmap_free_pvs(killlist);
3646 }
3647 
3648 /*
3649  * p m a p   a t t r i b u t e  f u n c t i o n s
3650  * functions that test/change managed page's attributes
3651  * since a page can be mapped multiple times we must check each PTE that
3652  * maps it by going down the pv lists.
3653  */
3654 
3655 /*
3656  * pmap_test_attrs: test a page's attributes
3657  */
3658 
3659 bool
3660 pmap_test_attrs(struct vm_page *pg, unsigned testbits)
3661 {
3662 	struct pmap_page *pp;
3663 	struct pv_pte *pvpte;
3664 	pt_entry_t expect;
3665 	u_int result;
3666 
3667 	KASSERT(uvm_page_locked_p(pg));
3668 
3669 	pp = VM_PAGE_TO_PP(pg);
3670 	if ((pp->pp_attrs & testbits) != 0) {
3671 		return true;
3672 	}
3673 	expect = pmap_pa2pte(VM_PAGE_TO_PHYS(pg)) | PG_V;
3674 	kpreempt_disable();
3675 	for (pvpte = pv_pte_first(pp); pvpte; pvpte = pv_pte_next(pp, pvpte)) {
3676 		pt_entry_t opte;
3677 		int error;
3678 
3679 		if ((pp->pp_attrs & testbits) != 0) {
3680 			break;
3681 		}
3682 		error = pmap_sync_pv(pvpte, expect, 0, &opte);
3683 		if (error == 0) {
3684 			pp->pp_attrs |= opte;
3685 		}
3686 	}
3687 	result = pp->pp_attrs & testbits;
3688 	kpreempt_enable();
3689 
3690 	/*
3691 	 * note that we will exit the for loop with a non-null pve if
3692 	 * we have found the bits we are testing for.
3693 	 */
3694 
3695 	return result != 0;
3696 }
3697 
3698 /*
3699  * pmap_clear_attrs: clear the specified attribute for a page.
3700  *
3701  * => we return true if we cleared one of the bits we were asked to
3702  */
3703 
3704 bool
3705 pmap_clear_attrs(struct vm_page *pg, unsigned clearbits)
3706 {
3707 	struct pmap_page *pp;
3708 	struct pv_pte *pvpte;
3709 	u_int result;
3710 	pt_entry_t expect;
3711 	int count;
3712 
3713 	KASSERT(uvm_page_locked_p(pg));
3714 
3715 	pp = VM_PAGE_TO_PP(pg);
3716 	expect = pmap_pa2pte(VM_PAGE_TO_PHYS(pg)) | PG_V;
3717 	count = SPINLOCK_BACKOFF_MIN;
3718 	kpreempt_disable();
3719 startover:
3720 	for (pvpte = pv_pte_first(pp); pvpte; pvpte = pv_pte_next(pp, pvpte)) {
3721 		pt_entry_t opte;
3722 		int error;
3723 
3724 		error = pmap_sync_pv(pvpte, expect, clearbits, &opte);
3725 		if (error == EAGAIN) {
3726 			int hold_count;
3727 			KERNEL_UNLOCK_ALL(curlwp, &hold_count);
3728 			SPINLOCK_BACKOFF(count);
3729 			KERNEL_LOCK(hold_count, curlwp);
3730 			goto startover;
3731 		}
3732 		pp->pp_attrs |= opte;
3733 	}
3734 	result = pp->pp_attrs & clearbits;
3735 	pp->pp_attrs &= ~clearbits;
3736 	kpreempt_enable();
3737 
3738 	return result != 0;
3739 }
3740 
3741 
3742 /*
3743  * p m a p   p r o t e c t i o n   f u n c t i o n s
3744  */
3745 
3746 /*
3747  * pmap_page_protect: change the protection of all recorded mappings
3748  *	of a managed page
3749  *
3750  * => NOTE: this is an inline function in pmap.h
3751  */
3752 
3753 /* see pmap.h */
3754 
3755 /*
3756  * pmap_protect: set the protection in of the pages in a pmap
3757  *
3758  * => NOTE: this is an inline function in pmap.h
3759  */
3760 
3761 /* see pmap.h */
3762 
3763 /*
3764  * pmap_write_protect: write-protect pages in a pmap.
3765  */
3766 void
3767 pmap_write_protect(struct pmap *pmap, vaddr_t sva, vaddr_t eva, vm_prot_t prot)
3768 {
3769 	pt_entry_t *ptes;
3770 	pt_entry_t * const *pdes;
3771 	struct pmap *pmap2;
3772 	vaddr_t blockend, va;
3773 
3774 	KASSERT(curlwp->l_md.md_gc_pmap != pmap);
3775 
3776 	sva &= PG_FRAME;
3777 	eva &= PG_FRAME;
3778 
3779 	/* Acquire pmap. */
3780 	kpreempt_disable();
3781 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
3782 
3783 	for (va = sva ; va < eva ; va = blockend) {
3784 		pt_entry_t *spte, *epte;
3785 		int i;
3786 
3787 		blockend = x86_round_pdr(va + 1);
3788 		if (blockend > eva)
3789 			blockend = eva;
3790 
3791 		/*
3792 		 * XXXCDC: our PTE mappings should never be write-protected!
3793 		 *
3794 		 * long term solution is to move the PTEs out of user
3795 		 * address space.  and into kernel address space (up
3796 		 * with APTE).  then we can set VM_MAXUSER_ADDRESS to
3797 		 * be VM_MAX_ADDRESS.
3798 		 */
3799 
3800 		/* XXXCDC: ugly hack to avoid freeing PDP here */
3801 		for (i = 0; i < PDP_SIZE; i++) {
3802 			if (pl_i(va, PTP_LEVELS) == PDIR_SLOT_PTE+i)
3803 				continue;
3804 		}
3805 
3806 		/* Is it a valid block? */
3807 		if (!pmap_pdes_valid(va, pdes, NULL)) {
3808 			continue;
3809 		}
3810 		KASSERT(va < VM_MAXUSER_ADDRESS || va >= VM_MAX_ADDRESS);
3811 
3812 		spte = &ptes[pl1_i(va)];
3813 		epte = &ptes[pl1_i(blockend)];
3814 
3815 		for (/*null */; spte < epte ; spte++) {
3816 			pt_entry_t opte, npte;
3817 
3818 			do {
3819 				opte = *spte;
3820 				if ((~opte & (PG_RW | PG_V)) != 0) {
3821 					goto next;
3822 				}
3823 				npte = opte & ~PG_RW;
3824 			} while (pmap_pte_cas(spte, opte, npte) != opte);
3825 
3826 			if ((opte & PG_M) != 0) {
3827 				vaddr_t tva = x86_ptob(spte - ptes);
3828 				pmap_tlb_shootdown(pmap, tva, opte,
3829 				    TLBSHOOT_WRITE_PROTECT);
3830 			}
3831 next:;
3832 		}
3833 	}
3834 
3835 	/* Release pmap. */
3836 	pmap_unmap_ptes(pmap, pmap2);
3837 	kpreempt_enable();
3838 }
3839 
3840 /*
3841  * pmap_unwire: clear the wired bit in the PTE.
3842  *
3843  * => Mapping should already be present.
3844  */
3845 void
3846 pmap_unwire(struct pmap *pmap, vaddr_t va)
3847 {
3848 	pt_entry_t *ptes, *ptep, opte;
3849 	pd_entry_t * const *pdes;
3850 	struct pmap *pmap2;
3851 
3852 	/* Acquire pmap. */
3853 	kpreempt_disable();
3854 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
3855 
3856 	if (!pmap_pdes_valid(va, pdes, NULL)) {
3857 		panic("pmap_unwire: invalid PDE");
3858 	}
3859 
3860 	ptep = &ptes[pl1_i(va)];
3861 	opte = *ptep;
3862 	KASSERT(pmap_valid_entry(opte));
3863 
3864 	if (opte & PG_W) {
3865 		pt_entry_t npte = opte & ~PG_W;
3866 
3867 		opte = pmap_pte_testset(ptep, npte);
3868 		pmap_stats_update_bypte(pmap, npte, opte);
3869 	} else {
3870 		printf("pmap_unwire: wiring for pmap %p va 0x%lx "
3871 		    "did not change!\n", pmap, va);
3872 	}
3873 
3874 	/* Release pmap. */
3875 	pmap_unmap_ptes(pmap, pmap2);
3876 	kpreempt_enable();
3877 }
3878 
3879 /*
3880  * pmap_copy: copy mappings from one pmap to another
3881  *
3882  * => optional function
3883  * void pmap_copy(dst_pmap, src_pmap, dst_addr, len, src_addr)
3884  */
3885 
3886 /*
3887  * defined as macro in pmap.h
3888  */
3889 
3890 __strict_weak_alias(pmap_enter, pmap_enter_default);
3891 
3892 int
3893 pmap_enter_default(pmap_t pmap, vaddr_t va, paddr_t pa, vm_prot_t prot,
3894     u_int flags)
3895 {
3896 	return pmap_enter_ma(pmap, va, pa, pa, prot, flags, 0);
3897 }
3898 
3899 /*
3900  * pmap_enter: enter a mapping into a pmap
3901  *
3902  * => must be done "now" ... no lazy-evaluation
3903  * => we set pmap => pv_head locking
3904  */
3905 int
3906 pmap_enter_ma(struct pmap *pmap, vaddr_t va, paddr_t ma, paddr_t pa,
3907 	   vm_prot_t prot, u_int flags, int domid)
3908 {
3909 	pt_entry_t *ptes, opte, npte;
3910 	pt_entry_t *ptep;
3911 	pd_entry_t * const *pdes;
3912 	struct vm_page *ptp, *pg;
3913 	struct pmap_page *new_pp;
3914 	struct pmap_page *old_pp;
3915 	struct pv_entry *old_pve = NULL;
3916 	struct pv_entry *new_pve;
3917 	struct pv_entry *new_pve2;
3918 	int error;
3919 	bool wired = (flags & PMAP_WIRED) != 0;
3920 	struct pmap *pmap2;
3921 
3922 	KASSERT(pmap_initialized);
3923 	KASSERT(curlwp->l_md.md_gc_pmap != pmap);
3924 	KASSERT(va < VM_MAX_KERNEL_ADDRESS);
3925 	KASSERTMSG(va != (vaddr_t)PDP_BASE,
3926 	    "pmap_enter: trying to map over PDP!");
3927 	KASSERTMSG(va < VM_MIN_KERNEL_ADDRESS ||
3928 	    pmap_valid_entry(pmap->pm_pdir[pl_i(va, PTP_LEVELS)]),
3929 	    "pmap_enter: missing kernel PTP for VA %lx!", va);
3930 
3931 #ifdef XEN
3932 	KASSERT(domid == DOMID_SELF || pa == 0);
3933 #endif /* XEN */
3934 
3935 	npte = ma | protection_codes[prot] | PG_V;
3936 	npte |= pmap_pat_flags(flags);
3937 	if (wired)
3938 	        npte |= PG_W;
3939 	if (va < VM_MAXUSER_ADDRESS)
3940 		npte |= PG_u;
3941 	else if (va < VM_MAX_ADDRESS)
3942 		npte |= (PG_u | PG_RW);	/* XXXCDC: no longer needed? */
3943 	else
3944 		npte |= PG_k;
3945 	if (pmap == pmap_kernel())
3946 		npte |= pmap_pg_g;
3947 	if (flags & VM_PROT_ALL) {
3948 		npte |= PG_U;
3949 		if (flags & VM_PROT_WRITE) {
3950 			KASSERT((npte & PG_RW) != 0);
3951 			npte |= PG_M;
3952 		}
3953 	}
3954 
3955 #ifdef XEN
3956 	if (domid != DOMID_SELF)
3957 		pg = NULL;
3958 	else
3959 #endif
3960 		pg = PHYS_TO_VM_PAGE(pa);
3961 	if (pg != NULL) {
3962 		/* This is a managed page */
3963 		npte |= PG_PVLIST;
3964 		new_pp = VM_PAGE_TO_PP(pg);
3965 	} else {
3966 		new_pp = NULL;
3967 	}
3968 
3969 	/* get pves. */
3970 	new_pve = pool_cache_get(&pmap_pv_cache, PR_NOWAIT);
3971 	new_pve2 = pool_cache_get(&pmap_pv_cache, PR_NOWAIT);
3972 	if (new_pve == NULL || new_pve2 == NULL) {
3973 		if (flags & PMAP_CANFAIL) {
3974 			error = ENOMEM;
3975 			goto out2;
3976 		}
3977 		panic("pmap_enter: pve allocation failed");
3978 	}
3979 
3980 	kpreempt_disable();
3981 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);	/* locks pmap */
3982 	if (pmap == pmap_kernel()) {
3983 		ptp = NULL;
3984 	} else {
3985 		ptp = pmap_get_ptp(pmap, va, pdes);
3986 		if (ptp == NULL) {
3987 			pmap_unmap_ptes(pmap, pmap2);
3988 			if (flags & PMAP_CANFAIL) {
3989 				error = ENOMEM;
3990 				goto out;
3991 			}
3992 			panic("pmap_enter: get ptp failed");
3993 		}
3994 	}
3995 
3996 	/*
3997 	 * update the pte.
3998 	 */
3999 
4000 	ptep = &ptes[pl1_i(va)];
4001 	do {
4002 		opte = *ptep;
4003 
4004 		/*
4005 		 * if the same page, inherit PG_U and PG_M.
4006 		 */
4007 		if (((opte ^ npte) & (PG_FRAME | PG_V)) == 0) {
4008 			npte |= opte & (PG_U | PG_M);
4009 		}
4010 #if defined(XEN)
4011 		if (domid != DOMID_SELF) {
4012 			/* pmap_pte_cas with error handling */
4013 			int s = splvm();
4014 			if (opte != *ptep) {
4015 				splx(s);
4016 				continue;
4017 			}
4018 			error = xpq_update_foreign(
4019 			    vtomach((vaddr_t)ptep), npte, domid);
4020 			splx(s);
4021 			if (error) {
4022 				if (ptp != NULL && ptp->wire_count <= 1) {
4023 					pmap_free_ptp(pmap, ptp, va, ptes, pdes);
4024 				}
4025 				pmap_unmap_ptes(pmap, pmap2);
4026 				goto out;
4027 			}
4028 			break;
4029 		}
4030 #endif /* defined(XEN) */
4031 	} while (pmap_pte_cas(ptep, opte, npte) != opte);
4032 
4033 	/*
4034 	 * update statistics and PTP's reference count.
4035 	 */
4036 
4037 	pmap_stats_update_bypte(pmap, npte, opte);
4038 	if (ptp != NULL && !pmap_valid_entry(opte)) {
4039 		ptp->wire_count++;
4040 	}
4041 	KASSERT(ptp == NULL || ptp->wire_count > 1);
4042 
4043 	/*
4044 	 * if the same page, we can skip pv_entry handling.
4045 	 */
4046 
4047 	if (((opte ^ npte) & (PG_FRAME | PG_V)) == 0) {
4048 		KASSERT(((opte ^ npte) & PG_PVLIST) == 0);
4049 		goto same_pa;
4050 	}
4051 
4052 	/*
4053 	 * if old page is managed, remove pv_entry from its list.
4054 	 */
4055 
4056 	if ((~opte & (PG_V | PG_PVLIST)) == 0) {
4057 		pg = PHYS_TO_VM_PAGE(pmap_pte2pa(opte));
4058 
4059 		KASSERTMSG(pg != NULL, "pmap_enter: PG_PVLIST mapping with "
4060 		    "unmanaged page pa = 0x%" PRIx64 " (0x%" PRIx64 ")",
4061 		    (int64_t)pa, (int64_t)atop(pa));
4062 
4063 		KASSERT(uvm_page_locked_p(pg));
4064 
4065 		old_pp = VM_PAGE_TO_PP(pg);
4066 		old_pve = pmap_remove_pv(old_pp, ptp, va);
4067 		old_pp->pp_attrs |= opte;
4068 	}
4069 
4070 	/*
4071 	 * if new page is managed, insert pv_entry into its list.
4072 	 */
4073 
4074 	if (new_pp) {
4075 		new_pve = pmap_enter_pv(new_pp, new_pve, &new_pve2, ptp, va);
4076 	}
4077 
4078 same_pa:
4079 	pmap_unmap_ptes(pmap, pmap2);
4080 
4081 	/*
4082 	 * shootdown tlb if necessary.
4083 	 */
4084 
4085 	if ((~opte & (PG_V | PG_U)) == 0 &&
4086 	    ((opte ^ npte) & (PG_FRAME | PG_RW)) != 0) {
4087 		pmap_tlb_shootdown(pmap, va, opte, TLBSHOOT_ENTER);
4088 	}
4089 
4090 	error = 0;
4091 out:
4092 	kpreempt_enable();
4093 out2:
4094 	if (old_pve != NULL) {
4095 		pool_cache_put(&pmap_pv_cache, old_pve);
4096 	}
4097 	if (new_pve != NULL) {
4098 		pool_cache_put(&pmap_pv_cache, new_pve);
4099 	}
4100 	if (new_pve2 != NULL) {
4101 		pool_cache_put(&pmap_pv_cache, new_pve2);
4102 	}
4103 
4104 	return error;
4105 }
4106 
4107 static bool
4108 pmap_get_physpage(vaddr_t va, int level, paddr_t *paddrp)
4109 {
4110 	struct vm_page *ptp;
4111 	struct pmap *kpm = pmap_kernel();
4112 
4113 	if (!uvm.page_init_done) {
4114 
4115 		/*
4116 		 * we're growing the kernel pmap early (from
4117 		 * uvm_pageboot_alloc()).  this case must be
4118 		 * handled a little differently.
4119 		 */
4120 
4121 		if (!uvm_page_physget(paddrp))
4122 			panic("pmap_get_physpage: out of memory");
4123 #ifdef __HAVE_DIRECT_MAP
4124 		pagezero(PMAP_DIRECT_MAP(*paddrp));
4125 #else
4126 		kpreempt_disable();
4127 		pmap_pte_set(early_zero_pte,
4128 		    pmap_pa2pte(*paddrp) | PG_V | PG_RW | PG_k);
4129 		pmap_pte_flush();
4130 		pmap_update_pg((vaddr_t)early_zerop);
4131 		memset(early_zerop, 0, PAGE_SIZE);
4132 #if defined(DIAGNOSTIC) || defined (XEN)
4133 		pmap_pte_set(early_zero_pte, 0);
4134 		pmap_pte_flush();
4135 #endif /* defined(DIAGNOSTIC) */
4136 		kpreempt_enable();
4137 #endif
4138 	} else {
4139 		/* XXX */
4140 		ptp = uvm_pagealloc(NULL, 0, NULL,
4141 				    UVM_PGA_USERESERVE|UVM_PGA_ZERO);
4142 		if (ptp == NULL)
4143 			panic("pmap_get_physpage: out of memory");
4144 		ptp->flags &= ~PG_BUSY;
4145 		ptp->wire_count = 1;
4146 		*paddrp = VM_PAGE_TO_PHYS(ptp);
4147 	}
4148 	pmap_stats_update(kpm, 1, 0);
4149 	return true;
4150 }
4151 
4152 /*
4153  * Allocate the amount of specified ptps for a ptp level, and populate
4154  * all levels below accordingly, mapping virtual addresses starting at
4155  * kva.
4156  *
4157  * Used by pmap_growkernel.
4158  */
4159 static void
4160 pmap_alloc_level(pd_entry_t * const *pdes, vaddr_t kva, int lvl,
4161     long *needed_ptps)
4162 {
4163 	unsigned long i;
4164 	vaddr_t va;
4165 	paddr_t pa;
4166 	unsigned long index, endindex;
4167 	int level;
4168 	pd_entry_t *pdep;
4169 #ifdef XEN
4170 	int s = splvm(); /* protect xpq_* */
4171 #endif
4172 
4173 	for (level = lvl; level > 1; level--) {
4174 		if (level == PTP_LEVELS)
4175 			pdep = pmap_kernel()->pm_pdir;
4176 		else
4177 			pdep = pdes[level - 2];
4178 		va = kva;
4179 		index = pl_i_roundup(kva, level);
4180 		endindex = index + needed_ptps[level - 1] - 1;
4181 
4182 
4183 		for (i = index; i <= endindex; i++) {
4184 			pt_entry_t pte;
4185 
4186 			KASSERT(!pmap_valid_entry(pdep[i]));
4187 			pmap_get_physpage(va, level - 1, &pa);
4188 			pte = pmap_pa2pte(pa) | PG_k | PG_V | PG_RW;
4189 #ifdef XEN
4190 			pmap_pte_set(&pdep[i], pte);
4191 #if defined(PAE) || defined(__x86_64__)
4192 			if (level == PTP_LEVELS && i >= PDIR_SLOT_KERN) {
4193 				if (__predict_true(
4194 				    cpu_info_primary.ci_flags & CPUF_PRESENT)) {
4195 					/* update per-cpu PMDs on all cpus */
4196 					xen_kpm_sync(pmap_kernel(), i);
4197 				} else {
4198 					/*
4199 					 * too early; update primary CPU
4200 					 * PMD only (without locks)
4201 					 */
4202 #ifdef PAE
4203 					pd_entry_t *cpu_pdep =
4204 					    &cpu_info_primary.ci_kpm_pdir[l2tol2(i)];
4205 #endif
4206 #ifdef __x86_64__
4207 					pd_entry_t *cpu_pdep =
4208 						&cpu_info_primary.ci_kpm_pdir[i];
4209 #endif
4210 					pmap_pte_set(cpu_pdep, pte);
4211 				}
4212 			}
4213 #endif /* PAE || __x86_64__ */
4214 #else /* XEN */
4215 			pdep[i] = pte;
4216 #endif /* XEN */
4217 			KASSERT(level != PTP_LEVELS || nkptp[level - 1] +
4218 			    pl_i(VM_MIN_KERNEL_ADDRESS, level) == i);
4219 			nkptp[level - 1]++;
4220 			va += nbpd[level - 1];
4221 		}
4222 		pmap_pte_flush();
4223 	}
4224 #ifdef XEN
4225 	splx(s);
4226 #endif
4227 }
4228 
4229 /*
4230  * pmap_growkernel: increase usage of KVM space
4231  *
4232  * => we allocate new PTPs for the kernel and install them in all
4233  *	the pmaps on the system.
4234  */
4235 
4236 vaddr_t
4237 pmap_growkernel(vaddr_t maxkvaddr)
4238 {
4239 	struct pmap *kpm = pmap_kernel();
4240 #if !defined(XEN) || !defined(__x86_64__)
4241 	struct pmap *pm;
4242 #endif
4243 	int s, i;
4244 	long needed_kptp[PTP_LEVELS], target_nptp, old;
4245 	bool invalidate = false;
4246 
4247 	s = splvm();	/* to be safe */
4248 	mutex_enter(kpm->pm_lock);
4249 
4250 	if (maxkvaddr <= pmap_maxkvaddr) {
4251 		mutex_exit(kpm->pm_lock);
4252 		splx(s);
4253 		return pmap_maxkvaddr;
4254 	}
4255 
4256 	maxkvaddr = x86_round_pdr(maxkvaddr);
4257 	old = nkptp[PTP_LEVELS - 1];
4258 	/*
4259 	 * This loop could be optimized more, but pmap_growkernel()
4260 	 * is called infrequently.
4261 	 */
4262 	for (i = PTP_LEVELS - 1; i >= 1; i--) {
4263 		target_nptp = pl_i_roundup(maxkvaddr, i + 1) -
4264 		    pl_i_roundup(VM_MIN_KERNEL_ADDRESS, i + 1);
4265 		/*
4266 		 * XXX only need to check toplevel.
4267 		 */
4268 		if (target_nptp > nkptpmax[i])
4269 			panic("out of KVA space");
4270 		KASSERT(target_nptp >= nkptp[i]);
4271 		needed_kptp[i] = target_nptp - nkptp[i];
4272 	}
4273 
4274 	pmap_alloc_level(normal_pdes, pmap_maxkvaddr, PTP_LEVELS, needed_kptp);
4275 
4276 	/*
4277 	 * If the number of top level entries changed, update all
4278 	 * pmaps.
4279 	 */
4280 	if (needed_kptp[PTP_LEVELS - 1] != 0) {
4281 #ifdef XEN
4282 #ifdef __x86_64__
4283 		/* nothing, kernel entries are never entered in user pmap */
4284 #else /* __x86_64__ */
4285 		mutex_enter(&pmaps_lock);
4286 		LIST_FOREACH(pm, &pmaps, pm_list) {
4287 			int pdkidx;
4288 			for (pdkidx =  PDIR_SLOT_KERN + old;
4289 			    pdkidx < PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1];
4290 			    pdkidx++) {
4291 				pmap_pte_set(&pm->pm_pdir[pdkidx],
4292 				    kpm->pm_pdir[pdkidx]);
4293 			}
4294 			pmap_pte_flush();
4295 		}
4296 		mutex_exit(&pmaps_lock);
4297 #endif /* __x86_64__ */
4298 #else /* XEN */
4299 		unsigned newpdes;
4300 		newpdes = nkptp[PTP_LEVELS - 1] - old;
4301 		mutex_enter(&pmaps_lock);
4302 		LIST_FOREACH(pm, &pmaps, pm_list) {
4303 			memcpy(&pm->pm_pdir[PDIR_SLOT_KERN + old],
4304 			       &kpm->pm_pdir[PDIR_SLOT_KERN + old],
4305 			       newpdes * sizeof (pd_entry_t));
4306 		}
4307 		mutex_exit(&pmaps_lock);
4308 #endif
4309 		invalidate = true;
4310 	}
4311 	pmap_maxkvaddr = maxkvaddr;
4312 	mutex_exit(kpm->pm_lock);
4313 	splx(s);
4314 
4315 	if (invalidate && pmap_initialized) {
4316 		/* Invalidate the PDP cache. */
4317 		pool_cache_invalidate(&pmap_pdp_cache);
4318 	}
4319 
4320 	return maxkvaddr;
4321 }
4322 
4323 #ifdef DEBUG
4324 void pmap_dump(struct pmap *, vaddr_t, vaddr_t);
4325 
4326 /*
4327  * pmap_dump: dump all the mappings from a pmap
4328  *
4329  * => caller should not be holding any pmap locks
4330  */
4331 
4332 void
4333 pmap_dump(struct pmap *pmap, vaddr_t sva, vaddr_t eva)
4334 {
4335 	pt_entry_t *ptes, *pte;
4336 	pd_entry_t * const *pdes;
4337 	struct pmap *pmap2;
4338 	vaddr_t blkendva;
4339 
4340 	/*
4341 	 * if end is out of range truncate.
4342 	 * if (end == start) update to max.
4343 	 */
4344 
4345 	if (eva > VM_MAXUSER_ADDRESS || eva <= sva)
4346 		eva = VM_MAXUSER_ADDRESS;
4347 
4348 	/*
4349 	 * we lock in the pmap => pv_head direction
4350 	 */
4351 
4352 	kpreempt_disable();
4353 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);	/* locks pmap */
4354 
4355 	/*
4356 	 * dumping a range of pages: we dump in PTP sized blocks (4MB)
4357 	 */
4358 
4359 	for (/* null */ ; sva < eva ; sva = blkendva) {
4360 
4361 		/* determine range of block */
4362 		blkendva = x86_round_pdr(sva+1);
4363 		if (blkendva > eva)
4364 			blkendva = eva;
4365 
4366 		/* valid block? */
4367 		if (!pmap_pdes_valid(sva, pdes, NULL))
4368 			continue;
4369 
4370 		pte = &ptes[pl1_i(sva)];
4371 		for (/* null */; sva < blkendva ; sva += PAGE_SIZE, pte++) {
4372 			if (!pmap_valid_entry(*pte))
4373 				continue;
4374 			printf("va %#" PRIxVADDR " -> pa %#" PRIxPADDR
4375 			    " (pte=%#" PRIxPADDR ")\n",
4376 			    sva, (paddr_t)pmap_pte2pa(*pte), (paddr_t)*pte);
4377 		}
4378 	}
4379 	pmap_unmap_ptes(pmap, pmap2);
4380 	kpreempt_enable();
4381 }
4382 #endif
4383 
4384 /*
4385  * pmap_update: process deferred invalidations and frees.
4386  */
4387 
4388 void
4389 pmap_update(struct pmap *pmap)
4390 {
4391 	struct vm_page *empty_ptps;
4392 	lwp_t *l = curlwp;
4393 
4394 	/*
4395 	 * If we have torn down this pmap, invalidate non-global TLB
4396 	 * entries on any processors using it.
4397 	 */
4398 	KPREEMPT_DISABLE(l);
4399 	if (__predict_false(l->l_md.md_gc_pmap == pmap)) {
4400 		l->l_md.md_gc_pmap = NULL;
4401 		pmap_tlb_shootdown(pmap, (vaddr_t)-1LL, 0, TLBSHOOT_UPDATE);
4402 	}
4403 	/*
4404 	 * Initiate any pending TLB shootdowns.  Wait for them to
4405 	 * complete before returning control to the caller.
4406 	 */
4407 	pmap_tlb_shootnow();
4408 	KPREEMPT_ENABLE(l);
4409 
4410 	/*
4411 	 * Now that shootdowns are complete, process deferred frees,
4412 	 * but not from interrupt context.
4413 	 */
4414 	if (l->l_md.md_gc_ptp != NULL) {
4415 		KASSERT((l->l_pflag & LP_INTR) == 0);
4416 		if (cpu_intr_p()) {
4417 			return;
4418 		}
4419 		empty_ptps = l->l_md.md_gc_ptp;
4420 		l->l_md.md_gc_ptp = NULL;
4421 		pmap_free_ptps(empty_ptps);
4422 	}
4423 }
4424 
4425 #if PTP_LEVELS > 4
4426 #error "Unsupported number of page table mappings"
4427 #endif
4428 
4429 paddr_t
4430 pmap_init_tmp_pgtbl(paddr_t pg)
4431 {
4432 	static bool maps_loaded;
4433 	static const paddr_t x86_tmp_pml_paddr[] = {
4434 	    4 * PAGE_SIZE,
4435 	    5 * PAGE_SIZE,
4436 	    6 * PAGE_SIZE,
4437 	    7 * PAGE_SIZE
4438 	};
4439 	static vaddr_t x86_tmp_pml_vaddr[] = { 0, 0, 0, 0 };
4440 
4441 	pd_entry_t *tmp_pml, *kernel_pml;
4442 
4443 	int level;
4444 
4445 	if (!maps_loaded) {
4446 		for (level = 0; level < PTP_LEVELS; ++level) {
4447 			x86_tmp_pml_vaddr[level] =
4448 			    uvm_km_alloc(kernel_map, PAGE_SIZE, 0,
4449 			    UVM_KMF_VAONLY);
4450 
4451 			if (x86_tmp_pml_vaddr[level] == 0)
4452 				panic("mapping of real mode PML failed\n");
4453 			pmap_kenter_pa(x86_tmp_pml_vaddr[level],
4454 			    x86_tmp_pml_paddr[level],
4455 			    VM_PROT_READ | VM_PROT_WRITE, 0);
4456 			pmap_update(pmap_kernel());
4457 		}
4458 		maps_loaded = true;
4459 	}
4460 
4461 	/* Zero levels 1-3 */
4462 	for (level = 0; level < PTP_LEVELS - 1; ++level) {
4463 		tmp_pml = (void *)x86_tmp_pml_vaddr[level];
4464 		memset(tmp_pml, 0, PAGE_SIZE);
4465 	}
4466 
4467 	/* Copy PML4 */
4468 	kernel_pml = pmap_kernel()->pm_pdir;
4469 	tmp_pml = (void *)x86_tmp_pml_vaddr[PTP_LEVELS - 1];
4470 	memcpy(tmp_pml, kernel_pml, PAGE_SIZE);
4471 
4472 #ifdef PAE
4473 	/*
4474 	 * Use the last 4 entries of the L2 page as L3 PD entries. These
4475 	 * last entries are unlikely to be used for temporary mappings.
4476 	 * 508: maps 0->1GB (userland)
4477 	 * 509: unused
4478 	 * 510: unused
4479 	 * 511: maps 3->4GB (kernel)
4480 	 */
4481 	tmp_pml[508] = x86_tmp_pml_paddr[PTP_LEVELS - 1] | PG_V;
4482 	tmp_pml[509] = 0;
4483 	tmp_pml[510] = 0;
4484 	tmp_pml[511] = pmap_pdirpa(pmap_kernel(), PDIR_SLOT_KERN) | PG_V;
4485 #endif
4486 
4487 	for (level = PTP_LEVELS - 1; level > 0; --level) {
4488 		tmp_pml = (void *)x86_tmp_pml_vaddr[level];
4489 
4490 		tmp_pml[pl_i(pg, level + 1)] =
4491 		    (x86_tmp_pml_paddr[level - 1] & PG_FRAME) | PG_RW | PG_V;
4492 	}
4493 
4494 	tmp_pml = (void *)x86_tmp_pml_vaddr[0];
4495 	tmp_pml[pl_i(pg, 1)] = (pg & PG_FRAME) | PG_RW | PG_V;
4496 
4497 #ifdef PAE
4498 	/* Return the PA of the L3 page (entry 508 of the L2 page) */
4499 	return x86_tmp_pml_paddr[PTP_LEVELS - 1] + 508 * sizeof(pd_entry_t);
4500 #endif
4501 
4502 	return x86_tmp_pml_paddr[PTP_LEVELS - 1];
4503 }
4504 
4505 u_int
4506 x86_mmap_flags(paddr_t mdpgno)
4507 {
4508 	u_int nflag = (mdpgno >> X86_MMAP_FLAG_SHIFT) & X86_MMAP_FLAG_MASK;
4509 	u_int pflag = 0;
4510 
4511 	if (nflag & X86_MMAP_FLAG_PREFETCH)
4512 		pflag |= PMAP_WRITE_COMBINE;
4513 
4514 	return pflag;
4515 }
4516