xref: /netbsd-src/sys/arch/x86/x86/pmap.c (revision a5847cc334d9a7029f6352b847e9e8d71a0f9e0c)
1 /*	$NetBSD: pmap.c,v 1.141 2011/11/08 17:16:52 cherry Exp $	*/
2 
3 /*-
4  * Copyright (c) 2008, 2010 The NetBSD Foundation, Inc.
5  * All rights reserved.
6  *
7  * This code is derived from software contributed to The NetBSD Foundation
8  * by Andrew Doran.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29  * POSSIBILITY OF SUCH DAMAGE.
30  */
31 
32 /*
33  * Copyright (c) 2007 Manuel Bouyer.
34  *
35  * Redistribution and use in source and binary forms, with or without
36  * modification, are permitted provided that the following conditions
37  * are met:
38  * 1. Redistributions of source code must retain the above copyright
39  *    notice, this list of conditions and the following disclaimer.
40  * 2. Redistributions in binary form must reproduce the above copyright
41  *    notice, this list of conditions and the following disclaimer in the
42  *    documentation and/or other materials provided with the distribution.
43  *
44  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
45  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
46  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
47  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
48  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
49  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
50  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
51  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
52  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
53  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
54  *
55  */
56 
57 /*
58  * Copyright (c) 2006 Mathieu Ropert <mro@adviseo.fr>
59  *
60  * Permission to use, copy, modify, and distribute this software for any
61  * purpose with or without fee is hereby granted, provided that the above
62  * copyright notice and this permission notice appear in all copies.
63  *
64  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
65  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
66  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
67  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
68  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
69  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
70  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
71  */
72 
73 /*
74  * Copyright (c) 1997 Charles D. Cranor and Washington University.
75  * All rights reserved.
76  *
77  * Redistribution and use in source and binary forms, with or without
78  * modification, are permitted provided that the following conditions
79  * are met:
80  * 1. Redistributions of source code must retain the above copyright
81  *    notice, this list of conditions and the following disclaimer.
82  * 2. Redistributions in binary form must reproduce the above copyright
83  *    notice, this list of conditions and the following disclaimer in the
84  *    documentation and/or other materials provided with the distribution.
85  *
86  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
87  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
88  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
89  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
90  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
91  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
92  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
93  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
94  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
95  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
96  */
97 
98 /*
99  * Copyright 2001 (c) Wasabi Systems, Inc.
100  * All rights reserved.
101  *
102  * Written by Frank van der Linden for Wasabi Systems, Inc.
103  *
104  * Redistribution and use in source and binary forms, with or without
105  * modification, are permitted provided that the following conditions
106  * are met:
107  * 1. Redistributions of source code must retain the above copyright
108  *    notice, this list of conditions and the following disclaimer.
109  * 2. Redistributions in binary form must reproduce the above copyright
110  *    notice, this list of conditions and the following disclaimer in the
111  *    documentation and/or other materials provided with the distribution.
112  * 3. All advertising materials mentioning features or use of this software
113  *    must display the following acknowledgement:
114  *      This product includes software developed for the NetBSD Project by
115  *      Wasabi Systems, Inc.
116  * 4. The name of Wasabi Systems, Inc. may not be used to endorse
117  *    or promote products derived from this software without specific prior
118  *    written permission.
119  *
120  * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
121  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
122  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
123  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL WASABI SYSTEMS, INC
124  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
125  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
126  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
127  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
128  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
129  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
130  * POSSIBILITY OF SUCH DAMAGE.
131  */
132 
133 /*
134  * This is the i386 pmap modified and generalized to support x86-64
135  * as well. The idea is to hide the upper N levels of the page tables
136  * inside pmap_get_ptp, pmap_free_ptp and pmap_growkernel. The rest
137  * is mostly untouched, except that it uses some more generalized
138  * macros and interfaces.
139  *
140  * This pmap has been tested on the i386 as well, and it can be easily
141  * adapted to PAE.
142  *
143  * fvdl@wasabisystems.com 18-Jun-2001
144  */
145 
146 /*
147  * pmap.c: i386 pmap module rewrite
148  * Chuck Cranor <chuck@netbsd>
149  * 11-Aug-97
150  *
151  * history of this pmap module: in addition to my own input, i used
152  *    the following references for this rewrite of the i386 pmap:
153  *
154  * [1] the NetBSD i386 pmap.   this pmap appears to be based on the
155  *     BSD hp300 pmap done by Mike Hibler at University of Utah.
156  *     it was then ported to the i386 by William Jolitz of UUNET
157  *     Technologies, Inc.   Then Charles M. Hannum of the NetBSD
158  *     project fixed some bugs and provided some speed ups.
159  *
160  * [2] the FreeBSD i386 pmap.   this pmap seems to be the
161  *     Hibler/Jolitz pmap, as modified for FreeBSD by John S. Dyson
162  *     and David Greenman.
163  *
164  * [3] the Mach pmap.   this pmap, from CMU, seems to have migrated
165  *     between several processors.   the VAX version was done by
166  *     Avadis Tevanian, Jr., and Michael Wayne Young.    the i386
167  *     version was done by Lance Berc, Mike Kupfer, Bob Baron,
168  *     David Golub, and Richard Draves.    the alpha version was
169  *     done by Alessandro Forin (CMU/Mach) and Chris Demetriou
170  *     (NetBSD/alpha).
171  */
172 
173 #include <sys/cdefs.h>
174 __KERNEL_RCSID(0, "$NetBSD: pmap.c,v 1.141 2011/11/08 17:16:52 cherry Exp $");
175 
176 #include "opt_user_ldt.h"
177 #include "opt_lockdebug.h"
178 #include "opt_multiprocessor.h"
179 #include "opt_xen.h"
180 #if !defined(__x86_64__)
181 #include "opt_kstack_dr0.h"
182 #endif /* !defined(__x86_64__) */
183 
184 #include <sys/param.h>
185 #include <sys/systm.h>
186 #include <sys/proc.h>
187 #include <sys/pool.h>
188 #include <sys/kernel.h>
189 #include <sys/atomic.h>
190 #include <sys/cpu.h>
191 #include <sys/intr.h>
192 #include <sys/xcall.h>
193 
194 #include <uvm/uvm.h>
195 
196 #include <dev/isa/isareg.h>
197 
198 #include <machine/specialreg.h>
199 #include <machine/gdt.h>
200 #include <machine/isa_machdep.h>
201 #include <machine/cpuvar.h>
202 
203 #include <x86/pmap.h>
204 #include <x86/pmap_pv.h>
205 
206 #include <x86/i82489reg.h>
207 #include <x86/i82489var.h>
208 
209 #ifdef XEN
210 #include <xen/xen3-public/xen.h>
211 #include <xen/hypervisor.h>
212 #endif
213 
214 /*
215  * general info:
216  *
217  *  - for an explanation of how the i386 MMU hardware works see
218  *    the comments in <machine/pte.h>.
219  *
220  *  - for an explanation of the general memory structure used by
221  *    this pmap (including the recursive mapping), see the comments
222  *    in <machine/pmap.h>.
223  *
224  * this file contains the code for the "pmap module."   the module's
225  * job is to manage the hardware's virtual to physical address mappings.
226  * note that there are two levels of mapping in the VM system:
227  *
228  *  [1] the upper layer of the VM system uses vm_map's and vm_map_entry's
229  *      to map ranges of virtual address space to objects/files.  for
230  *      example, the vm_map may say: "map VA 0x1000 to 0x22000 read-only
231  *      to the file /bin/ls starting at offset zero."   note that
232  *      the upper layer mapping is not concerned with how individual
233  *      vm_pages are mapped.
234  *
235  *  [2] the lower layer of the VM system (the pmap) maintains the mappings
236  *      from virtual addresses.   it is concerned with which vm_page is
237  *      mapped where.   for example, when you run /bin/ls and start
238  *      at page 0x1000 the fault routine may lookup the correct page
239  *      of the /bin/ls file and then ask the pmap layer to establish
240  *      a mapping for it.
241  *
242  * note that information in the lower layer of the VM system can be
243  * thrown away since it can easily be reconstructed from the info
244  * in the upper layer.
245  *
246  * data structures we use include:
247  *
248  *  - struct pmap: describes the address space of one thread
249  *  - struct pv_entry: describes one <PMAP,VA> mapping of a PA
250  *  - struct pv_head: there is one pv_head per managed page of
251  *	physical memory.   the pv_head points to a list of pv_entry
252  *	structures which describe all the <PMAP,VA> pairs that this
253  *      page is mapped in.    this is critical for page based operations
254  *      such as pmap_page_protect() [change protection on _all_ mappings
255  *      of a page]
256  */
257 
258 /*
259  * memory allocation
260  *
261  *  - there are three data structures that we must dynamically allocate:
262  *
263  * [A] new process' page directory page (PDP)
264  *	- plan 1: done at pmap_create() we use
265  *	  uvm_km_alloc(kernel_map, PAGE_SIZE)  [fka kmem_alloc] to do this
266  *	  allocation.
267  *
268  * if we are low in free physical memory then we sleep in
269  * uvm_km_alloc -- in this case this is ok since we are creating
270  * a new pmap and should not be holding any locks.
271  *
272  * if the kernel is totally out of virtual space
273  * (i.e. uvm_km_alloc returns NULL), then we panic.
274  *
275  * [B] new page tables pages (PTP)
276  * 	- call uvm_pagealloc()
277  * 		=> success: zero page, add to pm_pdir
278  * 		=> failure: we are out of free vm_pages, let pmap_enter()
279  *		   tell UVM about it.
280  *
281  * note: for kernel PTPs, we start with NKPTP of them.   as we map
282  * kernel memory (at uvm_map time) we check to see if we've grown
283  * the kernel pmap.   if so, we call the optional function
284  * pmap_growkernel() to grow the kernel PTPs in advance.
285  *
286  * [C] pv_entry structures
287  */
288 
289 /*
290  * locking
291  *
292  * we have the following locks that we must contend with:
293  *
294  * mutexes:
295  *
296  * - pmap lock (per pmap, part of uvm_object)
297  *   this lock protects the fields in the pmap structure including
298  *   the non-kernel PDEs in the PDP, and the PTEs.  it also locks
299  *   in the alternate PTE space (since that is determined by the
300  *   entry in the PDP).
301  *
302  * - pvh_lock (per pv_head)
303  *   this lock protects the pv_entry list which is chained off the
304  *   pv_head structure for a specific managed PA.   it is locked
305  *   when traversing the list (e.g. adding/removing mappings,
306  *   syncing R/M bits, etc.)
307  *
308  * - pmaps_lock
309  *   this lock protects the list of active pmaps (headed by "pmaps").
310  *   we lock it when adding or removing pmaps from this list.
311  */
312 
313 const vaddr_t ptp_masks[] = PTP_MASK_INITIALIZER;
314 const int ptp_shifts[] = PTP_SHIFT_INITIALIZER;
315 const long nkptpmax[] = NKPTPMAX_INITIALIZER;
316 const long nbpd[] = NBPD_INITIALIZER;
317 pd_entry_t * const normal_pdes[] = PDES_INITIALIZER;
318 
319 long nkptp[] = NKPTP_INITIALIZER;
320 
321 struct pmap_head pmaps;
322 kmutex_t pmaps_lock;
323 
324 static vaddr_t pmap_maxkvaddr;
325 
326 /*
327  * XXX kludge: dummy locking to make KASSERTs in uvm_page.c comfortable.
328  * actual locking is done by pm_lock.
329  */
330 #if defined(DIAGNOSTIC)
331 #define	PMAP_SUBOBJ_LOCK(pm, idx) \
332 	KASSERT(mutex_owned((pm)->pm_lock)); \
333 	if ((idx) != 0) \
334 		mutex_enter((pm)->pm_obj[(idx)].vmobjlock)
335 #define	PMAP_SUBOBJ_UNLOCK(pm, idx) \
336 	KASSERT(mutex_owned((pm)->pm_lock)); \
337 	if ((idx) != 0) \
338 		mutex_exit((pm)->pm_obj[(idx)].vmobjlock)
339 #else /* defined(DIAGNOSTIC) */
340 #define	PMAP_SUBOBJ_LOCK(pm, idx)	/* nothing */
341 #define	PMAP_SUBOBJ_UNLOCK(pm, idx)	/* nothing */
342 #endif /* defined(DIAGNOSTIC) */
343 
344 /*
345  * Misc. event counters.
346  */
347 struct evcnt pmap_iobmp_evcnt;
348 struct evcnt pmap_ldt_evcnt;
349 
350 /*
351  * PAT
352  */
353 #define	PATENTRY(n, type)	(type << ((n) * 8))
354 #define	PAT_UC		0x0ULL
355 #define	PAT_WC		0x1ULL
356 #define	PAT_WT		0x4ULL
357 #define	PAT_WP		0x5ULL
358 #define	PAT_WB		0x6ULL
359 #define	PAT_UCMINUS	0x7ULL
360 
361 static bool cpu_pat_enabled __read_mostly = false;
362 
363 /*
364  * global data structures
365  */
366 
367 static struct pmap kernel_pmap_store;	/* the kernel's pmap (proc0) */
368 struct pmap *const kernel_pmap_ptr = &kernel_pmap_store;
369 
370 /*
371  * pmap_pg_g: if our processor supports PG_G in the PTE then we
372  * set pmap_pg_g to PG_G (otherwise it is zero).
373  */
374 
375 int pmap_pg_g __read_mostly = 0;
376 
377 /*
378  * pmap_largepages: if our processor supports PG_PS and we are
379  * using it, this is set to true.
380  */
381 
382 int pmap_largepages __read_mostly;
383 
384 /*
385  * i386 physical memory comes in a big contig chunk with a small
386  * hole toward the front of it...  the following two paddr_t's
387  * (shared with machdep.c) describe the physical address space
388  * of this machine.
389  */
390 paddr_t avail_start __read_mostly; /* PA of first available physical page */
391 paddr_t avail_end __read_mostly; /* PA of last available physical page */
392 
393 #ifdef XEN
394 #ifdef __x86_64__
395 /* Dummy PGD for user cr3, used between pmap_deactivate() and pmap_activate() */
396 static paddr_t xen_dummy_user_pgd;
397 #endif /* __x86_64__ */
398 paddr_t pmap_pa_start; /* PA of first physical page for this domain */
399 paddr_t pmap_pa_end;   /* PA of last physical page for this domain */
400 #endif /* XEN */
401 
402 #define	VM_PAGE_TO_PP(pg)	(&(pg)->mdpage.mp_pp)
403 
404 #define	PV_HASH_SIZE		32768
405 #define	PV_HASH_LOCK_CNT	32
406 
407 struct pv_hash_lock {
408 	kmutex_t lock;
409 } __aligned(CACHE_LINE_SIZE) pv_hash_locks[PV_HASH_LOCK_CNT]
410     __aligned(CACHE_LINE_SIZE);
411 
412 struct pv_hash_head {
413 	SLIST_HEAD(, pv_entry) hh_list;
414 } pv_hash_heads[PV_HASH_SIZE];
415 
416 static u_int
417 pvhash_hash(struct vm_page *ptp, vaddr_t va)
418 {
419 
420 	return (uintptr_t)ptp / sizeof(*ptp) + (va >> PAGE_SHIFT);
421 }
422 
423 static struct pv_hash_head *
424 pvhash_head(u_int hash)
425 {
426 
427 	return &pv_hash_heads[hash % PV_HASH_SIZE];
428 }
429 
430 static kmutex_t *
431 pvhash_lock(u_int hash)
432 {
433 
434 	return &pv_hash_locks[hash % PV_HASH_LOCK_CNT].lock;
435 }
436 
437 static struct pv_entry *
438 pvhash_remove(struct pv_hash_head *hh, struct vm_page *ptp, vaddr_t va)
439 {
440 	struct pv_entry *pve;
441 	struct pv_entry *prev;
442 
443 	prev = NULL;
444 	SLIST_FOREACH(pve, &hh->hh_list, pve_hash) {
445 		if (pve->pve_pte.pte_ptp == ptp &&
446 		    pve->pve_pte.pte_va == va) {
447 			if (prev != NULL) {
448 				SLIST_REMOVE_AFTER(prev, pve_hash);
449 			} else {
450 				SLIST_REMOVE_HEAD(&hh->hh_list, pve_hash);
451 			}
452 			break;
453 		}
454 		prev = pve;
455 	}
456 	return pve;
457 }
458 
459 /*
460  * other data structures
461  */
462 
463 static pt_entry_t protection_codes[8] __read_mostly; /* maps MI prot to i386
464 							prot code */
465 static bool pmap_initialized __read_mostly = false; /* pmap_init done yet? */
466 
467 /*
468  * the following two vaddr_t's are used during system startup
469  * to keep track of how much of the kernel's VM space we have used.
470  * once the system is started, the management of the remaining kernel
471  * VM space is turned over to the kernel_map vm_map.
472  */
473 
474 static vaddr_t virtual_avail __read_mostly;	/* VA of first free KVA */
475 static vaddr_t virtual_end __read_mostly;	/* VA of last free KVA */
476 
477 /*
478  * pool that pmap structures are allocated from
479  */
480 
481 static struct pool_cache pmap_cache;
482 
483 /*
484  * pv_entry cache
485  */
486 
487 static struct pool_cache pmap_pv_cache;
488 
489 /*
490  * MULTIPROCESSOR: special VA's/ PTE's are actually allocated inside a
491  * maxcpus*NPTECL array of PTE's, to avoid cache line thrashing
492  * due to false sharing.
493  */
494 
495 #ifdef MULTIPROCESSOR
496 #define PTESLEW(pte, id) ((pte)+(id)*NPTECL)
497 #define VASLEW(va,id) ((va)+(id)*NPTECL*PAGE_SIZE)
498 #else
499 #define PTESLEW(pte, id) (pte)
500 #define VASLEW(va,id) (va)
501 #endif
502 
503 /*
504  * special VAs and the PTEs that map them
505  */
506 static pt_entry_t *csrc_pte, *cdst_pte, *zero_pte, *ptp_pte, *early_zero_pte;
507 static char *csrcp, *cdstp, *zerop, *ptpp, *early_zerop;
508 
509 int pmap_enter_default(pmap_t, vaddr_t, paddr_t, vm_prot_t, u_int);
510 
511 /*
512  * pool and cache that PDPs are allocated from
513  */
514 
515 static struct pool_cache pmap_pdp_cache;
516 int	pmap_pdp_ctor(void *, void *, int);
517 void	pmap_pdp_dtor(void *, void *);
518 #ifdef PAE
519 /* need to allocate items of 4 pages */
520 void *pmap_pdp_alloc(struct pool *, int);
521 void pmap_pdp_free(struct pool *, void *);
522 static struct pool_allocator pmap_pdp_allocator = {
523 	.pa_alloc = pmap_pdp_alloc,
524 	.pa_free = pmap_pdp_free,
525 	.pa_pagesz = PAGE_SIZE * PDP_SIZE,
526 };
527 #endif /* PAE */
528 
529 extern vaddr_t idt_vaddr;			/* we allocate IDT early */
530 extern paddr_t idt_paddr;
531 
532 #ifdef _LP64
533 extern vaddr_t lo32_vaddr;
534 extern vaddr_t lo32_paddr;
535 #endif
536 
537 extern int end;
538 
539 #ifdef i386
540 /* stuff to fix the pentium f00f bug */
541 extern vaddr_t pentium_idt_vaddr;
542 #endif
543 
544 
545 /*
546  * local prototypes
547  */
548 
549 static struct vm_page	*pmap_get_ptp(struct pmap *, vaddr_t,
550 				      pd_entry_t * const *);
551 static struct vm_page	*pmap_find_ptp(struct pmap *, vaddr_t, paddr_t, int);
552 static void		 pmap_freepage(struct pmap *, struct vm_page *, int);
553 static void		 pmap_free_ptp(struct pmap *, struct vm_page *,
554 				       vaddr_t, pt_entry_t *,
555 				       pd_entry_t * const *);
556 static bool		 pmap_is_active(struct pmap *, struct cpu_info *, bool);
557 static bool		 pmap_remove_pte(struct pmap *, struct vm_page *,
558 					 pt_entry_t *, vaddr_t,
559 					 struct pv_entry **);
560 static void		 pmap_remove_ptes(struct pmap *, struct vm_page *,
561 					  vaddr_t, vaddr_t, vaddr_t,
562 					  struct pv_entry **);
563 
564 static bool		 pmap_get_physpage(vaddr_t, int, paddr_t *);
565 static void		 pmap_alloc_level(pd_entry_t * const *, vaddr_t, int,
566 					  long *);
567 
568 static bool		 pmap_reactivate(struct pmap *);
569 
570 /*
571  * p m a p   h e l p e r   f u n c t i o n s
572  */
573 
574 static inline void
575 pmap_stats_update(struct pmap *pmap, int resid_diff, int wired_diff)
576 {
577 
578 	if (pmap == pmap_kernel()) {
579 		atomic_add_long(&pmap->pm_stats.resident_count, resid_diff);
580 		atomic_add_long(&pmap->pm_stats.wired_count, wired_diff);
581 	} else {
582 		KASSERT(mutex_owned(pmap->pm_lock));
583 		pmap->pm_stats.resident_count += resid_diff;
584 		pmap->pm_stats.wired_count += wired_diff;
585 	}
586 }
587 
588 static inline void
589 pmap_stats_update_bypte(struct pmap *pmap, pt_entry_t npte, pt_entry_t opte)
590 {
591 	int resid_diff = ((npte & PG_V) ? 1 : 0) - ((opte & PG_V) ? 1 : 0);
592 	int wired_diff = ((npte & PG_W) ? 1 : 0) - ((opte & PG_W) ? 1 : 0);
593 
594 	KASSERT((npte & (PG_V | PG_W)) != PG_W);
595 	KASSERT((opte & (PG_V | PG_W)) != PG_W);
596 
597 	pmap_stats_update(pmap, resid_diff, wired_diff);
598 }
599 
600 /*
601  * ptp_to_pmap: lookup pmap by ptp
602  */
603 
604 static struct pmap *
605 ptp_to_pmap(struct vm_page *ptp)
606 {
607 	struct pmap *pmap;
608 
609 	if (ptp == NULL) {
610 		return pmap_kernel();
611 	}
612 	pmap = (struct pmap *)ptp->uobject;
613 	KASSERT(pmap != NULL);
614 	KASSERT(&pmap->pm_obj[0] == ptp->uobject);
615 	return pmap;
616 }
617 
618 static inline struct pv_pte *
619 pve_to_pvpte(struct pv_entry *pve)
620 {
621 
622 	KASSERT((void *)&pve->pve_pte == (void *)pve);
623 	return &pve->pve_pte;
624 }
625 
626 static inline struct pv_entry *
627 pvpte_to_pve(struct pv_pte *pvpte)
628 {
629 	struct pv_entry *pve = (void *)pvpte;
630 
631 	KASSERT(pve_to_pvpte(pve) == pvpte);
632 	return pve;
633 }
634 
635 /*
636  * pv_pte_first, pv_pte_next: PV list iterator.
637  */
638 
639 static struct pv_pte *
640 pv_pte_first(struct pmap_page *pp)
641 {
642 
643 	if ((pp->pp_flags & PP_EMBEDDED) != 0) {
644 		return &pp->pp_pte;
645 	}
646 	return pve_to_pvpte(LIST_FIRST(&pp->pp_head.pvh_list));
647 }
648 
649 static struct pv_pte *
650 pv_pte_next(struct pmap_page *pp, struct pv_pte *pvpte)
651 {
652 
653 	KASSERT(pvpte != NULL);
654 	if (pvpte == &pp->pp_pte) {
655 		KASSERT((pp->pp_flags & PP_EMBEDDED) != 0);
656 		return NULL;
657 	}
658 	KASSERT((pp->pp_flags & PP_EMBEDDED) == 0);
659 	return pve_to_pvpte(LIST_NEXT(pvpte_to_pve(pvpte), pve_list));
660 }
661 
662 /*
663  * pmap_is_curpmap: is this pmap the one currently loaded [in %cr3]?
664  *		of course the kernel is always loaded
665  */
666 
667 bool
668 pmap_is_curpmap(struct pmap *pmap)
669 {
670 #if defined(XEN) && defined(__x86_64__)
671 	/*
672 	 * Only kernel pmap is physically loaded.
673 	 * User PGD may be active, but TLB will be flushed
674 	 * with HYPERVISOR_iret anyway, so let's say no
675 	 */
676 	return(pmap == pmap_kernel());
677 #else /* XEN && __x86_64__*/
678 	return((pmap == pmap_kernel()) ||
679 	       (pmap == curcpu()->ci_pmap));
680 #endif
681 }
682 
683 /*
684  * pmap_is_active: is this pmap loaded into the specified processor's %cr3?
685  */
686 
687 inline static bool
688 pmap_is_active(struct pmap *pmap, struct cpu_info *ci, bool kernel)
689 {
690 
691 	return (pmap == pmap_kernel() ||
692 	    (pmap->pm_cpus & ci->ci_cpumask) != 0 ||
693 	    (kernel && (pmap->pm_kernel_cpus & ci->ci_cpumask) != 0));
694 }
695 
696 /*
697  *	Add a reference to the specified pmap.
698  */
699 
700 void
701 pmap_reference(struct pmap *pmap)
702 {
703 
704 	atomic_inc_uint(&pmap->pm_obj[0].uo_refs);
705 }
706 
707 #ifndef XEN
708 
709 /*
710  * pmap_map_ptes: map a pmap's PTEs into KVM and lock them in
711  *
712  * there are several pmaps involved.  some or all of them might be same.
713  *
714  *	- the pmap given by the first argument
715  *		our caller wants to access this pmap's PTEs.
716  *
717  *	- pmap_kernel()
718  *		the kernel pmap.  note that it only contains the kernel part
719  *		of the address space which is shared by any pmap.  ie. any
720  *		pmap can be used instead of pmap_kernel() for our purpose.
721  *
722  *	- ci->ci_pmap
723  *		pmap currently loaded on the cpu.
724  *
725  *	- vm_map_pmap(&curproc->p_vmspace->vm_map)
726  *		current process' pmap.
727  *
728  * => we lock enough pmaps to keep things locked in
729  * => must be undone with pmap_unmap_ptes before returning
730  */
731 
732 void
733 pmap_map_ptes(struct pmap *pmap, struct pmap **pmap2,
734 	      pd_entry_t **ptepp, pd_entry_t * const **pdeppp)
735 {
736 	struct pmap *curpmap;
737 	struct cpu_info *ci;
738 	uint32_t cpumask;
739 	lwp_t *l;
740 
741 	/* The kernel's pmap is always accessible. */
742 	if (pmap == pmap_kernel()) {
743 		*pmap2 = NULL;
744 		*ptepp = PTE_BASE;
745 		*pdeppp = normal_pdes;
746 		return;
747 	}
748 	KASSERT(kpreempt_disabled());
749 
750 	l = curlwp;
751  retry:
752 	mutex_enter(pmap->pm_lock);
753 	ci = curcpu();
754 	curpmap = ci->ci_pmap;
755 	if (vm_map_pmap(&l->l_proc->p_vmspace->vm_map) == pmap) {
756 		/* Our own pmap so just load it: easy. */
757 		if (__predict_false(ci->ci_want_pmapload)) {
758 			mutex_exit(pmap->pm_lock);
759 			pmap_load();
760 			goto retry;
761 		}
762 		KASSERT(pmap == curpmap);
763 	} else if (pmap == curpmap) {
764 		/*
765 		 * Already on the CPU: make it valid.  This is very
766 		 * often the case during exit(), when we have switched
767 		 * to the kernel pmap in order to destroy a user pmap.
768 		 */
769 		if (!pmap_reactivate(pmap)) {
770 			u_int gen = uvm_emap_gen_return();
771 			tlbflush();
772 			uvm_emap_update(gen);
773 		}
774 	} else {
775 		/*
776 		 * Toss current pmap from CPU, but keep a reference to it.
777 		 * The reference will be dropped by pmap_unmap_ptes().
778 		 * Can happen if we block during exit().
779 		 */
780 		cpumask = ci->ci_cpumask;
781 		atomic_and_32(&curpmap->pm_cpus, ~cpumask);
782 		atomic_and_32(&curpmap->pm_kernel_cpus, ~cpumask);
783 		ci->ci_pmap = pmap;
784 		ci->ci_tlbstate = TLBSTATE_VALID;
785 		atomic_or_32(&pmap->pm_cpus, cpumask);
786 		atomic_or_32(&pmap->pm_kernel_cpus, cpumask);
787 		cpu_load_pmap(pmap);
788 	}
789 	pmap->pm_ncsw = l->l_ncsw;
790 	*pmap2 = curpmap;
791 	*ptepp = PTE_BASE;
792 	*pdeppp = normal_pdes;
793 }
794 
795 /*
796  * pmap_unmap_ptes: unlock the PTE mapping of "pmap"
797  */
798 
799 void
800 pmap_unmap_ptes(struct pmap *pmap, struct pmap *pmap2)
801 {
802 	struct cpu_info *ci;
803 	struct pmap *mypmap;
804 
805 	KASSERT(kpreempt_disabled());
806 
807 	/* The kernel's pmap is always accessible. */
808 	if (pmap == pmap_kernel()) {
809 		return;
810 	}
811 
812 	/*
813 	 * We cannot tolerate context switches while mapped in.
814 	 * If it is our own pmap all we have to do is unlock.
815 	 */
816 	KASSERT(pmap->pm_ncsw == curlwp->l_ncsw);
817 	mypmap = vm_map_pmap(&curproc->p_vmspace->vm_map);
818 	if (pmap == mypmap) {
819 		mutex_exit(pmap->pm_lock);
820 		return;
821 	}
822 
823 	/*
824 	 * Mark whatever's on the CPU now as lazy and unlock.
825 	 * If the pmap was already installed, we are done.
826 	 */
827 	ci = curcpu();
828 	ci->ci_tlbstate = TLBSTATE_LAZY;
829 	ci->ci_want_pmapload = (mypmap != pmap_kernel());
830 	mutex_exit(pmap->pm_lock);
831 	if (pmap == pmap2) {
832 		return;
833 	}
834 
835 	/*
836 	 * We installed another pmap on the CPU.  Grab a reference to
837 	 * it and leave in place.  Toss the evicted pmap (can block).
838 	 */
839 	pmap_reference(pmap);
840 	pmap_destroy(pmap2);
841 }
842 
843 #endif
844 
845 inline static void
846 pmap_exec_account(struct pmap *pm, vaddr_t va, pt_entry_t opte, pt_entry_t npte)
847 {
848 
849 #if !defined(__x86_64__)
850 	if (curproc == NULL || curproc->p_vmspace == NULL ||
851 	    pm != vm_map_pmap(&curproc->p_vmspace->vm_map))
852 		return;
853 
854 	if ((opte ^ npte) & PG_X)
855 		pmap_update_pg(va);
856 
857 	/*
858 	 * Executability was removed on the last executable change.
859 	 * Reset the code segment to something conservative and
860 	 * let the trap handler deal with setting the right limit.
861 	 * We can't do that because of locking constraints on the vm map.
862 	 */
863 
864 	if ((opte & PG_X) && (npte & PG_X) == 0 && va == pm->pm_hiexec) {
865 		struct trapframe *tf = curlwp->l_md.md_regs;
866 
867 		tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL);
868 		pm->pm_hiexec = I386_MAX_EXE_ADDR;
869 	}
870 #endif /* !defined(__x86_64__) */
871 }
872 
873 #if !defined(__x86_64__)
874 /*
875  * Fixup the code segment to cover all potential executable mappings.
876  * returns 0 if no changes to the code segment were made.
877  */
878 
879 int
880 pmap_exec_fixup(struct vm_map *map, struct trapframe *tf, struct pcb *pcb)
881 {
882 	struct vm_map_entry *ent;
883 	struct pmap *pm = vm_map_pmap(map);
884 	vaddr_t va = 0;
885 
886 	vm_map_lock_read(map);
887 	for (ent = (&map->header)->next; ent != &map->header; ent = ent->next) {
888 
889 		/*
890 		 * This entry has greater va than the entries before.
891 		 * We need to make it point to the last page, not past it.
892 		 */
893 
894 		if (ent->protection & VM_PROT_EXECUTE)
895 			va = trunc_page(ent->end) - PAGE_SIZE;
896 	}
897 	vm_map_unlock_read(map);
898 	if (va == pm->pm_hiexec && tf->tf_cs == GSEL(GUCODEBIG_SEL, SEL_UPL))
899 		return (0);
900 
901 	pm->pm_hiexec = va;
902 	if (pm->pm_hiexec > I386_MAX_EXE_ADDR) {
903 		tf->tf_cs = GSEL(GUCODEBIG_SEL, SEL_UPL);
904 	} else {
905 		tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL);
906 		return (0);
907 	}
908 	return (1);
909 }
910 #endif /* !defined(__x86_64__) */
911 
912 void
913 pat_init(struct cpu_info *ci)
914 {
915 	uint64_t pat;
916 
917 	if (!(ci->ci_feat_val[0] & CPUID_PAT))
918 		return;
919 
920 	/* We change WT to WC. Leave all other entries the default values. */
921 	pat = PATENTRY(0, PAT_WB) | PATENTRY(1, PAT_WC) |
922 	      PATENTRY(2, PAT_UCMINUS) | PATENTRY(3, PAT_UC) |
923 	      PATENTRY(4, PAT_WB) | PATENTRY(5, PAT_WC) |
924 	      PATENTRY(6, PAT_UCMINUS) | PATENTRY(7, PAT_UC);
925 
926 	wrmsr(MSR_CR_PAT, pat);
927 	cpu_pat_enabled = true;
928 	aprint_debug_dev(ci->ci_dev, "PAT enabled\n");
929 }
930 
931 static pt_entry_t
932 pmap_pat_flags(u_int flags)
933 {
934 	u_int cacheflags = (flags & PMAP_CACHE_MASK);
935 
936 	if (!cpu_pat_enabled) {
937 		switch (cacheflags) {
938 		case PMAP_NOCACHE:
939 		case PMAP_NOCACHE_OVR:
940 			/* results in PGC_UCMINUS on cpus which have
941 			 * the cpuid PAT but PAT "disabled"
942 			 */
943 			return PG_N;
944 		default:
945 			return 0;
946 		}
947 	}
948 
949 	switch (cacheflags) {
950 	case PMAP_NOCACHE:
951 		return PGC_UC;
952 	case PMAP_WRITE_COMBINE:
953 		return PGC_WC;
954 	case PMAP_WRITE_BACK:
955 		return PGC_WB;
956 	case PMAP_NOCACHE_OVR:
957 		return PGC_UCMINUS;
958 	}
959 
960 	return 0;
961 }
962 
963 /*
964  * p m a p   k e n t e r   f u n c t i o n s
965  *
966  * functions to quickly enter/remove pages from the kernel address
967  * space.   pmap_kremove is exported to MI kernel.  we make use of
968  * the recursive PTE mappings.
969  */
970 
971 /*
972  * pmap_kenter_pa: enter a kernel mapping without R/M (pv_entry) tracking
973  *
974  * => no need to lock anything, assume va is already allocated
975  * => should be faster than normal pmap enter function
976  */
977 
978 void
979 pmap_kenter_pa(vaddr_t va, paddr_t pa, vm_prot_t prot, u_int flags)
980 {
981 	pt_entry_t *pte, opte, npte;
982 
983 	KASSERT(!(prot & ~VM_PROT_ALL));
984 
985 	if (va < VM_MIN_KERNEL_ADDRESS)
986 		pte = vtopte(va);
987 	else
988 		pte = kvtopte(va);
989 #ifdef DOM0OPS
990 	if (pa < pmap_pa_start || pa >= pmap_pa_end) {
991 #ifdef DEBUG
992 		printf_nolog("%s: pa 0x%" PRIx64 " for va 0x%" PRIx64
993 		    " outside range\n", __func__, (int64_t)pa, (int64_t)va);
994 #endif /* DEBUG */
995 		npte = pa;
996 	} else
997 #endif /* DOM0OPS */
998 		npte = pmap_pa2pte(pa);
999 	npte |= protection_codes[prot] | PG_k | PG_V | pmap_pg_g;
1000 	npte |= pmap_pat_flags(flags);
1001 	opte = pmap_pte_testset(pte, npte); /* zap! */
1002 #if defined(DIAGNOSTIC)
1003 	/* XXX For now... */
1004 	if (opte & PG_PS)
1005 		panic("%s: PG_PS", __func__);
1006 #endif
1007 	if ((opte & (PG_V | PG_U)) == (PG_V | PG_U)) {
1008 #if defined(DIAGNOSTIC)
1009 		printf_nolog("%s: mapping already present\n", __func__);
1010 #endif
1011 		/* This should not happen. */
1012 		kpreempt_disable();
1013 		pmap_tlb_shootdown(pmap_kernel(), va, opte, TLBSHOOT_KENTER);
1014 		kpreempt_enable();
1015 	}
1016 }
1017 
1018 void
1019 pmap_emap_enter(vaddr_t va, paddr_t pa, vm_prot_t prot)
1020 {
1021 	pt_entry_t *pte, opte, npte;
1022 
1023 	KASSERT((prot & ~VM_PROT_ALL) == 0);
1024 	pte = (va < VM_MIN_KERNEL_ADDRESS) ? vtopte(va) : kvtopte(va);
1025 
1026 #ifdef DOM0OPS
1027 	if (pa < pmap_pa_start || pa >= pmap_pa_end) {
1028 		npte = pa;
1029 	} else
1030 #endif
1031 		npte = pmap_pa2pte(pa);
1032 
1033 	npte = pmap_pa2pte(pa);
1034 	npte |= protection_codes[prot] | PG_k | PG_V;
1035 	opte = pmap_pte_testset(pte, npte);
1036 }
1037 
1038 /*
1039  * pmap_emap_sync: perform TLB flush or pmap load, if it was deferred.
1040  */
1041 void
1042 pmap_emap_sync(bool canload)
1043 {
1044 	struct cpu_info *ci = curcpu();
1045 	struct pmap *pmap;
1046 
1047 	KASSERT(kpreempt_disabled());
1048 	if (__predict_true(ci->ci_want_pmapload && canload)) {
1049 		/*
1050 		 * XXX: Hint for pmap_reactivate(), which might suggest to
1051 		 * not perform TLB flush, if state has not changed.
1052 		 */
1053 		pmap = vm_map_pmap(&curlwp->l_proc->p_vmspace->vm_map);
1054 		if (__predict_false(pmap == ci->ci_pmap)) {
1055 			const uint32_t cpumask = ci->ci_cpumask;
1056 			atomic_and_32(&pmap->pm_cpus, ~cpumask);
1057 		}
1058 		pmap_load();
1059 		KASSERT(ci->ci_want_pmapload == 0);
1060 	} else {
1061 		tlbflush();
1062 	}
1063 
1064 }
1065 
1066 void
1067 pmap_emap_remove(vaddr_t sva, vsize_t len)
1068 {
1069 	pt_entry_t *pte, xpte;
1070 	vaddr_t va, eva = sva + len;
1071 
1072 	for (va = sva; va < eva; va += PAGE_SIZE) {
1073 		pte = (va < VM_MIN_KERNEL_ADDRESS) ? vtopte(va) : kvtopte(va);
1074 		xpte |= pmap_pte_testset(pte, 0);
1075 	}
1076 }
1077 
1078 __strict_weak_alias(pmap_kenter_ma, pmap_kenter_pa);
1079 
1080 #if defined(__x86_64__)
1081 /*
1082  * Change protection for a virtual address. Local for a CPU only, don't
1083  * care about TLB shootdowns.
1084  *
1085  * => must be called with preemption disabled
1086  */
1087 void
1088 pmap_changeprot_local(vaddr_t va, vm_prot_t prot)
1089 {
1090 	pt_entry_t *pte, opte, npte;
1091 
1092 	KASSERT(kpreempt_disabled());
1093 
1094 	if (va < VM_MIN_KERNEL_ADDRESS)
1095 		pte = vtopte(va);
1096 	else
1097 		pte = kvtopte(va);
1098 
1099 	npte = opte = *pte;
1100 
1101 	if ((prot & VM_PROT_WRITE) != 0)
1102 		npte |= PG_RW;
1103 	else
1104 		npte &= ~PG_RW;
1105 
1106 	if (opte != npte) {
1107 		pmap_pte_set(pte, npte);
1108 		pmap_pte_flush();
1109 		invlpg(va);
1110 	}
1111 }
1112 #endif /* defined(__x86_64__) */
1113 
1114 /*
1115  * pmap_kremove: remove a kernel mapping(s) without R/M (pv_entry) tracking
1116  *
1117  * => no need to lock anything
1118  * => caller must dispose of any vm_page mapped in the va range
1119  * => note: not an inline function
1120  * => we assume the va is page aligned and the len is a multiple of PAGE_SIZE
1121  * => we assume kernel only unmaps valid addresses and thus don't bother
1122  *    checking the valid bit before doing TLB flushing
1123  * => must be followed by call to pmap_update() before reuse of page
1124  */
1125 
1126 void
1127 pmap_kremove(vaddr_t sva, vsize_t len)
1128 {
1129 	pt_entry_t *pte, opte;
1130 	vaddr_t va, eva;
1131 
1132 	eva = sva + len;
1133 
1134 	kpreempt_disable();
1135 	for (va = sva; va < eva; va += PAGE_SIZE) {
1136 		if (va < VM_MIN_KERNEL_ADDRESS)
1137 			pte = vtopte(va);
1138 		else
1139 			pte = kvtopte(va);
1140 		opte = pmap_pte_testset(pte, 0); /* zap! */
1141 		if ((opte & (PG_V | PG_U)) == (PG_V | PG_U)) {
1142 			pmap_tlb_shootdown(pmap_kernel(), va, opte,
1143 			    TLBSHOOT_KREMOVE);
1144 		}
1145 		KASSERT((opte & PG_PS) == 0);
1146 		KASSERT((opte & PG_PVLIST) == 0);
1147 	}
1148 	kpreempt_enable();
1149 }
1150 
1151 /*
1152  * p m a p   i n i t   f u n c t i o n s
1153  *
1154  * pmap_bootstrap and pmap_init are called during system startup
1155  * to init the pmap module.   pmap_bootstrap() does a low level
1156  * init just to get things rolling.   pmap_init() finishes the job.
1157  */
1158 
1159 /*
1160  * pmap_bootstrap: get the system in a state where it can run with VM
1161  *	properly enabled (called before main()).   the VM system is
1162  *      fully init'd later...
1163  *
1164  * => on i386, locore.s has already enabled the MMU by allocating
1165  *	a PDP for the kernel, and nkpde PTP's for the kernel.
1166  * => kva_start is the first free virtual address in kernel space
1167  */
1168 
1169 void
1170 pmap_bootstrap(vaddr_t kva_start)
1171 {
1172 	struct pmap *kpm;
1173 	pt_entry_t *pte;
1174 	int i;
1175 	vaddr_t kva;
1176 #ifndef XEN
1177 	unsigned long p1i;
1178 	vaddr_t kva_end;
1179 #endif
1180 
1181 	pt_entry_t pg_nx = (cpu_feature[2] & CPUID_NOX ? PG_NX : 0);
1182 
1183 	/*
1184 	 * set up our local static global vars that keep track of the
1185 	 * usage of KVM before kernel_map is set up
1186 	 */
1187 
1188 	virtual_avail = kva_start;		/* first free KVA */
1189 	virtual_end = VM_MAX_KERNEL_ADDRESS;	/* last KVA */
1190 
1191 	/*
1192 	 * set up protection_codes: we need to be able to convert from
1193 	 * a MI protection code (some combo of VM_PROT...) to something
1194 	 * we can jam into a i386 PTE.
1195 	 */
1196 
1197 	protection_codes[VM_PROT_NONE] = pg_nx;			/* --- */
1198 	protection_codes[VM_PROT_EXECUTE] = PG_RO | PG_X;	/* --x */
1199 	protection_codes[VM_PROT_READ] = PG_RO | pg_nx;		/* -r- */
1200 	protection_codes[VM_PROT_READ|VM_PROT_EXECUTE] = PG_RO | PG_X;/* -rx */
1201 	protection_codes[VM_PROT_WRITE] = PG_RW | pg_nx;	/* w-- */
1202 	protection_codes[VM_PROT_WRITE|VM_PROT_EXECUTE] = PG_RW | PG_X;/* w-x */
1203 	protection_codes[VM_PROT_WRITE|VM_PROT_READ] = PG_RW | pg_nx;
1204 								/* wr- */
1205 	protection_codes[VM_PROT_ALL] = PG_RW | PG_X;		/* wrx */
1206 
1207 	/*
1208 	 * now we init the kernel's pmap
1209 	 *
1210 	 * the kernel pmap's pm_obj is not used for much.   however, in
1211 	 * user pmaps the pm_obj contains the list of active PTPs.
1212 	 * the pm_obj currently does not have a pager.   it might be possible
1213 	 * to add a pager that would allow a process to read-only mmap its
1214 	 * own page tables (fast user level vtophys?).   this may or may not
1215 	 * be useful.
1216 	 */
1217 
1218 	kpm = pmap_kernel();
1219 	for (i = 0; i < PTP_LEVELS - 1; i++) {
1220 		mutex_init(&kpm->pm_obj_lock[i], MUTEX_DEFAULT, IPL_NONE);
1221 		uvm_obj_init(&kpm->pm_obj[i], NULL, false, 1);
1222 		uvm_obj_setlock(&kpm->pm_obj[i], &kpm->pm_obj_lock[i]);
1223 		kpm->pm_ptphint[i] = NULL;
1224 	}
1225 	memset(&kpm->pm_list, 0, sizeof(kpm->pm_list));  /* pm_list not used */
1226 
1227 	kpm->pm_pdir = (pd_entry_t *)(PDPpaddr + KERNBASE);
1228 	for (i = 0; i < PDP_SIZE; i++)
1229 		kpm->pm_pdirpa[i] = PDPpaddr + PAGE_SIZE * i;
1230 
1231 	kpm->pm_stats.wired_count = kpm->pm_stats.resident_count =
1232 		x86_btop(kva_start - VM_MIN_KERNEL_ADDRESS);
1233 
1234 	/*
1235 	 * the above is just a rough estimate and not critical to the proper
1236 	 * operation of the system.
1237 	 */
1238 
1239 #ifndef XEN
1240 	/*
1241 	 * Begin to enable global TLB entries if they are supported.
1242 	 * The G bit has no effect until the CR4_PGE bit is set in CR4,
1243 	 * which happens in cpu_init(), which is run on each cpu
1244 	 * (and happens later)
1245 	 */
1246 
1247 	if (cpu_feature[0] & CPUID_PGE) {
1248 		pmap_pg_g = PG_G;		/* enable software */
1249 
1250 		/* add PG_G attribute to already mapped kernel pages */
1251 		if (KERNBASE == VM_MIN_KERNEL_ADDRESS) {
1252 			kva_end = virtual_avail;
1253 		} else {
1254 			extern vaddr_t eblob, esym;
1255 			kva_end = (vaddr_t)&end;
1256 			if (esym > kva_end)
1257 				kva_end = esym;
1258 			if (eblob > kva_end)
1259 				kva_end = eblob;
1260 			kva_end = roundup(kva_end, PAGE_SIZE);
1261 		}
1262 		for (kva = KERNBASE; kva < kva_end; kva += PAGE_SIZE) {
1263 			p1i = pl1_i(kva);
1264 			if (pmap_valid_entry(PTE_BASE[p1i]))
1265 				PTE_BASE[p1i] |= PG_G;
1266 		}
1267 	}
1268 
1269 	/*
1270 	 * enable large pages if they are supported.
1271 	 */
1272 
1273 	if (cpu_feature[0] & CPUID_PSE) {
1274 		paddr_t pa;
1275 		pd_entry_t *pde;
1276 		extern char __data_start;
1277 
1278 		lcr4(rcr4() | CR4_PSE);	/* enable hardware (via %cr4) */
1279 		pmap_largepages = 1;	/* enable software */
1280 
1281 		/*
1282 		 * the TLB must be flushed after enabling large pages
1283 		 * on Pentium CPUs, according to section 3.6.2.2 of
1284 		 * "Intel Architecture Software Developer's Manual,
1285 		 * Volume 3: System Programming".
1286 		 */
1287 		tlbflushg();
1288 
1289 		/*
1290 		 * now, remap the kernel text using large pages.  we
1291 		 * assume that the linker has properly aligned the
1292 		 * .data segment to a NBPD_L2 boundary.
1293 		 */
1294 		kva_end = rounddown((vaddr_t)&__data_start, NBPD_L1);
1295 		for (pa = 0, kva = KERNBASE; kva + NBPD_L2 <= kva_end;
1296 		     kva += NBPD_L2, pa += NBPD_L2) {
1297 			pde = &L2_BASE[pl2_i(kva)];
1298 			*pde = pa | pmap_pg_g | PG_PS |
1299 			    PG_KR | PG_V;	/* zap! */
1300 			tlbflushg();
1301 		}
1302 #if defined(DEBUG)
1303 		aprint_normal("kernel text is mapped with %" PRIuPSIZE " large "
1304 		    "pages and %" PRIuPSIZE " normal pages\n",
1305 		    howmany(kva - KERNBASE, NBPD_L2),
1306 		    howmany((vaddr_t)&__data_start - kva, NBPD_L1));
1307 #endif /* defined(DEBUG) */
1308 	}
1309 #endif /* !XEN */
1310 
1311 	if (VM_MIN_KERNEL_ADDRESS != KERNBASE) {
1312 		/*
1313 		 * zero_pte is stuck at the end of mapped space for the kernel
1314 		 * image (disjunct from kva space). This is done so that it
1315 		 * can safely be used in pmap_growkernel (pmap_get_physpage),
1316 		 * when it's called for the first time.
1317 		 * XXXfvdl fix this for MULTIPROCESSOR later.
1318 		 */
1319 
1320 		early_zerop = (void *)(KERNBASE + NKL2_KIMG_ENTRIES * NBPD_L2);
1321 		early_zero_pte = PTE_BASE + pl1_i((vaddr_t)early_zerop);
1322 	}
1323 
1324 	/*
1325 	 * now we allocate the "special" VAs which are used for tmp mappings
1326 	 * by the pmap (and other modules).    we allocate the VAs by advancing
1327 	 * virtual_avail (note that there are no pages mapped at these VAs).
1328 	 * we find the PTE that maps the allocated VA via the linear PTE
1329 	 * mapping.
1330 	 */
1331 
1332 	pte = PTE_BASE + pl1_i(virtual_avail);
1333 
1334 #ifdef MULTIPROCESSOR
1335 	/*
1336 	 * Waste some VA space to avoid false sharing of cache lines
1337 	 * for page table pages: Give each possible CPU a cache line
1338 	 * of PTE's (8) to play with, though we only need 4.  We could
1339 	 * recycle some of this waste by putting the idle stacks here
1340 	 * as well; we could waste less space if we knew the largest
1341 	 * CPU ID beforehand.
1342 	 */
1343 	csrcp = (char *) virtual_avail;  csrc_pte = pte;
1344 
1345 	cdstp = (char *) virtual_avail+PAGE_SIZE;  cdst_pte = pte+1;
1346 
1347 	zerop = (char *) virtual_avail+PAGE_SIZE*2;  zero_pte = pte+2;
1348 
1349 	ptpp = (char *) virtual_avail+PAGE_SIZE*3;  ptp_pte = pte+3;
1350 
1351 	virtual_avail += PAGE_SIZE * maxcpus * NPTECL;
1352 	pte += maxcpus * NPTECL;
1353 #else
1354 	csrcp = (void *) virtual_avail;  csrc_pte = pte;	/* allocate */
1355 	virtual_avail += PAGE_SIZE; pte++;			/* advance */
1356 
1357 	cdstp = (void *) virtual_avail;  cdst_pte = pte;
1358 	virtual_avail += PAGE_SIZE; pte++;
1359 
1360 	zerop = (void *) virtual_avail;  zero_pte = pte;
1361 	virtual_avail += PAGE_SIZE; pte++;
1362 
1363 	ptpp = (void *) virtual_avail;  ptp_pte = pte;
1364 	virtual_avail += PAGE_SIZE; pte++;
1365 #endif
1366 
1367 	if (VM_MIN_KERNEL_ADDRESS == KERNBASE) {
1368 		early_zerop = zerop;
1369 		early_zero_pte = zero_pte;
1370 	}
1371 
1372 	/*
1373 	 * Nothing after this point actually needs pte;
1374 	 */
1375 	pte = (void *)0xdeadbeef;
1376 
1377 #ifdef XEN
1378 #ifdef __x86_64__
1379 	/*
1380 	 * We want a dummy page directory for Xen:
1381 	 * when deactivate a pmap, Xen will still consider it active.
1382 	 * So we set user PGD to this one to lift all protection on
1383 	 * the now inactive page tables set.
1384 	 */
1385 	xen_dummy_user_pgd = avail_start;
1386 	avail_start += PAGE_SIZE;
1387 
1388 	/* Zero fill it, the less checks in Xen it requires the better */
1389 	memset((void *) (xen_dummy_user_pgd + KERNBASE), 0, PAGE_SIZE);
1390 	/* Mark read-only */
1391 	HYPERVISOR_update_va_mapping(xen_dummy_user_pgd + KERNBASE,
1392 	    pmap_pa2pte(xen_dummy_user_pgd) | PG_u | PG_V, UVMF_INVLPG);
1393 	/* Pin as L4 */
1394 	xpq_queue_pin_l4_table(xpmap_ptom_masked(xen_dummy_user_pgd));
1395 #endif /* __x86_64__ */
1396 	idt_vaddr = virtual_avail;                      /* don't need pte */
1397 	idt_paddr = avail_start;                        /* steal a page */
1398 	/*
1399 	 * Xen require one more page as we can't store
1400 	 * GDT and LDT on the same page
1401 	 */
1402 	virtual_avail += 3 * PAGE_SIZE;
1403 	avail_start += 3 * PAGE_SIZE;
1404 #else /* XEN */
1405 	idt_vaddr = virtual_avail;			/* don't need pte */
1406 	idt_paddr = avail_start;			/* steal a page */
1407 #if defined(__x86_64__)
1408 	virtual_avail += 2 * PAGE_SIZE; pte += 2;
1409 	avail_start += 2 * PAGE_SIZE;
1410 #else /* defined(__x86_64__) */
1411 	virtual_avail += PAGE_SIZE; pte++;
1412 	avail_start += PAGE_SIZE;
1413 	/* pentium f00f bug stuff */
1414 	pentium_idt_vaddr = virtual_avail;		/* don't need pte */
1415 	virtual_avail += PAGE_SIZE; pte++;
1416 #endif /* defined(__x86_64__) */
1417 #endif /* XEN */
1418 
1419 #ifdef _LP64
1420 	/*
1421 	 * Grab a page below 4G for things that need it (i.e.
1422 	 * having an initial %cr3 for the MP trampoline).
1423 	 */
1424 	lo32_vaddr = virtual_avail;
1425 	virtual_avail += PAGE_SIZE; pte++;
1426 	lo32_paddr = avail_start;
1427 	avail_start += PAGE_SIZE;
1428 #endif
1429 
1430 	/*
1431 	 * now we reserve some VM for mapping pages when doing a crash dump
1432 	 */
1433 
1434 	virtual_avail = reserve_dumppages(virtual_avail);
1435 
1436 	/*
1437 	 * init the static-global locks and global lists.
1438 	 *
1439 	 * => pventry::pvh_lock (initialized elsewhere) must also be
1440 	 *      a spin lock, again at IPL_VM to prevent deadlock, and
1441 	 *	again is never taken from interrupt context.
1442 	 */
1443 
1444 	mutex_init(&pmaps_lock, MUTEX_DEFAULT, IPL_NONE);
1445 	LIST_INIT(&pmaps);
1446 
1447 	/*
1448 	 * initialize caches.
1449 	 */
1450 
1451 	pool_cache_bootstrap(&pmap_cache, sizeof(struct pmap), 0, 0, 0,
1452 	    "pmappl", NULL, IPL_NONE, NULL, NULL, NULL);
1453 #ifdef PAE
1454 	pool_cache_bootstrap(&pmap_pdp_cache, PAGE_SIZE * PDP_SIZE, 0, 0, 0,
1455 	    "pdppl", &pmap_pdp_allocator, IPL_NONE,
1456 	    pmap_pdp_ctor, pmap_pdp_dtor, NULL);
1457 #else /* PAE */
1458 	pool_cache_bootstrap(&pmap_pdp_cache, PAGE_SIZE, 0, 0, 0,
1459 	    "pdppl", NULL, IPL_NONE, pmap_pdp_ctor, pmap_pdp_dtor, NULL);
1460 #endif /* PAE */
1461 	pool_cache_bootstrap(&pmap_pv_cache, sizeof(struct pv_entry), 0, 0,
1462 	    PR_LARGECACHE, "pvpl", &pool_allocator_meta, IPL_NONE, NULL,
1463 	    NULL, NULL);
1464 
1465 	/*
1466 	 * ensure the TLB is sync'd with reality by flushing it...
1467 	 */
1468 
1469 	tlbflushg();
1470 
1471 	/*
1472 	 * calculate pmap_maxkvaddr from nkptp[].
1473 	 */
1474 
1475 	kva = VM_MIN_KERNEL_ADDRESS;
1476 	for (i = PTP_LEVELS - 1; i >= 1; i--) {
1477 		kva += nkptp[i] * nbpd[i];
1478 	}
1479 	pmap_maxkvaddr = kva;
1480 }
1481 
1482 #if defined(__x86_64__)
1483 /*
1484  * Pre-allocate PTPs for low memory, so that 1:1 mappings for various
1485  * trampoline code can be entered.
1486  */
1487 void
1488 pmap_prealloc_lowmem_ptps(void)
1489 {
1490 	int level;
1491 	paddr_t newp;
1492 #ifdef XEN
1493 	paddr_t pdes_pa;
1494 
1495 	pdes_pa = pmap_pdirpa(pmap_kernel(), 0);
1496 	level = PTP_LEVELS;
1497 	for (;;) {
1498 		newp = avail_start;
1499 		avail_start += PAGE_SIZE;
1500 		HYPERVISOR_update_va_mapping ((vaddr_t)early_zerop,
1501 		    xpmap_ptom_masked(newp) | PG_u | PG_V | PG_RW, UVMF_INVLPG);
1502 		memset(early_zerop, 0, PAGE_SIZE);
1503 		/* Mark R/O before installing */
1504 		HYPERVISOR_update_va_mapping ((vaddr_t)early_zerop,
1505 		    xpmap_ptom_masked(newp) | PG_u | PG_V, UVMF_INVLPG);
1506 		if (newp < (NKL2_KIMG_ENTRIES * NBPD_L2))
1507 			HYPERVISOR_update_va_mapping (newp + KERNBASE,
1508 			    xpmap_ptom_masked(newp) | PG_u | PG_V, UVMF_INVLPG);
1509 		/* Update the pmap_kernel() L4 shadow */
1510 		xpq_queue_pte_update (
1511 		    xpmap_ptom_masked(pdes_pa)
1512 		    + (pl_i(0, level) * sizeof (pd_entry_t)),
1513 		    xpmap_ptom_masked(newp) | PG_RW | PG_u | PG_V);
1514 		/* sync to per-cpu PD */
1515 		xpq_queue_pte_update(
1516 			xpmap_ptom_masked(cpu_info_primary.ci_kpm_pdirpa +
1517 			    pl_i(0, PTP_LEVELS) *
1518 			    sizeof(pd_entry_t)),
1519 			pmap_kernel()->pm_pdir[pl_i(0, PTP_LEVELS)]);
1520 		pmap_pte_flush();
1521 		level--;
1522 		if (level <= 1)
1523 			break;
1524 		pdes_pa = newp;
1525 	}
1526 #else /* XEN */
1527 	pd_entry_t *pdes;
1528 
1529 	pdes = pmap_kernel()->pm_pdir;
1530 	level = PTP_LEVELS;
1531 	for (;;) {
1532 		newp = avail_start;
1533 		avail_start += PAGE_SIZE;
1534 		pmap_pte_set(early_zero_pte, (newp & PG_FRAME) | PG_V | PG_RW);
1535 		pmap_pte_flush();
1536 		pmap_update_pg((vaddr_t)early_zerop);
1537 		memset(early_zerop, 0, PAGE_SIZE);
1538 		pdes[pl_i(0, level)] = (newp & PG_FRAME) | PG_V | PG_RW;
1539 		level--;
1540 		if (level <= 1)
1541 			break;
1542 		pdes = normal_pdes[level - 2];
1543 	}
1544 #endif /* XEN */
1545 }
1546 #endif /* defined(__x86_64__) */
1547 
1548 /*
1549  * pmap_init: called from uvm_init, our job is to get the pmap
1550  * system ready to manage mappings...
1551  */
1552 
1553 void
1554 pmap_init(void)
1555 {
1556 	int i;
1557 
1558 	for (i = 0; i < PV_HASH_SIZE; i++) {
1559 		SLIST_INIT(&pv_hash_heads[i].hh_list);
1560 	}
1561 	for (i = 0; i < PV_HASH_LOCK_CNT; i++) {
1562 		mutex_init(&pv_hash_locks[i].lock, MUTEX_NODEBUG, IPL_VM);
1563 	}
1564 
1565 	pmap_tlb_init();
1566 
1567 	evcnt_attach_dynamic(&pmap_iobmp_evcnt, EVCNT_TYPE_MISC,
1568 	    NULL, "x86", "io bitmap copy");
1569 	evcnt_attach_dynamic(&pmap_ldt_evcnt, EVCNT_TYPE_MISC,
1570 	    NULL, "x86", "ldt sync");
1571 
1572 	/*
1573 	 * done: pmap module is up (and ready for business)
1574 	 */
1575 
1576 	pmap_initialized = true;
1577 }
1578 
1579 /*
1580  * pmap_cpu_init_late: perform late per-CPU initialization.
1581  */
1582 
1583 #ifndef XEN
1584 void
1585 pmap_cpu_init_late(struct cpu_info *ci)
1586 {
1587 	/*
1588 	 * The BP has already its own PD page allocated during early
1589 	 * MD startup.
1590 	 */
1591 	if (ci == &cpu_info_primary)
1592 		return;
1593 
1594 #ifdef PAE
1595 	int ret;
1596 	struct pglist pg;
1597 	struct vm_page *vmap;
1598 
1599 	/*
1600 	 * Allocate a page for the per-CPU L3 PD. cr3 being 32 bits, PA musts
1601 	 * resides below the 4GB boundary.
1602 	 */
1603 	ret = uvm_pglistalloc(PAGE_SIZE, 0, 0x100000000ULL, 32, 0, &pg, 1, 0);
1604 	vmap = TAILQ_FIRST(&pg);
1605 
1606 	if (ret != 0 || vmap == NULL)
1607 		panic("%s: failed to allocate L3 pglist for CPU %d (ret %d)\n",
1608 			__func__, cpu_index(ci), ret);
1609 
1610 	ci->ci_pae_l3_pdirpa = vmap->phys_addr;
1611 
1612 	ci->ci_pae_l3_pdir = (paddr_t *)uvm_km_alloc(kernel_map, PAGE_SIZE, 0,
1613 		UVM_KMF_VAONLY | UVM_KMF_NOWAIT);
1614 	if (ci->ci_pae_l3_pdir == NULL)
1615 		panic("%s: failed to allocate L3 PD for CPU %d\n",
1616 			__func__, cpu_index(ci));
1617 
1618 	pmap_kenter_pa((vaddr_t)ci->ci_pae_l3_pdir, ci->ci_pae_l3_pdirpa,
1619 		VM_PROT_READ | VM_PROT_WRITE, 0);
1620 
1621 	pmap_update(pmap_kernel());
1622 #endif
1623 }
1624 #endif
1625 
1626 /*
1627  * p v _ e n t r y   f u n c t i o n s
1628  */
1629 
1630 /*
1631  * pmap_free_pvs: free a list of pv_entrys
1632  */
1633 
1634 static void
1635 pmap_free_pvs(struct pv_entry *pve)
1636 {
1637 	struct pv_entry *next;
1638 
1639 	for ( /* null */ ; pve != NULL ; pve = next) {
1640 		next = pve->pve_next;
1641 		pool_cache_put(&pmap_pv_cache, pve);
1642 	}
1643 }
1644 
1645 /*
1646  * main pv_entry manipulation functions:
1647  *   pmap_enter_pv: enter a mapping onto a pv_head list
1648  *   pmap_remove_pv: remove a mapping from a pv_head list
1649  *
1650  * NOTE: Both pmap_enter_pv and pmap_remove_pv expect the caller to lock
1651  *       the pvh before calling
1652  */
1653 
1654 /*
1655  * insert_pv: a helper of pmap_enter_pv
1656  */
1657 
1658 static void
1659 insert_pv(struct pmap_page *pp, struct pv_entry *pve)
1660 {
1661 	struct pv_hash_head *hh;
1662 	kmutex_t *lock;
1663 	u_int hash;
1664 
1665 	hash = pvhash_hash(pve->pve_pte.pte_ptp, pve->pve_pte.pte_va);
1666 	lock = pvhash_lock(hash);
1667 	hh = pvhash_head(hash);
1668 	mutex_spin_enter(lock);
1669 	SLIST_INSERT_HEAD(&hh->hh_list, pve, pve_hash);
1670 	mutex_spin_exit(lock);
1671 
1672 	LIST_INSERT_HEAD(&pp->pp_head.pvh_list, pve, pve_list);
1673 }
1674 
1675 /*
1676  * pmap_enter_pv: enter a mapping onto a pv_head lst
1677  *
1678  * => caller should adjust ptp's wire_count before calling
1679  */
1680 
1681 static struct pv_entry *
1682 pmap_enter_pv(struct pmap_page *pp,
1683 	      struct pv_entry *pve,	/* preallocated pve for us to use */
1684 	      struct pv_entry **sparepve,
1685 	      struct vm_page *ptp,
1686 	      vaddr_t va)
1687 {
1688 
1689 	KASSERT(ptp == NULL || ptp->wire_count >= 2);
1690 	KASSERT(ptp == NULL || ptp->uobject != NULL);
1691 	KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset);
1692 
1693 	if ((pp->pp_flags & PP_EMBEDDED) == 0) {
1694 		if (LIST_EMPTY(&pp->pp_head.pvh_list)) {
1695 			pp->pp_flags |= PP_EMBEDDED;
1696 			pp->pp_pte.pte_ptp = ptp;
1697 			pp->pp_pte.pte_va = va;
1698 
1699 			return pve;
1700 		}
1701 	} else {
1702 		struct pv_entry *pve2;
1703 
1704 		pve2 = *sparepve;
1705 		*sparepve = NULL;
1706 
1707 		pve2->pve_pte = pp->pp_pte;
1708 		pp->pp_flags &= ~PP_EMBEDDED;
1709 		LIST_INIT(&pp->pp_head.pvh_list);
1710 		insert_pv(pp, pve2);
1711 	}
1712 
1713 	pve->pve_pte.pte_ptp = ptp;
1714 	pve->pve_pte.pte_va = va;
1715 	insert_pv(pp, pve);
1716 
1717 	return NULL;
1718 }
1719 
1720 /*
1721  * pmap_remove_pv: try to remove a mapping from a pv_list
1722  *
1723  * => caller should adjust ptp's wire_count and free PTP if needed
1724  * => we return the removed pve
1725  */
1726 
1727 static struct pv_entry *
1728 pmap_remove_pv(struct pmap_page *pp, struct vm_page *ptp, vaddr_t va)
1729 {
1730 	struct pv_hash_head *hh;
1731 	struct pv_entry *pve;
1732 	kmutex_t *lock;
1733 	u_int hash;
1734 
1735 	KASSERT(ptp == NULL || ptp->uobject != NULL);
1736 	KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset);
1737 
1738 	if ((pp->pp_flags & PP_EMBEDDED) != 0) {
1739 		KASSERT(pp->pp_pte.pte_ptp == ptp);
1740 		KASSERT(pp->pp_pte.pte_va == va);
1741 
1742 		pp->pp_flags &= ~PP_EMBEDDED;
1743 		LIST_INIT(&pp->pp_head.pvh_list);
1744 
1745 		return NULL;
1746 	}
1747 
1748 	hash = pvhash_hash(ptp, va);
1749 	lock = pvhash_lock(hash);
1750 	hh = pvhash_head(hash);
1751 	mutex_spin_enter(lock);
1752 	pve = pvhash_remove(hh, ptp, va);
1753 	mutex_spin_exit(lock);
1754 
1755 	LIST_REMOVE(pve, pve_list);
1756 
1757 	return pve;
1758 }
1759 
1760 /*
1761  * p t p   f u n c t i o n s
1762  */
1763 
1764 static inline struct vm_page *
1765 pmap_find_ptp(struct pmap *pmap, vaddr_t va, paddr_t pa, int level)
1766 {
1767 	int lidx = level - 1;
1768 	struct vm_page *pg;
1769 
1770 	KASSERT(mutex_owned(pmap->pm_lock));
1771 
1772 	if (pa != (paddr_t)-1 && pmap->pm_ptphint[lidx] &&
1773 	    pa == VM_PAGE_TO_PHYS(pmap->pm_ptphint[lidx])) {
1774 		return (pmap->pm_ptphint[lidx]);
1775 	}
1776 	PMAP_SUBOBJ_LOCK(pmap, lidx);
1777 	pg = uvm_pagelookup(&pmap->pm_obj[lidx], ptp_va2o(va, level));
1778 	PMAP_SUBOBJ_UNLOCK(pmap, lidx);
1779 
1780 	KASSERT(pg == NULL || pg->wire_count >= 1);
1781 	return pg;
1782 }
1783 
1784 static inline void
1785 pmap_freepage(struct pmap *pmap, struct vm_page *ptp, int level)
1786 {
1787 	lwp_t *l;
1788 	int lidx;
1789 	struct uvm_object *obj;
1790 
1791 	KASSERT(ptp->wire_count == 1);
1792 
1793 	lidx = level - 1;
1794 
1795 	obj = &pmap->pm_obj[lidx];
1796 	pmap_stats_update(pmap, -1, 0);
1797 	if (lidx != 0)
1798 		mutex_enter(obj->vmobjlock);
1799 	if (pmap->pm_ptphint[lidx] == ptp)
1800 		pmap->pm_ptphint[lidx] = TAILQ_FIRST(&obj->memq);
1801 	ptp->wire_count = 0;
1802 	uvm_pagerealloc(ptp, NULL, 0);
1803 	l = curlwp;
1804 	KASSERT((l->l_pflag & LP_INTR) == 0);
1805 	VM_PAGE_TO_PP(ptp)->pp_link = l->l_md.md_gc_ptp;
1806 	l->l_md.md_gc_ptp = ptp;
1807 	if (lidx != 0)
1808 		mutex_exit(obj->vmobjlock);
1809 }
1810 
1811 static void
1812 pmap_free_ptp(struct pmap *pmap, struct vm_page *ptp, vaddr_t va,
1813 	      pt_entry_t *ptes, pd_entry_t * const *pdes)
1814 {
1815 	unsigned long index;
1816 	int level;
1817 	vaddr_t invaladdr;
1818 	pd_entry_t opde;
1819 #ifdef XEN
1820 	struct pmap *curpmap = vm_map_pmap(&curlwp->l_proc->p_vmspace->vm_map);
1821 #ifdef MULTIPROCESSOR
1822 	vaddr_t invaladdr2;
1823 #endif
1824 #endif
1825 
1826 	KASSERT(pmap != pmap_kernel());
1827 	KASSERT(mutex_owned(pmap->pm_lock));
1828 	KASSERT(kpreempt_disabled());
1829 
1830 	level = 1;
1831 	do {
1832 		index = pl_i(va, level + 1);
1833 		opde = pmap_pte_testset(&pdes[level - 1][index], 0);
1834 #if defined(XEN)
1835 #  if defined(__x86_64__)
1836 		/*
1837 		 * If ptp is a L3 currently mapped in kernel space,
1838 		 * clear it before freeing
1839 		 */
1840 		if (pmap_pdirpa(pmap, 0) == curcpu()->ci_xen_current_user_pgd
1841 		    && level == PTP_LEVELS - 1) {
1842 			pmap_pte_set(&pmap_kernel()->pm_pdir[index], 0);
1843 			/*
1844 			 * Update the per-cpu PD on all cpus the current
1845 			 * pmap is active on
1846 			 */
1847 			CPU_INFO_ITERATOR cii;
1848 			struct cpu_info *ci;
1849 			for (CPU_INFO_FOREACH(cii, ci)) {
1850 				if (ci == NULL) {
1851 					continue;
1852 				}
1853 				if (ci->ci_cpumask & pmap->pm_cpus) {
1854 					pmap_pte_set(&ci->ci_kpm_pdir[index], 0);
1855 				}
1856 			}
1857 		}
1858 #  endif /*__x86_64__ */
1859 		invaladdr = level == 1 ? (vaddr_t)ptes :
1860 		    (vaddr_t)pdes[level - 2];
1861 		pmap_tlb_shootdown(curpmap, invaladdr + index * PAGE_SIZE,
1862 		    opde, TLBSHOOT_FREE_PTP1);
1863 #  if defined(MULTIPROCESSOR)
1864 		invaladdr2 = level == 1 ? (vaddr_t)PTE_BASE :
1865 		    (vaddr_t)normal_pdes[level - 2];
1866 		if (pmap != curpmap || invaladdr != invaladdr2) {
1867 			pmap_tlb_shootdown(pmap, invaladdr2 + index * PAGE_SIZE,
1868 			    opde, TLBSHOOT_FREE_PTP2);
1869 		}
1870 #  endif /* MULTIPROCESSOR */
1871 #else	/* XEN */
1872 		invaladdr = level == 1 ? (vaddr_t)ptes :
1873 		    (vaddr_t)pdes[level - 2];
1874 		pmap_tlb_shootdown(pmap, invaladdr + index * PAGE_SIZE,
1875 		    opde, TLBSHOOT_FREE_PTP1);
1876 #endif	/* XEN */
1877 		pmap_freepage(pmap, ptp, level);
1878 		if (level < PTP_LEVELS - 1) {
1879 			ptp = pmap_find_ptp(pmap, va, (paddr_t)-1, level + 1);
1880 			ptp->wire_count--;
1881 			if (ptp->wire_count > 1)
1882 				break;
1883 		}
1884 	} while (++level < PTP_LEVELS);
1885 	pmap_pte_flush();
1886 }
1887 
1888 /*
1889  * pmap_get_ptp: get a PTP (if there isn't one, allocate a new one)
1890  *
1891  * => pmap should NOT be pmap_kernel()
1892  * => pmap should be locked
1893  * => preemption should be disabled
1894  */
1895 
1896 static struct vm_page *
1897 pmap_get_ptp(struct pmap *pmap, vaddr_t va, pd_entry_t * const *pdes)
1898 {
1899 	struct vm_page *ptp, *pptp;
1900 	int i;
1901 	unsigned long index;
1902 	pd_entry_t *pva;
1903 	paddr_t ppa, pa;
1904 	struct uvm_object *obj;
1905 
1906 	KASSERT(pmap != pmap_kernel());
1907 	KASSERT(mutex_owned(pmap->pm_lock));
1908 	KASSERT(kpreempt_disabled());
1909 
1910 	ptp = NULL;
1911 	pa = (paddr_t)-1;
1912 
1913 	/*
1914 	 * Loop through all page table levels seeing if we need to
1915 	 * add a new page to that level.
1916 	 */
1917 	for (i = PTP_LEVELS; i > 1; i--) {
1918 		/*
1919 		 * Save values from previous round.
1920 		 */
1921 		pptp = ptp;
1922 		ppa = pa;
1923 
1924 		index = pl_i(va, i);
1925 		pva = pdes[i - 2];
1926 
1927 		if (pmap_valid_entry(pva[index])) {
1928 			ppa = pmap_pte2pa(pva[index]);
1929 			ptp = NULL;
1930 			continue;
1931 		}
1932 
1933 		obj = &pmap->pm_obj[i-2];
1934 		PMAP_SUBOBJ_LOCK(pmap, i - 2);
1935 		ptp = uvm_pagealloc(obj, ptp_va2o(va, i - 1), NULL,
1936 		    UVM_PGA_USERESERVE|UVM_PGA_ZERO);
1937 		PMAP_SUBOBJ_UNLOCK(pmap, i - 2);
1938 
1939 		if (ptp == NULL)
1940 			return NULL;
1941 
1942 		ptp->flags &= ~PG_BUSY; /* never busy */
1943 		ptp->wire_count = 1;
1944 		pmap->pm_ptphint[i - 2] = ptp;
1945 		pa = VM_PAGE_TO_PHYS(ptp);
1946 		pmap_pte_set(&pva[index], (pd_entry_t)
1947 		        (pmap_pa2pte(pa) | PG_u | PG_RW | PG_V));
1948 #if defined(XEN) && defined(__x86_64__)
1949 		/*
1950 		 * In Xen we must enter the mapping in kernel map too
1951 		 * if pmap is curmap and modifying top level (PGD)
1952 		 */
1953 		if(i == PTP_LEVELS && pmap != pmap_kernel()) {
1954 		        pmap_pte_set(&pmap_kernel()->pm_pdir[index],
1955 		                (pd_entry_t) (pmap_pa2pte(pa)
1956 		                        | PG_u | PG_RW | PG_V));
1957 			/*
1958 			 * Update the per-cpu PD on all cpus the current
1959 			 * pmap is active on
1960 			 */
1961 			CPU_INFO_ITERATOR cii;
1962 			struct cpu_info *ci;
1963 			for (CPU_INFO_FOREACH(cii, ci)) {
1964 				if (ci == NULL) {
1965 					continue;
1966 				}
1967 				if (ci->ci_cpumask & pmap->pm_cpus) {
1968 					pmap_pte_set(&ci->ci_kpm_pdir[index],
1969 						     (pd_entry_t) (pmap_pa2pte(pa) | PG_u | PG_RW | PG_V));
1970 				}
1971 			}
1972 		}
1973 #endif /* XEN && __x86_64__ */
1974 		pmap_pte_flush();
1975 		pmap_stats_update(pmap, 1, 0);
1976 		/*
1977 		 * If we're not in the top level, increase the
1978 		 * wire count of the parent page.
1979 		 */
1980 		if (i < PTP_LEVELS) {
1981 			if (pptp == NULL)
1982 				pptp = pmap_find_ptp(pmap, va, ppa, i);
1983 #ifdef DIAGNOSTIC
1984 			if (pptp == NULL)
1985 				panic("pde page disappeared");
1986 #endif
1987 			pptp->wire_count++;
1988 		}
1989 	}
1990 
1991 	/*
1992 	 * ptp is not NULL if we just allocated a new ptp. If it's
1993 	 * still NULL, we must look up the existing one.
1994 	 */
1995 	if (ptp == NULL) {
1996 		ptp = pmap_find_ptp(pmap, va, ppa, 1);
1997 #ifdef DIAGNOSTIC
1998 		if (ptp == NULL) {
1999 			printf("va %" PRIxVADDR " ppa %" PRIxPADDR "\n",
2000 			    va, ppa);
2001 			panic("pmap_get_ptp: unmanaged user PTP");
2002 		}
2003 #endif
2004 	}
2005 
2006 	pmap->pm_ptphint[0] = ptp;
2007 	return(ptp);
2008 }
2009 
2010 /*
2011  * p m a p  l i f e c y c l e   f u n c t i o n s
2012  */
2013 
2014 /*
2015  * pmap_pdp_ctor: constructor for the PDP cache.
2016  */
2017 int
2018 pmap_pdp_ctor(void *arg, void *v, int flags)
2019 {
2020 	pd_entry_t *pdir = v;
2021 	paddr_t pdirpa = 0;	/* XXX: GCC */
2022 	vaddr_t object;
2023 	int i;
2024 
2025 #if !defined(XEN) || !defined(__x86_64__)
2026 	int npde;
2027 #endif
2028 #ifdef XEN
2029 	int s;
2030 #endif
2031 
2032 	/*
2033 	 * NOTE: The `pmaps_lock' is held when the PDP is allocated.
2034 	 */
2035 
2036 #if defined(XEN) && defined(__x86_64__)
2037 	/* fetch the physical address of the page directory. */
2038 	(void) pmap_extract(pmap_kernel(), (vaddr_t) pdir, &pdirpa);
2039 
2040 	/* zero init area */
2041 	memset (pdir, 0, PAGE_SIZE); /* Xen wants a clean page */
2042 	/*
2043 	 * this pdir will NEVER be active in kernel mode
2044 	 * so mark recursive entry invalid
2045 	 */
2046 	pdir[PDIR_SLOT_PTE] = pmap_pa2pte(pdirpa) | PG_u;
2047 	/*
2048 	 * PDP constructed this way won't be for kernel,
2049 	 * hence we don't put kernel mappings on Xen.
2050 	 * But we need to make pmap_create() happy, so put a dummy (without
2051 	 * PG_V) value at the right place.
2052 	 */
2053 	pdir[PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1] - 1] =
2054 	     (pd_entry_t)-1 & PG_FRAME;
2055 #else /* XEN && __x86_64__*/
2056 	/* zero init area */
2057 	memset(pdir, 0, PDIR_SLOT_PTE * sizeof(pd_entry_t));
2058 
2059 	object = (vaddr_t)v;
2060 	for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) {
2061 		/* fetch the physical address of the page directory. */
2062 		(void) pmap_extract(pmap_kernel(), object, &pdirpa);
2063 		/* put in recursive PDE to map the PTEs */
2064 		pdir[PDIR_SLOT_PTE + i] = pmap_pa2pte(pdirpa) | PG_V;
2065 #ifndef XEN
2066 		pdir[PDIR_SLOT_PTE + i] |= PG_KW;
2067 #endif
2068 	}
2069 
2070 	/* copy kernel's PDE */
2071 	npde = nkptp[PTP_LEVELS - 1];
2072 
2073 	memcpy(&pdir[PDIR_SLOT_KERN], &PDP_BASE[PDIR_SLOT_KERN],
2074 	    npde * sizeof(pd_entry_t));
2075 
2076 	/* zero the rest */
2077 	memset(&pdir[PDIR_SLOT_KERN + npde], 0,
2078 	    (NTOPLEVEL_PDES - (PDIR_SLOT_KERN + npde)) * sizeof(pd_entry_t));
2079 
2080 	if (VM_MIN_KERNEL_ADDRESS != KERNBASE) {
2081 		int idx = pl_i(KERNBASE, PTP_LEVELS);
2082 
2083 		pdir[idx] = PDP_BASE[idx];
2084 	}
2085 #endif /* XEN  && __x86_64__*/
2086 #ifdef XEN
2087 	s = splvm();
2088 	object = (vaddr_t)v;
2089 	for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) {
2090 		(void) pmap_extract(pmap_kernel(), object, &pdirpa);
2091 		/* FIXME: This should use pmap_protect() .. */
2092 		pmap_kenter_pa(object, pdirpa, VM_PROT_READ, 0);
2093 		pmap_update(pmap_kernel());
2094 		/*
2095 		 * pin as L2/L4 page, we have to do the page with the
2096 		 * PDIR_SLOT_PTE entries last
2097 		 */
2098 #ifdef PAE
2099 		if (i == l2tol3(PDIR_SLOT_PTE))
2100 			continue;
2101 #endif
2102 
2103 #ifdef __x86_64__
2104 		xpq_queue_pin_l4_table(xpmap_ptom_masked(pdirpa));
2105 #else
2106 		xpq_queue_pin_l2_table(xpmap_ptom_masked(pdirpa));
2107 #endif
2108 	}
2109 #ifdef PAE
2110 	object = ((vaddr_t)pdir) + PAGE_SIZE  * l2tol3(PDIR_SLOT_PTE);
2111 	(void)pmap_extract(pmap_kernel(), object, &pdirpa);
2112 	xpq_queue_pin_l2_table(xpmap_ptom_masked(pdirpa));
2113 #endif
2114 	splx(s);
2115 #endif /* XEN */
2116 
2117 	return (0);
2118 }
2119 
2120 /*
2121  * pmap_pdp_dtor: destructor for the PDP cache.
2122  */
2123 
2124 void
2125 pmap_pdp_dtor(void *arg, void *v)
2126 {
2127 #ifdef XEN
2128 	paddr_t pdirpa = 0;	/* XXX: GCC */
2129 	vaddr_t object = (vaddr_t)v;
2130 	int i;
2131 	int s = splvm();
2132 	pt_entry_t *pte;
2133 
2134 	for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) {
2135 		/* fetch the physical address of the page directory. */
2136 		(void) pmap_extract(pmap_kernel(), object, &pdirpa);
2137 		/* unpin page table */
2138 		xpq_queue_unpin_table(xpmap_ptom_masked(pdirpa));
2139 	}
2140 	object = (vaddr_t)v;
2141 	for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) {
2142 		/* Set page RW again */
2143 		pte = kvtopte(object);
2144 		xpq_queue_pte_update(xpmap_ptetomach(pte), *pte | PG_RW);
2145 		xpq_queue_invlpg((vaddr_t)object);
2146 	}
2147 	splx(s);
2148 #endif  /* XEN */
2149 }
2150 
2151 #ifdef PAE
2152 
2153 /* pmap_pdp_alloc: Allocate a page for the pdp memory pool. */
2154 
2155 void *
2156 pmap_pdp_alloc(struct pool *pp, int flags)
2157 {
2158 	return (void *)uvm_km_alloc(kernel_map,
2159 	    PAGE_SIZE * PDP_SIZE, PAGE_SIZE * PDP_SIZE,
2160 	    ((flags & PR_WAITOK) ? 0 : UVM_KMF_NOWAIT | UVM_KMF_TRYLOCK)
2161 	    | UVM_KMF_WIRED);
2162 }
2163 
2164 /*
2165  * pmap_pdp_free: free a PDP
2166  */
2167 
2168 void
2169 pmap_pdp_free(struct pool *pp, void *v)
2170 {
2171 	uvm_km_free(kernel_map, (vaddr_t)v, PAGE_SIZE * PDP_SIZE,
2172 	    UVM_KMF_WIRED);
2173 }
2174 #endif /* PAE */
2175 
2176 /*
2177  * pmap_create: create a pmap
2178  *
2179  * => note: old pmap interface took a "size" args which allowed for
2180  *	the creation of "software only" pmaps (not in bsd).
2181  */
2182 
2183 struct pmap *
2184 pmap_create(void)
2185 {
2186 	struct pmap *pmap;
2187 	int i;
2188 
2189 	pmap = pool_cache_get(&pmap_cache, PR_WAITOK);
2190 
2191 	/* init uvm_object */
2192 	for (i = 0; i < PTP_LEVELS - 1; i++) {
2193 		mutex_init(&pmap->pm_obj_lock[i], MUTEX_DEFAULT, IPL_NONE);
2194 		uvm_obj_init(&pmap->pm_obj[i], NULL, false, 1);
2195 		uvm_obj_setlock(&pmap->pm_obj[i], &pmap->pm_obj_lock[i]);
2196 		pmap->pm_ptphint[i] = NULL;
2197 	}
2198 	pmap->pm_stats.wired_count = 0;
2199 	/* count the PDP allocd below */
2200 	pmap->pm_stats.resident_count = PDP_SIZE;
2201 #if !defined(__x86_64__)
2202 	pmap->pm_hiexec = 0;
2203 #endif /* !defined(__x86_64__) */
2204 	pmap->pm_flags = 0;
2205 	pmap->pm_cpus = 0;
2206 	pmap->pm_kernel_cpus = 0;
2207 	pmap->pm_gc_ptp = NULL;
2208 
2209 	/* init the LDT */
2210 	pmap->pm_ldt = NULL;
2211 	pmap->pm_ldt_len = 0;
2212 	pmap->pm_ldt_sel = GSYSSEL(GLDT_SEL, SEL_KPL);
2213 
2214 	/* allocate PDP */
2215  try_again:
2216 	pmap->pm_pdir = pool_cache_get(&pmap_pdp_cache, PR_WAITOK);
2217 
2218 	mutex_enter(&pmaps_lock);
2219 
2220 	if (pmap->pm_pdir[PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1] - 1] == 0) {
2221 		mutex_exit(&pmaps_lock);
2222 		pool_cache_destruct_object(&pmap_pdp_cache, pmap->pm_pdir);
2223 		goto try_again;
2224 	}
2225 
2226 	for (i = 0; i < PDP_SIZE; i++)
2227 		pmap->pm_pdirpa[i] =
2228 		    pmap_pte2pa(pmap->pm_pdir[PDIR_SLOT_PTE + i]);
2229 
2230 	LIST_INSERT_HEAD(&pmaps, pmap, pm_list);
2231 
2232 	mutex_exit(&pmaps_lock);
2233 
2234 	return (pmap);
2235 }
2236 
2237 /*
2238  * pmap_free_ptps: put a list of ptps back to the freelist.
2239  */
2240 
2241 static void
2242 pmap_free_ptps(struct vm_page *empty_ptps)
2243 {
2244 	struct vm_page *ptp;
2245 	struct pmap_page *pp;
2246 
2247 	while ((ptp = empty_ptps) != NULL) {
2248 		pp = VM_PAGE_TO_PP(ptp);
2249 		empty_ptps = pp->pp_link;
2250 		LIST_INIT(&pp->pp_head.pvh_list);
2251 		uvm_pagefree(ptp);
2252 	}
2253 }
2254 
2255 /*
2256  * pmap_destroy: drop reference count on pmap.   free pmap if
2257  *	reference count goes to zero.
2258  */
2259 
2260 void
2261 pmap_destroy(struct pmap *pmap)
2262 {
2263 	int i;
2264 #ifdef DIAGNOSTIC
2265 	struct cpu_info *ci;
2266 	CPU_INFO_ITERATOR cii;
2267 #endif /* DIAGNOSTIC */
2268 	lwp_t *l;
2269 
2270 	/*
2271 	 * If we have torn down this pmap, process deferred frees and
2272 	 * invalidations.  Free now if the system is low on memory.
2273 	 * Otherwise, free when the pmap is destroyed thus avoiding a
2274 	 * TLB shootdown.
2275 	 */
2276 	l = curlwp;
2277 	if (__predict_false(l->l_md.md_gc_pmap == pmap)) {
2278 		if (uvmexp.free < uvmexp.freetarg) {
2279 			pmap_update(pmap);
2280 		} else {
2281 			KASSERT(pmap->pm_gc_ptp == NULL);
2282 			pmap->pm_gc_ptp = l->l_md.md_gc_ptp;
2283 			l->l_md.md_gc_ptp = NULL;
2284 			l->l_md.md_gc_pmap = NULL;
2285 		}
2286 	}
2287 
2288 	/*
2289 	 * drop reference count
2290 	 */
2291 
2292 	if (atomic_dec_uint_nv(&pmap->pm_obj[0].uo_refs) > 0) {
2293 		return;
2294 	}
2295 
2296 #ifdef DIAGNOSTIC
2297 	for (CPU_INFO_FOREACH(cii, ci))
2298 		if (ci->ci_pmap == pmap)
2299 			panic("destroying pmap being used");
2300 #endif /* DIAGNOSTIC */
2301 
2302 	/*
2303 	 * reference count is zero, free pmap resources and then free pmap.
2304 	 */
2305 #ifdef XEN
2306 	/*
2307 	 * Xen lazy APDP handling:
2308 	 * clear APDP_PDE if pmap is the currently mapped
2309 	 */
2310 	if (xpmap_ptom_masked(pmap_pdirpa(pmap, 0)) == (*APDP_PDE & PG_FRAME)) {
2311 		kpreempt_disable();
2312 		pmap_unmap_apdp();
2313 		pmap_pte_flush();
2314 	        pmap_apte_flush(pmap_kernel());
2315 	        kpreempt_enable();
2316 	}
2317 #endif
2318 
2319 	/*
2320 	 * remove it from global list of pmaps
2321 	 */
2322 
2323 	mutex_enter(&pmaps_lock);
2324 	LIST_REMOVE(pmap, pm_list);
2325 	mutex_exit(&pmaps_lock);
2326 
2327 	/*
2328 	 * Process deferred PTP frees.  No TLB shootdown required, as the
2329 	 * PTP pages are no longer visible to any CPU.
2330 	 */
2331 
2332 	pmap_free_ptps(pmap->pm_gc_ptp);
2333 
2334 	/*
2335 	 * destroyed pmap shouldn't have remaining PTPs
2336 	 */
2337 
2338 	for (i = 0; i < PTP_LEVELS - 1; i++) {
2339 		KASSERT(pmap->pm_obj[i].uo_npages == 0);
2340 		KASSERT(TAILQ_EMPTY(&pmap->pm_obj[i].memq));
2341 	}
2342 
2343 	/*
2344 	 * MULTIPROCESSOR -- no need to flush out of other processors'
2345 	 * APTE space because we do that in pmap_unmap_ptes().
2346 	 */
2347 	pool_cache_put(&pmap_pdp_cache, pmap->pm_pdir);
2348 
2349 #ifdef USER_LDT
2350 	if (pmap->pm_ldt != NULL) {
2351 		/*
2352 		 * no need to switch the LDT; this address space is gone,
2353 		 * nothing is using it.
2354 		 *
2355 		 * No need to lock the pmap for ldt_free (or anything else),
2356 		 * we're the last one to use it.
2357 		 */
2358 		mutex_enter(&cpu_lock);
2359 		ldt_free(pmap->pm_ldt_sel);
2360 		mutex_exit(&cpu_lock);
2361 		uvm_km_free(kernel_map, (vaddr_t)pmap->pm_ldt,
2362 		    pmap->pm_ldt_len, UVM_KMF_WIRED);
2363 	}
2364 #endif
2365 
2366 	for (i = 0; i < PTP_LEVELS - 1; i++) {
2367 		uvm_obj_destroy(&pmap->pm_obj[i], false);
2368 		mutex_destroy(&pmap->pm_obj_lock[i]);
2369 	}
2370 	pool_cache_put(&pmap_cache, pmap);
2371 }
2372 
2373 /*
2374  * pmap_remove_all: pmap is being torn down by the current thread.
2375  * avoid unnecessary invalidations.
2376  */
2377 
2378 void
2379 pmap_remove_all(struct pmap *pmap)
2380 {
2381 	lwp_t *l = curlwp;
2382 
2383 	KASSERT(l->l_md.md_gc_pmap == NULL);
2384 
2385 	l->l_md.md_gc_pmap = pmap;
2386 }
2387 
2388 #if defined(PMAP_FORK)
2389 /*
2390  * pmap_fork: perform any necessary data structure manipulation when
2391  * a VM space is forked.
2392  */
2393 
2394 void
2395 pmap_fork(struct pmap *pmap1, struct pmap *pmap2)
2396 {
2397 #ifdef USER_LDT
2398 	union descriptor *new_ldt;
2399 	size_t len;
2400 	int sel;
2401 
2402 	if (__predict_true(pmap1->pm_ldt == NULL)) {
2403 		return;
2404 	}
2405 
2406  retry:
2407 	if (pmap1->pm_ldt != NULL) {
2408 		len = pmap1->pm_ldt_len;
2409 		new_ldt = (union descriptor *)uvm_km_alloc(kernel_map, len, 0,
2410 		    UVM_KMF_WIRED);
2411 		mutex_enter(&cpu_lock);
2412 		sel = ldt_alloc(new_ldt, len);
2413 		if (sel == -1) {
2414 			mutex_exit(&cpu_lock);
2415 			uvm_km_free(kernel_map, (vaddr_t)new_ldt, len,
2416 			    UVM_KMF_WIRED);
2417 			printf("WARNING: pmap_fork: unable to allocate LDT\n");
2418 			return;
2419 		}
2420 	} else {
2421 		len = -1;
2422 		new_ldt = NULL;
2423 		sel = -1;
2424 		mutex_enter(&cpu_lock);
2425 	}
2426 
2427  	/* Copy the LDT, if necessary. */
2428  	if (pmap1->pm_ldt != NULL) {
2429 		if (len != pmap1->pm_ldt_len) {
2430 			if (len != -1) {
2431 				ldt_free(sel);
2432 				uvm_km_free(kernel_map, (vaddr_t)new_ldt,
2433 				    len, UVM_KMF_WIRED);
2434 			}
2435 			mutex_exit(&cpu_lock);
2436 			goto retry;
2437 		}
2438 
2439 		memcpy(new_ldt, pmap1->pm_ldt, len);
2440 		pmap2->pm_ldt = new_ldt;
2441 		pmap2->pm_ldt_len = pmap1->pm_ldt_len;
2442 		pmap2->pm_ldt_sel = sel;
2443 		len = -1;
2444 	}
2445 
2446 	if (len != -1) {
2447 		ldt_free(sel);
2448 		uvm_km_free(kernel_map, (vaddr_t)new_ldt, len,
2449 		    UVM_KMF_WIRED);
2450 	}
2451 	mutex_exit(&cpu_lock);
2452 #endif /* USER_LDT */
2453 }
2454 #endif /* PMAP_FORK */
2455 
2456 #ifdef USER_LDT
2457 
2458 /*
2459  * pmap_ldt_xcall: cross call used by pmap_ldt_sync.  if the named pmap
2460  * is active, reload LDTR.
2461  */
2462 static void
2463 pmap_ldt_xcall(void *arg1, void *arg2)
2464 {
2465 	struct pmap *pm;
2466 
2467 	kpreempt_disable();
2468 	pm = arg1;
2469 	if (curcpu()->ci_pmap == pm) {
2470 		lldt(pm->pm_ldt_sel);
2471 	}
2472 	kpreempt_enable();
2473 }
2474 
2475 /*
2476  * pmap_ldt_sync: LDT selector for the named pmap is changing.  swap
2477  * in the new selector on all CPUs.
2478  */
2479 void
2480 pmap_ldt_sync(struct pmap *pm)
2481 {
2482 	uint64_t where;
2483 
2484 	KASSERT(mutex_owned(&cpu_lock));
2485 
2486 	pmap_ldt_evcnt.ev_count++;
2487 	where = xc_broadcast(0, pmap_ldt_xcall, pm, NULL);
2488 	xc_wait(where);
2489 }
2490 
2491 /*
2492  * pmap_ldt_cleanup: if the pmap has a local LDT, deallocate it, and
2493  * restore the default.
2494  */
2495 
2496 void
2497 pmap_ldt_cleanup(struct lwp *l)
2498 {
2499 	pmap_t pmap = l->l_proc->p_vmspace->vm_map.pmap;
2500 	union descriptor *dp = NULL;
2501 	size_t len = 0;
2502 	int sel = -1;
2503 
2504 	if (__predict_true(pmap->pm_ldt == NULL)) {
2505 		return;
2506 	}
2507 
2508 	mutex_enter(&cpu_lock);
2509 	if (pmap->pm_ldt != NULL) {
2510 		sel = pmap->pm_ldt_sel;
2511 		dp = pmap->pm_ldt;
2512 		len = pmap->pm_ldt_len;
2513 		pmap->pm_ldt_sel = GSYSSEL(GLDT_SEL, SEL_KPL);
2514 		pmap->pm_ldt = NULL;
2515 		pmap->pm_ldt_len = 0;
2516 		pmap_ldt_sync(pmap);
2517 		ldt_free(sel);
2518 		uvm_km_free(kernel_map, (vaddr_t)dp, len, UVM_KMF_WIRED);
2519 	}
2520 	mutex_exit(&cpu_lock);
2521 }
2522 #endif /* USER_LDT */
2523 
2524 /*
2525  * pmap_activate: activate a process' pmap
2526  *
2527  * => must be called with kernel preemption disabled
2528  * => if lwp is the curlwp, then set ci_want_pmapload so that
2529  *    actual MMU context switch will be done by pmap_load() later
2530  */
2531 
2532 void
2533 pmap_activate(struct lwp *l)
2534 {
2535 	struct cpu_info *ci;
2536 	struct pmap *pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map);
2537 
2538 	KASSERT(kpreempt_disabled());
2539 
2540 	ci = curcpu();
2541 
2542 	if (l == ci->ci_curlwp) {
2543 		KASSERT(ci->ci_want_pmapload == 0);
2544 		KASSERT(ci->ci_tlbstate != TLBSTATE_VALID);
2545 #ifdef KSTACK_CHECK_DR0
2546 		/*
2547 		 * setup breakpoint on the top of stack
2548 		 */
2549 		if (l == &lwp0)
2550 			dr0(0, 0, 0, 0);
2551 		else
2552 			dr0(KSTACK_LOWEST_ADDR(l), 1, 3, 1);
2553 #endif
2554 
2555 		/*
2556 		 * no need to switch to kernel vmspace because
2557 		 * it's a subset of any vmspace.
2558 		 */
2559 
2560 		if (pmap == pmap_kernel()) {
2561 			ci->ci_want_pmapload = 0;
2562 			return;
2563 		}
2564 
2565 		ci->ci_want_pmapload = 1;
2566 	}
2567 }
2568 
2569 /*
2570  * pmap_reactivate: try to regain reference to the pmap.
2571  *
2572  * => must be called with kernel preemption disabled
2573  */
2574 
2575 static bool
2576 pmap_reactivate(struct pmap *pmap)
2577 {
2578 	struct cpu_info *ci;
2579 	uint32_t cpumask;
2580 	bool result;
2581 	uint32_t oldcpus;
2582 
2583 	ci = curcpu();
2584 	cpumask = ci->ci_cpumask;
2585 
2586 	KASSERT(kpreempt_disabled());
2587 #if defined(XEN) && defined(__x86_64__)
2588 	KASSERT(pmap_pdirpa(pmap, 0) == ci->ci_xen_current_user_pgd);
2589 #elif defined(PAE)
2590 	KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(ci->ci_pae_l3_pdir[0]));
2591 #elif !defined(XEN)
2592 	KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(rcr3()));
2593 #endif
2594 
2595 	/*
2596 	 * if we still have a lazy reference to this pmap,
2597 	 * we can assume that there was no tlb shootdown
2598 	 * for this pmap in the meantime.
2599 	 *
2600 	 * the order of events here is important as we must
2601 	 * synchronize with TLB shootdown interrupts.  declare
2602 	 * interest in invalidations (TLBSTATE_VALID) and then
2603 	 * check the cpumask, which the IPIs can change only
2604 	 * when the state is TLBSTATE_LAZY.
2605 	 */
2606 
2607 	ci->ci_tlbstate = TLBSTATE_VALID;
2608 	oldcpus = pmap->pm_cpus;
2609 	KASSERT((pmap->pm_kernel_cpus & cpumask) != 0);
2610 	if (oldcpus & cpumask) {
2611 		/* got it */
2612 		result = true;
2613 	} else {
2614 		/* must reload */
2615 		atomic_or_32(&pmap->pm_cpus, cpumask);
2616 		result = false;
2617 	}
2618 
2619 	return result;
2620 }
2621 
2622 /*
2623  * pmap_load: actually switch pmap.  (fill in %cr3 and LDT info)
2624  *
2625  * ensures that the current process' pmap is loaded on the current cpu's MMU
2626  * and there's no stale TLB entries.
2627  *
2628  * the caller should disable preemption or do check-and-retry to prevent
2629  * a preemption from undoing our efforts.
2630  *
2631  * this function can block.
2632  */
2633 
2634 void
2635 pmap_load(void)
2636 {
2637 	struct cpu_info *ci;
2638 	uint32_t cpumask;
2639 	struct pmap *pmap;
2640 	struct pmap *oldpmap;
2641 	struct lwp *l;
2642 	struct pcb *pcb;
2643 	uint64_t ncsw;
2644 
2645 	kpreempt_disable();
2646  retry:
2647 	ci = curcpu();
2648 	if (!ci->ci_want_pmapload) {
2649 		kpreempt_enable();
2650 		return;
2651 	}
2652 	cpumask = ci->ci_cpumask;
2653 	l = ci->ci_curlwp;
2654 	ncsw = l->l_ncsw;
2655 
2656 	/* should be able to take ipis. */
2657 	KASSERT(ci->ci_ilevel < IPL_HIGH);
2658 #ifdef XEN
2659 	/* Check to see if interrupts are enabled (ie; no events are masked) */
2660 	KASSERT(x86_read_psl() == 0);
2661 #else
2662 	KASSERT((x86_read_psl() & PSL_I) != 0);
2663 #endif
2664 
2665 	KASSERT(l != NULL);
2666 	pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map);
2667 	KASSERT(pmap != pmap_kernel());
2668 	oldpmap = ci->ci_pmap;
2669 	pcb = lwp_getpcb(l);
2670 
2671 	if (pmap == oldpmap) {
2672 		if (!pmap_reactivate(pmap)) {
2673 			u_int gen = uvm_emap_gen_return();
2674 
2675 			/*
2676 			 * pmap has been changed during deactivated.
2677 			 * our tlb may be stale.
2678 			 */
2679 
2680 			tlbflush();
2681 			uvm_emap_update(gen);
2682 		}
2683 
2684 		ci->ci_want_pmapload = 0;
2685 		kpreempt_enable();
2686 		return;
2687 	}
2688 
2689 	/*
2690 	 * grab a reference to the new pmap.
2691 	 */
2692 
2693 	pmap_reference(pmap);
2694 
2695 	/*
2696 	 * actually switch pmap.
2697 	 */
2698 
2699 	atomic_and_32(&oldpmap->pm_cpus, ~cpumask);
2700 	atomic_and_32(&oldpmap->pm_kernel_cpus, ~cpumask);
2701 
2702 #if defined(XEN) && defined(__x86_64__)
2703 	KASSERT(pmap_pdirpa(oldpmap, 0) == ci->ci_xen_current_user_pgd ||
2704 	    oldpmap == pmap_kernel());
2705 #elif defined(PAE)
2706 	KASSERT(pmap_pdirpa(oldpmap, 0) == pmap_pte2pa(ci->ci_pae_l3_pdir[0]));
2707 #elif !defined(XEN)
2708 	KASSERT(pmap_pdirpa(oldpmap, 0) == pmap_pte2pa(rcr3()));
2709 #endif
2710 	KASSERT((pmap->pm_cpus & cpumask) == 0);
2711 	KASSERT((pmap->pm_kernel_cpus & cpumask) == 0);
2712 
2713 	/*
2714 	 * mark the pmap in use by this processor.  again we must
2715 	 * synchronize with TLB shootdown interrupts, so set the
2716 	 * state VALID first, then register us for shootdown events
2717 	 * on this pmap.
2718 	 */
2719 
2720 	ci->ci_tlbstate = TLBSTATE_VALID;
2721 	atomic_or_32(&pmap->pm_cpus, cpumask);
2722 	atomic_or_32(&pmap->pm_kernel_cpus, cpumask);
2723 	ci->ci_pmap = pmap;
2724 
2725 	/*
2726 	 * update tss.  now that we have registered for invalidations
2727 	 * from other CPUs, we're good to load the page tables.
2728 	 */
2729 #ifdef PAE
2730 	pcb->pcb_cr3 = ci->ci_pae_l3_pdirpa;
2731 #else
2732 	pcb->pcb_cr3 = pmap_pdirpa(pmap, 0);
2733 #endif
2734 
2735 #ifdef i386
2736 #ifdef XEN
2737 	/*
2738 	 * clear APDP slot, in case it points to a page table that has
2739 	 * been freed
2740 	 */
2741 	if (*APDP_PDE) {
2742 		pmap_unmap_apdp();
2743 	}
2744 	/* lldt() does pmap_pte_flush() */
2745 #endif /* XEN */
2746 
2747 #ifndef XEN
2748 	ci->ci_tss.tss_ldt = pmap->pm_ldt_sel;
2749 	ci->ci_tss.tss_cr3 = pcb->pcb_cr3;
2750 #endif /* !XEN */
2751 #endif /* i386 */
2752 
2753 	lldt(pmap->pm_ldt_sel);
2754 
2755 	u_int gen = uvm_emap_gen_return();
2756 	cpu_load_pmap(pmap);
2757 	uvm_emap_update(gen);
2758 
2759 	ci->ci_want_pmapload = 0;
2760 
2761 	/*
2762 	 * we're now running with the new pmap.  drop the reference
2763 	 * to the old pmap.  if we block, we need to go around again.
2764 	 */
2765 
2766 	pmap_destroy(oldpmap);
2767 	if (l->l_ncsw != ncsw) {
2768 		goto retry;
2769 	}
2770 
2771 	kpreempt_enable();
2772 }
2773 
2774 /*
2775  * pmap_deactivate: deactivate a process' pmap.
2776  *
2777  * => Must be called with kernel preemption disabled (high IPL is enough).
2778  */
2779 void
2780 pmap_deactivate(struct lwp *l)
2781 {
2782 	struct pmap *pmap;
2783 	struct cpu_info *ci;
2784 
2785 	KASSERT(kpreempt_disabled());
2786 
2787 	if (l != curlwp) {
2788 		return;
2789 	}
2790 
2791 	/*
2792 	 * Wait for pending TLB shootdowns to complete.  Necessary because
2793 	 * TLB shootdown state is per-CPU, and the LWP may be coming off
2794 	 * the CPU before it has a chance to call pmap_update(), e.g. due
2795 	 * to kernel preemption or blocking routine in between.
2796 	 */
2797 	pmap_tlb_shootnow();
2798 
2799 	ci = curcpu();
2800 
2801 	if (ci->ci_want_pmapload) {
2802 		/*
2803 		 * ci_want_pmapload means that our pmap is not loaded on
2804 		 * the CPU or TLB might be stale.  note that pmap_kernel()
2805 		 * is always considered loaded.
2806 		 */
2807 		KASSERT(vm_map_pmap(&l->l_proc->p_vmspace->vm_map)
2808 		    != pmap_kernel());
2809 		KASSERT(vm_map_pmap(&l->l_proc->p_vmspace->vm_map)
2810 		    != ci->ci_pmap || ci->ci_tlbstate != TLBSTATE_VALID);
2811 
2812 		/*
2813 		 * userspace has not been touched.
2814 		 * nothing to do here.
2815 		 */
2816 
2817 		ci->ci_want_pmapload = 0;
2818 		return;
2819 	}
2820 
2821 	pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map);
2822 
2823 	if (pmap == pmap_kernel()) {
2824 		return;
2825 	}
2826 
2827 #if defined(XEN) && defined(__x86_64__)
2828 	KASSERT(pmap_pdirpa(pmap, 0) == ci->ci_xen_current_user_pgd);
2829 #elif defined(PAE)
2830 	KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(ci->ci_pae_l3_pdir[0]));
2831 #elif !defined(XEN)
2832 	KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(rcr3()));
2833 #endif
2834 	KASSERT(ci->ci_pmap == pmap);
2835 
2836 	/*
2837 	 * we aren't interested in TLB invalidations for this pmap,
2838 	 * at least for the time being.
2839 	 */
2840 
2841 	KASSERT(ci->ci_tlbstate == TLBSTATE_VALID);
2842 	ci->ci_tlbstate = TLBSTATE_LAZY;
2843 }
2844 
2845 /*
2846  * end of lifecycle functions
2847  */
2848 
2849 /*
2850  * some misc. functions
2851  */
2852 
2853 int
2854 pmap_pdes_invalid(vaddr_t va, pd_entry_t * const *pdes, pd_entry_t *lastpde)
2855 {
2856 	int i;
2857 	unsigned long index;
2858 	pd_entry_t pde;
2859 
2860 	for (i = PTP_LEVELS; i > 1; i--) {
2861 		index = pl_i(va, i);
2862 		pde = pdes[i - 2][index];
2863 		if ((pde & PG_V) == 0)
2864 			return i;
2865 	}
2866 	if (lastpde != NULL)
2867 		*lastpde = pde;
2868 	return 0;
2869 }
2870 
2871 /*
2872  * pmap_extract: extract a PA for the given VA
2873  */
2874 
2875 bool
2876 pmap_extract(struct pmap *pmap, vaddr_t va, paddr_t *pap)
2877 {
2878 	pt_entry_t *ptes, pte;
2879 	pd_entry_t pde;
2880 	pd_entry_t * const *pdes;
2881 	struct pmap *pmap2;
2882 	struct cpu_info *ci;
2883 	paddr_t pa;
2884 	lwp_t *l;
2885 	bool hard, rv;
2886 
2887 	rv = false;
2888 	pa = 0;
2889 	l = curlwp;
2890 
2891 	KPREEMPT_DISABLE(l);
2892 	ci = l->l_cpu;
2893 	if (__predict_true(!ci->ci_want_pmapload && ci->ci_pmap == pmap) ||
2894 	    pmap == pmap_kernel()) {
2895 		/*
2896 		 * no need to lock, because it's pmap_kernel() or our
2897 		 * own pmap and is active.  if a user pmap, the caller
2898 		 * will hold the vm_map write/read locked and so prevent
2899 		 * entries from disappearing while we are here.  ptps
2900 		 * can disappear via pmap_remove() and pmap_protect(),
2901 		 * but they are called with the vm_map write locked.
2902 		 */
2903 		hard = false;
2904 		ptes = PTE_BASE;
2905 		pdes = normal_pdes;
2906 	} else {
2907 		/* we lose, do it the hard way. */
2908 		hard = true;
2909 		pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
2910 	}
2911 	if (pmap_pdes_valid(va, pdes, &pde)) {
2912 		pte = ptes[pl1_i(va)];
2913 		if (pde & PG_PS) {
2914 			pa = (pde & PG_LGFRAME) | (va & (NBPD_L2 - 1));
2915 			rv = true;
2916 		} else if (__predict_true((pte & PG_V) != 0)) {
2917 			pa = pmap_pte2pa(pte) | (va & (NBPD_L1 - 1));
2918 			rv = true;
2919 		}
2920 	}
2921 	if (__predict_false(hard)) {
2922 		pmap_unmap_ptes(pmap, pmap2);
2923 	}
2924 	KPREEMPT_ENABLE(l);
2925 	if (pap != NULL) {
2926 		*pap = pa;
2927 	}
2928 	return rv;
2929 }
2930 
2931 
2932 /*
2933  * vtophys: virtual address to physical address.  For use by
2934  * machine-dependent code only.
2935  */
2936 
2937 paddr_t
2938 vtophys(vaddr_t va)
2939 {
2940 	paddr_t pa;
2941 
2942 	if (pmap_extract(pmap_kernel(), va, &pa) == true)
2943 		return (pa);
2944 	return (0);
2945 }
2946 
2947 __strict_weak_alias(pmap_extract_ma, pmap_extract);
2948 
2949 #ifdef XEN
2950 
2951 /*
2952  * vtomach: virtual address to machine address.  For use by
2953  * machine-dependent code only.
2954  */
2955 
2956 paddr_t
2957 vtomach(vaddr_t va)
2958 {
2959 	paddr_t pa;
2960 
2961 	if (pmap_extract_ma(pmap_kernel(), va, &pa) == true)
2962 		return (pa);
2963 	return (0);
2964 }
2965 
2966 #endif /* XEN */
2967 
2968 /*
2969  * pmap_virtual_space: used during bootup [pmap_steal_memory] to
2970  *	determine the bounds of the kernel virtual addess space.
2971  */
2972 
2973 void
2974 pmap_virtual_space(vaddr_t *startp, vaddr_t *endp)
2975 {
2976 	*startp = virtual_avail;
2977 	*endp = virtual_end;
2978 }
2979 
2980 /*
2981  * pmap_map: map a range of PAs into kvm.
2982  *
2983  * => used during crash dump
2984  * => XXX: pmap_map() should be phased out?
2985  */
2986 
2987 vaddr_t
2988 pmap_map(vaddr_t va, paddr_t spa, paddr_t epa, vm_prot_t prot)
2989 {
2990 	while (spa < epa) {
2991 		pmap_kenter_pa(va, spa, prot, 0);
2992 		va += PAGE_SIZE;
2993 		spa += PAGE_SIZE;
2994 	}
2995 	pmap_update(pmap_kernel());
2996 	return va;
2997 }
2998 
2999 /*
3000  * pmap_zero_page: zero a page
3001  */
3002 
3003 void
3004 pmap_zero_page(paddr_t pa)
3005 {
3006 	pt_entry_t *zpte;
3007 	void *zerova;
3008 	int id;
3009 
3010 	kpreempt_disable();
3011 	id = cpu_number();
3012 	zpte = PTESLEW(zero_pte, id);
3013 	zerova = VASLEW(zerop, id);
3014 
3015 #ifdef DIAGNOSTIC
3016 	if (*zpte)
3017 		panic("pmap_zero_page: lock botch");
3018 #endif
3019 
3020 	pmap_pte_set(zpte, pmap_pa2pte(pa) | PG_V | PG_RW | PG_M | PG_U | PG_k);
3021 	pmap_pte_flush();
3022 	pmap_update_pg((vaddr_t)zerova);		/* flush TLB */
3023 
3024 	memset(zerova, 0, PAGE_SIZE);
3025 
3026 #if defined(DIAGNOSTIC) || defined(XEN)
3027 	pmap_pte_set(zpte, 0);				/* zap ! */
3028 	pmap_pte_flush();
3029 #endif
3030 	kpreempt_enable();
3031 }
3032 
3033 /*
3034  * pmap_pagezeroidle: the same, for the idle loop page zero'er.
3035  * Returns true if the page was zero'd, false if we aborted for
3036  * some reason.
3037  */
3038 
3039 bool
3040 pmap_pageidlezero(paddr_t pa)
3041 {
3042 	pt_entry_t *zpte;
3043 	void *zerova;
3044 	bool rv;
3045 	int id;
3046 
3047 	id = cpu_number();
3048 	zpte = PTESLEW(zero_pte, id);
3049 	zerova = VASLEW(zerop, id);
3050 
3051 	KASSERT(cpu_feature[0] & CPUID_SSE2);
3052 	KASSERT(*zpte == 0);
3053 
3054 	pmap_pte_set(zpte, pmap_pa2pte(pa) | PG_V | PG_RW | PG_M | PG_U | PG_k);
3055 	pmap_pte_flush();
3056 	pmap_update_pg((vaddr_t)zerova);		/* flush TLB */
3057 
3058 	rv = sse2_idlezero_page(zerova);
3059 
3060 #if defined(DIAGNOSTIC) || defined(XEN)
3061 	pmap_pte_set(zpte, 0);				/* zap ! */
3062 	pmap_pte_flush();
3063 #endif
3064 
3065 	return rv;
3066 }
3067 
3068 /*
3069  * pmap_copy_page: copy a page
3070  */
3071 
3072 void
3073 pmap_copy_page(paddr_t srcpa, paddr_t dstpa)
3074 {
3075 	pt_entry_t *spte;
3076 	pt_entry_t *dpte;
3077 	void *csrcva;
3078 	void *cdstva;
3079 	int id;
3080 
3081 	kpreempt_disable();
3082 	id = cpu_number();
3083 	spte = PTESLEW(csrc_pte,id);
3084 	dpte = PTESLEW(cdst_pte,id);
3085 	csrcva = VASLEW(csrcp, id);
3086 	cdstva = VASLEW(cdstp, id);
3087 
3088 	KASSERT(*spte == 0 && *dpte == 0);
3089 
3090 	pmap_pte_set(spte, pmap_pa2pte(srcpa) | PG_V | PG_RW | PG_U | PG_k);
3091 	pmap_pte_set(dpte,
3092 	    pmap_pa2pte(dstpa) | PG_V | PG_RW | PG_M | PG_U | PG_k);
3093 	pmap_pte_flush();
3094 	pmap_update_2pg((vaddr_t)csrcva, (vaddr_t)cdstva);
3095 
3096 	memcpy(cdstva, csrcva, PAGE_SIZE);
3097 
3098 #if defined(DIAGNOSTIC) || defined(XEN)
3099 	pmap_pte_set(spte, 0);
3100 	pmap_pte_set(dpte, 0);
3101 	pmap_pte_flush();
3102 #endif
3103 	kpreempt_enable();
3104 }
3105 
3106 static pt_entry_t *
3107 pmap_map_ptp(struct vm_page *ptp)
3108 {
3109 	pt_entry_t *ptppte;
3110 	void *ptpva;
3111 	int id;
3112 
3113 	KASSERT(kpreempt_disabled());
3114 
3115 	id = cpu_number();
3116 	ptppte = PTESLEW(ptp_pte, id);
3117 	ptpva = VASLEW(ptpp, id);
3118 #if !defined(XEN)
3119 	pmap_pte_set(ptppte, pmap_pa2pte(VM_PAGE_TO_PHYS(ptp)) | PG_V | PG_M |
3120 	    PG_RW | PG_U | PG_k);
3121 #else
3122 	pmap_pte_set(ptppte, pmap_pa2pte(VM_PAGE_TO_PHYS(ptp)) | PG_V | PG_M |
3123 	    PG_U | PG_k);
3124 #endif
3125 	pmap_pte_flush();
3126 	pmap_update_pg((vaddr_t)ptpva);
3127 
3128 	return (pt_entry_t *)ptpva;
3129 }
3130 
3131 static void
3132 pmap_unmap_ptp(void)
3133 {
3134 #if defined(DIAGNOSTIC) || defined(XEN)
3135 	pt_entry_t *pte;
3136 
3137 	KASSERT(kpreempt_disabled());
3138 
3139 	pte = PTESLEW(ptp_pte, cpu_number());
3140 	if (*pte != 0) {
3141 		pmap_pte_set(pte, 0);
3142 		pmap_pte_flush();
3143 	}
3144 #endif
3145 }
3146 
3147 static pt_entry_t *
3148 pmap_map_pte(struct pmap *pmap, struct vm_page *ptp, vaddr_t va)
3149 {
3150 
3151 	KASSERT(kpreempt_disabled());
3152 	if (pmap_is_curpmap(pmap)) {
3153 		return &PTE_BASE[pl1_i(va)]; /* (k)vtopte */
3154 	}
3155 	KASSERT(ptp != NULL);
3156 	return pmap_map_ptp(ptp) + pl1_pi(va);
3157 }
3158 
3159 static void
3160 pmap_unmap_pte(void)
3161 {
3162 
3163 	KASSERT(kpreempt_disabled());
3164 
3165 	pmap_unmap_ptp();
3166 }
3167 
3168 /*
3169  * p m a p   r e m o v e   f u n c t i o n s
3170  *
3171  * functions that remove mappings
3172  */
3173 
3174 /*
3175  * pmap_remove_ptes: remove PTEs from a PTP
3176  *
3177  * => caller must hold pmap's lock
3178  * => PTP must be mapped into KVA
3179  * => PTP should be null if pmap == pmap_kernel()
3180  * => must be called with kernel preemption disabled
3181  * => returns composite pte if at least one page should be shot down
3182  */
3183 
3184 static void
3185 pmap_remove_ptes(struct pmap *pmap, struct vm_page *ptp, vaddr_t ptpva,
3186 		 vaddr_t startva, vaddr_t endva, struct pv_entry **pv_tofree)
3187 {
3188 	pt_entry_t *pte = (pt_entry_t *)ptpva;
3189 
3190 	KASSERT(pmap == pmap_kernel() || mutex_owned(pmap->pm_lock));
3191 	KASSERT(kpreempt_disabled());
3192 
3193 	/*
3194 	 * note that ptpva points to the PTE that maps startva.   this may
3195 	 * or may not be the first PTE in the PTP.
3196 	 *
3197 	 * we loop through the PTP while there are still PTEs to look at
3198 	 * and the wire_count is greater than 1 (because we use the wire_count
3199 	 * to keep track of the number of real PTEs in the PTP).
3200 	 */
3201 	while (startva < endva && (ptp == NULL || ptp->wire_count > 1)) {
3202 		(void)pmap_remove_pte(pmap, ptp, pte, startva, pv_tofree);
3203 		startva += PAGE_SIZE;
3204 		pte++;
3205 	}
3206 }
3207 
3208 
3209 /*
3210  * pmap_remove_pte: remove a single PTE from a PTP.
3211  *
3212  * => caller must hold pmap's lock
3213  * => PTP must be mapped into KVA
3214  * => PTP should be null if pmap == pmap_kernel()
3215  * => returns true if we removed a mapping
3216  * => must be called with kernel preemption disabled
3217  */
3218 static bool
3219 pmap_remove_pte(struct pmap *pmap, struct vm_page *ptp, pt_entry_t *pte,
3220 		vaddr_t va, struct pv_entry **pv_tofree)
3221 {
3222 	struct pv_entry *pve;
3223 	struct vm_page *pg;
3224 	struct pmap_page *pp;
3225 	pt_entry_t opte;
3226 
3227 	KASSERT(pmap == pmap_kernel() || mutex_owned(pmap->pm_lock));
3228 	KASSERT(kpreempt_disabled());
3229 
3230 	if (!pmap_valid_entry(*pte)) {
3231 		/* VA not mapped. */
3232 		return false;
3233 	}
3234 
3235 	/* Atomically save the old PTE and zap it. */
3236 	opte = pmap_pte_testset(pte, 0);
3237 	if (!pmap_valid_entry(opte)) {
3238 		return false;
3239 	}
3240 
3241 	pmap_exec_account(pmap, va, opte, 0);
3242 	pmap_stats_update_bypte(pmap, 0, opte);
3243 
3244 	if (ptp) {
3245 		/*
3246 		 * Dropping a PTE.  Make sure that the PDE is flushed.
3247 		 */
3248 		ptp->wire_count--;
3249 		if (ptp->wire_count <= 1) {
3250 			opte |= PG_U;
3251 		}
3252 	}
3253 
3254 	if ((opte & PG_U) != 0) {
3255 		pmap_tlb_shootdown(pmap, va, opte, TLBSHOOT_REMOVE_PTE);
3256 	}
3257 
3258 	/*
3259 	 * If we are not on a pv_head list - we are done.
3260 	 */
3261 	if ((opte & PG_PVLIST) == 0) {
3262 #if defined(DIAGNOSTIC) && !defined(DOM0OPS)
3263 		if (PHYS_TO_VM_PAGE(pmap_pte2pa(opte)) != NULL)
3264 			panic("pmap_remove_pte: managed page without "
3265 			      "PG_PVLIST for %#" PRIxVADDR, va);
3266 #endif
3267 		return true;
3268 	}
3269 
3270 	pg = PHYS_TO_VM_PAGE(pmap_pte2pa(opte));
3271 
3272 	KASSERTMSG(pg != NULL, "pmap_remove_pte: unmanaged page marked "
3273 	    "PG_PVLIST, va = %#" PRIxVADDR ", pa = %#" PRIxPADDR,
3274 	    va, (paddr_t)pmap_pte2pa(opte));
3275 
3276 	KASSERT(uvm_page_locked_p(pg));
3277 
3278 	/* Sync R/M bits. */
3279 	pp = VM_PAGE_TO_PP(pg);
3280 	pp->pp_attrs |= opte;
3281 	pve = pmap_remove_pv(pp, ptp, va);
3282 
3283 	if (pve) {
3284 		pve->pve_next = *pv_tofree;
3285 		*pv_tofree = pve;
3286 	}
3287 	return true;
3288 }
3289 
3290 /*
3291  * pmap_remove: mapping removal function.
3292  *
3293  * => caller should not be holding any pmap locks
3294  */
3295 
3296 void
3297 pmap_remove(struct pmap *pmap, vaddr_t sva, vaddr_t eva)
3298 {
3299 	pt_entry_t *ptes;
3300 	pd_entry_t pde;
3301 	pd_entry_t * const *pdes;
3302 	struct pv_entry *pv_tofree = NULL;
3303 	bool result;
3304 	int i;
3305 	paddr_t ptppa;
3306 	vaddr_t blkendva, va = sva;
3307 	struct vm_page *ptp;
3308 	struct pmap *pmap2;
3309 
3310 	kpreempt_disable();
3311 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);	/* locks pmap */
3312 
3313 	/*
3314 	 * removing one page?  take shortcut function.
3315 	 */
3316 
3317 	if (va + PAGE_SIZE == eva) {
3318 		if (pmap_pdes_valid(va, pdes, &pde)) {
3319 
3320 			/* PA of the PTP */
3321 			ptppa = pmap_pte2pa(pde);
3322 
3323 			/* Get PTP if non-kernel mapping. */
3324 			if (pmap != pmap_kernel()) {
3325 				ptp = pmap_find_ptp(pmap, va, ptppa, 1);
3326 				KASSERTMSG(ptp != NULL,
3327 				    "pmap_remove: unmanaged PTP detected");
3328 			} else {
3329 				/* Never free kernel PTPs. */
3330 				ptp = NULL;
3331 			}
3332 
3333 			result = pmap_remove_pte(pmap, ptp,
3334 			    &ptes[pl1_i(va)], va, &pv_tofree);
3335 
3336 			/*
3337 			 * if mapping removed and the PTP is no longer
3338 			 * being used, free it!
3339 			 */
3340 
3341 			if (result && ptp && ptp->wire_count <= 1)
3342 				pmap_free_ptp(pmap, ptp, va, ptes, pdes);
3343 		}
3344 	} else for (/* null */ ; va < eva ; va = blkendva) {
3345 		int lvl;
3346 
3347 		/* determine range of block */
3348 		blkendva = x86_round_pdr(va+1);
3349 		if (blkendva > eva)
3350 			blkendva = eva;
3351 
3352 		/*
3353 		 * XXXCDC: our PTE mappings should never be removed
3354 		 * with pmap_remove!  if we allow this (and why would
3355 		 * we?) then we end up freeing the pmap's page
3356 		 * directory page (PDP) before we are finished using
3357 		 * it when we hit in in the recursive mapping.  this
3358 		 * is BAD.
3359 		 *
3360 		 * long term solution is to move the PTEs out of user
3361 		 * address space.  and into kernel address space (up
3362 		 * with APTE).  then we can set VM_MAXUSER_ADDRESS to
3363 		 * be VM_MAX_ADDRESS.
3364 		 */
3365 
3366 		/* XXXCDC: ugly hack to avoid freeing PDP here */
3367 		for (i = 0; i < PDP_SIZE; i++) {
3368 			if (pl_i(va, PTP_LEVELS) == PDIR_SLOT_PTE+i)
3369 				continue;
3370 		}
3371 
3372 		lvl = pmap_pdes_invalid(va, pdes, &pde);
3373 		if (lvl != 0) {
3374 			/*
3375 			 * skip a range corresponding to an invalid pde.
3376 			 */
3377 			blkendva = (va & ptp_masks[lvl - 1]) + nbpd[lvl - 1];
3378  			continue;
3379 		}
3380 
3381 		/* PA of the PTP */
3382 		ptppa = pmap_pte2pa(pde);
3383 
3384 		/* Get PTP if non-kernel mapping. */
3385 		if (pmap != pmap_kernel()) {
3386 			ptp = pmap_find_ptp(pmap, va, ptppa, 1);
3387 			KASSERTMSG(ptp != NULL,
3388 			    "pmap_remove: unmanaged PTP detected");
3389 		} else {
3390 			/* Never free kernel PTPs. */
3391 			ptp = NULL;
3392 		}
3393 
3394 		pmap_remove_ptes(pmap, ptp, (vaddr_t)&ptes[pl1_i(va)], va,
3395 		    blkendva, &pv_tofree);
3396 
3397 		/* if PTP is no longer being used, free it! */
3398 		if (ptp && ptp->wire_count <= 1) {
3399 			pmap_free_ptp(pmap, ptp, va, ptes, pdes);
3400 		}
3401 	}
3402 	pmap_unmap_ptes(pmap, pmap2);		/* unlock pmap */
3403 	kpreempt_enable();
3404 
3405 	/* Now we free unused PVs */
3406 	if (pv_tofree)
3407 		pmap_free_pvs(pv_tofree);
3408 }
3409 
3410 /*
3411  * pmap_sync_pv: clear pte bits and return the old value of the pte.
3412  *
3413  * => Caller should disable kernel preemption.
3414  * => issues tlb shootdowns if necessary.
3415  */
3416 
3417 static int
3418 pmap_sync_pv(struct pv_pte *pvpte, pt_entry_t expect, int clearbits,
3419     pt_entry_t *optep)
3420 {
3421 	struct pmap *pmap;
3422 	struct vm_page *ptp;
3423 	vaddr_t va;
3424 	pt_entry_t *ptep;
3425 	pt_entry_t opte;
3426 	pt_entry_t npte;
3427 	bool need_shootdown;
3428 
3429 	ptp = pvpte->pte_ptp;
3430 	va = pvpte->pte_va;
3431 	KASSERT(ptp == NULL || ptp->uobject != NULL);
3432 	KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset);
3433 	pmap = ptp_to_pmap(ptp);
3434 
3435 	KASSERT((expect & ~(PG_FRAME | PG_V)) == 0);
3436 	KASSERT((expect & PG_V) != 0);
3437 	KASSERT(clearbits == ~0 || (clearbits & ~(PG_M | PG_U | PG_RW)) == 0);
3438 	KASSERT(kpreempt_disabled());
3439 
3440 	ptep = pmap_map_pte(pmap, ptp, va);
3441 	do {
3442 		opte = *ptep;
3443 		KASSERT((opte & (PG_M | PG_U)) != PG_M);
3444 		KASSERT((opte & (PG_U | PG_V)) != PG_U);
3445 		KASSERT(opte == 0 || (opte & PG_V) != 0);
3446 		if ((opte & (PG_FRAME | PG_V)) != expect) {
3447 
3448 			/*
3449 			 * we lost a race with a V->P operation like
3450 			 * pmap_remove().  wait for the competitor
3451 			 * reflecting pte bits into mp_attrs.
3452 			 *
3453 			 * issue a redundant TLB shootdown so that
3454 			 * we can wait for its completion.
3455 			 */
3456 
3457 			pmap_unmap_pte();
3458 			if (clearbits != 0) {
3459 				pmap_tlb_shootdown(pmap, va,
3460 				    (pmap == pmap_kernel() ? PG_G : 0),
3461 				    TLBSHOOT_SYNC_PV1);
3462 			}
3463 			return EAGAIN;
3464 		}
3465 
3466 		/*
3467 		 * check if there's anything to do on this pte.
3468 		 */
3469 
3470 		if ((opte & clearbits) == 0) {
3471 			need_shootdown = false;
3472 			break;
3473 		}
3474 
3475 		/*
3476 		 * we need a shootdown if the pte is cached. (PG_U)
3477 		 *
3478 		 * ...unless we are clearing only the PG_RW bit and
3479 		 * it isn't cached as RW. (PG_M)
3480 		 */
3481 
3482 		need_shootdown = (opte & PG_U) != 0 &&
3483 		    !(clearbits == PG_RW && (opte & PG_M) == 0);
3484 
3485 		npte = opte & ~clearbits;
3486 
3487 		/*
3488 		 * if we need a shootdown anyway, clear PG_U and PG_M.
3489 		 */
3490 
3491 		if (need_shootdown) {
3492 			npte &= ~(PG_U | PG_M);
3493 		}
3494 		KASSERT((npte & (PG_M | PG_U)) != PG_M);
3495 		KASSERT((npte & (PG_U | PG_V)) != PG_U);
3496 		KASSERT(npte == 0 || (opte & PG_V) != 0);
3497 	} while (pmap_pte_cas(ptep, opte, npte) != opte);
3498 
3499 	if (need_shootdown) {
3500 		pmap_tlb_shootdown(pmap, va, opte, TLBSHOOT_SYNC_PV2);
3501 	}
3502 	pmap_unmap_pte();
3503 
3504 	*optep = opte;
3505 	return 0;
3506 }
3507 
3508 /*
3509  * pmap_page_remove: remove a managed vm_page from all pmaps that map it
3510  *
3511  * => R/M bits are sync'd back to attrs
3512  */
3513 
3514 void
3515 pmap_page_remove(struct vm_page *pg)
3516 {
3517 	struct pmap_page *pp;
3518 	struct pv_pte *pvpte;
3519 	struct pv_entry *killlist = NULL;
3520 	struct vm_page *ptp;
3521 	pt_entry_t expect;
3522 	lwp_t *l;
3523 	int count;
3524 
3525 	KASSERT(uvm_page_locked_p(pg));
3526 
3527 	l = curlwp;
3528 	pp = VM_PAGE_TO_PP(pg);
3529 	expect = pmap_pa2pte(VM_PAGE_TO_PHYS(pg)) | PG_V;
3530 	count = SPINLOCK_BACKOFF_MIN;
3531 	kpreempt_disable();
3532 startover:
3533 	while ((pvpte = pv_pte_first(pp)) != NULL) {
3534 		struct pmap *pmap;
3535 		struct pv_entry *pve;
3536 		pt_entry_t opte;
3537 		vaddr_t va;
3538 		int error;
3539 
3540 		/*
3541 		 * add a reference to the pmap before clearing the pte.
3542 		 * otherwise the pmap can disappear behind us.
3543 		 */
3544 
3545 		ptp = pvpte->pte_ptp;
3546 		pmap = ptp_to_pmap(ptp);
3547 		if (ptp != NULL) {
3548 			pmap_reference(pmap);
3549 		}
3550 
3551 		error = pmap_sync_pv(pvpte, expect, ~0, &opte);
3552 		if (error == EAGAIN) {
3553 			int hold_count;
3554 			KERNEL_UNLOCK_ALL(curlwp, &hold_count);
3555 			if (ptp != NULL) {
3556 				pmap_destroy(pmap);
3557 			}
3558 			SPINLOCK_BACKOFF(count);
3559 			KERNEL_LOCK(hold_count, curlwp);
3560 			goto startover;
3561 		}
3562 
3563 		pp->pp_attrs |= opte;
3564 		va = pvpte->pte_va;
3565 		pve = pmap_remove_pv(pp, ptp, va);
3566 
3567 		/* update the PTP reference count.  free if last reference. */
3568 		if (ptp != NULL) {
3569 			struct pmap *pmap2;
3570 			pt_entry_t *ptes;
3571 			pd_entry_t * const *pdes;
3572 
3573 			KASSERT(pmap != pmap_kernel());
3574 
3575 			pmap_tlb_shootnow();
3576 			pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
3577 			pmap_stats_update_bypte(pmap, 0, opte);
3578 			ptp->wire_count--;
3579 			if (ptp->wire_count <= 1) {
3580 				pmap_free_ptp(pmap, ptp, va, ptes, pdes);
3581 			}
3582 			pmap_unmap_ptes(pmap, pmap2);
3583 			pmap_destroy(pmap);
3584 		} else {
3585 			KASSERT(pmap == pmap_kernel());
3586 			pmap_stats_update_bypte(pmap, 0, opte);
3587 		}
3588 
3589 		if (pve != NULL) {
3590 			pve->pve_next = killlist;	/* mark it for death */
3591 			killlist = pve;
3592 		}
3593 	}
3594 	pmap_tlb_shootnow();
3595 	kpreempt_enable();
3596 
3597 	/* Now free unused pvs. */
3598 	pmap_free_pvs(killlist);
3599 }
3600 
3601 /*
3602  * p m a p   a t t r i b u t e  f u n c t i o n s
3603  * functions that test/change managed page's attributes
3604  * since a page can be mapped multiple times we must check each PTE that
3605  * maps it by going down the pv lists.
3606  */
3607 
3608 /*
3609  * pmap_test_attrs: test a page's attributes
3610  */
3611 
3612 bool
3613 pmap_test_attrs(struct vm_page *pg, unsigned testbits)
3614 {
3615 	struct pmap_page *pp;
3616 	struct pv_pte *pvpte;
3617 	pt_entry_t expect;
3618 	u_int result;
3619 
3620 	KASSERT(uvm_page_locked_p(pg));
3621 
3622 	pp = VM_PAGE_TO_PP(pg);
3623 	if ((pp->pp_attrs & testbits) != 0) {
3624 		return true;
3625 	}
3626 	expect = pmap_pa2pte(VM_PAGE_TO_PHYS(pg)) | PG_V;
3627 	kpreempt_disable();
3628 	for (pvpte = pv_pte_first(pp); pvpte; pvpte = pv_pte_next(pp, pvpte)) {
3629 		pt_entry_t opte;
3630 		int error;
3631 
3632 		if ((pp->pp_attrs & testbits) != 0) {
3633 			break;
3634 		}
3635 		error = pmap_sync_pv(pvpte, expect, 0, &opte);
3636 		if (error == 0) {
3637 			pp->pp_attrs |= opte;
3638 		}
3639 	}
3640 	result = pp->pp_attrs & testbits;
3641 	kpreempt_enable();
3642 
3643 	/*
3644 	 * note that we will exit the for loop with a non-null pve if
3645 	 * we have found the bits we are testing for.
3646 	 */
3647 
3648 	return result != 0;
3649 }
3650 
3651 /*
3652  * pmap_clear_attrs: clear the specified attribute for a page.
3653  *
3654  * => we return true if we cleared one of the bits we were asked to
3655  */
3656 
3657 bool
3658 pmap_clear_attrs(struct vm_page *pg, unsigned clearbits)
3659 {
3660 	struct pmap_page *pp;
3661 	struct pv_pte *pvpte;
3662 	u_int result;
3663 	pt_entry_t expect;
3664 	int count;
3665 
3666 	KASSERT(uvm_page_locked_p(pg));
3667 
3668 	pp = VM_PAGE_TO_PP(pg);
3669 	expect = pmap_pa2pte(VM_PAGE_TO_PHYS(pg)) | PG_V;
3670 	count = SPINLOCK_BACKOFF_MIN;
3671 	kpreempt_disable();
3672 startover:
3673 	for (pvpte = pv_pte_first(pp); pvpte; pvpte = pv_pte_next(pp, pvpte)) {
3674 		pt_entry_t opte;
3675 		int error;
3676 
3677 		error = pmap_sync_pv(pvpte, expect, clearbits, &opte);
3678 		if (error == EAGAIN) {
3679 			int hold_count;
3680 			KERNEL_UNLOCK_ALL(curlwp, &hold_count);
3681 			SPINLOCK_BACKOFF(count);
3682 			KERNEL_LOCK(hold_count, curlwp);
3683 			goto startover;
3684 		}
3685 		pp->pp_attrs |= opte;
3686 	}
3687 	result = pp->pp_attrs & clearbits;
3688 	pp->pp_attrs &= ~clearbits;
3689 	kpreempt_enable();
3690 
3691 	return result != 0;
3692 }
3693 
3694 
3695 /*
3696  * p m a p   p r o t e c t i o n   f u n c t i o n s
3697  */
3698 
3699 /*
3700  * pmap_page_protect: change the protection of all recorded mappings
3701  *	of a managed page
3702  *
3703  * => NOTE: this is an inline function in pmap.h
3704  */
3705 
3706 /* see pmap.h */
3707 
3708 /*
3709  * pmap_protect: set the protection in of the pages in a pmap
3710  *
3711  * => NOTE: this is an inline function in pmap.h
3712  */
3713 
3714 /* see pmap.h */
3715 
3716 /*
3717  * pmap_write_protect: write-protect pages in a pmap.
3718  */
3719 void
3720 pmap_write_protect(struct pmap *pmap, vaddr_t sva, vaddr_t eva, vm_prot_t prot)
3721 {
3722 	pt_entry_t *ptes;
3723 	pt_entry_t * const *pdes;
3724 	struct pmap *pmap2;
3725 	vaddr_t blockend, va;
3726 
3727 	KASSERT(curlwp->l_md.md_gc_pmap != pmap);
3728 
3729 	sva &= PG_FRAME;
3730 	eva &= PG_FRAME;
3731 
3732 	/* Acquire pmap. */
3733 	kpreempt_disable();
3734 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
3735 
3736 	for (va = sva ; va < eva ; va = blockend) {
3737 		pt_entry_t *spte, *epte;
3738 		int i;
3739 
3740 		blockend = (va & L2_FRAME) + NBPD_L2;
3741 		if (blockend > eva)
3742 			blockend = eva;
3743 
3744 		/*
3745 		 * XXXCDC: our PTE mappings should never be write-protected!
3746 		 *
3747 		 * long term solution is to move the PTEs out of user
3748 		 * address space.  and into kernel address space (up
3749 		 * with APTE).  then we can set VM_MAXUSER_ADDRESS to
3750 		 * be VM_MAX_ADDRESS.
3751 		 */
3752 
3753 		/* XXXCDC: ugly hack to avoid freeing PDP here */
3754 		for (i = 0; i < PDP_SIZE; i++) {
3755 			if (pl_i(va, PTP_LEVELS) == PDIR_SLOT_PTE+i)
3756 				continue;
3757 		}
3758 
3759 		/* Is it a valid block? */
3760 		if (!pmap_pdes_valid(va, pdes, NULL)) {
3761 			continue;
3762 		}
3763 		KASSERT(va < VM_MAXUSER_ADDRESS || va >= VM_MAX_ADDRESS);
3764 
3765 		spte = &ptes[pl1_i(va)];
3766 		epte = &ptes[pl1_i(blockend)];
3767 
3768 		for (/*null */; spte < epte ; spte++) {
3769 			pt_entry_t opte, npte;
3770 
3771 			do {
3772 				opte = *spte;
3773 				if ((~opte & (PG_RW | PG_V)) != 0) {
3774 					goto next;
3775 				}
3776 				npte = opte & ~PG_RW;
3777 			} while (pmap_pte_cas(spte, opte, npte) != opte);
3778 
3779 			if ((opte & PG_M) != 0) {
3780 				vaddr_t tva = x86_ptob(spte - ptes);
3781 				pmap_tlb_shootdown(pmap, tva, opte,
3782 				    TLBSHOOT_WRITE_PROTECT);
3783 			}
3784 next:;
3785 		}
3786 	}
3787 
3788 	/* Release pmap. */
3789 	pmap_unmap_ptes(pmap, pmap2);
3790 	kpreempt_enable();
3791 }
3792 
3793 /*
3794  * pmap_unwire: clear the wired bit in the PTE.
3795  *
3796  * => Mapping should already be present.
3797  */
3798 void
3799 pmap_unwire(struct pmap *pmap, vaddr_t va)
3800 {
3801 	pt_entry_t *ptes, *ptep, opte;
3802 	pd_entry_t * const *pdes;
3803 	struct pmap *pmap2;
3804 
3805 	/* Acquire pmap. */
3806 	kpreempt_disable();
3807 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
3808 
3809 	if (!pmap_pdes_valid(va, pdes, NULL)) {
3810 		panic("pmap_unwire: invalid PDE");
3811 	}
3812 
3813 	ptep = &ptes[pl1_i(va)];
3814 	opte = *ptep;
3815 	KASSERT(pmap_valid_entry(opte));
3816 
3817 	if (opte & PG_W) {
3818 		pt_entry_t npte = opte & ~PG_W;
3819 
3820 		opte = pmap_pte_testset(ptep, npte);
3821 		pmap_stats_update_bypte(pmap, npte, opte);
3822 	} else {
3823 		printf("pmap_unwire: wiring for pmap %p va 0x%lx "
3824 		    "did not change!\n", pmap, va);
3825 	}
3826 
3827 	/* Release pmap. */
3828 	pmap_unmap_ptes(pmap, pmap2);
3829 	kpreempt_enable();
3830 }
3831 
3832 /*
3833  * pmap_copy: copy mappings from one pmap to another
3834  *
3835  * => optional function
3836  * void pmap_copy(dst_pmap, src_pmap, dst_addr, len, src_addr)
3837  */
3838 
3839 /*
3840  * defined as macro in pmap.h
3841  */
3842 
3843 __strict_weak_alias(pmap_enter, pmap_enter_default);
3844 
3845 int
3846 pmap_enter_default(pmap_t pmap, vaddr_t va, paddr_t pa, vm_prot_t prot,
3847     u_int flags)
3848 {
3849 	return pmap_enter_ma(pmap, va, pa, pa, prot, flags, 0);
3850 }
3851 
3852 /*
3853  * pmap_enter: enter a mapping into a pmap
3854  *
3855  * => must be done "now" ... no lazy-evaluation
3856  * => we set pmap => pv_head locking
3857  */
3858 int
3859 pmap_enter_ma(struct pmap *pmap, vaddr_t va, paddr_t ma, paddr_t pa,
3860 	   vm_prot_t prot, u_int flags, int domid)
3861 {
3862 	pt_entry_t *ptes, opte, npte;
3863 	pt_entry_t *ptep;
3864 	pd_entry_t * const *pdes;
3865 	struct vm_page *ptp, *pg;
3866 	struct pmap_page *new_pp;
3867 	struct pmap_page *old_pp;
3868 	struct pv_entry *old_pve = NULL;
3869 	struct pv_entry *new_pve;
3870 	struct pv_entry *new_pve2;
3871 	int error;
3872 	bool wired = (flags & PMAP_WIRED) != 0;
3873 	struct pmap *pmap2;
3874 
3875 	KASSERT(pmap_initialized);
3876 	KASSERT(curlwp->l_md.md_gc_pmap != pmap);
3877 	KASSERT(va < VM_MAX_KERNEL_ADDRESS);
3878 	KASSERTMSG(va != (vaddr_t)PDP_BASE && va != (vaddr_t)APDP_BASE,
3879 	    "pmap_enter: trying to map over PDP/APDP!");
3880 	KASSERTMSG(va < VM_MIN_KERNEL_ADDRESS ||
3881 	    pmap_valid_entry(pmap->pm_pdir[pl_i(va, PTP_LEVELS)]),
3882 	    "pmap_enter: missing kernel PTP for VA %lx!", va);
3883 
3884 #ifdef XEN
3885 	KASSERT(domid == DOMID_SELF || pa == 0);
3886 #endif /* XEN */
3887 
3888 	npte = ma | protection_codes[prot] | PG_V;
3889 	npte |= pmap_pat_flags(flags);
3890 	if (wired)
3891 	        npte |= PG_W;
3892 	if (va < VM_MAXUSER_ADDRESS)
3893 		npte |= PG_u;
3894 	else if (va < VM_MAX_ADDRESS)
3895 		npte |= (PG_u | PG_RW);	/* XXXCDC: no longer needed? */
3896 	else
3897 		npte |= PG_k;
3898 	if (pmap == pmap_kernel())
3899 		npte |= pmap_pg_g;
3900 	if (flags & VM_PROT_ALL) {
3901 		npte |= PG_U;
3902 		if (flags & VM_PROT_WRITE) {
3903 			KASSERT((npte & PG_RW) != 0);
3904 			npte |= PG_M;
3905 		}
3906 	}
3907 
3908 #ifdef XEN
3909 	if (domid != DOMID_SELF)
3910 		pg = NULL;
3911 	else
3912 #endif
3913 		pg = PHYS_TO_VM_PAGE(pa);
3914 	if (pg != NULL) {
3915 		/* This is a managed page */
3916 		npte |= PG_PVLIST;
3917 		new_pp = VM_PAGE_TO_PP(pg);
3918 	} else {
3919 		new_pp = NULL;
3920 	}
3921 
3922 	/* get pves. */
3923 	new_pve = pool_cache_get(&pmap_pv_cache, PR_NOWAIT);
3924 	new_pve2 = pool_cache_get(&pmap_pv_cache, PR_NOWAIT);
3925 	if (new_pve == NULL || new_pve2 == NULL) {
3926 		if (flags & PMAP_CANFAIL) {
3927 			error = ENOMEM;
3928 			goto out2;
3929 		}
3930 		panic("pmap_enter: pve allocation failed");
3931 	}
3932 
3933 	kpreempt_disable();
3934 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);	/* locks pmap */
3935 	if (pmap == pmap_kernel()) {
3936 		ptp = NULL;
3937 	} else {
3938 		ptp = pmap_get_ptp(pmap, va, pdes);
3939 		if (ptp == NULL) {
3940 			pmap_unmap_ptes(pmap, pmap2);
3941 			if (flags & PMAP_CANFAIL) {
3942 				error = ENOMEM;
3943 				goto out;
3944 			}
3945 			panic("pmap_enter: get ptp failed");
3946 		}
3947 	}
3948 
3949 	/*
3950 	 * update the pte.
3951 	 */
3952 
3953 	ptep = &ptes[pl1_i(va)];
3954 	do {
3955 		opte = *ptep;
3956 
3957 		/*
3958 		 * if the same page, inherit PG_U and PG_M.
3959 		 */
3960 		if (((opte ^ npte) & (PG_FRAME | PG_V)) == 0) {
3961 			npte |= opte & (PG_U | PG_M);
3962 		}
3963 #if defined(XEN)
3964 		if (domid != DOMID_SELF) {
3965 			/* pmap_pte_cas with error handling */
3966 			int s = splvm();
3967 			if (opte != *ptep) {
3968 				splx(s);
3969 				continue;
3970 			}
3971 			error = xpq_update_foreign(
3972 			    vtomach((vaddr_t)ptep), npte, domid);
3973 			splx(s);
3974 			if (error) {
3975 				if (ptp != NULL && ptp->wire_count <= 1) {
3976 					pmap_free_ptp(pmap, ptp, va, ptes, pdes);
3977 				}
3978 				pmap_unmap_ptes(pmap, pmap2);
3979 				goto out;
3980 			}
3981 			break;
3982 		}
3983 #endif /* defined(XEN) */
3984 	} while (pmap_pte_cas(ptep, opte, npte) != opte);
3985 
3986 	/*
3987 	 * update statistics and PTP's reference count.
3988 	 */
3989 
3990 	pmap_stats_update_bypte(pmap, npte, opte);
3991 	if (ptp != NULL && !pmap_valid_entry(opte)) {
3992 		ptp->wire_count++;
3993 	}
3994 	KASSERT(ptp == NULL || ptp->wire_count > 1);
3995 
3996 	/*
3997 	 * if the same page, we can skip pv_entry handling.
3998 	 */
3999 
4000 	if (((opte ^ npte) & (PG_FRAME | PG_V)) == 0) {
4001 		KASSERT(((opte ^ npte) & PG_PVLIST) == 0);
4002 		goto same_pa;
4003 	}
4004 
4005 	/*
4006 	 * if old page is managed, remove pv_entry from its list.
4007 	 */
4008 
4009 	if ((~opte & (PG_V | PG_PVLIST)) == 0) {
4010 		pg = PHYS_TO_VM_PAGE(pmap_pte2pa(opte));
4011 
4012 		KASSERTMSG(pg != NULL, "pmap_enter: PG_PVLIST mapping with "
4013 		    "unmanaged page pa = 0x%" PRIx64 " (0x%" PRIx64 ")",
4014 		    (int64_t)pa, (int64_t)atop(pa));
4015 
4016 		KASSERT(uvm_page_locked_p(pg));
4017 
4018 		old_pp = VM_PAGE_TO_PP(pg);
4019 		old_pve = pmap_remove_pv(old_pp, ptp, va);
4020 		old_pp->pp_attrs |= opte;
4021 	}
4022 
4023 	/*
4024 	 * if new page is managed, insert pv_entry into its list.
4025 	 */
4026 
4027 	if (new_pp) {
4028 		new_pve = pmap_enter_pv(new_pp, new_pve, &new_pve2, ptp, va);
4029 	}
4030 
4031 same_pa:
4032 	pmap_unmap_ptes(pmap, pmap2);
4033 
4034 	/*
4035 	 * shootdown tlb if necessary.
4036 	 */
4037 
4038 	if ((~opte & (PG_V | PG_U)) == 0 &&
4039 	    ((opte ^ npte) & (PG_FRAME | PG_RW)) != 0) {
4040 		pmap_tlb_shootdown(pmap, va, opte, TLBSHOOT_ENTER);
4041 	}
4042 
4043 	error = 0;
4044 out:
4045 	kpreempt_enable();
4046 out2:
4047 	if (old_pve != NULL) {
4048 		pool_cache_put(&pmap_pv_cache, old_pve);
4049 	}
4050 	if (new_pve != NULL) {
4051 		pool_cache_put(&pmap_pv_cache, new_pve);
4052 	}
4053 	if (new_pve2 != NULL) {
4054 		pool_cache_put(&pmap_pv_cache, new_pve2);
4055 	}
4056 
4057 	return error;
4058 }
4059 
4060 static bool
4061 pmap_get_physpage(vaddr_t va, int level, paddr_t *paddrp)
4062 {
4063 	struct vm_page *ptp;
4064 	struct pmap *kpm = pmap_kernel();
4065 
4066 	if (uvm.page_init_done == false) {
4067 		/*
4068 		 * we're growing the kernel pmap early (from
4069 		 * uvm_pageboot_alloc()).  this case must be
4070 		 * handled a little differently.
4071 		 */
4072 
4073 		if (uvm_page_physget(paddrp) == false)
4074 			panic("pmap_get_physpage: out of memory");
4075 		kpreempt_disable();
4076 		pmap_pte_set(early_zero_pte,
4077 		    pmap_pa2pte(*paddrp) | PG_V | PG_RW | PG_k);
4078 		pmap_pte_flush();
4079 		pmap_update_pg((vaddr_t)early_zerop);
4080 		memset(early_zerop, 0, PAGE_SIZE);
4081 #if defined(DIAGNOSTIC) || defined (XEN)
4082 		pmap_pte_set(early_zero_pte, 0);
4083 		pmap_pte_flush();
4084 #endif /* defined(DIAGNOSTIC) */
4085 		kpreempt_enable();
4086 	} else {
4087 		/* XXX */
4088 		ptp = uvm_pagealloc(NULL, 0, NULL,
4089 				    UVM_PGA_USERESERVE|UVM_PGA_ZERO);
4090 		if (ptp == NULL)
4091 			panic("pmap_get_physpage: out of memory");
4092 		ptp->flags &= ~PG_BUSY;
4093 		ptp->wire_count = 1;
4094 		*paddrp = VM_PAGE_TO_PHYS(ptp);
4095 	}
4096 	pmap_stats_update(kpm, 1, 0);
4097 	return true;
4098 }
4099 
4100 /*
4101  * Allocate the amount of specified ptps for a ptp level, and populate
4102  * all levels below accordingly, mapping virtual addresses starting at
4103  * kva.
4104  *
4105  * Used by pmap_growkernel.
4106  */
4107 static void
4108 pmap_alloc_level(pd_entry_t * const *pdes, vaddr_t kva, int lvl,
4109     long *needed_ptps)
4110 {
4111 	unsigned long i;
4112 	vaddr_t va;
4113 	paddr_t pa;
4114 	unsigned long index, endindex;
4115 	int level;
4116 	pd_entry_t *pdep;
4117 #ifdef XEN
4118 	int s = splvm(); /* protect xpq_* */
4119 #endif
4120 
4121 	for (level = lvl; level > 1; level--) {
4122 		if (level == PTP_LEVELS)
4123 			pdep = pmap_kernel()->pm_pdir;
4124 		else
4125 			pdep = pdes[level - 2];
4126 		va = kva;
4127 		index = pl_i_roundup(kva, level);
4128 		endindex = index + needed_ptps[level - 1] - 1;
4129 
4130 
4131 		for (i = index; i <= endindex; i++) {
4132 			pt_entry_t pte;
4133 
4134 			KASSERT(!pmap_valid_entry(pdep[i]));
4135 			pmap_get_physpage(va, level - 1, &pa);
4136 			pte = pmap_pa2pte(pa) | PG_k | PG_V | PG_RW;
4137 #ifdef XEN
4138 			switch (level) {
4139 			case PTP_LEVELS:
4140 #if defined(PAE) || defined(__x86_64__)
4141 				if (i >= PDIR_SLOT_KERN) {
4142 					/* update per-cpu PMDs on all cpus */
4143 					CPU_INFO_ITERATOR cii;
4144 					struct cpu_info *ci;
4145 					for (CPU_INFO_FOREACH(cii, ci)) {
4146 						if (ci == NULL) {
4147 							continue;
4148 						}
4149 #ifdef PAE
4150 						xpq_queue_pte_update(
4151 							xpmap_ptetomach(&ci->ci_kpm_pdir[l2tol2(i)]), pte);
4152 #elif defined(__x86_64__)
4153 						xpq_queue_pte_update(
4154 							xpmap_ptetomach(&ci->ci_kpm_pdir[i]), pte);
4155 #endif /* PAE */
4156 					}
4157 				}
4158 #endif /* PAE || __x86_64__ */
4159 				/* FALLTHROUGH */
4160 
4161 			default: /* All other levels */
4162 				xpq_queue_pte_update(
4163 					xpmap_ptetomach(&pdep[i]),
4164 					pte);
4165 			}
4166 #else /* XEN */
4167 			pdep[i] = pte;
4168 #endif /* XEN */
4169 			KASSERT(level != PTP_LEVELS || nkptp[level - 1] +
4170 			    pl_i(VM_MIN_KERNEL_ADDRESS, level) == i);
4171 			nkptp[level - 1]++;
4172 			va += nbpd[level - 1];
4173 		}
4174 		pmap_pte_flush();
4175 	}
4176 #ifdef XEN
4177 	splx(s);
4178 #endif
4179 }
4180 
4181 /*
4182  * pmap_growkernel: increase usage of KVM space
4183  *
4184  * => we allocate new PTPs for the kernel and install them in all
4185  *	the pmaps on the system.
4186  */
4187 
4188 vaddr_t
4189 pmap_growkernel(vaddr_t maxkvaddr)
4190 {
4191 	struct pmap *kpm = pmap_kernel();
4192 #if !defined(XEN) || !defined(__x86_64__)
4193 	struct pmap *pm;
4194 #endif
4195 	int s, i;
4196 	long needed_kptp[PTP_LEVELS], target_nptp, old;
4197 	bool invalidate = false;
4198 
4199 	s = splvm();	/* to be safe */
4200 	mutex_enter(kpm->pm_lock);
4201 
4202 	if (maxkvaddr <= pmap_maxkvaddr) {
4203 		mutex_exit(kpm->pm_lock);
4204 		splx(s);
4205 		return pmap_maxkvaddr;
4206 	}
4207 
4208 	maxkvaddr = x86_round_pdr(maxkvaddr);
4209 	old = nkptp[PTP_LEVELS - 1];
4210 	/*
4211 	 * This loop could be optimized more, but pmap_growkernel()
4212 	 * is called infrequently.
4213 	 */
4214 	for (i = PTP_LEVELS - 1; i >= 1; i--) {
4215 		target_nptp = pl_i_roundup(maxkvaddr, i + 1) -
4216 		    pl_i_roundup(VM_MIN_KERNEL_ADDRESS, i + 1);
4217 		/*
4218 		 * XXX only need to check toplevel.
4219 		 */
4220 		if (target_nptp > nkptpmax[i])
4221 			panic("out of KVA space");
4222 		KASSERT(target_nptp >= nkptp[i]);
4223 		needed_kptp[i] = target_nptp - nkptp[i];
4224 	}
4225 
4226 	pmap_alloc_level(normal_pdes, pmap_maxkvaddr, PTP_LEVELS, needed_kptp);
4227 
4228 	/*
4229 	 * If the number of top level entries changed, update all
4230 	 * pmaps.
4231 	 */
4232 	if (needed_kptp[PTP_LEVELS - 1] != 0) {
4233 #ifdef XEN
4234 #ifdef __x86_64__
4235 		/* nothing, kernel entries are never entered in user pmap */
4236 #else /* __x86_64__ */
4237 		mutex_enter(&pmaps_lock);
4238 		LIST_FOREACH(pm, &pmaps, pm_list) {
4239 			int pdkidx;
4240 			for (pdkidx =  PDIR_SLOT_KERN + old;
4241 			    pdkidx < PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1];
4242 			    pdkidx++) {
4243 				xpq_queue_pte_update(
4244 				    xpmap_ptom(pmap_pdirpa(pm, pdkidx)),
4245 				    kpm->pm_pdir[pdkidx]);
4246 			}
4247 			xpq_flush_queue();
4248 		}
4249 		mutex_exit(&pmaps_lock);
4250 #endif /* __x86_64__ */
4251 #else /* XEN */
4252 		unsigned newpdes;
4253 		newpdes = nkptp[PTP_LEVELS - 1] - old;
4254 		mutex_enter(&pmaps_lock);
4255 		LIST_FOREACH(pm, &pmaps, pm_list) {
4256 			memcpy(&pm->pm_pdir[PDIR_SLOT_KERN + old],
4257 			       &kpm->pm_pdir[PDIR_SLOT_KERN + old],
4258 			       newpdes * sizeof (pd_entry_t));
4259 		}
4260 		mutex_exit(&pmaps_lock);
4261 #endif
4262 		invalidate = true;
4263 	}
4264 	pmap_maxkvaddr = maxkvaddr;
4265 	mutex_exit(kpm->pm_lock);
4266 	splx(s);
4267 
4268 	if (invalidate) {
4269 		/* Invalidate the PDP cache. */
4270 		pool_cache_invalidate(&pmap_pdp_cache);
4271 	}
4272 
4273 	return maxkvaddr;
4274 }
4275 
4276 #ifdef DEBUG
4277 void pmap_dump(struct pmap *, vaddr_t, vaddr_t);
4278 
4279 /*
4280  * pmap_dump: dump all the mappings from a pmap
4281  *
4282  * => caller should not be holding any pmap locks
4283  */
4284 
4285 void
4286 pmap_dump(struct pmap *pmap, vaddr_t sva, vaddr_t eva)
4287 {
4288 	pt_entry_t *ptes, *pte;
4289 	pd_entry_t * const *pdes;
4290 	struct pmap *pmap2;
4291 	vaddr_t blkendva;
4292 
4293 	/*
4294 	 * if end is out of range truncate.
4295 	 * if (end == start) update to max.
4296 	 */
4297 
4298 	if (eva > VM_MAXUSER_ADDRESS || eva <= sva)
4299 		eva = VM_MAXUSER_ADDRESS;
4300 
4301 	/*
4302 	 * we lock in the pmap => pv_head direction
4303 	 */
4304 
4305 	kpreempt_disable();
4306 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);	/* locks pmap */
4307 
4308 	/*
4309 	 * dumping a range of pages: we dump in PTP sized blocks (4MB)
4310 	 */
4311 
4312 	for (/* null */ ; sva < eva ; sva = blkendva) {
4313 
4314 		/* determine range of block */
4315 		blkendva = x86_round_pdr(sva+1);
4316 		if (blkendva > eva)
4317 			blkendva = eva;
4318 
4319 		/* valid block? */
4320 		if (!pmap_pdes_valid(sva, pdes, NULL))
4321 			continue;
4322 
4323 		pte = &ptes[pl1_i(sva)];
4324 		for (/* null */; sva < blkendva ; sva += PAGE_SIZE, pte++) {
4325 			if (!pmap_valid_entry(*pte))
4326 				continue;
4327 			printf("va %#" PRIxVADDR " -> pa %#" PRIxPADDR
4328 			    " (pte=%#" PRIxPADDR ")\n",
4329 			    sva, (paddr_t)pmap_pte2pa(*pte), (paddr_t)*pte);
4330 		}
4331 	}
4332 	pmap_unmap_ptes(pmap, pmap2);
4333 	kpreempt_enable();
4334 }
4335 #endif
4336 
4337 /*
4338  * pmap_update: process deferred invalidations and frees.
4339  */
4340 
4341 void
4342 pmap_update(struct pmap *pmap)
4343 {
4344 	struct vm_page *empty_ptps;
4345 	lwp_t *l = curlwp;
4346 
4347 	/*
4348 	 * If we have torn down this pmap, invalidate non-global TLB
4349 	 * entries on any processors using it.
4350 	 */
4351 	KPREEMPT_DISABLE(l);
4352 	if (__predict_false(l->l_md.md_gc_pmap == pmap)) {
4353 		l->l_md.md_gc_pmap = NULL;
4354 		pmap_tlb_shootdown(pmap, (vaddr_t)-1LL, 0, TLBSHOOT_UPDATE);
4355 	}
4356 	/*
4357 	 * Initiate any pending TLB shootdowns.  Wait for them to
4358 	 * complete before returning control to the caller.
4359 	 */
4360 	pmap_tlb_shootnow();
4361 	KPREEMPT_ENABLE(l);
4362 
4363 	/*
4364 	 * Now that shootdowns are complete, process deferred frees,
4365 	 * but not from interrupt context.
4366 	 */
4367 	if (l->l_md.md_gc_ptp != NULL) {
4368 		KASSERT((l->l_pflag & LP_INTR) == 0);
4369 		if (cpu_intr_p()) {
4370 			return;
4371 		}
4372 		empty_ptps = l->l_md.md_gc_ptp;
4373 		l->l_md.md_gc_ptp = NULL;
4374 		pmap_free_ptps(empty_ptps);
4375 	}
4376 }
4377 
4378 #if PTP_LEVELS > 4
4379 #error "Unsupported number of page table mappings"
4380 #endif
4381 
4382 paddr_t
4383 pmap_init_tmp_pgtbl(paddr_t pg)
4384 {
4385 	static bool maps_loaded;
4386 	static const paddr_t x86_tmp_pml_paddr[] = {
4387 	    4 * PAGE_SIZE,
4388 	    5 * PAGE_SIZE,
4389 	    6 * PAGE_SIZE,
4390 	    7 * PAGE_SIZE
4391 	};
4392 	static vaddr_t x86_tmp_pml_vaddr[] = { 0, 0, 0, 0 };
4393 
4394 	pd_entry_t *tmp_pml, *kernel_pml;
4395 
4396 	int level;
4397 
4398 	if (!maps_loaded) {
4399 		for (level = 0; level < PTP_LEVELS; ++level) {
4400 			x86_tmp_pml_vaddr[level] =
4401 			    uvm_km_alloc(kernel_map, PAGE_SIZE, 0,
4402 			    UVM_KMF_VAONLY);
4403 
4404 			if (x86_tmp_pml_vaddr[level] == 0)
4405 				panic("mapping of real mode PML failed\n");
4406 			pmap_kenter_pa(x86_tmp_pml_vaddr[level],
4407 			    x86_tmp_pml_paddr[level],
4408 			    VM_PROT_READ | VM_PROT_WRITE, 0);
4409 			pmap_update(pmap_kernel());
4410 		}
4411 		maps_loaded = true;
4412 	}
4413 
4414 	/* Zero levels 1-3 */
4415 	for (level = 0; level < PTP_LEVELS - 1; ++level) {
4416 		tmp_pml = (void *)x86_tmp_pml_vaddr[level];
4417 		memset(tmp_pml, 0, PAGE_SIZE);
4418 	}
4419 
4420 	/* Copy PML4 */
4421 	kernel_pml = pmap_kernel()->pm_pdir;
4422 	tmp_pml = (void *)x86_tmp_pml_vaddr[PTP_LEVELS - 1];
4423 	memcpy(tmp_pml, kernel_pml, PAGE_SIZE);
4424 
4425 #ifdef PAE
4426 	/*
4427 	 * Use the last 4 entries of the L2 page as L3 PD entries. These
4428 	 * last entries are unlikely to be used for temporary mappings.
4429 	 * 508: maps 0->1GB (userland)
4430 	 * 509: unused
4431 	 * 510: unused
4432 	 * 511: maps 3->4GB (kernel)
4433 	 */
4434 	tmp_pml[508] = x86_tmp_pml_paddr[PTP_LEVELS - 1] | PG_V;
4435 	tmp_pml[509] = 0;
4436 	tmp_pml[510] = 0;
4437 	tmp_pml[511] = pmap_pdirpa(pmap_kernel(),PDIR_SLOT_KERN) | PG_V;
4438 #endif
4439 
4440 	for (level = PTP_LEVELS - 1; level > 0; --level) {
4441 		tmp_pml = (void *)x86_tmp_pml_vaddr[level];
4442 
4443 		tmp_pml[pl_i(pg, level + 1)] =
4444 		    (x86_tmp_pml_paddr[level - 1] & PG_FRAME) | PG_RW | PG_V;
4445 	}
4446 
4447 	tmp_pml = (void *)x86_tmp_pml_vaddr[0];
4448 	tmp_pml[pl_i(pg, 1)] = (pg & PG_FRAME) | PG_RW | PG_V;
4449 
4450 #ifdef PAE
4451 	/* Return the PA of the L3 page (entry 508 of the L2 page) */
4452 	return x86_tmp_pml_paddr[PTP_LEVELS - 1] + 508 * sizeof(pd_entry_t);
4453 #endif
4454 
4455 	return x86_tmp_pml_paddr[PTP_LEVELS - 1];
4456 }
4457 
4458 u_int
4459 x86_mmap_flags(paddr_t mdpgno)
4460 {
4461 	u_int nflag = (mdpgno >> X86_MMAP_FLAG_SHIFT) & X86_MMAP_FLAG_MASK;
4462 	u_int pflag = 0;
4463 
4464 	if (nflag & X86_MMAP_FLAG_PREFETCH)
4465 		pflag |= PMAP_WRITE_COMBINE;
4466 
4467 	return pflag;
4468 }
4469 
4470 /*
4471  * Invalidates pool_cache(9) used by pmap(9).
4472  */
4473 void
4474 pmap_invalidate_pool_caches(void)
4475 {
4476 #ifdef XEN
4477 	/*
4478 	 * We must invalidate all shadow pages found inside the pmap_pdp_cache.
4479 	 * They are technically considered by Xen as L2 pages, although they
4480 	 * are not currently found inside pmaps list.
4481 	 */
4482 	pool_cache_invalidate(&pmap_pdp_cache);
4483 #endif
4484 }
4485