xref: /netbsd-src/sys/arch/x86/x86/pmap.c (revision 404fbe5fb94ca1e054339640cabb2801ce52dd30)
1 /*	$NetBSD: pmap.c,v 1.77 2008/12/18 12:18:20 cegger Exp $	*/
2 
3 /*
4  * Copyright (c) 2007 Manuel Bouyer.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  * 1. Redistributions of source code must retain the above copyright
10  *    notice, this list of conditions and the following disclaimer.
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in the
13  *    documentation and/or other materials provided with the distribution.
14  * 3. All advertising materials mentioning features or use of this software
15  *    must display the following acknowledgement:
16  *      This product includes software developed by Manuel Bouyer.
17  * 4. The name of the author may not be used to endorse or promote products
18  *    derived from this software without specific prior written permission.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
21  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
22  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
23  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
24  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
25  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
29  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30  *
31  */
32 
33 /*
34  * Copyright (c) 2006 Mathieu Ropert <mro@adviseo.fr>
35  *
36  * Permission to use, copy, modify, and distribute this software for any
37  * purpose with or without fee is hereby granted, provided that the above
38  * copyright notice and this permission notice appear in all copies.
39  *
40  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
41  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
42  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
43  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
44  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
45  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
46  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
47  */
48 
49 /*
50  *
51  * Copyright (c) 1997 Charles D. Cranor and Washington University.
52  * All rights reserved.
53  *
54  * Redistribution and use in source and binary forms, with or without
55  * modification, are permitted provided that the following conditions
56  * are met:
57  * 1. Redistributions of source code must retain the above copyright
58  *    notice, this list of conditions and the following disclaimer.
59  * 2. Redistributions in binary form must reproduce the above copyright
60  *    notice, this list of conditions and the following disclaimer in the
61  *    documentation and/or other materials provided with the distribution.
62  * 3. All advertising materials mentioning features or use of this software
63  *    must display the following acknowledgement:
64  *      This product includes software developed by Charles D. Cranor and
65  *      Washington University.
66  * 4. The name of the author may not be used to endorse or promote products
67  *    derived from this software without specific prior written permission.
68  *
69  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
70  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
71  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
72  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
73  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
74  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
75  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
76  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
77  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
78  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
79  */
80 
81 /*
82  * Copyright 2001 (c) Wasabi Systems, Inc.
83  * All rights reserved.
84  *
85  * Written by Frank van der Linden for Wasabi Systems, Inc.
86  *
87  * Redistribution and use in source and binary forms, with or without
88  * modification, are permitted provided that the following conditions
89  * are met:
90  * 1. Redistributions of source code must retain the above copyright
91  *    notice, this list of conditions and the following disclaimer.
92  * 2. Redistributions in binary form must reproduce the above copyright
93  *    notice, this list of conditions and the following disclaimer in the
94  *    documentation and/or other materials provided with the distribution.
95  * 3. All advertising materials mentioning features or use of this software
96  *    must display the following acknowledgement:
97  *      This product includes software developed for the NetBSD Project by
98  *      Wasabi Systems, Inc.
99  * 4. The name of Wasabi Systems, Inc. may not be used to endorse
100  *    or promote products derived from this software without specific prior
101  *    written permission.
102  *
103  * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
104  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
105  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
106  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL WASABI SYSTEMS, INC
107  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
108  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
109  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
110  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
111  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
112  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
113  * POSSIBILITY OF SUCH DAMAGE.
114  */
115 
116 /*
117  * This is the i386 pmap modified and generalized to support x86-64
118  * as well. The idea is to hide the upper N levels of the page tables
119  * inside pmap_get_ptp, pmap_free_ptp and pmap_growkernel. The rest
120  * is mostly untouched, except that it uses some more generalized
121  * macros and interfaces.
122  *
123  * This pmap has been tested on the i386 as well, and it can be easily
124  * adapted to PAE.
125  *
126  * fvdl@wasabisystems.com 18-Jun-2001
127  */
128 
129 /*
130  * pmap.c: i386 pmap module rewrite
131  * Chuck Cranor <chuck@ccrc.wustl.edu>
132  * 11-Aug-97
133  *
134  * history of this pmap module: in addition to my own input, i used
135  *    the following references for this rewrite of the i386 pmap:
136  *
137  * [1] the NetBSD i386 pmap.   this pmap appears to be based on the
138  *     BSD hp300 pmap done by Mike Hibler at University of Utah.
139  *     it was then ported to the i386 by William Jolitz of UUNET
140  *     Technologies, Inc.   Then Charles M. Hannum of the NetBSD
141  *     project fixed some bugs and provided some speed ups.
142  *
143  * [2] the FreeBSD i386 pmap.   this pmap seems to be the
144  *     Hibler/Jolitz pmap, as modified for FreeBSD by John S. Dyson
145  *     and David Greenman.
146  *
147  * [3] the Mach pmap.   this pmap, from CMU, seems to have migrated
148  *     between several processors.   the VAX version was done by
149  *     Avadis Tevanian, Jr., and Michael Wayne Young.    the i386
150  *     version was done by Lance Berc, Mike Kupfer, Bob Baron,
151  *     David Golub, and Richard Draves.    the alpha version was
152  *     done by Alessandro Forin (CMU/Mach) and Chris Demetriou
153  *     (NetBSD/alpha).
154  */
155 
156 #include <sys/cdefs.h>
157 __KERNEL_RCSID(0, "$NetBSD: pmap.c,v 1.77 2008/12/18 12:18:20 cegger Exp $");
158 
159 #include "opt_user_ldt.h"
160 #include "opt_lockdebug.h"
161 #include "opt_multiprocessor.h"
162 #include "opt_xen.h"
163 #if !defined(__x86_64__)
164 #include "opt_kstack_dr0.h"
165 #endif /* !defined(__x86_64__) */
166 
167 #include <sys/param.h>
168 #include <sys/systm.h>
169 #include <sys/proc.h>
170 #include <sys/pool.h>
171 #include <sys/user.h>
172 #include <sys/kernel.h>
173 #include <sys/atomic.h>
174 #include <sys/cpu.h>
175 #include <sys/intr.h>
176 
177 #include <uvm/uvm.h>
178 
179 #include <dev/isa/isareg.h>
180 
181 #include <machine/specialreg.h>
182 #include <machine/gdt.h>
183 #include <machine/isa_machdep.h>
184 #include <machine/cpuvar.h>
185 
186 #include <x86/pmap.h>
187 #include <x86/pmap_pv.h>
188 
189 #include <x86/i82489reg.h>
190 #include <x86/i82489var.h>
191 
192 #ifdef XEN
193 #include <xen/xen3-public/xen.h>
194 #include <xen/hypervisor.h>
195 #endif
196 
197 /* flag to be used for kernel mappings: PG_u on Xen/amd64, 0 otherwise */
198 #if defined(XEN) && defined(__x86_64__)
199 #define PG_k PG_u
200 #else
201 #define PG_k 0
202 #endif
203 
204 /*
205  * general info:
206  *
207  *  - for an explanation of how the i386 MMU hardware works see
208  *    the comments in <machine/pte.h>.
209  *
210  *  - for an explanation of the general memory structure used by
211  *    this pmap (including the recursive mapping), see the comments
212  *    in <machine/pmap.h>.
213  *
214  * this file contains the code for the "pmap module."   the module's
215  * job is to manage the hardware's virtual to physical address mappings.
216  * note that there are two levels of mapping in the VM system:
217  *
218  *  [1] the upper layer of the VM system uses vm_map's and vm_map_entry's
219  *      to map ranges of virtual address space to objects/files.  for
220  *      example, the vm_map may say: "map VA 0x1000 to 0x22000 read-only
221  *      to the file /bin/ls starting at offset zero."   note that
222  *      the upper layer mapping is not concerned with how individual
223  *      vm_pages are mapped.
224  *
225  *  [2] the lower layer of the VM system (the pmap) maintains the mappings
226  *      from virtual addresses to physical pages.   it is concerned with
227  *      which vm_page is mapped where.   for example, when you run /bin/ls
228  *      and start at page 0x1000 the fault routine may look up the correct page
229  *      of the /bin/ls file and then ask the pmap layer to establish
230  *      a mapping for it.
231  *
232  * note that information in the lower layer of the VM system can be
233  * thrown away since it can easily be reconstructed from the info
234  * in the upper layer.
235  *
236  * data structures we use include:
237  *
238  *  - struct pmap: describes the address space of one thread
239  *  - struct pv_entry: describes one <PMAP,VA> mapping of a PA
240  *  - struct pv_head: there is one pv_head per managed page of
241  *	physical memory.   the pv_head points to a list of pv_entry
242  *	structures which describe all the <PMAP,VA> pairs that this
243  *      page is mapped in.    this is critical for page based operations
244  *      such as pmap_page_protect() [change protection on _all_ mappings
245  *      of a page]
246  */
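
/*
 * a rough picture (illustrative only, not part of the code) of how the
 * structures described above relate:
 *
 *	vm_page (one per managed PA)
 *	    +--> pv_head --> pv_entry --> pv_entry --> ...
 *	                      <pmap,va>    <pmap,va>
 *
 * i.e. given a physical page we can find every <pmap,va> pair that maps
 * it, and given a pmap and a va we can find the PTE (and thus the PA)
 * through the page tables.
 */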
247 
248 /*
249  * memory allocation
250  *
251  *  - there are three data structures that we must dynamically allocate:
252  *
253  * [A] new process' page directory page (PDP)
254  *	- plan 1: done at pmap_create() we use
255  *	  uvm_km_alloc(kernel_map, PAGE_SIZE)  [fka kmem_alloc] to do this
256  *	  allocation.
257  *
258  * if we are low in free physical memory then we sleep in
259  * uvm_km_alloc -- in this case this is ok since we are creating
260  * a new pmap and should not be holding any locks.
261  *
262  * if the kernel is totally out of virtual space
263  * (i.e. uvm_km_alloc returns NULL), then we panic.
264  *
265  * XXX: the fork code currently has no way to return an "out of
266  * memory, try again" error code since uvm_fork [fka vm_fork]
267  * is a void function.
268  *
269  * [B] new page tables pages (PTP)
270  * 	- call uvm_pagealloc()
271  * 		=> success: zero page, add to pm_pdir
272  * 		=> failure: we are out of free vm_pages, let pmap_enter()
273  *		   tell UVM about it.
274  *
275  * note: for kernel PTPs, we start with NKPTP of them.   as we map
276  * kernel memory (at uvm_map time) we check to see if we've grown
277  * the kernel pmap.   if so, we call the optional function
278  * pmap_growkernel() to grow the kernel PTPs in advance.
279  *
280  * [C] pv_entry structures
281  */
282 
283 /*
284  * locking
285  *
286  * we have the following locks that we must contend with:
287  *
288  * mutexes:
289  *
290  * - pmap lock (per pmap, part of uvm_object)
291  *   this lock protects the fields in the pmap structure including
292  *   the non-kernel PDEs in the PDP, and the PTEs.  it also locks
293  *   in the alternate PTE space (since that is determined by the
294  *   entry in the PDP).
295  *
296  * - pvh_lock (per pv_head)
297  *   this lock protects the pv_entry list which is chained off the
298  *   pv_head structure for a specific managed PA.   it is locked
299  *   when traversing the list (e.g. adding/removing mappings,
300  *   syncing R/M bits, etc.)
301  *
302  * - pmaps_lock
303  *   this lock protects the list of active pmaps (headed by "pmaps").
304  *   we lock it when adding or removing pmaps from this list.
305  *
306  * tlb shootdown
307  *
308  * tlb shootdowns are hard interrupts that operate outside the spl
309  * framework: they don't need to be blocked provided that the pmap module
310  * gets the order of events correct.  the calls are made by talking directly
311  * to the lapic.  the stubs to handle the interrupts are quite short and do
312  * one of the following: invalidate a single page, a range of pages, all
313  * user tlb entries or the entire tlb.
314  *
315  * the cpus synchronize with each other using pmap_mbox structures which are
316  * aligned on 64-byte cache lines.  tlb shootdowns against the kernel pmap
317  * use a global mailbox and are generated using a broadcast ipi (broadcast
318  * to all but the sending cpu).  shootdowns against regular pmaps use
319  * per-cpu mailboxes and are multicast.  kernel and user shootdowns can
320  * execute simultaneously, as can shootdowns within different multithreaded
321  * processes.  TODO:
322  *
323  *   1. figure out which waitpoints can be deferred to pmap_update().
324  *   2. see if there is a cheap way to batch some updates.
325  */
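
/*
 * a minimal, illustrative sketch of the pmaps_lock usage described
 * above (e.g. when putting a new pmap on the global "pmaps" list); the
 * real code is in pmap_create()/pmap_destroy() further down:
 *
 *	mutex_enter(&pmaps_lock);
 *	LIST_INSERT_HEAD(&pmaps, pmap, pm_list);
 *	mutex_exit(&pmaps_lock);
 */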
326 
327 const vaddr_t ptp_masks[] = PTP_MASK_INITIALIZER;
328 const int ptp_shifts[] = PTP_SHIFT_INITIALIZER;
329 const long nkptpmax[] = NKPTPMAX_INITIALIZER;
330 const long nbpd[] = NBPD_INITIALIZER;
331 pd_entry_t * const normal_pdes[] = PDES_INITIALIZER;
332 pd_entry_t * const alternate_pdes[] = APDES_INITIALIZER;
333 
334 long nkptp[] = NKPTP_INITIALIZER;
335 
336 static kmutex_t pmaps_lock;
337 
338 static vaddr_t pmap_maxkvaddr;
339 
340 #define COUNT(x)	/* nothing */
341 
342 /*
343  * XXX kludge: dummy locking to make KASSERTs in uvm_page.c comfortable.
344  * actual locking is done by pm_lock.
345  */
346 #if defined(DIAGNOSTIC)
347 #define	PMAP_SUBOBJ_LOCK(pm, idx) \
348 	KASSERT(mutex_owned(&(pm)->pm_lock)); \
349 	if ((idx) != 0) \
350 		mutex_enter(&(pm)->pm_obj[(idx)].vmobjlock)
351 #define	PMAP_SUBOBJ_UNLOCK(pm, idx) \
352 	KASSERT(mutex_owned(&(pm)->pm_lock)); \
353 	if ((idx) != 0) \
354 		mutex_exit(&(pm)->pm_obj[(idx)].vmobjlock)
355 #else /* defined(DIAGNOSTIC) */
356 #define	PMAP_SUBOBJ_LOCK(pm, idx)	/* nothing */
357 #define	PMAP_SUBOBJ_UNLOCK(pm, idx)	/* nothing */
358 #endif /* defined(DIAGNOSTIC) */
359 
360 /*
361  * Global TLB shootdown mailbox.
362  */
363 struct evcnt pmap_tlb_evcnt __aligned(64);
364 struct pmap_mbox pmap_mbox __aligned(64);
365 
366 /*
367  * Per-CPU data.  The pmap mailbox is cache intensive, so it gets its
368  * own cache line.  Note that the mailbox must be the first item.
369  */
370 struct pmap_cpu {
371 	/* TLB shootdown */
372 	struct pmap_mbox pc_mbox;
373 };
374 
375 union {
376 	struct pmap_cpu pc;
377 	uint8_t padding[64];
378 } pmap_cpu[MAXCPUS] __aligned(64);
379 
380 /*
381  * global data structures
382  */
383 
384 static struct pmap kernel_pmap_store;	/* the kernel's pmap (proc0) */
385 struct pmap *const kernel_pmap_ptr = &kernel_pmap_store;
386 
387 /*
388  * pmap_pg_g: if our processor supports PG_G in the PTE then we
389  * set pmap_pg_g to PG_G (otherwise it is zero).
390  */
391 
392 int pmap_pg_g = 0;
393 
394 /*
395  * pmap_largepages: if our processor supports PG_PS and we are
396  * using it, this is set to true.
397  */
398 
399 int pmap_largepages;
400 
401 /*
402  * i386 physical memory comes in a big contig chunk with a small
403  * hole toward the front of it...  the following two paddr_t's
404  * (shared with machdep.c) describe the physical address space
405  * of this machine.
406  */
407 paddr_t avail_start;	/* PA of first available physical page */
408 paddr_t avail_end;	/* PA of last available physical page */
409 
410 #ifdef XEN
411 /* First avail vaddr in bootstrap space, needed by pmap_bootstrap() */
412 vaddr_t first_bt_vaddr;
413 #ifdef __x86_64__
414 /* Dummy PGD for user cr3, used between pmap_deactivate() and pmap_activate() */
415 static paddr_t xen_dummy_user_pgd;
416 /* Currently active user PGD (can't use rcr3()) */
417 static paddr_t xen_current_user_pgd = 0;
418 #endif /* __x86_64__ */
419 paddr_t pmap_pa_start; /* PA of first physical page for this domain */
420 paddr_t pmap_pa_end;   /* PA of last physical page for this domain */
421 #endif /* XEN */
422 
423 #define	VM_PAGE_TO_PP(pg)	(&(pg)->mdpage.mp_pp)
424 
425 #define	pp_lock(pp)	mutex_spin_enter(&(pp)->pp_lock)
426 #define	pp_unlock(pp)	mutex_spin_exit(&(pp)->pp_lock)
427 #define	pp_locked(pp)	mutex_owned(&(pp)->pp_lock)
428 
429 #define	PV_HASH_SIZE		32768
430 #define	PV_HASH_LOCK_CNT	32
431 
432 struct pv_hash_lock {
433 	kmutex_t lock;
434 } __aligned(CACHE_LINE_SIZE) pv_hash_locks[PV_HASH_LOCK_CNT]
435     __aligned(CACHE_LINE_SIZE);
436 
437 struct pv_hash_head {
438 	SLIST_HEAD(, pv_entry) hh_list;
439 } pv_hash_heads[PV_HASH_SIZE];
440 
441 static u_int
442 pvhash_hash(struct vm_page *ptp, vaddr_t va)
443 {
444 
445 	return (uintptr_t)ptp / sizeof(*ptp) + (va >> PAGE_SHIFT);
446 }
447 
448 static struct pv_hash_head *
449 pvhash_head(u_int hash)
450 {
451 
452 	return &pv_hash_heads[hash % PV_HASH_SIZE];
453 }
454 
455 static kmutex_t *
456 pvhash_lock(u_int hash)
457 {
458 
459 	return &pv_hash_locks[hash % PV_HASH_LOCK_CNT].lock;
460 }
461 
462 static struct pv_entry *
463 pvhash_remove(struct pv_hash_head *hh, struct vm_page *ptp, vaddr_t va)
464 {
465 	struct pv_entry *pve;
466 	struct pv_entry *prev;
467 
468 	prev = NULL;
469 	SLIST_FOREACH(pve, &hh->hh_list, pve_hash) {
470 		if (pve->pve_pte.pte_ptp == ptp &&
471 		    pve->pve_pte.pte_va == va) {
472 			if (prev != NULL) {
473 				SLIST_REMOVE_AFTER(prev, pve_hash);
474 			} else {
475 				SLIST_REMOVE_HEAD(&hh->hh_list, pve_hash);
476 			}
477 			break;
478 		}
479 		prev = pve;
480 	}
481 	return pve;
482 }
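
/*
 * illustrative sketch of the hash lookup pattern used with the helpers
 * above (it mirrors pmap_remove_pv() later in this file); "ptp" and "va"
 * identify the mapping being removed:
 *
 *	u_int hash = pvhash_hash(ptp, va);
 *	kmutex_t *lock = pvhash_lock(hash);
 *	struct pv_hash_head *hh = pvhash_head(hash);
 *	struct pv_entry *pve;
 *
 *	mutex_spin_enter(lock);
 *	pve = pvhash_remove(hh, ptp, va);
 *	mutex_spin_exit(lock);
 */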
483 
484 /*
485  * other data structures
486  */
487 
488 static pt_entry_t protection_codes[8];	/* maps MI prot to i386 prot code */
489 static bool pmap_initialized = false;	/* pmap_init done yet? */
490 
491 /*
492  * the following two vaddr_t's are used during system startup
493  * to keep track of how much of the kernel's VM space we have used.
494  * once the system is started, the management of the remaining kernel
495  * VM space is turned over to the kernel_map vm_map.
496  */
497 
498 static vaddr_t virtual_avail;	/* VA of first free KVA */
499 static vaddr_t virtual_end;	/* VA of last free KVA */
500 
501 /*
502  * linked list of all non-kernel pmaps
503  */
504 
505 static struct pmap_head pmaps;
506 
507 /*
508  * pool that pmap structures are allocated from
509  */
510 
511 static struct pool_cache pmap_cache;
512 
513 /*
514  * pv_entry cache
515  */
516 
517 static struct pool_cache pmap_pv_cache;
518 
519 /*
520  * MULTIPROCESSOR: special VA's/ PTE's are actually allocated inside a
521  * maxcpus*NPTECL array of PTE's, to avoid cache line thrashing
522  * due to false sharing.
523  */
524 
525 #ifdef MULTIPROCESSOR
526 #define PTESLEW(pte, id) ((pte)+(id)*NPTECL)
527 #define VASLEW(va,id) ((va)+(id)*NPTECL*PAGE_SIZE)
528 #else
529 #define PTESLEW(pte, id) (pte)
530 #define VASLEW(va,id) (va)
531 #endif
532 
533 /*
534  * special VAs and the PTEs that map them
535  */
536 static pt_entry_t *csrc_pte, *cdst_pte, *zero_pte, *ptp_pte, *early_zero_pte;
537 static char *csrcp, *cdstp, *zerop, *ptpp, *early_zerop;
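
/*
 * illustrative sketch (hedged) of how the PTESLEW/VASLEW macros above are
 * meant to pick a CPU-private slot, e.g. for the zeroing VA; "id" is the
 * index of the current CPU:
 *
 *	int id = cpu_number();
 *	pt_entry_t *zpte = PTESLEW(zero_pte, id);
 *	void *zva = VASLEW(zerop, id);
 *
 * the caller then installs a temporary PTE in *zpte and accesses the
 * page through zva.
 */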
538 
539 /*
540  * pool and cache that PDPs are allocated from
541  */
542 
543 static struct pool_cache pmap_pdp_cache;
544 int	pmap_pdp_ctor(void *, void *, int);
545 void	pmap_pdp_dtor(void *, void *);
546 #ifdef PAE
547 /* need to allocate items of 4 pages */
548 void *pmap_pdp_alloc(struct pool *, int);
549 void pmap_pdp_free(struct pool *, void *);
550 static struct pool_allocator pmap_pdp_allocator = {
551 	.pa_alloc = pmap_pdp_alloc,
552 	.pa_free = pmap_pdp_free,
553 	.pa_pagesz = PAGE_SIZE * PDP_SIZE,
554 };
555 #endif /* PAE */
556 
557 void *vmmap; /* XXX: used by mem.c... it should really uvm_map_reserve it */
558 
559 extern vaddr_t idt_vaddr;			/* we allocate IDT early */
560 extern paddr_t idt_paddr;
561 
562 #ifdef _LP64
563 extern vaddr_t lo32_vaddr;
564 extern vaddr_t lo32_paddr;
565 #endif
566 
567 extern int end;
568 
569 #ifdef i386
570 /* stuff to fix the pentium f00f bug */
571 extern vaddr_t pentium_idt_vaddr;
572 #endif
573 
574 
575 /*
576  * local prototypes
577  */
578 
579 static struct vm_page	*pmap_get_ptp(struct pmap *, vaddr_t,
580 				      pd_entry_t * const *);
581 static struct vm_page	*pmap_find_ptp(struct pmap *, vaddr_t, paddr_t, int);
582 static void		 pmap_freepage(struct pmap *, struct vm_page *, int);
583 static void		 pmap_free_ptp(struct pmap *, struct vm_page *,
584 				       vaddr_t, pt_entry_t *,
585 				       pd_entry_t * const *);
586 static bool		 pmap_is_curpmap(struct pmap *);
587 static bool		 pmap_is_active(struct pmap *, struct cpu_info *, bool);
588 static void		 pmap_map_ptes(struct pmap *, struct pmap **,
589 				       pt_entry_t **, pd_entry_t * const **);
590 static void		 pmap_do_remove(struct pmap *, vaddr_t, vaddr_t, int);
591 static bool		 pmap_remove_pte(struct pmap *, struct vm_page *,
592 					 pt_entry_t *, vaddr_t, int,
593 					 struct pv_entry **);
594 static pt_entry_t	 pmap_remove_ptes(struct pmap *, struct vm_page *,
595 					  vaddr_t, vaddr_t, vaddr_t, int,
596 					  struct pv_entry **);
597 #define PMAP_REMOVE_ALL		0	/* remove all mappings */
598 #define PMAP_REMOVE_SKIPWIRED	1	/* skip wired mappings */
599 
600 static void		 pmap_unmap_ptes(struct pmap *, struct pmap *);
601 static bool		 pmap_get_physpage(vaddr_t, int, paddr_t *);
602 static int		 pmap_pdes_invalid(vaddr_t, pd_entry_t * const *,
603 					   pd_entry_t *);
604 #define	pmap_pdes_valid(va, pdes, lastpde)	\
605 	(pmap_pdes_invalid((va), (pdes), (lastpde)) == 0)
606 static void		 pmap_alloc_level(pd_entry_t * const *, vaddr_t, int,
607 					  long *);
608 
609 static bool		 pmap_reactivate(struct pmap *);
610 
611 /*
612  * p m a p   h e l p e r   f u n c t i o n s
613  */
614 
615 static inline void
616 pmap_stats_update(struct pmap *pmap, int resid_diff, int wired_diff)
617 {
618 
619 	if (pmap == pmap_kernel()) {
620 		atomic_add_long(&pmap->pm_stats.resident_count, resid_diff);
621 		atomic_add_long(&pmap->pm_stats.wired_count, wired_diff);
622 	} else {
623 		KASSERT(mutex_owned(&pmap->pm_lock));
624 		pmap->pm_stats.resident_count += resid_diff;
625 		pmap->pm_stats.wired_count += wired_diff;
626 	}
627 }
628 
629 static inline void
630 pmap_stats_update_bypte(struct pmap *pmap, pt_entry_t npte, pt_entry_t opte)
631 {
632 	int resid_diff = ((npte & PG_V) ? 1 : 0) - ((opte & PG_V) ? 1 : 0);
633 	int wired_diff = ((npte & PG_W) ? 1 : 0) - ((opte & PG_W) ? 1 : 0);
634 
635 	KASSERT((npte & (PG_V | PG_W)) != PG_W);
636 	KASSERT((opte & (PG_V | PG_W)) != PG_W);
637 
638 	pmap_stats_update(pmap, resid_diff, wired_diff);
639 }
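
/*
 * worked example (illustrative): zapping a valid, wired PTE (opte has
 * PG_V|PG_W set, npte == 0) gives resid_diff == -1 and wired_diff == -1,
 * so both resident_count and wired_count drop by one.
 */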
640 
641 /*
642  * ptp_to_pmap: lookup pmap by ptp
643  */
644 
645 static struct pmap *
646 ptp_to_pmap(struct vm_page *ptp)
647 {
648 	struct pmap *pmap;
649 
650 	if (ptp == NULL) {
651 		return pmap_kernel();
652 	}
653 	pmap = (struct pmap *)ptp->uobject;
654 	KASSERT(pmap != NULL);
655 	KASSERT(&pmap->pm_obj[0] == ptp->uobject);
656 	return pmap;
657 }
658 
659 static inline struct pv_pte *
660 pve_to_pvpte(struct pv_entry *pve)
661 {
662 
663 	KASSERT((void *)&pve->pve_pte == (void *)pve);
664 	return &pve->pve_pte;
665 }
666 
667 static inline struct pv_entry *
668 pvpte_to_pve(struct pv_pte *pvpte)
669 {
670 	struct pv_entry *pve = (void *)pvpte;
671 
672 	KASSERT(pve_to_pvpte(pve) == pvpte);
673 	return pve;
674 }
675 
676 /*
677  * pv_pte_first, pv_pte_next: PV list iterator.
678  */
679 
680 static struct pv_pte *
681 pv_pte_first(struct pmap_page *pp)
682 {
683 
684 	KASSERT(pp_locked(pp));
685 	if ((pp->pp_flags & PP_EMBEDDED) != 0) {
686 		return &pp->pp_pte;
687 	}
688 	return pve_to_pvpte(LIST_FIRST(&pp->pp_head.pvh_list));
689 }
690 
691 static struct pv_pte *
692 pv_pte_next(struct pmap_page *pp, struct pv_pte *pvpte)
693 {
694 
695 	KASSERT(pvpte != NULL);
696 	KASSERT(pp_locked(pp));
697 	if (pvpte == &pp->pp_pte) {
698 		KASSERT((pp->pp_flags & PP_EMBEDDED) != 0);
699 		return NULL;
700 	}
701 	KASSERT((pp->pp_flags & PP_EMBEDDED) == 0);
702 	return pve_to_pvpte(LIST_NEXT(pvpte_to_pve(pvpte), pve_list));
703 }
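
/*
 * illustrative sketch of walking all mappings of a managed page with the
 * iterator above; the page must stay locked for the whole walk, and
 * "handle_mapping" is a hypothetical per-mapping operation:
 *
 *	struct pv_pte *pvpte;
 *
 *	pp_lock(pp);
 *	for (pvpte = pv_pte_first(pp); pvpte != NULL;
 *	    pvpte = pv_pte_next(pp, pvpte)) {
 *		handle_mapping(pvpte->pte_ptp, pvpte->pte_va);
 *	}
 *	pp_unlock(pp);
 */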
704 
705 /*
706  * pmap_is_curpmap: is this pmap the one currently loaded [in %cr3]?
707  *		of course the kernel is always loaded
708  */
709 
710 inline static bool
711 pmap_is_curpmap(struct pmap *pmap)
712 {
713 #if defined(XEN) && defined(__x86_64__)
714 	/*
715 	 * Only kernel pmap is physically loaded.
716 	 * User PGD may be active, but TLB will be flushed
717 	 * with HYPERVISOR_iret anyway, so let's say no
718 	 */
719 	return(pmap == pmap_kernel());
720 #else /* XEN && __x86_64__*/
721 	return((pmap == pmap_kernel()) ||
722 	       (pmap == curcpu()->ci_pmap));
723 #endif
724 }
725 
726 /*
727  * pmap_is_active: is this pmap loaded into the specified processor's %cr3?
728  */
729 
730 inline static bool
731 pmap_is_active(struct pmap *pmap, struct cpu_info *ci, bool kernel)
732 {
733 
734 	return (pmap == pmap_kernel() ||
735 	    (pmap->pm_cpus & ci->ci_cpumask) != 0 ||
736 	    (kernel && (pmap->pm_kernel_cpus & ci->ci_cpumask) != 0));
737 }
738 
739 static void
740 pmap_apte_flush(struct pmap *pmap)
741 {
742 
743 	KASSERT(kpreempt_disabled());
744 
745 	/*
746 	 * Flush the APTE mapping from all other CPUs that
747 	 * are using the pmap we are using (whose APTE space
748 	 * is the one we've just modified).
749 	 *
750 	 * XXXthorpej -- find a way to defer the IPI.
751 	 */
752 	pmap_tlb_shootdown(pmap, (vaddr_t)-1LL, 0, 0);
753 	pmap_tlb_shootwait();
754 }
755 
756 /*
757  *	Add a reference to the specified pmap.
758  */
759 
760 inline void
761 pmap_reference(struct pmap *pmap)
762 {
763 
764 	atomic_inc_uint((unsigned *)&pmap->pm_obj[0].uo_refs);
765 }
766 
767 /*
768  * pmap_map_ptes: map a pmap's PTEs into KVM and lock them in
769  *
770  * => we lock the pmaps involved so that the PTE mappings stay in place
771  * => must be undone with pmap_unmap_ptes before returning
772  */
773 
774 static void
775 pmap_map_ptes(struct pmap *pmap, struct pmap **pmap2,
776     pd_entry_t **ptepp, pd_entry_t * const **pdeppp)
777 {
778 	pd_entry_t opde, npde;
779 	struct pmap *ourpmap;
780 	struct cpu_info *ci;
781 	struct lwp *l;
782 	bool iscurrent;
783 	uint64_t ncsw;
784 #ifdef XEN
785 	int s;
786 #endif
787 
788 	/* the kernel's pmap is always accessible */
789 	if (pmap == pmap_kernel()) {
790 		*pmap2 = NULL;
791 		*ptepp = PTE_BASE;
792 		*pdeppp = normal_pdes;
793 		return;
794 	}
795 	KASSERT(kpreempt_disabled());
796 
797  retry:
798 	l = curlwp;
799 	ncsw = l->l_ncsw;
800  	ourpmap = NULL;
801 	ci = curcpu();
802 #if defined(XEN) && defined(__x86_64__)
803 	/*
804 	 * curmap can only be pmap_kernel so at this point
805 	 * pmap_is_curpmap is always false
806 	 */
807 	iscurrent = 0;
808 	ourpmap = pmap_kernel();
809 #else /* XEN && __x86_64__*/
810 	if (ci->ci_want_pmapload &&
811 	    vm_map_pmap(&l->l_proc->p_vmspace->vm_map) == pmap) {
812 		pmap_load();
813 		if (l->l_ncsw != ncsw)
814 			goto retry;
815 	}
816 	iscurrent = pmap_is_curpmap(pmap);
817 	/* if curpmap then we are always mapped */
818 	if (iscurrent) {
819 		mutex_enter(&pmap->pm_lock);
820 		*pmap2 = NULL;
821 		*ptepp = PTE_BASE;
822 		*pdeppp = normal_pdes;
823 		goto out;
824 	}
825 	ourpmap = ci->ci_pmap;
826 #endif /* XEN && __x86_64__ */
827 
828 	/* need to lock both curpmap and pmap: use ordered locking */
829 	pmap_reference(ourpmap);
830 	if ((uintptr_t) pmap < (uintptr_t) ourpmap) {
831 		mutex_enter(&pmap->pm_lock);
832 		mutex_enter(&ourpmap->pm_lock);
833 	} else {
834 		mutex_enter(&ourpmap->pm_lock);
835 		mutex_enter(&pmap->pm_lock);
836 	}
837 
838 	if (l->l_ncsw != ncsw)
839 		goto unlock_and_retry;
840 
841 	/* need to load a new alternate pt space into curpmap? */
842 	COUNT(apdp_pde_map);
843 	opde = *APDP_PDE;
844 #ifdef XEN
845 	if (!pmap_valid_entry(opde) ||
846 	    pmap_pte2pa(opde) != pmap_pdirpa(pmap, 0)) {
847 		int i;
848 		s = splvm();
849 		/* Make recursive entry usable in user PGD */
850 		for (i = 0; i < PDP_SIZE; i++) {
851 			npde = pmap_pa2pte(
852 			    pmap_pdirpa(pmap, i * NPDPG)) | PG_k | PG_V;
853 			xpq_queue_pte_update(
854 			    xpmap_ptom(pmap_pdirpa(pmap, PDIR_SLOT_PTE + i)),
855 			    npde);
856 			xpq_queue_pte_update(xpmap_ptetomach(&APDP_PDE[i]),
857 			    npde);
858 #ifdef PAE
859 			/* update shadow entry too */
860 			xpq_queue_pte_update(
861 			    xpmap_ptetomach(&APDP_PDE_SHADOW[i]), npde);
862 #endif /* PAE */
863 			xpq_queue_invlpg(
864 			    (vaddr_t)&pmap->pm_pdir[PDIR_SLOT_PTE + i]);
865 		}
866 		xpq_flush_queue();
867 		if (pmap_valid_entry(opde))
868 			pmap_apte_flush(ourpmap);
869 		splx(s);
870 	}
871 #else /* XEN */
872 	npde = pmap_pa2pte(pmap_pdirpa(pmap, 0)) | PG_RW | PG_V;
873 	if (!pmap_valid_entry(opde) ||
874 	    pmap_pte2pa(opde) != pmap_pdirpa(pmap, 0)) {
875 		pmap_pte_set(APDP_PDE, npde);
876 		pmap_pte_flush();
877 		if (pmap_valid_entry(opde))
878 			pmap_apte_flush(ourpmap);
879 	}
880 #endif /* XEN */
881 	*pmap2 = ourpmap;
882 	*ptepp = APTE_BASE;
883 	*pdeppp = alternate_pdes;
884 	KASSERT(l->l_ncsw == ncsw);
885 #if !defined(XEN) || !defined(__x86_64__)
886  out:
887 #endif
888  	/*
889  	 * might have blocked, need to retry?
890  	 */
891 	if (l->l_ncsw != ncsw) {
892  unlock_and_retry:
893 	    	if (ourpmap != NULL) {
894 			mutex_exit(&ourpmap->pm_lock);
895 			pmap_destroy(ourpmap);
896 		}
897 		mutex_exit(&pmap->pm_lock);
898 		goto retry;
899 	}
900 
901 	return;
902 }
903 
904 /*
905  * pmap_unmap_ptes: unlock the PTE mapping of "pmap"
906  */
907 
908 static void
909 pmap_unmap_ptes(struct pmap *pmap, struct pmap *pmap2)
910 {
911 
912 	if (pmap == pmap_kernel()) {
913 		return;
914 	}
915 	KASSERT(kpreempt_disabled());
916 	if (pmap2 == NULL) {
917 		mutex_exit(&pmap->pm_lock);
918 	} else {
919 #if defined(XEN) && defined(__x86_64__)
920 		KASSERT(pmap2 == pmap_kernel());
921 #else
922 		KASSERT(curcpu()->ci_pmap == pmap2);
923 #endif
924 #if defined(MULTIPROCESSOR)
925 		pmap_pte_set(APDP_PDE, 0);
926 		pmap_pte_flush();
927 		pmap_apte_flush(pmap2);
928 #endif
929 		COUNT(apdp_pde_unmap);
930 		mutex_exit(&pmap->pm_lock);
931 		mutex_exit(&pmap2->pm_lock);
932 		pmap_destroy(pmap2);
933 	}
934 }
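
/*
 * illustrative sketch of the pmap_map_ptes/pmap_unmap_ptes calling
 * pattern described above (the real callers appear further down, e.g.
 * in the remove and enter paths):
 *
 *	struct pmap *pmap2;
 *	pt_entry_t *ptes;
 *	pd_entry_t * const *pdes;
 *
 *	kpreempt_disable();
 *	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
 *	... inspect or modify ptes[pl1_i(va)] ...
 *	pmap_unmap_ptes(pmap, pmap2);
 *	kpreempt_enable();
 */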
935 
936 inline static void
937 pmap_exec_account(struct pmap *pm, vaddr_t va, pt_entry_t opte, pt_entry_t npte)
938 {
939 
940 #if !defined(__x86_64__)
941 	if (curproc == NULL || curproc->p_vmspace == NULL ||
942 	    pm != vm_map_pmap(&curproc->p_vmspace->vm_map))
943 		return;
944 
945 	if ((opte ^ npte) & PG_X)
946 		pmap_update_pg(va);
947 
948 	/*
949 	 * Executability was removed on the last executable mapping change.
950 	 * We can't recompute the correct limit here because of locking
951 	 * constraints on the vm map, so reset the code segment to something
952 	 * conservative and let the trap handler set the right limit.
953 	 */
954 
955 	if ((opte & PG_X) && (npte & PG_X) == 0 && va == pm->pm_hiexec) {
956 		struct trapframe *tf = curlwp->l_md.md_regs;
957 
958 		tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL);
959 		pm->pm_hiexec = I386_MAX_EXE_ADDR;
960 	}
961 #endif /* !defined(__x86_64__) */
962 }
963 
964 #if !defined(__x86_64__)
965 /*
966  * Fixup the code segment to cover all potential executable mappings.
967  * returns 0 if no changes to the code segment were made.
968  */
969 
970 int
971 pmap_exec_fixup(struct vm_map *map, struct trapframe *tf, struct pcb *pcb)
972 {
973 	struct vm_map_entry *ent;
974 	struct pmap *pm = vm_map_pmap(map);
975 	vaddr_t va = 0;
976 
977 	vm_map_lock_read(map);
978 	for (ent = (&map->header)->next; ent != &map->header; ent = ent->next) {
979 
980 		/*
981 		 * This entry has greater va than the entries before.
982 		 * We need to make it point to the last page, not past it.
983 		 */
984 
985 		if (ent->protection & VM_PROT_EXECUTE)
986 			va = trunc_page(ent->end) - PAGE_SIZE;
987 	}
988 	vm_map_unlock_read(map);
989 	if (va == pm->pm_hiexec && tf->tf_cs == GSEL(GUCODEBIG_SEL, SEL_UPL))
990 		return (0);
991 
992 	pm->pm_hiexec = va;
993 	if (pm->pm_hiexec > I386_MAX_EXE_ADDR) {
994 		tf->tf_cs = GSEL(GUCODEBIG_SEL, SEL_UPL);
995 	} else {
996 		tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL);
997 		return (0);
998 	}
999 	return (1);
1000 }
1001 #endif /* !defined(__x86_64__) */
1002 
1003 /*
1004  * p m a p   k e n t e r   f u n c t i o n s
1005  *
1006  * functions to quickly enter/remove pages from the kernel address
1007  * space.   pmap_kremove is exported to MI kernel.  we make use of
1008  * the recursive PTE mappings.
1009  */
1010 
1011 /*
1012  * pmap_kenter_pa: enter a kernel mapping without R/M (pv_entry) tracking
1013  *
1014  * => no need to lock anything, assume va is already allocated
1015  * => should be faster than normal pmap enter function
1016  */
1017 
1018 void
1019 pmap_kenter_pa(vaddr_t va, paddr_t pa, vm_prot_t prot)
1020 {
1021 	pt_entry_t *pte, opte, npte;
1022 
1023 	KASSERT(!(prot & ~VM_PROT_ALL));
1024 
1025 	if (va < VM_MIN_KERNEL_ADDRESS)
1026 		pte = vtopte(va);
1027 	else
1028 		pte = kvtopte(va);
1029 #ifdef DOM0OPS
1030 	if (pa < pmap_pa_start || pa >= pmap_pa_end) {
1031 #ifdef DEBUG
1032 		printk("pmap_kenter_pa: pa 0x%" PRIx64 " for va 0x%" PRIx64
1033 		    " outside range\n", (uint64_t)pa, (uint64_t)va);
1034 #endif /* DEBUG */
1035 		npte = pa;
1036 	} else
1037 #endif /* DOM0OPS */
1038 		npte = pmap_pa2pte(pa);
1039 	npte |= protection_codes[prot] | PG_k | PG_V | pmap_pg_g;
1040 	opte = pmap_pte_testset(pte, npte); /* zap! */
1041 #if defined(DIAGNOSTIC)
1042 	/* XXX For now... */
1043 	if (opte & PG_PS)
1044 		panic("pmap_kenter_pa: PG_PS");
1045 #endif
1046 	if ((opte & (PG_V | PG_U)) == (PG_V | PG_U)) {
1047 		/* This should not happen, so no need to batch updates. */
1048 		kpreempt_disable();
1049 		pmap_tlb_shootdown(pmap_kernel(), va, 0, opte);
1050 		kpreempt_enable();
1051 	}
1052 }
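
/*
 * illustrative usage sketch (hedged): mapping a page of physical memory
 * at a kernel VA and tearing the mapping down again; "va" must already
 * be allocated kernel VA space:
 *
 *	pmap_kenter_pa(va, pa, VM_PROT_READ | VM_PROT_WRITE);
 *	... use the mapping at va ...
 *	pmap_kremove(va, PAGE_SIZE);
 *	pmap_update(pmap_kernel());
 */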
1053 
1054 #ifdef XEN
1055 /*
1056  * pmap_kenter_ma: enter a kernel mapping without R/M (pv_entry) tracking
1057  *
1058  * => no need to lock anything, assume va is already allocated
1059  * => should be faster than normal pmap enter function
1060  * => we expect a MACHINE address
1061  */
1062 
1063 void
1064 pmap_kenter_ma(vaddr_t va, paddr_t ma, vm_prot_t prot)
1065 {
1066 	pt_entry_t *pte, opte, npte;
1067 
1068 	if (va < VM_MIN_KERNEL_ADDRESS)
1069 		pte = vtopte(va);
1070 	else
1071 		pte = kvtopte(va);
1072 
1073 	npte = ma | ((prot & VM_PROT_WRITE) ? PG_RW : PG_RO) |
1074 	     PG_V | PG_k;
1075 #ifndef XEN
1076 	if ((cpu_feature & CPUID_NOX) && !(prot & VM_PROT_EXECUTE))
1077 		npte |= PG_NX;
1078 #endif
1079 	opte = pmap_pte_testset (pte, npte); /* zap! */
1080 
1081 	if (pmap_valid_entry(opte)) {
1082 #if defined(MULTIPROCESSOR)
1083 		kpreempt_disable();
1084 		pmap_tlb_shootdown(pmap_kernel(), va, 0, opte);
1085 		kpreempt_enable();
1086 #else
1087 		/* Don't bother deferring in the single CPU case. */
1088 		pmap_update_pg(va);
1089 #endif
1090 	}
1091 }
1092 #endif	/* XEN */
1093 
1094 #if defined(__x86_64__)
1095 /*
1096  * Change protection for a virtual address.  This is local to the current
1097  * CPU only; no TLB shootdowns are sent to other CPUs.
1098  *
1099  * => must be called with preemption disabled
1100  */
1101 void
1102 pmap_changeprot_local(vaddr_t va, vm_prot_t prot)
1103 {
1104 	pt_entry_t *pte, opte, npte;
1105 
1106 	KASSERT(kpreempt_disabled());
1107 
1108 	if (va < VM_MIN_KERNEL_ADDRESS)
1109 		pte = vtopte(va);
1110 	else
1111 		pte = kvtopte(va);
1112 
1113 	npte = opte = *pte;
1114 
1115 	if ((prot & VM_PROT_WRITE) != 0)
1116 		npte |= PG_RW;
1117 	else
1118 		npte &= ~PG_RW;
1119 
1120 	if (opte != npte) {
1121 		pmap_pte_set(pte, npte);
1122 		pmap_pte_flush();
1123 		invlpg(va);
1124 	}
1125 }
1126 #endif /* defined(__x86_64__) */
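
/*
 * illustrative usage sketch for pmap_changeprot_local() (hedged): the
 * caller must keep preemption disabled around the calls, e.g. to make a
 * kernel page temporarily read-only on this CPU:
 *
 *	kpreempt_disable();
 *	pmap_changeprot_local(va, VM_PROT_READ);
 *	... do the work that relies on the protection change ...
 *	pmap_changeprot_local(va, VM_PROT_READ | VM_PROT_WRITE);
 *	kpreempt_enable();
 */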
1127 
1128 /*
1129  * pmap_kremove: remove a kernel mapping(s) without R/M (pv_entry) tracking
1130  *
1131  * => no need to lock anything
1132  * => caller must dispose of any vm_page mapped in the va range
1133  * => note: not an inline function
1134  * => we assume the va is page aligned and the len is a multiple of PAGE_SIZE
1135  * => we assume kernel only unmaps valid addresses and thus don't bother
1136  *    checking the valid bit before doing TLB flushing
1137  * => must be followed by call to pmap_update() before reuse of page
1138  */
1139 
1140 void
1141 pmap_kremove(vaddr_t sva, vsize_t len)
1142 {
1143 	pt_entry_t *pte, xpte;
1144 	vaddr_t va, eva;
1145 
1146 	eva = sva + len;
1147 	xpte = 0;
1148 
1149 	for (va = sva; va < eva; va += PAGE_SIZE) {
1150 		if (va < VM_MIN_KERNEL_ADDRESS)
1151 			pte = vtopte(va);
1152 		else
1153 			pte = kvtopte(va);
1154 		xpte |= pmap_pte_testset(pte, 0); /* zap! */
1155 #if defined(DIAGNOSTIC)
1156 		/* XXX For now... */
1157 		if (xpte & PG_PS)
1158 			panic("pmap_kremove: PG_PS");
1159 		if (xpte & PG_PVLIST)
1160 			panic("pmap_kremove: PG_PVLIST mapping for 0x%lx",
1161 			      va);
1162 #endif
1163 	}
1164 	if ((xpte & (PG_V | PG_U)) == (PG_V | PG_U)) {
1165 		kpreempt_disable();
1166 		pmap_tlb_shootdown(pmap_kernel(), sva, eva, xpte);
1167 		kpreempt_enable();
1168 	}
1169 }
1170 
1171 /*
1172  * p m a p   i n i t   f u n c t i o n s
1173  *
1174  * pmap_bootstrap and pmap_init are called during system startup
1175  * to init the pmap module.   pmap_bootstrap() does a low level
1176  * init just to get things rolling.   pmap_init() finishes the job.
1177  */
1178 
1179 /*
1180  * pmap_bootstrap: get the system in a state where it can run with VM
1181  *	properly enabled (called before main()).   the VM system is
1182  *      fully init'd later...
1183  *
1184  * => on i386, locore.s has already enabled the MMU by allocating
1185  *	a PDP for the kernel, and nkpde PTP's for the kernel.
1186  * => kva_start is the first free virtual address in kernel space
1187  */
1188 
1189 void
1190 pmap_bootstrap(vaddr_t kva_start)
1191 {
1192 	struct pmap *kpm;
1193 	pt_entry_t *pte;
1194 	int i;
1195 	vaddr_t kva;
1196 #ifdef XEN
1197 	pt_entry_t pg_nx = 0;
1198 #else
1199 	unsigned long p1i;
1200 	vaddr_t kva_end;
1201 	pt_entry_t pg_nx = (cpu_feature & CPUID_NOX ? PG_NX : 0);
1202 #endif
1203 
1204 	/*
1205 	 * set up our local static global vars that keep track of the
1206 	 * usage of KVM before kernel_map is set up
1207 	 */
1208 
1209 	virtual_avail = kva_start;		/* first free KVA */
1210 	virtual_end = VM_MAX_KERNEL_ADDRESS;	/* last KVA */
1211 
1212 	/*
1213 	 * set up protection_codes: we need to be able to convert from
1214 	 * a MI protection code (some combo of VM_PROT...) to something
1215 	 * we can jam into a i386 PTE.
1216 	 */
1217 
1218 	protection_codes[VM_PROT_NONE] = pg_nx;			/* --- */
1219 	protection_codes[VM_PROT_EXECUTE] = PG_RO | PG_X;	/* --x */
1220 	protection_codes[VM_PROT_READ] = PG_RO | pg_nx;		/* -r- */
1221 	protection_codes[VM_PROT_READ|VM_PROT_EXECUTE] = PG_RO | PG_X;/* -rx */
1222 	protection_codes[VM_PROT_WRITE] = PG_RW | pg_nx;	/* w-- */
1223 	protection_codes[VM_PROT_WRITE|VM_PROT_EXECUTE] = PG_RW | PG_X;/* w-x */
1224 	protection_codes[VM_PROT_WRITE|VM_PROT_READ] = PG_RW | pg_nx;
1225 								/* wr- */
1226 	protection_codes[VM_PROT_ALL] = PG_RW | PG_X;		/* wrx */
1227 
1228 	/*
1229 	 * now we init the kernel's pmap
1230 	 *
1231 	 * the kernel pmap's pm_obj is not used for much.   however, in
1232 	 * user pmaps the pm_obj contains the list of active PTPs.
1233 	 * the pm_obj currently does not have a pager.   it might be possible
1234 	 * to add a pager that would allow a process to read-only mmap its
1235 	 * own page tables (fast user level vtophys?).   this may or may not
1236 	 * be useful.
1237 	 */
1238 
1239 	kpm = pmap_kernel();
1240 	for (i = 0; i < PTP_LEVELS - 1; i++) {
1241 		UVM_OBJ_INIT(&kpm->pm_obj[i], NULL, 1);
1242 		kpm->pm_ptphint[i] = NULL;
1243 	}
1244 	memset(&kpm->pm_list, 0, sizeof(kpm->pm_list));  /* pm_list not used */
1245 	kpm->pm_pdir = (pd_entry_t *)(lwp0.l_addr->u_pcb.pcb_cr3 + KERNBASE);
1246 #ifdef PAE
1247 	for (i = 0; i < PDP_SIZE; i++)
1248 		kpm->pm_pdirpa[i] =
1249 		    (paddr_t)lwp0.l_addr->u_pcb.pcb_cr3 + PAGE_SIZE * i;
1250 #else
1251 	kpm->pm_pdirpa = (paddr_t) lwp0.l_addr->u_pcb.pcb_cr3;
1252 #endif
1253 	kpm->pm_stats.wired_count = kpm->pm_stats.resident_count =
1254 		x86_btop(kva_start - VM_MIN_KERNEL_ADDRESS);
1255 
1256 	/*
1257 	 * the above is just a rough estimate and not critical to the proper
1258 	 * operation of the system.
1259 	 */
1260 
1261 #ifndef XEN
1262 	/*
1263 	 * Begin to enable global TLB entries if they are supported.
1264 	 * The G bit has no effect until the CR4_PGE bit is set in CR4,
1265 	 * which happens in cpu_init(), which is run on each cpu
1266 	 * (and happens later)
1267 	 */
1268 
1269 	if (cpu_feature & CPUID_PGE) {
1270 		pmap_pg_g = PG_G;		/* enable software */
1271 
1272 		/* add PG_G attribute to already mapped kernel pages */
1273 		if (KERNBASE == VM_MIN_KERNEL_ADDRESS) {
1274 			kva_end = virtual_avail;
1275 		} else {
1276 			extern vaddr_t eblob, esym;
1277 			kva_end = (vaddr_t)&end;
1278 			if (esym > kva_end)
1279 				kva_end = esym;
1280 			if (eblob > kva_end)
1281 				kva_end = eblob;
1282 			kva_end = roundup(kva_end, PAGE_SIZE);
1283 		}
1284 		for (kva = KERNBASE; kva < kva_end; kva += PAGE_SIZE) {
1285 			p1i = pl1_i(kva);
1286 			if (pmap_valid_entry(PTE_BASE[p1i]))
1287 				PTE_BASE[p1i] |= PG_G;
1288 		}
1289 	}
1290 
1291 	/*
1292 	 * enable large pages if they are supported.
1293 	 */
1294 
1295 	if (cpu_feature & CPUID_PSE) {
1296 		paddr_t pa;
1297 		pd_entry_t *pde;
1298 		extern char __data_start;
1299 
1300 		lcr4(rcr4() | CR4_PSE);	/* enable hardware (via %cr4) */
1301 		pmap_largepages = 1;	/* enable software */
1302 
1303 		/*
1304 		 * the TLB must be flushed after enabling large pages
1305 		 * on Pentium CPUs, according to section 3.6.2.2 of
1306 		 * "Intel Architecture Software Developer's Manual,
1307 		 * Volume 3: System Programming".
1308 		 */
1309 		tlbflush();
1310 
1311 		/*
1312 		 * now, remap the kernel text using large pages.  we
1313 		 * assume that the linker has properly aligned the
1314 		 * .data segment to a NBPD_L2 boundary.
1315 		 */
1316 		kva_end = rounddown((vaddr_t)&__data_start, NBPD_L1);
1317 		for (pa = 0, kva = KERNBASE; kva + NBPD_L2 <= kva_end;
1318 		     kva += NBPD_L2, pa += NBPD_L2) {
1319 			pde = &L2_BASE[pl2_i(kva)];
1320 			*pde = pa | pmap_pg_g | PG_PS |
1321 			    PG_KR | PG_V;	/* zap! */
1322 			tlbflush();
1323 		}
1324 #if defined(DEBUG)
1325 		printf("kernel text is mapped with "
1326 		    "%lu large pages and %lu normal pages\n",
1327 		    (unsigned long)howmany(kva - KERNBASE, NBPD_L2),
1328 		    (unsigned long)howmany((vaddr_t)&__data_start - kva,
1329 		    NBPD_L1));
1330 #endif /* defined(DEBUG) */
1331 	}
1332 #endif /* !XEN */
1333 
1334 	if (VM_MIN_KERNEL_ADDRESS != KERNBASE) {
1335 		/*
1336 		 * zero_pte is stuck at the end of mapped space for the kernel
1337 		 * image (disjunct from kva space). This is done so that it
1338 		 * can safely be used in pmap_growkernel (pmap_get_physpage),
1339 		 * when it's called for the first time.
1340 		 * XXXfvdl fix this for MULTIPROCESSOR later.
1341 		 */
1342 
1343 		early_zerop = (void *)(KERNBASE + NKL2_KIMG_ENTRIES * NBPD_L2);
1344 		early_zero_pte = PTE_BASE + pl1_i((unsigned long)early_zerop);
1345 	}
1346 
1347 	/*
1348 	 * now we allocate the "special" VAs which are used for tmp mappings
1349 	 * by the pmap (and other modules).    we allocate the VAs by advancing
1350 	 * virtual_avail (note that there are no pages mapped at these VAs).
1351 	 * we find the PTE that maps the allocated VA via the linear PTE
1352 	 * mapping.
1353 	 */
1354 
1355 	pte = PTE_BASE + pl1_i(virtual_avail);
1356 
1357 #ifdef MULTIPROCESSOR
1358 	/*
1359 	 * Waste some VA space to avoid false sharing of cache lines
1360 	 * for page table pages: Give each possible CPU a cache line
1361 	 * of PTE's (8) to play with, though we only need 4.  We could
1362 	 * recycle some of this waste by putting the idle stacks here
1363 	 * as well; we could waste less space if we knew the largest
1364 	 * CPU ID beforehand.
1365 	 */
1366 	csrcp = (char *) virtual_avail;  csrc_pte = pte;
1367 
1368 	cdstp = (char *) virtual_avail+PAGE_SIZE;  cdst_pte = pte+1;
1369 
1370 	zerop = (char *) virtual_avail+PAGE_SIZE*2;  zero_pte = pte+2;
1371 
1372 	ptpp = (char *) virtual_avail+PAGE_SIZE*3;  ptp_pte = pte+3;
1373 
1374 	virtual_avail += PAGE_SIZE * maxcpus * NPTECL;
1375 	pte += maxcpus * NPTECL;
1376 #else
1377 	csrcp = (void *) virtual_avail;  csrc_pte = pte;	/* allocate */
1378 	virtual_avail += PAGE_SIZE; pte++;			/* advance */
1379 
1380 	cdstp = (void *) virtual_avail;  cdst_pte = pte;
1381 	virtual_avail += PAGE_SIZE; pte++;
1382 
1383 	zerop = (void *) virtual_avail;  zero_pte = pte;
1384 	virtual_avail += PAGE_SIZE; pte++;
1385 
1386 	ptpp = (void *) virtual_avail;  ptp_pte = pte;
1387 	virtual_avail += PAGE_SIZE; pte++;
1388 #endif
1389 
1390 	if (VM_MIN_KERNEL_ADDRESS == KERNBASE) {
1391 		early_zerop = zerop;
1392 		early_zero_pte = zero_pte;
1393 	}
1394 
1395 	/*
1396 	 * Nothing after this point actually needs pte;
1397 	 */
1398 	pte = (void *)0xdeadbeef;
1399 
1400 	/* XXX: vmmap used by mem.c... should be uvm_map_reserve */
1401 	/* XXXfvdl PTEs not needed here */
1402 	vmmap = (char *)virtual_avail;			/* don't need pte */
1403 	virtual_avail += PAGE_SIZE; pte++;
1404 
1405 #ifdef XEN
1406 #ifdef __x86_64__
1407 	/*
1408 	 * We want a dummy page directory for Xen:
1409 	 * when we deactivate a pmap, Xen will still consider it active.
1410 	 * So we point the user PGD at this dummy one to lift all protection
1411 	 * on the now-inactive page table set.
1412 	 */
1413 	xen_dummy_user_pgd = avail_start;
1414 	avail_start += PAGE_SIZE;
1415 
1416 	/* Zero fill it, the less checks in Xen it requires the better */
1417 	memset((void *) (xen_dummy_user_pgd + KERNBASE), 0, PAGE_SIZE);
1418 	/* Mark read-only */
1419 	HYPERVISOR_update_va_mapping(xen_dummy_user_pgd + KERNBASE,
1420 	    pmap_pa2pte(xen_dummy_user_pgd) | PG_u | PG_V, UVMF_INVLPG);
1421 	/* Pin as L4 */
1422 	xpq_queue_pin_table(xpmap_ptom_masked(xen_dummy_user_pgd));
1423 #endif /* __x86_64__ */
1424 	idt_vaddr = virtual_avail;                      /* don't need pte */
1425 	idt_paddr = avail_start;                        /* steal a page */
1426 	/*
1427 	 * Xen requires one more page, as we can't store the
1428 	 * GDT and LDT on the same page.
1429 	 */
1430 	virtual_avail += 3 * PAGE_SIZE;
1431 	avail_start += 3 * PAGE_SIZE;
1432 #else /* XEN */
1433 	idt_vaddr = virtual_avail;			/* don't need pte */
1434 	idt_paddr = avail_start;			/* steal a page */
1435 #if defined(__x86_64__)
1436 	virtual_avail += 2 * PAGE_SIZE; pte += 2;
1437 	avail_start += 2 * PAGE_SIZE;
1438 #else /* defined(__x86_64__) */
1439 	virtual_avail += PAGE_SIZE; pte++;
1440 	avail_start += PAGE_SIZE;
1441 	/* pentium f00f bug stuff */
1442 	pentium_idt_vaddr = virtual_avail;		/* don't need pte */
1443 	virtual_avail += PAGE_SIZE; pte++;
1444 #endif /* defined(__x86_64__) */
1445 #endif /* XEN */
1446 
1447 #ifdef _LP64
1448 	/*
1449 	 * Grab a page below 4G for things that need it (i.e.
1450 	 * having an initial %cr3 for the MP trampoline).
1451 	 */
1452 	lo32_vaddr = virtual_avail;
1453 	virtual_avail += PAGE_SIZE; pte++;
1454 	lo32_paddr = avail_start;
1455 	avail_start += PAGE_SIZE;
1456 #endif
1457 
1458 	/*
1459 	 * now we reserve some VM for mapping pages when doing a crash dump
1460 	 */
1461 
1462 	virtual_avail = reserve_dumppages(virtual_avail);
1463 
1464 	/*
1465 	 * init the static-global locks and global lists.
1466 	 *
1467 	 * => pventry::pvh_lock (initialized elsewhere) must also be
1468 	 *      a spin lock at IPL_VM to prevent deadlock, and it is
1469 	 *	never taken from interrupt context.
1470 	 */
1471 
1472 	mutex_init(&pmaps_lock, MUTEX_DEFAULT, IPL_NONE);
1473 	LIST_INIT(&pmaps);
1474 	pmap_cpu_init_early(curcpu());
1475 
1476 	/*
1477 	 * initialize caches.
1478 	 */
1479 
1480 	pool_cache_bootstrap(&pmap_cache, sizeof(struct pmap), 0, 0, 0,
1481 	    "pmappl", NULL, IPL_NONE, NULL, NULL, NULL);
1482 #ifdef PAE
1483 	pool_cache_bootstrap(&pmap_pdp_cache, PAGE_SIZE * PDP_SIZE, 0, 0, 0,
1484 	    "pdppl", &pmap_pdp_allocator, IPL_NONE,
1485 	    pmap_pdp_ctor, pmap_pdp_dtor, NULL);
1486 #else /* PAE */
1487 	pool_cache_bootstrap(&pmap_pdp_cache, PAGE_SIZE, 0, 0, 0,
1488 	    "pdppl", NULL, IPL_NONE, pmap_pdp_ctor, pmap_pdp_dtor, NULL);
1489 #endif /* PAE */
1490 	pool_cache_bootstrap(&pmap_pv_cache, sizeof(struct pv_entry), 0, 0,
1491 	    PR_LARGECACHE, "pvpl", &pool_allocator_meta, IPL_NONE, NULL,
1492 	    NULL, NULL);
1493 
1494 	/*
1495 	 * ensure the TLB is sync'd with reality by flushing it...
1496 	 */
1497 
1498 	tlbflush();
1499 
1500 	/*
1501 	 * calculate pmap_maxkvaddr from nkptp[].
1502 	 */
1503 
1504 	kva = VM_MIN_KERNEL_ADDRESS;
1505 	for (i = PTP_LEVELS - 1; i >= 1; i--) {
1506 		kva += nkptp[i] * nbpd[i];
1507 	}
1508 	pmap_maxkvaddr = kva;
1509 }
1510 
1511 #if defined(__x86_64__)
1512 /*
1513  * Pre-allocate PTPs for low memory, so that 1:1 mappings for various
1514  * trampoline code can be entered.
1515  */
1516 void
1517 pmap_prealloc_lowmem_ptps(void)
1518 {
1519 #ifdef XEN
1520 	int level;
1521 	paddr_t newp;
1522 	paddr_t pdes_pa;
1523 
1524 	pdes_pa = pmap_kernel()->pm_pdirpa;
1525 	level = PTP_LEVELS;
1526 	for (;;) {
1527 		newp = avail_start;
1528 		avail_start += PAGE_SIZE;
1529 		HYPERVISOR_update_va_mapping ((vaddr_t)early_zerop,
1530 		    xpmap_ptom_masked(newp) | PG_u | PG_V | PG_RW, UVMF_INVLPG);
1531 		memset((void *)early_zerop, 0, PAGE_SIZE);
1532 		/* Mark R/O before installing */
1533 		HYPERVISOR_update_va_mapping ((vaddr_t)early_zerop,
1534 		    xpmap_ptom_masked(newp) | PG_u | PG_V, UVMF_INVLPG);
1535 		if (newp < (NKL2_KIMG_ENTRIES * NBPD_L2))
1536 			HYPERVISOR_update_va_mapping (newp + KERNBASE,
1537 			    xpmap_ptom_masked(newp) | PG_u | PG_V, UVMF_INVLPG);
1538 		xpq_queue_pte_update (
1539 			xpmap_ptom_masked(pdes_pa)
1540 			+ (pl_i(0, level) * sizeof (pd_entry_t)),
1541 			xpmap_ptom_masked(newp) | PG_RW | PG_u | PG_V);
1542 		level--;
1543 		if (level <= 1)
1544 			break;
1545 		pdes_pa = newp;
1546 	}
1547 #else /* XEN */
1548 	pd_entry_t *pdes;
1549 	int level;
1550 	paddr_t newp;
1551 
1552 	pdes = pmap_kernel()->pm_pdir;
1553 	level = PTP_LEVELS;
1554 	for (;;) {
1555 		newp = avail_start;
1556 		avail_start += PAGE_SIZE;
1557 		*early_zero_pte = (newp & PG_FRAME) | PG_V | PG_RW;
1558 		pmap_update_pg((vaddr_t)early_zerop);
1559 		memset(early_zerop, 0, PAGE_SIZE);
1560 		pdes[pl_i(0, level)] = (newp & PG_FRAME) | PG_V | PG_RW;
1561 		level--;
1562 		if (level <= 1)
1563 			break;
1564 		pdes = normal_pdes[level - 2];
1565 	}
1566 #endif /* XEN */
1567 }
1568 #endif /* defined(__x86_64__) */
1569 
1570 /*
1571  * pmap_init: called from uvm_init, our job is to get the pmap
1572  * system ready to manage mappings...
1573  */
1574 
1575 void
1576 pmap_init(void)
1577 {
1578 	int i;
1579 
1580 	for (i = 0; i < PV_HASH_SIZE; i++) {
1581 		SLIST_INIT(&pv_hash_heads[i].hh_list);
1582 	}
1583 	for (i = 0; i < PV_HASH_LOCK_CNT; i++) {
1584 		mutex_init(&pv_hash_locks[i].lock, MUTEX_NODEBUG, IPL_VM);
1585 	}
1586 
1587 	/*
1588 	 * done: pmap module is up (and ready for business)
1589 	 */
1590 
1591 	pmap_initialized = true;
1592 }
1593 
1594 /*
1595  * pmap_cpu_init_early: perform early per-CPU initialization.
1596  */
1597 
1598 void
1599 pmap_cpu_init_early(struct cpu_info *ci)
1600 {
1601 	struct pmap_cpu *pc;
1602 	static uint8_t pmap_cpu_alloc;
1603 
1604 	pc = &pmap_cpu[pmap_cpu_alloc++].pc;
1605 	ci->ci_pmap_cpu = pc;
1606 }
1607 
1608 /*
1609  * pmap_cpu_init_late: perform late per-CPU initialization.
1610  */
1611 
1612 void
1613 pmap_cpu_init_late(struct cpu_info *ci)
1614 {
1615 
1616 	if (ci == &cpu_info_primary)
1617 		evcnt_attach_dynamic(&pmap_tlb_evcnt, EVCNT_TYPE_INTR,
1618 		    NULL, "global", "TLB IPI");
1619 	evcnt_attach_dynamic(&ci->ci_tlb_evcnt, EVCNT_TYPE_MISC,
1620 	    NULL, device_xname(ci->ci_dev), "TLB IPI");
1621 }
1622 
1623 /*
1624  * p v _ e n t r y   f u n c t i o n s
1625  */
1626 
1627 /*
1628  * pmap_free_pvs: free a list of pv_entrys
1629  */
1630 
1631 static void
1632 pmap_free_pvs(struct pv_entry *pve)
1633 {
1634 	struct pv_entry *next;
1635 
1636 	for ( /* null */ ; pve != NULL ; pve = next) {
1637 		next = pve->pve_next;
1638 		pool_cache_put(&pmap_pv_cache, pve);
1639 	}
1640 }
1641 
1642 /*
1643  * main pv_entry manipulation functions:
1644  *   pmap_enter_pv: enter a mapping onto a pv_head list
1645  *   pmap_remove_pv: remove a mapping from a pv_head list
1646  *
1647  * NOTE: Both pmap_enter_pv and pmap_remove_pv expect the caller to lock
1648  *       the pvh before calling
1649  */
1650 
1651 /*
1652  * insert_pv: a helper of pmap_enter_pv
1653  */
1654 
1655 static void
1656 insert_pv(struct pmap_page *pp, struct pv_entry *pve)
1657 {
1658 	struct pv_hash_head *hh;
1659 	kmutex_t *lock;
1660 	u_int hash;
1661 
1662 	KASSERT(pp_locked(pp));
1663 
1664 	hash = pvhash_hash(pve->pve_pte.pte_ptp, pve->pve_pte.pte_va);
1665 	lock = pvhash_lock(hash);
1666 	hh = pvhash_head(hash);
1667 	mutex_spin_enter(lock);
1668 	SLIST_INSERT_HEAD(&hh->hh_list, pve, pve_hash);
1669 	mutex_spin_exit(lock);
1670 
1671 	LIST_INSERT_HEAD(&pp->pp_head.pvh_list, pve, pve_list);
1672 }
1673 
1674 /*
1675  * pmap_enter_pv: enter a mapping onto a pv_head list
1676  *
1677  * => caller should have the pp_lock locked
1678  * => caller should adjust ptp's wire_count before calling
1679  */
1680 
1681 static struct pv_entry *
1682 pmap_enter_pv(struct pmap_page *pp,
1683 	      struct pv_entry *pve,	/* preallocated pve for us to use */
1684 	      struct pv_entry **sparepve,
1685 	      struct vm_page *ptp,
1686 	      vaddr_t va)
1687 {
1688 
1689 	KASSERT(ptp == NULL || ptp->wire_count >= 2);
1690 	KASSERT(ptp == NULL || ptp->uobject != NULL);
1691 	KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset);
1692 	KASSERT(pp_locked(pp));
1693 
1694 	if ((pp->pp_flags & PP_EMBEDDED) == 0) {
1695 		if (LIST_EMPTY(&pp->pp_head.pvh_list)) {
1696 			pp->pp_flags |= PP_EMBEDDED;
1697 			pp->pp_pte.pte_ptp = ptp;
1698 			pp->pp_pte.pte_va = va;
1699 
1700 			return pve;
1701 		}
1702 	} else {
1703 		struct pv_entry *pve2;
1704 
1705 		pve2 = *sparepve;
1706 		*sparepve = NULL;
1707 
1708 		pve2->pve_pte = pp->pp_pte;
1709 		pp->pp_flags &= ~PP_EMBEDDED;
1710 		LIST_INIT(&pp->pp_head.pvh_list);
1711 		insert_pv(pp, pve2);
1712 	}
1713 
1714 	pve->pve_pte.pte_ptp = ptp;
1715 	pve->pve_pte.pte_va = va;
1716 	insert_pv(pp, pve);
1717 
1718 	return NULL;
1719 }
1720 
1721 /*
1722  * pmap_remove_pv: try to remove a mapping from a pv_list
1723  *
1724  * => caller should hold pp_lock [so that attrs can be adjusted]
1725  * => caller should adjust ptp's wire_count and free PTP if needed
1726  * => we return the removed pve
1727  */
1728 
1729 static struct pv_entry *
1730 pmap_remove_pv(struct pmap_page *pp, struct vm_page *ptp, vaddr_t va)
1731 {
1732 	struct pv_hash_head *hh;
1733 	struct pv_entry *pve;
1734 	kmutex_t *lock;
1735 	u_int hash;
1736 
1737 	KASSERT(ptp == NULL || ptp->uobject != NULL);
1738 	KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset);
1739 	KASSERT(pp_locked(pp));
1740 
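	/*
	 * if this is the mapping embedded in the pmap_page itself,
	 * just clear PP_EMBEDDED; there is no pve to hand back.
	 */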
1741 	if ((pp->pp_flags & PP_EMBEDDED) != 0) {
1742 		KASSERT(pp->pp_pte.pte_ptp == ptp);
1743 		KASSERT(pp->pp_pte.pte_va == va);
1744 
1745 		pp->pp_flags &= ~PP_EMBEDDED;
1746 		LIST_INIT(&pp->pp_head.pvh_list);
1747 
1748 		return NULL;
1749 	}
1750 
1751 	hash = pvhash_hash(ptp, va);
1752 	lock = pvhash_lock(hash);
1753 	hh = pvhash_head(hash);
1754 	mutex_spin_enter(lock);
1755 	pve = pvhash_remove(hh, ptp, va);
1756 	mutex_spin_exit(lock);
1757 
1758 	LIST_REMOVE(pve, pve_list);
1759 
1760 	return pve;
1761 }
1762 
1763 /*
1764  * p t p   f u n c t i o n s
1765  */
1766 
1767 static inline struct vm_page *
1768 pmap_find_ptp(struct pmap *pmap, vaddr_t va, paddr_t pa, int level)
1769 {
1770 	int lidx = level - 1;
1771 	struct vm_page *pg;
1772 
1773 	KASSERT(mutex_owned(&pmap->pm_lock));
1774 
1775 	if (pa != (paddr_t)-1 && pmap->pm_ptphint[lidx] &&
1776 	    pa == VM_PAGE_TO_PHYS(pmap->pm_ptphint[lidx])) {
1777 		return (pmap->pm_ptphint[lidx]);
1778 	}
1779 	PMAP_SUBOBJ_LOCK(pmap, lidx);
1780 	pg = uvm_pagelookup(&pmap->pm_obj[lidx], ptp_va2o(va, level));
1781 	PMAP_SUBOBJ_UNLOCK(pmap, lidx);
1782 
1783 	KASSERT(pg == NULL || pg->wire_count >= 1);
1784 	return pg;
1785 }
1786 
1787 static inline void
1788 pmap_freepage(struct pmap *pmap, struct vm_page *ptp, int level)
1789 {
1790 	int lidx;
1791 	struct uvm_object *obj;
1792 
1793 	KASSERT(ptp->wire_count == 1);
1794 
1795 	lidx = level - 1;
1796 
1797 	obj = &pmap->pm_obj[lidx];
1798 	pmap_stats_update(pmap, -1, 0);
1799 	if (lidx != 0)
1800 		mutex_enter(&obj->vmobjlock);
1801 	if (pmap->pm_ptphint[lidx] == ptp)
1802 		pmap->pm_ptphint[lidx] = TAILQ_FIRST(&obj->memq);
1803 	ptp->wire_count = 0;
1804 	uvm_pagerealloc(ptp, NULL, 0);
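	/*
	 * defer the actual free: queue the page on the LWP's gc list so
	 * it is only released once TLB shootdowns referencing it have
	 * completed.
	 */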
1805 	VM_PAGE_TO_PP(ptp)->pp_link = curlwp->l_md.md_gc_ptp;
1806 	curlwp->l_md.md_gc_ptp = ptp;
1807 	if (lidx != 0)
1808 		mutex_exit(&obj->vmobjlock);
1809 }
1810 
1811 static void
1812 pmap_free_ptp(struct pmap *pmap, struct vm_page *ptp, vaddr_t va,
1813 	      pt_entry_t *ptes, pd_entry_t * const *pdes)
1814 {
1815 	unsigned long index;
1816 	int level;
1817 	vaddr_t invaladdr;
1818 #ifdef MULTIPROCESSOR
1819 	vaddr_t invaladdr2;
1820 #endif
1821 	pd_entry_t opde;
1822 	struct pmap *curpmap = vm_map_pmap(&curlwp->l_proc->p_vmspace->vm_map);
1823 
1824 	KASSERT(pmap != pmap_kernel());
1825 	KASSERT(mutex_owned(&pmap->pm_lock));
1826 	KASSERT(kpreempt_disabled());
1827 
1828 	level = 1;
1829 	do {
1830 		index = pl_i(va, level + 1);
1831 		opde = pmap_pte_testset(&pdes[level - 1][index], 0);
1832 #if defined(XEN) && defined(__x86_64__)
1833 		/*
1834 		 * If ptp is a L3 currently mapped in kernel space,
1835 		 * clear it before freeing
1836 		 */
1837 		if (pmap->pm_pdirpa == xen_current_user_pgd
1838 		    && level == PTP_LEVELS - 1)
1839 			pmap_pte_set(&pmap_kernel()->pm_pdir[index], 0);
1840 #endif /* XEN && __x86_64__ */
1841 		pmap_freepage(pmap, ptp, level);
1842 		invaladdr = level == 1 ? (vaddr_t)ptes :
1843 		    (vaddr_t)pdes[level - 2];
1844 		pmap_tlb_shootdown(curpmap, invaladdr + index * PAGE_SIZE,
1845 		    0, opde);
1846 #if defined(MULTIPROCESSOR)
1847 		invaladdr2 = level == 1 ? (vaddr_t)PTE_BASE :
1848 		    (vaddr_t)normal_pdes[level - 2];
1849 		if (pmap != curpmap || invaladdr != invaladdr2) {
1850 			pmap_tlb_shootdown(pmap, invaladdr2 + index * PAGE_SIZE,
1851 			    0, opde);
1852 		}
1853 #endif
1854 		if (level < PTP_LEVELS - 1) {
1855 			ptp = pmap_find_ptp(pmap, va, (paddr_t)-1, level + 1);
1856 			ptp->wire_count--;
1857 			if (ptp->wire_count > 1)
1858 				break;
1859 		}
1860 	} while (++level < PTP_LEVELS);
1861 	pmap_pte_flush();
1862 }
1863 
1864 /*
1865  * pmap_get_ptp: get a PTP (if there isn't one, allocate a new one)
1866  *
1867  * => pmap should NOT be pmap_kernel()
1868  * => pmap should be locked
1869  * => preemption should be disabled
1870  */
1871 
1872 static struct vm_page *
1873 pmap_get_ptp(struct pmap *pmap, vaddr_t va, pd_entry_t * const *pdes)
1874 {
1875 	struct vm_page *ptp, *pptp;
1876 	int i;
1877 	unsigned long index;
1878 	pd_entry_t *pva;
1879 	paddr_t ppa, pa;
1880 	struct uvm_object *obj;
1881 
1882 	KASSERT(pmap != pmap_kernel());
1883 	KASSERT(mutex_owned(&pmap->pm_lock));
1884 	KASSERT(kpreempt_disabled());
1885 
1886 	ptp = NULL;
1887 	pa = (paddr_t)-1;
1888 
1889 	/*
1890 	 * Loop through all page table levels seeing if we need to
1891 	 * add a new page to that level.
1892 	 */
1893 	for (i = PTP_LEVELS; i > 1; i--) {
1894 		/*
1895 		 * Save values from previous round.
1896 		 */
1897 		pptp = ptp;
1898 		ppa = pa;
1899 
1900 		index = pl_i(va, i);
1901 		pva = pdes[i - 2];
1902 
1903 		if (pmap_valid_entry(pva[index])) {
1904 			ppa = pmap_pte2pa(pva[index]);
1905 			ptp = NULL;
1906 			continue;
1907 		}
1908 
1909 		obj = &pmap->pm_obj[i-2];
1910 		PMAP_SUBOBJ_LOCK(pmap, i - 2);
1911 		ptp = uvm_pagealloc(obj, ptp_va2o(va, i - 1), NULL,
1912 		    UVM_PGA_USERESERVE|UVM_PGA_ZERO);
1913 		PMAP_SUBOBJ_UNLOCK(pmap, i - 2);
1914 
1915 		if (ptp == NULL)
1916 			return NULL;
1917 
1918 		ptp->flags &= ~PG_BUSY; /* never busy */
1919 		ptp->wire_count = 1;
1920 		pmap->pm_ptphint[i - 2] = ptp;
1921 		pa = VM_PAGE_TO_PHYS(ptp);
1922 		pmap_pte_set(&pva[index], (pd_entry_t)
1923 		        (pmap_pa2pte(pa) | PG_u | PG_RW | PG_V));
1924 #if defined(XEN) && defined(__x86_64__)
1925 		/*
1926 		 * In Xen we must enter the mapping in kernel map too
1927 		 * if pmap is curmap and modifying top level (PGD)
1928 		 */
1929 		if (i == PTP_LEVELS && pmap != pmap_kernel()) {
1930 			pmap_pte_set(&pmap_kernel()->pm_pdir[index],
1931 			    (pd_entry_t) (pmap_pa2pte(pa)
1932 			    | PG_u | PG_RW | PG_V));
1933 		}
1934 #endif /* XEN && __x86_64__ */
1935 		pmap_pte_flush();
1936 		pmap_stats_update(pmap, 1, 0);
1937 		/*
1938 		 * If we're not in the top level, increase the
1939 		 * wire count of the parent page.
1940 		 */
1941 		if (i < PTP_LEVELS) {
1942 			if (pptp == NULL)
1943 				pptp = pmap_find_ptp(pmap, va, ppa, i);
1944 #ifdef DIAGNOSTIC
1945 			if (pptp == NULL)
1946 				panic("pde page disappeared");
1947 #endif
1948 			pptp->wire_count++;
1949 		}
1950 	}
1951 
1952 	/*
1953 	 * ptp is not NULL if we just allocated a new ptp. If it's
1954 	 * still NULL, we must look up the existing one.
1955 	 */
1956 	if (ptp == NULL) {
1957 		ptp = pmap_find_ptp(pmap, va, ppa, 1);
1958 #ifdef DIAGNOSTIC
1959 		if (ptp == NULL) {
1960 			printf("va %lx ppa %lx\n", (unsigned long)va,
1961 			    (unsigned long)ppa);
1962 			panic("pmap_get_ptp: unmanaged user PTP");
1963 		}
1964 #endif
1965 	}
1966 
1967 	pmap->pm_ptphint[0] = ptp;
1968 	return(ptp);
1969 }
1970 
1971 /*
1972  * p m a p  l i f e c y c l e   f u n c t i o n s
1973  */
1974 
1975 /*
1976  * pmap_pdp_ctor: constructor for the PDP cache.
1977  */
1978 
1979 int
1980 pmap_pdp_ctor(void *arg, void *v, int flags)
1981 {
1982 	pd_entry_t *pdir = v;
1983 	paddr_t pdirpa = 0;	/* XXX: GCC */
1984 	vaddr_t object;
1985 	int i;
1986 
1987 #if !defined(XEN) || !defined(__x86_64__)
1988 	int npde;
1989 #endif
1990 #ifdef XEN
1991 	int s;
1992 #endif
1993 
1994 	/*
1995 	 * NOTE: The `pmap_lock' is held when the PDP is allocated.
1996 	 */
1997 
1998 #if defined(XEN) && defined(__x86_64__)
1999 	/* fetch the physical address of the page directory. */
2000 	(void) pmap_extract(pmap_kernel(), (vaddr_t) pdir, &pdirpa);
2001 
2002 	/* zero init area */
2003 	memset(pdir, 0, PAGE_SIZE); /* Xen wants a clean page */
2004 	/*
2005 	 * this pdir will NEVER be active in kernel mode
2006 	 * so mark recursive entry invalid
2007 	 */
2008 	pdir[PDIR_SLOT_PTE] = pmap_pa2pte(pdirpa) | PG_u;
2009 	/*
2010 	 * PDP constructed this way won't be for kernel,
2011 	 * hence we don't put kernel mappings on Xen.
2012 	 * But we need to make pmap_create() happy, so put a dummy (without
2013 	 * PG_V) value at the right place.
2014 	 */
2015 	pdir[PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1] - 1] =
2016 	     (unsigned long)-1 & PG_FRAME;
2017 #else /* XEN  && __x86_64__*/
2018 	/* zero init area */
2019 	memset(pdir, 0, PDIR_SLOT_PTE * sizeof(pd_entry_t));
2020 
2021 	object = (vaddr_t)v;
2022 	for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) {
2023 		/* fetch the physical address of the page directory. */
2024 		(void) pmap_extract(pmap_kernel(), object, &pdirpa);
2025 		/* put in recursive PDE to map the PTEs */
2026 		pdir[PDIR_SLOT_PTE + i] = pmap_pa2pte(pdirpa) | PG_V;
2027 #ifndef XEN
2028 		pdir[PDIR_SLOT_PTE + i] |= PG_KW;
2029 #endif
2030 	}
2031 
2032 	/* copy kernel's PDE */
2033 	npde = nkptp[PTP_LEVELS - 1];
2034 
2035 	memcpy(&pdir[PDIR_SLOT_KERN], &PDP_BASE[PDIR_SLOT_KERN],
2036 	    npde * sizeof(pd_entry_t));
2037 
2038 	/* zero the rest */
2039 	memset(&pdir[PDIR_SLOT_KERN + npde], 0,
2040 	    (NTOPLEVEL_PDES - (PDIR_SLOT_KERN + npde)) * sizeof(pd_entry_t));
2041 
2042 	if (VM_MIN_KERNEL_ADDRESS != KERNBASE) {
2043 		int idx = pl_i(KERNBASE, PTP_LEVELS);
2044 
2045 		pdir[idx] = PDP_BASE[idx];
2046 	}
2047 #endif /* XEN  && __x86_64__*/
2048 #ifdef XEN
2049 	s = splvm();
2050 	object = (vaddr_t)v;
2051 	for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) {
2052 		(void) pmap_extract(pmap_kernel(), object, &pdirpa);
2053 		/* remap this page RO */
2054 		pmap_kenter_pa(object, pdirpa, VM_PROT_READ);
2055 		pmap_update(pmap_kernel());
2056 		/*
2057 		 * pin as L2/L4 page, we have to do the page with the
2058 		 * PDIR_SLOT_PTE entries last
2059 		 */
2060 #ifdef PAE
2061 		if (i == l2tol3(PDIR_SLOT_PTE))
2062 			continue;
2063 #endif
2064 		xpq_queue_pin_table(xpmap_ptom_masked(pdirpa));
2065 	}
2066 #ifdef PAE
2067 	object = ((vaddr_t)pdir) + PAGE_SIZE  * l2tol3(PDIR_SLOT_PTE);
2068 	(void)pmap_extract(pmap_kernel(), object, &pdirpa);
2069 	xpq_queue_pin_table(xpmap_ptom_masked(pdirpa));
2070 #endif
2071 	xpq_flush_queue();
2072 	splx(s);
2073 #endif /* XEN */
2074 
2075 	return (0);
2076 }
2077 
2078 /*
2079  * pmap_pdp_dtor: destructor for the PDP cache.
2080  */
2081 
2082 void
2083 pmap_pdp_dtor(void *arg, void *v)
2084 {
2085 #ifdef XEN
2086 	paddr_t pdirpa = 0;	/* XXX: GCC */
2087 	vaddr_t object = (vaddr_t)v;
2088 	int i;
2089 	int s = splvm();
2090 	pt_entry_t *pte;
2091 
2092 	for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) {
2093 		/* fetch the physical address of the page directory. */
2094 		(void) pmap_extract(pmap_kernel(), object, &pdirpa);
2095 		/* unpin page table */
2096 		xpq_queue_unpin_table(xpmap_ptom_masked(pdirpa));
2097 	}
2098 	object = (vaddr_t)v;
2099 	for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) {
2100 		/* Set page RW again */
2101 		pte = kvtopte(object);
2102 		xpq_queue_pte_update(xpmap_ptetomach(pte), *pte | PG_RW);
2103 		xpq_queue_invlpg((vaddr_t)object);
2104 	}
2105 	xpq_flush_queue();
2106 	splx(s);
2107 #endif  /* XEN */
2108 }
2109 
2110 #ifdef PAE
2111 
2112 /* pmap_pdp_alloc: Allocate a page for the pdp memory pool. */
2113 
2114 void *
2115 pmap_pdp_alloc(struct pool *pp, int flags)
2116 {
2117 	return (void *)uvm_km_alloc(kernel_map,
2118 	    PAGE_SIZE * PDP_SIZE, PAGE_SIZE * PDP_SIZE,
2119 	    ((flags & PR_WAITOK) ? 0 : UVM_KMF_NOWAIT | UVM_KMF_TRYLOCK)
2120 	    | UVM_KMF_WIRED);
2121 }
2122 
2123 /*
2124  * pmap_pdp_free: free a PDP
2125  */
2126 
2127 void
2128 pmap_pdp_free(struct pool *pp, void *v)
2129 {
2130 	uvm_km_free(kernel_map, (vaddr_t)v, PAGE_SIZE * PDP_SIZE,
2131 	    UVM_KMF_WIRED);
2132 }
2133 #endif /* PAE */
2134 
2135 /*
2136  * pmap_create: create a pmap
2137  *
2138  * => note: old pmap interface took a "size" arg which allowed for
2139  *	the creation of "software only" pmaps (not in bsd).
2140  */
2141 
2142 struct pmap *
2143 pmap_create(void)
2144 {
2145 	struct pmap *pmap;
2146 	int i;
2147 
2148 	pmap = pool_cache_get(&pmap_cache, PR_WAITOK);
2149 
2150 	/* init uvm_object */
2151 	for (i = 0; i < PTP_LEVELS - 1; i++) {
2152 		UVM_OBJ_INIT(&pmap->pm_obj[i], NULL, 1);
2153 		pmap->pm_ptphint[i] = NULL;
2154 	}
2155 	pmap->pm_stats.wired_count = 0;
2156 	pmap->pm_stats.resident_count = 1;	/* count the PDP allocd below */
2157 #if !defined(__x86_64__)
2158 	pmap->pm_hiexec = 0;
2159 #endif /* !defined(__x86_64__) */
2160 	pmap->pm_flags = 0;
2161 	pmap->pm_cpus = 0;
2162 	pmap->pm_kernel_cpus = 0;
2163 
2164 	/* init the LDT */
2165 	pmap->pm_ldt = NULL;
2166 	pmap->pm_ldt_len = 0;
2167 	pmap->pm_ldt_sel = GSYSSEL(GLDT_SEL, SEL_KPL);
2168 
2169 	/* allocate PDP */
2170  try_again:
2171 	pmap->pm_pdir = pool_cache_get(&pmap_pdp_cache, PR_WAITOK);
2172 
2173 	mutex_enter(&pmaps_lock);
2174 
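	/*
	 * a cached PDP may have been constructed before the kernel grew;
	 * if its last expected kernel PDE slot is empty it is stale, so
	 * destruct it and get a freshly constructed one.
	 */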
2175 	if (pmap->pm_pdir[PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1] - 1] == 0) {
2176 		mutex_exit(&pmaps_lock);
2177 		pool_cache_destruct_object(&pmap_pdp_cache, pmap->pm_pdir);
2178 		goto try_again;
2179 	}
2180 
2181 #ifdef PAE
2182 	for (i = 0; i < PDP_SIZE; i++)
2183 		pmap->pm_pdirpa[i] =
2184 		    pmap_pte2pa(pmap->pm_pdir[PDIR_SLOT_PTE + i]);
2185 #else
2186 	pmap->pm_pdirpa = pmap_pte2pa(pmap->pm_pdir[PDIR_SLOT_PTE]);
2187 #endif
2188 
2189 	LIST_INSERT_HEAD(&pmaps, pmap, pm_list);
2190 
2191 	mutex_exit(&pmaps_lock);
2192 
2193 	return (pmap);
2194 }
2195 
2196 /*
2197  * pmap_destroy: drop reference count on pmap.   free pmap if
2198  *	reference count goes to zero.
2199  */
2200 
2201 void
2202 pmap_destroy(struct pmap *pmap)
2203 {
2204 	int i;
2205 #ifdef DIAGNOSTIC
2206 	struct cpu_info *ci;
2207 	CPU_INFO_ITERATOR cii;
2208 #endif /* DIAGNOSTIC */
2209 
2210 	/*
2211 	 * if we have torn down this pmap, process deferred frees and
2212 	 * invalidations now.
2213 	 */
2214 	if (__predict_false(curlwp->l_md.md_gc_pmap == pmap)) {
2215 		pmap_update(pmap);
2216 	}
2217 
2218 	/*
2219 	 * drop reference count
2220 	 */
2221 
2222 	if (atomic_dec_uint_nv((unsigned *)&pmap->pm_obj[0].uo_refs) > 0) {
2223 		return;
2224 	}
2225 
2226 #ifdef DIAGNOSTIC
2227 	for (CPU_INFO_FOREACH(cii, ci))
2228 		if (ci->ci_pmap == pmap)
2229 			panic("destroying pmap being used");
2230 #endif /* DIAGNOSTIC */
2231 
2232 	/*
2233 	 * reference count is zero, free pmap resources and then free pmap.
2234 	 */
2235 #ifdef XEN
2236 	/*
2237 	 * Xen lazy APDP handling:
2238 	 * clear APDP_PDE if pmap is the currently mapped
2239 	 */
2240 	if (xpmap_ptom_masked(pmap_pdirpa(pmap, 0)) == (*APDP_PDE & PG_FRAME)) {
2241 		kpreempt_disable();
2242 		for (i = 0; i < PDP_SIZE; i++) {
2243 			pmap_pte_set(&APDP_PDE[i], 0);
2244 #ifdef PAE
2245 			/* clear shadow entry too */
2246 			pmap_pte_set(&APDP_PDE_SHADOW[i], 0);
2247 #endif
2248 		}
2249 		pmap_pte_flush();
2250 		pmap_apte_flush(pmap_kernel());
2251 		kpreempt_enable();
2252 	}
2253 #endif
2254 
2255 	/*
2256 	 * remove it from global list of pmaps
2257 	 */
2258 
2259 	mutex_enter(&pmaps_lock);
2260 	LIST_REMOVE(pmap, pm_list);
2261 	mutex_exit(&pmaps_lock);
2262 
2263 	/*
2264 	 * destroyed pmap shouldn't have remaining PTPs
2265 	 */
2266 
2267 	for (i = 0; i < PTP_LEVELS - 1; i++) {
2268 		KASSERT(pmap->pm_obj[i].uo_npages == 0);
2269 		KASSERT(TAILQ_EMPTY(&pmap->pm_obj[i].memq));
2270 	}
2271 
2272 	/*
2273 	 * MULTIPROCESSOR -- no need to flush out of other processors'
2274 	 * APTE space because we do that in pmap_unmap_ptes().
2275 	 */
2276 	pool_cache_put(&pmap_pdp_cache, pmap->pm_pdir);
2277 
2278 #ifdef USER_LDT
2279 	if (pmap->pm_flags & PMF_USER_LDT) {
2280 		/*
2281 		 * no need to switch the LDT; this address space is gone,
2282 		 * nothing is using it.
2283 		 *
2284 		 * No need to lock the pmap for ldt_free (or anything else),
2285 		 * we're the last one to use it.
2286 		 */
2287 		ldt_free(pmap->pm_ldt_sel);
2288 		uvm_km_free(kernel_map, (vaddr_t)pmap->pm_ldt,
2289 		    pmap->pm_ldt_len * sizeof(union descriptor), UVM_KMF_WIRED);
2290 	}
2291 #endif
2292 
2293 	for (i = 0; i < PTP_LEVELS - 1; i++)
2294 		mutex_destroy(&pmap->pm_obj[i].vmobjlock);
2295 	pool_cache_put(&pmap_cache, pmap);
2296 }
2297 
2298 /*
2299  * pmap_remove_all: pmap is being torn down by the current thread.
2300  * avoid unnecessary invalidations.
2301  */
2302 
2303 void
2304 pmap_remove_all(struct pmap *pmap)
2305 {
2306 	lwp_t *l = curlwp;
2307 
2308 	KASSERT(l->l_md.md_gc_pmap == NULL);
2309 
2310 	l->l_md.md_gc_pmap = pmap;
2311 }
2312 
2313 #if defined(PMAP_FORK)
2314 /*
2315  * pmap_fork: perform any necessary data structure manipulation when
2316  * a VM space is forked.
2317  */
2318 
2319 void
2320 pmap_fork(struct pmap *pmap1, struct pmap *pmap2)
2321 {
2322 #ifdef USER_LDT
2323 	union descriptor *new_ldt;
2324 	size_t len;
2325 	int sel;
2326 
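	/*
	 * allocate the new LDT before taking the pmap locks, since the
	 * allocation may sleep; if the source LDT changes size in the
	 * meantime we free it and retry.
	 */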
2327  retry:
2328 	if (pmap1->pm_flags & PMF_USER_LDT) {
2329 		len = pmap1->pm_ldt_len * sizeof(union descriptor);
2330 		new_ldt = (union descriptor *)uvm_km_alloc(kernel_map,
2331 		    len, 0, UVM_KMF_WIRED);
2332 		sel = ldt_alloc(new_ldt, len);
2333 	} else {
2334 		len = -1;
2335 		new_ldt = NULL;
2336 		sel = -1;
2337 	}
2338 
2339 	if ((uintptr_t) pmap1 < (uintptr_t) pmap2) {
2340 		mutex_enter(&pmap1->pm_lock);
2341 		mutex_enter(&pmap2->pm_lock);
2342 	} else {
2343 		mutex_enter(&pmap2->pm_lock);
2344 		mutex_enter(&pmap1->pm_lock);
2345 	}
2346 
2347 	/* Copy the LDT, if necessary. */
2348 	if (pmap1->pm_flags & PMF_USER_LDT) {
2349 		if (len != pmap1->pm_ldt_len * sizeof(union descriptor)) {
2350 			mutex_exit(&pmap2->pm_lock);
2351 			mutex_exit(&pmap1->pm_lock);
2352 			if (len != -1) {
2353 				ldt_free(sel);
2354 				uvm_km_free(kernel_map, (vaddr_t)new_ldt,
2355 				    len, UVM_KMF_WIRED);
2356 			}
2357 			goto retry;
2358 		}
2359 
2360 		memcpy(new_ldt, pmap1->pm_ldt, len);
2361 		pmap2->pm_ldt = new_ldt;
2362 		pmap2->pm_ldt_len = pmap1->pm_ldt_len;
2363 		pmap2->pm_flags |= PMF_USER_LDT;
2364 		pmap2->pm_ldt_sel = sel;
2365 		len = -1;
2366 	}
2367 
2368 	mutex_exit(&pmap2->pm_lock);
2369 	mutex_exit(&pmap1->pm_lock);
2370 
2371 	if (len != -1) {
2372 		ldt_free(sel);
2373 		uvm_km_free(kernel_map, (vaddr_t)new_ldt, len,
2374 		    UVM_KMF_WIRED);
2375 	}
2376 #endif /* USER_LDT */
2377 }
2378 #endif /* PMAP_FORK */
2379 
2380 #ifdef USER_LDT
2381 /*
2382  * pmap_ldt_cleanup: if the pmap has a local LDT, deallocate it, and
2383  * restore the default.
2384  */
2385 
2386 void
2387 pmap_ldt_cleanup(struct lwp *l)
2388 {
2389 	struct pcb *pcb = &l->l_addr->u_pcb;
2390 	pmap_t pmap = l->l_proc->p_vmspace->vm_map.pmap;
2391 	union descriptor *old_ldt = NULL;
2392 	size_t len = 0;
2393 	int sel = -1;
2394 
2395 	mutex_enter(&pmap->pm_lock);
2396 	kpreempt_disable();
2397 
2398 	if (pmap->pm_flags & PMF_USER_LDT) {
2399 		sel = pmap->pm_ldt_sel;
2400 		pmap->pm_ldt_sel = GSYSSEL(GLDT_SEL, SEL_KPL);
2401 		pcb->pcb_ldt_sel = pmap->pm_ldt_sel;
2402 		if (l == curlwp)
2403 			lldt(pcb->pcb_ldt_sel);
2404 		old_ldt = pmap->pm_ldt;
2405 		len = pmap->pm_ldt_len * sizeof(union descriptor);
2406 		pmap->pm_ldt = NULL;
2407 		pmap->pm_ldt_len = 0;
2408 		pmap->pm_flags &= ~PMF_USER_LDT;
2409 	}
2410 
2411 	kpreempt_enable();
2412 	mutex_exit(&pmap->pm_lock);
2413 
2414 	if (sel != -1)
2415 		ldt_free(sel);
2416 	if (old_ldt != NULL)
2417 		uvm_km_free(kernel_map, (vaddr_t)old_ldt, len, UVM_KMF_WIRED);
2418 }
2419 #endif /* USER_LDT */
2420 
2421 /*
2422  * pmap_activate: activate a process' pmap
2423  *
2424  * => must be called with kernel preemption disabled
2425  * => if lwp is the curlwp, then set ci_want_pmapload so that
2426  *    actual MMU context switch will be done by pmap_load() later
2427  */
2428 
2429 void
2430 pmap_activate(struct lwp *l)
2431 {
2432 	struct cpu_info *ci;
2433 	struct pmap *pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map);
2434 
2435 	KASSERT(kpreempt_disabled());
2436 
2437 	ci = curcpu();
2438 
2439 	if (l == ci->ci_curlwp) {
2440 		struct pcb *pcb;
2441 
2442 		KASSERT(ci->ci_want_pmapload == 0);
2443 		KASSERT(ci->ci_tlbstate != TLBSTATE_VALID);
2444 #ifdef KSTACK_CHECK_DR0
2445 		/*
2446 		 * setup breakpoint on the top of stack
2447 		 */
2448 		if (l == &lwp0)
2449 			dr0(0, 0, 0, 0);
2450 		else
2451 			dr0(KSTACK_LOWEST_ADDR(l), 1, 3, 1);
2452 #endif
2453 
2454 		/*
2455 		 * no need to switch to kernel vmspace because
2456 		 * it's a subset of any vmspace.
2457 		 */
2458 
2459 		if (pmap == pmap_kernel()) {
2460 			ci->ci_want_pmapload = 0;
2461 			return;
2462 		}
2463 
2464 		pcb = &l->l_addr->u_pcb;
2465 		pcb->pcb_ldt_sel = pmap->pm_ldt_sel;
2466 
2467 		ci->ci_want_pmapload = 1;
2468 
2469 #if defined(__x86_64__)
2470 		if (pcb->pcb_flags & PCB_GS64)
2471 			wrmsr(MSR_KERNELGSBASE, pcb->pcb_gs);
2472 		if (pcb->pcb_flags & PCB_FS64)
2473 			wrmsr(MSR_FSBASE, pcb->pcb_fs);
2474 #endif /* defined(__x86_64__) */
2475 	}
2476 }
2477 
2478 /*
2479  * pmap_reactivate: try to regain reference to the pmap.
2480  *
2481  * => must be called with kernel preemption disabled
2482  */
2483 
2484 static bool
2485 pmap_reactivate(struct pmap *pmap)
2486 {
2487 	struct cpu_info *ci;
2488 	uint32_t cpumask;
2489 	bool result;
2490 	uint32_t oldcpus;
2491 
2492 	ci = curcpu();
2493 	cpumask = ci->ci_cpumask;
2494 
2495 	KASSERT(kpreempt_disabled());
2496 #if defined(XEN) && defined(__x86_64__)
2497 	KASSERT(pmap->pm_pdirpa == xen_current_user_pgd);
2498 #elif defined(PAE)
2499 	KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(pmap_l3pd[0]));
2500 #elif !defined(XEN) || (defined(XEN) && defined(XEN3))
2501 	KASSERT(pmap->pm_pdirpa == pmap_pte2pa(rcr3()));
2502 #endif
2503 
2504 	/*
2505 	 * if we still have a lazy reference to this pmap,
2506 	 * we can assume that there was no tlb shootdown
2507 	 * for this pmap in the meantime.
2508 	 *
2509 	 * the order of events here is important as we must
2510 	 * synchronize with TLB shootdown interrupts.  declare
2511 	 * interest in invalidations (TLBSTATE_VALID) and then
2512 	 * check the cpumask, which the IPIs can change only
2513 	 * when the state is TLBSTATE_LAZY.
2514 	 */
2515 
2516 	ci->ci_tlbstate = TLBSTATE_VALID;
2517 	oldcpus = pmap->pm_cpus;
2518 	KASSERT((pmap->pm_kernel_cpus & cpumask) != 0);
2519 	if (oldcpus & cpumask) {
2520 		/* got it */
2521 		result = true;
2522 	} else {
2523 		/* must reload */
2524 		atomic_or_32(&pmap->pm_cpus, cpumask);
2525 		result = false;
2526 	}
2527 
2528 	return result;
2529 }
2530 
2531 /*
2532  * pmap_load: actually switch pmap.  (fill in %cr3 and LDT info)
2533  */
2534 
2535 void
2536 pmap_load(void)
2537 {
2538 	struct cpu_info *ci;
2539 	uint32_t cpumask;
2540 	struct pmap *pmap;
2541 	struct pmap *oldpmap;
2542 	struct lwp *l;
2543 	struct pcb *pcb;
2544 	uint64_t ncsw;
2545 
2546 	kpreempt_disable();
2547  retry:
2548 	ci = curcpu();
2549 	if (!ci->ci_want_pmapload) {
2550 		kpreempt_enable();
2551 		return;
2552 	}
2553 	cpumask = ci->ci_cpumask;
2554 	l = ci->ci_curlwp;
2555 	ncsw = l->l_ncsw;
2556 
2557 	/* should be able to take ipis. */
2558 	KASSERT(ci->ci_ilevel < IPL_IPI);
2559 #ifdef XEN
2560 	/* XXX not yet KASSERT(x86_read_psl() != 0); */
2561 #else
2562 	KASSERT((x86_read_psl() & PSL_I) != 0);
2563 #endif
2564 
2565 	KASSERT(l != NULL);
2566 	pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map);
2567 	KASSERT(pmap != pmap_kernel());
2568 	oldpmap = ci->ci_pmap;
2569 
2570 	pcb = &l->l_addr->u_pcb;
2571 	/* loaded by pmap_activate */
2572 	KASSERT(pcb->pcb_ldt_sel == pmap->pm_ldt_sel);
2573 
2574 	if (pmap == oldpmap) {
2575 		if (!pmap_reactivate(pmap)) {
2576 
2577 			/*
2578 			 * the pmap has been changed while it was
2579 			 * deactivated.  our tlb may be stale.
2580 			 */
2581 
2582 			tlbflush();
2583 		}
2584 
2585 		ci->ci_want_pmapload = 0;
2586 		kpreempt_enable();
2587 		return;
2588 	}
2589 
2590 	/*
2591 	 * grab a reference to the new pmap.
2592 	 */
2593 
2594 	pmap_reference(pmap);
2595 
2596 	/*
2597 	 * actually switch pmap.
2598 	 */
2599 
2600 	atomic_and_32(&oldpmap->pm_cpus, ~cpumask);
2601 	atomic_and_32(&oldpmap->pm_kernel_cpus, ~cpumask);
2602 
2603 #if defined(XEN) && defined(__x86_64__)
2604 	KASSERT(oldpmap->pm_pdirpa == xen_current_user_pgd ||
2605 	    oldpmap == pmap_kernel());
2606 #elif defined(PAE)
2607 	KASSERT(pmap_pdirpa(oldpmap, 0) == pmap_pte2pa(pmap_l3pd[0]));
2608 #elif !defined(XEN) || (defined(XEN) && defined(XEN3))
2609 	KASSERT(oldpmap->pm_pdirpa == pmap_pte2pa(rcr3()));
2610 #endif
2611 	KASSERT((pmap->pm_cpus & cpumask) == 0);
2612 	KASSERT((pmap->pm_kernel_cpus & cpumask) == 0);
2613 
2614 	/*
2615 	 * mark the pmap in use by this processor.  again we must
2616 	 * synchronize with TLB shootdown interrupts, so set the
2617 	 * state VALID first, then register us for shootdown events
2618 	 * on this pmap.
2619 	 */
2620 
2621 	ci->ci_tlbstate = TLBSTATE_VALID;
2622 	atomic_or_32(&pmap->pm_cpus, cpumask);
2623 	atomic_or_32(&pmap->pm_kernel_cpus, cpumask);
2624 	ci->ci_pmap = pmap;
2625 
2626 	/*
2627 	 * update tss.  now that we have registered for invalidations
2628 	 * from other CPUs, we're good to load the page tables.
2629 	 */
2630 #ifdef PAE
2631 	pcb->pcb_cr3 = pmap_l3paddr;
2632 #else
2633 	pcb->pcb_cr3 = pmap->pm_pdirpa;
2634 #endif
2635 #if defined(XEN) && defined(__x86_64__)
2636 	/* kernel pmap always in cr3 and should never go in user cr3 */
2637 	if (pmap_pdirpa(pmap, 0) != pmap_pdirpa(pmap_kernel(), 0)) {
2638 		/*
2639 		 * Map user space address in kernel space and load
2640 		 * user cr3
2641 		 */
2642 		int i, s;
2643 		pd_entry_t *old_pgd, *new_pgd;
2644 		paddr_t addr;
2645 		s = splvm();
2646 		new_pgd  = pmap->pm_pdir;
2647 		old_pgd = pmap_kernel()->pm_pdir;
2648 		addr = xpmap_ptom(pmap_pdirpa(pmap_kernel(), 0));
2649 		for (i = 0; i < PDIR_SLOT_PTE;
2650 		    i++, addr += sizeof(pd_entry_t)) {
2651 			if ((new_pgd[i] & PG_V) || (old_pgd[i] & PG_V))
2652 				xpq_queue_pte_update(addr, new_pgd[i]);
2653 		}
2654 		xpq_flush_queue(); /* XXXtlb */
2655 		tlbflush();
2656 		xen_set_user_pgd(pmap_pdirpa(pmap, 0));
2657 		xen_current_user_pgd = pmap_pdirpa(pmap, 0);
2658 		splx(s);
2659 	}
2660 #else /* XEN && x86_64 */
2661 #if defined(XEN)
2662 	/*
2663 	 * clear APDP slot, in case it points to a page table that has
2664 	 * been freed
2665 	 */
2666 	if (*APDP_PDE) {
2667 		int i;
2668 		for (i = 0; i < PDP_SIZE; i++) {
2669 			pmap_pte_set(&APDP_PDE[i], 0);
2670 #ifdef PAE
2671 			/* clear shadow entry too */
2672 			pmap_pte_set(&APDP_PDE_SHADOW[i], 0);
2673 #endif
2674 		}
2675 	}
2676 	/* lldt() does pmap_pte_flush() */
2677 #else /* XEN */
2678 #if defined(i386)
2679 	ci->ci_tss.tss_ldt = pcb->pcb_ldt_sel;
2680 	ci->ci_tss.tss_cr3 = pcb->pcb_cr3;
2681 #endif
2682 #endif /* XEN */
2683 	lldt(pcb->pcb_ldt_sel);
2684 #ifdef PAE
2685 	{
2686 	paddr_t l3_pd = xpmap_ptom_masked(pmap_l3paddr);
2687 	int i;
2688 	int s = splvm();
2689 	/* don't update the kernel L3 slot */
2690 	for (i = 0 ; i < PDP_SIZE - 1  ; i++, l3_pd += sizeof(pd_entry_t)) {
2691 		xpq_queue_pte_update(l3_pd,
2692 		    xpmap_ptom(pmap->pm_pdirpa[i]) | PG_V);
2693 	}
2694 	tlbflush();
2695 	xpq_flush_queue();
2696 	splx(s);
2697 	}
2698 #else /* PAE */
2699 	lcr3(pcb->pcb_cr3);
2700 #endif /* PAE */
2701 #endif /* XEN && x86_64 */
2702 
2703 	ci->ci_want_pmapload = 0;
2704 
2705 	/*
2706 	 * we're now running with the new pmap.  drop the reference
2707 	 * to the old pmap.  if we block, we need to go around again.
2708 	 */
2709 
2710 	pmap_destroy(oldpmap);
2711 	if (l->l_ncsw != ncsw) {
2712 		goto retry;
2713 	}
2714 
2715 	kpreempt_enable();
2716 }
2717 
2718 /*
2719  * pmap_deactivate: deactivate a process' pmap
2720  *
2721  * => must be called with kernel preemption disabled (high SPL is enough)
2722  */
2723 
2724 void
2725 pmap_deactivate(struct lwp *l)
2726 {
2727 	struct pmap *pmap;
2728 	struct cpu_info *ci;
2729 
2730 	KASSERT(kpreempt_disabled());
2731 
2732 	if (l != curlwp) {
2733 		return;
2734 	}
2735 
2736 	/*
2737 	 * wait for pending TLB shootdowns to complete.  necessary
2738 	 * because TLB shootdown state is per-CPU, and the LWP may
2739 	 * be coming off the CPU before it has a chance to call
2740 	 * pmap_update().
2741 	 */
2742 	pmap_tlb_shootwait();
2743 
2744 	ci = curcpu();
2745 
2746 	if (ci->ci_want_pmapload) {
2747 		KASSERT(vm_map_pmap(&l->l_proc->p_vmspace->vm_map)
2748 		    != pmap_kernel());
2749 		KASSERT(vm_map_pmap(&l->l_proc->p_vmspace->vm_map)
2750 		    != ci->ci_pmap || ci->ci_tlbstate != TLBSTATE_VALID);
2751 
2752 		/*
2753 		 * userspace has not been touched.
2754 		 * nothing to do here.
2755 		 */
2756 
2757 		ci->ci_want_pmapload = 0;
2758 		return;
2759 	}
2760 
2761 	pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map);
2762 
2763 	if (pmap == pmap_kernel()) {
2764 		return;
2765 	}
2766 
2767 #if defined(XEN) && defined(__x86_64__)
2768 	KASSERT(pmap->pm_pdirpa == xen_current_user_pgd);
2769 #elif defined(PAE)
2770 	KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(pmap_l3pd[0]));
2771 #elif !defined(XEN) || (defined(XEN) && defined(XEN3))
2772 	KASSERT(pmap->pm_pdirpa == pmap_pte2pa(rcr3()));
2773 #endif
2774 	KASSERT(ci->ci_pmap == pmap);
2775 
2776 	/*
2777 	 * we aren't interested in TLB invalidations for this pmap,
2778 	 * at least for the time being.
2779 	 */
2780 
2781 	KASSERT(ci->ci_tlbstate == TLBSTATE_VALID);
2782 	ci->ci_tlbstate = TLBSTATE_LAZY;
2783 }
2784 
2785 /*
2786  * end of lifecycle functions
2787  */
2788 
2789 /*
2790  * some misc. functions
2791  */
2792 
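/*
 * pmap_pdes_invalid: walk the PDEs mapping va from the top level down.
 * returns 0 if all levels are valid (optionally copying out the lowest
 * level PDE), otherwise the level at which the first invalid PDE was
 * found.
 */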
2793 static int
2794 pmap_pdes_invalid(vaddr_t va, pd_entry_t * const *pdes, pd_entry_t *lastpde)
2795 {
2796 	int i;
2797 	unsigned long index;
2798 	pd_entry_t pde;
2799 
2800 	for (i = PTP_LEVELS; i > 1; i--) {
2801 		index = pl_i(va, i);
2802 		pde = pdes[i - 2][index];
2803 		if ((pde & PG_V) == 0)
2804 			return i;
2805 	}
2806 	if (lastpde != NULL)
2807 		*lastpde = pde;
2808 	return 0;
2809 }
2810 
2811 /*
2812  * pmap_extract: extract a PA for the given VA
2813  */
2814 
2815 bool
2816 pmap_extract(struct pmap *pmap, vaddr_t va, paddr_t *pap)
2817 {
2818 	pt_entry_t *ptes, pte;
2819 	pd_entry_t pde;
2820 	pd_entry_t * const *pdes;
2821 	struct pmap *pmap2;
2822 	struct cpu_info *ci;
2823 	vaddr_t pa;
2824 	lwp_t *l;
2825 	bool hard, rv;
2826 
2827 	rv = false;
2828 	pa = 0;
2829 	l = curlwp;
2830 
2831 	KPREEMPT_DISABLE(l);
2832 	ci = l->l_cpu;
2833 	if (__predict_true(!ci->ci_want_pmapload && ci->ci_pmap == pmap) ||
2834 	    pmap == pmap_kernel()) {
2835 		/*
2836 		 * no need to lock, because it's pmap_kernel() or our
2837 		 * own pmap and is active.  if a user pmap, the caller
2838 		 * will hold the vm_map write/read locked and so prevent
2839 		 * entries from disappearing while we are here.  ptps
2840 		 * can disappear via pmap_remove(), pmap_protect() and
2841 		 * pmap_collect(), but they are called with the vm_map
2842 		 * write locked.
2843 		 */
2844 		hard = false;
2845 		ptes = PTE_BASE;
2846 		pdes = normal_pdes;
2847 	} else {
2848 		/* we lose, do it the hard way. */
2849 		hard = true;
2850 		pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
2851 	}
2852 	if (pmap_pdes_valid(va, pdes, &pde)) {
2853 		pte = ptes[pl1_i(va)];
2854 		if (pde & PG_PS) {
2855 			pa = (pde & PG_LGFRAME) | (va & (NBPD_L2 - 1));
2856 			rv = true;
2857 		} else if (__predict_true((pte & PG_V) != 0)) {
2858 			pa = pmap_pte2pa(pte) | (va & (NBPD_L1 - 1));
2859 			rv = true;
2860 		}
2861 	}
2862 	if (__predict_false(hard)) {
2863 		pmap_unmap_ptes(pmap, pmap2);
2864 	}
2865 	KPREEMPT_ENABLE(l);
2866 	if (pap != NULL) {
2867 		*pap = pa;
2868 	}
2869 	return rv;
2870 }
2871 
2872 
2873 /*
2874  * vtophys: virtual address to physical address.  For use by
2875  * machine-dependent code only.
2876  */
2877 
2878 paddr_t
2879 vtophys(vaddr_t va)
2880 {
2881 	paddr_t pa;
2882 
2883 	if (pmap_extract(pmap_kernel(), va, &pa))
2884 		return (pa);
2885 	return (0);
2886 }
2887 
2888 #ifdef XEN
2889 /*
2890  * pmap_extract_ma: extract a MA for the given VA
2891  */
2892 
2893 bool
2894 pmap_extract_ma(struct pmap *pmap, vaddr_t va, paddr_t *pap)
2898 {
2899 	pt_entry_t *ptes, pte;
2900 	pd_entry_t pde;
2901 	pd_entry_t * const *pdes;
2902 	struct pmap *pmap2;
2903 
2904 	kpreempt_disable();
2905 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
2906 	if (!pmap_pdes_valid(va, pdes, &pde)) {
2907 		pmap_unmap_ptes(pmap, pmap2);
2908 		kpreempt_enable();
2909 		return false;
2910 	}
2911 
2912 	pte = ptes[pl1_i(va)];
2913 	pmap_unmap_ptes(pmap, pmap2);
2914 	kpreempt_enable();
2915 
2916 	if (__predict_true((pte & PG_V) != 0)) {
2917 		if (pap != NULL)
2918 			*pap = (pte & PG_FRAME) | (va & (NBPD_L1 - 1));
2919 		return true;
2920 	}
2921 
2922 	return false;
2923 }
2924 
2925 /*
2926  * vtomach: virtual address to machine address.  For use by
2927  * machine-dependent code only.
2928  */
2929 
2930 paddr_t
2931 vtomach(vaddr_t va)
2932 {
2933 	paddr_t pa;
2934 
2935 	if (pmap_extract_ma(pmap_kernel(), va, &pa))
2936 		return (pa);
2937 	return (0);
2938 }
2939 
2940 #endif /* XEN */
2941 
2942 
2943 
2944 /*
2945  * pmap_virtual_space: used during bootup [pmap_steal_memory] to
2946  *	determine the bounds of the kernel virtual address space.
2947  */
2948 
2949 void
2950 pmap_virtual_space(vaddr_t *startp, vaddr_t *endp)
2951 {
2952 	*startp = virtual_avail;
2953 	*endp = virtual_end;
2954 }
2955 
2956 /*
2957  * pmap_map: map a range of PAs into kvm.
2958  *
2959  * => used during crash dump
2960  * => XXX: pmap_map() should be phased out?
2961  */
2962 
2963 vaddr_t
2964 pmap_map(vaddr_t va, paddr_t spa, paddr_t epa, vm_prot_t prot)
2965 {
2966 	while (spa < epa) {
2967 		pmap_kenter_pa(va, spa, prot);
2968 		va += PAGE_SIZE;
2969 		spa += PAGE_SIZE;
2970 	}
2971 	pmap_update(pmap_kernel());
2972 	return va;
2973 }
2974 
2975 /*
2976  * pmap_zero_page: zero a page
2977  */
2978 
2979 void
2980 pmap_zero_page(paddr_t pa)
2981 {
2982 	pt_entry_t *zpte;
2983 	void *zerova;
2984 	int id;
2985 
2986 	kpreempt_disable();
2987 	id = cpu_number();
2988 	zpte = PTESLEW(zero_pte, id);
2989 	zerova = VASLEW(zerop, id);
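	/*
	 * zpte/zerova name this CPU's slot in the temporary-mapping
	 * window used for zeroing pages.
	 */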
2990 
2991 #ifdef DIAGNOSTIC
2992 	if (*zpte)
2993 		panic("pmap_zero_page: lock botch");
2994 #endif
2995 
2996 	pmap_pte_set(zpte, pmap_pa2pte(pa) | PG_V | PG_RW | PG_M | PG_U | PG_k);
2997 	pmap_pte_flush();
2998 	pmap_update_pg((vaddr_t)zerova);		/* flush TLB */
2999 
3000 	memset(zerova, 0, PAGE_SIZE);
3001 
3002 #if defined(DIAGNOSTIC) || defined(XEN)
3003 	pmap_pte_set(zpte, 0);				/* zap ! */
3004 	pmap_pte_flush();
3005 #endif
3006 	kpreempt_enable();
3007 }
3008 
3009 /*
3010  * pmap_pageidlezero: the same, for the idle loop page zero'er.
3011  * Returns true if the page was zero'd, false if we aborted for
3012  * some reason.
3013  */
3014 
3015 bool
3016 pmap_pageidlezero(paddr_t pa)
3017 {
3018 	pt_entry_t *zpte;
3019 	void *zerova;
3020 	bool rv;
3021 	int id;
3022 
3023 	id = cpu_number();
3024 	zpte = PTESLEW(zero_pte, id);
3025 	zerova = VASLEW(zerop, id);
3026 
3027 	KASSERT(cpu_feature & CPUID_SSE2);
3028 	KASSERT(*zpte == 0);
3029 
3030 	pmap_pte_set(zpte, pmap_pa2pte(pa) | PG_V | PG_RW | PG_M | PG_U | PG_k);
3031 	pmap_pte_flush();
3032 	pmap_update_pg((vaddr_t)zerova);		/* flush TLB */
3033 
3034 	rv = sse2_idlezero_page(zerova);
3035 
3036 #if defined(DIAGNOSTIC) || defined(XEN)
3037 	pmap_pte_set(zpte, 0);				/* zap ! */
3038 	pmap_pte_flush();
3039 #endif
3040 
3041 	return rv;
3042 }
3043 
3044 /*
3045  * pmap_copy_page: copy a page
3046  */
3047 
3048 void
3049 pmap_copy_page(paddr_t srcpa, paddr_t dstpa)
3050 {
3051 	pt_entry_t *spte;
3052 	pt_entry_t *dpte;
3053 	void *csrcva;
3054 	void *cdstva;
3055 	int id;
3056 
3057 	kpreempt_disable();
3058 	id = cpu_number();
3059 	spte = PTESLEW(csrc_pte,id);
3060 	dpte = PTESLEW(cdst_pte,id);
3061 	csrcva = VASLEW(csrcp, id);
3062 	cdstva = VASLEW(cdstp, id);
3063 
3064 	KASSERT(*spte == 0 && *dpte == 0);
3065 
3066 	pmap_pte_set(spte, pmap_pa2pte(srcpa) | PG_V | PG_RW | PG_U | PG_k);
3067 	pmap_pte_set(dpte,
3068 	    pmap_pa2pte(dstpa) | PG_V | PG_RW | PG_M | PG_U | PG_k);
3069 	pmap_pte_flush();
3070 	pmap_update_2pg((vaddr_t)csrcva, (vaddr_t)cdstva);
3071 
3072 	memcpy(cdstva, csrcva, PAGE_SIZE);
3073 
3074 #if defined(DIAGNOSTIC) || defined(XEN)
3075 	pmap_pte_set(spte, 0);
3076 	pmap_pte_set(dpte, 0);
3077 	pmap_pte_flush();
3078 #endif
3079 	kpreempt_enable();
3080 }
3081 
3082 static pt_entry_t *
3083 pmap_map_ptp(struct vm_page *ptp)
3084 {
3085 	pt_entry_t *ptppte;
3086 	void *ptpva;
3087 	int id;
3088 
3089 	KASSERT(kpreempt_disabled());
3090 
3091 	id = cpu_number();
3092 	ptppte = PTESLEW(ptp_pte, id);
3093 	ptpva = VASLEW(ptpp, id);
3094 #if !defined(XEN)
3095 	pmap_pte_set(ptppte, pmap_pa2pte(VM_PAGE_TO_PHYS(ptp)) | PG_V | PG_M |
3096 	    PG_RW | PG_U | PG_k);
3097 #else
3098 	pmap_pte_set(ptppte, pmap_pa2pte(VM_PAGE_TO_PHYS(ptp)) | PG_V | PG_M |
3099 	    PG_U | PG_k);
3100 #endif
3101 	pmap_pte_flush();
3102 	pmap_update_pg((vaddr_t)ptpva);
3103 
3104 	return (pt_entry_t *)ptpva;
3105 }
3106 
3107 static void
3108 pmap_unmap_ptp(void)
3109 {
3110 #if defined(DIAGNOSTIC) || defined(XEN)
3111 	pt_entry_t *pte;
3112 
3113 	KASSERT(kpreempt_disabled());
3114 
3115 	pte = PTESLEW(ptp_pte, cpu_number());
3116 	if (*pte != 0) {
3117 		pmap_pte_set(pte, 0);
3118 		pmap_pte_flush();
3119 	}
3120 #endif
3121 }
3122 
3123 static pt_entry_t *
3124 pmap_map_pte(struct pmap *pmap, struct vm_page *ptp, vaddr_t va)
3125 {
3126 
3127 	KASSERT(kpreempt_disabled());
3128 	if (pmap_is_curpmap(pmap)) {
3129 		return &PTE_BASE[pl1_i(va)]; /* (k)vtopte */
3130 	}
3131 	KASSERT(ptp != NULL);
3132 	return pmap_map_ptp(ptp) + pl1_pi(va);
3133 }
3134 
3135 static void
3136 pmap_unmap_pte(void)
3137 {
3138 
3139 	KASSERT(kpreempt_disabled());
3140 
3141 	pmap_unmap_ptp();
3142 }
3143 
3144 /*
3145  * p m a p   r e m o v e   f u n c t i o n s
3146  *
3147  * functions that remove mappings
3148  */
3149 
3150 /*
3151  * pmap_remove_ptes: remove PTEs from a PTP
3152  *
3153  * => must have proper locking on pmap_master_lock
3154  * => caller must hold pmap's lock
3155  * => PTP must be mapped into KVA
3156  * => PTP should be null if pmap == pmap_kernel()
3157  * => must be called with kernel preemption disabled
3158  * => returns composite pte if at least one page should be shot down
3159  */
3160 
3161 static pt_entry_t
3162 pmap_remove_ptes(struct pmap *pmap, struct vm_page *ptp, vaddr_t ptpva,
3163 		 vaddr_t startva, vaddr_t endva, int flags,
3164 		 struct pv_entry **pv_tofree)
3165 {
3166 	struct pv_entry *pve;
3167 	pt_entry_t *pte = (pt_entry_t *) ptpva;
3168 	pt_entry_t opte, xpte = 0;
3169 
3170 	KASSERT(pmap == pmap_kernel() || mutex_owned(&pmap->pm_lock));
3171 	KASSERT(kpreempt_disabled());
3172 
3173 	/*
3174 	 * note that ptpva points to the PTE that maps startva.   this may
3175 	 * or may not be the first PTE in the PTP.
3176 	 *
3177 	 * we loop through the PTP while there are still PTEs to look at
3178 	 * and the wire_count is greater than 1 (because we use the wire_count
3179 	 * to keep track of the number of real PTEs in the PTP).
3180 	 */
3181 
3182 	for (/*null*/; startva < endva && (ptp == NULL || ptp->wire_count > 1)
3183 			     ; pte++, startva += PAGE_SIZE) {
3184 		struct vm_page *pg;
3185 		struct pmap_page *pp;
3186 
3187 		if (!pmap_valid_entry(*pte))
3188 			continue;			/* VA not mapped */
3189 		if ((flags & PMAP_REMOVE_SKIPWIRED) && (*pte & PG_W)) {
3190 			continue;
3191 		}
3192 
3193 		/* atomically save the old PTE and zap! it */
3194 		opte = pmap_pte_testset(pte, 0);
3195 		if (!pmap_valid_entry(opte)) {
3196 			continue;
3197 		}
3198 
3199 		pmap_exec_account(pmap, startva, opte, 0);
3200 		pmap_stats_update_bypte(pmap, 0, opte);
3201 		xpte |= opte;
3202 
3203 		if (ptp) {
3204 			ptp->wire_count--;		/* dropping a PTE */
3205 			/* Make sure that the PDE is flushed */
3206 			if (ptp->wire_count <= 1)
3207 				xpte |= PG_U;
3208 		}
3209 
3210 		/*
3211 		 * if we are not on a pv_head list we are done.
3212 		 */
3213 
3214 		if ((opte & PG_PVLIST) == 0) {
3215 #if defined(DIAGNOSTIC) && !defined(DOM0OPS)
3216 			if (PHYS_TO_VM_PAGE(pmap_pte2pa(opte)) != NULL)
3217 				panic("pmap_remove_ptes: managed page without "
3218 				      "PG_PVLIST for 0x%lx", startva);
3219 #endif
3220 			continue;
3221 		}
3222 
3223 		pg = PHYS_TO_VM_PAGE(pmap_pte2pa(opte));
3224 #ifdef DIAGNOSTIC
3225 		if (pg == NULL)
3226 			panic("pmap_remove_ptes: unmanaged page marked "
3227 			      "PG_PVLIST, va = 0x%lx, pa = 0x%lx",
3228 			      startva, (u_long)pmap_pte2pa(opte));
3229 #endif
3230 
3231 		/* sync R/M bits */
3232 		pp = VM_PAGE_TO_PP(pg);
3233 		pp_lock(pp);
3234 		pp->pp_attrs |= opte;
3235 		pve = pmap_remove_pv(pp, ptp, startva);
3236 		pp_unlock(pp);
3237 
3238 		if (pve != NULL) {
3239 			pve->pve_next = *pv_tofree;
3240 			*pv_tofree = pve;
3241 		}
3242 
3243 		/* end of "for" loop: time for next pte */
3244 	}
3245 
3246 	return xpte;
3247 }
3248 
3249 
3250 /*
3251  * pmap_remove_pte: remove a single PTE from a PTP
3252  *
3253  * => must have proper locking on pmap_master_lock
3254  * => caller must hold pmap's lock
3255  * => PTP must be mapped into KVA
3256  * => PTP should be null if pmap == pmap_kernel()
3257  * => returns true if we removed a mapping
3258  * => must be called with kernel preemption disabled
3259  */
3260 
3261 static bool
3262 pmap_remove_pte(struct pmap *pmap, struct vm_page *ptp, pt_entry_t *pte,
3263 		vaddr_t va, int flags, struct pv_entry **pv_tofree)
3264 {
3265 	pt_entry_t opte;
3266 	struct pv_entry *pve;
3267 	struct vm_page *pg;
3268 	struct pmap_page *pp;
3269 
3270 	KASSERT(pmap == pmap_kernel() || mutex_owned(&pmap->pm_lock));
3271 	KASSERT(pmap == pmap_kernel() || kpreempt_disabled());
3272 
3273 	if (!pmap_valid_entry(*pte))
3274 		return(false);		/* VA not mapped */
3275 	if ((flags & PMAP_REMOVE_SKIPWIRED) && (*pte & PG_W)) {
3276 		return(false);
3277 	}
3278 
3279 	/* atomically save the old PTE and zap! it */
3280 	opte = pmap_pte_testset(pte, 0);
3281 	if (!pmap_valid_entry(opte)) {
3282 		return false;
3283 	}
3284 
3285 	pmap_exec_account(pmap, va, opte, 0);
3286 	pmap_stats_update_bypte(pmap, 0, opte);
3287 
3288 	if (opte & PG_U)
3289 		pmap_tlb_shootdown(pmap, va, 0, opte);
3290 
3291 	if (ptp) {
3292 		ptp->wire_count--;		/* dropping a PTE */
3293 		/* Make sure that the PDE is flushed */
3294 		if ((ptp->wire_count <= 1) && !(opte & PG_U))
3295 			pmap_tlb_shootdown(pmap, va, 0, opte);
3296 	}
3297 
3298 	/*
3299 	 * if we are not on a pv_head list we are done.
3300 	 */
3301 
3302 	if ((opte & PG_PVLIST) == 0) {
3303 #if defined(DIAGNOSTIC) && !defined(DOM0OPS)
3304 		if (PHYS_TO_VM_PAGE(pmap_pte2pa(opte)) != NULL)
3305 			panic("pmap_remove_pte: managed page without "
3306 			      "PG_PVLIST for 0x%lx", va);
3307 #endif
3308 		return(true);
3309 	}
3310 
3311 	pg = PHYS_TO_VM_PAGE(pmap_pte2pa(opte));
3312 #ifdef DIAGNOSTIC
3313 	if (pg == NULL)
3314 		panic("pmap_remove_pte: unmanaged page marked "
3315 		    "PG_PVLIST, va = 0x%lx, pa = 0x%lx", va,
3316 		    (u_long)(pmap_pte2pa(opte)));
3317 #endif
3318 
3319 	/* sync R/M bits */
3320 	pp = VM_PAGE_TO_PP(pg);
3321 	pp_lock(pp);
3322 	pp->pp_attrs |= opte;
3323 	pve = pmap_remove_pv(pp, ptp, va);
3324 	pp_unlock(pp);
3325 
3326 	if (pve) {
3327 		pve->pve_next = *pv_tofree;
3328 		*pv_tofree = pve;
3329 	}
3330 
3331 	return(true);
3332 }
3333 
3334 /*
3335  * pmap_remove: top level mapping removal function
3336  *
3337  * => caller should not be holding any pmap locks
3338  */
3339 
3340 void
3341 pmap_remove(struct pmap *pmap, vaddr_t sva, vaddr_t eva)
3342 {
3343 	pmap_do_remove(pmap, sva, eva, PMAP_REMOVE_ALL);
3344 }
3345 
3346 /*
3347  * pmap_do_remove: mapping removal guts
3348  *
3349  * => caller should not be holding any pmap locks
3350  */
3351 
3352 static void
3353 pmap_do_remove(struct pmap *pmap, vaddr_t sva, vaddr_t eva, int flags)
3354 {
3355 	pt_entry_t *ptes, xpte = 0;
3356 	pd_entry_t pde;
3357 	pd_entry_t * const *pdes;
3358 	struct pv_entry *pv_tofree = NULL;
3359 	bool result;
3360 	paddr_t ptppa;
3361 	vaddr_t blkendva, va = sva;
3362 	struct vm_page *ptp;
3363 	struct pmap *pmap2;
3364 
3365 	kpreempt_disable();
3366 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);	/* locks pmap */
3367 
3368 	/*
3369 	 * removing one page?  take shortcut function.
3370 	 */
3371 
3372 	if (va + PAGE_SIZE == eva) {
3373 		if (pmap_pdes_valid(va, pdes, &pde)) {
3374 
3375 			/* PA of the PTP */
3376 			ptppa = pmap_pte2pa(pde);
3377 
3378 			/* get PTP if non-kernel mapping */
3379 			if (pmap == pmap_kernel()) {
3380 				/* we never free kernel PTPs */
3381 				ptp = NULL;
3382 			} else {
3383 				ptp = pmap_find_ptp(pmap, va, ptppa, 1);
3384 #ifdef DIAGNOSTIC
3385 				if (ptp == NULL)
3386 					panic("pmap_remove: unmanaged "
3387 					      "PTP detected");
3388 #endif
3389 			}
3390 
3391 			/* do it! */
3392 			result = pmap_remove_pte(pmap, ptp,
3393 			    &ptes[pl1_i(va)], va, flags, &pv_tofree);
3394 
3395 			/*
3396 			 * if mapping removed and the PTP is no longer
3397 			 * being used, free it!
3398 			 */
3399 
3400 			if (result && ptp && ptp->wire_count <= 1)
3401 				pmap_free_ptp(pmap, ptp, va, ptes, pdes);
3402 		}
3403 	} else for (/* null */ ; va < eva ; va = blkendva) {
3404 		int lvl;
3405 
3406 		/* determine range of block */
3407 		blkendva = x86_round_pdr(va+1);
3408 		if (blkendva > eva)
3409 			blkendva = eva;
3410 
3411 		/*
3412 		 * XXXCDC: our PTE mappings should never be removed
3413 		 * with pmap_remove!  if we allow this (and why would
3414 		 * we?) then we end up freeing the pmap's page
3415 		 * directory page (PDP) before we are finished using
3416 		 * it when we hit it in the recursive mapping.  this
3417 		 * is BAD.
3418 		 *
3419 		 * long term solution is to move the PTEs out of user
3420 		 * address space and into kernel address space (up
3421 		 * with APTE).  then we can set VM_MAXUSER_ADDRESS to
3422 		 * be VM_MAX_ADDRESS.
3423 		 */
3424 
3425 		if (pl_i(va, PTP_LEVELS) == PDIR_SLOT_PTE)
3426 			/* XXXCDC: ugly hack to avoid freeing PDP here */
3427 			continue;
3428 
3429 		lvl = pmap_pdes_invalid(va, pdes, &pde);
3430 		if (lvl != 0) {
3431 			/*
3432 			 * skip a range corresponding to an invalid pde.
3433 			 */
3434 			blkendva = (va & ptp_masks[lvl - 1]) + nbpd[lvl - 1];
3435  			continue;
3436 		}
3437 
3438 		/* PA of the PTP */
3439 		ptppa = pmap_pte2pa(pde);
3440 
3441 		/* get PTP if non-kernel mapping */
3442 		if (pmap == pmap_kernel()) {
3443 			/* we never free kernel PTPs */
3444 			ptp = NULL;
3445 		} else {
3446 			ptp = pmap_find_ptp(pmap, va, ptppa, 1);
3447 #ifdef DIAGNOSTIC
3448 			if (ptp == NULL)
3449 				panic("pmap_remove: unmanaged PTP "
3450 				      "detected");
3451 #endif
3452 		}
3453 		xpte |= pmap_remove_ptes(pmap, ptp,
3454 		    (vaddr_t)&ptes[pl1_i(va)], va, blkendva,
3455 		    flags, &pv_tofree);
3456 
3457 		/* if PTP is no longer being used, free it! */
3458 		if (ptp && ptp->wire_count <= 1) {
3459 			pmap_free_ptp(pmap, ptp, va, ptes, pdes);
3460 		}
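		/*
		 * xpte accumulates the PTE bits of everything removed so
		 * far; only issue a ranged shootdown if at least one of
		 * the removed entries had been referenced (PG_U).
		 */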
3461 		if ((xpte & PG_U) != 0)
3462 			pmap_tlb_shootdown(pmap, sva, eva, xpte);
3463 	}
3464 	pmap_unmap_ptes(pmap, pmap2);		/* unlock pmap */
3465 	kpreempt_enable();
3466 
3467 	/* Now we free unused PVs */
3468 	if (pv_tofree)
3469 		pmap_free_pvs(pv_tofree);
3470 }
3471 
3472 /*
3473  * pmap_sync_pv: clear pte bits and return the old value of the pte.
3474  *
3475  * => called with pp_lock held. (thus preemption disabled)
3476  * => issues tlb shootdowns if necessary.
3477  */
3478 
3479 static int
3480 pmap_sync_pv(struct pv_pte *pvpte, pt_entry_t expect, int clearbits,
3481     pt_entry_t *optep)
3482 {
3483 	struct pmap *pmap;
3484 	struct vm_page *ptp;
3485 	vaddr_t va;
3486 	pt_entry_t *ptep;
3487 	pt_entry_t opte;
3488 	pt_entry_t npte;
3489 	bool need_shootdown;
3490 
3491 	ptp = pvpte->pte_ptp;
3492 	va = pvpte->pte_va;
3493 	KASSERT(ptp == NULL || ptp->uobject != NULL);
3494 	KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset);
3495 	pmap = ptp_to_pmap(ptp);
3496 
3497 	KASSERT((expect & ~(PG_FRAME | PG_V)) == 0);
3498 	KASSERT((expect & PG_V) != 0);
3499 	KASSERT(clearbits == ~0 || (clearbits & ~(PG_M | PG_U | PG_RW)) == 0);
3500 	KASSERT(kpreempt_disabled());
3501 
3502 	ptep = pmap_map_pte(pmap, ptp, va);
3503 	do {
3504 		opte = *ptep;
3505 		KASSERT((opte & (PG_M | PG_U)) != PG_M);
3506 		KASSERT((opte & (PG_U | PG_V)) != PG_U);
3507 		KASSERT(opte == 0 || (opte & PG_V) != 0);
3508 		if ((opte & (PG_FRAME | PG_V)) != expect) {
3509 
3510 			/*
3511 			 * we lost a race with a V->P operation like
3512 			 * pmap_remove().  wait for the competitor to
3513 			 * finish reflecting the pte bits into pp_attrs.
3514 			 *
3515 			 * issue a redundant TLB shootdown so that
3516 			 * we can wait for its completion.
3517 			 */
3518 
3519 			pmap_unmap_pte();
3520 			if (clearbits != 0) {
3521 				pmap_tlb_shootdown(pmap, va, 0,
3522 				    (pmap == pmap_kernel() ? PG_G : 0));
3523 			}
3524 			return EAGAIN;
3525 		}
3526 
3527 		/*
3528 		 * check if there's anything to do on this pte.
3529 		 */
3530 
3531 		if ((opte & clearbits) == 0) {
3532 			need_shootdown = false;
3533 			break;
3534 		}
3535 
3536 		/*
3537 		 * we need a shootdown if the pte is cached. (PG_U)
3538 		 *
3539 		 * ...unless we are clearing only the PG_RW bit and
3540 		 * it isn't cached as RW. (PG_M)
3541 		 */
3542 
3543 		need_shootdown = (opte & PG_U) != 0 &&
3544 		    !(clearbits == PG_RW && (opte & PG_M) == 0);
3545 
3546 		npte = opte & ~clearbits;
3547 
3548 		/*
3549 		 * if we need a shootdown anyway, clear PG_U and PG_M.
3550 		 */
3551 
3552 		if (need_shootdown) {
3553 			npte &= ~(PG_U | PG_M);
3554 		}
3555 		KASSERT((npte & (PG_M | PG_U)) != PG_M);
3556 		KASSERT((npte & (PG_U | PG_V)) != PG_U);
3557 		KASSERT(npte == 0 || (opte & PG_V) != 0);
3558 	} while (pmap_pte_cas(ptep, opte, npte) != opte);
3559 
3560 	if (need_shootdown) {
3561 		pmap_tlb_shootdown(pmap, va, 0, opte);
3562 	}
3563 	pmap_unmap_pte();
3564 
3565 	*optep = opte;
3566 	return 0;
3567 }
3568 
3569 /*
3570  * pmap_page_remove: remove a managed vm_page from all pmaps that map it
3571  *
3572  * => R/M bits are sync'd back to attrs
3573  */
3574 
3575 void
3576 pmap_page_remove(struct vm_page *pg)
3577 {
3578 	struct pmap_page *pp;
3579 	struct pv_pte *pvpte;
3580 	struct pv_entry *killlist = NULL;
3581 	struct vm_page *ptp;
3582 	pt_entry_t expect;
3583 	lwp_t *l;
3584 	int count;
3585 
3586 #ifdef DIAGNOSTIC
3587 	int bank, off;
3588 
3589 	bank = vm_physseg_find(atop(VM_PAGE_TO_PHYS(pg)), &off);
3590 	if (bank == -1)
3591 		panic("pmap_page_remove: unmanaged page?");
3592 #endif
3593 
3594 	l = curlwp;
3595 	pp = VM_PAGE_TO_PP(pg);
3596 	expect = pmap_pa2pte(VM_PAGE_TO_PHYS(pg)) | PG_V;
3597 	count = SPINLOCK_BACKOFF_MIN;
3598 	kpreempt_disable();
3599 startover:
3600 	pp_lock(pp);
3601 	while ((pvpte = pv_pte_first(pp)) != NULL) {
3602 		struct pmap *pmap;
3603 		struct pv_entry *pve;
3604 		pt_entry_t opte;
3605 		vaddr_t va;
3606 		int error;
3607 
3608 		/*
3609 		 * add a reference to the pmap before clearing the pte.
3610 		 * otherwise the pmap can disappear behind us.
3611 		 */
3612 
3613 		ptp = pvpte->pte_ptp;
3614 		pmap = ptp_to_pmap(ptp);
3615 		if (ptp != NULL) {
3616 			pmap_reference(pmap);
3617 		}
3618 
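		/*
		 * pmap_sync_pv() may lose a race with a V->P operation;
		 * in that case drop the locks, back off and start over.
		 */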
3619 		error = pmap_sync_pv(pvpte, expect, ~0, &opte);
3620 		if (error == EAGAIN) {
3621 			int hold_count;
3622 			pp_unlock(pp);
3623 			KERNEL_UNLOCK_ALL(curlwp, &hold_count);
3624 			if (ptp != NULL) {
3625 				pmap_destroy(pmap);
3626 			}
3627 			SPINLOCK_BACKOFF(count);
3628 			KERNEL_LOCK(hold_count, curlwp);
3629 			goto startover;
3630 		}
3631 
3632 		pp->pp_attrs |= opte;
3633 		va = pvpte->pte_va;
3634 		pve = pmap_remove_pv(pp, ptp, va);
3635 		pp_unlock(pp);
3636 
3637 		/* update the PTP reference count.  free if last reference. */
3638 		if (ptp != NULL) {
3639 			struct pmap *pmap2;
3640 			pt_entry_t *ptes;
3641 			pd_entry_t * const *pdes;
3642 
3643 			KASSERT(pmap != pmap_kernel());
3644 
3645 			pmap_tlb_shootwait();
3646 			pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
3647 			pmap_stats_update_bypte(pmap, 0, opte);
3648 			ptp->wire_count--;
3649 			if (ptp->wire_count <= 1) {
3650 				pmap_free_ptp(pmap, ptp, va, ptes, pdes);
3651 			}
3652 			pmap_unmap_ptes(pmap, pmap2);
3653 			pmap_destroy(pmap);
3654 		} else {
3655 			KASSERT(pmap == pmap_kernel());
3656 			pmap_stats_update_bypte(pmap, 0, opte);
3657 		}
3658 
3659 		if (pve != NULL) {
3660 			pve->pve_next = killlist;	/* mark it for death */
3661 			killlist = pve;
3662 		}
3663 		pp_lock(pp);
3664 	}
3665 	pp_unlock(pp);
3666 	kpreempt_enable();
3667 
3668 	/* Now free unused pvs. */
3669 	pmap_free_pvs(killlist);
3670 }
3671 
3672 /*
3673  * p m a p   a t t r i b u t e  f u n c t i o n s
3674  * functions that test/change managed page's attributes
3675  * since a page can be mapped multiple times we must check each PTE that
3676  * maps it by going down the pv lists.
3677  */
3678 
3679 /*
3680  * pmap_test_attrs: test a page's attributes
3681  */
3682 
3683 bool
3684 pmap_test_attrs(struct vm_page *pg, unsigned testbits)
3685 {
3686 	struct pmap_page *pp;
3687 	struct pv_pte *pvpte;
3688 	pt_entry_t expect;
3689 	u_int result;
3690 
3691 #ifdef DIAGNOSTIC
3692 	int bank, off;
3693 
3694 	bank = vm_physseg_find(atop(VM_PAGE_TO_PHYS(pg)), &off);
3695 	if (bank == -1)
3696 		panic("pmap_test_attrs: unmanaged page?");
3697 #endif
3698 
3699 	pp = VM_PAGE_TO_PP(pg);
3700 	if ((pp->pp_attrs & testbits) != 0) {
3701 		return true;
3702 	}
3703 	expect = pmap_pa2pte(VM_PAGE_TO_PHYS(pg)) | PG_V;
3704 	pp_lock(pp);
3705 	for (pvpte = pv_pte_first(pp); pvpte; pvpte = pv_pte_next(pp, pvpte)) {
3706 		pt_entry_t opte;
3707 		int error;
3708 
3709 		if ((pp->pp_attrs & testbits) != 0) {
3710 			break;
3711 		}
3712 		error = pmap_sync_pv(pvpte, expect, 0, &opte);
3713 		if (error == 0) {
3714 			pp->pp_attrs |= opte;
3715 		}
3716 	}
3717 	result = pp->pp_attrs & testbits;
3718 	pp_unlock(pp);
3719 
3720 	/*
3721 	 * note that we will exit the for loop with a non-null pvpte if
3722 	 * we have found the bits we are testing for.
3723 	 */
3724 
3725 	return result != 0;
3726 }
3727 
3728 /*
3729  * pmap_clear_attrs: clear the specified attribute for a page.
3730  *
3731  * => we return true if we cleared one of the bits we were asked to
3732  */
3733 
3734 bool
3735 pmap_clear_attrs(struct vm_page *pg, unsigned clearbits)
3736 {
3737 	struct pmap_page *pp;
3738 	struct pv_pte *pvpte;
3739 	u_int result;
3740 	pt_entry_t expect;
3741 	int count;
3742 #ifdef DIAGNOSTIC
3743 	int bank, off;
3744 
3745 	bank = vm_physseg_find(atop(VM_PAGE_TO_PHYS(pg)), &off);
3746 	if (bank == -1)
3747 		panic("pmap_clear_attrs: unmanaged page?");
3748 #endif
3749 
3750 	pp = VM_PAGE_TO_PP(pg);
3751 	expect = pmap_pa2pte(VM_PAGE_TO_PHYS(pg)) | PG_V;
3752 	count = SPINLOCK_BACKOFF_MIN;
3753 	kpreempt_disable();
3754 startover:
3755 	pp_lock(pp);
3756 	for (pvpte = pv_pte_first(pp); pvpte; pvpte = pv_pte_next(pp, pvpte)) {
3757 		pt_entry_t opte;
3758 		int error;
3759 
3760 		error = pmap_sync_pv(pvpte, expect, clearbits, &opte);
3761 		if (error == EAGAIN) {
3762 			int hold_count;
3763 			pp_unlock(pp);
3764 			KERNEL_UNLOCK_ALL(curlwp, &hold_count);
3765 			SPINLOCK_BACKOFF(count);
3766 			KERNEL_LOCK(hold_count, curlwp);
3767 			goto startover;
3768 		}
3769 		pp->pp_attrs |= opte;
3770 	}
3771 	result = pp->pp_attrs & clearbits;
3772 	pp->pp_attrs &= ~clearbits;
3773 	pp_unlock(pp);
3774 	kpreempt_enable();
3775 
3776 	return result != 0;
3777 }
3778 
3779 
3780 /*
3781  * p m a p   p r o t e c t i o n   f u n c t i o n s
3782  */
3783 
3784 /*
3785  * pmap_page_protect: change the protection of all recorded mappings
3786  *	of a managed page
3787  *
3788  * => NOTE: this is an inline function in pmap.h
3789  */
3790 
3791 /* see pmap.h */
3792 
3793 /*
3794  * pmap_protect: set the protection of the pages in a pmap
3795  *
3796  * => NOTE: this is an inline function in pmap.h
3797  */
3798 
3799 /* see pmap.h */
3800 
3801 /*
3802  * pmap_write_protect: write-protect pages in a pmap
3803  */
3804 
3805 void
3806 pmap_write_protect(struct pmap *pmap, vaddr_t sva, vaddr_t eva, vm_prot_t prot)
3807 {
3808 	pt_entry_t *ptes, *epte;
3809 	pt_entry_t *spte;
3810 	pd_entry_t * const *pdes;
3811 	vaddr_t blockend, va;
3812 	pt_entry_t opte;
3813 	struct pmap *pmap2;
3814 
3815 	KASSERT(curlwp->l_md.md_gc_pmap != pmap);
3816 
3817 	kpreempt_disable();
3818 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);	/* locks pmap */
3819 
3820 	/* should be ok, but just in case ... */
3821 	sva &= PG_FRAME;
3822 	eva &= PG_FRAME;
3823 
3824 	for (va = sva ; va < eva ; va = blockend) {
3825 
3826 		blockend = (va & L2_FRAME) + NBPD_L2;
3827 		if (blockend > eva)
3828 			blockend = eva;
3829 
3830 		/*
3831 		 * XXXCDC: our PTE mappings should never be write-protected!
3832 		 *
3833 		 * long term solution is to move the PTEs out of user
3834 		 * address space and into kernel address space (up
3835 		 * with APTE).  then we can set VM_MAXUSER_ADDRESS to
3836 		 * be VM_MAX_ADDRESS.
3837 		 */
3838 
3839 		/* XXXCDC: ugly hack to avoid freeing PDP here */
3840 		if (pl_i(va, PTP_LEVELS) == PDIR_SLOT_PTE)
3841 			continue;
3842 
3843 		/* empty block? */
3844 		if (!pmap_pdes_valid(va, pdes, NULL))
3845 			continue;
3846 
3847 #ifdef DIAGNOSTIC
3848 		if (va >= VM_MAXUSER_ADDRESS &&
3849 		    va < VM_MAX_ADDRESS)
3850 			panic("pmap_write_protect: PTE space");
3851 #endif
3852 
3853 		spte = &ptes[pl1_i(va)];
3854 		epte = &ptes[pl1_i(blockend)];
3855 
3856 		for (/* null */; spte < epte ; spte++) {
3857 			pt_entry_t npte;
3858 
3859 			do {
3860 				opte = *spte;
3861 				if ((~opte & (PG_RW | PG_V)) != 0) {
3862 					goto next;
3863 				}
3864 				npte = opte & ~PG_RW;
3865 			} while (pmap_pte_cas(spte, opte, npte) != opte);
3866 			if ((opte & PG_M) != 0) {
3867 				vaddr_t tva;
3868 
3869 				tva = x86_ptob(spte - ptes);
3870 				pmap_tlb_shootdown(pmap, tva, 0, opte);
3871 			}
3872 next:;
3873 		}
3874 	}
3875 
3876 	pmap_unmap_ptes(pmap, pmap2);	/* unlocks pmap */
3877 	kpreempt_enable();
3878 }
3879 
3880 /*
3881  * end of protection functions
3882  */
3883 
3884 /*
3885  * pmap_unwire: clear the wired bit in the PTE
3886  *
3887  * => mapping should already be in map
3888  */
3889 
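/*
 * implementation note: PG_W is a software-only bit, so clearing it only
 * affects the wired-page accounting done by pmap_stats_update_bypte();
 * no TLB shootdown is required.
 */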
3890 void
3891 pmap_unwire(struct pmap *pmap, vaddr_t va)
3892 {
3893 	pt_entry_t *ptes;
3894 	pd_entry_t * const *pdes;
3895 	struct pmap *pmap2;
3896 
3897 	kpreempt_disable();
3898 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);	/* locks pmap */
3899 
3900 	if (pmap_pdes_valid(va, pdes, NULL)) {
3901 		pt_entry_t *ptep = &ptes[pl1_i(va)];
3902 		pt_entry_t opte = *ptep;
3903 
3904 #ifdef DIAGNOSTIC
3905 		if (!pmap_valid_entry(opte))
3906 			panic("pmap_unwire: invalid (unmapped) va 0x%lx", va);
3907 #endif
3908 		if ((opte & PG_W) != 0) {
3909 			pt_entry_t npte = opte & ~PG_W;
3910 
3911 			opte = pmap_pte_testset(ptep, npte);
3912 			pmap_stats_update_bypte(pmap, npte, opte);
3913 		}
3914 #ifdef DIAGNOSTIC
3915 		else {
3916 			printf("pmap_unwire: wiring for pmap %p va 0x%lx "
3917 			       "didn't change!\n", pmap, va);
3918 		}
3919 #endif
3920 	}
3921 #ifdef DIAGNOSTIC
3922 	else {
3923 		panic("pmap_unwire: invalid PDE");
3924 	}
3925 #endif
3926 	pmap_unmap_ptes(pmap, pmap2);	/* unlocks map, even if the PDE was invalid */
3927 	kpreempt_enable();
3928 }
3929 
3930 /*
3931  * pmap_collect: free resources held by a pmap
3932  *
3933  * => optional function.
3934  * => called when a process is swapped out to free memory.
3935  */
3936 
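/*
 * implementation note: PMAP_REMOVE_SKIPWIRED leaves wired mappings in
 * place, so only pageable mappings (and eventually their now-empty PTPs)
 * are torn down here.
 */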
3937 void
3938 pmap_collect(struct pmap *pmap)
3939 {
3940 	/*
3941 	 * free all of the pt pages by removing the physical mappings
3942 	 * for its entire address space.
3943 	 */
3944 
3945 	pmap_do_remove(pmap, VM_MIN_ADDRESS, VM_MAXUSER_ADDRESS,
3946 	    PMAP_REMOVE_SKIPWIRED);
3947 }
3948 
3949 /*
3950  * pmap_copy: copy mappings from one pmap to another
3951  *
3952  * => optional function
3953  * void pmap_copy(dst_pmap, src_pmap, dst_addr, len, src_addr)
3954  */
3955 
3956 /*
3957  * defined as macro in pmap.h
3958  */
3959 
3960 /*
3961  * pmap_enter: enter a mapping into a pmap
3962  *
3963  * => must be done "now" ... no lazy-evaluation
3964  * => we set pmap => pv_head locking
3965  */
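/*
 * implementation note: under XEN the body below is compiled as
 * pmap_enter_ma(), taking a machine address plus owning domain, and a
 * small native-style pmap_enter() wrapper is provided further down;
 * natively the body is pmap_enter() itself and ma == pa.
 *
 * rough outline of the body: pre-allocate two pv_entries, map the PTEs
 * (locking the pmap), get or allocate the PTP for user pmaps, install
 * the new PTE with a compare-and-swap loop, then fix up statistics,
 * pv lists and, if an already-used translation changed, the TLB.
 *
 * illustrative call (assumption, for documentation only):
 *
 *	error = pmap_enter(pmap, va, pa, VM_PROT_READ | VM_PROT_WRITE,
 *	    PMAP_WIRED | PMAP_CANFAIL);
 */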
3966 #ifdef XEN
3967 int
3968 pmap_enter_ma(struct pmap *pmap, vaddr_t va, paddr_t ma, paddr_t pa,
3969 	   vm_prot_t prot, int flags, int domid)
3970 {
3971 #else /* XEN */
3972 int
3973 pmap_enter(struct pmap *pmap, vaddr_t va, paddr_t pa, vm_prot_t prot,
3974 	   int flags)
3975 {
3976 	paddr_t ma = pa;
3977 #endif /* XEN */
3978 	pt_entry_t *ptes, opte, npte;
3979 	pt_entry_t *ptep;
3980 	pd_entry_t * const *pdes;
3981 	struct vm_page *ptp, *pg;
3982 	struct pmap_page *new_pp;
3983 	struct pmap_page *old_pp;
3984 	struct pv_entry *old_pve = NULL;
3985 	struct pv_entry *new_pve;
3986 	struct pv_entry *new_pve2;
3987 	int error;
3988 	bool wired = (flags & PMAP_WIRED) != 0;
3989 	struct pmap *pmap2;
3990 
3991 	KASSERT(pmap_initialized);
3992 	KASSERT(curlwp->l_md.md_gc_pmap != pmap);
3993 
3994 #ifdef DIAGNOSTIC
3995 	/* sanity check: totally out of range? */
3996 	if (va >= VM_MAX_KERNEL_ADDRESS)
3997 		panic("pmap_enter: too big");
3998 
3999 	if (va == (vaddr_t) PDP_BASE || va == (vaddr_t) APDP_BASE)
4000 		panic("pmap_enter: trying to map over PDP/APDP!");
4001 
4002 	/* sanity check: kernel PTPs should already have been pre-allocated */
4003 	if (va >= VM_MIN_KERNEL_ADDRESS &&
4004 	    !pmap_valid_entry(pmap->pm_pdir[pl_i(va, PTP_LEVELS)]))
4005 		panic("pmap_enter: missing kernel PTP for va %lx!", va);
4006 #endif /* DIAGNOSTIC */
4007 #ifdef XEN
4008 	KASSERT(domid == DOMID_SELF || pa == 0);
4009 #endif /* XEN */
4010 
4011 	npte = ma | protection_codes[prot] | PG_V;
4012 	if (wired)
4013 		npte |= PG_W;
4014 	if (va < VM_MAXUSER_ADDRESS)
4015 		npte |= PG_u;
4016 	else if (va < VM_MAX_ADDRESS)
4017 		npte |= (PG_u | PG_RW);	/* XXXCDC: no longer needed? */
4018 	else
4019 		npte |= PG_k;
4020 	if (pmap == pmap_kernel())
4021 		npte |= pmap_pg_g;
4022 	if (flags & VM_PROT_ALL) {
4023 		npte |= PG_U;
4024 		if (flags & VM_PROT_WRITE) {
4025 			KASSERT((npte & PG_RW) != 0);
4026 			npte |= PG_M;
4027 		}
4028 	}
4029 
4030 #ifdef XEN
4031 	if (domid != DOMID_SELF)
4032 		pg = NULL;
4033 	else
4034 #endif
4035 		pg = PHYS_TO_VM_PAGE(pa);
4036 	if (pg != NULL) {
4037 		/* This is a managed page */
4038 		npte |= PG_PVLIST;
4039 		new_pp = VM_PAGE_TO_PP(pg);
4040 	} else {
4041 		new_pp = NULL;
4042 	}
4043 
4044 	/* get pves. */
4045 	new_pve = pool_cache_get(&pmap_pv_cache, PR_NOWAIT);
4046 	new_pve2 = pool_cache_get(&pmap_pv_cache, PR_NOWAIT);
4047 	if (new_pve == NULL || new_pve2 == NULL) {
4048 		if (flags & PMAP_CANFAIL) {
4049 			error = ENOMEM;
4050 			goto out2;
4051 		}
4052 		panic("pmap_enter: pve allocation failed");
4053 	}
4054 
4055 	kpreempt_disable();
4056 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);	/* locks pmap */
4057 	if (pmap == pmap_kernel()) {
4058 		ptp = NULL;
4059 	} else {
4060 		ptp = pmap_get_ptp(pmap, va, pdes);
4061 		if (ptp == NULL) {
4062 			pmap_unmap_ptes(pmap, pmap2);
4063 			if (flags & PMAP_CANFAIL) {
4064 				error = ENOMEM;
4065 				goto out;
4066 			}
4067 			panic("pmap_enter: get ptp failed");
4068 		}
4069 	}
4070 
4071 	/*
4072 	 * update the pte.
4073 	 */
4074 
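	/*
	 * the PTE is replaced with an atomic compare-and-swap loop, so any
	 * PG_U/PG_M bits the hardware sets concurrently are observed in
	 * opte (and, for the same page, inherited into npte) rather than
	 * silently overwritten.  on XEN, foreign-domain mappings cannot be
	 * written directly and are updated via xpq_update_foreign() instead.
	 */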
4075 	ptep = &ptes[pl1_i(va)];
4076 	do {
4077 		opte = *ptep;
4078 
4079 		/*
4080 		 * if the same page, inherit PG_U and PG_M.
4081 		 */
4082 		if (((opte ^ npte) & (PG_FRAME | PG_V)) == 0) {
4083 			npte |= opte & (PG_U | PG_M);
4084 		}
4085 #if defined(XEN)
4086 		if (domid != DOMID_SELF) {
4087 			/* pmap_pte_cas with error handling */
4088 			int s = splvm();
4089 			if (opte != *ptep) {
4090 				splx(s);
4091 				continue;
4092 			}
4093 			error = xpq_update_foreign(
4094 			    vtomach((vaddr_t)ptep), npte, domid);
4095 			splx(s);
4096 			if (error) {
4097 				if (ptp != NULL && ptp->wire_count <= 1) {
4098 					pmap_free_ptp(pmap, ptp, va, ptes, pdes);
4099 				}
4100 				pmap_unmap_ptes(pmap, pmap2);
4101 				goto out;
4102 			}
4103 			break;
4104 		}
4105 #endif /* defined(XEN) */
4106 	} while (pmap_pte_cas(ptep, opte, npte) != opte);
4107 
4108 	/*
4109 	 * update statistics and PTP's reference count.
4110 	 */
4111 
4112 	pmap_stats_update_bypte(pmap, npte, opte);
4113 	if (ptp != NULL && !pmap_valid_entry(opte)) {
4114 		ptp->wire_count++;
4115 	}
4116 	KASSERT(ptp == NULL || ptp->wire_count > 1);
4117 
4118 	/*
4119 	 * if the same page, we can skip pv_entry handling.
4120 	 */
4121 
4122 	if (((opte ^ npte) & (PG_FRAME | PG_V)) == 0) {
4123 		KASSERT(((opte ^ npte) & PG_PVLIST) == 0);
4124 		goto same_pa;
4125 	}
4126 
4127 	/*
4128 	 * if old page is managed, remove pv_entry from its list.
4129 	 */
4130 
4131 	if ((~opte & (PG_V | PG_PVLIST)) == 0) {
4132 		pg = PHYS_TO_VM_PAGE(pmap_pte2pa(opte));
4133 #ifdef DIAGNOSTIC
4134 		if (pg == NULL)
4135 			panic("pmap_enter: PG_PVLIST mapping with "
4136 			      "unmanaged page "
4137 			      "pa = 0x%" PRIx64 " (0x%" PRIx64 ")",
4138 			      (int64_t)pa, (int64_t)atop(pa));
4139 #endif
4140 		old_pp = VM_PAGE_TO_PP(pg);
4141 
4142 		pp_lock(old_pp);
4143 		old_pve = pmap_remove_pv(old_pp, ptp, va);
4144 		old_pp->pp_attrs |= opte;
4145 		pp_unlock(old_pp);
4146 	}
4147 
4148 	/*
4149 	 * if new page is managed, insert pv_entry into its list.
4150 	 */
4151 
4152 	if (new_pp) {
4153 		pp_lock(new_pp);
4154 		new_pve = pmap_enter_pv(new_pp, new_pve, &new_pve2, ptp, va);
4155 		pp_unlock(new_pp);
4156 	}
4157 
4158 same_pa:
4159 	pmap_unmap_ptes(pmap, pmap2);
4160 
4161 	/*
4162 	 * shootdown tlb if necessary.
4163 	 */
4164 
4165 	if ((~opte & (PG_V | PG_U)) == 0 &&
4166 	    ((opte ^ npte) & (PG_FRAME | PG_RW)) != 0) {
4167 		pmap_tlb_shootdown(pmap, va, 0, opte);
4168 	}
4169 
4170 	error = 0;
4171 out:
4172 	kpreempt_enable();
4173 out2:
4174 	if (old_pve != NULL) {
4175 		pool_cache_put(&pmap_pv_cache, old_pve);
4176 	}
4177 	if (new_pve != NULL) {
4178 		pool_cache_put(&pmap_pv_cache, new_pve);
4179 	}
4180 	if (new_pve2 != NULL) {
4181 		pool_cache_put(&pmap_pv_cache, new_pve2);
4182 	}
4183 
4184 	return error;
4185 }
4186 
4187 #ifdef XEN
4188 int
4189 pmap_enter(struct pmap *pmap, vaddr_t va, paddr_t pa, vm_prot_t prot, int flags)
4190 {
4191 	paddr_t ma;
4192 
4193 	if (__predict_false(pa < pmap_pa_start || pmap_pa_end <= pa)) {
4194 		ma = pa; /* XXX hack */
4195 	} else {
4196 		ma = xpmap_ptom(pa);
4197 	}
4198 
4199 	return pmap_enter_ma(pmap, va, ma, pa, prot, flags, DOMID_SELF);
4200 }
4201 #endif /* XEN */
4202 
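/*
 * pmap_get_physpage: allocate a zeroed physical page for use as a PTP
 * at the given level of the kernel pmap.
 *
 * => before uvm_page_init() it steals a page with uvm_page_physget()
 *	and zeroes it through the early_zero_pte mapping; afterwards it
 *	allocates from the kernel pmap's per-level PTP object.
 */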
4203 static bool
4204 pmap_get_physpage(vaddr_t va, int level, paddr_t *paddrp)
4205 {
4206 	struct vm_page *ptp;
4207 	struct pmap *kpm = pmap_kernel();
4208 
4209 	if (uvm.page_init_done == false) {
4210 		/*
4211 		 * we're growing the kernel pmap early (from
4212 		 * uvm_pageboot_alloc()).  this case must be
4213 		 * handled a little differently.
4214 		 */
4215 
4216 		if (uvm_page_physget(paddrp) == false)
4217 			panic("pmap_get_physpage: out of memory");
4218 		kpreempt_disable();
4219 		pmap_pte_set(early_zero_pte,
4220 		    pmap_pa2pte(*paddrp) | PG_V | PG_RW | PG_k);
4221 		pmap_pte_flush();
4222 		pmap_update_pg((vaddr_t)early_zerop);
4223 		memset(early_zerop, 0, PAGE_SIZE);
4224 #if defined(DIAGNOSTIC) || defined(XEN)
4225 		pmap_pte_set(early_zero_pte, 0);
4226 		pmap_pte_flush();
4227 #endif /* defined(DIAGNOSTIC) || defined(XEN) */
4228 		kpreempt_enable();
4229 	} else {
4230 		/* XXX */
4231 		PMAP_SUBOBJ_LOCK(kpm, level - 1);
4232 		ptp = uvm_pagealloc(&kpm->pm_obj[level - 1],
4233 				    ptp_va2o(va, level), NULL,
4234 				    UVM_PGA_USERESERVE|UVM_PGA_ZERO);
4235 		PMAP_SUBOBJ_UNLOCK(kpm, level - 1);
4236 		if (ptp == NULL)
4237 			panic("pmap_get_physpage: out of memory");
4238 		ptp->flags &= ~PG_BUSY;
4239 		ptp->wire_count = 1;
4240 		*paddrp = VM_PAGE_TO_PHYS(ptp);
4241 	}
4242 	pmap_stats_update(kpm, 1, 0);
4243 	return true;
4244 }
4245 
4246 /*
4247  * Allocate the specified number of PTPs for the given level, and populate
4248  * all levels below accordingly, mapping virtual addresses starting at
4249  * kva.
4250  *
4251  * Used by pmap_growkernel.
4252  */
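/*
 * implementation note: levels are walked from the top (lvl) downwards so
 * that a directory page is always installed before this function writes
 * into it through pdes[] at the next lower level; needed_ptps[] is
 * indexed by level - 1 and is computed by pmap_growkernel() below.
 */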
4253 static void
4254 pmap_alloc_level(pd_entry_t * const *pdes, vaddr_t kva, int lvl,
4255     long *needed_ptps)
4256 {
4257 	unsigned long i;
4258 	vaddr_t va;
4259 	paddr_t pa;
4260 	unsigned long index, endindex;
4261 	int level;
4262 	pd_entry_t *pdep;
4263 #ifdef XEN
4264 	int s = splvm(); /* protect xpq_* */
4265 #endif
4266 
4267 	for (level = lvl; level > 1; level--) {
4268 		if (level == PTP_LEVELS)
4269 			pdep = pmap_kernel()->pm_pdir;
4270 		else
4271 			pdep = pdes[level - 2];
4272 		va = kva;
4273 		index = pl_i_roundup(kva, level);
4274 		endindex = index + needed_ptps[level - 1] - 1;
4275 
4276 
4277 		for (i = index; i <= endindex; i++) {
4278 			KASSERT(!pmap_valid_entry(pdep[i]));
4279 			pmap_get_physpage(va, level - 1, &pa);
4280 #ifdef XEN
4281 			xpq_queue_pte_update((level == PTP_LEVELS) ?
4282 			    xpmap_ptom(pmap_pdirpa(pmap_kernel(), i)) :
4283 			    xpmap_ptetomach(&pdep[i]),
4284 			    pmap_pa2pte(pa) | PG_k | PG_V | PG_RW);
4285 #ifdef PAE
4286 			if (level == PTP_LEVELS && i > L2_SLOT_KERN) {
4287 				/* update real kernel PD too */
4288 				xpq_queue_pte_update(
4289 				    xpmap_ptetomach(&pmap_kl2pd[l2tol2(i)]),
4290 				    pmap_pa2pte(pa) | PG_k | PG_V | PG_RW);
4291 			}
4292 #endif
4293 #else /* XEN */
4294 			pdep[i] = pa | PG_RW | PG_V;
4295 #endif /* XEN */
4296 			KASSERT(level != PTP_LEVELS || nkptp[level - 1] +
4297 			    pl_i(VM_MIN_KERNEL_ADDRESS, level) == i);
4298 			nkptp[level - 1]++;
4299 			va += nbpd[level - 1];
4300 		}
4301 		pmap_pte_flush();
4302 	}
4303 #ifdef XEN
4304 	splx(s);
4305 #endif
4306 }
4307 
4308 /*
4309  * pmap_growkernel: increase usage of KVM space
4310  *
4311  * => we allocate new PTPs for the kernel and install them in all
4312  *	the pmaps on the system.
4313  */
4314 
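/*
 * implementation note: the requested address is rounded up to a PDE
 * boundary and PTPs are added level by level via pmap_alloc_level().
 * if the number of top-level entries grows, the new kernel PDEs are
 * copied into every pmap on the pmaps list (native) or pushed to the
 * hypervisor (XEN i386); on XEN/amd64 nothing is needed because kernel
 * entries never appear in user pmaps.  the PDP cache is invalidated so
 * freshly constructed PDPs pick up the new entries.
 */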
4315 vaddr_t
4316 pmap_growkernel(vaddr_t maxkvaddr)
4317 {
4318 	struct pmap *kpm = pmap_kernel();
4319 #if !defined(XEN) || !defined(__x86_64__)
4320 	struct pmap *pm;
4321 #endif
4322 	int s, i;
4323 	long needed_kptp[PTP_LEVELS], target_nptp, old;
4324 	bool invalidate = false;
4325 
4326 	s = splvm();	/* to be safe */
4327 	mutex_enter(&kpm->pm_lock);
4328 
4329 	if (maxkvaddr <= pmap_maxkvaddr) {
4330 		mutex_exit(&kpm->pm_lock);
4331 		splx(s);
4332 		return pmap_maxkvaddr;
4333 	}
4334 
4335 	maxkvaddr = x86_round_pdr(maxkvaddr);
4336 	old = nkptp[PTP_LEVELS - 1];
4337 	/*
4338 	 * This loop could be optimized more, but pmap_growkernel()
4339 	 * is called infrequently.
4340 	 */
4341 	for (i = PTP_LEVELS - 1; i >= 1; i--) {
4342 		target_nptp = pl_i_roundup(maxkvaddr, i + 1) -
4343 		    pl_i_roundup(VM_MIN_KERNEL_ADDRESS, i + 1);
4344 		/*
4345 		 * XXX only need to check toplevel.
4346 		 */
4347 		if (target_nptp > nkptpmax[i])
4348 			panic("out of KVA space");
4349 		KASSERT(target_nptp >= nkptp[i]);
4350 		needed_kptp[i] = target_nptp - nkptp[i];
4351 	}
4352 
4353 	pmap_alloc_level(normal_pdes, pmap_maxkvaddr, PTP_LEVELS, needed_kptp);
4354 
4355 	/*
4356 	 * If the number of top level entries changed, update all
4357 	 * pmaps.
4358 	 */
4359 	if (needed_kptp[PTP_LEVELS - 1] != 0) {
4360 #ifdef XEN
4361 #ifdef __x86_64__
4362 		/* nothing, kernel entries are never entered in user pmap */
4363 #else /* __x86_64__ */
4364 		mutex_enter(&pmaps_lock);
4365 		LIST_FOREACH(pm, &pmaps, pm_list) {
4366 			int pdkidx;
4367 			for (pdkidx =  PDIR_SLOT_KERN + old;
4368 			    pdkidx < PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1];
4369 			    pdkidx++) {
4370 				xpq_queue_pte_update(
4371 				    xpmap_ptom(pmap_pdirpa(pm, pdkidx)),
4372 				    kpm->pm_pdir[pdkidx]);
4373 			}
4374 			xpq_flush_queue();
4375 		}
4376 		mutex_exit(&pmaps_lock);
4377 #endif /* __x86_64__ */
4378 #else /* XEN */
4379 		unsigned newpdes;
4380 		newpdes = nkptp[PTP_LEVELS - 1] - old;
4381 		mutex_enter(&pmaps_lock);
4382 		LIST_FOREACH(pm, &pmaps, pm_list) {
4383 			memcpy(&pm->pm_pdir[PDIR_SLOT_KERN + old],
4384 			       &kpm->pm_pdir[PDIR_SLOT_KERN + old],
4385 			       newpdes * sizeof (pd_entry_t));
4386 		}
4387 		mutex_exit(&pmaps_lock);
4388 #endif
4389 		invalidate = true;
4390 	}
4391 	pmap_maxkvaddr = maxkvaddr;
4392 	mutex_exit(&kpm->pm_lock);
4393 	splx(s);
4394 
4395 	if (invalidate) {
4396 		/* Invalidate the PDP cache. */
4397 		pool_cache_invalidate(&pmap_pdp_cache);
4398 	}
4399 
4400 	return maxkvaddr;
4401 }
4402 
4403 #ifdef DEBUG
4404 void pmap_dump(struct pmap *, vaddr_t, vaddr_t);
4405 
4406 /*
4407  * pmap_dump: dump all the mappings from a pmap
4408  *
4409  * => caller should not be holding any pmap locks
4410  */
4411 
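/*
 * illustrative use (assumption, typically from a debugger): an eva of 0
 * (or anything <= sva) dumps everything up to VM_MAXUSER_ADDRESS, e.g.
 *
 *	pmap_dump(map->pmap, 0, 0);
 */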
4412 void
4413 pmap_dump(struct pmap *pmap, vaddr_t sva, vaddr_t eva)
4414 {
4415 	pt_entry_t *ptes, *pte;
4416 	pd_entry_t * const *pdes;
4417 	struct pmap *pmap2;
4418 	vaddr_t blkendva;
4419 
4420 	/*
4421 	 * if the end is out of range, truncate it.
4422 	 * if the end does not lie beyond the start, dump up to the maximum.
4423 	 */
4424 
4425 	if (eva > VM_MAXUSER_ADDRESS || eva <= sva)
4426 		eva = VM_MAXUSER_ADDRESS;
4427 
4428 	/*
4429 	 * we lock in the pmap => pv_head direction
4430 	 */
4431 
4432 	kpreempt_disable();
4433 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);	/* locks pmap */
4434 
4435 	/*
4436 	 * dumping a range of pages: we dump in PTP sized blocks (NBPD_L2 bytes)
4437 	 */
4438 
4439 	for (/* null */ ; sva < eva ; sva = blkendva) {
4440 
4441 		/* determine range of block */
4442 		blkendva = x86_round_pdr(sva+1);
4443 		if (blkendva > eva)
4444 			blkendva = eva;
4445 
4446 		/* valid block? */
4447 		if (!pmap_pdes_valid(sva, pdes, NULL))
4448 			continue;
4449 
4450 		pte = &ptes[pl1_i(sva)];
4451 		for (/* null */; sva < blkendva ; sva += PAGE_SIZE, pte++) {
4452 			if (!pmap_valid_entry(*pte))
4453 				continue;
4454 			printf("va %#lx -> pa %#lx (pte=%#lx)\n",
4455 			       sva, (unsigned long)*pte,
4456 			       (unsigned long)pmap_pte2pa(*pte));
4457 		}
4458 	}
4459 	pmap_unmap_ptes(pmap, pmap2);
4460 	kpreempt_enable();
4461 }
4462 #endif
4463 
4464 /*
4465  * pmap_tlb_shootdown: invalidate pages on all CPUs using pmap 'pm'
4466  *
4467  * => always invalidates locally before returning
4468  * => returns before remote CPUs have invalidated
4469  * => must be called with preemption disabled
4470  */
4471 
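/*
 * interface notes (derived from the code below): an sva of (vaddr_t)-1
 * requests a full flush; an eva of 0 means a single page at sva; ranges
 * larger than 32 pages are collapsed into a full flush; and the PG_G bit
 * of 'pte' decides whether global mappings must be invalidated as well
 * (tlbflushg() vs tlbflush()).  a typical per-page call, as used
 * elsewhere in this file, is:
 *
 *	pmap_tlb_shootdown(pmap, va, 0, opte);
 */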
4472 void
4473 pmap_tlb_shootdown(struct pmap *pm, vaddr_t sva, vaddr_t eva, pt_entry_t pte)
4474 {
4475 #ifdef MULTIPROCESSOR
4476 	extern bool x86_mp_online;
4477 	struct cpu_info *ci;
4478 	struct pmap_mbox *mb, *selfmb;
4479 	CPU_INFO_ITERATOR cii;
4480 	uintptr_t head;
4481 	u_int count;
4482 	int s;
4483 #endif	/* MULTIPROCESSOR */
4484 	struct cpu_info *self;
4485 	bool kernel;
4486 
4487 	KASSERT(eva == 0 || eva >= sva);
4488 	KASSERT(kpreempt_disabled());
4489 
4490 	if (pte & PG_PS)
4491 		sva &= PG_LGFRAME;
4492 	pte &= PG_G;
4493 	self = curcpu();
4494 
4495 	if (sva == (vaddr_t)-1LL) {
4496 		kernel = true;
4497 	} else {
4498 		if (eva == 0)
4499 			eva = sva + PAGE_SIZE;
4500 		kernel = sva >= VM_MAXUSER_ADDRESS;
4501 		KASSERT(kernel == (eva > VM_MAXUSER_ADDRESS));
4502 	}
4503 
4504 	/*
4505 	 * if tearing down the pmap, do nothing.  we'll flush later
4506 	 * when we're ready to recycle/destroy it.
4507 	 */
4508 	if (__predict_false(curlwp->l_md.md_gc_pmap == pm)) {
4509 		return;
4510 	}
4511 
4512 	/*
4513 	 * If the range is larger than 32 pages, then invalidate
4514 	 * everything.
4515 	 */
4516 	if (sva != (vaddr_t)-1LL && eva - sva > (32 * PAGE_SIZE)) {
4517 		sva = (vaddr_t)-1LL;
4518 		eva = sva;
4519 	}
4520 
4521 #ifdef MULTIPROCESSOR
4522 	if (ncpu > 1 && x86_mp_online) {
4523 		selfmb = &self->ci_pmap_cpu->pc_mbox;
4524 
4525 		/*
4526 		 * If the CPUs have no notion of global pages then
4527 		 * reload of %cr3 is sufficient.
4528 		 */
4529 		if (pte != 0 && (cpu_feature & CPUID_PGE) == 0)
4530 			pte = 0;
4531 
4532 		if (pm == pmap_kernel()) {
4533 			/*
4534 			 * Mapped on all CPUs: use the broadcast mechanism.
4535 			 * Once we have the lock, increment the counter.
4536 			 */
4537 			s = splvm();
4538 			mb = &pmap_mbox;
4539 			count = SPINLOCK_BACKOFF_MIN;
4540 			do {
4541 				if ((head = mb->mb_head) != mb->mb_tail) {
4542 					splx(s);
4543 					while ((head = mb->mb_head) !=
4544 					    mb->mb_tail)
4545 						SPINLOCK_BACKOFF(count);
4546 					s = splvm();
4547 				}
4548 			} while (atomic_cas_ulong(
4549 			    (volatile u_long *)&mb->mb_head,
4550 			    head, head + ncpu - 1) != head);
4551 
4552 			/*
4553 			 * Once underway we must stay at IPL_VM until the
4554 			 * IPI is dispatched.  Otherwise interrupt handlers
4555 			 * on this CPU can deadlock against us.
4556 			 */
4557 			pmap_tlb_evcnt.ev_count++;
4558 			mb->mb_pointer = self;
4559 			mb->mb_addr1 = sva;
4560 			mb->mb_addr2 = eva;
4561 			mb->mb_global = pte;
4562 			x86_ipi(LAPIC_TLB_BCAST_VECTOR, LAPIC_DEST_ALLEXCL,
4563 			    LAPIC_DLMODE_FIXED);
4564 			self->ci_need_tlbwait = 1;
4565 			splx(s);
4566 		} else if ((pm->pm_cpus & ~self->ci_cpumask) != 0 ||
4567 		    (kernel && (pm->pm_kernel_cpus & ~self->ci_cpumask) != 0)) {
4568 			/*
4569 			 * We don't bother traversing the CPU list if only
4570 			 * used by this CPU.
4571 			 *
4572 			 * We can't do global flushes with the multicast
4573 			 * mechanism.
4574 			 */
4575 			KASSERT(pte == 0);
4576 
4577 			/*
4578 			 * Take ownership of the shootdown mailbox on each
4579 			 * CPU, fill the details and fire it off.
4580 			 */
4581 			s = splvm();
4582 			for (CPU_INFO_FOREACH(cii, ci)) {
4583 				if (ci == self ||
4584 				    !pmap_is_active(pm, ci, kernel) ||
4585 				    !(ci->ci_flags & CPUF_RUNNING))
4586 					continue;
4587 				selfmb->mb_head++;
4588 				mb = &ci->ci_pmap_cpu->pc_mbox;
4589 				count = SPINLOCK_BACKOFF_MIN;
4590 				while (atomic_cas_ulong(
4591 				    (u_long *)&mb->mb_pointer,
4592 				    0, (u_long)&selfmb->mb_tail) != 0) {
4593 				    	splx(s);
4594 					while (mb->mb_pointer != 0)
4595 						SPINLOCK_BACKOFF(count);
4596 					s = splvm();
4597 				}
4598 				mb->mb_addr1 = sva;
4599 				mb->mb_addr2 = eva;
4600 				mb->mb_global = pte;
4601 				if (x86_ipi(LAPIC_TLB_MCAST_VECTOR,
4602 				    ci->ci_cpuid, LAPIC_DLMODE_FIXED))
4603 					panic("pmap_tlb_shootdown: ipi failed");
4604 			}
4605 			self->ci_need_tlbwait = 1;
4606 			splx(s);
4607 		}
4608 	}
4609 #endif	/* MULTIPROCESSOR */
4610 
4611 	/* Update the current CPU before waiting for others. */
4612 	if (!pmap_is_active(pm, self, kernel))
4613 		return;
4614 
4615 	if (sva == (vaddr_t)-1LL) {
4616 		if (pte != 0)
4617 			tlbflushg();
4618 		else
4619 			tlbflush();
4620 	} else {
4621 		do {
4622 			pmap_update_pg(sva);
4623 			sva += PAGE_SIZE;
4624 		} while (sva < eva);
4625 	}
4626 }
4627 
4628 /*
4629  * pmap_tlb_shootwait: wait for pending TLB shootdowns to complete
4630  *
4631  * => only waits for operations generated by the current CPU
4632  * => must be called with preemption disabled
4633  */
4634 
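/*
 * implementation note: this spins with x86_pause() until the broadcast
 * mailbox (if this CPU owns it) and this CPU's own multicast mailbox
 * have both drained; it does not wait for shootdowns initiated by other
 * CPUs.
 */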
4635 void
4636 pmap_tlb_shootwait(void)
4637 {
4638 	struct cpu_info *self;
4639 	struct pmap_mbox *mb;
4640 
4641 	KASSERT(kpreempt_disabled());
4642 
4643 	/*
4644 	 * Anything to do?  XXX Really we want to avoid touching the cache
4645 	 * lines of the two mailboxes, but the processor may read ahead.
4646 	 */
4647 	self = curcpu();
4648 	if (!self->ci_need_tlbwait)
4649 		return;
4650 	self->ci_need_tlbwait = 0;
4651 
4652 	/* If we own the global mailbox, wait for it to drain. */
4653 	mb = &pmap_mbox;
4654 	while (mb->mb_pointer == self && mb->mb_head != mb->mb_tail)
4655 		x86_pause();
4656 
4657 	/* If we own other CPU's mailboxes, wait for them to drain. */
4658 	mb = &self->ci_pmap_cpu->pc_mbox;
4659 	KASSERT(mb->mb_pointer != &mb->mb_tail);
4660 	while (mb->mb_head != mb->mb_tail)
4661 		x86_pause();
4662 }
4663 
4664 /*
4665  * pmap_update: process deferred invalidations
4666  */
4667 
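/*
 * usage note (assumption, per the MI pmap interface): callers batch
 * mapping updates and then call pmap_update() once, e.g. the loop in
 * pmap_init_tmp_pgtbl() below does
 *
 *	pmap_kenter_pa(va, pa, VM_PROT_READ | VM_PROT_WRITE);
 *	pmap_update(pmap_kernel());
 */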
4668 void
4669 pmap_update(struct pmap *pmap)
4670 {
4671 	struct vm_page *ptp, *empty_ptps;
4672 	struct pmap_page *pp;
4673 	lwp_t *l;
4674 
4675 	/*
4676 	 * if we have torn down this pmap, invalidate non-global TLB
4677 	 * entries on any processors using it.
4678 	 */
4679 	l = curlwp;
4680 	if (__predict_false(l->l_md.md_gc_pmap == pmap)) {
4681 		l->l_md.md_gc_pmap = NULL;
4682 		KPREEMPT_DISABLE(l);
4683 		pmap_tlb_shootdown(pmap, -1, -1, 0);
4684 		KPREEMPT_ENABLE(l);
4685 	}
4686 
4687 	/*
4688 	 * wait for tlb shootdowns to complete before returning control
4689 	 * to the caller.
4690 	 */
4691 	kpreempt_disable();
4692 	pmap_tlb_shootwait();
4693 	kpreempt_enable();
4694 
4695 	/*
4696 	 * now that shootdowns are complete, process deferred frees,
4697 	 * but not from interrupt context.
4698 	 */
4699 	if (l->l_md.md_gc_ptp != NULL) {
4700 		if (cpu_intr_p() || (l->l_pflag & LP_INTR) != 0) {
4701 			return;
4702 		}
4703 
4704 		empty_ptps = l->l_md.md_gc_ptp;
4705 		l->l_md.md_gc_ptp = NULL;
4706 
4707 		while ((ptp = empty_ptps) != NULL) {
4708 			ptp->flags |= PG_ZERO;
4709 			pp = VM_PAGE_TO_PP(ptp);
4710 			empty_ptps = pp->pp_link;
4711 			LIST_INIT(&pp->pp_head.pvh_list);
4712 			uvm_pagefree(ptp);
4713 		}
4714 	}
4715 }
4716 
4717 #if PTP_LEVELS > 4
4718 #error "Unsupported number of page table mappings"
4719 #endif
4720 
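/*
 * pmap_init_tmp_pgtbl: build a throwaway page-table hierarchy, in the
 * statically reserved low physical pages (4-7 * PAGE_SIZE), that maps
 * page 'pg' and mirrors the kernel's top-level directory.
 *
 * => returns the physical address of the top-level table; the "real
 *	mode PML" panic below hints that low-level trampoline code is the
 *	intended consumer (assumption, not stated in this file).
 * => the temporary tables are mapped into KVA lazily on first use.
 */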
4721 paddr_t
4722 pmap_init_tmp_pgtbl(paddr_t pg)
4723 {
4724 	static bool maps_loaded;
4725 	static const paddr_t x86_tmp_pml_paddr[] = {
4726 	    4 * PAGE_SIZE,
4727 	    5 * PAGE_SIZE,
4728 	    6 * PAGE_SIZE,
4729 	    7 * PAGE_SIZE
4730 	};
4731 	static vaddr_t x86_tmp_pml_vaddr[] = { 0, 0, 0, 0 };
4732 
4733 	pd_entry_t *tmp_pml, *kernel_pml;
4734 
4735 	int level;
4736 
4737 	if (!maps_loaded) {
4738 		for (level = 0; level < PTP_LEVELS; ++level) {
4739 			x86_tmp_pml_vaddr[level] =
4740 			    uvm_km_alloc(kernel_map, PAGE_SIZE, 0,
4741 			    UVM_KMF_VAONLY);
4742 
4743 			if (x86_tmp_pml_vaddr[level] == 0)
4744 				panic("mapping of real mode PML failed\n");
4745 			pmap_kenter_pa(x86_tmp_pml_vaddr[level],
4746 			    x86_tmp_pml_paddr[level],
4747 			    VM_PROT_READ | VM_PROT_WRITE);
4748 			pmap_update(pmap_kernel());
4749 		}
4750 		maps_loaded = true;
4751 	}
4752 
4753 	/* Zero levels 1-3 */
4754 	for (level = 0; level < PTP_LEVELS - 1; ++level) {
4755 		tmp_pml = (void *)x86_tmp_pml_vaddr[level];
4756 		memset(tmp_pml, 0, PAGE_SIZE);
4757 	}
4758 
4759 	/* Copy PML4 */
4760 	kernel_pml = pmap_kernel()->pm_pdir;
4761 	tmp_pml = (void *)x86_tmp_pml_vaddr[PTP_LEVELS - 1];
4762 	memcpy(tmp_pml, kernel_pml, PAGE_SIZE);
4763 
4764 	/* Hook our own level 3 in */
4765 	tmp_pml[pl_i(pg, PTP_LEVELS)] =
4766 	    (x86_tmp_pml_paddr[PTP_LEVELS - 2] & PG_FRAME) | PG_RW | PG_V;
4767 
4768 	for (level = PTP_LEVELS - 1; level > 0; --level) {
4769 		tmp_pml = (void *)x86_tmp_pml_vaddr[level];
4770 
4771 		tmp_pml[pl_i(pg, level + 1)] =
4772 		    (x86_tmp_pml_paddr[level - 1] & PG_FRAME) | PG_RW | PG_V;
4773 	}
4774 
4775 	tmp_pml = (void *)x86_tmp_pml_vaddr[0];
4776 	tmp_pml[pl_i(pg, 1)] = (pg & PG_FRAME) | PG_RW | PG_V;
4777 
4778 	return x86_tmp_pml_paddr[PTP_LEVELS - 1];
4779 }
4780