xref: /netbsd-src/sys/arch/x86/x86/pmap.c (revision 7f21db1c0118155e0dd40b75182e30c589d9f63e)
1 /*	$NetBSD: pmap.c,v 1.102 2010/02/10 00:39:30 jym Exp $	*/
2 
3 /*
4  * Copyright (c) 2007 Manuel Bouyer.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  * 1. Redistributions of source code must retain the above copyright
10  *    notice, this list of conditions and the following disclaimer.
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in the
13  *    documentation and/or other materials provided with the distribution.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
16  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
17  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
18  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
19  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
20  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
21  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
22  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
24  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25  *
26  */
27 
28 /*
29  * Copyright (c) 2006 Mathieu Ropert <mro@adviseo.fr>
30  *
31  * Permission to use, copy, modify, and distribute this software for any
32  * purpose with or without fee is hereby granted, provided that the above
33  * copyright notice and this permission notice appear in all copies.
34  *
35  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
36  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
37  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
38  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
39  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
40  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
41  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
42  */
43 
44 /*
45  *
46  * Copyright (c) 1997 Charles D. Cranor and Washington University.
47  * All rights reserved.
48  *
49  * Redistribution and use in source and binary forms, with or without
50  * modification, are permitted provided that the following conditions
51  * are met:
52  * 1. Redistributions of source code must retain the above copyright
53  *    notice, this list of conditions and the following disclaimer.
54  * 2. Redistributions in binary form must reproduce the above copyright
55  *    notice, this list of conditions and the following disclaimer in the
56  *    documentation and/or other materials provided with the distribution.
57  * 3. All advertising materials mentioning features or use of this software
58  *    must display the following acknowledgement:
59  *      This product includes software developed by Charles D. Cranor and
60  *      Washington University.
61  * 4. The name of the author may not be used to endorse or promote products
62  *    derived from this software without specific prior written permission.
63  *
64  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
65  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
66  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
67  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
68  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
69  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
70  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
71  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
72  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
73  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
74  */
75 
76 /*
77  * Copyright 2001 (c) Wasabi Systems, Inc.
78  * All rights reserved.
79  *
80  * Written by Frank van der Linden for Wasabi Systems, Inc.
81  *
82  * Redistribution and use in source and binary forms, with or without
83  * modification, are permitted provided that the following conditions
84  * are met:
85  * 1. Redistributions of source code must retain the above copyright
86  *    notice, this list of conditions and the following disclaimer.
87  * 2. Redistributions in binary form must reproduce the above copyright
88  *    notice, this list of conditions and the following disclaimer in the
89  *    documentation and/or other materials provided with the distribution.
90  * 3. All advertising materials mentioning features or use of this software
91  *    must display the following acknowledgement:
92  *      This product includes software developed for the NetBSD Project by
93  *      Wasabi Systems, Inc.
94  * 4. The name of Wasabi Systems, Inc. may not be used to endorse
95  *    or promote products derived from this software without specific prior
96  *    written permission.
97  *
98  * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
99  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
100  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
101  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL WASABI SYSTEMS, INC
102  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
103  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
104  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
105  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
106  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
107  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
108  * POSSIBILITY OF SUCH DAMAGE.
109  */
110 
111 /*
112  * This is the i386 pmap modified and generalized to support x86-64
113  * as well. The idea is to hide the upper N levels of the page tables
114  * inside pmap_get_ptp, pmap_free_ptp and pmap_growkernel. The rest
115  * is mostly untouched, except that it uses some more generalized
116  * macros and interfaces.
117  *
118  * This pmap has been tested on the i386 as well, and it can be easily
119  * adapted to PAE.
120  *
121  * fvdl@wasabisystems.com 18-Jun-2001
122  */
123 
124 /*
125  * pmap.c: i386 pmap module rewrite
126  * Chuck Cranor <chuck@ccrc.wustl.edu>
127  * 11-Aug-97
128  *
129  * history of this pmap module: in addition to my own input, i used
130  *    the following references for this rewrite of the i386 pmap:
131  *
132  * [1] the NetBSD i386 pmap.   this pmap appears to be based on the
133  *     BSD hp300 pmap done by Mike Hibler at University of Utah.
134  *     it was then ported to the i386 by William Jolitz of UUNET
135  *     Technologies, Inc.   Then Charles M. Hannum of the NetBSD
136  *     project fixed some bugs and provided some speed ups.
137  *
138  * [2] the FreeBSD i386 pmap.   this pmap seems to be the
139  *     Hibler/Jolitz pmap, as modified for FreeBSD by John S. Dyson
140  *     and David Greenman.
141  *
142  * [3] the Mach pmap.   this pmap, from CMU, seems to have migrated
143  *     between several processors.   the VAX version was done by
144  *     Avadis Tevanian, Jr., and Michael Wayne Young.    the i386
145  *     version was done by Lance Berc, Mike Kupfer, Bob Baron,
146  *     David Golub, and Richard Draves.    the alpha version was
147  *     done by Alessandro Forin (CMU/Mach) and Chris Demetriou
148  *     (NetBSD/alpha).
149  */
150 
151 #include <sys/cdefs.h>
152 __KERNEL_RCSID(0, "$NetBSD: pmap.c,v 1.102 2010/02/10 00:39:30 jym Exp $");
153 
154 #include "opt_user_ldt.h"
155 #include "opt_lockdebug.h"
156 #include "opt_multiprocessor.h"
157 #include "opt_xen.h"
158 #if !defined(__x86_64__)
159 #include "opt_kstack_dr0.h"
160 #endif /* !defined(__x86_64__) */
161 
162 #include <sys/param.h>
163 #include <sys/systm.h>
164 #include <sys/proc.h>
165 #include <sys/pool.h>
166 #include <sys/kernel.h>
167 #include <sys/atomic.h>
168 #include <sys/cpu.h>
169 #include <sys/intr.h>
170 #include <sys/xcall.h>
171 
172 #include <uvm/uvm.h>
173 
174 #include <dev/isa/isareg.h>
175 
176 #include <machine/specialreg.h>
177 #include <machine/gdt.h>
178 #include <machine/isa_machdep.h>
179 #include <machine/cpuvar.h>
180 
181 #include <x86/pmap.h>
182 #include <x86/pmap_pv.h>
183 
184 #include <x86/i82489reg.h>
185 #include <x86/i82489var.h>
186 
187 #ifdef XEN
188 #include <xen/xen3-public/xen.h>
189 #include <xen/hypervisor.h>
190 #endif
191 
192 /* flag to be used for kernel mappings: PG_u on Xen/amd64, 0 otherwise */
193 #if defined(XEN) && defined(__x86_64__)
194 #define PG_k PG_u
195 #else
196 #define PG_k 0
197 #endif
198 
199 /*
200  * general info:
201  *
202  *  - for an explanation of how the i386 MMU hardware works see
203  *    the comments in <machine/pte.h>.
204  *
205  *  - for an explanation of the general memory structure used by
206  *    this pmap (including the recursive mapping), see the comments
207  *    in <machine/pmap.h>.
208  *
209  * this file contains the code for the "pmap module."   the module's
210  * job is to manage the hardware's virtual to physical address mappings.
211  * note that there are two levels of mapping in the VM system:
212  *
213  *  [1] the upper layer of the VM system uses vm_map's and vm_map_entry's
214  *      to map ranges of virtual address space to objects/files.  for
215  *      example, the vm_map may say: "map VA 0x1000 to 0x22000 read-only
216  *      to the file /bin/ls starting at offset zero."   note that
217  *      the upper layer mapping is not concerned with how individual
218  *      vm_pages are mapped.
219  *
220  *  [2] the lower layer of the VM system (the pmap) maintains the mappings
221  *      from virtual addresses.   it is concerned with which vm_page is
222  *      mapped where.   for example, when you run /bin/ls and start
223  *      at page 0x1000 the fault routine may lookup the correct page
224  *      of the /bin/ls file and then ask the pmap layer to establish
225  *      a mapping for it.
226  *
227  * note that information in the lower layer of the VM system can be
228  * thrown away since it can easily be reconstructed from the info
229  * in the upper layer.
230  *
231  * data structures we use include:
232  *
233  *  - struct pmap: describes the address space of one thread
234  *  - struct pv_entry: describes one <PMAP,VA> mapping of a PA
235  *  - struct pv_head: there is one pv_head per managed page of
236  *	physical memory.   the pv_head points to a list of pv_entry
237  *	structures which describe all the <PMAP,VA> pairs that this
238  *      page is mapped in.    this is critical for page based operations
239  *      such as pmap_page_protect() [change protection on _all_ mappings
240  *      of a page]
241  */
242 
243 /*
244  * memory allocation
245  *
246  *  - there are three data structures that we must dynamically allocate:
247  *
248  * [A] new process' page directory page (PDP)
249  *	- plan 1: done at pmap_create() we use
250  *	  uvm_km_alloc(kernel_map, PAGE_SIZE)  [fka kmem_alloc] to do this
251  *	  allocation.
252  *
253  * if we are low in free physical memory then we sleep in
254  * uvm_km_alloc -- in this case this is ok since we are creating
255  * a new pmap and should not be holding any locks.
256  *
257  * if the kernel is totally out of virtual space
258  * (i.e. uvm_km_alloc returns NULL), then we panic.
259  *
260  * [B] new page tables pages (PTP)
261  * 	- call uvm_pagealloc()
262  * 		=> success: zero page, add to pm_pdir
263  * 		=> failure: we are out of free vm_pages, let pmap_enter()
264  *		   tell UVM about it.
265  *
266  * note: for kernel PTPs, we start with NKPTP of them.   as we map
267  * kernel memory (at uvm_map time) we check to see if we've grown
268  * the kernel pmap.   if so, we call the optional function
269  * pmap_growkernel() to grow the kernel PTPs in advance.
270  *
271  * [C] pv_entry structures
272  */
273 
274 /*
275  * locking
276  *
277  * we have the following locks that we must contend with:
278  *
279  * mutexes:
280  *
281  * - pmap lock (per pmap, part of uvm_object)
282  *   this lock protects the fields in the pmap structure including
283  *   the non-kernel PDEs in the PDP, and the PTEs.  it also locks
284  *   in the alternate PTE space (since that is determined by the
285  *   entry in the PDP).
286  *
287  * - pvh_lock (per pv_head)
288  *   this lock protects the pv_entry list which is chained off the
289  *   pv_head structure for a specific managed PA.   it is locked
290  *   when traversing the list (e.g. adding/removing mappings,
291  *   syncing R/M bits, etc.)
292  *
293  * - pmaps_lock
294  *   this lock protects the list of active pmaps (headed by "pmaps").
295  *   we lock it when adding or removing pmaps from this list.
296  *
297  * tlb shootdown
298  *
299  * tlb shootdowns are hard interrupts that operate outside the spl
300  * framework: they don't need to be blocked provided that the pmap module
301  * gets the order of events correct.  the calls are made by talking directly
302  * to the lapic.  the stubs to handle the interrupts are quite short and do
303  * one of the following: invalidate a single page, a range of pages, all
304  * user tlb entries or the entire tlb.
305  *
306  * the cpus synchronize with each other using pmap_mbox structures which are
307  * aligned on 64-byte cache lines.  tlb shootdowns against the kernel pmap
308  * use a global mailbox and are generated using a broadcast ipi (broadcast
309  * to all but the sending cpu).  shootdowns against regular pmaps use
310  * per-cpu mailboxes and are multicast.  kernel and user shootdowns can
311  * execute simultaneously, as can shootdowns within different multithreaded
312  * processes.  TODO:
313  *
314  *   1. figure out which waitpoints can be deferered to pmap_update().
315  *   2. see if there is a cheap way to batch some updates.
316  */
317 
318 const vaddr_t ptp_masks[] = PTP_MASK_INITIALIZER;
319 const int ptp_shifts[] = PTP_SHIFT_INITIALIZER;
320 const long nkptpmax[] = NKPTPMAX_INITIALIZER;
321 const long nbpd[] = NBPD_INITIALIZER;
322 pd_entry_t * const normal_pdes[] = PDES_INITIALIZER;
323 pd_entry_t * const alternate_pdes[] = APDES_INITIALIZER;
324 
325 long nkptp[] = NKPTP_INITIALIZER;
326 
327 static kmutex_t pmaps_lock;
328 
329 static vaddr_t pmap_maxkvaddr;
330 
331 #define COUNT(x)	/* nothing */
332 
333 /*
334  * XXX kludge: dummy locking to make KASSERTs in uvm_page.c comfortable.
335  * actual locking is done by pm_lock.
336  */
337 #if defined(DIAGNOSTIC)
338 #define	PMAP_SUBOBJ_LOCK(pm, idx) \
339 	KASSERT(mutex_owned(&(pm)->pm_lock)); \
340 	if ((idx) != 0) \
341 		mutex_enter(&(pm)->pm_obj[(idx)].vmobjlock)
342 #define	PMAP_SUBOBJ_UNLOCK(pm, idx) \
343 	KASSERT(mutex_owned(&(pm)->pm_lock)); \
344 	if ((idx) != 0) \
345 		mutex_exit(&(pm)->pm_obj[(idx)].vmobjlock)
346 #else /* defined(DIAGNOSTIC) */
347 #define	PMAP_SUBOBJ_LOCK(pm, idx)	/* nothing */
348 #define	PMAP_SUBOBJ_UNLOCK(pm, idx)	/* nothing */
349 #endif /* defined(DIAGNOSTIC) */
350 
351 /*
352  * Misc. event counters.
353  */
354 struct evcnt pmap_iobmp_evcnt;
355 struct evcnt pmap_ldt_evcnt;
356 
357 /*
358  * Global TLB shootdown mailbox.
359  */
360 struct evcnt pmap_tlb_evcnt __aligned(64);
361 struct pmap_mbox pmap_mbox __aligned(64);
362 
363 /*
364  * Per-CPU data.  The pmap mailbox is cache intensive so gets its
365  * own line.  Note that the mailbox must be the first item.
366  */
367 struct pmap_cpu {
368 	/* TLB shootdown */
369 	struct pmap_mbox pc_mbox;
370 };
371 
372 union {
373 	struct pmap_cpu pc;
374 	uint8_t padding[64];
375 } pmap_cpu[MAXCPUS] __aligned(64);
376 
377 /*
378  * global data structures
379  */
380 
381 static struct pmap kernel_pmap_store;	/* the kernel's pmap (proc0) */
382 struct pmap *const kernel_pmap_ptr = &kernel_pmap_store;
383 
384 /*
385  * pmap_pg_g: if our processor supports PG_G in the PTE then we
386  * set pmap_pg_g to PG_G (otherwise it is zero).
387  */
388 
389 int pmap_pg_g = 0;
390 
391 /*
392  * pmap_largepages: if our processor supports PG_PS and we are
393  * using it, this is set to true.
394  */
395 
396 int pmap_largepages;
397 
398 /*
399  * i386 physical memory comes in a big contig chunk with a small
400  * hole toward the front of it...  the following two paddr_t's
401  * (shared with machdep.c) describe the physical address space
402  * of this machine.
403  */
404 paddr_t avail_start;	/* PA of first available physical page */
405 paddr_t avail_end;	/* PA of last available physical page */
406 
407 #ifdef XEN
408 #ifdef __x86_64__
409 /* Dummy PGD for user cr3, used between pmap_deactivate() and pmap_activate() */
410 static paddr_t xen_dummy_user_pgd;
411 /* Currently active user PGD (can't use rcr3()) */
412 static paddr_t xen_current_user_pgd = 0;
413 #endif /* __x86_64__ */
414 paddr_t pmap_pa_start; /* PA of first physical page for this domain */
415 paddr_t pmap_pa_end;   /* PA of last physical page for this domain */
416 #endif /* XEN */
417 
418 #define	VM_PAGE_TO_PP(pg)	(&(pg)->mdpage.mp_pp)
419 
420 #define	pp_lock(pp)	mutex_spin_enter(&(pp)->pp_lock)
421 #define	pp_unlock(pp)	mutex_spin_exit(&(pp)->pp_lock)
422 #define	pp_locked(pp)	mutex_owned(&(pp)->pp_lock)
423 
424 #define	PV_HASH_SIZE		32768
425 #define	PV_HASH_LOCK_CNT	32
426 
427 struct pv_hash_lock {
428 	kmutex_t lock;
429 } __aligned(CACHE_LINE_SIZE) pv_hash_locks[PV_HASH_LOCK_CNT]
430     __aligned(CACHE_LINE_SIZE);
431 
432 struct pv_hash_head {
433 	SLIST_HEAD(, pv_entry) hh_list;
434 } pv_hash_heads[PV_HASH_SIZE];
435 
436 static u_int
437 pvhash_hash(struct vm_page *ptp, vaddr_t va)
438 {
439 
440 	return (uintptr_t)ptp / sizeof(*ptp) + (va >> PAGE_SHIFT);
441 }
442 
443 static struct pv_hash_head *
444 pvhash_head(u_int hash)
445 {
446 
447 	return &pv_hash_heads[hash % PV_HASH_SIZE];
448 }
449 
450 static kmutex_t *
451 pvhash_lock(u_int hash)
452 {
453 
454 	return &pv_hash_locks[hash % PV_HASH_LOCK_CNT].lock;
455 }
456 
457 static struct pv_entry *
458 pvhash_remove(struct pv_hash_head *hh, struct vm_page *ptp, vaddr_t va)
459 {
460 	struct pv_entry *pve;
461 	struct pv_entry *prev;
462 
463 	prev = NULL;
464 	SLIST_FOREACH(pve, &hh->hh_list, pve_hash) {
465 		if (pve->pve_pte.pte_ptp == ptp &&
466 		    pve->pve_pte.pte_va == va) {
467 			if (prev != NULL) {
468 				SLIST_REMOVE_AFTER(prev, pve_hash);
469 			} else {
470 				SLIST_REMOVE_HEAD(&hh->hh_list, pve_hash);
471 			}
472 			break;
473 		}
474 		prev = pve;
475 	}
476 	return pve;
477 }
478 
479 /*
480  * other data structures
481  */
482 
483 static pt_entry_t protection_codes[8];	/* maps MI prot to i386 prot code */
484 static bool pmap_initialized = false;	/* pmap_init done yet? */
485 
486 /*
487  * the following two vaddr_t's are used during system startup
488  * to keep track of how much of the kernel's VM space we have used.
489  * once the system is started, the management of the remaining kernel
490  * VM space is turned over to the kernel_map vm_map.
491  */
492 
493 static vaddr_t virtual_avail;	/* VA of first free KVA */
494 static vaddr_t virtual_end;	/* VA of last free KVA */
495 
496 /*
497  * linked list of all non-kernel pmaps
498  */
499 
500 static struct pmap_head pmaps;
501 
502 /*
503  * pool that pmap structures are allocated from
504  */
505 
506 static struct pool_cache pmap_cache;
507 
508 /*
509  * pv_entry cache
510  */
511 
512 static struct pool_cache pmap_pv_cache;
513 
514 /*
515  * MULTIPROCESSOR: special VA's/ PTE's are actually allocated inside a
516  * maxcpus*NPTECL array of PTE's, to avoid cache line thrashing
517  * due to false sharing.
518  */
519 
520 #ifdef MULTIPROCESSOR
521 #define PTESLEW(pte, id) ((pte)+(id)*NPTECL)
522 #define VASLEW(va,id) ((va)+(id)*NPTECL*PAGE_SIZE)
523 #else
524 #define PTESLEW(pte, id) (pte)
525 #define VASLEW(va,id) (va)
526 #endif
527 
528 /*
529  * special VAs and the PTEs that map them
530  */
531 static pt_entry_t *csrc_pte, *cdst_pte, *zero_pte, *ptp_pte, *early_zero_pte;
532 static char *csrcp, *cdstp, *zerop, *ptpp, *early_zerop;
533 
534 /*
535  * pool and cache that PDPs are allocated from
536  */
537 
538 static struct pool_cache pmap_pdp_cache;
539 int	pmap_pdp_ctor(void *, void *, int);
540 void	pmap_pdp_dtor(void *, void *);
541 #ifdef PAE
542 /* need to allocate items of 4 pages */
543 void *pmap_pdp_alloc(struct pool *, int);
544 void pmap_pdp_free(struct pool *, void *);
545 static struct pool_allocator pmap_pdp_allocator = {
546 	.pa_alloc = pmap_pdp_alloc,
547 	.pa_free = pmap_pdp_free,
548 	.pa_pagesz = PAGE_SIZE * PDP_SIZE,
549 };
550 #endif /* PAE */
551 
552 void *vmmap; /* XXX: used by mem.c... it should really uvm_map_reserve it */
553 
554 extern vaddr_t idt_vaddr;			/* we allocate IDT early */
555 extern paddr_t idt_paddr;
556 
557 #ifdef _LP64
558 extern vaddr_t lo32_vaddr;
559 extern vaddr_t lo32_paddr;
560 #endif
561 
562 extern int end;
563 
564 #ifdef i386
565 /* stuff to fix the pentium f00f bug */
566 extern vaddr_t pentium_idt_vaddr;
567 #endif
568 
569 
570 /*
571  * local prototypes
572  */
573 
574 static struct vm_page	*pmap_get_ptp(struct pmap *, vaddr_t,
575 				      pd_entry_t * const *);
576 static struct vm_page	*pmap_find_ptp(struct pmap *, vaddr_t, paddr_t, int);
577 static void		 pmap_freepage(struct pmap *, struct vm_page *, int);
578 static void		 pmap_free_ptp(struct pmap *, struct vm_page *,
579 				       vaddr_t, pt_entry_t *,
580 				       pd_entry_t * const *);
581 static bool		 pmap_is_curpmap(struct pmap *);
582 static bool		 pmap_is_active(struct pmap *, struct cpu_info *, bool);
583 static void		 pmap_map_ptes(struct pmap *, struct pmap **,
584 				       pt_entry_t **, pd_entry_t * const **);
585 static bool		 pmap_remove_pte(struct pmap *, struct vm_page *,
586 					 pt_entry_t *, vaddr_t,
587 					 struct pv_entry **);
588 static pt_entry_t	 pmap_remove_ptes(struct pmap *, struct vm_page *,
589 					  vaddr_t, vaddr_t, vaddr_t,
590 					  struct pv_entry **);
591 
592 static void		 pmap_unmap_ptes(struct pmap *, struct pmap *);
593 static bool		 pmap_get_physpage(vaddr_t, int, paddr_t *);
594 static int		 pmap_pdes_invalid(vaddr_t, pd_entry_t * const *,
595 					   pd_entry_t *);
596 #define	pmap_pdes_valid(va, pdes, lastpde)	\
597 	(pmap_pdes_invalid((va), (pdes), (lastpde)) == 0)
598 static void		 pmap_alloc_level(pd_entry_t * const *, vaddr_t, int,
599 					  long *);
600 
601 static bool		 pmap_reactivate(struct pmap *);
602 
603 /*
604  * p m a p   h e l p e r   f u n c t i o n s
605  */
606 
607 static inline void
608 pmap_stats_update(struct pmap *pmap, int resid_diff, int wired_diff)
609 {
610 
611 	if (pmap == pmap_kernel()) {
612 		atomic_add_long(&pmap->pm_stats.resident_count, resid_diff);
613 		atomic_add_long(&pmap->pm_stats.wired_count, wired_diff);
614 	} else {
615 		KASSERT(mutex_owned(&pmap->pm_lock));
616 		pmap->pm_stats.resident_count += resid_diff;
617 		pmap->pm_stats.wired_count += wired_diff;
618 	}
619 }
620 
621 static inline void
622 pmap_stats_update_bypte(struct pmap *pmap, pt_entry_t npte, pt_entry_t opte)
623 {
624 	int resid_diff = ((npte & PG_V) ? 1 : 0) - ((opte & PG_V) ? 1 : 0);
625 	int wired_diff = ((npte & PG_W) ? 1 : 0) - ((opte & PG_W) ? 1 : 0);
626 
627 	KASSERT((npte & (PG_V | PG_W)) != PG_W);
628 	KASSERT((opte & (PG_V | PG_W)) != PG_W);
629 
630 	pmap_stats_update(pmap, resid_diff, wired_diff);
631 }
632 
633 /*
634  * ptp_to_pmap: lookup pmap by ptp
635  */
636 
637 static struct pmap *
638 ptp_to_pmap(struct vm_page *ptp)
639 {
640 	struct pmap *pmap;
641 
642 	if (ptp == NULL) {
643 		return pmap_kernel();
644 	}
645 	pmap = (struct pmap *)ptp->uobject;
646 	KASSERT(pmap != NULL);
647 	KASSERT(&pmap->pm_obj[0] == ptp->uobject);
648 	return pmap;
649 }
650 
651 static inline struct pv_pte *
652 pve_to_pvpte(struct pv_entry *pve)
653 {
654 
655 	KASSERT((void *)&pve->pve_pte == (void *)pve);
656 	return &pve->pve_pte;
657 }
658 
659 static inline struct pv_entry *
660 pvpte_to_pve(struct pv_pte *pvpte)
661 {
662 	struct pv_entry *pve = (void *)pvpte;
663 
664 	KASSERT(pve_to_pvpte(pve) == pvpte);
665 	return pve;
666 }
667 
668 /*
669  * pv_pte_first, pv_pte_next: PV list iterator.
670  */
671 
672 static struct pv_pte *
673 pv_pte_first(struct pmap_page *pp)
674 {
675 
676 	KASSERT(pp_locked(pp));
677 	if ((pp->pp_flags & PP_EMBEDDED) != 0) {
678 		return &pp->pp_pte;
679 	}
680 	return pve_to_pvpte(LIST_FIRST(&pp->pp_head.pvh_list));
681 }
682 
683 static struct pv_pte *
684 pv_pte_next(struct pmap_page *pp, struct pv_pte *pvpte)
685 {
686 
687 	KASSERT(pvpte != NULL);
688 	KASSERT(pp_locked(pp));
689 	if (pvpte == &pp->pp_pte) {
690 		KASSERT((pp->pp_flags & PP_EMBEDDED) != 0);
691 		return NULL;
692 	}
693 	KASSERT((pp->pp_flags & PP_EMBEDDED) == 0);
694 	return pve_to_pvpte(LIST_NEXT(pvpte_to_pve(pvpte), pve_list));
695 }
696 
697 /*
698  * pmap_is_curpmap: is this pmap the one currently loaded [in %cr3]?
699  *		of course the kernel is always loaded
700  */
701 
702 inline static bool
703 pmap_is_curpmap(struct pmap *pmap)
704 {
705 #if defined(XEN) && defined(__x86_64__)
706 	/*
707 	 * Only kernel pmap is physically loaded.
708 	 * User PGD may be active, but TLB will be flushed
709 	 * with HYPERVISOR_iret anyway, so let's say no
710 	 */
711 	return(pmap == pmap_kernel());
712 #else /* XEN && __x86_64__*/
713 	return((pmap == pmap_kernel()) ||
714 	       (pmap == curcpu()->ci_pmap));
715 #endif
716 }
717 
718 /*
719  * pmap_is_active: is this pmap loaded into the specified processor's %cr3?
720  */
721 
722 inline static bool
723 pmap_is_active(struct pmap *pmap, struct cpu_info *ci, bool kernel)
724 {
725 
726 	return (pmap == pmap_kernel() ||
727 	    (pmap->pm_cpus & ci->ci_cpumask) != 0 ||
728 	    (kernel && (pmap->pm_kernel_cpus & ci->ci_cpumask) != 0));
729 }
730 
731 static void
732 pmap_apte_flush(struct pmap *pmap)
733 {
734 
735 	KASSERT(kpreempt_disabled());
736 
737 	/*
738 	 * Flush the APTE mapping from all other CPUs that
739 	 * are using the pmap we are using (who's APTE space
740 	 * is the one we've just modified).
741 	 *
742 	 * XXXthorpej -- find a way to defer the IPI.
743 	 */
744 	pmap_tlb_shootdown(pmap, (vaddr_t)-1LL, 0, 0);
745 	pmap_tlb_shootwait();
746 }
747 
748 /*
749  *	Add a reference to the specified pmap.
750  */
751 
752 inline void
753 pmap_reference(struct pmap *pmap)
754 {
755 
756 	atomic_inc_uint(&pmap->pm_obj[0].uo_refs);
757 }
758 
759 /*
760  * pmap_map_ptes: map a pmap's PTEs into KVM and lock them in
761  *
762  * => we lock enough pmaps to keep things locked in
763  * => must be undone with pmap_unmap_ptes before returning
764  */
765 
766 static void
767 pmap_map_ptes(struct pmap *pmap, struct pmap **pmap2,
768     pd_entry_t **ptepp, pd_entry_t * const **pdeppp)
769 {
770 	pd_entry_t opde, npde;
771 	struct pmap *ourpmap;
772 	struct cpu_info *ci;
773 	struct lwp *l;
774 	bool iscurrent;
775 	uint64_t ncsw;
776 #ifdef XEN
777 	int s;
778 #endif
779 
780 	/* the kernel's pmap is always accessible */
781 	if (pmap == pmap_kernel()) {
782 		*pmap2 = NULL;
783 		*ptepp = PTE_BASE;
784 		*pdeppp = normal_pdes;
785 		return;
786 	}
787 	KASSERT(kpreempt_disabled());
788 
789  retry:
790 	l = curlwp;
791 	ncsw = l->l_ncsw;
792  	ourpmap = NULL;
793 	ci = curcpu();
794 #if defined(XEN) && defined(__x86_64__)
795 	/*
796 	 * curmap can only be pmap_kernel so at this point
797 	 * pmap_is_curpmap is always false
798 	 */
799 	iscurrent = 0;
800 	ourpmap = pmap_kernel();
801 #else /* XEN && __x86_64__*/
802 	if (ci->ci_want_pmapload &&
803 	    vm_map_pmap(&l->l_proc->p_vmspace->vm_map) == pmap) {
804 		pmap_load();
805 		if (l->l_ncsw != ncsw)
806 			goto retry;
807 	}
808 	iscurrent = pmap_is_curpmap(pmap);
809 	/* if curpmap then we are always mapped */
810 	if (iscurrent) {
811 		mutex_enter(&pmap->pm_lock);
812 		*pmap2 = NULL;
813 		*ptepp = PTE_BASE;
814 		*pdeppp = normal_pdes;
815 		goto out;
816 	}
817 	ourpmap = ci->ci_pmap;
818 #endif /* XEN && __x86_64__ */
819 
820 	/* need to lock both curpmap and pmap: use ordered locking */
821 	pmap_reference(ourpmap);
822 	if ((uintptr_t) pmap < (uintptr_t) ourpmap) {
823 		mutex_enter(&pmap->pm_lock);
824 		mutex_enter(&ourpmap->pm_lock);
825 	} else {
826 		mutex_enter(&ourpmap->pm_lock);
827 		mutex_enter(&pmap->pm_lock);
828 	}
829 
830 	if (l->l_ncsw != ncsw)
831 		goto unlock_and_retry;
832 
833 	/* need to load a new alternate pt space into curpmap? */
834 	COUNT(apdp_pde_map);
835 	opde = *APDP_PDE;
836 #ifdef XEN
837 	if (!pmap_valid_entry(opde) ||
838 	    pmap_pte2pa(opde) != pmap_pdirpa(pmap, 0)) {
839 		int i;
840 		s = splvm();
841 		/* Make recursive entry usable in user PGD */
842 		for (i = 0; i < PDP_SIZE; i++) {
843 			npde = pmap_pa2pte(
844 			    pmap_pdirpa(pmap, i * NPDPG)) | PG_k | PG_V;
845 			xpq_queue_pte_update(
846 			    xpmap_ptom(pmap_pdirpa(pmap, PDIR_SLOT_PTE + i)),
847 			    npde);
848 			xpq_queue_pte_update(xpmap_ptetomach(&APDP_PDE[i]),
849 			    npde);
850 #ifdef PAE
851 			/* update shadow entry too */
852 			xpq_queue_pte_update(
853 			    xpmap_ptetomach(&APDP_PDE_SHADOW[i]), npde);
854 #endif /* PAE */
855 			xpq_queue_invlpg(
856 			    (vaddr_t)&pmap->pm_pdir[PDIR_SLOT_PTE + i]);
857 		}
858 		xpq_flush_queue();
859 		if (pmap_valid_entry(opde))
860 			pmap_apte_flush(ourpmap);
861 		splx(s);
862 	}
863 #else /* XEN */
864 	npde = pmap_pa2pte(pmap_pdirpa(pmap, 0)) | PG_RW | PG_V;
865 	if (!pmap_valid_entry(opde) ||
866 	    pmap_pte2pa(opde) != pmap_pdirpa(pmap, 0)) {
867 		pmap_pte_set(APDP_PDE, npde);
868 		pmap_pte_flush();
869 		if (pmap_valid_entry(opde))
870 			pmap_apte_flush(ourpmap);
871 	}
872 #endif /* XEN */
873 	*pmap2 = ourpmap;
874 	*ptepp = APTE_BASE;
875 	*pdeppp = alternate_pdes;
876 	KASSERT(l->l_ncsw == ncsw);
877 #if !defined(XEN) || !defined(__x86_64__)
878  out:
879 #endif
880  	/*
881  	 * might have blocked, need to retry?
882  	 */
883 	if (l->l_ncsw != ncsw) {
884  unlock_and_retry:
885 	    	if (ourpmap != NULL) {
886 			mutex_exit(&ourpmap->pm_lock);
887 			pmap_destroy(ourpmap);
888 		}
889 		mutex_exit(&pmap->pm_lock);
890 		goto retry;
891 	}
892 
893 	return;
894 }
895 
896 /*
897  * pmap_unmap_ptes: unlock the PTE mapping of "pmap"
898  */
899 
900 static void
901 pmap_unmap_ptes(struct pmap *pmap, struct pmap *pmap2)
902 {
903 
904 	if (pmap == pmap_kernel()) {
905 		return;
906 	}
907 	KASSERT(kpreempt_disabled());
908 	if (pmap2 == NULL) {
909 		mutex_exit(&pmap->pm_lock);
910 	} else {
911 #if defined(XEN) && defined(__x86_64__)
912 		KASSERT(pmap2 == pmap_kernel());
913 #else
914 		KASSERT(curcpu()->ci_pmap == pmap2);
915 #endif
916 #if defined(MULTIPROCESSOR)
917 		pmap_pte_set(APDP_PDE, 0);
918 		pmap_pte_flush();
919 		pmap_apte_flush(pmap2);
920 #endif
921 		COUNT(apdp_pde_unmap);
922 		mutex_exit(&pmap->pm_lock);
923 		mutex_exit(&pmap2->pm_lock);
924 		pmap_destroy(pmap2);
925 	}
926 }
927 
928 inline static void
929 pmap_exec_account(struct pmap *pm, vaddr_t va, pt_entry_t opte, pt_entry_t npte)
930 {
931 
932 #if !defined(__x86_64__)
933 	if (curproc == NULL || curproc->p_vmspace == NULL ||
934 	    pm != vm_map_pmap(&curproc->p_vmspace->vm_map))
935 		return;
936 
937 	if ((opte ^ npte) & PG_X)
938 		pmap_update_pg(va);
939 
940 	/*
941 	 * Executability was removed on the last executable change.
942 	 * Reset the code segment to something conservative and
943 	 * let the trap handler deal with setting the right limit.
944 	 * We can't do that because of locking constraints on the vm map.
945 	 */
946 
947 	if ((opte & PG_X) && (npte & PG_X) == 0 && va == pm->pm_hiexec) {
948 		struct trapframe *tf = curlwp->l_md.md_regs;
949 
950 		tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL);
951 		pm->pm_hiexec = I386_MAX_EXE_ADDR;
952 	}
953 #endif /* !defined(__x86_64__) */
954 }
955 
956 #if !defined(__x86_64__)
957 /*
958  * Fixup the code segment to cover all potential executable mappings.
959  * returns 0 if no changes to the code segment were made.
960  */
961 
962 int
963 pmap_exec_fixup(struct vm_map *map, struct trapframe *tf, struct pcb *pcb)
964 {
965 	struct vm_map_entry *ent;
966 	struct pmap *pm = vm_map_pmap(map);
967 	vaddr_t va = 0;
968 
969 	vm_map_lock_read(map);
970 	for (ent = (&map->header)->next; ent != &map->header; ent = ent->next) {
971 
972 		/*
973 		 * This entry has greater va than the entries before.
974 		 * We need to make it point to the last page, not past it.
975 		 */
976 
977 		if (ent->protection & VM_PROT_EXECUTE)
978 			va = trunc_page(ent->end) - PAGE_SIZE;
979 	}
980 	vm_map_unlock_read(map);
981 	if (va == pm->pm_hiexec && tf->tf_cs == GSEL(GUCODEBIG_SEL, SEL_UPL))
982 		return (0);
983 
984 	pm->pm_hiexec = va;
985 	if (pm->pm_hiexec > I386_MAX_EXE_ADDR) {
986 		tf->tf_cs = GSEL(GUCODEBIG_SEL, SEL_UPL);
987 	} else {
988 		tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL);
989 		return (0);
990 	}
991 	return (1);
992 }
993 #endif /* !defined(__x86_64__) */
994 
995 /*
996  * p m a p   k e n t e r   f u n c t i o n s
997  *
998  * functions to quickly enter/remove pages from the kernel address
999  * space.   pmap_kremove is exported to MI kernel.  we make use of
1000  * the recursive PTE mappings.
1001  */
1002 
1003 /*
1004  * pmap_kenter_pa: enter a kernel mapping without R/M (pv_entry) tracking
1005  *
1006  * => no need to lock anything, assume va is already allocated
1007  * => should be faster than normal pmap enter function
1008  */
1009 
1010 void
1011 pmap_kenter_pa(vaddr_t va, paddr_t pa, vm_prot_t prot, u_int flags)
1012 {
1013 	pt_entry_t *pte, opte, npte;
1014 
1015 	KASSERT(!(prot & ~VM_PROT_ALL));
1016 
1017 	if (va < VM_MIN_KERNEL_ADDRESS)
1018 		pte = vtopte(va);
1019 	else
1020 		pte = kvtopte(va);
1021 #ifdef DOM0OPS
1022 	if (pa < pmap_pa_start || pa >= pmap_pa_end) {
1023 #ifdef DEBUG
1024 		printk("pmap_kenter_pa: pa 0x%" PRIx64 " for va 0x%" PRIx64
1025 		    " outside range\n", (int64_t)pa, (int64_t)va);
1026 #endif /* DEBUG */
1027 		npte = pa;
1028 	} else
1029 #endif /* DOM0OPS */
1030 		npte = pmap_pa2pte(pa);
1031 	npte |= protection_codes[prot] | PG_k | PG_V | pmap_pg_g;
1032 	if (flags & PMAP_NOCACHE)
1033 		npte |= PG_N;
1034 	opte = pmap_pte_testset(pte, npte); /* zap! */
1035 #if defined(DIAGNOSTIC)
1036 	/* XXX For now... */
1037 	if (opte & PG_PS)
1038 		panic("pmap_kenter_pa: PG_PS");
1039 #endif
1040 	if ((opte & (PG_V | PG_U)) == (PG_V | PG_U)) {
1041 		/* This should not happen, so no need to batch updates. */
1042 		kpreempt_disable();
1043 		pmap_tlb_shootdown(pmap_kernel(), va, 0, opte);
1044 		kpreempt_enable();
1045 	}
1046 }
1047 
1048 void
1049 pmap_emap_enter(vaddr_t va, paddr_t pa, vm_prot_t prot)
1050 {
1051 	pt_entry_t *pte, opte, npte;
1052 
1053 	KASSERT((prot & ~VM_PROT_ALL) == 0);
1054 	pte = (va < VM_MIN_KERNEL_ADDRESS) ? vtopte(va) : kvtopte(va);
1055 
1056 #ifdef DOM0OPS
1057 	if (pa < pmap_pa_start || pa >= pmap_pa_end) {
1058 		npte = pa;
1059 	} else
1060 #endif
1061 		npte = pmap_pa2pte(pa);
1062 
1063 	npte = pmap_pa2pte(pa);
1064 	npte |= protection_codes[prot] | PG_k | PG_V;
1065 	opte = pmap_pte_testset(pte, npte);
1066 }
1067 
1068 /*
1069  * pmap_emap_sync: perform TLB flush or pmap load, if it was deferred.
1070  */
1071 void
1072 pmap_emap_sync(bool canload)
1073 {
1074 	struct cpu_info *ci = curcpu();
1075 	struct pmap *pmap;
1076 
1077 	KASSERT(kpreempt_disabled());
1078 	if (__predict_true(ci->ci_want_pmapload && canload)) {
1079 		/*
1080 		 * XXX: Hint for pmap_reactivate(), which might suggest to
1081 		 * not perform TLB flush, if state has not changed.
1082 		 */
1083 		pmap = vm_map_pmap(&curlwp->l_proc->p_vmspace->vm_map);
1084 		if (__predict_false(pmap == ci->ci_pmap)) {
1085 			const uint32_t cpumask = ci->ci_cpumask;
1086 			atomic_and_32(&pmap->pm_cpus, ~cpumask);
1087 		}
1088 		pmap_load();
1089 		KASSERT(ci->ci_want_pmapload == 0);
1090 	} else {
1091 		tlbflush();
1092 	}
1093 
1094 }
1095 
1096 void
1097 pmap_emap_remove(vaddr_t sva, vsize_t len)
1098 {
1099 	pt_entry_t *pte, xpte;
1100 	vaddr_t va, eva = sva + len;
1101 
1102 	for (va = sva; va < eva; va += PAGE_SIZE) {
1103 		pte = (va < VM_MIN_KERNEL_ADDRESS) ? vtopte(va) : kvtopte(va);
1104 		xpte |= pmap_pte_testset(pte, 0);
1105 	}
1106 }
1107 
1108 #ifdef XEN
1109 /*
1110  * pmap_kenter_ma: enter a kernel mapping without R/M (pv_entry) tracking
1111  *
1112  * => no need to lock anything, assume va is already allocated
1113  * => should be faster than normal pmap enter function
1114  * => we expect a MACHINE address
1115  */
1116 
1117 void
1118 pmap_kenter_ma(vaddr_t va, paddr_t ma, vm_prot_t prot, u_int flags)
1119 {
1120 	pt_entry_t *pte, opte, npte;
1121 
1122 	if (va < VM_MIN_KERNEL_ADDRESS)
1123 		pte = vtopte(va);
1124 	else
1125 		pte = kvtopte(va);
1126 
1127 	npte = ma | ((prot & VM_PROT_WRITE) ? PG_RW : PG_RO) |
1128 	     PG_V | PG_k;
1129 	if (flags & PMAP_NOCACHE)
1130 		npte |= PG_N;
1131 
1132 #ifndef XEN
1133 	if ((cpu_feature & CPUID_NOX) && !(prot & VM_PROT_EXECUTE))
1134 		npte |= PG_NX;
1135 #endif
1136 	opte = pmap_pte_testset (pte, npte); /* zap! */
1137 
1138 	if (pmap_valid_entry(opte)) {
1139 #if defined(MULTIPROCESSOR)
1140 		kpreempt_disable();
1141 		pmap_tlb_shootdown(pmap_kernel(), va, 0, opte);
1142 		kpreempt_enable();
1143 #else
1144 		/* Don't bother deferring in the single CPU case. */
1145 		pmap_update_pg(va);
1146 #endif
1147 	}
1148 }
1149 #endif	/* XEN */
1150 
1151 #if defined(__x86_64__)
1152 /*
1153  * Change protection for a virtual address. Local for a CPU only, don't
1154  * care about TLB shootdowns.
1155  *
1156  * => must be called with preemption disabled
1157  */
1158 void
1159 pmap_changeprot_local(vaddr_t va, vm_prot_t prot)
1160 {
1161 	pt_entry_t *pte, opte, npte;
1162 
1163 	KASSERT(kpreempt_disabled());
1164 
1165 	if (va < VM_MIN_KERNEL_ADDRESS)
1166 		pte = vtopte(va);
1167 	else
1168 		pte = kvtopte(va);
1169 
1170 	npte = opte = *pte;
1171 
1172 	if ((prot & VM_PROT_WRITE) != 0)
1173 		npte |= PG_RW;
1174 	else
1175 		npte &= ~PG_RW;
1176 
1177 	if (opte != npte) {
1178 		pmap_pte_set(pte, npte);
1179 		pmap_pte_flush();
1180 		invlpg(va);
1181 	}
1182 }
1183 #endif /* defined(__x86_64__) */
1184 
1185 /*
1186  * pmap_kremove: remove a kernel mapping(s) without R/M (pv_entry) tracking
1187  *
1188  * => no need to lock anything
1189  * => caller must dispose of any vm_page mapped in the va range
1190  * => note: not an inline function
1191  * => we assume the va is page aligned and the len is a multiple of PAGE_SIZE
1192  * => we assume kernel only unmaps valid addresses and thus don't bother
1193  *    checking the valid bit before doing TLB flushing
1194  * => must be followed by call to pmap_update() before reuse of page
1195  */
1196 
1197 void
1198 pmap_kremove(vaddr_t sva, vsize_t len)
1199 {
1200 	pt_entry_t *pte, xpte;
1201 	vaddr_t va, eva;
1202 
1203 	eva = sva + len;
1204 	xpte = 0;
1205 
1206 	for (va = sva; va < eva; va += PAGE_SIZE) {
1207 		if (va < VM_MIN_KERNEL_ADDRESS)
1208 			pte = vtopte(va);
1209 		else
1210 			pte = kvtopte(va);
1211 		xpte |= pmap_pte_testset(pte, 0); /* zap! */
1212 #if defined(DIAGNOSTIC)
1213 		/* XXX For now... */
1214 		if (xpte & PG_PS)
1215 			panic("pmap_kremove: PG_PS");
1216 		if (xpte & PG_PVLIST)
1217 			panic("pmap_kremove: PG_PVLIST mapping for 0x%lx",
1218 			      va);
1219 #endif
1220 	}
1221 	if ((xpte & (PG_V | PG_U)) == (PG_V | PG_U)) {
1222 		kpreempt_disable();
1223 		pmap_tlb_shootdown(pmap_kernel(), sva, eva, xpte);
1224 		kpreempt_enable();
1225 	}
1226 }
1227 
1228 /*
1229  * p m a p   i n i t   f u n c t i o n s
1230  *
1231  * pmap_bootstrap and pmap_init are called during system startup
1232  * to init the pmap module.   pmap_bootstrap() does a low level
1233  * init just to get things rolling.   pmap_init() finishes the job.
1234  */
1235 
1236 /*
1237  * pmap_bootstrap: get the system in a state where it can run with VM
1238  *	properly enabled (called before main()).   the VM system is
1239  *      fully init'd later...
1240  *
1241  * => on i386, locore.s has already enabled the MMU by allocating
1242  *	a PDP for the kernel, and nkpde PTP's for the kernel.
1243  * => kva_start is the first free virtual address in kernel space
1244  */
1245 
1246 void
1247 pmap_bootstrap(vaddr_t kva_start)
1248 {
1249 	struct pmap *kpm;
1250 	pt_entry_t *pte;
1251 	struct pcb *pcb;
1252 	int i;
1253 	vaddr_t kva;
1254 #ifdef XEN
1255 	pt_entry_t pg_nx = 0;
1256 #else
1257 	unsigned long p1i;
1258 	vaddr_t kva_end;
1259 	pt_entry_t pg_nx = (cpu_feature & CPUID_NOX ? PG_NX : 0);
1260 #endif
1261 
1262 	/*
1263 	 * set up our local static global vars that keep track of the
1264 	 * usage of KVM before kernel_map is set up
1265 	 */
1266 
1267 	virtual_avail = kva_start;		/* first free KVA */
1268 	virtual_end = VM_MAX_KERNEL_ADDRESS;	/* last KVA */
1269 
1270 	/*
1271 	 * set up protection_codes: we need to be able to convert from
1272 	 * a MI protection code (some combo of VM_PROT...) to something
1273 	 * we can jam into a i386 PTE.
1274 	 */
1275 
1276 	protection_codes[VM_PROT_NONE] = pg_nx;			/* --- */
1277 	protection_codes[VM_PROT_EXECUTE] = PG_RO | PG_X;	/* --x */
1278 	protection_codes[VM_PROT_READ] = PG_RO | pg_nx;		/* -r- */
1279 	protection_codes[VM_PROT_READ|VM_PROT_EXECUTE] = PG_RO | PG_X;/* -rx */
1280 	protection_codes[VM_PROT_WRITE] = PG_RW | pg_nx;	/* w-- */
1281 	protection_codes[VM_PROT_WRITE|VM_PROT_EXECUTE] = PG_RW | PG_X;/* w-x */
1282 	protection_codes[VM_PROT_WRITE|VM_PROT_READ] = PG_RW | pg_nx;
1283 								/* wr- */
1284 	protection_codes[VM_PROT_ALL] = PG_RW | PG_X;		/* wrx */
1285 
1286 	/*
1287 	 * now we init the kernel's pmap
1288 	 *
1289 	 * the kernel pmap's pm_obj is not used for much.   however, in
1290 	 * user pmaps the pm_obj contains the list of active PTPs.
1291 	 * the pm_obj currently does not have a pager.   it might be possible
1292 	 * to add a pager that would allow a process to read-only mmap its
1293 	 * own page tables (fast user level vtophys?).   this may or may not
1294 	 * be useful.
1295 	 */
1296 
1297 	kpm = pmap_kernel();
1298 	for (i = 0; i < PTP_LEVELS - 1; i++) {
1299 		UVM_OBJ_INIT(&kpm->pm_obj[i], NULL, 1);
1300 		kpm->pm_ptphint[i] = NULL;
1301 	}
1302 	memset(&kpm->pm_list, 0, sizeof(kpm->pm_list));  /* pm_list not used */
1303 	pcb = lwp_getpcb(&lwp0);
1304 	kpm->pm_pdir = (pd_entry_t *)(pcb->pcb_cr3 + KERNBASE);
1305 #ifdef PAE
1306 	for (i = 0; i < PDP_SIZE; i++)
1307 		kpm->pm_pdirpa[i] = (paddr_t)pcb->pcb_cr3 + PAGE_SIZE * i;
1308 #else
1309 	kpm->pm_pdirpa = (paddr_t)pcb->pcb_cr3;
1310 #endif
1311 	kpm->pm_stats.wired_count = kpm->pm_stats.resident_count =
1312 		x86_btop(kva_start - VM_MIN_KERNEL_ADDRESS);
1313 
1314 	/*
1315 	 * the above is just a rough estimate and not critical to the proper
1316 	 * operation of the system.
1317 	 */
1318 
1319 #ifndef XEN
1320 	/*
1321 	 * Begin to enable global TLB entries if they are supported.
1322 	 * The G bit has no effect until the CR4_PGE bit is set in CR4,
1323 	 * which happens in cpu_init(), which is run on each cpu
1324 	 * (and happens later)
1325 	 */
1326 
1327 	if (cpu_feature & CPUID_PGE) {
1328 		pmap_pg_g = PG_G;		/* enable software */
1329 
1330 		/* add PG_G attribute to already mapped kernel pages */
1331 		if (KERNBASE == VM_MIN_KERNEL_ADDRESS) {
1332 			kva_end = virtual_avail;
1333 		} else {
1334 			extern vaddr_t eblob, esym;
1335 			kva_end = (vaddr_t)&end;
1336 			if (esym > kva_end)
1337 				kva_end = esym;
1338 			if (eblob > kva_end)
1339 				kva_end = eblob;
1340 			kva_end = roundup(kva_end, PAGE_SIZE);
1341 		}
1342 		for (kva = KERNBASE; kva < kva_end; kva += PAGE_SIZE) {
1343 			p1i = pl1_i(kva);
1344 			if (pmap_valid_entry(PTE_BASE[p1i]))
1345 				PTE_BASE[p1i] |= PG_G;
1346 		}
1347 	}
1348 
1349 	/*
1350 	 * enable large pages if they are supported.
1351 	 */
1352 
1353 	if (cpu_feature & CPUID_PSE) {
1354 		paddr_t pa;
1355 		pd_entry_t *pde;
1356 		extern char __data_start;
1357 
1358 		lcr4(rcr4() | CR4_PSE);	/* enable hardware (via %cr4) */
1359 		pmap_largepages = 1;	/* enable software */
1360 
1361 		/*
1362 		 * the TLB must be flushed after enabling large pages
1363 		 * on Pentium CPUs, according to section 3.6.2.2 of
1364 		 * "Intel Architecture Software Developer's Manual,
1365 		 * Volume 3: System Programming".
1366 		 */
1367 		tlbflush();
1368 
1369 		/*
1370 		 * now, remap the kernel text using large pages.  we
1371 		 * assume that the linker has properly aligned the
1372 		 * .data segment to a NBPD_L2 boundary.
1373 		 */
1374 		kva_end = rounddown((vaddr_t)&__data_start, NBPD_L1);
1375 		for (pa = 0, kva = KERNBASE; kva + NBPD_L2 <= kva_end;
1376 		     kva += NBPD_L2, pa += NBPD_L2) {
1377 			pde = &L2_BASE[pl2_i(kva)];
1378 			*pde = pa | pmap_pg_g | PG_PS |
1379 			    PG_KR | PG_V;	/* zap! */
1380 			tlbflush();
1381 		}
1382 #if defined(DEBUG)
1383 		aprint_normal("kernel text is mapped with "
1384 		    "%lu large pages and %lu normal pages\n",
1385 		    (unsigned long)howmany(kva - KERNBASE, NBPD_L2),
1386 		    (unsigned long)howmany((vaddr_t)&__data_start - kva,
1387 		    NBPD_L1));
1388 #endif /* defined(DEBUG) */
1389 	}
1390 #endif /* !XEN */
1391 
1392 	if (VM_MIN_KERNEL_ADDRESS != KERNBASE) {
1393 		/*
1394 		 * zero_pte is stuck at the end of mapped space for the kernel
1395 		 * image (disjunct from kva space). This is done so that it
1396 		 * can safely be used in pmap_growkernel (pmap_get_physpage),
1397 		 * when it's called for the first time.
1398 		 * XXXfvdl fix this for MULTIPROCESSOR later.
1399 		 */
1400 
1401 		early_zerop = (void *)(KERNBASE + NKL2_KIMG_ENTRIES * NBPD_L2);
1402 		early_zero_pte = PTE_BASE + pl1_i((unsigned long)early_zerop);
1403 	}
1404 
1405 	/*
1406 	 * now we allocate the "special" VAs which are used for tmp mappings
1407 	 * by the pmap (and other modules).    we allocate the VAs by advancing
1408 	 * virtual_avail (note that there are no pages mapped at these VAs).
1409 	 * we find the PTE that maps the allocated VA via the linear PTE
1410 	 * mapping.
1411 	 */
1412 
1413 	pte = PTE_BASE + pl1_i(virtual_avail);
1414 
1415 #ifdef MULTIPROCESSOR
1416 	/*
1417 	 * Waste some VA space to avoid false sharing of cache lines
1418 	 * for page table pages: Give each possible CPU a cache line
1419 	 * of PTE's (8) to play with, though we only need 4.  We could
1420 	 * recycle some of this waste by putting the idle stacks here
1421 	 * as well; we could waste less space if we knew the largest
1422 	 * CPU ID beforehand.
1423 	 */
1424 	csrcp = (char *) virtual_avail;  csrc_pte = pte;
1425 
1426 	cdstp = (char *) virtual_avail+PAGE_SIZE;  cdst_pte = pte+1;
1427 
1428 	zerop = (char *) virtual_avail+PAGE_SIZE*2;  zero_pte = pte+2;
1429 
1430 	ptpp = (char *) virtual_avail+PAGE_SIZE*3;  ptp_pte = pte+3;
1431 
1432 	virtual_avail += PAGE_SIZE * maxcpus * NPTECL;
1433 	pte += maxcpus * NPTECL;
1434 #else
1435 	csrcp = (void *) virtual_avail;  csrc_pte = pte;	/* allocate */
1436 	virtual_avail += PAGE_SIZE; pte++;			/* advance */
1437 
1438 	cdstp = (void *) virtual_avail;  cdst_pte = pte;
1439 	virtual_avail += PAGE_SIZE; pte++;
1440 
1441 	zerop = (void *) virtual_avail;  zero_pte = pte;
1442 	virtual_avail += PAGE_SIZE; pte++;
1443 
1444 	ptpp = (void *) virtual_avail;  ptp_pte = pte;
1445 	virtual_avail += PAGE_SIZE; pte++;
1446 #endif
1447 
1448 	if (VM_MIN_KERNEL_ADDRESS == KERNBASE) {
1449 		early_zerop = zerop;
1450 		early_zero_pte = zero_pte;
1451 	}
1452 
1453 	/*
1454 	 * Nothing after this point actually needs pte;
1455 	 */
1456 	pte = (void *)0xdeadbeef;
1457 
1458 	/* XXX: vmmap used by mem.c... should be uvm_map_reserve */
1459 	/* XXXfvdl PTEs not needed here */
1460 	vmmap = (char *)virtual_avail;			/* don't need pte */
1461 	virtual_avail += PAGE_SIZE; pte++;
1462 
1463 #ifdef XEN
1464 #ifdef __x86_64__
1465 	/*
1466 	 * We want a dummy page directory for Xen:
1467 	 * when deactivate a pmap, Xen will still consider it active.
1468 	 * So we set user PGD to this one to lift all protection on
1469 	 * the now inactive page tables set.
1470 	 */
1471 	xen_dummy_user_pgd = avail_start;
1472 	avail_start += PAGE_SIZE;
1473 
1474 	/* Zero fill it, the less checks in Xen it requires the better */
1475 	memset((void *) (xen_dummy_user_pgd + KERNBASE), 0, PAGE_SIZE);
1476 	/* Mark read-only */
1477 	HYPERVISOR_update_va_mapping(xen_dummy_user_pgd + KERNBASE,
1478 	    pmap_pa2pte(xen_dummy_user_pgd) | PG_u | PG_V, UVMF_INVLPG);
1479 	/* Pin as L4 */
1480 	xpq_queue_pin_table(xpmap_ptom_masked(xen_dummy_user_pgd));
1481 #endif /* __x86_64__ */
1482 	idt_vaddr = virtual_avail;                      /* don't need pte */
1483 	idt_paddr = avail_start;                        /* steal a page */
1484 	/*
1485 	 * Xen require one more page as we can't store
1486 	 * GDT and LDT on the same page
1487 	 */
1488 	virtual_avail += 3 * PAGE_SIZE;
1489 	avail_start += 3 * PAGE_SIZE;
1490 #else /* XEN */
1491 	idt_vaddr = virtual_avail;			/* don't need pte */
1492 	idt_paddr = avail_start;			/* steal a page */
1493 #if defined(__x86_64__)
1494 	virtual_avail += 2 * PAGE_SIZE; pte += 2;
1495 	avail_start += 2 * PAGE_SIZE;
1496 #else /* defined(__x86_64__) */
1497 	virtual_avail += PAGE_SIZE; pte++;
1498 	avail_start += PAGE_SIZE;
1499 	/* pentium f00f bug stuff */
1500 	pentium_idt_vaddr = virtual_avail;		/* don't need pte */
1501 	virtual_avail += PAGE_SIZE; pte++;
1502 #endif /* defined(__x86_64__) */
1503 #endif /* XEN */
1504 
1505 #ifdef _LP64
1506 	/*
1507 	 * Grab a page below 4G for things that need it (i.e.
1508 	 * having an initial %cr3 for the MP trampoline).
1509 	 */
1510 	lo32_vaddr = virtual_avail;
1511 	virtual_avail += PAGE_SIZE; pte++;
1512 	lo32_paddr = avail_start;
1513 	avail_start += PAGE_SIZE;
1514 #endif
1515 
1516 	/*
1517 	 * now we reserve some VM for mapping pages when doing a crash dump
1518 	 */
1519 
1520 	virtual_avail = reserve_dumppages(virtual_avail);
1521 
1522 	/*
1523 	 * init the static-global locks and global lists.
1524 	 *
1525 	 * => pventry::pvh_lock (initialized elsewhere) must also be
1526 	 *      a spin lock, again at IPL_VM to prevent deadlock, and
1527 	 *	again is never taken from interrupt context.
1528 	 */
1529 
1530 	mutex_init(&pmaps_lock, MUTEX_DEFAULT, IPL_NONE);
1531 	LIST_INIT(&pmaps);
1532 	pmap_cpu_init_early(curcpu());
1533 
1534 	/*
1535 	 * initialize caches.
1536 	 */
1537 
1538 	pool_cache_bootstrap(&pmap_cache, sizeof(struct pmap), 0, 0, 0,
1539 	    "pmappl", NULL, IPL_NONE, NULL, NULL, NULL);
1540 #ifdef PAE
1541 	pool_cache_bootstrap(&pmap_pdp_cache, PAGE_SIZE * PDP_SIZE, 0, 0, 0,
1542 	    "pdppl", &pmap_pdp_allocator, IPL_NONE,
1543 	    pmap_pdp_ctor, pmap_pdp_dtor, NULL);
1544 #else /* PAE */
1545 	pool_cache_bootstrap(&pmap_pdp_cache, PAGE_SIZE, 0, 0, 0,
1546 	    "pdppl", NULL, IPL_NONE, pmap_pdp_ctor, pmap_pdp_dtor, NULL);
1547 #endif /* PAE */
1548 	pool_cache_bootstrap(&pmap_pv_cache, sizeof(struct pv_entry), 0, 0,
1549 	    PR_LARGECACHE, "pvpl", &pool_allocator_meta, IPL_NONE, NULL,
1550 	    NULL, NULL);
1551 
1552 	/*
1553 	 * ensure the TLB is sync'd with reality by flushing it...
1554 	 */
1555 
1556 	tlbflush();
1557 
1558 	/*
1559 	 * calculate pmap_maxkvaddr from nkptp[].
1560 	 */
1561 
1562 	kva = VM_MIN_KERNEL_ADDRESS;
1563 	for (i = PTP_LEVELS - 1; i >= 1; i--) {
1564 		kva += nkptp[i] * nbpd[i];
1565 	}
1566 	pmap_maxkvaddr = kva;
1567 }
1568 
1569 #if defined(__x86_64__)
1570 /*
1571  * Pre-allocate PTPs for low memory, so that 1:1 mappings for various
1572  * trampoline code can be entered.
1573  */
1574 void
1575 pmap_prealloc_lowmem_ptps(void)
1576 {
1577 #ifdef XEN
1578 	int level;
1579 	paddr_t newp;
1580 	paddr_t pdes_pa;
1581 
1582 	pdes_pa = pmap_kernel()->pm_pdirpa;
1583 	level = PTP_LEVELS;
1584 	for (;;) {
1585 		newp = avail_start;
1586 		avail_start += PAGE_SIZE;
1587 		HYPERVISOR_update_va_mapping ((vaddr_t)early_zerop,
1588 		    xpmap_ptom_masked(newp) | PG_u | PG_V | PG_RW, UVMF_INVLPG);
1589 		memset((void *)early_zerop, 0, PAGE_SIZE);
1590 		/* Mark R/O before installing */
1591 		HYPERVISOR_update_va_mapping ((vaddr_t)early_zerop,
1592 		    xpmap_ptom_masked(newp) | PG_u | PG_V, UVMF_INVLPG);
1593 		if (newp < (NKL2_KIMG_ENTRIES * NBPD_L2))
1594 			HYPERVISOR_update_va_mapping (newp + KERNBASE,
1595 			    xpmap_ptom_masked(newp) | PG_u | PG_V, UVMF_INVLPG);
1596 		xpq_queue_pte_update (
1597 			xpmap_ptom_masked(pdes_pa)
1598 			+ (pl_i(0, level) * sizeof (pd_entry_t)),
1599 			xpmap_ptom_masked(newp) | PG_RW | PG_u | PG_V);
1600 		level--;
1601 		if (level <= 1)
1602 			break;
1603 		pdes_pa = newp;
1604 	}
1605 #else /* XEN */
1606 	pd_entry_t *pdes;
1607 	int level;
1608 	paddr_t newp;
1609 
1610 	pdes = pmap_kernel()->pm_pdir;
1611 	level = PTP_LEVELS;
1612 	for (;;) {
1613 		newp = avail_start;
1614 		avail_start += PAGE_SIZE;
1615 		*early_zero_pte = (newp & PG_FRAME) | PG_V | PG_RW;
1616 		pmap_update_pg((vaddr_t)early_zerop);
1617 		memset(early_zerop, 0, PAGE_SIZE);
1618 		pdes[pl_i(0, level)] = (newp & PG_FRAME) | PG_V | PG_RW;
1619 		level--;
1620 		if (level <= 1)
1621 			break;
1622 		pdes = normal_pdes[level - 2];
1623 	}
1624 #endif /* XEN */
1625 }
1626 #endif /* defined(__x86_64__) */
1627 
1628 /*
1629  * pmap_init: called from uvm_init, our job is to get the pmap
1630  * system ready to manage mappings...
1631  */
1632 
1633 void
1634 pmap_init(void)
1635 {
1636 	int i;
1637 
1638 	for (i = 0; i < PV_HASH_SIZE; i++) {
1639 		SLIST_INIT(&pv_hash_heads[i].hh_list);
1640 	}
1641 	for (i = 0; i < PV_HASH_LOCK_CNT; i++) {
1642 		mutex_init(&pv_hash_locks[i].lock, MUTEX_NODEBUG, IPL_VM);
1643 	}
1644 
1645 	/*
1646 	 * done: pmap module is up (and ready for business)
1647 	 */
1648 
1649 	pmap_initialized = true;
1650 }
1651 
1652 /*
1653  * pmap_cpu_init_early: perform early per-CPU initialization.
1654  */
1655 
1656 void
1657 pmap_cpu_init_early(struct cpu_info *ci)
1658 {
1659 	struct pmap_cpu *pc;
1660 	static uint8_t pmap_cpu_alloc;
1661 
1662 	pc = &pmap_cpu[pmap_cpu_alloc++].pc;
1663 	ci->ci_pmap_cpu = pc;
1664 }
1665 
1666 /*
1667  * pmap_cpu_init_late: perform late per-CPU initialization.
1668  */
1669 
1670 void
1671 pmap_cpu_init_late(struct cpu_info *ci)
1672 {
1673 
1674 	if (ci == &cpu_info_primary) {
1675 		evcnt_attach_dynamic(&pmap_tlb_evcnt, EVCNT_TYPE_INTR,
1676 		    NULL, "global", "TLB IPI");
1677 		evcnt_attach_dynamic(&pmap_iobmp_evcnt, EVCNT_TYPE_MISC,
1678 		    NULL, "x86", "io bitmap copy");
1679 		evcnt_attach_dynamic(&pmap_ldt_evcnt, EVCNT_TYPE_MISC,
1680 		    NULL, "x86", "ldt sync");
1681 	}
1682 
1683 	evcnt_attach_dynamic(&ci->ci_tlb_evcnt, EVCNT_TYPE_MISC,
1684 	    NULL, device_xname(ci->ci_dev), "TLB IPI");
1685 }
1686 
1687 /*
1688  * p v _ e n t r y   f u n c t i o n s
1689  */
1690 
1691 /*
1692  * pmap_free_pvs: free a list of pv_entrys
1693  */
1694 
1695 static void
1696 pmap_free_pvs(struct pv_entry *pve)
1697 {
1698 	struct pv_entry *next;
1699 
1700 	for ( /* null */ ; pve != NULL ; pve = next) {
1701 		next = pve->pve_next;
1702 		pool_cache_put(&pmap_pv_cache, pve);
1703 	}
1704 }
1705 
1706 /*
1707  * main pv_entry manipulation functions:
1708  *   pmap_enter_pv: enter a mapping onto a pv_head list
1709  *   pmap_remove_pv: remove a mapping from a pv_head list
1710  *
1711  * NOTE: Both pmap_enter_pv and pmap_remove_pv expect the caller to lock
1712  *       the pvh before calling
1713  */
1714 
1715 /*
1716  * insert_pv: a helper of pmap_enter_pv
1717  */
1718 
1719 static void
1720 insert_pv(struct pmap_page *pp, struct pv_entry *pve)
1721 {
1722 	struct pv_hash_head *hh;
1723 	kmutex_t *lock;
1724 	u_int hash;
1725 
1726 	KASSERT(pp_locked(pp));
1727 
1728 	hash = pvhash_hash(pve->pve_pte.pte_ptp, pve->pve_pte.pte_va);
1729 	lock = pvhash_lock(hash);
1730 	hh = pvhash_head(hash);
1731 	mutex_spin_enter(lock);
1732 	SLIST_INSERT_HEAD(&hh->hh_list, pve, pve_hash);
1733 	mutex_spin_exit(lock);
1734 
1735 	LIST_INSERT_HEAD(&pp->pp_head.pvh_list, pve, pve_list);
1736 }
1737 
1738 /*
1739  * pmap_enter_pv: enter a mapping onto a pv_head lst
1740  *
1741  * => caller should have the pp_lock locked
1742  * => caller should adjust ptp's wire_count before calling
1743  */
1744 
1745 static struct pv_entry *
1746 pmap_enter_pv(struct pmap_page *pp,
1747 	      struct pv_entry *pve,	/* preallocated pve for us to use */
1748 	      struct pv_entry **sparepve,
1749 	      struct vm_page *ptp,
1750 	      vaddr_t va)
1751 {
1752 
1753 	KASSERT(ptp == NULL || ptp->wire_count >= 2);
1754 	KASSERT(ptp == NULL || ptp->uobject != NULL);
1755 	KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset);
1756 	KASSERT(pp_locked(pp));
1757 
1758 	if ((pp->pp_flags & PP_EMBEDDED) == 0) {
1759 		if (LIST_EMPTY(&pp->pp_head.pvh_list)) {
1760 			pp->pp_flags |= PP_EMBEDDED;
1761 			pp->pp_pte.pte_ptp = ptp;
1762 			pp->pp_pte.pte_va = va;
1763 
1764 			return pve;
1765 		}
1766 	} else {
1767 		struct pv_entry *pve2;
1768 
1769 		pve2 = *sparepve;
1770 		*sparepve = NULL;
1771 
1772 		pve2->pve_pte = pp->pp_pte;
1773 		pp->pp_flags &= ~PP_EMBEDDED;
1774 		LIST_INIT(&pp->pp_head.pvh_list);
1775 		insert_pv(pp, pve2);
1776 	}
1777 
1778 	pve->pve_pte.pte_ptp = ptp;
1779 	pve->pve_pte.pte_va = va;
1780 	insert_pv(pp, pve);
1781 
1782 	return NULL;
1783 }
1784 
1785 /*
1786  * pmap_remove_pv: try to remove a mapping from a pv_list
1787  *
1788  * => caller should hold pp_lock [so that attrs can be adjusted]
1789  * => caller should adjust ptp's wire_count and free PTP if needed
1790  * => we return the removed pve
1791  */
1792 
1793 static struct pv_entry *
1794 pmap_remove_pv(struct pmap_page *pp, struct vm_page *ptp, vaddr_t va)
1795 {
1796 	struct pv_hash_head *hh;
1797 	struct pv_entry *pve;
1798 	kmutex_t *lock;
1799 	u_int hash;
1800 
1801 	KASSERT(ptp == NULL || ptp->uobject != NULL);
1802 	KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset);
1803 	KASSERT(pp_locked(pp));
1804 
1805 	if ((pp->pp_flags & PP_EMBEDDED) != 0) {
1806 		KASSERT(pp->pp_pte.pte_ptp == ptp);
1807 		KASSERT(pp->pp_pte.pte_va == va);
1808 
1809 		pp->pp_flags &= ~PP_EMBEDDED;
1810 		LIST_INIT(&pp->pp_head.pvh_list);
1811 
1812 		return NULL;
1813 	}
1814 
1815 	hash = pvhash_hash(ptp, va);
1816 	lock = pvhash_lock(hash);
1817 	hh = pvhash_head(hash);
1818 	mutex_spin_enter(lock);
1819 	pve = pvhash_remove(hh, ptp, va);
1820 	mutex_spin_exit(lock);
1821 
1822 	LIST_REMOVE(pve, pve_list);
1823 
1824 	return pve;
1825 }
1826 
1827 /*
1828  * p t p   f u n c t i o n s
1829  */
1830 
1831 static inline struct vm_page *
1832 pmap_find_ptp(struct pmap *pmap, vaddr_t va, paddr_t pa, int level)
1833 {
1834 	int lidx = level - 1;
1835 	struct vm_page *pg;
1836 
1837 	KASSERT(mutex_owned(&pmap->pm_lock));
1838 
1839 	if (pa != (paddr_t)-1 && pmap->pm_ptphint[lidx] &&
1840 	    pa == VM_PAGE_TO_PHYS(pmap->pm_ptphint[lidx])) {
1841 		return (pmap->pm_ptphint[lidx]);
1842 	}
1843 	PMAP_SUBOBJ_LOCK(pmap, lidx);
1844 	pg = uvm_pagelookup(&pmap->pm_obj[lidx], ptp_va2o(va, level));
1845 	PMAP_SUBOBJ_UNLOCK(pmap, lidx);
1846 
1847 	KASSERT(pg == NULL || pg->wire_count >= 1);
1848 	return pg;
1849 }
1850 
1851 static inline void
1852 pmap_freepage(struct pmap *pmap, struct vm_page *ptp, int level)
1853 {
1854 	int lidx;
1855 	struct uvm_object *obj;
1856 
1857 	KASSERT(ptp->wire_count == 1);
1858 
1859 	lidx = level - 1;
1860 
1861 	obj = &pmap->pm_obj[lidx];
1862 	pmap_stats_update(pmap, -1, 0);
1863 	if (lidx != 0)
1864 		mutex_enter(&obj->vmobjlock);
1865 	if (pmap->pm_ptphint[lidx] == ptp)
1866 		pmap->pm_ptphint[lidx] = TAILQ_FIRST(&obj->memq);
1867 	ptp->wire_count = 0;
1868 	uvm_pagerealloc(ptp, NULL, 0);
1869 	VM_PAGE_TO_PP(ptp)->pp_link = curlwp->l_md.md_gc_ptp;
1870 	curlwp->l_md.md_gc_ptp = ptp;
1871 	if (lidx != 0)
1872 		mutex_exit(&obj->vmobjlock);
1873 }
1874 
1875 static void
1876 pmap_free_ptp(struct pmap *pmap, struct vm_page *ptp, vaddr_t va,
1877 	      pt_entry_t *ptes, pd_entry_t * const *pdes)
1878 {
1879 	unsigned long index;
1880 	int level;
1881 	vaddr_t invaladdr;
1882 #ifdef MULTIPROCESSOR
1883 	vaddr_t invaladdr2;
1884 #endif
1885 	pd_entry_t opde;
1886 	struct pmap *curpmap = vm_map_pmap(&curlwp->l_proc->p_vmspace->vm_map);
1887 
1888 	KASSERT(pmap != pmap_kernel());
1889 	KASSERT(mutex_owned(&pmap->pm_lock));
1890 	KASSERT(kpreempt_disabled());
1891 
1892 	level = 1;
1893 	do {
1894 		index = pl_i(va, level + 1);
1895 		opde = pmap_pte_testset(&pdes[level - 1][index], 0);
1896 #if defined(XEN) && defined(__x86_64__)
1897 		/*
1898 		 * If ptp is a L3 currently mapped in kernel space,
1899 		 * clear it before freeing
1900 		 */
1901 		if (pmap->pm_pdirpa == xen_current_user_pgd
1902 		    && level == PTP_LEVELS - 1)
1903 			pmap_pte_set(&pmap_kernel()->pm_pdir[index], 0);
1904 #endif /* XEN && __x86_64__ */
1905 		pmap_freepage(pmap, ptp, level);
1906 		invaladdr = level == 1 ? (vaddr_t)ptes :
1907 		    (vaddr_t)pdes[level - 2];
1908 		pmap_tlb_shootdown(curpmap, invaladdr + index * PAGE_SIZE,
1909 		    0, opde);
1910 #if defined(MULTIPROCESSOR)
1911 		invaladdr2 = level == 1 ? (vaddr_t)PTE_BASE :
1912 		    (vaddr_t)normal_pdes[level - 2];
1913 		if (pmap != curpmap || invaladdr != invaladdr2) {
1914 			pmap_tlb_shootdown(pmap, invaladdr2 + index * PAGE_SIZE,
1915 			    0, opde);
1916 		}
1917 #endif
1918 		if (level < PTP_LEVELS - 1) {
1919 			ptp = pmap_find_ptp(pmap, va, (paddr_t)-1, level + 1);
1920 			ptp->wire_count--;
1921 			if (ptp->wire_count > 1)
1922 				break;
1923 		}
1924 	} while (++level < PTP_LEVELS);
1925 	pmap_pte_flush();
1926 }
1927 
1928 /*
1929  * pmap_get_ptp: get a PTP (if there isn't one, allocate a new one)
1930  *
1931  * => pmap should NOT be pmap_kernel()
1932  * => pmap should be locked
1933  * => preemption should be disabled
1934  */
1935 
1936 static struct vm_page *
1937 pmap_get_ptp(struct pmap *pmap, vaddr_t va, pd_entry_t * const *pdes)
1938 {
1939 	struct vm_page *ptp, *pptp;
1940 	int i;
1941 	unsigned long index;
1942 	pd_entry_t *pva;
1943 	paddr_t ppa, pa;
1944 	struct uvm_object *obj;
1945 
1946 	KASSERT(pmap != pmap_kernel());
1947 	KASSERT(mutex_owned(&pmap->pm_lock));
1948 	KASSERT(kpreempt_disabled());
1949 
1950 	ptp = NULL;
1951 	pa = (paddr_t)-1;
1952 
1953 	/*
1954 	 * Loop through all page table levels seeing if we need to
1955 	 * add a new page to that level.
1956 	 */
1957 	for (i = PTP_LEVELS; i > 1; i--) {
1958 		/*
1959 		 * Save values from previous round.
1960 		 */
1961 		pptp = ptp;
1962 		ppa = pa;
1963 
1964 		index = pl_i(va, i);
1965 		pva = pdes[i - 2];
1966 
1967 		if (pmap_valid_entry(pva[index])) {
1968 			ppa = pmap_pte2pa(pva[index]);
1969 			ptp = NULL;
1970 			continue;
1971 		}
1972 
1973 		obj = &pmap->pm_obj[i-2];
1974 		PMAP_SUBOBJ_LOCK(pmap, i - 2);
1975 		ptp = uvm_pagealloc(obj, ptp_va2o(va, i - 1), NULL,
1976 		    UVM_PGA_USERESERVE|UVM_PGA_ZERO);
1977 		PMAP_SUBOBJ_UNLOCK(pmap, i - 2);
1978 
1979 		if (ptp == NULL)
1980 			return NULL;
1981 
1982 		ptp->flags &= ~PG_BUSY; /* never busy */
1983 		ptp->wire_count = 1;
1984 		pmap->pm_ptphint[i - 2] = ptp;
1985 		pa = VM_PAGE_TO_PHYS(ptp);
1986 		pmap_pte_set(&pva[index], (pd_entry_t)
1987 		        (pmap_pa2pte(pa) | PG_u | PG_RW | PG_V));
1988 #if defined(XEN) && defined(__x86_64__)
1989 		/*
1990 		 * In Xen we must enter the mapping in kernel map too
1991 		 * if pmap is curmap and modifying top level (PGD)
1992 		 */
1993 		if(i == PTP_LEVELS && pmap != pmap_kernel()) {
1994 		        pmap_pte_set(&pmap_kernel()->pm_pdir[index],
1995 		                (pd_entry_t) (pmap_pa2pte(pa)
1996 		                        | PG_u | PG_RW | PG_V));
1997 		}
1998 #endif /* XEN && __x86_64__ */
1999 		pmap_pte_flush();
2000 		pmap_stats_update(pmap, 1, 0);
2001 		/*
2002 		 * If we're not in the top level, increase the
2003 		 * wire count of the parent page.
2004 		 */
2005 		if (i < PTP_LEVELS) {
2006 			if (pptp == NULL)
2007 				pptp = pmap_find_ptp(pmap, va, ppa, i);
2008 #ifdef DIAGNOSTIC
2009 			if (pptp == NULL)
2010 				panic("pde page disappeared");
2011 #endif
2012 			pptp->wire_count++;
2013 		}
2014 	}
2015 
2016 	/*
2017 	 * ptp is not NULL if we just allocated a new ptp. If it's
2018 	 * still NULL, we must look up the existing one.
2019 	 */
2020 	if (ptp == NULL) {
2021 		ptp = pmap_find_ptp(pmap, va, ppa, 1);
2022 #ifdef DIAGNOSTIC
2023 		if (ptp == NULL) {
2024 			printf("va %lx ppa %lx\n", (unsigned long)va,
2025 			    (unsigned long)ppa);
2026 			panic("pmap_get_ptp: unmanaged user PTP");
2027 		}
2028 #endif
2029 	}
2030 
2031 	pmap->pm_ptphint[0] = ptp;
2032 	return(ptp);
2033 }
2034 
2035 /*
2036  * p m a p  l i f e c y c l e   f u n c t i o n s
2037  */
2038 
2039 /*
2040  * pmap_pdp_ctor: constructor for the PDP cache.
2041  */
2042 
2043 int
2044 pmap_pdp_ctor(void *arg, void *v, int flags)
2045 {
2046 	pd_entry_t *pdir = v;
2047 	paddr_t pdirpa = 0;	/* XXX: GCC */
2048 	vaddr_t object;
2049 	int i;
2050 
2051 #if !defined(XEN) || !defined(__x86_64__)
2052 	int npde;
2053 #endif
2054 #ifdef XEN
2055 	int s;
2056 #endif
2057 
2058 	/*
2059 	 * NOTE: The `pmap_lock' is held when the PDP is allocated.
2060 	 */
2061 
2062 #if defined(XEN) && defined(__x86_64__)
2063 	/* fetch the physical address of the page directory. */
2064 	(void) pmap_extract(pmap_kernel(), (vaddr_t) pdir, &pdirpa);
2065 
2066 	/* zero init area */
2067 	memset (pdir, 0, PAGE_SIZE); /* Xen wants a clean page */
2068 	/*
2069 	 * this pdir will NEVER be active in kernel mode
2070 	 * so mark recursive entry invalid
2071 	 */
2072 	pdir[PDIR_SLOT_PTE] = pmap_pa2pte(pdirpa) | PG_u;
2073 	/*
2074 	 * PDP constructed this way won't be for kernel,
2075 	 * hence we don't put kernel mappings on Xen.
2076 	 * But we need to make pmap_create() happy, so put a dummy (without
2077 	 * PG_V) value at the right place.
2078 	 */
2079 	pdir[PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1] - 1] =
2080 	     (unsigned long)-1 & PG_FRAME;
2081 #else /* XEN  && __x86_64__*/
2082 	/* zero init area */
2083 	memset(pdir, 0, PDIR_SLOT_PTE * sizeof(pd_entry_t));
2084 
2085 	object = (vaddr_t)v;
2086 	for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) {
2087 		/* fetch the physical address of the page directory. */
2088 		(void) pmap_extract(pmap_kernel(), object, &pdirpa);
2089 		/* put in recursive PDE to map the PTEs */
2090 		pdir[PDIR_SLOT_PTE + i] = pmap_pa2pte(pdirpa) | PG_V;
2091 #ifndef XEN
2092 		pdir[PDIR_SLOT_PTE + i] |= PG_KW;
2093 #endif
2094 	}
2095 
2096 	/* copy kernel's PDE */
2097 	npde = nkptp[PTP_LEVELS - 1];
2098 
2099 	memcpy(&pdir[PDIR_SLOT_KERN], &PDP_BASE[PDIR_SLOT_KERN],
2100 	    npde * sizeof(pd_entry_t));
2101 
2102 	/* zero the rest */
2103 	memset(&pdir[PDIR_SLOT_KERN + npde], 0,
2104 	    (NTOPLEVEL_PDES - (PDIR_SLOT_KERN + npde)) * sizeof(pd_entry_t));
2105 
2106 	if (VM_MIN_KERNEL_ADDRESS != KERNBASE) {
2107 		int idx = pl_i(KERNBASE, PTP_LEVELS);
2108 
2109 		pdir[idx] = PDP_BASE[idx];
2110 	}
2111 #endif /* XEN  && __x86_64__*/
2112 #ifdef XEN
2113 	s = splvm();
2114 	object = (vaddr_t)v;
2115 	for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) {
2116 		(void) pmap_extract(pmap_kernel(), object, &pdirpa);
2117 		/* remap this page RO */
2118 		pmap_kenter_pa(object, pdirpa, VM_PROT_READ, 0);
2119 		pmap_update(pmap_kernel());
2120 		/*
2121 		 * pin as L2/L4 page, we have to do the page with the
2122 		 * PDIR_SLOT_PTE entries last
2123 		 */
2124 #ifdef PAE
2125 		if (i == l2tol3(PDIR_SLOT_PTE))
2126 			continue;
2127 #endif
2128 		xpq_queue_pin_table(xpmap_ptom_masked(pdirpa));
2129 	}
2130 #ifdef PAE
2131 	object = ((vaddr_t)pdir) + PAGE_SIZE  * l2tol3(PDIR_SLOT_PTE);
2132 	(void)pmap_extract(pmap_kernel(), object, &pdirpa);
2133 	xpq_queue_pin_table(xpmap_ptom_masked(pdirpa));
2134 #endif
2135 	xpq_flush_queue();
2136 	splx(s);
2137 #endif /* XEN */
2138 
2139 	return (0);
2140 }
2141 
2142 /*
2143  * pmap_pdp_dtor: destructor for the PDP cache.
2144  */
2145 
2146 void
2147 pmap_pdp_dtor(void *arg, void *v)
2148 {
2149 #ifdef XEN
2150 	paddr_t pdirpa = 0;	/* XXX: GCC */
2151 	vaddr_t object = (vaddr_t)v;
2152 	int i;
2153 	int s = splvm();
2154 	pt_entry_t *pte;
2155 
2156 	for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) {
2157 		/* fetch the physical address of the page directory. */
2158 		(void) pmap_extract(pmap_kernel(), object, &pdirpa);
2159 		/* unpin page table */
2160 		xpq_queue_unpin_table(xpmap_ptom_masked(pdirpa));
2161 	}
2162 	object = (vaddr_t)v;
2163 	for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) {
2164 		/* Set page RW again */
2165 		pte = kvtopte(object);
2166 		xpq_queue_pte_update(xpmap_ptetomach(pte), *pte | PG_RW);
2167 		xpq_queue_invlpg((vaddr_t)object);
2168 	}
2169 	xpq_flush_queue();
2170 	splx(s);
2171 #endif  /* XEN */
2172 }
2173 
2174 #ifdef PAE
2175 
2176 /* pmap_pdp_alloc: Allocate a page for the pdp memory pool. */
2177 
2178 void *
2179 pmap_pdp_alloc(struct pool *pp, int flags)
2180 {
2181 	return (void *)uvm_km_alloc(kernel_map,
2182 	    PAGE_SIZE * PDP_SIZE, PAGE_SIZE * PDP_SIZE,
2183 	    ((flags & PR_WAITOK) ? 0 : UVM_KMF_NOWAIT | UVM_KMF_TRYLOCK)
2184 	    | UVM_KMF_WIRED);
2185 }
2186 
2187 /*
2188  * pmap_pdp_free: free a PDP
2189  */
2190 
2191 void
2192 pmap_pdp_free(struct pool *pp, void *v)
2193 {
2194 	uvm_km_free(kernel_map, (vaddr_t)v, PAGE_SIZE * PDP_SIZE,
2195 	    UVM_KMF_WIRED);
2196 }
2197 #endif /* PAE */
2198 
2199 /*
2200  * pmap_create: create a pmap
2201  *
2202  * => note: old pmap interface took a "size" args which allowed for
2203  *	the creation of "software only" pmaps (not in bsd).
2204  */
2205 
2206 struct pmap *
2207 pmap_create(void)
2208 {
2209 	struct pmap *pmap;
2210 	int i;
2211 
2212 	pmap = pool_cache_get(&pmap_cache, PR_WAITOK);
2213 
2214 	/* init uvm_object */
2215 	for (i = 0; i < PTP_LEVELS - 1; i++) {
2216 		UVM_OBJ_INIT(&pmap->pm_obj[i], NULL, 1);
2217 		pmap->pm_ptphint[i] = NULL;
2218 	}
2219 	pmap->pm_stats.wired_count = 0;
2220 	/* count the PDP allocd below */
2221 	pmap->pm_stats.resident_count = PDP_SIZE;
2222 #if !defined(__x86_64__)
2223 	pmap->pm_hiexec = 0;
2224 #endif /* !defined(__x86_64__) */
2225 	pmap->pm_flags = 0;
2226 	pmap->pm_cpus = 0;
2227 	pmap->pm_kernel_cpus = 0;
2228 
2229 	/* init the LDT */
2230 	pmap->pm_ldt = NULL;
2231 	pmap->pm_ldt_len = 0;
2232 	pmap->pm_ldt_sel = GSYSSEL(GLDT_SEL, SEL_KPL);
2233 
2234 	/* allocate PDP */
2235  try_again:
2236 	pmap->pm_pdir = pool_cache_get(&pmap_pdp_cache, PR_WAITOK);
2237 
2238 	mutex_enter(&pmaps_lock);
2239 
2240 	if (pmap->pm_pdir[PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1] - 1] == 0) {
2241 		mutex_exit(&pmaps_lock);
2242 		pool_cache_destruct_object(&pmap_pdp_cache, pmap->pm_pdir);
2243 		goto try_again;
2244 	}
2245 
2246 #ifdef PAE
2247 	for (i = 0; i < PDP_SIZE; i++)
2248 		pmap->pm_pdirpa[i] =
2249 		    pmap_pte2pa(pmap->pm_pdir[PDIR_SLOT_PTE + i]);
2250 #else
2251 	pmap->pm_pdirpa = pmap_pte2pa(pmap->pm_pdir[PDIR_SLOT_PTE]);
2252 #endif
2253 
2254 	LIST_INSERT_HEAD(&pmaps, pmap, pm_list);
2255 
2256 	mutex_exit(&pmaps_lock);
2257 
2258 	return (pmap);
2259 }
2260 
2261 /*
2262  * pmap_destroy: drop reference count on pmap.   free pmap if
2263  *	reference count goes to zero.
2264  */
2265 
2266 void
2267 pmap_destroy(struct pmap *pmap)
2268 {
2269 	int i;
2270 #ifdef DIAGNOSTIC
2271 	struct cpu_info *ci;
2272 	CPU_INFO_ITERATOR cii;
2273 #endif /* DIAGNOSTIC */
2274 
2275 	/*
2276 	 * if we have torn down this pmap, process deferred frees and
2277 	 * invalidations now.
2278 	 */
2279 	if (__predict_false(curlwp->l_md.md_gc_pmap == pmap)) {
2280 		pmap_update(pmap);
2281 	}
2282 
2283 	/*
2284 	 * drop reference count
2285 	 */
2286 
2287 	if (atomic_dec_uint_nv(&pmap->pm_obj[0].uo_refs) > 0) {
2288 		return;
2289 	}
2290 
2291 #ifdef DIAGNOSTIC
2292 	for (CPU_INFO_FOREACH(cii, ci))
2293 		if (ci->ci_pmap == pmap)
2294 			panic("destroying pmap being used");
2295 #endif /* DIAGNOSTIC */
2296 
2297 	/*
2298 	 * reference count is zero, free pmap resources and then free pmap.
2299 	 */
2300 #ifdef XEN
2301 	/*
2302 	 * Xen lazy APDP handling:
2303 	 * clear APDP_PDE if pmap is the currently mapped
2304 	 */
2305 	if (xpmap_ptom_masked(pmap_pdirpa(pmap, 0)) == (*APDP_PDE & PG_FRAME)) {
2306 		kpreempt_disable();
2307 		for (i = 0; i < PDP_SIZE; i++) {
2308 	        	pmap_pte_set(&APDP_PDE[i], 0);
2309 #ifdef PAE
2310 			/* clear shadow entry too */
2311 	    		pmap_pte_set(&APDP_PDE_SHADOW[i], 0);
2312 #endif
2313 		}
2314 		pmap_pte_flush();
2315 	        pmap_apte_flush(pmap_kernel());
2316 	        kpreempt_enable();
2317 	}
2318 #endif
2319 
2320 	/*
2321 	 * remove it from global list of pmaps
2322 	 */
2323 
2324 	mutex_enter(&pmaps_lock);
2325 	LIST_REMOVE(pmap, pm_list);
2326 	mutex_exit(&pmaps_lock);
2327 
2328 	/*
2329 	 * destroyed pmap shouldn't have remaining PTPs
2330 	 */
2331 
2332 	for (i = 0; i < PTP_LEVELS - 1; i++) {
2333 		KASSERT(pmap->pm_obj[i].uo_npages == 0);
2334 		KASSERT(TAILQ_EMPTY(&pmap->pm_obj[i].memq));
2335 	}
2336 
2337 	/*
2338 	 * MULTIPROCESSOR -- no need to flush out of other processors'
2339 	 * APTE space because we do that in pmap_unmap_ptes().
2340 	 */
2341 	pool_cache_put(&pmap_pdp_cache, pmap->pm_pdir);
2342 
2343 #ifdef USER_LDT
2344 	if (pmap->pm_ldt != NULL) {
2345 		/*
2346 		 * no need to switch the LDT; this address space is gone,
2347 		 * nothing is using it.
2348 		 *
2349 		 * No need to lock the pmap for ldt_free (or anything else),
2350 		 * we're the last one to use it.
2351 		 */
2352 		mutex_enter(&cpu_lock);
2353 		ldt_free(pmap->pm_ldt_sel);
2354 		mutex_exit(&cpu_lock);
2355 		uvm_km_free(kernel_map, (vaddr_t)pmap->pm_ldt,
2356 		    pmap->pm_ldt_len, UVM_KMF_WIRED);
2357 	}
2358 #endif
2359 
2360 	for (i = 0; i < PTP_LEVELS - 1; i++)
2361 		mutex_destroy(&pmap->pm_obj[i].vmobjlock);
2362 	pool_cache_put(&pmap_cache, pmap);
2363 }
2364 
2365 /*
2366  * pmap_remove_all: pmap is being torn down by the current thread.
2367  * avoid unnecessary invalidations.
2368  */
2369 
2370 void
2371 pmap_remove_all(struct pmap *pmap)
2372 {
2373 	lwp_t *l = curlwp;
2374 
2375 	KASSERT(l->l_md.md_gc_pmap == NULL);
2376 
2377 	l->l_md.md_gc_pmap = pmap;
2378 }
2379 
2380 #if defined(PMAP_FORK)
2381 /*
2382  * pmap_fork: perform any necessary data structure manipulation when
2383  * a VM space is forked.
2384  */
2385 
2386 void
2387 pmap_fork(struct pmap *pmap1, struct pmap *pmap2)
2388 {
2389 #ifdef USER_LDT
2390 	union descriptor *new_ldt;
2391 	size_t len;
2392 	int sel;
2393 
2394 	if (__predict_true(pmap1->pm_ldt == NULL)) {
2395 		return;
2396 	}
2397 
2398  retry:
2399 	if (pmap1->pm_ldt != NULL) {
2400 		len = pmap1->pm_ldt_len;
2401 		new_ldt = (union descriptor *)uvm_km_alloc(kernel_map, len, 0,
2402 		    UVM_KMF_WIRED);
2403 		mutex_enter(&cpu_lock);
2404 		sel = ldt_alloc(new_ldt, len);
2405 		if (sel == -1) {
2406 			mutex_exit(&cpu_lock);
2407 			uvm_km_free(kernel_map, (vaddr_t)new_ldt, len,
2408 			    UVM_KMF_WIRED);
2409 			printf("WARNING: pmap_fork: unable to allocate LDT\n");
2410 			return;
2411 		}
2412 	} else {
2413 		len = -1;
2414 		new_ldt = NULL;
2415 		sel = -1;
2416 		mutex_enter(&cpu_lock);
2417 	}
2418 
2419  	/* Copy the LDT, if necessary. */
2420  	if (pmap1->pm_ldt != NULL) {
2421 		if (len != pmap1->pm_ldt_len) {
2422 			if (len != -1) {
2423 				ldt_free(sel);
2424 				uvm_km_free(kernel_map, (vaddr_t)new_ldt,
2425 				    len, UVM_KMF_WIRED);
2426 			}
2427 			mutex_exit(&cpu_lock);
2428 			goto retry;
2429 		}
2430 
2431 		memcpy(new_ldt, pmap1->pm_ldt, len);
2432 		pmap2->pm_ldt = new_ldt;
2433 		pmap2->pm_ldt_len = pmap1->pm_ldt_len;
2434 		pmap2->pm_ldt_sel = sel;
2435 		len = -1;
2436 	}
2437 
2438 	if (len != -1) {
2439 		ldt_free(sel);
2440 		uvm_km_free(kernel_map, (vaddr_t)new_ldt, len,
2441 		    UVM_KMF_WIRED);
2442 	}
2443 	mutex_exit(&cpu_lock);
2444 #endif /* USER_LDT */
2445 }
2446 #endif /* PMAP_FORK */
2447 
2448 #ifdef USER_LDT
2449 
2450 /*
2451  * pmap_ldt_xcall: cross call used by pmap_ldt_sync.  if the named pmap
2452  * is active, reload LDTR.
2453  */
2454 static void
2455 pmap_ldt_xcall(void *arg1, void *arg2)
2456 {
2457 	struct pmap *pm;
2458 
2459 	kpreempt_disable();
2460 	pm = arg1;
2461 	if (curcpu()->ci_pmap == pm) {
2462 		lldt(pm->pm_ldt_sel);
2463 	}
2464 	kpreempt_enable();
2465 }
2466 
2467 /*
2468  * pmap_ldt_sync: LDT selector for the named pmap is changing.  swap
2469  * in the new selector on all CPUs.
2470  */
2471 void
2472 pmap_ldt_sync(struct pmap *pm)
2473 {
2474 	uint64_t where;
2475 
2476 	KASSERT(mutex_owned(&cpu_lock));
2477 
2478 	pmap_ldt_evcnt.ev_count++;
2479 	where = xc_broadcast(0, pmap_ldt_xcall, pm, NULL);
2480 	xc_wait(where);
2481 }
2482 
2483 /*
2484  * pmap_ldt_cleanup: if the pmap has a local LDT, deallocate it, and
2485  * restore the default.
2486  */
2487 
2488 void
2489 pmap_ldt_cleanup(struct lwp *l)
2490 {
2491 	pmap_t pmap = l->l_proc->p_vmspace->vm_map.pmap;
2492 	union descriptor *dp = NULL;
2493 	size_t len = 0;
2494 	int sel = -1;
2495 
2496 	if (__predict_true(pmap->pm_ldt == NULL)) {
2497 		return;
2498 	}
2499 
2500 	mutex_enter(&cpu_lock);
2501 	if (pmap->pm_ldt != NULL) {
2502 		sel = pmap->pm_ldt_sel;
2503 		dp = pmap->pm_ldt;
2504 		len = pmap->pm_ldt_len;
2505 		pmap->pm_ldt_sel = GSYSSEL(GLDT_SEL, SEL_KPL);
2506 		pmap->pm_ldt = NULL;
2507 		pmap->pm_ldt_len = 0;
2508 		pmap_ldt_sync(pmap);
2509 		ldt_free(sel);
2510 		uvm_km_free(kernel_map, (vaddr_t)dp, len, UVM_KMF_WIRED);
2511 	}
2512 	mutex_exit(&cpu_lock);
2513 }
2514 #endif /* USER_LDT */
2515 
2516 /*
2517  * pmap_activate: activate a process' pmap
2518  *
2519  * => must be called with kernel preemption disabled
2520  * => if lwp is the curlwp, then set ci_want_pmapload so that
2521  *    actual MMU context switch will be done by pmap_load() later
2522  */
2523 
2524 void
2525 pmap_activate(struct lwp *l)
2526 {
2527 	struct cpu_info *ci;
2528 	struct pmap *pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map);
2529 
2530 	KASSERT(kpreempt_disabled());
2531 
2532 	ci = curcpu();
2533 
2534 	if (l == ci->ci_curlwp) {
2535 		struct pcb *pcb;
2536 
2537 		KASSERT(ci->ci_want_pmapload == 0);
2538 		KASSERT(ci->ci_tlbstate != TLBSTATE_VALID);
2539 #ifdef KSTACK_CHECK_DR0
2540 		/*
2541 		 * setup breakpoint on the top of stack
2542 		 */
2543 		if (l == &lwp0)
2544 			dr0(0, 0, 0, 0);
2545 		else
2546 			dr0(KSTACK_LOWEST_ADDR(l), 1, 3, 1);
2547 #endif
2548 
2549 		/*
2550 		 * no need to switch to kernel vmspace because
2551 		 * it's a subset of any vmspace.
2552 		 */
2553 
2554 		if (pmap == pmap_kernel()) {
2555 			ci->ci_want_pmapload = 0;
2556 			return;
2557 		}
2558 
2559 		pcb = lwp_getpcb(l);
2560 		ci->ci_want_pmapload = 1;
2561 
2562 #if defined(__x86_64__)
2563 		if (pcb->pcb_flags & PCB_GS64)
2564 			wrmsr(MSR_KERNELGSBASE, pcb->pcb_gs);
2565 		if (pcb->pcb_flags & PCB_FS64)
2566 			wrmsr(MSR_FSBASE, pcb->pcb_fs);
2567 #endif /* defined(__x86_64__) */
2568 	}
2569 }
2570 
2571 /*
2572  * pmap_reactivate: try to regain reference to the pmap.
2573  *
2574  * => must be called with kernel preemption disabled
2575  */
2576 
2577 static bool
2578 pmap_reactivate(struct pmap *pmap)
2579 {
2580 	struct cpu_info *ci;
2581 	uint32_t cpumask;
2582 	bool result;
2583 	uint32_t oldcpus;
2584 
2585 	ci = curcpu();
2586 	cpumask = ci->ci_cpumask;
2587 
2588 	KASSERT(kpreempt_disabled());
2589 #if defined(XEN) && defined(__x86_64__)
2590 	KASSERT(pmap->pm_pdirpa == xen_current_user_pgd);
2591 #elif defined(PAE)
2592 	KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(pmap_l3pd[0]));
2593 #elif !defined(XEN)
2594 	KASSERT(pmap->pm_pdirpa == pmap_pte2pa(rcr3()));
2595 #endif
2596 
2597 	/*
2598 	 * if we still have a lazy reference to this pmap,
2599 	 * we can assume that there was no tlb shootdown
2600 	 * for this pmap in the meantime.
2601 	 *
2602 	 * the order of events here is important as we must
2603 	 * synchronize with TLB shootdown interrupts.  declare
2604 	 * interest in invalidations (TLBSTATE_VALID) and then
2605 	 * check the cpumask, which the IPIs can change only
2606 	 * when the state is TLBSTATE_LAZY.
2607 	 */
2608 
2609 	ci->ci_tlbstate = TLBSTATE_VALID;
2610 	oldcpus = pmap->pm_cpus;
2611 	KASSERT((pmap->pm_kernel_cpus & cpumask) != 0);
2612 	if (oldcpus & cpumask) {
2613 		/* got it */
2614 		result = true;
2615 	} else {
2616 		/* must reload */
2617 		atomic_or_32(&pmap->pm_cpus, cpumask);
2618 		result = false;
2619 	}
2620 
2621 	return result;
2622 }
2623 
2624 /*
2625  * pmap_load: actually switch pmap.  (fill in %cr3 and LDT info)
2626  */
2627 
2628 void
2629 pmap_load(void)
2630 {
2631 	struct cpu_info *ci;
2632 	uint32_t cpumask;
2633 	struct pmap *pmap;
2634 	struct pmap *oldpmap;
2635 	struct lwp *l;
2636 	struct pcb *pcb;
2637 	uint64_t ncsw;
2638 
2639 	kpreempt_disable();
2640  retry:
2641 	ci = curcpu();
2642 	if (!ci->ci_want_pmapload) {
2643 		kpreempt_enable();
2644 		return;
2645 	}
2646 	cpumask = ci->ci_cpumask;
2647 	l = ci->ci_curlwp;
2648 	ncsw = l->l_ncsw;
2649 
2650 	/* should be able to take ipis. */
2651 	KASSERT(ci->ci_ilevel < IPL_HIGH);
2652 #ifdef XEN
2653 	/* XXX not yet KASSERT(x86_read_psl() != 0); */
2654 #else
2655 	KASSERT((x86_read_psl() & PSL_I) != 0);
2656 #endif
2657 
2658 	KASSERT(l != NULL);
2659 	pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map);
2660 	KASSERT(pmap != pmap_kernel());
2661 	oldpmap = ci->ci_pmap;
2662 	pcb = lwp_getpcb(l);
2663 
2664 	if (pmap == oldpmap) {
2665 		if (!pmap_reactivate(pmap)) {
2666 			u_int gen = uvm_emap_gen_return();
2667 
2668 			/*
2669 			 * pmap has been changed during deactivated.
2670 			 * our tlb may be stale.
2671 			 */
2672 
2673 			tlbflush();
2674 			uvm_emap_update(gen);
2675 		}
2676 
2677 		ci->ci_want_pmapload = 0;
2678 		kpreempt_enable();
2679 		return;
2680 	}
2681 
2682 	/*
2683 	 * grab a reference to the new pmap.
2684 	 */
2685 
2686 	pmap_reference(pmap);
2687 
2688 	/*
2689 	 * actually switch pmap.
2690 	 */
2691 
2692 	atomic_and_32(&oldpmap->pm_cpus, ~cpumask);
2693 	atomic_and_32(&oldpmap->pm_kernel_cpus, ~cpumask);
2694 
2695 #if defined(XEN) && defined(__x86_64__)
2696 	KASSERT(oldpmap->pm_pdirpa == xen_current_user_pgd ||
2697 	    oldpmap == pmap_kernel());
2698 #elif defined(PAE)
2699 	KASSERT(pmap_pdirpa(oldpmap, 0) == pmap_pte2pa(pmap_l3pd[0]));
2700 #elif !defined(XEN)
2701 	KASSERT(oldpmap->pm_pdirpa == pmap_pte2pa(rcr3()));
2702 #endif
2703 	KASSERT((pmap->pm_cpus & cpumask) == 0);
2704 	KASSERT((pmap->pm_kernel_cpus & cpumask) == 0);
2705 
2706 	/*
2707 	 * mark the pmap in use by this processor.  again we must
2708 	 * synchronize with TLB shootdown interrupts, so set the
2709 	 * state VALID first, then register us for shootdown events
2710 	 * on this pmap.
2711 	 */
2712 
2713 	ci->ci_tlbstate = TLBSTATE_VALID;
2714 	atomic_or_32(&pmap->pm_cpus, cpumask);
2715 	atomic_or_32(&pmap->pm_kernel_cpus, cpumask);
2716 	ci->ci_pmap = pmap;
2717 
2718 	/*
2719 	 * update tss.  now that we have registered for invalidations
2720 	 * from other CPUs, we're good to load the page tables.
2721 	 */
2722 #ifdef PAE
2723 	pcb->pcb_cr3 = pmap_l3paddr;
2724 #else
2725 	pcb->pcb_cr3 = pmap->pm_pdirpa;
2726 #endif
2727 #if defined(XEN) && defined(__x86_64__)
2728 	/* kernel pmap always in cr3 and should never go in user cr3 */
2729 	if (pmap_pdirpa(pmap, 0) != pmap_pdirpa(pmap_kernel(), 0)) {
2730 		/*
2731 		 * Map user space address in kernel space and load
2732 		 * user cr3
2733 		 */
2734 		int i, s;
2735 		pd_entry_t *old_pgd, *new_pgd;
2736 		paddr_t addr;
2737 		s = splvm();
2738 		new_pgd  = pmap->pm_pdir;
2739 		old_pgd = pmap_kernel()->pm_pdir;
2740 		addr = xpmap_ptom(pmap_pdirpa(pmap_kernel(), 0));
2741 		for (i = 0; i < PDIR_SLOT_PTE;
2742 		    i++, addr += sizeof(pd_entry_t)) {
2743 			if ((new_pgd[i] & PG_V) || (old_pgd[i] & PG_V))
2744 				xpq_queue_pte_update(addr, new_pgd[i]);
2745 		}
2746 		xpq_flush_queue(); /* XXXtlb */
2747 		tlbflush();
2748 		xen_set_user_pgd(pmap_pdirpa(pmap, 0));
2749 		xen_current_user_pgd = pmap_pdirpa(pmap, 0);
2750 		splx(s);
2751 	}
2752 #else /* XEN && x86_64 */
2753 #if defined(XEN)
2754 	/*
2755 	 * clear APDP slot, in case it points to a page table that has
2756 	 * been freed
2757 	 */
2758 	if (*APDP_PDE) {
2759 		int i;
2760 		for (i = 0; i < PDP_SIZE; i++) {
2761 			pmap_pte_set(&APDP_PDE[i], 0);
2762 #ifdef PAE
2763 			/* clear shadow entry too */
2764 			pmap_pte_set(&APDP_PDE_SHADOW[i], 0);
2765 #endif
2766 		}
2767 	}
2768 	/* lldt() does pmap_pte_flush() */
2769 #else /* XEN */
2770 #if defined(i386)
2771 	ci->ci_tss.tss_ldt = pmap->pm_ldt_sel;
2772 	ci->ci_tss.tss_cr3 = pcb->pcb_cr3;
2773 #endif
2774 #endif /* XEN */
2775 	lldt(pmap->pm_ldt_sel);
2776 #ifdef PAE
2777 	{
2778 	paddr_t l3_pd = xpmap_ptom_masked(pmap_l3paddr);
2779 	int i;
2780 	int s = splvm();
2781 	/* don't update the kernel L3 slot */
2782 	for (i = 0 ; i < PDP_SIZE - 1  ; i++, l3_pd += sizeof(pd_entry_t)) {
2783 		xpq_queue_pte_update(l3_pd,
2784 		    xpmap_ptom(pmap->pm_pdirpa[i]) | PG_V);
2785 	}
2786 	tlbflush();
2787 	xpq_flush_queue();
2788 	splx(s);
2789 	}
2790 #else /* PAE */
2791 	{
2792 	u_int gen = uvm_emap_gen_return();
2793 	lcr3(pcb->pcb_cr3);
2794 	uvm_emap_update(gen);
2795 	}
2796 #endif /* PAE */
2797 #endif /* XEN && x86_64 */
2798 
2799 	ci->ci_want_pmapload = 0;
2800 
2801 	/*
2802 	 * we're now running with the new pmap.  drop the reference
2803 	 * to the old pmap.  if we block, we need to go around again.
2804 	 */
2805 
2806 	pmap_destroy(oldpmap);
2807 	if (l->l_ncsw != ncsw) {
2808 		goto retry;
2809 	}
2810 
2811 	kpreempt_enable();
2812 }
2813 
2814 /*
2815  * pmap_deactivate: deactivate a process' pmap
2816  *
2817  * => must be called with kernel preemption disabled (high SPL is enough)
2818  */
2819 
2820 void
2821 pmap_deactivate(struct lwp *l)
2822 {
2823 	struct pmap *pmap;
2824 	struct cpu_info *ci;
2825 
2826 	KASSERT(kpreempt_disabled());
2827 
2828 	if (l != curlwp) {
2829 		return;
2830 	}
2831 
2832 	/*
2833 	 * wait for pending TLB shootdowns to complete.  necessary
2834 	 * because TLB shootdown state is per-CPU, and the LWP may
2835 	 * be coming off the CPU before it has a chance to call
2836 	 * pmap_update().
2837 	 */
2838 	pmap_tlb_shootwait();
2839 
2840 	ci = curcpu();
2841 
2842 	if (ci->ci_want_pmapload) {
2843 		KASSERT(vm_map_pmap(&l->l_proc->p_vmspace->vm_map)
2844 		    != pmap_kernel());
2845 		KASSERT(vm_map_pmap(&l->l_proc->p_vmspace->vm_map)
2846 		    != ci->ci_pmap || ci->ci_tlbstate != TLBSTATE_VALID);
2847 
2848 		/*
2849 		 * userspace has not been touched.
2850 		 * nothing to do here.
2851 		 */
2852 
2853 		ci->ci_want_pmapload = 0;
2854 		return;
2855 	}
2856 
2857 	pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map);
2858 
2859 	if (pmap == pmap_kernel()) {
2860 		return;
2861 	}
2862 
2863 #if defined(XEN) && defined(__x86_64__)
2864 	KASSERT(pmap->pm_pdirpa == xen_current_user_pgd);
2865 #elif defined(PAE)
2866 	KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(pmap_l3pd[0]));
2867 #elif !defined(XEN)
2868 	KASSERT(pmap->pm_pdirpa == pmap_pte2pa(rcr3()));
2869 #endif
2870 	KASSERT(ci->ci_pmap == pmap);
2871 
2872 	/*
2873 	 * we aren't interested in TLB invalidations for this pmap,
2874 	 * at least for the time being.
2875 	 */
2876 
2877 	KASSERT(ci->ci_tlbstate == TLBSTATE_VALID);
2878 	ci->ci_tlbstate = TLBSTATE_LAZY;
2879 }
2880 
2881 /*
2882  * end of lifecycle functions
2883  */
2884 
2885 /*
2886  * some misc. functions
2887  */
2888 
2889 static int
2890 pmap_pdes_invalid(vaddr_t va, pd_entry_t * const *pdes, pd_entry_t *lastpde)
2891 {
2892 	int i;
2893 	unsigned long index;
2894 	pd_entry_t pde;
2895 
2896 	for (i = PTP_LEVELS; i > 1; i--) {
2897 		index = pl_i(va, i);
2898 		pde = pdes[i - 2][index];
2899 		if ((pde & PG_V) == 0)
2900 			return i;
2901 	}
2902 	if (lastpde != NULL)
2903 		*lastpde = pde;
2904 	return 0;
2905 }
2906 
2907 /*
2908  * pmap_extract: extract a PA for the given VA
2909  */
2910 
2911 bool
2912 pmap_extract(struct pmap *pmap, vaddr_t va, paddr_t *pap)
2913 {
2914 	pt_entry_t *ptes, pte;
2915 	pd_entry_t pde;
2916 	pd_entry_t * const *pdes;
2917 	struct pmap *pmap2;
2918 	struct cpu_info *ci;
2919 	vaddr_t pa;
2920 	lwp_t *l;
2921 	bool hard, rv;
2922 
2923 	rv = false;
2924 	pa = 0;
2925 	l = curlwp;
2926 
2927 	KPREEMPT_DISABLE(l);
2928 	ci = l->l_cpu;
2929 	if (__predict_true(!ci->ci_want_pmapload && ci->ci_pmap == pmap) ||
2930 	    pmap == pmap_kernel()) {
2931 		/*
2932 		 * no need to lock, because it's pmap_kernel() or our
2933 		 * own pmap and is active.  if a user pmap, the caller
2934 		 * will hold the vm_map write/read locked and so prevent
2935 		 * entries from disappearing while we are here.  ptps
2936 		 * can disappear via pmap_remove() and pmap_protect(),
2937 		 * but they are called with the vm_map write locked.
2938 		 */
2939 		hard = false;
2940 		ptes = PTE_BASE;
2941 		pdes = normal_pdes;
2942 	} else {
2943 		/* we lose, do it the hard way. */
2944 		hard = true;
2945 		pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
2946 	}
2947 	if (pmap_pdes_valid(va, pdes, &pde)) {
2948 		pte = ptes[pl1_i(va)];
2949 		if (pde & PG_PS) {
2950 			pa = (pde & PG_LGFRAME) | (va & (NBPD_L2 - 1));
2951 			rv = true;
2952 		} else if (__predict_true((pte & PG_V) != 0)) {
2953 			pa = pmap_pte2pa(pte) | (va & (NBPD_L1 - 1));
2954 			rv = true;
2955 		}
2956 	}
2957 	if (__predict_false(hard)) {
2958 		pmap_unmap_ptes(pmap, pmap2);
2959 	}
2960 	KPREEMPT_ENABLE(l);
2961 	if (pap != NULL) {
2962 		*pap = pa;
2963 	}
2964 	return rv;
2965 }
2966 
2967 
2968 /*
2969  * vtophys: virtual address to physical address.  For use by
2970  * machine-dependent code only.
2971  */
2972 
2973 paddr_t
2974 vtophys(vaddr_t va)
2975 {
2976 	paddr_t pa;
2977 
2978 	if (pmap_extract(pmap_kernel(), va, &pa) == true)
2979 		return (pa);
2980 	return (0);
2981 }
2982 
2983 #ifdef XEN
2984 /*
2985  * pmap_extract_ma: extract a MA for the given VA
2986  */
2987 
2988 bool
2989 pmap_extract_ma(struct pmap *pmap, vaddr_t va, paddr_t *pap)
2990 {
2991 	pt_entry_t *ptes, pte;
2992 	pd_entry_t pde;
2993 	pd_entry_t * const *pdes;
2994 	struct pmap *pmap2;
2995 
2996 	kpreempt_disable();
2997 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
2998 	if (!pmap_pdes_valid(va, pdes, &pde)) {
2999 		pmap_unmap_ptes(pmap, pmap2);
3000 		kpreempt_enable();
3001 		return false;
3002 	}
3003 
3004 	pte = ptes[pl1_i(va)];
3005 	pmap_unmap_ptes(pmap, pmap2);
3006 	kpreempt_enable();
3007 
3008 	if (__predict_true((pte & PG_V) != 0)) {
3009 		if (pap != NULL)
3010 			*pap = (pte & PG_FRAME) | (va & (NBPD_L1 - 1));
3011 		return true;
3012 	}
3013 
3014 	return false;
3015 }
3016 
3017 /*
3018  * vtomach: virtual address to machine address.  For use by
3019  * machine-dependent code only.
3020  */
3021 
3022 paddr_t
3023 vtomach(vaddr_t va)
3024 {
3025 	paddr_t pa;
3026 
3027 	if (pmap_extract_ma(pmap_kernel(), va, &pa) == true)
3028 		return (pa);
3029 	return (0);
3030 }
3031 
3032 #endif /* XEN */
3033 
3034 
3035 
3036 /*
3037  * pmap_virtual_space: used during bootup [pmap_steal_memory] to
3038  *	determine the bounds of the kernel virtual addess space.
3039  */
3040 
3041 void
3042 pmap_virtual_space(vaddr_t *startp, vaddr_t *endp)
3043 {
3044 	*startp = virtual_avail;
3045 	*endp = virtual_end;
3046 }
3047 
3048 /*
3049  * pmap_map: map a range of PAs into kvm.
3050  *
3051  * => used during crash dump
3052  * => XXX: pmap_map() should be phased out?
3053  */
3054 
3055 vaddr_t
3056 pmap_map(vaddr_t va, paddr_t spa, paddr_t epa, vm_prot_t prot)
3057 {
3058 	while (spa < epa) {
3059 		pmap_kenter_pa(va, spa, prot, 0);
3060 		va += PAGE_SIZE;
3061 		spa += PAGE_SIZE;
3062 	}
3063 	pmap_update(pmap_kernel());
3064 	return va;
3065 }
3066 
3067 /*
3068  * pmap_zero_page: zero a page
3069  */
3070 
3071 void
3072 pmap_zero_page(paddr_t pa)
3073 {
3074 	pt_entry_t *zpte;
3075 	void *zerova;
3076 	int id;
3077 
3078 	kpreempt_disable();
3079 	id = cpu_number();
3080 	zpte = PTESLEW(zero_pte, id);
3081 	zerova = VASLEW(zerop, id);
3082 
3083 #ifdef DIAGNOSTIC
3084 	if (*zpte)
3085 		panic("pmap_zero_page: lock botch");
3086 #endif
3087 
3088 	pmap_pte_set(zpte, pmap_pa2pte(pa) | PG_V | PG_RW | PG_M | PG_U | PG_k);
3089 	pmap_pte_flush();
3090 	pmap_update_pg((vaddr_t)zerova);		/* flush TLB */
3091 
3092 	memset(zerova, 0, PAGE_SIZE);
3093 
3094 #if defined(DIAGNOSTIC) || defined(XEN)
3095 	pmap_pte_set(zpte, 0);				/* zap ! */
3096 	pmap_pte_flush();
3097 #endif
3098 	kpreempt_enable();
3099 }
3100 
3101 /*
3102  * pmap_pagezeroidle: the same, for the idle loop page zero'er.
3103  * Returns true if the page was zero'd, false if we aborted for
3104  * some reason.
3105  */
3106 
3107 bool
3108 pmap_pageidlezero(paddr_t pa)
3109 {
3110 	pt_entry_t *zpte;
3111 	void *zerova;
3112 	bool rv;
3113 	int id;
3114 
3115 	id = cpu_number();
3116 	zpte = PTESLEW(zero_pte, id);
3117 	zerova = VASLEW(zerop, id);
3118 
3119 	KASSERT(cpu_feature & CPUID_SSE2);
3120 	KASSERT(*zpte == 0);
3121 
3122 	pmap_pte_set(zpte, pmap_pa2pte(pa) | PG_V | PG_RW | PG_M | PG_U | PG_k);
3123 	pmap_pte_flush();
3124 	pmap_update_pg((vaddr_t)zerova);		/* flush TLB */
3125 
3126 	rv = sse2_idlezero_page(zerova);
3127 
3128 #if defined(DIAGNOSTIC) || defined(XEN)
3129 	pmap_pte_set(zpte, 0);				/* zap ! */
3130 	pmap_pte_flush();
3131 #endif
3132 
3133 	return rv;
3134 }
3135 
3136 /*
3137  * pmap_copy_page: copy a page
3138  */
3139 
3140 void
3141 pmap_copy_page(paddr_t srcpa, paddr_t dstpa)
3142 {
3143 	pt_entry_t *spte;
3144 	pt_entry_t *dpte;
3145 	void *csrcva;
3146 	void *cdstva;
3147 	int id;
3148 
3149 	kpreempt_disable();
3150 	id = cpu_number();
3151 	spte = PTESLEW(csrc_pte,id);
3152 	dpte = PTESLEW(cdst_pte,id);
3153 	csrcva = VASLEW(csrcp, id);
3154 	cdstva = VASLEW(cdstp, id);
3155 
3156 	KASSERT(*spte == 0 && *dpte == 0);
3157 
3158 	pmap_pte_set(spte, pmap_pa2pte(srcpa) | PG_V | PG_RW | PG_U | PG_k);
3159 	pmap_pte_set(dpte,
3160 	    pmap_pa2pte(dstpa) | PG_V | PG_RW | PG_M | PG_U | PG_k);
3161 	pmap_pte_flush();
3162 	pmap_update_2pg((vaddr_t)csrcva, (vaddr_t)cdstva);
3163 
3164 	memcpy(cdstva, csrcva, PAGE_SIZE);
3165 
3166 #if defined(DIAGNOSTIC) || defined(XEN)
3167 	pmap_pte_set(spte, 0);
3168 	pmap_pte_set(dpte, 0);
3169 	pmap_pte_flush();
3170 #endif
3171 	kpreempt_enable();
3172 }
3173 
3174 static pt_entry_t *
3175 pmap_map_ptp(struct vm_page *ptp)
3176 {
3177 	pt_entry_t *ptppte;
3178 	void *ptpva;
3179 	int id;
3180 
3181 	KASSERT(kpreempt_disabled());
3182 
3183 	id = cpu_number();
3184 	ptppte = PTESLEW(ptp_pte, id);
3185 	ptpva = VASLEW(ptpp, id);
3186 #if !defined(XEN)
3187 	pmap_pte_set(ptppte, pmap_pa2pte(VM_PAGE_TO_PHYS(ptp)) | PG_V | PG_M |
3188 	    PG_RW | PG_U | PG_k);
3189 #else
3190 	pmap_pte_set(ptppte, pmap_pa2pte(VM_PAGE_TO_PHYS(ptp)) | PG_V | PG_M |
3191 	    PG_U | PG_k);
3192 #endif
3193 	pmap_pte_flush();
3194 	pmap_update_pg((vaddr_t)ptpva);
3195 
3196 	return (pt_entry_t *)ptpva;
3197 }
3198 
3199 static void
3200 pmap_unmap_ptp(void)
3201 {
3202 #if defined(DIAGNOSTIC) || defined(XEN)
3203 	pt_entry_t *pte;
3204 
3205 	KASSERT(kpreempt_disabled());
3206 
3207 	pte = PTESLEW(ptp_pte, cpu_number());
3208 	if (*pte != 0) {
3209 		pmap_pte_set(pte, 0);
3210 		pmap_pte_flush();
3211 	}
3212 #endif
3213 }
3214 
3215 static pt_entry_t *
3216 pmap_map_pte(struct pmap *pmap, struct vm_page *ptp, vaddr_t va)
3217 {
3218 
3219 	KASSERT(kpreempt_disabled());
3220 	if (pmap_is_curpmap(pmap)) {
3221 		return &PTE_BASE[pl1_i(va)]; /* (k)vtopte */
3222 	}
3223 	KASSERT(ptp != NULL);
3224 	return pmap_map_ptp(ptp) + pl1_pi(va);
3225 }
3226 
3227 static void
3228 pmap_unmap_pte(void)
3229 {
3230 
3231 	KASSERT(kpreempt_disabled());
3232 
3233 	pmap_unmap_ptp();
3234 }
3235 
3236 /*
3237  * p m a p   r e m o v e   f u n c t i o n s
3238  *
3239  * functions that remove mappings
3240  */
3241 
3242 /*
3243  * pmap_remove_ptes: remove PTEs from a PTP
3244  *
3245  * => must have proper locking on pmap_master_lock
3246  * => caller must hold pmap's lock
3247  * => PTP must be mapped into KVA
3248  * => PTP should be null if pmap == pmap_kernel()
3249  * => must be called with kernel preemption disabled
3250  * => returns composite pte if at least one page should be shot down
3251  */
3252 
3253 static pt_entry_t
3254 pmap_remove_ptes(struct pmap *pmap, struct vm_page *ptp, vaddr_t ptpva,
3255 		 vaddr_t startva, vaddr_t endva, struct pv_entry **pv_tofree)
3256 {
3257 	struct pv_entry *pve;
3258 	pt_entry_t *pte = (pt_entry_t *) ptpva;
3259 	pt_entry_t opte, xpte = 0;
3260 
3261 	KASSERT(pmap == pmap_kernel() || mutex_owned(&pmap->pm_lock));
3262 	KASSERT(kpreempt_disabled());
3263 
3264 	/*
3265 	 * note that ptpva points to the PTE that maps startva.   this may
3266 	 * or may not be the first PTE in the PTP.
3267 	 *
3268 	 * we loop through the PTP while there are still PTEs to look at
3269 	 * and the wire_count is greater than 1 (because we use the wire_count
3270 	 * to keep track of the number of real PTEs in the PTP).
3271 	 */
3272 
3273 	for (/*null*/; startva < endva && (ptp == NULL || ptp->wire_count > 1)
3274 			     ; pte++, startva += PAGE_SIZE) {
3275 		struct vm_page *pg;
3276 		struct pmap_page *pp;
3277 
3278 		if (!pmap_valid_entry(*pte))
3279 			continue;			/* VA not mapped */
3280 
3281 		/* atomically save the old PTE and zap! it */
3282 		opte = pmap_pte_testset(pte, 0);
3283 		if (!pmap_valid_entry(opte)) {
3284 			continue;
3285 		}
3286 
3287 		pmap_exec_account(pmap, startva, opte, 0);
3288 		pmap_stats_update_bypte(pmap, 0, opte);
3289 		xpte |= opte;
3290 
3291 		if (ptp) {
3292 			ptp->wire_count--;		/* dropping a PTE */
3293 			/* Make sure that the PDE is flushed */
3294 			if (ptp->wire_count <= 1)
3295 				xpte |= PG_U;
3296 		}
3297 
3298 		/*
3299 		 * if we are not on a pv_head list we are done.
3300 		 */
3301 
3302 		if ((opte & PG_PVLIST) == 0) {
3303 #if defined(DIAGNOSTIC) && !defined(DOM0OPS)
3304 			if (PHYS_TO_VM_PAGE(pmap_pte2pa(opte)) != NULL)
3305 				panic("pmap_remove_ptes: managed page without "
3306 				      "PG_PVLIST for 0x%lx", startva);
3307 #endif
3308 			continue;
3309 		}
3310 
3311 		pg = PHYS_TO_VM_PAGE(pmap_pte2pa(opte));
3312 #ifdef DIAGNOSTIC
3313 		if (pg == NULL)
3314 			panic("pmap_remove_ptes: unmanaged page marked "
3315 			      "PG_PVLIST, va = 0x%lx, pa = 0x%lx",
3316 			      startva, (u_long)pmap_pte2pa(opte));
3317 #endif
3318 
3319 		/* sync R/M bits */
3320 		pp = VM_PAGE_TO_PP(pg);
3321 		pp_lock(pp);
3322 		pp->pp_attrs |= opte;
3323 		pve = pmap_remove_pv(pp, ptp, startva);
3324 		pp_unlock(pp);
3325 
3326 		if (pve != NULL) {
3327 			pve->pve_next = *pv_tofree;
3328 			*pv_tofree = pve;
3329 		}
3330 
3331 		/* end of "for" loop: time for next pte */
3332 	}
3333 
3334 	return xpte;
3335 }
3336 
3337 
3338 /*
3339  * pmap_remove_pte: remove a single PTE from a PTP
3340  *
3341  * => must have proper locking on pmap_master_lock
3342  * => caller must hold pmap's lock
3343  * => PTP must be mapped into KVA
3344  * => PTP should be null if pmap == pmap_kernel()
3345  * => returns true if we removed a mapping
3346  * => must be called with kernel preemption disabled
3347  */
3348 
3349 static bool
3350 pmap_remove_pte(struct pmap *pmap, struct vm_page *ptp, pt_entry_t *pte,
3351 		vaddr_t va, struct pv_entry **pv_tofree)
3352 {
3353 	pt_entry_t opte;
3354 	struct pv_entry *pve;
3355 	struct vm_page *pg;
3356 	struct pmap_page *pp;
3357 
3358 	KASSERT(pmap == pmap_kernel() || mutex_owned(&pmap->pm_lock));
3359 	KASSERT(pmap == pmap_kernel() || kpreempt_disabled());
3360 
3361 	if (!pmap_valid_entry(*pte))
3362 		return(false);		/* VA not mapped */
3363 
3364 	/* atomically save the old PTE and zap! it */
3365 	opte = pmap_pte_testset(pte, 0);
3366 	if (!pmap_valid_entry(opte)) {
3367 		return false;
3368 	}
3369 
3370 	pmap_exec_account(pmap, va, opte, 0);
3371 	pmap_stats_update_bypte(pmap, 0, opte);
3372 
3373 	if (opte & PG_U)
3374 		pmap_tlb_shootdown(pmap, va, 0, opte);
3375 
3376 	if (ptp) {
3377 		ptp->wire_count--;		/* dropping a PTE */
3378 		/* Make sure that the PDE is flushed */
3379 		if ((ptp->wire_count <= 1) && !(opte & PG_U))
3380 			pmap_tlb_shootdown(pmap, va, 0, opte);
3381 	}
3382 
3383 	/*
3384 	 * if we are not on a pv_head list we are done.
3385 	 */
3386 
3387 	if ((opte & PG_PVLIST) == 0) {
3388 #if defined(DIAGNOSTIC) && !defined(DOM0OPS)
3389 		if (PHYS_TO_VM_PAGE(pmap_pte2pa(opte)) != NULL)
3390 			panic("pmap_remove_pte: managed page without "
3391 			      "PG_PVLIST for 0x%lx", va);
3392 #endif
3393 		return(true);
3394 	}
3395 
3396 	pg = PHYS_TO_VM_PAGE(pmap_pte2pa(opte));
3397 #ifdef DIAGNOSTIC
3398 	if (pg == NULL)
3399 		panic("pmap_remove_pte: unmanaged page marked "
3400 		    "PG_PVLIST, va = 0x%lx, pa = 0x%lx", va,
3401 		    (u_long)(pmap_pte2pa(opte)));
3402 #endif
3403 
3404 	/* sync R/M bits */
3405 	pp = VM_PAGE_TO_PP(pg);
3406 	pp_lock(pp);
3407 	pp->pp_attrs |= opte;
3408 	pve = pmap_remove_pv(pp, ptp, va);
3409 	pp_unlock(pp);
3410 
3411 	if (pve) {
3412 		pve->pve_next = *pv_tofree;
3413 		*pv_tofree = pve;
3414 	}
3415 
3416 	return(true);
3417 }
3418 
3419 /*
3420  * pmap_remove: mapping removal function.
3421  *
3422  * => caller should not be holding any pmap locks
3423  */
3424 
3425 void
3426 pmap_remove(struct pmap *pmap, vaddr_t sva, vaddr_t eva)
3427 {
3428 	pt_entry_t *ptes, xpte = 0;
3429 	pd_entry_t pde;
3430 	pd_entry_t * const *pdes;
3431 	struct pv_entry *pv_tofree = NULL;
3432 	bool result;
3433 	paddr_t ptppa;
3434 	vaddr_t blkendva, va = sva;
3435 	struct vm_page *ptp;
3436 	struct pmap *pmap2;
3437 
3438 	kpreempt_disable();
3439 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);	/* locks pmap */
3440 
3441 	/*
3442 	 * removing one page?  take shortcut function.
3443 	 */
3444 
3445 	if (va + PAGE_SIZE == eva) {
3446 		if (pmap_pdes_valid(va, pdes, &pde)) {
3447 
3448 			/* PA of the PTP */
3449 			ptppa = pmap_pte2pa(pde);
3450 
3451 			/* get PTP if non-kernel mapping */
3452 			if (pmap == pmap_kernel()) {
3453 				/* we never free kernel PTPs */
3454 				ptp = NULL;
3455 			} else {
3456 				ptp = pmap_find_ptp(pmap, va, ptppa, 1);
3457 #ifdef DIAGNOSTIC
3458 				if (ptp == NULL)
3459 					panic("pmap_remove: unmanaged "
3460 					      "PTP detected");
3461 #endif
3462 			}
3463 
3464 			/* do it! */
3465 			result = pmap_remove_pte(pmap, ptp,
3466 			    &ptes[pl1_i(va)], va, &pv_tofree);
3467 
3468 			/*
3469 			 * if mapping removed and the PTP is no longer
3470 			 * being used, free it!
3471 			 */
3472 
3473 			if (result && ptp && ptp->wire_count <= 1)
3474 				pmap_free_ptp(pmap, ptp, va, ptes, pdes);
3475 		}
3476 	} else for (/* null */ ; va < eva ; va = blkendva) {
3477 		int lvl;
3478 
3479 		/* determine range of block */
3480 		blkendva = x86_round_pdr(va+1);
3481 		if (blkendva > eva)
3482 			blkendva = eva;
3483 
3484 		/*
3485 		 * XXXCDC: our PTE mappings should never be removed
3486 		 * with pmap_remove!  if we allow this (and why would
3487 		 * we?) then we end up freeing the pmap's page
3488 		 * directory page (PDP) before we are finished using
3489 		 * it when we hit in in the recursive mapping.  this
3490 		 * is BAD.
3491 		 *
3492 		 * long term solution is to move the PTEs out of user
3493 		 * address space.  and into kernel address space (up
3494 		 * with APTE).  then we can set VM_MAXUSER_ADDRESS to
3495 		 * be VM_MAX_ADDRESS.
3496 		 */
3497 
3498 		if (pl_i(va, PTP_LEVELS) == PDIR_SLOT_PTE)
3499 			/* XXXCDC: ugly hack to avoid freeing PDP here */
3500 			continue;
3501 
3502 		lvl = pmap_pdes_invalid(va, pdes, &pde);
3503 		if (lvl != 0) {
3504 			/*
3505 			 * skip a range corresponding to an invalid pde.
3506 			 */
3507 			blkendva = (va & ptp_masks[lvl - 1]) + nbpd[lvl - 1];
3508  			continue;
3509 		}
3510 
3511 		/* PA of the PTP */
3512 		ptppa = pmap_pte2pa(pde);
3513 
3514 		/* get PTP if non-kernel mapping */
3515 		if (pmap == pmap_kernel()) {
3516 			/* we never free kernel PTPs */
3517 			ptp = NULL;
3518 		} else {
3519 			ptp = pmap_find_ptp(pmap, va, ptppa, 1);
3520 #ifdef DIAGNOSTIC
3521 			if (ptp == NULL)
3522 				panic("pmap_remove: unmanaged PTP "
3523 				      "detected");
3524 #endif
3525 		}
3526 		xpte |= pmap_remove_ptes(pmap, ptp,
3527 		    (vaddr_t)&ptes[pl1_i(va)], va, blkendva, &pv_tofree);
3528 
3529 		/* if PTP is no longer being used, free it! */
3530 		if (ptp && ptp->wire_count <= 1) {
3531 			pmap_free_ptp(pmap, ptp, va, ptes, pdes);
3532 		}
3533 		if ((xpte & PG_U) != 0)
3534 			pmap_tlb_shootdown(pmap, sva, eva, xpte);
3535 	}
3536 	pmap_unmap_ptes(pmap, pmap2);		/* unlock pmap */
3537 	kpreempt_enable();
3538 
3539 	/* Now we free unused PVs */
3540 	if (pv_tofree)
3541 		pmap_free_pvs(pv_tofree);
3542 }
3543 
3544 /*
3545  * pmap_sync_pv: clear pte bits and return the old value of the pte.
3546  *
3547  * => called with pp_lock held. (thus preemption disabled)
3548  * => issues tlb shootdowns if necessary.
3549  */
3550 
3551 static int
3552 pmap_sync_pv(struct pv_pte *pvpte, pt_entry_t expect, int clearbits,
3553     pt_entry_t *optep)
3554 {
3555 	struct pmap *pmap;
3556 	struct vm_page *ptp;
3557 	vaddr_t va;
3558 	pt_entry_t *ptep;
3559 	pt_entry_t opte;
3560 	pt_entry_t npte;
3561 	bool need_shootdown;
3562 
3563 	ptp = pvpte->pte_ptp;
3564 	va = pvpte->pte_va;
3565 	KASSERT(ptp == NULL || ptp->uobject != NULL);
3566 	KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset);
3567 	pmap = ptp_to_pmap(ptp);
3568 
3569 	KASSERT((expect & ~(PG_FRAME | PG_V)) == 0);
3570 	KASSERT((expect & PG_V) != 0);
3571 	KASSERT(clearbits == ~0 || (clearbits & ~(PG_M | PG_U | PG_RW)) == 0);
3572 	KASSERT(kpreempt_disabled());
3573 
3574 	ptep = pmap_map_pte(pmap, ptp, va);
3575 	do {
3576 		opte = *ptep;
3577 		KASSERT((opte & (PG_M | PG_U)) != PG_M);
3578 		KASSERT((opte & (PG_U | PG_V)) != PG_U);
3579 		KASSERT(opte == 0 || (opte & PG_V) != 0);
3580 		if ((opte & (PG_FRAME | PG_V)) != expect) {
3581 
3582 			/*
3583 			 * we lost a race with a V->P operation like
3584 			 * pmap_remove().  wait for the competitor
3585 			 * reflecting pte bits into mp_attrs.
3586 			 *
3587 			 * issue a redundant TLB shootdown so that
3588 			 * we can wait for its completion.
3589 			 */
3590 
3591 			pmap_unmap_pte();
3592 			if (clearbits != 0) {
3593 				pmap_tlb_shootdown(pmap, va, 0,
3594 				    (pmap == pmap_kernel() ? PG_G : 0));
3595 			}
3596 			return EAGAIN;
3597 		}
3598 
3599 		/*
3600 		 * check if there's anything to do on this pte.
3601 		 */
3602 
3603 		if ((opte & clearbits) == 0) {
3604 			need_shootdown = false;
3605 			break;
3606 		}
3607 
3608 		/*
3609 		 * we need a shootdown if the pte is cached. (PG_U)
3610 		 *
3611 		 * ...unless we are clearing only the PG_RW bit and
3612 		 * it isn't cached as RW. (PG_M)
3613 		 */
3614 
3615 		need_shootdown = (opte & PG_U) != 0 &&
3616 		    !(clearbits == PG_RW && (opte & PG_M) == 0);
3617 
3618 		npte = opte & ~clearbits;
3619 
3620 		/*
3621 		 * if we need a shootdown anyway, clear PG_U and PG_M.
3622 		 */
3623 
3624 		if (need_shootdown) {
3625 			npte &= ~(PG_U | PG_M);
3626 		}
3627 		KASSERT((npte & (PG_M | PG_U)) != PG_M);
3628 		KASSERT((npte & (PG_U | PG_V)) != PG_U);
3629 		KASSERT(npte == 0 || (opte & PG_V) != 0);
3630 	} while (pmap_pte_cas(ptep, opte, npte) != opte);
3631 
3632 	if (need_shootdown) {
3633 		pmap_tlb_shootdown(pmap, va, 0, opte);
3634 	}
3635 	pmap_unmap_pte();
3636 
3637 	*optep = opte;
3638 	return 0;
3639 }
3640 
3641 /*
3642  * pmap_page_remove: remove a managed vm_page from all pmaps that map it
3643  *
3644  * => R/M bits are sync'd back to attrs
3645  */
3646 
3647 void
3648 pmap_page_remove(struct vm_page *pg)
3649 {
3650 	struct pmap_page *pp;
3651 	struct pv_pte *pvpte;
3652 	struct pv_entry *killlist = NULL;
3653 	struct vm_page *ptp;
3654 	pt_entry_t expect;
3655 	lwp_t *l;
3656 	int count;
3657 
3658 	l = curlwp;
3659 	pp = VM_PAGE_TO_PP(pg);
3660 	expect = pmap_pa2pte(VM_PAGE_TO_PHYS(pg)) | PG_V;
3661 	count = SPINLOCK_BACKOFF_MIN;
3662 	kpreempt_disable();
3663 startover:
3664 	pp_lock(pp);
3665 	while ((pvpte = pv_pte_first(pp)) != NULL) {
3666 		struct pmap *pmap;
3667 		struct pv_entry *pve;
3668 		pt_entry_t opte;
3669 		vaddr_t va;
3670 		int error;
3671 
3672 		/*
3673 		 * add a reference to the pmap before clearing the pte.
3674 		 * otherwise the pmap can disappear behind us.
3675 		 */
3676 
3677 		ptp = pvpte->pte_ptp;
3678 		pmap = ptp_to_pmap(ptp);
3679 		if (ptp != NULL) {
3680 			pmap_reference(pmap);
3681 		}
3682 
3683 		error = pmap_sync_pv(pvpte, expect, ~0, &opte);
3684 		if (error == EAGAIN) {
3685 			int hold_count;
3686 			pp_unlock(pp);
3687 			KERNEL_UNLOCK_ALL(curlwp, &hold_count);
3688 			if (ptp != NULL) {
3689 				pmap_destroy(pmap);
3690 			}
3691 			SPINLOCK_BACKOFF(count);
3692 			KERNEL_LOCK(hold_count, curlwp);
3693 			goto startover;
3694 		}
3695 
3696 		pp->pp_attrs |= opte;
3697 		va = pvpte->pte_va;
3698 		pve = pmap_remove_pv(pp, ptp, va);
3699 		pp_unlock(pp);
3700 
3701 		/* update the PTP reference count.  free if last reference. */
3702 		if (ptp != NULL) {
3703 			struct pmap *pmap2;
3704 			pt_entry_t *ptes;
3705 			pd_entry_t * const *pdes;
3706 
3707 			KASSERT(pmap != pmap_kernel());
3708 
3709 			pmap_tlb_shootwait();
3710 			pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
3711 			pmap_stats_update_bypte(pmap, 0, opte);
3712 			ptp->wire_count--;
3713 			if (ptp->wire_count <= 1) {
3714 				pmap_free_ptp(pmap, ptp, va, ptes, pdes);
3715 			}
3716 			pmap_unmap_ptes(pmap, pmap2);
3717 			pmap_destroy(pmap);
3718 		} else {
3719 			KASSERT(pmap == pmap_kernel());
3720 			pmap_stats_update_bypte(pmap, 0, opte);
3721 		}
3722 
3723 		if (pve != NULL) {
3724 			pve->pve_next = killlist;	/* mark it for death */
3725 			killlist = pve;
3726 		}
3727 		pp_lock(pp);
3728 	}
3729 	pp_unlock(pp);
3730 	kpreempt_enable();
3731 
3732 	/* Now free unused pvs. */
3733 	pmap_free_pvs(killlist);
3734 }
3735 
3736 /*
3737  * p m a p   a t t r i b u t e  f u n c t i o n s
3738  * functions that test/change managed page's attributes
3739  * since a page can be mapped multiple times we must check each PTE that
3740  * maps it by going down the pv lists.
3741  */
3742 
3743 /*
3744  * pmap_test_attrs: test a page's attributes
3745  */
3746 
3747 bool
3748 pmap_test_attrs(struct vm_page *pg, unsigned testbits)
3749 {
3750 	struct pmap_page *pp;
3751 	struct pv_pte *pvpte;
3752 	pt_entry_t expect;
3753 	u_int result;
3754 
3755 	pp = VM_PAGE_TO_PP(pg);
3756 	if ((pp->pp_attrs & testbits) != 0) {
3757 		return true;
3758 	}
3759 	expect = pmap_pa2pte(VM_PAGE_TO_PHYS(pg)) | PG_V;
3760 	pp_lock(pp);
3761 	for (pvpte = pv_pte_first(pp); pvpte; pvpte = pv_pte_next(pp, pvpte)) {
3762 		pt_entry_t opte;
3763 		int error;
3764 
3765 		if ((pp->pp_attrs & testbits) != 0) {
3766 			break;
3767 		}
3768 		error = pmap_sync_pv(pvpte, expect, 0, &opte);
3769 		if (error == 0) {
3770 			pp->pp_attrs |= opte;
3771 		}
3772 	}
3773 	result = pp->pp_attrs & testbits;
3774 	pp_unlock(pp);
3775 
3776 	/*
3777 	 * note that we will exit the for loop with a non-null pve if
3778 	 * we have found the bits we are testing for.
3779 	 */
3780 
3781 	return result != 0;
3782 }
3783 
3784 /*
3785  * pmap_clear_attrs: clear the specified attribute for a page.
3786  *
3787  * => we return true if we cleared one of the bits we were asked to
3788  */
3789 
3790 bool
3791 pmap_clear_attrs(struct vm_page *pg, unsigned clearbits)
3792 {
3793 	struct pmap_page *pp;
3794 	struct pv_pte *pvpte;
3795 	u_int result;
3796 	pt_entry_t expect;
3797 	int count;
3798 
3799 	pp = VM_PAGE_TO_PP(pg);
3800 	expect = pmap_pa2pte(VM_PAGE_TO_PHYS(pg)) | PG_V;
3801 	count = SPINLOCK_BACKOFF_MIN;
3802 	kpreempt_disable();
3803 startover:
3804 	pp_lock(pp);
3805 	for (pvpte = pv_pte_first(pp); pvpte; pvpte = pv_pte_next(pp, pvpte)) {
3806 		pt_entry_t opte;
3807 		int error;
3808 
3809 		error = pmap_sync_pv(pvpte, expect, clearbits, &opte);
3810 		if (error == EAGAIN) {
3811 			int hold_count;
3812 			pp_unlock(pp);
3813 			KERNEL_UNLOCK_ALL(curlwp, &hold_count);
3814 			SPINLOCK_BACKOFF(count);
3815 			KERNEL_LOCK(hold_count, curlwp);
3816 			goto startover;
3817 		}
3818 		pp->pp_attrs |= opte;
3819 	}
3820 	result = pp->pp_attrs & clearbits;
3821 	pp->pp_attrs &= ~clearbits;
3822 	pp_unlock(pp);
3823 	kpreempt_enable();
3824 
3825 	return result != 0;
3826 }
3827 
3828 
3829 /*
3830  * p m a p   p r o t e c t i o n   f u n c t i o n s
3831  */
3832 
3833 /*
3834  * pmap_page_protect: change the protection of all recorded mappings
3835  *	of a managed page
3836  *
3837  * => NOTE: this is an inline function in pmap.h
3838  */
3839 
3840 /* see pmap.h */
3841 
3842 /*
3843  * pmap_protect: set the protection in of the pages in a pmap
3844  *
3845  * => NOTE: this is an inline function in pmap.h
3846  */
3847 
3848 /* see pmap.h */
3849 
3850 /*
3851  * pmap_write_protect: write-protect pages in a pmap
3852  */
3853 
3854 void
3855 pmap_write_protect(struct pmap *pmap, vaddr_t sva, vaddr_t eva, vm_prot_t prot)
3856 {
3857 	pt_entry_t *ptes, *epte;
3858 	pt_entry_t *spte;
3859 	pd_entry_t * const *pdes;
3860 	vaddr_t blockend, va;
3861 	pt_entry_t opte;
3862 	struct pmap *pmap2;
3863 
3864 	KASSERT(curlwp->l_md.md_gc_pmap != pmap);
3865 
3866 	kpreempt_disable();
3867 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);	/* locks pmap */
3868 
3869 	/* should be ok, but just in case ... */
3870 	sva &= PG_FRAME;
3871 	eva &= PG_FRAME;
3872 
3873 	for (va = sva ; va < eva ; va = blockend) {
3874 
3875 		blockend = (va & L2_FRAME) + NBPD_L2;
3876 		if (blockend > eva)
3877 			blockend = eva;
3878 
3879 		/*
3880 		 * XXXCDC: our PTE mappings should never be write-protected!
3881 		 *
3882 		 * long term solution is to move the PTEs out of user
3883 		 * address space.  and into kernel address space (up
3884 		 * with APTE).  then we can set VM_MAXUSER_ADDRESS to
3885 		 * be VM_MAX_ADDRESS.
3886 		 */
3887 
3888 		/* XXXCDC: ugly hack to avoid freeing PDP here */
3889 		if (pl_i(va, PTP_LEVELS) == PDIR_SLOT_PTE)
3890 			continue;
3891 
3892 		/* empty block? */
3893 		if (!pmap_pdes_valid(va, pdes, NULL))
3894 			continue;
3895 
3896 #ifdef DIAGNOSTIC
3897 		if (va >= VM_MAXUSER_ADDRESS &&
3898 		    va < VM_MAX_ADDRESS)
3899 			panic("pmap_write_protect: PTE space");
3900 #endif
3901 
3902 		spte = &ptes[pl1_i(va)];
3903 		epte = &ptes[pl1_i(blockend)];
3904 
3905 		for (/*null */; spte < epte ; spte++) {
3906 			pt_entry_t npte;
3907 
3908 			do {
3909 				opte = *spte;
3910 				if ((~opte & (PG_RW | PG_V)) != 0) {
3911 					goto next;
3912 				}
3913 				npte = opte & ~PG_RW;
3914 			} while (pmap_pte_cas(spte, opte, npte) != opte);
3915 			if ((opte & PG_M) != 0) {
3916 				vaddr_t tva;
3917 
3918 				tva = x86_ptob(spte - ptes);
3919 				pmap_tlb_shootdown(pmap, tva, 0, opte);
3920 			}
3921 next:;
3922 		}
3923 	}
3924 
3925 	pmap_unmap_ptes(pmap, pmap2);	/* unlocks pmap */
3926 	kpreempt_enable();
3927 }
3928 
3929 /*
3930  * end of protection functions
3931  */
3932 
3933 /*
3934  * pmap_unwire: clear the wired bit in the PTE
3935  *
3936  * => mapping should already be in map
3937  */
3938 
3939 void
3940 pmap_unwire(struct pmap *pmap, vaddr_t va)
3941 {
3942 	pt_entry_t *ptes;
3943 	pd_entry_t * const *pdes;
3944 	struct pmap *pmap2;
3945 
3946 	kpreempt_disable();
3947 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);	/* locks pmap */
3948 
3949 	if (pmap_pdes_valid(va, pdes, NULL)) {
3950 		pt_entry_t *ptep = &ptes[pl1_i(va)];
3951 		pt_entry_t opte = *ptep;
3952 
3953 #ifdef DIAGNOSTIC
3954 		if (!pmap_valid_entry(opte))
3955 			panic("pmap_unwire: invalid (unmapped) va 0x%lx", va);
3956 #endif
3957 		if ((opte & PG_W) != 0) {
3958 			pt_entry_t npte = opte & ~PG_W;
3959 
3960 			opte = pmap_pte_testset(ptep, npte);
3961 			pmap_stats_update_bypte(pmap, npte, opte);
3962 		}
3963 #ifdef DIAGNOSTIC
3964 		else {
3965 			printf("pmap_unwire: wiring for pmap %p va 0x%lx "
3966 			       "didn't change!\n", pmap, va);
3967 		}
3968 #endif
3969 		pmap_unmap_ptes(pmap, pmap2);		/* unlocks map */
3970 	}
3971 #ifdef DIAGNOSTIC
3972 	else {
3973 		panic("pmap_unwire: invalid PDE");
3974 	}
3975 #endif
3976 	kpreempt_enable();
3977 }
3978 
3979 /*
3980  * pmap_copy: copy mappings from one pmap to another
3981  *
3982  * => optional function
3983  * void pmap_copy(dst_pmap, src_pmap, dst_addr, len, src_addr)
3984  */
3985 
3986 /*
3987  * defined as macro in pmap.h
3988  */
3989 
3990 /*
3991  * pmap_enter: enter a mapping into a pmap
3992  *
3993  * => must be done "now" ... no lazy-evaluation
3994  * => we set pmap => pv_head locking
3995  */
3996 #ifdef XEN
3997 int
3998 pmap_enter_ma(struct pmap *pmap, vaddr_t va, paddr_t ma, paddr_t pa,
3999 	   vm_prot_t prot, u_int flags, int domid)
4000 {
4001 #else /* XEN */
4002 int
4003 pmap_enter(struct pmap *pmap, vaddr_t va, paddr_t pa, vm_prot_t prot,
4004 	   u_int flags)
4005 {
4006 	paddr_t ma = pa;
4007 #endif /* XEN */
4008 	pt_entry_t *ptes, opte, npte;
4009 	pt_entry_t *ptep;
4010 	pd_entry_t * const *pdes;
4011 	struct vm_page *ptp, *pg;
4012 	struct pmap_page *new_pp;
4013 	struct pmap_page *old_pp;
4014 	struct pv_entry *old_pve = NULL;
4015 	struct pv_entry *new_pve;
4016 	struct pv_entry *new_pve2;
4017 	int error;
4018 	bool wired = (flags & PMAP_WIRED) != 0;
4019 	struct pmap *pmap2;
4020 
4021 	KASSERT(pmap_initialized);
4022 	KASSERT(curlwp->l_md.md_gc_pmap != pmap);
4023 
4024 #ifdef DIAGNOSTIC
4025 	/* sanity check: totally out of range? */
4026 	if (va >= VM_MAX_KERNEL_ADDRESS)
4027 		panic("pmap_enter: too big");
4028 
4029 	if (va == (vaddr_t) PDP_BASE || va == (vaddr_t) APDP_BASE)
4030 		panic("pmap_enter: trying to map over PDP/APDP!");
4031 
4032 	/* sanity check: kernel PTPs should already have been pre-allocated */
4033 	if (va >= VM_MIN_KERNEL_ADDRESS &&
4034 	    !pmap_valid_entry(pmap->pm_pdir[pl_i(va, PTP_LEVELS)]))
4035 		panic("pmap_enter: missing kernel PTP for va %lx!", va);
4036 #endif /* DIAGNOSTIC */
4037 #ifdef XEN
4038 	KASSERT(domid == DOMID_SELF || pa == 0);
4039 #endif /* XEN */
4040 
4041 	npte = ma | protection_codes[prot] | PG_V;
4042 	if (wired)
4043 	        npte |= PG_W;
4044 	if (flags & PMAP_NOCACHE)
4045 		npte |= PG_N;
4046 	if (va < VM_MAXUSER_ADDRESS)
4047 		npte |= PG_u;
4048 	else if (va < VM_MAX_ADDRESS)
4049 		npte |= (PG_u | PG_RW);	/* XXXCDC: no longer needed? */
4050 	else
4051 		npte |= PG_k;
4052 	if (pmap == pmap_kernel())
4053 		npte |= pmap_pg_g;
4054 	if (flags & VM_PROT_ALL) {
4055 		npte |= PG_U;
4056 		if (flags & VM_PROT_WRITE) {
4057 			KASSERT((npte & PG_RW) != 0);
4058 			npte |= PG_M;
4059 		}
4060 	}
4061 
4062 #ifdef XEN
4063 	if (domid != DOMID_SELF)
4064 		pg = NULL;
4065 	else
4066 #endif
4067 		pg = PHYS_TO_VM_PAGE(pa);
4068 	if (pg != NULL) {
4069 		/* This is a managed page */
4070 		npte |= PG_PVLIST;
4071 		new_pp = VM_PAGE_TO_PP(pg);
4072 	} else {
4073 		new_pp = NULL;
4074 	}
4075 
4076 	/* get pves. */
4077 	new_pve = pool_cache_get(&pmap_pv_cache, PR_NOWAIT);
4078 	new_pve2 = pool_cache_get(&pmap_pv_cache, PR_NOWAIT);
4079 	if (new_pve == NULL || new_pve2 == NULL) {
4080 		if (flags & PMAP_CANFAIL) {
4081 			error = ENOMEM;
4082 			goto out2;
4083 		}
4084 		panic("pmap_enter: pve allocation failed");
4085 	}
4086 
4087 	kpreempt_disable();
4088 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);	/* locks pmap */
4089 	if (pmap == pmap_kernel()) {
4090 		ptp = NULL;
4091 	} else {
4092 		ptp = pmap_get_ptp(pmap, va, pdes);
4093 		if (ptp == NULL) {
4094 			pmap_unmap_ptes(pmap, pmap2);
4095 			if (flags & PMAP_CANFAIL) {
4096 				error = ENOMEM;
4097 				goto out;
4098 			}
4099 			panic("pmap_enter: get ptp failed");
4100 		}
4101 	}
4102 
4103 	/*
4104 	 * update the pte.
4105 	 */
4106 
4107 	ptep = &ptes[pl1_i(va)];
4108 	do {
4109 		opte = *ptep;
4110 
4111 		/*
4112 		 * if the same page, inherit PG_U and PG_M.
4113 		 */
4114 		if (((opte ^ npte) & (PG_FRAME | PG_V)) == 0) {
4115 			npte |= opte & (PG_U | PG_M);
4116 		}
4117 #if defined(XEN)
4118 		if (domid != DOMID_SELF) {
4119 			/* pmap_pte_cas with error handling */
4120 			int s = splvm();
4121 			if (opte != *ptep) {
4122 				splx(s);
4123 				continue;
4124 			}
4125 			error = xpq_update_foreign(
4126 			    vtomach((vaddr_t)ptep), npte, domid);
4127 			splx(s);
4128 			if (error) {
4129 				if (ptp != NULL && ptp->wire_count <= 1) {
4130 					pmap_free_ptp(pmap, ptp, va, ptes, pdes);
4131 				}
4132 				pmap_unmap_ptes(pmap, pmap2);
4133 				goto out;
4134 			}
4135 			break;
4136 		}
4137 #endif /* defined(XEN) */
4138 	} while (pmap_pte_cas(ptep, opte, npte) != opte);
4139 
4140 	/*
4141 	 * update statistics and PTP's reference count.
4142 	 */
4143 
4144 	pmap_stats_update_bypte(pmap, npte, opte);
4145 	if (ptp != NULL && !pmap_valid_entry(opte)) {
4146 		ptp->wire_count++;
4147 	}
4148 	KASSERT(ptp == NULL || ptp->wire_count > 1);
4149 
4150 	/*
4151 	 * if the same page, we can skip pv_entry handling.
4152 	 */
4153 
4154 	if (((opte ^ npte) & (PG_FRAME | PG_V)) == 0) {
4155 		KASSERT(((opte ^ npte) & PG_PVLIST) == 0);
4156 		goto same_pa;
4157 	}
4158 
4159 	/*
4160 	 * if old page is managed, remove pv_entry from its list.
4161 	 */
4162 
4163 	if ((~opte & (PG_V | PG_PVLIST)) == 0) {
4164 		pg = PHYS_TO_VM_PAGE(pmap_pte2pa(opte));
4165 #ifdef DIAGNOSTIC
4166 		if (pg == NULL)
4167 			panic("pmap_enter: PG_PVLIST mapping with "
4168 			      "unmanaged page "
4169 			      "pa = 0x%" PRIx64 " (0x%" PRIx64 ")",
4170 			      (int64_t)pa, (int64_t)atop(pa));
4171 #endif
4172 		old_pp = VM_PAGE_TO_PP(pg);
4173 
4174 		pp_lock(old_pp);
4175 		old_pve = pmap_remove_pv(old_pp, ptp, va);
4176 		old_pp->pp_attrs |= opte;
4177 		pp_unlock(old_pp);
4178 	}
4179 
4180 	/*
4181 	 * if new page is managed, insert pv_entry into its list.
4182 	 */
4183 
4184 	if (new_pp) {
4185 		pp_lock(new_pp);
4186 		new_pve = pmap_enter_pv(new_pp, new_pve, &new_pve2, ptp, va);
4187 		pp_unlock(new_pp);
4188 	}
4189 
4190 same_pa:
4191 	pmap_unmap_ptes(pmap, pmap2);
4192 
4193 	/*
4194 	 * shootdown tlb if necessary.
4195 	 */
4196 
4197 	if ((~opte & (PG_V | PG_U)) == 0 &&
4198 	    ((opte ^ npte) & (PG_FRAME | PG_RW)) != 0) {
4199 		pmap_tlb_shootdown(pmap, va, 0, opte);
4200 	}
4201 
4202 	error = 0;
4203 out:
4204 	kpreempt_enable();
4205 out2:
4206 	if (old_pve != NULL) {
4207 		pool_cache_put(&pmap_pv_cache, old_pve);
4208 	}
4209 	if (new_pve != NULL) {
4210 		pool_cache_put(&pmap_pv_cache, new_pve);
4211 	}
4212 	if (new_pve2 != NULL) {
4213 		pool_cache_put(&pmap_pv_cache, new_pve2);
4214 	}
4215 
4216 	return error;
4217 }
4218 
4219 #ifdef XEN
4220 int
4221 pmap_enter(struct pmap *pmap, vaddr_t va, paddr_t pa, vm_prot_t prot, u_int flags)
4222 {
4223         paddr_t ma;
4224 
4225 	if (__predict_false(pa < pmap_pa_start || pmap_pa_end <= pa)) {
4226 		ma = pa; /* XXX hack */
4227 	} else {
4228 		ma = xpmap_ptom(pa);
4229 	}
4230 
4231 	return pmap_enter_ma(pmap, va, ma, pa, prot, flags, DOMID_SELF);
4232 }
4233 #endif /* XEN */
4234 
4235 static bool
4236 pmap_get_physpage(vaddr_t va, int level, paddr_t *paddrp)
4237 {
4238 	struct vm_page *ptp;
4239 	struct pmap *kpm = pmap_kernel();
4240 
4241 	if (uvm.page_init_done == false) {
4242 		/*
4243 		 * we're growing the kernel pmap early (from
4244 		 * uvm_pageboot_alloc()).  this case must be
4245 		 * handled a little differently.
4246 		 */
4247 
4248 		if (uvm_page_physget(paddrp) == false)
4249 			panic("pmap_get_physpage: out of memory");
4250 		kpreempt_disable();
4251 		pmap_pte_set(early_zero_pte,
4252 		    pmap_pa2pte(*paddrp) | PG_V | PG_RW | PG_k);
4253 		pmap_pte_flush();
4254 		pmap_update_pg((vaddr_t)early_zerop);
4255 		memset(early_zerop, 0, PAGE_SIZE);
4256 #if defined(DIAGNOSTIC) || defined (XEN)
4257 		pmap_pte_set(early_zero_pte, 0);
4258 		pmap_pte_flush();
4259 #endif /* defined(DIAGNOSTIC) */
4260 		kpreempt_enable();
4261 	} else {
4262 		/* XXX */
4263 		PMAP_SUBOBJ_LOCK(kpm, level - 1);
4264 		ptp = uvm_pagealloc(&kpm->pm_obj[level - 1],
4265 				    ptp_va2o(va, level), NULL,
4266 				    UVM_PGA_USERESERVE|UVM_PGA_ZERO);
4267 		PMAP_SUBOBJ_UNLOCK(kpm, level - 1);
4268 		if (ptp == NULL)
4269 			panic("pmap_get_physpage: out of memory");
4270 		ptp->flags &= ~PG_BUSY;
4271 		ptp->wire_count = 1;
4272 		*paddrp = VM_PAGE_TO_PHYS(ptp);
4273 	}
4274 	pmap_stats_update(kpm, 1, 0);
4275 	return true;
4276 }
4277 
4278 /*
4279  * Allocate the amount of specified ptps for a ptp level, and populate
4280  * all levels below accordingly, mapping virtual addresses starting at
4281  * kva.
4282  *
4283  * Used by pmap_growkernel.
4284  */
4285 static void
4286 pmap_alloc_level(pd_entry_t * const *pdes, vaddr_t kva, int lvl,
4287     long *needed_ptps)
4288 {
4289 	unsigned long i;
4290 	vaddr_t va;
4291 	paddr_t pa;
4292 	unsigned long index, endindex;
4293 	int level;
4294 	pd_entry_t *pdep;
4295 #ifdef XEN
4296 	int s = splvm(); /* protect xpq_* */
4297 #endif
4298 
4299 	for (level = lvl; level > 1; level--) {
4300 		if (level == PTP_LEVELS)
4301 			pdep = pmap_kernel()->pm_pdir;
4302 		else
4303 			pdep = pdes[level - 2];
4304 		va = kva;
4305 		index = pl_i_roundup(kva, level);
4306 		endindex = index + needed_ptps[level - 1] - 1;
4307 
4308 
4309 		for (i = index; i <= endindex; i++) {
4310 			KASSERT(!pmap_valid_entry(pdep[i]));
4311 			pmap_get_physpage(va, level - 1, &pa);
4312 #ifdef XEN
4313 			xpq_queue_pte_update((level == PTP_LEVELS) ?
4314 			    xpmap_ptom(pmap_pdirpa(pmap_kernel(), i)) :
4315 			    xpmap_ptetomach(&pdep[i]),
4316 			    pmap_pa2pte(pa) | PG_k | PG_V | PG_RW);
4317 #ifdef PAE
4318 			if (level == PTP_LEVELS &&  i > L2_SLOT_KERN) {
4319 				/* update real kernel PD too */
4320 				xpq_queue_pte_update(
4321 				    xpmap_ptetomach(&pmap_kl2pd[l2tol2(i)]),
4322 				    pmap_pa2pte(pa) | PG_k | PG_V | PG_RW);
4323 			}
4324 #endif
4325 #else /* XEN */
4326 			pdep[i] = pa | PG_RW | PG_V;
4327 #endif /* XEN */
4328 			KASSERT(level != PTP_LEVELS || nkptp[level - 1] +
4329 			    pl_i(VM_MIN_KERNEL_ADDRESS, level) == i);
4330 			nkptp[level - 1]++;
4331 			va += nbpd[level - 1];
4332 		}
4333 		pmap_pte_flush();
4334 	}
4335 #ifdef XEN
4336 	splx(s);
4337 #endif
4338 }
4339 
4340 /*
4341  * pmap_growkernel: increase usage of KVM space
4342  *
4343  * => we allocate new PTPs for the kernel and install them in all
4344  *	the pmaps on the system.
4345  */
4346 
4347 vaddr_t
4348 pmap_growkernel(vaddr_t maxkvaddr)
4349 {
4350 	struct pmap *kpm = pmap_kernel();
4351 #if !defined(XEN) || !defined(__x86_64__)
4352 	struct pmap *pm;
4353 #endif
4354 	int s, i;
4355 	long needed_kptp[PTP_LEVELS], target_nptp, old;
4356 	bool invalidate = false;
4357 
4358 	s = splvm();	/* to be safe */
4359 	mutex_enter(&kpm->pm_lock);
4360 
4361 	if (maxkvaddr <= pmap_maxkvaddr) {
4362 		mutex_exit(&kpm->pm_lock);
4363 		splx(s);
4364 		return pmap_maxkvaddr;
4365 	}
4366 
4367 	maxkvaddr = x86_round_pdr(maxkvaddr);
4368 	old = nkptp[PTP_LEVELS - 1];
4369 	/*
4370 	 * This loop could be optimized more, but pmap_growkernel()
4371 	 * is called infrequently.
4372 	 */
4373 	for (i = PTP_LEVELS - 1; i >= 1; i--) {
4374 		target_nptp = pl_i_roundup(maxkvaddr, i + 1) -
4375 		    pl_i_roundup(VM_MIN_KERNEL_ADDRESS, i + 1);
4376 		/*
4377 		 * XXX only need to check toplevel.
4378 		 */
4379 		if (target_nptp > nkptpmax[i])
4380 			panic("out of KVA space");
4381 		KASSERT(target_nptp >= nkptp[i]);
4382 		needed_kptp[i] = target_nptp - nkptp[i];
4383 	}
4384 
4385 	pmap_alloc_level(normal_pdes, pmap_maxkvaddr, PTP_LEVELS, needed_kptp);
4386 
4387 	/*
4388 	 * If the number of top level entries changed, update all
4389 	 * pmaps.
4390 	 */
4391 	if (needed_kptp[PTP_LEVELS - 1] != 0) {
4392 #ifdef XEN
4393 #ifdef __x86_64__
4394 		/* nothing, kernel entries are never entered in user pmap */
4395 #else /* __x86_64__ */
4396 		mutex_enter(&pmaps_lock);
4397 		LIST_FOREACH(pm, &pmaps, pm_list) {
4398 			int pdkidx;
4399 			for (pdkidx =  PDIR_SLOT_KERN + old;
4400 			    pdkidx < PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1];
4401 			    pdkidx++) {
4402 				xpq_queue_pte_update(
4403 				    xpmap_ptom(pmap_pdirpa(pm, pdkidx)),
4404 				    kpm->pm_pdir[pdkidx]);
4405 			}
4406 			xpq_flush_queue();
4407 		}
4408 		mutex_exit(&pmaps_lock);
4409 #endif /* __x86_64__ */
4410 #else /* XEN */
4411 		unsigned newpdes;
4412 		newpdes = nkptp[PTP_LEVELS - 1] - old;
4413 		mutex_enter(&pmaps_lock);
4414 		LIST_FOREACH(pm, &pmaps, pm_list) {
4415 			memcpy(&pm->pm_pdir[PDIR_SLOT_KERN + old],
4416 			       &kpm->pm_pdir[PDIR_SLOT_KERN + old],
4417 			       newpdes * sizeof (pd_entry_t));
4418 		}
4419 		mutex_exit(&pmaps_lock);
4420 #endif
4421 		invalidate = true;
4422 	}
4423 	pmap_maxkvaddr = maxkvaddr;
4424 	mutex_exit(&kpm->pm_lock);
4425 	splx(s);
4426 
4427 	if (invalidate) {
4428 		/* Invalidate the PDP cache. */
4429 		pool_cache_invalidate(&pmap_pdp_cache);
4430 	}
4431 
4432 	return maxkvaddr;
4433 }
4434 
4435 #ifdef DEBUG
4436 void pmap_dump(struct pmap *, vaddr_t, vaddr_t);
4437 
4438 /*
4439  * pmap_dump: dump all the mappings from a pmap
4440  *
4441  * => caller should not be holding any pmap locks
4442  */
4443 
4444 void
4445 pmap_dump(struct pmap *pmap, vaddr_t sva, vaddr_t eva)
4446 {
4447 	pt_entry_t *ptes, *pte;
4448 	pd_entry_t * const *pdes;
4449 	struct pmap *pmap2;
4450 	vaddr_t blkendva;
4451 
4452 	/*
4453 	 * if end is out of range truncate.
4454 	 * if (end == start) update to max.
4455 	 */
4456 
4457 	if (eva > VM_MAXUSER_ADDRESS || eva <= sva)
4458 		eva = VM_MAXUSER_ADDRESS;
4459 
4460 	/*
4461 	 * we lock in the pmap => pv_head direction
4462 	 */
4463 
4464 	kpreempt_disable();
4465 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);	/* locks pmap */
4466 
4467 	/*
4468 	 * dumping a range of pages: we dump in PTP sized blocks (4MB)
4469 	 */
4470 
4471 	for (/* null */ ; sva < eva ; sva = blkendva) {
4472 
4473 		/* determine range of block */
4474 		blkendva = x86_round_pdr(sva+1);
4475 		if (blkendva > eva)
4476 			blkendva = eva;
4477 
4478 		/* valid block? */
4479 		if (!pmap_pdes_valid(sva, pdes, NULL))
4480 			continue;
4481 
4482 		pte = &ptes[pl1_i(sva)];
4483 		for (/* null */; sva < blkendva ; sva += PAGE_SIZE, pte++) {
4484 			if (!pmap_valid_entry(*pte))
4485 				continue;
4486 			printf("va %#lx -> pa %#lx (pte=%#lx)\n",
4487 			       sva, (unsigned long)*pte,
4488 			       (unsigned long)pmap_pte2pa(*pte));
4489 		}
4490 	}
4491 	pmap_unmap_ptes(pmap, pmap2);
4492 	kpreempt_enable();
4493 }
4494 #endif
4495 
4496 /*
4497  * pmap_tlb_shootdown: invalidate pages on all CPUs using pmap 'pm'
4498  *
4499  * => always invalidates locally before returning
4500  * => returns before remote CPUs have invalidated
4501  * => must be called with preemption disabled
4502  */
4503 
4504 void
4505 pmap_tlb_shootdown(struct pmap *pm, vaddr_t sva, vaddr_t eva, pt_entry_t pte)
4506 {
4507 #ifdef MULTIPROCESSOR
4508 	extern bool x86_mp_online;
4509 	struct cpu_info *ci;
4510 	struct pmap_mbox *mb, *selfmb;
4511 	CPU_INFO_ITERATOR cii;
4512 	uintptr_t head;
4513 	u_int count;
4514 	int s;
4515 #endif	/* MULTIPROCESSOR */
4516 	struct cpu_info *self;
4517 	bool kernel;
4518 
4519 	KASSERT(eva == 0 || eva >= sva);
4520 	KASSERT(kpreempt_disabled());
4521 
4522 	if (pte & PG_PS)
4523 		sva &= PG_LGFRAME;
4524 	pte &= PG_G;
4525 	self = curcpu();
4526 
4527 	if (sva == (vaddr_t)-1LL) {
4528 		kernel = true;
4529 	} else {
4530 		if (eva == 0)
4531 			eva = sva + PAGE_SIZE;
4532 		kernel = sva >= VM_MAXUSER_ADDRESS;
4533 		KASSERT(kernel == (eva > VM_MAXUSER_ADDRESS));
4534 	}
4535 
4536 	/*
4537 	 * if tearing down the pmap, do nothing.  we'll flush later
4538 	 * when we're ready to recycle/destroy it.
4539 	 */
4540 	if (__predict_false(curlwp->l_md.md_gc_pmap == pm)) {
4541 		return;
4542 	}
4543 
4544 	/*
4545 	 * If the range is larger than 32 pages, then invalidate
4546 	 * everything.
4547 	 */
4548 	if (sva != (vaddr_t)-1LL && eva - sva > (32 * PAGE_SIZE)) {
4549 		sva = (vaddr_t)-1LL;
4550 		eva = sva;
4551 	}
4552 
4553 #ifdef MULTIPROCESSOR
4554 	if (ncpu > 1 && x86_mp_online) {
4555 		selfmb = &self->ci_pmap_cpu->pc_mbox;
4556 
4557 		/*
4558 		 * If the CPUs have no notion of global pages then
4559 		 * reload of %cr3 is sufficient.
4560 		 */
4561 		if (pte != 0 && (cpu_feature & CPUID_PGE) == 0)
4562 			pte = 0;
4563 
4564 		if (pm == pmap_kernel()) {
4565 			/*
4566 			 * Mapped on all CPUs: use the broadcast mechanism.
4567 			 * Once we have the lock, increment the counter.
4568 			 */
4569 			s = splvm();
4570 			mb = &pmap_mbox;
4571 			count = SPINLOCK_BACKOFF_MIN;
4572 			do {
4573 				if ((head = mb->mb_head) != mb->mb_tail) {
4574 					splx(s);
4575 					while ((head = mb->mb_head) !=
4576 					    mb->mb_tail)
4577 						SPINLOCK_BACKOFF(count);
4578 					s = splvm();
4579 				}
4580 			} while (atomic_cas_ulong(
4581 			    (volatile u_long *)&mb->mb_head,
4582 			    head, head + ncpu - 1) != head);
4583 
4584 			/*
4585 			 * Once underway we must stay at IPL_VM until the
4586 			 * IPI is dispatched.  Otherwise interrupt handlers
4587 			 * on this CPU can deadlock against us.
4588 			 */
4589 			pmap_tlb_evcnt.ev_count++;
4590 			mb->mb_pointer = self;
4591 			mb->mb_addr1 = sva;
4592 			mb->mb_addr2 = eva;
4593 			mb->mb_global = pte;
4594 			x86_ipi(LAPIC_TLB_BCAST_VECTOR, LAPIC_DEST_ALLEXCL,
4595 			    LAPIC_DLMODE_FIXED);
4596 			self->ci_need_tlbwait = 1;
4597 			splx(s);
4598 		} else if ((pm->pm_cpus & ~self->ci_cpumask) != 0 ||
4599 		    (kernel && (pm->pm_kernel_cpus & ~self->ci_cpumask) != 0)) {
4600 			/*
4601 			 * We don't bother traversing the CPU list if only
4602 			 * used by this CPU.
4603 			 *
4604 			 * We can't do global flushes with the multicast
4605 			 * mechanism.
4606 			 */
4607 			KASSERT(pte == 0);
4608 
4609 			/*
4610 			 * Take ownership of the shootdown mailbox on each
4611 			 * CPU, fill the details and fire it off.
4612 			 */
4613 			s = splvm();
4614 			for (CPU_INFO_FOREACH(cii, ci)) {
4615 				if (ci == self ||
4616 				    !pmap_is_active(pm, ci, kernel) ||
4617 				    !(ci->ci_flags & CPUF_RUNNING))
4618 					continue;
4619 				selfmb->mb_head++;
4620 				mb = &ci->ci_pmap_cpu->pc_mbox;
4621 				count = SPINLOCK_BACKOFF_MIN;
4622 				while (atomic_cas_ulong(
4623 				    (u_long *)&mb->mb_pointer,
4624 				    0, (u_long)&selfmb->mb_tail) != 0) {
4625 				    	splx(s);
4626 					while (mb->mb_pointer != 0)
4627 						SPINLOCK_BACKOFF(count);
4628 					s = splvm();
4629 				}
4630 				mb->mb_addr1 = sva;
4631 				mb->mb_addr2 = eva;
4632 				mb->mb_global = pte;
4633 				if (x86_ipi(LAPIC_TLB_MCAST_VECTOR,
4634 				    ci->ci_cpuid, LAPIC_DLMODE_FIXED))
4635 					panic("pmap_tlb_shootdown: ipi failed");
4636 			}
4637 			self->ci_need_tlbwait = 1;
4638 			splx(s);
4639 		}
4640 	}
4641 #endif	/* MULTIPROCESSOR */
4642 
4643 	/* Update the current CPU before waiting for others. */
4644 	if (!pmap_is_active(pm, self, kernel))
4645 		return;
4646 
4647 	if (sva == (vaddr_t)-1LL) {
4648 		u_int gen = uvm_emap_gen_return();
4649 		if (pte != 0) {
4650 			tlbflushg();
4651 		} else {
4652 			tlbflush();
4653 		}
4654 		uvm_emap_update(gen);
4655 	} else {
4656 		do {
4657 			pmap_update_pg(sva);
4658 			sva += PAGE_SIZE;
4659 		} while (sva < eva);
4660 	}
4661 }
4662 
4663 /*
4664  * pmap_tlb_shootwait: wait for pending TLB shootdowns to complete
4665  *
4666  * => only waits for operations generated by the current CPU
4667  * => must be called with preemption disabled
4668  */
4669 
4670 void
4671 pmap_tlb_shootwait(void)
4672 {
4673 	struct cpu_info *self;
4674 	struct pmap_mbox *mb;
4675 
4676 	KASSERT(kpreempt_disabled());
4677 
4678 	/*
4679 	 * Anything to do?  XXX Really we want to avoid touching the cache
4680 	 * lines of the two mailboxes, but the processor may read ahead.
4681 	 */
4682 	self = curcpu();
4683 	if (!self->ci_need_tlbwait)
4684 		return;
4685 	self->ci_need_tlbwait = 0;
4686 
4687 	/* If we own the global mailbox, wait for it to drain. */
4688 	mb = &pmap_mbox;
4689 	while (mb->mb_pointer == self && mb->mb_head != mb->mb_tail)
4690 		x86_pause();
4691 
4692 	/* If we own other CPU's mailboxes, wait for them to drain. */
4693 	mb = &self->ci_pmap_cpu->pc_mbox;
4694 	KASSERT(mb->mb_pointer != &mb->mb_tail);
4695 	while (mb->mb_head != mb->mb_tail)
4696 		x86_pause();
4697 }
4698 
4699 /*
4700  * pmap_update: process deferred invalidations
4701  */
4702 
4703 void
4704 pmap_update(struct pmap *pmap)
4705 {
4706 	struct vm_page *ptp, *empty_ptps;
4707 	struct pmap_page *pp;
4708 	lwp_t *l;
4709 
4710 	/*
4711 	 * if we have torn down this pmap, invalidate non-global TLB
4712 	 * entries on any processors using it.
4713 	 */
4714 	l = curlwp;
4715 	if (__predict_false(l->l_md.md_gc_pmap == pmap)) {
4716 		l->l_md.md_gc_pmap = NULL;
4717 		KPREEMPT_DISABLE(l);
4718 		pmap_tlb_shootdown(pmap, -1, -1, 0);
4719 		KPREEMPT_ENABLE(l);
4720 	}
4721 
4722 	/*
4723 	 * wait for tlb shootdowns to complete before returning control
4724 	 * to the caller.
4725 	 */
4726 	kpreempt_disable();
4727 	pmap_tlb_shootwait();
4728 	kpreempt_enable();
4729 
4730 	/*
4731 	 * now that shootdowns are complete, process deferred frees,
4732 	 * but not from interrupt context.
4733 	 */
4734 	if (l->l_md.md_gc_ptp != NULL) {
4735 		if (cpu_intr_p() || (l->l_pflag & LP_INTR) != 0) {
4736 			return;
4737 		}
4738 
4739 		empty_ptps = l->l_md.md_gc_ptp;
4740 		l->l_md.md_gc_ptp = NULL;
4741 
4742 		while ((ptp = empty_ptps) != NULL) {
4743 			ptp->flags |= PG_ZERO;
4744 			pp = VM_PAGE_TO_PP(ptp);
4745 			empty_ptps = pp->pp_link;
4746 			LIST_INIT(&pp->pp_head.pvh_list);
4747 			uvm_pagefree(ptp);
4748 		}
4749 	}
4750 }
4751 
4752 #if PTP_LEVELS > 4
4753 #error "Unsupported number of page table mappings"
4754 #endif
4755 
4756 paddr_t
4757 pmap_init_tmp_pgtbl(paddr_t pg)
4758 {
4759 	static bool maps_loaded;
4760 	static const paddr_t x86_tmp_pml_paddr[] = {
4761 	    4 * PAGE_SIZE,
4762 	    5 * PAGE_SIZE,
4763 	    6 * PAGE_SIZE,
4764 	    7 * PAGE_SIZE
4765 	};
4766 	static vaddr_t x86_tmp_pml_vaddr[] = { 0, 0, 0, 0 };
4767 
4768 	pd_entry_t *tmp_pml, *kernel_pml;
4769 
4770 	int level;
4771 
4772 	if (!maps_loaded) {
4773 		for (level = 0; level < PTP_LEVELS; ++level) {
4774 			x86_tmp_pml_vaddr[level] =
4775 			    uvm_km_alloc(kernel_map, PAGE_SIZE, 0,
4776 			    UVM_KMF_VAONLY);
4777 
4778 			if (x86_tmp_pml_vaddr[level] == 0)
4779 				panic("mapping of real mode PML failed\n");
4780 			pmap_kenter_pa(x86_tmp_pml_vaddr[level],
4781 			    x86_tmp_pml_paddr[level],
4782 			    VM_PROT_READ | VM_PROT_WRITE, 0);
4783 			pmap_update(pmap_kernel());
4784 		}
4785 		maps_loaded = true;
4786 	}
4787 
4788 	/* Zero levels 1-3 */
4789 	for (level = 0; level < PTP_LEVELS - 1; ++level) {
4790 		tmp_pml = (void *)x86_tmp_pml_vaddr[level];
4791 		memset(tmp_pml, 0, PAGE_SIZE);
4792 	}
4793 
4794 	/* Copy PML4 */
4795 	kernel_pml = pmap_kernel()->pm_pdir;
4796 	tmp_pml = (void *)x86_tmp_pml_vaddr[PTP_LEVELS - 1];
4797 	memcpy(tmp_pml, kernel_pml, PAGE_SIZE);
4798 
4799 	for (level = PTP_LEVELS - 1; level > 0; --level) {
4800 		tmp_pml = (void *)x86_tmp_pml_vaddr[level];
4801 
4802 		tmp_pml[pl_i(pg, level + 1)] =
4803 		    (x86_tmp_pml_paddr[level - 1] & PG_FRAME) | PG_RW | PG_V;
4804 	}
4805 
4806 	tmp_pml = (void *)x86_tmp_pml_vaddr[0];
4807 	tmp_pml[pl_i(pg, 1)] = (pg & PG_FRAME) | PG_RW | PG_V;
4808 
4809 	return x86_tmp_pml_paddr[PTP_LEVELS - 1];
4810 }
4811