1 /*	$NetBSD: pmap.c,v 1.99 2010/01/10 12:10:23 jym Exp $	*/
2 
3 /*
4  * Copyright (c) 2007 Manuel Bouyer.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  * 1. Redistributions of source code must retain the above copyright
10  *    notice, this list of conditions and the following disclaimer.
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in the
13  *    documentation and/or other materials provided with the distribution.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
16  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
17  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
18  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
19  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
20  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
21  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
22  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
24  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25  *
26  */
27 
28 /*
29  * Copyright (c) 2006 Mathieu Ropert <mro@adviseo.fr>
30  *
31  * Permission to use, copy, modify, and distribute this software for any
32  * purpose with or without fee is hereby granted, provided that the above
33  * copyright notice and this permission notice appear in all copies.
34  *
35  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
36  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
37  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
38  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
39  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
40  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
41  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
42  */
43 
44 /*
45  *
46  * Copyright (c) 1997 Charles D. Cranor and Washington University.
47  * All rights reserved.
48  *
49  * Redistribution and use in source and binary forms, with or without
50  * modification, are permitted provided that the following conditions
51  * are met:
52  * 1. Redistributions of source code must retain the above copyright
53  *    notice, this list of conditions and the following disclaimer.
54  * 2. Redistributions in binary form must reproduce the above copyright
55  *    notice, this list of conditions and the following disclaimer in the
56  *    documentation and/or other materials provided with the distribution.
57  * 3. All advertising materials mentioning features or use of this software
58  *    must display the following acknowledgement:
59  *      This product includes software developed by Charles D. Cranor and
60  *      Washington University.
61  * 4. The name of the author may not be used to endorse or promote products
62  *    derived from this software without specific prior written permission.
63  *
64  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
65  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
66  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
67  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
68  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
69  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
70  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
71  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
72  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
73  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
74  */
75 
76 /*
77  * Copyright 2001 (c) Wasabi Systems, Inc.
78  * All rights reserved.
79  *
80  * Written by Frank van der Linden for Wasabi Systems, Inc.
81  *
82  * Redistribution and use in source and binary forms, with or without
83  * modification, are permitted provided that the following conditions
84  * are met:
85  * 1. Redistributions of source code must retain the above copyright
86  *    notice, this list of conditions and the following disclaimer.
87  * 2. Redistributions in binary form must reproduce the above copyright
88  *    notice, this list of conditions and the following disclaimer in the
89  *    documentation and/or other materials provided with the distribution.
90  * 3. All advertising materials mentioning features or use of this software
91  *    must display the following acknowledgement:
92  *      This product includes software developed for the NetBSD Project by
93  *      Wasabi Systems, Inc.
94  * 4. The name of Wasabi Systems, Inc. may not be used to endorse
95  *    or promote products derived from this software without specific prior
96  *    written permission.
97  *
98  * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
99  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
100  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
101  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL WASABI SYSTEMS, INC
102  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
103  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
104  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
105  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
106  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
107  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
108  * POSSIBILITY OF SUCH DAMAGE.
109  */
110 
111 /*
112  * This is the i386 pmap modified and generalized to support x86-64
113  * as well. The idea is to hide the upper N levels of the page tables
114  * inside pmap_get_ptp, pmap_free_ptp and pmap_growkernel. The rest
115  * is mostly untouched, except that it uses some more generalized
116  * macros and interfaces.
117  *
118  * This pmap has been tested on the i386 as well, and it can be easily
119  * adapted to PAE.
120  *
121  * fvdl@wasabisystems.com 18-Jun-2001
122  */
123 
124 /*
125  * pmap.c: i386 pmap module rewrite
126  * Chuck Cranor <chuck@ccrc.wustl.edu>
127  * 11-Aug-97
128  *
129  * history of this pmap module: in addition to my own input, i used
130  *    the following references for this rewrite of the i386 pmap:
131  *
132  * [1] the NetBSD i386 pmap.   this pmap appears to be based on the
133  *     BSD hp300 pmap done by Mike Hibler at University of Utah.
134  *     it was then ported to the i386 by William Jolitz of UUNET
135  *     Technologies, Inc.   Then Charles M. Hannum of the NetBSD
136  *     project fixed some bugs and provided some speed ups.
137  *
138  * [2] the FreeBSD i386 pmap.   this pmap seems to be the
139  *     Hibler/Jolitz pmap, as modified for FreeBSD by John S. Dyson
140  *     and David Greenman.
141  *
142  * [3] the Mach pmap.   this pmap, from CMU, seems to have migrated
143  *     between several processors.   the VAX version was done by
144  *     Avadis Tevanian, Jr., and Michael Wayne Young.    the i386
145  *     version was done by Lance Berc, Mike Kupfer, Bob Baron,
146  *     David Golub, and Richard Draves.    the alpha version was
147  *     done by Alessandro Forin (CMU/Mach) and Chris Demetriou
148  *     (NetBSD/alpha).
149  */
150 
151 #include <sys/cdefs.h>
152 __KERNEL_RCSID(0, "$NetBSD: pmap.c,v 1.99 2010/01/10 12:10:23 jym Exp $");
153 
154 #include "opt_user_ldt.h"
155 #include "opt_lockdebug.h"
156 #include "opt_multiprocessor.h"
157 #include "opt_xen.h"
158 #if !defined(__x86_64__)
159 #include "opt_kstack_dr0.h"
160 #endif /* !defined(__x86_64__) */
161 
162 #include <sys/param.h>
163 #include <sys/systm.h>
164 #include <sys/proc.h>
165 #include <sys/pool.h>
166 #include <sys/kernel.h>
167 #include <sys/atomic.h>
168 #include <sys/cpu.h>
169 #include <sys/intr.h>
170 #include <sys/xcall.h>
171 
172 #include <uvm/uvm.h>
173 
174 #include <dev/isa/isareg.h>
175 
176 #include <machine/specialreg.h>
177 #include <machine/gdt.h>
178 #include <machine/isa_machdep.h>
179 #include <machine/cpuvar.h>
180 
181 #include <x86/pmap.h>
182 #include <x86/pmap_pv.h>
183 
184 #include <x86/i82489reg.h>
185 #include <x86/i82489var.h>
186 
187 #ifdef XEN
188 #include <xen/xen3-public/xen.h>
189 #include <xen/hypervisor.h>
190 #endif
191 
192 /* flag to be used for kernel mappings: PG_u on Xen/amd64, 0 otherwise */
193 #if defined(XEN) && defined(__x86_64__)
194 #define PG_k PG_u
195 #else
196 #define PG_k 0
197 #endif
198 
199 /*
200  * general info:
201  *
202  *  - for an explanation of how the i386 MMU hardware works see
203  *    the comments in <machine/pte.h>.
204  *
205  *  - for an explanation of the general memory structure used by
206  *    this pmap (including the recursive mapping), see the comments
207  *    in <machine/pmap.h>.
208  *
209  * this file contains the code for the "pmap module."   the module's
210  * job is to manage the hardware's virtual to physical address mappings.
211  * note that there are two levels of mapping in the VM system:
212  *
213  *  [1] the upper layer of the VM system uses vm_map's and vm_map_entry's
214  *      to map ranges of virtual address space to objects/files.  for
215  *      example, the vm_map may say: "map VA 0x1000 to 0x22000 read-only
216  *      to the file /bin/ls starting at offset zero."   note that
217  *      the upper layer mapping is not concerned with how individual
218  *      vm_pages are mapped.
219  *
220  *  [2] the lower layer of the VM system (the pmap) maintains the mappings
221  *      from virtual addresses.   it is concerned with which vm_page is
222  *      mapped where.   for example, when you run /bin/ls and start
223  *      at page 0x1000 the fault routine may lookup the correct page
224  *      of the /bin/ls file and then ask the pmap layer to establish
225  *      a mapping for it.
226  *
227  * note that information in the lower layer of the VM system can be
228  * thrown away since it can easily be reconstructed from the info
229  * in the upper layer.
230  *
231  * data structures we use include:
232  *
233  *  - struct pmap: describes the address space of one thread
234  *  - struct pv_entry: describes one <PMAP,VA> mapping of a PA
235  *  - struct pv_head: there is one pv_head per managed page of
236  *	physical memory.   the pv_head points to a list of pv_entry
237  *	structures which describe all the <PMAP,VA> pairs that this
238  *      page is mapped in.    this is critical for page based operations
239  *      such as pmap_page_protect() [change protection on _all_ mappings
240  *      of a page]
241  */
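/*
 * illustrative sketch (not part of the pmap code): roughly how the upper
 * layer (e.g. the uvm fault routine) asks the lower layer to establish a
 * mapping once it has chosen a vm_page for a faulting VA.  "pmap", "va",
 * "pg", "prot" and "flags" are placeholders for the caller's values.
 *
 *	struct vm_page *pg = ...;	// page chosen by the upper layer
 *	int error;
 *
 *	error = pmap_enter(pmap, va, VM_PAGE_TO_PHYS(pg), prot, flags);
 *	if (error == 0)
 *		pmap_update(pmap);	// push out any deferred work
 *
 * if the pmap later discards this mapping, the upper layer can always
 * rebuild it from its own vm_map information.
 */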
242 
243 /*
244  * memory allocation
245  *
246  *  - there are three data structures that we must dynamically allocate:
247  *
248  * [A] new process' page directory page (PDP)
249  *	- plan 1: done at pmap_create() we use
250  *	  uvm_km_alloc(kernel_map, PAGE_SIZE)  [fka kmem_alloc] to do this
251  *	  allocation.
252  *
253  * if we are low in free physical memory then we sleep in
254  * uvm_km_alloc -- in this case this is ok since we are creating
255  * a new pmap and should not be holding any locks.
256  *
257  * if the kernel is totally out of virtual space
258  * (i.e. uvm_km_alloc returns NULL), then we panic.
259  *
260  * [B] new page tables pages (PTP)
261  * 	- call uvm_pagealloc()
262  * 		=> success: zero page, add to pm_pdir
263  * 		=> failure: we are out of free vm_pages, let pmap_enter()
264  *		   tell UVM about it.
265  *
266  * note: for kernel PTPs, we start with NKPTP of them.   as we map
267  * kernel memory (at uvm_map time) we check to see if we've grown
268  * the kernel pmap.   if so, we call the optional function
269  * pmap_growkernel() to grow the kernel PTPs in advance.
270  *
271  * [C] pv_entry structures
272  */
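/*
 * illustrative sketch (an assumption: this mirrors, in simplified form,
 * what pmap_get_ptp() does later in this file): allocating one PTP as
 * described in [B] above.  "obj" and "off" stand for the pmap's pm_obj
 * and the PTP's offset within it.
 *
 *	struct vm_page *ptp;
 *
 *	ptp = uvm_pagealloc(obj, off, NULL,
 *	    UVM_PGA_USERESERVE | UVM_PGA_ZERO);
 *	if (ptp == NULL)
 *		return NULL;	// out of vm_pages: pmap_enter() reports
 *				// the failure to UVM
 *
 * the zeroed page is then installed in the level above it in the page
 * table hierarchy.
 */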
273 
274 /*
275  * locking
276  *
277  * we have the following locks that we must contend with:
278  *
279  * mutexes:
280  *
281  * - pmap lock (per pmap, part of uvm_object)
282  *   this lock protects the fields in the pmap structure including
283  *   the non-kernel PDEs in the PDP, and the PTEs.  it also locks
284  *   in the alternate PTE space (since that is determined by the
285  *   entry in the PDP).
286  *
287  * - pvh_lock (per pv_head)
288  *   this lock protects the pv_entry list which is chained off the
289  *   pv_head structure for a specific managed PA.   it is locked
290  *   when traversing the list (e.g. adding/removing mappings,
291  *   syncing R/M bits, etc.)
292  *
293  * - pmaps_lock
294  *   this lock protects the list of active pmaps (headed by "pmaps").
295  *   we lock it when adding or removing pmaps from this list.
296  *
297  * tlb shootdown
298  *
299  * tlb shootdowns are hard interrupts that operate outside the spl
300  * framework: they don't need to be blocked provided that the pmap module
301  * gets the order of events correct.  the calls are made by talking directly
302  * to the lapic.  the stubs to handle the interrupts are quite short and do
303  * one of the following: invalidate a single page, a range of pages, all
304  * user tlb entries or the entire tlb.
305  *
306  * the cpus synchronize with each other using pmap_mbox structures which are
307  * aligned on 64-byte cache lines.  tlb shootdowns against the kernel pmap
308  * use a global mailbox and are generated using a broadcast ipi (broadcast
309  * to all but the sending cpu).  shootdowns against regular pmaps use
310  * per-cpu mailboxes and are multicast.  kernel and user shootdowns can
311  * execute simultaneously, as can shootdowns within different multithreaded
312  * processes.  TODO:
313  *
314  *   1. figure out which waitpoints can be deferred to pmap_update().
315  *   2. see if there is a cheap way to batch some updates.
316  */
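/*
 * illustrative sketch (simplified from the callers in this file): the
 * common shootdown pattern after zapping or downgrading a PTE.  "pmap",
 * "pte", "npte" and "va" are placeholders.
 *
 *	opte = pmap_pte_testset(pte, npte);	// atomically swap the PTE
 *	if ((opte & (PG_V | PG_U)) == (PG_V | PG_U)) {
 *		kpreempt_disable();
 *		pmap_tlb_shootdown(pmap, va, 0, opte);
 *		kpreempt_enable();
 *	}
 *	...
 *	pmap_tlb_shootwait();		// wait for remote CPUs if needed
 */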
317 
318 const vaddr_t ptp_masks[] = PTP_MASK_INITIALIZER;
319 const int ptp_shifts[] = PTP_SHIFT_INITIALIZER;
320 const long nkptpmax[] = NKPTPMAX_INITIALIZER;
321 const long nbpd[] = NBPD_INITIALIZER;
322 pd_entry_t * const normal_pdes[] = PDES_INITIALIZER;
323 pd_entry_t * const alternate_pdes[] = APDES_INITIALIZER;
324 
325 long nkptp[] = NKPTP_INITIALIZER;
326 
327 static kmutex_t pmaps_lock;
328 
329 static vaddr_t pmap_maxkvaddr;
330 
331 #define COUNT(x)	/* nothing */
332 
333 /*
334  * XXX kludge: dummy locking to make KASSERTs in uvm_page.c comfortable.
335  * actual locking is done by pm_lock.
336  */
337 #if defined(DIAGNOSTIC)
338 #define	PMAP_SUBOBJ_LOCK(pm, idx) \
339 	KASSERT(mutex_owned(&(pm)->pm_lock)); \
340 	if ((idx) != 0) \
341 		mutex_enter(&(pm)->pm_obj[(idx)].vmobjlock)
342 #define	PMAP_SUBOBJ_UNLOCK(pm, idx) \
343 	KASSERT(mutex_owned(&(pm)->pm_lock)); \
344 	if ((idx) != 0) \
345 		mutex_exit(&(pm)->pm_obj[(idx)].vmobjlock)
346 #else /* defined(DIAGNOSTIC) */
347 #define	PMAP_SUBOBJ_LOCK(pm, idx)	/* nothing */
348 #define	PMAP_SUBOBJ_UNLOCK(pm, idx)	/* nothing */
349 #endif /* defined(DIAGNOSTIC) */
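/*
 * illustrative sketch: how the macros above are used (cf. pmap_find_ptp()
 * later in this file).  pm_lock must already be held; the sub-object
 * lock is only taken for the levels above the first.
 *
 *	KASSERT(mutex_owned(&pmap->pm_lock));
 *	PMAP_SUBOBJ_LOCK(pmap, lidx);
 *	pg = uvm_pagelookup(&pmap->pm_obj[lidx], ptp_va2o(va, level));
 *	PMAP_SUBOBJ_UNLOCK(pmap, lidx);
 */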
350 
351 /*
352  * Misc. event counters.
353  */
354 struct evcnt pmap_iobmp_evcnt;
355 struct evcnt pmap_ldt_evcnt;
356 
357 /*
358  * Global TLB shootdown mailbox.
359  */
360 struct evcnt pmap_tlb_evcnt __aligned(64);
361 struct pmap_mbox pmap_mbox __aligned(64);
362 
363 /*
364  * Per-CPU data.  The pmap mailbox is cache intensive, so it gets its
365  * own cache line.  Note that the mailbox must be the first item.
366  */
367 struct pmap_cpu {
368 	/* TLB shootdown */
369 	struct pmap_mbox pc_mbox;
370 };
371 
372 union {
373 	struct pmap_cpu pc;
374 	uint8_t padding[64];
375 } pmap_cpu[MAXCPUS] __aligned(64);
376 
377 /*
378  * global data structures
379  */
380 
381 static struct pmap kernel_pmap_store;	/* the kernel's pmap (proc0) */
382 struct pmap *const kernel_pmap_ptr = &kernel_pmap_store;
383 
384 /*
385  * pmap_pg_g: if our processor supports PG_G in the PTE then we
386  * set pmap_pg_g to PG_G (otherwise it is zero).
387  */
388 
389 int pmap_pg_g = 0;
390 
391 /*
392  * pmap_largepages: if our processor supports PG_PS and we are
393  * using it, this is set to true.
394  */
395 
396 int pmap_largepages;
397 
398 /*
399  * i386 physical memory comes in a big contig chunk with a small
400  * hole toward the front of it...  the following two paddr_t's
401  * (shared with machdep.c) describe the physical address space
402  * of this machine.
403  */
404 paddr_t avail_start;	/* PA of first available physical page */
405 paddr_t avail_end;	/* PA of last available physical page */
406 
407 #ifdef XEN
408 #ifdef __x86_64__
409 /* Dummy PGD for user cr3, used between pmap_deacivate() and pmap_activate() */
410 static paddr_t xen_dummy_user_pgd;
411 /* Currently active user PGD (can't use rcr3()) */
412 static paddr_t xen_current_user_pgd = 0;
413 #endif /* __x86_64__ */
414 paddr_t pmap_pa_start; /* PA of first physical page for this domain */
415 paddr_t pmap_pa_end;   /* PA of last physical page for this domain */
416 #endif /* XEN */
417 
418 #define	VM_PAGE_TO_PP(pg)	(&(pg)->mdpage.mp_pp)
419 
420 #define	pp_lock(pp)	mutex_spin_enter(&(pp)->pp_lock)
421 #define	pp_unlock(pp)	mutex_spin_exit(&(pp)->pp_lock)
422 #define	pp_locked(pp)	mutex_owned(&(pp)->pp_lock)
423 
424 #define	PV_HASH_SIZE		32768
425 #define	PV_HASH_LOCK_CNT	32
426 
427 struct pv_hash_lock {
428 	kmutex_t lock;
429 } __aligned(CACHE_LINE_SIZE) pv_hash_locks[PV_HASH_LOCK_CNT]
430     __aligned(CACHE_LINE_SIZE);
431 
432 struct pv_hash_head {
433 	SLIST_HEAD(, pv_entry) hh_list;
434 } pv_hash_heads[PV_HASH_SIZE];
435 
436 static u_int
437 pvhash_hash(struct vm_page *ptp, vaddr_t va)
438 {
439 
440 	return (uintptr_t)ptp / sizeof(*ptp) + (va >> PAGE_SHIFT);
441 }
442 
443 static struct pv_hash_head *
444 pvhash_head(u_int hash)
445 {
446 
447 	return &pv_hash_heads[hash % PV_HASH_SIZE];
448 }
449 
450 static kmutex_t *
451 pvhash_lock(u_int hash)
452 {
453 
454 	return &pv_hash_locks[hash % PV_HASH_LOCK_CNT].lock;
455 }
456 
457 static struct pv_entry *
458 pvhash_remove(struct pv_hash_head *hh, struct vm_page *ptp, vaddr_t va)
459 {
460 	struct pv_entry *pve;
461 	struct pv_entry *prev;
462 
463 	prev = NULL;
464 	SLIST_FOREACH(pve, &hh->hh_list, pve_hash) {
465 		if (pve->pve_pte.pte_ptp == ptp &&
466 		    pve->pve_pte.pte_va == va) {
467 			if (prev != NULL) {
468 				SLIST_REMOVE_AFTER(prev, pve_hash);
469 			} else {
470 				SLIST_REMOVE_HEAD(&hh->hh_list, pve_hash);
471 			}
472 			break;
473 		}
474 		prev = pve;
475 	}
476 	return pve;
477 }
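/*
 * illustrative sketch: the lookup pattern used with the pv hash above
 * (cf. pmap_remove_pv() later in this file).  the hash value selects
 * both the bucket and the lock that protects it.
 *
 *	hash = pvhash_hash(ptp, va);
 *	lock = pvhash_lock(hash);
 *	hh = pvhash_head(hash);
 *	mutex_spin_enter(lock);
 *	pve = pvhash_remove(hh, ptp, va);
 *	mutex_spin_exit(lock);
 */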
478 
479 /*
480  * other data structures
481  */
482 
483 static pt_entry_t protection_codes[8];	/* maps MI prot to i386 prot code */
484 static bool pmap_initialized = false;	/* pmap_init done yet? */
485 
486 /*
487  * the following two vaddr_t's are used during system startup
488  * to keep track of how much of the kernel's VM space we have used.
489  * once the system is started, the management of the remaining kernel
490  * VM space is turned over to the kernel_map vm_map.
491  */
492 
493 static vaddr_t virtual_avail;	/* VA of first free KVA */
494 static vaddr_t virtual_end;	/* VA of last free KVA */
495 
496 /*
497  * linked list of all non-kernel pmaps
498  */
499 
500 static struct pmap_head pmaps;
501 
502 /*
503  * pool that pmap structures are allocated from
504  */
505 
506 static struct pool_cache pmap_cache;
507 
508 /*
509  * pv_entry cache
510  */
511 
512 static struct pool_cache pmap_pv_cache;
513 
514 /*
515  * MULTIPROCESSOR: special VA's/ PTE's are actually allocated inside a
516  * maxcpus*NPTECL array of PTE's, to avoid cache line thrashing
517  * due to false sharing.
518  */
519 
520 #ifdef MULTIPROCESSOR
521 #define PTESLEW(pte, id) ((pte)+(id)*NPTECL)
522 #define VASLEW(va,id) ((va)+(id)*NPTECL*PAGE_SIZE)
523 #else
524 #define PTESLEW(pte, id) (pte)
525 #define VASLEW(va,id) (va)
526 #endif
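/*
 * illustrative sketch: how the slew macros above pick this CPU's private
 * slot in the special PTE/VA arrays (roughly what the copy/zero helpers
 * later in this file do).  "id" is the CPU number.
 *
 *	int id = cpu_number();
 *	pt_entry_t *spte = PTESLEW(csrc_pte, id);
 *	char *srcva = VASLEW(csrcp, id);
 *
 * the chosen PTE slot is then pointed at the page being operated on and
 * the VA is flushed locally with pmap_update_pg().
 */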
527 
528 /*
529  * special VAs and the PTEs that map them
530  */
531 static pt_entry_t *csrc_pte, *cdst_pte, *zero_pte, *ptp_pte, *early_zero_pte;
532 static char *csrcp, *cdstp, *zerop, *ptpp, *early_zerop;
533 
534 /*
535  * pool and cache that PDPs are allocated from
536  */
537 
538 static struct pool_cache pmap_pdp_cache;
539 int	pmap_pdp_ctor(void *, void *, int);
540 void	pmap_pdp_dtor(void *, void *);
541 #ifdef PAE
542 /* need to allocate items of 4 pages */
543 void *pmap_pdp_alloc(struct pool *, int);
544 void pmap_pdp_free(struct pool *, void *);
545 static struct pool_allocator pmap_pdp_allocator = {
546 	.pa_alloc = pmap_pdp_alloc,
547 	.pa_free = pmap_pdp_free,
548 	.pa_pagesz = PAGE_SIZE * PDP_SIZE,
549 };
550 #endif /* PAE */
551 
552 void *vmmap; /* XXX: used by mem.c... it should really uvm_map_reserve it */
553 
554 extern vaddr_t idt_vaddr;			/* we allocate IDT early */
555 extern paddr_t idt_paddr;
556 
557 #ifdef _LP64
558 extern vaddr_t lo32_vaddr;
559 extern vaddr_t lo32_paddr;
560 #endif
561 
562 extern int end;
563 
564 #ifdef i386
565 /* stuff to fix the pentium f00f bug */
566 extern vaddr_t pentium_idt_vaddr;
567 #endif
568 
569 
570 /*
571  * local prototypes
572  */
573 
574 static struct vm_page	*pmap_get_ptp(struct pmap *, vaddr_t,
575 				      pd_entry_t * const *);
576 static struct vm_page	*pmap_find_ptp(struct pmap *, vaddr_t, paddr_t, int);
577 static void		 pmap_freepage(struct pmap *, struct vm_page *, int);
578 static void		 pmap_free_ptp(struct pmap *, struct vm_page *,
579 				       vaddr_t, pt_entry_t *,
580 				       pd_entry_t * const *);
581 static bool		 pmap_is_curpmap(struct pmap *);
582 static bool		 pmap_is_active(struct pmap *, struct cpu_info *, bool);
583 static void		 pmap_map_ptes(struct pmap *, struct pmap **,
584 				       pt_entry_t **, pd_entry_t * const **);
585 static bool		 pmap_remove_pte(struct pmap *, struct vm_page *,
586 					 pt_entry_t *, vaddr_t,
587 					 struct pv_entry **);
588 static pt_entry_t	 pmap_remove_ptes(struct pmap *, struct vm_page *,
589 					  vaddr_t, vaddr_t, vaddr_t,
590 					  struct pv_entry **);
591 
592 static void		 pmap_unmap_ptes(struct pmap *, struct pmap *);
593 static bool		 pmap_get_physpage(vaddr_t, int, paddr_t *);
594 static int		 pmap_pdes_invalid(vaddr_t, pd_entry_t * const *,
595 					   pd_entry_t *);
596 #define	pmap_pdes_valid(va, pdes, lastpde)	\
597 	(pmap_pdes_invalid((va), (pdes), (lastpde)) == 0)
598 static void		 pmap_alloc_level(pd_entry_t * const *, vaddr_t, int,
599 					  long *);
600 
601 static bool		 pmap_reactivate(struct pmap *);
602 
603 /*
604  * p m a p   h e l p e r   f u n c t i o n s
605  */
606 
607 static inline void
608 pmap_stats_update(struct pmap *pmap, int resid_diff, int wired_diff)
609 {
610 
611 	if (pmap == pmap_kernel()) {
612 		atomic_add_long(&pmap->pm_stats.resident_count, resid_diff);
613 		atomic_add_long(&pmap->pm_stats.wired_count, wired_diff);
614 	} else {
615 		KASSERT(mutex_owned(&pmap->pm_lock));
616 		pmap->pm_stats.resident_count += resid_diff;
617 		pmap->pm_stats.wired_count += wired_diff;
618 	}
619 }
620 
621 static inline void
622 pmap_stats_update_bypte(struct pmap *pmap, pt_entry_t npte, pt_entry_t opte)
623 {
624 	int resid_diff = ((npte & PG_V) ? 1 : 0) - ((opte & PG_V) ? 1 : 0);
625 	int wired_diff = ((npte & PG_W) ? 1 : 0) - ((opte & PG_W) ? 1 : 0);
626 
627 	KASSERT((npte & (PG_V | PG_W)) != PG_W);
628 	KASSERT((opte & (PG_V | PG_W)) != PG_W);
629 
630 	pmap_stats_update(pmap, resid_diff, wired_diff);
631 }
632 
633 /*
634  * ptp_to_pmap: lookup pmap by ptp
635  */
636 
637 static struct pmap *
638 ptp_to_pmap(struct vm_page *ptp)
639 {
640 	struct pmap *pmap;
641 
642 	if (ptp == NULL) {
643 		return pmap_kernel();
644 	}
645 	pmap = (struct pmap *)ptp->uobject;
646 	KASSERT(pmap != NULL);
647 	KASSERT(&pmap->pm_obj[0] == ptp->uobject);
648 	return pmap;
649 }
650 
651 static inline struct pv_pte *
652 pve_to_pvpte(struct pv_entry *pve)
653 {
654 
655 	KASSERT((void *)&pve->pve_pte == (void *)pve);
656 	return &pve->pve_pte;
657 }
658 
659 static inline struct pv_entry *
660 pvpte_to_pve(struct pv_pte *pvpte)
661 {
662 	struct pv_entry *pve = (void *)pvpte;
663 
664 	KASSERT(pve_to_pvpte(pve) == pvpte);
665 	return pve;
666 }
667 
668 /*
669  * pv_pte_first, pv_pte_next: PV list iterator.
670  */
671 
672 static struct pv_pte *
673 pv_pte_first(struct pmap_page *pp)
674 {
675 
676 	KASSERT(pp_locked(pp));
677 	if ((pp->pp_flags & PP_EMBEDDED) != 0) {
678 		return &pp->pp_pte;
679 	}
680 	return pve_to_pvpte(LIST_FIRST(&pp->pp_head.pvh_list));
681 }
682 
683 static struct pv_pte *
684 pv_pte_next(struct pmap_page *pp, struct pv_pte *pvpte)
685 {
686 
687 	KASSERT(pvpte != NULL);
688 	KASSERT(pp_locked(pp));
689 	if (pvpte == &pp->pp_pte) {
690 		KASSERT((pp->pp_flags & PP_EMBEDDED) != 0);
691 		return NULL;
692 	}
693 	KASSERT((pp->pp_flags & PP_EMBEDDED) == 0);
694 	return pve_to_pvpte(LIST_NEXT(pvpte_to_pve(pvpte), pve_list));
695 }
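/*
 * illustrative sketch: walking all <PMAP,VA> entries of a managed page
 * with the iterator above.  the pp_lock must be held across the walk.
 *
 *	struct pv_pte *pvpte;
 *
 *	pp_lock(pp);
 *	for (pvpte = pv_pte_first(pp); pvpte != NULL;
 *	    pvpte = pv_pte_next(pp, pvpte)) {
 *		// pvpte->pte_ptp and pvpte->pte_va identify one mapping
 *	}
 *	pp_unlock(pp);
 */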
696 
697 /*
698  * pmap_is_curpmap: is this pmap the one currently loaded [in %cr3]?
699  *		of course the kernel is always loaded
700  */
701 
702 inline static bool
703 pmap_is_curpmap(struct pmap *pmap)
704 {
705 #if defined(XEN) && defined(__x86_64__)
706 	/*
707 	 * Only kernel pmap is physically loaded.
708 	 * User PGD may be active, but TLB will be flushed
709 	 * with HYPERVISOR_iret anyway, so let's say no
710 	 */
711 	return(pmap == pmap_kernel());
712 #else /* XEN && __x86_64__*/
713 	return((pmap == pmap_kernel()) ||
714 	       (pmap == curcpu()->ci_pmap));
715 #endif
716 }
717 
718 /*
719  * pmap_is_active: is this pmap loaded into the specified processor's %cr3?
720  */
721 
722 inline static bool
723 pmap_is_active(struct pmap *pmap, struct cpu_info *ci, bool kernel)
724 {
725 
726 	return (pmap == pmap_kernel() ||
727 	    (pmap->pm_cpus & ci->ci_cpumask) != 0 ||
728 	    (kernel && (pmap->pm_kernel_cpus & ci->ci_cpumask) != 0));
729 }
730 
731 static void
732 pmap_apte_flush(struct pmap *pmap)
733 {
734 
735 	KASSERT(kpreempt_disabled());
736 
737 	/*
738 	 * Flush the APTE mapping from all other CPUs that
739 	 * are using the pmap we are using (whose APTE space
740 	 * is the one we've just modified).
741 	 *
742 	 * XXXthorpej -- find a way to defer the IPI.
743 	 */
744 	pmap_tlb_shootdown(pmap, (vaddr_t)-1LL, 0, 0);
745 	pmap_tlb_shootwait();
746 }
747 
748 /*
749  *	Add a reference to the specified pmap.
750  */
751 
752 inline void
753 pmap_reference(struct pmap *pmap)
754 {
755 
756 	atomic_inc_uint(&pmap->pm_obj[0].uo_refs);
757 }
758 
759 /*
760  * pmap_map_ptes: map a pmap's PTEs into KVM and lock them in
761  *
762  * => we lock enough pmaps to keep things locked in
763  * => must be undone with pmap_unmap_ptes before returning
764  */
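/*
 * typical calling pattern (illustrative sketch, simplified from the
 * callers in this file):
 *
 *	struct pmap *pmap2;
 *	pt_entry_t *ptes;
 *	pd_entry_t * const *pdes;
 *
 *	kpreempt_disable();
 *	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);	// locks pmap
 *	... examine or modify ptes[pl1_i(va)] ...
 *	pmap_unmap_ptes(pmap, pmap2);			// unlocks pmap
 *	kpreempt_enable();
 */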
765 
766 static void
767 pmap_map_ptes(struct pmap *pmap, struct pmap **pmap2,
768     pd_entry_t **ptepp, pd_entry_t * const **pdeppp)
769 {
770 	pd_entry_t opde, npde;
771 	struct pmap *ourpmap;
772 	struct cpu_info *ci;
773 	struct lwp *l;
774 	bool iscurrent;
775 	uint64_t ncsw;
776 #ifdef XEN
777 	int s;
778 #endif
779 
780 	/* the kernel's pmap is always accessible */
781 	if (pmap == pmap_kernel()) {
782 		*pmap2 = NULL;
783 		*ptepp = PTE_BASE;
784 		*pdeppp = normal_pdes;
785 		return;
786 	}
787 	KASSERT(kpreempt_disabled());
788 
789  retry:
790 	l = curlwp;
791 	ncsw = l->l_ncsw;
792  	ourpmap = NULL;
793 	ci = curcpu();
794 #if defined(XEN) && defined(__x86_64__)
795 	/*
796 	 * curpmap can only be pmap_kernel() so at this point
797 	 * pmap_is_curpmap() is always false
798 	 */
799 	iscurrent = 0;
800 	ourpmap = pmap_kernel();
801 #else /* XEN && __x86_64__*/
802 	if (ci->ci_want_pmapload &&
803 	    vm_map_pmap(&l->l_proc->p_vmspace->vm_map) == pmap) {
804 		pmap_load();
805 		if (l->l_ncsw != ncsw)
806 			goto retry;
807 	}
808 	iscurrent = pmap_is_curpmap(pmap);
809 	/* if curpmap then we are always mapped */
810 	if (iscurrent) {
811 		mutex_enter(&pmap->pm_lock);
812 		*pmap2 = NULL;
813 		*ptepp = PTE_BASE;
814 		*pdeppp = normal_pdes;
815 		goto out;
816 	}
817 	ourpmap = ci->ci_pmap;
818 #endif /* XEN && __x86_64__ */
819 
820 	/* need to lock both curpmap and pmap: use ordered locking */
821 	pmap_reference(ourpmap);
822 	if ((uintptr_t) pmap < (uintptr_t) ourpmap) {
823 		mutex_enter(&pmap->pm_lock);
824 		mutex_enter(&ourpmap->pm_lock);
825 	} else {
826 		mutex_enter(&ourpmap->pm_lock);
827 		mutex_enter(&pmap->pm_lock);
828 	}
829 
830 	if (l->l_ncsw != ncsw)
831 		goto unlock_and_retry;
832 
833 	/* need to load a new alternate pt space into curpmap? */
834 	COUNT(apdp_pde_map);
835 	opde = *APDP_PDE;
836 #ifdef XEN
837 	if (!pmap_valid_entry(opde) ||
838 	    pmap_pte2pa(opde) != pmap_pdirpa(pmap, 0)) {
839 		int i;
840 		s = splvm();
841 		/* Make recursive entry usable in user PGD */
842 		for (i = 0; i < PDP_SIZE; i++) {
843 			npde = pmap_pa2pte(
844 			    pmap_pdirpa(pmap, i * NPDPG)) | PG_k | PG_V;
845 			xpq_queue_pte_update(
846 			    xpmap_ptom(pmap_pdirpa(pmap, PDIR_SLOT_PTE + i)),
847 			    npde);
848 			xpq_queue_pte_update(xpmap_ptetomach(&APDP_PDE[i]),
849 			    npde);
850 #ifdef PAE
851 			/* update shadow entry too */
852 			xpq_queue_pte_update(
853 			    xpmap_ptetomach(&APDP_PDE_SHADOW[i]), npde);
854 #endif /* PAE */
855 			xpq_queue_invlpg(
856 			    (vaddr_t)&pmap->pm_pdir[PDIR_SLOT_PTE + i]);
857 		}
858 		xpq_flush_queue();
859 		if (pmap_valid_entry(opde))
860 			pmap_apte_flush(ourpmap);
861 		splx(s);
862 	}
863 #else /* XEN */
864 	npde = pmap_pa2pte(pmap_pdirpa(pmap, 0)) | PG_RW | PG_V;
865 	if (!pmap_valid_entry(opde) ||
866 	    pmap_pte2pa(opde) != pmap_pdirpa(pmap, 0)) {
867 		pmap_pte_set(APDP_PDE, npde);
868 		pmap_pte_flush();
869 		if (pmap_valid_entry(opde))
870 			pmap_apte_flush(ourpmap);
871 	}
872 #endif /* XEN */
873 	*pmap2 = ourpmap;
874 	*ptepp = APTE_BASE;
875 	*pdeppp = alternate_pdes;
876 	KASSERT(l->l_ncsw == ncsw);
877 #if !defined(XEN) || !defined(__x86_64__)
878  out:
879 #endif
880  	/*
881  	 * might have blocked, need to retry?
882  	 */
883 	if (l->l_ncsw != ncsw) {
884  unlock_and_retry:
885 	    	if (ourpmap != NULL) {
886 			mutex_exit(&ourpmap->pm_lock);
887 			pmap_destroy(ourpmap);
888 		}
889 		mutex_exit(&pmap->pm_lock);
890 		goto retry;
891 	}
892 
893 	return;
894 }
895 
896 /*
897  * pmap_unmap_ptes: unlock the PTE mapping of "pmap"
898  */
899 
900 static void
901 pmap_unmap_ptes(struct pmap *pmap, struct pmap *pmap2)
902 {
903 
904 	if (pmap == pmap_kernel()) {
905 		return;
906 	}
907 	KASSERT(kpreempt_disabled());
908 	if (pmap2 == NULL) {
909 		mutex_exit(&pmap->pm_lock);
910 	} else {
911 #if defined(XEN) && defined(__x86_64__)
912 		KASSERT(pmap2 == pmap_kernel());
913 #else
914 		KASSERT(curcpu()->ci_pmap == pmap2);
915 #endif
916 #if defined(MULTIPROCESSOR)
917 		pmap_pte_set(APDP_PDE, 0);
918 		pmap_pte_flush();
919 		pmap_apte_flush(pmap2);
920 #endif
921 		COUNT(apdp_pde_unmap);
922 		mutex_exit(&pmap->pm_lock);
923 		mutex_exit(&pmap2->pm_lock);
924 		pmap_destroy(pmap2);
925 	}
926 }
927 
928 inline static void
929 pmap_exec_account(struct pmap *pm, vaddr_t va, pt_entry_t opte, pt_entry_t npte)
930 {
931 
932 #if !defined(__x86_64__)
933 	if (curproc == NULL || curproc->p_vmspace == NULL ||
934 	    pm != vm_map_pmap(&curproc->p_vmspace->vm_map))
935 		return;
936 
937 	if ((opte ^ npte) & PG_X)
938 		pmap_update_pg(va);
939 
940 	/*
941 	 * Executability was removed on the last executable change.
942 	 * Reset the code segment to something conservative and
943 	 * let the trap handler deal with setting the right limit.
944 	 * We can't do that because of locking constraints on the vm map.
945 	 */
946 
947 	if ((opte & PG_X) && (npte & PG_X) == 0 && va == pm->pm_hiexec) {
948 		struct trapframe *tf = curlwp->l_md.md_regs;
949 
950 		tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL);
951 		pm->pm_hiexec = I386_MAX_EXE_ADDR;
952 	}
953 #endif /* !defined(__x86_64__) */
954 }
955 
956 #if !defined(__x86_64__)
957 /*
958  * Fixup the code segment to cover all potential executable mappings.
959  * returns 0 if no changes to the code segment were made.
960  */
961 
962 int
963 pmap_exec_fixup(struct vm_map *map, struct trapframe *tf, struct pcb *pcb)
964 {
965 	struct vm_map_entry *ent;
966 	struct pmap *pm = vm_map_pmap(map);
967 	vaddr_t va = 0;
968 
969 	vm_map_lock_read(map);
970 	for (ent = (&map->header)->next; ent != &map->header; ent = ent->next) {
971 
972 		/*
973 		 * This entry has greater va than the entries before.
974 		 * We need to make it point to the last page, not past it.
975 		 */
976 
977 		if (ent->protection & VM_PROT_EXECUTE)
978 			va = trunc_page(ent->end) - PAGE_SIZE;
979 	}
980 	vm_map_unlock_read(map);
981 	if (va == pm->pm_hiexec && tf->tf_cs == GSEL(GUCODEBIG_SEL, SEL_UPL))
982 		return (0);
983 
984 	pm->pm_hiexec = va;
985 	if (pm->pm_hiexec > I386_MAX_EXE_ADDR) {
986 		tf->tf_cs = GSEL(GUCODEBIG_SEL, SEL_UPL);
987 	} else {
988 		tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL);
989 		return (0);
990 	}
991 	return (1);
992 }
993 #endif /* !defined(__x86_64__) */
994 
995 /*
996  * p m a p   k e n t e r   f u n c t i o n s
997  *
998  * functions to quickly enter/remove pages from the kernel address
999  * space.   pmap_kremove is exported to MI kernel.  we make use of
1000  * the recursive PTE mappings.
1001  */
1002 
1003 /*
1004  * pmap_kenter_pa: enter a kernel mapping without R/M (pv_entry) tracking
1005  *
1006  * => no need to lock anything, assume va is already allocated
1007  * => should be faster than normal pmap enter function
1008  */
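/*
 * illustrative sketch: a typical kernel-only mapping of a known physical
 * page ("va" is already-allocated kernel VA, "pa" the physical address;
 * both are placeholders).  pass PMAP_NOCACHE in flags for uncached
 * device memory.
 *
 *	pmap_kenter_pa(va, pa, VM_PROT_READ | VM_PROT_WRITE, 0);
 *	pmap_update(pmap_kernel());
 *	... use the memory mapped at va ...
 */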
1009 
1010 void
1011 pmap_kenter_pa(vaddr_t va, paddr_t pa, vm_prot_t prot, u_int flags)
1012 {
1013 	pt_entry_t *pte, opte, npte;
1014 
1015 	KASSERT(!(prot & ~VM_PROT_ALL));
1016 
1017 	if (va < VM_MIN_KERNEL_ADDRESS)
1018 		pte = vtopte(va);
1019 	else
1020 		pte = kvtopte(va);
1021 #ifdef DOM0OPS
1022 	if (pa < pmap_pa_start || pa >= pmap_pa_end) {
1023 #ifdef DEBUG
1024 		printk("pmap_kenter_pa: pa 0x%" PRIx64 " for va 0x%" PRIx64
1025 		    " outside range\n", (int64_t)pa, (int64_t)va);
1026 #endif /* DEBUG */
1027 		npte = pa;
1028 	} else
1029 #endif /* DOM0OPS */
1030 		npte = pmap_pa2pte(pa);
1031 	npte |= protection_codes[prot] | PG_k | PG_V | pmap_pg_g;
1032 	if (flags & PMAP_NOCACHE)
1033 		npte |= PG_N;
1034 	opte = pmap_pte_testset(pte, npte); /* zap! */
1035 #if defined(DIAGNOSTIC)
1036 	/* XXX For now... */
1037 	if (opte & PG_PS)
1038 		panic("pmap_kenter_pa: PG_PS");
1039 #endif
1040 	if ((opte & (PG_V | PG_U)) == (PG_V | PG_U)) {
1041 		/* This should not happen, so no need to batch updates. */
1042 		kpreempt_disable();
1043 		pmap_tlb_shootdown(pmap_kernel(), va, 0, opte);
1044 		kpreempt_enable();
1045 	}
1046 }
1047 
1048 void
1049 pmap_emap_enter(vaddr_t va, paddr_t pa, vm_prot_t prot)
1050 {
1051 	pt_entry_t *pte, opte, npte;
1052 
1053 	KASSERT((prot & ~VM_PROT_ALL) == 0);
1054 	pte = (va < VM_MIN_KERNEL_ADDRESS) ? vtopte(va) : kvtopte(va);
1055 
1056 #ifdef DOM0OPS
1057 	if (pa < pmap_pa_start || pa >= pmap_pa_end) {
1058 		npte = pa;
1059 	} else
1060 #endif
1061 		npte = pmap_pa2pte(pa);
1064 	npte |= protection_codes[prot] | PG_k | PG_V;
1065 	opte = pmap_pte_testset(pte, npte);
1066 }
1067 
1068 /*
1069  * pmap_emap_sync: perform TLB flush or pmap load, if it was deferred.
1070  */
1071 void
1072 pmap_emap_sync(bool canload)
1073 {
1074 	struct cpu_info *ci = curcpu();
1075 	struct pmap *pmap;
1076 
1077 	KASSERT(kpreempt_disabled());
1078 	if (__predict_true(ci->ci_want_pmapload && canload)) {
1079 		/*
1080 		 * XXX: Hint for pmap_reactivate(), which may then skip
1081 		 * the TLB flush if the state has not changed.
1082 		 */
1083 		pmap = vm_map_pmap(&curlwp->l_proc->p_vmspace->vm_map);
1084 		if (__predict_false(pmap == ci->ci_pmap)) {
1085 			const uint32_t cpumask = ci->ci_cpumask;
1086 			atomic_and_32(&pmap->pm_cpus, ~cpumask);
1087 		}
1088 		pmap_load();
1089 		KASSERT(ci->ci_want_pmapload == 0);
1090 	} else {
1091 		tlbflush();
1092 	}
1093 
1094 }
1095 
1096 void
1097 pmap_emap_remove(vaddr_t sva, vsize_t len)
1098 {
1099 	pt_entry_t *pte, xpte = 0;
1100 	vaddr_t va, eva = sva + len;
1101 
1102 	for (va = sva; va < eva; va += PAGE_SIZE) {
1103 		pte = (va < VM_MIN_KERNEL_ADDRESS) ? vtopte(va) : kvtopte(va);
1104 		xpte |= pmap_pte_testset(pte, 0);
1105 	}
1106 }
1107 
1108 #ifdef XEN
1109 /*
1110  * pmap_kenter_ma: enter a kernel mapping without R/M (pv_entry) tracking
1111  *
1112  * => no need to lock anything, assume va is already allocated
1113  * => should be faster than normal pmap enter function
1114  * => we expect a MACHINE address
1115  */
1116 
1117 void
1118 pmap_kenter_ma(vaddr_t va, paddr_t ma, vm_prot_t prot, u_int flags)
1119 {
1120 	pt_entry_t *pte, opte, npte;
1121 
1122 	if (va < VM_MIN_KERNEL_ADDRESS)
1123 		pte = vtopte(va);
1124 	else
1125 		pte = kvtopte(va);
1126 
1127 	npte = ma | ((prot & VM_PROT_WRITE) ? PG_RW : PG_RO) |
1128 	     PG_V | PG_k;
1129 	if (flags & PMAP_NOCACHE)
1130 		npte |= PG_N;
1131 
1132 #ifndef XEN
1133 	if ((cpu_feature & CPUID_NOX) && !(prot & VM_PROT_EXECUTE))
1134 		npte |= PG_NX;
1135 #endif
1136 	opte = pmap_pte_testset (pte, npte); /* zap! */
1137 
1138 	if (pmap_valid_entry(opte)) {
1139 #if defined(MULTIPROCESSOR)
1140 		kpreempt_disable();
1141 		pmap_tlb_shootdown(pmap_kernel(), va, 0, opte);
1142 		kpreempt_enable();
1143 #else
1144 		/* Don't bother deferring in the single CPU case. */
1145 		pmap_update_pg(va);
1146 #endif
1147 	}
1148 }
1149 #endif	/* XEN */
1150 
1151 #if defined(__x86_64__)
1152 /*
1153  * Change protection for a virtual address. Local for a CPU only, don't
1154  * care about TLB shootdowns.
1155  *
1156  * => must be called with preemption disabled
1157  */
1158 void
1159 pmap_changeprot_local(vaddr_t va, vm_prot_t prot)
1160 {
1161 	pt_entry_t *pte, opte, npte;
1162 
1163 	KASSERT(kpreempt_disabled());
1164 
1165 	if (va < VM_MIN_KERNEL_ADDRESS)
1166 		pte = vtopte(va);
1167 	else
1168 		pte = kvtopte(va);
1169 
1170 	npte = opte = *pte;
1171 
1172 	if ((prot & VM_PROT_WRITE) != 0)
1173 		npte |= PG_RW;
1174 	else
1175 		npte &= ~PG_RW;
1176 
1177 	if (opte != npte) {
1178 		pmap_pte_set(pte, npte);
1179 		pmap_pte_flush();
1180 		invlpg(va);
1181 	}
1182 }
1183 #endif /* defined(__x86_64__) */
1184 
1185 /*
1186  * pmap_kremove: remove a kernel mapping(s) without R/M (pv_entry) tracking
1187  *
1188  * => no need to lock anything
1189  * => caller must dispose of any vm_page mapped in the va range
1190  * => note: not an inline function
1191  * => we assume the va is page aligned and the len is a multiple of PAGE_SIZE
1192  * => we assume kernel only unmaps valid addresses and thus don't bother
1193  *    checking the valid bit before doing TLB flushing
1194  * => must be followed by call to pmap_update() before reuse of page
1195  */
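/*
 * illustrative sketch: tearing such a mapping down again.  per the note
 * above, pmap_update() must run before the underlying pages are reused.
 * "va" and "npages" are placeholders.
 *
 *	pmap_kremove(va, (vsize_t)npages << PAGE_SHIFT);
 *	pmap_update(pmap_kernel());
 *	... the vm_pages that were mapped at va may now be freed ...
 */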
1196 
1197 void
1198 pmap_kremove(vaddr_t sva, vsize_t len)
1199 {
1200 	pt_entry_t *pte, xpte;
1201 	vaddr_t va, eva;
1202 
1203 	eva = sva + len;
1204 	xpte = 0;
1205 
1206 	for (va = sva; va < eva; va += PAGE_SIZE) {
1207 		if (va < VM_MIN_KERNEL_ADDRESS)
1208 			pte = vtopte(va);
1209 		else
1210 			pte = kvtopte(va);
1211 		xpte |= pmap_pte_testset(pte, 0); /* zap! */
1212 #if defined(DIAGNOSTIC)
1213 		/* XXX For now... */
1214 		if (xpte & PG_PS)
1215 			panic("pmap_kremove: PG_PS");
1216 		if (xpte & PG_PVLIST)
1217 			panic("pmap_kremove: PG_PVLIST mapping for 0x%lx",
1218 			      va);
1219 #endif
1220 	}
1221 	if ((xpte & (PG_V | PG_U)) == (PG_V | PG_U)) {
1222 		kpreempt_disable();
1223 		pmap_tlb_shootdown(pmap_kernel(), sva, eva, xpte);
1224 		kpreempt_enable();
1225 	}
1226 }
1227 
1228 /*
1229  * p m a p   i n i t   f u n c t i o n s
1230  *
1231  * pmap_bootstrap and pmap_init are called during system startup
1232  * to init the pmap module.   pmap_bootstrap() does a low level
1233  * init just to get things rolling.   pmap_init() finishes the job.
1234  */
1235 
1236 /*
1237  * pmap_bootstrap: get the system in a state where it can run with VM
1238  *	properly enabled (called before main()).   the VM system is
1239  *      fully init'd later...
1240  *
1241  * => on i386, locore.s has already enabled the MMU by allocating
1242  *	a PDP for the kernel, and nkpde PTP's for the kernel.
1243  * => kva_start is the first free virtual address in kernel space
1244  */
1245 
1246 void
1247 pmap_bootstrap(vaddr_t kva_start)
1248 {
1249 	struct pmap *kpm;
1250 	pt_entry_t *pte;
1251 	struct pcb *pcb;
1252 	int i;
1253 	vaddr_t kva;
1254 #ifdef XEN
1255 	pt_entry_t pg_nx = 0;
1256 #else
1257 	unsigned long p1i;
1258 	vaddr_t kva_end;
1259 	pt_entry_t pg_nx = (cpu_feature & CPUID_NOX ? PG_NX : 0);
1260 #endif
1261 
1262 	/*
1263 	 * set up our local static global vars that keep track of the
1264 	 * usage of KVM before kernel_map is set up
1265 	 */
1266 
1267 	virtual_avail = kva_start;		/* first free KVA */
1268 	virtual_end = VM_MAX_KERNEL_ADDRESS;	/* last KVA */
1269 
1270 	/*
1271 	 * set up protection_codes: we need to be able to convert from
1272 	 * a MI protection code (some combo of VM_PROT...) to something
1273 	 * we can jam into a i386 PTE.
1274 	 */
1275 
1276 	protection_codes[VM_PROT_NONE] = pg_nx;			/* --- */
1277 	protection_codes[VM_PROT_EXECUTE] = PG_RO | PG_X;	/* --x */
1278 	protection_codes[VM_PROT_READ] = PG_RO | pg_nx;		/* -r- */
1279 	protection_codes[VM_PROT_READ|VM_PROT_EXECUTE] = PG_RO | PG_X;/* -rx */
1280 	protection_codes[VM_PROT_WRITE] = PG_RW | pg_nx;	/* w-- */
1281 	protection_codes[VM_PROT_WRITE|VM_PROT_EXECUTE] = PG_RW | PG_X;/* w-x */
1282 	protection_codes[VM_PROT_WRITE|VM_PROT_READ] = PG_RW | pg_nx;
1283 								/* wr- */
1284 	protection_codes[VM_PROT_ALL] = PG_RW | PG_X;		/* wrx */
1285 
1286 	/*
1287 	 * now we init the kernel's pmap
1288 	 *
1289 	 * the kernel pmap's pm_obj is not used for much.   however, in
1290 	 * user pmaps the pm_obj contains the list of active PTPs.
1291 	 * the pm_obj currently does not have a pager.   it might be possible
1292 	 * to add a pager that would allow a process to read-only mmap its
1293 	 * own page tables (fast user level vtophys?).   this may or may not
1294 	 * be useful.
1295 	 */
1296 
1297 	kpm = pmap_kernel();
1298 	for (i = 0; i < PTP_LEVELS - 1; i++) {
1299 		UVM_OBJ_INIT(&kpm->pm_obj[i], NULL, 1);
1300 		kpm->pm_ptphint[i] = NULL;
1301 	}
1302 	memset(&kpm->pm_list, 0, sizeof(kpm->pm_list));  /* pm_list not used */
1303 	pcb = lwp_getpcb(&lwp0);
1304 	kpm->pm_pdir = (pd_entry_t *)(pcb->pcb_cr3 + KERNBASE);
1305 #ifdef PAE
1306 	for (i = 0; i < PDP_SIZE; i++)
1307 		kpm->pm_pdirpa[i] = (paddr_t)pcb->pcb_cr3 + PAGE_SIZE * i;
1308 #else
1309 	kpm->pm_pdirpa = (paddr_t)pcb->pcb_cr3;
1310 #endif
1311 	kpm->pm_stats.wired_count = kpm->pm_stats.resident_count =
1312 		x86_btop(kva_start - VM_MIN_KERNEL_ADDRESS);
1313 
1314 	/*
1315 	 * the above is just a rough estimate and not critical to the proper
1316 	 * operation of the system.
1317 	 */
1318 
1319 #ifndef XEN
1320 	/*
1321 	 * Begin to enable global TLB entries if they are supported.
1322 	 * The G bit has no effect until the CR4_PGE bit is set in CR4,
1323 	 * which happens in cpu_init(), which is run on each cpu
1324 	 * (and happens later)
1325 	 */
1326 
1327 	if (cpu_feature & CPUID_PGE) {
1328 		pmap_pg_g = PG_G;		/* enable software */
1329 
1330 		/* add PG_G attribute to already mapped kernel pages */
1331 		if (KERNBASE == VM_MIN_KERNEL_ADDRESS) {
1332 			kva_end = virtual_avail;
1333 		} else {
1334 			extern vaddr_t eblob, esym;
1335 			kva_end = (vaddr_t)&end;
1336 			if (esym > kva_end)
1337 				kva_end = esym;
1338 			if (eblob > kva_end)
1339 				kva_end = eblob;
1340 			kva_end = roundup(kva_end, PAGE_SIZE);
1341 		}
1342 		for (kva = KERNBASE; kva < kva_end; kva += PAGE_SIZE) {
1343 			p1i = pl1_i(kva);
1344 			if (pmap_valid_entry(PTE_BASE[p1i]))
1345 				PTE_BASE[p1i] |= PG_G;
1346 		}
1347 	}
1348 
1349 	/*
1350 	 * enable large pages if they are supported.
1351 	 */
1352 
1353 	if (cpu_feature & CPUID_PSE) {
1354 		paddr_t pa;
1355 		pd_entry_t *pde;
1356 		extern char __data_start;
1357 
1358 		lcr4(rcr4() | CR4_PSE);	/* enable hardware (via %cr4) */
1359 		pmap_largepages = 1;	/* enable software */
1360 
1361 		/*
1362 		 * the TLB must be flushed after enabling large pages
1363 		 * on Pentium CPUs, according to section 3.6.2.2 of
1364 		 * "Intel Architecture Software Developer's Manual,
1365 		 * Volume 3: System Programming".
1366 		 */
1367 		tlbflush();
1368 
1369 		/*
1370 		 * now, remap the kernel text using large pages.  we
1371 		 * assume that the linker has properly aligned the
1372 		 * .data segment to a NBPD_L2 boundary.
1373 		 */
1374 		kva_end = rounddown((vaddr_t)&__data_start, NBPD_L1);
1375 		for (pa = 0, kva = KERNBASE; kva + NBPD_L2 <= kva_end;
1376 		     kva += NBPD_L2, pa += NBPD_L2) {
1377 			pde = &L2_BASE[pl2_i(kva)];
1378 			*pde = pa | pmap_pg_g | PG_PS |
1379 			    PG_KR | PG_V;	/* zap! */
1380 			tlbflush();
1381 		}
1382 #if defined(DEBUG)
1383 		printf("kernel text is mapped with "
1384 		    "%lu large pages and %lu normal pages\n",
1385 		    (unsigned long)howmany(kva - KERNBASE, NBPD_L2),
1386 		    (unsigned long)howmany((vaddr_t)&__data_start - kva,
1387 		    NBPD_L1));
1388 #endif /* defined(DEBUG) */
1389 	}
1390 #endif /* !XEN */
1391 
1392 	if (VM_MIN_KERNEL_ADDRESS != KERNBASE) {
1393 		/*
1394 		 * zero_pte is stuck at the end of mapped space for the kernel
1395 		 * image (disjunct from kva space). This is done so that it
1396 		 * can safely be used in pmap_growkernel (pmap_get_physpage),
1397 		 * when it's called for the first time.
1398 		 * XXXfvdl fix this for MULTIPROCESSOR later.
1399 		 */
1400 
1401 		early_zerop = (void *)(KERNBASE + NKL2_KIMG_ENTRIES * NBPD_L2);
1402 		early_zero_pte = PTE_BASE + pl1_i((unsigned long)early_zerop);
1403 	}
1404 
1405 	/*
1406 	 * now we allocate the "special" VAs which are used for tmp mappings
1407 	 * by the pmap (and other modules).    we allocate the VAs by advancing
1408 	 * virtual_avail (note that there are no pages mapped at these VAs).
1409 	 * we find the PTE that maps the allocated VA via the linear PTE
1410 	 * mapping.
1411 	 */
1412 
1413 	pte = PTE_BASE + pl1_i(virtual_avail);
1414 
1415 #ifdef MULTIPROCESSOR
1416 	/*
1417 	 * Waste some VA space to avoid false sharing of cache lines
1418 	 * for page table pages: Give each possible CPU a cache line
1419 	 * of PTE's (8) to play with, though we only need 4.  We could
1420 	 * recycle some of this waste by putting the idle stacks here
1421 	 * as well; we could waste less space if we knew the largest
1422 	 * CPU ID beforehand.
1423 	 */
1424 	csrcp = (char *) virtual_avail;  csrc_pte = pte;
1425 
1426 	cdstp = (char *) virtual_avail+PAGE_SIZE;  cdst_pte = pte+1;
1427 
1428 	zerop = (char *) virtual_avail+PAGE_SIZE*2;  zero_pte = pte+2;
1429 
1430 	ptpp = (char *) virtual_avail+PAGE_SIZE*3;  ptp_pte = pte+3;
1431 
1432 	virtual_avail += PAGE_SIZE * maxcpus * NPTECL;
1433 	pte += maxcpus * NPTECL;
1434 #else
1435 	csrcp = (void *) virtual_avail;  csrc_pte = pte;	/* allocate */
1436 	virtual_avail += PAGE_SIZE; pte++;			/* advance */
1437 
1438 	cdstp = (void *) virtual_avail;  cdst_pte = pte;
1439 	virtual_avail += PAGE_SIZE; pte++;
1440 
1441 	zerop = (void *) virtual_avail;  zero_pte = pte;
1442 	virtual_avail += PAGE_SIZE; pte++;
1443 
1444 	ptpp = (void *) virtual_avail;  ptp_pte = pte;
1445 	virtual_avail += PAGE_SIZE; pte++;
1446 #endif
1447 
1448 	if (VM_MIN_KERNEL_ADDRESS == KERNBASE) {
1449 		early_zerop = zerop;
1450 		early_zero_pte = zero_pte;
1451 	}
1452 
1453 	/*
1454 	 * Nothing after this point actually needs pte;
1455 	 */
1456 	pte = (void *)0xdeadbeef;
1457 
1458 	/* XXX: vmmap used by mem.c... should be uvm_map_reserve */
1459 	/* XXXfvdl PTEs not needed here */
1460 	vmmap = (char *)virtual_avail;			/* don't need pte */
1461 	virtual_avail += PAGE_SIZE; pte++;
1462 
1463 #ifdef XEN
1464 #ifdef __x86_64__
1465 	/*
1466 	 * We want a dummy page directory for Xen:
1467 	 * when deactivating a pmap, Xen will still consider it active.
1468 	 * So we set the user PGD to this one to lift all protection on
1469 	 * the now inactive set of page tables.
1470 	 */
1471 	xen_dummy_user_pgd = avail_start;
1472 	avail_start += PAGE_SIZE;
1473 
1474 	/* Zero fill it; the fewer checks Xen has to make, the better */
1475 	memset((void *) (xen_dummy_user_pgd + KERNBASE), 0, PAGE_SIZE);
1476 	/* Mark read-only */
1477 	HYPERVISOR_update_va_mapping(xen_dummy_user_pgd + KERNBASE,
1478 	    pmap_pa2pte(xen_dummy_user_pgd) | PG_u | PG_V, UVMF_INVLPG);
1479 	/* Pin as L4 */
1480 	xpq_queue_pin_table(xpmap_ptom_masked(xen_dummy_user_pgd));
1481 #endif /* __x86_64__ */
1482 	idt_vaddr = virtual_avail;                      /* don't need pte */
1483 	idt_paddr = avail_start;                        /* steal a page */
1484 	/*
1485 	 * Xen requires one more page, as we can't store
1486 	 * the GDT and LDT on the same page
1487 	 */
1488 	virtual_avail += 3 * PAGE_SIZE;
1489 	avail_start += 3 * PAGE_SIZE;
1490 #else /* XEN */
1491 	idt_vaddr = virtual_avail;			/* don't need pte */
1492 	idt_paddr = avail_start;			/* steal a page */
1493 #if defined(__x86_64__)
1494 	virtual_avail += 2 * PAGE_SIZE; pte += 2;
1495 	avail_start += 2 * PAGE_SIZE;
1496 #else /* defined(__x86_64__) */
1497 	virtual_avail += PAGE_SIZE; pte++;
1498 	avail_start += PAGE_SIZE;
1499 	/* pentium f00f bug stuff */
1500 	pentium_idt_vaddr = virtual_avail;		/* don't need pte */
1501 	virtual_avail += PAGE_SIZE; pte++;
1502 #endif /* defined(__x86_64__) */
1503 #endif /* XEN */
1504 
1505 #ifdef _LP64
1506 	/*
1507 	 * Grab a page below 4G for things that need it (i.e.
1508 	 * having an initial %cr3 for the MP trampoline).
1509 	 */
1510 	lo32_vaddr = virtual_avail;
1511 	virtual_avail += PAGE_SIZE; pte++;
1512 	lo32_paddr = avail_start;
1513 	avail_start += PAGE_SIZE;
1514 #endif
1515 
1516 	/*
1517 	 * now we reserve some VM for mapping pages when doing a crash dump
1518 	 */
1519 
1520 	virtual_avail = reserve_dumppages(virtual_avail);
1521 
1522 	/*
1523 	 * init the static-global locks and global lists.
1524 	 *
1525 	 * => pventry::pvh_lock (initialized elsewhere) must also be
1526 	 *      a spin lock, again at IPL_VM to prevent deadlock, and
1527 	 *	again is never taken from interrupt context.
1528 	 */
1529 
1530 	mutex_init(&pmaps_lock, MUTEX_DEFAULT, IPL_NONE);
1531 	LIST_INIT(&pmaps);
1532 	pmap_cpu_init_early(curcpu());
1533 
1534 	/*
1535 	 * initialize caches.
1536 	 */
1537 
1538 	pool_cache_bootstrap(&pmap_cache, sizeof(struct pmap), 0, 0, 0,
1539 	    "pmappl", NULL, IPL_NONE, NULL, NULL, NULL);
1540 #ifdef PAE
1541 	pool_cache_bootstrap(&pmap_pdp_cache, PAGE_SIZE * PDP_SIZE, 0, 0, 0,
1542 	    "pdppl", &pmap_pdp_allocator, IPL_NONE,
1543 	    pmap_pdp_ctor, pmap_pdp_dtor, NULL);
1544 #else /* PAE */
1545 	pool_cache_bootstrap(&pmap_pdp_cache, PAGE_SIZE, 0, 0, 0,
1546 	    "pdppl", NULL, IPL_NONE, pmap_pdp_ctor, pmap_pdp_dtor, NULL);
1547 #endif /* PAE */
1548 	pool_cache_bootstrap(&pmap_pv_cache, sizeof(struct pv_entry), 0, 0,
1549 	    PR_LARGECACHE, "pvpl", &pool_allocator_meta, IPL_NONE, NULL,
1550 	    NULL, NULL);
1551 
1552 	/*
1553 	 * ensure the TLB is sync'd with reality by flushing it...
1554 	 */
1555 
1556 	tlbflush();
1557 
1558 	/*
1559 	 * calculate pmap_maxkvaddr from nkptp[].
1560 	 */
1561 
1562 	kva = VM_MIN_KERNEL_ADDRESS;
1563 	for (i = PTP_LEVELS - 1; i >= 1; i--) {
1564 		kva += nkptp[i] * nbpd[i];
1565 	}
1566 	pmap_maxkvaddr = kva;
1567 }
1568 
1569 #if defined(__x86_64__)
1570 /*
1571  * Pre-allocate PTPs for low memory, so that 1:1 mappings for various
1572  * trampoline code can be entered.
1573  */
1574 void
1575 pmap_prealloc_lowmem_ptps(void)
1576 {
1577 #ifdef XEN
1578 	int level;
1579 	paddr_t newp;
1580 	paddr_t pdes_pa;
1581 
1582 	pdes_pa = pmap_kernel()->pm_pdirpa;
1583 	level = PTP_LEVELS;
1584 	for (;;) {
1585 		newp = avail_start;
1586 		avail_start += PAGE_SIZE;
1587 		HYPERVISOR_update_va_mapping ((vaddr_t)early_zerop,
1588 		    xpmap_ptom_masked(newp) | PG_u | PG_V | PG_RW, UVMF_INVLPG);
1589 		memset((void *)early_zerop, 0, PAGE_SIZE);
1590 		/* Mark R/O before installing */
1591 		HYPERVISOR_update_va_mapping ((vaddr_t)early_zerop,
1592 		    xpmap_ptom_masked(newp) | PG_u | PG_V, UVMF_INVLPG);
1593 		if (newp < (NKL2_KIMG_ENTRIES * NBPD_L2))
1594 			HYPERVISOR_update_va_mapping (newp + KERNBASE,
1595 			    xpmap_ptom_masked(newp) | PG_u | PG_V, UVMF_INVLPG);
1596 		xpq_queue_pte_update (
1597 			xpmap_ptom_masked(pdes_pa)
1598 			+ (pl_i(0, level) * sizeof (pd_entry_t)),
1599 			xpmap_ptom_masked(newp) | PG_RW | PG_u | PG_V);
1600 		level--;
1601 		if (level <= 1)
1602 			break;
1603 		pdes_pa = newp;
1604 	}
1605 #else /* XEN */
1606 	pd_entry_t *pdes;
1607 	int level;
1608 	paddr_t newp;
1609 
1610 	pdes = pmap_kernel()->pm_pdir;
1611 	level = PTP_LEVELS;
1612 	for (;;) {
1613 		newp = avail_start;
1614 		avail_start += PAGE_SIZE;
1615 		*early_zero_pte = (newp & PG_FRAME) | PG_V | PG_RW;
1616 		pmap_update_pg((vaddr_t)early_zerop);
1617 		memset(early_zerop, 0, PAGE_SIZE);
1618 		pdes[pl_i(0, level)] = (newp & PG_FRAME) | PG_V | PG_RW;
1619 		level--;
1620 		if (level <= 1)
1621 			break;
1622 		pdes = normal_pdes[level - 2];
1623 	}
1624 #endif /* XEN */
1625 }
1626 #endif /* defined(__x86_64__) */
1627 
1628 /*
1629  * pmap_init: called from uvm_init, our job is to get the pmap
1630  * system ready to manage mappings...
1631  */
1632 
1633 void
1634 pmap_init(void)
1635 {
1636 	int i;
1637 
1638 	for (i = 0; i < PV_HASH_SIZE; i++) {
1639 		SLIST_INIT(&pv_hash_heads[i].hh_list);
1640 	}
1641 	for (i = 0; i < PV_HASH_LOCK_CNT; i++) {
1642 		mutex_init(&pv_hash_locks[i].lock, MUTEX_NODEBUG, IPL_VM);
1643 	}
1644 
1645 	/*
1646 	 * done: pmap module is up (and ready for business)
1647 	 */
1648 
1649 	pmap_initialized = true;
1650 }
1651 
1652 /*
1653  * pmap_cpu_init_early: perform early per-CPU initialization.
1654  */
1655 
1656 void
1657 pmap_cpu_init_early(struct cpu_info *ci)
1658 {
1659 	struct pmap_cpu *pc;
1660 	static uint8_t pmap_cpu_alloc;
1661 
1662 	pc = &pmap_cpu[pmap_cpu_alloc++].pc;
1663 	ci->ci_pmap_cpu = pc;
1664 }
1665 
1666 /*
1667  * pmap_cpu_init_late: perform late per-CPU initialization.
1668  */
1669 
1670 void
1671 pmap_cpu_init_late(struct cpu_info *ci)
1672 {
1673 
1674 	if (ci == &cpu_info_primary) {
1675 		evcnt_attach_dynamic(&pmap_tlb_evcnt, EVCNT_TYPE_INTR,
1676 		    NULL, "global", "TLB IPI");
1677 		evcnt_attach_dynamic(&pmap_iobmp_evcnt, EVCNT_TYPE_MISC,
1678 		    NULL, "x86", "io bitmap copy");
1679 		evcnt_attach_dynamic(&pmap_ldt_evcnt, EVCNT_TYPE_MISC,
1680 		    NULL, "x86", "ldt sync");
1681 	}
1682 
1683 	evcnt_attach_dynamic(&ci->ci_tlb_evcnt, EVCNT_TYPE_MISC,
1684 	    NULL, device_xname(ci->ci_dev), "TLB IPI");
1685 }
1686 
1687 /*
1688  * p v _ e n t r y   f u n c t i o n s
1689  */
1690 
1691 /*
1692  * pmap_free_pvs: free a list of pv_entrys
1693  */
1694 
1695 static void
1696 pmap_free_pvs(struct pv_entry *pve)
1697 {
1698 	struct pv_entry *next;
1699 
1700 	for ( /* null */ ; pve != NULL ; pve = next) {
1701 		next = pve->pve_next;
1702 		pool_cache_put(&pmap_pv_cache, pve);
1703 	}
1704 }
1705 
1706 /*
1707  * main pv_entry manipulation functions:
1708  *   pmap_enter_pv: enter a mapping onto a pv_head list
1709  *   pmap_remove_pv: remove a mapping from a pv_head list
1710  *
1711  * NOTE: Both pmap_enter_pv and pmap_remove_pv expect the caller to lock
1712  *       the pvh before calling
1713  */
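/*
 * illustrative sketch (simplified from pmap_enter() later in this file):
 * the caller locks the pp, passes in a preallocated pv_entry, and frees
 * whatever comes back unused once the lock is dropped.
 *
 *	pp_lock(pp);
 *	pve = pmap_enter_pv(pp, pve, &sparepve, ptp, va);
 *	pp_unlock(pp);
 *	if (pve != NULL)		// pve was not consumed
 *		pool_cache_put(&pmap_pv_cache, pve);
 */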
1714 
1715 /*
1716  * insert_pv: a helper of pmap_enter_pv
1717  */
1718 
1719 static void
1720 insert_pv(struct pmap_page *pp, struct pv_entry *pve)
1721 {
1722 	struct pv_hash_head *hh;
1723 	kmutex_t *lock;
1724 	u_int hash;
1725 
1726 	KASSERT(pp_locked(pp));
1727 
1728 	hash = pvhash_hash(pve->pve_pte.pte_ptp, pve->pve_pte.pte_va);
1729 	lock = pvhash_lock(hash);
1730 	hh = pvhash_head(hash);
1731 	mutex_spin_enter(lock);
1732 	SLIST_INSERT_HEAD(&hh->hh_list, pve, pve_hash);
1733 	mutex_spin_exit(lock);
1734 
1735 	LIST_INSERT_HEAD(&pp->pp_head.pvh_list, pve, pve_list);
1736 }
1737 
1738 /*
1739  * pmap_enter_pv: enter a mapping onto a pv_head lst
1740  *
1741  * => caller should have the pp_lock locked
1742  * => caller should adjust ptp's wire_count before calling
1743  */
1744 
1745 static struct pv_entry *
1746 pmap_enter_pv(struct pmap_page *pp,
1747 	      struct pv_entry *pve,	/* preallocated pve for us to use */
1748 	      struct pv_entry **sparepve,
1749 	      struct vm_page *ptp,
1750 	      vaddr_t va)
1751 {
1752 
1753 	KASSERT(ptp == NULL || ptp->wire_count >= 2);
1754 	KASSERT(ptp == NULL || ptp->uobject != NULL);
1755 	KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset);
1756 	KASSERT(pp_locked(pp));
1757 
1758 	if ((pp->pp_flags & PP_EMBEDDED) == 0) {
1759 		if (LIST_EMPTY(&pp->pp_head.pvh_list)) {
1760 			pp->pp_flags |= PP_EMBEDDED;
1761 			pp->pp_pte.pte_ptp = ptp;
1762 			pp->pp_pte.pte_va = va;
1763 
1764 			return pve;
1765 		}
1766 	} else {
1767 		struct pv_entry *pve2;
1768 
1769 		pve2 = *sparepve;
1770 		*sparepve = NULL;
1771 
1772 		pve2->pve_pte = pp->pp_pte;
1773 		pp->pp_flags &= ~PP_EMBEDDED;
1774 		LIST_INIT(&pp->pp_head.pvh_list);
1775 		insert_pv(pp, pve2);
1776 	}
1777 
1778 	pve->pve_pte.pte_ptp = ptp;
1779 	pve->pve_pte.pte_va = va;
1780 	insert_pv(pp, pve);
1781 
1782 	return NULL;
1783 }
1784 
1785 /*
1786  * pmap_remove_pv: try to remove a mapping from a pv_list
1787  *
1788  * => caller should hold pp_lock [so that attrs can be adjusted]
1789  * => caller should adjust ptp's wire_count and free PTP if needed
1790  * => we return the removed pve
1791  */
1792 
1793 static struct pv_entry *
1794 pmap_remove_pv(struct pmap_page *pp, struct vm_page *ptp, vaddr_t va)
1795 {
1796 	struct pv_hash_head *hh;
1797 	struct pv_entry *pve;
1798 	kmutex_t *lock;
1799 	u_int hash;
1800 
1801 	KASSERT(ptp == NULL || ptp->uobject != NULL);
1802 	KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset);
1803 	KASSERT(pp_locked(pp));
1804 
1805 	if ((pp->pp_flags & PP_EMBEDDED) != 0) {
1806 		KASSERT(pp->pp_pte.pte_ptp == ptp);
1807 		KASSERT(pp->pp_pte.pte_va == va);
1808 
1809 		pp->pp_flags &= ~PP_EMBEDDED;
1810 		LIST_INIT(&pp->pp_head.pvh_list);
1811 
1812 		return NULL;
1813 	}
1814 
1815 	hash = pvhash_hash(ptp, va);
1816 	lock = pvhash_lock(hash);
1817 	hh = pvhash_head(hash);
1818 	mutex_spin_enter(lock);
1819 	pve = pvhash_remove(hh, ptp, va);
1820 	mutex_spin_exit(lock);
1821 
1822 	LIST_REMOVE(pve, pve_list);
1823 
1824 	return pve;
1825 }
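
/*
 * Note on the two representations handled above: a page with a single
 * mapping keeps that mapping embedded in its pmap_page (PP_EMBEDDED,
 * pp_pte), avoiding a pv_entry allocation; only when a second mapping
 * is entered does the embedded one get converted into a pv_entry on
 * the hash and the per-page list.  pmap_remove_pv() therefore returns
 * NULL when it tore down the embedded form, since there is no
 * pv_entry for the caller to free.
 */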
1826 
1827 /*
1828  * p t p   f u n c t i o n s
1829  */
1830 
1831 static inline struct vm_page *
1832 pmap_find_ptp(struct pmap *pmap, vaddr_t va, paddr_t pa, int level)
1833 {
1834 	int lidx = level - 1;
1835 	struct vm_page *pg;
1836 
1837 	KASSERT(mutex_owned(&pmap->pm_lock));
1838 
1839 	if (pa != (paddr_t)-1 && pmap->pm_ptphint[lidx] &&
1840 	    pa == VM_PAGE_TO_PHYS(pmap->pm_ptphint[lidx])) {
1841 		return (pmap->pm_ptphint[lidx]);
1842 	}
1843 	PMAP_SUBOBJ_LOCK(pmap, lidx);
1844 	pg = uvm_pagelookup(&pmap->pm_obj[lidx], ptp_va2o(va, level));
1845 	PMAP_SUBOBJ_UNLOCK(pmap, lidx);
1846 
1847 	KASSERT(pg == NULL || pg->wire_count >= 1);
1848 	return pg;
1849 }
1850 
1851 static inline void
1852 pmap_freepage(struct pmap *pmap, struct vm_page *ptp, int level)
1853 {
1854 	int lidx;
1855 	struct uvm_object *obj;
1856 
1857 	KASSERT(ptp->wire_count == 1);
1858 
1859 	lidx = level - 1;
1860 
1861 	obj = &pmap->pm_obj[lidx];
1862 	pmap_stats_update(pmap, -1, 0);
1863 	if (lidx != 0)
1864 		mutex_enter(&obj->vmobjlock);
1865 	if (pmap->pm_ptphint[lidx] == ptp)
1866 		pmap->pm_ptphint[lidx] = TAILQ_FIRST(&obj->memq);
1867 	ptp->wire_count = 0;
1868 	uvm_pagerealloc(ptp, NULL, 0);
1869 	VM_PAGE_TO_PP(ptp)->pp_link = curlwp->l_md.md_gc_ptp;
1870 	curlwp->l_md.md_gc_ptp = ptp;
1871 	if (lidx != 0)
1872 		mutex_exit(&obj->vmobjlock);
1873 }
1874 
1875 static void
1876 pmap_free_ptp(struct pmap *pmap, struct vm_page *ptp, vaddr_t va,
1877 	      pt_entry_t *ptes, pd_entry_t * const *pdes)
1878 {
1879 	unsigned long index;
1880 	int level;
1881 	vaddr_t invaladdr;
1882 #ifdef MULTIPROCESSOR
1883 	vaddr_t invaladdr2;
1884 #endif
1885 	pd_entry_t opde;
1886 	struct pmap *curpmap = vm_map_pmap(&curlwp->l_proc->p_vmspace->vm_map);
1887 
1888 	KASSERT(pmap != pmap_kernel());
1889 	KASSERT(mutex_owned(&pmap->pm_lock));
1890 	KASSERT(kpreempt_disabled());
1891 
1892 	level = 1;
1893 	do {
1894 		index = pl_i(va, level + 1);
1895 		opde = pmap_pte_testset(&pdes[level - 1][index], 0);
1896 #if defined(XEN) && defined(__x86_64__)
1897 		/*
1898 		 * If ptp is an L3 currently mapped in kernel space,
1899 		 * clear it before freeing
1900 		 */
1901 		if (pmap->pm_pdirpa == xen_current_user_pgd
1902 		    && level == PTP_LEVELS - 1)
1903 			pmap_pte_set(&pmap_kernel()->pm_pdir[index], 0);
1904 #endif /* XEN && __x86_64__ */
1905 		pmap_freepage(pmap, ptp, level);
1906 		invaladdr = level == 1 ? (vaddr_t)ptes :
1907 		    (vaddr_t)pdes[level - 2];
1908 		pmap_tlb_shootdown(curpmap, invaladdr + index * PAGE_SIZE,
1909 		    0, opde);
1910 #if defined(MULTIPROCESSOR)
1911 		invaladdr2 = level == 1 ? (vaddr_t)PTE_BASE :
1912 		    (vaddr_t)normal_pdes[level - 2];
1913 		if (pmap != curpmap || invaladdr != invaladdr2) {
1914 			pmap_tlb_shootdown(pmap, invaladdr2 + index * PAGE_SIZE,
1915 			    0, opde);
1916 		}
1917 #endif
1918 		if (level < PTP_LEVELS - 1) {
1919 			ptp = pmap_find_ptp(pmap, va, (paddr_t)-1, level + 1);
1920 			ptp->wire_count--;
1921 			if (ptp->wire_count > 1)
1922 				break;
1923 		}
1924 	} while (++level < PTP_LEVELS);
1925 	pmap_pte_flush();
1926 }
1927 
1928 /*
1929  * pmap_get_ptp: get a PTP (if there isn't one, allocate a new one)
1930  *
1931  * => pmap should NOT be pmap_kernel()
1932  * => pmap should be locked
1933  * => preemption should be disabled
1934  */
1935 
1936 static struct vm_page *
1937 pmap_get_ptp(struct pmap *pmap, vaddr_t va, pd_entry_t * const *pdes)
1938 {
1939 	struct vm_page *ptp, *pptp;
1940 	int i;
1941 	unsigned long index;
1942 	pd_entry_t *pva;
1943 	paddr_t ppa, pa;
1944 	struct uvm_object *obj;
1945 
1946 	KASSERT(pmap != pmap_kernel());
1947 	KASSERT(mutex_owned(&pmap->pm_lock));
1948 	KASSERT(kpreempt_disabled());
1949 
1950 	ptp = NULL;
1951 	pa = (paddr_t)-1;
1952 
1953 	/*
1954 	 * Loop through all page table levels seeing if we need to
1955 	 * add a new page to that level.
1956 	 */
1957 	for (i = PTP_LEVELS; i > 1; i--) {
1958 		/*
1959 		 * Save values from previous round.
1960 		 */
1961 		pptp = ptp;
1962 		ppa = pa;
1963 
1964 		index = pl_i(va, i);
1965 		pva = pdes[i - 2];
1966 
1967 		if (pmap_valid_entry(pva[index])) {
1968 			ppa = pmap_pte2pa(pva[index]);
1969 			ptp = NULL;
1970 			continue;
1971 		}
1972 
1973 		obj = &pmap->pm_obj[i-2];
1974 		PMAP_SUBOBJ_LOCK(pmap, i - 2);
1975 		ptp = uvm_pagealloc(obj, ptp_va2o(va, i - 1), NULL,
1976 		    UVM_PGA_USERESERVE|UVM_PGA_ZERO);
1977 		PMAP_SUBOBJ_UNLOCK(pmap, i - 2);
1978 
1979 		if (ptp == NULL)
1980 			return NULL;
1981 
1982 		ptp->flags &= ~PG_BUSY; /* never busy */
1983 		ptp->wire_count = 1;
1984 		pmap->pm_ptphint[i - 2] = ptp;
1985 		pa = VM_PAGE_TO_PHYS(ptp);
1986 		pmap_pte_set(&pva[index], (pd_entry_t)
1987 		        (pmap_pa2pte(pa) | PG_u | PG_RW | PG_V));
1988 #if defined(XEN) && defined(__x86_64__)
1989 		/*
1990 		 * In Xen we must enter the mapping in kernel map too
1991 		 * if pmap is curmap and modifying top level (PGD)
1992 		 */
1993 		if (i == PTP_LEVELS && pmap != pmap_kernel()) {
1994 		        pmap_pte_set(&pmap_kernel()->pm_pdir[index],
1995 		                (pd_entry_t) (pmap_pa2pte(pa)
1996 		                        | PG_u | PG_RW | PG_V));
1997 		}
1998 #endif /* XEN && __x86_64__ */
1999 		pmap_pte_flush();
2000 		pmap_stats_update(pmap, 1, 0);
2001 		/*
2002 		 * If we're not in the top level, increase the
2003 		 * wire count of the parent page.
2004 		 */
2005 		if (i < PTP_LEVELS) {
2006 			if (pptp == NULL)
2007 				pptp = pmap_find_ptp(pmap, va, ppa, i);
2008 #ifdef DIAGNOSTIC
2009 			if (pptp == NULL)
2010 				panic("pde page disappeared");
2011 #endif
2012 			pptp->wire_count++;
2013 		}
2014 	}
2015 
2016 	/*
2017 	 * ptp is not NULL if we just allocated a new ptp. If it's
2018 	 * still NULL, we must look up the existing one.
2019 	 */
2020 	if (ptp == NULL) {
2021 		ptp = pmap_find_ptp(pmap, va, ppa, 1);
2022 #ifdef DIAGNOSTIC
2023 		if (ptp == NULL) {
2024 			printf("va %lx ppa %lx\n", (unsigned long)va,
2025 			    (unsigned long)ppa);
2026 			panic("pmap_get_ptp: unmanaged user PTP");
2027 		}
2028 #endif
2029 	}
2030 
2031 	pmap->pm_ptphint[0] = ptp;
2032 	return(ptp);
2033 }
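
/*
 * A minimal caller sketch, assuming the usual pmap_map_ptes() setup
 * (this mirrors what the enter/fault paths in this file do):
 *
 *	kpreempt_disable();
 *	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);	/* locks pmap */
 *	ptp = pmap_get_ptp(pmap, va, pdes);
 *	if (ptp == NULL) {
 *		/* allocation failed; caller must unwind and retry */
 *	}
 *	...
 *	pmap_unmap_ptes(pmap, pmap2);			/* unlocks pmap */
 *	kpreempt_enable();
 */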
2034 
2035 /*
2036  * p m a p  l i f e c y c l e   f u n c t i o n s
2037  */
2038 
2039 /*
2040  * pmap_pdp_ctor: constructor for the PDP cache.
2041  */
2042 
2043 int
2044 pmap_pdp_ctor(void *arg, void *v, int flags)
2045 {
2046 	pd_entry_t *pdir = v;
2047 	paddr_t pdirpa = 0;	/* XXX: GCC */
2048 	vaddr_t object;
2049 	int i;
2050 
2051 #if !defined(XEN) || !defined(__x86_64__)
2052 	int npde;
2053 #endif
2054 #ifdef XEN
2055 	int s;
2056 #endif
2057 
2058 	/*
2059 	 * NOTE: The `pmap_lock' is held when the PDP is allocated.
2060 	 */
2061 
2062 #if defined(XEN) && defined(__x86_64__)
2063 	/* fetch the physical address of the page directory. */
2064 	(void) pmap_extract(pmap_kernel(), (vaddr_t) pdir, &pdirpa);
2065 
2066 	/* zero init area */
2067 	memset(pdir, 0, PAGE_SIZE); /* Xen wants a clean page */
2068 	/*
2069 	 * this pdir will NEVER be active in kernel mode
2070 	 * so mark recursive entry invalid
2071 	 */
2072 	pdir[PDIR_SLOT_PTE] = pmap_pa2pte(pdirpa) | PG_u;
2073 	/*
2074 	 * A PDP constructed this way will never be used by the kernel,
2075 	 * hence we don't enter the kernel mappings here on Xen.
2076 	 * But we need to make pmap_create() happy, so put a dummy (without
2077 	 * PG_V) value at the right place.
2078 	 */
2079 	pdir[PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1] - 1] =
2080 	     (unsigned long)-1 & PG_FRAME;
2081 #else /* XEN  && __x86_64__*/
2082 	/* zero init area */
2083 	memset(pdir, 0, PDIR_SLOT_PTE * sizeof(pd_entry_t));
2084 
2085 	object = (vaddr_t)v;
2086 	for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) {
2087 		/* fetch the physical address of the page directory. */
2088 		(void) pmap_extract(pmap_kernel(), object, &pdirpa);
2089 		/* put in recursive PDE to map the PTEs */
2090 		pdir[PDIR_SLOT_PTE + i] = pmap_pa2pte(pdirpa) | PG_V;
2091 #ifndef XEN
2092 		pdir[PDIR_SLOT_PTE + i] |= PG_KW;
2093 #endif
2094 	}
2095 
2096 	/* copy kernel's PDE */
2097 	npde = nkptp[PTP_LEVELS - 1];
2098 
2099 	memcpy(&pdir[PDIR_SLOT_KERN], &PDP_BASE[PDIR_SLOT_KERN],
2100 	    npde * sizeof(pd_entry_t));
2101 
2102 	/* zero the rest */
2103 	memset(&pdir[PDIR_SLOT_KERN + npde], 0,
2104 	    (NTOPLEVEL_PDES - (PDIR_SLOT_KERN + npde)) * sizeof(pd_entry_t));
2105 
2106 	if (VM_MIN_KERNEL_ADDRESS != KERNBASE) {
2107 		int idx = pl_i(KERNBASE, PTP_LEVELS);
2108 
2109 		pdir[idx] = PDP_BASE[idx];
2110 	}
2111 #endif /* XEN  && __x86_64__*/
2112 #ifdef XEN
2113 	s = splvm();
2114 	object = (vaddr_t)v;
2115 	for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) {
2116 		(void) pmap_extract(pmap_kernel(), object, &pdirpa);
2117 		/* remap this page RO */
2118 		pmap_kenter_pa(object, pdirpa, VM_PROT_READ, 0);
2119 		pmap_update(pmap_kernel());
2120 		/*
2121 		 * pin as L2/L4 page, we have to do the page with the
2122 		 * PDIR_SLOT_PTE entries last
2123 		 */
2124 #ifdef PAE
2125 		if (i == l2tol3(PDIR_SLOT_PTE))
2126 			continue;
2127 #endif
2128 		xpq_queue_pin_table(xpmap_ptom_masked(pdirpa));
2129 	}
2130 #ifdef PAE
2131 	object = ((vaddr_t)pdir) + PAGE_SIZE * l2tol3(PDIR_SLOT_PTE);
2132 	(void)pmap_extract(pmap_kernel(), object, &pdirpa);
2133 	xpq_queue_pin_table(xpmap_ptom_masked(pdirpa));
2134 #endif
2135 	xpq_flush_queue();
2136 	splx(s);
2137 #endif /* XEN */
2138 
2139 	return (0);
2140 }
2141 
2142 /*
2143  * pmap_pdp_dtor: destructor for the PDP cache.
2144  */
2145 
2146 void
2147 pmap_pdp_dtor(void *arg, void *v)
2148 {
2149 #ifdef XEN
2150 	paddr_t pdirpa = 0;	/* XXX: GCC */
2151 	vaddr_t object = (vaddr_t)v;
2152 	int i;
2153 	int s = splvm();
2154 	pt_entry_t *pte;
2155 
2156 	for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) {
2157 		/* fetch the physical address of the page directory. */
2158 		(void) pmap_extract(pmap_kernel(), object, &pdirpa);
2159 		/* unpin page table */
2160 		xpq_queue_unpin_table(xpmap_ptom_masked(pdirpa));
2161 	}
2162 	object = (vaddr_t)v;
2163 	for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) {
2164 		/* Set page RW again */
2165 		pte = kvtopte(object);
2166 		xpq_queue_pte_update(xpmap_ptetomach(pte), *pte | PG_RW);
2167 		xpq_queue_invlpg((vaddr_t)object);
2168 	}
2169 	xpq_flush_queue();
2170 	splx(s);
2171 #endif  /* XEN */
2172 }
2173 
2174 #ifdef PAE
2175 
2176 /* pmap_pdp_alloc: Allocate a page for the pdp memory pool. */
2177 
2178 void *
2179 pmap_pdp_alloc(struct pool *pp, int flags)
2180 {
2181 	return (void *)uvm_km_alloc(kernel_map,
2182 	    PAGE_SIZE * PDP_SIZE, PAGE_SIZE * PDP_SIZE,
2183 	    ((flags & PR_WAITOK) ? 0 : UVM_KMF_NOWAIT | UVM_KMF_TRYLOCK)
2184 	    | UVM_KMF_WIRED);
2185 }
2186 
2187 /*
2188  * pmap_pdp_free: free a PDP
2189  */
2190 
2191 void
2192 pmap_pdp_free(struct pool *pp, void *v)
2193 {
2194 	uvm_km_free(kernel_map, (vaddr_t)v, PAGE_SIZE * PDP_SIZE,
2195 	    UVM_KMF_WIRED);
2196 }
2197 #endif /* PAE */
2198 
2199 /*
2200  * pmap_create: create a pmap
2201  *
2202  * => note: the old pmap interface took a "size" arg which allowed for
2203  *	the creation of "software only" pmaps (not in bsd).
2204  */
2205 
2206 struct pmap *
2207 pmap_create(void)
2208 {
2209 	struct pmap *pmap;
2210 	int i;
2211 
2212 	pmap = pool_cache_get(&pmap_cache, PR_WAITOK);
2213 
2214 	/* init uvm_object */
2215 	for (i = 0; i < PTP_LEVELS - 1; i++) {
2216 		UVM_OBJ_INIT(&pmap->pm_obj[i], NULL, 1);
2217 		pmap->pm_ptphint[i] = NULL;
2218 	}
2219 	pmap->pm_stats.wired_count = 0;
2220 	pmap->pm_stats.resident_count = 1;	/* count the PDP allocd below */
2221 #if !defined(__x86_64__)
2222 	pmap->pm_hiexec = 0;
2223 #endif /* !defined(__x86_64__) */
2224 	pmap->pm_flags = 0;
2225 	pmap->pm_cpus = 0;
2226 	pmap->pm_kernel_cpus = 0;
2227 
2228 	/* init the LDT */
2229 	pmap->pm_ldt = NULL;
2230 	pmap->pm_ldt_len = 0;
2231 	pmap->pm_ldt_sel = GSYSSEL(GLDT_SEL, SEL_KPL);
2232 
2233 	/* allocate PDP */
2234  try_again:
2235 	pmap->pm_pdir = pool_cache_get(&pmap_pdp_cache, PR_WAITOK);
2236 
2237 	mutex_enter(&pmaps_lock);
2238 
2239 	if (pmap->pm_pdir[PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1] - 1] == 0) {
2240 		mutex_exit(&pmaps_lock);
2241 		pool_cache_destruct_object(&pmap_pdp_cache, pmap->pm_pdir);
2242 		goto try_again;
2243 	}
2244 
2245 #ifdef PAE
2246 	for (i = 0; i < PDP_SIZE; i++)
2247 		pmap->pm_pdirpa[i] =
2248 		    pmap_pte2pa(pmap->pm_pdir[PDIR_SLOT_PTE + i]);
2249 #else
2250 	pmap->pm_pdirpa = pmap_pte2pa(pmap->pm_pdir[PDIR_SLOT_PTE]);
2251 #endif
2252 
2253 	LIST_INSERT_HEAD(&pmaps, pmap, pm_list);
2254 
2255 	mutex_exit(&pmaps_lock);
2256 
2257 	return (pmap);
2258 }
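
/*
 * Lifecycle sketch (simplified): a pmap is created with one reference,
 * extra references are taken with pmap_reference(), and the structure
 * is reclaimed when the last pmap_destroy() drops the count to zero:
 *
 *	struct pmap *pm = pmap_create();
 *	pmap_reference(pm);		/* e.g. while lazily switched-to */
 *	...
 *	pmap_destroy(pm);		/* drops the extra reference */
 *	pmap_destroy(pm);		/* last reference: frees the pmap */
 */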
2259 
2260 /*
2261  * pmap_destroy: drop reference count on pmap.   free pmap if
2262  *	reference count goes to zero.
2263  */
2264 
2265 void
2266 pmap_destroy(struct pmap *pmap)
2267 {
2268 	int i;
2269 #ifdef DIAGNOSTIC
2270 	struct cpu_info *ci;
2271 	CPU_INFO_ITERATOR cii;
2272 #endif /* DIAGNOSTIC */
2273 
2274 	/*
2275 	 * if we have torn down this pmap, process deferred frees and
2276 	 * invalidations now.
2277 	 */
2278 	if (__predict_false(curlwp->l_md.md_gc_pmap == pmap)) {
2279 		pmap_update(pmap);
2280 	}
2281 
2282 	/*
2283 	 * drop reference count
2284 	 */
2285 
2286 	if (atomic_dec_uint_nv(&pmap->pm_obj[0].uo_refs) > 0) {
2287 		return;
2288 	}
2289 
2290 #ifdef DIAGNOSTIC
2291 	for (CPU_INFO_FOREACH(cii, ci))
2292 		if (ci->ci_pmap == pmap)
2293 			panic("destroying pmap being used");
2294 #endif /* DIAGNOSTIC */
2295 
2296 	/*
2297 	 * reference count is zero, free pmap resources and then free pmap.
2298 	 */
2299 #ifdef XEN
2300 	/*
2301 	 * Xen lazy APDP handling:
2302 	 * clear APDP_PDE if pmap is the one currently mapped
2303 	 */
2304 	if (xpmap_ptom_masked(pmap_pdirpa(pmap, 0)) == (*APDP_PDE & PG_FRAME)) {
2305 		kpreempt_disable();
2306 		for (i = 0; i < PDP_SIZE; i++) {
2307 			pmap_pte_set(&APDP_PDE[i], 0);
2308 #ifdef PAE
2309 			/* clear shadow entry too */
2310 			pmap_pte_set(&APDP_PDE_SHADOW[i], 0);
2311 #endif
2312 		}
2313 		pmap_pte_flush();
2314 		pmap_apte_flush(pmap_kernel());
2315 		kpreempt_enable();
2316 	}
2317 #endif
2318 
2319 	/*
2320 	 * remove it from global list of pmaps
2321 	 */
2322 
2323 	mutex_enter(&pmaps_lock);
2324 	LIST_REMOVE(pmap, pm_list);
2325 	mutex_exit(&pmaps_lock);
2326 
2327 	/*
2328 	 * destroyed pmap shouldn't have remaining PTPs
2329 	 */
2330 
2331 	for (i = 0; i < PTP_LEVELS - 1; i++) {
2332 		KASSERT(pmap->pm_obj[i].uo_npages == 0);
2333 		KASSERT(TAILQ_EMPTY(&pmap->pm_obj[i].memq));
2334 	}
2335 
2336 	/*
2337 	 * MULTIPROCESSOR -- no need to flush out of other processors'
2338 	 * APTE space because we do that in pmap_unmap_ptes().
2339 	 */
2340 	pool_cache_put(&pmap_pdp_cache, pmap->pm_pdir);
2341 
2342 #ifdef USER_LDT
2343 	if (pmap->pm_ldt != NULL) {
2344 		/*
2345 		 * no need to switch the LDT; this address space is gone,
2346 		 * nothing is using it.
2347 		 *
2348 		 * No need to lock the pmap for ldt_free (or anything else),
2349 		 * we're the last one to use it.
2350 		 */
2351 		mutex_enter(&cpu_lock);
2352 		ldt_free(pmap->pm_ldt_sel);
2353 		mutex_exit(&cpu_lock);
2354 		uvm_km_free(kernel_map, (vaddr_t)pmap->pm_ldt,
2355 		    pmap->pm_ldt_len, UVM_KMF_WIRED);
2356 	}
2357 #endif
2358 
2359 	for (i = 0; i < PTP_LEVELS - 1; i++)
2360 		mutex_destroy(&pmap->pm_obj[i].vmobjlock);
2361 	pool_cache_put(&pmap_cache, pmap);
2362 }
2363 
2364 /*
2365  * pmap_remove_all: pmap is being torn down by the current thread.
2366  * avoid unnecessary invalidations.
2367  */
2368 
2369 void
2370 pmap_remove_all(struct pmap *pmap)
2371 {
2372 	lwp_t *l = curlwp;
2373 
2374 	KASSERT(l->l_md.md_gc_pmap == NULL);
2375 
2376 	l->l_md.md_gc_pmap = pmap;
2377 }
2378 
2379 #if defined(PMAP_FORK)
2380 /*
2381  * pmap_fork: perform any necessary data structure manipulation when
2382  * a VM space is forked.
2383  */
2384 
2385 void
2386 pmap_fork(struct pmap *pmap1, struct pmap *pmap2)
2387 {
2388 #ifdef USER_LDT
2389 	union descriptor *new_ldt;
2390 	size_t len;
2391 	int sel;
2392 
2393 	if (__predict_true(pmap1->pm_ldt == NULL)) {
2394 		return;
2395 	}
2396 
2397  retry:
2398 	if (pmap1->pm_ldt != NULL) {
2399 		len = pmap1->pm_ldt_len;
2400 		new_ldt = (union descriptor *)uvm_km_alloc(kernel_map, len, 0,
2401 		    UVM_KMF_WIRED);
2402 		mutex_enter(&cpu_lock);
2403 		sel = ldt_alloc(new_ldt, len);
2404 		if (sel == -1) {
2405 			mutex_exit(&cpu_lock);
2406 			uvm_km_free(kernel_map, (vaddr_t)new_ldt, len,
2407 			    UVM_KMF_WIRED);
2408 			printf("WARNING: pmap_fork: unable to allocate LDT\n");
2409 			return;
2410 		}
2411 	} else {
2412 		len = -1;
2413 		new_ldt = NULL;
2414 		sel = -1;
2415 		mutex_enter(&cpu_lock);
2416 	}
2417 
2418 	/* Copy the LDT, if necessary. */
2419 	if (pmap1->pm_ldt != NULL) {
2420 		if (len != pmap1->pm_ldt_len) {
2421 			if (len != -1) {
2422 				ldt_free(sel);
2423 				uvm_km_free(kernel_map, (vaddr_t)new_ldt,
2424 				    len, UVM_KMF_WIRED);
2425 			}
2426 			mutex_exit(&cpu_lock);
2427 			goto retry;
2428 		}
2429 
2430 		memcpy(new_ldt, pmap1->pm_ldt, len);
2431 		pmap2->pm_ldt = new_ldt;
2432 		pmap2->pm_ldt_len = pmap1->pm_ldt_len;
2433 		pmap2->pm_ldt_sel = sel;
2434 		len = -1;
2435 	}
2436 
2437 	if (len != -1) {
2438 		ldt_free(sel);
2439 		uvm_km_free(kernel_map, (vaddr_t)new_ldt, len,
2440 		    UVM_KMF_WIRED);
2441 	}
2442 	mutex_exit(&cpu_lock);
2443 #endif /* USER_LDT */
2444 }
2445 #endif /* PMAP_FORK */
2446 
2447 #ifdef USER_LDT
2448 
2449 /*
2450  * pmap_ldt_xcall: cross call used by pmap_ldt_sync.  if the named pmap
2451  * is active, reload LDTR.
2452  */
2453 static void
2454 pmap_ldt_xcall(void *arg1, void *arg2)
2455 {
2456 	struct pmap *pm;
2457 
2458 	kpreempt_disable();
2459 	pm = arg1;
2460 	if (curcpu()->ci_pmap == pm) {
2461 		lldt(pm->pm_ldt_sel);
2462 	}
2463 	kpreempt_enable();
2464 }
2465 
2466 /*
2467  * pmap_ldt_sync: LDT selector for the named pmap is changing.  swap
2468  * in the new selector on all CPUs.
2469  */
2470 void
2471 pmap_ldt_sync(struct pmap *pm)
2472 {
2473 	uint64_t where;
2474 
2475 	KASSERT(mutex_owned(&cpu_lock));
2476 
2477 	pmap_ldt_evcnt.ev_count++;
2478 	where = xc_broadcast(0, pmap_ldt_xcall, pm, NULL);
2479 	xc_wait(where);
2480 }
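
/*
 * Typical use (cf. pmap_ldt_cleanup() below): update the pmap's LDT
 * fields under cpu_lock, then broadcast the change to all CPUs:
 *
 *	mutex_enter(&cpu_lock);
 *	pmap->pm_ldt_sel = sel;
 *	pmap_ldt_sync(pmap);
 *	mutex_exit(&cpu_lock);
 */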
2481 
2482 /*
2483  * pmap_ldt_cleanup: if the pmap has a local LDT, deallocate it, and
2484  * restore the default.
2485  */
2486 
2487 void
2488 pmap_ldt_cleanup(struct lwp *l)
2489 {
2490 	pmap_t pmap = l->l_proc->p_vmspace->vm_map.pmap;
2491 	union descriptor *dp = NULL;
2492 	size_t len = 0;
2493 	int sel = -1;
2494 
2495 	if (__predict_true(pmap->pm_ldt == NULL)) {
2496 		return;
2497 	}
2498 
2499 	mutex_enter(&cpu_lock);
2500 	if (pmap->pm_ldt != NULL) {
2501 		sel = pmap->pm_ldt_sel;
2502 		dp = pmap->pm_ldt;
2503 		len = pmap->pm_ldt_len;
2504 		pmap->pm_ldt_sel = GSYSSEL(GLDT_SEL, SEL_KPL);
2505 		pmap->pm_ldt = NULL;
2506 		pmap->pm_ldt_len = 0;
2507 		pmap_ldt_sync(pmap);
2508 		ldt_free(sel);
2509 		uvm_km_free(kernel_map, (vaddr_t)dp, len, UVM_KMF_WIRED);
2510 	}
2511 	mutex_exit(&cpu_lock);
2512 }
2513 #endif /* USER_LDT */
2514 
2515 /*
2516  * pmap_activate: activate a process' pmap
2517  *
2518  * => must be called with kernel preemption disabled
2519  * => if lwp is the curlwp, then set ci_want_pmapload so that
2520  *    actual MMU context switch will be done by pmap_load() later
2521  */
2522 
2523 void
2524 pmap_activate(struct lwp *l)
2525 {
2526 	struct cpu_info *ci;
2527 	struct pmap *pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map);
2528 
2529 	KASSERT(kpreempt_disabled());
2530 
2531 	ci = curcpu();
2532 
2533 	if (l == ci->ci_curlwp) {
2534 		struct pcb *pcb;
2535 
2536 		KASSERT(ci->ci_want_pmapload == 0);
2537 		KASSERT(ci->ci_tlbstate != TLBSTATE_VALID);
2538 #ifdef KSTACK_CHECK_DR0
2539 		/*
2540 		 * setup breakpoint on the top of stack
2541 		 */
2542 		if (l == &lwp0)
2543 			dr0(0, 0, 0, 0);
2544 		else
2545 			dr0(KSTACK_LOWEST_ADDR(l), 1, 3, 1);
2546 #endif
2547 
2548 		/*
2549 		 * no need to switch to kernel vmspace because
2550 		 * it's a subset of any vmspace.
2551 		 */
2552 
2553 		if (pmap == pmap_kernel()) {
2554 			ci->ci_want_pmapload = 0;
2555 			return;
2556 		}
2557 
2558 		pcb = lwp_getpcb(l);
2559 		ci->ci_want_pmapload = 1;
2560 
2561 #if defined(__x86_64__)
2562 		if (pcb->pcb_flags & PCB_GS64)
2563 			wrmsr(MSR_KERNELGSBASE, pcb->pcb_gs);
2564 		if (pcb->pcb_flags & PCB_FS64)
2565 			wrmsr(MSR_FSBASE, pcb->pcb_fs);
2566 #endif /* defined(__x86_64__) */
2567 	}
2568 }
2569 
2570 /*
2571  * pmap_reactivate: try to regain reference to the pmap.
2572  * pmap_reactivate: try to regain our lazy reference to the pmap.
2573  * => must be called with kernel preemption disabled
2574  */
2575 
2576 static bool
2577 pmap_reactivate(struct pmap *pmap)
2578 {
2579 	struct cpu_info *ci;
2580 	uint32_t cpumask;
2581 	bool result;
2582 	uint32_t oldcpus;
2583 
2584 	ci = curcpu();
2585 	cpumask = ci->ci_cpumask;
2586 
2587 	KASSERT(kpreempt_disabled());
2588 #if defined(XEN) && defined(__x86_64__)
2589 	KASSERT(pmap->pm_pdirpa == xen_current_user_pgd);
2590 #elif defined(PAE)
2591 	KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(pmap_l3pd[0]));
2592 #elif !defined(XEN)
2593 	KASSERT(pmap->pm_pdirpa == pmap_pte2pa(rcr3()));
2594 #endif
2595 
2596 	/*
2597 	 * if we still have a lazy reference to this pmap,
2598 	 * we can assume that there was no tlb shootdown
2599 	 * for this pmap in the meantime.
2600 	 *
2601 	 * the order of events here is important as we must
2602 	 * synchronize with TLB shootdown interrupts.  declare
2603 	 * interest in invalidations (TLBSTATE_VALID) and then
2604 	 * check the cpumask, which the IPIs can change only
2605 	 * when the state is TLBSTATE_LAZY.
2606 	 */
2607 
2608 	ci->ci_tlbstate = TLBSTATE_VALID;
2609 	oldcpus = pmap->pm_cpus;
2610 	KASSERT((pmap->pm_kernel_cpus & cpumask) != 0);
2611 	if (oldcpus & cpumask) {
2612 		/* got it */
2613 		result = true;
2614 	} else {
2615 		/* must reload */
2616 		atomic_or_32(&pmap->pm_cpus, cpumask);
2617 		result = false;
2618 	}
2619 
2620 	return result;
2621 }
2622 
2623 /*
2624  * pmap_load: actually switch pmap.  (fill in %cr3 and LDT info)
2625  */
2626 
2627 void
2628 pmap_load(void)
2629 {
2630 	struct cpu_info *ci;
2631 	uint32_t cpumask;
2632 	struct pmap *pmap;
2633 	struct pmap *oldpmap;
2634 	struct lwp *l;
2635 	struct pcb *pcb;
2636 	uint64_t ncsw;
2637 
2638 	kpreempt_disable();
2639  retry:
2640 	ci = curcpu();
2641 	if (!ci->ci_want_pmapload) {
2642 		kpreempt_enable();
2643 		return;
2644 	}
2645 	cpumask = ci->ci_cpumask;
2646 	l = ci->ci_curlwp;
2647 	ncsw = l->l_ncsw;
2648 
2649 	/* should be able to take ipis. */
2650 	KASSERT(ci->ci_ilevel < IPL_HIGH);
2651 #ifdef XEN
2652 	/* XXX not yet KASSERT(x86_read_psl() != 0); */
2653 #else
2654 	KASSERT((x86_read_psl() & PSL_I) != 0);
2655 #endif
2656 
2657 	KASSERT(l != NULL);
2658 	pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map);
2659 	KASSERT(pmap != pmap_kernel());
2660 	oldpmap = ci->ci_pmap;
2661 	pcb = lwp_getpcb(l);
2662 
2663 	if (pmap == oldpmap) {
2664 		if (!pmap_reactivate(pmap)) {
2665 			u_int gen = uvm_emap_gen_return();
2666 
2667 			/*
2668 			 * the pmap has been changed while we were deactivated;
2669 			 * our TLB may be stale.
2670 			 */
2671 
2672 			tlbflush();
2673 			uvm_emap_update(gen);
2674 		}
2675 
2676 		ci->ci_want_pmapload = 0;
2677 		kpreempt_enable();
2678 		return;
2679 	}
2680 
2681 	/*
2682 	 * grab a reference to the new pmap.
2683 	 */
2684 
2685 	pmap_reference(pmap);
2686 
2687 	/*
2688 	 * actually switch pmap.
2689 	 */
2690 
2691 	atomic_and_32(&oldpmap->pm_cpus, ~cpumask);
2692 	atomic_and_32(&oldpmap->pm_kernel_cpus, ~cpumask);
2693 
2694 #if defined(XEN) && defined(__x86_64__)
2695 	KASSERT(oldpmap->pm_pdirpa == xen_current_user_pgd ||
2696 	    oldpmap == pmap_kernel());
2697 #elif defined(PAE)
2698 	KASSERT(pmap_pdirpa(oldpmap, 0) == pmap_pte2pa(pmap_l3pd[0]));
2699 #elif !defined(XEN)
2700 	KASSERT(oldpmap->pm_pdirpa == pmap_pte2pa(rcr3()));
2701 #endif
2702 	KASSERT((pmap->pm_cpus & cpumask) == 0);
2703 	KASSERT((pmap->pm_kernel_cpus & cpumask) == 0);
2704 
2705 	/*
2706 	 * mark the pmap in use by this processor.  again we must
2707 	 * synchronize with TLB shootdown interrupts, so set the
2708 	 * state VALID first, then register us for shootdown events
2709 	 * on this pmap.
2710 	 */
2711 
2712 	ci->ci_tlbstate = TLBSTATE_VALID;
2713 	atomic_or_32(&pmap->pm_cpus, cpumask);
2714 	atomic_or_32(&pmap->pm_kernel_cpus, cpumask);
2715 	ci->ci_pmap = pmap;
2716 
2717 	/*
2718 	 * update tss.  now that we have registered for invalidations
2719 	 * from other CPUs, we're good to load the page tables.
2720 	 */
2721 #ifdef PAE
2722 	pcb->pcb_cr3 = pmap_l3paddr;
2723 #else
2724 	pcb->pcb_cr3 = pmap->pm_pdirpa;
2725 #endif
2726 #if defined(XEN) && defined(__x86_64__)
2727 	/* kernel pmap always in cr3 and should never go in user cr3 */
2728 	if (pmap_pdirpa(pmap, 0) != pmap_pdirpa(pmap_kernel(), 0)) {
2729 		/*
2730 		 * Map user space address in kernel space and load
2731 		 * user cr3
2732 		 */
2733 		int i, s;
2734 		pd_entry_t *old_pgd, *new_pgd;
2735 		paddr_t addr;
2736 		s = splvm();
2737 		new_pgd  = pmap->pm_pdir;
2738 		old_pgd = pmap_kernel()->pm_pdir;
2739 		addr = xpmap_ptom(pmap_pdirpa(pmap_kernel(), 0));
2740 		for (i = 0; i < PDIR_SLOT_PTE;
2741 		    i++, addr += sizeof(pd_entry_t)) {
2742 			if ((new_pgd[i] & PG_V) || (old_pgd[i] & PG_V))
2743 				xpq_queue_pte_update(addr, new_pgd[i]);
2744 		}
2745 		xpq_flush_queue(); /* XXXtlb */
2746 		tlbflush();
2747 		xen_set_user_pgd(pmap_pdirpa(pmap, 0));
2748 		xen_current_user_pgd = pmap_pdirpa(pmap, 0);
2749 		splx(s);
2750 	}
2751 #else /* XEN && x86_64 */
2752 #if defined(XEN)
2753 	/*
2754 	 * clear APDP slot, in case it points to a page table that has
2755 	 * been freed
2756 	 */
2757 	if (*APDP_PDE) {
2758 		int i;
2759 		for (i = 0; i < PDP_SIZE; i++) {
2760 			pmap_pte_set(&APDP_PDE[i], 0);
2761 #ifdef PAE
2762 			/* clear shadow entry too */
2763 			pmap_pte_set(&APDP_PDE_SHADOW[i], 0);
2764 #endif
2765 		}
2766 	}
2767 	/* lldt() does pmap_pte_flush() */
2768 #else /* XEN */
2769 #if defined(i386)
2770 	ci->ci_tss.tss_ldt = pmap->pm_ldt_sel;
2771 	ci->ci_tss.tss_cr3 = pcb->pcb_cr3;
2772 #endif
2773 #endif /* XEN */
2774 	lldt(pmap->pm_ldt_sel);
2775 #ifdef PAE
2776 	{
2777 	paddr_t l3_pd = xpmap_ptom_masked(pmap_l3paddr);
2778 	int i;
2779 	int s = splvm();
2780 	/* don't update the kernel L3 slot */
2781 	for (i = 0 ; i < PDP_SIZE - 1  ; i++, l3_pd += sizeof(pd_entry_t)) {
2782 		xpq_queue_pte_update(l3_pd,
2783 		    xpmap_ptom(pmap->pm_pdirpa[i]) | PG_V);
2784 	}
2785 	tlbflush();
2786 	xpq_flush_queue();
2787 	splx(s);
2788 	}
2789 #else /* PAE */
2790 	{
2791 	u_int gen = uvm_emap_gen_return();
2792 	lcr3(pcb->pcb_cr3);
2793 	uvm_emap_update(gen);
2794 	}
2795 #endif /* PAE */
2796 #endif /* XEN && x86_64 */
2797 
2798 	ci->ci_want_pmapload = 0;
2799 
2800 	/*
2801 	 * we're now running with the new pmap.  drop the reference
2802 	 * to the old pmap.  if we block, we need to go around again.
2803 	 */
2804 
2805 	pmap_destroy(oldpmap);
2806 	if (l->l_ncsw != ncsw) {
2807 		goto retry;
2808 	}
2809 
2810 	kpreempt_enable();
2811 }
2812 
2813 /*
2814  * pmap_deactivate: deactivate a process' pmap
2815  *
2816  * => must be called with kernel preemption disabled (high SPL is enough)
2817  */
2818 
2819 void
2820 pmap_deactivate(struct lwp *l)
2821 {
2822 	struct pmap *pmap;
2823 	struct cpu_info *ci;
2824 
2825 	KASSERT(kpreempt_disabled());
2826 
2827 	if (l != curlwp) {
2828 		return;
2829 	}
2830 
2831 	/*
2832 	 * wait for pending TLB shootdowns to complete.  necessary
2833 	 * because TLB shootdown state is per-CPU, and the LWP may
2834 	 * be coming off the CPU before it has a chance to call
2835 	 * pmap_update().
2836 	 */
2837 	pmap_tlb_shootwait();
2838 
2839 	ci = curcpu();
2840 
2841 	if (ci->ci_want_pmapload) {
2842 		KASSERT(vm_map_pmap(&l->l_proc->p_vmspace->vm_map)
2843 		    != pmap_kernel());
2844 		KASSERT(vm_map_pmap(&l->l_proc->p_vmspace->vm_map)
2845 		    != ci->ci_pmap || ci->ci_tlbstate != TLBSTATE_VALID);
2846 
2847 		/*
2848 		 * userspace has not been touched.
2849 		 * nothing to do here.
2850 		 */
2851 
2852 		ci->ci_want_pmapload = 0;
2853 		return;
2854 	}
2855 
2856 	pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map);
2857 
2858 	if (pmap == pmap_kernel()) {
2859 		return;
2860 	}
2861 
2862 #if defined(XEN) && defined(__x86_64__)
2863 	KASSERT(pmap->pm_pdirpa == xen_current_user_pgd);
2864 #elif defined(PAE)
2865 	KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(pmap_l3pd[0]));
2866 #elif !defined(XEN)
2867 	KASSERT(pmap->pm_pdirpa == pmap_pte2pa(rcr3()));
2868 #endif
2869 	KASSERT(ci->ci_pmap == pmap);
2870 
2871 	/*
2872 	 * we aren't interested in TLB invalidations for this pmap,
2873 	 * at least for the time being.
2874 	 */
2875 
2876 	KASSERT(ci->ci_tlbstate == TLBSTATE_VALID);
2877 	ci->ci_tlbstate = TLBSTATE_LAZY;
2878 }
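
/*
 * Per-CPU TLB state, in brief (a summary of the protocol used by
 * pmap_activate/pmap_reactivate/pmap_load/pmap_deactivate above):
 *
 *	TLBSTATE_VALID	this CPU is registered in pm_cpus and receives
 *			shootdown IPIs; the TLB contents can be trusted.
 *	TLBSTATE_LAZY	the page tables are still loaded but we opted
 *			out of invalidations; a shootdown may clear our
 *			bit in pm_cpus, forcing a tlbflush() on the next
 *			pmap_reactivate().
 */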
2879 
2880 /*
2881  * end of lifecycle functions
2882  */
2883 
2884 /*
2885  * some misc. functions
2886  */
2887 
2888 static int
2889 pmap_pdes_invalid(vaddr_t va, pd_entry_t * const *pdes, pd_entry_t *lastpde)
2890 {
2891 	int i;
2892 	unsigned long index;
2893 	pd_entry_t pde;
2894 
2895 	for (i = PTP_LEVELS; i > 1; i--) {
2896 		index = pl_i(va, i);
2897 		pde = pdes[i - 2][index];
2898 		if ((pde & PG_V) == 0)
2899 			return i;
2900 	}
2901 	if (lastpde != NULL)
2902 		*lastpde = pde;
2903 	return 0;
2904 }
2905 
2906 /*
2907  * pmap_extract: extract a PA for the given VA
2908  */
2909 
2910 bool
2911 pmap_extract(struct pmap *pmap, vaddr_t va, paddr_t *pap)
2912 {
2913 	pt_entry_t *ptes, pte;
2914 	pd_entry_t pde;
2915 	pd_entry_t * const *pdes;
2916 	struct pmap *pmap2;
2917 	struct cpu_info *ci;
2918 	vaddr_t pa;
2919 	lwp_t *l;
2920 	bool hard, rv;
2921 
2922 	rv = false;
2923 	pa = 0;
2924 	l = curlwp;
2925 
2926 	KPREEMPT_DISABLE(l);
2927 	ci = l->l_cpu;
2928 	if (__predict_true(!ci->ci_want_pmapload && ci->ci_pmap == pmap) ||
2929 	    pmap == pmap_kernel()) {
2930 		/*
2931 		 * no need to lock, because it's pmap_kernel() or our
2932 		 * own pmap and is active.  if a user pmap, the caller
2933 		 * will hold the vm_map write/read locked and so prevent
2934 		 * entries from disappearing while we are here.  ptps
2935 		 * can disappear via pmap_remove() and pmap_protect(),
2936 		 * but they are called with the vm_map write locked.
2937 		 */
2938 		hard = false;
2939 		ptes = PTE_BASE;
2940 		pdes = normal_pdes;
2941 	} else {
2942 		/* we lose, do it the hard way. */
2943 		hard = true;
2944 		pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
2945 	}
2946 	if (pmap_pdes_valid(va, pdes, &pde)) {
2947 		pte = ptes[pl1_i(va)];
2948 		if (pde & PG_PS) {
2949 			pa = (pde & PG_LGFRAME) | (va & (NBPD_L2 - 1));
2950 			rv = true;
2951 		} else if (__predict_true((pte & PG_V) != 0)) {
2952 			pa = pmap_pte2pa(pte) | (va & (NBPD_L1 - 1));
2953 			rv = true;
2954 		}
2955 	}
2956 	if (__predict_false(hard)) {
2957 		pmap_unmap_ptes(pmap, pmap2);
2958 	}
2959 	KPREEMPT_ENABLE(l);
2960 	if (pap != NULL) {
2961 		*pap = pa;
2962 	}
2963 	return rv;
2964 }
2965 
2966 
2967 /*
2968  * vtophys: virtual address to physical address.  For use by
2969  * machine-dependent code only.
2970  */
2971 
2972 paddr_t
2973 vtophys(vaddr_t va)
2974 {
2975 	paddr_t pa;
2976 
2977 	if (pmap_extract(pmap_kernel(), va, &pa) == true)
2978 		return (pa);
2979 	return (0);
2980 }
2981 
2982 #ifdef XEN
2983 /*
2984  * pmap_extract_ma: extract a MA for the given VA
2985  */
2986 
2987 bool
2988 pmap_extract_ma(struct pmap *pmap, vaddr_t va, paddr_t *pap)
2989 {
2990 	pt_entry_t *ptes, pte;
2991 	pd_entry_t pde;
2992 	pd_entry_t * const *pdes;
2993 	struct pmap *pmap2;
2994 
2995 	kpreempt_disable();
2996 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
2997 	if (!pmap_pdes_valid(va, pdes, &pde)) {
2998 		pmap_unmap_ptes(pmap, pmap2);
2999 		kpreempt_enable();
3000 		return false;
3001 	}
3002 
3003 	pte = ptes[pl1_i(va)];
3004 	pmap_unmap_ptes(pmap, pmap2);
3005 	kpreempt_enable();
3006 
3007 	if (__predict_true((pte & PG_V) != 0)) {
3008 		if (pap != NULL)
3009 			*pap = (pte & PG_FRAME) | (va & (NBPD_L1 - 1));
3010 		return true;
3011 	}
3012 
3013 	return false;
3014 }
3015 
3016 /*
3017  * vtomach: virtual address to machine address.  For use by
3018  * machine-dependent code only.
3019  */
3020 
3021 paddr_t
3022 vtomach(vaddr_t va)
3023 {
3024 	paddr_t pa;
3025 
3026 	if (pmap_extract_ma(pmap_kernel(), va, &pa) == true)
3027 		return (pa);
3028 	return (0);
3029 }
3030 
3031 #endif /* XEN */
3032 
3033 
3034 
3035 /*
3036  * pmap_virtual_space: used during bootup [pmap_steal_memory] to
3037  *	determine the bounds of the kernel virtual address space.
3038  */
3039 
3040 void
3041 pmap_virtual_space(vaddr_t *startp, vaddr_t *endp)
3042 {
3043 	*startp = virtual_avail;
3044 	*endp = virtual_end;
3045 }
3046 
3047 /*
3048  * pmap_map: map a range of PAs into kvm.
3049  *
3050  * => used during crash dump
3051  * => XXX: pmap_map() should be phased out?
3052  */
3053 
3054 vaddr_t
3055 pmap_map(vaddr_t va, paddr_t spa, paddr_t epa, vm_prot_t prot)
3056 {
3057 	while (spa < epa) {
3058 		pmap_kenter_pa(va, spa, prot, 0);
3059 		va += PAGE_SIZE;
3060 		spa += PAGE_SIZE;
3061 	}
3062 	pmap_update(pmap_kernel());
3063 	return va;
3064 }
3065 
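
/*
 * Usage sketch (parameters are purely illustrative):
 *
 *	vaddr_t dumpva = ...;		/* hypothetical KVA window */
 *	dumpva = pmap_map(dumpva, dump_start_pa, dump_end_pa,
 *	    VM_PROT_READ | VM_PROT_WRITE);
 *	(no pmap_update() needed: pmap_map() already does it)
 */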
3066 /*
3067  * pmap_zero_page: zero a page
3068  */
3069 
3070 void
3071 pmap_zero_page(paddr_t pa)
3072 {
3073 	pt_entry_t *zpte;
3074 	void *zerova;
3075 	int id;
3076 
3077 	kpreempt_disable();
3078 	id = cpu_number();
3079 	zpte = PTESLEW(zero_pte, id);
3080 	zerova = VASLEW(zerop, id);
3081 
3082 #ifdef DIAGNOSTIC
3083 	if (*zpte)
3084 		panic("pmap_zero_page: lock botch");
3085 #endif
3086 
3087 	pmap_pte_set(zpte, pmap_pa2pte(pa) | PG_V | PG_RW | PG_M | PG_U | PG_k);
3088 	pmap_pte_flush();
3089 	pmap_update_pg((vaddr_t)zerova);		/* flush TLB */
3090 
3091 	memset(zerova, 0, PAGE_SIZE);
3092 
3093 #if defined(DIAGNOSTIC) || defined(XEN)
3094 	pmap_pte_set(zpte, 0);				/* zap ! */
3095 	pmap_pte_flush();
3096 #endif
3097 	kpreempt_enable();
3098 }
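
/*
 * Note on the mechanism above: PTESLEW()/VASLEW() select a per-CPU
 * temporary PTE slot and virtual window (zero_pte/zerop here), so the
 * mapping is private to this CPU and a local pmap_update_pg() is all
 * that is needed; this is also why preemption must stay disabled for
 * the duration.
 */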
3099 
3100 /*
3101  * pmap_pageidlezero: the same, for the idle loop page zero'er.
3102  * Returns true if the page was zero'd, false if we aborted for
3103  * some reason.
3104  */
3105 
3106 bool
3107 pmap_pageidlezero(paddr_t pa)
3108 {
3109 	pt_entry_t *zpte;
3110 	void *zerova;
3111 	bool rv;
3112 	int id;
3113 
3114 	id = cpu_number();
3115 	zpte = PTESLEW(zero_pte, id);
3116 	zerova = VASLEW(zerop, id);
3117 
3118 	KASSERT(cpu_feature & CPUID_SSE2);
3119 	KASSERT(*zpte == 0);
3120 
3121 	pmap_pte_set(zpte, pmap_pa2pte(pa) | PG_V | PG_RW | PG_M | PG_U | PG_k);
3122 	pmap_pte_flush();
3123 	pmap_update_pg((vaddr_t)zerova);		/* flush TLB */
3124 
3125 	rv = sse2_idlezero_page(zerova);
3126 
3127 #if defined(DIAGNOSTIC) || defined(XEN)
3128 	pmap_pte_set(zpte, 0);				/* zap ! */
3129 	pmap_pte_flush();
3130 #endif
3131 
3132 	return rv;
3133 }
3134 
3135 /*
3136  * pmap_copy_page: copy a page
3137  */
3138 
3139 void
3140 pmap_copy_page(paddr_t srcpa, paddr_t dstpa)
3141 {
3142 	pt_entry_t *spte;
3143 	pt_entry_t *dpte;
3144 	void *csrcva;
3145 	void *cdstva;
3146 	int id;
3147 
3148 	kpreempt_disable();
3149 	id = cpu_number();
3150 	spte = PTESLEW(csrc_pte, id);
3151 	dpte = PTESLEW(cdst_pte, id);
3152 	csrcva = VASLEW(csrcp, id);
3153 	cdstva = VASLEW(cdstp, id);
3154 
3155 	KASSERT(*spte == 0 && *dpte == 0);
3156 
3157 	pmap_pte_set(spte, pmap_pa2pte(srcpa) | PG_V | PG_RW | PG_U | PG_k);
3158 	pmap_pte_set(dpte,
3159 	    pmap_pa2pte(dstpa) | PG_V | PG_RW | PG_M | PG_U | PG_k);
3160 	pmap_pte_flush();
3161 	pmap_update_2pg((vaddr_t)csrcva, (vaddr_t)cdstva);
3162 
3163 	memcpy(cdstva, csrcva, PAGE_SIZE);
3164 
3165 #if defined(DIAGNOSTIC) || defined(XEN)
3166 	pmap_pte_set(spte, 0);
3167 	pmap_pte_set(dpte, 0);
3168 	pmap_pte_flush();
3169 #endif
3170 	kpreempt_enable();
3171 }
3172 
3173 static pt_entry_t *
3174 pmap_map_ptp(struct vm_page *ptp)
3175 {
3176 	pt_entry_t *ptppte;
3177 	void *ptpva;
3178 	int id;
3179 
3180 	KASSERT(kpreempt_disabled());
3181 
3182 	id = cpu_number();
3183 	ptppte = PTESLEW(ptp_pte, id);
3184 	ptpva = VASLEW(ptpp, id);
3185 #if !defined(XEN)
3186 	pmap_pte_set(ptppte, pmap_pa2pte(VM_PAGE_TO_PHYS(ptp)) | PG_V | PG_M |
3187 	    PG_RW | PG_U | PG_k);
3188 #else
3189 	pmap_pte_set(ptppte, pmap_pa2pte(VM_PAGE_TO_PHYS(ptp)) | PG_V | PG_M |
3190 	    PG_U | PG_k);
3191 #endif
3192 	pmap_pte_flush();
3193 	pmap_update_pg((vaddr_t)ptpva);
3194 
3195 	return (pt_entry_t *)ptpva;
3196 }
3197 
3198 static void
3199 pmap_unmap_ptp(void)
3200 {
3201 #if defined(DIAGNOSTIC) || defined(XEN)
3202 	pt_entry_t *pte;
3203 
3204 	KASSERT(kpreempt_disabled());
3205 
3206 	pte = PTESLEW(ptp_pte, cpu_number());
3207 	if (*pte != 0) {
3208 		pmap_pte_set(pte, 0);
3209 		pmap_pte_flush();
3210 	}
3211 #endif
3212 }
3213 
3214 static pt_entry_t *
3215 pmap_map_pte(struct pmap *pmap, struct vm_page *ptp, vaddr_t va)
3216 {
3217 
3218 	KASSERT(kpreempt_disabled());
3219 	if (pmap_is_curpmap(pmap)) {
3220 		return &PTE_BASE[pl1_i(va)]; /* (k)vtopte */
3221 	}
3222 	KASSERT(ptp != NULL);
3223 	return pmap_map_ptp(ptp) + pl1_pi(va);
3224 }
3225 
3226 static void
3227 pmap_unmap_pte(void)
3228 {
3229 
3230 	KASSERT(kpreempt_disabled());
3231 
3232 	pmap_unmap_ptp();
3233 }
3234 
3235 /*
3236  * p m a p   r e m o v e   f u n c t i o n s
3237  *
3238  * functions that remove mappings
3239  */
3240 
3241 /*
3242  * pmap_remove_ptes: remove PTEs from a PTP
3243  *
3244  * => must have proper locking on pmap_master_lock
3245  * => caller must hold pmap's lock
3246  * => PTP must be mapped into KVA
3247  * => PTP should be null if pmap == pmap_kernel()
3248  * => must be called with kernel preemption disabled
3249  * => returns composite pte if at least one page should be shot down
3250  */
3251 
3252 static pt_entry_t
3253 pmap_remove_ptes(struct pmap *pmap, struct vm_page *ptp, vaddr_t ptpva,
3254 		 vaddr_t startva, vaddr_t endva, struct pv_entry **pv_tofree)
3255 {
3256 	struct pv_entry *pve;
3257 	pt_entry_t *pte = (pt_entry_t *) ptpva;
3258 	pt_entry_t opte, xpte = 0;
3259 
3260 	KASSERT(pmap == pmap_kernel() || mutex_owned(&pmap->pm_lock));
3261 	KASSERT(kpreempt_disabled());
3262 
3263 	/*
3264 	 * note that ptpva points to the PTE that maps startva.   this may
3265 	 * or may not be the first PTE in the PTP.
3266 	 *
3267 	 * we loop through the PTP while there are still PTEs to look at
3268 	 * and the wire_count is greater than 1 (because we use the wire_count
3269 	 * to keep track of the number of real PTEs in the PTP).
3270 	 */
3271 
3272 	for (/*null*/; startva < endva && (ptp == NULL || ptp->wire_count > 1)
3273 			     ; pte++, startva += PAGE_SIZE) {
3274 		struct vm_page *pg;
3275 		struct pmap_page *pp;
3276 
3277 		if (!pmap_valid_entry(*pte))
3278 			continue;			/* VA not mapped */
3279 
3280 		/* atomically save the old PTE and zap! it */
3281 		opte = pmap_pte_testset(pte, 0);
3282 		if (!pmap_valid_entry(opte)) {
3283 			continue;
3284 		}
3285 
3286 		pmap_exec_account(pmap, startva, opte, 0);
3287 		pmap_stats_update_bypte(pmap, 0, opte);
3288 		xpte |= opte;
3289 
3290 		if (ptp) {
3291 			ptp->wire_count--;		/* dropping a PTE */
3292 			/* Make sure that the PDE is flushed */
3293 			if (ptp->wire_count <= 1)
3294 				xpte |= PG_U;
3295 		}
3296 
3297 		/*
3298 		 * if we are not on a pv_head list we are done.
3299 		 */
3300 
3301 		if ((opte & PG_PVLIST) == 0) {
3302 #if defined(DIAGNOSTIC) && !defined(DOM0OPS)
3303 			if (PHYS_TO_VM_PAGE(pmap_pte2pa(opte)) != NULL)
3304 				panic("pmap_remove_ptes: managed page without "
3305 				      "PG_PVLIST for 0x%lx", startva);
3306 #endif
3307 			continue;
3308 		}
3309 
3310 		pg = PHYS_TO_VM_PAGE(pmap_pte2pa(opte));
3311 #ifdef DIAGNOSTIC
3312 		if (pg == NULL)
3313 			panic("pmap_remove_ptes: unmanaged page marked "
3314 			      "PG_PVLIST, va = 0x%lx, pa = 0x%lx",
3315 			      startva, (u_long)pmap_pte2pa(opte));
3316 #endif
3317 
3318 		/* sync R/M bits */
3319 		pp = VM_PAGE_TO_PP(pg);
3320 		pp_lock(pp);
3321 		pp->pp_attrs |= opte;
3322 		pve = pmap_remove_pv(pp, ptp, startva);
3323 		pp_unlock(pp);
3324 
3325 		if (pve != NULL) {
3326 			pve->pve_next = *pv_tofree;
3327 			*pv_tofree = pve;
3328 		}
3329 
3330 		/* end of "for" loop: time for next pte */
3331 	}
3332 
3333 	return xpte;
3334 }
3335 
3336 
3337 /*
3338  * pmap_remove_pte: remove a single PTE from a PTP
3339  *
3340  * => must have proper locking on pmap_master_lock
3341  * => caller must hold pmap's lock
3342  * => PTP must be mapped into KVA
3343  * => PTP should be null if pmap == pmap_kernel()
3344  * => returns true if we removed a mapping
3345  * => must be called with kernel preemption disabled
3346  */
3347 
3348 static bool
3349 pmap_remove_pte(struct pmap *pmap, struct vm_page *ptp, pt_entry_t *pte,
3350 		vaddr_t va, struct pv_entry **pv_tofree)
3351 {
3352 	pt_entry_t opte;
3353 	struct pv_entry *pve;
3354 	struct vm_page *pg;
3355 	struct pmap_page *pp;
3356 
3357 	KASSERT(pmap == pmap_kernel() || mutex_owned(&pmap->pm_lock));
3358 	KASSERT(pmap == pmap_kernel() || kpreempt_disabled());
3359 
3360 	if (!pmap_valid_entry(*pte))
3361 		return(false);		/* VA not mapped */
3362 
3363 	/* atomically save the old PTE and zap! it */
3364 	opte = pmap_pte_testset(pte, 0);
3365 	if (!pmap_valid_entry(opte)) {
3366 		return false;
3367 	}
3368 
3369 	pmap_exec_account(pmap, va, opte, 0);
3370 	pmap_stats_update_bypte(pmap, 0, opte);
3371 
3372 	if (opte & PG_U)
3373 		pmap_tlb_shootdown(pmap, va, 0, opte);
3374 
3375 	if (ptp) {
3376 		ptp->wire_count--;		/* dropping a PTE */
3377 		/* Make sure that the PDE is flushed */
3378 		if ((ptp->wire_count <= 1) && !(opte & PG_U))
3379 			pmap_tlb_shootdown(pmap, va, 0, opte);
3380 	}
3381 
3382 	/*
3383 	 * if we are not on a pv_head list we are done.
3384 	 */
3385 
3386 	if ((opte & PG_PVLIST) == 0) {
3387 #if defined(DIAGNOSTIC) && !defined(DOM0OPS)
3388 		if (PHYS_TO_VM_PAGE(pmap_pte2pa(opte)) != NULL)
3389 			panic("pmap_remove_pte: managed page without "
3390 			      "PG_PVLIST for 0x%lx", va);
3391 #endif
3392 		return(true);
3393 	}
3394 
3395 	pg = PHYS_TO_VM_PAGE(pmap_pte2pa(opte));
3396 #ifdef DIAGNOSTIC
3397 	if (pg == NULL)
3398 		panic("pmap_remove_pte: unmanaged page marked "
3399 		    "PG_PVLIST, va = 0x%lx, pa = 0x%lx", va,
3400 		    (u_long)(pmap_pte2pa(opte)));
3401 #endif
3402 
3403 	/* sync R/M bits */
3404 	pp = VM_PAGE_TO_PP(pg);
3405 	pp_lock(pp);
3406 	pp->pp_attrs |= opte;
3407 	pve = pmap_remove_pv(pp, ptp, va);
3408 	pp_unlock(pp);
3409 
3410 	if (pve) {
3411 		pve->pve_next = *pv_tofree;
3412 		*pv_tofree = pve;
3413 	}
3414 
3415 	return(true);
3416 }
3417 
3418 /*
3419  * pmap_remove: mapping removal function.
3420  *
3421  * => caller should not be holding any pmap locks
3422  */
3423 
3424 void
3425 pmap_remove(struct pmap *pmap, vaddr_t sva, vaddr_t eva)
3426 {
3427 	pt_entry_t *ptes, xpte = 0;
3428 	pd_entry_t pde;
3429 	pd_entry_t * const *pdes;
3430 	struct pv_entry *pv_tofree = NULL;
3431 	bool result;
3432 	paddr_t ptppa;
3433 	vaddr_t blkendva, va = sva;
3434 	struct vm_page *ptp;
3435 	struct pmap *pmap2;
3436 
3437 	kpreempt_disable();
3438 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);	/* locks pmap */
3439 
3440 	/*
3441 	 * removing one page?  take shortcut function.
3442 	 */
3443 
3444 	if (va + PAGE_SIZE == eva) {
3445 		if (pmap_pdes_valid(va, pdes, &pde)) {
3446 
3447 			/* PA of the PTP */
3448 			ptppa = pmap_pte2pa(pde);
3449 
3450 			/* get PTP if non-kernel mapping */
3451 			if (pmap == pmap_kernel()) {
3452 				/* we never free kernel PTPs */
3453 				ptp = NULL;
3454 			} else {
3455 				ptp = pmap_find_ptp(pmap, va, ptppa, 1);
3456 #ifdef DIAGNOSTIC
3457 				if (ptp == NULL)
3458 					panic("pmap_remove: unmanaged "
3459 					      "PTP detected");
3460 #endif
3461 			}
3462 
3463 			/* do it! */
3464 			result = pmap_remove_pte(pmap, ptp,
3465 			    &ptes[pl1_i(va)], va, &pv_tofree);
3466 
3467 			/*
3468 			 * if mapping removed and the PTP is no longer
3469 			 * being used, free it!
3470 			 */
3471 
3472 			if (result && ptp && ptp->wire_count <= 1)
3473 				pmap_free_ptp(pmap, ptp, va, ptes, pdes);
3474 		}
3475 	} else for (/* null */ ; va < eva ; va = blkendva) {
3476 		int lvl;
3477 
3478 		/* determine range of block */
3479 		blkendva = x86_round_pdr(va+1);
3480 		if (blkendva > eva)
3481 			blkendva = eva;
3482 
3483 		/*
3484 		 * XXXCDC: our PTE mappings should never be removed
3485 		 * with pmap_remove!  if we allow this (and why would
3486 		 * we?) then we end up freeing the pmap's page
3487 		 * directory page (PDP) before we are finished using
3488 		 * it when we hit it in the recursive mapping.  this
3489 		 * is BAD.
3490 		 *
3491 		 * long term solution is to move the PTEs out of user
3492 		 * address space.  and into kernel address space (up
3493 		 * with APTE).  then we can set VM_MAXUSER_ADDRESS to
3494 		 * be VM_MAX_ADDRESS.
3495 		 */
3496 
3497 		if (pl_i(va, PTP_LEVELS) == PDIR_SLOT_PTE)
3498 			/* XXXCDC: ugly hack to avoid freeing PDP here */
3499 			continue;
3500 
3501 		lvl = pmap_pdes_invalid(va, pdes, &pde);
3502 		if (lvl != 0) {
3503 			/*
3504 			 * skip a range corresponding to an invalid pde.
3505 			 */
3506 			blkendva = (va & ptp_masks[lvl - 1]) + nbpd[lvl - 1];
3507 			continue;
3508 		}
3509 
3510 		/* PA of the PTP */
3511 		ptppa = pmap_pte2pa(pde);
3512 
3513 		/* get PTP if non-kernel mapping */
3514 		if (pmap == pmap_kernel()) {
3515 			/* we never free kernel PTPs */
3516 			ptp = NULL;
3517 		} else {
3518 			ptp = pmap_find_ptp(pmap, va, ptppa, 1);
3519 #ifdef DIAGNOSTIC
3520 			if (ptp == NULL)
3521 				panic("pmap_remove: unmanaged PTP "
3522 				      "detected");
3523 #endif
3524 		}
3525 		xpte |= pmap_remove_ptes(pmap, ptp,
3526 		    (vaddr_t)&ptes[pl1_i(va)], va, blkendva, &pv_tofree);
3527 
3528 		/* if PTP is no longer being used, free it! */
3529 		if (ptp && ptp->wire_count <= 1) {
3530 			pmap_free_ptp(pmap, ptp, va, ptes, pdes);
3531 		}
3532 		if ((xpte & PG_U) != 0)
3533 			pmap_tlb_shootdown(pmap, sva, eva, xpte);
3534 	}
3535 	pmap_unmap_ptes(pmap, pmap2);		/* unlock pmap */
3536 	kpreempt_enable();
3537 
3538 	/* Now we free unused PVs */
3539 	if (pv_tofree)
3540 		pmap_free_pvs(pv_tofree);
3541 }
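
/*
 * Caller sketch: like the other removal paths here, pmap_remove()
 * only queues TLB shootdowns; callers normally follow up with
 * pmap_update() to push them out, e.g. (illustrative):
 *
 *	pmap_remove(pmap_kernel(), va, va + len);
 *	pmap_update(pmap_kernel());
 */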
3542 
3543 /*
3544  * pmap_sync_pv: clear pte bits and return the old value of the pte.
3545  *
3546  * => called with pp_lock held. (thus preemption disabled)
3547  * => issues tlb shootdowns if necessary.
3548  */
3549 
3550 static int
3551 pmap_sync_pv(struct pv_pte *pvpte, pt_entry_t expect, int clearbits,
3552     pt_entry_t *optep)
3553 {
3554 	struct pmap *pmap;
3555 	struct vm_page *ptp;
3556 	vaddr_t va;
3557 	pt_entry_t *ptep;
3558 	pt_entry_t opte;
3559 	pt_entry_t npte;
3560 	bool need_shootdown;
3561 
3562 	ptp = pvpte->pte_ptp;
3563 	va = pvpte->pte_va;
3564 	KASSERT(ptp == NULL || ptp->uobject != NULL);
3565 	KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset);
3566 	pmap = ptp_to_pmap(ptp);
3567 
3568 	KASSERT((expect & ~(PG_FRAME | PG_V)) == 0);
3569 	KASSERT((expect & PG_V) != 0);
3570 	KASSERT(clearbits == ~0 || (clearbits & ~(PG_M | PG_U | PG_RW)) == 0);
3571 	KASSERT(kpreempt_disabled());
3572 
3573 	ptep = pmap_map_pte(pmap, ptp, va);
3574 	do {
3575 		opte = *ptep;
3576 		KASSERT((opte & (PG_M | PG_U)) != PG_M);
3577 		KASSERT((opte & (PG_U | PG_V)) != PG_U);
3578 		KASSERT(opte == 0 || (opte & PG_V) != 0);
3579 		if ((opte & (PG_FRAME | PG_V)) != expect) {
3580 
3581 			/*
3582 			 * we lost a race with a V->P operation like
3583 			 * pmap_remove().  wait for the competitor to finish
3584 			 * reflecting the pte bits into pp_attrs.
3585 			 *
3586 			 * issue a redundant TLB shootdown so that
3587 			 * we can wait for its completion.
3588 			 */
3589 
3590 			pmap_unmap_pte();
3591 			if (clearbits != 0) {
3592 				pmap_tlb_shootdown(pmap, va, 0,
3593 				    (pmap == pmap_kernel() ? PG_G : 0));
3594 			}
3595 			return EAGAIN;
3596 		}
3597 
3598 		/*
3599 		 * check if there's anything to do on this pte.
3600 		 */
3601 
3602 		if ((opte & clearbits) == 0) {
3603 			need_shootdown = false;
3604 			break;
3605 		}
3606 
3607 		/*
3608 		 * we need a shootdown if the pte is cached. (PG_U)
3609 		 *
3610 		 * ...unless we are clearing only the PG_RW bit and
3611 		 * it isn't cached as RW. (PG_M)
3612 		 */
3613 
3614 		need_shootdown = (opte & PG_U) != 0 &&
3615 		    !(clearbits == PG_RW && (opte & PG_M) == 0);
3616 
3617 		npte = opte & ~clearbits;
3618 
3619 		/*
3620 		 * if we need a shootdown anyway, clear PG_U and PG_M.
3621 		 */
3622 
3623 		if (need_shootdown) {
3624 			npte &= ~(PG_U | PG_M);
3625 		}
3626 		KASSERT((npte & (PG_M | PG_U)) != PG_M);
3627 		KASSERT((npte & (PG_U | PG_V)) != PG_U);
3628 		KASSERT(npte == 0 || (opte & PG_V) != 0);
3629 	} while (pmap_pte_cas(ptep, opte, npte) != opte);
3630 
3631 	if (need_shootdown) {
3632 		pmap_tlb_shootdown(pmap, va, 0, opte);
3633 	}
3634 	pmap_unmap_pte();
3635 
3636 	*optep = opte;
3637 	return 0;
3638 }
3639 
3640 /*
3641  * pmap_page_remove: remove a managed vm_page from all pmaps that map it
3642  *
3643  * => R/M bits are sync'd back to attrs
3644  */
3645 
3646 void
3647 pmap_page_remove(struct vm_page *pg)
3648 {
3649 	struct pmap_page *pp;
3650 	struct pv_pte *pvpte;
3651 	struct pv_entry *killlist = NULL;
3652 	struct vm_page *ptp;
3653 	pt_entry_t expect;
3654 	lwp_t *l;
3655 	int count;
3656 
3657 	l = curlwp;
3658 	pp = VM_PAGE_TO_PP(pg);
3659 	expect = pmap_pa2pte(VM_PAGE_TO_PHYS(pg)) | PG_V;
3660 	count = SPINLOCK_BACKOFF_MIN;
3661 	kpreempt_disable();
3662 startover:
3663 	pp_lock(pp);
3664 	while ((pvpte = pv_pte_first(pp)) != NULL) {
3665 		struct pmap *pmap;
3666 		struct pv_entry *pve;
3667 		pt_entry_t opte;
3668 		vaddr_t va;
3669 		int error;
3670 
3671 		/*
3672 		 * add a reference to the pmap before clearing the pte.
3673 		 * otherwise the pmap can disappear behind us.
3674 		 */
3675 
3676 		ptp = pvpte->pte_ptp;
3677 		pmap = ptp_to_pmap(ptp);
3678 		if (ptp != NULL) {
3679 			pmap_reference(pmap);
3680 		}
3681 
3682 		error = pmap_sync_pv(pvpte, expect, ~0, &opte);
3683 		if (error == EAGAIN) {
3684 			int hold_count;
3685 			pp_unlock(pp);
3686 			KERNEL_UNLOCK_ALL(curlwp, &hold_count);
3687 			if (ptp != NULL) {
3688 				pmap_destroy(pmap);
3689 			}
3690 			SPINLOCK_BACKOFF(count);
3691 			KERNEL_LOCK(hold_count, curlwp);
3692 			goto startover;
3693 		}
3694 
3695 		pp->pp_attrs |= opte;
3696 		va = pvpte->pte_va;
3697 		pve = pmap_remove_pv(pp, ptp, va);
3698 		pp_unlock(pp);
3699 
3700 		/* update the PTP reference count.  free if last reference. */
3701 		if (ptp != NULL) {
3702 			struct pmap *pmap2;
3703 			pt_entry_t *ptes;
3704 			pd_entry_t * const *pdes;
3705 
3706 			KASSERT(pmap != pmap_kernel());
3707 
3708 			pmap_tlb_shootwait();
3709 			pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
3710 			pmap_stats_update_bypte(pmap, 0, opte);
3711 			ptp->wire_count--;
3712 			if (ptp->wire_count <= 1) {
3713 				pmap_free_ptp(pmap, ptp, va, ptes, pdes);
3714 			}
3715 			pmap_unmap_ptes(pmap, pmap2);
3716 			pmap_destroy(pmap);
3717 		} else {
3718 			KASSERT(pmap == pmap_kernel());
3719 			pmap_stats_update_bypte(pmap, 0, opte);
3720 		}
3721 
3722 		if (pve != NULL) {
3723 			pve->pve_next = killlist;	/* mark it for death */
3724 			killlist = pve;
3725 		}
3726 		pp_lock(pp);
3727 	}
3728 	pp_unlock(pp);
3729 	kpreempt_enable();
3730 
3731 	/* Now free unused pvs. */
3732 	pmap_free_pvs(killlist);
3733 }
3734 
3735 /*
3736  * p m a p   a t t r i b u t e  f u n c t i o n s
3737  * functions that test/change managed page's attributes
3738  * since a page can be mapped multiple times we must check each PTE that
3739  * maps it by going down the pv lists.
3740  */
3741 
3742 /*
3743  * pmap_test_attrs: test a page's attributes
3744  */
3745 
3746 bool
3747 pmap_test_attrs(struct vm_page *pg, unsigned testbits)
3748 {
3749 	struct pmap_page *pp;
3750 	struct pv_pte *pvpte;
3751 	pt_entry_t expect;
3752 	u_int result;
3753 
3754 	pp = VM_PAGE_TO_PP(pg);
3755 	if ((pp->pp_attrs & testbits) != 0) {
3756 		return true;
3757 	}
3758 	expect = pmap_pa2pte(VM_PAGE_TO_PHYS(pg)) | PG_V;
3759 	pp_lock(pp);
3760 	for (pvpte = pv_pte_first(pp); pvpte; pvpte = pv_pte_next(pp, pvpte)) {
3761 		pt_entry_t opte;
3762 		int error;
3763 
3764 		if ((pp->pp_attrs & testbits) != 0) {
3765 			break;
3766 		}
3767 		error = pmap_sync_pv(pvpte, expect, 0, &opte);
3768 		if (error == 0) {
3769 			pp->pp_attrs |= opte;
3770 		}
3771 	}
3772 	result = pp->pp_attrs & testbits;
3773 	pp_unlock(pp);
3774 
3775 	/*
3776 	 * note that we will exit the for loop early, with a non-NULL pvpte,
3777 	 * if we have found the bits we are testing for.
3778 	 */
3779 
3780 	return result != 0;
3781 }
3782 
3783 /*
3784  * pmap_clear_attrs: clear the specified attribute for a page.
3785  *
3786  * => we return true if we cleared one of the bits we were asked to
3787  */
3788 
3789 bool
3790 pmap_clear_attrs(struct vm_page *pg, unsigned clearbits)
3791 {
3792 	struct pmap_page *pp;
3793 	struct pv_pte *pvpte;
3794 	u_int result;
3795 	pt_entry_t expect;
3796 	int count;
3797 
3798 	pp = VM_PAGE_TO_PP(pg);
3799 	expect = pmap_pa2pte(VM_PAGE_TO_PHYS(pg)) | PG_V;
3800 	count = SPINLOCK_BACKOFF_MIN;
3801 	kpreempt_disable();
3802 startover:
3803 	pp_lock(pp);
3804 	for (pvpte = pv_pte_first(pp); pvpte; pvpte = pv_pte_next(pp, pvpte)) {
3805 		pt_entry_t opte;
3806 		int error;
3807 
3808 		error = pmap_sync_pv(pvpte, expect, clearbits, &opte);
3809 		if (error == EAGAIN) {
3810 			int hold_count;
3811 			pp_unlock(pp);
3812 			KERNEL_UNLOCK_ALL(curlwp, &hold_count);
3813 			SPINLOCK_BACKOFF(count);
3814 			KERNEL_LOCK(hold_count, curlwp);
3815 			goto startover;
3816 		}
3817 		pp->pp_attrs |= opte;
3818 	}
3819 	result = pp->pp_attrs & clearbits;
3820 	pp->pp_attrs &= ~clearbits;
3821 	pp_unlock(pp);
3822 	kpreempt_enable();
3823 
3824 	return result != 0;
3825 }
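
/*
 * Usage sketch (illustrative): clear the modified bit of a managed
 * page and learn whether it was set:
 *
 *	bool was_modified = pmap_clear_attrs(pg, PG_M);
 */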
3826 
3827 
3828 /*
3829  * p m a p   p r o t e c t i o n   f u n c t i o n s
3830  */
3831 
3832 /*
3833  * pmap_page_protect: change the protection of all recorded mappings
3834  *	of a managed page
3835  *
3836  * => NOTE: this is an inline function in pmap.h
3837  */
3838 
3839 /* see pmap.h */
3840 
3841 /*
3842  * pmap_protect: set the protection of the pages in a pmap
3843  *
3844  * => NOTE: this is an inline function in pmap.h
3845  */
3846 
3847 /* see pmap.h */
3848 
3849 /*
3850  * pmap_write_protect: write-protect pages in a pmap
3851  */
3852 
3853 void
3854 pmap_write_protect(struct pmap *pmap, vaddr_t sva, vaddr_t eva, vm_prot_t prot)
3855 {
3856 	pt_entry_t *ptes, *epte;
3857 	pt_entry_t *spte;
3858 	pd_entry_t * const *pdes;
3859 	vaddr_t blockend, va;
3860 	pt_entry_t opte;
3861 	struct pmap *pmap2;
3862 
3863 	KASSERT(curlwp->l_md.md_gc_pmap != pmap);
3864 
3865 	kpreempt_disable();
3866 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);	/* locks pmap */
3867 
3868 	/* should be ok, but just in case ... */
3869 	sva &= PG_FRAME;
3870 	eva &= PG_FRAME;
3871 
3872 	for (va = sva ; va < eva ; va = blockend) {
3873 
3874 		blockend = (va & L2_FRAME) + NBPD_L2;
3875 		if (blockend > eva)
3876 			blockend = eva;
3877 
3878 		/*
3879 		 * XXXCDC: our PTE mappings should never be write-protected!
3880 		 *
3881 		 * the long-term solution is to move the PTEs out of user
3882 		 * address space and into kernel address space (up with
3883 		 * APTE).  then we can set VM_MAXUSER_ADDRESS to be
3884 		 * VM_MAX_ADDRESS.
3885 		 */
3886 
3887 		/* XXXCDC: ugly hack to avoid freeing PDP here */
3888 		if (pl_i(va, PTP_LEVELS) == PDIR_SLOT_PTE)
3889 			continue;
3890 
3891 		/* empty block? */
3892 		if (!pmap_pdes_valid(va, pdes, NULL))
3893 			continue;
3894 
3895 #ifdef DIAGNOSTIC
3896 		if (va >= VM_MAXUSER_ADDRESS &&
3897 		    va < VM_MAX_ADDRESS)
3898 			panic("pmap_write_protect: PTE space");
3899 #endif
3900 
3901 		spte = &ptes[pl1_i(va)];
3902 		epte = &ptes[pl1_i(blockend)];
3903 
3904 		for (/* null */; spte < epte ; spte++) {
3905 			pt_entry_t npte;
3906 
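			/*
			 * atomically strip PG_RW from the PTE.  if the PTE
			 * changes under us (e.g. the CPU setting PG_U/PG_M
			 * on another processor), the CAS fails and we
			 * re-read and retry.
			 */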
3907 			do {
3908 				opte = *spte;
3909 				if ((~opte & (PG_RW | PG_V)) != 0) {
3910 					goto next;
3911 				}
3912 				npte = opte & ~PG_RW;
3913 			} while (pmap_pte_cas(spte, opte, npte) != opte);
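			/*
			 * only mappings that have actually been written to
			 * (PG_M set) get an explicit shootdown here.  the
			 * assumption is that a clean translation cached in a
			 * TLB forces the CPU to re-walk the PTE before it
			 * can set the dirty bit, at which point the cleared
			 * PG_RW is seen and the write faults.
			 */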
3914 			if ((opte & PG_M) != 0) {
3915 				vaddr_t tva;
3916 
3917 				tva = x86_ptob(spte - ptes);
3918 				pmap_tlb_shootdown(pmap, tva, 0, opte);
3919 			}
3920 next:;
3921 		}
3922 	}
3923 
3924 	pmap_unmap_ptes(pmap, pmap2);	/* unlocks pmap */
3925 	kpreempt_enable();
3926 }
3927 
3928 /*
3929  * end of protection functions
3930  */
3931 
3932 /*
3933  * pmap_unwire: clear the wired bit in the PTE
3934  *
3935  * => mapping should already be in map
3936  */
3937 
3938 void
3939 pmap_unwire(struct pmap *pmap, vaddr_t va)
3940 {
3941 	pt_entry_t *ptes;
3942 	pd_entry_t * const *pdes;
3943 	struct pmap *pmap2;
3944 
3945 	kpreempt_disable();
3946 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);	/* locks pmap */
3947 
3948 	if (pmap_pdes_valid(va, pdes, NULL)) {
3949 		pt_entry_t *ptep = &ptes[pl1_i(va)];
3950 		pt_entry_t opte = *ptep;
3951 
3952 #ifdef DIAGNOSTIC
3953 		if (!pmap_valid_entry(opte))
3954 			panic("pmap_unwire: invalid (unmapped) va 0x%lx", va);
3955 #endif
3956 		if ((opte & PG_W) != 0) {
3957 			pt_entry_t npte = opte & ~PG_W;
3958 
3959 			opte = pmap_pte_testset(ptep, npte);
3960 			pmap_stats_update_bypte(pmap, npte, opte);
3961 		}
3962 #ifdef DIAGNOSTIC
3963 		else {
3964 			printf("pmap_unwire: wiring for pmap %p va 0x%lx "
3965 			       "didn't change!\n", pmap, va);
3966 		}
3967 #endif
3968 	}
3969 #ifdef DIAGNOSTIC
3970 	else {
3971 		panic("pmap_unwire: invalid PDE");
3972 	}
3973 #endif
3974 	pmap_unmap_ptes(pmap, pmap2);		/* unlocks pmap */
3975 	kpreempt_enable();
3976 }
3977 
3978 /*
3979  * pmap_copy: copy mappings from one pmap to another
3980  *
3981  * => optional function
3982  * void pmap_copy(dst_pmap, src_pmap, dst_addr, len, src_addr)
3983  */
3984 
3985 /*
3986  * defined as macro in pmap.h
3987  */
3988 
3989 /*
3990  * pmap_enter: enter a mapping into a pmap
3991  *
3992  * => must be done "now" ... no lazy-evaluation
3993  * => we set pmap => pv_head locking
3994  */
3995 #ifdef XEN
3996 int
3997 pmap_enter_ma(struct pmap *pmap, vaddr_t va, paddr_t ma, paddr_t pa,
3998 	   vm_prot_t prot, u_int flags, int domid)
3999 {
4000 #else /* XEN */
4001 int
4002 pmap_enter(struct pmap *pmap, vaddr_t va, paddr_t pa, vm_prot_t prot,
4003 	   u_int flags)
4004 {
4005 	paddr_t ma = pa;
4006 #endif /* XEN */
4007 	pt_entry_t *ptes, opte, npte;
4008 	pt_entry_t *ptep;
4009 	pd_entry_t * const *pdes;
4010 	struct vm_page *ptp, *pg;
4011 	struct pmap_page *new_pp;
4012 	struct pmap_page *old_pp;
4013 	struct pv_entry *old_pve = NULL;
4014 	struct pv_entry *new_pve;
4015 	struct pv_entry *new_pve2;
4016 	int error;
4017 	bool wired = (flags & PMAP_WIRED) != 0;
4018 	struct pmap *pmap2;
4019 
4020 	KASSERT(pmap_initialized);
4021 	KASSERT(curlwp->l_md.md_gc_pmap != pmap);
4022 
4023 #ifdef DIAGNOSTIC
4024 	/* sanity check: totally out of range? */
4025 	if (va >= VM_MAX_KERNEL_ADDRESS)
4026 		panic("pmap_enter: too big");
4027 
4028 	if (va == (vaddr_t) PDP_BASE || va == (vaddr_t) APDP_BASE)
4029 		panic("pmap_enter: trying to map over PDP/APDP!");
4030 
4031 	/* sanity check: kernel PTPs should already have been pre-allocated */
4032 	if (va >= VM_MIN_KERNEL_ADDRESS &&
4033 	    !pmap_valid_entry(pmap->pm_pdir[pl_i(va, PTP_LEVELS)]))
4034 		panic("pmap_enter: missing kernel PTP for va %lx!", va);
4035 #endif /* DIAGNOSTIC */
4036 #ifdef XEN
4037 	KASSERT(domid == DOMID_SELF || pa == 0);
4038 #endif /* XEN */
4039 
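	/*
	 * build the new PTE up front, before taking any locks: protection
	 * bits from "prot", wiring/cacheability from "flags", and PG_U/PG_M
	 * pre-set according to the access type carried in "flags", so the
	 * access that triggered this pmap_enter() is recorded immediately.
	 */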
4040 	npte = ma | protection_codes[prot] | PG_V;
4041 	if (wired)
4042 	        npte |= PG_W;
4043 	if (flags & PMAP_NOCACHE)
4044 		npte |= PG_N;
4045 	if (va < VM_MAXUSER_ADDRESS)
4046 		npte |= PG_u;
4047 	else if (va < VM_MAX_ADDRESS)
4048 		npte |= (PG_u | PG_RW);	/* XXXCDC: no longer needed? */
4049 	else
4050 		npte |= PG_k;
4051 	if (pmap == pmap_kernel())
4052 		npte |= pmap_pg_g;
4053 	if (flags & VM_PROT_ALL) {
4054 		npte |= PG_U;
4055 		if (flags & VM_PROT_WRITE) {
4056 			KASSERT((npte & PG_RW) != 0);
4057 			npte |= PG_M;
4058 		}
4059 	}
4060 
4061 #ifdef XEN
4062 	if (domid != DOMID_SELF)
4063 		pg = NULL;
4064 	else
4065 #endif
4066 		pg = PHYS_TO_VM_PAGE(pa);
4067 	if (pg != NULL) {
4068 		/* This is a managed page */
4069 		npte |= PG_PVLIST;
4070 		new_pp = VM_PAGE_TO_PP(pg);
4071 	} else {
4072 		new_pp = NULL;
4073 	}
4074 
4075 	/* get pves. */
4076 	new_pve = pool_cache_get(&pmap_pv_cache, PR_NOWAIT);
4077 	new_pve2 = pool_cache_get(&pmap_pv_cache, PR_NOWAIT);
4078 	if (new_pve == NULL || new_pve2 == NULL) {
4079 		if (flags & PMAP_CANFAIL) {
4080 			error = ENOMEM;
4081 			goto out2;
4082 		}
4083 		panic("pmap_enter: pve allocation failed");
4084 	}
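
	/*
	 * both pv entries are allocated above, before preemption is
	 * disabled and before the pmap's PTEs are mapped, so no allocation
	 * happens while locks are held; whichever entries end up unused
	 * are returned to the pool at "out2" below.
	 */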
4085 
4086 	kpreempt_disable();
4087 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);	/* locks pmap */
4088 	if (pmap == pmap_kernel()) {
4089 		ptp = NULL;
4090 	} else {
4091 		ptp = pmap_get_ptp(pmap, va, pdes);
4092 		if (ptp == NULL) {
4093 			pmap_unmap_ptes(pmap, pmap2);
4094 			if (flags & PMAP_CANFAIL) {
4095 				error = ENOMEM;
4096 				goto out;
4097 			}
4098 			panic("pmap_enter: get ptp failed");
4099 		}
4100 	}
4101 
4102 	/*
4103 	 * update the pte.
4104 	 */
4105 
4106 	ptep = &ptes[pl1_i(va)];
4107 	do {
4108 		opte = *ptep;
4109 
4110 		/*
4111 		 * if the same page, inherit PG_U and PG_M.
4112 		 */
4113 		if (((opte ^ npte) & (PG_FRAME | PG_V)) == 0) {
4114 			npte |= opte & (PG_U | PG_M);
4115 		}
4116 #if defined(XEN)
4117 		if (domid != DOMID_SELF) {
4118 			/* pmap_pte_cas with error handling */
4119 			int s = splvm();
4120 			if (opte != *ptep) {
4121 				splx(s);
4122 				continue;
4123 			}
4124 			error = xpq_update_foreign(
4125 			    vtomach((vaddr_t)ptep), npte, domid);
4126 			splx(s);
4127 			if (error) {
4128 				if (ptp != NULL && ptp->wire_count <= 1) {
4129 					pmap_free_ptp(pmap, ptp, va, ptes, pdes);
4130 				}
4131 				pmap_unmap_ptes(pmap, pmap2);
4132 				goto out;
4133 			}
4134 			break;
4135 		}
4136 #endif /* defined(XEN) */
4137 	} while (pmap_pte_cas(ptep, opte, npte) != opte);
4138 
4139 	/*
4140 	 * update statistics and PTP's reference count.
4141 	 */
4142 
4143 	pmap_stats_update_bypte(pmap, npte, opte);
4144 	if (ptp != NULL && !pmap_valid_entry(opte)) {
4145 		ptp->wire_count++;
4146 	}
4147 	KASSERT(ptp == NULL || ptp->wire_count > 1);
4148 
4149 	/*
4150 	 * if the same page, we can skip pv_entry handling.
4151 	 */
4152 
4153 	if (((opte ^ npte) & (PG_FRAME | PG_V)) == 0) {
4154 		KASSERT(((opte ^ npte) & PG_PVLIST) == 0);
4155 		goto same_pa;
4156 	}
4157 
4158 	/*
4159 	 * if old page is managed, remove pv_entry from its list.
4160 	 */
4161 
4162 	if ((~opte & (PG_V | PG_PVLIST)) == 0) {
4163 		pg = PHYS_TO_VM_PAGE(pmap_pte2pa(opte));
4164 #ifdef DIAGNOSTIC
4165 		if (pg == NULL)
4166 			panic("pmap_enter: PG_PVLIST mapping with "
4167 			      "unmanaged page "
4168 			      "pa = 0x%" PRIx64 " (0x%" PRIx64 ")",
4169 			      (int64_t)pa, (int64_t)atop(pa));
4170 #endif
4171 		old_pp = VM_PAGE_TO_PP(pg);
4172 
4173 		pp_lock(old_pp);
4174 		old_pve = pmap_remove_pv(old_pp, ptp, va);
4175 		old_pp->pp_attrs |= opte;
4176 		pp_unlock(old_pp);
4177 	}
4178 
4179 	/*
4180 	 * if new page is managed, insert pv_entry into its list.
4181 	 */
4182 
4183 	if (new_pp) {
4184 		pp_lock(new_pp);
4185 		new_pve = pmap_enter_pv(new_pp, new_pve, &new_pve2, ptp, va);
4186 		pp_unlock(new_pp);
4187 	}
4188 
4189 same_pa:
4190 	pmap_unmap_ptes(pmap, pmap2);
4191 
4192 	/*
4193 	 * shootdown tlb if necessary.
4194 	 */
4195 
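	/*
	 * a shootdown is needed only if the old mapping was both valid and
	 * referenced (PG_V and PG_U set; an unreferenced PTE cannot have
	 * been loaded into any TLB) and either the physical frame or the
	 * write permission changed.
	 */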
4196 	if ((~opte & (PG_V | PG_U)) == 0 &&
4197 	    ((opte ^ npte) & (PG_FRAME | PG_RW)) != 0) {
4198 		pmap_tlb_shootdown(pmap, va, 0, opte);
4199 	}
4200 
4201 	error = 0;
4202 out:
4203 	kpreempt_enable();
4204 out2:
4205 	if (old_pve != NULL) {
4206 		pool_cache_put(&pmap_pv_cache, old_pve);
4207 	}
4208 	if (new_pve != NULL) {
4209 		pool_cache_put(&pmap_pv_cache, new_pve);
4210 	}
4211 	if (new_pve2 != NULL) {
4212 		pool_cache_put(&pmap_pv_cache, new_pve2);
4213 	}
4214 
4215 	return error;
4216 }
4217 
4218 #ifdef XEN
4219 int
4220 pmap_enter(struct pmap *pmap, vaddr_t va, paddr_t pa, vm_prot_t prot, u_int flags)
4221 {
4222         paddr_t ma;
4223 
4224 	if (__predict_false(pa < pmap_pa_start || pmap_pa_end <= pa)) {
4225 		ma = pa; /* XXX hack */
4226 	} else {
4227 		ma = xpmap_ptom(pa);
4228 	}
4229 
4230 	return pmap_enter_ma(pmap, va, ma, pa, prot, flags, DOMID_SELF);
4231 }
4232 #endif /* XEN */
4233 
4234 static bool
4235 pmap_get_physpage(vaddr_t va, int level, paddr_t *paddrp)
4236 {
4237 	struct vm_page *ptp;
4238 	struct pmap *kpm = pmap_kernel();
4239 
4240 	if (uvm.page_init_done == false) {
4241 		/*
4242 		 * we're growing the kernel pmap early (from
4243 		 * uvm_pageboot_alloc()).  this case must be
4244 		 * handled a little differently.
4245 		 */
4246 
4247 		if (uvm_page_physget(paddrp) == false)
4248 			panic("pmap_get_physpage: out of memory");
4249 		kpreempt_disable();
4250 		pmap_pte_set(early_zero_pte,
4251 		    pmap_pa2pte(*paddrp) | PG_V | PG_RW | PG_k);
4252 		pmap_pte_flush();
4253 		pmap_update_pg((vaddr_t)early_zerop);
4254 		memset(early_zerop, 0, PAGE_SIZE);
4255 #if defined(DIAGNOSTIC) || defined(XEN)
4256 		pmap_pte_set(early_zero_pte, 0);
4257 		pmap_pte_flush();
4258 #endif /* defined(DIAGNOSTIC) || defined(XEN) */
4259 		kpreempt_enable();
4260 	} else {
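		/*
		 * normal case: take a zeroed, wired page for this PTP from
		 * the kernel pmap's per-level uvm_object.
		 */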
4261 		/* XXX */
4262 		PMAP_SUBOBJ_LOCK(kpm, level - 1);
4263 		ptp = uvm_pagealloc(&kpm->pm_obj[level - 1],
4264 				    ptp_va2o(va, level), NULL,
4265 				    UVM_PGA_USERESERVE|UVM_PGA_ZERO);
4266 		PMAP_SUBOBJ_UNLOCK(kpm, level - 1);
4267 		if (ptp == NULL)
4268 			panic("pmap_get_physpage: out of memory");
4269 		ptp->flags &= ~PG_BUSY;
4270 		ptp->wire_count = 1;
4271 		*paddrp = VM_PAGE_TO_PHYS(ptp);
4272 	}
4273 	pmap_stats_update(kpm, 1, 0);
4274 	return true;
4275 }
4276 
4277 /*
4278  * Allocate the amount of specified ptps for a ptp level, and populate
4279  * all levels below accordingly, mapping virtual addresses starting at
4280  * kva.
4281  *
4282  * Used by pmap_growkernel.
4283  */
4284 static void
4285 pmap_alloc_level(pd_entry_t * const *pdes, vaddr_t kva, int lvl,
4286     long *needed_ptps)
4287 {
4288 	unsigned long i;
4289 	vaddr_t va;
4290 	paddr_t pa;
4291 	unsigned long index, endindex;
4292 	int level;
4293 	pd_entry_t *pdep;
4294 #ifdef XEN
4295 	int s = splvm(); /* protect xpq_* */
4296 #endif
4297 
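	/*
	 * walk from the requested level down to level 2.  at each level,
	 * allocate a backing page for every missing PTP in the range and
	 * enter it into the directory one level above it, bumping nkptp[]
	 * as we go.
	 */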
4298 	for (level = lvl; level > 1; level--) {
4299 		if (level == PTP_LEVELS)
4300 			pdep = pmap_kernel()->pm_pdir;
4301 		else
4302 			pdep = pdes[level - 2];
4303 		va = kva;
4304 		index = pl_i_roundup(kva, level);
4305 		endindex = index + needed_ptps[level - 1] - 1;
4306 
4307 
4308 		for (i = index; i <= endindex; i++) {
4309 			KASSERT(!pmap_valid_entry(pdep[i]));
4310 			pmap_get_physpage(va, level - 1, &pa);
4311 #ifdef XEN
4312 			xpq_queue_pte_update((level == PTP_LEVELS) ?
4313 			    xpmap_ptom(pmap_pdirpa(pmap_kernel(), i)) :
4314 			    xpmap_ptetomach(&pdep[i]),
4315 			    pmap_pa2pte(pa) | PG_k | PG_V | PG_RW);
4316 #ifdef PAE
4317 			if (level == PTP_LEVELS &&  i > L2_SLOT_KERN) {
4318 				/* update real kernel PD too */
4319 				xpq_queue_pte_update(
4320 				    xpmap_ptetomach(&pmap_kl2pd[l2tol2(i)]),
4321 				    pmap_pa2pte(pa) | PG_k | PG_V | PG_RW);
4322 			}
4323 #endif
4324 #else /* XEN */
4325 			pdep[i] = pa | PG_RW | PG_V;
4326 #endif /* XEN */
4327 			KASSERT(level != PTP_LEVELS || nkptp[level - 1] +
4328 			    pl_i(VM_MIN_KERNEL_ADDRESS, level) == i);
4329 			nkptp[level - 1]++;
4330 			va += nbpd[level - 1];
4331 		}
4332 		pmap_pte_flush();
4333 	}
4334 #ifdef XEN
4335 	splx(s);
4336 #endif
4337 }
4338 
4339 /*
4340  * pmap_growkernel: increase usage of KVM space
4341  *
4342  * => we allocate new PTPs for the kernel and install them in all
4343  *	the pmaps on the system.
4344  */
4345 
4346 vaddr_t
4347 pmap_growkernel(vaddr_t maxkvaddr)
4348 {
4349 	struct pmap *kpm = pmap_kernel();
4350 #if !defined(XEN) || !defined(__x86_64__)
4351 	struct pmap *pm;
4352 #endif
4353 	int s, i;
4354 	long needed_kptp[PTP_LEVELS], target_nptp, old;
4355 	bool invalidate = false;
4356 
4357 	s = splvm();	/* to be safe */
4358 	mutex_enter(&kpm->pm_lock);
4359 
4360 	if (maxkvaddr <= pmap_maxkvaddr) {
4361 		mutex_exit(&kpm->pm_lock);
4362 		splx(s);
4363 		return pmap_maxkvaddr;
4364 	}
4365 
4366 	maxkvaddr = x86_round_pdr(maxkvaddr);
4367 	old = nkptp[PTP_LEVELS - 1];
4368 	/*
4369 	 * This loop could be optimized more, but pmap_growkernel()
4370 	 * is called infrequently.
4371 	 */
4372 	for (i = PTP_LEVELS - 1; i >= 1; i--) {
4373 		target_nptp = pl_i_roundup(maxkvaddr, i + 1) -
4374 		    pl_i_roundup(VM_MIN_KERNEL_ADDRESS, i + 1);
4375 		/*
4376 		 * XXX only need to check toplevel.
4377 		 */
4378 		if (target_nptp > nkptpmax[i])
4379 			panic("out of KVA space");
4380 		KASSERT(target_nptp >= nkptp[i]);
4381 		needed_kptp[i] = target_nptp - nkptp[i];
4382 	}
4383 
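	/* allocate and map the new PTPs, starting at the old end of KVA. */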
4384 	pmap_alloc_level(normal_pdes, pmap_maxkvaddr, PTP_LEVELS, needed_kptp);
4385 
4386 	/*
4387 	 * If the number of top level entries changed, update all
4388 	 * pmaps.
4389 	 */
4390 	if (needed_kptp[PTP_LEVELS - 1] != 0) {
4391 #ifdef XEN
4392 #ifdef __x86_64__
4393 		/* nothing, kernel entries are never entered in user pmap */
4394 #else /* __x86_64__ */
4395 		mutex_enter(&pmaps_lock);
4396 		LIST_FOREACH(pm, &pmaps, pm_list) {
4397 			int pdkidx;
4398 			for (pdkidx =  PDIR_SLOT_KERN + old;
4399 			    pdkidx < PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1];
4400 			    pdkidx++) {
4401 				xpq_queue_pte_update(
4402 				    xpmap_ptom(pmap_pdirpa(pm, pdkidx)),
4403 				    kpm->pm_pdir[pdkidx]);
4404 			}
4405 			xpq_flush_queue();
4406 		}
4407 		mutex_exit(&pmaps_lock);
4408 #endif /* __x86_64__ */
4409 #else /* XEN */
4410 		unsigned newpdes;
4411 		newpdes = nkptp[PTP_LEVELS - 1] - old;
4412 		mutex_enter(&pmaps_lock);
4413 		LIST_FOREACH(pm, &pmaps, pm_list) {
4414 			memcpy(&pm->pm_pdir[PDIR_SLOT_KERN + old],
4415 			       &kpm->pm_pdir[PDIR_SLOT_KERN + old],
4416 			       newpdes * sizeof (pd_entry_t));
4417 		}
4418 		mutex_exit(&pmaps_lock);
4419 #endif
4420 		invalidate = true;
4421 	}
4422 	pmap_maxkvaddr = maxkvaddr;
4423 	mutex_exit(&kpm->pm_lock);
4424 	splx(s);
4425 
4426 	if (invalidate) {
4427 		/* Invalidate the PDP cache. */
4428 		pool_cache_invalidate(&pmap_pdp_cache);
4429 	}
4430 
4431 	return maxkvaddr;
4432 }
4433 
4434 #ifdef DEBUG
4435 void pmap_dump(struct pmap *, vaddr_t, vaddr_t);
4436 
4437 /*
4438  * pmap_dump: dump all the mappings from a pmap
4439  *
4440  * => caller should not be holding any pmap locks
4441  */
4442 
4443 void
4444 pmap_dump(struct pmap *pmap, vaddr_t sva, vaddr_t eva)
4445 {
4446 	pt_entry_t *ptes, *pte;
4447 	pd_entry_t * const *pdes;
4448 	struct pmap *pmap2;
4449 	vaddr_t blkendva;
4450 
4451 	/*
4452 	 * if end is out of range, truncate it.
4453 	 * if (end <= start), update end to the max user address.
4454 	 */
4455 
4456 	if (eva > VM_MAXUSER_ADDRESS || eva <= sva)
4457 		eva = VM_MAXUSER_ADDRESS;
4458 
4459 	/*
4460 	 * we lock in the pmap => pv_head direction
4461 	 */
4462 
4463 	kpreempt_disable();
4464 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);	/* locks pmap */
4465 
4466 	/*
4467 	 * dumping a range of pages: we dump in PTP-sized blocks (one L2 entry each)
4468 	 */
4469 
4470 	for (/* null */ ; sva < eva ; sva = blkendva) {
4471 
4472 		/* determine range of block */
4473 		blkendva = x86_round_pdr(sva+1);
4474 		if (blkendva > eva)
4475 			blkendva = eva;
4476 
4477 		/* valid block? */
4478 		if (!pmap_pdes_valid(sva, pdes, NULL))
4479 			continue;
4480 
4481 		pte = &ptes[pl1_i(sva)];
4482 		for (/* null */; sva < blkendva ; sva += PAGE_SIZE, pte++) {
4483 			if (!pmap_valid_entry(*pte))
4484 				continue;
4485 			printf("va %#lx -> pa %#lx (pte=%#lx)\n",
4486 			       sva, (unsigned long)*pte,
4487 			       (unsigned long)pmap_pte2pa(*pte));
4488 		}
4489 	}
4490 	pmap_unmap_ptes(pmap, pmap2);
4491 	kpreempt_enable();
4492 }
4493 #endif
4494 
4495 /*
4496  * pmap_tlb_shootdown: invalidate pages on all CPUs using pmap 'pm'
4497  *
4498  * => always invalidates locally before returning
4499  * => returns before remote CPUs have invalidated
4500  * => must be called with preemption disabled
4501  */
4502 
4503 void
4504 pmap_tlb_shootdown(struct pmap *pm, vaddr_t sva, vaddr_t eva, pt_entry_t pte)
4505 {
4506 #ifdef MULTIPROCESSOR
4507 	extern bool x86_mp_online;
4508 	struct cpu_info *ci;
4509 	struct pmap_mbox *mb, *selfmb;
4510 	CPU_INFO_ITERATOR cii;
4511 	uintptr_t head;
4512 	u_int count;
4513 	int s;
4514 #endif	/* MULTIPROCESSOR */
4515 	struct cpu_info *self;
4516 	bool kernel;
4517 
4518 	KASSERT(eva == 0 || eva >= sva);
4519 	KASSERT(kpreempt_disabled());
4520 
4521 	if (pte & PG_PS)
4522 		sva &= PG_LGFRAME;
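
	/*
	 * only the global bit of the PTE matters from here on: it tells
	 * the local and remote flush code whether global entries must be
	 * invalidated as well (tlbflushg() rather than a plain tlbflush()).
	 */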
4523 	pte &= PG_G;
4524 	self = curcpu();
4525 
4526 	if (sva == (vaddr_t)-1LL) {
4527 		kernel = true;
4528 	} else {
4529 		if (eva == 0)
4530 			eva = sva + PAGE_SIZE;
4531 		kernel = sva >= VM_MAXUSER_ADDRESS;
4532 		KASSERT(kernel == (eva > VM_MAXUSER_ADDRESS));
4533 	}
4534 
4535 	/*
4536 	 * if tearing down the pmap, do nothing.  we'll flush later
4537 	 * when we're ready to recycle/destroy it.
4538 	 */
4539 	if (__predict_false(curlwp->l_md.md_gc_pmap == pm)) {
4540 		return;
4541 	}
4542 
4543 	/*
4544 	 * If the range is larger than 32 pages, then invalidate
4545 	 * everything.
4546 	 */
4547 	if (sva != (vaddr_t)-1LL && eva - sva > (32 * PAGE_SIZE)) {
4548 		sva = (vaddr_t)-1LL;
4549 		eva = sva;
4550 	}
4551 
4552 #ifdef MULTIPROCESSOR
4553 	if (ncpu > 1 && x86_mp_online) {
4554 		selfmb = &self->ci_pmap_cpu->pc_mbox;
4555 
4556 		/*
4557 		 * If the CPUs have no notion of global pages then
4558 		 * reload of %cr3 is sufficient.
4559 		 */
4560 		if (pte != 0 && (cpu_feature & CPUID_PGE) == 0)
4561 			pte = 0;
4562 
4563 		if (pm == pmap_kernel()) {
4564 			/*
4565 			 * Mapped on all CPUs: use the broadcast mechanism.
4566 			 * Once we have the lock, increment the counter.
4567 			 */
4568 			s = splvm();
4569 			mb = &pmap_mbox;
4570 			count = SPINLOCK_BACKOFF_MIN;
4571 			do {
4572 				if ((head = mb->mb_head) != mb->mb_tail) {
4573 					splx(s);
4574 					while ((head = mb->mb_head) !=
4575 					    mb->mb_tail)
4576 						SPINLOCK_BACKOFF(count);
4577 					s = splvm();
4578 				}
4579 			} while (atomic_cas_ulong(
4580 			    (volatile u_long *)&mb->mb_head,
4581 			    head, head + ncpu - 1) != head);
4582 
4583 			/*
4584 			 * Once underway we must stay at IPL_VM until the
4585 			 * IPI is dispatched.  Otherwise interrupt handlers
4586 			 * on this CPU can deadlock against us.
4587 			 */
4588 			pmap_tlb_evcnt.ev_count++;
4589 			mb->mb_pointer = self;
4590 			mb->mb_addr1 = sva;
4591 			mb->mb_addr2 = eva;
4592 			mb->mb_global = pte;
4593 			x86_ipi(LAPIC_TLB_BCAST_VECTOR, LAPIC_DEST_ALLEXCL,
4594 			    LAPIC_DLMODE_FIXED);
4595 			self->ci_need_tlbwait = 1;
4596 			splx(s);
4597 		} else if ((pm->pm_cpus & ~self->ci_cpumask) != 0 ||
4598 		    (kernel && (pm->pm_kernel_cpus & ~self->ci_cpumask) != 0)) {
4599 			/*
4600 			 * We don't bother traversing the CPU list if only
4601 			 * used by this CPU.
4602 			 *
4603 			 * We can't do global flushes with the multicast
4604 			 * mechanism.
4605 			 */
4606 			KASSERT(pte == 0);
4607 
4608 			/*
4609 			 * Take ownership of the shootdown mailbox on each
4610 			 * CPU, fill the details and fire it off.
4611 			 */
4612 			s = splvm();
4613 			for (CPU_INFO_FOREACH(cii, ci)) {
4614 				if (ci == self ||
4615 				    !pmap_is_active(pm, ci, kernel) ||
4616 				    !(ci->ci_flags & CPUF_RUNNING))
4617 					continue;
4618 				selfmb->mb_head++;
4619 				mb = &ci->ci_pmap_cpu->pc_mbox;
4620 				count = SPINLOCK_BACKOFF_MIN;
4621 				while (atomic_cas_ulong(
4622 				    (u_long *)&mb->mb_pointer,
4623 				    0, (u_long)&selfmb->mb_tail) != 0) {
4624 				    	splx(s);
4625 					while (mb->mb_pointer != 0)
4626 						SPINLOCK_BACKOFF(count);
4627 					s = splvm();
4628 				}
4629 				mb->mb_addr1 = sva;
4630 				mb->mb_addr2 = eva;
4631 				mb->mb_global = pte;
4632 				if (x86_ipi(LAPIC_TLB_MCAST_VECTOR,
4633 				    ci->ci_cpuid, LAPIC_DLMODE_FIXED))
4634 					panic("pmap_tlb_shootdown: ipi failed");
4635 			}
4636 			self->ci_need_tlbwait = 1;
4637 			splx(s);
4638 		}
4639 	}
4640 #endif	/* MULTIPROCESSOR */
4641 
4642 	/* Update the current CPU before waiting for others. */
4643 	if (!pmap_is_active(pm, self, kernel))
4644 		return;
4645 
4646 	if (sva == (vaddr_t)-1LL) {
4647 		u_int gen = uvm_emap_gen_return();
4648 		if (pte != 0) {
4649 			tlbflushg();
4650 		} else {
4651 			tlbflush();
4652 		}
4653 		uvm_emap_update(gen);
4654 	} else {
4655 		do {
4656 			pmap_update_pg(sva);
4657 			sva += PAGE_SIZE;
4658 		} while (sva < eva);
4659 	}
4660 }
4661 
4662 /*
4663  * pmap_tlb_shootwait: wait for pending TLB shootdowns to complete
4664  *
4665  * => only waits for operations generated by the current CPU
4666  * => must be called with preemption disabled
4667  */
4668 
4669 void
4670 pmap_tlb_shootwait(void)
4671 {
4672 	struct cpu_info *self;
4673 	struct pmap_mbox *mb;
4674 
4675 	KASSERT(kpreempt_disabled());
4676 
4677 	/*
4678 	 * Anything to do?  XXX Really we want to avoid touching the cache
4679 	 * lines of the two mailboxes, but the processor may read ahead.
4680 	 */
4681 	self = curcpu();
4682 	if (!self->ci_need_tlbwait)
4683 		return;
4684 	self->ci_need_tlbwait = 0;
4685 
4686 	/* If we own the global mailbox, wait for it to drain. */
4687 	mb = &pmap_mbox;
4688 	while (mb->mb_pointer == self && mb->mb_head != mb->mb_tail)
4689 		x86_pause();
4690 
4691 	/* If we own other CPUs' mailboxes, wait for them to drain. */
4692 	mb = &self->ci_pmap_cpu->pc_mbox;
4693 	KASSERT(mb->mb_pointer != &mb->mb_tail);
4694 	while (mb->mb_head != mb->mb_tail)
4695 		x86_pause();
4696 }
4697 
4698 /*
4699  * pmap_update: process deferred invalidations
4700  */
4701 
4702 void
4703 pmap_update(struct pmap *pmap)
4704 {
4705 	struct vm_page *ptp, *empty_ptps;
4706 	struct pmap_page *pp;
4707 	lwp_t *l;
4708 
4709 	/*
4710 	 * if we have torn down this pmap, invalidate non-global TLB
4711 	 * entries on any processors using it.
4712 	 */
4713 	l = curlwp;
4714 	if (__predict_false(l->l_md.md_gc_pmap == pmap)) {
4715 		l->l_md.md_gc_pmap = NULL;
4716 		KPREEMPT_DISABLE(l);
4717 		pmap_tlb_shootdown(pmap, -1, -1, 0);
4718 		KPREEMPT_ENABLE(l);
4719 	}
4720 
4721 	/*
4722 	 * wait for tlb shootdowns to complete before returning control
4723 	 * to the caller.
4724 	 */
4725 	kpreempt_disable();
4726 	pmap_tlb_shootwait();
4727 	kpreempt_enable();
4728 
4729 	/*
4730 	 * now that shootdowns are complete, process deferred frees,
4731 	 * but not from interrupt context.
4732 	 */
4733 	if (l->l_md.md_gc_ptp != NULL) {
4734 		if (cpu_intr_p() || (l->l_pflag & LP_INTR) != 0) {
4735 			return;
4736 		}
4737 
4738 		empty_ptps = l->l_md.md_gc_ptp;
4739 		l->l_md.md_gc_ptp = NULL;
4740 
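		/*
		 * the deferred PTPs are chained through their pmap_page's
		 * pp_link field.  each PTP is expected to be empty (all
		 * zeroes) by now, which is why it can be handed back to
		 * UVM with PG_ZERO set.
		 */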
4741 		while ((ptp = empty_ptps) != NULL) {
4742 			ptp->flags |= PG_ZERO;
4743 			pp = VM_PAGE_TO_PP(ptp);
4744 			empty_ptps = pp->pp_link;
4745 			LIST_INIT(&pp->pp_head.pvh_list);
4746 			uvm_pagefree(ptp);
4747 		}
4748 	}
4749 }
4750 
4751 #if PTP_LEVELS > 4
4752 #error "Unsupported number of page table mappings"
4753 #endif
4754 
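/*
 * pmap_init_tmp_pgtbl: build a small standalone page table hierarchy in
 * the fixed low physical pages 4-7.  the kernel's top level entries are
 * copied in and the given page is identity mapped (VA == PA).  returns
 * the physical address of the top level, which is what would be loaded
 * into %cr3.
 *
 * => the intended consumer (e.g. a low-memory trampoline for MP startup
 *    or ACPI resume) is an assumption; the callers live outside this file.
 */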
4755 paddr_t
4756 pmap_init_tmp_pgtbl(paddr_t pg)
4757 {
4758 	static bool maps_loaded;
4759 	static const paddr_t x86_tmp_pml_paddr[] = {
4760 	    4 * PAGE_SIZE,
4761 	    5 * PAGE_SIZE,
4762 	    6 * PAGE_SIZE,
4763 	    7 * PAGE_SIZE
4764 	};
4765 	static vaddr_t x86_tmp_pml_vaddr[] = { 0, 0, 0, 0 };
4766 
4767 	pd_entry_t *tmp_pml, *kernel_pml;
4768 
4769 	int level;
4770 
4771 	if (!maps_loaded) {
4772 		for (level = 0; level < PTP_LEVELS; ++level) {
4773 			x86_tmp_pml_vaddr[level] =
4774 			    uvm_km_alloc(kernel_map, PAGE_SIZE, 0,
4775 			    UVM_KMF_VAONLY);
4776 
4777 			if (x86_tmp_pml_vaddr[level] == 0)
4778 				panic("mapping of real mode PML failed\n");
4779 			pmap_kenter_pa(x86_tmp_pml_vaddr[level],
4780 			    x86_tmp_pml_paddr[level],
4781 			    VM_PROT_READ | VM_PROT_WRITE, 0);
4782 			pmap_update(pmap_kernel());
4783 		}
4784 		maps_loaded = true;
4785 	}
4786 
4787 	/* Zero levels 1-3 */
4788 	for (level = 0; level < PTP_LEVELS - 1; ++level) {
4789 		tmp_pml = (void *)x86_tmp_pml_vaddr[level];
4790 		memset(tmp_pml, 0, PAGE_SIZE);
4791 	}
4792 
4793 	/* Copy PML4 */
4794 	kernel_pml = pmap_kernel()->pm_pdir;
4795 	tmp_pml = (void *)x86_tmp_pml_vaddr[PTP_LEVELS - 1];
4796 	memcpy(tmp_pml, kernel_pml, PAGE_SIZE);
4797 
4798 	for (level = PTP_LEVELS - 1; level > 0; --level) {
4799 		tmp_pml = (void *)x86_tmp_pml_vaddr[level];
4800 
4801 		tmp_pml[pl_i(pg, level + 1)] =
4802 		    (x86_tmp_pml_paddr[level - 1] & PG_FRAME) | PG_RW | PG_V;
4803 	}
4804 
4805 	tmp_pml = (void *)x86_tmp_pml_vaddr[0];
4806 	tmp_pml[pl_i(pg, 1)] = (pg & PG_FRAME) | PG_RW | PG_V;
4807 
4808 	return x86_tmp_pml_paddr[PTP_LEVELS - 1];
4809 }
4810