xref: /netbsd-src/sys/arch/x86/x86/pmap.c (revision 274254cdae52594c1aa480a736aef78313d15c9c)
1 /*	$NetBSD: pmap.c,v 1.82 2009/03/21 22:55:08 ad Exp $	*/
2 
3 /*
4  * Copyright (c) 2007 Manuel Bouyer.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  * 1. Redistributions of source code must retain the above copyright
10  *    notice, this list of conditions and the following disclaimer.
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in the
13  *    documentation and/or other materials provided with the distribution.
14  * 3. All advertising materials mentioning features or use of this software
15  *    must display the following acknowledgement:
16  *      This product includes software developed by Manuel Bouyer.
17  * 4. The name of the author may not be used to endorse or promote products
18  *    derived from this software without specific prior written permission.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
21  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
22  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
23  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
24  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
25  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
29  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30  *
31  */
32 
33 /*
34  * Copyright (c) 2006 Mathieu Ropert <mro@adviseo.fr>
35  *
36  * Permission to use, copy, modify, and distribute this software for any
37  * purpose with or without fee is hereby granted, provided that the above
38  * copyright notice and this permission notice appear in all copies.
39  *
40  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
41  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
42  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
43  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
44  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
45  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
46  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
47  */
48 
49 /*
50  *
51  * Copyright (c) 1997 Charles D. Cranor and Washington University.
52  * All rights reserved.
53  *
54  * Redistribution and use in source and binary forms, with or without
55  * modification, are permitted provided that the following conditions
56  * are met:
57  * 1. Redistributions of source code must retain the above copyright
58  *    notice, this list of conditions and the following disclaimer.
59  * 2. Redistributions in binary form must reproduce the above copyright
60  *    notice, this list of conditions and the following disclaimer in the
61  *    documentation and/or other materials provided with the distribution.
62  * 3. All advertising materials mentioning features or use of this software
63  *    must display the following acknowledgement:
64  *      This product includes software developed by Charles D. Cranor and
65  *      Washington University.
66  * 4. The name of the author may not be used to endorse or promote products
67  *    derived from this software without specific prior written permission.
68  *
69  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
70  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
71  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
72  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
73  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
74  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
75  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
76  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
77  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
78  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
79  */
80 
81 /*
82  * Copyright 2001 (c) Wasabi Systems, Inc.
83  * All rights reserved.
84  *
85  * Written by Frank van der Linden for Wasabi Systems, Inc.
86  *
87  * Redistribution and use in source and binary forms, with or without
88  * modification, are permitted provided that the following conditions
89  * are met:
90  * 1. Redistributions of source code must retain the above copyright
91  *    notice, this list of conditions and the following disclaimer.
92  * 2. Redistributions in binary form must reproduce the above copyright
93  *    notice, this list of conditions and the following disclaimer in the
94  *    documentation and/or other materials provided with the distribution.
95  * 3. All advertising materials mentioning features or use of this software
96  *    must display the following acknowledgement:
97  *      This product includes software developed for the NetBSD Project by
98  *      Wasabi Systems, Inc.
99  * 4. The name of Wasabi Systems, Inc. may not be used to endorse
100  *    or promote products derived from this software without specific prior
101  *    written permission.
102  *
103  * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
104  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
105  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
106  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL WASABI SYSTEMS, INC
107  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
108  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
109  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
110  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
111  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
112  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
113  * POSSIBILITY OF SUCH DAMAGE.
114  */
115 
116 /*
117  * This is the i386 pmap modified and generalized to support x86-64
118  * as well. The idea is to hide the upper N levels of the page tables
119  * inside pmap_get_ptp, pmap_free_ptp and pmap_growkernel. The rest
120  * is mostly untouched, except that it uses some more generalized
121  * macros and interfaces.
122  *
123  * This pmap has been tested on the i386 as well, and it can be easily
124  * adapted to PAE.
125  *
126  * fvdl@wasabisystems.com 18-Jun-2001
127  */
128 
129 /*
130  * pmap.c: i386 pmap module rewrite
131  * Chuck Cranor <chuck@ccrc.wustl.edu>
132  * 11-Aug-97
133  *
134  * history of this pmap module: in addition to my own input, i used
135  *    the following references for this rewrite of the i386 pmap:
136  *
137  * [1] the NetBSD i386 pmap.   this pmap appears to be based on the
138  *     BSD hp300 pmap done by Mike Hibler at University of Utah.
139  *     it was then ported to the i386 by William Jolitz of UUNET
140  *     Technologies, Inc.   Then Charles M. Hannum of the NetBSD
141  *     project fixed some bugs and provided some speed ups.
142  *
143  * [2] the FreeBSD i386 pmap.   this pmap seems to be the
144  *     Hibler/Jolitz pmap, as modified for FreeBSD by John S. Dyson
145  *     and David Greenman.
146  *
147  * [3] the Mach pmap.   this pmap, from CMU, seems to have migrated
148  *     between several processors.   the VAX version was done by
149  *     Avadis Tevanian, Jr., and Michael Wayne Young.    the i386
150  *     version was done by Lance Berc, Mike Kupfer, Bob Baron,
151  *     David Golub, and Richard Draves.    the alpha version was
152  *     done by Alessandro Forin (CMU/Mach) and Chris Demetriou
153  *     (NetBSD/alpha).
154  */
155 
156 #include <sys/cdefs.h>
157 __KERNEL_RCSID(0, "$NetBSD: pmap.c,v 1.82 2009/03/21 22:55:08 ad Exp $");
158 
159 #include "opt_user_ldt.h"
160 #include "opt_lockdebug.h"
161 #include "opt_multiprocessor.h"
162 #include "opt_xen.h"
163 #if !defined(__x86_64__)
164 #include "opt_kstack_dr0.h"
165 #endif /* !defined(__x86_64__) */
166 
167 #include <sys/param.h>
168 #include <sys/systm.h>
169 #include <sys/proc.h>
170 #include <sys/pool.h>
171 #include <sys/user.h>
172 #include <sys/kernel.h>
173 #include <sys/atomic.h>
174 #include <sys/cpu.h>
175 #include <sys/intr.h>
176 #include <sys/xcall.h>
177 
178 #include <uvm/uvm.h>
179 
180 #include <dev/isa/isareg.h>
181 
182 #include <machine/specialreg.h>
183 #include <machine/gdt.h>
184 #include <machine/isa_machdep.h>
185 #include <machine/cpuvar.h>
186 
187 #include <x86/pmap.h>
188 #include <x86/pmap_pv.h>
189 
190 #include <x86/i82489reg.h>
191 #include <x86/i82489var.h>
192 
193 #ifdef XEN
194 #include <xen/xen3-public/xen.h>
195 #include <xen/hypervisor.h>
196 #endif
197 
198 /* flag to be used for kernel mappings: PG_u on Xen/amd64, 0 otherwise */
199 #if defined(XEN) && defined(__x86_64__)
200 #define PG_k PG_u
201 #else
202 #define PG_k 0
203 #endif
204 
205 /*
206  * general info:
207  *
208  *  - for an explanation of how the i386 MMU hardware works see
209  *    the comments in <machine/pte.h>.
210  *
211  *  - for an explanation of the general memory structure used by
212  *    this pmap (including the recursive mapping), see the comments
213  *    in <machine/pmap.h>.
214  *
215  * this file contains the code for the "pmap module."   the module's
216  * job is to manage the hardware's virtual to physical address mappings.
217  * note that there are two levels of mapping in the VM system:
218  *
219  *  [1] the upper layer of the VM system uses vm_map's and vm_map_entry's
220  *      to map ranges of virtual address space to objects/files.  for
221  *      example, the vm_map may say: "map VA 0x1000 to 0x22000 read-only
222  *      to the file /bin/ls starting at offset zero."   note that
223  *      the upper layer mapping is not concerned with how individual
224  *      vm_pages are mapped.
225  *
226  *  [2] the lower layer of the VM system (the pmap) maintains the mappings
227  *      from virtual addresses.   it is concerned with which vm_page is
228  *      mapped where.   for example, when you run /bin/ls and start
229  *      at page 0x1000 the fault routine may lookup the correct page
230  *      of the /bin/ls file and then ask the pmap layer to establish
231  *      a mapping for it.
232  *
233  * note that information in the lower layer of the VM system can be
234  * thrown away since it can easily be reconstructed from the info
235  * in the upper layer.
236  *
237  * data structures we use include:
238  *
239  *  - struct pmap: describes the address space of one thread
240  *  - struct pv_entry: describes one <PMAP,VA> mapping of a PA
241  *  - struct pv_head: there is one pv_head per managed page of
242  *	physical memory.   the pv_head points to a list of pv_entry
243  *	structures which describe all the <PMAP,VA> pairs that this
244  *      page is mapped in.    this is critical for page based operations
245  *      such as pmap_page_protect() [change protection on _all_ mappings
246  *      of a page]
247  */
248 
249 /*
250  * memory allocation
251  *
252  *  - there are three data structures that we must dynamically allocate:
253  *
254  * [A] new process' page directory page (PDP)
255  *	- plan 1: done at pmap_create() we use
256  *	  uvm_km_alloc(kernel_map, PAGE_SIZE)  [fka kmem_alloc] to do this
257  *	  allocation.
258  *
259  * if we are low in free physical memory then we sleep in
260  * uvm_km_alloc -- in this case this is ok since we are creating
261  * a new pmap and should not be holding any locks.
262  *
263  * if the kernel is totally out of virtual space
264  * (i.e. uvm_km_alloc returns NULL), then we panic.
265  *
266  * XXX: the fork code currently has no way to return an "out of
267  * memory, try again" error code since uvm_fork [fka vm_fork]
268  * is a void function.
269  *
270  * [B] new page tables pages (PTP)
271  * 	- call uvm_pagealloc()
272  * 		=> success: zero page, add to pm_pdir
273  * 		=> failure: we are out of free vm_pages, let pmap_enter()
274  *		   tell UVM about it.
275  *
276  * note: for kernel PTPs, we start with NKPTP of them.   as we map
277  * kernel memory (at uvm_map time) we check to see if we've grown
278  * the kernel pmap.   if so, we call the optional function
279  * pmap_growkernel() to grow the kernel PTPs in advance.
280  *
281  * [C] pv_entry structures
282  */
283 
284 /*
285  * locking
286  *
287  * we have the following locks that we must contend with:
288  *
289  * mutexes:
290  *
291  * - pmap lock (per pmap, part of uvm_object)
292  *   this lock protects the fields in the pmap structure including
293  *   the non-kernel PDEs in the PDP, and the PTEs.  it also locks
294  *   in the alternate PTE space (since that is determined by the
295  *   entry in the PDP).
296  *
297  * - pvh_lock (per pv_head)
298  *   this lock protects the pv_entry list which is chained off the
299  *   pv_head structure for a specific managed PA.   it is locked
300  *   when traversing the list (e.g. adding/removing mappings,
301  *   syncing R/M bits, etc.)
302  *
303  * - pmaps_lock
304  *   this lock protects the list of active pmaps (headed by "pmaps").
305  *   we lock it when adding or removing pmaps from this list.
306  *
307  * tlb shootdown
308  *
309  * tlb shootdowns are hard interrupts that operate outside the spl
310  * framework: they don't need to be blocked provided that the pmap module
311  * gets the order of events correct.  the calls are made by talking directly
312  * to the lapic.  the stubs to handle the interrupts are quite short and do
313  * one of the following: invalidate a single page, a range of pages, all
314  * user tlb entries or the entire tlb.
315  *
316  * the cpus synchronize with each other using pmap_mbox structures which are
317  * aligned on 64-byte cache lines.  tlb shootdowns against the kernel pmap
318  * use a global mailbox and are generated using a broadcast ipi (broadcast
319  * to all but the sending cpu).  shootdowns against regular pmaps use
320  * per-cpu mailboxes and are multicast.  kernel and user shootdowns can
321  * execute simultaneously, as can shootdowns within different multithreaded
322  * processes.  TODO:
323  *
324  *   1. figure out which waitpoints can be deferered to pmap_update().
325  *   2. see if there is a cheap way to batch some updates.
326  */
327 
328 const vaddr_t ptp_masks[] = PTP_MASK_INITIALIZER;
329 const int ptp_shifts[] = PTP_SHIFT_INITIALIZER;
330 const long nkptpmax[] = NKPTPMAX_INITIALIZER;
331 const long nbpd[] = NBPD_INITIALIZER;
332 pd_entry_t * const normal_pdes[] = PDES_INITIALIZER;
333 pd_entry_t * const alternate_pdes[] = APDES_INITIALIZER;
334 
335 long nkptp[] = NKPTP_INITIALIZER;
336 
337 static kmutex_t pmaps_lock;
338 
339 static vaddr_t pmap_maxkvaddr;
340 
341 #define COUNT(x)	/* nothing */
342 
343 /*
344  * XXX kludge: dummy locking to make KASSERTs in uvm_page.c comfortable.
345  * actual locking is done by pm_lock.
346  */
347 #if defined(DIAGNOSTIC)
348 #define	PMAP_SUBOBJ_LOCK(pm, idx) \
349 	KASSERT(mutex_owned(&(pm)->pm_lock)); \
350 	if ((idx) != 0) \
351 		mutex_enter(&(pm)->pm_obj[(idx)].vmobjlock)
352 #define	PMAP_SUBOBJ_UNLOCK(pm, idx) \
353 	KASSERT(mutex_owned(&(pm)->pm_lock)); \
354 	if ((idx) != 0) \
355 		mutex_exit(&(pm)->pm_obj[(idx)].vmobjlock)
356 #else /* defined(DIAGNOSTIC) */
357 #define	PMAP_SUBOBJ_LOCK(pm, idx)	/* nothing */
358 #define	PMAP_SUBOBJ_UNLOCK(pm, idx)	/* nothing */
359 #endif /* defined(DIAGNOSTIC) */
360 
361 /*
362  * Misc. event counters.
363  */
364 struct evcnt pmap_iobmp_evcnt;
365 struct evcnt pmap_ldt_evcnt;
366 
367 /*
368  * Global TLB shootdown mailbox.
369  */
370 struct evcnt pmap_tlb_evcnt __aligned(64);
371 struct pmap_mbox pmap_mbox __aligned(64);
372 
373 /*
374  * Per-CPU data.  The pmap mailbox is cache intensive so gets its
375  * own line.  Note that the mailbox must be the first item.
376  */
377 struct pmap_cpu {
378 	/* TLB shootdown */
379 	struct pmap_mbox pc_mbox;
380 };
381 
382 union {
383 	struct pmap_cpu pc;
384 	uint8_t padding[64];
385 } pmap_cpu[MAXCPUS] __aligned(64);
386 
387 /*
388  * global data structures
389  */
390 
391 static struct pmap kernel_pmap_store;	/* the kernel's pmap (proc0) */
392 struct pmap *const kernel_pmap_ptr = &kernel_pmap_store;
393 
394 /*
395  * pmap_pg_g: if our processor supports PG_G in the PTE then we
396  * set pmap_pg_g to PG_G (otherwise it is zero).
397  */
398 
399 int pmap_pg_g = 0;
400 
401 /*
402  * pmap_largepages: if our processor supports PG_PS and we are
403  * using it, this is set to true.
404  */
405 
406 int pmap_largepages;
407 
408 /*
409  * i386 physical memory comes in a big contig chunk with a small
410  * hole toward the front of it...  the following two paddr_t's
411  * (shared with machdep.c) describe the physical address space
412  * of this machine.
413  */
414 paddr_t avail_start;	/* PA of first available physical page */
415 paddr_t avail_end;	/* PA of last available physical page */
416 
417 #ifdef XEN
418 #ifdef __x86_64__
419 /* Dummy PGD for user cr3, used between pmap_deacivate() and pmap_activate() */
420 static paddr_t xen_dummy_user_pgd;
421 /* Currently active user PGD (can't use rcr3()) */
422 static paddr_t xen_current_user_pgd = 0;
423 #endif /* __x86_64__ */
424 paddr_t pmap_pa_start; /* PA of first physical page for this domain */
425 paddr_t pmap_pa_end;   /* PA of last physical page for this domain */
426 #endif /* XEN */
427 
428 #define	VM_PAGE_TO_PP(pg)	(&(pg)->mdpage.mp_pp)
429 
430 #define	pp_lock(pp)	mutex_spin_enter(&(pp)->pp_lock)
431 #define	pp_unlock(pp)	mutex_spin_exit(&(pp)->pp_lock)
432 #define	pp_locked(pp)	mutex_owned(&(pp)->pp_lock)
433 
434 #define	PV_HASH_SIZE		32768
435 #define	PV_HASH_LOCK_CNT	32
436 
437 struct pv_hash_lock {
438 	kmutex_t lock;
439 } __aligned(CACHE_LINE_SIZE) pv_hash_locks[PV_HASH_LOCK_CNT]
440     __aligned(CACHE_LINE_SIZE);
441 
442 struct pv_hash_head {
443 	SLIST_HEAD(, pv_entry) hh_list;
444 } pv_hash_heads[PV_HASH_SIZE];
445 
446 static u_int
447 pvhash_hash(struct vm_page *ptp, vaddr_t va)
448 {
449 
450 	return (uintptr_t)ptp / sizeof(*ptp) + (va >> PAGE_SHIFT);
451 }
452 
453 static struct pv_hash_head *
454 pvhash_head(u_int hash)
455 {
456 
457 	return &pv_hash_heads[hash % PV_HASH_SIZE];
458 }
459 
460 static kmutex_t *
461 pvhash_lock(u_int hash)
462 {
463 
464 	return &pv_hash_locks[hash % PV_HASH_LOCK_CNT].lock;
465 }
466 
467 static struct pv_entry *
468 pvhash_remove(struct pv_hash_head *hh, struct vm_page *ptp, vaddr_t va)
469 {
470 	struct pv_entry *pve;
471 	struct pv_entry *prev;
472 
473 	prev = NULL;
474 	SLIST_FOREACH(pve, &hh->hh_list, pve_hash) {
475 		if (pve->pve_pte.pte_ptp == ptp &&
476 		    pve->pve_pte.pte_va == va) {
477 			if (prev != NULL) {
478 				SLIST_REMOVE_AFTER(prev, pve_hash);
479 			} else {
480 				SLIST_REMOVE_HEAD(&hh->hh_list, pve_hash);
481 			}
482 			break;
483 		}
484 		prev = pve;
485 	}
486 	return pve;
487 }
488 
489 /*
490  * other data structures
491  */
492 
493 static pt_entry_t protection_codes[8];	/* maps MI prot to i386 prot code */
494 static bool pmap_initialized = false;	/* pmap_init done yet? */
495 
496 /*
497  * the following two vaddr_t's are used during system startup
498  * to keep track of how much of the kernel's VM space we have used.
499  * once the system is started, the management of the remaining kernel
500  * VM space is turned over to the kernel_map vm_map.
501  */
502 
503 static vaddr_t virtual_avail;	/* VA of first free KVA */
504 static vaddr_t virtual_end;	/* VA of last free KVA */
505 
506 /*
507  * linked list of all non-kernel pmaps
508  */
509 
510 static struct pmap_head pmaps;
511 
512 /*
513  * pool that pmap structures are allocated from
514  */
515 
516 static struct pool_cache pmap_cache;
517 
518 /*
519  * pv_entry cache
520  */
521 
522 static struct pool_cache pmap_pv_cache;
523 
524 /*
525  * MULTIPROCESSOR: special VA's/ PTE's are actually allocated inside a
526  * maxcpus*NPTECL array of PTE's, to avoid cache line thrashing
527  * due to false sharing.
528  */
529 
530 #ifdef MULTIPROCESSOR
531 #define PTESLEW(pte, id) ((pte)+(id)*NPTECL)
532 #define VASLEW(va,id) ((va)+(id)*NPTECL*PAGE_SIZE)
533 #else
534 #define PTESLEW(pte, id) (pte)
535 #define VASLEW(va,id) (va)
536 #endif
537 
538 /*
539  * special VAs and the PTEs that map them
540  */
541 static pt_entry_t *csrc_pte, *cdst_pte, *zero_pte, *ptp_pte, *early_zero_pte;
542 static char *csrcp, *cdstp, *zerop, *ptpp, *early_zerop;
543 
544 /*
545  * pool and cache that PDPs are allocated from
546  */
547 
548 static struct pool_cache pmap_pdp_cache;
549 int	pmap_pdp_ctor(void *, void *, int);
550 void	pmap_pdp_dtor(void *, void *);
551 #ifdef PAE
552 /* need to allocate items of 4 pages */
553 void *pmap_pdp_alloc(struct pool *, int);
554 void pmap_pdp_free(struct pool *, void *);
555 static struct pool_allocator pmap_pdp_allocator = {
556 	.pa_alloc = pmap_pdp_alloc,
557 	.pa_free = pmap_pdp_free,
558 	.pa_pagesz = PAGE_SIZE * PDP_SIZE,
559 };
560 #endif /* PAE */
561 
562 void *vmmap; /* XXX: used by mem.c... it should really uvm_map_reserve it */
563 
564 extern vaddr_t idt_vaddr;			/* we allocate IDT early */
565 extern paddr_t idt_paddr;
566 
567 #ifdef _LP64
568 extern vaddr_t lo32_vaddr;
569 extern vaddr_t lo32_paddr;
570 #endif
571 
572 extern int end;
573 
574 #ifdef i386
575 /* stuff to fix the pentium f00f bug */
576 extern vaddr_t pentium_idt_vaddr;
577 #endif
578 
579 
580 /*
581  * local prototypes
582  */
583 
584 static struct vm_page	*pmap_get_ptp(struct pmap *, vaddr_t,
585 				      pd_entry_t * const *);
586 static struct vm_page	*pmap_find_ptp(struct pmap *, vaddr_t, paddr_t, int);
587 static void		 pmap_freepage(struct pmap *, struct vm_page *, int);
588 static void		 pmap_free_ptp(struct pmap *, struct vm_page *,
589 				       vaddr_t, pt_entry_t *,
590 				       pd_entry_t * const *);
591 static bool		 pmap_is_curpmap(struct pmap *);
592 static bool		 pmap_is_active(struct pmap *, struct cpu_info *, bool);
593 static void		 pmap_map_ptes(struct pmap *, struct pmap **,
594 				       pt_entry_t **, pd_entry_t * const **);
595 static void		 pmap_do_remove(struct pmap *, vaddr_t, vaddr_t, int);
596 static bool		 pmap_remove_pte(struct pmap *, struct vm_page *,
597 					 pt_entry_t *, vaddr_t, int,
598 					 struct pv_entry **);
599 static pt_entry_t	 pmap_remove_ptes(struct pmap *, struct vm_page *,
600 					  vaddr_t, vaddr_t, vaddr_t, int,
601 					  struct pv_entry **);
602 #define PMAP_REMOVE_ALL		0	/* remove all mappings */
603 #define PMAP_REMOVE_SKIPWIRED	1	/* skip wired mappings */
604 
605 static void		 pmap_unmap_ptes(struct pmap *, struct pmap *);
606 static bool		 pmap_get_physpage(vaddr_t, int, paddr_t *);
607 static int		 pmap_pdes_invalid(vaddr_t, pd_entry_t * const *,
608 					   pd_entry_t *);
609 #define	pmap_pdes_valid(va, pdes, lastpde)	\
610 	(pmap_pdes_invalid((va), (pdes), (lastpde)) == 0)
611 static void		 pmap_alloc_level(pd_entry_t * const *, vaddr_t, int,
612 					  long *);
613 
614 static bool		 pmap_reactivate(struct pmap *);
615 
616 /*
617  * p m a p   h e l p e r   f u n c t i o n s
618  */
619 
620 static inline void
621 pmap_stats_update(struct pmap *pmap, int resid_diff, int wired_diff)
622 {
623 
624 	if (pmap == pmap_kernel()) {
625 		atomic_add_long(&pmap->pm_stats.resident_count, resid_diff);
626 		atomic_add_long(&pmap->pm_stats.wired_count, wired_diff);
627 	} else {
628 		KASSERT(mutex_owned(&pmap->pm_lock));
629 		pmap->pm_stats.resident_count += resid_diff;
630 		pmap->pm_stats.wired_count += wired_diff;
631 	}
632 }
633 
634 static inline void
635 pmap_stats_update_bypte(struct pmap *pmap, pt_entry_t npte, pt_entry_t opte)
636 {
637 	int resid_diff = ((npte & PG_V) ? 1 : 0) - ((opte & PG_V) ? 1 : 0);
638 	int wired_diff = ((npte & PG_W) ? 1 : 0) - ((opte & PG_W) ? 1 : 0);
639 
640 	KASSERT((npte & (PG_V | PG_W)) != PG_W);
641 	KASSERT((opte & (PG_V | PG_W)) != PG_W);
642 
643 	pmap_stats_update(pmap, resid_diff, wired_diff);
644 }
645 
646 /*
647  * ptp_to_pmap: lookup pmap by ptp
648  */
649 
650 static struct pmap *
651 ptp_to_pmap(struct vm_page *ptp)
652 {
653 	struct pmap *pmap;
654 
655 	if (ptp == NULL) {
656 		return pmap_kernel();
657 	}
658 	pmap = (struct pmap *)ptp->uobject;
659 	KASSERT(pmap != NULL);
660 	KASSERT(&pmap->pm_obj[0] == ptp->uobject);
661 	return pmap;
662 }
663 
664 static inline struct pv_pte *
665 pve_to_pvpte(struct pv_entry *pve)
666 {
667 
668 	KASSERT((void *)&pve->pve_pte == (void *)pve);
669 	return &pve->pve_pte;
670 }
671 
672 static inline struct pv_entry *
673 pvpte_to_pve(struct pv_pte *pvpte)
674 {
675 	struct pv_entry *pve = (void *)pvpte;
676 
677 	KASSERT(pve_to_pvpte(pve) == pvpte);
678 	return pve;
679 }
680 
681 /*
682  * pv_pte_first, pv_pte_next: PV list iterator.
683  */
684 
685 static struct pv_pte *
686 pv_pte_first(struct pmap_page *pp)
687 {
688 
689 	KASSERT(pp_locked(pp));
690 	if ((pp->pp_flags & PP_EMBEDDED) != 0) {
691 		return &pp->pp_pte;
692 	}
693 	return pve_to_pvpte(LIST_FIRST(&pp->pp_head.pvh_list));
694 }
695 
696 static struct pv_pte *
697 pv_pte_next(struct pmap_page *pp, struct pv_pte *pvpte)
698 {
699 
700 	KASSERT(pvpte != NULL);
701 	KASSERT(pp_locked(pp));
702 	if (pvpte == &pp->pp_pte) {
703 		KASSERT((pp->pp_flags & PP_EMBEDDED) != 0);
704 		return NULL;
705 	}
706 	KASSERT((pp->pp_flags & PP_EMBEDDED) == 0);
707 	return pve_to_pvpte(LIST_NEXT(pvpte_to_pve(pvpte), pve_list));
708 }
709 
710 /*
711  * pmap_is_curpmap: is this pmap the one currently loaded [in %cr3]?
712  *		of course the kernel is always loaded
713  */
714 
715 inline static bool
716 pmap_is_curpmap(struct pmap *pmap)
717 {
718 #if defined(XEN) && defined(__x86_64__)
719 	/*
720 	 * Only kernel pmap is physically loaded.
721 	 * User PGD may be active, but TLB will be flushed
722 	 * with HYPERVISOR_iret anyway, so let's say no
723 	 */
724 	return(pmap == pmap_kernel());
725 #else /* XEN && __x86_64__*/
726 	return((pmap == pmap_kernel()) ||
727 	       (pmap == curcpu()->ci_pmap));
728 #endif
729 }
730 
731 /*
732  * pmap_is_active: is this pmap loaded into the specified processor's %cr3?
733  */
734 
735 inline static bool
736 pmap_is_active(struct pmap *pmap, struct cpu_info *ci, bool kernel)
737 {
738 
739 	return (pmap == pmap_kernel() ||
740 	    (pmap->pm_cpus & ci->ci_cpumask) != 0 ||
741 	    (kernel && (pmap->pm_kernel_cpus & ci->ci_cpumask) != 0));
742 }
743 
744 static void
745 pmap_apte_flush(struct pmap *pmap)
746 {
747 
748 	KASSERT(kpreempt_disabled());
749 
750 	/*
751 	 * Flush the APTE mapping from all other CPUs that
752 	 * are using the pmap we are using (who's APTE space
753 	 * is the one we've just modified).
754 	 *
755 	 * XXXthorpej -- find a way to defer the IPI.
756 	 */
757 	pmap_tlb_shootdown(pmap, (vaddr_t)-1LL, 0, 0);
758 	pmap_tlb_shootwait();
759 }
760 
761 /*
762  *	Add a reference to the specified pmap.
763  */
764 
765 inline void
766 pmap_reference(struct pmap *pmap)
767 {
768 
769 	atomic_inc_uint((unsigned *)&pmap->pm_obj[0].uo_refs);
770 }
771 
772 /*
773  * pmap_map_ptes: map a pmap's PTEs into KVM and lock them in
774  *
775  * => we lock enough pmaps to keep things locked in
776  * => must be undone with pmap_unmap_ptes before returning
777  */
778 
779 static void
780 pmap_map_ptes(struct pmap *pmap, struct pmap **pmap2,
781     pd_entry_t **ptepp, pd_entry_t * const **pdeppp)
782 {
783 	pd_entry_t opde, npde;
784 	struct pmap *ourpmap;
785 	struct cpu_info *ci;
786 	struct lwp *l;
787 	bool iscurrent;
788 	uint64_t ncsw;
789 #ifdef XEN
790 	int s;
791 #endif
792 
793 	/* the kernel's pmap is always accessible */
794 	if (pmap == pmap_kernel()) {
795 		*pmap2 = NULL;
796 		*ptepp = PTE_BASE;
797 		*pdeppp = normal_pdes;
798 		return;
799 	}
800 	KASSERT(kpreempt_disabled());
801 
802  retry:
803 	l = curlwp;
804 	ncsw = l->l_ncsw;
805  	ourpmap = NULL;
806 	ci = curcpu();
807 #if defined(XEN) && defined(__x86_64__)
808 	/*
809 	 * curmap can only be pmap_kernel so at this point
810 	 * pmap_is_curpmap is always false
811 	 */
812 	iscurrent = 0;
813 	ourpmap = pmap_kernel();
814 #else /* XEN && __x86_64__*/
815 	if (ci->ci_want_pmapload &&
816 	    vm_map_pmap(&l->l_proc->p_vmspace->vm_map) == pmap) {
817 		pmap_load();
818 		if (l->l_ncsw != ncsw)
819 			goto retry;
820 	}
821 	iscurrent = pmap_is_curpmap(pmap);
822 	/* if curpmap then we are always mapped */
823 	if (iscurrent) {
824 		mutex_enter(&pmap->pm_lock);
825 		*pmap2 = NULL;
826 		*ptepp = PTE_BASE;
827 		*pdeppp = normal_pdes;
828 		goto out;
829 	}
830 	ourpmap = ci->ci_pmap;
831 #endif /* XEN && __x86_64__ */
832 
833 	/* need to lock both curpmap and pmap: use ordered locking */
834 	pmap_reference(ourpmap);
835 	if ((uintptr_t) pmap < (uintptr_t) ourpmap) {
836 		mutex_enter(&pmap->pm_lock);
837 		mutex_enter(&ourpmap->pm_lock);
838 	} else {
839 		mutex_enter(&ourpmap->pm_lock);
840 		mutex_enter(&pmap->pm_lock);
841 	}
842 
843 	if (l->l_ncsw != ncsw)
844 		goto unlock_and_retry;
845 
846 	/* need to load a new alternate pt space into curpmap? */
847 	COUNT(apdp_pde_map);
848 	opde = *APDP_PDE;
849 #ifdef XEN
850 	if (!pmap_valid_entry(opde) ||
851 	    pmap_pte2pa(opde) != pmap_pdirpa(pmap, 0)) {
852 		int i;
853 		s = splvm();
854 		/* Make recursive entry usable in user PGD */
855 		for (i = 0; i < PDP_SIZE; i++) {
856 			npde = pmap_pa2pte(
857 			    pmap_pdirpa(pmap, i * NPDPG)) | PG_k | PG_V;
858 			xpq_queue_pte_update(
859 			    xpmap_ptom(pmap_pdirpa(pmap, PDIR_SLOT_PTE + i)),
860 			    npde);
861 			xpq_queue_pte_update(xpmap_ptetomach(&APDP_PDE[i]),
862 			    npde);
863 #ifdef PAE
864 			/* update shadow entry too */
865 			xpq_queue_pte_update(
866 			    xpmap_ptetomach(&APDP_PDE_SHADOW[i]), npde);
867 #endif /* PAE */
868 			xpq_queue_invlpg(
869 			    (vaddr_t)&pmap->pm_pdir[PDIR_SLOT_PTE + i]);
870 		}
871 		xpq_flush_queue();
872 		if (pmap_valid_entry(opde))
873 			pmap_apte_flush(ourpmap);
874 		splx(s);
875 	}
876 #else /* XEN */
877 	npde = pmap_pa2pte(pmap_pdirpa(pmap, 0)) | PG_RW | PG_V;
878 	if (!pmap_valid_entry(opde) ||
879 	    pmap_pte2pa(opde) != pmap_pdirpa(pmap, 0)) {
880 		pmap_pte_set(APDP_PDE, npde);
881 		pmap_pte_flush();
882 		if (pmap_valid_entry(opde))
883 			pmap_apte_flush(ourpmap);
884 	}
885 #endif /* XEN */
886 	*pmap2 = ourpmap;
887 	*ptepp = APTE_BASE;
888 	*pdeppp = alternate_pdes;
889 	KASSERT(l->l_ncsw == ncsw);
890 #if !defined(XEN) || !defined(__x86_64__)
891  out:
892 #endif
893  	/*
894  	 * might have blocked, need to retry?
895  	 */
896 	if (l->l_ncsw != ncsw) {
897  unlock_and_retry:
898 	    	if (ourpmap != NULL) {
899 			mutex_exit(&ourpmap->pm_lock);
900 			pmap_destroy(ourpmap);
901 		}
902 		mutex_exit(&pmap->pm_lock);
903 		goto retry;
904 	}
905 
906 	return;
907 }
908 
909 /*
910  * pmap_unmap_ptes: unlock the PTE mapping of "pmap"
911  */
912 
913 static void
914 pmap_unmap_ptes(struct pmap *pmap, struct pmap *pmap2)
915 {
916 
917 	if (pmap == pmap_kernel()) {
918 		return;
919 	}
920 	KASSERT(kpreempt_disabled());
921 	if (pmap2 == NULL) {
922 		mutex_exit(&pmap->pm_lock);
923 	} else {
924 #if defined(XEN) && defined(__x86_64__)
925 		KASSERT(pmap2 == pmap_kernel());
926 #else
927 		KASSERT(curcpu()->ci_pmap == pmap2);
928 #endif
929 #if defined(MULTIPROCESSOR)
930 		pmap_pte_set(APDP_PDE, 0);
931 		pmap_pte_flush();
932 		pmap_apte_flush(pmap2);
933 #endif
934 		COUNT(apdp_pde_unmap);
935 		mutex_exit(&pmap->pm_lock);
936 		mutex_exit(&pmap2->pm_lock);
937 		pmap_destroy(pmap2);
938 	}
939 }
940 
941 inline static void
942 pmap_exec_account(struct pmap *pm, vaddr_t va, pt_entry_t opte, pt_entry_t npte)
943 {
944 
945 #if !defined(__x86_64__)
946 	if (curproc == NULL || curproc->p_vmspace == NULL ||
947 	    pm != vm_map_pmap(&curproc->p_vmspace->vm_map))
948 		return;
949 
950 	if ((opte ^ npte) & PG_X)
951 		pmap_update_pg(va);
952 
953 	/*
954 	 * Executability was removed on the last executable change.
955 	 * Reset the code segment to something conservative and
956 	 * let the trap handler deal with setting the right limit.
957 	 * We can't do that because of locking constraints on the vm map.
958 	 */
959 
960 	if ((opte & PG_X) && (npte & PG_X) == 0 && va == pm->pm_hiexec) {
961 		struct trapframe *tf = curlwp->l_md.md_regs;
962 
963 		tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL);
964 		pm->pm_hiexec = I386_MAX_EXE_ADDR;
965 	}
966 #endif /* !defined(__x86_64__) */
967 }
968 
969 #if !defined(__x86_64__)
970 /*
971  * Fixup the code segment to cover all potential executable mappings.
972  * returns 0 if no changes to the code segment were made.
973  */
974 
975 int
976 pmap_exec_fixup(struct vm_map *map, struct trapframe *tf, struct pcb *pcb)
977 {
978 	struct vm_map_entry *ent;
979 	struct pmap *pm = vm_map_pmap(map);
980 	vaddr_t va = 0;
981 
982 	vm_map_lock_read(map);
983 	for (ent = (&map->header)->next; ent != &map->header; ent = ent->next) {
984 
985 		/*
986 		 * This entry has greater va than the entries before.
987 		 * We need to make it point to the last page, not past it.
988 		 */
989 
990 		if (ent->protection & VM_PROT_EXECUTE)
991 			va = trunc_page(ent->end) - PAGE_SIZE;
992 	}
993 	vm_map_unlock_read(map);
994 	if (va == pm->pm_hiexec && tf->tf_cs == GSEL(GUCODEBIG_SEL, SEL_UPL))
995 		return (0);
996 
997 	pm->pm_hiexec = va;
998 	if (pm->pm_hiexec > I386_MAX_EXE_ADDR) {
999 		tf->tf_cs = GSEL(GUCODEBIG_SEL, SEL_UPL);
1000 	} else {
1001 		tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL);
1002 		return (0);
1003 	}
1004 	return (1);
1005 }
1006 #endif /* !defined(__x86_64__) */
1007 
1008 /*
1009  * p m a p   k e n t e r   f u n c t i o n s
1010  *
1011  * functions to quickly enter/remove pages from the kernel address
1012  * space.   pmap_kremove is exported to MI kernel.  we make use of
1013  * the recursive PTE mappings.
1014  */
1015 
1016 /*
1017  * pmap_kenter_pa: enter a kernel mapping without R/M (pv_entry) tracking
1018  *
1019  * => no need to lock anything, assume va is already allocated
1020  * => should be faster than normal pmap enter function
1021  */
1022 
1023 void
1024 pmap_kenter_pa(vaddr_t va, paddr_t pa, vm_prot_t prot)
1025 {
1026 	pt_entry_t *pte, opte, npte;
1027 
1028 	KASSERT(!(prot & ~VM_PROT_ALL));
1029 
1030 	if (va < VM_MIN_KERNEL_ADDRESS)
1031 		pte = vtopte(va);
1032 	else
1033 		pte = kvtopte(va);
1034 #ifdef DOM0OPS
1035 	if (pa < pmap_pa_start || pa >= pmap_pa_end) {
1036 #ifdef DEBUG
1037 		printk("pmap_kenter_pa: pa 0x%" PRIx64 " for va 0x%" PRIx64
1038 		    " outside range\n", (int64_t)pa, (int64_t)va);
1039 #endif /* DEBUG */
1040 		npte = pa;
1041 	} else
1042 #endif /* DOM0OPS */
1043 		npte = pmap_pa2pte(pa);
1044 	npte |= protection_codes[prot] | PG_k | PG_V | pmap_pg_g;
1045 	opte = pmap_pte_testset(pte, npte); /* zap! */
1046 #if defined(DIAGNOSTIC)
1047 	/* XXX For now... */
1048 	if (opte & PG_PS)
1049 		panic("pmap_kenter_pa: PG_PS");
1050 #endif
1051 	if ((opte & (PG_V | PG_U)) == (PG_V | PG_U)) {
1052 		/* This should not happen, so no need to batch updates. */
1053 		kpreempt_disable();
1054 		pmap_tlb_shootdown(pmap_kernel(), va, 0, opte);
1055 		kpreempt_enable();
1056 	}
1057 }
1058 
1059 #ifdef XEN
1060 /*
1061  * pmap_kenter_ma: enter a kernel mapping without R/M (pv_entry) tracking
1062  *
1063  * => no need to lock anything, assume va is already allocated
1064  * => should be faster than normal pmap enter function
1065  * => we expect a MACHINE address
1066  */
1067 
1068 void
1069 pmap_kenter_ma(vaddr_t va, paddr_t ma, vm_prot_t prot)
1070 {
1071 	pt_entry_t *pte, opte, npte;
1072 
1073 	if (va < VM_MIN_KERNEL_ADDRESS)
1074 		pte = vtopte(va);
1075 	else
1076 		pte = kvtopte(va);
1077 
1078 	npte = ma | ((prot & VM_PROT_WRITE) ? PG_RW : PG_RO) |
1079 	     PG_V | PG_k;
1080 #ifndef XEN
1081 	if ((cpu_feature & CPUID_NOX) && !(prot & VM_PROT_EXECUTE))
1082 		npte |= PG_NX;
1083 #endif
1084 	opte = pmap_pte_testset (pte, npte); /* zap! */
1085 
1086 	if (pmap_valid_entry(opte)) {
1087 #if defined(MULTIPROCESSOR)
1088 		kpreempt_disable();
1089 		pmap_tlb_shootdown(pmap_kernel(), va, 0, opte);
1090 		kpreempt_enable();
1091 #else
1092 		/* Don't bother deferring in the single CPU case. */
1093 		pmap_update_pg(va);
1094 #endif
1095 	}
1096 }
1097 #endif	/* XEN */
1098 
1099 #if defined(__x86_64__)
1100 /*
1101  * Change protection for a virtual address. Local for a CPU only, don't
1102  * care about TLB shootdowns.
1103  *
1104  * => must be called with preemption disabled
1105  */
1106 void
1107 pmap_changeprot_local(vaddr_t va, vm_prot_t prot)
1108 {
1109 	pt_entry_t *pte, opte, npte;
1110 
1111 	KASSERT(kpreempt_disabled());
1112 
1113 	if (va < VM_MIN_KERNEL_ADDRESS)
1114 		pte = vtopte(va);
1115 	else
1116 		pte = kvtopte(va);
1117 
1118 	npte = opte = *pte;
1119 
1120 	if ((prot & VM_PROT_WRITE) != 0)
1121 		npte |= PG_RW;
1122 	else
1123 		npte &= ~PG_RW;
1124 
1125 	if (opte != npte) {
1126 		pmap_pte_set(pte, npte);
1127 		pmap_pte_flush();
1128 		invlpg(va);
1129 	}
1130 }
1131 #endif /* defined(__x86_64__) */
1132 
1133 /*
1134  * pmap_kremove: remove a kernel mapping(s) without R/M (pv_entry) tracking
1135  *
1136  * => no need to lock anything
1137  * => caller must dispose of any vm_page mapped in the va range
1138  * => note: not an inline function
1139  * => we assume the va is page aligned and the len is a multiple of PAGE_SIZE
1140  * => we assume kernel only unmaps valid addresses and thus don't bother
1141  *    checking the valid bit before doing TLB flushing
1142  * => must be followed by call to pmap_update() before reuse of page
1143  */
1144 
1145 void
1146 pmap_kremove(vaddr_t sva, vsize_t len)
1147 {
1148 	pt_entry_t *pte, xpte;
1149 	vaddr_t va, eva;
1150 
1151 	eva = sva + len;
1152 	xpte = 0;
1153 
1154 	for (va = sva; va < eva; va += PAGE_SIZE) {
1155 		if (va < VM_MIN_KERNEL_ADDRESS)
1156 			pte = vtopte(va);
1157 		else
1158 			pte = kvtopte(va);
1159 		xpte |= pmap_pte_testset(pte, 0); /* zap! */
1160 #if defined(DIAGNOSTIC)
1161 		/* XXX For now... */
1162 		if (xpte & PG_PS)
1163 			panic("pmap_kremove: PG_PS");
1164 		if (xpte & PG_PVLIST)
1165 			panic("pmap_kremove: PG_PVLIST mapping for 0x%lx",
1166 			      va);
1167 #endif
1168 	}
1169 	if ((xpte & (PG_V | PG_U)) == (PG_V | PG_U)) {
1170 		kpreempt_disable();
1171 		pmap_tlb_shootdown(pmap_kernel(), sva, eva, xpte);
1172 		kpreempt_enable();
1173 	}
1174 }
1175 
1176 /*
1177  * p m a p   i n i t   f u n c t i o n s
1178  *
1179  * pmap_bootstrap and pmap_init are called during system startup
1180  * to init the pmap module.   pmap_bootstrap() does a low level
1181  * init just to get things rolling.   pmap_init() finishes the job.
1182  */
1183 
1184 /*
1185  * pmap_bootstrap: get the system in a state where it can run with VM
1186  *	properly enabled (called before main()).   the VM system is
1187  *      fully init'd later...
1188  *
1189  * => on i386, locore.s has already enabled the MMU by allocating
1190  *	a PDP for the kernel, and nkpde PTP's for the kernel.
1191  * => kva_start is the first free virtual address in kernel space
1192  */
1193 
1194 void
1195 pmap_bootstrap(vaddr_t kva_start)
1196 {
1197 	struct pmap *kpm;
1198 	pt_entry_t *pte;
1199 	int i;
1200 	vaddr_t kva;
1201 #ifdef XEN
1202 	pt_entry_t pg_nx = 0;
1203 #else
1204 	unsigned long p1i;
1205 	vaddr_t kva_end;
1206 	pt_entry_t pg_nx = (cpu_feature & CPUID_NOX ? PG_NX : 0);
1207 #endif
1208 
1209 	/*
1210 	 * set up our local static global vars that keep track of the
1211 	 * usage of KVM before kernel_map is set up
1212 	 */
1213 
1214 	virtual_avail = kva_start;		/* first free KVA */
1215 	virtual_end = VM_MAX_KERNEL_ADDRESS;	/* last KVA */
1216 
1217 	/*
1218 	 * set up protection_codes: we need to be able to convert from
1219 	 * a MI protection code (some combo of VM_PROT...) to something
1220 	 * we can jam into a i386 PTE.
1221 	 */
1222 
1223 	protection_codes[VM_PROT_NONE] = pg_nx;			/* --- */
1224 	protection_codes[VM_PROT_EXECUTE] = PG_RO | PG_X;	/* --x */
1225 	protection_codes[VM_PROT_READ] = PG_RO | pg_nx;		/* -r- */
1226 	protection_codes[VM_PROT_READ|VM_PROT_EXECUTE] = PG_RO | PG_X;/* -rx */
1227 	protection_codes[VM_PROT_WRITE] = PG_RW | pg_nx;	/* w-- */
1228 	protection_codes[VM_PROT_WRITE|VM_PROT_EXECUTE] = PG_RW | PG_X;/* w-x */
1229 	protection_codes[VM_PROT_WRITE|VM_PROT_READ] = PG_RW | pg_nx;
1230 								/* wr- */
1231 	protection_codes[VM_PROT_ALL] = PG_RW | PG_X;		/* wrx */
1232 
1233 	/*
1234 	 * now we init the kernel's pmap
1235 	 *
1236 	 * the kernel pmap's pm_obj is not used for much.   however, in
1237 	 * user pmaps the pm_obj contains the list of active PTPs.
1238 	 * the pm_obj currently does not have a pager.   it might be possible
1239 	 * to add a pager that would allow a process to read-only mmap its
1240 	 * own page tables (fast user level vtophys?).   this may or may not
1241 	 * be useful.
1242 	 */
1243 
1244 	kpm = pmap_kernel();
1245 	for (i = 0; i < PTP_LEVELS - 1; i++) {
1246 		UVM_OBJ_INIT(&kpm->pm_obj[i], NULL, 1);
1247 		kpm->pm_ptphint[i] = NULL;
1248 	}
1249 	memset(&kpm->pm_list, 0, sizeof(kpm->pm_list));  /* pm_list not used */
1250 	kpm->pm_pdir = (pd_entry_t *)(lwp0.l_addr->u_pcb.pcb_cr3 + KERNBASE);
1251 #ifdef PAE
1252 	for (i = 0; i < PDP_SIZE; i++)
1253 		kpm->pm_pdirpa[i] =
1254 		    (paddr_t)lwp0.l_addr->u_pcb.pcb_cr3 + PAGE_SIZE * i;
1255 #else
1256 	kpm->pm_pdirpa = (paddr_t) lwp0.l_addr->u_pcb.pcb_cr3;
1257 #endif
1258 	kpm->pm_stats.wired_count = kpm->pm_stats.resident_count =
1259 		x86_btop(kva_start - VM_MIN_KERNEL_ADDRESS);
1260 
1261 	/*
1262 	 * the above is just a rough estimate and not critical to the proper
1263 	 * operation of the system.
1264 	 */
1265 
1266 #ifndef XEN
1267 	/*
1268 	 * Begin to enable global TLB entries if they are supported.
1269 	 * The G bit has no effect until the CR4_PGE bit is set in CR4,
1270 	 * which happens in cpu_init(), which is run on each cpu
1271 	 * (and happens later)
1272 	 */
1273 
1274 	if (cpu_feature & CPUID_PGE) {
1275 		pmap_pg_g = PG_G;		/* enable software */
1276 
1277 		/* add PG_G attribute to already mapped kernel pages */
1278 		if (KERNBASE == VM_MIN_KERNEL_ADDRESS) {
1279 			kva_end = virtual_avail;
1280 		} else {
1281 			extern vaddr_t eblob, esym;
1282 			kva_end = (vaddr_t)&end;
1283 			if (esym > kva_end)
1284 				kva_end = esym;
1285 			if (eblob > kva_end)
1286 				kva_end = eblob;
1287 			kva_end = roundup(kva_end, PAGE_SIZE);
1288 		}
1289 		for (kva = KERNBASE; kva < kva_end; kva += PAGE_SIZE) {
1290 			p1i = pl1_i(kva);
1291 			if (pmap_valid_entry(PTE_BASE[p1i]))
1292 				PTE_BASE[p1i] |= PG_G;
1293 		}
1294 	}
1295 
1296 	/*
1297 	 * enable large pages if they are supported.
1298 	 */
1299 
1300 	if (cpu_feature & CPUID_PSE) {
1301 		paddr_t pa;
1302 		pd_entry_t *pde;
1303 		extern char __data_start;
1304 
1305 		lcr4(rcr4() | CR4_PSE);	/* enable hardware (via %cr4) */
1306 		pmap_largepages = 1;	/* enable software */
1307 
1308 		/*
1309 		 * the TLB must be flushed after enabling large pages
1310 		 * on Pentium CPUs, according to section 3.6.2.2 of
1311 		 * "Intel Architecture Software Developer's Manual,
1312 		 * Volume 3: System Programming".
1313 		 */
1314 		tlbflush();
1315 
1316 		/*
1317 		 * now, remap the kernel text using large pages.  we
1318 		 * assume that the linker has properly aligned the
1319 		 * .data segment to a NBPD_L2 boundary.
1320 		 */
1321 		kva_end = rounddown((vaddr_t)&__data_start, NBPD_L1);
1322 		for (pa = 0, kva = KERNBASE; kva + NBPD_L2 <= kva_end;
1323 		     kva += NBPD_L2, pa += NBPD_L2) {
1324 			pde = &L2_BASE[pl2_i(kva)];
1325 			*pde = pa | pmap_pg_g | PG_PS |
1326 			    PG_KR | PG_V;	/* zap! */
1327 			tlbflush();
1328 		}
1329 #if defined(DEBUG)
1330 		printf("kernel text is mapped with "
1331 		    "%lu large pages and %lu normal pages\n",
1332 		    (unsigned long)howmany(kva - KERNBASE, NBPD_L2),
1333 		    (unsigned long)howmany((vaddr_t)&__data_start - kva,
1334 		    NBPD_L1));
1335 #endif /* defined(DEBUG) */
1336 	}
1337 #endif /* !XEN */
1338 
1339 	if (VM_MIN_KERNEL_ADDRESS != KERNBASE) {
1340 		/*
1341 		 * zero_pte is stuck at the end of mapped space for the kernel
1342 		 * image (disjunct from kva space). This is done so that it
1343 		 * can safely be used in pmap_growkernel (pmap_get_physpage),
1344 		 * when it's called for the first time.
1345 		 * XXXfvdl fix this for MULTIPROCESSOR later.
1346 		 */
1347 
1348 		early_zerop = (void *)(KERNBASE + NKL2_KIMG_ENTRIES * NBPD_L2);
1349 		early_zero_pte = PTE_BASE + pl1_i((unsigned long)early_zerop);
1350 	}
1351 
1352 	/*
1353 	 * now we allocate the "special" VAs which are used for tmp mappings
1354 	 * by the pmap (and other modules).    we allocate the VAs by advancing
1355 	 * virtual_avail (note that there are no pages mapped at these VAs).
1356 	 * we find the PTE that maps the allocated VA via the linear PTE
1357 	 * mapping.
1358 	 */
1359 
1360 	pte = PTE_BASE + pl1_i(virtual_avail);
1361 
1362 #ifdef MULTIPROCESSOR
1363 	/*
1364 	 * Waste some VA space to avoid false sharing of cache lines
1365 	 * for page table pages: Give each possible CPU a cache line
1366 	 * of PTE's (8) to play with, though we only need 4.  We could
1367 	 * recycle some of this waste by putting the idle stacks here
1368 	 * as well; we could waste less space if we knew the largest
1369 	 * CPU ID beforehand.
1370 	 */
1371 	csrcp = (char *) virtual_avail;  csrc_pte = pte;
1372 
1373 	cdstp = (char *) virtual_avail+PAGE_SIZE;  cdst_pte = pte+1;
1374 
1375 	zerop = (char *) virtual_avail+PAGE_SIZE*2;  zero_pte = pte+2;
1376 
1377 	ptpp = (char *) virtual_avail+PAGE_SIZE*3;  ptp_pte = pte+3;
1378 
1379 	virtual_avail += PAGE_SIZE * maxcpus * NPTECL;
1380 	pte += maxcpus * NPTECL;
1381 #else
1382 	csrcp = (void *) virtual_avail;  csrc_pte = pte;	/* allocate */
1383 	virtual_avail += PAGE_SIZE; pte++;			/* advance */
1384 
1385 	cdstp = (void *) virtual_avail;  cdst_pte = pte;
1386 	virtual_avail += PAGE_SIZE; pte++;
1387 
1388 	zerop = (void *) virtual_avail;  zero_pte = pte;
1389 	virtual_avail += PAGE_SIZE; pte++;
1390 
1391 	ptpp = (void *) virtual_avail;  ptp_pte = pte;
1392 	virtual_avail += PAGE_SIZE; pte++;
1393 #endif
1394 
1395 	if (VM_MIN_KERNEL_ADDRESS == KERNBASE) {
1396 		early_zerop = zerop;
1397 		early_zero_pte = zero_pte;
1398 	}
1399 
1400 	/*
1401 	 * Nothing after this point actually needs pte;
1402 	 */
1403 	pte = (void *)0xdeadbeef;
1404 
1405 	/* XXX: vmmap used by mem.c... should be uvm_map_reserve */
1406 	/* XXXfvdl PTEs not needed here */
1407 	vmmap = (char *)virtual_avail;			/* don't need pte */
1408 	virtual_avail += PAGE_SIZE; pte++;
1409 
1410 #ifdef XEN
1411 #ifdef __x86_64__
1412 	/*
1413 	 * We want a dummy page directory for Xen:
1414 	 * when deactivate a pmap, Xen will still consider it active.
1415 	 * So we set user PGD to this one to lift all protection on
1416 	 * the now inactive page tables set.
1417 	 */
1418 	xen_dummy_user_pgd = avail_start;
1419 	avail_start += PAGE_SIZE;
1420 
1421 	/* Zero fill it, the less checks in Xen it requires the better */
1422 	memset((void *) (xen_dummy_user_pgd + KERNBASE), 0, PAGE_SIZE);
1423 	/* Mark read-only */
1424 	HYPERVISOR_update_va_mapping(xen_dummy_user_pgd + KERNBASE,
1425 	    pmap_pa2pte(xen_dummy_user_pgd) | PG_u | PG_V, UVMF_INVLPG);
1426 	/* Pin as L4 */
1427 	xpq_queue_pin_table(xpmap_ptom_masked(xen_dummy_user_pgd));
1428 #endif /* __x86_64__ */
1429 	idt_vaddr = virtual_avail;                      /* don't need pte */
1430 	idt_paddr = avail_start;                        /* steal a page */
1431 	/*
1432 	 * Xen require one more page as we can't store
1433 	 * GDT and LDT on the same page
1434 	 */
1435 	virtual_avail += 3 * PAGE_SIZE;
1436 	avail_start += 3 * PAGE_SIZE;
1437 #else /* XEN */
1438 	idt_vaddr = virtual_avail;			/* don't need pte */
1439 	idt_paddr = avail_start;			/* steal a page */
1440 #if defined(__x86_64__)
1441 	virtual_avail += 2 * PAGE_SIZE; pte += 2;
1442 	avail_start += 2 * PAGE_SIZE;
1443 #else /* defined(__x86_64__) */
1444 	virtual_avail += PAGE_SIZE; pte++;
1445 	avail_start += PAGE_SIZE;
1446 	/* pentium f00f bug stuff */
1447 	pentium_idt_vaddr = virtual_avail;		/* don't need pte */
1448 	virtual_avail += PAGE_SIZE; pte++;
1449 #endif /* defined(__x86_64__) */
1450 #endif /* XEN */
1451 
1452 #ifdef _LP64
1453 	/*
1454 	 * Grab a page below 4G for things that need it (i.e.
1455 	 * having an initial %cr3 for the MP trampoline).
1456 	 */
1457 	lo32_vaddr = virtual_avail;
1458 	virtual_avail += PAGE_SIZE; pte++;
1459 	lo32_paddr = avail_start;
1460 	avail_start += PAGE_SIZE;
1461 #endif
1462 
1463 	/*
1464 	 * now we reserve some VM for mapping pages when doing a crash dump
1465 	 */
1466 
1467 	virtual_avail = reserve_dumppages(virtual_avail);
1468 
1469 	/*
1470 	 * init the static-global locks and global lists.
1471 	 *
1472 	 * => pventry::pvh_lock (initialized elsewhere) must also be
1473 	 *      a spin lock, again at IPL_VM to prevent deadlock, and
1474 	 *	again is never taken from interrupt context.
1475 	 */
1476 
1477 	mutex_init(&pmaps_lock, MUTEX_DEFAULT, IPL_NONE);
1478 	LIST_INIT(&pmaps);
1479 	pmap_cpu_init_early(curcpu());
1480 
1481 	/*
1482 	 * initialize caches.
1483 	 */
1484 
1485 	pool_cache_bootstrap(&pmap_cache, sizeof(struct pmap), 0, 0, 0,
1486 	    "pmappl", NULL, IPL_NONE, NULL, NULL, NULL);
1487 #ifdef PAE
1488 	pool_cache_bootstrap(&pmap_pdp_cache, PAGE_SIZE * PDP_SIZE, 0, 0, 0,
1489 	    "pdppl", &pmap_pdp_allocator, IPL_NONE,
1490 	    pmap_pdp_ctor, pmap_pdp_dtor, NULL);
1491 #else /* PAE */
1492 	pool_cache_bootstrap(&pmap_pdp_cache, PAGE_SIZE, 0, 0, 0,
1493 	    "pdppl", NULL, IPL_NONE, pmap_pdp_ctor, pmap_pdp_dtor, NULL);
1494 #endif /* PAE */
1495 	pool_cache_bootstrap(&pmap_pv_cache, sizeof(struct pv_entry), 0, 0,
1496 	    PR_LARGECACHE, "pvpl", &pool_allocator_meta, IPL_NONE, NULL,
1497 	    NULL, NULL);
1498 
1499 	/*
1500 	 * ensure the TLB is sync'd with reality by flushing it...
1501 	 */
1502 
1503 	tlbflush();
1504 
1505 	/*
1506 	 * calculate pmap_maxkvaddr from nkptp[].
1507 	 */
1508 
1509 	kva = VM_MIN_KERNEL_ADDRESS;
1510 	for (i = PTP_LEVELS - 1; i >= 1; i--) {
1511 		kva += nkptp[i] * nbpd[i];
1512 	}
1513 	pmap_maxkvaddr = kva;
1514 }
1515 
1516 #if defined(__x86_64__)
1517 /*
1518  * Pre-allocate PTPs for low memory, so that 1:1 mappings for various
1519  * trampoline code can be entered.
1520  */
1521 void
1522 pmap_prealloc_lowmem_ptps(void)
1523 {
1524 #ifdef XEN
1525 	int level;
1526 	paddr_t newp;
1527 	paddr_t pdes_pa;
1528 
1529 	pdes_pa = pmap_kernel()->pm_pdirpa;
1530 	level = PTP_LEVELS;
1531 	for (;;) {
1532 		newp = avail_start;
1533 		avail_start += PAGE_SIZE;
1534 		HYPERVISOR_update_va_mapping ((vaddr_t)early_zerop,
1535 		    xpmap_ptom_masked(newp) | PG_u | PG_V | PG_RW, UVMF_INVLPG);
1536 		memset((void *)early_zerop, 0, PAGE_SIZE);
1537 		/* Mark R/O before installing */
1538 		HYPERVISOR_update_va_mapping ((vaddr_t)early_zerop,
1539 		    xpmap_ptom_masked(newp) | PG_u | PG_V, UVMF_INVLPG);
1540 		if (newp < (NKL2_KIMG_ENTRIES * NBPD_L2))
1541 			HYPERVISOR_update_va_mapping (newp + KERNBASE,
1542 			    xpmap_ptom_masked(newp) | PG_u | PG_V, UVMF_INVLPG);
1543 		xpq_queue_pte_update (
1544 			xpmap_ptom_masked(pdes_pa)
1545 			+ (pl_i(0, level) * sizeof (pd_entry_t)),
1546 			xpmap_ptom_masked(newp) | PG_RW | PG_u | PG_V);
1547 		level--;
1548 		if (level <= 1)
1549 			break;
1550 		pdes_pa = newp;
1551 	}
1552 #else /* XEN */
1553 	pd_entry_t *pdes;
1554 	int level;
1555 	paddr_t newp;
1556 
1557 	pdes = pmap_kernel()->pm_pdir;
1558 	level = PTP_LEVELS;
1559 	for (;;) {
1560 		newp = avail_start;
1561 		avail_start += PAGE_SIZE;
1562 		*early_zero_pte = (newp & PG_FRAME) | PG_V | PG_RW;
1563 		pmap_update_pg((vaddr_t)early_zerop);
1564 		memset(early_zerop, 0, PAGE_SIZE);
1565 		pdes[pl_i(0, level)] = (newp & PG_FRAME) | PG_V | PG_RW;
1566 		level--;
1567 		if (level <= 1)
1568 			break;
1569 		pdes = normal_pdes[level - 2];
1570 	}
1571 #endif /* XEN */
1572 }
1573 #endif /* defined(__x86_64__) */
1574 
1575 /*
1576  * pmap_init: called from uvm_init, our job is to get the pmap
1577  * system ready to manage mappings...
1578  */
1579 
1580 void
1581 pmap_init(void)
1582 {
1583 	int i;
1584 
1585 	for (i = 0; i < PV_HASH_SIZE; i++) {
1586 		SLIST_INIT(&pv_hash_heads[i].hh_list);
1587 	}
1588 	for (i = 0; i < PV_HASH_LOCK_CNT; i++) {
1589 		mutex_init(&pv_hash_locks[i].lock, MUTEX_NODEBUG, IPL_VM);
1590 	}
1591 
1592 	/*
1593 	 * done: pmap module is up (and ready for business)
1594 	 */
1595 
1596 	pmap_initialized = true;
1597 }
1598 
1599 /*
1600  * pmap_cpu_init_early: perform early per-CPU initialization.
1601  */
1602 
1603 void
1604 pmap_cpu_init_early(struct cpu_info *ci)
1605 {
1606 	struct pmap_cpu *pc;
1607 	static uint8_t pmap_cpu_alloc;
1608 
1609 	pc = &pmap_cpu[pmap_cpu_alloc++].pc;
1610 	ci->ci_pmap_cpu = pc;
1611 }
1612 
1613 /*
1614  * pmap_cpu_init_late: perform late per-CPU initialization.
1615  */
1616 
1617 void
1618 pmap_cpu_init_late(struct cpu_info *ci)
1619 {
1620 
1621 	if (ci == &cpu_info_primary) {
1622 		evcnt_attach_dynamic(&pmap_tlb_evcnt, EVCNT_TYPE_INTR,
1623 		    NULL, "global", "TLB IPI");
1624 		evcnt_attach_dynamic(&pmap_iobmp_evcnt, EVCNT_TYPE_MISC,
1625 		    NULL, "x86", "io bitmap copy");
1626 		evcnt_attach_dynamic(&pmap_ldt_evcnt, EVCNT_TYPE_MISC,
1627 		    NULL, "x86", "ldt sync");
1628 	}
1629 
1630 	evcnt_attach_dynamic(&ci->ci_tlb_evcnt, EVCNT_TYPE_MISC,
1631 	    NULL, device_xname(ci->ci_dev), "TLB IPI");
1632 }
1633 
1634 /*
1635  * p v _ e n t r y   f u n c t i o n s
1636  */
1637 
1638 /*
1639  * pmap_free_pvs: free a list of pv_entrys
1640  */
1641 
1642 static void
1643 pmap_free_pvs(struct pv_entry *pve)
1644 {
1645 	struct pv_entry *next;
1646 
1647 	for ( /* null */ ; pve != NULL ; pve = next) {
1648 		next = pve->pve_next;
1649 		pool_cache_put(&pmap_pv_cache, pve);
1650 	}
1651 }
1652 
1653 /*
1654  * main pv_entry manipulation functions:
1655  *   pmap_enter_pv: enter a mapping onto a pv_head list
1656  *   pmap_remove_pv: remove a mappiing from a pv_head list
1657  *
1658  * NOTE: Both pmap_enter_pv and pmap_remove_pv expect the caller to lock
1659  *       the pvh before calling
1660  */
1661 
1662 /*
1663  * insert_pv: a helper of pmap_enter_pv
1664  */
1665 
1666 static void
1667 insert_pv(struct pmap_page *pp, struct pv_entry *pve)
1668 {
1669 	struct pv_hash_head *hh;
1670 	kmutex_t *lock;
1671 	u_int hash;
1672 
1673 	KASSERT(pp_locked(pp));
1674 
1675 	hash = pvhash_hash(pve->pve_pte.pte_ptp, pve->pve_pte.pte_va);
1676 	lock = pvhash_lock(hash);
1677 	hh = pvhash_head(hash);
1678 	mutex_spin_enter(lock);
1679 	SLIST_INSERT_HEAD(&hh->hh_list, pve, pve_hash);
1680 	mutex_spin_exit(lock);
1681 
1682 	LIST_INSERT_HEAD(&pp->pp_head.pvh_list, pve, pve_list);
1683 }
1684 
1685 /*
1686  * pmap_enter_pv: enter a mapping onto a pv_head lst
1687  *
1688  * => caller should have the pp_lock locked
1689  * => caller should adjust ptp's wire_count before calling
1690  */
1691 
1692 static struct pv_entry *
1693 pmap_enter_pv(struct pmap_page *pp,
1694 	      struct pv_entry *pve,	/* preallocated pve for us to use */
1695 	      struct pv_entry **sparepve,
1696 	      struct vm_page *ptp,
1697 	      vaddr_t va)
1698 {
1699 
1700 	KASSERT(ptp == NULL || ptp->wire_count >= 2);
1701 	KASSERT(ptp == NULL || ptp->uobject != NULL);
1702 	KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset);
1703 	KASSERT(pp_locked(pp));
1704 
1705 	if ((pp->pp_flags & PP_EMBEDDED) == 0) {
1706 		if (LIST_EMPTY(&pp->pp_head.pvh_list)) {
1707 			pp->pp_flags |= PP_EMBEDDED;
1708 			pp->pp_pte.pte_ptp = ptp;
1709 			pp->pp_pte.pte_va = va;
1710 
1711 			return pve;
1712 		}
1713 	} else {
1714 		struct pv_entry *pve2;
1715 
1716 		pve2 = *sparepve;
1717 		*sparepve = NULL;
1718 
1719 		pve2->pve_pte = pp->pp_pte;
1720 		pp->pp_flags &= ~PP_EMBEDDED;
1721 		LIST_INIT(&pp->pp_head.pvh_list);
1722 		insert_pv(pp, pve2);
1723 	}
1724 
1725 	pve->pve_pte.pte_ptp = ptp;
1726 	pve->pve_pte.pte_va = va;
1727 	insert_pv(pp, pve);
1728 
1729 	return NULL;
1730 }
1731 
1732 /*
1733  * pmap_remove_pv: try to remove a mapping from a pv_list
1734  *
1735  * => caller should hold pp_lock [so that attrs can be adjusted]
1736  * => caller should adjust ptp's wire_count and free PTP if needed
1737  * => we return the removed pve
1738  */
1739 
1740 static struct pv_entry *
1741 pmap_remove_pv(struct pmap_page *pp, struct vm_page *ptp, vaddr_t va)
1742 {
1743 	struct pv_hash_head *hh;
1744 	struct pv_entry *pve;
1745 	kmutex_t *lock;
1746 	u_int hash;
1747 
1748 	KASSERT(ptp == NULL || ptp->uobject != NULL);
1749 	KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset);
1750 	KASSERT(pp_locked(pp));
1751 
1752 	if ((pp->pp_flags & PP_EMBEDDED) != 0) {
1753 		KASSERT(pp->pp_pte.pte_ptp == ptp);
1754 		KASSERT(pp->pp_pte.pte_va == va);
1755 
1756 		pp->pp_flags &= ~PP_EMBEDDED;
1757 		LIST_INIT(&pp->pp_head.pvh_list);
1758 
1759 		return NULL;
1760 	}
1761 
1762 	hash = pvhash_hash(ptp, va);
1763 	lock = pvhash_lock(hash);
1764 	hh = pvhash_head(hash);
1765 	mutex_spin_enter(lock);
1766 	pve = pvhash_remove(hh, ptp, va);
1767 	mutex_spin_exit(lock);
1768 
1769 	LIST_REMOVE(pve, pve_list);
1770 
1771 	return pve;
1772 }
1773 
1774 /*
1775  * p t p   f u n c t i o n s
1776  */
1777 
1778 static inline struct vm_page *
1779 pmap_find_ptp(struct pmap *pmap, vaddr_t va, paddr_t pa, int level)
1780 {
1781 	int lidx = level - 1;
1782 	struct vm_page *pg;
1783 
1784 	KASSERT(mutex_owned(&pmap->pm_lock));
1785 
1786 	if (pa != (paddr_t)-1 && pmap->pm_ptphint[lidx] &&
1787 	    pa == VM_PAGE_TO_PHYS(pmap->pm_ptphint[lidx])) {
1788 		return (pmap->pm_ptphint[lidx]);
1789 	}
1790 	PMAP_SUBOBJ_LOCK(pmap, lidx);
1791 	pg = uvm_pagelookup(&pmap->pm_obj[lidx], ptp_va2o(va, level));
1792 	PMAP_SUBOBJ_UNLOCK(pmap, lidx);
1793 
1794 	KASSERT(pg == NULL || pg->wire_count >= 1);
1795 	return pg;
1796 }
1797 
1798 static inline void
1799 pmap_freepage(struct pmap *pmap, struct vm_page *ptp, int level)
1800 {
1801 	int lidx;
1802 	struct uvm_object *obj;
1803 
1804 	KASSERT(ptp->wire_count == 1);
1805 
1806 	lidx = level - 1;
1807 
1808 	obj = &pmap->pm_obj[lidx];
1809 	pmap_stats_update(pmap, -1, 0);
1810 	if (lidx != 0)
1811 		mutex_enter(&obj->vmobjlock);
1812 	if (pmap->pm_ptphint[lidx] == ptp)
1813 		pmap->pm_ptphint[lidx] = TAILQ_FIRST(&obj->memq);
1814 	ptp->wire_count = 0;
1815 	uvm_pagerealloc(ptp, NULL, 0);
1816 	VM_PAGE_TO_PP(ptp)->pp_link = curlwp->l_md.md_gc_ptp;
1817 	curlwp->l_md.md_gc_ptp = ptp;
1818 	if (lidx != 0)
1819 		mutex_exit(&obj->vmobjlock);
1820 }
1821 
1822 static void
1823 pmap_free_ptp(struct pmap *pmap, struct vm_page *ptp, vaddr_t va,
1824 	      pt_entry_t *ptes, pd_entry_t * const *pdes)
1825 {
1826 	unsigned long index;
1827 	int level;
1828 	vaddr_t invaladdr;
1829 #ifdef MULTIPROCESSOR
1830 	vaddr_t invaladdr2;
1831 #endif
1832 	pd_entry_t opde;
1833 	struct pmap *curpmap = vm_map_pmap(&curlwp->l_proc->p_vmspace->vm_map);
1834 
1835 	KASSERT(pmap != pmap_kernel());
1836 	KASSERT(mutex_owned(&pmap->pm_lock));
1837 	KASSERT(kpreempt_disabled());
1838 
1839 	level = 1;
1840 	do {
1841 		index = pl_i(va, level + 1);
1842 		opde = pmap_pte_testset(&pdes[level - 1][index], 0);
1843 #if defined(XEN) && defined(__x86_64__)
1844 		/*
1845 		 * If ptp is a L3 currently mapped in kernel space,
1846 		 * clear it before freeing
1847 		 */
1848 		if (pmap->pm_pdirpa == xen_current_user_pgd
1849 		    && level == PTP_LEVELS - 1)
1850 			pmap_pte_set(&pmap_kernel()->pm_pdir[index], 0);
1851 #endif /* XEN && __x86_64__ */
1852 		pmap_freepage(pmap, ptp, level);
1853 		invaladdr = level == 1 ? (vaddr_t)ptes :
1854 		    (vaddr_t)pdes[level - 2];
1855 		pmap_tlb_shootdown(curpmap, invaladdr + index * PAGE_SIZE,
1856 		    0, opde);
1857 #if defined(MULTIPROCESSOR)
1858 		invaladdr2 = level == 1 ? (vaddr_t)PTE_BASE :
1859 		    (vaddr_t)normal_pdes[level - 2];
1860 		if (pmap != curpmap || invaladdr != invaladdr2) {
1861 			pmap_tlb_shootdown(pmap, invaladdr2 + index * PAGE_SIZE,
1862 			    0, opde);
1863 		}
1864 #endif
1865 		if (level < PTP_LEVELS - 1) {
1866 			ptp = pmap_find_ptp(pmap, va, (paddr_t)-1, level + 1);
1867 			ptp->wire_count--;
1868 			if (ptp->wire_count > 1)
1869 				break;
1870 		}
1871 	} while (++level < PTP_LEVELS);
1872 	pmap_pte_flush();
1873 }
1874 
1875 /*
1876  * pmap_get_ptp: get a PTP (if there isn't one, allocate a new one)
1877  *
1878  * => pmap should NOT be pmap_kernel()
1879  * => pmap should be locked
1880  * => preemption should be disabled
1881  */
1882 
1883 static struct vm_page *
1884 pmap_get_ptp(struct pmap *pmap, vaddr_t va, pd_entry_t * const *pdes)
1885 {
1886 	struct vm_page *ptp, *pptp;
1887 	int i;
1888 	unsigned long index;
1889 	pd_entry_t *pva;
1890 	paddr_t ppa, pa;
1891 	struct uvm_object *obj;
1892 
1893 	KASSERT(pmap != pmap_kernel());
1894 	KASSERT(mutex_owned(&pmap->pm_lock));
1895 	KASSERT(kpreempt_disabled());
1896 
1897 	ptp = NULL;
1898 	pa = (paddr_t)-1;
1899 
1900 	/*
1901 	 * Loop through all page table levels seeing if we need to
1902 	 * add a new page to that level.
1903 	 */
1904 	for (i = PTP_LEVELS; i > 1; i--) {
1905 		/*
1906 		 * Save values from previous round.
1907 		 */
1908 		pptp = ptp;
1909 		ppa = pa;
1910 
1911 		index = pl_i(va, i);
1912 		pva = pdes[i - 2];
1913 
1914 		if (pmap_valid_entry(pva[index])) {
1915 			ppa = pmap_pte2pa(pva[index]);
1916 			ptp = NULL;
1917 			continue;
1918 		}
1919 
1920 		obj = &pmap->pm_obj[i-2];
1921 		PMAP_SUBOBJ_LOCK(pmap, i - 2);
1922 		ptp = uvm_pagealloc(obj, ptp_va2o(va, i - 1), NULL,
1923 		    UVM_PGA_USERESERVE|UVM_PGA_ZERO);
1924 		PMAP_SUBOBJ_UNLOCK(pmap, i - 2);
1925 
1926 		if (ptp == NULL)
1927 			return NULL;
1928 
1929 		ptp->flags &= ~PG_BUSY; /* never busy */
1930 		ptp->wire_count = 1;
1931 		pmap->pm_ptphint[i - 2] = ptp;
1932 		pa = VM_PAGE_TO_PHYS(ptp);
1933 		pmap_pte_set(&pva[index], (pd_entry_t)
1934 		        (pmap_pa2pte(pa) | PG_u | PG_RW | PG_V));
1935 #if defined(XEN) && defined(__x86_64__)
1936 		/*
1937 		 * In Xen we must enter the mapping in kernel map too
1938 		 * if pmap is curmap and modifying top level (PGD)
1939 		 */
1940 		if(i == PTP_LEVELS && pmap != pmap_kernel()) {
1941 		        pmap_pte_set(&pmap_kernel()->pm_pdir[index],
1942 		                (pd_entry_t) (pmap_pa2pte(pa)
1943 		                        | PG_u | PG_RW | PG_V));
1944 		}
1945 #endif /* XEN && __x86_64__ */
1946 		pmap_pte_flush();
1947 		pmap_stats_update(pmap, 1, 0);
1948 		/*
1949 		 * If we're not in the top level, increase the
1950 		 * wire count of the parent page.
1951 		 */
1952 		if (i < PTP_LEVELS) {
1953 			if (pptp == NULL)
1954 				pptp = pmap_find_ptp(pmap, va, ppa, i);
1955 #ifdef DIAGNOSTIC
1956 			if (pptp == NULL)
1957 				panic("pde page disappeared");
1958 #endif
1959 			pptp->wire_count++;
1960 		}
1961 	}
1962 
1963 	/*
1964 	 * ptp is not NULL if we just allocated a new ptp. If it's
1965 	 * still NULL, we must look up the existing one.
1966 	 */
1967 	if (ptp == NULL) {
1968 		ptp = pmap_find_ptp(pmap, va, ppa, 1);
1969 #ifdef DIAGNOSTIC
1970 		if (ptp == NULL) {
1971 			printf("va %lx ppa %lx\n", (unsigned long)va,
1972 			    (unsigned long)ppa);
1973 			panic("pmap_get_ptp: unmanaged user PTP");
1974 		}
1975 #endif
1976 	}
1977 
1978 	pmap->pm_ptphint[0] = ptp;
1979 	return(ptp);
1980 }
1981 
1982 /*
1983  * p m a p  l i f e c y c l e   f u n c t i o n s
1984  */
1985 
1986 /*
1987  * pmap_pdp_ctor: constructor for the PDP cache.
1988  */
1989 
1990 int
1991 pmap_pdp_ctor(void *arg, void *v, int flags)
1992 {
1993 	pd_entry_t *pdir = v;
1994 	paddr_t pdirpa = 0;	/* XXX: GCC */
1995 	vaddr_t object;
1996 	int i;
1997 
1998 #if !defined(XEN) || !defined(__x86_64__)
1999 	int npde;
2000 #endif
2001 #ifdef XEN
2002 	int s;
2003 #endif
2004 
2005 	/*
2006 	 * NOTE: The `pmap_lock' is held when the PDP is allocated.
2007 	 */
2008 
2009 #if defined(XEN) && defined(__x86_64__)
2010 	/* fetch the physical address of the page directory. */
2011 	(void) pmap_extract(pmap_kernel(), (vaddr_t) pdir, &pdirpa);
2012 
2013 	/* zero init area */
2014 	memset (pdir, 0, PAGE_SIZE); /* Xen wants a clean page */
2015 	/*
2016 	 * this pdir will NEVER be active in kernel mode
2017 	 * so mark recursive entry invalid
2018 	 */
2019 	pdir[PDIR_SLOT_PTE] = pmap_pa2pte(pdirpa) | PG_u;
2020 	/*
2021 	 * PDP constructed this way won't be for kernel,
2022 	 * hence we don't put kernel mappings on Xen.
2023 	 * But we need to make pmap_create() happy, so put a dummy (without
2024 	 * PG_V) value at the right place.
2025 	 */
2026 	pdir[PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1] - 1] =
2027 	     (unsigned long)-1 & PG_FRAME;
2028 #else /* XEN  && __x86_64__*/
2029 	/* zero init area */
2030 	memset(pdir, 0, PDIR_SLOT_PTE * sizeof(pd_entry_t));
2031 
2032 	object = (vaddr_t)v;
2033 	for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) {
2034 		/* fetch the physical address of the page directory. */
2035 		(void) pmap_extract(pmap_kernel(), object, &pdirpa);
2036 		/* put in recursive PDE to map the PTEs */
2037 		pdir[PDIR_SLOT_PTE + i] = pmap_pa2pte(pdirpa) | PG_V;
2038 #ifndef XEN
2039 		pdir[PDIR_SLOT_PTE + i] |= PG_KW;
2040 #endif
2041 	}
2042 
2043 	/* copy kernel's PDE */
2044 	npde = nkptp[PTP_LEVELS - 1];
2045 
2046 	memcpy(&pdir[PDIR_SLOT_KERN], &PDP_BASE[PDIR_SLOT_KERN],
2047 	    npde * sizeof(pd_entry_t));
2048 
2049 	/* zero the rest */
2050 	memset(&pdir[PDIR_SLOT_KERN + npde], 0,
2051 	    (NTOPLEVEL_PDES - (PDIR_SLOT_KERN + npde)) * sizeof(pd_entry_t));
2052 
2053 	if (VM_MIN_KERNEL_ADDRESS != KERNBASE) {
2054 		int idx = pl_i(KERNBASE, PTP_LEVELS);
2055 
2056 		pdir[idx] = PDP_BASE[idx];
2057 	}
2058 #endif /* XEN  && __x86_64__*/
2059 #ifdef XEN
2060 	s = splvm();
2061 	object = (vaddr_t)v;
2062 	for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) {
2063 		(void) pmap_extract(pmap_kernel(), object, &pdirpa);
2064 		/* remap this page RO */
2065 		pmap_kenter_pa(object, pdirpa, VM_PROT_READ);
2066 		pmap_update(pmap_kernel());
2067 		/*
2068 		 * pin as L2/L4 page, we have to do the page with the
2069 		 * PDIR_SLOT_PTE entries last
2070 		 */
2071 #ifdef PAE
2072 		if (i == l2tol3(PDIR_SLOT_PTE))
2073 			continue;
2074 #endif
2075 		xpq_queue_pin_table(xpmap_ptom_masked(pdirpa));
2076 	}
2077 #ifdef PAE
2078 	object = ((vaddr_t)pdir) + PAGE_SIZE  * l2tol3(PDIR_SLOT_PTE);
2079 	(void)pmap_extract(pmap_kernel(), object, &pdirpa);
2080 	xpq_queue_pin_table(xpmap_ptom_masked(pdirpa));
2081 #endif
2082 	xpq_flush_queue();
2083 	splx(s);
2084 #endif /* XEN */
2085 
2086 	return (0);
2087 }
2088 
2089 /*
2090  * pmap_pdp_dtor: destructor for the PDP cache.
2091  */
2092 
2093 void
2094 pmap_pdp_dtor(void *arg, void *v)
2095 {
2096 #ifdef XEN
2097 	paddr_t pdirpa = 0;	/* XXX: GCC */
2098 	vaddr_t object = (vaddr_t)v;
2099 	int i;
2100 	int s = splvm();
2101 	pt_entry_t *pte;
2102 
2103 	for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) {
2104 		/* fetch the physical address of the page directory. */
2105 		(void) pmap_extract(pmap_kernel(), object, &pdirpa);
2106 		/* unpin page table */
2107 		xpq_queue_unpin_table(xpmap_ptom_masked(pdirpa));
2108 	}
2109 	object = (vaddr_t)v;
2110 	for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) {
2111 		/* Set page RW again */
2112 		pte = kvtopte(object);
2113 		xpq_queue_pte_update(xpmap_ptetomach(pte), *pte | PG_RW);
2114 		xpq_queue_invlpg((vaddr_t)object);
2115 	}
2116 	xpq_flush_queue();
2117 	splx(s);
2118 #endif  /* XEN */
2119 }
2120 
2121 #ifdef PAE
2122 
2123 /* pmap_pdp_alloc: Allocate a page for the pdp memory pool. */
2124 
2125 void *
2126 pmap_pdp_alloc(struct pool *pp, int flags)
2127 {
2128 	return (void *)uvm_km_alloc(kernel_map,
2129 	    PAGE_SIZE * PDP_SIZE, PAGE_SIZE * PDP_SIZE,
2130 	    ((flags & PR_WAITOK) ? 0 : UVM_KMF_NOWAIT | UVM_KMF_TRYLOCK)
2131 	    | UVM_KMF_WIRED);
2132 }
2133 
2134 /*
2135  * pmap_pdp_free: free a PDP
2136  */
2137 
2138 void
2139 pmap_pdp_free(struct pool *pp, void *v)
2140 {
2141 	uvm_km_free(kernel_map, (vaddr_t)v, PAGE_SIZE * PDP_SIZE,
2142 	    UVM_KMF_WIRED);
2143 }
2144 #endif /* PAE */
2145 
2146 /*
2147  * pmap_create: create a pmap
2148  *
2149  * => note: old pmap interface took a "size" args which allowed for
2150  *	the creation of "software only" pmaps (not in bsd).
2151  */
2152 
2153 struct pmap *
2154 pmap_create(void)
2155 {
2156 	struct pmap *pmap;
2157 	int i;
2158 
2159 	pmap = pool_cache_get(&pmap_cache, PR_WAITOK);
2160 
2161 	/* init uvm_object */
2162 	for (i = 0; i < PTP_LEVELS - 1; i++) {
2163 		UVM_OBJ_INIT(&pmap->pm_obj[i], NULL, 1);
2164 		pmap->pm_ptphint[i] = NULL;
2165 	}
2166 	pmap->pm_stats.wired_count = 0;
2167 	pmap->pm_stats.resident_count = 1;	/* count the PDP allocd below */
2168 #if !defined(__x86_64__)
2169 	pmap->pm_hiexec = 0;
2170 #endif /* !defined(__x86_64__) */
2171 	pmap->pm_flags = 0;
2172 	pmap->pm_cpus = 0;
2173 	pmap->pm_kernel_cpus = 0;
2174 
2175 	/* init the LDT */
2176 	pmap->pm_ldt = NULL;
2177 	pmap->pm_ldt_len = 0;
2178 	pmap->pm_ldt_sel = GSYSSEL(GLDT_SEL, SEL_KPL);
2179 
2180 	/* allocate PDP */
2181  try_again:
2182 	pmap->pm_pdir = pool_cache_get(&pmap_pdp_cache, PR_WAITOK);
2183 
2184 	mutex_enter(&pmaps_lock);
2185 
2186 	if (pmap->pm_pdir[PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1] - 1] == 0) {
2187 		mutex_exit(&pmaps_lock);
2188 		pool_cache_destruct_object(&pmap_pdp_cache, pmap->pm_pdir);
2189 		goto try_again;
2190 	}
2191 
2192 #ifdef PAE
2193 	for (i = 0; i < PDP_SIZE; i++)
2194 		pmap->pm_pdirpa[i] =
2195 		    pmap_pte2pa(pmap->pm_pdir[PDIR_SLOT_PTE + i]);
2196 #else
2197 	pmap->pm_pdirpa = pmap_pte2pa(pmap->pm_pdir[PDIR_SLOT_PTE]);
2198 #endif
2199 
2200 	LIST_INSERT_HEAD(&pmaps, pmap, pm_list);
2201 
2202 	mutex_exit(&pmaps_lock);
2203 
2204 	return (pmap);
2205 }
2206 
2207 /*
2208  * pmap_destroy: drop reference count on pmap.   free pmap if
2209  *	reference count goes to zero.
2210  */
2211 
2212 void
2213 pmap_destroy(struct pmap *pmap)
2214 {
2215 	int i;
2216 #ifdef DIAGNOSTIC
2217 	struct cpu_info *ci;
2218 	CPU_INFO_ITERATOR cii;
2219 #endif /* DIAGNOSTIC */
2220 
2221 	/*
2222 	 * if we have torn down this pmap, process deferred frees and
2223 	 * invalidations now.
2224 	 */
2225 	if (__predict_false(curlwp->l_md.md_gc_pmap == pmap)) {
2226 		pmap_update(pmap);
2227 	}
2228 
2229 	/*
2230 	 * drop reference count
2231 	 */
2232 
2233 	if (atomic_dec_uint_nv((unsigned *)&pmap->pm_obj[0].uo_refs) > 0) {
2234 		return;
2235 	}
2236 
2237 #ifdef DIAGNOSTIC
2238 	for (CPU_INFO_FOREACH(cii, ci))
2239 		if (ci->ci_pmap == pmap)
2240 			panic("destroying pmap being used");
2241 #endif /* DIAGNOSTIC */
2242 
2243 	/*
2244 	 * reference count is zero, free pmap resources and then free pmap.
2245 	 */
2246 #ifdef XEN
2247 	/*
2248 	 * Xen lazy APDP handling:
2249 	 * clear APDP_PDE if pmap is the currently mapped
2250 	 */
2251 	if (xpmap_ptom_masked(pmap_pdirpa(pmap, 0)) == (*APDP_PDE & PG_FRAME)) {
2252 		kpreempt_disable();
2253 		for (i = 0; i < PDP_SIZE; i++) {
2254 	        	pmap_pte_set(&APDP_PDE[i], 0);
2255 #ifdef PAE
2256 			/* clear shadow entry too */
2257 	    		pmap_pte_set(&APDP_PDE_SHADOW[i], 0);
2258 #endif
2259 		}
2260 		pmap_pte_flush();
2261 	        pmap_apte_flush(pmap_kernel());
2262 	        kpreempt_enable();
2263 	}
2264 #endif
2265 
2266 	/*
2267 	 * remove it from global list of pmaps
2268 	 */
2269 
2270 	mutex_enter(&pmaps_lock);
2271 	LIST_REMOVE(pmap, pm_list);
2272 	mutex_exit(&pmaps_lock);
2273 
2274 	/*
2275 	 * destroyed pmap shouldn't have remaining PTPs
2276 	 */
2277 
2278 	for (i = 0; i < PTP_LEVELS - 1; i++) {
2279 		KASSERT(pmap->pm_obj[i].uo_npages == 0);
2280 		KASSERT(TAILQ_EMPTY(&pmap->pm_obj[i].memq));
2281 	}
2282 
2283 	/*
2284 	 * MULTIPROCESSOR -- no need to flush out of other processors'
2285 	 * APTE space because we do that in pmap_unmap_ptes().
2286 	 */
2287 	pool_cache_put(&pmap_pdp_cache, pmap->pm_pdir);
2288 
2289 #ifdef USER_LDT
2290 	if (pmap->pm_ldt != NULL) {
2291 		/*
2292 		 * no need to switch the LDT; this address space is gone,
2293 		 * nothing is using it.
2294 		 *
2295 		 * No need to lock the pmap for ldt_free (or anything else),
2296 		 * we're the last one to use it.
2297 		 */
2298 		mutex_enter(&cpu_lock);
2299 		ldt_free(pmap->pm_ldt_sel);
2300 		mutex_exit(&cpu_lock);
2301 		uvm_km_free(kernel_map, (vaddr_t)pmap->pm_ldt,
2302 		    pmap->pm_ldt_len, UVM_KMF_WIRED);
2303 	}
2304 #endif
2305 
2306 	for (i = 0; i < PTP_LEVELS - 1; i++)
2307 		mutex_destroy(&pmap->pm_obj[i].vmobjlock);
2308 	pool_cache_put(&pmap_cache, pmap);
2309 }
2310 
2311 /*
2312  * pmap_remove_all: pmap is being torn down by the current thread.
2313  * avoid unnecessary invalidations.
2314  */
2315 
2316 void
2317 pmap_remove_all(struct pmap *pmap)
2318 {
2319 	lwp_t *l = curlwp;
2320 
2321 	KASSERT(l->l_md.md_gc_pmap == NULL);
2322 
2323 	l->l_md.md_gc_pmap = pmap;
2324 }
2325 
2326 #if defined(PMAP_FORK)
2327 /*
2328  * pmap_fork: perform any necessary data structure manipulation when
2329  * a VM space is forked.
2330  */
2331 
2332 void
2333 pmap_fork(struct pmap *pmap1, struct pmap *pmap2)
2334 {
2335 #ifdef USER_LDT
2336 	union descriptor *new_ldt;
2337 	size_t len;
2338 	int sel;
2339 
2340 	if (__predict_true(pmap1->pm_ldt == NULL)) {
2341 		return;
2342 	}
2343 
2344  retry:
2345 	if (pmap1->pm_ldt != NULL) {
2346 		len = pmap1->pm_ldt_len;
2347 		new_ldt = (union descriptor *)uvm_km_alloc(kernel_map, len, 0,
2348 		    UVM_KMF_WIRED);
2349 		mutex_enter(&cpu_lock);
2350 		sel = ldt_alloc(new_ldt, len);
2351 		if (sel == -1) {
2352 			mutex_exit(&cpu_lock);
2353 			uvm_km_free(kernel_map, (vaddr_t)new_ldt, len,
2354 			    UVM_KMF_WIRED);
2355 			printf("WARNING: pmap_fork: unable to allocate LDT\n");
2356 			return;
2357 		}
2358 	} else {
2359 		len = -1;
2360 		new_ldt = NULL;
2361 		sel = -1;
2362 		mutex_enter(&cpu_lock);
2363 	}
2364 
2365  	/* Copy the LDT, if necessary. */
2366  	if (pmap1->pm_ldt != NULL) {
2367 		if (len != pmap1->pm_ldt_len) {
2368 			if (len != -1) {
2369 				ldt_free(sel);
2370 				uvm_km_free(kernel_map, (vaddr_t)new_ldt,
2371 				    len, UVM_KMF_WIRED);
2372 			}
2373 			mutex_exit(&cpu_lock);
2374 			goto retry;
2375 		}
2376 
2377 		memcpy(new_ldt, pmap1->pm_ldt, len);
2378 		pmap2->pm_ldt = new_ldt;
2379 		pmap2->pm_ldt_len = pmap1->pm_ldt_len;
2380 		pmap2->pm_ldt_sel = sel;
2381 		len = -1;
2382 	}
2383 
2384 	if (len != -1) {
2385 		ldt_free(sel);
2386 		uvm_km_free(kernel_map, (vaddr_t)new_ldt, len,
2387 		    UVM_KMF_WIRED);
2388 	}
2389 	mutex_exit(&cpu_lock);
2390 #endif /* USER_LDT */
2391 }
2392 #endif /* PMAP_FORK */
2393 
2394 #ifdef USER_LDT
2395 
2396 /*
2397  * pmap_ldt_xcall: cross call used by pmap_ldt_sync.  if the named pmap
2398  * is active, reload LDTR.
2399  */
2400 static void
2401 pmap_ldt_xcall(void *arg1, void *arg2)
2402 {
2403 	struct pmap *pm;
2404 
2405 	kpreempt_disable();
2406 	pm = arg1;
2407 	if (curcpu()->ci_pmap == pm) {
2408 		lldt(pm->pm_ldt_sel);
2409 	}
2410 	kpreempt_enable();
2411 }
2412 
2413 /*
2414  * pmap_ldt_sync: LDT selector for the named pmap is changing.  swap
2415  * in the new selector on all CPUs.
2416  */
2417 void
2418 pmap_ldt_sync(struct pmap *pm)
2419 {
2420 	uint64_t where;
2421 
2422 	KASSERT(mutex_owned(&cpu_lock));
2423 
2424 	pmap_ldt_evcnt.ev_count++;
2425 	where = xc_broadcast(0, pmap_ldt_xcall, pm, NULL);
2426 	xc_wait(where);
2427 }
2428 
2429 /*
2430  * pmap_ldt_cleanup: if the pmap has a local LDT, deallocate it, and
2431  * restore the default.
2432  */
2433 
2434 void
2435 pmap_ldt_cleanup(struct lwp *l)
2436 {
2437 	pmap_t pmap = l->l_proc->p_vmspace->vm_map.pmap;
2438 	union descriptor *dp = NULL;
2439 	size_t len = 0;
2440 	int sel = -1;
2441 
2442 	if (__predict_true(pmap->pm_ldt == NULL)) {
2443 		return;
2444 	}
2445 
2446 	mutex_enter(&cpu_lock);
2447 	if (pmap->pm_ldt != NULL) {
2448 		sel = pmap->pm_ldt_sel;
2449 		dp = pmap->pm_ldt;
2450 		len = pmap->pm_ldt_len;
2451 		pmap->pm_ldt_sel = GSYSSEL(GLDT_SEL, SEL_KPL);
2452 		pmap->pm_ldt = NULL;
2453 		pmap->pm_ldt_len = 0;
2454 		pmap_ldt_sync(pmap);
2455 		ldt_free(sel);
2456 		uvm_km_free(kernel_map, (vaddr_t)dp, len, UVM_KMF_WIRED);
2457 	}
2458 	mutex_exit(&cpu_lock);
2459 }
2460 #endif /* USER_LDT */
2461 
2462 /*
2463  * pmap_activate: activate a process' pmap
2464  *
2465  * => must be called with kernel preemption disabled
2466  * => if lwp is the curlwp, then set ci_want_pmapload so that
2467  *    actual MMU context switch will be done by pmap_load() later
2468  */
2469 
2470 void
2471 pmap_activate(struct lwp *l)
2472 {
2473 	struct cpu_info *ci;
2474 	struct pmap *pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map);
2475 
2476 	KASSERT(kpreempt_disabled());
2477 
2478 	ci = curcpu();
2479 
2480 	if (l == ci->ci_curlwp) {
2481 		struct pcb *pcb;
2482 
2483 		KASSERT(ci->ci_want_pmapload == 0);
2484 		KASSERT(ci->ci_tlbstate != TLBSTATE_VALID);
2485 #ifdef KSTACK_CHECK_DR0
2486 		/*
2487 		 * setup breakpoint on the top of stack
2488 		 */
2489 		if (l == &lwp0)
2490 			dr0(0, 0, 0, 0);
2491 		else
2492 			dr0(KSTACK_LOWEST_ADDR(l), 1, 3, 1);
2493 #endif
2494 
2495 		/*
2496 		 * no need to switch to kernel vmspace because
2497 		 * it's a subset of any vmspace.
2498 		 */
2499 
2500 		if (pmap == pmap_kernel()) {
2501 			ci->ci_want_pmapload = 0;
2502 			return;
2503 		}
2504 
2505 		pcb = &l->l_addr->u_pcb;
2506 		ci->ci_want_pmapload = 1;
2507 
2508 #if defined(__x86_64__)
2509 		if (pcb->pcb_flags & PCB_GS64)
2510 			wrmsr(MSR_KERNELGSBASE, pcb->pcb_gs);
2511 		if (pcb->pcb_flags & PCB_FS64)
2512 			wrmsr(MSR_FSBASE, pcb->pcb_fs);
2513 #endif /* defined(__x86_64__) */
2514 	}
2515 }
2516 
2517 /*
2518  * pmap_reactivate: try to regain reference to the pmap.
2519  *
2520  * => must be called with kernel preemption disabled
2521  */
2522 
2523 static bool
2524 pmap_reactivate(struct pmap *pmap)
2525 {
2526 	struct cpu_info *ci;
2527 	uint32_t cpumask;
2528 	bool result;
2529 	uint32_t oldcpus;
2530 
2531 	ci = curcpu();
2532 	cpumask = ci->ci_cpumask;
2533 
2534 	KASSERT(kpreempt_disabled());
2535 #if defined(XEN) && defined(__x86_64__)
2536 	KASSERT(pmap->pm_pdirpa == xen_current_user_pgd);
2537 #elif defined(PAE)
2538 	KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(pmap_l3pd[0]));
2539 #elif !defined(XEN) || (defined(XEN) && defined(XEN3))
2540 	KASSERT(pmap->pm_pdirpa == pmap_pte2pa(rcr3()));
2541 #endif
2542 
2543 	/*
2544 	 * if we still have a lazy reference to this pmap,
2545 	 * we can assume that there was no tlb shootdown
2546 	 * for this pmap in the meantime.
2547 	 *
2548 	 * the order of events here is important as we must
2549 	 * synchronize with TLB shootdown interrupts.  declare
2550 	 * interest in invalidations (TLBSTATE_VALID) and then
2551 	 * check the cpumask, which the IPIs can change only
2552 	 * when the state is TLBSTATE_LAZY.
2553 	 */
2554 
2555 	ci->ci_tlbstate = TLBSTATE_VALID;
2556 	oldcpus = pmap->pm_cpus;
2557 	KASSERT((pmap->pm_kernel_cpus & cpumask) != 0);
2558 	if (oldcpus & cpumask) {
2559 		/* got it */
2560 		result = true;
2561 	} else {
2562 		/* must reload */
2563 		atomic_or_32(&pmap->pm_cpus, cpumask);
2564 		result = false;
2565 	}
2566 
2567 	return result;
2568 }
2569 
2570 /*
2571  * pmap_load: actually switch pmap.  (fill in %cr3 and LDT info)
2572  */
2573 
2574 void
2575 pmap_load(void)
2576 {
2577 	struct cpu_info *ci;
2578 	uint32_t cpumask;
2579 	struct pmap *pmap;
2580 	struct pmap *oldpmap;
2581 	struct lwp *l;
2582 	struct pcb *pcb;
2583 	uint64_t ncsw;
2584 
2585 	kpreempt_disable();
2586  retry:
2587 	ci = curcpu();
2588 	if (!ci->ci_want_pmapload) {
2589 		kpreempt_enable();
2590 		return;
2591 	}
2592 	cpumask = ci->ci_cpumask;
2593 	l = ci->ci_curlwp;
2594 	ncsw = l->l_ncsw;
2595 
2596 	/* should be able to take ipis. */
2597 	KASSERT(ci->ci_ilevel < IPL_IPI);
2598 #ifdef XEN
2599 	/* XXX not yet KASSERT(x86_read_psl() != 0); */
2600 #else
2601 	KASSERT((x86_read_psl() & PSL_I) != 0);
2602 #endif
2603 
2604 	KASSERT(l != NULL);
2605 	pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map);
2606 	KASSERT(pmap != pmap_kernel());
2607 	oldpmap = ci->ci_pmap;
2608 	pcb = &l->l_addr->u_pcb;
2609 
2610 	if (pmap == oldpmap) {
2611 		if (!pmap_reactivate(pmap)) {
2612 
2613 			/*
2614 			 * pmap has been changed during deactivated.
2615 			 * our tlb may be stale.
2616 			 */
2617 
2618 			tlbflush();
2619 		}
2620 
2621 		ci->ci_want_pmapload = 0;
2622 		kpreempt_enable();
2623 		return;
2624 	}
2625 
2626 	/*
2627 	 * grab a reference to the new pmap.
2628 	 */
2629 
2630 	pmap_reference(pmap);
2631 
2632 	/*
2633 	 * actually switch pmap.
2634 	 */
2635 
2636 	atomic_and_32(&oldpmap->pm_cpus, ~cpumask);
2637 	atomic_and_32(&oldpmap->pm_kernel_cpus, ~cpumask);
2638 
2639 #if defined(XEN) && defined(__x86_64__)
2640 	KASSERT(oldpmap->pm_pdirpa == xen_current_user_pgd ||
2641 	    oldpmap == pmap_kernel());
2642 #elif defined(PAE)
2643 	KASSERT(pmap_pdirpa(oldpmap, 0) == pmap_pte2pa(pmap_l3pd[0]));
2644 #elif !defined(XEN) || (defined(XEN) && defined(XEN3))
2645 	KASSERT(oldpmap->pm_pdirpa == pmap_pte2pa(rcr3()));
2646 #endif
2647 	KASSERT((pmap->pm_cpus & cpumask) == 0);
2648 	KASSERT((pmap->pm_kernel_cpus & cpumask) == 0);
2649 
2650 	/*
2651 	 * mark the pmap in use by this processor.  again we must
2652 	 * synchronize with TLB shootdown interrupts, so set the
2653 	 * state VALID first, then register us for shootdown events
2654 	 * on this pmap.
2655 	 */
2656 
2657 	ci->ci_tlbstate = TLBSTATE_VALID;
2658 	atomic_or_32(&pmap->pm_cpus, cpumask);
2659 	atomic_or_32(&pmap->pm_kernel_cpus, cpumask);
2660 	ci->ci_pmap = pmap;
2661 
2662 	/*
2663 	 * update tss.  now that we have registered for invalidations
2664 	 * from other CPUs, we're good to load the page tables.
2665 	 */
2666 #ifdef PAE
2667 	pcb->pcb_cr3 = pmap_l3paddr;
2668 #else
2669 	pcb->pcb_cr3 = pmap->pm_pdirpa;
2670 #endif
2671 #if defined(XEN) && defined(__x86_64__)
2672 	/* kernel pmap always in cr3 and should never go in user cr3 */
2673 	if (pmap_pdirpa(pmap, 0) != pmap_pdirpa(pmap_kernel(), 0)) {
2674 		/*
2675 		 * Map user space address in kernel space and load
2676 		 * user cr3
2677 		 */
2678 		int i, s;
2679 		pd_entry_t *old_pgd, *new_pgd;
2680 		paddr_t addr;
2681 		s = splvm();
2682 		new_pgd  = pmap->pm_pdir;
2683 		old_pgd = pmap_kernel()->pm_pdir;
2684 		addr = xpmap_ptom(pmap_pdirpa(pmap_kernel(), 0));
2685 		for (i = 0; i < PDIR_SLOT_PTE;
2686 		    i++, addr += sizeof(pd_entry_t)) {
2687 			if ((new_pgd[i] & PG_V) || (old_pgd[i] & PG_V))
2688 				xpq_queue_pte_update(addr, new_pgd[i]);
2689 		}
2690 		xpq_flush_queue(); /* XXXtlb */
2691 		tlbflush();
2692 		xen_set_user_pgd(pmap_pdirpa(pmap, 0));
2693 		xen_current_user_pgd = pmap_pdirpa(pmap, 0);
2694 		splx(s);
2695 	}
2696 #else /* XEN && x86_64 */
2697 #if defined(XEN)
2698 	/*
2699 	 * clear APDP slot, in case it points to a page table that has
2700 	 * been freed
2701 	 */
2702 	if (*APDP_PDE) {
2703 		int i;
2704 		for (i = 0; i < PDP_SIZE; i++) {
2705 			pmap_pte_set(&APDP_PDE[i], 0);
2706 #ifdef PAE
2707 			/* clear shadow entry too */
2708 			pmap_pte_set(&APDP_PDE_SHADOW[i], 0);
2709 #endif
2710 		}
2711 	}
2712 	/* lldt() does pmap_pte_flush() */
2713 #else /* XEN */
2714 #if defined(i386)
2715 	ci->ci_tss.tss_ldt = pmap->pm_ldt_sel;
2716 	ci->ci_tss.tss_cr3 = pcb->pcb_cr3;
2717 #endif
2718 #endif /* XEN */
2719 	lldt(pmap->pm_ldt_sel);
2720 #ifdef PAE
2721 	{
2722 	paddr_t l3_pd = xpmap_ptom_masked(pmap_l3paddr);
2723 	int i;
2724 	int s = splvm();
2725 	/* don't update the kernel L3 slot */
2726 	for (i = 0 ; i < PDP_SIZE - 1  ; i++, l3_pd += sizeof(pd_entry_t)) {
2727 		xpq_queue_pte_update(l3_pd,
2728 		    xpmap_ptom(pmap->pm_pdirpa[i]) | PG_V);
2729 	}
2730 	tlbflush();
2731 	xpq_flush_queue();
2732 	splx(s);
2733 	}
2734 #else /* PAE */
2735 	lcr3(pcb->pcb_cr3);
2736 #endif /* PAE */
2737 #endif /* XEN && x86_64 */
2738 
2739 	ci->ci_want_pmapload = 0;
2740 
2741 	/*
2742 	 * we're now running with the new pmap.  drop the reference
2743 	 * to the old pmap.  if we block, we need to go around again.
2744 	 */
2745 
2746 	pmap_destroy(oldpmap);
2747 	if (l->l_ncsw != ncsw) {
2748 		goto retry;
2749 	}
2750 
2751 	kpreempt_enable();
2752 }
2753 
2754 /*
2755  * pmap_deactivate: deactivate a process' pmap
2756  *
2757  * => must be called with kernel preemption disabled (high SPL is enough)
2758  */
2759 
2760 void
2761 pmap_deactivate(struct lwp *l)
2762 {
2763 	struct pmap *pmap;
2764 	struct cpu_info *ci;
2765 
2766 	KASSERT(kpreempt_disabled());
2767 
2768 	if (l != curlwp) {
2769 		return;
2770 	}
2771 
2772 	/*
2773 	 * wait for pending TLB shootdowns to complete.  necessary
2774 	 * because TLB shootdown state is per-CPU, and the LWP may
2775 	 * be coming off the CPU before it has a chance to call
2776 	 * pmap_update().
2777 	 */
2778 	pmap_tlb_shootwait();
2779 
2780 	ci = curcpu();
2781 
2782 	if (ci->ci_want_pmapload) {
2783 		KASSERT(vm_map_pmap(&l->l_proc->p_vmspace->vm_map)
2784 		    != pmap_kernel());
2785 		KASSERT(vm_map_pmap(&l->l_proc->p_vmspace->vm_map)
2786 		    != ci->ci_pmap || ci->ci_tlbstate != TLBSTATE_VALID);
2787 
2788 		/*
2789 		 * userspace has not been touched.
2790 		 * nothing to do here.
2791 		 */
2792 
2793 		ci->ci_want_pmapload = 0;
2794 		return;
2795 	}
2796 
2797 	pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map);
2798 
2799 	if (pmap == pmap_kernel()) {
2800 		return;
2801 	}
2802 
2803 #if defined(XEN) && defined(__x86_64__)
2804 	KASSERT(pmap->pm_pdirpa == xen_current_user_pgd);
2805 #elif defined(PAE)
2806 	KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(pmap_l3pd[0]));
2807 #elif !defined(XEN) || (defined(XEN) && defined(XEN3))
2808 	KASSERT(pmap->pm_pdirpa == pmap_pte2pa(rcr3()));
2809 #endif
2810 	KASSERT(ci->ci_pmap == pmap);
2811 
2812 	/*
2813 	 * we aren't interested in TLB invalidations for this pmap,
2814 	 * at least for the time being.
2815 	 */
2816 
2817 	KASSERT(ci->ci_tlbstate == TLBSTATE_VALID);
2818 	ci->ci_tlbstate = TLBSTATE_LAZY;
2819 }
2820 
2821 /*
2822  * end of lifecycle functions
2823  */
2824 
2825 /*
2826  * some misc. functions
2827  */
2828 
2829 static int
2830 pmap_pdes_invalid(vaddr_t va, pd_entry_t * const *pdes, pd_entry_t *lastpde)
2831 {
2832 	int i;
2833 	unsigned long index;
2834 	pd_entry_t pde;
2835 
2836 	for (i = PTP_LEVELS; i > 1; i--) {
2837 		index = pl_i(va, i);
2838 		pde = pdes[i - 2][index];
2839 		if ((pde & PG_V) == 0)
2840 			return i;
2841 	}
2842 	if (lastpde != NULL)
2843 		*lastpde = pde;
2844 	return 0;
2845 }
2846 
2847 /*
2848  * pmap_extract: extract a PA for the given VA
2849  */
2850 
2851 bool
2852 pmap_extract(struct pmap *pmap, vaddr_t va, paddr_t *pap)
2853 {
2854 	pt_entry_t *ptes, pte;
2855 	pd_entry_t pde;
2856 	pd_entry_t * const *pdes;
2857 	struct pmap *pmap2;
2858 	struct cpu_info *ci;
2859 	vaddr_t pa;
2860 	lwp_t *l;
2861 	bool hard, rv;
2862 
2863 	rv = false;
2864 	pa = 0;
2865 	l = curlwp;
2866 
2867 	KPREEMPT_DISABLE(l);
2868 	ci = l->l_cpu;
2869 	if (__predict_true(!ci->ci_want_pmapload && ci->ci_pmap == pmap) ||
2870 	    pmap == pmap_kernel()) {
2871 		/*
2872 		 * no need to lock, because it's pmap_kernel() or our
2873 		 * own pmap and is active.  if a user pmap, the caller
2874 		 * will hold the vm_map write/read locked and so prevent
2875 		 * entries from disappearing while we are here.  ptps
2876 		 * can disappear via pmap_remove(), pmap_protect() and
2877 		 * pmap_collect(), but they are called with the vm_map
2878 		 * write locked.
2879 		 */
2880 		hard = false;
2881 		ptes = PTE_BASE;
2882 		pdes = normal_pdes;
2883 	} else {
2884 		/* we lose, do it the hard way. */
2885 		hard = true;
2886 		pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
2887 	}
2888 	if (pmap_pdes_valid(va, pdes, &pde)) {
2889 		pte = ptes[pl1_i(va)];
2890 		if (pde & PG_PS) {
2891 			pa = (pde & PG_LGFRAME) | (va & (NBPD_L2 - 1));
2892 			rv = true;
2893 		} else if (__predict_true((pte & PG_V) != 0)) {
2894 			pa = pmap_pte2pa(pte) | (va & (NBPD_L1 - 1));
2895 			rv = true;
2896 		}
2897 	}
2898 	if (__predict_false(hard)) {
2899 		pmap_unmap_ptes(pmap, pmap2);
2900 	}
2901 	KPREEMPT_ENABLE(l);
2902 	if (pap != NULL) {
2903 		*pap = pa;
2904 	}
2905 	return rv;
2906 }
2907 
2908 
2909 /*
2910  * vtophys: virtual address to physical address.  For use by
2911  * machine-dependent code only.
2912  */
2913 
2914 paddr_t
2915 vtophys(vaddr_t va)
2916 {
2917 	paddr_t pa;
2918 
2919 	if (pmap_extract(pmap_kernel(), va, &pa) == true)
2920 		return (pa);
2921 	return (0);
2922 }
2923 
2924 #ifdef XEN
2925 /*
2926  * pmap_extract_ma: extract a MA for the given VA
2927  */
2928 
2929 bool
2930 pmap_extract_ma(struct pmap *pmap, vaddr_t va, paddr_t *pap)
2931 {
2932 	pt_entry_t *ptes, pte;
2933 	pd_entry_t pde;
2934 	pd_entry_t * const *pdes;
2935 	struct pmap *pmap2;
2936 
2937 	kpreempt_disable();
2938 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
2939 	if (!pmap_pdes_valid(va, pdes, &pde)) {
2940 		pmap_unmap_ptes(pmap, pmap2);
2941 		kpreempt_enable();
2942 		return false;
2943 	}
2944 
2945 	pte = ptes[pl1_i(va)];
2946 	pmap_unmap_ptes(pmap, pmap2);
2947 	kpreempt_enable();
2948 
2949 	if (__predict_true((pte & PG_V) != 0)) {
2950 		if (pap != NULL)
2951 			*pap = (pte & PG_FRAME) | (va & (NBPD_L1 - 1));
2952 		return true;
2953 	}
2954 
2955 	return false;
2956 }
2957 
2958 /*
2959  * vtomach: virtual address to machine address.  For use by
2960  * machine-dependent code only.
2961  */
2962 
2963 paddr_t
2964 vtomach(vaddr_t va)
2965 {
2966 	paddr_t pa;
2967 
2968 	if (pmap_extract_ma(pmap_kernel(), va, &pa) == true)
2969 		return (pa);
2970 	return (0);
2971 }
2972 
2973 #endif /* XEN */
2974 
2975 
2976 
2977 /*
2978  * pmap_virtual_space: used during bootup [pmap_steal_memory] to
2979  *	determine the bounds of the kernel virtual addess space.
2980  */
2981 
2982 void
2983 pmap_virtual_space(vaddr_t *startp, vaddr_t *endp)
2984 {
2985 	*startp = virtual_avail;
2986 	*endp = virtual_end;
2987 }
2988 
2989 /*
2990  * pmap_map: map a range of PAs into kvm.
2991  *
2992  * => used during crash dump
2993  * => XXX: pmap_map() should be phased out?
2994  */
2995 
2996 vaddr_t
2997 pmap_map(vaddr_t va, paddr_t spa, paddr_t epa, vm_prot_t prot)
2998 {
2999 	while (spa < epa) {
3000 		pmap_kenter_pa(va, spa, prot);
3001 		va += PAGE_SIZE;
3002 		spa += PAGE_SIZE;
3003 	}
3004 	pmap_update(pmap_kernel());
3005 	return va;
3006 }
3007 
3008 /*
3009  * pmap_zero_page: zero a page
3010  */
3011 
3012 void
3013 pmap_zero_page(paddr_t pa)
3014 {
3015 	pt_entry_t *zpte;
3016 	void *zerova;
3017 	int id;
3018 
3019 	kpreempt_disable();
3020 	id = cpu_number();
3021 	zpte = PTESLEW(zero_pte, id);
3022 	zerova = VASLEW(zerop, id);
3023 
3024 #ifdef DIAGNOSTIC
3025 	if (*zpte)
3026 		panic("pmap_zero_page: lock botch");
3027 #endif
3028 
3029 	pmap_pte_set(zpte, pmap_pa2pte(pa) | PG_V | PG_RW | PG_M | PG_U | PG_k);
3030 	pmap_pte_flush();
3031 	pmap_update_pg((vaddr_t)zerova);		/* flush TLB */
3032 
3033 	memset(zerova, 0, PAGE_SIZE);
3034 
3035 #if defined(DIAGNOSTIC) || defined(XEN)
3036 	pmap_pte_set(zpte, 0);				/* zap ! */
3037 	pmap_pte_flush();
3038 #endif
3039 	kpreempt_enable();
3040 }
3041 
3042 /*
3043  * pmap_pagezeroidle: the same, for the idle loop page zero'er.
3044  * Returns true if the page was zero'd, false if we aborted for
3045  * some reason.
3046  */
3047 
3048 bool
3049 pmap_pageidlezero(paddr_t pa)
3050 {
3051 	pt_entry_t *zpte;
3052 	void *zerova;
3053 	bool rv;
3054 	int id;
3055 
3056 	id = cpu_number();
3057 	zpte = PTESLEW(zero_pte, id);
3058 	zerova = VASLEW(zerop, id);
3059 
3060 	KASSERT(cpu_feature & CPUID_SSE2);
3061 	KASSERT(*zpte == 0);
3062 
3063 	pmap_pte_set(zpte, pmap_pa2pte(pa) | PG_V | PG_RW | PG_M | PG_U | PG_k);
3064 	pmap_pte_flush();
3065 	pmap_update_pg((vaddr_t)zerova);		/* flush TLB */
3066 
3067 	rv = sse2_idlezero_page(zerova);
3068 
3069 #if defined(DIAGNOSTIC) || defined(XEN)
3070 	pmap_pte_set(zpte, 0);				/* zap ! */
3071 	pmap_pte_flush();
3072 #endif
3073 
3074 	return rv;
3075 }
3076 
3077 /*
3078  * pmap_copy_page: copy a page
3079  */
3080 
3081 void
3082 pmap_copy_page(paddr_t srcpa, paddr_t dstpa)
3083 {
3084 	pt_entry_t *spte;
3085 	pt_entry_t *dpte;
3086 	void *csrcva;
3087 	void *cdstva;
3088 	int id;
3089 
3090 	kpreempt_disable();
3091 	id = cpu_number();
3092 	spte = PTESLEW(csrc_pte,id);
3093 	dpte = PTESLEW(cdst_pte,id);
3094 	csrcva = VASLEW(csrcp, id);
3095 	cdstva = VASLEW(cdstp, id);
3096 
3097 	KASSERT(*spte == 0 && *dpte == 0);
3098 
3099 	pmap_pte_set(spte, pmap_pa2pte(srcpa) | PG_V | PG_RW | PG_U | PG_k);
3100 	pmap_pte_set(dpte,
3101 	    pmap_pa2pte(dstpa) | PG_V | PG_RW | PG_M | PG_U | PG_k);
3102 	pmap_pte_flush();
3103 	pmap_update_2pg((vaddr_t)csrcva, (vaddr_t)cdstva);
3104 
3105 	memcpy(cdstva, csrcva, PAGE_SIZE);
3106 
3107 #if defined(DIAGNOSTIC) || defined(XEN)
3108 	pmap_pte_set(spte, 0);
3109 	pmap_pte_set(dpte, 0);
3110 	pmap_pte_flush();
3111 #endif
3112 	kpreempt_enable();
3113 }
3114 
3115 static pt_entry_t *
3116 pmap_map_ptp(struct vm_page *ptp)
3117 {
3118 	pt_entry_t *ptppte;
3119 	void *ptpva;
3120 	int id;
3121 
3122 	KASSERT(kpreempt_disabled());
3123 
3124 	id = cpu_number();
3125 	ptppte = PTESLEW(ptp_pte, id);
3126 	ptpva = VASLEW(ptpp, id);
3127 #if !defined(XEN)
3128 	pmap_pte_set(ptppte, pmap_pa2pte(VM_PAGE_TO_PHYS(ptp)) | PG_V | PG_M |
3129 	    PG_RW | PG_U | PG_k);
3130 #else
3131 	pmap_pte_set(ptppte, pmap_pa2pte(VM_PAGE_TO_PHYS(ptp)) | PG_V | PG_M |
3132 	    PG_U | PG_k);
3133 #endif
3134 	pmap_pte_flush();
3135 	pmap_update_pg((vaddr_t)ptpva);
3136 
3137 	return (pt_entry_t *)ptpva;
3138 }
3139 
3140 static void
3141 pmap_unmap_ptp(void)
3142 {
3143 #if defined(DIAGNOSTIC) || defined(XEN)
3144 	pt_entry_t *pte;
3145 
3146 	KASSERT(kpreempt_disabled());
3147 
3148 	pte = PTESLEW(ptp_pte, cpu_number());
3149 	if (*pte != 0) {
3150 		pmap_pte_set(pte, 0);
3151 		pmap_pte_flush();
3152 	}
3153 #endif
3154 }
3155 
3156 static pt_entry_t *
3157 pmap_map_pte(struct pmap *pmap, struct vm_page *ptp, vaddr_t va)
3158 {
3159 
3160 	KASSERT(kpreempt_disabled());
3161 	if (pmap_is_curpmap(pmap)) {
3162 		return &PTE_BASE[pl1_i(va)]; /* (k)vtopte */
3163 	}
3164 	KASSERT(ptp != NULL);
3165 	return pmap_map_ptp(ptp) + pl1_pi(va);
3166 }
3167 
3168 static void
3169 pmap_unmap_pte(void)
3170 {
3171 
3172 	KASSERT(kpreempt_disabled());
3173 
3174 	pmap_unmap_ptp();
3175 }
3176 
3177 /*
3178  * p m a p   r e m o v e   f u n c t i o n s
3179  *
3180  * functions that remove mappings
3181  */
3182 
3183 /*
3184  * pmap_remove_ptes: remove PTEs from a PTP
3185  *
3186  * => must have proper locking on pmap_master_lock
3187  * => caller must hold pmap's lock
3188  * => PTP must be mapped into KVA
3189  * => PTP should be null if pmap == pmap_kernel()
3190  * => must be called with kernel preemption disabled
3191  * => returns composite pte if at least one page should be shot down
3192  */
3193 
3194 static pt_entry_t
3195 pmap_remove_ptes(struct pmap *pmap, struct vm_page *ptp, vaddr_t ptpva,
3196 		 vaddr_t startva, vaddr_t endva, int flags,
3197 		 struct pv_entry **pv_tofree)
3198 {
3199 	struct pv_entry *pve;
3200 	pt_entry_t *pte = (pt_entry_t *) ptpva;
3201 	pt_entry_t opte, xpte = 0;
3202 
3203 	KASSERT(pmap == pmap_kernel() || mutex_owned(&pmap->pm_lock));
3204 	KASSERT(kpreempt_disabled());
3205 
3206 	/*
3207 	 * note that ptpva points to the PTE that maps startva.   this may
3208 	 * or may not be the first PTE in the PTP.
3209 	 *
3210 	 * we loop through the PTP while there are still PTEs to look at
3211 	 * and the wire_count is greater than 1 (because we use the wire_count
3212 	 * to keep track of the number of real PTEs in the PTP).
3213 	 */
3214 
3215 	for (/*null*/; startva < endva && (ptp == NULL || ptp->wire_count > 1)
3216 			     ; pte++, startva += PAGE_SIZE) {
3217 		struct vm_page *pg;
3218 		struct pmap_page *pp;
3219 
3220 		if (!pmap_valid_entry(*pte))
3221 			continue;			/* VA not mapped */
3222 		if ((flags & PMAP_REMOVE_SKIPWIRED) && (*pte & PG_W)) {
3223 			continue;
3224 		}
3225 
3226 		/* atomically save the old PTE and zap! it */
3227 		opte = pmap_pte_testset(pte, 0);
3228 		if (!pmap_valid_entry(opte)) {
3229 			continue;
3230 		}
3231 
3232 		pmap_exec_account(pmap, startva, opte, 0);
3233 		pmap_stats_update_bypte(pmap, 0, opte);
3234 		xpte |= opte;
3235 
3236 		if (ptp) {
3237 			ptp->wire_count--;		/* dropping a PTE */
3238 			/* Make sure that the PDE is flushed */
3239 			if (ptp->wire_count <= 1)
3240 				xpte |= PG_U;
3241 		}
3242 
3243 		/*
3244 		 * if we are not on a pv_head list we are done.
3245 		 */
3246 
3247 		if ((opte & PG_PVLIST) == 0) {
3248 #if defined(DIAGNOSTIC) && !defined(DOM0OPS)
3249 			if (PHYS_TO_VM_PAGE(pmap_pte2pa(opte)) != NULL)
3250 				panic("pmap_remove_ptes: managed page without "
3251 				      "PG_PVLIST for 0x%lx", startva);
3252 #endif
3253 			continue;
3254 		}
3255 
3256 		pg = PHYS_TO_VM_PAGE(pmap_pte2pa(opte));
3257 #ifdef DIAGNOSTIC
3258 		if (pg == NULL)
3259 			panic("pmap_remove_ptes: unmanaged page marked "
3260 			      "PG_PVLIST, va = 0x%lx, pa = 0x%lx",
3261 			      startva, (u_long)pmap_pte2pa(opte));
3262 #endif
3263 
3264 		/* sync R/M bits */
3265 		pp = VM_PAGE_TO_PP(pg);
3266 		pp_lock(pp);
3267 		pp->pp_attrs |= opte;
3268 		pve = pmap_remove_pv(pp, ptp, startva);
3269 		pp_unlock(pp);
3270 
3271 		if (pve != NULL) {
3272 			pve->pve_next = *pv_tofree;
3273 			*pv_tofree = pve;
3274 		}
3275 
3276 		/* end of "for" loop: time for next pte */
3277 	}
3278 
3279 	return xpte;
3280 }
3281 
3282 
3283 /*
3284  * pmap_remove_pte: remove a single PTE from a PTP
3285  *
3286  * => must have proper locking on pmap_master_lock
3287  * => caller must hold pmap's lock
3288  * => PTP must be mapped into KVA
3289  * => PTP should be null if pmap == pmap_kernel()
3290  * => returns true if we removed a mapping
3291  * => must be called with kernel preemption disabled
3292  */
3293 
3294 static bool
3295 pmap_remove_pte(struct pmap *pmap, struct vm_page *ptp, pt_entry_t *pte,
3296 		vaddr_t va, int flags, struct pv_entry **pv_tofree)
3297 {
3298 	pt_entry_t opte;
3299 	struct pv_entry *pve;
3300 	struct vm_page *pg;
3301 	struct pmap_page *pp;
3302 
3303 	KASSERT(pmap == pmap_kernel() || mutex_owned(&pmap->pm_lock));
3304 	KASSERT(pmap == pmap_kernel() || kpreempt_disabled());
3305 
3306 	if (!pmap_valid_entry(*pte))
3307 		return(false);		/* VA not mapped */
3308 	if ((flags & PMAP_REMOVE_SKIPWIRED) && (*pte & PG_W)) {
3309 		return(false);
3310 	}
3311 
3312 	/* atomically save the old PTE and zap! it */
3313 	opte = pmap_pte_testset(pte, 0);
3314 	if (!pmap_valid_entry(opte)) {
3315 		return false;
3316 	}
3317 
3318 	pmap_exec_account(pmap, va, opte, 0);
3319 	pmap_stats_update_bypte(pmap, 0, opte);
3320 
3321 	if (opte & PG_U)
3322 		pmap_tlb_shootdown(pmap, va, 0, opte);
3323 
3324 	if (ptp) {
3325 		ptp->wire_count--;		/* dropping a PTE */
3326 		/* Make sure that the PDE is flushed */
3327 		if ((ptp->wire_count <= 1) && !(opte & PG_U))
3328 			pmap_tlb_shootdown(pmap, va, 0, opte);
3329 	}
3330 
3331 	/*
3332 	 * if we are not on a pv_head list we are done.
3333 	 */
3334 
3335 	if ((opte & PG_PVLIST) == 0) {
3336 #if defined(DIAGNOSTIC) && !defined(DOM0OPS)
3337 		if (PHYS_TO_VM_PAGE(pmap_pte2pa(opte)) != NULL)
3338 			panic("pmap_remove_pte: managed page without "
3339 			      "PG_PVLIST for 0x%lx", va);
3340 #endif
3341 		return(true);
3342 	}
3343 
3344 	pg = PHYS_TO_VM_PAGE(pmap_pte2pa(opte));
3345 #ifdef DIAGNOSTIC
3346 	if (pg == NULL)
3347 		panic("pmap_remove_pte: unmanaged page marked "
3348 		    "PG_PVLIST, va = 0x%lx, pa = 0x%lx", va,
3349 		    (u_long)(pmap_pte2pa(opte)));
3350 #endif
3351 
3352 	/* sync R/M bits */
3353 	pp = VM_PAGE_TO_PP(pg);
3354 	pp_lock(pp);
3355 	pp->pp_attrs |= opte;
3356 	pve = pmap_remove_pv(pp, ptp, va);
3357 	pp_unlock(pp);
3358 
3359 	if (pve) {
3360 		pve->pve_next = *pv_tofree;
3361 		*pv_tofree = pve;
3362 	}
3363 
3364 	return(true);
3365 }
3366 
3367 /*
3368  * pmap_remove: top level mapping removal function
3369  *
3370  * => caller should not be holding any pmap locks
3371  */
3372 
3373 void
3374 pmap_remove(struct pmap *pmap, vaddr_t sva, vaddr_t eva)
3375 {
3376 	pmap_do_remove(pmap, sva, eva, PMAP_REMOVE_ALL);
3377 }
3378 
3379 /*
3380  * pmap_do_remove: mapping removal guts
3381  *
3382  * => caller should not be holding any pmap locks
3383  */
3384 
3385 static void
3386 pmap_do_remove(struct pmap *pmap, vaddr_t sva, vaddr_t eva, int flags)
3387 {
3388 	pt_entry_t *ptes, xpte = 0;
3389 	pd_entry_t pde;
3390 	pd_entry_t * const *pdes;
3391 	struct pv_entry *pv_tofree = NULL;
3392 	bool result;
3393 	paddr_t ptppa;
3394 	vaddr_t blkendva, va = sva;
3395 	struct vm_page *ptp;
3396 	struct pmap *pmap2;
3397 
3398 	kpreempt_disable();
3399 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);	/* locks pmap */
3400 
3401 	/*
3402 	 * removing one page?  take shortcut function.
3403 	 */
3404 
3405 	if (va + PAGE_SIZE == eva) {
3406 		if (pmap_pdes_valid(va, pdes, &pde)) {
3407 
3408 			/* PA of the PTP */
3409 			ptppa = pmap_pte2pa(pde);
3410 
3411 			/* get PTP if non-kernel mapping */
3412 			if (pmap == pmap_kernel()) {
3413 				/* we never free kernel PTPs */
3414 				ptp = NULL;
3415 			} else {
3416 				ptp = pmap_find_ptp(pmap, va, ptppa, 1);
3417 #ifdef DIAGNOSTIC
3418 				if (ptp == NULL)
3419 					panic("pmap_remove: unmanaged "
3420 					      "PTP detected");
3421 #endif
3422 			}
3423 
3424 			/* do it! */
3425 			result = pmap_remove_pte(pmap, ptp,
3426 			    &ptes[pl1_i(va)], va, flags, &pv_tofree);
3427 
3428 			/*
3429 			 * if mapping removed and the PTP is no longer
3430 			 * being used, free it!
3431 			 */
3432 
3433 			if (result && ptp && ptp->wire_count <= 1)
3434 				pmap_free_ptp(pmap, ptp, va, ptes, pdes);
3435 		}
3436 	} else for (/* null */ ; va < eva ; va = blkendva) {
3437 		int lvl;
3438 
3439 		/* determine range of block */
3440 		blkendva = x86_round_pdr(va+1);
3441 		if (blkendva > eva)
3442 			blkendva = eva;
3443 
3444 		/*
3445 		 * XXXCDC: our PTE mappings should never be removed
3446 		 * with pmap_remove!  if we allow this (and why would
3447 		 * we?) then we end up freeing the pmap's page
3448 		 * directory page (PDP) before we are finished using
3449 		 * it when we hit in in the recursive mapping.  this
3450 		 * is BAD.
3451 		 *
3452 		 * long term solution is to move the PTEs out of user
3453 		 * address space.  and into kernel address space (up
3454 		 * with APTE).  then we can set VM_MAXUSER_ADDRESS to
3455 		 * be VM_MAX_ADDRESS.
3456 		 */
3457 
3458 		if (pl_i(va, PTP_LEVELS) == PDIR_SLOT_PTE)
3459 			/* XXXCDC: ugly hack to avoid freeing PDP here */
3460 			continue;
3461 
3462 		lvl = pmap_pdes_invalid(va, pdes, &pde);
3463 		if (lvl != 0) {
3464 			/*
3465 			 * skip a range corresponding to an invalid pde.
3466 			 */
3467 			blkendva = (va & ptp_masks[lvl - 1]) + nbpd[lvl - 1];
3468  			continue;
3469 		}
3470 
3471 		/* PA of the PTP */
3472 		ptppa = pmap_pte2pa(pde);
3473 
3474 		/* get PTP if non-kernel mapping */
3475 		if (pmap == pmap_kernel()) {
3476 			/* we never free kernel PTPs */
3477 			ptp = NULL;
3478 		} else {
3479 			ptp = pmap_find_ptp(pmap, va, ptppa, 1);
3480 #ifdef DIAGNOSTIC
3481 			if (ptp == NULL)
3482 				panic("pmap_remove: unmanaged PTP "
3483 				      "detected");
3484 #endif
3485 		}
3486 		xpte |= pmap_remove_ptes(pmap, ptp,
3487 		    (vaddr_t)&ptes[pl1_i(va)], va, blkendva,
3488 		    flags, &pv_tofree);
3489 
3490 		/* if PTP is no longer being used, free it! */
3491 		if (ptp && ptp->wire_count <= 1) {
3492 			pmap_free_ptp(pmap, ptp, va, ptes, pdes);
3493 		}
3494 		if ((xpte & PG_U) != 0)
3495 			pmap_tlb_shootdown(pmap, sva, eva, xpte);
3496 	}
3497 	pmap_unmap_ptes(pmap, pmap2);		/* unlock pmap */
3498 	kpreempt_enable();
3499 
3500 	/* Now we free unused PVs */
3501 	if (pv_tofree)
3502 		pmap_free_pvs(pv_tofree);
3503 }
3504 
3505 /*
3506  * pmap_sync_pv: clear pte bits and return the old value of the pte.
3507  *
3508  * => called with pp_lock held. (thus preemption disabled)
3509  * => issues tlb shootdowns if necessary.
3510  */
3511 
3512 static int
3513 pmap_sync_pv(struct pv_pte *pvpte, pt_entry_t expect, int clearbits,
3514     pt_entry_t *optep)
3515 {
3516 	struct pmap *pmap;
3517 	struct vm_page *ptp;
3518 	vaddr_t va;
3519 	pt_entry_t *ptep;
3520 	pt_entry_t opte;
3521 	pt_entry_t npte;
3522 	bool need_shootdown;
3523 
3524 	ptp = pvpte->pte_ptp;
3525 	va = pvpte->pte_va;
3526 	KASSERT(ptp == NULL || ptp->uobject != NULL);
3527 	KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset);
3528 	pmap = ptp_to_pmap(ptp);
3529 
3530 	KASSERT((expect & ~(PG_FRAME | PG_V)) == 0);
3531 	KASSERT((expect & PG_V) != 0);
3532 	KASSERT(clearbits == ~0 || (clearbits & ~(PG_M | PG_U | PG_RW)) == 0);
3533 	KASSERT(kpreempt_disabled());
3534 
3535 	ptep = pmap_map_pte(pmap, ptp, va);
3536 	do {
3537 		opte = *ptep;
3538 		KASSERT((opte & (PG_M | PG_U)) != PG_M);
3539 		KASSERT((opte & (PG_U | PG_V)) != PG_U);
3540 		KASSERT(opte == 0 || (opte & PG_V) != 0);
3541 		if ((opte & (PG_FRAME | PG_V)) != expect) {
3542 
3543 			/*
3544 			 * we lost a race with a V->P operation like
3545 			 * pmap_remove().  wait for the competitor
3546 			 * reflecting pte bits into mp_attrs.
3547 			 *
3548 			 * issue a redundant TLB shootdown so that
3549 			 * we can wait for its completion.
3550 			 */
3551 
3552 			pmap_unmap_pte();
3553 			if (clearbits != 0) {
3554 				pmap_tlb_shootdown(pmap, va, 0,
3555 				    (pmap == pmap_kernel() ? PG_G : 0));
3556 			}
3557 			return EAGAIN;
3558 		}
3559 
3560 		/*
3561 		 * check if there's anything to do on this pte.
3562 		 */
3563 
3564 		if ((opte & clearbits) == 0) {
3565 			need_shootdown = false;
3566 			break;
3567 		}
3568 
3569 		/*
3570 		 * we need a shootdown if the pte is cached. (PG_U)
3571 		 *
3572 		 * ...unless we are clearing only the PG_RW bit and
3573 		 * it isn't cached as RW. (PG_M)
3574 		 */
3575 
3576 		need_shootdown = (opte & PG_U) != 0 &&
3577 		    !(clearbits == PG_RW && (opte & PG_M) == 0);
3578 
3579 		npte = opte & ~clearbits;
3580 
3581 		/*
3582 		 * if we need a shootdown anyway, clear PG_U and PG_M.
3583 		 */
3584 
3585 		if (need_shootdown) {
3586 			npte &= ~(PG_U | PG_M);
3587 		}
3588 		KASSERT((npte & (PG_M | PG_U)) != PG_M);
3589 		KASSERT((npte & (PG_U | PG_V)) != PG_U);
3590 		KASSERT(npte == 0 || (opte & PG_V) != 0);
3591 	} while (pmap_pte_cas(ptep, opte, npte) != opte);
3592 
3593 	if (need_shootdown) {
3594 		pmap_tlb_shootdown(pmap, va, 0, opte);
3595 	}
3596 	pmap_unmap_pte();
3597 
3598 	*optep = opte;
3599 	return 0;
3600 }
3601 
3602 /*
3603  * pmap_page_remove: remove a managed vm_page from all pmaps that map it
3604  *
3605  * => R/M bits are sync'd back to attrs
3606  */
3607 
3608 void
3609 pmap_page_remove(struct vm_page *pg)
3610 {
3611 	struct pmap_page *pp;
3612 	struct pv_pte *pvpte;
3613 	struct pv_entry *killlist = NULL;
3614 	struct vm_page *ptp;
3615 	pt_entry_t expect;
3616 	lwp_t *l;
3617 	int count;
3618 
3619 #ifdef DIAGNOSTIC
3620 	int bank, off;
3621 
3622 	bank = vm_physseg_find(atop(VM_PAGE_TO_PHYS(pg)), &off);
3623 	if (bank == -1)
3624 		panic("pmap_page_remove: unmanaged page?");
3625 #endif
3626 
3627 	l = curlwp;
3628 	pp = VM_PAGE_TO_PP(pg);
3629 	expect = pmap_pa2pte(VM_PAGE_TO_PHYS(pg)) | PG_V;
3630 	count = SPINLOCK_BACKOFF_MIN;
3631 	kpreempt_disable();
3632 startover:
3633 	pp_lock(pp);
3634 	while ((pvpte = pv_pte_first(pp)) != NULL) {
3635 		struct pmap *pmap;
3636 		struct pv_entry *pve;
3637 		pt_entry_t opte;
3638 		vaddr_t va;
3639 		int error;
3640 
3641 		/*
3642 		 * add a reference to the pmap before clearing the pte.
3643 		 * otherwise the pmap can disappear behind us.
3644 		 */
3645 
3646 		ptp = pvpte->pte_ptp;
3647 		pmap = ptp_to_pmap(ptp);
3648 		if (ptp != NULL) {
3649 			pmap_reference(pmap);
3650 		}
3651 
3652 		error = pmap_sync_pv(pvpte, expect, ~0, &opte);
3653 		if (error == EAGAIN) {
3654 			int hold_count;
3655 			pp_unlock(pp);
3656 			KERNEL_UNLOCK_ALL(curlwp, &hold_count);
3657 			if (ptp != NULL) {
3658 				pmap_destroy(pmap);
3659 			}
3660 			SPINLOCK_BACKOFF(count);
3661 			KERNEL_LOCK(hold_count, curlwp);
3662 			goto startover;
3663 		}
3664 
3665 		pp->pp_attrs |= opte;
3666 		va = pvpte->pte_va;
3667 		pve = pmap_remove_pv(pp, ptp, va);
3668 		pp_unlock(pp);
3669 
3670 		/* update the PTP reference count.  free if last reference. */
3671 		if (ptp != NULL) {
3672 			struct pmap *pmap2;
3673 			pt_entry_t *ptes;
3674 			pd_entry_t * const *pdes;
3675 
3676 			KASSERT(pmap != pmap_kernel());
3677 
3678 			pmap_tlb_shootwait();
3679 			pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
3680 			pmap_stats_update_bypte(pmap, 0, opte);
3681 			ptp->wire_count--;
3682 			if (ptp->wire_count <= 1) {
3683 				pmap_free_ptp(pmap, ptp, va, ptes, pdes);
3684 			}
3685 			pmap_unmap_ptes(pmap, pmap2);
3686 			pmap_destroy(pmap);
3687 		} else {
3688 			KASSERT(pmap == pmap_kernel());
3689 			pmap_stats_update_bypte(pmap, 0, opte);
3690 		}
3691 
3692 		if (pve != NULL) {
3693 			pve->pve_next = killlist;	/* mark it for death */
3694 			killlist = pve;
3695 		}
3696 		pp_lock(pp);
3697 	}
3698 	pp_unlock(pp);
3699 	kpreempt_enable();
3700 
3701 	/* Now free unused pvs. */
3702 	pmap_free_pvs(killlist);
3703 }
3704 
3705 /*
3706  * p m a p   a t t r i b u t e  f u n c t i o n s
3707  * functions that test/change managed page's attributes
3708  * since a page can be mapped multiple times we must check each PTE that
3709  * maps it by going down the pv lists.
3710  */
3711 
3712 /*
3713  * pmap_test_attrs: test a page's attributes
3714  */
3715 
3716 bool
3717 pmap_test_attrs(struct vm_page *pg, unsigned testbits)
3718 {
3719 	struct pmap_page *pp;
3720 	struct pv_pte *pvpte;
3721 	pt_entry_t expect;
3722 	u_int result;
3723 
3724 #if DIAGNOSTIC
3725 	int bank, off;
3726 
3727 	bank = vm_physseg_find(atop(VM_PAGE_TO_PHYS(pg)), &off);
3728 	if (bank == -1)
3729 		panic("pmap_test_attrs: unmanaged page?");
3730 #endif
3731 
3732 	pp = VM_PAGE_TO_PP(pg);
3733 	if ((pp->pp_attrs & testbits) != 0) {
3734 		return true;
3735 	}
3736 	expect = pmap_pa2pte(VM_PAGE_TO_PHYS(pg)) | PG_V;
3737 	pp_lock(pp);
3738 	for (pvpte = pv_pte_first(pp); pvpte; pvpte = pv_pte_next(pp, pvpte)) {
3739 		pt_entry_t opte;
3740 		int error;
3741 
3742 		if ((pp->pp_attrs & testbits) != 0) {
3743 			break;
3744 		}
3745 		error = pmap_sync_pv(pvpte, expect, 0, &opte);
3746 		if (error == 0) {
3747 			pp->pp_attrs |= opte;
3748 		}
3749 	}
3750 	result = pp->pp_attrs & testbits;
3751 	pp_unlock(pp);
3752 
3753 	/*
3754 	 * note that we will exit the for loop with a non-null pve if
3755 	 * we have found the bits we are testing for.
3756 	 */
3757 
3758 	return result != 0;
3759 }
3760 
3761 /*
3762  * pmap_clear_attrs: clear the specified attribute for a page.
3763  *
3764  * => we return true if we cleared one of the bits we were asked to
3765  */
3766 
3767 bool
3768 pmap_clear_attrs(struct vm_page *pg, unsigned clearbits)
3769 {
3770 	struct pmap_page *pp;
3771 	struct pv_pte *pvpte;
3772 	u_int result;
3773 	pt_entry_t expect;
3774 	int count;
3775 #ifdef DIAGNOSTIC
3776 	int bank, off;
3777 
3778 	bank = vm_physseg_find(atop(VM_PAGE_TO_PHYS(pg)), &off);
3779 	if (bank == -1)
3780 		panic("pmap_change_attrs: unmanaged page?");
3781 #endif
3782 
3783 	pp = VM_PAGE_TO_PP(pg);
3784 	expect = pmap_pa2pte(VM_PAGE_TO_PHYS(pg)) | PG_V;
3785 	count = SPINLOCK_BACKOFF_MIN;
3786 	kpreempt_disable();
3787 startover:
3788 	pp_lock(pp);
3789 	for (pvpte = pv_pte_first(pp); pvpte; pvpte = pv_pte_next(pp, pvpte)) {
3790 		pt_entry_t opte;
3791 		int error;
3792 
3793 		error = pmap_sync_pv(pvpte, expect, clearbits, &opte);
3794 		if (error == EAGAIN) {
3795 			int hold_count;
3796 			pp_unlock(pp);
3797 			KERNEL_UNLOCK_ALL(curlwp, &hold_count);
3798 			SPINLOCK_BACKOFF(count);
3799 			KERNEL_LOCK(hold_count, curlwp);
3800 			goto startover;
3801 		}
3802 		pp->pp_attrs |= opte;
3803 	}
3804 	result = pp->pp_attrs & clearbits;
3805 	pp->pp_attrs &= ~clearbits;
3806 	pp_unlock(pp);
3807 	kpreempt_enable();
3808 
3809 	return result != 0;
3810 }
3811 
3812 
3813 /*
3814  * p m a p   p r o t e c t i o n   f u n c t i o n s
3815  */
3816 
3817 /*
3818  * pmap_page_protect: change the protection of all recorded mappings
3819  *	of a managed page
3820  *
3821  * => NOTE: this is an inline function in pmap.h
3822  */
3823 
3824 /* see pmap.h */
3825 
3826 /*
3827  * pmap_protect: set the protection in of the pages in a pmap
3828  *
3829  * => NOTE: this is an inline function in pmap.h
3830  */
3831 
3832 /* see pmap.h */
3833 
3834 /*
3835  * pmap_write_protect: write-protect pages in a pmap
3836  */
3837 
3838 void
3839 pmap_write_protect(struct pmap *pmap, vaddr_t sva, vaddr_t eva, vm_prot_t prot)
3840 {
3841 	pt_entry_t *ptes, *epte;
3842 	pt_entry_t *spte;
3843 	pd_entry_t * const *pdes;
3844 	vaddr_t blockend, va;
3845 	pt_entry_t opte;
3846 	struct pmap *pmap2;
3847 
3848 	KASSERT(curlwp->l_md.md_gc_pmap != pmap);
3849 
3850 	kpreempt_disable();
3851 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);	/* locks pmap */
3852 
3853 	/* should be ok, but just in case ... */
3854 	sva &= PG_FRAME;
3855 	eva &= PG_FRAME;
3856 
3857 	for (va = sva ; va < eva ; va = blockend) {
3858 
3859 		blockend = (va & L2_FRAME) + NBPD_L2;
3860 		if (blockend > eva)
3861 			blockend = eva;
3862 
3863 		/*
3864 		 * XXXCDC: our PTE mappings should never be write-protected!
3865 		 *
3866 		 * long term solution is to move the PTEs out of user
3867 		 * address space.  and into kernel address space (up
3868 		 * with APTE).  then we can set VM_MAXUSER_ADDRESS to
3869 		 * be VM_MAX_ADDRESS.
3870 		 */
3871 
3872 		/* XXXCDC: ugly hack to avoid freeing PDP here */
3873 		if (pl_i(va, PTP_LEVELS) == PDIR_SLOT_PTE)
3874 			continue;
3875 
3876 		/* empty block? */
3877 		if (!pmap_pdes_valid(va, pdes, NULL))
3878 			continue;
3879 
3880 #ifdef DIAGNOSTIC
3881 		if (va >= VM_MAXUSER_ADDRESS &&
3882 		    va < VM_MAX_ADDRESS)
3883 			panic("pmap_write_protect: PTE space");
3884 #endif
3885 
3886 		spte = &ptes[pl1_i(va)];
3887 		epte = &ptes[pl1_i(blockend)];
3888 
3889 		for (/*null */; spte < epte ; spte++) {
3890 			pt_entry_t npte;
3891 
3892 			do {
3893 				opte = *spte;
3894 				if ((~opte & (PG_RW | PG_V)) != 0) {
3895 					goto next;
3896 				}
3897 				npte = opte & ~PG_RW;
3898 			} while (pmap_pte_cas(spte, opte, npte) != opte);
3899 			if ((opte & PG_M) != 0) {
3900 				vaddr_t tva;
3901 
3902 				tva = x86_ptob(spte - ptes);
3903 				pmap_tlb_shootdown(pmap, tva, 0, opte);
3904 			}
3905 next:;
3906 		}
3907 	}
3908 
3909 	pmap_unmap_ptes(pmap, pmap2);	/* unlocks pmap */
3910 	kpreempt_enable();
3911 }
3912 
3913 /*
3914  * end of protection functions
3915  */
3916 
3917 /*
3918  * pmap_unwire: clear the wired bit in the PTE
3919  *
3920  * => mapping should already be in map
3921  */
3922 
3923 void
3924 pmap_unwire(struct pmap *pmap, vaddr_t va)
3925 {
3926 	pt_entry_t *ptes;
3927 	pd_entry_t * const *pdes;
3928 	struct pmap *pmap2;
3929 
3930 	kpreempt_disable();
3931 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);	/* locks pmap */
3932 
3933 	if (pmap_pdes_valid(va, pdes, NULL)) {
3934 		pt_entry_t *ptep = &ptes[pl1_i(va)];
3935 		pt_entry_t opte = *ptep;
3936 
3937 #ifdef DIAGNOSTIC
3938 		if (!pmap_valid_entry(opte))
3939 			panic("pmap_unwire: invalid (unmapped) va 0x%lx", va);
3940 #endif
3941 		if ((opte & PG_W) != 0) {
3942 			pt_entry_t npte = opte & ~PG_W;
3943 
3944 			opte = pmap_pte_testset(ptep, npte);
3945 			pmap_stats_update_bypte(pmap, npte, opte);
3946 		}
3947 #ifdef DIAGNOSTIC
3948 		else {
3949 			printf("pmap_unwire: wiring for pmap %p va 0x%lx "
3950 			       "didn't change!\n", pmap, va);
3951 		}
3952 #endif
3953 		pmap_unmap_ptes(pmap, pmap2);		/* unlocks map */
3954 	}
3955 #ifdef DIAGNOSTIC
3956 	else {
3957 		panic("pmap_unwire: invalid PDE");
3958 	}
3959 #endif
3960 	kpreempt_enable();
3961 }
3962 
3963 /*
3964  * pmap_collect: free resources held by a pmap
3965  *
3966  * => optional function.
3967  * => called when a process is swapped out to free memory.
3968  */
3969 
3970 void
3971 pmap_collect(struct pmap *pmap)
3972 {
3973 	/*
3974 	 * free all of the pt pages by removing the physical mappings
3975 	 * for its entire address space.
3976 	 */
3977 
3978 	pmap_do_remove(pmap, VM_MIN_ADDRESS, VM_MAXUSER_ADDRESS,
3979 	    PMAP_REMOVE_SKIPWIRED);
3980 }
3981 
3982 /*
3983  * pmap_copy: copy mappings from one pmap to another
3984  *
3985  * => optional function
3986  * void pmap_copy(dst_pmap, src_pmap, dst_addr, len, src_addr)
3987  */
3988 
3989 /*
3990  * defined as macro in pmap.h
3991  */
3992 
3993 /*
3994  * pmap_enter: enter a mapping into a pmap
3995  *
3996  * => must be done "now" ... no lazy-evaluation
3997  * => we set pmap => pv_head locking
3998  */
3999 #ifdef XEN
4000 int
4001 pmap_enter_ma(struct pmap *pmap, vaddr_t va, paddr_t ma, paddr_t pa,
4002 	   vm_prot_t prot, int flags, int domid)
4003 {
4004 #else /* XEN */
4005 int
4006 pmap_enter(struct pmap *pmap, vaddr_t va, paddr_t pa, vm_prot_t prot,
4007 	   int flags)
4008 {
4009 	paddr_t ma = pa;
4010 #endif /* XEN */
4011 	pt_entry_t *ptes, opte, npte;
4012 	pt_entry_t *ptep;
4013 	pd_entry_t * const *pdes;
4014 	struct vm_page *ptp, *pg;
4015 	struct pmap_page *new_pp;
4016 	struct pmap_page *old_pp;
4017 	struct pv_entry *old_pve = NULL;
4018 	struct pv_entry *new_pve;
4019 	struct pv_entry *new_pve2;
4020 	int error;
4021 	bool wired = (flags & PMAP_WIRED) != 0;
4022 	struct pmap *pmap2;
4023 
4024 	KASSERT(pmap_initialized);
4025 	KASSERT(curlwp->l_md.md_gc_pmap != pmap);
4026 
4027 #ifdef DIAGNOSTIC
4028 	/* sanity check: totally out of range? */
4029 	if (va >= VM_MAX_KERNEL_ADDRESS)
4030 		panic("pmap_enter: too big");
4031 
4032 	if (va == (vaddr_t) PDP_BASE || va == (vaddr_t) APDP_BASE)
4033 		panic("pmap_enter: trying to map over PDP/APDP!");
4034 
4035 	/* sanity check: kernel PTPs should already have been pre-allocated */
4036 	if (va >= VM_MIN_KERNEL_ADDRESS &&
4037 	    !pmap_valid_entry(pmap->pm_pdir[pl_i(va, PTP_LEVELS)]))
4038 		panic("pmap_enter: missing kernel PTP for va %lx!", va);
4039 #endif /* DIAGNOSTIC */
4040 #ifdef XEN
4041 	KASSERT(domid == DOMID_SELF || pa == 0);
4042 #endif /* XEN */
4043 
4044 	npte = ma | protection_codes[prot] | PG_V;
4045 	if (wired)
4046 	        npte |= PG_W;
4047 	if (va < VM_MAXUSER_ADDRESS)
4048 		npte |= PG_u;
4049 	else if (va < VM_MAX_ADDRESS)
4050 		npte |= (PG_u | PG_RW);	/* XXXCDC: no longer needed? */
4051 	else
4052 		npte |= PG_k;
4053 	if (pmap == pmap_kernel())
4054 		npte |= pmap_pg_g;
4055 	if (flags & VM_PROT_ALL) {
4056 		npte |= PG_U;
4057 		if (flags & VM_PROT_WRITE) {
4058 			KASSERT((npte & PG_RW) != 0);
4059 			npte |= PG_M;
4060 		}
4061 	}
4062 
4063 #ifdef XEN
4064 	if (domid != DOMID_SELF)
4065 		pg = NULL;
4066 	else
4067 #endif
4068 		pg = PHYS_TO_VM_PAGE(pa);
4069 	if (pg != NULL) {
4070 		/* This is a managed page */
4071 		npte |= PG_PVLIST;
4072 		new_pp = VM_PAGE_TO_PP(pg);
4073 	} else {
4074 		new_pp = NULL;
4075 	}
4076 
4077 	/* get pves. */
4078 	new_pve = pool_cache_get(&pmap_pv_cache, PR_NOWAIT);
4079 	new_pve2 = pool_cache_get(&pmap_pv_cache, PR_NOWAIT);
4080 	if (new_pve == NULL || new_pve2 == NULL) {
4081 		if (flags & PMAP_CANFAIL) {
4082 			error = ENOMEM;
4083 			goto out2;
4084 		}
4085 		panic("pmap_enter: pve allocation failed");
4086 	}
4087 
4088 	kpreempt_disable();
4089 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);	/* locks pmap */
4090 	if (pmap == pmap_kernel()) {
4091 		ptp = NULL;
4092 	} else {
4093 		ptp = pmap_get_ptp(pmap, va, pdes);
4094 		if (ptp == NULL) {
4095 			pmap_unmap_ptes(pmap, pmap2);
4096 			if (flags & PMAP_CANFAIL) {
4097 				error = ENOMEM;
4098 				goto out;
4099 			}
4100 			panic("pmap_enter: get ptp failed");
4101 		}
4102 	}
4103 
4104 	/*
4105 	 * update the pte.
4106 	 */
4107 
4108 	ptep = &ptes[pl1_i(va)];
4109 	do {
4110 		opte = *ptep;
4111 
4112 		/*
4113 		 * if the same page, inherit PG_U and PG_M.
4114 		 */
4115 		if (((opte ^ npte) & (PG_FRAME | PG_V)) == 0) {
4116 			npte |= opte & (PG_U | PG_M);
4117 		}
4118 #if defined(XEN)
4119 		if (domid != DOMID_SELF) {
4120 			/* pmap_pte_cas with error handling */
4121 			int s = splvm();
4122 			if (opte != *ptep) {
4123 				splx(s);
4124 				continue;
4125 			}
4126 			error = xpq_update_foreign(
4127 			    vtomach((vaddr_t)ptep), npte, domid);
4128 			splx(s);
4129 			if (error) {
4130 				if (ptp != NULL && ptp->wire_count <= 1) {
4131 					pmap_free_ptp(pmap, ptp, va, ptes, pdes);
4132 				}
4133 				pmap_unmap_ptes(pmap, pmap2);
4134 				goto out;
4135 			}
4136 			break;
4137 		}
4138 #endif /* defined(XEN) */
4139 	} while (pmap_pte_cas(ptep, opte, npte) != opte);
4140 
4141 	/*
4142 	 * update statistics and PTP's reference count.
4143 	 */
4144 
4145 	pmap_stats_update_bypte(pmap, npte, opte);
4146 	if (ptp != NULL && !pmap_valid_entry(opte)) {
4147 		ptp->wire_count++;
4148 	}
4149 	KASSERT(ptp == NULL || ptp->wire_count > 1);
4150 
4151 	/*
4152 	 * if the same page, we can skip pv_entry handling.
4153 	 */
4154 
4155 	if (((opte ^ npte) & (PG_FRAME | PG_V)) == 0) {
4156 		KASSERT(((opte ^ npte) & PG_PVLIST) == 0);
4157 		goto same_pa;
4158 	}
4159 
4160 	/*
4161 	 * if old page is managed, remove pv_entry from its list.
4162 	 */
4163 
4164 	if ((~opte & (PG_V | PG_PVLIST)) == 0) {
4165 		pg = PHYS_TO_VM_PAGE(pmap_pte2pa(opte));
4166 #ifdef DIAGNOSTIC
4167 		if (pg == NULL)
4168 			panic("pmap_enter: PG_PVLIST mapping with "
4169 			      "unmanaged page "
4170 			      "pa = 0x%" PRIx64 " (0x%" PRIx64 ")",
4171 			      (int64_t)pa, (int64_t)atop(pa));
4172 #endif
4173 		old_pp = VM_PAGE_TO_PP(pg);
4174 
4175 		pp_lock(old_pp);
4176 		old_pve = pmap_remove_pv(old_pp, ptp, va);
4177 		old_pp->pp_attrs |= opte;
4178 		pp_unlock(old_pp);
4179 	}
4180 
4181 	/*
4182 	 * if new page is managed, insert pv_entry into its list.
4183 	 */
4184 
4185 	if (new_pp) {
4186 		pp_lock(new_pp);
4187 		new_pve = pmap_enter_pv(new_pp, new_pve, &new_pve2, ptp, va);
4188 		pp_unlock(new_pp);
4189 	}
4190 
4191 same_pa:
4192 	pmap_unmap_ptes(pmap, pmap2);
4193 
4194 	/*
4195 	 * shootdown tlb if necessary.
4196 	 */
4197 
4198 	if ((~opte & (PG_V | PG_U)) == 0 &&
4199 	    ((opte ^ npte) & (PG_FRAME | PG_RW)) != 0) {
4200 		pmap_tlb_shootdown(pmap, va, 0, opte);
4201 	}
4202 
4203 	error = 0;
4204 out:
4205 	kpreempt_enable();
4206 out2:
4207 	if (old_pve != NULL) {
4208 		pool_cache_put(&pmap_pv_cache, old_pve);
4209 	}
4210 	if (new_pve != NULL) {
4211 		pool_cache_put(&pmap_pv_cache, new_pve);
4212 	}
4213 	if (new_pve2 != NULL) {
4214 		pool_cache_put(&pmap_pv_cache, new_pve2);
4215 	}
4216 
4217 	return error;
4218 }
4219 
4220 #ifdef XEN
4221 int
4222 pmap_enter(struct pmap *pmap, vaddr_t va, paddr_t pa, vm_prot_t prot, int flags)
4223 {
4224         paddr_t ma;
4225 
4226 	if (__predict_false(pa < pmap_pa_start || pmap_pa_end <= pa)) {
4227 		ma = pa; /* XXX hack */
4228 	} else {
4229 		ma = xpmap_ptom(pa);
4230 	}
4231 
4232 	return pmap_enter_ma(pmap, va, ma, pa, prot, flags, DOMID_SELF);
4233 }
4234 #endif /* XEN */
4235 
4236 static bool
4237 pmap_get_physpage(vaddr_t va, int level, paddr_t *paddrp)
4238 {
4239 	struct vm_page *ptp;
4240 	struct pmap *kpm = pmap_kernel();
4241 
4242 	if (uvm.page_init_done == false) {
4243 		/*
4244 		 * we're growing the kernel pmap early (from
4245 		 * uvm_pageboot_alloc()).  this case must be
4246 		 * handled a little differently.
4247 		 */
4248 
4249 		if (uvm_page_physget(paddrp) == false)
4250 			panic("pmap_get_physpage: out of memory");
4251 		kpreempt_disable();
4252 		pmap_pte_set(early_zero_pte,
4253 		    pmap_pa2pte(*paddrp) | PG_V | PG_RW | PG_k);
4254 		pmap_pte_flush();
4255 		pmap_update_pg((vaddr_t)early_zerop);
4256 		memset(early_zerop, 0, PAGE_SIZE);
4257 #if defined(DIAGNOSTIC) || defined (XEN)
4258 		pmap_pte_set(early_zero_pte, 0);
4259 		pmap_pte_flush();
4260 #endif /* defined(DIAGNOSTIC) */
4261 		kpreempt_enable();
4262 	} else {
4263 		/* XXX */
4264 		PMAP_SUBOBJ_LOCK(kpm, level - 1);
4265 		ptp = uvm_pagealloc(&kpm->pm_obj[level - 1],
4266 				    ptp_va2o(va, level), NULL,
4267 				    UVM_PGA_USERESERVE|UVM_PGA_ZERO);
4268 		PMAP_SUBOBJ_UNLOCK(kpm, level - 1);
4269 		if (ptp == NULL)
4270 			panic("pmap_get_physpage: out of memory");
4271 		ptp->flags &= ~PG_BUSY;
4272 		ptp->wire_count = 1;
4273 		*paddrp = VM_PAGE_TO_PHYS(ptp);
4274 	}
4275 	pmap_stats_update(kpm, 1, 0);
4276 	return true;
4277 }
4278 
4279 /*
4280  * Allocate the amount of specified ptps for a ptp level, and populate
4281  * all levels below accordingly, mapping virtual addresses starting at
4282  * kva.
4283  *
4284  * Used by pmap_growkernel.
4285  */
4286 static void
4287 pmap_alloc_level(pd_entry_t * const *pdes, vaddr_t kva, int lvl,
4288     long *needed_ptps)
4289 {
4290 	unsigned long i;
4291 	vaddr_t va;
4292 	paddr_t pa;
4293 	unsigned long index, endindex;
4294 	int level;
4295 	pd_entry_t *pdep;
4296 #ifdef XEN
4297 	int s = splvm(); /* protect xpq_* */
4298 #endif
4299 
4300 	for (level = lvl; level > 1; level--) {
4301 		if (level == PTP_LEVELS)
4302 			pdep = pmap_kernel()->pm_pdir;
4303 		else
4304 			pdep = pdes[level - 2];
4305 		va = kva;
4306 		index = pl_i_roundup(kva, level);
4307 		endindex = index + needed_ptps[level - 1] - 1;
4308 
4309 
4310 		for (i = index; i <= endindex; i++) {
4311 			KASSERT(!pmap_valid_entry(pdep[i]));
4312 			pmap_get_physpage(va, level - 1, &pa);
4313 #ifdef XEN
4314 			xpq_queue_pte_update((level == PTP_LEVELS) ?
4315 			    xpmap_ptom(pmap_pdirpa(pmap_kernel(), i)) :
4316 			    xpmap_ptetomach(&pdep[i]),
4317 			    pmap_pa2pte(pa) | PG_k | PG_V | PG_RW);
4318 #ifdef PAE
4319 			if (level == PTP_LEVELS &&  i > L2_SLOT_KERN) {
4320 				/* update real kernel PD too */
4321 				xpq_queue_pte_update(
4322 				    xpmap_ptetomach(&pmap_kl2pd[l2tol2(i)]),
4323 				    pmap_pa2pte(pa) | PG_k | PG_V | PG_RW);
4324 			}
4325 #endif
4326 #else /* XEN */
4327 			pdep[i] = pa | PG_RW | PG_V;
4328 #endif /* XEN */
4329 			KASSERT(level != PTP_LEVELS || nkptp[level - 1] +
4330 			    pl_i(VM_MIN_KERNEL_ADDRESS, level) == i);
4331 			nkptp[level - 1]++;
4332 			va += nbpd[level - 1];
4333 		}
4334 		pmap_pte_flush();
4335 	}
4336 #ifdef XEN
4337 	splx(s);
4338 #endif
4339 }
4340 
4341 /*
4342  * pmap_growkernel: increase usage of KVM space
4343  *
4344  * => we allocate new PTPs for the kernel and install them in all
4345  *	the pmaps on the system.
4346  */
4347 
4348 vaddr_t
4349 pmap_growkernel(vaddr_t maxkvaddr)
4350 {
4351 	struct pmap *kpm = pmap_kernel();
4352 #if !defined(XEN) || !defined(__x86_64__)
4353 	struct pmap *pm;
4354 #endif
4355 	int s, i;
4356 	long needed_kptp[PTP_LEVELS], target_nptp, old;
4357 	bool invalidate = false;
4358 
4359 	s = splvm();	/* to be safe */
4360 	mutex_enter(&kpm->pm_lock);
4361 
4362 	if (maxkvaddr <= pmap_maxkvaddr) {
4363 		mutex_exit(&kpm->pm_lock);
4364 		splx(s);
4365 		return pmap_maxkvaddr;
4366 	}
4367 
4368 	maxkvaddr = x86_round_pdr(maxkvaddr);
4369 	old = nkptp[PTP_LEVELS - 1];
4370 	/*
4371 	 * This loop could be optimized more, but pmap_growkernel()
4372 	 * is called infrequently.
4373 	 */
4374 	for (i = PTP_LEVELS - 1; i >= 1; i--) {
4375 		target_nptp = pl_i_roundup(maxkvaddr, i + 1) -
4376 		    pl_i_roundup(VM_MIN_KERNEL_ADDRESS, i + 1);
4377 		/*
4378 		 * XXX only need to check toplevel.
4379 		 */
4380 		if (target_nptp > nkptpmax[i])
4381 			panic("out of KVA space");
4382 		KASSERT(target_nptp >= nkptp[i]);
4383 		needed_kptp[i] = target_nptp - nkptp[i];
4384 	}
4385 
4386 	pmap_alloc_level(normal_pdes, pmap_maxkvaddr, PTP_LEVELS, needed_kptp);
4387 
4388 	/*
4389 	 * If the number of top level entries changed, update all
4390 	 * pmaps.
4391 	 */
4392 	if (needed_kptp[PTP_LEVELS - 1] != 0) {
4393 #ifdef XEN
4394 #ifdef __x86_64__
4395 		/* nothing, kernel entries are never entered in user pmap */
4396 #else /* __x86_64__ */
4397 		mutex_enter(&pmaps_lock);
4398 		LIST_FOREACH(pm, &pmaps, pm_list) {
4399 			int pdkidx;
4400 			for (pdkidx =  PDIR_SLOT_KERN + old;
4401 			    pdkidx < PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1];
4402 			    pdkidx++) {
4403 				xpq_queue_pte_update(
4404 				    xpmap_ptom(pmap_pdirpa(pm, pdkidx)),
4405 				    kpm->pm_pdir[pdkidx]);
4406 			}
4407 			xpq_flush_queue();
4408 		}
4409 		mutex_exit(&pmaps_lock);
4410 #endif /* __x86_64__ */
4411 #else /* XEN */
4412 		unsigned newpdes;
4413 		newpdes = nkptp[PTP_LEVELS - 1] - old;
4414 		mutex_enter(&pmaps_lock);
4415 		LIST_FOREACH(pm, &pmaps, pm_list) {
4416 			memcpy(&pm->pm_pdir[PDIR_SLOT_KERN + old],
4417 			       &kpm->pm_pdir[PDIR_SLOT_KERN + old],
4418 			       newpdes * sizeof (pd_entry_t));
4419 		}
4420 		mutex_exit(&pmaps_lock);
4421 #endif
4422 		invalidate = true;
4423 	}
4424 	pmap_maxkvaddr = maxkvaddr;
4425 	mutex_exit(&kpm->pm_lock);
4426 	splx(s);
4427 
4428 	if (invalidate) {
4429 		/* Invalidate the PDP cache. */
4430 		pool_cache_invalidate(&pmap_pdp_cache);
4431 	}
4432 
4433 	return maxkvaddr;
4434 }
4435 
4436 #ifdef DEBUG
4437 void pmap_dump(struct pmap *, vaddr_t, vaddr_t);
4438 
4439 /*
4440  * pmap_dump: dump all the mappings from a pmap
4441  *
4442  * => caller should not be holding any pmap locks
4443  */
4444 
4445 void
4446 pmap_dump(struct pmap *pmap, vaddr_t sva, vaddr_t eva)
4447 {
4448 	pt_entry_t *ptes, *pte;
4449 	pd_entry_t * const *pdes;
4450 	struct pmap *pmap2;
4451 	vaddr_t blkendva;
4452 
4453 	/*
4454 	 * if end is out of range truncate.
4455 	 * if (end == start) update to max.
4456 	 */
4457 
4458 	if (eva > VM_MAXUSER_ADDRESS || eva <= sva)
4459 		eva = VM_MAXUSER_ADDRESS;
4460 
4461 	/*
4462 	 * we lock in the pmap => pv_head direction
4463 	 */
4464 
4465 	kpreempt_disable();
4466 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);	/* locks pmap */
4467 
4468 	/*
4469 	 * dumping a range of pages: we dump in PTP sized blocks (4MB)
4470 	 */
4471 
4472 	for (/* null */ ; sva < eva ; sva = blkendva) {
4473 
4474 		/* determine range of block */
4475 		blkendva = x86_round_pdr(sva+1);
4476 		if (blkendva > eva)
4477 			blkendva = eva;
4478 
4479 		/* valid block? */
4480 		if (!pmap_pdes_valid(sva, pdes, NULL))
4481 			continue;
4482 
4483 		pte = &ptes[pl1_i(sva)];
4484 		for (/* null */; sva < blkendva ; sva += PAGE_SIZE, pte++) {
4485 			if (!pmap_valid_entry(*pte))
4486 				continue;
4487 			printf("va %#lx -> pa %#lx (pte=%#lx)\n",
4488 			       sva, (unsigned long)*pte,
4489 			       (unsigned long)pmap_pte2pa(*pte));
4490 		}
4491 	}
4492 	pmap_unmap_ptes(pmap, pmap2);
4493 	kpreempt_enable();
4494 }
4495 #endif
4496 
4497 /*
4498  * pmap_tlb_shootdown: invalidate pages on all CPUs using pmap 'pm'
4499  *
4500  * => always invalidates locally before returning
4501  * => returns before remote CPUs have invalidated
4502  * => must be called with preemption disabled
4503  */
4504 
4505 void
4506 pmap_tlb_shootdown(struct pmap *pm, vaddr_t sva, vaddr_t eva, pt_entry_t pte)
4507 {
4508 #ifdef MULTIPROCESSOR
4509 	extern bool x86_mp_online;
4510 	struct cpu_info *ci;
4511 	struct pmap_mbox *mb, *selfmb;
4512 	CPU_INFO_ITERATOR cii;
4513 	uintptr_t head;
4514 	u_int count;
4515 	int s;
4516 #endif	/* MULTIPROCESSOR */
4517 	struct cpu_info *self;
4518 	bool kernel;
4519 
4520 	KASSERT(eva == 0 || eva >= sva);
4521 	KASSERT(kpreempt_disabled());
4522 
4523 	if (pte & PG_PS)
4524 		sva &= PG_LGFRAME;
4525 	pte &= PG_G;
4526 	self = curcpu();
4527 
4528 	if (sva == (vaddr_t)-1LL) {
4529 		kernel = true;
4530 	} else {
4531 		if (eva == 0)
4532 			eva = sva + PAGE_SIZE;
4533 		kernel = sva >= VM_MAXUSER_ADDRESS;
4534 		KASSERT(kernel == (eva > VM_MAXUSER_ADDRESS));
4535 	}
4536 
4537 	/*
4538 	 * if tearing down the pmap, do nothing.  we'll flush later
4539 	 * when we're ready to recycle/destroy it.
4540 	 */
4541 	if (__predict_false(curlwp->l_md.md_gc_pmap == pm)) {
4542 		return;
4543 	}
4544 
4545 	/*
4546 	 * If the range is larger than 32 pages, then invalidate
4547 	 * everything.
4548 	 */
4549 	if (sva != (vaddr_t)-1LL && eva - sva > (32 * PAGE_SIZE)) {
4550 		sva = (vaddr_t)-1LL;
4551 		eva = sva;
4552 	}
4553 
4554 #ifdef MULTIPROCESSOR
4555 	if (ncpu > 1 && x86_mp_online) {
4556 		selfmb = &self->ci_pmap_cpu->pc_mbox;
4557 
4558 		/*
4559 		 * If the CPUs have no notion of global pages then
4560 		 * reload of %cr3 is sufficient.
4561 		 */
4562 		if (pte != 0 && (cpu_feature & CPUID_PGE) == 0)
4563 			pte = 0;
4564 
4565 		if (pm == pmap_kernel()) {
4566 			/*
4567 			 * Mapped on all CPUs: use the broadcast mechanism.
4568 			 * Once we have the lock, increment the counter.
4569 			 */
4570 			s = splvm();
4571 			mb = &pmap_mbox;
4572 			count = SPINLOCK_BACKOFF_MIN;
4573 			do {
4574 				if ((head = mb->mb_head) != mb->mb_tail) {
4575 					splx(s);
4576 					while ((head = mb->mb_head) !=
4577 					    mb->mb_tail)
4578 						SPINLOCK_BACKOFF(count);
4579 					s = splvm();
4580 				}
4581 			} while (atomic_cas_ulong(
4582 			    (volatile u_long *)&mb->mb_head,
4583 			    head, head + ncpu - 1) != head);
4584 
4585 			/*
4586 			 * Once underway we must stay at IPL_VM until the
4587 			 * IPI is dispatched.  Otherwise interrupt handlers
4588 			 * on this CPU can deadlock against us.
4589 			 */
4590 			pmap_tlb_evcnt.ev_count++;
4591 			mb->mb_pointer = self;
4592 			mb->mb_addr1 = sva;
4593 			mb->mb_addr2 = eva;
4594 			mb->mb_global = pte;
4595 			x86_ipi(LAPIC_TLB_BCAST_VECTOR, LAPIC_DEST_ALLEXCL,
4596 			    LAPIC_DLMODE_FIXED);
4597 			self->ci_need_tlbwait = 1;
4598 			splx(s);
4599 		} else if ((pm->pm_cpus & ~self->ci_cpumask) != 0 ||
4600 		    (kernel && (pm->pm_kernel_cpus & ~self->ci_cpumask) != 0)) {
4601 			/*
4602 			 * We don't bother traversing the CPU list if only
4603 			 * used by this CPU.
4604 			 *
4605 			 * We can't do global flushes with the multicast
4606 			 * mechanism.
4607 			 */
4608 			KASSERT(pte == 0);
4609 
4610 			/*
4611 			 * Take ownership of the shootdown mailbox on each
4612 			 * CPU, fill the details and fire it off.
4613 			 */
4614 			s = splvm();
4615 			for (CPU_INFO_FOREACH(cii, ci)) {
4616 				if (ci == self ||
4617 				    !pmap_is_active(pm, ci, kernel) ||
4618 				    !(ci->ci_flags & CPUF_RUNNING))
4619 					continue;
4620 				selfmb->mb_head++;
4621 				mb = &ci->ci_pmap_cpu->pc_mbox;
4622 				count = SPINLOCK_BACKOFF_MIN;
4623 				while (atomic_cas_ulong(
4624 				    (u_long *)&mb->mb_pointer,
4625 				    0, (u_long)&selfmb->mb_tail) != 0) {
4626 				    	splx(s);
4627 					while (mb->mb_pointer != 0)
4628 						SPINLOCK_BACKOFF(count);
4629 					s = splvm();
4630 				}
4631 				mb->mb_addr1 = sva;
4632 				mb->mb_addr2 = eva;
4633 				mb->mb_global = pte;
4634 				if (x86_ipi(LAPIC_TLB_MCAST_VECTOR,
4635 				    ci->ci_cpuid, LAPIC_DLMODE_FIXED))
4636 					panic("pmap_tlb_shootdown: ipi failed");
4637 			}
4638 			self->ci_need_tlbwait = 1;
4639 			splx(s);
4640 		}
4641 	}
4642 #endif	/* MULTIPROCESSOR */
4643 
4644 	/* Update the current CPU before waiting for others. */
4645 	if (!pmap_is_active(pm, self, kernel))
4646 		return;
4647 
4648 	if (sva == (vaddr_t)-1LL) {
4649 		if (pte != 0)
4650 			tlbflushg();
4651 		else
4652 			tlbflush();
4653 	} else {
4654 		do {
4655 			pmap_update_pg(sva);
4656 			sva += PAGE_SIZE;
4657 		} while (sva < eva);
4658 	}
4659 }
4660 
4661 /*
4662  * pmap_tlb_shootwait: wait for pending TLB shootdowns to complete
4663  *
4664  * => only waits for operations generated by the current CPU
4665  * => must be called with preemption disabled
4666  */
4667 
4668 void
4669 pmap_tlb_shootwait(void)
4670 {
4671 	struct cpu_info *self;
4672 	struct pmap_mbox *mb;
4673 
4674 	KASSERT(kpreempt_disabled());
4675 
4676 	/*
4677 	 * Anything to do?  XXX Really we want to avoid touching the cache
4678 	 * lines of the two mailboxes, but the processor may read ahead.
4679 	 */
4680 	self = curcpu();
4681 	if (!self->ci_need_tlbwait)
4682 		return;
4683 	self->ci_need_tlbwait = 0;
4684 
4685 	/* If we own the global mailbox, wait for it to drain. */
4686 	mb = &pmap_mbox;
4687 	while (mb->mb_pointer == self && mb->mb_head != mb->mb_tail)
4688 		x86_pause();
4689 
4690 	/* If we own other CPU's mailboxes, wait for them to drain. */
4691 	mb = &self->ci_pmap_cpu->pc_mbox;
4692 	KASSERT(mb->mb_pointer != &mb->mb_tail);
4693 	while (mb->mb_head != mb->mb_tail)
4694 		x86_pause();
4695 }
4696 
4697 /*
4698  * pmap_update: process deferred invalidations
4699  */
4700 
4701 void
4702 pmap_update(struct pmap *pmap)
4703 {
4704 	struct vm_page *ptp, *empty_ptps;
4705 	struct pmap_page *pp;
4706 	lwp_t *l;
4707 
4708 	/*
4709 	 * if we have torn down this pmap, invalidate non-global TLB
4710 	 * entries on any processors using it.
4711 	 */
4712 	l = curlwp;
4713 	if (__predict_false(l->l_md.md_gc_pmap == pmap)) {
4714 		l->l_md.md_gc_pmap = NULL;
4715 		KPREEMPT_DISABLE(l);
4716 		pmap_tlb_shootdown(pmap, -1, -1, 0);
4717 		KPREEMPT_ENABLE(l);
4718 	}
4719 
4720 	/*
4721 	 * wait for tlb shootdowns to complete before returning control
4722 	 * to the caller.
4723 	 */
4724 	kpreempt_disable();
4725 	pmap_tlb_shootwait();
4726 	kpreempt_enable();
4727 
4728 	/*
4729 	 * now that shootdowns are complete, process deferred frees,
4730 	 * but not from interrupt context.
4731 	 */
4732 	if (l->l_md.md_gc_ptp != NULL) {
4733 		if (cpu_intr_p() || (l->l_pflag & LP_INTR) != 0) {
4734 			return;
4735 		}
4736 
4737 		empty_ptps = l->l_md.md_gc_ptp;
4738 		l->l_md.md_gc_ptp = NULL;
4739 
4740 		while ((ptp = empty_ptps) != NULL) {
4741 			ptp->flags |= PG_ZERO;
4742 			pp = VM_PAGE_TO_PP(ptp);
4743 			empty_ptps = pp->pp_link;
4744 			LIST_INIT(&pp->pp_head.pvh_list);
4745 			uvm_pagefree(ptp);
4746 		}
4747 	}
4748 }
4749 
4750 #if PTP_LEVELS > 4
4751 #error "Unsupported number of page table mappings"
4752 #endif
4753 
4754 paddr_t
4755 pmap_init_tmp_pgtbl(paddr_t pg)
4756 {
4757 	static bool maps_loaded;
4758 	static const paddr_t x86_tmp_pml_paddr[] = {
4759 	    4 * PAGE_SIZE,
4760 	    5 * PAGE_SIZE,
4761 	    6 * PAGE_SIZE,
4762 	    7 * PAGE_SIZE
4763 	};
4764 	static vaddr_t x86_tmp_pml_vaddr[] = { 0, 0, 0, 0 };
4765 
4766 	pd_entry_t *tmp_pml, *kernel_pml;
4767 
4768 	int level;
4769 
4770 	if (!maps_loaded) {
4771 		for (level = 0; level < PTP_LEVELS; ++level) {
4772 			x86_tmp_pml_vaddr[level] =
4773 			    uvm_km_alloc(kernel_map, PAGE_SIZE, 0,
4774 			    UVM_KMF_VAONLY);
4775 
4776 			if (x86_tmp_pml_vaddr[level] == 0)
4777 				panic("mapping of real mode PML failed\n");
4778 			pmap_kenter_pa(x86_tmp_pml_vaddr[level],
4779 			    x86_tmp_pml_paddr[level],
4780 			    VM_PROT_READ | VM_PROT_WRITE);
4781 			pmap_update(pmap_kernel());
4782 		}
4783 		maps_loaded = true;
4784 	}
4785 
4786 	/* Zero levels 1-3 */
4787 	for (level = 0; level < PTP_LEVELS - 1; ++level) {
4788 		tmp_pml = (void *)x86_tmp_pml_vaddr[level];
4789 		memset(tmp_pml, 0, PAGE_SIZE);
4790 	}
4791 
4792 	/* Copy PML4 */
4793 	kernel_pml = pmap_kernel()->pm_pdir;
4794 	tmp_pml = (void *)x86_tmp_pml_vaddr[PTP_LEVELS - 1];
4795 	memcpy(tmp_pml, kernel_pml, PAGE_SIZE);
4796 
4797 	/* Hook our own level 3 in */
4798 	tmp_pml[pl_i(pg, PTP_LEVELS)] =
4799 	    (x86_tmp_pml_paddr[PTP_LEVELS - 2] & PG_FRAME) | PG_RW | PG_V;
4800 
4801 	for (level = PTP_LEVELS - 1; level > 0; --level) {
4802 		tmp_pml = (void *)x86_tmp_pml_vaddr[level];
4803 
4804 		tmp_pml[pl_i(pg, level + 1)] =
4805 		    (x86_tmp_pml_paddr[level - 1] & PG_FRAME) | PG_RW | PG_V;
4806 	}
4807 
4808 	tmp_pml = (void *)x86_tmp_pml_vaddr[0];
4809 	tmp_pml[pl_i(pg, 1)] = (pg & PG_FRAME) | PG_RW | PG_V;
4810 
4811 	return x86_tmp_pml_paddr[PTP_LEVELS - 1];
4812 }
4813