xref: /netbsd-src/sys/arch/x86/x86/pmap.c (revision daf6c4152fcddc27c445489775ed1f66ab4ea9a9)
1 /*	$NetBSD: pmap.c,v 1.118 2011/02/11 23:08:38 jmcneill Exp $	*/
2 
3 /*
4  * Copyright (c) 2007 Manuel Bouyer.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  * 1. Redistributions of source code must retain the above copyright
10  *    notice, this list of conditions and the following disclaimer.
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in the
13  *    documentation and/or other materials provided with the distribution.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
16  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
17  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
18  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
19  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
20  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
21  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
22  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
24  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25  *
26  */
27 
28 /*
29  * Copyright (c) 2006 Mathieu Ropert <mro@adviseo.fr>
30  *
31  * Permission to use, copy, modify, and distribute this software for any
32  * purpose with or without fee is hereby granted, provided that the above
33  * copyright notice and this permission notice appear in all copies.
34  *
35  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
36  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
37  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
38  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
39  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
40  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
41  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
42  */
43 
44 /*
45  * Copyright (c) 1997 Charles D. Cranor and Washington University.
46  * All rights reserved.
47  *
48  * Redistribution and use in source and binary forms, with or without
49  * modification, are permitted provided that the following conditions
50  * are met:
51  * 1. Redistributions of source code must retain the above copyright
52  *    notice, this list of conditions and the following disclaimer.
53  * 2. Redistributions in binary form must reproduce the above copyright
54  *    notice, this list of conditions and the following disclaimer in the
55  *    documentation and/or other materials provided with the distribution.
56  *
57  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
58  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
59  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
60  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
61  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
62  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
63  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
64  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
65  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
66  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
67  */
68 
69 /*
70  * Copyright 2001 (c) Wasabi Systems, Inc.
71  * All rights reserved.
72  *
73  * Written by Frank van der Linden for Wasabi Systems, Inc.
74  *
75  * Redistribution and use in source and binary forms, with or without
76  * modification, are permitted provided that the following conditions
77  * are met:
78  * 1. Redistributions of source code must retain the above copyright
79  *    notice, this list of conditions and the following disclaimer.
80  * 2. Redistributions in binary form must reproduce the above copyright
81  *    notice, this list of conditions and the following disclaimer in the
82  *    documentation and/or other materials provided with the distribution.
83  * 3. All advertising materials mentioning features or use of this software
84  *    must display the following acknowledgement:
85  *      This product includes software developed for the NetBSD Project by
86  *      Wasabi Systems, Inc.
87  * 4. The name of Wasabi Systems, Inc. may not be used to endorse
88  *    or promote products derived from this software without specific prior
89  *    written permission.
90  *
91  * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
92  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
93  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
94  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL WASABI SYSTEMS, INC
95  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
96  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
97  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
98  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
99  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
100  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
101  * POSSIBILITY OF SUCH DAMAGE.
102  */
103 
104 /*
105  * This is the i386 pmap modified and generalized to support x86-64
106  * as well. The idea is to hide the upper N levels of the page tables
107  * inside pmap_get_ptp, pmap_free_ptp and pmap_growkernel. The rest
108  * is mostly untouched, except that it uses some more generalized
109  * macros and interfaces.
110  *
111  * This pmap has been tested on the i386 as well, and it can be easily
112  * adapted to PAE.
113  *
114  * fvdl@wasabisystems.com 18-Jun-2001
115  */
116 
117 /*
118  * pmap.c: i386 pmap module rewrite
119  * Chuck Cranor <chuck@netbsd>
120  * 11-Aug-97
121  *
122  * history of this pmap module: in addition to my own input, i used
123  *    the following references for this rewrite of the i386 pmap:
124  *
125  * [1] the NetBSD i386 pmap.   this pmap appears to be based on the
126  *     BSD hp300 pmap done by Mike Hibler at University of Utah.
127  *     it was then ported to the i386 by William Jolitz of UUNET
128  *     Technologies, Inc.   Then Charles M. Hannum of the NetBSD
129  *     project fixed some bugs and provided some speed ups.
130  *
131  * [2] the FreeBSD i386 pmap.   this pmap seems to be the
132  *     Hibler/Jolitz pmap, as modified for FreeBSD by John S. Dyson
133  *     and David Greenman.
134  *
135  * [3] the Mach pmap.   this pmap, from CMU, seems to have migrated
136  *     between several processors.   the VAX version was done by
137  *     Avadis Tevanian, Jr., and Michael Wayne Young.    the i386
138  *     version was done by Lance Berc, Mike Kupfer, Bob Baron,
139  *     David Golub, and Richard Draves.    the alpha version was
140  *     done by Alessandro Forin (CMU/Mach) and Chris Demetriou
141  *     (NetBSD/alpha).
142  */
143 
144 #include <sys/cdefs.h>
145 __KERNEL_RCSID(0, "$NetBSD: pmap.c,v 1.118 2011/02/11 23:08:38 jmcneill Exp $");
146 
147 #include "opt_user_ldt.h"
148 #include "opt_lockdebug.h"
149 #include "opt_multiprocessor.h"
150 #include "opt_xen.h"
151 #if !defined(__x86_64__)
152 #include "opt_kstack_dr0.h"
153 #endif /* !defined(__x86_64__) */
154 
155 #include <sys/param.h>
156 #include <sys/systm.h>
157 #include <sys/proc.h>
158 #include <sys/pool.h>
159 #include <sys/kernel.h>
160 #include <sys/atomic.h>
161 #include <sys/cpu.h>
162 #include <sys/intr.h>
163 #include <sys/xcall.h>
164 
165 #include <uvm/uvm.h>
166 
167 #include <dev/isa/isareg.h>
168 
169 #include <machine/specialreg.h>
170 #include <machine/gdt.h>
171 #include <machine/isa_machdep.h>
172 #include <machine/cpuvar.h>
173 
174 #include <x86/pmap.h>
175 #include <x86/pmap_pv.h>
176 
177 #include <x86/i82489reg.h>
178 #include <x86/i82489var.h>
179 
180 #ifdef XEN
181 #include <xen/xen3-public/xen.h>
182 #include <xen/hypervisor.h>
183 #endif
184 
185 /* flag to be used for kernel mappings: PG_u on Xen/amd64, 0 otherwise */
186 #if defined(XEN) && defined(__x86_64__)
187 #define PG_k PG_u
188 #else
189 #define PG_k 0
190 #endif
191 
192 /*
193  * general info:
194  *
195  *  - for an explanation of how the i386 MMU hardware works see
196  *    the comments in <machine/pte.h>.
197  *
198  *  - for an explanation of the general memory structure used by
199  *    this pmap (including the recursive mapping), see the comments
200  *    in <machine/pmap.h>.
201  *
202  * this file contains the code for the "pmap module."   the module's
203  * job is to manage the hardware's virtual to physical address mappings.
204  * note that there are two levels of mapping in the VM system:
205  *
206  *  [1] the upper layer of the VM system uses vm_map's and vm_map_entry's
207  *      to map ranges of virtual address space to objects/files.  for
208  *      example, the vm_map may say: "map VA 0x1000 to 0x22000 read-only
209  *      to the file /bin/ls starting at offset zero."   note that
210  *      the upper layer mapping is not concerned with how individual
211  *      vm_pages are mapped.
212  *
213  *  [2] the lower layer of the VM system (the pmap) maintains the mappings
214  *      from virtual addresses.   it is concerned with which vm_page is
215  *      mapped where.   for example, when you run /bin/ls and start
216  *      at page 0x1000 the fault routine may lookup the correct page
217  *      of the /bin/ls file and then ask the pmap layer to establish
218  *      a mapping for it.
219  *
220  * note that information in the lower layer of the VM system can be
221  * thrown away since it can easily be reconstructed from the info
222  * in the upper layer.
223  *
224  * data structures we use include:
225  *
226  *  - struct pmap: describes the address space of one thread
227  *  - struct pv_entry: describes one <PMAP,VA> mapping of a PA
228  *  - struct pv_head: there is one pv_head per managed page of
229  *	physical memory.   the pv_head points to a list of pv_entry
230  *	structures which describe all the <PMAP,VA> pairs that this
231  *      page is mapped in.    this is critical for page based operations
232  *      such as pmap_page_protect() [change protection on _all_ mappings
233  *      of a page]
234  */
235 
236 /*
237  * memory allocation
238  *
239  *  - there are three data structures that we must dynamically allocate:
240  *
241  * [A] new process' page directory page (PDP)
242  *	- plan 1: done at pmap_create() we use
243  *	  uvm_km_alloc(kernel_map, PAGE_SIZE)  [fka kmem_alloc] to do this
244  *	  allocation.
245  *
246  * if we are low in free physical memory then we sleep in
247  * uvm_km_alloc -- in this case this is ok since we are creating
248  * a new pmap and should not be holding any locks.
249  *
250  * if the kernel is totally out of virtual space
251  * (i.e. uvm_km_alloc returns NULL), then we panic.
252  *
253  * [B] new page tables pages (PTP)
254  * 	- call uvm_pagealloc()
255  * 		=> success: zero page, add to pm_pdir
256  * 		=> failure: we are out of free vm_pages, let pmap_enter()
257  *		   tell UVM about it.
258  *
259  * note: for kernel PTPs, we start with NKPTP of them.   as we map
260  * kernel memory (at uvm_map time) we check to see if we've grown
261  * the kernel pmap.   if so, we call the optional function
262  * pmap_growkernel() to grow the kernel PTPs in advance.
263  *
264  * [C] pv_entry structures
265  */
266 
267 /*
268  * locking
269  *
270  * we have the following locks that we must contend with:
271  *
272  * mutexes:
273  *
274  * - pmap lock (per pmap, part of uvm_object)
275  *   this lock protects the fields in the pmap structure including
276  *   the non-kernel PDEs in the PDP, and the PTEs.  it also locks
277  *   in the alternate PTE space (since that is determined by the
278  *   entry in the PDP).
279  *
280  * - pvh_lock (per pv_head)
281  *   this lock protects the pv_entry list which is chained off the
282  *   pv_head structure for a specific managed PA.   it is locked
283  *   when traversing the list (e.g. adding/removing mappings,
284  *   syncing R/M bits, etc.)
285  *
286  * - pmaps_lock
287  *   this lock protects the list of active pmaps (headed by "pmaps").
288  *   we lock it when adding or removing pmaps from this list.
289  *
290  * tlb shootdown
291  *
292  * tlb shootdowns are hard interrupts that operate outside the spl
293  * framework: they don't need to be blocked provided that the pmap module
294  * gets the order of events correct.  the calls are made by talking directly
295  * to the lapic.  the stubs to handle the interrupts are quite short and do
296  * one of the following: invalidate a single page, a range of pages, all
297  * user tlb entries or the entire tlb.
298  *
299  * the cpus synchronize with each other using pmap_mbox structures which are
300  * aligned on 64-byte cache lines.  tlb shootdowns against the kernel pmap
301  * use a global mailbox and are generated using a broadcast ipi (broadcast
302  * to all but the sending cpu).  shootdowns against regular pmaps use
303  * per-cpu mailboxes and are multicast.  kernel and user shootdowns can
304  * execute simultaneously, as can shootdowns within different multithreaded
305  * processes.  TODO:
306  *
307  *   1. figure out which waitpoints can be deferered to pmap_update().
308  *   2. see if there is a cheap way to batch some updates.
309  */
310 
311 const vaddr_t ptp_masks[] = PTP_MASK_INITIALIZER;
312 const int ptp_shifts[] = PTP_SHIFT_INITIALIZER;
313 const long nkptpmax[] = NKPTPMAX_INITIALIZER;
314 const long nbpd[] = NBPD_INITIALIZER;
315 pd_entry_t * const normal_pdes[] = PDES_INITIALIZER;
316 pd_entry_t * const alternate_pdes[] = APDES_INITIALIZER;
317 
318 long nkptp[] = NKPTP_INITIALIZER;
319 
320 static kmutex_t pmaps_lock;
321 
322 static vaddr_t pmap_maxkvaddr;
323 
324 #define COUNT(x)	/* nothing */
325 
326 /*
327  * XXX kludge: dummy locking to make KASSERTs in uvm_page.c comfortable.
328  * actual locking is done by pm_lock.
329  */
330 #if defined(DIAGNOSTIC)
331 #define	PMAP_SUBOBJ_LOCK(pm, idx) \
332 	KASSERT(mutex_owned(&(pm)->pm_lock)); \
333 	if ((idx) != 0) \
334 		mutex_enter(&(pm)->pm_obj[(idx)].vmobjlock)
335 #define	PMAP_SUBOBJ_UNLOCK(pm, idx) \
336 	KASSERT(mutex_owned(&(pm)->pm_lock)); \
337 	if ((idx) != 0) \
338 		mutex_exit(&(pm)->pm_obj[(idx)].vmobjlock)
339 #else /* defined(DIAGNOSTIC) */
340 #define	PMAP_SUBOBJ_LOCK(pm, idx)	/* nothing */
341 #define	PMAP_SUBOBJ_UNLOCK(pm, idx)	/* nothing */
342 #endif /* defined(DIAGNOSTIC) */
343 
344 /*
345  * Misc. event counters.
346  */
347 struct evcnt pmap_iobmp_evcnt;
348 struct evcnt pmap_ldt_evcnt;
349 
350 /*
351  * Global TLB shootdown mailbox.
352  */
353 struct evcnt pmap_tlb_evcnt __aligned(64);
354 struct pmap_mbox pmap_mbox __aligned(64);
355 
356 /*
357  * PAT
358  */
359 #define	PATENTRY(n, type)	(type << ((n) * 8))
360 #define	PAT_UC		0x0ULL
361 #define	PAT_WC		0x1ULL
362 #define	PAT_WT		0x4ULL
363 #define	PAT_WP		0x5ULL
364 #define	PAT_WB		0x6ULL
365 #define	PAT_UCMINUS	0x7ULL
366 
367 static bool cpu_pat_enabled = false;
368 
369 
370 /*
371  * Per-CPU data.  The pmap mailbox is cache intensive so gets its
372  * own line.  Note that the mailbox must be the first item.
373  */
374 struct pmap_cpu {
375 	/* TLB shootdown */
376 	struct pmap_mbox pc_mbox;
377 };
378 
379 union {
380 	struct pmap_cpu pc;
381 	uint8_t padding[64];
382 } pmap_cpu[MAXCPUS] __aligned(64);
383 
384 /*
385  * global data structures
386  */
387 
388 static struct pmap kernel_pmap_store;	/* the kernel's pmap (proc0) */
389 struct pmap *const kernel_pmap_ptr = &kernel_pmap_store;
390 
391 /*
392  * pmap_pg_g: if our processor supports PG_G in the PTE then we
393  * set pmap_pg_g to PG_G (otherwise it is zero).
394  */
395 
396 int pmap_pg_g = 0;
397 
398 /*
399  * pmap_largepages: if our processor supports PG_PS and we are
400  * using it, this is set to true.
401  */
402 
403 int pmap_largepages;
404 
405 /*
406  * i386 physical memory comes in a big contig chunk with a small
407  * hole toward the front of it...  the following two paddr_t's
408  * (shared with machdep.c) describe the physical address space
409  * of this machine.
410  */
411 paddr_t avail_start;	/* PA of first available physical page */
412 paddr_t avail_end;	/* PA of last available physical page */
413 
414 #ifdef XEN
415 #ifdef __x86_64__
416 /* Dummy PGD for user cr3, used between pmap_deactivate() and pmap_activate() */
417 static paddr_t xen_dummy_user_pgd;
418 #endif /* __x86_64__ */
419 paddr_t pmap_pa_start; /* PA of first physical page for this domain */
420 paddr_t pmap_pa_end;   /* PA of last physical page for this domain */
421 #endif /* XEN */
422 
423 #define	VM_PAGE_TO_PP(pg)	(&(pg)->mdpage.mp_pp)
424 
425 #define	pp_lock(pp)	mutex_spin_enter(&(pp)->pp_lock)
426 #define	pp_unlock(pp)	mutex_spin_exit(&(pp)->pp_lock)
427 #define	pp_locked(pp)	mutex_owned(&(pp)->pp_lock)
428 
429 #define	PV_HASH_SIZE		32768
430 #define	PV_HASH_LOCK_CNT	32
431 
432 struct pv_hash_lock {
433 	kmutex_t lock;
434 } __aligned(CACHE_LINE_SIZE) pv_hash_locks[PV_HASH_LOCK_CNT]
435     __aligned(CACHE_LINE_SIZE);
436 
437 struct pv_hash_head {
438 	SLIST_HEAD(, pv_entry) hh_list;
439 } pv_hash_heads[PV_HASH_SIZE];
440 
441 static u_int
442 pvhash_hash(struct vm_page *ptp, vaddr_t va)
443 {
444 
445 	return (uintptr_t)ptp / sizeof(*ptp) + (va >> PAGE_SHIFT);
446 }
447 
448 static struct pv_hash_head *
449 pvhash_head(u_int hash)
450 {
451 
452 	return &pv_hash_heads[hash % PV_HASH_SIZE];
453 }
454 
455 static kmutex_t *
456 pvhash_lock(u_int hash)
457 {
458 
459 	return &pv_hash_locks[hash % PV_HASH_LOCK_CNT].lock;
460 }
461 
462 static struct pv_entry *
463 pvhash_remove(struct pv_hash_head *hh, struct vm_page *ptp, vaddr_t va)
464 {
465 	struct pv_entry *pve;
466 	struct pv_entry *prev;
467 
468 	prev = NULL;
469 	SLIST_FOREACH(pve, &hh->hh_list, pve_hash) {
470 		if (pve->pve_pte.pte_ptp == ptp &&
471 		    pve->pve_pte.pte_va == va) {
472 			if (prev != NULL) {
473 				SLIST_REMOVE_AFTER(prev, pve_hash);
474 			} else {
475 				SLIST_REMOVE_HEAD(&hh->hh_list, pve_hash);
476 			}
477 			break;
478 		}
479 		prev = pve;
480 	}
481 	return pve;
482 }
483 
484 /*
485  * other data structures
486  */
487 
488 static pt_entry_t protection_codes[8];	/* maps MI prot to i386 prot code */
489 static bool pmap_initialized = false;	/* pmap_init done yet? */
490 
491 /*
492  * the following two vaddr_t's are used during system startup
493  * to keep track of how much of the kernel's VM space we have used.
494  * once the system is started, the management of the remaining kernel
495  * VM space is turned over to the kernel_map vm_map.
496  */
497 
498 static vaddr_t virtual_avail;	/* VA of first free KVA */
499 static vaddr_t virtual_end;	/* VA of last free KVA */
500 
501 /*
502  * linked list of all non-kernel pmaps
503  */
504 
505 static struct pmap_head pmaps;
506 
507 /*
508  * pool that pmap structures are allocated from
509  */
510 
511 static struct pool_cache pmap_cache;
512 
513 /*
514  * pv_entry cache
515  */
516 
517 static struct pool_cache pmap_pv_cache;
518 
519 /*
520  * MULTIPROCESSOR: special VA's/ PTE's are actually allocated inside a
521  * maxcpus*NPTECL array of PTE's, to avoid cache line thrashing
522  * due to false sharing.
523  */
524 
525 #ifdef MULTIPROCESSOR
526 #define PTESLEW(pte, id) ((pte)+(id)*NPTECL)
527 #define VASLEW(va,id) ((va)+(id)*NPTECL*PAGE_SIZE)
528 #else
529 #define PTESLEW(pte, id) (pte)
530 #define VASLEW(va,id) (va)
531 #endif
532 
533 /*
534  * special VAs and the PTEs that map them
535  */
536 static pt_entry_t *csrc_pte, *cdst_pte, *zero_pte, *ptp_pte, *early_zero_pte;
537 static char *csrcp, *cdstp, *zerop, *ptpp, *early_zerop;
538 
539 int pmap_enter_default(pmap_t, vaddr_t, paddr_t, vm_prot_t, u_int);
540 
541 /*
542  * pool and cache that PDPs are allocated from
543  */
544 
545 static struct pool_cache pmap_pdp_cache;
546 int	pmap_pdp_ctor(void *, void *, int);
547 void	pmap_pdp_dtor(void *, void *);
548 #ifdef PAE
549 /* need to allocate items of 4 pages */
550 void *pmap_pdp_alloc(struct pool *, int);
551 void pmap_pdp_free(struct pool *, void *);
552 static struct pool_allocator pmap_pdp_allocator = {
553 	.pa_alloc = pmap_pdp_alloc,
554 	.pa_free = pmap_pdp_free,
555 	.pa_pagesz = PAGE_SIZE * PDP_SIZE,
556 };
557 #endif /* PAE */
558 
559 void *vmmap; /* XXX: used by mem.c... it should really uvm_map_reserve it */
560 
561 extern vaddr_t idt_vaddr;			/* we allocate IDT early */
562 extern paddr_t idt_paddr;
563 
564 #ifdef _LP64
565 extern vaddr_t lo32_vaddr;
566 extern vaddr_t lo32_paddr;
567 #endif
568 
569 extern int end;
570 
571 #ifdef i386
572 /* stuff to fix the pentium f00f bug */
573 extern vaddr_t pentium_idt_vaddr;
574 #endif
575 
576 
577 /*
578  * local prototypes
579  */
580 
581 static struct vm_page	*pmap_get_ptp(struct pmap *, vaddr_t,
582 				      pd_entry_t * const *);
583 static struct vm_page	*pmap_find_ptp(struct pmap *, vaddr_t, paddr_t, int);
584 static void		 pmap_freepage(struct pmap *, struct vm_page *, int);
585 static void		 pmap_free_ptp(struct pmap *, struct vm_page *,
586 				       vaddr_t, pt_entry_t *,
587 				       pd_entry_t * const *);
588 static bool		 pmap_is_curpmap(struct pmap *);
589 static bool		 pmap_is_active(struct pmap *, struct cpu_info *, bool);
590 static bool		 pmap_remove_pte(struct pmap *, struct vm_page *,
591 					 pt_entry_t *, vaddr_t,
592 					 struct pv_entry **);
593 static pt_entry_t	 pmap_remove_ptes(struct pmap *, struct vm_page *,
594 					  vaddr_t, vaddr_t, vaddr_t,
595 					  struct pv_entry **);
596 
597 static void		 pmap_unmap_apdp(void);
598 static bool		 pmap_get_physpage(vaddr_t, int, paddr_t *);
599 static void		 pmap_alloc_level(pd_entry_t * const *, vaddr_t, int,
600 					  long *);
601 
602 static bool		 pmap_reactivate(struct pmap *);
603 
604 /*
605  * p m a p   h e l p e r   f u n c t i o n s
606  */
607 
608 static inline void
609 pmap_stats_update(struct pmap *pmap, int resid_diff, int wired_diff)
610 {
611 
612 	if (pmap == pmap_kernel()) {
613 		atomic_add_long(&pmap->pm_stats.resident_count, resid_diff);
614 		atomic_add_long(&pmap->pm_stats.wired_count, wired_diff);
615 	} else {
616 		KASSERT(mutex_owned(&pmap->pm_lock));
617 		pmap->pm_stats.resident_count += resid_diff;
618 		pmap->pm_stats.wired_count += wired_diff;
619 	}
620 }
621 
622 static inline void
623 pmap_stats_update_bypte(struct pmap *pmap, pt_entry_t npte, pt_entry_t opte)
624 {
625 	int resid_diff = ((npte & PG_V) ? 1 : 0) - ((opte & PG_V) ? 1 : 0);
626 	int wired_diff = ((npte & PG_W) ? 1 : 0) - ((opte & PG_W) ? 1 : 0);
627 
628 	KASSERT((npte & (PG_V | PG_W)) != PG_W);
629 	KASSERT((opte & (PG_V | PG_W)) != PG_W);
630 
631 	pmap_stats_update(pmap, resid_diff, wired_diff);
632 }
633 
634 /*
635  * ptp_to_pmap: lookup pmap by ptp
636  */
637 
638 static struct pmap *
639 ptp_to_pmap(struct vm_page *ptp)
640 {
641 	struct pmap *pmap;
642 
643 	if (ptp == NULL) {
644 		return pmap_kernel();
645 	}
646 	pmap = (struct pmap *)ptp->uobject;
647 	KASSERT(pmap != NULL);
648 	KASSERT(&pmap->pm_obj[0] == ptp->uobject);
649 	return pmap;
650 }
651 
652 static inline struct pv_pte *
653 pve_to_pvpte(struct pv_entry *pve)
654 {
655 
656 	KASSERT((void *)&pve->pve_pte == (void *)pve);
657 	return &pve->pve_pte;
658 }
659 
660 static inline struct pv_entry *
661 pvpte_to_pve(struct pv_pte *pvpte)
662 {
663 	struct pv_entry *pve = (void *)pvpte;
664 
665 	KASSERT(pve_to_pvpte(pve) == pvpte);
666 	return pve;
667 }
668 
669 /*
670  * pv_pte_first, pv_pte_next: PV list iterator.
671  */
672 
673 static struct pv_pte *
674 pv_pte_first(struct pmap_page *pp)
675 {
676 
677 	KASSERT(pp_locked(pp));
678 	if ((pp->pp_flags & PP_EMBEDDED) != 0) {
679 		return &pp->pp_pte;
680 	}
681 	return pve_to_pvpte(LIST_FIRST(&pp->pp_head.pvh_list));
682 }
683 
684 static struct pv_pte *
685 pv_pte_next(struct pmap_page *pp, struct pv_pte *pvpte)
686 {
687 
688 	KASSERT(pvpte != NULL);
689 	KASSERT(pp_locked(pp));
690 	if (pvpte == &pp->pp_pte) {
691 		KASSERT((pp->pp_flags & PP_EMBEDDED) != 0);
692 		return NULL;
693 	}
694 	KASSERT((pp->pp_flags & PP_EMBEDDED) == 0);
695 	return pve_to_pvpte(LIST_NEXT(pvpte_to_pve(pvpte), pve_list));
696 }
697 
698 /*
699  * pmap_is_curpmap: is this pmap the one currently loaded [in %cr3]?
700  *		of course the kernel is always loaded
701  */
702 
703 inline static bool
704 pmap_is_curpmap(struct pmap *pmap)
705 {
706 #if defined(XEN) && defined(__x86_64__)
707 	/*
708 	 * Only kernel pmap is physically loaded.
709 	 * User PGD may be active, but TLB will be flushed
710 	 * with HYPERVISOR_iret anyway, so let's say no
711 	 */
712 	return(pmap == pmap_kernel());
713 #else /* XEN && __x86_64__*/
714 	return((pmap == pmap_kernel()) ||
715 	       (pmap == curcpu()->ci_pmap));
716 #endif
717 }
718 
719 /*
720  * pmap_is_active: is this pmap loaded into the specified processor's %cr3?
721  */
722 
723 inline static bool
724 pmap_is_active(struct pmap *pmap, struct cpu_info *ci, bool kernel)
725 {
726 
727 	return (pmap == pmap_kernel() ||
728 	    (pmap->pm_cpus & ci->ci_cpumask) != 0 ||
729 	    (kernel && (pmap->pm_kernel_cpus & ci->ci_cpumask) != 0));
730 }
731 
732 static void
733 pmap_apte_flush(struct pmap *pmap)
734 {
735 
736 	KASSERT(kpreempt_disabled());
737 
738 	/*
739 	 * Flush the APTE mapping from all other CPUs that
740 	 * are using the pmap we are using (who's APTE space
741 	 * is the one we've just modified).
742 	 *
743 	 * XXXthorpej -- find a way to defer the IPI.
744 	 */
745 	pmap_tlb_shootdown(pmap, (vaddr_t)-1LL, 0, 0);
746 	pmap_tlb_shootwait();
747 }
748 
749 /*
750  * Unmap the content of APDP PDEs
751  */
752 static void
753 pmap_unmap_apdp(void)
754 {
755 	int i;
756 
757 	for (i = 0; i < PDP_SIZE; i++) {
758 		pmap_pte_set(APDP_PDE+i, 0);
759 #if defined (XEN) && defined (PAE)
760 		/* clear shadow entries too */
761 		pmap_pte_set(APDP_PDE_SHADOW+i, 0);
762 #endif
763 	}
764 }
765 
766 /*
767  *	Add a reference to the specified pmap.
768  */
769 
770 inline void
771 pmap_reference(struct pmap *pmap)
772 {
773 
774 	atomic_inc_uint(&pmap->pm_obj[0].uo_refs);
775 }
776 
777 /*
778  * pmap_map_ptes: map a pmap's PTEs into KVM and lock them in
779  *
780  * => we lock enough pmaps to keep things locked in
781  * => must be undone with pmap_unmap_ptes before returning
782  */
783 
784 void
785 pmap_map_ptes(struct pmap *pmap, struct pmap **pmap2,
786 	      pd_entry_t **ptepp, pd_entry_t * const **pdeppp)
787 {
788 	pd_entry_t opde, npde;
789 	struct pmap *ourpmap;
790 	struct cpu_info *ci;
791 	struct lwp *l;
792 	bool iscurrent;
793 	uint64_t ncsw;
794 #ifdef XEN
795 	int s, i;
796 #endif
797 
798 	/* the kernel's pmap is always accessible */
799 	if (pmap == pmap_kernel()) {
800 		*pmap2 = NULL;
801 		*ptepp = PTE_BASE;
802 		*pdeppp = normal_pdes;
803 		return;
804 	}
805 	KASSERT(kpreempt_disabled());
806 
807  retry:
808 	l = curlwp;
809 	ncsw = l->l_ncsw;
810  	ourpmap = NULL;
811 	ci = curcpu();
812 #if defined(XEN) && defined(__x86_64__)
813 	/*
814 	 * curmap can only be pmap_kernel so at this point
815 	 * pmap_is_curpmap is always false
816 	 */
817 	iscurrent = 0;
818 	ourpmap = pmap_kernel();
819 #else /* XEN && __x86_64__*/
820 	if (ci->ci_want_pmapload &&
821 	    vm_map_pmap(&l->l_proc->p_vmspace->vm_map) == pmap) {
822 		pmap_load();
823 		if (l->l_ncsw != ncsw)
824 			goto retry;
825 	}
826 	iscurrent = pmap_is_curpmap(pmap);
827 	/* if curpmap then we are always mapped */
828 	if (iscurrent) {
829 		mutex_enter(&pmap->pm_lock);
830 		*pmap2 = NULL;
831 		*ptepp = PTE_BASE;
832 		*pdeppp = normal_pdes;
833 		goto out;
834 	}
835 	ourpmap = ci->ci_pmap;
836 #endif /* XEN && __x86_64__ */
837 
838 	/* need to lock both curpmap and pmap: use ordered locking */
839 	pmap_reference(ourpmap);
840 	if ((uintptr_t) pmap < (uintptr_t) ourpmap) {
841 		mutex_enter(&pmap->pm_lock);
842 		mutex_enter(&ourpmap->pm_lock);
843 	} else {
844 		mutex_enter(&ourpmap->pm_lock);
845 		mutex_enter(&pmap->pm_lock);
846 	}
847 
848 	if (l->l_ncsw != ncsw)
849 		goto unlock_and_retry;
850 
851 	/* need to load a new alternate pt space into curpmap? */
852 	COUNT(apdp_pde_map);
853 	opde = *APDP_PDE;
854 	if (!pmap_valid_entry(opde) ||
855 	    pmap_pte2pa(opde) != pmap_pdirpa(pmap, 0)) {
856 #ifdef XEN
857 		s = splvm();
858 		/* Make recursive entry usable in user PGD */
859 		for (i = 0; i < PDP_SIZE; i++) {
860 			npde = pmap_pa2pte(
861 			    pmap_pdirpa(pmap, i * NPDPG)) | PG_k | PG_V;
862 			xpq_queue_pte_update(
863 			    xpmap_ptom(pmap_pdirpa(pmap, PDIR_SLOT_PTE + i)),
864 			    npde);
865 			xpq_queue_pte_update(xpmap_ptetomach(&APDP_PDE[i]),
866 			    npde);
867 #ifdef PAE
868 			/* update shadow entry too */
869 			xpq_queue_pte_update(
870 			    xpmap_ptetomach(&APDP_PDE_SHADOW[i]), npde);
871 #endif /* PAE */
872 			xpq_queue_invlpg(
873 			    (vaddr_t)&pmap->pm_pdir[PDIR_SLOT_PTE + i]);
874 		}
875 		if (pmap_valid_entry(opde))
876 			pmap_apte_flush(ourpmap);
877 		splx(s);
878 #else /* XEN */
879 		int i;
880 		for (i = 0; i < PDP_SIZE; i++) {
881 			npde = pmap_pa2pte(
882 			    pmap_pdirpa(pmap, i * NPDPG)) | PG_RW | PG_V;
883 			pmap_pte_set(APDP_PDE+i, npde);
884 		}
885 		pmap_pte_flush();
886 		if (pmap_valid_entry(opde))
887 			pmap_apte_flush(ourpmap);
888 #endif /* XEN */
889 	}
890 	*pmap2 = ourpmap;
891 	*ptepp = APTE_BASE;
892 	*pdeppp = alternate_pdes;
893 	KASSERT(l->l_ncsw == ncsw);
894 #if !defined(XEN) || !defined(__x86_64__)
895  out:
896 #endif
897  	/*
898  	 * might have blocked, need to retry?
899  	 */
900 	if (l->l_ncsw != ncsw) {
901  unlock_and_retry:
902 	    	if (ourpmap != NULL) {
903 			mutex_exit(&ourpmap->pm_lock);
904 			pmap_destroy(ourpmap);
905 		}
906 		mutex_exit(&pmap->pm_lock);
907 		goto retry;
908 	}
909 
910 	return;
911 }
912 
913 /*
914  * pmap_unmap_ptes: unlock the PTE mapping of "pmap"
915  */
916 
917 void
918 pmap_unmap_ptes(struct pmap *pmap, struct pmap *pmap2)
919 {
920 
921 	if (pmap == pmap_kernel()) {
922 		return;
923 	}
924 	KASSERT(kpreempt_disabled());
925 	if (pmap2 == NULL) {
926 		mutex_exit(&pmap->pm_lock);
927 	} else {
928 #if defined(XEN) && defined(__x86_64__)
929 		KASSERT(pmap2 == pmap_kernel());
930 #else
931 		KASSERT(curcpu()->ci_pmap == pmap2);
932 #endif
933 #if defined(MULTIPROCESSOR)
934 		pmap_unmap_apdp();
935 		pmap_pte_flush();
936 		pmap_apte_flush(pmap2);
937 #endif
938 		COUNT(apdp_pde_unmap);
939 		mutex_exit(&pmap->pm_lock);
940 		mutex_exit(&pmap2->pm_lock);
941 		pmap_destroy(pmap2);
942 	}
943 }
944 
945 inline static void
946 pmap_exec_account(struct pmap *pm, vaddr_t va, pt_entry_t opte, pt_entry_t npte)
947 {
948 
949 #if !defined(__x86_64__)
950 	if (curproc == NULL || curproc->p_vmspace == NULL ||
951 	    pm != vm_map_pmap(&curproc->p_vmspace->vm_map))
952 		return;
953 
954 	if ((opte ^ npte) & PG_X)
955 		pmap_update_pg(va);
956 
957 	/*
958 	 * Executability was removed on the last executable change.
959 	 * Reset the code segment to something conservative and
960 	 * let the trap handler deal with setting the right limit.
961 	 * We can't do that because of locking constraints on the vm map.
962 	 */
963 
964 	if ((opte & PG_X) && (npte & PG_X) == 0 && va == pm->pm_hiexec) {
965 		struct trapframe *tf = curlwp->l_md.md_regs;
966 
967 		tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL);
968 		pm->pm_hiexec = I386_MAX_EXE_ADDR;
969 	}
970 #endif /* !defined(__x86_64__) */
971 }
972 
973 #if !defined(__x86_64__)
974 /*
975  * Fixup the code segment to cover all potential executable mappings.
976  * returns 0 if no changes to the code segment were made.
977  */
978 
979 int
980 pmap_exec_fixup(struct vm_map *map, struct trapframe *tf, struct pcb *pcb)
981 {
982 	struct vm_map_entry *ent;
983 	struct pmap *pm = vm_map_pmap(map);
984 	vaddr_t va = 0;
985 
986 	vm_map_lock_read(map);
987 	for (ent = (&map->header)->next; ent != &map->header; ent = ent->next) {
988 
989 		/*
990 		 * This entry has greater va than the entries before.
991 		 * We need to make it point to the last page, not past it.
992 		 */
993 
994 		if (ent->protection & VM_PROT_EXECUTE)
995 			va = trunc_page(ent->end) - PAGE_SIZE;
996 	}
997 	vm_map_unlock_read(map);
998 	if (va == pm->pm_hiexec && tf->tf_cs == GSEL(GUCODEBIG_SEL, SEL_UPL))
999 		return (0);
1000 
1001 	pm->pm_hiexec = va;
1002 	if (pm->pm_hiexec > I386_MAX_EXE_ADDR) {
1003 		tf->tf_cs = GSEL(GUCODEBIG_SEL, SEL_UPL);
1004 	} else {
1005 		tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL);
1006 		return (0);
1007 	}
1008 	return (1);
1009 }
1010 #endif /* !defined(__x86_64__) */
1011 
1012 void
1013 pat_init(struct cpu_info *ci)
1014 {
1015 	uint64_t pat;
1016 
1017 	if (!(ci->ci_feat_val[0] & CPUID_PAT))
1018 		return;
1019 
1020 	/* We change WT to WC. Leave all other entries the default values. */
1021 	pat = PATENTRY(0, PAT_WB) | PATENTRY(1, PAT_WC) |
1022 	      PATENTRY(2, PAT_UCMINUS) | PATENTRY(3, PAT_UC) |
1023 	      PATENTRY(4, PAT_WB) | PATENTRY(5, PAT_WC) |
1024 	      PATENTRY(6, PAT_UCMINUS) | PATENTRY(7, PAT_UC);
1025 
1026 	wrmsr(MSR_CR_PAT, pat);
1027 	cpu_pat_enabled = true;
1028 	aprint_debug_dev(ci->ci_dev, "PAT enabled\n");
1029 }
1030 
1031 static pt_entry_t
1032 pmap_pat_flags(u_int flags)
1033 {
1034 	u_int cacheflags = (flags & PMAP_CACHE_MASK);
1035 
1036 	if (!cpu_pat_enabled) {
1037 		switch (cacheflags) {
1038 		case PMAP_NOCACHE:
1039 		case PMAP_NOCACHE_OVR:
1040 			/* results in PGC_UCMINUS on cpus which have
1041 			 * the cpuid PAT but PAT "disabled"
1042 			 */
1043 			return PG_N;
1044 		default:
1045 			return 0;
1046 		}
1047 	}
1048 
1049 	switch (cacheflags) {
1050 	case PMAP_NOCACHE:
1051 		return PGC_UC;
1052 	case PMAP_WRITE_COMBINE:
1053 		return PGC_WC;
1054 	case PMAP_WRITE_BACK:
1055 		return PGC_WB;
1056 	case PMAP_NOCACHE_OVR:
1057 		return PGC_UCMINUS;
1058 	}
1059 
1060 	return 0;
1061 }
1062 
1063 /*
1064  * p m a p   k e n t e r   f u n c t i o n s
1065  *
1066  * functions to quickly enter/remove pages from the kernel address
1067  * space.   pmap_kremove is exported to MI kernel.  we make use of
1068  * the recursive PTE mappings.
1069  */
1070 
1071 /*
1072  * pmap_kenter_pa: enter a kernel mapping without R/M (pv_entry) tracking
1073  *
1074  * => no need to lock anything, assume va is already allocated
1075  * => should be faster than normal pmap enter function
1076  */
1077 
1078 void
1079 pmap_kenter_pa(vaddr_t va, paddr_t pa, vm_prot_t prot, u_int flags)
1080 {
1081 	pt_entry_t *pte, opte, npte;
1082 
1083 	KASSERT(!(prot & ~VM_PROT_ALL));
1084 
1085 	if (va < VM_MIN_KERNEL_ADDRESS)
1086 		pte = vtopte(va);
1087 	else
1088 		pte = kvtopte(va);
1089 #ifdef DOM0OPS
1090 	if (pa < pmap_pa_start || pa >= pmap_pa_end) {
1091 #ifdef DEBUG
1092 		printk("pmap_kenter_pa: pa 0x%" PRIx64 " for va 0x%" PRIx64
1093 		    " outside range\n", (int64_t)pa, (int64_t)va);
1094 #endif /* DEBUG */
1095 		npte = pa;
1096 	} else
1097 #endif /* DOM0OPS */
1098 		npte = pmap_pa2pte(pa);
1099 	npte |= protection_codes[prot] | PG_k | PG_V | pmap_pg_g;
1100 	npte |= pmap_pat_flags(flags);
1101 	opte = pmap_pte_testset(pte, npte); /* zap! */
1102 #if defined(DIAGNOSTIC)
1103 	/* XXX For now... */
1104 	if (opte & PG_PS)
1105 		panic("pmap_kenter_pa: PG_PS");
1106 #endif
1107 	if ((opte & (PG_V | PG_U)) == (PG_V | PG_U)) {
1108 		/* This should not happen, so no need to batch updates. */
1109 		kpreempt_disable();
1110 		pmap_tlb_shootdown(pmap_kernel(), va, 0, opte);
1111 		kpreempt_enable();
1112 	}
1113 }
1114 
1115 void
1116 pmap_emap_enter(vaddr_t va, paddr_t pa, vm_prot_t prot)
1117 {
1118 	pt_entry_t *pte, opte, npte;
1119 
1120 	KASSERT((prot & ~VM_PROT_ALL) == 0);
1121 	pte = (va < VM_MIN_KERNEL_ADDRESS) ? vtopte(va) : kvtopte(va);
1122 
1123 #ifdef DOM0OPS
1124 	if (pa < pmap_pa_start || pa >= pmap_pa_end) {
1125 		npte = pa;
1126 	} else
1127 #endif
1128 		npte = pmap_pa2pte(pa);
1129 
1130 	npte = pmap_pa2pte(pa);
1131 	npte |= protection_codes[prot] | PG_k | PG_V;
1132 	opte = pmap_pte_testset(pte, npte);
1133 }
1134 
1135 /*
1136  * pmap_emap_sync: perform TLB flush or pmap load, if it was deferred.
1137  */
1138 void
1139 pmap_emap_sync(bool canload)
1140 {
1141 	struct cpu_info *ci = curcpu();
1142 	struct pmap *pmap;
1143 
1144 	KASSERT(kpreempt_disabled());
1145 	if (__predict_true(ci->ci_want_pmapload && canload)) {
1146 		/*
1147 		 * XXX: Hint for pmap_reactivate(), which might suggest to
1148 		 * not perform TLB flush, if state has not changed.
1149 		 */
1150 		pmap = vm_map_pmap(&curlwp->l_proc->p_vmspace->vm_map);
1151 		if (__predict_false(pmap == ci->ci_pmap)) {
1152 			const uint32_t cpumask = ci->ci_cpumask;
1153 			atomic_and_32(&pmap->pm_cpus, ~cpumask);
1154 		}
1155 		pmap_load();
1156 		KASSERT(ci->ci_want_pmapload == 0);
1157 	} else {
1158 		tlbflush();
1159 	}
1160 
1161 }
1162 
1163 void
1164 pmap_emap_remove(vaddr_t sva, vsize_t len)
1165 {
1166 	pt_entry_t *pte, xpte;
1167 	vaddr_t va, eva = sva + len;
1168 
1169 	for (va = sva; va < eva; va += PAGE_SIZE) {
1170 		pte = (va < VM_MIN_KERNEL_ADDRESS) ? vtopte(va) : kvtopte(va);
1171 		xpte |= pmap_pte_testset(pte, 0);
1172 	}
1173 }
1174 
1175 __weak_alias(pmap_kenter_ma, pmap_kenter_pa);
1176 
1177 #if defined(__x86_64__)
1178 /*
1179  * Change protection for a virtual address. Local for a CPU only, don't
1180  * care about TLB shootdowns.
1181  *
1182  * => must be called with preemption disabled
1183  */
1184 void
1185 pmap_changeprot_local(vaddr_t va, vm_prot_t prot)
1186 {
1187 	pt_entry_t *pte, opte, npte;
1188 
1189 	KASSERT(kpreempt_disabled());
1190 
1191 	if (va < VM_MIN_KERNEL_ADDRESS)
1192 		pte = vtopte(va);
1193 	else
1194 		pte = kvtopte(va);
1195 
1196 	npte = opte = *pte;
1197 
1198 	if ((prot & VM_PROT_WRITE) != 0)
1199 		npte |= PG_RW;
1200 	else
1201 		npte &= ~PG_RW;
1202 
1203 	if (opte != npte) {
1204 		pmap_pte_set(pte, npte);
1205 		pmap_pte_flush();
1206 		invlpg(va);
1207 	}
1208 }
1209 #endif /* defined(__x86_64__) */
1210 
1211 /*
1212  * pmap_kremove: remove a kernel mapping(s) without R/M (pv_entry) tracking
1213  *
1214  * => no need to lock anything
1215  * => caller must dispose of any vm_page mapped in the va range
1216  * => note: not an inline function
1217  * => we assume the va is page aligned and the len is a multiple of PAGE_SIZE
1218  * => we assume kernel only unmaps valid addresses and thus don't bother
1219  *    checking the valid bit before doing TLB flushing
1220  * => must be followed by call to pmap_update() before reuse of page
1221  */
1222 
1223 void
1224 pmap_kremove(vaddr_t sva, vsize_t len)
1225 {
1226 	pt_entry_t *pte, xpte;
1227 	vaddr_t va, eva;
1228 
1229 	eva = sva + len;
1230 	xpte = 0;
1231 
1232 	for (va = sva; va < eva; va += PAGE_SIZE) {
1233 		if (va < VM_MIN_KERNEL_ADDRESS)
1234 			pte = vtopte(va);
1235 		else
1236 			pte = kvtopte(va);
1237 		xpte |= pmap_pte_testset(pte, 0); /* zap! */
1238 #if defined(DIAGNOSTIC)
1239 		/* XXX For now... */
1240 		if (xpte & PG_PS)
1241 			panic("pmap_kremove: PG_PS");
1242 		if (xpte & PG_PVLIST)
1243 			panic("pmap_kremove: PG_PVLIST mapping for 0x%lx",
1244 			      va);
1245 #endif
1246 	}
1247 	if ((xpte & (PG_V | PG_U)) == (PG_V | PG_U)) {
1248 		kpreempt_disable();
1249 		pmap_tlb_shootdown(pmap_kernel(), sva, eva, xpte);
1250 		kpreempt_enable();
1251 	}
1252 }
1253 
1254 /*
1255  * p m a p   i n i t   f u n c t i o n s
1256  *
1257  * pmap_bootstrap and pmap_init are called during system startup
1258  * to init the pmap module.   pmap_bootstrap() does a low level
1259  * init just to get things rolling.   pmap_init() finishes the job.
1260  */
1261 
1262 /*
1263  * pmap_bootstrap: get the system in a state where it can run with VM
1264  *	properly enabled (called before main()).   the VM system is
1265  *      fully init'd later...
1266  *
1267  * => on i386, locore.s has already enabled the MMU by allocating
1268  *	a PDP for the kernel, and nkpde PTP's for the kernel.
1269  * => kva_start is the first free virtual address in kernel space
1270  */
1271 
1272 void
1273 pmap_bootstrap(vaddr_t kva_start)
1274 {
1275 	struct pmap *kpm;
1276 	pt_entry_t *pte;
1277 	int i;
1278 	vaddr_t kva;
1279 #ifndef XEN
1280 	unsigned long p1i;
1281 	vaddr_t kva_end;
1282 #endif
1283 
1284 	pt_entry_t pg_nx = (cpu_feature[2] & CPUID_NOX ? PG_NX : 0);
1285 
1286 	/*
1287 	 * set up our local static global vars that keep track of the
1288 	 * usage of KVM before kernel_map is set up
1289 	 */
1290 
1291 	virtual_avail = kva_start;		/* first free KVA */
1292 	virtual_end = VM_MAX_KERNEL_ADDRESS;	/* last KVA */
1293 
1294 	/*
1295 	 * set up protection_codes: we need to be able to convert from
1296 	 * a MI protection code (some combo of VM_PROT...) to something
1297 	 * we can jam into a i386 PTE.
1298 	 */
1299 
1300 	protection_codes[VM_PROT_NONE] = pg_nx;			/* --- */
1301 	protection_codes[VM_PROT_EXECUTE] = PG_RO | PG_X;	/* --x */
1302 	protection_codes[VM_PROT_READ] = PG_RO | pg_nx;		/* -r- */
1303 	protection_codes[VM_PROT_READ|VM_PROT_EXECUTE] = PG_RO | PG_X;/* -rx */
1304 	protection_codes[VM_PROT_WRITE] = PG_RW | pg_nx;	/* w-- */
1305 	protection_codes[VM_PROT_WRITE|VM_PROT_EXECUTE] = PG_RW | PG_X;/* w-x */
1306 	protection_codes[VM_PROT_WRITE|VM_PROT_READ] = PG_RW | pg_nx;
1307 								/* wr- */
1308 	protection_codes[VM_PROT_ALL] = PG_RW | PG_X;		/* wrx */
1309 
1310 	/*
1311 	 * now we init the kernel's pmap
1312 	 *
1313 	 * the kernel pmap's pm_obj is not used for much.   however, in
1314 	 * user pmaps the pm_obj contains the list of active PTPs.
1315 	 * the pm_obj currently does not have a pager.   it might be possible
1316 	 * to add a pager that would allow a process to read-only mmap its
1317 	 * own page tables (fast user level vtophys?).   this may or may not
1318 	 * be useful.
1319 	 */
1320 
1321 	kpm = pmap_kernel();
1322 	for (i = 0; i < PTP_LEVELS - 1; i++) {
1323 		UVM_OBJ_INIT(&kpm->pm_obj[i], NULL, 1);
1324 		kpm->pm_ptphint[i] = NULL;
1325 	}
1326 	memset(&kpm->pm_list, 0, sizeof(kpm->pm_list));  /* pm_list not used */
1327 
1328 	kpm->pm_pdir = (pd_entry_t *)(PDPpaddr + KERNBASE);
1329 	for (i = 0; i < PDP_SIZE; i++)
1330 		kpm->pm_pdirpa[i] = PDPpaddr + PAGE_SIZE * i;
1331 
1332 	kpm->pm_stats.wired_count = kpm->pm_stats.resident_count =
1333 		x86_btop(kva_start - VM_MIN_KERNEL_ADDRESS);
1334 
1335 	/*
1336 	 * the above is just a rough estimate and not critical to the proper
1337 	 * operation of the system.
1338 	 */
1339 
1340 #ifndef XEN
1341 	/*
1342 	 * Begin to enable global TLB entries if they are supported.
1343 	 * The G bit has no effect until the CR4_PGE bit is set in CR4,
1344 	 * which happens in cpu_init(), which is run on each cpu
1345 	 * (and happens later)
1346 	 */
1347 
1348 	if (cpu_feature[0] & CPUID_PGE) {
1349 		pmap_pg_g = PG_G;		/* enable software */
1350 
1351 		/* add PG_G attribute to already mapped kernel pages */
1352 		if (KERNBASE == VM_MIN_KERNEL_ADDRESS) {
1353 			kva_end = virtual_avail;
1354 		} else {
1355 			extern vaddr_t eblob, esym;
1356 			kva_end = (vaddr_t)&end;
1357 			if (esym > kva_end)
1358 				kva_end = esym;
1359 			if (eblob > kva_end)
1360 				kva_end = eblob;
1361 			kva_end = roundup(kva_end, PAGE_SIZE);
1362 		}
1363 		for (kva = KERNBASE; kva < kva_end; kva += PAGE_SIZE) {
1364 			p1i = pl1_i(kva);
1365 			if (pmap_valid_entry(PTE_BASE[p1i]))
1366 				PTE_BASE[p1i] |= PG_G;
1367 		}
1368 	}
1369 
1370 	/*
1371 	 * enable large pages if they are supported.
1372 	 */
1373 
1374 	if (cpu_feature[0] & CPUID_PSE) {
1375 		paddr_t pa;
1376 		pd_entry_t *pde;
1377 		extern char __data_start;
1378 
1379 		lcr4(rcr4() | CR4_PSE);	/* enable hardware (via %cr4) */
1380 		pmap_largepages = 1;	/* enable software */
1381 
1382 		/*
1383 		 * the TLB must be flushed after enabling large pages
1384 		 * on Pentium CPUs, according to section 3.6.2.2 of
1385 		 * "Intel Architecture Software Developer's Manual,
1386 		 * Volume 3: System Programming".
1387 		 */
1388 		tlbflush();
1389 
1390 		/*
1391 		 * now, remap the kernel text using large pages.  we
1392 		 * assume that the linker has properly aligned the
1393 		 * .data segment to a NBPD_L2 boundary.
1394 		 */
1395 		kva_end = rounddown((vaddr_t)&__data_start, NBPD_L1);
1396 		for (pa = 0, kva = KERNBASE; kva + NBPD_L2 <= kva_end;
1397 		     kva += NBPD_L2, pa += NBPD_L2) {
1398 			pde = &L2_BASE[pl2_i(kva)];
1399 			*pde = pa | pmap_pg_g | PG_PS |
1400 			    PG_KR | PG_V;	/* zap! */
1401 			tlbflush();
1402 		}
1403 #if defined(DEBUG)
1404 		aprint_normal("kernel text is mapped with %" PRIuPSIZE " large "
1405 		    "pages and %" PRIuPSIZE " normal pages\n",
1406 		    howmany(kva - KERNBASE, NBPD_L2),
1407 		    howmany((vaddr_t)&__data_start - kva, NBPD_L1));
1408 #endif /* defined(DEBUG) */
1409 	}
1410 #endif /* !XEN */
1411 
1412 	if (VM_MIN_KERNEL_ADDRESS != KERNBASE) {
1413 		/*
1414 		 * zero_pte is stuck at the end of mapped space for the kernel
1415 		 * image (disjunct from kva space). This is done so that it
1416 		 * can safely be used in pmap_growkernel (pmap_get_physpage),
1417 		 * when it's called for the first time.
1418 		 * XXXfvdl fix this for MULTIPROCESSOR later.
1419 		 */
1420 
1421 		early_zerop = (void *)(KERNBASE + NKL2_KIMG_ENTRIES * NBPD_L2);
1422 		early_zero_pte = PTE_BASE + pl1_i((vaddr_t)early_zerop);
1423 	}
1424 
1425 	/*
1426 	 * now we allocate the "special" VAs which are used for tmp mappings
1427 	 * by the pmap (and other modules).    we allocate the VAs by advancing
1428 	 * virtual_avail (note that there are no pages mapped at these VAs).
1429 	 * we find the PTE that maps the allocated VA via the linear PTE
1430 	 * mapping.
1431 	 */
1432 
1433 	pte = PTE_BASE + pl1_i(virtual_avail);
1434 
1435 #ifdef MULTIPROCESSOR
1436 	/*
1437 	 * Waste some VA space to avoid false sharing of cache lines
1438 	 * for page table pages: Give each possible CPU a cache line
1439 	 * of PTE's (8) to play with, though we only need 4.  We could
1440 	 * recycle some of this waste by putting the idle stacks here
1441 	 * as well; we could waste less space if we knew the largest
1442 	 * CPU ID beforehand.
1443 	 */
1444 	csrcp = (char *) virtual_avail;  csrc_pte = pte;
1445 
1446 	cdstp = (char *) virtual_avail+PAGE_SIZE;  cdst_pte = pte+1;
1447 
1448 	zerop = (char *) virtual_avail+PAGE_SIZE*2;  zero_pte = pte+2;
1449 
1450 	ptpp = (char *) virtual_avail+PAGE_SIZE*3;  ptp_pte = pte+3;
1451 
1452 	virtual_avail += PAGE_SIZE * maxcpus * NPTECL;
1453 	pte += maxcpus * NPTECL;
1454 #else
1455 	csrcp = (void *) virtual_avail;  csrc_pte = pte;	/* allocate */
1456 	virtual_avail += PAGE_SIZE; pte++;			/* advance */
1457 
1458 	cdstp = (void *) virtual_avail;  cdst_pte = pte;
1459 	virtual_avail += PAGE_SIZE; pte++;
1460 
1461 	zerop = (void *) virtual_avail;  zero_pte = pte;
1462 	virtual_avail += PAGE_SIZE; pte++;
1463 
1464 	ptpp = (void *) virtual_avail;  ptp_pte = pte;
1465 	virtual_avail += PAGE_SIZE; pte++;
1466 #endif
1467 
1468 	if (VM_MIN_KERNEL_ADDRESS == KERNBASE) {
1469 		early_zerop = zerop;
1470 		early_zero_pte = zero_pte;
1471 	}
1472 
1473 	/*
1474 	 * Nothing after this point actually needs pte;
1475 	 */
1476 	pte = (void *)0xdeadbeef;
1477 
1478 	/* XXX: vmmap used by mem.c... should be uvm_map_reserve */
1479 	/* XXXfvdl PTEs not needed here */
1480 	vmmap = (char *)virtual_avail;			/* don't need pte */
1481 	virtual_avail += PAGE_SIZE; pte++;
1482 
1483 #ifdef XEN
1484 #ifdef __x86_64__
1485 	/*
1486 	 * We want a dummy page directory for Xen:
1487 	 * when deactivate a pmap, Xen will still consider it active.
1488 	 * So we set user PGD to this one to lift all protection on
1489 	 * the now inactive page tables set.
1490 	 */
1491 	xen_dummy_user_pgd = avail_start;
1492 	avail_start += PAGE_SIZE;
1493 
1494 	/* Zero fill it, the less checks in Xen it requires the better */
1495 	memset((void *) (xen_dummy_user_pgd + KERNBASE), 0, PAGE_SIZE);
1496 	/* Mark read-only */
1497 	HYPERVISOR_update_va_mapping(xen_dummy_user_pgd + KERNBASE,
1498 	    pmap_pa2pte(xen_dummy_user_pgd) | PG_u | PG_V, UVMF_INVLPG);
1499 	/* Pin as L4 */
1500 	xpq_queue_pin_l4_table(xpmap_ptom_masked(xen_dummy_user_pgd));
1501 #endif /* __x86_64__ */
1502 	idt_vaddr = virtual_avail;                      /* don't need pte */
1503 	idt_paddr = avail_start;                        /* steal a page */
1504 	/*
1505 	 * Xen require one more page as we can't store
1506 	 * GDT and LDT on the same page
1507 	 */
1508 	virtual_avail += 3 * PAGE_SIZE;
1509 	avail_start += 3 * PAGE_SIZE;
1510 #else /* XEN */
1511 	idt_vaddr = virtual_avail;			/* don't need pte */
1512 	idt_paddr = avail_start;			/* steal a page */
1513 #if defined(__x86_64__)
1514 	virtual_avail += 2 * PAGE_SIZE; pte += 2;
1515 	avail_start += 2 * PAGE_SIZE;
1516 #else /* defined(__x86_64__) */
1517 	virtual_avail += PAGE_SIZE; pte++;
1518 	avail_start += PAGE_SIZE;
1519 	/* pentium f00f bug stuff */
1520 	pentium_idt_vaddr = virtual_avail;		/* don't need pte */
1521 	virtual_avail += PAGE_SIZE; pte++;
1522 #endif /* defined(__x86_64__) */
1523 #endif /* XEN */
1524 
1525 #ifdef _LP64
1526 	/*
1527 	 * Grab a page below 4G for things that need it (i.e.
1528 	 * having an initial %cr3 for the MP trampoline).
1529 	 */
1530 	lo32_vaddr = virtual_avail;
1531 	virtual_avail += PAGE_SIZE; pte++;
1532 	lo32_paddr = avail_start;
1533 	avail_start += PAGE_SIZE;
1534 #endif
1535 
1536 	/*
1537 	 * now we reserve some VM for mapping pages when doing a crash dump
1538 	 */
1539 
1540 	virtual_avail = reserve_dumppages(virtual_avail);
1541 
1542 	/*
1543 	 * init the static-global locks and global lists.
1544 	 *
1545 	 * => pventry::pvh_lock (initialized elsewhere) must also be
1546 	 *      a spin lock, again at IPL_VM to prevent deadlock, and
1547 	 *	again is never taken from interrupt context.
1548 	 */
1549 
1550 	mutex_init(&pmaps_lock, MUTEX_DEFAULT, IPL_NONE);
1551 	LIST_INIT(&pmaps);
1552 	pmap_cpu_init_early(curcpu());
1553 
1554 	/*
1555 	 * initialize caches.
1556 	 */
1557 
1558 	pool_cache_bootstrap(&pmap_cache, sizeof(struct pmap), 0, 0, 0,
1559 	    "pmappl", NULL, IPL_NONE, NULL, NULL, NULL);
1560 #ifdef PAE
1561 	pool_cache_bootstrap(&pmap_pdp_cache, PAGE_SIZE * PDP_SIZE, 0, 0, 0,
1562 	    "pdppl", &pmap_pdp_allocator, IPL_NONE,
1563 	    pmap_pdp_ctor, pmap_pdp_dtor, NULL);
1564 #else /* PAE */
1565 	pool_cache_bootstrap(&pmap_pdp_cache, PAGE_SIZE, 0, 0, 0,
1566 	    "pdppl", NULL, IPL_NONE, pmap_pdp_ctor, pmap_pdp_dtor, NULL);
1567 #endif /* PAE */
1568 	pool_cache_bootstrap(&pmap_pv_cache, sizeof(struct pv_entry), 0, 0,
1569 	    PR_LARGECACHE, "pvpl", &pool_allocator_meta, IPL_NONE, NULL,
1570 	    NULL, NULL);
1571 
1572 	/*
1573 	 * ensure the TLB is sync'd with reality by flushing it...
1574 	 */
1575 
1576 	tlbflush();
1577 
1578 	/*
1579 	 * calculate pmap_maxkvaddr from nkptp[].
1580 	 */
1581 
1582 	kva = VM_MIN_KERNEL_ADDRESS;
1583 	for (i = PTP_LEVELS - 1; i >= 1; i--) {
1584 		kva += nkptp[i] * nbpd[i];
1585 	}
1586 	pmap_maxkvaddr = kva;
1587 }
1588 
1589 #if defined(__x86_64__)
1590 /*
1591  * Pre-allocate PTPs for low memory, so that 1:1 mappings for various
1592  * trampoline code can be entered.
1593  */
1594 void
1595 pmap_prealloc_lowmem_ptps(void)
1596 {
1597 #ifdef XEN
1598 	int level;
1599 	paddr_t newp;
1600 	paddr_t pdes_pa;
1601 
1602 	pdes_pa = pmap_pdirpa(pmap_kernel(), 0);
1603 	level = PTP_LEVELS;
1604 	for (;;) {
1605 		newp = avail_start;
1606 		avail_start += PAGE_SIZE;
1607 		HYPERVISOR_update_va_mapping ((vaddr_t)early_zerop,
1608 		    xpmap_ptom_masked(newp) | PG_u | PG_V | PG_RW, UVMF_INVLPG);
1609 		memset((void *)early_zerop, 0, PAGE_SIZE);
1610 		/* Mark R/O before installing */
1611 		HYPERVISOR_update_va_mapping ((vaddr_t)early_zerop,
1612 		    xpmap_ptom_masked(newp) | PG_u | PG_V, UVMF_INVLPG);
1613 		if (newp < (NKL2_KIMG_ENTRIES * NBPD_L2))
1614 			HYPERVISOR_update_va_mapping (newp + KERNBASE,
1615 			    xpmap_ptom_masked(newp) | PG_u | PG_V, UVMF_INVLPG);
1616 		xpq_queue_pte_update (
1617 			xpmap_ptom_masked(pdes_pa)
1618 			+ (pl_i(0, level) * sizeof (pd_entry_t)),
1619 			xpmap_ptom_masked(newp) | PG_RW | PG_u | PG_V);
1620 		level--;
1621 		if (level <= 1)
1622 			break;
1623 		pdes_pa = newp;
1624 	}
1625 #else /* XEN */
1626 	pd_entry_t *pdes;
1627 	int level;
1628 	paddr_t newp;
1629 
1630 	pdes = pmap_kernel()->pm_pdir;
1631 	level = PTP_LEVELS;
1632 	for (;;) {
1633 		newp = avail_start;
1634 		avail_start += PAGE_SIZE;
1635 		*early_zero_pte = (newp & PG_FRAME) | PG_V | PG_RW;
1636 		pmap_update_pg((vaddr_t)early_zerop);
1637 		memset(early_zerop, 0, PAGE_SIZE);
1638 		pdes[pl_i(0, level)] = (newp & PG_FRAME) | PG_V | PG_RW;
1639 		level--;
1640 		if (level <= 1)
1641 			break;
1642 		pdes = normal_pdes[level - 2];
1643 	}
1644 #endif /* XEN */
1645 }
1646 #endif /* defined(__x86_64__) */
1647 
1648 /*
1649  * pmap_init: called from uvm_init, our job is to get the pmap
1650  * system ready to manage mappings...
1651  */
1652 
1653 void
1654 pmap_init(void)
1655 {
1656 	int i;
1657 
1658 	for (i = 0; i < PV_HASH_SIZE; i++) {
1659 		SLIST_INIT(&pv_hash_heads[i].hh_list);
1660 	}
1661 	for (i = 0; i < PV_HASH_LOCK_CNT; i++) {
1662 		mutex_init(&pv_hash_locks[i].lock, MUTEX_NODEBUG, IPL_VM);
1663 	}
1664 
1665 	/*
1666 	 * done: pmap module is up (and ready for business)
1667 	 */
1668 
1669 	pmap_initialized = true;
1670 }
1671 
1672 /*
1673  * pmap_cpu_init_early: perform early per-CPU initialization.
1674  */
1675 
1676 void
1677 pmap_cpu_init_early(struct cpu_info *ci)
1678 {
1679 	struct pmap_cpu *pc;
1680 	static uint8_t pmap_cpu_alloc;
1681 
1682 	pc = &pmap_cpu[pmap_cpu_alloc++].pc;
1683 	ci->ci_pmap_cpu = pc;
1684 }
1685 
1686 /*
1687  * pmap_cpu_init_late: perform late per-CPU initialization.
1688  */
1689 
1690 void
1691 pmap_cpu_init_late(struct cpu_info *ci)
1692 {
1693 
1694 	if (ci == &cpu_info_primary) {
1695 		evcnt_attach_dynamic(&pmap_tlb_evcnt, EVCNT_TYPE_INTR,
1696 		    NULL, "global", "TLB IPI");
1697 		evcnt_attach_dynamic(&pmap_iobmp_evcnt, EVCNT_TYPE_MISC,
1698 		    NULL, "x86", "io bitmap copy");
1699 		evcnt_attach_dynamic(&pmap_ldt_evcnt, EVCNT_TYPE_MISC,
1700 		    NULL, "x86", "ldt sync");
1701 	}
1702 
1703 	evcnt_attach_dynamic(&ci->ci_tlb_evcnt, EVCNT_TYPE_MISC,
1704 	    NULL, device_xname(ci->ci_dev), "TLB IPI");
1705 
1706 #ifdef PAE
1707 	int ret;
1708 	struct pglist pg;
1709 	struct vm_page *vmap;
1710 
1711 	/* The BP has already its own L3 page allocated in locore.S. */
1712 	if (ci == &cpu_info_primary)
1713 		return;
1714 
1715 	/*
1716 	 * Allocate a page for the per-CPU L3 PD. cr3 being 32 bits, PA musts
1717 	 * resides below the 4GB boundary.
1718 	 */
1719 	ret = uvm_pglistalloc(PAGE_SIZE, 0, 0x100000000ULL, 32, 0, &pg, 1, 0);
1720 	vmap = TAILQ_FIRST(&pg);
1721 
1722 	if (ret != 0 || vmap == NULL)
1723 		panic("%s: failed to allocate L3 pglist for CPU %d (ret %d)\n",
1724 			__func__, cpu_index(ci), ret);
1725 
1726 	ci->ci_pae_l3_pdirpa = vmap->phys_addr;
1727 
1728 	ci->ci_pae_l3_pdir = (paddr_t *)uvm_km_alloc(kernel_map, PAGE_SIZE, 0,
1729 		UVM_KMF_VAONLY | UVM_KMF_NOWAIT);
1730 	if (ci->ci_pae_l3_pdir == NULL)
1731 		panic("%s: failed to allocate L3 PD for CPU %d\n",
1732 			__func__, cpu_index(ci));
1733 
1734 	pmap_kenter_pa((vaddr_t)ci->ci_pae_l3_pdir, ci->ci_pae_l3_pdirpa,
1735 		VM_PROT_READ | VM_PROT_WRITE, 0);
1736 
1737 	pmap_update(pmap_kernel());
1738 #endif
1739 }
1740 
1741 /*
1742  * p v _ e n t r y   f u n c t i o n s
1743  */
1744 
1745 /*
1746  * pmap_free_pvs: free a list of pv_entrys
1747  */
1748 
1749 static void
1750 pmap_free_pvs(struct pv_entry *pve)
1751 {
1752 	struct pv_entry *next;
1753 
1754 	for ( /* null */ ; pve != NULL ; pve = next) {
1755 		next = pve->pve_next;
1756 		pool_cache_put(&pmap_pv_cache, pve);
1757 	}
1758 }
1759 
1760 /*
1761  * main pv_entry manipulation functions:
1762  *   pmap_enter_pv: enter a mapping onto a pv_head list
1763  *   pmap_remove_pv: remove a mapping from a pv_head list
1764  *
1765  * NOTE: Both pmap_enter_pv and pmap_remove_pv expect the caller to lock
1766  *       the pvh before calling
1767  */
1768 
1769 /*
1770  * insert_pv: a helper of pmap_enter_pv
1771  */
1772 
1773 static void
1774 insert_pv(struct pmap_page *pp, struct pv_entry *pve)
1775 {
1776 	struct pv_hash_head *hh;
1777 	kmutex_t *lock;
1778 	u_int hash;
1779 
1780 	KASSERT(pp_locked(pp));
1781 
1782 	hash = pvhash_hash(pve->pve_pte.pte_ptp, pve->pve_pte.pte_va);
1783 	lock = pvhash_lock(hash);
1784 	hh = pvhash_head(hash);
1785 	mutex_spin_enter(lock);
1786 	SLIST_INSERT_HEAD(&hh->hh_list, pve, pve_hash);
1787 	mutex_spin_exit(lock);
1788 
1789 	LIST_INSERT_HEAD(&pp->pp_head.pvh_list, pve, pve_list);
1790 }
1791 
1792 /*
1793  * pmap_enter_pv: enter a mapping onto a pv_head lst
1794  *
1795  * => caller should have the pp_lock locked
1796  * => caller should adjust ptp's wire_count before calling
1797  */
1798 
1799 static struct pv_entry *
1800 pmap_enter_pv(struct pmap_page *pp,
1801 	      struct pv_entry *pve,	/* preallocated pve for us to use */
1802 	      struct pv_entry **sparepve,
1803 	      struct vm_page *ptp,
1804 	      vaddr_t va)
1805 {
1806 
1807 	KASSERT(ptp == NULL || ptp->wire_count >= 2);
1808 	KASSERT(ptp == NULL || ptp->uobject != NULL);
1809 	KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset);
1810 	KASSERT(pp_locked(pp));
1811 
1812 	if ((pp->pp_flags & PP_EMBEDDED) == 0) {
1813 		if (LIST_EMPTY(&pp->pp_head.pvh_list)) {
1814 			pp->pp_flags |= PP_EMBEDDED;
1815 			pp->pp_pte.pte_ptp = ptp;
1816 			pp->pp_pte.pte_va = va;
1817 
1818 			return pve;
1819 		}
1820 	} else {
1821 		struct pv_entry *pve2;
1822 
1823 		pve2 = *sparepve;
1824 		*sparepve = NULL;
1825 
1826 		pve2->pve_pte = pp->pp_pte;
1827 		pp->pp_flags &= ~PP_EMBEDDED;
1828 		LIST_INIT(&pp->pp_head.pvh_list);
1829 		insert_pv(pp, pve2);
1830 	}
1831 
1832 	pve->pve_pte.pte_ptp = ptp;
1833 	pve->pve_pte.pte_va = va;
1834 	insert_pv(pp, pve);
1835 
1836 	return NULL;
1837 }
1838 
1839 /*
1840  * pmap_remove_pv: try to remove a mapping from a pv_list
1841  *
1842  * => caller should hold pp_lock [so that attrs can be adjusted]
1843  * => caller should adjust ptp's wire_count and free PTP if needed
1844  * => we return the removed pve
1845  */
1846 
1847 static struct pv_entry *
1848 pmap_remove_pv(struct pmap_page *pp, struct vm_page *ptp, vaddr_t va)
1849 {
1850 	struct pv_hash_head *hh;
1851 	struct pv_entry *pve;
1852 	kmutex_t *lock;
1853 	u_int hash;
1854 
1855 	KASSERT(ptp == NULL || ptp->uobject != NULL);
1856 	KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset);
1857 	KASSERT(pp_locked(pp));
1858 
1859 	if ((pp->pp_flags & PP_EMBEDDED) != 0) {
1860 		KASSERT(pp->pp_pte.pte_ptp == ptp);
1861 		KASSERT(pp->pp_pte.pte_va == va);
1862 
1863 		pp->pp_flags &= ~PP_EMBEDDED;
1864 		LIST_INIT(&pp->pp_head.pvh_list);
1865 
1866 		return NULL;
1867 	}
1868 
1869 	hash = pvhash_hash(ptp, va);
1870 	lock = pvhash_lock(hash);
1871 	hh = pvhash_head(hash);
1872 	mutex_spin_enter(lock);
1873 	pve = pvhash_remove(hh, ptp, va);
1874 	mutex_spin_exit(lock);
1875 
1876 	LIST_REMOVE(pve, pve_list);
1877 
1878 	return pve;
1879 }
1880 
1881 /*
1882  * p t p   f u n c t i o n s
1883  */
1884 
1885 static inline struct vm_page *
1886 pmap_find_ptp(struct pmap *pmap, vaddr_t va, paddr_t pa, int level)
1887 {
1888 	int lidx = level - 1;
1889 	struct vm_page *pg;
1890 
1891 	KASSERT(mutex_owned(&pmap->pm_lock));
1892 
1893 	if (pa != (paddr_t)-1 && pmap->pm_ptphint[lidx] &&
1894 	    pa == VM_PAGE_TO_PHYS(pmap->pm_ptphint[lidx])) {
1895 		return (pmap->pm_ptphint[lidx]);
1896 	}
1897 	PMAP_SUBOBJ_LOCK(pmap, lidx);
1898 	pg = uvm_pagelookup(&pmap->pm_obj[lidx], ptp_va2o(va, level));
1899 	PMAP_SUBOBJ_UNLOCK(pmap, lidx);
1900 
1901 	KASSERT(pg == NULL || pg->wire_count >= 1);
1902 	return pg;
1903 }
1904 
1905 static inline void
1906 pmap_freepage(struct pmap *pmap, struct vm_page *ptp, int level)
1907 {
1908 	lwp_t *l;
1909 	int lidx;
1910 	struct uvm_object *obj;
1911 
1912 	KASSERT(ptp->wire_count == 1);
1913 
1914 	lidx = level - 1;
1915 
1916 	obj = &pmap->pm_obj[lidx];
1917 	pmap_stats_update(pmap, -1, 0);
1918 	if (lidx != 0)
1919 		mutex_enter(&obj->vmobjlock);
1920 	if (pmap->pm_ptphint[lidx] == ptp)
1921 		pmap->pm_ptphint[lidx] = TAILQ_FIRST(&obj->memq);
1922 	ptp->wire_count = 0;
1923 	uvm_pagerealloc(ptp, NULL, 0);
1924 	l = curlwp;
1925 	KASSERT((l->l_pflag & LP_INTR) == 0);
1926 	VM_PAGE_TO_PP(ptp)->pp_link = l->l_md.md_gc_ptp;
1927 	l->l_md.md_gc_ptp = ptp;
1928 	if (lidx != 0)
1929 		mutex_exit(&obj->vmobjlock);
1930 }
1931 
1932 static void
1933 pmap_free_ptp(struct pmap *pmap, struct vm_page *ptp, vaddr_t va,
1934 	      pt_entry_t *ptes, pd_entry_t * const *pdes)
1935 {
1936 	unsigned long index;
1937 	int level;
1938 	vaddr_t invaladdr;
1939 #ifdef MULTIPROCESSOR
1940 	vaddr_t invaladdr2;
1941 #endif
1942 	pd_entry_t opde;
1943 	struct pmap *curpmap = vm_map_pmap(&curlwp->l_proc->p_vmspace->vm_map);
1944 
1945 	KASSERT(pmap != pmap_kernel());
1946 	KASSERT(mutex_owned(&pmap->pm_lock));
1947 	KASSERT(kpreempt_disabled());
1948 
1949 	level = 1;
1950 	do {
1951 		index = pl_i(va, level + 1);
1952 		opde = pmap_pte_testset(&pdes[level - 1][index], 0);
1953 #if defined(XEN) && defined(__x86_64__)
1954 		/*
1955 		 * If ptp is a L3 currently mapped in kernel space,
1956 		 * clear it before freeing
1957 		 */
1958 		if (pmap_pdirpa(pmap, 0) == curcpu()->ci_xen_current_user_pgd
1959 		    && level == PTP_LEVELS - 1)
1960 			pmap_pte_set(&pmap_kernel()->pm_pdir[index], 0);
1961 #endif /* XEN && __x86_64__ */
1962 		pmap_freepage(pmap, ptp, level);
1963 		invaladdr = level == 1 ? (vaddr_t)ptes :
1964 		    (vaddr_t)pdes[level - 2];
1965 		pmap_tlb_shootdown(curpmap, invaladdr + index * PAGE_SIZE,
1966 		    0, opde);
1967 #if defined(MULTIPROCESSOR)
1968 		invaladdr2 = level == 1 ? (vaddr_t)PTE_BASE :
1969 		    (vaddr_t)normal_pdes[level - 2];
1970 		if (pmap != curpmap || invaladdr != invaladdr2) {
1971 			pmap_tlb_shootdown(pmap, invaladdr2 + index * PAGE_SIZE,
1972 			    0, opde);
1973 		}
1974 #endif
1975 		if (level < PTP_LEVELS - 1) {
1976 			ptp = pmap_find_ptp(pmap, va, (paddr_t)-1, level + 1);
1977 			ptp->wire_count--;
1978 			if (ptp->wire_count > 1)
1979 				break;
1980 		}
1981 	} while (++level < PTP_LEVELS);
1982 	pmap_pte_flush();
1983 }
1984 
1985 /*
1986  * pmap_get_ptp: get a PTP (if there isn't one, allocate a new one)
1987  *
1988  * => pmap should NOT be pmap_kernel()
1989  * => pmap should be locked
1990  * => preemption should be disabled
1991  */
1992 
1993 static struct vm_page *
1994 pmap_get_ptp(struct pmap *pmap, vaddr_t va, pd_entry_t * const *pdes)
1995 {
1996 	struct vm_page *ptp, *pptp;
1997 	int i;
1998 	unsigned long index;
1999 	pd_entry_t *pva;
2000 	paddr_t ppa, pa;
2001 	struct uvm_object *obj;
2002 
2003 	KASSERT(pmap != pmap_kernel());
2004 	KASSERT(mutex_owned(&pmap->pm_lock));
2005 	KASSERT(kpreempt_disabled());
2006 
2007 	ptp = NULL;
2008 	pa = (paddr_t)-1;
2009 
2010 	/*
2011 	 * Loop through all page table levels seeing if we need to
2012 	 * add a new page to that level.
2013 	 */
2014 	for (i = PTP_LEVELS; i > 1; i--) {
2015 		/*
2016 		 * Save values from previous round.
2017 		 */
2018 		pptp = ptp;
2019 		ppa = pa;
2020 
2021 		index = pl_i(va, i);
2022 		pva = pdes[i - 2];
2023 
2024 		if (pmap_valid_entry(pva[index])) {
2025 			ppa = pmap_pte2pa(pva[index]);
2026 			ptp = NULL;
2027 			continue;
2028 		}
2029 
2030 		obj = &pmap->pm_obj[i-2];
2031 		PMAP_SUBOBJ_LOCK(pmap, i - 2);
2032 		ptp = uvm_pagealloc(obj, ptp_va2o(va, i - 1), NULL,
2033 		    UVM_PGA_USERESERVE|UVM_PGA_ZERO);
2034 		PMAP_SUBOBJ_UNLOCK(pmap, i - 2);
2035 
2036 		if (ptp == NULL)
2037 			return NULL;
2038 
2039 		ptp->flags &= ~PG_BUSY; /* never busy */
2040 		ptp->wire_count = 1;
2041 		pmap->pm_ptphint[i - 2] = ptp;
2042 		pa = VM_PAGE_TO_PHYS(ptp);
2043 		pmap_pte_set(&pva[index], (pd_entry_t)
2044 		        (pmap_pa2pte(pa) | PG_u | PG_RW | PG_V));
2045 #if defined(XEN) && defined(__x86_64__)
2046 		/*
2047 		 * In Xen we must enter the mapping in kernel map too
2048 		 * if pmap is curmap and modifying top level (PGD)
2049 		 */
2050 		if(i == PTP_LEVELS && pmap != pmap_kernel()) {
2051 		        pmap_pte_set(&pmap_kernel()->pm_pdir[index],
2052 		                (pd_entry_t) (pmap_pa2pte(pa)
2053 		                        | PG_u | PG_RW | PG_V));
2054 		}
2055 #endif /* XEN && __x86_64__ */
2056 		pmap_pte_flush();
2057 		pmap_stats_update(pmap, 1, 0);
2058 		/*
2059 		 * If we're not in the top level, increase the
2060 		 * wire count of the parent page.
2061 		 */
2062 		if (i < PTP_LEVELS) {
2063 			if (pptp == NULL)
2064 				pptp = pmap_find_ptp(pmap, va, ppa, i);
2065 #ifdef DIAGNOSTIC
2066 			if (pptp == NULL)
2067 				panic("pde page disappeared");
2068 #endif
2069 			pptp->wire_count++;
2070 		}
2071 	}
2072 
2073 	/*
2074 	 * ptp is not NULL if we just allocated a new ptp. If it's
2075 	 * still NULL, we must look up the existing one.
2076 	 */
2077 	if (ptp == NULL) {
2078 		ptp = pmap_find_ptp(pmap, va, ppa, 1);
2079 #ifdef DIAGNOSTIC
2080 		if (ptp == NULL) {
2081 			printf("va %" PRIxVADDR " ppa %" PRIxPADDR "\n",
2082 			    va, ppa);
2083 			panic("pmap_get_ptp: unmanaged user PTP");
2084 		}
2085 #endif
2086 	}
2087 
2088 	pmap->pm_ptphint[0] = ptp;
2089 	return(ptp);
2090 }
2091 
2092 /*
2093  * p m a p  l i f e c y c l e   f u n c t i o n s
2094  */
2095 
2096 /*
2097  * pmap_pdp_ctor: constructor for the PDP cache.
2098  */
2099 
2100 int
2101 pmap_pdp_ctor(void *arg, void *v, int flags)
2102 {
2103 	pd_entry_t *pdir = v;
2104 	paddr_t pdirpa = 0;	/* XXX: GCC */
2105 	vaddr_t object;
2106 	int i;
2107 
2108 #if !defined(XEN) || !defined(__x86_64__)
2109 	int npde;
2110 #endif
2111 #ifdef XEN
2112 	int s;
2113 #endif
2114 
2115 	/*
2116 	 * NOTE: The `pmap_lock' is held when the PDP is allocated.
2117 	 */
2118 
2119 #if defined(XEN) && defined(__x86_64__)
2120 	/* fetch the physical address of the page directory. */
2121 	(void) pmap_extract(pmap_kernel(), (vaddr_t) pdir, &pdirpa);
2122 
2123 	/* zero init area */
2124 	memset (pdir, 0, PAGE_SIZE); /* Xen wants a clean page */
2125 	/*
2126 	 * this pdir will NEVER be active in kernel mode
2127 	 * so mark recursive entry invalid
2128 	 */
2129 	pdir[PDIR_SLOT_PTE] = pmap_pa2pte(pdirpa) | PG_u;
2130 	/*
2131 	 * PDP constructed this way won't be for kernel,
2132 	 * hence we don't put kernel mappings on Xen.
2133 	 * But we need to make pmap_create() happy, so put a dummy (without
2134 	 * PG_V) value at the right place.
2135 	 */
2136 	pdir[PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1] - 1] =
2137 	     (pd_entry_t)-1 & PG_FRAME;
2138 #else /* XEN && __x86_64__*/
2139 	/* zero init area */
2140 	memset(pdir, 0, PDIR_SLOT_PTE * sizeof(pd_entry_t));
2141 
2142 	object = (vaddr_t)v;
2143 	for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) {
2144 		/* fetch the physical address of the page directory. */
2145 		(void) pmap_extract(pmap_kernel(), object, &pdirpa);
2146 		/* put in recursive PDE to map the PTEs */
2147 		pdir[PDIR_SLOT_PTE + i] = pmap_pa2pte(pdirpa) | PG_V;
2148 #ifndef XEN
2149 		pdir[PDIR_SLOT_PTE + i] |= PG_KW;
2150 #endif
2151 	}
2152 
2153 	/* copy kernel's PDE */
2154 	npde = nkptp[PTP_LEVELS - 1];
2155 
2156 	memcpy(&pdir[PDIR_SLOT_KERN], &PDP_BASE[PDIR_SLOT_KERN],
2157 	    npde * sizeof(pd_entry_t));
2158 
2159 	/* zero the rest */
2160 	memset(&pdir[PDIR_SLOT_KERN + npde], 0,
2161 	    (NTOPLEVEL_PDES - (PDIR_SLOT_KERN + npde)) * sizeof(pd_entry_t));
2162 
2163 	if (VM_MIN_KERNEL_ADDRESS != KERNBASE) {
2164 		int idx = pl_i(KERNBASE, PTP_LEVELS);
2165 
2166 		pdir[idx] = PDP_BASE[idx];
2167 	}
2168 #endif /* XEN  && __x86_64__*/
2169 #ifdef XEN
2170 	s = splvm();
2171 	object = (vaddr_t)v;
2172 	for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) {
2173 		(void) pmap_extract(pmap_kernel(), object, &pdirpa);
2174 		/* remap this page RO */
2175 		pmap_kenter_pa(object, pdirpa, VM_PROT_READ, 0);
2176 		pmap_update(pmap_kernel());
2177 		/*
2178 		 * pin as L2/L4 page, we have to do the page with the
2179 		 * PDIR_SLOT_PTE entries last
2180 		 */
2181 #ifdef PAE
2182 		if (i == l2tol3(PDIR_SLOT_PTE))
2183 			continue;
2184 #endif
2185 
2186 #ifdef __x86_64__
2187 		xpq_queue_pin_l4_table(xpmap_ptom_masked(pdirpa));
2188 #else
2189 		xpq_queue_pin_l2_table(xpmap_ptom_masked(pdirpa));
2190 #endif
2191 	}
2192 #ifdef PAE
2193 	object = ((vaddr_t)pdir) + PAGE_SIZE  * l2tol3(PDIR_SLOT_PTE);
2194 	(void)pmap_extract(pmap_kernel(), object, &pdirpa);
2195 	xpq_queue_pin_l2_table(xpmap_ptom_masked(pdirpa));
2196 #endif
2197 	splx(s);
2198 #endif /* XEN */
2199 
2200 	return (0);
2201 }
2202 
2203 /*
2204  * pmap_pdp_dtor: destructor for the PDP cache.
2205  */
2206 
2207 void
2208 pmap_pdp_dtor(void *arg, void *v)
2209 {
2210 #ifdef XEN
2211 	paddr_t pdirpa = 0;	/* XXX: GCC */
2212 	vaddr_t object = (vaddr_t)v;
2213 	int i;
2214 	int s = splvm();
2215 	pt_entry_t *pte;
2216 
2217 	for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) {
2218 		/* fetch the physical address of the page directory. */
2219 		(void) pmap_extract(pmap_kernel(), object, &pdirpa);
2220 		/* unpin page table */
2221 		xpq_queue_unpin_table(xpmap_ptom_masked(pdirpa));
2222 	}
2223 	object = (vaddr_t)v;
2224 	for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) {
2225 		/* Set page RW again */
2226 		pte = kvtopte(object);
2227 		xpq_queue_pte_update(xpmap_ptetomach(pte), *pte | PG_RW);
2228 		xpq_queue_invlpg((vaddr_t)object);
2229 	}
2230 	splx(s);
2231 #endif  /* XEN */
2232 }
2233 
2234 #ifdef PAE
2235 
2236 /* pmap_pdp_alloc: Allocate a page for the pdp memory pool. */
2237 
2238 void *
2239 pmap_pdp_alloc(struct pool *pp, int flags)
2240 {
2241 	return (void *)uvm_km_alloc(kernel_map,
2242 	    PAGE_SIZE * PDP_SIZE, PAGE_SIZE * PDP_SIZE,
2243 	    ((flags & PR_WAITOK) ? 0 : UVM_KMF_NOWAIT | UVM_KMF_TRYLOCK)
2244 	    | UVM_KMF_WIRED);
2245 }
2246 
2247 /*
2248  * pmap_pdp_free: free a PDP
2249  */
2250 
2251 void
2252 pmap_pdp_free(struct pool *pp, void *v)
2253 {
2254 	uvm_km_free(kernel_map, (vaddr_t)v, PAGE_SIZE * PDP_SIZE,
2255 	    UVM_KMF_WIRED);
2256 }
2257 #endif /* PAE */
2258 
2259 /*
2260  * pmap_create: create a pmap
2261  *
2262  * => note: old pmap interface took a "size" args which allowed for
2263  *	the creation of "software only" pmaps (not in bsd).
2264  */
2265 
2266 struct pmap *
2267 pmap_create(void)
2268 {
2269 	struct pmap *pmap;
2270 	int i;
2271 
2272 	pmap = pool_cache_get(&pmap_cache, PR_WAITOK);
2273 
2274 	/* init uvm_object */
2275 	for (i = 0; i < PTP_LEVELS - 1; i++) {
2276 		UVM_OBJ_INIT(&pmap->pm_obj[i], NULL, 1);
2277 		pmap->pm_ptphint[i] = NULL;
2278 	}
2279 	pmap->pm_stats.wired_count = 0;
2280 	/* count the PDP allocd below */
2281 	pmap->pm_stats.resident_count = PDP_SIZE;
2282 #if !defined(__x86_64__)
2283 	pmap->pm_hiexec = 0;
2284 #endif /* !defined(__x86_64__) */
2285 	pmap->pm_flags = 0;
2286 	pmap->pm_cpus = 0;
2287 	pmap->pm_kernel_cpus = 0;
2288 
2289 	/* init the LDT */
2290 	pmap->pm_ldt = NULL;
2291 	pmap->pm_ldt_len = 0;
2292 	pmap->pm_ldt_sel = GSYSSEL(GLDT_SEL, SEL_KPL);
2293 
2294 	/* allocate PDP */
2295  try_again:
2296 	pmap->pm_pdir = pool_cache_get(&pmap_pdp_cache, PR_WAITOK);
2297 
2298 	mutex_enter(&pmaps_lock);
2299 
2300 	if (pmap->pm_pdir[PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1] - 1] == 0) {
2301 		mutex_exit(&pmaps_lock);
2302 		pool_cache_destruct_object(&pmap_pdp_cache, pmap->pm_pdir);
2303 		goto try_again;
2304 	}
2305 
2306 	for (i = 0; i < PDP_SIZE; i++)
2307 		pmap->pm_pdirpa[i] =
2308 		    pmap_pte2pa(pmap->pm_pdir[PDIR_SLOT_PTE + i]);
2309 
2310 	LIST_INSERT_HEAD(&pmaps, pmap, pm_list);
2311 
2312 	mutex_exit(&pmaps_lock);
2313 
2314 	return (pmap);
2315 }
2316 
2317 /*
2318  * pmap_destroy: drop reference count on pmap.   free pmap if
2319  *	reference count goes to zero.
2320  */
2321 
2322 void
2323 pmap_destroy(struct pmap *pmap)
2324 {
2325 	int i;
2326 #ifdef DIAGNOSTIC
2327 	struct cpu_info *ci;
2328 	CPU_INFO_ITERATOR cii;
2329 #endif /* DIAGNOSTIC */
2330 
2331 	/*
2332 	 * if we have torn down this pmap, process deferred frees and
2333 	 * invalidations now.
2334 	 */
2335 	if (__predict_false(curlwp->l_md.md_gc_pmap == pmap)) {
2336 		pmap_update(pmap);
2337 	}
2338 
2339 	/*
2340 	 * drop reference count
2341 	 */
2342 
2343 	if (atomic_dec_uint_nv(&pmap->pm_obj[0].uo_refs) > 0) {
2344 		return;
2345 	}
2346 
2347 #ifdef DIAGNOSTIC
2348 	for (CPU_INFO_FOREACH(cii, ci))
2349 		if (ci->ci_pmap == pmap)
2350 			panic("destroying pmap being used");
2351 #endif /* DIAGNOSTIC */
2352 
2353 	/*
2354 	 * reference count is zero, free pmap resources and then free pmap.
2355 	 */
2356 #ifdef XEN
2357 	/*
2358 	 * Xen lazy APDP handling:
2359 	 * clear APDP_PDE if pmap is the currently mapped
2360 	 */
2361 	if (xpmap_ptom_masked(pmap_pdirpa(pmap, 0)) == (*APDP_PDE & PG_FRAME)) {
2362 		kpreempt_disable();
2363 		pmap_unmap_apdp();
2364 		pmap_pte_flush();
2365 	        pmap_apte_flush(pmap_kernel());
2366 	        kpreempt_enable();
2367 	}
2368 #endif
2369 
2370 	/*
2371 	 * remove it from global list of pmaps
2372 	 */
2373 
2374 	mutex_enter(&pmaps_lock);
2375 	LIST_REMOVE(pmap, pm_list);
2376 	mutex_exit(&pmaps_lock);
2377 
2378 	/*
2379 	 * destroyed pmap shouldn't have remaining PTPs
2380 	 */
2381 
2382 	for (i = 0; i < PTP_LEVELS - 1; i++) {
2383 		KASSERT(pmap->pm_obj[i].uo_npages == 0);
2384 		KASSERT(TAILQ_EMPTY(&pmap->pm_obj[i].memq));
2385 	}
2386 
2387 	/*
2388 	 * MULTIPROCESSOR -- no need to flush out of other processors'
2389 	 * APTE space because we do that in pmap_unmap_ptes().
2390 	 */
2391 	pool_cache_put(&pmap_pdp_cache, pmap->pm_pdir);
2392 
2393 #ifdef USER_LDT
2394 	if (pmap->pm_ldt != NULL) {
2395 		/*
2396 		 * no need to switch the LDT; this address space is gone,
2397 		 * nothing is using it.
2398 		 *
2399 		 * No need to lock the pmap for ldt_free (or anything else),
2400 		 * we're the last one to use it.
2401 		 */
2402 		mutex_enter(&cpu_lock);
2403 		ldt_free(pmap->pm_ldt_sel);
2404 		mutex_exit(&cpu_lock);
2405 		uvm_km_free(kernel_map, (vaddr_t)pmap->pm_ldt,
2406 		    pmap->pm_ldt_len, UVM_KMF_WIRED);
2407 	}
2408 #endif
2409 
2410 	for (i = 0; i < PTP_LEVELS - 1; i++)
2411 		mutex_destroy(&pmap->pm_obj[i].vmobjlock);
2412 	pool_cache_put(&pmap_cache, pmap);
2413 }
2414 
2415 /*
2416  * pmap_remove_all: pmap is being torn down by the current thread.
2417  * avoid unnecessary invalidations.
2418  */
2419 
2420 void
2421 pmap_remove_all(struct pmap *pmap)
2422 {
2423 	lwp_t *l = curlwp;
2424 
2425 	KASSERT(l->l_md.md_gc_pmap == NULL);
2426 
2427 	l->l_md.md_gc_pmap = pmap;
2428 }
2429 
2430 #if defined(PMAP_FORK)
2431 /*
2432  * pmap_fork: perform any necessary data structure manipulation when
2433  * a VM space is forked.
2434  */
2435 
2436 void
2437 pmap_fork(struct pmap *pmap1, struct pmap *pmap2)
2438 {
2439 #ifdef USER_LDT
2440 	union descriptor *new_ldt;
2441 	size_t len;
2442 	int sel;
2443 
2444 	if (__predict_true(pmap1->pm_ldt == NULL)) {
2445 		return;
2446 	}
2447 
2448  retry:
2449 	if (pmap1->pm_ldt != NULL) {
2450 		len = pmap1->pm_ldt_len;
2451 		new_ldt = (union descriptor *)uvm_km_alloc(kernel_map, len, 0,
2452 		    UVM_KMF_WIRED);
2453 		mutex_enter(&cpu_lock);
2454 		sel = ldt_alloc(new_ldt, len);
2455 		if (sel == -1) {
2456 			mutex_exit(&cpu_lock);
2457 			uvm_km_free(kernel_map, (vaddr_t)new_ldt, len,
2458 			    UVM_KMF_WIRED);
2459 			printf("WARNING: pmap_fork: unable to allocate LDT\n");
2460 			return;
2461 		}
2462 	} else {
2463 		len = -1;
2464 		new_ldt = NULL;
2465 		sel = -1;
2466 		mutex_enter(&cpu_lock);
2467 	}
2468 
2469  	/* Copy the LDT, if necessary. */
2470  	if (pmap1->pm_ldt != NULL) {
2471 		if (len != pmap1->pm_ldt_len) {
2472 			if (len != -1) {
2473 				ldt_free(sel);
2474 				uvm_km_free(kernel_map, (vaddr_t)new_ldt,
2475 				    len, UVM_KMF_WIRED);
2476 			}
2477 			mutex_exit(&cpu_lock);
2478 			goto retry;
2479 		}
2480 
2481 		memcpy(new_ldt, pmap1->pm_ldt, len);
2482 		pmap2->pm_ldt = new_ldt;
2483 		pmap2->pm_ldt_len = pmap1->pm_ldt_len;
2484 		pmap2->pm_ldt_sel = sel;
2485 		len = -1;
2486 	}
2487 
2488 	if (len != -1) {
2489 		ldt_free(sel);
2490 		uvm_km_free(kernel_map, (vaddr_t)new_ldt, len,
2491 		    UVM_KMF_WIRED);
2492 	}
2493 	mutex_exit(&cpu_lock);
2494 #endif /* USER_LDT */
2495 }
2496 #endif /* PMAP_FORK */
2497 
2498 #ifdef USER_LDT
2499 
2500 /*
2501  * pmap_ldt_xcall: cross call used by pmap_ldt_sync.  if the named pmap
2502  * is active, reload LDTR.
2503  */
2504 static void
2505 pmap_ldt_xcall(void *arg1, void *arg2)
2506 {
2507 	struct pmap *pm;
2508 
2509 	kpreempt_disable();
2510 	pm = arg1;
2511 	if (curcpu()->ci_pmap == pm) {
2512 		lldt(pm->pm_ldt_sel);
2513 	}
2514 	kpreempt_enable();
2515 }
2516 
2517 /*
2518  * pmap_ldt_sync: LDT selector for the named pmap is changing.  swap
2519  * in the new selector on all CPUs.
2520  */
2521 void
2522 pmap_ldt_sync(struct pmap *pm)
2523 {
2524 	uint64_t where;
2525 
2526 	KASSERT(mutex_owned(&cpu_lock));
2527 
2528 	pmap_ldt_evcnt.ev_count++;
2529 	where = xc_broadcast(0, pmap_ldt_xcall, pm, NULL);
2530 	xc_wait(where);
2531 }
2532 
2533 /*
2534  * pmap_ldt_cleanup: if the pmap has a local LDT, deallocate it, and
2535  * restore the default.
2536  */
2537 
2538 void
2539 pmap_ldt_cleanup(struct lwp *l)
2540 {
2541 	pmap_t pmap = l->l_proc->p_vmspace->vm_map.pmap;
2542 	union descriptor *dp = NULL;
2543 	size_t len = 0;
2544 	int sel = -1;
2545 
2546 	if (__predict_true(pmap->pm_ldt == NULL)) {
2547 		return;
2548 	}
2549 
2550 	mutex_enter(&cpu_lock);
2551 	if (pmap->pm_ldt != NULL) {
2552 		sel = pmap->pm_ldt_sel;
2553 		dp = pmap->pm_ldt;
2554 		len = pmap->pm_ldt_len;
2555 		pmap->pm_ldt_sel = GSYSSEL(GLDT_SEL, SEL_KPL);
2556 		pmap->pm_ldt = NULL;
2557 		pmap->pm_ldt_len = 0;
2558 		pmap_ldt_sync(pmap);
2559 		ldt_free(sel);
2560 		uvm_km_free(kernel_map, (vaddr_t)dp, len, UVM_KMF_WIRED);
2561 	}
2562 	mutex_exit(&cpu_lock);
2563 }
2564 #endif /* USER_LDT */
2565 
2566 /*
2567  * pmap_activate: activate a process' pmap
2568  *
2569  * => must be called with kernel preemption disabled
2570  * => if lwp is the curlwp, then set ci_want_pmapload so that
2571  *    actual MMU context switch will be done by pmap_load() later
2572  */
2573 
2574 void
2575 pmap_activate(struct lwp *l)
2576 {
2577 	struct cpu_info *ci;
2578 	struct pmap *pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map);
2579 
2580 	KASSERT(kpreempt_disabled());
2581 
2582 	ci = curcpu();
2583 
2584 	if (l == ci->ci_curlwp) {
2585 		KASSERT(ci->ci_want_pmapload == 0);
2586 		KASSERT(ci->ci_tlbstate != TLBSTATE_VALID);
2587 #ifdef KSTACK_CHECK_DR0
2588 		/*
2589 		 * setup breakpoint on the top of stack
2590 		 */
2591 		if (l == &lwp0)
2592 			dr0(0, 0, 0, 0);
2593 		else
2594 			dr0(KSTACK_LOWEST_ADDR(l), 1, 3, 1);
2595 #endif
2596 
2597 		/*
2598 		 * no need to switch to kernel vmspace because
2599 		 * it's a subset of any vmspace.
2600 		 */
2601 
2602 		if (pmap == pmap_kernel()) {
2603 			ci->ci_want_pmapload = 0;
2604 			return;
2605 		}
2606 
2607 		ci->ci_want_pmapload = 1;
2608 	}
2609 }
2610 
2611 /*
2612  * pmap_reactivate: try to regain reference to the pmap.
2613  *
2614  * => must be called with kernel preemption disabled
2615  */
2616 
2617 static bool
2618 pmap_reactivate(struct pmap *pmap)
2619 {
2620 	struct cpu_info *ci;
2621 	uint32_t cpumask;
2622 	bool result;
2623 	uint32_t oldcpus;
2624 
2625 	ci = curcpu();
2626 	cpumask = ci->ci_cpumask;
2627 
2628 	KASSERT(kpreempt_disabled());
2629 #if defined(XEN) && defined(__x86_64__)
2630 	KASSERT(pmap_pdirpa(pmap, 0) == ci->ci_xen_current_user_pgd);
2631 #elif defined(PAE)
2632 	KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(ci->ci_pae_l3_pdir[0]));
2633 #elif !defined(XEN)
2634 	KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(rcr3()));
2635 #endif
2636 
2637 	/*
2638 	 * if we still have a lazy reference to this pmap,
2639 	 * we can assume that there was no tlb shootdown
2640 	 * for this pmap in the meantime.
2641 	 *
2642 	 * the order of events here is important as we must
2643 	 * synchronize with TLB shootdown interrupts.  declare
2644 	 * interest in invalidations (TLBSTATE_VALID) and then
2645 	 * check the cpumask, which the IPIs can change only
2646 	 * when the state is TLBSTATE_LAZY.
2647 	 */
2648 
2649 	ci->ci_tlbstate = TLBSTATE_VALID;
2650 	oldcpus = pmap->pm_cpus;
2651 	KASSERT((pmap->pm_kernel_cpus & cpumask) != 0);
2652 	if (oldcpus & cpumask) {
2653 		/* got it */
2654 		result = true;
2655 	} else {
2656 		/* must reload */
2657 		atomic_or_32(&pmap->pm_cpus, cpumask);
2658 		result = false;
2659 	}
2660 
2661 	return result;
2662 }
2663 
2664 /*
2665  * pmap_load: actually switch pmap.  (fill in %cr3 and LDT info)
2666  */
2667 
2668 void
2669 pmap_load(void)
2670 {
2671 	struct cpu_info *ci;
2672 	uint32_t cpumask;
2673 	struct pmap *pmap;
2674 	struct pmap *oldpmap;
2675 	struct lwp *l;
2676 	struct pcb *pcb;
2677 	uint64_t ncsw;
2678 
2679 	kpreempt_disable();
2680  retry:
2681 	ci = curcpu();
2682 	if (!ci->ci_want_pmapload) {
2683 		kpreempt_enable();
2684 		return;
2685 	}
2686 	cpumask = ci->ci_cpumask;
2687 	l = ci->ci_curlwp;
2688 	ncsw = l->l_ncsw;
2689 
2690 	/* should be able to take ipis. */
2691 	KASSERT(ci->ci_ilevel < IPL_HIGH);
2692 #ifdef XEN
2693 	/* XXX not yet KASSERT(x86_read_psl() != 0); */
2694 #else
2695 	KASSERT((x86_read_psl() & PSL_I) != 0);
2696 #endif
2697 
2698 	KASSERT(l != NULL);
2699 	pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map);
2700 	KASSERT(pmap != pmap_kernel());
2701 	oldpmap = ci->ci_pmap;
2702 	pcb = lwp_getpcb(l);
2703 
2704 	if (pmap == oldpmap) {
2705 		if (!pmap_reactivate(pmap)) {
2706 			u_int gen = uvm_emap_gen_return();
2707 
2708 			/*
2709 			 * pmap has been changed during deactivated.
2710 			 * our tlb may be stale.
2711 			 */
2712 
2713 			tlbflush();
2714 			uvm_emap_update(gen);
2715 		}
2716 
2717 		ci->ci_want_pmapload = 0;
2718 		kpreempt_enable();
2719 		return;
2720 	}
2721 
2722 	/*
2723 	 * grab a reference to the new pmap.
2724 	 */
2725 
2726 	pmap_reference(pmap);
2727 
2728 	/*
2729 	 * actually switch pmap.
2730 	 */
2731 
2732 	atomic_and_32(&oldpmap->pm_cpus, ~cpumask);
2733 	atomic_and_32(&oldpmap->pm_kernel_cpus, ~cpumask);
2734 
2735 #if defined(XEN) && defined(__x86_64__)
2736 	KASSERT(pmap_pdirpa(oldpmap, 0) == ci->ci_xen_current_user_pgd ||
2737 	    oldpmap == pmap_kernel());
2738 #elif defined(PAE)
2739 	KASSERT(pmap_pdirpa(oldpmap, 0) == pmap_pte2pa(ci->ci_pae_l3_pdir[0]));
2740 #elif !defined(XEN)
2741 	KASSERT(pmap_pdirpa(oldpmap, 0) == pmap_pte2pa(rcr3()));
2742 #endif
2743 	KASSERT((pmap->pm_cpus & cpumask) == 0);
2744 	KASSERT((pmap->pm_kernel_cpus & cpumask) == 0);
2745 
2746 	/*
2747 	 * mark the pmap in use by this processor.  again we must
2748 	 * synchronize with TLB shootdown interrupts, so set the
2749 	 * state VALID first, then register us for shootdown events
2750 	 * on this pmap.
2751 	 */
2752 
2753 	ci->ci_tlbstate = TLBSTATE_VALID;
2754 	atomic_or_32(&pmap->pm_cpus, cpumask);
2755 	atomic_or_32(&pmap->pm_kernel_cpus, cpumask);
2756 	ci->ci_pmap = pmap;
2757 
2758 	/*
2759 	 * update tss.  now that we have registered for invalidations
2760 	 * from other CPUs, we're good to load the page tables.
2761 	 */
2762 #ifdef PAE
2763 	pcb->pcb_cr3 = ci->ci_pae_l3_pdirpa;
2764 #else
2765 	pcb->pcb_cr3 = pmap_pdirpa(pmap, 0);
2766 #endif
2767 
2768 #ifdef i386
2769 #ifdef XEN
2770 	/*
2771 	 * clear APDP slot, in case it points to a page table that has
2772 	 * been freed
2773 	 */
2774 	if (*APDP_PDE) {
2775 		pmap_unmap_apdp();
2776 	}
2777 	/* lldt() does pmap_pte_flush() */
2778 #endif /* XEN */
2779 
2780 #ifndef XEN
2781 	ci->ci_tss.tss_ldt = pmap->pm_ldt_sel;
2782 	ci->ci_tss.tss_cr3 = pcb->pcb_cr3;
2783 #endif /* !XEN */
2784 #endif /* i386 */
2785 
2786 	lldt(pmap->pm_ldt_sel);
2787 
2788 	u_int gen = uvm_emap_gen_return();
2789 	cpu_load_pmap(pmap);
2790 	uvm_emap_update(gen);
2791 
2792 	ci->ci_want_pmapload = 0;
2793 
2794 	/*
2795 	 * we're now running with the new pmap.  drop the reference
2796 	 * to the old pmap.  if we block, we need to go around again.
2797 	 */
2798 
2799 	pmap_destroy(oldpmap);
2800 	if (l->l_ncsw != ncsw) {
2801 		goto retry;
2802 	}
2803 
2804 	kpreempt_enable();
2805 }
2806 
2807 /*
2808  * pmap_deactivate: deactivate a process' pmap
2809  *
2810  * => must be called with kernel preemption disabled (high SPL is enough)
2811  */
2812 
2813 void
2814 pmap_deactivate(struct lwp *l)
2815 {
2816 	struct pmap *pmap;
2817 	struct cpu_info *ci;
2818 
2819 	KASSERT(kpreempt_disabled());
2820 
2821 	if (l != curlwp) {
2822 		return;
2823 	}
2824 
2825 	/*
2826 	 * wait for pending TLB shootdowns to complete.  necessary
2827 	 * because TLB shootdown state is per-CPU, and the LWP may
2828 	 * be coming off the CPU before it has a chance to call
2829 	 * pmap_update().
2830 	 */
2831 	pmap_tlb_shootwait();
2832 
2833 	ci = curcpu();
2834 
2835 	if (ci->ci_want_pmapload) {
2836 		KASSERT(vm_map_pmap(&l->l_proc->p_vmspace->vm_map)
2837 		    != pmap_kernel());
2838 		KASSERT(vm_map_pmap(&l->l_proc->p_vmspace->vm_map)
2839 		    != ci->ci_pmap || ci->ci_tlbstate != TLBSTATE_VALID);
2840 
2841 		/*
2842 		 * userspace has not been touched.
2843 		 * nothing to do here.
2844 		 */
2845 
2846 		ci->ci_want_pmapload = 0;
2847 		return;
2848 	}
2849 
2850 	pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map);
2851 
2852 	if (pmap == pmap_kernel()) {
2853 		return;
2854 	}
2855 
2856 #if defined(XEN) && defined(__x86_64__)
2857 	KASSERT(pmap_pdirpa(pmap, 0) == ci->ci_xen_current_user_pgd);
2858 #elif defined(PAE)
2859 	KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(ci->ci_pae_l3_pdir[0]));
2860 #elif !defined(XEN)
2861 	KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(rcr3()));
2862 #endif
2863 	KASSERT(ci->ci_pmap == pmap);
2864 
2865 	/*
2866 	 * we aren't interested in TLB invalidations for this pmap,
2867 	 * at least for the time being.
2868 	 */
2869 
2870 	KASSERT(ci->ci_tlbstate == TLBSTATE_VALID);
2871 	ci->ci_tlbstate = TLBSTATE_LAZY;
2872 }
2873 
2874 /*
2875  * end of lifecycle functions
2876  */
2877 
2878 /*
2879  * some misc. functions
2880  */
2881 
2882 int
2883 pmap_pdes_invalid(vaddr_t va, pd_entry_t * const *pdes, pd_entry_t *lastpde)
2884 {
2885 	int i;
2886 	unsigned long index;
2887 	pd_entry_t pde;
2888 
2889 	for (i = PTP_LEVELS; i > 1; i--) {
2890 		index = pl_i(va, i);
2891 		pde = pdes[i - 2][index];
2892 		if ((pde & PG_V) == 0)
2893 			return i;
2894 	}
2895 	if (lastpde != NULL)
2896 		*lastpde = pde;
2897 	return 0;
2898 }
2899 
2900 /*
2901  * pmap_extract: extract a PA for the given VA
2902  */
2903 
2904 bool
2905 pmap_extract(struct pmap *pmap, vaddr_t va, paddr_t *pap)
2906 {
2907 	pt_entry_t *ptes, pte;
2908 	pd_entry_t pde;
2909 	pd_entry_t * const *pdes;
2910 	struct pmap *pmap2;
2911 	struct cpu_info *ci;
2912 	paddr_t pa;
2913 	lwp_t *l;
2914 	bool hard, rv;
2915 
2916 	rv = false;
2917 	pa = 0;
2918 	l = curlwp;
2919 
2920 	KPREEMPT_DISABLE(l);
2921 	ci = l->l_cpu;
2922 	if (__predict_true(!ci->ci_want_pmapload && ci->ci_pmap == pmap) ||
2923 	    pmap == pmap_kernel()) {
2924 		/*
2925 		 * no need to lock, because it's pmap_kernel() or our
2926 		 * own pmap and is active.  if a user pmap, the caller
2927 		 * will hold the vm_map write/read locked and so prevent
2928 		 * entries from disappearing while we are here.  ptps
2929 		 * can disappear via pmap_remove() and pmap_protect(),
2930 		 * but they are called with the vm_map write locked.
2931 		 */
2932 		hard = false;
2933 		ptes = PTE_BASE;
2934 		pdes = normal_pdes;
2935 	} else {
2936 		/* we lose, do it the hard way. */
2937 		hard = true;
2938 		pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
2939 	}
2940 	if (pmap_pdes_valid(va, pdes, &pde)) {
2941 		pte = ptes[pl1_i(va)];
2942 		if (pde & PG_PS) {
2943 			pa = (pde & PG_LGFRAME) | (va & (NBPD_L2 - 1));
2944 			rv = true;
2945 		} else if (__predict_true((pte & PG_V) != 0)) {
2946 			pa = pmap_pte2pa(pte) | (va & (NBPD_L1 - 1));
2947 			rv = true;
2948 		}
2949 	}
2950 	if (__predict_false(hard)) {
2951 		pmap_unmap_ptes(pmap, pmap2);
2952 	}
2953 	KPREEMPT_ENABLE(l);
2954 	if (pap != NULL) {
2955 		*pap = pa;
2956 	}
2957 	return rv;
2958 }
2959 
2960 
2961 /*
2962  * vtophys: virtual address to physical address.  For use by
2963  * machine-dependent code only.
2964  */
2965 
2966 paddr_t
2967 vtophys(vaddr_t va)
2968 {
2969 	paddr_t pa;
2970 
2971 	if (pmap_extract(pmap_kernel(), va, &pa) == true)
2972 		return (pa);
2973 	return (0);
2974 }
2975 
2976 __weak_alias(pmap_extract_ma, pmap_extract);
2977 
2978 #ifdef XEN
2979 
2980 /*
2981  * vtomach: virtual address to machine address.  For use by
2982  * machine-dependent code only.
2983  */
2984 
2985 paddr_t
2986 vtomach(vaddr_t va)
2987 {
2988 	paddr_t pa;
2989 
2990 	if (pmap_extract_ma(pmap_kernel(), va, &pa) == true)
2991 		return (pa);
2992 	return (0);
2993 }
2994 
2995 #endif /* XEN */
2996 
2997 /*
2998  * pmap_virtual_space: used during bootup [pmap_steal_memory] to
2999  *	determine the bounds of the kernel virtual addess space.
3000  */
3001 
3002 void
3003 pmap_virtual_space(vaddr_t *startp, vaddr_t *endp)
3004 {
3005 	*startp = virtual_avail;
3006 	*endp = virtual_end;
3007 }
3008 
3009 /*
3010  * pmap_map: map a range of PAs into kvm.
3011  *
3012  * => used during crash dump
3013  * => XXX: pmap_map() should be phased out?
3014  */
3015 
3016 vaddr_t
3017 pmap_map(vaddr_t va, paddr_t spa, paddr_t epa, vm_prot_t prot)
3018 {
3019 	while (spa < epa) {
3020 		pmap_kenter_pa(va, spa, prot, 0);
3021 		va += PAGE_SIZE;
3022 		spa += PAGE_SIZE;
3023 	}
3024 	pmap_update(pmap_kernel());
3025 	return va;
3026 }
3027 
3028 /*
3029  * pmap_zero_page: zero a page
3030  */
3031 
3032 void
3033 pmap_zero_page(paddr_t pa)
3034 {
3035 	pt_entry_t *zpte;
3036 	void *zerova;
3037 	int id;
3038 
3039 	kpreempt_disable();
3040 	id = cpu_number();
3041 	zpte = PTESLEW(zero_pte, id);
3042 	zerova = VASLEW(zerop, id);
3043 
3044 #ifdef DIAGNOSTIC
3045 	if (*zpte)
3046 		panic("pmap_zero_page: lock botch");
3047 #endif
3048 
3049 	pmap_pte_set(zpte, pmap_pa2pte(pa) | PG_V | PG_RW | PG_M | PG_U | PG_k);
3050 	pmap_pte_flush();
3051 	pmap_update_pg((vaddr_t)zerova);		/* flush TLB */
3052 
3053 	memset(zerova, 0, PAGE_SIZE);
3054 
3055 #if defined(DIAGNOSTIC) || defined(XEN)
3056 	pmap_pte_set(zpte, 0);				/* zap ! */
3057 	pmap_pte_flush();
3058 #endif
3059 	kpreempt_enable();
3060 }
3061 
3062 /*
3063  * pmap_pagezeroidle: the same, for the idle loop page zero'er.
3064  * Returns true if the page was zero'd, false if we aborted for
3065  * some reason.
3066  */
3067 
3068 bool
3069 pmap_pageidlezero(paddr_t pa)
3070 {
3071 	pt_entry_t *zpte;
3072 	void *zerova;
3073 	bool rv;
3074 	int id;
3075 
3076 	id = cpu_number();
3077 	zpte = PTESLEW(zero_pte, id);
3078 	zerova = VASLEW(zerop, id);
3079 
3080 	KASSERT(cpu_feature[0] & CPUID_SSE2);
3081 	KASSERT(*zpte == 0);
3082 
3083 	pmap_pte_set(zpte, pmap_pa2pte(pa) | PG_V | PG_RW | PG_M | PG_U | PG_k);
3084 	pmap_pte_flush();
3085 	pmap_update_pg((vaddr_t)zerova);		/* flush TLB */
3086 
3087 	rv = sse2_idlezero_page(zerova);
3088 
3089 #if defined(DIAGNOSTIC) || defined(XEN)
3090 	pmap_pte_set(zpte, 0);				/* zap ! */
3091 	pmap_pte_flush();
3092 #endif
3093 
3094 	return rv;
3095 }
3096 
3097 /*
3098  * pmap_copy_page: copy a page
3099  */
3100 
3101 void
3102 pmap_copy_page(paddr_t srcpa, paddr_t dstpa)
3103 {
3104 	pt_entry_t *spte;
3105 	pt_entry_t *dpte;
3106 	void *csrcva;
3107 	void *cdstva;
3108 	int id;
3109 
3110 	kpreempt_disable();
3111 	id = cpu_number();
3112 	spte = PTESLEW(csrc_pte,id);
3113 	dpte = PTESLEW(cdst_pte,id);
3114 	csrcva = VASLEW(csrcp, id);
3115 	cdstva = VASLEW(cdstp, id);
3116 
3117 	KASSERT(*spte == 0 && *dpte == 0);
3118 
3119 	pmap_pte_set(spte, pmap_pa2pte(srcpa) | PG_V | PG_RW | PG_U | PG_k);
3120 	pmap_pte_set(dpte,
3121 	    pmap_pa2pte(dstpa) | PG_V | PG_RW | PG_M | PG_U | PG_k);
3122 	pmap_pte_flush();
3123 	pmap_update_2pg((vaddr_t)csrcva, (vaddr_t)cdstva);
3124 
3125 	memcpy(cdstva, csrcva, PAGE_SIZE);
3126 
3127 #if defined(DIAGNOSTIC) || defined(XEN)
3128 	pmap_pte_set(spte, 0);
3129 	pmap_pte_set(dpte, 0);
3130 	pmap_pte_flush();
3131 #endif
3132 	kpreempt_enable();
3133 }
3134 
3135 static pt_entry_t *
3136 pmap_map_ptp(struct vm_page *ptp)
3137 {
3138 	pt_entry_t *ptppte;
3139 	void *ptpva;
3140 	int id;
3141 
3142 	KASSERT(kpreempt_disabled());
3143 
3144 	id = cpu_number();
3145 	ptppte = PTESLEW(ptp_pte, id);
3146 	ptpva = VASLEW(ptpp, id);
3147 #if !defined(XEN)
3148 	pmap_pte_set(ptppte, pmap_pa2pte(VM_PAGE_TO_PHYS(ptp)) | PG_V | PG_M |
3149 	    PG_RW | PG_U | PG_k);
3150 #else
3151 	pmap_pte_set(ptppte, pmap_pa2pte(VM_PAGE_TO_PHYS(ptp)) | PG_V | PG_M |
3152 	    PG_U | PG_k);
3153 #endif
3154 	pmap_pte_flush();
3155 	pmap_update_pg((vaddr_t)ptpva);
3156 
3157 	return (pt_entry_t *)ptpva;
3158 }
3159 
3160 static void
3161 pmap_unmap_ptp(void)
3162 {
3163 #if defined(DIAGNOSTIC) || defined(XEN)
3164 	pt_entry_t *pte;
3165 
3166 	KASSERT(kpreempt_disabled());
3167 
3168 	pte = PTESLEW(ptp_pte, cpu_number());
3169 	if (*pte != 0) {
3170 		pmap_pte_set(pte, 0);
3171 		pmap_pte_flush();
3172 	}
3173 #endif
3174 }
3175 
3176 static pt_entry_t *
3177 pmap_map_pte(struct pmap *pmap, struct vm_page *ptp, vaddr_t va)
3178 {
3179 
3180 	KASSERT(kpreempt_disabled());
3181 	if (pmap_is_curpmap(pmap)) {
3182 		return &PTE_BASE[pl1_i(va)]; /* (k)vtopte */
3183 	}
3184 	KASSERT(ptp != NULL);
3185 	return pmap_map_ptp(ptp) + pl1_pi(va);
3186 }
3187 
3188 static void
3189 pmap_unmap_pte(void)
3190 {
3191 
3192 	KASSERT(kpreempt_disabled());
3193 
3194 	pmap_unmap_ptp();
3195 }
3196 
3197 /*
3198  * p m a p   r e m o v e   f u n c t i o n s
3199  *
3200  * functions that remove mappings
3201  */
3202 
3203 /*
3204  * pmap_remove_ptes: remove PTEs from a PTP
3205  *
3206  * => must have proper locking on pmap_master_lock
3207  * => caller must hold pmap's lock
3208  * => PTP must be mapped into KVA
3209  * => PTP should be null if pmap == pmap_kernel()
3210  * => must be called with kernel preemption disabled
3211  * => returns composite pte if at least one page should be shot down
3212  */
3213 
3214 static pt_entry_t
3215 pmap_remove_ptes(struct pmap *pmap, struct vm_page *ptp, vaddr_t ptpva,
3216 		 vaddr_t startva, vaddr_t endva, struct pv_entry **pv_tofree)
3217 {
3218 	struct pv_entry *pve;
3219 	pt_entry_t *pte = (pt_entry_t *) ptpva;
3220 	pt_entry_t opte, xpte = 0;
3221 
3222 	KASSERT(pmap == pmap_kernel() || mutex_owned(&pmap->pm_lock));
3223 	KASSERT(kpreempt_disabled());
3224 
3225 	/*
3226 	 * note that ptpva points to the PTE that maps startva.   this may
3227 	 * or may not be the first PTE in the PTP.
3228 	 *
3229 	 * we loop through the PTP while there are still PTEs to look at
3230 	 * and the wire_count is greater than 1 (because we use the wire_count
3231 	 * to keep track of the number of real PTEs in the PTP).
3232 	 */
3233 
3234 	for (/*null*/; startva < endva && (ptp == NULL || ptp->wire_count > 1)
3235 			     ; pte++, startva += PAGE_SIZE) {
3236 		struct vm_page *pg;
3237 		struct pmap_page *pp;
3238 
3239 		if (!pmap_valid_entry(*pte))
3240 			continue;			/* VA not mapped */
3241 
3242 		/* atomically save the old PTE and zap! it */
3243 		opte = pmap_pte_testset(pte, 0);
3244 		if (!pmap_valid_entry(opte)) {
3245 			continue;
3246 		}
3247 
3248 		pmap_exec_account(pmap, startva, opte, 0);
3249 		pmap_stats_update_bypte(pmap, 0, opte);
3250 		xpte |= opte;
3251 
3252 		if (ptp) {
3253 			ptp->wire_count--;		/* dropping a PTE */
3254 			/* Make sure that the PDE is flushed */
3255 			if (ptp->wire_count <= 1)
3256 				xpte |= PG_U;
3257 		}
3258 
3259 		/*
3260 		 * if we are not on a pv_head list we are done.
3261 		 */
3262 
3263 		if ((opte & PG_PVLIST) == 0) {
3264 #if defined(DIAGNOSTIC) && !defined(DOM0OPS)
3265 			if (PHYS_TO_VM_PAGE(pmap_pte2pa(opte)) != NULL)
3266 				panic("pmap_remove_ptes: managed page without "
3267 				      "PG_PVLIST for %#" PRIxVADDR, startva);
3268 #endif
3269 			continue;
3270 		}
3271 
3272 		pg = PHYS_TO_VM_PAGE(pmap_pte2pa(opte));
3273 #ifdef DIAGNOSTIC
3274 		if (pg == NULL)
3275 			panic("pmap_remove_ptes: unmanaged page marked "
3276 			      "PG_PVLIST, va = %#" PRIxVADDR ", "
3277 			      "pa = %#" PRIxPADDR,
3278 			      startva, (paddr_t)pmap_pte2pa(opte));
3279 #endif
3280 
3281 		/* sync R/M bits */
3282 		pp = VM_PAGE_TO_PP(pg);
3283 		pp_lock(pp);
3284 		pp->pp_attrs |= opte;
3285 		pve = pmap_remove_pv(pp, ptp, startva);
3286 		pp_unlock(pp);
3287 
3288 		if (pve != NULL) {
3289 			pve->pve_next = *pv_tofree;
3290 			*pv_tofree = pve;
3291 		}
3292 
3293 		/* end of "for" loop: time for next pte */
3294 	}
3295 
3296 	return xpte;
3297 }
3298 
3299 
3300 /*
3301  * pmap_remove_pte: remove a single PTE from a PTP
3302  *
3303  * => must have proper locking on pmap_master_lock
3304  * => caller must hold pmap's lock
3305  * => PTP must be mapped into KVA
3306  * => PTP should be null if pmap == pmap_kernel()
3307  * => returns true if we removed a mapping
3308  * => must be called with kernel preemption disabled
3309  */
3310 
3311 static bool
3312 pmap_remove_pte(struct pmap *pmap, struct vm_page *ptp, pt_entry_t *pte,
3313 		vaddr_t va, struct pv_entry **pv_tofree)
3314 {
3315 	pt_entry_t opte;
3316 	struct pv_entry *pve;
3317 	struct vm_page *pg;
3318 	struct pmap_page *pp;
3319 
3320 	KASSERT(pmap == pmap_kernel() || mutex_owned(&pmap->pm_lock));
3321 	KASSERT(pmap == pmap_kernel() || kpreempt_disabled());
3322 
3323 	if (!pmap_valid_entry(*pte))
3324 		return(false);		/* VA not mapped */
3325 
3326 	/* atomically save the old PTE and zap! it */
3327 	opte = pmap_pte_testset(pte, 0);
3328 	if (!pmap_valid_entry(opte)) {
3329 		return false;
3330 	}
3331 
3332 	pmap_exec_account(pmap, va, opte, 0);
3333 	pmap_stats_update_bypte(pmap, 0, opte);
3334 
3335 	if (opte & PG_U)
3336 		pmap_tlb_shootdown(pmap, va, 0, opte);
3337 
3338 	if (ptp) {
3339 		ptp->wire_count--;		/* dropping a PTE */
3340 		/* Make sure that the PDE is flushed */
3341 		if ((ptp->wire_count <= 1) && !(opte & PG_U))
3342 			pmap_tlb_shootdown(pmap, va, 0, opte);
3343 	}
3344 
3345 	/*
3346 	 * if we are not on a pv_head list we are done.
3347 	 */
3348 
3349 	if ((opte & PG_PVLIST) == 0) {
3350 #if defined(DIAGNOSTIC) && !defined(DOM0OPS)
3351 		if (PHYS_TO_VM_PAGE(pmap_pte2pa(opte)) != NULL)
3352 			panic("pmap_remove_pte: managed page without "
3353 			      "PG_PVLIST for %#" PRIxVADDR, va);
3354 #endif
3355 		return(true);
3356 	}
3357 
3358 	pg = PHYS_TO_VM_PAGE(pmap_pte2pa(opte));
3359 #ifdef DIAGNOSTIC
3360 	if (pg == NULL)
3361 		panic("pmap_remove_pte: unmanaged page marked "
3362 		    "PG_PVLIST, va = %#" PRIxVADDR ", pa = %#" PRIxPADDR,
3363 		    va, (paddr_t)pmap_pte2pa(opte));
3364 #endif
3365 
3366 	/* sync R/M bits */
3367 	pp = VM_PAGE_TO_PP(pg);
3368 	pp_lock(pp);
3369 	pp->pp_attrs |= opte;
3370 	pve = pmap_remove_pv(pp, ptp, va);
3371 	pp_unlock(pp);
3372 
3373 	if (pve) {
3374 		pve->pve_next = *pv_tofree;
3375 		*pv_tofree = pve;
3376 	}
3377 
3378 	return(true);
3379 }
3380 
3381 /*
3382  * pmap_remove: mapping removal function.
3383  *
3384  * => caller should not be holding any pmap locks
3385  */
3386 
3387 void
3388 pmap_remove(struct pmap *pmap, vaddr_t sva, vaddr_t eva)
3389 {
3390 	pt_entry_t *ptes, xpte = 0;
3391 	pd_entry_t pde;
3392 	pd_entry_t * const *pdes;
3393 	struct pv_entry *pv_tofree = NULL;
3394 	bool result;
3395 	int i;
3396 	paddr_t ptppa;
3397 	vaddr_t blkendva, va = sva;
3398 	struct vm_page *ptp;
3399 	struct pmap *pmap2;
3400 
3401 	kpreempt_disable();
3402 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);	/* locks pmap */
3403 
3404 	/*
3405 	 * removing one page?  take shortcut function.
3406 	 */
3407 
3408 	if (va + PAGE_SIZE == eva) {
3409 		if (pmap_pdes_valid(va, pdes, &pde)) {
3410 
3411 			/* PA of the PTP */
3412 			ptppa = pmap_pte2pa(pde);
3413 
3414 			/* get PTP if non-kernel mapping */
3415 			if (pmap == pmap_kernel()) {
3416 				/* we never free kernel PTPs */
3417 				ptp = NULL;
3418 			} else {
3419 				ptp = pmap_find_ptp(pmap, va, ptppa, 1);
3420 #ifdef DIAGNOSTIC
3421 				if (ptp == NULL)
3422 					panic("pmap_remove: unmanaged "
3423 					      "PTP detected");
3424 #endif
3425 			}
3426 
3427 			/* do it! */
3428 			result = pmap_remove_pte(pmap, ptp,
3429 			    &ptes[pl1_i(va)], va, &pv_tofree);
3430 
3431 			/*
3432 			 * if mapping removed and the PTP is no longer
3433 			 * being used, free it!
3434 			 */
3435 
3436 			if (result && ptp && ptp->wire_count <= 1)
3437 				pmap_free_ptp(pmap, ptp, va, ptes, pdes);
3438 		}
3439 	} else for (/* null */ ; va < eva ; va = blkendva) {
3440 		int lvl;
3441 
3442 		/* determine range of block */
3443 		blkendva = x86_round_pdr(va+1);
3444 		if (blkendva > eva)
3445 			blkendva = eva;
3446 
3447 		/*
3448 		 * XXXCDC: our PTE mappings should never be removed
3449 		 * with pmap_remove!  if we allow this (and why would
3450 		 * we?) then we end up freeing the pmap's page
3451 		 * directory page (PDP) before we are finished using
3452 		 * it when we hit in in the recursive mapping.  this
3453 		 * is BAD.
3454 		 *
3455 		 * long term solution is to move the PTEs out of user
3456 		 * address space.  and into kernel address space (up
3457 		 * with APTE).  then we can set VM_MAXUSER_ADDRESS to
3458 		 * be VM_MAX_ADDRESS.
3459 		 */
3460 
3461 		/* XXXCDC: ugly hack to avoid freeing PDP here */
3462 		for (i = 0; i < PDP_SIZE; i++) {
3463 			if (pl_i(va, PTP_LEVELS) == PDIR_SLOT_PTE+i)
3464 				continue;
3465 		}
3466 
3467 		lvl = pmap_pdes_invalid(va, pdes, &pde);
3468 		if (lvl != 0) {
3469 			/*
3470 			 * skip a range corresponding to an invalid pde.
3471 			 */
3472 			blkendva = (va & ptp_masks[lvl - 1]) + nbpd[lvl - 1];
3473  			continue;
3474 		}
3475 
3476 		/* PA of the PTP */
3477 		ptppa = pmap_pte2pa(pde);
3478 
3479 		/* get PTP if non-kernel mapping */
3480 		if (pmap == pmap_kernel()) {
3481 			/* we never free kernel PTPs */
3482 			ptp = NULL;
3483 		} else {
3484 			ptp = pmap_find_ptp(pmap, va, ptppa, 1);
3485 #ifdef DIAGNOSTIC
3486 			if (ptp == NULL)
3487 				panic("pmap_remove: unmanaged PTP "
3488 				      "detected");
3489 #endif
3490 		}
3491 		xpte |= pmap_remove_ptes(pmap, ptp,
3492 		    (vaddr_t)&ptes[pl1_i(va)], va, blkendva, &pv_tofree);
3493 
3494 		/* if PTP is no longer being used, free it! */
3495 		if (ptp && ptp->wire_count <= 1) {
3496 			pmap_free_ptp(pmap, ptp, va, ptes, pdes);
3497 		}
3498 		if ((xpte & PG_U) != 0)
3499 			pmap_tlb_shootdown(pmap, sva, eva, xpte);
3500 	}
3501 	pmap_unmap_ptes(pmap, pmap2);		/* unlock pmap */
3502 	kpreempt_enable();
3503 
3504 	/* Now we free unused PVs */
3505 	if (pv_tofree)
3506 		pmap_free_pvs(pv_tofree);
3507 }
3508 
3509 /*
3510  * pmap_sync_pv: clear pte bits and return the old value of the pte.
3511  *
3512  * => called with pp_lock held. (thus preemption disabled)
3513  * => issues tlb shootdowns if necessary.
3514  */
3515 
3516 static int
3517 pmap_sync_pv(struct pv_pte *pvpte, pt_entry_t expect, int clearbits,
3518     pt_entry_t *optep)
3519 {
3520 	struct pmap *pmap;
3521 	struct vm_page *ptp;
3522 	vaddr_t va;
3523 	pt_entry_t *ptep;
3524 	pt_entry_t opte;
3525 	pt_entry_t npte;
3526 	bool need_shootdown;
3527 
3528 	ptp = pvpte->pte_ptp;
3529 	va = pvpte->pte_va;
3530 	KASSERT(ptp == NULL || ptp->uobject != NULL);
3531 	KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset);
3532 	pmap = ptp_to_pmap(ptp);
3533 
3534 	KASSERT((expect & ~(PG_FRAME | PG_V)) == 0);
3535 	KASSERT((expect & PG_V) != 0);
3536 	KASSERT(clearbits == ~0 || (clearbits & ~(PG_M | PG_U | PG_RW)) == 0);
3537 	KASSERT(kpreempt_disabled());
3538 
3539 	ptep = pmap_map_pte(pmap, ptp, va);
3540 	do {
3541 		opte = *ptep;
3542 		KASSERT((opte & (PG_M | PG_U)) != PG_M);
3543 		KASSERT((opte & (PG_U | PG_V)) != PG_U);
3544 		KASSERT(opte == 0 || (opte & PG_V) != 0);
3545 		if ((opte & (PG_FRAME | PG_V)) != expect) {
3546 
3547 			/*
3548 			 * we lost a race with a V->P operation like
3549 			 * pmap_remove().  wait for the competitor
3550 			 * reflecting pte bits into mp_attrs.
3551 			 *
3552 			 * issue a redundant TLB shootdown so that
3553 			 * we can wait for its completion.
3554 			 */
3555 
3556 			pmap_unmap_pte();
3557 			if (clearbits != 0) {
3558 				pmap_tlb_shootdown(pmap, va, 0,
3559 				    (pmap == pmap_kernel() ? PG_G : 0));
3560 			}
3561 			return EAGAIN;
3562 		}
3563 
3564 		/*
3565 		 * check if there's anything to do on this pte.
3566 		 */
3567 
3568 		if ((opte & clearbits) == 0) {
3569 			need_shootdown = false;
3570 			break;
3571 		}
3572 
3573 		/*
3574 		 * we need a shootdown if the pte is cached. (PG_U)
3575 		 *
3576 		 * ...unless we are clearing only the PG_RW bit and
3577 		 * it isn't cached as RW. (PG_M)
3578 		 */
3579 
3580 		need_shootdown = (opte & PG_U) != 0 &&
3581 		    !(clearbits == PG_RW && (opte & PG_M) == 0);
3582 
3583 		npte = opte & ~clearbits;
3584 
3585 		/*
3586 		 * if we need a shootdown anyway, clear PG_U and PG_M.
3587 		 */
3588 
3589 		if (need_shootdown) {
3590 			npte &= ~(PG_U | PG_M);
3591 		}
3592 		KASSERT((npte & (PG_M | PG_U)) != PG_M);
3593 		KASSERT((npte & (PG_U | PG_V)) != PG_U);
3594 		KASSERT(npte == 0 || (opte & PG_V) != 0);
3595 	} while (pmap_pte_cas(ptep, opte, npte) != opte);
3596 
3597 	if (need_shootdown) {
3598 		pmap_tlb_shootdown(pmap, va, 0, opte);
3599 	}
3600 	pmap_unmap_pte();
3601 
3602 	*optep = opte;
3603 	return 0;
3604 }
3605 
3606 /*
3607  * pmap_page_remove: remove a managed vm_page from all pmaps that map it
3608  *
3609  * => R/M bits are sync'd back to attrs
3610  */
3611 
3612 void
3613 pmap_page_remove(struct vm_page *pg)
3614 {
3615 	struct pmap_page *pp;
3616 	struct pv_pte *pvpte;
3617 	struct pv_entry *killlist = NULL;
3618 	struct vm_page *ptp;
3619 	pt_entry_t expect;
3620 	lwp_t *l;
3621 	int count;
3622 
3623 	l = curlwp;
3624 	pp = VM_PAGE_TO_PP(pg);
3625 	expect = pmap_pa2pte(VM_PAGE_TO_PHYS(pg)) | PG_V;
3626 	count = SPINLOCK_BACKOFF_MIN;
3627 	kpreempt_disable();
3628 startover:
3629 	pp_lock(pp);
3630 	while ((pvpte = pv_pte_first(pp)) != NULL) {
3631 		struct pmap *pmap;
3632 		struct pv_entry *pve;
3633 		pt_entry_t opte;
3634 		vaddr_t va;
3635 		int error;
3636 
3637 		/*
3638 		 * add a reference to the pmap before clearing the pte.
3639 		 * otherwise the pmap can disappear behind us.
3640 		 */
3641 
3642 		ptp = pvpte->pte_ptp;
3643 		pmap = ptp_to_pmap(ptp);
3644 		if (ptp != NULL) {
3645 			pmap_reference(pmap);
3646 		}
3647 
3648 		error = pmap_sync_pv(pvpte, expect, ~0, &opte);
3649 		if (error == EAGAIN) {
3650 			int hold_count;
3651 			pp_unlock(pp);
3652 			KERNEL_UNLOCK_ALL(curlwp, &hold_count);
3653 			if (ptp != NULL) {
3654 				pmap_destroy(pmap);
3655 			}
3656 			SPINLOCK_BACKOFF(count);
3657 			KERNEL_LOCK(hold_count, curlwp);
3658 			goto startover;
3659 		}
3660 
3661 		pp->pp_attrs |= opte;
3662 		va = pvpte->pte_va;
3663 		pve = pmap_remove_pv(pp, ptp, va);
3664 		pp_unlock(pp);
3665 
3666 		/* update the PTP reference count.  free if last reference. */
3667 		if (ptp != NULL) {
3668 			struct pmap *pmap2;
3669 			pt_entry_t *ptes;
3670 			pd_entry_t * const *pdes;
3671 
3672 			KASSERT(pmap != pmap_kernel());
3673 
3674 			pmap_tlb_shootwait();
3675 			pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
3676 			pmap_stats_update_bypte(pmap, 0, opte);
3677 			ptp->wire_count--;
3678 			if (ptp->wire_count <= 1) {
3679 				pmap_free_ptp(pmap, ptp, va, ptes, pdes);
3680 			}
3681 			pmap_unmap_ptes(pmap, pmap2);
3682 			pmap_destroy(pmap);
3683 		} else {
3684 			KASSERT(pmap == pmap_kernel());
3685 			pmap_stats_update_bypte(pmap, 0, opte);
3686 		}
3687 
3688 		if (pve != NULL) {
3689 			pve->pve_next = killlist;	/* mark it for death */
3690 			killlist = pve;
3691 		}
3692 		pp_lock(pp);
3693 	}
3694 	pp_unlock(pp);
3695 	kpreempt_enable();
3696 
3697 	/* Now free unused pvs. */
3698 	pmap_free_pvs(killlist);
3699 }
3700 
3701 /*
3702  * p m a p   a t t r i b u t e  f u n c t i o n s
3703  * functions that test/change managed page's attributes
3704  * since a page can be mapped multiple times we must check each PTE that
3705  * maps it by going down the pv lists.
3706  */
3707 
3708 /*
3709  * pmap_test_attrs: test a page's attributes
3710  */
3711 
3712 bool
3713 pmap_test_attrs(struct vm_page *pg, unsigned testbits)
3714 {
3715 	struct pmap_page *pp;
3716 	struct pv_pte *pvpte;
3717 	pt_entry_t expect;
3718 	u_int result;
3719 
3720 	pp = VM_PAGE_TO_PP(pg);
3721 	if ((pp->pp_attrs & testbits) != 0) {
3722 		return true;
3723 	}
3724 	expect = pmap_pa2pte(VM_PAGE_TO_PHYS(pg)) | PG_V;
3725 	pp_lock(pp);
3726 	for (pvpte = pv_pte_first(pp); pvpte; pvpte = pv_pte_next(pp, pvpte)) {
3727 		pt_entry_t opte;
3728 		int error;
3729 
3730 		if ((pp->pp_attrs & testbits) != 0) {
3731 			break;
3732 		}
3733 		error = pmap_sync_pv(pvpte, expect, 0, &opte);
3734 		if (error == 0) {
3735 			pp->pp_attrs |= opte;
3736 		}
3737 	}
3738 	result = pp->pp_attrs & testbits;
3739 	pp_unlock(pp);
3740 
3741 	/*
3742 	 * note that we will exit the for loop with a non-null pve if
3743 	 * we have found the bits we are testing for.
3744 	 */
3745 
3746 	return result != 0;
3747 }
3748 
3749 /*
3750  * pmap_clear_attrs: clear the specified attribute for a page.
3751  *
3752  * => we return true if we cleared one of the bits we were asked to
3753  */
3754 
3755 bool
3756 pmap_clear_attrs(struct vm_page *pg, unsigned clearbits)
3757 {
3758 	struct pmap_page *pp;
3759 	struct pv_pte *pvpte;
3760 	u_int result;
3761 	pt_entry_t expect;
3762 	int count;
3763 
3764 	pp = VM_PAGE_TO_PP(pg);
3765 	expect = pmap_pa2pte(VM_PAGE_TO_PHYS(pg)) | PG_V;
3766 	count = SPINLOCK_BACKOFF_MIN;
3767 	kpreempt_disable();
3768 startover:
3769 	pp_lock(pp);
3770 	for (pvpte = pv_pte_first(pp); pvpte; pvpte = pv_pte_next(pp, pvpte)) {
3771 		pt_entry_t opte;
3772 		int error;
3773 
3774 		error = pmap_sync_pv(pvpte, expect, clearbits, &opte);
3775 		if (error == EAGAIN) {
3776 			int hold_count;
3777 			pp_unlock(pp);
3778 			KERNEL_UNLOCK_ALL(curlwp, &hold_count);
3779 			SPINLOCK_BACKOFF(count);
3780 			KERNEL_LOCK(hold_count, curlwp);
3781 			goto startover;
3782 		}
3783 		pp->pp_attrs |= opte;
3784 	}
3785 	result = pp->pp_attrs & clearbits;
3786 	pp->pp_attrs &= ~clearbits;
3787 	pp_unlock(pp);
3788 	kpreempt_enable();
3789 
3790 	return result != 0;
3791 }
3792 
3793 
3794 /*
3795  * p m a p   p r o t e c t i o n   f u n c t i o n s
3796  */
3797 
3798 /*
3799  * pmap_page_protect: change the protection of all recorded mappings
3800  *	of a managed page
3801  *
3802  * => NOTE: this is an inline function in pmap.h
3803  */
3804 
3805 /* see pmap.h */
3806 
3807 /*
3808  * pmap_protect: set the protection in of the pages in a pmap
3809  *
3810  * => NOTE: this is an inline function in pmap.h
3811  */
3812 
3813 /* see pmap.h */
3814 
3815 /*
3816  * pmap_write_protect: write-protect pages in a pmap
3817  */
3818 
3819 void
3820 pmap_write_protect(struct pmap *pmap, vaddr_t sva, vaddr_t eva, vm_prot_t prot)
3821 {
3822 	int i;
3823 	pt_entry_t *ptes, *epte;
3824 	pt_entry_t *spte;
3825 	pd_entry_t * const *pdes;
3826 	vaddr_t blockend, va;
3827 	pt_entry_t opte;
3828 	struct pmap *pmap2;
3829 
3830 	KASSERT(curlwp->l_md.md_gc_pmap != pmap);
3831 
3832 	kpreempt_disable();
3833 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);	/* locks pmap */
3834 
3835 	/* should be ok, but just in case ... */
3836 	sva &= PG_FRAME;
3837 	eva &= PG_FRAME;
3838 
3839 	for (va = sva ; va < eva ; va = blockend) {
3840 
3841 		blockend = (va & L2_FRAME) + NBPD_L2;
3842 		if (blockend > eva)
3843 			blockend = eva;
3844 
3845 		/*
3846 		 * XXXCDC: our PTE mappings should never be write-protected!
3847 		 *
3848 		 * long term solution is to move the PTEs out of user
3849 		 * address space.  and into kernel address space (up
3850 		 * with APTE).  then we can set VM_MAXUSER_ADDRESS to
3851 		 * be VM_MAX_ADDRESS.
3852 		 */
3853 
3854 		/* XXXCDC: ugly hack to avoid freeing PDP here */
3855 		for (i = 0; i < PDP_SIZE; i++) {
3856 			if (pl_i(va, PTP_LEVELS) == PDIR_SLOT_PTE+i)
3857 				continue;
3858 		}
3859 
3860 		/* empty block? */
3861 		if (!pmap_pdes_valid(va, pdes, NULL))
3862 			continue;
3863 
3864 #ifdef DIAGNOSTIC
3865 		if (va >= VM_MAXUSER_ADDRESS &&
3866 		    va < VM_MAX_ADDRESS)
3867 			panic("pmap_write_protect: PTE space");
3868 #endif
3869 
3870 		spte = &ptes[pl1_i(va)];
3871 		epte = &ptes[pl1_i(blockend)];
3872 
3873 		for (/*null */; spte < epte ; spte++) {
3874 			pt_entry_t npte;
3875 
3876 			do {
3877 				opte = *spte;
3878 				if ((~opte & (PG_RW | PG_V)) != 0) {
3879 					goto next;
3880 				}
3881 				npte = opte & ~PG_RW;
3882 			} while (pmap_pte_cas(spte, opte, npte) != opte);
3883 			if ((opte & PG_M) != 0) {
3884 				vaddr_t tva;
3885 
3886 				tva = x86_ptob(spte - ptes);
3887 				pmap_tlb_shootdown(pmap, tva, 0, opte);
3888 			}
3889 next:;
3890 		}
3891 	}
3892 
3893 	pmap_unmap_ptes(pmap, pmap2);	/* unlocks pmap */
3894 	kpreempt_enable();
3895 }
3896 
3897 /*
3898  * end of protection functions
3899  */
3900 
3901 /*
3902  * pmap_unwire: clear the wired bit in the PTE
3903  *
3904  * => mapping should already be in map
3905  */
3906 
3907 void
3908 pmap_unwire(struct pmap *pmap, vaddr_t va)
3909 {
3910 	pt_entry_t *ptes;
3911 	pd_entry_t * const *pdes;
3912 	struct pmap *pmap2;
3913 
3914 	kpreempt_disable();
3915 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);	/* locks pmap */
3916 
3917 	if (pmap_pdes_valid(va, pdes, NULL)) {
3918 		pt_entry_t *ptep = &ptes[pl1_i(va)];
3919 		pt_entry_t opte = *ptep;
3920 
3921 #ifdef DIAGNOSTIC
3922 		if (!pmap_valid_entry(opte))
3923 			panic("pmap_unwire: invalid (unmapped) va 0x%lx", va);
3924 #endif
3925 		if ((opte & PG_W) != 0) {
3926 			pt_entry_t npte = opte & ~PG_W;
3927 
3928 			opte = pmap_pte_testset(ptep, npte);
3929 			pmap_stats_update_bypte(pmap, npte, opte);
3930 		}
3931 #ifdef DIAGNOSTIC
3932 		else {
3933 			printf("pmap_unwire: wiring for pmap %p va 0x%lx "
3934 			       "didn't change!\n", pmap, va);
3935 		}
3936 #endif
3937 		pmap_unmap_ptes(pmap, pmap2);		/* unlocks map */
3938 	}
3939 #ifdef DIAGNOSTIC
3940 	else {
3941 		panic("pmap_unwire: invalid PDE");
3942 	}
3943 #endif
3944 	kpreempt_enable();
3945 }
3946 
3947 /*
3948  * pmap_copy: copy mappings from one pmap to another
3949  *
3950  * => optional function
3951  * void pmap_copy(dst_pmap, src_pmap, dst_addr, len, src_addr)
3952  */
3953 
3954 /*
3955  * defined as macro in pmap.h
3956  */
3957 
3958 __weak_alias(pmap_enter, pmap_enter_default);
3959 
3960 int
3961 pmap_enter_default(pmap_t pmap, vaddr_t va, paddr_t pa, vm_prot_t prot,
3962     u_int flags)
3963 {
3964 	return pmap_enter_ma(pmap, va, pa, pa, prot, flags, 0);
3965 }
3966 
3967 /*
3968  * pmap_enter: enter a mapping into a pmap
3969  *
3970  * => must be done "now" ... no lazy-evaluation
3971  * => we set pmap => pv_head locking
3972  */
3973 int
3974 pmap_enter_ma(struct pmap *pmap, vaddr_t va, paddr_t ma, paddr_t pa,
3975 	   vm_prot_t prot, u_int flags, int domid)
3976 {
3977 	pt_entry_t *ptes, opte, npte;
3978 	pt_entry_t *ptep;
3979 	pd_entry_t * const *pdes;
3980 	struct vm_page *ptp, *pg;
3981 	struct pmap_page *new_pp;
3982 	struct pmap_page *old_pp;
3983 	struct pv_entry *old_pve = NULL;
3984 	struct pv_entry *new_pve;
3985 	struct pv_entry *new_pve2;
3986 	int error;
3987 	bool wired = (flags & PMAP_WIRED) != 0;
3988 	struct pmap *pmap2;
3989 
3990 	KASSERT(pmap_initialized);
3991 	KASSERT(curlwp->l_md.md_gc_pmap != pmap);
3992 
3993 #ifdef DIAGNOSTIC
3994 	/* sanity check: totally out of range? */
3995 	if (va >= VM_MAX_KERNEL_ADDRESS)
3996 		panic("pmap_enter: too big");
3997 
3998 	if (va == (vaddr_t) PDP_BASE || va == (vaddr_t) APDP_BASE)
3999 		panic("pmap_enter: trying to map over PDP/APDP!");
4000 
4001 	/* sanity check: kernel PTPs should already have been pre-allocated */
4002 	if (va >= VM_MIN_KERNEL_ADDRESS &&
4003 	    !pmap_valid_entry(pmap->pm_pdir[pl_i(va, PTP_LEVELS)]))
4004 		panic("pmap_enter: missing kernel PTP for va %lx!", va);
4005 #endif /* DIAGNOSTIC */
4006 #ifdef XEN
4007 	KASSERT(domid == DOMID_SELF || pa == 0);
4008 #endif /* XEN */
4009 
4010 	npte = ma | protection_codes[prot] | PG_V;
4011 	npte |= pmap_pat_flags(flags);
4012 	if (wired)
4013 	        npte |= PG_W;
4014 	if (va < VM_MAXUSER_ADDRESS)
4015 		npte |= PG_u;
4016 	else if (va < VM_MAX_ADDRESS)
4017 		npte |= (PG_u | PG_RW);	/* XXXCDC: no longer needed? */
4018 	else
4019 		npte |= PG_k;
4020 	if (pmap == pmap_kernel())
4021 		npte |= pmap_pg_g;
4022 	if (flags & VM_PROT_ALL) {
4023 		npte |= PG_U;
4024 		if (flags & VM_PROT_WRITE) {
4025 			KASSERT((npte & PG_RW) != 0);
4026 			npte |= PG_M;
4027 		}
4028 	}
4029 
4030 #ifdef XEN
4031 	if (domid != DOMID_SELF)
4032 		pg = NULL;
4033 	else
4034 #endif
4035 		pg = PHYS_TO_VM_PAGE(pa);
4036 	if (pg != NULL) {
4037 		/* This is a managed page */
4038 		npte |= PG_PVLIST;
4039 		new_pp = VM_PAGE_TO_PP(pg);
4040 	} else {
4041 		new_pp = NULL;
4042 	}
4043 
4044 	/* get pves. */
4045 	new_pve = pool_cache_get(&pmap_pv_cache, PR_NOWAIT);
4046 	new_pve2 = pool_cache_get(&pmap_pv_cache, PR_NOWAIT);
4047 	if (new_pve == NULL || new_pve2 == NULL) {
4048 		if (flags & PMAP_CANFAIL) {
4049 			error = ENOMEM;
4050 			goto out2;
4051 		}
4052 		panic("pmap_enter: pve allocation failed");
4053 	}
4054 
4055 	kpreempt_disable();
4056 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);	/* locks pmap */
4057 	if (pmap == pmap_kernel()) {
4058 		ptp = NULL;
4059 	} else {
4060 		ptp = pmap_get_ptp(pmap, va, pdes);
4061 		if (ptp == NULL) {
4062 			pmap_unmap_ptes(pmap, pmap2);
4063 			if (flags & PMAP_CANFAIL) {
4064 				error = ENOMEM;
4065 				goto out;
4066 			}
4067 			panic("pmap_enter: get ptp failed");
4068 		}
4069 	}
4070 
4071 	/*
4072 	 * update the pte.
4073 	 */
4074 
4075 	ptep = &ptes[pl1_i(va)];
4076 	do {
4077 		opte = *ptep;
4078 
4079 		/*
4080 		 * if the same page, inherit PG_U and PG_M.
4081 		 */
4082 		if (((opte ^ npte) & (PG_FRAME | PG_V)) == 0) {
4083 			npte |= opte & (PG_U | PG_M);
4084 		}
4085 #if defined(XEN)
4086 		if (domid != DOMID_SELF) {
4087 			/* pmap_pte_cas with error handling */
4088 			int s = splvm();
4089 			if (opte != *ptep) {
4090 				splx(s);
4091 				continue;
4092 			}
4093 			error = xpq_update_foreign(
4094 			    vtomach((vaddr_t)ptep), npte, domid);
4095 			splx(s);
4096 			if (error) {
4097 				if (ptp != NULL && ptp->wire_count <= 1) {
4098 					pmap_free_ptp(pmap, ptp, va, ptes, pdes);
4099 				}
4100 				pmap_unmap_ptes(pmap, pmap2);
4101 				goto out;
4102 			}
4103 			break;
4104 		}
4105 #endif /* defined(XEN) */
4106 	} while (pmap_pte_cas(ptep, opte, npte) != opte);
4107 
4108 	/*
4109 	 * update statistics and PTP's reference count.
4110 	 */
4111 
4112 	pmap_stats_update_bypte(pmap, npte, opte);
4113 	if (ptp != NULL && !pmap_valid_entry(opte)) {
4114 		ptp->wire_count++;
4115 	}
4116 	KASSERT(ptp == NULL || ptp->wire_count > 1);
4117 
4118 	/*
4119 	 * if the same page, we can skip pv_entry handling.
4120 	 */
4121 
4122 	if (((opte ^ npte) & (PG_FRAME | PG_V)) == 0) {
4123 		KASSERT(((opte ^ npte) & PG_PVLIST) == 0);
4124 		goto same_pa;
4125 	}
4126 
4127 	/*
4128 	 * if old page is managed, remove pv_entry from its list.
4129 	 */
4130 
4131 	if ((~opte & (PG_V | PG_PVLIST)) == 0) {
4132 		pg = PHYS_TO_VM_PAGE(pmap_pte2pa(opte));
4133 #ifdef DIAGNOSTIC
4134 		if (pg == NULL)
4135 			panic("pmap_enter: PG_PVLIST mapping with "
4136 			      "unmanaged page "
4137 			      "pa = 0x%" PRIx64 " (0x%" PRIx64 ")",
4138 			      (int64_t)pa, (int64_t)atop(pa));
4139 #endif
4140 		old_pp = VM_PAGE_TO_PP(pg);
4141 
4142 		pp_lock(old_pp);
4143 		old_pve = pmap_remove_pv(old_pp, ptp, va);
4144 		old_pp->pp_attrs |= opte;
4145 		pp_unlock(old_pp);
4146 	}
4147 
4148 	/*
4149 	 * if new page is managed, insert pv_entry into its list.
4150 	 */
4151 
4152 	if (new_pp) {
4153 		pp_lock(new_pp);
4154 		new_pve = pmap_enter_pv(new_pp, new_pve, &new_pve2, ptp, va);
4155 		pp_unlock(new_pp);
4156 	}
4157 
4158 same_pa:
4159 	pmap_unmap_ptes(pmap, pmap2);
4160 
4161 	/*
4162 	 * shootdown tlb if necessary.
4163 	 */
4164 
4165 	if ((~opte & (PG_V | PG_U)) == 0 &&
4166 	    ((opte ^ npte) & (PG_FRAME | PG_RW)) != 0) {
4167 		pmap_tlb_shootdown(pmap, va, 0, opte);
4168 	}
4169 
4170 	error = 0;
4171 out:
4172 	kpreempt_enable();
4173 out2:
4174 	if (old_pve != NULL) {
4175 		pool_cache_put(&pmap_pv_cache, old_pve);
4176 	}
4177 	if (new_pve != NULL) {
4178 		pool_cache_put(&pmap_pv_cache, new_pve);
4179 	}
4180 	if (new_pve2 != NULL) {
4181 		pool_cache_put(&pmap_pv_cache, new_pve2);
4182 	}
4183 
4184 	return error;
4185 }
4186 
4187 static bool
4188 pmap_get_physpage(vaddr_t va, int level, paddr_t *paddrp)
4189 {
4190 	struct vm_page *ptp;
4191 	struct pmap *kpm = pmap_kernel();
4192 
4193 	if (uvm.page_init_done == false) {
4194 		/*
4195 		 * we're growing the kernel pmap early (from
4196 		 * uvm_pageboot_alloc()).  this case must be
4197 		 * handled a little differently.
4198 		 */
4199 
4200 		if (uvm_page_physget(paddrp) == false)
4201 			panic("pmap_get_physpage: out of memory");
4202 		kpreempt_disable();
4203 		pmap_pte_set(early_zero_pte,
4204 		    pmap_pa2pte(*paddrp) | PG_V | PG_RW | PG_k);
4205 		pmap_pte_flush();
4206 		pmap_update_pg((vaddr_t)early_zerop);
4207 		memset(early_zerop, 0, PAGE_SIZE);
4208 #if defined(DIAGNOSTIC) || defined (XEN)
4209 		pmap_pte_set(early_zero_pte, 0);
4210 		pmap_pte_flush();
4211 #endif /* defined(DIAGNOSTIC) */
4212 		kpreempt_enable();
4213 	} else {
4214 		/* XXX */
4215 		PMAP_SUBOBJ_LOCK(kpm, level - 1);
4216 		ptp = uvm_pagealloc(&kpm->pm_obj[level - 1],
4217 				    ptp_va2o(va, level), NULL,
4218 				    UVM_PGA_USERESERVE|UVM_PGA_ZERO);
4219 		PMAP_SUBOBJ_UNLOCK(kpm, level - 1);
4220 		if (ptp == NULL)
4221 			panic("pmap_get_physpage: out of memory");
4222 		ptp->flags &= ~PG_BUSY;
4223 		ptp->wire_count = 1;
4224 		*paddrp = VM_PAGE_TO_PHYS(ptp);
4225 	}
4226 	pmap_stats_update(kpm, 1, 0);
4227 	return true;
4228 }
4229 
4230 /*
4231  * Allocate the amount of specified ptps for a ptp level, and populate
4232  * all levels below accordingly, mapping virtual addresses starting at
4233  * kva.
4234  *
4235  * Used by pmap_growkernel.
4236  */
4237 static void
4238 pmap_alloc_level(pd_entry_t * const *pdes, vaddr_t kva, int lvl,
4239     long *needed_ptps)
4240 {
4241 	unsigned long i;
4242 	vaddr_t va;
4243 	paddr_t pa;
4244 	unsigned long index, endindex;
4245 	int level;
4246 	pd_entry_t *pdep;
4247 #ifdef XEN
4248 	int s = splvm(); /* protect xpq_* */
4249 #endif
4250 
4251 	for (level = lvl; level > 1; level--) {
4252 		if (level == PTP_LEVELS)
4253 			pdep = pmap_kernel()->pm_pdir;
4254 		else
4255 			pdep = pdes[level - 2];
4256 		va = kva;
4257 		index = pl_i_roundup(kva, level);
4258 		endindex = index + needed_ptps[level - 1] - 1;
4259 
4260 
4261 		for (i = index; i <= endindex; i++) {
4262 			KASSERT(!pmap_valid_entry(pdep[i]));
4263 			pmap_get_physpage(va, level - 1, &pa);
4264 #ifdef XEN
4265 			xpq_queue_pte_update((level == PTP_LEVELS) ?
4266 			    xpmap_ptom(pmap_pdirpa(pmap_kernel(), i)) :
4267 			    xpmap_ptetomach(&pdep[i]),
4268 			    pmap_pa2pte(pa) | PG_k | PG_V | PG_RW);
4269 #ifdef PAE
4270 			if (level == PTP_LEVELS &&  i > L2_SLOT_KERN) {
4271 				/* update real kernel PD too */
4272 				xpq_queue_pte_update(
4273 				    xpmap_ptetomach(&pmap_kl2pd[l2tol2(i)]),
4274 				    pmap_pa2pte(pa) | PG_k | PG_V | PG_RW);
4275 			}
4276 #endif
4277 #else /* XEN */
4278 			pdep[i] = pmap_pa2pte(pa) | PG_k | PG_V | PG_RW;
4279 #endif /* XEN */
4280 			KASSERT(level != PTP_LEVELS || nkptp[level - 1] +
4281 			    pl_i(VM_MIN_KERNEL_ADDRESS, level) == i);
4282 			nkptp[level - 1]++;
4283 			va += nbpd[level - 1];
4284 		}
4285 		pmap_pte_flush();
4286 	}
4287 #ifdef XEN
4288 	splx(s);
4289 #endif
4290 }
4291 
4292 /*
4293  * pmap_growkernel: increase usage of KVM space
4294  *
4295  * => we allocate new PTPs for the kernel and install them in all
4296  *	the pmaps on the system.
4297  */
4298 
4299 vaddr_t
4300 pmap_growkernel(vaddr_t maxkvaddr)
4301 {
4302 	struct pmap *kpm = pmap_kernel();
4303 #if !defined(XEN) || !defined(__x86_64__)
4304 	struct pmap *pm;
4305 #endif
4306 	int s, i;
4307 	long needed_kptp[PTP_LEVELS], target_nptp, old;
4308 	bool invalidate = false;
4309 
4310 	s = splvm();	/* to be safe */
4311 	mutex_enter(&kpm->pm_lock);
4312 
4313 	if (maxkvaddr <= pmap_maxkvaddr) {
4314 		mutex_exit(&kpm->pm_lock);
4315 		splx(s);
4316 		return pmap_maxkvaddr;
4317 	}
4318 
4319 	maxkvaddr = x86_round_pdr(maxkvaddr);
4320 	old = nkptp[PTP_LEVELS - 1];
4321 	/*
4322 	 * This loop could be optimized more, but pmap_growkernel()
4323 	 * is called infrequently.
4324 	 */
4325 	for (i = PTP_LEVELS - 1; i >= 1; i--) {
4326 		target_nptp = pl_i_roundup(maxkvaddr, i + 1) -
4327 		    pl_i_roundup(VM_MIN_KERNEL_ADDRESS, i + 1);
4328 		/*
4329 		 * XXX only need to check toplevel.
4330 		 */
4331 		if (target_nptp > nkptpmax[i])
4332 			panic("out of KVA space");
4333 		KASSERT(target_nptp >= nkptp[i]);
4334 		needed_kptp[i] = target_nptp - nkptp[i];
4335 	}
4336 
4337 	pmap_alloc_level(normal_pdes, pmap_maxkvaddr, PTP_LEVELS, needed_kptp);
4338 
4339 	/*
4340 	 * If the number of top level entries changed, update all
4341 	 * pmaps.
4342 	 */
4343 	if (needed_kptp[PTP_LEVELS - 1] != 0) {
4344 #ifdef XEN
4345 #ifdef __x86_64__
4346 		/* nothing, kernel entries are never entered in user pmap */
4347 #else /* __x86_64__ */
4348 		mutex_enter(&pmaps_lock);
4349 		LIST_FOREACH(pm, &pmaps, pm_list) {
4350 			int pdkidx;
4351 			for (pdkidx =  PDIR_SLOT_KERN + old;
4352 			    pdkidx < PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1];
4353 			    pdkidx++) {
4354 				xpq_queue_pte_update(
4355 				    xpmap_ptom(pmap_pdirpa(pm, pdkidx)),
4356 				    kpm->pm_pdir[pdkidx]);
4357 			}
4358 			xpq_flush_queue();
4359 		}
4360 		mutex_exit(&pmaps_lock);
4361 #endif /* __x86_64__ */
4362 #else /* XEN */
4363 		unsigned newpdes;
4364 		newpdes = nkptp[PTP_LEVELS - 1] - old;
4365 		mutex_enter(&pmaps_lock);
4366 		LIST_FOREACH(pm, &pmaps, pm_list) {
4367 			memcpy(&pm->pm_pdir[PDIR_SLOT_KERN + old],
4368 			       &kpm->pm_pdir[PDIR_SLOT_KERN + old],
4369 			       newpdes * sizeof (pd_entry_t));
4370 		}
4371 		mutex_exit(&pmaps_lock);
4372 #endif
4373 		invalidate = true;
4374 	}
4375 	pmap_maxkvaddr = maxkvaddr;
4376 	mutex_exit(&kpm->pm_lock);
4377 	splx(s);
4378 
4379 	if (invalidate) {
4380 		/* Invalidate the PDP cache. */
4381 		pool_cache_invalidate(&pmap_pdp_cache);
4382 	}
4383 
4384 	return maxkvaddr;
4385 }
4386 
4387 #ifdef DEBUG
4388 void pmap_dump(struct pmap *, vaddr_t, vaddr_t);
4389 
4390 /*
4391  * pmap_dump: dump all the mappings from a pmap
4392  *
4393  * => caller should not be holding any pmap locks
4394  */
4395 
4396 void
4397 pmap_dump(struct pmap *pmap, vaddr_t sva, vaddr_t eva)
4398 {
4399 	pt_entry_t *ptes, *pte;
4400 	pd_entry_t * const *pdes;
4401 	struct pmap *pmap2;
4402 	vaddr_t blkendva;
4403 
4404 	/*
4405 	 * if end is out of range truncate.
4406 	 * if (end == start) update to max.
4407 	 */
4408 
4409 	if (eva > VM_MAXUSER_ADDRESS || eva <= sva)
4410 		eva = VM_MAXUSER_ADDRESS;
4411 
4412 	/*
4413 	 * we lock in the pmap => pv_head direction
4414 	 */
4415 
4416 	kpreempt_disable();
4417 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);	/* locks pmap */
4418 
4419 	/*
4420 	 * dumping a range of pages: we dump in PTP sized blocks (4MB)
4421 	 */
4422 
4423 	for (/* null */ ; sva < eva ; sva = blkendva) {
4424 
4425 		/* determine range of block */
4426 		blkendva = x86_round_pdr(sva+1);
4427 		if (blkendva > eva)
4428 			blkendva = eva;
4429 
4430 		/* valid block? */
4431 		if (!pmap_pdes_valid(sva, pdes, NULL))
4432 			continue;
4433 
4434 		pte = &ptes[pl1_i(sva)];
4435 		for (/* null */; sva < blkendva ; sva += PAGE_SIZE, pte++) {
4436 			if (!pmap_valid_entry(*pte))
4437 				continue;
4438 			printf("va %#" PRIxVADDR " -> pa %#" PRIxPADDR
4439 			    " (pte=%#" PRIxPADDR ")\n",
4440 			    sva, (paddr_t)pmap_pte2pa(*pte), (paddr_t)*pte);
4441 		}
4442 	}
4443 	pmap_unmap_ptes(pmap, pmap2);
4444 	kpreempt_enable();
4445 }
4446 #endif
4447 
4448 /*
4449  * pmap_tlb_shootdown: invalidate pages on all CPUs using pmap 'pm'
4450  *
4451  * => always invalidates locally before returning
4452  * => returns before remote CPUs have invalidated
4453  * => must be called with preemption disabled
4454  */
4455 
4456 void
4457 pmap_tlb_shootdown(struct pmap *pm, vaddr_t sva, vaddr_t eva, pt_entry_t pte)
4458 {
4459 #ifdef MULTIPROCESSOR
4460 	extern bool x86_mp_online;
4461 	struct cpu_info *ci;
4462 	struct pmap_mbox *mb, *selfmb;
4463 	CPU_INFO_ITERATOR cii;
4464 	uintptr_t head;
4465 	u_int count;
4466 	int s;
4467 #endif	/* MULTIPROCESSOR */
4468 	struct cpu_info *self;
4469 	bool kernel;
4470 
4471 	KASSERT(eva == 0 || eva >= sva);
4472 	KASSERT(kpreempt_disabled());
4473 
4474 	if (pte & PG_PS)
4475 		sva &= PG_LGFRAME;
4476 	pte &= PG_G;
4477 	self = curcpu();
4478 
4479 	if (sva == (vaddr_t)-1LL) {
4480 		kernel = true;
4481 	} else {
4482 		if (eva == 0)
4483 			eva = sva + PAGE_SIZE;
4484 		kernel = sva >= VM_MAXUSER_ADDRESS;
4485 		KASSERT(kernel == (eva > VM_MAXUSER_ADDRESS));
4486 	}
4487 
4488 	/*
4489 	 * if tearing down the pmap, do nothing.  we'll flush later
4490 	 * when we're ready to recycle/destroy it.
4491 	 */
4492 	if (__predict_false(curlwp->l_md.md_gc_pmap == pm)) {
4493 		return;
4494 	}
4495 
4496 	/*
4497 	 * If the range is larger than 32 pages, then invalidate
4498 	 * everything.
4499 	 */
4500 	if (sva != (vaddr_t)-1LL && eva - sva > (32 * PAGE_SIZE)) {
4501 		sva = (vaddr_t)-1LL;
4502 		eva = sva;
4503 	}
4504 
4505 #ifdef MULTIPROCESSOR
4506 	if (ncpu > 1 && x86_mp_online) {
4507 		selfmb = &self->ci_pmap_cpu->pc_mbox;
4508 
4509 		/*
4510 		 * If the CPUs have no notion of global pages then
4511 		 * reload of %cr3 is sufficient.
4512 		 */
4513 		if (pte != 0 && (cpu_feature[0] & CPUID_PGE) == 0)
4514 			pte = 0;
4515 
4516 		if (pm == pmap_kernel()) {
4517 			/*
4518 			 * Mapped on all CPUs: use the broadcast mechanism.
4519 			 * Once we have the lock, increment the counter.
4520 			 */
4521 			s = splvm();
4522 			mb = &pmap_mbox;
4523 			count = SPINLOCK_BACKOFF_MIN;
4524 			do {
4525 				if ((head = mb->mb_head) != mb->mb_tail) {
4526 					splx(s);
4527 					while ((head = mb->mb_head) !=
4528 					    mb->mb_tail)
4529 						SPINLOCK_BACKOFF(count);
4530 					s = splvm();
4531 				}
4532 			} while (atomic_cas_ulong(
4533 			    (volatile u_long *)&mb->mb_head,
4534 			    head, head + ncpu - 1) != head);
4535 
4536 			/*
4537 			 * Once underway we must stay at IPL_VM until the
4538 			 * IPI is dispatched.  Otherwise interrupt handlers
4539 			 * on this CPU can deadlock against us.
4540 			 */
4541 			pmap_tlb_evcnt.ev_count++;
4542 			mb->mb_pointer = self;
4543 			mb->mb_addr1 = sva;
4544 			mb->mb_addr2 = eva;
4545 			mb->mb_global = pte;
4546 			x86_ipi(LAPIC_TLB_BCAST_VECTOR, LAPIC_DEST_ALLEXCL,
4547 			    LAPIC_DLMODE_FIXED);
4548 			self->ci_need_tlbwait = 1;
4549 			splx(s);
4550 		} else if ((pm->pm_cpus & ~self->ci_cpumask) != 0 ||
4551 		    (kernel && (pm->pm_kernel_cpus & ~self->ci_cpumask) != 0)) {
4552 			/*
4553 			 * We don't bother traversing the CPU list if only
4554 			 * used by this CPU.
4555 			 *
4556 			 * We can't do global flushes with the multicast
4557 			 * mechanism.
4558 			 */
4559 			KASSERT(pte == 0);
4560 
4561 			/*
4562 			 * Take ownership of the shootdown mailbox on each
4563 			 * CPU, fill the details and fire it off.
4564 			 */
4565 			s = splvm();
4566 			for (CPU_INFO_FOREACH(cii, ci)) {
4567 				if (ci == self ||
4568 				    !pmap_is_active(pm, ci, kernel) ||
4569 				    !(ci->ci_flags & CPUF_RUNNING))
4570 					continue;
4571 				selfmb->mb_head++;
4572 				mb = &ci->ci_pmap_cpu->pc_mbox;
4573 				count = SPINLOCK_BACKOFF_MIN;
4574 				while (atomic_cas_ulong(
4575 				    (u_long *)&mb->mb_pointer,
4576 				    0, (u_long)&selfmb->mb_tail) != 0) {
4577 				    	splx(s);
4578 					while (mb->mb_pointer != 0)
4579 						SPINLOCK_BACKOFF(count);
4580 					s = splvm();
4581 				}
4582 				mb->mb_addr1 = sva;
4583 				mb->mb_addr2 = eva;
4584 				mb->mb_global = pte;
4585 				if (x86_ipi(LAPIC_TLB_MCAST_VECTOR,
4586 				    ci->ci_cpuid, LAPIC_DLMODE_FIXED))
4587 					panic("pmap_tlb_shootdown: ipi failed");
4588 			}
4589 			self->ci_need_tlbwait = 1;
4590 			splx(s);
4591 		}
4592 	}
4593 #endif	/* MULTIPROCESSOR */
4594 
4595 	/* Update the current CPU before waiting for others. */
4596 	if (!pmap_is_active(pm, self, kernel))
4597 		return;
4598 
4599 	if (sva == (vaddr_t)-1LL) {
4600 		u_int gen = uvm_emap_gen_return();
4601 		if (pte != 0) {
4602 			tlbflushg();
4603 		} else {
4604 			tlbflush();
4605 		}
4606 		uvm_emap_update(gen);
4607 	} else {
4608 		do {
4609 			pmap_update_pg(sva);
4610 			sva += PAGE_SIZE;
4611 		} while (sva < eva);
4612 	}
4613 }
4614 
4615 /*
4616  * pmap_tlb_shootwait: wait for pending TLB shootdowns to complete
4617  *
4618  * => only waits for operations generated by the current CPU
4619  * => must be called with preemption disabled
4620  */
4621 
4622 void
4623 pmap_tlb_shootwait(void)
4624 {
4625 	struct cpu_info *self;
4626 	struct pmap_mbox *mb;
4627 
4628 	KASSERT(kpreempt_disabled());
4629 
4630 	/*
4631 	 * Anything to do?  XXX Really we want to avoid touching the cache
4632 	 * lines of the two mailboxes, but the processor may read ahead.
4633 	 */
4634 	self = curcpu();
4635 	if (!self->ci_need_tlbwait)
4636 		return;
4637 	self->ci_need_tlbwait = 0;
4638 
4639 	/* If we own the global mailbox, wait for it to drain. */
4640 	mb = &pmap_mbox;
4641 	while (mb->mb_pointer == self && mb->mb_head != mb->mb_tail)
4642 		x86_pause();
4643 
4644 	/* If we own other CPU's mailboxes, wait for them to drain. */
4645 	mb = &self->ci_pmap_cpu->pc_mbox;
4646 	KASSERT(mb->mb_pointer != &mb->mb_tail);
4647 	while (mb->mb_head != mb->mb_tail)
4648 		x86_pause();
4649 }
4650 
4651 /*
4652  * pmap_update: process deferred invalidations
4653  */
4654 
4655 void
4656 pmap_update(struct pmap *pmap)
4657 {
4658 	struct vm_page *ptp, *empty_ptps;
4659 	struct pmap_page *pp;
4660 	lwp_t *l;
4661 
4662 	/*
4663 	 * if we have torn down this pmap, invalidate non-global TLB
4664 	 * entries on any processors using it.
4665 	 */
4666 	l = curlwp;
4667 	if (__predict_false(l->l_md.md_gc_pmap == pmap)) {
4668 		l->l_md.md_gc_pmap = NULL;
4669 		KPREEMPT_DISABLE(l);
4670 		pmap_tlb_shootdown(pmap, -1, -1, 0);
4671 		KPREEMPT_ENABLE(l);
4672 	}
4673 
4674 	/*
4675 	 * wait for tlb shootdowns to complete before returning control
4676 	 * to the caller.
4677 	 */
4678 	kpreempt_disable();
4679 	pmap_tlb_shootwait();
4680 	kpreempt_enable();
4681 
4682 	/*
4683 	 * now that shootdowns are complete, process deferred frees,
4684 	 * but not from interrupt context.
4685 	 */
4686 	if (l->l_md.md_gc_ptp != NULL) {
4687 		KASSERT((l->l_pflag & LP_INTR) == 0);
4688 		if (cpu_intr_p()) {
4689 			return;
4690 		}
4691 
4692 		empty_ptps = l->l_md.md_gc_ptp;
4693 		l->l_md.md_gc_ptp = NULL;
4694 
4695 		while ((ptp = empty_ptps) != NULL) {
4696 			ptp->flags |= PG_ZERO;
4697 			pp = VM_PAGE_TO_PP(ptp);
4698 			empty_ptps = pp->pp_link;
4699 			LIST_INIT(&pp->pp_head.pvh_list);
4700 			uvm_pagefree(ptp);
4701 		}
4702 	}
4703 }
4704 
4705 #if PTP_LEVELS > 4
4706 #error "Unsupported number of page table mappings"
4707 #endif
4708 
4709 paddr_t
4710 pmap_init_tmp_pgtbl(paddr_t pg)
4711 {
4712 	static bool maps_loaded;
4713 	static const paddr_t x86_tmp_pml_paddr[] = {
4714 	    4 * PAGE_SIZE,
4715 	    5 * PAGE_SIZE,
4716 	    6 * PAGE_SIZE,
4717 	    7 * PAGE_SIZE
4718 	};
4719 	static vaddr_t x86_tmp_pml_vaddr[] = { 0, 0, 0, 0 };
4720 
4721 	pd_entry_t *tmp_pml, *kernel_pml;
4722 
4723 	int level;
4724 
4725 	if (!maps_loaded) {
4726 		for (level = 0; level < PTP_LEVELS; ++level) {
4727 			x86_tmp_pml_vaddr[level] =
4728 			    uvm_km_alloc(kernel_map, PAGE_SIZE, 0,
4729 			    UVM_KMF_VAONLY);
4730 
4731 			if (x86_tmp_pml_vaddr[level] == 0)
4732 				panic("mapping of real mode PML failed\n");
4733 			pmap_kenter_pa(x86_tmp_pml_vaddr[level],
4734 			    x86_tmp_pml_paddr[level],
4735 			    VM_PROT_READ | VM_PROT_WRITE, 0);
4736 			pmap_update(pmap_kernel());
4737 		}
4738 		maps_loaded = true;
4739 	}
4740 
4741 	/* Zero levels 1-3 */
4742 	for (level = 0; level < PTP_LEVELS - 1; ++level) {
4743 		tmp_pml = (void *)x86_tmp_pml_vaddr[level];
4744 		memset(tmp_pml, 0, PAGE_SIZE);
4745 	}
4746 
4747 	/* Copy PML4 */
4748 	kernel_pml = pmap_kernel()->pm_pdir;
4749 	tmp_pml = (void *)x86_tmp_pml_vaddr[PTP_LEVELS - 1];
4750 	memcpy(tmp_pml, kernel_pml, PAGE_SIZE);
4751 
4752 #ifdef PAE
4753 	/*
4754 	 * Use the last 4 entries of the L2 page as L3 PD entries. These
4755 	 * last entries are unlikely to be used for temporary mappings.
4756 	 * 508: maps 0->1GB (userland)
4757 	 * 509: unused
4758 	 * 510: unused
4759 	 * 511: maps 3->4GB (kernel)
4760 	 */
4761 	tmp_pml[508] = x86_tmp_pml_paddr[PTP_LEVELS - 1] | PG_V;
4762 	tmp_pml[509] = 0;
4763 	tmp_pml[510] = 0;
4764 	tmp_pml[511] = pmap_pdirpa(pmap_kernel(),PDIR_SLOT_KERN) | PG_V;
4765 #endif
4766 
4767 	for (level = PTP_LEVELS - 1; level > 0; --level) {
4768 		tmp_pml = (void *)x86_tmp_pml_vaddr[level];
4769 
4770 		tmp_pml[pl_i(pg, level + 1)] =
4771 		    (x86_tmp_pml_paddr[level - 1] & PG_FRAME) | PG_RW | PG_V;
4772 	}
4773 
4774 	tmp_pml = (void *)x86_tmp_pml_vaddr[0];
4775 	tmp_pml[pl_i(pg, 1)] = (pg & PG_FRAME) | PG_RW | PG_V;
4776 
4777 #ifdef PAE
4778 	/* Return the PA of the L3 page (entry 508 of the L2 page) */
4779 	return x86_tmp_pml_paddr[PTP_LEVELS - 1] + 508 * sizeof(pd_entry_t);
4780 #endif
4781 
4782 	return x86_tmp_pml_paddr[PTP_LEVELS - 1];
4783 }
4784 
4785 u_int
4786 x86_mmap_flags(paddr_t mdpgno)
4787 {
4788 	u_int nflag = (mdpgno >> X86_MMAP_FLAG_SHIFT) & X86_MMAP_FLAG_MASK;
4789 	u_int pflag = 0;
4790 
4791 	if (nflag & X86_MMAP_FLAG_PREFETCH)
4792 		pflag |= PMAP_WRITE_COMBINE;
4793 
4794 	return pflag;
4795 }
4796