xref: /netbsd-src/sys/arch/x86/x86/pmap.c (revision ca453df649ce9db45b64d73678ba06cbccf9aa11)
1 /*	$NetBSD: pmap.c,v 1.127 2011/07/05 14:07:12 yamt Exp $	*/
2 
3 /*-
4  * Copyright (c) 2008, 2010 The NetBSD Foundation, Inc.
5  * All rights reserved.
6  *
7  * This code is derived from software contributed to The NetBSD Foundation
8  * by Andrew Doran.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29  * POSSIBILITY OF SUCH DAMAGE.
30  */
31 
32 /*
33  * Copyright (c) 2007 Manuel Bouyer.
34  *
35  * Redistribution and use in source and binary forms, with or without
36  * modification, are permitted provided that the following conditions
37  * are met:
38  * 1. Redistributions of source code must retain the above copyright
39  *    notice, this list of conditions and the following disclaimer.
40  * 2. Redistributions in binary form must reproduce the above copyright
41  *    notice, this list of conditions and the following disclaimer in the
42  *    documentation and/or other materials provided with the distribution.
43  *
44  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
45  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
46  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
47  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
48  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
49  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
50  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
51  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
52  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
53  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
54  *
55  */
56 
57 /*
58  * Copyright (c) 2006 Mathieu Ropert <mro@adviseo.fr>
59  *
60  * Permission to use, copy, modify, and distribute this software for any
61  * purpose with or without fee is hereby granted, provided that the above
62  * copyright notice and this permission notice appear in all copies.
63  *
64  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
65  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
66  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
67  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
68  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
69  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
70  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
71  */
72 
73 /*
74  * Copyright (c) 1997 Charles D. Cranor and Washington University.
75  * All rights reserved.
76  *
77  * Redistribution and use in source and binary forms, with or without
78  * modification, are permitted provided that the following conditions
79  * are met:
80  * 1. Redistributions of source code must retain the above copyright
81  *    notice, this list of conditions and the following disclaimer.
82  * 2. Redistributions in binary form must reproduce the above copyright
83  *    notice, this list of conditions and the following disclaimer in the
84  *    documentation and/or other materials provided with the distribution.
85  *
86  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
87  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
88  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
89  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
90  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
91  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
92  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
93  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
94  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
95  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
96  */
97 
98 /*
99  * Copyright 2001 (c) Wasabi Systems, Inc.
100  * All rights reserved.
101  *
102  * Written by Frank van der Linden for Wasabi Systems, Inc.
103  *
104  * Redistribution and use in source and binary forms, with or without
105  * modification, are permitted provided that the following conditions
106  * are met:
107  * 1. Redistributions of source code must retain the above copyright
108  *    notice, this list of conditions and the following disclaimer.
109  * 2. Redistributions in binary form must reproduce the above copyright
110  *    notice, this list of conditions and the following disclaimer in the
111  *    documentation and/or other materials provided with the distribution.
112  * 3. All advertising materials mentioning features or use of this software
113  *    must display the following acknowledgement:
114  *      This product includes software developed for the NetBSD Project by
115  *      Wasabi Systems, Inc.
116  * 4. The name of Wasabi Systems, Inc. may not be used to endorse
117  *    or promote products derived from this software without specific prior
118  *    written permission.
119  *
120  * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
121  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
122  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
123  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL WASABI SYSTEMS, INC
124  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
125  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
126  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
127  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
128  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
129  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
130  * POSSIBILITY OF SUCH DAMAGE.
131  */
132 
133 /*
134  * This is the i386 pmap modified and generalized to support x86-64
135  * as well. The idea is to hide the upper N levels of the page tables
136  * inside pmap_get_ptp, pmap_free_ptp and pmap_growkernel. The rest
137  * is mostly untouched, except that it uses some more generalized
138  * macros and interfaces.
139  *
140  * This pmap has been tested on the i386 as well, and it can be easily
141  * adapted to PAE.
142  *
143  * fvdl@wasabisystems.com 18-Jun-2001
144  */
145 
146 /*
147  * pmap.c: i386 pmap module rewrite
148  * Chuck Cranor <chuck@netbsd>
149  * 11-Aug-97
150  *
151  * history of this pmap module: in addition to my own input, i used
152  *    the following references for this rewrite of the i386 pmap:
153  *
154  * [1] the NetBSD i386 pmap.   this pmap appears to be based on the
155  *     BSD hp300 pmap done by Mike Hibler at University of Utah.
156  *     it was then ported to the i386 by William Jolitz of UUNET
157  *     Technologies, Inc.   Then Charles M. Hannum of the NetBSD
158  *     project fixed some bugs and provided some speed ups.
159  *
160  * [2] the FreeBSD i386 pmap.   this pmap seems to be the
161  *     Hibler/Jolitz pmap, as modified for FreeBSD by John S. Dyson
162  *     and David Greenman.
163  *
164  * [3] the Mach pmap.   this pmap, from CMU, seems to have migrated
165  *     between several processors.   the VAX version was done by
166  *     Avadis Tevanian, Jr., and Michael Wayne Young.    the i386
167  *     version was done by Lance Berc, Mike Kupfer, Bob Baron,
168  *     David Golub, and Richard Draves.    the alpha version was
169  *     done by Alessandro Forin (CMU/Mach) and Chris Demetriou
170  *     (NetBSD/alpha).
171  */
172 
173 #include <sys/cdefs.h>
174 __KERNEL_RCSID(0, "$NetBSD: pmap.c,v 1.127 2011/07/05 14:07:12 yamt Exp $");
175 
176 #include "opt_user_ldt.h"
177 #include "opt_lockdebug.h"
178 #include "opt_multiprocessor.h"
179 #include "opt_xen.h"
180 #if !defined(__x86_64__)
181 #include "opt_kstack_dr0.h"
182 #endif /* !defined(__x86_64__) */
183 
184 #include <sys/param.h>
185 #include <sys/systm.h>
186 #include <sys/proc.h>
187 #include <sys/pool.h>
188 #include <sys/kernel.h>
189 #include <sys/atomic.h>
190 #include <sys/cpu.h>
191 #include <sys/intr.h>
192 #include <sys/xcall.h>
193 
194 #include <uvm/uvm.h>
195 
196 #include <dev/isa/isareg.h>
197 
198 #include <machine/specialreg.h>
199 #include <machine/gdt.h>
200 #include <machine/isa_machdep.h>
201 #include <machine/cpuvar.h>
202 
203 #include <x86/pmap.h>
204 #include <x86/pmap_pv.h>
205 
206 #include <x86/i82489reg.h>
207 #include <x86/i82489var.h>
208 
209 #ifdef XEN
210 #include <xen/xen3-public/xen.h>
211 #include <xen/hypervisor.h>
212 #endif
213 
214 /* flag to be used for kernel mappings: PG_u on Xen/amd64, 0 otherwise */
215 #if defined(XEN) && defined(__x86_64__)
216 #define PG_k PG_u
217 #else
218 #define PG_k 0
219 #endif
220 
221 /*
222  * general info:
223  *
224  *  - for an explanation of how the i386 MMU hardware works see
225  *    the comments in <machine/pte.h>.
226  *
227  *  - for an explanation of the general memory structure used by
228  *    this pmap (including the recursive mapping), see the comments
229  *    in <machine/pmap.h>.
230  *
231  * this file contains the code for the "pmap module."   the module's
232  * job is to manage the hardware's virtual to physical address mappings.
233  * note that there are two levels of mapping in the VM system:
234  *
235  *  [1] the upper layer of the VM system uses vm_map's and vm_map_entry's
236  *      to map ranges of virtual address space to objects/files.  for
237  *      example, the vm_map may say: "map VA 0x1000 to 0x22000 read-only
238  *      to the file /bin/ls starting at offset zero."   note that
239  *      the upper layer mapping is not concerned with how individual
240  *      vm_pages are mapped.
241  *
242  *  [2] the lower layer of the VM system (the pmap) maintains the mappings
243  *      from virtual addresses.   it is concerned with which vm_page is
244  *      mapped where.   for example, when you run /bin/ls and start
245  *      at page 0x1000 the fault routine may lookup the correct page
246  *      of the /bin/ls file and then ask the pmap layer to establish
247  *      a mapping for it.
248  *
249  * note that information in the lower layer of the VM system can be
250  * thrown away since it can easily be reconstructed from the info
251  * in the upper layer.
252  *
253  * data structures we use include:
254  *
255  *  - struct pmap: describes the address space of one thread
256  *  - struct pv_entry: describes one <PMAP,VA> mapping of a PA
257  *  - struct pv_head: there is one pv_head per managed page of
258  *	physical memory.   the pv_head points to a list of pv_entry
259  *	structures which describe all the <PMAP,VA> pairs that this
260  *      page is mapped in.    this is critical for page based operations
261  *      such as pmap_page_protect() [change protection on _all_ mappings
262  *      of a page]
263  */
264 
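/*
 * Illustrative sketch (not compiled): walking every <PMAP,VA> mapping of
 * a managed page with the pv_pte_first()/pv_pte_next() iterators defined
 * later in this file.  "pg" and the loop body are hypothetical; real
 * callers also take the locks described in the locking section below.
 *
 *	struct pmap_page *pp = VM_PAGE_TO_PP(pg);
 *	struct pv_pte *pvpte;
 *
 *	for (pvpte = pv_pte_first(pp); pvpte != NULL;
 *	     pvpte = pv_pte_next(pp, pvpte)) {
 *		// pvpte->pte_ptp is the PTP the mapping lives in
 *		// (NULL for the kernel pmap), pvpte->pte_va the VA.
 *	}
 */
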
265 /*
266  * memory allocation
267  *
268  *  - there are three data structures that we must dynamically allocate:
269  *
270  * [A] new process' page directory page (PDP)
271  *	- plan 1: done at pmap_create() we use
272  *	  uvm_km_alloc(kernel_map, PAGE_SIZE)  [fka kmem_alloc] to do this
273  *	  allocation.
274  *
275  * if we are low in free physical memory then we sleep in
276  * uvm_km_alloc -- in this case this is ok since we are creating
277  * a new pmap and should not be holding any locks.
278  *
279  * if the kernel is totally out of virtual space
280  * (i.e. uvm_km_alloc returns NULL), then we panic.
281  *
282  * [B] new page tables pages (PTP)
283  * 	- call uvm_pagealloc()
284  * 		=> success: zero page, add to pm_pdir
285  * 		=> failure: we are out of free vm_pages, let pmap_enter()
286  *		   tell UVM about it.
287  *
288  * note: for kernel PTPs, we start with NKPTP of them.   as we map
289  * kernel memory (at uvm_map time) we check to see if we've grown
290  * the kernel pmap.   if so, we call the optional function
291  * pmap_growkernel() to grow the kernel PTPs in advance.
292  *
293  * [C] pv_entry structures
294  */
295 
296 /*
297  * locking
298  *
299  * we have the following locks that we must contend with:
300  *
301  * mutexes:
302  *
303  * - pmap lock (per pmap, part of uvm_object)
304  *   this lock protects the fields in the pmap structure including
305  *   the non-kernel PDEs in the PDP, and the PTEs.  it also locks
306  *   in the alternate PTE space (since that is determined by the
307  *   entry in the PDP).
308  *
309  * - pvh_lock (per pv_head)
310  *   this lock protects the pv_entry list which is chained off the
311  *   pv_head structure for a specific managed PA.   it is locked
312  *   when traversing the list (e.g. adding/removing mappings,
313  *   syncing R/M bits, etc.)
314  *
315  * - pmaps_lock
316  *   this lock protects the list of active pmaps (headed by "pmaps").
317  *   we lock it when adding or removing pmaps from this list.
318  */
319 
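/*
 * Illustrative sketch (not compiled) of the ordering described above, as
 * used by the routines later in this file: the per-pmap lock is taken
 * first, and the pv_entry locks (implemented below as a hashed array of
 * spin mutexes, see pvhash_lock()) are only held across short list
 * operations while it is held.
 *
 *	mutex_enter(pmap->pm_lock);		// pmap fields, PDEs, PTEs
 *	...
 *	lock = pvhash_lock(pvhash_hash(ptp, va));
 *	mutex_spin_enter(lock);			// pv_entry hash chain
 *	...
 *	mutex_spin_exit(lock);
 *	...
 *	mutex_exit(pmap->pm_lock);
 *
 * pmaps_lock is independent and is only taken while adding a pmap to or
 * removing a pmap from the global "pmaps" list.
 */
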
320 const vaddr_t ptp_masks[] = PTP_MASK_INITIALIZER;
321 const int ptp_shifts[] = PTP_SHIFT_INITIALIZER;
322 const long nkptpmax[] = NKPTPMAX_INITIALIZER;
323 const long nbpd[] = NBPD_INITIALIZER;
324 pd_entry_t * const normal_pdes[] = PDES_INITIALIZER;
325 
326 long nkptp[] = NKPTP_INITIALIZER;
327 
328 static kmutex_t pmaps_lock;
329 
330 static vaddr_t pmap_maxkvaddr;
331 
332 /*
333  * XXX kludge: dummy locking to make KASSERTs in uvm_page.c comfortable.
334  * actual locking is done by pm_lock.
335  */
336 #if defined(DIAGNOSTIC)
337 #define	PMAP_SUBOBJ_LOCK(pm, idx) \
338 	KASSERT(mutex_owned((pm)->pm_lock)); \
339 	if ((idx) != 0) \
340 		mutex_enter((pm)->pm_obj[(idx)].vmobjlock)
341 #define	PMAP_SUBOBJ_UNLOCK(pm, idx) \
342 	KASSERT(mutex_owned((pm)->pm_lock)); \
343 	if ((idx) != 0) \
344 		mutex_exit((pm)->pm_obj[(idx)].vmobjlock)
345 #else /* defined(DIAGNOSTIC) */
346 #define	PMAP_SUBOBJ_LOCK(pm, idx)	/* nothing */
347 #define	PMAP_SUBOBJ_UNLOCK(pm, idx)	/* nothing */
348 #endif /* defined(DIAGNOSTIC) */
349 
350 /*
351  * Misc. event counters.
352  */
353 struct evcnt pmap_iobmp_evcnt;
354 struct evcnt pmap_ldt_evcnt;
355 
356 /*
357  * PAT
358  */
359 #define	PATENTRY(n, type)	(type << ((n) * 8))
360 #define	PAT_UC		0x0ULL
361 #define	PAT_WC		0x1ULL
362 #define	PAT_WT		0x4ULL
363 #define	PAT_WP		0x5ULL
364 #define	PAT_WB		0x6ULL
365 #define	PAT_UCMINUS	0x7ULL
366 
367 static bool cpu_pat_enabled = false;
368 
369 /*
370  * global data structures
371  */
372 
373 static struct pmap kernel_pmap_store;	/* the kernel's pmap (proc0) */
374 struct pmap *const kernel_pmap_ptr = &kernel_pmap_store;
375 
376 /*
377  * pmap_pg_g: if our processor supports PG_G in the PTE then we
378  * set pmap_pg_g to PG_G (otherwise it is zero).
379  */
380 
381 int pmap_pg_g = 0;
382 
383 /*
384  * pmap_largepages: if our processor supports PG_PS and we are
385  * using it, this is set to true.
386  */
387 
388 int pmap_largepages;
389 
390 /*
391  * i386 physical memory comes in a big contig chunk with a small
392  * hole toward the front of it...  the following two paddr_t's
393  * (shared with machdep.c) describe the physical address space
394  * of this machine.
395  */
396 paddr_t avail_start;	/* PA of first available physical page */
397 paddr_t avail_end;	/* PA of last available physical page */
398 
399 #ifdef XEN
400 #ifdef __x86_64__
401 /* Dummy PGD for user cr3, used between pmap_deactivate() and pmap_activate() */
402 static paddr_t xen_dummy_user_pgd;
403 #endif /* __x86_64__ */
404 paddr_t pmap_pa_start; /* PA of first physical page for this domain */
405 paddr_t pmap_pa_end;   /* PA of last physical page for this domain */
406 #endif /* XEN */
407 
408 #define	VM_PAGE_TO_PP(pg)	(&(pg)->mdpage.mp_pp)
409 
410 #define	PV_HASH_SIZE		32768
411 #define	PV_HASH_LOCK_CNT	32
412 
413 struct pv_hash_lock {
414 	kmutex_t lock;
415 } __aligned(CACHE_LINE_SIZE) pv_hash_locks[PV_HASH_LOCK_CNT]
416     __aligned(CACHE_LINE_SIZE);
417 
418 struct pv_hash_head {
419 	SLIST_HEAD(, pv_entry) hh_list;
420 } pv_hash_heads[PV_HASH_SIZE];
421 
422 static u_int
423 pvhash_hash(struct vm_page *ptp, vaddr_t va)
424 {
425 
426 	return (uintptr_t)ptp / sizeof(*ptp) + (va >> PAGE_SHIFT);
427 }
428 
429 static struct pv_hash_head *
430 pvhash_head(u_int hash)
431 {
432 
433 	return &pv_hash_heads[hash % PV_HASH_SIZE];
434 }
435 
436 static kmutex_t *
437 pvhash_lock(u_int hash)
438 {
439 
440 	return &pv_hash_locks[hash % PV_HASH_LOCK_CNT].lock;
441 }
442 
443 static struct pv_entry *
444 pvhash_remove(struct pv_hash_head *hh, struct vm_page *ptp, vaddr_t va)
445 {
446 	struct pv_entry *pve;
447 	struct pv_entry *prev;
448 
449 	prev = NULL;
450 	SLIST_FOREACH(pve, &hh->hh_list, pve_hash) {
451 		if (pve->pve_pte.pte_ptp == ptp &&
452 		    pve->pve_pte.pte_va == va) {
453 			if (prev != NULL) {
454 				SLIST_REMOVE_AFTER(prev, pve_hash);
455 			} else {
456 				SLIST_REMOVE_HEAD(&hh->hh_list, pve_hash);
457 			}
458 			break;
459 		}
460 		prev = pve;
461 	}
462 	return pve;
463 }
464 
465 /*
466  * other data structures
467  */
468 
469 static pt_entry_t protection_codes[8];	/* maps MI prot to i386 prot code */
470 static bool pmap_initialized = false;	/* pmap_init done yet? */
471 
472 /*
473  * the following two vaddr_t's are used during system startup
474  * to keep track of how much of the kernel's VM space we have used.
475  * once the system is started, the management of the remaining kernel
476  * VM space is turned over to the kernel_map vm_map.
477  */
478 
479 static vaddr_t virtual_avail;	/* VA of first free KVA */
480 static vaddr_t virtual_end;	/* VA of last free KVA */
481 
482 /*
483  * linked list of all non-kernel pmaps
484  */
485 
486 static struct pmap_head pmaps;
487 
488 /*
489  * pool that pmap structures are allocated from
490  */
491 
492 static struct pool_cache pmap_cache;
493 
494 /*
495  * pv_entry cache
496  */
497 
498 static struct pool_cache pmap_pv_cache;
499 
500 /*
501  * MULTIPROCESSOR: special VA's/ PTE's are actually allocated inside a
502  * maxcpus*NPTECL array of PTE's, to avoid cache line thrashing
503  * due to false sharing.
504  */
505 
506 #ifdef MULTIPROCESSOR
507 #define PTESLEW(pte, id) ((pte)+(id)*NPTECL)
508 #define VASLEW(va,id) ((va)+(id)*NPTECL*PAGE_SIZE)
509 #else
510 #define PTESLEW(pte, id) (pte)
511 #define VASLEW(va,id) (va)
512 #endif
513 
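/*
 * Illustrative sketch (not compiled): how a page-zeroing helper would use
 * the slewed special VAs/PTEs declared below.  Each CPU indexes into its
 * own cache line of PTEs so concurrent users do not false-share.  The
 * local names are hypothetical, and real callers run with kernel
 * preemption disabled.
 *
 *	int id = cpu_number();
 *	pt_entry_t *zpte = PTESLEW(zero_pte, id);
 *	void *zva = VASLEW(zerop, id);
 *
 *	pmap_pte_set(zpte, pmap_pa2pte(pa) | PG_V | PG_RW); // map the page
 *	pmap_pte_flush();
 *	pmap_update_pg((vaddr_t)zva);
 *	memset(zva, 0, PAGE_SIZE);
 *	pmap_pte_set(zpte, 0);				// unmap again
 *	pmap_pte_flush();
 */
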
514 /*
515  * special VAs and the PTEs that map them
516  */
517 static pt_entry_t *csrc_pte, *cdst_pte, *zero_pte, *ptp_pte, *early_zero_pte;
518 static char *csrcp, *cdstp, *zerop, *ptpp, *early_zerop;
519 
520 int pmap_enter_default(pmap_t, vaddr_t, paddr_t, vm_prot_t, u_int);
521 
522 /*
523  * pool and cache that PDPs are allocated from
524  */
525 
526 static struct pool_cache pmap_pdp_cache;
527 int	pmap_pdp_ctor(void *, void *, int);
528 void	pmap_pdp_dtor(void *, void *);
529 #ifdef PAE
530 /* need to allocate items of 4 pages */
531 void *pmap_pdp_alloc(struct pool *, int);
532 void pmap_pdp_free(struct pool *, void *);
533 static struct pool_allocator pmap_pdp_allocator = {
534 	.pa_alloc = pmap_pdp_alloc,
535 	.pa_free = pmap_pdp_free,
536 	.pa_pagesz = PAGE_SIZE * PDP_SIZE,
537 };
538 #endif /* PAE */
539 
540 extern vaddr_t idt_vaddr;			/* we allocate IDT early */
541 extern paddr_t idt_paddr;
542 
543 #ifdef _LP64
544 extern vaddr_t lo32_vaddr;
545 extern vaddr_t lo32_paddr;
546 #endif
547 
548 extern int end;
549 
550 #ifdef i386
551 /* stuff to fix the pentium f00f bug */
552 extern vaddr_t pentium_idt_vaddr;
553 #endif
554 
555 
556 /*
557  * local prototypes
558  */
559 
560 static struct vm_page	*pmap_get_ptp(struct pmap *, vaddr_t,
561 				      pd_entry_t * const *);
562 static struct vm_page	*pmap_find_ptp(struct pmap *, vaddr_t, paddr_t, int);
563 static void		 pmap_freepage(struct pmap *, struct vm_page *, int);
564 static void		 pmap_free_ptp(struct pmap *, struct vm_page *,
565 				       vaddr_t, pt_entry_t *,
566 				       pd_entry_t * const *);
567 static bool		 pmap_is_active(struct pmap *, struct cpu_info *, bool);
568 static bool		 pmap_remove_pte(struct pmap *, struct vm_page *,
569 					 pt_entry_t *, vaddr_t,
570 					 struct pv_entry **);
571 static void		 pmap_remove_ptes(struct pmap *, struct vm_page *,
572 					  vaddr_t, vaddr_t, vaddr_t,
573 					  struct pv_entry **);
574 
575 static bool		 pmap_get_physpage(vaddr_t, int, paddr_t *);
576 static void		 pmap_alloc_level(pd_entry_t * const *, vaddr_t, int,
577 					  long *);
578 
579 static bool		 pmap_reactivate(struct pmap *);
580 
581 /*
582  * p m a p   h e l p e r   f u n c t i o n s
583  */
584 
585 static inline void
586 pmap_stats_update(struct pmap *pmap, int resid_diff, int wired_diff)
587 {
588 
589 	if (pmap == pmap_kernel()) {
590 		atomic_add_long(&pmap->pm_stats.resident_count, resid_diff);
591 		atomic_add_long(&pmap->pm_stats.wired_count, wired_diff);
592 	} else {
593 		KASSERT(mutex_owned(pmap->pm_lock));
594 		pmap->pm_stats.resident_count += resid_diff;
595 		pmap->pm_stats.wired_count += wired_diff;
596 	}
597 }
598 
599 static inline void
600 pmap_stats_update_bypte(struct pmap *pmap, pt_entry_t npte, pt_entry_t opte)
601 {
602 	int resid_diff = ((npte & PG_V) ? 1 : 0) - ((opte & PG_V) ? 1 : 0);
603 	int wired_diff = ((npte & PG_W) ? 1 : 0) - ((opte & PG_W) ? 1 : 0);
604 
605 	KASSERT((npte & (PG_V | PG_W)) != PG_W);
606 	KASSERT((opte & (PG_V | PG_W)) != PG_W);
607 
608 	pmap_stats_update(pmap, resid_diff, wired_diff);
609 }
610 
611 /*
612  * ptp_to_pmap: lookup pmap by ptp
613  */
614 
615 static struct pmap *
616 ptp_to_pmap(struct vm_page *ptp)
617 {
618 	struct pmap *pmap;
619 
620 	if (ptp == NULL) {
621 		return pmap_kernel();
622 	}
623 	pmap = (struct pmap *)ptp->uobject;
624 	KASSERT(pmap != NULL);
625 	KASSERT(&pmap->pm_obj[0] == ptp->uobject);
626 	return pmap;
627 }
628 
629 static inline struct pv_pte *
630 pve_to_pvpte(struct pv_entry *pve)
631 {
632 
633 	KASSERT((void *)&pve->pve_pte == (void *)pve);
634 	return &pve->pve_pte;
635 }
636 
637 static inline struct pv_entry *
638 pvpte_to_pve(struct pv_pte *pvpte)
639 {
640 	struct pv_entry *pve = (void *)pvpte;
641 
642 	KASSERT(pve_to_pvpte(pve) == pvpte);
643 	return pve;
644 }
645 
646 /*
647  * pv_pte_first, pv_pte_next: PV list iterator.
648  */
649 
650 static struct pv_pte *
651 pv_pte_first(struct pmap_page *pp)
652 {
653 
654 	if ((pp->pp_flags & PP_EMBEDDED) != 0) {
655 		return &pp->pp_pte;
656 	}
657 	return pve_to_pvpte(LIST_FIRST(&pp->pp_head.pvh_list));
658 }
659 
660 static struct pv_pte *
661 pv_pte_next(struct pmap_page *pp, struct pv_pte *pvpte)
662 {
663 
664 	KASSERT(pvpte != NULL);
665 	if (pvpte == &pp->pp_pte) {
666 		KASSERT((pp->pp_flags & PP_EMBEDDED) != 0);
667 		return NULL;
668 	}
669 	KASSERT((pp->pp_flags & PP_EMBEDDED) == 0);
670 	return pve_to_pvpte(LIST_NEXT(pvpte_to_pve(pvpte), pve_list));
671 }
672 
673 /*
674  * pmap_is_curpmap: is this pmap the one currently loaded [in %cr3]?
675  *		of course the kernel is always loaded
676  */
677 
678 bool
679 pmap_is_curpmap(struct pmap *pmap)
680 {
681 #if defined(XEN) && defined(__x86_64__)
682 	/*
683 	 * Only kernel pmap is physically loaded.
684 	 * User PGD may be active, but TLB will be flushed
685 	 * with HYPERVISOR_iret anyway, so let's say no
686 	 */
687 	return(pmap == pmap_kernel());
688 #else /* XEN && __x86_64__*/
689 	return((pmap == pmap_kernel()) ||
690 	       (pmap == curcpu()->ci_pmap));
691 #endif
692 }
693 
694 /*
695  * pmap_is_active: is this pmap loaded into the specified processor's %cr3?
696  */
697 
698 inline static bool
699 pmap_is_active(struct pmap *pmap, struct cpu_info *ci, bool kernel)
700 {
701 
702 	return (pmap == pmap_kernel() ||
703 	    (pmap->pm_cpus & ci->ci_cpumask) != 0 ||
704 	    (kernel && (pmap->pm_kernel_cpus & ci->ci_cpumask) != 0));
705 }
706 
707 /*
708  *	Add a reference to the specified pmap.
709  */
710 
711 void
712 pmap_reference(struct pmap *pmap)
713 {
714 
715 	atomic_inc_uint(&pmap->pm_obj[0].uo_refs);
716 }
717 
718 #ifndef XEN
719 
720 /*
721  * pmap_map_ptes: map a pmap's PTEs into KVM and lock them in
722  *
723  * => we lock enough pmaps to keep things locked in
724  * => must be undone with pmap_unmap_ptes before returning
725  */
726 
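/*
 * Typical calling pattern (illustrative sketch, not compiled): disable
 * preemption, map the target pmap's PTEs in, work on them through the
 * returned linear PTE/PDE pointers, then unmap.  "va" is hypothetical.
 *
 *	struct pmap *pmap2;
 *	pt_entry_t *ptes, *pte;
 *	pd_entry_t * const *pdes;
 *
 *	kpreempt_disable();
 *	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
 *	pte = &ptes[pl1_i(va)];		// linear view of the PTE for va
 *	...
 *	pmap_unmap_ptes(pmap, pmap2);
 *	kpreempt_enable();
 */
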
727 void
728 pmap_map_ptes(struct pmap *pmap, struct pmap **pmap2,
729 	      pd_entry_t **ptepp, pd_entry_t * const **pdeppp)
730 {
731 	struct pmap *curpmap;
732 	struct cpu_info *ci;
733 	uint32_t cpumask;
734 	lwp_t *l;
735 
736 	/* The kernel's pmap is always accessible. */
737 	if (pmap == pmap_kernel()) {
738 		*pmap2 = NULL;
739 		*ptepp = PTE_BASE;
740 		*pdeppp = normal_pdes;
741 		return;
742 	}
743 	KASSERT(kpreempt_disabled());
744 
745 	l = curlwp;
746  retry:
747 	mutex_enter(pmap->pm_lock);
748 	ci = curcpu();
749 	curpmap = ci->ci_pmap;
750 	if (vm_map_pmap(&l->l_proc->p_vmspace->vm_map) == pmap) {
751 		/* Our own pmap so just load it: easy. */
752 		if (__predict_false(ci->ci_want_pmapload)) {
753 			mutex_exit(pmap->pm_lock);
754 			pmap_load();
755 			goto retry;
756 		}
757 		KASSERT(pmap == curpmap);
758 	} else if (pmap == curpmap) {
759 		/*
760 		 * Already on the CPU: make it valid.  This is very
761 		 * often the case during exit(), when we have switched
762 		 * to the kernel pmap in order to destroy a user pmap.
763 		 */
764 		if (!pmap_reactivate(pmap)) {
765 			u_int gen = uvm_emap_gen_return();
766 			tlbflush();
767 			uvm_emap_update(gen);
768 		}
769 	} else {
770 		/*
771 		 * Toss current pmap from CPU, but keep ref to it.
772 		 * Can happen if we block during exit().
773 		 */
774 		cpumask = ci->ci_cpumask;
775 		atomic_and_32(&curpmap->pm_cpus, ~cpumask);
776 		atomic_and_32(&curpmap->pm_kernel_cpus, ~cpumask);
777 		ci->ci_pmap = pmap;
778 		ci->ci_tlbstate = TLBSTATE_VALID;
779 		atomic_or_32(&pmap->pm_cpus, cpumask);
780 		atomic_or_32(&pmap->pm_kernel_cpus, cpumask);
781 		cpu_load_pmap(pmap);
782 	}
783 	pmap->pm_ncsw = l->l_ncsw;
784 	*pmap2 = curpmap;
785 	*ptepp = PTE_BASE;
786 	*pdeppp = normal_pdes;
787 }
788 
789 /*
790  * pmap_unmap_ptes: unlock the PTE mapping of "pmap"
791  */
792 
793 void
794 pmap_unmap_ptes(struct pmap *pmap, struct pmap *pmap2)
795 {
796 	struct cpu_info *ci;
797 	struct pmap *mypmap;
798 
799 	KASSERT(kpreempt_disabled());
800 
801 	/* The kernel's pmap is always accessible. */
802 	if (pmap == pmap_kernel()) {
803 		return;
804 	}
805 
806 	/*
807 	 * We cannot tolerate context switches while mapped in.
808 	 * If it is our own pmap all we have to do is unlock.
809 	 */
810 	KASSERT(pmap->pm_ncsw == curlwp->l_ncsw);
811 	mypmap = vm_map_pmap(&curproc->p_vmspace->vm_map);
812 	if (pmap == mypmap) {
813 		mutex_exit(pmap->pm_lock);
814 		return;
815 	}
816 
817 	/*
818 	 * Mark whatever's on the CPU now as lazy and unlock.
819 	 * If the pmap was already installed, we are done.
820 	 */
821 	ci = curcpu();
822 	ci->ci_tlbstate = TLBSTATE_LAZY;
823 	ci->ci_want_pmapload = (mypmap != pmap_kernel());
824 	mutex_exit(pmap->pm_lock);
825 	if (pmap == pmap2) {
826 		return;
827 	}
828 
829 	/*
830 	 * We installed another pmap on the CPU.  Grab a reference to
831 	 * it and leave in place.  Toss the evicted pmap (can block).
832 	 */
833 	pmap_reference(pmap);
834 	pmap_destroy(pmap2);
835 }
836 
837 #endif
838 
839 inline static void
840 pmap_exec_account(struct pmap *pm, vaddr_t va, pt_entry_t opte, pt_entry_t npte)
841 {
842 
843 #if !defined(__x86_64__)
844 	if (curproc == NULL || curproc->p_vmspace == NULL ||
845 	    pm != vm_map_pmap(&curproc->p_vmspace->vm_map))
846 		return;
847 
848 	if ((opte ^ npte) & PG_X)
849 		pmap_update_pg(va);
850 
851 	/*
852 	 * Executability was removed on the last executable change.
853 	 * Reset the code segment to something conservative and
854 	 * let the trap handler deal with setting the right limit.
855 	 * We can't do that because of locking constraints on the vm map.
856 	 */
857 
858 	if ((opte & PG_X) && (npte & PG_X) == 0 && va == pm->pm_hiexec) {
859 		struct trapframe *tf = curlwp->l_md.md_regs;
860 
861 		tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL);
862 		pm->pm_hiexec = I386_MAX_EXE_ADDR;
863 	}
864 #endif /* !defined(__x86_64__) */
865 }
866 
867 #if !defined(__x86_64__)
868 /*
869  * Fixup the code segment to cover all potential executable mappings.
870  * returns 0 if no changes to the code segment were made.
871  */
872 
873 int
874 pmap_exec_fixup(struct vm_map *map, struct trapframe *tf, struct pcb *pcb)
875 {
876 	struct vm_map_entry *ent;
877 	struct pmap *pm = vm_map_pmap(map);
878 	vaddr_t va = 0;
879 
880 	vm_map_lock_read(map);
881 	for (ent = (&map->header)->next; ent != &map->header; ent = ent->next) {
882 
883 		/*
884 		 * This entry has greater va than the entries before.
885 		 * We need to make it point to the last page, not past it.
886 		 */
887 
888 		if (ent->protection & VM_PROT_EXECUTE)
889 			va = trunc_page(ent->end) - PAGE_SIZE;
890 	}
891 	vm_map_unlock_read(map);
892 	if (va == pm->pm_hiexec && tf->tf_cs == GSEL(GUCODEBIG_SEL, SEL_UPL))
893 		return (0);
894 
895 	pm->pm_hiexec = va;
896 	if (pm->pm_hiexec > I386_MAX_EXE_ADDR) {
897 		tf->tf_cs = GSEL(GUCODEBIG_SEL, SEL_UPL);
898 	} else {
899 		tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL);
900 		return (0);
901 	}
902 	return (1);
903 }
904 #endif /* !defined(__x86_64__) */
905 
906 void
907 pat_init(struct cpu_info *ci)
908 {
909 	uint64_t pat;
910 
911 	if (!(ci->ci_feat_val[0] & CPUID_PAT))
912 		return;
913 
914 	/* We change WT to WC. Leave all other entries the default values. */
915 	pat = PATENTRY(0, PAT_WB) | PATENTRY(1, PAT_WC) |
916 	      PATENTRY(2, PAT_UCMINUS) | PATENTRY(3, PAT_UC) |
917 	      PATENTRY(4, PAT_WB) | PATENTRY(5, PAT_WC) |
918 	      PATENTRY(6, PAT_UCMINUS) | PATENTRY(7, PAT_UC);
919 
920 	wrmsr(MSR_CR_PAT, pat);
921 	cpu_pat_enabled = true;
922 	aprint_debug_dev(ci->ci_dev, "PAT enabled\n");
923 }
924 
925 static pt_entry_t
926 pmap_pat_flags(u_int flags)
927 {
928 	u_int cacheflags = (flags & PMAP_CACHE_MASK);
929 
930 	if (!cpu_pat_enabled) {
931 		switch (cacheflags) {
932 		case PMAP_NOCACHE:
933 		case PMAP_NOCACHE_OVR:
934 			/* results in PGC_UCMINUS on cpus which have
935 			 * the cpuid PAT but PAT "disabled"
936 			 */
937 			return PG_N;
938 		default:
939 			return 0;
940 		}
941 	}
942 
943 	switch (cacheflags) {
944 	case PMAP_NOCACHE:
945 		return PGC_UC;
946 	case PMAP_WRITE_COMBINE:
947 		return PGC_WC;
948 	case PMAP_WRITE_BACK:
949 		return PGC_WB;
950 	case PMAP_NOCACHE_OVR:
951 		return PGC_UCMINUS;
952 	}
953 
954 	return 0;
955 }
956 
957 /*
958  * p m a p   k e n t e r   f u n c t i o n s
959  *
960  * functions to quickly enter/remove pages from the kernel address
961  * space.   pmap_kremove is exported to MI kernel.  we make use of
962  * the recursive PTE mappings.
963  */
964 
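/*
 * Illustrative sketch (not compiled): the usual pattern for a temporary
 * kernel-only mapping, as used for instance by pmap_cpu_init_late()
 * later in this file.  "kva" and "pa" are hypothetical.
 *
 *	pmap_kenter_pa(kva, pa, VM_PROT_READ | VM_PROT_WRITE, 0);
 *	pmap_update(pmap_kernel());
 *	...
 *	pmap_kremove(kva, PAGE_SIZE);
 *	pmap_update(pmap_kernel());	// required before the VA is reused
 */
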
965 /*
966  * pmap_kenter_pa: enter a kernel mapping without R/M (pv_entry) tracking
967  *
968  * => no need to lock anything, assume va is already allocated
969  * => should be faster than normal pmap enter function
970  */
971 
972 void
973 pmap_kenter_pa(vaddr_t va, paddr_t pa, vm_prot_t prot, u_int flags)
974 {
975 	pt_entry_t *pte, opte, npte;
976 
977 	KASSERT(!(prot & ~VM_PROT_ALL));
978 
979 	if (va < VM_MIN_KERNEL_ADDRESS)
980 		pte = vtopte(va);
981 	else
982 		pte = kvtopte(va);
983 #ifdef DOM0OPS
984 	if (pa < pmap_pa_start || pa >= pmap_pa_end) {
985 #ifdef DEBUG
986 		printk("pmap_kenter_pa: pa 0x%" PRIx64 " for va 0x%" PRIx64
987 		    " outside range\n", (int64_t)pa, (int64_t)va);
988 #endif /* DEBUG */
989 		npte = pa;
990 	} else
991 #endif /* DOM0OPS */
992 		npte = pmap_pa2pte(pa);
993 	npte |= protection_codes[prot] | PG_k | PG_V | pmap_pg_g;
994 	npte |= pmap_pat_flags(flags);
995 	opte = pmap_pte_testset(pte, npte); /* zap! */
996 #if defined(DIAGNOSTIC)
997 	/* XXX For now... */
998 	if (opte & PG_PS)
999 		panic("pmap_kenter_pa: PG_PS");
1000 #endif
1001 	if ((opte & (PG_V | PG_U)) == (PG_V | PG_U)) {
1002 #if defined(DIAGNOSTIC)
1003 		printf("pmap_kenter_pa: mapping already present\n");
1004 #endif
1005 		/* This should not happen. */
1006 		kpreempt_disable();
1007 		pmap_tlb_shootdown(pmap_kernel(), va, opte, TLBSHOOT_KENTER);
1008 		kpreempt_enable();
1009 	}
1010 }
1011 
1012 void
1013 pmap_emap_enter(vaddr_t va, paddr_t pa, vm_prot_t prot)
1014 {
1015 	pt_entry_t *pte, opte, npte;
1016 
1017 	KASSERT((prot & ~VM_PROT_ALL) == 0);
1018 	pte = (va < VM_MIN_KERNEL_ADDRESS) ? vtopte(va) : kvtopte(va);
1019 
1020 #ifdef DOM0OPS
1021 	if (pa < pmap_pa_start || pa >= pmap_pa_end) {
1022 		npte = pa;
1023 	} else
1024 #endif
1025 		npte = pmap_pa2pte(pa);
1026 
1028 	npte |= protection_codes[prot] | PG_k | PG_V;
1029 	opte = pmap_pte_testset(pte, npte);
1030 }
1031 
1032 /*
1033  * pmap_emap_sync: perform TLB flush or pmap load, if it was deferred.
1034  */
1035 void
1036 pmap_emap_sync(bool canload)
1037 {
1038 	struct cpu_info *ci = curcpu();
1039 	struct pmap *pmap;
1040 
1041 	KASSERT(kpreempt_disabled());
1042 	if (__predict_true(ci->ci_want_pmapload && canload)) {
1043 		/*
1044 		 * XXX: Hint for pmap_reactivate(), which may then skip the
1045 		 * TLB flush if the state has not changed.
1046 		 */
1047 		pmap = vm_map_pmap(&curlwp->l_proc->p_vmspace->vm_map);
1048 		if (__predict_false(pmap == ci->ci_pmap)) {
1049 			const uint32_t cpumask = ci->ci_cpumask;
1050 			atomic_and_32(&pmap->pm_cpus, ~cpumask);
1051 		}
1052 		pmap_load();
1053 		KASSERT(ci->ci_want_pmapload == 0);
1054 	} else {
1055 		tlbflush();
1056 	}
1057 
1058 }
1059 
1060 void
1061 pmap_emap_remove(vaddr_t sva, vsize_t len)
1062 {
1063 	pt_entry_t *pte, xpte = 0;
1064 	vaddr_t va, eva = sva + len;
1065 
1066 	for (va = sva; va < eva; va += PAGE_SIZE) {
1067 		pte = (va < VM_MIN_KERNEL_ADDRESS) ? vtopte(va) : kvtopte(va);
1068 		xpte |= pmap_pte_testset(pte, 0);
1069 	}
1070 }
1071 
1072 __weak_alias(pmap_kenter_ma, pmap_kenter_pa);
1073 
1074 #if defined(__x86_64__)
1075 /*
1076  * Change protection for a virtual address. Local for a CPU only, don't
1077  * care about TLB shootdowns.
1078  *
1079  * => must be called with preemption disabled
1080  */
1081 void
1082 pmap_changeprot_local(vaddr_t va, vm_prot_t prot)
1083 {
1084 	pt_entry_t *pte, opte, npte;
1085 
1086 	KASSERT(kpreempt_disabled());
1087 
1088 	if (va < VM_MIN_KERNEL_ADDRESS)
1089 		pte = vtopte(va);
1090 	else
1091 		pte = kvtopte(va);
1092 
1093 	npte = opte = *pte;
1094 
1095 	if ((prot & VM_PROT_WRITE) != 0)
1096 		npte |= PG_RW;
1097 	else
1098 		npte &= ~PG_RW;
1099 
1100 	if (opte != npte) {
1101 		pmap_pte_set(pte, npte);
1102 		pmap_pte_flush();
1103 		invlpg(va);
1104 	}
1105 }
1106 #endif /* defined(__x86_64__) */
1107 
1108 /*
1109  * pmap_kremove: remove a kernel mapping(s) without R/M (pv_entry) tracking
1110  *
1111  * => no need to lock anything
1112  * => caller must dispose of any vm_page mapped in the va range
1113  * => note: not an inline function
1114  * => we assume the va is page aligned and the len is a multiple of PAGE_SIZE
1115  * => we assume kernel only unmaps valid addresses and thus don't bother
1116  *    checking the valid bit before doing TLB flushing
1117  * => must be followed by call to pmap_update() before reuse of page
1118  */
1119 
1120 void
1121 pmap_kremove(vaddr_t sva, vsize_t len)
1122 {
1123 	pt_entry_t *pte, opte;
1124 	vaddr_t va, eva;
1125 
1126 	eva = sva + len;
1127 
1128 	kpreempt_disable();
1129 	for (va = sva; va < eva; va += PAGE_SIZE) {
1130 		if (va < VM_MIN_KERNEL_ADDRESS)
1131 			pte = vtopte(va);
1132 		else
1133 			pte = kvtopte(va);
1134 		opte = pmap_pte_testset(pte, 0); /* zap! */
1135 		if ((opte & (PG_V | PG_U)) == (PG_V | PG_U)) {
1136 			pmap_tlb_shootdown(pmap_kernel(), va, opte,
1137 			    TLBSHOOT_KREMOVE);
1138 		}
1139 		KASSERT((opte & PG_PS) == 0);
1140 		KASSERT((opte & PG_PVLIST) == 0);
1141 	}
1142 	kpreempt_enable();
1143 }
1144 
1145 /*
1146  * p m a p   i n i t   f u n c t i o n s
1147  *
1148  * pmap_bootstrap and pmap_init are called during system startup
1149  * to init the pmap module.   pmap_bootstrap() does a low level
1150  * init just to get things rolling.   pmap_init() finishes the job.
1151  */
1152 
1153 /*
1154  * pmap_bootstrap: get the system in a state where it can run with VM
1155  *	properly enabled (called before main()).   the VM system is
1156  *      fully init'd later...
1157  *
1158  * => on i386, locore.s has already enabled the MMU by allocating
1159  *	a PDP for the kernel, and nkpde PTP's for the kernel.
1160  * => kva_start is the first free virtual address in kernel space
1161  */
1162 
1163 void
1164 pmap_bootstrap(vaddr_t kva_start)
1165 {
1166 	struct pmap *kpm;
1167 	pt_entry_t *pte;
1168 	int i;
1169 	vaddr_t kva;
1170 #ifndef XEN
1171 	unsigned long p1i;
1172 	vaddr_t kva_end;
1173 #endif
1174 
1175 	pt_entry_t pg_nx = (cpu_feature[2] & CPUID_NOX ? PG_NX : 0);
1176 
1177 	/*
1178 	 * set up our local static global vars that keep track of the
1179 	 * usage of KVM before kernel_map is set up
1180 	 */
1181 
1182 	virtual_avail = kva_start;		/* first free KVA */
1183 	virtual_end = VM_MAX_KERNEL_ADDRESS;	/* last KVA */
1184 
1185 	/*
1186 	 * set up protection_codes: we need to be able to convert from
1187 	 * a MI protection code (some combo of VM_PROT...) to something
1188 	 * we can jam into a i386 PTE.
1189 	 */
1190 
1191 	protection_codes[VM_PROT_NONE] = pg_nx;			/* --- */
1192 	protection_codes[VM_PROT_EXECUTE] = PG_RO | PG_X;	/* --x */
1193 	protection_codes[VM_PROT_READ] = PG_RO | pg_nx;		/* -r- */
1194 	protection_codes[VM_PROT_READ|VM_PROT_EXECUTE] = PG_RO | PG_X;/* -rx */
1195 	protection_codes[VM_PROT_WRITE] = PG_RW | pg_nx;	/* w-- */
1196 	protection_codes[VM_PROT_WRITE|VM_PROT_EXECUTE] = PG_RW | PG_X;/* w-x */
1197 	protection_codes[VM_PROT_WRITE|VM_PROT_READ] = PG_RW | pg_nx;
1198 								/* wr- */
1199 	protection_codes[VM_PROT_ALL] = PG_RW | PG_X;		/* wrx */
1200 
1201 	/*
1202 	 * now we init the kernel's pmap
1203 	 *
1204 	 * the kernel pmap's pm_obj is not used for much.   however, in
1205 	 * user pmaps the pm_obj contains the list of active PTPs.
1206 	 * the pm_obj currently does not have a pager.   it might be possible
1207 	 * to add a pager that would allow a process to read-only mmap its
1208 	 * own page tables (fast user level vtophys?).   this may or may not
1209 	 * be useful.
1210 	 */
1211 
1212 	kpm = pmap_kernel();
1213 	for (i = 0; i < PTP_LEVELS - 1; i++) {
1214 		mutex_init(&kpm->pm_obj_lock[i], MUTEX_DEFAULT, IPL_NONE);
1215 		uvm_obj_init(&kpm->pm_obj[i], NULL, false, 1);
1216 		uvm_obj_setlock(&kpm->pm_obj[i], &kpm->pm_obj_lock[i]);
1217 		kpm->pm_ptphint[i] = NULL;
1218 	}
1219 	memset(&kpm->pm_list, 0, sizeof(kpm->pm_list));  /* pm_list not used */
1220 
1221 	kpm->pm_pdir = (pd_entry_t *)(PDPpaddr + KERNBASE);
1222 	for (i = 0; i < PDP_SIZE; i++)
1223 		kpm->pm_pdirpa[i] = PDPpaddr + PAGE_SIZE * i;
1224 
1225 	kpm->pm_stats.wired_count = kpm->pm_stats.resident_count =
1226 		x86_btop(kva_start - VM_MIN_KERNEL_ADDRESS);
1227 
1228 	/*
1229 	 * the above is just a rough estimate and not critical to the proper
1230 	 * operation of the system.
1231 	 */
1232 
1233 #ifndef XEN
1234 	/*
1235 	 * Begin to enable global TLB entries if they are supported.
1236 	 * The G bit has no effect until the CR4_PGE bit is set in CR4,
1237 	 * which happens in cpu_init(), which is run on each cpu
1238 	 * (and happens later)
1239 	 */
1240 
1241 	if (cpu_feature[0] & CPUID_PGE) {
1242 		pmap_pg_g = PG_G;		/* enable software */
1243 
1244 		/* add PG_G attribute to already mapped kernel pages */
1245 		if (KERNBASE == VM_MIN_KERNEL_ADDRESS) {
1246 			kva_end = virtual_avail;
1247 		} else {
1248 			extern vaddr_t eblob, esym;
1249 			kva_end = (vaddr_t)&end;
1250 			if (esym > kva_end)
1251 				kva_end = esym;
1252 			if (eblob > kva_end)
1253 				kva_end = eblob;
1254 			kva_end = roundup(kva_end, PAGE_SIZE);
1255 		}
1256 		for (kva = KERNBASE; kva < kva_end; kva += PAGE_SIZE) {
1257 			p1i = pl1_i(kva);
1258 			if (pmap_valid_entry(PTE_BASE[p1i]))
1259 				PTE_BASE[p1i] |= PG_G;
1260 		}
1261 	}
1262 
1263 	/*
1264 	 * enable large pages if they are supported.
1265 	 */
1266 
1267 	if (cpu_feature[0] & CPUID_PSE) {
1268 		paddr_t pa;
1269 		pd_entry_t *pde;
1270 		extern char __data_start;
1271 
1272 		lcr4(rcr4() | CR4_PSE);	/* enable hardware (via %cr4) */
1273 		pmap_largepages = 1;	/* enable software */
1274 
1275 		/*
1276 		 * the TLB must be flushed after enabling large pages
1277 		 * on Pentium CPUs, according to section 3.6.2.2 of
1278 		 * "Intel Architecture Software Developer's Manual,
1279 		 * Volume 3: System Programming".
1280 		 */
1281 		tlbflushg();
1282 
1283 		/*
1284 		 * now, remap the kernel text using large pages.  we
1285 		 * assume that the linker has properly aligned the
1286 		 * .data segment to a NBPD_L2 boundary.
1287 		 */
1288 		kva_end = rounddown((vaddr_t)&__data_start, NBPD_L1);
1289 		for (pa = 0, kva = KERNBASE; kva + NBPD_L2 <= kva_end;
1290 		     kva += NBPD_L2, pa += NBPD_L2) {
1291 			pde = &L2_BASE[pl2_i(kva)];
1292 			*pde = pa | pmap_pg_g | PG_PS |
1293 			    PG_KR | PG_V;	/* zap! */
1294 			tlbflushg();
1295 		}
1296 #if defined(DEBUG)
1297 		aprint_normal("kernel text is mapped with %" PRIuPSIZE " large "
1298 		    "pages and %" PRIuPSIZE " normal pages\n",
1299 		    howmany(kva - KERNBASE, NBPD_L2),
1300 		    howmany((vaddr_t)&__data_start - kva, NBPD_L1));
1301 #endif /* defined(DEBUG) */
1302 	}
1303 #endif /* !XEN */
1304 
1305 	if (VM_MIN_KERNEL_ADDRESS != KERNBASE) {
1306 		/*
1307 		 * zero_pte is stuck at the end of mapped space for the kernel
1308 		 * image (disjunct from kva space). This is done so that it
1309 		 * can safely be used in pmap_growkernel (pmap_get_physpage),
1310 		 * when it's called for the first time.
1311 		 * XXXfvdl fix this for MULTIPROCESSOR later.
1312 		 */
1313 
1314 		early_zerop = (void *)(KERNBASE + NKL2_KIMG_ENTRIES * NBPD_L2);
1315 		early_zero_pte = PTE_BASE + pl1_i((vaddr_t)early_zerop);
1316 	}
1317 
1318 	/*
1319 	 * now we allocate the "special" VAs which are used for tmp mappings
1320 	 * by the pmap (and other modules).    we allocate the VAs by advancing
1321 	 * virtual_avail (note that there are no pages mapped at these VAs).
1322 	 * we find the PTE that maps the allocated VA via the linear PTE
1323 	 * mapping.
1324 	 */
1325 
1326 	pte = PTE_BASE + pl1_i(virtual_avail);
1327 
1328 #ifdef MULTIPROCESSOR
1329 	/*
1330 	 * Waste some VA space to avoid false sharing of cache lines
1331 	 * for page table pages: Give each possible CPU a cache line
1332 	 * of PTE's (8) to play with, though we only need 4.  We could
1333 	 * recycle some of this waste by putting the idle stacks here
1334 	 * as well; we could waste less space if we knew the largest
1335 	 * CPU ID beforehand.
1336 	 */
1337 	csrcp = (char *) virtual_avail;  csrc_pte = pte;
1338 
1339 	cdstp = (char *) virtual_avail+PAGE_SIZE;  cdst_pte = pte+1;
1340 
1341 	zerop = (char *) virtual_avail+PAGE_SIZE*2;  zero_pte = pte+2;
1342 
1343 	ptpp = (char *) virtual_avail+PAGE_SIZE*3;  ptp_pte = pte+3;
1344 
1345 	virtual_avail += PAGE_SIZE * maxcpus * NPTECL;
1346 	pte += maxcpus * NPTECL;
1347 #else
1348 	csrcp = (void *) virtual_avail;  csrc_pte = pte;	/* allocate */
1349 	virtual_avail += PAGE_SIZE; pte++;			/* advance */
1350 
1351 	cdstp = (void *) virtual_avail;  cdst_pte = pte;
1352 	virtual_avail += PAGE_SIZE; pte++;
1353 
1354 	zerop = (void *) virtual_avail;  zero_pte = pte;
1355 	virtual_avail += PAGE_SIZE; pte++;
1356 
1357 	ptpp = (void *) virtual_avail;  ptp_pte = pte;
1358 	virtual_avail += PAGE_SIZE; pte++;
1359 #endif
1360 
1361 	if (VM_MIN_KERNEL_ADDRESS == KERNBASE) {
1362 		early_zerop = zerop;
1363 		early_zero_pte = zero_pte;
1364 	}
1365 
1366 	/*
1367 	 * Nothing after this point actually needs pte;
1368 	 */
1369 	pte = (void *)0xdeadbeef;
1370 
1371 #ifdef XEN
1372 #ifdef __x86_64__
1373 	/*
1374 	 * We want a dummy page directory for Xen:
1375 	 * when deactivate a pmap, Xen will still consider it active.
1376 	 * So we set user PGD to this one to lift all protection on
1377 	 * the now inactive page tables set.
1378 	 */
1379 	xen_dummy_user_pgd = avail_start;
1380 	avail_start += PAGE_SIZE;
1381 
1382 	/* Zero-fill it: the fewer checks Xen requires, the better */
1383 	memset((void *) (xen_dummy_user_pgd + KERNBASE), 0, PAGE_SIZE);
1384 	/* Mark read-only */
1385 	HYPERVISOR_update_va_mapping(xen_dummy_user_pgd + KERNBASE,
1386 	    pmap_pa2pte(xen_dummy_user_pgd) | PG_u | PG_V, UVMF_INVLPG);
1387 	/* Pin as L4 */
1388 	xpq_queue_pin_l4_table(xpmap_ptom_masked(xen_dummy_user_pgd));
1389 #endif /* __x86_64__ */
1390 	idt_vaddr = virtual_avail;                      /* don't need pte */
1391 	idt_paddr = avail_start;                        /* steal a page */
1392 	/*
1393 	 * Xen requires one more page, as we can't store the
1394 	 * GDT and LDT on the same page
1395 	 */
1396 	virtual_avail += 3 * PAGE_SIZE;
1397 	avail_start += 3 * PAGE_SIZE;
1398 #else /* XEN */
1399 	idt_vaddr = virtual_avail;			/* don't need pte */
1400 	idt_paddr = avail_start;			/* steal a page */
1401 #if defined(__x86_64__)
1402 	virtual_avail += 2 * PAGE_SIZE; pte += 2;
1403 	avail_start += 2 * PAGE_SIZE;
1404 #else /* defined(__x86_64__) */
1405 	virtual_avail += PAGE_SIZE; pte++;
1406 	avail_start += PAGE_SIZE;
1407 	/* pentium f00f bug stuff */
1408 	pentium_idt_vaddr = virtual_avail;		/* don't need pte */
1409 	virtual_avail += PAGE_SIZE; pte++;
1410 #endif /* defined(__x86_64__) */
1411 #endif /* XEN */
1412 
1413 #ifdef _LP64
1414 	/*
1415 	 * Grab a page below 4G for things that need it (i.e.
1416 	 * having an initial %cr3 for the MP trampoline).
1417 	 */
1418 	lo32_vaddr = virtual_avail;
1419 	virtual_avail += PAGE_SIZE; pte++;
1420 	lo32_paddr = avail_start;
1421 	avail_start += PAGE_SIZE;
1422 #endif
1423 
1424 	/*
1425 	 * now we reserve some VM for mapping pages when doing a crash dump
1426 	 */
1427 
1428 	virtual_avail = reserve_dumppages(virtual_avail);
1429 
1430 	/*
1431 	 * init the static-global locks and global lists.
1432 	 *
1433 	 * => pventry::pvh_lock (initialized elsewhere) must also be
1434 	 *      a spin lock, again at IPL_VM to prevent deadlock, and
1435 	 *	again is never taken from interrupt context.
1436 	 */
1437 
1438 	mutex_init(&pmaps_lock, MUTEX_DEFAULT, IPL_NONE);
1439 	LIST_INIT(&pmaps);
1440 
1441 	/*
1442 	 * initialize caches.
1443 	 */
1444 
1445 	pool_cache_bootstrap(&pmap_cache, sizeof(struct pmap), 0, 0, 0,
1446 	    "pmappl", NULL, IPL_NONE, NULL, NULL, NULL);
1447 #ifdef PAE
1448 	pool_cache_bootstrap(&pmap_pdp_cache, PAGE_SIZE * PDP_SIZE, 0, 0, 0,
1449 	    "pdppl", &pmap_pdp_allocator, IPL_NONE,
1450 	    pmap_pdp_ctor, pmap_pdp_dtor, NULL);
1451 #else /* PAE */
1452 	pool_cache_bootstrap(&pmap_pdp_cache, PAGE_SIZE, 0, 0, 0,
1453 	    "pdppl", NULL, IPL_NONE, pmap_pdp_ctor, pmap_pdp_dtor, NULL);
1454 #endif /* PAE */
1455 	pool_cache_bootstrap(&pmap_pv_cache, sizeof(struct pv_entry), 0, 0,
1456 	    PR_LARGECACHE, "pvpl", &pool_allocator_meta, IPL_NONE, NULL,
1457 	    NULL, NULL);
1458 
1459 	/*
1460 	 * ensure the TLB is sync'd with reality by flushing it...
1461 	 */
1462 
1463 	tlbflushg();
1464 
1465 	/*
1466 	 * calculate pmap_maxkvaddr from nkptp[].
1467 	 */
1468 
1469 	kva = VM_MIN_KERNEL_ADDRESS;
1470 	for (i = PTP_LEVELS - 1; i >= 1; i--) {
1471 		kva += nkptp[i] * nbpd[i];
1472 	}
1473 	pmap_maxkvaddr = kva;
1474 }
1475 
1476 #if defined(__x86_64__)
1477 /*
1478  * Pre-allocate PTPs for low memory, so that 1:1 mappings for various
1479  * trampoline code can be entered.
1480  */
1481 void
1482 pmap_prealloc_lowmem_ptps(void)
1483 {
1484 	int level;
1485 	paddr_t newp;
1486 #ifdef XEN
1487 	paddr_t pdes_pa;
1488 
1489 	pdes_pa = pmap_pdirpa(pmap_kernel(), 0);
1490 	level = PTP_LEVELS;
1491 	for (;;) {
1492 		newp = avail_start;
1493 		avail_start += PAGE_SIZE;
1494 		HYPERVISOR_update_va_mapping ((vaddr_t)early_zerop,
1495 		    xpmap_ptom_masked(newp) | PG_u | PG_V | PG_RW, UVMF_INVLPG);
1496 		memset(early_zerop, 0, PAGE_SIZE);
1497 		/* Mark R/O before installing */
1498 		HYPERVISOR_update_va_mapping ((vaddr_t)early_zerop,
1499 		    xpmap_ptom_masked(newp) | PG_u | PG_V, UVMF_INVLPG);
1500 		if (newp < (NKL2_KIMG_ENTRIES * NBPD_L2))
1501 			HYPERVISOR_update_va_mapping (newp + KERNBASE,
1502 			    xpmap_ptom_masked(newp) | PG_u | PG_V, UVMF_INVLPG);
1503 		xpq_queue_pte_update (
1504 		    xpmap_ptom_masked(pdes_pa)
1505 		    + (pl_i(0, level) * sizeof (pd_entry_t)),
1506 		    xpmap_ptom_masked(newp) | PG_RW | PG_u | PG_V);
1507 		pmap_pte_flush();
1508 		level--;
1509 		if (level <= 1)
1510 			break;
1511 		pdes_pa = newp;
1512 	}
1513 #else /* XEN */
1514 	pd_entry_t *pdes;
1515 
1516 	pdes = pmap_kernel()->pm_pdir;
1517 	level = PTP_LEVELS;
1518 	for (;;) {
1519 		newp = avail_start;
1520 		avail_start += PAGE_SIZE;
1521 		pmap_pte_set(early_zero_pte, (newp & PG_FRAME) | PG_V | PG_RW);
1522 		pmap_pte_flush();
1523 		pmap_update_pg((vaddr_t)early_zerop);
1524 		memset(early_zerop, 0, PAGE_SIZE);
1525 		pdes[pl_i(0, level)] = (newp & PG_FRAME) | PG_V | PG_RW;
1526 		level--;
1527 		if (level <= 1)
1528 			break;
1529 		pdes = normal_pdes[level - 2];
1530 	}
1531 #endif /* XEN */
1532 }
1533 #endif /* defined(__x86_64__) */
1534 
1535 /*
1536  * pmap_init: called from uvm_init, our job is to get the pmap
1537  * system ready to manage mappings...
1538  */
1539 
1540 void
1541 pmap_init(void)
1542 {
1543 	int i;
1544 
1545 	for (i = 0; i < PV_HASH_SIZE; i++) {
1546 		SLIST_INIT(&pv_hash_heads[i].hh_list);
1547 	}
1548 	for (i = 0; i < PV_HASH_LOCK_CNT; i++) {
1549 		mutex_init(&pv_hash_locks[i].lock, MUTEX_NODEBUG, IPL_VM);
1550 	}
1551 
1552 	pmap_tlb_init();
1553 
1554 	evcnt_attach_dynamic(&pmap_iobmp_evcnt, EVCNT_TYPE_MISC,
1555 	    NULL, "x86", "io bitmap copy");
1556 	evcnt_attach_dynamic(&pmap_ldt_evcnt, EVCNT_TYPE_MISC,
1557 	    NULL, "x86", "ldt sync");
1558 
1559 	/*
1560 	 * done: pmap module is up (and ready for business)
1561 	 */
1562 
1563 	pmap_initialized = true;
1564 }
1565 
1566 /*
1567  * pmap_cpu_init_late: perform late per-CPU initialization.
1568  */
1569 
1570 void
1571 pmap_cpu_init_late(struct cpu_info *ci)
1572 {
1573 #ifdef PAE
1574 	int ret;
1575 	struct pglist pg;
1576 	struct vm_page *vmap;
1577 
1578 	/* The BP already has its own L3 page, allocated in locore.S. */
1579 	if (ci == &cpu_info_primary)
1580 		return;
1581 
1582 	/*
1583 	 * Allocate a page for the per-CPU L3 PD.  cr3 being 32 bits, the PA
1584 	 * must reside below the 4GB boundary.
1585 	 */
1586 	ret = uvm_pglistalloc(PAGE_SIZE, 0, 0x100000000ULL, 32, 0, &pg, 1, 0);
1587 	vmap = TAILQ_FIRST(&pg);
1588 
1589 	if (ret != 0 || vmap == NULL)
1590 		panic("%s: failed to allocate L3 pglist for CPU %d (ret %d)\n",
1591 			__func__, cpu_index(ci), ret);
1592 
1593 	ci->ci_pae_l3_pdirpa = vmap->phys_addr;
1594 
1595 	ci->ci_pae_l3_pdir = (paddr_t *)uvm_km_alloc(kernel_map, PAGE_SIZE, 0,
1596 		UVM_KMF_VAONLY | UVM_KMF_NOWAIT);
1597 	if (ci->ci_pae_l3_pdir == NULL)
1598 		panic("%s: failed to allocate L3 PD for CPU %d\n",
1599 			__func__, cpu_index(ci));
1600 
1601 	pmap_kenter_pa((vaddr_t)ci->ci_pae_l3_pdir, ci->ci_pae_l3_pdirpa,
1602 		VM_PROT_READ | VM_PROT_WRITE, 0);
1603 
1604 	pmap_update(pmap_kernel());
1605 #endif
1606 }
1607 
1608 /*
1609  * p v _ e n t r y   f u n c t i o n s
1610  */
1611 
1612 /*
1613  * pmap_free_pvs: free a list of pv_entrys
1614  */
1615 
1616 static void
1617 pmap_free_pvs(struct pv_entry *pve)
1618 {
1619 	struct pv_entry *next;
1620 
1621 	for ( /* null */ ; pve != NULL ; pve = next) {
1622 		next = pve->pve_next;
1623 		pool_cache_put(&pmap_pv_cache, pve);
1624 	}
1625 }
1626 
1627 /*
1628  * main pv_entry manipulation functions:
1629  *   pmap_enter_pv: enter a mapping onto a pv_head list
1630  *   pmap_remove_pv: remove a mapping from a pv_head list
1631  *
1632  * NOTE: Both pmap_enter_pv and pmap_remove_pv expect the caller to lock
1633  *       the pvh before calling
1634  */
1635 
1636 /*
1637  * insert_pv: a helper of pmap_enter_pv
1638  */
1639 
1640 static void
1641 insert_pv(struct pmap_page *pp, struct pv_entry *pve)
1642 {
1643 	struct pv_hash_head *hh;
1644 	kmutex_t *lock;
1645 	u_int hash;
1646 
1647 	hash = pvhash_hash(pve->pve_pte.pte_ptp, pve->pve_pte.pte_va);
1648 	lock = pvhash_lock(hash);
1649 	hh = pvhash_head(hash);
1650 	mutex_spin_enter(lock);
1651 	SLIST_INSERT_HEAD(&hh->hh_list, pve, pve_hash);
1652 	mutex_spin_exit(lock);
1653 
1654 	LIST_INSERT_HEAD(&pp->pp_head.pvh_list, pve, pve_list);
1655 }
1656 
1657 /*
1658  * pmap_enter_pv: enter a mapping onto a pv_head list
1659  *
1660  * => caller should adjust ptp's wire_count before calling
1661  */
1662 
1663 static struct pv_entry *
1664 pmap_enter_pv(struct pmap_page *pp,
1665 	      struct pv_entry *pve,	/* preallocated pve for us to use */
1666 	      struct pv_entry **sparepve,
1667 	      struct vm_page *ptp,
1668 	      vaddr_t va)
1669 {
1670 
1671 	KASSERT(ptp == NULL || ptp->wire_count >= 2);
1672 	KASSERT(ptp == NULL || ptp->uobject != NULL);
1673 	KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset);
1674 
1675 	if ((pp->pp_flags & PP_EMBEDDED) == 0) {
1676 		if (LIST_EMPTY(&pp->pp_head.pvh_list)) {
1677 			pp->pp_flags |= PP_EMBEDDED;
1678 			pp->pp_pte.pte_ptp = ptp;
1679 			pp->pp_pte.pte_va = va;
1680 
1681 			return pve;
1682 		}
1683 	} else {
1684 		struct pv_entry *pve2;
1685 
1686 		pve2 = *sparepve;
1687 		*sparepve = NULL;
1688 
1689 		pve2->pve_pte = pp->pp_pte;
1690 		pp->pp_flags &= ~PP_EMBEDDED;
1691 		LIST_INIT(&pp->pp_head.pvh_list);
1692 		insert_pv(pp, pve2);
1693 	}
1694 
1695 	pve->pve_pte.pte_ptp = ptp;
1696 	pve->pve_pte.pte_va = va;
1697 	insert_pv(pp, pve);
1698 
1699 	return NULL;
1700 }
1701 
1702 /*
1703  * pmap_remove_pv: try to remove a mapping from a pv_list
1704  *
1705  * => caller should adjust ptp's wire_count and free PTP if needed
1706  * => we return the removed pve
1707  */
1708 
1709 static struct pv_entry *
1710 pmap_remove_pv(struct pmap_page *pp, struct vm_page *ptp, vaddr_t va)
1711 {
1712 	struct pv_hash_head *hh;
1713 	struct pv_entry *pve;
1714 	kmutex_t *lock;
1715 	u_int hash;
1716 
1717 	KASSERT(ptp == NULL || ptp->uobject != NULL);
1718 	KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset);
1719 
1720 	if ((pp->pp_flags & PP_EMBEDDED) != 0) {
1721 		KASSERT(pp->pp_pte.pte_ptp == ptp);
1722 		KASSERT(pp->pp_pte.pte_va == va);
1723 
1724 		pp->pp_flags &= ~PP_EMBEDDED;
1725 		LIST_INIT(&pp->pp_head.pvh_list);
1726 
1727 		return NULL;
1728 	}
1729 
1730 	hash = pvhash_hash(ptp, va);
1731 	lock = pvhash_lock(hash);
1732 	hh = pvhash_head(hash);
1733 	mutex_spin_enter(lock);
1734 	pve = pvhash_remove(hh, ptp, va);
1735 	mutex_spin_exit(lock);
1736 
1737 	LIST_REMOVE(pve, pve_list);
1738 
1739 	return pve;
1740 }
1741 
1742 /*
1743  * p t p   f u n c t i o n s
1744  */
1745 
1746 static inline struct vm_page *
1747 pmap_find_ptp(struct pmap *pmap, vaddr_t va, paddr_t pa, int level)
1748 {
1749 	int lidx = level - 1;
1750 	struct vm_page *pg;
1751 
1752 	KASSERT(mutex_owned(pmap->pm_lock));
1753 
1754 	if (pa != (paddr_t)-1 && pmap->pm_ptphint[lidx] &&
1755 	    pa == VM_PAGE_TO_PHYS(pmap->pm_ptphint[lidx])) {
1756 		return (pmap->pm_ptphint[lidx]);
1757 	}
1758 	PMAP_SUBOBJ_LOCK(pmap, lidx);
1759 	pg = uvm_pagelookup(&pmap->pm_obj[lidx], ptp_va2o(va, level));
1760 	PMAP_SUBOBJ_UNLOCK(pmap, lidx);
1761 
1762 	KASSERT(pg == NULL || pg->wire_count >= 1);
1763 	return pg;
1764 }
1765 
1766 static inline void
1767 pmap_freepage(struct pmap *pmap, struct vm_page *ptp, int level)
1768 {
1769 	lwp_t *l;
1770 	int lidx;
1771 	struct uvm_object *obj;
1772 
1773 	KASSERT(ptp->wire_count == 1);
1774 
1775 	lidx = level - 1;
1776 
1777 	obj = &pmap->pm_obj[lidx];
1778 	pmap_stats_update(pmap, -1, 0);
1779 	if (lidx != 0)
1780 		mutex_enter(obj->vmobjlock);
1781 	if (pmap->pm_ptphint[lidx] == ptp)
1782 		pmap->pm_ptphint[lidx] = TAILQ_FIRST(&obj->memq);
1783 	ptp->wire_count = 0;
1784 	uvm_pagerealloc(ptp, NULL, 0);
1785 	l = curlwp;
1786 	KASSERT((l->l_pflag & LP_INTR) == 0);
1787 	VM_PAGE_TO_PP(ptp)->pp_link = l->l_md.md_gc_ptp;
1788 	l->l_md.md_gc_ptp = ptp;
1789 	if (lidx != 0)
1790 		mutex_exit(obj->vmobjlock);
1791 }
1792 
1793 static void
1794 pmap_free_ptp(struct pmap *pmap, struct vm_page *ptp, vaddr_t va,
1795 	      pt_entry_t *ptes, pd_entry_t * const *pdes)
1796 {
1797 	unsigned long index;
1798 	int level;
1799 	vaddr_t invaladdr;
1800 	pd_entry_t opde;
1801 #ifdef XEN
1802 	struct pmap *curpmap = vm_map_pmap(&curlwp->l_proc->p_vmspace->vm_map);
1803 #ifdef MULTIPROCESSOR
1804 	vaddr_t invaladdr2;
1805 #endif
1806 #endif
1807 
1808 	KASSERT(pmap != pmap_kernel());
1809 	KASSERT(mutex_owned(pmap->pm_lock));
1810 	KASSERT(kpreempt_disabled());
1811 
1812 	level = 1;
1813 	do {
1814 		index = pl_i(va, level + 1);
1815 		opde = pmap_pte_testset(&pdes[level - 1][index], 0);
1816 #if defined(XEN)
1817 #  if defined(__x86_64__)
1818 		/*
1819 		 * If the ptp is an L3 PTP currently mapped in kernel space,
1820 		 * clear it before freeing it
1821 		 */
1822 		if (pmap_pdirpa(pmap, 0) == curcpu()->ci_xen_current_user_pgd
1823 		    && level == PTP_LEVELS - 1)
1824 			pmap_pte_set(&pmap_kernel()->pm_pdir[index], 0);
1825 #  endif /*__x86_64__ */
1826 		invaladdr = level == 1 ? (vaddr_t)ptes :
1827 		    (vaddr_t)pdes[level - 2];
1828 		pmap_tlb_shootdown(curpmap, invaladdr + index * PAGE_SIZE,
1829 		    opde, TLBSHOOT_FREE_PTP1);
1830 #  if defined(MULTIPROCESSOR)
1831 		invaladdr2 = level == 1 ? (vaddr_t)PTE_BASE :
1832 		    (vaddr_t)normal_pdes[level - 2];
1833 		if (pmap != curpmap || invaladdr != invaladdr2) {
1834 			pmap_tlb_shootdown(pmap, invaladdr2 + index * PAGE_SIZE,
1835 			    opde, TLBSHOOT_FREE_PTP2);
1836 		}
1837 #  endif /* MULTIPROCESSOR */
1838 #else	/* XEN */
1839 		invaladdr = level == 1 ? (vaddr_t)ptes :
1840 		    (vaddr_t)pdes[level - 2];
1841 		pmap_tlb_shootdown(pmap, invaladdr + index * PAGE_SIZE,
1842 		    opde, TLBSHOOT_FREE_PTP1);
1843 #endif	/* XEN */
1844 		pmap_freepage(pmap, ptp, level);
1845 		if (level < PTP_LEVELS - 1) {
1846 			ptp = pmap_find_ptp(pmap, va, (paddr_t)-1, level + 1);
1847 			ptp->wire_count--;
1848 			if (ptp->wire_count > 1)
1849 				break;
1850 		}
1851 	} while (++level < PTP_LEVELS);
1852 	pmap_pte_flush();
1853 }
1854 
1855 /*
1856  * pmap_get_ptp: get a PTP (if there isn't one, allocate a new one)
1857  *
1858  * => pmap should NOT be pmap_kernel()
1859  * => pmap should be locked
1860  * => preemption should be disabled
1861  */
1862 
1863 static struct vm_page *
1864 pmap_get_ptp(struct pmap *pmap, vaddr_t va, pd_entry_t * const *pdes)
1865 {
1866 	struct vm_page *ptp, *pptp;
1867 	int i;
1868 	unsigned long index;
1869 	pd_entry_t *pva;
1870 	paddr_t ppa, pa;
1871 	struct uvm_object *obj;
1872 
1873 	KASSERT(pmap != pmap_kernel());
1874 	KASSERT(mutex_owned(pmap->pm_lock));
1875 	KASSERT(kpreempt_disabled());
1876 
1877 	ptp = NULL;
1878 	pa = (paddr_t)-1;
1879 
1880 	/*
1881 	 * Loop through all page table levels, checking whether we need
1882 	 * to add a new page at that level.
1883 	 */
1884 	for (i = PTP_LEVELS; i > 1; i--) {
1885 		/*
1886 		 * Save values from previous round.
1887 		 */
1888 		pptp = ptp;
1889 		ppa = pa;
1890 
1891 		index = pl_i(va, i);
1892 		pva = pdes[i - 2];
1893 
1894 		if (pmap_valid_entry(pva[index])) {
1895 			ppa = pmap_pte2pa(pva[index]);
1896 			ptp = NULL;
1897 			continue;
1898 		}
1899 
1900 		obj = &pmap->pm_obj[i-2];
1901 		PMAP_SUBOBJ_LOCK(pmap, i - 2);
1902 		ptp = uvm_pagealloc(obj, ptp_va2o(va, i - 1), NULL,
1903 		    UVM_PGA_USERESERVE|UVM_PGA_ZERO);
1904 		PMAP_SUBOBJ_UNLOCK(pmap, i - 2);
1905 
1906 		if (ptp == NULL)
1907 			return NULL;
1908 
1909 		ptp->flags &= ~PG_BUSY; /* never busy */
1910 		ptp->wire_count = 1;
1911 		pmap->pm_ptphint[i - 2] = ptp;
1912 		pa = VM_PAGE_TO_PHYS(ptp);
1913 		pmap_pte_set(&pva[index], (pd_entry_t)
1914 		        (pmap_pa2pte(pa) | PG_u | PG_RW | PG_V));
1915 #if defined(XEN) && defined(__x86_64__)
1916 		/*
1917 		 * In Xen we must also enter the mapping in the kernel map if
1918 		 * pmap is the current pmap and we are modifying the top level (PGD)
1919 		 */
1920 		if (i == PTP_LEVELS && pmap != pmap_kernel()) {
1921 		        pmap_pte_set(&pmap_kernel()->pm_pdir[index],
1922 		                (pd_entry_t) (pmap_pa2pte(pa)
1923 		                        | PG_u | PG_RW | PG_V));
1924 		}
1925 #endif /* XEN && __x86_64__ */
1926 		pmap_pte_flush();
1927 		pmap_stats_update(pmap, 1, 0);
1928 		/*
1929 		 * If we're not in the top level, increase the
1930 		 * wire count of the parent page.
1931 		 */
1932 		if (i < PTP_LEVELS) {
1933 			if (pptp == NULL)
1934 				pptp = pmap_find_ptp(pmap, va, ppa, i);
1935 #ifdef DIAGNOSTIC
1936 			if (pptp == NULL)
1937 				panic("pde page disappeared");
1938 #endif
1939 			pptp->wire_count++;
1940 		}
1941 	}
1942 
1943 	/*
1944 	 * ptp is not NULL if we just allocated a new ptp. If it's
1945 	 * still NULL, we must look up the existing one.
1946 	 */
1947 	if (ptp == NULL) {
1948 		ptp = pmap_find_ptp(pmap, va, ppa, 1);
1949 #ifdef DIAGNOSTIC
1950 		if (ptp == NULL) {
1951 			printf("va %" PRIxVADDR " ppa %" PRIxPADDR "\n",
1952 			    va, ppa);
1953 			panic("pmap_get_ptp: unmanaged user PTP");
1954 		}
1955 #endif
1956 	}
1957 
1958 	pmap->pm_ptphint[0] = ptp;
1959 	return(ptp);
1960 }
1961 
1962 /*
1963  * p m a p  l i f e c y c l e   f u n c t i o n s
1964  */
1965 
1966 /*
1967  * pmap_pdp_ctor: constructor for the PDP cache.
1968  */
1969 
1970 int
1971 pmap_pdp_ctor(void *arg, void *v, int flags)
1972 {
1973 	pd_entry_t *pdir = v;
1974 	paddr_t pdirpa = 0;	/* XXX: GCC */
1975 	vaddr_t object;
1976 	int i;
1977 
1978 #if !defined(XEN) || !defined(__x86_64__)
1979 	int npde;
1980 #endif
1981 #ifdef XEN
1982 	int s;
1983 #endif
1984 
1985 	/*
1986 	 * NOTE: The `pmap_lock' is held when the PDP is allocated.
1987 	 */
1988 
1989 #if defined(XEN) && defined(__x86_64__)
1990 	/* fetch the physical address of the page directory. */
1991 	(void) pmap_extract(pmap_kernel(), (vaddr_t) pdir, &pdirpa);
1992 
1993 	/* zero init area */
1994 	memset (pdir, 0, PAGE_SIZE); /* Xen wants a clean page */
1995 	/*
1996 	 * this pdir will NEVER be active in kernel mode,
1997 	 * so mark the recursive entry invalid
1998 	 */
1999 	pdir[PDIR_SLOT_PTE] = pmap_pa2pte(pdirpa) | PG_u;
2000 	/*
2001 	 * A PDP constructed this way will never be used for the kernel,
2002 	 * hence we don't put kernel mappings in it on Xen.
2003 	 * But we need to make pmap_create() happy, so put a dummy (without
2004 	 * PG_V) value at the right place.
2005 	 */
2006 	pdir[PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1] - 1] =
2007 	     (pd_entry_t)-1 & PG_FRAME;
2008 #else /* XEN && __x86_64__*/
2009 	/* zero init area */
2010 	memset(pdir, 0, PDIR_SLOT_PTE * sizeof(pd_entry_t));
2011 
2012 	object = (vaddr_t)v;
2013 	for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) {
2014 		/* fetch the physical address of the page directory. */
2015 		(void) pmap_extract(pmap_kernel(), object, &pdirpa);
2016 		/* put in recursive PDE to map the PTEs */
2017 		pdir[PDIR_SLOT_PTE + i] = pmap_pa2pte(pdirpa) | PG_V;
2018 #ifndef XEN
2019 		pdir[PDIR_SLOT_PTE + i] |= PG_KW;
2020 #endif
2021 	}
2022 
2023 	/* copy kernel's PDE */
2024 	npde = nkptp[PTP_LEVELS - 1];
2025 
2026 	memcpy(&pdir[PDIR_SLOT_KERN], &PDP_BASE[PDIR_SLOT_KERN],
2027 	    npde * sizeof(pd_entry_t));
2028 
2029 	/* zero the rest */
2030 	memset(&pdir[PDIR_SLOT_KERN + npde], 0,
2031 	    (NTOPLEVEL_PDES - (PDIR_SLOT_KERN + npde)) * sizeof(pd_entry_t));
2032 
2033 	if (VM_MIN_KERNEL_ADDRESS != KERNBASE) {
2034 		int idx = pl_i(KERNBASE, PTP_LEVELS);
2035 
2036 		pdir[idx] = PDP_BASE[idx];
2037 	}
2038 #endif /* XEN  && __x86_64__*/
2039 #ifdef XEN
2040 	s = splvm();
2041 	object = (vaddr_t)v;
2042 	for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) {
2043 		(void) pmap_extract(pmap_kernel(), object, &pdirpa);
2044 		/* remap this page RO */
2045 		pmap_kenter_pa(object, pdirpa, VM_PROT_READ, 0);
2046 		pmap_update(pmap_kernel());
2047 		/*
2048 		 * pin as an L2/L4 page; we have to do the page with the
2049 		 * PDIR_SLOT_PTE entries last
2050 		 */
2051 #ifdef PAE
2052 		if (i == l2tol3(PDIR_SLOT_PTE))
2053 			continue;
2054 #endif
2055 
2056 #ifdef __x86_64__
2057 		xpq_queue_pin_l4_table(xpmap_ptom_masked(pdirpa));
2058 #else
2059 		xpq_queue_pin_l2_table(xpmap_ptom_masked(pdirpa));
2060 #endif
2061 	}
2062 #ifdef PAE
2063 	object = ((vaddr_t)pdir) + PAGE_SIZE  * l2tol3(PDIR_SLOT_PTE);
2064 	(void)pmap_extract(pmap_kernel(), object, &pdirpa);
2065 	xpq_queue_pin_l2_table(xpmap_ptom_masked(pdirpa));
2066 #endif
2067 	splx(s);
2068 #endif /* XEN */
2069 
2070 	return (0);
2071 }
2072 
2073 /*
2074  * pmap_pdp_dtor: destructor for the PDP cache.
2075  */
2076 
2077 void
2078 pmap_pdp_dtor(void *arg, void *v)
2079 {
2080 #ifdef XEN
2081 	paddr_t pdirpa = 0;	/* XXX: GCC */
2082 	vaddr_t object = (vaddr_t)v;
2083 	int i;
2084 	int s = splvm();
2085 	pt_entry_t *pte;
2086 
2087 	for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) {
2088 		/* fetch the physical address of the page directory. */
2089 		(void) pmap_extract(pmap_kernel(), object, &pdirpa);
2090 		/* unpin page table */
2091 		xpq_queue_unpin_table(xpmap_ptom_masked(pdirpa));
2092 	}
2093 	object = (vaddr_t)v;
2094 	for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) {
2095 		/* Set page RW again */
2096 		pte = kvtopte(object);
2097 		xpq_queue_pte_update(xpmap_ptetomach(pte), *pte | PG_RW);
2098 		xpq_queue_invlpg((vaddr_t)object);
2099 	}
2100 	splx(s);
2101 #endif  /* XEN */
2102 }
2103 
2104 #ifdef PAE
2105 
2106 /* pmap_pdp_alloc: Allocate a page for the pdp memory pool. */
2107 
2108 void *
2109 pmap_pdp_alloc(struct pool *pp, int flags)
2110 {
2111 	return (void *)uvm_km_alloc(kernel_map,
2112 	    PAGE_SIZE * PDP_SIZE, PAGE_SIZE * PDP_SIZE,
2113 	    ((flags & PR_WAITOK) ? 0 : UVM_KMF_NOWAIT | UVM_KMF_TRYLOCK)
2114 	    | UVM_KMF_WIRED);
2115 }
2116 
2117 /*
2118  * pmap_pdp_free: free a PDP
2119  */
2120 
2121 void
2122 pmap_pdp_free(struct pool *pp, void *v)
2123 {
2124 	uvm_km_free(kernel_map, (vaddr_t)v, PAGE_SIZE * PDP_SIZE,
2125 	    UVM_KMF_WIRED);
2126 }
2127 #endif /* PAE */
2128 
2129 /*
2130  * pmap_create: create a pmap
2131  *
2132  * => note: the old pmap interface took a "size" arg, which allowed for
2133  *	the creation of "software only" pmaps (not in BSD).
2134  */
2135 
2136 struct pmap *
2137 pmap_create(void)
2138 {
2139 	struct pmap *pmap;
2140 	int i;
2141 
2142 	pmap = pool_cache_get(&pmap_cache, PR_WAITOK);
2143 
2144 	/* init uvm_object */
2145 	for (i = 0; i < PTP_LEVELS - 1; i++) {
2146 		mutex_init(&pmap->pm_obj_lock[i], MUTEX_DEFAULT, IPL_NONE);
2147 		uvm_obj_init(&pmap->pm_obj[i], NULL, false, 1);
2148 		uvm_obj_setlock(&pmap->pm_obj[i], &pmap->pm_obj_lock[i]);
2149 		pmap->pm_ptphint[i] = NULL;
2150 	}
2151 	pmap->pm_stats.wired_count = 0;
2152 	/* count the PDP allocated below */
2153 	pmap->pm_stats.resident_count = PDP_SIZE;
2154 #if !defined(__x86_64__)
2155 	pmap->pm_hiexec = 0;
2156 #endif /* !defined(__x86_64__) */
2157 	pmap->pm_flags = 0;
2158 	pmap->pm_cpus = 0;
2159 	pmap->pm_kernel_cpus = 0;
2160 	pmap->pm_gc_ptp = NULL;
2161 
2162 	/* init the LDT */
2163 	pmap->pm_ldt = NULL;
2164 	pmap->pm_ldt_len = 0;
2165 	pmap->pm_ldt_sel = GSYSSEL(GLDT_SEL, SEL_KPL);
2166 
2167 	/* allocate PDP */
2168  try_again:
2169 	pmap->pm_pdir = pool_cache_get(&pmap_pdp_cache, PR_WAITOK);
2170 
2171 	mutex_enter(&pmaps_lock);
2172 
2173 	if (pmap->pm_pdir[PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1] - 1] == 0) {
2174 		mutex_exit(&pmaps_lock);
2175 		pool_cache_destruct_object(&pmap_pdp_cache, pmap->pm_pdir);
2176 		goto try_again;
2177 	}
2178 
2179 	for (i = 0; i < PDP_SIZE; i++)
2180 		pmap->pm_pdirpa[i] =
2181 		    pmap_pte2pa(pmap->pm_pdir[PDIR_SLOT_PTE + i]);
2182 
2183 	LIST_INSERT_HEAD(&pmaps, pmap, pm_list);
2184 
2185 	mutex_exit(&pmaps_lock);
2186 
2187 	return (pmap);
2188 }
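
/*
 * For illustration only (under #if 0, never compiled): the reference-counted
 * lifecycle of a pmap as seen by a hypothetical caller.  pmap_destroy()
 * drops the reference taken by pmap_create() and frees the pmap when the
 * last reference goes away.
 */
#if 0
static void
pmap_lifecycle_sketch(void)
{
	struct pmap *pm;

	pm = pmap_create();
	/* ... pmap_enter()/pmap_remove() mappings in pm ... */
	pmap_destroy(pm);	/* drops the reference from pmap_create() */
}
#endif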
2189 
2190 /*
2191  * pmap_free_ptps: put a list of ptps back to the freelist.
2192  */
2193 
2194 static void
2195 pmap_free_ptps(struct vm_page *empty_ptps)
2196 {
2197 	struct vm_page *ptp;
2198 	struct pmap_page *pp;
2199 
2200 	while ((ptp = empty_ptps) != NULL) {
2201 		pp = VM_PAGE_TO_PP(ptp);
2202 		empty_ptps = pp->pp_link;
2203 		LIST_INIT(&pp->pp_head.pvh_list);
2204 		uvm_pagefree(ptp);
2205 	}
2206 }
2207 
2208 /*
2209  * pmap_destroy: drop reference count on pmap.   free pmap if
2210  *	reference count goes to zero.
2211  */
2212 
2213 void
2214 pmap_destroy(struct pmap *pmap)
2215 {
2216 	int i;
2217 #ifdef DIAGNOSTIC
2218 	struct cpu_info *ci;
2219 	CPU_INFO_ITERATOR cii;
2220 #endif /* DIAGNOSTIC */
2221 	lwp_t *l;
2222 
2223 	/*
2224 	 * If we have torn down this pmap, process deferred frees and
2225 	 * invalidations.  Free now if the system is low on memory.
2226 	 * Otherwise, free when the pmap is destroyed, thus avoiding a
2227 	 * TLB shootdown.
2228 	 */
2229 	l = curlwp;
2230 	if (__predict_false(l->l_md.md_gc_pmap == pmap)) {
2231 		if (uvmexp.free < uvmexp.freetarg) {
2232 			pmap_update(pmap);
2233 		} else {
2234 			KASSERT(pmap->pm_gc_ptp == NULL);
2235 			pmap->pm_gc_ptp = l->l_md.md_gc_ptp;
2236 			l->l_md.md_gc_ptp = NULL;
2237 			l->l_md.md_gc_pmap = NULL;
2238 		}
2239 	}
2240 
2241 	/*
2242 	 * drop reference count
2243 	 */
2244 
2245 	if (atomic_dec_uint_nv(&pmap->pm_obj[0].uo_refs) > 0) {
2246 		return;
2247 	}
2248 
2249 #ifdef DIAGNOSTIC
2250 	for (CPU_INFO_FOREACH(cii, ci))
2251 		if (ci->ci_pmap == pmap)
2252 			panic("destroying pmap being used");
2253 #endif /* DIAGNOSTIC */
2254 
2255 	/*
2256 	 * reference count is zero, free pmap resources and then free pmap.
2257 	 */
2258 #ifdef XEN
2259 	/*
2260 	 * Xen lazy APDP handling:
2261 	 * clear APDP_PDE if this pmap is the one currently mapped there
2262 	 */
2263 	if (xpmap_ptom_masked(pmap_pdirpa(pmap, 0)) == (*APDP_PDE & PG_FRAME)) {
2264 		kpreempt_disable();
2265 		pmap_unmap_apdp();
2266 		pmap_pte_flush();
2267 	        pmap_apte_flush(pmap_kernel());
2268 	        kpreempt_enable();
2269 	}
2270 #endif
2271 
2272 	/*
2273 	 * remove it from global list of pmaps
2274 	 */
2275 
2276 	mutex_enter(&pmaps_lock);
2277 	LIST_REMOVE(pmap, pm_list);
2278 	mutex_exit(&pmaps_lock);
2279 
2280 	/*
2281 	 * Process deferred PTP frees.  No TLB shootdown required, as the
2282 	 * PTP pages are no longer visible to any CPU.
2283 	 */
2284 
2285 	pmap_free_ptps(pmap->pm_gc_ptp);
2286 
2287 	/*
2288 	 * destroyed pmap shouldn't have remaining PTPs
2289 	 */
2290 
2291 	for (i = 0; i < PTP_LEVELS - 1; i++) {
2292 		KASSERT(pmap->pm_obj[i].uo_npages == 0);
2293 		KASSERT(TAILQ_EMPTY(&pmap->pm_obj[i].memq));
2294 	}
2295 
2296 	/*
2297 	 * MULTIPROCESSOR -- no need to flush out of other processors'
2298 	 * APTE space because we do that in pmap_unmap_ptes().
2299 	 */
2300 	pool_cache_put(&pmap_pdp_cache, pmap->pm_pdir);
2301 
2302 #ifdef USER_LDT
2303 	if (pmap->pm_ldt != NULL) {
2304 		/*
2305 		 * no need to switch the LDT; this address space is gone,
2306 		 * nothing is using it.
2307 		 *
2308 		 * No need to lock the pmap for ldt_free (or anything else),
2309 		 * we're the last one to use it.
2310 		 */
2311 		mutex_enter(&cpu_lock);
2312 		ldt_free(pmap->pm_ldt_sel);
2313 		mutex_exit(&cpu_lock);
2314 		uvm_km_free(kernel_map, (vaddr_t)pmap->pm_ldt,
2315 		    pmap->pm_ldt_len, UVM_KMF_WIRED);
2316 	}
2317 #endif
2318 
2319 	for (i = 0; i < PTP_LEVELS - 1; i++) {
2320 		uvm_obj_destroy(&pmap->pm_obj[i], false);
2321 		mutex_destroy(&pmap->pm_obj_lock[i]);
2322 	}
2323 	pool_cache_put(&pmap_cache, pmap);
2324 }
2325 
2326 /*
2327  * pmap_remove_all: pmap is being torn down by the current thread.
2328  * avoid unnecessary invalidations.
2329  */
2330 
2331 void
2332 pmap_remove_all(struct pmap *pmap)
2333 {
2334 	lwp_t *l = curlwp;
2335 
2336 	KASSERT(l->l_md.md_gc_pmap == NULL);
2337 
2338 	l->l_md.md_gc_pmap = pmap;
2339 }
2340 
2341 #if defined(PMAP_FORK)
2342 /*
2343  * pmap_fork: perform any necessary data structure manipulation when
2344  * a VM space is forked.
2345  */
2346 
2347 void
2348 pmap_fork(struct pmap *pmap1, struct pmap *pmap2)
2349 {
2350 #ifdef USER_LDT
2351 	union descriptor *new_ldt;
2352 	size_t len;
2353 	int sel;
2354 
2355 	if (__predict_true(pmap1->pm_ldt == NULL)) {
2356 		return;
2357 	}
2358 
2359  retry:
2360 	if (pmap1->pm_ldt != NULL) {
2361 		len = pmap1->pm_ldt_len;
2362 		new_ldt = (union descriptor *)uvm_km_alloc(kernel_map, len, 0,
2363 		    UVM_KMF_WIRED);
2364 		mutex_enter(&cpu_lock);
2365 		sel = ldt_alloc(new_ldt, len);
2366 		if (sel == -1) {
2367 			mutex_exit(&cpu_lock);
2368 			uvm_km_free(kernel_map, (vaddr_t)new_ldt, len,
2369 			    UVM_KMF_WIRED);
2370 			printf("WARNING: pmap_fork: unable to allocate LDT\n");
2371 			return;
2372 		}
2373 	} else {
2374 		len = -1;
2375 		new_ldt = NULL;
2376 		sel = -1;
2377 		mutex_enter(&cpu_lock);
2378 	}
2379 
2380  	/* Copy the LDT, if necessary. */
2381  	if (pmap1->pm_ldt != NULL) {
2382 		if (len != pmap1->pm_ldt_len) {
2383 			if (len != -1) {
2384 				ldt_free(sel);
2385 				uvm_km_free(kernel_map, (vaddr_t)new_ldt,
2386 				    len, UVM_KMF_WIRED);
2387 			}
2388 			mutex_exit(&cpu_lock);
2389 			goto retry;
2390 		}
2391 
2392 		memcpy(new_ldt, pmap1->pm_ldt, len);
2393 		pmap2->pm_ldt = new_ldt;
2394 		pmap2->pm_ldt_len = pmap1->pm_ldt_len;
2395 		pmap2->pm_ldt_sel = sel;
2396 		len = -1;
2397 	}
2398 
2399 	if (len != -1) {
2400 		ldt_free(sel);
2401 		uvm_km_free(kernel_map, (vaddr_t)new_ldt, len,
2402 		    UVM_KMF_WIRED);
2403 	}
2404 	mutex_exit(&cpu_lock);
2405 #endif /* USER_LDT */
2406 }
2407 #endif /* PMAP_FORK */
2408 
2409 #ifdef USER_LDT
2410 
2411 /*
2412  * pmap_ldt_xcall: cross call used by pmap_ldt_sync.  if the named pmap
2413  * is active, reload LDTR.
2414  */
2415 static void
2416 pmap_ldt_xcall(void *arg1, void *arg2)
2417 {
2418 	struct pmap *pm;
2419 
2420 	kpreempt_disable();
2421 	pm = arg1;
2422 	if (curcpu()->ci_pmap == pm) {
2423 		lldt(pm->pm_ldt_sel);
2424 	}
2425 	kpreempt_enable();
2426 }
2427 
2428 /*
2429  * pmap_ldt_sync: LDT selector for the named pmap is changing.  swap
2430  * in the new selector on all CPUs.
2431  */
2432 void
2433 pmap_ldt_sync(struct pmap *pm)
2434 {
2435 	uint64_t where;
2436 
2437 	KASSERT(mutex_owned(&cpu_lock));
2438 
2439 	pmap_ldt_evcnt.ev_count++;
2440 	where = xc_broadcast(0, pmap_ldt_xcall, pm, NULL);
2441 	xc_wait(where);
2442 }
2443 
2444 /*
2445  * pmap_ldt_cleanup: if the pmap has a local LDT, deallocate it, and
2446  * restore the default.
2447  */
2448 
2449 void
2450 pmap_ldt_cleanup(struct lwp *l)
2451 {
2452 	pmap_t pmap = l->l_proc->p_vmspace->vm_map.pmap;
2453 	union descriptor *dp = NULL;
2454 	size_t len = 0;
2455 	int sel = -1;
2456 
2457 	if (__predict_true(pmap->pm_ldt == NULL)) {
2458 		return;
2459 	}
2460 
2461 	mutex_enter(&cpu_lock);
2462 	if (pmap->pm_ldt != NULL) {
2463 		sel = pmap->pm_ldt_sel;
2464 		dp = pmap->pm_ldt;
2465 		len = pmap->pm_ldt_len;
2466 		pmap->pm_ldt_sel = GSYSSEL(GLDT_SEL, SEL_KPL);
2467 		pmap->pm_ldt = NULL;
2468 		pmap->pm_ldt_len = 0;
2469 		pmap_ldt_sync(pmap);
2470 		ldt_free(sel);
2471 		uvm_km_free(kernel_map, (vaddr_t)dp, len, UVM_KMF_WIRED);
2472 	}
2473 	mutex_exit(&cpu_lock);
2474 }
2475 #endif /* USER_LDT */
2476 
2477 /*
2478  * pmap_activate: activate a process' pmap
2479  *
2480  * => must be called with kernel preemption disabled
2481  * => if lwp is the curlwp, then set ci_want_pmapload so that
2482  *    the actual MMU context switch will be done by pmap_load() later
2483  */
2484 
2485 void
2486 pmap_activate(struct lwp *l)
2487 {
2488 	struct cpu_info *ci;
2489 	struct pmap *pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map);
2490 
2491 	KASSERT(kpreempt_disabled());
2492 
2493 	ci = curcpu();
2494 
2495 	if (l == ci->ci_curlwp) {
2496 		KASSERT(ci->ci_want_pmapload == 0);
2497 		KASSERT(ci->ci_tlbstate != TLBSTATE_VALID);
2498 #ifdef KSTACK_CHECK_DR0
2499 		/*
2500 		 * set up a breakpoint at the top of the stack
2501 		 */
2502 		if (l == &lwp0)
2503 			dr0(0, 0, 0, 0);
2504 		else
2505 			dr0(KSTACK_LOWEST_ADDR(l), 1, 3, 1);
2506 #endif
2507 
2508 		/*
2509 		 * no need to switch to the kernel vmspace because
2510 		 * its mappings are a subset of every vmspace.
2511 		 */
2512 
2513 		if (pmap == pmap_kernel()) {
2514 			ci->ci_want_pmapload = 0;
2515 			return;
2516 		}
2517 
2518 		ci->ci_want_pmapload = 1;
2519 	}
2520 }
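
/*
 * Conceptual sketch only (under #if 0, never compiled; not the real MD
 * switch code): how the lazy-load scheme fits together.  pmap_activate()
 * merely requests a load by setting ci_want_pmapload; the actual %cr3/LDT
 * switch is done by pmap_load(), which the MD code invokes later, e.g. on
 * the way back to user mode.
 */
#if 0
static void
pmap_lazy_switch_sketch(struct lwp *l)
{
	kpreempt_disable();
	pmap_activate(l);		/* may set ci_want_pmapload */
	kpreempt_enable();

	/* ... later, before returning to user space ... */
	if (curcpu()->ci_want_pmapload)
		pmap_load();		/* performs the actual MMU switch */
}
#endif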
2521 
2522 /*
2523  * pmap_reactivate: try to regain reference to the pmap.
2524  *
2525  * => must be called with kernel preemption disabled
2526  */
2527 
2528 static bool
2529 pmap_reactivate(struct pmap *pmap)
2530 {
2531 	struct cpu_info *ci;
2532 	uint32_t cpumask;
2533 	bool result;
2534 	uint32_t oldcpus;
2535 
2536 	ci = curcpu();
2537 	cpumask = ci->ci_cpumask;
2538 
2539 	KASSERT(kpreempt_disabled());
2540 #if defined(XEN) && defined(__x86_64__)
2541 	KASSERT(pmap_pdirpa(pmap, 0) == ci->ci_xen_current_user_pgd);
2542 #elif defined(PAE)
2543 	KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(ci->ci_pae_l3_pdir[0]));
2544 #elif !defined(XEN)
2545 	KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(rcr3()));
2546 #endif
2547 
2548 	/*
2549 	 * if we still have a lazy reference to this pmap,
2550 	 * we can assume that there was no tlb shootdown
2551 	 * for this pmap in the meantime.
2552 	 *
2553 	 * the order of events here is important as we must
2554 	 * synchronize with TLB shootdown interrupts.  declare
2555 	 * interest in invalidations (TLBSTATE_VALID) and then
2556 	 * check the cpumask, which the IPIs can change only
2557 	 * when the state is TLBSTATE_LAZY.
2558 	 */
2559 
2560 	ci->ci_tlbstate = TLBSTATE_VALID;
2561 	oldcpus = pmap->pm_cpus;
2562 	KASSERT((pmap->pm_kernel_cpus & cpumask) != 0);
2563 	if (oldcpus & cpumask) {
2564 		/* got it */
2565 		result = true;
2566 	} else {
2567 		/* must reload */
2568 		atomic_or_32(&pmap->pm_cpus, cpumask);
2569 		result = false;
2570 	}
2571 
2572 	return result;
2573 }
2574 
2575 /*
2576  * pmap_load: actually switch pmap.  (fill in %cr3 and LDT info)
2577  */
2578 
2579 void
2580 pmap_load(void)
2581 {
2582 	struct cpu_info *ci;
2583 	uint32_t cpumask;
2584 	struct pmap *pmap;
2585 	struct pmap *oldpmap;
2586 	struct lwp *l;
2587 	struct pcb *pcb;
2588 	uint64_t ncsw;
2589 
2590 	kpreempt_disable();
2591  retry:
2592 	ci = curcpu();
2593 	if (!ci->ci_want_pmapload) {
2594 		kpreempt_enable();
2595 		return;
2596 	}
2597 	cpumask = ci->ci_cpumask;
2598 	l = ci->ci_curlwp;
2599 	ncsw = l->l_ncsw;
2600 
2601 	/* should be able to take ipis. */
2602 	KASSERT(ci->ci_ilevel < IPL_HIGH);
2603 #ifdef XEN
2604 	/* XXX not yet KASSERT(x86_read_psl() != 0); */
2605 #else
2606 	KASSERT((x86_read_psl() & PSL_I) != 0);
2607 #endif
2608 
2609 	KASSERT(l != NULL);
2610 	pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map);
2611 	KASSERT(pmap != pmap_kernel());
2612 	oldpmap = ci->ci_pmap;
2613 	pcb = lwp_getpcb(l);
2614 
2615 	if (pmap == oldpmap) {
2616 		if (!pmap_reactivate(pmap)) {
2617 			u_int gen = uvm_emap_gen_return();
2618 
2619 			/*
2620 			 * the pmap has been changed while it was deactivated;
2621 			 * our TLB may be stale.
2622 			 */
2623 
2624 			tlbflush();
2625 			uvm_emap_update(gen);
2626 		}
2627 
2628 		ci->ci_want_pmapload = 0;
2629 		kpreempt_enable();
2630 		return;
2631 	}
2632 
2633 	/*
2634 	 * grab a reference to the new pmap.
2635 	 */
2636 
2637 	pmap_reference(pmap);
2638 
2639 	/*
2640 	 * actually switch pmap.
2641 	 */
2642 
2643 	atomic_and_32(&oldpmap->pm_cpus, ~cpumask);
2644 	atomic_and_32(&oldpmap->pm_kernel_cpus, ~cpumask);
2645 
2646 #if defined(XEN) && defined(__x86_64__)
2647 	KASSERT(pmap_pdirpa(oldpmap, 0) == ci->ci_xen_current_user_pgd ||
2648 	    oldpmap == pmap_kernel());
2649 #elif defined(PAE)
2650 	KASSERT(pmap_pdirpa(oldpmap, 0) == pmap_pte2pa(ci->ci_pae_l3_pdir[0]));
2651 #elif !defined(XEN)
2652 	KASSERT(pmap_pdirpa(oldpmap, 0) == pmap_pte2pa(rcr3()));
2653 #endif
2654 	KASSERT((pmap->pm_cpus & cpumask) == 0);
2655 	KASSERT((pmap->pm_kernel_cpus & cpumask) == 0);
2656 
2657 	/*
2658 	 * mark the pmap in use by this processor.  again we must
2659 	 * synchronize with TLB shootdown interrupts, so set the
2660 	 * state VALID first, then register ourselves for shootdown events
2661 	 * on this pmap.
2662 	 */
2663 
2664 	ci->ci_tlbstate = TLBSTATE_VALID;
2665 	atomic_or_32(&pmap->pm_cpus, cpumask);
2666 	atomic_or_32(&pmap->pm_kernel_cpus, cpumask);
2667 	ci->ci_pmap = pmap;
2668 
2669 	/*
2670 	 * update tss.  now that we have registered for invalidations
2671 	 * from other CPUs, we're good to load the page tables.
2672 	 */
2673 #ifdef PAE
2674 	pcb->pcb_cr3 = ci->ci_pae_l3_pdirpa;
2675 #else
2676 	pcb->pcb_cr3 = pmap_pdirpa(pmap, 0);
2677 #endif
2678 
2679 #ifdef i386
2680 #ifdef XEN
2681 	/*
2682 	 * clear APDP slot, in case it points to a page table that has
2683 	 * been freed
2684 	 */
2685 	if (*APDP_PDE) {
2686 		pmap_unmap_apdp();
2687 	}
2688 	/* lldt() does pmap_pte_flush() */
2689 #endif /* XEN */
2690 
2691 #ifndef XEN
2692 	ci->ci_tss.tss_ldt = pmap->pm_ldt_sel;
2693 	ci->ci_tss.tss_cr3 = pcb->pcb_cr3;
2694 #endif /* !XEN */
2695 #endif /* i386 */
2696 
2697 	lldt(pmap->pm_ldt_sel);
2698 
2699 	u_int gen = uvm_emap_gen_return();
2700 	cpu_load_pmap(pmap);
2701 	uvm_emap_update(gen);
2702 
2703 	ci->ci_want_pmapload = 0;
2704 
2705 	/*
2706 	 * we're now running with the new pmap.  drop the reference
2707 	 * to the old pmap.  if we block, we need to go around again.
2708 	 */
2709 
2710 	pmap_destroy(oldpmap);
2711 	if (l->l_ncsw != ncsw) {
2712 		goto retry;
2713 	}
2714 
2715 	kpreempt_enable();
2716 }
2717 
2718 /*
2719  * pmap_deactivate: deactivate a process' pmap.
2720  *
2721  * => Must be called with kernel preemption disabled (high IPL is enough).
2722  */
2723 void
2724 pmap_deactivate(struct lwp *l)
2725 {
2726 	struct pmap *pmap;
2727 	struct cpu_info *ci;
2728 
2729 	KASSERT(kpreempt_disabled());
2730 
2731 	if (l != curlwp) {
2732 		return;
2733 	}
2734 
2735 	/*
2736 	 * Wait for pending TLB shootdowns to complete.  Necessary because
2737 	 * TLB shootdown state is per-CPU, and the LWP may be coming off
2738 	 * the CPU before it has a chance to call pmap_update(), e.g. due
2739 	 * to kernel preemption or a blocking routine in between.
2740 	 */
2741 	pmap_tlb_shootnow();
2742 
2743 	ci = curcpu();
2744 
2745 	if (ci->ci_want_pmapload) {
2746 		KASSERT(vm_map_pmap(&l->l_proc->p_vmspace->vm_map)
2747 		    != pmap_kernel());
2748 		KASSERT(vm_map_pmap(&l->l_proc->p_vmspace->vm_map)
2749 		    != ci->ci_pmap || ci->ci_tlbstate != TLBSTATE_VALID);
2750 
2751 		/*
2752 		 * userspace has not been touched.
2753 		 * nothing to do here.
2754 		 */
2755 
2756 		ci->ci_want_pmapload = 0;
2757 		return;
2758 	}
2759 
2760 	pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map);
2761 
2762 	if (pmap == pmap_kernel()) {
2763 		return;
2764 	}
2765 
2766 #if defined(XEN) && defined(__x86_64__)
2767 	KASSERT(pmap_pdirpa(pmap, 0) == ci->ci_xen_current_user_pgd);
2768 #elif defined(PAE)
2769 	KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(ci->ci_pae_l3_pdir[0]));
2770 #elif !defined(XEN)
2771 	KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(rcr3()));
2772 #endif
2773 	KASSERT(ci->ci_pmap == pmap);
2774 
2775 	/*
2776 	 * we aren't interested in TLB invalidations for this pmap,
2777 	 * at least for the time being.
2778 	 */
2779 
2780 	KASSERT(ci->ci_tlbstate == TLBSTATE_VALID);
2781 	ci->ci_tlbstate = TLBSTATE_LAZY;
2782 }
2783 
2784 /*
2785  * end of lifecycle functions
2786  */
2787 
2788 /*
2789  * some misc. functions
2790  */
2791 
2792 int
2793 pmap_pdes_invalid(vaddr_t va, pd_entry_t * const *pdes, pd_entry_t *lastpde)
2794 {
2795 	int i;
2796 	unsigned long index;
2797 	pd_entry_t pde;
2798 
2799 	for (i = PTP_LEVELS; i > 1; i--) {
2800 		index = pl_i(va, i);
2801 		pde = pdes[i - 2][index];
2802 		if ((pde & PG_V) == 0)
2803 			return i;
2804 	}
2805 	if (lastpde != NULL)
2806 		*lastpde = pde;
2807 	return 0;
2808 }
2809 
2810 /*
2811  * pmap_extract: extract a PA for the given VA
2812  */
2813 
2814 bool
2815 pmap_extract(struct pmap *pmap, vaddr_t va, paddr_t *pap)
2816 {
2817 	pt_entry_t *ptes, pte;
2818 	pd_entry_t pde;
2819 	pd_entry_t * const *pdes;
2820 	struct pmap *pmap2;
2821 	struct cpu_info *ci;
2822 	paddr_t pa;
2823 	lwp_t *l;
2824 	bool hard, rv;
2825 
2826 	rv = false;
2827 	pa = 0;
2828 	l = curlwp;
2829 
2830 	KPREEMPT_DISABLE(l);
2831 	ci = l->l_cpu;
2832 	if (__predict_true(!ci->ci_want_pmapload && ci->ci_pmap == pmap) ||
2833 	    pmap == pmap_kernel()) {
2834 		/*
2835 		 * no need to lock, because it's pmap_kernel() or our
2836 		 * own pmap and is active.  if a user pmap, the caller
2837 		 * will hold the vm_map write/read locked and so prevent
2838 		 * entries from disappearing while we are here.  ptps
2839 		 * can disappear via pmap_remove() and pmap_protect(),
2840 		 * but they are called with the vm_map write locked.
2841 		 */
2842 		hard = false;
2843 		ptes = PTE_BASE;
2844 		pdes = normal_pdes;
2845 	} else {
2846 		/* we lose, do it the hard way. */
2847 		hard = true;
2848 		pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
2849 	}
2850 	if (pmap_pdes_valid(va, pdes, &pde)) {
2851 		pte = ptes[pl1_i(va)];
2852 		if (pde & PG_PS) {
2853 			pa = (pde & PG_LGFRAME) | (va & (NBPD_L2 - 1));
2854 			rv = true;
2855 		} else if (__predict_true((pte & PG_V) != 0)) {
2856 			pa = pmap_pte2pa(pte) | (va & (NBPD_L1 - 1));
2857 			rv = true;
2858 		}
2859 	}
2860 	if (__predict_false(hard)) {
2861 		pmap_unmap_ptes(pmap, pmap2);
2862 	}
2863 	KPREEMPT_ENABLE(l);
2864 	if (pap != NULL) {
2865 		*pap = pa;
2866 	}
2867 	return rv;
2868 }
2869 
2870 
2871 /*
2872  * vtophys: virtual address to physical address.  For use by
2873  * machine-dependent code only.
2874  */
2875 
2876 paddr_t
2877 vtophys(vaddr_t va)
2878 {
2879 	paddr_t pa;
2880 
2881 	if (pmap_extract(pmap_kernel(), va, &pa) == true)
2882 		return (pa);
2883 	return (0);
2884 }
2885 
2886 __weak_alias(pmap_extract_ma, pmap_extract);
2887 
2888 #ifdef XEN
2889 
2890 /*
2891  * vtomach: virtual address to machine address.  For use by
2892  * machine-dependent code only.
2893  */
2894 
2895 paddr_t
2896 vtomach(vaddr_t va)
2897 {
2898 	paddr_t pa;
2899 
2900 	if (pmap_extract_ma(pmap_kernel(), va, &pa) == true)
2901 		return (pa);
2902 	return (0);
2903 }
2904 
2905 #endif /* XEN */
2906 
2907 /*
2908  * pmap_virtual_space: used during bootup [pmap_steal_memory] to
2909  *	determine the bounds of the kernel virtual address space.
2910  */
2911 
2912 void
2913 pmap_virtual_space(vaddr_t *startp, vaddr_t *endp)
2914 {
2915 	*startp = virtual_avail;
2916 	*endp = virtual_end;
2917 }
2918 
2919 /*
2920  * pmap_map: map a range of PAs into kvm.
2921  *
2922  * => used during crash dump
2923  * => XXX: pmap_map() should be phased out?
2924  */
2925 
2926 vaddr_t
2927 pmap_map(vaddr_t va, paddr_t spa, paddr_t epa, vm_prot_t prot)
2928 {
2929 	while (spa < epa) {
2930 		pmap_kenter_pa(va, spa, prot, 0);
2931 		va += PAGE_SIZE;
2932 		spa += PAGE_SIZE;
2933 	}
2934 	pmap_update(pmap_kernel());
2935 	return va;
2936 }
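
/*
 * Hypothetical usage sketch (under #if 0, never compiled): a crash-dump
 * style caller mapping a physical range read-only into a scratch KVA
 * window.  The names used here are placeholders.
 */
#if 0
static vaddr_t
pmap_map_usage_sketch(vaddr_t dumpva, paddr_t start, paddr_t end)
{
	return pmap_map(dumpva, start, end, VM_PROT_READ);
}
#endif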
2937 
2938 /*
2939  * pmap_zero_page: zero a page
2940  */
2941 
2942 void
2943 pmap_zero_page(paddr_t pa)
2944 {
2945 	pt_entry_t *zpte;
2946 	void *zerova;
2947 	int id;
2948 
2949 	kpreempt_disable();
2950 	id = cpu_number();
2951 	zpte = PTESLEW(zero_pte, id);
2952 	zerova = VASLEW(zerop, id);
2953 
2954 #ifdef DIAGNOSTIC
2955 	if (*zpte)
2956 		panic("pmap_zero_page: lock botch");
2957 #endif
2958 
2959 	pmap_pte_set(zpte, pmap_pa2pte(pa) | PG_V | PG_RW | PG_M | PG_U | PG_k);
2960 	pmap_pte_flush();
2961 	pmap_update_pg((vaddr_t)zerova);		/* flush TLB */
2962 
2963 	memset(zerova, 0, PAGE_SIZE);
2964 
2965 #if defined(DIAGNOSTIC) || defined(XEN)
2966 	pmap_pte_set(zpte, 0);				/* zap ! */
2967 	pmap_pte_flush();
2968 #endif
2969 	kpreempt_enable();
2970 }
2971 
2972 /*
2973  * pmap_pageidlezero: the same, for the idle loop page zero'er.
2974  * Returns true if the page was zero'd, false if we aborted for
2975  * some reason.
2976  */
2977 
2978 bool
2979 pmap_pageidlezero(paddr_t pa)
2980 {
2981 	pt_entry_t *zpte;
2982 	void *zerova;
2983 	bool rv;
2984 	int id;
2985 
2986 	id = cpu_number();
2987 	zpte = PTESLEW(zero_pte, id);
2988 	zerova = VASLEW(zerop, id);
2989 
2990 	KASSERT(cpu_feature[0] & CPUID_SSE2);
2991 	KASSERT(*zpte == 0);
2992 
2993 	pmap_pte_set(zpte, pmap_pa2pte(pa) | PG_V | PG_RW | PG_M | PG_U | PG_k);
2994 	pmap_pte_flush();
2995 	pmap_update_pg((vaddr_t)zerova);		/* flush TLB */
2996 
2997 	rv = sse2_idlezero_page(zerova);
2998 
2999 #if defined(DIAGNOSTIC) || defined(XEN)
3000 	pmap_pte_set(zpte, 0);				/* zap ! */
3001 	pmap_pte_flush();
3002 #endif
3003 
3004 	return rv;
3005 }
3006 
3007 /*
3008  * pmap_copy_page: copy a page
3009  */
3010 
3011 void
3012 pmap_copy_page(paddr_t srcpa, paddr_t dstpa)
3013 {
3014 	pt_entry_t *spte;
3015 	pt_entry_t *dpte;
3016 	void *csrcva;
3017 	void *cdstva;
3018 	int id;
3019 
3020 	kpreempt_disable();
3021 	id = cpu_number();
3022 	spte = PTESLEW(csrc_pte,id);
3023 	dpte = PTESLEW(cdst_pte,id);
3024 	csrcva = VASLEW(csrcp, id);
3025 	cdstva = VASLEW(cdstp, id);
3026 
3027 	KASSERT(*spte == 0 && *dpte == 0);
3028 
3029 	pmap_pte_set(spte, pmap_pa2pte(srcpa) | PG_V | PG_RW | PG_U | PG_k);
3030 	pmap_pte_set(dpte,
3031 	    pmap_pa2pte(dstpa) | PG_V | PG_RW | PG_M | PG_U | PG_k);
3032 	pmap_pte_flush();
3033 	pmap_update_2pg((vaddr_t)csrcva, (vaddr_t)cdstva);
3034 
3035 	memcpy(cdstva, csrcva, PAGE_SIZE);
3036 
3037 #if defined(DIAGNOSTIC) || defined(XEN)
3038 	pmap_pte_set(spte, 0);
3039 	pmap_pte_set(dpte, 0);
3040 	pmap_pte_flush();
3041 #endif
3042 	kpreempt_enable();
3043 }
3044 
3045 static pt_entry_t *
3046 pmap_map_ptp(struct vm_page *ptp)
3047 {
3048 	pt_entry_t *ptppte;
3049 	void *ptpva;
3050 	int id;
3051 
3052 	KASSERT(kpreempt_disabled());
3053 
3054 	id = cpu_number();
3055 	ptppte = PTESLEW(ptp_pte, id);
3056 	ptpva = VASLEW(ptpp, id);
3057 #if !defined(XEN)
3058 	pmap_pte_set(ptppte, pmap_pa2pte(VM_PAGE_TO_PHYS(ptp)) | PG_V | PG_M |
3059 	    PG_RW | PG_U | PG_k);
3060 #else
3061 	pmap_pte_set(ptppte, pmap_pa2pte(VM_PAGE_TO_PHYS(ptp)) | PG_V | PG_M |
3062 	    PG_U | PG_k);
3063 #endif
3064 	pmap_pte_flush();
3065 	pmap_update_pg((vaddr_t)ptpva);
3066 
3067 	return (pt_entry_t *)ptpva;
3068 }
3069 
3070 static void
3071 pmap_unmap_ptp(void)
3072 {
3073 #if defined(DIAGNOSTIC) || defined(XEN)
3074 	pt_entry_t *pte;
3075 
3076 	KASSERT(kpreempt_disabled());
3077 
3078 	pte = PTESLEW(ptp_pte, cpu_number());
3079 	if (*pte != 0) {
3080 		pmap_pte_set(pte, 0);
3081 		pmap_pte_flush();
3082 	}
3083 #endif
3084 }
3085 
3086 static pt_entry_t *
3087 pmap_map_pte(struct pmap *pmap, struct vm_page *ptp, vaddr_t va)
3088 {
3089 
3090 	KASSERT(kpreempt_disabled());
3091 	if (pmap_is_curpmap(pmap)) {
3092 		return &PTE_BASE[pl1_i(va)]; /* (k)vtopte */
3093 	}
3094 	KASSERT(ptp != NULL);
3095 	return pmap_map_ptp(ptp) + pl1_pi(va);
3096 }
3097 
3098 static void
3099 pmap_unmap_pte(void)
3100 {
3101 
3102 	KASSERT(kpreempt_disabled());
3103 
3104 	pmap_unmap_ptp();
3105 }
3106 
3107 /*
3108  * p m a p   r e m o v e   f u n c t i o n s
3109  *
3110  * functions that remove mappings
3111  */
3112 
3113 /*
3114  * pmap_remove_ptes: remove PTEs from a PTP
3115  *
3116  * => caller must hold pmap's lock
3117  * => PTP must be mapped into KVA
3118  * => PTP should be null if pmap == pmap_kernel()
3119  * => must be called with kernel preemption disabled
3120  * => returns composite pte if at least one page should be shot down
3121  */
3122 
3123 static void
3124 pmap_remove_ptes(struct pmap *pmap, struct vm_page *ptp, vaddr_t ptpva,
3125 		 vaddr_t startva, vaddr_t endva, struct pv_entry **pv_tofree)
3126 {
3127 	pt_entry_t *pte = (pt_entry_t *)ptpva;
3128 
3129 	KASSERT(pmap == pmap_kernel() || mutex_owned(pmap->pm_lock));
3130 	KASSERT(kpreempt_disabled());
3131 
3132 	/*
3133 	 * note that ptpva points to the PTE that maps startva.   this may
3134 	 * or may not be the first PTE in the PTP.
3135 	 *
3136 	 * we loop through the PTP while there are still PTEs to look at
3137 	 * and the wire_count is greater than 1 (because we use the wire_count
3138 	 * to keep track of the number of real PTEs in the PTP).
3139 	 */
3140 	while (startva < endva && (ptp == NULL || ptp->wire_count > 1)) {
3141 		(void)pmap_remove_pte(pmap, ptp, pte, startva, pv_tofree);
3142 		startva += PAGE_SIZE;
3143 		pte++;
3144 	}
3145 }
3146 
3147 
3148 /*
3149  * pmap_remove_pte: remove a single PTE from a PTP.
3150  *
3151  * => caller must hold pmap's lock
3152  * => PTP must be mapped into KVA
3153  * => PTP should be null if pmap == pmap_kernel()
3154  * => returns true if we removed a mapping
3155  * => must be called with kernel preemption disabled
3156  */
3157 static bool
3158 pmap_remove_pte(struct pmap *pmap, struct vm_page *ptp, pt_entry_t *pte,
3159 		vaddr_t va, struct pv_entry **pv_tofree)
3160 {
3161 	struct pv_entry *pve;
3162 	struct vm_page *pg;
3163 	struct pmap_page *pp;
3164 	pt_entry_t opte;
3165 
3166 	KASSERT(pmap == pmap_kernel() || mutex_owned(pmap->pm_lock));
3167 	KASSERT(kpreempt_disabled());
3168 
3169 	if (!pmap_valid_entry(*pte)) {
3170 		/* VA not mapped. */
3171 		return false;
3172 	}
3173 
3174 	/* Atomically save the old PTE and zap it. */
3175 	opte = pmap_pte_testset(pte, 0);
3176 	if (!pmap_valid_entry(opte)) {
3177 		return false;
3178 	}
3179 
3180 	pmap_exec_account(pmap, va, opte, 0);
3181 	pmap_stats_update_bypte(pmap, 0, opte);
3182 
3183 	if (ptp) {
3184 		/*
3185 		 * Dropping a PTE.  Make sure that the PDE is flushed.
3186 		 */
3187 		ptp->wire_count--;
3188 		if (ptp->wire_count <= 1) {
3189 			opte |= PG_U;
3190 		}
3191 	}
3192 
3193 	if ((opte & PG_U) != 0) {
3194 		pmap_tlb_shootdown(pmap, va, opte, TLBSHOOT_REMOVE_PTE);
3195 	}
3196 
3197 	/*
3198 	 * If we are not on a pv_head list - we are done.
3199 	 */
3200 	if ((opte & PG_PVLIST) == 0) {
3201 #if defined(DIAGNOSTIC) && !defined(DOM0OPS)
3202 		if (PHYS_TO_VM_PAGE(pmap_pte2pa(opte)) != NULL)
3203 			panic("pmap_remove_pte: managed page without "
3204 			      "PG_PVLIST for %#" PRIxVADDR, va);
3205 #endif
3206 		return true;
3207 	}
3208 
3209 	pg = PHYS_TO_VM_PAGE(pmap_pte2pa(opte));
3210 
3211 	KASSERTMSG(pg != NULL, ("pmap_remove_pte: unmanaged page marked "
3212 	    "PG_PVLIST, va = %#" PRIxVADDR ", pa = %#" PRIxPADDR,
3213 	    va, (paddr_t)pmap_pte2pa(opte)));
3214 
3215 	KASSERT(uvm_page_locked_p(pg));
3216 
3217 	/* Sync R/M bits. */
3218 	pp = VM_PAGE_TO_PP(pg);
3219 	pp->pp_attrs |= opte;
3220 	pve = pmap_remove_pv(pp, ptp, va);
3221 
3222 	if (pve) {
3223 		pve->pve_next = *pv_tofree;
3224 		*pv_tofree = pve;
3225 	}
3226 	return true;
3227 }
3228 
3229 /*
3230  * pmap_remove: mapping removal function.
3231  *
3232  * => caller should not be holding any pmap locks
3233  */
3234 
3235 void
3236 pmap_remove(struct pmap *pmap, vaddr_t sva, vaddr_t eva)
3237 {
3238 	pt_entry_t *ptes;
3239 	pd_entry_t pde;
3240 	pd_entry_t * const *pdes;
3241 	struct pv_entry *pv_tofree = NULL;
3242 	bool result;
3243 	int i;
3244 	paddr_t ptppa;
3245 	vaddr_t blkendva, va = sva;
3246 	struct vm_page *ptp;
3247 	struct pmap *pmap2;
3248 
3249 	kpreempt_disable();
3250 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);	/* locks pmap */
3251 
3252 	/*
3253 	 * removing one page?  take shortcut function.
3254 	 */
3255 
3256 	if (va + PAGE_SIZE == eva) {
3257 		if (pmap_pdes_valid(va, pdes, &pde)) {
3258 
3259 			/* PA of the PTP */
3260 			ptppa = pmap_pte2pa(pde);
3261 
3262 			/* Get PTP if non-kernel mapping. */
3263 			if (pmap != pmap_kernel()) {
3264 				ptp = pmap_find_ptp(pmap, va, ptppa, 1);
3265 				KASSERTMSG(ptp != NULL,
3266 				    ("pmap_remove: unmanaged PTP detected")
3267 				);
3268 			} else {
3269 				/* Never free kernel PTPs. */
3270 				ptp = NULL;
3271 			}
3272 
3273 			result = pmap_remove_pte(pmap, ptp,
3274 			    &ptes[pl1_i(va)], va, &pv_tofree);
3275 
3276 			/*
3277 			 * if mapping removed and the PTP is no longer
3278 			 * being used, free it!
3279 			 */
3280 
3281 			if (result && ptp && ptp->wire_count <= 1)
3282 				pmap_free_ptp(pmap, ptp, va, ptes, pdes);
3283 		}
3284 	} else for (/* null */ ; va < eva ; va = blkendva) {
3285 		int lvl;
3286 
3287 		/* determine range of block */
3288 		blkendva = x86_round_pdr(va+1);
3289 		if (blkendva > eva)
3290 			blkendva = eva;
3291 
3292 		/*
3293 		 * XXXCDC: our PTE mappings should never be removed
3294 		 * with pmap_remove!  if we allow this (and why would
3295 		 * we?) then we end up freeing the pmap's page
3296 		 * directory page (PDP) before we are finished using
3297 		 * it when we hit it in the recursive mapping.  this
3298 		 * is BAD.
3299 		 *
3300 		 * long term solution is to move the PTEs out of user
3301 		 * address space and into kernel address space (up
3302 		 * with APTE).  then we can set VM_MAXUSER_ADDRESS to
3303 		 * be VM_MAX_ADDRESS.
3304 		 */
3305 
3306 		/* XXXCDC: ugly hack to avoid freeing PDP here */
3307 		for (i = 0; i < PDP_SIZE; i++)
3308 			if (pl_i(va, PTP_LEVELS) == PDIR_SLOT_PTE + i)
3309 				break;
3310 		if (i < PDP_SIZE) continue;	/* skip the recursive PTE slots */
3311 
3312 		lvl = pmap_pdes_invalid(va, pdes, &pde);
3313 		if (lvl != 0) {
3314 			/*
3315 			 * skip a range corresponding to an invalid pde.
3316 			 */
3317 			blkendva = (va & ptp_masks[lvl - 1]) + nbpd[lvl - 1];
3318  			continue;
3319 		}
3320 
3321 		/* PA of the PTP */
3322 		ptppa = pmap_pte2pa(pde);
3323 
3324 		/* Get PTP if non-kernel mapping. */
3325 		if (pmap != pmap_kernel()) {
3326 			ptp = pmap_find_ptp(pmap, va, ptppa, 1);
3327 			KASSERTMSG(ptp != NULL,
3328 			    ("pmap_remove: unmanaged PTP detected")
3329 			);
3330 		} else {
3331 			/* Never free kernel PTPs. */
3332 			ptp = NULL;
3333 		}
3334 
3335 		pmap_remove_ptes(pmap, ptp, (vaddr_t)&ptes[pl1_i(va)], va,
3336 		    blkendva, &pv_tofree);
3337 
3338 		/* if PTP is no longer being used, free it! */
3339 		if (ptp && ptp->wire_count <= 1) {
3340 			pmap_free_ptp(pmap, ptp, va, ptes, pdes);
3341 		}
3342 	}
3343 	pmap_unmap_ptes(pmap, pmap2);		/* unlock pmap */
3344 	kpreempt_enable();
3345 
3346 	/* Now we free unused PVs */
3347 	if (pv_tofree)
3348 		pmap_free_pvs(pv_tofree);
3349 }
3350 
3351 /*
3352  * pmap_sync_pv: clear pte bits and return the old pte contents via optep.
3353  *
3354  * => Caller should disable kernel preemption.
3355  * => issues tlb shootdowns if necessary.
3356  */
3357 
3358 static int
3359 pmap_sync_pv(struct pv_pte *pvpte, pt_entry_t expect, int clearbits,
3360     pt_entry_t *optep)
3361 {
3362 	struct pmap *pmap;
3363 	struct vm_page *ptp;
3364 	vaddr_t va;
3365 	pt_entry_t *ptep;
3366 	pt_entry_t opte;
3367 	pt_entry_t npte;
3368 	bool need_shootdown;
3369 
3370 	ptp = pvpte->pte_ptp;
3371 	va = pvpte->pte_va;
3372 	KASSERT(ptp == NULL || ptp->uobject != NULL);
3373 	KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset);
3374 	pmap = ptp_to_pmap(ptp);
3375 
3376 	KASSERT((expect & ~(PG_FRAME | PG_V)) == 0);
3377 	KASSERT((expect & PG_V) != 0);
3378 	KASSERT(clearbits == ~0 || (clearbits & ~(PG_M | PG_U | PG_RW)) == 0);
3379 	KASSERT(kpreempt_disabled());
3380 
3381 	ptep = pmap_map_pte(pmap, ptp, va);
3382 	do {
3383 		opte = *ptep;
3384 		KASSERT((opte & (PG_M | PG_U)) != PG_M);
3385 		KASSERT((opte & (PG_U | PG_V)) != PG_U);
3386 		KASSERT(opte == 0 || (opte & PG_V) != 0);
3387 		if ((opte & (PG_FRAME | PG_V)) != expect) {
3388 
3389 			/*
3390 			 * we lost a race with a V->P operation like
3391 			 * pmap_remove().  wait for the competitor to finish
3392 			 * reflecting the pte bits into pp_attrs.
3393 			 *
3394 			 * issue a redundant TLB shootdown so that
3395 			 * we can wait for its completion.
3396 			 */
3397 
3398 			pmap_unmap_pte();
3399 			if (clearbits != 0) {
3400 				pmap_tlb_shootdown(pmap, va,
3401 				    (pmap == pmap_kernel() ? PG_G : 0),
3402 				    TLBSHOOT_SYNC_PV1);
3403 			}
3404 			return EAGAIN;
3405 		}
3406 
3407 		/*
3408 		 * check if there's anything to do on this pte.
3409 		 */
3410 
3411 		if ((opte & clearbits) == 0) {
3412 			need_shootdown = false;
3413 			break;
3414 		}
3415 
3416 		/*
3417 		 * we need a shootdown if the pte is cached. (PG_U)
3418 		 *
3419 		 * ...unless we are clearing only the PG_RW bit and
3420 		 * it isn't cached as RW. (PG_M)
3421 		 */
3422 
3423 		need_shootdown = (opte & PG_U) != 0 &&
3424 		    !(clearbits == PG_RW && (opte & PG_M) == 0);
3425 
3426 		npte = opte & ~clearbits;
3427 
3428 		/*
3429 		 * if we need a shootdown anyway, clear PG_U and PG_M.
3430 		 */
3431 
3432 		if (need_shootdown) {
3433 			npte &= ~(PG_U | PG_M);
3434 		}
3435 		KASSERT((npte & (PG_M | PG_U)) != PG_M);
3436 		KASSERT((npte & (PG_U | PG_V)) != PG_U);
3437 		KASSERT(npte == 0 || (opte & PG_V) != 0);
3438 	} while (pmap_pte_cas(ptep, opte, npte) != opte);
3439 
3440 	if (need_shootdown) {
3441 		pmap_tlb_shootdown(pmap, va, opte, TLBSHOOT_SYNC_PV2);
3442 	}
3443 	pmap_unmap_pte();
3444 
3445 	*optep = opte;
3446 	return 0;
3447 }
3448 
3449 /*
3450  * pmap_page_remove: remove a managed vm_page from all pmaps that map it
3451  *
3452  * => R/M bits are sync'd back to attrs
3453  */
3454 
3455 void
3456 pmap_page_remove(struct vm_page *pg)
3457 {
3458 	struct pmap_page *pp;
3459 	struct pv_pte *pvpte;
3460 	struct pv_entry *killlist = NULL;
3461 	struct vm_page *ptp;
3462 	pt_entry_t expect;
3463 	lwp_t *l;
3464 	int count;
3465 
3466 	KASSERT(uvm_page_locked_p(pg));
3467 
3468 	l = curlwp;
3469 	pp = VM_PAGE_TO_PP(pg);
3470 	expect = pmap_pa2pte(VM_PAGE_TO_PHYS(pg)) | PG_V;
3471 	count = SPINLOCK_BACKOFF_MIN;
3472 	kpreempt_disable();
3473 startover:
3474 	while ((pvpte = pv_pte_first(pp)) != NULL) {
3475 		struct pmap *pmap;
3476 		struct pv_entry *pve;
3477 		pt_entry_t opte;
3478 		vaddr_t va;
3479 		int error;
3480 
3481 		/*
3482 		 * add a reference to the pmap before clearing the pte.
3483 		 * otherwise the pmap can disappear behind us.
3484 		 */
3485 
3486 		ptp = pvpte->pte_ptp;
3487 		pmap = ptp_to_pmap(ptp);
3488 		if (ptp != NULL) {
3489 			pmap_reference(pmap);
3490 		}
3491 
3492 		error = pmap_sync_pv(pvpte, expect, ~0, &opte);
3493 		if (error == EAGAIN) {
3494 			int hold_count;
3495 			KERNEL_UNLOCK_ALL(curlwp, &hold_count);
3496 			if (ptp != NULL) {
3497 				pmap_destroy(pmap);
3498 			}
3499 			SPINLOCK_BACKOFF(count);
3500 			KERNEL_LOCK(hold_count, curlwp);
3501 			goto startover;
3502 		}
3503 
3504 		pp->pp_attrs |= opte;
3505 		va = pvpte->pte_va;
3506 		pve = pmap_remove_pv(pp, ptp, va);
3507 
3508 		/* update the PTP reference count.  free if last reference. */
3509 		if (ptp != NULL) {
3510 			struct pmap *pmap2;
3511 			pt_entry_t *ptes;
3512 			pd_entry_t * const *pdes;
3513 
3514 			KASSERT(pmap != pmap_kernel());
3515 
3516 			pmap_tlb_shootnow();
3517 			pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
3518 			pmap_stats_update_bypte(pmap, 0, opte);
3519 			ptp->wire_count--;
3520 			if (ptp->wire_count <= 1) {
3521 				pmap_free_ptp(pmap, ptp, va, ptes, pdes);
3522 			}
3523 			pmap_unmap_ptes(pmap, pmap2);
3524 			pmap_destroy(pmap);
3525 		} else {
3526 			KASSERT(pmap == pmap_kernel());
3527 			pmap_stats_update_bypte(pmap, 0, opte);
3528 		}
3529 
3530 		if (pve != NULL) {
3531 			pve->pve_next = killlist;	/* mark it for death */
3532 			killlist = pve;
3533 		}
3534 	}
3535 	pmap_tlb_shootnow();
3536 	kpreempt_enable();
3537 
3538 	/* Now free unused pvs. */
3539 	pmap_free_pvs(killlist);
3540 }
3541 
3542 /*
3543  * p m a p   a t t r i b u t e  f u n c t i o n s
3544  * functions that test/change a managed page's attributes
3545  * since a page can be mapped multiple times we must check each PTE that
3546  * maps it by going down the pv lists.
3547  */
3548 
3549 /*
3550  * pmap_test_attrs: test a page's attributes
3551  */
3552 
3553 bool
3554 pmap_test_attrs(struct vm_page *pg, unsigned testbits)
3555 {
3556 	struct pmap_page *pp;
3557 	struct pv_pte *pvpte;
3558 	pt_entry_t expect;
3559 	u_int result;
3560 
3561 	KASSERT(uvm_page_locked_p(pg));
3562 
3563 	pp = VM_PAGE_TO_PP(pg);
3564 	if ((pp->pp_attrs & testbits) != 0) {
3565 		return true;
3566 	}
3567 	expect = pmap_pa2pte(VM_PAGE_TO_PHYS(pg)) | PG_V;
3568 	kpreempt_disable();
3569 	for (pvpte = pv_pte_first(pp); pvpte; pvpte = pv_pte_next(pp, pvpte)) {
3570 		pt_entry_t opte;
3571 		int error;
3572 
3573 		if ((pp->pp_attrs & testbits) != 0) {
3574 			break;
3575 		}
3576 		error = pmap_sync_pv(pvpte, expect, 0, &opte);
3577 		if (error == 0) {
3578 			pp->pp_attrs |= opte;
3579 		}
3580 	}
3581 	result = pp->pp_attrs & testbits;
3582 	kpreempt_enable();
3583 
3584 	/*
3585 	 * note that we will break out of the loop as soon as the bits
3586 	 * we are testing for show up in pp_attrs.
3587 	 */
3588 
3589 	return result != 0;
3590 }
3591 
3592 /*
3593  * pmap_clear_attrs: clear the specified attribute for a page.
3594  *
3595  * => we return true if we cleared one of the bits we were asked to
3596  */
3597 
3598 bool
3599 pmap_clear_attrs(struct vm_page *pg, unsigned clearbits)
3600 {
3601 	struct pmap_page *pp;
3602 	struct pv_pte *pvpte;
3603 	u_int result;
3604 	pt_entry_t expect;
3605 	int count;
3606 
3607 	KASSERT(uvm_page_locked_p(pg));
3608 
3609 	pp = VM_PAGE_TO_PP(pg);
3610 	expect = pmap_pa2pte(VM_PAGE_TO_PHYS(pg)) | PG_V;
3611 	count = SPINLOCK_BACKOFF_MIN;
3612 	kpreempt_disable();
3613 startover:
3614 	for (pvpte = pv_pte_first(pp); pvpte; pvpte = pv_pte_next(pp, pvpte)) {
3615 		pt_entry_t opte;
3616 		int error;
3617 
3618 		error = pmap_sync_pv(pvpte, expect, clearbits, &opte);
3619 		if (error == EAGAIN) {
3620 			int hold_count;
3621 			KERNEL_UNLOCK_ALL(curlwp, &hold_count);
3622 			SPINLOCK_BACKOFF(count);
3623 			KERNEL_LOCK(hold_count, curlwp);
3624 			goto startover;
3625 		}
3626 		pp->pp_attrs |= opte;
3627 	}
3628 	result = pp->pp_attrs & clearbits;
3629 	pp->pp_attrs &= ~clearbits;
3630 	kpreempt_enable();
3631 
3632 	return result != 0;
3633 }
3634 
3635 
3636 /*
3637  * p m a p   p r o t e c t i o n   f u n c t i o n s
3638  */
3639 
3640 /*
3641  * pmap_page_protect: change the protection of all recorded mappings
3642  *	of a managed page
3643  *
3644  * => NOTE: this is an inline function in pmap.h
3645  */
3646 
3647 /* see pmap.h */
3648 
3649 /*
3650  * pmap_protect: set the protection of the pages in a pmap
3651  *
3652  * => NOTE: this is an inline function in pmap.h
3653  */
3654 
3655 /* see pmap.h */
3656 
3657 /*
3658  * pmap_write_protect: write-protect pages in a pmap
3659  */
3660 
3661 void
3662 pmap_write_protect(struct pmap *pmap, vaddr_t sva, vaddr_t eva, vm_prot_t prot)
3663 {
3664 	int i;
3665 	pt_entry_t *ptes, *epte;
3666 	pt_entry_t *spte;
3667 	pd_entry_t * const *pdes;
3668 	vaddr_t blockend, va;
3669 	pt_entry_t opte;
3670 	struct pmap *pmap2;
3671 
3672 	KASSERT(curlwp->l_md.md_gc_pmap != pmap);
3673 
3674 	kpreempt_disable();
3675 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);	/* locks pmap */
3676 
3677 	/* should be ok, but just in case ... */
3678 	sva &= PG_FRAME;
3679 	eva &= PG_FRAME;
3680 
3681 	for (va = sva ; va < eva ; va = blockend) {
3682 
3683 		blockend = (va & L2_FRAME) + NBPD_L2;
3684 		if (blockend > eva)
3685 			blockend = eva;
3686 
3687 		/*
3688 		 * XXXCDC: our PTE mappings should never be write-protected!
3689 		 *
3690 		 * long term solution is to move the PTEs out of user
3691 		 * address space and into kernel address space (up
3692 		 * with APTE).  then we can set VM_MAXUSER_ADDRESS to
3693 		 * be VM_MAX_ADDRESS.
3694 		 */
3695 
3696 		/* XXXCDC: ugly hack to avoid touching the PTE area (PDP) here */
3697 		for (i = 0; i < PDP_SIZE; i++)
3698 			if (pl_i(va, PTP_LEVELS) == PDIR_SLOT_PTE + i)
3699 				break;
3700 		if (i < PDP_SIZE) continue;	/* skip the recursive PTE slots */
3701 
3702 		/* empty block? */
3703 		if (!pmap_pdes_valid(va, pdes, NULL))
3704 			continue;
3705 
3706 #ifdef DIAGNOSTIC
3707 		if (va >= VM_MAXUSER_ADDRESS &&
3708 		    va < VM_MAX_ADDRESS)
3709 			panic("pmap_write_protect: PTE space");
3710 #endif
3711 
3712 		spte = &ptes[pl1_i(va)];
3713 		epte = &ptes[pl1_i(blockend)];
3714 
3715 		for (/*null */; spte < epte ; spte++) {
3716 			pt_entry_t npte;
3717 
3718 			do {
3719 				opte = *spte;
3720 				if ((~opte & (PG_RW | PG_V)) != 0) {
3721 					goto next;
3722 				}
3723 				npte = opte & ~PG_RW;
3724 			} while (pmap_pte_cas(spte, opte, npte) != opte);
3725 			if ((opte & PG_M) != 0) {
3726 				vaddr_t tva;
3727 
3728 				tva = x86_ptob(spte - ptes);
3729 				pmap_tlb_shootdown(pmap, tva, opte,
3730 				    TLBSHOOT_WRITE_PROTECT);
3731 			}
3732 next:;
3733 		}
3734 	}
3735 
3736 	pmap_unmap_ptes(pmap, pmap2);	/* unlocks pmap */
3737 	kpreempt_enable();
3738 }
3739 
3740 /*
3741  * end of protection functions
3742  */
3743 
3744 /*
3745  * pmap_unwire: clear the wired bit in the PTE
3746  *
3747  * => mapping should already be in map
3748  */
3749 
3750 void
3751 pmap_unwire(struct pmap *pmap, vaddr_t va)
3752 {
3753 	pt_entry_t *ptes;
3754 	pd_entry_t * const *pdes;
3755 	struct pmap *pmap2;
3756 
3757 	kpreempt_disable();
3758 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);	/* locks pmap */
3759 
3760 	if (pmap_pdes_valid(va, pdes, NULL)) {
3761 		pt_entry_t *ptep = &ptes[pl1_i(va)];
3762 		pt_entry_t opte = *ptep;
3763 
3764 #ifdef DIAGNOSTIC
3765 		if (!pmap_valid_entry(opte))
3766 			panic("pmap_unwire: invalid (unmapped) va 0x%lx", va);
3767 #endif
3768 		if ((opte & PG_W) != 0) {
3769 			pt_entry_t npte = opte & ~PG_W;
3770 
3771 			opte = pmap_pte_testset(ptep, npte);
3772 			pmap_stats_update_bypte(pmap, npte, opte);
3773 		}
3774 #ifdef DIAGNOSTIC
3775 		else {
3776 			printf("pmap_unwire: wiring for pmap %p va 0x%lx "
3777 			       "didn't change!\n", pmap, va);
3778 		}
3779 #endif
3780 		pmap_unmap_ptes(pmap, pmap2);		/* unlocks map */
3781 	}
3782 #ifdef DIAGNOSTIC
3783 	else {
3784 		panic("pmap_unwire: invalid PDE");
3785 	}
3786 #endif
3787 	kpreempt_enable();
3788 }
3789 
3790 /*
3791  * pmap_copy: copy mappings from one pmap to another
3792  *
3793  * => optional function
3794  * void pmap_copy(dst_pmap, src_pmap, dst_addr, len, src_addr)
3795  */
3796 
3797 /*
3798  * defined as macro in pmap.h
3799  */
3800 
3801 __weak_alias(pmap_enter, pmap_enter_default);
3802 
3803 int
3804 pmap_enter_default(pmap_t pmap, vaddr_t va, paddr_t pa, vm_prot_t prot,
3805     u_int flags)
3806 {
3807 	return pmap_enter_ma(pmap, va, pa, pa, prot, flags, 0);
3808 }
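
/*
 * Hedged usage sketch (under #if 0, never compiled): a caller that can
 * tolerate failure passes PMAP_CANFAIL in the flags and handles ENOMEM,
 * which pmap_enter_ma() below returns when a pv entry or PTP cannot be
 * allocated.  Names are placeholders for illustration only.
 */
#if 0
static int
pmap_enter_canfail_sketch(struct pmap *pm, vaddr_t va, paddr_t pa)
{
	int error;

	error = pmap_enter(pm, va, pa, VM_PROT_READ | VM_PROT_WRITE,
	    VM_PROT_READ | VM_PROT_WRITE | PMAP_CANFAIL);
	if (error != 0) {
		/* ENOMEM: pv entry or PTP allocation failed; caller may retry */
	}
	return error;
}
#endif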
3809 
3810 /*
3811  * pmap_enter: enter a mapping into a pmap
3812  *
3813  * => must be done "now" ... no lazy-evaluation
3814  * => we set pmap => pv_head locking
3815  */
3816 int
3817 pmap_enter_ma(struct pmap *pmap, vaddr_t va, paddr_t ma, paddr_t pa,
3818 	   vm_prot_t prot, u_int flags, int domid)
3819 {
3820 	pt_entry_t *ptes, opte, npte;
3821 	pt_entry_t *ptep;
3822 	pd_entry_t * const *pdes;
3823 	struct vm_page *ptp, *pg;
3824 	struct pmap_page *new_pp;
3825 	struct pmap_page *old_pp;
3826 	struct pv_entry *old_pve = NULL;
3827 	struct pv_entry *new_pve;
3828 	struct pv_entry *new_pve2;
3829 	int error;
3830 	bool wired = (flags & PMAP_WIRED) != 0;
3831 	struct pmap *pmap2;
3832 
3833 	KASSERT(pmap_initialized);
3834 	KASSERT(curlwp->l_md.md_gc_pmap != pmap);
3835 
3836 #ifdef DIAGNOSTIC
3837 	/* sanity check: totally out of range? */
3838 	if (va >= VM_MAX_KERNEL_ADDRESS)
3839 		panic("pmap_enter: too big");
3840 
3841 	if (va == (vaddr_t) PDP_BASE || va == (vaddr_t) APDP_BASE)
3842 		panic("pmap_enter: trying to map over PDP/APDP!");
3843 
3844 	/* sanity check: kernel PTPs should already have been pre-allocated */
3845 	if (va >= VM_MIN_KERNEL_ADDRESS &&
3846 	    !pmap_valid_entry(pmap->pm_pdir[pl_i(va, PTP_LEVELS)]))
3847 		panic("pmap_enter: missing kernel PTP for va %lx!", va);
3848 #endif /* DIAGNOSTIC */
3849 #ifdef XEN
3850 	KASSERT(domid == DOMID_SELF || pa == 0);
3851 #endif /* XEN */
3852 
3853 	npte = ma | protection_codes[prot] | PG_V;
3854 	npte |= pmap_pat_flags(flags);
3855 	if (wired)
3856 	        npte |= PG_W;
3857 	if (va < VM_MAXUSER_ADDRESS)
3858 		npte |= PG_u;
3859 	else if (va < VM_MAX_ADDRESS)
3860 		npte |= (PG_u | PG_RW);	/* XXXCDC: no longer needed? */
3861 	else
3862 		npte |= PG_k;
3863 	if (pmap == pmap_kernel())
3864 		npte |= pmap_pg_g;
3865 	if (flags & VM_PROT_ALL) {
3866 		npte |= PG_U;
3867 		if (flags & VM_PROT_WRITE) {
3868 			KASSERT((npte & PG_RW) != 0);
3869 			npte |= PG_M;
3870 		}
3871 	}
3872 
3873 #ifdef XEN
3874 	if (domid != DOMID_SELF)
3875 		pg = NULL;
3876 	else
3877 #endif
3878 		pg = PHYS_TO_VM_PAGE(pa);
3879 	if (pg != NULL) {
3880 		/* This is a managed page */
3881 		npte |= PG_PVLIST;
3882 		new_pp = VM_PAGE_TO_PP(pg);
3883 	} else {
3884 		new_pp = NULL;
3885 	}
3886 
3887 	/* Allocate PV entries up front, before taking the pmap lock. */
3888 	new_pve = pool_cache_get(&pmap_pv_cache, PR_NOWAIT);
3889 	new_pve2 = pool_cache_get(&pmap_pv_cache, PR_NOWAIT);
3890 	if (new_pve == NULL || new_pve2 == NULL) {
3891 		if (flags & PMAP_CANFAIL) {
3892 			error = ENOMEM;
3893 			goto out2;
3894 		}
3895 		panic("pmap_enter: pve allocation failed");
3896 	}
3897 
3898 	kpreempt_disable();
3899 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);	/* locks pmap */
3900 	if (pmap == pmap_kernel()) {
3901 		ptp = NULL;
3902 	} else {
3903 		ptp = pmap_get_ptp(pmap, va, pdes);
3904 		if (ptp == NULL) {
3905 			pmap_unmap_ptes(pmap, pmap2);
3906 			if (flags & PMAP_CANFAIL) {
3907 				error = ENOMEM;
3908 				goto out;
3909 			}
3910 			panic("pmap_enter: get ptp failed");
3911 		}
3912 	}
3913 
3914 	/*
3915 	 * update the pte.
3916 	 */
3917 
3918 	ptep = &ptes[pl1_i(va)];
3919 	do {
3920 		opte = *ptep;
3921 
3922 		/*
3923 		 * if the same page, inherit PG_U and PG_M.
3924 		 */
3925 		if (((opte ^ npte) & (PG_FRAME | PG_V)) == 0) {
3926 			npte |= opte & (PG_U | PG_M);
3927 		}
3928 #if defined(XEN)
3929 		if (domid != DOMID_SELF) {
3930 			/* pmap_pte_cas with error handling */
3931 			int s = splvm();
3932 			if (opte != *ptep) {
3933 				splx(s);
3934 				continue;
3935 			}
3936 			error = xpq_update_foreign(
3937 			    vtomach((vaddr_t)ptep), npte, domid);
3938 			splx(s);
3939 			if (error) {
3940 				if (ptp != NULL && ptp->wire_count <= 1) {
3941 					pmap_free_ptp(pmap, ptp, va, ptes, pdes);
3942 				}
3943 				pmap_unmap_ptes(pmap, pmap2);
3944 				goto out;
3945 			}
3946 			break;
3947 		}
3948 #endif /* defined(XEN) */
3949 	} while (pmap_pte_cas(ptep, opte, npte) != opte);
3950 
3951 	/*
3952 	 * update statistics and PTP's reference count.
3953 	 */
3954 
3955 	pmap_stats_update_bypte(pmap, npte, opte);
3956 	if (ptp != NULL && !pmap_valid_entry(opte)) {
3957 		ptp->wire_count++;
3958 	}
3959 	KASSERT(ptp == NULL || ptp->wire_count > 1);
3960 
3961 	/*
3962 	 * if the same page, we can skip pv_entry handling.
3963 	 */
3964 
3965 	if (((opte ^ npte) & (PG_FRAME | PG_V)) == 0) {
3966 		KASSERT(((opte ^ npte) & PG_PVLIST) == 0);
3967 		goto same_pa;
3968 	}
3969 
3970 	/*
3971 	 * if old page is managed, remove pv_entry from its list.
3972 	 */
3973 
3974 	if ((~opte & (PG_V | PG_PVLIST)) == 0) {
3975 		pg = PHYS_TO_VM_PAGE(pmap_pte2pa(opte));
3976 
3977 		KASSERTMSG(pg != NULL, ("pmap_enter: PG_PVLIST mapping with "
3978 		    "unmanaged page pa = 0x%" PRIx64 " (0x%" PRIx64 ")",
3979 		    (int64_t)pa, (int64_t)atop(pa)));
3980 
3981 		KASSERT(uvm_page_locked_p(pg));
3982 
3983 		old_pp = VM_PAGE_TO_PP(pg);
3984 		old_pve = pmap_remove_pv(old_pp, ptp, va);
3985 		old_pp->pp_attrs |= opte;
3986 	}
3987 
3988 	/*
3989 	 * if new page is managed, insert pv_entry into its list.
3990 	 */
3991 
3992 	if (new_pp) {
3993 		new_pve = pmap_enter_pv(new_pp, new_pve, &new_pve2, ptp, va);
3994 	}
3995 
3996 same_pa:
3997 	pmap_unmap_ptes(pmap, pmap2);
3998 
3999 	/*
4000 	 * shootdown tlb if necessary.
4001 	 */
4002 
4003 	if ((~opte & (PG_V | PG_U)) == 0 &&
4004 	    ((opte ^ npte) & (PG_FRAME | PG_RW)) != 0) {
4005 		pmap_tlb_shootdown(pmap, va, opte, TLBSHOOT_ENTER);
4006 	}
4007 
4008 	error = 0;
4009 out:
4010 	kpreempt_enable();
4011 out2:
4012 	if (old_pve != NULL) {
4013 		pool_cache_put(&pmap_pv_cache, old_pve);
4014 	}
4015 	if (new_pve != NULL) {
4016 		pool_cache_put(&pmap_pv_cache, new_pve);
4017 	}
4018 	if (new_pve2 != NULL) {
4019 		pool_cache_put(&pmap_pv_cache, new_pve2);
4020 	}
4021 
4022 	return error;
4023 }
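
/*
 * A minimal sketch (hypothetical helper, not part of this file) of the
 * usual PMAP_CANFAIL pattern: rather than letting pmap_enter() panic on
 * a PV-entry or PTP shortage, the caller asks for ENOMEM, waits for
 * memory and retries.
 */
#if 0
static void
example_enter_retry(struct pmap *pm, vaddr_t va, paddr_t pa, vm_prot_t prot)
{

	while (pmap_enter(pm, va, pa, prot, prot | PMAP_CANFAIL) != 0) {
		/* ENOMEM from the pve/ptp allocations above; wait and retry */
		uvm_wait("pmapent");
	}
}
#endif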
4024 
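/*
 * pmap_get_physpage: allocate one physical page to back a kernel PTP
 *
 * => before uvm_page_init() the page comes from uvm_page_physget() and
 *	is zeroed through the early_zero_pte scratch mapping
 * => afterwards it comes from uvm_pagealloc() with UVM_PGA_ZERO
 * => the physical address of the new page is returned in *paddrp
 */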
4025 static bool
4026 pmap_get_physpage(vaddr_t va, int level, paddr_t *paddrp)
4027 {
4028 	struct vm_page *ptp;
4029 	struct pmap *kpm = pmap_kernel();
4030 
4031 	if (uvm.page_init_done == false) {
4032 		/*
4033 		 * we're growing the kernel pmap early (from
4034 		 * uvm_pageboot_alloc()).  this case must be
4035 		 * handled a little differently.
4036 		 */
4037 
4038 		if (uvm_page_physget(paddrp) == false)
4039 			panic("pmap_get_physpage: out of memory");
4040 		kpreempt_disable();
4041 		pmap_pte_set(early_zero_pte,
4042 		    pmap_pa2pte(*paddrp) | PG_V | PG_RW | PG_k);
4043 		pmap_pte_flush();
4044 		pmap_update_pg((vaddr_t)early_zerop);
4045 		memset(early_zerop, 0, PAGE_SIZE);
4046 #if defined(DIAGNOSTIC) || defined(XEN)
4047 		pmap_pte_set(early_zero_pte, 0);
4048 		pmap_pte_flush();
4049 #endif /* defined(DIAGNOSTIC) || defined(XEN) */
4050 		kpreempt_enable();
4051 	} else {
4052 		/* XXX */
4053 		ptp = uvm_pagealloc(NULL, 0, NULL,
4054 				    UVM_PGA_USERESERVE|UVM_PGA_ZERO);
4055 		if (ptp == NULL)
4056 			panic("pmap_get_physpage: out of memory");
4057 		ptp->flags &= ~PG_BUSY;
4058 		ptp->wire_count = 1;
4059 		*paddrp = VM_PAGE_TO_PHYS(ptp);
4060 	}
4061 	pmap_stats_update(kpm, 1, 0);
4062 	return true;
4063 }
4064 
4065 /*
4066  * Allocate the specified number of PTPs for each PTP level, and
4067  * populate all levels below accordingly, mapping virtual addresses
4068  * starting at kva.
4069  *
4070  * Used by pmap_growkernel.
4071  */
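/*
 * For a rough sense of scale: on amd64 (PTP_LEVELS == 4, 512 entries per
 * directory page) a level-2 entry maps 2MB, a level-3 entry maps 1GB and
 * a level-4 entry maps 512GB, so growing the kernel by a 1GB-aligned
 * gigabyte needs 512 new level-1 page tables but only one new level-2
 * page, and normally no new level-3 or level-4 pages.
 */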
4072 static void
4073 pmap_alloc_level(pd_entry_t * const *pdes, vaddr_t kva, int lvl,
4074     long *needed_ptps)
4075 {
4076 	unsigned long i;
4077 	vaddr_t va;
4078 	paddr_t pa;
4079 	unsigned long index, endindex;
4080 	int level;
4081 	pd_entry_t *pdep;
4082 #ifdef XEN
4083 	int s = splvm(); /* protect xpq_* */
4084 #endif
4085 
4086 	for (level = lvl; level > 1; level--) {
4087 		if (level == PTP_LEVELS)
4088 			pdep = pmap_kernel()->pm_pdir;
4089 		else
4090 			pdep = pdes[level - 2];
4091 		va = kva;
4092 		index = pl_i_roundup(kva, level);
4093 		endindex = index + needed_ptps[level - 1] - 1;
4094 
4095 
4096 		for (i = index; i <= endindex; i++) {
4097 			KASSERT(!pmap_valid_entry(pdep[i]));
4098 			pmap_get_physpage(va, level - 1, &pa);
4099 #ifdef XEN
4100 			xpq_queue_pte_update((level == PTP_LEVELS) ?
4101 			    xpmap_ptom(pmap_pdirpa(pmap_kernel(), i)) :
4102 			    xpmap_ptetomach(&pdep[i]),
4103 			    pmap_pa2pte(pa) | PG_k | PG_V | PG_RW);
4104 #ifdef PAE
4105 			if (level == PTP_LEVELS && i > L2_SLOT_KERN) {
4106 				/* update real kernel PD too */
4107 				xpq_queue_pte_update(
4108 				    xpmap_ptetomach(&pmap_kl2pd[l2tol2(i)]),
4109 				    pmap_pa2pte(pa) | PG_k | PG_V | PG_RW);
4110 			}
4111 #endif
4112 #else /* XEN */
4113 			pdep[i] = pmap_pa2pte(pa) | PG_k | PG_V | PG_RW;
4114 #endif /* XEN */
4115 			KASSERT(level != PTP_LEVELS || nkptp[level - 1] +
4116 			    pl_i(VM_MIN_KERNEL_ADDRESS, level) == i);
4117 			nkptp[level - 1]++;
4118 			va += nbpd[level - 1];
4119 		}
4120 		pmap_pte_flush();
4121 	}
4122 #ifdef XEN
4123 	splx(s);
4124 #endif
4125 }
4126 
4127 /*
4128  * pmap_growkernel: increase usage of KVM space
4129  *
4130  * => we allocate new PTPs for the kernel and install them in all
4131  *	the pmaps on the system.
4132  */
4133 
4134 vaddr_t
4135 pmap_growkernel(vaddr_t maxkvaddr)
4136 {
4137 	struct pmap *kpm = pmap_kernel();
4138 #if !defined(XEN) || !defined(__x86_64__)
4139 	struct pmap *pm;
4140 #endif
4141 	int s, i;
4142 	long needed_kptp[PTP_LEVELS], target_nptp, old;
4143 	bool invalidate = false;
4144 
4145 	s = splvm();	/* to be safe */
4146 	mutex_enter(kpm->pm_lock);
4147 
4148 	if (maxkvaddr <= pmap_maxkvaddr) {
4149 		mutex_exit(kpm->pm_lock);
4150 		splx(s);
4151 		return pmap_maxkvaddr;
4152 	}
4153 
4154 	maxkvaddr = x86_round_pdr(maxkvaddr);
4155 	old = nkptp[PTP_LEVELS - 1];
4156 	/*
4157 	 * This loop could be optimized more, but pmap_growkernel()
4158 	 * is called infrequently.
4159 	 */
4160 	for (i = PTP_LEVELS - 1; i >= 1; i--) {
4161 		target_nptp = pl_i_roundup(maxkvaddr, i + 1) -
4162 		    pl_i_roundup(VM_MIN_KERNEL_ADDRESS, i + 1);
4163 		/*
4164 		 * XXX only need to check toplevel.
4165 		 */
4166 		if (target_nptp > nkptpmax[i])
4167 			panic("pmap_growkernel: out of KVA space");
4168 		KASSERT(target_nptp >= nkptp[i]);
4169 		needed_kptp[i] = target_nptp - nkptp[i];
4170 	}
4171 
4172 	pmap_alloc_level(normal_pdes, pmap_maxkvaddr, PTP_LEVELS, needed_kptp);
4173 
4174 	/*
4175 	 * If the number of top level entries changed, update all
4176 	 * pmaps.
4177 	 */
4178 	if (needed_kptp[PTP_LEVELS - 1] != 0) {
4179 #ifdef XEN
4180 #ifdef __x86_64__
4181 		/* nothing, kernel entries are never entered in user pmap */
4182 #else /* __x86_64__ */
4183 		mutex_enter(&pmaps_lock);
4184 		LIST_FOREACH(pm, &pmaps, pm_list) {
4185 			int pdkidx;
4186 			for (pdkidx = PDIR_SLOT_KERN + old;
4187 			    pdkidx < PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1];
4188 			    pdkidx++) {
4189 				xpq_queue_pte_update(
4190 				    xpmap_ptom(pmap_pdirpa(pm, pdkidx)),
4191 				    kpm->pm_pdir[pdkidx]);
4192 			}
4193 			xpq_flush_queue();
4194 		}
4195 		mutex_exit(&pmaps_lock);
4196 #endif /* __x86_64__ */
4197 #else /* XEN */
4198 		unsigned newpdes;
4199 		newpdes = nkptp[PTP_LEVELS - 1] - old;
4200 		mutex_enter(&pmaps_lock);
4201 		LIST_FOREACH(pm, &pmaps, pm_list) {
4202 			memcpy(&pm->pm_pdir[PDIR_SLOT_KERN + old],
4203 			       &kpm->pm_pdir[PDIR_SLOT_KERN + old],
4204 			       newpdes * sizeof(pd_entry_t));
4205 		}
4206 		mutex_exit(&pmaps_lock);
4207 #endif
4208 		invalidate = true;
4209 	}
4210 	pmap_maxkvaddr = maxkvaddr;
4211 	mutex_exit(kpm->pm_lock);
4212 	splx(s);
4213 
4214 	if (invalidate) {
4215 		/* Invalidate the PDP cache. */
4216 		pool_cache_invalidate(&pmap_pdp_cache);
4217 	}
4218 
4219 	return maxkvaddr;
4220 }
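
/*
 * A minimal sketch (hypothetical, not part of this file) of how a caller
 * uses pmap_growkernel(): before handing out kernel VA beyond the current
 * limit, make sure PTPs exist to back it.  The function name below is a
 * placeholder.
 */
#if 0
static void
example_ensure_kva(vaddr_t new_kva_end)
{

	if (new_kva_end > pmap_maxkvaddr)
		(void)pmap_growkernel(new_kva_end);
}
#endif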
4221 
4222 #ifdef DEBUG
4223 void pmap_dump(struct pmap *, vaddr_t, vaddr_t);
4224 
4225 /*
4226  * pmap_dump: dump all the mappings from a pmap
4227  *
4228  * => caller should not be holding any pmap locks
4229  */
4230 
4231 void
4232 pmap_dump(struct pmap *pmap, vaddr_t sva, vaddr_t eva)
4233 {
4234 	pt_entry_t *ptes, *pte;
4235 	pd_entry_t * const *pdes;
4236 	struct pmap *pmap2;
4237 	vaddr_t blkendva;
4238 
4239 	/*
4240 	 * if end is out of range, truncate it.
4241 	 * if end <= start, dump up to VM_MAXUSER_ADDRESS.
4242 	 */
4243 
4244 	if (eva > VM_MAXUSER_ADDRESS || eva <= sva)
4245 		eva = VM_MAXUSER_ADDRESS;
4246 
4247 	/*
4248 	 * we lock in the pmap => pv_head direction
4249 	 */
4250 
4251 	kpreempt_disable();
4252 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);	/* locks pmap */
4253 
4254 	/*
4255 	 * dumping a range of pages: we dump in PTP-sized blocks
4256 	 */
4257 
4258 	for (/* null */ ; sva < eva ; sva = blkendva) {
4259 
4260 		/* determine range of block */
4261 		blkendva = x86_round_pdr(sva+1);
4262 		if (blkendva > eva)
4263 			blkendva = eva;
4264 
4265 		/* valid block? */
4266 		if (!pmap_pdes_valid(sva, pdes, NULL))
4267 			continue;
4268 
4269 		pte = &ptes[pl1_i(sva)];
4270 		for (/* null */; sva < blkendva ; sva += PAGE_SIZE, pte++) {
4271 			if (!pmap_valid_entry(*pte))
4272 				continue;
4273 			printf("va %#" PRIxVADDR " -> pa %#" PRIxPADDR
4274 			    " (pte=%#" PRIxPADDR ")\n",
4275 			    sva, (paddr_t)pmap_pte2pa(*pte), (paddr_t)*pte);
4276 		}
4277 	}
4278 	pmap_unmap_ptes(pmap, pmap2);
4279 	kpreempt_enable();
4280 }
4281 #endif
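
/*
 * Illustrative use of pmap_dump() (hypothetical helper; DEBUG kernels
 * only): list every user mapping of the pmap behind a vm_map, typically
 * invoked from DDB while debugging.
 */
#if 0
static void
example_dump_user_mappings(struct vm_map *map)
{

	pmap_dump(vm_map_pmap(map), 0, VM_MAXUSER_ADDRESS);
}
#endif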
4282 
4283 /*
4284  * pmap_update: process deferred invalidations and frees.
4285  */
4286 
4287 void
4288 pmap_update(struct pmap *pmap)
4289 {
4290 	struct vm_page *empty_ptps;
4291 	lwp_t *l = curlwp;
4292 
4293 	/*
4294 	 * If we have torn down this pmap, invalidate non-global TLB
4295 	 * entries on any processors using it.
4296 	 */
4297 	KPREEMPT_DISABLE(l);
4298 	if (__predict_false(l->l_md.md_gc_pmap == pmap)) {
4299 		l->l_md.md_gc_pmap = NULL;
4300 		pmap_tlb_shootdown(pmap, (vaddr_t)-1LL, 0, TLBSHOOT_UPDATE);
4301 	}
4302 	/*
4303 	 * Initiate any pending TLB shootdowns.  Wait for them to
4304 	 * complete before returning control to the caller.
4305 	 */
4306 	pmap_tlb_shootnow();
4307 	KPREEMPT_ENABLE(l);
4308 
4309 	/*
4310 	 * Now that shootdowns are complete, process deferred frees,
4311 	 * but not from interrupt context.
4312 	 */
4313 	if (l->l_md.md_gc_ptp != NULL) {
4314 		KASSERT((l->l_pflag & LP_INTR) == 0);
4315 		if (cpu_intr_p()) {
4316 			return;
4317 		}
4318 		empty_ptps = l->l_md.md_gc_ptp;
4319 		l->l_md.md_gc_ptp = NULL;
4320 		pmap_free_ptps(empty_ptps);
4321 	}
4322 }
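
/*
 * A minimal sketch (hypothetical helper) of the expected calling pattern:
 * batch the pmap_kenter_pa()/pmap_kremove() calls and invoke pmap_update()
 * once at the end, so any deferred shootdowns and PTP frees above are
 * processed once per batch.  pmap_init_tmp_pgtbl() below follows the same
 * convention.
 */
#if 0
static void
example_kenter_range(vaddr_t va, paddr_t pa, size_t npages)
{
	size_t i;

	for (i = 0; i < npages; i++) {
		pmap_kenter_pa(va + i * PAGE_SIZE, pa + i * PAGE_SIZE,
		    VM_PROT_READ | VM_PROT_WRITE, 0);
	}
	pmap_update(pmap_kernel());
}
#endif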
4323 
4324 #if PTP_LEVELS > 4
4325 #error "Unsupported number of page table mappings"
4326 #endif
4327 
4328 paddr_t
4329 pmap_init_tmp_pgtbl(paddr_t pg)
4330 {
4331 	static bool maps_loaded;
4332 	static const paddr_t x86_tmp_pml_paddr[] = {
4333 	    4 * PAGE_SIZE,
4334 	    5 * PAGE_SIZE,
4335 	    6 * PAGE_SIZE,
4336 	    7 * PAGE_SIZE
4337 	};
4338 	static vaddr_t x86_tmp_pml_vaddr[] = { 0, 0, 0, 0 };
4339 
4340 	pd_entry_t *tmp_pml, *kernel_pml;
4341 
4342 	int level;
4343 
4344 	if (!maps_loaded) {
4345 		for (level = 0; level < PTP_LEVELS; ++level) {
4346 			x86_tmp_pml_vaddr[level] =
4347 			    uvm_km_alloc(kernel_map, PAGE_SIZE, 0,
4348 			    UVM_KMF_VAONLY);
4349 
4350 			if (x86_tmp_pml_vaddr[level] == 0)
4351 				panic("mapping of real mode PML failed");
4352 			pmap_kenter_pa(x86_tmp_pml_vaddr[level],
4353 			    x86_tmp_pml_paddr[level],
4354 			    VM_PROT_READ | VM_PROT_WRITE, 0);
4355 			pmap_update(pmap_kernel());
4356 		}
4357 		maps_loaded = true;
4358 	}
4359 
4360 	/* Zero levels 1-3 */
4361 	for (level = 0; level < PTP_LEVELS - 1; ++level) {
4362 		tmp_pml = (void *)x86_tmp_pml_vaddr[level];
4363 		memset(tmp_pml, 0, PAGE_SIZE);
4364 	}
4365 
4366 	/* Copy PML4 */
4367 	kernel_pml = pmap_kernel()->pm_pdir;
4368 	tmp_pml = (void *)x86_tmp_pml_vaddr[PTP_LEVELS - 1];
4369 	memcpy(tmp_pml, kernel_pml, PAGE_SIZE);
4370 
4371 #ifdef PAE
4372 	/*
4373 	 * Use the last 4 entries of the L2 page as L3 PD entries. These
4374 	 * last entries are unlikely to be used for temporary mappings.
4375 	 * 508: maps 0->1GB (userland)
4376 	 * 509: unused
4377 	 * 510: unused
4378 	 * 511: maps 3->4GB (kernel)
4379 	 */
4380 	tmp_pml[508] = x86_tmp_pml_paddr[PTP_LEVELS - 1] | PG_V;
4381 	tmp_pml[509] = 0;
4382 	tmp_pml[510] = 0;
4383 	tmp_pml[511] = pmap_pdirpa(pmap_kernel(), PDIR_SLOT_KERN) | PG_V;
4384 #endif
4385 
4386 	for (level = PTP_LEVELS - 1; level > 0; --level) {
4387 		tmp_pml = (void *)x86_tmp_pml_vaddr[level];
4388 
4389 		tmp_pml[pl_i(pg, level + 1)] =
4390 		    (x86_tmp_pml_paddr[level - 1] & PG_FRAME) | PG_RW | PG_V;
4391 	}
4392 
4393 	tmp_pml = (void *)x86_tmp_pml_vaddr[0];
4394 	tmp_pml[pl_i(pg, 1)] = (pg & PG_FRAME) | PG_RW | PG_V;
4395 
4396 #ifdef PAE
4397 	/* Return the PA of the L3 page (entry 508 of the L2 page) */
4398 	return x86_tmp_pml_paddr[PTP_LEVELS - 1] + 508 * sizeof(pd_entry_t);
4399 #endif
4400 
4401 	return x86_tmp_pml_paddr[PTP_LEVELS - 1];
4402 }
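
/*
 * A minimal sketch (hypothetical caller, placeholder names): trampoline
 * code that must run with a tiny page-table set (e.g. AP startup or
 * ACPI resume) builds it around one low physical page and hands the
 * returned top-level PA to the trampoline for loading into %cr3.
 */
#if 0
static paddr_t
example_prepare_trampoline(paddr_t trampoline_pa)
{

	/* identity-maps trampoline_pa and carries the kernel mappings */
	return pmap_init_tmp_pgtbl(trampoline_pa);
}
#endif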
4403 
4404 u_int
4405 x86_mmap_flags(paddr_t mdpgno)
4406 {
4407 	u_int nflag = (mdpgno >> X86_MMAP_FLAG_SHIFT) & X86_MMAP_FLAG_MASK;
4408 	u_int pflag = 0;
4409 
4410 	if (nflag & X86_MMAP_FLAG_PREFETCH)
4411 		pflag |= PMAP_WRITE_COMBINE;
4412 
4413 	return pflag;
4414 }
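
/*
 * A minimal sketch (hypothetical driver code) of the encoding side: a
 * device mmap routine folds the MD flag bits in above the page frame
 * number, and x86_mmap_flags() recovers them so the resulting mapping
 * is entered write-combined.
 */
#if 0
static paddr_t
example_mmap_cookie(paddr_t pa)
{

	return x86_btop(pa) |
	    ((paddr_t)X86_MMAP_FLAG_PREFETCH << X86_MMAP_FLAG_SHIFT);
}
#endif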
4415