1 /*	$NetBSD: pmap.c,v 1.183 2014/06/14 02:54:47 pgoyette Exp $	*/
2 
3 /*-
4  * Copyright (c) 2008, 2010 The NetBSD Foundation, Inc.
5  * All rights reserved.
6  *
7  * This code is derived from software contributed to The NetBSD Foundation
8  * by Andrew Doran.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29  * POSSIBILITY OF SUCH DAMAGE.
30  */
31 
32 /*
33  * Copyright (c) 2007 Manuel Bouyer.
34  *
35  * Redistribution and use in source and binary forms, with or without
36  * modification, are permitted provided that the following conditions
37  * are met:
38  * 1. Redistributions of source code must retain the above copyright
39  *    notice, this list of conditions and the following disclaimer.
40  * 2. Redistributions in binary form must reproduce the above copyright
41  *    notice, this list of conditions and the following disclaimer in the
42  *    documentation and/or other materials provided with the distribution.
43  *
44  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
45  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
46  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
47  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
48  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
49  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
50  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
51  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
52  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
53  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
54  *
55  */
56 
57 /*
58  * Copyright (c) 2006 Mathieu Ropert <mro@adviseo.fr>
59  *
60  * Permission to use, copy, modify, and distribute this software for any
61  * purpose with or without fee is hereby granted, provided that the above
62  * copyright notice and this permission notice appear in all copies.
63  *
64  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
65  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
66  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
67  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
68  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
69  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
70  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
71  */
72 
73 /*
74  * Copyright (c) 1997 Charles D. Cranor and Washington University.
75  * All rights reserved.
76  *
77  * Redistribution and use in source and binary forms, with or without
78  * modification, are permitted provided that the following conditions
79  * are met:
80  * 1. Redistributions of source code must retain the above copyright
81  *    notice, this list of conditions and the following disclaimer.
82  * 2. Redistributions in binary form must reproduce the above copyright
83  *    notice, this list of conditions and the following disclaimer in the
84  *    documentation and/or other materials provided with the distribution.
85  *
86  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
87  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
88  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
89  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
90  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
91  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
92  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
93  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
94  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
95  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
96  */
97 
98 /*
99  * Copyright 2001 (c) Wasabi Systems, Inc.
100  * All rights reserved.
101  *
102  * Written by Frank van der Linden for Wasabi Systems, Inc.
103  *
104  * Redistribution and use in source and binary forms, with or without
105  * modification, are permitted provided that the following conditions
106  * are met:
107  * 1. Redistributions of source code must retain the above copyright
108  *    notice, this list of conditions and the following disclaimer.
109  * 2. Redistributions in binary form must reproduce the above copyright
110  *    notice, this list of conditions and the following disclaimer in the
111  *    documentation and/or other materials provided with the distribution.
112  * 3. All advertising materials mentioning features or use of this software
113  *    must display the following acknowledgement:
114  *      This product includes software developed for the NetBSD Project by
115  *      Wasabi Systems, Inc.
116  * 4. The name of Wasabi Systems, Inc. may not be used to endorse
117  *    or promote products derived from this software without specific prior
118  *    written permission.
119  *
120  * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
121  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
122  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
123  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL WASABI SYSTEMS, INC
124  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
125  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
126  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
127  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
128  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
129  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
130  * POSSIBILITY OF SUCH DAMAGE.
131  */
132 
133 /*
134  * This is the i386 pmap modified and generalized to support x86-64
135  * as well. The idea is to hide the upper N levels of the page tables
136  * inside pmap_get_ptp, pmap_free_ptp and pmap_growkernel. The rest
137  * is mostly untouched, except that it uses some more generalized
138  * macros and interfaces.
139  *
140  * This pmap has been tested on the i386 as well, and it can be easily
141  * adapted to PAE.
142  *
143  * fvdl@wasabisystems.com 18-Jun-2001
144  */
145 
146 /*
147  * pmap.c: i386 pmap module rewrite
148  * Chuck Cranor <chuck@netbsd>
149  * 11-Aug-97
150  *
151  * history of this pmap module: in addition to my own input, i used
152  *    the following references for this rewrite of the i386 pmap:
153  *
154  * [1] the NetBSD i386 pmap.   this pmap appears to be based on the
155  *     BSD hp300 pmap done by Mike Hibler at University of Utah.
156  *     it was then ported to the i386 by William Jolitz of UUNET
157  *     Technologies, Inc.   Then Charles M. Hannum of the NetBSD
158  *     project fixed some bugs and provided some speed ups.
159  *
160  * [2] the FreeBSD i386 pmap.   this pmap seems to be the
161  *     Hibler/Jolitz pmap, as modified for FreeBSD by John S. Dyson
162  *     and David Greenman.
163  *
164  * [3] the Mach pmap.   this pmap, from CMU, seems to have migrated
165  *     between several processors.   the VAX version was done by
166  *     Avadis Tevanian, Jr., and Michael Wayne Young.    the i386
167  *     version was done by Lance Berc, Mike Kupfer, Bob Baron,
168  *     David Golub, and Richard Draves.    the alpha version was
169  *     done by Alessandro Forin (CMU/Mach) and Chris Demetriou
170  *     (NetBSD/alpha).
171  */
172 
173 #include <sys/cdefs.h>
174 __KERNEL_RCSID(0, "$NetBSD: pmap.c,v 1.183 2014/06/14 02:54:47 pgoyette Exp $");
175 
176 #include "opt_user_ldt.h"
177 #include "opt_lockdebug.h"
178 #include "opt_multiprocessor.h"
179 #include "opt_xen.h"
180 #if !defined(__x86_64__)
181 #include "opt_kstack_dr0.h"
182 #endif /* !defined(__x86_64__) */
183 
184 #include <sys/param.h>
185 #include <sys/systm.h>
186 #include <sys/proc.h>
187 #include <sys/pool.h>
188 #include <sys/kernel.h>
189 #include <sys/atomic.h>
190 #include <sys/cpu.h>
191 #include <sys/intr.h>
192 #include <sys/xcall.h>
193 #include <sys/kcore.h>
194 
195 #include <uvm/uvm.h>
196 
197 #include <dev/isa/isareg.h>
198 
199 #include <machine/specialreg.h>
200 #include <machine/gdt.h>
201 #include <machine/isa_machdep.h>
202 #include <machine/cpuvar.h>
203 #include <machine/cputypes.h>
204 
205 #include <x86/pmap.h>
206 #include <x86/pmap_pv.h>
207 
208 #include <x86/i82489reg.h>
209 #include <x86/i82489var.h>
210 
211 #ifdef XEN
212 #include <xen/xen-public/xen.h>
213 #include <xen/hypervisor.h>
214 #endif
215 
216 /*
217  * general info:
218  *
219  *  - for an explanation of how the i386 MMU hardware works see
220  *    the comments in <machine/pte.h>.
221  *
222  *  - for an explanation of the general memory structure used by
223  *    this pmap (including the recursive mapping), see the comments
224  *    in <machine/pmap.h>.
225  *
226  * this file contains the code for the "pmap module."   the module's
227  * job is to manage the hardware's virtual to physical address mappings.
228  * note that there are two levels of mapping in the VM system:
229  *
230  *  [1] the upper layer of the VM system uses vm_map's and vm_map_entry's
231  *      to map ranges of virtual address space to objects/files.  for
232  *      example, the vm_map may say: "map VA 0x1000 to 0x22000 read-only
233  *      to the file /bin/ls starting at offset zero."   note that
234  *      the upper layer mapping is not concerned with how individual
235  *      vm_pages are mapped.
236  *
237  *  [2] the lower layer of the VM system (the pmap) maintains the mappings
238  *      from virtual addresses.   it is concerned with which vm_page is
239  *      mapped where.   for example, when you run /bin/ls and start
240  *      at page 0x1000 the fault routine may lookup the correct page
241  *      of the /bin/ls file and then ask the pmap layer to establish
242  *      a mapping for it.
243  *
244  * note that information in the lower layer of the VM system can be
245  * thrown away since it can easily be reconstructed from the info
246  * in the upper layer.
247  *
248  * data structures we use include:
249  *
250  *  - struct pmap: describes the address space of one thread
251  *  - struct pv_entry: describes one <PMAP,VA> mapping of a PA
252  *  - struct pv_head: there is one pv_head per managed page of
253  *	physical memory.   the pv_head points to a list of pv_entry
254  *	structures which describe all the <PMAP,VA> pairs that this
255  *      page is mapped in.    this is critical for page based operations
256  *      such as pmap_page_protect() [change protection on _all_ mappings
257  *      of a page]
258  */
259 
260 /*
261  * memory allocation
262  *
263  *  - there are three data structures that we must dynamically allocate:
264  *
265  * [A] new process' page directory page (PDP)
266  *	- plan 1: done at pmap_create() we use
267  *	  uvm_km_alloc(kernel_map, PAGE_SIZE)  [fka kmem_alloc] to do this
268  *	  allocation.
269  *
270  * if we are low in free physical memory then we sleep in
271  * uvm_km_alloc -- in this case this is ok since we are creating
272  * a new pmap and should not be holding any locks.
273  *
274  * if the kernel is totally out of virtual space
275  * (i.e. uvm_km_alloc returns NULL), then we panic.
276  *
277  * [B] new page tables pages (PTP)
278  * 	- call uvm_pagealloc()
279  * 		=> success: zero page, add to pm_pdir
280  * 		=> failure: we are out of free vm_pages, let pmap_enter()
281  *		   tell UVM about it.
282  *
283  * note: for kernel PTPs, we start with NKPTP of them.   as we map
284  * kernel memory (at uvm_map time) we check to see if we've grown
285  * the kernel pmap.   if so, we call the optional function
286  * pmap_growkernel() to grow the kernel PTPs in advance.
287  *
288  * [C] pv_entry structures
289  */
290 
291 /*
292  * locking
293  *
294  * we have the following locks that we must contend with:
295  *
296  * mutexes:
297  *
298  * - pmap lock (per pmap, part of uvm_object)
299  *   this lock protects the fields in the pmap structure including
300  *   the non-kernel PDEs in the PDP, and the PTEs.  it also locks
301  *   in the alternate PTE space (since that is determined by the
302  *   entry in the PDP).
303  *
304  * - pvh_lock (per pv_head)
305  *   this lock protects the pv_entry list which is chained off the
306  *   pv_head structure for a specific managed PA.   it is locked
307  *   when traversing the list (e.g. adding/removing mappings,
308  *   syncing R/M bits, etc.)
309  *
310  * - pmaps_lock
311  *   this lock protects the list of active pmaps (headed by "pmaps").
312  *   we lock it when adding or removing pmaps from this list.
313  */
314 
315 const vaddr_t ptp_masks[] = PTP_MASK_INITIALIZER;
316 const int ptp_shifts[] = PTP_SHIFT_INITIALIZER;
317 const long nkptpmax[] = NKPTPMAX_INITIALIZER;
318 const long nbpd[] = NBPD_INITIALIZER;
319 pd_entry_t * const normal_pdes[] = PDES_INITIALIZER;
320 
321 long nkptp[] = NKPTP_INITIALIZER;
322 
323 struct pmap_head pmaps;
324 kmutex_t pmaps_lock;
325 
326 static vaddr_t pmap_maxkvaddr;
327 
328 /*
329  * XXX kludge: dummy locking to make KASSERTs in uvm_page.c comfortable.
330  * actual locking is done by pm_lock.
331  */
332 #if defined(DIAGNOSTIC)
333 #define	PMAP_SUBOBJ_LOCK(pm, idx) \
334 	KASSERT(mutex_owned((pm)->pm_lock)); \
335 	if ((idx) != 0) \
336 		mutex_enter((pm)->pm_obj[(idx)].vmobjlock)
337 #define	PMAP_SUBOBJ_UNLOCK(pm, idx) \
338 	KASSERT(mutex_owned((pm)->pm_lock)); \
339 	if ((idx) != 0) \
340 		mutex_exit((pm)->pm_obj[(idx)].vmobjlock)
341 #else /* defined(DIAGNOSTIC) */
342 #define	PMAP_SUBOBJ_LOCK(pm, idx)	/* nothing */
343 #define	PMAP_SUBOBJ_UNLOCK(pm, idx)	/* nothing */
344 #endif /* defined(DIAGNOSTIC) */
345 
346 /*
347  * Misc. event counters.
348  */
349 struct evcnt pmap_iobmp_evcnt;
350 struct evcnt pmap_ldt_evcnt;
351 
352 /*
353  * PAT
354  */
355 #define	PATENTRY(n, type)	(type << ((n) * 8))
356 #define	PAT_UC		0x0ULL
357 #define	PAT_WC		0x1ULL
358 #define	PAT_WT		0x4ULL
359 #define	PAT_WP		0x5ULL
360 #define	PAT_WB		0x6ULL
361 #define	PAT_UCMINUS	0x7ULL
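
/*
 * Illustration (not used directly by the code): PATENTRY(n, type) places
 * a memory type in the low bits of field n of the 64-bit IA32_PAT MSR,
 * so e.g. PATENTRY(1, PAT_WC) evaluates to 0x1ULL << 8.  The value built
 * in pat_init() below uses this to reprogram the two write-through
 * entries of the power-on default PAT to write-combining.
 */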
362 
363 static bool cpu_pat_enabled __read_mostly = false;
364 
365 /*
366  * global data structures
367  */
368 
369 static struct pmap kernel_pmap_store;	/* the kernel's pmap (proc0) */
370 struct pmap *const kernel_pmap_ptr = &kernel_pmap_store;
371 
372 /*
373  * pmap_pg_g: if our processor supports PG_G in the PTE then we
374  * set pmap_pg_g to PG_G (otherwise it is zero).
375  */
376 
377 int pmap_pg_g __read_mostly = 0;
378 
379 /*
380  * pmap_largepages: if our processor supports PG_PS and we are
381  * using it, this is set to true.
382  */
383 
384 int pmap_largepages __read_mostly;
385 
386 /*
387  * i386 physical memory comes in a big contig chunk with a small
388  * hole toward the front of it...  the following two paddr_t's
389  * (shared with machdep.c) describe the physical address space
390  * of this machine.
391  */
392 paddr_t avail_start __read_mostly; /* PA of first available physical page */
393 paddr_t avail_end __read_mostly; /* PA of last available physical page */
394 
395 #ifdef XEN
396 #ifdef __x86_64__
397 /* Dummy PGD for user cr3, used between pmap_deactivate() and pmap_activate() */
398 static paddr_t xen_dummy_user_pgd;
399 #endif /* __x86_64__ */
400 paddr_t pmap_pa_start; /* PA of first physical page for this domain */
401 paddr_t pmap_pa_end;   /* PA of last physical page for this domain */
402 #endif /* XEN */
403 
404 #define	VM_PAGE_TO_PP(pg)	(&(pg)->mdpage.mp_pp)
405 
406 #define	PV_HASH_SIZE		32768
407 #define	PV_HASH_LOCK_CNT	32
408 
409 struct pv_hash_lock {
410 	kmutex_t lock;
411 } __aligned(CACHE_LINE_SIZE) pv_hash_locks[PV_HASH_LOCK_CNT]
412     __aligned(CACHE_LINE_SIZE);
413 
414 struct pv_hash_head {
415 	SLIST_HEAD(, pv_entry) hh_list;
416 } pv_hash_heads[PV_HASH_SIZE];
417 
418 static u_int
419 pvhash_hash(struct vm_page *ptp, vaddr_t va)
420 {
421 
422 	return (uintptr_t)ptp / sizeof(*ptp) + (va >> PAGE_SHIFT);
423 }
424 
425 static struct pv_hash_head *
426 pvhash_head(u_int hash)
427 {
428 
429 	return &pv_hash_heads[hash % PV_HASH_SIZE];
430 }
431 
432 static kmutex_t *
433 pvhash_lock(u_int hash)
434 {
435 
436 	return &pv_hash_locks[hash % PV_HASH_LOCK_CNT].lock;
437 }
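
/*
 * A minimal usage sketch of the helpers above (this mirrors what
 * insert_pv() and pmap_remove_pv() below actually do): derive the bucket
 * and its spin lock from the <ptp, va> pair, then manipulate the chain
 * with the lock held.
 *
 *	u_int hash = pvhash_hash(ptp, va);
 *	kmutex_t *lock = pvhash_lock(hash);
 *	struct pv_hash_head *hh = pvhash_head(hash);
 *
 *	mutex_spin_enter(lock);
 *	SLIST_INSERT_HEAD(&hh->hh_list, pve, pve_hash);
 *	mutex_spin_exit(lock);
 */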
438 
439 static struct pv_entry *
440 pvhash_remove(struct pv_hash_head *hh, struct vm_page *ptp, vaddr_t va)
441 {
442 	struct pv_entry *pve;
443 	struct pv_entry *prev;
444 
445 	prev = NULL;
446 	SLIST_FOREACH(pve, &hh->hh_list, pve_hash) {
447 		if (pve->pve_pte.pte_ptp == ptp &&
448 		    pve->pve_pte.pte_va == va) {
449 			if (prev != NULL) {
450 				SLIST_REMOVE_AFTER(prev, pve_hash);
451 			} else {
452 				SLIST_REMOVE_HEAD(&hh->hh_list, pve_hash);
453 			}
454 			break;
455 		}
456 		prev = pve;
457 	}
458 	return pve;
459 }
460 
461 /*
462  * other data structures
463  */
464 
465 static pt_entry_t protection_codes[8] __read_mostly; /* maps MI prot to i386
466 							prot code */
467 static bool pmap_initialized __read_mostly = false; /* pmap_init done yet? */
468 
469 /*
470  * the following two vaddr_t's are used during system startup
471  * to keep track of how much of the kernel's VM space we have used.
472  * once the system is started, the management of the remaining kernel
473  * VM space is turned over to the kernel_map vm_map.
474  */
475 
476 static vaddr_t virtual_avail __read_mostly;	/* VA of first free KVA */
477 static vaddr_t virtual_end __read_mostly;	/* VA of last free KVA */
478 
479 /*
480  * pool that pmap structures are allocated from
481  */
482 
483 static struct pool_cache pmap_cache;
484 
485 /*
486  * pv_entry cache
487  */
488 
489 static struct pool_cache pmap_pv_cache;
490 
491 #ifdef __HAVE_DIRECT_MAP
492 
493 extern phys_ram_seg_t mem_clusters[];
494 extern int mem_cluster_cnt;
495 
496 #else
497 
498 /*
499  * MULTIPROCESSOR: special VA's/ PTE's are actually allocated inside a
500  * maxcpus*NPTECL array of PTE's, to avoid cache line thrashing
501  * due to false sharing.
502  */
503 
504 #ifdef MULTIPROCESSOR
505 #define PTESLEW(pte, id) ((pte)+(id)*NPTECL)
506 #define VASLEW(va,id) ((va)+(id)*NPTECL*PAGE_SIZE)
507 #else
508 #define PTESLEW(pte, id) ((void)id, pte)
509 #define VASLEW(va,id) ((void)id, va)
510 #endif
511 
512 /*
513  * special VAs and the PTEs that map them
514  */
515 static pt_entry_t *csrc_pte, *cdst_pte, *zero_pte, *ptp_pte, *early_zero_pte;
516 static char *csrcp, *cdstp, *zerop, *ptpp;
517 #ifdef XEN
518 char *early_zerop; /* also referenced from xen_pmap_bootstrap() */
519 #else
520 static char *early_zerop;
521 #endif
522 
523 #endif
524 
525 int pmap_enter_default(pmap_t, vaddr_t, paddr_t, vm_prot_t, u_int);
526 
527 /* PDP pool_cache(9) and its callbacks */
528 struct pool_cache pmap_pdp_cache;
529 static int  pmap_pdp_ctor(void *, void *, int);
530 static void pmap_pdp_dtor(void *, void *);
531 #ifdef PAE
532 /* need to allocate items of 4 pages */
533 static void *pmap_pdp_alloc(struct pool *, int);
534 static void pmap_pdp_free(struct pool *, void *);
535 static struct pool_allocator pmap_pdp_allocator = {
536 	.pa_alloc = pmap_pdp_alloc,
537 	.pa_free = pmap_pdp_free,
538 	.pa_pagesz = PAGE_SIZE * PDP_SIZE,
539 };
540 #endif /* PAE */
541 
542 extern vaddr_t idt_vaddr;			/* we allocate IDT early */
543 extern paddr_t idt_paddr;
544 
545 #ifdef _LP64
546 extern vaddr_t lo32_vaddr;
547 extern vaddr_t lo32_paddr;
548 #endif
549 
550 extern int end;
551 
552 #ifdef i386
553 /* stuff to fix the pentium f00f bug */
554 extern vaddr_t pentium_idt_vaddr;
555 #endif
556 
557 
558 /*
559  * local prototypes
560  */
561 
562 static struct vm_page	*pmap_get_ptp(struct pmap *, vaddr_t,
563 				      pd_entry_t * const *);
564 static struct vm_page	*pmap_find_ptp(struct pmap *, vaddr_t, paddr_t, int);
565 static void		 pmap_freepage(struct pmap *, struct vm_page *, int);
566 static void		 pmap_free_ptp(struct pmap *, struct vm_page *,
567 				       vaddr_t, pt_entry_t *,
568 				       pd_entry_t * const *);
569 static bool		 pmap_remove_pte(struct pmap *, struct vm_page *,
570 					 pt_entry_t *, vaddr_t,
571 					 struct pv_entry **);
572 static void		 pmap_remove_ptes(struct pmap *, struct vm_page *,
573 					  vaddr_t, vaddr_t, vaddr_t,
574 					  struct pv_entry **);
575 
576 static bool		 pmap_get_physpage(vaddr_t, int, paddr_t *);
577 static void		 pmap_alloc_level(pd_entry_t * const *, vaddr_t, int,
578 					  long *);
579 
580 static bool		 pmap_reactivate(struct pmap *);
581 
582 /*
583  * p m a p   h e l p e r   f u n c t i o n s
584  */
585 
586 static inline void
587 pmap_stats_update(struct pmap *pmap, int resid_diff, int wired_diff)
588 {
589 
590 	if (pmap == pmap_kernel()) {
591 		atomic_add_long(&pmap->pm_stats.resident_count, resid_diff);
592 		atomic_add_long(&pmap->pm_stats.wired_count, wired_diff);
593 	} else {
594 		KASSERT(mutex_owned(pmap->pm_lock));
595 		pmap->pm_stats.resident_count += resid_diff;
596 		pmap->pm_stats.wired_count += wired_diff;
597 	}
598 }
599 
600 static inline void
601 pmap_stats_update_bypte(struct pmap *pmap, pt_entry_t npte, pt_entry_t opte)
602 {
603 	int resid_diff = ((npte & PG_V) ? 1 : 0) - ((opte & PG_V) ? 1 : 0);
604 	int wired_diff = ((npte & PG_W) ? 1 : 0) - ((opte & PG_W) ? 1 : 0);
605 
606 	KASSERT((npte & (PG_V | PG_W)) != PG_W);
607 	KASSERT((opte & (PG_V | PG_W)) != PG_W);
608 
609 	pmap_stats_update(pmap, resid_diff, wired_diff);
610 }
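
/*
 * Worked example for the helper above: entering a new valid, wired
 * mapping (opte == 0, npte has PG_V | PG_W) gives resid_diff == 1 and
 * wired_diff == 1; zapping that PTE later (npte == 0) reverses both.
 */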
611 
612 /*
613  * ptp_to_pmap: lookup pmap by ptp
614  */
615 
616 static struct pmap *
617 ptp_to_pmap(struct vm_page *ptp)
618 {
619 	struct pmap *pmap;
620 
621 	if (ptp == NULL) {
622 		return pmap_kernel();
623 	}
624 	pmap = (struct pmap *)ptp->uobject;
625 	KASSERT(pmap != NULL);
626 	KASSERT(&pmap->pm_obj[0] == ptp->uobject);
627 	return pmap;
628 }
629 
630 static inline struct pv_pte *
631 pve_to_pvpte(struct pv_entry *pve)
632 {
633 
634 	KASSERT((void *)&pve->pve_pte == (void *)pve);
635 	return &pve->pve_pte;
636 }
637 
638 static inline struct pv_entry *
639 pvpte_to_pve(struct pv_pte *pvpte)
640 {
641 	struct pv_entry *pve = (void *)pvpte;
642 
643 	KASSERT(pve_to_pvpte(pve) == pvpte);
644 	return pve;
645 }
646 
647 /*
648  * pv_pte_first, pv_pte_next: PV list iterator.
649  */
650 
651 static struct pv_pte *
652 pv_pte_first(struct pmap_page *pp)
653 {
654 
655 	if ((pp->pp_flags & PP_EMBEDDED) != 0) {
656 		return &pp->pp_pte;
657 	}
658 	return pve_to_pvpte(LIST_FIRST(&pp->pp_head.pvh_list));
659 }
660 
661 static struct pv_pte *
662 pv_pte_next(struct pmap_page *pp, struct pv_pte *pvpte)
663 {
664 
665 	KASSERT(pvpte != NULL);
666 	if (pvpte == &pp->pp_pte) {
667 		KASSERT((pp->pp_flags & PP_EMBEDDED) != 0);
668 		return NULL;
669 	}
670 	KASSERT((pp->pp_flags & PP_EMBEDDED) == 0);
671 	return pve_to_pvpte(LIST_NEXT(pvpte_to_pve(pvpte), pve_list));
672 }
673 
674 /*
675  * pmap_is_curpmap: is this pmap the one currently loaded [in %cr3]?
676  *		of course the kernel is always loaded
677  */
678 
679 bool
680 pmap_is_curpmap(struct pmap *pmap)
681 {
682 	return((pmap == pmap_kernel()) ||
683 	       (pmap == curcpu()->ci_pmap));
684 }
685 
686 /*
687  *	Add a reference to the specified pmap.
688  */
689 
690 void
691 pmap_reference(struct pmap *pmap)
692 {
693 
694 	atomic_inc_uint(&pmap->pm_obj[0].uo_refs);
695 }
696 
697 /*
698  * pmap_map_ptes: map a pmap's PTEs into KVM and lock them in
699  *
700  * there are several pmaps involved.  some or all of them might be same.
701  * there are several pmaps involved.  some or all of them might be the same.
702  *	- the pmap given by the first argument
703  *		our caller wants to access this pmap's PTEs.
704  *
705  *	- pmap_kernel()
706  *		the kernel pmap.  note that it only contains the kernel part
707  *		of the address space which is shared by any pmap.  ie. any
708  *		pmap can be used instead of pmap_kernel() for our purpose.
709  *
710  *	- ci->ci_pmap
711  *		pmap currently loaded on the cpu.
712  *
713  *	- vm_map_pmap(&curproc->p_vmspace->vm_map)
714  *		current process' pmap.
715  *
716  * => we lock enough pmaps to keep things locked in
717  * => must be undone with pmap_unmap_ptes before returning
718  */
719 
720 void
721 pmap_map_ptes(struct pmap *pmap, struct pmap **pmap2,
722 	      pd_entry_t **ptepp, pd_entry_t * const **pdeppp)
723 {
724 	struct pmap *curpmap;
725 	struct cpu_info *ci;
726 	lwp_t *l;
727 
728 	/* The kernel's pmap is always accessible. */
729 	if (pmap == pmap_kernel()) {
730 		*pmap2 = NULL;
731 		*ptepp = PTE_BASE;
732 		*pdeppp = normal_pdes;
733 		return;
734 	}
735 	KASSERT(kpreempt_disabled());
736 
737 	l = curlwp;
738  retry:
739 	mutex_enter(pmap->pm_lock);
740 	ci = curcpu();
741 	curpmap = ci->ci_pmap;
742 	if (vm_map_pmap(&l->l_proc->p_vmspace->vm_map) == pmap) {
743 		/* Our own pmap so just load it: easy. */
744 		if (__predict_false(ci->ci_want_pmapload)) {
745 			mutex_exit(pmap->pm_lock);
746 			pmap_load();
747 			goto retry;
748 		}
749 		KASSERT(pmap == curpmap);
750 	} else if (pmap == curpmap) {
751 		/*
752 		 * Already on the CPU: make it valid.  This is very
753 		 * often the case during exit(), when we have switched
754 		 * to the kernel pmap in order to destroy a user pmap.
755 		 */
756 		if (!pmap_reactivate(pmap)) {
757 			u_int gen = uvm_emap_gen_return();
758 			tlbflush();
759 			uvm_emap_update(gen);
760 		}
761 	} else {
762 		/*
763 		 * Toss current pmap from CPU, but keep a reference to it.
764 		 * The reference will be dropped by pmap_unmap_ptes().
765 		 * Can happen if we block during exit().
766 		 */
767 		const cpuid_t cid = cpu_index(ci);
768 
769 		kcpuset_atomic_clear(curpmap->pm_cpus, cid);
770 		kcpuset_atomic_clear(curpmap->pm_kernel_cpus, cid);
771 		ci->ci_pmap = pmap;
772 		ci->ci_tlbstate = TLBSTATE_VALID;
773 		kcpuset_atomic_set(pmap->pm_cpus, cid);
774 		kcpuset_atomic_set(pmap->pm_kernel_cpus, cid);
775 		cpu_load_pmap(pmap, curpmap);
776 	}
777 	pmap->pm_ncsw = l->l_ncsw;
778 	*pmap2 = curpmap;
779 	*ptepp = PTE_BASE;
780 #if defined(XEN) && defined(__x86_64__)
781 	KASSERT(ci->ci_normal_pdes[PTP_LEVELS - 2] == L4_BASE);
782 	ci->ci_normal_pdes[PTP_LEVELS - 2] = pmap->pm_pdir;
783 	*pdeppp = ci->ci_normal_pdes;
784 #else /* XEN && __x86_64__ */
785 	*pdeppp = normal_pdes;
786 #endif /* XEN && __x86_64__ */
787 }
788 
789 /*
790  * pmap_unmap_ptes: unlock the PTE mapping of "pmap"
791  */
792 
793 void
794 pmap_unmap_ptes(struct pmap *pmap, struct pmap *pmap2)
795 {
796 	struct cpu_info *ci;
797 	struct pmap *mypmap;
798 
799 	KASSERT(kpreempt_disabled());
800 
801 	/* The kernel's pmap is always accessible. */
802 	if (pmap == pmap_kernel()) {
803 		return;
804 	}
805 
806 	ci = curcpu();
807 #if defined(XEN) && defined(__x86_64__)
808 	/* Reset per-cpu normal_pdes */
809 	KASSERT(ci->ci_normal_pdes[PTP_LEVELS - 2] != L4_BASE);
810 	ci->ci_normal_pdes[PTP_LEVELS - 2] = L4_BASE;
811 #endif /* XEN && __x86_64__ */
812 	/*
813 	 * We cannot tolerate context switches while mapped in.
814 	 * If it is our own pmap all we have to do is unlock.
815 	 */
816 	KASSERT(pmap->pm_ncsw == curlwp->l_ncsw);
817 	mypmap = vm_map_pmap(&curproc->p_vmspace->vm_map);
818 	if (pmap == mypmap) {
819 		mutex_exit(pmap->pm_lock);
820 		return;
821 	}
822 
823 	/*
824 	 * Mark whatever's on the CPU now as lazy and unlock.
825 	 * If the pmap was already installed, we are done.
826 	 */
827 	ci->ci_tlbstate = TLBSTATE_LAZY;
828 	ci->ci_want_pmapload = (mypmap != pmap_kernel());
829 	mutex_exit(pmap->pm_lock);
830 	if (pmap == pmap2) {
831 		return;
832 	}
833 
834 	/*
835 	 * We installed another pmap on the CPU.  Grab a reference to
836 	 * it and leave in place.  Toss the evicted pmap (can block).
837 	 */
838 	pmap_reference(pmap);
839 	pmap_destroy(pmap2);
840 }
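
/*
 * A sketch of the canonical pmap_map_ptes()/pmap_unmap_ptes() calling
 * sequence (see e.g. pmap_remove() later in this file for real users);
 * "pmap" and "va" are assumed to come from the caller:
 *
 *	struct pmap *pmap2;
 *	pd_entry_t *ptes;
 *	pd_entry_t * const *pdes;
 *
 *	kpreempt_disable();
 *	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
 *	... examine or modify ptes[pl1_i(va)] and pdes here ...
 *	pmap_unmap_ptes(pmap, pmap2);
 *	kpreempt_enable();
 */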
841 
842 
843 inline static void
844 pmap_exec_account(struct pmap *pm, vaddr_t va, pt_entry_t opte, pt_entry_t npte)
845 {
846 
847 #if !defined(__x86_64__)
848 	if (curproc == NULL || curproc->p_vmspace == NULL ||
849 	    pm != vm_map_pmap(&curproc->p_vmspace->vm_map))
850 		return;
851 
852 	if ((opte ^ npte) & PG_X)
853 		pmap_update_pg(va);
854 
855 	/*
856 	 * Executability was removed on the last executable change.
857 	 * Reset the code segment to something conservative and
858 	 * let the trap handler deal with setting the right limit.
859 	 * We can't do that because of locking constraints on the vm map.
860 	 */
861 
862 	if ((opte & PG_X) && (npte & PG_X) == 0 && va == pm->pm_hiexec) {
863 		struct trapframe *tf = curlwp->l_md.md_regs;
864 
865 		tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL);
866 		pm->pm_hiexec = I386_MAX_EXE_ADDR;
867 	}
868 #endif /* !defined(__x86_64__) */
869 }
870 
871 #if !defined(__x86_64__)
872 /*
873  * Fixup the code segment to cover all potential executable mappings.
874  * returns 0 if no changes to the code segment were made.
875  */
876 
877 int
878 pmap_exec_fixup(struct vm_map *map, struct trapframe *tf, struct pcb *pcb)
879 {
880 	struct vm_map_entry *ent;
881 	struct pmap *pm = vm_map_pmap(map);
882 	vaddr_t va = 0;
883 
884 	vm_map_lock_read(map);
885 	for (ent = (&map->header)->next; ent != &map->header; ent = ent->next) {
886 
887 		/*
888 		 * This entry has greater va than the entries before.
889 		 * We need to make it point to the last page, not past it.
890 		 */
891 
892 		if (ent->protection & VM_PROT_EXECUTE)
893 			va = trunc_page(ent->end) - PAGE_SIZE;
894 	}
895 	vm_map_unlock_read(map);
896 	if (va == pm->pm_hiexec && tf->tf_cs == GSEL(GUCODEBIG_SEL, SEL_UPL))
897 		return (0);
898 
899 	pm->pm_hiexec = va;
900 	if (pm->pm_hiexec > I386_MAX_EXE_ADDR) {
901 		tf->tf_cs = GSEL(GUCODEBIG_SEL, SEL_UPL);
902 	} else {
903 		tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL);
904 		return (0);
905 	}
906 	return (1);
907 }
908 #endif /* !defined(__x86_64__) */
909 
910 void
911 pat_init(struct cpu_info *ci)
912 {
913 	uint64_t pat;
914 
915 	if (!(ci->ci_feat_val[0] & CPUID_PAT))
916 		return;
917 
918 	/* We change WT to WC. Leave all other entries the default values. */
919 	pat = PATENTRY(0, PAT_WB) | PATENTRY(1, PAT_WC) |
920 	      PATENTRY(2, PAT_UCMINUS) | PATENTRY(3, PAT_UC) |
921 	      PATENTRY(4, PAT_WB) | PATENTRY(5, PAT_WC) |
922 	      PATENTRY(6, PAT_UCMINUS) | PATENTRY(7, PAT_UC);
923 
924 	wrmsr(MSR_CR_PAT, pat);
925 	cpu_pat_enabled = true;
926 	aprint_debug_dev(ci->ci_dev, "PAT enabled\n");
927 }
928 
929 static pt_entry_t
930 pmap_pat_flags(u_int flags)
931 {
932 	u_int cacheflags = (flags & PMAP_CACHE_MASK);
933 
934 	if (!cpu_pat_enabled) {
935 		switch (cacheflags) {
936 		case PMAP_NOCACHE:
937 		case PMAP_NOCACHE_OVR:
938 			/* results in PGC_UCMINUS on cpus which have
939 			 * the cpuid PAT but PAT "disabled"
940 			 */
941 			return PG_N;
942 		default:
943 			return 0;
944 		}
945 	}
946 
947 	switch (cacheflags) {
948 	case PMAP_NOCACHE:
949 		return PGC_UC;
950 	case PMAP_WRITE_COMBINE:
951 		return PGC_WC;
952 	case PMAP_WRITE_BACK:
953 		return PGC_WB;
954 	case PMAP_NOCACHE_OVR:
955 		return PGC_UCMINUS;
956 	}
957 
958 	return 0;
959 }
960 
961 /*
962  * p m a p   k e n t e r   f u n c t i o n s
963  *
964  * functions to quickly enter/remove pages from the kernel address
965  * space.   pmap_kremove is exported to MI kernel.  we make use of
966  * the recursive PTE mappings.
967  */
968 
969 /*
970  * pmap_kenter_pa: enter a kernel mapping without R/M (pv_entry) tracking
971  *
972  * => no need to lock anything, assume va is already allocated
973  * => should be faster than normal pmap enter function
974  */
975 
976 void
977 pmap_kenter_pa(vaddr_t va, paddr_t pa, vm_prot_t prot, u_int flags)
978 {
979 	pt_entry_t *pte, opte, npte;
980 
981 	KASSERT(!(prot & ~VM_PROT_ALL));
982 
983 	if (va < VM_MIN_KERNEL_ADDRESS)
984 		pte = vtopte(va);
985 	else
986 		pte = kvtopte(va);
987 #ifdef DOM0OPS
988 	if (pa < pmap_pa_start || pa >= pmap_pa_end) {
989 #ifdef DEBUG
990 		printf_nolog("%s: pa 0x%" PRIx64 " for va 0x%" PRIx64
991 		    " outside range\n", __func__, (int64_t)pa, (int64_t)va);
992 #endif /* DEBUG */
993 		npte = pa;
994 	} else
995 #endif /* DOM0OPS */
996 		npte = pmap_pa2pte(pa);
997 	npte |= protection_codes[prot] | PG_k | PG_V | pmap_pg_g;
998 	npte |= pmap_pat_flags(flags);
999 	opte = pmap_pte_testset(pte, npte); /* zap! */
1000 #if defined(DIAGNOSTIC)
1001 	/* XXX For now... */
1002 	if (opte & PG_PS)
1003 		panic("%s: PG_PS", __func__);
1004 #endif
1005 	if ((opte & (PG_V | PG_U)) == (PG_V | PG_U)) {
1006 		/* This should not happen. */
1007 		printf_nolog("%s: mapping already present\n", __func__);
1008 		kpreempt_disable();
1009 		pmap_tlb_shootdown(pmap_kernel(), va, opte, TLBSHOOT_KENTER);
1010 		kpreempt_enable();
1011 	}
1012 }
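
/*
 * Hedged usage sketch (not lifted from a specific caller; "va" is assumed
 * to be a page-aligned KVA reserved elsewhere and "pa" the corresponding
 * physical/bus address): enter one write-combined kernel mapping, use it,
 * then remove it.  As noted for pmap_kremove() below, the removal must be
 * followed by pmap_update() before the VA is reused.
 *
 *	pmap_kenter_pa(va, pa, VM_PROT_READ | VM_PROT_WRITE,
 *	    PMAP_WRITE_COMBINE);
 *	... access the mapping ...
 *	pmap_kremove(va, PAGE_SIZE);
 *	pmap_update(pmap_kernel());
 */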
1013 
1014 void
1015 pmap_emap_enter(vaddr_t va, paddr_t pa, vm_prot_t prot)
1016 {
1017 	pt_entry_t *pte, npte;
1018 
1019 	KASSERT((prot & ~VM_PROT_ALL) == 0);
1020 	pte = (va < VM_MIN_KERNEL_ADDRESS) ? vtopte(va) : kvtopte(va);
1021 
1022 #ifdef DOM0OPS
1023 	if (pa < pmap_pa_start || pa >= pmap_pa_end) {
1024 		npte = pa;
1025 	} else
1026 #endif
1027 		npte = pmap_pa2pte(pa);
1028 
1030 	npte |= protection_codes[prot] | PG_k | PG_V;
1031 	pmap_pte_set(pte, npte);
1032 }
1033 
1034 /*
1035  * pmap_emap_sync: perform TLB flush or pmap load, if it was deferred.
1036  */
1037 void
1038 pmap_emap_sync(bool canload)
1039 {
1040 	struct cpu_info *ci = curcpu();
1041 	struct pmap *pmap;
1042 
1043 	KASSERT(kpreempt_disabled());
1044 	if (__predict_true(ci->ci_want_pmapload && canload)) {
1045 		/*
1046 		 * XXX: Hint for pmap_reactivate(), which might suggest to
1047 		 * not perform TLB flush, if state has not changed.
1048 		 */
1049 		pmap = vm_map_pmap(&curlwp->l_proc->p_vmspace->vm_map);
1050 		if (__predict_false(pmap == ci->ci_pmap)) {
1051 			kcpuset_atomic_clear(pmap->pm_cpus, cpu_index(ci));
1052 		}
1053 		pmap_load();
1054 		KASSERT(ci->ci_want_pmapload == 0);
1055 	} else {
1056 		tlbflush();
1057 	}
1058 
1059 }
1060 
1061 void
1062 pmap_emap_remove(vaddr_t sva, vsize_t len)
1063 {
1064 	pt_entry_t *pte;
1065 	vaddr_t va, eva = sva + len;
1066 
1067 	for (va = sva; va < eva; va += PAGE_SIZE) {
1068 		pte = (va < VM_MIN_KERNEL_ADDRESS) ? vtopte(va) : kvtopte(va);
1069 		pmap_pte_set(pte, 0);
1070 	}
1071 }
1072 
1073 __strict_weak_alias(pmap_kenter_ma, pmap_kenter_pa);
1074 
1075 #if defined(__x86_64__)
1076 /*
1077  * Change protection for a virtual address. Local for a CPU only, don't
1078  * care about TLB shootdowns.
1079  *
1080  * => must be called with preemption disabled
1081  */
1082 void
1083 pmap_changeprot_local(vaddr_t va, vm_prot_t prot)
1084 {
1085 	pt_entry_t *pte, opte, npte;
1086 
1087 	KASSERT(kpreempt_disabled());
1088 
1089 	if (va < VM_MIN_KERNEL_ADDRESS)
1090 		pte = vtopte(va);
1091 	else
1092 		pte = kvtopte(va);
1093 
1094 	npte = opte = *pte;
1095 
1096 	if ((prot & VM_PROT_WRITE) != 0)
1097 		npte |= PG_RW;
1098 	else
1099 		npte &= ~PG_RW;
1100 
1101 	if (opte != npte) {
1102 		pmap_pte_set(pte, npte);
1103 		pmap_pte_flush();
1104 		invlpg(va);
1105 	}
1106 }
1107 #endif /* defined(__x86_64__) */
1108 
1109 /*
1110  * pmap_kremove: remove a kernel mapping(s) without R/M (pv_entry) tracking
1111  *
1112  * => no need to lock anything
1113  * => caller must dispose of any vm_page mapped in the va range
1114  * => note: not an inline function
1115  * => we assume the va is page aligned and the len is a multiple of PAGE_SIZE
1116  * => we assume kernel only unmaps valid addresses and thus don't bother
1117  *    checking the valid bit before doing TLB flushing
1118  * => must be followed by call to pmap_update() before reuse of page
1119  */
1120 
1121 static inline void
1122 pmap_kremove1(vaddr_t sva, vsize_t len, bool localonly)
1123 {
1124 	pt_entry_t *pte, opte;
1125 	vaddr_t va, eva;
1126 
1127 	eva = sva + len;
1128 
1129 	kpreempt_disable();
1130 	for (va = sva; va < eva; va += PAGE_SIZE) {
1131 		pte = kvtopte(va);
1132 		opte = pmap_pte_testset(pte, 0); /* zap! */
1133 		if ((opte & (PG_V | PG_U)) == (PG_V | PG_U) && !localonly) {
1134 			pmap_tlb_shootdown(pmap_kernel(), va, opte,
1135 			    TLBSHOOT_KREMOVE);
1136 		}
1137 		KASSERT((opte & PG_PS) == 0);
1138 		KASSERT((opte & PG_PVLIST) == 0);
1139 	}
1140 	if (localonly) {
1141 		tlbflushg();
1142 	}
1143 	kpreempt_enable();
1144 }
1145 
1146 void
1147 pmap_kremove(vaddr_t sva, vsize_t len)
1148 {
1149 
1150 	pmap_kremove1(sva, len, false);
1151 }
1152 
1153 /*
1154  * pmap_kremove_local: like pmap_kremove(), but only worry about
1155  * TLB invalidations on the current CPU.  this is only intended
1156  * for use while writing kernel crash dumps.
1157  */
1158 
1159 void
1160 pmap_kremove_local(vaddr_t sva, vsize_t len)
1161 {
1162 
1163 	KASSERT(panicstr != NULL);
1164 	pmap_kremove1(sva, len, true);
1165 }
1166 
1167 /*
1168  * p m a p   i n i t   f u n c t i o n s
1169  *
1170  * pmap_bootstrap and pmap_init are called during system startup
1171  * to init the pmap module.   pmap_bootstrap() does a low level
1172  * init just to get things rolling.   pmap_init() finishes the job.
1173  */
1174 
1175 /*
1176  * pmap_bootstrap: get the system in a state where it can run with VM
1177  *	properly enabled (called before main()).   the VM system is
1178  *      fully init'd later...
1179  *
1180  * => on i386, locore.s has already enabled the MMU by allocating
1181  *	a PDP for the kernel, and nkpde PTP's for the kernel.
1182  * => kva_start is the first free virtual address in kernel space
1183  */
1184 
1185 void
1186 pmap_bootstrap(vaddr_t kva_start)
1187 {
1188 	struct pmap *kpm;
1189 	pt_entry_t *pte;
1190 	int i;
1191 	vaddr_t kva;
1192 #ifndef XEN
1193 	pd_entry_t *pde;
1194 	unsigned long p1i;
1195 	vaddr_t kva_end;
1196 #endif
1197 #ifdef __HAVE_DIRECT_MAP
1198 	phys_ram_seg_t *mc;
1199 	long ndmpdp;
1200 	paddr_t lastpa, dmpd, dmpdp, pdp;
1201 	vaddr_t tmpva;
1202 #endif
1203 
1204 	pt_entry_t pg_nx = (cpu_feature[2] & CPUID_NOX ? PG_NX : 0);
1205 
1206 	/*
1207 	 * set up our local static global vars that keep track of the
1208 	 * usage of KVM before kernel_map is set up
1209 	 */
1210 
1211 	virtual_avail = kva_start;		/* first free KVA */
1212 	virtual_end = VM_MAX_KERNEL_ADDRESS;	/* last KVA */
1213 
1214 	/*
1215 	 * set up protection_codes: we need to be able to convert from
1216 	 * a MI protection code (some combo of VM_PROT...) to something
1217 	 * we can jam into a i386 PTE.
1218 	 */
1219 
1220 	protection_codes[VM_PROT_NONE] = pg_nx;			/* --- */
1221 	protection_codes[VM_PROT_EXECUTE] = PG_RO | PG_X;	/* --x */
1222 	protection_codes[VM_PROT_READ] = PG_RO | pg_nx;		/* -r- */
1223 	protection_codes[VM_PROT_READ|VM_PROT_EXECUTE] = PG_RO | PG_X;/* -rx */
1224 	protection_codes[VM_PROT_WRITE] = PG_RW | pg_nx;	/* w-- */
1225 	protection_codes[VM_PROT_WRITE|VM_PROT_EXECUTE] = PG_RW | PG_X;/* w-x */
1226 	protection_codes[VM_PROT_WRITE|VM_PROT_READ] = PG_RW | pg_nx;
1227 								/* wr- */
1228 	protection_codes[VM_PROT_ALL] = PG_RW | PG_X;		/* wrx */
1229 
1230 	/*
1231 	 * now we init the kernel's pmap
1232 	 *
1233 	 * the kernel pmap's pm_obj is not used for much.   however, in
1234 	 * user pmaps the pm_obj contains the list of active PTPs.
1235 	 * the pm_obj currently does not have a pager.   it might be possible
1236 	 * to add a pager that would allow a process to read-only mmap its
1237 	 * own page tables (fast user level vtophys?).   this may or may not
1238 	 * be useful.
1239 	 */
1240 
1241 	kpm = pmap_kernel();
1242 	for (i = 0; i < PTP_LEVELS - 1; i++) {
1243 		mutex_init(&kpm->pm_obj_lock[i], MUTEX_DEFAULT, IPL_NONE);
1244 		uvm_obj_init(&kpm->pm_obj[i], NULL, false, 1);
1245 		uvm_obj_setlock(&kpm->pm_obj[i], &kpm->pm_obj_lock[i]);
1246 		kpm->pm_ptphint[i] = NULL;
1247 	}
1248 	memset(&kpm->pm_list, 0, sizeof(kpm->pm_list));  /* pm_list not used */
1249 
1250 	kpm->pm_pdir = (pd_entry_t *)(PDPpaddr + KERNBASE);
1251 	for (i = 0; i < PDP_SIZE; i++)
1252 		kpm->pm_pdirpa[i] = PDPpaddr + PAGE_SIZE * i;
1253 
1254 	kpm->pm_stats.wired_count = kpm->pm_stats.resident_count =
1255 		x86_btop(kva_start - VM_MIN_KERNEL_ADDRESS);
1256 
1257 	kcpuset_create(&kpm->pm_cpus, true);
1258 	kcpuset_create(&kpm->pm_kernel_cpus, true);
1259 
1260 	/*
1261 	 * the above is just a rough estimate and not critical to the proper
1262 	 * operation of the system.
1263 	 */
1264 
1265 #ifndef XEN
1266 	/*
1267 	 * Begin to enable global TLB entries if they are supported.
1268 	 * The G bit has no effect until the CR4_PGE bit is set in CR4,
1269 	 * which happens in cpu_init(), which is run on each cpu
1270 	 * (and happens later)
1271 	 */
1272 
1273 	if (cpu_feature[0] & CPUID_PGE) {
1274 		pmap_pg_g = PG_G;		/* enable software */
1275 
1276 		/* add PG_G attribute to already mapped kernel pages */
1277 		if (KERNBASE == VM_MIN_KERNEL_ADDRESS) {
1278 			kva_end = virtual_avail;
1279 		} else {
1280 			extern vaddr_t eblob, esym;
1281 			kva_end = (vaddr_t)&end;
1282 			if (esym > kva_end)
1283 				kva_end = esym;
1284 			if (eblob > kva_end)
1285 				kva_end = eblob;
1286 			kva_end = roundup(kva_end, PAGE_SIZE);
1287 		}
1288 		for (kva = KERNBASE; kva < kva_end; kva += PAGE_SIZE) {
1289 			p1i = pl1_i(kva);
1290 			if (pmap_valid_entry(PTE_BASE[p1i]))
1291 				PTE_BASE[p1i] |= PG_G;
1292 		}
1293 	}
1294 
1295 	/*
1296 	 * enable large pages if they are supported.
1297 	 */
1298 
1299 	if (cpu_feature[0] & CPUID_PSE) {
1300 		paddr_t pa;
1301 		extern char __data_start;
1302 
1303 		lcr4(rcr4() | CR4_PSE);	/* enable hardware (via %cr4) */
1304 		pmap_largepages = 1;	/* enable software */
1305 
1306 		/*
1307 		 * the TLB must be flushed after enabling large pages
1308 		 * on Pentium CPUs, according to section 3.6.2.2 of
1309 		 * "Intel Architecture Software Developer's Manual,
1310 		 * Volume 3: System Programming".
1311 		 */
1312 		tlbflushg();
1313 
1314 		/*
1315 		 * now, remap the kernel text using large pages.  we
1316 		 * assume that the linker has properly aligned the
1317 		 * .data segment to a NBPD_L2 boundary.
1318 		 */
1319 		kva_end = rounddown((vaddr_t)&__data_start, NBPD_L1);
1320 		for (pa = 0, kva = KERNBASE; kva + NBPD_L2 <= kva_end;
1321 		     kva += NBPD_L2, pa += NBPD_L2) {
1322 			pde = &L2_BASE[pl2_i(kva)];
1323 			*pde = pa | pmap_pg_g | PG_PS |
1324 			    PG_KR | PG_V;	/* zap! */
1325 			tlbflushg();
1326 		}
1327 #if defined(DEBUG)
1328 		aprint_normal("kernel text is mapped with %" PRIuPSIZE " large "
1329 		    "pages and %" PRIuPSIZE " normal pages\n",
1330 		    howmany(kva - KERNBASE, NBPD_L2),
1331 		    howmany((vaddr_t)&__data_start - kva, NBPD_L1));
1332 #endif /* defined(DEBUG) */
1333 	}
1334 #endif /* !XEN */
1335 
1336 #ifdef __HAVE_DIRECT_MAP
1337 
1338 	tmpva = (KERNBASE + NKL2_KIMG_ENTRIES * NBPD_L2);
1339 	pte = PTE_BASE + pl1_i(tmpva);
1340 
1341 	/*
1342 	 * Map the direct map.  Use 1GB pages if they are available,
1343 	 * otherwise use 2MB pages.  Note that the unused parts of
1344 	 * PTPs must be zeroed out, as they might be accessed due
1345 	 * to speculative execution.  Also, PG_G is not allowed on
1346 	 * non-leaf PTPs.
1347 	 */
1348 
1349 	lastpa = 0;
1350 	for (i = 0; i < mem_cluster_cnt; i++) {
1351 		mc = &mem_clusters[i];
1352 		lastpa = MAX(lastpa, mc->start + mc->size);
1353 	}
1354 
1355 	ndmpdp = (lastpa + NBPD_L3 - 1) >> L3_SHIFT;
1356 	dmpdp = avail_start;	avail_start += PAGE_SIZE;
1357 
1358 	*pte = dmpdp | PG_V | PG_RW;
1359 	pmap_update_pg(tmpva);
1360 	memset((void *)tmpva, 0, PAGE_SIZE);
1361 
1362 	if (cpu_feature[2] & CPUID_P1GB) {
1363 		for (i = 0; i < ndmpdp; i++) {
1364 			pdp = (paddr_t)&(((pd_entry_t *)dmpdp)[i]);
1365 			*pte = (pdp & PG_FRAME) | PG_V | PG_RW;
1366 			pmap_update_pg(tmpva);
1367 
1368 			pde = (pd_entry_t *)(tmpva + (pdp & ~PG_FRAME));
1369 			*pde = ((paddr_t)i << L3_SHIFT) |
1370 				PG_RW | PG_V | PG_U | PG_PS | PG_G;
1371 		}
1372 	} else {
1373 		dmpd = avail_start;	avail_start += ndmpdp * PAGE_SIZE;
1374 
1375 		for (i = 0; i < ndmpdp; i++) {
1376 			pdp = dmpd + i * PAGE_SIZE;
1377 			*pte = (pdp & PG_FRAME) | PG_V | PG_RW;
1378 			pmap_update_pg(tmpva);
1379 
1380 			memset((void *)tmpva, 0, PAGE_SIZE);
1381 		}
1382 		for (i = 0; i < NPDPG * ndmpdp; i++) {
1383 			pdp = (paddr_t)&(((pd_entry_t *)dmpd)[i]);
1384 			*pte = (pdp & PG_FRAME) | PG_V | PG_RW;
1385 			pmap_update_pg(tmpva);
1386 
1387 			pde = (pd_entry_t *)(tmpva + (pdp & ~PG_FRAME));
1388 			*pde = ((paddr_t)i << L2_SHIFT) |
1389 				PG_RW | PG_V | PG_U | PG_PS | PG_G;
1390 		}
1391 		for (i = 0; i < ndmpdp; i++) {
1392 			pdp = (paddr_t)&(((pd_entry_t *)dmpdp)[i]);
1393 			*pte = (pdp & PG_FRAME) | PG_V | PG_RW;
1394 			pmap_update_pg((vaddr_t)tmpva);
1395 
1396 			pde = (pd_entry_t *)(tmpva + (pdp & ~PG_FRAME));
1397 			*pde = (dmpd + (i << PAGE_SHIFT)) |
1398 				PG_RW | PG_V | PG_U;
1399 		}
1400 	}
1401 
1402 	kpm->pm_pdir[PDIR_SLOT_DIRECT] = dmpdp | PG_KW | PG_V | PG_U;
1403 
1404 	tlbflush();
1405 
1406 #else
1407 	if (VM_MIN_KERNEL_ADDRESS != KERNBASE) {
1408 		/*
1409 		 * zero_pte is stuck at the end of mapped space for the kernel
1410 		 * image (disjunct from kva space). This is done so that it
1411 		 * can safely be used in pmap_growkernel (pmap_get_physpage),
1412 		 * when it's called for the first time.
1413 		 * XXXfvdl fix this for MULTIPROCESSOR later.
1414 		 */
1415 #ifdef XEN
1416 		/* early_zerop initialized in xen_pmap_bootstrap() */
1417 #else
1418 		early_zerop = (void *)(KERNBASE + NKL2_KIMG_ENTRIES * NBPD_L2);
1419 #endif
1420 		early_zero_pte = PTE_BASE + pl1_i((vaddr_t)early_zerop);
1421 	}
1422 
1423 	/*
1424 	 * now we allocate the "special" VAs which are used for tmp mappings
1425 	 * by the pmap (and other modules).    we allocate the VAs by advancing
1426 	 * virtual_avail (note that there are no pages mapped at these VAs).
1427 	 * we find the PTE that maps the allocated VA via the linear PTE
1428 	 * mapping.
1429 	 */
1430 
1431 	pte = PTE_BASE + pl1_i(virtual_avail);
1432 
1433 #ifdef MULTIPROCESSOR
1434 	/*
1435 	 * Waste some VA space to avoid false sharing of cache lines
1436 	 * for page table pages: Give each possible CPU a cache line
1437 	 * of PTE's (8) to play with, though we only need 4.  We could
1438 	 * recycle some of this waste by putting the idle stacks here
1439 	 * as well; we could waste less space if we knew the largest
1440 	 * CPU ID beforehand.
1441 	 */
1442 	csrcp = (char *) virtual_avail;  csrc_pte = pte;
1443 
1444 	cdstp = (char *) virtual_avail+PAGE_SIZE;  cdst_pte = pte+1;
1445 
1446 	zerop = (char *) virtual_avail+PAGE_SIZE*2;  zero_pte = pte+2;
1447 
1448 	ptpp = (char *) virtual_avail+PAGE_SIZE*3;  ptp_pte = pte+3;
1449 
1450 	virtual_avail += PAGE_SIZE * maxcpus * NPTECL;
1451 	pte += maxcpus * NPTECL;
1452 #else
1453 	csrcp = (void *) virtual_avail;  csrc_pte = pte;	/* allocate */
1454 	virtual_avail += PAGE_SIZE; pte++;			/* advance */
1455 
1456 	cdstp = (void *) virtual_avail;  cdst_pte = pte;
1457 	virtual_avail += PAGE_SIZE; pte++;
1458 
1459 	zerop = (void *) virtual_avail;  zero_pte = pte;
1460 	virtual_avail += PAGE_SIZE; pte++;
1461 
1462 	ptpp = (void *) virtual_avail;  ptp_pte = pte;
1463 	virtual_avail += PAGE_SIZE; pte++;
1464 #endif
1465 
1466 	if (VM_MIN_KERNEL_ADDRESS == KERNBASE) {
1467 		early_zerop = zerop;
1468 		early_zero_pte = zero_pte;
1469 	}
1470 #endif
1471 
1472 	/*
1473 	 * Nothing after this point actually needs pte.
1474 	 */
1475 	pte = (void *)0xdeadbeef;
1476 
1477 #ifdef XEN
1478 #ifdef __x86_64__
1479 	/*
1480 	 * We want a dummy page directory for Xen:
1481 	 * when we deactivate a pmap, Xen will still consider it active.
1482 	 * So we set the user PGD to this dummy one to lift all protection
1483 	 * on the now-inactive set of page tables.
1484 	 */
1485 	xen_dummy_user_pgd = avail_start;
1486 	avail_start += PAGE_SIZE;
1487 
1488 	/* Zero fill it, the less checks in Xen it requires the better */
1489 	memset((void *) (xen_dummy_user_pgd + KERNBASE), 0, PAGE_SIZE);
1490 	/* Mark read-only */
1491 	HYPERVISOR_update_va_mapping(xen_dummy_user_pgd + KERNBASE,
1492 	    pmap_pa2pte(xen_dummy_user_pgd) | PG_u | PG_V, UVMF_INVLPG);
1493 	/* Pin as L4 */
1494 	xpq_queue_pin_l4_table(xpmap_ptom_masked(xen_dummy_user_pgd));
1495 #endif /* __x86_64__ */
1496 	idt_vaddr = virtual_avail;                      /* don't need pte */
1497 	idt_paddr = avail_start;                        /* steal a page */
1498 	/*
1499 	 * Xen requires one more page, as we can't store the
1500 	 * GDT and LDT on the same page.
1501 	 */
1502 	virtual_avail += 3 * PAGE_SIZE;
1503 	avail_start += 3 * PAGE_SIZE;
1504 #else /* XEN */
1505 	idt_vaddr = virtual_avail;			/* don't need pte */
1506 	idt_paddr = avail_start;			/* steal a page */
1507 #if defined(__x86_64__)
1508 	virtual_avail += 2 * PAGE_SIZE;
1509 	avail_start += 2 * PAGE_SIZE;
1510 #else /* defined(__x86_64__) */
1511 	virtual_avail += PAGE_SIZE;
1512 	avail_start += PAGE_SIZE;
1513 	/* pentium f00f bug stuff */
1514 	pentium_idt_vaddr = virtual_avail;		/* don't need pte */
1515 	virtual_avail += PAGE_SIZE;
1516 #endif /* defined(__x86_64__) */
1517 #endif /* XEN */
1518 
1519 #ifdef _LP64
1520 	/*
1521 	 * Grab a page below 4G for things that need it (i.e.
1522 	 * having an initial %cr3 for the MP trampoline).
1523 	 */
1524 	lo32_vaddr = virtual_avail;
1525 	virtual_avail += PAGE_SIZE;
1526 	lo32_paddr = avail_start;
1527 	avail_start += PAGE_SIZE;
1528 #endif
1529 
1530 	/*
1531 	 * now we reserve some VM for mapping pages when doing a crash dump
1532 	 */
1533 
1534 	virtual_avail = reserve_dumppages(virtual_avail);
1535 
1536 	/*
1537 	 * init the static-global locks and global lists.
1538 	 *
1539 	 * => pventry::pvh_lock (initialized elsewhere) must also be
1540 	 *      a spin lock, again at IPL_VM to prevent deadlock, and
1541 	 *	again is never taken from interrupt context.
1542 	 */
1543 
1544 	mutex_init(&pmaps_lock, MUTEX_DEFAULT, IPL_NONE);
1545 	LIST_INIT(&pmaps);
1546 
1547 	/*
1548 	 * ensure the TLB is sync'd with reality by flushing it...
1549 	 */
1550 
1551 	tlbflushg();
1552 
1553 	/*
1554 	 * calculate pmap_maxkvaddr from nkptp[].
1555 	 */
1556 
1557 	kva = VM_MIN_KERNEL_ADDRESS;
1558 	for (i = PTP_LEVELS - 1; i >= 1; i--) {
1559 		kva += nkptp[i] * nbpd[i];
1560 	}
1561 	pmap_maxkvaddr = kva;
1562 }
1563 
1564 #if defined(__x86_64__)
1565 /*
1566  * Pre-allocate PTPs for low memory, so that 1:1 mappings for various
1567  * trampoline code can be entered.
1568  */
1569 void
1570 pmap_prealloc_lowmem_ptps(void)
1571 {
1572 	int level;
1573 	paddr_t newp;
1574 	pd_entry_t *pdes;
1575 
1576 	const pd_entry_t pteflags = PG_k | PG_V | PG_RW;
1577 
1578 	pdes = pmap_kernel()->pm_pdir;
1579 	level = PTP_LEVELS;
1580 	for (;;) {
1581 		newp = avail_start;
1582 		avail_start += PAGE_SIZE;
1583 #ifdef __HAVE_DIRECT_MAP
1584 		memset((void *)PMAP_DIRECT_MAP(newp), 0, PAGE_SIZE);
1585 #else
1586 		pmap_pte_set(early_zero_pte, pmap_pa2pte(newp) | pteflags);
1587 		pmap_pte_flush();
1588 		pmap_update_pg((vaddr_t)early_zerop);
1589 		memset(early_zerop, 0, PAGE_SIZE);
1590 #endif
1591 
1592 #ifdef XEN
1593 		/* Mark R/O before installing */
1594 		HYPERVISOR_update_va_mapping ((vaddr_t)early_zerop,
1595 		    xpmap_ptom_masked(newp) | PG_u | PG_V, UVMF_INVLPG);
1596 		if (newp < (NKL2_KIMG_ENTRIES * NBPD_L2))
1597 			HYPERVISOR_update_va_mapping (newp + KERNBASE,
1598 			    xpmap_ptom_masked(newp) | PG_u | PG_V, UVMF_INVLPG);
1599 
1600 
1601 		if (level == PTP_LEVELS) { /* Top level pde is per-cpu */
1602 			pd_entry_t *kpm_pdir;
1603 			/* Reach it via recursive mapping */
1604 			kpm_pdir = normal_pdes[PTP_LEVELS - 2];
1605 
1606 			/* Set it as usual. We can't defer this
1607 			 * outside the loop since recursive
1608 			 * pte entries won't be accessible during
1609 			 * further iterations at lower levels
1610 			 * otherwise.
1611 			 */
1612 			pmap_pte_set(&kpm_pdir[pl_i(0, PTP_LEVELS)],
1613 			    pmap_pa2pte(newp) | pteflags);
1614 		}
1615 
1616 #endif /* XEN */
1617 		pmap_pte_set(&pdes[pl_i(0, level)],
1618 		    pmap_pa2pte(newp) | pteflags);
1619 
1620 		pmap_pte_flush();
1621 
1622 		level--;
1623 		if (level <= 1)
1624 			break;
1625 		pdes = normal_pdes[level - 2];
1626 	}
1627 }
1628 #endif /* defined(__x86_64__) */
1629 
1630 /*
1631  * pmap_init: called from uvm_init, our job is to get the pmap
1632  * system ready to manage mappings...
1633  */
1634 
1635 void
1636 pmap_init(void)
1637 {
1638 	int i, flags;
1639 
1640 	for (i = 0; i < PV_HASH_SIZE; i++) {
1641 		SLIST_INIT(&pv_hash_heads[i].hh_list);
1642 	}
1643 	for (i = 0; i < PV_HASH_LOCK_CNT; i++) {
1644 		mutex_init(&pv_hash_locks[i].lock, MUTEX_NODEBUG, IPL_VM);
1645 	}
1646 
1647 	/*
1648 	 * initialize caches.
1649 	 */
1650 
1651 	pool_cache_bootstrap(&pmap_cache, sizeof(struct pmap), 0, 0, 0,
1652 	    "pmappl", NULL, IPL_NONE, NULL, NULL, NULL);
1653 
1654 #ifdef XEN
1655 	/*
1656 	 * pool_cache(9) should not touch cached objects, since they
1657 	 * are pinned on xen and R/O for the domU
1658 	 */
1659 	flags = PR_NOTOUCH;
1660 #else /* XEN */
1661 	flags = 0;
1662 #endif /* XEN */
1663 #ifdef PAE
1664 	pool_cache_bootstrap(&pmap_pdp_cache, PAGE_SIZE * PDP_SIZE, 0, 0, flags,
1665 	    "pdppl", &pmap_pdp_allocator, IPL_NONE,
1666 	    pmap_pdp_ctor, pmap_pdp_dtor, NULL);
1667 #else /* PAE */
1668 	pool_cache_bootstrap(&pmap_pdp_cache, PAGE_SIZE, 0, 0, flags,
1669 	    "pdppl", NULL, IPL_NONE, pmap_pdp_ctor, pmap_pdp_dtor, NULL);
1670 #endif /* PAE */
1671 	pool_cache_bootstrap(&pmap_pv_cache, sizeof(struct pv_entry), 0, 0,
1672 	    PR_LARGECACHE, "pvpl", &pool_allocator_kmem, IPL_NONE, NULL,
1673 	    NULL, NULL);
1674 
1675 	pmap_tlb_init();
1676 
1677 	/* XXX: Since cpu_hatch() is only for secondary CPUs. */
1678 	pmap_tlb_cpu_init(curcpu());
1679 
1680 	evcnt_attach_dynamic(&pmap_iobmp_evcnt, EVCNT_TYPE_MISC,
1681 	    NULL, "x86", "io bitmap copy");
1682 	evcnt_attach_dynamic(&pmap_ldt_evcnt, EVCNT_TYPE_MISC,
1683 	    NULL, "x86", "ldt sync");
1684 
1685 	/*
1686 	 * done: pmap module is up (and ready for business)
1687 	 */
1688 
1689 	pmap_initialized = true;
1690 }
1691 
1692 /*
1693  * pmap_cpu_init_late: perform late per-CPU initialization.
1694  */
1695 
1696 #ifndef XEN
1697 void
1698 pmap_cpu_init_late(struct cpu_info *ci)
1699 {
1700 	/*
1701 	 * The BP already has its own PD page, allocated during early
1702 	 * MD startup.
1703 	 */
1704 	if (ci == &cpu_info_primary)
1705 		return;
1706 
1707 #ifdef PAE
1708 	cpu_alloc_l3_page(ci);
1709 #endif
1710 }
1711 #endif
1712 
1713 /*
1714  * p v _ e n t r y   f u n c t i o n s
1715  */
1716 
1717 /*
1718  * pmap_free_pvs: free a list of pv_entrys
1719  */
1720 
1721 static void
1722 pmap_free_pvs(struct pv_entry *pve)
1723 {
1724 	struct pv_entry *next;
1725 
1726 	for ( /* null */ ; pve != NULL ; pve = next) {
1727 		next = pve->pve_next;
1728 		pool_cache_put(&pmap_pv_cache, pve);
1729 	}
1730 }
1731 
1732 /*
1733  * main pv_entry manipulation functions:
1734  *   pmap_enter_pv: enter a mapping onto a pv_head list
1735  *   pmap_remove_pv: remove a mapping from a pv_head list
1736  *
1737  * NOTE: Both pmap_enter_pv and pmap_remove_pv expect the caller to lock
1738  *       the pvh before calling
1739  */
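
/*
 * Implementation note (summarizing the code below): the first mapping of
 * a managed page is stored directly in its struct pmap_page and marked
 * PP_EMBEDDED, so no pv_entry needs to be allocated for it.  When a
 * second mapping is entered, pmap_enter_pv() moves the embedded entry
 * into a real pv_entry (the caller-supplied spare) and links both onto
 * the page's pv list and the global pv hash via insert_pv().
 */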
1740 
1741 /*
1742  * insert_pv: a helper of pmap_enter_pv
1743  */
1744 
1745 static void
1746 insert_pv(struct pmap_page *pp, struct pv_entry *pve)
1747 {
1748 	struct pv_hash_head *hh;
1749 	kmutex_t *lock;
1750 	u_int hash;
1751 
1752 	hash = pvhash_hash(pve->pve_pte.pte_ptp, pve->pve_pte.pte_va);
1753 	lock = pvhash_lock(hash);
1754 	hh = pvhash_head(hash);
1755 	mutex_spin_enter(lock);
1756 	SLIST_INSERT_HEAD(&hh->hh_list, pve, pve_hash);
1757 	mutex_spin_exit(lock);
1758 
1759 	LIST_INSERT_HEAD(&pp->pp_head.pvh_list, pve, pve_list);
1760 }
1761 
1762 /*
1763  * pmap_enter_pv: enter a mapping onto a pv_head list
1764  *
1765  * => caller should adjust ptp's wire_count before calling
1766  */
1767 
1768 static struct pv_entry *
1769 pmap_enter_pv(struct pmap_page *pp,
1770 	      struct pv_entry *pve,	/* preallocated pve for us to use */
1771 	      struct pv_entry **sparepve,
1772 	      struct vm_page *ptp,
1773 	      vaddr_t va)
1774 {
1775 
1776 	KASSERT(ptp == NULL || ptp->wire_count >= 2);
1777 	KASSERT(ptp == NULL || ptp->uobject != NULL);
1778 	KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset);
1779 
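	/*
	 * The first mapping of a page is recorded directly in the
	 * pmap_page (PP_EMBEDDED), so no pv_entry is needed for it.
	 * When a second mapping arrives, the embedded entry is migrated
	 * into the spare pv_entry and both mappings go onto the pv list.
	 * The preallocated pve is returned to the caller if unused.
	 */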
1780 	if ((pp->pp_flags & PP_EMBEDDED) == 0) {
1781 		if (LIST_EMPTY(&pp->pp_head.pvh_list)) {
1782 			pp->pp_flags |= PP_EMBEDDED;
1783 			pp->pp_pte.pte_ptp = ptp;
1784 			pp->pp_pte.pte_va = va;
1785 
1786 			return pve;
1787 		}
1788 	} else {
1789 		struct pv_entry *pve2;
1790 
1791 		pve2 = *sparepve;
1792 		*sparepve = NULL;
1793 
1794 		pve2->pve_pte = pp->pp_pte;
1795 		pp->pp_flags &= ~PP_EMBEDDED;
1796 		LIST_INIT(&pp->pp_head.pvh_list);
1797 		insert_pv(pp, pve2);
1798 	}
1799 
1800 	pve->pve_pte.pte_ptp = ptp;
1801 	pve->pve_pte.pte_va = va;
1802 	insert_pv(pp, pve);
1803 
1804 	return NULL;
1805 }
1806 
1807 /*
1808  * pmap_remove_pv: try to remove a mapping from a pv_list
1809  *
1810  * => caller should adjust ptp's wire_count and free PTP if needed
1811  * => we return the removed pve
1812  */
1813 
1814 static struct pv_entry *
1815 pmap_remove_pv(struct pmap_page *pp, struct vm_page *ptp, vaddr_t va)
1816 {
1817 	struct pv_hash_head *hh;
1818 	struct pv_entry *pve;
1819 	kmutex_t *lock;
1820 	u_int hash;
1821 
1822 	KASSERT(ptp == NULL || ptp->uobject != NULL);
1823 	KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset);
1824 
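	/*
	 * If the mapping being removed is the one embedded in the
	 * pmap_page itself, just clear PP_EMBEDDED; there is no
	 * pv_entry to hand back to the caller.
	 */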
1825 	if ((pp->pp_flags & PP_EMBEDDED) != 0) {
1826 		KASSERT(pp->pp_pte.pte_ptp == ptp);
1827 		KASSERT(pp->pp_pte.pte_va == va);
1828 
1829 		pp->pp_flags &= ~PP_EMBEDDED;
1830 		LIST_INIT(&pp->pp_head.pvh_list);
1831 
1832 		return NULL;
1833 	}
1834 
1835 	hash = pvhash_hash(ptp, va);
1836 	lock = pvhash_lock(hash);
1837 	hh = pvhash_head(hash);
1838 	mutex_spin_enter(lock);
1839 	pve = pvhash_remove(hh, ptp, va);
1840 	mutex_spin_exit(lock);
1841 
1842 	LIST_REMOVE(pve, pve_list);
1843 
1844 	return pve;
1845 }
1846 
1847 /*
1848  * p t p   f u n c t i o n s
1849  */
1850 
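/*
 * pmap_find_ptp: look up the PTP mapping `va' at the given level.
 *
 * => if `pa' is known and matches the cached per-level hint, the hint
 *    is returned without a lookup in the pmap's uvm_object
 * => pmap must be locked
 */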
1851 static inline struct vm_page *
1852 pmap_find_ptp(struct pmap *pmap, vaddr_t va, paddr_t pa, int level)
1853 {
1854 	int lidx = level - 1;
1855 	struct vm_page *pg;
1856 
1857 	KASSERT(mutex_owned(pmap->pm_lock));
1858 
1859 	if (pa != (paddr_t)-1 && pmap->pm_ptphint[lidx] &&
1860 	    pa == VM_PAGE_TO_PHYS(pmap->pm_ptphint[lidx])) {
1861 		return (pmap->pm_ptphint[lidx]);
1862 	}
1863 	PMAP_SUBOBJ_LOCK(pmap, lidx);
1864 	pg = uvm_pagelookup(&pmap->pm_obj[lidx], ptp_va2o(va, level));
1865 	PMAP_SUBOBJ_UNLOCK(pmap, lidx);
1866 
1867 	KASSERT(pg == NULL || pg->wire_count >= 1);
1868 	return pg;
1869 }
1870 
1871 static inline void
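/*
 * pmap_freepage: detach a PTP from the pmap and queue it for freeing.
 *
 * => the page is not freed here; it is removed from the pmap's object
 *    and put on the current LWP's deferred list (md_gc_ptp), to be
 *    freed once the pending TLB shootdowns have been processed
 */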
1872 pmap_freepage(struct pmap *pmap, struct vm_page *ptp, int level)
1873 {
1874 	lwp_t *l;
1875 	int lidx;
1876 	struct uvm_object *obj;
1877 
1878 	KASSERT(ptp->wire_count == 1);
1879 
1880 	lidx = level - 1;
1881 
1882 	obj = &pmap->pm_obj[lidx];
1883 	pmap_stats_update(pmap, -1, 0);
1884 	if (lidx != 0)
1885 		mutex_enter(obj->vmobjlock);
1886 	if (pmap->pm_ptphint[lidx] == ptp)
1887 		pmap->pm_ptphint[lidx] = TAILQ_FIRST(&obj->memq);
1888 	ptp->wire_count = 0;
1889 	uvm_pagerealloc(ptp, NULL, 0);
1890 	l = curlwp;
1891 	KASSERT((l->l_pflag & LP_INTR) == 0);
1892 	VM_PAGE_TO_PP(ptp)->pp_link = l->l_md.md_gc_ptp;
1893 	l->l_md.md_gc_ptp = ptp;
1894 	if (lidx != 0)
1895 		mutex_exit(obj->vmobjlock);
1896 }
1897 
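/*
 * pmap_free_ptp: free an empty PTP and, walking up the hierarchy, any
 * parent PTPs that become empty as a result.
 *
 * => clears the PDE pointing at each freed PTP and shoots down its
 *    recursive mapping
 * => pmap must be locked and must not be pmap_kernel()
 */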
1898 static void
1899 pmap_free_ptp(struct pmap *pmap, struct vm_page *ptp, vaddr_t va,
1900 	      pt_entry_t *ptes, pd_entry_t * const *pdes)
1901 {
1902 	unsigned long index;
1903 	int level;
1904 	vaddr_t invaladdr;
1905 	pd_entry_t opde;
1906 
1907 	KASSERT(pmap != pmap_kernel());
1908 	KASSERT(mutex_owned(pmap->pm_lock));
1909 	KASSERT(kpreempt_disabled());
1910 
1911 	level = 1;
1912 	do {
1913 		index = pl_i(va, level + 1);
1914 		opde = pmap_pte_testset(&pdes[level - 1][index], 0);
1915 #if defined(XEN)
1916 #  if defined(__x86_64__)
1917 		/*
1918 		 * If the PTP being freed is an L3 page currently mapped in
1919 		 * kernel space on any CPU, clear those mappings before freeing.
1920 		 */
1921 		if (level == PTP_LEVELS - 1) {
1922 			/*
1923 			 * Update the per-cpu PD on all cpus the current
1924 			 * pmap is active on
1925 			 */
1926 			xen_kpm_sync(pmap, index);
1927 		}
1928 #  endif /*__x86_64__ */
1929 		invaladdr = level == 1 ? (vaddr_t)ptes :
1930 		    (vaddr_t)pdes[level - 2];
1931 		pmap_tlb_shootdown(pmap, invaladdr + index * PAGE_SIZE,
1932 		    opde, TLBSHOOT_FREE_PTP1);
1933 		pmap_tlb_shootnow();
1934 #else	/* XEN */
1935 		invaladdr = level == 1 ? (vaddr_t)ptes :
1936 		    (vaddr_t)pdes[level - 2];
1937 		pmap_tlb_shootdown(pmap, invaladdr + index * PAGE_SIZE,
1938 		    opde, TLBSHOOT_FREE_PTP1);
1939 #endif	/* XEN */
1940 		pmap_freepage(pmap, ptp, level);
1941 		if (level < PTP_LEVELS - 1) {
1942 			ptp = pmap_find_ptp(pmap, va, (paddr_t)-1, level + 1);
1943 			ptp->wire_count--;
1944 			if (ptp->wire_count > 1)
1945 				break;
1946 		}
1947 	} while (++level < PTP_LEVELS);
1948 	pmap_pte_flush();
1949 }
1950 
1951 /*
1952  * pmap_get_ptp: get a PTP (if there isn't one, allocate a new one)
1953  *
1954  * => pmap should NOT be pmap_kernel()
1955  * => pmap should be locked
1956  * => preemption should be disabled
1957  */
1958 
1959 static struct vm_page *
1960 pmap_get_ptp(struct pmap *pmap, vaddr_t va, pd_entry_t * const *pdes)
1961 {
1962 	struct vm_page *ptp, *pptp;
1963 	int i;
1964 	unsigned long index;
1965 	pd_entry_t *pva;
1966 	paddr_t ppa, pa;
1967 	struct uvm_object *obj;
1968 
1969 	KASSERT(pmap != pmap_kernel());
1970 	KASSERT(mutex_owned(pmap->pm_lock));
1971 	KASSERT(kpreempt_disabled());
1972 
1973 	ptp = NULL;
1974 	pa = (paddr_t)-1;
1975 
1976 	/*
1977 	 * Loop through all page table levels seeing if we need to
1978 	 * add a new page to that level.
1979 	 */
1980 	for (i = PTP_LEVELS; i > 1; i--) {
1981 		/*
1982 		 * Save values from previous round.
1983 		 */
1984 		pptp = ptp;
1985 		ppa = pa;
1986 
1987 		index = pl_i(va, i);
1988 		pva = pdes[i - 2];
1989 
1990 		if (pmap_valid_entry(pva[index])) {
1991 			ppa = pmap_pte2pa(pva[index]);
1992 			ptp = NULL;
1993 			continue;
1994 		}
1995 
1996 		obj = &pmap->pm_obj[i-2];
1997 		PMAP_SUBOBJ_LOCK(pmap, i - 2);
1998 		ptp = uvm_pagealloc(obj, ptp_va2o(va, i - 1), NULL,
1999 		    UVM_PGA_USERESERVE|UVM_PGA_ZERO);
2000 		PMAP_SUBOBJ_UNLOCK(pmap, i - 2);
2001 
2002 		if (ptp == NULL)
2003 			return NULL;
2004 
2005 		ptp->flags &= ~PG_BUSY; /* never busy */
2006 		ptp->wire_count = 1;
2007 		pmap->pm_ptphint[i - 2] = ptp;
2008 		pa = VM_PAGE_TO_PHYS(ptp);
2009 		pmap_pte_set(&pva[index], (pd_entry_t)
2010 		        (pmap_pa2pte(pa) | PG_u | PG_RW | PG_V));
2011 #if defined(XEN) && defined(__x86_64__)
2012 		if (i == PTP_LEVELS) {
2013 			/*
2014 			 * Update the per-cpu PD on all cpus the current
2015 			 * pmap is active on
2016 			 */
2017 			xen_kpm_sync(pmap, index);
2018 		}
2019 #endif
2020 		pmap_pte_flush();
2021 		pmap_stats_update(pmap, 1, 0);
2022 		/*
2023 		 * If we're not in the top level, increase the
2024 		 * wire count of the parent page.
2025 		 */
2026 		if (i < PTP_LEVELS) {
2027 			if (pptp == NULL) {
2028 				pptp = pmap_find_ptp(pmap, va, ppa, i);
2029 				KASSERT(pptp != NULL);
2030 			}
2031 			pptp->wire_count++;
2032 		}
2033 	}
2034 
2035 	/*
2036 	 * PTP is not NULL if we just allocated a new PTP.  If it is
2037 	 * still NULL, we must look up the existing one.
2038 	 */
2039 	if (ptp == NULL) {
2040 		ptp = pmap_find_ptp(pmap, va, ppa, 1);
2041 		KASSERTMSG(ptp != NULL, "pmap_get_ptp: va %" PRIxVADDR
2042 		    " ppa %" PRIxPADDR "\n", va, ppa);
2043 	}
2044 
2045 	pmap->pm_ptphint[0] = ptp;
2046 	return ptp;
2047 }
2048 
2049 /*
2050  * p m a p  l i f e c y c l e   f u n c t i o n s
2051  */
2052 
2053 /*
2054  * pmap_pdp_ctor: constructor for the PDP cache.
2055  */
2056 static int
2057 pmap_pdp_ctor(void *arg, void *v, int flags)
2058 {
2059 	pd_entry_t *pdir = v;
2060 	paddr_t pdirpa = 0;	/* XXX: GCC */
2061 	vaddr_t object;
2062 	int i;
2063 
2064 #if !defined(XEN) || !defined(__x86_64__)
2065 	int npde;
2066 #endif
2067 #ifdef XEN
2068 	int s;
2069 #endif
2070 
2071 	/*
2072 	 * NOTE: The `pmaps_lock' is held when the PDP is allocated.
2073 	 */
2074 
2075 #if defined(XEN) && defined(__x86_64__)
2076 	/* fetch the physical address of the page directory. */
2077 	(void) pmap_extract(pmap_kernel(), (vaddr_t) pdir, &pdirpa);
2078 
2079 	/* zero init area */
2080 	memset (pdir, 0, PAGE_SIZE); /* Xen wants a clean page */
2081 	/*
2082 	 * this pdir will NEVER be active in kernel mode
2083 	 * so mark recursive entry invalid
2084 	 */
2085 	pdir[PDIR_SLOT_PTE] = pmap_pa2pte(pdirpa) | PG_u;
2086 	/*
2087 	 * A PDP constructed this way will never be used by the kernel,
2088 	 * hence we don't put kernel mappings in it on Xen.
2089 	 * But we need to make pmap_create() happy, so put a dummy (without
2090 	 * PG_V) value at the right place.
2091 	 */
2092 	pdir[PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1] - 1] =
2093 	     (pd_entry_t)-1 & PG_FRAME;
2094 #else /* XEN && __x86_64__*/
2095 	/* zero init area */
2096 	memset(pdir, 0, PDIR_SLOT_PTE * sizeof(pd_entry_t));
2097 
2098 	object = (vaddr_t)v;
2099 	for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) {
2100 		/* fetch the physical address of the page directory. */
2101 		(void) pmap_extract(pmap_kernel(), object, &pdirpa);
2102 		/* put in recursive PDE to map the PTEs */
2103 		pdir[PDIR_SLOT_PTE + i] = pmap_pa2pte(pdirpa) | PG_V;
2104 #ifndef XEN
2105 		pdir[PDIR_SLOT_PTE + i] |= PG_KW;
2106 #endif
2107 	}
2108 
2109 	/* copy kernel's PDE */
2110 	npde = nkptp[PTP_LEVELS - 1];
2111 
2112 	memcpy(&pdir[PDIR_SLOT_KERN], &PDP_BASE[PDIR_SLOT_KERN],
2113 	    npde * sizeof(pd_entry_t));
2114 
2115 	/* zero the rest */
2116 	memset(&pdir[PDIR_SLOT_KERN + npde], 0, (PAGE_SIZE * PDP_SIZE) -
2117 	    (PDIR_SLOT_KERN + npde) * sizeof(pd_entry_t));
2118 
2119 	if (VM_MIN_KERNEL_ADDRESS != KERNBASE) {
2120 		int idx = pl_i(KERNBASE, PTP_LEVELS);
2121 
2122 		pdir[idx] = PDP_BASE[idx];
2123 	}
2124 
2125 #ifdef __HAVE_DIRECT_MAP
2126 	pdir[PDIR_SLOT_DIRECT] = PDP_BASE[PDIR_SLOT_DIRECT];
2127 #endif
2128 
2129 #endif /* XEN  && __x86_64__*/
2130 #ifdef XEN
2131 	s = splvm();
2132 	object = (vaddr_t)v;
2133 	pmap_protect(pmap_kernel(), object, object + (PAGE_SIZE * PDP_SIZE),
2134 	    VM_PROT_READ);
2135 	pmap_update(pmap_kernel());
2136 	for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) {
2137 		/*
2138 		 * pin as an L2/L4 page; the page holding the PDIR_SLOT_PTE
2139 		 * entries must be done last
2140 		 */
2141 #ifdef PAE
2142 		if (i == l2tol3(PDIR_SLOT_PTE))
2143 			continue;
2144 #endif
2145 
2146 		(void) pmap_extract(pmap_kernel(), object, &pdirpa);
2147 #ifdef __x86_64__
2148 		xpq_queue_pin_l4_table(xpmap_ptom_masked(pdirpa));
2149 #else
2150 		xpq_queue_pin_l2_table(xpmap_ptom_masked(pdirpa));
2151 #endif
2152 	}
2153 #ifdef PAE
2154 	object = ((vaddr_t)pdir) + PAGE_SIZE  * l2tol3(PDIR_SLOT_PTE);
2155 	(void)pmap_extract(pmap_kernel(), object, &pdirpa);
2156 	xpq_queue_pin_l2_table(xpmap_ptom_masked(pdirpa));
2157 #endif
2158 	splx(s);
2159 #endif /* XEN */
2160 
2161 	return (0);
2162 }
2163 
2164 /*
2165  * pmap_pdp_dtor: destructor for the PDP cache.
2166  */
2167 
2168 static void
2169 pmap_pdp_dtor(void *arg, void *v)
2170 {
2171 #ifdef XEN
2172 	paddr_t pdirpa = 0;	/* XXX: GCC */
2173 	vaddr_t object = (vaddr_t)v;
2174 	int i;
2175 	int s = splvm();
2176 	pt_entry_t *pte;
2177 
2178 	for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) {
2179 		/* fetch the physical address of the page directory. */
2180 		(void) pmap_extract(pmap_kernel(), object, &pdirpa);
2181 		/* unpin page table */
2182 		xpq_queue_unpin_table(xpmap_ptom_masked(pdirpa));
2183 	}
2184 	object = (vaddr_t)v;
2185 	for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) {
2186 		/* Set page RW again */
2187 		pte = kvtopte(object);
2188 		pmap_pte_set(pte, *pte | PG_RW);
2189 		xen_bcast_invlpg((vaddr_t)object);
2190 	}
2191 	splx(s);
2192 #endif  /* XEN */
2193 }
2194 
2195 #ifdef PAE
2196 
2197 /* pmap_pdp_alloc: Allocate a page for the pdp memory pool. */
2198 
2199 static void *
2200 pmap_pdp_alloc(struct pool *pp, int flags)
2201 {
2202 	return (void *)uvm_km_alloc(kernel_map,
2203 	    PAGE_SIZE * PDP_SIZE, PAGE_SIZE * PDP_SIZE,
2204 	    ((flags & PR_WAITOK) ? 0 : UVM_KMF_NOWAIT | UVM_KMF_TRYLOCK)
2205 	    | UVM_KMF_WIRED);
2206 }
2207 
2208 /*
2209  * pmap_pdp_free: free a PDP
2210  */
2211 
2212 static void
2213 pmap_pdp_free(struct pool *pp, void *v)
2214 {
2215 	uvm_km_free(kernel_map, (vaddr_t)v, PAGE_SIZE * PDP_SIZE,
2216 	    UVM_KMF_WIRED);
2217 }
2218 #endif /* PAE */
2219 
2220 /*
2221  * pmap_create: create a pmap object.
2222  */
2223 struct pmap *
2224 pmap_create(void)
2225 {
2226 	struct pmap *pmap;
2227 	int i;
2228 
2229 	pmap = pool_cache_get(&pmap_cache, PR_WAITOK);
2230 
2231 	/* init uvm_object */
2232 	for (i = 0; i < PTP_LEVELS - 1; i++) {
2233 		mutex_init(&pmap->pm_obj_lock[i], MUTEX_DEFAULT, IPL_NONE);
2234 		uvm_obj_init(&pmap->pm_obj[i], NULL, false, 1);
2235 		uvm_obj_setlock(&pmap->pm_obj[i], &pmap->pm_obj_lock[i]);
2236 		pmap->pm_ptphint[i] = NULL;
2237 	}
2238 	pmap->pm_stats.wired_count = 0;
2239 	/* count the PDP allocd below */
2240 	pmap->pm_stats.resident_count = PDP_SIZE;
2241 #if !defined(__x86_64__)
2242 	pmap->pm_hiexec = 0;
2243 #endif /* !defined(__x86_64__) */
2244 	pmap->pm_flags = 0;
2245 	pmap->pm_gc_ptp = NULL;
2246 
2247 	kcpuset_create(&pmap->pm_cpus, true);
2248 	kcpuset_create(&pmap->pm_kernel_cpus, true);
2249 #ifdef XEN
2250 	kcpuset_create(&pmap->pm_xen_ptp_cpus, true);
2251 #endif
2252 	/* init the LDT */
2253 	pmap->pm_ldt = NULL;
2254 	pmap->pm_ldt_len = 0;
2255 	pmap->pm_ldt_sel = GSYSSEL(GLDT_SEL, SEL_KPL);
2256 
2257 	/* allocate PDP */
2258  try_again:
2259 	pmap->pm_pdir = pool_cache_get(&pmap_pdp_cache, PR_WAITOK);
2260 
2261 	mutex_enter(&pmaps_lock);
2262 
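	/*
	 * The constructor copies the kernel PDEs that existed when this
	 * PDP was built.  If the kernel area has grown since then, the
	 * newest kernel slot of a cached PDP is still zero; destruct it
	 * and fetch a fresh one so the constructor runs again.
	 */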
2263 	if (pmap->pm_pdir[PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1] - 1] == 0) {
2264 		mutex_exit(&pmaps_lock);
2265 		pool_cache_destruct_object(&pmap_pdp_cache, pmap->pm_pdir);
2266 		goto try_again;
2267 	}
2268 
2269 	for (i = 0; i < PDP_SIZE; i++)
2270 		pmap->pm_pdirpa[i] =
2271 		    pmap_pte2pa(pmap->pm_pdir[PDIR_SLOT_PTE + i]);
2272 
2273 	LIST_INSERT_HEAD(&pmaps, pmap, pm_list);
2274 
2275 	mutex_exit(&pmaps_lock);
2276 
2277 	return (pmap);
2278 }
2279 
2280 /*
2281  * pmap_free_ptps: put a list of ptps back to the freelist.
2282  */
2283 
2284 static void
2285 pmap_free_ptps(struct vm_page *empty_ptps)
2286 {
2287 	struct vm_page *ptp;
2288 	struct pmap_page *pp;
2289 
2290 	while ((ptp = empty_ptps) != NULL) {
2291 		pp = VM_PAGE_TO_PP(ptp);
2292 		empty_ptps = pp->pp_link;
2293 		LIST_INIT(&pp->pp_head.pvh_list);
2294 		uvm_pagefree(ptp);
2295 	}
2296 }
2297 
2298 /*
2299  * pmap_destroy: drop reference count on pmap.   free pmap if
2300  *	reference count goes to zero.
2301  */
2302 
2303 void
2304 pmap_destroy(struct pmap *pmap)
2305 {
2306 	lwp_t *l;
2307 	int i;
2308 
2309 	/*
2310 	 * If we have torn down this pmap, process deferred frees and
2311 	 * invalidations.  Free now if the system is low on memory.
2312 	 * Otherwise, free when the pmap is destroyed thus avoiding a
2313 	 * TLB shootdown.
2314 	 */
2315 	l = curlwp;
2316 	if (__predict_false(l->l_md.md_gc_pmap == pmap)) {
2317 		if (uvmexp.free < uvmexp.freetarg) {
2318 			pmap_update(pmap);
2319 		} else {
2320 			KASSERT(pmap->pm_gc_ptp == NULL);
2321 			pmap->pm_gc_ptp = l->l_md.md_gc_ptp;
2322 			l->l_md.md_gc_ptp = NULL;
2323 			l->l_md.md_gc_pmap = NULL;
2324 		}
2325 	}
2326 
2327 	/*
2328 	 * drop reference count
2329 	 */
2330 
2331 	if (atomic_dec_uint_nv(&pmap->pm_obj[0].uo_refs) > 0) {
2332 		return;
2333 	}
2334 
2335 #ifdef DIAGNOSTIC
2336 	CPU_INFO_ITERATOR cii;
2337 	struct cpu_info *ci;
2338 
2339 	for (CPU_INFO_FOREACH(cii, ci)) {
2340 		if (ci->ci_pmap == pmap)
2341 			panic("destroying pmap being used");
2342 #if defined(XEN) && defined(__x86_64__)
2343 		for (i = 0; i < PDIR_SLOT_PTE; i++) {
2344 			if (pmap->pm_pdir[i] != 0 &&
2345 			    ci->ci_kpm_pdir[i] == pmap->pm_pdir[i]) {
2346 				printf("pmap_destroy(%p) pmap_kernel %p "
2347 				    "curcpu %d cpu %d ci_pmap %p "
2348 				    "ci->ci_kpm_pdir[%d]=%" PRIx64
2349 				    " pmap->pm_pdir[%d]=%" PRIx64 "\n",
2350 				    pmap, pmap_kernel(), curcpu()->ci_index,
2351 				    ci->ci_index, ci->ci_pmap,
2352 				    i, ci->ci_kpm_pdir[i],
2353 				    i, pmap->pm_pdir[i]);
2354 				panic("pmap_destroy: used pmap");
2355 			}
2356 		}
2357 #endif
2358 	}
2359 #endif /* DIAGNOSTIC */
2360 
2361 	/*
2362 	 * Reference count is zero, free pmap resources and then free pmap.
2363 	 * First, remove it from global list of pmaps.
2364 	 */
2365 
2366 	mutex_enter(&pmaps_lock);
2367 	LIST_REMOVE(pmap, pm_list);
2368 	mutex_exit(&pmaps_lock);
2369 
2370 	/*
2371 	 * Process deferred PTP frees.  No TLB shootdown required, as the
2372 	 * PTP pages are no longer visible to any CPU.
2373 	 */
2374 
2375 	pmap_free_ptps(pmap->pm_gc_ptp);
2376 
2377 	/*
2378 	 * destroyed pmap shouldn't have remaining PTPs
2379 	 */
2380 
2381 	for (i = 0; i < PTP_LEVELS - 1; i++) {
2382 		KASSERT(pmap->pm_obj[i].uo_npages == 0);
2383 		KASSERT(TAILQ_EMPTY(&pmap->pm_obj[i].memq));
2384 	}
2385 
2386 	pool_cache_put(&pmap_pdp_cache, pmap->pm_pdir);
2387 
2388 #ifdef USER_LDT
2389 	if (pmap->pm_ldt != NULL) {
2390 		/*
2391 		 * no need to switch the LDT; this address space is gone,
2392 		 * nothing is using it.
2393 		 *
2394 		 * No need to lock the pmap for ldt_free (or anything else),
2395 		 * we're the last one to use it.
2396 		 */
2397 		mutex_enter(&cpu_lock);
2398 		ldt_free(pmap->pm_ldt_sel);
2399 		mutex_exit(&cpu_lock);
2400 		uvm_km_free(kernel_map, (vaddr_t)pmap->pm_ldt,
2401 		    pmap->pm_ldt_len, UVM_KMF_WIRED);
2402 	}
2403 #endif
2404 
2405 	for (i = 0; i < PTP_LEVELS - 1; i++) {
2406 		uvm_obj_destroy(&pmap->pm_obj[i], false);
2407 		mutex_destroy(&pmap->pm_obj_lock[i]);
2408 	}
2409 	kcpuset_destroy(pmap->pm_cpus);
2410 	kcpuset_destroy(pmap->pm_kernel_cpus);
2411 #ifdef XEN
2412 	kcpuset_destroy(pmap->pm_xen_ptp_cpus);
2413 #endif
2414 	pool_cache_put(&pmap_cache, pmap);
2415 }
2416 
2417 /*
2418  * pmap_remove_all: pmap is being torn down by the current thread.
2419  * avoid unnecessary invalidations.
2420  */
2421 
2422 void
2423 pmap_remove_all(struct pmap *pmap)
2424 {
2425 	lwp_t *l = curlwp;
2426 
2427 	KASSERT(l->l_md.md_gc_pmap == NULL);
2428 
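	/*
	 * Record the pmap being torn down in the LWP.  pmap_destroy()
	 * checks md_gc_pmap and either processes the deferred PTP frees
	 * immediately (under memory pressure) or takes the list over and
	 * frees it without TLB shootdowns, since the whole address space
	 * is going away.
	 */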
2429 	l->l_md.md_gc_pmap = pmap;
2430 }
2431 
2432 #if defined(PMAP_FORK)
2433 /*
2434  * pmap_fork: perform any necessary data structure manipulation when
2435  * a VM space is forked.
2436  */
2437 
2438 void
2439 pmap_fork(struct pmap *pmap1, struct pmap *pmap2)
2440 {
2441 #ifdef USER_LDT
2442 	union descriptor *new_ldt;
2443 	size_t len;
2444 	int sel;
2445 
2446 	if (__predict_true(pmap1->pm_ldt == NULL)) {
2447 		return;
2448 	}
2449 
2450  retry:
2451 	if (pmap1->pm_ldt != NULL) {
2452 		len = pmap1->pm_ldt_len;
2453 		new_ldt = (union descriptor *)uvm_km_alloc(kernel_map, len, 0,
2454 		    UVM_KMF_WIRED);
2455 		mutex_enter(&cpu_lock);
2456 		sel = ldt_alloc(new_ldt, len);
2457 		if (sel == -1) {
2458 			mutex_exit(&cpu_lock);
2459 			uvm_km_free(kernel_map, (vaddr_t)new_ldt, len,
2460 			    UVM_KMF_WIRED);
2461 			printf("WARNING: pmap_fork: unable to allocate LDT\n");
2462 			return;
2463 		}
2464 	} else {
2465 		len = -1;
2466 		new_ldt = NULL;
2467 		sel = -1;
2468 		mutex_enter(&cpu_lock);
2469 	}
2470 
2471  	/* Copy the LDT, if necessary. */
2472  	if (pmap1->pm_ldt != NULL) {
2473 		if (len != pmap1->pm_ldt_len) {
2474 			if (len != -1) {
2475 				ldt_free(sel);
2476 				uvm_km_free(kernel_map, (vaddr_t)new_ldt,
2477 				    len, UVM_KMF_WIRED);
2478 			}
2479 			mutex_exit(&cpu_lock);
2480 			goto retry;
2481 		}
2482 
2483 		memcpy(new_ldt, pmap1->pm_ldt, len);
2484 		pmap2->pm_ldt = new_ldt;
2485 		pmap2->pm_ldt_len = pmap1->pm_ldt_len;
2486 		pmap2->pm_ldt_sel = sel;
2487 		len = -1;
2488 	}
2489 
2490 	if (len != -1) {
2491 		ldt_free(sel);
2492 		uvm_km_free(kernel_map, (vaddr_t)new_ldt, len,
2493 		    UVM_KMF_WIRED);
2494 	}
2495 	mutex_exit(&cpu_lock);
2496 #endif /* USER_LDT */
2497 }
2498 #endif /* PMAP_FORK */
2499 
2500 #ifdef USER_LDT
2501 
2502 /*
2503  * pmap_ldt_xcall: cross call used by pmap_ldt_sync.  if the named pmap
2504  * is active, reload LDTR.
2505  */
2506 static void
2507 pmap_ldt_xcall(void *arg1, void *arg2)
2508 {
2509 	struct pmap *pm;
2510 
2511 	kpreempt_disable();
2512 	pm = arg1;
2513 	if (curcpu()->ci_pmap == pm) {
2514 		lldt(pm->pm_ldt_sel);
2515 	}
2516 	kpreempt_enable();
2517 }
2518 
2519 /*
2520  * pmap_ldt_sync: LDT selector for the named pmap is changing.  swap
2521  * in the new selector on all CPUs.
2522  */
2523 void
2524 pmap_ldt_sync(struct pmap *pm)
2525 {
2526 	uint64_t where;
2527 
2528 	KASSERT(mutex_owned(&cpu_lock));
2529 
2530 	pmap_ldt_evcnt.ev_count++;
2531 	where = xc_broadcast(0, pmap_ldt_xcall, pm, NULL);
2532 	xc_wait(where);
2533 }
2534 
2535 /*
2536  * pmap_ldt_cleanup: if the pmap has a local LDT, deallocate it, and
2537  * restore the default.
2538  */
2539 
2540 void
2541 pmap_ldt_cleanup(struct lwp *l)
2542 {
2543 	pmap_t pmap = l->l_proc->p_vmspace->vm_map.pmap;
2544 	union descriptor *dp = NULL;
2545 	size_t len = 0;
2546 	int sel = -1;
2547 
2548 	if (__predict_true(pmap->pm_ldt == NULL)) {
2549 		return;
2550 	}
2551 
2552 	mutex_enter(&cpu_lock);
2553 	if (pmap->pm_ldt != NULL) {
2554 		sel = pmap->pm_ldt_sel;
2555 		dp = pmap->pm_ldt;
2556 		len = pmap->pm_ldt_len;
2557 		pmap->pm_ldt_sel = GSYSSEL(GLDT_SEL, SEL_KPL);
2558 		pmap->pm_ldt = NULL;
2559 		pmap->pm_ldt_len = 0;
2560 		pmap_ldt_sync(pmap);
2561 		ldt_free(sel);
2562 		uvm_km_free(kernel_map, (vaddr_t)dp, len, UVM_KMF_WIRED);
2563 	}
2564 	mutex_exit(&cpu_lock);
2565 }
2566 #endif /* USER_LDT */
2567 
2568 /*
2569  * pmap_activate: activate a process' pmap
2570  *
2571  * => must be called with kernel preemption disabled
2572  * => if lwp is the curlwp, then set ci_want_pmapload so that
2573  *    actual MMU context switch will be done by pmap_load() later
2574  */
2575 
2576 void
2577 pmap_activate(struct lwp *l)
2578 {
2579 	struct cpu_info *ci;
2580 	struct pmap *pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map);
2581 
2582 	KASSERT(kpreempt_disabled());
2583 
2584 	ci = curcpu();
2585 
2586 	if (l == ci->ci_curlwp) {
2587 		KASSERT(ci->ci_want_pmapload == 0);
2588 		KASSERT(ci->ci_tlbstate != TLBSTATE_VALID);
2589 #ifdef KSTACK_CHECK_DR0
2590 		/*
2591 		 * setup breakpoint on the top of stack
2592 		 */
2593 		if (l == &lwp0)
2594 			dr0(0, 0, 0, 0);
2595 		else
2596 			dr0(KSTACK_LOWEST_ADDR(l), 1, 3, 1);
2597 #endif
2598 
2599 		/*
2600 		 * no need to switch to kernel vmspace because
2601 		 * it's a subset of any vmspace.
2602 		 */
2603 
2604 		if (pmap == pmap_kernel()) {
2605 			ci->ci_want_pmapload = 0;
2606 			return;
2607 		}
2608 
2609 		ci->ci_want_pmapload = 1;
2610 	}
2611 }
2612 
2613 /*
2614  * pmap_reactivate: try to regain reference to the pmap.
2615  *
2616  * => Must be called with kernel preemption disabled.
2617  */
2618 
2619 static bool
2620 pmap_reactivate(struct pmap *pmap)
2621 {
2622 	struct cpu_info * const ci = curcpu();
2623 	const cpuid_t cid = cpu_index(ci);
2624 	bool result;
2625 
2626 	KASSERT(kpreempt_disabled());
2627 #if defined(XEN) && defined(__x86_64__)
2628 	KASSERT(pmap_pdirpa(pmap, 0) == ci->ci_xen_current_user_pgd);
2629 #elif defined(PAE)
2630 	KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(ci->ci_pae_l3_pdir[0]));
2631 #elif !defined(XEN)
2632 	KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(rcr3()));
2633 #endif
2634 
2635 	/*
2636 	 * If we still have a lazy reference to this pmap, we can assume
2637 	 * that there was no TLB shootdown for this pmap in the meantime.
2638 	 *
2639 	 * The order of events here is important as we must synchronize
2640 	 * with TLB shootdown interrupts.  Declare interest in invalidations
2641 	 * (TLBSTATE_VALID) and then check the CPU set, which the IPIs can
2642 	 * change only when the state is TLBSTATE_LAZY.
2643 	 */
2644 
2645 	ci->ci_tlbstate = TLBSTATE_VALID;
2646 	KASSERT(kcpuset_isset(pmap->pm_kernel_cpus, cid));
2647 
2648 	if (kcpuset_isset(pmap->pm_cpus, cid)) {
2649 		/* We have the reference, state is valid. */
2650 		result = true;
2651 	} else {
2652 		/* Must reload the TLB. */
2653 		kcpuset_atomic_set(pmap->pm_cpus, cid);
2654 		result = false;
2655 	}
2656 	return result;
2657 }
2658 
2659 /*
2660  * pmap_load: perform the actual pmap switch, i.e. fill in %cr3 register
2661  * and relevant LDT info.
2662  *
2663  * Ensures that the current process' pmap is loaded on the current CPU's
2664  * MMU and that there are no stale TLB entries.
2665  *
2666  * => The caller should disable kernel preemption or do check-and-retry
2667  *    to prevent a preemption from undoing our efforts.
2668  * => This function may block.
2669  */
2670 void
2671 pmap_load(void)
2672 {
2673 	struct cpu_info *ci;
2674 	struct pmap *pmap, *oldpmap;
2675 	struct lwp *l;
2676 	struct pcb *pcb;
2677 	cpuid_t cid;
2678 	uint64_t ncsw;
2679 
2680 	kpreempt_disable();
2681  retry:
2682 	ci = curcpu();
2683 	if (!ci->ci_want_pmapload) {
2684 		kpreempt_enable();
2685 		return;
2686 	}
2687 	l = ci->ci_curlwp;
2688 	ncsw = l->l_ncsw;
2689 
2690 	/* should be able to take ipis. */
2691 	KASSERT(ci->ci_ilevel < IPL_HIGH);
2692 #ifdef XEN
2693 	/* Check to see if interrupts are enabled (ie; no events are masked) */
2694 	KASSERT(x86_read_psl() == 0);
2695 #else
2696 	KASSERT((x86_read_psl() & PSL_I) != 0);
2697 #endif
2698 
2699 	KASSERT(l != NULL);
2700 	pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map);
2701 	KASSERT(pmap != pmap_kernel());
2702 	oldpmap = ci->ci_pmap;
2703 	pcb = lwp_getpcb(l);
2704 
2705 	if (pmap == oldpmap) {
2706 		if (!pmap_reactivate(pmap)) {
2707 			u_int gen = uvm_emap_gen_return();
2708 
2709 			/*
2710 			 * the pmap was changed while it was deactivated,
2711 			 * so our TLB may contain stale entries.
2712 			 */
2713 
2714 			tlbflush();
2715 			uvm_emap_update(gen);
2716 		}
2717 
2718 		ci->ci_want_pmapload = 0;
2719 		kpreempt_enable();
2720 		return;
2721 	}
2722 
2723 	/*
2724 	 * Acquire a reference to the new pmap and perform the switch.
2725 	 */
2726 
2727 	pmap_reference(pmap);
2728 
2729 	cid = cpu_index(ci);
2730 	kcpuset_atomic_clear(oldpmap->pm_cpus, cid);
2731 	kcpuset_atomic_clear(oldpmap->pm_kernel_cpus, cid);
2732 
2733 #if defined(XEN) && defined(__x86_64__)
2734 	KASSERT(pmap_pdirpa(oldpmap, 0) == ci->ci_xen_current_user_pgd ||
2735 	    oldpmap == pmap_kernel());
2736 #elif defined(PAE)
2737 	KASSERT(pmap_pdirpa(oldpmap, 0) == pmap_pte2pa(ci->ci_pae_l3_pdir[0]));
2738 #elif !defined(XEN)
2739 	KASSERT(pmap_pdirpa(oldpmap, 0) == pmap_pte2pa(rcr3()));
2740 #endif
2741 	KASSERT(!kcpuset_isset(pmap->pm_cpus, cid));
2742 	KASSERT(!kcpuset_isset(pmap->pm_kernel_cpus, cid));
2743 
2744 	/*
2745 	 * Mark the pmap in use by this CPU.  Again, we must synchronize
2746 	 * with TLB shootdown interrupts, so set the state VALID first,
2747 	 * then register us for shootdown events on this pmap.
2748 	 */
2749 	ci->ci_tlbstate = TLBSTATE_VALID;
2750 	kcpuset_atomic_set(pmap->pm_cpus, cid);
2751 	kcpuset_atomic_set(pmap->pm_kernel_cpus, cid);
2752 	ci->ci_pmap = pmap;
2753 
2754 	/*
2755 	 * update tss.  now that we have registered for invalidations
2756 	 * from other CPUs, we're good to load the page tables.
2757 	 */
2758 #ifdef PAE
2759 	pcb->pcb_cr3 = ci->ci_pae_l3_pdirpa;
2760 #else
2761 	pcb->pcb_cr3 = pmap_pdirpa(pmap, 0);
2762 #endif
2763 
2764 #ifdef i386
2765 #ifndef XEN
2766 	ci->ci_tss.tss_ldt = pmap->pm_ldt_sel;
2767 	ci->ci_tss.tss_cr3 = pcb->pcb_cr3;
2768 #endif /* !XEN */
2769 #endif /* i386 */
2770 
2771 	lldt(pmap->pm_ldt_sel);
2772 
2773 	u_int gen = uvm_emap_gen_return();
2774 	cpu_load_pmap(pmap, oldpmap);
2775 	uvm_emap_update(gen);
2776 
2777 	ci->ci_want_pmapload = 0;
2778 
2779 	/*
2780 	 * we're now running with the new pmap.  drop the reference
2781 	 * to the old pmap.  if we block, we need to go around again.
2782 	 */
2783 
2784 	pmap_destroy(oldpmap);
2785 	if (l->l_ncsw != ncsw) {
2786 		goto retry;
2787 	}
2788 
2789 	kpreempt_enable();
2790 }
2791 
2792 /*
2793  * pmap_deactivate: deactivate a process' pmap.
2794  *
2795  * => Must be called with kernel preemption disabled (high IPL is enough).
2796  */
2797 void
2798 pmap_deactivate(struct lwp *l)
2799 {
2800 	struct pmap *pmap;
2801 	struct cpu_info *ci;
2802 
2803 	KASSERT(kpreempt_disabled());
2804 
2805 	if (l != curlwp) {
2806 		return;
2807 	}
2808 
2809 	/*
2810 	 * Wait for pending TLB shootdowns to complete.  Necessary because
2811 	 * TLB shootdown state is per-CPU, and the LWP may be coming off
2812 	 * the CPU before it has a chance to call pmap_update(), e.g. due
2813 	 * to kernel preemption or blocking routine in between.
2814 	 */
2815 	pmap_tlb_shootnow();
2816 
2817 	ci = curcpu();
2818 
2819 	if (ci->ci_want_pmapload) {
2820 		/*
2821 		 * ci_want_pmapload means that our pmap is not loaded on
2822 		 * the CPU or TLB might be stale.  note that pmap_kernel()
2823 		 * is always considered loaded.
2824 		 */
2825 		KASSERT(vm_map_pmap(&l->l_proc->p_vmspace->vm_map)
2826 		    != pmap_kernel());
2827 		KASSERT(vm_map_pmap(&l->l_proc->p_vmspace->vm_map)
2828 		    != ci->ci_pmap || ci->ci_tlbstate != TLBSTATE_VALID);
2829 
2830 		/*
2831 		 * userspace has not been touched.
2832 		 * nothing to do here.
2833 		 */
2834 
2835 		ci->ci_want_pmapload = 0;
2836 		return;
2837 	}
2838 
2839 	pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map);
2840 
2841 	if (pmap == pmap_kernel()) {
2842 		return;
2843 	}
2844 
2845 #if defined(XEN) && defined(__x86_64__)
2846 	KASSERT(pmap_pdirpa(pmap, 0) == ci->ci_xen_current_user_pgd);
2847 #elif defined(PAE)
2848 	KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(ci->ci_pae_l3_pdir[0]));
2849 #elif !defined(XEN)
2850 	KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(rcr3()));
2851 #endif
2852 	KASSERT(ci->ci_pmap == pmap);
2853 
2854 	/*
2855 	 * we aren't interested in TLB invalidations for this pmap,
2856 	 * at least for the time being.
2857 	 */
2858 
2859 	KASSERT(ci->ci_tlbstate == TLBSTATE_VALID);
2860 	ci->ci_tlbstate = TLBSTATE_LAZY;
2861 }
2862 
2863 /*
2864  * end of lifecycle functions
2865  */
2866 
2867 /*
2868  * some misc. functions
2869  */
2870 
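/*
 * pmap_pdes_invalid: check that the page directories mapping `va' are
 * valid at every level.
 *
 * => returns 0 if all levels are valid, optionally storing the
 *    lowest-level PDE via `lastpde'; otherwise returns the level at
 *    which an invalid PDE was found
 */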
2871 int
2872 pmap_pdes_invalid(vaddr_t va, pd_entry_t * const *pdes, pd_entry_t *lastpde)
2873 {
2874 	int i;
2875 	unsigned long index;
2876 	pd_entry_t pde;
2877 
2878 	for (i = PTP_LEVELS; i > 1; i--) {
2879 		index = pl_i(va, i);
2880 		pde = pdes[i - 2][index];
2881 		if ((pde & PG_V) == 0)
2882 			return i;
2883 	}
2884 	if (lastpde != NULL)
2885 		*lastpde = pde;
2886 	return 0;
2887 }
2888 
2889 /*
2890  * pmap_extract: extract a PA for the given VA
2891  */
2892 
2893 bool
2894 pmap_extract(struct pmap *pmap, vaddr_t va, paddr_t *pap)
2895 {
2896 	pt_entry_t *ptes, pte;
2897 	pd_entry_t pde;
2898 	pd_entry_t * const *pdes;
2899 	struct pmap *pmap2;
2900 	struct cpu_info *ci;
2901 	paddr_t pa;
2902 	lwp_t *l;
2903 	bool hard, rv;
2904 
2905 #ifdef __HAVE_DIRECT_MAP
2906 	if (va >= PMAP_DIRECT_BASE && va < PMAP_DIRECT_END) {
2907 		if (pap != NULL) {
2908 			*pap = va - PMAP_DIRECT_BASE;
2909 		}
2910 		return true;
2911 	}
2912 #endif
2913 
2914 	rv = false;
2915 	pa = 0;
2916 	l = curlwp;
2917 
2918 	KPREEMPT_DISABLE(l);
2919 	ci = l->l_cpu;
2920 	if (__predict_true(!ci->ci_want_pmapload && ci->ci_pmap == pmap) ||
2921 	    pmap == pmap_kernel()) {
2922 		/*
2923 		 * no need to lock, because it's pmap_kernel() or our
2924 		 * own pmap and is active.  if a user pmap, the caller
2925 		 * will hold the vm_map write/read locked and so prevent
2926 		 * entries from disappearing while we are here.  ptps
2927 		 * can disappear via pmap_remove() and pmap_protect(),
2928 		 * but they are called with the vm_map write locked.
2929 		 */
2930 		hard = false;
2931 		ptes = PTE_BASE;
2932 		pdes = normal_pdes;
2933 	} else {
2934 		/* we lose, do it the hard way. */
2935 		hard = true;
2936 		pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
2937 	}
2938 	if (pmap_pdes_valid(va, pdes, &pde)) {
2939 		pte = ptes[pl1_i(va)];
2940 		if (pde & PG_PS) {
2941 			pa = (pde & PG_LGFRAME) | (va & (NBPD_L2 - 1));
2942 			rv = true;
2943 		} else if (__predict_true((pte & PG_V) != 0)) {
2944 			pa = pmap_pte2pa(pte) | (va & (NBPD_L1 - 1));
2945 			rv = true;
2946 		}
2947 	}
2948 	if (__predict_false(hard)) {
2949 		pmap_unmap_ptes(pmap, pmap2);
2950 	}
2951 	KPREEMPT_ENABLE(l);
2952 	if (pap != NULL) {
2953 		*pap = pa;
2954 	}
2955 	return rv;
2956 }
2957 
2958 
2959 /*
2960  * vtophys: virtual address to physical address.  For use by
2961  * machine-dependent code only.
2962  */
2963 
2964 paddr_t
2965 vtophys(vaddr_t va)
2966 {
2967 	paddr_t pa;
2968 
2969 	if (pmap_extract(pmap_kernel(), va, &pa) == true)
2970 		return (pa);
2971 	return (0);
2972 }
2973 
2974 __strict_weak_alias(pmap_extract_ma, pmap_extract);
2975 
2976 #ifdef XEN
2977 
2978 /*
2979  * vtomach: virtual address to machine address.  For use by
2980  * machine-dependent code only.
2981  */
2982 
2983 paddr_t
2984 vtomach(vaddr_t va)
2985 {
2986 	paddr_t pa;
2987 
2988 	if (pmap_extract_ma(pmap_kernel(), va, &pa) == true)
2989 		return (pa);
2990 	return (0);
2991 }
2992 
2993 #endif /* XEN */
2994 
2995 /*
2996  * pmap_virtual_space: used during bootup [pmap_steal_memory] to
2997  *	determine the bounds of the kernel virtual address space.
2998  */
2999 
3000 void
3001 pmap_virtual_space(vaddr_t *startp, vaddr_t *endp)
3002 {
3003 	*startp = virtual_avail;
3004 	*endp = virtual_end;
3005 }
3006 
3007 /*
3008  * pmap_zero_page: zero a page
3009  */
3010 
3011 void
3012 pmap_zero_page(paddr_t pa)
3013 {
3014 #if defined(__HAVE_DIRECT_MAP)
3015 	pagezero(PMAP_DIRECT_MAP(pa));
3016 #else
3017 #if defined(XEN)
3018 	if (XEN_VERSION_SUPPORTED(3, 4)) {
3019 		xen_pagezero(pa);
		return;
	}
3020 #endif
3021 	pt_entry_t *zpte;
3022 	void *zerova;
3023 	int id;
3024 
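	/*
	 * No direct map: borrow this CPU's private zero-page PTE slot,
	 * map the page there and zero it through that window.
	 */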
3025 	kpreempt_disable();
3026 	id = cpu_number();
3027 	zpte = PTESLEW(zero_pte, id);
3028 	zerova = VASLEW(zerop, id);
3029 
3030 #ifdef DIAGNOSTIC
3031 	if (*zpte)
3032 		panic("pmap_zero_page: lock botch");
3033 #endif
3034 
3035 	pmap_pte_set(zpte, pmap_pa2pte(pa) | PG_V | PG_RW | PG_M | PG_U | PG_k);
3036 	pmap_pte_flush();
3037 	pmap_update_pg((vaddr_t)zerova);		/* flush TLB */
3038 
3039 	memset(zerova, 0, PAGE_SIZE);
3040 
3041 #if defined(DIAGNOSTIC) || defined(XEN)
3042 	pmap_pte_set(zpte, 0);				/* zap ! */
3043 	pmap_pte_flush();
3044 #endif
3045 	kpreempt_enable();
3046 #endif /* defined(__HAVE_DIRECT_MAP) */
3047 }
3048 
3049 /*
3050  * pmap_pageidlezero: the same, for the idle loop page zeroer.
3051  * Returns true if the page was zero'd, false if we aborted for
3052  * some reason.
3053  */
3054 
3055 bool
3056 pmap_pageidlezero(paddr_t pa)
3057 {
3058 #ifdef __HAVE_DIRECT_MAP
3059 	KASSERT(cpu_feature[0] & CPUID_SSE2);
3060 	return sse2_idlezero_page((void *)PMAP_DIRECT_MAP(pa));
3061 #else
3062 	pt_entry_t *zpte;
3063 	void *zerova;
3064 	bool rv;
3065 	int id;
3066 
3067 	id = cpu_number();
3068 	zpte = PTESLEW(zero_pte, id);
3069 	zerova = VASLEW(zerop, id);
3070 
3071 	KASSERT(cpu_feature[0] & CPUID_SSE2);
3072 	KASSERT(*zpte == 0);
3073 
3074 	pmap_pte_set(zpte, pmap_pa2pte(pa) | PG_V | PG_RW | PG_M | PG_U | PG_k);
3075 	pmap_pte_flush();
3076 	pmap_update_pg((vaddr_t)zerova);		/* flush TLB */
3077 
3078 	rv = sse2_idlezero_page(zerova);
3079 
3080 #if defined(DIAGNOSTIC) || defined(XEN)
3081 	pmap_pte_set(zpte, 0);				/* zap ! */
3082 	pmap_pte_flush();
3083 #endif
3084 
3085 	return rv;
3086 #endif
3087 }
3088 
3089 /*
3090  * pmap_copy_page: copy a page
3091  */
3092 
3093 void
3094 pmap_copy_page(paddr_t srcpa, paddr_t dstpa)
3095 {
3096 #if defined(__HAVE_DIRECT_MAP)
3097 	vaddr_t srcva = PMAP_DIRECT_MAP(srcpa);
3098 	vaddr_t dstva = PMAP_DIRECT_MAP(dstpa);
3099 
3100 	memcpy((void *)dstva, (void *)srcva, PAGE_SIZE);
3101 #else
3102 #if defined(XEN)
3103 	if (XEN_VERSION_SUPPORTED(3, 4)) {
3104 		xen_copy_page(srcpa, dstpa);
3105 		return;
3106 	}
3107 #endif
3108 	pt_entry_t *spte;
3109 	pt_entry_t *dpte;
3110 	void *csrcva;
3111 	void *cdstva;
3112 	int id;
3113 
3114 	kpreempt_disable();
3115 	id = cpu_number();
3116 	spte = PTESLEW(csrc_pte, id);
3117 	dpte = PTESLEW(cdst_pte, id);
3118 	csrcva = VASLEW(csrcp, id);
3119 	cdstva = VASLEW(cdstp, id);
3120 
3121 	KASSERT(*spte == 0 && *dpte == 0);
3122 
3123 	pmap_pte_set(spte, pmap_pa2pte(srcpa) | PG_V | PG_RW | PG_U | PG_k);
3124 	pmap_pte_set(dpte,
3125 	    pmap_pa2pte(dstpa) | PG_V | PG_RW | PG_M | PG_U | PG_k);
3126 	pmap_pte_flush();
3127 	pmap_update_2pg((vaddr_t)csrcva, (vaddr_t)cdstva);
3128 
3129 	memcpy(cdstva, csrcva, PAGE_SIZE);
3130 
3131 #if defined(DIAGNOSTIC) || defined(XEN)
3132 	pmap_pte_set(spte, 0);
3133 	pmap_pte_set(dpte, 0);
3134 	pmap_pte_flush();
3135 #endif
3136 	kpreempt_enable();
3137 #endif /* defined(__HAVE_DIRECT_MAP) */
3138 }
3139 
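/*
 * pmap_map_ptp: temporarily map a PTP so its PTEs can be inspected.
 *
 * => uses the direct map when available, otherwise this CPU's private
 *    ptp_pte slot (in which case preemption must stay disabled until
 *    pmap_unmap_ptp() is called)
 */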
3140 static pt_entry_t *
3141 pmap_map_ptp(struct vm_page *ptp)
3142 {
3143 #ifdef __HAVE_DIRECT_MAP
3144 	return (void *)PMAP_DIRECT_MAP(VM_PAGE_TO_PHYS(ptp));
3145 #else
3146 	pt_entry_t *ptppte;
3147 	void *ptpva;
3148 	int id;
3149 
3150 	KASSERT(kpreempt_disabled());
3151 
3152 	id = cpu_number();
3153 	ptppte = PTESLEW(ptp_pte, id);
3154 	ptpva = VASLEW(ptpp, id);
3155 #if !defined(XEN)
3156 	pmap_pte_set(ptppte, pmap_pa2pte(VM_PAGE_TO_PHYS(ptp)) | PG_V | PG_M |
3157 	    PG_RW | PG_U | PG_k);
3158 #else
3159 	pmap_pte_set(ptppte, pmap_pa2pte(VM_PAGE_TO_PHYS(ptp)) | PG_V | PG_M |
3160 	    PG_U | PG_k);
3161 #endif
3162 	pmap_pte_flush();
3163 	pmap_update_pg((vaddr_t)ptpva);
3164 
3165 	return (pt_entry_t *)ptpva;
3166 #endif
3167 }
3168 
3169 static void
3170 pmap_unmap_ptp(void)
3171 {
3172 #ifndef __HAVE_DIRECT_MAP
3173 #if defined(DIAGNOSTIC) || defined(XEN)
3174 	pt_entry_t *pte;
3175 
3176 	KASSERT(kpreempt_disabled());
3177 
3178 	pte = PTESLEW(ptp_pte, cpu_number());
3179 	if (*pte != 0) {
3180 		pmap_pte_set(pte, 0);
3181 		pmap_pte_flush();
3182 	}
3183 #endif
3184 #endif
3185 }
3186 
3187 static pt_entry_t *
3188 pmap_map_pte(struct pmap *pmap, struct vm_page *ptp, vaddr_t va)
3189 {
3190 
3191 	KASSERT(kpreempt_disabled());
3192 	if (pmap_is_curpmap(pmap)) {
3193 		return &PTE_BASE[pl1_i(va)]; /* (k)vtopte */
3194 	}
3195 	KASSERT(ptp != NULL);
3196 	return pmap_map_ptp(ptp) + pl1_pi(va);
3197 }
3198 
3199 static void
3200 pmap_unmap_pte(void)
3201 {
3202 
3203 	KASSERT(kpreempt_disabled());
3204 
3205 	pmap_unmap_ptp();
3206 }
3207 
3208 /*
3209  * p m a p   r e m o v e   f u n c t i o n s
3210  *
3211  * functions that remove mappings
3212  */
3213 
3214 /*
3215  * pmap_remove_ptes: remove PTEs from a PTP
3216  *
3217  * => caller must hold pmap's lock
3218  * => PTP must be mapped into KVA
3219  * => PTP should be null if pmap == pmap_kernel()
3220  * => must be called with kernel preemption disabled
3221  * => issues TLB shootdowns for the removed mappings as needed
3222  */
3223 
3224 static void
3225 pmap_remove_ptes(struct pmap *pmap, struct vm_page *ptp, vaddr_t ptpva,
3226 		 vaddr_t startva, vaddr_t endva, struct pv_entry **pv_tofree)
3227 {
3228 	pt_entry_t *pte = (pt_entry_t *)ptpva;
3229 
3230 	KASSERT(pmap == pmap_kernel() || mutex_owned(pmap->pm_lock));
3231 	KASSERT(kpreempt_disabled());
3232 
3233 	/*
3234 	 * note that ptpva points to the PTE that maps startva.   this may
3235 	 * or may not be the first PTE in the PTP.
3236 	 *
3237 	 * we loop through the PTP while there are still PTEs to look at
3238 	 * and the wire_count is greater than 1 (because we use the wire_count
3239 	 * to keep track of the number of real PTEs in the PTP).
3240 	 */
3241 	while (startva < endva && (ptp == NULL || ptp->wire_count > 1)) {
3242 		(void)pmap_remove_pte(pmap, ptp, pte, startva, pv_tofree);
3243 		startva += PAGE_SIZE;
3244 		pte++;
3245 	}
3246 }
3247 
3248 
3249 /*
3250  * pmap_remove_pte: remove a single PTE from a PTP.
3251  *
3252  * => caller must hold pmap's lock
3253  * => PTP must be mapped into KVA
3254  * => PTP should be null if pmap == pmap_kernel()
3255  * => returns true if we removed a mapping
3256  * => must be called with kernel preemption disabled
3257  */
3258 static bool
3259 pmap_remove_pte(struct pmap *pmap, struct vm_page *ptp, pt_entry_t *pte,
3260 		vaddr_t va, struct pv_entry **pv_tofree)
3261 {
3262 	struct pv_entry *pve;
3263 	struct vm_page *pg;
3264 	struct pmap_page *pp;
3265 	pt_entry_t opte;
3266 
3267 	KASSERT(pmap == pmap_kernel() || mutex_owned(pmap->pm_lock));
3268 	KASSERT(kpreempt_disabled());
3269 
3270 	if (!pmap_valid_entry(*pte)) {
3271 		/* VA not mapped. */
3272 		return false;
3273 	}
3274 
3275 	/* Atomically save the old PTE and zap it. */
3276 	opte = pmap_pte_testset(pte, 0);
3277 	if (!pmap_valid_entry(opte)) {
3278 		return false;
3279 	}
3280 
3281 	pmap_exec_account(pmap, va, opte, 0);
3282 	pmap_stats_update_bypte(pmap, 0, opte);
3283 
3284 	if (ptp) {
3285 		/*
3286 		 * Dropping a PTE.  Make sure that the PDE is flushed.
3287 		 */
3288 		ptp->wire_count--;
3289 		if (ptp->wire_count <= 1) {
3290 			opte |= PG_U;
3291 		}
3292 	}
3293 
3294 	if ((opte & PG_U) != 0) {
3295 		pmap_tlb_shootdown(pmap, va, opte, TLBSHOOT_REMOVE_PTE);
3296 	}
3297 
3298 	/*
3299 	 * If we are not on a pv_head list - we are done.
3300 	 */
3301 	if ((opte & PG_PVLIST) == 0) {
3302 #if defined(DIAGNOSTIC) && !defined(DOM0OPS)
3303 		if (PHYS_TO_VM_PAGE(pmap_pte2pa(opte)) != NULL)
3304 			panic("pmap_remove_pte: managed page without "
3305 			      "PG_PVLIST for %#" PRIxVADDR, va);
3306 #endif
3307 		return true;
3308 	}
3309 
3310 	pg = PHYS_TO_VM_PAGE(pmap_pte2pa(opte));
3311 
3312 	KASSERTMSG(pg != NULL, "pmap_remove_pte: unmanaged page marked "
3313 	    "PG_PVLIST, va = %#" PRIxVADDR ", pa = %#" PRIxPADDR,
3314 	    va, (paddr_t)pmap_pte2pa(opte));
3315 
3316 	KASSERT(uvm_page_locked_p(pg));
3317 
3318 	/* Sync R/M bits. */
3319 	pp = VM_PAGE_TO_PP(pg);
3320 	pp->pp_attrs |= opte;
3321 	pve = pmap_remove_pv(pp, ptp, va);
3322 
3323 	if (pve) {
3324 		pve->pve_next = *pv_tofree;
3325 		*pv_tofree = pve;
3326 	}
3327 	return true;
3328 }
3329 
3330 /*
3331  * pmap_remove: mapping removal function.
3332  *
3333  * => caller should not be holding any pmap locks
3334  */
3335 
3336 void
3337 pmap_remove(struct pmap *pmap, vaddr_t sva, vaddr_t eva)
3338 {
3339 	pt_entry_t *ptes;
3340 	pd_entry_t pde;
3341 	pd_entry_t * const *pdes;
3342 	struct pv_entry *pv_tofree = NULL;
3343 	bool result;
3344 	int i;
3345 	paddr_t ptppa;
3346 	vaddr_t blkendva, va = sva;
3347 	struct vm_page *ptp;
3348 	struct pmap *pmap2;
3349 
3350 	kpreempt_disable();
3351 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);	/* locks pmap */
3352 
3353 	/*
3354 	 * removing one page?  take shortcut function.
3355 	 */
3356 
3357 	if (va + PAGE_SIZE == eva) {
3358 		if (pmap_pdes_valid(va, pdes, &pde)) {
3359 
3360 			/* PA of the PTP */
3361 			ptppa = pmap_pte2pa(pde);
3362 
3363 			/* Get PTP if non-kernel mapping. */
3364 			if (pmap != pmap_kernel()) {
3365 				ptp = pmap_find_ptp(pmap, va, ptppa, 1);
3366 				KASSERTMSG(ptp != NULL,
3367 				    "pmap_remove: unmanaged PTP detected");
3368 			} else {
3369 				/* Never free kernel PTPs. */
3370 				ptp = NULL;
3371 			}
3372 
3373 			result = pmap_remove_pte(pmap, ptp,
3374 			    &ptes[pl1_i(va)], va, &pv_tofree);
3375 
3376 			/*
3377 			 * if mapping removed and the PTP is no longer
3378 			 * being used, free it!
3379 			 */
3380 
3381 			if (result && ptp && ptp->wire_count <= 1)
3382 				pmap_free_ptp(pmap, ptp, va, ptes, pdes);
3383 		}
3384 	} else for (/* null */ ; va < eva ; va = blkendva) {
3385 		int lvl;
3386 
3387 		/* determine range of block */
3388 		blkendva = x86_round_pdr(va+1);
3389 		if (blkendva > eva)
3390 			blkendva = eva;
3391 
3392 		/*
3393 		 * XXXCDC: our PTE mappings should never be removed
3394 		 * with pmap_remove!  if we allow this (and why would
3395 		 * we?) then we end up freeing the pmap's page
3396 		 * directory page (PDP) before we are finished using
3397 		 * it when we hit in in the recursive mapping.  this
3398 		 * it when we hit it in the recursive mapping.  this
3399 		 *
3400 		 * long term solution is to move the PTEs out of user
3401 		 * address space.  and into kernel address space (up
3402 		 * with APTE).  then we can set VM_MAXUSER_ADDRESS to
3403 		 * be VM_MAX_ADDRESS.
3404 		 */
3405 
3406 		/* XXXCDC: ugly hack to avoid freeing PDP here */
3407 		for (i = 0; i < PDP_SIZE; i++) {
3408 			if (pl_i(va, PTP_LEVELS) == PDIR_SLOT_PTE+i)
3409 				continue;
3410 		}
3411 
3412 		lvl = pmap_pdes_invalid(va, pdes, &pde);
3413 		if (lvl != 0) {
3414 			/*
3415 			 * skip a range corresponding to an invalid pde.
3416 			 */
3417 			blkendva = (va & ptp_masks[lvl - 1]) + nbpd[lvl - 1];
3418  			continue;
3419 		}
3420 
3421 		/* PA of the PTP */
3422 		ptppa = pmap_pte2pa(pde);
3423 
3424 		/* Get PTP if non-kernel mapping. */
3425 		if (pmap != pmap_kernel()) {
3426 			ptp = pmap_find_ptp(pmap, va, ptppa, 1);
3427 			KASSERTMSG(ptp != NULL,
3428 			    "pmap_remove: unmanaged PTP detected");
3429 		} else {
3430 			/* Never free kernel PTPs. */
3431 			ptp = NULL;
3432 		}
3433 
3434 		pmap_remove_ptes(pmap, ptp, (vaddr_t)&ptes[pl1_i(va)], va,
3435 		    blkendva, &pv_tofree);
3436 
3437 		/* if PTP is no longer being used, free it! */
3438 		if (ptp && ptp->wire_count <= 1) {
3439 			pmap_free_ptp(pmap, ptp, va, ptes, pdes);
3440 		}
3441 	}
3442 	pmap_unmap_ptes(pmap, pmap2);		/* unlock pmap */
3443 	kpreempt_enable();
3444 
3445 	/* Now we free unused PVs */
3446 	if (pv_tofree)
3447 		pmap_free_pvs(pv_tofree);
3448 }
3449 
3450 /*
3451  * pmap_sync_pv: clear pte bits and return the old value of the pte.
3452  *
3453  * => Caller should disable kernel preemption.
3454  * => issues tlb shootdowns if necessary.
3455  */
3456 
3457 static int
3458 pmap_sync_pv(struct pv_pte *pvpte, pt_entry_t expect, int clearbits,
3459     pt_entry_t *optep)
3460 {
3461 	struct pmap *pmap;
3462 	struct vm_page *ptp;
3463 	vaddr_t va;
3464 	pt_entry_t *ptep;
3465 	pt_entry_t opte;
3466 	pt_entry_t npte;
3467 	bool need_shootdown;
3468 
3469 	ptp = pvpte->pte_ptp;
3470 	va = pvpte->pte_va;
3471 	KASSERT(ptp == NULL || ptp->uobject != NULL);
3472 	KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset);
3473 	pmap = ptp_to_pmap(ptp);
3474 
3475 	KASSERT((expect & ~(PG_FRAME | PG_V)) == 0);
3476 	KASSERT((expect & PG_V) != 0);
3477 	KASSERT(clearbits == ~0 || (clearbits & ~(PG_M | PG_U | PG_RW)) == 0);
3478 	KASSERT(kpreempt_disabled());
3479 
3480 	ptep = pmap_map_pte(pmap, ptp, va);
3481 	do {
3482 		opte = *ptep;
3483 		KASSERT((opte & (PG_M | PG_U)) != PG_M);
3484 		KASSERT((opte & (PG_U | PG_V)) != PG_U);
3485 		KASSERT(opte == 0 || (opte & PG_V) != 0);
3486 		if ((opte & (PG_FRAME | PG_V)) != expect) {
3487 
3488 			/*
3489 			 * we lost a race with a V->P operation like
3490 			 * pmap_remove().  wait for the competitor to
3491 			 * reflect the pte bits into pp_attrs.
3492 			 *
3493 			 * issue a redundant TLB shootdown so that
3494 			 * we can wait for its completion.
3495 			 */
3496 
3497 			pmap_unmap_pte();
3498 			if (clearbits != 0) {
3499 				pmap_tlb_shootdown(pmap, va,
3500 				    (pmap == pmap_kernel() ? PG_G : 0),
3501 				    TLBSHOOT_SYNC_PV1);
3502 			}
3503 			return EAGAIN;
3504 		}
3505 
3506 		/*
3507 		 * check if there's anything to do on this pte.
3508 		 */
3509 
3510 		if ((opte & clearbits) == 0) {
3511 			need_shootdown = false;
3512 			break;
3513 		}
3514 
3515 		/*
3516 		 * we need a shootdown if the pte may be cached in a TLB
3517 		 * (PG_U is set) ...
3518 		 * ... unless we are clearing only the PG_RW bit and the
3519 		 * pte is not cached as writable (PG_M is clear).
3520 		 */
3521 
3522 		need_shootdown = (opte & PG_U) != 0 &&
3523 		    !(clearbits == PG_RW && (opte & PG_M) == 0);
3524 
3525 		npte = opte & ~clearbits;
3526 
3527 		/*
3528 		 * if we need a shootdown anyway, clear PG_U and PG_M.
3529 		 */
3530 
3531 		if (need_shootdown) {
3532 			npte &= ~(PG_U | PG_M);
3533 		}
3534 		KASSERT((npte & (PG_M | PG_U)) != PG_M);
3535 		KASSERT((npte & (PG_U | PG_V)) != PG_U);
3536 		KASSERT(npte == 0 || (opte & PG_V) != 0);
3537 	} while (pmap_pte_cas(ptep, opte, npte) != opte);
3538 
3539 	if (need_shootdown) {
3540 		pmap_tlb_shootdown(pmap, va, opte, TLBSHOOT_SYNC_PV2);
3541 	}
3542 	pmap_unmap_pte();
3543 
3544 	*optep = opte;
3545 	return 0;
3546 }
3547 
3548 /*
3549  * pmap_page_remove: remove a managed vm_page from all pmaps that map it
3550  *
3551  * => R/M bits are sync'd back to attrs
3552  */
3553 
3554 void
3555 pmap_page_remove(struct vm_page *pg)
3556 {
3557 	struct pmap_page *pp;
3558 	struct pv_pte *pvpte;
3559 	struct pv_entry *killlist = NULL;
3560 	struct vm_page *ptp;
3561 	pt_entry_t expect;
3562 	int count;
3563 
3564 	KASSERT(uvm_page_locked_p(pg));
3565 
3566 	pp = VM_PAGE_TO_PP(pg);
3567 	expect = pmap_pa2pte(VM_PAGE_TO_PHYS(pg)) | PG_V;
3568 	count = SPINLOCK_BACKOFF_MIN;
3569 	kpreempt_disable();
3570 startover:
3571 	while ((pvpte = pv_pte_first(pp)) != NULL) {
3572 		struct pmap *pmap;
3573 		struct pv_entry *pve;
3574 		pt_entry_t opte;
3575 		vaddr_t va;
3576 		int error;
3577 
3578 		/*
3579 		 * add a reference to the pmap before clearing the pte.
3580 		 * otherwise the pmap can disappear behind us.
3581 		 */
3582 
3583 		ptp = pvpte->pte_ptp;
3584 		pmap = ptp_to_pmap(ptp);
3585 		if (ptp != NULL) {
3586 			pmap_reference(pmap);
3587 		}
3588 
3589 		error = pmap_sync_pv(pvpte, expect, ~0, &opte);
3590 		if (error == EAGAIN) {
3591 			int hold_count;
3592 			KERNEL_UNLOCK_ALL(curlwp, &hold_count);
3593 			if (ptp != NULL) {
3594 				pmap_destroy(pmap);
3595 			}
3596 			SPINLOCK_BACKOFF(count);
3597 			KERNEL_LOCK(hold_count, curlwp);
3598 			goto startover;
3599 		}
3600 
3601 		pp->pp_attrs |= opte;
3602 		va = pvpte->pte_va;
3603 		pve = pmap_remove_pv(pp, ptp, va);
3604 
3605 		/* update the PTP reference count.  free if last reference. */
3606 		if (ptp != NULL) {
3607 			struct pmap *pmap2;
3608 			pt_entry_t *ptes;
3609 			pd_entry_t * const *pdes;
3610 
3611 			KASSERT(pmap != pmap_kernel());
3612 
3613 			pmap_tlb_shootnow();
3614 			pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
3615 			pmap_stats_update_bypte(pmap, 0, opte);
3616 			ptp->wire_count--;
3617 			if (ptp->wire_count <= 1) {
3618 				pmap_free_ptp(pmap, ptp, va, ptes, pdes);
3619 			}
3620 			pmap_unmap_ptes(pmap, pmap2);
3621 			pmap_destroy(pmap);
3622 		} else {
3623 			KASSERT(pmap == pmap_kernel());
3624 			pmap_stats_update_bypte(pmap, 0, opte);
3625 		}
3626 
3627 		if (pve != NULL) {
3628 			pve->pve_next = killlist;	/* mark it for death */
3629 			killlist = pve;
3630 		}
3631 	}
3632 	pmap_tlb_shootnow();
3633 	kpreempt_enable();
3634 
3635 	/* Now free unused pvs. */
3636 	pmap_free_pvs(killlist);
3637 }
3638 
3639 /*
3640  * p m a p   a t t r i b u t e  f u n c t i o n s
3641  * functions that test/change managed page's attributes
3642  * since a page can be mapped multiple times we must check each PTE that
3643  * maps it by going down the pv lists.
3644  */
3645 
3646 /*
3647  * pmap_test_attrs: test a page's attributes
3648  */
3649 
3650 bool
3651 pmap_test_attrs(struct vm_page *pg, unsigned testbits)
3652 {
3653 	struct pmap_page *pp;
3654 	struct pv_pte *pvpte;
3655 	pt_entry_t expect;
3656 	u_int result;
3657 
3658 	KASSERT(uvm_page_locked_p(pg));
3659 
3660 	pp = VM_PAGE_TO_PP(pg);
3661 	if ((pp->pp_attrs & testbits) != 0) {
3662 		return true;
3663 	}
3664 	expect = pmap_pa2pte(VM_PAGE_TO_PHYS(pg)) | PG_V;
3665 	kpreempt_disable();
3666 	for (pvpte = pv_pte_first(pp); pvpte; pvpte = pv_pte_next(pp, pvpte)) {
3667 		pt_entry_t opte;
3668 		int error;
3669 
3670 		if ((pp->pp_attrs & testbits) != 0) {
3671 			break;
3672 		}
3673 		error = pmap_sync_pv(pvpte, expect, 0, &opte);
3674 		if (error == 0) {
3675 			pp->pp_attrs |= opte;
3676 		}
3677 	}
3678 	result = pp->pp_attrs & testbits;
3679 	kpreempt_enable();
3680 
3681 	/*
3682 	 * note that we will exit the for loop with a non-null pvpte if
3683 	 * we have found the bits we are testing for.
3684 	 */
3685 
3686 	return result != 0;
3687 }
3688 
3689 /*
3690  * pmap_clear_attrs: clear the specified attribute for a page.
3691  *
3692  * => we return true if we cleared one of the bits we were asked to
3693  */
3694 
3695 bool
3696 pmap_clear_attrs(struct vm_page *pg, unsigned clearbits)
3697 {
3698 	struct pmap_page *pp;
3699 	struct pv_pte *pvpte;
3700 	u_int result;
3701 	pt_entry_t expect;
3702 	int count;
3703 
3704 	KASSERT(uvm_page_locked_p(pg));
3705 
3706 	pp = VM_PAGE_TO_PP(pg);
3707 	expect = pmap_pa2pte(VM_PAGE_TO_PHYS(pg)) | PG_V;
3708 	count = SPINLOCK_BACKOFF_MIN;
3709 	kpreempt_disable();
3710 startover:
3711 	for (pvpte = pv_pte_first(pp); pvpte; pvpte = pv_pte_next(pp, pvpte)) {
3712 		pt_entry_t opte;
3713 		int error;
3714 
3715 		error = pmap_sync_pv(pvpte, expect, clearbits, &opte);
3716 		if (error == EAGAIN) {
3717 			int hold_count;
3718 			KERNEL_UNLOCK_ALL(curlwp, &hold_count);
3719 			SPINLOCK_BACKOFF(count);
3720 			KERNEL_LOCK(hold_count, curlwp);
3721 			goto startover;
3722 		}
3723 		pp->pp_attrs |= opte;
3724 	}
3725 	result = pp->pp_attrs & clearbits;
3726 	pp->pp_attrs &= ~clearbits;
3727 	kpreempt_enable();
3728 
3729 	return result != 0;
3730 }
3731 
3732 
3733 /*
3734  * p m a p   p r o t e c t i o n   f u n c t i o n s
3735  */
3736 
3737 /*
3738  * pmap_page_protect: change the protection of all recorded mappings
3739  *	of a managed page
3740  *
3741  * => NOTE: this is an inline function in pmap.h
3742  */
3743 
3744 /* see pmap.h */
3745 
3746 /*
3747  * pmap_protect: set the protection of the pages in a pmap
3748  *
3749  * => NOTE: this is an inline function in pmap.h
3750  */
3751 
3752 /* see pmap.h */
3753 
3754 /*
3755  * pmap_write_protect: write-protect pages in a pmap.
3756  */
3757 void
3758 pmap_write_protect(struct pmap *pmap, vaddr_t sva, vaddr_t eva, vm_prot_t prot)
3759 {
3760 	pt_entry_t *ptes;
3761 	pd_entry_t * const *pdes;
3762 	struct pmap *pmap2;
3763 	vaddr_t blockend, va;
3764 
3765 	KASSERT(curlwp->l_md.md_gc_pmap != pmap);
3766 
3767 	sva &= PG_FRAME;
3768 	eva &= PG_FRAME;
3769 
3770 	/* Acquire pmap. */
3771 	kpreempt_disable();
3772 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
3773 
3774 	for (va = sva ; va < eva ; va = blockend) {
3775 		pt_entry_t *spte, *epte;
3776 		int i;
3777 
3778 		blockend = x86_round_pdr(va + 1);
3779 		if (blockend > eva)
3780 			blockend = eva;
3781 
3782 		/*
3783 		 * XXXCDC: our PTE mappings should never be write-protected!
3784 		 *
3785 		 * long term solution is to move the PTEs out of user
3786 		 * address space.  and into kernel address space (up
3787 		 * with APTE).  then we can set VM_MAXUSER_ADDRESS to
3788 		 * be VM_MAX_ADDRESS.
3789 		 */
3790 
3791 		/* XXXCDC: ugly hack to avoid freeing PDP here */
3792 		for (i = 0; i < PDP_SIZE; i++) {
3793 			if (pl_i(va, PTP_LEVELS) == PDIR_SLOT_PTE+i)
3794 				continue;
3795 		}
3796 
3797 		/* Is it a valid block? */
3798 		if (!pmap_pdes_valid(va, pdes, NULL)) {
3799 			continue;
3800 		}
3801 		KASSERT(va < VM_MAXUSER_ADDRESS || va >= VM_MAX_ADDRESS);
3802 
3803 		spte = &ptes[pl1_i(va)];
3804 		epte = &ptes[pl1_i(blockend)];
3805 
3806 		for (/*null */; spte < epte ; spte++) {
3807 			pt_entry_t opte, npte;
3808 
3809 			do {
3810 				opte = *spte;
3811 				if ((~opte & (PG_RW | PG_V)) != 0) {
3812 					goto next;
3813 				}
3814 				npte = opte & ~PG_RW;
3815 			} while (pmap_pte_cas(spte, opte, npte) != opte);
3816 
3817 			if ((opte & PG_M) != 0) {
3818 				vaddr_t tva = x86_ptob(spte - ptes);
3819 				pmap_tlb_shootdown(pmap, tva, opte,
3820 				    TLBSHOOT_WRITE_PROTECT);
3821 			}
3822 next:;
3823 		}
3824 	}
3825 
3826 	/* Release pmap. */
3827 	pmap_unmap_ptes(pmap, pmap2);
3828 	kpreempt_enable();
3829 }
3830 
3831 /*
3832  * pmap_unwire: clear the wired bit in the PTE.
3833  *
3834  * => Mapping should already be present.
3835  */
3836 void
3837 pmap_unwire(struct pmap *pmap, vaddr_t va)
3838 {
3839 	pt_entry_t *ptes, *ptep, opte;
3840 	pd_entry_t * const *pdes;
3841 	struct pmap *pmap2;
3842 
3843 	/* Acquire pmap. */
3844 	kpreempt_disable();
3845 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
3846 
3847 	if (!pmap_pdes_valid(va, pdes, NULL)) {
3848 		panic("pmap_unwire: invalid PDE");
3849 	}
3850 
3851 	ptep = &ptes[pl1_i(va)];
3852 	opte = *ptep;
3853 	KASSERT(pmap_valid_entry(opte));
3854 
3855 	if (opte & PG_W) {
3856 		pt_entry_t npte = opte & ~PG_W;
3857 
3858 		opte = pmap_pte_testset(ptep, npte);
3859 		pmap_stats_update_bypte(pmap, npte, opte);
3860 	} else {
3861 		printf("pmap_unwire: wiring for pmap %p va 0x%lx "
3862 		    "did not change!\n", pmap, va);
3863 	}
3864 
3865 	/* Release pmap. */
3866 	pmap_unmap_ptes(pmap, pmap2);
3867 	kpreempt_enable();
3868 }
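
/*
 * Illustrative sketch (not compiled): the wire/unwire pairing.  A
 * mapping entered with PMAP_WIRED keeps PG_W set until pmap_unwire()
 * is called for the same VA.  The function name and the pmap/VA/PA
 * arguments are hypothetical.
 */
#if 0
static int
example_wire_then_unwire(struct pmap *pm, vaddr_t va, paddr_t pa)
{
	int error;

	error = pmap_enter(pm, va, pa, VM_PROT_READ | VM_PROT_WRITE,
	    PMAP_WIRED | PMAP_CANFAIL);
	if (error)
		return error;
	pmap_update(pm);

	/* ... later, once the wiring is no longer needed ... */
	pmap_unwire(pm, va);
	return 0;
}
#endif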
3869 
3870 /*
3871  * pmap_copy: copy mappings from one pmap to another
3872  *
3873  * => optional function
3874  * void pmap_copy(dst_pmap, src_pmap, dst_addr, len, src_addr)
3875  */
3876 
3877 /*
3878  * defined as macro in pmap.h
3879  */
3880 
3881 __strict_weak_alias(pmap_enter, pmap_enter_default);
3882 
3883 int
3884 pmap_enter_default(pmap_t pmap, vaddr_t va, paddr_t pa, vm_prot_t prot,
3885     u_int flags)
3886 {
3887 	return pmap_enter_ma(pmap, va, pa, pa, prot, flags, 0);
3888 }
3889 
3890 /*
3891  * pmap_enter: enter a mapping into a pmap
3892  *
3893  * => must be done "now" ... no lazy-evaluation
3894  * => we set pmap => pv_head locking
3895  */
3896 int
3897 pmap_enter_ma(struct pmap *pmap, vaddr_t va, paddr_t ma, paddr_t pa,
3898 	   vm_prot_t prot, u_int flags, int domid)
3899 {
3900 	pt_entry_t *ptes, opte, npte;
3901 	pt_entry_t *ptep;
3902 	pd_entry_t * const *pdes;
3903 	struct vm_page *ptp, *pg;
3904 	struct pmap_page *new_pp;
3905 	struct pmap_page *old_pp;
3906 	struct pv_entry *old_pve = NULL;
3907 	struct pv_entry *new_pve;
3908 	struct pv_entry *new_pve2;
3909 	int error;
3910 	bool wired = (flags & PMAP_WIRED) != 0;
3911 	struct pmap *pmap2;
3912 
3913 	KASSERT(pmap_initialized);
3914 	KASSERT(curlwp->l_md.md_gc_pmap != pmap);
3915 	KASSERT(va < VM_MAX_KERNEL_ADDRESS);
3916 	KASSERTMSG(va != (vaddr_t)PDP_BASE,
3917 	    "pmap_enter: trying to map over PDP!");
3918 	KASSERTMSG(va < VM_MIN_KERNEL_ADDRESS ||
3919 	    pmap_valid_entry(pmap->pm_pdir[pl_i(va, PTP_LEVELS)]),
3920 	    "pmap_enter: missing kernel PTP for VA %lx!", va);
3921 
3922 #ifdef XEN
3923 	KASSERT(domid == DOMID_SELF || pa == 0);
3924 #endif /* XEN */
3925 
3926 	npte = ma | protection_codes[prot] | PG_V;
3927 	npte |= pmap_pat_flags(flags);
3928 	if (wired)
3929 	        npte |= PG_W;
3930 	if (va < VM_MAXUSER_ADDRESS)
3931 		npte |= PG_u;
3932 	else if (va < VM_MAX_ADDRESS)
3933 		npte |= (PG_u | PG_RW);	/* XXXCDC: no longer needed? */
3934 	else
3935 		npte |= PG_k;
3936 	if (pmap == pmap_kernel())
3937 		npte |= pmap_pg_g;
3938 	if (flags & VM_PROT_ALL) {
3939 		npte |= PG_U;
3940 		if (flags & VM_PROT_WRITE) {
3941 			KASSERT((npte & PG_RW) != 0);
3942 			npte |= PG_M;
3943 		}
3944 	}
3945 
3946 #ifdef XEN
3947 	if (domid != DOMID_SELF)
3948 		pg = NULL;
3949 	else
3950 #endif
3951 		pg = PHYS_TO_VM_PAGE(pa);
3952 	if (pg != NULL) {
3953 		/* This is a managed page */
3954 		npte |= PG_PVLIST;
3955 		new_pp = VM_PAGE_TO_PP(pg);
3956 	} else {
3957 		new_pp = NULL;
3958 	}
3959 
3960 	/* get pves. */
3961 	new_pve = pool_cache_get(&pmap_pv_cache, PR_NOWAIT);
3962 	new_pve2 = pool_cache_get(&pmap_pv_cache, PR_NOWAIT);
3963 	if (new_pve == NULL || new_pve2 == NULL) {
3964 		if (flags & PMAP_CANFAIL) {
3965 			error = ENOMEM;
3966 			goto out2;
3967 		}
3968 		panic("pmap_enter: pve allocation failed");
3969 	}
3970 
3971 	kpreempt_disable();
3972 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);	/* locks pmap */
3973 	if (pmap == pmap_kernel()) {
3974 		ptp = NULL;
3975 	} else {
3976 		ptp = pmap_get_ptp(pmap, va, pdes);
3977 		if (ptp == NULL) {
3978 			pmap_unmap_ptes(pmap, pmap2);
3979 			if (flags & PMAP_CANFAIL) {
3980 				error = ENOMEM;
3981 				goto out;
3982 			}
3983 			panic("pmap_enter: get ptp failed");
3984 		}
3985 	}
3986 
3987 	/*
3988 	 * update the pte.
3989 	 */
3990 
3991 	ptep = &ptes[pl1_i(va)];
3992 	do {
3993 		opte = *ptep;
3994 
3995 		/*
3996 		 * if the same page, inherit PG_U and PG_M.
3997 		 */
3998 		if (((opte ^ npte) & (PG_FRAME | PG_V)) == 0) {
3999 			npte |= opte & (PG_U | PG_M);
4000 		}
4001 #if defined(XEN)
4002 		if (domid != DOMID_SELF) {
4003 			/* pmap_pte_cas with error handling */
4004 			int s = splvm();
4005 			if (opte != *ptep) {
4006 				splx(s);
4007 				continue;
4008 			}
4009 			error = xpq_update_foreign(
4010 			    vtomach((vaddr_t)ptep), npte, domid);
4011 			splx(s);
4012 			if (error) {
4013 				if (ptp != NULL && ptp->wire_count <= 1) {
4014 					pmap_free_ptp(pmap, ptp, va, ptes, pdes);
4015 				}
4016 				pmap_unmap_ptes(pmap, pmap2);
4017 				goto out;
4018 			}
4019 			break;
4020 		}
4021 #endif /* defined(XEN) */
4022 	} while (pmap_pte_cas(ptep, opte, npte) != opte);
4023 
4024 	/*
4025 	 * update statistics and PTP's reference count.
4026 	 */
4027 
4028 	pmap_stats_update_bypte(pmap, npte, opte);
4029 	if (ptp != NULL && !pmap_valid_entry(opte)) {
4030 		ptp->wire_count++;
4031 	}
4032 	KASSERT(ptp == NULL || ptp->wire_count > 1);
4033 
4034 	/*
4035 	 * if the same page, we can skip pv_entry handling.
4036 	 */
4037 
4038 	if (((opte ^ npte) & (PG_FRAME | PG_V)) == 0) {
4039 		KASSERT(((opte ^ npte) & PG_PVLIST) == 0);
4040 		goto same_pa;
4041 	}
4042 
4043 	/*
4044 	 * if old page is managed, remove pv_entry from its list.
4045 	 */
4046 
4047 	if ((~opte & (PG_V | PG_PVLIST)) == 0) {
4048 		pg = PHYS_TO_VM_PAGE(pmap_pte2pa(opte));
4049 
4050 		KASSERTMSG(pg != NULL, "pmap_enter: PG_PVLIST mapping with "
4051 		    "unmanaged page pa = 0x%" PRIx64 " (0x%" PRIx64 ")",
4052 		    (int64_t)pa, (int64_t)atop(pa));
4053 
4054 		KASSERT(uvm_page_locked_p(pg));
4055 
4056 		old_pp = VM_PAGE_TO_PP(pg);
4057 		old_pve = pmap_remove_pv(old_pp, ptp, va);
4058 		old_pp->pp_attrs |= opte;
4059 	}
4060 
4061 	/*
4062 	 * if new page is managed, insert pv_entry into its list.
4063 	 */
4064 
4065 	if (new_pp) {
4066 		new_pve = pmap_enter_pv(new_pp, new_pve, &new_pve2, ptp, va);
4067 	}
4068 
4069 same_pa:
4070 	pmap_unmap_ptes(pmap, pmap2);
4071 
4072 	/*
4073 	 * shootdown tlb if necessary.
4074 	 */
4075 
4076 	if ((~opte & (PG_V | PG_U)) == 0 &&
4077 	    ((opte ^ npte) & (PG_FRAME | PG_RW)) != 0) {
4078 		pmap_tlb_shootdown(pmap, va, opte, TLBSHOOT_ENTER);
4079 	}
4080 
4081 	error = 0;
4082 out:
4083 	kpreempt_enable();
4084 out2:
4085 	if (old_pve != NULL) {
4086 		pool_cache_put(&pmap_pv_cache, old_pve);
4087 	}
4088 	if (new_pve != NULL) {
4089 		pool_cache_put(&pmap_pv_cache, new_pve);
4090 	}
4091 	if (new_pve2 != NULL) {
4092 		pool_cache_put(&pmap_pv_cache, new_pve2);
4093 	}
4094 
4095 	return error;
4096 }
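
/*
 * Illustrative sketch (not compiled): the PMAP_CANFAIL pattern a
 * caller of pmap_enter() might use, matching the ENOMEM paths above
 * (pv entry or PTP allocation failure).  The retry policy shown is
 * hypothetical; the real policy belongs to the caller, typically
 * UVM's fault handling.
 */
#if 0
static int
example_enter_with_retry(struct pmap *pm, vaddr_t va, paddr_t pa,
    vm_prot_t prot)
{
	int error;

	for (;;) {
		error = pmap_enter(pm, va, pa, prot, prot | PMAP_CANFAIL);
		if (error != ENOMEM)
			break;
		/* Out of pv entries or PTPs; let the pagedaemon work. */
		uvm_wait("pmapent");
	}
	pmap_update(pm);
	return error;
}
#endif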
4097 
4098 static bool
4099 pmap_get_physpage(vaddr_t va, int level, paddr_t *paddrp)
4100 {
4101 	struct vm_page *ptp;
4102 	struct pmap *kpm = pmap_kernel();
4103 
4104 	if (!uvm.page_init_done) {
4105 
4106 		/*
4107 		 * we're growing the kernel pmap early (from
4108 		 * uvm_pageboot_alloc()).  this case must be
4109 		 * handled a little differently.
4110 		 */
4111 
4112 		if (!uvm_page_physget(paddrp))
4113 			panic("pmap_get_physpage: out of memory");
4114 #if defined(__HAVE_DIRECT_MAP)
4115 		pagezero(PMAP_DIRECT_MAP(*paddrp));
4116 #else
4117 #if defined(XEN)
4118 		if (XEN_VERSION_SUPPORTED(3, 4)) {
4119 			xen_pagezero(*paddrp);
4120 			return true;
4121 		}
4122 #endif
4123 		kpreempt_disable();
4124 		pmap_pte_set(early_zero_pte,
4125 		    pmap_pa2pte(*paddrp) | PG_V | PG_RW | PG_k);
4126 		pmap_pte_flush();
4127 		pmap_update_pg((vaddr_t)early_zerop);
4128 		memset(early_zerop, 0, PAGE_SIZE);
4129 #if defined(DIAGNOSTIC)
4130 		pmap_pte_set(early_zero_pte, 0);
4131 		pmap_pte_flush();
4132 #endif /* defined(DIAGNOSTIC) */
4133 		kpreempt_enable();
4134 #endif /* defined(__HAVE_DIRECT_MAP) */
4135 	} else {
4136 		/* XXX */
4137 		ptp = uvm_pagealloc(NULL, 0, NULL,
4138 				    UVM_PGA_USERESERVE|UVM_PGA_ZERO);
4139 		if (ptp == NULL)
4140 			panic("pmap_get_physpage: out of memory");
4141 		ptp->flags &= ~PG_BUSY;
4142 		ptp->wire_count = 1;
4143 		*paddrp = VM_PAGE_TO_PHYS(ptp);
4144 	}
4145 	pmap_stats_update(kpm, 1, 0);
4146 	return true;
4147 }
4148 
4149 /*
4150  * Allocate the specified number of PTPs for a given PTP level, and
4151  * populate all levels below accordingly, mapping virtual addresses
4152  * starting at kva.
4153  *
4154  * Used by pmap_growkernel.
4155  */
4156 static void
4157 pmap_alloc_level(pd_entry_t * const *pdes, vaddr_t kva, int lvl,
4158     long *needed_ptps)
4159 {
4160 	unsigned long i;
4161 	vaddr_t va;
4162 	paddr_t pa;
4163 	unsigned long index, endindex;
4164 	int level;
4165 	pd_entry_t *pdep;
4166 #ifdef XEN
4167 	int s = splvm(); /* protect xpq_* */
4168 #endif
4169 
4170 	for (level = lvl; level > 1; level--) {
4171 		if (level == PTP_LEVELS)
4172 			pdep = pmap_kernel()->pm_pdir;
4173 		else
4174 			pdep = pdes[level - 2];
4175 		va = kva;
4176 		index = pl_i_roundup(kva, level);
4177 		endindex = index + needed_ptps[level - 1] - 1;
4178 
4179 
4180 		for (i = index; i <= endindex; i++) {
4181 			pt_entry_t pte;
4182 
4183 			KASSERT(!pmap_valid_entry(pdep[i]));
4184 			pmap_get_physpage(va, level - 1, &pa);
4185 			pte = pmap_pa2pte(pa) | PG_k | PG_V | PG_RW;
4186 #ifdef XEN
4187 			pmap_pte_set(&pdep[i], pte);
4188 #if defined(PAE) || defined(__x86_64__)
4189 			if (level == PTP_LEVELS && i >= PDIR_SLOT_KERN) {
4190 				if (__predict_true(
4191 				    cpu_info_primary.ci_flags & CPUF_PRESENT)) {
4192 					/* update per-cpu PMDs on all cpus */
4193 					xen_kpm_sync(pmap_kernel(), i);
4194 				} else {
4195 					/*
4196 					 * too early; update primary CPU
4197 					 * PMD only (without locks)
4198 					 */
4199 #ifdef PAE
4200 					pd_entry_t *cpu_pdep =
4201 					    &cpu_info_primary.ci_kpm_pdir[l2tol2(i)];
4202 #endif
4203 #ifdef __x86_64__
4204 					pd_entry_t *cpu_pdep =
4205 						&cpu_info_primary.ci_kpm_pdir[i];
4206 #endif
4207 					pmap_pte_set(cpu_pdep, pte);
4208 				}
4209 			}
4210 #endif /* PAE || __x86_64__ */
4211 #else /* XEN */
4212 			pdep[i] = pte;
4213 #endif /* XEN */
4214 			KASSERT(level != PTP_LEVELS || nkptp[level - 1] +
4215 			    pl_i(VM_MIN_KERNEL_ADDRESS, level) == i);
4216 			nkptp[level - 1]++;
4217 			va += nbpd[level - 1];
4218 		}
4219 		pmap_pte_flush();
4220 	}
4221 #ifdef XEN
4222 	splx(s);
4223 #endif
4224 }
4225 
4226 /*
4227  * pmap_growkernel: increase usage of KVM space
4228  *
4229  * => we allocate new PTPs for the kernel and install them in all
4230  *	the pmaps on the system.
4231  */
4232 
4233 vaddr_t
4234 pmap_growkernel(vaddr_t maxkvaddr)
4235 {
4236 	struct pmap *kpm = pmap_kernel();
4237 #if !defined(XEN) || !defined(__x86_64__)
4238 	struct pmap *pm;
4239 	long old;
4240 #endif
4241 	int s, i;
4242 	long needed_kptp[PTP_LEVELS], target_nptp;
4243 	bool invalidate = false;
4244 
4245 	s = splvm();	/* to be safe */
4246 	mutex_enter(kpm->pm_lock);
4247 
4248 	if (maxkvaddr <= pmap_maxkvaddr) {
4249 		mutex_exit(kpm->pm_lock);
4250 		splx(s);
4251 		return pmap_maxkvaddr;
4252 	}
4253 
4254 	maxkvaddr = x86_round_pdr(maxkvaddr);
4255 #if !defined(XEN) || !defined(__x86_64__)
4256 	old = nkptp[PTP_LEVELS - 1];
4257 #endif
4258 
4259 	/*
4260 	 * This loop could be optimized more, but pmap_growkernel()
4261 	 * is called infrequently.
4262 	 */
4263 	for (i = PTP_LEVELS - 1; i >= 1; i--) {
4264 		target_nptp = pl_i_roundup(maxkvaddr, i + 1) -
4265 		    pl_i_roundup(VM_MIN_KERNEL_ADDRESS, i + 1);
4266 		/*
4267 		 * XXX only need to check toplevel.
4268 		 */
4269 		if (target_nptp > nkptpmax[i])
4270 			panic("out of KVA space");
4271 		KASSERT(target_nptp >= nkptp[i]);
4272 		needed_kptp[i] = target_nptp - nkptp[i];
4273 	}
4274 
4275 	pmap_alloc_level(normal_pdes, pmap_maxkvaddr, PTP_LEVELS, needed_kptp);
4276 
4277 	/*
4278 	 * If the number of top level entries changed, update all
4279 	 * pmaps.
4280 	 */
4281 	if (needed_kptp[PTP_LEVELS - 1] != 0) {
4282 #ifdef XEN
4283 #ifdef __x86_64__
4284 		/* nothing, kernel entries are never entered in user pmap */
4285 #else /* __x86_64__ */
4286 		mutex_enter(&pmaps_lock);
4287 		LIST_FOREACH(pm, &pmaps, pm_list) {
4288 			int pdkidx;
4289 			for (pdkidx = PDIR_SLOT_KERN + old;
4290 			    pdkidx < PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1];
4291 			    pdkidx++) {
4292 				pmap_pte_set(&pm->pm_pdir[pdkidx],
4293 				    kpm->pm_pdir[pdkidx]);
4294 			}
4295 			pmap_pte_flush();
4296 		}
4297 		mutex_exit(&pmaps_lock);
4298 #endif /* __x86_64__ */
4299 #else /* XEN */
4300 		unsigned newpdes;
4301 		newpdes = nkptp[PTP_LEVELS - 1] - old;
4302 		mutex_enter(&pmaps_lock);
4303 		LIST_FOREACH(pm, &pmaps, pm_list) {
4304 			memcpy(&pm->pm_pdir[PDIR_SLOT_KERN + old],
4305 			       &kpm->pm_pdir[PDIR_SLOT_KERN + old],
4306 			       newpdes * sizeof (pd_entry_t));
4307 		}
4308 		mutex_exit(&pmaps_lock);
4309 #endif
4310 		invalidate = true;
4311 	}
4312 	pmap_maxkvaddr = maxkvaddr;
4313 	mutex_exit(kpm->pm_lock);
4314 	splx(s);
4315 
4316 	if (invalidate && pmap_initialized) {
4317 		/* Invalidate the PDP cache. */
4318 		pool_cache_invalidate(&pmap_pdp_cache);
4319 	}
4320 
4321 	return maxkvaddr;
4322 }
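
/*
 * Illustrative sketch (not compiled): how a KVA allocator might grow
 * the kernel pmap before handing out addresses beyond the current
 * limit.  The function and variable names are hypothetical; the real
 * caller is UVM's kernel map code.
 */
#if 0
static vaddr_t
example_extend_kva(vaddr_t new_end)
{
	vaddr_t limit;

	/* Make sure kernel PTPs exist up to (at least) new_end. */
	limit = pmap_growkernel(new_end);
	KASSERT(limit >= new_end);
	return limit;
}
#endif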
4323 
4324 #ifdef DEBUG
4325 void pmap_dump(struct pmap *, vaddr_t, vaddr_t);
4326 
4327 /*
4328  * pmap_dump: dump all the mappings from a pmap
4329  *
4330  * => caller should not be holding any pmap locks
4331  */
4332 
4333 void
4334 pmap_dump(struct pmap *pmap, vaddr_t sva, vaddr_t eva)
4335 {
4336 	pt_entry_t *ptes, *pte;
4337 	pd_entry_t * const *pdes;
4338 	struct pmap *pmap2;
4339 	vaddr_t blkendva;
4340 
4341 	/*
4342 	 * if end is out of range, truncate it.
4343 	 * if end <= start, dump up to the maximum user address.
4344 	 */
4345 
4346 	if (eva > VM_MAXUSER_ADDRESS || eva <= sva)
4347 		eva = VM_MAXUSER_ADDRESS;
4348 
4349 	/*
4350 	 * we lock in the pmap => pv_head direction
4351 	 */
4352 
4353 	kpreempt_disable();
4354 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);	/* locks pmap */
4355 
4356 	/*
4357 	 * dumping a range of pages: we dump in PTP sized blocks (4MB)
4358 	 */
4359 
4360 	for (/* null */ ; sva < eva ; sva = blkendva) {
4361 
4362 		/* determine range of block */
4363 		blkendva = x86_round_pdr(sva+1);
4364 		if (blkendva > eva)
4365 			blkendva = eva;
4366 
4367 		/* valid block? */
4368 		if (!pmap_pdes_valid(sva, pdes, NULL))
4369 			continue;
4370 
4371 		pte = &ptes[pl1_i(sva)];
4372 		for (/* null */; sva < blkendva ; sva += PAGE_SIZE, pte++) {
4373 			if (!pmap_valid_entry(*pte))
4374 				continue;
4375 			printf("va %#" PRIxVADDR " -> pa %#" PRIxPADDR
4376 			    " (pte=%#" PRIxPADDR ")\n",
4377 			    sva, (paddr_t)pmap_pte2pa(*pte), (paddr_t)*pte);
4378 		}
4379 	}
4380 	pmap_unmap_ptes(pmap, pmap2);
4381 	kpreempt_enable();
4382 }
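
/*
 * Illustrative sketch (not compiled): dumping every user mapping of
 * the current process.  Passing eva <= sva makes pmap_dump() cover
 * the whole user address space, per the truncation logic above.  The
 * function name is hypothetical.
 */
#if 0
static void
example_dump_curproc(void)
{
	struct pmap *pm = vm_map_pmap(&curproc->p_vmspace->vm_map);

	pmap_dump(pm, 0, 0);
}
#endif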
4383 #endif
4384 
4385 /*
4386  * pmap_update: process deferred invalidations and frees.
4387  */
4388 
4389 void
4390 pmap_update(struct pmap *pmap)
4391 {
4392 	struct vm_page *empty_ptps;
4393 	lwp_t *l = curlwp;
4394 
4395 	/*
4396 	 * If we have torn down this pmap, invalidate non-global TLB
4397 	 * entries on any processors using it.
4398 	 */
4399 	KPREEMPT_DISABLE(l);
4400 	if (__predict_false(l->l_md.md_gc_pmap == pmap)) {
4401 		l->l_md.md_gc_pmap = NULL;
4402 		pmap_tlb_shootdown(pmap, (vaddr_t)-1LL, 0, TLBSHOOT_UPDATE);
4403 	}
4404 	/*
4405 	 * Initiate any pending TLB shootdowns.  Wait for them to
4406 	 * complete before returning control to the caller.
4407 	 */
4408 	pmap_tlb_shootnow();
4409 	KPREEMPT_ENABLE(l);
4410 
4411 	/*
4412 	 * Now that shootdowns are complete, process deferred frees,
4413 	 * but not from interrupt context.
4414 	 */
4415 	if (l->l_md.md_gc_ptp != NULL) {
4416 		KASSERT((l->l_pflag & LP_INTR) == 0);
4417 		if (cpu_intr_p()) {
4418 			return;
4419 		}
4420 		empty_ptps = l->l_md.md_gc_ptp;
4421 		l->l_md.md_gc_ptp = NULL;
4422 		pmap_free_ptps(empty_ptps);
4423 	}
4424 }
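
/*
 * Illustrative sketch (not compiled): batching removals and settling
 * them with a single pmap_update() call, which is where the deferred
 * shootdowns and PTP frees above are actually processed.  The wrapper
 * function and its range arguments are hypothetical.
 */
#if 0
static void
example_remove_range(struct pmap *pm, vaddr_t sva, vaddr_t eva)
{

	pmap_remove(pm, sva, eva);	/* may defer TLB work and frees */
	pmap_update(pm);		/* shoot down TLBs, free PTPs */
}
#endif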
4425 
4426 #if PTP_LEVELS > 4
4427 #error "Unsupported number of page table mappings"
4428 #endif
4429 
4430 paddr_t
4431 pmap_init_tmp_pgtbl(paddr_t pg)
4432 {
4433 	static bool maps_loaded;
4434 	static const paddr_t x86_tmp_pml_paddr[] = {
4435 	    4 * PAGE_SIZE,
4436 	    5 * PAGE_SIZE,
4437 	    6 * PAGE_SIZE,
4438 	    7 * PAGE_SIZE
4439 	};
4440 	static vaddr_t x86_tmp_pml_vaddr[] = { 0, 0, 0, 0 };
4441 
4442 	pd_entry_t *tmp_pml, *kernel_pml;
4443 
4444 	int level;
4445 
4446 	if (!maps_loaded) {
4447 		for (level = 0; level < PTP_LEVELS; ++level) {
4448 			x86_tmp_pml_vaddr[level] =
4449 			    uvm_km_alloc(kernel_map, PAGE_SIZE, 0,
4450 			    UVM_KMF_VAONLY);
4451 
4452 			if (x86_tmp_pml_vaddr[level] == 0)
4453 				panic("mapping of real mode PML failed\n");
4454 			pmap_kenter_pa(x86_tmp_pml_vaddr[level],
4455 			    x86_tmp_pml_paddr[level],
4456 			    VM_PROT_READ | VM_PROT_WRITE, 0);
4457 			pmap_update(pmap_kernel());
4458 		}
4459 		maps_loaded = true;
4460 	}
4461 
4462 	/* Zero levels 1-3 */
4463 	for (level = 0; level < PTP_LEVELS - 1; ++level) {
4464 		tmp_pml = (void *)x86_tmp_pml_vaddr[level];
4465 		memset(tmp_pml, 0, PAGE_SIZE);
4466 	}
4467 
4468 	/* Copy PML4 */
4469 	kernel_pml = pmap_kernel()->pm_pdir;
4470 	tmp_pml = (void *)x86_tmp_pml_vaddr[PTP_LEVELS - 1];
4471 	memcpy(tmp_pml, kernel_pml, PAGE_SIZE);
4472 
4473 #ifdef PAE
4474 	/*
4475 	 * Use the last 4 entries of the L2 page as L3 PD entries. These
4476 	 * last entries are unlikely to be used for temporary mappings.
4477 	 * 508: maps 0->1GB (userland)
4478 	 * 509: unused
4479 	 * 510: unused
4480 	 * 511: maps 3->4GB (kernel)
4481 	 */
4482 	tmp_pml[508] = x86_tmp_pml_paddr[PTP_LEVELS - 1] | PG_V;
4483 	tmp_pml[509] = 0;
4484 	tmp_pml[510] = 0;
4485 	tmp_pml[511] = pmap_pdirpa(pmap_kernel(), PDIR_SLOT_KERN) | PG_V;
4486 #endif
4487 
4488 	for (level = PTP_LEVELS - 1; level > 0; --level) {
4489 		tmp_pml = (void *)x86_tmp_pml_vaddr[level];
4490 
4491 		tmp_pml[pl_i(pg, level + 1)] =
4492 		    (x86_tmp_pml_paddr[level - 1] & PG_FRAME) | PG_RW | PG_V;
4493 	}
4494 
4495 	tmp_pml = (void *)x86_tmp_pml_vaddr[0];
4496 	tmp_pml[pl_i(pg, 1)] = (pg & PG_FRAME) | PG_RW | PG_V;
4497 
4498 #ifdef PAE
4499 	/* Return the PA of the L3 page (entry 508 of the L2 page) */
4500 	return x86_tmp_pml_paddr[PTP_LEVELS - 1] + 508 * sizeof(pd_entry_t);
4501 #endif
4502 
4503 	return x86_tmp_pml_paddr[PTP_LEVELS - 1];
4504 }
4505 
4506 u_int
4507 x86_mmap_flags(paddr_t mdpgno)
4508 {
4509 	u_int nflag = (mdpgno >> X86_MMAP_FLAG_SHIFT) & X86_MMAP_FLAG_MASK;
4510 	u_int pflag = 0;
4511 
4512 	if (nflag & X86_MMAP_FLAG_PREFETCH)
4513 		pflag |= PMAP_WRITE_COMBINE;
4514 
4515 	return pflag;
4516 }
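
/*
 * Illustrative sketch (not compiled): how a device mmap routine might
 * encode X86_MMAP_FLAG_PREFETCH into the page number it returns, so
 * that x86_mmap_flags() above yields PMAP_WRITE_COMBINE for the
 * mapping.  The driver entry point and framebuffer address are
 * hypothetical.
 */
#if 0
static paddr_t
example_drv_mmap(dev_t dev, off_t off, int prot)
{
	paddr_t fb_base = 0xd0000000;	/* hypothetical framebuffer base */

	return x86_btop(fb_base + off) |
	    ((paddr_t)X86_MMAP_FLAG_PREFETCH << X86_MMAP_FLAG_SHIFT);
}
#endif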
4517