xref: /netbsd-src/sys/arch/x86/x86/pmap.c (revision 2e2322c9c07009df921d11b1268f8506affbb8ba)
1 /*	$NetBSD: pmap.c,v 1.231 2016/12/13 10:54:27 kamil Exp $	*/
2 
3 /*-
4  * Copyright (c) 2008, 2010, 2016 The NetBSD Foundation, Inc.
5  * All rights reserved.
6  *
7  * This code is derived from software contributed to The NetBSD Foundation
8  * by Andrew Doran, and by Maxime Villard.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29  * POSSIBILITY OF SUCH DAMAGE.
30  */
31 
32 /*
33  * Copyright (c) 2007 Manuel Bouyer.
34  *
35  * Redistribution and use in source and binary forms, with or without
36  * modification, are permitted provided that the following conditions
37  * are met:
38  * 1. Redistributions of source code must retain the above copyright
39  *    notice, this list of conditions and the following disclaimer.
40  * 2. Redistributions in binary form must reproduce the above copyright
41  *    notice, this list of conditions and the following disclaimer in the
42  *    documentation and/or other materials provided with the distribution.
43  *
44  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
45  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
46  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
47  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
48  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
49  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
50  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
51  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
52  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
53  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
54  *
55  */
56 
57 /*
58  * Copyright (c) 2006 Mathieu Ropert <mro@adviseo.fr>
59  *
60  * Permission to use, copy, modify, and distribute this software for any
61  * purpose with or without fee is hereby granted, provided that the above
62  * copyright notice and this permission notice appear in all copies.
63  *
64  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
65  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
66  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
67  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
68  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
69  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
70  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
71  */
72 
73 /*
74  * Copyright (c) 1997 Charles D. Cranor and Washington University.
75  * All rights reserved.
76  *
77  * Redistribution and use in source and binary forms, with or without
78  * modification, are permitted provided that the following conditions
79  * are met:
80  * 1. Redistributions of source code must retain the above copyright
81  *    notice, this list of conditions and the following disclaimer.
82  * 2. Redistributions in binary form must reproduce the above copyright
83  *    notice, this list of conditions and the following disclaimer in the
84  *    documentation and/or other materials provided with the distribution.
85  *
86  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
87  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
88  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
89  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
90  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
91  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
92  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
93  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
94  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
95  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
96  */
97 
98 /*
99  * Copyright 2001 (c) Wasabi Systems, Inc.
100  * All rights reserved.
101  *
102  * Written by Frank van der Linden for Wasabi Systems, Inc.
103  *
104  * Redistribution and use in source and binary forms, with or without
105  * modification, are permitted provided that the following conditions
106  * are met:
107  * 1. Redistributions of source code must retain the above copyright
108  *    notice, this list of conditions and the following disclaimer.
109  * 2. Redistributions in binary form must reproduce the above copyright
110  *    notice, this list of conditions and the following disclaimer in the
111  *    documentation and/or other materials provided with the distribution.
112  * 3. All advertising materials mentioning features or use of this software
113  *    must display the following acknowledgement:
114  *      This product includes software developed for the NetBSD Project by
115  *      Wasabi Systems, Inc.
116  * 4. The name of Wasabi Systems, Inc. may not be used to endorse
117  *    or promote products derived from this software without specific prior
118  *    written permission.
119  *
120  * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
121  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
122  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
123  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL WASABI SYSTEMS, INC
124  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
125  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
126  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
127  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
128  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
129  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
130  * POSSIBILITY OF SUCH DAMAGE.
131  */
132 
133 /*
134  * This is the i386 pmap modified and generalized to support x86-64
135  * as well. The idea is to hide the upper N levels of the page tables
136  * inside pmap_get_ptp, pmap_free_ptp and pmap_growkernel. The rest
137  * is mostly untouched, except that it uses some more generalized
138  * macros and interfaces.
139  *
140  * This pmap has been tested on the i386 as well, and it can be easily
141  * adapted to PAE.
142  *
143  * fvdl@wasabisystems.com 18-Jun-2001
144  */
145 
146 /*
147  * pmap.c: i386 pmap module rewrite
148  * Chuck Cranor <chuck@netbsd>
149  * 11-Aug-97
150  *
151  * history of this pmap module: in addition to my own input, i used
152  *    the following references for this rewrite of the i386 pmap:
153  *
154  * [1] the NetBSD i386 pmap.   this pmap appears to be based on the
155  *     BSD hp300 pmap done by Mike Hibler at University of Utah.
156  *     it was then ported to the i386 by William Jolitz of UUNET
157  *     Technologies, Inc.   Then Charles M. Hannum of the NetBSD
158  *     project fixed some bugs and provided some speed ups.
159  *
160  * [2] the FreeBSD i386 pmap.   this pmap seems to be the
161  *     Hibler/Jolitz pmap, as modified for FreeBSD by John S. Dyson
162  *     and David Greenman.
163  *
164  * [3] the Mach pmap.   this pmap, from CMU, seems to have migrated
165  *     between several processors.   the VAX version was done by
166  *     Avadis Tevanian, Jr., and Michael Wayne Young.    the i386
167  *     version was done by Lance Berc, Mike Kupfer, Bob Baron,
168  *     David Golub, and Richard Draves.    the alpha version was
169  *     done by Alessandro Forin (CMU/Mach) and Chris Demetriou
170  *     (NetBSD/alpha).
171  */
172 
173 #include <sys/cdefs.h>
174 __KERNEL_RCSID(0, "$NetBSD: pmap.c,v 1.231 2016/12/13 10:54:27 kamil Exp $");
175 
176 #include "opt_user_ldt.h"
177 #include "opt_lockdebug.h"
178 #include "opt_multiprocessor.h"
179 #include "opt_xen.h"
180 
181 #include <sys/param.h>
182 #include <sys/systm.h>
183 #include <sys/proc.h>
184 #include <sys/pool.h>
185 #include <sys/kernel.h>
186 #include <sys/atomic.h>
187 #include <sys/cpu.h>
188 #include <sys/intr.h>
189 #include <sys/xcall.h>
190 #include <sys/kcore.h>
191 
192 #include <uvm/uvm.h>
193 #include <uvm/pmap/pmap_pvt.h>
194 
195 #include <dev/isa/isareg.h>
196 
197 #include <machine/specialreg.h>
198 #include <machine/gdt.h>
199 #include <machine/isa_machdep.h>
200 #include <machine/cpuvar.h>
201 #include <machine/cputypes.h>
202 
203 #include <x86/pmap.h>
204 #include <x86/pmap_pv.h>
205 
206 #include <x86/i82489reg.h>
207 #include <x86/i82489var.h>
208 
209 #ifdef XEN
210 #include <xen/xen-public/xen.h>
211 #include <xen/hypervisor.h>
212 #endif
213 
214 /*
215  * general info:
216  *
217  *  - for an explanation of how the i386 MMU hardware works see
218  *    the comments in <machine/pte.h>.
219  *
220  *  - for an explanation of the general memory structure used by
221  *    this pmap (including the recursive mapping), see the comments
222  *    in <machine/pmap.h>.
223  *
224  * this file contains the code for the "pmap module."   the module's
225  * job is to manage the hardware's virtual to physical address mappings.
226  * note that there are two levels of mapping in the VM system:
227  *
228  *  [1] the upper layer of the VM system uses vm_map's and vm_map_entry's
229  *      to map ranges of virtual address space to objects/files.  for
230  *      example, the vm_map may say: "map VA 0x1000 to 0x22000 read-only
231  *      to the file /bin/ls starting at offset zero."   note that
232  *      the upper layer mapping is not concerned with how individual
233  *      vm_pages are mapped.
234  *
235  *  [2] the lower layer of the VM system (the pmap) maintains the mappings
236  *      from virtual addresses.   it is concerned with which vm_page is
237  *      mapped where.   for example, when you run /bin/ls and start
238  *      at page 0x1000 the fault routine may lookup the correct page
239  *      of the /bin/ls file and then ask the pmap layer to establish
240  *      a mapping for it.
241  *
242  * note that information in the lower layer of the VM system can be
243  * thrown away since it can easily be reconstructed from the info
244  * in the upper layer.
245  *
246  * data structures we use include:
247  *
248  *  - struct pmap: describes the address space of one thread
249  *  - struct pmap_page: describes one pv-tracked page, without
250  *	necessarily a corresponding vm_page
251  *  - struct pv_entry: describes one <PMAP,VA> mapping of a PA
252  *  - struct pv_head: there is one pv_head per pv-tracked page of
253  *	physical memory.   the pv_head points to a list of pv_entry
254  *	structures which describe all the <PMAP,VA> pairs that this
255  *      page is mapped in.    this is critical for page based operations
256  *      such as pmap_page_protect() [change protection on _all_ mappings
257  *      of a page]
258  */
259 
260 /*
261  * memory allocation
262  *
263  *  - there are three data structures that we must dynamically allocate:
264  *
265  * [A] new process' page directory page (PDP)
266  *	- plan 1: done at pmap_create() we use
267  *	  uvm_km_alloc(kernel_map, PAGE_SIZE)  [fka kmem_alloc] to do this
268  *	  allocation.
269  *
270  * if we are low in free physical memory then we sleep in
271  * uvm_km_alloc -- in this case this is ok since we are creating
272  * a new pmap and should not be holding any locks.
273  *
274  * if the kernel is totally out of virtual space
275  * (i.e. uvm_km_alloc returns NULL), then we panic.
276  *
277  * [B] new page tables pages (PTP)
278  * 	- call uvm_pagealloc()
279  * 		=> success: zero page, add to pm_pdir
280  * 		=> failure: we are out of free vm_pages, let pmap_enter()
281  *		   tell UVM about it.
282  *
283  * note: for kernel PTPs, we start with NKPTP of them.   as we map
284  * kernel memory (at uvm_map time) we check to see if we've grown
285  * the kernel pmap.   if so, we call the optional function
286  * pmap_growkernel() to grow the kernel PTPs in advance.
287  *
288  * [C] pv_entry structures
289  */
290 
291 /*
292  * locking
293  *
294  * we have the following locks that we must contend with:
295  *
296  * mutexes:
297  *
298  * - pmap lock (per pmap, part of uvm_object)
299  *   this lock protects the fields in the pmap structure including
300  *   the non-kernel PDEs in the PDP, and the PTEs.  it also locks
301  *   in the alternate PTE space (since that is determined by the
302  *   entry in the PDP).
303  *
304  * - pvh_lock (per pv_head)
305  *   this lock protects the pv_entry list which is chained off the
306  *   pv_head structure for a specific pv-tracked PA.   it is locked
307  *   when traversing the list (e.g. adding/removing mappings,
308  *   syncing R/M bits, etc.)
309  *
310  * - pmaps_lock
311  *   this lock protects the list of active pmaps (headed by "pmaps").
312  *   we lock it when adding or removing pmaps from this list.
313  */
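
/*
 * A rough picture of how these locks nest in this file (an illustrative
 * sketch only, not a formally documented order):
 *
 *	kpreempt_disable();
 *	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);	<- takes pmap->pm_lock
 *	...
 *	mutex_spin_enter(pvhash_lock(hash));		<- pv hash bucket lock
 *	... add/remove a pv_entry ...
 *	mutex_spin_exit(pvhash_lock(hash));
 *	...
 *	pmap_unmap_ptes(pmap, pmap2);			<- drops pmap->pm_lock
 *	kpreempt_enable();
 *
 * i.e. a pmap's lock is taken before any pv hash bucket lock, and the
 * spin locks are never held across an operation that may sleep.
 */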
314 
315 const vaddr_t ptp_masks[] = PTP_MASK_INITIALIZER;
316 const int ptp_shifts[] = PTP_SHIFT_INITIALIZER;
317 const long nkptpmax[] = NKPTPMAX_INITIALIZER;
318 const long nbpd[] = NBPD_INITIALIZER;
319 pd_entry_t * const normal_pdes[] = PDES_INITIALIZER;
320 
321 long nkptp[] = NKPTP_INITIALIZER;
322 
323 struct pmap_head pmaps;
324 kmutex_t pmaps_lock;
325 
326 static vaddr_t pmap_maxkvaddr;
327 
328 /*
329  * XXX kludge: dummy locking to make KASSERTs in uvm_page.c comfortable.
330  * actual locking is done by pm_lock.
331  */
332 #if defined(DIAGNOSTIC)
333 #define	PMAP_SUBOBJ_LOCK(pm, idx) \
334 	KASSERT(mutex_owned((pm)->pm_lock)); \
335 	if ((idx) != 0) \
336 		mutex_enter((pm)->pm_obj[(idx)].vmobjlock)
337 #define	PMAP_SUBOBJ_UNLOCK(pm, idx) \
338 	KASSERT(mutex_owned((pm)->pm_lock)); \
339 	if ((idx) != 0) \
340 		mutex_exit((pm)->pm_obj[(idx)].vmobjlock)
341 #else /* defined(DIAGNOSTIC) */
342 #define	PMAP_SUBOBJ_LOCK(pm, idx)	/* nothing */
343 #define	PMAP_SUBOBJ_UNLOCK(pm, idx)	/* nothing */
344 #endif /* defined(DIAGNOSTIC) */
345 
346 /*
347  * Misc. event counters.
348  */
349 struct evcnt pmap_iobmp_evcnt;
350 struct evcnt pmap_ldt_evcnt;
351 
352 /*
353  * PAT
354  */
355 #define	PATENTRY(n, type)	(type << ((n) * 8))
356 #define	PAT_UC		0x0ULL
357 #define	PAT_WC		0x1ULL
358 #define	PAT_WT		0x4ULL
359 #define	PAT_WP		0x5ULL
360 #define	PAT_WB		0x6ULL
361 #define	PAT_UCMINUS	0x7ULL
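
/*
 * Each PAT entry is one byte of the 64-bit PAT MSR, so PATENTRY(n, type)
 * simply shifts the memory type into byte n; e.g. PATENTRY(1, PAT_WC) is
 * 0x1ULL << 8 == 0x100.  The value programmed by pat_init() below thus
 * works out to 0x0007010600070106 (UC, UC-, WC, WB for entries 7..4 and
 * again for entries 3..0).
 */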
362 
363 static bool cpu_pat_enabled __read_mostly = false;
364 
365 /*
366  * Global data structures
367  */
368 
369 static struct pmap kernel_pmap_store;	/* the kernel's pmap (proc0) */
370 struct pmap *const kernel_pmap_ptr = &kernel_pmap_store;
371 
372 /*
373  * pmap_pg_nx: if our processor supports PG_NX in the PTE then we
374  * set pmap_pg_nx to PG_NX (otherwise it is zero).
375  */
376 pd_entry_t pmap_pg_nx __read_mostly = 0;
377 
378 /*
379  * pmap_pg_g: if our processor supports PG_G in the PTE then we
380  * set pmap_pg_g to PG_G (otherwise it is zero).
381  */
382 pd_entry_t pmap_pg_g __read_mostly = 0;
383 
384 /*
385  * pmap_largepages: if our processor supports PG_PS and we are
386  * using it, this is set to true.
387  */
388 int pmap_largepages __read_mostly = 0;
389 
390 /*
391  * i386 physical memory comes in a big contig chunk with a small
392  * hole toward the front of it...  the following two paddr_t's
393  * (shared with machdep.c) describe the physical address space
394  * of this machine.
395  */
396 paddr_t avail_start __read_mostly; /* PA of first available physical page */
397 paddr_t avail_end __read_mostly; /* PA of last available physical page */
398 
399 #ifdef XEN
400 #ifdef __x86_64__
401 /* Dummy PGD for user cr3, used between pmap_deactivate() and pmap_activate() */
402 static paddr_t xen_dummy_user_pgd;
403 #endif /* __x86_64__ */
404 paddr_t pmap_pa_start; /* PA of first physical page for this domain */
405 paddr_t pmap_pa_end;   /* PA of last physical page for this domain */
406 #endif /* XEN */
407 
408 #define	VM_PAGE_TO_PP(pg)	(&(pg)->mdpage.mp_pp)
409 
410 #define	PV_HASH_SIZE		32768
411 #define	PV_HASH_LOCK_CNT	32
412 
413 struct pv_hash_lock {
414 	kmutex_t lock;
415 } __aligned(CACHE_LINE_SIZE) pv_hash_locks[PV_HASH_LOCK_CNT]
416     __aligned(CACHE_LINE_SIZE);
417 
418 struct pv_hash_head {
419 	SLIST_HEAD(, pv_entry) hh_list;
420 } pv_hash_heads[PV_HASH_SIZE];
421 
422 static u_int
423 pvhash_hash(struct vm_page *ptp, vaddr_t va)
424 {
425 
426 	return (uintptr_t)ptp / sizeof(*ptp) + (va >> PAGE_SHIFT);
427 }
428 
429 static struct pv_hash_head *
430 pvhash_head(u_int hash)
431 {
432 
433 	return &pv_hash_heads[hash % PV_HASH_SIZE];
434 }
435 
436 static kmutex_t *
437 pvhash_lock(u_int hash)
438 {
439 
440 	return &pv_hash_locks[hash % PV_HASH_LOCK_CNT].lock;
441 }
442 
443 static struct pv_entry *
444 pvhash_remove(struct pv_hash_head *hh, struct vm_page *ptp, vaddr_t va)
445 {
446 	struct pv_entry *pve;
447 	struct pv_entry *prev;
448 
449 	prev = NULL;
450 	SLIST_FOREACH(pve, &hh->hh_list, pve_hash) {
451 		if (pve->pve_pte.pte_ptp == ptp &&
452 		    pve->pve_pte.pte_va == va) {
453 			if (prev != NULL) {
454 				SLIST_REMOVE_AFTER(prev, pve_hash);
455 			} else {
456 				SLIST_REMOVE_HEAD(&hh->hh_list, pve_hash);
457 			}
458 			break;
459 		}
460 		prev = pve;
461 	}
462 	return pve;
463 }
464 
465 /*
466  * Other data structures
467  */
468 
469 static pt_entry_t protection_codes[8] __read_mostly;
470 
471 static bool pmap_initialized __read_mostly = false; /* pmap_init done yet? */
472 
473 /*
474  * The following two vaddr_t's are used during system startup to keep track of
475  * how much of the kernel's VM space we have used. Once the system is started,
476  * the management of the remaining kernel VM space is turned over to the
477  * kernel_map vm_map.
478  */
479 static vaddr_t virtual_avail __read_mostly;	/* VA of first free KVA */
480 static vaddr_t virtual_end __read_mostly;	/* VA of last free KVA */
481 
482 /*
483  * LAPIC virtual address, and fake physical address.
484  */
485 volatile vaddr_t local_apic_va;
486 paddr_t local_apic_pa;
487 
488 /*
489  * pool that pmap structures are allocated from
490  */
491 static struct pool_cache pmap_cache;
492 
493 /*
494  * pv_entry cache
495  */
496 static struct pool_cache pmap_pv_cache;
497 
498 #ifndef __HAVE_DIRECT_MAP
499 /*
500  * MULTIPROCESSOR: special VAs and PTEs are actually allocated inside a
501  * (maxcpus * NPTECL) array of PTE, to avoid cache line thrashing due to
502  * false sharing.
503  */
504 #ifdef MULTIPROCESSOR
505 #define PTESLEW(pte, id) ((pte)+(id)*NPTECL)
506 #define VASLEW(va,id) ((va)+(id)*NPTECL*PAGE_SIZE)
507 #else
508 #define PTESLEW(pte, id) ((void)id, pte)
509 #define VASLEW(va,id) ((void)id, va)
510 #endif
511 
512 /*
513  * Special VAs and the PTEs that map them
514  */
515 static pt_entry_t *csrc_pte, *cdst_pte, *zero_pte, *ptp_pte, *early_zero_pte;
516 static char *csrcp, *cdstp, *zerop, *ptpp;
517 #ifdef XEN
518 char *early_zerop; /* also referenced from xen_locore() */
519 #else
520 static char *early_zerop;
521 #endif
522 
523 #endif
524 
525 int pmap_enter_default(pmap_t, vaddr_t, paddr_t, vm_prot_t, u_int);
526 
527 /* PDP pool_cache(9) and its callbacks */
528 struct pool_cache pmap_pdp_cache;
529 static int  pmap_pdp_ctor(void *, void *, int);
530 static void pmap_pdp_dtor(void *, void *);
531 #ifdef PAE
532 /* need to allocate items of 4 pages */
533 static void *pmap_pdp_alloc(struct pool *, int);
534 static void pmap_pdp_free(struct pool *, void *);
535 static struct pool_allocator pmap_pdp_allocator = {
536 	.pa_alloc = pmap_pdp_alloc,
537 	.pa_free = pmap_pdp_free,
538 	.pa_pagesz = PAGE_SIZE * PDP_SIZE,
539 };
540 #endif /* PAE */
541 
542 extern vaddr_t idt_vaddr;
543 extern paddr_t idt_paddr;
544 extern vaddr_t gdt_vaddr;
545 extern paddr_t gdt_paddr;
546 extern vaddr_t ldt_vaddr;
547 extern paddr_t ldt_paddr;
548 
549 extern int end;
550 
551 #ifdef i386
552 /* stuff to fix the pentium f00f bug */
553 extern vaddr_t pentium_idt_vaddr;
554 #endif
555 
556 /*
557  * Local prototypes
558  */
559 
560 static void pmap_init_lapic(void);
561 #ifdef __HAVE_DIRECT_MAP
562 static void pmap_init_directmap(struct pmap *);
563 #endif
564 #ifndef XEN
565 static void pmap_remap_largepages(void);
566 #endif
567 
568 static struct vm_page *pmap_get_ptp(struct pmap *, vaddr_t,
569     pd_entry_t * const *);
570 static struct vm_page *pmap_find_ptp(struct pmap *, vaddr_t, paddr_t, int);
571 static void pmap_freepage(struct pmap *, struct vm_page *, int);
572 static void pmap_free_ptp(struct pmap *, struct vm_page *, vaddr_t,
573     pt_entry_t *, pd_entry_t * const *);
574 static bool pmap_remove_pte(struct pmap *, struct vm_page *, pt_entry_t *,
575     vaddr_t, struct pv_entry **);
576 static void pmap_remove_ptes(struct pmap *, struct vm_page *, vaddr_t, vaddr_t,
577     vaddr_t, struct pv_entry **);
578 
579 static paddr_t pmap_get_physpage(void);
580 static void pmap_alloc_level(vaddr_t, long *);
581 
582 static bool pmap_reactivate(struct pmap *);
583 
584 /*
585  * p m a p   h e l p e r   f u n c t i o n s
586  */
587 
588 static inline void
589 pmap_stats_update(struct pmap *pmap, int resid_diff, int wired_diff)
590 {
591 
592 	if (pmap == pmap_kernel()) {
593 		atomic_add_long(&pmap->pm_stats.resident_count, resid_diff);
594 		atomic_add_long(&pmap->pm_stats.wired_count, wired_diff);
595 	} else {
596 		KASSERT(mutex_owned(pmap->pm_lock));
597 		pmap->pm_stats.resident_count += resid_diff;
598 		pmap->pm_stats.wired_count += wired_diff;
599 	}
600 }
601 
602 static inline void
603 pmap_stats_update_bypte(struct pmap *pmap, pt_entry_t npte, pt_entry_t opte)
604 {
605 	int resid_diff = ((npte & PG_V) ? 1 : 0) - ((opte & PG_V) ? 1 : 0);
606 	int wired_diff = ((npte & PG_W) ? 1 : 0) - ((opte & PG_W) ? 1 : 0);
607 
608 	KASSERT((npte & (PG_V | PG_W)) != PG_W);
609 	KASSERT((opte & (PG_V | PG_W)) != PG_W);
610 
611 	pmap_stats_update(pmap, resid_diff, wired_diff);
612 }
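
/*
 * For example, zapping a valid and wired PTE (PG_V and PG_W set in opte,
 * npte == 0) yields resid_diff == -1 and wired_diff == -1, so both the
 * resident and the wired count of the pmap drop by one.
 */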
613 
614 /*
615  * ptp_to_pmap: lookup pmap by ptp
616  */
617 
618 static struct pmap *
619 ptp_to_pmap(struct vm_page *ptp)
620 {
621 	struct pmap *pmap;
622 
623 	if (ptp == NULL) {
624 		return pmap_kernel();
625 	}
626 	pmap = (struct pmap *)ptp->uobject;
627 	KASSERT(pmap != NULL);
628 	KASSERT(&pmap->pm_obj[0] == ptp->uobject);
629 	return pmap;
630 }
631 
632 static inline struct pv_pte *
633 pve_to_pvpte(struct pv_entry *pve)
634 {
635 
636 	KASSERT((void *)&pve->pve_pte == (void *)pve);
637 	return &pve->pve_pte;
638 }
639 
640 static inline struct pv_entry *
641 pvpte_to_pve(struct pv_pte *pvpte)
642 {
643 	struct pv_entry *pve = (void *)pvpte;
644 
645 	KASSERT(pve_to_pvpte(pve) == pvpte);
646 	return pve;
647 }
648 
649 /*
650  * pv_pte_first, pv_pte_next: PV list iterator.
651  */
652 
653 static struct pv_pte *
654 pv_pte_first(struct pmap_page *pp)
655 {
656 
657 	if ((pp->pp_flags & PP_EMBEDDED) != 0) {
658 		return &pp->pp_pte;
659 	}
660 	return pve_to_pvpte(LIST_FIRST(&pp->pp_head.pvh_list));
661 }
662 
663 static struct pv_pte *
664 pv_pte_next(struct pmap_page *pp, struct pv_pte *pvpte)
665 {
666 
667 	KASSERT(pvpte != NULL);
668 	if (pvpte == &pp->pp_pte) {
669 		KASSERT((pp->pp_flags & PP_EMBEDDED) != 0);
670 		return NULL;
671 	}
672 	KASSERT((pp->pp_flags & PP_EMBEDDED) == 0);
673 	return pve_to_pvpte(LIST_NEXT(pvpte_to_pve(pvpte), pve_list));
674 }
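
/*
 * Taken together these give the usual iteration idiom over every mapping
 * of a pv-tracked page (a sketch only; the real users appear further down):
 *
 *	struct pv_pte *pvpte;
 *
 *	for (pvpte = pv_pte_first(pp); pvpte != NULL;
 *	     pvpte = pv_pte_next(pp, pvpte)) {
 *		... look at pvpte->pte_ptp and pvpte->pte_va ...
 *	}
 *
 * which transparently covers both the PP_EMBEDDED case and the chained
 * pv_entry case.
 */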
675 
676 /*
677  * pmap_is_curpmap: is this pmap the one currently loaded [in %cr3]?
678  *		of course the kernel is always loaded
679  */
680 
681 bool
682 pmap_is_curpmap(struct pmap *pmap)
683 {
684 	return((pmap == pmap_kernel()) ||
685 	       (pmap == curcpu()->ci_pmap));
686 }
687 
688 /*
689  *	Add a reference to the specified pmap.
690  */
691 
692 void
693 pmap_reference(struct pmap *pmap)
694 {
695 
696 	atomic_inc_uint(&pmap->pm_obj[0].uo_refs);
697 }
698 
699 /*
700  * pmap_map_ptes: map a pmap's PTEs into KVM and lock them in
701  *
702  * there are several pmaps involved.  some or all of them might be same.
703  *
704  *	- the pmap given by the first argument
705  *		our caller wants to access this pmap's PTEs.
706  *
707  *	- pmap_kernel()
708  *		the kernel pmap.  note that it only contains the kernel part
709  *		of the address space which is shared by any pmap.  ie. any
710  *		pmap can be used instead of pmap_kernel() for our purpose.
711  *
712  *	- ci->ci_pmap
713  *		pmap currently loaded on the cpu.
714  *
715  *	- vm_map_pmap(&curproc->p_vmspace->vm_map)
716  *		current process' pmap.
717  *
718  * => we lock enough pmaps to keep things locked in
719  * => must be undone with pmap_unmap_ptes before returning
720  */
721 
722 void
723 pmap_map_ptes(struct pmap *pmap, struct pmap **pmap2,
724 	      pd_entry_t **ptepp, pd_entry_t * const **pdeppp)
725 {
726 	struct pmap *curpmap;
727 	struct cpu_info *ci;
728 	lwp_t *l;
729 
730 	/* The kernel's pmap is always accessible. */
731 	if (pmap == pmap_kernel()) {
732 		*pmap2 = NULL;
733 		*ptepp = PTE_BASE;
734 		*pdeppp = normal_pdes;
735 		return;
736 	}
737 	KASSERT(kpreempt_disabled());
738 
739 	l = curlwp;
740  retry:
741 	mutex_enter(pmap->pm_lock);
742 	ci = curcpu();
743 	curpmap = ci->ci_pmap;
744 	if (vm_map_pmap(&l->l_proc->p_vmspace->vm_map) == pmap) {
745 		/* Our own pmap so just load it: easy. */
746 		if (__predict_false(ci->ci_want_pmapload)) {
747 			mutex_exit(pmap->pm_lock);
748 			pmap_load();
749 			goto retry;
750 		}
751 		KASSERT(pmap == curpmap);
752 	} else if (pmap == curpmap) {
753 		/*
754 		 * Already on the CPU: make it valid.  This is very
755 		 * often the case during exit(), when we have switched
756 		 * to the kernel pmap in order to destroy a user pmap.
757 		 */
758 		if (!pmap_reactivate(pmap)) {
759 			u_int gen = uvm_emap_gen_return();
760 			tlbflush();
761 			uvm_emap_update(gen);
762 		}
763 	} else {
764 		/*
765 		 * Toss current pmap from CPU, but keep a reference to it.
766 		 * The reference will be dropped by pmap_unmap_ptes().
767 		 * Can happen if we block during exit().
768 		 */
769 		const cpuid_t cid = cpu_index(ci);
770 
771 		kcpuset_atomic_clear(curpmap->pm_cpus, cid);
772 		kcpuset_atomic_clear(curpmap->pm_kernel_cpus, cid);
773 		ci->ci_pmap = pmap;
774 		ci->ci_tlbstate = TLBSTATE_VALID;
775 		kcpuset_atomic_set(pmap->pm_cpus, cid);
776 		kcpuset_atomic_set(pmap->pm_kernel_cpus, cid);
777 		cpu_load_pmap(pmap, curpmap);
778 	}
779 	pmap->pm_ncsw = l->l_ncsw;
780 	*pmap2 = curpmap;
781 	*ptepp = PTE_BASE;
782 #if defined(XEN) && defined(__x86_64__)
783 	KASSERT(ci->ci_normal_pdes[PTP_LEVELS - 2] == L4_BASE);
784 	ci->ci_normal_pdes[PTP_LEVELS - 2] = pmap->pm_pdir;
785 	*pdeppp = ci->ci_normal_pdes;
786 #else /* XEN && __x86_64__ */
787 	*pdeppp = normal_pdes;
788 #endif /* XEN && __x86_64__ */
789 }
790 
791 /*
792  * pmap_unmap_ptes: unlock the PTE mapping of "pmap"
793  */
794 
795 void
796 pmap_unmap_ptes(struct pmap *pmap, struct pmap *pmap2)
797 {
798 	struct cpu_info *ci;
799 	struct pmap *mypmap;
800 
801 	KASSERT(kpreempt_disabled());
802 
803 	/* The kernel's pmap is always accessible. */
804 	if (pmap == pmap_kernel()) {
805 		return;
806 	}
807 
808 	ci = curcpu();
809 #if defined(XEN) && defined(__x86_64__)
810 	/* Reset per-cpu normal_pdes */
811 	KASSERT(ci->ci_normal_pdes[PTP_LEVELS - 2] != L4_BASE);
812 	ci->ci_normal_pdes[PTP_LEVELS - 2] = L4_BASE;
813 #endif /* XEN && __x86_64__ */
814 	/*
815 	 * We cannot tolerate context switches while mapped in.
816 	 * If it is our own pmap all we have to do is unlock.
817 	 */
818 	KASSERT(pmap->pm_ncsw == curlwp->l_ncsw);
819 	mypmap = vm_map_pmap(&curproc->p_vmspace->vm_map);
820 	if (pmap == mypmap) {
821 		mutex_exit(pmap->pm_lock);
822 		return;
823 	}
824 
825 	/*
826 	 * Mark whatever's on the CPU now as lazy and unlock.
827 	 * If the pmap was already installed, we are done.
828 	 */
829 	ci->ci_tlbstate = TLBSTATE_LAZY;
830 	ci->ci_want_pmapload = (mypmap != pmap_kernel());
831 	mutex_exit(pmap->pm_lock);
832 	if (pmap == pmap2) {
833 		return;
834 	}
835 
836 	/*
837 	 * We installed another pmap on the CPU.  Grab a reference to
838 	 * it and leave in place.  Toss the evicted pmap (can block).
839 	 */
840 	pmap_reference(pmap);
841 	pmap_destroy(pmap2);
842 }
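
/*
 * Typical calling sequence for the two functions above (a sketch only;
 * see the callers later in this file for the real thing):
 *
 *	struct pmap *pmap2;
 *	pt_entry_t *ptes;
 *	pd_entry_t * const *pdes;
 *
 *	kpreempt_disable();
 *	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
 *	... use ptes[pl1_i(va)] and pdes[lvl - 2][pl_i(va, lvl)] ...
 *	pmap_unmap_ptes(pmap, pmap2);
 *	kpreempt_enable();
 */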
843 
844 
845 inline static void
846 pmap_exec_account(struct pmap *pm, vaddr_t va, pt_entry_t opte, pt_entry_t npte)
847 {
848 
849 #if !defined(__x86_64__)
850 	if (curproc == NULL || curproc->p_vmspace == NULL ||
851 	    pm != vm_map_pmap(&curproc->p_vmspace->vm_map))
852 		return;
853 
854 	if ((opte ^ npte) & PG_X)
855 		pmap_update_pg(va);
856 
857 	/*
858 	 * Executability was removed on the last executable change.
859 	 * Reset the code segment to something conservative and
860 	 * let the trap handler deal with setting the right limit.
861 	 * We can't do that because of locking constraints on the vm map.
862 	 */
863 
864 	if ((opte & PG_X) && (npte & PG_X) == 0 && va == pm->pm_hiexec) {
865 		struct trapframe *tf = curlwp->l_md.md_regs;
866 
867 		tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL);
868 		pm->pm_hiexec = I386_MAX_EXE_ADDR;
869 	}
870 #endif /* !defined(__x86_64__) */
871 }
872 
873 #if !defined(__x86_64__)
874 /*
875  * Fixup the code segment to cover all potential executable mappings.
876  * returns 0 if no changes to the code segment were made.
877  */
878 
879 int
880 pmap_exec_fixup(struct vm_map *map, struct trapframe *tf, struct pcb *pcb)
881 {
882 	struct vm_map_entry *ent;
883 	struct pmap *pm = vm_map_pmap(map);
884 	vaddr_t va = 0;
885 
886 	vm_map_lock_read(map);
887 	for (ent = (&map->header)->next; ent != &map->header; ent = ent->next) {
888 
889 		/*
890 		 * This entry has greater va than the entries before.
891 		 * We need to make it point to the last page, not past it.
892 		 */
893 
894 		if (ent->protection & VM_PROT_EXECUTE)
895 			va = trunc_page(ent->end) - PAGE_SIZE;
896 	}
897 	vm_map_unlock_read(map);
898 	if (va == pm->pm_hiexec && tf->tf_cs == GSEL(GUCODEBIG_SEL, SEL_UPL))
899 		return (0);
900 
901 	pm->pm_hiexec = va;
902 	if (pm->pm_hiexec > I386_MAX_EXE_ADDR) {
903 		tf->tf_cs = GSEL(GUCODEBIG_SEL, SEL_UPL);
904 	} else {
905 		tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL);
906 		return (0);
907 	}
908 	return (1);
909 }
910 #endif /* !defined(__x86_64__) */
911 
912 void
913 pat_init(struct cpu_info *ci)
914 {
915 	uint64_t pat;
916 
917 	if (!(ci->ci_feat_val[0] & CPUID_PAT))
918 		return;
919 
920 	/* We change WT to WC. Leave all other entries the default values. */
921 	pat = PATENTRY(0, PAT_WB) | PATENTRY(1, PAT_WC) |
922 	      PATENTRY(2, PAT_UCMINUS) | PATENTRY(3, PAT_UC) |
923 	      PATENTRY(4, PAT_WB) | PATENTRY(5, PAT_WC) |
924 	      PATENTRY(6, PAT_UCMINUS) | PATENTRY(7, PAT_UC);
925 
926 	wrmsr(MSR_CR_PAT, pat);
927 	cpu_pat_enabled = true;
928 	aprint_debug_dev(ci->ci_dev, "PAT enabled\n");
929 }
930 
931 static pt_entry_t
932 pmap_pat_flags(u_int flags)
933 {
934 	u_int cacheflags = (flags & PMAP_CACHE_MASK);
935 
936 	if (!cpu_pat_enabled) {
937 		switch (cacheflags) {
938 		case PMAP_NOCACHE:
939 		case PMAP_NOCACHE_OVR:
940 			/* results in PGC_UCMINUS on cpus which have
941 			 * the cpuid PAT but PAT "disabled"
942 			 */
943 			return PG_N;
944 		default:
945 			return 0;
946 		}
947 	}
948 
949 	switch (cacheflags) {
950 	case PMAP_NOCACHE:
951 		return PGC_UC;
952 	case PMAP_WRITE_COMBINE:
953 		return PGC_WC;
954 	case PMAP_WRITE_BACK:
955 		return PGC_WB;
956 	case PMAP_NOCACHE_OVR:
957 		return PGC_UCMINUS;
958 	}
959 
960 	return 0;
961 }
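
/*
 * For example, a framebuffer mapping would typically pass
 * PMAP_WRITE_COMBINE in its flags: with PAT enabled this becomes a PGC_WC
 * mapping, while without PAT it silently falls back to the default
 * (write-back) policy, since only the PMAP_NOCACHE* flags map to PG_N.
 */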
962 
963 /*
964  * p m a p   k e n t e r   f u n c t i o n s
965  *
966  * functions to quickly enter/remove pages from the kernel address
967  * space.   pmap_kremove is exported to MI kernel.  we make use of
968  * the recursive PTE mappings.
969  */
970 
971 /*
972  * pmap_kenter_pa: enter a kernel mapping without R/M (pv_entry) tracking
973  *
974  * => no need to lock anything, assume va is already allocated
975  * => should be faster than normal pmap enter function
976  */
977 
978 void
979 pmap_kenter_pa(vaddr_t va, paddr_t pa, vm_prot_t prot, u_int flags)
980 {
981 	pt_entry_t *pte, opte, npte;
982 
983 	KASSERT(!(prot & ~VM_PROT_ALL));
984 
985 	if (va < VM_MIN_KERNEL_ADDRESS)
986 		pte = vtopte(va);
987 	else
988 		pte = kvtopte(va);
989 #ifdef DOM0OPS
990 	if (pa < pmap_pa_start || pa >= pmap_pa_end) {
991 #ifdef DEBUG
992 		printf_nolog("%s: pa 0x%" PRIx64 " for va 0x%" PRIx64
993 		    " outside range\n", __func__, (int64_t)pa, (int64_t)va);
994 #endif /* DEBUG */
995 		npte = pa;
996 	} else
997 #endif /* DOM0OPS */
998 		npte = pmap_pa2pte(pa);
999 	npte |= protection_codes[prot] | PG_k | PG_V | pmap_pg_g;
1000 	npte |= pmap_pat_flags(flags);
1001 	opte = pmap_pte_testset(pte, npte); /* zap! */
1002 #if defined(DIAGNOSTIC)
1003 	/*
1004 	 * XXX: make sure we are not dealing with a large page, since the only
1005 	 * large pages created are for the kernel image, and they should never
1006 	 * be kentered.
1007 	 */
1008 	if (opte & PG_PS)
1009 		panic("%s: PG_PS", __func__);
1010 #endif
1011 	if ((opte & (PG_V | PG_U)) == (PG_V | PG_U)) {
1012 		/* This should not happen. */
1013 		printf_nolog("%s: mapping already present\n", __func__);
1014 		kpreempt_disable();
1015 		pmap_tlb_shootdown(pmap_kernel(), va, opte, TLBSHOOT_KENTER);
1016 		kpreempt_enable();
1017 	}
1018 }
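
/*
 * Usage sketch (illustrative only): establishing and later tearing down a
 * kernel-only mapping of a known physical page at a KVA the caller owns:
 *
 *	pmap_kenter_pa(va, pa, VM_PROT_READ | VM_PROT_WRITE, 0);
 *	... use the mapping ...
 *	pmap_kremove(va, PAGE_SIZE);
 *	pmap_update(pmap_kernel());
 */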
1019 
1020 void
1021 pmap_emap_enter(vaddr_t va, paddr_t pa, vm_prot_t prot)
1022 {
1023 	pt_entry_t *pte, npte;
1024 
1025 	KASSERT((prot & ~VM_PROT_ALL) == 0);
1026 	pte = (va < VM_MIN_KERNEL_ADDRESS) ? vtopte(va) : kvtopte(va);
1027 
1028 #ifdef DOM0OPS
1029 	if (pa < pmap_pa_start || pa >= pmap_pa_end) {
1030 		npte = pa;
1031 	} else
1032 #endif
1033 		npte = pmap_pa2pte(pa);
1034 
1036 	npte |= protection_codes[prot] | PG_k | PG_V;
1037 	pmap_pte_set(pte, npte);
1038 }
1039 
1040 /*
1041  * pmap_emap_sync: perform TLB flush or pmap load, if it was deferred.
1042  */
1043 void
1044 pmap_emap_sync(bool canload)
1045 {
1046 	struct cpu_info *ci = curcpu();
1047 	struct pmap *pmap;
1048 
1049 	KASSERT(kpreempt_disabled());
1050 	if (__predict_true(ci->ci_want_pmapload && canload)) {
1051 		/*
1052 		 * XXX: Hint for pmap_reactivate(), which might suggest to
1053 		 * not perform TLB flush, if state has not changed.
1054 		 */
1055 		pmap = vm_map_pmap(&curlwp->l_proc->p_vmspace->vm_map);
1056 		if (__predict_false(pmap == ci->ci_pmap)) {
1057 			kcpuset_atomic_clear(pmap->pm_cpus, cpu_index(ci));
1058 		}
1059 		pmap_load();
1060 		KASSERT(ci->ci_want_pmapload == 0);
1061 	} else {
1062 		tlbflush();
1063 	}
1064 }
1065 
1066 void
1067 pmap_emap_remove(vaddr_t sva, vsize_t len)
1068 {
1069 	pt_entry_t *pte;
1070 	vaddr_t va, eva = sva + len;
1071 
1072 	for (va = sva; va < eva; va += PAGE_SIZE) {
1073 		pte = (va < VM_MIN_KERNEL_ADDRESS) ? vtopte(va) : kvtopte(va);
1074 		pmap_pte_set(pte, 0);
1075 	}
1076 }
1077 
1078 __strict_weak_alias(pmap_kenter_ma, pmap_kenter_pa);
1079 
1080 #if defined(__x86_64__)
1081 /*
1082  * Change protection for a virtual address. Local for a CPU only, don't
1083  * care about TLB shootdowns.
1084  *
1085  * => must be called with preemption disabled
1086  */
1087 void
1088 pmap_changeprot_local(vaddr_t va, vm_prot_t prot)
1089 {
1090 	pt_entry_t *pte, opte, npte;
1091 
1092 	KASSERT(kpreempt_disabled());
1093 
1094 	if (va < VM_MIN_KERNEL_ADDRESS)
1095 		pte = vtopte(va);
1096 	else
1097 		pte = kvtopte(va);
1098 
1099 	npte = opte = *pte;
1100 
1101 	if ((prot & VM_PROT_WRITE) != 0)
1102 		npte |= PG_RW;
1103 	else
1104 		npte &= ~PG_RW;
1105 
1106 	if (opte != npte) {
1107 		pmap_pte_set(pte, npte);
1108 		pmap_pte_flush();
1109 		invlpg(va);
1110 	}
1111 }
1112 #endif /* defined(__x86_64__) */
1113 
1114 /*
1115  * pmap_kremove: remove a kernel mapping(s) without R/M (pv_entry) tracking
1116  *
1117  * => no need to lock anything
1118  * => caller must dispose of any vm_page mapped in the va range
1119  * => note: not an inline function
1120  * => we assume the va is page aligned and the len is a multiple of PAGE_SIZE
1121  * => we assume kernel only unmaps valid addresses and thus don't bother
1122  *    checking the valid bit before doing TLB flushing
1123  * => must be followed by call to pmap_update() before reuse of page
1124  */
1125 
1126 static inline void
1127 pmap_kremove1(vaddr_t sva, vsize_t len, bool localonly)
1128 {
1129 	pt_entry_t *pte, opte;
1130 	vaddr_t va, eva;
1131 
1132 	eva = sva + len;
1133 
1134 	kpreempt_disable();
1135 	for (va = sva; va < eva; va += PAGE_SIZE) {
1136 		pte = kvtopte(va);
1137 		opte = pmap_pte_testset(pte, 0); /* zap! */
1138 		if ((opte & (PG_V | PG_U)) == (PG_V | PG_U) && !localonly) {
1139 			pmap_tlb_shootdown(pmap_kernel(), va, opte,
1140 			    TLBSHOOT_KREMOVE);
1141 		}
1142 		KASSERT((opte & PG_PS) == 0);
1143 		KASSERT((opte & PG_PVLIST) == 0);
1144 	}
1145 	if (localonly) {
1146 		tlbflushg();
1147 	}
1148 	kpreempt_enable();
1149 }
1150 
1151 void
1152 pmap_kremove(vaddr_t sva, vsize_t len)
1153 {
1154 
1155 	pmap_kremove1(sva, len, false);
1156 }
1157 
1158 /*
1159  * pmap_kremove_local: like pmap_kremove(), but only worry about
1160  * TLB invalidations on the current CPU.  this is only intended
1161  * for use while writing kernel crash dumps.
1162  */
1163 
1164 void
1165 pmap_kremove_local(vaddr_t sva, vsize_t len)
1166 {
1167 
1168 	KASSERT(panicstr != NULL);
1169 	pmap_kremove1(sva, len, true);
1170 }
1171 
1172 /*
1173  * p m a p   i n i t   f u n c t i o n s
1174  *
1175  * pmap_bootstrap and pmap_init are called during system startup
1176  * to init the pmap module.   pmap_bootstrap() does a low level
1177  * init just to get things rolling.   pmap_init() finishes the job.
1178  */
1179 
1180 /*
1181  * pmap_bootstrap_valloc: allocate a virtual address in the bootstrap area.
1182  * This function is to be used before any VM system has been set up.
1183  *
1184  * The va is taken from virtual_avail.
1185  */
1186 static vaddr_t
1187 pmap_bootstrap_valloc(size_t npages)
1188 {
1189 	vaddr_t va = virtual_avail;
1190 	virtual_avail += npages * PAGE_SIZE;
1191 	return va;
1192 }
1193 
1194 /*
1195  * pmap_bootstrap_palloc: allocate a physical address in the bootstrap area.
1196  * This function is to be used before any VM system has been set up.
1197  *
1198  * The pa is taken from avail_start.
1199  */
1200 static paddr_t
1201 pmap_bootstrap_palloc(size_t npages)
1202 {
1203 	paddr_t pa = avail_start;
1204 	avail_start += npages * PAGE_SIZE;
1205 	return pa;
1206 }
1207 
1208 /*
1209  * pmap_bootstrap: get the system in a state where it can run with VM properly
1210  * enabled (called before main()). The VM system is fully init'd later.
1211  *
1212  * => on i386, locore.S has already enabled the MMU by allocating a PDP for the
1213  *    kernel, and nkpde PTP's for the kernel.
1214  * => kva_start is the first free virtual address in kernel space.
1215  */
1216 void
1217 pmap_bootstrap(vaddr_t kva_start)
1218 {
1219 	struct pmap *kpm;
1220 	int i;
1221 	vaddr_t kva;
1222 #ifndef XEN
1223 	unsigned long p1i;
1224 	vaddr_t kva_end;
1225 #endif
1226 
1227 	pmap_pg_nx = (cpu_feature[2] & CPUID_NOX ? PG_NX : 0);
1228 
1229 	/*
1230 	 * Set up our local static global vars that keep track of the usage of
1231 	 * KVM before kernel_map is set up.
1232 	 */
1233 	virtual_avail = kva_start;		/* first free KVA */
1234 	virtual_end = VM_MAX_KERNEL_ADDRESS;	/* last KVA */
1235 
1236 	/*
1237 	 * Set up protection_codes: we need to be able to convert from a MI
1238 	 * protection code (some combo of VM_PROT...) to something we can jam
1239 	 * into a x86 PTE.
1240 	 */
1241 	protection_codes[VM_PROT_NONE] = pmap_pg_nx;
1242 	protection_codes[VM_PROT_EXECUTE] = PG_RO | PG_X;
1243 	protection_codes[VM_PROT_READ] = PG_RO | pmap_pg_nx;
1244 	protection_codes[VM_PROT_READ|VM_PROT_EXECUTE] = PG_RO | PG_X;
1245 	protection_codes[VM_PROT_WRITE] = PG_RW | pmap_pg_nx;
1246 	protection_codes[VM_PROT_WRITE|VM_PROT_EXECUTE] = PG_RW | PG_X;
1247 	protection_codes[VM_PROT_WRITE|VM_PROT_READ] = PG_RW | pmap_pg_nx;
1248 	protection_codes[VM_PROT_ALL] = PG_RW | PG_X;
1249 
1250 	/*
1251 	 * Now we init the kernel's pmap.
1252 	 *
1253 	 * The kernel pmap's pm_obj is not used for much. However, in user pmaps
1254 	 * the pm_obj contains the list of active PTPs.
1255 	 *
1256 	 * The pm_obj currently does not have a pager. It might be possible to
1257 	 * add a pager that would allow a process to read-only mmap its own page
1258 	 * tables (fast user-level vtophys?). This may or may not be useful.
1259 	 */
1260 	kpm = pmap_kernel();
1261 	for (i = 0; i < PTP_LEVELS - 1; i++) {
1262 		mutex_init(&kpm->pm_obj_lock[i], MUTEX_DEFAULT, IPL_NONE);
1263 		uvm_obj_init(&kpm->pm_obj[i], NULL, false, 1);
1264 		uvm_obj_setlock(&kpm->pm_obj[i], &kpm->pm_obj_lock[i]);
1265 		kpm->pm_ptphint[i] = NULL;
1266 	}
1267 	memset(&kpm->pm_list, 0, sizeof(kpm->pm_list));  /* pm_list not used */
1268 
1269 	kpm->pm_pdir = (pd_entry_t *)(PDPpaddr + KERNBASE);
1270 	for (i = 0; i < PDP_SIZE; i++)
1271 		kpm->pm_pdirpa[i] = PDPpaddr + PAGE_SIZE * i;
1272 
1273 	kpm->pm_stats.wired_count = kpm->pm_stats.resident_count =
1274 		x86_btop(kva_start - VM_MIN_KERNEL_ADDRESS);
1275 
1276 	kcpuset_create(&kpm->pm_cpus, true);
1277 	kcpuset_create(&kpm->pm_kernel_cpus, true);
1278 
1279 	/*
1280 	 * the above is just a rough estimate and not critical to the proper
1281 	 * operation of the system.
1282 	 */
1283 
1284 #ifndef XEN
1285 	/*
1286 	 * Begin to enable global TLB entries if they are supported.
1287 	 * The G bit has no effect until the CR4_PGE bit is set in CR4,
1288 	 * which happens in cpu_init(), which is run on each cpu
1289 	 * (and happens later)
1290 	 */
1291 	if (cpu_feature[0] & CPUID_PGE) {
1292 		pmap_pg_g = PG_G;		/* enable software */
1293 
1294 		/* add PG_G attribute to already mapped kernel pages */
1295 
1296 		if (KERNBASE == VM_MIN_KERNEL_ADDRESS) {
1297 			/* i386 only */
1298 			kva_end = virtual_avail;
1299 		} else {
1300 			/* amd64 only */
1301 			extern vaddr_t kern_end;
1302 			kva_end = kern_end;
1303 		}
1304 
1305 		for (kva = KERNBASE; kva < kva_end; kva += PAGE_SIZE) {
1306 			p1i = pl1_i(kva);
1307 			if (pmap_valid_entry(PTE_BASE[p1i]))
1308 				PTE_BASE[p1i] |= PG_G;
1309 		}
1310 	}
1311 
1312 	/*
1313 	 * Enable large pages if they are supported.
1314 	 */
1315 	if (cpu_feature[0] & CPUID_PSE) {
1316 		lcr4(rcr4() | CR4_PSE);	/* enable hardware (via %cr4) */
1317 		pmap_largepages = 1;	/* enable software */
1318 
1319 		/*
1320 		 * The TLB must be flushed after enabling large pages on Pentium
1321 		 * CPUs, according to section 3.6.2.2 of "Intel Architecture
1322 		 * Software Developer's Manual, Volume 3: System Programming".
1323 		 */
1324 		tlbflushg();
1325 
1326 		/* Remap the kernel. */
1327 		pmap_remap_largepages();
1328 	}
1329 #endif /* !XEN */
1330 
1331 	pmap_init_lapic();
1332 
1333 #ifdef __HAVE_DIRECT_MAP
1334 	pmap_init_directmap(kpm);
1335 #else
1336 	if (VM_MIN_KERNEL_ADDRESS != KERNBASE) {
1337 		/*
1338 		 * zero_pte is stuck at the end of mapped space for the kernel
1339 		 * image (disjunct from kva space). This is done so that it
1340 		 * can safely be used in pmap_growkernel (pmap_get_physpage),
1341 		 * when it's called for the first time.
1342 		 * XXXfvdl fix this for MULTIPROCESSOR later.
1343 		 */
1344 #ifdef XEN
1345 		/* early_zerop initialized in xen_locore() */
1346 #else
1347 		early_zerop = (void *)(KERNBASE + NKL2_KIMG_ENTRIES * NBPD_L2);
1348 #endif
1349 		early_zero_pte = PTE_BASE + pl1_i((vaddr_t)early_zerop);
1350 	}
1351 
1352 	/*
1353 	 * Now we allocate the "special" VAs which are used for tmp mappings
1354 	 * by the pmap (and other modules). We allocate the VAs by advancing
1355 	 * virtual_avail (note that there are no pages mapped at these VAs).
1356 	 * we find the PTE that maps the allocated VA via the linear PTE
1357 	 * mapping.
1358 	 */
1359 
1360 	pt_entry_t *pte = PTE_BASE + pl1_i(virtual_avail);
1361 
1362 #ifdef MULTIPROCESSOR
1363 	/*
1364 	 * Waste some VA space to avoid false sharing of cache lines
1365 	 * for page table pages: Give each possible CPU a cache line
1366 	 * of PTE's (8) to play with, though we only need 4.  We could
1367 	 * recycle some of this waste by putting the idle stacks here
1368 	 * as well; we could waste less space if we knew the largest
1369 	 * CPU ID beforehand.
1370 	 */
1371 	csrcp = (char *) virtual_avail;  csrc_pte = pte;
1372 
1373 	cdstp = (char *) virtual_avail+PAGE_SIZE;  cdst_pte = pte+1;
1374 
1375 	zerop = (char *) virtual_avail+PAGE_SIZE*2;  zero_pte = pte+2;
1376 
1377 	ptpp = (char *) virtual_avail+PAGE_SIZE*3;  ptp_pte = pte+3;
1378 
1379 	virtual_avail += PAGE_SIZE * maxcpus * NPTECL;
1380 	pte += maxcpus * NPTECL;
1381 #else
1382 	csrcp = (void *) virtual_avail;  csrc_pte = pte;	/* allocate */
1383 	virtual_avail += PAGE_SIZE; pte++;			/* advance */
1384 
1385 	cdstp = (void *) virtual_avail;  cdst_pte = pte;
1386 	virtual_avail += PAGE_SIZE; pte++;
1387 
1388 	zerop = (void *) virtual_avail;  zero_pte = pte;
1389 	virtual_avail += PAGE_SIZE; pte++;
1390 
1391 	ptpp = (void *) virtual_avail;  ptp_pte = pte;
1392 	virtual_avail += PAGE_SIZE; pte++;
1393 #endif
1394 
1395 	if (VM_MIN_KERNEL_ADDRESS == KERNBASE) {
1396 		early_zerop = zerop;
1397 		early_zero_pte = zero_pte;
1398 	}
1399 #endif
1400 
1401 #if defined(XEN) && defined(__x86_64__)
1402 	/*
1403 	 * We want a dummy page directory for Xen: when deactivating a pmap, Xen
1404 	 * will still consider it active. So we set user PGD to this one to lift
1405 	 * all protection on the now inactive page tables set.
1406 	 */
1407 	xen_dummy_user_pgd = pmap_bootstrap_palloc(1);
1408 
1409 	/* Zero fill it, the less checks in Xen it requires the better */
1410 	memset((void *) (xen_dummy_user_pgd + KERNBASE), 0, PAGE_SIZE);
1411 	/* Mark read-only */
1412 	HYPERVISOR_update_va_mapping(xen_dummy_user_pgd + KERNBASE,
1413 	    pmap_pa2pte(xen_dummy_user_pgd) | PG_u | PG_V, UVMF_INVLPG);
1414 	/* Pin as L4 */
1415 	xpq_queue_pin_l4_table(xpmap_ptom_masked(xen_dummy_user_pgd));
1416 #endif
1417 
1418 	/*
1419 	 * Allocate space for the IDT, GDT and LDT.
1420 	 */
1421 	idt_vaddr = pmap_bootstrap_valloc(1);
1422 	idt_paddr = pmap_bootstrap_palloc(1);
1423 
1424 	gdt_vaddr = pmap_bootstrap_valloc(1);
1425 	gdt_paddr = pmap_bootstrap_palloc(1);
1426 
1427 	ldt_vaddr = pmap_bootstrap_valloc(1);
1428 	ldt_paddr = pmap_bootstrap_palloc(1);
1429 
1430 #if !defined(__x86_64__) && !defined(XEN)
1431 	/* pentium f00f bug stuff */
1432 	pentium_idt_vaddr = pmap_bootstrap_valloc(1);
1433 #endif
1434 
1435 	/*
1436 	 * Now we reserve some VM for mapping pages when doing a crash dump.
1437 	 */
1438 	virtual_avail = reserve_dumppages(virtual_avail);
1439 
1440 	/*
1441 	 * Init the static-global locks and global lists.
1442 	 *
1443 	 * => pventry::pvh_lock (initialized elsewhere) must also be
1444 	 *      a spin lock, again at IPL_VM to prevent deadlock, and
1445 	 *	again is never taken from interrupt context.
1446 	 */
1447 	mutex_init(&pmaps_lock, MUTEX_DEFAULT, IPL_NONE);
1448 	LIST_INIT(&pmaps);
1449 
1450 	/*
1451 	 * Ensure the TLB is sync'd with reality by flushing it...
1452 	 */
1453 	tlbflushg();
1454 
1455 	/*
1456 	 * Calculate pmap_maxkvaddr from nkptp[].
1457 	 */
1458 	kva = VM_MIN_KERNEL_ADDRESS;
1459 	for (i = PTP_LEVELS - 1; i >= 1; i--) {
1460 		kva += nkptp[i] * nbpd[i];
1461 	}
1462 	pmap_maxkvaddr = kva;
1463 }
1464 
1465 static void
1466 pmap_init_lapic(void)
1467 {
1468 	/*
1469 	 * On CPUs that have no LAPIC, local_apic_va is never kentered. But our
1470 	 * x86 implementation relies a lot on this address to be valid; so just
1471 	 * allocate a fake physical page that will be kentered into
1472 	 * local_apic_va by machdep.
1473 	 *
1474 	 * If the LAPIC is present, the va will be remapped somewhere else
1475 	 * later in lapic_map.
1476 	 */
1477 	local_apic_va = pmap_bootstrap_valloc(1);
1478 	local_apic_pa = pmap_bootstrap_palloc(1);
1479 }
1480 
1481 #ifdef __HAVE_DIRECT_MAP
1482 /*
1483  * Create the amd64 direct map. Called only once at boot time.
1484  */
1485 static void
1486 pmap_init_directmap(struct pmap *kpm)
1487 {
1488 	extern phys_ram_seg_t mem_clusters[];
1489 	extern int mem_cluster_cnt;
1490 
1491 	paddr_t lastpa, dm_pd, dm_pdp, pdp;
1492 	vaddr_t tmpva;
1493 	pt_entry_t *pte;
1494 	pd_entry_t *pde;
1495 	phys_ram_seg_t *mc;
1496 	long n_dm_pdp;
1497 	int i;
1498 
1499 	const pd_entry_t pteflags = PG_V | PG_KW | pmap_pg_nx;
1500 
1501 	/* Get the last physical address available */
1502 	lastpa = 0;
1503 	for (i = 0; i < mem_cluster_cnt; i++) {
1504 		mc = &mem_clusters[i];
1505 		lastpa = MAX(lastpa, mc->start + mc->size);
1506 	}
1507 
1508 	/*
1509 	 * We allocate only one L4 entry for the direct map (PDIR_SLOT_DIRECT),
1510 	 * so we cannot map more than 512GB.
1511 	 */
1512 	if (lastpa > NBPD_L4) {
1513 		panic("RAM limit reached: > 512GB not supported");
1514 	}
1515 
1516 	/* Allocate L3. */
1517 	dm_pdp = pmap_bootstrap_palloc(1);
1518 
1519 	/* Number of L3 entries. */
1520 	n_dm_pdp = (lastpa + NBPD_L3 - 1) >> L3_SHIFT;
1521 
1522 	/* In locore.S, we allocated a tmp va. Use it now. */
1523 	tmpva = (KERNBASE + NKL2_KIMG_ENTRIES * NBPD_L2);
1524 	pte = PTE_BASE + pl1_i(tmpva);
1525 	*pte = dm_pdp | pteflags;
1526 	pmap_update_pg(tmpva);
1527 	memset((void *)tmpva, 0, PAGE_SIZE);
1528 
1529 	/*
1530 	 * Map the direct map RW. Use super pages (1GB) or large pages (2MB) if
1531 	 * they are supported. Note: PG_G is not allowed on non-leaf PTPs.
1532 	 */
1533 	if (cpu_feature[2] & CPUID_P1GB) {
1534 		/* Super pages are supported. Just create L3. */
1535 		for (i = 0; i < n_dm_pdp; i++) {
1536 			pdp = (paddr_t)&(((pd_entry_t *)dm_pdp)[i]);
1537 			*pte = (pdp & PG_FRAME) | pteflags;
1538 			pmap_update_pg(tmpva);
1539 
1540 			pde = (pd_entry_t *)(tmpva + (pdp & ~PG_FRAME));
1541 			*pde = ((paddr_t)i << L3_SHIFT) | pteflags | PG_U |
1542 			    PG_PS | PG_G;
1543 		}
1544 	} else {
1545 		/* Allocate L2. */
1546 		dm_pd = pmap_bootstrap_palloc(n_dm_pdp);
1547 
1548 		/* Zero out the L2 pages. */
1549 		for (i = 0; i < n_dm_pdp; i++) {
1550 			pdp = dm_pd + i * PAGE_SIZE;
1551 			*pte = (pdp & PG_FRAME) | pteflags;
1552 			pmap_update_pg(tmpva);
1553 
1554 			memset((void *)tmpva, 0, PAGE_SIZE);
1555 		}
1556 
1557 		KASSERT(pmap_largepages != 0);
1558 
1559 		/* Large pages are supported. Just create L2. */
1560 		for (i = 0; i < NPDPG * n_dm_pdp; i++) {
1561 			pdp = (paddr_t)&(((pd_entry_t *)dm_pd)[i]);
1562 			*pte = (pdp & PG_FRAME) | pteflags;
1563 			pmap_update_pg(tmpva);
1564 
1565 			pde = (pd_entry_t *)(tmpva + (pdp & ~PG_FRAME));
1566 			*pde = ((paddr_t)i << L2_SHIFT) | pteflags |
1567 			    PG_U | PG_PS | PG_G;
1568 		}
1569 
1570 		/* Fill in the L3 entries, linked to L2. */
1571 		for (i = 0; i < n_dm_pdp; i++) {
1572 			pdp = (paddr_t)&(((pd_entry_t *)dm_pdp)[i]);
1573 			*pte = (pdp & PG_FRAME) | pteflags;
1574 			pmap_update_pg(tmpva);
1575 
1576 			pde = (pd_entry_t *)(tmpva + (pdp & ~PG_FRAME));
1577 			*pde = (dm_pd + (i << PAGE_SHIFT)) | pteflags | PG_U;
1578 		}
1579 	}
1580 
1581 	kpm->pm_pdir[PDIR_SLOT_DIRECT] = dm_pdp | pteflags | PG_U;
1582 
1583 	*pte = 0;
1584 	pmap_update_pg(tmpva);
1585 
1586 	tlbflush();
1587 }
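
/*
 * Sizing example (illustrative): with 16GB of RAM, lastpa is 16GB, so
 * n_dm_pdp = 16 L3 entries (NBPD_L3 is 1GB).  Without 1GB page support
 * this additionally needs 16 L2 pages, each of whose 512 entries maps a
 * 2MB large page, i.e. 1GB per L2 page.
 */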
1588 #endif /* __HAVE_DIRECT_MAP */
1589 
1590 #ifndef XEN
1591 /*
1592  * Remap several kernel segments with large pages. We cover as many pages as we
1593  * can. Called only once at boot time, if the CPU supports large pages.
1594  */
1595 static void
1596 pmap_remap_largepages(void)
1597 {
1598 	extern char __rodata_start;
1599 	extern char __data_start;
1600 	extern char __kernel_end;
1601 	pd_entry_t *pde;
1602 	vaddr_t kva, kva_end;
1603 	paddr_t pa;
1604 
1605 	/* Remap the kernel text using large pages. */
1606 	kva = KERNBASE;
1607 	kva_end = rounddown((vaddr_t)&__rodata_start, NBPD_L1);
1608 	pa = kva - KERNBASE;
1609 	for (/* */; kva + NBPD_L2 <= kva_end; kva += NBPD_L2, pa += NBPD_L2) {
1610 		pde = &L2_BASE[pl2_i(kva)];
1611 		*pde = pa | pmap_pg_g | PG_PS | PG_KR | PG_V;
1612 		tlbflushg();
1613 	}
1614 #if defined(DEBUG)
1615 	aprint_normal("kernel text is mapped with %" PRIuPSIZE " large "
1616 	    "pages and %" PRIuPSIZE " normal pages\n",
1617 	    howmany(kva - KERNBASE, NBPD_L2),
1618 	    howmany((vaddr_t)&__rodata_start - kva, NBPD_L1));
1619 #endif /* defined(DEBUG) */
1620 
1621 	/* Remap the kernel rodata using large pages. */
1622 	kva = roundup((vaddr_t)&__rodata_start, NBPD_L2);
1623 	kva_end = rounddown((vaddr_t)&__data_start, NBPD_L1);
1624 	pa = kva - KERNBASE;
1625 	for (/* */; kva + NBPD_L2 <= kva_end; kva += NBPD_L2, pa += NBPD_L2) {
1626 		pde = &L2_BASE[pl2_i(kva)];
1627 		*pde = pa | pmap_pg_g | PG_PS | pmap_pg_nx | PG_KR | PG_V;
1628 		tlbflushg();
1629 	}
1630 
1631 	/* Remap the kernel data+bss using large pages. */
1632 	kva = roundup((vaddr_t)&__data_start, NBPD_L2);
1633 	kva_end = rounddown((vaddr_t)&__kernel_end, NBPD_L1);
1634 	pa = kva - KERNBASE;
1635 	for (/* */; kva + NBPD_L2 <= kva_end; kva += NBPD_L2, pa += NBPD_L2) {
1636 		pde = &L2_BASE[pl2_i(kva)];
1637 		*pde = pa | pmap_pg_g | PG_PS | pmap_pg_nx | PG_KW | PG_V;
1638 		tlbflushg();
1639 	}
1640 }
1641 #endif /* !XEN */
1642 
1643 /*
1644  * pmap_init: called from uvm_init, our job is to get the pmap
1645  * system ready to manage mappings...
1646  */
1647 
1648 void
1649 pmap_init(void)
1650 {
1651 	int i, flags;
1652 
1653 	for (i = 0; i < PV_HASH_SIZE; i++) {
1654 		SLIST_INIT(&pv_hash_heads[i].hh_list);
1655 	}
1656 	for (i = 0; i < PV_HASH_LOCK_CNT; i++) {
1657 		mutex_init(&pv_hash_locks[i].lock, MUTEX_NODEBUG, IPL_VM);
1658 	}
1659 
1660 	/*
1661 	 * initialize caches.
1662 	 */
1663 
1664 	pool_cache_bootstrap(&pmap_cache, sizeof(struct pmap), 0, 0, 0,
1665 	    "pmappl", NULL, IPL_NONE, NULL, NULL, NULL);
1666 
1667 #ifdef XEN
1668 	/*
1669 	 * pool_cache(9) should not touch cached objects, since they
1670 	 * are pinned on xen and R/O for the domU
1671 	 */
1672 	flags = PR_NOTOUCH;
1673 #else /* XEN */
1674 	flags = 0;
1675 #endif /* XEN */
1676 #ifdef PAE
1677 	pool_cache_bootstrap(&pmap_pdp_cache, PAGE_SIZE * PDP_SIZE, 0, 0, flags,
1678 	    "pdppl", &pmap_pdp_allocator, IPL_NONE,
1679 	    pmap_pdp_ctor, pmap_pdp_dtor, NULL);
1680 #else /* PAE */
1681 	pool_cache_bootstrap(&pmap_pdp_cache, PAGE_SIZE, 0, 0, flags,
1682 	    "pdppl", NULL, IPL_NONE, pmap_pdp_ctor, pmap_pdp_dtor, NULL);
1683 #endif /* PAE */
1684 	pool_cache_bootstrap(&pmap_pv_cache, sizeof(struct pv_entry), 0, 0,
1685 	    PR_LARGECACHE, "pvpl", &pool_allocator_kmem, IPL_NONE, NULL,
1686 	    NULL, NULL);
1687 
1688 	pmap_tlb_init();
1689 
1690 	/* XXX: Since cpu_hatch() is only for secondary CPUs. */
1691 	pmap_tlb_cpu_init(curcpu());
1692 
1693 	evcnt_attach_dynamic(&pmap_iobmp_evcnt, EVCNT_TYPE_MISC,
1694 	    NULL, "x86", "io bitmap copy");
1695 	evcnt_attach_dynamic(&pmap_ldt_evcnt, EVCNT_TYPE_MISC,
1696 	    NULL, "x86", "ldt sync");
1697 
1698 	/*
1699 	 * done: pmap module is up (and ready for business)
1700 	 */
1701 
1702 	pmap_initialized = true;
1703 }
1704 
1705 /*
1706  * pmap_cpu_init_late: perform late per-CPU initialization.
1707  */
1708 
1709 #ifndef XEN
1710 void
1711 pmap_cpu_init_late(struct cpu_info *ci)
1712 {
1713 	/*
1714 	 * The BP has already its own PD page allocated during early
1715 	 * MD startup.
1716 	 */
1717 	if (ci == &cpu_info_primary)
1718 		return;
1719 
1720 #ifdef PAE
1721 	cpu_alloc_l3_page(ci);
1722 #endif
1723 }
1724 #endif
1725 
1726 /*
1727  * p v _ e n t r y   f u n c t i o n s
1728  */
1729 
1730 /*
1731  * pmap_free_pvs: free a list of pv_entrys
1732  */
1733 
1734 static void
1735 pmap_free_pvs(struct pv_entry *pve)
1736 {
1737 	struct pv_entry *next;
1738 
1739 	for ( /* null */ ; pve != NULL ; pve = next) {
1740 		next = pve->pve_next;
1741 		pool_cache_put(&pmap_pv_cache, pve);
1742 	}
1743 }
1744 
1745 /*
1746  * main pv_entry manipulation functions:
1747  *   pmap_enter_pv: enter a mapping onto a pv_head list
1748  *   pmap_remove_pv: remove a mapping from a pv_head list
1749  *
1750  * NOTE: Both pmap_enter_pv and pmap_remove_pv expect the caller to lock
1751  *       the pvh before calling
1752  */
1753 
1754 /*
1755  * insert_pv: a helper of pmap_enter_pv
1756  */
1757 
1758 static void
1759 insert_pv(struct pmap_page *pp, struct pv_entry *pve)
1760 {
1761 	struct pv_hash_head *hh;
1762 	kmutex_t *lock;
1763 	u_int hash;
1764 
1765 	hash = pvhash_hash(pve->pve_pte.pte_ptp, pve->pve_pte.pte_va);
1766 	lock = pvhash_lock(hash);
1767 	hh = pvhash_head(hash);
1768 	mutex_spin_enter(lock);
1769 	SLIST_INSERT_HEAD(&hh->hh_list, pve, pve_hash);
1770 	mutex_spin_exit(lock);
1771 
1772 	LIST_INSERT_HEAD(&pp->pp_head.pvh_list, pve, pve_list);
1773 }
1774 
1775 /*
 * pmap_enter_pv: enter a mapping onto a pv_head list
1777  *
1778  * => caller should adjust ptp's wire_count before calling
1779  * => caller has preallocated pve and *sparepve for us
1780  */
1781 
1782 static struct pv_entry *
1783 pmap_enter_pv(struct pmap_page *pp, struct pv_entry *pve,
1784     struct pv_entry **sparepve, struct vm_page *ptp, vaddr_t va)
1785 {
1786 
1787 	KASSERT(ptp == NULL || ptp->wire_count >= 2);
1788 	KASSERT(ptp == NULL || ptp->uobject != NULL);
1789 	KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset);
1790 
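	/*
	 * If the page has no mappings yet, record this one directly in
	 * the pmap_page (PP_EMBEDDED) and hand the unused pve back to
	 * the caller.  Otherwise, push any embedded entry out onto the
	 * pv list using the preallocated spare, then chain the new pve.
	 */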
1791 	if ((pp->pp_flags & PP_EMBEDDED) == 0) {
1792 		if (LIST_EMPTY(&pp->pp_head.pvh_list)) {
1793 			pp->pp_flags |= PP_EMBEDDED;
1794 			pp->pp_pte.pte_ptp = ptp;
1795 			pp->pp_pte.pte_va = va;
1796 
1797 			return pve;
1798 		}
1799 	} else {
1800 		struct pv_entry *pve2;
1801 
1802 		pve2 = *sparepve;
1803 		*sparepve = NULL;
1804 
1805 		pve2->pve_pte = pp->pp_pte;
1806 		pp->pp_flags &= ~PP_EMBEDDED;
1807 		LIST_INIT(&pp->pp_head.pvh_list);
1808 		insert_pv(pp, pve2);
1809 	}
1810 
1811 	pve->pve_pte.pte_ptp = ptp;
1812 	pve->pve_pte.pte_va = va;
1813 	insert_pv(pp, pve);
1814 
1815 	return NULL;
1816 }
1817 
1818 /*
1819  * pmap_remove_pv: try to remove a mapping from a pv_list
1820  *
1821  * => caller should adjust ptp's wire_count and free PTP if needed
 * => we return the removed pve, or NULL if the mapping was embedded
1823  */
1824 
1825 static struct pv_entry *
1826 pmap_remove_pv(struct pmap_page *pp, struct vm_page *ptp, vaddr_t va)
1827 {
1828 	struct pv_hash_head *hh;
1829 	struct pv_entry *pve;
1830 	kmutex_t *lock;
1831 	u_int hash;
1832 
1833 	KASSERT(ptp == NULL || ptp->uobject != NULL);
1834 	KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset);
1835 
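	/*
	 * If the mapping is the embedded one, just clear PP_EMBEDDED;
	 * there is no pv_entry to hand back.  Otherwise unhook the pve
	 * from both the hash bucket and the page's pv list.
	 */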
1836 	if ((pp->pp_flags & PP_EMBEDDED) != 0) {
1837 		KASSERT(pp->pp_pte.pte_ptp == ptp);
1838 		KASSERT(pp->pp_pte.pte_va == va);
1839 
1840 		pp->pp_flags &= ~PP_EMBEDDED;
1841 		LIST_INIT(&pp->pp_head.pvh_list);
1842 
1843 		return NULL;
1844 	}
1845 
1846 	hash = pvhash_hash(ptp, va);
1847 	lock = pvhash_lock(hash);
1848 	hh = pvhash_head(hash);
1849 	mutex_spin_enter(lock);
1850 	pve = pvhash_remove(hh, ptp, va);
1851 	mutex_spin_exit(lock);
1852 
1853 	LIST_REMOVE(pve, pve_list);
1854 
1855 	return pve;
1856 }
1857 
1858 /*
1859  * p t p   f u n c t i o n s
1860  */
1861 
1862 static inline struct vm_page *
1863 pmap_find_ptp(struct pmap *pmap, vaddr_t va, paddr_t pa, int level)
1864 {
1865 	int lidx = level - 1;
1866 	struct vm_page *pg;
1867 
1868 	KASSERT(mutex_owned(pmap->pm_lock));
1869 
1870 	if (pa != (paddr_t)-1 && pmap->pm_ptphint[lidx] &&
1871 	    pa == VM_PAGE_TO_PHYS(pmap->pm_ptphint[lidx])) {
1872 		return (pmap->pm_ptphint[lidx]);
1873 	}
1874 	PMAP_SUBOBJ_LOCK(pmap, lidx);
1875 	pg = uvm_pagelookup(&pmap->pm_obj[lidx], ptp_va2o(va, level));
1876 	PMAP_SUBOBJ_UNLOCK(pmap, lidx);
1877 
1878 	KASSERT(pg == NULL || pg->wire_count >= 1);
1879 	return pg;
1880 }
1881 
1882 static inline void
1883 pmap_freepage(struct pmap *pmap, struct vm_page *ptp, int level)
1884 {
1885 	lwp_t *l;
1886 	int lidx;
1887 	struct uvm_object *obj;
1888 
1889 	KASSERT(ptp->wire_count == 1);
1890 
1891 	lidx = level - 1;
1892 
1893 	obj = &pmap->pm_obj[lidx];
1894 	pmap_stats_update(pmap, -1, 0);
1895 	if (lidx != 0)
1896 		mutex_enter(obj->vmobjlock);
1897 	if (pmap->pm_ptphint[lidx] == ptp)
1898 		pmap->pm_ptphint[lidx] = TAILQ_FIRST(&obj->memq);
1899 	ptp->wire_count = 0;
1900 	uvm_pagerealloc(ptp, NULL, 0);
1901 	l = curlwp;
1902 	KASSERT((l->l_pflag & LP_INTR) == 0);
1903 	VM_PAGE_TO_PP(ptp)->pp_link = l->l_md.md_gc_ptp;
1904 	l->l_md.md_gc_ptp = ptp;
1905 	if (lidx != 0)
1906 		mutex_exit(obj->vmobjlock);
1907 }
1908 
1909 static void
1910 pmap_free_ptp(struct pmap *pmap, struct vm_page *ptp, vaddr_t va,
1911 	      pt_entry_t *ptes, pd_entry_t * const *pdes)
1912 {
1913 	unsigned long index;
1914 	int level;
1915 	vaddr_t invaladdr;
1916 	pd_entry_t opde;
1917 
1918 	KASSERT(pmap != pmap_kernel());
1919 	KASSERT(mutex_owned(pmap->pm_lock));
1920 	KASSERT(kpreempt_disabled());
1921 
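	/*
	 * Starting at the lowest level, unhook the PTP from its parent
	 * directory, shoot down its now-stale mapping in the recursive
	 * PTE area, and queue the page for freeing.  Walk upward while
	 * each parent's wire_count drops to one, i.e. it no longer maps
	 * any PTPs of its own.
	 */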
1922 	level = 1;
1923 	do {
1924 		index = pl_i(va, level + 1);
1925 		opde = pmap_pte_testset(&pdes[level - 1][index], 0);
1926 #if defined(XEN)
1927 #  if defined(__x86_64__)
1928 		/*
		 * If ptp is an L3 currently mapped in kernel space
		 * on any CPU, clear it before freeing.
1931 		 */
1932 		if (level == PTP_LEVELS - 1) {
1933 			/*
1934 			 * Update the per-cpu PD on all cpus the current
1935 			 * pmap is active on
1936 			 */
1937 			xen_kpm_sync(pmap, index);
1938 		}
1939 #  endif /*__x86_64__ */
1940 		invaladdr = level == 1 ? (vaddr_t)ptes :
1941 		    (vaddr_t)pdes[level - 2];
1942 		pmap_tlb_shootdown(pmap, invaladdr + index * PAGE_SIZE,
1943 		    opde, TLBSHOOT_FREE_PTP1);
1944 		pmap_tlb_shootnow();
1945 #else	/* XEN */
1946 		invaladdr = level == 1 ? (vaddr_t)ptes :
1947 		    (vaddr_t)pdes[level - 2];
1948 		pmap_tlb_shootdown(pmap, invaladdr + index * PAGE_SIZE,
1949 		    opde, TLBSHOOT_FREE_PTP1);
1950 #endif	/* XEN */
1951 		pmap_freepage(pmap, ptp, level);
1952 		if (level < PTP_LEVELS - 1) {
1953 			ptp = pmap_find_ptp(pmap, va, (paddr_t)-1, level + 1);
1954 			ptp->wire_count--;
1955 			if (ptp->wire_count > 1)
1956 				break;
1957 		}
1958 	} while (++level < PTP_LEVELS);
1959 	pmap_pte_flush();
1960 }
1961 
1962 /*
1963  * pmap_get_ptp: get a PTP (if there isn't one, allocate a new one)
1964  *
1965  * => pmap should NOT be pmap_kernel()
1966  * => pmap should be locked
1967  * => preemption should be disabled
1968  */
1969 
1970 static struct vm_page *
1971 pmap_get_ptp(struct pmap *pmap, vaddr_t va, pd_entry_t * const *pdes)
1972 {
1973 	struct vm_page *ptp, *pptp;
1974 	int i;
1975 	unsigned long index;
1976 	pd_entry_t *pva;
1977 	paddr_t ppa, pa;
1978 	struct uvm_object *obj;
1979 
1980 	KASSERT(pmap != pmap_kernel());
1981 	KASSERT(mutex_owned(pmap->pm_lock));
1982 	KASSERT(kpreempt_disabled());
1983 
1984 	ptp = NULL;
1985 	pa = (paddr_t)-1;
1986 
1987 	/*
	 * Loop through all the page table levels, checking whether we
	 * need to add a new page at each level.
1990 	 */
1991 	for (i = PTP_LEVELS; i > 1; i--) {
1992 		/*
1993 		 * Save values from previous round.
1994 		 */
1995 		pptp = ptp;
1996 		ppa = pa;
1997 
1998 		index = pl_i(va, i);
1999 		pva = pdes[i - 2];
2000 
2001 		if (pmap_valid_entry(pva[index])) {
2002 			ppa = pmap_pte2pa(pva[index]);
2003 			ptp = NULL;
2004 			continue;
2005 		}
2006 
2007 		obj = &pmap->pm_obj[i-2];
2008 		PMAP_SUBOBJ_LOCK(pmap, i - 2);
2009 		ptp = uvm_pagealloc(obj, ptp_va2o(va, i - 1), NULL,
2010 		    UVM_PGA_USERESERVE|UVM_PGA_ZERO);
2011 		PMAP_SUBOBJ_UNLOCK(pmap, i - 2);
2012 
2013 		if (ptp == NULL)
2014 			return NULL;
2015 
2016 		ptp->flags &= ~PG_BUSY; /* never busy */
2017 		ptp->wire_count = 1;
2018 		pmap->pm_ptphint[i - 2] = ptp;
2019 		pa = VM_PAGE_TO_PHYS(ptp);
2020 		pmap_pte_set(&pva[index], (pd_entry_t)
2021 		        (pmap_pa2pte(pa) | PG_u | PG_RW | PG_V));
2022 #if defined(XEN) && defined(__x86_64__)
		if (i == PTP_LEVELS) {
2024 			/*
2025 			 * Update the per-cpu PD on all cpus the current
2026 			 * pmap is active on
2027 			 */
2028 			xen_kpm_sync(pmap, index);
2029 		}
2030 #endif
2031 		pmap_pte_flush();
2032 		pmap_stats_update(pmap, 1, 0);
2033 		/*
2034 		 * If we're not in the top level, increase the
2035 		 * wire count of the parent page.
2036 		 */
2037 		if (i < PTP_LEVELS) {
2038 			if (pptp == NULL) {
2039 				pptp = pmap_find_ptp(pmap, va, ppa, i);
2040 				KASSERT(pptp != NULL);
2041 			}
2042 			pptp->wire_count++;
2043 		}
2044 	}
2045 
2046 	/*
2047 	 * PTP is not NULL if we just allocated a new PTP.  If it is
2048 	 * still NULL, we must look up the existing one.
2049 	 */
2050 	if (ptp == NULL) {
2051 		ptp = pmap_find_ptp(pmap, va, ppa, 1);
		KASSERTMSG(ptp != NULL, "pmap_get_ptp: va %" PRIxVADDR
		    " ppa %" PRIxPADDR "\n", va, ppa);
2054 	}
2055 
2056 	pmap->pm_ptphint[0] = ptp;
2057 	return ptp;
2058 }
2059 
2060 /*
2061  * p m a p   l i f e c y c l e   f u n c t i o n s
2062  */
2063 
2064 /*
2065  * pmap_pdp_ctor: constructor for the PDP cache.
2066  */
2067 static int
2068 pmap_pdp_ctor(void *arg, void *v, int flags)
2069 {
2070 	pd_entry_t *pdir = v;
2071 	paddr_t pdirpa = 0;
2072 	vaddr_t object;
2073 	int i;
2074 
2075 #if !defined(XEN) || !defined(__x86_64__)
2076 	int npde;
2077 #endif
2078 #ifdef XEN
2079 	int s;
2080 #endif
2081 
2082 	/*
2083 	 * NOTE: The `pmaps_lock' is held when the PDP is allocated.
2084 	 */
2085 
2086 #if defined(XEN) && defined(__x86_64__)
2087 	/* Fetch the physical address of the page directory */
2088 	(void)pmap_extract(pmap_kernel(), (vaddr_t)pdir, &pdirpa);
2089 
2090 	/* Zero the area */
2091 	memset(pdir, 0, PAGE_SIZE); /* Xen wants a clean page */
2092 
2093 	/*
2094 	 * This pdir will NEVER be active in kernel mode, so mark
2095 	 * recursive entry invalid.
2096 	 */
2097 	pdir[PDIR_SLOT_PTE] = pmap_pa2pte(pdirpa) | PG_u;
2098 
2099 	/*
	 * A PDP constructed this way will never be used for the kernel,
	 * hence we don't put kernel mappings in it on Xen.
2102 	 *
2103 	 * But we need to make pmap_create() happy, so put a dummy
2104 	 * (without PG_V) value at the right place.
2105 	 */
2106 	pdir[PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1] - 1] =
2107 	     (pd_entry_t)-1 & PG_FRAME;
2108 #else /* XEN && __x86_64__*/
2109 	/* Zero the area */
2110 	memset(pdir, 0, PDIR_SLOT_PTE * sizeof(pd_entry_t));
2111 
2112 	object = (vaddr_t)v;
2113 	for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) {
2114 		/* Fetch the physical address of the page directory */
2115 		(void)pmap_extract(pmap_kernel(), object, &pdirpa);
2116 
2117 		/* Put in recursive PDE to map the PTEs */
2118 		pdir[PDIR_SLOT_PTE + i] = pmap_pa2pte(pdirpa) | PG_V |
2119 		    pmap_pg_nx;
2120 #ifndef XEN
2121 		pdir[PDIR_SLOT_PTE + i] |= PG_KW;
2122 #endif
2123 	}
2124 
2125 	/* Copy the kernel's top level PDE */
2126 	npde = nkptp[PTP_LEVELS - 1];
2127 
2128 	memcpy(&pdir[PDIR_SLOT_KERN], &PDP_BASE[PDIR_SLOT_KERN],
2129 	    npde * sizeof(pd_entry_t));
2130 
2131 	/* Zero the rest */
2132 	memset(&pdir[PDIR_SLOT_KERN + npde], 0, (PAGE_SIZE * PDP_SIZE) -
2133 	    (PDIR_SLOT_KERN + npde) * sizeof(pd_entry_t));
2134 
2135 	if (VM_MIN_KERNEL_ADDRESS != KERNBASE) {
2136 		int idx = pl_i(KERNBASE, PTP_LEVELS);
2137 		pdir[idx] = PDP_BASE[idx];
2138 	}
2139 
2140 #ifdef __HAVE_DIRECT_MAP
2141 	pdir[PDIR_SLOT_DIRECT] = PDP_BASE[PDIR_SLOT_DIRECT];
2142 #endif
2143 #endif /* XEN  && __x86_64__*/
2144 
2145 #ifdef XEN
2146 	s = splvm();
2147 	object = (vaddr_t)v;
2148 	pmap_protect(pmap_kernel(), object, object + (PAGE_SIZE * PDP_SIZE),
2149 	    VM_PROT_READ);
2150 	pmap_update(pmap_kernel());
2151 	for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) {
2152 		/*
		 * Pin as an L2/L4 page; the page holding the
		 * PDIR_SLOT_PTE entries must be done last.
2155 		 */
2156 #ifdef PAE
2157 		if (i == l2tol3(PDIR_SLOT_PTE))
2158 			continue;
2159 #endif
2160 
2161 		(void) pmap_extract(pmap_kernel(), object, &pdirpa);
2162 #ifdef __x86_64__
2163 		xpq_queue_pin_l4_table(xpmap_ptom_masked(pdirpa));
2164 #else
2165 		xpq_queue_pin_l2_table(xpmap_ptom_masked(pdirpa));
2166 #endif
2167 	}
2168 #ifdef PAE
	object = ((vaddr_t)pdir) + PAGE_SIZE * l2tol3(PDIR_SLOT_PTE);
2170 	(void)pmap_extract(pmap_kernel(), object, &pdirpa);
2171 	xpq_queue_pin_l2_table(xpmap_ptom_masked(pdirpa));
2172 #endif
2173 	splx(s);
2174 #endif /* XEN */
2175 
2176 	return (0);
2177 }
2178 
2179 /*
2180  * pmap_pdp_dtor: destructor for the PDP cache.
2181  */
2182 
2183 static void
2184 pmap_pdp_dtor(void *arg, void *v)
2185 {
2186 #ifdef XEN
2187 	paddr_t pdirpa = 0;	/* XXX: GCC */
2188 	vaddr_t object = (vaddr_t)v;
2189 	int i;
2190 	int s = splvm();
2191 	pt_entry_t *pte;
2192 
2193 	for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) {
2194 		/* fetch the physical address of the page directory. */
2195 		(void) pmap_extract(pmap_kernel(), object, &pdirpa);
2196 		/* unpin page table */
2197 		xpq_queue_unpin_table(xpmap_ptom_masked(pdirpa));
2198 	}
2199 	object = (vaddr_t)v;
2200 	for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) {
2201 		/* Set page RW again */
2202 		pte = kvtopte(object);
2203 		pmap_pte_set(pte, *pte | PG_RW);
2204 		xen_bcast_invlpg((vaddr_t)object);
2205 	}
2206 	splx(s);
2207 #endif  /* XEN */
2208 }
2209 
2210 #ifdef PAE
2211 
/* pmap_pdp_alloc: Allocate pages for the pdp memory pool. */
2213 
2214 static void *
2215 pmap_pdp_alloc(struct pool *pp, int flags)
2216 {
2217 	return (void *)uvm_km_alloc(kernel_map,
2218 	    PAGE_SIZE * PDP_SIZE, PAGE_SIZE * PDP_SIZE,
2219 	    ((flags & PR_WAITOK) ? 0 : UVM_KMF_NOWAIT | UVM_KMF_TRYLOCK)
2220 	    | UVM_KMF_WIRED);
2221 }
2222 
2223 /*
2224  * pmap_pdp_free: free a PDP
2225  */
2226 
2227 static void
2228 pmap_pdp_free(struct pool *pp, void *v)
2229 {
2230 	uvm_km_free(kernel_map, (vaddr_t)v, PAGE_SIZE * PDP_SIZE,
2231 	    UVM_KMF_WIRED);
2232 }
2233 #endif /* PAE */
2234 
2235 /*
2236  * pmap_create: create a pmap object.
2237  */
2238 struct pmap *
2239 pmap_create(void)
2240 {
2241 	struct pmap *pmap;
2242 	int i;
2243 
2244 	pmap = pool_cache_get(&pmap_cache, PR_WAITOK);
2245 
2246 	/* init uvm_object */
2247 	for (i = 0; i < PTP_LEVELS - 1; i++) {
2248 		mutex_init(&pmap->pm_obj_lock[i], MUTEX_DEFAULT, IPL_NONE);
2249 		uvm_obj_init(&pmap->pm_obj[i], NULL, false, 1);
2250 		uvm_obj_setlock(&pmap->pm_obj[i], &pmap->pm_obj_lock[i]);
2251 		pmap->pm_ptphint[i] = NULL;
2252 	}
2253 	pmap->pm_stats.wired_count = 0;
	/* count the PDP allocated below */
2255 	pmap->pm_stats.resident_count = PDP_SIZE;
2256 #if !defined(__x86_64__)
2257 	pmap->pm_hiexec = 0;
2258 #endif /* !defined(__x86_64__) */
2259 	pmap->pm_flags = 0;
2260 	pmap->pm_gc_ptp = NULL;
2261 
2262 	kcpuset_create(&pmap->pm_cpus, true);
2263 	kcpuset_create(&pmap->pm_kernel_cpus, true);
2264 #ifdef XEN
2265 	kcpuset_create(&pmap->pm_xen_ptp_cpus, true);
2266 #endif
2267 	/* init the LDT */
2268 	pmap->pm_ldt = NULL;
2269 	pmap->pm_ldt_len = 0;
2270 	pmap->pm_ldt_sel = GSYSSEL(GLDT_SEL, SEL_KPL);
2271 
2272 	/* allocate PDP */
2273  try_again:
2274 	pmap->pm_pdir = pool_cache_get(&pmap_pdp_cache, PR_WAITOK);
2275 
2276 	mutex_enter(&pmaps_lock);
2277 
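	/*
	 * If the top-most kernel slot of this (possibly cached) PDP is
	 * still zero, it was constructed before the kernel's page tables
	 * last grew; destruct it and get a freshly constructed one.
	 */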
2278 	if (pmap->pm_pdir[PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1] - 1] == 0) {
2279 		mutex_exit(&pmaps_lock);
2280 		pool_cache_destruct_object(&pmap_pdp_cache, pmap->pm_pdir);
2281 		goto try_again;
2282 	}
2283 
2284 	for (i = 0; i < PDP_SIZE; i++)
2285 		pmap->pm_pdirpa[i] =
2286 		    pmap_pte2pa(pmap->pm_pdir[PDIR_SLOT_PTE + i]);
2287 
2288 	LIST_INSERT_HEAD(&pmaps, pmap, pm_list);
2289 
2290 	mutex_exit(&pmaps_lock);
2291 
2292 	return (pmap);
2293 }
2294 
2295 /*
2296  * pmap_free_ptps: put a list of ptps back to the freelist.
2297  */
2298 
2299 void
2300 pmap_free_ptps(struct vm_page *empty_ptps)
2301 {
2302 	struct vm_page *ptp;
2303 	struct pmap_page *pp;
2304 
2305 	while ((ptp = empty_ptps) != NULL) {
2306 		pp = VM_PAGE_TO_PP(ptp);
2307 		empty_ptps = pp->pp_link;
2308 		LIST_INIT(&pp->pp_head.pvh_list);
2309 		uvm_pagefree(ptp);
2310 	}
2311 }
2312 
2313 /*
2314  * pmap_destroy: drop reference count on pmap.   free pmap if
2315  *	reference count goes to zero.
2316  */
2317 
2318 void
2319 pmap_destroy(struct pmap *pmap)
2320 {
2321 	lwp_t *l;
2322 	int i;
2323 
2324 	/*
2325 	 * If we have torn down this pmap, process deferred frees and
2326 	 * invalidations.  Free now if the system is low on memory.
	 * Otherwise, free when the pmap is destroyed, thus avoiding a
2328 	 * TLB shootdown.
2329 	 */
2330 	l = curlwp;
2331 	if (__predict_false(l->l_md.md_gc_pmap == pmap)) {
2332 		if (uvmexp.free < uvmexp.freetarg) {
2333 			pmap_update(pmap);
2334 		} else {
2335 			KASSERT(pmap->pm_gc_ptp == NULL);
2336 			pmap->pm_gc_ptp = l->l_md.md_gc_ptp;
2337 			l->l_md.md_gc_ptp = NULL;
2338 			l->l_md.md_gc_pmap = NULL;
2339 		}
2340 	}
2341 
2342 	/*
2343 	 * drop reference count
2344 	 */
2345 
2346 	if (atomic_dec_uint_nv(&pmap->pm_obj[0].uo_refs) > 0) {
2347 		return;
2348 	}
2349 
2350 #ifdef DIAGNOSTIC
2351 	CPU_INFO_ITERATOR cii;
2352 	struct cpu_info *ci;
2353 
2354 	for (CPU_INFO_FOREACH(cii, ci)) {
2355 		if (ci->ci_pmap == pmap)
2356 			panic("destroying pmap being used");
2357 #if defined(XEN) && defined(__x86_64__)
2358 		for (i = 0; i < PDIR_SLOT_PTE; i++) {
2359 			if (pmap->pm_pdir[i] != 0 &&
2360 			    ci->ci_kpm_pdir[i] == pmap->pm_pdir[i]) {
2361 				printf("pmap_destroy(%p) pmap_kernel %p "
2362 				    "curcpu %d cpu %d ci_pmap %p "
2363 				    "ci->ci_kpm_pdir[%d]=%" PRIx64
2364 				    " pmap->pm_pdir[%d]=%" PRIx64 "\n",
2365 				    pmap, pmap_kernel(), curcpu()->ci_index,
2366 				    ci->ci_index, ci->ci_pmap,
2367 				    i, ci->ci_kpm_pdir[i],
2368 				    i, pmap->pm_pdir[i]);
2369 				panic("pmap_destroy: used pmap");
2370 			}
2371 		}
2372 #endif
2373 	}
2374 #endif /* DIAGNOSTIC */
2375 
2376 	/*
2377 	 * Reference count is zero, free pmap resources and then free pmap.
2378 	 * First, remove it from global list of pmaps.
2379 	 */
2380 
2381 	mutex_enter(&pmaps_lock);
2382 	LIST_REMOVE(pmap, pm_list);
2383 	mutex_exit(&pmaps_lock);
2384 
2385 	/*
2386 	 * Process deferred PTP frees.  No TLB shootdown required, as the
2387 	 * PTP pages are no longer visible to any CPU.
2388 	 */
2389 
2390 	pmap_free_ptps(pmap->pm_gc_ptp);
2391 
2392 	/*
2393 	 * destroyed pmap shouldn't have remaining PTPs
2394 	 */
2395 
2396 	for (i = 0; i < PTP_LEVELS - 1; i++) {
2397 		KASSERT(pmap->pm_obj[i].uo_npages == 0);
2398 		KASSERT(TAILQ_EMPTY(&pmap->pm_obj[i].memq));
2399 	}
2400 
2401 	pool_cache_put(&pmap_pdp_cache, pmap->pm_pdir);
2402 
2403 #ifdef USER_LDT
2404 	if (pmap->pm_ldt != NULL) {
2405 		/*
2406 		 * no need to switch the LDT; this address space is gone,
2407 		 * nothing is using it.
2408 		 *
2409 		 * No need to lock the pmap for ldt_free (or anything else),
2410 		 * we're the last one to use it.
2411 		 */
2412 		mutex_enter(&cpu_lock);
2413 		ldt_free(pmap->pm_ldt_sel);
2414 		mutex_exit(&cpu_lock);
2415 		uvm_km_free(kernel_map, (vaddr_t)pmap->pm_ldt,
2416 		    pmap->pm_ldt_len, UVM_KMF_WIRED);
2417 	}
2418 #endif
2419 
2420 	for (i = 0; i < PTP_LEVELS - 1; i++) {
2421 		uvm_obj_destroy(&pmap->pm_obj[i], false);
2422 		mutex_destroy(&pmap->pm_obj_lock[i]);
2423 	}
2424 	kcpuset_destroy(pmap->pm_cpus);
2425 	kcpuset_destroy(pmap->pm_kernel_cpus);
2426 #ifdef XEN
2427 	kcpuset_destroy(pmap->pm_xen_ptp_cpus);
2428 #endif
2429 	pool_cache_put(&pmap_cache, pmap);
2430 }
2431 
2432 /*
2433  * pmap_remove_all: pmap is being torn down by the current thread.
2434  * avoid unnecessary invalidations.
2435  */
2436 
2437 void
2438 pmap_remove_all(struct pmap *pmap)
2439 {
2440 	lwp_t *l = curlwp;
2441 
2442 	KASSERT(l->l_md.md_gc_pmap == NULL);
2443 
2444 	l->l_md.md_gc_pmap = pmap;
2445 }
2446 
2447 #if defined(PMAP_FORK)
2448 /*
2449  * pmap_fork: perform any necessary data structure manipulation when
2450  * a VM space is forked.
2451  */
2452 
2453 void
2454 pmap_fork(struct pmap *pmap1, struct pmap *pmap2)
2455 {
2456 #ifdef USER_LDT
2457 	union descriptor *new_ldt;
2458 	size_t len;
2459 	int sel;
2460 
2461 	if (__predict_true(pmap1->pm_ldt == NULL)) {
2462 		return;
2463 	}
2464 
2465 	/*
2466 	 * Copy the LDT into the new process.
2467 	 *
2468 	 * Read pmap1's ldt pointer and length unlocked; if it changes
2469 	 * behind our back we'll retry. This will starve if there's a
2470 	 * stream of LDT changes in another thread but that should not
2471 	 * happen.
2472 	 */
2473 
2474  retry:
2475 	if (pmap1->pm_ldt != NULL) {
2476 		len = pmap1->pm_ldt_len;
2477 		/* Allocate space for the new process's LDT */
2478 		new_ldt = (union descriptor *)uvm_km_alloc(kernel_map, len, 0,
2479 		    UVM_KMF_WIRED);
2480 		if (new_ldt == NULL) {
2481 			printf("WARNING: pmap_fork: "
2482 			       "unable to allocate LDT space\n");
2483 			return;
2484 		}
2485 		mutex_enter(&cpu_lock);
2486 		/* Get a GDT slot for it */
2487 		sel = ldt_alloc(new_ldt, len);
2488 		if (sel == -1) {
2489 			mutex_exit(&cpu_lock);
2490 			uvm_km_free(kernel_map, (vaddr_t)new_ldt, len,
2491 			    UVM_KMF_WIRED);
2492 			printf("WARNING: pmap_fork: "
2493 			       "unable to allocate LDT selector\n");
2494 			return;
2495 		}
2496 	} else {
2497 		/* Wasn't anything there after all. */
2498 		len = -1;
2499 		new_ldt = NULL;
2500 		sel = -1;
2501 		mutex_enter(&cpu_lock);
2502 	}
2503 
	/* If there's still something there now that we have cpu_lock... */
	if (pmap1->pm_ldt != NULL) {
2506 		if (len != pmap1->pm_ldt_len) {
2507 			/* Oops, it changed. Drop what we did and try again */
2508 			if (len != -1) {
2509 				ldt_free(sel);
2510 				uvm_km_free(kernel_map, (vaddr_t)new_ldt,
2511 				    len, UVM_KMF_WIRED);
2512 			}
2513 			mutex_exit(&cpu_lock);
2514 			goto retry;
2515 		}
2516 
2517 		/* Copy the LDT data and install it in pmap2 */
2518 		memcpy(new_ldt, pmap1->pm_ldt, len);
2519 		pmap2->pm_ldt = new_ldt;
2520 		pmap2->pm_ldt_len = pmap1->pm_ldt_len;
2521 		pmap2->pm_ldt_sel = sel;
2522 		len = -1;
2523 	}
2524 
2525 	if (len != -1) {
		/* The LDT disappeared before we took cpu_lock, so mop up. */
2527 		ldt_free(sel);
2528 		mutex_exit(&cpu_lock);
2529 		uvm_km_free(kernel_map, (vaddr_t)new_ldt, len,
2530 		    UVM_KMF_WIRED);
2531 	} else {
2532 		mutex_exit(&cpu_lock);
2533 	}
2534 #endif /* USER_LDT */
2535 }
2536 #endif /* PMAP_FORK */
2537 
2538 #ifdef USER_LDT
2539 
2540 /*
2541  * pmap_ldt_xcall: cross call used by pmap_ldt_sync.  if the named pmap
2542  * is active, reload LDTR.
2543  */
2544 static void
2545 pmap_ldt_xcall(void *arg1, void *arg2)
2546 {
2547 	struct pmap *pm;
2548 
2549 	kpreempt_disable();
2550 	pm = arg1;
2551 	if (curcpu()->ci_pmap == pm) {
2552 		lldt(pm->pm_ldt_sel);
2553 	}
2554 	kpreempt_enable();
2555 }
2556 
2557 /*
2558  * pmap_ldt_sync: LDT selector for the named pmap is changing.  swap
2559  * in the new selector on all CPUs.
2560  */
2561 void
2562 pmap_ldt_sync(struct pmap *pm)
2563 {
2564 	uint64_t where;
2565 
2566 	KASSERT(mutex_owned(&cpu_lock));
2567 
2568 	pmap_ldt_evcnt.ev_count++;
2569 	where = xc_broadcast(0, pmap_ldt_xcall, pm, NULL);
2570 	xc_wait(where);
2571 }
2572 
2573 /*
2574  * pmap_ldt_cleanup: if the pmap has a local LDT, deallocate it, and
2575  * restore the default.
2576  */
2577 
2578 void
2579 pmap_ldt_cleanup(struct lwp *l)
2580 {
2581 	pmap_t pmap = l->l_proc->p_vmspace->vm_map.pmap;
2582 	union descriptor *dp = NULL;
2583 	size_t len = 0;
2584 	int sel = -1;
2585 
2586 	if (__predict_true(pmap->pm_ldt == NULL)) {
2587 		return;
2588 	}
2589 
2590 	mutex_enter(&cpu_lock);
2591 	if (pmap->pm_ldt != NULL) {
2592 		sel = pmap->pm_ldt_sel;
2593 		dp = pmap->pm_ldt;
2594 		len = pmap->pm_ldt_len;
2595 		pmap->pm_ldt_sel = GSYSSEL(GLDT_SEL, SEL_KPL);
2596 		pmap->pm_ldt = NULL;
2597 		pmap->pm_ldt_len = 0;
2598 		pmap_ldt_sync(pmap);
2599 		ldt_free(sel);
2600 		uvm_km_free(kernel_map, (vaddr_t)dp, len, UVM_KMF_WIRED);
2601 	}
2602 	mutex_exit(&cpu_lock);
2603 }
2604 #endif /* USER_LDT */
2605 
2606 /*
2607  * pmap_activate: activate a process' pmap
2608  *
2609  * => must be called with kernel preemption disabled
2610  * => if lwp is the curlwp, then set ci_want_pmapload so that
2611  *    actual MMU context switch will be done by pmap_load() later
2612  */
2613 
2614 void
2615 pmap_activate(struct lwp *l)
2616 {
2617 	struct cpu_info *ci;
2618 	struct pmap *pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map);
2619 
2620 	KASSERT(kpreempt_disabled());
2621 
2622 	ci = curcpu();
2623 
2624 	if (l == ci->ci_curlwp) {
2625 		KASSERT(ci->ci_want_pmapload == 0);
2626 		KASSERT(ci->ci_tlbstate != TLBSTATE_VALID);
2627 
2628 		/*
2629 		 * no need to switch to kernel vmspace because
2630 		 * it's a subset of any vmspace.
2631 		 */
2632 
2633 		if (pmap == pmap_kernel()) {
2634 			ci->ci_want_pmapload = 0;
2635 			return;
2636 		}
2637 
2638 		ci->ci_want_pmapload = 1;
2639 	}
2640 }
2641 
2642 /*
2643  * pmap_reactivate: try to regain reference to the pmap.
2644  *
2645  * => Must be called with kernel preemption disabled.
2646  */
2647 
2648 static bool
2649 pmap_reactivate(struct pmap *pmap)
2650 {
2651 	struct cpu_info * const ci = curcpu();
2652 	const cpuid_t cid = cpu_index(ci);
2653 	bool result;
2654 
2655 	KASSERT(kpreempt_disabled());
2656 #if defined(XEN) && defined(__x86_64__)
2657 	KASSERT(pmap_pdirpa(pmap, 0) == ci->ci_xen_current_user_pgd);
2658 #elif defined(PAE)
2659 	KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(ci->ci_pae_l3_pdir[0]));
2660 #elif !defined(XEN)
2661 	KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(rcr3()));
2662 #endif
2663 
2664 	/*
2665 	 * If we still have a lazy reference to this pmap, we can assume
2666 	 * that there was no TLB shootdown for this pmap in the meantime.
2667 	 *
2668 	 * The order of events here is important as we must synchronize
2669 	 * with TLB shootdown interrupts.  Declare interest in invalidations
2670 	 * (TLBSTATE_VALID) and then check the CPU set, which the IPIs can
2671 	 * change only when the state is TLBSTATE_LAZY.
2672 	 */
2673 
2674 	ci->ci_tlbstate = TLBSTATE_VALID;
2675 	KASSERT(kcpuset_isset(pmap->pm_kernel_cpus, cid));
2676 
2677 	if (kcpuset_isset(pmap->pm_cpus, cid)) {
2678 		/* We have the reference, state is valid. */
2679 		result = true;
2680 	} else {
2681 		/* Must reload the TLB. */
2682 		kcpuset_atomic_set(pmap->pm_cpus, cid);
2683 		result = false;
2684 	}
2685 	return result;
2686 }
2687 
2688 /*
2689  * pmap_load: perform the actual pmap switch, i.e. fill in %cr3 register
2690  * and relevant LDT info.
2691  *
2692  * Ensures that the current process' pmap is loaded on the current CPU's
2693  * MMU and that there are no stale TLB entries.
2694  *
2695  * => The caller should disable kernel preemption or do check-and-retry
2696  *    to prevent a preemption from undoing our efforts.
2697  * => This function may block.
2698  */
2699 void
2700 pmap_load(void)
2701 {
2702 	struct cpu_info *ci;
2703 	struct pmap *pmap, *oldpmap;
2704 	struct lwp *l;
2705 	struct pcb *pcb;
2706 	cpuid_t cid;
2707 	uint64_t ncsw;
2708 
2709 	kpreempt_disable();
2710  retry:
2711 	ci = curcpu();
2712 	if (!ci->ci_want_pmapload) {
2713 		kpreempt_enable();
2714 		return;
2715 	}
2716 	l = ci->ci_curlwp;
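	/*
	 * Snapshot the context switch count; if it changes because we
	 * block below, the whole sequence is redone (see the check at
	 * the bottom of this function).
	 */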
2717 	ncsw = l->l_ncsw;
2718 
2719 	/* should be able to take ipis. */
2720 	KASSERT(ci->ci_ilevel < IPL_HIGH);
2721 #ifdef XEN
	/* Check to see if interrupts are enabled (i.e., no events are masked) */
2723 	KASSERT(x86_read_psl() == 0);
2724 #else
2725 	KASSERT((x86_read_psl() & PSL_I) != 0);
2726 #endif
2727 
2728 	KASSERT(l != NULL);
2729 	pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map);
2730 	KASSERT(pmap != pmap_kernel());
2731 	oldpmap = ci->ci_pmap;
2732 	pcb = lwp_getpcb(l);
2733 
2734 	if (pmap == oldpmap) {
2735 		if (!pmap_reactivate(pmap)) {
2736 			u_int gen = uvm_emap_gen_return();
2737 
2738 			/*
			 * The pmap was changed while it was deactivated;
			 * our TLB may be stale.
2741 			 */
2742 
2743 			tlbflush();
2744 			uvm_emap_update(gen);
2745 		}
2746 
2747 		ci->ci_want_pmapload = 0;
2748 		kpreempt_enable();
2749 		return;
2750 	}
2751 
2752 	/*
2753 	 * Acquire a reference to the new pmap and perform the switch.
2754 	 */
2755 
2756 	pmap_reference(pmap);
2757 
2758 	cid = cpu_index(ci);
2759 	kcpuset_atomic_clear(oldpmap->pm_cpus, cid);
2760 	kcpuset_atomic_clear(oldpmap->pm_kernel_cpus, cid);
2761 
2762 #if defined(XEN) && defined(__x86_64__)
2763 	KASSERT(pmap_pdirpa(oldpmap, 0) == ci->ci_xen_current_user_pgd ||
2764 	    oldpmap == pmap_kernel());
2765 #elif defined(PAE)
2766 	KASSERT(pmap_pdirpa(oldpmap, 0) == pmap_pte2pa(ci->ci_pae_l3_pdir[0]));
2767 #elif !defined(XEN)
2768 	KASSERT(pmap_pdirpa(oldpmap, 0) == pmap_pte2pa(rcr3()));
2769 #endif
2770 	KASSERT(!kcpuset_isset(pmap->pm_cpus, cid));
2771 	KASSERT(!kcpuset_isset(pmap->pm_kernel_cpus, cid));
2772 
2773 	/*
2774 	 * Mark the pmap in use by this CPU.  Again, we must synchronize
2775 	 * with TLB shootdown interrupts, so set the state VALID first,
2776 	 * then register us for shootdown events on this pmap.
2777 	 */
2778 	ci->ci_tlbstate = TLBSTATE_VALID;
2779 	kcpuset_atomic_set(pmap->pm_cpus, cid);
2780 	kcpuset_atomic_set(pmap->pm_kernel_cpus, cid);
2781 	ci->ci_pmap = pmap;
2782 
2783 	/*
2784 	 * update tss.  now that we have registered for invalidations
2785 	 * from other CPUs, we're good to load the page tables.
2786 	 */
2787 #ifdef PAE
2788 	pcb->pcb_cr3 = ci->ci_pae_l3_pdirpa;
2789 #else
2790 	pcb->pcb_cr3 = pmap_pdirpa(pmap, 0);
2791 #endif
2792 
2793 #ifdef i386
2794 #ifndef XEN
2795 	ci->ci_tss.tss_ldt = pmap->pm_ldt_sel;
2796 	ci->ci_tss.tss_cr3 = pcb->pcb_cr3;
2797 #endif /* !XEN */
2798 #endif /* i386 */
2799 
2800 	lldt(pmap->pm_ldt_sel);
2801 
2802 	u_int gen = uvm_emap_gen_return();
2803 	cpu_load_pmap(pmap, oldpmap);
2804 	uvm_emap_update(gen);
2805 
2806 	ci->ci_want_pmapload = 0;
2807 
2808 	/*
2809 	 * we're now running with the new pmap.  drop the reference
2810 	 * to the old pmap.  if we block, we need to go around again.
2811 	 */
2812 
2813 	pmap_destroy(oldpmap);
2814 	if (l->l_ncsw != ncsw) {
2815 		goto retry;
2816 	}
2817 
2818 	kpreempt_enable();
2819 }
2820 
2821 /*
2822  * pmap_deactivate: deactivate a process' pmap.
2823  *
2824  * => Must be called with kernel preemption disabled (high IPL is enough).
2825  */
2826 void
2827 pmap_deactivate(struct lwp *l)
2828 {
2829 	struct pmap *pmap;
2830 	struct cpu_info *ci;
2831 
2832 	KASSERT(kpreempt_disabled());
2833 
2834 	if (l != curlwp) {
2835 		return;
2836 	}
2837 
2838 	/*
2839 	 * Wait for pending TLB shootdowns to complete.  Necessary because
2840 	 * TLB shootdown state is per-CPU, and the LWP may be coming off
	 * the CPU before it has a chance to call pmap_update(), e.g. due
	 * to kernel preemption or a blocking routine in between.
2843 	 */
2844 	pmap_tlb_shootnow();
2845 
2846 	ci = curcpu();
2847 
2848 	if (ci->ci_want_pmapload) {
2849 		/*
2850 		 * ci_want_pmapload means that our pmap is not loaded on
		 * the CPU, or the TLB might be stale.  note that pmap_kernel()
2852 		 * is always considered loaded.
2853 		 */
2854 		KASSERT(vm_map_pmap(&l->l_proc->p_vmspace->vm_map)
2855 		    != pmap_kernel());
2856 		KASSERT(vm_map_pmap(&l->l_proc->p_vmspace->vm_map)
2857 		    != ci->ci_pmap || ci->ci_tlbstate != TLBSTATE_VALID);
2858 
2859 		/*
2860 		 * userspace has not been touched.
2861 		 * nothing to do here.
2862 		 */
2863 
2864 		ci->ci_want_pmapload = 0;
2865 		return;
2866 	}
2867 
2868 	pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map);
2869 
2870 	if (pmap == pmap_kernel()) {
2871 		return;
2872 	}
2873 
2874 #if defined(XEN) && defined(__x86_64__)
2875 	KASSERT(pmap_pdirpa(pmap, 0) == ci->ci_xen_current_user_pgd);
2876 #elif defined(PAE)
2877 	KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(ci->ci_pae_l3_pdir[0]));
2878 #elif !defined(XEN)
2879 	KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(rcr3()));
2880 #endif
2881 	KASSERT(ci->ci_pmap == pmap);
2882 
2883 	/*
2884 	 * we aren't interested in TLB invalidations for this pmap,
2885 	 * at least for the time being.
2886 	 */
2887 
2888 	KASSERT(ci->ci_tlbstate == TLBSTATE_VALID);
2889 	ci->ci_tlbstate = TLBSTATE_LAZY;
2890 }
2891 
2892 /*
2893  * end of lifecycle functions
2894  */
2895 
2896 /*
2897  * some misc. functions
2898  */
2899 
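/*
 * pmap_pdes_invalid: check that the page directory hierarchy covering
 *	va is present.  Returns 0 if every level is valid (optionally
 *	returning the lowest-level PDE via lastpde), otherwise the level
 *	at which the first invalid PDE was found.
 */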
2900 int
2901 pmap_pdes_invalid(vaddr_t va, pd_entry_t * const *pdes, pd_entry_t *lastpde)
2902 {
2903 	int i;
2904 	unsigned long index;
2905 	pd_entry_t pde;
2906 
2907 	for (i = PTP_LEVELS; i > 1; i--) {
2908 		index = pl_i(va, i);
2909 		pde = pdes[i - 2][index];
2910 		if ((pde & PG_V) == 0)
2911 			return i;
2912 	}
2913 	if (lastpde != NULL)
2914 		*lastpde = pde;
2915 	return 0;
2916 }
2917 
2918 /*
2919  * pmap_extract: extract a PA for the given VA
2920  */
2921 
2922 bool
2923 pmap_extract(struct pmap *pmap, vaddr_t va, paddr_t *pap)
2924 {
2925 	pt_entry_t *ptes, pte;
2926 	pd_entry_t pde;
2927 	pd_entry_t * const *pdes;
2928 	struct pmap *pmap2;
2929 	struct cpu_info *ci;
2930 	paddr_t pa;
2931 	lwp_t *l;
2932 	bool hard, rv;
2933 
2934 #ifdef __HAVE_DIRECT_MAP
2935 	if (va >= PMAP_DIRECT_BASE && va < PMAP_DIRECT_END) {
2936 		if (pap != NULL) {
2937 			*pap = va - PMAP_DIRECT_BASE;
2938 		}
2939 		return true;
2940 	}
2941 #endif
2942 
2943 	rv = false;
2944 	pa = 0;
2945 	l = curlwp;
2946 
2947 	kpreempt_disable();
2948 	ci = l->l_cpu;
2949 	if (__predict_true(!ci->ci_want_pmapload && ci->ci_pmap == pmap) ||
2950 	    pmap == pmap_kernel()) {
2951 		/*
2952 		 * no need to lock, because it's pmap_kernel() or our
2953 		 * own pmap and is active.  if a user pmap, the caller
2954 		 * will hold the vm_map write/read locked and so prevent
2955 		 * entries from disappearing while we are here.  ptps
2956 		 * can disappear via pmap_remove() and pmap_protect(),
2957 		 * but they are called with the vm_map write locked.
2958 		 */
2959 		hard = false;
2960 		ptes = PTE_BASE;
2961 		pdes = normal_pdes;
2962 	} else {
2963 		/* we lose, do it the hard way. */
2964 		hard = true;
2965 		pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
2966 	}
2967 	if (pmap_pdes_valid(va, pdes, &pde)) {
2968 		pte = ptes[pl1_i(va)];
2969 		if (pde & PG_PS) {
2970 			pa = (pde & PG_LGFRAME) | (va & (NBPD_L2 - 1));
2971 			rv = true;
2972 		} else if (__predict_true((pte & PG_V) != 0)) {
2973 			pa = pmap_pte2pa(pte) | (va & (NBPD_L1 - 1));
2974 			rv = true;
2975 		}
2976 	}
2977 	if (__predict_false(hard)) {
2978 		pmap_unmap_ptes(pmap, pmap2);
2979 	}
2980 	kpreempt_enable();
2981 	if (pap != NULL) {
2982 		*pap = pa;
2983 	}
2984 	return rv;
2985 }
2986 
2987 
2988 /*
2989  * vtophys: virtual address to physical address.  For use by
2990  * machine-dependent code only.
2991  */
2992 
2993 paddr_t
2994 vtophys(vaddr_t va)
2995 {
2996 	paddr_t pa;
2997 
2998 	if (pmap_extract(pmap_kernel(), va, &pa) == true)
2999 		return (pa);
3000 	return (0);
3001 }
3002 
3003 __strict_weak_alias(pmap_extract_ma, pmap_extract);
3004 
3005 #ifdef XEN
3006 
3007 /*
3008  * vtomach: virtual address to machine address.  For use by
3009  * machine-dependent code only.
3010  */
3011 
3012 paddr_t
3013 vtomach(vaddr_t va)
3014 {
3015 	paddr_t pa;
3016 
3017 	if (pmap_extract_ma(pmap_kernel(), va, &pa) == true)
3018 		return (pa);
3019 	return (0);
3020 }
3021 
3022 #endif /* XEN */
3023 
3024 /*
3025  * pmap_virtual_space: used during bootup [pmap_steal_memory] to
3026  *	determine the bounds of the kernel virtual addess space.
3027  */
3028 
3029 void
3030 pmap_virtual_space(vaddr_t *startp, vaddr_t *endp)
3031 {
3032 	*startp = virtual_avail;
3033 	*endp = virtual_end;
3034 }
3035 
3036 /*
3037  * pmap_zero_page: zero a page
3038  */
3039 
3040 void
3041 pmap_zero_page(paddr_t pa)
3042 {
3043 #if defined(__HAVE_DIRECT_MAP)
3044 	pagezero(PMAP_DIRECT_MAP(pa));
3045 #else
3046 #if defined(XEN)
	if (XEN_VERSION_SUPPORTED(3, 4)) {
		xen_pagezero(pa);
		return;
	}
3049 #endif
3050 	pt_entry_t *zpte;
3051 	void *zerova;
3052 	int id;
3053 
3054 	const pd_entry_t pteflags = PG_V | PG_RW | pmap_pg_nx | PG_M | PG_U |
3055 	    PG_k;
3056 
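	/*
	 * No direct map: borrow this CPU's private zero-page window
	 * (zero_pte/zerop), map the page there, clear it, and unmap.
	 */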
3057 	kpreempt_disable();
3058 	id = cpu_number();
3059 	zpte = PTESLEW(zero_pte, id);
3060 	zerova = VASLEW(zerop, id);
3061 
3062 #ifdef DIAGNOSTIC
3063 	if (*zpte)
3064 		panic("pmap_zero_page: lock botch");
3065 #endif
3066 
3067 	pmap_pte_set(zpte, pmap_pa2pte(pa) | pteflags);
3068 	pmap_pte_flush();
3069 	pmap_update_pg((vaddr_t)zerova);		/* flush TLB */
3070 
3071 	memset(zerova, 0, PAGE_SIZE);
3072 
3073 #if defined(DIAGNOSTIC) || defined(XEN)
3074 	pmap_pte_set(zpte, 0);				/* zap ! */
3075 	pmap_pte_flush();
3076 #endif
3077 
3078 	kpreempt_enable();
3079 #endif /* defined(__HAVE_DIRECT_MAP) */
3080 }
3081 
3082 /*
 * pmap_pageidlezero: the same, for the idle loop page zero'er.
3084  * Returns true if the page was zero'd, false if we aborted for
3085  * some reason.
3086  */
3087 
3088 bool
3089 pmap_pageidlezero(paddr_t pa)
3090 {
3091 #ifdef __HAVE_DIRECT_MAP
3092 	KASSERT(cpu_feature[0] & CPUID_SSE2);
3093 	return sse2_idlezero_page((void *)PMAP_DIRECT_MAP(pa));
3094 #else
3095 	pt_entry_t *zpte;
3096 	void *zerova;
3097 	bool rv;
3098 	int id;
3099 
3100 	const pd_entry_t pteflags = PG_V | PG_RW | pmap_pg_nx | PG_M | PG_U |
3101 	    PG_k;
3102 
3103 	id = cpu_number();
3104 	zpte = PTESLEW(zero_pte, id);
3105 	zerova = VASLEW(zerop, id);
3106 
3107 	KASSERT(cpu_feature[0] & CPUID_SSE2);
3108 	KASSERT(*zpte == 0);
3109 
3110 	pmap_pte_set(zpte, pmap_pa2pte(pa) | pteflags);
3111 	pmap_pte_flush();
3112 	pmap_update_pg((vaddr_t)zerova);		/* flush TLB */
3113 
3114 	rv = sse2_idlezero_page(zerova);
3115 
3116 #if defined(DIAGNOSTIC) || defined(XEN)
3117 	pmap_pte_set(zpte, 0);				/* zap ! */
3118 	pmap_pte_flush();
3119 #endif
3120 
3121 	return rv;
3122 #endif
3123 }
3124 
3125 /*
3126  * pmap_copy_page: copy a page
3127  */
3128 
3129 void
3130 pmap_copy_page(paddr_t srcpa, paddr_t dstpa)
3131 {
3132 #if defined(__HAVE_DIRECT_MAP)
3133 	vaddr_t srcva = PMAP_DIRECT_MAP(srcpa);
3134 	vaddr_t dstva = PMAP_DIRECT_MAP(dstpa);
3135 
3136 	memcpy((void *)dstva, (void *)srcva, PAGE_SIZE);
3137 #else
3138 #if defined(XEN)
3139 	if (XEN_VERSION_SUPPORTED(3, 4)) {
3140 		xen_copy_page(srcpa, dstpa);
3141 		return;
3142 	}
3143 #endif
3144 	pt_entry_t *spte;
3145 	pt_entry_t *dpte;
3146 	void *csrcva;
3147 	void *cdstva;
3148 	int id;
3149 
3150 	const pd_entry_t pteflags = PG_V | PG_RW | pmap_pg_nx | PG_U | PG_k;
3151 
3152 	kpreempt_disable();
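	/*
	 * No direct map: map source and destination through this CPU's
	 * private copy windows (csrcp/cdstp) and copy with memcpy.
	 */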
3153 	id = cpu_number();
	spte = PTESLEW(csrc_pte, id);
	dpte = PTESLEW(cdst_pte, id);
3156 	csrcva = VASLEW(csrcp, id);
3157 	cdstva = VASLEW(cdstp, id);
3158 
3159 	KASSERT(*spte == 0 && *dpte == 0);
3160 
3161 	pmap_pte_set(spte, pmap_pa2pte(srcpa) | pteflags);
3162 	pmap_pte_set(dpte, pmap_pa2pte(dstpa) | pteflags | PG_M);
3163 	pmap_pte_flush();
3164 	pmap_update_2pg((vaddr_t)csrcva, (vaddr_t)cdstva);
3165 
3166 	memcpy(cdstva, csrcva, PAGE_SIZE);
3167 
3168 #if defined(DIAGNOSTIC) || defined(XEN)
3169 	pmap_pte_set(spte, 0);
3170 	pmap_pte_set(dpte, 0);
3171 	pmap_pte_flush();
3172 #endif
3173 
3174 	kpreempt_enable();
3175 #endif /* defined(__HAVE_DIRECT_MAP) */
3176 }
3177 
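/*
 * pmap_map_ptp: map a PTP into KVA so its PTEs can be examined.  With a
 *	direct map the page is already addressable; otherwise borrow this
 *	CPU's private ptp_pte/ptpp window (preemption must be disabled).
 */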
3178 static pt_entry_t *
3179 pmap_map_ptp(struct vm_page *ptp)
3180 {
3181 #ifdef __HAVE_DIRECT_MAP
3182 	return (void *)PMAP_DIRECT_MAP(VM_PAGE_TO_PHYS(ptp));
3183 #else
3184 	pt_entry_t *ptppte;
3185 	void *ptpva;
3186 	int id;
3187 
3188 	KASSERT(kpreempt_disabled());
3189 
3190 #ifndef XEN
3191 	const pd_entry_t pteflags = PG_V | PG_RW | pmap_pg_nx | PG_U | PG_M |
3192 	    PG_k;
3193 #else
3194 	const pd_entry_t pteflags = PG_V | pmap_pg_nx | PG_U | PG_M | PG_k;
3195 #endif
3196 
3197 	id = cpu_number();
3198 	ptppte = PTESLEW(ptp_pte, id);
3199 	ptpva = VASLEW(ptpp, id);
3200 	pmap_pte_set(ptppte, pmap_pa2pte(VM_PAGE_TO_PHYS(ptp)) | pteflags);
3201 
3202 	pmap_pte_flush();
3203 	pmap_update_pg((vaddr_t)ptpva);
3204 
3205 	return (pt_entry_t *)ptpva;
3206 #endif
3207 }
3208 
3209 static void
3210 pmap_unmap_ptp(void)
3211 {
3212 #ifndef __HAVE_DIRECT_MAP
3213 #if defined(DIAGNOSTIC) || defined(XEN)
3214 	pt_entry_t *pte;
3215 
3216 	KASSERT(kpreempt_disabled());
3217 
3218 	pte = PTESLEW(ptp_pte, cpu_number());
3219 	if (*pte != 0) {
3220 		pmap_pte_set(pte, 0);
3221 		pmap_pte_flush();
3222 	}
3223 #endif
3224 #endif
3225 }
3226 
3227 static pt_entry_t *
3228 pmap_map_pte(struct pmap *pmap, struct vm_page *ptp, vaddr_t va)
3229 {
3230 
3231 	KASSERT(kpreempt_disabled());
3232 	if (pmap_is_curpmap(pmap)) {
3233 		return &PTE_BASE[pl1_i(va)]; /* (k)vtopte */
3234 	}
3235 	KASSERT(ptp != NULL);
3236 	return pmap_map_ptp(ptp) + pl1_pi(va);
3237 }
3238 
3239 static void
3240 pmap_unmap_pte(void)
3241 {
3242 
3243 	KASSERT(kpreempt_disabled());
3244 
3245 	pmap_unmap_ptp();
3246 }
3247 
3248 /*
3249  * p m a p   r e m o v e   f u n c t i o n s
3250  *
3251  * functions that remove mappings
3252  */
3253 
3254 /*
3255  * pmap_remove_ptes: remove PTEs from a PTP
3256  *
3257  * => caller must hold pmap's lock
3258  * => PTP must be mapped into KVA
3259  * => PTP should be null if pmap == pmap_kernel()
3260  * => must be called with kernel preemption disabled
 * => TLB shootdowns are issued by pmap_remove_pte() as needed
3262  */
3263 
3264 static void
3265 pmap_remove_ptes(struct pmap *pmap, struct vm_page *ptp, vaddr_t ptpva,
3266 		 vaddr_t startva, vaddr_t endva, struct pv_entry **pv_tofree)
3267 {
3268 	pt_entry_t *pte = (pt_entry_t *)ptpva;
3269 
3270 	KASSERT(pmap == pmap_kernel() || mutex_owned(pmap->pm_lock));
3271 	KASSERT(kpreempt_disabled());
3272 
3273 	/*
3274 	 * note that ptpva points to the PTE that maps startva.   this may
3275 	 * or may not be the first PTE in the PTP.
3276 	 *
3277 	 * we loop through the PTP while there are still PTEs to look at
3278 	 * and the wire_count is greater than 1 (because we use the wire_count
3279 	 * to keep track of the number of real PTEs in the PTP).
3280 	 */
3281 	while (startva < endva && (ptp == NULL || ptp->wire_count > 1)) {
3282 		(void)pmap_remove_pte(pmap, ptp, pte, startva, pv_tofree);
3283 		startva += PAGE_SIZE;
3284 		pte++;
3285 	}
3286 }
3287 
3288 
3289 /*
3290  * pmap_remove_pte: remove a single PTE from a PTP.
3291  *
3292  * => caller must hold pmap's lock
3293  * => PTP must be mapped into KVA
3294  * => PTP should be null if pmap == pmap_kernel()
3295  * => returns true if we removed a mapping
3296  * => must be called with kernel preemption disabled
3297  */
3298 static bool
3299 pmap_remove_pte(struct pmap *pmap, struct vm_page *ptp, pt_entry_t *pte,
3300 		vaddr_t va, struct pv_entry **pv_tofree)
3301 {
3302 	struct pv_entry *pve;
3303 	struct vm_page *pg;
3304 	struct pmap_page *pp;
3305 	pt_entry_t opte;
3306 
3307 	KASSERT(pmap == pmap_kernel() || mutex_owned(pmap->pm_lock));
3308 	KASSERT(kpreempt_disabled());
3309 
3310 	if (!pmap_valid_entry(*pte)) {
3311 		/* VA not mapped. */
3312 		return false;
3313 	}
3314 
3315 	/* Atomically save the old PTE and zap it. */
3316 	opte = pmap_pte_testset(pte, 0);
3317 	if (!pmap_valid_entry(opte)) {
3318 		return false;
3319 	}
3320 
3321 	pmap_exec_account(pmap, va, opte, 0);
3322 	pmap_stats_update_bypte(pmap, 0, opte);
3323 
3324 	if (ptp) {
3325 		/*
3326 		 * Dropping a PTE.  Make sure that the PDE is flushed.
3327 		 */
3328 		ptp->wire_count--;
3329 		if (ptp->wire_count <= 1) {
3330 			opte |= PG_U;
3331 		}
3332 	}
3333 
3334 	if ((opte & PG_U) != 0) {
3335 		pmap_tlb_shootdown(pmap, va, opte, TLBSHOOT_REMOVE_PTE);
3336 	}
3337 
3338 	/*
	 * If we are not on a pv_head list, we are done.
3340 	 */
3341 	if ((opte & PG_PVLIST) == 0) {
3342 #if defined(DIAGNOSTIC) && !defined(DOM0OPS)
3343 		if (PHYS_TO_VM_PAGE(pmap_pte2pa(opte)) != NULL ||
3344 		    pmap_pv_tracked(pmap_pte2pa(opte)) != NULL)
3345 			panic("pmap_remove_pte: managed or pv-tracked page"
3346 			    " without PG_PVLIST for %#"PRIxVADDR, va);
3347 #endif
3348 		return true;
3349 	}
3350 
3351 	if ((pg = PHYS_TO_VM_PAGE(pmap_pte2pa(opte))) != NULL) {
3352 		KASSERT(uvm_page_locked_p(pg));
3353 		pp = VM_PAGE_TO_PP(pg);
3354 	} else if ((pp = pmap_pv_tracked(pmap_pte2pa(opte))) == NULL) {
3355 		paddr_t pa = pmap_pte2pa(opte);
3356 		panic("pmap_remove_pte: PG_PVLIST with pv-untracked page"
3357 		    " va = 0x%"PRIxVADDR
3358 		    " pa = 0x%"PRIxPADDR" (0x%"PRIxPADDR")",
3359 		    va, pa, atop(pa));
3360 	}
3361 
3362 	/* Sync R/M bits. */
3363 	pp->pp_attrs |= opte;
3364 	pve = pmap_remove_pv(pp, ptp, va);
3365 
3366 	if (pve) {
3367 		pve->pve_next = *pv_tofree;
3368 		*pv_tofree = pve;
3369 	}
3370 	return true;
3371 }
3372 
3373 /*
3374  * pmap_remove: mapping removal function.
3375  *
3376  * => caller should not be holding any pmap locks
3377  */
3378 
3379 void
3380 pmap_remove(struct pmap *pmap, vaddr_t sva, vaddr_t eva)
3381 {
3382 	pt_entry_t *ptes;
3383 	pd_entry_t pde;
3384 	pd_entry_t * const *pdes;
3385 	struct pv_entry *pv_tofree = NULL;
3386 	bool result;
3387 	int i;
3388 	paddr_t ptppa;
3389 	vaddr_t blkendva, va = sva;
3390 	struct vm_page *ptp;
3391 	struct pmap *pmap2;
3392 
3393 	kpreempt_disable();
3394 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);	/* locks pmap */
3395 
3396 	/*
3397 	 * removing one page?  take shortcut function.
3398 	 */
3399 
3400 	if (va + PAGE_SIZE == eva) {
3401 		if (pmap_pdes_valid(va, pdes, &pde)) {
3402 
3403 			/* PA of the PTP */
3404 			ptppa = pmap_pte2pa(pde);
3405 
3406 			/* Get PTP if non-kernel mapping. */
3407 			if (pmap != pmap_kernel()) {
3408 				ptp = pmap_find_ptp(pmap, va, ptppa, 1);
3409 				KASSERTMSG(ptp != NULL,
3410 				    "pmap_remove: unmanaged PTP detected");
3411 			} else {
3412 				/* Never free kernel PTPs. */
3413 				ptp = NULL;
3414 			}
3415 
3416 			result = pmap_remove_pte(pmap, ptp,
3417 			    &ptes[pl1_i(va)], va, &pv_tofree);
3418 
3419 			/*
3420 			 * if mapping removed and the PTP is no longer
3421 			 * being used, free it!
3422 			 */
3423 
3424 			if (result && ptp && ptp->wire_count <= 1)
3425 				pmap_free_ptp(pmap, ptp, va, ptes, pdes);
3426 		}
3427 	} else for (/* null */ ; va < eva ; va = blkendva) {
3428 		int lvl;
3429 
3430 		/* determine range of block */
3431 		blkendva = x86_round_pdr(va+1);
3432 		if (blkendva > eva)
3433 			blkendva = eva;
3434 
3435 		/*
3436 		 * Our PTE mappings should never be removed with pmap_remove.
3437 		 *
3438 		 * XXXmaxv: still needed?
3439 		 *
3440 		 * A long term solution is to move the PTEs out of user address
3441 		 * space, and into kernel address space. Then we can set
3442 		 * VM_MAXUSER_ADDRESS to be VM_MAX_ADDRESS.
3443 		 */
3444 		for (i = 0; i < PDP_SIZE; i++) {
3445 			if (pl_i(va, PTP_LEVELS) == PDIR_SLOT_PTE+i)
3446 				panic("PTE space accessed");
3447 		}
3448 
3449 		lvl = pmap_pdes_invalid(va, pdes, &pde);
3450 		if (lvl != 0) {
3451 			/*
3452 			 * skip a range corresponding to an invalid pde.
3453 			 */
3454 			blkendva = (va & ptp_masks[lvl - 1]) + nbpd[lvl - 1];
			continue;
3456 		}
3457 
3458 		/* PA of the PTP */
3459 		ptppa = pmap_pte2pa(pde);
3460 
3461 		/* Get PTP if non-kernel mapping. */
3462 		if (pmap != pmap_kernel()) {
3463 			ptp = pmap_find_ptp(pmap, va, ptppa, 1);
3464 			KASSERTMSG(ptp != NULL,
3465 			    "pmap_remove: unmanaged PTP detected");
3466 		} else {
3467 			/* Never free kernel PTPs. */
3468 			ptp = NULL;
3469 		}
3470 
3471 		pmap_remove_ptes(pmap, ptp, (vaddr_t)&ptes[pl1_i(va)], va,
3472 		    blkendva, &pv_tofree);
3473 
3474 		/* if PTP is no longer being used, free it! */
3475 		if (ptp && ptp->wire_count <= 1) {
3476 			pmap_free_ptp(pmap, ptp, va, ptes, pdes);
3477 		}
3478 	}
3479 	pmap_unmap_ptes(pmap, pmap2);		/* unlock pmap */
3480 	kpreempt_enable();
3481 
3482 	/* Now we free unused PVs */
3483 	if (pv_tofree)
3484 		pmap_free_pvs(pv_tofree);
3485 }
3486 
3487 /*
3488  * pmap_sync_pv: clear pte bits and return the old value of the pte.
3489  *
3490  * => Caller should disable kernel preemption.
3491  * => issues tlb shootdowns if necessary.
3492  */
3493 
3494 static int
3495 pmap_sync_pv(struct pv_pte *pvpte, pt_entry_t expect, int clearbits,
3496     pt_entry_t *optep)
3497 {
3498 	struct pmap *pmap;
3499 	struct vm_page *ptp;
3500 	vaddr_t va;
3501 	pt_entry_t *ptep;
3502 	pt_entry_t opte;
3503 	pt_entry_t npte;
3504 	bool need_shootdown;
3505 
3506 	ptp = pvpte->pte_ptp;
3507 	va = pvpte->pte_va;
3508 	KASSERT(ptp == NULL || ptp->uobject != NULL);
3509 	KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset);
3510 	pmap = ptp_to_pmap(ptp);
3511 
3512 	KASSERT((expect & ~(PG_FRAME | PG_V)) == 0);
3513 	KASSERT((expect & PG_V) != 0);
3514 	KASSERT(clearbits == ~0 || (clearbits & ~(PG_M | PG_U | PG_RW)) == 0);
3515 	KASSERT(kpreempt_disabled());
3516 
3517 	ptep = pmap_map_pte(pmap, ptp, va);
3518 	do {
3519 		opte = *ptep;
3520 		KASSERT((opte & (PG_M | PG_U)) != PG_M);
3521 		KASSERT((opte & (PG_U | PG_V)) != PG_U);
3522 		KASSERT(opte == 0 || (opte & PG_V) != 0);
3523 		if ((opte & (PG_FRAME | PG_V)) != expect) {
3524 
3525 			/*
			 * we lost a race with a V->P operation like
			 * pmap_remove().  wait for the competitor to
			 * finish reflecting the pte bits into pp_attrs.
3529 			 *
3530 			 * issue a redundant TLB shootdown so that
3531 			 * we can wait for its completion.
3532 			 */
3533 
3534 			pmap_unmap_pte();
3535 			if (clearbits != 0) {
3536 				pmap_tlb_shootdown(pmap, va,
3537 				    (pmap == pmap_kernel() ? PG_G : 0),
3538 				    TLBSHOOT_SYNC_PV1);
3539 			}
3540 			return EAGAIN;
3541 		}
3542 
3543 		/*
3544 		 * check if there's anything to do on this pte.
3545 		 */
3546 
3547 		if ((opte & clearbits) == 0) {
3548 			need_shootdown = false;
3549 			break;
3550 		}
3551 
3552 		/*
3553 		 * we need a shootdown if the pte is cached. (PG_U)
3554 		 *
3555 		 * ...unless we are clearing only the PG_RW bit and
3556 		 * it isn't cached as RW. (PG_M)
3557 		 */
3558 
3559 		need_shootdown = (opte & PG_U) != 0 &&
3560 		    !(clearbits == PG_RW && (opte & PG_M) == 0);
3561 
3562 		npte = opte & ~clearbits;
3563 
3564 		/*
3565 		 * if we need a shootdown anyway, clear PG_U and PG_M.
3566 		 */
3567 
3568 		if (need_shootdown) {
3569 			npte &= ~(PG_U | PG_M);
3570 		}
3571 		KASSERT((npte & (PG_M | PG_U)) != PG_M);
3572 		KASSERT((npte & (PG_U | PG_V)) != PG_U);
3573 		KASSERT(npte == 0 || (opte & PG_V) != 0);
3574 	} while (pmap_pte_cas(ptep, opte, npte) != opte);
3575 
3576 	if (need_shootdown) {
3577 		pmap_tlb_shootdown(pmap, va, opte, TLBSHOOT_SYNC_PV2);
3578 	}
3579 	pmap_unmap_pte();
3580 
3581 	*optep = opte;
3582 	return 0;
3583 }
3584 
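/*
 * pmap_pp_remove: remove every mapping of the page described by pp/pa,
 *	syncing referenced/modified bits into pp_attrs along the way.
 *	Common worker for pmap_page_remove() and pmap_pv_remove().
 */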
3585 static void
3586 pmap_pp_remove(struct pmap_page *pp, paddr_t pa)
3587 {
3588 	struct pv_pte *pvpte;
3589 	struct pv_entry *killlist = NULL;
3590 	struct vm_page *ptp;
3591 	pt_entry_t expect;
3592 	int count;
3593 
3594 	expect = pmap_pa2pte(pa) | PG_V;
3595 	count = SPINLOCK_BACKOFF_MIN;
3596 	kpreempt_disable();
3597 startover:
3598 	while ((pvpte = pv_pte_first(pp)) != NULL) {
3599 		struct pmap *pmap;
3600 		struct pv_entry *pve;
3601 		pt_entry_t opte;
3602 		vaddr_t va;
3603 		int error;
3604 
3605 		/*
3606 		 * add a reference to the pmap before clearing the pte.
3607 		 * otherwise the pmap can disappear behind us.
3608 		 */
3609 
3610 		ptp = pvpte->pte_ptp;
3611 		pmap = ptp_to_pmap(ptp);
3612 		if (ptp != NULL) {
3613 			pmap_reference(pmap);
3614 		}
3615 
3616 		error = pmap_sync_pv(pvpte, expect, ~0, &opte);
3617 		if (error == EAGAIN) {
3618 			int hold_count;
3619 			KERNEL_UNLOCK_ALL(curlwp, &hold_count);
3620 			if (ptp != NULL) {
3621 				pmap_destroy(pmap);
3622 			}
3623 			SPINLOCK_BACKOFF(count);
3624 			KERNEL_LOCK(hold_count, curlwp);
3625 			goto startover;
3626 		}
3627 
3628 		pp->pp_attrs |= opte;
3629 		va = pvpte->pte_va;
3630 		pve = pmap_remove_pv(pp, ptp, va);
3631 
3632 		/* update the PTP reference count.  free if last reference. */
3633 		if (ptp != NULL) {
3634 			struct pmap *pmap2;
3635 			pt_entry_t *ptes;
3636 			pd_entry_t * const *pdes;
3637 
3638 			KASSERT(pmap != pmap_kernel());
3639 
3640 			pmap_tlb_shootnow();
3641 			pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
3642 			pmap_stats_update_bypte(pmap, 0, opte);
3643 			ptp->wire_count--;
3644 			if (ptp->wire_count <= 1) {
3645 				pmap_free_ptp(pmap, ptp, va, ptes, pdes);
3646 			}
3647 			pmap_unmap_ptes(pmap, pmap2);
3648 			pmap_destroy(pmap);
3649 		} else {
3650 			KASSERT(pmap == pmap_kernel());
3651 			pmap_stats_update_bypte(pmap, 0, opte);
3652 		}
3653 
3654 		if (pve != NULL) {
3655 			pve->pve_next = killlist;	/* mark it for death */
3656 			killlist = pve;
3657 		}
3658 	}
3659 	pmap_tlb_shootnow();
3660 	kpreempt_enable();
3661 
3662 	/* Now free unused pvs. */
3663 	pmap_free_pvs(killlist);
3664 }
3665 
3666 /*
3667  * pmap_page_remove: remove a managed vm_page from all pmaps that map it
3668  *
3669  * => R/M bits are sync'd back to attrs
3670  */
3671 
3672 void
3673 pmap_page_remove(struct vm_page *pg)
3674 {
3675 	struct pmap_page *pp;
3676 	paddr_t pa;
3677 
3678 	KASSERT(uvm_page_locked_p(pg));
3679 
3680 	pp = VM_PAGE_TO_PP(pg);
3681 	pa = VM_PAGE_TO_PHYS(pg);
3682 	pmap_pp_remove(pp, pa);
3683 }
3684 
3685 /*
3686  * pmap_pv_remove: remove an unmanaged pv-tracked page from all pmaps
3687  *	that map it
3688  */
3689 
3690 void
3691 pmap_pv_remove(paddr_t pa)
3692 {
3693 	struct pmap_page *pp;
3694 
3695 	pp = pmap_pv_tracked(pa);
3696 	if (pp == NULL)
		panic("pmap_pv_remove: page not pv-tracked: 0x%"PRIxPADDR,
3698 		    pa);
3699 	pmap_pp_remove(pp, pa);
3700 }
3701 
3702 /*
3703  * p m a p   a t t r i b u t e  f u n c t i o n s
3704  * functions that test/change managed page's attributes
3705  * since a page can be mapped multiple times we must check each PTE that
3706  * maps it by going down the pv lists.
3707  */
3708 
3709 /*
3710  * pmap_test_attrs: test a page's attributes
3711  */
3712 
3713 bool
3714 pmap_test_attrs(struct vm_page *pg, unsigned testbits)
3715 {
3716 	struct pmap_page *pp;
3717 	struct pv_pte *pvpte;
3718 	pt_entry_t expect;
3719 	u_int result;
3720 
3721 	KASSERT(uvm_page_locked_p(pg));
3722 
3723 	pp = VM_PAGE_TO_PP(pg);
3724 	if ((pp->pp_attrs & testbits) != 0) {
3725 		return true;
3726 	}
3727 	expect = pmap_pa2pte(VM_PAGE_TO_PHYS(pg)) | PG_V;
3728 	kpreempt_disable();
3729 	for (pvpte = pv_pte_first(pp); pvpte; pvpte = pv_pte_next(pp, pvpte)) {
3730 		pt_entry_t opte;
3731 		int error;
3732 
3733 		if ((pp->pp_attrs & testbits) != 0) {
3734 			break;
3735 		}
3736 		error = pmap_sync_pv(pvpte, expect, 0, &opte);
3737 		if (error == 0) {
3738 			pp->pp_attrs |= opte;
3739 		}
3740 	}
3741 	result = pp->pp_attrs & testbits;
3742 	kpreempt_enable();
3743 
3744 	/*
	 * note that we exit the for loop early once we have found
	 * the bits we are testing for.
3747 	 */
3748 
3749 	return result != 0;
3750 }
3751 
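/*
 * pmap_pp_clear_attrs: clear the given PTE bits on every mapping of the
 *	page and in pp_attrs; returns true if any of them were set.
 *	Common worker for pmap_clear_attrs() and pmap_pv_clear_attrs().
 */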
3752 static bool
3753 pmap_pp_clear_attrs(struct pmap_page *pp, paddr_t pa, unsigned clearbits)
3754 {
3755 	struct pv_pte *pvpte;
3756 	u_int result;
3757 	pt_entry_t expect;
3758 	int count;
3759 
3760 	expect = pmap_pa2pte(pa) | PG_V;
3761 	count = SPINLOCK_BACKOFF_MIN;
3762 	kpreempt_disable();
3763 startover:
3764 	for (pvpte = pv_pte_first(pp); pvpte; pvpte = pv_pte_next(pp, pvpte)) {
3765 		pt_entry_t opte;
3766 		int error;
3767 
3768 		error = pmap_sync_pv(pvpte, expect, clearbits, &opte);
3769 		if (error == EAGAIN) {
3770 			int hold_count;
3771 			KERNEL_UNLOCK_ALL(curlwp, &hold_count);
3772 			SPINLOCK_BACKOFF(count);
3773 			KERNEL_LOCK(hold_count, curlwp);
3774 			goto startover;
3775 		}
3776 		pp->pp_attrs |= opte;
3777 	}
3778 	result = pp->pp_attrs & clearbits;
3779 	pp->pp_attrs &= ~clearbits;
3780 	pmap_tlb_shootnow();
3781 	kpreempt_enable();
3782 
3783 	return result != 0;
3784 }
3785 
3786 /*
3787  * pmap_clear_attrs: clear the specified attribute for a page.
3788  *
3789  * => we return true if we cleared one of the bits we were asked to
3790  */
3791 
3792 bool
3793 pmap_clear_attrs(struct vm_page *pg, unsigned clearbits)
3794 {
3795 	struct pmap_page *pp;
3796 	paddr_t pa;
3797 
3798 	KASSERT(uvm_page_locked_p(pg));
3799 
3800 	pp = VM_PAGE_TO_PP(pg);
3801 	pa = VM_PAGE_TO_PHYS(pg);
3802 
3803 	return pmap_pp_clear_attrs(pp, pa, clearbits);
3804 }
3805 
3806 /*
3807  * pmap_pv_clear_attrs: clear the specified attributes for an unmanaged
3808  *	pv-tracked page.
3809  */
3810 
3811 bool
3812 pmap_pv_clear_attrs(paddr_t pa, unsigned clearbits)
3813 {
3814 	struct pmap_page *pp;
3815 
3816 	pp = pmap_pv_tracked(pa);
3817 	if (pp == NULL)
3818 		panic("pmap_pv_clear_attrs: page not pv-tracked: 0x%"PRIxPADDR,
3819 		    pa);
3820 
3821 	return pmap_pp_clear_attrs(pp, pa, clearbits);
3822 }
3823 
3824 /*
3825  * p m a p   p r o t e c t i o n   f u n c t i o n s
3826  */
3827 
3828 /*
3829  * pmap_page_protect: change the protection of all recorded mappings
3830  *	of a managed page
3831  *
3832  * => NOTE: this is an inline function in pmap.h
3833  */
3834 
3835 /* see pmap.h */
3836 
3837 /*
3838  * pmap_pv_protect: change the protection of all recorded mappings
3839  *	of an unmanaged pv-tracked page
3840  *
3841  * => NOTE: this is an inline function in pmap.h
3842  */
3843 
3844 /* see pmap.h */
3845 
3846 /*
3847  * pmap_protect: set the protection of the pages in a pmap
3848  *
3849  * => NOTE: this is an inline function in pmap.h
3850  */
3851 
3852 /* see pmap.h */
3853 
3854 /*
3855  * pmap_write_protect: write-protect pages in a pmap.
3856  */
3857 void
3858 pmap_write_protect(struct pmap *pmap, vaddr_t sva, vaddr_t eva, vm_prot_t prot)
3859 {
3860 	pt_entry_t bit_rem, bit_put;
3861 	pt_entry_t *ptes;
3862 	pt_entry_t * const *pdes;
3863 	struct pmap *pmap2;
3864 	vaddr_t blockend, va;
3865 
3866 	KASSERT(curlwp->l_md.md_gc_pmap != pmap);
3867 
3868 	bit_rem = 0;
3869 	if (!(prot & VM_PROT_WRITE))
3870 		bit_rem = PG_RW;
3871 
3872 	bit_put = 0;
3873 	if (!(prot & VM_PROT_EXECUTE))
3874 		bit_put = pmap_pg_nx;
3875 
3876 	sva &= PG_FRAME;
3877 	eva &= PG_FRAME;
3878 
3879 	/* Acquire pmap. */
3880 	kpreempt_disable();
3881 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
3882 
3883 	for (va = sva ; va < eva; va = blockend) {
3884 		pt_entry_t *spte, *epte;
3885 		int i;
3886 
3887 		blockend = x86_round_pdr(va + 1);
3888 		if (blockend > eva)
3889 			blockend = eva;
3890 
3891 		/*
3892 		 * Our PTE mappings should never be write-protected.
3893 		 *
3894 		 * XXXmaxv: still needed?
3895 		 *
3896 		 * A long term solution is to move the PTEs out of user address
3897 		 * space, and into kernel address space. Then we can set
3898 		 * VM_MAXUSER_ADDRESS to be VM_MAX_ADDRESS.
3899 		 */
3900 		for (i = 0; i < PDP_SIZE; i++) {
3901 			if (pl_i(va, PTP_LEVELS) == PDIR_SLOT_PTE+i)
3902 				panic("PTE space accessed");
3903 		}
3904 
3905 		/* Is it a valid block? */
3906 		if (!pmap_pdes_valid(va, pdes, NULL)) {
3907 			continue;
3908 		}
3909 		KASSERT(va < VM_MAXUSER_ADDRESS || va >= VM_MAX_ADDRESS);
3910 
3911 		spte = &ptes[pl1_i(va)];
3912 		epte = &ptes[pl1_i(blockend)];
3913 
3914 		for (/* */; spte < epte; spte++) {
3915 			pt_entry_t opte, npte;
3916 
3917 			do {
3918 				opte = *spte;
3919 				if (!pmap_valid_entry(opte)) {
3920 					goto next;
3921 				}
3922 				npte = (opte & ~bit_rem) | bit_put;
3923 			} while (pmap_pte_cas(spte, opte, npte) != opte);
3924 
3925 			if ((opte & PG_M) != 0) {
3926 				vaddr_t tva = x86_ptob(spte - ptes);
3927 				pmap_tlb_shootdown(pmap, tva, opte,
3928 				    TLBSHOOT_WRITE_PROTECT);
3929 			}
3930 next:;
3931 		}
3932 	}
3933 
3934 	/* Release pmap. */
3935 	pmap_unmap_ptes(pmap, pmap2);
3936 	kpreempt_enable();
3937 }
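
/*
 * Illustrative sketch (example only, not built): a caller that wants to
 * make a kernel range read-only calls pmap_write_protect() and then lets
 * pmap_update() process any deferred TLB shootdowns.  The range variables
 * below are hypothetical.
 */
#if 0	/* example only */
	vaddr_t ro_start = /* page-aligned start of range */ 0;
	vaddr_t ro_end = /* page-aligned end of range */ 0;

	pmap_write_protect(pmap_kernel(), ro_start, ro_end,
	    VM_PROT_READ | VM_PROT_EXECUTE);
	pmap_update(pmap_kernel());
#endif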
3938 
3939 /*
3940  * pmap_unwire: clear the wired bit in the PTE.
3941  *
3942  * => Mapping should already be present.
3943  */
3944 void
3945 pmap_unwire(struct pmap *pmap, vaddr_t va)
3946 {
3947 	pt_entry_t *ptes, *ptep, opte;
3948 	pd_entry_t * const *pdes;
3949 	struct pmap *pmap2;
3950 
3951 	/* Acquire pmap. */
3952 	kpreempt_disable();
3953 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
3954 
3955 	if (!pmap_pdes_valid(va, pdes, NULL)) {
3956 		panic("pmap_unwire: invalid PDE");
3957 	}
3958 
3959 	ptep = &ptes[pl1_i(va)];
3960 	opte = *ptep;
3961 	KASSERT(pmap_valid_entry(opte));
3962 
3963 	if (opte & PG_W) {
3964 		pt_entry_t npte = opte & ~PG_W;
3965 
3966 		opte = pmap_pte_testset(ptep, npte);
3967 		pmap_stats_update_bypte(pmap, npte, opte);
3968 	} else {
3969 		printf("pmap_unwire: wiring for pmap %p va 0x%lx "
3970 		    "did not change!\n", pmap, va);
3971 	}
3972 
3973 	/* Release pmap. */
3974 	pmap_unmap_ptes(pmap, pmap2);
3975 	kpreempt_enable();
3976 }
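
/*
 * Illustrative sketch (example only, not built): the VM system unwires a
 * region one page at a time, letting pmap_unwire() drop PG_W on each
 * mapping.  The loop below is a sketch with hypothetical bounds, not the
 * actual uvm code.
 */
#if 0	/* example only */
	vaddr_t va;

	for (va = start; va < end; va += PAGE_SIZE)
		pmap_unwire(pmap, va);
#endif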
3977 
3978 /*
3979  * pmap_copy: copy mappings from one pmap to another
3980  *
3981  * => optional function
3982  * void pmap_copy(dst_pmap, src_pmap, dst_addr, len, src_addr)
3983  */
3984 
3985 /*
3986  * defined as macro in pmap.h
3987  */
3988 
3989 __strict_weak_alias(pmap_enter, pmap_enter_default);
3990 
3991 int
3992 pmap_enter_default(pmap_t pmap, vaddr_t va, paddr_t pa, vm_prot_t prot,
3993     u_int flags)
3994 {
3995 	return pmap_enter_ma(pmap, va, pa, pa, prot, flags, 0);
3996 }
3997 
3998 /*
3999  * pmap_enter: enter a mapping into a pmap
4000  *
4001  * => must be done "now" ... no lazy-evaluation
4002  * => we set pmap => pv_head locking
4003  */
4004 int
4005 pmap_enter_ma(struct pmap *pmap, vaddr_t va, paddr_t ma, paddr_t pa,
4006 	   vm_prot_t prot, u_int flags, int domid)
4007 {
4008 	pt_entry_t *ptes, opte, npte;
4009 	pt_entry_t *ptep;
4010 	pd_entry_t * const *pdes;
4011 	struct vm_page *ptp;
4012 	struct vm_page *new_pg, *old_pg;
4013 	struct pmap_page *new_pp, *old_pp;
4014 	struct pv_entry *old_pve = NULL;
4015 	struct pv_entry *new_pve;
4016 	struct pv_entry *new_sparepve;
4017 	int error;
4018 	bool wired = (flags & PMAP_WIRED) != 0;
4019 	struct pmap *pmap2;
4020 
4021 	KASSERT(pmap_initialized);
4022 	KASSERT(curlwp->l_md.md_gc_pmap != pmap);
4023 	KASSERT(va < VM_MAX_KERNEL_ADDRESS);
4024 	KASSERTMSG(va != (vaddr_t)PDP_BASE,
4025 	    "pmap_enter: trying to map over PDP!");
4026 	KASSERTMSG(va < VM_MIN_KERNEL_ADDRESS ||
4027 	    pmap_valid_entry(pmap->pm_pdir[pl_i(va, PTP_LEVELS)]),
4028 	    "pmap_enter: missing kernel PTP for VA %lx!", va);
4029 
4030 #ifdef XEN
4031 	KASSERT(domid == DOMID_SELF || pa == 0);
4032 #endif /* XEN */
4033 
4034 	npte = ma | protection_codes[prot] | PG_V;
4035 	npte |= pmap_pat_flags(flags);
4036 	if (wired)
4037 	        npte |= PG_W;
4038 	if (va < VM_MAXUSER_ADDRESS)
4039 		npte |= PG_u;
4040 	else if (va < VM_MAX_ADDRESS)
4041 		panic("PTE space accessed");	/* XXXmaxv: no longer needed? */
4042 	else
4043 		npte |= PG_k;
4044 	if (pmap == pmap_kernel())
4045 		npte |= pmap_pg_g;
4046 	if (flags & VM_PROT_ALL) {
4047 		npte |= PG_U;
4048 		if (flags & VM_PROT_WRITE) {
4049 			KASSERT((npte & PG_RW) != 0);
4050 			npte |= PG_M;
4051 		}
4052 	}
4053 
4054 #ifdef XEN
4055 	if (domid != DOMID_SELF)
4056 		new_pg = NULL;
4057 	else
4058 #endif
4059 		new_pg = PHYS_TO_VM_PAGE(pa);
4060 	if (new_pg != NULL) {
4061 		/* This is a managed page */
4062 		npte |= PG_PVLIST;
4063 		new_pp = VM_PAGE_TO_PP(new_pg);
4064 	} else if ((new_pp = pmap_pv_tracked(pa)) != NULL) {
4065 		/* This is an unmanaged pv-tracked page */
4066 		npte |= PG_PVLIST;
4067 	} else {
4068 		new_pp = NULL;
4069 	}
4070 
4071 	/* get pves. */
4072 	new_pve = pool_cache_get(&pmap_pv_cache, PR_NOWAIT);
4073 	new_sparepve = pool_cache_get(&pmap_pv_cache, PR_NOWAIT);
4074 	if (new_pve == NULL || new_sparepve == NULL) {
4075 		if (flags & PMAP_CANFAIL) {
4076 			error = ENOMEM;
4077 			goto out2;
4078 		}
4079 		panic("pmap_enter: pve allocation failed");
4080 	}
4081 
4082 	kpreempt_disable();
4083 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);	/* locks pmap */
4084 	if (pmap == pmap_kernel()) {
4085 		ptp = NULL;
4086 	} else {
4087 		ptp = pmap_get_ptp(pmap, va, pdes);
4088 		if (ptp == NULL) {
4089 			pmap_unmap_ptes(pmap, pmap2);
4090 			if (flags & PMAP_CANFAIL) {
4091 				error = ENOMEM;
4092 				goto out;
4093 			}
4094 			panic("pmap_enter: get ptp failed");
4095 		}
4096 	}
4097 
4098 	/*
4099 	 * update the pte.
4100 	 */
4101 
4102 	ptep = &ptes[pl1_i(va)];
4103 	do {
4104 		opte = *ptep;
4105 
4106 		/*
4107 		 * if the same page, inherit PG_U and PG_M.
4108 		 */
4109 		if (((opte ^ npte) & (PG_FRAME | PG_V)) == 0) {
4110 			npte |= opte & (PG_U | PG_M);
4111 		}
4112 #if defined(XEN)
4113 		if (domid != DOMID_SELF) {
4114 			/* pmap_pte_cas with error handling */
4115 			int s = splvm();
4116 			if (opte != *ptep) {
4117 				splx(s);
4118 				continue;
4119 			}
4120 			error = xpq_update_foreign(
4121 			    vtomach((vaddr_t)ptep), npte, domid);
4122 			splx(s);
4123 			if (error) {
4124 				if (ptp != NULL && ptp->wire_count <= 1) {
4125 					pmap_free_ptp(pmap, ptp, va, ptes, pdes);
4126 				}
4127 				pmap_unmap_ptes(pmap, pmap2);
4128 				goto out;
4129 			}
4130 			break;
4131 		}
4132 #endif /* defined(XEN) */
4133 	} while (pmap_pte_cas(ptep, opte, npte) != opte);
4134 
4135 	/*
4136 	 * update statistics and PTP's reference count.
4137 	 */
4138 
4139 	pmap_stats_update_bypte(pmap, npte, opte);
4140 	if (ptp != NULL && !pmap_valid_entry(opte)) {
4141 		ptp->wire_count++;
4142 	}
4143 	KASSERT(ptp == NULL || ptp->wire_count > 1);
4144 
4145 	/*
4146 	 * if the same page, we can skip pv_entry handling.
4147 	 */
4148 
4149 	if (((opte ^ npte) & (PG_FRAME | PG_V)) == 0) {
4150 		KASSERT(((opte ^ npte) & PG_PVLIST) == 0);
4151 		goto same_pa;
4152 	}
4153 
4154 	/*
4155 	 * if old page is pv-tracked, remove pv_entry from its list.
4156 	 */
4157 
4158 	if ((~opte & (PG_V | PG_PVLIST)) == 0) {
4159 		if ((old_pg = PHYS_TO_VM_PAGE(pmap_pte2pa(opte))) != NULL) {
4160 			KASSERT(uvm_page_locked_p(old_pg));
4161 			old_pp = VM_PAGE_TO_PP(old_pg);
4162 		} else if ((old_pp = pmap_pv_tracked(pmap_pte2pa(opte)))
4163 		    == NULL) {
4164 			pa = pmap_pte2pa(opte);
4165 			panic("pmap_enter: PG_PVLIST with pv-untracked page"
4166 			    " va = 0x%"PRIxVADDR
4167 			    " pa = 0x%" PRIxPADDR " (0x%" PRIxPADDR ")",
4168 			    va, pa, atop(pa));
4169 		}
4170 
4171 		old_pve = pmap_remove_pv(old_pp, ptp, va);
4172 		old_pp->pp_attrs |= opte;
4173 	}
4174 
4175 	/*
4176 	 * if new page is pv-tracked, insert pv_entry into its list.
4177 	 */
4178 
4179 	if (new_pp) {
4180 		new_pve = pmap_enter_pv(new_pp, new_pve, &new_sparepve, ptp, va);
4181 	}
4182 
4183 same_pa:
4184 	pmap_unmap_ptes(pmap, pmap2);
4185 
4186 	/*
4187 	 * shootdown tlb if necessary.
4188 	 */
4189 
4190 	if ((~opte & (PG_V | PG_U)) == 0 &&
4191 	    ((opte ^ npte) & (PG_FRAME | PG_RW)) != 0) {
4192 		pmap_tlb_shootdown(pmap, va, opte, TLBSHOOT_ENTER);
4193 	}
4194 
4195 	error = 0;
4196 out:
4197 	kpreempt_enable();
4198 out2:
4199 	if (old_pve != NULL) {
4200 		pool_cache_put(&pmap_pv_cache, old_pve);
4201 	}
4202 	if (new_pve != NULL) {
4203 		pool_cache_put(&pmap_pv_cache, new_pve);
4204 	}
4205 	if (new_sparepve != NULL) {
4206 		pool_cache_put(&pmap_pv_cache, new_sparepve);
4207 	}
4208 
4209 	return error;
4210 }
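
/*
 * Illustrative sketch (example only, not built): a typical pmap_enter()
 * call site passes the access type in the flags together with
 * PMAP_CANFAIL, so an allocation failure comes back as ENOMEM instead of
 * panicking, and batches the TLB work with a final pmap_update().  All
 * names other than the pmap interface itself are hypothetical.
 */
#if 0	/* example only */
	int error;

	error = pmap_enter(pmap, va, pa, VM_PROT_READ | VM_PROT_WRITE,
	    VM_PROT_READ | VM_PROT_WRITE | PMAP_CANFAIL | PMAP_WIRED);
	if (error == 0)
		pmap_update(pmap);
#endif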
4211 
4212 static paddr_t
4213 pmap_get_physpage(void)
4214 {
4215 	struct vm_page *ptp;
4216 	struct pmap *kpm = pmap_kernel();
4217 	paddr_t pa;
4218 
4219 	if (!uvm.page_init_done) {
4220 		/*
4221 		 * We're growing the kernel pmap early (from
4222 		 * uvm_pageboot_alloc()). This case must be
4223 		 * handled a little differently.
4224 		 */
4225 
4226 		if (!uvm_page_physget(&pa))
4227 			panic("pmap_get_physpage: out of memory");
4228 #if defined(__HAVE_DIRECT_MAP)
4229 		pagezero(PMAP_DIRECT_MAP(pa));
4230 #else
4231 #if defined(XEN)
4232 		if (XEN_VERSION_SUPPORTED(3, 4)) {
4233 			xen_pagezero(pa);
4234 			return pa;
4235 		}
4236 #endif
4237 		kpreempt_disable();
4238 		pmap_pte_set(early_zero_pte, pmap_pa2pte(pa) | PG_V |
4239 		    PG_RW | pmap_pg_nx | PG_k);
4240 		pmap_pte_flush();
4241 		pmap_update_pg((vaddr_t)early_zerop);
4242 		memset(early_zerop, 0, PAGE_SIZE);
4243 #if defined(DIAGNOSTIC) || defined(XEN)
4244 		pmap_pte_set(early_zero_pte, 0);
4245 		pmap_pte_flush();
4246 #endif /* defined(DIAGNOSTIC) || defined(XEN) */
4247 		kpreempt_enable();
4248 #endif /* defined(__HAVE_DIRECT_MAP) */
4249 	} else {
4250 		/* XXX */
4251 		ptp = uvm_pagealloc(NULL, 0, NULL,
4252 				    UVM_PGA_USERESERVE|UVM_PGA_ZERO);
4253 		if (ptp == NULL)
4254 			panic("pmap_get_physpage: out of memory");
4255 		ptp->flags &= ~PG_BUSY;
4256 		ptp->wire_count = 1;
4257 		pa = VM_PAGE_TO_PHYS(ptp);
4258 	}
4259 	pmap_stats_update(kpm, 1, 0);
4260 
4261 	return pa;
4262 }
4263 
4264 /*
4265  * Expand the page tree with the specified amount of PTPs, mapping virtual
4266  * Expand the page tree with the specified number of PTPs, mapping virtual
4267  * (L1). The nodes of the tree are created as RWX, but the pages covered
4268  * will be kentered in L1, with proper permissions.
4269  *
4270  * Used only by pmap_growkernel.
4271  */
4272 static void
4273 pmap_alloc_level(vaddr_t kva, long *needed_ptps)
4274 {
4275 	unsigned long i;
4276 	paddr_t pa;
4277 	unsigned long index, endindex;
4278 	int level;
4279 	pd_entry_t *pdep;
4280 #ifdef XEN
4281 	int s = splvm(); /* protect xpq_* */
4282 #endif
4283 
4284 	for (level = PTP_LEVELS; level > 1; level--) {
4285 		if (level == PTP_LEVELS)
4286 			pdep = pmap_kernel()->pm_pdir;
4287 		else
4288 			pdep = normal_pdes[level - 2];
4289 		index = pl_i_roundup(kva, level);
4290 		endindex = index + needed_ptps[level - 1] - 1;
4291 
4292 		for (i = index; i <= endindex; i++) {
4293 			pt_entry_t pte;
4294 
4295 			KASSERT(!pmap_valid_entry(pdep[i]));
4296 			pa = pmap_get_physpage();
4297 			pte = pmap_pa2pte(pa) | PG_k | PG_V | PG_RW;
4298 			pmap_pte_set(&pdep[i], pte);
4299 
4300 #if defined(XEN) && (defined(PAE) || defined(__x86_64__))
4301 			if (level == PTP_LEVELS && i >= PDIR_SLOT_KERN) {
4302 				if (__predict_true(
4303 				    cpu_info_primary.ci_flags & CPUF_PRESENT)) {
4304 					/* update per-cpu PMDs on all cpus */
4305 					xen_kpm_sync(pmap_kernel(), i);
4306 				} else {
4307 					/*
4308 					 * too early; update primary CPU
4309 					 * PMD only (without locks)
4310 					 */
4311 #ifdef PAE
4312 					pd_entry_t *cpu_pdep =
4313 					    &cpu_info_primary.ci_kpm_pdir[l2tol2(i)];
4314 #endif
4315 #ifdef __x86_64__
4316 					pd_entry_t *cpu_pdep =
4317 						&cpu_info_primary.ci_kpm_pdir[i];
4318 #endif
4319 					pmap_pte_set(cpu_pdep, pte);
4320 				}
4321 			}
4322 #endif /* XEN && (PAE || __x86_64__) */
4323 
4324 			KASSERT(level != PTP_LEVELS || nkptp[level - 1] +
4325 			    pl_i(VM_MIN_KERNEL_ADDRESS, level) == i);
4326 			nkptp[level - 1]++;
4327 		}
4328 		pmap_pte_flush();
4329 	}
4330 #ifdef XEN
4331 	splx(s);
4332 #endif
4333 }
4334 
4335 /*
4336  * pmap_growkernel: increase usage of KVM space.
4337  *
4338  * => we allocate new PTPs for the kernel and install them in all
4339  *    the pmaps on the system.
4340  */
4341 
4342 vaddr_t
4343 pmap_growkernel(vaddr_t maxkvaddr)
4344 {
4345 	struct pmap *kpm = pmap_kernel();
4346 #if !defined(XEN) || !defined(__x86_64__)
4347 	struct pmap *pm;
4348 	long old;
4349 #endif
4350 	int s, i;
4351 	long needed_kptp[PTP_LEVELS], target_nptp;
4352 	bool invalidate = false;
4353 
4354 	s = splvm();	/* to be safe */
4355 	mutex_enter(kpm->pm_lock);
4356 
4357 	if (maxkvaddr <= pmap_maxkvaddr) {
4358 		mutex_exit(kpm->pm_lock);
4359 		splx(s);
4360 		return pmap_maxkvaddr;
4361 	}
4362 
4363 	maxkvaddr = x86_round_pdr(maxkvaddr);
4364 #if !defined(XEN) || !defined(__x86_64__)
4365 	old = nkptp[PTP_LEVELS - 1];
4366 #endif
4367 
4368 	/* Initialize needed_kptp. */
4369 	for (i = PTP_LEVELS - 1; i >= 1; i--) {
4370 		target_nptp = pl_i_roundup(maxkvaddr, i + 1) -
4371 		    pl_i_roundup(VM_MIN_KERNEL_ADDRESS, i + 1);
4372 
4373 		if (target_nptp > nkptpmax[i])
4374 			panic("out of KVA space");
4375 		KASSERT(target_nptp >= nkptp[i]);
4376 		needed_kptp[i] = target_nptp - nkptp[i];
4377 	}
4378 
4379 	pmap_alloc_level(pmap_maxkvaddr, needed_kptp);
4380 
4381 	/*
4382 	 * If the number of top level entries changed, update all pmaps.
4383 	 */
4384 	if (needed_kptp[PTP_LEVELS - 1] != 0) {
4385 #ifdef XEN
4386 #ifdef __x86_64__
4387 		/* nothing, kernel entries are never entered in user pmap */
4388 #else /* __x86_64__ */
4389 		mutex_enter(&pmaps_lock);
4390 		LIST_FOREACH(pm, &pmaps, pm_list) {
4391 			int pdkidx;
4392 			for (pdkidx = PDIR_SLOT_KERN + old;
4393 			    pdkidx < PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1];
4394 			    pdkidx++) {
4395 				pmap_pte_set(&pm->pm_pdir[pdkidx],
4396 				    kpm->pm_pdir[pdkidx]);
4397 			}
4398 			pmap_pte_flush();
4399 		}
4400 		mutex_exit(&pmaps_lock);
4401 #endif /* __x86_64__ */
4402 #else /* XEN */
4403 		unsigned newpdes;
4404 		newpdes = nkptp[PTP_LEVELS - 1] - old;
4405 		mutex_enter(&pmaps_lock);
4406 		LIST_FOREACH(pm, &pmaps, pm_list) {
4407 			memcpy(&pm->pm_pdir[PDIR_SLOT_KERN + old],
4408 			    &kpm->pm_pdir[PDIR_SLOT_KERN + old],
4409 			    newpdes * sizeof (pd_entry_t));
4410 		}
4411 		mutex_exit(&pmaps_lock);
4412 #endif
4413 		invalidate = true;
4414 	}
4415 	pmap_maxkvaddr = maxkvaddr;
4416 	mutex_exit(kpm->pm_lock);
4417 	splx(s);
4418 
4419 	if (invalidate && pmap_initialized) {
4420 		/* Invalidate the PDP cache. */
4421 		pool_cache_invalidate(&pmap_pdp_cache);
4422 	}
4423 
4424 	return maxkvaddr;
4425 }
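
/*
 * Illustrative sketch (example only, not built): the kernel map allocator
 * calls pmap_growkernel() when an allocation would go beyond the currently
 * backed range; the returned value is the new (PDR-rounded) limit.  The
 * variable names below are hypothetical.
 */
#if 0	/* example only */
	/* ensure KVA up to desired_end is backed by page tables */
	kernel_kva_limit = pmap_growkernel(desired_end);
#endif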
4426 
4427 #ifdef DEBUG
4428 void pmap_dump(struct pmap *, vaddr_t, vaddr_t);
4429 
4430 /*
4431  * pmap_dump: dump all the mappings from a pmap
4432  *
4433  * => caller should not be holding any pmap locks
4434  */
4435 
4436 void
4437 pmap_dump(struct pmap *pmap, vaddr_t sva, vaddr_t eva)
4438 {
4439 	pt_entry_t *ptes, *pte;
4440 	pd_entry_t * const *pdes;
4441 	struct pmap *pmap2;
4442 	vaddr_t blkendva;
4443 
4444 	/*
4445 	 * if end is out of range, truncate it.
4446 	 * if end <= start, update end to the max.
4447 	 */
4448 
4449 	if (eva > VM_MAXUSER_ADDRESS || eva <= sva)
4450 		eva = VM_MAXUSER_ADDRESS;
4451 
4452 	/*
4453 	 * we lock in the pmap => pv_head direction
4454 	 */
4455 
4456 	kpreempt_disable();
4457 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);	/* locks pmap */
4458 
4459 	/*
4460 	 * dumping a range of pages: we dump in PTP sized blocks (4MB)
4461 	 * dumping a range of pages: we dump in PTP-sized blocks (2MB or 4MB)
4462 
4463 	for (/* null */ ; sva < eva ; sva = blkendva) {
4464 
4465 		/* determine range of block */
4466 		blkendva = x86_round_pdr(sva+1);
4467 		if (blkendva > eva)
4468 			blkendva = eva;
4469 
4470 		/* valid block? */
4471 		if (!pmap_pdes_valid(sva, pdes, NULL))
4472 			continue;
4473 
4474 		pte = &ptes[pl1_i(sva)];
4475 		for (/* null */; sva < blkendva ; sva += PAGE_SIZE, pte++) {
4476 			if (!pmap_valid_entry(*pte))
4477 				continue;
4478 			printf("va %#" PRIxVADDR " -> pa %#" PRIxPADDR
4479 			    " (pte=%#" PRIxPADDR ")\n",
4480 			    sva, (paddr_t)pmap_pte2pa(*pte), (paddr_t)*pte);
4481 		}
4482 	}
4483 	pmap_unmap_ptes(pmap, pmap2);
4484 	kpreempt_enable();
4485 }
4486 #endif
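
/*
 * Illustrative sketch (example only, not built): on DEBUG kernels
 * pmap_dump() can be invoked by hand, e.g. from a debugger hook, to print
 * every valid user mapping of a pmap.  The pmap pointer below is
 * hypothetical; passing eva <= sva dumps the full user range.
 */
#if 0	/* example only */
	pmap_dump(some_pmap, 0, VM_MAXUSER_ADDRESS);
#endif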
4487 
4488 /*
4489  * pmap_update: process deferred invalidations and frees.
4490  */
4491 
4492 void
4493 pmap_update(struct pmap *pmap)
4494 {
4495 	struct vm_page *empty_ptps;
4496 	lwp_t *l = curlwp;
4497 
4498 	/*
4499 	 * If we have torn down this pmap, invalidate non-global TLB
4500 	 * entries on any processors using it.
4501 	 */
4502 	kpreempt_disable();
4503 	if (__predict_false(l->l_md.md_gc_pmap == pmap)) {
4504 		l->l_md.md_gc_pmap = NULL;
4505 		pmap_tlb_shootdown(pmap, (vaddr_t)-1LL, 0, TLBSHOOT_UPDATE);
4506 	}
4507 	/*
4508 	 * Initiate any pending TLB shootdowns.  Wait for them to
4509 	 * complete before returning control to the caller.
4510 	 */
4511 	pmap_tlb_shootnow();
4512 	kpreempt_enable();
4513 
4514 	/*
4515 	 * Now that shootdowns are complete, process deferred frees,
4516 	 * but not from interrupt context.
4517 	 */
4518 	if (l->l_md.md_gc_ptp != NULL) {
4519 		KASSERT((l->l_pflag & LP_INTR) == 0);
4520 		if (cpu_intr_p()) {
4521 			return;
4522 		}
4523 		empty_ptps = l->l_md.md_gc_ptp;
4524 		l->l_md.md_gc_ptp = NULL;
4525 		pmap_free_ptps(empty_ptps);
4526 	}
4527 }
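
/*
 * Illustrative sketch (example only, not built): the canonical kernel
 * mapping idiom defers TLB work until a single pmap_update() call, as
 * pmap_init_tmp_pgtbl() below does.  The addresses here are hypothetical.
 */
#if 0	/* example only */
	pmap_kenter_pa(some_va, some_pa, VM_PROT_READ | VM_PROT_WRITE, 0);
	/* ... more pmap_kenter_pa()/pmap_kremove() calls ... */
	pmap_update(pmap_kernel());
#endif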
4528 
4529 #if PTP_LEVELS > 4
4530 #error "Unsupported number of page table mappings"
4531 #endif
4532 
4533 paddr_t
4534 pmap_init_tmp_pgtbl(paddr_t pg)
4535 {
4536 	static bool maps_loaded;
4537 	static const paddr_t x86_tmp_pml_paddr[] = {
4538 	    4 * PAGE_SIZE,	/* L1 */
4539 	    5 * PAGE_SIZE,	/* L2 */
4540 	    6 * PAGE_SIZE,	/* L3 */
4541 	    7 * PAGE_SIZE	/* L4 */
4542 	};
4543 	static vaddr_t x86_tmp_pml_vaddr[] = { 0, 0, 0, 0 };
4544 
4545 	pd_entry_t *tmp_pml, *kernel_pml;
4546 
4547 	int level;
4548 
4549 	if (!maps_loaded) {
4550 		for (level = 0; level < PTP_LEVELS; ++level) {
4551 			x86_tmp_pml_vaddr[level] =
4552 			    uvm_km_alloc(kernel_map, PAGE_SIZE, 0,
4553 			    UVM_KMF_VAONLY);
4554 
4555 			if (x86_tmp_pml_vaddr[level] == 0)
4556 				panic("mapping of real mode PML failed\n");
4557 			pmap_kenter_pa(x86_tmp_pml_vaddr[level],
4558 			    x86_tmp_pml_paddr[level],
4559 			    VM_PROT_READ | VM_PROT_WRITE, 0);
4560 		}
4561 		pmap_update(pmap_kernel());
4562 		maps_loaded = true;
4563 	}
4564 
4565 	/* Zero levels 1-3 */
4566 	for (level = 0; level < PTP_LEVELS - 1; ++level) {
4567 		tmp_pml = (void *)x86_tmp_pml_vaddr[level];
4568 		memset(tmp_pml, 0, PAGE_SIZE);
4569 	}
4570 
4571 	/* Copy PML4 */
4572 	kernel_pml = pmap_kernel()->pm_pdir;
4573 	tmp_pml = (void *)x86_tmp_pml_vaddr[PTP_LEVELS - 1];
4574 	memcpy(tmp_pml, kernel_pml, PAGE_SIZE);
4575 
4576 #ifdef PAE
4577 	/*
4578 	 * Use the last 4 entries of the L2 page as L3 PD entries. These
4579 	 * last entries are unlikely to be used for temporary mappings.
4580 	 * 508: maps 0->1GB (userland)
4581 	 * 509: unused
4582 	 * 510: unused
4583 	 * 511: maps 3->4GB (kernel)
4584 	 */
4585 	tmp_pml[508] = x86_tmp_pml_paddr[PTP_LEVELS - 1] | PG_V;
4586 	tmp_pml[509] = 0;
4587 	tmp_pml[510] = 0;
4588 	tmp_pml[511] = pmap_pdirpa(pmap_kernel(), PDIR_SLOT_KERN) | PG_V;
4589 #endif
4590 
4591 	for (level = PTP_LEVELS - 1; level > 0; --level) {
4592 		tmp_pml = (void *)x86_tmp_pml_vaddr[level];
4593 
4594 		tmp_pml[pl_i(pg, level + 1)] =
4595 		    (x86_tmp_pml_paddr[level - 1] & PG_FRAME) | PG_RW | PG_V;
4596 	}
4597 
4598 	tmp_pml = (void *)x86_tmp_pml_vaddr[0];
4599 	tmp_pml[pl_i(pg, 1)] = (pg & PG_FRAME) | PG_RW | PG_V;
4600 
4601 #ifdef PAE
4602 	/* Return the PA of the L3 page (entry 508 of the L2 page) */
4603 	return x86_tmp_pml_paddr[PTP_LEVELS - 1] + 508 * sizeof(pd_entry_t);
4604 #endif
4605 
4606 	return x86_tmp_pml_paddr[PTP_LEVELS - 1];
4607 }
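
/*
 * Illustrative sketch (example only, not built): a caller that needs a
 * temporary page table for low-memory real-mode code passes the physical
 * page to map and hands the returned root to that code to load as its
 * %cr3.  The trampoline name below is hypothetical.
 */
#if 0	/* example only */
	paddr_t tmp_root;

	tmp_root = pmap_init_tmp_pgtbl(trampoline_paddr);
	/* tmp_root is then loaded as the temporary %cr3 */
#endif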
4608 
4609 u_int
4610 x86_mmap_flags(paddr_t mdpgno)
4611 {
4612 	u_int nflag = (mdpgno >> X86_MMAP_FLAG_SHIFT) & X86_MMAP_FLAG_MASK;
4613 	u_int pflag = 0;
4614 
4615 	if (nflag & X86_MMAP_FLAG_PREFETCH)
4616 		pflag |= PMAP_WRITE_COMBINE;
4617 
4618 	return pflag;
4619 }
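
/*
 * Illustrative sketch (example only, not built): a device d_mmap routine
 * can request a write-combining mapping by encoding X86_MMAP_FLAG_PREFETCH
 * into the upper bits of the page number it returns; x86_mmap_flags()
 * above decodes that back into PMAP_WRITE_COMBINE.  The use of x86_btop()
 * and the framebuffer address are assumptions for illustration.
 */
#if 0	/* example only */
	paddr_t fb_pa = /* physical address of a prefetchable BAR */ 0;

	return x86_btop(fb_pa) |
	    ((paddr_t)X86_MMAP_FLAG_PREFETCH << X86_MMAP_FLAG_SHIFT);
#endif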
4620