xref: /netbsd-src/sys/arch/x86/x86/pmap.c (revision 796c32c94f6e154afc9de0f63da35c91bb739b45)
1 /*	$NetBSD: pmap.c,v 1.267 2017/11/22 21:26:01 christos Exp $	*/
2 
3 /*
4  * Copyright (c) 2008, 2010, 2016, 2017 The NetBSD Foundation, Inc.
5  * All rights reserved.
6  *
7  * This code is derived from software contributed to The NetBSD Foundation
8  * by Andrew Doran, and by Maxime Villard.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29  * POSSIBILITY OF SUCH DAMAGE.
30  */
31 
32 /*
33  * Copyright (c) 2007 Manuel Bouyer.
34  *
35  * Redistribution and use in source and binary forms, with or without
36  * modification, are permitted provided that the following conditions
37  * are met:
38  * 1. Redistributions of source code must retain the above copyright
39  *    notice, this list of conditions and the following disclaimer.
40  * 2. Redistributions in binary form must reproduce the above copyright
41  *    notice, this list of conditions and the following disclaimer in the
42  *    documentation and/or other materials provided with the distribution.
43  *
44  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
45  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
46  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
47  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
48  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
49  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
50  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
51  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
52  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
53  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
54  */
55 
56 /*
57  * Copyright (c) 2006 Mathieu Ropert <mro@adviseo.fr>
58  *
59  * Permission to use, copy, modify, and distribute this software for any
60  * purpose with or without fee is hereby granted, provided that the above
61  * copyright notice and this permission notice appear in all copies.
62  *
63  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
64  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
65  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
66  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
67  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
68  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
69  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
70  */
71 
72 /*
73  * Copyright (c) 1997 Charles D. Cranor and Washington University.
74  * All rights reserved.
75  *
76  * Redistribution and use in source and binary forms, with or without
77  * modification, are permitted provided that the following conditions
78  * are met:
79  * 1. Redistributions of source code must retain the above copyright
80  *    notice, this list of conditions and the following disclaimer.
81  * 2. Redistributions in binary form must reproduce the above copyright
82  *    notice, this list of conditions and the following disclaimer in the
83  *    documentation and/or other materials provided with the distribution.
84  *
85  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
86  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
87  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
88  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
89  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
90  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
91  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
92  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
93  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
94  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
95  */
96 
97 /*
98  * Copyright 2001 (c) Wasabi Systems, Inc.
99  * All rights reserved.
100  *
101  * Written by Frank van der Linden for Wasabi Systems, Inc.
102  *
103  * Redistribution and use in source and binary forms, with or without
104  * modification, are permitted provided that the following conditions
105  * are met:
106  * 1. Redistributions of source code must retain the above copyright
107  *    notice, this list of conditions and the following disclaimer.
108  * 2. Redistributions in binary form must reproduce the above copyright
109  *    notice, this list of conditions and the following disclaimer in the
110  *    documentation and/or other materials provided with the distribution.
111  * 3. All advertising materials mentioning features or use of this software
112  *    must display the following acknowledgement:
113  *      This product includes software developed for the NetBSD Project by
114  *      Wasabi Systems, Inc.
115  * 4. The name of Wasabi Systems, Inc. may not be used to endorse
116  *    or promote products derived from this software without specific prior
117  *    written permission.
118  *
119  * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
120  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
121  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
122  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL WASABI SYSTEMS, INC
123  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
124  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
125  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
126  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
127  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
128  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
129  * POSSIBILITY OF SUCH DAMAGE.
130  */
131 
132 /*
133  * This is the i386 pmap modified and generalized to support x86-64
134  * as well. The idea is to hide the upper N levels of the page tables
135  * inside pmap_get_ptp, pmap_free_ptp and pmap_growkernel. The rest
136  * is mostly untouched, except that it uses some more generalized
137  * macros and interfaces.
138  *
139  * This pmap has been tested on the i386 as well, and it can be easily
140  * adapted to PAE.
141  *
142  * fvdl@wasabisystems.com 18-Jun-2001
143  */
144 
145 /*
146  * pmap.c: i386 pmap module rewrite
147  * Chuck Cranor <chuck@netbsd>
148  * 11-Aug-97
149  *
150  * history of this pmap module: in addition to my own input, i used
151  *    the following references for this rewrite of the i386 pmap:
152  *
153  * [1] the NetBSD i386 pmap.   this pmap appears to be based on the
154  *     BSD hp300 pmap done by Mike Hibler at University of Utah.
155  *     it was then ported to the i386 by William Jolitz of UUNET
156  *     Technologies, Inc.   Then Charles M. Hannum of the NetBSD
157  *     project fixed some bugs and provided some speed ups.
158  *
159  * [2] the FreeBSD i386 pmap.   this pmap seems to be the
160  *     Hibler/Jolitz pmap, as modified for FreeBSD by John S. Dyson
161  *     and David Greenman.
162  *
163  * [3] the Mach pmap.   this pmap, from CMU, seems to have migrated
164  *     between several processors.   the VAX version was done by
165  *     Avadis Tevanian, Jr., and Michael Wayne Young.    the i386
166  *     version was done by Lance Berc, Mike Kupfer, Bob Baron,
167  *     David Golub, and Richard Draves.    the alpha version was
168  *     done by Alessandro Forin (CMU/Mach) and Chris Demetriou
169  *     (NetBSD/alpha).
170  */
171 
172 #include <sys/cdefs.h>
173 __KERNEL_RCSID(0, "$NetBSD: pmap.c,v 1.267 2017/11/22 21:26:01 christos Exp $");
174 
175 #include "opt_user_ldt.h"
176 #include "opt_lockdebug.h"
177 #include "opt_multiprocessor.h"
178 #include "opt_xen.h"
179 
180 #include <sys/param.h>
181 #include <sys/systm.h>
182 #include <sys/proc.h>
183 #include <sys/pool.h>
184 #include <sys/kernel.h>
185 #include <sys/atomic.h>
186 #include <sys/cpu.h>
187 #include <sys/intr.h>
188 #include <sys/xcall.h>
189 #include <sys/kcore.h>
190 
191 #include <uvm/uvm.h>
192 #include <uvm/pmap/pmap_pvt.h>
193 
194 #include <dev/isa/isareg.h>
195 
196 #include <machine/specialreg.h>
197 #include <machine/gdt.h>
198 #include <machine/isa_machdep.h>
199 #include <machine/cpuvar.h>
200 #include <machine/cputypes.h>
201 
202 #include <x86/pmap.h>
203 #include <x86/pmap_pv.h>
204 
205 #include <x86/i82489reg.h>
206 #include <x86/i82489var.h>
207 
208 #ifdef XEN
209 #include <xen/xen-public/xen.h>
210 #include <xen/hypervisor.h>
211 #endif
212 
213 /*
214  * general info:
215  *
216  *  - for an explanation of how the i386 MMU hardware works see
217  *    the comments in <machine/pte.h>.
218  *
219  *  - for an explanation of the general memory structure used by
220  *    this pmap (including the recursive mapping), see the comments
221  *    in <machine/pmap.h>.
222  *
223  * this file contains the code for the "pmap module."   the module's
224  * job is to manage the hardware's virtual to physical address mappings.
225  * note that there are two levels of mapping in the VM system:
226  *
227  *  [1] the upper layer of the VM system uses vm_map's and vm_map_entry's
228  *      to map ranges of virtual address space to objects/files.  for
229  *      example, the vm_map may say: "map VA 0x1000 to 0x22000 read-only
230  *      to the file /bin/ls starting at offset zero."   note that
231  *      the upper layer mapping is not concerned with how individual
232  *      vm_pages are mapped.
233  *
234  *  [2] the lower layer of the VM system (the pmap) maintains the mappings
235  *      from virtual addresses.   it is concerned with which vm_page is
236  *      mapped where.   for example, when you run /bin/ls and start
237  *      at page 0x1000 the fault routine may lookup the correct page
238  *      of the /bin/ls file and then ask the pmap layer to establish
239  *      a mapping for it.
240  *
241  * note that information in the lower layer of the VM system can be
242  * thrown away since it can easily be reconstructed from the info
243  * in the upper layer.
244  *
245  * data structures we use include:
246  *
247  *  - struct pmap: describes the address space of one thread
248  *  - struct pmap_page: describes one pv-tracked page, without
249  *	necessarily a corresponding vm_page
250  *  - struct pv_entry: describes one <PMAP,VA> mapping of a PA
251  *  - struct pv_head: there is one pv_head per pv-tracked page of
252  *	physical memory.   the pv_head points to a list of pv_entry
253  *	structures which describe all the <PMAP,VA> pairs that this
254  *      page is mapped in.    this is critical for page based operations
255  *      such as pmap_page_protect() [change protection on _all_ mappings
256  *      of a page]
257  */
258 
259 /*
260  * memory allocation
261  *
262  *  - there are three data structures that we must dynamically allocate:
263  *
264  * [A] new process' page directory page (PDP)
265  *	- plan 1: done at pmap_create() we use
266  *	  uvm_km_alloc(kernel_map, PAGE_SIZE)  [fka kmem_alloc] to do this
267  *	  allocation.
268  *
269  * if we are low in free physical memory then we sleep in
270  * uvm_km_alloc -- in this case this is ok since we are creating
271  * a new pmap and should not be holding any locks.
272  *
273  * if the kernel is totally out of virtual space
274  * (i.e. uvm_km_alloc returns NULL), then we panic.
275  *
276  * [B] new page tables pages (PTP)
277  * 	- call uvm_pagealloc()
278  * 		=> success: zero page, add to pm_pdir
279  * 		=> failure: we are out of free vm_pages, let pmap_enter()
280  *		   tell UVM about it.
281  *
282  * note: for kernel PTPs, we start with NKPTP of them.   as we map
283  * kernel memory (at uvm_map time) we check to see if we've grown
284  * the kernel pmap.   if so, we call the optional function
285  * pmap_growkernel() to grow the kernel PTPs in advance.
286  *
287  * [C] pv_entry structures
288  */
289 
290 /*
291  * locking
292  *
293  * we have the following locks that we must contend with:
294  *
295  * mutexes:
296  *
297  * - pmap lock (per pmap, part of uvm_object)
298  *   this lock protects the fields in the pmap structure including
299  *   the non-kernel PDEs in the PDP, and the PTEs.  it also locks
300  *   in the alternate PTE space (since that is determined by the
301  *   entry in the PDP).
302  *
303  * - pvh_lock (per pv_head)
304  *   this lock protects the pv_entry list which is chained off the
305  *   pv_head structure for a specific pv-tracked PA.   it is locked
306  *   when traversing the list (e.g. adding/removing mappings,
307  *   syncing R/M bits, etc.)
308  *
309  * - pmaps_lock
310  *   this lock protects the list of active pmaps (headed by "pmaps").
311  *   we lock it when adding or removing pmaps from this list.
312  */
313 
314 const vaddr_t ptp_masks[] = PTP_MASK_INITIALIZER;
315 const int ptp_shifts[] = PTP_SHIFT_INITIALIZER;
316 const long nkptpmax[] = NKPTPMAX_INITIALIZER;
317 const long nbpd[] = NBPD_INITIALIZER;
318 pd_entry_t * const normal_pdes[] = PDES_INITIALIZER;
319 
320 long nkptp[] = NKPTP_INITIALIZER;
321 
322 struct pmap_head pmaps;
323 kmutex_t pmaps_lock;
324 
325 static vaddr_t pmap_maxkvaddr;
326 
327 /*
328  * XXX kludge: dummy locking to make KASSERTs in uvm_page.c comfortable.
329  * actual locking is done by pm_lock.
330  */
331 #if defined(DIAGNOSTIC)
332 #define	PMAP_SUBOBJ_LOCK(pm, idx) \
333 	KASSERT(mutex_owned((pm)->pm_lock)); \
334 	if ((idx) != 0) \
335 		mutex_enter((pm)->pm_obj[(idx)].vmobjlock)
336 #define	PMAP_SUBOBJ_UNLOCK(pm, idx) \
337 	KASSERT(mutex_owned((pm)->pm_lock)); \
338 	if ((idx) != 0) \
339 		mutex_exit((pm)->pm_obj[(idx)].vmobjlock)
340 #else /* defined(DIAGNOSTIC) */
341 #define	PMAP_SUBOBJ_LOCK(pm, idx)	/* nothing */
342 #define	PMAP_SUBOBJ_UNLOCK(pm, idx)	/* nothing */
343 #endif /* defined(DIAGNOSTIC) */
344 
345 /*
346  * Misc. event counters.
347  */
348 struct evcnt pmap_iobmp_evcnt;
349 struct evcnt pmap_ldt_evcnt;
350 
351 /*
352  * PAT
353  */
354 #define	PATENTRY(n, type)	(type << ((n) * 8))
355 #define	PAT_UC		0x0ULL
356 #define	PAT_WC		0x1ULL
357 #define	PAT_WT		0x4ULL
358 #define	PAT_WP		0x5ULL
359 #define	PAT_WB		0x6ULL
360 #define	PAT_UCMINUS	0x7ULL
361 
362 static bool cpu_pat_enabled __read_mostly = false;
363 
364 /*
365  * Global data structures
366  */
367 
368 static struct pmap kernel_pmap_store;	/* the kernel's pmap (proc0) */
369 struct pmap *const kernel_pmap_ptr = &kernel_pmap_store;
370 
371 struct bootspace bootspace __read_mostly;
372 
373 /*
374  * pmap_pg_nx: if our processor supports PG_NX in the PTE then we
375  * set pmap_pg_nx to PG_NX (otherwise it is zero).
376  */
377 pd_entry_t pmap_pg_nx __read_mostly = 0;
378 
379 /*
380  * pmap_pg_g: if our processor supports PG_G in the PTE then we
381  * set pmap_pg_g to PG_G (otherwise it is zero).
382  */
383 pd_entry_t pmap_pg_g __read_mostly = 0;
384 
385 /*
386  * pmap_largepages: if our processor supports PG_PS and we are
387  * using it, this is set to true.
388  */
389 int pmap_largepages __read_mostly = 0;
390 
391 /*
392  * i386 physical memory comes in a big contig chunk with a small
393  * hole toward the front of it...  the following two paddr_t's
394  * (shared with machdep.c) describe the physical address space
395  * of this machine.
396  */
397 paddr_t lowmem_rsvd __read_mostly;
398 paddr_t avail_start __read_mostly; /* PA of first available physical page */
399 paddr_t avail_end __read_mostly; /* PA of last available physical page */
400 
401 #ifdef XEN
402 paddr_t pmap_pa_start; /* PA of first physical page for this domain */
403 paddr_t pmap_pa_end;   /* PA of last physical page for this domain */
404 #endif
405 
406 #define	VM_PAGE_TO_PP(pg)	(&(pg)->mdpage.mp_pp)
407 
408 #define	PV_HASH_SIZE		32768
409 #define	PV_HASH_LOCK_CNT	32
410 
411 struct pv_hash_lock {
412 	kmutex_t lock;
413 } __aligned(CACHE_LINE_SIZE) pv_hash_locks[PV_HASH_LOCK_CNT]
414     __aligned(CACHE_LINE_SIZE);
415 
416 struct pv_hash_head {
417 	SLIST_HEAD(, pv_entry) hh_list;
418 } pv_hash_heads[PV_HASH_SIZE];
419 
420 static u_int
421 pvhash_hash(struct vm_page *ptp, vaddr_t va)
422 {
423 
424 	return (uintptr_t)ptp / sizeof(*ptp) + (va >> PAGE_SHIFT);
425 }
426 
427 static struct pv_hash_head *
428 pvhash_head(u_int hash)
429 {
430 
431 	return &pv_hash_heads[hash % PV_HASH_SIZE];
432 }
433 
434 static kmutex_t *
435 pvhash_lock(u_int hash)
436 {
437 
438 	return &pv_hash_locks[hash % PV_HASH_LOCK_CNT].lock;
439 }
440 
441 static struct pv_entry *
442 pvhash_remove(struct pv_hash_head *hh, struct vm_page *ptp, vaddr_t va)
443 {
444 	struct pv_entry *pve;
445 	struct pv_entry *prev;
446 
447 	prev = NULL;
448 	SLIST_FOREACH(pve, &hh->hh_list, pve_hash) {
449 		if (pve->pve_pte.pte_ptp == ptp &&
450 		    pve->pve_pte.pte_va == va) {
451 			if (prev != NULL) {
452 				SLIST_REMOVE_AFTER(prev, pve_hash);
453 			} else {
454 				SLIST_REMOVE_HEAD(&hh->hh_list, pve_hash);
455 			}
456 			break;
457 		}
458 		prev = pve;
459 	}
460 	return pve;
461 }
462 
463 /*
464  * Other data structures
465  */
466 
467 static pt_entry_t protection_codes[8] __read_mostly;
468 
469 static bool pmap_initialized __read_mostly = false; /* pmap_init done yet? */
470 
471 /*
472  * The following two vaddr_t's are used during system startup to keep track of
473  * how much of the kernel's VM space we have used. Once the system is started,
474  * the management of the remaining kernel VM space is turned over to the
475  * kernel_map vm_map.
476  */
477 static vaddr_t virtual_avail __read_mostly;	/* VA of first free KVA */
478 static vaddr_t virtual_end __read_mostly;	/* VA of last free KVA */
479 
480 #ifndef XEN
481 /*
482  * LAPIC virtual address, and fake physical address.
483  */
484 volatile vaddr_t local_apic_va __read_mostly;
485 paddr_t local_apic_pa __read_mostly;
486 #endif
487 
488 /*
489  * pool that pmap structures are allocated from
490  */
491 static struct pool_cache pmap_cache;
492 
493 /*
494  * pv_entry cache
495  */
496 static struct pool_cache pmap_pv_cache;
497 
498 #ifndef __HAVE_DIRECT_MAP
499 /*
500  * Special VAs and the PTEs that map them
501  */
502 static pt_entry_t *early_zero_pte;
503 static void pmap_vpage_cpualloc(struct cpu_info *);
504 #ifdef XEN
505 char *early_zerop; /* also referenced from xen_locore() */
506 #else
507 static char *early_zerop;
508 #endif
509 #endif
510 
511 int pmap_enter_default(pmap_t, vaddr_t, paddr_t, vm_prot_t, u_int);
512 
513 /* PDP pool_cache(9) and its callbacks */
514 struct pool_cache pmap_pdp_cache;
515 static int  pmap_pdp_ctor(void *, void *, int);
516 static void pmap_pdp_dtor(void *, void *);
517 #ifdef PAE
518 /* need to allocate items of 4 pages */
519 static void *pmap_pdp_alloc(struct pool *, int);
520 static void pmap_pdp_free(struct pool *, void *);
521 static struct pool_allocator pmap_pdp_allocator = {
522 	.pa_alloc = pmap_pdp_alloc,
523 	.pa_free = pmap_pdp_free,
524 	.pa_pagesz = PAGE_SIZE * PDP_SIZE,
525 };
526 #endif /* PAE */
527 
528 extern vaddr_t idt_vaddr;
529 extern paddr_t idt_paddr;
530 extern vaddr_t gdt_vaddr;
531 extern paddr_t gdt_paddr;
532 extern vaddr_t ldt_vaddr;
533 extern paddr_t ldt_paddr;
534 
535 extern int end;
536 
537 #ifdef i386
538 /* stuff to fix the pentium f00f bug */
539 extern vaddr_t pentium_idt_vaddr;
540 #endif
541 
542 /*
543  * Local prototypes
544  */
545 
546 #ifdef __HAVE_DIRECT_MAP
547 static void pmap_init_directmap(struct pmap *);
548 #endif
549 #ifndef XEN
550 static void pmap_init_lapic(void);
551 static void pmap_remap_global(void);
552 static void pmap_remap_largepages(void);
553 #endif
554 
555 static struct vm_page *pmap_get_ptp(struct pmap *, vaddr_t,
556     pd_entry_t * const *, int);
557 static struct vm_page *pmap_find_ptp(struct pmap *, vaddr_t, paddr_t, int);
558 static void pmap_freepage(struct pmap *, struct vm_page *, int);
559 static void pmap_free_ptp(struct pmap *, struct vm_page *, vaddr_t,
560     pt_entry_t *, pd_entry_t * const *);
561 static bool pmap_remove_pte(struct pmap *, struct vm_page *, pt_entry_t *,
562     vaddr_t, struct pv_entry **);
563 static void pmap_remove_ptes(struct pmap *, struct vm_page *, vaddr_t, vaddr_t,
564     vaddr_t, struct pv_entry **);
565 
566 static paddr_t pmap_get_physpage(void);
567 static void pmap_alloc_level(struct pmap *, vaddr_t, long *);
568 
569 static bool pmap_reactivate(struct pmap *);
570 
571 /*
572  * p m a p   h e l p e r   f u n c t i o n s
573  */
574 
575 static inline void
576 pmap_stats_update(struct pmap *pmap, int resid_diff, int wired_diff)
577 {
578 
579 	if (pmap == pmap_kernel()) {
580 		atomic_add_long(&pmap->pm_stats.resident_count, resid_diff);
581 		atomic_add_long(&pmap->pm_stats.wired_count, wired_diff);
582 	} else {
583 		KASSERT(mutex_owned(pmap->pm_lock));
584 		pmap->pm_stats.resident_count += resid_diff;
585 		pmap->pm_stats.wired_count += wired_diff;
586 	}
587 }
588 
589 static inline void
590 pmap_stats_update_bypte(struct pmap *pmap, pt_entry_t npte, pt_entry_t opte)
591 {
592 	int resid_diff = ((npte & PG_V) ? 1 : 0) - ((opte & PG_V) ? 1 : 0);
593 	int wired_diff = ((npte & PG_W) ? 1 : 0) - ((opte & PG_W) ? 1 : 0);
594 
595 	KASSERT((npte & (PG_V | PG_W)) != PG_W);
596 	KASSERT((opte & (PG_V | PG_W)) != PG_W);
597 
598 	pmap_stats_update(pmap, resid_diff, wired_diff);
599 }
600 
601 /*
602  * ptp_to_pmap: lookup pmap by ptp
603  */
604 
605 static struct pmap *
606 ptp_to_pmap(struct vm_page *ptp)
607 {
608 	struct pmap *pmap;
609 
610 	if (ptp == NULL) {
611 		return pmap_kernel();
612 	}
613 	pmap = (struct pmap *)ptp->uobject;
614 	KASSERT(pmap != NULL);
615 	KASSERT(&pmap->pm_obj[0] == ptp->uobject);
616 	return pmap;
617 }
618 
619 static inline struct pv_pte *
620 pve_to_pvpte(struct pv_entry *pve)
621 {
622 
623 	KASSERT((void *)&pve->pve_pte == (void *)pve);
624 	return &pve->pve_pte;
625 }
626 
627 static inline struct pv_entry *
628 pvpte_to_pve(struct pv_pte *pvpte)
629 {
630 	struct pv_entry *pve = (void *)pvpte;
631 
632 	KASSERT(pve_to_pvpte(pve) == pvpte);
633 	return pve;
634 }
635 
636 /*
637  * pv_pte_first, pv_pte_next: PV list iterator.
638  */
639 
640 static struct pv_pte *
641 pv_pte_first(struct pmap_page *pp)
642 {
643 
644 	if ((pp->pp_flags & PP_EMBEDDED) != 0) {
645 		return &pp->pp_pte;
646 	}
647 	return pve_to_pvpte(LIST_FIRST(&pp->pp_head.pvh_list));
648 }
649 
650 static struct pv_pte *
651 pv_pte_next(struct pmap_page *pp, struct pv_pte *pvpte)
652 {
653 
654 	KASSERT(pvpte != NULL);
655 	if (pvpte == &pp->pp_pte) {
656 		KASSERT((pp->pp_flags & PP_EMBEDDED) != 0);
657 		return NULL;
658 	}
659 	KASSERT((pp->pp_flags & PP_EMBEDDED) == 0);
660 	return pve_to_pvpte(LIST_NEXT(pvpte_to_pve(pvpte), pve_list));
661 }
662 
663 /*
664  * pmap_is_curpmap: is this pmap the one currently loaded [in %cr3]?
665  *		of course the kernel is always loaded
666  */
667 
668 bool
669 pmap_is_curpmap(struct pmap *pmap)
670 {
671 	return((pmap == pmap_kernel()) ||
672 	       (pmap == curcpu()->ci_pmap));
673 }
674 
675 /*
676  *	Add a reference to the specified pmap.
677  */
678 
679 void
680 pmap_reference(struct pmap *pmap)
681 {
682 
683 	atomic_inc_uint(&pmap->pm_obj[0].uo_refs);
684 }
685 
686 /*
687  * pmap_map_ptes: map a pmap's PTEs into KVM and lock them in
688  *
689  * there are several pmaps involved.  some or all of them might be same.
690  *
691  *	- the pmap given by the first argument
692  *		our caller wants to access this pmap's PTEs.
693  *
694  *	- pmap_kernel()
695  *		the kernel pmap.  note that it only contains the kernel part
696  *		of the address space which is shared by any pmap.  ie. any
697  *		pmap can be used instead of pmap_kernel() for our purpose.
698  *
699  *	- ci->ci_pmap
700  *		pmap currently loaded on the cpu.
701  *
702  *	- vm_map_pmap(&curproc->p_vmspace->vm_map)
703  *		current process' pmap.
704  *
705  * => we lock enough pmaps to keep things locked in
706  * => must be undone with pmap_unmap_ptes before returning
707  */
708 
709 void
710 pmap_map_ptes(struct pmap *pmap, struct pmap **pmap2,
711 	      pd_entry_t **ptepp, pd_entry_t * const **pdeppp)
712 {
713 	struct pmap *curpmap;
714 	struct cpu_info *ci;
715 	lwp_t *l;
716 
717 	/* The kernel's pmap is always accessible. */
718 	if (pmap == pmap_kernel()) {
719 		*pmap2 = NULL;
720 		*ptepp = PTE_BASE;
721 		*pdeppp = normal_pdes;
722 		return;
723 	}
724 	KASSERT(kpreempt_disabled());
725 
726 	l = curlwp;
727  retry:
728 	mutex_enter(pmap->pm_lock);
729 	ci = curcpu();
730 	curpmap = ci->ci_pmap;
731 	if (vm_map_pmap(&l->l_proc->p_vmspace->vm_map) == pmap) {
732 		/* Our own pmap so just load it: easy. */
733 		if (__predict_false(ci->ci_want_pmapload)) {
734 			mutex_exit(pmap->pm_lock);
735 			pmap_load();
736 			goto retry;
737 		}
738 		KASSERT(pmap == curpmap);
739 	} else if (pmap == curpmap) {
740 		/*
741 		 * Already on the CPU: make it valid.  This is very
742 		 * often the case during exit(), when we have switched
743 		 * to the kernel pmap in order to destroy a user pmap.
744 		 */
745 		if (!pmap_reactivate(pmap)) {
746 			u_int gen = uvm_emap_gen_return();
747 			tlbflush();
748 			uvm_emap_update(gen);
749 		}
750 	} else {
751 		/*
752 		 * Toss current pmap from CPU, but keep a reference to it.
753 		 * The reference will be dropped by pmap_unmap_ptes().
754 		 * Can happen if we block during exit().
755 		 */
756 		const cpuid_t cid = cpu_index(ci);
757 
758 		kcpuset_atomic_clear(curpmap->pm_cpus, cid);
759 		kcpuset_atomic_clear(curpmap->pm_kernel_cpus, cid);
760 		ci->ci_pmap = pmap;
761 		ci->ci_tlbstate = TLBSTATE_VALID;
762 		kcpuset_atomic_set(pmap->pm_cpus, cid);
763 		kcpuset_atomic_set(pmap->pm_kernel_cpus, cid);
764 		cpu_load_pmap(pmap, curpmap);
765 	}
766 	pmap->pm_ncsw = l->l_ncsw;
767 	*pmap2 = curpmap;
768 	*ptepp = PTE_BASE;
769 #if defined(XEN) && defined(__x86_64__)
770 	KASSERT(ci->ci_normal_pdes[PTP_LEVELS - 2] == L4_BASE);
771 	ci->ci_normal_pdes[PTP_LEVELS - 2] = pmap->pm_pdir;
772 	*pdeppp = ci->ci_normal_pdes;
773 #else /* XEN && __x86_64__ */
774 	*pdeppp = normal_pdes;
775 #endif /* XEN && __x86_64__ */
776 }
777 
778 /*
779  * pmap_unmap_ptes: unlock the PTE mapping of "pmap"
780  */
781 
782 void
783 pmap_unmap_ptes(struct pmap *pmap, struct pmap *pmap2)
784 {
785 	struct cpu_info *ci;
786 	struct pmap *mypmap;
787 
788 	KASSERT(kpreempt_disabled());
789 
790 	/* The kernel's pmap is always accessible. */
791 	if (pmap == pmap_kernel()) {
792 		return;
793 	}
794 
795 	ci = curcpu();
796 #if defined(XEN) && defined(__x86_64__)
797 	/* Reset per-cpu normal_pdes */
798 	KASSERT(ci->ci_normal_pdes[PTP_LEVELS - 2] != L4_BASE);
799 	ci->ci_normal_pdes[PTP_LEVELS - 2] = L4_BASE;
800 #endif /* XEN && __x86_64__ */
801 	/*
802 	 * We cannot tolerate context switches while mapped in.
803 	 * If it is our own pmap all we have to do is unlock.
804 	 */
805 	KASSERT(pmap->pm_ncsw == curlwp->l_ncsw);
806 	mypmap = vm_map_pmap(&curproc->p_vmspace->vm_map);
807 	if (pmap == mypmap) {
808 		mutex_exit(pmap->pm_lock);
809 		return;
810 	}
811 
812 	/*
813 	 * Mark whatever's on the CPU now as lazy and unlock.
814 	 * If the pmap was already installed, we are done.
815 	 */
816 	ci->ci_tlbstate = TLBSTATE_LAZY;
817 	ci->ci_want_pmapload = (mypmap != pmap_kernel());
818 	mutex_exit(pmap->pm_lock);
819 	if (pmap == pmap2) {
820 		return;
821 	}
822 
823 	/*
824 	 * We installed another pmap on the CPU.  Grab a reference to
825 	 * it and leave in place.  Toss the evicted pmap (can block).
826 	 */
827 	pmap_reference(pmap);
828 	pmap_destroy(pmap2);
829 }
830 
831 
832 inline static void
833 pmap_exec_account(struct pmap *pm, vaddr_t va, pt_entry_t opte, pt_entry_t npte)
834 {
835 
836 #if !defined(__x86_64__)
837 	if (curproc == NULL || curproc->p_vmspace == NULL ||
838 	    pm != vm_map_pmap(&curproc->p_vmspace->vm_map))
839 		return;
840 
841 	if ((opte ^ npte) & PG_X)
842 		pmap_update_pg(va);
843 
844 	/*
845 	 * Executability was removed on the last executable change.
846 	 * Reset the code segment to something conservative and
847 	 * let the trap handler deal with setting the right limit.
848 	 * We can't do that because of locking constraints on the vm map.
849 	 */
850 
851 	if ((opte & PG_X) && (npte & PG_X) == 0 && va == pm->pm_hiexec) {
852 		struct trapframe *tf = curlwp->l_md.md_regs;
853 
854 		tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL);
855 		pm->pm_hiexec = I386_MAX_EXE_ADDR;
856 	}
857 #endif /* !defined(__x86_64__) */
858 }
859 
860 #if !defined(__x86_64__)
861 /*
862  * Fixup the code segment to cover all potential executable mappings.
863  * returns 0 if no changes to the code segment were made.
864  */
865 
866 int
867 pmap_exec_fixup(struct vm_map *map, struct trapframe *tf, struct pcb *pcb)
868 {
869 	struct vm_map_entry *ent;
870 	struct pmap *pm = vm_map_pmap(map);
871 	vaddr_t va = 0;
872 
873 	vm_map_lock_read(map);
874 	for (ent = (&map->header)->next; ent != &map->header; ent = ent->next) {
875 
876 		/*
877 		 * This entry has greater va than the entries before.
878 		 * We need to make it point to the last page, not past it.
879 		 */
880 
881 		if (ent->protection & VM_PROT_EXECUTE)
882 			va = trunc_page(ent->end) - PAGE_SIZE;
883 	}
884 	vm_map_unlock_read(map);
885 	if (va == pm->pm_hiexec && tf->tf_cs == GSEL(GUCODEBIG_SEL, SEL_UPL))
886 		return (0);
887 
888 	pm->pm_hiexec = va;
889 	if (pm->pm_hiexec > I386_MAX_EXE_ADDR) {
890 		tf->tf_cs = GSEL(GUCODEBIG_SEL, SEL_UPL);
891 	} else {
892 		tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL);
893 		return (0);
894 	}
895 	return (1);
896 }
897 #endif /* !defined(__x86_64__) */
898 
899 void
900 pat_init(struct cpu_info *ci)
901 {
902 	uint64_t pat;
903 
904 	if (!(ci->ci_feat_val[0] & CPUID_PAT))
905 		return;
906 
907 	/* We change WT to WC. Leave all other entries the default values. */
908 	pat = PATENTRY(0, PAT_WB) | PATENTRY(1, PAT_WC) |
909 	      PATENTRY(2, PAT_UCMINUS) | PATENTRY(3, PAT_UC) |
910 	      PATENTRY(4, PAT_WB) | PATENTRY(5, PAT_WC) |
911 	      PATENTRY(6, PAT_UCMINUS) | PATENTRY(7, PAT_UC);
912 
913 	wrmsr(MSR_CR_PAT, pat);
914 	cpu_pat_enabled = true;
915 	aprint_debug_dev(ci->ci_dev, "PAT enabled\n");
916 }
917 
918 static pt_entry_t
919 pmap_pat_flags(u_int flags)
920 {
921 	u_int cacheflags = (flags & PMAP_CACHE_MASK);
922 
923 	if (!cpu_pat_enabled) {
924 		switch (cacheflags) {
925 		case PMAP_NOCACHE:
926 		case PMAP_NOCACHE_OVR:
927 			/* results in PGC_UCMINUS on cpus which have
928 			 * the cpuid PAT but PAT "disabled"
929 			 */
930 			return PG_N;
931 		default:
932 			return 0;
933 		}
934 	}
935 
936 	switch (cacheflags) {
937 	case PMAP_NOCACHE:
938 		return PGC_UC;
939 	case PMAP_WRITE_COMBINE:
940 		return PGC_WC;
941 	case PMAP_WRITE_BACK:
942 		return PGC_WB;
943 	case PMAP_NOCACHE_OVR:
944 		return PGC_UCMINUS;
945 	}
946 
947 	return 0;
948 }
949 
950 /*
951  * p m a p   k e n t e r   f u n c t i o n s
952  *
953  * functions to quickly enter/remove pages from the kernel address
954  * space.   pmap_kremove is exported to MI kernel.  we make use of
955  * the recursive PTE mappings.
956  */
957 
958 /*
959  * pmap_kenter_pa: enter a kernel mapping without R/M (pv_entry) tracking
960  *
961  * => no need to lock anything, assume va is already allocated
962  * => should be faster than normal pmap enter function
963  */
964 
965 void
966 pmap_kenter_pa(vaddr_t va, paddr_t pa, vm_prot_t prot, u_int flags)
967 {
968 	pt_entry_t *pte, opte, npte;
969 
970 	KASSERT(!(prot & ~VM_PROT_ALL));
971 
972 	if (va < VM_MIN_KERNEL_ADDRESS)
973 		pte = vtopte(va);
974 	else
975 		pte = kvtopte(va);
976 #ifdef DOM0OPS
977 	if (pa < pmap_pa_start || pa >= pmap_pa_end) {
978 #ifdef DEBUG
979 		printf_nolog("%s: pa %#" PRIxPADDR " for va %#" PRIxVADDR
980 		    " outside range\n", __func__, pa, va);
981 #endif /* DEBUG */
982 		npte = pa;
983 	} else
984 #endif /* DOM0OPS */
985 		npte = pmap_pa2pte(pa);
986 	npte |= protection_codes[prot] | PG_V | pmap_pg_g;
987 	npte |= pmap_pat_flags(flags);
988 	opte = pmap_pte_testset(pte, npte); /* zap! */
989 
990 	/*
991 	 * XXX: make sure we are not dealing with a large page, since the only
992 	 * large pages created are for the kernel image, and they should never
993 	 * be kentered.
994 	 */
995 	KASSERTMSG(!(opte & PG_PS), "PG_PS va=%#"PRIxVADDR, va);
996 
997 	if ((opte & (PG_V | PG_U)) == (PG_V | PG_U)) {
998 		/* This should not happen. */
999 		printf_nolog("%s: mapping already present\n", __func__);
1000 		kpreempt_disable();
1001 		pmap_tlb_shootdown(pmap_kernel(), va, opte, TLBSHOOT_KENTER);
1002 		kpreempt_enable();
1003 	}
1004 }
1005 
1006 void
1007 pmap_emap_enter(vaddr_t va, paddr_t pa, vm_prot_t prot)
1008 {
1009 	pt_entry_t *pte, npte;
1010 
1011 	KASSERT((prot & ~VM_PROT_ALL) == 0);
1012 	pte = (va < VM_MIN_KERNEL_ADDRESS) ? vtopte(va) : kvtopte(va);
1013 
1014 #ifdef DOM0OPS
1015 	if (pa < pmap_pa_start || pa >= pmap_pa_end) {
1016 		npte = pa;
1017 	} else
1018 #endif
1019 		npte = pmap_pa2pte(pa);
1020 
1021 	npte = pmap_pa2pte(pa);
1022 	npte |= protection_codes[prot] | PG_V;
1023 	pmap_pte_set(pte, npte);
1024 	pmap_pte_flush();
1025 }
1026 
1027 /*
1028  * pmap_emap_sync: perform TLB flush or pmap load, if it was deferred.
1029  */
1030 void
1031 pmap_emap_sync(bool canload)
1032 {
1033 	struct cpu_info *ci = curcpu();
1034 	struct pmap *pmap;
1035 
1036 	KASSERT(kpreempt_disabled());
1037 	if (__predict_true(ci->ci_want_pmapload && canload)) {
1038 		/*
1039 		 * XXX: Hint for pmap_reactivate(), which might suggest to
1040 		 * not perform TLB flush, if state has not changed.
1041 		 */
1042 		pmap = vm_map_pmap(&curlwp->l_proc->p_vmspace->vm_map);
1043 		if (__predict_false(pmap == ci->ci_pmap)) {
1044 			kcpuset_atomic_clear(pmap->pm_cpus, cpu_index(ci));
1045 		}
1046 		pmap_load();
1047 		KASSERT(ci->ci_want_pmapload == 0);
1048 	} else {
1049 		tlbflush();
1050 	}
1051 }
1052 
1053 void
1054 pmap_emap_remove(vaddr_t sva, vsize_t len)
1055 {
1056 	pt_entry_t *pte;
1057 	vaddr_t va, eva = sva + len;
1058 
1059 	for (va = sva; va < eva; va += PAGE_SIZE) {
1060 		pte = (va < VM_MIN_KERNEL_ADDRESS) ? vtopte(va) : kvtopte(va);
1061 		pmap_pte_set(pte, 0);
1062 	}
1063 
1064 	pmap_pte_flush();
1065 }
1066 
1067 __strict_weak_alias(pmap_kenter_ma, pmap_kenter_pa);
1068 
1069 #if defined(__x86_64__)
1070 /*
1071  * Change protection for a virtual address. Local for a CPU only, don't
1072  * care about TLB shootdowns.
1073  *
1074  * => must be called with preemption disabled
1075  */
1076 void
1077 pmap_changeprot_local(vaddr_t va, vm_prot_t prot)
1078 {
1079 	pt_entry_t *pte, opte, npte;
1080 
1081 	KASSERT(kpreempt_disabled());
1082 
1083 	if (va < VM_MIN_KERNEL_ADDRESS)
1084 		pte = vtopte(va);
1085 	else
1086 		pte = kvtopte(va);
1087 
1088 	npte = opte = *pte;
1089 
1090 	if ((prot & VM_PROT_WRITE) != 0)
1091 		npte |= PG_RW;
1092 	else
1093 		npte &= ~PG_RW;
1094 
1095 	if (opte != npte) {
1096 		pmap_pte_set(pte, npte);
1097 		pmap_pte_flush();
1098 		invlpg(va);
1099 	}
1100 }
1101 #endif /* defined(__x86_64__) */
1102 
1103 /*
1104  * pmap_kremove: remove a kernel mapping(s) without R/M (pv_entry) tracking
1105  *
1106  * => no need to lock anything
1107  * => caller must dispose of any vm_page mapped in the va range
1108  * => note: not an inline function
1109  * => we assume the va is page aligned and the len is a multiple of PAGE_SIZE
1110  * => we assume kernel only unmaps valid addresses and thus don't bother
1111  *    checking the valid bit before doing TLB flushing
1112  * => must be followed by call to pmap_update() before reuse of page
1113  */
1114 
1115 static inline void
1116 pmap_kremove1(vaddr_t sva, vsize_t len, bool localonly)
1117 {
1118 	pt_entry_t *pte, opte;
1119 	vaddr_t va, eva;
1120 
1121 	eva = sva + len;
1122 
1123 	kpreempt_disable();
1124 	for (va = sva; va < eva; va += PAGE_SIZE) {
1125 		pte = kvtopte(va);
1126 		opte = pmap_pte_testset(pte, 0); /* zap! */
1127 		if ((opte & (PG_V | PG_U)) == (PG_V | PG_U) && !localonly) {
1128 			pmap_tlb_shootdown(pmap_kernel(), va, opte,
1129 			    TLBSHOOT_KREMOVE);
1130 		}
1131 		KASSERTMSG((opte & PG_PS) == 0,
1132 		    "va %#" PRIxVADDR " is a large page", va);
1133 		KASSERTMSG((opte & PG_PVLIST) == 0,
1134 		    "va %#" PRIxVADDR " is a pv tracked page", va);
1135 	}
1136 	if (localonly) {
1137 		tlbflushg();
1138 	}
1139 	kpreempt_enable();
1140 }
1141 
1142 void
1143 pmap_kremove(vaddr_t sva, vsize_t len)
1144 {
1145 
1146 	pmap_kremove1(sva, len, false);
1147 }
1148 
1149 /*
1150  * pmap_kremove_local: like pmap_kremove(), but only worry about
1151  * TLB invalidations on the current CPU.  this is only intended
1152  * for use while writing kernel crash dumps, either after panic
1153  * or via reboot -d.
1154  */
1155 
1156 void
1157 pmap_kremove_local(vaddr_t sva, vsize_t len)
1158 {
1159 
1160 	pmap_kremove1(sva, len, true);
1161 }
1162 
1163 /*
1164  * p m a p   i n i t   f u n c t i o n s
1165  *
1166  * pmap_bootstrap and pmap_init are called during system startup
1167  * to init the pmap module.   pmap_bootstrap() does a low level
1168  * init just to get things rolling.   pmap_init() finishes the job.
1169  */
1170 
1171 /*
1172  * pmap_bootstrap_valloc: allocate a virtual address in the bootstrap area.
1173  * This function is to be used before any VM system has been set up.
1174  *
1175  * The va is taken from virtual_avail.
1176  */
1177 static vaddr_t
1178 pmap_bootstrap_valloc(size_t npages)
1179 {
1180 	vaddr_t va = virtual_avail;
1181 	virtual_avail += npages * PAGE_SIZE;
1182 	return va;
1183 }
1184 
1185 /*
1186  * pmap_bootstrap_palloc: allocate a physical address in the bootstrap area.
1187  * This function is to be used before any VM system has been set up.
1188  *
1189  * The pa is taken from avail_start.
1190  */
1191 static paddr_t
1192 pmap_bootstrap_palloc(size_t npages)
1193 {
1194 	paddr_t pa = avail_start;
1195 	avail_start += npages * PAGE_SIZE;
1196 	return pa;
1197 }
1198 
1199 /*
1200  * pmap_bootstrap: get the system in a state where it can run with VM properly
1201  * enabled (called before main()). The VM system is fully init'd later.
1202  *
1203  * => on i386, locore.S has already enabled the MMU by allocating a PDP for the
1204  *    kernel, and nkpde PTP's for the kernel.
1205  * => kva_start is the first free virtual address in kernel space.
1206  */
1207 void
1208 pmap_bootstrap(vaddr_t kva_start)
1209 {
1210 	struct pmap *kpm;
1211 	int i;
1212 	vaddr_t kva;
1213 
1214 	pmap_pg_nx = (cpu_feature[2] & CPUID_NOX ? PG_NX : 0);
1215 
1216 	/*
1217 	 * Set up our local static global vars that keep track of the usage of
1218 	 * KVM before kernel_map is set up.
1219 	 */
1220 	virtual_avail = kva_start;		/* first free KVA */
1221 	virtual_end = VM_MAX_KERNEL_ADDRESS;	/* last KVA */
1222 
1223 	/*
1224 	 * Set up protection_codes: we need to be able to convert from a MI
1225 	 * protection code (some combo of VM_PROT...) to something we can jam
1226 	 * into a x86 PTE.
1227 	 */
1228 	protection_codes[VM_PROT_NONE] = pmap_pg_nx;
1229 	protection_codes[VM_PROT_EXECUTE] = PG_RO | PG_X;
1230 	protection_codes[VM_PROT_READ] = PG_RO | pmap_pg_nx;
1231 	protection_codes[VM_PROT_READ|VM_PROT_EXECUTE] = PG_RO | PG_X;
1232 	protection_codes[VM_PROT_WRITE] = PG_RW | pmap_pg_nx;
1233 	protection_codes[VM_PROT_WRITE|VM_PROT_EXECUTE] = PG_RW | PG_X;
1234 	protection_codes[VM_PROT_WRITE|VM_PROT_READ] = PG_RW | pmap_pg_nx;
1235 	protection_codes[VM_PROT_ALL] = PG_RW | PG_X;
1236 
1237 	/*
1238 	 * Now we init the kernel's pmap.
1239 	 *
1240 	 * The kernel pmap's pm_obj is not used for much. However, in user pmaps
1241 	 * the pm_obj contains the list of active PTPs.
1242 	 *
1243 	 * The pm_obj currently does not have a pager. It might be possible to
1244 	 * add a pager that would allow a process to read-only mmap its own page
1245 	 * tables (fast user-level vtophys?). This may or may not be useful.
1246 	 */
1247 	kpm = pmap_kernel();
1248 	for (i = 0; i < PTP_LEVELS - 1; i++) {
1249 		mutex_init(&kpm->pm_obj_lock[i], MUTEX_DEFAULT, IPL_NONE);
1250 		uvm_obj_init(&kpm->pm_obj[i], NULL, false, 1);
1251 		uvm_obj_setlock(&kpm->pm_obj[i], &kpm->pm_obj_lock[i]);
1252 		kpm->pm_ptphint[i] = NULL;
1253 	}
1254 	memset(&kpm->pm_list, 0, sizeof(kpm->pm_list));  /* pm_list not used */
1255 
1256 	kpm->pm_pdir = (pd_entry_t *)bootspace.pdir;
1257 	for (i = 0; i < PDP_SIZE; i++)
1258 		kpm->pm_pdirpa[i] = PDPpaddr + PAGE_SIZE * i;
1259 
1260 	kpm->pm_stats.wired_count = kpm->pm_stats.resident_count =
1261 		x86_btop(kva_start - VM_MIN_KERNEL_ADDRESS);
1262 
1263 	kcpuset_create(&kpm->pm_cpus, true);
1264 	kcpuset_create(&kpm->pm_kernel_cpus, true);
1265 
1266 	kpm->pm_ldt = NULL;
1267 	kpm->pm_ldt_len = 0;
1268 	kpm->pm_ldt_sel = GSYSSEL(GLDT_SEL, SEL_KPL);
1269 
1270 	/*
1271 	 * the above is just a rough estimate and not critical to the proper
1272 	 * operation of the system.
1273 	 */
1274 
1275 #ifndef XEN
1276 	/*
1277 	 * Begin to enable global TLB entries if they are supported.
1278 	 * The G bit has no effect until the CR4_PGE bit is set in CR4,
1279 	 * which happens in cpu_init(), which is run on each cpu
1280 	 * (and happens later)
1281 	 */
1282 	if (cpu_feature[0] & CPUID_PGE) {
1283 		pmap_pg_g = PG_G;		/* enable software */
1284 
1285 		/* add PG_G attribute to already mapped kernel pages */
1286 		pmap_remap_global();
1287 	}
1288 
1289 	/*
1290 	 * Enable large pages if they are supported.
1291 	 */
1292 	if (cpu_feature[0] & CPUID_PSE) {
1293 		lcr4(rcr4() | CR4_PSE);	/* enable hardware (via %cr4) */
1294 		pmap_largepages = 1;	/* enable software */
1295 
1296 		/*
1297 		 * The TLB must be flushed after enabling large pages on Pentium
1298 		 * CPUs, according to section 3.6.2.2 of "Intel Architecture
1299 		 * Software Developer's Manual, Volume 3: System Programming".
1300 		 */
1301 		tlbflushg();
1302 
1303 		/* Remap the kernel. */
1304 		pmap_remap_largepages();
1305 	}
1306 	pmap_init_lapic();
1307 #endif /* !XEN */
1308 
1309 #ifdef __HAVE_DIRECT_MAP
1310 	pmap_init_directmap(kpm);
1311 #else
1312 	pmap_vpage_cpualloc(&cpu_info_primary);
1313 
1314 	if (VM_MIN_KERNEL_ADDRESS == KERNBASE) { /* i386 */
1315 		early_zerop = (void *)cpu_info_primary.vpage[VPAGE_ZER];
1316 		early_zero_pte = cpu_info_primary.vpage_pte[VPAGE_ZER];
1317 	} else { /* amd64 */
1318 		/*
1319 		 * zero_pte is stuck at the end of mapped space for the kernel
1320 		 * image (disjunct from kva space). This is done so that it
1321 		 * can safely be used in pmap_growkernel (pmap_get_physpage),
1322 		 * when it's called for the first time.
1323 		 * XXXfvdl fix this for MULTIPROCESSOR later.
1324 		 */
1325 #ifdef XEN
1326 		/* early_zerop initialized in xen_locore() */
1327 #else
1328 		early_zerop = (void *)bootspace.spareva;
1329 #endif
1330 		early_zero_pte = PTE_BASE + pl1_i((vaddr_t)early_zerop);
1331 	}
1332 #endif
1333 
1334 #if defined(XEN) && defined(__x86_64__)
1335 	extern vaddr_t xen_dummy_page;
1336 	paddr_t xen_dummy_user_pgd;
1337 
1338 	/*
1339 	 * We want a dummy page directory for Xen: when deactivating a pmap,
1340 	 * Xen will still consider it active. So we set user PGD to this one
1341 	 * to lift all protection on the now inactive page tables set.
1342 	 */
1343 	xen_dummy_user_pgd = xen_dummy_page - KERNBASE;
1344 
1345 	/* Zero fill it, the less checks in Xen it requires the better */
1346 	memset((void *)(xen_dummy_user_pgd + KERNBASE), 0, PAGE_SIZE);
1347 	/* Mark read-only */
1348 	HYPERVISOR_update_va_mapping(xen_dummy_user_pgd + KERNBASE,
1349 	    pmap_pa2pte(xen_dummy_user_pgd) | PG_V | pmap_pg_nx,
1350 	    UVMF_INVLPG);
1351 	/* Pin as L4 */
1352 	xpq_queue_pin_l4_table(xpmap_ptom_masked(xen_dummy_user_pgd));
1353 #endif
1354 
1355 	/*
1356 	 * Allocate space for the IDT, GDT and LDT.
1357 	 */
1358 	idt_vaddr = pmap_bootstrap_valloc(1);
1359 	idt_paddr = pmap_bootstrap_palloc(1);
1360 
1361 	gdt_vaddr = pmap_bootstrap_valloc(1);
1362 	gdt_paddr = pmap_bootstrap_palloc(1);
1363 
1364 	ldt_vaddr = pmap_bootstrap_valloc(1);
1365 	ldt_paddr = pmap_bootstrap_palloc(1);
1366 
1367 #if !defined(__x86_64__) && !defined(XEN)
1368 	/* pentium f00f bug stuff */
1369 	pentium_idt_vaddr = pmap_bootstrap_valloc(1);
1370 #endif
1371 
1372 	/*
1373 	 * Now we reserve some VM for mapping pages when doing a crash dump.
1374 	 */
1375 	virtual_avail = reserve_dumppages(virtual_avail);
1376 
1377 	/*
1378 	 * Init the static-global locks and global lists.
1379 	 *
1380 	 * => pventry::pvh_lock (initialized elsewhere) must also be
1381 	 *      a spin lock, again at IPL_VM to prevent deadlock, and
1382 	 *	again is never taken from interrupt context.
1383 	 */
1384 	mutex_init(&pmaps_lock, MUTEX_DEFAULT, IPL_NONE);
1385 	LIST_INIT(&pmaps);
1386 
1387 	/*
1388 	 * Ensure the TLB is sync'd with reality by flushing it...
1389 	 */
1390 	tlbflushg();
1391 
1392 	/*
1393 	 * Calculate pmap_maxkvaddr from nkptp[].
1394 	 */
1395 	kva = VM_MIN_KERNEL_ADDRESS;
1396 	for (i = PTP_LEVELS - 1; i >= 1; i--) {
1397 		kva += nkptp[i] * nbpd[i];
1398 	}
1399 	pmap_maxkvaddr = kva;
1400 }
1401 
1402 #ifndef XEN
1403 static void
1404 pmap_init_lapic(void)
1405 {
1406 	/*
1407 	 * On CPUs that have no LAPIC, local_apic_va is never kentered. But our
1408 	 * x86 implementation relies a lot on this address to be valid; so just
1409 	 * allocate a fake physical page that will be kentered into
1410 	 * local_apic_va by machdep.
1411 	 *
1412 	 * If the LAPIC is present, the va will be remapped somewhere else
1413 	 * later in lapic_map.
1414 	 */
1415 	local_apic_va = pmap_bootstrap_valloc(1);
1416 	local_apic_pa = pmap_bootstrap_palloc(1);
1417 }
1418 #endif
1419 
1420 #ifdef __HAVE_DIRECT_MAP
1421 /*
1422  * Create the amd64 direct map. Called only once at boot time.
1423  */
1424 static void
1425 pmap_init_directmap(struct pmap *kpm)
1426 {
1427 	extern phys_ram_seg_t mem_clusters[];
1428 	extern int mem_cluster_cnt;
1429 
1430 	paddr_t lastpa, L2page_pa, L3page_pa, pdp;
1431 	vaddr_t tmpva;
1432 	pt_entry_t *pte;
1433 	pd_entry_t *pde;
1434 	phys_ram_seg_t *mc;
1435 	size_t nL4e, nL3e, nL2e;
1436 	size_t pn, npd;
1437 	int i, n;
1438 
1439 	const pd_entry_t pteflags = PG_V | PG_KW | pmap_pg_nx;
1440 
1441 	CTASSERT(NL4_SLOT_DIRECT * NBPD_L4 == MAXPHYSMEM);
1442 
1443 	/* Get the last physical address available */
1444 	lastpa = 0;
1445 	for (i = 0; i < mem_cluster_cnt; i++) {
1446 		mc = &mem_clusters[i];
1447 		lastpa = MAX(lastpa, mc->start + mc->size);
1448 	}
1449 
1450 	/*
1451 	 * x86_add_cluster should have truncated the memory to MAXPHYSMEM.
1452 	 */
1453 	if (lastpa > MAXPHYSMEM) {
1454 		panic("pmap_init_directmap: lastpa incorrect");
1455 	}
1456 
1457 	/* We will use this temporary va. */
1458 	tmpva = bootspace.spareva;
1459 	pte = PTE_BASE + pl1_i(tmpva);
1460 
1461 	/* Number of L4 entries. */
1462 	nL4e = (lastpa + NBPD_L4 - 1) >> L4_SHIFT;
1463 	KASSERT(nL4e <= NL4_SLOT_DIRECT);
1464 
1465 	/* Allocate L3, and zero it out. */
1466 	L3page_pa = pmap_bootstrap_palloc(nL4e);
1467 	for (i = 0; i < nL4e; i++) {
1468 		pdp = L3page_pa + i * PAGE_SIZE;
1469 		*pte = (pdp & PG_FRAME) | pteflags;
1470 		pmap_update_pg(tmpva);
1471 		memset((void *)tmpva, 0, PAGE_SIZE);
1472 	}
1473 
1474 	/* Number of L3 entries. */
1475 	nL3e = (lastpa + NBPD_L3 - 1) >> L3_SHIFT;
1476 
1477 	/*
1478 	 * Map the direct map RW. Use super pages (1GB) or large pages (2MB) if
1479 	 * they are supported. Note: PG_G is not allowed on non-leaf PTPs.
1480 	 */
1481 	if (cpu_feature[2] & CPUID_P1GB) {
1482 		/* Super pages are supported. Just create L3. */
1483 		for (i = 0; i < nL3e; i++) {
1484 			pdp = (paddr_t)&(((pd_entry_t *)L3page_pa)[i]);
1485 			*pte = (pdp & PG_FRAME) | pteflags;
1486 			pmap_update_pg(tmpva);
1487 
1488 			pde = (pd_entry_t *)(tmpva + (pdp & ~PG_FRAME));
1489 			*pde = ((paddr_t)i << L3_SHIFT) | pteflags | PG_U |
1490 			    PG_PS | PG_G;
1491 		}
1492 	} else {
1493 		/* Allocate L2. */
1494 		L2page_pa = pmap_bootstrap_palloc(nL3e);
1495 
1496 		/* Number of L2 entries. */
1497 		nL2e = (lastpa + NBPD_L2 - 1) >> L2_SHIFT;
1498 
1499 		KASSERT(pmap_largepages != 0);
1500 
1501 		/* Large pages are supported. Just create L2. */
1502 		for (i = 0; i < nL3e; i++) {
1503 			pdp = L2page_pa + i * PAGE_SIZE;
1504 			*pte = (pdp & PG_FRAME) | pteflags;
1505 			pmap_update_pg(tmpva);
1506 
1507 			memset((void *)tmpva, 0, PAGE_SIZE);
1508 
1509 			pde = (pd_entry_t *)tmpva;
1510 			npd = ((i == nL3e - 1) && (nL2e % NPDPG != 0)) ?
1511 			    (nL2e % NPDPG) : NPDPG;
1512 			for (n = 0; n < npd; n++) {
1513 				pn = (i * NPDPG) + n;
1514 				pde[n] = ((paddr_t)pn << L2_SHIFT) | pteflags |
1515 					PG_U | PG_PS | PG_G;
1516 			}
1517 		}
1518 
1519 		/* Fill in the L3 entries, linked to L2. */
1520 		for (i = 0; i < nL4e; i++) {
1521 			pdp = L3page_pa + i * PAGE_SIZE;
1522 			*pte = (pdp & PG_FRAME) | pteflags;
1523 			pmap_update_pg(tmpva);
1524 
1525 			pde = (pd_entry_t *)tmpva;
1526 			npd = ((i == nL4e - 1) && (nL3e % NPDPG != 0)) ?
1527 			    (nL3e % NPDPG) : NPDPG;
1528 			for (n = 0; n < npd; n++) {
1529 				pn = (i * NPDPG) + n;
1530 				pde[n] = (L2page_pa + (pn << PAGE_SHIFT)) |
1531 				    pteflags | PG_U;
1532 			}
1533 		}
1534 	}
1535 
1536 	/* Fill in the L4 entries, linked to L3. */
1537 	for (i = 0; i < nL4e; i++) {
1538 		kpm->pm_pdir[PDIR_SLOT_DIRECT + i] =
1539 		    (L3page_pa + (i << PAGE_SHIFT)) | pteflags | PG_U;
1540 	}
1541 
1542 	*pte = 0;
1543 	pmap_update_pg(tmpva);
1544 
1545 	tlbflush();
1546 }
1547 #endif /* __HAVE_DIRECT_MAP */
1548 
1549 #ifndef XEN
1550 /*
1551  * Remap all of the virtual pages created so far with the PG_G bit.
1552  */
1553 static void
1554 pmap_remap_global(void)
1555 {
1556 	vaddr_t kva, kva_end;
1557 	unsigned long p1i;
1558 	size_t i;
1559 
1560 	/* head */
1561 	kva = bootspace.head.va;
1562 	kva_end = kva + bootspace.head.sz;
1563 	for ( ; kva < kva_end; kva += PAGE_SIZE) {
1564 		p1i = pl1_i(kva);
1565 		if (pmap_valid_entry(PTE_BASE[p1i]))
1566 			PTE_BASE[p1i] |= PG_G;
1567 	}
1568 
1569 	/* kernel segments */
1570 	for (i = 0; i < BTSPACE_NSEGS; i++) {
1571 		if (bootspace.segs[i].type == BTSEG_NONE) {
1572 			continue;
1573 		}
1574 		kva = bootspace.segs[i].va;
1575 		kva_end = kva + bootspace.segs[i].sz;
1576 		for ( ; kva < kva_end; kva += PAGE_SIZE) {
1577 			p1i = pl1_i(kva);
1578 			if (pmap_valid_entry(PTE_BASE[p1i]))
1579 				PTE_BASE[p1i] |= PG_G;
1580 		}
1581 	}
1582 
1583 	/* boot space */
1584 	kva = bootspace.boot.va;
1585 	kva_end = kva + bootspace.boot.sz;
1586 	for ( ; kva < kva_end; kva += PAGE_SIZE) {
1587 		p1i = pl1_i(kva);
1588 		if (pmap_valid_entry(PTE_BASE[p1i]))
1589 			PTE_BASE[p1i] |= PG_G;
1590 	}
1591 }
1592 
1593 /*
1594  * Remap several kernel segments with large pages. We cover as many pages as we
1595  * can. Called only once at boot time, if the CPU supports large pages.
1596  */
1597 static void
1598 pmap_remap_largepages(void)
1599 {
1600 	pd_entry_t *pde;
1601 	vaddr_t kva, kva_end;
1602 	paddr_t pa;
1603 	size_t i;
1604 
1605 	/* Remap the kernel text using large pages. */
1606 	for (i = 0; i < BTSPACE_NSEGS; i++) {
1607 		if (bootspace.segs[i].type != BTSEG_TEXT) {
1608 			continue;
1609 		}
1610 		kva = roundup(bootspace.segs[i].va, NBPD_L2);
1611 		kva_end = rounddown(bootspace.segs[i].va +
1612 			bootspace.segs[i].sz, NBPD_L1);
1613 		pa = roundup(bootspace.segs[i].pa, NBPD_L2);
1614 		for (/* */; kva + NBPD_L2 <= kva_end; kva += NBPD_L2, pa += NBPD_L2) {
1615 			pde = &L2_BASE[pl2_i(kva)];
1616 			*pde = pa | pmap_pg_g | PG_PS | PG_KR | PG_V;
1617 			tlbflushg();
1618 		}
1619 	}
1620 
1621 	/* Remap the kernel rodata using large pages. */
1622 	for (i = 0; i < BTSPACE_NSEGS; i++) {
1623 		if (bootspace.segs[i].type != BTSEG_RODATA) {
1624 			continue;
1625 		}
1626 		kva = roundup(bootspace.segs[i].va, NBPD_L2);
1627 		kva_end = rounddown(bootspace.segs[i].va +
1628 			bootspace.segs[i].sz, NBPD_L1);
1629 		pa = roundup(bootspace.segs[i].pa, NBPD_L2);
1630 		for (/* */; kva + NBPD_L2 <= kva_end; kva += NBPD_L2, pa += NBPD_L2) {
1631 			pde = &L2_BASE[pl2_i(kva)];
1632 			*pde = pa | pmap_pg_g | PG_PS | pmap_pg_nx | PG_KR | PG_V;
1633 			tlbflushg();
1634 		}
1635 	}
1636 
1637 	/* Remap the kernel data+bss using large pages. */
1638 	for (i = 0; i < BTSPACE_NSEGS; i++) {
1639 		if (bootspace.segs[i].type != BTSEG_DATA) {
1640 			continue;
1641 		}
1642 		kva = roundup(bootspace.segs[i].va, NBPD_L2);
1643 		kva_end = rounddown(bootspace.segs[i].va +
1644 			bootspace.segs[i].sz, NBPD_L1);
1645 		pa = roundup(bootspace.segs[i].pa, NBPD_L2);
1646 		for (/* */; kva + NBPD_L2 <= kva_end; kva += NBPD_L2, pa += NBPD_L2) {
1647 			pde = &L2_BASE[pl2_i(kva)];
1648 			*pde = pa | pmap_pg_g | PG_PS | pmap_pg_nx | PG_KW | PG_V;
1649 			tlbflushg();
1650 		}
1651 	}
1652 }
1653 #endif /* !XEN */
1654 
1655 /*
1656  * pmap_init: called from uvm_init, our job is to get the pmap
1657  * system ready to manage mappings...
1658  */
1659 
1660 void
1661 pmap_init(void)
1662 {
1663 	int i, flags;
1664 
1665 	for (i = 0; i < PV_HASH_SIZE; i++) {
1666 		SLIST_INIT(&pv_hash_heads[i].hh_list);
1667 	}
1668 	for (i = 0; i < PV_HASH_LOCK_CNT; i++) {
1669 		mutex_init(&pv_hash_locks[i].lock, MUTEX_NODEBUG, IPL_VM);
1670 	}
1671 
1672 	/*
1673 	 * initialize caches.
1674 	 */
1675 
1676 	pool_cache_bootstrap(&pmap_cache, sizeof(struct pmap), 0, 0, 0,
1677 	    "pmappl", NULL, IPL_NONE, NULL, NULL, NULL);
1678 
1679 #ifdef XEN
1680 	/*
1681 	 * pool_cache(9) should not touch cached objects, since they
1682 	 * are pinned on xen and R/O for the domU
1683 	 */
1684 	flags = PR_NOTOUCH;
1685 #else /* XEN */
1686 	flags = 0;
1687 #endif /* XEN */
1688 #ifdef PAE
1689 	pool_cache_bootstrap(&pmap_pdp_cache, PAGE_SIZE * PDP_SIZE, 0, 0, flags,
1690 	    "pdppl", &pmap_pdp_allocator, IPL_NONE,
1691 	    pmap_pdp_ctor, pmap_pdp_dtor, NULL);
1692 #else /* PAE */
1693 	pool_cache_bootstrap(&pmap_pdp_cache, PAGE_SIZE, 0, 0, flags,
1694 	    "pdppl", NULL, IPL_NONE, pmap_pdp_ctor, pmap_pdp_dtor, NULL);
1695 #endif /* PAE */
1696 	pool_cache_bootstrap(&pmap_pv_cache, sizeof(struct pv_entry), 0, 0,
1697 	    PR_LARGECACHE, "pvpl", &pool_allocator_kmem, IPL_NONE, NULL,
1698 	    NULL, NULL);
1699 
1700 	pmap_tlb_init();
1701 
1702 	/* XXX: Since cpu_hatch() is only for secondary CPUs. */
1703 	pmap_tlb_cpu_init(curcpu());
1704 
1705 	evcnt_attach_dynamic(&pmap_iobmp_evcnt, EVCNT_TYPE_MISC,
1706 	    NULL, "x86", "io bitmap copy");
1707 	evcnt_attach_dynamic(&pmap_ldt_evcnt, EVCNT_TYPE_MISC,
1708 	    NULL, "x86", "ldt sync");
1709 
1710 	/*
1711 	 * done: pmap module is up (and ready for business)
1712 	 */
1713 
1714 	pmap_initialized = true;
1715 }
1716 
1717 /*
1718  * pmap_cpu_init_late: perform late per-CPU initialization.
1719  */
1720 
1721 #ifndef XEN
1722 void
1723 pmap_cpu_init_late(struct cpu_info *ci)
1724 {
1725 	/*
1726 	 * The BP has already its own PD page allocated during early
1727 	 * MD startup.
1728 	 */
1729 	if (ci == &cpu_info_primary)
1730 		return;
1731 
1732 #ifdef PAE
1733 	cpu_alloc_l3_page(ci);
1734 #endif
1735 }
1736 #endif
1737 
1738 #ifndef __HAVE_DIRECT_MAP
1739 CTASSERT(CACHE_LINE_SIZE > sizeof(pt_entry_t));
1740 CTASSERT(CACHE_LINE_SIZE % sizeof(pt_entry_t) == 0);
1741 
1742 static void
1743 pmap_vpage_cpualloc(struct cpu_info *ci)
1744 {
1745 	bool primary = (ci == &cpu_info_primary);
1746 	size_t i, npages;
1747 	vaddr_t vabase;
1748 	vsize_t vrange;
1749 
1750 	npages = (CACHE_LINE_SIZE / sizeof(pt_entry_t));
1751 	KASSERT(npages >= VPAGE_MAX);
1752 	vrange = npages * PAGE_SIZE;
1753 
1754 	if (primary) {
1755 		while ((vabase = pmap_bootstrap_valloc(1)) % vrange != 0) {
1756 			/* Waste some pages to align properly */
1757 		}
1758 		/* The base is aligned, allocate the rest (contiguous) */
1759 		pmap_bootstrap_valloc(npages - 1);
1760 	} else {
1761 		vabase = uvm_km_alloc(kernel_map, vrange, vrange,
1762 		    UVM_KMF_VAONLY);
1763 		if (vabase == 0) {
1764 			panic("%s: failed to allocate tmp VA for CPU %d\n",
1765 			    __func__, cpu_index(ci));
1766 		}
1767 	}
1768 
1769 	KASSERT((vaddr_t)&PTE_BASE[pl1_i(vabase)] % CACHE_LINE_SIZE == 0);
1770 
1771 	for (i = 0; i < VPAGE_MAX; i++) {
1772 		ci->vpage[i] = vabase + i * PAGE_SIZE;
1773 		ci->vpage_pte[i] = PTE_BASE + pl1_i(ci->vpage[i]);
1774 	}
1775 }
1776 
1777 void
1778 pmap_vpage_cpu_init(struct cpu_info *ci)
1779 {
1780 	if (ci == &cpu_info_primary) {
1781 		/* cpu0 already taken care of in pmap_bootstrap */
1782 		return;
1783 	}
1784 
1785 	pmap_vpage_cpualloc(ci);
1786 }
1787 #endif
1788 
1789 /*
1790  * p v _ e n t r y   f u n c t i o n s
1791  */
1792 
1793 static bool
1794 pmap_pp_needs_pve(struct pmap_page *pp)
1795 {
1796 
1797 	/*
1798 	 * Adding a pv entry for this page only needs to allocate a pv_entry
1799 	 * structure if the page already has at least one pv entry,
1800 	 * since the first pv entry is stored in the pmap_page.
1801 	 */
1802 
1803 	return pp && ((pp->pp_flags & PP_EMBEDDED) != 0 ||
1804 	    !LIST_EMPTY(&pp->pp_head.pvh_list));
1805 }
1806 
1807 /*
1808  * pmap_free_pvs: free a list of pv_entrys
1809  */
1810 
1811 static void
1812 pmap_free_pvs(struct pv_entry *pve)
1813 {
1814 	struct pv_entry *next;
1815 
1816 	for ( /* null */ ; pve != NULL ; pve = next) {
1817 		next = pve->pve_next;
1818 		pool_cache_put(&pmap_pv_cache, pve);
1819 	}
1820 }
1821 
1822 /*
1823  * main pv_entry manipulation functions:
1824  *   pmap_enter_pv: enter a mapping onto a pv_head list
1825  *   pmap_remove_pv: remove a mapping from a pv_head list
1826  *
1827  * NOTE: Both pmap_enter_pv and pmap_remove_pv expect the caller to lock
1828  *       the pvh before calling
1829  */
1830 
1831 /*
1832  * insert_pv: a helper of pmap_enter_pv
1833  */
1834 
1835 static void
1836 insert_pv(struct pmap_page *pp, struct pv_entry *pve)
1837 {
1838 	struct pv_hash_head *hh;
1839 	kmutex_t *lock;
1840 	u_int hash;
1841 
1842 	hash = pvhash_hash(pve->pve_pte.pte_ptp, pve->pve_pte.pte_va);
1843 	lock = pvhash_lock(hash);
1844 	hh = pvhash_head(hash);
1845 	mutex_spin_enter(lock);
1846 	SLIST_INSERT_HEAD(&hh->hh_list, pve, pve_hash);
1847 	mutex_spin_exit(lock);
1848 
1849 	LIST_INSERT_HEAD(&pp->pp_head.pvh_list, pve, pve_list);
1850 }
1851 
1852 /*
1853  * pmap_enter_pv: enter a mapping onto a pv_head lst
1854  *
1855  * => caller should adjust ptp's wire_count before calling
1856  * => caller has preallocated pve and *sparepve for us
1857  */
1858 
1859 static struct pv_entry *
1860 pmap_enter_pv(struct pmap_page *pp, struct pv_entry *pve,
1861     struct pv_entry **sparepve, struct vm_page *ptp, vaddr_t va)
1862 {
1863 
1864 	KASSERT(ptp == NULL || ptp->wire_count >= 2);
1865 	KASSERT(ptp == NULL || ptp->uobject != NULL);
1866 	KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset);
1867 
1868 	if ((pp->pp_flags & PP_EMBEDDED) == 0) {
1869 		if (LIST_EMPTY(&pp->pp_head.pvh_list)) {
1870 			pp->pp_flags |= PP_EMBEDDED;
1871 			pp->pp_pte.pte_ptp = ptp;
1872 			pp->pp_pte.pte_va = va;
1873 
1874 			return pve;
1875 		}
1876 	} else {
1877 		struct pv_entry *pve2;
1878 
1879 		pve2 = *sparepve;
1880 		*sparepve = NULL;
1881 
1882 		pve2->pve_pte = pp->pp_pte;
1883 		pp->pp_flags &= ~PP_EMBEDDED;
1884 		LIST_INIT(&pp->pp_head.pvh_list);
1885 		insert_pv(pp, pve2);
1886 	}
1887 
1888 	pve->pve_pte.pte_ptp = ptp;
1889 	pve->pve_pte.pte_va = va;
1890 	insert_pv(pp, pve);
1891 
1892 	return NULL;
1893 }
1894 
1895 /*
1896  * pmap_remove_pv: try to remove a mapping from a pv_list
1897  *
1898  * => caller should adjust ptp's wire_count and free PTP if needed
1899  * => we return the removed pve
1900  */
1901 
1902 static struct pv_entry *
1903 pmap_remove_pv(struct pmap_page *pp, struct vm_page *ptp, vaddr_t va)
1904 {
1905 	struct pv_hash_head *hh;
1906 	struct pv_entry *pve;
1907 	kmutex_t *lock;
1908 	u_int hash;
1909 
1910 	KASSERT(ptp == NULL || ptp->uobject != NULL);
1911 	KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset);
1912 
1913 	if ((pp->pp_flags & PP_EMBEDDED) != 0) {
1914 		KASSERT(pp->pp_pte.pte_ptp == ptp);
1915 		KASSERT(pp->pp_pte.pte_va == va);
1916 
1917 		pp->pp_flags &= ~PP_EMBEDDED;
1918 		LIST_INIT(&pp->pp_head.pvh_list);
1919 
1920 		return NULL;
1921 	}
1922 
1923 	hash = pvhash_hash(ptp, va);
1924 	lock = pvhash_lock(hash);
1925 	hh = pvhash_head(hash);
1926 	mutex_spin_enter(lock);
1927 	pve = pvhash_remove(hh, ptp, va);
1928 	mutex_spin_exit(lock);
1929 
1930 	LIST_REMOVE(pve, pve_list);
1931 
1932 	return pve;
1933 }
1934 
1935 /*
1936  * p t p   f u n c t i o n s
1937  */
1938 
1939 static inline struct vm_page *
1940 pmap_find_ptp(struct pmap *pmap, vaddr_t va, paddr_t pa, int level)
1941 {
1942 	int lidx = level - 1;
1943 	struct vm_page *pg;
1944 
1945 	KASSERT(mutex_owned(pmap->pm_lock));
1946 
1947 	if (pa != (paddr_t)-1 && pmap->pm_ptphint[lidx] &&
1948 	    pa == VM_PAGE_TO_PHYS(pmap->pm_ptphint[lidx])) {
1949 		return (pmap->pm_ptphint[lidx]);
1950 	}
1951 	PMAP_SUBOBJ_LOCK(pmap, lidx);
1952 	pg = uvm_pagelookup(&pmap->pm_obj[lidx], ptp_va2o(va, level));
1953 	PMAP_SUBOBJ_UNLOCK(pmap, lidx);
1954 
1955 	KASSERT(pg == NULL || pg->wire_count >= 1);
1956 	return pg;
1957 }
1958 
1959 static inline void
1960 pmap_freepage(struct pmap *pmap, struct vm_page *ptp, int level)
1961 {
1962 	lwp_t *l;
1963 	int lidx;
1964 	struct uvm_object *obj;
1965 
1966 	KASSERT(ptp->wire_count == 1);
1967 
1968 	lidx = level - 1;
1969 
1970 	obj = &pmap->pm_obj[lidx];
1971 	pmap_stats_update(pmap, -1, 0);
1972 	if (lidx != 0)
1973 		mutex_enter(obj->vmobjlock);
1974 	if (pmap->pm_ptphint[lidx] == ptp)
1975 		pmap->pm_ptphint[lidx] = TAILQ_FIRST(&obj->memq);
1976 	ptp->wire_count = 0;
1977 	uvm_pagerealloc(ptp, NULL, 0);
1978 	l = curlwp;
1979 	KASSERT((l->l_pflag & LP_INTR) == 0);
1980 	VM_PAGE_TO_PP(ptp)->pp_link = l->l_md.md_gc_ptp;
1981 	l->l_md.md_gc_ptp = ptp;
1982 	if (lidx != 0)
1983 		mutex_exit(obj->vmobjlock);
1984 }
1985 
1986 static void
1987 pmap_free_ptp(struct pmap *pmap, struct vm_page *ptp, vaddr_t va,
1988 	      pt_entry_t *ptes, pd_entry_t * const *pdes)
1989 {
1990 	unsigned long index;
1991 	int level;
1992 	vaddr_t invaladdr;
1993 	pd_entry_t opde;
1994 
1995 	KASSERT(pmap != pmap_kernel());
1996 	KASSERT(mutex_owned(pmap->pm_lock));
1997 	KASSERT(kpreempt_disabled());
1998 
1999 	level = 1;
2000 	do {
2001 		index = pl_i(va, level + 1);
2002 		opde = pmap_pte_testset(&pdes[level - 1][index], 0);
2003 #if defined(XEN)
2004 #  if defined(__x86_64__)
2005 		/*
2006 		 * If ptp is a L3 currently mapped in kernel space,
2007 		 * on any cpu, clear it before freeing
2008 		 */
2009 		if (level == PTP_LEVELS - 1) {
2010 			/*
2011 			 * Update the per-cpu PD on all cpus the current
2012 			 * pmap is active on
2013 			 */
2014 			xen_kpm_sync(pmap, index);
2015 		}
2016 #  endif /*__x86_64__ */
2017 		invaladdr = level == 1 ? (vaddr_t)ptes :
2018 		    (vaddr_t)pdes[level - 2];
2019 		pmap_tlb_shootdown(pmap, invaladdr + index * PAGE_SIZE,
2020 		    opde, TLBSHOOT_FREE_PTP1);
2021 		pmap_tlb_shootnow();
2022 #else	/* XEN */
2023 		invaladdr = level == 1 ? (vaddr_t)ptes :
2024 		    (vaddr_t)pdes[level - 2];
2025 		pmap_tlb_shootdown(pmap, invaladdr + index * PAGE_SIZE,
2026 		    opde, TLBSHOOT_FREE_PTP1);
2027 #endif	/* XEN */
2028 		pmap_freepage(pmap, ptp, level);
2029 		if (level < PTP_LEVELS - 1) {
2030 			ptp = pmap_find_ptp(pmap, va, (paddr_t)-1, level + 1);
2031 			ptp->wire_count--;
2032 			if (ptp->wire_count > 1)
2033 				break;
2034 		}
2035 	} while (++level < PTP_LEVELS);
2036 	pmap_pte_flush();
2037 }
2038 
2039 /*
2040  * pmap_get_ptp: get a PTP (if there isn't one, allocate a new one)
2041  *
2042  * => pmap should NOT be pmap_kernel()
2043  * => pmap should be locked
2044  * => preemption should be disabled
2045  */
2046 
2047 static struct vm_page *
2048 pmap_get_ptp(struct pmap *pmap, vaddr_t va, pd_entry_t * const *pdes, int flags)
2049 {
2050 	struct vm_page *ptp;
2051 	struct {
2052 		struct vm_page *pg;
2053 		bool new;
2054 	} pt[PTP_LEVELS + 1];
2055 	int i, aflags;
2056 	unsigned long index;
2057 	pd_entry_t *pva;
2058 	paddr_t pa;
2059 	struct uvm_object *obj;
2060 	voff_t off;
2061 
2062 	KASSERT(pmap != pmap_kernel());
2063 	KASSERT(mutex_owned(pmap->pm_lock));
2064 	KASSERT(kpreempt_disabled());
2065 
2066 	/*
2067 	 * Loop through all page table levels allocating a page
2068 	 * for any level where we don't already have one.
2069 	 */
2070 	memset(pt, 0, sizeof(pt));
2071 	aflags = ((flags & PMAP_CANFAIL) ? 0 : UVM_PGA_USERESERVE) |
2072 		UVM_PGA_ZERO;
2073 	for (i = PTP_LEVELS; i > 1; i--) {
2074 		obj = &pmap->pm_obj[i - 2];
2075 		off = ptp_va2o(va, i - 1);
2076 
2077 		PMAP_SUBOBJ_LOCK(pmap, i - 2);
2078 		pt[i].pg = uvm_pagelookup(obj, off);
2079 		if (pt[i].pg == NULL) {
2080 			pt[i].pg = uvm_pagealloc(obj, off, NULL, aflags);
2081 			pt[i].new = true;
2082 		}
2083 		PMAP_SUBOBJ_UNLOCK(pmap, i - 2);
2084 
2085 		if (pt[i].pg == NULL)
2086 			goto fail;
2087 	}
2088 
2089 	/*
2090 	 * Now that we have all the pages looked up or allocated,
2091 	 * loop through again installing any new ones into the tree.
2092 	 */
2093 	for (i = PTP_LEVELS; i > 1; i--) {
2094 		index = pl_i(va, i);
2095 		pva = pdes[i - 2];
2096 
2097 		if (pmap_valid_entry(pva[index])) {
2098 			KASSERT(!pt[i].new);
2099 			continue;
2100 		}
2101 
2102 		ptp = pt[i].pg;
2103 		ptp->flags &= ~PG_BUSY; /* never busy */
2104 		ptp->wire_count = 1;
2105 		pmap->pm_ptphint[i - 2] = ptp;
2106 		pa = VM_PAGE_TO_PHYS(ptp);
2107 		pmap_pte_set(&pva[index], (pd_entry_t)
2108 		    (pmap_pa2pte(pa) | PG_u | PG_RW | PG_V));
2109 #if defined(XEN) && defined(__x86_64__)
2110 		if (i == PTP_LEVELS) {
2111 
2112 			/*
2113 			 * Update the per-cpu PD on all cpus the current
2114 			 * pmap is active on
2115 			 */
2116 			xen_kpm_sync(pmap, index);
2117 		}
2118 #endif
2119 		pmap_pte_flush();
2120 		pmap_stats_update(pmap, 1, 0);
2121 
2122 		/*
2123 		 * If we're not in the top level, increase the
2124 		 * wire count of the parent page.
2125 		 */
2126 		if (i < PTP_LEVELS) {
2127 			pt[i + 1].pg->wire_count++;
2128 		}
2129 	}
2130 	ptp = pt[2].pg;
2131 	KASSERT(ptp != NULL);
2132 	pmap->pm_ptphint[0] = ptp;
2133 	return ptp;
2134 
2135 	/*
2136 	 * Allocation of a ptp failed, free any others that we just allocated.
2137 	 */
2138 fail:
2139 	for (i = PTP_LEVELS; i > 1; i--) {
2140 		if (pt[i].pg == NULL) {
2141 			break;
2142 		}
2143 		if (!pt[i].new) {
2144 			continue;
2145 		}
2146 		obj = &pmap->pm_obj[i - 2];
2147 		PMAP_SUBOBJ_LOCK(pmap, i - 2);
2148 		uvm_pagefree(pt[i].pg);
2149 		PMAP_SUBOBJ_UNLOCK(pmap, i - 2);
2150 	}
2151 	return NULL;
2152 }
2153 
2154 /*
2155  * p m a p   l i f e c y c l e   f u n c t i o n s
2156  */
2157 
2158 /*
2159  * pmap_pdp_ctor: constructor for the PDP cache.
2160  */
2161 static int
2162 pmap_pdp_ctor(void *arg, void *v, int flags)
2163 {
2164 	pd_entry_t *pdir = v;
2165 	paddr_t pdirpa = 0;
2166 	vaddr_t object;
2167 	int i;
2168 
2169 #if !defined(XEN) || !defined(__x86_64__)
2170 	int npde;
2171 #endif
2172 #ifdef XEN
2173 	int s;
2174 #endif
2175 
2176 	/*
2177 	 * NOTE: The `pmaps_lock' is held when the PDP is allocated.
2178 	 */
2179 
2180 #if defined(XEN) && defined(__x86_64__)
2181 	/* Fetch the physical address of the page directory */
2182 	(void)pmap_extract(pmap_kernel(), (vaddr_t)pdir, &pdirpa);
2183 
2184 	/* Zero the area */
2185 	memset(pdir, 0, PAGE_SIZE); /* Xen wants a clean page */
2186 
2187 	/*
2188 	 * This pdir will NEVER be active in kernel mode, so mark
2189 	 * recursive entry invalid.
2190 	 */
2191 	pdir[PDIR_SLOT_PTE] = pmap_pa2pte(pdirpa);
2192 
2193 	/*
2194 	 * PDP constructed this way won't be for the kernel, hence we
2195 	 * don't put kernel mappings on Xen.
2196 	 *
2197 	 * But we need to make pmap_create() happy, so put a dummy
2198 	 * (without PG_V) value at the right place.
2199 	 */
2200 	pdir[PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1] - 1] =
2201 	     (pd_entry_t)-1 & PG_FRAME;
2202 #else /* XEN && __x86_64__*/
2203 	/* Zero the area */
2204 	memset(pdir, 0, PDIR_SLOT_PTE * sizeof(pd_entry_t));
2205 
2206 	object = (vaddr_t)v;
2207 	for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) {
2208 		/* Fetch the physical address of the page directory */
2209 		(void)pmap_extract(pmap_kernel(), object, &pdirpa);
2210 
2211 		/* Put in recursive PDE to map the PTEs */
2212 		pdir[PDIR_SLOT_PTE + i] = pmap_pa2pte(pdirpa) | PG_V |
2213 		    pmap_pg_nx;
2214 #ifndef XEN
2215 		pdir[PDIR_SLOT_PTE + i] |= PG_KW;
2216 #endif
2217 	}
2218 
2219 	/* Copy the kernel's top level PDE */
2220 	npde = nkptp[PTP_LEVELS - 1];
2221 
2222 	memcpy(&pdir[PDIR_SLOT_KERN], &PDP_BASE[PDIR_SLOT_KERN],
2223 	    npde * sizeof(pd_entry_t));
2224 
2225 	/* Zero the rest */
2226 	memset(&pdir[PDIR_SLOT_KERN + npde], 0, (PAGE_SIZE * PDP_SIZE) -
2227 	    (PDIR_SLOT_KERN + npde) * sizeof(pd_entry_t));
2228 
2229 	if (VM_MIN_KERNEL_ADDRESS != KERNBASE) {
2230 		int idx = pl_i(KERNBASE, PTP_LEVELS);
2231 		pdir[idx] = PDP_BASE[idx];
2232 	}
2233 
2234 #ifdef __HAVE_DIRECT_MAP
2235 	memcpy(&pdir[PDIR_SLOT_DIRECT], &PDP_BASE[PDIR_SLOT_DIRECT],
2236 	    NL4_SLOT_DIRECT * sizeof(pd_entry_t));
2237 #endif
2238 #endif /* XEN  && __x86_64__*/
2239 
2240 #ifdef XEN
2241 	s = splvm();
2242 	object = (vaddr_t)v;
2243 	pmap_protect(pmap_kernel(), object, object + (PAGE_SIZE * PDP_SIZE),
2244 	    VM_PROT_READ);
2245 	pmap_update(pmap_kernel());
2246 	for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) {
2247 		/*
2248 		 * pin as L2/L4 page, we have to do the page with the
2249 		 * PDIR_SLOT_PTE entries last
2250 		 */
2251 #ifdef PAE
2252 		if (i == l2tol3(PDIR_SLOT_PTE))
2253 			continue;
2254 #endif
2255 
2256 		(void) pmap_extract(pmap_kernel(), object, &pdirpa);
2257 #ifdef __x86_64__
2258 		xpq_queue_pin_l4_table(xpmap_ptom_masked(pdirpa));
2259 #else
2260 		xpq_queue_pin_l2_table(xpmap_ptom_masked(pdirpa));
2261 #endif
2262 	}
2263 #ifdef PAE
2264 	object = ((vaddr_t)pdir) + PAGE_SIZE  * l2tol3(PDIR_SLOT_PTE);
2265 	(void)pmap_extract(pmap_kernel(), object, &pdirpa);
2266 	xpq_queue_pin_l2_table(xpmap_ptom_masked(pdirpa));
2267 #endif
2268 	splx(s);
2269 #endif /* XEN */
2270 
2271 	return (0);
2272 }
2273 
2274 /*
2275  * pmap_pdp_dtor: destructor for the PDP cache.
2276  */
2277 
2278 static void
2279 pmap_pdp_dtor(void *arg, void *v)
2280 {
2281 #ifdef XEN
2282 	paddr_t pdirpa = 0;	/* XXX: GCC */
2283 	vaddr_t object = (vaddr_t)v;
2284 	int i;
2285 	int s = splvm();
2286 	pt_entry_t *pte;
2287 
2288 	for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) {
2289 		/* fetch the physical address of the page directory. */
2290 		(void) pmap_extract(pmap_kernel(), object, &pdirpa);
2291 		/* unpin page table */
2292 		xpq_queue_unpin_table(xpmap_ptom_masked(pdirpa));
2293 	}
2294 	object = (vaddr_t)v;
2295 	for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) {
2296 		/* Set page RW again */
2297 		pte = kvtopte(object);
2298 		pmap_pte_set(pte, *pte | PG_RW);
2299 		xen_bcast_invlpg((vaddr_t)object);
2300 	}
2301 	splx(s);
2302 #endif  /* XEN */
2303 }
2304 
2305 #ifdef PAE
2306 
2307 /* pmap_pdp_alloc: Allocate a page for the pdp memory pool. */
2308 
2309 static void *
2310 pmap_pdp_alloc(struct pool *pp, int flags)
2311 {
2312 	return (void *)uvm_km_alloc(kernel_map,
2313 	    PAGE_SIZE * PDP_SIZE, PAGE_SIZE * PDP_SIZE,
2314 	    ((flags & PR_WAITOK) ? 0 : UVM_KMF_NOWAIT | UVM_KMF_TRYLOCK)
2315 	    | UVM_KMF_WIRED);
2316 }
2317 
2318 /*
2319  * pmap_pdp_free: free a PDP
2320  */
2321 
2322 static void
2323 pmap_pdp_free(struct pool *pp, void *v)
2324 {
2325 	uvm_km_free(kernel_map, (vaddr_t)v, PAGE_SIZE * PDP_SIZE,
2326 	    UVM_KMF_WIRED);
2327 }
2328 #endif /* PAE */
2329 
2330 /*
2331  * pmap_create: create a pmap object.
2332  */
2333 struct pmap *
2334 pmap_create(void)
2335 {
2336 	struct pmap *pmap;
2337 	int i;
2338 
2339 	pmap = pool_cache_get(&pmap_cache, PR_WAITOK);
2340 
2341 	/* init uvm_object */
2342 	for (i = 0; i < PTP_LEVELS - 1; i++) {
2343 		mutex_init(&pmap->pm_obj_lock[i], MUTEX_DEFAULT, IPL_NONE);
2344 		uvm_obj_init(&pmap->pm_obj[i], NULL, false, 1);
2345 		uvm_obj_setlock(&pmap->pm_obj[i], &pmap->pm_obj_lock[i]);
2346 		pmap->pm_ptphint[i] = NULL;
2347 	}
2348 	pmap->pm_stats.wired_count = 0;
2349 	/* count the PDP allocd below */
2350 	pmap->pm_stats.resident_count = PDP_SIZE;
2351 #if !defined(__x86_64__)
2352 	pmap->pm_hiexec = 0;
2353 #endif /* !defined(__x86_64__) */
2354 	pmap->pm_flags = 0;
2355 	pmap->pm_gc_ptp = NULL;
2356 
2357 	kcpuset_create(&pmap->pm_cpus, true);
2358 	kcpuset_create(&pmap->pm_kernel_cpus, true);
2359 #ifdef XEN
2360 	kcpuset_create(&pmap->pm_xen_ptp_cpus, true);
2361 #endif
2362 	/* init the LDT */
2363 	pmap->pm_ldt = NULL;
2364 	pmap->pm_ldt_len = 0;
2365 	pmap->pm_ldt_sel = GSYSSEL(GLDT_SEL, SEL_KPL);
2366 
2367 	/* allocate PDP */
2368  try_again:
2369 	pmap->pm_pdir = pool_cache_get(&pmap_pdp_cache, PR_WAITOK);
2370 
2371 	mutex_enter(&pmaps_lock);
2372 
2373 	if (pmap->pm_pdir[PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1] - 1] == 0) {
2374 		mutex_exit(&pmaps_lock);
2375 		pool_cache_destruct_object(&pmap_pdp_cache, pmap->pm_pdir);
2376 		goto try_again;
2377 	}
2378 
2379 	for (i = 0; i < PDP_SIZE; i++)
2380 		pmap->pm_pdirpa[i] =
2381 		    pmap_pte2pa(pmap->pm_pdir[PDIR_SLOT_PTE + i]);
2382 
2383 	LIST_INSERT_HEAD(&pmaps, pmap, pm_list);
2384 
2385 	mutex_exit(&pmaps_lock);
2386 
2387 	return (pmap);
2388 }
2389 
2390 /*
2391  * pmap_free_ptps: put a list of ptps back to the freelist.
2392  */
2393 
2394 void
2395 pmap_free_ptps(struct vm_page *empty_ptps)
2396 {
2397 	struct vm_page *ptp;
2398 	struct pmap_page *pp;
2399 
2400 	while ((ptp = empty_ptps) != NULL) {
2401 		pp = VM_PAGE_TO_PP(ptp);
2402 		empty_ptps = pp->pp_link;
2403 		LIST_INIT(&pp->pp_head.pvh_list);
2404 		uvm_pagefree(ptp);
2405 	}
2406 }
2407 
2408 /*
2409  * pmap_check_ptps: verify that none of the pmap's page table objects
2410  * have any pages allocated to them.
2411  */
2412 
2413 static inline void
2414 pmap_check_ptps(struct pmap *pmap)
2415 {
2416 	int i;
2417 
2418 	for (i = 0; i < PTP_LEVELS - 1; i++) {
2419 		KASSERT(pmap->pm_obj[i].uo_npages == 0);
2420 		KASSERT(TAILQ_EMPTY(&pmap->pm_obj[i].memq));
2421 	}
2422 }
2423 
2424 /*
2425  * pmap_destroy: drop reference count on pmap.   free pmap if
2426  *	reference count goes to zero.
2427  */
2428 
2429 void
2430 pmap_destroy(struct pmap *pmap)
2431 {
2432 	lwp_t *l;
2433 	int i;
2434 
2435 	/*
2436 	 * If we have torn down this pmap, process deferred frees and
2437 	 * invalidations.  Free now if the system is low on memory.
2438 	 * Otherwise, free when the pmap is destroyed thus avoiding a
2439 	 * TLB shootdown.
2440 	 */
2441 	l = curlwp;
2442 	if (__predict_false(l->l_md.md_gc_pmap == pmap)) {
2443 		pmap_check_ptps(pmap);
2444 		if (uvmexp.free < uvmexp.freetarg) {
2445 			pmap_update(pmap);
2446 		} else {
2447 			KASSERT(pmap->pm_gc_ptp == NULL);
2448 			pmap->pm_gc_ptp = l->l_md.md_gc_ptp;
2449 			l->l_md.md_gc_ptp = NULL;
2450 			l->l_md.md_gc_pmap = NULL;
2451 		}
2452 	}
2453 
2454 	/*
2455 	 * drop reference count
2456 	 */
2457 
2458 	if (atomic_dec_uint_nv(&pmap->pm_obj[0].uo_refs) > 0) {
2459 		return;
2460 	}
2461 
2462 #ifdef DIAGNOSTIC
2463 	CPU_INFO_ITERATOR cii;
2464 	struct cpu_info *ci;
2465 
2466 	for (CPU_INFO_FOREACH(cii, ci)) {
2467 		if (ci->ci_pmap == pmap)
2468 			panic("destroying pmap being used");
2469 #if defined(XEN) && defined(__x86_64__)
2470 		for (i = 0; i < PDIR_SLOT_PTE; i++) {
2471 			if (pmap->pm_pdir[i] != 0 &&
2472 			    ci->ci_kpm_pdir[i] == pmap->pm_pdir[i]) {
2473 				printf("pmap_destroy(%p) pmap_kernel %p "
2474 				    "curcpu %d cpu %d ci_pmap %p "
2475 				    "ci->ci_kpm_pdir[%d]=%" PRIx64
2476 				    " pmap->pm_pdir[%d]=%" PRIx64 "\n",
2477 				    pmap, pmap_kernel(), curcpu()->ci_index,
2478 				    ci->ci_index, ci->ci_pmap,
2479 				    i, ci->ci_kpm_pdir[i],
2480 				    i, pmap->pm_pdir[i]);
2481 				panic("%s: used pmap", __func__);
2482 			}
2483 		}
2484 #endif
2485 	}
2486 #endif /* DIAGNOSTIC */
2487 
2488 	/*
2489 	 * Reference count is zero, free pmap resources and then free pmap.
2490 	 * First, remove it from global list of pmaps.
2491 	 */
2492 
2493 	mutex_enter(&pmaps_lock);
2494 	LIST_REMOVE(pmap, pm_list);
2495 	mutex_exit(&pmaps_lock);
2496 
2497 	/*
2498 	 * Process deferred PTP frees.  No TLB shootdown required, as the
2499 	 * PTP pages are no longer visible to any CPU.
2500 	 */
2501 
2502 	pmap_free_ptps(pmap->pm_gc_ptp);
2503 
2504 	pool_cache_put(&pmap_pdp_cache, pmap->pm_pdir);
2505 
2506 #ifdef USER_LDT
2507 	if (pmap->pm_ldt != NULL) {
2508 		/*
2509 		 * no need to switch the LDT; this address space is gone,
2510 		 * nothing is using it.
2511 		 *
2512 		 * No need to lock the pmap for ldt_free (or anything else),
2513 		 * we're the last one to use it.
2514 		 */
2515 		mutex_enter(&cpu_lock);
2516 		ldt_free(pmap->pm_ldt_sel);
2517 		mutex_exit(&cpu_lock);
2518 		uvm_km_free(kernel_map, (vaddr_t)pmap->pm_ldt,
2519 		    pmap->pm_ldt_len, UVM_KMF_WIRED);
2520 	}
2521 #endif
2522 
2523 	for (i = 0; i < PTP_LEVELS - 1; i++) {
2524 		uvm_obj_destroy(&pmap->pm_obj[i], false);
2525 		mutex_destroy(&pmap->pm_obj_lock[i]);
2526 	}
2527 	kcpuset_destroy(pmap->pm_cpus);
2528 	kcpuset_destroy(pmap->pm_kernel_cpus);
2529 #ifdef XEN
2530 	kcpuset_destroy(pmap->pm_xen_ptp_cpus);
2531 #endif
2532 
2533 	pmap_check_ptps(pmap);
2534 	pool_cache_put(&pmap_cache, pmap);
2535 }
2536 
2537 /*
2538  * pmap_remove_all: pmap is being torn down by the current thread.
2539  * avoid unnecessary invalidations.
2540  */
2541 
2542 void
2543 pmap_remove_all(struct pmap *pmap)
2544 {
2545 	lwp_t *l = curlwp;
2546 
2547 	KASSERT(l->l_md.md_gc_pmap == NULL);
2548 
2549 	l->l_md.md_gc_pmap = pmap;
2550 }
2551 
2552 #if defined(PMAP_FORK)
2553 /*
2554  * pmap_fork: perform any necessary data structure manipulation when
2555  * a VM space is forked.
2556  */
2557 
2558 void
2559 pmap_fork(struct pmap *pmap1, struct pmap *pmap2)
2560 {
2561 #ifdef USER_LDT
2562 	union descriptor *new_ldt;
2563 	size_t len;
2564 	int sel;
2565 
2566 	if (__predict_true(pmap1->pm_ldt == NULL)) {
2567 		return;
2568 	}
2569 
2570 	/*
2571 	 * Copy the LDT into the new process.
2572 	 *
2573 	 * Read pmap1's ldt pointer and length unlocked; if it changes
2574 	 * behind our back we'll retry. This will starve if there's a
2575 	 * stream of LDT changes in another thread but that should not
2576 	 * happen.
2577 	 */
2578 
2579  retry:
2580 	if (pmap1->pm_ldt != NULL) {
2581 		len = pmap1->pm_ldt_len;
2582 		/* Allocate space for the new process's LDT */
2583 		new_ldt = (union descriptor *)uvm_km_alloc(kernel_map, len, 0,
2584 		    UVM_KMF_WIRED);
2585 		if (new_ldt == NULL) {
2586 			printf("WARNING: %s: unable to allocate LDT space\n",
2587 			    __func__);
2588 			return;
2589 		}
2590 		mutex_enter(&cpu_lock);
2591 		/* Get a GDT slot for it */
2592 		sel = ldt_alloc(new_ldt, len);
2593 		if (sel == -1) {
2594 			mutex_exit(&cpu_lock);
2595 			uvm_km_free(kernel_map, (vaddr_t)new_ldt, len,
2596 			    UVM_KMF_WIRED);
2597 			printf("WARNING: %s: unable to allocate LDT selector\n",
2598 			    __func__);
2599 			return;
2600 		}
2601 	} else {
2602 		/* Wasn't anything there after all. */
2603 		len = -1;
2604 		new_ldt = NULL;
2605 		sel = -1;
2606 		mutex_enter(&cpu_lock);
2607 	}
2608 
2609  	/* If there's still something there now that we have cpu_lock... */
2610  	if (pmap1->pm_ldt != NULL) {
2611 		if (len != pmap1->pm_ldt_len) {
2612 			/* Oops, it changed. Drop what we did and try again */
2613 			if (len != -1) {
2614 				ldt_free(sel);
2615 				uvm_km_free(kernel_map, (vaddr_t)new_ldt,
2616 				    len, UVM_KMF_WIRED);
2617 			}
2618 			mutex_exit(&cpu_lock);
2619 			goto retry;
2620 		}
2621 
2622 		/* Copy the LDT data and install it in pmap2 */
2623 		memcpy(new_ldt, pmap1->pm_ldt, len);
2624 		pmap2->pm_ldt = new_ldt;
2625 		pmap2->pm_ldt_len = pmap1->pm_ldt_len;
2626 		pmap2->pm_ldt_sel = sel;
2627 		len = -1;
2628 	}
2629 
2630 	if (len != -1) {
2631 		/* There wasn't still something there, so mop up */
2632 		ldt_free(sel);
2633 		mutex_exit(&cpu_lock);
2634 		uvm_km_free(kernel_map, (vaddr_t)new_ldt, len,
2635 		    UVM_KMF_WIRED);
2636 	} else {
2637 		mutex_exit(&cpu_lock);
2638 	}
2639 #endif /* USER_LDT */
2640 }
2641 #endif /* PMAP_FORK */
2642 
2643 #ifdef USER_LDT
2644 
2645 /*
2646  * pmap_ldt_xcall: cross call used by pmap_ldt_sync.  if the named pmap
2647  * is active, reload LDTR.
2648  */
2649 static void
2650 pmap_ldt_xcall(void *arg1, void *arg2)
2651 {
2652 	struct pmap *pm;
2653 
2654 	kpreempt_disable();
2655 	pm = arg1;
2656 	if (curcpu()->ci_pmap == pm) {
2657 		lldt(pm->pm_ldt_sel);
2658 	}
2659 	kpreempt_enable();
2660 }
2661 
2662 /*
2663  * pmap_ldt_sync: LDT selector for the named pmap is changing.  swap
2664  * in the new selector on all CPUs.
2665  */
2666 void
2667 pmap_ldt_sync(struct pmap *pm)
2668 {
2669 	uint64_t where;
2670 
2671 	KASSERT(mutex_owned(&cpu_lock));
2672 
2673 	pmap_ldt_evcnt.ev_count++;
2674 	where = xc_broadcast(0, pmap_ldt_xcall, pm, NULL);
2675 	xc_wait(where);
2676 }
2677 
2678 /*
2679  * pmap_ldt_cleanup: if the pmap has a local LDT, deallocate it, and
2680  * restore the default.
2681  */
2682 
2683 void
2684 pmap_ldt_cleanup(struct lwp *l)
2685 {
2686 	pmap_t pmap = l->l_proc->p_vmspace->vm_map.pmap;
2687 	union descriptor *dp = NULL;
2688 	size_t len = 0;
2689 	int sel = -1;
2690 
2691 	if (__predict_true(pmap->pm_ldt == NULL)) {
2692 		return;
2693 	}
2694 
2695 	mutex_enter(&cpu_lock);
2696 	if (pmap->pm_ldt != NULL) {
2697 		sel = pmap->pm_ldt_sel;
2698 		dp = pmap->pm_ldt;
2699 		len = pmap->pm_ldt_len;
2700 		pmap->pm_ldt_sel = GSYSSEL(GLDT_SEL, SEL_KPL);
2701 		pmap->pm_ldt = NULL;
2702 		pmap->pm_ldt_len = 0;
2703 		pmap_ldt_sync(pmap);
2704 		ldt_free(sel);
2705 		uvm_km_free(kernel_map, (vaddr_t)dp, len, UVM_KMF_WIRED);
2706 	}
2707 	mutex_exit(&cpu_lock);
2708 }
2709 #endif /* USER_LDT */
2710 
2711 /*
2712  * pmap_activate: activate a process' pmap
2713  *
2714  * => must be called with kernel preemption disabled
2715  * => if lwp is the curlwp, then set ci_want_pmapload so that
2716  *    actual MMU context switch will be done by pmap_load() later
2717  */
2718 
2719 void
2720 pmap_activate(struct lwp *l)
2721 {
2722 	struct cpu_info *ci;
2723 	struct pmap *pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map);
2724 
2725 	KASSERT(kpreempt_disabled());
2726 
2727 	ci = curcpu();
2728 
2729 	if (l == ci->ci_curlwp) {
2730 		KASSERT(ci->ci_want_pmapload == 0);
2731 		KASSERT(ci->ci_tlbstate != TLBSTATE_VALID);
2732 
2733 		/*
2734 		 * no need to switch to kernel vmspace because
2735 		 * it's a subset of any vmspace.
2736 		 */
2737 
2738 		if (pmap == pmap_kernel()) {
2739 			ci->ci_want_pmapload = 0;
2740 			return;
2741 		}
2742 
2743 		ci->ci_want_pmapload = 1;
2744 	}
2745 }
2746 
2747 /*
2748  * pmap_reactivate: try to regain reference to the pmap.
2749  *
2750  * => Must be called with kernel preemption disabled.
2751  */
2752 
2753 static bool
2754 pmap_reactivate(struct pmap *pmap)
2755 {
2756 	struct cpu_info * const ci = curcpu();
2757 	const cpuid_t cid = cpu_index(ci);
2758 	bool result;
2759 
2760 	KASSERT(kpreempt_disabled());
2761 #if defined(XEN) && defined(__x86_64__)
2762 	KASSERT(pmap_pdirpa(pmap, 0) == ci->ci_xen_current_user_pgd);
2763 #elif defined(PAE)
2764 	KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(ci->ci_pae_l3_pdir[0]));
2765 #elif !defined(XEN)
2766 	KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(rcr3()));
2767 #endif
2768 
2769 	/*
2770 	 * If we still have a lazy reference to this pmap, we can assume
2771 	 * that there was no TLB shootdown for this pmap in the meantime.
2772 	 *
2773 	 * The order of events here is important as we must synchronize
2774 	 * with TLB shootdown interrupts.  Declare interest in invalidations
2775 	 * (TLBSTATE_VALID) and then check the CPU set, which the IPIs can
2776 	 * change only when the state is TLBSTATE_LAZY.
2777 	 */
2778 
2779 	ci->ci_tlbstate = TLBSTATE_VALID;
2780 	KASSERT(kcpuset_isset(pmap->pm_kernel_cpus, cid));
2781 
2782 	if (kcpuset_isset(pmap->pm_cpus, cid)) {
2783 		/* We have the reference, state is valid. */
2784 		result = true;
2785 	} else {
2786 		/* Must reload the TLB. */
2787 		kcpuset_atomic_set(pmap->pm_cpus, cid);
2788 		result = false;
2789 	}
2790 	return result;
2791 }
2792 
2793 /*
2794  * pmap_load: perform the actual pmap switch, i.e. fill in %cr3 register
2795  * and relevant LDT info.
2796  *
2797  * Ensures that the current process' pmap is loaded on the current CPU's
2798  * MMU and that there are no stale TLB entries.
2799  *
2800  * => The caller should disable kernel preemption or do check-and-retry
2801  *    to prevent a preemption from undoing our efforts.
2802  * => This function may block.
2803  */
2804 void
2805 pmap_load(void)
2806 {
2807 	struct cpu_info *ci;
2808 	struct pmap *pmap, *oldpmap;
2809 	struct lwp *l;
2810 	struct pcb *pcb;
2811 	cpuid_t cid;
2812 	uint64_t ncsw;
2813 
2814 	kpreempt_disable();
2815  retry:
2816 	ci = curcpu();
2817 	if (!ci->ci_want_pmapload) {
2818 		kpreempt_enable();
2819 		return;
2820 	}
2821 	l = ci->ci_curlwp;
2822 	ncsw = l->l_ncsw;
2823 
2824 	/* should be able to take ipis. */
2825 	KASSERT(ci->ci_ilevel < IPL_HIGH);
2826 #ifdef XEN
2827 	/* Check to see if interrupts are enabled (ie; no events are masked) */
2828 	KASSERT(x86_read_psl() == 0);
2829 #else
2830 	KASSERT((x86_read_psl() & PSL_I) != 0);
2831 #endif
2832 
2833 	KASSERT(l != NULL);
2834 	pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map);
2835 	KASSERT(pmap != pmap_kernel());
2836 	oldpmap = ci->ci_pmap;
2837 	pcb = lwp_getpcb(l);
2838 
2839 	if (pmap == oldpmap) {
2840 		if (!pmap_reactivate(pmap)) {
2841 			u_int gen = uvm_emap_gen_return();
2842 
2843 			/*
2844 			 * pmap has been changed during deactivated.
2845 			 * our tlb may be stale.
2846 			 */
2847 
2848 			tlbflush();
2849 			uvm_emap_update(gen);
2850 		}
2851 
2852 		ci->ci_want_pmapload = 0;
2853 		kpreempt_enable();
2854 		return;
2855 	}
2856 
2857 	/*
2858 	 * Acquire a reference to the new pmap and perform the switch.
2859 	 */
2860 
2861 	pmap_reference(pmap);
2862 
2863 	cid = cpu_index(ci);
2864 	kcpuset_atomic_clear(oldpmap->pm_cpus, cid);
2865 	kcpuset_atomic_clear(oldpmap->pm_kernel_cpus, cid);
2866 
2867 #if defined(XEN) && defined(__x86_64__)
2868 	KASSERT(pmap_pdirpa(oldpmap, 0) == ci->ci_xen_current_user_pgd ||
2869 	    oldpmap == pmap_kernel());
2870 #elif defined(PAE)
2871 	KASSERT(pmap_pdirpa(oldpmap, 0) == pmap_pte2pa(ci->ci_pae_l3_pdir[0]));
2872 #elif !defined(XEN)
2873 	KASSERT(pmap_pdirpa(oldpmap, 0) == pmap_pte2pa(rcr3()));
2874 #endif
2875 	KASSERT(!kcpuset_isset(pmap->pm_cpus, cid));
2876 	KASSERT(!kcpuset_isset(pmap->pm_kernel_cpus, cid));
2877 
2878 	/*
2879 	 * Mark the pmap in use by this CPU.  Again, we must synchronize
2880 	 * with TLB shootdown interrupts, so set the state VALID first,
2881 	 * then register us for shootdown events on this pmap.
2882 	 */
2883 	ci->ci_tlbstate = TLBSTATE_VALID;
2884 	kcpuset_atomic_set(pmap->pm_cpus, cid);
2885 	kcpuset_atomic_set(pmap->pm_kernel_cpus, cid);
2886 	ci->ci_pmap = pmap;
2887 
2888 	/*
2889 	 * update tss.  now that we have registered for invalidations
2890 	 * from other CPUs, we're good to load the page tables.
2891 	 */
2892 #ifdef PAE
2893 	pcb->pcb_cr3 = ci->ci_pae_l3_pdirpa;
2894 #else
2895 	pcb->pcb_cr3 = pmap_pdirpa(pmap, 0);
2896 #endif
2897 
2898 #ifdef i386
2899 #ifndef XEN
2900 	ci->ci_tss.tss_ldt = pmap->pm_ldt_sel;
2901 	ci->ci_tss.tss_cr3 = pcb->pcb_cr3;
2902 #endif /* !XEN */
2903 #endif /* i386 */
2904 
2905 	lldt(pmap->pm_ldt_sel);
2906 
2907 	u_int gen = uvm_emap_gen_return();
2908 	cpu_load_pmap(pmap, oldpmap);
2909 	uvm_emap_update(gen);
2910 
2911 	ci->ci_want_pmapload = 0;
2912 
2913 	/*
2914 	 * we're now running with the new pmap.  drop the reference
2915 	 * to the old pmap.  if we block, we need to go around again.
2916 	 */
2917 
2918 	pmap_destroy(oldpmap);
2919 	if (l->l_ncsw != ncsw) {
2920 		goto retry;
2921 	}
2922 
2923 	kpreempt_enable();
2924 }
2925 
2926 /*
2927  * pmap_deactivate: deactivate a process' pmap.
2928  *
2929  * => Must be called with kernel preemption disabled (high IPL is enough).
2930  */
2931 void
2932 pmap_deactivate(struct lwp *l)
2933 {
2934 	struct pmap *pmap;
2935 	struct cpu_info *ci;
2936 
2937 	KASSERT(kpreempt_disabled());
2938 
2939 	if (l != curlwp) {
2940 		return;
2941 	}
2942 
2943 	/*
2944 	 * Wait for pending TLB shootdowns to complete.  Necessary because
2945 	 * TLB shootdown state is per-CPU, and the LWP may be coming off
2946 	 * the CPU before it has a chance to call pmap_update(), e.g. due
2947 	 * to kernel preemption or blocking routine in between.
2948 	 */
2949 	pmap_tlb_shootnow();
2950 
2951 	ci = curcpu();
2952 
2953 	if (ci->ci_want_pmapload) {
2954 		/*
2955 		 * ci_want_pmapload means that our pmap is not loaded on
2956 		 * the CPU or TLB might be stale.  note that pmap_kernel()
2957 		 * is always considered loaded.
2958 		 */
2959 		KASSERT(vm_map_pmap(&l->l_proc->p_vmspace->vm_map)
2960 		    != pmap_kernel());
2961 		KASSERT(vm_map_pmap(&l->l_proc->p_vmspace->vm_map)
2962 		    != ci->ci_pmap || ci->ci_tlbstate != TLBSTATE_VALID);
2963 
2964 		/*
2965 		 * userspace has not been touched.
2966 		 * nothing to do here.
2967 		 */
2968 
2969 		ci->ci_want_pmapload = 0;
2970 		return;
2971 	}
2972 
2973 	pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map);
2974 
2975 	if (pmap == pmap_kernel()) {
2976 		return;
2977 	}
2978 
2979 #if defined(XEN) && defined(__x86_64__)
2980 	KASSERT(pmap_pdirpa(pmap, 0) == ci->ci_xen_current_user_pgd);
2981 #elif defined(PAE)
2982 	KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(ci->ci_pae_l3_pdir[0]));
2983 #elif !defined(XEN)
2984 	KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(rcr3()));
2985 #endif
2986 	KASSERT(ci->ci_pmap == pmap);
2987 
2988 	/*
2989 	 * we aren't interested in TLB invalidations for this pmap,
2990 	 * at least for the time being.
2991 	 */
2992 
2993 	KASSERT(ci->ci_tlbstate == TLBSTATE_VALID);
2994 	ci->ci_tlbstate = TLBSTATE_LAZY;
2995 }
2996 
2997 /*
2998  * end of lifecycle functions
2999  */
3000 
3001 /*
3002  * some misc. functions
3003  */
3004 
3005 int
3006 pmap_pdes_invalid(vaddr_t va, pd_entry_t * const *pdes, pd_entry_t *lastpde)
3007 {
3008 	int i;
3009 	unsigned long index;
3010 	pd_entry_t pde;
3011 
3012 	for (i = PTP_LEVELS; i > 1; i--) {
3013 		index = pl_i(va, i);
3014 		pde = pdes[i - 2][index];
3015 		if ((pde & PG_V) == 0)
3016 			return i;
3017 	}
3018 	if (lastpde != NULL)
3019 		*lastpde = pde;
3020 	return 0;
3021 }
3022 
3023 /*
3024  * pmap_extract: extract a PA for the given VA
3025  */
3026 
3027 bool
3028 pmap_extract(struct pmap *pmap, vaddr_t va, paddr_t *pap)
3029 {
3030 	pt_entry_t *ptes, pte;
3031 	pd_entry_t pde;
3032 	pd_entry_t * const *pdes;
3033 	struct pmap *pmap2;
3034 	struct cpu_info *ci;
3035 	paddr_t pa;
3036 	lwp_t *l;
3037 	bool hard, rv;
3038 
3039 #ifdef __HAVE_DIRECT_MAP
3040 	if (va >= PMAP_DIRECT_BASE && va < PMAP_DIRECT_END) {
3041 		if (pap != NULL) {
3042 			*pap = va - PMAP_DIRECT_BASE;
3043 		}
3044 		return true;
3045 	}
3046 #endif
3047 
3048 	rv = false;
3049 	pa = 0;
3050 	l = curlwp;
3051 
3052 	kpreempt_disable();
3053 	ci = l->l_cpu;
3054 	if (__predict_true(!ci->ci_want_pmapload && ci->ci_pmap == pmap) ||
3055 	    pmap == pmap_kernel()) {
3056 		/*
3057 		 * no need to lock, because it's pmap_kernel() or our
3058 		 * own pmap and is active.  if a user pmap, the caller
3059 		 * will hold the vm_map write/read locked and so prevent
3060 		 * entries from disappearing while we are here.  ptps
3061 		 * can disappear via pmap_remove() and pmap_protect(),
3062 		 * but they are called with the vm_map write locked.
3063 		 */
3064 		hard = false;
3065 		ptes = PTE_BASE;
3066 		pdes = normal_pdes;
3067 	} else {
3068 		/* we lose, do it the hard way. */
3069 		hard = true;
3070 		pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
3071 	}
3072 	if (pmap_pdes_valid(va, pdes, &pde)) {
3073 		pte = ptes[pl1_i(va)];
3074 		if (pde & PG_PS) {
3075 			pa = (pde & PG_LGFRAME) | (va & (NBPD_L2 - 1));
3076 			rv = true;
3077 		} else if (__predict_true((pte & PG_V) != 0)) {
3078 			pa = pmap_pte2pa(pte) | (va & (NBPD_L1 - 1));
3079 			rv = true;
3080 		}
3081 	}
3082 	if (__predict_false(hard)) {
3083 		pmap_unmap_ptes(pmap, pmap2);
3084 	}
3085 	kpreempt_enable();
3086 	if (pap != NULL) {
3087 		*pap = pa;
3088 	}
3089 	return rv;
3090 }
3091 
3092 
3093 /*
3094  * vtophys: virtual address to physical address.  For use by
3095  * machine-dependent code only.
3096  */
3097 
3098 paddr_t
3099 vtophys(vaddr_t va)
3100 {
3101 	paddr_t pa;
3102 
3103 	if (pmap_extract(pmap_kernel(), va, &pa) == true)
3104 		return (pa);
3105 	return (0);
3106 }
3107 
3108 __strict_weak_alias(pmap_extract_ma, pmap_extract);
3109 
3110 #ifdef XEN
3111 
3112 /*
3113  * vtomach: virtual address to machine address.  For use by
3114  * machine-dependent code only.
3115  */
3116 
3117 paddr_t
3118 vtomach(vaddr_t va)
3119 {
3120 	paddr_t pa;
3121 
3122 	if (pmap_extract_ma(pmap_kernel(), va, &pa) == true)
3123 		return (pa);
3124 	return (0);
3125 }
3126 
3127 #endif /* XEN */
3128 
3129 /*
3130  * pmap_virtual_space: used during bootup [pmap_steal_memory] to
3131  *	determine the bounds of the kernel virtual addess space.
3132  */
3133 
3134 void
3135 pmap_virtual_space(vaddr_t *startp, vaddr_t *endp)
3136 {
3137 	*startp = virtual_avail;
3138 	*endp = virtual_end;
3139 }
3140 
3141 /*
3142  * pmap_zero_page: zero a page
3143  */
3144 
3145 void
3146 pmap_zero_page(paddr_t pa)
3147 {
3148 #if defined(__HAVE_DIRECT_MAP)
3149 	pagezero(PMAP_DIRECT_MAP(pa));
3150 #else
3151 #if defined(XEN)
3152 	if (XEN_VERSION_SUPPORTED(3, 4))
3153 		xen_pagezero(pa);
3154 #endif
3155 	struct cpu_info *ci;
3156 	pt_entry_t *zpte;
3157 	vaddr_t zerova;
3158 
3159 	const pd_entry_t pteflags = PG_V | PG_RW | pmap_pg_nx | PG_M | PG_U;
3160 
3161 	kpreempt_disable();
3162 
3163 	ci = curcpu();
3164 	zerova = ci->vpage[VPAGE_ZER];
3165 	zpte = ci->vpage_pte[VPAGE_ZER];
3166 
3167 	KASSERTMSG(!*zpte, "pmap_zero_page: lock botch");
3168 
3169 	pmap_pte_set(zpte, pmap_pa2pte(pa) | pteflags);
3170 	pmap_pte_flush();
3171 	pmap_update_pg(zerova);		/* flush TLB */
3172 
3173 	memset((void *)zerova, 0, PAGE_SIZE);
3174 
3175 #if defined(DIAGNOSTIC) || defined(XEN)
3176 	pmap_pte_set(zpte, 0);				/* zap ! */
3177 	pmap_pte_flush();
3178 #endif
3179 
3180 	kpreempt_enable();
3181 #endif /* defined(__HAVE_DIRECT_MAP) */
3182 }
3183 
3184 /*
3185  * pmap_pagezeroidle: the same, for the idle loop page zero'er.
3186  * Returns true if the page was zero'd, false if we aborted for
3187  * some reason.
3188  */
3189 
3190 bool
3191 pmap_pageidlezero(paddr_t pa)
3192 {
3193 #ifdef __HAVE_DIRECT_MAP
3194 	KASSERT(cpu_feature[0] & CPUID_SSE2);
3195 	return sse2_idlezero_page((void *)PMAP_DIRECT_MAP(pa));
3196 #else
3197 	struct cpu_info *ci;
3198 	pt_entry_t *zpte;
3199 	vaddr_t zerova;
3200 	bool rv;
3201 
3202 	const pd_entry_t pteflags = PG_V | PG_RW | pmap_pg_nx | PG_M | PG_U;
3203 
3204 	ci = curcpu();
3205 	zerova = ci->vpage[VPAGE_ZER];
3206 	zpte = ci->vpage_pte[VPAGE_ZER];
3207 
3208 	KASSERT(cpu_feature[0] & CPUID_SSE2);
3209 	KASSERT(*zpte == 0);
3210 
3211 	pmap_pte_set(zpte, pmap_pa2pte(pa) | pteflags);
3212 	pmap_pte_flush();
3213 	pmap_update_pg(zerova);		/* flush TLB */
3214 
3215 	rv = sse2_idlezero_page((void *)zerova);
3216 
3217 #if defined(DIAGNOSTIC) || defined(XEN)
3218 	pmap_pte_set(zpte, 0);				/* zap ! */
3219 	pmap_pte_flush();
3220 #endif
3221 
3222 	return rv;
3223 #endif
3224 }
3225 
3226 /*
3227  * pmap_copy_page: copy a page
3228  */
3229 
3230 void
3231 pmap_copy_page(paddr_t srcpa, paddr_t dstpa)
3232 {
3233 #if defined(__HAVE_DIRECT_MAP)
3234 	vaddr_t srcva = PMAP_DIRECT_MAP(srcpa);
3235 	vaddr_t dstva = PMAP_DIRECT_MAP(dstpa);
3236 
3237 	memcpy((void *)dstva, (void *)srcva, PAGE_SIZE);
3238 #else
3239 #if defined(XEN)
3240 	if (XEN_VERSION_SUPPORTED(3, 4)) {
3241 		xen_copy_page(srcpa, dstpa);
3242 		return;
3243 	}
3244 #endif
3245 	struct cpu_info *ci;
3246 	pt_entry_t *srcpte, *dstpte;
3247 	vaddr_t srcva, dstva;
3248 
3249 	const pd_entry_t pteflags = PG_V | PG_RW | pmap_pg_nx | PG_U;
3250 
3251 	kpreempt_disable();
3252 
3253 	ci = curcpu();
3254 	srcva = ci->vpage[VPAGE_SRC];
3255 	dstva = ci->vpage[VPAGE_DST];
3256 	srcpte = ci->vpage_pte[VPAGE_SRC];
3257 	dstpte = ci->vpage_pte[VPAGE_DST];
3258 
3259 	KASSERT(*srcpte == 0 && *dstpte == 0);
3260 
3261 	pmap_pte_set(srcpte, pmap_pa2pte(srcpa) | pteflags);
3262 	pmap_pte_set(dstpte, pmap_pa2pte(dstpa) | pteflags | PG_M);
3263 	pmap_pte_flush();
3264 	pmap_update_2pg(srcva, dstva);
3265 
3266 	memcpy((void *)dstva, (void *)srcva, PAGE_SIZE);
3267 
3268 #if defined(DIAGNOSTIC) || defined(XEN)
3269 	pmap_pte_set(srcpte, 0);
3270 	pmap_pte_set(dstpte, 0);
3271 	pmap_pte_flush();
3272 #endif
3273 
3274 	kpreempt_enable();
3275 #endif /* defined(__HAVE_DIRECT_MAP) */
3276 }
3277 
3278 static pt_entry_t *
3279 pmap_map_ptp(struct vm_page *ptp)
3280 {
3281 #ifdef __HAVE_DIRECT_MAP
3282 	return (void *)PMAP_DIRECT_MAP(VM_PAGE_TO_PHYS(ptp));
3283 #else
3284 	struct cpu_info *ci;
3285 	pt_entry_t *ptppte;
3286 	vaddr_t ptpva;
3287 
3288 	KASSERT(kpreempt_disabled());
3289 
3290 #ifndef XEN
3291 	const pd_entry_t pteflags = PG_V | PG_RW | pmap_pg_nx | PG_U | PG_M;
3292 #else
3293 	const pd_entry_t pteflags = PG_V | pmap_pg_nx | PG_U | PG_M;
3294 #endif
3295 
3296 	ci = curcpu();
3297 	ptpva = ci->vpage[VPAGE_PTP];
3298 	ptppte = ci->vpage_pte[VPAGE_PTP];
3299 
3300 	pmap_pte_set(ptppte, pmap_pa2pte(VM_PAGE_TO_PHYS(ptp)) | pteflags);
3301 
3302 	pmap_pte_flush();
3303 	pmap_update_pg(ptpva);
3304 
3305 	return (pt_entry_t *)ptpva;
3306 #endif
3307 }
3308 
3309 static void
3310 pmap_unmap_ptp(void)
3311 {
3312 #ifndef __HAVE_DIRECT_MAP
3313 #if defined(DIAGNOSTIC) || defined(XEN)
3314 	struct cpu_info *ci;
3315 	pt_entry_t *pte;
3316 
3317 	KASSERT(kpreempt_disabled());
3318 
3319 	ci = curcpu();
3320 	pte = ci->vpage_pte[VPAGE_PTP];
3321 
3322 	if (*pte != 0) {
3323 		pmap_pte_set(pte, 0);
3324 		pmap_pte_flush();
3325 	}
3326 #endif
3327 #endif
3328 }
3329 
3330 static pt_entry_t *
3331 pmap_map_pte(struct pmap *pmap, struct vm_page *ptp, vaddr_t va)
3332 {
3333 
3334 	KASSERT(kpreempt_disabled());
3335 	if (pmap_is_curpmap(pmap)) {
3336 		return &PTE_BASE[pl1_i(va)]; /* (k)vtopte */
3337 	}
3338 	KASSERT(ptp != NULL);
3339 	return pmap_map_ptp(ptp) + pl1_pi(va);
3340 }
3341 
3342 static void
3343 pmap_unmap_pte(void)
3344 {
3345 
3346 	KASSERT(kpreempt_disabled());
3347 
3348 	pmap_unmap_ptp();
3349 }
3350 
3351 /*
3352  * p m a p   r e m o v e   f u n c t i o n s
3353  *
3354  * functions that remove mappings
3355  */
3356 
3357 /*
3358  * pmap_remove_ptes: remove PTEs from a PTP
3359  *
3360  * => caller must hold pmap's lock
3361  * => PTP must be mapped into KVA
3362  * => PTP should be null if pmap == pmap_kernel()
3363  * => must be called with kernel preemption disabled
3364  * => returns composite pte if at least one page should be shot down
3365  */
3366 
3367 static void
3368 pmap_remove_ptes(struct pmap *pmap, struct vm_page *ptp, vaddr_t ptpva,
3369 		 vaddr_t startva, vaddr_t endva, struct pv_entry **pv_tofree)
3370 {
3371 	pt_entry_t *pte = (pt_entry_t *)ptpva;
3372 
3373 	KASSERT(pmap == pmap_kernel() || mutex_owned(pmap->pm_lock));
3374 	KASSERT(kpreempt_disabled());
3375 
3376 	/*
3377 	 * note that ptpva points to the PTE that maps startva.   this may
3378 	 * or may not be the first PTE in the PTP.
3379 	 *
3380 	 * we loop through the PTP while there are still PTEs to look at
3381 	 * and the wire_count is greater than 1 (because we use the wire_count
3382 	 * to keep track of the number of real PTEs in the PTP).
3383 	 */
3384 	while (startva < endva && (ptp == NULL || ptp->wire_count > 1)) {
3385 		(void)pmap_remove_pte(pmap, ptp, pte, startva, pv_tofree);
3386 		startva += PAGE_SIZE;
3387 		pte++;
3388 	}
3389 }
3390 
3391 
3392 /*
3393  * pmap_remove_pte: remove a single PTE from a PTP.
3394  *
3395  * => caller must hold pmap's lock
3396  * => PTP must be mapped into KVA
3397  * => PTP should be null if pmap == pmap_kernel()
3398  * => returns true if we removed a mapping
3399  * => must be called with kernel preemption disabled
3400  */
3401 static bool
3402 pmap_remove_pte(struct pmap *pmap, struct vm_page *ptp, pt_entry_t *pte,
3403 		vaddr_t va, struct pv_entry **pv_tofree)
3404 {
3405 	struct pv_entry *pve;
3406 	struct vm_page *pg;
3407 	struct pmap_page *pp;
3408 	pt_entry_t opte;
3409 
3410 	KASSERT(pmap == pmap_kernel() || mutex_owned(pmap->pm_lock));
3411 	KASSERT(kpreempt_disabled());
3412 
3413 	if (!pmap_valid_entry(*pte)) {
3414 		/* VA not mapped. */
3415 		return false;
3416 	}
3417 
3418 	/* Atomically save the old PTE and zap it. */
3419 	opte = pmap_pte_testset(pte, 0);
3420 	if (!pmap_valid_entry(opte)) {
3421 		return false;
3422 	}
3423 
3424 	pmap_exec_account(pmap, va, opte, 0);
3425 	pmap_stats_update_bypte(pmap, 0, opte);
3426 
3427 	if (ptp) {
3428 		/*
3429 		 * Dropping a PTE.  Make sure that the PDE is flushed.
3430 		 */
3431 		ptp->wire_count--;
3432 		if (ptp->wire_count <= 1) {
3433 			opte |= PG_U;
3434 		}
3435 	}
3436 
3437 	if ((opte & PG_U) != 0) {
3438 		pmap_tlb_shootdown(pmap, va, opte, TLBSHOOT_REMOVE_PTE);
3439 	}
3440 
3441 	/*
3442 	 * If we are not on a pv_head list - we are done.
3443 	 */
3444 	if ((opte & PG_PVLIST) == 0) {
3445 #ifndef DOM0OPS
3446 		KASSERTMSG((PHYS_TO_VM_PAGE(pmap_pte2pa(opte)) == NULL),
3447 		    "managed page without PG_PVLIST for %#"PRIxVADDR, va);
3448 		KASSERTMSG((pmap_pv_tracked(pmap_pte2pa(opte)) == NULL),
3449 		    "pv-tracked page without PG_PVLIST for %#"PRIxVADDR, va);
3450 #endif
3451 		return true;
3452 	}
3453 
3454 	if ((pg = PHYS_TO_VM_PAGE(pmap_pte2pa(opte))) != NULL) {
3455 		KASSERT(uvm_page_locked_p(pg));
3456 		pp = VM_PAGE_TO_PP(pg);
3457 	} else if ((pp = pmap_pv_tracked(pmap_pte2pa(opte))) == NULL) {
3458 		paddr_t pa = pmap_pte2pa(opte);
3459 		panic("%s: PG_PVLIST with pv-untracked page"
3460 		    " va = %#"PRIxVADDR"pa = %#"PRIxPADDR" (%#"PRIxPADDR")",
3461 		    __func__, va, pa, atop(pa));
3462 	}
3463 
3464 	/* Sync R/M bits. */
3465 	pp->pp_attrs |= opte;
3466 	pve = pmap_remove_pv(pp, ptp, va);
3467 
3468 	if (pve) {
3469 		pve->pve_next = *pv_tofree;
3470 		*pv_tofree = pve;
3471 	}
3472 	return true;
3473 }
3474 
3475 /*
3476  * pmap_remove: mapping removal function.
3477  *
3478  * => caller should not be holding any pmap locks
3479  */
3480 
3481 void
3482 pmap_remove(struct pmap *pmap, vaddr_t sva, vaddr_t eva)
3483 {
3484 	pt_entry_t *ptes;
3485 	pd_entry_t pde;
3486 	pd_entry_t * const *pdes;
3487 	struct pv_entry *pv_tofree = NULL;
3488 	bool result;
3489 	int i;
3490 	paddr_t ptppa;
3491 	vaddr_t blkendva, va = sva;
3492 	struct vm_page *ptp;
3493 	struct pmap *pmap2;
3494 
3495 	kpreempt_disable();
3496 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);	/* locks pmap */
3497 
3498 	/*
3499 	 * removing one page?  take shortcut function.
3500 	 */
3501 
3502 	if (va + PAGE_SIZE == eva) {
3503 		if (pmap_pdes_valid(va, pdes, &pde)) {
3504 
3505 			/* PA of the PTP */
3506 			ptppa = pmap_pte2pa(pde);
3507 
3508 			/* Get PTP if non-kernel mapping. */
3509 			if (pmap != pmap_kernel()) {
3510 				ptp = pmap_find_ptp(pmap, va, ptppa, 1);
3511 				KASSERTMSG(ptp != NULL,
3512 				    "%s: unmanaged PTP detected", __func__);
3513 			} else {
3514 				/* Never free kernel PTPs. */
3515 				ptp = NULL;
3516 			}
3517 
3518 			result = pmap_remove_pte(pmap, ptp,
3519 			    &ptes[pl1_i(va)], va, &pv_tofree);
3520 
3521 			/*
3522 			 * if mapping removed and the PTP is no longer
3523 			 * being used, free it!
3524 			 */
3525 
3526 			if (result && ptp && ptp->wire_count <= 1)
3527 				pmap_free_ptp(pmap, ptp, va, ptes, pdes);
3528 		}
3529 	} else for (/* null */ ; va < eva ; va = blkendva) {
3530 		int lvl;
3531 
3532 		/* determine range of block */
3533 		blkendva = x86_round_pdr(va+1);
3534 		if (blkendva > eva)
3535 			blkendva = eva;
3536 
3537 		/*
3538 		 * Our PTE mappings should never be removed with pmap_remove.
3539 		 *
3540 		 * XXXmaxv: still needed?
3541 		 *
3542 		 * A long term solution is to move the PTEs out of user address
3543 		 * space, and into kernel address space. Then we can set
3544 		 * VM_MAXUSER_ADDRESS to be VM_MAX_ADDRESS.
3545 		 */
3546 		for (i = 0; i < PDP_SIZE; i++) {
3547 			if (pl_i(va, PTP_LEVELS) == PDIR_SLOT_PTE+i)
3548 				panic("PTE space accessed");
3549 		}
3550 
3551 		lvl = pmap_pdes_invalid(va, pdes, &pde);
3552 		if (lvl != 0) {
3553 			/*
3554 			 * skip a range corresponding to an invalid pde.
3555 			 */
3556 			blkendva = (va & ptp_masks[lvl - 1]) + nbpd[lvl - 1];
3557  			continue;
3558 		}
3559 
3560 		/* PA of the PTP */
3561 		ptppa = pmap_pte2pa(pde);
3562 
3563 		/* Get PTP if non-kernel mapping. */
3564 		if (pmap != pmap_kernel()) {
3565 			ptp = pmap_find_ptp(pmap, va, ptppa, 1);
3566 			KASSERTMSG(ptp != NULL, "%s: unmanaged PTP detected",
3567 			    __func__);
3568 		} else {
3569 			/* Never free kernel PTPs. */
3570 			ptp = NULL;
3571 		}
3572 
3573 		pmap_remove_ptes(pmap, ptp, (vaddr_t)&ptes[pl1_i(va)], va,
3574 		    blkendva, &pv_tofree);
3575 
3576 		/* if PTP is no longer being used, free it! */
3577 		if (ptp && ptp->wire_count <= 1) {
3578 			pmap_free_ptp(pmap, ptp, va, ptes, pdes);
3579 		}
3580 	}
3581 	pmap_unmap_ptes(pmap, pmap2);		/* unlock pmap */
3582 	kpreempt_enable();
3583 
3584 	/* Now we free unused PVs */
3585 	if (pv_tofree)
3586 		pmap_free_pvs(pv_tofree);
3587 }
3588 
3589 /*
3590  * pmap_sync_pv: clear pte bits and return the old value of the pte.
3591  *
3592  * => Caller should disable kernel preemption.
3593  * => issues tlb shootdowns if necessary.
3594  */
3595 
3596 static int
3597 pmap_sync_pv(struct pv_pte *pvpte, pt_entry_t expect, int clearbits,
3598     pt_entry_t *optep)
3599 {
3600 	struct pmap *pmap;
3601 	struct vm_page *ptp;
3602 	vaddr_t va;
3603 	pt_entry_t *ptep;
3604 	pt_entry_t opte;
3605 	pt_entry_t npte;
3606 	bool need_shootdown;
3607 
3608 	ptp = pvpte->pte_ptp;
3609 	va = pvpte->pte_va;
3610 	KASSERT(ptp == NULL || ptp->uobject != NULL);
3611 	KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset);
3612 	pmap = ptp_to_pmap(ptp);
3613 
3614 	KASSERT((expect & ~(PG_FRAME | PG_V)) == 0);
3615 	KASSERT((expect & PG_V) != 0);
3616 	KASSERT(clearbits == ~0 || (clearbits & ~(PG_M | PG_U | PG_RW)) == 0);
3617 	KASSERT(kpreempt_disabled());
3618 
3619 	ptep = pmap_map_pte(pmap, ptp, va);
3620 	do {
3621 		opte = *ptep;
3622 		KASSERT((opte & (PG_M | PG_U)) != PG_M);
3623 		KASSERT((opte & (PG_U | PG_V)) != PG_U);
3624 		KASSERT(opte == 0 || (opte & PG_V) != 0);
3625 		if ((opte & (PG_FRAME | PG_V)) != expect) {
3626 
3627 			/*
3628 			 * we lost a race with a V->P operation like
3629 			 * pmap_remove().  wait for the competitor
3630 			 * reflecting pte bits into mp_attrs.
3631 			 *
3632 			 * issue a redundant TLB shootdown so that
3633 			 * we can wait for its completion.
3634 			 */
3635 
3636 			pmap_unmap_pte();
3637 			if (clearbits != 0) {
3638 				pmap_tlb_shootdown(pmap, va,
3639 				    (pmap == pmap_kernel() ? PG_G : 0),
3640 				    TLBSHOOT_SYNC_PV1);
3641 			}
3642 			return EAGAIN;
3643 		}
3644 
3645 		/*
3646 		 * check if there's anything to do on this pte.
3647 		 */
3648 
3649 		if ((opte & clearbits) == 0) {
3650 			need_shootdown = false;
3651 			break;
3652 		}
3653 
3654 		/*
3655 		 * we need a shootdown if the pte is cached. (PG_U)
3656 		 *
3657 		 * ...unless we are clearing only the PG_RW bit and
3658 		 * it isn't cached as RW. (PG_M)
3659 		 */
3660 
3661 		need_shootdown = (opte & PG_U) != 0 &&
3662 		    !(clearbits == PG_RW && (opte & PG_M) == 0);
3663 
3664 		npte = opte & ~clearbits;
3665 
3666 		/*
3667 		 * if we need a shootdown anyway, clear PG_U and PG_M.
3668 		 */
3669 
3670 		if (need_shootdown) {
3671 			npte &= ~(PG_U | PG_M);
3672 		}
3673 		KASSERT((npte & (PG_M | PG_U)) != PG_M);
3674 		KASSERT((npte & (PG_U | PG_V)) != PG_U);
3675 		KASSERT(npte == 0 || (opte & PG_V) != 0);
3676 	} while (pmap_pte_cas(ptep, opte, npte) != opte);
3677 
3678 	if (need_shootdown) {
3679 		pmap_tlb_shootdown(pmap, va, opte, TLBSHOOT_SYNC_PV2);
3680 	}
3681 	pmap_unmap_pte();
3682 
3683 	*optep = opte;
3684 	return 0;
3685 }
3686 
3687 static void
3688 pmap_pp_remove(struct pmap_page *pp, paddr_t pa)
3689 {
3690 	struct pv_pte *pvpte;
3691 	struct pv_entry *killlist = NULL;
3692 	struct vm_page *ptp;
3693 	pt_entry_t expect;
3694 	int count;
3695 
3696 	expect = pmap_pa2pte(pa) | PG_V;
3697 	count = SPINLOCK_BACKOFF_MIN;
3698 	kpreempt_disable();
3699 startover:
3700 	while ((pvpte = pv_pte_first(pp)) != NULL) {
3701 		struct pmap *pmap;
3702 		struct pv_entry *pve;
3703 		pt_entry_t opte;
3704 		vaddr_t va;
3705 		int error;
3706 
3707 		/*
3708 		 * add a reference to the pmap before clearing the pte.
3709 		 * otherwise the pmap can disappear behind us.
3710 		 */
3711 
3712 		ptp = pvpte->pte_ptp;
3713 		pmap = ptp_to_pmap(ptp);
3714 		if (ptp != NULL) {
3715 			pmap_reference(pmap);
3716 		}
3717 
3718 		error = pmap_sync_pv(pvpte, expect, ~0, &opte);
3719 		if (error == EAGAIN) {
3720 			int hold_count;
3721 			KERNEL_UNLOCK_ALL(curlwp, &hold_count);
3722 			if (ptp != NULL) {
3723 				pmap_destroy(pmap);
3724 			}
3725 			SPINLOCK_BACKOFF(count);
3726 			KERNEL_LOCK(hold_count, curlwp);
3727 			goto startover;
3728 		}
3729 
3730 		pp->pp_attrs |= opte;
3731 		va = pvpte->pte_va;
3732 		pve = pmap_remove_pv(pp, ptp, va);
3733 
3734 		/* update the PTP reference count.  free if last reference. */
3735 		if (ptp != NULL) {
3736 			struct pmap *pmap2;
3737 			pt_entry_t *ptes;
3738 			pd_entry_t * const *pdes;
3739 
3740 			KASSERT(pmap != pmap_kernel());
3741 
3742 			pmap_tlb_shootnow();
3743 			pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
3744 			pmap_stats_update_bypte(pmap, 0, opte);
3745 			ptp->wire_count--;
3746 			if (ptp->wire_count <= 1) {
3747 				pmap_free_ptp(pmap, ptp, va, ptes, pdes);
3748 			}
3749 			pmap_unmap_ptes(pmap, pmap2);
3750 			pmap_destroy(pmap);
3751 		} else {
3752 			KASSERT(pmap == pmap_kernel());
3753 			pmap_stats_update_bypte(pmap, 0, opte);
3754 		}
3755 
3756 		if (pve != NULL) {
3757 			pve->pve_next = killlist;	/* mark it for death */
3758 			killlist = pve;
3759 		}
3760 	}
3761 	pmap_tlb_shootnow();
3762 	kpreempt_enable();
3763 
3764 	/* Now free unused pvs. */
3765 	pmap_free_pvs(killlist);
3766 }
3767 
3768 /*
3769  * pmap_page_remove: remove a managed vm_page from all pmaps that map it
3770  *
3771  * => R/M bits are sync'd back to attrs
3772  */
3773 
3774 void
3775 pmap_page_remove(struct vm_page *pg)
3776 {
3777 	struct pmap_page *pp;
3778 	paddr_t pa;
3779 
3780 	KASSERT(uvm_page_locked_p(pg));
3781 
3782 	pp = VM_PAGE_TO_PP(pg);
3783 	pa = VM_PAGE_TO_PHYS(pg);
3784 	pmap_pp_remove(pp, pa);
3785 }
3786 
3787 /*
3788  * pmap_pv_remove: remove an unmanaged pv-tracked page from all pmaps
3789  *	that map it
3790  */
3791 
3792 void
3793 pmap_pv_remove(paddr_t pa)
3794 {
3795 	struct pmap_page *pp;
3796 
3797 	pp = pmap_pv_tracked(pa);
3798 	if (pp == NULL)
3799 		panic("%s: page not pv-tracked: %#"PRIxPADDR, __func__, pa);
3800 	pmap_pp_remove(pp, pa);
3801 }
3802 
3803 /*
3804  * p m a p   a t t r i b u t e  f u n c t i o n s
3805  * functions that test/change managed page's attributes
3806  * since a page can be mapped multiple times we must check each PTE that
3807  * maps it by going down the pv lists.
3808  */
3809 
3810 /*
3811  * pmap_test_attrs: test a page's attributes
3812  */
3813 
3814 bool
3815 pmap_test_attrs(struct vm_page *pg, unsigned testbits)
3816 {
3817 	struct pmap_page *pp;
3818 	struct pv_pte *pvpte;
3819 	pt_entry_t expect;
3820 	u_int result;
3821 
3822 	KASSERT(uvm_page_locked_p(pg));
3823 
3824 	pp = VM_PAGE_TO_PP(pg);
3825 	if ((pp->pp_attrs & testbits) != 0) {
3826 		return true;
3827 	}
3828 	expect = pmap_pa2pte(VM_PAGE_TO_PHYS(pg)) | PG_V;
3829 	kpreempt_disable();
3830 	for (pvpte = pv_pte_first(pp); pvpte; pvpte = pv_pte_next(pp, pvpte)) {
3831 		pt_entry_t opte;
3832 		int error;
3833 
3834 		if ((pp->pp_attrs & testbits) != 0) {
3835 			break;
3836 		}
3837 		error = pmap_sync_pv(pvpte, expect, 0, &opte);
3838 		if (error == 0) {
3839 			pp->pp_attrs |= opte;
3840 		}
3841 	}
3842 	result = pp->pp_attrs & testbits;
3843 	kpreempt_enable();
3844 
3845 	/*
3846 	 * note that we will exit the for loop with a non-null pve if
3847 	 * we have found the bits we are testing for.
3848 	 */
3849 
3850 	return result != 0;
3851 }
3852 
3853 static bool
3854 pmap_pp_clear_attrs(struct pmap_page *pp, paddr_t pa, unsigned clearbits)
3855 {
3856 	struct pv_pte *pvpte;
3857 	u_int result;
3858 	pt_entry_t expect;
3859 	int count;
3860 
3861 	expect = pmap_pa2pte(pa) | PG_V;
3862 	count = SPINLOCK_BACKOFF_MIN;
3863 	kpreempt_disable();
3864 startover:
3865 	for (pvpte = pv_pte_first(pp); pvpte; pvpte = pv_pte_next(pp, pvpte)) {
3866 		pt_entry_t opte;
3867 		int error;
3868 
3869 		error = pmap_sync_pv(pvpte, expect, clearbits, &opte);
3870 		if (error == EAGAIN) {
3871 			int hold_count;
3872 			KERNEL_UNLOCK_ALL(curlwp, &hold_count);
3873 			SPINLOCK_BACKOFF(count);
3874 			KERNEL_LOCK(hold_count, curlwp);
3875 			goto startover;
3876 		}
3877 		pp->pp_attrs |= opte;
3878 	}
3879 	result = pp->pp_attrs & clearbits;
3880 	pp->pp_attrs &= ~clearbits;
3881 	pmap_tlb_shootnow();
3882 	kpreempt_enable();
3883 
3884 	return result != 0;
3885 }
3886 
3887 /*
3888  * pmap_clear_attrs: clear the specified attribute for a page.
3889  *
3890  * => we return true if we cleared one of the bits we were asked to
3891  */
3892 
3893 bool
3894 pmap_clear_attrs(struct vm_page *pg, unsigned clearbits)
3895 {
3896 	struct pmap_page *pp;
3897 	paddr_t pa;
3898 
3899 	KASSERT(uvm_page_locked_p(pg));
3900 
3901 	pp = VM_PAGE_TO_PP(pg);
3902 	pa = VM_PAGE_TO_PHYS(pg);
3903 
3904 	return pmap_pp_clear_attrs(pp, pa, clearbits);
3905 }
3906 
3907 /*
3908  * pmap_pv_clear_attrs: clear the specified attributes for an unmanaged
3909  *	pv-tracked page.
3910  */
3911 
3912 bool
3913 pmap_pv_clear_attrs(paddr_t pa, unsigned clearbits)
3914 {
3915 	struct pmap_page *pp;
3916 
3917 	pp = pmap_pv_tracked(pa);
3918 	if (pp == NULL)
3919 		panic("%s: page not pv-tracked: %#"PRIxPADDR, __func__, pa);
3920 
3921 	return pmap_pp_clear_attrs(pp, pa, clearbits);
3922 }
3923 
3924 /*
3925  * p m a p   p r o t e c t i o n   f u n c t i o n s
3926  */
3927 
3928 /*
3929  * pmap_page_protect: change the protection of all recorded mappings
3930  *	of a managed page
3931  *
3932  * => NOTE: this is an inline function in pmap.h
3933  */
3934 
3935 /* see pmap.h */
3936 
3937 /*
3938  * pmap_pv_protect: change the protection of all recorded mappings
3939  *	of an unmanaged pv-tracked page
3940  *
3941  * => NOTE: this is an inline function in pmap.h
3942  */
3943 
3944 /* see pmap.h */
3945 
3946 /*
3947  * pmap_protect: set the protection in of the pages in a pmap
3948  *
3949  * => NOTE: this is an inline function in pmap.h
3950  */
3951 
3952 /* see pmap.h */
3953 
3954 /*
3955  * pmap_write_protect: write-protect pages in a pmap.
3956  *
3957  * Note for Xen-amd64. Xen automatically adds PG_u to the kernel pages, but we
3958  * don't need to remove this bit when re-entering the PTEs here: Xen tracks the
3959  * kernel pages with a reserved bit (_PAGE_GUEST_KERNEL), so even if PG_u is
3960  * present the page will still be considered as a kernel page, and the privilege
3961  * separation will be enforced correctly.
3962  */
3963 void
3964 pmap_write_protect(struct pmap *pmap, vaddr_t sva, vaddr_t eva, vm_prot_t prot)
3965 {
3966 	pt_entry_t bit_rem, bit_put;
3967 	pt_entry_t *ptes;
3968 	pt_entry_t * const *pdes;
3969 	struct pmap *pmap2;
3970 	vaddr_t blockend, va;
3971 
3972 	KASSERT(curlwp->l_md.md_gc_pmap != pmap);
3973 
3974 	bit_rem = 0;
3975 	if (!(prot & VM_PROT_WRITE))
3976 		bit_rem = PG_RW;
3977 
3978 	bit_put = 0;
3979 	if (!(prot & VM_PROT_EXECUTE))
3980 		bit_put = pmap_pg_nx;
3981 
3982 	sva &= PG_FRAME;
3983 	eva &= PG_FRAME;
3984 
3985 	/* Acquire pmap. */
3986 	kpreempt_disable();
3987 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
3988 
3989 	for (va = sva ; va < eva; va = blockend) {
3990 		pt_entry_t *spte, *epte;
3991 		int i;
3992 
3993 		blockend = x86_round_pdr(va + 1);
3994 		if (blockend > eva)
3995 			blockend = eva;
3996 
3997 		/*
3998 		 * Our PTE mappings should never be write-protected.
3999 		 *
4000 		 * XXXmaxv: still needed?
4001 		 *
4002 		 * A long term solution is to move the PTEs out of user address
4003 		 * space, and into kernel address space. Then we can set
4004 		 * VM_MAXUSER_ADDRESS to be VM_MAX_ADDRESS.
4005 		 */
4006 		for (i = 0; i < PDP_SIZE; i++) {
4007 			if (pl_i(va, PTP_LEVELS) == PDIR_SLOT_PTE+i)
4008 				panic("PTE space accessed");
4009 		}
4010 
4011 		/* Is it a valid block? */
4012 		if (!pmap_pdes_valid(va, pdes, NULL)) {
4013 			continue;
4014 		}
4015 		KASSERT(va < VM_MAXUSER_ADDRESS || va >= VM_MAX_ADDRESS);
4016 
4017 		spte = &ptes[pl1_i(va)];
4018 		epte = &ptes[pl1_i(blockend)];
4019 
4020 		for (/* */; spte < epte; spte++) {
4021 			pt_entry_t opte, npte;
4022 
4023 			do {
4024 				opte = *spte;
4025 				if (!pmap_valid_entry(opte)) {
4026 					goto next;
4027 				}
4028 				npte = (opte & ~bit_rem) | bit_put;
4029 			} while (pmap_pte_cas(spte, opte, npte) != opte);
4030 
4031 			if ((opte & PG_M) != 0) {
4032 				vaddr_t tva = x86_ptob(spte - ptes);
4033 				pmap_tlb_shootdown(pmap, tva, opte,
4034 				    TLBSHOOT_WRITE_PROTECT);
4035 			}
4036 next:;
4037 		}
4038 	}
4039 
4040 	/* Release pmap. */
4041 	pmap_unmap_ptes(pmap, pmap2);
4042 	kpreempt_enable();
4043 }
4044 
4045 /*
4046  * pmap_unwire: clear the wired bit in the PTE.
4047  *
4048  * => Mapping should already be present.
4049  */
4050 void
4051 pmap_unwire(struct pmap *pmap, vaddr_t va)
4052 {
4053 	pt_entry_t *ptes, *ptep, opte;
4054 	pd_entry_t * const *pdes;
4055 	struct pmap *pmap2;
4056 
4057 	/* Acquire pmap. */
4058 	kpreempt_disable();
4059 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
4060 
4061 	if (!pmap_pdes_valid(va, pdes, NULL)) {
4062 		panic("%s: invalid PDE va=%#" PRIxVADDR, __func__, va);
4063 	}
4064 
4065 	ptep = &ptes[pl1_i(va)];
4066 	opte = *ptep;
4067 	KASSERT(pmap_valid_entry(opte));
4068 
4069 	if (opte & PG_W) {
4070 		pt_entry_t npte = opte & ~PG_W;
4071 
4072 		opte = pmap_pte_testset(ptep, npte);
4073 		pmap_stats_update_bypte(pmap, npte, opte);
4074 	} else {
4075 		printf("%s: wiring for pmap %p va %#" PRIxVADDR
4076 		    "did not change!\n", __func__, pmap, va);
4077 	}
4078 
4079 	/* Release pmap. */
4080 	pmap_unmap_ptes(pmap, pmap2);
4081 	kpreempt_enable();
4082 }
4083 
4084 /*
4085  * pmap_copy: copy mappings from one pmap to another
4086  *
4087  * => optional function
4088  * void pmap_copy(dst_pmap, src_pmap, dst_addr, len, src_addr)
4089  */
4090 
4091 /*
4092  * defined as macro in pmap.h
4093  */
4094 
4095 __strict_weak_alias(pmap_enter, pmap_enter_default);
4096 
4097 int
4098 pmap_enter_default(pmap_t pmap, vaddr_t va, paddr_t pa, vm_prot_t prot,
4099     u_int flags)
4100 {
4101 	return pmap_enter_ma(pmap, va, pa, pa, prot, flags, 0);
4102 }
4103 
4104 /*
4105  * pmap_enter: enter a mapping into a pmap
4106  *
4107  * => must be done "now" ... no lazy-evaluation
4108  * => we set pmap => pv_head locking
4109  */
4110 int
4111 pmap_enter_ma(struct pmap *pmap, vaddr_t va, paddr_t ma, paddr_t pa,
4112 	   vm_prot_t prot, u_int flags, int domid)
4113 {
4114 	pt_entry_t *ptes, opte, npte;
4115 	pt_entry_t *ptep;
4116 	pd_entry_t * const *pdes;
4117 	struct vm_page *ptp;
4118 	struct vm_page *new_pg, *old_pg;
4119 	struct pmap_page *new_pp, *old_pp;
4120 	struct pv_entry *old_pve = NULL;
4121 	struct pv_entry *new_pve;
4122 	struct pv_entry *new_sparepve;
4123 	int error;
4124 	bool wired = (flags & PMAP_WIRED) != 0;
4125 	struct pmap *pmap2;
4126 
4127 	KASSERT(pmap_initialized);
4128 	KASSERT(curlwp->l_md.md_gc_pmap != pmap);
4129 	KASSERT(va < VM_MAX_KERNEL_ADDRESS);
4130 	KASSERTMSG(va != (vaddr_t)PDP_BASE, "%s: trying to map va=%#"
4131 	    PRIxVADDR " over PDP!", __func__, va);
4132 	KASSERTMSG(va < VM_MIN_KERNEL_ADDRESS ||
4133 	    pmap_valid_entry(pmap->pm_pdir[pl_i(va, PTP_LEVELS)]),
4134 	    "%s: missing kernel PTP for va=%#" PRIxVADDR, __func__, va);
4135 
4136 #ifdef XEN
4137 	KASSERT(domid == DOMID_SELF || pa == 0);
4138 #endif /* XEN */
4139 
4140 	npte = ma | protection_codes[prot] | PG_V;
4141 	npte |= pmap_pat_flags(flags);
4142 	if (wired)
4143 	        npte |= PG_W;
4144 	if (va < VM_MAXUSER_ADDRESS)
4145 		npte |= PG_u;
4146 	else if (va < VM_MAX_ADDRESS)
4147 		panic("PTE space accessed");	/* XXXmaxv: no longer needed? */
4148 
4149 	if (pmap == pmap_kernel())
4150 		npte |= pmap_pg_g;
4151 	if (flags & VM_PROT_ALL) {
4152 		npte |= PG_U;
4153 		if (flags & VM_PROT_WRITE) {
4154 			KASSERT((npte & PG_RW) != 0);
4155 			npte |= PG_M;
4156 		}
4157 	}
4158 
4159 #ifdef XEN
4160 	if (domid != DOMID_SELF)
4161 		new_pg = NULL;
4162 	else
4163 #endif
4164 		new_pg = PHYS_TO_VM_PAGE(pa);
4165 	if (new_pg != NULL) {
4166 		/* This is a managed page */
4167 		npte |= PG_PVLIST;
4168 		new_pp = VM_PAGE_TO_PP(new_pg);
4169 	} else if ((new_pp = pmap_pv_tracked(pa)) != NULL) {
4170 		/* This is an unmanaged pv-tracked page */
4171 		npte |= PG_PVLIST;
4172 	} else {
4173 		new_pp = NULL;
4174 	}
4175 
4176 	/*
4177 	 * Try to get pves now if we might need them.
4178 	 * Keep going even if we fail, since we will not actually need them
4179 	 * if we are just changing the permissions on an existing mapping,
4180 	 * but we won't know if that's the case until later.
4181 	 */
4182 
4183 	bool needpves = pmap_pp_needs_pve(new_pp);
4184 	if (needpves) {
4185 		new_pve = pool_cache_get(&pmap_pv_cache, PR_NOWAIT);
4186 		new_sparepve = pool_cache_get(&pmap_pv_cache, PR_NOWAIT);
4187 	} else {
4188 		new_pve = NULL;
4189 		new_sparepve = NULL;
4190 	}
4191 
4192 	kpreempt_disable();
4193 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);	/* locks pmap */
4194 	if (pmap == pmap_kernel()) {
4195 		ptp = NULL;
4196 	} else {
4197 		ptp = pmap_get_ptp(pmap, va, pdes, flags);
4198 		if (ptp == NULL) {
4199 			pmap_unmap_ptes(pmap, pmap2);
4200 			if (flags & PMAP_CANFAIL) {
4201 				error = ENOMEM;
4202 				goto out;
4203 			}
4204 			panic("%s: get ptp failed", __func__);
4205 		}
4206 	}
4207 
4208 	/*
4209 	 * Check if there is an existing mapping.  If we are now sure that
4210 	 * we need pves and we failed to allocate them earlier, handle that.
4211 	 * Caching the value of oldpa here is safe because only the mod/ref bits
4212 	 * can change while the pmap is locked.
4213 	 */
4214 
4215 	ptep = &ptes[pl1_i(va)];
4216 	opte = *ptep;
4217 	bool have_oldpa = pmap_valid_entry(opte);
4218 	paddr_t oldpa = pmap_pte2pa(opte);
4219 
4220 	if (needpves && (!have_oldpa || oldpa != pa) &&
4221 	    (new_pve == NULL || new_sparepve == NULL)) {
4222 		pmap_unmap_ptes(pmap, pmap2);
4223 		if (flags & PMAP_CANFAIL) {
4224 			error = ENOMEM;
4225 			goto out;
4226 		}
4227 		panic("%s: pve allocation failed", __func__);
4228 	}
4229 
4230 	/*
4231 	 * update the pte.
4232 	 */
4233 
4234 	do {
4235 		opte = *ptep;
4236 
4237 		/*
4238 		 * if the same page, inherit PG_U and PG_M.
4239 		 */
4240 		if (((opte ^ npte) & (PG_FRAME | PG_V)) == 0) {
4241 			npte |= opte & (PG_U | PG_M);
4242 		}
4243 #if defined(XEN)
4244 		if (domid != DOMID_SELF) {
4245 			/* pmap_pte_cas with error handling */
4246 			int s = splvm();
4247 			if (opte != *ptep) {
4248 				splx(s);
4249 				continue;
4250 			}
4251 			error = xpq_update_foreign(
4252 			    vtomach((vaddr_t)ptep), npte, domid);
4253 			splx(s);
4254 			if (error) {
4255 				if (ptp != NULL && ptp->wire_count <= 1) {
4256 					pmap_free_ptp(pmap, ptp, va, ptes, pdes);
4257 				}
4258 				pmap_unmap_ptes(pmap, pmap2);
4259 				goto out;
4260 			}
4261 			break;
4262 		}
4263 #endif /* defined(XEN) */
4264 	} while (pmap_pte_cas(ptep, opte, npte) != opte);
4265 
4266 	/*
4267 	 * update statistics and PTP's reference count.
4268 	 */
4269 
4270 	pmap_stats_update_bypte(pmap, npte, opte);
4271 	if (ptp != NULL && !have_oldpa) {
4272 		ptp->wire_count++;
4273 	}
4274 	KASSERT(ptp == NULL || ptp->wire_count > 1);
4275 
4276 	/*
4277 	 * if the same page, we can skip pv_entry handling.
4278 	 */
4279 
4280 	if (((opte ^ npte) & (PG_FRAME | PG_V)) == 0) {
4281 		KASSERT(((opte ^ npte) & PG_PVLIST) == 0);
4282 		goto same_pa;
4283 	}
4284 
4285 	/*
4286 	 * if old page is pv-tracked, remove pv_entry from its list.
4287 	 */
4288 
4289 	if ((~opte & (PG_V | PG_PVLIST)) == 0) {
4290 		if ((old_pg = PHYS_TO_VM_PAGE(oldpa)) != NULL) {
4291 			KASSERT(uvm_page_locked_p(old_pg));
4292 			old_pp = VM_PAGE_TO_PP(old_pg);
4293 		} else if ((old_pp = pmap_pv_tracked(oldpa)) == NULL) {
4294 			panic("%s: PG_PVLIST with pv-untracked page"
4295 			    " va = %#"PRIxVADDR
4296 			    " pa = %#" PRIxPADDR " (%#" PRIxPADDR ")",
4297 			    __func__, va, oldpa, atop(pa));
4298 		}
4299 
4300 		old_pve = pmap_remove_pv(old_pp, ptp, va);
4301 		old_pp->pp_attrs |= opte;
4302 	}
4303 
4304 	/*
4305 	 * if new page is pv-tracked, insert pv_entry into its list.
4306 	 */
4307 
4308 	if (new_pp) {
4309 		new_pve = pmap_enter_pv(new_pp, new_pve, &new_sparepve, ptp, va);
4310 	}
4311 
4312 same_pa:
4313 	pmap_unmap_ptes(pmap, pmap2);
4314 
4315 	/*
4316 	 * shootdown tlb if necessary.
4317 	 */
4318 
4319 	if ((~opte & (PG_V | PG_U)) == 0 &&
4320 	    ((opte ^ npte) & (PG_FRAME | PG_RW)) != 0) {
4321 		pmap_tlb_shootdown(pmap, va, opte, TLBSHOOT_ENTER);
4322 	}
4323 
4324 	error = 0;
4325 out:
4326 	kpreempt_enable();
4327 	if (old_pve != NULL) {
4328 		pool_cache_put(&pmap_pv_cache, old_pve);
4329 	}
4330 	if (new_pve != NULL) {
4331 		pool_cache_put(&pmap_pv_cache, new_pve);
4332 	}
4333 	if (new_sparepve != NULL) {
4334 		pool_cache_put(&pmap_pv_cache, new_sparepve);
4335 	}
4336 
4337 	return error;
4338 }
4339 
4340 static paddr_t
4341 pmap_get_physpage(void)
4342 {
4343 	struct vm_page *ptp;
4344 	struct pmap *kpm = pmap_kernel();
4345 	paddr_t pa;
4346 
4347 	if (!uvm.page_init_done) {
4348 		/*
4349 		 * We're growing the kernel pmap early (from
4350 		 * uvm_pageboot_alloc()). This case must be
4351 		 * handled a little differently.
4352 		 */
4353 
4354 		if (!uvm_page_physget(&pa))
4355 			panic("%s: out of memory", __func__);
4356 #if defined(__HAVE_DIRECT_MAP)
4357 		pagezero(PMAP_DIRECT_MAP(pa));
4358 #else
4359 #if defined(XEN)
4360 		if (XEN_VERSION_SUPPORTED(3, 4)) {
4361 			xen_pagezero(pa);
4362 			return pa;
4363 		}
4364 #endif
4365 		kpreempt_disable();
4366 		pmap_pte_set(early_zero_pte, pmap_pa2pte(pa) | PG_V |
4367 		    PG_RW | pmap_pg_nx);
4368 		pmap_pte_flush();
4369 		pmap_update_pg((vaddr_t)early_zerop);
4370 		memset(early_zerop, 0, PAGE_SIZE);
4371 #if defined(DIAGNOSTIC) || defined(XEN)
4372 		pmap_pte_set(early_zero_pte, 0);
4373 		pmap_pte_flush();
4374 #endif /* defined(DIAGNOSTIC) */
4375 		kpreempt_enable();
4376 #endif /* defined(__HAVE_DIRECT_MAP) */
4377 	} else {
4378 		/* XXX */
4379 		ptp = uvm_pagealloc(NULL, 0, NULL,
4380 				    UVM_PGA_USERESERVE|UVM_PGA_ZERO);
4381 		if (ptp == NULL)
4382 			panic("%s: out of memory", __func__);
4383 		ptp->flags &= ~PG_BUSY;
4384 		ptp->wire_count = 1;
4385 		pa = VM_PAGE_TO_PHYS(ptp);
4386 	}
4387 	pmap_stats_update(kpm, 1, 0);
4388 
4389 	return pa;
4390 }
4391 
4392 /*
4393  * Expand the page tree with the specified amount of PTPs, mapping virtual
4394  * addresses starting at kva. We populate all the levels but the last one
4395  * (L1). The nodes of the tree are created as RWX, but the pages covered
4396  * will be kentered in L1, with proper permissions.
4397  *
4398  * Used only by pmap_growkernel.
4399  */
4400 static void
4401 pmap_alloc_level(struct pmap *cpm, vaddr_t kva, long *needed_ptps)
4402 {
4403 	unsigned long i;
4404 	paddr_t pa;
4405 	unsigned long index, endindex;
4406 	int level;
4407 	pd_entry_t *pdep;
4408 #ifdef XEN
4409 	int s = splvm(); /* protect xpq_* */
4410 #endif
4411 
4412 	for (level = PTP_LEVELS; level > 1; level--) {
4413 		if (level == PTP_LEVELS)
4414 			pdep = cpm->pm_pdir;
4415 		else
4416 			pdep = normal_pdes[level - 2];
4417 		index = pl_i_roundup(kva, level);
4418 		endindex = index + needed_ptps[level - 1] - 1;
4419 
4420 		for (i = index; i <= endindex; i++) {
4421 			pt_entry_t pte;
4422 
4423 			KASSERT(!pmap_valid_entry(pdep[i]));
4424 			pa = pmap_get_physpage();
4425 			pte = pmap_pa2pte(pa) | PG_V | PG_RW;
4426 			pmap_pte_set(&pdep[i], pte);
4427 
4428 #if defined(XEN) && (defined(PAE) || defined(__x86_64__))
4429 			if (level == PTP_LEVELS && i >= PDIR_SLOT_KERN) {
4430 				if (__predict_true(
4431 				    cpu_info_primary.ci_flags & CPUF_PRESENT)) {
4432 					/* update per-cpu PMDs on all cpus */
4433 					xen_kpm_sync(pmap_kernel(), i);
4434 				} else {
4435 					/*
4436 					 * too early; update primary CPU
4437 					 * PMD only (without locks)
4438 					 */
4439 #ifdef PAE
4440 					pd_entry_t *cpu_pdep =
4441 					    &cpu_info_primary.ci_kpm_pdir[l2tol2(i)];
4442 #endif
4443 #ifdef __x86_64__
4444 					pd_entry_t *cpu_pdep =
4445 						&cpu_info_primary.ci_kpm_pdir[i];
4446 #endif
4447 					pmap_pte_set(cpu_pdep, pte);
4448 				}
4449 			}
4450 #endif /* XEN && (PAE || __x86_64__) */
4451 
4452 			KASSERT(level != PTP_LEVELS || nkptp[level - 1] +
4453 			    pl_i(VM_MIN_KERNEL_ADDRESS, level) == i);
4454 			nkptp[level - 1]++;
4455 		}
4456 		pmap_pte_flush();
4457 	}
4458 #ifdef XEN
4459 	splx(s);
4460 #endif
4461 }
4462 
4463 /*
4464  * pmap_growkernel: increase usage of KVM space.
4465  *
4466  * => we allocate new PTPs for the kernel and install them in all
4467  *    the pmaps on the system.
4468  */
4469 
4470 vaddr_t
4471 pmap_growkernel(vaddr_t maxkvaddr)
4472 {
4473 	struct pmap *kpm = pmap_kernel();
4474 	struct pmap *cpm;
4475 #if !defined(XEN) || !defined(__x86_64__)
4476 	struct pmap *pm;
4477 	long old;
4478 #endif
4479 	int s, i;
4480 	long needed_kptp[PTP_LEVELS], target_nptp;
4481 	bool invalidate = false;
4482 
4483 	s = splvm();	/* to be safe */
4484 	mutex_enter(kpm->pm_lock);
4485 
4486 	if (maxkvaddr <= pmap_maxkvaddr) {
4487 		mutex_exit(kpm->pm_lock);
4488 		splx(s);
4489 		return pmap_maxkvaddr;
4490 	}
4491 
4492 	maxkvaddr = x86_round_pdr(maxkvaddr);
4493 #if !defined(XEN) || !defined(__x86_64__)
4494 	old = nkptp[PTP_LEVELS - 1];
4495 #endif
4496 
4497 	/* Initialize needed_kptp. */
4498 	for (i = PTP_LEVELS - 1; i >= 1; i--) {
4499 		target_nptp = pl_i_roundup(maxkvaddr, i + 1) -
4500 		    pl_i_roundup(VM_MIN_KERNEL_ADDRESS, i + 1);
4501 
4502 		if (target_nptp > nkptpmax[i])
4503 			panic("out of KVA space");
4504 		KASSERT(target_nptp >= nkptp[i]);
4505 		needed_kptp[i] = target_nptp - nkptp[i];
4506 	}
4507 
4508 #if defined(XEN) && (defined(__x86_64__) || defined(PAE))
4509 	/* only pmap_kernel(), or the per-cpu map, has kernel entries */
4510 	cpm = kpm;
4511 #else
4512 	/* Get the current pmap */
4513 	if (__predict_true(cpu_info_primary.ci_flags & CPUF_PRESENT)) {
4514 		cpm = curcpu()->ci_pmap;
4515 	} else {
4516 		cpm = kpm;
4517 	}
4518 #endif
4519 
4520 	pmap_alloc_level(cpm, pmap_maxkvaddr, needed_kptp);
4521 
4522 	/*
4523 	 * If the number of top level entries changed, update all pmaps.
4524 	 */
4525 	if (needed_kptp[PTP_LEVELS - 1] != 0) {
4526 #ifdef XEN
4527 #ifdef __x86_64__
4528 		/* nothing, kernel entries are never entered in user pmap */
4529 #else /* __x86_64__ */
4530 		int pdkidx;
4531 #ifndef PAE
4532 		/*
4533 		 * for PAE this is not needed, because pmap_alloc_level()
4534 		 * already did update the per-CPU tables
4535 		 */
4536 		if (cpm != kpm) {
4537 			for (pdkidx = PDIR_SLOT_KERN + old;
4538 			    pdkidx < PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1];
4539 			    pdkidx++) {
4540 				pmap_pte_set(&kpm->pm_pdir[pdkidx],
4541 				    cpm->pm_pdir[pdkidx]);
4542 			}
4543 			pmap_pte_flush();
4544 		}
4545 #endif /* !PAE */
4546 
4547 		mutex_enter(&pmaps_lock);
4548 		LIST_FOREACH(pm, &pmaps, pm_list) {
4549 			for (pdkidx = PDIR_SLOT_KERN + old;
4550 			    pdkidx < PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1];
4551 			    pdkidx++) {
4552 				pmap_pte_set(&pm->pm_pdir[pdkidx],
4553 				    kpm->pm_pdir[pdkidx]);
4554 			}
4555 			pmap_pte_flush();
4556 		}
4557 		mutex_exit(&pmaps_lock);
4558 #endif /* __x86_64__ */
4559 #else /* XEN */
4560 		size_t newpdes;
4561 		newpdes = nkptp[PTP_LEVELS - 1] - old;
4562 		if (cpm != kpm) {
4563 			memcpy(&kpm->pm_pdir[PDIR_SLOT_KERN + old],
4564 			    &cpm->pm_pdir[PDIR_SLOT_KERN + old],
4565 			    newpdes * sizeof(pd_entry_t));
4566 		}
4567 
4568 		mutex_enter(&pmaps_lock);
4569 		LIST_FOREACH(pm, &pmaps, pm_list) {
4570 			memcpy(&pm->pm_pdir[PDIR_SLOT_KERN + old],
4571 			    &kpm->pm_pdir[PDIR_SLOT_KERN + old],
4572 			    newpdes * sizeof (pd_entry_t));
4573 		}
4574 		mutex_exit(&pmaps_lock);
4575 #endif
4576 		invalidate = true;
4577 	}
4578 	pmap_maxkvaddr = maxkvaddr;
4579 	mutex_exit(kpm->pm_lock);
4580 	splx(s);
4581 
4582 	if (invalidate && pmap_initialized) {
4583 		/* Invalidate the PDP cache. */
4584 		pool_cache_invalidate(&pmap_pdp_cache);
4585 	}
4586 
4587 	return maxkvaddr;
4588 }
4589 
4590 #ifdef DEBUG
4591 void pmap_dump(struct pmap *, vaddr_t, vaddr_t);
4592 
4593 /*
4594  * pmap_dump: dump all the mappings from a pmap
4595  *
4596  * => caller should not be holding any pmap locks
4597  */
4598 
4599 void
4600 pmap_dump(struct pmap *pmap, vaddr_t sva, vaddr_t eva)
4601 {
4602 	pt_entry_t *ptes, *pte;
4603 	pd_entry_t * const *pdes;
4604 	struct pmap *pmap2;
4605 	vaddr_t blkendva;
4606 
4607 	/*
4608 	 * if end is out of range truncate.
4609 	 * if (end == start) update to max.
4610 	 */
4611 
4612 	if (eva > VM_MAXUSER_ADDRESS || eva <= sva)
4613 		eva = VM_MAXUSER_ADDRESS;
4614 
4615 	/*
4616 	 * we lock in the pmap => pv_head direction
4617 	 */
4618 
4619 	kpreempt_disable();
4620 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);	/* locks pmap */
4621 
4622 	/*
4623 	 * dumping a range of pages: we dump in PTP sized blocks (4MB)
4624 	 */
4625 
4626 	for (/* null */ ; sva < eva ; sva = blkendva) {
4627 
4628 		/* determine range of block */
4629 		blkendva = x86_round_pdr(sva+1);
4630 		if (blkendva > eva)
4631 			blkendva = eva;
4632 
4633 		/* valid block? */
4634 		if (!pmap_pdes_valid(sva, pdes, NULL))
4635 			continue;
4636 
4637 		pte = &ptes[pl1_i(sva)];
4638 		for (/* null */; sva < blkendva ; sva += PAGE_SIZE, pte++) {
4639 			if (!pmap_valid_entry(*pte))
4640 				continue;
4641 			printf("va %#" PRIxVADDR " -> pa %#" PRIxPADDR
4642 			    " (pte=%#" PRIxPADDR ")\n",
4643 			    sva, (paddr_t)pmap_pte2pa(*pte), (paddr_t)*pte);
4644 		}
4645 	}
4646 	pmap_unmap_ptes(pmap, pmap2);
4647 	kpreempt_enable();
4648 }
4649 #endif
4650 
4651 /*
4652  * pmap_update: process deferred invalidations and frees.
4653  */
4654 
4655 void
4656 pmap_update(struct pmap *pmap)
4657 {
4658 	struct vm_page *empty_ptps;
4659 	lwp_t *l = curlwp;
4660 
4661 	/*
4662 	 * If we have torn down this pmap, invalidate non-global TLB
4663 	 * entries on any processors using it.
4664 	 */
4665 	kpreempt_disable();
4666 	if (__predict_false(l->l_md.md_gc_pmap == pmap)) {
4667 		l->l_md.md_gc_pmap = NULL;
4668 		pmap_tlb_shootdown(pmap, (vaddr_t)-1LL, 0, TLBSHOOT_UPDATE);
4669 	}
4670 
4671 	/*
4672 	 * Initiate any pending TLB shootdowns.  Wait for them to
4673 	 * complete before returning control to the caller.
4674 	 */
4675 	pmap_tlb_shootnow();
4676 	kpreempt_enable();
4677 
4678 	/*
4679 	 * Now that shootdowns are complete, process deferred frees,
4680 	 * but not from interrupt context.
4681 	 */
4682 	if (l->l_md.md_gc_ptp != NULL) {
4683 		KASSERT((l->l_pflag & LP_INTR) == 0);
4684 		if (cpu_intr_p()) {
4685 			return;
4686 		}
4687 		empty_ptps = l->l_md.md_gc_ptp;
4688 		l->l_md.md_gc_ptp = NULL;
4689 		pmap_free_ptps(empty_ptps);
4690 	}
4691 }
4692 
4693 #if PTP_LEVELS > 4
4694 #error "Unsupported number of page table mappings"
4695 #endif
4696 
4697 paddr_t
4698 pmap_init_tmp_pgtbl(paddr_t pg)
4699 {
4700 	static bool maps_loaded;
4701 	static const paddr_t x86_tmp_pml_paddr[] = {
4702 	    4 * PAGE_SIZE,	/* L1 */
4703 	    5 * PAGE_SIZE,	/* L2 */
4704 	    6 * PAGE_SIZE,	/* L3 */
4705 	    7 * PAGE_SIZE	/* L4 */
4706 	};
4707 	static vaddr_t x86_tmp_pml_vaddr[] = { 0, 0, 0, 0 };
4708 
4709 	pd_entry_t *tmp_pml, *kernel_pml;
4710 
4711 	int level;
4712 
4713 	if (!maps_loaded) {
4714 		for (level = 0; level < PTP_LEVELS; ++level) {
4715 			x86_tmp_pml_vaddr[level] =
4716 			    uvm_km_alloc(kernel_map, PAGE_SIZE, 0,
4717 			    UVM_KMF_VAONLY);
4718 
4719 			if (x86_tmp_pml_vaddr[level] == 0)
4720 				panic("mapping of real mode PML failed\n");
4721 			pmap_kenter_pa(x86_tmp_pml_vaddr[level],
4722 			    x86_tmp_pml_paddr[level],
4723 			    VM_PROT_READ | VM_PROT_WRITE, 0);
4724 		}
4725 		pmap_update(pmap_kernel());
4726 		maps_loaded = true;
4727 	}
4728 
4729 	/* Zero levels 1-3 */
4730 	for (level = 0; level < PTP_LEVELS - 1; ++level) {
4731 		tmp_pml = (void *)x86_tmp_pml_vaddr[level];
4732 		memset(tmp_pml, 0, PAGE_SIZE);
4733 	}
4734 
4735 	/* Copy PML4 */
4736 	kernel_pml = pmap_kernel()->pm_pdir;
4737 	tmp_pml = (void *)x86_tmp_pml_vaddr[PTP_LEVELS - 1];
4738 	memcpy(tmp_pml, kernel_pml, PAGE_SIZE);
4739 
4740 #ifdef PAE
4741 	/*
4742 	 * Use the last 4 entries of the L2 page as L3 PD entries. These
4743 	 * last entries are unlikely to be used for temporary mappings.
4744 	 * 508: maps 0->1GB (userland)
4745 	 * 509: unused
4746 	 * 510: unused
4747 	 * 511: maps 3->4GB (kernel)
4748 	 */
4749 	tmp_pml[508] = x86_tmp_pml_paddr[PTP_LEVELS - 1] | PG_V;
4750 	tmp_pml[509] = 0;
4751 	tmp_pml[510] = 0;
4752 	tmp_pml[511] = pmap_pdirpa(pmap_kernel(), PDIR_SLOT_KERN) | PG_V;
4753 #endif
4754 
4755 	for (level = PTP_LEVELS - 1; level > 0; --level) {
4756 		tmp_pml = (void *)x86_tmp_pml_vaddr[level];
4757 
4758 		tmp_pml[pl_i(pg, level + 1)] =
4759 		    (x86_tmp_pml_paddr[level - 1] & PG_FRAME) | PG_RW | PG_V;
4760 	}
4761 
4762 	tmp_pml = (void *)x86_tmp_pml_vaddr[0];
4763 	tmp_pml[pl_i(pg, 1)] = (pg & PG_FRAME) | PG_RW | PG_V;
4764 
4765 #ifdef PAE
4766 	/* Return the PA of the L3 page (entry 508 of the L2 page) */
4767 	return x86_tmp_pml_paddr[PTP_LEVELS - 1] + 508 * sizeof(pd_entry_t);
4768 #endif
4769 
4770 	return x86_tmp_pml_paddr[PTP_LEVELS - 1];
4771 }
4772 
4773 u_int
4774 x86_mmap_flags(paddr_t mdpgno)
4775 {
4776 	u_int nflag = (mdpgno >> X86_MMAP_FLAG_SHIFT) & X86_MMAP_FLAG_MASK;
4777 	u_int pflag = 0;
4778 
4779 	if (nflag & X86_MMAP_FLAG_PREFETCH)
4780 		pflag |= PMAP_WRITE_COMBINE;
4781 
4782 	return pflag;
4783 }
4784