xref: /netbsd-src/sys/arch/x86/x86/pmap.c (revision e89934bbf778a6d6d6894877c4da59d0c7835b0f)
1 /*	$NetBSD: pmap.c,v 1.240 2017/02/11 14:11:24 maxv Exp $	*/
2 
3 /*-
4  * Copyright (c) 2008, 2010, 2016, 2017 The NetBSD Foundation, Inc.
5  * All rights reserved.
6  *
7  * This code is derived from software contributed to The NetBSD Foundation
8  * by Andrew Doran, and by Maxime Villard.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29  * POSSIBILITY OF SUCH DAMAGE.
30  */
31 
32 /*
33  * Copyright (c) 2007 Manuel Bouyer.
34  *
35  * Redistribution and use in source and binary forms, with or without
36  * modification, are permitted provided that the following conditions
37  * are met:
38  * 1. Redistributions of source code must retain the above copyright
39  *    notice, this list of conditions and the following disclaimer.
40  * 2. Redistributions in binary form must reproduce the above copyright
41  *    notice, this list of conditions and the following disclaimer in the
42  *    documentation and/or other materials provided with the distribution.
43  *
44  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
45  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
46  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
47  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
48  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
49  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
50  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
51  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
52  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
53  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
54  *
55  */
56 
57 /*
58  * Copyright (c) 2006 Mathieu Ropert <mro@adviseo.fr>
59  *
60  * Permission to use, copy, modify, and distribute this software for any
61  * purpose with or without fee is hereby granted, provided that the above
62  * copyright notice and this permission notice appear in all copies.
63  *
64  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
65  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
66  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
67  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
68  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
69  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
70  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
71  */
72 
73 /*
74  * Copyright (c) 1997 Charles D. Cranor and Washington University.
75  * All rights reserved.
76  *
77  * Redistribution and use in source and binary forms, with or without
78  * modification, are permitted provided that the following conditions
79  * are met:
80  * 1. Redistributions of source code must retain the above copyright
81  *    notice, this list of conditions and the following disclaimer.
82  * 2. Redistributions in binary form must reproduce the above copyright
83  *    notice, this list of conditions and the following disclaimer in the
84  *    documentation and/or other materials provided with the distribution.
85  *
86  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
87  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
88  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
89  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
90  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
91  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
92  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
93  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
94  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
95  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
96  */
97 
98 /*
99  * Copyright 2001 (c) Wasabi Systems, Inc.
100  * All rights reserved.
101  *
102  * Written by Frank van der Linden for Wasabi Systems, Inc.
103  *
104  * Redistribution and use in source and binary forms, with or without
105  * modification, are permitted provided that the following conditions
106  * are met:
107  * 1. Redistributions of source code must retain the above copyright
108  *    notice, this list of conditions and the following disclaimer.
109  * 2. Redistributions in binary form must reproduce the above copyright
110  *    notice, this list of conditions and the following disclaimer in the
111  *    documentation and/or other materials provided with the distribution.
112  * 3. All advertising materials mentioning features or use of this software
113  *    must display the following acknowledgement:
114  *      This product includes software developed for the NetBSD Project by
115  *      Wasabi Systems, Inc.
116  * 4. The name of Wasabi Systems, Inc. may not be used to endorse
117  *    or promote products derived from this software without specific prior
118  *    written permission.
119  *
120  * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
121  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
122  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
123  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL WASABI SYSTEMS, INC
124  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
125  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
126  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
127  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
128  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
129  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
130  * POSSIBILITY OF SUCH DAMAGE.
131  */
132 
133 /*
134  * This is the i386 pmap modified and generalized to support x86-64
135  * as well. The idea is to hide the upper N levels of the page tables
136  * inside pmap_get_ptp, pmap_free_ptp and pmap_growkernel. The rest
137  * is mostly untouched, except that it uses some more generalized
138  * macros and interfaces.
139  *
140  * This pmap has been tested on the i386 as well, and it can be easily
141  * adapted to PAE.
142  *
143  * fvdl@wasabisystems.com 18-Jun-2001
144  */
145 
146 /*
147  * pmap.c: i386 pmap module rewrite
148  * Chuck Cranor <chuck@netbsd>
149  * 11-Aug-97
150  *
151  * history of this pmap module: in addition to my own input, i used
152  *    the following references for this rewrite of the i386 pmap:
153  *
154  * [1] the NetBSD i386 pmap.   this pmap appears to be based on the
155  *     BSD hp300 pmap done by Mike Hibler at University of Utah.
156  *     it was then ported to the i386 by William Jolitz of UUNET
157  *     Technologies, Inc.   Then Charles M. Hannum of the NetBSD
158  *     project fixed some bugs and provided some speed ups.
159  *
160  * [2] the FreeBSD i386 pmap.   this pmap seems to be the
161  *     Hibler/Jolitz pmap, as modified for FreeBSD by John S. Dyson
162  *     and David Greenman.
163  *
164  * [3] the Mach pmap.   this pmap, from CMU, seems to have migrated
165  *     between several processors.   the VAX version was done by
166  *     Avadis Tevanian, Jr., and Michael Wayne Young.    the i386
167  *     version was done by Lance Berc, Mike Kupfer, Bob Baron,
168  *     David Golub, and Richard Draves.    the alpha version was
169  *     done by Alessandro Forin (CMU/Mach) and Chris Demetriou
170  *     (NetBSD/alpha).
171  */
172 
173 #include <sys/cdefs.h>
174 __KERNEL_RCSID(0, "$NetBSD: pmap.c,v 1.240 2017/02/11 14:11:24 maxv Exp $");
175 
176 #include "opt_user_ldt.h"
177 #include "opt_lockdebug.h"
178 #include "opt_multiprocessor.h"
179 #include "opt_xen.h"
180 
181 #include <sys/param.h>
182 #include <sys/systm.h>
183 #include <sys/proc.h>
184 #include <sys/pool.h>
185 #include <sys/kernel.h>
186 #include <sys/atomic.h>
187 #include <sys/cpu.h>
188 #include <sys/intr.h>
189 #include <sys/xcall.h>
190 #include <sys/kcore.h>
191 
192 #include <uvm/uvm.h>
193 #include <uvm/pmap/pmap_pvt.h>
194 
195 #include <dev/isa/isareg.h>
196 
197 #include <machine/specialreg.h>
198 #include <machine/gdt.h>
199 #include <machine/isa_machdep.h>
200 #include <machine/cpuvar.h>
201 #include <machine/cputypes.h>
202 
203 #include <x86/pmap.h>
204 #include <x86/pmap_pv.h>
205 
206 #include <x86/i82489reg.h>
207 #include <x86/i82489var.h>
208 
209 #ifdef XEN
210 #include <xen/xen-public/xen.h>
211 #include <xen/hypervisor.h>
212 #endif
213 
214 /*
215  * general info:
216  *
217  *  - for an explanation of how the i386 MMU hardware works see
218  *    the comments in <machine/pte.h>.
219  *
220  *  - for an explanation of the general memory structure used by
221  *    this pmap (including the recursive mapping), see the comments
222  *    in <machine/pmap.h>.
223  *
224  * this file contains the code for the "pmap module."   the module's
225  * job is to manage the hardware's virtual to physical address mappings.
226  * note that there are two levels of mapping in the VM system:
227  *
228  *  [1] the upper layer of the VM system uses vm_map's and vm_map_entry's
229  *      to map ranges of virtual address space to objects/files.  for
230  *      example, the vm_map may say: "map VA 0x1000 to 0x22000 read-only
231  *      to the file /bin/ls starting at offset zero."   note that
232  *      the upper layer mapping is not concerned with how individual
233  *      vm_pages are mapped.
234  *
235  *  [2] the lower layer of the VM system (the pmap) maintains the mappings
236  *      from virtual addresses to physical pages.   it tracks which vm_page is
237  *      mapped where.   for example, when you run /bin/ls and start
238  *      at page 0x1000 the fault routine may lookup the correct page
239  *      of the /bin/ls file and then ask the pmap layer to establish
240  *      a mapping for it.
241  *
242  * note that information in the lower layer of the VM system can be
243  * thrown away since it can easily be reconstructed from the info
244  * in the upper layer.
245  *
246  * data structures we use include:
247  *
248  *  - struct pmap: describes the address space of one thread
249  *  - struct pmap_page: describes one pv-tracked page, without
250  *	necessarily a corresponding vm_page
251  *  - struct pv_entry: describes one <PMAP,VA> mapping of a PA
252  *  - struct pv_head: there is one pv_head per pv-tracked page of
253  *	physical memory.   the pv_head points to a list of pv_entry
254  *	structures which describe all the <PMAP,VA> pairs that this
255  *      page is mapped in.    this is critical for page based operations
256  *      such as pmap_page_protect() [change protection on _all_ mappings
257  *      of a page]
258  */
259 
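/*
 * to illustrate the pv tracking described above: a page-based operation
 * (e.g. pmap_page_protect) visits every <PMAP,VA> mapping of a pv-tracked
 * page with the pv_pte_first()/pv_pte_next() iterators defined later in
 * this file.  a minimal sketch, with locking and TLB shootdown elided:
 *
 *	struct pv_pte *pvpte;
 *
 *	for (pvpte = pv_pte_first(pp); pvpte != NULL;
 *	     pvpte = pv_pte_next(pp, pvpte)) {
 *		pmap = ptp_to_pmap(pvpte->pte_ptp);
 *		va = pvpte->pte_va;
 *		... operate on the PTE for <pmap, va> ...
 *	}
 */
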
260 /*
261  * memory allocation
262  *
263  *  - there are three data structures that we must dynamically allocate:
264  *
265  * [A] new process' page directory page (PDP)
266  *	- plan 1: done at pmap_create() we use
267  *	  uvm_km_alloc(kernel_map, PAGE_SIZE)  [fka kmem_alloc] to do this
268  *	  allocation.
269  *
270  * if we are low in free physical memory then we sleep in
271  * uvm_km_alloc -- in this case this is ok since we are creating
272  * a new pmap and should not be holding any locks.
273  *
274  * if the kernel is totally out of virtual space
275  * (i.e. uvm_km_alloc returns NULL), then we panic.
276  *
277  * [B] new page tables pages (PTP)
278  * 	- call uvm_pagealloc()
279  * 		=> success: zero page, add to pm_pdir
280  * 		=> failure: we are out of free vm_pages, let pmap_enter()
281  *		   tell UVM about it.
282  *
283  * note: for kernel PTPs, we start with NKPTP of them.   as we map
284  * kernel memory (at uvm_map time) we check to see if we've grown
285  * the kernel pmap.   if so, we call the optional function
286  * pmap_growkernel() to grow the kernel PTPs in advance.
287  *
288  * [C] pv_entry structures - allocated from the pmap_pv_cache pool_cache(9)
289  */
290 
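/*
 * for case [B] above, the heart of the PTP allocation (see pmap_get_ptp)
 * is, in sketch form:
 *
 *	ptp = uvm_pagealloc(&pmap->pm_obj[level - 1], ptp_va2o(va, level),
 *	    NULL, UVM_PGA_USERESERVE | UVM_PGA_ZERO);
 *	if (ptp == NULL)
 *		return NULL;	<- pmap_enter() then reports ENOMEM to
 *				   UVM when PMAP_CANFAIL is set
 */
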
291 /*
292  * locking
293  *
294  * we have the following locks that we must contend with:
295  *
296  * mutexes:
297  *
298  * - pmap lock (per pmap, part of uvm_object)
299  *   this lock protects the fields in the pmap structure including
300  *   the non-kernel PDEs in the PDP, and the PTEs.  it also locks
301  *   in the alternate PTE space (since that is determined by the
302  *   entry in the PDP).
303  *
304  * - pvh_lock (per pv_head)
305  *   this lock protects the pv_entry list which is chained off the
306  *   pv_head structure for a specific pv-tracked PA.   it is locked
307  *   when traversing the list (e.g. adding/removing mappings,
308  *   syncing R/M bits, etc.)
309  *
310  * - pmaps_lock
311  *   this lock protects the list of active pmaps (headed by "pmaps").
312  *   we lock it when adding or removing pmaps from this list.
313  */
314 
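/*
 * for example, putting a newly created pmap on the global list takes
 * only pmaps_lock:
 *
 *	mutex_enter(&pmaps_lock);
 *	LIST_INSERT_HEAD(&pmaps, pmap, pm_list);
 *	mutex_exit(&pmaps_lock);
 */
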
315 const vaddr_t ptp_masks[] = PTP_MASK_INITIALIZER;
316 const int ptp_shifts[] = PTP_SHIFT_INITIALIZER;
317 const long nkptpmax[] = NKPTPMAX_INITIALIZER;
318 const long nbpd[] = NBPD_INITIALIZER;
319 pd_entry_t * const normal_pdes[] = PDES_INITIALIZER;
320 
321 long nkptp[] = NKPTP_INITIALIZER;
322 
323 struct pmap_head pmaps;
324 kmutex_t pmaps_lock;
325 
326 static vaddr_t pmap_maxkvaddr;
327 
328 /*
329  * XXX kludge: dummy locking to make KASSERTs in uvm_page.c comfortable.
330  * actual locking is done by pm_lock.
331  */
332 #if defined(DIAGNOSTIC)
333 #define	PMAP_SUBOBJ_LOCK(pm, idx) \
334 	KASSERT(mutex_owned((pm)->pm_lock)); \
335 	if ((idx) != 0) \
336 		mutex_enter((pm)->pm_obj[(idx)].vmobjlock)
337 #define	PMAP_SUBOBJ_UNLOCK(pm, idx) \
338 	KASSERT(mutex_owned((pm)->pm_lock)); \
339 	if ((idx) != 0) \
340 		mutex_exit((pm)->pm_obj[(idx)].vmobjlock)
341 #else /* defined(DIAGNOSTIC) */
342 #define	PMAP_SUBOBJ_LOCK(pm, idx)	/* nothing */
343 #define	PMAP_SUBOBJ_UNLOCK(pm, idx)	/* nothing */
344 #endif /* defined(DIAGNOSTIC) */
345 
346 /*
347  * Misc. event counters.
348  */
349 struct evcnt pmap_iobmp_evcnt;
350 struct evcnt pmap_ldt_evcnt;
351 
352 /*
353  * PAT
354  */
355 #define	PATENTRY(n, type)	((type) << ((n) * 8))
356 #define	PAT_UC		0x0ULL
357 #define	PAT_WC		0x1ULL
358 #define	PAT_WT		0x4ULL
359 #define	PAT_WP		0x5ULL
360 #define	PAT_WB		0x6ULL
361 #define	PAT_UCMINUS	0x7ULL
362 
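/*
 * With the encoding above, the PAT value programmed by pat_init() below,
 *
 *	PATENTRY(0, PAT_WB) | PATENTRY(1, PAT_WC) |
 *	PATENTRY(2, PAT_UCMINUS) | PATENTRY(3, PAT_UC) |
 *	PATENTRY(4, PAT_WB) | PATENTRY(5, PAT_WC) |
 *	PATENTRY(6, PAT_UCMINUS) | PATENTRY(7, PAT_UC)
 *
 * evaluates to 0x0007010600070106: one byte per PAT entry, so both halves
 * of the MSR end up as WB, WC, UC-, UC.
 */
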
363 static bool cpu_pat_enabled __read_mostly = false;
364 
365 /*
366  * Global data structures
367  */
368 
369 static struct pmap kernel_pmap_store;	/* the kernel's pmap (proc0) */
370 struct pmap *const kernel_pmap_ptr = &kernel_pmap_store;
371 
372 /*
373  * pmap_pg_nx: if our processor supports PG_NX in the PTE then we
374  * set pmap_pg_nx to PG_NX (otherwise it is zero).
375  */
376 pd_entry_t pmap_pg_nx __read_mostly = 0;
377 
378 /*
379  * pmap_pg_g: if our processor supports PG_G in the PTE then we
380  * set pmap_pg_g to PG_G (otherwise it is zero).
381  */
382 pd_entry_t pmap_pg_g __read_mostly = 0;
383 
384 /*
385  * pmap_largepages: if our processor supports PG_PS and we are
386  * using it, this is set to true.
387  */
388 int pmap_largepages __read_mostly = 0;
389 
390 /*
391  * i386 physical memory comes in a big contig chunk with a small
392  * hole toward the front of it...  the following two paddr_t's
393  * (shared with machdep.c) describe the physical address space
394  * of this machine.
395  */
396 paddr_t lowmem_rsvd __read_mostly;
397 paddr_t avail_start __read_mostly; /* PA of first available physical page */
398 paddr_t avail_end __read_mostly; /* PA of last available physical page */
399 
400 #ifdef XEN
401 paddr_t pmap_pa_start; /* PA of first physical page for this domain */
402 paddr_t pmap_pa_end;   /* PA of last physical page for this domain */
403 #endif
404 
405 #define	VM_PAGE_TO_PP(pg)	(&(pg)->mdpage.mp_pp)
406 
407 #define	PV_HASH_SIZE		32768
408 #define	PV_HASH_LOCK_CNT	32
409 
410 struct pv_hash_lock {
411 	kmutex_t lock;
412 } __aligned(CACHE_LINE_SIZE) pv_hash_locks[PV_HASH_LOCK_CNT]
413     __aligned(CACHE_LINE_SIZE);
414 
415 struct pv_hash_head {
416 	SLIST_HEAD(, pv_entry) hh_list;
417 } pv_hash_heads[PV_HASH_SIZE];
418 
419 static u_int
420 pvhash_hash(struct vm_page *ptp, vaddr_t va)
421 {
422 
423 	return (uintptr_t)ptp / sizeof(*ptp) + (va >> PAGE_SHIFT);
424 }
425 
426 static struct pv_hash_head *
427 pvhash_head(u_int hash)
428 {
429 
430 	return &pv_hash_heads[hash % PV_HASH_SIZE];
431 }
432 
433 static kmutex_t *
434 pvhash_lock(u_int hash)
435 {
436 
437 	return &pv_hash_locks[hash % PV_HASH_LOCK_CNT].lock;
438 }
439 
440 static struct pv_entry *
441 pvhash_remove(struct pv_hash_head *hh, struct vm_page *ptp, vaddr_t va)
442 {
443 	struct pv_entry *pve;
444 	struct pv_entry *prev;
445 
446 	prev = NULL;
447 	SLIST_FOREACH(pve, &hh->hh_list, pve_hash) {
448 		if (pve->pve_pte.pte_ptp == ptp &&
449 		    pve->pve_pte.pte_va == va) {
450 			if (prev != NULL) {
451 				SLIST_REMOVE_AFTER(prev, pve_hash);
452 			} else {
453 				SLIST_REMOVE_HEAD(&hh->hh_list, pve_hash);
454 			}
455 			break;
456 		}
457 		prev = pve;
458 	}
459 	return pve;
460 }
461 
462 /*
463  * Other data structures
464  */
465 
466 static pt_entry_t protection_codes[8] __read_mostly;
467 
468 static bool pmap_initialized __read_mostly = false; /* pmap_init done yet? */
469 
470 /*
471  * The following two vaddr_t's are used during system startup to keep track of
472  * how much of the kernel's VM space we have used. Once the system is started,
473  * the management of the remaining kernel VM space is turned over to the
474  * kernel_map vm_map.
475  */
476 static vaddr_t virtual_avail __read_mostly;	/* VA of first free KVA */
477 static vaddr_t virtual_end __read_mostly;	/* VA of last free KVA */
478 
479 #ifndef XEN
480 /*
481  * LAPIC virtual address, and fake physical address.
482  */
483 volatile vaddr_t local_apic_va __read_mostly;
484 paddr_t local_apic_pa __read_mostly;
485 #endif
486 
487 /*
488  * pool that pmap structures are allocated from
489  */
490 static struct pool_cache pmap_cache;
491 
492 /*
493  * pv_entry cache
494  */
495 static struct pool_cache pmap_pv_cache;
496 
497 #ifndef __HAVE_DIRECT_MAP
498 /*
499  * Special VAs and the PTEs that map them
500  */
501 static pt_entry_t *early_zero_pte;
502 static void pmap_vpage_cpualloc(struct cpu_info *);
503 #ifdef XEN
504 char *early_zerop; /* also referenced from xen_locore() */
505 #else
506 static char *early_zerop;
507 #endif
508 #endif
509 
510 int pmap_enter_default(pmap_t, vaddr_t, paddr_t, vm_prot_t, u_int);
511 
512 /* PDP pool_cache(9) and its callbacks */
513 struct pool_cache pmap_pdp_cache;
514 static int  pmap_pdp_ctor(void *, void *, int);
515 static void pmap_pdp_dtor(void *, void *);
516 #ifdef PAE
517 /* need to allocate items of 4 pages */
518 static void *pmap_pdp_alloc(struct pool *, int);
519 static void pmap_pdp_free(struct pool *, void *);
520 static struct pool_allocator pmap_pdp_allocator = {
521 	.pa_alloc = pmap_pdp_alloc,
522 	.pa_free = pmap_pdp_free,
523 	.pa_pagesz = PAGE_SIZE * PDP_SIZE,
524 };
525 #endif /* PAE */
526 
527 extern vaddr_t idt_vaddr;
528 extern paddr_t idt_paddr;
529 extern vaddr_t gdt_vaddr;
530 extern paddr_t gdt_paddr;
531 extern vaddr_t ldt_vaddr;
532 extern paddr_t ldt_paddr;
533 
534 extern int end;
535 
536 #ifdef i386
537 /* stuff to fix the pentium f00f bug */
538 extern vaddr_t pentium_idt_vaddr;
539 #endif
540 
541 /*
542  * Local prototypes
543  */
544 
545 #ifdef __HAVE_DIRECT_MAP
546 static void pmap_init_directmap(struct pmap *);
547 #endif
548 #ifndef XEN
549 static void pmap_init_lapic(void);
550 static void pmap_remap_largepages(void);
551 #endif
552 
553 static struct vm_page *pmap_get_ptp(struct pmap *, vaddr_t,
554     pd_entry_t * const *);
555 static struct vm_page *pmap_find_ptp(struct pmap *, vaddr_t, paddr_t, int);
556 static void pmap_freepage(struct pmap *, struct vm_page *, int);
557 static void pmap_free_ptp(struct pmap *, struct vm_page *, vaddr_t,
558     pt_entry_t *, pd_entry_t * const *);
559 static bool pmap_remove_pte(struct pmap *, struct vm_page *, pt_entry_t *,
560     vaddr_t, struct pv_entry **);
561 static void pmap_remove_ptes(struct pmap *, struct vm_page *, vaddr_t, vaddr_t,
562     vaddr_t, struct pv_entry **);
563 
564 static paddr_t pmap_get_physpage(void);
565 static void pmap_alloc_level(vaddr_t, long *);
566 
567 static bool pmap_reactivate(struct pmap *);
568 
569 /*
570  * p m a p   h e l p e r   f u n c t i o n s
571  */
572 
573 static inline void
574 pmap_stats_update(struct pmap *pmap, int resid_diff, int wired_diff)
575 {
576 
577 	if (pmap == pmap_kernel()) {
578 		atomic_add_long(&pmap->pm_stats.resident_count, resid_diff);
579 		atomic_add_long(&pmap->pm_stats.wired_count, wired_diff);
580 	} else {
581 		KASSERT(mutex_owned(pmap->pm_lock));
582 		pmap->pm_stats.resident_count += resid_diff;
583 		pmap->pm_stats.wired_count += wired_diff;
584 	}
585 }
586 
587 static inline void
588 pmap_stats_update_bypte(struct pmap *pmap, pt_entry_t npte, pt_entry_t opte)
589 {
590 	int resid_diff = ((npte & PG_V) ? 1 : 0) - ((opte & PG_V) ? 1 : 0);
591 	int wired_diff = ((npte & PG_W) ? 1 : 0) - ((opte & PG_W) ? 1 : 0);
592 
593 	KASSERT((npte & (PG_V | PG_W)) != PG_W);
594 	KASSERT((opte & (PG_V | PG_W)) != PG_W);
595 
596 	pmap_stats_update(pmap, resid_diff, wired_diff);
597 }
598 
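/*
 * For example, installing a valid, wired PTE over an invalid one gives
 * resid_diff = 1 and wired_diff = 1; clearing PG_W on an existing valid
 * mapping gives resid_diff = 0 and wired_diff = -1.
 */
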
599 /*
600  * ptp_to_pmap: lookup pmap by ptp
601  */
602 
603 static struct pmap *
604 ptp_to_pmap(struct vm_page *ptp)
605 {
606 	struct pmap *pmap;
607 
608 	if (ptp == NULL) {
609 		return pmap_kernel();
610 	}
611 	pmap = (struct pmap *)ptp->uobject;
612 	KASSERT(pmap != NULL);
613 	KASSERT(&pmap->pm_obj[0] == ptp->uobject);
614 	return pmap;
615 }
616 
617 static inline struct pv_pte *
618 pve_to_pvpte(struct pv_entry *pve)
619 {
620 
621 	KASSERT((void *)&pve->pve_pte == (void *)pve);
622 	return &pve->pve_pte;
623 }
624 
625 static inline struct pv_entry *
626 pvpte_to_pve(struct pv_pte *pvpte)
627 {
628 	struct pv_entry *pve = (void *)pvpte;
629 
630 	KASSERT(pve_to_pvpte(pve) == pvpte);
631 	return pve;
632 }
633 
634 /*
635  * pv_pte_first, pv_pte_next: PV list iterator.
636  */
637 
638 static struct pv_pte *
639 pv_pte_first(struct pmap_page *pp)
640 {
641 
642 	if ((pp->pp_flags & PP_EMBEDDED) != 0) {
643 		return &pp->pp_pte;
644 	}
645 	return pve_to_pvpte(LIST_FIRST(&pp->pp_head.pvh_list));
646 }
647 
648 static struct pv_pte *
649 pv_pte_next(struct pmap_page *pp, struct pv_pte *pvpte)
650 {
651 
652 	KASSERT(pvpte != NULL);
653 	if (pvpte == &pp->pp_pte) {
654 		KASSERT((pp->pp_flags & PP_EMBEDDED) != 0);
655 		return NULL;
656 	}
657 	KASSERT((pp->pp_flags & PP_EMBEDDED) == 0);
658 	return pve_to_pvpte(LIST_NEXT(pvpte_to_pve(pvpte), pve_list));
659 }
660 
661 /*
662  * pmap_is_curpmap: is this pmap the one currently loaded [in %cr3]?
663  *		of course the kernel is always loaded
664  */
665 
666 bool
667 pmap_is_curpmap(struct pmap *pmap)
668 {
669 	return((pmap == pmap_kernel()) ||
670 	       (pmap == curcpu()->ci_pmap));
671 }
672 
673 /*
674  *	Add a reference to the specified pmap.
675  */
676 
677 void
678 pmap_reference(struct pmap *pmap)
679 {
680 
681 	atomic_inc_uint(&pmap->pm_obj[0].uo_refs);
682 }
683 
684 /*
685  * pmap_map_ptes: map a pmap's PTEs into KVM and lock them in
686  *
687  * there are several pmaps involved.  some or all of them might be same.
688  *
689  *	- the pmap given by the first argument
690  *		our caller wants to access this pmap's PTEs.
691  *
692  *	- pmap_kernel()
693  *		the kernel pmap.  note that it only contains the kernel part
694  *		of the address space which is shared by any pmap.  ie. any
695  *		pmap can be used instead of pmap_kernel() for our purpose.
696  *
697  *	- ci->ci_pmap
698  *		pmap currently loaded on the cpu.
699  *
700  *	- vm_map_pmap(&curproc->p_vmspace->vm_map)
701  *		current process' pmap.
702  *
703  * => we lock enough pmaps to keep the mappings stable while they are in use
704  * => must be undone with pmap_unmap_ptes before returning
705  */
706 
707 void
708 pmap_map_ptes(struct pmap *pmap, struct pmap **pmap2,
709 	      pd_entry_t **ptepp, pd_entry_t * const **pdeppp)
710 {
711 	struct pmap *curpmap;
712 	struct cpu_info *ci;
713 	lwp_t *l;
714 
715 	/* The kernel's pmap is always accessible. */
716 	if (pmap == pmap_kernel()) {
717 		*pmap2 = NULL;
718 		*ptepp = PTE_BASE;
719 		*pdeppp = normal_pdes;
720 		return;
721 	}
722 	KASSERT(kpreempt_disabled());
723 
724 	l = curlwp;
725  retry:
726 	mutex_enter(pmap->pm_lock);
727 	ci = curcpu();
728 	curpmap = ci->ci_pmap;
729 	if (vm_map_pmap(&l->l_proc->p_vmspace->vm_map) == pmap) {
730 		/* Our own pmap so just load it: easy. */
731 		if (__predict_false(ci->ci_want_pmapload)) {
732 			mutex_exit(pmap->pm_lock);
733 			pmap_load();
734 			goto retry;
735 		}
736 		KASSERT(pmap == curpmap);
737 	} else if (pmap == curpmap) {
738 		/*
739 		 * Already on the CPU: make it valid.  This is very
740 		 * often the case during exit(), when we have switched
741 		 * to the kernel pmap in order to destroy a user pmap.
742 		 */
743 		if (!pmap_reactivate(pmap)) {
744 			u_int gen = uvm_emap_gen_return();
745 			tlbflush();
746 			uvm_emap_update(gen);
747 		}
748 	} else {
749 		/*
750 		 * Toss current pmap from CPU, but keep a reference to it.
751 		 * The reference will be dropped by pmap_unmap_ptes().
752 		 * Can happen if we block during exit().
753 		 */
754 		const cpuid_t cid = cpu_index(ci);
755 
756 		kcpuset_atomic_clear(curpmap->pm_cpus, cid);
757 		kcpuset_atomic_clear(curpmap->pm_kernel_cpus, cid);
758 		ci->ci_pmap = pmap;
759 		ci->ci_tlbstate = TLBSTATE_VALID;
760 		kcpuset_atomic_set(pmap->pm_cpus, cid);
761 		kcpuset_atomic_set(pmap->pm_kernel_cpus, cid);
762 		cpu_load_pmap(pmap, curpmap);
763 	}
764 	pmap->pm_ncsw = l->l_ncsw;
765 	*pmap2 = curpmap;
766 	*ptepp = PTE_BASE;
767 #if defined(XEN) && defined(__x86_64__)
768 	KASSERT(ci->ci_normal_pdes[PTP_LEVELS - 2] == L4_BASE);
769 	ci->ci_normal_pdes[PTP_LEVELS - 2] = pmap->pm_pdir;
770 	*pdeppp = ci->ci_normal_pdes;
771 #else /* XEN && __x86_64__ */
772 	*pdeppp = normal_pdes;
773 #endif /* XEN && __x86_64__ */
774 }
775 
776 /*
777  * pmap_unmap_ptes: unlock the PTE mapping of "pmap"
778  */
779 
780 void
781 pmap_unmap_ptes(struct pmap *pmap, struct pmap *pmap2)
782 {
783 	struct cpu_info *ci;
784 	struct pmap *mypmap;
785 
786 	KASSERT(kpreempt_disabled());
787 
788 	/* The kernel's pmap is always accessible. */
789 	if (pmap == pmap_kernel()) {
790 		return;
791 	}
792 
793 	ci = curcpu();
794 #if defined(XEN) && defined(__x86_64__)
795 	/* Reset per-cpu normal_pdes */
796 	KASSERT(ci->ci_normal_pdes[PTP_LEVELS - 2] != L4_BASE);
797 	ci->ci_normal_pdes[PTP_LEVELS - 2] = L4_BASE;
798 #endif /* XEN && __x86_64__ */
799 	/*
800 	 * We cannot tolerate context switches while mapped in.
801 	 * If it is our own pmap all we have to do is unlock.
802 	 */
803 	KASSERT(pmap->pm_ncsw == curlwp->l_ncsw);
804 	mypmap = vm_map_pmap(&curproc->p_vmspace->vm_map);
805 	if (pmap == mypmap) {
806 		mutex_exit(pmap->pm_lock);
807 		return;
808 	}
809 
810 	/*
811 	 * Mark whatever's on the CPU now as lazy and unlock.
812 	 * If the pmap was already installed, we are done.
813 	 */
814 	ci->ci_tlbstate = TLBSTATE_LAZY;
815 	ci->ci_want_pmapload = (mypmap != pmap_kernel());
816 	mutex_exit(pmap->pm_lock);
817 	if (pmap == pmap2) {
818 		return;
819 	}
820 
821 	/*
822 	 * We installed another pmap on the CPU.  Grab a reference to
823 	 * it and leave in place.  Toss the evicted pmap (can block).
824 	 */
825 	pmap_reference(pmap);
826 	pmap_destroy(pmap2);
827 }
828 
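/*
 * The usual calling pattern for the pair above is (a sketch; error
 * handling elided):
 *
 *	struct pmap *pmap2;
 *	pt_entry_t *ptes;
 *	pd_entry_t * const *pdes;
 *
 *	kpreempt_disable();
 *	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
 *	pte = ptes[pl1_i(va)];		<- examine/modify the PTE for va
 *	pmap_unmap_ptes(pmap, pmap2);
 *	kpreempt_enable();
 */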
829 
830 inline static void
831 pmap_exec_account(struct pmap *pm, vaddr_t va, pt_entry_t opte, pt_entry_t npte)
832 {
833 
834 #if !defined(__x86_64__)
835 	if (curproc == NULL || curproc->p_vmspace == NULL ||
836 	    pm != vm_map_pmap(&curproc->p_vmspace->vm_map))
837 		return;
838 
839 	if ((opte ^ npte) & PG_X)
840 		pmap_update_pg(va);
841 
842 	/*
843 	 * Executability was removed on the last executable change.
844 	 * Reset the code segment to something conservative and
845 	 * let the trap handler deal with setting the right limit.
846 	 * We can't do that because of locking constraints on the vm map.
847 	 */
848 
849 	if ((opte & PG_X) && (npte & PG_X) == 0 && va == pm->pm_hiexec) {
850 		struct trapframe *tf = curlwp->l_md.md_regs;
851 
852 		tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL);
853 		pm->pm_hiexec = I386_MAX_EXE_ADDR;
854 	}
855 #endif /* !defined(__x86_64__) */
856 }
857 
858 #if !defined(__x86_64__)
859 /*
860  * Fixup the code segment to cover all potential executable mappings.
861  * returns 0 if no changes to the code segment were made.
862  */
863 
864 int
865 pmap_exec_fixup(struct vm_map *map, struct trapframe *tf, struct pcb *pcb)
866 {
867 	struct vm_map_entry *ent;
868 	struct pmap *pm = vm_map_pmap(map);
869 	vaddr_t va = 0;
870 
871 	vm_map_lock_read(map);
872 	for (ent = (&map->header)->next; ent != &map->header; ent = ent->next) {
873 
874 		/*
875 		 * This entry has greater va than the entries before.
876 		 * We need to make it point to the last page, not past it.
877 		 */
878 
879 		if (ent->protection & VM_PROT_EXECUTE)
880 			va = trunc_page(ent->end) - PAGE_SIZE;
881 	}
882 	vm_map_unlock_read(map);
883 	if (va == pm->pm_hiexec && tf->tf_cs == GSEL(GUCODEBIG_SEL, SEL_UPL))
884 		return (0);
885 
886 	pm->pm_hiexec = va;
887 	if (pm->pm_hiexec > I386_MAX_EXE_ADDR) {
888 		tf->tf_cs = GSEL(GUCODEBIG_SEL, SEL_UPL);
889 	} else {
890 		tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL);
891 		return (0);
892 	}
893 	return (1);
894 }
895 #endif /* !defined(__x86_64__) */
896 
897 void
898 pat_init(struct cpu_info *ci)
899 {
900 	uint64_t pat;
901 
902 	if (!(ci->ci_feat_val[0] & CPUID_PAT))
903 		return;
904 
905 	/* We change WT to WC. Leave all other entries the default values. */
906 	pat = PATENTRY(0, PAT_WB) | PATENTRY(1, PAT_WC) |
907 	      PATENTRY(2, PAT_UCMINUS) | PATENTRY(3, PAT_UC) |
908 	      PATENTRY(4, PAT_WB) | PATENTRY(5, PAT_WC) |
909 	      PATENTRY(6, PAT_UCMINUS) | PATENTRY(7, PAT_UC);
910 
911 	wrmsr(MSR_CR_PAT, pat);
912 	cpu_pat_enabled = true;
913 	aprint_debug_dev(ci->ci_dev, "PAT enabled\n");
914 }
915 
916 static pt_entry_t
917 pmap_pat_flags(u_int flags)
918 {
919 	u_int cacheflags = (flags & PMAP_CACHE_MASK);
920 
921 	if (!cpu_pat_enabled) {
922 		switch (cacheflags) {
923 		case PMAP_NOCACHE:
924 		case PMAP_NOCACHE_OVR:
925 			/* results in PGC_UCMINUS on cpus which have
926 			 * the cpuid PAT but PAT "disabled"
927 			 */
928 			return PG_N;
929 		default:
930 			return 0;
931 		}
932 	}
933 
934 	switch (cacheflags) {
935 	case PMAP_NOCACHE:
936 		return PGC_UC;
937 	case PMAP_WRITE_COMBINE:
938 		return PGC_WC;
939 	case PMAP_WRITE_BACK:
940 		return PGC_WB;
941 	case PMAP_NOCACHE_OVR:
942 		return PGC_UCMINUS;
943 	}
944 
945 	return 0;
946 }
947 
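/*
 * A caller that wants, say, a write-combined kernel mapping of a
 * framebuffer passes the cache flag to pmap_kenter_pa() (a sketch;
 * "va" and "fb_pa" are placeholders):
 *
 *	pmap_kenter_pa(va, fb_pa, VM_PROT_READ | VM_PROT_WRITE,
 *	    PMAP_WRITE_COMBINE);
 *	pmap_update(pmap_kernel());
 *
 * If pat_init() could not enable the PAT, pmap_pat_flags() above quietly
 * falls back to the default (write-back) memory type for this request.
 */
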
948 /*
949  * p m a p   k e n t e r   f u n c t i o n s
950  *
951  * functions to quickly enter/remove pages from the kernel address
952  * space.   pmap_kremove is exported to MI kernel.  we make use of
953  * the recursive PTE mappings.
954  */
955 
956 /*
957  * pmap_kenter_pa: enter a kernel mapping without R/M (pv_entry) tracking
958  *
959  * => no need to lock anything, assume va is already allocated
960  * => should be faster than normal pmap enter function
961  */
962 
963 void
964 pmap_kenter_pa(vaddr_t va, paddr_t pa, vm_prot_t prot, u_int flags)
965 {
966 	pt_entry_t *pte, opte, npte;
967 
968 	KASSERT(!(prot & ~VM_PROT_ALL));
969 
970 	if (va < VM_MIN_KERNEL_ADDRESS)
971 		pte = vtopte(va);
972 	else
973 		pte = kvtopte(va);
974 #ifdef DOM0OPS
975 	if (pa < pmap_pa_start || pa >= pmap_pa_end) {
976 #ifdef DEBUG
977 		printf_nolog("%s: pa 0x%" PRIx64 " for va 0x%" PRIx64
978 		    " outside range\n", __func__, (int64_t)pa, (int64_t)va);
979 #endif /* DEBUG */
980 		npte = pa;
981 	} else
982 #endif /* DOM0OPS */
983 		npte = pmap_pa2pte(pa);
984 	npte |= protection_codes[prot] | PG_k | PG_V | pmap_pg_g;
985 	npte |= pmap_pat_flags(flags);
986 	opte = pmap_pte_testset(pte, npte); /* zap! */
987 #if defined(DIAGNOSTIC)
988 	/*
989 	 * XXX: make sure we are not dealing with a large page, since the only
990 	 * large pages created are for the kernel image, and they should never
991 	 * be kentered.
992 	 */
993 	if (opte & PG_PS)
994 		panic("%s: PG_PS", __func__);
995 #endif
996 	if ((opte & (PG_V | PG_U)) == (PG_V | PG_U)) {
997 		/* This should not happen. */
998 		printf_nolog("%s: mapping already present\n", __func__);
999 		kpreempt_disable();
1000 		pmap_tlb_shootdown(pmap_kernel(), va, opte, TLBSHOOT_KENTER);
1001 		kpreempt_enable();
1002 	}
1003 }
1004 
1005 void
1006 pmap_emap_enter(vaddr_t va, paddr_t pa, vm_prot_t prot)
1007 {
1008 	pt_entry_t *pte, npte;
1009 
1010 	KASSERT((prot & ~VM_PROT_ALL) == 0);
1011 	pte = (va < VM_MIN_KERNEL_ADDRESS) ? vtopte(va) : kvtopte(va);
1012 
1013 #ifdef DOM0OPS
1014 	if (pa < pmap_pa_start || pa >= pmap_pa_end) {
1015 		npte = pa;
1016 	} else
1017 #endif
1018 		npte = pmap_pa2pte(pa);
1021 	npte |= protection_codes[prot] | PG_k | PG_V;
1022 	pmap_pte_set(pte, npte);
1023 }
1024 
1025 /*
1026  * pmap_emap_sync: perform TLB flush or pmap load, if it was deferred.
1027  */
1028 void
1029 pmap_emap_sync(bool canload)
1030 {
1031 	struct cpu_info *ci = curcpu();
1032 	struct pmap *pmap;
1033 
1034 	KASSERT(kpreempt_disabled());
1035 	if (__predict_true(ci->ci_want_pmapload && canload)) {
1036 		/*
1037 		 * XXX: Hint for pmap_reactivate(), which might suggest to
1038 		 * not perform TLB flush, if state has not changed.
1039 		 */
1040 		pmap = vm_map_pmap(&curlwp->l_proc->p_vmspace->vm_map);
1041 		if (__predict_false(pmap == ci->ci_pmap)) {
1042 			kcpuset_atomic_clear(pmap->pm_cpus, cpu_index(ci));
1043 		}
1044 		pmap_load();
1045 		KASSERT(ci->ci_want_pmapload == 0);
1046 	} else {
1047 		tlbflush();
1048 	}
1049 }
1050 
1051 void
1052 pmap_emap_remove(vaddr_t sva, vsize_t len)
1053 {
1054 	pt_entry_t *pte;
1055 	vaddr_t va, eva = sva + len;
1056 
1057 	for (va = sva; va < eva; va += PAGE_SIZE) {
1058 		pte = (va < VM_MIN_KERNEL_ADDRESS) ? vtopte(va) : kvtopte(va);
1059 		pmap_pte_set(pte, 0);
1060 	}
1061 }
1062 
1063 __strict_weak_alias(pmap_kenter_ma, pmap_kenter_pa);
1064 
1065 #if defined(__x86_64__)
1066 /*
1067  * Change protection for a virtual address. Local for a CPU only, don't
1068  * care about TLB shootdowns.
1069  *
1070  * => must be called with preemption disabled
1071  */
1072 void
1073 pmap_changeprot_local(vaddr_t va, vm_prot_t prot)
1074 {
1075 	pt_entry_t *pte, opte, npte;
1076 
1077 	KASSERT(kpreempt_disabled());
1078 
1079 	if (va < VM_MIN_KERNEL_ADDRESS)
1080 		pte = vtopte(va);
1081 	else
1082 		pte = kvtopte(va);
1083 
1084 	npte = opte = *pte;
1085 
1086 	if ((prot & VM_PROT_WRITE) != 0)
1087 		npte |= PG_RW;
1088 	else
1089 		npte &= ~PG_RW;
1090 
1091 	if (opte != npte) {
1092 		pmap_pte_set(pte, npte);
1093 		pmap_pte_flush();
1094 		invlpg(va);
1095 	}
1096 }
1097 #endif /* defined(__x86_64__) */
1098 
1099 /*
1100  * pmap_kremove: remove a kernel mapping(s) without R/M (pv_entry) tracking
1101  *
1102  * => no need to lock anything
1103  * => caller must dispose of any vm_page mapped in the va range
1104  * => note: not an inline function
1105  * => we assume the va is page aligned and the len is a multiple of PAGE_SIZE
1106  * => we assume kernel only unmaps valid addresses and thus don't bother
1107  *    checking the valid bit before doing TLB flushing
1108  * => must be followed by call to pmap_update() before reuse of page
1109  */
1110 
1111 static inline void
1112 pmap_kremove1(vaddr_t sva, vsize_t len, bool localonly)
1113 {
1114 	pt_entry_t *pte, opte;
1115 	vaddr_t va, eva;
1116 
1117 	eva = sva + len;
1118 
1119 	kpreempt_disable();
1120 	for (va = sva; va < eva; va += PAGE_SIZE) {
1121 		pte = kvtopte(va);
1122 		opte = pmap_pte_testset(pte, 0); /* zap! */
1123 		if ((opte & (PG_V | PG_U)) == (PG_V | PG_U) && !localonly) {
1124 			pmap_tlb_shootdown(pmap_kernel(), va, opte,
1125 			    TLBSHOOT_KREMOVE);
1126 		}
1127 		KASSERT((opte & PG_PS) == 0);
1128 		KASSERT((opte & PG_PVLIST) == 0);
1129 	}
1130 	if (localonly) {
1131 		tlbflushg();
1132 	}
1133 	kpreempt_enable();
1134 }
1135 
1136 void
1137 pmap_kremove(vaddr_t sva, vsize_t len)
1138 {
1139 
1140 	pmap_kremove1(sva, len, false);
1141 }
1142 
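/*
 * Together with pmap_kenter_pa(), the usual pattern for a temporary
 * kernel mapping of a physical page "pa" looks like this (a sketch;
 * error checks omitted):
 *
 *	va = uvm_km_alloc(kernel_map, PAGE_SIZE, 0, UVM_KMF_VAONLY);
 *	pmap_kenter_pa(va, pa, VM_PROT_READ | VM_PROT_WRITE, 0);
 *	pmap_update(pmap_kernel());
 *	... use the mapping ...
 *	pmap_kremove(va, PAGE_SIZE);
 *	pmap_update(pmap_kernel());
 *	uvm_km_free(kernel_map, va, PAGE_SIZE, UVM_KMF_VAONLY);
 */
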
1143 /*
1144  * pmap_kremove_local: like pmap_kremove(), but only worry about
1145  * TLB invalidations on the current CPU.  this is only intended
1146  * for use while writing kernel crash dumps.
1147  */
1148 
1149 void
1150 pmap_kremove_local(vaddr_t sva, vsize_t len)
1151 {
1152 
1153 	KASSERT(panicstr != NULL);
1154 	pmap_kremove1(sva, len, true);
1155 }
1156 
1157 /*
1158  * p m a p   i n i t   f u n c t i o n s
1159  *
1160  * pmap_bootstrap and pmap_init are called during system startup
1161  * to init the pmap module.   pmap_bootstrap() does a low level
1162  * init just to get things rolling.   pmap_init() finishes the job.
1163  */
1164 
1165 /*
1166  * pmap_bootstrap_valloc: allocate a virtual address in the bootstrap area.
1167  * This function is to be used before any VM system has been set up.
1168  *
1169  * The va is taken from virtual_avail.
1170  */
1171 static vaddr_t
1172 pmap_bootstrap_valloc(size_t npages)
1173 {
1174 	vaddr_t va = virtual_avail;
1175 	virtual_avail += npages * PAGE_SIZE;
1176 	return va;
1177 }
1178 
1179 /*
1180  * pmap_bootstrap_palloc: allocate a physical address in the bootstrap area.
1181  * This function is to be used before any VM system has been set up.
1182  *
1183  * The pa is taken from avail_start.
1184  */
1185 static paddr_t
1186 pmap_bootstrap_palloc(size_t npages)
1187 {
1188 	paddr_t pa = avail_start;
1189 	avail_start += npages * PAGE_SIZE;
1190 	return pa;
1191 }
1192 
1193 /*
1194  * pmap_bootstrap: get the system in a state where it can run with VM properly
1195  * enabled (called before main()). The VM system is fully init'd later.
1196  *
1197  * => on i386, locore.S has already enabled the MMU by allocating a PDP for the
1198  *    kernel, and nkpde PTP's for the kernel.
1199  * => kva_start is the first free virtual address in kernel space.
1200  */
1201 void
1202 pmap_bootstrap(vaddr_t kva_start)
1203 {
1204 	struct pmap *kpm;
1205 	int i;
1206 	vaddr_t kva;
1207 #ifndef XEN
1208 	unsigned long p1i;
1209 	vaddr_t kva_end;
1210 #endif
1211 
1212 	pmap_pg_nx = (cpu_feature[2] & CPUID_NOX ? PG_NX : 0);
1213 
1214 	/*
1215 	 * Set up our local static global vars that keep track of the usage of
1216 	 * KVM before kernel_map is set up.
1217 	 */
1218 	virtual_avail = kva_start;		/* first free KVA */
1219 	virtual_end = VM_MAX_KERNEL_ADDRESS;	/* last KVA */
1220 
1221 	/*
1222 	 * Set up protection_codes: we need to be able to convert from a MI
1223 	 * protection code (some combo of VM_PROT...) to something we can jam
1224 	 * into a x86 PTE.
1225 	 */
1226 	protection_codes[VM_PROT_NONE] = pmap_pg_nx;
1227 	protection_codes[VM_PROT_EXECUTE] = PG_RO | PG_X;
1228 	protection_codes[VM_PROT_READ] = PG_RO | pmap_pg_nx;
1229 	protection_codes[VM_PROT_READ|VM_PROT_EXECUTE] = PG_RO | PG_X;
1230 	protection_codes[VM_PROT_WRITE] = PG_RW | pmap_pg_nx;
1231 	protection_codes[VM_PROT_WRITE|VM_PROT_EXECUTE] = PG_RW | PG_X;
1232 	protection_codes[VM_PROT_WRITE|VM_PROT_READ] = PG_RW | pmap_pg_nx;
1233 	protection_codes[VM_PROT_ALL] = PG_RW | PG_X;
1234 
1235 	/*
1236 	 * Now we init the kernel's pmap.
1237 	 *
1238 	 * The kernel pmap's pm_obj is not used for much. However, in user pmaps
1239 	 * the pm_obj contains the list of active PTPs.
1240 	 *
1241 	 * The pm_obj currently does not have a pager. It might be possible to
1242 	 * add a pager that would allow a process to read-only mmap its own page
1243 	 * tables (fast user-level vtophys?). This may or may not be useful.
1244 	 */
1245 	kpm = pmap_kernel();
1246 	for (i = 0; i < PTP_LEVELS - 1; i++) {
1247 		mutex_init(&kpm->pm_obj_lock[i], MUTEX_DEFAULT, IPL_NONE);
1248 		uvm_obj_init(&kpm->pm_obj[i], NULL, false, 1);
1249 		uvm_obj_setlock(&kpm->pm_obj[i], &kpm->pm_obj_lock[i]);
1250 		kpm->pm_ptphint[i] = NULL;
1251 	}
1252 	memset(&kpm->pm_list, 0, sizeof(kpm->pm_list));  /* pm_list not used */
1253 
1254 	kpm->pm_pdir = (pd_entry_t *)(PDPpaddr + KERNBASE);
1255 	for (i = 0; i < PDP_SIZE; i++)
1256 		kpm->pm_pdirpa[i] = PDPpaddr + PAGE_SIZE * i;
1257 
1258 	kpm->pm_stats.wired_count = kpm->pm_stats.resident_count =
1259 		x86_btop(kva_start - VM_MIN_KERNEL_ADDRESS);
1260 
1261 	kcpuset_create(&kpm->pm_cpus, true);
1262 	kcpuset_create(&kpm->pm_kernel_cpus, true);
1263 
1264 	/*
1265 	 * the above is just a rough estimate and not critical to the proper
1266 	 * operation of the system.
1267 	 */
1268 
1269 #ifndef XEN
1270 	/*
1271 	 * Begin to enable global TLB entries if they are supported.
1272 	 * The G bit has no effect until the CR4_PGE bit is set in CR4,
1273 	 * which happens in cpu_init(), which is run on each cpu
1274 	 * (and happens later)
1275 	 */
1276 	if (cpu_feature[0] & CPUID_PGE) {
1277 		pmap_pg_g = PG_G;		/* enable software */
1278 
1279 		/* add PG_G attribute to already mapped kernel pages */
1280 
1281 		if (KERNBASE == VM_MIN_KERNEL_ADDRESS) {
1282 			/* i386 only */
1283 			kva_end = virtual_avail;
1284 		} else {
1285 			/* amd64 only */
1286 			extern vaddr_t kern_end;
1287 			kva_end = kern_end;
1288 		}
1289 
1290 		for (kva = KERNBASE; kva < kva_end; kva += PAGE_SIZE) {
1291 			p1i = pl1_i(kva);
1292 			if (pmap_valid_entry(PTE_BASE[p1i]))
1293 				PTE_BASE[p1i] |= PG_G;
1294 		}
1295 	}
1296 
1297 	/*
1298 	 * Enable large pages if they are supported.
1299 	 */
1300 	if (cpu_feature[0] & CPUID_PSE) {
1301 		lcr4(rcr4() | CR4_PSE);	/* enable hardware (via %cr4) */
1302 		pmap_largepages = 1;	/* enable software */
1303 
1304 		/*
1305 		 * The TLB must be flushed after enabling large pages on Pentium
1306 		 * CPUs, according to section 3.6.2.2 of "Intel Architecture
1307 		 * Software Developer's Manual, Volume 3: System Programming".
1308 		 */
1309 		tlbflushg();
1310 
1311 		/* Remap the kernel. */
1312 		pmap_remap_largepages();
1313 	}
1314 	pmap_init_lapic();
1315 #endif /* !XEN */
1316 
1317 #ifdef __HAVE_DIRECT_MAP
1318 	pmap_init_directmap(kpm);
1319 #else
1320 	pmap_vpage_cpualloc(&cpu_info_primary);
1321 
1322 	if (VM_MIN_KERNEL_ADDRESS == KERNBASE) { /* i386 */
1323 		early_zerop = (void *)cpu_info_primary.vpage[VPAGE_ZER];
1324 		early_zero_pte = cpu_info_primary.vpage_pte[VPAGE_ZER];
1325 	} else { /* amd64 */
1326 		/*
1327 		 * zero_pte is stuck at the end of mapped space for the kernel
1328 		 * image (disjunct from kva space). This is done so that it
1329 		 * can safely be used in pmap_growkernel (pmap_get_physpage),
1330 		 * when it's called for the first time.
1331 		 * XXXfvdl fix this for MULTIPROCESSOR later.
1332 		 */
1333 #ifdef XEN
1334 		/* early_zerop initialized in xen_locore() */
1335 #else
1336 		early_zerop = (void *)(KERNBASE + NKL2_KIMG_ENTRIES * NBPD_L2);
1337 #endif
1338 		early_zero_pte = PTE_BASE + pl1_i((vaddr_t)early_zerop);
1339 	}
1340 #endif
1341 
1342 #if defined(XEN) && defined(__x86_64__)
1343 	extern vaddr_t xen_dummy_page;
1344 	paddr_t xen_dummy_user_pgd;
1345 
1346 	/*
1347 	 * We want a dummy page directory for Xen: when deactivating a pmap,
1348 	 * Xen will still consider it active. So we set user PGD to this one
1349 	 * to lift all protection on the now inactive page tables set.
1350 	 */
1351 	xen_dummy_user_pgd = xen_dummy_page - KERNBASE;
1352 
1353 	/* Zero-fill it; the fewer checks Xen has to do, the better */
1354 	memset((void *)(xen_dummy_user_pgd + KERNBASE), 0, PAGE_SIZE);
1355 	/* Mark read-only */
1356 	HYPERVISOR_update_va_mapping(xen_dummy_user_pgd + KERNBASE,
1357 	    pmap_pa2pte(xen_dummy_user_pgd) | PG_u | PG_V | pmap_pg_nx,
1358 	    UVMF_INVLPG);
1359 	/* Pin as L4 */
1360 	xpq_queue_pin_l4_table(xpmap_ptom_masked(xen_dummy_user_pgd));
1361 #endif
1362 
1363 	/*
1364 	 * Allocate space for the IDT, GDT and LDT.
1365 	 */
1366 	idt_vaddr = pmap_bootstrap_valloc(1);
1367 	idt_paddr = pmap_bootstrap_palloc(1);
1368 
1369 	gdt_vaddr = pmap_bootstrap_valloc(1);
1370 	gdt_paddr = pmap_bootstrap_palloc(1);
1371 
1372 	ldt_vaddr = pmap_bootstrap_valloc(1);
1373 	ldt_paddr = pmap_bootstrap_palloc(1);
1374 
1375 #if !defined(__x86_64__) && !defined(XEN)
1376 	/* pentium f00f bug stuff */
1377 	pentium_idt_vaddr = pmap_bootstrap_valloc(1);
1378 #endif
1379 
1380 	/*
1381 	 * Now we reserve some VM for mapping pages when doing a crash dump.
1382 	 */
1383 	virtual_avail = reserve_dumppages(virtual_avail);
1384 
1385 	/*
1386 	 * Init the static-global locks and global lists.
1387 	 *
1388 	 * => pventry::pvh_lock (initialized elsewhere) must also be
1389 	 *      a spin lock at IPL_VM to prevent deadlock, and is never
1390 	 *	taken from interrupt context.
1391 	 */
1392 	mutex_init(&pmaps_lock, MUTEX_DEFAULT, IPL_NONE);
1393 	LIST_INIT(&pmaps);
1394 
1395 	/*
1396 	 * Ensure the TLB is sync'd with reality by flushing it...
1397 	 */
1398 	tlbflushg();
1399 
1400 	/*
1401 	 * Calculate pmap_maxkvaddr from nkptp[].
1402 	 */
1403 	kva = VM_MIN_KERNEL_ADDRESS;
1404 	for (i = PTP_LEVELS - 1; i >= 1; i--) {
1405 		kva += nkptp[i] * nbpd[i];
1406 	}
1407 	pmap_maxkvaddr = kva;
1408 }
1409 
1410 #ifndef XEN
1411 static void
1412 pmap_init_lapic(void)
1413 {
1414 	/*
1415 	 * On CPUs that have no LAPIC, local_apic_va is never kentered. But our
1416 	 * x86 implementation relies a lot on this address to be valid; so just
1417 	 * allocate a fake physical page that will be kentered into
1418 	 * local_apic_va by machdep.
1419 	 *
1420 	 * If the LAPIC is present, the va will be remapped somewhere else
1421 	 * later in lapic_map.
1422 	 */
1423 	local_apic_va = pmap_bootstrap_valloc(1);
1424 	local_apic_pa = pmap_bootstrap_palloc(1);
1425 }
1426 #endif
1427 
1428 #ifdef __HAVE_DIRECT_MAP
1429 /*
1430  * Create the amd64 direct map. Called only once at boot time.
1431  */
1432 static void
1433 pmap_init_directmap(struct pmap *kpm)
1434 {
1435 	extern phys_ram_seg_t mem_clusters[];
1436 	extern int mem_cluster_cnt;
1437 
1438 	paddr_t lastpa, L2page_pa, L3page_pa, pdp;
1439 	vaddr_t tmpva;
1440 	pt_entry_t *pte;
1441 	pd_entry_t *pde;
1442 	phys_ram_seg_t *mc;
1443 	size_t nL3e;
1444 	int i;
1445 
1446 	const pd_entry_t pteflags = PG_V | PG_KW | pmap_pg_nx;
1447 
1448 	/* Get the last physical address available */
1449 	lastpa = 0;
1450 	for (i = 0; i < mem_cluster_cnt; i++) {
1451 		mc = &mem_clusters[i];
1452 		lastpa = MAX(lastpa, mc->start + mc->size);
1453 	}
1454 
1455 	/*
1456 	 * We allocate only one L4 entry for the direct map (PDIR_SLOT_DIRECT),
1457 	 * so we cannot map more than 512GB.
1458 	 */
1459 	if (lastpa > NBPD_L4) {
1460 		panic("RAM limit reached: > 512GB not supported");
1461 	}
1462 
1463 	/* In locore.S, we allocated a tmp va. We will use it now. */
1464 	tmpva = (KERNBASE + NKL2_KIMG_ENTRIES * NBPD_L2);
1465 	pte = PTE_BASE + pl1_i(tmpva);
1466 
1467 	/* Allocate L3, and zero it out. */
1468 	L3page_pa = pmap_bootstrap_palloc(1);
1469 	*pte = L3page_pa | pteflags;
1470 	pmap_update_pg(tmpva);
1471 	memset((void *)tmpva, 0, PAGE_SIZE);
1472 
1473 	/* Number of L3 entries. */
1474 	nL3e = (lastpa + NBPD_L3 - 1) >> L3_SHIFT;
1475 
1476 	/*
1477 	 * Map the direct map RW. Use super pages (1GB) or large pages (2MB) if
1478 	 * they are supported. Note: PG_G is not allowed on non-leaf PTPs.
1479 	 */
1480 	if (cpu_feature[2] & CPUID_P1GB) {
1481 		/* Super pages are supported. Just create L3. */
1482 		for (i = 0; i < nL3e; i++) {
1483 			pdp = (paddr_t)&(((pd_entry_t *)L3page_pa)[i]);
1484 			*pte = (pdp & PG_FRAME) | pteflags;
1485 			pmap_update_pg(tmpva);
1486 
1487 			pde = (pd_entry_t *)(tmpva + (pdp & ~PG_FRAME));
1488 			*pde = ((paddr_t)i << L3_SHIFT) | pteflags | PG_U |
1489 			    PG_PS | PG_G;
1490 		}
1491 	} else {
1492 		/* Allocate L2. */
1493 		L2page_pa = pmap_bootstrap_palloc(nL3e);
1494 
1495 		/* Zero out the L2 pages. */
1496 		for (i = 0; i < nL3e; i++) {
1497 			pdp = L2page_pa + i * PAGE_SIZE;
1498 			*pte = (pdp & PG_FRAME) | pteflags;
1499 			pmap_update_pg(tmpva);
1500 
1501 			memset((void *)tmpva, 0, PAGE_SIZE);
1502 		}
1503 
1504 		KASSERT(pmap_largepages != 0);
1505 
1506 		/* Large pages are supported. Just create L2. */
1507 		for (i = 0; i < NPDPG * nL3e; i++) {
1508 			pdp = (paddr_t)&(((pd_entry_t *)L2page_pa)[i]);
1509 			*pte = (pdp & PG_FRAME) | pteflags;
1510 			pmap_update_pg(tmpva);
1511 
1512 			pde = (pd_entry_t *)(tmpva + (pdp & ~PG_FRAME));
1513 			*pde = ((paddr_t)i << L2_SHIFT) | pteflags |
1514 			    PG_U | PG_PS | PG_G;
1515 		}
1516 
1517 		/* Fill in the L3 entries, linked to L2. */
1518 		for (i = 0; i < nL3e; i++) {
1519 			pdp = (paddr_t)&(((pd_entry_t *)L3page_pa)[i]);
1520 			*pte = (pdp & PG_FRAME) | pteflags;
1521 			pmap_update_pg(tmpva);
1522 
1523 			pde = (pd_entry_t *)(tmpva + (pdp & ~PG_FRAME));
1524 			*pde = (L2page_pa + (i << PAGE_SHIFT)) | pteflags | PG_U;
1525 		}
1526 	}
1527 
1528 	kpm->pm_pdir[PDIR_SLOT_DIRECT] = L3page_pa | pteflags | PG_U;
1529 
1530 	*pte = 0;
1531 	pmap_update_pg(tmpva);
1532 
1533 	tlbflush();
1534 }
1535 #endif /* __HAVE_DIRECT_MAP */
1536 
1537 #ifndef XEN
1538 /*
1539  * Remap several kernel segments with large pages. We cover as many pages as we
1540  * can. Called only once at boot time, if the CPU supports large pages.
1541  */
1542 static void
1543 pmap_remap_largepages(void)
1544 {
1545 	extern char __rodata_start;
1546 	extern char __data_start;
1547 	extern char __kernel_end;
1548 	pd_entry_t *pde;
1549 	vaddr_t kva, kva_end;
1550 	paddr_t pa;
1551 
1552 	/* Remap the kernel text using large pages. */
1553 	kva = rounddown((vaddr_t)KERNTEXTOFF, NBPD_L2);
1554 	kva_end = rounddown((vaddr_t)&__rodata_start, NBPD_L1);
1555 	pa = kva - KERNBASE;
1556 	for (/* */; kva + NBPD_L2 <= kva_end; kva += NBPD_L2, pa += NBPD_L2) {
1557 		pde = &L2_BASE[pl2_i(kva)];
1558 		*pde = pa | pmap_pg_g | PG_PS | PG_KR | PG_V;
1559 		tlbflushg();
1560 	}
1561 #if defined(DEBUG)
1562 	aprint_normal("kernel text is mapped with %" PRIuPSIZE " large "
1563 	    "pages and %" PRIuPSIZE " normal pages\n",
1564 	    howmany(kva - KERNBASE, NBPD_L2),
1565 	    howmany((vaddr_t)&__rodata_start - kva, NBPD_L1));
1566 #endif /* defined(DEBUG) */
1567 
1568 	/* Remap the kernel rodata using large pages. */
1569 	kva = roundup((vaddr_t)&__rodata_start, NBPD_L2);
1570 	kva_end = rounddown((vaddr_t)&__data_start, NBPD_L1);
1571 	pa = kva - KERNBASE;
1572 	for (/* */; kva + NBPD_L2 <= kva_end; kva += NBPD_L2, pa += NBPD_L2) {
1573 		pde = &L2_BASE[pl2_i(kva)];
1574 		*pde = pa | pmap_pg_g | PG_PS | pmap_pg_nx | PG_KR | PG_V;
1575 		tlbflushg();
1576 	}
1577 
1578 	/* Remap the kernel data+bss using large pages. */
1579 	kva = roundup((vaddr_t)&__data_start, NBPD_L2);
1580 	kva_end = rounddown((vaddr_t)&__kernel_end, NBPD_L1);
1581 	pa = kva - KERNBASE;
1582 	for (/* */; kva + NBPD_L2 <= kva_end; kva += NBPD_L2, pa += NBPD_L2) {
1583 		pde = &L2_BASE[pl2_i(kva)];
1584 		*pde = pa | pmap_pg_g | PG_PS | pmap_pg_nx | PG_KW | PG_V;
1585 		tlbflushg();
1586 	}
1587 }
1588 #endif /* !XEN */
1589 
1590 /*
1591  * pmap_init: called from uvm_init, our job is to get the pmap
1592  * system ready to manage mappings...
1593  */
1594 
1595 void
1596 pmap_init(void)
1597 {
1598 	int i, flags;
1599 
1600 	for (i = 0; i < PV_HASH_SIZE; i++) {
1601 		SLIST_INIT(&pv_hash_heads[i].hh_list);
1602 	}
1603 	for (i = 0; i < PV_HASH_LOCK_CNT; i++) {
1604 		mutex_init(&pv_hash_locks[i].lock, MUTEX_NODEBUG, IPL_VM);
1605 	}
1606 
1607 	/*
1608 	 * initialize caches.
1609 	 */
1610 
1611 	pool_cache_bootstrap(&pmap_cache, sizeof(struct pmap), 0, 0, 0,
1612 	    "pmappl", NULL, IPL_NONE, NULL, NULL, NULL);
1613 
1614 #ifdef XEN
1615 	/*
1616 	 * pool_cache(9) should not touch cached objects, since they
1617 	 * are pinned on xen and R/O for the domU
1618 	 */
1619 	flags = PR_NOTOUCH;
1620 #else /* XEN */
1621 	flags = 0;
1622 #endif /* XEN */
1623 #ifdef PAE
1624 	pool_cache_bootstrap(&pmap_pdp_cache, PAGE_SIZE * PDP_SIZE, 0, 0, flags,
1625 	    "pdppl", &pmap_pdp_allocator, IPL_NONE,
1626 	    pmap_pdp_ctor, pmap_pdp_dtor, NULL);
1627 #else /* PAE */
1628 	pool_cache_bootstrap(&pmap_pdp_cache, PAGE_SIZE, 0, 0, flags,
1629 	    "pdppl", NULL, IPL_NONE, pmap_pdp_ctor, pmap_pdp_dtor, NULL);
1630 #endif /* PAE */
1631 	pool_cache_bootstrap(&pmap_pv_cache, sizeof(struct pv_entry), 0, 0,
1632 	    PR_LARGECACHE, "pvpl", &pool_allocator_kmem, IPL_NONE, NULL,
1633 	    NULL, NULL);
1634 
1635 	pmap_tlb_init();
1636 
1637 	/* XXX: needed here because cpu_hatch() runs only on secondary CPUs. */
1638 	pmap_tlb_cpu_init(curcpu());
1639 
1640 	evcnt_attach_dynamic(&pmap_iobmp_evcnt, EVCNT_TYPE_MISC,
1641 	    NULL, "x86", "io bitmap copy");
1642 	evcnt_attach_dynamic(&pmap_ldt_evcnt, EVCNT_TYPE_MISC,
1643 	    NULL, "x86", "ldt sync");
1644 
1645 	/*
1646 	 * done: pmap module is up (and ready for business)
1647 	 */
1648 
1649 	pmap_initialized = true;
1650 }
1651 
1652 /*
1653  * pmap_cpu_init_late: perform late per-CPU initialization.
1654  */
1655 
1656 #ifndef XEN
1657 void
1658 pmap_cpu_init_late(struct cpu_info *ci)
1659 {
1660 	/*
1661 	 * The BP has already its own PD page allocated during early
1662 	 * MD startup.
1663 	 */
1664 	if (ci == &cpu_info_primary)
1665 		return;
1666 
1667 #ifdef PAE
1668 	cpu_alloc_l3_page(ci);
1669 #endif
1670 }
1671 #endif
1672 
1673 #ifndef __HAVE_DIRECT_MAP
1674 CTASSERT(CACHE_LINE_SIZE > sizeof(pt_entry_t));
1675 CTASSERT(CACHE_LINE_SIZE % sizeof(pt_entry_t) == 0);
1676 
1677 static void
1678 pmap_vpage_cpualloc(struct cpu_info *ci)
1679 {
1680 	bool primary = (ci == &cpu_info_primary);
1681 	size_t i, npages;
1682 	vaddr_t vabase;
1683 	vsize_t vrange;
1684 
1685 	npages = (CACHE_LINE_SIZE / sizeof(pt_entry_t));
1686 	KASSERT(npages >= VPAGE_MAX);
1687 	vrange = npages * PAGE_SIZE;
1688 
1689 	if (primary) {
1690 		while ((vabase = pmap_bootstrap_valloc(1)) % vrange != 0) {
1691 			/* Waste some pages to align properly */
1692 		}
1693 		/* The base is aligned, allocate the rest (contiguous) */
1694 		pmap_bootstrap_valloc(npages - 1);
1695 	} else {
1696 		vabase = uvm_km_alloc(kernel_map, vrange, vrange,
1697 		    UVM_KMF_VAONLY);
1698 		if (vabase == 0) {
1699 			panic("%s: failed to allocate tmp VA for CPU %d\n",
1700 			    __func__, cpu_index(ci));
1701 		}
1702 	}
1703 
1704 	KASSERT((vaddr_t)&PTE_BASE[pl1_i(vabase)] % CACHE_LINE_SIZE == 0);
1705 
1706 	for (i = 0; i < VPAGE_MAX; i++) {
1707 		ci->vpage[i] = vabase + i * PAGE_SIZE;
1708 		ci->vpage_pte[i] = PTE_BASE + pl1_i(ci->vpage[i]);
1709 	}
1710 }
1711 
1712 void
1713 pmap_vpage_cpu_init(struct cpu_info *ci)
1714 {
1715 	if (ci == &cpu_info_primary) {
1716 		/* cpu0 already taken care of in pmap_bootstrap */
1717 		return;
1718 	}
1719 
1720 	pmap_vpage_cpualloc(ci);
1721 }
1722 #endif
1723 
1724 /*
1725  * p v _ e n t r y   f u n c t i o n s
1726  */
1727 
1728 /*
1729  * pmap_free_pvs: free a list of pv_entrys
1730  */
1731 
1732 static void
1733 pmap_free_pvs(struct pv_entry *pve)
1734 {
1735 	struct pv_entry *next;
1736 
1737 	for ( /* null */ ; pve != NULL ; pve = next) {
1738 		next = pve->pve_next;
1739 		pool_cache_put(&pmap_pv_cache, pve);
1740 	}
1741 }
1742 
1743 /*
1744  * main pv_entry manipulation functions:
1745  *   pmap_enter_pv: enter a mapping onto a pv_head list
1746  *   pmap_remove_pv: remove a mapping from a pv_head list
1747  *
1748  * NOTE: Both pmap_enter_pv and pmap_remove_pv expect the caller to lock
1749  *       the pvh before calling
1750  */
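
/*
 * A minimal caller-side sketch of the protocol described above, for
 * illustration only (loosely modeled on the way pmap_enter() drives
 * these functions; the local names are hypothetical and error handling
 * is omitted):
 *
 *	struct pv_entry *new_pve, *new_sparepve, *unused_pve;
 *
 *	new_pve = pool_cache_get(&pmap_pv_cache, PR_NOWAIT);
 *	new_sparepve = pool_cache_get(&pmap_pv_cache, PR_NOWAIT);
 *	... with the page (pp) locked ...
 *	unused_pve = pmap_enter_pv(pp, new_pve, &new_sparepve, ptp, va);
 *	... unlock ...
 *	if (unused_pve != NULL)
 *		pool_cache_put(&pmap_pv_cache, unused_pve);
 *	if (new_sparepve != NULL)
 *		pool_cache_put(&pmap_pv_cache, new_sparepve);
 */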
1751 
1752 /*
1753  * insert_pv: a helper of pmap_enter_pv
1754  */
1755 
1756 static void
1757 insert_pv(struct pmap_page *pp, struct pv_entry *pve)
1758 {
1759 	struct pv_hash_head *hh;
1760 	kmutex_t *lock;
1761 	u_int hash;
1762 
1763 	hash = pvhash_hash(pve->pve_pte.pte_ptp, pve->pve_pte.pte_va);
1764 	lock = pvhash_lock(hash);
1765 	hh = pvhash_head(hash);
1766 	mutex_spin_enter(lock);
1767 	SLIST_INSERT_HEAD(&hh->hh_list, pve, pve_hash);
1768 	mutex_spin_exit(lock);
1769 
1770 	LIST_INSERT_HEAD(&pp->pp_head.pvh_list, pve, pve_list);
1771 }
1772 
1773 /*
1774  * pmap_enter_pv: enter a mapping onto a pv_head list
1775  *
1776  * => caller should adjust ptp's wire_count before calling
1777  * => caller has preallocated pve and *sparepve for us
1778  */
1779 
1780 static struct pv_entry *
1781 pmap_enter_pv(struct pmap_page *pp, struct pv_entry *pve,
1782     struct pv_entry **sparepve, struct vm_page *ptp, vaddr_t va)
1783 {
1784 
1785 	KASSERT(ptp == NULL || ptp->wire_count >= 2);
1786 	KASSERT(ptp == NULL || ptp->uobject != NULL);
1787 	KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset);
1788 
1789 	if ((pp->pp_flags & PP_EMBEDDED) == 0) {
1790 		if (LIST_EMPTY(&pp->pp_head.pvh_list)) {
1791 			pp->pp_flags |= PP_EMBEDDED;
1792 			pp->pp_pte.pte_ptp = ptp;
1793 			pp->pp_pte.pte_va = va;
1794 
1795 			return pve;
1796 		}
1797 	} else {
1798 		struct pv_entry *pve2;
1799 
1800 		pve2 = *sparepve;
1801 		*sparepve = NULL;
1802 
1803 		pve2->pve_pte = pp->pp_pte;
1804 		pp->pp_flags &= ~PP_EMBEDDED;
1805 		LIST_INIT(&pp->pp_head.pvh_list);
1806 		insert_pv(pp, pve2);
1807 	}
1808 
1809 	pve->pve_pte.pte_ptp = ptp;
1810 	pve->pve_pte.pte_va = va;
1811 	insert_pv(pp, pve);
1812 
1813 	return NULL;
1814 }
1815 
1816 /*
1817  * pmap_remove_pv: try to remove a mapping from a pv_list
1818  *
1819  * => caller should adjust ptp's wire_count and free PTP if needed
1820  * => we return the removed pve
1821  */
1822 
1823 static struct pv_entry *
1824 pmap_remove_pv(struct pmap_page *pp, struct vm_page *ptp, vaddr_t va)
1825 {
1826 	struct pv_hash_head *hh;
1827 	struct pv_entry *pve;
1828 	kmutex_t *lock;
1829 	u_int hash;
1830 
1831 	KASSERT(ptp == NULL || ptp->uobject != NULL);
1832 	KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset);
1833 
1834 	if ((pp->pp_flags & PP_EMBEDDED) != 0) {
1835 		KASSERT(pp->pp_pte.pte_ptp == ptp);
1836 		KASSERT(pp->pp_pte.pte_va == va);
1837 
1838 		pp->pp_flags &= ~PP_EMBEDDED;
1839 		LIST_INIT(&pp->pp_head.pvh_list);
1840 
1841 		return NULL;
1842 	}
1843 
1844 	hash = pvhash_hash(ptp, va);
1845 	lock = pvhash_lock(hash);
1846 	hh = pvhash_head(hash);
1847 	mutex_spin_enter(lock);
1848 	pve = pvhash_remove(hh, ptp, va);
1849 	mutex_spin_exit(lock);
1850 
1851 	LIST_REMOVE(pve, pve_list);
1852 
1853 	return pve;
1854 }
1855 
1856 /*
1857  * p t p   f u n c t i o n s
1858  */
1859 
1860 static inline struct vm_page *
1861 pmap_find_ptp(struct pmap *pmap, vaddr_t va, paddr_t pa, int level)
1862 {
1863 	int lidx = level - 1;
1864 	struct vm_page *pg;
1865 
1866 	KASSERT(mutex_owned(pmap->pm_lock));
1867 
1868 	if (pa != (paddr_t)-1 && pmap->pm_ptphint[lidx] &&
1869 	    pa == VM_PAGE_TO_PHYS(pmap->pm_ptphint[lidx])) {
1870 		return (pmap->pm_ptphint[lidx]);
1871 	}
1872 	PMAP_SUBOBJ_LOCK(pmap, lidx);
1873 	pg = uvm_pagelookup(&pmap->pm_obj[lidx], ptp_va2o(va, level));
1874 	PMAP_SUBOBJ_UNLOCK(pmap, lidx);
1875 
1876 	KASSERT(pg == NULL || pg->wire_count >= 1);
1877 	return pg;
1878 }
1879 
1880 static inline void
1881 pmap_freepage(struct pmap *pmap, struct vm_page *ptp, int level)
1882 {
1883 	lwp_t *l;
1884 	int lidx;
1885 	struct uvm_object *obj;
1886 
1887 	KASSERT(ptp->wire_count == 1);
1888 
1889 	lidx = level - 1;
1890 
1891 	obj = &pmap->pm_obj[lidx];
1892 	pmap_stats_update(pmap, -1, 0);
1893 	if (lidx != 0)
1894 		mutex_enter(obj->vmobjlock);
1895 	if (pmap->pm_ptphint[lidx] == ptp)
1896 		pmap->pm_ptphint[lidx] = TAILQ_FIRST(&obj->memq);
1897 	ptp->wire_count = 0;
1898 	uvm_pagerealloc(ptp, NULL, 0);
1899 	l = curlwp;
1900 	KASSERT((l->l_pflag & LP_INTR) == 0);
1901 	VM_PAGE_TO_PP(ptp)->pp_link = l->l_md.md_gc_ptp;
1902 	l->l_md.md_gc_ptp = ptp;
1903 	if (lidx != 0)
1904 		mutex_exit(obj->vmobjlock);
1905 }
1906 
1907 static void
1908 pmap_free_ptp(struct pmap *pmap, struct vm_page *ptp, vaddr_t va,
1909 	      pt_entry_t *ptes, pd_entry_t * const *pdes)
1910 {
1911 	unsigned long index;
1912 	int level;
1913 	vaddr_t invaladdr;
1914 	pd_entry_t opde;
1915 
1916 	KASSERT(pmap != pmap_kernel());
1917 	KASSERT(mutex_owned(pmap->pm_lock));
1918 	KASSERT(kpreempt_disabled());
1919 
1920 	level = 1;
1921 	do {
1922 		index = pl_i(va, level + 1);
1923 		opde = pmap_pte_testset(&pdes[level - 1][index], 0);
1924 #if defined(XEN)
1925 #  if defined(__x86_64__)
1926 		/*
1927 		 * If ptp is a L3 currently mapped in kernel space,
1928 		 * on any cpu, clear it before freeing
1929 		 */
1930 		if (level == PTP_LEVELS - 1) {
1931 			/*
1932 			 * Update the per-cpu PD on all cpus the current
1933 			 * pmap is active on
1934 			 */
1935 			xen_kpm_sync(pmap, index);
1936 		}
1937 #  endif /*__x86_64__ */
1938 		invaladdr = level == 1 ? (vaddr_t)ptes :
1939 		    (vaddr_t)pdes[level - 2];
1940 		pmap_tlb_shootdown(pmap, invaladdr + index * PAGE_SIZE,
1941 		    opde, TLBSHOOT_FREE_PTP1);
1942 		pmap_tlb_shootnow();
1943 #else	/* XEN */
1944 		invaladdr = level == 1 ? (vaddr_t)ptes :
1945 		    (vaddr_t)pdes[level - 2];
1946 		pmap_tlb_shootdown(pmap, invaladdr + index * PAGE_SIZE,
1947 		    opde, TLBSHOOT_FREE_PTP1);
1948 #endif	/* XEN */
1949 		pmap_freepage(pmap, ptp, level);
1950 		if (level < PTP_LEVELS - 1) {
1951 			ptp = pmap_find_ptp(pmap, va, (paddr_t)-1, level + 1);
1952 			ptp->wire_count--;
1953 			if (ptp->wire_count > 1)
1954 				break;
1955 		}
1956 	} while (++level < PTP_LEVELS);
1957 	pmap_pte_flush();
1958 }
1959 
1960 /*
1961  * pmap_get_ptp: get a PTP (if there isn't one, allocate a new one)
1962  *
1963  * => pmap should NOT be pmap_kernel()
1964  * => pmap should be locked
1965  * => preemption should be disabled
1966  */
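
/*
 * How the preconditions above are typically met, as a rough sketch only
 * (compare the pmap_map_ptes()/pmap_unmap_ptes() pairing used elsewhere
 * in this file, e.g. in pmap_extract() and pmap_remove()):
 *
 *	struct pmap *pmap2;
 *	pt_entry_t *ptes;
 *	pd_entry_t * const *pdes;
 *	struct vm_page *ptp;
 *
 *	kpreempt_disable();
 *	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);	(locks pmap)
 *	ptp = pmap_get_ptp(pmap, va, pdes);
 *	if (ptp == NULL)
 *		... out of memory, unwind and retry ...
 *	...
 *	pmap_unmap_ptes(pmap, pmap2);			(unlocks pmap)
 *	kpreempt_enable();
 */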
1967 
1968 static struct vm_page *
1969 pmap_get_ptp(struct pmap *pmap, vaddr_t va, pd_entry_t * const *pdes)
1970 {
1971 	struct vm_page *ptp, *pptp;
1972 	int i;
1973 	unsigned long index;
1974 	pd_entry_t *pva;
1975 	paddr_t ppa, pa;
1976 	struct uvm_object *obj;
1977 
1978 	KASSERT(pmap != pmap_kernel());
1979 	KASSERT(mutex_owned(pmap->pm_lock));
1980 	KASSERT(kpreempt_disabled());
1981 
1982 	ptp = NULL;
1983 	pa = (paddr_t)-1;
1984 
1985 	/*
1986 	 * Loop through all page table levels seeing if we need to
1987 	 * add a new page to that level.
1988 	 */
1989 	for (i = PTP_LEVELS; i > 1; i--) {
1990 		/*
1991 		 * Save values from previous round.
1992 		 */
1993 		pptp = ptp;
1994 		ppa = pa;
1995 
1996 		index = pl_i(va, i);
1997 		pva = pdes[i - 2];
1998 
1999 		if (pmap_valid_entry(pva[index])) {
2000 			ppa = pmap_pte2pa(pva[index]);
2001 			ptp = NULL;
2002 			continue;
2003 		}
2004 
2005 		obj = &pmap->pm_obj[i-2];
2006 		PMAP_SUBOBJ_LOCK(pmap, i - 2);
2007 		ptp = uvm_pagealloc(obj, ptp_va2o(va, i - 1), NULL,
2008 		    UVM_PGA_USERESERVE|UVM_PGA_ZERO);
2009 		PMAP_SUBOBJ_UNLOCK(pmap, i - 2);
2010 
2011 		if (ptp == NULL)
2012 			return NULL;
2013 
2014 		ptp->flags &= ~PG_BUSY; /* never busy */
2015 		ptp->wire_count = 1;
2016 		pmap->pm_ptphint[i - 2] = ptp;
2017 		pa = VM_PAGE_TO_PHYS(ptp);
2018 		pmap_pte_set(&pva[index], (pd_entry_t)
2019 		    (pmap_pa2pte(pa) | PG_u | PG_RW | PG_V));
2020 #if defined(XEN) && defined(__x86_64__)
2021 		if (i == PTP_LEVELS) {
2022 			/*
2023 			 * Update the per-cpu PD on all cpus the current
2024 			 * pmap is active on
2025 			 */
2026 			xen_kpm_sync(pmap, index);
2027 		}
2028 #endif
2029 		pmap_pte_flush();
2030 		pmap_stats_update(pmap, 1, 0);
2031 		/*
2032 		 * If we're not in the top level, increase the
2033 		 * wire count of the parent page.
2034 		 */
2035 		if (i < PTP_LEVELS) {
2036 			if (pptp == NULL) {
2037 				pptp = pmap_find_ptp(pmap, va, ppa, i);
2038 				KASSERT(pptp != NULL);
2039 			}
2040 			pptp->wire_count++;
2041 		}
2042 	}
2043 
2044 	/*
2045 	 * PTP is not NULL if we just allocated a new PTP.  If it is
2046 	 * still NULL, we must look up the existing one.
2047 	 */
2048 	if (ptp == NULL) {
2049 		ptp = pmap_find_ptp(pmap, va, ppa, 1);
2050 		KASSERTMSG(ptp != NULL, "pmap_get_ptp: va %" PRIxVADDR
2051 		    " ppa %" PRIxPADDR "\n", va, ppa);
2052 	}
2053 
2054 	pmap->pm_ptphint[0] = ptp;
2055 	return ptp;
2056 }
2057 
2058 /*
2059  * p m a p   l i f e c y c l e   f u n c t i o n s
2060  */
2061 
2062 /*
2063  * pmap_pdp_ctor: constructor for the PDP cache.
2064  */
2065 static int
2066 pmap_pdp_ctor(void *arg, void *v, int flags)
2067 {
2068 	pd_entry_t *pdir = v;
2069 	paddr_t pdirpa = 0;
2070 	vaddr_t object;
2071 	int i;
2072 
2073 #if !defined(XEN) || !defined(__x86_64__)
2074 	int npde;
2075 #endif
2076 #ifdef XEN
2077 	int s;
2078 #endif
2079 
2080 	/*
2081 	 * NOTE: The `pmaps_lock' is held when the PDP is allocated.
2082 	 */
2083 
2084 #if defined(XEN) && defined(__x86_64__)
2085 	/* Fetch the physical address of the page directory */
2086 	(void)pmap_extract(pmap_kernel(), (vaddr_t)pdir, &pdirpa);
2087 
2088 	/* Zero the area */
2089 	memset(pdir, 0, PAGE_SIZE); /* Xen wants a clean page */
2090 
2091 	/*
2092 	 * This pdir will NEVER be active in kernel mode, so mark
2093 	 * recursive entry invalid.
2094 	 */
2095 	pdir[PDIR_SLOT_PTE] = pmap_pa2pte(pdirpa) | PG_u;
2096 
2097 	/*
2098 	 * PDP constructed this way won't be for the kernel, hence we
2099 	 * don't put kernel mappings on Xen.
2100 	 *
2101 	 * But we need to make pmap_create() happy, so put a dummy
2102 	 * (without PG_V) value at the right place.
2103 	 */
2104 	pdir[PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1] - 1] =
2105 	     (pd_entry_t)-1 & PG_FRAME;
2106 #else /* XEN && __x86_64__ */
2107 	/* Zero the area */
2108 	memset(pdir, 0, PDIR_SLOT_PTE * sizeof(pd_entry_t));
2109 
2110 	object = (vaddr_t)v;
2111 	for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) {
2112 		/* Fetch the physical address of the page directory */
2113 		(void)pmap_extract(pmap_kernel(), object, &pdirpa);
2114 
2115 		/* Put in recursive PDE to map the PTEs */
2116 		pdir[PDIR_SLOT_PTE + i] = pmap_pa2pte(pdirpa) | PG_V |
2117 		    pmap_pg_nx;
2118 #ifndef XEN
2119 		pdir[PDIR_SLOT_PTE + i] |= PG_KW;
2120 #endif
2121 	}
2122 
2123 	/* Copy the kernel's top level PDE */
2124 	npde = nkptp[PTP_LEVELS - 1];
2125 
2126 	memcpy(&pdir[PDIR_SLOT_KERN], &PDP_BASE[PDIR_SLOT_KERN],
2127 	    npde * sizeof(pd_entry_t));
2128 
2129 	/* Zero the rest */
2130 	memset(&pdir[PDIR_SLOT_KERN + npde], 0, (PAGE_SIZE * PDP_SIZE) -
2131 	    (PDIR_SLOT_KERN + npde) * sizeof(pd_entry_t));
2132 
2133 	if (VM_MIN_KERNEL_ADDRESS != KERNBASE) {
2134 		int idx = pl_i(KERNBASE, PTP_LEVELS);
2135 		pdir[idx] = PDP_BASE[idx];
2136 	}
2137 
2138 #ifdef __HAVE_DIRECT_MAP
2139 	pdir[PDIR_SLOT_DIRECT] = PDP_BASE[PDIR_SLOT_DIRECT];
2140 #endif
2141 #endif /* XEN && __x86_64__ */
2142 
2143 #ifdef XEN
2144 	s = splvm();
2145 	object = (vaddr_t)v;
2146 	pmap_protect(pmap_kernel(), object, object + (PAGE_SIZE * PDP_SIZE),
2147 	    VM_PROT_READ);
2148 	pmap_update(pmap_kernel());
2149 	for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) {
2150 		/*
2151 		 * pin as an L2/L4 page; the page containing the
2152 		 * PDIR_SLOT_PTE entries must be pinned last
2153 		 */
2154 #ifdef PAE
2155 		if (i == l2tol3(PDIR_SLOT_PTE))
2156 			continue;
2157 #endif
2158 
2159 		(void) pmap_extract(pmap_kernel(), object, &pdirpa);
2160 #ifdef __x86_64__
2161 		xpq_queue_pin_l4_table(xpmap_ptom_masked(pdirpa));
2162 #else
2163 		xpq_queue_pin_l2_table(xpmap_ptom_masked(pdirpa));
2164 #endif
2165 	}
2166 #ifdef PAE
2167 	object = ((vaddr_t)pdir) + PAGE_SIZE  * l2tol3(PDIR_SLOT_PTE);
2168 	(void)pmap_extract(pmap_kernel(), object, &pdirpa);
2169 	xpq_queue_pin_l2_table(xpmap_ptom_masked(pdirpa));
2170 #endif
2171 	splx(s);
2172 #endif /* XEN */
2173 
2174 	return (0);
2175 }
2176 
2177 /*
2178  * pmap_pdp_dtor: destructor for the PDP cache.
2179  */
2180 
2181 static void
2182 pmap_pdp_dtor(void *arg, void *v)
2183 {
2184 #ifdef XEN
2185 	paddr_t pdirpa = 0;	/* XXX: GCC */
2186 	vaddr_t object = (vaddr_t)v;
2187 	int i;
2188 	int s = splvm();
2189 	pt_entry_t *pte;
2190 
2191 	for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) {
2192 		/* fetch the physical address of the page directory. */
2193 		(void) pmap_extract(pmap_kernel(), object, &pdirpa);
2194 		/* unpin page table */
2195 		xpq_queue_unpin_table(xpmap_ptom_masked(pdirpa));
2196 	}
2197 	object = (vaddr_t)v;
2198 	for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) {
2199 		/* Set page RW again */
2200 		pte = kvtopte(object);
2201 		pmap_pte_set(pte, *pte | PG_RW);
2202 		xen_bcast_invlpg((vaddr_t)object);
2203 	}
2204 	splx(s);
2205 #endif  /* XEN */
2206 }
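
/*
 * A short illustration (sketch only) of how the constructor/destructor
 * pair above interacts with pool_cache(9): objects come out of the
 * cache already constructed and go back in still constructed; the
 * destructor runs only when the cache gives a page directory up.
 *
 *	pdir = pool_cache_get(&pmap_pdp_cache, PR_WAITOK);
 *	    (ctor has already run: recursive slots set, kernel slots
 *	     copied and, under Xen, the pages pinned read-only)
 *	...
 *	pool_cache_put(&pmap_pdp_cache, pdir);
 *	    (object stays constructed in the cache; the dtor runs only
 *	     when the cache releases it, or explicitly via
 *	     pool_cache_destruct_object() as in pmap_create() below)
 */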
2207 
2208 #ifdef PAE
2209 
2210 /* pmap_pdp_alloc: Allocate a page for the pdp memory pool. */
2211 
2212 static void *
2213 pmap_pdp_alloc(struct pool *pp, int flags)
2214 {
2215 	return (void *)uvm_km_alloc(kernel_map,
2216 	    PAGE_SIZE * PDP_SIZE, PAGE_SIZE * PDP_SIZE,
2217 	    ((flags & PR_WAITOK) ? 0 : UVM_KMF_NOWAIT | UVM_KMF_TRYLOCK)
2218 	    | UVM_KMF_WIRED);
2219 }
2220 
2221 /*
2222  * pmap_pdp_free: free a PDP
2223  */
2224 
2225 static void
2226 pmap_pdp_free(struct pool *pp, void *v)
2227 {
2228 	uvm_km_free(kernel_map, (vaddr_t)v, PAGE_SIZE * PDP_SIZE,
2229 	    UVM_KMF_WIRED);
2230 }
2231 #endif /* PAE */
2232 
2233 /*
2234  * pmap_create: create a pmap object.
2235  */
2236 struct pmap *
2237 pmap_create(void)
2238 {
2239 	struct pmap *pmap;
2240 	int i;
2241 
2242 	pmap = pool_cache_get(&pmap_cache, PR_WAITOK);
2243 
2244 	/* init uvm_object */
2245 	for (i = 0; i < PTP_LEVELS - 1; i++) {
2246 		mutex_init(&pmap->pm_obj_lock[i], MUTEX_DEFAULT, IPL_NONE);
2247 		uvm_obj_init(&pmap->pm_obj[i], NULL, false, 1);
2248 		uvm_obj_setlock(&pmap->pm_obj[i], &pmap->pm_obj_lock[i]);
2249 		pmap->pm_ptphint[i] = NULL;
2250 	}
2251 	pmap->pm_stats.wired_count = 0;
2252 	/* count the PDP allocated below */
2253 	pmap->pm_stats.resident_count = PDP_SIZE;
2254 #if !defined(__x86_64__)
2255 	pmap->pm_hiexec = 0;
2256 #endif /* !defined(__x86_64__) */
2257 	pmap->pm_flags = 0;
2258 	pmap->pm_gc_ptp = NULL;
2259 
2260 	kcpuset_create(&pmap->pm_cpus, true);
2261 	kcpuset_create(&pmap->pm_kernel_cpus, true);
2262 #ifdef XEN
2263 	kcpuset_create(&pmap->pm_xen_ptp_cpus, true);
2264 #endif
2265 	/* init the LDT */
2266 	pmap->pm_ldt = NULL;
2267 	pmap->pm_ldt_len = 0;
2268 	pmap->pm_ldt_sel = GSYSSEL(GLDT_SEL, SEL_KPL);
2269 
2270 	/* allocate PDP */
2271  try_again:
2272 	pmap->pm_pdir = pool_cache_get(&pmap_pdp_cache, PR_WAITOK);
2273 
2274 	mutex_enter(&pmaps_lock);
2275 
2276 	if (pmap->pm_pdir[PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1] - 1] == 0) {
2277 		mutex_exit(&pmaps_lock);
2278 		pool_cache_destruct_object(&pmap_pdp_cache, pmap->pm_pdir);
2279 		goto try_again;
2280 	}
2281 
2282 	for (i = 0; i < PDP_SIZE; i++)
2283 		pmap->pm_pdirpa[i] =
2284 		    pmap_pte2pa(pmap->pm_pdir[PDIR_SLOT_PTE + i]);
2285 
2286 	LIST_INSERT_HEAD(&pmaps, pmap, pm_list);
2287 
2288 	mutex_exit(&pmaps_lock);
2289 
2290 	return (pmap);
2291 }
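
/*
 * Lifecycle sketch (illustrative only, locals are hypothetical): a pmap
 * is created per VM space and released by reference count; the count
 * lives in pm_obj[0].uo_refs (see pmap_destroy() below).
 *
 *	struct pmap *pm = pmap_create();
 *	pmap_reference(pm);	(extra reference, e.g. while loaded on a CPU)
 *	...
 *	pmap_destroy(pm);	(drops the extra reference)
 *	pmap_destroy(pm);	(last reference: PDP, PTPs and LDT are freed)
 */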
2292 
2293 /*
2294  * pmap_free_ptps: put a list of ptps back to the freelist.
2295  */
2296 
2297 void
2298 pmap_free_ptps(struct vm_page *empty_ptps)
2299 {
2300 	struct vm_page *ptp;
2301 	struct pmap_page *pp;
2302 
2303 	while ((ptp = empty_ptps) != NULL) {
2304 		pp = VM_PAGE_TO_PP(ptp);
2305 		empty_ptps = pp->pp_link;
2306 		LIST_INIT(&pp->pp_head.pvh_list);
2307 		uvm_pagefree(ptp);
2308 	}
2309 }
2310 
2311 /*
2312  * pmap_destroy: drop reference count on pmap.   free pmap if
2313  *	reference count goes to zero.
2314  */
2315 
2316 void
2317 pmap_destroy(struct pmap *pmap)
2318 {
2319 	lwp_t *l;
2320 	int i;
2321 
2322 	/*
2323 	 * If we have torn down this pmap, process deferred frees and
2324 	 * invalidations.  Free now if the system is low on memory.
2325 	 * Otherwise, free when the pmap is destroyed thus avoiding a
2326 	 * TLB shootdown.
2327 	 */
2328 	l = curlwp;
2329 	if (__predict_false(l->l_md.md_gc_pmap == pmap)) {
2330 		if (uvmexp.free < uvmexp.freetarg) {
2331 			pmap_update(pmap);
2332 		} else {
2333 			KASSERT(pmap->pm_gc_ptp == NULL);
2334 			pmap->pm_gc_ptp = l->l_md.md_gc_ptp;
2335 			l->l_md.md_gc_ptp = NULL;
2336 			l->l_md.md_gc_pmap = NULL;
2337 		}
2338 	}
2339 
2340 	/*
2341 	 * drop reference count
2342 	 */
2343 
2344 	if (atomic_dec_uint_nv(&pmap->pm_obj[0].uo_refs) > 0) {
2345 		return;
2346 	}
2347 
2348 #ifdef DIAGNOSTIC
2349 	CPU_INFO_ITERATOR cii;
2350 	struct cpu_info *ci;
2351 
2352 	for (CPU_INFO_FOREACH(cii, ci)) {
2353 		if (ci->ci_pmap == pmap)
2354 			panic("destroying pmap being used");
2355 #if defined(XEN) && defined(__x86_64__)
2356 		for (i = 0; i < PDIR_SLOT_PTE; i++) {
2357 			if (pmap->pm_pdir[i] != 0 &&
2358 			    ci->ci_kpm_pdir[i] == pmap->pm_pdir[i]) {
2359 				printf("pmap_destroy(%p) pmap_kernel %p "
2360 				    "curcpu %d cpu %d ci_pmap %p "
2361 				    "ci->ci_kpm_pdir[%d]=%" PRIx64
2362 				    " pmap->pm_pdir[%d]=%" PRIx64 "\n",
2363 				    pmap, pmap_kernel(), curcpu()->ci_index,
2364 				    ci->ci_index, ci->ci_pmap,
2365 				    i, ci->ci_kpm_pdir[i],
2366 				    i, pmap->pm_pdir[i]);
2367 				panic("pmap_destroy: used pmap");
2368 			}
2369 		}
2370 #endif
2371 	}
2372 #endif /* DIAGNOSTIC */
2373 
2374 	/*
2375 	 * Reference count is zero, free pmap resources and then free pmap.
2376 	 * First, remove it from global list of pmaps.
2377 	 */
2378 
2379 	mutex_enter(&pmaps_lock);
2380 	LIST_REMOVE(pmap, pm_list);
2381 	mutex_exit(&pmaps_lock);
2382 
2383 	/*
2384 	 * Process deferred PTP frees.  No TLB shootdown required, as the
2385 	 * PTP pages are no longer visible to any CPU.
2386 	 */
2387 
2388 	pmap_free_ptps(pmap->pm_gc_ptp);
2389 
2390 	/*
2391 	 * destroyed pmap shouldn't have remaining PTPs
2392 	 */
2393 
2394 	for (i = 0; i < PTP_LEVELS - 1; i++) {
2395 		KASSERT(pmap->pm_obj[i].uo_npages == 0);
2396 		KASSERT(TAILQ_EMPTY(&pmap->pm_obj[i].memq));
2397 	}
2398 
2399 	pool_cache_put(&pmap_pdp_cache, pmap->pm_pdir);
2400 
2401 #ifdef USER_LDT
2402 	if (pmap->pm_ldt != NULL) {
2403 		/*
2404 		 * no need to switch the LDT; this address space is gone,
2405 		 * nothing is using it.
2406 		 *
2407 		 * No need to lock the pmap for ldt_free (or anything else),
2408 		 * we're the last one to use it.
2409 		 */
2410 		mutex_enter(&cpu_lock);
2411 		ldt_free(pmap->pm_ldt_sel);
2412 		mutex_exit(&cpu_lock);
2413 		uvm_km_free(kernel_map, (vaddr_t)pmap->pm_ldt,
2414 		    pmap->pm_ldt_len, UVM_KMF_WIRED);
2415 	}
2416 #endif
2417 
2418 	for (i = 0; i < PTP_LEVELS - 1; i++) {
2419 		uvm_obj_destroy(&pmap->pm_obj[i], false);
2420 		mutex_destroy(&pmap->pm_obj_lock[i]);
2421 	}
2422 	kcpuset_destroy(pmap->pm_cpus);
2423 	kcpuset_destroy(pmap->pm_kernel_cpus);
2424 #ifdef XEN
2425 	kcpuset_destroy(pmap->pm_xen_ptp_cpus);
2426 #endif
2427 	pool_cache_put(&pmap_cache, pmap);
2428 }
2429 
2430 /*
2431  * pmap_remove_all: pmap is being torn down by the current thread.
2432  * avoid unnecessary invalidations.
2433  */
2434 
2435 void
2436 pmap_remove_all(struct pmap *pmap)
2437 {
2438 	lwp_t *l = curlwp;
2439 
2440 	KASSERT(l->l_md.md_gc_pmap == NULL);
2441 
2442 	l->l_md.md_gc_pmap = pmap;
2443 }
2444 
2445 #if defined(PMAP_FORK)
2446 /*
2447  * pmap_fork: perform any necessary data structure manipulation when
2448  * a VM space is forked.
2449  */
2450 
2451 void
2452 pmap_fork(struct pmap *pmap1, struct pmap *pmap2)
2453 {
2454 #ifdef USER_LDT
2455 	union descriptor *new_ldt;
2456 	size_t len;
2457 	int sel;
2458 
2459 	if (__predict_true(pmap1->pm_ldt == NULL)) {
2460 		return;
2461 	}
2462 
2463 	/*
2464 	 * Copy the LDT into the new process.
2465 	 *
2466 	 * Read pmap1's ldt pointer and length unlocked; if it changes
2467 	 * behind our back we'll retry. This will starve if there's a
2468 	 * stream of LDT changes in another thread but that should not
2469 	 * happen.
2470 	 */
2471 
2472  retry:
2473 	if (pmap1->pm_ldt != NULL) {
2474 		len = pmap1->pm_ldt_len;
2475 		/* Allocate space for the new process's LDT */
2476 		new_ldt = (union descriptor *)uvm_km_alloc(kernel_map, len, 0,
2477 		    UVM_KMF_WIRED);
2478 		if (new_ldt == NULL) {
2479 			printf("WARNING: pmap_fork: "
2480 			       "unable to allocate LDT space\n");
2481 			return;
2482 		}
2483 		mutex_enter(&cpu_lock);
2484 		/* Get a GDT slot for it */
2485 		sel = ldt_alloc(new_ldt, len);
2486 		if (sel == -1) {
2487 			mutex_exit(&cpu_lock);
2488 			uvm_km_free(kernel_map, (vaddr_t)new_ldt, len,
2489 			    UVM_KMF_WIRED);
2490 			printf("WARNING: pmap_fork: "
2491 			       "unable to allocate LDT selector\n");
2492 			return;
2493 		}
2494 	} else {
2495 		/* Wasn't anything there after all. */
2496 		len = -1;
2497 		new_ldt = NULL;
2498 		sel = -1;
2499 		mutex_enter(&cpu_lock);
2500 	}
2501 
2502  	/* If there's still something there now that we have cpu_lock... */
2503  	if (pmap1->pm_ldt != NULL) {
2504 		if (len != pmap1->pm_ldt_len) {
2505 			/* Oops, it changed. Drop what we did and try again */
2506 			if (len != -1) {
2507 				ldt_free(sel);
2508 				uvm_km_free(kernel_map, (vaddr_t)new_ldt,
2509 				    len, UVM_KMF_WIRED);
2510 			}
2511 			mutex_exit(&cpu_lock);
2512 			goto retry;
2513 		}
2514 
2515 		/* Copy the LDT data and install it in pmap2 */
2516 		memcpy(new_ldt, pmap1->pm_ldt, len);
2517 		pmap2->pm_ldt = new_ldt;
2518 		pmap2->pm_ldt_len = pmap1->pm_ldt_len;
2519 		pmap2->pm_ldt_sel = sel;
2520 		len = -1;
2521 	}
2522 
2523 	if (len != -1) {
2524 		/* There wasn't anything there after all, so mop up */
2525 		ldt_free(sel);
2526 		mutex_exit(&cpu_lock);
2527 		uvm_km_free(kernel_map, (vaddr_t)new_ldt, len,
2528 		    UVM_KMF_WIRED);
2529 	} else {
2530 		mutex_exit(&cpu_lock);
2531 	}
2532 #endif /* USER_LDT */
2533 }
2534 #endif /* PMAP_FORK */
2535 
2536 #ifdef USER_LDT
2537 
2538 /*
2539  * pmap_ldt_xcall: cross call used by pmap_ldt_sync.  if the named pmap
2540  * is active, reload LDTR.
2541  */
2542 static void
2543 pmap_ldt_xcall(void *arg1, void *arg2)
2544 {
2545 	struct pmap *pm;
2546 
2547 	kpreempt_disable();
2548 	pm = arg1;
2549 	if (curcpu()->ci_pmap == pm) {
2550 		lldt(pm->pm_ldt_sel);
2551 	}
2552 	kpreempt_enable();
2553 }
2554 
2555 /*
2556  * pmap_ldt_sync: LDT selector for the named pmap is changing.  swap
2557  * in the new selector on all CPUs.
2558  */
2559 void
2560 pmap_ldt_sync(struct pmap *pm)
2561 {
2562 	uint64_t where;
2563 
2564 	KASSERT(mutex_owned(&cpu_lock));
2565 
2566 	pmap_ldt_evcnt.ev_count++;
2567 	where = xc_broadcast(0, pmap_ldt_xcall, pm, NULL);
2568 	xc_wait(where);
2569 }
2570 
2571 /*
2572  * pmap_ldt_cleanup: if the pmap has a local LDT, deallocate it, and
2573  * restore the default.
2574  */
2575 
2576 void
2577 pmap_ldt_cleanup(struct lwp *l)
2578 {
2579 	pmap_t pmap = l->l_proc->p_vmspace->vm_map.pmap;
2580 	union descriptor *dp = NULL;
2581 	size_t len = 0;
2582 	int sel = -1;
2583 
2584 	if (__predict_true(pmap->pm_ldt == NULL)) {
2585 		return;
2586 	}
2587 
2588 	mutex_enter(&cpu_lock);
2589 	if (pmap->pm_ldt != NULL) {
2590 		sel = pmap->pm_ldt_sel;
2591 		dp = pmap->pm_ldt;
2592 		len = pmap->pm_ldt_len;
2593 		pmap->pm_ldt_sel = GSYSSEL(GLDT_SEL, SEL_KPL);
2594 		pmap->pm_ldt = NULL;
2595 		pmap->pm_ldt_len = 0;
2596 		pmap_ldt_sync(pmap);
2597 		ldt_free(sel);
2598 		uvm_km_free(kernel_map, (vaddr_t)dp, len, UVM_KMF_WIRED);
2599 	}
2600 	mutex_exit(&cpu_lock);
2601 }
2602 #endif /* USER_LDT */
2603 
2604 /*
2605  * pmap_activate: activate a process' pmap
2606  *
2607  * => must be called with kernel preemption disabled
2608  * => if lwp is the curlwp, then set ci_want_pmapload so that
2609  *    actual MMU context switch will be done by pmap_load() later
2610  */
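
/*
 * Rough sketch of how the context-switch path is expected to drive the
 * activation functions (illustrative only; the exact call sites live in
 * the MD switch and user-return code, not in this file):
 *
 *	(switching to lwp l)	kpreempt_disable();
 *				pmap_activate(l);	sets ci_want_pmapload
 *				kpreempt_enable();
 *	(before returning
 *	 to user mode)		if (curcpu()->ci_want_pmapload)
 *					pmap_load();	loads %cr3 and LDT
 *	(switching away)	pmap_deactivate(l);	TLBSTATE_VALID -> LAZY
 */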
2611 
2612 void
2613 pmap_activate(struct lwp *l)
2614 {
2615 	struct cpu_info *ci;
2616 	struct pmap *pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map);
2617 
2618 	KASSERT(kpreempt_disabled());
2619 
2620 	ci = curcpu();
2621 
2622 	if (l == ci->ci_curlwp) {
2623 		KASSERT(ci->ci_want_pmapload == 0);
2624 		KASSERT(ci->ci_tlbstate != TLBSTATE_VALID);
2625 
2626 		/*
2627 		 * no need to switch to kernel vmspace because
2628 		 * it's a subset of any vmspace.
2629 		 */
2630 
2631 		if (pmap == pmap_kernel()) {
2632 			ci->ci_want_pmapload = 0;
2633 			return;
2634 		}
2635 
2636 		ci->ci_want_pmapload = 1;
2637 	}
2638 }
2639 
2640 /*
2641  * pmap_reactivate: try to regain reference to the pmap.
2642  *
2643  * => Must be called with kernel preemption disabled.
2644  */
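
/*
 * Per-CPU TLB state, in outline (a simplified sketch of what the code
 * below together with pmap_load() and pmap_deactivate() implements):
 *
 *	TLBSTATE_VALID --(pmap_deactivate)--> TLBSTATE_LAZY
 *	TLBSTATE_LAZY  --(pmap_reactivate)--> TLBSTATE_VALID
 *
 * On reactivation, if this CPU is still in pm_cpus the lazy reference
 * survived and the TLB is still valid; otherwise a shootdown cleared it
 * while we were lazy, the CPU is re-added to pm_cpus and the caller
 * must flush the TLB (see pmap_load()).
 */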
2645 
2646 static bool
2647 pmap_reactivate(struct pmap *pmap)
2648 {
2649 	struct cpu_info * const ci = curcpu();
2650 	const cpuid_t cid = cpu_index(ci);
2651 	bool result;
2652 
2653 	KASSERT(kpreempt_disabled());
2654 #if defined(XEN) && defined(__x86_64__)
2655 	KASSERT(pmap_pdirpa(pmap, 0) == ci->ci_xen_current_user_pgd);
2656 #elif defined(PAE)
2657 	KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(ci->ci_pae_l3_pdir[0]));
2658 #elif !defined(XEN)
2659 	KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(rcr3()));
2660 #endif
2661 
2662 	/*
2663 	 * If we still have a lazy reference to this pmap, we can assume
2664 	 * that there was no TLB shootdown for this pmap in the meantime.
2665 	 *
2666 	 * The order of events here is important as we must synchronize
2667 	 * with TLB shootdown interrupts.  Declare interest in invalidations
2668 	 * (TLBSTATE_VALID) and then check the CPU set, which the IPIs can
2669 	 * change only when the state is TLBSTATE_LAZY.
2670 	 */
2671 
2672 	ci->ci_tlbstate = TLBSTATE_VALID;
2673 	KASSERT(kcpuset_isset(pmap->pm_kernel_cpus, cid));
2674 
2675 	if (kcpuset_isset(pmap->pm_cpus, cid)) {
2676 		/* We have the reference, state is valid. */
2677 		result = true;
2678 	} else {
2679 		/* Must reload the TLB. */
2680 		kcpuset_atomic_set(pmap->pm_cpus, cid);
2681 		result = false;
2682 	}
2683 	return result;
2684 }
2685 
2686 /*
2687  * pmap_load: perform the actual pmap switch, i.e. fill in %cr3 register
2688  * and relevant LDT info.
2689  *
2690  * Ensures that the current process' pmap is loaded on the current CPU's
2691  * MMU and that there are no stale TLB entries.
2692  *
2693  * => The caller should disable kernel preemption or do check-and-retry
2694  *    to prevent a preemption from undoing our efforts.
2695  * => This function may block.
2696  */
2697 void
2698 pmap_load(void)
2699 {
2700 	struct cpu_info *ci;
2701 	struct pmap *pmap, *oldpmap;
2702 	struct lwp *l;
2703 	struct pcb *pcb;
2704 	cpuid_t cid;
2705 	uint64_t ncsw;
2706 
2707 	kpreempt_disable();
2708  retry:
2709 	ci = curcpu();
2710 	if (!ci->ci_want_pmapload) {
2711 		kpreempt_enable();
2712 		return;
2713 	}
2714 	l = ci->ci_curlwp;
2715 	ncsw = l->l_ncsw;
2716 
2717 	/* We should be able to take IPIs. */
2718 	KASSERT(ci->ci_ilevel < IPL_HIGH);
2719 #ifdef XEN
2720 	/* Check that interrupts are enabled (i.e., no events are masked). */
2721 	KASSERT(x86_read_psl() == 0);
2722 #else
2723 	KASSERT((x86_read_psl() & PSL_I) != 0);
2724 #endif
2725 
2726 	KASSERT(l != NULL);
2727 	pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map);
2728 	KASSERT(pmap != pmap_kernel());
2729 	oldpmap = ci->ci_pmap;
2730 	pcb = lwp_getpcb(l);
2731 
2732 	if (pmap == oldpmap) {
2733 		if (!pmap_reactivate(pmap)) {
2734 			u_int gen = uvm_emap_gen_return();
2735 
2736 			/*
2737 			 * the pmap was changed while it was deactivated.
2738 			 * our TLB may be stale.
2739 			 */
2740 
2741 			tlbflush();
2742 			uvm_emap_update(gen);
2743 		}
2744 
2745 		ci->ci_want_pmapload = 0;
2746 		kpreempt_enable();
2747 		return;
2748 	}
2749 
2750 	/*
2751 	 * Acquire a reference to the new pmap and perform the switch.
2752 	 */
2753 
2754 	pmap_reference(pmap);
2755 
2756 	cid = cpu_index(ci);
2757 	kcpuset_atomic_clear(oldpmap->pm_cpus, cid);
2758 	kcpuset_atomic_clear(oldpmap->pm_kernel_cpus, cid);
2759 
2760 #if defined(XEN) && defined(__x86_64__)
2761 	KASSERT(pmap_pdirpa(oldpmap, 0) == ci->ci_xen_current_user_pgd ||
2762 	    oldpmap == pmap_kernel());
2763 #elif defined(PAE)
2764 	KASSERT(pmap_pdirpa(oldpmap, 0) == pmap_pte2pa(ci->ci_pae_l3_pdir[0]));
2765 #elif !defined(XEN)
2766 	KASSERT(pmap_pdirpa(oldpmap, 0) == pmap_pte2pa(rcr3()));
2767 #endif
2768 	KASSERT(!kcpuset_isset(pmap->pm_cpus, cid));
2769 	KASSERT(!kcpuset_isset(pmap->pm_kernel_cpus, cid));
2770 
2771 	/*
2772 	 * Mark the pmap in use by this CPU.  Again, we must synchronize
2773 	 * with TLB shootdown interrupts, so set the state VALID first,
2774 	 * then register us for shootdown events on this pmap.
2775 	 */
2776 	ci->ci_tlbstate = TLBSTATE_VALID;
2777 	kcpuset_atomic_set(pmap->pm_cpus, cid);
2778 	kcpuset_atomic_set(pmap->pm_kernel_cpus, cid);
2779 	ci->ci_pmap = pmap;
2780 
2781 	/*
2782 	 * update tss.  now that we have registered for invalidations
2783 	 * from other CPUs, we're good to load the page tables.
2784 	 */
2785 #ifdef PAE
2786 	pcb->pcb_cr3 = ci->ci_pae_l3_pdirpa;
2787 #else
2788 	pcb->pcb_cr3 = pmap_pdirpa(pmap, 0);
2789 #endif
2790 
2791 #ifdef i386
2792 #ifndef XEN
2793 	ci->ci_tss.tss_ldt = pmap->pm_ldt_sel;
2794 	ci->ci_tss.tss_cr3 = pcb->pcb_cr3;
2795 #endif /* !XEN */
2796 #endif /* i386 */
2797 
2798 	lldt(pmap->pm_ldt_sel);
2799 
2800 	u_int gen = uvm_emap_gen_return();
2801 	cpu_load_pmap(pmap, oldpmap);
2802 	uvm_emap_update(gen);
2803 
2804 	ci->ci_want_pmapload = 0;
2805 
2806 	/*
2807 	 * we're now running with the new pmap.  drop the reference
2808 	 * to the old pmap.  if we block, we need to go around again.
2809 	 */
2810 
2811 	pmap_destroy(oldpmap);
2812 	if (l->l_ncsw != ncsw) {
2813 		goto retry;
2814 	}
2815 
2816 	kpreempt_enable();
2817 }
2818 
2819 /*
2820  * pmap_deactivate: deactivate a process' pmap.
2821  *
2822  * => Must be called with kernel preemption disabled (high IPL is enough).
2823  */
2824 void
2825 pmap_deactivate(struct lwp *l)
2826 {
2827 	struct pmap *pmap;
2828 	struct cpu_info *ci;
2829 
2830 	KASSERT(kpreempt_disabled());
2831 
2832 	if (l != curlwp) {
2833 		return;
2834 	}
2835 
2836 	/*
2837 	 * Wait for pending TLB shootdowns to complete.  Necessary because
2838 	 * TLB shootdown state is per-CPU, and the LWP may be coming off
2839 	 * the CPU before it has a chance to call pmap_update(), e.g. due
2840 	 * to kernel preemption or blocking routine in between.
2841 	 */
2842 	pmap_tlb_shootnow();
2843 
2844 	ci = curcpu();
2845 
2846 	if (ci->ci_want_pmapload) {
2847 		/*
2848 		 * ci_want_pmapload means that our pmap is not loaded on
2849 		 * the CPU, or the TLB might be stale.  Note that pmap_kernel()
2850 		 * is always considered loaded.
2851 		 */
2852 		KASSERT(vm_map_pmap(&l->l_proc->p_vmspace->vm_map)
2853 		    != pmap_kernel());
2854 		KASSERT(vm_map_pmap(&l->l_proc->p_vmspace->vm_map)
2855 		    != ci->ci_pmap || ci->ci_tlbstate != TLBSTATE_VALID);
2856 
2857 		/*
2858 		 * userspace has not been touched.
2859 		 * nothing to do here.
2860 		 */
2861 
2862 		ci->ci_want_pmapload = 0;
2863 		return;
2864 	}
2865 
2866 	pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map);
2867 
2868 	if (pmap == pmap_kernel()) {
2869 		return;
2870 	}
2871 
2872 #if defined(XEN) && defined(__x86_64__)
2873 	KASSERT(pmap_pdirpa(pmap, 0) == ci->ci_xen_current_user_pgd);
2874 #elif defined(PAE)
2875 	KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(ci->ci_pae_l3_pdir[0]));
2876 #elif !defined(XEN)
2877 	KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(rcr3()));
2878 #endif
2879 	KASSERT(ci->ci_pmap == pmap);
2880 
2881 	/*
2882 	 * we aren't interested in TLB invalidations for this pmap,
2883 	 * at least for the time being.
2884 	 */
2885 
2886 	KASSERT(ci->ci_tlbstate == TLBSTATE_VALID);
2887 	ci->ci_tlbstate = TLBSTATE_LAZY;
2888 }
2889 
2890 /*
2891  * end of lifecycle functions
2892  */
2893 
2894 /*
2895  * some misc. functions
2896  */
2897 
2898 int
2899 pmap_pdes_invalid(vaddr_t va, pd_entry_t * const *pdes, pd_entry_t *lastpde)
2900 {
2901 	int i;
2902 	unsigned long index;
2903 	pd_entry_t pde;
2904 
2905 	for (i = PTP_LEVELS; i > 1; i--) {
2906 		index = pl_i(va, i);
2907 		pde = pdes[i - 2][index];
2908 		if ((pde & PG_V) == 0)
2909 			return i;
2910 	}
2911 	if (lastpde != NULL)
2912 		*lastpde = pde;
2913 	return 0;
2914 }
2915 
2916 /*
2917  * pmap_extract: extract a PA for the given VA
2918  */
2919 
2920 bool
2921 pmap_extract(struct pmap *pmap, vaddr_t va, paddr_t *pap)
2922 {
2923 	pt_entry_t *ptes, pte;
2924 	pd_entry_t pde;
2925 	pd_entry_t * const *pdes;
2926 	struct pmap *pmap2;
2927 	struct cpu_info *ci;
2928 	paddr_t pa;
2929 	lwp_t *l;
2930 	bool hard, rv;
2931 
2932 #ifdef __HAVE_DIRECT_MAP
2933 	if (va >= PMAP_DIRECT_BASE && va < PMAP_DIRECT_END) {
2934 		if (pap != NULL) {
2935 			*pap = va - PMAP_DIRECT_BASE;
2936 		}
2937 		return true;
2938 	}
2939 #endif
2940 
2941 	rv = false;
2942 	pa = 0;
2943 	l = curlwp;
2944 
2945 	kpreempt_disable();
2946 	ci = l->l_cpu;
2947 	if (__predict_true(!ci->ci_want_pmapload && ci->ci_pmap == pmap) ||
2948 	    pmap == pmap_kernel()) {
2949 		/*
2950 		 * no need to lock, because it's pmap_kernel() or our
2951 		 * own pmap and is active.  if a user pmap, the caller
2952 		 * will hold the vm_map write/read locked and so prevent
2953 		 * entries from disappearing while we are here.  ptps
2954 		 * can disappear via pmap_remove() and pmap_protect(),
2955 		 * but they are called with the vm_map write locked.
2956 		 */
2957 		hard = false;
2958 		ptes = PTE_BASE;
2959 		pdes = normal_pdes;
2960 	} else {
2961 		/* we lose, do it the hard way. */
2962 		hard = true;
2963 		pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
2964 	}
2965 	if (pmap_pdes_valid(va, pdes, &pde)) {
2966 		pte = ptes[pl1_i(va)];
2967 		if (pde & PG_PS) {
2968 			pa = (pde & PG_LGFRAME) | (va & (NBPD_L2 - 1));
2969 			rv = true;
2970 		} else if (__predict_true((pte & PG_V) != 0)) {
2971 			pa = pmap_pte2pa(pte) | (va & (NBPD_L1 - 1));
2972 			rv = true;
2973 		}
2974 	}
2975 	if (__predict_false(hard)) {
2976 		pmap_unmap_ptes(pmap, pmap2);
2977 	}
2978 	kpreempt_enable();
2979 	if (pap != NULL) {
2980 		*pap = pa;
2981 	}
2982 	return rv;
2983 }
2984 
2985 
2986 /*
2987  * vtophys: virtual address to physical address.  For use by
2988  * machine-dependent code only.
2989  */
2990 
2991 paddr_t
2992 vtophys(vaddr_t va)
2993 {
2994 	paddr_t pa;
2995 
2996 	if (pmap_extract(pmap_kernel(), va, &pa) == true)
2997 		return (pa);
2998 	return (0);
2999 }
3000 
3001 __strict_weak_alias(pmap_extract_ma, pmap_extract);
3002 
3003 #ifdef XEN
3004 
3005 /*
3006  * vtomach: virtual address to machine address.  For use by
3007  * machine-dependent code only.
3008  */
3009 
3010 paddr_t
3011 vtomach(vaddr_t va)
3012 {
3013 	paddr_t pa;
3014 
3015 	if (pmap_extract_ma(pmap_kernel(), va, &pa) == true)
3016 		return (pa);
3017 	return (0);
3018 }
3019 
3020 #endif /* XEN */
3021 
3022 /*
3023  * pmap_virtual_space: used during bootup [pmap_steal_memory] to
3024  *	determine the bounds of the kernel virtual address space.
3025  */
3026 
3027 void
3028 pmap_virtual_space(vaddr_t *startp, vaddr_t *endp)
3029 {
3030 	*startp = virtual_avail;
3031 	*endp = virtual_end;
3032 }
3033 
3034 /*
3035  * pmap_zero_page: zero a page
3036  */
3037 
3038 void
3039 pmap_zero_page(paddr_t pa)
3040 {
3041 #if defined(__HAVE_DIRECT_MAP)
3042 	pagezero(PMAP_DIRECT_MAP(pa));
3043 #else
3044 #if defined(XEN)
3045 	if (XEN_VERSION_SUPPORTED(3, 4)) {
3046 		xen_pagezero(pa);
		return;
	}
3047 #endif
3048 	struct cpu_info *ci;
3049 	pt_entry_t *zpte;
3050 	vaddr_t zerova;
3051 
3052 	const pd_entry_t pteflags = PG_V | PG_RW | pmap_pg_nx | PG_M | PG_U |
3053 	    PG_k;
3054 
3055 	kpreempt_disable();
3056 
3057 	ci = curcpu();
3058 	zerova = ci->vpage[VPAGE_ZER];
3059 	zpte = ci->vpage_pte[VPAGE_ZER];
3060 
3061 #ifdef DIAGNOSTIC
3062 	if (*zpte)
3063 		panic("pmap_zero_page: lock botch");
3064 #endif
3065 
3066 	pmap_pte_set(zpte, pmap_pa2pte(pa) | pteflags);
3067 	pmap_pte_flush();
3068 	pmap_update_pg(zerova);		/* flush TLB */
3069 
3070 	memset((void *)zerova, 0, PAGE_SIZE);
3071 
3072 #if defined(DIAGNOSTIC) || defined(XEN)
3073 	pmap_pte_set(zpte, 0);				/* zap ! */
3074 	pmap_pte_flush();
3075 #endif
3076 
3077 	kpreempt_enable();
3078 #endif /* defined(__HAVE_DIRECT_MAP) */
3079 }
3080 
3081 /*
3082  * pmap_pageidlezero: the same, for the idle loop page zeroer.
3083  * Returns true if the page was zero'd, false if we aborted for
3084  * some reason.
3085  */
3086 
3087 bool
3088 pmap_pageidlezero(paddr_t pa)
3089 {
3090 #ifdef __HAVE_DIRECT_MAP
3091 	KASSERT(cpu_feature[0] & CPUID_SSE2);
3092 	return sse2_idlezero_page((void *)PMAP_DIRECT_MAP(pa));
3093 #else
3094 	struct cpu_info *ci;
3095 	pt_entry_t *zpte;
3096 	vaddr_t zerova;
3097 	bool rv;
3098 
3099 	const pd_entry_t pteflags = PG_V | PG_RW | pmap_pg_nx | PG_M | PG_U |
3100 	    PG_k;
3101 
3102 	ci = curcpu();
3103 	zerova = ci->vpage[VPAGE_ZER];
3104 	zpte = ci->vpage_pte[VPAGE_ZER];
3105 
3106 	KASSERT(cpu_feature[0] & CPUID_SSE2);
3107 	KASSERT(*zpte == 0);
3108 
3109 	pmap_pte_set(zpte, pmap_pa2pte(pa) | pteflags);
3110 	pmap_pte_flush();
3111 	pmap_update_pg(zerova);		/* flush TLB */
3112 
3113 	rv = sse2_idlezero_page((void *)zerova);
3114 
3115 #if defined(DIAGNOSTIC) || defined(XEN)
3116 	pmap_pte_set(zpte, 0);				/* zap ! */
3117 	pmap_pte_flush();
3118 #endif
3119 
3120 	return rv;
3121 #endif
3122 }
3123 
3124 /*
3125  * pmap_copy_page: copy a page
3126  */
3127 
3128 void
3129 pmap_copy_page(paddr_t srcpa, paddr_t dstpa)
3130 {
3131 #if defined(__HAVE_DIRECT_MAP)
3132 	vaddr_t srcva = PMAP_DIRECT_MAP(srcpa);
3133 	vaddr_t dstva = PMAP_DIRECT_MAP(dstpa);
3134 
3135 	memcpy((void *)dstva, (void *)srcva, PAGE_SIZE);
3136 #else
3137 #if defined(XEN)
3138 	if (XEN_VERSION_SUPPORTED(3, 4)) {
3139 		xen_copy_page(srcpa, dstpa);
3140 		return;
3141 	}
3142 #endif
3143 	struct cpu_info *ci;
3144 	pt_entry_t *srcpte, *dstpte;
3145 	vaddr_t srcva, dstva;
3146 
3147 	const pd_entry_t pteflags = PG_V | PG_RW | pmap_pg_nx | PG_U | PG_k;
3148 
3149 	kpreempt_disable();
3150 
3151 	ci = curcpu();
3152 	srcva = ci->vpage[VPAGE_SRC];
3153 	dstva = ci->vpage[VPAGE_DST];
3154 	srcpte = ci->vpage_pte[VPAGE_SRC];
3155 	dstpte = ci->vpage_pte[VPAGE_DST];
3156 
3157 	KASSERT(*srcpte == 0 && *dstpte == 0);
3158 
3159 	pmap_pte_set(srcpte, pmap_pa2pte(srcpa) | pteflags);
3160 	pmap_pte_set(dstpte, pmap_pa2pte(dstpa) | pteflags | PG_M);
3161 	pmap_pte_flush();
3162 	pmap_update_2pg(srcva, dstva);
3163 
3164 	memcpy((void *)dstva, (void *)srcva, PAGE_SIZE);
3165 
3166 #if defined(DIAGNOSTIC) || defined(XEN)
3167 	pmap_pte_set(srcpte, 0);
3168 	pmap_pte_set(dstpte, 0);
3169 	pmap_pte_flush();
3170 #endif
3171 
3172 	kpreempt_enable();
3173 #endif /* defined(__HAVE_DIRECT_MAP) */
3174 }
3175 
3176 static pt_entry_t *
3177 pmap_map_ptp(struct vm_page *ptp)
3178 {
3179 #ifdef __HAVE_DIRECT_MAP
3180 	return (void *)PMAP_DIRECT_MAP(VM_PAGE_TO_PHYS(ptp));
3181 #else
3182 	struct cpu_info *ci;
3183 	pt_entry_t *ptppte;
3184 	vaddr_t ptpva;
3185 
3186 	KASSERT(kpreempt_disabled());
3187 
3188 #ifndef XEN
3189 	const pd_entry_t pteflags = PG_V | PG_RW | pmap_pg_nx | PG_U | PG_M |
3190 	    PG_k;
3191 #else
3192 	const pd_entry_t pteflags = PG_V | pmap_pg_nx | PG_U | PG_M | PG_k;
3193 #endif
3194 
3195 	ci = curcpu();
3196 	ptpva = ci->vpage[VPAGE_PTP];
3197 	ptppte = ci->vpage_pte[VPAGE_PTP];
3198 
3199 	pmap_pte_set(ptppte, pmap_pa2pte(VM_PAGE_TO_PHYS(ptp)) | pteflags);
3200 
3201 	pmap_pte_flush();
3202 	pmap_update_pg(ptpva);
3203 
3204 	return (pt_entry_t *)ptpva;
3205 #endif
3206 }
3207 
3208 static void
3209 pmap_unmap_ptp(void)
3210 {
3211 #ifndef __HAVE_DIRECT_MAP
3212 #if defined(DIAGNOSTIC) || defined(XEN)
3213 	struct cpu_info *ci;
3214 	pt_entry_t *pte;
3215 
3216 	KASSERT(kpreempt_disabled());
3217 
3218 	ci = curcpu();
3219 	pte = ci->vpage_pte[VPAGE_PTP];
3220 
3221 	if (*pte != 0) {
3222 		pmap_pte_set(pte, 0);
3223 		pmap_pte_flush();
3224 	}
3225 #endif
3226 #endif
3227 }
3228 
3229 static pt_entry_t *
3230 pmap_map_pte(struct pmap *pmap, struct vm_page *ptp, vaddr_t va)
3231 {
3232 
3233 	KASSERT(kpreempt_disabled());
3234 	if (pmap_is_curpmap(pmap)) {
3235 		return &PTE_BASE[pl1_i(va)]; /* (k)vtopte */
3236 	}
3237 	KASSERT(ptp != NULL);
3238 	return pmap_map_ptp(ptp) + pl1_pi(va);
3239 }
3240 
3241 static void
3242 pmap_unmap_pte(void)
3243 {
3244 
3245 	KASSERT(kpreempt_disabled());
3246 
3247 	pmap_unmap_ptp();
3248 }
3249 
3250 /*
3251  * p m a p   r e m o v e   f u n c t i o n s
3252  *
3253  * functions that remove mappings
3254  */
3255 
3256 /*
3257  * pmap_remove_ptes: remove PTEs from a PTP
3258  *
3259  * => caller must hold pmap's lock
3260  * => PTP must be mapped into KVA
3261  * => PTP should be null if pmap == pmap_kernel()
3262  * => must be called with kernel preemption disabled
3263  * => returns nothing; pmap_remove_pte() issues any TLB shootdowns needed
3264  */
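
/*
 * wire_count bookkeeping, by example (a sketch; the increments are done
 * by the mapping side, e.g. pmap_enter(), elsewhere in this file):
 *
 *	PTP freshly allocated by pmap_get_ptp()		wire_count == 1
 *	one PTE entered into it				wire_count == 2
 *	that PTE removed by pmap_remove_pte()		wire_count == 1
 *	caller sees wire_count <= 1			pmap_free_ptp()
 */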
3265 
3266 static void
3267 pmap_remove_ptes(struct pmap *pmap, struct vm_page *ptp, vaddr_t ptpva,
3268 		 vaddr_t startva, vaddr_t endva, struct pv_entry **pv_tofree)
3269 {
3270 	pt_entry_t *pte = (pt_entry_t *)ptpva;
3271 
3272 	KASSERT(pmap == pmap_kernel() || mutex_owned(pmap->pm_lock));
3273 	KASSERT(kpreempt_disabled());
3274 
3275 	/*
3276 	 * note that ptpva points to the PTE that maps startva.   this may
3277 	 * or may not be the first PTE in the PTP.
3278 	 *
3279 	 * we loop through the PTP while there are still PTEs to look at
3280 	 * and the wire_count is greater than 1 (because we use the wire_count
3281 	 * to keep track of the number of real PTEs in the PTP).
3282 	 */
3283 	while (startva < endva && (ptp == NULL || ptp->wire_count > 1)) {
3284 		(void)pmap_remove_pte(pmap, ptp, pte, startva, pv_tofree);
3285 		startva += PAGE_SIZE;
3286 		pte++;
3287 	}
3288 }
3289 
3290 
3291 /*
3292  * pmap_remove_pte: remove a single PTE from a PTP.
3293  *
3294  * => caller must hold pmap's lock
3295  * => PTP must be mapped into KVA
3296  * => PTP should be null if pmap == pmap_kernel()
3297  * => returns true if we removed a mapping
3298  * => must be called with kernel preemption disabled
3299  */
3300 static bool
3301 pmap_remove_pte(struct pmap *pmap, struct vm_page *ptp, pt_entry_t *pte,
3302 		vaddr_t va, struct pv_entry **pv_tofree)
3303 {
3304 	struct pv_entry *pve;
3305 	struct vm_page *pg;
3306 	struct pmap_page *pp;
3307 	pt_entry_t opte;
3308 
3309 	KASSERT(pmap == pmap_kernel() || mutex_owned(pmap->pm_lock));
3310 	KASSERT(kpreempt_disabled());
3311 
3312 	if (!pmap_valid_entry(*pte)) {
3313 		/* VA not mapped. */
3314 		return false;
3315 	}
3316 
3317 	/* Atomically save the old PTE and zap it. */
3318 	opte = pmap_pte_testset(pte, 0);
3319 	if (!pmap_valid_entry(opte)) {
3320 		return false;
3321 	}
3322 
3323 	pmap_exec_account(pmap, va, opte, 0);
3324 	pmap_stats_update_bypte(pmap, 0, opte);
3325 
3326 	if (ptp) {
3327 		/*
3328 		 * Dropping a PTE.  Make sure that the PDE is flushed.
3329 		 */
3330 		ptp->wire_count--;
3331 		if (ptp->wire_count <= 1) {
3332 			opte |= PG_U;
3333 		}
3334 	}
3335 
3336 	if ((opte & PG_U) != 0) {
3337 		pmap_tlb_shootdown(pmap, va, opte, TLBSHOOT_REMOVE_PTE);
3338 	}
3339 
3340 	/*
3341 	 * If we are not on a pv_head list, we are done.
3342 	 */
3343 	if ((opte & PG_PVLIST) == 0) {
3344 #if defined(DIAGNOSTIC) && !defined(DOM0OPS)
3345 		if (PHYS_TO_VM_PAGE(pmap_pte2pa(opte)) != NULL ||
3346 		    pmap_pv_tracked(pmap_pte2pa(opte)) != NULL)
3347 			panic("pmap_remove_pte: managed or pv-tracked page"
3348 			    " without PG_PVLIST for %#"PRIxVADDR, va);
3349 #endif
3350 		return true;
3351 	}
3352 
3353 	if ((pg = PHYS_TO_VM_PAGE(pmap_pte2pa(opte))) != NULL) {
3354 		KASSERT(uvm_page_locked_p(pg));
3355 		pp = VM_PAGE_TO_PP(pg);
3356 	} else if ((pp = pmap_pv_tracked(pmap_pte2pa(opte))) == NULL) {
3357 		paddr_t pa = pmap_pte2pa(opte);
3358 		panic("pmap_remove_pte: PG_PVLIST with pv-untracked page"
3359 		    " va = 0x%"PRIxVADDR
3360 		    " pa = 0x%"PRIxPADDR" (0x%"PRIxPADDR")",
3361 		    va, pa, atop(pa));
3362 	}
3363 
3364 	/* Sync R/M bits. */
3365 	pp->pp_attrs |= opte;
3366 	pve = pmap_remove_pv(pp, ptp, va);
3367 
3368 	if (pve) {
3369 		pve->pve_next = *pv_tofree;
3370 		*pv_tofree = pve;
3371 	}
3372 	return true;
3373 }
3374 
3375 /*
3376  * pmap_remove: mapping removal function.
3377  *
3378  * => caller should not be holding any pmap locks
3379  */
3380 
3381 void
3382 pmap_remove(struct pmap *pmap, vaddr_t sva, vaddr_t eva)
3383 {
3384 	pt_entry_t *ptes;
3385 	pd_entry_t pde;
3386 	pd_entry_t * const *pdes;
3387 	struct pv_entry *pv_tofree = NULL;
3388 	bool result;
3389 	int i;
3390 	paddr_t ptppa;
3391 	vaddr_t blkendva, va = sva;
3392 	struct vm_page *ptp;
3393 	struct pmap *pmap2;
3394 
3395 	kpreempt_disable();
3396 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);	/* locks pmap */
3397 
3398 	/*
3399 	 * removing one page?  take shortcut function.
3400 	 */
3401 
3402 	if (va + PAGE_SIZE == eva) {
3403 		if (pmap_pdes_valid(va, pdes, &pde)) {
3404 
3405 			/* PA of the PTP */
3406 			ptppa = pmap_pte2pa(pde);
3407 
3408 			/* Get PTP if non-kernel mapping. */
3409 			if (pmap != pmap_kernel()) {
3410 				ptp = pmap_find_ptp(pmap, va, ptppa, 1);
3411 				KASSERTMSG(ptp != NULL,
3412 				    "pmap_remove: unmanaged PTP detected");
3413 			} else {
3414 				/* Never free kernel PTPs. */
3415 				ptp = NULL;
3416 			}
3417 
3418 			result = pmap_remove_pte(pmap, ptp,
3419 			    &ptes[pl1_i(va)], va, &pv_tofree);
3420 
3421 			/*
3422 			 * if mapping removed and the PTP is no longer
3423 			 * being used, free it!
3424 			 */
3425 
3426 			if (result && ptp && ptp->wire_count <= 1)
3427 				pmap_free_ptp(pmap, ptp, va, ptes, pdes);
3428 		}
3429 	} else for (/* null */ ; va < eva ; va = blkendva) {
3430 		int lvl;
3431 
3432 		/* determine range of block */
3433 		blkendva = x86_round_pdr(va+1);
3434 		if (blkendva > eva)
3435 			blkendva = eva;
3436 
3437 		/*
3438 		 * Our PTE mappings should never be removed with pmap_remove.
3439 		 *
3440 		 * XXXmaxv: still needed?
3441 		 *
3442 		 * A long term solution is to move the PTEs out of user address
3443 		 * space, and into kernel address space. Then we can set
3444 		 * VM_MAXUSER_ADDRESS to be VM_MAX_ADDRESS.
3445 		 */
3446 		for (i = 0; i < PDP_SIZE; i++) {
3447 			if (pl_i(va, PTP_LEVELS) == PDIR_SLOT_PTE+i)
3448 				panic("PTE space accessed");
3449 		}
3450 
3451 		lvl = pmap_pdes_invalid(va, pdes, &pde);
3452 		if (lvl != 0) {
3453 			/*
3454 			 * skip a range corresponding to an invalid pde.
3455 			 */
3456 			blkendva = (va & ptp_masks[lvl - 1]) + nbpd[lvl - 1];
3457  			continue;
3458 		}
3459 
3460 		/* PA of the PTP */
3461 		ptppa = pmap_pte2pa(pde);
3462 
3463 		/* Get PTP if non-kernel mapping. */
3464 		if (pmap != pmap_kernel()) {
3465 			ptp = pmap_find_ptp(pmap, va, ptppa, 1);
3466 			KASSERTMSG(ptp != NULL,
3467 			    "pmap_remove: unmanaged PTP detected");
3468 		} else {
3469 			/* Never free kernel PTPs. */
3470 			ptp = NULL;
3471 		}
3472 
3473 		pmap_remove_ptes(pmap, ptp, (vaddr_t)&ptes[pl1_i(va)], va,
3474 		    blkendva, &pv_tofree);
3475 
3476 		/* if PTP is no longer being used, free it! */
3477 		if (ptp && ptp->wire_count <= 1) {
3478 			pmap_free_ptp(pmap, ptp, va, ptes, pdes);
3479 		}
3480 	}
3481 	pmap_unmap_ptes(pmap, pmap2);		/* unlock pmap */
3482 	kpreempt_enable();
3483 
3484 	/* Now we free unused PVs */
3485 	if (pv_tofree)
3486 		pmap_free_pvs(pv_tofree);
3487 }
3488 
3489 /*
3490  * pmap_sync_pv: clear pte bits and return the old value of the pte.
3491  *
3492  * => Caller should disable kernel preemption.
3493  * => issues tlb shootdowns if necessary.
3494  */
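
/*
 * The function below updates the PTE with the usual lock-free
 * compare-and-swap retry idiom, which in outline is simply (a sketch
 * extracted from the code that follows):
 *
 *	do {
 *		opte = *ptep;
 *		npte = (new value computed from opte);
 *	} while (pmap_pte_cas(ptep, opte, npte) != opte);
 *
 * i.e. the update is recomputed and retried whenever another CPU (or a
 * racing TLB shootdown) changed the PTE in the meantime.
 */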
3495 
3496 static int
3497 pmap_sync_pv(struct pv_pte *pvpte, pt_entry_t expect, int clearbits,
3498     pt_entry_t *optep)
3499 {
3500 	struct pmap *pmap;
3501 	struct vm_page *ptp;
3502 	vaddr_t va;
3503 	pt_entry_t *ptep;
3504 	pt_entry_t opte;
3505 	pt_entry_t npte;
3506 	bool need_shootdown;
3507 
3508 	ptp = pvpte->pte_ptp;
3509 	va = pvpte->pte_va;
3510 	KASSERT(ptp == NULL || ptp->uobject != NULL);
3511 	KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset);
3512 	pmap = ptp_to_pmap(ptp);
3513 
3514 	KASSERT((expect & ~(PG_FRAME | PG_V)) == 0);
3515 	KASSERT((expect & PG_V) != 0);
3516 	KASSERT(clearbits == ~0 || (clearbits & ~(PG_M | PG_U | PG_RW)) == 0);
3517 	KASSERT(kpreempt_disabled());
3518 
3519 	ptep = pmap_map_pte(pmap, ptp, va);
3520 	do {
3521 		opte = *ptep;
3522 		KASSERT((opte & (PG_M | PG_U)) != PG_M);
3523 		KASSERT((opte & (PG_U | PG_V)) != PG_U);
3524 		KASSERT(opte == 0 || (opte & PG_V) != 0);
3525 		if ((opte & (PG_FRAME | PG_V)) != expect) {
3526 
3527 			/*
3528 			 * we lost a race with a V->P operation like
3529 			 * pmap_remove().  Wait for the competitor to finish
3530 			 * reflecting the pte bits into pp_attrs.
3531 			 *
3532 			 * issue a redundant TLB shootdown so that
3533 			 * we can wait for its completion.
3534 			 */
3535 
3536 			pmap_unmap_pte();
3537 			if (clearbits != 0) {
3538 				pmap_tlb_shootdown(pmap, va,
3539 				    (pmap == pmap_kernel() ? PG_G : 0),
3540 				    TLBSHOOT_SYNC_PV1);
3541 			}
3542 			return EAGAIN;
3543 		}
3544 
3545 		/*
3546 		 * check if there's anything to do on this pte.
3547 		 */
3548 
3549 		if ((opte & clearbits) == 0) {
3550 			need_shootdown = false;
3551 			break;
3552 		}
3553 
3554 		/*
3555 		 * we need a shootdown if the pte is cached. (PG_U)
3556 		 *
3557 		 * ...unless we are clearing only the PG_RW bit and
3558 		 * it isn't cached as RW. (PG_M)
3559 		 */
3560 
3561 		need_shootdown = (opte & PG_U) != 0 &&
3562 		    !(clearbits == PG_RW && (opte & PG_M) == 0);
3563 
3564 		npte = opte & ~clearbits;
3565 
3566 		/*
3567 		 * if we need a shootdown anyway, clear PG_U and PG_M.
3568 		 */
3569 
3570 		if (need_shootdown) {
3571 			npte &= ~(PG_U | PG_M);
3572 		}
3573 		KASSERT((npte & (PG_M | PG_U)) != PG_M);
3574 		KASSERT((npte & (PG_U | PG_V)) != PG_U);
3575 		KASSERT(npte == 0 || (opte & PG_V) != 0);
3576 	} while (pmap_pte_cas(ptep, opte, npte) != opte);
3577 
3578 	if (need_shootdown) {
3579 		pmap_tlb_shootdown(pmap, va, opte, TLBSHOOT_SYNC_PV2);
3580 	}
3581 	pmap_unmap_pte();
3582 
3583 	*optep = opte;
3584 	return 0;
3585 }
3586 
3587 static void
3588 pmap_pp_remove(struct pmap_page *pp, paddr_t pa)
3589 {
3590 	struct pv_pte *pvpte;
3591 	struct pv_entry *killlist = NULL;
3592 	struct vm_page *ptp;
3593 	pt_entry_t expect;
3594 	int count;
3595 
3596 	expect = pmap_pa2pte(pa) | PG_V;
3597 	count = SPINLOCK_BACKOFF_MIN;
3598 	kpreempt_disable();
3599 startover:
3600 	while ((pvpte = pv_pte_first(pp)) != NULL) {
3601 		struct pmap *pmap;
3602 		struct pv_entry *pve;
3603 		pt_entry_t opte;
3604 		vaddr_t va;
3605 		int error;
3606 
3607 		/*
3608 		 * add a reference to the pmap before clearing the pte.
3609 		 * otherwise the pmap can disappear behind us.
3610 		 */
3611 
3612 		ptp = pvpte->pte_ptp;
3613 		pmap = ptp_to_pmap(ptp);
3614 		if (ptp != NULL) {
3615 			pmap_reference(pmap);
3616 		}
3617 
3618 		error = pmap_sync_pv(pvpte, expect, ~0, &opte);
3619 		if (error == EAGAIN) {
3620 			int hold_count;
3621 			KERNEL_UNLOCK_ALL(curlwp, &hold_count);
3622 			if (ptp != NULL) {
3623 				pmap_destroy(pmap);
3624 			}
3625 			SPINLOCK_BACKOFF(count);
3626 			KERNEL_LOCK(hold_count, curlwp);
3627 			goto startover;
3628 		}
3629 
3630 		pp->pp_attrs |= opte;
3631 		va = pvpte->pte_va;
3632 		pve = pmap_remove_pv(pp, ptp, va);
3633 
3634 		/* update the PTP reference count.  free if last reference. */
3635 		if (ptp != NULL) {
3636 			struct pmap *pmap2;
3637 			pt_entry_t *ptes;
3638 			pd_entry_t * const *pdes;
3639 
3640 			KASSERT(pmap != pmap_kernel());
3641 
3642 			pmap_tlb_shootnow();
3643 			pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
3644 			pmap_stats_update_bypte(pmap, 0, opte);
3645 			ptp->wire_count--;
3646 			if (ptp->wire_count <= 1) {
3647 				pmap_free_ptp(pmap, ptp, va, ptes, pdes);
3648 			}
3649 			pmap_unmap_ptes(pmap, pmap2);
3650 			pmap_destroy(pmap);
3651 		} else {
3652 			KASSERT(pmap == pmap_kernel());
3653 			pmap_stats_update_bypte(pmap, 0, opte);
3654 		}
3655 
3656 		if (pve != NULL) {
3657 			pve->pve_next = killlist;	/* mark it for death */
3658 			killlist = pve;
3659 		}
3660 	}
3661 	pmap_tlb_shootnow();
3662 	kpreempt_enable();
3663 
3664 	/* Now free unused pvs. */
3665 	pmap_free_pvs(killlist);
3666 }
3667 
3668 /*
3669  * pmap_page_remove: remove a managed vm_page from all pmaps that map it
3670  *
3671  * => R/M bits are sync'd back to attrs
3672  */
3673 
3674 void
3675 pmap_page_remove(struct vm_page *pg)
3676 {
3677 	struct pmap_page *pp;
3678 	paddr_t pa;
3679 
3680 	KASSERT(uvm_page_locked_p(pg));
3681 
3682 	pp = VM_PAGE_TO_PP(pg);
3683 	pa = VM_PAGE_TO_PHYS(pg);
3684 	pmap_pp_remove(pp, pa);
3685 }
3686 
3687 /*
3688  * pmap_pv_remove: remove an unmanaged pv-tracked page from all pmaps
3689  *	that map it
3690  */
3691 
3692 void
3693 pmap_pv_remove(paddr_t pa)
3694 {
3695 	struct pmap_page *pp;
3696 
3697 	pp = pmap_pv_tracked(pa);
3698 	if (pp == NULL)
3699 		panic("pmap_pv_remove: page not pv-tracked: 0x%"PRIxPADDR,
3700 		    pa);
3701 	pmap_pp_remove(pp, pa);
3702 }
3703 
3704 /*
3705  * p m a p   a t t r i b u t e  f u n c t i o n s
3706  * functions that test/change managed page's attributes
3707  * since a page can be mapped multiple times we must check each PTE that
3708  * maps it by going down the pv lists.
3709  */
3710 
3711 /*
3712  * pmap_test_attrs: test a page's attributes
3713  */
3714 
3715 bool
3716 pmap_test_attrs(struct vm_page *pg, unsigned testbits)
3717 {
3718 	struct pmap_page *pp;
3719 	struct pv_pte *pvpte;
3720 	pt_entry_t expect;
3721 	u_int result;
3722 
3723 	KASSERT(uvm_page_locked_p(pg));
3724 
3725 	pp = VM_PAGE_TO_PP(pg);
3726 	if ((pp->pp_attrs & testbits) != 0) {
3727 		return true;
3728 	}
3729 	expect = pmap_pa2pte(VM_PAGE_TO_PHYS(pg)) | PG_V;
3730 	kpreempt_disable();
3731 	for (pvpte = pv_pte_first(pp); pvpte; pvpte = pv_pte_next(pp, pvpte)) {
3732 		pt_entry_t opte;
3733 		int error;
3734 
3735 		if ((pp->pp_attrs & testbits) != 0) {
3736 			break;
3737 		}
3738 		error = pmap_sync_pv(pvpte, expect, 0, &opte);
3739 		if (error == 0) {
3740 			pp->pp_attrs |= opte;
3741 		}
3742 	}
3743 	result = pp->pp_attrs & testbits;
3744 	kpreempt_enable();
3745 
3746 	/*
3747 	 * note that we will have exited the for loop early (with a
3748 	 * non-NULL pvpte) if we found the bits we are testing for.
3749 	 */
3750 
3751 	return result != 0;
3752 }
3753 
3754 static bool
3755 pmap_pp_clear_attrs(struct pmap_page *pp, paddr_t pa, unsigned clearbits)
3756 {
3757 	struct pv_pte *pvpte;
3758 	u_int result;
3759 	pt_entry_t expect;
3760 	int count;
3761 
3762 	expect = pmap_pa2pte(pa) | PG_V;
3763 	count = SPINLOCK_BACKOFF_MIN;
3764 	kpreempt_disable();
3765 startover:
3766 	for (pvpte = pv_pte_first(pp); pvpte; pvpte = pv_pte_next(pp, pvpte)) {
3767 		pt_entry_t opte;
3768 		int error;
3769 
3770 		error = pmap_sync_pv(pvpte, expect, clearbits, &opte);
3771 		if (error == EAGAIN) {
3772 			int hold_count;
3773 			KERNEL_UNLOCK_ALL(curlwp, &hold_count);
3774 			SPINLOCK_BACKOFF(count);
3775 			KERNEL_LOCK(hold_count, curlwp);
3776 			goto startover;
3777 		}
3778 		pp->pp_attrs |= opte;
3779 	}
3780 	result = pp->pp_attrs & clearbits;
3781 	pp->pp_attrs &= ~clearbits;
3782 	pmap_tlb_shootnow();
3783 	kpreempt_enable();
3784 
3785 	return result != 0;
3786 }
3787 
3788 /*
3789  * pmap_clear_attrs: clear the specified attribute for a page.
3790  *
3791  * => we return true if we cleared one of the bits we were asked to
3792  */
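
/*
 * A usage sketch, for illustration only (the MI entry points such as
 * pmap_clear_modify()/pmap_clear_reference() reduce to calls of this
 * form; see pmap.h for the exact wrappers):
 *
 *	bool modified   = pmap_clear_attrs(pg, PG_M);
 *	bool referenced = pmap_clear_attrs(pg, PG_U);
 */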
3793 
3794 bool
3795 pmap_clear_attrs(struct vm_page *pg, unsigned clearbits)
3796 {
3797 	struct pmap_page *pp;
3798 	paddr_t pa;
3799 
3800 	KASSERT(uvm_page_locked_p(pg));
3801 
3802 	pp = VM_PAGE_TO_PP(pg);
3803 	pa = VM_PAGE_TO_PHYS(pg);
3804 
3805 	return pmap_pp_clear_attrs(pp, pa, clearbits);
3806 }
3807 
3808 /*
3809  * pmap_pv_clear_attrs: clear the specified attributes for an unmanaged
3810  *	pv-tracked page.
3811  */
3812 
3813 bool
3814 pmap_pv_clear_attrs(paddr_t pa, unsigned clearbits)
3815 {
3816 	struct pmap_page *pp;
3817 
3818 	pp = pmap_pv_tracked(pa);
3819 	if (pp == NULL)
3820 		panic("pmap_pv_clear_attrs: page not pv-tracked: 0x%"PRIxPADDR,
3821 		    pa);
3822 
3823 	return pmap_pp_clear_attrs(pp, pa, clearbits);
3824 }
3825 
3826 /*
3827  * p m a p   p r o t e c t i o n   f u n c t i o n s
3828  */
3829 
3830 /*
3831  * pmap_page_protect: change the protection of all recorded mappings
3832  *	of a managed page
3833  *
3834  * => NOTE: this is an inline function in pmap.h
3835  */
3836 
3837 /* see pmap.h */
3838 
3839 /*
3840  * pmap_pv_protect: change the protection of all recorded mappings
3841  *	of an unmanaged pv-tracked page
3842  *
3843  * => NOTE: this is an inline function in pmap.h
3844  */
3845 
3846 /* see pmap.h */
3847 
3848 /*
3849  * pmap_protect: set the protection of the pages in a pmap
3850  *
3851  * => NOTE: this is an inline function in pmap.h
3852  */
3853 
3854 /* see pmap.h */
3855 
3856 /*
3857  * pmap_write_protect: write-protect pages in a pmap.
3858  */
3859 void
3860 pmap_write_protect(struct pmap *pmap, vaddr_t sva, vaddr_t eva, vm_prot_t prot)
3861 {
3862 	pt_entry_t bit_rem, bit_put;
3863 	pt_entry_t *ptes;
3864 	pt_entry_t * const *pdes;
3865 	struct pmap *pmap2;
3866 	vaddr_t blockend, va;
3867 
3868 	KASSERT(curlwp->l_md.md_gc_pmap != pmap);
3869 
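	/*
	 * Compute the PTE bits to remove (PG_RW when write permission is
	 * revoked) and to add (the NX bit when execute permission is
	 * revoked).
	 */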
3870 	bit_rem = 0;
3871 	if (!(prot & VM_PROT_WRITE))
3872 		bit_rem = PG_RW;
3873 
3874 	bit_put = 0;
3875 	if (!(prot & VM_PROT_EXECUTE))
3876 		bit_put = pmap_pg_nx;
3877 
3878 	sva &= PG_FRAME;
3879 	eva &= PG_FRAME;
3880 
3881 	/* Acquire pmap. */
3882 	kpreempt_disable();
3883 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
3884 
3885 	for (va = sva ; va < eva; va = blockend) {
3886 		pt_entry_t *spte, *epte;
3887 		int i;
3888 
3889 		blockend = x86_round_pdr(va + 1);
3890 		if (blockend > eva)
3891 			blockend = eva;
3892 
3893 		/*
3894 		 * Our PTE mappings should never be write-protected.
3895 		 *
3896 		 * XXXmaxv: still needed?
3897 		 *
3898 		 * A long term solution is to move the PTEs out of user address
3899 		 * space, and into kernel address space. Then we can set
3900 		 * VM_MAXUSER_ADDRESS to be VM_MAX_ADDRESS.
3901 		 */
3902 		for (i = 0; i < PDP_SIZE; i++) {
3903 			if (pl_i(va, PTP_LEVELS) == PDIR_SLOT_PTE+i)
3904 				panic("PTE space accessed");
3905 		}
3906 
3907 		/* Is it a valid block? */
3908 		if (!pmap_pdes_valid(va, pdes, NULL)) {
3909 			continue;
3910 		}
3911 		KASSERT(va < VM_MAXUSER_ADDRESS || va >= VM_MAX_ADDRESS);
3912 
3913 		spte = &ptes[pl1_i(va)];
3914 		epte = &ptes[pl1_i(blockend)];
3915 
3916 		for (/* */; spte < epte; spte++) {
3917 			pt_entry_t opte, npte;
3918 
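			/*
			 * Update the PTE atomically, retrying if it changes
			 * under us; invalid entries are skipped.
			 */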
3919 			do {
3920 				opte = *spte;
3921 				if (!pmap_valid_entry(opte)) {
3922 					goto next;
3923 				}
3924 				npte = (opte & ~bit_rem) | bit_put;
3925 			} while (pmap_pte_cas(spte, opte, npte) != opte);
3926 
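			/*
			 * Only a mapping marked modified can be cached
			 * writable in a TLB, so a shootdown is needed only
			 * in that case.
			 */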
3927 			if ((opte & PG_M) != 0) {
3928 				vaddr_t tva = x86_ptob(spte - ptes);
3929 				pmap_tlb_shootdown(pmap, tva, opte,
3930 				    TLBSHOOT_WRITE_PROTECT);
3931 			}
3932 next:;
3933 		}
3934 	}
3935 
3936 	/* Release pmap. */
3937 	pmap_unmap_ptes(pmap, pmap2);
3938 	kpreempt_enable();
3939 }
3940 
3941 /*
3942  * pmap_unwire: clear the wired bit in the PTE.
3943  *
3944  * => Mapping should already be present.
3945  */
3946 void
3947 pmap_unwire(struct pmap *pmap, vaddr_t va)
3948 {
3949 	pt_entry_t *ptes, *ptep, opte;
3950 	pd_entry_t * const *pdes;
3951 	struct pmap *pmap2;
3952 
3953 	/* Acquire pmap. */
3954 	kpreempt_disable();
3955 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
3956 
3957 	if (!pmap_pdes_valid(va, pdes, NULL)) {
3958 		panic("pmap_unwire: invalid PDE");
3959 	}
3960 
3961 	ptep = &ptes[pl1_i(va)];
3962 	opte = *ptep;
3963 	KASSERT(pmap_valid_entry(opte));
3964 
3965 	if (opte & PG_W) {
3966 		pt_entry_t npte = opte & ~PG_W;
3967 
3968 		opte = pmap_pte_testset(ptep, npte);
3969 		pmap_stats_update_bypte(pmap, npte, opte);
3970 	} else {
3971 		printf("pmap_unwire: wiring for pmap %p va 0x%lx "
3972 		    "did not change!\n", pmap, va);
3973 	}
3974 
3975 	/* Release pmap. */
3976 	pmap_unmap_ptes(pmap, pmap2);
3977 	kpreempt_enable();
3978 }
3979 
3980 /*
3981  * pmap_copy: copy mappings from one pmap to another
3982  *
3983  * => optional function
3984  * void pmap_copy(dst_pmap, src_pmap, dst_addr, len, src_addr)
3985  */
3986 
3987 /*
3988  * defined as macro in pmap.h
3989  */
3990 
3991 __strict_weak_alias(pmap_enter, pmap_enter_default);
3992 
3993 int
3994 pmap_enter_default(pmap_t pmap, vaddr_t va, paddr_t pa, vm_prot_t prot,
3995     u_int flags)
3996 {
3997 	return pmap_enter_ma(pmap, va, pa, pa, prot, flags, 0);
3998 }
3999 
4000 /*
4001  * pmap_enter: enter a mapping into a pmap
4002  *
4003  * => must be done "now" ... no lazy-evaluation
4004  * => we set pmap => pv_head locking
4005  */
4006 int
4007 pmap_enter_ma(struct pmap *pmap, vaddr_t va, paddr_t ma, paddr_t pa,
4008 	   vm_prot_t prot, u_int flags, int domid)
4009 {
4010 	pt_entry_t *ptes, opte, npte;
4011 	pt_entry_t *ptep;
4012 	pd_entry_t * const *pdes;
4013 	struct vm_page *ptp;
4014 	struct vm_page *new_pg, *old_pg;
4015 	struct pmap_page *new_pp, *old_pp;
4016 	struct pv_entry *old_pve = NULL;
4017 	struct pv_entry *new_pve;
4018 	struct pv_entry *new_sparepve;
4019 	int error;
4020 	bool wired = (flags & PMAP_WIRED) != 0;
4021 	struct pmap *pmap2;
4022 
4023 	KASSERT(pmap_initialized);
4024 	KASSERT(curlwp->l_md.md_gc_pmap != pmap);
4025 	KASSERT(va < VM_MAX_KERNEL_ADDRESS);
4026 	KASSERTMSG(va != (vaddr_t)PDP_BASE,
4027 	    "pmap_enter: trying to map over PDP!");
4028 	KASSERTMSG(va < VM_MIN_KERNEL_ADDRESS ||
4029 	    pmap_valid_entry(pmap->pm_pdir[pl_i(va, PTP_LEVELS)]),
4030 	    "pmap_enter: missing kernel PTP for VA %lx!", va);
4031 
4032 #ifdef XEN
4033 	KASSERT(domid == DOMID_SELF || pa == 0);
4034 #endif /* XEN */
4035 
4036 	npte = ma | protection_codes[prot] | PG_V;
4037 	npte |= pmap_pat_flags(flags);
4038 	if (wired)
4039 	        npte |= PG_W;
4040 	if (va < VM_MAXUSER_ADDRESS)
4041 		npte |= PG_u;
4042 	else if (va < VM_MAX_ADDRESS)
4043 		panic("PTE space accessed");	/* XXXmaxv: no longer needed? */
4044 	else
4045 		npte |= PG_k;
4046 	if (pmap == pmap_kernel())
4047 		npte |= pmap_pg_g;
4048 	if (flags & VM_PROT_ALL) {
4049 		npte |= PG_U;
4050 		if (flags & VM_PROT_WRITE) {
4051 			KASSERT((npte & PG_RW) != 0);
4052 			npte |= PG_M;
4053 		}
4054 	}
4055 
4056 #ifdef XEN
4057 	if (domid != DOMID_SELF)
4058 		new_pg = NULL;
4059 	else
4060 #endif
4061 		new_pg = PHYS_TO_VM_PAGE(pa);
4062 	if (new_pg != NULL) {
4063 		/* This is a managed page */
4064 		npte |= PG_PVLIST;
4065 		new_pp = VM_PAGE_TO_PP(new_pg);
4066 	} else if ((new_pp = pmap_pv_tracked(pa)) != NULL) {
4067 		/* This is an unmanaged pv-tracked page */
4068 		npte |= PG_PVLIST;
4069 	} else {
4070 		new_pp = NULL;
4071 	}
4072 
4073 	/* allocate the pv_entries we may need: a primary and a spare. */
4074 	new_pve = pool_cache_get(&pmap_pv_cache, PR_NOWAIT);
4075 	new_sparepve = pool_cache_get(&pmap_pv_cache, PR_NOWAIT);
4076 	if (new_pve == NULL || new_sparepve == NULL) {
4077 		if (flags & PMAP_CANFAIL) {
4078 			error = ENOMEM;
4079 			goto out2;
4080 		}
4081 		panic("pmap_enter: pve allocation failed");
4082 	}
4083 
4084 	kpreempt_disable();
4085 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);	/* locks pmap */
4086 	if (pmap == pmap_kernel()) {
4087 		ptp = NULL;
4088 	} else {
4089 		ptp = pmap_get_ptp(pmap, va, pdes);
4090 		if (ptp == NULL) {
4091 			pmap_unmap_ptes(pmap, pmap2);
4092 			if (flags & PMAP_CANFAIL) {
4093 				error = ENOMEM;
4094 				goto out;
4095 			}
4096 			panic("pmap_enter: get ptp failed");
4097 		}
4098 	}
4099 
4100 	/*
4101 	 * update the pte.
4102 	 */
4103 
4104 	ptep = &ptes[pl1_i(va)];
4105 	do {
4106 		opte = *ptep;
4107 
4108 		/*
4109 		 * if the same page, inherit PG_U and PG_M.
4110 		 */
4111 		if (((opte ^ npte) & (PG_FRAME | PG_V)) == 0) {
4112 			npte |= opte & (PG_U | PG_M);
4113 		}
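		/*
		 * Xen: a PTE mapping a foreign domain's page cannot be
		 * written with a plain compare-and-swap; emulate one with
		 * xpq_update_foreign() at splvm().
		 */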
4114 #if defined(XEN)
4115 		if (domid != DOMID_SELF) {
4116 			/* pmap_pte_cas with error handling */
4117 			int s = splvm();
4118 			if (opte != *ptep) {
4119 				splx(s);
4120 				continue;
4121 			}
4122 			error = xpq_update_foreign(
4123 			    vtomach((vaddr_t)ptep), npte, domid);
4124 			splx(s);
4125 			if (error) {
4126 				if (ptp != NULL && ptp->wire_count <= 1) {
4127 					pmap_free_ptp(pmap, ptp, va, ptes, pdes);
4128 				}
4129 				pmap_unmap_ptes(pmap, pmap2);
4130 				goto out;
4131 			}
4132 			break;
4133 		}
4134 #endif /* defined(XEN) */
4135 	} while (pmap_pte_cas(ptep, opte, npte) != opte);
4136 
4137 	/*
4138 	 * update statistics and PTP's reference count.
4139 	 */
4140 
4141 	pmap_stats_update_bypte(pmap, npte, opte);
4142 	if (ptp != NULL && !pmap_valid_entry(opte)) {
4143 		ptp->wire_count++;
4144 	}
4145 	KASSERT(ptp == NULL || ptp->wire_count > 1);
4146 
4147 	/*
4148 	 * if the same page, we can skip pv_entry handling.
4149 	 */
4150 
4151 	if (((opte ^ npte) & (PG_FRAME | PG_V)) == 0) {
4152 		KASSERT(((opte ^ npte) & PG_PVLIST) == 0);
4153 		goto same_pa;
4154 	}
4155 
4156 	/*
4157 	 * if old page is pv-tracked, remove pv_entry from its list.
4158 	 */
4159 
4160 	if ((~opte & (PG_V | PG_PVLIST)) == 0) {
4161 		if ((old_pg = PHYS_TO_VM_PAGE(pmap_pte2pa(opte))) != NULL) {
4162 			KASSERT(uvm_page_locked_p(old_pg));
4163 			old_pp = VM_PAGE_TO_PP(old_pg);
4164 		} else if ((old_pp = pmap_pv_tracked(pmap_pte2pa(opte)))
4165 		    == NULL) {
4166 			pa = pmap_pte2pa(opte);
4167 			panic("pmap_enter: PG_PVLIST with pv-untracked page"
4168 			    " va = 0x%"PRIxVADDR
4169 			    " pa = 0x%" PRIxPADDR " (0x%" PRIxPADDR ")",
4170 			    va, pa, atop(pa));
4171 		}
4172 
4173 		old_pve = pmap_remove_pv(old_pp, ptp, va);
4174 		old_pp->pp_attrs |= opte;
4175 	}
4176 
4177 	/*
4178 	 * if new page is pv-tracked, insert pv_entry into its list.
4179 	 */
4180 
4181 	if (new_pp) {
4182 		new_pve = pmap_enter_pv(new_pp, new_pve, &new_sparepve, ptp, va);
4183 	}
4184 
4185 same_pa:
4186 	pmap_unmap_ptes(pmap, pmap2);
4187 
4188 	/*
4189 	 * shootdown tlb if necessary.
4190 	 */
4191 
4192 	if ((~opte & (PG_V | PG_U)) == 0 &&
4193 	    ((opte ^ npte) & (PG_FRAME | PG_RW)) != 0) {
4194 		pmap_tlb_shootdown(pmap, va, opte, TLBSHOOT_ENTER);
4195 	}
4196 
4197 	error = 0;
4198 out:
4199 	kpreempt_enable();
4200 out2:
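	/*
	 * Free the pv_entry removed from the old page's list, if any, and
	 * any preallocated entries that were not consumed above.
	 */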
4201 	if (old_pve != NULL) {
4202 		pool_cache_put(&pmap_pv_cache, old_pve);
4203 	}
4204 	if (new_pve != NULL) {
4205 		pool_cache_put(&pmap_pv_cache, new_pve);
4206 	}
4207 	if (new_sparepve != NULL) {
4208 		pool_cache_put(&pmap_pv_cache, new_sparepve);
4209 	}
4210 
4211 	return error;
4212 }
4213 
4214 static paddr_t
4215 pmap_get_physpage(void)
4216 {
4217 	struct vm_page *ptp;
4218 	struct pmap *kpm = pmap_kernel();
4219 	paddr_t pa;
4220 
4221 	if (!uvm.page_init_done) {
4222 		/*
4223 		 * We're growing the kernel pmap early (from
4224 		 * uvm_pageboot_alloc()). This case must be
4225 		 * handled a little differently.
4226 		 */
4227 
4228 		if (!uvm_page_physget(&pa))
4229 			panic("pmap_get_physpage: out of memory");
4230 #if defined(__HAVE_DIRECT_MAP)
4231 		pagezero(PMAP_DIRECT_MAP(pa));
4232 #else
4233 #if defined(XEN)
4234 		if (XEN_VERSION_SUPPORTED(3, 4)) {
4235 			xen_pagezero(pa);
4236 			return pa;
4237 		}
4238 #endif
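		/*
		 * No direct map: temporarily map the new page at early_zerop
		 * using the reserved early PTE and zero it by hand.
		 */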
4239 		kpreempt_disable();
4240 		pmap_pte_set(early_zero_pte, pmap_pa2pte(pa) | PG_V |
4241 		    PG_RW | pmap_pg_nx | PG_k);
4242 		pmap_pte_flush();
4243 		pmap_update_pg((vaddr_t)early_zerop);
4244 		memset(early_zerop, 0, PAGE_SIZE);
4245 #if defined(DIAGNOSTIC) || defined(XEN)
4246 		pmap_pte_set(early_zero_pte, 0);
4247 		pmap_pte_flush();
4248 #endif /* defined(DIAGNOSTIC) || defined(XEN) */
4249 		kpreempt_enable();
4250 #endif /* defined(__HAVE_DIRECT_MAP) */
4251 	} else {
4252 		/* XXX */
4253 		ptp = uvm_pagealloc(NULL, 0, NULL,
4254 				    UVM_PGA_USERESERVE|UVM_PGA_ZERO);
4255 		if (ptp == NULL)
4256 			panic("pmap_get_physpage: out of memory");
4257 		ptp->flags &= ~PG_BUSY;
4258 		ptp->wire_count = 1;
4259 		pa = VM_PAGE_TO_PHYS(ptp);
4260 	}
4261 	pmap_stats_update(kpm, 1, 0);
4262 
4263 	return pa;
4264 }
4265 
4266 /*
4267  * Expand the page tree with the specified amount of PTPs, mapping virtual
4268  * addresses starting at kva. We populate all the levels but the last one
4269  * (L1). The nodes of the tree are created as RWX, but the pages covered
4270  * will be kentered in L1, with proper permissions.
4271  *
4272  * Used only by pmap_growkernel.
4273  */
4274 static void
4275 pmap_alloc_level(vaddr_t kva, long *needed_ptps)
4276 {
4277 	unsigned long i;
4278 	paddr_t pa;
4279 	unsigned long index, endindex;
4280 	int level;
4281 	pd_entry_t *pdep;
4282 #ifdef XEN
4283 	int s = splvm(); /* protect xpq_* */
4284 #endif
4285 
4286 	for (level = PTP_LEVELS; level > 1; level--) {
4287 		if (level == PTP_LEVELS)
4288 			pdep = pmap_kernel()->pm_pdir;
4289 		else
4290 			pdep = normal_pdes[level - 2];
4291 		index = pl_i_roundup(kva, level);
4292 		endindex = index + needed_ptps[level - 1] - 1;
4293 
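		/*
		 * index..endindex are the slots at this level that must be
		 * populated with freshly allocated PTPs to cover the new
		 * range.
		 */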
4294 		for (i = index; i <= endindex; i++) {
4295 			pt_entry_t pte;
4296 
4297 			KASSERT(!pmap_valid_entry(pdep[i]));
4298 			pa = pmap_get_physpage();
4299 			pte = pmap_pa2pte(pa) | PG_k | PG_V | PG_RW;
4300 			pmap_pte_set(&pdep[i], pte);
4301 
4302 #if defined(XEN) && (defined(PAE) || defined(__x86_64__))
4303 			if (level == PTP_LEVELS && i >= PDIR_SLOT_KERN) {
4304 				if (__predict_true(
4305 				    cpu_info_primary.ci_flags & CPUF_PRESENT)) {
4306 					/* update per-cpu PMDs on all cpus */
4307 					xen_kpm_sync(pmap_kernel(), i);
4308 				} else {
4309 					/*
4310 					 * too early; update primary CPU
4311 					 * PMD only (without locks)
4312 					 */
4313 #ifdef PAE
4314 					pd_entry_t *cpu_pdep =
4315 					    &cpu_info_primary.ci_kpm_pdir[l2tol2(i)];
4316 #endif
4317 #ifdef __x86_64__
4318 					pd_entry_t *cpu_pdep =
4319 						&cpu_info_primary.ci_kpm_pdir[i];
4320 #endif
4321 					pmap_pte_set(cpu_pdep, pte);
4322 				}
4323 			}
4324 #endif /* XEN && (PAE || __x86_64__) */
4325 
4326 			KASSERT(level != PTP_LEVELS || nkptp[level - 1] +
4327 			    pl_i(VM_MIN_KERNEL_ADDRESS, level) == i);
4328 			nkptp[level - 1]++;
4329 		}
4330 		pmap_pte_flush();
4331 	}
4332 #ifdef XEN
4333 	splx(s);
4334 #endif
4335 }
4336 
4337 /*
4338  * pmap_growkernel: increase usage of KVM space.
4339  *
4340  * => we allocate new PTPs for the kernel and install them in all
4341  *    the pmaps on the system.
4342  */
4343 
4344 vaddr_t
4345 pmap_growkernel(vaddr_t maxkvaddr)
4346 {
4347 	struct pmap *kpm = pmap_kernel();
4348 #if !defined(XEN) || !defined(__x86_64__)
4349 	struct pmap *pm;
4350 	long old;
4351 #endif
4352 	int s, i;
4353 	long needed_kptp[PTP_LEVELS], target_nptp;
4354 	bool invalidate = false;
4355 
4356 	s = splvm();	/* to be safe */
4357 	mutex_enter(kpm->pm_lock);
4358 
4359 	if (maxkvaddr <= pmap_maxkvaddr) {
4360 		mutex_exit(kpm->pm_lock);
4361 		splx(s);
4362 		return pmap_maxkvaddr;
4363 	}
4364 
4365 	maxkvaddr = x86_round_pdr(maxkvaddr);
4366 #if !defined(XEN) || !defined(__x86_64__)
4367 	old = nkptp[PTP_LEVELS - 1];
4368 #endif
4369 
4370 	/* Initialize needed_kptp. */
4371 	for (i = PTP_LEVELS - 1; i >= 1; i--) {
4372 		target_nptp = pl_i_roundup(maxkvaddr, i + 1) -
4373 		    pl_i_roundup(VM_MIN_KERNEL_ADDRESS, i + 1);
4374 
4375 		if (target_nptp > nkptpmax[i])
4376 			panic("out of KVA space");
4377 		KASSERT(target_nptp >= nkptp[i]);
4378 		needed_kptp[i] = target_nptp - nkptp[i];
4379 	}
4380 
4381 	pmap_alloc_level(pmap_maxkvaddr, needed_kptp);
4382 
4383 	/*
4384 	 * If the number of top level entries changed, update all pmaps.
4385 	 */
4386 	if (needed_kptp[PTP_LEVELS - 1] != 0) {
4387 #ifdef XEN
4388 #ifdef __x86_64__
4389 		/* nothing, kernel entries are never entered in user pmap */
4390 #else /* __x86_64__ */
4391 		mutex_enter(&pmaps_lock);
4392 		LIST_FOREACH(pm, &pmaps, pm_list) {
4393 			int pdkidx;
4394 			for (pdkidx = PDIR_SLOT_KERN + old;
4395 			    pdkidx < PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1];
4396 			    pdkidx++) {
4397 				pmap_pte_set(&pm->pm_pdir[pdkidx],
4398 				    kpm->pm_pdir[pdkidx]);
4399 			}
4400 			pmap_pte_flush();
4401 		}
4402 		mutex_exit(&pmaps_lock);
4403 #endif /* __x86_64__ */
4404 #else /* XEN */
4405 		unsigned newpdes;
4406 		newpdes = nkptp[PTP_LEVELS - 1] - old;
4407 		mutex_enter(&pmaps_lock);
4408 		LIST_FOREACH(pm, &pmaps, pm_list) {
4409 			memcpy(&pm->pm_pdir[PDIR_SLOT_KERN + old],
4410 			    &kpm->pm_pdir[PDIR_SLOT_KERN + old],
4411 			    newpdes * sizeof (pd_entry_t));
4412 		}
4413 		mutex_exit(&pmaps_lock);
4414 #endif
4415 		invalidate = true;
4416 	}
4417 	pmap_maxkvaddr = maxkvaddr;
4418 	mutex_exit(kpm->pm_lock);
4419 	splx(s);
4420 
4421 	if (invalidate && pmap_initialized) {
4422 		/* Invalidate the PDP cache. */
4423 		pool_cache_invalidate(&pmap_pdp_cache);
4424 	}
4425 
4426 	return maxkvaddr;
4427 }
4428 
4429 #ifdef DEBUG
4430 void pmap_dump(struct pmap *, vaddr_t, vaddr_t);
4431 
4432 /*
4433  * pmap_dump: dump all the mappings from a pmap
4434  *
4435  * => caller should not be holding any pmap locks
4436  */
4437 
4438 void
4439 pmap_dump(struct pmap *pmap, vaddr_t sva, vaddr_t eva)
4440 {
4441 	pt_entry_t *ptes, *pte;
4442 	pd_entry_t * const *pdes;
4443 	struct pmap *pmap2;
4444 	vaddr_t blkendva;
4445 
4446 	/*
4447 	 * if end is out of range, truncate.
4448 	 * if (end <= start), update to max.
4449 	 */
4450 
4451 	if (eva > VM_MAXUSER_ADDRESS || eva <= sva)
4452 		eva = VM_MAXUSER_ADDRESS;
4453 
4454 	/*
4455 	 * we lock in the pmap => pv_head direction
4456 	 */
4457 
4458 	kpreempt_disable();
4459 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);	/* locks pmap */
4460 
4461 	/*
4462 	 * dumping a range of pages: we dump in PTP sized blocks (4MB)
4463 	 */
4464 
4465 	for (/* null */ ; sva < eva ; sva = blkendva) {
4466 
4467 		/* determine range of block */
4468 		blkendva = x86_round_pdr(sva+1);
4469 		if (blkendva > eva)
4470 			blkendva = eva;
4471 
4472 		/* valid block? */
4473 		if (!pmap_pdes_valid(sva, pdes, NULL))
4474 			continue;
4475 
4476 		pte = &ptes[pl1_i(sva)];
4477 		for (/* null */; sva < blkendva ; sva += PAGE_SIZE, pte++) {
4478 			if (!pmap_valid_entry(*pte))
4479 				continue;
4480 			printf("va %#" PRIxVADDR " -> pa %#" PRIxPADDR
4481 			    " (pte=%#" PRIxPADDR ")\n",
4482 			    sva, (paddr_t)pmap_pte2pa(*pte), (paddr_t)*pte);
4483 		}
4484 	}
4485 	pmap_unmap_ptes(pmap, pmap2);
4486 	kpreempt_enable();
4487 }
4488 #endif
4489 
4490 /*
4491  * pmap_update: process deferred invalidations and frees.
4492  */
4493 
4494 void
4495 pmap_update(struct pmap *pmap)
4496 {
4497 	struct vm_page *empty_ptps;
4498 	lwp_t *l = curlwp;
4499 
4500 	/*
4501 	 * If we have torn down this pmap, invalidate non-global TLB
4502 	 * entries on any processors using it.
4503 	 */
4504 	kpreempt_disable();
4505 	if (__predict_false(l->l_md.md_gc_pmap == pmap)) {
4506 		l->l_md.md_gc_pmap = NULL;
4507 		pmap_tlb_shootdown(pmap, (vaddr_t)-1LL, 0, TLBSHOOT_UPDATE);
4508 	}
4509 	/*
4510 	 * Initiate any pending TLB shootdowns.  Wait for them to
4511 	 * complete before returning control to the caller.
4512 	 */
4513 	pmap_tlb_shootnow();
4514 	kpreempt_enable();
4515 
4516 	/*
4517 	 * Now that shootdowns are complete, process deferred frees,
4518 	 * but not from interrupt context.
4519 	 */
4520 	if (l->l_md.md_gc_ptp != NULL) {
4521 		KASSERT((l->l_pflag & LP_INTR) == 0);
4522 		if (cpu_intr_p()) {
4523 			return;
4524 		}
4525 		empty_ptps = l->l_md.md_gc_ptp;
4526 		l->l_md.md_gc_ptp = NULL;
4527 		pmap_free_ptps(empty_ptps);
4528 	}
4529 }
4530 
4531 #if PTP_LEVELS > 4
4532 #error "Unsupported number of page table mappings"
4533 #endif
4534 
4535 paddr_t
4536 pmap_init_tmp_pgtbl(paddr_t pg)
4537 {
4538 	static bool maps_loaded;
4539 	static const paddr_t x86_tmp_pml_paddr[] = {
4540 	    4 * PAGE_SIZE,	/* L1 */
4541 	    5 * PAGE_SIZE,	/* L2 */
4542 	    6 * PAGE_SIZE,	/* L3 */
4543 	    7 * PAGE_SIZE	/* L4 */
4544 	};
4545 	static vaddr_t x86_tmp_pml_vaddr[] = { 0, 0, 0, 0 };
4546 
4547 	pd_entry_t *tmp_pml, *kernel_pml;
4548 
4549 	int level;
4550 
4551 	if (!maps_loaded) {
4552 		for (level = 0; level < PTP_LEVELS; ++level) {
4553 			x86_tmp_pml_vaddr[level] =
4554 			    uvm_km_alloc(kernel_map, PAGE_SIZE, 0,
4555 			    UVM_KMF_VAONLY);
4556 
4557 			if (x86_tmp_pml_vaddr[level] == 0)
4558 				panic("mapping of real mode PML failed");
4559 			pmap_kenter_pa(x86_tmp_pml_vaddr[level],
4560 			    x86_tmp_pml_paddr[level],
4561 			    VM_PROT_READ | VM_PROT_WRITE, 0);
4562 		}
4563 		pmap_update(pmap_kernel());
4564 		maps_loaded = true;
4565 	}
4566 
4567 	/* Zero levels 1-3 */
4568 	for (level = 0; level < PTP_LEVELS - 1; ++level) {
4569 		tmp_pml = (void *)x86_tmp_pml_vaddr[level];
4570 		memset(tmp_pml, 0, PAGE_SIZE);
4571 	}
4572 
4573 	/* Copy PML4 */
4574 	kernel_pml = pmap_kernel()->pm_pdir;
4575 	tmp_pml = (void *)x86_tmp_pml_vaddr[PTP_LEVELS - 1];
4576 	memcpy(tmp_pml, kernel_pml, PAGE_SIZE);
4577 
4578 #ifdef PAE
4579 	/*
4580 	 * Use the last 4 entries of the L2 page as L3 PD entries. These
4581 	 * last entries are unlikely to be used for temporary mappings.
4582 	 * 508: maps 0->1GB (userland)
4583 	 * 509: unused
4584 	 * 510: unused
4585 	 * 511: maps 3->4GB (kernel)
4586 	 */
4587 	tmp_pml[508] = x86_tmp_pml_paddr[PTP_LEVELS - 1] | PG_V;
4588 	tmp_pml[509] = 0;
4589 	tmp_pml[510] = 0;
4590 	tmp_pml[511] = pmap_pdirpa(pmap_kernel(), PDIR_SLOT_KERN) | PG_V;
4591 #endif
4592 
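	/*
	 * Link each level of the temporary tree to the level below it, so
	 * that the path down to the L1 page is complete.
	 */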
4593 	for (level = PTP_LEVELS - 1; level > 0; --level) {
4594 		tmp_pml = (void *)x86_tmp_pml_vaddr[level];
4595 
4596 		tmp_pml[pl_i(pg, level + 1)] =
4597 		    (x86_tmp_pml_paddr[level - 1] & PG_FRAME) | PG_RW | PG_V;
4598 	}
4599 
4600 	tmp_pml = (void *)x86_tmp_pml_vaddr[0];
4601 	tmp_pml[pl_i(pg, 1)] = (pg & PG_FRAME) | PG_RW | PG_V;
4602 
4603 #ifdef PAE
4604 	/* Return the PA of the L3 page (entry 508 of the L2 page) */
4605 	return x86_tmp_pml_paddr[PTP_LEVELS - 1] + 508 * sizeof(pd_entry_t);
4606 #endif
4607 
4608 	return x86_tmp_pml_paddr[PTP_LEVELS - 1];
4609 }
4610 
4611 u_int
4612 x86_mmap_flags(paddr_t mdpgno)
4613 {
4614 	u_int nflag = (mdpgno >> X86_MMAP_FLAG_SHIFT) & X86_MMAP_FLAG_MASK;
4615 	u_int pflag = 0;
4616 
4617 	if (nflag & X86_MMAP_FLAG_PREFETCH)
4618 		pflag |= PMAP_WRITE_COMBINE;
4619 
4620 	return pflag;
4621 }
4622