1 /*	$NetBSD: pmap.c,v 1.189 2015/11/11 08:20:22 skrll Exp $	*/
2 
3 /*-
4  * Copyright (c) 2008, 2010 The NetBSD Foundation, Inc.
5  * All rights reserved.
6  *
7  * This code is derived from software contributed to The NetBSD Foundation
8  * by Andrew Doran.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29  * POSSIBILITY OF SUCH DAMAGE.
30  */
31 
32 /*
33  * Copyright (c) 2007 Manuel Bouyer.
34  *
35  * Redistribution and use in source and binary forms, with or without
36  * modification, are permitted provided that the following conditions
37  * are met:
38  * 1. Redistributions of source code must retain the above copyright
39  *    notice, this list of conditions and the following disclaimer.
40  * 2. Redistributions in binary form must reproduce the above copyright
41  *    notice, this list of conditions and the following disclaimer in the
42  *    documentation and/or other materials provided with the distribution.
43  *
44  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
45  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
46  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
47  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
48  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
49  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
50  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
51  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
52  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
53  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
54  *
55  */
56 
57 /*
58  * Copyright (c) 2006 Mathieu Ropert <mro@adviseo.fr>
59  *
60  * Permission to use, copy, modify, and distribute this software for any
61  * purpose with or without fee is hereby granted, provided that the above
62  * copyright notice and this permission notice appear in all copies.
63  *
64  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
65  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
66  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
67  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
68  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
69  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
70  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
71  */
72 
73 /*
74  * Copyright (c) 1997 Charles D. Cranor and Washington University.
75  * All rights reserved.
76  *
77  * Redistribution and use in source and binary forms, with or without
78  * modification, are permitted provided that the following conditions
79  * are met:
80  * 1. Redistributions of source code must retain the above copyright
81  *    notice, this list of conditions and the following disclaimer.
82  * 2. Redistributions in binary form must reproduce the above copyright
83  *    notice, this list of conditions and the following disclaimer in the
84  *    documentation and/or other materials provided with the distribution.
85  *
86  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
87  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
88  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
89  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
90  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
91  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
92  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
93  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
94  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
95  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
96  */
97 
98 /*
99  * Copyright 2001 (c) Wasabi Systems, Inc.
100  * All rights reserved.
101  *
102  * Written by Frank van der Linden for Wasabi Systems, Inc.
103  *
104  * Redistribution and use in source and binary forms, with or without
105  * modification, are permitted provided that the following conditions
106  * are met:
107  * 1. Redistributions of source code must retain the above copyright
108  *    notice, this list of conditions and the following disclaimer.
109  * 2. Redistributions in binary form must reproduce the above copyright
110  *    notice, this list of conditions and the following disclaimer in the
111  *    documentation and/or other materials provided with the distribution.
112  * 3. All advertising materials mentioning features or use of this software
113  *    must display the following acknowledgement:
114  *      This product includes software developed for the NetBSD Project by
115  *      Wasabi Systems, Inc.
116  * 4. The name of Wasabi Systems, Inc. may not be used to endorse
117  *    or promote products derived from this software without specific prior
118  *    written permission.
119  *
120  * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
121  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
122  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
123  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL WASABI SYSTEMS, INC
124  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
125  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
126  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
127  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
128  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
129  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
130  * POSSIBILITY OF SUCH DAMAGE.
131  */
132 
133 /*
134  * This is the i386 pmap modified and generalized to support x86-64
135  * as well. The idea is to hide the upper N levels of the page tables
136  * inside pmap_get_ptp, pmap_free_ptp and pmap_growkernel. The rest
137  * is mostly untouched, except that it uses some more generalized
138  * macros and interfaces.
139  *
140  * This pmap has been tested on the i386 as well, and it can be easily
141  * adapted to PAE.
142  *
143  * fvdl@wasabisystems.com 18-Jun-2001
144  */
145 
146 /*
147  * pmap.c: i386 pmap module rewrite
148  * Chuck Cranor <chuck@netbsd>
149  * 11-Aug-97
150  *
151  * history of this pmap module: in addition to my own input, i used
152  *    the following references for this rewrite of the i386 pmap:
153  *
154  * [1] the NetBSD i386 pmap.   this pmap appears to be based on the
155  *     BSD hp300 pmap done by Mike Hibler at University of Utah.
156  *     it was then ported to the i386 by William Jolitz of UUNET
157  *     Technologies, Inc.   Then Charles M. Hannum of the NetBSD
158  *     project fixed some bugs and provided some speed ups.
159  *
160  * [2] the FreeBSD i386 pmap.   this pmap seems to be the
161  *     Hibler/Jolitz pmap, as modified for FreeBSD by John S. Dyson
162  *     and David Greenman.
163  *
164  * [3] the Mach pmap.   this pmap, from CMU, seems to have migrated
165  *     between several processors.   the VAX version was done by
166  *     Avadis Tevanian, Jr., and Michael Wayne Young.    the i386
167  *     version was done by Lance Berc, Mike Kupfer, Bob Baron,
168  *     David Golub, and Richard Draves.    the alpha version was
169  *     done by Alessandro Forin (CMU/Mach) and Chris Demetriou
170  *     (NetBSD/alpha).
171  */
172 
173 #include <sys/cdefs.h>
174 __KERNEL_RCSID(0, "$NetBSD: pmap.c,v 1.189 2015/11/11 08:20:22 skrll Exp $");
175 
176 #include "opt_user_ldt.h"
177 #include "opt_lockdebug.h"
178 #include "opt_multiprocessor.h"
179 #include "opt_xen.h"
180 #if !defined(__x86_64__)
181 #include "opt_kstack_dr0.h"
182 #endif /* !defined(__x86_64__) */
183 
184 #include <sys/param.h>
185 #include <sys/systm.h>
186 #include <sys/proc.h>
187 #include <sys/pool.h>
188 #include <sys/kernel.h>
189 #include <sys/atomic.h>
190 #include <sys/cpu.h>
191 #include <sys/intr.h>
192 #include <sys/xcall.h>
193 #include <sys/kcore.h>
194 
195 #include <uvm/uvm.h>
196 #include <uvm/pmap/pmap_pvt.h>
197 
198 #include <dev/isa/isareg.h>
199 
200 #include <machine/specialreg.h>
201 #include <machine/gdt.h>
202 #include <machine/isa_machdep.h>
203 #include <machine/cpuvar.h>
204 #include <machine/cputypes.h>
205 
206 #include <x86/pmap.h>
207 #include <x86/pmap_pv.h>
208 
209 #include <x86/i82489reg.h>
210 #include <x86/i82489var.h>
211 
212 #ifdef XEN
213 #include <xen/xen-public/xen.h>
214 #include <xen/hypervisor.h>
215 #endif
216 
217 /*
218  * general info:
219  *
220  *  - for an explanation of how the i386 MMU hardware works see
221  *    the comments in <machine/pte.h>.
222  *
223  *  - for an explanation of the general memory structure used by
224  *    this pmap (including the recursive mapping), see the comments
225  *    in <machine/pmap.h>.
226  *
227  * this file contains the code for the "pmap module."   the module's
228  * job is to manage the hardware's virtual to physical address mappings.
229  * note that there are two levels of mapping in the VM system:
230  *
231  *  [1] the upper layer of the VM system uses vm_map's and vm_map_entry's
232  *      to map ranges of virtual address space to objects/files.  for
233  *      example, the vm_map may say: "map VA 0x1000 to 0x22000 read-only
234  *      to the file /bin/ls starting at offset zero."   note that
235  *      the upper layer mapping is not concerned with how individual
236  *      vm_pages are mapped.
237  *
238  *  [2] the lower layer of the VM system (the pmap) maintains the mappings
239  *      from virtual addresses.   it is concerned with which vm_page is
240  *      mapped where.   for example, when you run /bin/ls and start
241  *      at page 0x1000 the fault routine may lookup the correct page
242  *      of the /bin/ls file and then ask the pmap layer to establish
243  *      a mapping for it.
244  *
245  * note that information in the lower layer of the VM system can be
246  * thrown away since it can easily be reconstructed from the info
247  * in the upper layer.
248  *
249  * data structures we use include:
250  *
251  *  - struct pmap: describes the address space of one thread
252  *  - struct pmap_page: describes one pv-tracked page, without
253  *	necessarily a corresponding vm_page
254  *  - struct pv_entry: describes one <PMAP,VA> mapping of a PA
255  *  - struct pv_head: there is one pv_head per pv-tracked page of
256  *	physical memory.   the pv_head points to a list of pv_entry
257  *	structures which describe all the <PMAP,VA> pairs that this
258  *      page is mapped in.    this is critical for page based operations
259  *      such as pmap_page_protect() [change protection on _all_ mappings
260  *      of a page]
261  */
262 
263 /*
264  * memory allocation
265  *
266  *  - there are three data structures that we must dynamically allocate:
267  *
268  * [A] new process' page directory page (PDP)
269  *	- plan 1: done at pmap_create() we use
270  *	  uvm_km_alloc(kernel_map, PAGE_SIZE)  [fka kmem_alloc] to do this
271  *	  allocation.
272  *
273  * if we are low in free physical memory then we sleep in
274  * uvm_km_alloc -- in this case this is ok since we are creating
275  * a new pmap and should not be holding any locks.
276  *
277  * if the kernel is totally out of virtual space
278  * (i.e. uvm_km_alloc returns NULL), then we panic.
279  *
280  * [B] new page tables pages (PTP)
281  * 	- call uvm_pagealloc()
282  * 		=> success: zero page, add to pm_pdir
283  * 		=> failure: we are out of free vm_pages, let pmap_enter()
284  *		   tell UVM about it.
285  *
286  * note: for kernel PTPs, we start with NKPTP of them.   as we map
287  * kernel memory (at uvm_map time) we check to see if we've grown
288  * the kernel pmap.   if so, we call the optional function
289  * pmap_growkernel() to grow the kernel PTPs in advance.
290  *
291  * [C] pv_entry structures
292  */
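/*
 * Illustrative sketch (not compiled) of the two allocations described
 * above.  "pmap" and "va" are assumed to come from the caller.  [A]
 * grabs wired, zeroed KVA for a new PDP; [B] allocates a PTP hung off
 * the pmap's per-level uvm_object, much as pmap_get_ptp() does later
 * in this file.
 */
#if 0
	vaddr_t pdp_va;
	struct vm_page *ptp;

	/* [A] new PDP: may sleep waiting for free physical memory. */
	pdp_va = uvm_km_alloc(kernel_map, PAGE_SIZE, 0,
	    UVM_KMF_WIRED | UVM_KMF_ZERO);

	/* [B] new PTP: may fail; pmap_enter() then tells UVM about it. */
	ptp = uvm_pagealloc(&pmap->pm_obj[0], ptp_va2o(va, 1), NULL,
	    UVM_PGA_USERESERVE | UVM_PGA_ZERO);
#endif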
293 
294 /*
295  * locking
296  *
297  * we have the following locks that we must contend with:
298  *
299  * mutexes:
300  *
301  * - pmap lock (per pmap, part of uvm_object)
302  *   this lock protects the fields in the pmap structure including
303  *   the non-kernel PDEs in the PDP, and the PTEs.  it also locks
304  *   in the alternate PTE space (since that is determined by the
305  *   entry in the PDP).
306  *
307  * - pvh_lock (per pv_head)
308  *   this lock protects the pv_entry list which is chained off the
309  *   pv_head structure for a specific pv-tracked PA.   it is locked
310  *   when traversing the list (e.g. adding/removing mappings,
311  *   syncing R/M bits, etc.)
312  *
313  * - pmaps_lock
314  *   this lock protects the list of active pmaps (headed by "pmaps").
315  *   we lock it when adding or removing pmaps from this list.
316  */
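/*
 * Example (illustrative sketch, not compiled) of the pmaps_lock
 * convention described above: any walk over the list of active pmaps
 * (as pmap_growkernel() must do) holds pmaps_lock for the duration.
 */
#if 0
	struct pmap *pm;

	mutex_enter(&pmaps_lock);
	LIST_FOREACH(pm, &pmaps, pm_list) {
		/* ... e.g. propagate new kernel PDEs into pm ... */
	}
	mutex_exit(&pmaps_lock);
#endif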
317 
318 const vaddr_t ptp_masks[] = PTP_MASK_INITIALIZER;
319 const int ptp_shifts[] = PTP_SHIFT_INITIALIZER;
320 const long nkptpmax[] = NKPTPMAX_INITIALIZER;
321 const long nbpd[] = NBPD_INITIALIZER;
322 pd_entry_t * const normal_pdes[] = PDES_INITIALIZER;
323 
324 long nkptp[] = NKPTP_INITIALIZER;
325 
326 struct pmap_head pmaps;
327 kmutex_t pmaps_lock;
328 
329 static vaddr_t pmap_maxkvaddr;
330 
331 /*
332  * XXX kludge: dummy locking to make KASSERTs in uvm_page.c comfortable.
333  * actual locking is done by pm_lock.
334  */
335 #if defined(DIAGNOSTIC)
336 #define	PMAP_SUBOBJ_LOCK(pm, idx) \
337 	KASSERT(mutex_owned((pm)->pm_lock)); \
338 	if ((idx) != 0) \
339 		mutex_enter((pm)->pm_obj[(idx)].vmobjlock)
340 #define	PMAP_SUBOBJ_UNLOCK(pm, idx) \
341 	KASSERT(mutex_owned((pm)->pm_lock)); \
342 	if ((idx) != 0) \
343 		mutex_exit((pm)->pm_obj[(idx)].vmobjlock)
344 #else /* defined(DIAGNOSTIC) */
345 #define	PMAP_SUBOBJ_LOCK(pm, idx)	/* nothing */
346 #define	PMAP_SUBOBJ_UNLOCK(pm, idx)	/* nothing */
347 #endif /* defined(DIAGNOSTIC) */
348 
349 /*
350  * Misc. event counters.
351  */
352 struct evcnt pmap_iobmp_evcnt;
353 struct evcnt pmap_ldt_evcnt;
354 
355 /*
356  * PAT
357  */
358 #define	PATENTRY(n, type)	(type << ((n) * 8))
359 #define	PAT_UC		0x0ULL
360 #define	PAT_WC		0x1ULL
361 #define	PAT_WT		0x4ULL
362 #define	PAT_WP		0x5ULL
363 #define	PAT_WB		0x6ULL
364 #define	PAT_UCMINUS	0x7ULL
365 
366 static bool cpu_pat_enabled __read_mostly = false;
367 
368 /*
369  * global data structures
370  */
371 
372 static struct pmap kernel_pmap_store;	/* the kernel's pmap (proc0) */
373 struct pmap *const kernel_pmap_ptr = &kernel_pmap_store;
374 
375 /*
376  * pmap_pg_g: if our processor supports PG_G in the PTE then we
377  * set pmap_pg_g to PG_G (otherwise it is zero).
378  */
379 
380 int pmap_pg_g __read_mostly = 0;
381 
382 /*
383  * pmap_largepages: if our processor supports PG_PS and we are
384  * using it, this is set to true.
385  */
386 
387 int pmap_largepages __read_mostly;
388 
389 /*
390  * i386 physical memory comes in a big contig chunk with a small
391  * hole toward the front of it...  the following two paddr_t's
392  * (shared with machdep.c) describe the physical address space
393  * of this machine.
394  */
395 paddr_t avail_start __read_mostly; /* PA of first available physical page */
396 paddr_t avail_end __read_mostly; /* PA of last available physical page */
397 
398 #ifdef XEN
399 #ifdef __x86_64__
400 /* Dummy PGD for user cr3, used between pmap_deactivate() and pmap_activate() */
401 static paddr_t xen_dummy_user_pgd;
402 #endif /* __x86_64__ */
403 paddr_t pmap_pa_start; /* PA of first physical page for this domain */
404 paddr_t pmap_pa_end;   /* PA of last physical page for this domain */
405 #endif /* XEN */
406 
407 #define	VM_PAGE_TO_PP(pg)	(&(pg)->mdpage.mp_pp)
408 
409 #define	PV_HASH_SIZE		32768
410 #define	PV_HASH_LOCK_CNT	32
411 
412 struct pv_hash_lock {
413 	kmutex_t lock;
414 } __aligned(CACHE_LINE_SIZE) pv_hash_locks[PV_HASH_LOCK_CNT]
415     __aligned(CACHE_LINE_SIZE);
416 
417 struct pv_hash_head {
418 	SLIST_HEAD(, pv_entry) hh_list;
419 } pv_hash_heads[PV_HASH_SIZE];
420 
421 static u_int
422 pvhash_hash(struct vm_page *ptp, vaddr_t va)
423 {
424 
425 	return (uintptr_t)ptp / sizeof(*ptp) + (va >> PAGE_SHIFT);
426 }
427 
428 static struct pv_hash_head *
429 pvhash_head(u_int hash)
430 {
431 
432 	return &pv_hash_heads[hash % PV_HASH_SIZE];
433 }
434 
435 static kmutex_t *
436 pvhash_lock(u_int hash)
437 {
438 
439 	return &pv_hash_locks[hash % PV_HASH_LOCK_CNT].lock;
440 }
441 
442 static struct pv_entry *
443 pvhash_remove(struct pv_hash_head *hh, struct vm_page *ptp, vaddr_t va)
444 {
445 	struct pv_entry *pve;
446 	struct pv_entry *prev;
447 
448 	prev = NULL;
449 	SLIST_FOREACH(pve, &hh->hh_list, pve_hash) {
450 		if (pve->pve_pte.pte_ptp == ptp &&
451 		    pve->pve_pte.pte_va == va) {
452 			if (prev != NULL) {
453 				SLIST_REMOVE_AFTER(prev, pve_hash);
454 			} else {
455 				SLIST_REMOVE_HEAD(&hh->hh_list, pve_hash);
456 			}
457 			break;
458 		}
459 		prev = pve;
460 	}
461 	return pve;
462 }
463 
464 /*
465  * other data structures
466  */
467 
468 static pt_entry_t protection_codes[8] __read_mostly; /* maps MI prot to i386
469 							prot code */
470 static bool pmap_initialized __read_mostly = false; /* pmap_init done yet? */
471 
472 /*
473  * the following two vaddr_t's are used during system startup
474  * to keep track of how much of the kernel's VM space we have used.
475  * once the system is started, the management of the remaining kernel
476  * VM space is turned over to the kernel_map vm_map.
477  */
478 
479 static vaddr_t virtual_avail __read_mostly;	/* VA of first free KVA */
480 static vaddr_t virtual_end __read_mostly;	/* VA of last free KVA */
481 
482 /*
483  * pool that pmap structures are allocated from
484  */
485 
486 static struct pool_cache pmap_cache;
487 
488 /*
489  * pv_entry cache
490  */
491 
492 static struct pool_cache pmap_pv_cache;
493 
494 #ifdef __HAVE_DIRECT_MAP
495 
496 extern phys_ram_seg_t mem_clusters[];
497 extern int mem_cluster_cnt;
498 
499 #else
500 
501 /*
502  * MULTIPROCESSOR: special VA's/ PTE's are actually allocated inside a
503  * maxcpus*NPTECL array of PTE's, to avoid cache line thrashing
504  * due to false sharing.
505  */
506 
507 #ifdef MULTIPROCESSOR
508 #define PTESLEW(pte, id) ((pte)+(id)*NPTECL)
509 #define VASLEW(va,id) ((va)+(id)*NPTECL*PAGE_SIZE)
510 #else
511 #define PTESLEW(pte, id) ((void)id, pte)
512 #define VASLEW(va,id) ((void)id, va)
513 #endif
514 
515 /*
516  * special VAs and the PTEs that map them
517  */
518 static pt_entry_t *csrc_pte, *cdst_pte, *zero_pte, *ptp_pte, *early_zero_pte;
519 static char *csrcp, *cdstp, *zerop, *ptpp;
520 #ifdef XEN
521 char *early_zerop; /* also referenced from xen_pmap_bootstrap() */
522 #else
523 static char *early_zerop;
524 #endif
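/*
 * Example (illustrative sketch, not compiled) of how PTESLEW()/VASLEW()
 * above are used: each CPU indexes its own slot of the special PTEs and
 * VAs, so a temporary copy/zero mapping never shares a cache line with
 * another CPU.  "srcpa" is a hypothetical source physical address.
 */
#if 0
	int id = cpu_number();
	pt_entry_t *spte = PTESLEW(csrc_pte, id);
	char *srcva = VASLEW(csrcp, id);

	pmap_pte_set(spte, pmap_pa2pte(srcpa) | PG_V | PG_RW | PG_k);
	pmap_pte_flush();
	pmap_update_pg((vaddr_t)srcva);
	/* ... copy from srcva, then zap the PTE again ... */
#endif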
525 
526 #endif
527 
528 int pmap_enter_default(pmap_t, vaddr_t, paddr_t, vm_prot_t, u_int);
529 
530 /* PDP pool_cache(9) and its callbacks */
531 struct pool_cache pmap_pdp_cache;
532 static int  pmap_pdp_ctor(void *, void *, int);
533 static void pmap_pdp_dtor(void *, void *);
534 #ifdef PAE
535 /* need to allocate items of 4 pages */
536 static void *pmap_pdp_alloc(struct pool *, int);
537 static void pmap_pdp_free(struct pool *, void *);
538 static struct pool_allocator pmap_pdp_allocator = {
539 	.pa_alloc = pmap_pdp_alloc,
540 	.pa_free = pmap_pdp_free,
541 	.pa_pagesz = PAGE_SIZE * PDP_SIZE,
542 };
543 #endif /* PAE */
544 
545 extern vaddr_t idt_vaddr;			/* we allocate IDT early */
546 extern paddr_t idt_paddr;
547 
548 #ifdef _LP64
549 extern vaddr_t lo32_vaddr;
550 extern vaddr_t lo32_paddr;
551 #endif
552 
553 extern int end;
554 
555 #ifdef i386
556 /* stuff to fix the pentium f00f bug */
557 extern vaddr_t pentium_idt_vaddr;
558 #endif
559 
560 
561 /*
562  * local prototypes
563  */
564 
565 static struct vm_page	*pmap_get_ptp(struct pmap *, vaddr_t,
566 				      pd_entry_t * const *);
567 static struct vm_page	*pmap_find_ptp(struct pmap *, vaddr_t, paddr_t, int);
568 static void		 pmap_freepage(struct pmap *, struct vm_page *, int);
569 static void		 pmap_free_ptp(struct pmap *, struct vm_page *,
570 				       vaddr_t, pt_entry_t *,
571 				       pd_entry_t * const *);
572 static bool		 pmap_remove_pte(struct pmap *, struct vm_page *,
573 					 pt_entry_t *, vaddr_t,
574 					 struct pv_entry **);
575 static void		 pmap_remove_ptes(struct pmap *, struct vm_page *,
576 					  vaddr_t, vaddr_t, vaddr_t,
577 					  struct pv_entry **);
578 
579 static bool		 pmap_get_physpage(vaddr_t, int, paddr_t *);
580 static void		 pmap_alloc_level(pd_entry_t * const *, vaddr_t, int,
581 					  long *);
582 
583 static bool		 pmap_reactivate(struct pmap *);
584 
585 /*
586  * p m a p   h e l p e r   f u n c t i o n s
587  */
588 
589 static inline void
590 pmap_stats_update(struct pmap *pmap, int resid_diff, int wired_diff)
591 {
592 
593 	if (pmap == pmap_kernel()) {
594 		atomic_add_long(&pmap->pm_stats.resident_count, resid_diff);
595 		atomic_add_long(&pmap->pm_stats.wired_count, wired_diff);
596 	} else {
597 		KASSERT(mutex_owned(pmap->pm_lock));
598 		pmap->pm_stats.resident_count += resid_diff;
599 		pmap->pm_stats.wired_count += wired_diff;
600 	}
601 }
602 
603 static inline void
604 pmap_stats_update_bypte(struct pmap *pmap, pt_entry_t npte, pt_entry_t opte)
605 {
606 	int resid_diff = ((npte & PG_V) ? 1 : 0) - ((opte & PG_V) ? 1 : 0);
607 	int wired_diff = ((npte & PG_W) ? 1 : 0) - ((opte & PG_W) ? 1 : 0);
608 
609 	KASSERT((npte & (PG_V | PG_W)) != PG_W);
610 	KASSERT((opte & (PG_V | PG_W)) != PG_W);
611 
612 	pmap_stats_update(pmap, resid_diff, wired_diff);
613 }
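/*
 * Worked example (illustrative, not compiled): promoting an invalid
 * slot to a valid+wired mapping bumps both counters by one, and
 * zapping that PTE again reverses the change.
 */
#if 0
	pmap_stats_update_bypte(pmap, PG_V | PG_W, 0);	/* +1 resident, +1 wired */
	pmap_stats_update_bypte(pmap, 0, PG_V | PG_W);	/* -1 resident, -1 wired */
#endif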
614 
615 /*
616  * ptp_to_pmap: lookup pmap by ptp
617  */
618 
619 static struct pmap *
620 ptp_to_pmap(struct vm_page *ptp)
621 {
622 	struct pmap *pmap;
623 
624 	if (ptp == NULL) {
625 		return pmap_kernel();
626 	}
627 	pmap = (struct pmap *)ptp->uobject;
628 	KASSERT(pmap != NULL);
629 	KASSERT(&pmap->pm_obj[0] == ptp->uobject);
630 	return pmap;
631 }
632 
633 static inline struct pv_pte *
634 pve_to_pvpte(struct pv_entry *pve)
635 {
636 
637 	KASSERT((void *)&pve->pve_pte == (void *)pve);
638 	return &pve->pve_pte;
639 }
640 
641 static inline struct pv_entry *
642 pvpte_to_pve(struct pv_pte *pvpte)
643 {
644 	struct pv_entry *pve = (void *)pvpte;
645 
646 	KASSERT(pve_to_pvpte(pve) == pvpte);
647 	return pve;
648 }
649 
650 /*
651  * pv_pte_first, pv_pte_next: PV list iterator.
652  */
653 
654 static struct pv_pte *
655 pv_pte_first(struct pmap_page *pp)
656 {
657 
658 	if ((pp->pp_flags & PP_EMBEDDED) != 0) {
659 		return &pp->pp_pte;
660 	}
661 	return pve_to_pvpte(LIST_FIRST(&pp->pp_head.pvh_list));
662 }
663 
664 static struct pv_pte *
665 pv_pte_next(struct pmap_page *pp, struct pv_pte *pvpte)
666 {
667 
668 	KASSERT(pvpte != NULL);
669 	if (pvpte == &pp->pp_pte) {
670 		KASSERT((pp->pp_flags & PP_EMBEDDED) != 0);
671 		return NULL;
672 	}
673 	KASSERT((pp->pp_flags & PP_EMBEDDED) == 0);
674 	return pve_to_pvpte(LIST_NEXT(pvpte_to_pve(pvpte), pve_list));
675 }
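/*
 * Typical use of the iterator above (illustrative sketch, not
 * compiled): walk every <PMAP,VA> mapping recorded for a pv-tracked
 * page, which is how page-based operations such as pmap_page_protect()
 * mentioned earlier visit all mappings of a page.
 */
#if 0
	struct pv_pte *pvpte;

	for (pvpte = pv_pte_first(pp); pvpte != NULL;
	     pvpte = pv_pte_next(pp, pvpte)) {
		/* pvpte->pte_ptp and pvpte->pte_va identify one mapping */
	}
#endif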
676 
677 /*
678  * pmap_is_curpmap: is this pmap the one currently loaded [in %cr3]?
679  *		of course the kernel is always loaded
680  */
681 
682 bool
683 pmap_is_curpmap(struct pmap *pmap)
684 {
685 	return((pmap == pmap_kernel()) ||
686 	       (pmap == curcpu()->ci_pmap));
687 }
688 
689 /*
690  *	Add a reference to the specified pmap.
691  */
692 
693 void
694 pmap_reference(struct pmap *pmap)
695 {
696 
697 	atomic_inc_uint(&pmap->pm_obj[0].uo_refs);
698 }
699 
700 /*
701  * pmap_map_ptes: map a pmap's PTEs into KVM and lock them in
702  *
703  * there are several pmaps involved.  some or all of them might be the same.
704  *
705  *	- the pmap given by the first argument
706  *		our caller wants to access this pmap's PTEs.
707  *
708  *	- pmap_kernel()
709  *		the kernel pmap.  note that it only contains the kernel part
710  *		of the address space which is shared by any pmap.  ie. any
711  *		pmap can be used instead of pmap_kernel() for our purpose.
712  *
713  *	- ci->ci_pmap
714  *		pmap currently loaded on the cpu.
715  *
716  *	- vm_map_pmap(&curproc->p_vmspace->vm_map)
717  *		current process' pmap.
718  *
719  * => we lock enough pmaps to keep things locked in
720  * => must be undone with pmap_unmap_ptes before returning
721  */
722 
723 void
724 pmap_map_ptes(struct pmap *pmap, struct pmap **pmap2,
725 	      pd_entry_t **ptepp, pd_entry_t * const **pdeppp)
726 {
727 	struct pmap *curpmap;
728 	struct cpu_info *ci;
729 	lwp_t *l;
730 
731 	/* The kernel's pmap is always accessible. */
732 	if (pmap == pmap_kernel()) {
733 		*pmap2 = NULL;
734 		*ptepp = PTE_BASE;
735 		*pdeppp = normal_pdes;
736 		return;
737 	}
738 	KASSERT(kpreempt_disabled());
739 
740 	l = curlwp;
741  retry:
742 	mutex_enter(pmap->pm_lock);
743 	ci = curcpu();
744 	curpmap = ci->ci_pmap;
745 	if (vm_map_pmap(&l->l_proc->p_vmspace->vm_map) == pmap) {
746 		/* Our own pmap so just load it: easy. */
747 		if (__predict_false(ci->ci_want_pmapload)) {
748 			mutex_exit(pmap->pm_lock);
749 			pmap_load();
750 			goto retry;
751 		}
752 		KASSERT(pmap == curpmap);
753 	} else if (pmap == curpmap) {
754 		/*
755 		 * Already on the CPU: make it valid.  This is very
756 		 * often the case during exit(), when we have switched
757 		 * to the kernel pmap in order to destroy a user pmap.
758 		 */
759 		if (!pmap_reactivate(pmap)) {
760 			u_int gen = uvm_emap_gen_return();
761 			tlbflush();
762 			uvm_emap_update(gen);
763 		}
764 	} else {
765 		/*
766 		 * Toss current pmap from CPU, but keep a reference to it.
767 		 * The reference will be dropped by pmap_unmap_ptes().
768 		 * Can happen if we block during exit().
769 		 */
770 		const cpuid_t cid = cpu_index(ci);
771 
772 		kcpuset_atomic_clear(curpmap->pm_cpus, cid);
773 		kcpuset_atomic_clear(curpmap->pm_kernel_cpus, cid);
774 		ci->ci_pmap = pmap;
775 		ci->ci_tlbstate = TLBSTATE_VALID;
776 		kcpuset_atomic_set(pmap->pm_cpus, cid);
777 		kcpuset_atomic_set(pmap->pm_kernel_cpus, cid);
778 		cpu_load_pmap(pmap, curpmap);
779 	}
780 	pmap->pm_ncsw = l->l_ncsw;
781 	*pmap2 = curpmap;
782 	*ptepp = PTE_BASE;
783 #if defined(XEN) && defined(__x86_64__)
784 	KASSERT(ci->ci_normal_pdes[PTP_LEVELS - 2] == L4_BASE);
785 	ci->ci_normal_pdes[PTP_LEVELS - 2] = pmap->pm_pdir;
786 	*pdeppp = ci->ci_normal_pdes;
787 #else /* XEN && __x86_64__ */
788 	*pdeppp = normal_pdes;
789 #endif /* XEN && __x86_64__ */
790 }
791 
792 /*
793  * pmap_unmap_ptes: unlock the PTE mapping of "pmap"
794  */
795 
796 void
797 pmap_unmap_ptes(struct pmap *pmap, struct pmap *pmap2)
798 {
799 	struct cpu_info *ci;
800 	struct pmap *mypmap;
801 
802 	KASSERT(kpreempt_disabled());
803 
804 	/* The kernel's pmap is always accessible. */
805 	if (pmap == pmap_kernel()) {
806 		return;
807 	}
808 
809 	ci = curcpu();
810 #if defined(XEN) && defined(__x86_64__)
811 	/* Reset per-cpu normal_pdes */
812 	KASSERT(ci->ci_normal_pdes[PTP_LEVELS - 2] != L4_BASE);
813 	ci->ci_normal_pdes[PTP_LEVELS - 2] = L4_BASE;
814 #endif /* XEN && __x86_64__ */
815 	/*
816 	 * We cannot tolerate context switches while mapped in.
817 	 * If it is our own pmap all we have to do is unlock.
818 	 */
819 	KASSERT(pmap->pm_ncsw == curlwp->l_ncsw);
820 	mypmap = vm_map_pmap(&curproc->p_vmspace->vm_map);
821 	if (pmap == mypmap) {
822 		mutex_exit(pmap->pm_lock);
823 		return;
824 	}
825 
826 	/*
827 	 * Mark whatever's on the CPU now as lazy and unlock.
828 	 * If the pmap was already installed, we are done.
829 	 */
830 	ci->ci_tlbstate = TLBSTATE_LAZY;
831 	ci->ci_want_pmapload = (mypmap != pmap_kernel());
832 	mutex_exit(pmap->pm_lock);
833 	if (pmap == pmap2) {
834 		return;
835 	}
836 
837 	/*
838 	 * We installed another pmap on the CPU.  Grab a reference to
839 	 * it and leave in place.  Toss the evicted pmap (can block).
840 	 */
841 	pmap_reference(pmap);
842 	pmap_destroy(pmap2);
843 }
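/*
 * Example (illustrative sketch, not compiled) of the protocol that
 * pmap_map_ptes()/pmap_unmap_ptes() implement: a caller that wants to
 * inspect or edit a user pmap's PTEs brackets the access with the two
 * calls and keeps preemption disabled for the whole window.
 */
#if 0
	struct pmap *pmap2;
	pd_entry_t *ptes;
	pd_entry_t * const *pdes;

	kpreempt_disable();
	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
	/* ... read or modify ptes[pl1_i(va)] here ... */
	pmap_unmap_ptes(pmap, pmap2);
	kpreempt_enable();
#endif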
844 
845 
846 inline static void
847 pmap_exec_account(struct pmap *pm, vaddr_t va, pt_entry_t opte, pt_entry_t npte)
848 {
849 
850 #if !defined(__x86_64__)
851 	if (curproc == NULL || curproc->p_vmspace == NULL ||
852 	    pm != vm_map_pmap(&curproc->p_vmspace->vm_map))
853 		return;
854 
855 	if ((opte ^ npte) & PG_X)
856 		pmap_update_pg(va);
857 
858 	/*
859 	 * Executability was removed on the last executable change.
860 	 * Reset the code segment to something conservative and
861 	 * let the trap handler deal with setting the right limit.
862 	 * We can't do that because of locking constraints on the vm map.
863 	 */
864 
865 	if ((opte & PG_X) && (npte & PG_X) == 0 && va == pm->pm_hiexec) {
866 		struct trapframe *tf = curlwp->l_md.md_regs;
867 
868 		tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL);
869 		pm->pm_hiexec = I386_MAX_EXE_ADDR;
870 	}
871 #endif /* !defined(__x86_64__) */
872 }
873 
874 #if !defined(__x86_64__)
875 /*
876  * Fixup the code segment to cover all potential executable mappings.
877  * returns 0 if no changes to the code segment were made.
878  */
879 
880 int
881 pmap_exec_fixup(struct vm_map *map, struct trapframe *tf, struct pcb *pcb)
882 {
883 	struct vm_map_entry *ent;
884 	struct pmap *pm = vm_map_pmap(map);
885 	vaddr_t va = 0;
886 
887 	vm_map_lock_read(map);
888 	for (ent = (&map->header)->next; ent != &map->header; ent = ent->next) {
889 
890 		/*
891 		 * This entry has greater va than the entries before.
892 		 * We need to make it point to the last page, not past it.
893 		 */
894 
895 		if (ent->protection & VM_PROT_EXECUTE)
896 			va = trunc_page(ent->end) - PAGE_SIZE;
897 	}
898 	vm_map_unlock_read(map);
899 	if (va == pm->pm_hiexec && tf->tf_cs == GSEL(GUCODEBIG_SEL, SEL_UPL))
900 		return (0);
901 
902 	pm->pm_hiexec = va;
903 	if (pm->pm_hiexec > I386_MAX_EXE_ADDR) {
904 		tf->tf_cs = GSEL(GUCODEBIG_SEL, SEL_UPL);
905 	} else {
906 		tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL);
907 		return (0);
908 	}
909 	return (1);
910 }
911 #endif /* !defined(__x86_64__) */
912 
913 void
914 pat_init(struct cpu_info *ci)
915 {
916 	uint64_t pat;
917 
918 	if (!(ci->ci_feat_val[0] & CPUID_PAT))
919 		return;
920 
921 	/* We change WT to WC. Leave all other entries the default values. */
922 	pat = PATENTRY(0, PAT_WB) | PATENTRY(1, PAT_WC) |
923 	      PATENTRY(2, PAT_UCMINUS) | PATENTRY(3, PAT_UC) |
924 	      PATENTRY(4, PAT_WB) | PATENTRY(5, PAT_WC) |
925 	      PATENTRY(6, PAT_UCMINUS) | PATENTRY(7, PAT_UC);
926 
927 	wrmsr(MSR_CR_PAT, pat);
928 	cpu_pat_enabled = true;
929 	aprint_debug_dev(ci->ci_dev, "PAT enabled\n");
930 }
931 
932 static pt_entry_t
933 pmap_pat_flags(u_int flags)
934 {
935 	u_int cacheflags = (flags & PMAP_CACHE_MASK);
936 
937 	if (!cpu_pat_enabled) {
938 		switch (cacheflags) {
939 		case PMAP_NOCACHE:
940 		case PMAP_NOCACHE_OVR:
941 			/* results in PGC_UCMINUS on CPUs which have
942 			 * the CPUID PAT feature but PAT "disabled"
943 			 */
944 			return PG_N;
945 		default:
946 			return 0;
947 		}
948 	}
949 
950 	switch (cacheflags) {
951 	case PMAP_NOCACHE:
952 		return PGC_UC;
953 	case PMAP_WRITE_COMBINE:
954 		return PGC_WC;
955 	case PMAP_WRITE_BACK:
956 		return PGC_WB;
957 	case PMAP_NOCACHE_OVR:
958 		return PGC_UCMINUS;
959 	}
960 
961 	return 0;
962 }
963 
964 /*
965  * p m a p   k e n t e r   f u n c t i o n s
966  *
967  * functions to quickly enter/remove pages from the kernel address
968  * space.   pmap_kremove is exported to MI kernel.  we make use of
969  * the recursive PTE mappings.
970  */
971 
972 /*
973  * pmap_kenter_pa: enter a kernel mapping without R/M (pv_entry) tracking
974  *
975  * => no need to lock anything, assume va is already allocated
976  * => should be faster than normal pmap enter function
977  */
978 
979 void
980 pmap_kenter_pa(vaddr_t va, paddr_t pa, vm_prot_t prot, u_int flags)
981 {
982 	pt_entry_t *pte, opte, npte;
983 
984 	KASSERT(!(prot & ~VM_PROT_ALL));
985 
986 	if (va < VM_MIN_KERNEL_ADDRESS)
987 		pte = vtopte(va);
988 	else
989 		pte = kvtopte(va);
990 #ifdef DOM0OPS
991 	if (pa < pmap_pa_start || pa >= pmap_pa_end) {
992 #ifdef DEBUG
993 		printf_nolog("%s: pa 0x%" PRIx64 " for va 0x%" PRIx64
994 		    " outside range\n", __func__, (int64_t)pa, (int64_t)va);
995 #endif /* DEBUG */
996 		npte = pa;
997 	} else
998 #endif /* DOM0OPS */
999 		npte = pmap_pa2pte(pa);
1000 	npte |= protection_codes[prot] | PG_k | PG_V | pmap_pg_g;
1001 	npte |= pmap_pat_flags(flags);
1002 	opte = pmap_pte_testset(pte, npte); /* zap! */
1003 #if defined(DIAGNOSTIC)
1004 	/* XXX For now... */
1005 	if (opte & PG_PS)
1006 		panic("%s: PG_PS", __func__);
1007 #endif
1008 	if ((opte & (PG_V | PG_U)) == (PG_V | PG_U)) {
1009 		/* This should not happen. */
1010 		printf_nolog("%s: mapping already present\n", __func__);
1011 		kpreempt_disable();
1012 		pmap_tlb_shootdown(pmap_kernel(), va, opte, TLBSHOOT_KENTER);
1013 		kpreempt_enable();
1014 	}
1015 }
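/*
 * Example (illustrative sketch, not compiled): the usual lifecycle of
 * an unmanaged kernel mapping made with pmap_kenter_pa().  "va" and
 * "pa" are assumed to have been obtained by the caller; the
 * pmap_update() calls publish the change before the VA is reused.
 */
#if 0
	pmap_kenter_pa(va, pa, VM_PROT_READ | VM_PROT_WRITE, 0);
	pmap_update(pmap_kernel());

	/* ... use the mapping at va ... */

	pmap_kremove(va, PAGE_SIZE);
	pmap_update(pmap_kernel());
#endif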
1016 
1017 void
1018 pmap_emap_enter(vaddr_t va, paddr_t pa, vm_prot_t prot)
1019 {
1020 	pt_entry_t *pte, npte;
1021 
1022 	KASSERT((prot & ~VM_PROT_ALL) == 0);
1023 	pte = (va < VM_MIN_KERNEL_ADDRESS) ? vtopte(va) : kvtopte(va);
1024 
1025 #ifdef DOM0OPS
1026 	if (pa < pmap_pa_start || pa >= pmap_pa_end) {
1027 		npte = pa;
1028 	} else
1029 #endif
1030 		npte = pmap_pa2pte(pa);
1033 	npte |= protection_codes[prot] | PG_k | PG_V;
1034 	pmap_pte_set(pte, npte);
1035 }
1036 
1037 /*
1038  * pmap_emap_sync: perform TLB flush or pmap load, if it was deferred.
1039  */
1040 void
1041 pmap_emap_sync(bool canload)
1042 {
1043 	struct cpu_info *ci = curcpu();
1044 	struct pmap *pmap;
1045 
1046 	KASSERT(kpreempt_disabled());
1047 	if (__predict_true(ci->ci_want_pmapload && canload)) {
1048 		/*
1049 		 * XXX: Hint for pmap_reactivate(), which might suggest to
1050 		 * not perform TLB flush, if state has not changed.
1051 		 */
1052 		pmap = vm_map_pmap(&curlwp->l_proc->p_vmspace->vm_map);
1053 		if (__predict_false(pmap == ci->ci_pmap)) {
1054 			kcpuset_atomic_clear(pmap->pm_cpus, cpu_index(ci));
1055 		}
1056 		pmap_load();
1057 		KASSERT(ci->ci_want_pmapload == 0);
1058 	} else {
1059 		tlbflush();
1060 	}
1061 
1062 }
1063 
1064 void
1065 pmap_emap_remove(vaddr_t sva, vsize_t len)
1066 {
1067 	pt_entry_t *pte;
1068 	vaddr_t va, eva = sva + len;
1069 
1070 	for (va = sva; va < eva; va += PAGE_SIZE) {
1071 		pte = (va < VM_MIN_KERNEL_ADDRESS) ? vtopte(va) : kvtopte(va);
1072 		pmap_pte_set(pte, 0);
1073 	}
1074 }
1075 
1076 __strict_weak_alias(pmap_kenter_ma, pmap_kenter_pa);
1077 
1078 #if defined(__x86_64__)
1079 /*
1080  * Change protection for a virtual address. Local for a CPU only, don't
1081  * care about TLB shootdowns.
1082  *
1083  * => must be called with preemption disabled
1084  */
1085 void
1086 pmap_changeprot_local(vaddr_t va, vm_prot_t prot)
1087 {
1088 	pt_entry_t *pte, opte, npte;
1089 
1090 	KASSERT(kpreempt_disabled());
1091 
1092 	if (va < VM_MIN_KERNEL_ADDRESS)
1093 		pte = vtopte(va);
1094 	else
1095 		pte = kvtopte(va);
1096 
1097 	npte = opte = *pte;
1098 
1099 	if ((prot & VM_PROT_WRITE) != 0)
1100 		npte |= PG_RW;
1101 	else
1102 		npte &= ~PG_RW;
1103 
1104 	if (opte != npte) {
1105 		pmap_pte_set(pte, npte);
1106 		pmap_pte_flush();
1107 		invlpg(va);
1108 	}
1109 }
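/*
 * Example (illustrative sketch, not compiled): pmap_changeprot_local()
 * is a purely local, shootdown-free toggle, so callers wrap it in a
 * kpreempt_disable()/kpreempt_enable() pair as required above.
 */
#if 0
	kpreempt_disable();
	pmap_changeprot_local(va, VM_PROT_READ);	/* drop write access */
	/* ... rely briefly on the read-only mapping ... */
	pmap_changeprot_local(va, VM_PROT_READ | VM_PROT_WRITE);
	kpreempt_enable();
#endif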
1110 #endif /* defined(__x86_64__) */
1111 
1112 /*
1113  * pmap_kremove: remove a kernel mapping(s) without R/M (pv_entry) tracking
1114  *
1115  * => no need to lock anything
1116  * => caller must dispose of any vm_page mapped in the va range
1117  * => note: not an inline function
1118  * => we assume the va is page aligned and the len is a multiple of PAGE_SIZE
1119  * => we assume kernel only unmaps valid addresses and thus don't bother
1120  *    checking the valid bit before doing TLB flushing
1121  * => must be followed by call to pmap_update() before reuse of page
1122  */
1123 
1124 static inline void
1125 pmap_kremove1(vaddr_t sva, vsize_t len, bool localonly)
1126 {
1127 	pt_entry_t *pte, opte;
1128 	vaddr_t va, eva;
1129 
1130 	eva = sva + len;
1131 
1132 	kpreempt_disable();
1133 	for (va = sva; va < eva; va += PAGE_SIZE) {
1134 		pte = kvtopte(va);
1135 		opte = pmap_pte_testset(pte, 0); /* zap! */
1136 		if ((opte & (PG_V | PG_U)) == (PG_V | PG_U) && !localonly) {
1137 			pmap_tlb_shootdown(pmap_kernel(), va, opte,
1138 			    TLBSHOOT_KREMOVE);
1139 		}
1140 		KASSERT((opte & PG_PS) == 0);
1141 		KASSERT((opte & PG_PVLIST) == 0);
1142 	}
1143 	if (localonly) {
1144 		tlbflushg();
1145 	}
1146 	kpreempt_enable();
1147 }
1148 
1149 void
1150 pmap_kremove(vaddr_t sva, vsize_t len)
1151 {
1152 
1153 	pmap_kremove1(sva, len, false);
1154 }
1155 
1156 /*
1157  * pmap_kremove_local: like pmap_kremove(), but only worry about
1158  * TLB invalidations on the current CPU.  this is only intended
1159  * for use while writing kernel crash dumps.
1160  */
1161 
1162 void
1163 pmap_kremove_local(vaddr_t sva, vsize_t len)
1164 {
1165 
1166 	KASSERT(panicstr != NULL);
1167 	pmap_kremove1(sva, len, true);
1168 }
1169 
1170 /*
1171  * p m a p   i n i t   f u n c t i o n s
1172  *
1173  * pmap_bootstrap and pmap_init are called during system startup
1174  * to init the pmap module.   pmap_bootstrap() does a low level
1175  * init just to get things rolling.   pmap_init() finishes the job.
1176  */
1177 
1178 /*
1179  * pmap_bootstrap: get the system in a state where it can run with VM
1180  *	properly enabled (called before main()).   the VM system is
1181  *      fully init'd later...
1182  *
1183  * => on i386, locore.s has already enabled the MMU by allocating
1184  *	a PDP for the kernel, and nkpde PTP's for the kernel.
1185  * => kva_start is the first free virtual address in kernel space
1186  */
1187 
1188 void
1189 pmap_bootstrap(vaddr_t kva_start)
1190 {
1191 	struct pmap *kpm;
1192 	pt_entry_t *pte;
1193 	int i;
1194 	vaddr_t kva;
1195 #ifndef XEN
1196 	pd_entry_t *pde;
1197 	unsigned long p1i;
1198 	vaddr_t kva_end;
1199 #endif
1200 #ifdef __HAVE_DIRECT_MAP
1201 	phys_ram_seg_t *mc;
1202 	long ndmpdp;
1203 	paddr_t lastpa, dmpd, dmpdp, pdp;
1204 	vaddr_t tmpva;
1205 #endif
1206 
1207 	pt_entry_t pg_nx = (cpu_feature[2] & CPUID_NOX ? PG_NX : 0);
1208 
1209 	/*
1210 	 * set up our local static global vars that keep track of the
1211 	 * usage of KVM before kernel_map is set up
1212 	 */
1213 
1214 	virtual_avail = kva_start;		/* first free KVA */
1215 	virtual_end = VM_MAX_KERNEL_ADDRESS;	/* last KVA */
1216 
1217 	/*
1218 	 * set up protection_codes: we need to be able to convert from
1219 	 * a MI protection code (some combo of VM_PROT...) to something
1220 	 * we can jam into a i386 PTE.
1221 	 */
1222 
1223 	protection_codes[VM_PROT_NONE] = pg_nx;			/* --- */
1224 	protection_codes[VM_PROT_EXECUTE] = PG_RO | PG_X;	/* --x */
1225 	protection_codes[VM_PROT_READ] = PG_RO | pg_nx;		/* -r- */
1226 	protection_codes[VM_PROT_READ|VM_PROT_EXECUTE] = PG_RO | PG_X;/* -rx */
1227 	protection_codes[VM_PROT_WRITE] = PG_RW | pg_nx;	/* w-- */
1228 	protection_codes[VM_PROT_WRITE|VM_PROT_EXECUTE] = PG_RW | PG_X;/* w-x */
1229 	protection_codes[VM_PROT_WRITE|VM_PROT_READ] = PG_RW | pg_nx;
1230 								/* wr- */
1231 	protection_codes[VM_PROT_ALL] = PG_RW | PG_X;		/* wrx */
1232 
1233 	/*
1234 	 * now we init the kernel's pmap
1235 	 *
1236 	 * the kernel pmap's pm_obj is not used for much.   however, in
1237 	 * user pmaps the pm_obj contains the list of active PTPs.
1238 	 * the pm_obj currently does not have a pager.   it might be possible
1239 	 * to add a pager that would allow a process to read-only mmap its
1240 	 * own page tables (fast user level vtophys?).   this may or may not
1241 	 * be useful.
1242 	 */
1243 
1244 	kpm = pmap_kernel();
1245 	for (i = 0; i < PTP_LEVELS - 1; i++) {
1246 		mutex_init(&kpm->pm_obj_lock[i], MUTEX_DEFAULT, IPL_NONE);
1247 		uvm_obj_init(&kpm->pm_obj[i], NULL, false, 1);
1248 		uvm_obj_setlock(&kpm->pm_obj[i], &kpm->pm_obj_lock[i]);
1249 		kpm->pm_ptphint[i] = NULL;
1250 	}
1251 	memset(&kpm->pm_list, 0, sizeof(kpm->pm_list));  /* pm_list not used */
1252 
1253 	kpm->pm_pdir = (pd_entry_t *)(PDPpaddr + KERNBASE);
1254 	for (i = 0; i < PDP_SIZE; i++)
1255 		kpm->pm_pdirpa[i] = PDPpaddr + PAGE_SIZE * i;
1256 
1257 	kpm->pm_stats.wired_count = kpm->pm_stats.resident_count =
1258 		x86_btop(kva_start - VM_MIN_KERNEL_ADDRESS);
1259 
1260 	kcpuset_create(&kpm->pm_cpus, true);
1261 	kcpuset_create(&kpm->pm_kernel_cpus, true);
1262 
1263 	/*
1264 	 * the above is just a rough estimate and not critical to the proper
1265 	 * operation of the system.
1266 	 */
1267 
1268 #ifndef XEN
1269 	/*
1270 	 * Begin to enable global TLB entries if they are supported.
1271 	 * The G bit has no effect until the CR4_PGE bit is set in CR4,
1272 	 * which happens in cpu_init(), which is run on each cpu
1273 	 * (and happens later)
1274 	 */
1275 
1276 	if (cpu_feature[0] & CPUID_PGE) {
1277 		pmap_pg_g = PG_G;		/* enable software */
1278 
1279 		/* add PG_G attribute to already mapped kernel pages */
1280 		if (KERNBASE == VM_MIN_KERNEL_ADDRESS) {
1281 			kva_end = virtual_avail;
1282 		} else {
1283 			extern vaddr_t eblob, esym;
1284 			kva_end = (vaddr_t)&end;
1285 			if (esym > kva_end)
1286 				kva_end = esym;
1287 			if (eblob > kva_end)
1288 				kva_end = eblob;
1289 			kva_end = roundup(kva_end, PAGE_SIZE);
1290 		}
1291 		for (kva = KERNBASE; kva < kva_end; kva += PAGE_SIZE) {
1292 			p1i = pl1_i(kva);
1293 			if (pmap_valid_entry(PTE_BASE[p1i]))
1294 				PTE_BASE[p1i] |= PG_G;
1295 		}
1296 	}
1297 
1298 	/*
1299 	 * enable large pages if they are supported.
1300 	 */
1301 
1302 	if (cpu_feature[0] & CPUID_PSE) {
1303 		paddr_t pa;
1304 		extern char __data_start;
1305 
1306 		lcr4(rcr4() | CR4_PSE);	/* enable hardware (via %cr4) */
1307 		pmap_largepages = 1;	/* enable software */
1308 
1309 		/*
1310 		 * the TLB must be flushed after enabling large pages
1311 		 * on Pentium CPUs, according to section 3.6.2.2 of
1312 		 * "Intel Architecture Software Developer's Manual,
1313 		 * Volume 3: System Programming".
1314 		 */
1315 		tlbflushg();
1316 
1317 		/*
1318 		 * now, remap the kernel text using large pages.  we
1319 		 * assume that the linker has properly aligned the
1320 		 * .data segment to a NBPD_L2 boundary.
1321 		 */
1322 		kva_end = rounddown((vaddr_t)&__data_start, NBPD_L1);
1323 		for (pa = 0, kva = KERNBASE; kva + NBPD_L2 <= kva_end;
1324 		     kva += NBPD_L2, pa += NBPD_L2) {
1325 			pde = &L2_BASE[pl2_i(kva)];
1326 			*pde = pa | pmap_pg_g | PG_PS |
1327 			    PG_KR | PG_V;	/* zap! */
1328 			tlbflushg();
1329 		}
1330 #if defined(DEBUG)
1331 		aprint_normal("kernel text is mapped with %" PRIuPSIZE " large "
1332 		    "pages and %" PRIuPSIZE " normal pages\n",
1333 		    howmany(kva - KERNBASE, NBPD_L2),
1334 		    howmany((vaddr_t)&__data_start - kva, NBPD_L1));
1335 #endif /* defined(DEBUG) */
1336 	}
1337 #endif /* !XEN */
1338 
1339 #ifdef __HAVE_DIRECT_MAP
1340 
1341 	tmpva = (KERNBASE + NKL2_KIMG_ENTRIES * NBPD_L2);
1342 	pte = PTE_BASE + pl1_i(tmpva);
1343 
1344 	/*
1345 	 * Map the direct map.  Use 1GB pages if they are available,
1346 	 * otherwise use 2MB pages.  Note that the unused parts of
1347 	 * PTPs must be zeroed out, as they might be accessed due
1348 	 * to speculative execution.  Also, PG_G is not allowed on
1349 	 * non-leaf PTPs.
1350 	 */
1351 
1352 	lastpa = 0;
1353 	for (i = 0; i < mem_cluster_cnt; i++) {
1354 		mc = &mem_clusters[i];
1355 		lastpa = MAX(lastpa, mc->start + mc->size);
1356 	}
1357 
1358 	ndmpdp = (lastpa + NBPD_L3 - 1) >> L3_SHIFT;
1359 	dmpdp = avail_start;	avail_start += PAGE_SIZE;
1360 
1361 	*pte = dmpdp | PG_V | PG_RW;
1362 	pmap_update_pg(tmpva);
1363 	memset((void *)tmpva, 0, PAGE_SIZE);
1364 
1365 	if (cpu_feature[2] & CPUID_P1GB) {
1366 		for (i = 0; i < ndmpdp; i++) {
1367 			pdp = (paddr_t)&(((pd_entry_t *)dmpdp)[i]);
1368 			*pte = (pdp & PG_FRAME) | PG_V | PG_RW;
1369 			pmap_update_pg(tmpva);
1370 
1371 			pde = (pd_entry_t *)(tmpva + (pdp & ~PG_FRAME));
1372 			*pde = ((paddr_t)i << L3_SHIFT) |
1373 				PG_RW | PG_V | PG_U | PG_PS | PG_G;
1374 		}
1375 	} else {
1376 		dmpd = avail_start;	avail_start += ndmpdp * PAGE_SIZE;
1377 
1378 		for (i = 0; i < ndmpdp; i++) {
1379 			pdp = dmpd + i * PAGE_SIZE;
1380 			*pte = (pdp & PG_FRAME) | PG_V | PG_RW;
1381 			pmap_update_pg(tmpva);
1382 
1383 			memset((void *)tmpva, 0, PAGE_SIZE);
1384 		}
1385 		for (i = 0; i < NPDPG * ndmpdp; i++) {
1386 			pdp = (paddr_t)&(((pd_entry_t *)dmpd)[i]);
1387 			*pte = (pdp & PG_FRAME) | PG_V | PG_RW;
1388 			pmap_update_pg(tmpva);
1389 
1390 			pde = (pd_entry_t *)(tmpva + (pdp & ~PG_FRAME));
1391 			*pde = ((paddr_t)i << L2_SHIFT) |
1392 				PG_RW | PG_V | PG_U | PG_PS | PG_G;
1393 		}
1394 		for (i = 0; i < ndmpdp; i++) {
1395 			pdp = (paddr_t)&(((pd_entry_t *)dmpdp)[i]);
1396 			*pte = (pdp & PG_FRAME) | PG_V | PG_RW;
1397 			pmap_update_pg((vaddr_t)tmpva);
1398 
1399 			pde = (pd_entry_t *)(tmpva + (pdp & ~PG_FRAME));
1400 			*pde = (dmpd + (i << PAGE_SHIFT)) |
1401 				PG_RW | PG_V | PG_U;
1402 		}
1403 	}
1404 
1405 	kpm->pm_pdir[PDIR_SLOT_DIRECT] = dmpdp | PG_KW | PG_V | PG_U;
1406 
1407 	tlbflush();
1408 
1409 #else
1410 	if (VM_MIN_KERNEL_ADDRESS != KERNBASE) {
1411 		/*
1412 		 * zero_pte is stuck at the end of mapped space for the kernel
1413 		 * image (disjunct from kva space). This is done so that it
1414 		 * can safely be used in pmap_growkernel (pmap_get_physpage),
1415 		 * when it's called for the first time.
1416 		 * XXXfvdl fix this for MULTIPROCESSOR later.
1417 		 */
1418 #ifdef XEN
1419 		/* early_zerop initialized in xen_pmap_bootstrap() */
1420 #else
1421 		early_zerop = (void *)(KERNBASE + NKL2_KIMG_ENTRIES * NBPD_L2);
1422 #endif
1423 		early_zero_pte = PTE_BASE + pl1_i((vaddr_t)early_zerop);
1424 	}
1425 
1426 	/*
1427 	 * now we allocate the "special" VAs which are used for tmp mappings
1428 	 * by the pmap (and other modules).    we allocate the VAs by advancing
1429 	 * virtual_avail (note that there are no pages mapped at these VAs).
1430 	 * we find the PTE that maps the allocated VA via the linear PTE
1431 	 * mapping.
1432 	 */
1433 
1434 	pte = PTE_BASE + pl1_i(virtual_avail);
1435 
1436 #ifdef MULTIPROCESSOR
1437 	/*
1438 	 * Waste some VA space to avoid false sharing of cache lines
1439 	 * for page table pages: Give each possible CPU a cache line
1440 	 * of PTE's (8) to play with, though we only need 4.  We could
1441 	 * recycle some of this waste by putting the idle stacks here
1442 	 * as well; we could waste less space if we knew the largest
1443 	 * CPU ID beforehand.
1444 	 */
1445 	csrcp = (char *) virtual_avail;  csrc_pte = pte;
1446 
1447 	cdstp = (char *) virtual_avail+PAGE_SIZE;  cdst_pte = pte+1;
1448 
1449 	zerop = (char *) virtual_avail+PAGE_SIZE*2;  zero_pte = pte+2;
1450 
1451 	ptpp = (char *) virtual_avail+PAGE_SIZE*3;  ptp_pte = pte+3;
1452 
1453 	virtual_avail += PAGE_SIZE * maxcpus * NPTECL;
1454 	pte += maxcpus * NPTECL;
1455 #else
1456 	csrcp = (void *) virtual_avail;  csrc_pte = pte;	/* allocate */
1457 	virtual_avail += PAGE_SIZE; pte++;			/* advance */
1458 
1459 	cdstp = (void *) virtual_avail;  cdst_pte = pte;
1460 	virtual_avail += PAGE_SIZE; pte++;
1461 
1462 	zerop = (void *) virtual_avail;  zero_pte = pte;
1463 	virtual_avail += PAGE_SIZE; pte++;
1464 
1465 	ptpp = (void *) virtual_avail;  ptp_pte = pte;
1466 	virtual_avail += PAGE_SIZE; pte++;
1467 #endif
1468 
1469 	if (VM_MIN_KERNEL_ADDRESS == KERNBASE) {
1470 		early_zerop = zerop;
1471 		early_zero_pte = zero_pte;
1472 	}
1473 #endif
1474 
1475 	/*
1476 	 * Nothing after this point actually needs pte.
1477 	 */
1478 	pte = (void *)0xdeadbeef;
1479 
1480 #ifdef XEN
1481 #ifdef __x86_64__
1482 	/*
1483 	 * We want a dummy page directory for Xen:
1484 	 * when we deactivate a pmap, Xen will still consider it active.
1485 	 * So we set the user PGD to this one to lift all protection on
1486 	 * the now-inactive page table set.
1487 	 */
1488 	xen_dummy_user_pgd = avail_start;
1489 	avail_start += PAGE_SIZE;
1490 
1491 	/* Zero fill it, the less checks in Xen it requires the better */
1492 	memset((void *) (xen_dummy_user_pgd + KERNBASE), 0, PAGE_SIZE);
1493 	/* Mark read-only */
1494 	HYPERVISOR_update_va_mapping(xen_dummy_user_pgd + KERNBASE,
1495 	    pmap_pa2pte(xen_dummy_user_pgd) | PG_u | PG_V, UVMF_INVLPG);
1496 	/* Pin as L4 */
1497 	xpq_queue_pin_l4_table(xpmap_ptom_masked(xen_dummy_user_pgd));
1498 #endif /* __x86_64__ */
1499 	idt_vaddr = virtual_avail;                      /* don't need pte */
1500 	idt_paddr = avail_start;                        /* steal a page */
1501 	/*
1502 	 * Xen requires one more page, as we can't store the
1503 	 * GDT and LDT on the same page.
1504 	 */
1505 	virtual_avail += 3 * PAGE_SIZE;
1506 	avail_start += 3 * PAGE_SIZE;
1507 #else /* XEN */
1508 	idt_vaddr = virtual_avail;			/* don't need pte */
1509 	idt_paddr = avail_start;			/* steal a page */
1510 #if defined(__x86_64__)
1511 	virtual_avail += 2 * PAGE_SIZE;
1512 	avail_start += 2 * PAGE_SIZE;
1513 #else /* defined(__x86_64__) */
1514 	virtual_avail += PAGE_SIZE;
1515 	avail_start += PAGE_SIZE;
1516 	/* pentium f00f bug stuff */
1517 	pentium_idt_vaddr = virtual_avail;		/* don't need pte */
1518 	virtual_avail += PAGE_SIZE;
1519 #endif /* defined(__x86_64__) */
1520 #endif /* XEN */
1521 
1522 #ifdef _LP64
1523 	/*
1524 	 * Grab a page below 4G for things that need it (i.e.
1525 	 * having an initial %cr3 for the MP trampoline).
1526 	 */
1527 	lo32_vaddr = virtual_avail;
1528 	virtual_avail += PAGE_SIZE;
1529 	lo32_paddr = avail_start;
1530 	avail_start += PAGE_SIZE;
1531 #endif
1532 
1533 	/*
1534 	 * now we reserve some VM for mapping pages when doing a crash dump
1535 	 */
1536 
1537 	virtual_avail = reserve_dumppages(virtual_avail);
1538 
1539 	/*
1540 	 * init the static-global locks and global lists.
1541 	 *
1542 	 * => pventry::pvh_lock (initialized elsewhere) must also be
1543 	 *      a spin lock, again at IPL_VM to prevent deadlock, and
1544 	 *	again is never taken from interrupt context.
1545 	 */
1546 
1547 	mutex_init(&pmaps_lock, MUTEX_DEFAULT, IPL_NONE);
1548 	LIST_INIT(&pmaps);
1549 
1550 	/*
1551 	 * ensure the TLB is sync'd with reality by flushing it...
1552 	 */
1553 
1554 	tlbflushg();
1555 
1556 	/*
1557 	 * calculate pmap_maxkvaddr from nkptp[].
1558 	 */
1559 
1560 	kva = VM_MIN_KERNEL_ADDRESS;
1561 	for (i = PTP_LEVELS - 1; i >= 1; i--) {
1562 		kva += nkptp[i] * nbpd[i];
1563 	}
1564 	pmap_maxkvaddr = kva;
1565 }
1566 
1567 #if defined(__x86_64__)
1568 /*
1569  * Pre-allocate PTPs for low memory, so that 1:1 mappings for various
1570  * trampoline code can be entered.
1571  */
1572 void
1573 pmap_prealloc_lowmem_ptps(void)
1574 {
1575 	int level;
1576 	paddr_t newp;
1577 	pd_entry_t *pdes;
1578 
1579 	const pd_entry_t pteflags = PG_k | PG_V | PG_RW;
1580 
1581 	pdes = pmap_kernel()->pm_pdir;
1582 	level = PTP_LEVELS;
1583 	for (;;) {
1584 		newp = avail_start;
1585 		avail_start += PAGE_SIZE;
1586 #ifdef __HAVE_DIRECT_MAP
1587 		memset((void *)PMAP_DIRECT_MAP(newp), 0, PAGE_SIZE);
1588 #else
1589 		pmap_pte_set(early_zero_pte, pmap_pa2pte(newp) | pteflags);
1590 		pmap_pte_flush();
1591 		pmap_update_pg((vaddr_t)early_zerop);
1592 		memset(early_zerop, 0, PAGE_SIZE);
1593 #endif
1594 
1595 #ifdef XEN
1596 		/* Mark R/O before installing */
1597 		HYPERVISOR_update_va_mapping ((vaddr_t)early_zerop,
1598 		    xpmap_ptom_masked(newp) | PG_u | PG_V, UVMF_INVLPG);
1599 		if (newp < (NKL2_KIMG_ENTRIES * NBPD_L2))
1600 			HYPERVISOR_update_va_mapping (newp + KERNBASE,
1601 			    xpmap_ptom_masked(newp) | PG_u | PG_V, UVMF_INVLPG);
1602 
1603 
1604 		if (level == PTP_LEVELS) { /* Top level pde is per-cpu */
1605 			pd_entry_t *kpm_pdir;
1606 			/* Reach it via recursive mapping */
1607 			kpm_pdir = normal_pdes[PTP_LEVELS - 2];
1608 
1609 			/* Set it as usual. We can't defer this
1610 			 * outside the loop since recursive
1611 			 * pte entries won't be accessible during
1612 			 * further iterations at lower levels
1613 			 * otherwise.
1614 			 */
1615 			pmap_pte_set(&kpm_pdir[pl_i(0, PTP_LEVELS)],
1616 			    pmap_pa2pte(newp) | pteflags);
1617 		}
1618 
1619 #endif /* XEN */
1620 		pmap_pte_set(&pdes[pl_i(0, level)],
1621 		    pmap_pa2pte(newp) | pteflags);
1622 
1623 		pmap_pte_flush();
1624 
1625 		level--;
1626 		if (level <= 1)
1627 			break;
1628 		pdes = normal_pdes[level - 2];
1629 	}
1630 }
1631 #endif /* defined(__x86_64__) */
1632 
1633 /*
1634  * pmap_init: called from uvm_init, our job is to get the pmap
1635  * system ready to manage mappings...
1636  */
1637 
1638 void
1639 pmap_init(void)
1640 {
1641 	int i, flags;
1642 
1643 	for (i = 0; i < PV_HASH_SIZE; i++) {
1644 		SLIST_INIT(&pv_hash_heads[i].hh_list);
1645 	}
1646 	for (i = 0; i < PV_HASH_LOCK_CNT; i++) {
1647 		mutex_init(&pv_hash_locks[i].lock, MUTEX_NODEBUG, IPL_VM);
1648 	}
1649 
1650 	/*
1651 	 * initialize caches.
1652 	 */
1653 
1654 	pool_cache_bootstrap(&pmap_cache, sizeof(struct pmap), 0, 0, 0,
1655 	    "pmappl", NULL, IPL_NONE, NULL, NULL, NULL);
1656 
1657 #ifdef XEN
1658 	/*
1659 	 * pool_cache(9) should not touch cached objects, since they
1660 	 * are pinned on xen and R/O for the domU
1661 	 */
1662 	flags = PR_NOTOUCH;
1663 #else /* XEN */
1664 	flags = 0;
1665 #endif /* XEN */
1666 #ifdef PAE
1667 	pool_cache_bootstrap(&pmap_pdp_cache, PAGE_SIZE * PDP_SIZE, 0, 0, flags,
1668 	    "pdppl", &pmap_pdp_allocator, IPL_NONE,
1669 	    pmap_pdp_ctor, pmap_pdp_dtor, NULL);
1670 #else /* PAE */
1671 	pool_cache_bootstrap(&pmap_pdp_cache, PAGE_SIZE, 0, 0, flags,
1672 	    "pdppl", NULL, IPL_NONE, pmap_pdp_ctor, pmap_pdp_dtor, NULL);
1673 #endif /* PAE */
1674 	pool_cache_bootstrap(&pmap_pv_cache, sizeof(struct pv_entry), 0, 0,
1675 	    PR_LARGECACHE, "pvpl", &pool_allocator_kmem, IPL_NONE, NULL,
1676 	    NULL, NULL);
1677 
1678 	pmap_tlb_init();
1679 
1680 	/* XXX: Since cpu_hatch() is only for secondary CPUs. */
1681 	pmap_tlb_cpu_init(curcpu());
1682 
1683 	evcnt_attach_dynamic(&pmap_iobmp_evcnt, EVCNT_TYPE_MISC,
1684 	    NULL, "x86", "io bitmap copy");
1685 	evcnt_attach_dynamic(&pmap_ldt_evcnt, EVCNT_TYPE_MISC,
1686 	    NULL, "x86", "ldt sync");
1687 
1688 	/*
1689 	 * done: pmap module is up (and ready for business)
1690 	 */
1691 
1692 	pmap_initialized = true;
1693 }
1694 
1695 /*
1696  * pmap_cpu_init_late: perform late per-CPU initialization.
1697  */
1698 
1699 #ifndef XEN
1700 void
1701 pmap_cpu_init_late(struct cpu_info *ci)
1702 {
1703 	/*
1704 	 * The BP already has its own PD page allocated during early
1705 	 * MD startup.
1706 	 */
1707 	if (ci == &cpu_info_primary)
1708 		return;
1709 
1710 #ifdef PAE
1711 	cpu_alloc_l3_page(ci);
1712 #endif
1713 }
1714 #endif
1715 
1716 /*
1717  * p v _ e n t r y   f u n c t i o n s
1718  */
1719 
1720 /*
1721  * pmap_free_pvs: free a list of pv_entrys
1722  */
1723 
1724 static void
1725 pmap_free_pvs(struct pv_entry *pve)
1726 {
1727 	struct pv_entry *next;
1728 
1729 	for ( /* null */ ; pve != NULL ; pve = next) {
1730 		next = pve->pve_next;
1731 		pool_cache_put(&pmap_pv_cache, pve);
1732 	}
1733 }
1734 
1735 /*
1736  * main pv_entry manipulation functions:
1737  *   pmap_enter_pv: enter a mapping onto a pv_head list
1738  *   pmap_remove_pv: remove a mapping from a pv_head list
1739  *
1740  * NOTE: Both pmap_enter_pv and pmap_remove_pv expect the caller to lock
1741  *       the pvh before calling
1742  */
1743 
1744 /*
1745  * insert_pv: a helper of pmap_enter_pv
1746  */
1747 
1748 static void
1749 insert_pv(struct pmap_page *pp, struct pv_entry *pve)
1750 {
1751 	struct pv_hash_head *hh;
1752 	kmutex_t *lock;
1753 	u_int hash;
1754 
1755 	hash = pvhash_hash(pve->pve_pte.pte_ptp, pve->pve_pte.pte_va);
1756 	lock = pvhash_lock(hash);
1757 	hh = pvhash_head(hash);
1758 	mutex_spin_enter(lock);
1759 	SLIST_INSERT_HEAD(&hh->hh_list, pve, pve_hash);
1760 	mutex_spin_exit(lock);
1761 
1762 	LIST_INSERT_HEAD(&pp->pp_head.pvh_list, pve, pve_list);
1763 }
1764 
1765 /*
1766  * pmap_enter_pv: enter a mapping onto a pv_head list
1767  *
1768  * => caller should adjust ptp's wire_count before calling
1769  */
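/*
 * => if the mapping can use the embedded pp_pte slot, the preallocated
 *    pve is returned unused; a NULL return means the pve (and possibly
 *    *sparepve) was consumed
 */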
1770 
1771 static struct pv_entry *
1772 pmap_enter_pv(struct pmap_page *pp,
1773 	      struct pv_entry *pve,	/* preallocated pve for us to use */
1774 	      struct pv_entry **sparepve,
1775 	      struct vm_page *ptp,
1776 	      vaddr_t va)
1777 {
1778 
1779 	KASSERT(ptp == NULL || ptp->wire_count >= 2);
1780 	KASSERT(ptp == NULL || ptp->uobject != NULL);
1781 	KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset);
1782 
1783 	if ((pp->pp_flags & PP_EMBEDDED) == 0) {
1784 		if (LIST_EMPTY(&pp->pp_head.pvh_list)) {
1785 			pp->pp_flags |= PP_EMBEDDED;
1786 			pp->pp_pte.pte_ptp = ptp;
1787 			pp->pp_pte.pte_va = va;
1788 
1789 			return pve;
1790 		}
1791 	} else {
1792 		struct pv_entry *pve2;
1793 
1794 		pve2 = *sparepve;
1795 		*sparepve = NULL;
1796 
1797 		pve2->pve_pte = pp->pp_pte;
1798 		pp->pp_flags &= ~PP_EMBEDDED;
1799 		LIST_INIT(&pp->pp_head.pvh_list);
1800 		insert_pv(pp, pve2);
1801 	}
1802 
1803 	pve->pve_pte.pte_ptp = ptp;
1804 	pve->pve_pte.pte_va = va;
1805 	insert_pv(pp, pve);
1806 
1807 	return NULL;
1808 }
1809 
1810 /*
1811  * pmap_remove_pv: try to remove a mapping from a pv_list
1812  *
1813  * => caller should adjust ptp's wire_count and free PTP if needed
1814  * => we return the removed pve
1815  */
1816 
1817 static struct pv_entry *
1818 pmap_remove_pv(struct pmap_page *pp, struct vm_page *ptp, vaddr_t va)
1819 {
1820 	struct pv_hash_head *hh;
1821 	struct pv_entry *pve;
1822 	kmutex_t *lock;
1823 	u_int hash;
1824 
1825 	KASSERT(ptp == NULL || ptp->uobject != NULL);
1826 	KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset);
1827 
1828 	if ((pp->pp_flags & PP_EMBEDDED) != 0) {
1829 		KASSERT(pp->pp_pte.pte_ptp == ptp);
1830 		KASSERT(pp->pp_pte.pte_va == va);
1831 
1832 		pp->pp_flags &= ~PP_EMBEDDED;
1833 		LIST_INIT(&pp->pp_head.pvh_list);
1834 
1835 		return NULL;
1836 	}
1837 
1838 	hash = pvhash_hash(ptp, va);
1839 	lock = pvhash_lock(hash);
1840 	hh = pvhash_head(hash);
1841 	mutex_spin_enter(lock);
1842 	pve = pvhash_remove(hh, ptp, va);
1843 	mutex_spin_exit(lock);
1844 
1845 	LIST_REMOVE(pve, pve_list);
1846 
1847 	return pve;
1848 }
1849 
1850 /*
1851  * p t p   f u n c t i o n s
1852  */
1853 
1854 static inline struct vm_page *
1855 pmap_find_ptp(struct pmap *pmap, vaddr_t va, paddr_t pa, int level)
1856 {
1857 	int lidx = level - 1;
1858 	struct vm_page *pg;
1859 
1860 	KASSERT(mutex_owned(pmap->pm_lock));
1861 
1862 	if (pa != (paddr_t)-1 && pmap->pm_ptphint[lidx] &&
1863 	    pa == VM_PAGE_TO_PHYS(pmap->pm_ptphint[lidx])) {
1864 		return (pmap->pm_ptphint[lidx]);
1865 	}
1866 	PMAP_SUBOBJ_LOCK(pmap, lidx);
1867 	pg = uvm_pagelookup(&pmap->pm_obj[lidx], ptp_va2o(va, level));
1868 	PMAP_SUBOBJ_UNLOCK(pmap, lidx);
1869 
1870 	KASSERT(pg == NULL || pg->wire_count >= 1);
1871 	return pg;
1872 }
1873 
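/*
 * pmap_freepage: mark a PTP as no longer used, without freeing the page.
 *
 * => the page is taken off the pmap's object and queued on curlwp's
 *    md_gc_ptp list; it is actually freed later, once pending TLB
 *    shootdowns have completed (see pmap_free_ptps())
 */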
1874 static inline void
1875 pmap_freepage(struct pmap *pmap, struct vm_page *ptp, int level)
1876 {
1877 	lwp_t *l;
1878 	int lidx;
1879 	struct uvm_object *obj;
1880 
1881 	KASSERT(ptp->wire_count == 1);
1882 
1883 	lidx = level - 1;
1884 
1885 	obj = &pmap->pm_obj[lidx];
1886 	pmap_stats_update(pmap, -1, 0);
1887 	if (lidx != 0)
1888 		mutex_enter(obj->vmobjlock);
1889 	if (pmap->pm_ptphint[lidx] == ptp)
1890 		pmap->pm_ptphint[lidx] = TAILQ_FIRST(&obj->memq);
1891 	ptp->wire_count = 0;
1892 	uvm_pagerealloc(ptp, NULL, 0);
1893 	l = curlwp;
1894 	KASSERT((l->l_pflag & LP_INTR) == 0);
1895 	VM_PAGE_TO_PP(ptp)->pp_link = l->l_md.md_gc_ptp;
1896 	l->l_md.md_gc_ptp = ptp;
1897 	if (lidx != 0)
1898 		mutex_exit(obj->vmobjlock);
1899 }
1900 
1901 static void
1902 pmap_free_ptp(struct pmap *pmap, struct vm_page *ptp, vaddr_t va,
1903 	      pt_entry_t *ptes, pd_entry_t * const *pdes)
1904 {
1905 	unsigned long index;
1906 	int level;
1907 	vaddr_t invaladdr;
1908 	pd_entry_t opde;
1909 
1910 	KASSERT(pmap != pmap_kernel());
1911 	KASSERT(mutex_owned(pmap->pm_lock));
1912 	KASSERT(kpreempt_disabled());
1913 
1914 	level = 1;
1915 	do {
1916 		index = pl_i(va, level + 1);
1917 		opde = pmap_pte_testset(&pdes[level - 1][index], 0);
1918 #if defined(XEN)
1919 #  if defined(__x86_64__)
1920 		/*
1921 		 * If the PTP is an L3 currently mapped in kernel space
1922 		 * on any CPU, clear it before freeing it.
1923 		 */
1924 		if (level == PTP_LEVELS - 1) {
1925 			/*
1926 			 * Update the per-cpu PD on all cpus the current
1927 			 * pmap is active on
1928 			 */
1929 			xen_kpm_sync(pmap, index);
1930 		}
1931 #  endif /*__x86_64__ */
1932 		invaladdr = level == 1 ? (vaddr_t)ptes :
1933 		    (vaddr_t)pdes[level - 2];
1934 		pmap_tlb_shootdown(pmap, invaladdr + index * PAGE_SIZE,
1935 		    opde, TLBSHOOT_FREE_PTP1);
1936 		pmap_tlb_shootnow();
1937 #else	/* XEN */
1938 		invaladdr = level == 1 ? (vaddr_t)ptes :
1939 		    (vaddr_t)pdes[level - 2];
1940 		pmap_tlb_shootdown(pmap, invaladdr + index * PAGE_SIZE,
1941 		    opde, TLBSHOOT_FREE_PTP1);
1942 #endif	/* XEN */
1943 		pmap_freepage(pmap, ptp, level);
1944 		if (level < PTP_LEVELS - 1) {
1945 			ptp = pmap_find_ptp(pmap, va, (paddr_t)-1, level + 1);
1946 			ptp->wire_count--;
1947 			if (ptp->wire_count > 1)
1948 				break;
1949 		}
1950 	} while (++level < PTP_LEVELS);
1951 	pmap_pte_flush();
1952 }
1953 
1954 /*
1955  * pmap_get_ptp: get a PTP (if there isn't one, allocate a new one)
1956  *
1957  * => pmap should NOT be pmap_kernel()
1958  * => pmap should be locked
1959  * => preemption should be disabled
1960  */
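/*
 * Note: a PTP's wire_count is one more than the number of valid entries
 * it holds, so each newly allocated child PTP bumps its parent's
 * wire_count, and a PTP whose wire_count drops to 1 can be freed.
 */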
1961 
1962 static struct vm_page *
1963 pmap_get_ptp(struct pmap *pmap, vaddr_t va, pd_entry_t * const *pdes)
1964 {
1965 	struct vm_page *ptp, *pptp;
1966 	int i;
1967 	unsigned long index;
1968 	pd_entry_t *pva;
1969 	paddr_t ppa, pa;
1970 	struct uvm_object *obj;
1971 
1972 	KASSERT(pmap != pmap_kernel());
1973 	KASSERT(mutex_owned(pmap->pm_lock));
1974 	KASSERT(kpreempt_disabled());
1975 
1976 	ptp = NULL;
1977 	pa = (paddr_t)-1;
1978 
1979 	/*
1980 	 * Loop through all page table levels seeing if we need to
1981 	 * add a new page to that level.
1982 	 */
1983 	for (i = PTP_LEVELS; i > 1; i--) {
1984 		/*
1985 		 * Save values from previous round.
1986 		 */
1987 		pptp = ptp;
1988 		ppa = pa;
1989 
1990 		index = pl_i(va, i);
1991 		pva = pdes[i - 2];
1992 
1993 		if (pmap_valid_entry(pva[index])) {
1994 			ppa = pmap_pte2pa(pva[index]);
1995 			ptp = NULL;
1996 			continue;
1997 		}
1998 
1999 		obj = &pmap->pm_obj[i-2];
2000 		PMAP_SUBOBJ_LOCK(pmap, i - 2);
2001 		ptp = uvm_pagealloc(obj, ptp_va2o(va, i - 1), NULL,
2002 		    UVM_PGA_USERESERVE|UVM_PGA_ZERO);
2003 		PMAP_SUBOBJ_UNLOCK(pmap, i - 2);
2004 
2005 		if (ptp == NULL)
2006 			return NULL;
2007 
2008 		ptp->flags &= ~PG_BUSY; /* never busy */
2009 		ptp->wire_count = 1;
2010 		pmap->pm_ptphint[i - 2] = ptp;
2011 		pa = VM_PAGE_TO_PHYS(ptp);
2012 		pmap_pte_set(&pva[index], (pd_entry_t)
2013 		        (pmap_pa2pte(pa) | PG_u | PG_RW | PG_V));
2014 #if defined(XEN) && defined(__x86_64__)
2015 		if (i == PTP_LEVELS) {
2016 			/*
2017 			 * Update the per-cpu PD on all cpus the current
2018 			 * pmap is active on
2019 			 */
2020 			xen_kpm_sync(pmap, index);
2021 		}
2022 #endif
2023 		pmap_pte_flush();
2024 		pmap_stats_update(pmap, 1, 0);
2025 		/*
2026 		 * If we're not in the top level, increase the
2027 		 * wire count of the parent page.
2028 		 */
2029 		if (i < PTP_LEVELS) {
2030 			if (pptp == NULL) {
2031 				pptp = pmap_find_ptp(pmap, va, ppa, i);
2032 				KASSERT(pptp != NULL);
2033 			}
2034 			pptp->wire_count++;
2035 		}
2036 	}
2037 
2038 	/*
2039 	 * PTP is not NULL if we just allocated a new PTP.  If it is
2040 	 * still NULL, we must look up the existing one.
2041 	 */
2042 	if (ptp == NULL) {
2043 		ptp = pmap_find_ptp(pmap, va, ppa, 1);
2044 		KASSERTMSG(ptp != NULL, "pmap_get_ptp: va %" PRIxVADDR
2045 		    "ppa %" PRIxPADDR "\n", va, ppa);
2046 	}
2047 
2048 	pmap->pm_ptphint[0] = ptp;
2049 	return ptp;
2050 }
2051 
2052 /*
2053  * p m a p  l i f e c y c l e   f u n c t i o n s
2054  */
2055 
2056 /*
2057  * pmap_pdp_ctor: constructor for the PDP cache.
2058  */
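/*
 * The constructor installs the recursive PDIR_SLOT_PTE entries, copies
 * the kernel PDEs (PDIR_SLOT_KERN) and, where present, the direct-map
 * slot.  On Xen/amd64 the recursive slot is left invalid and no kernel
 * PDEs are copied; under Xen the pages are also made read-only and
 * pinned as page-table pages.
 */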
2059 static int
2060 pmap_pdp_ctor(void *arg, void *v, int flags)
2061 {
2062 	pd_entry_t *pdir = v;
2063 	paddr_t pdirpa = 0;	/* XXX: GCC */
2064 	vaddr_t object;
2065 	int i;
2066 
2067 #if !defined(XEN) || !defined(__x86_64__)
2068 	int npde;
2069 #endif
2070 #ifdef XEN
2071 	int s;
2072 #endif
2073 
2074 	/*
2075 	 * NOTE: The `pmaps_lock' is held when the PDP is allocated.
2076 	 */
2077 
2078 #if defined(XEN) && defined(__x86_64__)
2079 	/* fetch the physical address of the page directory. */
2080 	(void) pmap_extract(pmap_kernel(), (vaddr_t) pdir, &pdirpa);
2081 
2082 	/* zero init area */
2083 	memset(pdir, 0, PAGE_SIZE); /* Xen wants a clean page */
2084 	/*
2085 	 * this pdir will NEVER be active in kernel mode
2086 	 * so mark recursive entry invalid
2087 	 */
2088 	pdir[PDIR_SLOT_PTE] = pmap_pa2pte(pdirpa) | PG_u;
2089 	/*
2090 	 * A PDP constructed this way is never used by the kernel,
2091 	 * so we don't enter kernel mappings on Xen.
2092 	 * But we need to make pmap_create() happy, so put a dummy (without
2093 	 * PG_V) value at the right place.
2094 	 */
2095 	pdir[PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1] - 1] =
2096 	     (pd_entry_t)-1 & PG_FRAME;
2097 #else /* XEN && __x86_64__*/
2098 	/* zero init area */
2099 	memset(pdir, 0, PDIR_SLOT_PTE * sizeof(pd_entry_t));
2100 
2101 	object = (vaddr_t)v;
2102 	for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) {
2103 		/* fetch the physical address of the page directory. */
2104 		(void) pmap_extract(pmap_kernel(), object, &pdirpa);
2105 		/* put in recursive PDE to map the PTEs */
2106 		pdir[PDIR_SLOT_PTE + i] = pmap_pa2pte(pdirpa) | PG_V;
2107 #ifndef XEN
2108 		pdir[PDIR_SLOT_PTE + i] |= PG_KW;
2109 #endif
2110 	}
2111 
2112 	/* copy kernel's PDE */
2113 	npde = nkptp[PTP_LEVELS - 1];
2114 
2115 	memcpy(&pdir[PDIR_SLOT_KERN], &PDP_BASE[PDIR_SLOT_KERN],
2116 	    npde * sizeof(pd_entry_t));
2117 
2118 	/* zero the rest */
2119 	memset(&pdir[PDIR_SLOT_KERN + npde], 0, (PAGE_SIZE * PDP_SIZE) -
2120 	    (PDIR_SLOT_KERN + npde) * sizeof(pd_entry_t));
2121 
2122 	if (VM_MIN_KERNEL_ADDRESS != KERNBASE) {
2123 		int idx = pl_i(KERNBASE, PTP_LEVELS);
2124 
2125 		pdir[idx] = PDP_BASE[idx];
2126 	}
2127 
2128 #ifdef __HAVE_DIRECT_MAP
2129 	pdir[PDIR_SLOT_DIRECT] = PDP_BASE[PDIR_SLOT_DIRECT];
2130 #endif
2131 
2132 #endif /* XEN  && __x86_64__*/
2133 #ifdef XEN
2134 	s = splvm();
2135 	object = (vaddr_t)v;
2136 	pmap_protect(pmap_kernel(), object, object + (PAGE_SIZE * PDP_SIZE),
2137 	    VM_PROT_READ);
2138 	pmap_update(pmap_kernel());
2139 	for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) {
2140 		/*
2141 		 * pin as an L2/L4 page; the page holding the
2142 		 * PDIR_SLOT_PTE entries has to be done last
2143 		 */
2144 #ifdef PAE
2145 		if (i == l2tol3(PDIR_SLOT_PTE))
2146 			continue;
2147 #endif
2148 
2149 		(void) pmap_extract(pmap_kernel(), object, &pdirpa);
2150 #ifdef __x86_64__
2151 		xpq_queue_pin_l4_table(xpmap_ptom_masked(pdirpa));
2152 #else
2153 		xpq_queue_pin_l2_table(xpmap_ptom_masked(pdirpa));
2154 #endif
2155 	}
2156 #ifdef PAE
2157 	object = ((vaddr_t)pdir) + PAGE_SIZE * l2tol3(PDIR_SLOT_PTE);
2158 	(void)pmap_extract(pmap_kernel(), object, &pdirpa);
2159 	xpq_queue_pin_l2_table(xpmap_ptom_masked(pdirpa));
2160 #endif
2161 	splx(s);
2162 #endif /* XEN */
2163 
2164 	return (0);
2165 }
2166 
2167 /*
2168  * pmap_pdp_dtor: destructor for the PDP cache.
2169  */
2170 
2171 static void
2172 pmap_pdp_dtor(void *arg, void *v)
2173 {
2174 #ifdef XEN
2175 	paddr_t pdirpa = 0;	/* XXX: GCC */
2176 	vaddr_t object = (vaddr_t)v;
2177 	int i;
2178 	int s = splvm();
2179 	pt_entry_t *pte;
2180 
2181 	for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) {
2182 		/* fetch the physical address of the page directory. */
2183 		(void) pmap_extract(pmap_kernel(), object, &pdirpa);
2184 		/* unpin page table */
2185 		xpq_queue_unpin_table(xpmap_ptom_masked(pdirpa));
2186 	}
2187 	object = (vaddr_t)v;
2188 	for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) {
2189 		/* Set page RW again */
2190 		pte = kvtopte(object);
2191 		pmap_pte_set(pte, *pte | PG_RW);
2192 		xen_bcast_invlpg((vaddr_t)object);
2193 	}
2194 	splx(s);
2195 #endif  /* XEN */
2196 }
2197 
2198 #ifdef PAE
2199 
2200 /* pmap_pdp_alloc: Allocate a page for the pdp memory pool. */
2201 
2202 static void *
2203 pmap_pdp_alloc(struct pool *pp, int flags)
2204 {
2205 	return (void *)uvm_km_alloc(kernel_map,
2206 	    PAGE_SIZE * PDP_SIZE, PAGE_SIZE * PDP_SIZE,
2207 	    ((flags & PR_WAITOK) ? 0 : UVM_KMF_NOWAIT | UVM_KMF_TRYLOCK)
2208 	    | UVM_KMF_WIRED);
2209 }
2210 
2211 /*
2212  * pmap_pdp_free: free a PDP
2213  */
2214 
2215 static void
2216 pmap_pdp_free(struct pool *pp, void *v)
2217 {
2218 	uvm_km_free(kernel_map, (vaddr_t)v, PAGE_SIZE * PDP_SIZE,
2219 	    UVM_KMF_WIRED);
2220 }
2221 #endif /* PAE */
2222 
2223 /*
2224  * pmap_create: create a pmap object.
2225  */
2226 struct pmap *
2227 pmap_create(void)
2228 {
2229 	struct pmap *pmap;
2230 	int i;
2231 
2232 	pmap = pool_cache_get(&pmap_cache, PR_WAITOK);
2233 
2234 	/* init uvm_object */
2235 	for (i = 0; i < PTP_LEVELS - 1; i++) {
2236 		mutex_init(&pmap->pm_obj_lock[i], MUTEX_DEFAULT, IPL_NONE);
2237 		uvm_obj_init(&pmap->pm_obj[i], NULL, false, 1);
2238 		uvm_obj_setlock(&pmap->pm_obj[i], &pmap->pm_obj_lock[i]);
2239 		pmap->pm_ptphint[i] = NULL;
2240 	}
2241 	pmap->pm_stats.wired_count = 0;
2242 	/* count the PDP allocated below */
2243 	pmap->pm_stats.resident_count = PDP_SIZE;
2244 #if !defined(__x86_64__)
2245 	pmap->pm_hiexec = 0;
2246 #endif /* !defined(__x86_64__) */
2247 	pmap->pm_flags = 0;
2248 	pmap->pm_gc_ptp = NULL;
2249 
2250 	kcpuset_create(&pmap->pm_cpus, true);
2251 	kcpuset_create(&pmap->pm_kernel_cpus, true);
2252 #ifdef XEN
2253 	kcpuset_create(&pmap->pm_xen_ptp_cpus, true);
2254 #endif
2255 	/* init the LDT */
2256 	pmap->pm_ldt = NULL;
2257 	pmap->pm_ldt_len = 0;
2258 	pmap->pm_ldt_sel = GSYSSEL(GLDT_SEL, SEL_KPL);
2259 
2260 	/* allocate PDP */
2261  try_again:
2262 	pmap->pm_pdir = pool_cache_get(&pmap_pdp_cache, PR_WAITOK);
2263 
2264 	mutex_enter(&pmaps_lock);
2265 
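	/*
	 * A cached PDP may have been constructed before the kernel page
	 * directory last grew; if its topmost kernel slot is still zero,
	 * throw it away and construct a fresh one.
	 */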
2266 	if (pmap->pm_pdir[PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1] - 1] == 0) {
2267 		mutex_exit(&pmaps_lock);
2268 		pool_cache_destruct_object(&pmap_pdp_cache, pmap->pm_pdir);
2269 		goto try_again;
2270 	}
2271 
2272 	for (i = 0; i < PDP_SIZE; i++)
2273 		pmap->pm_pdirpa[i] =
2274 		    pmap_pte2pa(pmap->pm_pdir[PDIR_SLOT_PTE + i]);
2275 
2276 	LIST_INSERT_HEAD(&pmaps, pmap, pm_list);
2277 
2278 	mutex_exit(&pmaps_lock);
2279 
2280 	return (pmap);
2281 }
2282 
2283 /*
2284  * pmap_free_ptps: put a list of ptps back to the freelist.
2285  */
2286 
2287 static void
2288 pmap_free_ptps(struct vm_page *empty_ptps)
2289 {
2290 	struct vm_page *ptp;
2291 	struct pmap_page *pp;
2292 
2293 	while ((ptp = empty_ptps) != NULL) {
2294 		pp = VM_PAGE_TO_PP(ptp);
2295 		empty_ptps = pp->pp_link;
2296 		LIST_INIT(&pp->pp_head.pvh_list);
2297 		uvm_pagefree(ptp);
2298 	}
2299 }
2300 
2301 /*
2302  * pmap_destroy: drop reference count on pmap.   free pmap if
2303  *	reference count goes to zero.
2304  */
2305 
2306 void
2307 pmap_destroy(struct pmap *pmap)
2308 {
2309 	lwp_t *l;
2310 	int i;
2311 
2312 	/*
2313 	 * If we have torn down this pmap, process deferred frees and
2314 	 * invalidations.  Free now if the system is low on memory.
2315 	 * Otherwise, free when the pmap is destroyed thus avoiding a
2316 	 * TLB shootdown.
2317 	 */
2318 	l = curlwp;
2319 	if (__predict_false(l->l_md.md_gc_pmap == pmap)) {
2320 		if (uvmexp.free < uvmexp.freetarg) {
2321 			pmap_update(pmap);
2322 		} else {
2323 			KASSERT(pmap->pm_gc_ptp == NULL);
2324 			pmap->pm_gc_ptp = l->l_md.md_gc_ptp;
2325 			l->l_md.md_gc_ptp = NULL;
2326 			l->l_md.md_gc_pmap = NULL;
2327 		}
2328 	}
2329 
2330 	/*
2331 	 * drop reference count
2332 	 */
2333 
2334 	if (atomic_dec_uint_nv(&pmap->pm_obj[0].uo_refs) > 0) {
2335 		return;
2336 	}
2337 
2338 #ifdef DIAGNOSTIC
2339 	CPU_INFO_ITERATOR cii;
2340 	struct cpu_info *ci;
2341 
2342 	for (CPU_INFO_FOREACH(cii, ci)) {
2343 		if (ci->ci_pmap == pmap)
2344 			panic("destroying pmap being used");
2345 #if defined(XEN) && defined(__x86_64__)
2346 		for (i = 0; i < PDIR_SLOT_PTE; i++) {
2347 			if (pmap->pm_pdir[i] != 0 &&
2348 			    ci->ci_kpm_pdir[i] == pmap->pm_pdir[i]) {
2349 				printf("pmap_destroy(%p) pmap_kernel %p "
2350 				    "curcpu %d cpu %d ci_pmap %p "
2351 				    "ci->ci_kpm_pdir[%d]=%" PRIx64
2352 				    " pmap->pm_pdir[%d]=%" PRIx64 "\n",
2353 				    pmap, pmap_kernel(), curcpu()->ci_index,
2354 				    ci->ci_index, ci->ci_pmap,
2355 				    i, ci->ci_kpm_pdir[i],
2356 				    i, pmap->pm_pdir[i]);
2357 				panic("pmap_destroy: used pmap");
2358 			}
2359 		}
2360 #endif
2361 	}
2362 #endif /* DIAGNOSTIC */
2363 
2364 	/*
2365 	 * Reference count is zero, free pmap resources and then free pmap.
2366 	 * First, remove it from global list of pmaps.
2367 	 */
2368 
2369 	mutex_enter(&pmaps_lock);
2370 	LIST_REMOVE(pmap, pm_list);
2371 	mutex_exit(&pmaps_lock);
2372 
2373 	/*
2374 	 * Process deferred PTP frees.  No TLB shootdown required, as the
2375 	 * PTP pages are no longer visible to any CPU.
2376 	 */
2377 
2378 	pmap_free_ptps(pmap->pm_gc_ptp);
2379 
2380 	/*
2381 	 * destroyed pmap shouldn't have remaining PTPs
2382 	 */
2383 
2384 	for (i = 0; i < PTP_LEVELS - 1; i++) {
2385 		KASSERT(pmap->pm_obj[i].uo_npages == 0);
2386 		KASSERT(TAILQ_EMPTY(&pmap->pm_obj[i].memq));
2387 	}
2388 
2389 	pool_cache_put(&pmap_pdp_cache, pmap->pm_pdir);
2390 
2391 #ifdef USER_LDT
2392 	if (pmap->pm_ldt != NULL) {
2393 		/*
2394 		 * no need to switch the LDT; this address space is gone,
2395 		 * nothing is using it.
2396 		 *
2397 		 * No need to lock the pmap for ldt_free (or anything else),
2398 		 * we're the last one to use it.
2399 		 */
2400 		mutex_enter(&cpu_lock);
2401 		ldt_free(pmap->pm_ldt_sel);
2402 		mutex_exit(&cpu_lock);
2403 		uvm_km_free(kernel_map, (vaddr_t)pmap->pm_ldt,
2404 		    pmap->pm_ldt_len, UVM_KMF_WIRED);
2405 	}
2406 #endif
2407 
2408 	for (i = 0; i < PTP_LEVELS - 1; i++) {
2409 		uvm_obj_destroy(&pmap->pm_obj[i], false);
2410 		mutex_destroy(&pmap->pm_obj_lock[i]);
2411 	}
2412 	kcpuset_destroy(pmap->pm_cpus);
2413 	kcpuset_destroy(pmap->pm_kernel_cpus);
2414 #ifdef XEN
2415 	kcpuset_destroy(pmap->pm_xen_ptp_cpus);
2416 #endif
2417 	pool_cache_put(&pmap_cache, pmap);
2418 }
2419 
2420 /*
2421  * pmap_remove_all: pmap is being torn down by the current thread.
2422  * avoid unnecessary invalidations.
2423  */
2424 
2425 void
2426 pmap_remove_all(struct pmap *pmap)
2427 {
2428 	lwp_t *l = curlwp;
2429 
2430 	KASSERT(l->l_md.md_gc_pmap == NULL);
2431 
2432 	l->l_md.md_gc_pmap = pmap;
2433 }
2434 
2435 #if defined(PMAP_FORK)
2436 /*
2437  * pmap_fork: perform any necessary data structure manipulation when
2438  * a VM space is forked.
2439  */
2440 
2441 void
2442 pmap_fork(struct pmap *pmap1, struct pmap *pmap2)
2443 {
2444 #ifdef USER_LDT
2445 	union descriptor *new_ldt;
2446 	size_t len;
2447 	int sel;
2448 
2449 	if (__predict_true(pmap1->pm_ldt == NULL)) {
2450 		return;
2451 	}
2452 
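	/*
	 * pm_ldt_len is read without cpu_lock held and allocating the new
	 * LDT may sleep, so once cpu_lock is taken the length is checked
	 * again and the allocation is redone if it changed meanwhile.
	 */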
2453  retry:
2454 	if (pmap1->pm_ldt != NULL) {
2455 		len = pmap1->pm_ldt_len;
2456 		new_ldt = (union descriptor *)uvm_km_alloc(kernel_map, len, 0,
2457 		    UVM_KMF_WIRED);
2458 		mutex_enter(&cpu_lock);
2459 		sel = ldt_alloc(new_ldt, len);
2460 		if (sel == -1) {
2461 			mutex_exit(&cpu_lock);
2462 			uvm_km_free(kernel_map, (vaddr_t)new_ldt, len,
2463 			    UVM_KMF_WIRED);
2464 			printf("WARNING: pmap_fork: unable to allocate LDT\n");
2465 			return;
2466 		}
2467 	} else {
2468 		len = -1;
2469 		new_ldt = NULL;
2470 		sel = -1;
2471 		mutex_enter(&cpu_lock);
2472 	}
2473 
2474  	/* Copy the LDT, if necessary. */
2475  	if (pmap1->pm_ldt != NULL) {
2476 		if (len != pmap1->pm_ldt_len) {
2477 			if (len != -1) {
2478 				ldt_free(sel);
2479 				uvm_km_free(kernel_map, (vaddr_t)new_ldt,
2480 				    len, UVM_KMF_WIRED);
2481 			}
2482 			mutex_exit(&cpu_lock);
2483 			goto retry;
2484 		}
2485 
2486 		memcpy(new_ldt, pmap1->pm_ldt, len);
2487 		pmap2->pm_ldt = new_ldt;
2488 		pmap2->pm_ldt_len = pmap1->pm_ldt_len;
2489 		pmap2->pm_ldt_sel = sel;
2490 		len = -1;
2491 	}
2492 
2493 	if (len != -1) {
2494 		ldt_free(sel);
2495 		uvm_km_free(kernel_map, (vaddr_t)new_ldt, len,
2496 		    UVM_KMF_WIRED);
2497 	}
2498 	mutex_exit(&cpu_lock);
2499 #endif /* USER_LDT */
2500 }
2501 #endif /* PMAP_FORK */
2502 
2503 #ifdef USER_LDT
2504 
2505 /*
2506  * pmap_ldt_xcall: cross call used by pmap_ldt_sync.  if the named pmap
2507  * is active, reload LDTR.
2508  */
2509 static void
2510 pmap_ldt_xcall(void *arg1, void *arg2)
2511 {
2512 	struct pmap *pm;
2513 
2514 	kpreempt_disable();
2515 	pm = arg1;
2516 	if (curcpu()->ci_pmap == pm) {
2517 		lldt(pm->pm_ldt_sel);
2518 	}
2519 	kpreempt_enable();
2520 }
2521 
2522 /*
2523  * pmap_ldt_sync: LDT selector for the named pmap is changing.  swap
2524  * in the new selector on all CPUs.
2525  */
2526 void
2527 pmap_ldt_sync(struct pmap *pm)
2528 {
2529 	uint64_t where;
2530 
2531 	KASSERT(mutex_owned(&cpu_lock));
2532 
2533 	pmap_ldt_evcnt.ev_count++;
2534 	where = xc_broadcast(0, pmap_ldt_xcall, pm, NULL);
2535 	xc_wait(where);
2536 }
2537 
2538 /*
2539  * pmap_ldt_cleanup: if the pmap has a local LDT, deallocate it, and
2540  * restore the default.
2541  */
2542 
2543 void
2544 pmap_ldt_cleanup(struct lwp *l)
2545 {
2546 	pmap_t pmap = l->l_proc->p_vmspace->vm_map.pmap;
2547 	union descriptor *dp = NULL;
2548 	size_t len = 0;
2549 	int sel = -1;
2550 
2551 	if (__predict_true(pmap->pm_ldt == NULL)) {
2552 		return;
2553 	}
2554 
2555 	mutex_enter(&cpu_lock);
2556 	if (pmap->pm_ldt != NULL) {
2557 		sel = pmap->pm_ldt_sel;
2558 		dp = pmap->pm_ldt;
2559 		len = pmap->pm_ldt_len;
2560 		pmap->pm_ldt_sel = GSYSSEL(GLDT_SEL, SEL_KPL);
2561 		pmap->pm_ldt = NULL;
2562 		pmap->pm_ldt_len = 0;
2563 		pmap_ldt_sync(pmap);
2564 		ldt_free(sel);
2565 		uvm_km_free(kernel_map, (vaddr_t)dp, len, UVM_KMF_WIRED);
2566 	}
2567 	mutex_exit(&cpu_lock);
2568 }
2569 #endif /* USER_LDT */
2570 
2571 /*
2572  * pmap_activate: activate a process' pmap
2573  *
2574  * => must be called with kernel preemption disabled
2575  * => if lwp is the curlwp, then set ci_want_pmapload so that
2576  *    actual MMU context switch will be done by pmap_load() later
2577  */
2578 
2579 void
2580 pmap_activate(struct lwp *l)
2581 {
2582 	struct cpu_info *ci;
2583 	struct pmap *pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map);
2584 
2585 	KASSERT(kpreempt_disabled());
2586 
2587 	ci = curcpu();
2588 
2589 	if (l == ci->ci_curlwp) {
2590 		KASSERT(ci->ci_want_pmapload == 0);
2591 		KASSERT(ci->ci_tlbstate != TLBSTATE_VALID);
2592 #ifdef KSTACK_CHECK_DR0
2593 		/*
2594 		 * setup breakpoint on the top of stack
2595 		 */
2596 		if (l == &lwp0)
2597 			dr0(0, 0, 0, 0);
2598 		else
2599 			dr0(KSTACK_LOWEST_ADDR(l), 1, 3, 1);
2600 #endif
2601 
2602 		/*
2603 		 * no need to switch to kernel vmspace because
2604 		 * it's a subset of any vmspace.
2605 		 */
2606 
2607 		if (pmap == pmap_kernel()) {
2608 			ci->ci_want_pmapload = 0;
2609 			return;
2610 		}
2611 
2612 		ci->ci_want_pmapload = 1;
2613 	}
2614 }
2615 
2616 /*
2617  * pmap_reactivate: try to regain reference to the pmap.
2618  *
2619  * => Must be called with kernel preemption disabled.
2620  */
2621 
2622 static bool
2623 pmap_reactivate(struct pmap *pmap)
2624 {
2625 	struct cpu_info * const ci = curcpu();
2626 	const cpuid_t cid = cpu_index(ci);
2627 	bool result;
2628 
2629 	KASSERT(kpreempt_disabled());
2630 #if defined(XEN) && defined(__x86_64__)
2631 	KASSERT(pmap_pdirpa(pmap, 0) == ci->ci_xen_current_user_pgd);
2632 #elif defined(PAE)
2633 	KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(ci->ci_pae_l3_pdir[0]));
2634 #elif !defined(XEN)
2635 	KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(rcr3()));
2636 #endif
2637 
2638 	/*
2639 	 * If we still have a lazy reference to this pmap, we can assume
2640 	 * that there was no TLB shootdown for this pmap in the meantime.
2641 	 *
2642 	 * The order of events here is important as we must synchronize
2643 	 * with TLB shootdown interrupts.  Declare interest in invalidations
2644 	 * (TLBSTATE_VALID) and then check the CPU set, which the IPIs can
2645 	 * change only when the state is TLBSTATE_LAZY.
2646 	 */
2647 
2648 	ci->ci_tlbstate = TLBSTATE_VALID;
2649 	KASSERT(kcpuset_isset(pmap->pm_kernel_cpus, cid));
2650 
2651 	if (kcpuset_isset(pmap->pm_cpus, cid)) {
2652 		/* We have the reference, state is valid. */
2653 		result = true;
2654 	} else {
2655 		/* Must reload the TLB. */
2656 		kcpuset_atomic_set(pmap->pm_cpus, cid);
2657 		result = false;
2658 	}
2659 	return result;
2660 }
2661 
2662 /*
2663  * pmap_load: perform the actual pmap switch, i.e. fill in %cr3 register
2664  * and relevant LDT info.
2665  *
2666  * Ensures that the current process' pmap is loaded on the current CPU's
2667  * MMU and that there are no stale TLB entries.
2668  *
2669  * => The caller should disable kernel preemption or do check-and-retry
2670  *    to prevent a preemption from undoing our efforts.
2671  * => This function may block.
2672  */
2673 void
2674 pmap_load(void)
2675 {
2676 	struct cpu_info *ci;
2677 	struct pmap *pmap, *oldpmap;
2678 	struct lwp *l;
2679 	struct pcb *pcb;
2680 	cpuid_t cid;
2681 	uint64_t ncsw;
2682 
2683 	kpreempt_disable();
2684  retry:
2685 	ci = curcpu();
2686 	if (!ci->ci_want_pmapload) {
2687 		kpreempt_enable();
2688 		return;
2689 	}
2690 	l = ci->ci_curlwp;
2691 	ncsw = l->l_ncsw;
2692 
2693 	/* should be able to take ipis. */
2694 	KASSERT(ci->ci_ilevel < IPL_HIGH);
2695 #ifdef XEN
2696 	/* Check that interrupts are enabled (i.e., no events are masked) */
2697 	KASSERT(x86_read_psl() == 0);
2698 #else
2699 	KASSERT((x86_read_psl() & PSL_I) != 0);
2700 #endif
2701 
2702 	KASSERT(l != NULL);
2703 	pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map);
2704 	KASSERT(pmap != pmap_kernel());
2705 	oldpmap = ci->ci_pmap;
2706 	pcb = lwp_getpcb(l);
2707 
2708 	if (pmap == oldpmap) {
2709 		if (!pmap_reactivate(pmap)) {
2710 			u_int gen = uvm_emap_gen_return();
2711 
2712 			/*
2713 			 * The pmap was changed while it was deactivated;
2714 			 * our TLB may be stale.
2715 			 */
2716 
2717 			tlbflush();
2718 			uvm_emap_update(gen);
2719 		}
2720 
2721 		ci->ci_want_pmapload = 0;
2722 		kpreempt_enable();
2723 		return;
2724 	}
2725 
2726 	/*
2727 	 * Acquire a reference to the new pmap and perform the switch.
2728 	 */
2729 
2730 	pmap_reference(pmap);
2731 
2732 	cid = cpu_index(ci);
2733 	kcpuset_atomic_clear(oldpmap->pm_cpus, cid);
2734 	kcpuset_atomic_clear(oldpmap->pm_kernel_cpus, cid);
2735 
2736 #if defined(XEN) && defined(__x86_64__)
2737 	KASSERT(pmap_pdirpa(oldpmap, 0) == ci->ci_xen_current_user_pgd ||
2738 	    oldpmap == pmap_kernel());
2739 #elif defined(PAE)
2740 	KASSERT(pmap_pdirpa(oldpmap, 0) == pmap_pte2pa(ci->ci_pae_l3_pdir[0]));
2741 #elif !defined(XEN)
2742 	KASSERT(pmap_pdirpa(oldpmap, 0) == pmap_pte2pa(rcr3()));
2743 #endif
2744 	KASSERT(!kcpuset_isset(pmap->pm_cpus, cid));
2745 	KASSERT(!kcpuset_isset(pmap->pm_kernel_cpus, cid));
2746 
2747 	/*
2748 	 * Mark the pmap in use by this CPU.  Again, we must synchronize
2749 	 * with TLB shootdown interrupts, so set the state VALID first,
2750 	 * then register us for shootdown events on this pmap.
2751 	 */
2752 	ci->ci_tlbstate = TLBSTATE_VALID;
2753 	kcpuset_atomic_set(pmap->pm_cpus, cid);
2754 	kcpuset_atomic_set(pmap->pm_kernel_cpus, cid);
2755 	ci->ci_pmap = pmap;
2756 
2757 	/*
2758 	 * update tss.  now that we have registered for invalidations
2759 	 * from other CPUs, we're good to load the page tables.
2760 	 */
2761 #ifdef PAE
2762 	pcb->pcb_cr3 = ci->ci_pae_l3_pdirpa;
2763 #else
2764 	pcb->pcb_cr3 = pmap_pdirpa(pmap, 0);
2765 #endif
2766 
2767 #ifdef i386
2768 #ifndef XEN
2769 	ci->ci_tss.tss_ldt = pmap->pm_ldt_sel;
2770 	ci->ci_tss.tss_cr3 = pcb->pcb_cr3;
2771 #endif /* !XEN */
2772 #endif /* i386 */
2773 
2774 	lldt(pmap->pm_ldt_sel);
2775 
2776 	u_int gen = uvm_emap_gen_return();
2777 	cpu_load_pmap(pmap, oldpmap);
2778 	uvm_emap_update(gen);
2779 
2780 	ci->ci_want_pmapload = 0;
2781 
2782 	/*
2783 	 * we're now running with the new pmap.  drop the reference
2784 	 * to the old pmap.  if we block, we need to go around again.
2785 	 */
2786 
2787 	pmap_destroy(oldpmap);
2788 	if (l->l_ncsw != ncsw) {
2789 		goto retry;
2790 	}
2791 
2792 	kpreempt_enable();
2793 }
2794 
2795 /*
2796  * pmap_deactivate: deactivate a process' pmap.
2797  *
2798  * => Must be called with kernel preemption disabled (high IPL is enough).
2799  */
2800 void
2801 pmap_deactivate(struct lwp *l)
2802 {
2803 	struct pmap *pmap;
2804 	struct cpu_info *ci;
2805 
2806 	KASSERT(kpreempt_disabled());
2807 
2808 	if (l != curlwp) {
2809 		return;
2810 	}
2811 
2812 	/*
2813 	 * Wait for pending TLB shootdowns to complete.  Necessary because
2814 	 * TLB shootdown state is per-CPU, and the LWP may be coming off
2815 	 * the CPU before it has a chance to call pmap_update(), e.g. due
2816 	 * to kernel preemption or blocking routine in between.
2817 	 */
2818 	pmap_tlb_shootnow();
2819 
2820 	ci = curcpu();
2821 
2822 	if (ci->ci_want_pmapload) {
2823 		/*
2824 		 * ci_want_pmapload means that our pmap is not loaded on
2825 		 * the CPU, or that the TLB might be stale.  note that pmap_kernel()
2826 		 * is always considered loaded.
2827 		 */
2828 		KASSERT(vm_map_pmap(&l->l_proc->p_vmspace->vm_map)
2829 		    != pmap_kernel());
2830 		KASSERT(vm_map_pmap(&l->l_proc->p_vmspace->vm_map)
2831 		    != ci->ci_pmap || ci->ci_tlbstate != TLBSTATE_VALID);
2832 
2833 		/*
2834 		 * userspace has not been touched.
2835 		 * nothing to do here.
2836 		 */
2837 
2838 		ci->ci_want_pmapload = 0;
2839 		return;
2840 	}
2841 
2842 	pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map);
2843 
2844 	if (pmap == pmap_kernel()) {
2845 		return;
2846 	}
2847 
2848 #if defined(XEN) && defined(__x86_64__)
2849 	KASSERT(pmap_pdirpa(pmap, 0) == ci->ci_xen_current_user_pgd);
2850 #elif defined(PAE)
2851 	KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(ci->ci_pae_l3_pdir[0]));
2852 #elif !defined(XEN)
2853 	KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(rcr3()));
2854 #endif
2855 	KASSERT(ci->ci_pmap == pmap);
2856 
2857 	/*
2858 	 * we aren't interested in TLB invalidations for this pmap,
2859 	 * at least for the time being.
2860 	 */
2861 
2862 	KASSERT(ci->ci_tlbstate == TLBSTATE_VALID);
2863 	ci->ci_tlbstate = TLBSTATE_LAZY;
2864 }
2865 
2866 /*
2867  * end of lifecycle functions
2868  */
2869 
2870 /*
2871  * some misc. functions
2872  */
2873 
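/*
 * pmap_pdes_invalid: check that the PDEs mapping va are valid.
 *
 * => returns 0 if all levels are valid, storing the lowest-level PDE
 *    via lastpde (if not NULL); otherwise returns the level at which an
 *    invalid PDE was found
 */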
2874 int
2875 pmap_pdes_invalid(vaddr_t va, pd_entry_t * const *pdes, pd_entry_t *lastpde)
2876 {
2877 	int i;
2878 	unsigned long index;
2879 	pd_entry_t pde;
2880 
2881 	for (i = PTP_LEVELS; i > 1; i--) {
2882 		index = pl_i(va, i);
2883 		pde = pdes[i - 2][index];
2884 		if ((pde & PG_V) == 0)
2885 			return i;
2886 	}
2887 	if (lastpde != NULL)
2888 		*lastpde = pde;
2889 	return 0;
2890 }
2891 
2892 /*
2893  * pmap_extract: extract a PA for the given VA
2894  */
2895 
2896 bool
2897 pmap_extract(struct pmap *pmap, vaddr_t va, paddr_t *pap)
2898 {
2899 	pt_entry_t *ptes, pte;
2900 	pd_entry_t pde;
2901 	pd_entry_t * const *pdes;
2902 	struct pmap *pmap2;
2903 	struct cpu_info *ci;
2904 	paddr_t pa;
2905 	lwp_t *l;
2906 	bool hard, rv;
2907 
2908 #ifdef __HAVE_DIRECT_MAP
2909 	if (va >= PMAP_DIRECT_BASE && va < PMAP_DIRECT_END) {
2910 		if (pap != NULL) {
2911 			*pap = va - PMAP_DIRECT_BASE;
2912 		}
2913 		return true;
2914 	}
2915 #endif
2916 
2917 	rv = false;
2918 	pa = 0;
2919 	l = curlwp;
2920 
2921 	kpreempt_disable();
2922 	ci = l->l_cpu;
2923 	if (__predict_true(!ci->ci_want_pmapload && ci->ci_pmap == pmap) ||
2924 	    pmap == pmap_kernel()) {
2925 		/*
2926 		 * no need to lock, because it's pmap_kernel() or our
2927 		 * own pmap and is active.  if a user pmap, the caller
2928 		 * will hold the vm_map write/read locked and so prevent
2929 		 * entries from disappearing while we are here.  ptps
2930 		 * can disappear via pmap_remove() and pmap_protect(),
2931 		 * but they are called with the vm_map write locked.
2932 		 */
2933 		hard = false;
2934 		ptes = PTE_BASE;
2935 		pdes = normal_pdes;
2936 	} else {
2937 		/* we lose, do it the hard way. */
2938 		hard = true;
2939 		pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
2940 	}
2941 	if (pmap_pdes_valid(va, pdes, &pde)) {
2942 		pte = ptes[pl1_i(va)];
2943 		if (pde & PG_PS) {
2944 			pa = (pde & PG_LGFRAME) | (va & (NBPD_L2 - 1));
2945 			rv = true;
2946 		} else if (__predict_true((pte & PG_V) != 0)) {
2947 			pa = pmap_pte2pa(pte) | (va & (NBPD_L1 - 1));
2948 			rv = true;
2949 		}
2950 	}
2951 	if (__predict_false(hard)) {
2952 		pmap_unmap_ptes(pmap, pmap2);
2953 	}
2954 	kpreempt_enable();
2955 	if (pap != NULL) {
2956 		*pap = pa;
2957 	}
2958 	return rv;
2959 }
2960 
2961 
2962 /*
2963  * vtophys: virtual address to physical address.  For use by
2964  * machine-dependent code only.
2965  */
2966 
2967 paddr_t
2968 vtophys(vaddr_t va)
2969 {
2970 	paddr_t pa;
2971 
2972 	if (pmap_extract(pmap_kernel(), va, &pa) == true)
2973 		return (pa);
2974 	return (0);
2975 }
2976 
2977 __strict_weak_alias(pmap_extract_ma, pmap_extract);
2978 
2979 #ifdef XEN
2980 
2981 /*
2982  * vtomach: virtual address to machine address.  For use by
2983  * machine-dependent code only.
2984  */
2985 
2986 paddr_t
2987 vtomach(vaddr_t va)
2988 {
2989 	paddr_t pa;
2990 
2991 	if (pmap_extract_ma(pmap_kernel(), va, &pa) == true)
2992 		return (pa);
2993 	return (0);
2994 }
2995 
2996 #endif /* XEN */
2997 
2998 /*
2999  * pmap_virtual_space: used during bootup [pmap_steal_memory] to
3000  *	determine the bounds of the kernel virtual address space.
3001  */
3002 
3003 void
3004 pmap_virtual_space(vaddr_t *startp, vaddr_t *endp)
3005 {
3006 	*startp = virtual_avail;
3007 	*endp = virtual_end;
3008 }
3009 
3010 /*
3011  * pmap_zero_page: zero a page
3012  */
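/*
 * Without a direct map, the page is zeroed through a per-CPU scratch
 * mapping (zero_pte/zerop, selected with PTESLEW/VASLEW), so only
 * kernel preemption needs to be disabled while it is in use.
 */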
3013 
3014 void
3015 pmap_zero_page(paddr_t pa)
3016 {
3017 #if defined(__HAVE_DIRECT_MAP)
3018 	pagezero(PMAP_DIRECT_MAP(pa));
3019 #else
3020 #if defined(XEN)
3021 	if (XEN_VERSION_SUPPORTED(3, 4)) {
3022 		xen_pagezero(pa);
		return;
	}
3023 #endif
3024 	pt_entry_t *zpte;
3025 	void *zerova;
3026 	int id;
3027 
3028 	kpreempt_disable();
3029 	id = cpu_number();
3030 	zpte = PTESLEW(zero_pte, id);
3031 	zerova = VASLEW(zerop, id);
3032 
3033 #ifdef DIAGNOSTIC
3034 	if (*zpte)
3035 		panic("pmap_zero_page: lock botch");
3036 #endif
3037 
3038 	pmap_pte_set(zpte, pmap_pa2pte(pa) | PG_V | PG_RW | PG_M | PG_U | PG_k);
3039 	pmap_pte_flush();
3040 	pmap_update_pg((vaddr_t)zerova);		/* flush TLB */
3041 
3042 	memset(zerova, 0, PAGE_SIZE);
3043 
3044 #if defined(DIAGNOSTIC) || defined(XEN)
3045 	pmap_pte_set(zpte, 0);				/* zap ! */
3046 	pmap_pte_flush();
3047 #endif
3048 	kpreempt_enable();
3049 #endif /* defined(__HAVE_DIRECT_MAP) */
3050 }
3051 
3052 /*
3053  * pmap_pageidlezero: the same, for the idle-loop page zeroer.
3054  * Returns true if the page was zero'd, false if we aborted for
3055  * some reason.
3056  */
3057 
3058 bool
3059 pmap_pageidlezero(paddr_t pa)
3060 {
3061 #ifdef __HAVE_DIRECT_MAP
3062 	KASSERT(cpu_feature[0] & CPUID_SSE2);
3063 	return sse2_idlezero_page((void *)PMAP_DIRECT_MAP(pa));
3064 #else
3065 	pt_entry_t *zpte;
3066 	void *zerova;
3067 	bool rv;
3068 	int id;
3069 
3070 	id = cpu_number();
3071 	zpte = PTESLEW(zero_pte, id);
3072 	zerova = VASLEW(zerop, id);
3073 
3074 	KASSERT(cpu_feature[0] & CPUID_SSE2);
3075 	KASSERT(*zpte == 0);
3076 
3077 	pmap_pte_set(zpte, pmap_pa2pte(pa) | PG_V | PG_RW | PG_M | PG_U | PG_k);
3078 	pmap_pte_flush();
3079 	pmap_update_pg((vaddr_t)zerova);		/* flush TLB */
3080 
3081 	rv = sse2_idlezero_page(zerova);
3082 
3083 #if defined(DIAGNOSTIC) || defined(XEN)
3084 	pmap_pte_set(zpte, 0);				/* zap ! */
3085 	pmap_pte_flush();
3086 #endif
3087 
3088 	return rv;
3089 #endif
3090 }
3091 
3092 /*
3093  * pmap_copy_page: copy a page
3094  */
3095 
3096 void
3097 pmap_copy_page(paddr_t srcpa, paddr_t dstpa)
3098 {
3099 #if defined(__HAVE_DIRECT_MAP)
3100 	vaddr_t srcva = PMAP_DIRECT_MAP(srcpa);
3101 	vaddr_t dstva = PMAP_DIRECT_MAP(dstpa);
3102 
3103 	memcpy((void *)dstva, (void *)srcva, PAGE_SIZE);
3104 #else
3105 #if defined(XEN)
3106 	if (XEN_VERSION_SUPPORTED(3, 4)) {
3107 		xen_copy_page(srcpa, dstpa);
3108 		return;
3109 	}
3110 #endif
3111 	pt_entry_t *spte;
3112 	pt_entry_t *dpte;
3113 	void *csrcva;
3114 	void *cdstva;
3115 	int id;
3116 
3117 	kpreempt_disable();
3118 	id = cpu_number();
3119 	spte = PTESLEW(csrc_pte, id);
3120 	dpte = PTESLEW(cdst_pte, id);
3121 	csrcva = VASLEW(csrcp, id);
3122 	cdstva = VASLEW(cdstp, id);
3123 
3124 	KASSERT(*spte == 0 && *dpte == 0);
3125 
3126 	pmap_pte_set(spte, pmap_pa2pte(srcpa) | PG_V | PG_RW | PG_U | PG_k);
3127 	pmap_pte_set(dpte,
3128 	    pmap_pa2pte(dstpa) | PG_V | PG_RW | PG_M | PG_U | PG_k);
3129 	pmap_pte_flush();
3130 	pmap_update_2pg((vaddr_t)csrcva, (vaddr_t)cdstva);
3131 
3132 	memcpy(cdstva, csrcva, PAGE_SIZE);
3133 
3134 #if defined(DIAGNOSTIC) || defined(XEN)
3135 	pmap_pte_set(spte, 0);
3136 	pmap_pte_set(dpte, 0);
3137 	pmap_pte_flush();
3138 #endif
3139 	kpreempt_enable();
3140 #endif /* defined(__HAVE_DIRECT_MAP) */
3141 }
3142 
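/*
 * pmap_map_ptp: map a PTP into KVA so that its PTEs can be examined or
 * modified.
 *
 * => uses the direct map when available, otherwise a per-CPU scratch
 *    mapping (ptp_pte/ptpp); preemption must remain disabled until the
 *    matching pmap_unmap_ptp()/pmap_unmap_pte() call
 */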
3143 static pt_entry_t *
3144 pmap_map_ptp(struct vm_page *ptp)
3145 {
3146 #ifdef __HAVE_DIRECT_MAP
3147 	return (void *)PMAP_DIRECT_MAP(VM_PAGE_TO_PHYS(ptp));
3148 #else
3149 	pt_entry_t *ptppte;
3150 	void *ptpva;
3151 	int id;
3152 
3153 	KASSERT(kpreempt_disabled());
3154 
3155 	id = cpu_number();
3156 	ptppte = PTESLEW(ptp_pte, id);
3157 	ptpva = VASLEW(ptpp, id);
3158 #if !defined(XEN)
3159 	pmap_pte_set(ptppte, pmap_pa2pte(VM_PAGE_TO_PHYS(ptp)) | PG_V | PG_M |
3160 	    PG_RW | PG_U | PG_k);
3161 #else
3162 	pmap_pte_set(ptppte, pmap_pa2pte(VM_PAGE_TO_PHYS(ptp)) | PG_V | PG_M |
3163 	    PG_U | PG_k);
3164 #endif
3165 	pmap_pte_flush();
3166 	pmap_update_pg((vaddr_t)ptpva);
3167 
3168 	return (pt_entry_t *)ptpva;
3169 #endif
3170 }
3171 
3172 static void
3173 pmap_unmap_ptp(void)
3174 {
3175 #ifndef __HAVE_DIRECT_MAP
3176 #if defined(DIAGNOSTIC) || defined(XEN)
3177 	pt_entry_t *pte;
3178 
3179 	KASSERT(kpreempt_disabled());
3180 
3181 	pte = PTESLEW(ptp_pte, cpu_number());
3182 	if (*pte != 0) {
3183 		pmap_pte_set(pte, 0);
3184 		pmap_pte_flush();
3185 	}
3186 #endif
3187 #endif
3188 }
3189 
3190 static pt_entry_t *
3191 pmap_map_pte(struct pmap *pmap, struct vm_page *ptp, vaddr_t va)
3192 {
3193 
3194 	KASSERT(kpreempt_disabled());
3195 	if (pmap_is_curpmap(pmap)) {
3196 		return &PTE_BASE[pl1_i(va)]; /* (k)vtopte */
3197 	}
3198 	KASSERT(ptp != NULL);
3199 	return pmap_map_ptp(ptp) + pl1_pi(va);
3200 }
3201 
3202 static void
3203 pmap_unmap_pte(void)
3204 {
3205 
3206 	KASSERT(kpreempt_disabled());
3207 
3208 	pmap_unmap_ptp();
3209 }
3210 
3211 /*
3212  * p m a p   r e m o v e   f u n c t i o n s
3213  *
3214  * functions that remove mappings
3215  */
3216 
3217 /*
3218  * pmap_remove_ptes: remove PTEs from a PTP
3219  *
3220  * => caller must hold pmap's lock
3221  * => PTP must be mapped into KVA
3222  * => PTP should be null if pmap == pmap_kernel()
3223  * => must be called with kernel preemption disabled
3224  * => returns composite pte if at least one page should be shot down
3225  * => TLB shootdowns are issued as needed for the removed PTEs
3226 
3227 static void
3228 pmap_remove_ptes(struct pmap *pmap, struct vm_page *ptp, vaddr_t ptpva,
3229 		 vaddr_t startva, vaddr_t endva, struct pv_entry **pv_tofree)
3230 {
3231 	pt_entry_t *pte = (pt_entry_t *)ptpva;
3232 
3233 	KASSERT(pmap == pmap_kernel() || mutex_owned(pmap->pm_lock));
3234 	KASSERT(kpreempt_disabled());
3235 
3236 	/*
3237 	 * note that ptpva points to the PTE that maps startva.   this may
3238 	 * or may not be the first PTE in the PTP.
3239 	 *
3240 	 * we loop through the PTP while there are still PTEs to look at
3241 	 * and the wire_count is greater than 1 (because we use the wire_count
3242 	 * to keep track of the number of real PTEs in the PTP).
3243 	 */
3244 	while (startva < endva && (ptp == NULL || ptp->wire_count > 1)) {
3245 		(void)pmap_remove_pte(pmap, ptp, pte, startva, pv_tofree);
3246 		startva += PAGE_SIZE;
3247 		pte++;
3248 	}
3249 }
3250 
3251 
3252 /*
3253  * pmap_remove_pte: remove a single PTE from a PTP.
3254  *
3255  * => caller must hold pmap's lock
3256  * => PTP must be mapped into KVA
3257  * => PTP should be null if pmap == pmap_kernel()
3258  * => returns true if we removed a mapping
3259  * => must be called with kernel preemption disabled
3260  */
3261 static bool
3262 pmap_remove_pte(struct pmap *pmap, struct vm_page *ptp, pt_entry_t *pte,
3263 		vaddr_t va, struct pv_entry **pv_tofree)
3264 {
3265 	struct pv_entry *pve;
3266 	struct vm_page *pg;
3267 	struct pmap_page *pp;
3268 	pt_entry_t opte;
3269 
3270 	KASSERT(pmap == pmap_kernel() || mutex_owned(pmap->pm_lock));
3271 	KASSERT(kpreempt_disabled());
3272 
3273 	if (!pmap_valid_entry(*pte)) {
3274 		/* VA not mapped. */
3275 		return false;
3276 	}
3277 
3278 	/* Atomically save the old PTE and zap it. */
3279 	opte = pmap_pte_testset(pte, 0);
3280 	if (!pmap_valid_entry(opte)) {
3281 		return false;
3282 	}
3283 
3284 	pmap_exec_account(pmap, va, opte, 0);
3285 	pmap_stats_update_bypte(pmap, 0, opte);
3286 
3287 	if (ptp) {
3288 		/*
3289 		 * Dropping a PTE.  Make sure that the PDE is flushed.
3290 		 */
3291 		ptp->wire_count--;
3292 		if (ptp->wire_count <= 1) {
3293 			opte |= PG_U;
3294 		}
3295 	}
3296 
3297 	if ((opte & PG_U) != 0) {
3298 		pmap_tlb_shootdown(pmap, va, opte, TLBSHOOT_REMOVE_PTE);
3299 	}
3300 
3301 	/*
3302 	 * If we are not on a pv_head list - we are done.
3303 	 */
3304 	if ((opte & PG_PVLIST) == 0) {
3305 #if defined(DIAGNOSTIC) && !defined(DOM0OPS)
3306 		if (PHYS_TO_VM_PAGE(pmap_pte2pa(opte)) != NULL ||
3307 		    pmap_pv_tracked(pmap_pte2pa(opte)) != NULL)
3308 			panic("pmap_remove_pte: managed or pv-tracked page"
3309 			    " without PG_PVLIST for %#"PRIxVADDR, va);
3310 #endif
3311 		return true;
3312 	}
3313 
3314 	if ((pg = PHYS_TO_VM_PAGE(pmap_pte2pa(opte))) != NULL) {
3315 		KASSERT(uvm_page_locked_p(pg));
3316 		pp = VM_PAGE_TO_PP(pg);
3317 	} else if ((pp = pmap_pv_tracked(pmap_pte2pa(opte))) == NULL) {
3318 		paddr_t pa = pmap_pte2pa(opte);
3319 		panic("pmap_remove_pte: PG_PVLIST with pv-untracked page"
3320 		    " va = 0x%"PRIxVADDR
3321 		    " pa = 0x%"PRIxPADDR" (0x%"PRIxPADDR")",
3322 		    va, pa, atop(pa));
3323 	}
3324 
3325 	/* Sync R/M bits. */
3326 	pp->pp_attrs |= opte;
3327 	pve = pmap_remove_pv(pp, ptp, va);
3328 
3329 	if (pve) {
3330 		pve->pve_next = *pv_tofree;
3331 		*pv_tofree = pve;
3332 	}
3333 	return true;
3334 }
3335 
3336 /*
3337  * pmap_remove: mapping removal function.
3338  *
3339  * => caller should not be holding any pmap locks
3340  */
3341 
3342 void
3343 pmap_remove(struct pmap *pmap, vaddr_t sva, vaddr_t eva)
3344 {
3345 	pt_entry_t *ptes;
3346 	pd_entry_t pde;
3347 	pd_entry_t * const *pdes;
3348 	struct pv_entry *pv_tofree = NULL;
3349 	bool result;
3350 	int i;
3351 	paddr_t ptppa;
3352 	vaddr_t blkendva, va = sva;
3353 	struct vm_page *ptp;
3354 	struct pmap *pmap2;
3355 
3356 	kpreempt_disable();
3357 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);	/* locks pmap */
3358 
3359 	/*
3360 	 * removing one page?  take shortcut function.
3361 	 */
3362 
3363 	if (va + PAGE_SIZE == eva) {
3364 		if (pmap_pdes_valid(va, pdes, &pde)) {
3365 
3366 			/* PA of the PTP */
3367 			ptppa = pmap_pte2pa(pde);
3368 
3369 			/* Get PTP if non-kernel mapping. */
3370 			if (pmap != pmap_kernel()) {
3371 				ptp = pmap_find_ptp(pmap, va, ptppa, 1);
3372 				KASSERTMSG(ptp != NULL,
3373 				    "pmap_remove: unmanaged PTP detected");
3374 			} else {
3375 				/* Never free kernel PTPs. */
3376 				ptp = NULL;
3377 			}
3378 
3379 			result = pmap_remove_pte(pmap, ptp,
3380 			    &ptes[pl1_i(va)], va, &pv_tofree);
3381 
3382 			/*
3383 			 * if mapping removed and the PTP is no longer
3384 			 * being used, free it!
3385 			 */
3386 
3387 			if (result && ptp && ptp->wire_count <= 1)
3388 				pmap_free_ptp(pmap, ptp, va, ptes, pdes);
3389 		}
3390 	} else for (/* null */ ; va < eva ; va = blkendva) {
3391 		int lvl;
3392 
3393 		/* determine range of block */
3394 		blkendva = x86_round_pdr(va+1);
3395 		if (blkendva > eva)
3396 			blkendva = eva;
3397 
3398 		/*
3399 		 * XXXCDC: our PTE mappings should never be removed
3400 		 * with pmap_remove!  if we allow this (and why would
3401 		 * we?) then we end up freeing the pmap's page
3402 		 * directory page (PDP) before we are finished using
3403 		 * it when we hit in in the recursive mapping.  this
3404 		 * it when we hit it in the recursive mapping.  this
3405 		 *
3406 		 * long term solution is to move the PTEs out of user
3407 		 * address space.  and into kernel address space (up
3408 		 * with APTE).  then we can set VM_MAXUSER_ADDRESS to
3409 		 * be VM_MAX_ADDRESS.
3410 		 */
3411 
3412 		/* XXXCDC: ugly hack to avoid freeing PDP here */
3413 		for (i = 0; i < PDP_SIZE; i++) {
3414 			if (pl_i(va, PTP_LEVELS) == PDIR_SLOT_PTE+i)
3415 				continue;
3416 		}
3417 
3418 		lvl = pmap_pdes_invalid(va, pdes, &pde);
3419 		if (lvl != 0) {
3420 			/*
3421 			 * skip a range corresponding to an invalid pde.
3422 			 */
3423 			blkendva = (va & ptp_masks[lvl - 1]) + nbpd[lvl - 1];
3424  			continue;
3425 		}
3426 
3427 		/* PA of the PTP */
3428 		ptppa = pmap_pte2pa(pde);
3429 
3430 		/* Get PTP if non-kernel mapping. */
3431 		if (pmap != pmap_kernel()) {
3432 			ptp = pmap_find_ptp(pmap, va, ptppa, 1);
3433 			KASSERTMSG(ptp != NULL,
3434 			    "pmap_remove: unmanaged PTP detected");
3435 		} else {
3436 			/* Never free kernel PTPs. */
3437 			ptp = NULL;
3438 		}
3439 
3440 		pmap_remove_ptes(pmap, ptp, (vaddr_t)&ptes[pl1_i(va)], va,
3441 		    blkendva, &pv_tofree);
3442 
3443 		/* if PTP is no longer being used, free it! */
3444 		if (ptp && ptp->wire_count <= 1) {
3445 			pmap_free_ptp(pmap, ptp, va, ptes, pdes);
3446 		}
3447 	}
3448 	pmap_unmap_ptes(pmap, pmap2);		/* unlock pmap */
3449 	kpreempt_enable();
3450 
3451 	/* Now we free unused PVs */
3452 	if (pv_tofree)
3453 		pmap_free_pvs(pv_tofree);
3454 }
3455 
3456 /*
3457  * pmap_sync_pv: clear pte bits and return the old value of the pte.
3458  *
3459  * => Caller should disable kernel preemption.
3460  * => issues tlb shootdowns if necessary.
3461  */
3462 
3463 static int
3464 pmap_sync_pv(struct pv_pte *pvpte, pt_entry_t expect, int clearbits,
3465     pt_entry_t *optep)
3466 {
3467 	struct pmap *pmap;
3468 	struct vm_page *ptp;
3469 	vaddr_t va;
3470 	pt_entry_t *ptep;
3471 	pt_entry_t opte;
3472 	pt_entry_t npte;
3473 	bool need_shootdown;
3474 
3475 	ptp = pvpte->pte_ptp;
3476 	va = pvpte->pte_va;
3477 	KASSERT(ptp == NULL || ptp->uobject != NULL);
3478 	KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset);
3479 	pmap = ptp_to_pmap(ptp);
3480 
3481 	KASSERT((expect & ~(PG_FRAME | PG_V)) == 0);
3482 	KASSERT((expect & PG_V) != 0);
3483 	KASSERT(clearbits == ~0 || (clearbits & ~(PG_M | PG_U | PG_RW)) == 0);
3484 	KASSERT(kpreempt_disabled());
3485 
3486 	ptep = pmap_map_pte(pmap, ptp, va);
3487 	do {
3488 		opte = *ptep;
3489 		KASSERT((opte & (PG_M | PG_U)) != PG_M);
3490 		KASSERT((opte & (PG_U | PG_V)) != PG_U);
3491 		KASSERT(opte == 0 || (opte & PG_V) != 0);
3492 		if ((opte & (PG_FRAME | PG_V)) != expect) {
3493 
3494 			/*
3495 			 * we lost a race with a V->P operation like
3496 			 * pmap_remove().  wait for the competitor
3497 			 * reflecting pte bits into mp_attrs.
3498 			 *
3499 			 * issue a redundant TLB shootdown so that
3500 			 * we can wait for its completion.
3501 			 */
3502 
3503 			pmap_unmap_pte();
3504 			if (clearbits != 0) {
3505 				pmap_tlb_shootdown(pmap, va,
3506 				    (pmap == pmap_kernel() ? PG_G : 0),
3507 				    TLBSHOOT_SYNC_PV1);
3508 			}
3509 			return EAGAIN;
3510 		}
3511 
3512 		/*
3513 		 * check if there's anything to do on this pte.
3514 		 */
3515 
3516 		if ((opte & clearbits) == 0) {
3517 			need_shootdown = false;
3518 			break;
3519 		}
3520 
3521 		/*
3522 		 * we need a shootdown if the pte is cached. (PG_U)
3523 		 *
3524 		 * ...unless we are clearing only the PG_RW bit and
3525 		 * it isn't cached as RW. (PG_M)
3526 		 */
3527 
3528 		need_shootdown = (opte & PG_U) != 0 &&
3529 		    !(clearbits == PG_RW && (opte & PG_M) == 0);
3530 
3531 		npte = opte & ~clearbits;
3532 
3533 		/*
3534 		 * if we need a shootdown anyway, clear PG_U and PG_M.
3535 		 */
3536 
3537 		if (need_shootdown) {
3538 			npte &= ~(PG_U | PG_M);
3539 		}
3540 		KASSERT((npte & (PG_M | PG_U)) != PG_M);
3541 		KASSERT((npte & (PG_U | PG_V)) != PG_U);
3542 		KASSERT(npte == 0 || (opte & PG_V) != 0);
3543 	} while (pmap_pte_cas(ptep, opte, npte) != opte);
3544 
3545 	if (need_shootdown) {
3546 		pmap_tlb_shootdown(pmap, va, opte, TLBSHOOT_SYNC_PV2);
3547 	}
3548 	pmap_unmap_pte();
3549 
3550 	*optep = opte;
3551 	return 0;
3552 }
3553 
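/*
 * pmap_pp_remove: remove all mappings of the page described by pp/pa.
 *
 * => R/M bits of each mapping are synced into pp_attrs before removal
 * => if pmap_sync_pv() loses a race (EAGAIN), the kernel lock is
 *    dropped and the scan is restarted after a backoff
 */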
3554 static void
3555 pmap_pp_remove(struct pmap_page *pp, paddr_t pa)
3556 {
3557 	struct pv_pte *pvpte;
3558 	struct pv_entry *killlist = NULL;
3559 	struct vm_page *ptp;
3560 	pt_entry_t expect;
3561 	int count;
3562 
3563 	expect = pmap_pa2pte(pa) | PG_V;
3564 	count = SPINLOCK_BACKOFF_MIN;
3565 	kpreempt_disable();
3566 startover:
3567 	while ((pvpte = pv_pte_first(pp)) != NULL) {
3568 		struct pmap *pmap;
3569 		struct pv_entry *pve;
3570 		pt_entry_t opte;
3571 		vaddr_t va;
3572 		int error;
3573 
3574 		/*
3575 		 * add a reference to the pmap before clearing the pte.
3576 		 * otherwise the pmap can disappear behind us.
3577 		 */
3578 
3579 		ptp = pvpte->pte_ptp;
3580 		pmap = ptp_to_pmap(ptp);
3581 		if (ptp != NULL) {
3582 			pmap_reference(pmap);
3583 		}
3584 
3585 		error = pmap_sync_pv(pvpte, expect, ~0, &opte);
3586 		if (error == EAGAIN) {
3587 			int hold_count;
3588 			KERNEL_UNLOCK_ALL(curlwp, &hold_count);
3589 			if (ptp != NULL) {
3590 				pmap_destroy(pmap);
3591 			}
3592 			SPINLOCK_BACKOFF(count);
3593 			KERNEL_LOCK(hold_count, curlwp);
3594 			goto startover;
3595 		}
3596 
3597 		pp->pp_attrs |= opte;
3598 		va = pvpte->pte_va;
3599 		pve = pmap_remove_pv(pp, ptp, va);
3600 
3601 		/* update the PTP reference count.  free if last reference. */
3602 		if (ptp != NULL) {
3603 			struct pmap *pmap2;
3604 			pt_entry_t *ptes;
3605 			pd_entry_t * const *pdes;
3606 
3607 			KASSERT(pmap != pmap_kernel());
3608 
3609 			pmap_tlb_shootnow();
3610 			pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
3611 			pmap_stats_update_bypte(pmap, 0, opte);
3612 			ptp->wire_count--;
3613 			if (ptp->wire_count <= 1) {
3614 				pmap_free_ptp(pmap, ptp, va, ptes, pdes);
3615 			}
3616 			pmap_unmap_ptes(pmap, pmap2);
3617 			pmap_destroy(pmap);
3618 		} else {
3619 			KASSERT(pmap == pmap_kernel());
3620 			pmap_stats_update_bypte(pmap, 0, opte);
3621 		}
3622 
3623 		if (pve != NULL) {
3624 			pve->pve_next = killlist;	/* mark it for death */
3625 			killlist = pve;
3626 		}
3627 	}
3628 	pmap_tlb_shootnow();
3629 	kpreempt_enable();
3630 
3631 	/* Now free unused pvs. */
3632 	pmap_free_pvs(killlist);
3633 }
3634 
3635 /*
3636  * pmap_page_remove: remove a managed vm_page from all pmaps that map it
3637  *
3638  * => R/M bits are sync'd back to attrs
3639  */
3640 
3641 void
3642 pmap_page_remove(struct vm_page *pg)
3643 {
3644 	struct pmap_page *pp;
3645 	paddr_t pa;
3646 
3647 	KASSERT(uvm_page_locked_p(pg));
3648 
3649 	pp = VM_PAGE_TO_PP(pg);
3650 	pa = VM_PAGE_TO_PHYS(pg);
3651 	pmap_pp_remove(pp, pa);
3652 }
3653 
3654 /*
3655  * pmap_pv_remove: remove an unmanaged pv-tracked page from all pmaps
3656  *	that map it
3657  */
3658 
3659 void
3660 pmap_pv_remove(paddr_t pa)
3661 {
3662 	struct pmap_page *pp;
3663 
3664 	pp = pmap_pv_tracked(pa);
3665 	if (pp == NULL)
3666 		panic("pmap_pv_remove: page not pv-tracked: 0x%"PRIxPADDR,
3667 		    pa);
3668 	pmap_pp_remove(pp, pa);
3669 }
3670 
3671 /*
3672  * p m a p   a t t r i b u t e  f u n c t i o n s
3673  * functions that test/change managed page's attributes
3674  * since a page can be mapped multiple times we must check each PTE that
3675  * maps it by going down the pv lists.
3676  */
3677 
3678 /*
3679  * pmap_test_attrs: test a page's attributes
3680  */
3681 
3682 bool
3683 pmap_test_attrs(struct vm_page *pg, unsigned testbits)
3684 {
3685 	struct pmap_page *pp;
3686 	struct pv_pte *pvpte;
3687 	pt_entry_t expect;
3688 	u_int result;
3689 
3690 	KASSERT(uvm_page_locked_p(pg));
3691 
3692 	pp = VM_PAGE_TO_PP(pg);
3693 	if ((pp->pp_attrs & testbits) != 0) {
3694 		return true;
3695 	}
3696 	expect = pmap_pa2pte(VM_PAGE_TO_PHYS(pg)) | PG_V;
3697 	kpreempt_disable();
3698 	for (pvpte = pv_pte_first(pp); pvpte; pvpte = pv_pte_next(pp, pvpte)) {
3699 		pt_entry_t opte;
3700 		int error;
3701 
3702 		if ((pp->pp_attrs & testbits) != 0) {
3703 			break;
3704 		}
3705 		error = pmap_sync_pv(pvpte, expect, 0, &opte);
3706 		if (error == 0) {
3707 			pp->pp_attrs |= opte;
3708 		}
3709 	}
3710 	result = pp->pp_attrs & testbits;
3711 	kpreempt_enable();
3712 
3713 	/*
3714 	 * note that we exit the loop early, with pvpte non-NULL, once
3715 	 * the bits we are testing for have been found.
3716 	 */
3717 
3718 	return result != 0;
3719 }
3720 
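/*
 * pmap_pp_clear_attrs: clear the given attribute bits in every mapping
 * of the page described by pp/pa, and in pp_attrs itself.
 *
 * => returns true if any of the requested bits were set
 */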
3721 static bool
3722 pmap_pp_clear_attrs(struct pmap_page *pp, paddr_t pa, unsigned clearbits)
3723 {
3724 	struct pv_pte *pvpte;
3725 	u_int result;
3726 	pt_entry_t expect;
3727 	int count;
3728 
3729 	expect = pmap_pa2pte(pa) | PG_V;
3730 	count = SPINLOCK_BACKOFF_MIN;
3731 	kpreempt_disable();
3732 startover:
3733 	for (pvpte = pv_pte_first(pp); pvpte; pvpte = pv_pte_next(pp, pvpte)) {
3734 		pt_entry_t opte;
3735 		int error;
3736 
3737 		error = pmap_sync_pv(pvpte, expect, clearbits, &opte);
3738 		if (error == EAGAIN) {
3739 			int hold_count;
3740 			KERNEL_UNLOCK_ALL(curlwp, &hold_count);
3741 			SPINLOCK_BACKOFF(count);
3742 			KERNEL_LOCK(hold_count, curlwp);
3743 			goto startover;
3744 		}
3745 		pp->pp_attrs |= opte;
3746 	}
3747 	result = pp->pp_attrs & clearbits;
3748 	pp->pp_attrs &= ~clearbits;
3749 	kpreempt_enable();
3750 
3751 	return result != 0;
3752 }
3753 
3754 /*
3755  * pmap_clear_attrs: clear the specified attribute for a page.
3756  *
3757  * => we return true if we cleared one of the bits we were asked to
3758  */
3759 
3760 bool
3761 pmap_clear_attrs(struct vm_page *pg, unsigned clearbits)
3762 {
3763 	struct pmap_page *pp;
3764 	paddr_t pa;
3765 
3766 	KASSERT(uvm_page_locked_p(pg));
3767 
3768 	pp = VM_PAGE_TO_PP(pg);
3769 	pa = VM_PAGE_TO_PHYS(pg);
3770 
3771 	return pmap_pp_clear_attrs(pp, pa, clearbits);
3772 }
3773 
3774 /*
3775  * pmap_pv_clear_attrs: clear the specified attributes for an unmanaged
3776  *	pv-tracked page.
3777  */
3778 
3779 bool
3780 pmap_pv_clear_attrs(paddr_t pa, unsigned clearbits)
3781 {
3782 	struct pmap_page *pp;
3783 
3784 	pp = pmap_pv_tracked(pa);
3785 	if (pp == NULL)
3786 		panic("pmap_pv_clear_attrs: page not pv-tracked: 0x%"PRIxPADDR,
3787 		    pa);
3788 
3789 	return pmap_pp_clear_attrs(pp, pa, clearbits);
3790 }
3791 
3792 /*
3793  * p m a p   p r o t e c t i o n   f u n c t i o n s
3794  */
3795 
3796 /*
3797  * pmap_page_protect: change the protection of all recorded mappings
3798  *	of a managed page
3799  *
3800  * => NOTE: this is an inline function in pmap.h
3801  */
3802 
3803 /* see pmap.h */
3804 
3805 /*
3806  * pmap_pv_protect: change the protection of all recorded mappings
3807  *	of an unmanaged pv-tracked page
3808  *
3809  * => NOTE: this is an inline function in pmap.h
3810  */
3811 
3812 /* see pmap.h */
3813 
3814 /*
3815  * pmap_protect: set the protection of the pages in a pmap
3816  *
3817  * => NOTE: this is an inline function in pmap.h
3818  */
3819 
3820 /* see pmap.h */
3821 
3822 /*
3823  * pmap_write_protect: write-protect pages in a pmap.
3824  */
3825 void
3826 pmap_write_protect(struct pmap *pmap, vaddr_t sva, vaddr_t eva, vm_prot_t prot)
3827 {
3828 	pt_entry_t *ptes;
3829 	pt_entry_t * const *pdes;
3830 	struct pmap *pmap2;
3831 	vaddr_t blockend, va;
3832 
3833 	KASSERT(curlwp->l_md.md_gc_pmap != pmap);
3834 
3835 	sva &= PG_FRAME;
3836 	eva &= PG_FRAME;
3837 
3838 	/* Acquire pmap. */
3839 	kpreempt_disable();
3840 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
3841 
3842 	for (va = sva ; va < eva ; va = blockend) {
3843 		pt_entry_t *spte, *epte;
3844 		int i;
3845 
3846 		blockend = x86_round_pdr(va + 1);
3847 		if (blockend > eva)
3848 			blockend = eva;
3849 
3850 		/*
3851 		 * XXXCDC: our PTE mappings should never be write-protected!
3852 		 *
3853 		 * The long term solution is to move the PTEs out of user
3854 		 * address space and into kernel address space (up with
3855 		 * APTE); then we can set VM_MAXUSER_ADDRESS to be
3856 		 * VM_MAX_ADDRESS.
3857 		 */
3858 
3859 		/* XXXCDC: ugly hack to avoid freeing PDP here */
3860 		for (i = 0; i < PDP_SIZE; i++) {
3861 			if (pl_i(va, PTP_LEVELS) == PDIR_SLOT_PTE+i)
3862 				continue;
3863 		}
3864 
3865 		/* Is it a valid block? */
3866 		if (!pmap_pdes_valid(va, pdes, NULL)) {
3867 			continue;
3868 		}
3869 		KASSERT(va < VM_MAXUSER_ADDRESS || va >= VM_MAX_ADDRESS);
3870 
3871 		spte = &ptes[pl1_i(va)];
3872 		epte = &ptes[pl1_i(blockend)];
3873 
3874 		for (/* null */; spte < epte; spte++) {
3875 			pt_entry_t opte, npte;
3876 
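			/*
			 * Atomically clear PG_RW from a valid, writable
			 * PTE; retry if another CPU (or the MMU setting
			 * PG_U/PG_M) changes the PTE under us.
			 */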
3877 			do {
3878 				opte = *spte;
3879 				if ((~opte & (PG_RW | PG_V)) != 0) {
3880 					goto next;
3881 				}
3882 				npte = opte & ~PG_RW;
3883 			} while (pmap_pte_cas(spte, opte, npte) != opte);
3884 
3885 			if ((opte & PG_M) != 0) {
3886 				vaddr_t tva = x86_ptob(spte - ptes);
3887 				pmap_tlb_shootdown(pmap, tva, opte,
3888 				    TLBSHOOT_WRITE_PROTECT);
3889 			}
3890 next:;
3891 		}
3892 	}
3893 
3894 	/* Release pmap. */
3895 	pmap_unmap_ptes(pmap, pmap2);
3896 	kpreempt_enable();
3897 }
3898 
3899 /*
3900  * pmap_unwire: clear the wired bit in the PTE.
3901  *
3902  * => Mapping should already be present.
3903  */
3904 void
3905 pmap_unwire(struct pmap *pmap, vaddr_t va)
3906 {
3907 	pt_entry_t *ptes, *ptep, opte;
3908 	pd_entry_t * const *pdes;
3909 	struct pmap *pmap2;
3910 
3911 	/* Acquire pmap. */
3912 	kpreempt_disable();
3913 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
3914 
3915 	if (!pmap_pdes_valid(va, pdes, NULL)) {
3916 		panic("pmap_unwire: invalid PDE");
3917 	}
3918 
3919 	ptep = &ptes[pl1_i(va)];
3920 	opte = *ptep;
3921 	KASSERT(pmap_valid_entry(opte));
3922 
3923 	if (opte & PG_W) {
3924 		pt_entry_t npte = opte & ~PG_W;
3925 
3926 		opte = pmap_pte_testset(ptep, npte);
3927 		pmap_stats_update_bypte(pmap, npte, opte);
3928 	} else {
3929 		printf("pmap_unwire: wiring for pmap %p va 0x%lx "
3930 		    "did not change!\n", pmap, va);
3931 	}
3932 
3933 	/* Release pmap. */
3934 	pmap_unmap_ptes(pmap, pmap2);
3935 	kpreempt_enable();
3936 }
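
/*
 * Illustrative caller pattern (a sketch, not taken from UVM; "map",
 * "start" and "end" are hypothetical): when a wired range is unwired,
 * each page is handed back one at a time, e.g.
 *
 *	for (va = start; va < end; va += PAGE_SIZE)
 *		pmap_unwire(vm_map_pmap(map), va);
 *
 * No pmap_update() or TLB shootdown is needed for this alone, since
 * PG_W is a software-only bit and the mapping itself is unchanged.
 */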
3937 
3938 /*
3939  * pmap_copy: copy mappings from one pmap to another
3940  *
3941  * => optional function
3942  * void pmap_copy(dst_pmap, src_pmap, dst_addr, len, src_addr)
3943  */
3944 
3945 /*
3946  * defined as macro in pmap.h
3947  */
3948 
3949 __strict_weak_alias(pmap_enter, pmap_enter_default);
3950 
3951 int
3952 pmap_enter_default(pmap_t pmap, vaddr_t va, paddr_t pa, vm_prot_t prot,
3953     u_int flags)
3954 {
3955 	return pmap_enter_ma(pmap, va, pa, pa, prot, flags, 0);
3956 }
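
/*
 * The weak alias above lets a platform supply its own pmap_enter()
 * (the Xen pmap is assumed to do this, passing a machine address to
 * pmap_enter_ma()); the native default simply uses ma == pa and
 * domid 0.
 */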
3957 
3958 /*
3959  * pmap_enter: enter a mapping into a pmap
3960  *
3961  * => must be done "now" ... no lazy-evaluation
3962  * => we set pmap => pv_head locking
3963  */
3964 int
3965 pmap_enter_ma(struct pmap *pmap, vaddr_t va, paddr_t ma, paddr_t pa,
3966 	   vm_prot_t prot, u_int flags, int domid)
3967 {
3968 	pt_entry_t *ptes, opte, npte;
3969 	pt_entry_t *ptep;
3970 	pd_entry_t * const *pdes;
3971 	struct vm_page *ptp;
3972 	struct vm_page *new_pg, *old_pg;
3973 	struct pmap_page *new_pp, *old_pp;
3974 	struct pv_entry *old_pve = NULL;
3975 	struct pv_entry *new_pve;
3976 	struct pv_entry *new_pve2;
3977 	int error;
3978 	bool wired = (flags & PMAP_WIRED) != 0;
3979 	struct pmap *pmap2;
3980 
3981 	KASSERT(pmap_initialized);
3982 	KASSERT(curlwp->l_md.md_gc_pmap != pmap);
3983 	KASSERT(va < VM_MAX_KERNEL_ADDRESS);
3984 	KASSERTMSG(va != (vaddr_t)PDP_BASE,
3985 	    "pmap_enter: trying to map over PDP!");
3986 	KASSERTMSG(va < VM_MIN_KERNEL_ADDRESS ||
3987 	    pmap_valid_entry(pmap->pm_pdir[pl_i(va, PTP_LEVELS)]),
3988 	    "pmap_enter: missing kernel PTP for VA %lx!", va);
3989 
3990 #ifdef XEN
3991 	KASSERT(domid == DOMID_SELF || pa == 0);
3992 #endif /* XEN */
3993 
3994 	npte = ma | protection_codes[prot] | PG_V;
3995 	npte |= pmap_pat_flags(flags);
3996 	if (wired)
3997 	        npte |= PG_W;
3998 	if (va < VM_MAXUSER_ADDRESS)
3999 		npte |= PG_u;
4000 	else if (va < VM_MAX_ADDRESS)
4001 		npte |= (PG_u | PG_RW);	/* XXXCDC: no longer needed? */
4002 	else
4003 		npte |= PG_k;
4004 	if (pmap == pmap_kernel())
4005 		npte |= pmap_pg_g;
4006 	if (flags & VM_PROT_ALL) {
4007 		npte |= PG_U;
4008 		if (flags & VM_PROT_WRITE) {
4009 			KASSERT((npte & PG_RW) != 0);
4010 			npte |= PG_M;
4011 		}
4012 	}
4013 
4014 #ifdef XEN
4015 	if (domid != DOMID_SELF)
4016 		new_pg = NULL;
4017 	else
4018 #endif
4019 		new_pg = PHYS_TO_VM_PAGE(pa);
4020 	if (new_pg != NULL) {
4021 		/* This is a managed page */
4022 		npte |= PG_PVLIST;
4023 		new_pp = VM_PAGE_TO_PP(new_pg);
4024 	} else if ((new_pp = pmap_pv_tracked(pa)) != NULL) {
4025 		/* This is an unmanaged pv-tracked page */
4026 		npte |= PG_PVLIST;
4027 	} else {
4028 		new_pp = NULL;
4029 	}
4030 
4031 	/* get pves. */
4032 	new_pve = pool_cache_get(&pmap_pv_cache, PR_NOWAIT);
4033 	new_pve2 = pool_cache_get(&pmap_pv_cache, PR_NOWAIT);
4034 	if (new_pve == NULL || new_pve2 == NULL) {
4035 		if (flags & PMAP_CANFAIL) {
4036 			error = ENOMEM;
4037 			goto out2;
4038 		}
4039 		panic("pmap_enter: pve allocation failed");
4040 	}
4041 
4042 	kpreempt_disable();
4043 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);	/* locks pmap */
4044 	if (pmap == pmap_kernel()) {
4045 		ptp = NULL;
4046 	} else {
4047 		ptp = pmap_get_ptp(pmap, va, pdes);
4048 		if (ptp == NULL) {
4049 			pmap_unmap_ptes(pmap, pmap2);
4050 			if (flags & PMAP_CANFAIL) {
4051 				error = ENOMEM;
4052 				goto out;
4053 			}
4054 			panic("pmap_enter: get ptp failed");
4055 		}
4056 	}
4057 
4058 	/*
4059 	 * update the pte.
4060 	 */
4061 
4062 	ptep = &ptes[pl1_i(va)];
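	/*
	 * Install the new PTE with a compare-and-swap loop: if the PTE
	 * changes under us (e.g. the MMU sets PG_U/PG_M, or a Xen
	 * foreign update races), re-read it and try again.
	 */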
4063 	do {
4064 		opte = *ptep;
4065 
4066 		/*
4067 		 * if the same page, inherit PG_U and PG_M.
4068 		 */
4069 		if (((opte ^ npte) & (PG_FRAME | PG_V)) == 0) {
4070 			npte |= opte & (PG_U | PG_M);
4071 		}
4072 #if defined(XEN)
4073 		if (domid != DOMID_SELF) {
4074 			/* pmap_pte_cas with error handling */
4075 			int s = splvm();
4076 			if (opte != *ptep) {
4077 				splx(s);
4078 				continue;
4079 			}
4080 			error = xpq_update_foreign(
4081 			    vtomach((vaddr_t)ptep), npte, domid);
4082 			splx(s);
4083 			if (error) {
4084 				if (ptp != NULL && ptp->wire_count <= 1) {
4085 					pmap_free_ptp(pmap, ptp, va, ptes, pdes);
4086 				}
4087 				pmap_unmap_ptes(pmap, pmap2);
4088 				goto out;
4089 			}
4090 			break;
4091 		}
4092 #endif /* defined(XEN) */
4093 	} while (pmap_pte_cas(ptep, opte, npte) != opte);
4094 
4095 	/*
4096 	 * update statistics and PTP's reference count.
4097 	 */
4098 
4099 	pmap_stats_update_bypte(pmap, npte, opte);
4100 	if (ptp != NULL && !pmap_valid_entry(opte)) {
4101 		ptp->wire_count++;
4102 	}
4103 	KASSERT(ptp == NULL || ptp->wire_count > 1);
4104 
4105 	/*
4106 	 * if the same page, we can skip pv_entry handling.
4107 	 */
4108 
4109 	if (((opte ^ npte) & (PG_FRAME | PG_V)) == 0) {
4110 		KASSERT(((opte ^ npte) & PG_PVLIST) == 0);
4111 		goto same_pa;
4112 	}
4113 
4114 	/*
4115 	 * if the old mapping was on a pv list (managed or pv-tracked page),
	 * remove the pv_entry from that list.
4116 	 */
4117 
4118 	if ((~opte & (PG_V | PG_PVLIST)) == 0) {
4119 		if ((old_pg = PHYS_TO_VM_PAGE(pmap_pte2pa(opte))) != NULL) {
4120 			KASSERT(uvm_page_locked_p(old_pg));
4121 			old_pp = VM_PAGE_TO_PP(old_pg);
4122 		} else if ((old_pp = pmap_pv_tracked(pmap_pte2pa(opte)))
4123 		    == NULL) {
4124 			pa = pmap_pte2pa(opte);
4125 			panic("pmap_enter: PG_PVLIST with pv-untracked page"
4126 			    " va = 0x%"PRIxVADDR
4127 			    " pa = 0x%" PRIxPADDR " (0x%" PRIxPADDR ")",
4128 			    va, pa, atop(pa));
4129 		}
4130 
4131 		old_pve = pmap_remove_pv(old_pp, ptp, va);
4132 		old_pp->pp_attrs |= opte;
4133 	}
4134 
4135 	/*
4136 	 * if the new page is managed or pv-tracked, insert a pv_entry into
	 * its list.
4137 	 */
4138 
4139 	if (new_pp) {
4140 		new_pve = pmap_enter_pv(new_pp, new_pve, &new_pve2, ptp, va);
4141 	}
4142 
4143 same_pa:
4144 	pmap_unmap_ptes(pmap, pmap2);
4145 
4146 	/*
4147 	 * shootdown tlb if necessary.
4148 	 */
4149 
4150 	if ((~opte & (PG_V | PG_U)) == 0 &&
4151 	    ((opte ^ npte) & (PG_FRAME | PG_RW)) != 0) {
4152 		pmap_tlb_shootdown(pmap, va, opte, TLBSHOOT_ENTER);
4153 	}
4154 
4155 	error = 0;
4156 out:
4157 	kpreempt_enable();
4158 out2:
4159 	if (old_pve != NULL) {
4160 		pool_cache_put(&pmap_pv_cache, old_pve);
4161 	}
4162 	if (new_pve != NULL) {
4163 		pool_cache_put(&pmap_pv_cache, new_pve);
4164 	}
4165 	if (new_pve2 != NULL) {
4166 		pool_cache_put(&pmap_pv_cache, new_pve2);
4167 	}
4168 
4169 	return error;
4170 }
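
/*
 * Illustrative caller pattern (a sketch, not taken verbatim from UVM;
 * "pmap", "va" and "pa" are hypothetical):
 *
 *	error = pmap_enter(pmap, va, pa, VM_PROT_READ | VM_PROT_WRITE,
 *	    VM_PROT_READ | VM_PROT_WRITE | PMAP_CANFAIL);
 *	if (error) {
 *		... back off, wait for memory, retry the fault ...
 *	}
 *	pmap_update(pmap);
 *
 * Without PMAP_CANFAIL, running out of pv entries or PTPs panics
 * instead of returning ENOMEM, as the code above shows.
 */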
4171 
4172 static bool
4173 pmap_get_physpage(vaddr_t va, int level, paddr_t *paddrp)
4174 {
4175 	struct vm_page *ptp;
4176 	struct pmap *kpm = pmap_kernel();
4177 
4178 	if (!uvm.page_init_done) {
4179 
4180 		/*
4181 		 * we're growing the kernel pmap early (from
4182 		 * uvm_pageboot_alloc()).  this case must be
4183 		 * handled a little differently.
4184 		 */
4185 
4186 		if (!uvm_page_physget(paddrp))
4187 			panic("pmap_get_physpage: out of memory");
4188 #if defined(__HAVE_DIRECT_MAP)
4189 		pagezero(PMAP_DIRECT_MAP(*paddrp));
4190 #else
4191 #if defined(XEN)
4192 		if (XEN_VERSION_SUPPORTED(3, 4)) {
4193 			xen_pagezero(*paddrp);
4194 			return true;
4195 		}
4196 #endif
4197 		kpreempt_disable();
4198 		pmap_pte_set(early_zero_pte,
4199 		    pmap_pa2pte(*paddrp) | PG_V | PG_RW | PG_k);
4200 		pmap_pte_flush();
4201 		pmap_update_pg((vaddr_t)early_zerop);
4202 		memset(early_zerop, 0, PAGE_SIZE);
4203 #if defined(DIAGNOSTIC) || defined(XEN)
4204 		pmap_pte_set(early_zero_pte, 0);
4205 		pmap_pte_flush();
4206 #endif /* defined(DIAGNOSTIC) || defined(XEN) */
4207 		kpreempt_enable();
4208 #endif /* defined(__HAVE_DIRECT_MAP) */
4209 	} else {
4210 		/* XXX */
4211 		ptp = uvm_pagealloc(NULL, 0, NULL,
4212 				    UVM_PGA_USERESERVE|UVM_PGA_ZERO);
4213 		if (ptp == NULL)
4214 			panic("pmap_get_physpage: out of memory");
4215 		ptp->flags &= ~PG_BUSY;
4216 		ptp->wire_count = 1;
4217 		*paddrp = VM_PAGE_TO_PHYS(ptp);
4218 	}
4219 	pmap_stats_update(kpm, 1, 0);
4220 	return true;
4221 }
4222 
4223 /*
4224  * Allocate the specified number of PTPs for a PTP level, and populate
4225  * all levels below accordingly, mapping virtual addresses starting at
4226  * kva.
4227  *
4228  * Used by pmap_growkernel.
4229  */
4230 static void
4231 pmap_alloc_level(pd_entry_t * const *pdes, vaddr_t kva, int lvl,
4232     long *needed_ptps)
4233 {
4234 	unsigned long i;
4235 	vaddr_t va;
4236 	paddr_t pa;
4237 	unsigned long index, endindex;
4238 	int level;
4239 	pd_entry_t *pdep;
4240 #ifdef XEN
4241 	int s = splvm(); /* protect xpq_* */
4242 #endif
4243 
4244 	for (level = lvl; level > 1; level--) {
4245 		if (level == PTP_LEVELS)
4246 			pdep = pmap_kernel()->pm_pdir;
4247 		else
4248 			pdep = pdes[level - 2];
4249 		va = kva;
4250 		index = pl_i_roundup(kva, level);
4251 		endindex = index + needed_ptps[level - 1] - 1;
4252 
4253 
4254 		for (i = index; i <= endindex; i++) {
4255 			pt_entry_t pte;
4256 
4257 			KASSERT(!pmap_valid_entry(pdep[i]));
4258 			pmap_get_physpage(va, level - 1, &pa);
4259 			pte = pmap_pa2pte(pa) | PG_k | PG_V | PG_RW;
4260 #ifdef XEN
4261 			pmap_pte_set(&pdep[i], pte);
4262 #if defined(PAE) || defined(__x86_64__)
4263 			if (level == PTP_LEVELS && i >= PDIR_SLOT_KERN) {
4264 				if (__predict_true(
4265 				    cpu_info_primary.ci_flags & CPUF_PRESENT)) {
4266 					/* update per-cpu PMDs on all cpus */
4267 					xen_kpm_sync(pmap_kernel(), i);
4268 				} else {
4269 					/*
4270 					 * too early; update primary CPU
4271 					 * PMD only (without locks)
4272 					 */
4273 #ifdef PAE
4274 					pd_entry_t *cpu_pdep =
4275 					    &cpu_info_primary.ci_kpm_pdir[l2tol2(i)];
4276 #endif
4277 #ifdef __x86_64__
4278 					pd_entry_t *cpu_pdep =
4279 						&cpu_info_primary.ci_kpm_pdir[i];
4280 #endif
4281 					pmap_pte_set(cpu_pdep, pte);
4282 				}
4283 			}
4284 #endif /* PAE || __x86_64__ */
4285 #else /* XEN */
4286 			pdep[i] = pte;
4287 #endif /* XEN */
4288 			KASSERT(level != PTP_LEVELS || nkptp[level - 1] +
4289 			    pl_i(VM_MIN_KERNEL_ADDRESS, level) == i);
4290 			nkptp[level - 1]++;
4291 			va += nbpd[level - 1];
4292 		}
4293 		pmap_pte_flush();
4294 	}
4295 #ifdef XEN
4296 	splx(s);
4297 #endif
4298 }
4299 
4300 /*
4301  * pmap_growkernel: increase usage of KVM space
4302  *
4303  * => we allocate new PTPs for the kernel and install them in all
4304  *	the pmaps on the system.
4305  */
4306 
4307 vaddr_t
4308 pmap_growkernel(vaddr_t maxkvaddr)
4309 {
4310 	struct pmap *kpm = pmap_kernel();
4311 #if !defined(XEN) || !defined(__x86_64__)
4312 	struct pmap *pm;
4313 	long old;
4314 #endif
4315 	int s, i;
4316 	long needed_kptp[PTP_LEVELS], target_nptp;
4317 	bool invalidate = false;
4318 
4319 	s = splvm();	/* to be safe */
4320 	mutex_enter(kpm->pm_lock);
4321 
4322 	if (maxkvaddr <= pmap_maxkvaddr) {
4323 		mutex_exit(kpm->pm_lock);
4324 		splx(s);
4325 		return pmap_maxkvaddr;
4326 	}
4327 
4328 	maxkvaddr = x86_round_pdr(maxkvaddr);
4329 #if !defined(XEN) || !defined(__x86_64__)
4330 	old = nkptp[PTP_LEVELS - 1];
4331 #endif
4332 
4333 	/*
4334 	 * This loop could be optimized more, but pmap_growkernel()
4335 	 * is called infrequently.
4336 	 */
4337 	for (i = PTP_LEVELS - 1; i >= 1; i--) {
4338 		target_nptp = pl_i_roundup(maxkvaddr, i + 1) -
4339 		    pl_i_roundup(VM_MIN_KERNEL_ADDRESS, i + 1);
4340 		/*
4341 		 * XXX only need to check toplevel.
4342 		 */
4343 		if (target_nptp > nkptpmax[i])
4344 			panic("out of KVA space");
4345 		KASSERT(target_nptp >= nkptp[i]);
4346 		needed_kptp[i] = target_nptp - nkptp[i];
4347 	}
4348 
4349 	pmap_alloc_level(normal_pdes, pmap_maxkvaddr, PTP_LEVELS, needed_kptp);
4350 
4351 	/*
4352 	 * If the number of top level entries changed, update all
4353 	 * pmaps.
4354 	 */
4355 	if (needed_kptp[PTP_LEVELS - 1] != 0) {
4356 #ifdef XEN
4357 #ifdef __x86_64__
4358 		/* nothing, kernel entries are never entered in user pmap */
4359 #else /* __x86_64__ */
4360 		mutex_enter(&pmaps_lock);
4361 		LIST_FOREACH(pm, &pmaps, pm_list) {
4362 			int pdkidx;
4363 			for (pdkidx =  PDIR_SLOT_KERN + old;
4364 			    pdkidx < PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1];
4365 			    pdkidx++) {
4366 				pmap_pte_set(&pm->pm_pdir[pdkidx],
4367 				    kpm->pm_pdir[pdkidx]);
4368 			}
4369 			pmap_pte_flush();
4370 		}
4371 		mutex_exit(&pmaps_lock);
4372 #endif /* __x86_64__ */
4373 #else /* XEN */
4374 		unsigned newpdes;
4375 		newpdes = nkptp[PTP_LEVELS - 1] - old;
4376 		mutex_enter(&pmaps_lock);
4377 		LIST_FOREACH(pm, &pmaps, pm_list) {
4378 			memcpy(&pm->pm_pdir[PDIR_SLOT_KERN + old],
4379 			       &kpm->pm_pdir[PDIR_SLOT_KERN + old],
4380 			       newpdes * sizeof (pd_entry_t));
4381 		}
4382 		mutex_exit(&pmaps_lock);
4383 #endif
4384 		invalidate = true;
4385 	}
4386 	pmap_maxkvaddr = maxkvaddr;
4387 	mutex_exit(kpm->pm_lock);
4388 	splx(s);
4389 
4390 	if (invalidate && pmap_initialized) {
4391 		/* Invalidate the PDP cache. */
4392 		pool_cache_invalidate(&pmap_pdp_cache);
4393 	}
4394 
4395 	return maxkvaddr;
4396 }
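
/*
 * Illustrative use (a sketch of the uvm_map growth check, not copied
 * from it; "new_end" is hypothetical): callers are expected to grow the
 * kernel pmap before handing out kernel VA above pmap_maxkvaddr, e.g.
 *
 *	if (uvm_maxkaddr < new_end)
 *		uvm_maxkaddr = pmap_growkernel(new_end);
 *
 * The return value is the new, pdir-rounded limit of usable kernel VA.
 */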
4397 
4398 #ifdef DEBUG
4399 void pmap_dump(struct pmap *, vaddr_t, vaddr_t);
4400 
4401 /*
4402  * pmap_dump: dump all the mappings from a pmap
4403  *
4404  * => caller should not be holding any pmap locks
4405  */
4406 
4407 void
4408 pmap_dump(struct pmap *pmap, vaddr_t sva, vaddr_t eva)
4409 {
4410 	pt_entry_t *ptes, *pte;
4411 	pd_entry_t * const *pdes;
4412 	struct pmap *pmap2;
4413 	vaddr_t blkendva;
4414 
4415 	/*
4416 	 * if end is out of range, truncate.
4417 	 * if (end <= start), update to max.
4418 	 */
4419 
4420 	if (eva > VM_MAXUSER_ADDRESS || eva <= sva)
4421 		eva = VM_MAXUSER_ADDRESS;
4422 
4423 	/*
4424 	 * we lock in the pmap => pv_head direction
4425 	 */
4426 
4427 	kpreempt_disable();
4428 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);	/* locks pmap */
4429 
4430 	/*
4431 	 * dumping a range of pages: we dump in PTP-sized blocks
	 * (4MB on non-PAE i386, 2MB otherwise)
4432 	 */
4433 
4434 	for (/* null */ ; sva < eva ; sva = blkendva) {
4435 
4436 		/* determine range of block */
4437 		blkendva = x86_round_pdr(sva+1);
4438 		if (blkendva > eva)
4439 			blkendva = eva;
4440 
4441 		/* valid block? */
4442 		if (!pmap_pdes_valid(sva, pdes, NULL))
4443 			continue;
4444 
4445 		pte = &ptes[pl1_i(sva)];
4446 		for (/* null */; sva < blkendva ; sva += PAGE_SIZE, pte++) {
4447 			if (!pmap_valid_entry(*pte))
4448 				continue;
4449 			printf("va %#" PRIxVADDR " -> pa %#" PRIxPADDR
4450 			    " (pte=%#" PRIxPADDR ")\n",
4451 			    sva, (paddr_t)pmap_pte2pa(*pte), (paddr_t)*pte);
4452 		}
4453 	}
4454 	pmap_unmap_ptes(pmap, pmap2);
4455 	kpreempt_enable();
4456 }
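
/*
 * Illustrative use (a sketch): intended for ad-hoc debugging, e.g. from
 * ddb or a test hook:
 *
 *	pmap_dump(vm_map_pmap(&curproc->p_vmspace->vm_map), sva, eva);
 *
 * Passing eva <= sva dumps everything from sva up to
 * VM_MAXUSER_ADDRESS, per the truncation logic above.
 */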
4457 #endif
4458 
4459 /*
4460  * pmap_update: process deferred invalidations and frees.
4461  */
4462 
4463 void
4464 pmap_update(struct pmap *pmap)
4465 {
4466 	struct vm_page *empty_ptps;
4467 	lwp_t *l = curlwp;
4468 
4469 	/*
4470 	 * If we have torn down this pmap, invalidate non-global TLB
4471 	 * entries on any processors using it.
4472 	 */
4473 	kpreempt_disable();
4474 	if (__predict_false(l->l_md.md_gc_pmap == pmap)) {
4475 		l->l_md.md_gc_pmap = NULL;
4476 		pmap_tlb_shootdown(pmap, (vaddr_t)-1LL, 0, TLBSHOOT_UPDATE);
4477 	}
4478 	/*
4479 	 * Initiate any pending TLB shootdowns.  Wait for them to
4480 	 * complete before returning control to the caller.
4481 	 */
4482 	pmap_tlb_shootnow();
4483 	kpreempt_enable();
4484 
4485 	/*
4486 	 * Now that shootdowns are complete, process deferred frees,
4487 	 * but not from interrupt context.
4488 	 */
4489 	if (l->l_md.md_gc_ptp != NULL) {
4490 		KASSERT((l->l_pflag & LP_INTR) == 0);
4491 		if (cpu_intr_p()) {
4492 			return;
4493 		}
4494 		empty_ptps = l->l_md.md_gc_ptp;
4495 		l->l_md.md_gc_ptp = NULL;
4496 		pmap_free_ptps(empty_ptps);
4497 	}
4498 }
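
/*
 * Illustrative pattern (a sketch): pmap_update() should follow a batch
 * of pmap operations before their effects are relied on; in particular,
 * removed kernel mappings must be flushed before the VA or pages are
 * reused, e.g. for a temporary kernel mapping:
 *
 *	pmap_kenter_pa(va, pa, VM_PROT_READ | VM_PROT_WRITE, 0);
 *	pmap_update(pmap_kernel());
 *	... use the mapping ...
 *	pmap_kremove(va, PAGE_SIZE);
 *	pmap_update(pmap_kernel());
 */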
4499 
4500 #if PTP_LEVELS > 4
4501 #error "Unsupported number of page table mappings"
4502 #endif
4503 
4504 paddr_t
4505 pmap_init_tmp_pgtbl(paddr_t pg)
4506 {
4507 	static bool maps_loaded;
4508 	static const paddr_t x86_tmp_pml_paddr[] = {
4509 	    4 * PAGE_SIZE,
4510 	    5 * PAGE_SIZE,
4511 	    6 * PAGE_SIZE,
4512 	    7 * PAGE_SIZE
4513 	};
4514 	static vaddr_t x86_tmp_pml_vaddr[] = { 0, 0, 0, 0 };
4515 
4516 	pd_entry_t *tmp_pml, *kernel_pml;
4517 
4518 	int level;
4519 
4520 	if (!maps_loaded) {
4521 		for (level = 0; level < PTP_LEVELS; ++level) {
4522 			x86_tmp_pml_vaddr[level] =
4523 			    uvm_km_alloc(kernel_map, PAGE_SIZE, 0,
4524 			    UVM_KMF_VAONLY);
4525 
4526 			if (x86_tmp_pml_vaddr[level] == 0)
4527 				panic("mapping of real mode PML failed\n");
4528 			pmap_kenter_pa(x86_tmp_pml_vaddr[level],
4529 			    x86_tmp_pml_paddr[level],
4530 			    VM_PROT_READ | VM_PROT_WRITE, 0);
4531 			pmap_update(pmap_kernel());
4532 		}
4533 		maps_loaded = true;
4534 	}
4535 
4536 	/* Zero all levels below the topmost */
4537 	for (level = 0; level < PTP_LEVELS - 1; ++level) {
4538 		tmp_pml = (void *)x86_tmp_pml_vaddr[level];
4539 		memset(tmp_pml, 0, PAGE_SIZE);
4540 	}
4541 
4542 	/* Copy PML4 */
4543 	kernel_pml = pmap_kernel()->pm_pdir;
4544 	tmp_pml = (void *)x86_tmp_pml_vaddr[PTP_LEVELS - 1];
4545 	memcpy(tmp_pml, kernel_pml, PAGE_SIZE);
4546 
4547 #ifdef PAE
4548 	/*
4549 	 * Use the last 4 entries of the L2 page as L3 PD entries. These
4550 	 * last entries are unlikely to be used for temporary mappings.
4551 	 * 508: maps 0->1GB (userland)
4552 	 * 509: unused
4553 	 * 510: unused
4554 	 * 511: maps 3->4GB (kernel)
4555 	 */
4556 	tmp_pml[508] = x86_tmp_pml_paddr[PTP_LEVELS - 1] | PG_V;
4557 	tmp_pml[509] = 0;
4558 	tmp_pml[510] = 0;
4559 	tmp_pml[511] = pmap_pdirpa(pmap_kernel(), PDIR_SLOT_KERN) | PG_V;
4560 #endif
4561 
4562 	for (level = PTP_LEVELS - 1; level > 0; --level) {
4563 		tmp_pml = (void *)x86_tmp_pml_vaddr[level];
4564 
4565 		tmp_pml[pl_i(pg, level + 1)] =
4566 		    (x86_tmp_pml_paddr[level - 1] & PG_FRAME) | PG_RW | PG_V;
4567 	}
4568 
4569 	tmp_pml = (void *)x86_tmp_pml_vaddr[0];
4570 	tmp_pml[pl_i(pg, 1)] = (pg & PG_FRAME) | PG_RW | PG_V;
4571 
4572 #ifdef PAE
4573 	/* Return the PA of the L3 page (entry 508 of the L2 page) */
4574 	return x86_tmp_pml_paddr[PTP_LEVELS - 1] + 508 * sizeof(pd_entry_t);
4575 #endif
4576 
4577 	return x86_tmp_pml_paddr[PTP_LEVELS - 1];
4578 }
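
/*
 * Illustrative use (a sketch): low-memory trampolines (AP startup,
 * ACPI resume) are assumed to call this with the physical address of
 * their trampoline page and load the result into %cr3 (or the PAE L3
 * pointer) while running from that page:
 *
 *	paddr_t tmp_pdirpa = pmap_init_tmp_pgtbl(trampoline_pa);
 *
 * "trampoline_pa" is hypothetical here; the temporary tables live in
 * the fixed low pages listed above and mirror the kernel's own
 * top-level entries.
 */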
4579 
4580 u_int
4581 x86_mmap_flags(paddr_t mdpgno)
4582 {
4583 	u_int nflag = (mdpgno >> X86_MMAP_FLAG_SHIFT) & X86_MMAP_FLAG_MASK;
4584 	u_int pflag = 0;
4585 
4586 	if (nflag & X86_MMAP_FLAG_PREFETCH)
4587 		pflag |= PMAP_WRITE_COMBINE;
4588 
4589 	return pflag;
4590 }
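
/*
 * Illustrative driver-side sketch (the encoding mirrors the decode
 * above; "pa" and the caller are hypothetical): a device mmap entry
 * point wanting a write-combined mapping would return
 *
 *	return x86_btop(pa) |
 *	    ((paddr_t)X86_MMAP_FLAG_PREFETCH << X86_MMAP_FLAG_SHIFT);
 *
 * and the MI mmap code hands that cookie back here, where the
 * prefetch flag becomes PMAP_WRITE_COMBINE for pmap_enter().
 */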
4591