1 /*	$NetBSD: pmap.c,v 1.196 2016/05/21 07:15:56 maxv Exp $	*/
2 
3 /*-
4  * Copyright (c) 2008, 2010 The NetBSD Foundation, Inc.
5  * All rights reserved.
6  *
7  * This code is derived from software contributed to The NetBSD Foundation
8  * by Andrew Doran.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29  * POSSIBILITY OF SUCH DAMAGE.
30  */
31 
32 /*
33  * Copyright (c) 2007 Manuel Bouyer.
34  *
35  * Redistribution and use in source and binary forms, with or without
36  * modification, are permitted provided that the following conditions
37  * are met:
38  * 1. Redistributions of source code must retain the above copyright
39  *    notice, this list of conditions and the following disclaimer.
40  * 2. Redistributions in binary form must reproduce the above copyright
41  *    notice, this list of conditions and the following disclaimer in the
42  *    documentation and/or other materials provided with the distribution.
43  *
44  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
45  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
46  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
47  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
48  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
49  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
50  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
51  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
52  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
53  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
54  *
55  */
56 
57 /*
58  * Copyright (c) 2006 Mathieu Ropert <mro@adviseo.fr>
59  *
60  * Permission to use, copy, modify, and distribute this software for any
61  * purpose with or without fee is hereby granted, provided that the above
62  * copyright notice and this permission notice appear in all copies.
63  *
64  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
65  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
66  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
67  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
68  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
69  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
70  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
71  */
72 
73 /*
74  * Copyright (c) 1997 Charles D. Cranor and Washington University.
75  * All rights reserved.
76  *
77  * Redistribution and use in source and binary forms, with or without
78  * modification, are permitted provided that the following conditions
79  * are met:
80  * 1. Redistributions of source code must retain the above copyright
81  *    notice, this list of conditions and the following disclaimer.
82  * 2. Redistributions in binary form must reproduce the above copyright
83  *    notice, this list of conditions and the following disclaimer in the
84  *    documentation and/or other materials provided with the distribution.
85  *
86  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
87  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
88  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
89  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
90  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
91  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
92  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
93  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
94  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
95  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
96  */
97 
98 /*
99  * Copyright 2001 (c) Wasabi Systems, Inc.
100  * All rights reserved.
101  *
102  * Written by Frank van der Linden for Wasabi Systems, Inc.
103  *
104  * Redistribution and use in source and binary forms, with or without
105  * modification, are permitted provided that the following conditions
106  * are met:
107  * 1. Redistributions of source code must retain the above copyright
108  *    notice, this list of conditions and the following disclaimer.
109  * 2. Redistributions in binary form must reproduce the above copyright
110  *    notice, this list of conditions and the following disclaimer in the
111  *    documentation and/or other materials provided with the distribution.
112  * 3. All advertising materials mentioning features or use of this software
113  *    must display the following acknowledgement:
114  *      This product includes software developed for the NetBSD Project by
115  *      Wasabi Systems, Inc.
116  * 4. The name of Wasabi Systems, Inc. may not be used to endorse
117  *    or promote products derived from this software without specific prior
118  *    written permission.
119  *
120  * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
121  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
122  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
123  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL WASABI SYSTEMS, INC
124  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
125  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
126  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
127  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
128  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
129  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
130  * POSSIBILITY OF SUCH DAMAGE.
131  */
132 
133 /*
134  * This is the i386 pmap modified and generalized to support x86-64
135  * as well. The idea is to hide the upper N levels of the page tables
136  * inside pmap_get_ptp, pmap_free_ptp and pmap_growkernel. The rest
137  * is mostly untouched, except that it uses some more generalized
138  * macros and interfaces.
139  *
140  * This pmap has been tested on the i386 as well, and it can be easily
141  * adapted to PAE.
142  *
143  * fvdl@wasabisystems.com 18-Jun-2001
144  */
145 
146 /*
147  * pmap.c: i386 pmap module rewrite
148  * Chuck Cranor <chuck@netbsd>
149  * 11-Aug-97
150  *
151  * history of this pmap module: in addition to my own input, i used
152  *    the following references for this rewrite of the i386 pmap:
153  *
154  * [1] the NetBSD i386 pmap.   this pmap appears to be based on the
155  *     BSD hp300 pmap done by Mike Hibler at University of Utah.
156  *     it was then ported to the i386 by William Jolitz of UUNET
157  *     Technologies, Inc.   Then Charles M. Hannum of the NetBSD
158  *     project fixed some bugs and provided some speed ups.
159  *
160  * [2] the FreeBSD i386 pmap.   this pmap seems to be the
161  *     Hibler/Jolitz pmap, as modified for FreeBSD by John S. Dyson
162  *     and David Greenman.
163  *
164  * [3] the Mach pmap.   this pmap, from CMU, seems to have migrated
165  *     between several processors.   the VAX version was done by
166  *     Avadis Tevanian, Jr., and Michael Wayne Young.    the i386
167  *     version was done by Lance Berc, Mike Kupfer, Bob Baron,
168  *     David Golub, and Richard Draves.    the alpha version was
169  *     done by Alessandro Forin (CMU/Mach) and Chris Demetriou
170  *     (NetBSD/alpha).
171  */
172 
173 #include <sys/cdefs.h>
174 __KERNEL_RCSID(0, "$NetBSD: pmap.c,v 1.196 2016/05/21 07:15:56 maxv Exp $");
175 
176 #include "opt_user_ldt.h"
177 #include "opt_lockdebug.h"
178 #include "opt_multiprocessor.h"
179 #include "opt_xen.h"
180 #if !defined(__x86_64__)
181 #include "opt_kstack_dr0.h"
182 #endif /* !defined(__x86_64__) */
183 
184 #include <sys/param.h>
185 #include <sys/systm.h>
186 #include <sys/proc.h>
187 #include <sys/pool.h>
188 #include <sys/kernel.h>
189 #include <sys/atomic.h>
190 #include <sys/cpu.h>
191 #include <sys/intr.h>
192 #include <sys/xcall.h>
193 #include <sys/kcore.h>
194 
195 #include <uvm/uvm.h>
196 #include <uvm/pmap/pmap_pvt.h>
197 
198 #include <dev/isa/isareg.h>
199 
200 #include <machine/specialreg.h>
201 #include <machine/gdt.h>
202 #include <machine/isa_machdep.h>
203 #include <machine/cpuvar.h>
204 #include <machine/cputypes.h>
205 
206 #include <x86/pmap.h>
207 #include <x86/pmap_pv.h>
208 
209 #include <x86/i82489reg.h>
210 #include <x86/i82489var.h>
211 
212 #ifdef XEN
213 #include <xen/xen-public/xen.h>
214 #include <xen/hypervisor.h>
215 #endif
216 
217 /*
218  * general info:
219  *
220  *  - for an explanation of how the i386 MMU hardware works see
221  *    the comments in <machine/pte.h>.
222  *
223  *  - for an explanation of the general memory structure used by
224  *    this pmap (including the recursive mapping), see the comments
225  *    in <machine/pmap.h>.
226  *
227  * this file contains the code for the "pmap module."   the module's
228  * job is to manage the hardware's virtual to physical address mappings.
229  * note that there are two levels of mapping in the VM system:
230  *
231  *  [1] the upper layer of the VM system uses vm_map's and vm_map_entry's
232  *      to map ranges of virtual address space to objects/files.  for
233  *      example, the vm_map may say: "map VA 0x1000 to 0x22000 read-only
234  *      to the file /bin/ls starting at offset zero."   note that
235  *      the upper layer mapping is not concerned with how individual
236  *      vm_pages are mapped.
237  *
238  *  [2] the lower layer of the VM system (the pmap) maintains the mappings
239  *      from virtual addresses.   it is concerned with which vm_page is
240  *      mapped where.   for example, when you run /bin/ls and start
241  *      at page 0x1000 the fault routine may lookup the correct page
242  *      of the /bin/ls file and then ask the pmap layer to establish
243  *      a mapping for it.
244  *
245  * note that information in the lower layer of the VM system can be
246  * thrown away since it can easily be reconstructed from the info
247  * in the upper layer.
248  *
249  * data structures we use include:
250  *
251  *  - struct pmap: describes the address space of one thread
252  *  - struct pmap_page: describes one pv-tracked page, without
253  *	necessarily a corresponding vm_page
254  *  - struct pv_entry: describes one <PMAP,VA> mapping of a PA
255  *  - struct pv_head: there is one pv_head per pv-tracked page of
256  *	physical memory.   the pv_head points to a list of pv_entry
257  *	structures which describe all the <PMAP,VA> pairs that this
258  *      page is mapped in.    this is critical for page based operations
259  *      such as pmap_page_protect() [change protection on _all_ mappings
260  *      of a page]
261  */
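
/*
 * illustrative sketch (not part of the build): how the upper layer
 * typically hands work to this module through the standard pmap(9)
 * entry points, once the fault routine has found the vm_page:
 *
 *	error = pmap_enter(vm_map_pmap(map), va, VM_PAGE_TO_PHYS(pg),
 *	    VM_PROT_READ, VM_PROT_READ | PMAP_CANFAIL);
 *	...
 *	pmap_update(vm_map_pmap(map));
 *
 * the pmap layer then records the <PMAP,VA> pair on the page's pv list
 * so that all mappings of the page can be found again later.
 */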
262 
263 /*
264  * memory allocation
265  *
266  *  - there are three data structures that we must dynamically allocate:
267  *
268  * [A] new process' page directory page (PDP)
269  *	- plan 1: allocated at pmap_create() time, using
270  *	  uvm_km_alloc(kernel_map, PAGE_SIZE)  [fka kmem_alloc] to do the
271  *	  allocation.
272  *
273  * if we are low in free physical memory then we sleep in
274  * uvm_km_alloc -- in this case this is ok since we are creating
275  * a new pmap and should not be holding any locks.
276  *
277  * if the kernel is totally out of virtual space
278  * (i.e. uvm_km_alloc returns NULL), then we panic.
279  *
280  * [B] new page tables pages (PTP)
281  * 	- call uvm_pagealloc()
282  * 		=> success: zero page, add to pm_pdir
283  * 		=> failure: we are out of free vm_pages, let pmap_enter()
284  *		   tell UVM about it.
285  *
286  * note: for kernel PTPs, we start with NKPTP of them.   as we map
287  * kernel memory (at uvm_map time) we check to see if we need to grow
288  * the kernel pmap.   if so, we call the optional function
289  * pmap_growkernel() to grow the kernel PTPs in advance.
290  *
291  * [C] pv_entry structures
292  */
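
/*
 * sketch of the PTP allocation described in [B] above -- roughly what
 * pmap_get_ptp() does, shown for illustration only (the names lidx and
 * level here just stand for the page-table level being filled in):
 *
 *	ptp = uvm_pagealloc(&pmap->pm_obj[lidx], ptp_va2o(va, level),
 *	    NULL, UVM_PGA_USERESERVE | UVM_PGA_ZERO);
 *	if (ptp == NULL)
 *		return NULL;	(pmap_enter() then reports the failure
 *				 back to UVM, e.g. as ENOMEM)
 */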
293 
294 /*
295  * locking
296  *
297  * we have the following locks that we must contend with:
298  *
299  * mutexes:
300  *
301  * - pmap lock (per pmap, part of uvm_object)
302  *   this lock protects the fields in the pmap structure including
303  *   the non-kernel PDEs in the PDP, and the PTEs.  it also locks
304  *   in the alternate PTE space (since that is determined by the
305  *   entry in the PDP).
306  *
307  * - pvh_lock (per pv_head)
308  *   this lock protects the pv_entry list which is chained off the
309  *   pv_head structure for a specific pv-tracked PA.   it is locked
310  *   when traversing the list (e.g. adding/removing mappings,
311  *   syncing R/M bits, etc.)
312  *
313  * - pmaps_lock
314  *   this lock protects the list of active pmaps (headed by "pmaps").
315  *   we lock it when adding or removing pmaps from this list.
316  */
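
/*
 * for example, putting a freshly created pmap on the global list only
 * requires pmaps_lock (a sketch of what pmap_create() roughly does):
 *
 *	mutex_enter(&pmaps_lock);
 *	LIST_INSERT_HEAD(&pmaps, pmap, pm_list);
 *	mutex_exit(&pmaps_lock);
 */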
317 
318 const vaddr_t ptp_masks[] = PTP_MASK_INITIALIZER;
319 const int ptp_shifts[] = PTP_SHIFT_INITIALIZER;
320 const long nkptpmax[] = NKPTPMAX_INITIALIZER;
321 const long nbpd[] = NBPD_INITIALIZER;
322 pd_entry_t * const normal_pdes[] = PDES_INITIALIZER;
323 
324 long nkptp[] = NKPTP_INITIALIZER;
325 
326 struct pmap_head pmaps;
327 kmutex_t pmaps_lock;
328 
329 static vaddr_t pmap_maxkvaddr;
330 
331 /*
332  * XXX kludge: dummy locking to make KASSERTs in uvm_page.c comfortable.
333  * actual locking is done by pm_lock.
334  */
335 #if defined(DIAGNOSTIC)
336 #define	PMAP_SUBOBJ_LOCK(pm, idx) \
337 	KASSERT(mutex_owned((pm)->pm_lock)); \
338 	if ((idx) != 0) \
339 		mutex_enter((pm)->pm_obj[(idx)].vmobjlock)
340 #define	PMAP_SUBOBJ_UNLOCK(pm, idx) \
341 	KASSERT(mutex_owned((pm)->pm_lock)); \
342 	if ((idx) != 0) \
343 		mutex_exit((pm)->pm_obj[(idx)].vmobjlock)
344 #else /* defined(DIAGNOSTIC) */
345 #define	PMAP_SUBOBJ_LOCK(pm, idx)	/* nothing */
346 #define	PMAP_SUBOBJ_UNLOCK(pm, idx)	/* nothing */
347 #endif /* defined(DIAGNOSTIC) */
348 
349 /*
350  * Misc. event counters.
351  */
352 struct evcnt pmap_iobmp_evcnt;
353 struct evcnt pmap_ldt_evcnt;
354 
355 /*
356  * PAT
357  */
358 #define	PATENTRY(n, type)	(type << ((n) * 8))
359 #define	PAT_UC		0x0ULL
360 #define	PAT_WC		0x1ULL
361 #define	PAT_WT		0x4ULL
362 #define	PAT_WP		0x5ULL
363 #define	PAT_WB		0x6ULL
364 #define	PAT_UCMINUS	0x7ULL
365 
366 static bool cpu_pat_enabled __read_mostly = false;
367 
368 /*
369  * global data structures
370  */
371 
372 static struct pmap kernel_pmap_store;	/* the kernel's pmap (proc0) */
373 struct pmap *const kernel_pmap_ptr = &kernel_pmap_store;
374 
375 /*
376  * pmap_pg_g: if our processor supports PG_G in the PTE then we
377  * set pmap_pg_g to PG_G (otherwise it is zero).
378  */
379 
380 int pmap_pg_g __read_mostly = 0;
381 
382 /*
383  * pmap_largepages: if our processor supports PG_PS and we are
384  * using it, this is set to true.
385  */
386 
387 int pmap_largepages __read_mostly;
388 
389 /*
390  * i386 physical memory comes in a big contig chunk with a small
391  * hole toward the front of it...  the following two paddr_t's
392  * (shared with machdep.c) describe the physical address space
393  * of this machine.
394  */
395 paddr_t avail_start __read_mostly; /* PA of first available physical page */
396 paddr_t avail_end __read_mostly; /* PA of last available physical page */
397 
398 #ifdef XEN
399 #ifdef __x86_64__
400 /* Dummy PGD for user cr3, used between pmap_deactivate() and pmap_activate() */
401 static paddr_t xen_dummy_user_pgd;
402 #endif /* __x86_64__ */
403 paddr_t pmap_pa_start; /* PA of first physical page for this domain */
404 paddr_t pmap_pa_end;   /* PA of last physical page for this domain */
405 #endif /* XEN */
406 
407 #define	VM_PAGE_TO_PP(pg)	(&(pg)->mdpage.mp_pp)
408 
409 #define	PV_HASH_SIZE		32768
410 #define	PV_HASH_LOCK_CNT	32
411 
412 struct pv_hash_lock {
413 	kmutex_t lock;
414 } __aligned(CACHE_LINE_SIZE) pv_hash_locks[PV_HASH_LOCK_CNT]
415     __aligned(CACHE_LINE_SIZE);
416 
417 struct pv_hash_head {
418 	SLIST_HEAD(, pv_entry) hh_list;
419 } pv_hash_heads[PV_HASH_SIZE];
420 
421 static u_int
422 pvhash_hash(struct vm_page *ptp, vaddr_t va)
423 {
424 
425 	return (uintptr_t)ptp / sizeof(*ptp) + (va >> PAGE_SHIFT);
426 }
427 
428 static struct pv_hash_head *
429 pvhash_head(u_int hash)
430 {
431 
432 	return &pv_hash_heads[hash % PV_HASH_SIZE];
433 }
434 
435 static kmutex_t *
436 pvhash_lock(u_int hash)
437 {
438 
439 	return &pv_hash_locks[hash % PV_HASH_LOCK_CNT].lock;
440 }
441 
442 static struct pv_entry *
443 pvhash_remove(struct pv_hash_head *hh, struct vm_page *ptp, vaddr_t va)
444 {
445 	struct pv_entry *pve;
446 	struct pv_entry *prev;
447 
448 	prev = NULL;
449 	SLIST_FOREACH(pve, &hh->hh_list, pve_hash) {
450 		if (pve->pve_pte.pte_ptp == ptp &&
451 		    pve->pve_pte.pte_va == va) {
452 			if (prev != NULL) {
453 				SLIST_REMOVE_AFTER(prev, pve_hash);
454 			} else {
455 				SLIST_REMOVE_HEAD(&hh->hh_list, pve_hash);
456 			}
457 			break;
458 		}
459 		prev = pve;
460 	}
461 	return pve;
462 }
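
/*
 * typical (sketched) use of the hash helpers above: look up and unlink
 * the pv_entry for a <ptp,va> pair while holding the hash bucket lock.
 *
 *	hash = pvhash_hash(ptp, va);
 *	lock = pvhash_lock(hash);
 *	hh = pvhash_head(hash);
 *	mutex_spin_enter(lock);
 *	pve = pvhash_remove(hh, ptp, va);
 *	mutex_spin_exit(lock);
 *
 * pmap_remove_pv() below follows this pattern.
 */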
463 
464 /*
465  * other data structures
466  */
467 
468 static pt_entry_t protection_codes[8] __read_mostly; /* maps MI prot to i386
469 							prot code */
470 static bool pmap_initialized __read_mostly = false; /* pmap_init done yet? */
471 
472 /*
473  * the following two vaddr_t's are used during system startup
474  * to keep track of how much of the kernel's VM space we have used.
475  * once the system is started, the management of the remaining kernel
476  * VM space is turned over to the kernel_map vm_map.
477  */
478 
479 static vaddr_t virtual_avail __read_mostly;	/* VA of first free KVA */
480 static vaddr_t virtual_end __read_mostly;	/* VA of last free KVA */
481 
482 /*
483  * pool that pmap structures are allocated from
484  */
485 
486 static struct pool_cache pmap_cache;
487 
488 /*
489  * pv_entry cache
490  */
491 
492 static struct pool_cache pmap_pv_cache;
493 
494 #ifdef __HAVE_DIRECT_MAP
495 
496 extern phys_ram_seg_t mem_clusters[];
497 extern int mem_cluster_cnt;
498 
499 #else
500 
501 /*
502  * MULTIPROCESSOR: special VA's/ PTE's are actually allocated inside a
503  * maxcpus*NPTECL array of PTE's, to avoid cache line thrashing
504  * due to false sharing.
505  */
506 
507 #ifdef MULTIPROCESSOR
508 #define PTESLEW(pte, id) ((pte)+(id)*NPTECL)
509 #define VASLEW(va,id) ((va)+(id)*NPTECL*PAGE_SIZE)
510 #else
511 #define PTESLEW(pte, id) ((void)id, pte)
512 #define VASLEW(va,id) ((void)id, va)
513 #endif
514 
515 /*
516  * special VAs and the PTEs that map them
517  */
518 static pt_entry_t *csrc_pte, *cdst_pte, *zero_pte, *ptp_pte, *early_zero_pte;
519 static char *csrcp, *cdstp, *zerop, *ptpp;
520 #ifdef XEN
521 char *early_zerop; /* also referenced from xen_pmap_bootstrap() */
522 #else
523 static char *early_zerop;
524 #endif
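
/*
 * for example (sketch), on a MULTIPROCESSOR kernel the CPU with index
 * `id' uses its own slots in the arrays above:
 *
 *	pt_entry_t *my_csrc_pte = PTESLEW(csrc_pte, id);
 *	char *my_csrcp = VASLEW(csrcp, id);
 *
 * so two CPUs never touch PTEs that share a cache line.
 */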
525 
526 #endif
527 
528 int pmap_enter_default(pmap_t, vaddr_t, paddr_t, vm_prot_t, u_int);
529 
530 /* PDP pool_cache(9) and its callbacks */
531 struct pool_cache pmap_pdp_cache;
532 static int  pmap_pdp_ctor(void *, void *, int);
533 static void pmap_pdp_dtor(void *, void *);
534 #ifdef PAE
535 /* need to allocate items of 4 pages */
536 static void *pmap_pdp_alloc(struct pool *, int);
537 static void pmap_pdp_free(struct pool *, void *);
538 static struct pool_allocator pmap_pdp_allocator = {
539 	.pa_alloc = pmap_pdp_alloc,
540 	.pa_free = pmap_pdp_free,
541 	.pa_pagesz = PAGE_SIZE * PDP_SIZE,
542 };
543 #endif /* PAE */
544 
545 extern vaddr_t idt_vaddr;			/* we allocate IDT early */
546 extern paddr_t idt_paddr;
547 
548 #ifdef _LP64
549 extern vaddr_t lo32_vaddr;
550 extern vaddr_t lo32_paddr;
551 #endif
552 
553 extern int end;
554 
555 #ifdef i386
556 /* stuff to fix the pentium f00f bug */
557 extern vaddr_t pentium_idt_vaddr;
558 #endif
559 
560 
561 /*
562  * local prototypes
563  */
564 
565 static struct vm_page	*pmap_get_ptp(struct pmap *, vaddr_t,
566 				      pd_entry_t * const *);
567 static struct vm_page	*pmap_find_ptp(struct pmap *, vaddr_t, paddr_t, int);
568 static void		 pmap_freepage(struct pmap *, struct vm_page *, int);
569 static void		 pmap_free_ptp(struct pmap *, struct vm_page *,
570 				       vaddr_t, pt_entry_t *,
571 				       pd_entry_t * const *);
572 static bool		 pmap_remove_pte(struct pmap *, struct vm_page *,
573 					 pt_entry_t *, vaddr_t,
574 					 struct pv_entry **);
575 static void		 pmap_remove_ptes(struct pmap *, struct vm_page *,
576 					  vaddr_t, vaddr_t, vaddr_t,
577 					  struct pv_entry **);
578 
579 static bool		 pmap_get_physpage(vaddr_t, int, paddr_t *);
580 static void		 pmap_alloc_level(pd_entry_t * const *, vaddr_t, int,
581 					  long *);
582 
583 static bool		 pmap_reactivate(struct pmap *);
584 
585 /*
586  * p m a p   h e l p e r   f u n c t i o n s
587  */
588 
589 static inline void
590 pmap_stats_update(struct pmap *pmap, int resid_diff, int wired_diff)
591 {
592 
593 	if (pmap == pmap_kernel()) {
594 		atomic_add_long(&pmap->pm_stats.resident_count, resid_diff);
595 		atomic_add_long(&pmap->pm_stats.wired_count, wired_diff);
596 	} else {
597 		KASSERT(mutex_owned(pmap->pm_lock));
598 		pmap->pm_stats.resident_count += resid_diff;
599 		pmap->pm_stats.wired_count += wired_diff;
600 	}
601 }
602 
603 static inline void
604 pmap_stats_update_bypte(struct pmap *pmap, pt_entry_t npte, pt_entry_t opte)
605 {
606 	int resid_diff = ((npte & PG_V) ? 1 : 0) - ((opte & PG_V) ? 1 : 0);
607 	int wired_diff = ((npte & PG_W) ? 1 : 0) - ((opte & PG_W) ? 1 : 0);
608 
609 	KASSERT((npte & (PG_V | PG_W)) != PG_W);
610 	KASSERT((opte & (PG_V | PG_W)) != PG_W);
611 
612 	pmap_stats_update(pmap, resid_diff, wired_diff);
613 }
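
/*
 * for example, installing a valid+wired PTE over an invalid one gives
 * resid_diff = 1 and wired_diff = 1; merely clearing PG_W on a valid
 * mapping gives resid_diff = 0 and wired_diff = -1.
 */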
614 
615 /*
616  * ptp_to_pmap: lookup pmap by ptp
617  */
618 
619 static struct pmap *
620 ptp_to_pmap(struct vm_page *ptp)
621 {
622 	struct pmap *pmap;
623 
624 	if (ptp == NULL) {
625 		return pmap_kernel();
626 	}
627 	pmap = (struct pmap *)ptp->uobject;
628 	KASSERT(pmap != NULL);
629 	KASSERT(&pmap->pm_obj[0] == ptp->uobject);
630 	return pmap;
631 }
632 
633 static inline struct pv_pte *
634 pve_to_pvpte(struct pv_entry *pve)
635 {
636 
637 	KASSERT((void *)&pve->pve_pte == (void *)pve);
638 	return &pve->pve_pte;
639 }
640 
641 static inline struct pv_entry *
642 pvpte_to_pve(struct pv_pte *pvpte)
643 {
644 	struct pv_entry *pve = (void *)pvpte;
645 
646 	KASSERT(pve_to_pvpte(pve) == pvpte);
647 	return pve;
648 }
649 
650 /*
651  * pv_pte_first, pv_pte_next: PV list iterator.
652  */
653 
654 static struct pv_pte *
655 pv_pte_first(struct pmap_page *pp)
656 {
657 
658 	if ((pp->pp_flags & PP_EMBEDDED) != 0) {
659 		return &pp->pp_pte;
660 	}
661 	return pve_to_pvpte(LIST_FIRST(&pp->pp_head.pvh_list));
662 }
663 
664 static struct pv_pte *
665 pv_pte_next(struct pmap_page *pp, struct pv_pte *pvpte)
666 {
667 
668 	KASSERT(pvpte != NULL);
669 	if (pvpte == &pp->pp_pte) {
670 		KASSERT((pp->pp_flags & PP_EMBEDDED) != 0);
671 		return NULL;
672 	}
673 	KASSERT((pp->pp_flags & PP_EMBEDDED) == 0);
674 	return pve_to_pvpte(LIST_NEXT(pvpte_to_pve(pvpte), pve_list));
675 }
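
/*
 * a (sketched) walk over every mapping of a pv-tracked page using the
 * iterator pair above; the caller must hold the appropriate locks:
 *
 *	for (pvpte = pv_pte_first(pp); pvpte != NULL;
 *	    pvpte = pv_pte_next(pp, pvpte)) {
 *		... look at pvpte->pte_ptp / pvpte->pte_va ...
 *	}
 */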
676 
677 /*
678  * pmap_is_curpmap: is this pmap the one currently loaded [in %cr3]?
679  *		of course the kernel is always loaded
680  */
681 
682 bool
683 pmap_is_curpmap(struct pmap *pmap)
684 {
685 	return((pmap == pmap_kernel()) ||
686 	       (pmap == curcpu()->ci_pmap));
687 }
688 
689 /*
690  *	Add a reference to the specified pmap.
691  */
692 
693 void
694 pmap_reference(struct pmap *pmap)
695 {
696 
697 	atomic_inc_uint(&pmap->pm_obj[0].uo_refs);
698 }
699 
700 /*
701  * pmap_map_ptes: map a pmap's PTEs into KVM and lock them in
702  *
703  * there are several pmaps involved.  some or all of them might be the same.
704  *
705  *	- the pmap given by the first argument
706  *		our caller wants to access this pmap's PTEs.
707  *
708  *	- pmap_kernel()
709  *		the kernel pmap.  note that it only contains the kernel part
710  *		of the address space which is shared by any pmap.  ie. any
711  *		pmap can be used instead of pmap_kernel() for our purpose.
712  *
713  *	- ci->ci_pmap
714  *		pmap currently loaded on the cpu.
715  *
716  *	- vm_map_pmap(&curproc->p_vmspace->vm_map)
717  *		current process' pmap.
718  *
719  * => we lock enough pmaps to keep things locked in
720  * => must be undone with pmap_unmap_ptes before returning
721  */
722 
723 void
724 pmap_map_ptes(struct pmap *pmap, struct pmap **pmap2,
725 	      pd_entry_t **ptepp, pd_entry_t * const **pdeppp)
726 {
727 	struct pmap *curpmap;
728 	struct cpu_info *ci;
729 	lwp_t *l;
730 
731 	/* The kernel's pmap is always accessible. */
732 	if (pmap == pmap_kernel()) {
733 		*pmap2 = NULL;
734 		*ptepp = PTE_BASE;
735 		*pdeppp = normal_pdes;
736 		return;
737 	}
738 	KASSERT(kpreempt_disabled());
739 
740 	l = curlwp;
741  retry:
742 	mutex_enter(pmap->pm_lock);
743 	ci = curcpu();
744 	curpmap = ci->ci_pmap;
745 	if (vm_map_pmap(&l->l_proc->p_vmspace->vm_map) == pmap) {
746 		/* Our own pmap so just load it: easy. */
747 		if (__predict_false(ci->ci_want_pmapload)) {
748 			mutex_exit(pmap->pm_lock);
749 			pmap_load();
750 			goto retry;
751 		}
752 		KASSERT(pmap == curpmap);
753 	} else if (pmap == curpmap) {
754 		/*
755 		 * Already on the CPU: make it valid.  This is very
756 		 * often the case during exit(), when we have switched
757 		 * to the kernel pmap in order to destroy a user pmap.
758 		 */
759 		if (!pmap_reactivate(pmap)) {
760 			u_int gen = uvm_emap_gen_return();
761 			tlbflush();
762 			uvm_emap_update(gen);
763 		}
764 	} else {
765 		/*
766 		 * Toss current pmap from CPU, but keep a reference to it.
767 		 * The reference will be dropped by pmap_unmap_ptes().
768 		 * Can happen if we block during exit().
769 		 */
770 		const cpuid_t cid = cpu_index(ci);
771 
772 		kcpuset_atomic_clear(curpmap->pm_cpus, cid);
773 		kcpuset_atomic_clear(curpmap->pm_kernel_cpus, cid);
774 		ci->ci_pmap = pmap;
775 		ci->ci_tlbstate = TLBSTATE_VALID;
776 		kcpuset_atomic_set(pmap->pm_cpus, cid);
777 		kcpuset_atomic_set(pmap->pm_kernel_cpus, cid);
778 		cpu_load_pmap(pmap, curpmap);
779 	}
780 	pmap->pm_ncsw = l->l_ncsw;
781 	*pmap2 = curpmap;
782 	*ptepp = PTE_BASE;
783 #if defined(XEN) && defined(__x86_64__)
784 	KASSERT(ci->ci_normal_pdes[PTP_LEVELS - 2] == L4_BASE);
785 	ci->ci_normal_pdes[PTP_LEVELS - 2] = pmap->pm_pdir;
786 	*pdeppp = ci->ci_normal_pdes;
787 #else /* XEN && __x86_64__ */
788 	*pdeppp = normal_pdes;
789 #endif /* XEN && __x86_64__ */
790 }
791 
792 /*
793  * pmap_unmap_ptes: unlock the PTE mapping of "pmap"
794  */
795 
796 void
797 pmap_unmap_ptes(struct pmap *pmap, struct pmap *pmap2)
798 {
799 	struct cpu_info *ci;
800 	struct pmap *mypmap;
801 
802 	KASSERT(kpreempt_disabled());
803 
804 	/* The kernel's pmap is always accessible. */
805 	if (pmap == pmap_kernel()) {
806 		return;
807 	}
808 
809 	ci = curcpu();
810 #if defined(XEN) && defined(__x86_64__)
811 	/* Reset per-cpu normal_pdes */
812 	KASSERT(ci->ci_normal_pdes[PTP_LEVELS - 2] != L4_BASE);
813 	ci->ci_normal_pdes[PTP_LEVELS - 2] = L4_BASE;
814 #endif /* XEN && __x86_64__ */
815 	/*
816 	 * We cannot tolerate context switches while mapped in.
817 	 * If it is our own pmap all we have to do is unlock.
818 	 */
819 	KASSERT(pmap->pm_ncsw == curlwp->l_ncsw);
820 	mypmap = vm_map_pmap(&curproc->p_vmspace->vm_map);
821 	if (pmap == mypmap) {
822 		mutex_exit(pmap->pm_lock);
823 		return;
824 	}
825 
826 	/*
827 	 * Mark whatever's on the CPU now as lazy and unlock.
828 	 * If the pmap was already installed, we are done.
829 	 */
830 	ci->ci_tlbstate = TLBSTATE_LAZY;
831 	ci->ci_want_pmapload = (mypmap != pmap_kernel());
832 	mutex_exit(pmap->pm_lock);
833 	if (pmap == pmap2) {
834 		return;
835 	}
836 
837 	/*
838 	 * We installed another pmap on the CPU.  Grab a reference to
839 	 * it and leave in place.  Toss the evicted pmap (can block).
840 	 */
841 	pmap_reference(pmap);
842 	pmap_destroy(pmap2);
843 }
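
/*
 * canonical (sketched) usage of the pair above when code needs to poke
 * at another pmap's PTEs:
 *
 *	kpreempt_disable();
 *	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
 *	... read or update ptes[pl1_i(va)] ...
 *	pmap_unmap_ptes(pmap, pmap2);
 *	kpreempt_enable();
 *
 * kpreempt must stay disabled for the whole window, as asserted by
 * both functions.
 */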
844 
845 
846 inline static void
847 pmap_exec_account(struct pmap *pm, vaddr_t va, pt_entry_t opte, pt_entry_t npte)
848 {
849 
850 #if !defined(__x86_64__)
851 	if (curproc == NULL || curproc->p_vmspace == NULL ||
852 	    pm != vm_map_pmap(&curproc->p_vmspace->vm_map))
853 		return;
854 
855 	if ((opte ^ npte) & PG_X)
856 		pmap_update_pg(va);
857 
858 	/*
859 	 * Executability was removed on the last executable change.
860 	 * Reset the code segment to something conservative and
861 	 * let the trap handler deal with setting the right limit.
862 	 * We can't do that because of locking constraints on the vm map.
863 	 */
864 
865 	if ((opte & PG_X) && (npte & PG_X) == 0 && va == pm->pm_hiexec) {
866 		struct trapframe *tf = curlwp->l_md.md_regs;
867 
868 		tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL);
869 		pm->pm_hiexec = I386_MAX_EXE_ADDR;
870 	}
871 #endif /* !defined(__x86_64__) */
872 }
873 
874 #if !defined(__x86_64__)
875 /*
876  * Fixup the code segment to cover all potential executable mappings.
877  * returns 0 if no changes to the code segment were made.
878  */
879 
880 int
881 pmap_exec_fixup(struct vm_map *map, struct trapframe *tf, struct pcb *pcb)
882 {
883 	struct vm_map_entry *ent;
884 	struct pmap *pm = vm_map_pmap(map);
885 	vaddr_t va = 0;
886 
887 	vm_map_lock_read(map);
888 	for (ent = (&map->header)->next; ent != &map->header; ent = ent->next) {
889 
890 		/*
891 		 * This entry has a greater va than the entries before it.
892 		 * We need to make it point to the last page, not past it.
893 		 */
894 
895 		if (ent->protection & VM_PROT_EXECUTE)
896 			va = trunc_page(ent->end) - PAGE_SIZE;
897 	}
898 	vm_map_unlock_read(map);
899 	if (va == pm->pm_hiexec && tf->tf_cs == GSEL(GUCODEBIG_SEL, SEL_UPL))
900 		return (0);
901 
902 	pm->pm_hiexec = va;
903 	if (pm->pm_hiexec > I386_MAX_EXE_ADDR) {
904 		tf->tf_cs = GSEL(GUCODEBIG_SEL, SEL_UPL);
905 	} else {
906 		tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL);
907 		return (0);
908 	}
909 	return (1);
910 }
911 #endif /* !defined(__x86_64__) */
912 
913 void
914 pat_init(struct cpu_info *ci)
915 {
916 	uint64_t pat;
917 
918 	if (!(ci->ci_feat_val[0] & CPUID_PAT))
919 		return;
920 
921 	/* We change WT to WC. Leave all other entries the default values. */
922 	pat = PATENTRY(0, PAT_WB) | PATENTRY(1, PAT_WC) |
923 	      PATENTRY(2, PAT_UCMINUS) | PATENTRY(3, PAT_UC) |
924 	      PATENTRY(4, PAT_WB) | PATENTRY(5, PAT_WC) |
925 	      PATENTRY(6, PAT_UCMINUS) | PATENTRY(7, PAT_UC);
926 
927 	wrmsr(MSR_CR_PAT, pat);
928 	cpu_pat_enabled = true;
929 	aprint_debug_dev(ci->ci_dev, "PAT enabled\n");
930 }
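
/*
 * with the layout programmed above, the PAT MSR ends up holding (a
 * worked example of the PATENTRY() arithmetic):
 *
 *	PATENTRY(0, PAT_WB) | PATENTRY(1, PAT_WC) | ... | PATENTRY(7, PAT_UC)
 *	    == 0x0007010600070106ULL
 *
 * i.e. one byte per entry, low byte first: WB, WC, UC-, UC, repeated.
 */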
931 
932 static pt_entry_t
933 pmap_pat_flags(u_int flags)
934 {
935 	u_int cacheflags = (flags & PMAP_CACHE_MASK);
936 
937 	if (!cpu_pat_enabled) {
938 		switch (cacheflags) {
939 		case PMAP_NOCACHE:
940 		case PMAP_NOCACHE_OVR:
941 			/* results in PGC_UCMINUS on cpus which have
942 			 * the cpuid PAT but PAT "disabled"
943 			 */
944 			return PG_N;
945 		default:
946 			return 0;
947 		}
948 	}
949 
950 	switch (cacheflags) {
951 	case PMAP_NOCACHE:
952 		return PGC_UC;
953 	case PMAP_WRITE_COMBINE:
954 		return PGC_WC;
955 	case PMAP_WRITE_BACK:
956 		return PGC_WB;
957 	case PMAP_NOCACHE_OVR:
958 		return PGC_UCMINUS;
959 	}
960 
961 	return 0;
962 }
963 
964 /*
965  * p m a p   k e n t e r   f u n c t i o n s
966  *
967  * functions to quickly enter/remove pages from the kernel address
968  * space.   pmap_kremove is exported to MI kernel.  we make use of
969  * the recursive PTE mappings.
970  */
971 
972 /*
973  * pmap_kenter_pa: enter a kernel mapping without R/M (pv_entry) tracking
974  *
975  * => no need to lock anything, assume va is already allocated
976  * => should be faster than normal pmap enter function
977  */
978 
979 void
980 pmap_kenter_pa(vaddr_t va, paddr_t pa, vm_prot_t prot, u_int flags)
981 {
982 	pt_entry_t *pte, opte, npte;
983 
984 	KASSERT(!(prot & ~VM_PROT_ALL));
985 
986 	if (va < VM_MIN_KERNEL_ADDRESS)
987 		pte = vtopte(va);
988 	else
989 		pte = kvtopte(va);
990 #ifdef DOM0OPS
991 	if (pa < pmap_pa_start || pa >= pmap_pa_end) {
992 #ifdef DEBUG
993 		printf_nolog("%s: pa 0x%" PRIx64 " for va 0x%" PRIx64
994 		    " outside range\n", __func__, (int64_t)pa, (int64_t)va);
995 #endif /* DEBUG */
996 		npte = pa;
997 	} else
998 #endif /* DOM0OPS */
999 		npte = pmap_pa2pte(pa);
1000 	npte |= protection_codes[prot] | PG_k | PG_V | pmap_pg_g;
1001 	npte |= pmap_pat_flags(flags);
1002 	opte = pmap_pte_testset(pte, npte); /* zap! */
1003 #if defined(DIAGNOSTIC)
1004 	/* XXX For now... */
1005 	if (opte & PG_PS)
1006 		panic("%s: PG_PS", __func__);
1007 #endif
1008 	if ((opte & (PG_V | PG_U)) == (PG_V | PG_U)) {
1009 		/* This should not happen. */
1010 		printf_nolog("%s: mapping already present\n", __func__);
1011 		kpreempt_disable();
1012 		pmap_tlb_shootdown(pmap_kernel(), va, opte, TLBSHOOT_KENTER);
1013 		kpreempt_enable();
1014 	}
1015 }
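
/*
 * example (sketch): entering a device page into an already allocated
 * piece of KVA with write-combining:
 *
 *	pmap_kenter_pa(va, pa, VM_PROT_READ | VM_PROT_WRITE,
 *	    PMAP_WRITE_COMBINE);
 *
 * the mapping is usable immediately.  PMAP_WRITE_COMBINE only has an
 * effect once pat_init() has enabled PAT; otherwise pmap_pat_flags()
 * falls back to the ordinary cacheability bits.
 */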
1016 
1017 void
1018 pmap_emap_enter(vaddr_t va, paddr_t pa, vm_prot_t prot)
1019 {
1020 	pt_entry_t *pte, npte;
1021 
1022 	KASSERT((prot & ~VM_PROT_ALL) == 0);
1023 	pte = (va < VM_MIN_KERNEL_ADDRESS) ? vtopte(va) : kvtopte(va);
1024 
1025 #ifdef DOM0OPS
1026 	if (pa < pmap_pa_start || pa >= pmap_pa_end) {
1027 		npte = pa;
1028 	} else
1029 #endif
1030 		npte = pmap_pa2pte(pa);
1031 
1033 	npte |= protection_codes[prot] | PG_k | PG_V;
1034 	pmap_pte_set(pte, npte);
1035 }
1036 
1037 /*
1038  * pmap_emap_sync: perform TLB flush or pmap load, if it was deferred.
1039  */
1040 void
1041 pmap_emap_sync(bool canload)
1042 {
1043 	struct cpu_info *ci = curcpu();
1044 	struct pmap *pmap;
1045 
1046 	KASSERT(kpreempt_disabled());
1047 	if (__predict_true(ci->ci_want_pmapload && canload)) {
1048 		/*
1049 		 * XXX: Hint for pmap_reactivate(), which may then skip the
1050 		 * TLB flush if the state has not changed.
1051 		 */
1052 		pmap = vm_map_pmap(&curlwp->l_proc->p_vmspace->vm_map);
1053 		if (__predict_false(pmap == ci->ci_pmap)) {
1054 			kcpuset_atomic_clear(pmap->pm_cpus, cpu_index(ci));
1055 		}
1056 		pmap_load();
1057 		KASSERT(ci->ci_want_pmapload == 0);
1058 	} else {
1059 		tlbflush();
1060 	}
1061 
1062 }
1063 
1064 void
1065 pmap_emap_remove(vaddr_t sva, vsize_t len)
1066 {
1067 	pt_entry_t *pte;
1068 	vaddr_t va, eva = sva + len;
1069 
1070 	for (va = sva; va < eva; va += PAGE_SIZE) {
1071 		pte = (va < VM_MIN_KERNEL_ADDRESS) ? vtopte(va) : kvtopte(va);
1072 		pmap_pte_set(pte, 0);
1073 	}
1074 }
1075 
1076 __strict_weak_alias(pmap_kenter_ma, pmap_kenter_pa);
1077 
1078 #if defined(__x86_64__)
1079 /*
1080  * Change protection for a virtual address. Local for a CPU only, don't
1081  * care about TLB shootdowns.
1082  *
1083  * => must be called with preemption disabled
1084  */
1085 void
1086 pmap_changeprot_local(vaddr_t va, vm_prot_t prot)
1087 {
1088 	pt_entry_t *pte, opte, npte;
1089 
1090 	KASSERT(kpreempt_disabled());
1091 
1092 	if (va < VM_MIN_KERNEL_ADDRESS)
1093 		pte = vtopte(va);
1094 	else
1095 		pte = kvtopte(va);
1096 
1097 	npte = opte = *pte;
1098 
1099 	if ((prot & VM_PROT_WRITE) != 0)
1100 		npte |= PG_RW;
1101 	else
1102 		npte &= ~PG_RW;
1103 
1104 	if (opte != npte) {
1105 		pmap_pte_set(pte, npte);
1106 		pmap_pte_flush();
1107 		invlpg(va);
1108 	}
1109 }
1110 #endif /* defined(__x86_64__) */
1111 
1112 /*
1113  * pmap_kremove: remove a kernel mapping(s) without R/M (pv_entry) tracking
1114  *
1115  * => no need to lock anything
1116  * => caller must dispose of any vm_page mapped in the va range
1117  * => note: not an inline function
1118  * => we assume the va is page aligned and the len is a multiple of PAGE_SIZE
1119  * => we assume kernel only unmaps valid addresses and thus don't bother
1120  *    checking the valid bit before doing TLB flushing
1121  * => must be followed by call to pmap_update() before reuse of page
1122  */
1123 
1124 static inline void
1125 pmap_kremove1(vaddr_t sva, vsize_t len, bool localonly)
1126 {
1127 	pt_entry_t *pte, opte;
1128 	vaddr_t va, eva;
1129 
1130 	eva = sva + len;
1131 
1132 	kpreempt_disable();
1133 	for (va = sva; va < eva; va += PAGE_SIZE) {
1134 		pte = kvtopte(va);
1135 		opte = pmap_pte_testset(pte, 0); /* zap! */
1136 		if ((opte & (PG_V | PG_U)) == (PG_V | PG_U) && !localonly) {
1137 			pmap_tlb_shootdown(pmap_kernel(), va, opte,
1138 			    TLBSHOOT_KREMOVE);
1139 		}
1140 		KASSERT((opte & PG_PS) == 0);
1141 		KASSERT((opte & PG_PVLIST) == 0);
1142 	}
1143 	if (localonly) {
1144 		tlbflushg();
1145 	}
1146 	kpreempt_enable();
1147 }
1148 
1149 void
1150 pmap_kremove(vaddr_t sva, vsize_t len)
1151 {
1152 
1153 	pmap_kremove1(sva, len, false);
1154 }
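
/*
 * example (sketch): undoing a pmap_kenter_pa() mapping.  as noted
 * above, the TLB shootdown is deferred, so pmap_update() must run
 * before the underlying page is reused:
 *
 *	pmap_kremove(va, PAGE_SIZE);
 *	pmap_update(pmap_kernel());
 */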
1155 
1156 /*
1157  * pmap_kremove_local: like pmap_kremove(), but only worry about
1158  * TLB invalidations on the current CPU.  this is only intended
1159  * for use while writing kernel crash dumps.
1160  */
1161 
1162 void
1163 pmap_kremove_local(vaddr_t sva, vsize_t len)
1164 {
1165 
1166 	KASSERT(panicstr != NULL);
1167 	pmap_kremove1(sva, len, true);
1168 }
1169 
1170 /*
1171  * p m a p   i n i t   f u n c t i o n s
1172  *
1173  * pmap_bootstrap and pmap_init are called during system startup
1174  * to init the pmap module.   pmap_bootstrap() does a low level
1175  * init just to get things rolling.   pmap_init() finishes the job.
1176  */
1177 
1178 /*
1179  * pmap_bootstrap: get the system in a state where it can run with VM
1180  *	properly enabled (called before main()).   the VM system is
1181  *      fully init'd later...
1182  *
1183  * => on i386, locore.s has already enabled the MMU by allocating
1184  *	a PDP for the kernel, and nkpde PTP's for the kernel.
1185  * => kva_start is the first free virtual address in kernel space
1186  */
1187 
1188 void
1189 pmap_bootstrap(vaddr_t kva_start)
1190 {
1191 	struct pmap *kpm;
1192 	pt_entry_t *pte;
1193 	int i;
1194 	vaddr_t kva;
1195 #ifndef XEN
1196 	pd_entry_t *pde;
1197 	unsigned long p1i;
1198 	vaddr_t kva_end;
1199 #endif
1200 #ifdef __HAVE_DIRECT_MAP
1201 	phys_ram_seg_t *mc;
1202 	long ndmpdp;
1203 	paddr_t lastpa, dmpd, dmpdp, pdp;
1204 	vaddr_t tmpva;
1205 #endif
1206 
1207 	pt_entry_t pg_nx = (cpu_feature[2] & CPUID_NOX ? PG_NX : 0);
1208 
1209 	/*
1210 	 * set up our local static global vars that keep track of the
1211 	 * usage of KVM before kernel_map is set up
1212 	 */
1213 
1214 	virtual_avail = kva_start;		/* first free KVA */
1215 	virtual_end = VM_MAX_KERNEL_ADDRESS;	/* last KVA */
1216 
1217 	/*
1218 	 * set up protection_codes: we need to be able to convert from
1219 	 * a MI protection code (some combo of VM_PROT...) to something
1220 	 * we can jam into a i386 PTE.
1221 	 */
1222 
1223 	protection_codes[VM_PROT_NONE] = pg_nx;			/* --- */
1224 	protection_codes[VM_PROT_EXECUTE] = PG_RO | PG_X;	/* --x */
1225 	protection_codes[VM_PROT_READ] = PG_RO | pg_nx;		/* -r- */
1226 	protection_codes[VM_PROT_READ|VM_PROT_EXECUTE] = PG_RO | PG_X;/* -rx */
1227 	protection_codes[VM_PROT_WRITE] = PG_RW | pg_nx;	/* w-- */
1228 	protection_codes[VM_PROT_WRITE|VM_PROT_EXECUTE] = PG_RW | PG_X;/* w-x */
1229 	protection_codes[VM_PROT_WRITE|VM_PROT_READ] = PG_RW | pg_nx;
1230 								/* wr- */
1231 	protection_codes[VM_PROT_ALL] = PG_RW | PG_X;		/* wrx */
1232 
1233 	/*
1234 	 * now we init the kernel's pmap
1235 	 *
1236 	 * the kernel pmap's pm_obj is not used for much.   however, in
1237 	 * user pmaps the pm_obj contains the list of active PTPs.
1238 	 * the pm_obj currently does not have a pager.   it might be possible
1239 	 * to add a pager that would allow a process to read-only mmap its
1240 	 * own page tables (fast user level vtophys?).   this may or may not
1241 	 * be useful.
1242 	 */
1243 
1244 	kpm = pmap_kernel();
1245 	for (i = 0; i < PTP_LEVELS - 1; i++) {
1246 		mutex_init(&kpm->pm_obj_lock[i], MUTEX_DEFAULT, IPL_NONE);
1247 		uvm_obj_init(&kpm->pm_obj[i], NULL, false, 1);
1248 		uvm_obj_setlock(&kpm->pm_obj[i], &kpm->pm_obj_lock[i]);
1249 		kpm->pm_ptphint[i] = NULL;
1250 	}
1251 	memset(&kpm->pm_list, 0, sizeof(kpm->pm_list));  /* pm_list not used */
1252 
1253 	kpm->pm_pdir = (pd_entry_t *)(PDPpaddr + KERNBASE);
1254 	for (i = 0; i < PDP_SIZE; i++)
1255 		kpm->pm_pdirpa[i] = PDPpaddr + PAGE_SIZE * i;
1256 
1257 	kpm->pm_stats.wired_count = kpm->pm_stats.resident_count =
1258 		x86_btop(kva_start - VM_MIN_KERNEL_ADDRESS);
1259 
1260 	kcpuset_create(&kpm->pm_cpus, true);
1261 	kcpuset_create(&kpm->pm_kernel_cpus, true);
1262 
1263 	/*
1264 	 * the above is just a rough estimate and not critical to the proper
1265 	 * operation of the system.
1266 	 */
1267 
1268 #ifndef XEN
1269 	/*
1270 	 * Begin to enable global TLB entries if they are supported.
1271 	 * The G bit has no effect until the CR4_PGE bit is set in CR4,
1272 	 * which happens in cpu_init(), which is run on each cpu
1273 	 * (and happens later)
1274 	 */
1275 	if (cpu_feature[0] & CPUID_PGE) {
1276 		pmap_pg_g = PG_G;		/* enable software */
1277 
1278 		/* add PG_G attribute to already mapped kernel pages */
1279 		if (KERNBASE == VM_MIN_KERNEL_ADDRESS) {
1280 			kva_end = virtual_avail;
1281 		} else {
1282 			extern vaddr_t eblob, esym;
1283 			kva_end = (vaddr_t)&end;
1284 			if (esym > kva_end)
1285 				kva_end = esym;
1286 			if (eblob > kva_end)
1287 				kva_end = eblob;
1288 			kva_end = roundup(kva_end, PAGE_SIZE);
1289 		}
1290 		for (kva = KERNBASE; kva < kva_end; kva += PAGE_SIZE) {
1291 			p1i = pl1_i(kva);
1292 			if (pmap_valid_entry(PTE_BASE[p1i]))
1293 				PTE_BASE[p1i] |= PG_G;
1294 		}
1295 	}
1296 
1297 	/*
1298 	 * Enable large pages if they are supported.
1299 	 */
1300 	if (cpu_feature[0] & CPUID_PSE) {
1301 		paddr_t pa;
1302 		extern char __rodata_start;
1303 		extern char __data_start;
1304 		extern char __kernel_end;
1305 
1306 		lcr4(rcr4() | CR4_PSE);	/* enable hardware (via %cr4) */
1307 		pmap_largepages = 1;	/* enable software */
1308 
1309 		/*
1310 		 * The TLB must be flushed after enabling large pages
1311 		 * on Pentium CPUs, according to section 3.6.2.2 of
1312 		 * "Intel Architecture Software Developer's Manual,
1313 		 * Volume 3: System Programming".
1314 		 */
1315 		tlbflushg();
1316 
1317 		/*
1318 		 * Now, we remap several kernel segments with large pages. We
1319 		 * cover as many pages as we can.
1320 		 */
1321 
1322 		/* Remap the kernel text using large pages. */
1323 		kva = KERNBASE;
1324 		kva_end = rounddown((vaddr_t)&__rodata_start, NBPD_L1);
1325 		pa = kva - KERNBASE;
1326 		for (/* */; kva + NBPD_L2 <= kva_end; kva += NBPD_L2,
1327 		    pa += NBPD_L2) {
1328 			pde = &L2_BASE[pl2_i(kva)];
1329 			*pde = pa | pmap_pg_g | PG_PS | PG_KR | PG_V;
1330 			tlbflushg();
1331 		}
1332 #if defined(DEBUG)
1333 		aprint_normal("kernel text is mapped with %" PRIuPSIZE " large "
1334 		    "pages and %" PRIuPSIZE " normal pages\n",
1335 		    howmany(kva - KERNBASE, NBPD_L2),
1336 		    howmany((vaddr_t)&__rodata_start - kva, NBPD_L1));
1337 #endif /* defined(DEBUG) */
1338 
1339 		/* Remap the kernel rodata using large pages. */
1340 		kva = roundup((vaddr_t)&__rodata_start, NBPD_L2);
1341 		kva_end = rounddown((vaddr_t)&__data_start, NBPD_L1);
1342 		pa = kva - KERNBASE;
1343 		for (/* */; kva + NBPD_L2 <= kva_end; kva += NBPD_L2,
1344 		    pa += NBPD_L2) {
1345 			pde = &L2_BASE[pl2_i(kva)];
1346 			*pde = pa | pmap_pg_g | PG_PS | pg_nx | PG_KR | PG_V;
1347 			tlbflushg();
1348 		}
1349 
1350 		/* Remap the kernel data+bss using large pages. */
1351 		kva = roundup((vaddr_t)&__data_start, NBPD_L2);
1352 		kva_end = rounddown((vaddr_t)&__kernel_end, NBPD_L1);
1353 		pa = kva - KERNBASE;
1354 		for (/* */; kva + NBPD_L2 <= kva_end; kva += NBPD_L2,
1355 		    pa += NBPD_L2) {
1356 			pde = &L2_BASE[pl2_i(kva)];
1357 			*pde = pa | pmap_pg_g | PG_PS | pg_nx | PG_KW | PG_V;
1358 			tlbflushg();
1359 		}
1360 	}
1361 #endif /* !XEN */
1362 
1363 #ifdef __HAVE_DIRECT_MAP
1364 
1365 	tmpva = (KERNBASE + NKL2_KIMG_ENTRIES * NBPD_L2);
1366 	pte = PTE_BASE + pl1_i(tmpva);
1367 
1368 	/*
1369 	 * Map the direct map RW.  Use 1GB pages if they are available,
1370 	 * otherwise use 2MB pages.  Note that the unused parts of PTPs
1371 	 * must be zeroed out, as they might be accessed due to
1372 	 * speculative execution.  Also, PG_G is not allowed on
1373 	 * non-leaf PTPs.
1374 	 */
1375 
1376 	lastpa = 0;
1377 	for (i = 0; i < mem_cluster_cnt; i++) {
1378 		mc = &mem_clusters[i];
1379 		lastpa = MAX(lastpa, mc->start + mc->size);
1380 	}
1381 
1382 	ndmpdp = (lastpa + NBPD_L3 - 1) >> L3_SHIFT;
1383 	dmpdp = avail_start;	avail_start += PAGE_SIZE;
1384 
1385 	*pte = dmpdp | PG_V | PG_RW | pg_nx;
1386 	pmap_update_pg(tmpva);
1387 	memset((void *)tmpva, 0, PAGE_SIZE);
1388 
1389 	if (cpu_feature[2] & CPUID_P1GB) {
1390 		for (i = 0; i < ndmpdp; i++) {
1391 			pdp = (paddr_t)&(((pd_entry_t *)dmpdp)[i]);
1392 			*pte = (pdp & PG_FRAME) | PG_V | PG_RW | pg_nx;
1393 			pmap_update_pg(tmpva);
1394 
1395 			pde = (pd_entry_t *)(tmpva + (pdp & ~PG_FRAME));
1396 			*pde = ((paddr_t)i << L3_SHIFT) | PG_RW | pg_nx |
1397 			    PG_V | PG_U | PG_PS | PG_G;
1398 		}
1399 	} else {
1400 		dmpd = avail_start;	avail_start += ndmpdp * PAGE_SIZE;
1401 
1402 		for (i = 0; i < ndmpdp; i++) {
1403 			pdp = dmpd + i * PAGE_SIZE;
1404 			*pte = (pdp & PG_FRAME) | PG_V | PG_RW | pg_nx;
1405 			pmap_update_pg(tmpva);
1406 
1407 			memset((void *)tmpva, 0, PAGE_SIZE);
1408 		}
1409 		for (i = 0; i < NPDPG * ndmpdp; i++) {
1410 			pdp = (paddr_t)&(((pd_entry_t *)dmpd)[i]);
1411 			*pte = (pdp & PG_FRAME) | PG_V | PG_RW | pg_nx;
1412 			pmap_update_pg(tmpva);
1413 
1414 			pde = (pd_entry_t *)(tmpva + (pdp & ~PG_FRAME));
1415 			*pde = ((paddr_t)i << L2_SHIFT) | PG_RW | pg_nx |
1416 			    PG_V | PG_U | PG_PS | PG_G;
1417 		}
1418 		for (i = 0; i < ndmpdp; i++) {
1419 			pdp = (paddr_t)&(((pd_entry_t *)dmpdp)[i]);
1420 			*pte = (pdp & PG_FRAME) | PG_V | PG_RW | pg_nx;
1421 			pmap_update_pg((vaddr_t)tmpva);
1422 
1423 			pde = (pd_entry_t *)(tmpva + (pdp & ~PG_FRAME));
1424 			*pde = (dmpd + (i << PAGE_SHIFT)) | PG_RW | pg_nx |
1425 			    PG_V | PG_U;
1426 		}
1427 	}
1428 
1429 	kpm->pm_pdir[PDIR_SLOT_DIRECT] = dmpdp | PG_KW | pg_nx | PG_V | PG_U;
1430 
1431 	tlbflush();
1432 
1433 #else
1434 	if (VM_MIN_KERNEL_ADDRESS != KERNBASE) {
1435 		/*
1436 		 * zero_pte is stuck at the end of mapped space for the kernel
1437 		 * image (disjunct from kva space). This is done so that it
1438 		 * can safely be used in pmap_growkernel (pmap_get_physpage),
1439 		 * when it's called for the first time.
1440 		 * XXXfvdl fix this for MULTIPROCESSOR later.
1441 		 */
1442 #ifdef XEN
1443 		/* early_zerop initialized in xen_pmap_bootstrap() */
1444 #else
1445 		early_zerop = (void *)(KERNBASE + NKL2_KIMG_ENTRIES * NBPD_L2);
1446 #endif
1447 		early_zero_pte = PTE_BASE + pl1_i((vaddr_t)early_zerop);
1448 	}
1449 
1450 	/*
1451 	 * now we allocate the "special" VAs which are used for tmp mappings
1452 	 * by the pmap (and other modules).    we allocate the VAs by advancing
1453 	 * virtual_avail (note that there are no pages mapped at these VAs).
1454 	 * we find the PTE that maps the allocated VA via the linear PTE
1455 	 * mapping.
1456 	 */
1457 
1458 	pte = PTE_BASE + pl1_i(virtual_avail);
1459 
1460 #ifdef MULTIPROCESSOR
1461 	/*
1462 	 * Waste some VA space to avoid false sharing of cache lines
1463 	 * for page table pages: Give each possible CPU a cache line
1464 	 * of PTE's (8) to play with, though we only need 4.  We could
1465 	 * recycle some of this waste by putting the idle stacks here
1466 	 * as well; we could waste less space if we knew the largest
1467 	 * CPU ID beforehand.
1468 	 */
1469 	csrcp = (char *) virtual_avail;  csrc_pte = pte;
1470 
1471 	cdstp = (char *) virtual_avail+PAGE_SIZE;  cdst_pte = pte+1;
1472 
1473 	zerop = (char *) virtual_avail+PAGE_SIZE*2;  zero_pte = pte+2;
1474 
1475 	ptpp = (char *) virtual_avail+PAGE_SIZE*3;  ptp_pte = pte+3;
1476 
1477 	virtual_avail += PAGE_SIZE * maxcpus * NPTECL;
1478 	pte += maxcpus * NPTECL;
1479 #else
1480 	csrcp = (void *) virtual_avail;  csrc_pte = pte;	/* allocate */
1481 	virtual_avail += PAGE_SIZE; pte++;			/* advance */
1482 
1483 	cdstp = (void *) virtual_avail;  cdst_pte = pte;
1484 	virtual_avail += PAGE_SIZE; pte++;
1485 
1486 	zerop = (void *) virtual_avail;  zero_pte = pte;
1487 	virtual_avail += PAGE_SIZE; pte++;
1488 
1489 	ptpp = (void *) virtual_avail;  ptp_pte = pte;
1490 	virtual_avail += PAGE_SIZE; pte++;
1491 #endif
1492 
1493 	if (VM_MIN_KERNEL_ADDRESS == KERNBASE) {
1494 		early_zerop = zerop;
1495 		early_zero_pte = zero_pte;
1496 	}
1497 #endif
1498 
1499 	/*
1500 	 * Nothing after this point actually needs pte.
1501 	 */
1502 	pte = (void *)0xdeadbeef;
1503 
1504 #ifdef XEN
1505 #ifdef __x86_64__
1506 	/*
1507 	 * We want a dummy page directory for Xen:
1508 	 * when we deactivate a pmap, Xen will still consider it active.
1509 	 * So we set the user PGD to this one to lift all protection on
1510 	 * the now-inactive set of page tables.
1511 	 */
1512 	xen_dummy_user_pgd = avail_start;
1513 	avail_start += PAGE_SIZE;
1514 
1515 	/* Zero-fill it; the fewer checks Xen has to make, the better */
1516 	memset((void *) (xen_dummy_user_pgd + KERNBASE), 0, PAGE_SIZE);
1517 	/* Mark read-only */
1518 	HYPERVISOR_update_va_mapping(xen_dummy_user_pgd + KERNBASE,
1519 	    pmap_pa2pte(xen_dummy_user_pgd) | PG_u | PG_V, UVMF_INVLPG);
1520 	/* Pin as L4 */
1521 	xpq_queue_pin_l4_table(xpmap_ptom_masked(xen_dummy_user_pgd));
1522 #endif /* __x86_64__ */
1523 	idt_vaddr = virtual_avail;                      /* don't need pte */
1524 	idt_paddr = avail_start;                        /* steal a page */
1525 	/*
1526 	 * Xen requires one more page, as we can't store the
1527 	 * GDT and LDT on the same page
1528 	 */
1529 	virtual_avail += 3 * PAGE_SIZE;
1530 	avail_start += 3 * PAGE_SIZE;
1531 #else /* XEN */
1532 	idt_vaddr = virtual_avail;			/* don't need pte */
1533 	idt_paddr = avail_start;			/* steal a page */
1534 #if defined(__x86_64__)
1535 	virtual_avail += 2 * PAGE_SIZE;
1536 	avail_start += 2 * PAGE_SIZE;
1537 #else /* defined(__x86_64__) */
1538 	virtual_avail += PAGE_SIZE;
1539 	avail_start += PAGE_SIZE;
1540 	/* pentium f00f bug stuff */
1541 	pentium_idt_vaddr = virtual_avail;		/* don't need pte */
1542 	virtual_avail += PAGE_SIZE;
1543 #endif /* defined(__x86_64__) */
1544 #endif /* XEN */
1545 
1546 #ifdef _LP64
1547 	/*
1548 	 * Grab a page below 4G for things that need it (i.e.
1549 	 * having an initial %cr3 for the MP trampoline).
1550 	 */
1551 	lo32_vaddr = virtual_avail;
1552 	virtual_avail += PAGE_SIZE;
1553 	lo32_paddr = avail_start;
1554 	avail_start += PAGE_SIZE;
1555 #endif
1556 
1557 	/*
1558 	 * now we reserve some VM for mapping pages when doing a crash dump
1559 	 */
1560 
1561 	virtual_avail = reserve_dumppages(virtual_avail);
1562 
1563 	/*
1564 	 * init the static-global locks and global lists.
1565 	 *
1566 	 * => pventry::pvh_lock (initialized elsewhere) must also be
1567 	 * => pventry::pvh_lock (initialized elsewhere) must also be
1568 	 *      a spin lock at IPL_VM to prevent deadlock, and is never
1569 	 *	taken from interrupt context.
1570 
1571 	mutex_init(&pmaps_lock, MUTEX_DEFAULT, IPL_NONE);
1572 	LIST_INIT(&pmaps);
1573 
1574 	/*
1575 	 * ensure the TLB is sync'd with reality by flushing it...
1576 	 */
1577 
1578 	tlbflushg();
1579 
1580 	/*
1581 	 * calculate pmap_maxkvaddr from nkptp[].
1582 	 */
1583 
1584 	kva = VM_MIN_KERNEL_ADDRESS;
1585 	for (i = PTP_LEVELS - 1; i >= 1; i--) {
1586 		kva += nkptp[i] * nbpd[i];
1587 	}
1588 	pmap_maxkvaddr = kva;
1589 }
1590 
1591 #if defined(__x86_64__)
1592 /*
1593  * Pre-allocate PTPs for low memory, so that 1:1 mappings for various
1594  * trampoline code can be entered.
1595  */
1596 void
1597 pmap_prealloc_lowmem_ptps(void)
1598 {
1599 	int level;
1600 	paddr_t newp;
1601 	pd_entry_t *pdes;
1602 
1603 	const pd_entry_t pteflags = PG_k | PG_V | PG_RW;
1604 
1605 	pdes = pmap_kernel()->pm_pdir;
1606 	level = PTP_LEVELS;
1607 	for (;;) {
1608 		newp = avail_start;
1609 		avail_start += PAGE_SIZE;
1610 #ifdef __HAVE_DIRECT_MAP
1611 		memset((void *)PMAP_DIRECT_MAP(newp), 0, PAGE_SIZE);
1612 #else
1613 		pmap_pte_set(early_zero_pte, pmap_pa2pte(newp) | pteflags);
1614 		pmap_pte_flush();
1615 		pmap_update_pg((vaddr_t)early_zerop);
1616 		memset(early_zerop, 0, PAGE_SIZE);
1617 #endif
1618 
1619 #ifdef XEN
1620 		/* Mark R/O before installing */
1621 		HYPERVISOR_update_va_mapping ((vaddr_t)early_zerop,
1622 		    xpmap_ptom_masked(newp) | PG_u | PG_V, UVMF_INVLPG);
1623 		if (newp < (NKL2_KIMG_ENTRIES * NBPD_L2))
1624 			HYPERVISOR_update_va_mapping (newp + KERNBASE,
1625 			    xpmap_ptom_masked(newp) | PG_u | PG_V, UVMF_INVLPG);
1626 
1627 
1628 		if (level == PTP_LEVELS) { /* Top level pde is per-cpu */
1629 			pd_entry_t *kpm_pdir;
1630 			/* Reach it via recursive mapping */
1631 			kpm_pdir = normal_pdes[PTP_LEVELS - 2];
1632 
1633 			/* Set it as usual. We can't defer this
1634 			 * outside the loop since recursive
1635 			 * pte entries won't be accessible during
1636 			 * further iterations at lower levels
1637 			 * otherwise.
1638 			 */
1639 			pmap_pte_set(&kpm_pdir[pl_i(0, PTP_LEVELS)],
1640 			    pmap_pa2pte(newp) | pteflags);
1641 		}
1642 
1643 #endif /* XEN */
1644 		pmap_pte_set(&pdes[pl_i(0, level)],
1645 		    pmap_pa2pte(newp) | pteflags);
1646 
1647 		pmap_pte_flush();
1648 
1649 		level--;
1650 		if (level <= 1)
1651 			break;
1652 		pdes = normal_pdes[level - 2];
1653 	}
1654 }
1655 #endif /* defined(__x86_64__) */
1656 
1657 /*
1658  * pmap_init: called from uvm_init, our job is to get the pmap
1659  * system ready to manage mappings...
1660  */
1661 
1662 void
1663 pmap_init(void)
1664 {
1665 	int i, flags;
1666 
1667 	for (i = 0; i < PV_HASH_SIZE; i++) {
1668 		SLIST_INIT(&pv_hash_heads[i].hh_list);
1669 	}
1670 	for (i = 0; i < PV_HASH_LOCK_CNT; i++) {
1671 		mutex_init(&pv_hash_locks[i].lock, MUTEX_NODEBUG, IPL_VM);
1672 	}
1673 
1674 	/*
1675 	 * initialize caches.
1676 	 */
1677 
1678 	pool_cache_bootstrap(&pmap_cache, sizeof(struct pmap), 0, 0, 0,
1679 	    "pmappl", NULL, IPL_NONE, NULL, NULL, NULL);
1680 
1681 #ifdef XEN
1682 	/*
1683 	 * pool_cache(9) should not touch cached objects, since they
1684 	 * are pinned on xen and R/O for the domU
1685 	 */
1686 	flags = PR_NOTOUCH;
1687 #else /* XEN */
1688 	flags = 0;
1689 #endif /* XEN */
1690 #ifdef PAE
1691 	pool_cache_bootstrap(&pmap_pdp_cache, PAGE_SIZE * PDP_SIZE, 0, 0, flags,
1692 	    "pdppl", &pmap_pdp_allocator, IPL_NONE,
1693 	    pmap_pdp_ctor, pmap_pdp_dtor, NULL);
1694 #else /* PAE */
1695 	pool_cache_bootstrap(&pmap_pdp_cache, PAGE_SIZE, 0, 0, flags,
1696 	    "pdppl", NULL, IPL_NONE, pmap_pdp_ctor, pmap_pdp_dtor, NULL);
1697 #endif /* PAE */
1698 	pool_cache_bootstrap(&pmap_pv_cache, sizeof(struct pv_entry), 0, 0,
1699 	    PR_LARGECACHE, "pvpl", &pool_allocator_kmem, IPL_NONE, NULL,
1700 	    NULL, NULL);
1701 
1702 	pmap_tlb_init();
1703 
1704 	/* XXX: Done here for the boot CPU, since cpu_hatch() only runs on secondary CPUs. */
1705 	pmap_tlb_cpu_init(curcpu());
1706 
1707 	evcnt_attach_dynamic(&pmap_iobmp_evcnt, EVCNT_TYPE_MISC,
1708 	    NULL, "x86", "io bitmap copy");
1709 	evcnt_attach_dynamic(&pmap_ldt_evcnt, EVCNT_TYPE_MISC,
1710 	    NULL, "x86", "ldt sync");
1711 
1712 	/*
1713 	 * done: pmap module is up (and ready for business)
1714 	 */
1715 
1716 	pmap_initialized = true;
1717 }
1718 
1719 /*
1720  * pmap_cpu_init_late: perform late per-CPU initialization.
1721  */
1722 
1723 #ifndef XEN
1724 void
1725 pmap_cpu_init_late(struct cpu_info *ci)
1726 {
1727 	/*
1728 	 * The BP already has its own PD page, allocated during early
1729 	 * MD startup.
1730 	 */
1731 	if (ci == &cpu_info_primary)
1732 		return;
1733 
1734 #ifdef PAE
1735 	cpu_alloc_l3_page(ci);
1736 #endif
1737 }
1738 #endif
1739 
1740 /*
1741  * p v _ e n t r y   f u n c t i o n s
1742  */
1743 
1744 /*
1745  * pmap_free_pvs: free a list of pv_entries
1746  */
1747 
1748 static void
1749 pmap_free_pvs(struct pv_entry *pve)
1750 {
1751 	struct pv_entry *next;
1752 
1753 	for ( /* null */ ; pve != NULL ; pve = next) {
1754 		next = pve->pve_next;
1755 		pool_cache_put(&pmap_pv_cache, pve);
1756 	}
1757 }
1758 
1759 /*
1760  * main pv_entry manipulation functions:
1761  *   pmap_enter_pv: enter a mapping onto a pv_head list
1762  *   pmap_remove_pv: remove a mapping from a pv_head list
1763  *
1764  * NOTE: Both pmap_enter_pv and pmap_remove_pv expect the caller to lock
1765  *       the pvh before calling
1766  */
1767 
1768 /*
1769  * insert_pv: a helper of pmap_enter_pv
1770  */
1771 
1772 static void
1773 insert_pv(struct pmap_page *pp, struct pv_entry *pve)
1774 {
1775 	struct pv_hash_head *hh;
1776 	kmutex_t *lock;
1777 	u_int hash;
1778 
1779 	hash = pvhash_hash(pve->pve_pte.pte_ptp, pve->pve_pte.pte_va);
1780 	lock = pvhash_lock(hash);
1781 	hh = pvhash_head(hash);
1782 	mutex_spin_enter(lock);
1783 	SLIST_INSERT_HEAD(&hh->hh_list, pve, pve_hash);
1784 	mutex_spin_exit(lock);
1785 
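	/*
	 * The per-page list is covered by the caller's lock on the page
	 * (see the note above pmap_enter_pv); only the global pv hash
	 * needs the spin lock.
	 */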
1786 	LIST_INSERT_HEAD(&pp->pp_head.pvh_list, pve, pve_list);
1787 }
1788 
1789 /*
1790  * pmap_enter_pv: enter a mapping onto a pv_head list
1791  *
1792  * => caller should adjust ptp's wire_count before calling
1793  */
1794 
1795 static struct pv_entry *
1796 pmap_enter_pv(struct pmap_page *pp,
1797 	      struct pv_entry *pve,	/* preallocated pve for us to use */
1798 	      struct pv_entry **sparepve,
1799 	      struct vm_page *ptp,
1800 	      vaddr_t va)
1801 {
1802 
1803 	KASSERT(ptp == NULL || ptp->wire_count >= 2);
1804 	KASSERT(ptp == NULL || ptp->uobject != NULL);
1805 	KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset);
1806 
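	/*
	 * If the page has no mappings yet, store this one directly in the
	 * pmap_page (PP_EMBEDDED) and hand the preallocated pve back to
	 * the caller.  Otherwise, spill the embedded entry onto the pv
	 * list using the spare pve first, then insert the new entry.
	 */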
1807 	if ((pp->pp_flags & PP_EMBEDDED) == 0) {
1808 		if (LIST_EMPTY(&pp->pp_head.pvh_list)) {
1809 			pp->pp_flags |= PP_EMBEDDED;
1810 			pp->pp_pte.pte_ptp = ptp;
1811 			pp->pp_pte.pte_va = va;
1812 
1813 			return pve;
1814 		}
1815 	} else {
1816 		struct pv_entry *pve2;
1817 
1818 		pve2 = *sparepve;
1819 		*sparepve = NULL;
1820 
1821 		pve2->pve_pte = pp->pp_pte;
1822 		pp->pp_flags &= ~PP_EMBEDDED;
1823 		LIST_INIT(&pp->pp_head.pvh_list);
1824 		insert_pv(pp, pve2);
1825 	}
1826 
1827 	pve->pve_pte.pte_ptp = ptp;
1828 	pve->pve_pte.pte_va = va;
1829 	insert_pv(pp, pve);
1830 
1831 	return NULL;
1832 }
1833 
1834 /*
1835  * pmap_remove_pv: try to remove a mapping from a pv_list
1836  *
1837  * => caller should adjust ptp's wire_count and free PTP if needed
1838  * => we return the removed pve
1839  */
1840 
1841 static struct pv_entry *
1842 pmap_remove_pv(struct pmap_page *pp, struct vm_page *ptp, vaddr_t va)
1843 {
1844 	struct pv_hash_head *hh;
1845 	struct pv_entry *pve;
1846 	kmutex_t *lock;
1847 	u_int hash;
1848 
1849 	KASSERT(ptp == NULL || ptp->uobject != NULL);
1850 	KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset);
1851 
1852 	if ((pp->pp_flags & PP_EMBEDDED) != 0) {
1853 		KASSERT(pp->pp_pte.pte_ptp == ptp);
1854 		KASSERT(pp->pp_pte.pte_va == va);
1855 
1856 		pp->pp_flags &= ~PP_EMBEDDED;
1857 		LIST_INIT(&pp->pp_head.pvh_list);
1858 
1859 		return NULL;
1860 	}
1861 
1862 	hash = pvhash_hash(ptp, va);
1863 	lock = pvhash_lock(hash);
1864 	hh = pvhash_head(hash);
1865 	mutex_spin_enter(lock);
1866 	pve = pvhash_remove(hh, ptp, va);
1867 	mutex_spin_exit(lock);
1868 
1869 	LIST_REMOVE(pve, pve_list);
1870 
1871 	return pve;
1872 }
1873 
1874 /*
1875  * p t p   f u n c t i o n s
1876  */
1877 
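/*
 * pmap_find_ptp: look up the PTP that covers va at the given level.
 * The cached hint (pm_ptphint) is used when its physical address matches;
 * otherwise fall back to a lookup in the pmap's uvm_object.
 */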
1878 static inline struct vm_page *
1879 pmap_find_ptp(struct pmap *pmap, vaddr_t va, paddr_t pa, int level)
1880 {
1881 	int lidx = level - 1;
1882 	struct vm_page *pg;
1883 
1884 	KASSERT(mutex_owned(pmap->pm_lock));
1885 
1886 	if (pa != (paddr_t)-1 && pmap->pm_ptphint[lidx] &&
1887 	    pa == VM_PAGE_TO_PHYS(pmap->pm_ptphint[lidx])) {
1888 		return (pmap->pm_ptphint[lidx]);
1889 	}
1890 	PMAP_SUBOBJ_LOCK(pmap, lidx);
1891 	pg = uvm_pagelookup(&pmap->pm_obj[lidx], ptp_va2o(va, level));
1892 	PMAP_SUBOBJ_UNLOCK(pmap, lidx);
1893 
1894 	KASSERT(pg == NULL || pg->wire_count >= 1);
1895 	return pg;
1896 }
1897 
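/*
 * pmap_freepage: detach a now-empty PTP from the pmap and queue it on the
 * current LWP's deferred-free list (md_gc_ptp); the page itself is freed
 * later via pmap_free_ptps().
 */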
1898 static inline void
1899 pmap_freepage(struct pmap *pmap, struct vm_page *ptp, int level)
1900 {
1901 	lwp_t *l;
1902 	int lidx;
1903 	struct uvm_object *obj;
1904 
1905 	KASSERT(ptp->wire_count == 1);
1906 
1907 	lidx = level - 1;
1908 
1909 	obj = &pmap->pm_obj[lidx];
1910 	pmap_stats_update(pmap, -1, 0);
1911 	if (lidx != 0)
1912 		mutex_enter(obj->vmobjlock);
1913 	if (pmap->pm_ptphint[lidx] == ptp)
1914 		pmap->pm_ptphint[lidx] = TAILQ_FIRST(&obj->memq);
1915 	ptp->wire_count = 0;
1916 	uvm_pagerealloc(ptp, NULL, 0);
1917 	l = curlwp;
1918 	KASSERT((l->l_pflag & LP_INTR) == 0);
1919 	VM_PAGE_TO_PP(ptp)->pp_link = l->l_md.md_gc_ptp;
1920 	l->l_md.md_gc_ptp = ptp;
1921 	if (lidx != 0)
1922 		mutex_exit(obj->vmobjlock);
1923 }
1924 
1925 static void
1926 pmap_free_ptp(struct pmap *pmap, struct vm_page *ptp, vaddr_t va,
1927 	      pt_entry_t *ptes, pd_entry_t * const *pdes)
1928 {
1929 	unsigned long index;
1930 	int level;
1931 	vaddr_t invaladdr;
1932 	pd_entry_t opde;
1933 
1934 	KASSERT(pmap != pmap_kernel());
1935 	KASSERT(mutex_owned(pmap->pm_lock));
1936 	KASSERT(kpreempt_disabled());
1937 
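	/*
	 * Walk up the tree from the L1 PTP: clear the PDE that points at
	 * the PTP being freed, shoot down the stale recursive-mapping TLB
	 * entry, free the page, and continue with the parent PTP only if
	 * its wire count has dropped to 1.
	 */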
1938 	level = 1;
1939 	do {
1940 		index = pl_i(va, level + 1);
1941 		opde = pmap_pte_testset(&pdes[level - 1][index], 0);
1942 #if defined(XEN)
1943 #  if defined(__x86_64__)
1944 		/*
1945 		 * If ptp is a L3 currently mapped in kernel space,
1946 		 * on any cpu, clear it before freeing
1947 		 */
1948 		if (level == PTP_LEVELS - 1) {
1949 			/*
1950 			 * Update the per-cpu PD on all cpus the current
1951 			 * pmap is active on
1952 			 */
1953 			xen_kpm_sync(pmap, index);
1954 		}
1955 #  endif /*__x86_64__ */
1956 		invaladdr = level == 1 ? (vaddr_t)ptes :
1957 		    (vaddr_t)pdes[level - 2];
1958 		pmap_tlb_shootdown(pmap, invaladdr + index * PAGE_SIZE,
1959 		    opde, TLBSHOOT_FREE_PTP1);
1960 		pmap_tlb_shootnow();
1961 #else	/* XEN */
1962 		invaladdr = level == 1 ? (vaddr_t)ptes :
1963 		    (vaddr_t)pdes[level - 2];
1964 		pmap_tlb_shootdown(pmap, invaladdr + index * PAGE_SIZE,
1965 		    opde, TLBSHOOT_FREE_PTP1);
1966 #endif	/* XEN */
1967 		pmap_freepage(pmap, ptp, level);
1968 		if (level < PTP_LEVELS - 1) {
1969 			ptp = pmap_find_ptp(pmap, va, (paddr_t)-1, level + 1);
1970 			ptp->wire_count--;
1971 			if (ptp->wire_count > 1)
1972 				break;
1973 		}
1974 	} while (++level < PTP_LEVELS);
1975 	pmap_pte_flush();
1976 }
1977 
1978 /*
1979  * pmap_get_ptp: get a PTP (if there isn't one, allocate a new one)
1980  *
1981  * => pmap should NOT be pmap_kernel()
1982  * => pmap should be locked
1983  * => preemption should be disabled
1984  */
1985 
1986 static struct vm_page *
1987 pmap_get_ptp(struct pmap *pmap, vaddr_t va, pd_entry_t * const *pdes)
1988 {
1989 	struct vm_page *ptp, *pptp;
1990 	int i;
1991 	unsigned long index;
1992 	pd_entry_t *pva;
1993 	paddr_t ppa, pa;
1994 	struct uvm_object *obj;
1995 
1996 	KASSERT(pmap != pmap_kernel());
1997 	KASSERT(mutex_owned(pmap->pm_lock));
1998 	KASSERT(kpreempt_disabled());
1999 
2000 	ptp = NULL;
2001 	pa = (paddr_t)-1;
2002 
2003 	/*
2004 	 * Loop through all page table levels seeing if we need to
2005 	 * add a new page to that level.
2006 	 */
2007 	for (i = PTP_LEVELS; i > 1; i--) {
2008 		/*
2009 		 * Save values from previous round.
2010 		 */
2011 		pptp = ptp;
2012 		ppa = pa;
2013 
2014 		index = pl_i(va, i);
2015 		pva = pdes[i - 2];
2016 
2017 		if (pmap_valid_entry(pva[index])) {
2018 			ppa = pmap_pte2pa(pva[index]);
2019 			ptp = NULL;
2020 			continue;
2021 		}
2022 
2023 		obj = &pmap->pm_obj[i-2];
2024 		PMAP_SUBOBJ_LOCK(pmap, i - 2);
2025 		ptp = uvm_pagealloc(obj, ptp_va2o(va, i - 1), NULL,
2026 		    UVM_PGA_USERESERVE|UVM_PGA_ZERO);
2027 		PMAP_SUBOBJ_UNLOCK(pmap, i - 2);
2028 
2029 		if (ptp == NULL)
2030 			return NULL;
2031 
2032 		ptp->flags &= ~PG_BUSY; /* never busy */
2033 		ptp->wire_count = 1;
2034 		pmap->pm_ptphint[i - 2] = ptp;
2035 		pa = VM_PAGE_TO_PHYS(ptp);
2036 		pmap_pte_set(&pva[index], (pd_entry_t)
2037 		        (pmap_pa2pte(pa) | PG_u | PG_RW | PG_V));
2038 #if defined(XEN) && defined(__x86_64__)
2039 		if (i == PTP_LEVELS) {
2040 			/*
2041 			 * Update the per-cpu PD on all cpus the current
2042 			 * pmap is active on
2043 			 */
2044 			xen_kpm_sync(pmap, index);
2045 		}
2046 #endif
2047 		pmap_pte_flush();
2048 		pmap_stats_update(pmap, 1, 0);
2049 		/*
2050 		 * If we're not in the top level, increase the
2051 		 * wire count of the parent page.
2052 		 */
2053 		if (i < PTP_LEVELS) {
2054 			if (pptp == NULL) {
2055 				pptp = pmap_find_ptp(pmap, va, ppa, i);
2056 				KASSERT(pptp != NULL);
2057 			}
2058 			pptp->wire_count++;
2059 		}
2060 	}
2061 
2062 	/*
2063 	 * PTP is not NULL if we just allocated a new PTP.  If it is
2064 	 * still NULL, we must look up the existing one.
2065 	 */
2066 	if (ptp == NULL) {
2067 		ptp = pmap_find_ptp(pmap, va, ppa, 1);
2068 		KASSERTMSG(ptp != NULL, "pmap_get_ptp: va %" PRIxVADDR
2069 		    " ppa %" PRIxPADDR "\n", va, ppa);
2070 	}
2071 
2072 	pmap->pm_ptphint[0] = ptp;
2073 	return ptp;
2074 }
2075 
2076 /*
2077  * p m a p  l i f e c y c l e   f u n c t i o n s
2078  */
2079 
2080 /*
2081  * pmap_pdp_ctor: constructor for the PDP cache.
2082  */
2083 static int
2084 pmap_pdp_ctor(void *arg, void *v, int flags)
2085 {
2086 	pd_entry_t *pdir = v;
2087 	paddr_t pdirpa = 0;	/* XXX: GCC */
2088 	vaddr_t object;
2089 	int i;
2090 
2091 #if !defined(XEN) || !defined(__x86_64__)
2092 	int npde;
2093 #endif
2094 #ifdef XEN
2095 	int s;
2096 #endif
2097 
2098 	/*
2099 	 * NOTE: The `pmaps_lock' is held when the PDP is allocated.
2100 	 */
2101 
2102 #if defined(XEN) && defined(__x86_64__)
2103 	/* fetch the physical address of the page directory. */
2104 	(void) pmap_extract(pmap_kernel(), (vaddr_t) pdir, &pdirpa);
2105 
2106 	/* zero init area */
2107 	memset(pdir, 0, PAGE_SIZE); /* Xen wants a clean page */
2108 	/*
2109 	 * this pdir will NEVER be active in kernel mode
2110 	 * so mark recursive entry invalid
2111 	 */
2112 	pdir[PDIR_SLOT_PTE] = pmap_pa2pte(pdirpa) | PG_u;
2113 	/*
2114 	 * PDP constructed this way won't be for kernel,
2115 	 * hence we don't put kernel mappings on Xen.
2116 	 * But we need to make pmap_create() happy, so put a dummy (without
2117 	 * PG_V) value at the right place.
2118 	 */
2119 	pdir[PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1] - 1] =
2120 	     (pd_entry_t)-1 & PG_FRAME;
2121 #else /* XEN && __x86_64__*/
2122 	/* zero init area */
2123 	memset(pdir, 0, PDIR_SLOT_PTE * sizeof(pd_entry_t));
2124 
2125 	object = (vaddr_t)v;
2126 	for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) {
2127 		/* fetch the physical address of the page directory. */
2128 		(void) pmap_extract(pmap_kernel(), object, &pdirpa);
2129 		/* put in recursive PDE to map the PTEs */
2130 		pdir[PDIR_SLOT_PTE + i] = pmap_pa2pte(pdirpa) | PG_V;
2131 #ifndef XEN
2132 		pdir[PDIR_SLOT_PTE + i] |= PG_KW;
2133 #endif
2134 	}
2135 
2136 	/* copy kernel's PDE */
2137 	npde = nkptp[PTP_LEVELS - 1];
2138 
2139 	memcpy(&pdir[PDIR_SLOT_KERN], &PDP_BASE[PDIR_SLOT_KERN],
2140 	    npde * sizeof(pd_entry_t));
2141 
2142 	/* zero the rest */
2143 	memset(&pdir[PDIR_SLOT_KERN + npde], 0, (PAGE_SIZE * PDP_SIZE) -
2144 	    (PDIR_SLOT_KERN + npde) * sizeof(pd_entry_t));
2145 
2146 	if (VM_MIN_KERNEL_ADDRESS != KERNBASE) {
2147 		int idx = pl_i(KERNBASE, PTP_LEVELS);
2148 
2149 		pdir[idx] = PDP_BASE[idx];
2150 	}
2151 
2152 #ifdef __HAVE_DIRECT_MAP
2153 	pdir[PDIR_SLOT_DIRECT] = PDP_BASE[PDIR_SLOT_DIRECT];
2154 #endif
2155 
2156 #endif /* XEN  && __x86_64__*/
2157 #ifdef XEN
2158 	s = splvm();
2159 	object = (vaddr_t)v;
2160 	pmap_protect(pmap_kernel(), object, object + (PAGE_SIZE * PDP_SIZE),
2161 	    VM_PROT_READ);
2162 	pmap_update(pmap_kernel());
2163 	for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) {
2164 		/*
2165 		 * Pin as an L2/L4 page; we have to do the page with
2166 		 * the PDIR_SLOT_PTE entries last.
2167 		 */
2168 #ifdef PAE
2169 		if (i == l2tol3(PDIR_SLOT_PTE))
2170 			continue;
2171 #endif
2172 
2173 		(void) pmap_extract(pmap_kernel(), object, &pdirpa);
2174 #ifdef __x86_64__
2175 		xpq_queue_pin_l4_table(xpmap_ptom_masked(pdirpa));
2176 #else
2177 		xpq_queue_pin_l2_table(xpmap_ptom_masked(pdirpa));
2178 #endif
2179 	}
2180 #ifdef PAE
2181 	object = ((vaddr_t)pdir) + PAGE_SIZE  * l2tol3(PDIR_SLOT_PTE);
2182 	(void)pmap_extract(pmap_kernel(), object, &pdirpa);
2183 	xpq_queue_pin_l2_table(xpmap_ptom_masked(pdirpa));
2184 #endif
2185 	splx(s);
2186 #endif /* XEN */
2187 
2188 	return (0);
2189 }
2190 
2191 /*
2192  * pmap_pdp_dtor: destructor for the PDP cache.
2193  */
2194 
2195 static void
2196 pmap_pdp_dtor(void *arg, void *v)
2197 {
2198 #ifdef XEN
2199 	paddr_t pdirpa = 0;	/* XXX: GCC */
2200 	vaddr_t object = (vaddr_t)v;
2201 	int i;
2202 	int s = splvm();
2203 	pt_entry_t *pte;
2204 
2205 	for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) {
2206 		/* fetch the physical address of the page directory. */
2207 		(void) pmap_extract(pmap_kernel(), object, &pdirpa);
2208 		/* unpin page table */
2209 		xpq_queue_unpin_table(xpmap_ptom_masked(pdirpa));
2210 	}
2211 	object = (vaddr_t)v;
2212 	for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) {
2213 		/* Set page RW again */
2214 		pte = kvtopte(object);
2215 		pmap_pte_set(pte, *pte | PG_RW);
2216 		xen_bcast_invlpg((vaddr_t)object);
2217 	}
2218 	splx(s);
2219 #endif  /* XEN */
2220 }
2221 
2222 #ifdef PAE
2223 
2224 /* pmap_pdp_alloc: Allocate a page for the pdp memory pool. */
2225 
2226 static void *
2227 pmap_pdp_alloc(struct pool *pp, int flags)
2228 {
2229 	return (void *)uvm_km_alloc(kernel_map,
2230 	    PAGE_SIZE * PDP_SIZE, PAGE_SIZE * PDP_SIZE,
2231 	    ((flags & PR_WAITOK) ? 0 : UVM_KMF_NOWAIT | UVM_KMF_TRYLOCK)
2232 	    | UVM_KMF_WIRED);
2233 }
2234 
2235 /*
2236  * pmap_pdp_free: free a PDP
2237  */
2238 
2239 static void
2240 pmap_pdp_free(struct pool *pp, void *v)
2241 {
2242 	uvm_km_free(kernel_map, (vaddr_t)v, PAGE_SIZE * PDP_SIZE,
2243 	    UVM_KMF_WIRED);
2244 }
2245 #endif /* PAE */
2246 
2247 /*
2248  * pmap_create: create a pmap object.
2249  */
2250 struct pmap *
2251 pmap_create(void)
2252 {
2253 	struct pmap *pmap;
2254 	int i;
2255 
2256 	pmap = pool_cache_get(&pmap_cache, PR_WAITOK);
2257 
2258 	/* init uvm_object */
2259 	for (i = 0; i < PTP_LEVELS - 1; i++) {
2260 		mutex_init(&pmap->pm_obj_lock[i], MUTEX_DEFAULT, IPL_NONE);
2261 		uvm_obj_init(&pmap->pm_obj[i], NULL, false, 1);
2262 		uvm_obj_setlock(&pmap->pm_obj[i], &pmap->pm_obj_lock[i]);
2263 		pmap->pm_ptphint[i] = NULL;
2264 	}
2265 	pmap->pm_stats.wired_count = 0;
2266 	/* count the PDP allocd below */
2267 	pmap->pm_stats.resident_count = PDP_SIZE;
2268 #if !defined(__x86_64__)
2269 	pmap->pm_hiexec = 0;
2270 #endif /* !defined(__x86_64__) */
2271 	pmap->pm_flags = 0;
2272 	pmap->pm_gc_ptp = NULL;
2273 
2274 	kcpuset_create(&pmap->pm_cpus, true);
2275 	kcpuset_create(&pmap->pm_kernel_cpus, true);
2276 #ifdef XEN
2277 	kcpuset_create(&pmap->pm_xen_ptp_cpus, true);
2278 #endif
2279 	/* init the LDT */
2280 	pmap->pm_ldt = NULL;
2281 	pmap->pm_ldt_len = 0;
2282 	pmap->pm_ldt_sel = GSYSSEL(GLDT_SEL, SEL_KPL);
2283 
2284 	/* allocate PDP */
2285  try_again:
2286 	pmap->pm_pdir = pool_cache_get(&pmap_pdp_cache, PR_WAITOK);
2287 
2288 	mutex_enter(&pmaps_lock);
2289 
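	/*
	 * If the last kernel PDE slot is still zero, the cached PDP was
	 * presumably constructed before a pmap_growkernel() call and is
	 * stale; destruct it and construct a fresh one.
	 */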
2290 	if (pmap->pm_pdir[PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1] - 1] == 0) {
2291 		mutex_exit(&pmaps_lock);
2292 		pool_cache_destruct_object(&pmap_pdp_cache, pmap->pm_pdir);
2293 		goto try_again;
2294 	}
2295 
2296 	for (i = 0; i < PDP_SIZE; i++)
2297 		pmap->pm_pdirpa[i] =
2298 		    pmap_pte2pa(pmap->pm_pdir[PDIR_SLOT_PTE + i]);
2299 
2300 	LIST_INSERT_HEAD(&pmaps, pmap, pm_list);
2301 
2302 	mutex_exit(&pmaps_lock);
2303 
2304 	return (pmap);
2305 }
2306 
2307 /*
2308  * pmap_free_ptps: put a list of ptps back to the freelist.
2309  */
2310 
2311 static void
2312 pmap_free_ptps(struct vm_page *empty_ptps)
2313 {
2314 	struct vm_page *ptp;
2315 	struct pmap_page *pp;
2316 
2317 	while ((ptp = empty_ptps) != NULL) {
2318 		pp = VM_PAGE_TO_PP(ptp);
2319 		empty_ptps = pp->pp_link;
2320 		LIST_INIT(&pp->pp_head.pvh_list);
2321 		uvm_pagefree(ptp);
2322 	}
2323 }
2324 
2325 /*
2326  * pmap_destroy: drop reference count on pmap.   free pmap if
2327  *	reference count goes to zero.
2328  */
2329 
2330 void
2331 pmap_destroy(struct pmap *pmap)
2332 {
2333 	lwp_t *l;
2334 	int i;
2335 
2336 	/*
2337 	 * If we have torn down this pmap, process deferred frees and
2338 	 * invalidations.  Free now if the system is low on memory;
2339 	 * otherwise, defer freeing until the pmap is actually destroyed
2340 	 * below, thus avoiding a TLB shootdown.
2341 	 */
2342 	l = curlwp;
2343 	if (__predict_false(l->l_md.md_gc_pmap == pmap)) {
2344 		if (uvmexp.free < uvmexp.freetarg) {
2345 			pmap_update(pmap);
2346 		} else {
2347 			KASSERT(pmap->pm_gc_ptp == NULL);
2348 			pmap->pm_gc_ptp = l->l_md.md_gc_ptp;
2349 			l->l_md.md_gc_ptp = NULL;
2350 			l->l_md.md_gc_pmap = NULL;
2351 		}
2352 	}
2353 
2354 	/*
2355 	 * drop reference count
2356 	 */
2357 
2358 	if (atomic_dec_uint_nv(&pmap->pm_obj[0].uo_refs) > 0) {
2359 		return;
2360 	}
2361 
2362 #ifdef DIAGNOSTIC
2363 	CPU_INFO_ITERATOR cii;
2364 	struct cpu_info *ci;
2365 
2366 	for (CPU_INFO_FOREACH(cii, ci)) {
2367 		if (ci->ci_pmap == pmap)
2368 			panic("destroying pmap being used");
2369 #if defined(XEN) && defined(__x86_64__)
2370 		for (i = 0; i < PDIR_SLOT_PTE; i++) {
2371 			if (pmap->pm_pdir[i] != 0 &&
2372 			    ci->ci_kpm_pdir[i] == pmap->pm_pdir[i]) {
2373 				printf("pmap_destroy(%p) pmap_kernel %p "
2374 				    "curcpu %d cpu %d ci_pmap %p "
2375 				    "ci->ci_kpm_pdir[%d]=%" PRIx64
2376 				    " pmap->pm_pdir[%d]=%" PRIx64 "\n",
2377 				    pmap, pmap_kernel(), curcpu()->ci_index,
2378 				    ci->ci_index, ci->ci_pmap,
2379 				    i, ci->ci_kpm_pdir[i],
2380 				    i, pmap->pm_pdir[i]);
2381 				panic("pmap_destroy: used pmap");
2382 			}
2383 		}
2384 #endif
2385 	}
2386 #endif /* DIAGNOSTIC */
2387 
2388 	/*
2389 	 * Reference count is zero, free pmap resources and then free pmap.
2390 	 * First, remove it from global list of pmaps.
2391 	 */
2392 
2393 	mutex_enter(&pmaps_lock);
2394 	LIST_REMOVE(pmap, pm_list);
2395 	mutex_exit(&pmaps_lock);
2396 
2397 	/*
2398 	 * Process deferred PTP frees.  No TLB shootdown required, as the
2399 	 * PTP pages are no longer visible to any CPU.
2400 	 */
2401 
2402 	pmap_free_ptps(pmap->pm_gc_ptp);
2403 
2404 	/*
2405 	 * destroyed pmap shouldn't have remaining PTPs
2406 	 */
2407 
2408 	for (i = 0; i < PTP_LEVELS - 1; i++) {
2409 		KASSERT(pmap->pm_obj[i].uo_npages == 0);
2410 		KASSERT(TAILQ_EMPTY(&pmap->pm_obj[i].memq));
2411 	}
2412 
2413 	pool_cache_put(&pmap_pdp_cache, pmap->pm_pdir);
2414 
2415 #ifdef USER_LDT
2416 	if (pmap->pm_ldt != NULL) {
2417 		/*
2418 		 * no need to switch the LDT; this address space is gone,
2419 		 * nothing is using it.
2420 		 *
2421 		 * No need to lock the pmap for ldt_free (or anything else),
2422 		 * we're the last one to use it.
2423 		 */
2424 		mutex_enter(&cpu_lock);
2425 		ldt_free(pmap->pm_ldt_sel);
2426 		mutex_exit(&cpu_lock);
2427 		uvm_km_free(kernel_map, (vaddr_t)pmap->pm_ldt,
2428 		    pmap->pm_ldt_len, UVM_KMF_WIRED);
2429 	}
2430 #endif
2431 
2432 	for (i = 0; i < PTP_LEVELS - 1; i++) {
2433 		uvm_obj_destroy(&pmap->pm_obj[i], false);
2434 		mutex_destroy(&pmap->pm_obj_lock[i]);
2435 	}
2436 	kcpuset_destroy(pmap->pm_cpus);
2437 	kcpuset_destroy(pmap->pm_kernel_cpus);
2438 #ifdef XEN
2439 	kcpuset_destroy(pmap->pm_xen_ptp_cpus);
2440 #endif
2441 	pool_cache_put(&pmap_cache, pmap);
2442 }
2443 
2444 /*
2445  * pmap_remove_all: pmap is being torn down by the current thread.
2446  * avoid unnecessary invalidations.
2447  */
2448 
2449 void
2450 pmap_remove_all(struct pmap *pmap)
2451 {
2452 	lwp_t *l = curlwp;
2453 
2454 	KASSERT(l->l_md.md_gc_pmap == NULL);
2455 
2456 	l->l_md.md_gc_pmap = pmap;
2457 }
2458 
2459 #if defined(PMAP_FORK)
2460 /*
2461  * pmap_fork: perform any necessary data structure manipulation when
2462  * a VM space is forked.
2463  */
2464 
2465 void
2466 pmap_fork(struct pmap *pmap1, struct pmap *pmap2)
2467 {
2468 #ifdef USER_LDT
2469 	union descriptor *new_ldt;
2470 	size_t len;
2471 	int sel;
2472 
2473 	if (__predict_true(pmap1->pm_ldt == NULL)) {
2474 		return;
2475 	}
2476 
2477  retry:
2478 	if (pmap1->pm_ldt != NULL) {
2479 		len = pmap1->pm_ldt_len;
2480 		new_ldt = (union descriptor *)uvm_km_alloc(kernel_map, len, 0,
2481 		    UVM_KMF_WIRED);
2482 		mutex_enter(&cpu_lock);
2483 		sel = ldt_alloc(new_ldt, len);
2484 		if (sel == -1) {
2485 			mutex_exit(&cpu_lock);
2486 			uvm_km_free(kernel_map, (vaddr_t)new_ldt, len,
2487 			    UVM_KMF_WIRED);
2488 			printf("WARNING: pmap_fork: unable to allocate LDT\n");
2489 			return;
2490 		}
2491 	} else {
2492 		len = -1;
2493 		new_ldt = NULL;
2494 		sel = -1;
2495 		mutex_enter(&cpu_lock);
2496 	}
2497 
2498  	/* Copy the LDT, if necessary. */
2499  	if (pmap1->pm_ldt != NULL) {
2500 		if (len != pmap1->pm_ldt_len) {
2501 			if (len != -1) {
2502 				ldt_free(sel);
2503 				uvm_km_free(kernel_map, (vaddr_t)new_ldt,
2504 				    len, UVM_KMF_WIRED);
2505 			}
2506 			mutex_exit(&cpu_lock);
2507 			goto retry;
2508 		}
2509 
2510 		memcpy(new_ldt, pmap1->pm_ldt, len);
2511 		pmap2->pm_ldt = new_ldt;
2512 		pmap2->pm_ldt_len = pmap1->pm_ldt_len;
2513 		pmap2->pm_ldt_sel = sel;
2514 		len = -1;
2515 	}
2516 
2517 	if (len != -1) {
2518 		ldt_free(sel);
2519 		uvm_km_free(kernel_map, (vaddr_t)new_ldt, len,
2520 		    UVM_KMF_WIRED);
2521 	}
2522 	mutex_exit(&cpu_lock);
2523 #endif /* USER_LDT */
2524 }
2525 #endif /* PMAP_FORK */
2526 
2527 #ifdef USER_LDT
2528 
2529 /*
2530  * pmap_ldt_xcall: cross call used by pmap_ldt_sync.  if the named pmap
2531  * is active, reload LDTR.
2532  */
2533 static void
2534 pmap_ldt_xcall(void *arg1, void *arg2)
2535 {
2536 	struct pmap *pm;
2537 
2538 	kpreempt_disable();
2539 	pm = arg1;
2540 	if (curcpu()->ci_pmap == pm) {
2541 		lldt(pm->pm_ldt_sel);
2542 	}
2543 	kpreempt_enable();
2544 }
2545 
2546 /*
2547  * pmap_ldt_sync: LDT selector for the named pmap is changing.  swap
2548  * in the new selector on all CPUs.
2549  */
2550 void
2551 pmap_ldt_sync(struct pmap *pm)
2552 {
2553 	uint64_t where;
2554 
2555 	KASSERT(mutex_owned(&cpu_lock));
2556 
2557 	pmap_ldt_evcnt.ev_count++;
2558 	where = xc_broadcast(0, pmap_ldt_xcall, pm, NULL);
2559 	xc_wait(where);
2560 }
2561 
2562 /*
2563  * pmap_ldt_cleanup: if the pmap has a local LDT, deallocate it, and
2564  * restore the default.
2565  */
2566 
2567 void
2568 pmap_ldt_cleanup(struct lwp *l)
2569 {
2570 	pmap_t pmap = l->l_proc->p_vmspace->vm_map.pmap;
2571 	union descriptor *dp = NULL;
2572 	size_t len = 0;
2573 	int sel = -1;
2574 
2575 	if (__predict_true(pmap->pm_ldt == NULL)) {
2576 		return;
2577 	}
2578 
2579 	mutex_enter(&cpu_lock);
2580 	if (pmap->pm_ldt != NULL) {
2581 		sel = pmap->pm_ldt_sel;
2582 		dp = pmap->pm_ldt;
2583 		len = pmap->pm_ldt_len;
2584 		pmap->pm_ldt_sel = GSYSSEL(GLDT_SEL, SEL_KPL);
2585 		pmap->pm_ldt = NULL;
2586 		pmap->pm_ldt_len = 0;
2587 		pmap_ldt_sync(pmap);
2588 		ldt_free(sel);
2589 		uvm_km_free(kernel_map, (vaddr_t)dp, len, UVM_KMF_WIRED);
2590 	}
2591 	mutex_exit(&cpu_lock);
2592 }
2593 #endif /* USER_LDT */
2594 
2595 /*
2596  * pmap_activate: activate a process' pmap
2597  *
2598  * => must be called with kernel preemption disabled
2599  * => if lwp is the curlwp, then set ci_want_pmapload so that
2600  *    actual MMU context switch will be done by pmap_load() later
2601  */
2602 
2603 void
2604 pmap_activate(struct lwp *l)
2605 {
2606 	struct cpu_info *ci;
2607 	struct pmap *pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map);
2608 
2609 	KASSERT(kpreempt_disabled());
2610 
2611 	ci = curcpu();
2612 
2613 	if (l == ci->ci_curlwp) {
2614 		KASSERT(ci->ci_want_pmapload == 0);
2615 		KASSERT(ci->ci_tlbstate != TLBSTATE_VALID);
2616 #ifdef KSTACK_CHECK_DR0
2617 		/*
2618 		 * setup breakpoint on the top of stack
2619 		 */
2620 		if (l == &lwp0)
2621 			dr0(0, 0, 0, 0);
2622 		else
2623 			dr0(KSTACK_LOWEST_ADDR(l), 1, 3, 1);
2624 #endif
2625 
2626 		/*
2627 		 * no need to switch to kernel vmspace because
2628 		 * it's a subset of any vmspace.
2629 		 */
2630 
2631 		if (pmap == pmap_kernel()) {
2632 			ci->ci_want_pmapload = 0;
2633 			return;
2634 		}
2635 
2636 		ci->ci_want_pmapload = 1;
2637 	}
2638 }
2639 
2640 /*
2641  * pmap_reactivate: try to regain reference to the pmap.
2642  *
2643  * => Must be called with kernel preemption disabled.
2644  */
2645 
2646 static bool
2647 pmap_reactivate(struct pmap *pmap)
2648 {
2649 	struct cpu_info * const ci = curcpu();
2650 	const cpuid_t cid = cpu_index(ci);
2651 	bool result;
2652 
2653 	KASSERT(kpreempt_disabled());
2654 #if defined(XEN) && defined(__x86_64__)
2655 	KASSERT(pmap_pdirpa(pmap, 0) == ci->ci_xen_current_user_pgd);
2656 #elif defined(PAE)
2657 	KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(ci->ci_pae_l3_pdir[0]));
2658 #elif !defined(XEN)
2659 	KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(rcr3()));
2660 #endif
2661 
2662 	/*
2663 	 * If we still have a lazy reference to this pmap, we can assume
2664 	 * that there was no TLB shootdown for this pmap in the meantime.
2665 	 *
2666 	 * The order of events here is important as we must synchronize
2667 	 * with TLB shootdown interrupts.  Declare interest in invalidations
2668 	 * (TLBSTATE_VALID) and then check the CPU set, which the IPIs can
2669 	 * change only when the state is TLBSTATE_LAZY.
2670 	 */
2671 
2672 	ci->ci_tlbstate = TLBSTATE_VALID;
2673 	KASSERT(kcpuset_isset(pmap->pm_kernel_cpus, cid));
2674 
2675 	if (kcpuset_isset(pmap->pm_cpus, cid)) {
2676 		/* We have the reference, state is valid. */
2677 		result = true;
2678 	} else {
2679 		/* Must reload the TLB. */
2680 		kcpuset_atomic_set(pmap->pm_cpus, cid);
2681 		result = false;
2682 	}
2683 	return result;
2684 }
2685 
2686 /*
2687  * pmap_load: perform the actual pmap switch, i.e. fill in %cr3 register
2688  * and relevant LDT info.
2689  *
2690  * Ensures that the current process' pmap is loaded on the current CPU's
2691  * MMU and that there are no stale TLB entries.
2692  *
2693  * => The caller should disable kernel preemption or do check-and-retry
2694  *    to prevent a preemption from undoing our efforts.
2695  * => This function may block.
2696  */
2697 void
2698 pmap_load(void)
2699 {
2700 	struct cpu_info *ci;
2701 	struct pmap *pmap, *oldpmap;
2702 	struct lwp *l;
2703 	struct pcb *pcb;
2704 	cpuid_t cid;
2705 	uint64_t ncsw;
2706 
2707 	kpreempt_disable();
2708  retry:
2709 	ci = curcpu();
2710 	if (!ci->ci_want_pmapload) {
2711 		kpreempt_enable();
2712 		return;
2713 	}
2714 	l = ci->ci_curlwp;
2715 	ncsw = l->l_ncsw;
2716 
2717 	/* should be able to take ipis. */
2718 	KASSERT(ci->ci_ilevel < IPL_HIGH);
2719 #ifdef XEN
2720 	/* Check that interrupts are enabled (i.e., no events are masked) */
2721 	KASSERT(x86_read_psl() == 0);
2722 #else
2723 	KASSERT((x86_read_psl() & PSL_I) != 0);
2724 #endif
2725 
2726 	KASSERT(l != NULL);
2727 	pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map);
2728 	KASSERT(pmap != pmap_kernel());
2729 	oldpmap = ci->ci_pmap;
2730 	pcb = lwp_getpcb(l);
2731 
2732 	if (pmap == oldpmap) {
2733 		if (!pmap_reactivate(pmap)) {
2734 			u_int gen = uvm_emap_gen_return();
2735 
2736 			/*
2737 			 * the pmap was changed while it was deactivated,
2738 			 * so our TLB may be stale.
2739 			 */
2740 
2741 			tlbflush();
2742 			uvm_emap_update(gen);
2743 		}
2744 
2745 		ci->ci_want_pmapload = 0;
2746 		kpreempt_enable();
2747 		return;
2748 	}
2749 
2750 	/*
2751 	 * Acquire a reference to the new pmap and perform the switch.
2752 	 */
2753 
2754 	pmap_reference(pmap);
2755 
2756 	cid = cpu_index(ci);
2757 	kcpuset_atomic_clear(oldpmap->pm_cpus, cid);
2758 	kcpuset_atomic_clear(oldpmap->pm_kernel_cpus, cid);
2759 
2760 #if defined(XEN) && defined(__x86_64__)
2761 	KASSERT(pmap_pdirpa(oldpmap, 0) == ci->ci_xen_current_user_pgd ||
2762 	    oldpmap == pmap_kernel());
2763 #elif defined(PAE)
2764 	KASSERT(pmap_pdirpa(oldpmap, 0) == pmap_pte2pa(ci->ci_pae_l3_pdir[0]));
2765 #elif !defined(XEN)
2766 	KASSERT(pmap_pdirpa(oldpmap, 0) == pmap_pte2pa(rcr3()));
2767 #endif
2768 	KASSERT(!kcpuset_isset(pmap->pm_cpus, cid));
2769 	KASSERT(!kcpuset_isset(pmap->pm_kernel_cpus, cid));
2770 
2771 	/*
2772 	 * Mark the pmap in use by this CPU.  Again, we must synchronize
2773 	 * with TLB shootdown interrupts, so set the state VALID first,
2774 	 * then register us for shootdown events on this pmap.
2775 	 */
2776 	ci->ci_tlbstate = TLBSTATE_VALID;
2777 	kcpuset_atomic_set(pmap->pm_cpus, cid);
2778 	kcpuset_atomic_set(pmap->pm_kernel_cpus, cid);
2779 	ci->ci_pmap = pmap;
2780 
2781 	/*
2782 	 * update tss.  now that we have registered for invalidations
2783 	 * from other CPUs, we're good to load the page tables.
2784 	 */
2785 #ifdef PAE
2786 	pcb->pcb_cr3 = ci->ci_pae_l3_pdirpa;
2787 #else
2788 	pcb->pcb_cr3 = pmap_pdirpa(pmap, 0);
2789 #endif
2790 
2791 #ifdef i386
2792 #ifndef XEN
2793 	ci->ci_tss.tss_ldt = pmap->pm_ldt_sel;
2794 	ci->ci_tss.tss_cr3 = pcb->pcb_cr3;
2795 #endif /* !XEN */
2796 #endif /* i386 */
2797 
2798 	lldt(pmap->pm_ldt_sel);
2799 
2800 	u_int gen = uvm_emap_gen_return();
2801 	cpu_load_pmap(pmap, oldpmap);
2802 	uvm_emap_update(gen);
2803 
2804 	ci->ci_want_pmapload = 0;
2805 
2806 	/*
2807 	 * we're now running with the new pmap.  drop the reference
2808 	 * to the old pmap.  if we block, we need to go around again.
2809 	 */
2810 
2811 	pmap_destroy(oldpmap);
2812 	if (l->l_ncsw != ncsw) {
2813 		goto retry;
2814 	}
2815 
2816 	kpreempt_enable();
2817 }
2818 
2819 /*
2820  * pmap_deactivate: deactivate a process' pmap.
2821  *
2822  * => Must be called with kernel preemption disabled (high IPL is enough).
2823  */
2824 void
2825 pmap_deactivate(struct lwp *l)
2826 {
2827 	struct pmap *pmap;
2828 	struct cpu_info *ci;
2829 
2830 	KASSERT(kpreempt_disabled());
2831 
2832 	if (l != curlwp) {
2833 		return;
2834 	}
2835 
2836 	/*
2837 	 * Wait for pending TLB shootdowns to complete.  Necessary because
2838 	 * TLB shootdown state is per-CPU, and the LWP may be coming off
2839 	 * the CPU before it has a chance to call pmap_update(), e.g. due
2840 	 * to kernel preemption or blocking routine in between.
2841 	 */
2842 	pmap_tlb_shootnow();
2843 
2844 	ci = curcpu();
2845 
2846 	if (ci->ci_want_pmapload) {
2847 		/*
2848 		 * ci_want_pmapload means that our pmap is not loaded on
2849 		 * the CPU or TLB might be stale.  note that pmap_kernel()
2850 		 * is always considered loaded.
2851 		 */
2852 		KASSERT(vm_map_pmap(&l->l_proc->p_vmspace->vm_map)
2853 		    != pmap_kernel());
2854 		KASSERT(vm_map_pmap(&l->l_proc->p_vmspace->vm_map)
2855 		    != ci->ci_pmap || ci->ci_tlbstate != TLBSTATE_VALID);
2856 
2857 		/*
2858 		 * userspace has not been touched.
2859 		 * nothing to do here.
2860 		 */
2861 
2862 		ci->ci_want_pmapload = 0;
2863 		return;
2864 	}
2865 
2866 	pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map);
2867 
2868 	if (pmap == pmap_kernel()) {
2869 		return;
2870 	}
2871 
2872 #if defined(XEN) && defined(__x86_64__)
2873 	KASSERT(pmap_pdirpa(pmap, 0) == ci->ci_xen_current_user_pgd);
2874 #elif defined(PAE)
2875 	KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(ci->ci_pae_l3_pdir[0]));
2876 #elif !defined(XEN)
2877 	KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(rcr3()));
2878 #endif
2879 	KASSERT(ci->ci_pmap == pmap);
2880 
2881 	/*
2882 	 * we aren't interested in TLB invalidations for this pmap,
2883 	 * at least for the time being.
2884 	 */
2885 
2886 	KASSERT(ci->ci_tlbstate == TLBSTATE_VALID);
2887 	ci->ci_tlbstate = TLBSTATE_LAZY;
2888 }
2889 
2890 /*
2891  * end of lifecycle functions
2892  */
2893 
2894 /*
2895  * some misc. functions
2896  */
2897 
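/*
 * pmap_pdes_invalid: check that the PDEs covering va are valid at every
 * level.  Returns 0 (and the terminal PDE via lastpde) if they all are,
 * otherwise the level at which an invalid PDE was found.
 */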
2898 int
2899 pmap_pdes_invalid(vaddr_t va, pd_entry_t * const *pdes, pd_entry_t *lastpde)
2900 {
2901 	int i;
2902 	unsigned long index;
2903 	pd_entry_t pde;
2904 
2905 	for (i = PTP_LEVELS; i > 1; i--) {
2906 		index = pl_i(va, i);
2907 		pde = pdes[i - 2][index];
2908 		if ((pde & PG_V) == 0)
2909 			return i;
2910 	}
2911 	if (lastpde != NULL)
2912 		*lastpde = pde;
2913 	return 0;
2914 }
2915 
2916 /*
2917  * pmap_extract: extract a PA for the given VA
2918  */
2919 
2920 bool
2921 pmap_extract(struct pmap *pmap, vaddr_t va, paddr_t *pap)
2922 {
2923 	pt_entry_t *ptes, pte;
2924 	pd_entry_t pde;
2925 	pd_entry_t * const *pdes;
2926 	struct pmap *pmap2;
2927 	struct cpu_info *ci;
2928 	paddr_t pa;
2929 	lwp_t *l;
2930 	bool hard, rv;
2931 
2932 #ifdef __HAVE_DIRECT_MAP
2933 	if (va >= PMAP_DIRECT_BASE && va < PMAP_DIRECT_END) {
2934 		if (pap != NULL) {
2935 			*pap = va - PMAP_DIRECT_BASE;
2936 		}
2937 		return true;
2938 	}
2939 #endif
2940 
2941 	rv = false;
2942 	pa = 0;
2943 	l = curlwp;
2944 
2945 	kpreempt_disable();
2946 	ci = l->l_cpu;
2947 	if (__predict_true(!ci->ci_want_pmapload && ci->ci_pmap == pmap) ||
2948 	    pmap == pmap_kernel()) {
2949 		/*
2950 		 * no need to lock, because it's pmap_kernel() or our
2951 		 * own pmap and is active.  if a user pmap, the caller
2952 		 * will hold the vm_map write/read locked and so prevent
2953 		 * entries from disappearing while we are here.  ptps
2954 		 * can disappear via pmap_remove() and pmap_protect(),
2955 		 * but they are called with the vm_map write locked.
2956 		 */
2957 		hard = false;
2958 		ptes = PTE_BASE;
2959 		pdes = normal_pdes;
2960 	} else {
2961 		/* we lose, do it the hard way. */
2962 		hard = true;
2963 		pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
2964 	}
2965 	if (pmap_pdes_valid(va, pdes, &pde)) {
2966 		pte = ptes[pl1_i(va)];
2967 		if (pde & PG_PS) {
2968 			pa = (pde & PG_LGFRAME) | (va & (NBPD_L2 - 1));
2969 			rv = true;
2970 		} else if (__predict_true((pte & PG_V) != 0)) {
2971 			pa = pmap_pte2pa(pte) | (va & (NBPD_L1 - 1));
2972 			rv = true;
2973 		}
2974 	}
2975 	if (__predict_false(hard)) {
2976 		pmap_unmap_ptes(pmap, pmap2);
2977 	}
2978 	kpreempt_enable();
2979 	if (pap != NULL) {
2980 		*pap = pa;
2981 	}
2982 	return rv;
2983 }
2984 
2985 
2986 /*
2987  * vtophys: virtual address to physical address.  For use by
2988  * machine-dependent code only.
2989  */
2990 
2991 paddr_t
2992 vtophys(vaddr_t va)
2993 {
2994 	paddr_t pa;
2995 
2996 	if (pmap_extract(pmap_kernel(), va, &pa) == true)
2997 		return (pa);
2998 	return (0);
2999 }
3000 
3001 __strict_weak_alias(pmap_extract_ma, pmap_extract);
3002 
3003 #ifdef XEN
3004 
3005 /*
3006  * vtomach: virtual address to machine address.  For use by
3007  * machine-dependent code only.
3008  */
3009 
3010 paddr_t
3011 vtomach(vaddr_t va)
3012 {
3013 	paddr_t pa;
3014 
3015 	if (pmap_extract_ma(pmap_kernel(), va, &pa) == true)
3016 		return (pa);
3017 	return (0);
3018 }
3019 
3020 #endif /* XEN */
3021 
3022 /*
3023  * pmap_virtual_space: used during bootup [pmap_steal_memory] to
3024  *	determine the bounds of the kernel virtual address space.
3025  */
3026 
3027 void
3028 pmap_virtual_space(vaddr_t *startp, vaddr_t *endp)
3029 {
3030 	*startp = virtual_avail;
3031 	*endp = virtual_end;
3032 }
3033 
3034 /*
3035  * pmap_zero_page: zero a page
3036  */
3037 
3038 void
3039 pmap_zero_page(paddr_t pa)
3040 {
3041 #if defined(__HAVE_DIRECT_MAP)
3042 	pagezero(PMAP_DIRECT_MAP(pa));
3043 #else
3044 #if defined(XEN)
3045 	if (XEN_VERSION_SUPPORTED(3, 4)) {
3046 		xen_pagezero(pa);
		return;
	}
3047 #endif
3048 	pt_entry_t *zpte;
3049 	void *zerova;
3050 	int id;
3051 
3052 	kpreempt_disable();
3053 	id = cpu_number();
3054 	zpte = PTESLEW(zero_pte, id);
3055 	zerova = VASLEW(zerop, id);
3056 
3057 #ifdef DIAGNOSTIC
3058 	if (*zpte)
3059 		panic("pmap_zero_page: lock botch");
3060 #endif
3061 
3062 	pmap_pte_set(zpte, pmap_pa2pte(pa) | PG_V | PG_RW | PG_M | PG_U | PG_k);
3063 	pmap_pte_flush();
3064 	pmap_update_pg((vaddr_t)zerova);		/* flush TLB */
3065 
3066 	memset(zerova, 0, PAGE_SIZE);
3067 
3068 #if defined(DIAGNOSTIC) || defined(XEN)
3069 	pmap_pte_set(zpte, 0);				/* zap ! */
3070 	pmap_pte_flush();
3071 #endif
3072 	kpreempt_enable();
3073 #endif /* defined(__HAVE_DIRECT_MAP) */
3074 }
3075 
3076 /*
3077  * pmap_pageidlezero: the same, for the idle loop page zeroer.
3078  * Returns true if the page was zeroed, false if we aborted for
3079  * some reason.
3080  */
3081 
3082 bool
3083 pmap_pageidlezero(paddr_t pa)
3084 {
3085 #ifdef __HAVE_DIRECT_MAP
3086 	KASSERT(cpu_feature[0] & CPUID_SSE2);
3087 	return sse2_idlezero_page((void *)PMAP_DIRECT_MAP(pa));
3088 #else
3089 	pt_entry_t *zpte;
3090 	void *zerova;
3091 	bool rv;
3092 	int id;
3093 
3094 	id = cpu_number();
3095 	zpte = PTESLEW(zero_pte, id);
3096 	zerova = VASLEW(zerop, id);
3097 
3098 	KASSERT(cpu_feature[0] & CPUID_SSE2);
3099 	KASSERT(*zpte == 0);
3100 
3101 	pmap_pte_set(zpte, pmap_pa2pte(pa) | PG_V | PG_RW | PG_M | PG_U | PG_k);
3102 	pmap_pte_flush();
3103 	pmap_update_pg((vaddr_t)zerova);		/* flush TLB */
3104 
3105 	rv = sse2_idlezero_page(zerova);
3106 
3107 #if defined(DIAGNOSTIC) || defined(XEN)
3108 	pmap_pte_set(zpte, 0);				/* zap ! */
3109 	pmap_pte_flush();
3110 #endif
3111 
3112 	return rv;
3113 #endif
3114 }
3115 
3116 /*
3117  * pmap_copy_page: copy a page
3118  */
3119 
3120 void
3121 pmap_copy_page(paddr_t srcpa, paddr_t dstpa)
3122 {
3123 #if defined(__HAVE_DIRECT_MAP)
3124 	vaddr_t srcva = PMAP_DIRECT_MAP(srcpa);
3125 	vaddr_t dstva = PMAP_DIRECT_MAP(dstpa);
3126 
3127 	memcpy((void *)dstva, (void *)srcva, PAGE_SIZE);
3128 #else
3129 #if defined(XEN)
3130 	if (XEN_VERSION_SUPPORTED(3, 4)) {
3131 		xen_copy_page(srcpa, dstpa);
3132 		return;
3133 	}
3134 #endif
3135 	pt_entry_t *spte;
3136 	pt_entry_t *dpte;
3137 	void *csrcva;
3138 	void *cdstva;
3139 	int id;
3140 
3141 	kpreempt_disable();
3142 	id = cpu_number();
3143 	spte = PTESLEW(csrc_pte,id);
3144 	dpte = PTESLEW(cdst_pte,id);
3145 	csrcva = VASLEW(csrcp, id);
3146 	cdstva = VASLEW(cdstp, id);
3147 
3148 	KASSERT(*spte == 0 && *dpte == 0);
3149 
3150 	pmap_pte_set(spte, pmap_pa2pte(srcpa) | PG_V | PG_RW | PG_U | PG_k);
3151 	pmap_pte_set(dpte,
3152 	    pmap_pa2pte(dstpa) | PG_V | PG_RW | PG_M | PG_U | PG_k);
3153 	pmap_pte_flush();
3154 	pmap_update_2pg((vaddr_t)csrcva, (vaddr_t)cdstva);
3155 
3156 	memcpy(cdstva, csrcva, PAGE_SIZE);
3157 
3158 #if defined(DIAGNOSTIC) || defined(XEN)
3159 	pmap_pte_set(spte, 0);
3160 	pmap_pte_set(dpte, 0);
3161 	pmap_pte_flush();
3162 #endif
3163 	kpreempt_enable();
3164 #endif /* defined(__HAVE_DIRECT_MAP) */
3165 }
3166 
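/*
 * pmap_map_ptp: map a PTP into KVA for temporary access.  With a direct
 * map this is plain address arithmetic; otherwise a per-CPU PTE slot
 * (ptp_pte/ptpp) is borrowed and must be released with pmap_unmap_ptp().
 */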
3167 static pt_entry_t *
3168 pmap_map_ptp(struct vm_page *ptp)
3169 {
3170 #ifdef __HAVE_DIRECT_MAP
3171 	return (void *)PMAP_DIRECT_MAP(VM_PAGE_TO_PHYS(ptp));
3172 #else
3173 	pt_entry_t *ptppte;
3174 	void *ptpva;
3175 	int id;
3176 
3177 	KASSERT(kpreempt_disabled());
3178 
3179 	id = cpu_number();
3180 	ptppte = PTESLEW(ptp_pte, id);
3181 	ptpva = VASLEW(ptpp, id);
3182 #if !defined(XEN)
3183 	pmap_pte_set(ptppte, pmap_pa2pte(VM_PAGE_TO_PHYS(ptp)) | PG_V | PG_M |
3184 	    PG_RW | PG_U | PG_k);
3185 #else
3186 	pmap_pte_set(ptppte, pmap_pa2pte(VM_PAGE_TO_PHYS(ptp)) | PG_V | PG_M |
3187 	    PG_U | PG_k);
3188 #endif
3189 	pmap_pte_flush();
3190 	pmap_update_pg((vaddr_t)ptpva);
3191 
3192 	return (pt_entry_t *)ptpva;
3193 #endif
3194 }
3195 
3196 static void
3197 pmap_unmap_ptp(void)
3198 {
3199 #ifndef __HAVE_DIRECT_MAP
3200 #if defined(DIAGNOSTIC) || defined(XEN)
3201 	pt_entry_t *pte;
3202 
3203 	KASSERT(kpreempt_disabled());
3204 
3205 	pte = PTESLEW(ptp_pte, cpu_number());
3206 	if (*pte != 0) {
3207 		pmap_pte_set(pte, 0);
3208 		pmap_pte_flush();
3209 	}
3210 #endif
3211 #endif
3212 }
3213 
3214 static pt_entry_t *
3215 pmap_map_pte(struct pmap *pmap, struct vm_page *ptp, vaddr_t va)
3216 {
3217 
3218 	KASSERT(kpreempt_disabled());
3219 	if (pmap_is_curpmap(pmap)) {
3220 		return &PTE_BASE[pl1_i(va)]; /* (k)vtopte */
3221 	}
3222 	KASSERT(ptp != NULL);
3223 	return pmap_map_ptp(ptp) + pl1_pi(va);
3224 }
3225 
3226 static void
3227 pmap_unmap_pte(void)
3228 {
3229 
3230 	KASSERT(kpreempt_disabled());
3231 
3232 	pmap_unmap_ptp();
3233 }
3234 
3235 /*
3236  * p m a p   r e m o v e   f u n c t i o n s
3237  *
3238  * functions that remove mappings
3239  */
3240 
3241 /*
3242  * pmap_remove_ptes: remove PTEs from a PTP
3243  *
3244  * => caller must hold pmap's lock
3245  * => PTP must be mapped into KVA
3246  * => PTP should be null if pmap == pmap_kernel()
3247  * => must be called with kernel preemption disabled
3248  * => issues TLB shootdowns via pmap_remove_pte() as needed
3249  */
3250 
3251 static void
3252 pmap_remove_ptes(struct pmap *pmap, struct vm_page *ptp, vaddr_t ptpva,
3253 		 vaddr_t startva, vaddr_t endva, struct pv_entry **pv_tofree)
3254 {
3255 	pt_entry_t *pte = (pt_entry_t *)ptpva;
3256 
3257 	KASSERT(pmap == pmap_kernel() || mutex_owned(pmap->pm_lock));
3258 	KASSERT(kpreempt_disabled());
3259 
3260 	/*
3261 	 * note that ptpva points to the PTE that maps startva.   this may
3262 	 * or may not be the first PTE in the PTP.
3263 	 *
3264 	 * we loop through the PTP while there are still PTEs to look at
3265 	 * and the wire_count is greater than 1 (because we use the wire_count
3266 	 * to keep track of the number of real PTEs in the PTP).
3267 	 */
3268 	while (startva < endva && (ptp == NULL || ptp->wire_count > 1)) {
3269 		(void)pmap_remove_pte(pmap, ptp, pte, startva, pv_tofree);
3270 		startva += PAGE_SIZE;
3271 		pte++;
3272 	}
3273 }
3274 
3275 
3276 /*
3277  * pmap_remove_pte: remove a single PTE from a PTP.
3278  *
3279  * => caller must hold pmap's lock
3280  * => PTP must be mapped into KVA
3281  * => PTP should be null if pmap == pmap_kernel()
3282  * => returns true if we removed a mapping
3283  * => must be called with kernel preemption disabled
3284  */
3285 static bool
3286 pmap_remove_pte(struct pmap *pmap, struct vm_page *ptp, pt_entry_t *pte,
3287 		vaddr_t va, struct pv_entry **pv_tofree)
3288 {
3289 	struct pv_entry *pve;
3290 	struct vm_page *pg;
3291 	struct pmap_page *pp;
3292 	pt_entry_t opte;
3293 
3294 	KASSERT(pmap == pmap_kernel() || mutex_owned(pmap->pm_lock));
3295 	KASSERT(kpreempt_disabled());
3296 
3297 	if (!pmap_valid_entry(*pte)) {
3298 		/* VA not mapped. */
3299 		return false;
3300 	}
3301 
3302 	/* Atomically save the old PTE and zap it. */
3303 	opte = pmap_pte_testset(pte, 0);
3304 	if (!pmap_valid_entry(opte)) {
3305 		return false;
3306 	}
3307 
3308 	pmap_exec_account(pmap, va, opte, 0);
3309 	pmap_stats_update_bypte(pmap, 0, opte);
3310 
3311 	if (ptp) {
3312 		/*
3313 		 * Dropping a PTE.  If this empties the PTP, force PG_U so
3314 		 * that the shootdown below also flushes the stale PDE.
		 */
3315 		ptp->wire_count--;
3316 		if (ptp->wire_count <= 1) {
3317 			opte |= PG_U;
3318 		}
3319 	}
3320 
3321 	if ((opte & PG_U) != 0) {
3322 		pmap_tlb_shootdown(pmap, va, opte, TLBSHOOT_REMOVE_PTE);
3323 	}
3324 
3325 	/*
3326 	 * If we are not on a pv_head list - we are done.
3327 	 */
3328 	if ((opte & PG_PVLIST) == 0) {
3329 #if defined(DIAGNOSTIC) && !defined(DOM0OPS)
3330 		if (PHYS_TO_VM_PAGE(pmap_pte2pa(opte)) != NULL ||
3331 		    pmap_pv_tracked(pmap_pte2pa(opte)) != NULL)
3332 			panic("pmap_remove_pte: managed or pv-tracked page"
3333 			    " without PG_PVLIST for %#"PRIxVADDR, va);
3334 #endif
3335 		return true;
3336 	}
3337 
3338 	if ((pg = PHYS_TO_VM_PAGE(pmap_pte2pa(opte))) != NULL) {
3339 		KASSERT(uvm_page_locked_p(pg));
3340 		pp = VM_PAGE_TO_PP(pg);
3341 	} else if ((pp = pmap_pv_tracked(pmap_pte2pa(opte))) == NULL) {
3342 		paddr_t pa = pmap_pte2pa(opte);
3343 		panic("pmap_remove_pte: PG_PVLIST with pv-untracked page"
3344 		    " va = 0x%"PRIxVADDR
3345 		    " pa = 0x%"PRIxPADDR" (0x%"PRIxPADDR")",
3346 		    va, pa, atop(pa));
3347 	}
3348 
3349 	/* Sync R/M bits. */
3350 	pp->pp_attrs |= opte;
3351 	pve = pmap_remove_pv(pp, ptp, va);
3352 
3353 	if (pve) {
3354 		pve->pve_next = *pv_tofree;
3355 		*pv_tofree = pve;
3356 	}
3357 	return true;
3358 }
3359 
3360 /*
3361  * pmap_remove: mapping removal function.
3362  *
3363  * => caller should not be holding any pmap locks
3364  */
3365 
3366 void
3367 pmap_remove(struct pmap *pmap, vaddr_t sva, vaddr_t eva)
3368 {
3369 	pt_entry_t *ptes;
3370 	pd_entry_t pde;
3371 	pd_entry_t * const *pdes;
3372 	struct pv_entry *pv_tofree = NULL;
3373 	bool result;
3374 	int i;
3375 	paddr_t ptppa;
3376 	vaddr_t blkendva, va = sva;
3377 	struct vm_page *ptp;
3378 	struct pmap *pmap2;
3379 
3380 	kpreempt_disable();
3381 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);	/* locks pmap */
3382 
3383 	/*
3384 	 * removing one page?  take shortcut function.
3385 	 */
3386 
3387 	if (va + PAGE_SIZE == eva) {
3388 		if (pmap_pdes_valid(va, pdes, &pde)) {
3389 
3390 			/* PA of the PTP */
3391 			ptppa = pmap_pte2pa(pde);
3392 
3393 			/* Get PTP if non-kernel mapping. */
3394 			if (pmap != pmap_kernel()) {
3395 				ptp = pmap_find_ptp(pmap, va, ptppa, 1);
3396 				KASSERTMSG(ptp != NULL,
3397 				    "pmap_remove: unmanaged PTP detected");
3398 			} else {
3399 				/* Never free kernel PTPs. */
3400 				ptp = NULL;
3401 			}
3402 
3403 			result = pmap_remove_pte(pmap, ptp,
3404 			    &ptes[pl1_i(va)], va, &pv_tofree);
3405 
3406 			/*
3407 			 * if mapping removed and the PTP is no longer
3408 			 * being used, free it!
3409 			 */
3410 
3411 			if (result && ptp && ptp->wire_count <= 1)
3412 				pmap_free_ptp(pmap, ptp, va, ptes, pdes);
3413 		}
3414 	} else for (/* null */ ; va < eva ; va = blkendva) {
3415 		int lvl;
3416 
3417 		/* determine range of block */
3418 		blkendva = x86_round_pdr(va+1);
3419 		if (blkendva > eva)
3420 			blkendva = eva;
3421 
3422 		/*
3423 		 * XXXCDC: our PTE mappings should never be removed
3424 		 * with pmap_remove!  if we allow this (and why would
3425 		 * we?) then we end up freeing the pmap's page
3426 		 * directory page (PDP) before we are finished using
3427 		 * it when we hit it in the recursive mapping.  this
3428 		 * is BAD.
3429 		 *
3430 		 * long term solution is to move the PTEs out of user
3431 		 * address space.  and into kernel address space (up
3432 		 * with APTE).  then we can set VM_MAXUSER_ADDRESS to
3433 		 * be VM_MAX_ADDRESS.
3434 		 */
3435 
3436 		/* XXXCDC: ugly hack to avoid freeing PDP here */
3437 		for (i = 0; i < PDP_SIZE; i++) {
3438 			if (pl_i(va, PTP_LEVELS) == PDIR_SLOT_PTE+i)
3439 				continue;
3440 		}
3441 
3442 		lvl = pmap_pdes_invalid(va, pdes, &pde);
3443 		if (lvl != 0) {
3444 			/*
3445 			 * skip a range corresponding to an invalid pde.
3446 			 */
3447 			blkendva = (va & ptp_masks[lvl - 1]) + nbpd[lvl - 1];
3448  			continue;
3449 		}
3450 
3451 		/* PA of the PTP */
3452 		ptppa = pmap_pte2pa(pde);
3453 
3454 		/* Get PTP if non-kernel mapping. */
3455 		if (pmap != pmap_kernel()) {
3456 			ptp = pmap_find_ptp(pmap, va, ptppa, 1);
3457 			KASSERTMSG(ptp != NULL,
3458 			    "pmap_remove: unmanaged PTP detected");
3459 		} else {
3460 			/* Never free kernel PTPs. */
3461 			ptp = NULL;
3462 		}
3463 
3464 		pmap_remove_ptes(pmap, ptp, (vaddr_t)&ptes[pl1_i(va)], va,
3465 		    blkendva, &pv_tofree);
3466 
3467 		/* if PTP is no longer being used, free it! */
3468 		if (ptp && ptp->wire_count <= 1) {
3469 			pmap_free_ptp(pmap, ptp, va, ptes, pdes);
3470 		}
3471 	}
3472 	pmap_unmap_ptes(pmap, pmap2);		/* unlock pmap */
3473 	kpreempt_enable();
3474 
3475 	/* Now we free unused PVs */
3476 	if (pv_tofree)
3477 		pmap_free_pvs(pv_tofree);
3478 }
3479 
3480 /*
3481  * pmap_sync_pv: clear pte bits and return the old value of the pte.
3482  *
3483  * => Caller should disable kernel preemption.
3484  * => issues tlb shootdowns if necessary.
3485  */
3486 
3487 static int
3488 pmap_sync_pv(struct pv_pte *pvpte, pt_entry_t expect, int clearbits,
3489     pt_entry_t *optep)
3490 {
3491 	struct pmap *pmap;
3492 	struct vm_page *ptp;
3493 	vaddr_t va;
3494 	pt_entry_t *ptep;
3495 	pt_entry_t opte;
3496 	pt_entry_t npte;
3497 	bool need_shootdown;
3498 
3499 	ptp = pvpte->pte_ptp;
3500 	va = pvpte->pte_va;
3501 	KASSERT(ptp == NULL || ptp->uobject != NULL);
3502 	KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset);
3503 	pmap = ptp_to_pmap(ptp);
3504 
3505 	KASSERT((expect & ~(PG_FRAME | PG_V)) == 0);
3506 	KASSERT((expect & PG_V) != 0);
3507 	KASSERT(clearbits == ~0 || (clearbits & ~(PG_M | PG_U | PG_RW)) == 0);
3508 	KASSERT(kpreempt_disabled());
3509 
3510 	ptep = pmap_map_pte(pmap, ptp, va);
3511 	do {
3512 		opte = *ptep;
3513 		KASSERT((opte & (PG_M | PG_U)) != PG_M);
3514 		KASSERT((opte & (PG_U | PG_V)) != PG_U);
3515 		KASSERT(opte == 0 || (opte & PG_V) != 0);
3516 		if ((opte & (PG_FRAME | PG_V)) != expect) {
3517 
3518 			/*
3519 			 * we lost a race with a V->P operation like
3520 			 * pmap_remove().  wait for the competitor to
3521 			 * finish reflecting the pte bits into pp_attrs.
3522 			 *
3523 			 * issue a redundant TLB shootdown so that
3524 			 * we can wait for its completion.
3525 			 */
3526 
3527 			pmap_unmap_pte();
3528 			if (clearbits != 0) {
3529 				pmap_tlb_shootdown(pmap, va,
3530 				    (pmap == pmap_kernel() ? PG_G : 0),
3531 				    TLBSHOOT_SYNC_PV1);
3532 			}
3533 			return EAGAIN;
3534 		}
3535 
3536 		/*
3537 		 * check if there's anything to do on this pte.
3538 		 */
3539 
3540 		if ((opte & clearbits) == 0) {
3541 			need_shootdown = false;
3542 			break;
3543 		}
3544 
3545 		/*
3546 		 * we need a shootdown if the pte is cached. (PG_U)
3547 		 *
3548 		 * ...unless we are clearing only the PG_RW bit and
3549 		 * it isn't cached as RW. (PG_M)
3550 		 */
3551 
3552 		need_shootdown = (opte & PG_U) != 0 &&
3553 		    !(clearbits == PG_RW && (opte & PG_M) == 0);
3554 
3555 		npte = opte & ~clearbits;
3556 
3557 		/*
3558 		 * if we need a shootdown anyway, clear PG_U and PG_M.
3559 		 */
3560 
3561 		if (need_shootdown) {
3562 			npte &= ~(PG_U | PG_M);
3563 		}
3564 		KASSERT((npte & (PG_M | PG_U)) != PG_M);
3565 		KASSERT((npte & (PG_U | PG_V)) != PG_U);
3566 		KASSERT(npte == 0 || (opte & PG_V) != 0);
3567 	} while (pmap_pte_cas(ptep, opte, npte) != opte);
3568 
3569 	if (need_shootdown) {
3570 		pmap_tlb_shootdown(pmap, va, opte, TLBSHOOT_SYNC_PV2);
3571 	}
3572 	pmap_unmap_pte();
3573 
3574 	*optep = opte;
3575 	return 0;
3576 }
3577 
3578 static void
3579 pmap_pp_remove(struct pmap_page *pp, paddr_t pa)
3580 {
3581 	struct pv_pte *pvpte;
3582 	struct pv_entry *killlist = NULL;
3583 	struct vm_page *ptp;
3584 	pt_entry_t expect;
3585 	int count;
3586 
3587 	expect = pmap_pa2pte(pa) | PG_V;
3588 	count = SPINLOCK_BACKOFF_MIN;
3589 	kpreempt_disable();
3590 startover:
3591 	while ((pvpte = pv_pte_first(pp)) != NULL) {
3592 		struct pmap *pmap;
3593 		struct pv_entry *pve;
3594 		pt_entry_t opte;
3595 		vaddr_t va;
3596 		int error;
3597 
3598 		/*
3599 		 * add a reference to the pmap before clearing the pte.
3600 		 * otherwise the pmap can disappear behind us.
3601 		 */
3602 
3603 		ptp = pvpte->pte_ptp;
3604 		pmap = ptp_to_pmap(ptp);
3605 		if (ptp != NULL) {
3606 			pmap_reference(pmap);
3607 		}
3608 
3609 		error = pmap_sync_pv(pvpte, expect, ~0, &opte);
3610 		if (error == EAGAIN) {
3611 			int hold_count;
3612 			KERNEL_UNLOCK_ALL(curlwp, &hold_count);
3613 			if (ptp != NULL) {
3614 				pmap_destroy(pmap);
3615 			}
3616 			SPINLOCK_BACKOFF(count);
3617 			KERNEL_LOCK(hold_count, curlwp);
3618 			goto startover;
3619 		}
3620 
3621 		pp->pp_attrs |= opte;
3622 		va = pvpte->pte_va;
3623 		pve = pmap_remove_pv(pp, ptp, va);
3624 
3625 		/* update the PTP reference count.  free if last reference. */
3626 		if (ptp != NULL) {
3627 			struct pmap *pmap2;
3628 			pt_entry_t *ptes;
3629 			pd_entry_t * const *pdes;
3630 
3631 			KASSERT(pmap != pmap_kernel());
3632 
3633 			pmap_tlb_shootnow();
3634 			pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
3635 			pmap_stats_update_bypte(pmap, 0, opte);
3636 			ptp->wire_count--;
3637 			if (ptp->wire_count <= 1) {
3638 				pmap_free_ptp(pmap, ptp, va, ptes, pdes);
3639 			}
3640 			pmap_unmap_ptes(pmap, pmap2);
3641 			pmap_destroy(pmap);
3642 		} else {
3643 			KASSERT(pmap == pmap_kernel());
3644 			pmap_stats_update_bypte(pmap, 0, opte);
3645 		}
3646 
3647 		if (pve != NULL) {
3648 			pve->pve_next = killlist;	/* mark it for death */
3649 			killlist = pve;
3650 		}
3651 	}
3652 	pmap_tlb_shootnow();
3653 	kpreempt_enable();
3654 
3655 	/* Now free unused pvs. */
3656 	pmap_free_pvs(killlist);
3657 }
3658 
3659 /*
3660  * pmap_page_remove: remove a managed vm_page from all pmaps that map it
3661  *
3662  * => R/M bits are sync'd back to attrs
3663  */
3664 
3665 void
3666 pmap_page_remove(struct vm_page *pg)
3667 {
3668 	struct pmap_page *pp;
3669 	paddr_t pa;
3670 
3671 	KASSERT(uvm_page_locked_p(pg));
3672 
3673 	pp = VM_PAGE_TO_PP(pg);
3674 	pa = VM_PAGE_TO_PHYS(pg);
3675 	pmap_pp_remove(pp, pa);
3676 }
3677 
3678 /*
3679  * pmap_pv_remove: remove an unmanaged pv-tracked page from all pmaps
3680  *	that map it
3681  */
3682 
3683 void
3684 pmap_pv_remove(paddr_t pa)
3685 {
3686 	struct pmap_page *pp;
3687 
3688 	pp = pmap_pv_tracked(pa);
3689 	if (pp == NULL)
3690 		panic("pmap_pv_remove: page not pv-tracked: 0x%"PRIxPADDR,
3691 		    pa);
3692 	pmap_pp_remove(pp, pa);
3693 }
3694 
3695 /*
3696  * p m a p   a t t r i b u t e  f u n c t i o n s
3697  * functions that test/change managed page's attributes
3698  * since a page can be mapped multiple times we must check each PTE that
3699  * maps it by going down the pv lists.
3700  */
3701 
3702 /*
3703  * pmap_test_attrs: test a page's attributes
3704  */
3705 
3706 bool
3707 pmap_test_attrs(struct vm_page *pg, unsigned testbits)
3708 {
3709 	struct pmap_page *pp;
3710 	struct pv_pte *pvpte;
3711 	pt_entry_t expect;
3712 	u_int result;
3713 
3714 	KASSERT(uvm_page_locked_p(pg));
3715 
3716 	pp = VM_PAGE_TO_PP(pg);
3717 	if ((pp->pp_attrs & testbits) != 0) {
3718 		return true;
3719 	}
3720 	expect = pmap_pa2pte(VM_PAGE_TO_PHYS(pg)) | PG_V;
3721 	kpreempt_disable();
3722 	for (pvpte = pv_pte_first(pp); pvpte; pvpte = pv_pte_next(pp, pvpte)) {
3723 		pt_entry_t opte;
3724 		int error;
3725 
3726 		if ((pp->pp_attrs & testbits) != 0) {
3727 			break;
3728 		}
3729 		error = pmap_sync_pv(pvpte, expect, 0, &opte);
3730 		if (error == 0) {
3731 			pp->pp_attrs |= opte;
3732 		}
3733 	}
3734 	result = pp->pp_attrs & testbits;
3735 	kpreempt_enable();
3736 
3737 	/*
3738 	 * note that we will exit the for loop early, with a non-NULL
3739 	 * pvpte, if we have found the bits we are testing for.
3740 	 */
3741 
3742 	return result != 0;
3743 }
3744 
3745 static bool
3746 pmap_pp_clear_attrs(struct pmap_page *pp, paddr_t pa, unsigned clearbits)
3747 {
3748 	struct pv_pte *pvpte;
3749 	u_int result;
3750 	pt_entry_t expect;
3751 	int count;
3752 
3753 	expect = pmap_pa2pte(pa) | PG_V;
3754 	count = SPINLOCK_BACKOFF_MIN;
3755 	kpreempt_disable();
3756 startover:
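	/*
	 * Walk every mapping of the page, syncing the referenced/modified
	 * bits and clearing the requested ones.  If pmap_sync_pv() loses a
	 * race (EAGAIN), drop the kernel lock, back off, and rescan.
	 */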
3757 	for (pvpte = pv_pte_first(pp); pvpte; pvpte = pv_pte_next(pp, pvpte)) {
3758 		pt_entry_t opte;
3759 		int error;
3760 
3761 		error = pmap_sync_pv(pvpte, expect, clearbits, &opte);
3762 		if (error == EAGAIN) {
3763 			int hold_count;
3764 			KERNEL_UNLOCK_ALL(curlwp, &hold_count);
3765 			SPINLOCK_BACKOFF(count);
3766 			KERNEL_LOCK(hold_count, curlwp);
3767 			goto startover;
3768 		}
3769 		pp->pp_attrs |= opte;
3770 	}
3771 	result = pp->pp_attrs & clearbits;
3772 	pp->pp_attrs &= ~clearbits;
3773 	pmap_tlb_shootnow();
3774 	kpreempt_enable();
3775 
3776 	return result != 0;
3777 }
3778 
3779 /*
3780  * pmap_clear_attrs: clear the specified attribute for a page.
3781  *
3782  * => we return true if we cleared one of the bits we were asked to
3783  */
3784 
3785 bool
3786 pmap_clear_attrs(struct vm_page *pg, unsigned clearbits)
3787 {
3788 	struct pmap_page *pp;
3789 	paddr_t pa;
3790 
3791 	KASSERT(uvm_page_locked_p(pg));
3792 
3793 	pp = VM_PAGE_TO_PP(pg);
3794 	pa = VM_PAGE_TO_PHYS(pg);
3795 
3796 	return pmap_pp_clear_attrs(pp, pa, clearbits);
3797 }
3798 
3799 /*
3800  * pmap_pv_clear_attrs: clear the specified attributes for an unmanaged
3801  *	pv-tracked page.
3802  */
3803 
3804 bool
3805 pmap_pv_clear_attrs(paddr_t pa, unsigned clearbits)
3806 {
3807 	struct pmap_page *pp;
3808 
3809 	pp = pmap_pv_tracked(pa);
3810 	if (pp == NULL)
3811 		panic("pmap_pv_clear_attrs: page not pv-tracked: 0x%"PRIxPADDR,
3812 		    pa);
3813 
3814 	return pmap_pp_clear_attrs(pp, pa, clearbits);
3815 }
3816 
3817 /*
3818  * p m a p   p r o t e c t i o n   f u n c t i o n s
3819  */
3820 
3821 /*
3822  * pmap_page_protect: change the protection of all recorded mappings
3823  *	of a managed page
3824  *
3825  * => NOTE: this is an inline function in pmap.h
3826  */
3827 
3828 /* see pmap.h */
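/*
 * roughly, the pmap.h inline does the following (sketch only; pmap.h
 * is the authoritative copy):
 *
 *	if ((prot & VM_PROT_WRITE) == 0) {
 *		if (prot & (VM_PROT_READ | VM_PROT_EXECUTE))
 *			(void)pmap_clear_attrs(pg, PG_RW);
 *		else
 *			pmap_page_remove(pg);
 *	}
 */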
3829 
3830 /*
3831  * pmap_pv_protect: change the protection of all recorded mappings
3832  *	of an unmanaged pv-tracked page
3833  *
3834  * => NOTE: this is an inline function in pmap.h
3835  */
3836 
3837 /* see pmap.h */
3838 
3839 /*
3840  * pmap_protect: set the protection of the pages in a pmap
3841  *
3842  * => NOTE: this is an inline function in pmap.h
3843  */
3844 
3845 /* see pmap.h */
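/*
 * roughly, the pmap.h inline does the following (sketch only; pmap.h
 * is the authoritative copy):
 *
 *	if ((prot & VM_PROT_WRITE) == 0) {
 *		if (prot & (VM_PROT_READ | VM_PROT_EXECUTE))
 *			pmap_write_protect(pmap, sva, eva, prot);
 *		else
 *			pmap_remove(pmap, sva, eva);
 *	}
 */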
3846 
3847 /*
3848  * pmap_write_protect: write-protect pages in a pmap.
3849  */
3850 void
3851 pmap_write_protect(struct pmap *pmap, vaddr_t sva, vaddr_t eva, vm_prot_t prot)
3852 {
3853 	pt_entry_t *ptes;
3854 	pd_entry_t * const *pdes;
3855 	struct pmap *pmap2;
3856 	vaddr_t blockend, va;
3857 
3858 	KASSERT(curlwp->l_md.md_gc_pmap != pmap);
3859 
3860 	sva &= PG_FRAME;
3861 	eva &= PG_FRAME;
3862 
3863 	/* Acquire pmap. */
3864 	kpreempt_disable();
3865 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
3866 
3867 	for (va = sva ; va < eva ; va = blockend) {
3868 		pt_entry_t *spte, *epte;
3869 		int i;
3870 
3871 		blockend = x86_round_pdr(va + 1);
3872 		if (blockend > eva)
3873 			blockend = eva;
3874 
3875 		/*
3876 		 * XXXCDC: our PTE mappings should never be write-protected!
3877 		 *
3878 		 * the long-term solution is to move the PTEs out of user
3879 		 * address space and into kernel address space (up with
3880 		 * APTE); then we can set VM_MAXUSER_ADDRESS to be
3881 		 * VM_MAX_ADDRESS.
3882 		 */
3883 
3884 		/* XXXCDC: ugly hack to avoid freeing PDP here */
3885 		for (i = 0; i < PDP_SIZE; i++) {
3886 			if (pl_i(va, PTP_LEVELS) == PDIR_SLOT_PTE+i)
3887 				continue;
3888 		}
3889 
3890 		/* Is it a valid block? */
3891 		if (!pmap_pdes_valid(va, pdes, NULL)) {
3892 			continue;
3893 		}
3894 		KASSERT(va < VM_MAXUSER_ADDRESS || va >= VM_MAX_ADDRESS);
3895 
3896 		spte = &ptes[pl1_i(va)];
3897 		epte = &ptes[pl1_i(blockend)];
3898 
3899 		for (/*null */; spte < epte ; spte++) {
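		/*
		 * atomically strip PG_RW from every valid, writable PTE in
		 * the block.  a TLB shootdown is only needed for entries
		 * with PG_M set, since only those can be cached writable
		 * in a TLB.
		 */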
3900 			pt_entry_t opte, npte;
3901 
3902 			do {
3903 				opte = *spte;
3904 				if ((~opte & (PG_RW | PG_V)) != 0) {
3905 					goto next;
3906 				}
3907 				npte = opte & ~PG_RW;
3908 			} while (pmap_pte_cas(spte, opte, npte) != opte);
3909 
3910 			if ((opte & PG_M) != 0) {
3911 				vaddr_t tva = x86_ptob(spte - ptes);
3912 				pmap_tlb_shootdown(pmap, tva, opte,
3913 				    TLBSHOOT_WRITE_PROTECT);
3914 			}
3915 next:;
3916 		}
3917 	}
3918 
3919 	/* Release pmap. */
3920 	pmap_unmap_ptes(pmap, pmap2);
3921 	kpreempt_enable();
3922 }
3923 
3924 /*
3925  * pmap_unwire: clear the wired bit in the PTE.
3926  *
3927  * => Mapping should already be present.
3928  */
3929 void
3930 pmap_unwire(struct pmap *pmap, vaddr_t va)
3931 {
3932 	pt_entry_t *ptes, *ptep, opte;
3933 	pd_entry_t * const *pdes;
3934 	struct pmap *pmap2;
3935 
3936 	/* Acquire pmap. */
3937 	kpreempt_disable();
3938 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
3939 
3940 	if (!pmap_pdes_valid(va, pdes, NULL)) {
3941 		panic("pmap_unwire: invalid PDE");
3942 	}
3943 
3944 	ptep = &ptes[pl1_i(va)];
3945 	opte = *ptep;
3946 	KASSERT(pmap_valid_entry(opte));
3947 
3948 	if (opte & PG_W) {
3949 		pt_entry_t npte = opte & ~PG_W;
3950 
3951 		opte = pmap_pte_testset(ptep, npte);
3952 		pmap_stats_update_bypte(pmap, npte, opte);
3953 	} else {
3954 		printf("pmap_unwire: wiring for pmap %p va 0x%lx "
3955 		    "did not change!\n", pmap, va);
3956 	}
3957 
3958 	/* Release pmap. */
3959 	pmap_unmap_ptes(pmap, pmap2);
3960 	kpreempt_enable();
3961 }
3962 
3963 /*
3964  * pmap_copy: copy mappings from one pmap to another
3965  *
3966  * => optional function
3967  * void pmap_copy(dst_pmap, src_pmap, dst_addr, len, src_addr)
3968  */
3969 
3970 /*
3971  * defined as macro in pmap.h
3972  */
3973 
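/*
 * pmap_enter() is a weak alias for pmap_enter_default(), which simply
 * maps with ma == pa and domid 0; the alias lets machine-dependent
 * code (e.g. Xen) substitute its own pmap_enter().
 */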
3974 __strict_weak_alias(pmap_enter, pmap_enter_default);
3975 
3976 int
3977 pmap_enter_default(pmap_t pmap, vaddr_t va, paddr_t pa, vm_prot_t prot,
3978     u_int flags)
3979 {
3980 	return pmap_enter_ma(pmap, va, pa, pa, prot, flags, 0);
3981 }
3982 
3983 /*
3984  * pmap_enter: enter a mapping into a pmap
3985  *
3986  * => must be done "now" ... no lazy-evaluation
3987  * => we set pmap => pv_head locking
3988  */
3989 int
3990 pmap_enter_ma(struct pmap *pmap, vaddr_t va, paddr_t ma, paddr_t pa,
3991 	   vm_prot_t prot, u_int flags, int domid)
3992 {
3993 	pt_entry_t *ptes, opte, npte;
3994 	pt_entry_t *ptep;
3995 	pd_entry_t * const *pdes;
3996 	struct vm_page *ptp;
3997 	struct vm_page *new_pg, *old_pg;
3998 	struct pmap_page *new_pp, *old_pp;
3999 	struct pv_entry *old_pve = NULL;
4000 	struct pv_entry *new_pve;
4001 	struct pv_entry *new_pve2;
4002 	int error;
4003 	bool wired = (flags & PMAP_WIRED) != 0;
4004 	struct pmap *pmap2;
4005 
4006 	KASSERT(pmap_initialized);
4007 	KASSERT(curlwp->l_md.md_gc_pmap != pmap);
4008 	KASSERT(va < VM_MAX_KERNEL_ADDRESS);
4009 	KASSERTMSG(va != (vaddr_t)PDP_BASE,
4010 	    "pmap_enter: trying to map over PDP!");
4011 	KASSERTMSG(va < VM_MIN_KERNEL_ADDRESS ||
4012 	    pmap_valid_entry(pmap->pm_pdir[pl_i(va, PTP_LEVELS)]),
4013 	    "pmap_enter: missing kernel PTP for VA %lx!", va);
4014 
4015 #ifdef XEN
4016 	KASSERT(domid == DOMID_SELF || pa == 0);
4017 #endif /* XEN */
4018 
4019 	npte = ma | protection_codes[prot] | PG_V;
4020 	npte |= pmap_pat_flags(flags);
4021 	if (wired)
4022 	        npte |= PG_W;
4023 	if (va < VM_MAXUSER_ADDRESS)
4024 		npte |= PG_u;
4025 	else if (va < VM_MAX_ADDRESS)
4026 		npte |= (PG_u | PG_RW);	/* XXXCDC: no longer needed? */
4027 	else
4028 		npte |= PG_k;
4029 	if (pmap == pmap_kernel())
4030 		npte |= pmap_pg_g;
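	/*
	 * preset the referenced (PG_U) and modified (PG_M) bits when the
	 * caller's access type says the page is being read/written right
	 * now, so the MMU need not fault again just to set them.
	 */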
4031 	if (flags & VM_PROT_ALL) {
4032 		npte |= PG_U;
4033 		if (flags & VM_PROT_WRITE) {
4034 			KASSERT((npte & PG_RW) != 0);
4035 			npte |= PG_M;
4036 		}
4037 	}
4038 
4039 #ifdef XEN
4040 	if (domid != DOMID_SELF)
4041 		new_pg = NULL;
4042 	else
4043 #endif
4044 		new_pg = PHYS_TO_VM_PAGE(pa);
4045 	if (new_pg != NULL) {
4046 		/* This is a managed page */
4047 		npte |= PG_PVLIST;
4048 		new_pp = VM_PAGE_TO_PP(new_pg);
4049 	} else if ((new_pp = pmap_pv_tracked(pa)) != NULL) {
4050 		/* This is an unmanaged pv-tracked page */
4051 		npte |= PG_PVLIST;
4052 	} else {
4053 		new_pp = NULL;
4054 	}
4055 
4056 	/*
	 * get pves: allocate both pv entries up front, before any locks
	 * are taken; whichever of them end up unused are returned to the
	 * pool at the end of this function.
	 */
4057 	new_pve = pool_cache_get(&pmap_pv_cache, PR_NOWAIT);
4058 	new_pve2 = pool_cache_get(&pmap_pv_cache, PR_NOWAIT);
4059 	if (new_pve == NULL || new_pve2 == NULL) {
4060 		if (flags & PMAP_CANFAIL) {
4061 			error = ENOMEM;
4062 			goto out2;
4063 		}
4064 		panic("pmap_enter: pve allocation failed");
4065 	}
4066 
4067 	kpreempt_disable();
4068 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);	/* locks pmap */
4069 	if (pmap == pmap_kernel()) {
4070 		ptp = NULL;
4071 	} else {
4072 		ptp = pmap_get_ptp(pmap, va, pdes);
4073 		if (ptp == NULL) {
4074 			pmap_unmap_ptes(pmap, pmap2);
4075 			if (flags & PMAP_CANFAIL) {
4076 				error = ENOMEM;
4077 				goto out;
4078 			}
4079 			panic("pmap_enter: get ptp failed");
4080 		}
4081 	}
4082 
4083 	/*
4084 	 * update the pte: loop until pmap_pte_cas() succeeds without the
	 * PTE changing underneath us.  (under Xen, a mapping for a foreign
	 * domain is updated via xpq_update_foreign() instead.)
4085 	 */
4086 
4087 	ptep = &ptes[pl1_i(va)];
4088 	do {
4089 		opte = *ptep;
4090 
4091 		/*
4092 		 * if the same page, inherit PG_U and PG_M.
4093 		 */
4094 		if (((opte ^ npte) & (PG_FRAME | PG_V)) == 0) {
4095 			npte |= opte & (PG_U | PG_M);
4096 		}
4097 #if defined(XEN)
4098 		if (domid != DOMID_SELF) {
4099 			/* pmap_pte_cas with error handling */
4100 			int s = splvm();
4101 			if (opte != *ptep) {
4102 				splx(s);
4103 				continue;
4104 			}
4105 			error = xpq_update_foreign(
4106 			    vtomach((vaddr_t)ptep), npte, domid);
4107 			splx(s);
4108 			if (error) {
4109 				if (ptp != NULL && ptp->wire_count <= 1) {
4110 					pmap_free_ptp(pmap, ptp, va, ptes, pdes);
4111 				}
4112 				pmap_unmap_ptes(pmap, pmap2);
4113 				goto out;
4114 			}
4115 			break;
4116 		}
4117 #endif /* defined(XEN) */
4118 	} while (pmap_pte_cas(ptep, opte, npte) != opte);
4119 
4120 	/*
4121 	 * update statistics and PTP's reference count.
4122 	 */
4123 
4124 	pmap_stats_update_bypte(pmap, npte, opte);
4125 	if (ptp != NULL && !pmap_valid_entry(opte)) {
4126 		ptp->wire_count++;
4127 	}
4128 	KASSERT(ptp == NULL || ptp->wire_count > 1);
4129 
4130 	/*
4131 	 * if the same page, we can skip pv_entry handling.
4132 	 */
4133 
4134 	if (((opte ^ npte) & (PG_FRAME | PG_V)) == 0) {
4135 		KASSERT(((opte ^ npte) & PG_PVLIST) == 0);
4136 		goto same_pa;
4137 	}
4138 
4139 	/*
4140 	 * if old page is pv-tracked, remove pv_entry from its list.
4141 	 */
4142 
4143 	if ((~opte & (PG_V | PG_PVLIST)) == 0) {
4144 		if ((old_pg = PHYS_TO_VM_PAGE(pmap_pte2pa(opte))) != NULL) {
4145 			KASSERT(uvm_page_locked_p(old_pg));
4146 			old_pp = VM_PAGE_TO_PP(old_pg);
4147 		} else if ((old_pp = pmap_pv_tracked(pmap_pte2pa(opte)))
4148 		    == NULL) {
4149 			pa = pmap_pte2pa(opte);
4150 			panic("pmap_enter: PG_PVLIST with pv-untracked page"
4151 			    " va = 0x%"PRIxVADDR
4152 			    " pa = 0x%" PRIxPADDR " (0x%" PRIxPADDR ")",
4153 			    va, pa, atop(pa));
4154 		}
4155 
4156 		old_pve = pmap_remove_pv(old_pp, ptp, va);
4157 		old_pp->pp_attrs |= opte;
4158 	}
4159 
4160 	/*
4161 	 * if new page is pv-tracked, insert pv_entry into its list.
4162 	 */
4163 
4164 	if (new_pp) {
4165 		new_pve = pmap_enter_pv(new_pp, new_pve, &new_pve2, ptp, va);
4166 	}
4167 
4168 same_pa:
4169 	pmap_unmap_ptes(pmap, pmap2);
4170 
4171 	/*
4172 	 * shootdown tlb if necessary: only if the old PTE was both valid
	 * and referenced (and so may be cached in a TLB) and the frame or
	 * write permission changed.
4173 	 */
4174 
4175 	if ((~opte & (PG_V | PG_U)) == 0 &&
4176 	    ((opte ^ npte) & (PG_FRAME | PG_RW)) != 0) {
4177 		pmap_tlb_shootdown(pmap, va, opte, TLBSHOOT_ENTER);
4178 	}
4179 
4180 	error = 0;
4181 out:
4182 	kpreempt_enable();
4183 out2:
4184 	if (old_pve != NULL) {
4185 		pool_cache_put(&pmap_pv_cache, old_pve);
4186 	}
4187 	if (new_pve != NULL) {
4188 		pool_cache_put(&pmap_pv_cache, new_pve);
4189 	}
4190 	if (new_pve2 != NULL) {
4191 		pool_cache_put(&pmap_pv_cache, new_pve2);
4192 	}
4193 
4194 	return error;
4195 }
4196 
4197 static bool
4198 pmap_get_physpage(vaddr_t va, int level, paddr_t *paddrp)
4199 {
4200 	struct vm_page *ptp;
4201 	struct pmap *kpm = pmap_kernel();
4202 
4203 	if (!uvm.page_init_done) {
4204 
4205 		/*
4206 		 * we're growing the kernel pmap early (from
4207 		 * uvm_pageboot_alloc()).  this case must be
4208 		 * handled a little differently.
4209 		 */
4210 
4211 		if (!uvm_page_physget(paddrp))
4212 			panic("pmap_get_physpage: out of memory");
4213 #if defined(__HAVE_DIRECT_MAP)
4214 		pagezero(PMAP_DIRECT_MAP(*paddrp));
4215 #else
4216 #if defined(XEN)
4217 		if (XEN_VERSION_SUPPORTED(3, 4)) {
4218 			xen_pagezero(*paddrp);
4219 			return true;
4220 		}
4221 #endif
4222 		kpreempt_disable();
4223 		pmap_pte_set(early_zero_pte,
4224 		    pmap_pa2pte(*paddrp) | PG_V | PG_RW | PG_k);
4225 		pmap_pte_flush();
4226 		pmap_update_pg((vaddr_t)early_zerop);
4227 		memset(early_zerop, 0, PAGE_SIZE);
4228 #if defined(DIAGNOSTIC) || defined(XEN)
4229 		pmap_pte_set(early_zero_pte, 0);
4230 		pmap_pte_flush();
4231 #endif /* defined(DIAGNOSTIC) || defined(XEN) */
4232 		kpreempt_enable();
4233 #endif /* defined(__HAVE_DIRECT_MAP) */
4234 	} else {
4235 		/* XXX */
4236 		ptp = uvm_pagealloc(NULL, 0, NULL,
4237 				    UVM_PGA_USERESERVE|UVM_PGA_ZERO);
4238 		if (ptp == NULL)
4239 			panic("pmap_get_physpage: out of memory");
4240 		ptp->flags &= ~PG_BUSY;
4241 		ptp->wire_count = 1;
4242 		*paddrp = VM_PAGE_TO_PHYS(ptp);
4243 	}
4244 	pmap_stats_update(kpm, 1, 0);
4245 	return true;
4246 }
4247 
4248 /*
4249  * Allocate the specified number of PTPs for each PTP level, and populate
4250  * all levels below accordingly, mapping virtual addresses starting at
4251  * kva.
4252  *
4253  * Used by pmap_growkernel.
4254  */
4255 static void
4256 pmap_alloc_level(pd_entry_t * const *pdes, vaddr_t kva, int lvl,
4257     long *needed_ptps)
4258 {
4259 	unsigned long i;
4260 	vaddr_t va;
4261 	paddr_t pa;
4262 	unsigned long index, endindex;
4263 	int level;
4264 	pd_entry_t *pdep;
4265 #ifdef XEN
4266 	int s = splvm(); /* protect xpq_* */
4267 #endif
4268 
4269 	for (level = lvl; level > 1; level--) {
4270 		if (level == PTP_LEVELS)
4271 			pdep = pmap_kernel()->pm_pdir;
4272 		else
4273 			pdep = pdes[level - 2];
4274 		va = kva;
4275 		index = pl_i_roundup(kva, level);
4276 		endindex = index + needed_ptps[level - 1] - 1;
4277 
4278 
4279 		for (i = index; i <= endindex; i++) {
4280 			pt_entry_t pte;
4281 
4282 			KASSERT(!pmap_valid_entry(pdep[i]));
4283 			pmap_get_physpage(va, level - 1, &pa);
4284 			pte = pmap_pa2pte(pa) | PG_k | PG_V | PG_RW;
4285 #ifdef XEN
4286 			pmap_pte_set(&pdep[i], pte);
4287 #if defined(PAE) || defined(__x86_64__)
4288 			if (level == PTP_LEVELS && i >= PDIR_SLOT_KERN) {
4289 				if (__predict_true(
4290 				    cpu_info_primary.ci_flags & CPUF_PRESENT)) {
4291 					/* update per-cpu PMDs on all cpus */
4292 					xen_kpm_sync(pmap_kernel(), i);
4293 				} else {
4294 					/*
4295 					 * too early; update primary CPU
4296 					 * PMD only (without locks)
4297 					 */
4298 #ifdef PAE
4299 					pd_entry_t *cpu_pdep =
4300 					    &cpu_info_primary.ci_kpm_pdir[l2tol2(i)];
4301 #endif
4302 #ifdef __x86_64__
4303 					pd_entry_t *cpu_pdep =
4304 						&cpu_info_primary.ci_kpm_pdir[i];
4305 #endif
4306 					pmap_pte_set(cpu_pdep, pte);
4307 				}
4308 			}
4309 #endif /* PAE || __x86_64__ */
4310 #else /* XEN */
4311 			pdep[i] = pte;
4312 #endif /* XEN */
4313 			KASSERT(level != PTP_LEVELS || nkptp[level - 1] +
4314 			    pl_i(VM_MIN_KERNEL_ADDRESS, level) == i);
4315 			nkptp[level - 1]++;
4316 			va += nbpd[level - 1];
4317 		}
4318 		pmap_pte_flush();
4319 	}
4320 #ifdef XEN
4321 	splx(s);
4322 #endif
4323 }
4324 
4325 /*
4326  * pmap_growkernel: increase usage of KVM space
4327  *
4328  * => we allocate new PTPs for the kernel and install them in all
4329  *	the pmaps on the system.
4330  */
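/*
 * (the caller passes the highest kernel VA it needs; we grow the
 * managed range to at least that address, rounded up to a PDE
 * boundary, and return the new pmap_maxkvaddr.)
 */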
4331 
4332 vaddr_t
4333 pmap_growkernel(vaddr_t maxkvaddr)
4334 {
4335 	struct pmap *kpm = pmap_kernel();
4336 #if !defined(XEN) || !defined(__x86_64__)
4337 	struct pmap *pm;
4338 	long old;
4339 #endif
4340 	int s, i;
4341 	long needed_kptp[PTP_LEVELS], target_nptp;
4342 	bool invalidate = false;
4343 
4344 	s = splvm();	/* to be safe */
4345 	mutex_enter(kpm->pm_lock);
4346 
4347 	if (maxkvaddr <= pmap_maxkvaddr) {
4348 		mutex_exit(kpm->pm_lock);
4349 		splx(s);
4350 		return pmap_maxkvaddr;
4351 	}
4352 
4353 	maxkvaddr = x86_round_pdr(maxkvaddr);
4354 #if !defined(XEN) || !defined(__x86_64__)
4355 	old = nkptp[PTP_LEVELS - 1];
4356 #endif
4357 
4358 	/*
4359 	 * This loop could be optimized more, but pmap_growkernel()
4360 	 * is called infrequently.
4361 	 */
4362 	for (i = PTP_LEVELS - 1; i >= 1; i--) {
4363 		target_nptp = pl_i_roundup(maxkvaddr, i + 1) -
4364 		    pl_i_roundup(VM_MIN_KERNEL_ADDRESS, i + 1);
4365 		/*
4366 		 * XXX only need to check toplevel.
4367 		 */
4368 		if (target_nptp > nkptpmax[i])
4369 			panic("out of KVA space");
4370 		KASSERT(target_nptp >= nkptp[i]);
4371 		needed_kptp[i] = target_nptp - nkptp[i];
4372 	}
4373 
4374 	pmap_alloc_level(normal_pdes, pmap_maxkvaddr, PTP_LEVELS, needed_kptp);
4375 
4376 	/*
4377 	 * If the number of top level entries changed, update all
4378 	 * pmaps.
4379 	 */
4380 	if (needed_kptp[PTP_LEVELS - 1] != 0) {
4381 #ifdef XEN
4382 #ifdef __x86_64__
4383 		/* nothing, kernel entries are never entered in user pmap */
4384 #else /* __x86_64__ */
4385 		mutex_enter(&pmaps_lock);
4386 		LIST_FOREACH(pm, &pmaps, pm_list) {
4387 			int pdkidx;
4388 			for (pdkidx =  PDIR_SLOT_KERN + old;
4389 			    pdkidx < PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1];
4390 			    pdkidx++) {
4391 				pmap_pte_set(&pm->pm_pdir[pdkidx],
4392 				    kpm->pm_pdir[pdkidx]);
4393 			}
4394 			pmap_pte_flush();
4395 		}
4396 		mutex_exit(&pmaps_lock);
4397 #endif /* __x86_64__ */
4398 #else /* XEN */
4399 		unsigned newpdes;
4400 		newpdes = nkptp[PTP_LEVELS - 1] - old;
4401 		mutex_enter(&pmaps_lock);
4402 		LIST_FOREACH(pm, &pmaps, pm_list) {
4403 			memcpy(&pm->pm_pdir[PDIR_SLOT_KERN + old],
4404 			       &kpm->pm_pdir[PDIR_SLOT_KERN + old],
4405 			       newpdes * sizeof (pd_entry_t));
4406 		}
4407 		mutex_exit(&pmaps_lock);
4408 #endif
4409 		invalidate = true;
4410 	}
4411 	pmap_maxkvaddr = maxkvaddr;
4412 	mutex_exit(kpm->pm_lock);
4413 	splx(s);
4414 
4415 	if (invalidate && pmap_initialized) {
4416 		/* Invalidate the PDP cache. */
4417 		pool_cache_invalidate(&pmap_pdp_cache);
4418 	}
4419 
4420 	return maxkvaddr;
4421 }
4422 
4423 #ifdef DEBUG
4424 void pmap_dump(struct pmap *, vaddr_t, vaddr_t);
4425 
4426 /*
4427  * pmap_dump: dump all the mappings from a pmap
4428  *
4429  * => caller should not be holding any pmap locks
4430  */
4431 
4432 void
4433 pmap_dump(struct pmap *pmap, vaddr_t sva, vaddr_t eva)
4434 {
4435 	pt_entry_t *ptes, *pte;
4436 	pd_entry_t * const *pdes;
4437 	struct pmap *pmap2;
4438 	vaddr_t blkendva;
4439 
4440 	/*
4441 	 * if end is out of range, truncate it.
4442 	 * if end <= start, set end to the maximum user address.
4443 	 */
4444 
4445 	if (eva > VM_MAXUSER_ADDRESS || eva <= sva)
4446 		eva = VM_MAXUSER_ADDRESS;
4447 
4448 	/*
4449 	 * we lock in the pmap => pv_head direction
4450 	 */
4451 
4452 	kpreempt_disable();
4453 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);	/* locks pmap */
4454 
4455 	/*
4456 	 * dumping a range of pages: we dump in PTP-sized blocks
4457 	 */
4458 
4459 	for (/* null */ ; sva < eva ; sva = blkendva) {
4460 
4461 		/* determine range of block */
4462 		blkendva = x86_round_pdr(sva+1);
4463 		if (blkendva > eva)
4464 			blkendva = eva;
4465 
4466 		/* valid block? */
4467 		if (!pmap_pdes_valid(sva, pdes, NULL))
4468 			continue;
4469 
4470 		pte = &ptes[pl1_i(sva)];
4471 		for (/* null */; sva < blkendva ; sva += PAGE_SIZE, pte++) {
4472 			if (!pmap_valid_entry(*pte))
4473 				continue;
4474 			printf("va %#" PRIxVADDR " -> pa %#" PRIxPADDR
4475 			    " (pte=%#" PRIxPADDR ")\n",
4476 			    sva, (paddr_t)pmap_pte2pa(*pte), (paddr_t)*pte);
4477 		}
4478 	}
4479 	pmap_unmap_ptes(pmap, pmap2);
4480 	kpreempt_enable();
4481 }
4482 #endif
4483 
4484 /*
4485  * pmap_update: process deferred invalidations and frees.
4486  */
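/*
 * (per pmap(9), callers batch their pmap_enter()/pmap_remove()/
 * pmap_kenter_pa()/pmap_kremove() calls and then call pmap_update()
 * before relying on the affected mappings.)
 */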
4487 
4488 void
4489 pmap_update(struct pmap *pmap)
4490 {
4491 	struct vm_page *empty_ptps;
4492 	lwp_t *l = curlwp;
4493 
4494 	/*
4495 	 * If we have torn down this pmap, invalidate non-global TLB
4496 	 * entries on any processors using it.
4497 	 */
4498 	kpreempt_disable();
4499 	if (__predict_false(l->l_md.md_gc_pmap == pmap)) {
4500 		l->l_md.md_gc_pmap = NULL;
4501 		pmap_tlb_shootdown(pmap, (vaddr_t)-1LL, 0, TLBSHOOT_UPDATE);
4502 	}
4503 	/*
4504 	 * Initiate any pending TLB shootdowns.  Wait for them to
4505 	 * complete before returning control to the caller.
4506 	 */
4507 	pmap_tlb_shootnow();
4508 	kpreempt_enable();
4509 
4510 	/*
4511 	 * Now that shootdowns are complete, process deferred frees,
4512 	 * but not from interrupt context.
4513 	 */
4514 	if (l->l_md.md_gc_ptp != NULL) {
4515 		KASSERT((l->l_pflag & LP_INTR) == 0);
4516 		if (cpu_intr_p()) {
4517 			return;
4518 		}
4519 		empty_ptps = l->l_md.md_gc_ptp;
4520 		l->l_md.md_gc_ptp = NULL;
4521 		pmap_free_ptps(empty_ptps);
4522 	}
4523 }
4524 
4525 #if PTP_LEVELS > 4
4526 #error "Unsupported number of page table mappings"
4527 #endif
4528 
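/*
 * pmap_init_tmp_pgtbl: build a temporary page table tree in low memory
 * that maps the page at 'pg' at its own virtual address (in addition
 * to the normal kernel mappings) and return the physical address to
 * load into %cr3.  used, for example, by the MP boot and ACPI wakeup
 * trampolines, which start out in real mode.
 */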
4529 paddr_t
4530 pmap_init_tmp_pgtbl(paddr_t pg)
4531 {
4532 	static bool maps_loaded;
4533 	static const paddr_t x86_tmp_pml_paddr[] = {
4534 	    4 * PAGE_SIZE,	/* L1 */
4535 	    5 * PAGE_SIZE,	/* L2 */
4536 	    6 * PAGE_SIZE,	/* L3 */
4537 	    7 * PAGE_SIZE	/* L4 */
4538 	};
4539 	static vaddr_t x86_tmp_pml_vaddr[] = { 0, 0, 0, 0 };
4540 
4541 	pd_entry_t *tmp_pml, *kernel_pml;
4542 
4543 	int level;
4544 
4545 	if (!maps_loaded) {
4546 		for (level = 0; level < PTP_LEVELS; ++level) {
4547 			x86_tmp_pml_vaddr[level] =
4548 			    uvm_km_alloc(kernel_map, PAGE_SIZE, 0,
4549 			    UVM_KMF_VAONLY);
4550 
4551 			if (x86_tmp_pml_vaddr[level] == 0)
4552 				panic("mapping of real mode PML failed\n");
4553 			pmap_kenter_pa(x86_tmp_pml_vaddr[level],
4554 			    x86_tmp_pml_paddr[level],
4555 			    VM_PROT_READ | VM_PROT_WRITE, 0);
4556 			pmap_update(pmap_kernel());
4557 		}
4558 		maps_loaded = true;
4559 	}
4560 
4561 	/* Zero levels 1-3 */
4562 	for (level = 0; level < PTP_LEVELS - 1; ++level) {
4563 		tmp_pml = (void *)x86_tmp_pml_vaddr[level];
4564 		memset(tmp_pml, 0, PAGE_SIZE);
4565 	}
4566 
4567 	/* Copy PML4 */
4568 	kernel_pml = pmap_kernel()->pm_pdir;
4569 	tmp_pml = (void *)x86_tmp_pml_vaddr[PTP_LEVELS - 1];
4570 	memcpy(tmp_pml, kernel_pml, PAGE_SIZE);
4571 
4572 #ifdef PAE
4573 	/*
4574 	 * Use the last 4 entries of the L2 page as L3 PD entries. These
4575 	 * last entries are unlikely to be used for temporary mappings.
4576 	 * 508: maps 0->1GB (userland)
4577 	 * 509: unused
4578 	 * 510: unused
4579 	 * 511: maps 3->4GB (kernel)
4580 	 */
4581 	tmp_pml[508] = x86_tmp_pml_paddr[PTP_LEVELS - 1] | PG_V;
4582 	tmp_pml[509] = 0;
4583 	tmp_pml[510] = 0;
4584 	tmp_pml[511] = pmap_pdirpa(pmap_kernel(), PDIR_SLOT_KERN) | PG_V;
4585 #endif
4586 
4587 	for (level = PTP_LEVELS - 1; level > 0; --level) {
4588 		tmp_pml = (void *)x86_tmp_pml_vaddr[level];
4589 
4590 		tmp_pml[pl_i(pg, level + 1)] =
4591 		    (x86_tmp_pml_paddr[level - 1] & PG_FRAME) | PG_RW | PG_V;
4592 	}
4593 
4594 	tmp_pml = (void *)x86_tmp_pml_vaddr[0];
4595 	tmp_pml[pl_i(pg, 1)] = (pg & PG_FRAME) | PG_RW | PG_V;
4596 
4597 #ifdef PAE
4598 	/* Return the PA of the L3 page (entry 508 of the L2 page) */
4599 	return x86_tmp_pml_paddr[PTP_LEVELS - 1] + 508 * sizeof(pd_entry_t);
4600 #endif
4601 
4602 	return x86_tmp_pml_paddr[PTP_LEVELS - 1];
4603 }
4604 
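/*
 * x86_mmap_flags: convert the machine-dependent mmap(2) flag bits
 * encoded in a device page number into pmap flags.  currently only
 * X86_MMAP_FLAG_PREFETCH is honoured, mapping to PMAP_WRITE_COMBINE.
 */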
4605 u_int
4606 x86_mmap_flags(paddr_t mdpgno)
4607 {
4608 	u_int nflag = (mdpgno >> X86_MMAP_FLAG_SHIFT) & X86_MMAP_FLAG_MASK;
4609 	u_int pflag = 0;
4610 
4611 	if (nflag & X86_MMAP_FLAG_PREFETCH)
4612 		pflag |= PMAP_WRITE_COMBINE;
4613 
4614 	return pflag;
4615 }
4616