1 /*	$NetBSD: pmap.c,v 1.220 2016/08/19 18:24:57 maxv Exp $	*/
2 
3 /*-
4  * Copyright (c) 2008, 2010, 2016 The NetBSD Foundation, Inc.
5  * All rights reserved.
6  *
7  * This code is derived from software contributed to The NetBSD Foundation
8  * by Andrew Doran, and by Maxime Villard.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29  * POSSIBILITY OF SUCH DAMAGE.
30  */
31 
32 /*
33  * Copyright (c) 2007 Manuel Bouyer.
34  *
35  * Redistribution and use in source and binary forms, with or without
36  * modification, are permitted provided that the following conditions
37  * are met:
38  * 1. Redistributions of source code must retain the above copyright
39  *    notice, this list of conditions and the following disclaimer.
40  * 2. Redistributions in binary form must reproduce the above copyright
41  *    notice, this list of conditions and the following disclaimer in the
42  *    documentation and/or other materials provided with the distribution.
43  *
44  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
45  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
46  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
47  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
48  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
49  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
50  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
51  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
52  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
53  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
54  *
55  */
56 
57 /*
58  * Copyright (c) 2006 Mathieu Ropert <mro@adviseo.fr>
59  *
60  * Permission to use, copy, modify, and distribute this software for any
61  * purpose with or without fee is hereby granted, provided that the above
62  * copyright notice and this permission notice appear in all copies.
63  *
64  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
65  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
66  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
67  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
68  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
69  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
70  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
71  */
72 
73 /*
74  * Copyright (c) 1997 Charles D. Cranor and Washington University.
75  * All rights reserved.
76  *
77  * Redistribution and use in source and binary forms, with or without
78  * modification, are permitted provided that the following conditions
79  * are met:
80  * 1. Redistributions of source code must retain the above copyright
81  *    notice, this list of conditions and the following disclaimer.
82  * 2. Redistributions in binary form must reproduce the above copyright
83  *    notice, this list of conditions and the following disclaimer in the
84  *    documentation and/or other materials provided with the distribution.
85  *
86  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
87  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
88  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
89  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
90  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
91  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
92  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
93  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
94  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
95  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
96  */
97 
98 /*
99  * Copyright 2001 (c) Wasabi Systems, Inc.
100  * All rights reserved.
101  *
102  * Written by Frank van der Linden for Wasabi Systems, Inc.
103  *
104  * Redistribution and use in source and binary forms, with or without
105  * modification, are permitted provided that the following conditions
106  * are met:
107  * 1. Redistributions of source code must retain the above copyright
108  *    notice, this list of conditions and the following disclaimer.
109  * 2. Redistributions in binary form must reproduce the above copyright
110  *    notice, this list of conditions and the following disclaimer in the
111  *    documentation and/or other materials provided with the distribution.
112  * 3. All advertising materials mentioning features or use of this software
113  *    must display the following acknowledgement:
114  *      This product includes software developed for the NetBSD Project by
115  *      Wasabi Systems, Inc.
116  * 4. The name of Wasabi Systems, Inc. may not be used to endorse
117  *    or promote products derived from this software without specific prior
118  *    written permission.
119  *
120  * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
121  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
122  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
123  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL WASABI SYSTEMS, INC
124  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
125  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
126  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
127  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
128  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
129  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
130  * POSSIBILITY OF SUCH DAMAGE.
131  */
132 
133 /*
134  * This is the i386 pmap modified and generalized to support x86-64
135  * as well. The idea is to hide the upper N levels of the page tables
136  * inside pmap_get_ptp, pmap_free_ptp and pmap_growkernel. The rest
137  * is mostly untouched, except that it uses some more generalized
138  * macros and interfaces.
139  *
140  * This pmap has been tested on the i386 as well, and it can be easily
141  * adapted to PAE.
142  *
143  * fvdl@wasabisystems.com 18-Jun-2001
144  */
145 
146 /*
147  * pmap.c: i386 pmap module rewrite
148  * Chuck Cranor <chuck@netbsd>
149  * 11-Aug-97
150  *
151  * history of this pmap module: in addition to my own input, i used
152  *    the following references for this rewrite of the i386 pmap:
153  *
154  * [1] the NetBSD i386 pmap.   this pmap appears to be based on the
155  *     BSD hp300 pmap done by Mike Hibler at University of Utah.
156  *     it was then ported to the i386 by William Jolitz of UUNET
157  *     Technologies, Inc.   Then Charles M. Hannum of the NetBSD
158  *     project fixed some bugs and provided some speed ups.
159  *
160  * [2] the FreeBSD i386 pmap.   this pmap seems to be the
161  *     Hibler/Jolitz pmap, as modified for FreeBSD by John S. Dyson
162  *     and David Greenman.
163  *
164  * [3] the Mach pmap.   this pmap, from CMU, seems to have migrated
165  *     between several processors.   the VAX version was done by
166  *     Avadis Tevanian, Jr., and Michael Wayne Young.    the i386
167  *     version was done by Lance Berc, Mike Kupfer, Bob Baron,
168  *     David Golub, and Richard Draves.    the alpha version was
169  *     done by Alessandro Forin (CMU/Mach) and Chris Demetriou
170  *     (NetBSD/alpha).
171  */
172 
173 #include <sys/cdefs.h>
174 __KERNEL_RCSID(0, "$NetBSD: pmap.c,v 1.220 2016/08/19 18:24:57 maxv Exp $");
175 
176 #include "opt_user_ldt.h"
177 #include "opt_lockdebug.h"
178 #include "opt_multiprocessor.h"
179 #include "opt_xen.h"
180 #if !defined(__x86_64__)
181 #include "opt_kstack_dr0.h"
182 #endif /* !defined(__x86_64__) */
183 
184 #include <sys/param.h>
185 #include <sys/systm.h>
186 #include <sys/proc.h>
187 #include <sys/pool.h>
188 #include <sys/kernel.h>
189 #include <sys/atomic.h>
190 #include <sys/cpu.h>
191 #include <sys/intr.h>
192 #include <sys/xcall.h>
193 #include <sys/kcore.h>
194 
195 #include <uvm/uvm.h>
196 #include <uvm/pmap/pmap_pvt.h>
197 
198 #include <dev/isa/isareg.h>
199 
200 #include <machine/specialreg.h>
201 #include <machine/gdt.h>
202 #include <machine/isa_machdep.h>
203 #include <machine/cpuvar.h>
204 #include <machine/cputypes.h>
205 
206 #include <x86/pmap.h>
207 #include <x86/pmap_pv.h>
208 
209 #include <x86/i82489reg.h>
210 #include <x86/i82489var.h>
211 
212 #ifdef XEN
213 #include <xen/xen-public/xen.h>
214 #include <xen/hypervisor.h>
215 #endif
216 
217 /*
218  * general info:
219  *
220  *  - for an explanation of how the i386 MMU hardware works see
221  *    the comments in <machine/pte.h>.
222  *
223  *  - for an explanation of the general memory structure used by
224  *    this pmap (including the recursive mapping), see the comments
225  *    in <machine/pmap.h>.
226  *
227  * this file contains the code for the "pmap module."   the module's
228  * job is to manage the hardware's virtual to physical address mappings.
229  * note that there are two levels of mapping in the VM system:
230  *
231  *  [1] the upper layer of the VM system uses vm_map's and vm_map_entry's
232  *      to map ranges of virtual address space to objects/files.  for
233  *      example, the vm_map may say: "map VA 0x1000 to 0x22000 read-only
234  *      to the file /bin/ls starting at offset zero."   note that
235  *      the upper layer mapping is not concerned with how individual
236  *      vm_pages are mapped.
237  *
238  *  [2] the lower layer of the VM system (the pmap) maintains the mappings
239  *      from virtual addresses.   it is concerned with which vm_page is
240  *      mapped where.   for example, when you run /bin/ls and start
241  *      at page 0x1000 the fault routine may lookup the correct page
242  *      of the /bin/ls file and then ask the pmap layer to establish
243  *      a mapping for it.
244  *
245  * note that information in the lower layer of the VM system can be
246  * thrown away since it can easily be reconstructed from the info
247  * in the upper layer.
248  *
249  * data structures we use include:
250  *
251  *  - struct pmap: describes the address space of one thread
252  *  - struct pmap_page: describes one pv-tracked page, without
253  *	necessarily a corresponding vm_page
254  *  - struct pv_entry: describes one <PMAP,VA> mapping of a PA
255  *  - struct pv_head: there is one pv_head per pv-tracked page of
256  *	physical memory.   the pv_head points to a list of pv_entry
257  *	structures which describe all the <PMAP,VA> pairs that this
258  *      page is mapped in.    this is critical for page based operations
259  *      such as pmap_page_protect() [change protection on _all_ mappings
260  *      of a page]
261  */
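
/*
 * example of a page-based operation walking every <PMAP,VA> pair of a
 * pv-tracked page (illustrative sketch only, for some struct vm_page *pg;
 * the real iteration uses pv_pte_first()/pv_pte_next() defined below):
 *
 *	struct pmap_page *pp = VM_PAGE_TO_PP(pg);
 *	struct pv_pte *pvpte;
 *
 *	for (pvpte = pv_pte_first(pp); pvpte != NULL;
 *	     pvpte = pv_pte_next(pp, pvpte)) {
 *		... operate on the <pvpte->pte_ptp, pvpte->pte_va> mapping ...
 *	}
 */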
262 
263 /*
264  * memory allocation
265  *
266  *  - there are three data structures that we must dynamically allocate:
267  *
268  * [A] new process' page directory page (PDP)
269  *	- allocated at pmap_create() time: we use
270  *	  uvm_km_alloc(kernel_map, PAGE_SIZE)  [fka kmem_alloc] to do this
271  *	  allocation.
272  *
273  * if we are low in free physical memory then we sleep in
274  * uvm_km_alloc -- in this case this is ok since we are creating
275  * a new pmap and should not be holding any locks.
276  *
277  * if the kernel is totally out of virtual space
278  * (i.e. uvm_km_alloc returns NULL), then we panic.
279  *
280  * [B] new page tables pages (PTP)
281  * 	- call uvm_pagealloc()
282  * 		=> success: zero page, add to pm_pdir
283  * 		=> failure: we are out of free vm_pages, let pmap_enter()
284  *		   tell UVM about it.
285  *
286  * note: for kernel PTPs, we start with NKPTP of them.   as we map
287  * kernel memory (at uvm_map time) we check to see if we've grown
288  * the kernel pmap.   if so, we call the optional function
289  * pmap_growkernel() to grow the kernel PTPs in advance.
290  *
291  * [C] pv_entry structures
292  */
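
/*
 * illustrative sketch of the kernel PTP growth check mentioned in the
 * note under [B] above.  the check itself happens at uvm_map time in the
 * MI code, not in this file; "wanted_va" is a hypothetical name for the
 * highest kernel VA being mapped:
 *
 *	if (wanted_va > pmap_maxkvaddr)
 *		pmap_maxkvaddr = pmap_growkernel(wanted_va);
 *
 * pmap_growkernel() pre-allocates kernel PTPs up to (at least) the
 * requested VA and returns the new highest mapped kernel VA.
 */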
293 
294 /*
295  * locking
296  *
297  * we have the following locks that we must contend with:
298  *
299  * mutexes:
300  *
301  * - pmap lock (per pmap, part of uvm_object)
302  *   this lock protects the fields in the pmap structure including
303  *   the non-kernel PDEs in the PDP, and the PTEs.  it also covers
304  *   the alternate PTE space (since that is determined by the
305  *   entry in the PDP).
306  *
307  * - pvh_lock (per pv_head)
308  *   this lock protects the pv_entry list which is chained off the
309  *   pv_head structure for a specific pv-tracked PA.   it is locked
310  *   when traversing the list (e.g. adding/removing mappings,
311  *   syncing R/M bits, etc.)
312  *
313  * - pmaps_lock
314  *   this lock protects the list of active pmaps (headed by "pmaps").
315  *   we lock it when adding or removing pmaps from this list.
316  */
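
/*
 * typical usage of the locks above (illustrative sketch only):
 *
 *	mutex_enter(pmap->pm_lock);
 *	... modify this pmap's PDEs, PTEs and statistics ...
 *	mutex_exit(pmap->pm_lock);
 *
 *	mutex_enter(&pmaps_lock);
 *	... add/remove an entry on the "pmaps" list ...
 *	mutex_exit(&pmaps_lock);
 *
 * the per-page pv lists are protected by spin mutexes taken with
 * mutex_spin_enter() (see insert_pv() and pmap_remove_pv() below).
 */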
317 
318 const vaddr_t ptp_masks[] = PTP_MASK_INITIALIZER;
319 const int ptp_shifts[] = PTP_SHIFT_INITIALIZER;
320 const long nkptpmax[] = NKPTPMAX_INITIALIZER;
321 const long nbpd[] = NBPD_INITIALIZER;
322 pd_entry_t * const normal_pdes[] = PDES_INITIALIZER;
323 
324 long nkptp[] = NKPTP_INITIALIZER;
325 
326 struct pmap_head pmaps;
327 kmutex_t pmaps_lock;
328 
329 static vaddr_t pmap_maxkvaddr;
330 
331 /*
332  * XXX kludge: dummy locking to make KASSERTs in uvm_page.c comfortable.
333  * actual locking is done by pm_lock.
334  */
335 #if defined(DIAGNOSTIC)
336 #define	PMAP_SUBOBJ_LOCK(pm, idx) \
337 	KASSERT(mutex_owned((pm)->pm_lock)); \
338 	if ((idx) != 0) \
339 		mutex_enter((pm)->pm_obj[(idx)].vmobjlock)
340 #define	PMAP_SUBOBJ_UNLOCK(pm, idx) \
341 	KASSERT(mutex_owned((pm)->pm_lock)); \
342 	if ((idx) != 0) \
343 		mutex_exit((pm)->pm_obj[(idx)].vmobjlock)
344 #else /* defined(DIAGNOSTIC) */
345 #define	PMAP_SUBOBJ_LOCK(pm, idx)	/* nothing */
346 #define	PMAP_SUBOBJ_UNLOCK(pm, idx)	/* nothing */
347 #endif /* defined(DIAGNOSTIC) */
348 
349 /*
350  * Misc. event counters.
351  */
352 struct evcnt pmap_iobmp_evcnt;
353 struct evcnt pmap_ldt_evcnt;
354 
355 /*
356  * PAT
357  */
358 #define	PATENTRY(n, type)	(type << ((n) * 8))
359 #define	PAT_UC		0x0ULL
360 #define	PAT_WC		0x1ULL
361 #define	PAT_WT		0x4ULL
362 #define	PAT_WP		0x5ULL
363 #define	PAT_WB		0x6ULL
364 #define	PAT_UCMINUS	0x7ULL
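
/*
 * example: PATENTRY(n, type) places "type" in byte n of the 64-bit PAT
 * MSR value, so PATENTRY(1, PAT_WC) == 0x1ULL << 8 == 0x100.  the value
 * programmed by pat_init() below therefore works out to
 * 0x0007010600070106 (WB, WC, UC-, UC repeated in both halves).
 */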
365 
366 static bool cpu_pat_enabled __read_mostly = false;
367 
368 /*
369  * Global data structures
370  */
371 
372 static struct pmap kernel_pmap_store;	/* the kernel's pmap (proc0) */
373 struct pmap *const kernel_pmap_ptr = &kernel_pmap_store;
374 
375 /*
376  * pmap_pg_nx: if our processor supports PG_NX in the PTE then we
377  * set pmap_pg_nx to PG_NX (otherwise it is zero).
378  */
379 pd_entry_t pmap_pg_nx __read_mostly = 0;
380 
381 /*
382  * pmap_pg_g: if our processor supports PG_G in the PTE then we
383  * set pmap_pg_g to PG_G (otherwise it is zero).
384  */
385 pd_entry_t pmap_pg_g __read_mostly = 0;
386 
387 /*
388  * pmap_largepages: if our processor supports PG_PS and we are
389  * using it, this is set to true.
390  */
391 int pmap_largepages __read_mostly = 0;
392 
393 /*
394  * i386 physical memory comes in a big contig chunk with a small
395  * hole toward the front of it...  the following two paddr_t's
396  * (shared with machdep.c) describe the physical address space
397  * of this machine.
398  */
399 paddr_t avail_start __read_mostly; /* PA of first available physical page */
400 paddr_t avail_end __read_mostly; /* PA of last available physical page */
401 
402 #ifdef XEN
403 #ifdef __x86_64__
404 /* Dummy PGD for user cr3, used between pmap_deactivate() and pmap_activate() */
405 static paddr_t xen_dummy_user_pgd;
406 #endif /* __x86_64__ */
407 paddr_t pmap_pa_start; /* PA of first physical page for this domain */
408 paddr_t pmap_pa_end;   /* PA of last physical page for this domain */
409 #endif /* XEN */
410 
411 #define	VM_PAGE_TO_PP(pg)	(&(pg)->mdpage.mp_pp)
412 
413 #define	PV_HASH_SIZE		32768
414 #define	PV_HASH_LOCK_CNT	32
415 
416 struct pv_hash_lock {
417 	kmutex_t lock;
418 } __aligned(CACHE_LINE_SIZE) pv_hash_locks[PV_HASH_LOCK_CNT]
419     __aligned(CACHE_LINE_SIZE);
420 
421 struct pv_hash_head {
422 	SLIST_HEAD(, pv_entry) hh_list;
423 } pv_hash_heads[PV_HASH_SIZE];
424 
425 static u_int
426 pvhash_hash(struct vm_page *ptp, vaddr_t va)
427 {
428 
429 	return (uintptr_t)ptp / sizeof(*ptp) + (va >> PAGE_SHIFT);
430 }
431 
432 static struct pv_hash_head *
433 pvhash_head(u_int hash)
434 {
435 
436 	return &pv_hash_heads[hash % PV_HASH_SIZE];
437 }
438 
439 static kmutex_t *
440 pvhash_lock(u_int hash)
441 {
442 
443 	return &pv_hash_locks[hash % PV_HASH_LOCK_CNT].lock;
444 }
445 
446 static struct pv_entry *
447 pvhash_remove(struct pv_hash_head *hh, struct vm_page *ptp, vaddr_t va)
448 {
449 	struct pv_entry *pve;
450 	struct pv_entry *prev;
451 
452 	prev = NULL;
453 	SLIST_FOREACH(pve, &hh->hh_list, pve_hash) {
454 		if (pve->pve_pte.pte_ptp == ptp &&
455 		    pve->pve_pte.pte_va == va) {
456 			if (prev != NULL) {
457 				SLIST_REMOVE_AFTER(prev, pve_hash);
458 			} else {
459 				SLIST_REMOVE_HEAD(&hh->hh_list, pve_hash);
460 			}
461 			break;
462 		}
463 		prev = pve;
464 	}
465 	return pve;
466 }
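
/*
 * the pv hash above is always used with the same pattern (illustrative
 * sketch; see insert_pv() and pmap_remove_pv() below for the real users):
 *
 *	u_int hash;
 *	kmutex_t *lock;
 *	struct pv_hash_head *hh;
 *
 *	hash = pvhash_hash(ptp, va);
 *	lock = pvhash_lock(hash);
 *	hh = pvhash_head(hash);
 *	mutex_spin_enter(lock);
 *	... insert into or remove from hh->hh_list ...
 *	mutex_spin_exit(lock);
 */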
467 
468 /*
469  * Other data structures
470  */
471 
472 static pt_entry_t protection_codes[8] __read_mostly;
473 
474 static bool pmap_initialized __read_mostly = false; /* pmap_init done yet? */
475 
476 /*
477  * The following two vaddr_t's are used during system startup to keep track of
478  * how much of the kernel's VM space we have used. Once the system is started,
479  * the management of the remaining kernel VM space is turned over to the
480  * kernel_map vm_map.
481  */
482 static vaddr_t virtual_avail __read_mostly;	/* VA of first free KVA */
483 static vaddr_t virtual_end __read_mostly;	/* VA of last free KVA */
484 
485 /*
486  * pool that pmap structures are allocated from
487  */
488 static struct pool_cache pmap_cache;
489 
490 /*
491  * pv_entry cache
492  */
493 static struct pool_cache pmap_pv_cache;
494 
495 #ifndef __HAVE_DIRECT_MAP
496 /*
497  * MULTIPROCESSOR: special VAs and PTEs are actually allocated inside a
498  * (maxcpus * NPTECL) array of PTE, to avoid cache line thrashing due to
499  * false sharing.
500  */
501 #ifdef MULTIPROCESSOR
502 #define PTESLEW(pte, id) ((pte)+(id)*NPTECL)
503 #define VASLEW(va,id) ((va)+(id)*NPTECL*PAGE_SIZE)
504 #else
505 #define PTESLEW(pte, id) ((void)id, pte)
506 #define VASLEW(va,id) ((void)id, va)
507 #endif
508 
509 /*
510  * Special VAs and the PTEs that map them
511  */
512 static pt_entry_t *csrc_pte, *cdst_pte, *zero_pte, *ptp_pte, *early_zero_pte;
513 static char *csrcp, *cdstp, *zerop, *ptpp;
514 #ifdef XEN
515 char *early_zerop; /* also referenced from xen_pmap_bootstrap() */
516 #else
517 static char *early_zerop;
518 #endif
519 
520 #endif
521 
522 int pmap_enter_default(pmap_t, vaddr_t, paddr_t, vm_prot_t, u_int);
523 
524 /* PDP pool_cache(9) and its callbacks */
525 struct pool_cache pmap_pdp_cache;
526 static int  pmap_pdp_ctor(void *, void *, int);
527 static void pmap_pdp_dtor(void *, void *);
528 #ifdef PAE
529 /* need to allocate items of 4 pages */
530 static void *pmap_pdp_alloc(struct pool *, int);
531 static void pmap_pdp_free(struct pool *, void *);
532 static struct pool_allocator pmap_pdp_allocator = {
533 	.pa_alloc = pmap_pdp_alloc,
534 	.pa_free = pmap_pdp_free,
535 	.pa_pagesz = PAGE_SIZE * PDP_SIZE,
536 };
537 #endif /* PAE */
538 
539 extern vaddr_t idt_vaddr;
540 extern paddr_t idt_paddr;
541 
542 extern int end;
543 
544 #ifdef i386
545 /* stuff to fix the pentium f00f bug */
546 extern vaddr_t pentium_idt_vaddr;
547 #endif
548 
549 /*
550  * Local prototypes
551  */
552 
553 #ifdef __HAVE_DIRECT_MAP
554 static void pmap_init_directmap(struct pmap *);
555 #endif
556 #ifndef XEN
557 static void pmap_remap_largepages(void);
558 #endif
559 
560 static struct vm_page *pmap_get_ptp(struct pmap *, vaddr_t,
561     pd_entry_t * const *);
562 static struct vm_page *pmap_find_ptp(struct pmap *, vaddr_t, paddr_t, int);
563 static void pmap_freepage(struct pmap *, struct vm_page *, int);
564 static void pmap_free_ptp(struct pmap *, struct vm_page *, vaddr_t,
565     pt_entry_t *, pd_entry_t * const *);
566 static bool pmap_remove_pte(struct pmap *, struct vm_page *, pt_entry_t *,
567     vaddr_t, struct pv_entry **);
568 static void pmap_remove_ptes(struct pmap *, struct vm_page *, vaddr_t, vaddr_t,
569     vaddr_t, struct pv_entry **);
570 
571 static paddr_t pmap_get_physpage(void);
572 static void pmap_alloc_level(vaddr_t, long *);
573 
574 static bool pmap_reactivate(struct pmap *);
575 
576 /*
577  * p m a p   h e l p e r   f u n c t i o n s
578  */
579 
580 static inline void
581 pmap_stats_update(struct pmap *pmap, int resid_diff, int wired_diff)
582 {
583 
584 	if (pmap == pmap_kernel()) {
585 		atomic_add_long(&pmap->pm_stats.resident_count, resid_diff);
586 		atomic_add_long(&pmap->pm_stats.wired_count, wired_diff);
587 	} else {
588 		KASSERT(mutex_owned(pmap->pm_lock));
589 		pmap->pm_stats.resident_count += resid_diff;
590 		pmap->pm_stats.wired_count += wired_diff;
591 	}
592 }
593 
594 static inline void
595 pmap_stats_update_bypte(struct pmap *pmap, pt_entry_t npte, pt_entry_t opte)
596 {
597 	int resid_diff = ((npte & PG_V) ? 1 : 0) - ((opte & PG_V) ? 1 : 0);
598 	int wired_diff = ((npte & PG_W) ? 1 : 0) - ((opte & PG_W) ? 1 : 0);
599 
600 	KASSERT((npte & (PG_V | PG_W)) != PG_W);
601 	KASSERT((opte & (PG_V | PG_W)) != PG_W);
602 
603 	pmap_stats_update(pmap, resid_diff, wired_diff);
604 }
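
/*
 * example: going from an invalid PTE (opte == 0) to a valid and wired
 * one (npte has PG_V and PG_W set) gives resid_diff == 1 and
 * wired_diff == 1, so both resident_count and wired_count grow by one.
 */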
605 
606 /*
607  * ptp_to_pmap: lookup pmap by ptp
608  */
609 
610 static struct pmap *
611 ptp_to_pmap(struct vm_page *ptp)
612 {
613 	struct pmap *pmap;
614 
615 	if (ptp == NULL) {
616 		return pmap_kernel();
617 	}
618 	pmap = (struct pmap *)ptp->uobject;
619 	KASSERT(pmap != NULL);
620 	KASSERT(&pmap->pm_obj[0] == ptp->uobject);
621 	return pmap;
622 }
623 
624 static inline struct pv_pte *
625 pve_to_pvpte(struct pv_entry *pve)
626 {
627 
628 	KASSERT((void *)&pve->pve_pte == (void *)pve);
629 	return &pve->pve_pte;
630 }
631 
632 static inline struct pv_entry *
633 pvpte_to_pve(struct pv_pte *pvpte)
634 {
635 	struct pv_entry *pve = (void *)pvpte;
636 
637 	KASSERT(pve_to_pvpte(pve) == pvpte);
638 	return pve;
639 }
640 
641 /*
642  * pv_pte_first, pv_pte_next: PV list iterator.
643  */
644 
645 static struct pv_pte *
646 pv_pte_first(struct pmap_page *pp)
647 {
648 
649 	if ((pp->pp_flags & PP_EMBEDDED) != 0) {
650 		return &pp->pp_pte;
651 	}
652 	return pve_to_pvpte(LIST_FIRST(&pp->pp_head.pvh_list));
653 }
654 
655 static struct pv_pte *
656 pv_pte_next(struct pmap_page *pp, struct pv_pte *pvpte)
657 {
658 
659 	KASSERT(pvpte != NULL);
660 	if (pvpte == &pp->pp_pte) {
661 		KASSERT((pp->pp_flags & PP_EMBEDDED) != 0);
662 		return NULL;
663 	}
664 	KASSERT((pp->pp_flags & PP_EMBEDDED) == 0);
665 	return pve_to_pvpte(LIST_NEXT(pvpte_to_pve(pvpte), pve_list));
666 }
667 
668 /*
669  * pmap_is_curpmap: is this pmap the one currently loaded [in %cr3]?
670  *		the kernel pmap is, of course, always considered loaded
671  */
672 
673 bool
674 pmap_is_curpmap(struct pmap *pmap)
675 {
676 	return((pmap == pmap_kernel()) ||
677 	       (pmap == curcpu()->ci_pmap));
678 }
679 
680 /*
681  *	Add a reference to the specified pmap.
682  */
683 
684 void
685 pmap_reference(struct pmap *pmap)
686 {
687 
688 	atomic_inc_uint(&pmap->pm_obj[0].uo_refs);
689 }
690 
691 /*
692  * pmap_map_ptes: map a pmap's PTEs into KVM and lock them in
693  *
694  * there are several pmaps involved.  some or all of them might be same.
695  *
696  *	- the pmap given by the first argument
697  *		our caller wants to access this pmap's PTEs.
698  *
699  *	- pmap_kernel()
700  *		the kernel pmap.  note that it only contains the kernel part
701  *		of the address space, which is shared by all pmaps.  i.e. any
702  *		pmap can be used instead of pmap_kernel() for our purpose.
703  *
704  *	- ci->ci_pmap
705  *		pmap currently loaded on the cpu.
706  *
707  *	- vm_map_pmap(&curproc->p_vmspace->vm_map)
708  *		current process' pmap.
709  *
710  * => we lock enough pmaps to keep things locked in
711  * => must be undone with pmap_unmap_ptes before returning
712  */
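
/*
 * typical calling pattern (illustrative sketch only):
 *
 *	struct pmap *pmap2;
 *	pd_entry_t *ptes;
 *	pd_entry_t * const *pdes;
 *
 *	kpreempt_disable();
 *	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
 *	... access ptes[pl1_i(va)] and the pdes[] hierarchy ...
 *	pmap_unmap_ptes(pmap, pmap2);
 *	kpreempt_enable();
 */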
713 
714 void
715 pmap_map_ptes(struct pmap *pmap, struct pmap **pmap2,
716 	      pd_entry_t **ptepp, pd_entry_t * const **pdeppp)
717 {
718 	struct pmap *curpmap;
719 	struct cpu_info *ci;
720 	lwp_t *l;
721 
722 	/* The kernel's pmap is always accessible. */
723 	if (pmap == pmap_kernel()) {
724 		*pmap2 = NULL;
725 		*ptepp = PTE_BASE;
726 		*pdeppp = normal_pdes;
727 		return;
728 	}
729 	KASSERT(kpreempt_disabled());
730 
731 	l = curlwp;
732  retry:
733 	mutex_enter(pmap->pm_lock);
734 	ci = curcpu();
735 	curpmap = ci->ci_pmap;
736 	if (vm_map_pmap(&l->l_proc->p_vmspace->vm_map) == pmap) {
737 		/* Our own pmap so just load it: easy. */
738 		if (__predict_false(ci->ci_want_pmapload)) {
739 			mutex_exit(pmap->pm_lock);
740 			pmap_load();
741 			goto retry;
742 		}
743 		KASSERT(pmap == curpmap);
744 	} else if (pmap == curpmap) {
745 		/*
746 		 * Already on the CPU: make it valid.  This is very
747 		 * often the case during exit(), when we have switched
748 		 * to the kernel pmap in order to destroy a user pmap.
749 		 */
750 		if (!pmap_reactivate(pmap)) {
751 			u_int gen = uvm_emap_gen_return();
752 			tlbflush();
753 			uvm_emap_update(gen);
754 		}
755 	} else {
756 		/*
757 		 * Toss current pmap from CPU, but keep a reference to it.
758 		 * The reference will be dropped by pmap_unmap_ptes().
759 		 * Can happen if we block during exit().
760 		 */
761 		const cpuid_t cid = cpu_index(ci);
762 
763 		kcpuset_atomic_clear(curpmap->pm_cpus, cid);
764 		kcpuset_atomic_clear(curpmap->pm_kernel_cpus, cid);
765 		ci->ci_pmap = pmap;
766 		ci->ci_tlbstate = TLBSTATE_VALID;
767 		kcpuset_atomic_set(pmap->pm_cpus, cid);
768 		kcpuset_atomic_set(pmap->pm_kernel_cpus, cid);
769 		cpu_load_pmap(pmap, curpmap);
770 	}
771 	pmap->pm_ncsw = l->l_ncsw;
772 	*pmap2 = curpmap;
773 	*ptepp = PTE_BASE;
774 #if defined(XEN) && defined(__x86_64__)
775 	KASSERT(ci->ci_normal_pdes[PTP_LEVELS - 2] == L4_BASE);
776 	ci->ci_normal_pdes[PTP_LEVELS - 2] = pmap->pm_pdir;
777 	*pdeppp = ci->ci_normal_pdes;
778 #else /* XEN && __x86_64__ */
779 	*pdeppp = normal_pdes;
780 #endif /* XEN && __x86_64__ */
781 }
782 
783 /*
784  * pmap_unmap_ptes: unlock the PTE mapping of "pmap"
785  */
786 
787 void
788 pmap_unmap_ptes(struct pmap *pmap, struct pmap *pmap2)
789 {
790 	struct cpu_info *ci;
791 	struct pmap *mypmap;
792 
793 	KASSERT(kpreempt_disabled());
794 
795 	/* The kernel's pmap is always accessible. */
796 	if (pmap == pmap_kernel()) {
797 		return;
798 	}
799 
800 	ci = curcpu();
801 #if defined(XEN) && defined(__x86_64__)
802 	/* Reset per-cpu normal_pdes */
803 	KASSERT(ci->ci_normal_pdes[PTP_LEVELS - 2] != L4_BASE);
804 	ci->ci_normal_pdes[PTP_LEVELS - 2] = L4_BASE;
805 #endif /* XEN && __x86_64__ */
806 	/*
807 	 * We cannot tolerate context switches while mapped in.
808 	 * If it is our own pmap all we have to do is unlock.
809 	 */
810 	KASSERT(pmap->pm_ncsw == curlwp->l_ncsw);
811 	mypmap = vm_map_pmap(&curproc->p_vmspace->vm_map);
812 	if (pmap == mypmap) {
813 		mutex_exit(pmap->pm_lock);
814 		return;
815 	}
816 
817 	/*
818 	 * Mark whatever's on the CPU now as lazy and unlock.
819 	 * If the pmap was already installed, we are done.
820 	 */
821 	ci->ci_tlbstate = TLBSTATE_LAZY;
822 	ci->ci_want_pmapload = (mypmap != pmap_kernel());
823 	mutex_exit(pmap->pm_lock);
824 	if (pmap == pmap2) {
825 		return;
826 	}
827 
828 	/*
829 	 * We installed another pmap on the CPU.  Grab a reference to
830 	 * it and leave it in place.  Toss the evicted pmap (can block).
831 	 */
832 	pmap_reference(pmap);
833 	pmap_destroy(pmap2);
834 }
835 
836 
837 inline static void
838 pmap_exec_account(struct pmap *pm, vaddr_t va, pt_entry_t opte, pt_entry_t npte)
839 {
840 
841 #if !defined(__x86_64__)
842 	if (curproc == NULL || curproc->p_vmspace == NULL ||
843 	    pm != vm_map_pmap(&curproc->p_vmspace->vm_map))
844 		return;
845 
846 	if ((opte ^ npte) & PG_X)
847 		pmap_update_pg(va);
848 
849 	/*
850 	 * Executability was removed from the highest executable mapping.
851 	 * Reset the code segment to something conservative and let the
852 	 * trap handler deal with setting the right limit; we cannot
853 	 * recompute the limit here because of locking constraints on the vm map.
854 	 */
855 
856 	if ((opte & PG_X) && (npte & PG_X) == 0 && va == pm->pm_hiexec) {
857 		struct trapframe *tf = curlwp->l_md.md_regs;
858 
859 		tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL);
860 		pm->pm_hiexec = I386_MAX_EXE_ADDR;
861 	}
862 #endif /* !defined(__x86_64__) */
863 }
864 
865 #if !defined(__x86_64__)
866 /*
867  * Fixup the code segment to cover all potential executable mappings.
868  * returns 0 if no changes to the code segment were made.
869  */
870 
871 int
872 pmap_exec_fixup(struct vm_map *map, struct trapframe *tf, struct pcb *pcb)
873 {
874 	struct vm_map_entry *ent;
875 	struct pmap *pm = vm_map_pmap(map);
876 	vaddr_t va = 0;
877 
878 	vm_map_lock_read(map);
879 	for (ent = (&map->header)->next; ent != &map->header; ent = ent->next) {
880 
881 		/*
882 		 * Map entries are sorted, so this entry has a greater va than
883 		 * the ones before it.  Point va at its last page, not past it.
884 		 */
885 
886 		if (ent->protection & VM_PROT_EXECUTE)
887 			va = trunc_page(ent->end) - PAGE_SIZE;
888 	}
889 	vm_map_unlock_read(map);
890 	if (va == pm->pm_hiexec && tf->tf_cs == GSEL(GUCODEBIG_SEL, SEL_UPL))
891 		return (0);
892 
893 	pm->pm_hiexec = va;
894 	if (pm->pm_hiexec > I386_MAX_EXE_ADDR) {
895 		tf->tf_cs = GSEL(GUCODEBIG_SEL, SEL_UPL);
896 	} else {
897 		tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL);
898 		return (0);
899 	}
900 	return (1);
901 }
902 #endif /* !defined(__x86_64__) */
903 
904 void
905 pat_init(struct cpu_info *ci)
906 {
907 	uint64_t pat;
908 
909 	if (!(ci->ci_feat_val[0] & CPUID_PAT))
910 		return;
911 
912 	/* We change WT to WC. Leave all other entries at their default values. */
913 	pat = PATENTRY(0, PAT_WB) | PATENTRY(1, PAT_WC) |
914 	      PATENTRY(2, PAT_UCMINUS) | PATENTRY(3, PAT_UC) |
915 	      PATENTRY(4, PAT_WB) | PATENTRY(5, PAT_WC) |
916 	      PATENTRY(6, PAT_UCMINUS) | PATENTRY(7, PAT_UC);
917 
918 	wrmsr(MSR_CR_PAT, pat);
919 	cpu_pat_enabled = true;
920 	aprint_debug_dev(ci->ci_dev, "PAT enabled\n");
921 }
922 
923 static pt_entry_t
924 pmap_pat_flags(u_int flags)
925 {
926 	u_int cacheflags = (flags & PMAP_CACHE_MASK);
927 
928 	if (!cpu_pat_enabled) {
929 		switch (cacheflags) {
930 		case PMAP_NOCACHE:
931 		case PMAP_NOCACHE_OVR:
932 			/* results in PGC_UCMINUS on cpus which have
933 			 * the cpuid PAT but PAT "disabled"
934 			 */
935 			return PG_N;
936 		default:
937 			return 0;
938 		}
939 	}
940 
941 	switch (cacheflags) {
942 	case PMAP_NOCACHE:
943 		return PGC_UC;
944 	case PMAP_WRITE_COMBINE:
945 		return PGC_WC;
946 	case PMAP_WRITE_BACK:
947 		return PGC_WB;
948 	case PMAP_NOCACHE_OVR:
949 		return PGC_UCMINUS;
950 	}
951 
952 	return 0;
953 }
954 
955 /*
956  * p m a p   k e n t e r   f u n c t i o n s
957  *
958  * functions to quickly enter/remove pages from the kernel address
959  * space.   pmap_kremove is exported to MI kernel.  we make use of
960  * the recursive PTE mappings.
961  */
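
/*
 * typical usage (illustrative sketch only):
 *
 *	pmap_kenter_pa(va, pa, VM_PROT_READ | VM_PROT_WRITE, 0);
 *	... use the mapping ...
 *	pmap_kremove(va, PAGE_SIZE);
 *	pmap_update(pmap_kernel());
 *
 * as noted in the pmap_kremove() comments below, pmap_update() must be
 * called before the unmapped page is reused.
 */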
962 
963 /*
964  * pmap_kenter_pa: enter a kernel mapping without R/M (pv_entry) tracking
965  *
966  * => no need to lock anything, assume va is already allocated
967  * => should be faster than normal pmap enter function
968  */
969 
970 void
971 pmap_kenter_pa(vaddr_t va, paddr_t pa, vm_prot_t prot, u_int flags)
972 {
973 	pt_entry_t *pte, opte, npte;
974 
975 	KASSERT(!(prot & ~VM_PROT_ALL));
976 
977 	if (va < VM_MIN_KERNEL_ADDRESS)
978 		pte = vtopte(va);
979 	else
980 		pte = kvtopte(va);
981 #ifdef DOM0OPS
982 	if (pa < pmap_pa_start || pa >= pmap_pa_end) {
983 #ifdef DEBUG
984 		printf_nolog("%s: pa 0x%" PRIx64 " for va 0x%" PRIx64
985 		    " outside range\n", __func__, (int64_t)pa, (int64_t)va);
986 #endif /* DEBUG */
987 		npte = pa;
988 	} else
989 #endif /* DOM0OPS */
990 		npte = pmap_pa2pte(pa);
991 	npte |= protection_codes[prot] | PG_k | PG_V | pmap_pg_g;
992 	npte |= pmap_pat_flags(flags);
993 	opte = pmap_pte_testset(pte, npte); /* zap! */
994 #if defined(DIAGNOSTIC)
995 	/*
996 	 * XXX: make sure we are not dealing with a large page, since the only
997 	 * large pages created are for the kernel image, and they should never
998 	 * be kentered.
999 	 */
1000 	if (opte & PG_PS)
1001 		panic("%s: PG_PS", __func__);
1002 #endif
1003 	if ((opte & (PG_V | PG_U)) == (PG_V | PG_U)) {
1004 		/* This should not happen. */
1005 		printf_nolog("%s: mapping already present\n", __func__);
1006 		kpreempt_disable();
1007 		pmap_tlb_shootdown(pmap_kernel(), va, opte, TLBSHOOT_KENTER);
1008 		kpreempt_enable();
1009 	}
1010 }
1011 
1012 void
1013 pmap_emap_enter(vaddr_t va, paddr_t pa, vm_prot_t prot)
1014 {
1015 	pt_entry_t *pte, npte;
1016 
1017 	KASSERT((prot & ~VM_PROT_ALL) == 0);
1018 	pte = (va < VM_MIN_KERNEL_ADDRESS) ? vtopte(va) : kvtopte(va);
1019 
1020 #ifdef DOM0OPS
1021 	if (pa < pmap_pa_start || pa >= pmap_pa_end) {
1022 		npte = pa;
1023 	} else
1024 #endif
1025 		npte = pmap_pa2pte(pa);
1026 
1028 	npte |= protection_codes[prot] | PG_k | PG_V;
1029 	pmap_pte_set(pte, npte);
1030 }
1031 
1032 /*
1033  * pmap_emap_sync: perform TLB flush or pmap load, if it was deferred.
1034  */
1035 void
1036 pmap_emap_sync(bool canload)
1037 {
1038 	struct cpu_info *ci = curcpu();
1039 	struct pmap *pmap;
1040 
1041 	KASSERT(kpreempt_disabled());
1042 	if (__predict_true(ci->ci_want_pmapload && canload)) {
1043 		/*
1044 		 * XXX: Hint for pmap_reactivate(), which may then skip the
1045 		 * TLB flush if the state has not changed.
1046 		 */
1047 		pmap = vm_map_pmap(&curlwp->l_proc->p_vmspace->vm_map);
1048 		if (__predict_false(pmap == ci->ci_pmap)) {
1049 			kcpuset_atomic_clear(pmap->pm_cpus, cpu_index(ci));
1050 		}
1051 		pmap_load();
1052 		KASSERT(ci->ci_want_pmapload == 0);
1053 	} else {
1054 		tlbflush();
1055 	}
1056 }
1057 
1058 void
1059 pmap_emap_remove(vaddr_t sva, vsize_t len)
1060 {
1061 	pt_entry_t *pte;
1062 	vaddr_t va, eva = sva + len;
1063 
1064 	for (va = sva; va < eva; va += PAGE_SIZE) {
1065 		pte = (va < VM_MIN_KERNEL_ADDRESS) ? vtopte(va) : kvtopte(va);
1066 		pmap_pte_set(pte, 0);
1067 	}
1068 }
1069 
1070 __strict_weak_alias(pmap_kenter_ma, pmap_kenter_pa);
1071 
1072 #if defined(__x86_64__)
1073 /*
1074  * Change protection for a virtual address. Local for a CPU only, don't
1075  * care about TLB shootdowns.
1076  *
1077  * => must be called with preemption disabled
1078  */
1079 void
1080 pmap_changeprot_local(vaddr_t va, vm_prot_t prot)
1081 {
1082 	pt_entry_t *pte, opte, npte;
1083 
1084 	KASSERT(kpreempt_disabled());
1085 
1086 	if (va < VM_MIN_KERNEL_ADDRESS)
1087 		pte = vtopte(va);
1088 	else
1089 		pte = kvtopte(va);
1090 
1091 	npte = opte = *pte;
1092 
1093 	if ((prot & VM_PROT_WRITE) != 0)
1094 		npte |= PG_RW;
1095 	else
1096 		npte &= ~PG_RW;
1097 
1098 	if (opte != npte) {
1099 		pmap_pte_set(pte, npte);
1100 		pmap_pte_flush();
1101 		invlpg(va);
1102 	}
1103 }
1104 #endif /* defined(__x86_64__) */
1105 
1106 /*
1107  * pmap_kremove: remove a kernel mapping(s) without R/M (pv_entry) tracking
1108  *
1109  * => no need to lock anything
1110  * => caller must dispose of any vm_page mapped in the va range
1111  * => note: not an inline function
1112  * => we assume the va is page aligned and the len is a multiple of PAGE_SIZE
1113  * => we assume kernel only unmaps valid addresses and thus don't bother
1114  *    checking the valid bit before doing TLB flushing
1115  * => must be followed by call to pmap_update() before reuse of page
1116  */
1117 
1118 static inline void
1119 pmap_kremove1(vaddr_t sva, vsize_t len, bool localonly)
1120 {
1121 	pt_entry_t *pte, opte;
1122 	vaddr_t va, eva;
1123 
1124 	eva = sva + len;
1125 
1126 	kpreempt_disable();
1127 	for (va = sva; va < eva; va += PAGE_SIZE) {
1128 		pte = kvtopte(va);
1129 		opte = pmap_pte_testset(pte, 0); /* zap! */
1130 		if ((opte & (PG_V | PG_U)) == (PG_V | PG_U) && !localonly) {
1131 			pmap_tlb_shootdown(pmap_kernel(), va, opte,
1132 			    TLBSHOOT_KREMOVE);
1133 		}
1134 		KASSERT((opte & PG_PS) == 0);
1135 		KASSERT((opte & PG_PVLIST) == 0);
1136 	}
1137 	if (localonly) {
1138 		tlbflushg();
1139 	}
1140 	kpreempt_enable();
1141 }
1142 
1143 void
1144 pmap_kremove(vaddr_t sva, vsize_t len)
1145 {
1146 
1147 	pmap_kremove1(sva, len, false);
1148 }
1149 
1150 /*
1151  * pmap_kremove_local: like pmap_kremove(), but only worry about
1152  * TLB invalidations on the current CPU.  this is only intended
1153  * for use while writing kernel crash dumps.
1154  */
1155 
1156 void
1157 pmap_kremove_local(vaddr_t sva, vsize_t len)
1158 {
1159 
1160 	KASSERT(panicstr != NULL);
1161 	pmap_kremove1(sva, len, true);
1162 }
1163 
1164 /*
1165  * p m a p   i n i t   f u n c t i o n s
1166  *
1167  * pmap_bootstrap and pmap_init are called during system startup
1168  * to init the pmap module.   pmap_bootstrap() does a low level
1169  * init just to get things rolling.   pmap_init() finishes the job.
1170  */
1171 
1172 /*
1173  * pmap_bootstrap_valloc: allocate a virtual address in the bootstrap area.
1174  * This function is to be used before any VM system has been set up.
1175  *
1176  * The va is taken from virtual_avail.
1177  */
1178 static vaddr_t
1179 pmap_bootstrap_valloc(size_t npages)
1180 {
1181 	vaddr_t va = virtual_avail;
1182 	virtual_avail += npages * PAGE_SIZE;
1183 	return va;
1184 }
1185 
1186 /*
1187  * pmap_bootstrap_palloc: allocate a physical address in the bootstrap area.
1188  * This function is to be used before any VM system has been set up.
1189  *
1190  * The pa is taken from avail_start.
1191  */
1192 static paddr_t
1193 pmap_bootstrap_palloc(size_t npages)
1194 {
1195 	paddr_t pa = avail_start;
1196 	avail_start += npages * PAGE_SIZE;
1197 	return pa;
1198 }
1199 
1200 /*
1201  * pmap_bootstrap: get the system in a state where it can run with VM properly
1202  * enabled (called before main()). The VM system is fully init'd later.
1203  *
1204  * => on i386, locore.S has already enabled the MMU by allocating a PDP for the
1205  *    kernel, and nkpde PTP's for the kernel.
1206  * => kva_start is the first free virtual address in kernel space.
1207  */
1208 void
1209 pmap_bootstrap(vaddr_t kva_start)
1210 {
1211 	struct pmap *kpm;
1212 	int i;
1213 	vaddr_t kva;
1214 #ifndef XEN
1215 	unsigned long p1i;
1216 	vaddr_t kva_end;
1217 #endif
1218 
1219 	pmap_pg_nx = (cpu_feature[2] & CPUID_NOX ? PG_NX : 0);
1220 
1221 	/*
1222 	 * Set up our local static global vars that keep track of the usage of
1223 	 * KVM before kernel_map is set up.
1224 	 */
1225 	virtual_avail = kva_start;		/* first free KVA */
1226 	virtual_end = VM_MAX_KERNEL_ADDRESS;	/* last KVA */
1227 
1228 	/*
1229 	 * Set up protection_codes: we need to be able to convert from a MI
1230 	 * protection code (some combo of VM_PROT...) to something we can jam
1231 	 * into a x86 PTE.
1232 	 */
1233 	protection_codes[VM_PROT_NONE] = pmap_pg_nx;
1234 	protection_codes[VM_PROT_EXECUTE] = PG_RO | PG_X;
1235 	protection_codes[VM_PROT_READ] = PG_RO | pmap_pg_nx;
1236 	protection_codes[VM_PROT_READ|VM_PROT_EXECUTE] = PG_RO | PG_X;
1237 	protection_codes[VM_PROT_WRITE] = PG_RW | pmap_pg_nx;
1238 	protection_codes[VM_PROT_WRITE|VM_PROT_EXECUTE] = PG_RW | PG_X;
1239 	protection_codes[VM_PROT_WRITE|VM_PROT_READ] = PG_RW | pmap_pg_nx;
1240 	protection_codes[VM_PROT_ALL] = PG_RW | PG_X;
1241 
1242 	/*
1243 	 * Now we init the kernel's pmap.
1244 	 *
1245 	 * The kernel pmap's pm_obj is not used for much. However, in user pmaps
1246 	 * the pm_obj contains the list of active PTPs.
1247 	 *
1248 	 * The pm_obj currently does not have a pager. It might be possible to
1249 	 * add a pager that would allow a process to read-only mmap its own page
1250 	 * tables (fast user-level vtophys?). This may or may not be useful.
1251 	 */
1252 	kpm = pmap_kernel();
1253 	for (i = 0; i < PTP_LEVELS - 1; i++) {
1254 		mutex_init(&kpm->pm_obj_lock[i], MUTEX_DEFAULT, IPL_NONE);
1255 		uvm_obj_init(&kpm->pm_obj[i], NULL, false, 1);
1256 		uvm_obj_setlock(&kpm->pm_obj[i], &kpm->pm_obj_lock[i]);
1257 		kpm->pm_ptphint[i] = NULL;
1258 	}
1259 	memset(&kpm->pm_list, 0, sizeof(kpm->pm_list));  /* pm_list not used */
1260 
1261 	kpm->pm_pdir = (pd_entry_t *)(PDPpaddr + KERNBASE);
1262 	for (i = 0; i < PDP_SIZE; i++)
1263 		kpm->pm_pdirpa[i] = PDPpaddr + PAGE_SIZE * i;
1264 
1265 	kpm->pm_stats.wired_count = kpm->pm_stats.resident_count =
1266 		x86_btop(kva_start - VM_MIN_KERNEL_ADDRESS);
1267 
1268 	kcpuset_create(&kpm->pm_cpus, true);
1269 	kcpuset_create(&kpm->pm_kernel_cpus, true);
1270 
1271 	/*
1272 	 * the above is just a rough estimate and not critical to the proper
1273 	 * operation of the system.
1274 	 */
1275 
1276 #ifndef XEN
1277 	/*
1278 	 * Begin to enable global TLB entries if they are supported.
1279 	 * The G bit has no effect until the CR4_PGE bit is set in CR4,
1280 	 * which happens in cpu_init(), which is run on each cpu
1281 	 * (and happens later)
1282 	 */
1283 	if (cpu_feature[0] & CPUID_PGE) {
1284 		pmap_pg_g = PG_G;		/* enable software */
1285 
1286 		/* add PG_G attribute to already mapped kernel pages */
1287 		if (KERNBASE == VM_MIN_KERNEL_ADDRESS) {
1288 			kva_end = virtual_avail;
1289 		} else {
1290 			extern vaddr_t eblob, esym;
1291 			kva_end = (vaddr_t)&end;
1292 			if (esym > kva_end)
1293 				kva_end = esym;
1294 			if (eblob > kva_end)
1295 				kva_end = eblob;
1296 			kva_end = roundup(kva_end, PAGE_SIZE);
1297 		}
1298 		for (kva = KERNBASE; kva < kva_end; kva += PAGE_SIZE) {
1299 			p1i = pl1_i(kva);
1300 			if (pmap_valid_entry(PTE_BASE[p1i]))
1301 				PTE_BASE[p1i] |= PG_G;
1302 		}
1303 	}
1304 
1305 	/*
1306 	 * Enable large pages if they are supported.
1307 	 */
1308 	if (cpu_feature[0] & CPUID_PSE) {
1309 		lcr4(rcr4() | CR4_PSE);	/* enable hardware (via %cr4) */
1310 		pmap_largepages = 1;	/* enable software */
1311 
1312 		/*
1313 		 * The TLB must be flushed after enabling large pages on Pentium
1314 		 * CPUs, according to section 3.6.2.2 of "Intel Architecture
1315 		 * Software Developer's Manual, Volume 3: System Programming".
1316 		 */
1317 		tlbflushg();
1318 
1319 		/* Remap the kernel. */
1320 		pmap_remap_largepages();
1321 	}
1322 #endif /* !XEN */
1323 
1324 #ifdef __HAVE_DIRECT_MAP
1325 	pmap_init_directmap(kpm);
1326 #else
1327 	if (VM_MIN_KERNEL_ADDRESS != KERNBASE) {
1328 		/*
1329 		 * zero_pte is stuck at the end of mapped space for the kernel
1330 		 * image (disjunct from kva space). This is done so that it
1331 		 * can safely be used in pmap_growkernel (pmap_get_physpage),
1332 		 * when it's called for the first time.
1333 		 * XXXfvdl fix this for MULTIPROCESSOR later.
1334 		 */
1335 #ifdef XEN
1336 		/* early_zerop initialized in xen_pmap_bootstrap() */
1337 #else
1338 		early_zerop = (void *)(KERNBASE + NKL2_KIMG_ENTRIES * NBPD_L2);
1339 #endif
1340 		early_zero_pte = PTE_BASE + pl1_i((vaddr_t)early_zerop);
1341 	}
1342 
1343 	/*
1344 	 * Now we allocate the "special" VAs which are used for tmp mappings
1345 	 * by the pmap (and other modules). We allocate the VAs by advancing
1346 	 * virtual_avail (note that there are no pages mapped at these VAs).
1347 	 * we find the PTE that maps the allocated VA via the linear PTE
1348 	 * mapping.
1349 	 */
1350 
1351 	pt_entry_t *pte = PTE_BASE + pl1_i(virtual_avail);
1352 
1353 #ifdef MULTIPROCESSOR
1354 	/*
1355 	 * Waste some VA space to avoid false sharing of cache lines
1356 	 * for page table pages: Give each possible CPU a cache line
1357 	 * of PTE's (8) to play with, though we only need 4.  We could
1358 	 * recycle some of this waste by putting the idle stacks here
1359 	 * as well; we could waste less space if we knew the largest
1360 	 * CPU ID beforehand.
1361 	 */
1362 	csrcp = (char *) virtual_avail;  csrc_pte = pte;
1363 
1364 	cdstp = (char *) virtual_avail+PAGE_SIZE;  cdst_pte = pte+1;
1365 
1366 	zerop = (char *) virtual_avail+PAGE_SIZE*2;  zero_pte = pte+2;
1367 
1368 	ptpp = (char *) virtual_avail+PAGE_SIZE*3;  ptp_pte = pte+3;
1369 
1370 	virtual_avail += PAGE_SIZE * maxcpus * NPTECL;
1371 	pte += maxcpus * NPTECL;
1372 #else
1373 	csrcp = (void *) virtual_avail;  csrc_pte = pte;	/* allocate */
1374 	virtual_avail += PAGE_SIZE; pte++;			/* advance */
1375 
1376 	cdstp = (void *) virtual_avail;  cdst_pte = pte;
1377 	virtual_avail += PAGE_SIZE; pte++;
1378 
1379 	zerop = (void *) virtual_avail;  zero_pte = pte;
1380 	virtual_avail += PAGE_SIZE; pte++;
1381 
1382 	ptpp = (void *) virtual_avail;  ptp_pte = pte;
1383 	virtual_avail += PAGE_SIZE; pte++;
1384 #endif
1385 
1386 	if (VM_MIN_KERNEL_ADDRESS == KERNBASE) {
1387 		early_zerop = zerop;
1388 		early_zero_pte = zero_pte;
1389 	}
1390 #endif
1391 
1392 #ifdef XEN
1393 #ifdef __x86_64__
1394 	/*
1395 	 * We want a dummy page directory for Xen: when deactivating a pmap, Xen
1396 	 * will still consider it active. So we point the user PGD at this one,
1397 	 * which lifts all restrictions on the now-inactive page table set.
1398 	 */
1399 	xen_dummy_user_pgd = pmap_bootstrap_palloc(1);
1400 
1401 	/* Zero-fill it; the fewer checks Xen has to perform, the better. */
1402 	memset((void *) (xen_dummy_user_pgd + KERNBASE), 0, PAGE_SIZE);
1403 	/* Mark read-only */
1404 	HYPERVISOR_update_va_mapping(xen_dummy_user_pgd + KERNBASE,
1405 	    pmap_pa2pte(xen_dummy_user_pgd) | PG_u | PG_V, UVMF_INVLPG);
1406 	/* Pin as L4 */
1407 	xpq_queue_pin_l4_table(xpmap_ptom_masked(xen_dummy_user_pgd));
1408 #endif /* __x86_64__ */
1409 	/*
1410 	 * Xen requires one more page as we can't store GDT and LDT on the same
1411 	 * page.
1412 	 */
1413 	idt_vaddr = pmap_bootstrap_valloc(3);
1414 	idt_paddr = pmap_bootstrap_palloc(3);
1415 #else /* XEN */
1416 
1417 #if defined(__x86_64__)
1418 	idt_vaddr = pmap_bootstrap_valloc(2);
1419 	idt_paddr = pmap_bootstrap_palloc(2);
1420 #else
1421 	idt_vaddr = pmap_bootstrap_valloc(1);
1422 	idt_paddr = pmap_bootstrap_palloc(1);
1423 
1424 	/* pentium f00f bug stuff */
1425 	pentium_idt_vaddr = pmap_bootstrap_valloc(1);
1426 #endif
1427 
1428 #endif /* XEN */
1429 
1430 	/*
1431 	 * Now we reserve some VM for mapping pages when doing a crash dump.
1432 	 */
1433 	virtual_avail = reserve_dumppages(virtual_avail);
1434 
1435 	/*
1436 	 * Init the static-global locks and global lists.
1437 	 *
1438 	 * => pventry::pvh_lock (initialized elsewhere) must be a spin
1439 	 *      lock at IPL_VM to prevent deadlock, and is never taken
1440 	 *	from interrupt context.
1441 	 */
1442 	mutex_init(&pmaps_lock, MUTEX_DEFAULT, IPL_NONE);
1443 	LIST_INIT(&pmaps);
1444 
1445 	/*
1446 	 * Ensure the TLB is sync'd with reality by flushing it...
1447 	 */
1448 	tlbflushg();
1449 
1450 	/*
1451 	 * Calculate pmap_maxkvaddr from nkptp[].
1452 	 */
1453 	kva = VM_MIN_KERNEL_ADDRESS;
1454 	for (i = PTP_LEVELS - 1; i >= 1; i--) {
1455 		kva += nkptp[i] * nbpd[i];
1456 	}
1457 	pmap_maxkvaddr = kva;
1458 }
1459 
1460 #ifdef __HAVE_DIRECT_MAP
1461 /*
1462  * Create the amd64 direct map. Called only once at boot time.
1463  */
1464 static void
1465 pmap_init_directmap(struct pmap *kpm)
1466 {
1467 	extern phys_ram_seg_t mem_clusters[];
1468 	extern int mem_cluster_cnt;
1469 
1470 	paddr_t lastpa, dm_pd, dm_pdp, pdp;
1471 	vaddr_t tmpva;
1472 	pt_entry_t *pte;
1473 	pd_entry_t *pde;
1474 	phys_ram_seg_t *mc;
1475 	long n_dm_pdp;
1476 	int i;
1477 
1478 	const pd_entry_t pteflags = PG_V | PG_KW | pmap_pg_nx;
1479 
1480 	/* Get the last physical address available */
1481 	lastpa = 0;
1482 	for (i = 0; i < mem_cluster_cnt; i++) {
1483 		mc = &mem_clusters[i];
1484 		lastpa = MAX(lastpa, mc->start + mc->size);
1485 	}
1486 
1487 	/*
1488 	 * We allocate only one L4 entry for the direct map (PDIR_SLOT_DIRECT),
1489 	 * so we cannot map more than 512GB.
1490 	 */
1491 	if (lastpa > NBPD_L4) {
1492 		panic("RAM limit reached: > 512GB not supported");
1493 	}
1494 
1495 	/* Allocate L3. */
1496 	dm_pdp = pmap_bootstrap_palloc(1);
1497 
1498 	/* Number of L3 entries. */
1499 	n_dm_pdp = (lastpa + NBPD_L3 - 1) >> L3_SHIFT;
1500 
1501 	/* In locore.S, we allocated a tmp va. Use it now. */
1502 	tmpva = (KERNBASE + NKL2_KIMG_ENTRIES * NBPD_L2);
1503 	pte = PTE_BASE + pl1_i(tmpva);
1504 	*pte = dm_pdp | pteflags;
1505 	pmap_update_pg(tmpva);
1506 	memset((void *)tmpva, 0, PAGE_SIZE);
1507 
1508 	/*
1509 	 * Map the direct map RW. Use super pages (1GB) or large pages (2MB) if
1510 	 * they are supported. Note: PG_G is not allowed on non-leaf PTPs.
1511 	 */
1512 	if (cpu_feature[2] & CPUID_P1GB) {
1513 		/* Super pages are supported. Just create L3. */
1514 		for (i = 0; i < n_dm_pdp; i++) {
1515 			pdp = (paddr_t)&(((pd_entry_t *)dm_pdp)[i]);
1516 			*pte = (pdp & PG_FRAME) | pteflags;
1517 			pmap_update_pg(tmpva);
1518 
1519 			pde = (pd_entry_t *)(tmpva + (pdp & ~PG_FRAME));
1520 			*pde = ((paddr_t)i << L3_SHIFT) | pteflags | PG_U |
1521 			    PG_PS | PG_G;
1522 		}
1523 	} else {
1524 		/* Allocate L2. */
1525 		dm_pd = pmap_bootstrap_palloc(n_dm_pdp);
1526 
1527 		/* Zero out the L2 pages. */
1528 		for (i = 0; i < n_dm_pdp; i++) {
1529 			pdp = dm_pd + i * PAGE_SIZE;
1530 			*pte = (pdp & PG_FRAME) | pteflags;
1531 			pmap_update_pg(tmpva);
1532 
1533 			memset((void *)tmpva, 0, PAGE_SIZE);
1534 		}
1535 
1536 		KASSERT(pmap_largepages != 0);
1537 
1538 		/* Large pages are supported. Just create L2. */
1539 		for (i = 0; i < NPDPG * n_dm_pdp; i++) {
1540 			pdp = (paddr_t)&(((pd_entry_t *)dm_pd)[i]);
1541 			*pte = (pdp & PG_FRAME) | pteflags;
1542 			pmap_update_pg(tmpva);
1543 
1544 			pde = (pd_entry_t *)(tmpva + (pdp & ~PG_FRAME));
1545 			*pde = ((paddr_t)i << L2_SHIFT) | pteflags |
1546 			    PG_U | PG_PS | PG_G;
1547 		}
1548 
1549 		/* Fill in the L3 entries, linked to L2. */
1550 		for (i = 0; i < n_dm_pdp; i++) {
1551 			pdp = (paddr_t)&(((pd_entry_t *)dm_pdp)[i]);
1552 			*pte = (pdp & PG_FRAME) | pteflags;
1553 			pmap_update_pg(tmpva);
1554 
1555 			pde = (pd_entry_t *)(tmpva + (pdp & ~PG_FRAME));
1556 			*pde = (dm_pd + (i << PAGE_SHIFT)) | pteflags | PG_U;
1557 		}
1558 	}
1559 
1560 	kpm->pm_pdir[PDIR_SLOT_DIRECT] = dm_pdp | pteflags | PG_U;
1561 
1562 	tlbflush();
1563 }
1564 #endif /* __HAVE_DIRECT_MAP */
1565 
1566 #ifndef XEN
1567 /*
1568  * Remap several kernel segments with large pages. We cover as many pages as we
1569  * can. Called only once at boot time, if the CPU supports large pages.
1570  */
1571 static void
1572 pmap_remap_largepages(void)
1573 {
1574 	extern char __rodata_start;
1575 	extern char __data_start;
1576 	extern char __kernel_end;
1577 	pd_entry_t *pde;
1578 	vaddr_t kva, kva_end;
1579 	paddr_t pa;
1580 
1581 	/* Remap the kernel text using large pages. */
1582 	kva = KERNBASE;
1583 	kva_end = rounddown((vaddr_t)&__rodata_start, NBPD_L1);
1584 	pa = kva - KERNBASE;
1585 	for (/* */; kva + NBPD_L2 <= kva_end; kva += NBPD_L2, pa += NBPD_L2) {
1586 		pde = &L2_BASE[pl2_i(kva)];
1587 		*pde = pa | pmap_pg_g | PG_PS | PG_KR | PG_V;
1588 		tlbflushg();
1589 	}
1590 #if defined(DEBUG)
1591 	aprint_normal("kernel text is mapped with %" PRIuPSIZE " large "
1592 	    "pages and %" PRIuPSIZE " normal pages\n",
1593 	    howmany(kva - KERNBASE, NBPD_L2),
1594 	    howmany((vaddr_t)&__rodata_start - kva, NBPD_L1));
1595 #endif /* defined(DEBUG) */
1596 
1597 	/* Remap the kernel rodata using large pages. */
1598 	kva = roundup((vaddr_t)&__rodata_start, NBPD_L2);
1599 	kva_end = rounddown((vaddr_t)&__data_start, NBPD_L1);
1600 	pa = kva - KERNBASE;
1601 	for (/* */; kva + NBPD_L2 <= kva_end; kva += NBPD_L2, pa += NBPD_L2) {
1602 		pde = &L2_BASE[pl2_i(kva)];
1603 		*pde = pa | pmap_pg_g | PG_PS | pmap_pg_nx | PG_KR | PG_V;
1604 		tlbflushg();
1605 	}
1606 
1607 	/* Remap the kernel data+bss using large pages. */
1608 	/*
1609 	 * XXX: we need to make sure the first page (PAGE_SIZE) of .data is not
1610 	 * mapped with a large page. As bizarre as it might seem, this first
1611 	 * page is used as the VA for the LAPIC page.
1612 	 */
1613 	kva = roundup((vaddr_t)&__data_start+PAGE_SIZE, NBPD_L2);
1614 	kva_end = rounddown((vaddr_t)&__kernel_end, NBPD_L1);
1615 	pa = kva - KERNBASE;
1616 	for (/* */; kva + NBPD_L2 <= kva_end; kva += NBPD_L2, pa += NBPD_L2) {
1617 		pde = &L2_BASE[pl2_i(kva)];
1618 		*pde = pa | pmap_pg_g | PG_PS | pmap_pg_nx | PG_KW | PG_V;
1619 		tlbflushg();
1620 	}
1621 }
1622 #endif /* !XEN */
1623 
1624 /*
1625  * pmap_init: called from uvm_init, our job is to get the pmap
1626  * system ready to manage mappings...
1627  */
1628 
1629 void
1630 pmap_init(void)
1631 {
1632 	int i, flags;
1633 
1634 	for (i = 0; i < PV_HASH_SIZE; i++) {
1635 		SLIST_INIT(&pv_hash_heads[i].hh_list);
1636 	}
1637 	for (i = 0; i < PV_HASH_LOCK_CNT; i++) {
1638 		mutex_init(&pv_hash_locks[i].lock, MUTEX_NODEBUG, IPL_VM);
1639 	}
1640 
1641 	/*
1642 	 * initialize caches.
1643 	 */
1644 
1645 	pool_cache_bootstrap(&pmap_cache, sizeof(struct pmap), 0, 0, 0,
1646 	    "pmappl", NULL, IPL_NONE, NULL, NULL, NULL);
1647 
1648 #ifdef XEN
1649 	/*
1650 	 * pool_cache(9) should not touch cached objects, since they
1651 	 * are pinned on xen and R/O for the domU
1652 	 */
1653 	flags = PR_NOTOUCH;
1654 #else /* XEN */
1655 	flags = 0;
1656 #endif /* XEN */
1657 #ifdef PAE
1658 	pool_cache_bootstrap(&pmap_pdp_cache, PAGE_SIZE * PDP_SIZE, 0, 0, flags,
1659 	    "pdppl", &pmap_pdp_allocator, IPL_NONE,
1660 	    pmap_pdp_ctor, pmap_pdp_dtor, NULL);
1661 #else /* PAE */
1662 	pool_cache_bootstrap(&pmap_pdp_cache, PAGE_SIZE, 0, 0, flags,
1663 	    "pdppl", NULL, IPL_NONE, pmap_pdp_ctor, pmap_pdp_dtor, NULL);
1664 #endif /* PAE */
1665 	pool_cache_bootstrap(&pmap_pv_cache, sizeof(struct pv_entry), 0, 0,
1666 	    PR_LARGECACHE, "pvpl", &pool_allocator_kmem, IPL_NONE, NULL,
1667 	    NULL, NULL);
1668 
1669 	pmap_tlb_init();
1670 
1671 	/* Initialize the boot CPU here, since cpu_hatch() only runs on secondary CPUs. */
1672 	pmap_tlb_cpu_init(curcpu());
1673 
1674 	evcnt_attach_dynamic(&pmap_iobmp_evcnt, EVCNT_TYPE_MISC,
1675 	    NULL, "x86", "io bitmap copy");
1676 	evcnt_attach_dynamic(&pmap_ldt_evcnt, EVCNT_TYPE_MISC,
1677 	    NULL, "x86", "ldt sync");
1678 
1679 	/*
1680 	 * done: pmap module is up (and ready for business)
1681 	 */
1682 
1683 	pmap_initialized = true;
1684 }
1685 
1686 /*
1687  * pmap_cpu_init_late: perform late per-CPU initialization.
1688  */
1689 
1690 #ifndef XEN
1691 void
1692 pmap_cpu_init_late(struct cpu_info *ci)
1693 {
1694 	/*
1695 	 * The BP already has its own PD page allocated during early
1696 	 * MD startup.
1697 	 */
1698 	if (ci == &cpu_info_primary)
1699 		return;
1700 
1701 #ifdef PAE
1702 	cpu_alloc_l3_page(ci);
1703 #endif
1704 }
1705 #endif
1706 
1707 /*
1708  * p v _ e n t r y   f u n c t i o n s
1709  */
1710 
1711 /*
1712  * pmap_free_pvs: free a list of pv_entrys
1713  */
1714 
1715 static void
1716 pmap_free_pvs(struct pv_entry *pve)
1717 {
1718 	struct pv_entry *next;
1719 
1720 	for ( /* null */ ; pve != NULL ; pve = next) {
1721 		next = pve->pve_next;
1722 		pool_cache_put(&pmap_pv_cache, pve);
1723 	}
1724 }
1725 
1726 /*
1727  * main pv_entry manipulation functions:
1728  *   pmap_enter_pv: enter a mapping onto a pv_head list
1729  *   pmap_remove_pv: remove a mapping from a pv_head list
1730  *
1731  * NOTE: Both pmap_enter_pv and pmap_remove_pv expect the caller to lock
1732  *       the pvh before calling
1733  */
1734 
1735 /*
1736  * insert_pv: a helper of pmap_enter_pv
1737  */
1738 
1739 static void
1740 insert_pv(struct pmap_page *pp, struct pv_entry *pve)
1741 {
1742 	struct pv_hash_head *hh;
1743 	kmutex_t *lock;
1744 	u_int hash;
1745 
1746 	hash = pvhash_hash(pve->pve_pte.pte_ptp, pve->pve_pte.pte_va);
1747 	lock = pvhash_lock(hash);
1748 	hh = pvhash_head(hash);
1749 	mutex_spin_enter(lock);
1750 	SLIST_INSERT_HEAD(&hh->hh_list, pve, pve_hash);
1751 	mutex_spin_exit(lock);
1752 
1753 	LIST_INSERT_HEAD(&pp->pp_head.pvh_list, pve, pve_list);
1754 }
1755 
1756 /*
 * pmap_enter_pv: enter a mapping onto a pv_head list
1758  *
1759  * => caller should adjust ptp's wire_count before calling
1760  * => caller has preallocated pve and *sparepve for us
1761  */
1762 
1763 static struct pv_entry *
1764 pmap_enter_pv(struct pmap_page *pp, struct pv_entry *pve,
1765     struct pv_entry **sparepve, struct vm_page *ptp, vaddr_t va)
1766 {
1767 
1768 	KASSERT(ptp == NULL || ptp->wire_count >= 2);
1769 	KASSERT(ptp == NULL || ptp->uobject != NULL);
1770 	KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset);
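	/*
	 * If the page has no mappings yet, record this one directly in
	 * the pmap_page ("embedded" pv entry) and hand the preallocated
	 * pve back to the caller.  If the embedded slot is already in
	 * use, first push its contents onto the list using *sparepve,
	 * then insert the new mapping on the list as well.
	 */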
1771 
1772 	if ((pp->pp_flags & PP_EMBEDDED) == 0) {
1773 		if (LIST_EMPTY(&pp->pp_head.pvh_list)) {
1774 			pp->pp_flags |= PP_EMBEDDED;
1775 			pp->pp_pte.pte_ptp = ptp;
1776 			pp->pp_pte.pte_va = va;
1777 
1778 			return pve;
1779 		}
1780 	} else {
1781 		struct pv_entry *pve2;
1782 
1783 		pve2 = *sparepve;
1784 		*sparepve = NULL;
1785 
1786 		pve2->pve_pte = pp->pp_pte;
1787 		pp->pp_flags &= ~PP_EMBEDDED;
1788 		LIST_INIT(&pp->pp_head.pvh_list);
1789 		insert_pv(pp, pve2);
1790 	}
1791 
1792 	pve->pve_pte.pte_ptp = ptp;
1793 	pve->pve_pte.pte_va = va;
1794 	insert_pv(pp, pve);
1795 
1796 	return NULL;
1797 }
1798 
1799 /*
1800  * pmap_remove_pv: try to remove a mapping from a pv_list
1801  *
1802  * => caller should adjust ptp's wire_count and free PTP if needed
1803  * => we return the removed pve
1804  */
1805 
1806 static struct pv_entry *
1807 pmap_remove_pv(struct pmap_page *pp, struct vm_page *ptp, vaddr_t va)
1808 {
1809 	struct pv_hash_head *hh;
1810 	struct pv_entry *pve;
1811 	kmutex_t *lock;
1812 	u_int hash;
1813 
1814 	KASSERT(ptp == NULL || ptp->uobject != NULL);
1815 	KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset);
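	/*
	 * If the mapping being removed is the embedded one, just clear
	 * PP_EMBEDDED; there is no pv_entry to hand back.  Otherwise
	 * look the entry up in the pv hash, unlink it from both the
	 * hash chain and the page's list, and return it to the caller.
	 */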
1816 
1817 	if ((pp->pp_flags & PP_EMBEDDED) != 0) {
1818 		KASSERT(pp->pp_pte.pte_ptp == ptp);
1819 		KASSERT(pp->pp_pte.pte_va == va);
1820 
1821 		pp->pp_flags &= ~PP_EMBEDDED;
1822 		LIST_INIT(&pp->pp_head.pvh_list);
1823 
1824 		return NULL;
1825 	}
1826 
1827 	hash = pvhash_hash(ptp, va);
1828 	lock = pvhash_lock(hash);
1829 	hh = pvhash_head(hash);
1830 	mutex_spin_enter(lock);
1831 	pve = pvhash_remove(hh, ptp, va);
1832 	mutex_spin_exit(lock);
1833 
1834 	LIST_REMOVE(pve, pve_list);
1835 
1836 	return pve;
1837 }
1838 
1839 /*
1840  * p t p   f u n c t i o n s
1841  */
1842 
1843 static inline struct vm_page *
1844 pmap_find_ptp(struct pmap *pmap, vaddr_t va, paddr_t pa, int level)
1845 {
1846 	int lidx = level - 1;
1847 	struct vm_page *pg;
1848 
1849 	KASSERT(mutex_owned(pmap->pm_lock));
1850 
1851 	if (pa != (paddr_t)-1 && pmap->pm_ptphint[lidx] &&
1852 	    pa == VM_PAGE_TO_PHYS(pmap->pm_ptphint[lidx])) {
1853 		return (pmap->pm_ptphint[lidx]);
1854 	}
1855 	PMAP_SUBOBJ_LOCK(pmap, lidx);
1856 	pg = uvm_pagelookup(&pmap->pm_obj[lidx], ptp_va2o(va, level));
1857 	PMAP_SUBOBJ_UNLOCK(pmap, lidx);
1858 
1859 	KASSERT(pg == NULL || pg->wire_count >= 1);
1860 	return pg;
1861 }
1862 
1863 static inline void
1864 pmap_freepage(struct pmap *pmap, struct vm_page *ptp, int level)
1865 {
1866 	lwp_t *l;
1867 	int lidx;
1868 	struct uvm_object *obj;
1869 
1870 	KASSERT(ptp->wire_count == 1);
1871 
1872 	lidx = level - 1;
1873 
1874 	obj = &pmap->pm_obj[lidx];
1875 	pmap_stats_update(pmap, -1, 0);
1876 	if (lidx != 0)
1877 		mutex_enter(obj->vmobjlock);
1878 	if (pmap->pm_ptphint[lidx] == ptp)
1879 		pmap->pm_ptphint[lidx] = TAILQ_FIRST(&obj->memq);
1880 	ptp->wire_count = 0;
1881 	uvm_pagerealloc(ptp, NULL, 0);
1882 	l = curlwp;
1883 	KASSERT((l->l_pflag & LP_INTR) == 0);
1884 	VM_PAGE_TO_PP(ptp)->pp_link = l->l_md.md_gc_ptp;
1885 	l->l_md.md_gc_ptp = ptp;
1886 	if (lidx != 0)
1887 		mutex_exit(obj->vmobjlock);
1888 }
1889 
1890 static void
1891 pmap_free_ptp(struct pmap *pmap, struct vm_page *ptp, vaddr_t va,
1892 	      pt_entry_t *ptes, pd_entry_t * const *pdes)
1893 {
1894 	unsigned long index;
1895 	int level;
1896 	vaddr_t invaladdr;
1897 	pd_entry_t opde;
1898 
1899 	KASSERT(pmap != pmap_kernel());
1900 	KASSERT(mutex_owned(pmap->pm_lock));
1901 	KASSERT(kpreempt_disabled());
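	/*
	 * Walk up the paging hierarchy: clear the PDE that pointed to
	 * the freed PTP, shoot down the recursive mapping of that PTP,
	 * and repeat with the parent PTP if it has become empty as well.
	 */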
1902 
1903 	level = 1;
1904 	do {
1905 		index = pl_i(va, level + 1);
1906 		opde = pmap_pte_testset(&pdes[level - 1][index], 0);
1907 #if defined(XEN)
1908 #  if defined(__x86_64__)
1909 		/*
1910 		 * If ptp is a L3 currently mapped in kernel space,
1911 		 * on any cpu, clear it before freeing
1912 		 */
1913 		if (level == PTP_LEVELS - 1) {
1914 			/*
1915 			 * Update the per-cpu PD on all cpus the current
1916 			 * pmap is active on
1917 			 */
1918 			xen_kpm_sync(pmap, index);
1919 		}
1920 #  endif /*__x86_64__ */
1921 		invaladdr = level == 1 ? (vaddr_t)ptes :
1922 		    (vaddr_t)pdes[level - 2];
1923 		pmap_tlb_shootdown(pmap, invaladdr + index * PAGE_SIZE,
1924 		    opde, TLBSHOOT_FREE_PTP1);
1925 		pmap_tlb_shootnow();
1926 #else	/* XEN */
1927 		invaladdr = level == 1 ? (vaddr_t)ptes :
1928 		    (vaddr_t)pdes[level - 2];
1929 		pmap_tlb_shootdown(pmap, invaladdr + index * PAGE_SIZE,
1930 		    opde, TLBSHOOT_FREE_PTP1);
1931 #endif	/* XEN */
1932 		pmap_freepage(pmap, ptp, level);
1933 		if (level < PTP_LEVELS - 1) {
1934 			ptp = pmap_find_ptp(pmap, va, (paddr_t)-1, level + 1);
1935 			ptp->wire_count--;
1936 			if (ptp->wire_count > 1)
1937 				break;
1938 		}
1939 	} while (++level < PTP_LEVELS);
1940 	pmap_pte_flush();
1941 }
1942 
1943 /*
1944  * pmap_get_ptp: get a PTP (if there isn't one, allocate a new one)
1945  *
1946  * => pmap should NOT be pmap_kernel()
1947  * => pmap should be locked
1948  * => preemption should be disabled
1949  */
1950 
1951 static struct vm_page *
1952 pmap_get_ptp(struct pmap *pmap, vaddr_t va, pd_entry_t * const *pdes)
1953 {
1954 	struct vm_page *ptp, *pptp;
1955 	int i;
1956 	unsigned long index;
1957 	pd_entry_t *pva;
1958 	paddr_t ppa, pa;
1959 	struct uvm_object *obj;
1960 
1961 	KASSERT(pmap != pmap_kernel());
1962 	KASSERT(mutex_owned(pmap->pm_lock));
1963 	KASSERT(kpreempt_disabled());
1964 
1965 	ptp = NULL;
1966 	pa = (paddr_t)-1;
1967 
1968 	/*
1969 	 * Loop through all page table levels seeing if we need to
1970 	 * add a new page to that level.
1971 	 */
1972 	for (i = PTP_LEVELS; i > 1; i--) {
1973 		/*
1974 		 * Save values from previous round.
1975 		 */
1976 		pptp = ptp;
1977 		ppa = pa;
1978 
1979 		index = pl_i(va, i);
1980 		pva = pdes[i - 2];
1981 
1982 		if (pmap_valid_entry(pva[index])) {
1983 			ppa = pmap_pte2pa(pva[index]);
1984 			ptp = NULL;
1985 			continue;
1986 		}
1987 
1988 		obj = &pmap->pm_obj[i-2];
1989 		PMAP_SUBOBJ_LOCK(pmap, i - 2);
1990 		ptp = uvm_pagealloc(obj, ptp_va2o(va, i - 1), NULL,
1991 		    UVM_PGA_USERESERVE|UVM_PGA_ZERO);
1992 		PMAP_SUBOBJ_UNLOCK(pmap, i - 2);
1993 
1994 		if (ptp == NULL)
1995 			return NULL;
1996 
1997 		ptp->flags &= ~PG_BUSY; /* never busy */
1998 		ptp->wire_count = 1;
1999 		pmap->pm_ptphint[i - 2] = ptp;
2000 		pa = VM_PAGE_TO_PHYS(ptp);
2001 		pmap_pte_set(&pva[index], (pd_entry_t)
2002 		        (pmap_pa2pte(pa) | PG_u | PG_RW | PG_V));
2003 #if defined(XEN) && defined(__x86_64__)
		if (i == PTP_LEVELS) {
2005 			/*
2006 			 * Update the per-cpu PD on all cpus the current
2007 			 * pmap is active on
2008 			 */
2009 			xen_kpm_sync(pmap, index);
2010 		}
2011 #endif
2012 		pmap_pte_flush();
2013 		pmap_stats_update(pmap, 1, 0);
2014 		/*
2015 		 * If we're not in the top level, increase the
2016 		 * wire count of the parent page.
2017 		 */
2018 		if (i < PTP_LEVELS) {
2019 			if (pptp == NULL) {
2020 				pptp = pmap_find_ptp(pmap, va, ppa, i);
2021 				KASSERT(pptp != NULL);
2022 			}
2023 			pptp->wire_count++;
2024 		}
2025 	}
2026 
2027 	/*
2028 	 * PTP is not NULL if we just allocated a new PTP.  If it is
2029 	 * still NULL, we must look up the existing one.
2030 	 */
2031 	if (ptp == NULL) {
2032 		ptp = pmap_find_ptp(pmap, va, ppa, 1);
2033 		KASSERTMSG(ptp != NULL, "pmap_get_ptp: va %" PRIxVADDR
2034 		    "ppa %" PRIxPADDR "\n", va, ppa);
2035 	}
2036 
2037 	pmap->pm_ptphint[0] = ptp;
2038 	return ptp;
2039 }
2040 
2041 /*
2042  * p m a p   l i f e c y c l e   f u n c t i o n s
2043  */
2044 
2045 /*
2046  * pmap_pdp_ctor: constructor for the PDP cache.
2047  */
2048 static int
2049 pmap_pdp_ctor(void *arg, void *v, int flags)
2050 {
2051 	pd_entry_t *pdir = v;
2052 	paddr_t pdirpa = 0;
2053 	vaddr_t object;
2054 	int i;
2055 
2056 #if !defined(XEN) || !defined(__x86_64__)
2057 	int npde;
2058 #endif
2059 #ifdef XEN
2060 	int s;
2061 #endif
2062 
2063 	/*
2064 	 * NOTE: The `pmaps_lock' is held when the PDP is allocated.
2065 	 */
2066 
2067 #if defined(XEN) && defined(__x86_64__)
2068 	/* Fetch the physical address of the page directory */
2069 	(void)pmap_extract(pmap_kernel(), (vaddr_t)pdir, &pdirpa);
2070 
2071 	/* Zero the area */
2072 	memset(pdir, 0, PAGE_SIZE); /* Xen wants a clean page */
2073 
2074 	/*
2075 	 * This pdir will NEVER be active in kernel mode, so mark
2076 	 * recursive entry invalid.
2077 	 */
2078 	pdir[PDIR_SLOT_PTE] = pmap_pa2pte(pdirpa) | PG_u;
2079 
2080 	/*
2081 	 * PDP constructed this way won't be for the kernel, hence we
2082 	 * don't put kernel mappings on Xen.
2083 	 *
2084 	 * But we need to make pmap_create() happy, so put a dummy
2085 	 * (without PG_V) value at the right place.
2086 	 */
2087 	pdir[PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1] - 1] =
2088 	     (pd_entry_t)-1 & PG_FRAME;
2089 #else /* XEN && __x86_64__*/
2090 	/* Zero the area */
2091 	memset(pdir, 0, PDIR_SLOT_PTE * sizeof(pd_entry_t));
2092 
2093 	object = (vaddr_t)v;
2094 	for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) {
2095 		/* Fetch the physical address of the page directory */
2096 		(void)pmap_extract(pmap_kernel(), object, &pdirpa);
2097 
2098 		/* Put in recursive PDE to map the PTEs */
2099 		pdir[PDIR_SLOT_PTE + i] = pmap_pa2pte(pdirpa) | PG_V |
2100 		    pmap_pg_nx;
2101 #ifndef XEN
2102 		pdir[PDIR_SLOT_PTE + i] |= PG_KW;
2103 #endif
2104 	}
2105 
2106 	/* Copy the kernel's top level PDE */
2107 	npde = nkptp[PTP_LEVELS - 1];
2108 
2109 	memcpy(&pdir[PDIR_SLOT_KERN], &PDP_BASE[PDIR_SLOT_KERN],
2110 	    npde * sizeof(pd_entry_t));
2111 
2112 	/* Zero the rest */
2113 	memset(&pdir[PDIR_SLOT_KERN + npde], 0, (PAGE_SIZE * PDP_SIZE) -
2114 	    (PDIR_SLOT_KERN + npde) * sizeof(pd_entry_t));
2115 
2116 	if (VM_MIN_KERNEL_ADDRESS != KERNBASE) {
2117 		int idx = pl_i(KERNBASE, PTP_LEVELS);
2118 		pdir[idx] = PDP_BASE[idx];
2119 	}
2120 
2121 #ifdef __HAVE_DIRECT_MAP
2122 	pdir[PDIR_SLOT_DIRECT] = PDP_BASE[PDIR_SLOT_DIRECT];
2123 #endif
2124 #endif /* XEN  && __x86_64__*/
2125 
2126 #ifdef XEN
2127 	s = splvm();
2128 	object = (vaddr_t)v;
2129 	pmap_protect(pmap_kernel(), object, object + (PAGE_SIZE * PDP_SIZE),
2130 	    VM_PROT_READ);
2131 	pmap_update(pmap_kernel());
2132 	for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) {
2133 		/*
		 * Pin as an L2/L4 page.  The page holding the
		 * PDIR_SLOT_PTE entries must be done last.
2136 		 */
2137 #ifdef PAE
2138 		if (i == l2tol3(PDIR_SLOT_PTE))
2139 			continue;
2140 #endif
2141 
2142 		(void) pmap_extract(pmap_kernel(), object, &pdirpa);
2143 #ifdef __x86_64__
2144 		xpq_queue_pin_l4_table(xpmap_ptom_masked(pdirpa));
2145 #else
2146 		xpq_queue_pin_l2_table(xpmap_ptom_masked(pdirpa));
2147 #endif
2148 	}
2149 #ifdef PAE
2150 	object = ((vaddr_t)pdir) + PAGE_SIZE  * l2tol3(PDIR_SLOT_PTE);
2151 	(void)pmap_extract(pmap_kernel(), object, &pdirpa);
2152 	xpq_queue_pin_l2_table(xpmap_ptom_masked(pdirpa));
2153 #endif
2154 	splx(s);
2155 #endif /* XEN */
2156 
2157 	return (0);
2158 }
2159 
2160 /*
2161  * pmap_pdp_dtor: destructor for the PDP cache.
2162  */
2163 
2164 static void
2165 pmap_pdp_dtor(void *arg, void *v)
2166 {
2167 #ifdef XEN
2168 	paddr_t pdirpa = 0;	/* XXX: GCC */
2169 	vaddr_t object = (vaddr_t)v;
2170 	int i;
2171 	int s = splvm();
2172 	pt_entry_t *pte;
2173 
2174 	for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) {
2175 		/* fetch the physical address of the page directory. */
2176 		(void) pmap_extract(pmap_kernel(), object, &pdirpa);
2177 		/* unpin page table */
2178 		xpq_queue_unpin_table(xpmap_ptom_masked(pdirpa));
2179 	}
2180 	object = (vaddr_t)v;
2181 	for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) {
2182 		/* Set page RW again */
2183 		pte = kvtopte(object);
2184 		pmap_pte_set(pte, *pte | PG_RW);
2185 		xen_bcast_invlpg((vaddr_t)object);
2186 	}
2187 	splx(s);
2188 #endif  /* XEN */
2189 }
2190 
2191 #ifdef PAE
2192 
2193 /* pmap_pdp_alloc: Allocate a page for the pdp memory pool. */
2194 
2195 static void *
2196 pmap_pdp_alloc(struct pool *pp, int flags)
2197 {
2198 	return (void *)uvm_km_alloc(kernel_map,
2199 	    PAGE_SIZE * PDP_SIZE, PAGE_SIZE * PDP_SIZE,
2200 	    ((flags & PR_WAITOK) ? 0 : UVM_KMF_NOWAIT | UVM_KMF_TRYLOCK)
2201 	    | UVM_KMF_WIRED);
2202 }
2203 
2204 /*
2205  * pmap_pdp_free: free a PDP
2206  */
2207 
2208 static void
2209 pmap_pdp_free(struct pool *pp, void *v)
2210 {
2211 	uvm_km_free(kernel_map, (vaddr_t)v, PAGE_SIZE * PDP_SIZE,
2212 	    UVM_KMF_WIRED);
2213 }
2214 #endif /* PAE */
2215 
2216 /*
2217  * pmap_create: create a pmap object.
2218  */
2219 struct pmap *
2220 pmap_create(void)
2221 {
2222 	struct pmap *pmap;
2223 	int i;
2224 
2225 	pmap = pool_cache_get(&pmap_cache, PR_WAITOK);
2226 
2227 	/* init uvm_object */
2228 	for (i = 0; i < PTP_LEVELS - 1; i++) {
2229 		mutex_init(&pmap->pm_obj_lock[i], MUTEX_DEFAULT, IPL_NONE);
2230 		uvm_obj_init(&pmap->pm_obj[i], NULL, false, 1);
2231 		uvm_obj_setlock(&pmap->pm_obj[i], &pmap->pm_obj_lock[i]);
2232 		pmap->pm_ptphint[i] = NULL;
2233 	}
2234 	pmap->pm_stats.wired_count = 0;
	/* count the PDP allocated below */
2236 	pmap->pm_stats.resident_count = PDP_SIZE;
2237 #if !defined(__x86_64__)
2238 	pmap->pm_hiexec = 0;
2239 #endif /* !defined(__x86_64__) */
2240 	pmap->pm_flags = 0;
2241 	pmap->pm_gc_ptp = NULL;
2242 
2243 	kcpuset_create(&pmap->pm_cpus, true);
2244 	kcpuset_create(&pmap->pm_kernel_cpus, true);
2245 #ifdef XEN
2246 	kcpuset_create(&pmap->pm_xen_ptp_cpus, true);
2247 #endif
2248 	/* init the LDT */
2249 	pmap->pm_ldt = NULL;
2250 	pmap->pm_ldt_len = 0;
2251 	pmap->pm_ldt_sel = GSYSSEL(GLDT_SEL, SEL_KPL);
2252 
2253 	/* allocate PDP */
2254  try_again:
2255 	pmap->pm_pdir = pool_cache_get(&pmap_pdp_cache, PR_WAITOK);
2256 
2257 	mutex_enter(&pmaps_lock);
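	/*
	 * A PDP taken from the cache may have been constructed before
	 * the kernel page directory last grew.  If the last kernel slot
	 * is still empty, the cached copy is stale: destruct the object
	 * so the ctor runs again with the current nkptp, and retry.
	 */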
2258 
2259 	if (pmap->pm_pdir[PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1] - 1] == 0) {
2260 		mutex_exit(&pmaps_lock);
2261 		pool_cache_destruct_object(&pmap_pdp_cache, pmap->pm_pdir);
2262 		goto try_again;
2263 	}
2264 
2265 	for (i = 0; i < PDP_SIZE; i++)
2266 		pmap->pm_pdirpa[i] =
2267 		    pmap_pte2pa(pmap->pm_pdir[PDIR_SLOT_PTE + i]);
2268 
2269 	LIST_INSERT_HEAD(&pmaps, pmap, pm_list);
2270 
2271 	mutex_exit(&pmaps_lock);
2272 
2273 	return (pmap);
2274 }
2275 
2276 /*
2277  * pmap_free_ptps: put a list of ptps back to the freelist.
2278  */
2279 
2280 static void
2281 pmap_free_ptps(struct vm_page *empty_ptps)
2282 {
2283 	struct vm_page *ptp;
2284 	struct pmap_page *pp;
2285 
2286 	while ((ptp = empty_ptps) != NULL) {
2287 		pp = VM_PAGE_TO_PP(ptp);
2288 		empty_ptps = pp->pp_link;
2289 		LIST_INIT(&pp->pp_head.pvh_list);
2290 		uvm_pagefree(ptp);
2291 	}
2292 }
2293 
2294 /*
2295  * pmap_destroy: drop reference count on pmap.   free pmap if
2296  *	reference count goes to zero.
2297  */
2298 
2299 void
2300 pmap_destroy(struct pmap *pmap)
2301 {
2302 	lwp_t *l;
2303 	int i;
2304 
2305 	/*
2306 	 * If we have torn down this pmap, process deferred frees and
2307 	 * invalidations.  Free now if the system is low on memory.
2308 	 * Otherwise, free when the pmap is destroyed thus avoiding a
2309 	 * TLB shootdown.
2310 	 */
2311 	l = curlwp;
2312 	if (__predict_false(l->l_md.md_gc_pmap == pmap)) {
2313 		if (uvmexp.free < uvmexp.freetarg) {
2314 			pmap_update(pmap);
2315 		} else {
2316 			KASSERT(pmap->pm_gc_ptp == NULL);
2317 			pmap->pm_gc_ptp = l->l_md.md_gc_ptp;
2318 			l->l_md.md_gc_ptp = NULL;
2319 			l->l_md.md_gc_pmap = NULL;
2320 		}
2321 	}
2322 
2323 	/*
2324 	 * drop reference count
2325 	 */
2326 
2327 	if (atomic_dec_uint_nv(&pmap->pm_obj[0].uo_refs) > 0) {
2328 		return;
2329 	}
2330 
2331 #ifdef DIAGNOSTIC
2332 	CPU_INFO_ITERATOR cii;
2333 	struct cpu_info *ci;
2334 
2335 	for (CPU_INFO_FOREACH(cii, ci)) {
2336 		if (ci->ci_pmap == pmap)
2337 			panic("destroying pmap being used");
2338 #if defined(XEN) && defined(__x86_64__)
2339 		for (i = 0; i < PDIR_SLOT_PTE; i++) {
2340 			if (pmap->pm_pdir[i] != 0 &&
2341 			    ci->ci_kpm_pdir[i] == pmap->pm_pdir[i]) {
2342 				printf("pmap_destroy(%p) pmap_kernel %p "
2343 				    "curcpu %d cpu %d ci_pmap %p "
2344 				    "ci->ci_kpm_pdir[%d]=%" PRIx64
2345 				    " pmap->pm_pdir[%d]=%" PRIx64 "\n",
2346 				    pmap, pmap_kernel(), curcpu()->ci_index,
2347 				    ci->ci_index, ci->ci_pmap,
2348 				    i, ci->ci_kpm_pdir[i],
2349 				    i, pmap->pm_pdir[i]);
2350 				panic("pmap_destroy: used pmap");
2351 			}
2352 		}
2353 #endif
2354 	}
2355 #endif /* DIAGNOSTIC */
2356 
2357 	/*
2358 	 * Reference count is zero, free pmap resources and then free pmap.
2359 	 * First, remove it from global list of pmaps.
2360 	 */
2361 
2362 	mutex_enter(&pmaps_lock);
2363 	LIST_REMOVE(pmap, pm_list);
2364 	mutex_exit(&pmaps_lock);
2365 
2366 	/*
2367 	 * Process deferred PTP frees.  No TLB shootdown required, as the
2368 	 * PTP pages are no longer visible to any CPU.
2369 	 */
2370 
2371 	pmap_free_ptps(pmap->pm_gc_ptp);
2372 
2373 	/*
2374 	 * destroyed pmap shouldn't have remaining PTPs
2375 	 */
2376 
2377 	for (i = 0; i < PTP_LEVELS - 1; i++) {
2378 		KASSERT(pmap->pm_obj[i].uo_npages == 0);
2379 		KASSERT(TAILQ_EMPTY(&pmap->pm_obj[i].memq));
2380 	}
2381 
2382 	pool_cache_put(&pmap_pdp_cache, pmap->pm_pdir);
2383 
2384 #ifdef USER_LDT
2385 	if (pmap->pm_ldt != NULL) {
2386 		/*
2387 		 * no need to switch the LDT; this address space is gone,
2388 		 * nothing is using it.
2389 		 *
2390 		 * No need to lock the pmap for ldt_free (or anything else),
2391 		 * we're the last one to use it.
2392 		 */
2393 		mutex_enter(&cpu_lock);
2394 		ldt_free(pmap->pm_ldt_sel);
2395 		mutex_exit(&cpu_lock);
2396 		uvm_km_free(kernel_map, (vaddr_t)pmap->pm_ldt,
2397 		    pmap->pm_ldt_len, UVM_KMF_WIRED);
2398 	}
2399 #endif
2400 
2401 	for (i = 0; i < PTP_LEVELS - 1; i++) {
2402 		uvm_obj_destroy(&pmap->pm_obj[i], false);
2403 		mutex_destroy(&pmap->pm_obj_lock[i]);
2404 	}
2405 	kcpuset_destroy(pmap->pm_cpus);
2406 	kcpuset_destroy(pmap->pm_kernel_cpus);
2407 #ifdef XEN
2408 	kcpuset_destroy(pmap->pm_xen_ptp_cpus);
2409 #endif
2410 	pool_cache_put(&pmap_cache, pmap);
2411 }
2412 
2413 /*
2414  * pmap_remove_all: pmap is being torn down by the current thread.
2415  * avoid unnecessary invalidations.
2416  */
2417 
2418 void
2419 pmap_remove_all(struct pmap *pmap)
2420 {
2421 	lwp_t *l = curlwp;
2422 
2423 	KASSERT(l->l_md.md_gc_pmap == NULL);
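	/*
	 * Just record that this thread is tearing the pmap down;
	 * pmap_destroy() checks md_gc_pmap and defers freeing the PTPs,
	 * which avoids a TLB shootdown when memory is not tight.
	 */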
2424 
2425 	l->l_md.md_gc_pmap = pmap;
2426 }
2427 
2428 #if defined(PMAP_FORK)
2429 /*
2430  * pmap_fork: perform any necessary data structure manipulation when
2431  * a VM space is forked.
2432  */
2433 
2434 void
2435 pmap_fork(struct pmap *pmap1, struct pmap *pmap2)
2436 {
2437 #ifdef USER_LDT
2438 	union descriptor *new_ldt;
2439 	size_t len;
2440 	int sel;
2441 
2442 	if (__predict_true(pmap1->pm_ldt == NULL)) {
2443 		return;
2444 	}
2445 
2446  retry:
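	/*
	 * The new LDT is allocated before cpu_lock is taken.  If the
	 * source LDT's length changes before the copy is made below,
	 * the allocation is freed and we retry with the new length.
	 */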
2447 	if (pmap1->pm_ldt != NULL) {
2448 		len = pmap1->pm_ldt_len;
2449 		new_ldt = (union descriptor *)uvm_km_alloc(kernel_map, len, 0,
2450 		    UVM_KMF_WIRED);
2451 		mutex_enter(&cpu_lock);
2452 		sel = ldt_alloc(new_ldt, len);
2453 		if (sel == -1) {
2454 			mutex_exit(&cpu_lock);
2455 			uvm_km_free(kernel_map, (vaddr_t)new_ldt, len,
2456 			    UVM_KMF_WIRED);
2457 			printf("WARNING: pmap_fork: unable to allocate LDT\n");
2458 			return;
2459 		}
2460 	} else {
2461 		len = -1;
2462 		new_ldt = NULL;
2463 		sel = -1;
2464 		mutex_enter(&cpu_lock);
2465 	}
2466 
2467  	/* Copy the LDT, if necessary. */
2468  	if (pmap1->pm_ldt != NULL) {
2469 		if (len != pmap1->pm_ldt_len) {
2470 			if (len != -1) {
2471 				ldt_free(sel);
2472 				uvm_km_free(kernel_map, (vaddr_t)new_ldt,
2473 				    len, UVM_KMF_WIRED);
2474 			}
2475 			mutex_exit(&cpu_lock);
2476 			goto retry;
2477 		}
2478 
2479 		memcpy(new_ldt, pmap1->pm_ldt, len);
2480 		pmap2->pm_ldt = new_ldt;
2481 		pmap2->pm_ldt_len = pmap1->pm_ldt_len;
2482 		pmap2->pm_ldt_sel = sel;
2483 		len = -1;
2484 	}
2485 
2486 	if (len != -1) {
2487 		ldt_free(sel);
2488 		uvm_km_free(kernel_map, (vaddr_t)new_ldt, len,
2489 		    UVM_KMF_WIRED);
2490 	}
2491 	mutex_exit(&cpu_lock);
2492 #endif /* USER_LDT */
2493 }
2494 #endif /* PMAP_FORK */
2495 
2496 #ifdef USER_LDT
2497 
2498 /*
2499  * pmap_ldt_xcall: cross call used by pmap_ldt_sync.  if the named pmap
2500  * is active, reload LDTR.
2501  */
2502 static void
2503 pmap_ldt_xcall(void *arg1, void *arg2)
2504 {
2505 	struct pmap *pm;
2506 
2507 	kpreempt_disable();
2508 	pm = arg1;
2509 	if (curcpu()->ci_pmap == pm) {
2510 		lldt(pm->pm_ldt_sel);
2511 	}
2512 	kpreempt_enable();
2513 }
2514 
2515 /*
2516  * pmap_ldt_sync: LDT selector for the named pmap is changing.  swap
2517  * in the new selector on all CPUs.
2518  */
2519 void
2520 pmap_ldt_sync(struct pmap *pm)
2521 {
2522 	uint64_t where;
2523 
2524 	KASSERT(mutex_owned(&cpu_lock));
2525 
2526 	pmap_ldt_evcnt.ev_count++;
2527 	where = xc_broadcast(0, pmap_ldt_xcall, pm, NULL);
2528 	xc_wait(where);
2529 }
2530 
2531 /*
2532  * pmap_ldt_cleanup: if the pmap has a local LDT, deallocate it, and
2533  * restore the default.
2534  */
2535 
2536 void
2537 pmap_ldt_cleanup(struct lwp *l)
2538 {
2539 	pmap_t pmap = l->l_proc->p_vmspace->vm_map.pmap;
2540 	union descriptor *dp = NULL;
2541 	size_t len = 0;
2542 	int sel = -1;
2543 
2544 	if (__predict_true(pmap->pm_ldt == NULL)) {
2545 		return;
2546 	}
2547 
2548 	mutex_enter(&cpu_lock);
2549 	if (pmap->pm_ldt != NULL) {
2550 		sel = pmap->pm_ldt_sel;
2551 		dp = pmap->pm_ldt;
2552 		len = pmap->pm_ldt_len;
2553 		pmap->pm_ldt_sel = GSYSSEL(GLDT_SEL, SEL_KPL);
2554 		pmap->pm_ldt = NULL;
2555 		pmap->pm_ldt_len = 0;
2556 		pmap_ldt_sync(pmap);
2557 		ldt_free(sel);
2558 		uvm_km_free(kernel_map, (vaddr_t)dp, len, UVM_KMF_WIRED);
2559 	}
2560 	mutex_exit(&cpu_lock);
2561 }
2562 #endif /* USER_LDT */
2563 
2564 /*
2565  * pmap_activate: activate a process' pmap
2566  *
2567  * => must be called with kernel preemption disabled
2568  * => if lwp is the curlwp, then set ci_want_pmapload so that
2569  *    actual MMU context switch will be done by pmap_load() later
2570  */
2571 
2572 void
2573 pmap_activate(struct lwp *l)
2574 {
2575 	struct cpu_info *ci;
2576 	struct pmap *pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map);
2577 
2578 	KASSERT(kpreempt_disabled());
2579 
2580 	ci = curcpu();
2581 
2582 	if (l == ci->ci_curlwp) {
2583 		KASSERT(ci->ci_want_pmapload == 0);
2584 		KASSERT(ci->ci_tlbstate != TLBSTATE_VALID);
2585 #ifdef KSTACK_CHECK_DR0
2586 		/*
2587 		 * setup breakpoint on the top of stack
2588 		 */
2589 		if (l == &lwp0)
2590 			dr0(0, 0, 0, 0);
2591 		else
2592 			dr0(KSTACK_LOWEST_ADDR(l), 1, 3, 1);
2593 #endif
2594 
2595 		/*
2596 		 * no need to switch to kernel vmspace because
2597 		 * it's a subset of any vmspace.
2598 		 */
2599 
2600 		if (pmap == pmap_kernel()) {
2601 			ci->ci_want_pmapload = 0;
2602 			return;
2603 		}
2604 
2605 		ci->ci_want_pmapload = 1;
2606 	}
2607 }
2608 
2609 /*
2610  * pmap_reactivate: try to regain reference to the pmap.
2611  *
2612  * => Must be called with kernel preemption disabled.
2613  */
2614 
2615 static bool
2616 pmap_reactivate(struct pmap *pmap)
2617 {
2618 	struct cpu_info * const ci = curcpu();
2619 	const cpuid_t cid = cpu_index(ci);
2620 	bool result;
2621 
2622 	KASSERT(kpreempt_disabled());
2623 #if defined(XEN) && defined(__x86_64__)
2624 	KASSERT(pmap_pdirpa(pmap, 0) == ci->ci_xen_current_user_pgd);
2625 #elif defined(PAE)
2626 	KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(ci->ci_pae_l3_pdir[0]));
2627 #elif !defined(XEN)
2628 	KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(rcr3()));
2629 #endif
2630 
2631 	/*
2632 	 * If we still have a lazy reference to this pmap, we can assume
2633 	 * that there was no TLB shootdown for this pmap in the meantime.
2634 	 *
2635 	 * The order of events here is important as we must synchronize
2636 	 * with TLB shootdown interrupts.  Declare interest in invalidations
2637 	 * (TLBSTATE_VALID) and then check the CPU set, which the IPIs can
2638 	 * change only when the state is TLBSTATE_LAZY.
2639 	 */
2640 
2641 	ci->ci_tlbstate = TLBSTATE_VALID;
2642 	KASSERT(kcpuset_isset(pmap->pm_kernel_cpus, cid));
2643 
2644 	if (kcpuset_isset(pmap->pm_cpus, cid)) {
2645 		/* We have the reference, state is valid. */
2646 		result = true;
2647 	} else {
2648 		/* Must reload the TLB. */
2649 		kcpuset_atomic_set(pmap->pm_cpus, cid);
2650 		result = false;
2651 	}
2652 	return result;
2653 }
2654 
2655 /*
2656  * pmap_load: perform the actual pmap switch, i.e. fill in %cr3 register
2657  * and relevant LDT info.
2658  *
2659  * Ensures that the current process' pmap is loaded on the current CPU's
2660  * MMU and that there are no stale TLB entries.
2661  *
2662  * => The caller should disable kernel preemption or do check-and-retry
2663  *    to prevent a preemption from undoing our efforts.
2664  * => This function may block.
2665  */
2666 void
2667 pmap_load(void)
2668 {
2669 	struct cpu_info *ci;
2670 	struct pmap *pmap, *oldpmap;
2671 	struct lwp *l;
2672 	struct pcb *pcb;
2673 	cpuid_t cid;
2674 	uint64_t ncsw;
2675 
2676 	kpreempt_disable();
2677  retry:
2678 	ci = curcpu();
2679 	if (!ci->ci_want_pmapload) {
2680 		kpreempt_enable();
2681 		return;
2682 	}
2683 	l = ci->ci_curlwp;
2684 	ncsw = l->l_ncsw;
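	/*
	 * Remember the context switch count; if we block later (e.g. in
	 * pmap_destroy()), we go around again (see the check at the end).
	 */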
2685 
2686 	/* should be able to take ipis. */
2687 	KASSERT(ci->ci_ilevel < IPL_HIGH);
2688 #ifdef XEN
	/* Check to see if interrupts are enabled (i.e., no events are masked) */
2690 	KASSERT(x86_read_psl() == 0);
2691 #else
2692 	KASSERT((x86_read_psl() & PSL_I) != 0);
2693 #endif
2694 
2695 	KASSERT(l != NULL);
2696 	pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map);
2697 	KASSERT(pmap != pmap_kernel());
2698 	oldpmap = ci->ci_pmap;
2699 	pcb = lwp_getpcb(l);
2700 
2701 	if (pmap == oldpmap) {
2702 		if (!pmap_reactivate(pmap)) {
2703 			u_int gen = uvm_emap_gen_return();
2704 
2705 			/*
			 * The pmap was changed while it was deactivated,
			 * so our TLB may be stale.
2708 			 */
2709 
2710 			tlbflush();
2711 			uvm_emap_update(gen);
2712 		}
2713 
2714 		ci->ci_want_pmapload = 0;
2715 		kpreempt_enable();
2716 		return;
2717 	}
2718 
2719 	/*
2720 	 * Acquire a reference to the new pmap and perform the switch.
2721 	 */
2722 
2723 	pmap_reference(pmap);
2724 
2725 	cid = cpu_index(ci);
2726 	kcpuset_atomic_clear(oldpmap->pm_cpus, cid);
2727 	kcpuset_atomic_clear(oldpmap->pm_kernel_cpus, cid);
2728 
2729 #if defined(XEN) && defined(__x86_64__)
2730 	KASSERT(pmap_pdirpa(oldpmap, 0) == ci->ci_xen_current_user_pgd ||
2731 	    oldpmap == pmap_kernel());
2732 #elif defined(PAE)
2733 	KASSERT(pmap_pdirpa(oldpmap, 0) == pmap_pte2pa(ci->ci_pae_l3_pdir[0]));
2734 #elif !defined(XEN)
2735 	KASSERT(pmap_pdirpa(oldpmap, 0) == pmap_pte2pa(rcr3()));
2736 #endif
2737 	KASSERT(!kcpuset_isset(pmap->pm_cpus, cid));
2738 	KASSERT(!kcpuset_isset(pmap->pm_kernel_cpus, cid));
2739 
2740 	/*
2741 	 * Mark the pmap in use by this CPU.  Again, we must synchronize
2742 	 * with TLB shootdown interrupts, so set the state VALID first,
2743 	 * then register us for shootdown events on this pmap.
2744 	 */
2745 	ci->ci_tlbstate = TLBSTATE_VALID;
2746 	kcpuset_atomic_set(pmap->pm_cpus, cid);
2747 	kcpuset_atomic_set(pmap->pm_kernel_cpus, cid);
2748 	ci->ci_pmap = pmap;
2749 
2750 	/*
2751 	 * update tss.  now that we have registered for invalidations
2752 	 * from other CPUs, we're good to load the page tables.
2753 	 */
2754 #ifdef PAE
2755 	pcb->pcb_cr3 = ci->ci_pae_l3_pdirpa;
2756 #else
2757 	pcb->pcb_cr3 = pmap_pdirpa(pmap, 0);
2758 #endif
2759 
2760 #ifdef i386
2761 #ifndef XEN
2762 	ci->ci_tss.tss_ldt = pmap->pm_ldt_sel;
2763 	ci->ci_tss.tss_cr3 = pcb->pcb_cr3;
2764 #endif /* !XEN */
2765 #endif /* i386 */
2766 
2767 	lldt(pmap->pm_ldt_sel);
2768 
2769 	u_int gen = uvm_emap_gen_return();
2770 	cpu_load_pmap(pmap, oldpmap);
2771 	uvm_emap_update(gen);
2772 
2773 	ci->ci_want_pmapload = 0;
2774 
2775 	/*
2776 	 * we're now running with the new pmap.  drop the reference
2777 	 * to the old pmap.  if we block, we need to go around again.
2778 	 */
2779 
2780 	pmap_destroy(oldpmap);
2781 	if (l->l_ncsw != ncsw) {
2782 		goto retry;
2783 	}
2784 
2785 	kpreempt_enable();
2786 }
2787 
2788 /*
2789  * pmap_deactivate: deactivate a process' pmap.
2790  *
2791  * => Must be called with kernel preemption disabled (high IPL is enough).
2792  */
2793 void
2794 pmap_deactivate(struct lwp *l)
2795 {
2796 	struct pmap *pmap;
2797 	struct cpu_info *ci;
2798 
2799 	KASSERT(kpreempt_disabled());
2800 
2801 	if (l != curlwp) {
2802 		return;
2803 	}
2804 
2805 	/*
2806 	 * Wait for pending TLB shootdowns to complete.  Necessary because
2807 	 * TLB shootdown state is per-CPU, and the LWP may be coming off
2808 	 * the CPU before it has a chance to call pmap_update(), e.g. due
2809 	 * to kernel preemption or blocking routine in between.
2810 	 */
2811 	pmap_tlb_shootnow();
2812 
2813 	ci = curcpu();
2814 
2815 	if (ci->ci_want_pmapload) {
2816 		/*
		 * ci_want_pmapload means that our pmap is not loaded on
		 * the CPU, or that the TLB might be stale.  Note that
		 * pmap_kernel() is always considered loaded.
2820 		 */
2821 		KASSERT(vm_map_pmap(&l->l_proc->p_vmspace->vm_map)
2822 		    != pmap_kernel());
2823 		KASSERT(vm_map_pmap(&l->l_proc->p_vmspace->vm_map)
2824 		    != ci->ci_pmap || ci->ci_tlbstate != TLBSTATE_VALID);
2825 
2826 		/*
2827 		 * userspace has not been touched.
2828 		 * nothing to do here.
2829 		 */
2830 
2831 		ci->ci_want_pmapload = 0;
2832 		return;
2833 	}
2834 
2835 	pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map);
2836 
2837 	if (pmap == pmap_kernel()) {
2838 		return;
2839 	}
2840 
2841 #if defined(XEN) && defined(__x86_64__)
2842 	KASSERT(pmap_pdirpa(pmap, 0) == ci->ci_xen_current_user_pgd);
2843 #elif defined(PAE)
2844 	KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(ci->ci_pae_l3_pdir[0]));
2845 #elif !defined(XEN)
2846 	KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(rcr3()));
2847 #endif
2848 	KASSERT(ci->ci_pmap == pmap);
2849 
2850 	/*
2851 	 * we aren't interested in TLB invalidations for this pmap,
2852 	 * at least for the time being.
2853 	 */
2854 
2855 	KASSERT(ci->ci_tlbstate == TLBSTATE_VALID);
2856 	ci->ci_tlbstate = TLBSTATE_LAZY;
2857 }
2858 
2859 /*
2860  * end of lifecycle functions
2861  */
2862 
2863 /*
2864  * some misc. functions
2865  */
2866 
2867 int
2868 pmap_pdes_invalid(vaddr_t va, pd_entry_t * const *pdes, pd_entry_t *lastpde)
2869 {
2870 	int i;
2871 	unsigned long index;
2872 	pd_entry_t pde;
2873 
2874 	for (i = PTP_LEVELS; i > 1; i--) {
2875 		index = pl_i(va, i);
2876 		pde = pdes[i - 2][index];
2877 		if ((pde & PG_V) == 0)
2878 			return i;
2879 	}
2880 	if (lastpde != NULL)
2881 		*lastpde = pde;
2882 	return 0;
2883 }
2884 
2885 /*
2886  * pmap_extract: extract a PA for the given VA
2887  */
2888 
2889 bool
2890 pmap_extract(struct pmap *pmap, vaddr_t va, paddr_t *pap)
2891 {
2892 	pt_entry_t *ptes, pte;
2893 	pd_entry_t pde;
2894 	pd_entry_t * const *pdes;
2895 	struct pmap *pmap2;
2896 	struct cpu_info *ci;
2897 	paddr_t pa;
2898 	lwp_t *l;
2899 	bool hard, rv;
2900 
2901 #ifdef __HAVE_DIRECT_MAP
2902 	if (va >= PMAP_DIRECT_BASE && va < PMAP_DIRECT_END) {
2903 		if (pap != NULL) {
2904 			*pap = va - PMAP_DIRECT_BASE;
2905 		}
2906 		return true;
2907 	}
2908 #endif
2909 
2910 	rv = false;
2911 	pa = 0;
2912 	l = curlwp;
2913 
2914 	kpreempt_disable();
2915 	ci = l->l_cpu;
2916 	if (__predict_true(!ci->ci_want_pmapload && ci->ci_pmap == pmap) ||
2917 	    pmap == pmap_kernel()) {
2918 		/*
2919 		 * no need to lock, because it's pmap_kernel() or our
2920 		 * own pmap and is active.  if a user pmap, the caller
2921 		 * will hold the vm_map write/read locked and so prevent
2922 		 * entries from disappearing while we are here.  ptps
2923 		 * can disappear via pmap_remove() and pmap_protect(),
2924 		 * but they are called with the vm_map write locked.
2925 		 */
2926 		hard = false;
2927 		ptes = PTE_BASE;
2928 		pdes = normal_pdes;
2929 	} else {
2930 		/* we lose, do it the hard way. */
2931 		hard = true;
2932 		pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
2933 	}
2934 	if (pmap_pdes_valid(va, pdes, &pde)) {
2935 		pte = ptes[pl1_i(va)];
2936 		if (pde & PG_PS) {
2937 			pa = (pde & PG_LGFRAME) | (va & (NBPD_L2 - 1));
2938 			rv = true;
2939 		} else if (__predict_true((pte & PG_V) != 0)) {
2940 			pa = pmap_pte2pa(pte) | (va & (NBPD_L1 - 1));
2941 			rv = true;
2942 		}
2943 	}
2944 	if (__predict_false(hard)) {
2945 		pmap_unmap_ptes(pmap, pmap2);
2946 	}
2947 	kpreempt_enable();
2948 	if (pap != NULL) {
2949 		*pap = pa;
2950 	}
2951 	return rv;
2952 }
2953 
2954 
2955 /*
2956  * vtophys: virtual address to physical address.  For use by
2957  * machine-dependent code only.
2958  */
2959 
2960 paddr_t
2961 vtophys(vaddr_t va)
2962 {
2963 	paddr_t pa;
2964 
2965 	if (pmap_extract(pmap_kernel(), va, &pa) == true)
2966 		return (pa);
2967 	return (0);
2968 }
2969 
2970 __strict_weak_alias(pmap_extract_ma, pmap_extract);
2971 
2972 #ifdef XEN
2973 
2974 /*
2975  * vtomach: virtual address to machine address.  For use by
2976  * machine-dependent code only.
2977  */
2978 
2979 paddr_t
2980 vtomach(vaddr_t va)
2981 {
2982 	paddr_t pa;
2983 
2984 	if (pmap_extract_ma(pmap_kernel(), va, &pa) == true)
2985 		return (pa);
2986 	return (0);
2987 }
2988 
2989 #endif /* XEN */
2990 
2991 /*
2992  * pmap_virtual_space: used during bootup [pmap_steal_memory] to
2993  *	determine the bounds of the kernel virtual addess space.
2994  */
2995 
2996 void
2997 pmap_virtual_space(vaddr_t *startp, vaddr_t *endp)
2998 {
2999 	*startp = virtual_avail;
3000 	*endp = virtual_end;
3001 }
3002 
3003 /*
3004  * pmap_zero_page: zero a page
3005  */
3006 
3007 void
3008 pmap_zero_page(paddr_t pa)
3009 {
3010 #if defined(__HAVE_DIRECT_MAP)
3011 	pagezero(PMAP_DIRECT_MAP(pa));
3012 #else
3013 #if defined(XEN)
	if (XEN_VERSION_SUPPORTED(3, 4)) {
		xen_pagezero(pa);
		return;
	}
3016 #endif
3017 	pt_entry_t *zpte;
3018 	void *zerova;
3019 	int id;
3020 
3021 	const pd_entry_t pteflags = PG_V | PG_RW | pmap_pg_nx | PG_M | PG_U |
3022 	    PG_k;
3023 
3024 	kpreempt_disable();
3025 	id = cpu_number();
3026 	zpte = PTESLEW(zero_pte, id);
3027 	zerova = VASLEW(zerop, id);
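	/* Each CPU has its own scratch PTE/VA pair, so no locking is needed. */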
3028 
3029 #ifdef DIAGNOSTIC
3030 	if (*zpte)
3031 		panic("pmap_zero_page: lock botch");
3032 #endif
3033 
3034 	pmap_pte_set(zpte, pmap_pa2pte(pa) | pteflags);
3035 	pmap_pte_flush();
3036 	pmap_update_pg((vaddr_t)zerova);		/* flush TLB */
3037 
3038 	memset(zerova, 0, PAGE_SIZE);
3039 
3040 #if defined(DIAGNOSTIC) || defined(XEN)
3041 	pmap_pte_set(zpte, 0);				/* zap ! */
3042 	pmap_pte_flush();
3043 #endif
3044 
3045 	kpreempt_enable();
3046 #endif /* defined(__HAVE_DIRECT_MAP) */
3047 }
3048 
3049 /*
 * pmap_pageidlezero: the same, for the idle loop page zero'er.
3051  * Returns true if the page was zero'd, false if we aborted for
3052  * some reason.
3053  */
3054 
3055 bool
3056 pmap_pageidlezero(paddr_t pa)
3057 {
3058 #ifdef __HAVE_DIRECT_MAP
3059 	KASSERT(cpu_feature[0] & CPUID_SSE2);
3060 	return sse2_idlezero_page((void *)PMAP_DIRECT_MAP(pa));
3061 #else
3062 	pt_entry_t *zpte;
3063 	void *zerova;
3064 	bool rv;
3065 	int id;
3066 
3067 	const pd_entry_t pteflags = PG_V | PG_RW | pmap_pg_nx | PG_M | PG_U |
3068 	    PG_k;
3069 
3070 	id = cpu_number();
3071 	zpte = PTESLEW(zero_pte, id);
3072 	zerova = VASLEW(zerop, id);
3073 
3074 	KASSERT(cpu_feature[0] & CPUID_SSE2);
3075 	KASSERT(*zpte == 0);
3076 
3077 	pmap_pte_set(zpte, pmap_pa2pte(pa) | pteflags);
3078 	pmap_pte_flush();
3079 	pmap_update_pg((vaddr_t)zerova);		/* flush TLB */
3080 
3081 	rv = sse2_idlezero_page(zerova);
3082 
3083 #if defined(DIAGNOSTIC) || defined(XEN)
3084 	pmap_pte_set(zpte, 0);				/* zap ! */
3085 	pmap_pte_flush();
3086 #endif
3087 
3088 	return rv;
3089 #endif
3090 }
3091 
3092 /*
3093  * pmap_copy_page: copy a page
3094  */
3095 
3096 void
3097 pmap_copy_page(paddr_t srcpa, paddr_t dstpa)
3098 {
3099 #if defined(__HAVE_DIRECT_MAP)
3100 	vaddr_t srcva = PMAP_DIRECT_MAP(srcpa);
3101 	vaddr_t dstva = PMAP_DIRECT_MAP(dstpa);
3102 
3103 	memcpy((void *)dstva, (void *)srcva, PAGE_SIZE);
3104 #else
3105 #if defined(XEN)
3106 	if (XEN_VERSION_SUPPORTED(3, 4)) {
3107 		xen_copy_page(srcpa, dstpa);
3108 		return;
3109 	}
3110 #endif
3111 	pt_entry_t *spte;
3112 	pt_entry_t *dpte;
3113 	void *csrcva;
3114 	void *cdstva;
3115 	int id;
3116 
3117 	const pd_entry_t pteflags = PG_V | PG_RW | pmap_pg_nx | PG_U | PG_k;
3118 
3119 	kpreempt_disable();
3120 	id = cpu_number();
3121 	spte = PTESLEW(csrc_pte,id);
3122 	dpte = PTESLEW(cdst_pte,id);
3123 	csrcva = VASLEW(csrcp, id);
3124 	cdstva = VASLEW(cdstp, id);
3125 
3126 	KASSERT(*spte == 0 && *dpte == 0);
3127 
3128 	pmap_pte_set(spte, pmap_pa2pte(srcpa) | pteflags);
3129 	pmap_pte_set(dpte, pmap_pa2pte(dstpa) | pteflags | PG_M);
3130 	pmap_pte_flush();
3131 	pmap_update_2pg((vaddr_t)csrcva, (vaddr_t)cdstva);
3132 
3133 	memcpy(cdstva, csrcva, PAGE_SIZE);
3134 
3135 #if defined(DIAGNOSTIC) || defined(XEN)
3136 	pmap_pte_set(spte, 0);
3137 	pmap_pte_set(dpte, 0);
3138 	pmap_pte_flush();
3139 #endif
3140 
3141 	kpreempt_enable();
3142 #endif /* defined(__HAVE_DIRECT_MAP) */
3143 }
3144 
3145 static pt_entry_t *
3146 pmap_map_ptp(struct vm_page *ptp)
3147 {
3148 #ifdef __HAVE_DIRECT_MAP
3149 	return (void *)PMAP_DIRECT_MAP(VM_PAGE_TO_PHYS(ptp));
3150 #else
3151 	pt_entry_t *ptppte;
3152 	void *ptpva;
3153 	int id;
3154 
3155 	KASSERT(kpreempt_disabled());
3156 
3157 #ifndef XEN
3158 	const pd_entry_t pteflags = PG_V | PG_RW | pmap_pg_nx | PG_U | PG_M |
3159 	    PG_k;
3160 #else
3161 	const pd_entry_t pteflags = PG_V | pmap_pg_nx | PG_U | PG_M | PG_k;
3162 #endif
3163 
3164 	id = cpu_number();
3165 	ptppte = PTESLEW(ptp_pte, id);
3166 	ptpva = VASLEW(ptpp, id);
3167 	pmap_pte_set(ptppte, pmap_pa2pte(VM_PAGE_TO_PHYS(ptp)) | pteflags);
3168 
3169 	pmap_pte_flush();
3170 	pmap_update_pg((vaddr_t)ptpva);
3171 
3172 	return (pt_entry_t *)ptpva;
3173 #endif
3174 }
3175 
3176 static void
3177 pmap_unmap_ptp(void)
3178 {
3179 #ifndef __HAVE_DIRECT_MAP
3180 #if defined(DIAGNOSTIC) || defined(XEN)
3181 	pt_entry_t *pte;
3182 
3183 	KASSERT(kpreempt_disabled());
3184 
3185 	pte = PTESLEW(ptp_pte, cpu_number());
3186 	if (*pte != 0) {
3187 		pmap_pte_set(pte, 0);
3188 		pmap_pte_flush();
3189 	}
3190 #endif
3191 #endif
3192 }
3193 
3194 static pt_entry_t *
3195 pmap_map_pte(struct pmap *pmap, struct vm_page *ptp, vaddr_t va)
3196 {
3197 
3198 	KASSERT(kpreempt_disabled());
3199 	if (pmap_is_curpmap(pmap)) {
3200 		return &PTE_BASE[pl1_i(va)]; /* (k)vtopte */
3201 	}
3202 	KASSERT(ptp != NULL);
3203 	return pmap_map_ptp(ptp) + pl1_pi(va);
3204 }
3205 
3206 static void
3207 pmap_unmap_pte(void)
3208 {
3209 
3210 	KASSERT(kpreempt_disabled());
3211 
3212 	pmap_unmap_ptp();
3213 }
3214 
3215 /*
3216  * p m a p   r e m o v e   f u n c t i o n s
3217  *
3218  * functions that remove mappings
3219  */
3220 
3221 /*
3222  * pmap_remove_ptes: remove PTEs from a PTP
3223  *
3224  * => caller must hold pmap's lock
3225  * => PTP must be mapped into KVA
3226  * => PTP should be null if pmap == pmap_kernel()
3227  * => must be called with kernel preemption disabled
 * => TLB shootdowns are issued as needed for the removed mappings
3229  */
3230 
3231 static void
3232 pmap_remove_ptes(struct pmap *pmap, struct vm_page *ptp, vaddr_t ptpva,
3233 		 vaddr_t startva, vaddr_t endva, struct pv_entry **pv_tofree)
3234 {
3235 	pt_entry_t *pte = (pt_entry_t *)ptpva;
3236 
3237 	KASSERT(pmap == pmap_kernel() || mutex_owned(pmap->pm_lock));
3238 	KASSERT(kpreempt_disabled());
3239 
3240 	/*
3241 	 * note that ptpva points to the PTE that maps startva.   this may
3242 	 * or may not be the first PTE in the PTP.
3243 	 *
3244 	 * we loop through the PTP while there are still PTEs to look at
3245 	 * and the wire_count is greater than 1 (because we use the wire_count
3246 	 * to keep track of the number of real PTEs in the PTP).
3247 	 */
3248 	while (startva < endva && (ptp == NULL || ptp->wire_count > 1)) {
3249 		(void)pmap_remove_pte(pmap, ptp, pte, startva, pv_tofree);
3250 		startva += PAGE_SIZE;
3251 		pte++;
3252 	}
3253 }
3254 
3255 
3256 /*
3257  * pmap_remove_pte: remove a single PTE from a PTP.
3258  *
3259  * => caller must hold pmap's lock
3260  * => PTP must be mapped into KVA
3261  * => PTP should be null if pmap == pmap_kernel()
3262  * => returns true if we removed a mapping
3263  * => must be called with kernel preemption disabled
3264  */
3265 static bool
3266 pmap_remove_pte(struct pmap *pmap, struct vm_page *ptp, pt_entry_t *pte,
3267 		vaddr_t va, struct pv_entry **pv_tofree)
3268 {
3269 	struct pv_entry *pve;
3270 	struct vm_page *pg;
3271 	struct pmap_page *pp;
3272 	pt_entry_t opte;
3273 
3274 	KASSERT(pmap == pmap_kernel() || mutex_owned(pmap->pm_lock));
3275 	KASSERT(kpreempt_disabled());
3276 
3277 	if (!pmap_valid_entry(*pte)) {
3278 		/* VA not mapped. */
3279 		return false;
3280 	}
3281 
3282 	/* Atomically save the old PTE and zap it. */
3283 	opte = pmap_pte_testset(pte, 0);
3284 	if (!pmap_valid_entry(opte)) {
3285 		return false;
3286 	}
3287 
3288 	pmap_exec_account(pmap, va, opte, 0);
3289 	pmap_stats_update_bypte(pmap, 0, opte);
3290 
3291 	if (ptp) {
3292 		/*
		 * Dropping a PTE.  If the PTP will become unused
		 * (wire_count drops to 1), force a TLB shootdown by
		 * setting PG_U, to make sure the PDE is flushed too.
3294 		 */
3295 		ptp->wire_count--;
3296 		if (ptp->wire_count <= 1) {
3297 			opte |= PG_U;
3298 		}
3299 	}
3300 
3301 	if ((opte & PG_U) != 0) {
3302 		pmap_tlb_shootdown(pmap, va, opte, TLBSHOOT_REMOVE_PTE);
3303 	}
3304 
3305 	/*
3306 	 * If we are not on a pv_head list - we are done.
3307 	 */
3308 	if ((opte & PG_PVLIST) == 0) {
3309 #if defined(DIAGNOSTIC) && !defined(DOM0OPS)
3310 		if (PHYS_TO_VM_PAGE(pmap_pte2pa(opte)) != NULL ||
3311 		    pmap_pv_tracked(pmap_pte2pa(opte)) != NULL)
3312 			panic("pmap_remove_pte: managed or pv-tracked page"
3313 			    " without PG_PVLIST for %#"PRIxVADDR, va);
3314 #endif
3315 		return true;
3316 	}
3317 
3318 	if ((pg = PHYS_TO_VM_PAGE(pmap_pte2pa(opte))) != NULL) {
3319 		KASSERT(uvm_page_locked_p(pg));
3320 		pp = VM_PAGE_TO_PP(pg);
3321 	} else if ((pp = pmap_pv_tracked(pmap_pte2pa(opte))) == NULL) {
3322 		paddr_t pa = pmap_pte2pa(opte);
3323 		panic("pmap_remove_pte: PG_PVLIST with pv-untracked page"
3324 		    " va = 0x%"PRIxVADDR
3325 		    " pa = 0x%"PRIxPADDR" (0x%"PRIxPADDR")",
3326 		    va, pa, atop(pa));
3327 	}
3328 
3329 	/* Sync R/M bits. */
3330 	pp->pp_attrs |= opte;
3331 	pve = pmap_remove_pv(pp, ptp, va);
3332 
3333 	if (pve) {
3334 		pve->pve_next = *pv_tofree;
3335 		*pv_tofree = pve;
3336 	}
3337 	return true;
3338 }
3339 
3340 /*
3341  * pmap_remove: mapping removal function.
3342  *
3343  * => caller should not be holding any pmap locks
3344  */
3345 
3346 void
3347 pmap_remove(struct pmap *pmap, vaddr_t sva, vaddr_t eva)
3348 {
3349 	pt_entry_t *ptes;
3350 	pd_entry_t pde;
3351 	pd_entry_t * const *pdes;
3352 	struct pv_entry *pv_tofree = NULL;
3353 	bool result;
3354 	int i;
3355 	paddr_t ptppa;
3356 	vaddr_t blkendva, va = sva;
3357 	struct vm_page *ptp;
3358 	struct pmap *pmap2;
3359 
3360 	kpreempt_disable();
3361 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);	/* locks pmap */
3362 
3363 	/*
3364 	 * removing one page?  take shortcut function.
3365 	 */
3366 
3367 	if (va + PAGE_SIZE == eva) {
3368 		if (pmap_pdes_valid(va, pdes, &pde)) {
3369 
3370 			/* PA of the PTP */
3371 			ptppa = pmap_pte2pa(pde);
3372 
3373 			/* Get PTP if non-kernel mapping. */
3374 			if (pmap != pmap_kernel()) {
3375 				ptp = pmap_find_ptp(pmap, va, ptppa, 1);
3376 				KASSERTMSG(ptp != NULL,
3377 				    "pmap_remove: unmanaged PTP detected");
3378 			} else {
3379 				/* Never free kernel PTPs. */
3380 				ptp = NULL;
3381 			}
3382 
3383 			result = pmap_remove_pte(pmap, ptp,
3384 			    &ptes[pl1_i(va)], va, &pv_tofree);
3385 
3386 			/*
3387 			 * if mapping removed and the PTP is no longer
3388 			 * being used, free it!
3389 			 */
3390 
3391 			if (result && ptp && ptp->wire_count <= 1)
3392 				pmap_free_ptp(pmap, ptp, va, ptes, pdes);
3393 		}
3394 	} else for (/* null */ ; va < eva ; va = blkendva) {
3395 		int lvl;
3396 
3397 		/* determine range of block */
3398 		blkendva = x86_round_pdr(va+1);
3399 		if (blkendva > eva)
3400 			blkendva = eva;
3401 
3402 		/*
3403 		 * Our PTE mappings should never be removed with pmap_remove.
3404 		 *
3405 		 * XXXmaxv: still needed?
3406 		 *
3407 		 * A long term solution is to move the PTEs out of user address
3408 		 * space, and into kernel address space. Then we can set
3409 		 * VM_MAXUSER_ADDRESS to be VM_MAX_ADDRESS.
3410 		 */
3411 		for (i = 0; i < PDP_SIZE; i++) {
3412 			if (pl_i(va, PTP_LEVELS) == PDIR_SLOT_PTE+i)
3413 				panic("PTE space accessed");
3414 		}
3415 
3416 		lvl = pmap_pdes_invalid(va, pdes, &pde);
3417 		if (lvl != 0) {
3418 			/*
3419 			 * skip a range corresponding to an invalid pde.
3420 			 */
3421 			blkendva = (va & ptp_masks[lvl - 1]) + nbpd[lvl - 1];
3422  			continue;
3423 		}
3424 
3425 		/* PA of the PTP */
3426 		ptppa = pmap_pte2pa(pde);
3427 
3428 		/* Get PTP if non-kernel mapping. */
3429 		if (pmap != pmap_kernel()) {
3430 			ptp = pmap_find_ptp(pmap, va, ptppa, 1);
3431 			KASSERTMSG(ptp != NULL,
3432 			    "pmap_remove: unmanaged PTP detected");
3433 		} else {
3434 			/* Never free kernel PTPs. */
3435 			ptp = NULL;
3436 		}
3437 
3438 		pmap_remove_ptes(pmap, ptp, (vaddr_t)&ptes[pl1_i(va)], va,
3439 		    blkendva, &pv_tofree);
3440 
3441 		/* if PTP is no longer being used, free it! */
3442 		if (ptp && ptp->wire_count <= 1) {
3443 			pmap_free_ptp(pmap, ptp, va, ptes, pdes);
3444 		}
3445 	}
3446 	pmap_unmap_ptes(pmap, pmap2);		/* unlock pmap */
3447 	kpreempt_enable();
3448 
3449 	/* Now we free unused PVs */
3450 	if (pv_tofree)
3451 		pmap_free_pvs(pv_tofree);
3452 }
3453 
3454 /*
3455  * pmap_sync_pv: clear pte bits and return the old value of the pte.
3456  *
3457  * => Caller should disable kernel preemption.
3458  * => issues tlb shootdowns if necessary.
3459  */
3460 
3461 static int
3462 pmap_sync_pv(struct pv_pte *pvpte, pt_entry_t expect, int clearbits,
3463     pt_entry_t *optep)
3464 {
3465 	struct pmap *pmap;
3466 	struct vm_page *ptp;
3467 	vaddr_t va;
3468 	pt_entry_t *ptep;
3469 	pt_entry_t opte;
3470 	pt_entry_t npte;
3471 	bool need_shootdown;
3472 
3473 	ptp = pvpte->pte_ptp;
3474 	va = pvpte->pte_va;
3475 	KASSERT(ptp == NULL || ptp->uobject != NULL);
3476 	KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset);
3477 	pmap = ptp_to_pmap(ptp);
3478 
3479 	KASSERT((expect & ~(PG_FRAME | PG_V)) == 0);
3480 	KASSERT((expect & PG_V) != 0);
3481 	KASSERT(clearbits == ~0 || (clearbits & ~(PG_M | PG_U | PG_RW)) == 0);
3482 	KASSERT(kpreempt_disabled());
3483 
3484 	ptep = pmap_map_pte(pmap, ptp, va);
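	/*
	 * Update the PTE with a compare-and-swap loop, so that attribute
	 * bits set concurrently by the MMU are not lost.
	 */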
3485 	do {
3486 		opte = *ptep;
3487 		KASSERT((opte & (PG_M | PG_U)) != PG_M);
3488 		KASSERT((opte & (PG_U | PG_V)) != PG_U);
3489 		KASSERT(opte == 0 || (opte & PG_V) != 0);
3490 		if ((opte & (PG_FRAME | PG_V)) != expect) {
3491 
3492 			/*
			 * We lost a race with a V->P operation like
			 * pmap_remove().  Wait for the competitor to
			 * reflect the pte bits into pp_attrs.
3496 			 *
3497 			 * issue a redundant TLB shootdown so that
3498 			 * we can wait for its completion.
3499 			 */
3500 
3501 			pmap_unmap_pte();
3502 			if (clearbits != 0) {
3503 				pmap_tlb_shootdown(pmap, va,
3504 				    (pmap == pmap_kernel() ? PG_G : 0),
3505 				    TLBSHOOT_SYNC_PV1);
3506 			}
3507 			return EAGAIN;
3508 		}
3509 
3510 		/*
3511 		 * check if there's anything to do on this pte.
3512 		 */
3513 
3514 		if ((opte & clearbits) == 0) {
3515 			need_shootdown = false;
3516 			break;
3517 		}
3518 
3519 		/*
3520 		 * we need a shootdown if the pte is cached. (PG_U)
3521 		 *
3522 		 * ...unless we are clearing only the PG_RW bit and
3523 		 * it isn't cached as RW. (PG_M)
3524 		 */
3525 
3526 		need_shootdown = (opte & PG_U) != 0 &&
3527 		    !(clearbits == PG_RW && (opte & PG_M) == 0);
3528 
3529 		npte = opte & ~clearbits;
3530 
3531 		/*
3532 		 * if we need a shootdown anyway, clear PG_U and PG_M.
3533 		 */
3534 
3535 		if (need_shootdown) {
3536 			npte &= ~(PG_U | PG_M);
3537 		}
3538 		KASSERT((npte & (PG_M | PG_U)) != PG_M);
3539 		KASSERT((npte & (PG_U | PG_V)) != PG_U);
3540 		KASSERT(npte == 0 || (opte & PG_V) != 0);
3541 	} while (pmap_pte_cas(ptep, opte, npte) != opte);
3542 
3543 	if (need_shootdown) {
3544 		pmap_tlb_shootdown(pmap, va, opte, TLBSHOOT_SYNC_PV2);
3545 	}
3546 	pmap_unmap_pte();
3547 
3548 	*optep = opte;
3549 	return 0;
3550 }
3551 
3552 static void
3553 pmap_pp_remove(struct pmap_page *pp, paddr_t pa)
3554 {
3555 	struct pv_pte *pvpte;
3556 	struct pv_entry *killlist = NULL;
3557 	struct vm_page *ptp;
3558 	pt_entry_t expect;
3559 	int count;
3560 
3561 	expect = pmap_pa2pte(pa) | PG_V;
3562 	count = SPINLOCK_BACKOFF_MIN;
3563 	kpreempt_disable();
3564 startover:
3565 	while ((pvpte = pv_pte_first(pp)) != NULL) {
3566 		struct pmap *pmap;
3567 		struct pv_entry *pve;
3568 		pt_entry_t opte;
3569 		vaddr_t va;
3570 		int error;
3571 
3572 		/*
3573 		 * add a reference to the pmap before clearing the pte.
3574 		 * otherwise the pmap can disappear behind us.
3575 		 */
3576 
3577 		ptp = pvpte->pte_ptp;
3578 		pmap = ptp_to_pmap(ptp);
3579 		if (ptp != NULL) {
3580 			pmap_reference(pmap);
3581 		}
3582 
3583 		error = pmap_sync_pv(pvpte, expect, ~0, &opte);
3584 		if (error == EAGAIN) {
3585 			int hold_count;
3586 			KERNEL_UNLOCK_ALL(curlwp, &hold_count);
3587 			if (ptp != NULL) {
3588 				pmap_destroy(pmap);
3589 			}
3590 			SPINLOCK_BACKOFF(count);
3591 			KERNEL_LOCK(hold_count, curlwp);
3592 			goto startover;
3593 		}
3594 
3595 		pp->pp_attrs |= opte;
3596 		va = pvpte->pte_va;
3597 		pve = pmap_remove_pv(pp, ptp, va);
3598 
3599 		/* update the PTP reference count.  free if last reference. */
3600 		if (ptp != NULL) {
3601 			struct pmap *pmap2;
3602 			pt_entry_t *ptes;
3603 			pd_entry_t * const *pdes;
3604 
3605 			KASSERT(pmap != pmap_kernel());
3606 
3607 			pmap_tlb_shootnow();
3608 			pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
3609 			pmap_stats_update_bypte(pmap, 0, opte);
3610 			ptp->wire_count--;
3611 			if (ptp->wire_count <= 1) {
3612 				pmap_free_ptp(pmap, ptp, va, ptes, pdes);
3613 			}
3614 			pmap_unmap_ptes(pmap, pmap2);
3615 			pmap_destroy(pmap);
3616 		} else {
3617 			KASSERT(pmap == pmap_kernel());
3618 			pmap_stats_update_bypte(pmap, 0, opte);
3619 		}
3620 
3621 		if (pve != NULL) {
3622 			pve->pve_next = killlist;	/* mark it for death */
3623 			killlist = pve;
3624 		}
3625 	}
3626 	pmap_tlb_shootnow();
3627 	kpreempt_enable();
3628 
3629 	/* Now free unused pvs. */
3630 	pmap_free_pvs(killlist);
3631 }
3632 
3633 /*
3634  * pmap_page_remove: remove a managed vm_page from all pmaps that map it
3635  *
3636  * => R/M bits are sync'd back to attrs
3637  */
3638 
3639 void
3640 pmap_page_remove(struct vm_page *pg)
3641 {
3642 	struct pmap_page *pp;
3643 	paddr_t pa;
3644 
3645 	KASSERT(uvm_page_locked_p(pg));
3646 
3647 	pp = VM_PAGE_TO_PP(pg);
3648 	pa = VM_PAGE_TO_PHYS(pg);
3649 	pmap_pp_remove(pp, pa);
3650 }
3651 
3652 /*
3653  * pmap_pv_remove: remove an unmanaged pv-tracked page from all pmaps
3654  *	that map it
3655  */
3656 
3657 void
3658 pmap_pv_remove(paddr_t pa)
3659 {
3660 	struct pmap_page *pp;
3661 
3662 	pp = pmap_pv_tracked(pa);
3663 	if (pp == NULL)
		panic("pmap_pv_remove: page not pv-tracked: 0x%"PRIxPADDR,
3665 		    pa);
3666 	pmap_pp_remove(pp, pa);
3667 }
3668 
3669 /*
3670  * p m a p   a t t r i b u t e  f u n c t i o n s
3671  * functions that test/change managed page's attributes
3672  * since a page can be mapped multiple times we must check each PTE that
3673  * maps it by going down the pv lists.
3674  */
3675 
3676 /*
3677  * pmap_test_attrs: test a page's attributes
3678  */
3679 
3680 bool
3681 pmap_test_attrs(struct vm_page *pg, unsigned testbits)
3682 {
3683 	struct pmap_page *pp;
3684 	struct pv_pte *pvpte;
3685 	pt_entry_t expect;
3686 	u_int result;
3687 
3688 	KASSERT(uvm_page_locked_p(pg));
3689 
3690 	pp = VM_PAGE_TO_PP(pg);
3691 	if ((pp->pp_attrs & testbits) != 0) {
3692 		return true;
3693 	}
3694 	expect = pmap_pa2pte(VM_PAGE_TO_PHYS(pg)) | PG_V;
3695 	kpreempt_disable();
3696 	for (pvpte = pv_pte_first(pp); pvpte; pvpte = pv_pte_next(pp, pvpte)) {
3697 		pt_entry_t opte;
3698 		int error;
3699 
3700 		if ((pp->pp_attrs & testbits) != 0) {
3701 			break;
3702 		}
3703 		error = pmap_sync_pv(pvpte, expect, 0, &opte);
3704 		if (error == 0) {
3705 			pp->pp_attrs |= opte;
3706 		}
3707 	}
3708 	result = pp->pp_attrs & testbits;
3709 	kpreempt_enable();
3710 
3711 	/*
	 * Note that we break out of the loop early once the bits
	 * being tested for have been found in pp_attrs.
3714 	 */
3715 
3716 	return result != 0;
3717 }
3718 
3719 static bool
3720 pmap_pp_clear_attrs(struct pmap_page *pp, paddr_t pa, unsigned clearbits)
3721 {
3722 	struct pv_pte *pvpte;
3723 	u_int result;
3724 	pt_entry_t expect;
3725 	int count;
3726 
3727 	expect = pmap_pa2pte(pa) | PG_V;
3728 	count = SPINLOCK_BACKOFF_MIN;
3729 	kpreempt_disable();
3730 startover:
3731 	for (pvpte = pv_pte_first(pp); pvpte; pvpte = pv_pte_next(pp, pvpte)) {
3732 		pt_entry_t opte;
3733 		int error;
3734 
3735 		error = pmap_sync_pv(pvpte, expect, clearbits, &opte);
3736 		if (error == EAGAIN) {
3737 			int hold_count;
3738 			KERNEL_UNLOCK_ALL(curlwp, &hold_count);
3739 			SPINLOCK_BACKOFF(count);
3740 			KERNEL_LOCK(hold_count, curlwp);
3741 			goto startover;
3742 		}
3743 		pp->pp_attrs |= opte;
3744 	}
3745 	result = pp->pp_attrs & clearbits;
3746 	pp->pp_attrs &= ~clearbits;
3747 	pmap_tlb_shootnow();
3748 	kpreempt_enable();
3749 
3750 	return result != 0;
3751 }
3752 
3753 /*
3754  * pmap_clear_attrs: clear the specified attribute for a page.
3755  *
3756  * => we return true if we cleared one of the bits we were asked to
3757  */
3758 
3759 bool
3760 pmap_clear_attrs(struct vm_page *pg, unsigned clearbits)
3761 {
3762 	struct pmap_page *pp;
3763 	paddr_t pa;
3764 
3765 	KASSERT(uvm_page_locked_p(pg));
3766 
3767 	pp = VM_PAGE_TO_PP(pg);
3768 	pa = VM_PAGE_TO_PHYS(pg);
3769 
3770 	return pmap_pp_clear_attrs(pp, pa, clearbits);
3771 }
3772 
3773 /*
3774  * pmap_pv_clear_attrs: clear the specified attributes for an unmanaged
3775  *	pv-tracked page.
3776  */
3777 
3778 bool
3779 pmap_pv_clear_attrs(paddr_t pa, unsigned clearbits)
3780 {
3781 	struct pmap_page *pp;
3782 
3783 	pp = pmap_pv_tracked(pa);
3784 	if (pp == NULL)
		panic("pmap_pv_clear_attrs: page not pv-tracked: 0x%"PRIxPADDR,
3786 		    pa);
3787 
3788 	return pmap_pp_clear_attrs(pp, pa, clearbits);
3789 }
3790 
3791 /*
3792  * p m a p   p r o t e c t i o n   f u n c t i o n s
3793  */
3794 
3795 /*
3796  * pmap_page_protect: change the protection of all recorded mappings
3797  *	of a managed page
3798  *
3799  * => NOTE: this is an inline function in pmap.h
3800  */
3801 
3802 /* see pmap.h */
3803 
3804 /*
3805  * pmap_pv_protect: change the protection of all recorded mappings
3806  *	of an unmanaged pv-tracked page
3807  *
3808  * => NOTE: this is an inline function in pmap.h
3809  */
3810 
3811 /* see pmap.h */
3812 
3813 /*
 * pmap_protect: set the protection of the pages in a pmap
3815  *
3816  * => NOTE: this is an inline function in pmap.h
3817  */
3818 
3819 /* see pmap.h */
3820 
3821 /*
3822  * pmap_write_protect: write-protect pages in a pmap.
3823  */
3824 void
3825 pmap_write_protect(struct pmap *pmap, vaddr_t sva, vaddr_t eva, vm_prot_t prot)
3826 {
3827 	pt_entry_t bit_rem, bit_put;
3828 	pt_entry_t *ptes;
3829 	pd_entry_t * const *pdes;
3830 	struct pmap *pmap2;
3831 	vaddr_t blockend, va;
3832 
3833 	KASSERT(curlwp->l_md.md_gc_pmap != pmap);
3834 
3835 	bit_rem = 0;
3836 	if (!(prot & VM_PROT_WRITE))
3837 		bit_rem = PG_RW;
3838 
3839 	bit_put = 0;
3840 	if (!(prot & VM_PROT_EXECUTE))
3841 		bit_put = pmap_pg_nx;
3842 
3843 	sva &= PG_FRAME;
3844 	eva &= PG_FRAME;
3845 
3846 	/* Acquire pmap. */
3847 	kpreempt_disable();
3848 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
3849 
3850 	for (va = sva ; va < eva; va = blockend) {
3851 		pt_entry_t *spte, *epte;
3852 		int i;
3853 
3854 		blockend = x86_round_pdr(va + 1);
3855 		if (blockend > eva)
3856 			blockend = eva;
3857 
3858 		/*
3859 		 * Our PTE mappings should never be write-protected.
3860 		 *
3861 		 * XXXmaxv: still needed?
3862 		 *
3863 		 * A long term solution is to move the PTEs out of user address
3864 		 * space, and into kernel address space. Then we can set
3865 		 * VM_MAXUSER_ADDRESS to be VM_MAX_ADDRESS.
3866 		 */
3867 		for (i = 0; i < PDP_SIZE; i++) {
3868 			if (pl_i(va, PTP_LEVELS) == PDIR_SLOT_PTE+i)
3869 				panic("PTE space accessed");
3870 		}
3871 
3872 		/* Is it a valid block? */
3873 		if (!pmap_pdes_valid(va, pdes, NULL)) {
3874 			continue;
3875 		}
3876 		KASSERT(va < VM_MAXUSER_ADDRESS || va >= VM_MAX_ADDRESS);
3877 
3878 		spte = &ptes[pl1_i(va)];
3879 		epte = &ptes[pl1_i(blockend)];
3880 
3881 		for (/* */; spte < epte; spte++) {
3882 			pt_entry_t opte, npte;
3883 
3884 			do {
3885 				opte = *spte;
3886 				if (!pmap_valid_entry(opte)) {
3887 					goto next;
3888 				}
3889 				npte = (opte & ~bit_rem) | bit_put;
3890 			} while (pmap_pte_cas(spte, opte, npte) != opte);
3891 
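			/*
			 * The CAS loop above retries if another CPU changed
			 * the PTE under us.  A TLB shootdown is queued only
			 * for entries with PG_M set; presumably a clean entry
			 * forces the CPU to re-walk the page tables (to set
			 * the dirty bit) on the first write, at which point
			 * the new protection takes effect anyway.
			 */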
3892 			if ((opte & PG_M) != 0) {
3893 				vaddr_t tva = x86_ptob(spte - ptes);
3894 				pmap_tlb_shootdown(pmap, tva, opte,
3895 				    TLBSHOOT_WRITE_PROTECT);
3896 			}
3897 next:;
3898 		}
3899 	}
3900 
3901 	/* Release pmap. */
3902 	pmap_unmap_ptes(pmap, pmap2);
3903 	kpreempt_enable();
3904 }
3905 
3906 /*
3907  * pmap_unwire: clear the wired bit in the PTE.
3908  *
3909  * => Mapping should already be present.
3910  */
3911 void
3912 pmap_unwire(struct pmap *pmap, vaddr_t va)
3913 {
3914 	pt_entry_t *ptes, *ptep, opte;
3915 	pd_entry_t * const *pdes;
3916 	struct pmap *pmap2;
3917 
3918 	/* Acquire pmap. */
3919 	kpreempt_disable();
3920 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
3921 
3922 	if (!pmap_pdes_valid(va, pdes, NULL)) {
3923 		panic("pmap_unwire: invalid PDE");
3924 	}
3925 
3926 	ptep = &ptes[pl1_i(va)];
3927 	opte = *ptep;
3928 	KASSERT(pmap_valid_entry(opte));
3929 
3930 	if (opte & PG_W) {
3931 		pt_entry_t npte = opte & ~PG_W;
3932 
3933 		opte = pmap_pte_testset(ptep, npte);
3934 		pmap_stats_update_bypte(pmap, npte, opte);
3935 	} else {
3936 		printf("pmap_unwire: wiring for pmap %p va 0x%lx "
3937 		    "did not change!\n", pmap, va);
3938 	}
3939 
3940 	/* Release pmap. */
3941 	pmap_unmap_ptes(pmap, pmap2);
3942 	kpreempt_enable();
3943 }
3944 
3945 /*
3946  * pmap_copy: copy mappings from one pmap to another
3947  *
3948  * => optional function
3949  * void pmap_copy(dst_pmap, src_pmap, dst_addr, len, src_addr)
3950  */
3951 
3952 /*
3953  * defined as macro in pmap.h
3954  */
3955 
3956 __strict_weak_alias(pmap_enter, pmap_enter_default);
3957 
3958 int
3959 pmap_enter_default(pmap_t pmap, vaddr_t va, paddr_t pa, vm_prot_t prot,
3960     u_int flags)
3961 {
3962 	return pmap_enter_ma(pmap, va, pa, pa, prot, flags, 0);
3963 }
3964 
3965 /*
3966  * pmap_enter: enter a mapping into a pmap
3967  *
3968  * => must be done "now" ... no lazy-evaluation
3969  * => we set pmap => pv_head locking
3970  */
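/*
 * ma is the address that actually goes into the PTE, pa the physical
 * address used for pv tracking.  On native x86 they are equal (the
 * pmap_enter_default() wrapper above passes pa for both); under Xen the
 * machine address can differ from the pseudo-physical one, and a domid
 * other than DOMID_SELF marks a foreign mapping updated through
 * xpq_update_foreign() below.
 */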
3971 int
3972 pmap_enter_ma(struct pmap *pmap, vaddr_t va, paddr_t ma, paddr_t pa,
3973 	   vm_prot_t prot, u_int flags, int domid)
3974 {
3975 	pt_entry_t *ptes, opte, npte;
3976 	pt_entry_t *ptep;
3977 	pd_entry_t * const *pdes;
3978 	struct vm_page *ptp;
3979 	struct vm_page *new_pg, *old_pg;
3980 	struct pmap_page *new_pp, *old_pp;
3981 	struct pv_entry *old_pve = NULL;
3982 	struct pv_entry *new_pve;
3983 	struct pv_entry *new_sparepve;
3984 	int error;
3985 	bool wired = (flags & PMAP_WIRED) != 0;
3986 	struct pmap *pmap2;
3987 
3988 	KASSERT(pmap_initialized);
3989 	KASSERT(curlwp->l_md.md_gc_pmap != pmap);
3990 	KASSERT(va < VM_MAX_KERNEL_ADDRESS);
3991 	KASSERTMSG(va != (vaddr_t)PDP_BASE,
3992 	    "pmap_enter: trying to map over PDP!");
3993 	KASSERTMSG(va < VM_MIN_KERNEL_ADDRESS ||
3994 	    pmap_valid_entry(pmap->pm_pdir[pl_i(va, PTP_LEVELS)]),
3995 	    "pmap_enter: missing kernel PTP for VA %lx!", va);
3996 
3997 #ifdef XEN
3998 	KASSERT(domid == DOMID_SELF || pa == 0);
3999 #endif /* XEN */
4000 
4001 	npte = ma | protection_codes[prot] | PG_V;
4002 	npte |= pmap_pat_flags(flags);
4003 	if (wired)
4004 	        npte |= PG_W;
4005 	if (va < VM_MAXUSER_ADDRESS)
4006 		npte |= PG_u;
4007 	else if (va < VM_MAX_ADDRESS)
4008 		panic("PTE space accessed");	/* XXXmaxv: no longer needed? */
4009 	else
4010 		npte |= PG_k;
4011 	if (pmap == pmap_kernel())
4012 		npte |= pmap_pg_g;
4013 	if (flags & VM_PROT_ALL) {
4014 		npte |= PG_U;
4015 		if (flags & VM_PROT_WRITE) {
4016 			KASSERT((npte & PG_RW) != 0);
4017 			npte |= PG_M;
4018 		}
4019 	}
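	/*
	 * PG_U and PG_M above are the hardware "accessed" and "modified"
	 * bits (PG_u, lower case, is the user-level bit).  When the caller
	 * indicates the mapping is being entered in response to an actual
	 * access, they are preset so the page is immediately recorded as
	 * referenced (and, for a write, modified).
	 */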
4020 
4021 #ifdef XEN
4022 	if (domid != DOMID_SELF)
4023 		new_pg = NULL;
4024 	else
4025 #endif
4026 		new_pg = PHYS_TO_VM_PAGE(pa);
4027 	if (new_pg != NULL) {
4028 		/* This is a managed page */
4029 		npte |= PG_PVLIST;
4030 		new_pp = VM_PAGE_TO_PP(new_pg);
4031 	} else if ((new_pp = pmap_pv_tracked(pa)) != NULL) {
4032 		/* This is an unmanaged pv-tracked page */
4033 		npte |= PG_PVLIST;
4034 	} else {
4035 		new_pp = NULL;
4036 	}
4037 
4038 	/* get pves. */
4039 	new_pve = pool_cache_get(&pmap_pv_cache, PR_NOWAIT);
4040 	new_sparepve = pool_cache_get(&pmap_pv_cache, PR_NOWAIT);
4041 	if (new_pve == NULL || new_sparepve == NULL) {
4042 		if (flags & PMAP_CANFAIL) {
4043 			error = ENOMEM;
4044 			goto out2;
4045 		}
4046 		panic("pmap_enter: pve allocation failed");
4047 	}
4048 
4049 	kpreempt_disable();
4050 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);	/* locks pmap */
4051 	if (pmap == pmap_kernel()) {
4052 		ptp = NULL;
4053 	} else {
4054 		ptp = pmap_get_ptp(pmap, va, pdes);
4055 		if (ptp == NULL) {
4056 			pmap_unmap_ptes(pmap, pmap2);
4057 			if (flags & PMAP_CANFAIL) {
4058 				error = ENOMEM;
4059 				goto out;
4060 			}
4061 			panic("pmap_enter: get ptp failed");
4062 		}
4063 	}
4064 
4065 	/*
4066 	 * update the pte.
4067 	 */
4068 
4069 	ptep = &ptes[pl1_i(va)];
4070 	do {
4071 		opte = *ptep;
4072 
4073 		/*
4074 		 * if the same page, inherit PG_U and PG_M.
4075 		 */
4076 		if (((opte ^ npte) & (PG_FRAME | PG_V)) == 0) {
4077 			npte |= opte & (PG_U | PG_M);
4078 		}
4079 #if defined(XEN)
4080 		if (domid != DOMID_SELF) {
4081 			/* pmap_pte_cas with error handling */
4082 			int s = splvm();
4083 			if (opte != *ptep) {
4084 				splx(s);
4085 				continue;
4086 			}
4087 			error = xpq_update_foreign(
4088 			    vtomach((vaddr_t)ptep), npte, domid);
4089 			splx(s);
4090 			if (error) {
4091 				if (ptp != NULL && ptp->wire_count <= 1) {
4092 					pmap_free_ptp(pmap, ptp, va, ptes, pdes);
4093 				}
4094 				pmap_unmap_ptes(pmap, pmap2);
4095 				goto out;
4096 			}
4097 			break;
4098 		}
4099 #endif /* defined(XEN) */
4100 	} while (pmap_pte_cas(ptep, opte, npte) != opte);
4101 
4102 	/*
4103 	 * update statistics and PTP's reference count.
4104 	 */
4105 
4106 	pmap_stats_update_bypte(pmap, npte, opte);
4107 	if (ptp != NULL && !pmap_valid_entry(opte)) {
4108 		ptp->wire_count++;
4109 	}
4110 	KASSERT(ptp == NULL || ptp->wire_count > 1);
4111 
4112 	/*
4113 	 * if the same page, we can skip pv_entry handling.
4114 	 */
4115 
4116 	if (((opte ^ npte) & (PG_FRAME | PG_V)) == 0) {
4117 		KASSERT(((opte ^ npte) & PG_PVLIST) == 0);
4118 		goto same_pa;
4119 	}
4120 
4121 	/*
4122 	 * if old page is pv-tracked, remove pv_entry from its list.
4123 	 */
4124 
4125 	if ((~opte & (PG_V | PG_PVLIST)) == 0) {
4126 		if ((old_pg = PHYS_TO_VM_PAGE(pmap_pte2pa(opte))) != NULL) {
4127 			KASSERT(uvm_page_locked_p(old_pg));
4128 			old_pp = VM_PAGE_TO_PP(old_pg);
4129 		} else if ((old_pp = pmap_pv_tracked(pmap_pte2pa(opte)))
4130 		    == NULL) {
4131 			pa = pmap_pte2pa(opte);
4132 			panic("pmap_enter: PG_PVLIST with pv-untracked page"
4133 			    " va = 0x%"PRIxVADDR
4134 			    " pa = 0x%" PRIxPADDR " (0x%" PRIxPADDR ")",
4135 			    va, pa, atop(pa));
4136 		}
4137 
4138 		old_pve = pmap_remove_pv(old_pp, ptp, va);
4139 		old_pp->pp_attrs |= opte;
4140 	}
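	/*
	 * old_pve, if any, now holds the pv_entry just unlinked from the
	 * old page's list; it is not freed here but handed back to the
	 * pv cache in the exit path below.
	 */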
4141 
4142 	/*
4143 	 * if new page is pv-tracked, insert pv_entry into its list.
4144 	 */
4145 
4146 	if (new_pp) {
4147 		new_pve = pmap_enter_pv(new_pp, new_pve, &new_sparepve, ptp, va);
4148 	}
4149 
4150 same_pa:
4151 	pmap_unmap_ptes(pmap, pmap2);
4152 
4153 	/*
4154 	 * shootdown tlb if necessary.
4155 	 */
4156 
4157 	if ((~opte & (PG_V | PG_U)) == 0 &&
4158 	    ((opte ^ npte) & (PG_FRAME | PG_RW)) != 0) {
4159 		pmap_tlb_shootdown(pmap, va, opte, TLBSHOOT_ENTER);
4160 	}
4161 
4162 	error = 0;
4163 out:
4164 	kpreempt_enable();
4165 out2:
4166 	if (old_pve != NULL) {
4167 		pool_cache_put(&pmap_pv_cache, old_pve);
4168 	}
4169 	if (new_pve != NULL) {
4170 		pool_cache_put(&pmap_pv_cache, new_pve);
4171 	}
4172 	if (new_sparepve != NULL) {
4173 		pool_cache_put(&pmap_pv_cache, new_sparepve);
4174 	}
4175 
4176 	return error;
4177 }
4178 
4179 static paddr_t
4180 pmap_get_physpage(void)
4181 {
4182 	struct vm_page *ptp;
4183 	struct pmap *kpm = pmap_kernel();
4184 	paddr_t pa;
4185 
4186 	if (!uvm.page_init_done) {
4187 		/*
4188 		 * We're growing the kernel pmap early (from
4189 		 * uvm_pageboot_alloc()). This case must be
4190 		 * handled a little differently.
4191 		 */
4192 
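		/*
		 * Zeroing strategy below: use the direct map if we have one;
		 * otherwise, on a new enough Xen, ask the hypervisor;
		 * otherwise temporarily map the page at early_zerop through
		 * the reserved early_zero_pte slot and memset() it.
		 */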
4193 		if (!uvm_page_physget(&pa))
4194 			panic("pmap_get_physpage: out of memory");
4195 #if defined(__HAVE_DIRECT_MAP)
4196 		pagezero(PMAP_DIRECT_MAP(pa));
4197 #else
4198 #if defined(XEN)
4199 		if (XEN_VERSION_SUPPORTED(3, 4)) {
4200 			xen_pagezero(pa);
4201 			return pa;
4202 		}
4203 #endif
4204 		kpreempt_disable();
4205 		pmap_pte_set(early_zero_pte, pmap_pa2pte(pa) | PG_V |
4206 		    PG_RW | pmap_pg_nx | PG_k);
4207 		pmap_pte_flush();
4208 		pmap_update_pg((vaddr_t)early_zerop);
4209 		memset(early_zerop, 0, PAGE_SIZE);
4210 #if defined(DIAGNOSTIC) || defined(XEN)
4211 		pmap_pte_set(early_zero_pte, 0);
4212 		pmap_pte_flush();
4213 #endif /* defined(DIAGNOSTIC) || defined(XEN) */
4214 		kpreempt_enable();
4215 #endif /* defined(__HAVE_DIRECT_MAP) */
4216 	} else {
4217 		/* XXX */
4218 		ptp = uvm_pagealloc(NULL, 0, NULL,
4219 				    UVM_PGA_USERESERVE|UVM_PGA_ZERO);
4220 		if (ptp == NULL)
4221 			panic("pmap_get_physpage: out of memory");
4222 		ptp->flags &= ~PG_BUSY;
4223 		ptp->wire_count = 1;
4224 		pa = VM_PAGE_TO_PHYS(ptp);
4225 	}
4226 	pmap_stats_update(kpm, 1, 0);
4227 
4228 	return pa;
4229 }
4230 
4231 /*
4232  * Expand the page tree with the specified amount of PTPs, mapping virtual
4233  * addresses starting at kva. We populate all the levels but the last one
4234  * (L1). The nodes of the tree are created as RWX, but the pages covered
4235  * will be kentered in L1, with proper permissions.
4236  *
4237  * Used only by pmap_growkernel.
4238  */
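/*
 * Informally: needed_ptps[] is indexed by level - 1, and gives the number
 * of new entries to install at that level, starting from the slot that
 * covers kva (as computed by pl_i_roundup()).
 */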
4239 static void
4240 pmap_alloc_level(vaddr_t kva, long *needed_ptps)
4241 {
4242 	unsigned long i;
4243 	paddr_t pa;
4244 	unsigned long index, endindex;
4245 	int level;
4246 	pd_entry_t *pdep;
4247 #ifdef XEN
4248 	int s = splvm(); /* protect xpq_* */
4249 #endif
4250 
4251 	for (level = PTP_LEVELS; level > 1; level--) {
4252 		if (level == PTP_LEVELS)
4253 			pdep = pmap_kernel()->pm_pdir;
4254 		else
4255 			pdep = normal_pdes[level - 2];
4256 		index = pl_i_roundup(kva, level);
4257 		endindex = index + needed_ptps[level - 1] - 1;
4258 
4259 		for (i = index; i <= endindex; i++) {
4260 			pt_entry_t pte;
4261 
4262 			KASSERT(!pmap_valid_entry(pdep[i]));
4263 			pa = pmap_get_physpage();
4264 			pte = pmap_pa2pte(pa) | PG_k | PG_V | PG_RW;
4265 			pmap_pte_set(&pdep[i], pte);
4266 
4267 #if defined(XEN) && (defined(PAE) || defined(__x86_64__))
4268 			if (level == PTP_LEVELS && i >= PDIR_SLOT_KERN) {
4269 				if (__predict_true(
4270 				    cpu_info_primary.ci_flags & CPUF_PRESENT)) {
4271 					/* update per-cpu PMDs on all cpus */
4272 					xen_kpm_sync(pmap_kernel(), i);
4273 				} else {
4274 					/*
4275 					 * too early; update primary CPU
4276 					 * PMD only (without locks)
4277 					 */
4278 #ifdef PAE
4279 					pd_entry_t *cpu_pdep =
4280 					    &cpu_info_primary.ci_kpm_pdir[l2tol2(i)];
4281 #endif
4282 #ifdef __x86_64__
4283 					pd_entry_t *cpu_pdep =
4284 						&cpu_info_primary.ci_kpm_pdir[i];
4285 #endif
4286 					pmap_pte_set(cpu_pdep, pte);
4287 				}
4288 			}
4289 #endif /* XEN && (PAE || __x86_64__) */
4290 
4291 			KASSERT(level != PTP_LEVELS || nkptp[level - 1] +
4292 			    pl_i(VM_MIN_KERNEL_ADDRESS, level) == i);
4293 			nkptp[level - 1]++;
4294 		}
4295 		pmap_pte_flush();
4296 	}
4297 #ifdef XEN
4298 	splx(s);
4299 #endif
4300 }
4301 
4302 /*
4303  * pmap_growkernel: increase usage of KVM space.
4304  *
4305  * => we allocate new PTPs for the kernel and install them in all
4306  *    the pmaps on the system.
4307  */
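/*
 * Informally: for each level, needed_kptp[] is the difference between the
 * number of PTPs required to cover the new maxkvaddr (rounded up to a PDE
 * boundary) and the number already present in nkptp[]; only when the top
 * level grows do the other pmaps need their kernel slots updated.
 */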
4308 
4309 vaddr_t
4310 pmap_growkernel(vaddr_t maxkvaddr)
4311 {
4312 	struct pmap *kpm = pmap_kernel();
4313 #if !defined(XEN) || !defined(__x86_64__)
4314 	struct pmap *pm;
4315 	long old;
4316 #endif
4317 	int s, i;
4318 	long needed_kptp[PTP_LEVELS], target_nptp;
4319 	bool invalidate = false;
4320 
4321 	s = splvm();	/* to be safe */
4322 	mutex_enter(kpm->pm_lock);
4323 
4324 	if (maxkvaddr <= pmap_maxkvaddr) {
4325 		mutex_exit(kpm->pm_lock);
4326 		splx(s);
4327 		return pmap_maxkvaddr;
4328 	}
4329 
4330 	maxkvaddr = x86_round_pdr(maxkvaddr);
4331 #if !defined(XEN) || !defined(__x86_64__)
4332 	old = nkptp[PTP_LEVELS - 1];
4333 #endif
4334 
4335 	/* Initialize needed_kptp. */
4336 	for (i = PTP_LEVELS - 1; i >= 1; i--) {
4337 		target_nptp = pl_i_roundup(maxkvaddr, i + 1) -
4338 		    pl_i_roundup(VM_MIN_KERNEL_ADDRESS, i + 1);
4339 
4340 		if (target_nptp > nkptpmax[i])
4341 			panic("out of KVA space");
4342 		KASSERT(target_nptp >= nkptp[i]);
4343 		needed_kptp[i] = target_nptp - nkptp[i];
4344 	}
4345 
4346 	pmap_alloc_level(pmap_maxkvaddr, needed_kptp);
4347 
4348 	/*
4349 	 * If the number of top level entries changed, update all pmaps.
4350 	 */
4351 	if (needed_kptp[PTP_LEVELS - 1] != 0) {
4352 #ifdef XEN
4353 #ifdef __x86_64__
4354 		/* nothing, kernel entries are never entered in user pmap */
4355 #else /* __x86_64__ */
4356 		mutex_enter(&pmaps_lock);
4357 		LIST_FOREACH(pm, &pmaps, pm_list) {
4358 			int pdkidx;
4359 			for (pdkidx = PDIR_SLOT_KERN + old;
4360 			    pdkidx < PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1];
4361 			    pdkidx++) {
4362 				pmap_pte_set(&pm->pm_pdir[pdkidx],
4363 				    kpm->pm_pdir[pdkidx]);
4364 			}
4365 			pmap_pte_flush();
4366 		}
4367 		mutex_exit(&pmaps_lock);
4368 #endif /* __x86_64__ */
4369 #else /* XEN */
4370 		unsigned newpdes;
4371 		newpdes = nkptp[PTP_LEVELS - 1] - old;
4372 		mutex_enter(&pmaps_lock);
4373 		LIST_FOREACH(pm, &pmaps, pm_list) {
4374 			memcpy(&pm->pm_pdir[PDIR_SLOT_KERN + old],
4375 			    &kpm->pm_pdir[PDIR_SLOT_KERN + old],
4376 			    newpdes * sizeof (pd_entry_t));
4377 		}
4378 		mutex_exit(&pmaps_lock);
4379 #endif
4380 		invalidate = true;
4381 	}
4382 	pmap_maxkvaddr = maxkvaddr;
4383 	mutex_exit(kpm->pm_lock);
4384 	splx(s);
4385 
4386 	if (invalidate && pmap_initialized) {
4387 		/* Invalidate the PDP cache. */
4388 		pool_cache_invalidate(&pmap_pdp_cache);
4389 	}
4390 
4391 	return maxkvaddr;
4392 }
4393 
4394 #ifdef DEBUG
4395 void pmap_dump(struct pmap *, vaddr_t, vaddr_t);
4396 
4397 /*
4398  * pmap_dump: dump all the mappings from a pmap
4399  *
4400  * => caller should not be holding any pmap locks
4401  */
4402 
4403 void
4404 pmap_dump(struct pmap *pmap, vaddr_t sva, vaddr_t eva)
4405 {
4406 	pt_entry_t *ptes, *pte;
4407 	pd_entry_t * const *pdes;
4408 	struct pmap *pmap2;
4409 	vaddr_t blkendva;
4410 
4411 	/*
4412 	 * if end is out of range, truncate it.
4413 	 * if end <= start, dump everything up to VM_MAXUSER_ADDRESS.
4414 	 */
4415 
4416 	if (eva > VM_MAXUSER_ADDRESS || eva <= sva)
4417 		eva = VM_MAXUSER_ADDRESS;
4418 
4419 	/*
4420 	 * we lock in the pmap => pv_head direction
4421 	 */
4422 
4423 	kpreempt_disable();
4424 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);	/* locks pmap */
4425 
4426 	/*
4427 	 * dumping a range of pages: we dump in PTP-sized blocks (2MB or 4MB)
4428 	 */
4429 
4430 	for (/* null */ ; sva < eva ; sva = blkendva) {
4431 
4432 		/* determine range of block */
4433 		blkendva = x86_round_pdr(sva+1);
4434 		if (blkendva > eva)
4435 			blkendva = eva;
4436 
4437 		/* valid block? */
4438 		if (!pmap_pdes_valid(sva, pdes, NULL))
4439 			continue;
4440 
4441 		pte = &ptes[pl1_i(sva)];
4442 		for (/* null */; sva < blkendva ; sva += PAGE_SIZE, pte++) {
4443 			if (!pmap_valid_entry(*pte))
4444 				continue;
4445 			printf("va %#" PRIxVADDR " -> pa %#" PRIxPADDR
4446 			    " (pte=%#" PRIxPADDR ")\n",
4447 			    sva, (paddr_t)pmap_pte2pa(*pte), (paddr_t)*pte);
4448 		}
4449 	}
4450 	pmap_unmap_ptes(pmap, pmap2);
4451 	kpreempt_enable();
4452 }
4453 #endif
4454 
4455 /*
4456  * pmap_update: process deferred invalidations and frees.
4457  */
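/*
 * Callers are expected to invoke this once after a batch of pmap
 * operations.  Besides flushing any deferred TLB shootdowns, this is also
 * where page table pages queued on l_md.md_gc_ptp are finally freed
 * (unless we happen to be running in interrupt context).
 */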
4458 
4459 void
4460 pmap_update(struct pmap *pmap)
4461 {
4462 	struct vm_page *empty_ptps;
4463 	lwp_t *l = curlwp;
4464 
4465 	/*
4466 	 * If we have torn down this pmap, invalidate non-global TLB
4467 	 * entries on any processors using it.
4468 	 */
4469 	kpreempt_disable();
4470 	if (__predict_false(l->l_md.md_gc_pmap == pmap)) {
4471 		l->l_md.md_gc_pmap = NULL;
4472 		pmap_tlb_shootdown(pmap, (vaddr_t)-1LL, 0, TLBSHOOT_UPDATE);
4473 	}
4474 	/*
4475 	 * Initiate any pending TLB shootdowns.  Wait for them to
4476 	 * complete before returning control to the caller.
4477 	 */
4478 	pmap_tlb_shootnow();
4479 	kpreempt_enable();
4480 
4481 	/*
4482 	 * Now that shootdowns are complete, process deferred frees,
4483 	 * but not from interrupt context.
4484 	 */
4485 	if (l->l_md.md_gc_ptp != NULL) {
4486 		KASSERT((l->l_pflag & LP_INTR) == 0);
4487 		if (cpu_intr_p()) {
4488 			return;
4489 		}
4490 		empty_ptps = l->l_md.md_gc_ptp;
4491 		l->l_md.md_gc_ptp = NULL;
4492 		pmap_free_ptps(empty_ptps);
4493 	}
4494 }
4495 
4496 #if PTP_LEVELS > 4
4497 #error "Unsupported number of page table mappings"
4498 #endif
4499 
4500 paddr_t
4501 pmap_init_tmp_pgtbl(paddr_t pg)
4502 {
4503 	static bool maps_loaded;
4504 	static const paddr_t x86_tmp_pml_paddr[] = {
4505 	    4 * PAGE_SIZE,	/* L1 */
4506 	    5 * PAGE_SIZE,	/* L2 */
4507 	    6 * PAGE_SIZE,	/* L3 */
4508 	    7 * PAGE_SIZE	/* L4 */
4509 	};
4510 	static vaddr_t x86_tmp_pml_vaddr[] = { 0, 0, 0, 0 };
4511 
4512 	pd_entry_t *tmp_pml, *kernel_pml;
4513 
4514 	int level;
4515 
4516 	if (!maps_loaded) {
4517 		for (level = 0; level < PTP_LEVELS; ++level) {
4518 			x86_tmp_pml_vaddr[level] =
4519 			    uvm_km_alloc(kernel_map, PAGE_SIZE, 0,
4520 			    UVM_KMF_VAONLY);
4521 
4522 			if (x86_tmp_pml_vaddr[level] == 0)
4523 				panic("mapping of real mode PML failed\n");
4524 			pmap_kenter_pa(x86_tmp_pml_vaddr[level],
4525 			    x86_tmp_pml_paddr[level],
4526 			    VM_PROT_READ | VM_PROT_WRITE, 0);
4527 			pmap_update(pmap_kernel());
4528 		}
4529 		maps_loaded = true;
4530 	}
4531 
4532 	/* Zero levels 1-3 */
4533 	for (level = 0; level < PTP_LEVELS - 1; ++level) {
4534 		tmp_pml = (void *)x86_tmp_pml_vaddr[level];
4535 		memset(tmp_pml, 0, PAGE_SIZE);
4536 	}
4537 
4538 	/* Copy PML4 */
4539 	kernel_pml = pmap_kernel()->pm_pdir;
4540 	tmp_pml = (void *)x86_tmp_pml_vaddr[PTP_LEVELS - 1];
4541 	memcpy(tmp_pml, kernel_pml, PAGE_SIZE);
4542 
4543 #ifdef PAE
4544 	/*
4545 	 * Use the last 4 entries of the L2 page as L3 PD entries. These
4546 	 * last entries are unlikely to be used for temporary mappings.
4547 	 * 508: maps 0->1GB (userland)
4548 	 * 509: unused
4549 	 * 510: unused
4550 	 * 511: maps 3->4GB (kernel)
4551 	 */
4552 	tmp_pml[508] = x86_tmp_pml_paddr[PTP_LEVELS - 1] | PG_V;
4553 	tmp_pml[509] = 0;
4554 	tmp_pml[510] = 0;
4555 	tmp_pml[511] = pmap_pdirpa(pmap_kernel(), PDIR_SLOT_KERN) | PG_V;
4556 #endif
4557 
4558 	for (level = PTP_LEVELS - 1; level > 0; --level) {
4559 		tmp_pml = (void *)x86_tmp_pml_vaddr[level];
4560 
4561 		tmp_pml[pl_i(pg, level + 1)] =
4562 		    (x86_tmp_pml_paddr[level - 1] & PG_FRAME) | PG_RW | PG_V;
4563 	}
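	/*
	 * The loop above chained each temporary level to the one below it,
	 * in the slot that covers pg; the L1 entry set just below finally
	 * maps pg itself, read/write.
	 */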
4564 
4565 	tmp_pml = (void *)x86_tmp_pml_vaddr[0];
4566 	tmp_pml[pl_i(pg, 1)] = (pg & PG_FRAME) | PG_RW | PG_V;
4567 
4568 #ifdef PAE
4569 	/* Return the PA of the L3 page (entry 508 of the L2 page) */
4570 	return x86_tmp_pml_paddr[PTP_LEVELS - 1] + 508 * sizeof(pd_entry_t);
4571 #endif
4572 
4573 	return x86_tmp_pml_paddr[PTP_LEVELS - 1];
4574 }
4575 
4576 u_int
4577 x86_mmap_flags(paddr_t mdpgno)
4578 {
4579 	u_int nflag = (mdpgno >> X86_MMAP_FLAG_SHIFT) & X86_MMAP_FLAG_MASK;
4580 	u_int pflag = 0;
4581 
4582 	if (nflag & X86_MMAP_FLAG_PREFETCH)
4583 		pflag |= PMAP_WRITE_COMBINE;
4584 
4585 	return pflag;
4586 }
4587