xref: /netbsd-src/sys/arch/x86/x86/pmap.c (revision b1c86f5f087524e68db12794ee9c3e3da1ab17a0)
1 /*	$NetBSD: pmap.c,v 1.113 2010/07/24 00:45:56 jym Exp $	*/
2 
3 /*
4  * Copyright (c) 2007 Manuel Bouyer.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  * 1. Redistributions of source code must retain the above copyright
10  *    notice, this list of conditions and the following disclaimer.
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in the
13  *    documentation and/or other materials provided with the distribution.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
16  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
17  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
18  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
19  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
20  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
21  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
22  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
24  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25  *
26  */
27 
28 /*
29  * Copyright (c) 2006 Mathieu Ropert <mro@adviseo.fr>
30  *
31  * Permission to use, copy, modify, and distribute this software for any
32  * purpose with or without fee is hereby granted, provided that the above
33  * copyright notice and this permission notice appear in all copies.
34  *
35  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
36  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
37  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
38  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
39  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
40  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
41  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
42  */
43 
44 /*
45  *
46  * Copyright (c) 1997 Charles D. Cranor and Washington University.
47  * All rights reserved.
48  *
49  * Redistribution and use in source and binary forms, with or without
50  * modification, are permitted provided that the following conditions
51  * are met:
52  * 1. Redistributions of source code must retain the above copyright
53  *    notice, this list of conditions and the following disclaimer.
54  * 2. Redistributions in binary form must reproduce the above copyright
55  *    notice, this list of conditions and the following disclaimer in the
56  *    documentation and/or other materials provided with the distribution.
57  * 3. All advertising materials mentioning features or use of this software
58  *    must display the following acknowledgement:
59  *      This product includes software developed by Charles D. Cranor and
60  *      Washington University.
61  * 4. The name of the author may not be used to endorse or promote products
62  *    derived from this software without specific prior written permission.
63  *
64  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
65  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
66  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
67  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
68  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
69  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
70  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
71  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
72  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
73  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
74  */
75 
76 /*
77  * Copyright 2001 (c) Wasabi Systems, Inc.
78  * All rights reserved.
79  *
80  * Written by Frank van der Linden for Wasabi Systems, Inc.
81  *
82  * Redistribution and use in source and binary forms, with or without
83  * modification, are permitted provided that the following conditions
84  * are met:
85  * 1. Redistributions of source code must retain the above copyright
86  *    notice, this list of conditions and the following disclaimer.
87  * 2. Redistributions in binary form must reproduce the above copyright
88  *    notice, this list of conditions and the following disclaimer in the
89  *    documentation and/or other materials provided with the distribution.
90  * 3. All advertising materials mentioning features or use of this software
91  *    must display the following acknowledgement:
92  *      This product includes software developed for the NetBSD Project by
93  *      Wasabi Systems, Inc.
94  * 4. The name of Wasabi Systems, Inc. may not be used to endorse
95  *    or promote products derived from this software without specific prior
96  *    written permission.
97  *
98  * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
99  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
100  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
101  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL WASABI SYSTEMS, INC
102  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
103  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
104  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
105  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
106  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
107  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
108  * POSSIBILITY OF SUCH DAMAGE.
109  */
110 
111 /*
112  * This is the i386 pmap modified and generalized to support x86-64
113  * as well. The idea is to hide the upper N levels of the page tables
114  * inside pmap_get_ptp, pmap_free_ptp and pmap_growkernel. The rest
115  * is mostly untouched, except that it uses some more generalized
116  * macros and interfaces.
117  *
118  * This pmap has been tested on the i386 as well, and it can be easily
119  * adapted to PAE.
120  *
121  * fvdl@wasabisystems.com 18-Jun-2001
122  */
123 
124 /*
125  * pmap.c: i386 pmap module rewrite
126  * Chuck Cranor <chuck@ccrc.wustl.edu>
127  * 11-Aug-97
128  *
129  * history of this pmap module: in addition to my own input, i used
130  *    the following references for this rewrite of the i386 pmap:
131  *
132  * [1] the NetBSD i386 pmap.   this pmap appears to be based on the
133  *     BSD hp300 pmap done by Mike Hibler at University of Utah.
134  *     it was then ported to the i386 by William Jolitz of UUNET
135  *     Technologies, Inc.   Then Charles M. Hannum of the NetBSD
136  *     project fixed some bugs and provided some speed ups.
137  *
138  * [2] the FreeBSD i386 pmap.   this pmap seems to be the
139  *     Hibler/Jolitz pmap, as modified for FreeBSD by John S. Dyson
140  *     and David Greenman.
141  *
142  * [3] the Mach pmap.   this pmap, from CMU, seems to have migrated
143  *     between several processors.   the VAX version was done by
144  *     Avadis Tevanian, Jr., and Michael Wayne Young.    the i386
145  *     version was done by Lance Berc, Mike Kupfer, Bob Baron,
146  *     David Golub, and Richard Draves.    the alpha version was
147  *     done by Alessandro Forin (CMU/Mach) and Chris Demetriou
148  *     (NetBSD/alpha).
149  */
150 
151 #include <sys/cdefs.h>
152 __KERNEL_RCSID(0, "$NetBSD: pmap.c,v 1.113 2010/07/24 00:45:56 jym Exp $");
153 
154 #include "opt_user_ldt.h"
155 #include "opt_lockdebug.h"
156 #include "opt_multiprocessor.h"
157 #include "opt_xen.h"
158 #if !defined(__x86_64__)
159 #include "opt_kstack_dr0.h"
160 #endif /* !defined(__x86_64__) */
161 
162 #include <sys/param.h>
163 #include <sys/systm.h>
164 #include <sys/proc.h>
165 #include <sys/pool.h>
166 #include <sys/kernel.h>
167 #include <sys/atomic.h>
168 #include <sys/cpu.h>
169 #include <sys/intr.h>
170 #include <sys/xcall.h>
171 
172 #include <uvm/uvm.h>
173 
174 #include <dev/isa/isareg.h>
175 
176 #include <machine/specialreg.h>
177 #include <machine/gdt.h>
178 #include <machine/isa_machdep.h>
179 #include <machine/cpuvar.h>
180 
181 #include <x86/pmap.h>
182 #include <x86/pmap_pv.h>
183 
184 #include <x86/i82489reg.h>
185 #include <x86/i82489var.h>
186 
187 #ifdef XEN
188 #include <xen/xen3-public/xen.h>
189 #include <xen/hypervisor.h>
190 #endif
191 
192 /* flag to be used for kernel mappings: PG_u on Xen/amd64, 0 otherwise */
193 #if defined(XEN) && defined(__x86_64__)
194 #define PG_k PG_u
195 #else
196 #define PG_k 0
197 #endif
198 
199 /*
200  * general info:
201  *
202  *  - for an explanation of how the i386 MMU hardware works see
203  *    the comments in <machine/pte.h>.
204  *
205  *  - for an explanation of the general memory structure used by
206  *    this pmap (including the recursive mapping), see the comments
207  *    in <machine/pmap.h>.
208  *
209  * this file contains the code for the "pmap module."   the module's
210  * job is to manage the hardware's virtual to physical address mappings.
211  * note that there are two levels of mapping in the VM system:
212  *
213  *  [1] the upper layer of the VM system uses vm_map's and vm_map_entry's
214  *      to map ranges of virtual address space to objects/files.  for
215  *      example, the vm_map may say: "map VA 0x1000 to 0x22000 read-only
216  *      to the file /bin/ls starting at offset zero."   note that
217  *      the upper layer mapping is not concerned with how individual
218  *      vm_pages are mapped.
219  *
220  *  [2] the lower layer of the VM system (the pmap) maintains the mappings
221  *      from virtual addresses to physical pages.   it is concerned with which vm_page is
222  *      mapped where.   for example, when you run /bin/ls and start
223  *      at page 0x1000 the fault routine may lookup the correct page
224  *      of the /bin/ls file and then ask the pmap layer to establish
225  *      a mapping for it.
226  *
227  * note that information in the lower layer of the VM system can be
228  * thrown away since it can easily be reconstructed from the info
229  * in the upper layer.
230  *
231  * data structures we use include:
232  *
233  *  - struct pmap: describes the address space of one thread
234  *  - struct pv_entry: describes one <PMAP,VA> mapping of a PA
235  *  - struct pv_head: there is one pv_head per managed page of
236  *	physical memory.   the pv_head points to a list of pv_entry
237  *	structures which describe all the <PMAP,VA> pairs that this
238  *      page is mapped in.    this is critical for page based operations
239  *      such as pmap_page_protect() [change protection on _all_ mappings
240  *      of a page]
241  */
242 
243 /*
244  * memory allocation
245  *
246  *  - there are three data structures that we must dynamically allocate:
247  *
248  * [A] new process' page directory page (PDP)
249  *	- plan 1: done at pmap_create() time: we use
250  *	  uvm_km_alloc(kernel_map, PAGE_SIZE)  [fka kmem_alloc] to do this
251  *	  allocation.
252  *
253  * if we are low in free physical memory then we sleep in
254  * uvm_km_alloc -- in this case this is ok since we are creating
255  * a new pmap and should not be holding any locks.
256  *
257  * if the kernel is totally out of virtual space
258  * (i.e. uvm_km_alloc returns NULL), then we panic.
259  *
260  * [B] new page tables pages (PTP)
261  * 	- call uvm_pagealloc()
262  * 		=> success: zero page, add to pm_pdir
263  * 		=> failure: we are out of free vm_pages, let pmap_enter()
264  *		   tell UVM about it.
265  *
266  * note: for kernel PTPs, we start with NKPTP of them.   as we map
267  * kernel memory (at uvm_map time) we check to see if we've grown
268  * the kernel pmap.   if so, we call the optional function
269  * pmap_growkernel() to grow the kernel PTPs in advance.
270  *
271  * [C] pv_entry structures
272  */
273 
274 /*
275  * locking
276  *
277  * we have the following locks that we must contend with:
278  *
279  * mutexes:
280  *
281  * - pmap lock (per pmap, part of uvm_object)
282  *   this lock protects the fields in the pmap structure including
283  *   the non-kernel PDEs in the PDP, and the PTEs.  it also locks
284  *   in the alternate PTE space (since that is determined by the
285  *   entry in the PDP).
286  *
287  * - pvh_lock (per pv_head)
288  *   this lock protects the pv_entry list which is chained off the
289  *   pv_head structure for a specific managed PA.   it is locked
290  *   when traversing the list (e.g. adding/removing mappings,
291  *   syncing R/M bits, etc.)
292  *
293  * - pmaps_lock
294  *   this lock protects the list of active pmaps (headed by "pmaps").
295  *   we lock it when adding or removing pmaps from this list.
296  *
297  * tlb shootdown
298  *
299  * tlb shootdowns are hard interrupts that operate outside the spl
300  * framework: they don't need to be blocked provided that the pmap module
301  * gets the order of events correct.  the calls are made by talking directly
302  * to the lapic.  the stubs to handle the interrupts are quite short and do
303  * one of the following: invalidate a single page, a range of pages, all
304  * user tlb entries or the entire tlb.
305  *
306  * the cpus synchronize with each other using pmap_mbox structures which are
307  * aligned on 64-byte cache lines.  tlb shootdowns against the kernel pmap
308  * use a global mailbox and are generated using a broadcast ipi (broadcast
309  * to all but the sending cpu).  shootdowns against regular pmaps use
310  * per-cpu mailboxes and are multicast.  kernel and user shootdowns can
311  * execute simultaneously, as can shootdowns within different multithreaded
312  * processes.  TODO:
313  *
314  *   1. figure out which waitpoints can be deferred to pmap_update().
315  *   2. see if there is a cheap way to batch some updates.
316  */
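
/*
 * Illustrative sketch (not part of the build): when two pmaps must be
 * held at once, pmap_map_ptes() below orders the lock acquisitions by
 * the pmaps' kernel addresses, so that all code paths agree on the
 * order and cannot deadlock:
 *
 *	if ((uintptr_t)pmap < (uintptr_t)ourpmap) {
 *		mutex_enter(&pmap->pm_lock);
 *		mutex_enter(&ourpmap->pm_lock);
 *	} else {
 *		mutex_enter(&ourpmap->pm_lock);
 *		mutex_enter(&pmap->pm_lock);
 *	}
 */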
317 
318 const vaddr_t ptp_masks[] = PTP_MASK_INITIALIZER;
319 const int ptp_shifts[] = PTP_SHIFT_INITIALIZER;
320 const long nkptpmax[] = NKPTPMAX_INITIALIZER;
321 const long nbpd[] = NBPD_INITIALIZER;
322 pd_entry_t * const normal_pdes[] = PDES_INITIALIZER;
323 pd_entry_t * const alternate_pdes[] = APDES_INITIALIZER;
324 
325 long nkptp[] = NKPTP_INITIALIZER;
326 
327 static kmutex_t pmaps_lock;
328 
329 static vaddr_t pmap_maxkvaddr;
330 
331 #define COUNT(x)	/* nothing */
332 
333 /*
334  * XXX kludge: dummy locking to make KASSERTs in uvm_page.c comfortable.
335  * actual locking is done by pm_lock.
336  */
337 #if defined(DIAGNOSTIC)
338 #define	PMAP_SUBOBJ_LOCK(pm, idx) \
339 	KASSERT(mutex_owned(&(pm)->pm_lock)); \
340 	if ((idx) != 0) \
341 		mutex_enter(&(pm)->pm_obj[(idx)].vmobjlock)
342 #define	PMAP_SUBOBJ_UNLOCK(pm, idx) \
343 	KASSERT(mutex_owned(&(pm)->pm_lock)); \
344 	if ((idx) != 0) \
345 		mutex_exit(&(pm)->pm_obj[(idx)].vmobjlock)
346 #else /* defined(DIAGNOSTIC) */
347 #define	PMAP_SUBOBJ_LOCK(pm, idx)	/* nothing */
348 #define	PMAP_SUBOBJ_UNLOCK(pm, idx)	/* nothing */
349 #endif /* defined(DIAGNOSTIC) */
350 
351 /*
352  * Misc. event counters.
353  */
354 struct evcnt pmap_iobmp_evcnt;
355 struct evcnt pmap_ldt_evcnt;
356 
357 /*
358  * Global TLB shootdown mailbox.
359  */
360 struct evcnt pmap_tlb_evcnt __aligned(64);
361 struct pmap_mbox pmap_mbox __aligned(64);
362 
363 /*
364  * PAT
365  */
366 #define	PATENTRY(n, type)	(type << ((n) * 8))
367 #define	PAT_UC		0x0ULL
368 #define	PAT_WC		0x1ULL
369 #define	PAT_WT		0x4ULL
370 #define	PAT_WP		0x5ULL
371 #define	PAT_WB		0x6ULL
372 #define	PAT_UCMINUS	0x7ULL
373 
374 static bool cpu_pat_enabled = false;
375 
376 
377 /*
378  * Per-CPU data.  The pmap mailbox is cache intensive so gets its
379  * own line.  Note that the mailbox must be the first item.
380  */
381 struct pmap_cpu {
382 	/* TLB shootdown */
383 	struct pmap_mbox pc_mbox;
384 };
385 
386 union {
387 	struct pmap_cpu pc;
388 	uint8_t padding[64];
389 } pmap_cpu[MAXCPUS] __aligned(64);
390 
391 /*
392  * global data structures
393  */
394 
395 static struct pmap kernel_pmap_store;	/* the kernel's pmap (proc0) */
396 struct pmap *const kernel_pmap_ptr = &kernel_pmap_store;
397 
398 /*
399  * pmap_pg_g: if our processor supports PG_G in the PTE then we
400  * set pmap_pg_g to PG_G (otherwise it is zero).
401  */
402 
403 int pmap_pg_g = 0;
404 
405 /*
406  * pmap_largepages: if our processor supports PG_PS and we are
407  * using it, this is set to true.
408  */
409 
410 int pmap_largepages;
411 
412 /*
413  * i386 physical memory comes in a big contig chunk with a small
414  * hole toward the front of it...  the following two paddr_t's
415  * (shared with machdep.c) describe the physical address space
416  * of this machine.
417  */
418 paddr_t avail_start;	/* PA of first available physical page */
419 paddr_t avail_end;	/* PA of last available physical page */
420 
421 #ifdef XEN
422 #ifdef __x86_64__
423 /* Dummy PGD for user cr3, used between pmap_deactivate() and pmap_activate() */
424 static paddr_t xen_dummy_user_pgd;
425 #endif /* __x86_64__ */
426 paddr_t pmap_pa_start; /* PA of first physical page for this domain */
427 paddr_t pmap_pa_end;   /* PA of last physical page for this domain */
428 #endif /* XEN */
429 
430 #define	VM_PAGE_TO_PP(pg)	(&(pg)->mdpage.mp_pp)
431 
432 #define	pp_lock(pp)	mutex_spin_enter(&(pp)->pp_lock)
433 #define	pp_unlock(pp)	mutex_spin_exit(&(pp)->pp_lock)
434 #define	pp_locked(pp)	mutex_owned(&(pp)->pp_lock)
435 
436 #define	PV_HASH_SIZE		32768
437 #define	PV_HASH_LOCK_CNT	32
438 
439 struct pv_hash_lock {
440 	kmutex_t lock;
441 } __aligned(CACHE_LINE_SIZE) pv_hash_locks[PV_HASH_LOCK_CNT]
442     __aligned(CACHE_LINE_SIZE);
443 
444 struct pv_hash_head {
445 	SLIST_HEAD(, pv_entry) hh_list;
446 } pv_hash_heads[PV_HASH_SIZE];
447 
448 static u_int
449 pvhash_hash(struct vm_page *ptp, vaddr_t va)
450 {
451 
452 	return (uintptr_t)ptp / sizeof(*ptp) + (va >> PAGE_SHIFT);
453 }
454 
455 static struct pv_hash_head *
456 pvhash_head(u_int hash)
457 {
458 
459 	return &pv_hash_heads[hash % PV_HASH_SIZE];
460 }
461 
462 static kmutex_t *
463 pvhash_lock(u_int hash)
464 {
465 
466 	return &pv_hash_locks[hash % PV_HASH_LOCK_CNT].lock;
467 }
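
/*
 * Illustrative sketch (not part of the build): a pv hash lookup combines
 * the three helpers above.  The hash is computed once and reused for
 * both the chain lock and the chain head ("ptp" and "va" stand for the
 * caller's page table page and virtual address):
 *
 *	u_int hash = pvhash_hash(ptp, va);
 *	kmutex_t *lock = pvhash_lock(hash);
 *	struct pv_hash_head *hh = pvhash_head(hash);
 *
 *	mutex_spin_enter(lock);
 *	... walk hh->hh_list with SLIST_FOREACH ...
 *	mutex_spin_exit(lock);
 */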
468 
469 static struct pv_entry *
470 pvhash_remove(struct pv_hash_head *hh, struct vm_page *ptp, vaddr_t va)
471 {
472 	struct pv_entry *pve;
473 	struct pv_entry *prev;
474 
475 	prev = NULL;
476 	SLIST_FOREACH(pve, &hh->hh_list, pve_hash) {
477 		if (pve->pve_pte.pte_ptp == ptp &&
478 		    pve->pve_pte.pte_va == va) {
479 			if (prev != NULL) {
480 				SLIST_REMOVE_AFTER(prev, pve_hash);
481 			} else {
482 				SLIST_REMOVE_HEAD(&hh->hh_list, pve_hash);
483 			}
484 			break;
485 		}
486 		prev = pve;
487 	}
488 	return pve;
489 }
490 
491 /*
492  * other data structures
493  */
494 
495 static pt_entry_t protection_codes[8];	/* maps MI prot to i386 prot code */
496 static bool pmap_initialized = false;	/* pmap_init done yet? */
497 
498 /*
499  * the following two vaddr_t's are used during system startup
500  * to keep track of how much of the kernel's VM space we have used.
501  * once the system is started, the management of the remaining kernel
502  * VM space is turned over to the kernel_map vm_map.
503  */
504 
505 static vaddr_t virtual_avail;	/* VA of first free KVA */
506 static vaddr_t virtual_end;	/* VA of last free KVA */
507 
508 /*
509  * linked list of all non-kernel pmaps
510  */
511 
512 static struct pmap_head pmaps;
513 
514 /*
515  * pool that pmap structures are allocated from
516  */
517 
518 static struct pool_cache pmap_cache;
519 
520 /*
521  * pv_entry cache
522  */
523 
524 static struct pool_cache pmap_pv_cache;
525 
526 /*
527  * MULTIPROCESSOR: special VA's/ PTE's are actually allocated inside a
528  * maxcpus*NPTECL array of PTE's, to avoid cache line thrashing
529  * due to false sharing.
530  */
531 
532 #ifdef MULTIPROCESSOR
533 #define PTESLEW(pte, id) ((pte)+(id)*NPTECL)
534 #define VASLEW(va,id) ((va)+(id)*NPTECL*PAGE_SIZE)
535 #else
536 #define PTESLEW(pte, id) (pte)
537 #define VASLEW(va,id) (va)
538 #endif
539 
540 /*
541  * special VAs and the PTEs that map them
542  */
543 static pt_entry_t *csrc_pte, *cdst_pte, *zero_pte, *ptp_pte, *early_zero_pte;
544 static char *csrcp, *cdstp, *zerop, *ptpp, *early_zerop;
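
/*
 * Illustrative sketch (not part of the build): with MULTIPROCESSOR, each
 * CPU indexes its own slot of the special VAs above through the slew
 * macros.  Roughly, for the zeroing window ("pa", "zpte" and "zva" are
 * placeholder names used only here):
 *
 *	int id = cpu_number();
 *	pt_entry_t *zpte = PTESLEW(zero_pte, id);
 *	char *zva = VASLEW(zerop, id);
 *
 *	pmap_pte_set(zpte, pmap_pa2pte(pa) | PG_V | PG_RW | PG_k);
 *	pmap_pte_flush();
 *	pmap_update_pg((vaddr_t)zva);
 *	memset(zva, 0, PAGE_SIZE);
 */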
545 
546 int pmap_enter_default(pmap_t, vaddr_t, paddr_t, vm_prot_t, u_int);
547 
548 /*
549  * pool and cache that PDPs are allocated from
550  */
551 
552 static struct pool_cache pmap_pdp_cache;
553 int	pmap_pdp_ctor(void *, void *, int);
554 void	pmap_pdp_dtor(void *, void *);
555 #ifdef PAE
556 /* need to allocate items of 4 pages */
557 void *pmap_pdp_alloc(struct pool *, int);
558 void pmap_pdp_free(struct pool *, void *);
559 static struct pool_allocator pmap_pdp_allocator = {
560 	.pa_alloc = pmap_pdp_alloc,
561 	.pa_free = pmap_pdp_free,
562 	.pa_pagesz = PAGE_SIZE * PDP_SIZE,
563 };
564 #endif /* PAE */
565 
566 void *vmmap; /* XXX: used by mem.c... it should really uvm_map_reserve it */
567 
568 extern vaddr_t idt_vaddr;			/* we allocate IDT early */
569 extern paddr_t idt_paddr;
570 
571 #ifdef _LP64
572 extern vaddr_t lo32_vaddr;
573 extern vaddr_t lo32_paddr;
574 #endif
575 
576 extern int end;
577 
578 #ifdef i386
579 /* stuff to fix the pentium f00f bug */
580 extern vaddr_t pentium_idt_vaddr;
581 #endif
582 
583 
584 /*
585  * local prototypes
586  */
587 
588 static struct vm_page	*pmap_get_ptp(struct pmap *, vaddr_t,
589 				      pd_entry_t * const *);
590 static struct vm_page	*pmap_find_ptp(struct pmap *, vaddr_t, paddr_t, int);
591 static void		 pmap_freepage(struct pmap *, struct vm_page *, int);
592 static void		 pmap_free_ptp(struct pmap *, struct vm_page *,
593 				       vaddr_t, pt_entry_t *,
594 				       pd_entry_t * const *);
595 static bool		 pmap_is_curpmap(struct pmap *);
596 static bool		 pmap_is_active(struct pmap *, struct cpu_info *, bool);
597 static bool		 pmap_remove_pte(struct pmap *, struct vm_page *,
598 					 pt_entry_t *, vaddr_t,
599 					 struct pv_entry **);
600 static pt_entry_t	 pmap_remove_ptes(struct pmap *, struct vm_page *,
601 					  vaddr_t, vaddr_t, vaddr_t,
602 					  struct pv_entry **);
603 
604 static void		 pmap_unmap_apdp(void);
605 static bool		 pmap_get_physpage(vaddr_t, int, paddr_t *);
606 static void		 pmap_alloc_level(pd_entry_t * const *, vaddr_t, int,
607 					  long *);
608 
609 static bool		 pmap_reactivate(struct pmap *);
610 
611 /*
612  * p m a p   h e l p e r   f u n c t i o n s
613  */
614 
615 static inline void
616 pmap_stats_update(struct pmap *pmap, int resid_diff, int wired_diff)
617 {
618 
619 	if (pmap == pmap_kernel()) {
620 		atomic_add_long(&pmap->pm_stats.resident_count, resid_diff);
621 		atomic_add_long(&pmap->pm_stats.wired_count, wired_diff);
622 	} else {
623 		KASSERT(mutex_owned(&pmap->pm_lock));
624 		pmap->pm_stats.resident_count += resid_diff;
625 		pmap->pm_stats.wired_count += wired_diff;
626 	}
627 }
628 
629 static inline void
630 pmap_stats_update_bypte(struct pmap *pmap, pt_entry_t npte, pt_entry_t opte)
631 {
632 	int resid_diff = ((npte & PG_V) ? 1 : 0) - ((opte & PG_V) ? 1 : 0);
633 	int wired_diff = ((npte & PG_W) ? 1 : 0) - ((opte & PG_W) ? 1 : 0);
634 
635 	KASSERT((npte & (PG_V | PG_W)) != PG_W);
636 	KASSERT((opte & (PG_V | PG_W)) != PG_W);
637 
638 	pmap_stats_update(pmap, resid_diff, wired_diff);
639 }
640 
641 /*
642  * ptp_to_pmap: lookup pmap by ptp
643  */
644 
645 static struct pmap *
646 ptp_to_pmap(struct vm_page *ptp)
647 {
648 	struct pmap *pmap;
649 
650 	if (ptp == NULL) {
651 		return pmap_kernel();
652 	}
653 	pmap = (struct pmap *)ptp->uobject;
654 	KASSERT(pmap != NULL);
655 	KASSERT(&pmap->pm_obj[0] == ptp->uobject);
656 	return pmap;
657 }
658 
659 static inline struct pv_pte *
660 pve_to_pvpte(struct pv_entry *pve)
661 {
662 
663 	KASSERT((void *)&pve->pve_pte == (void *)pve);
664 	return &pve->pve_pte;
665 }
666 
667 static inline struct pv_entry *
668 pvpte_to_pve(struct pv_pte *pvpte)
669 {
670 	struct pv_entry *pve = (void *)pvpte;
671 
672 	KASSERT(pve_to_pvpte(pve) == pvpte);
673 	return pve;
674 }
675 
676 /*
677  * pv_pte_first, pv_pte_next: PV list iterator.
678  */
679 
680 static struct pv_pte *
681 pv_pte_first(struct pmap_page *pp)
682 {
683 
684 	KASSERT(pp_locked(pp));
685 	if ((pp->pp_flags & PP_EMBEDDED) != 0) {
686 		return &pp->pp_pte;
687 	}
688 	return pve_to_pvpte(LIST_FIRST(&pp->pp_head.pvh_list));
689 }
690 
691 static struct pv_pte *
692 pv_pte_next(struct pmap_page *pp, struct pv_pte *pvpte)
693 {
694 
695 	KASSERT(pvpte != NULL);
696 	KASSERT(pp_locked(pp));
697 	if (pvpte == &pp->pp_pte) {
698 		KASSERT((pp->pp_flags & PP_EMBEDDED) != 0);
699 		return NULL;
700 	}
701 	KASSERT((pp->pp_flags & PP_EMBEDDED) == 0);
702 	return pve_to_pvpte(LIST_NEXT(pvpte_to_pve(pvpte), pve_list));
703 }
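
/*
 * Illustrative sketch (not part of the build): together the two helpers
 * above are the standard way to visit every <PMAP,VA> mapping of a
 * managed page while holding its pp lock:
 *
 *	struct pv_pte *pvpte;
 *
 *	pp_lock(pp);
 *	for (pvpte = pv_pte_first(pp); pvpte != NULL;
 *	    pvpte = pv_pte_next(pp, pvpte)) {
 *		... look at pvpte->pte_ptp and pvpte->pte_va ...
 *	}
 *	pp_unlock(pp);
 */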
704 
705 /*
706  * pmap_is_curpmap: is this pmap the one currently loaded [in %cr3]?
707  *		of course the kernel is always loaded
708  */
709 
710 inline static bool
711 pmap_is_curpmap(struct pmap *pmap)
712 {
713 #if defined(XEN) && defined(__x86_64__)
714 	/*
715 	 * Only kernel pmap is physically loaded.
716 	 * User PGD may be active, but TLB will be flushed
717 	 * with HYPERVISOR_iret anyway, so let's say no
718 	 */
719 	return(pmap == pmap_kernel());
720 #else /* XEN && __x86_64__*/
721 	return((pmap == pmap_kernel()) ||
722 	       (pmap == curcpu()->ci_pmap));
723 #endif
724 }
725 
726 /*
727  * pmap_is_active: is this pmap loaded into the specified processor's %cr3?
728  */
729 
730 inline static bool
731 pmap_is_active(struct pmap *pmap, struct cpu_info *ci, bool kernel)
732 {
733 
734 	return (pmap == pmap_kernel() ||
735 	    (pmap->pm_cpus & ci->ci_cpumask) != 0 ||
736 	    (kernel && (pmap->pm_kernel_cpus & ci->ci_cpumask) != 0));
737 }
738 
739 static void
740 pmap_apte_flush(struct pmap *pmap)
741 {
742 
743 	KASSERT(kpreempt_disabled());
744 
745 	/*
746 	 * Flush the APTE mapping from all other CPUs that
747  * are using the pmap we are using (whose APTE space
748 	 * is the one we've just modified).
749 	 *
750 	 * XXXthorpej -- find a way to defer the IPI.
751 	 */
752 	pmap_tlb_shootdown(pmap, (vaddr_t)-1LL, 0, 0);
753 	pmap_tlb_shootwait();
754 }
755 
756 /*
757  * Unmap the content of APDP PDEs
758  */
759 static void
760 pmap_unmap_apdp(void)
761 {
762 	int i;
763 
764 	for (i = 0; i < PDP_SIZE; i++) {
765 		pmap_pte_set(APDP_PDE+i, 0);
766 #if defined (XEN) && defined (PAE)
767 		/* clear shadow entries too */
768 		pmap_pte_set(APDP_PDE_SHADOW+i, 0);
769 #endif
770 	}
771 }
772 
773 /*
774  *	Add a reference to the specified pmap.
775  */
776 
777 inline void
778 pmap_reference(struct pmap *pmap)
779 {
780 
781 	atomic_inc_uint(&pmap->pm_obj[0].uo_refs);
782 }
783 
784 /*
785  * pmap_map_ptes: map a pmap's PTEs into KVM and lock them in
786  *
787  * => we lock enough pmaps to keep things locked in
788  * => must be undone with pmap_unmap_ptes before returning
789  */
790 
791 void
792 pmap_map_ptes(struct pmap *pmap, struct pmap **pmap2,
793 	      pd_entry_t **ptepp, pd_entry_t * const **pdeppp)
794 {
795 	pd_entry_t opde, npde;
796 	struct pmap *ourpmap;
797 	struct cpu_info *ci;
798 	struct lwp *l;
799 	bool iscurrent;
800 	uint64_t ncsw;
801 #ifdef XEN
802 	int s, i;
803 #endif
804 
805 	/* the kernel's pmap is always accessible */
806 	if (pmap == pmap_kernel()) {
807 		*pmap2 = NULL;
808 		*ptepp = PTE_BASE;
809 		*pdeppp = normal_pdes;
810 		return;
811 	}
812 	KASSERT(kpreempt_disabled());
813 
814  retry:
815 	l = curlwp;
816 	ncsw = l->l_ncsw;
817  	ourpmap = NULL;
818 	ci = curcpu();
819 #if defined(XEN) && defined(__x86_64__)
820 	/*
821 	 * curmap can only be pmap_kernel so at this point
822 	 * pmap_is_curpmap is always false
823 	 */
824 	iscurrent = 0;
825 	ourpmap = pmap_kernel();
826 #else /* XEN && __x86_64__*/
827 	if (ci->ci_want_pmapload &&
828 	    vm_map_pmap(&l->l_proc->p_vmspace->vm_map) == pmap) {
829 		pmap_load();
830 		if (l->l_ncsw != ncsw)
831 			goto retry;
832 	}
833 	iscurrent = pmap_is_curpmap(pmap);
834 	/* if curpmap then we are always mapped */
835 	if (iscurrent) {
836 		mutex_enter(&pmap->pm_lock);
837 		*pmap2 = NULL;
838 		*ptepp = PTE_BASE;
839 		*pdeppp = normal_pdes;
840 		goto out;
841 	}
842 	ourpmap = ci->ci_pmap;
843 #endif /* XEN && __x86_64__ */
844 
845 	/* need to lock both curpmap and pmap: use ordered locking */
846 	pmap_reference(ourpmap);
847 	if ((uintptr_t) pmap < (uintptr_t) ourpmap) {
848 		mutex_enter(&pmap->pm_lock);
849 		mutex_enter(&ourpmap->pm_lock);
850 	} else {
851 		mutex_enter(&ourpmap->pm_lock);
852 		mutex_enter(&pmap->pm_lock);
853 	}
854 
855 	if (l->l_ncsw != ncsw)
856 		goto unlock_and_retry;
857 
858 	/* need to load a new alternate pt space into curpmap? */
859 	COUNT(apdp_pde_map);
860 	opde = *APDP_PDE;
861 	if (!pmap_valid_entry(opde) ||
862 	    pmap_pte2pa(opde) != pmap_pdirpa(pmap, 0)) {
863 #ifdef XEN
864 		s = splvm();
865 		/* Make recursive entry usable in user PGD */
866 		for (i = 0; i < PDP_SIZE; i++) {
867 			npde = pmap_pa2pte(
868 			    pmap_pdirpa(pmap, i * NPDPG)) | PG_k | PG_V;
869 			xpq_queue_pte_update(
870 			    xpmap_ptom(pmap_pdirpa(pmap, PDIR_SLOT_PTE + i)),
871 			    npde);
872 			xpq_queue_pte_update(xpmap_ptetomach(&APDP_PDE[i]),
873 			    npde);
874 #ifdef PAE
875 			/* update shadow entry too */
876 			xpq_queue_pte_update(
877 			    xpmap_ptetomach(&APDP_PDE_SHADOW[i]), npde);
878 #endif /* PAE */
879 			xpq_queue_invlpg(
880 			    (vaddr_t)&pmap->pm_pdir[PDIR_SLOT_PTE + i]);
881 		}
882 		if (pmap_valid_entry(opde))
883 			pmap_apte_flush(ourpmap);
884 		splx(s);
885 #else /* XEN */
886 		int i;
887 		for (i = 0; i < PDP_SIZE; i++) {
888 			npde = pmap_pa2pte(
889 			    pmap_pdirpa(pmap, i * NPDPG)) | PG_RW | PG_V;
890 			pmap_pte_set(APDP_PDE+i, npde);
891 		}
892 		pmap_pte_flush();
893 		if (pmap_valid_entry(opde))
894 			pmap_apte_flush(ourpmap);
895 #endif /* XEN */
896 	}
897 	*pmap2 = ourpmap;
898 	*ptepp = APTE_BASE;
899 	*pdeppp = alternate_pdes;
900 	KASSERT(l->l_ncsw == ncsw);
901 #if !defined(XEN) || !defined(__x86_64__)
902  out:
903 #endif
904  	/*
905  	 * might have blocked, need to retry?
906  	 */
907 	if (l->l_ncsw != ncsw) {
908  unlock_and_retry:
909 	    	if (ourpmap != NULL) {
910 			mutex_exit(&ourpmap->pm_lock);
911 			pmap_destroy(ourpmap);
912 		}
913 		mutex_exit(&pmap->pm_lock);
914 		goto retry;
915 	}
916 
917 	return;
918 }
919 
920 /*
921  * pmap_unmap_ptes: unlock the PTE mapping of "pmap"
922  */
923 
924 void
925 pmap_unmap_ptes(struct pmap *pmap, struct pmap *pmap2)
926 {
927 
928 	if (pmap == pmap_kernel()) {
929 		return;
930 	}
931 	KASSERT(kpreempt_disabled());
932 	if (pmap2 == NULL) {
933 		mutex_exit(&pmap->pm_lock);
934 	} else {
935 #if defined(XEN) && defined(__x86_64__)
936 		KASSERT(pmap2 == pmap_kernel());
937 #else
938 		KASSERT(curcpu()->ci_pmap == pmap2);
939 #endif
940 #if defined(MULTIPROCESSOR)
941 		pmap_unmap_apdp();
942 		pmap_pte_flush();
943 		pmap_apte_flush(pmap2);
944 #endif
945 		COUNT(apdp_pde_unmap);
946 		mutex_exit(&pmap->pm_lock);
947 		mutex_exit(&pmap2->pm_lock);
948 		pmap_destroy(pmap2);
949 	}
950 }
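
/*
 * Illustrative sketch (not part of the build): callers bracket their PTE
 * access with the pair above, keeping preemption disabled for the
 * duration:
 *
 *	struct pmap *pmap2;
 *	pt_entry_t *ptes;
 *	pd_entry_t * const *pdes;
 *
 *	kpreempt_disable();
 *	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
 *	... read or modify ptes[pl1_i(va)] ...
 *	pmap_unmap_ptes(pmap, pmap2);
 *	kpreempt_enable();
 */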
951 
952 inline static void
953 pmap_exec_account(struct pmap *pm, vaddr_t va, pt_entry_t opte, pt_entry_t npte)
954 {
955 
956 #if !defined(__x86_64__)
957 	if (curproc == NULL || curproc->p_vmspace == NULL ||
958 	    pm != vm_map_pmap(&curproc->p_vmspace->vm_map))
959 		return;
960 
961 	if ((opte ^ npte) & PG_X)
962 		pmap_update_pg(va);
963 
964 	/*
965 	 * Executability was removed on the last executable change.
966 	 * Reset the code segment to something conservative and
967 	 * let the trap handler deal with setting the right limit.
968 	 * We can't do that because of locking constraints on the vm map.
969 	 */
970 
971 	if ((opte & PG_X) && (npte & PG_X) == 0 && va == pm->pm_hiexec) {
972 		struct trapframe *tf = curlwp->l_md.md_regs;
973 
974 		tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL);
975 		pm->pm_hiexec = I386_MAX_EXE_ADDR;
976 	}
977 #endif /* !defined(__x86_64__) */
978 }
979 
980 #if !defined(__x86_64__)
981 /*
982  * Fixup the code segment to cover all potential executable mappings.
983  * returns 0 if no changes to the code segment were made.
984  */
985 
986 int
987 pmap_exec_fixup(struct vm_map *map, struct trapframe *tf, struct pcb *pcb)
988 {
989 	struct vm_map_entry *ent;
990 	struct pmap *pm = vm_map_pmap(map);
991 	vaddr_t va = 0;
992 
993 	vm_map_lock_read(map);
994 	for (ent = (&map->header)->next; ent != &map->header; ent = ent->next) {
995 
996 		/*
997 		 * This entry has greater va than the entries before.
998 		 * We need to make it point to the last page, not past it.
999 		 */
1000 
1001 		if (ent->protection & VM_PROT_EXECUTE)
1002 			va = trunc_page(ent->end) - PAGE_SIZE;
1003 	}
1004 	vm_map_unlock_read(map);
1005 	if (va == pm->pm_hiexec && tf->tf_cs == GSEL(GUCODEBIG_SEL, SEL_UPL))
1006 		return (0);
1007 
1008 	pm->pm_hiexec = va;
1009 	if (pm->pm_hiexec > I386_MAX_EXE_ADDR) {
1010 		tf->tf_cs = GSEL(GUCODEBIG_SEL, SEL_UPL);
1011 	} else {
1012 		tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL);
1013 		return (0);
1014 	}
1015 	return (1);
1016 }
1017 #endif /* !defined(__x86_64__) */
1018 
1019 void
1020 pat_init(struct cpu_info *ci)
1021 {
1022 	uint64_t pat;
1023 
1024 	if (!(ci->ci_feat_val[0] & CPUID_PAT))
1025 		return;
1026 
1027 	/* We change WT to WC. Leave all other entries at their default values. */
1028 	pat = PATENTRY(0, PAT_WB) | PATENTRY(1, PAT_WC) |
1029 	      PATENTRY(2, PAT_UCMINUS) | PATENTRY(3, PAT_UC) |
1030 	      PATENTRY(4, PAT_WB) | PATENTRY(5, PAT_WC) |
1031 	      PATENTRY(6, PAT_UCMINUS) | PATENTRY(7, PAT_UC);
1032 
1033 	wrmsr(MSR_CR_PAT, pat);
1034 	cpu_pat_enabled = true;
1035 	aprint_debug_dev(ci->ci_dev, "PAT enabled\n");
1036 }
1037 
1038 static pt_entry_t
1039 pmap_pat_flags(u_int flags)
1040 {
1041 	u_int cacheflags = (flags & PMAP_CACHE_MASK);
1042 
1043 	if (!cpu_pat_enabled) {
1044 		switch (cacheflags) {
1045 		case PMAP_NOCACHE:
1046 		case PMAP_NOCACHE_OVR:
1047 			/* results in PGC_UCMINUS on cpus which have
1048 			 * the cpuid PAT but PAT "disabled"
1049 			 */
1050 			return PG_N;
1051 		default:
1052 			return 0;
1053 		}
1054 	}
1055 
1056 	switch (cacheflags) {
1057 	case PMAP_NOCACHE:
1058 		return PGC_UC;
1059 	case PMAP_WRITE_COMBINE:
1060 		return PGC_WC;
1061 	case PMAP_WRITE_BACK:
1062 		return PGC_WB;
1063 	case PMAP_NOCACHE_OVR:
1064 		return PGC_UCMINUS;
1065 	}
1066 
1067 	return 0;
1068 }
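
/*
 * Illustrative sketch (not part of the build): a caller wanting a
 * write-combined kernel mapping passes the cache flag down through
 * pmap_kenter_pa() below, which ORs pmap_pat_flags() into the PTE
 * ("fb_va" and "fb_pa" are placeholder names):
 *
 *	pmap_kenter_pa(fb_va, fb_pa, VM_PROT_READ | VM_PROT_WRITE,
 *	    PMAP_WRITE_COMBINE);
 *	pmap_update(pmap_kernel());
 */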
1069 
1070 /*
1071  * p m a p   k e n t e r   f u n c t i o n s
1072  *
1073  * functions to quickly enter/remove pages from the kernel address
1074  * space.   pmap_kremove is exported to MI kernel.  we make use of
1075  * the recursive PTE mappings.
1076  */
1077 
1078 /*
1079  * pmap_kenter_pa: enter a kernel mapping without R/M (pv_entry) tracking
1080  *
1081  * => no need to lock anything, assume va is already allocated
1082  * => should be faster than normal pmap enter function
1083  */
1084 
1085 void
1086 pmap_kenter_pa(vaddr_t va, paddr_t pa, vm_prot_t prot, u_int flags)
1087 {
1088 	pt_entry_t *pte, opte, npte;
1089 
1090 	KASSERT(!(prot & ~VM_PROT_ALL));
1091 
1092 	if (va < VM_MIN_KERNEL_ADDRESS)
1093 		pte = vtopte(va);
1094 	else
1095 		pte = kvtopte(va);
1096 #ifdef DOM0OPS
1097 	if (pa < pmap_pa_start || pa >= pmap_pa_end) {
1098 #ifdef DEBUG
1099 		printk("pmap_kenter_pa: pa 0x%" PRIx64 " for va 0x%" PRIx64
1100 		    " outside range\n", (int64_t)pa, (int64_t)va);
1101 #endif /* DEBUG */
1102 		npte = pa;
1103 	} else
1104 #endif /* DOM0OPS */
1105 		npte = pmap_pa2pte(pa);
1106 	npte |= protection_codes[prot] | PG_k | PG_V | pmap_pg_g;
1107 	npte |= pmap_pat_flags(flags);
1108 	opte = pmap_pte_testset(pte, npte); /* zap! */
1109 #if defined(DIAGNOSTIC)
1110 	/* XXX For now... */
1111 	if (opte & PG_PS)
1112 		panic("pmap_kenter_pa: PG_PS");
1113 #endif
1114 	if ((opte & (PG_V | PG_U)) == (PG_V | PG_U)) {
1115 		/* This should not happen, so no need to batch updates. */
1116 		kpreempt_disable();
1117 		pmap_tlb_shootdown(pmap_kernel(), va, 0, opte);
1118 		kpreempt_enable();
1119 	}
1120 }
1121 
1122 void
1123 pmap_emap_enter(vaddr_t va, paddr_t pa, vm_prot_t prot)
1124 {
1125 	pt_entry_t *pte, opte, npte;
1126 
1127 	KASSERT((prot & ~VM_PROT_ALL) == 0);
1128 	pte = (va < VM_MIN_KERNEL_ADDRESS) ? vtopte(va) : kvtopte(va);
1129 
1130 #ifdef DOM0OPS
1131 	if (pa < pmap_pa_start || pa >= pmap_pa_end) {
1132 		npte = pa;
1133 	} else
1134 #endif
1135 		npte = pmap_pa2pte(pa);
1138 	npte |= protection_codes[prot] | PG_k | PG_V;
1139 	opte = pmap_pte_testset(pte, npte);
1140 }
1141 
1142 /*
1143  * pmap_emap_sync: perform TLB flush or pmap load, if it was deferred.
1144  */
1145 void
1146 pmap_emap_sync(bool canload)
1147 {
1148 	struct cpu_info *ci = curcpu();
1149 	struct pmap *pmap;
1150 
1151 	KASSERT(kpreempt_disabled());
1152 	if (__predict_true(ci->ci_want_pmapload && canload)) {
1153 		/*
1154 		 * XXX: Hint for pmap_reactivate(), which might suggest to
1155 		 * not perform TLB flush, if state has not changed.
1156 		 */
1157 		pmap = vm_map_pmap(&curlwp->l_proc->p_vmspace->vm_map);
1158 		if (__predict_false(pmap == ci->ci_pmap)) {
1159 			const uint32_t cpumask = ci->ci_cpumask;
1160 			atomic_and_32(&pmap->pm_cpus, ~cpumask);
1161 		}
1162 		pmap_load();
1163 		KASSERT(ci->ci_want_pmapload == 0);
1164 	} else {
1165 		tlbflush();
1166 	}
1167 
1168 }
1169 
1170 void
1171 pmap_emap_remove(vaddr_t sva, vsize_t len)
1172 {
1173 	pt_entry_t *pte, xpte = 0;
1174 	vaddr_t va, eva = sva + len;
1175 
1176 	for (va = sva; va < eva; va += PAGE_SIZE) {
1177 		pte = (va < VM_MIN_KERNEL_ADDRESS) ? vtopte(va) : kvtopte(va);
1178 		xpte |= pmap_pte_testset(pte, 0);
1179 	}
1180 }
1181 
1182 __weak_alias(pmap_kenter_ma, pmap_kenter_pa);
1183 
1184 #if defined(__x86_64__)
1185 /*
1186  * Change protection for a virtual address.  This is local to the current
1187  * CPU only; we don't care about TLB shootdowns.
1188  *
1189  * => must be called with preemption disabled
1190  */
1191 void
1192 pmap_changeprot_local(vaddr_t va, vm_prot_t prot)
1193 {
1194 	pt_entry_t *pte, opte, npte;
1195 
1196 	KASSERT(kpreempt_disabled());
1197 
1198 	if (va < VM_MIN_KERNEL_ADDRESS)
1199 		pte = vtopte(va);
1200 	else
1201 		pte = kvtopte(va);
1202 
1203 	npte = opte = *pte;
1204 
1205 	if ((prot & VM_PROT_WRITE) != 0)
1206 		npte |= PG_RW;
1207 	else
1208 		npte &= ~PG_RW;
1209 
1210 	if (opte != npte) {
1211 		pmap_pte_set(pte, npte);
1212 		pmap_pte_flush();
1213 		invlpg(va);
1214 	}
1215 }
1216 #endif /* defined(__x86_64__) */
1217 
1218 /*
1219  * pmap_kremove: remove a kernel mapping(s) without R/M (pv_entry) tracking
1220  *
1221  * => no need to lock anything
1222  * => caller must dispose of any vm_page mapped in the va range
1223  * => note: not an inline function
1224  * => we assume the va is page aligned and the len is a multiple of PAGE_SIZE
1225  * => we assume kernel only unmaps valid addresses and thus don't bother
1226  *    checking the valid bit before doing TLB flushing
1227  * => must be followed by call to pmap_update() before reuse of page
1228  */
1229 
1230 void
1231 pmap_kremove(vaddr_t sva, vsize_t len)
1232 {
1233 	pt_entry_t *pte, xpte;
1234 	vaddr_t va, eva;
1235 
1236 	eva = sva + len;
1237 	xpte = 0;
1238 
1239 	for (va = sva; va < eva; va += PAGE_SIZE) {
1240 		if (va < VM_MIN_KERNEL_ADDRESS)
1241 			pte = vtopte(va);
1242 		else
1243 			pte = kvtopte(va);
1244 		xpte |= pmap_pte_testset(pte, 0); /* zap! */
1245 #if defined(DIAGNOSTIC)
1246 		/* XXX For now... */
1247 		if (xpte & PG_PS)
1248 			panic("pmap_kremove: PG_PS");
1249 		if (xpte & PG_PVLIST)
1250 			panic("pmap_kremove: PG_PVLIST mapping for 0x%lx",
1251 			      va);
1252 #endif
1253 	}
1254 	if ((xpte & (PG_V | PG_U)) == (PG_V | PG_U)) {
1255 		kpreempt_disable();
1256 		pmap_tlb_shootdown(pmap_kernel(), sva, eva, xpte);
1257 		kpreempt_enable();
1258 	}
1259 }
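
/*
 * Illustrative sketch (not part of the build): a transient kernel-only
 * mapping made and torn down with the two functions above ("kva" and
 * "pa" are placeholder names):
 *
 *	pmap_kenter_pa(kva, pa, VM_PROT_READ | VM_PROT_WRITE, 0);
 *	pmap_update(pmap_kernel());
 *	... use the memory at kva ...
 *	pmap_kremove(kva, PAGE_SIZE);
 *	pmap_update(pmap_kernel());
 */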
1260 
1261 /*
1262  * p m a p   i n i t   f u n c t i o n s
1263  *
1264  * pmap_bootstrap and pmap_init are called during system startup
1265  * to init the pmap module.   pmap_bootstrap() does a low level
1266  * init just to get things rolling.   pmap_init() finishes the job.
1267  */
1268 
1269 /*
1270  * pmap_bootstrap: get the system in a state where it can run with VM
1271  *	properly enabled (called before main()).   the VM system is
1272  *      fully init'd later...
1273  *
1274  * => on i386, locore.s has already enabled the MMU by allocating
1275  *	a PDP for the kernel, and nkpde PTP's for the kernel.
1276  * => kva_start is the first free virtual address in kernel space
1277  */
1278 
1279 void
1280 pmap_bootstrap(vaddr_t kva_start)
1281 {
1282 	struct pmap *kpm;
1283 	pt_entry_t *pte;
1284 	int i;
1285 	vaddr_t kva;
1286 #ifndef XEN
1287 	unsigned long p1i;
1288 	vaddr_t kva_end;
1289 #endif
1290 
1291 	pt_entry_t pg_nx = (cpu_feature[2] & CPUID_NOX ? PG_NX : 0);
1292 
1293 	/*
1294 	 * set up our local static global vars that keep track of the
1295 	 * usage of KVM before kernel_map is set up
1296 	 */
1297 
1298 	virtual_avail = kva_start;		/* first free KVA */
1299 	virtual_end = VM_MAX_KERNEL_ADDRESS;	/* last KVA */
1300 
1301 	/*
1302 	 * set up protection_codes: we need to be able to convert from
1303 	 * a MI protection code (some combo of VM_PROT...) to something
1304 	 * we can jam into a i386 PTE.
1305 	 */
1306 
1307 	protection_codes[VM_PROT_NONE] = pg_nx;			/* --- */
1308 	protection_codes[VM_PROT_EXECUTE] = PG_RO | PG_X;	/* --x */
1309 	protection_codes[VM_PROT_READ] = PG_RO | pg_nx;		/* -r- */
1310 	protection_codes[VM_PROT_READ|VM_PROT_EXECUTE] = PG_RO | PG_X;/* -rx */
1311 	protection_codes[VM_PROT_WRITE] = PG_RW | pg_nx;	/* w-- */
1312 	protection_codes[VM_PROT_WRITE|VM_PROT_EXECUTE] = PG_RW | PG_X;/* w-x */
1313 	protection_codes[VM_PROT_WRITE|VM_PROT_READ] = PG_RW | pg_nx;
1314 								/* wr- */
1315 	protection_codes[VM_PROT_ALL] = PG_RW | PG_X;		/* wrx */
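
	/*
	 * Illustrative sketch (not part of the build): the table is
	 * consumed when a PTE is built, as pmap_kenter_pa() does, e.g.
	 * for a writable kernel mapping:
	 *
	 *	npte = pmap_pa2pte(pa) |
	 *	    protection_codes[VM_PROT_READ | VM_PROT_WRITE] |
	 *	    PG_k | PG_V;
	 *
	 * which yields PG_RW, plus PG_NX when the CPU supports it.
	 */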
1316 
1317 	/*
1318 	 * now we init the kernel's pmap
1319 	 *
1320 	 * the kernel pmap's pm_obj is not used for much.   however, in
1321 	 * user pmaps the pm_obj contains the list of active PTPs.
1322 	 * the pm_obj currently does not have a pager.   it might be possible
1323 	 * to add a pager that would allow a process to read-only mmap its
1324 	 * own page tables (fast user level vtophys?).   this may or may not
1325 	 * be useful.
1326 	 */
1327 
1328 	kpm = pmap_kernel();
1329 	for (i = 0; i < PTP_LEVELS - 1; i++) {
1330 		UVM_OBJ_INIT(&kpm->pm_obj[i], NULL, 1);
1331 		kpm->pm_ptphint[i] = NULL;
1332 	}
1333 	memset(&kpm->pm_list, 0, sizeof(kpm->pm_list));  /* pm_list not used */
1334 
1335 	kpm->pm_pdir = (pd_entry_t *)(PDPpaddr + KERNBASE);
1336 	for (i = 0; i < PDP_SIZE; i++)
1337 		kpm->pm_pdirpa[i] = PDPpaddr + PAGE_SIZE * i;
1338 
1339 	kpm->pm_stats.wired_count = kpm->pm_stats.resident_count =
1340 		x86_btop(kva_start - VM_MIN_KERNEL_ADDRESS);
1341 
1342 	/*
1343 	 * the above is just a rough estimate and not critical to the proper
1344 	 * operation of the system.
1345 	 */
1346 
1347 #ifndef XEN
1348 	/*
1349 	 * Begin to enable global TLB entries if they are supported.
1350 	 * The G bit has no effect until the CR4_PGE bit is set in CR4,
1351 	 * which happens in cpu_init(), which is run on each cpu
1352 	 * (and happens later)
1353 	 */
1354 
1355 	if (cpu_feature[0] & CPUID_PGE) {
1356 		pmap_pg_g = PG_G;		/* enable software */
1357 
1358 		/* add PG_G attribute to already mapped kernel pages */
1359 		if (KERNBASE == VM_MIN_KERNEL_ADDRESS) {
1360 			kva_end = virtual_avail;
1361 		} else {
1362 			extern vaddr_t eblob, esym;
1363 			kva_end = (vaddr_t)&end;
1364 			if (esym > kva_end)
1365 				kva_end = esym;
1366 			if (eblob > kva_end)
1367 				kva_end = eblob;
1368 			kva_end = roundup(kva_end, PAGE_SIZE);
1369 		}
1370 		for (kva = KERNBASE; kva < kva_end; kva += PAGE_SIZE) {
1371 			p1i = pl1_i(kva);
1372 			if (pmap_valid_entry(PTE_BASE[p1i]))
1373 				PTE_BASE[p1i] |= PG_G;
1374 		}
1375 	}
1376 
1377 	/*
1378 	 * enable large pages if they are supported.
1379 	 */
1380 
1381 	if (cpu_feature[0] & CPUID_PSE) {
1382 		paddr_t pa;
1383 		pd_entry_t *pde;
1384 		extern char __data_start;
1385 
1386 		lcr4(rcr4() | CR4_PSE);	/* enable hardware (via %cr4) */
1387 		pmap_largepages = 1;	/* enable software */
1388 
1389 		/*
1390 		 * the TLB must be flushed after enabling large pages
1391 		 * on Pentium CPUs, according to section 3.6.2.2 of
1392 		 * "Intel Architecture Software Developer's Manual,
1393 		 * Volume 3: System Programming".
1394 		 */
1395 		tlbflush();
1396 
1397 		/*
1398 		 * now, remap the kernel text using large pages.  we
1399 		 * assume that the linker has properly aligned the
1400 		 * .data segment to a NBPD_L2 boundary.
1401 		 */
1402 		kva_end = rounddown((vaddr_t)&__data_start, NBPD_L1);
1403 		for (pa = 0, kva = KERNBASE; kva + NBPD_L2 <= kva_end;
1404 		     kva += NBPD_L2, pa += NBPD_L2) {
1405 			pde = &L2_BASE[pl2_i(kva)];
1406 			*pde = pa | pmap_pg_g | PG_PS |
1407 			    PG_KR | PG_V;	/* zap! */
1408 			tlbflush();
1409 		}
1410 #if defined(DEBUG)
1411 		aprint_normal("kernel text is mapped with %" PRIuPSIZE " large "
1412 		    "pages and %" PRIuPSIZE " normal pages\n",
1413 		    howmany(kva - KERNBASE, NBPD_L2),
1414 		    howmany((vaddr_t)&__data_start - kva, NBPD_L1));
1415 #endif /* defined(DEBUG) */
1416 	}
1417 #endif /* !XEN */
1418 
1419 	if (VM_MIN_KERNEL_ADDRESS != KERNBASE) {
1420 		/*
1421 		 * zero_pte is stuck at the end of mapped space for the kernel
1422 		 * image (disjunct from kva space). This is done so that it
1423 		 * can safely be used in pmap_growkernel (pmap_get_physpage),
1424 		 * when it's called for the first time.
1425 		 * XXXfvdl fix this for MULTIPROCESSOR later.
1426 		 */
1427 
1428 		early_zerop = (void *)(KERNBASE + NKL2_KIMG_ENTRIES * NBPD_L2);
1429 		early_zero_pte = PTE_BASE + pl1_i((vaddr_t)early_zerop);
1430 	}
1431 
1432 	/*
1433 	 * now we allocate the "special" VAs which are used for tmp mappings
1434 	 * by the pmap (and other modules).    we allocate the VAs by advancing
1435 	 * virtual_avail (note that there are no pages mapped at these VAs).
1436 	 * we find the PTE that maps the allocated VA via the linear PTE
1437 	 * mapping.
1438 	 */
1439 
1440 	pte = PTE_BASE + pl1_i(virtual_avail);
1441 
1442 #ifdef MULTIPROCESSOR
1443 	/*
1444 	 * Waste some VA space to avoid false sharing of cache lines
1445 	 * for page table pages: Give each possible CPU a cache line
1446 	 * of PTE's (8) to play with, though we only need 4.  We could
1447 	 * recycle some of this waste by putting the idle stacks here
1448 	 * as well; we could waste less space if we knew the largest
1449 	 * CPU ID beforehand.
1450 	 */
1451 	csrcp = (char *) virtual_avail;  csrc_pte = pte;
1452 
1453 	cdstp = (char *) virtual_avail+PAGE_SIZE;  cdst_pte = pte+1;
1454 
1455 	zerop = (char *) virtual_avail+PAGE_SIZE*2;  zero_pte = pte+2;
1456 
1457 	ptpp = (char *) virtual_avail+PAGE_SIZE*3;  ptp_pte = pte+3;
1458 
1459 	virtual_avail += PAGE_SIZE * maxcpus * NPTECL;
1460 	pte += maxcpus * NPTECL;
1461 #else
1462 	csrcp = (void *) virtual_avail;  csrc_pte = pte;	/* allocate */
1463 	virtual_avail += PAGE_SIZE; pte++;			/* advance */
1464 
1465 	cdstp = (void *) virtual_avail;  cdst_pte = pte;
1466 	virtual_avail += PAGE_SIZE; pte++;
1467 
1468 	zerop = (void *) virtual_avail;  zero_pte = pte;
1469 	virtual_avail += PAGE_SIZE; pte++;
1470 
1471 	ptpp = (void *) virtual_avail;  ptp_pte = pte;
1472 	virtual_avail += PAGE_SIZE; pte++;
1473 #endif
1474 
1475 	if (VM_MIN_KERNEL_ADDRESS == KERNBASE) {
1476 		early_zerop = zerop;
1477 		early_zero_pte = zero_pte;
1478 	}
1479 
1480 	/*
1481 	 * Nothing after this point actually needs pte;
1482 	 */
1483 	pte = (void *)0xdeadbeef;
1484 
1485 	/* XXX: vmmap used by mem.c... should be uvm_map_reserve */
1486 	/* XXXfvdl PTEs not needed here */
1487 	vmmap = (char *)virtual_avail;			/* don't need pte */
1488 	virtual_avail += PAGE_SIZE; pte++;
1489 
1490 #ifdef XEN
1491 #ifdef __x86_64__
1492 	/*
1493 	 * We want a dummy page directory for Xen:
1494 	 * when we deactivate a pmap, Xen will still consider it active.
1495 	 * So we set the user PGD to this one to lift all protection on
1496 	 * the now inactive set of page tables.
1497 	 */
1498 	xen_dummy_user_pgd = avail_start;
1499 	avail_start += PAGE_SIZE;
1500 
1501 	/* Zero fill it, the less checks in Xen it requires the better */
1502 	memset((void *) (xen_dummy_user_pgd + KERNBASE), 0, PAGE_SIZE);
1503 	/* Mark read-only */
1504 	HYPERVISOR_update_va_mapping(xen_dummy_user_pgd + KERNBASE,
1505 	    pmap_pa2pte(xen_dummy_user_pgd) | PG_u | PG_V, UVMF_INVLPG);
1506 	/* Pin as L4 */
1507 	xpq_queue_pin_table(xpmap_ptom_masked(xen_dummy_user_pgd));
1508 #endif /* __x86_64__ */
1509 	idt_vaddr = virtual_avail;                      /* don't need pte */
1510 	idt_paddr = avail_start;                        /* steal a page */
1511 	/*
1512 	 * Xen requires one more page, as we can't store the
1513 	 * GDT and LDT on the same page.
1514 	 */
1515 	virtual_avail += 3 * PAGE_SIZE;
1516 	avail_start += 3 * PAGE_SIZE;
1517 #else /* XEN */
1518 	idt_vaddr = virtual_avail;			/* don't need pte */
1519 	idt_paddr = avail_start;			/* steal a page */
1520 #if defined(__x86_64__)
1521 	virtual_avail += 2 * PAGE_SIZE; pte += 2;
1522 	avail_start += 2 * PAGE_SIZE;
1523 #else /* defined(__x86_64__) */
1524 	virtual_avail += PAGE_SIZE; pte++;
1525 	avail_start += PAGE_SIZE;
1526 	/* pentium f00f bug stuff */
1527 	pentium_idt_vaddr = virtual_avail;		/* don't need pte */
1528 	virtual_avail += PAGE_SIZE; pte++;
1529 #endif /* defined(__x86_64__) */
1530 #endif /* XEN */
1531 
1532 #ifdef _LP64
1533 	/*
1534 	 * Grab a page below 4G for things that need it (i.e.
1535 	 * having an initial %cr3 for the MP trampoline).
1536 	 */
1537 	lo32_vaddr = virtual_avail;
1538 	virtual_avail += PAGE_SIZE; pte++;
1539 	lo32_paddr = avail_start;
1540 	avail_start += PAGE_SIZE;
1541 #endif
1542 
1543 	/*
1544 	 * now we reserve some VM for mapping pages when doing a crash dump
1545 	 */
1546 
1547 	virtual_avail = reserve_dumppages(virtual_avail);
1548 
1549 	/*
1550 	 * init the static-global locks and global lists.
1551 	 *
1552 	 * => pventry::pvh_lock (initialized elsewhere) must also be
1553 	 *      a spin lock, again at IPL_VM to prevent deadlock, and
1554 	 *	again is never taken from interrupt context.
1555 	 */
1556 
1557 	mutex_init(&pmaps_lock, MUTEX_DEFAULT, IPL_NONE);
1558 	LIST_INIT(&pmaps);
1559 	pmap_cpu_init_early(curcpu());
1560 
1561 	/*
1562 	 * initialize caches.
1563 	 */
1564 
1565 	pool_cache_bootstrap(&pmap_cache, sizeof(struct pmap), 0, 0, 0,
1566 	    "pmappl", NULL, IPL_NONE, NULL, NULL, NULL);
1567 #ifdef PAE
1568 	pool_cache_bootstrap(&pmap_pdp_cache, PAGE_SIZE * PDP_SIZE, 0, 0, 0,
1569 	    "pdppl", &pmap_pdp_allocator, IPL_NONE,
1570 	    pmap_pdp_ctor, pmap_pdp_dtor, NULL);
1571 #else /* PAE */
1572 	pool_cache_bootstrap(&pmap_pdp_cache, PAGE_SIZE, 0, 0, 0,
1573 	    "pdppl", NULL, IPL_NONE, pmap_pdp_ctor, pmap_pdp_dtor, NULL);
1574 #endif /* PAE */
1575 	pool_cache_bootstrap(&pmap_pv_cache, sizeof(struct pv_entry), 0, 0,
1576 	    PR_LARGECACHE, "pvpl", &pool_allocator_meta, IPL_NONE, NULL,
1577 	    NULL, NULL);
1578 
1579 	/*
1580 	 * ensure the TLB is sync'd with reality by flushing it...
1581 	 */
1582 
1583 	tlbflush();
1584 
1585 	/*
1586 	 * calculate pmap_maxkvaddr from nkptp[].
1587 	 */
1588 
1589 	kva = VM_MIN_KERNEL_ADDRESS;
1590 	for (i = PTP_LEVELS - 1; i >= 1; i--) {
1591 		kva += nkptp[i] * nbpd[i];
1592 	}
1593 	pmap_maxkvaddr = kva;
1594 }
1595 
1596 #if defined(__x86_64__)
1597 /*
1598  * Pre-allocate PTPs for low memory, so that 1:1 mappings for various
1599  * trampoline code can be entered.
1600  */
1601 void
1602 pmap_prealloc_lowmem_ptps(void)
1603 {
1604 #ifdef XEN
1605 	int level;
1606 	paddr_t newp;
1607 	paddr_t pdes_pa;
1608 
1609 	pdes_pa = pmap_pdirpa(pmap_kernel(), 0);
1610 	level = PTP_LEVELS;
1611 	for (;;) {
1612 		newp = avail_start;
1613 		avail_start += PAGE_SIZE;
1614 		HYPERVISOR_update_va_mapping ((vaddr_t)early_zerop,
1615 		    xpmap_ptom_masked(newp) | PG_u | PG_V | PG_RW, UVMF_INVLPG);
1616 		memset((void *)early_zerop, 0, PAGE_SIZE);
1617 		/* Mark R/O before installing */
1618 		HYPERVISOR_update_va_mapping ((vaddr_t)early_zerop,
1619 		    xpmap_ptom_masked(newp) | PG_u | PG_V, UVMF_INVLPG);
1620 		if (newp < (NKL2_KIMG_ENTRIES * NBPD_L2))
1621 			HYPERVISOR_update_va_mapping (newp + KERNBASE,
1622 			    xpmap_ptom_masked(newp) | PG_u | PG_V, UVMF_INVLPG);
1623 		xpq_queue_pte_update (
1624 			xpmap_ptom_masked(pdes_pa)
1625 			+ (pl_i(0, level) * sizeof (pd_entry_t)),
1626 			xpmap_ptom_masked(newp) | PG_RW | PG_u | PG_V);
1627 		level--;
1628 		if (level <= 1)
1629 			break;
1630 		pdes_pa = newp;
1631 	}
1632 #else /* XEN */
1633 	pd_entry_t *pdes;
1634 	int level;
1635 	paddr_t newp;
1636 
1637 	pdes = pmap_kernel()->pm_pdir;
1638 	level = PTP_LEVELS;
1639 	for (;;) {
1640 		newp = avail_start;
1641 		avail_start += PAGE_SIZE;
1642 		*early_zero_pte = (newp & PG_FRAME) | PG_V | PG_RW;
1643 		pmap_update_pg((vaddr_t)early_zerop);
1644 		memset(early_zerop, 0, PAGE_SIZE);
1645 		pdes[pl_i(0, level)] = (newp & PG_FRAME) | PG_V | PG_RW;
1646 		level--;
1647 		if (level <= 1)
1648 			break;
1649 		pdes = normal_pdes[level - 2];
1650 	}
1651 #endif /* XEN */
1652 }
1653 #endif /* defined(__x86_64__) */
1654 
1655 /*
1656  * pmap_init: called from uvm_init, our job is to get the pmap
1657  * system ready to manage mappings...
1658  */
1659 
1660 void
1661 pmap_init(void)
1662 {
1663 	int i;
1664 
1665 	for (i = 0; i < PV_HASH_SIZE; i++) {
1666 		SLIST_INIT(&pv_hash_heads[i].hh_list);
1667 	}
1668 	for (i = 0; i < PV_HASH_LOCK_CNT; i++) {
1669 		mutex_init(&pv_hash_locks[i].lock, MUTEX_NODEBUG, IPL_VM);
1670 	}
1671 
1672 	/*
1673 	 * done: pmap module is up (and ready for business)
1674 	 */
1675 
1676 	pmap_initialized = true;
1677 }
1678 
1679 /*
1680  * pmap_cpu_init_early: perform early per-CPU initialization.
1681  */
1682 
1683 void
1684 pmap_cpu_init_early(struct cpu_info *ci)
1685 {
1686 	struct pmap_cpu *pc;
1687 	static uint8_t pmap_cpu_alloc;
1688 
1689 	pc = &pmap_cpu[pmap_cpu_alloc++].pc;
1690 	ci->ci_pmap_cpu = pc;
1691 }
1692 
1693 /*
1694  * pmap_cpu_init_late: perform late per-CPU initialization.
1695  */
1696 
1697 void
1698 pmap_cpu_init_late(struct cpu_info *ci)
1699 {
1700 
1701 	if (ci == &cpu_info_primary) {
1702 		evcnt_attach_dynamic(&pmap_tlb_evcnt, EVCNT_TYPE_INTR,
1703 		    NULL, "global", "TLB IPI");
1704 		evcnt_attach_dynamic(&pmap_iobmp_evcnt, EVCNT_TYPE_MISC,
1705 		    NULL, "x86", "io bitmap copy");
1706 		evcnt_attach_dynamic(&pmap_ldt_evcnt, EVCNT_TYPE_MISC,
1707 		    NULL, "x86", "ldt sync");
1708 	}
1709 
1710 	evcnt_attach_dynamic(&ci->ci_tlb_evcnt, EVCNT_TYPE_MISC,
1711 	    NULL, device_xname(ci->ci_dev), "TLB IPI");
1712 
1713 #ifdef PAE
1714 	int ret;
1715 	struct pglist pg;
1716 	struct vm_page *vmap;
1717 
1718 	/* The BP has already its own L3 page allocated in locore.S. */
1719 	if (ci == &cpu_info_primary)
1720 		return;
1721 
1722 	/*
1723 	 * Allocate a page for the per-CPU L3 PD.  cr3 being 32 bits, the PA
1724 	 * must reside below the 4GB boundary.
1725 	 */
1726 	ret = uvm_pglistalloc(PAGE_SIZE, 0, 0x100000000ULL, 32, 0, &pg, 1, 0);
1727 	vmap = TAILQ_FIRST(&pg);
1728 
1729 	if (ret != 0 || vmap == NULL)
1730 		panic("%s: failed to allocate L3 pglist for CPU %d (ret %d)\n",
1731 			__func__, cpu_index(ci), ret);
1732 
1733 	ci->ci_pae_l3_pdirpa = vmap->phys_addr;
1734 
1735 	ci->ci_pae_l3_pdir = (paddr_t *)uvm_km_alloc(kernel_map, PAGE_SIZE, 0,
1736 		UVM_KMF_VAONLY | UVM_KMF_NOWAIT);
1737 	if (ci->ci_pae_l3_pdir == NULL)
1738 		panic("%s: failed to allocate L3 PD for CPU %d\n",
1739 			__func__, cpu_index(ci));
1740 
1741 	pmap_kenter_pa((vaddr_t)ci->ci_pae_l3_pdir, ci->ci_pae_l3_pdirpa,
1742 		VM_PROT_READ | VM_PROT_WRITE, 0);
1743 
1744 	pmap_update(pmap_kernel());
1745 #endif
1746 }
1747 
1748 /*
1749  * p v _ e n t r y   f u n c t i o n s
1750  */
1751 
1752 /*
1753  * pmap_free_pvs: free a list of pv_entrys
1754  */
1755 
1756 static void
1757 pmap_free_pvs(struct pv_entry *pve)
1758 {
1759 	struct pv_entry *next;
1760 
1761 	for ( /* null */ ; pve != NULL ; pve = next) {
1762 		next = pve->pve_next;
1763 		pool_cache_put(&pmap_pv_cache, pve);
1764 	}
1765 }
1766 
1767 /*
1768  * main pv_entry manipulation functions:
1769  *   pmap_enter_pv: enter a mapping onto a pv_head list
1770  *   pmap_remove_pv: remove a mapping from a pv_head list
1771  *
1772  * NOTE: Both pmap_enter_pv and pmap_remove_pv expect the caller to lock
1773  *       the pvh before calling
1774  */
1775 
1776 /*
1777  * insert_pv: a helper of pmap_enter_pv
1778  */
1779 
1780 static void
1781 insert_pv(struct pmap_page *pp, struct pv_entry *pve)
1782 {
1783 	struct pv_hash_head *hh;
1784 	kmutex_t *lock;
1785 	u_int hash;
1786 
1787 	KASSERT(pp_locked(pp));
1788 
1789 	hash = pvhash_hash(pve->pve_pte.pte_ptp, pve->pve_pte.pte_va);
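	/*
	 * Link the pve both into the global pv hash (keyed on ptp/va, which
	 * is what pmap_remove_pv later looks up) and onto the page's own
	 * pv list.
	 */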
1790 	lock = pvhash_lock(hash);
1791 	hh = pvhash_head(hash);
1792 	mutex_spin_enter(lock);
1793 	SLIST_INSERT_HEAD(&hh->hh_list, pve, pve_hash);
1794 	mutex_spin_exit(lock);
1795 
1796 	LIST_INSERT_HEAD(&pp->pp_head.pvh_list, pve, pve_list);
1797 }
1798 
1799 /*
1800  * pmap_enter_pv: enter a mapping onto a pv_head list
1801  *
1802  * => caller should have the pp_lock locked
1803  * => caller should adjust ptp's wire_count before calling
1804  */
1805 
1806 static struct pv_entry *
1807 pmap_enter_pv(struct pmap_page *pp,
1808 	      struct pv_entry *pve,	/* preallocated pve for us to use */
1809 	      struct pv_entry **sparepve,
1810 	      struct vm_page *ptp,
1811 	      vaddr_t va)
1812 {
1813 
1814 	KASSERT(ptp == NULL || ptp->wire_count >= 2);
1815 	KASSERT(ptp == NULL || ptp->uobject != NULL);
1816 	KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset);
1817 	KASSERT(pp_locked(pp));
1818 
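	/*
	 * If the page has no pv entries yet, record the mapping directly in
	 * the pmap_page (PP_EMBEDDED) and hand the preallocated pve back to
	 * the caller.  If an embedded mapping already exists, migrate it
	 * onto the spare pve first, then insert the new mapping as a
	 * regular pve.
	 */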
1819 	if ((pp->pp_flags & PP_EMBEDDED) == 0) {
1820 		if (LIST_EMPTY(&pp->pp_head.pvh_list)) {
1821 			pp->pp_flags |= PP_EMBEDDED;
1822 			pp->pp_pte.pte_ptp = ptp;
1823 			pp->pp_pte.pte_va = va;
1824 
1825 			return pve;
1826 		}
1827 	} else {
1828 		struct pv_entry *pve2;
1829 
1830 		pve2 = *sparepve;
1831 		*sparepve = NULL;
1832 
1833 		pve2->pve_pte = pp->pp_pte;
1834 		pp->pp_flags &= ~PP_EMBEDDED;
1835 		LIST_INIT(&pp->pp_head.pvh_list);
1836 		insert_pv(pp, pve2);
1837 	}
1838 
1839 	pve->pve_pte.pte_ptp = ptp;
1840 	pve->pve_pte.pte_va = va;
1841 	insert_pv(pp, pve);
1842 
1843 	return NULL;
1844 }
1845 
1846 /*
1847  * pmap_remove_pv: try to remove a mapping from a pv_list
1848  *
1849  * => caller should hold pp_lock [so that attrs can be adjusted]
1850  * => caller should adjust ptp's wire_count and free PTP if needed
1851  * => we return the removed pve
1852  */
1853 
1854 static struct pv_entry *
1855 pmap_remove_pv(struct pmap_page *pp, struct vm_page *ptp, vaddr_t va)
1856 {
1857 	struct pv_hash_head *hh;
1858 	struct pv_entry *pve;
1859 	kmutex_t *lock;
1860 	u_int hash;
1861 
1862 	KASSERT(ptp == NULL || ptp->uobject != NULL);
1863 	KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset);
1864 	KASSERT(pp_locked(pp));
1865 
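	/*
	 * If the mapping being removed is the embedded one, just clear
	 * PP_EMBEDDED and return NULL (there is no pve for the caller to
	 * free).  Otherwise unlink the pve from the pv hash and from the
	 * page's list and return it.
	 */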
1866 	if ((pp->pp_flags & PP_EMBEDDED) != 0) {
1867 		KASSERT(pp->pp_pte.pte_ptp == ptp);
1868 		KASSERT(pp->pp_pte.pte_va == va);
1869 
1870 		pp->pp_flags &= ~PP_EMBEDDED;
1871 		LIST_INIT(&pp->pp_head.pvh_list);
1872 
1873 		return NULL;
1874 	}
1875 
1876 	hash = pvhash_hash(ptp, va);
1877 	lock = pvhash_lock(hash);
1878 	hh = pvhash_head(hash);
1879 	mutex_spin_enter(lock);
1880 	pve = pvhash_remove(hh, ptp, va);
1881 	mutex_spin_exit(lock);
1882 
1883 	LIST_REMOVE(pve, pve_list);
1884 
1885 	return pve;
1886 }
1887 
1888 /*
1889  * p t p   f u n c t i o n s
1890  */
1891 
1892 static inline struct vm_page *
1893 pmap_find_ptp(struct pmap *pmap, vaddr_t va, paddr_t pa, int level)
1894 {
1895 	int lidx = level - 1;
1896 	struct vm_page *pg;
1897 
1898 	KASSERT(mutex_owned(&pmap->pm_lock));
1899 
1900 	if (pa != (paddr_t)-1 && pmap->pm_ptphint[lidx] &&
1901 	    pa == VM_PAGE_TO_PHYS(pmap->pm_ptphint[lidx])) {
1902 		return (pmap->pm_ptphint[lidx]);
1903 	}
1904 	PMAP_SUBOBJ_LOCK(pmap, lidx);
1905 	pg = uvm_pagelookup(&pmap->pm_obj[lidx], ptp_va2o(va, level));
1906 	PMAP_SUBOBJ_UNLOCK(pmap, lidx);
1907 
1908 	KASSERT(pg == NULL || pg->wire_count >= 1);
1909 	return pg;
1910 }
1911 
1912 static inline void
1913 pmap_freepage(struct pmap *pmap, struct vm_page *ptp, int level)
1914 {
1915 	int lidx;
1916 	struct uvm_object *obj;
1917 
1918 	KASSERT(ptp->wire_count == 1);
1919 
1920 	lidx = level - 1;
1921 
1922 	obj = &pmap->pm_obj[lidx];
1923 	pmap_stats_update(pmap, -1, 0);
1924 	if (lidx != 0)
1925 		mutex_enter(&obj->vmobjlock);
1926 	if (pmap->pm_ptphint[lidx] == ptp)
1927 		pmap->pm_ptphint[lidx] = TAILQ_FIRST(&obj->memq);
1928 	ptp->wire_count = 0;
1929 	uvm_pagerealloc(ptp, NULL, 0);
1930 	VM_PAGE_TO_PP(ptp)->pp_link = curlwp->l_md.md_gc_ptp;
1931 	curlwp->l_md.md_gc_ptp = ptp;
1932 	if (lidx != 0)
1933 		mutex_exit(&obj->vmobjlock);
1934 }
1935 
1936 static void
1937 pmap_free_ptp(struct pmap *pmap, struct vm_page *ptp, vaddr_t va,
1938 	      pt_entry_t *ptes, pd_entry_t * const *pdes)
1939 {
1940 	unsigned long index;
1941 	int level;
1942 	vaddr_t invaladdr;
1943 #ifdef MULTIPROCESSOR
1944 	vaddr_t invaladdr2;
1945 #endif
1946 	pd_entry_t opde;
1947 	struct pmap *curpmap = vm_map_pmap(&curlwp->l_proc->p_vmspace->vm_map);
1948 
1949 	KASSERT(pmap != pmap_kernel());
1950 	KASSERT(mutex_owned(&pmap->pm_lock));
1951 	KASSERT(kpreempt_disabled());
1952 
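	/*
	 * Starting at the lowest level, clear the PDE pointing to the PTP
	 * being freed, shoot down its mapping, and free the page.  Then
	 * move up a level and repeat, stopping as soon as a parent PTP
	 * still has other entries (wire_count > 1).
	 */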
1953 	level = 1;
1954 	do {
1955 		index = pl_i(va, level + 1);
1956 		opde = pmap_pte_testset(&pdes[level - 1][index], 0);
1957 #if defined(XEN) && defined(__x86_64__)
1958 		/*
1959 		 * If ptp is a L3 currently mapped in kernel space,
1960 		 * If ptp is an L3 currently mapped in kernel space,
1961 		 */
1962 		if (pmap_pdirpa(pmap, 0) == curcpu()->ci_xen_current_user_pgd
1963 		    && level == PTP_LEVELS - 1)
1964 			pmap_pte_set(&pmap_kernel()->pm_pdir[index], 0);
1965 #endif /* XEN && __x86_64__ */
1966 		pmap_freepage(pmap, ptp, level);
1967 		invaladdr = level == 1 ? (vaddr_t)ptes :
1968 		    (vaddr_t)pdes[level - 2];
1969 		pmap_tlb_shootdown(curpmap, invaladdr + index * PAGE_SIZE,
1970 		    0, opde);
1971 #if defined(MULTIPROCESSOR)
1972 		invaladdr2 = level == 1 ? (vaddr_t)PTE_BASE :
1973 		    (vaddr_t)normal_pdes[level - 2];
1974 		if (pmap != curpmap || invaladdr != invaladdr2) {
1975 			pmap_tlb_shootdown(pmap, invaladdr2 + index * PAGE_SIZE,
1976 			    0, opde);
1977 		}
1978 #endif
1979 		if (level < PTP_LEVELS - 1) {
1980 			ptp = pmap_find_ptp(pmap, va, (paddr_t)-1, level + 1);
1981 			ptp->wire_count--;
1982 			if (ptp->wire_count > 1)
1983 				break;
1984 		}
1985 	} while (++level < PTP_LEVELS);
1986 	pmap_pte_flush();
1987 }
1988 
1989 /*
1990  * pmap_get_ptp: get a PTP (if there isn't one, allocate a new one)
1991  *
1992  * => pmap should NOT be pmap_kernel()
1993  * => pmap should be locked
1994  * => preemption should be disabled
1995  */
1996 
1997 static struct vm_page *
1998 pmap_get_ptp(struct pmap *pmap, vaddr_t va, pd_entry_t * const *pdes)
1999 {
2000 	struct vm_page *ptp, *pptp;
2001 	int i;
2002 	unsigned long index;
2003 	pd_entry_t *pva;
2004 	paddr_t ppa, pa;
2005 	struct uvm_object *obj;
2006 
2007 	KASSERT(pmap != pmap_kernel());
2008 	KASSERT(mutex_owned(&pmap->pm_lock));
2009 	KASSERT(kpreempt_disabled());
2010 
2011 	ptp = NULL;
2012 	pa = (paddr_t)-1;
2013 
2014 	/*
2015 	 * Loop through all page table levels seeing if we need to
2016 	 * add a new page to that level.
2017 	 */
2018 	for (i = PTP_LEVELS; i > 1; i--) {
2019 		/*
2020 		 * Save values from previous round.
2021 		 */
2022 		pptp = ptp;
2023 		ppa = pa;
2024 
2025 		index = pl_i(va, i);
2026 		pva = pdes[i - 2];
2027 
2028 		if (pmap_valid_entry(pva[index])) {
2029 			ppa = pmap_pte2pa(pva[index]);
2030 			ptp = NULL;
2031 			continue;
2032 		}
2033 
2034 		obj = &pmap->pm_obj[i-2];
2035 		PMAP_SUBOBJ_LOCK(pmap, i - 2);
2036 		ptp = uvm_pagealloc(obj, ptp_va2o(va, i - 1), NULL,
2037 		    UVM_PGA_USERESERVE|UVM_PGA_ZERO);
2038 		PMAP_SUBOBJ_UNLOCK(pmap, i - 2);
2039 
2040 		if (ptp == NULL)
2041 			return NULL;
2042 
2043 		ptp->flags &= ~PG_BUSY; /* never busy */
2044 		ptp->wire_count = 1;
2045 		pmap->pm_ptphint[i - 2] = ptp;
2046 		pa = VM_PAGE_TO_PHYS(ptp);
2047 		pmap_pte_set(&pva[index], (pd_entry_t)
2048 		        (pmap_pa2pte(pa) | PG_u | PG_RW | PG_V));
2049 #if defined(XEN) && defined(__x86_64__)
2050 		/*
2051 		 * Under Xen we must enter the mapping into the kernel map too,
2052 		 * if pmap is curmap and we are modifying the top level (PGD).
2053 		 */
2054 		if (i == PTP_LEVELS && pmap != pmap_kernel()) {
2055 		        pmap_pte_set(&pmap_kernel()->pm_pdir[index],
2056 		                (pd_entry_t) (pmap_pa2pte(pa)
2057 		                        | PG_u | PG_RW | PG_V));
2058 		}
2059 #endif /* XEN && __x86_64__ */
2060 		pmap_pte_flush();
2061 		pmap_stats_update(pmap, 1, 0);
2062 		/*
2063 		 * If we're not in the top level, increase the
2064 		 * wire count of the parent page.
2065 		 */
2066 		if (i < PTP_LEVELS) {
2067 			if (pptp == NULL)
2068 				pptp = pmap_find_ptp(pmap, va, ppa, i);
2069 #ifdef DIAGNOSTIC
2070 			if (pptp == NULL)
2071 				panic("pde page disappeared");
2072 #endif
2073 			pptp->wire_count++;
2074 		}
2075 	}
2076 
2077 	/*
2078 	 * ptp is not NULL if we just allocated a new ptp. If it's
2079 	 * still NULL, we must look up the existing one.
2080 	 */
2081 	if (ptp == NULL) {
2082 		ptp = pmap_find_ptp(pmap, va, ppa, 1);
2083 #ifdef DIAGNOSTIC
2084 		if (ptp == NULL) {
2085 			printf("va %" PRIxVADDR " ppa %" PRIxPADDR "\n",
2086 			    va, ppa);
2087 			panic("pmap_get_ptp: unmanaged user PTP");
2088 		}
2089 #endif
2090 	}
2091 
2092 	pmap->pm_ptphint[0] = ptp;
2093 	return(ptp);
2094 }
2095 
2096 /*
2097  * p m a p  l i f e c y c l e   f u n c t i o n s
2098  */
2099 
2100 /*
2101  * pmap_pdp_ctor: constructor for the PDP cache.
2102  */
2103 
2104 int
2105 pmap_pdp_ctor(void *arg, void *v, int flags)
2106 {
2107 	pd_entry_t *pdir = v;
2108 	paddr_t pdirpa = 0;	/* XXX: GCC */
2109 	vaddr_t object;
2110 	int i;
2111 
2112 #if !defined(XEN) || !defined(__x86_64__)
2113 	int npde;
2114 #endif
2115 #ifdef XEN
2116 	int s;
2117 #endif
2118 
2119 	/*
2120 	 * NOTE: The `pmap_lock' is held when the PDP is allocated.
2121 	 */
2122 
2123 #if defined(XEN) && defined(__x86_64__)
2124 	/* fetch the physical address of the page directory. */
2125 	(void) pmap_extract(pmap_kernel(), (vaddr_t) pdir, &pdirpa);
2126 
2127 	/* zero init area */
2128 	memset (pdir, 0, PAGE_SIZE); /* Xen wants a clean page */
2129 	/*
2130 	 * this pdir will NEVER be active in kernel mode
2131 	 * so mark recursive entry invalid
2132 	 */
2133 	pdir[PDIR_SLOT_PTE] = pmap_pa2pte(pdirpa) | PG_u;
2134 	/*
2135 	 * A PDP constructed this way will never be used for the kernel,
2136 	 * hence we don't put kernel mappings in it on Xen.
2137 	 * But we need to make pmap_create() happy, so put a dummy (non-PG_V)
2138 	 * value in the right slot.
2139 	 */
2140 	pdir[PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1] - 1] =
2141 	     (pd_entry_t)-1 & PG_FRAME;
2142 #else /* XEN && __x86_64__*/
2143 	/* zero init area */
2144 	memset(pdir, 0, PDIR_SLOT_PTE * sizeof(pd_entry_t));
2145 
2146 	object = (vaddr_t)v;
2147 	for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) {
2148 		/* fetch the physical address of the page directory. */
2149 		(void) pmap_extract(pmap_kernel(), object, &pdirpa);
2150 		/* put in recursive PDE to map the PTEs */
2151 		pdir[PDIR_SLOT_PTE + i] = pmap_pa2pte(pdirpa) | PG_V;
2152 #ifndef XEN
2153 		pdir[PDIR_SLOT_PTE + i] |= PG_KW;
2154 #endif
2155 	}
2156 
2157 	/* copy kernel's PDE */
2158 	npde = nkptp[PTP_LEVELS - 1];
2159 
2160 	memcpy(&pdir[PDIR_SLOT_KERN], &PDP_BASE[PDIR_SLOT_KERN],
2161 	    npde * sizeof(pd_entry_t));
2162 
2163 	/* zero the rest */
2164 	memset(&pdir[PDIR_SLOT_KERN + npde], 0,
2165 	    (NTOPLEVEL_PDES - (PDIR_SLOT_KERN + npde)) * sizeof(pd_entry_t));
2166 
2167 	if (VM_MIN_KERNEL_ADDRESS != KERNBASE) {
2168 		int idx = pl_i(KERNBASE, PTP_LEVELS);
2169 
2170 		pdir[idx] = PDP_BASE[idx];
2171 	}
2172 #endif /* XEN  && __x86_64__*/
2173 #ifdef XEN
2174 	s = splvm();
2175 	object = (vaddr_t)v;
2176 	for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) {
2177 		(void) pmap_extract(pmap_kernel(), object, &pdirpa);
2178 		/* remap this page RO */
2179 		pmap_kenter_pa(object, pdirpa, VM_PROT_READ, 0);
2180 		pmap_update(pmap_kernel());
2181 		/*
2182 		 * pin as an L2/L4 page; we have to do the page with the
2183 		 * PDIR_SLOT_PTE entries last
2184 		 */
2185 #ifdef PAE
2186 		if (i == l2tol3(PDIR_SLOT_PTE))
2187 			continue;
2188 #endif
2189 		xpq_queue_pin_table(xpmap_ptom_masked(pdirpa));
2190 	}
2191 #ifdef PAE
2192 	object = ((vaddr_t)pdir) + PAGE_SIZE  * l2tol3(PDIR_SLOT_PTE);
2193 	(void)pmap_extract(pmap_kernel(), object, &pdirpa);
2194 	xpq_queue_pin_table(xpmap_ptom_masked(pdirpa));
2195 #endif
2196 	splx(s);
2197 #endif /* XEN */
2198 
2199 	return (0);
2200 }
2201 
2202 /*
2203  * pmap_pdp_dtor: destructor for the PDP cache.
2204  */
2205 
2206 void
2207 pmap_pdp_dtor(void *arg, void *v)
2208 {
2209 #ifdef XEN
2210 	paddr_t pdirpa = 0;	/* XXX: GCC */
2211 	vaddr_t object = (vaddr_t)v;
2212 	int i;
2213 	int s = splvm();
2214 	pt_entry_t *pte;
2215 
2216 	for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) {
2217 		/* fetch the physical address of the page directory. */
2218 		(void) pmap_extract(pmap_kernel(), object, &pdirpa);
2219 		/* unpin page table */
2220 		xpq_queue_unpin_table(xpmap_ptom_masked(pdirpa));
2221 	}
2222 	object = (vaddr_t)v;
2223 	for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) {
2224 		/* Set page RW again */
2225 		pte = kvtopte(object);
2226 		xpq_queue_pte_update(xpmap_ptetomach(pte), *pte | PG_RW);
2227 		xpq_queue_invlpg((vaddr_t)object);
2228 	}
2229 	splx(s);
2230 #endif  /* XEN */
2231 }
2232 
2233 #ifdef PAE
2234 
2235 /* pmap_pdp_alloc: Allocate a page for the pdp memory pool. */
2236 
2237 void *
2238 pmap_pdp_alloc(struct pool *pp, int flags)
2239 {
2240 	return (void *)uvm_km_alloc(kernel_map,
2241 	    PAGE_SIZE * PDP_SIZE, PAGE_SIZE * PDP_SIZE,
2242 	    ((flags & PR_WAITOK) ? 0 : UVM_KMF_NOWAIT | UVM_KMF_TRYLOCK)
2243 	    | UVM_KMF_WIRED);
2244 }
2245 
2246 /*
2247  * pmap_pdp_free: free a PDP
2248  */
2249 
2250 void
2251 pmap_pdp_free(struct pool *pp, void *v)
2252 {
2253 	uvm_km_free(kernel_map, (vaddr_t)v, PAGE_SIZE * PDP_SIZE,
2254 	    UVM_KMF_WIRED);
2255 }
2256 #endif /* PAE */
2257 
2258 /*
2259  * pmap_create: create a pmap
2260  *
2261  * => note: old pmap interface took a "size" args which allowed for
2262  * => note: old pmap interface took a "size" arg which allowed for
2263  */
2264 
2265 struct pmap *
2266 pmap_create(void)
2267 {
2268 	struct pmap *pmap;
2269 	int i;
2270 
2271 	pmap = pool_cache_get(&pmap_cache, PR_WAITOK);
2272 
2273 	/* init uvm_object */
2274 	for (i = 0; i < PTP_LEVELS - 1; i++) {
2275 		UVM_OBJ_INIT(&pmap->pm_obj[i], NULL, 1);
2276 		pmap->pm_ptphint[i] = NULL;
2277 	}
2278 	pmap->pm_stats.wired_count = 0;
2279 	/* count the PDP allocd below */
2280 	pmap->pm_stats.resident_count = PDP_SIZE;
2281 #if !defined(__x86_64__)
2282 	pmap->pm_hiexec = 0;
2283 #endif /* !defined(__x86_64__) */
2284 	pmap->pm_flags = 0;
2285 	pmap->pm_cpus = 0;
2286 	pmap->pm_kernel_cpus = 0;
2287 
2288 	/* init the LDT */
2289 	pmap->pm_ldt = NULL;
2290 	pmap->pm_ldt_len = 0;
2291 	pmap->pm_ldt_sel = GSYSSEL(GLDT_SEL, SEL_KPL);
2292 
2293 	/* allocate PDP */
2294  try_again:
2295 	pmap->pm_pdir = pool_cache_get(&pmap_pdp_cache, PR_WAITOK);
2296 
2297 	mutex_enter(&pmaps_lock);
2298 
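	/*
	 * The pool cache may hand back a PDP constructed before the kernel
	 * page tables last grew; in that case the last kernel PDE slot is
	 * still zero, so destruct it and try again.
	 */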
2299 	if (pmap->pm_pdir[PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1] - 1] == 0) {
2300 		mutex_exit(&pmaps_lock);
2301 		pool_cache_destruct_object(&pmap_pdp_cache, pmap->pm_pdir);
2302 		goto try_again;
2303 	}
2304 
2305 	for (i = 0; i < PDP_SIZE; i++)
2306 		pmap->pm_pdirpa[i] =
2307 		    pmap_pte2pa(pmap->pm_pdir[PDIR_SLOT_PTE + i]);
2308 
2309 	LIST_INSERT_HEAD(&pmaps, pmap, pm_list);
2310 
2311 	mutex_exit(&pmaps_lock);
2312 
2313 	return (pmap);
2314 }
2315 
2316 /*
2317  * pmap_destroy: drop reference count on pmap.   free pmap if
2318  *	reference count goes to zero.
2319  */
2320 
2321 void
2322 pmap_destroy(struct pmap *pmap)
2323 {
2324 	int i;
2325 #ifdef DIAGNOSTIC
2326 	struct cpu_info *ci;
2327 	CPU_INFO_ITERATOR cii;
2328 #endif /* DIAGNOSTIC */
2329 
2330 	/*
2331 	 * if we have torn down this pmap, process deferred frees and
2332 	 * invalidations now.
2333 	 */
2334 	if (__predict_false(curlwp->l_md.md_gc_pmap == pmap)) {
2335 		pmap_update(pmap);
2336 	}
2337 
2338 	/*
2339 	 * drop reference count
2340 	 */
2341 
2342 	if (atomic_dec_uint_nv(&pmap->pm_obj[0].uo_refs) > 0) {
2343 		return;
2344 	}
2345 
2346 #ifdef DIAGNOSTIC
2347 	for (CPU_INFO_FOREACH(cii, ci))
2348 		if (ci->ci_pmap == pmap)
2349 			panic("destroying pmap being used");
2350 #endif /* DIAGNOSTIC */
2351 
2352 	/*
2353 	 * reference count is zero, free pmap resources and then free pmap.
2354 	 */
2355 #ifdef XEN
2356 	/*
2357 	 * Xen lazy APDP handling:
2358 	 * clear APDP_PDE if this pmap is the one currently mapped there
2359 	 */
2360 	if (xpmap_ptom_masked(pmap_pdirpa(pmap, 0)) == (*APDP_PDE & PG_FRAME)) {
2361 		kpreempt_disable();
2362 		pmap_unmap_apdp();
2363 		pmap_pte_flush();
2364 	        pmap_apte_flush(pmap_kernel());
2365 	        kpreempt_enable();
2366 	}
2367 #endif
2368 
2369 	/*
2370 	 * remove it from global list of pmaps
2371 	 */
2372 
2373 	mutex_enter(&pmaps_lock);
2374 	LIST_REMOVE(pmap, pm_list);
2375 	mutex_exit(&pmaps_lock);
2376 
2377 	/*
2378 	 * destroyed pmap shouldn't have remaining PTPs
2379 	 */
2380 
2381 	for (i = 0; i < PTP_LEVELS - 1; i++) {
2382 		KASSERT(pmap->pm_obj[i].uo_npages == 0);
2383 		KASSERT(TAILQ_EMPTY(&pmap->pm_obj[i].memq));
2384 	}
2385 
2386 	/*
2387 	 * MULTIPROCESSOR -- no need to flush out of other processors'
2388 	 * APTE space because we do that in pmap_unmap_ptes().
2389 	 */
2390 	pool_cache_put(&pmap_pdp_cache, pmap->pm_pdir);
2391 
2392 #ifdef USER_LDT
2393 	if (pmap->pm_ldt != NULL) {
2394 		/*
2395 		 * no need to switch the LDT; this address space is gone,
2396 		 * nothing is using it.
2397 		 *
2398 		 * No need to lock the pmap for ldt_free (or anything else),
2399 		 * we're the last one to use it.
2400 		 */
2401 		mutex_enter(&cpu_lock);
2402 		ldt_free(pmap->pm_ldt_sel);
2403 		mutex_exit(&cpu_lock);
2404 		uvm_km_free(kernel_map, (vaddr_t)pmap->pm_ldt,
2405 		    pmap->pm_ldt_len, UVM_KMF_WIRED);
2406 	}
2407 #endif
2408 
2409 	for (i = 0; i < PTP_LEVELS - 1; i++)
2410 		mutex_destroy(&pmap->pm_obj[i].vmobjlock);
2411 	pool_cache_put(&pmap_cache, pmap);
2412 }
2413 
2414 /*
2415  * pmap_remove_all: pmap is being torn down by the current thread.
2416  * avoid unnecessary invalidations.
2417  */
2418 
2419 void
2420 pmap_remove_all(struct pmap *pmap)
2421 {
2422 	lwp_t *l = curlwp;
2423 
2424 	KASSERT(l->l_md.md_gc_pmap == NULL);
2425 
2426 	l->l_md.md_gc_pmap = pmap;
2427 }
2428 
2429 #if defined(PMAP_FORK)
2430 /*
2431  * pmap_fork: perform any necessary data structure manipulation when
2432  * a VM space is forked.
2433  */
2434 
2435 void
2436 pmap_fork(struct pmap *pmap1, struct pmap *pmap2)
2437 {
2438 #ifdef USER_LDT
2439 	union descriptor *new_ldt;
2440 	size_t len;
2441 	int sel;
2442 
2443 	if (__predict_true(pmap1->pm_ldt == NULL)) {
2444 		return;
2445 	}
2446 
2447  retry:
2448 	if (pmap1->pm_ldt != NULL) {
2449 		len = pmap1->pm_ldt_len;
2450 		new_ldt = (union descriptor *)uvm_km_alloc(kernel_map, len, 0,
2451 		    UVM_KMF_WIRED);
2452 		mutex_enter(&cpu_lock);
2453 		sel = ldt_alloc(new_ldt, len);
2454 		if (sel == -1) {
2455 			mutex_exit(&cpu_lock);
2456 			uvm_km_free(kernel_map, (vaddr_t)new_ldt, len,
2457 			    UVM_KMF_WIRED);
2458 			printf("WARNING: pmap_fork: unable to allocate LDT\n");
2459 			return;
2460 		}
2461 	} else {
2462 		len = -1;
2463 		new_ldt = NULL;
2464 		sel = -1;
2465 		mutex_enter(&cpu_lock);
2466 	}
2467 
2468  	/* Copy the LDT, if necessary. */
2469  	if (pmap1->pm_ldt != NULL) {
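		/*
		 * If the source LDT was resized while we were blocked in
		 * the allocator, the buffer we allocated has the wrong
		 * size; free it and retry.
		 */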
2470 		if (len != pmap1->pm_ldt_len) {
2471 			if (len != -1) {
2472 				ldt_free(sel);
2473 				uvm_km_free(kernel_map, (vaddr_t)new_ldt,
2474 				    len, UVM_KMF_WIRED);
2475 			}
2476 			mutex_exit(&cpu_lock);
2477 			goto retry;
2478 		}
2479 
2480 		memcpy(new_ldt, pmap1->pm_ldt, len);
2481 		pmap2->pm_ldt = new_ldt;
2482 		pmap2->pm_ldt_len = pmap1->pm_ldt_len;
2483 		pmap2->pm_ldt_sel = sel;
2484 		len = -1;
2485 	}
2486 
2487 	if (len != -1) {
2488 		ldt_free(sel);
2489 		uvm_km_free(kernel_map, (vaddr_t)new_ldt, len,
2490 		    UVM_KMF_WIRED);
2491 	}
2492 	mutex_exit(&cpu_lock);
2493 #endif /* USER_LDT */
2494 }
2495 #endif /* PMAP_FORK */
2496 
2497 #ifdef USER_LDT
2498 
2499 /*
2500  * pmap_ldt_xcall: cross call used by pmap_ldt_sync.  if the named pmap
2501  * is active, reload LDTR.
2502  */
2503 static void
2504 pmap_ldt_xcall(void *arg1, void *arg2)
2505 {
2506 	struct pmap *pm;
2507 
2508 	kpreempt_disable();
2509 	pm = arg1;
2510 	if (curcpu()->ci_pmap == pm) {
2511 		lldt(pm->pm_ldt_sel);
2512 	}
2513 	kpreempt_enable();
2514 }
2515 
2516 /*
2517  * pmap_ldt_sync: LDT selector for the named pmap is changing.  swap
2518  * in the new selector on all CPUs.
2519  */
2520 void
2521 pmap_ldt_sync(struct pmap *pm)
2522 {
2523 	uint64_t where;
2524 
2525 	KASSERT(mutex_owned(&cpu_lock));
2526 
2527 	pmap_ldt_evcnt.ev_count++;
2528 	where = xc_broadcast(0, pmap_ldt_xcall, pm, NULL);
2529 	xc_wait(where);
2530 }
2531 
2532 /*
2533  * pmap_ldt_cleanup: if the pmap has a local LDT, deallocate it, and
2534  * restore the default.
2535  */
2536 
2537 void
2538 pmap_ldt_cleanup(struct lwp *l)
2539 {
2540 	pmap_t pmap = l->l_proc->p_vmspace->vm_map.pmap;
2541 	union descriptor *dp = NULL;
2542 	size_t len = 0;
2543 	int sel = -1;
2544 
2545 	if (__predict_true(pmap->pm_ldt == NULL)) {
2546 		return;
2547 	}
2548 
2549 	mutex_enter(&cpu_lock);
2550 	if (pmap->pm_ldt != NULL) {
2551 		sel = pmap->pm_ldt_sel;
2552 		dp = pmap->pm_ldt;
2553 		len = pmap->pm_ldt_len;
2554 		pmap->pm_ldt_sel = GSYSSEL(GLDT_SEL, SEL_KPL);
2555 		pmap->pm_ldt = NULL;
2556 		pmap->pm_ldt_len = 0;
2557 		pmap_ldt_sync(pmap);
2558 		ldt_free(sel);
2559 		uvm_km_free(kernel_map, (vaddr_t)dp, len, UVM_KMF_WIRED);
2560 	}
2561 	mutex_exit(&cpu_lock);
2562 }
2563 #endif /* USER_LDT */
2564 
2565 /*
2566  * pmap_activate: activate a process' pmap
2567  *
2568  * => must be called with kernel preemption disabled
2569  * => if lwp is the curlwp, then set ci_want_pmapload so that
2570  *    actual MMU context switch will be done by pmap_load() later
2571  */
2572 
2573 void
2574 pmap_activate(struct lwp *l)
2575 {
2576 	struct cpu_info *ci;
2577 	struct pmap *pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map);
2578 
2579 	KASSERT(kpreempt_disabled());
2580 
2581 	ci = curcpu();
2582 
2583 	if (l == ci->ci_curlwp) {
2584 		KASSERT(ci->ci_want_pmapload == 0);
2585 		KASSERT(ci->ci_tlbstate != TLBSTATE_VALID);
2586 #ifdef KSTACK_CHECK_DR0
2587 		/*
2588 		 * setup breakpoint on the top of stack
2589 		 */
2590 		if (l == &lwp0)
2591 			dr0(0, 0, 0, 0);
2592 		else
2593 			dr0(KSTACK_LOWEST_ADDR(l), 1, 3, 1);
2594 #endif
2595 
2596 		/*
2597 		 * no need to switch to kernel vmspace because
2598 		 * it's a subset of any vmspace.
2599 		 */
2600 
2601 		if (pmap == pmap_kernel()) {
2602 			ci->ci_want_pmapload = 0;
2603 			return;
2604 		}
2605 
2606 		ci->ci_want_pmapload = 1;
2607 	}
2608 }
2609 
2610 /*
2611  * pmap_reactivate: try to regain reference to the pmap.
2612  *
2613  * => must be called with kernel preemption disabled
2614  */
2615 
2616 static bool
2617 pmap_reactivate(struct pmap *pmap)
2618 {
2619 	struct cpu_info *ci;
2620 	uint32_t cpumask;
2621 	bool result;
2622 	uint32_t oldcpus;
2623 
2624 	ci = curcpu();
2625 	cpumask = ci->ci_cpumask;
2626 
2627 	KASSERT(kpreempt_disabled());
2628 #if defined(XEN) && defined(__x86_64__)
2629 	KASSERT(pmap_pdirpa(pmap, 0) == ci->ci_xen_current_user_pgd);
2630 #elif defined(PAE)
2631 	KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(ci->ci_pae_l3_pdir[0]));
2632 #elif !defined(XEN)
2633 	KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(rcr3()));
2634 #endif
2635 
2636 	/*
2637 	 * if we still have a lazy reference to this pmap,
2638 	 * we can assume that there was no tlb shootdown
2639 	 * for this pmap in the meantime.
2640 	 *
2641 	 * the order of events here is important as we must
2642 	 * synchronize with TLB shootdown interrupts.  declare
2643 	 * interest in invalidations (TLBSTATE_VALID) and then
2644 	 * check the cpumask, which the IPIs can change only
2645 	 * when the state is TLBSTATE_LAZY.
2646 	 */
2647 
2648 	ci->ci_tlbstate = TLBSTATE_VALID;
2649 	oldcpus = pmap->pm_cpus;
2650 	KASSERT((pmap->pm_kernel_cpus & cpumask) != 0);
2651 	if (oldcpus & cpumask) {
2652 		/* got it */
2653 		result = true;
2654 	} else {
2655 		/* must reload */
2656 		atomic_or_32(&pmap->pm_cpus, cpumask);
2657 		result = false;
2658 	}
2659 
2660 	return result;
2661 }
2662 
2663 /*
2664  * pmap_load: actually switch pmap.  (fill in %cr3 and LDT info)
2665  */
2666 
2667 void
2668 pmap_load(void)
2669 {
2670 	struct cpu_info *ci;
2671 	uint32_t cpumask;
2672 	struct pmap *pmap;
2673 	struct pmap *oldpmap;
2674 	struct lwp *l;
2675 	struct pcb *pcb;
2676 	uint64_t ncsw;
2677 
2678 	kpreempt_disable();
2679  retry:
2680 	ci = curcpu();
2681 	if (!ci->ci_want_pmapload) {
2682 		kpreempt_enable();
2683 		return;
2684 	}
2685 	cpumask = ci->ci_cpumask;
2686 	l = ci->ci_curlwp;
2687 	ncsw = l->l_ncsw;
2688 
2689 	/* should be able to take ipis. */
2690 	KASSERT(ci->ci_ilevel < IPL_HIGH);
2691 #ifdef XEN
2692 	/* XXX not yet KASSERT(x86_read_psl() != 0); */
2693 #else
2694 	KASSERT((x86_read_psl() & PSL_I) != 0);
2695 #endif
2696 
2697 	KASSERT(l != NULL);
2698 	pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map);
2699 	KASSERT(pmap != pmap_kernel());
2700 	oldpmap = ci->ci_pmap;
2701 	pcb = lwp_getpcb(l);
2702 
2703 	if (pmap == oldpmap) {
2704 		if (!pmap_reactivate(pmap)) {
2705 			u_int gen = uvm_emap_gen_return();
2706 
2707 			/*
2708 			 * the pmap has been changed while it was deactivated;
2709 			 * our TLB may be stale.
2710 			 */
2711 
2712 			tlbflush();
2713 			uvm_emap_update(gen);
2714 		}
2715 
2716 		ci->ci_want_pmapload = 0;
2717 		kpreempt_enable();
2718 		return;
2719 	}
2720 
2721 	/*
2722 	 * grab a reference to the new pmap.
2723 	 */
2724 
2725 	pmap_reference(pmap);
2726 
2727 	/*
2728 	 * actually switch pmap.
2729 	 */
2730 
2731 	atomic_and_32(&oldpmap->pm_cpus, ~cpumask);
2732 	atomic_and_32(&oldpmap->pm_kernel_cpus, ~cpumask);
2733 
2734 #if defined(XEN) && defined(__x86_64__)
2735 	KASSERT(pmap_pdirpa(oldpmap, 0) == ci->ci_xen_current_user_pgd ||
2736 	    oldpmap == pmap_kernel());
2737 #elif defined(PAE)
2738 	KASSERT(pmap_pdirpa(oldpmap, 0) == pmap_pte2pa(ci->ci_pae_l3_pdir[0]));
2739 #elif !defined(XEN)
2740 	KASSERT(pmap_pdirpa(oldpmap, 0) == pmap_pte2pa(rcr3()));
2741 #endif
2742 	KASSERT((pmap->pm_cpus & cpumask) == 0);
2743 	KASSERT((pmap->pm_kernel_cpus & cpumask) == 0);
2744 
2745 	/*
2746 	 * mark the pmap in use by this processor.  again we must
2747 	 * synchronize with TLB shootdown interrupts, so set the
2748 	 * state VALID first, then register us for shootdown events
2749 	 * on this pmap.
2750 	 */
2751 
2752 	ci->ci_tlbstate = TLBSTATE_VALID;
2753 	atomic_or_32(&pmap->pm_cpus, cpumask);
2754 	atomic_or_32(&pmap->pm_kernel_cpus, cpumask);
2755 	ci->ci_pmap = pmap;
2756 
2757 	/*
2758 	 * update tss.  now that we have registered for invalidations
2759 	 * from other CPUs, we're good to load the page tables.
2760 	 */
2761 #ifdef PAE
2762 	pcb->pcb_cr3 = ci->ci_pae_l3_pdirpa;
2763 #else
2764 	pcb->pcb_cr3 = pmap_pdirpa(pmap, 0);
2765 #endif
2766 
2767 #ifdef i386
2768 #ifdef XEN
2769 	/*
2770 	 * clear APDP slot, in case it points to a page table that has
2771 	 * been freed
2772 	 */
2773 	if (*APDP_PDE) {
2774 		pmap_unmap_apdp();
2775 	}
2776 	/* lldt() does pmap_pte_flush() */
2777 #endif /* XEN */
2778 
2779 #ifndef XEN
2780 	ci->ci_tss.tss_ldt = pmap->pm_ldt_sel;
2781 	ci->ci_tss.tss_cr3 = pcb->pcb_cr3;
2782 #endif /* !XEN */
2783 #endif /* i386 */
2784 
2785 	lldt(pmap->pm_ldt_sel);
2786 
2787 	u_int gen = uvm_emap_gen_return();
2788 	cpu_load_pmap(pmap);
2789 	uvm_emap_update(gen);
2790 
2791 	ci->ci_want_pmapload = 0;
2792 
2793 	/*
2794 	 * we're now running with the new pmap.  drop the reference
2795 	 * to the old pmap.  if we block, we need to go around again.
2796 	 */
2797 
2798 	pmap_destroy(oldpmap);
2799 	if (l->l_ncsw != ncsw) {
2800 		goto retry;
2801 	}
2802 
2803 	kpreempt_enable();
2804 }
2805 
2806 /*
2807  * pmap_deactivate: deactivate a process' pmap
2808  *
2809  * => must be called with kernel preemption disabled (high SPL is enough)
2810  */
2811 
2812 void
2813 pmap_deactivate(struct lwp *l)
2814 {
2815 	struct pmap *pmap;
2816 	struct cpu_info *ci;
2817 
2818 	KASSERT(kpreempt_disabled());
2819 
2820 	if (l != curlwp) {
2821 		return;
2822 	}
2823 
2824 	/*
2825 	 * wait for pending TLB shootdowns to complete.  necessary
2826 	 * because TLB shootdown state is per-CPU, and the LWP may
2827 	 * be coming off the CPU before it has a chance to call
2828 	 * pmap_update().
2829 	 */
2830 	pmap_tlb_shootwait();
2831 
2832 	ci = curcpu();
2833 
2834 	if (ci->ci_want_pmapload) {
2835 		KASSERT(vm_map_pmap(&l->l_proc->p_vmspace->vm_map)
2836 		    != pmap_kernel());
2837 		KASSERT(vm_map_pmap(&l->l_proc->p_vmspace->vm_map)
2838 		    != ci->ci_pmap || ci->ci_tlbstate != TLBSTATE_VALID);
2839 
2840 		/*
2841 		 * userspace has not been touched.
2842 		 * nothing to do here.
2843 		 */
2844 
2845 		ci->ci_want_pmapload = 0;
2846 		return;
2847 	}
2848 
2849 	pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map);
2850 
2851 	if (pmap == pmap_kernel()) {
2852 		return;
2853 	}
2854 
2855 #if defined(XEN) && defined(__x86_64__)
2856 	KASSERT(pmap_pdirpa(pmap, 0) == ci->ci_xen_current_user_pgd);
2857 #elif defined(PAE)
2858 	KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(ci->ci_pae_l3_pdir[0]));
2859 #elif !defined(XEN)
2860 	KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(rcr3()));
2861 #endif
2862 	KASSERT(ci->ci_pmap == pmap);
2863 
2864 	/*
2865 	 * we aren't interested in TLB invalidations for this pmap,
2866 	 * at least for the time being.
2867 	 */
2868 
2869 	KASSERT(ci->ci_tlbstate == TLBSTATE_VALID);
2870 	ci->ci_tlbstate = TLBSTATE_LAZY;
2871 }
2872 
2873 /*
2874  * end of lifecycle functions
2875  */
2876 
2877 /*
2878  * some misc. functions
2879  */
2880 
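/*
 * pmap_pdes_invalid: walk the page directory hierarchy for va from the
 * top level down.  Returns 0 if every PDE is valid (optionally storing
 * the lowest-level PDE via lastpde), otherwise the level of the first
 * invalid PDE.
 */
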
2881 int
2882 pmap_pdes_invalid(vaddr_t va, pd_entry_t * const *pdes, pd_entry_t *lastpde)
2883 {
2884 	int i;
2885 	unsigned long index;
2886 	pd_entry_t pde;
2887 
2888 	for (i = PTP_LEVELS; i > 1; i--) {
2889 		index = pl_i(va, i);
2890 		pde = pdes[i - 2][index];
2891 		if ((pde & PG_V) == 0)
2892 			return i;
2893 	}
2894 	if (lastpde != NULL)
2895 		*lastpde = pde;
2896 	return 0;
2897 }
2898 
2899 /*
2900  * pmap_extract: extract a PA for the given VA
2901  */
2902 
2903 bool
2904 pmap_extract(struct pmap *pmap, vaddr_t va, paddr_t *pap)
2905 {
2906 	pt_entry_t *ptes, pte;
2907 	pd_entry_t pde;
2908 	pd_entry_t * const *pdes;
2909 	struct pmap *pmap2;
2910 	struct cpu_info *ci;
2911 	paddr_t pa;
2912 	lwp_t *l;
2913 	bool hard, rv;
2914 
2915 	rv = false;
2916 	pa = 0;
2917 	l = curlwp;
2918 
2919 	KPREEMPT_DISABLE(l);
2920 	ci = l->l_cpu;
2921 	if (__predict_true(!ci->ci_want_pmapload && ci->ci_pmap == pmap) ||
2922 	    pmap == pmap_kernel()) {
2923 		/*
2924 		 * no need to lock, because it's pmap_kernel() or our
2925 		 * own pmap and is active.  if a user pmap, the caller
2926 		 * will hold the vm_map write/read locked and so prevent
2927 		 * entries from disappearing while we are here.  ptps
2928 		 * can disappear via pmap_remove() and pmap_protect(),
2929 		 * but they are called with the vm_map write locked.
2930 		 */
2931 		hard = false;
2932 		ptes = PTE_BASE;
2933 		pdes = normal_pdes;
2934 	} else {
2935 		/* we lose, do it the hard way. */
2936 		hard = true;
2937 		pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
2938 	}
2939 	if (pmap_pdes_valid(va, pdes, &pde)) {
2940 		pte = ptes[pl1_i(va)];
2941 		if (pde & PG_PS) {
2942 			pa = (pde & PG_LGFRAME) | (va & (NBPD_L2 - 1));
2943 			rv = true;
2944 		} else if (__predict_true((pte & PG_V) != 0)) {
2945 			pa = pmap_pte2pa(pte) | (va & (NBPD_L1 - 1));
2946 			rv = true;
2947 		}
2948 	}
2949 	if (__predict_false(hard)) {
2950 		pmap_unmap_ptes(pmap, pmap2);
2951 	}
2952 	KPREEMPT_ENABLE(l);
2953 	if (pap != NULL) {
2954 		*pap = pa;
2955 	}
2956 	return rv;
2957 }
2958 
2959 
2960 /*
2961  * vtophys: virtual address to physical address.  For use by
2962  * machine-dependent code only.
2963  */
2964 
2965 paddr_t
2966 vtophys(vaddr_t va)
2967 {
2968 	paddr_t pa;
2969 
2970 	if (pmap_extract(pmap_kernel(), va, &pa) == true)
2971 		return (pa);
2972 	return (0);
2973 }
2974 
2975 __weak_alias(pmap_extract_ma, pmap_extract);
2976 
2977 #ifdef XEN
2978 
2979 /*
2980  * vtomach: virtual address to machine address.  For use by
2981  * machine-dependent code only.
2982  */
2983 
2984 paddr_t
2985 vtomach(vaddr_t va)
2986 {
2987 	paddr_t pa;
2988 
2989 	if (pmap_extract_ma(pmap_kernel(), va, &pa) == true)
2990 		return (pa);
2991 	return (0);
2992 }
2993 
2994 #endif /* XEN */
2995 
2996 /*
2997  * pmap_virtual_space: used during bootup [pmap_steal_memory] to
2998  *	determine the bounds of the kernel virtual address space.
2999  */
3000 
3001 void
3002 pmap_virtual_space(vaddr_t *startp, vaddr_t *endp)
3003 {
3004 	*startp = virtual_avail;
3005 	*endp = virtual_end;
3006 }
3007 
3008 /*
3009  * pmap_map: map a range of PAs into kvm.
3010  *
3011  * => used during crash dump
3012  * => XXX: pmap_map() should be phased out?
3013  */
3014 
3015 vaddr_t
3016 pmap_map(vaddr_t va, paddr_t spa, paddr_t epa, vm_prot_t prot)
3017 {
3018 	while (spa < epa) {
3019 		pmap_kenter_pa(va, spa, prot, 0);
3020 		va += PAGE_SIZE;
3021 		spa += PAGE_SIZE;
3022 	}
3023 	pmap_update(pmap_kernel());
3024 	return va;
3025 }
3026 
3027 /*
3028  * pmap_zero_page: zero a page
3029  */
3030 
3031 void
3032 pmap_zero_page(paddr_t pa)
3033 {
3034 	pt_entry_t *zpte;
3035 	void *zerova;
3036 	int id;
3037 
3038 	kpreempt_disable();
3039 	id = cpu_number();
3040 	zpte = PTESLEW(zero_pte, id);
3041 	zerova = VASLEW(zerop, id);
3042 
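	/*
	 * The temporary zeroing PTE and VA are per-CPU (slewed by
	 * cpu_number()), hence the kpreempt_disable() above.
	 */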
3043 #ifdef DIAGNOSTIC
3044 	if (*zpte)
3045 		panic("pmap_zero_page: lock botch");
3046 #endif
3047 
3048 	pmap_pte_set(zpte, pmap_pa2pte(pa) | PG_V | PG_RW | PG_M | PG_U | PG_k);
3049 	pmap_pte_flush();
3050 	pmap_update_pg((vaddr_t)zerova);		/* flush TLB */
3051 
3052 	memset(zerova, 0, PAGE_SIZE);
3053 
3054 #if defined(DIAGNOSTIC) || defined(XEN)
3055 	pmap_pte_set(zpte, 0);				/* zap ! */
3056 	pmap_pte_flush();
3057 #endif
3058 	kpreempt_enable();
3059 }
3060 
3061 /*
3062  * pmap_pageidlezero: the same, for the idle-loop page zeroer.
3063  * Returns true if the page was zero'd, false if we aborted for
3064  * some reason.
3065  */
3066 
3067 bool
3068 pmap_pageidlezero(paddr_t pa)
3069 {
3070 	pt_entry_t *zpte;
3071 	void *zerova;
3072 	bool rv;
3073 	int id;
3074 
3075 	id = cpu_number();
3076 	zpte = PTESLEW(zero_pte, id);
3077 	zerova = VASLEW(zerop, id);
3078 
3079 	KASSERT(cpu_feature[0] & CPUID_SSE2);
3080 	KASSERT(*zpte == 0);
3081 
3082 	pmap_pte_set(zpte, pmap_pa2pte(pa) | PG_V | PG_RW | PG_M | PG_U | PG_k);
3083 	pmap_pte_flush();
3084 	pmap_update_pg((vaddr_t)zerova);		/* flush TLB */
3085 
3086 	rv = sse2_idlezero_page(zerova);
3087 
3088 #if defined(DIAGNOSTIC) || defined(XEN)
3089 	pmap_pte_set(zpte, 0);				/* zap ! */
3090 	pmap_pte_flush();
3091 #endif
3092 
3093 	return rv;
3094 }
3095 
3096 /*
3097  * pmap_copy_page: copy a page
3098  */
3099 
3100 void
3101 pmap_copy_page(paddr_t srcpa, paddr_t dstpa)
3102 {
3103 	pt_entry_t *spte;
3104 	pt_entry_t *dpte;
3105 	void *csrcva;
3106 	void *cdstva;
3107 	int id;
3108 
3109 	kpreempt_disable();
3110 	id = cpu_number();
3111 	spte = PTESLEW(csrc_pte,id);
3112 	dpte = PTESLEW(cdst_pte,id);
3113 	csrcva = VASLEW(csrcp, id);
3114 	cdstva = VASLEW(cdstp, id);
3115 
3116 	KASSERT(*spte == 0 && *dpte == 0);
3117 
3118 	pmap_pte_set(spte, pmap_pa2pte(srcpa) | PG_V | PG_RW | PG_U | PG_k);
3119 	pmap_pte_set(dpte,
3120 	    pmap_pa2pte(dstpa) | PG_V | PG_RW | PG_M | PG_U | PG_k);
3121 	pmap_pte_flush();
3122 	pmap_update_2pg((vaddr_t)csrcva, (vaddr_t)cdstva);
3123 
3124 	memcpy(cdstva, csrcva, PAGE_SIZE);
3125 
3126 #if defined(DIAGNOSTIC) || defined(XEN)
3127 	pmap_pte_set(spte, 0);
3128 	pmap_pte_set(dpte, 0);
3129 	pmap_pte_flush();
3130 #endif
3131 	kpreempt_enable();
3132 }
3133 
3134 static pt_entry_t *
3135 pmap_map_ptp(struct vm_page *ptp)
3136 {
3137 	pt_entry_t *ptppte;
3138 	void *ptpva;
3139 	int id;
3140 
3141 	KASSERT(kpreempt_disabled());
3142 
3143 	id = cpu_number();
3144 	ptppte = PTESLEW(ptp_pte, id);
3145 	ptpva = VASLEW(ptpp, id);
3146 #if !defined(XEN)
3147 	pmap_pte_set(ptppte, pmap_pa2pte(VM_PAGE_TO_PHYS(ptp)) | PG_V | PG_M |
3148 	    PG_RW | PG_U | PG_k);
3149 #else
3150 	pmap_pte_set(ptppte, pmap_pa2pte(VM_PAGE_TO_PHYS(ptp)) | PG_V | PG_M |
3151 	    PG_U | PG_k);
3152 #endif
3153 	pmap_pte_flush();
3154 	pmap_update_pg((vaddr_t)ptpva);
3155 
3156 	return (pt_entry_t *)ptpva;
3157 }
3158 
3159 static void
3160 pmap_unmap_ptp(void)
3161 {
3162 #if defined(DIAGNOSTIC) || defined(XEN)
3163 	pt_entry_t *pte;
3164 
3165 	KASSERT(kpreempt_disabled());
3166 
3167 	pte = PTESLEW(ptp_pte, cpu_number());
3168 	if (*pte != 0) {
3169 		pmap_pte_set(pte, 0);
3170 		pmap_pte_flush();
3171 	}
3172 #endif
3173 }
3174 
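/*
 * pmap_map_pte: return a pointer to the PTE mapping va.  If pmap is the
 * current pmap the recursive PTE mapping is used directly; otherwise the
 * PTP is temporarily mapped with pmap_map_ptp().
 */
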
3175 static pt_entry_t *
3176 pmap_map_pte(struct pmap *pmap, struct vm_page *ptp, vaddr_t va)
3177 {
3178 
3179 	KASSERT(kpreempt_disabled());
3180 	if (pmap_is_curpmap(pmap)) {
3181 		return &PTE_BASE[pl1_i(va)]; /* (k)vtopte */
3182 	}
3183 	KASSERT(ptp != NULL);
3184 	return pmap_map_ptp(ptp) + pl1_pi(va);
3185 }
3186 
3187 static void
3188 pmap_unmap_pte(void)
3189 {
3190 
3191 	KASSERT(kpreempt_disabled());
3192 
3193 	pmap_unmap_ptp();
3194 }
3195 
3196 /*
3197  * p m a p   r e m o v e   f u n c t i o n s
3198  *
3199  * functions that remove mappings
3200  */
3201 
3202 /*
3203  * pmap_remove_ptes: remove PTEs from a PTP
3204  *
3205  * => must have proper locking on pmap_master_lock
3206  * => caller must hold pmap's lock
3207  * => PTP must be mapped into KVA
3208  * => PTP should be null if pmap == pmap_kernel()
3209  * => must be called with kernel preemption disabled
3210  * => returns composite pte if at least one page should be shot down
3211  */
3212 
3213 static pt_entry_t
3214 pmap_remove_ptes(struct pmap *pmap, struct vm_page *ptp, vaddr_t ptpva,
3215 		 vaddr_t startva, vaddr_t endva, struct pv_entry **pv_tofree)
3216 {
3217 	struct pv_entry *pve;
3218 	pt_entry_t *pte = (pt_entry_t *) ptpva;
3219 	pt_entry_t opte, xpte = 0;
3220 
3221 	KASSERT(pmap == pmap_kernel() || mutex_owned(&pmap->pm_lock));
3222 	KASSERT(kpreempt_disabled());
3223 
3224 	/*
3225 	 * note that ptpva points to the PTE that maps startva.   this may
3226 	 * or may not be the first PTE in the PTP.
3227 	 *
3228 	 * we loop through the PTP while there are still PTEs to look at
3229 	 * and the wire_count is greater than 1 (because we use the wire_count
3230 	 * to keep track of the number of real PTEs in the PTP).
3231 	 */
3232 
3233 	for (/*null*/; startva < endva && (ptp == NULL || ptp->wire_count > 1)
3234 			     ; pte++, startva += PAGE_SIZE) {
3235 		struct vm_page *pg;
3236 		struct pmap_page *pp;
3237 
3238 		if (!pmap_valid_entry(*pte))
3239 			continue;			/* VA not mapped */
3240 
3241 		/* atomically save the old PTE and zap! it */
3242 		opte = pmap_pte_testset(pte, 0);
3243 		if (!pmap_valid_entry(opte)) {
3244 			continue;
3245 		}
3246 
3247 		pmap_exec_account(pmap, startva, opte, 0);
3248 		pmap_stats_update_bypte(pmap, 0, opte);
3249 		xpte |= opte;
3250 
3251 		if (ptp) {
3252 			ptp->wire_count--;		/* dropping a PTE */
3253 			/* Make sure that the PDE is flushed */
3254 			if (ptp->wire_count <= 1)
3255 				xpte |= PG_U;
3256 		}
3257 
3258 		/*
3259 		 * if we are not on a pv_head list we are done.
3260 		 */
3261 
3262 		if ((opte & PG_PVLIST) == 0) {
3263 #if defined(DIAGNOSTIC) && !defined(DOM0OPS)
3264 			if (PHYS_TO_VM_PAGE(pmap_pte2pa(opte)) != NULL)
3265 				panic("pmap_remove_ptes: managed page without "
3266 				      "PG_PVLIST for %#" PRIxVADDR, startva);
3267 #endif
3268 			continue;
3269 		}
3270 
3271 		pg = PHYS_TO_VM_PAGE(pmap_pte2pa(opte));
3272 #ifdef DIAGNOSTIC
3273 		if (pg == NULL)
3274 			panic("pmap_remove_ptes: unmanaged page marked "
3275 			      "PG_PVLIST, va = %#" PRIxVADDR ", "
3276 			      "pa = %#" PRIxPADDR,
3277 			      startva, (paddr_t)pmap_pte2pa(opte));
3278 #endif
3279 
3280 		/* sync R/M bits */
3281 		pp = VM_PAGE_TO_PP(pg);
3282 		pp_lock(pp);
3283 		pp->pp_attrs |= opte;
3284 		pve = pmap_remove_pv(pp, ptp, startva);
3285 		pp_unlock(pp);
3286 
3287 		if (pve != NULL) {
3288 			pve->pve_next = *pv_tofree;
3289 			*pv_tofree = pve;
3290 		}
3291 
3292 		/* end of "for" loop: time for next pte */
3293 	}
3294 
3295 	return xpte;
3296 }
3297 
3298 
3299 /*
3300  * pmap_remove_pte: remove a single PTE from a PTP
3301  *
3302  * => must have proper locking on pmap_master_lock
3303  * => caller must hold pmap's lock
3304  * => PTP must be mapped into KVA
3305  * => PTP should be null if pmap == pmap_kernel()
3306  * => returns true if we removed a mapping
3307  * => must be called with kernel preemption disabled
3308  */
3309 
3310 static bool
3311 pmap_remove_pte(struct pmap *pmap, struct vm_page *ptp, pt_entry_t *pte,
3312 		vaddr_t va, struct pv_entry **pv_tofree)
3313 {
3314 	pt_entry_t opte;
3315 	struct pv_entry *pve;
3316 	struct vm_page *pg;
3317 	struct pmap_page *pp;
3318 
3319 	KASSERT(pmap == pmap_kernel() || mutex_owned(&pmap->pm_lock));
3320 	KASSERT(pmap == pmap_kernel() || kpreempt_disabled());
3321 
3322 	if (!pmap_valid_entry(*pte))
3323 		return(false);		/* VA not mapped */
3324 
3325 	/* atomically save the old PTE and zap! it */
3326 	opte = pmap_pte_testset(pte, 0);
3327 	if (!pmap_valid_entry(opte)) {
3328 		return false;
3329 	}
3330 
3331 	pmap_exec_account(pmap, va, opte, 0);
3332 	pmap_stats_update_bypte(pmap, 0, opte);
3333 
3334 	if (opte & PG_U)
3335 		pmap_tlb_shootdown(pmap, va, 0, opte);
3336 
3337 	if (ptp) {
3338 		ptp->wire_count--;		/* dropping a PTE */
3339 		/* Make sure that the PDE is flushed */
3340 		if ((ptp->wire_count <= 1) && !(opte & PG_U))
3341 			pmap_tlb_shootdown(pmap, va, 0, opte);
3342 	}
3343 
3344 	/*
3345 	 * if we are not on a pv_head list we are done.
3346 	 */
3347 
3348 	if ((opte & PG_PVLIST) == 0) {
3349 #if defined(DIAGNOSTIC) && !defined(DOM0OPS)
3350 		if (PHYS_TO_VM_PAGE(pmap_pte2pa(opte)) != NULL)
3351 			panic("pmap_remove_pte: managed page without "
3352 			      "PG_PVLIST for %#" PRIxVADDR, va);
3353 #endif
3354 		return(true);
3355 	}
3356 
3357 	pg = PHYS_TO_VM_PAGE(pmap_pte2pa(opte));
3358 #ifdef DIAGNOSTIC
3359 	if (pg == NULL)
3360 		panic("pmap_remove_pte: unmanaged page marked "
3361 		    "PG_PVLIST, va = %#" PRIxVADDR ", pa = %#" PRIxPADDR,
3362 		    va, (paddr_t)pmap_pte2pa(opte));
3363 #endif
3364 
3365 	/* sync R/M bits */
3366 	pp = VM_PAGE_TO_PP(pg);
3367 	pp_lock(pp);
3368 	pp->pp_attrs |= opte;
3369 	pve = pmap_remove_pv(pp, ptp, va);
3370 	pp_unlock(pp);
3371 
3372 	if (pve) {
3373 		pve->pve_next = *pv_tofree;
3374 		*pv_tofree = pve;
3375 	}
3376 
3377 	return(true);
3378 }
3379 
3380 /*
3381  * pmap_remove: mapping removal function.
3382  *
3383  * => caller should not be holding any pmap locks
3384  */
3385 
3386 void
3387 pmap_remove(struct pmap *pmap, vaddr_t sva, vaddr_t eva)
3388 {
3389 	pt_entry_t *ptes, xpte = 0;
3390 	pd_entry_t pde;
3391 	pd_entry_t * const *pdes;
3392 	struct pv_entry *pv_tofree = NULL;
3393 	bool result;
3394 	int i;
3395 	paddr_t ptppa;
3396 	vaddr_t blkendva, va = sva;
3397 	struct vm_page *ptp;
3398 	struct pmap *pmap2;
3399 
3400 	kpreempt_disable();
3401 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);	/* locks pmap */
3402 
3403 	/*
3404 	 * removing one page?  take shortcut function.
3405 	 */
3406 
3407 	if (va + PAGE_SIZE == eva) {
3408 		if (pmap_pdes_valid(va, pdes, &pde)) {
3409 
3410 			/* PA of the PTP */
3411 			ptppa = pmap_pte2pa(pde);
3412 
3413 			/* get PTP if non-kernel mapping */
3414 			if (pmap == pmap_kernel()) {
3415 				/* we never free kernel PTPs */
3416 				ptp = NULL;
3417 			} else {
3418 				ptp = pmap_find_ptp(pmap, va, ptppa, 1);
3419 #ifdef DIAGNOSTIC
3420 				if (ptp == NULL)
3421 					panic("pmap_remove: unmanaged "
3422 					      "PTP detected");
3423 #endif
3424 			}
3425 
3426 			/* do it! */
3427 			result = pmap_remove_pte(pmap, ptp,
3428 			    &ptes[pl1_i(va)], va, &pv_tofree);
3429 
3430 			/*
3431 			 * if mapping removed and the PTP is no longer
3432 			 * being used, free it!
3433 			 */
3434 
3435 			if (result && ptp && ptp->wire_count <= 1)
3436 				pmap_free_ptp(pmap, ptp, va, ptes, pdes);
3437 		}
3438 	} else for (/* null */ ; va < eva ; va = blkendva) {
3439 		int lvl;
3440 
3441 		/* determine range of block */
3442 		blkendva = x86_round_pdr(va+1);
3443 		if (blkendva > eva)
3444 			blkendva = eva;
3445 
3446 		/*
3447 		 * XXXCDC: our PTE mappings should never be removed
3448 		 * with pmap_remove!  if we allow this (and why would
3449 		 * we?) then we end up freeing the pmap's page
3450 		 * directory page (PDP) before we are finished using
3451 		 * it when we hit it in the recursive mapping.  this
3452 		 * is BAD.
3453 		 *
3454 		 * long term solution is to move the PTEs out of user
3455 		 * address space and into kernel address space (up
3456 		 * with APTE).  then we can set VM_MAXUSER_ADDRESS to
3457 		 * be VM_MAX_ADDRESS.
3458 		 */
3459 
3460 		/* XXXCDC: ugly hack to avoid freeing PDP here */
3461 		for (i = 0; i < PDP_SIZE; i++) {
3462 			if (pl_i(va, PTP_LEVELS) == PDIR_SLOT_PTE+i)
3463 				continue;
3464 		}
3465 
3466 		lvl = pmap_pdes_invalid(va, pdes, &pde);
3467 		if (lvl != 0) {
3468 			/*
3469 			 * skip a range corresponding to an invalid pde.
3470 			 */
3471 			blkendva = (va & ptp_masks[lvl - 1]) + nbpd[lvl - 1];
3472  			continue;
3473 		}
3474 
3475 		/* PA of the PTP */
3476 		ptppa = pmap_pte2pa(pde);
3477 
3478 		/* get PTP if non-kernel mapping */
3479 		if (pmap == pmap_kernel()) {
3480 			/* we never free kernel PTPs */
3481 			ptp = NULL;
3482 		} else {
3483 			ptp = pmap_find_ptp(pmap, va, ptppa, 1);
3484 #ifdef DIAGNOSTIC
3485 			if (ptp == NULL)
3486 				panic("pmap_remove: unmanaged PTP "
3487 				      "detected");
3488 #endif
3489 		}
3490 		xpte |= pmap_remove_ptes(pmap, ptp,
3491 		    (vaddr_t)&ptes[pl1_i(va)], va, blkendva, &pv_tofree);
3492 
3493 		/* if PTP is no longer being used, free it! */
3494 		if (ptp && ptp->wire_count <= 1) {
3495 			pmap_free_ptp(pmap, ptp, va, ptes, pdes);
3496 		}
3497 		if ((xpte & PG_U) != 0)
3498 			pmap_tlb_shootdown(pmap, sva, eva, xpte);
3499 	}
3500 	pmap_unmap_ptes(pmap, pmap2);		/* unlock pmap */
3501 	kpreempt_enable();
3502 
3503 	/* Now we free unused PVs */
3504 	if (pv_tofree)
3505 		pmap_free_pvs(pv_tofree);
3506 }
3507 
3508 /*
3509  * pmap_sync_pv: clear pte bits and return the old value of the pte.
3510  *
3511  * => called with pp_lock held. (thus preemption disabled)
3512  * => issues tlb shootdowns if necessary.
3513  */
3514 
3515 static int
3516 pmap_sync_pv(struct pv_pte *pvpte, pt_entry_t expect, int clearbits,
3517     pt_entry_t *optep)
3518 {
3519 	struct pmap *pmap;
3520 	struct vm_page *ptp;
3521 	vaddr_t va;
3522 	pt_entry_t *ptep;
3523 	pt_entry_t opte;
3524 	pt_entry_t npte;
3525 	bool need_shootdown;
3526 
3527 	ptp = pvpte->pte_ptp;
3528 	va = pvpte->pte_va;
3529 	KASSERT(ptp == NULL || ptp->uobject != NULL);
3530 	KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset);
3531 	pmap = ptp_to_pmap(ptp);
3532 
3533 	KASSERT((expect & ~(PG_FRAME | PG_V)) == 0);
3534 	KASSERT((expect & PG_V) != 0);
3535 	KASSERT(clearbits == ~0 || (clearbits & ~(PG_M | PG_U | PG_RW)) == 0);
3536 	KASSERT(kpreempt_disabled());
3537 
3538 	ptep = pmap_map_pte(pmap, ptp, va);
3539 	do {
3540 		opte = *ptep;
3541 		KASSERT((opte & (PG_M | PG_U)) != PG_M);
3542 		KASSERT((opte & (PG_U | PG_V)) != PG_U);
3543 		KASSERT(opte == 0 || (opte & PG_V) != 0);
3544 		if ((opte & (PG_FRAME | PG_V)) != expect) {
3545 
3546 			/*
3547 			 * we lost a race with a V->P operation like
3548 			 * pmap_remove().  wait for the competitor to finish
3549 			 * reflecting the pte bits into pp_attrs.
3550 			 *
3551 			 * issue a redundant TLB shootdown so that
3552 			 * we can wait for its completion.
3553 			 */
3554 
3555 			pmap_unmap_pte();
3556 			if (clearbits != 0) {
3557 				pmap_tlb_shootdown(pmap, va, 0,
3558 				    (pmap == pmap_kernel() ? PG_G : 0));
3559 			}
3560 			return EAGAIN;
3561 		}
3562 
3563 		/*
3564 		 * check if there's anything to do on this pte.
3565 		 */
3566 
3567 		if ((opte & clearbits) == 0) {
3568 			need_shootdown = false;
3569 			break;
3570 		}
3571 
3572 		/*
3573 		 * we need a shootdown if the pte is cached. (PG_U)
3574 		 *
3575 		 * ...unless we are clearing only the PG_RW bit and
3576 		 * it isn't cached as RW. (PG_M)
3577 		 */
3578 
3579 		need_shootdown = (opte & PG_U) != 0 &&
3580 		    !(clearbits == PG_RW && (opte & PG_M) == 0);
3581 
3582 		npte = opte & ~clearbits;
3583 
3584 		/*
3585 		 * if we need a shootdown anyway, clear PG_U and PG_M.
3586 		 */
3587 
3588 		if (need_shootdown) {
3589 			npte &= ~(PG_U | PG_M);
3590 		}
3591 		KASSERT((npte & (PG_M | PG_U)) != PG_M);
3592 		KASSERT((npte & (PG_U | PG_V)) != PG_U);
3593 		KASSERT(npte == 0 || (opte & PG_V) != 0);
3594 	} while (pmap_pte_cas(ptep, opte, npte) != opte);
3595 
3596 	if (need_shootdown) {
3597 		pmap_tlb_shootdown(pmap, va, 0, opte);
3598 	}
3599 	pmap_unmap_pte();
3600 
3601 	*optep = opte;
3602 	return 0;
3603 }
3604 
3605 /*
3606  * pmap_page_remove: remove a managed vm_page from all pmaps that map it
3607  *
3608  * => R/M bits are sync'd back to attrs
3609  */
3610 
3611 void
3612 pmap_page_remove(struct vm_page *pg)
3613 {
3614 	struct pmap_page *pp;
3615 	struct pv_pte *pvpte;
3616 	struct pv_entry *killlist = NULL;
3617 	struct vm_page *ptp;
3618 	pt_entry_t expect;
3619 	lwp_t *l;
3620 	int count;
3621 
3622 	l = curlwp;
3623 	pp = VM_PAGE_TO_PP(pg);
3624 	expect = pmap_pa2pte(VM_PAGE_TO_PHYS(pg)) | PG_V;
3625 	count = SPINLOCK_BACKOFF_MIN;
3626 	kpreempt_disable();
3627 startover:
3628 	pp_lock(pp);
3629 	while ((pvpte = pv_pte_first(pp)) != NULL) {
3630 		struct pmap *pmap;
3631 		struct pv_entry *pve;
3632 		pt_entry_t opte;
3633 		vaddr_t va;
3634 		int error;
3635 
3636 		/*
3637 		 * add a reference to the pmap before clearing the pte.
3638 		 * otherwise the pmap can disappear behind us.
3639 		 */
3640 
3641 		ptp = pvpte->pte_ptp;
3642 		pmap = ptp_to_pmap(ptp);
3643 		if (ptp != NULL) {
3644 			pmap_reference(pmap);
3645 		}
3646 
3647 		error = pmap_sync_pv(pvpte, expect, ~0, &opte);
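		/*
		 * If pmap_sync_pv() lost a race with a V->P operation,
		 * drop the locks, back off and start over.
		 */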
3648 		if (error == EAGAIN) {
3649 			int hold_count;
3650 			pp_unlock(pp);
3651 			KERNEL_UNLOCK_ALL(curlwp, &hold_count);
3652 			if (ptp != NULL) {
3653 				pmap_destroy(pmap);
3654 			}
3655 			SPINLOCK_BACKOFF(count);
3656 			KERNEL_LOCK(hold_count, curlwp);
3657 			goto startover;
3658 		}
3659 
3660 		pp->pp_attrs |= opte;
3661 		va = pvpte->pte_va;
3662 		pve = pmap_remove_pv(pp, ptp, va);
3663 		pp_unlock(pp);
3664 
3665 		/* update the PTP reference count.  free if last reference. */
3666 		if (ptp != NULL) {
3667 			struct pmap *pmap2;
3668 			pt_entry_t *ptes;
3669 			pd_entry_t * const *pdes;
3670 
3671 			KASSERT(pmap != pmap_kernel());
3672 
3673 			pmap_tlb_shootwait();
3674 			pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
3675 			pmap_stats_update_bypte(pmap, 0, opte);
3676 			ptp->wire_count--;
3677 			if (ptp->wire_count <= 1) {
3678 				pmap_free_ptp(pmap, ptp, va, ptes, pdes);
3679 			}
3680 			pmap_unmap_ptes(pmap, pmap2);
3681 			pmap_destroy(pmap);
3682 		} else {
3683 			KASSERT(pmap == pmap_kernel());
3684 			pmap_stats_update_bypte(pmap, 0, opte);
3685 		}
3686 
3687 		if (pve != NULL) {
3688 			pve->pve_next = killlist;	/* mark it for death */
3689 			killlist = pve;
3690 		}
3691 		pp_lock(pp);
3692 	}
3693 	pp_unlock(pp);
3694 	kpreempt_enable();
3695 
3696 	/* Now free unused pvs. */
3697 	pmap_free_pvs(killlist);
3698 }
3699 
3700 /*
3701  * p m a p   a t t r i b u t e  f u n c t i o n s
3702  * functions that test/change managed page's attributes
3703  * since a page can be mapped multiple times we must check each PTE that
3704  * maps it by going down the pv lists.
3705  */
3706 
3707 /*
3708  * pmap_test_attrs: test a page's attributes
3709  */
3710 
3711 bool
3712 pmap_test_attrs(struct vm_page *pg, unsigned testbits)
3713 {
3714 	struct pmap_page *pp;
3715 	struct pv_pte *pvpte;
3716 	pt_entry_t expect;
3717 	u_int result;
3718 
3719 	pp = VM_PAGE_TO_PP(pg);
3720 	if ((pp->pp_attrs & testbits) != 0) {
3721 		return true;
3722 	}
3723 	expect = pmap_pa2pte(VM_PAGE_TO_PHYS(pg)) | PG_V;
3724 	pp_lock(pp);
3725 	for (pvpte = pv_pte_first(pp); pvpte; pvpte = pv_pte_next(pp, pvpte)) {
3726 		pt_entry_t opte;
3727 		int error;
3728 
3729 		if ((pp->pp_attrs & testbits) != 0) {
3730 			break;
3731 		}
3732 		error = pmap_sync_pv(pvpte, expect, 0, &opte);
3733 		if (error == 0) {
3734 			pp->pp_attrs |= opte;
3735 		}
3736 	}
3737 	result = pp->pp_attrs & testbits;
3738 	pp_unlock(pp);
3739 
3740 	/*
3741 	 * note that we will exit the for loop with a non-null pvpte if
3742 	 * we have found the bits we are testing for.
3743 	 */
3744 
3745 	return result != 0;
3746 }
3747 
3748 /*
3749  * pmap_clear_attrs: clear the specified attribute for a page.
3750  *
3751  * => we return true if we cleared one of the bits we were asked to
3752  */
3753 
3754 bool
3755 pmap_clear_attrs(struct vm_page *pg, unsigned clearbits)
3756 {
3757 	struct pmap_page *pp;
3758 	struct pv_pte *pvpte;
3759 	u_int result;
3760 	pt_entry_t expect;
3761 	int count;
3762 
3763 	pp = VM_PAGE_TO_PP(pg);
3764 	expect = pmap_pa2pte(VM_PAGE_TO_PHYS(pg)) | PG_V;
3765 	count = SPINLOCK_BACKOFF_MIN;
3766 	kpreempt_disable();
3767 startover:
3768 	pp_lock(pp);
3769 	for (pvpte = pv_pte_first(pp); pvpte; pvpte = pv_pte_next(pp, pvpte)) {
3770 		pt_entry_t opte;
3771 		int error;
3772 
3773 		error = pmap_sync_pv(pvpte, expect, clearbits, &opte);
3774 		if (error == EAGAIN) {
3775 			int hold_count;
3776 			pp_unlock(pp);
3777 			KERNEL_UNLOCK_ALL(curlwp, &hold_count);
3778 			SPINLOCK_BACKOFF(count);
3779 			KERNEL_LOCK(hold_count, curlwp);
3780 			goto startover;
3781 		}
3782 		pp->pp_attrs |= opte;
3783 	}
3784 	result = pp->pp_attrs & clearbits;
3785 	pp->pp_attrs &= ~clearbits;
3786 	pp_unlock(pp);
3787 	kpreempt_enable();
3788 
3789 	return result != 0;
3790 }
3791 
3792 
3793 /*
3794  * p m a p   p r o t e c t i o n   f u n c t i o n s
3795  */
3796 
3797 /*
3798  * pmap_page_protect: change the protection of all recorded mappings
3799  *	of a managed page
3800  *
3801  * => NOTE: this is an inline function in pmap.h
3802  */
3803 
3804 /* see pmap.h */
3805 
3806 /*
3807  * pmap_protect: set the protection of the pages in a pmap
3808  *
3809  * => NOTE: this is an inline function in pmap.h
3810  */
3811 
3812 /* see pmap.h */
3813 
3814 /*
3815  * pmap_write_protect: write-protect pages in a pmap
3816  */
3817 
3818 void
3819 pmap_write_protect(struct pmap *pmap, vaddr_t sva, vaddr_t eva, vm_prot_t prot)
3820 {
3821 	int i;
3822 	pt_entry_t *ptes, *epte;
3823 	pt_entry_t *spte;
3824 	pd_entry_t * const *pdes;
3825 	vaddr_t blockend, va;
3826 	pt_entry_t opte;
3827 	struct pmap *pmap2;
3828 
3829 	KASSERT(curlwp->l_md.md_gc_pmap != pmap);
3830 
3831 	kpreempt_disable();
3832 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);	/* locks pmap */
3833 
3834 	/* should be ok, but just in case ... */
3835 	sva &= PG_FRAME;
3836 	eva &= PG_FRAME;
3837 
3838 	for (va = sva ; va < eva ; va = blockend) {
3839 
3840 		blockend = (va & L2_FRAME) + NBPD_L2;
3841 		if (blockend > eva)
3842 			blockend = eva;
3843 
3844 		/*
3845 		 * XXXCDC: our PTE mappings should never be write-protected!
3846 		 *
3847 		 * long term solution is to move the PTEs out of user
3848 		 * address space and into kernel address space (up
3849 		 * with APTE).  then we can set VM_MAXUSER_ADDRESS to
3850 		 * be VM_MAX_ADDRESS.
3851 		 */
3852 
3853 		/* XXXCDC: ugly hack to avoid freeing PDP here */
3854 		for (i = 0; i < PDP_SIZE; i++) {
3855 			if (pl_i(va, PTP_LEVELS) == PDIR_SLOT_PTE+i)
3856 				continue;
3857 		}
3858 
3859 		/* empty block? */
3860 		if (!pmap_pdes_valid(va, pdes, NULL))
3861 			continue;
3862 
3863 #ifdef DIAGNOSTIC
3864 		if (va >= VM_MAXUSER_ADDRESS &&
3865 		    va < VM_MAX_ADDRESS)
3866 			panic("pmap_write_protect: PTE space");
3867 #endif
3868 
3869 		spte = &ptes[pl1_i(va)];
3870 		epte = &ptes[pl1_i(blockend)];
3871 
3872 		for (/*null */; spte < epte ; spte++) {
3873 			pt_entry_t npte;
3874 
3875 			do {
3876 				opte = *spte;
3877 				if ((~opte & (PG_RW | PG_V)) != 0) {
3878 					goto next;
3879 				}
3880 				npte = opte & ~PG_RW;
3881 			} while (pmap_pte_cas(spte, opte, npte) != opte);
3882 			if ((opte & PG_M) != 0) {
3883 				vaddr_t tva;
3884 
3885 				tva = x86_ptob(spte - ptes);
3886 				pmap_tlb_shootdown(pmap, tva, 0, opte);
3887 			}
3888 next:;
3889 		}
3890 	}
3891 
3892 	pmap_unmap_ptes(pmap, pmap2);	/* unlocks pmap */
3893 	kpreempt_enable();
3894 }
3895 
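/*
 * Illustrative sketch: pmap_protect() (inline in pmap.h, see above) is
 * expected to reach pmap_write_protect() only when write permission is
 * being taken away, roughly:
 *
 *	if ((prot & VM_PROT_WRITE) == 0) {
 *		if (prot & (VM_PROT_READ | VM_PROT_EXECUTE))
 *			pmap_write_protect(pmap, sva, eva, prot);
 *		else
 *			pmap_remove(pmap, sva, eva);
 *	}
 */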
3896 /*
3897  * end of protection functions
3898  */
3899 
3900 /*
3901  * pmap_unwire: clear the wired bit in the PTE
3902  *
3903  * => mapping should already be in map
3904  */
3905 
3906 void
3907 pmap_unwire(struct pmap *pmap, vaddr_t va)
3908 {
3909 	pt_entry_t *ptes;
3910 	pd_entry_t * const *pdes;
3911 	struct pmap *pmap2;
3912 
3913 	kpreempt_disable();
3914 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);	/* locks pmap */
3915 
3916 	if (pmap_pdes_valid(va, pdes, NULL)) {
3917 		pt_entry_t *ptep = &ptes[pl1_i(va)];
3918 		pt_entry_t opte = *ptep;
3919 
3920 #ifdef DIAGNOSTIC
3921 		if (!pmap_valid_entry(opte))
3922 			panic("pmap_unwire: invalid (unmapped) va 0x%lx", va);
3923 #endif
3924 		if ((opte & PG_W) != 0) {
3925 			pt_entry_t npte = opte & ~PG_W;
3926 
3927 			opte = pmap_pte_testset(ptep, npte);
3928 			pmap_stats_update_bypte(pmap, npte, opte);
3929 		}
3930 #ifdef DIAGNOSTIC
3931 		else {
3932 			printf("pmap_unwire: wiring for pmap %p va 0x%lx "
3933 			       "didn't change!\n", pmap, va);
3934 		}
3935 #endif
3936 		pmap_unmap_ptes(pmap, pmap2);		/* unlocks map */
3937 	}
3938 #ifdef DIAGNOSTIC
3939 	else {
3940 		panic("pmap_unwire: invalid PDE");
3941 	}
3942 #endif
3943 	kpreempt_enable();
3944 }
3945 
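/*
 * Illustrative sketch: a wired mapping is created by passing PMAP_WIRED
 * to pmap_enter() and released with pmap_unwire() once the wiring is no
 * longer needed:
 *
 *	(void)pmap_enter(pmap, va, pa, VM_PROT_READ | VM_PROT_WRITE,
 *	    VM_PROT_READ | PMAP_WIRED);
 *	pmap_update(pmap);
 *	...
 *	pmap_unwire(pmap, va);
 */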
3946 /*
3947  * pmap_copy: copy mappings from one pmap to another
3948  *
3949  * => optional function
3950  * void pmap_copy(dst_pmap, src_pmap, dst_addr, len, src_addr)
3951  */
3952 
3953 /*
3954  * defined as macro in pmap.h
3955  * defined as a macro in pmap.h
3956 
3957 __weak_alias(pmap_enter, pmap_enter_default);
3958 
3959 int
3960 pmap_enter_default(pmap_t pmap, vaddr_t va, paddr_t pa, vm_prot_t prot,
3961     u_int flags)
3962 {
3963 	return pmap_enter_ma(pmap, va, pa, pa, prot, flags, 0);
3964 }
3965 
3966 /*
3967  * pmap_enter: enter a mapping into a pmap
3968  *
3969  * => must be done "now" ... no lazy-evaluation
3970  * => we set pmap => pv_head locking
3971  */
3972 int
3973 pmap_enter_ma(struct pmap *pmap, vaddr_t va, paddr_t ma, paddr_t pa,
3974 	   vm_prot_t prot, u_int flags, int domid)
3975 {
3976 	pt_entry_t *ptes, opte, npte;
3977 	pt_entry_t *ptep;
3978 	pd_entry_t * const *pdes;
3979 	struct vm_page *ptp, *pg;
3980 	struct pmap_page *new_pp;
3981 	struct pmap_page *old_pp;
3982 	struct pv_entry *old_pve = NULL;
3983 	struct pv_entry *new_pve;
3984 	struct pv_entry *new_pve2;
3985 	int error;
3986 	bool wired = (flags & PMAP_WIRED) != 0;
3987 	struct pmap *pmap2;
3988 
3989 	KASSERT(pmap_initialized);
3990 	KASSERT(curlwp->l_md.md_gc_pmap != pmap);
3991 
3992 #ifdef DIAGNOSTIC
3993 	/* sanity check: totally out of range? */
3994 	if (va >= VM_MAX_KERNEL_ADDRESS)
3995 		panic("pmap_enter: too big");
3996 
3997 	if (va == (vaddr_t) PDP_BASE || va == (vaddr_t) APDP_BASE)
3998 		panic("pmap_enter: trying to map over PDP/APDP!");
3999 
4000 	/* sanity check: kernel PTPs should already have been pre-allocated */
4001 	if (va >= VM_MIN_KERNEL_ADDRESS &&
4002 	    !pmap_valid_entry(pmap->pm_pdir[pl_i(va, PTP_LEVELS)]))
4003 		panic("pmap_enter: missing kernel PTP for va %lx!", va);
4004 #endif /* DIAGNOSTIC */
4005 #ifdef XEN
4006 	KASSERT(domid == DOMID_SELF || pa == 0);
4007 #endif /* XEN */
4008 
4009 	npte = ma | protection_codes[prot] | PG_V;
4010 	npte |= pmap_pat_flags(flags);
4011 	if (wired)
4012 	        npte |= PG_W;
4013 	if (va < VM_MAXUSER_ADDRESS)
4014 		npte |= PG_u;
4015 	else if (va < VM_MAX_ADDRESS)
4016 		npte |= (PG_u | PG_RW);	/* XXXCDC: no longer needed? */
4017 	else
4018 		npte |= PG_k;
4019 	if (pmap == pmap_kernel())
4020 		npte |= pmap_pg_g;
4021 	if (flags & VM_PROT_ALL) {
4022 		npte |= PG_U;
4023 		if (flags & VM_PROT_WRITE) {
4024 			KASSERT((npte & PG_RW) != 0);
4025 			npte |= PG_M;
4026 		}
4027 	}
4028 
4029 #ifdef XEN
4030 	if (domid != DOMID_SELF)
4031 		pg = NULL;
4032 	else
4033 #endif
4034 		pg = PHYS_TO_VM_PAGE(pa);
4035 	if (pg != NULL) {
4036 		/* This is a managed page */
4037 		npte |= PG_PVLIST;
4038 		new_pp = VM_PAGE_TO_PP(pg);
4039 	} else {
4040 		new_pp = NULL;
4041 	}
4042 
4043 	/* get pves. */
4044 	new_pve = pool_cache_get(&pmap_pv_cache, PR_NOWAIT);
4045 	new_pve2 = pool_cache_get(&pmap_pv_cache, PR_NOWAIT);
4046 	if (new_pve == NULL || new_pve2 == NULL) {
4047 		if (flags & PMAP_CANFAIL) {
4048 			error = ENOMEM;
4049 			goto out2;
4050 		}
4051 		panic("pmap_enter: pve allocation failed");
4052 	}
4053 
4054 	kpreempt_disable();
4055 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);	/* locks pmap */
4056 	if (pmap == pmap_kernel()) {
4057 		ptp = NULL;
4058 	} else {
4059 		ptp = pmap_get_ptp(pmap, va, pdes);
4060 		if (ptp == NULL) {
4061 			pmap_unmap_ptes(pmap, pmap2);
4062 			if (flags & PMAP_CANFAIL) {
4063 				error = ENOMEM;
4064 				goto out;
4065 			}
4066 			panic("pmap_enter: get ptp failed");
4067 		}
4068 	}
4069 
4070 	/*
4071 	 * update the pte.
4072 	 */
4073 
4074 	ptep = &ptes[pl1_i(va)];
4075 	do {
4076 		opte = *ptep;
4077 
4078 		/*
4079 		 * if the same page, inherit PG_U and PG_M.
4080 		 */
4081 		if (((opte ^ npte) & (PG_FRAME | PG_V)) == 0) {
4082 			npte |= opte & (PG_U | PG_M);
4083 		}
4084 #if defined(XEN)
4085 		if (domid != DOMID_SELF) {
4086 			/* pmap_pte_cas with error handling */
4087 			int s = splvm();
4088 			if (opte != *ptep) {
4089 				splx(s);
4090 				continue;
4091 			}
4092 			error = xpq_update_foreign(
4093 			    vtomach((vaddr_t)ptep), npte, domid);
4094 			splx(s);
4095 			if (error) {
4096 				if (ptp != NULL && ptp->wire_count <= 1) {
4097 					pmap_free_ptp(pmap, ptp, va, ptes, pdes);
4098 				}
4099 				pmap_unmap_ptes(pmap, pmap2);
4100 				goto out;
4101 			}
4102 			break;
4103 		}
4104 #endif /* defined(XEN) */
4105 	} while (pmap_pte_cas(ptep, opte, npte) != opte);
4106 
4107 	/*
4108 	 * update statistics and PTP's reference count.
4109 	 */
4110 
4111 	pmap_stats_update_bypte(pmap, npte, opte);
4112 	if (ptp != NULL && !pmap_valid_entry(opte)) {
4113 		ptp->wire_count++;
4114 	}
4115 	KASSERT(ptp == NULL || ptp->wire_count > 1);
4116 
4117 	/*
4118 	 * if the same page, we can skip pv_entry handling.
4119 	 */
4120 
4121 	if (((opte ^ npte) & (PG_FRAME | PG_V)) == 0) {
4122 		KASSERT(((opte ^ npte) & PG_PVLIST) == 0);
4123 		goto same_pa;
4124 	}
4125 
4126 	/*
4127 	 * if old page is managed, remove pv_entry from its list.
4128 	 */
4129 
4130 	if ((~opte & (PG_V | PG_PVLIST)) == 0) {
4131 		pg = PHYS_TO_VM_PAGE(pmap_pte2pa(opte));
4132 #ifdef DIAGNOSTIC
4133 		if (pg == NULL)
4134 			panic("pmap_enter: PG_PVLIST mapping with "
4135 			      "unmanaged page "
4136 			      "pa = 0x%" PRIx64 " (0x%" PRIx64 ")",
4137 			      (int64_t)pa, (int64_t)atop(pa));
4138 #endif
4139 		old_pp = VM_PAGE_TO_PP(pg);
4140 
4141 		pp_lock(old_pp);
4142 		old_pve = pmap_remove_pv(old_pp, ptp, va);
4143 		old_pp->pp_attrs |= opte;
4144 		pp_unlock(old_pp);
4145 	}
4146 
4147 	/*
4148 	 * if new page is managed, insert pv_entry into its list.
4149 	 */
4150 
4151 	if (new_pp) {
4152 		pp_lock(new_pp);
4153 		new_pve = pmap_enter_pv(new_pp, new_pve, &new_pve2, ptp, va);
4154 		pp_unlock(new_pp);
4155 	}
4156 
4157 same_pa:
4158 	pmap_unmap_ptes(pmap, pmap2);
4159 
4160 	/*
4161 	 * shootdown tlb if necessary.
4162 	 */
4163 
4164 	if ((~opte & (PG_V | PG_U)) == 0 &&
4165 	    ((opte ^ npte) & (PG_FRAME | PG_RW)) != 0) {
4166 		pmap_tlb_shootdown(pmap, va, 0, opte);
4167 	}
4168 
4169 	error = 0;
4170 out:
4171 	kpreempt_enable();
4172 out2:
4173 	if (old_pve != NULL) {
4174 		pool_cache_put(&pmap_pv_cache, old_pve);
4175 	}
4176 	if (new_pve != NULL) {
4177 		pool_cache_put(&pmap_pv_cache, new_pve);
4178 	}
4179 	if (new_pve2 != NULL) {
4180 		pool_cache_put(&pmap_pv_cache, new_pve2);
4181 	}
4182 
4183 	return error;
4184 }
4185 
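/*
 * Illustrative sketch: a typical pmap_enter() call with PMAP_CANFAIL,
 * so that a pv-entry or PTP shortage is reported as ENOMEM instead of
 * panicking (both failure paths are visible above):
 *
 *	int error;
 *
 *	error = pmap_enter(pmap, va, VM_PAGE_TO_PHYS(pg),
 *	    VM_PROT_READ | VM_PROT_WRITE, VM_PROT_WRITE | PMAP_CANFAIL);
 *	if (error != 0)
 *		return error;
 *	pmap_update(pmap);
 */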
4186 static bool
4187 pmap_get_physpage(vaddr_t va, int level, paddr_t *paddrp)
4188 {
4189 	struct vm_page *ptp;
4190 	struct pmap *kpm = pmap_kernel();
4191 
4192 	if (uvm.page_init_done == false) {
4193 		/*
4194 		 * we're growing the kernel pmap early (from
4195 		 * uvm_pageboot_alloc()).  this case must be
4196 		 * handled a little differently.
4197 		 */
4198 
4199 		if (uvm_page_physget(paddrp) == false)
4200 			panic("pmap_get_physpage: out of memory");
4201 		kpreempt_disable();
4202 		pmap_pte_set(early_zero_pte,
4203 		    pmap_pa2pte(*paddrp) | PG_V | PG_RW | PG_k);
4204 		pmap_pte_flush();
4205 		pmap_update_pg((vaddr_t)early_zerop);
4206 		memset(early_zerop, 0, PAGE_SIZE);
4207 #if defined(DIAGNOSTIC) || defined(XEN)
4208 		pmap_pte_set(early_zero_pte, 0);
4209 		pmap_pte_flush();
4210 #endif /* defined(DIAGNOSTIC) || defined(XEN) */
4211 		kpreempt_enable();
4212 	} else {
4213 		/* XXX */
4214 		PMAP_SUBOBJ_LOCK(kpm, level - 1);
4215 		ptp = uvm_pagealloc(&kpm->pm_obj[level - 1],
4216 				    ptp_va2o(va, level), NULL,
4217 				    UVM_PGA_USERESERVE|UVM_PGA_ZERO);
4218 		PMAP_SUBOBJ_UNLOCK(kpm, level - 1);
4219 		if (ptp == NULL)
4220 			panic("pmap_get_physpage: out of memory");
4221 		ptp->flags &= ~PG_BUSY;
4222 		ptp->wire_count = 1;
4223 		*paddrp = VM_PAGE_TO_PHYS(ptp);
4224 	}
4225 	pmap_stats_update(kpm, 1, 0);
4226 	return true;
4227 }
4228 
4229 /*
4230  * Allocate the specified number of PTPs for a PTP level, and populate
4231  * all levels below accordingly, mapping virtual addresses starting at
4232  * kva.
4233  *
4234  * Used by pmap_growkernel.
4235  */
4236 static void
4237 pmap_alloc_level(pd_entry_t * const *pdes, vaddr_t kva, int lvl,
4238     long *needed_ptps)
4239 {
4240 	unsigned long i;
4241 	vaddr_t va;
4242 	paddr_t pa;
4243 	unsigned long index, endindex;
4244 	int level;
4245 	pd_entry_t *pdep;
4246 #ifdef XEN
4247 	int s = splvm(); /* protect xpq_* */
4248 #endif
4249 
4250 	for (level = lvl; level > 1; level--) {
4251 		if (level == PTP_LEVELS)
4252 			pdep = pmap_kernel()->pm_pdir;
4253 		else
4254 			pdep = pdes[level - 2];
4255 		va = kva;
4256 		index = pl_i_roundup(kva, level);
4257 		endindex = index + needed_ptps[level - 1] - 1;
4258 
4259 
4260 		for (i = index; i <= endindex; i++) {
4261 			KASSERT(!pmap_valid_entry(pdep[i]));
4262 			pmap_get_physpage(va, level - 1, &pa);
4263 #ifdef XEN
4264 			xpq_queue_pte_update((level == PTP_LEVELS) ?
4265 			    xpmap_ptom(pmap_pdirpa(pmap_kernel(), i)) :
4266 			    xpmap_ptetomach(&pdep[i]),
4267 			    pmap_pa2pte(pa) | PG_k | PG_V | PG_RW);
4268 #ifdef PAE
4269 			if (level == PTP_LEVELS &&  i > L2_SLOT_KERN) {
4270 				/* update real kernel PD too */
4271 				xpq_queue_pte_update(
4272 				    xpmap_ptetomach(&pmap_kl2pd[l2tol2(i)]),
4273 				    pmap_pa2pte(pa) | PG_k | PG_V | PG_RW);
4274 			}
4275 #endif
4276 #else /* XEN */
4277 			pdep[i] = pmap_pa2pte(pa) | PG_k | PG_V | PG_RW;
4278 #endif /* XEN */
4279 			KASSERT(level != PTP_LEVELS || nkptp[level - 1] +
4280 			    pl_i(VM_MIN_KERNEL_ADDRESS, level) == i);
4281 			nkptp[level - 1]++;
4282 			va += nbpd[level - 1];
4283 		}
4284 		pmap_pte_flush();
4285 	}
4286 #ifdef XEN
4287 	splx(s);
4288 #endif
4289 }
4290 
4291 /*
4292  * pmap_growkernel: increase usage of KVM space
4293  *
4294  * => we allocate new PTPs for the kernel and install them in all
4295  *	the pmaps on the system.
4296  */
4297 
4298 vaddr_t
4299 pmap_growkernel(vaddr_t maxkvaddr)
4300 {
4301 	struct pmap *kpm = pmap_kernel();
4302 #if !defined(XEN) || !defined(__x86_64__)
4303 	struct pmap *pm;
4304 #endif
4305 	int s, i;
4306 	long needed_kptp[PTP_LEVELS], target_nptp, old;
4307 	bool invalidate = false;
4308 
4309 	s = splvm();	/* to be safe */
4310 	mutex_enter(&kpm->pm_lock);
4311 
4312 	if (maxkvaddr <= pmap_maxkvaddr) {
4313 		mutex_exit(&kpm->pm_lock);
4314 		splx(s);
4315 		return pmap_maxkvaddr;
4316 	}
4317 
4318 	maxkvaddr = x86_round_pdr(maxkvaddr);
4319 	old = nkptp[PTP_LEVELS - 1];
4320 	/*
4321 	 * This loop could be optimized more, but pmap_growkernel()
4322 	 * is called infrequently.
4323 	 */
4324 	for (i = PTP_LEVELS - 1; i >= 1; i--) {
4325 		target_nptp = pl_i_roundup(maxkvaddr, i + 1) -
4326 		    pl_i_roundup(VM_MIN_KERNEL_ADDRESS, i + 1);
4327 		/*
4328 		 * XXX only need to check toplevel.
4329 		 */
4330 		if (target_nptp > nkptpmax[i])
4331 			panic("out of KVA space");
4332 		KASSERT(target_nptp >= nkptp[i]);
4333 		needed_kptp[i] = target_nptp - nkptp[i];
4334 	}
4335 
4336 	pmap_alloc_level(normal_pdes, pmap_maxkvaddr, PTP_LEVELS, needed_kptp);
4337 
4338 	/*
4339 	 * If the number of top level entries changed, update all
4340 	 * pmaps.
4341 	 */
4342 	if (needed_kptp[PTP_LEVELS - 1] != 0) {
4343 #ifdef XEN
4344 #ifdef __x86_64__
4345 		/* nothing, kernel entries are never entered in user pmap */
4346 #else /* __x86_64__ */
4347 		mutex_enter(&pmaps_lock);
4348 		LIST_FOREACH(pm, &pmaps, pm_list) {
4349 			int pdkidx;
4350 			for (pdkidx =  PDIR_SLOT_KERN + old;
4351 			    pdkidx < PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1];
4352 			    pdkidx++) {
4353 				xpq_queue_pte_update(
4354 				    xpmap_ptom(pmap_pdirpa(pm, pdkidx)),
4355 				    kpm->pm_pdir[pdkidx]);
4356 			}
4357 			xpq_flush_queue();
4358 		}
4359 		mutex_exit(&pmaps_lock);
4360 #endif /* __x86_64__ */
4361 #else /* XEN */
4362 		unsigned newpdes;
4363 		newpdes = nkptp[PTP_LEVELS - 1] - old;
4364 		mutex_enter(&pmaps_lock);
4365 		LIST_FOREACH(pm, &pmaps, pm_list) {
4366 			memcpy(&pm->pm_pdir[PDIR_SLOT_KERN + old],
4367 			       &kpm->pm_pdir[PDIR_SLOT_KERN + old],
4368 			       newpdes * sizeof (pd_entry_t));
4369 		}
4370 		mutex_exit(&pmaps_lock);
4371 #endif
4372 		invalidate = true;
4373 	}
4374 	pmap_maxkvaddr = maxkvaddr;
4375 	mutex_exit(&kpm->pm_lock);
4376 	splx(s);
4377 
4378 	if (invalidate) {
4379 		/* Invalidate the PDP cache. */
4380 		pool_cache_invalidate(&pmap_pdp_cache);
4381 	}
4382 
4383 	return maxkvaddr;
4384 }
4385 
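/*
 * Illustrative sketch: the VM system is expected to call this when a
 * kernel map allocation runs past the current limit and to cache the
 * value returned (uvm_maxkaddr is UVM's name for that cache; the real
 * caller lives outside this file):
 *
 *	if (new_end > uvm_maxkaddr)
 *		uvm_maxkaddr = pmap_growkernel(new_end);
 */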
4386 #ifdef DEBUG
4387 void pmap_dump(struct pmap *, vaddr_t, vaddr_t);
4388 
4389 /*
4390  * pmap_dump: dump all the mappings from a pmap
4391  *
4392  * => caller should not be holding any pmap locks
4393  */
4394 
4395 void
4396 pmap_dump(struct pmap *pmap, vaddr_t sva, vaddr_t eva)
4397 {
4398 	pt_entry_t *ptes, *pte;
4399 	pd_entry_t * const *pdes;
4400 	struct pmap *pmap2;
4401 	vaddr_t blkendva;
4402 
4403 	/*
4404 	 * if end is out of range, truncate it.
4405 	 * if end is not past start, update it to the max.
4406 	 */
4407 
4408 	if (eva > VM_MAXUSER_ADDRESS || eva <= sva)
4409 		eva = VM_MAXUSER_ADDRESS;
4410 
4411 	/*
4412 	 * we lock in the pmap => pv_head direction
4413 	 */
4414 
4415 	kpreempt_disable();
4416 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);	/* locks pmap */
4417 
4418 	/*
4419 	 * dumping a range of pages: we dump in PTP-sized blocks
4420 	 */
4421 
4422 	for (/* null */ ; sva < eva ; sva = blkendva) {
4423 
4424 		/* determine range of block */
4425 		blkendva = x86_round_pdr(sva+1);
4426 		if (blkendva > eva)
4427 			blkendva = eva;
4428 
4429 		/* valid block? */
4430 		if (!pmap_pdes_valid(sva, pdes, NULL))
4431 			continue;
4432 
4433 		pte = &ptes[pl1_i(sva)];
4434 		for (/* null */; sva < blkendva ; sva += PAGE_SIZE, pte++) {
4435 			if (!pmap_valid_entry(*pte))
4436 				continue;
4437 			printf("va %#" PRIxVADDR " -> pa %#" PRIxPADDR
4438 			    " (pte=%#" PRIxPADDR ")\n",
4439 			    sva, (paddr_t)pmap_pte2pa(*pte), (paddr_t)*pte);
4440 		}
4441 	}
4442 	pmap_unmap_ptes(pmap, pmap2);
4443 	kpreempt_enable();
4444 }
4445 #endif
4446 
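/*
 * Illustrative sketch: pmap_dump() is only compiled with DEBUG and is
 * meant to be invoked by hand, e.g. from the kernel debugger, to print
 * the valid PTEs covering part of a process' address space:
 *
 *	pmap_dump(vm_map_pmap(&p->p_vmspace->vm_map), sva, eva);
 */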
4447 /*
4448  * pmap_tlb_shootdown: invalidate pages on all CPUs using pmap 'pm'
4449  *
4450  * => always invalidates locally before returning
4451  * => returns before remote CPUs have invalidated
4452  * => must be called with preemption disabled
4453  */
4454 
4455 void
4456 pmap_tlb_shootdown(struct pmap *pm, vaddr_t sva, vaddr_t eva, pt_entry_t pte)
4457 {
4458 #ifdef MULTIPROCESSOR
4459 	extern bool x86_mp_online;
4460 	struct cpu_info *ci;
4461 	struct pmap_mbox *mb, *selfmb;
4462 	CPU_INFO_ITERATOR cii;
4463 	uintptr_t head;
4464 	u_int count;
4465 	int s;
4466 #endif	/* MULTIPROCESSOR */
4467 	struct cpu_info *self;
4468 	bool kernel;
4469 
4470 	KASSERT(eva == 0 || eva >= sva);
4471 	KASSERT(kpreempt_disabled());
4472 
4473 	if (pte & PG_PS)
4474 		sva &= PG_LGFRAME;
4475 	pte &= PG_G;
4476 	self = curcpu();
4477 
4478 	if (sva == (vaddr_t)-1LL) {
4479 		kernel = true;
4480 	} else {
4481 		if (eva == 0)
4482 			eva = sva + PAGE_SIZE;
4483 		kernel = sva >= VM_MAXUSER_ADDRESS;
4484 		KASSERT(kernel == (eva > VM_MAXUSER_ADDRESS));
4485 	}
4486 
4487 	/*
4488 	 * if tearing down the pmap, do nothing.  we'll flush later
4489 	 * when we're ready to recycle/destroy it.
4490 	 */
4491 	if (__predict_false(curlwp->l_md.md_gc_pmap == pm)) {
4492 		return;
4493 	}
4494 
4495 	/*
4496 	 * If the range is larger than 32 pages, then invalidate
4497 	 * everything.
4498 	 */
4499 	if (sva != (vaddr_t)-1LL && eva - sva > (32 * PAGE_SIZE)) {
4500 		sva = (vaddr_t)-1LL;
4501 		eva = sva;
4502 	}
4503 
4504 #ifdef MULTIPROCESSOR
4505 	if (ncpu > 1 && x86_mp_online) {
4506 		selfmb = &self->ci_pmap_cpu->pc_mbox;
4507 
4508 		/*
4509 		 * If the CPUs have no notion of global pages then
4510 		 * reload of %cr3 is sufficient.
4511 		 * a reload of %cr3 is sufficient.
4512 		if (pte != 0 && (cpu_feature[0] & CPUID_PGE) == 0)
4513 			pte = 0;
4514 
4515 		if (pm == pmap_kernel()) {
4516 			/*
4517 			 * Mapped on all CPUs: use the broadcast mechanism.
4518 			 * Once we have the lock, increment the counter.
4519 			 */
4520 			s = splvm();
4521 			mb = &pmap_mbox;
4522 			count = SPINLOCK_BACKOFF_MIN;
4523 			do {
4524 				if ((head = mb->mb_head) != mb->mb_tail) {
4525 					splx(s);
4526 					while ((head = mb->mb_head) !=
4527 					    mb->mb_tail)
4528 						SPINLOCK_BACKOFF(count);
4529 					s = splvm();
4530 				}
4531 			} while (atomic_cas_ulong(
4532 			    (volatile u_long *)&mb->mb_head,
4533 			    head, head + ncpu - 1) != head);
4534 
4535 			/*
4536 			 * Once underway we must stay at IPL_VM until the
4537 			 * IPI is dispatched.  Otherwise interrupt handlers
4538 			 * on this CPU can deadlock against us.
4539 			 */
4540 			pmap_tlb_evcnt.ev_count++;
4541 			mb->mb_pointer = self;
4542 			mb->mb_addr1 = sva;
4543 			mb->mb_addr2 = eva;
4544 			mb->mb_global = pte;
4545 			x86_ipi(LAPIC_TLB_BCAST_VECTOR, LAPIC_DEST_ALLEXCL,
4546 			    LAPIC_DLMODE_FIXED);
4547 			self->ci_need_tlbwait = 1;
4548 			splx(s);
4549 		} else if ((pm->pm_cpus & ~self->ci_cpumask) != 0 ||
4550 		    (kernel && (pm->pm_kernel_cpus & ~self->ci_cpumask) != 0)) {
4551 			/*
4552 			 * We don't bother traversing the CPU list if only
4553 			 * used by this CPU.
4554 			 *
4555 			 * We can't do global flushes with the multicast
4556 			 * mechanism.
4557 			 */
4558 			KASSERT(pte == 0);
4559 
4560 			/*
4561 			 * Take ownership of the shootdown mailbox on each
4562 			 * CPU, fill the details and fire it off.
4563 			 */
4564 			s = splvm();
4565 			for (CPU_INFO_FOREACH(cii, ci)) {
4566 				if (ci == self ||
4567 				    !pmap_is_active(pm, ci, kernel) ||
4568 				    !(ci->ci_flags & CPUF_RUNNING))
4569 					continue;
4570 				selfmb->mb_head++;
4571 				mb = &ci->ci_pmap_cpu->pc_mbox;
4572 				count = SPINLOCK_BACKOFF_MIN;
4573 				while (atomic_cas_ulong(
4574 				    (u_long *)&mb->mb_pointer,
4575 				    0, (u_long)&selfmb->mb_tail) != 0) {
4576 				    	splx(s);
4577 					while (mb->mb_pointer != 0)
4578 						SPINLOCK_BACKOFF(count);
4579 					s = splvm();
4580 				}
4581 				mb->mb_addr1 = sva;
4582 				mb->mb_addr2 = eva;
4583 				mb->mb_global = pte;
4584 				if (x86_ipi(LAPIC_TLB_MCAST_VECTOR,
4585 				    ci->ci_cpuid, LAPIC_DLMODE_FIXED))
4586 					panic("pmap_tlb_shootdown: ipi failed");
4587 			}
4588 			self->ci_need_tlbwait = 1;
4589 			splx(s);
4590 		}
4591 	}
4592 #endif	/* MULTIPROCESSOR */
4593 
4594 	/* Update the current CPU before waiting for others. */
4595 	if (!pmap_is_active(pm, self, kernel))
4596 		return;
4597 
4598 	if (sva == (vaddr_t)-1LL) {
4599 		u_int gen = uvm_emap_gen_return();
4600 		if (pte != 0) {
4601 			tlbflushg();
4602 		} else {
4603 			tlbflush();
4604 		}
4605 		uvm_emap_update(gen);
4606 	} else {
4607 		do {
4608 			pmap_update_pg(sva);
4609 			sva += PAGE_SIZE;
4610 		} while (sva < eva);
4611 	}
4612 }
4613 
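/*
 * Illustrative sketch: the pattern used throughout this file is to
 * update a PTE atomically and then queue a shootdown for the old entry,
 * all with preemption disabled:
 *
 *	kpreempt_disable();
 *	opte = pmap_pte_testset(ptep, npte);
 *	if ((opte & (PG_V | PG_U)) == (PG_V | PG_U))
 *		pmap_tlb_shootdown(pmap, va, 0, opte);
 *	kpreempt_enable();
 *
 * Passing sva == (vaddr_t)-1 requests a full TLB flush; the pte
 * argument is only examined for PG_PS and PG_G, as seen above.
 */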
4614 /*
4615  * pmap_tlb_shootwait: wait for pending TLB shootdowns to complete
4616  *
4617  * => only waits for operations generated by the current CPU
4618  * => must be called with preemption disabled
4619  */
4620 
4621 void
4622 pmap_tlb_shootwait(void)
4623 {
4624 	struct cpu_info *self;
4625 	struct pmap_mbox *mb;
4626 
4627 	KASSERT(kpreempt_disabled());
4628 
4629 	/*
4630 	 * Anything to do?  XXX Really we want to avoid touching the cache
4631 	 * lines of the two mailboxes, but the processor may read ahead.
4632 	 */
4633 	self = curcpu();
4634 	if (!self->ci_need_tlbwait)
4635 		return;
4636 	self->ci_need_tlbwait = 0;
4637 
4638 	/* If we own the global mailbox, wait for it to drain. */
4639 	mb = &pmap_mbox;
4640 	while (mb->mb_pointer == self && mb->mb_head != mb->mb_tail)
4641 		x86_pause();
4642 
4643 	/* If we own other CPU's mailboxes, wait for them to drain. */
4644 	mb = &self->ci_pmap_cpu->pc_mbox;
4645 	KASSERT(mb->mb_pointer != &mb->mb_tail);
4646 	while (mb->mb_head != mb->mb_tail)
4647 		x86_pause();
4648 }
4649 
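/*
 * Illustrative sketch: callers that must know the remote TLBs are
 * consistent before proceeding (e.g. before freeing a page whose
 * mappings were just removed) pair the two primitives:
 *
 *	kpreempt_disable();
 *	pmap_tlb_shootdown(pmap, va, 0, opte);
 *	pmap_tlb_shootwait();
 *	kpreempt_enable();
 */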
4650 /*
4651  * pmap_update: process deferred invalidations
4652  */
4653 
4654 void
4655 pmap_update(struct pmap *pmap)
4656 {
4657 	struct vm_page *ptp, *empty_ptps;
4658 	struct pmap_page *pp;
4659 	lwp_t *l;
4660 
4661 	/*
4662 	 * if we have torn down this pmap, invalidate non-global TLB
4663 	 * entries on any processors using it.
4664 	 */
4665 	l = curlwp;
4666 	if (__predict_false(l->l_md.md_gc_pmap == pmap)) {
4667 		l->l_md.md_gc_pmap = NULL;
4668 		KPREEMPT_DISABLE(l);
4669 		pmap_tlb_shootdown(pmap, -1, -1, 0);
4670 		KPREEMPT_ENABLE(l);
4671 	}
4672 
4673 	/*
4674 	 * wait for tlb shootdowns to complete before returning control
4675 	 * to the caller.
4676 	 */
4677 	kpreempt_disable();
4678 	pmap_tlb_shootwait();
4679 	kpreempt_enable();
4680 
4681 	/*
4682 	 * now that shootdowns are complete, process deferred frees,
4683 	 * but not from interrupt context.
4684 	 */
4685 	if (l->l_md.md_gc_ptp != NULL) {
4686 		if (cpu_intr_p() || (l->l_pflag & LP_INTR) != 0) {
4687 			return;
4688 		}
4689 
4690 		empty_ptps = l->l_md.md_gc_ptp;
4691 		l->l_md.md_gc_ptp = NULL;
4692 
4693 		while ((ptp = empty_ptps) != NULL) {
4694 			ptp->flags |= PG_ZERO;
4695 			pp = VM_PAGE_TO_PP(ptp);
4696 			empty_ptps = pp->pp_link;
4697 			LIST_INIT(&pp->pp_head.pvh_list);
4698 			uvm_pagefree(ptp);
4699 		}
4700 	}
4701 }
4702 
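/*
 * Illustrative sketch: kernel-mapping helpers batch their PTE updates
 * and finish with pmap_update() so that the deferred shootdowns and
 * PTP frees handled above actually take place:
 *
 *	for (; va < eva; va += PAGE_SIZE, pa += PAGE_SIZE)
 *		pmap_kenter_pa(va, pa, VM_PROT_READ | VM_PROT_WRITE, 0);
 *	pmap_update(pmap_kernel());
 */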
4703 #if PTP_LEVELS > 4
4704 #error "Unsupported number of page table levels"
4705 #endif
4706 
4707 paddr_t
4708 pmap_init_tmp_pgtbl(paddr_t pg)
4709 {
4710 	static bool maps_loaded;
4711 	static const paddr_t x86_tmp_pml_paddr[] = {
4712 	    4 * PAGE_SIZE,
4713 	    5 * PAGE_SIZE,
4714 	    6 * PAGE_SIZE,
4715 	    7 * PAGE_SIZE
4716 	};
4717 	static vaddr_t x86_tmp_pml_vaddr[] = { 0, 0, 0, 0 };
4718 
4719 	pd_entry_t *tmp_pml, *kernel_pml;
4720 
4721 	int level;
4722 
4723 	if (!maps_loaded) {
4724 		for (level = 0; level < PTP_LEVELS; ++level) {
4725 			x86_tmp_pml_vaddr[level] =
4726 			    uvm_km_alloc(kernel_map, PAGE_SIZE, 0,
4727 			    UVM_KMF_VAONLY);
4728 
4729 			if (x86_tmp_pml_vaddr[level] == 0)
4730 				panic("mapping of real mode PML failed\n");
4731 			pmap_kenter_pa(x86_tmp_pml_vaddr[level],
4732 			    x86_tmp_pml_paddr[level],
4733 			    VM_PROT_READ | VM_PROT_WRITE, 0);
4734 			pmap_update(pmap_kernel());
4735 		}
4736 		maps_loaded = true;
4737 	}
4738 
4739 	/* Zero all levels below the topmost one */
4740 	for (level = 0; level < PTP_LEVELS - 1; ++level) {
4741 		tmp_pml = (void *)x86_tmp_pml_vaddr[level];
4742 		memset(tmp_pml, 0, PAGE_SIZE);
4743 	}
4744 
4745 	/* Copy the topmost level (the PML4 on amd64) */
4746 	kernel_pml = pmap_kernel()->pm_pdir;
4747 	tmp_pml = (void *)x86_tmp_pml_vaddr[PTP_LEVELS - 1];
4748 	memcpy(tmp_pml, kernel_pml, PAGE_SIZE);
4749 
4750 #ifdef PAE
4751 	/*
4752 	 * Use the last 4 entries of the L2 page as L3 PD entries. These
4753 	 * last entries are unlikely to be used for temporary mappings.
4754 	 * 508: maps 0->1GB (userland)
4755 	 * 509: unused
4756 	 * 510: unused
4757 	 * 511: maps 3->4GB (kernel)
4758 	 */
4759 	tmp_pml[508] = x86_tmp_pml_paddr[PTP_LEVELS - 1] | PG_V;
4760 	tmp_pml[509] = 0;
4761 	tmp_pml[510] = 0;
4762 	tmp_pml[511] = pmap_pdirpa(pmap_kernel(),PDIR_SLOT_KERN) | PG_V;
4763 #endif
4764 
4765 	for (level = PTP_LEVELS - 1; level > 0; --level) {
4766 		tmp_pml = (void *)x86_tmp_pml_vaddr[level];
4767 
4768 		tmp_pml[pl_i(pg, level + 1)] =
4769 		    (x86_tmp_pml_paddr[level - 1] & PG_FRAME) | PG_RW | PG_V;
4770 	}
4771 
4772 	tmp_pml = (void *)x86_tmp_pml_vaddr[0];
4773 	tmp_pml[pl_i(pg, 1)] = (pg & PG_FRAME) | PG_RW | PG_V;
4774 
4775 #ifdef PAE
4776 	/* Return the PA of the L3 page (entry 508 of the L2 page) */
4777 	return x86_tmp_pml_paddr[PTP_LEVELS - 1] + 508 * sizeof(pd_entry_t);
4778 #endif
4779 
4780 	return x86_tmp_pml_paddr[PTP_LEVELS - 1];
4781 }
4782
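/*
 * Illustrative sketch: code that briefly needs a minimal page table
 * (for instance a low-memory trampoline used to start another CPU or
 * to resume from sleep -- an assumption, the callers live outside this
 * file) loads the returned physical address into %cr3:
 *
 *	paddr_t tmp_pdirpa;
 *
 *	tmp_pdirpa = pmap_init_tmp_pgtbl(trampoline_paddr);
 *	(the trampoline then loads tmp_pdirpa into %cr3)
 *
 * trampoline_paddr stands for the physical page the caller wants
 * mapped at its own address alongside the kernel mappings.
 */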