1 /*	$NetBSD: pmap.c,v 1.105 2010/02/26 19:25:07 jym Exp $	*/
2 
3 /*
4  * Copyright (c) 2007 Manuel Bouyer.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  * 1. Redistributions of source code must retain the above copyright
10  *    notice, this list of conditions and the following disclaimer.
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in the
13  *    documentation and/or other materials provided with the distribution.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
16  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
17  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
18  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
19  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
20  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
21  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
22  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
24  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25  *
26  */
27 
28 /*
29  * Copyright (c) 2006 Mathieu Ropert <mro@adviseo.fr>
30  *
31  * Permission to use, copy, modify, and distribute this software for any
32  * purpose with or without fee is hereby granted, provided that the above
33  * copyright notice and this permission notice appear in all copies.
34  *
35  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
36  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
37  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
38  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
39  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
40  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
41  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
42  */
43 
44 /*
45  *
46  * Copyright (c) 1997 Charles D. Cranor and Washington University.
47  * All rights reserved.
48  *
49  * Redistribution and use in source and binary forms, with or without
50  * modification, are permitted provided that the following conditions
51  * are met:
52  * 1. Redistributions of source code must retain the above copyright
53  *    notice, this list of conditions and the following disclaimer.
54  * 2. Redistributions in binary form must reproduce the above copyright
55  *    notice, this list of conditions and the following disclaimer in the
56  *    documentation and/or other materials provided with the distribution.
57  * 3. All advertising materials mentioning features or use of this software
58  *    must display the following acknowledgement:
59  *      This product includes software developed by Charles D. Cranor and
60  *      Washington University.
61  * 4. The name of the author may not be used to endorse or promote products
62  *    derived from this software without specific prior written permission.
63  *
64  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
65  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
66  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
67  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
68  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
69  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
70  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
71  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
72  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
73  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
74  */
75 
76 /*
77  * Copyright 2001 (c) Wasabi Systems, Inc.
78  * All rights reserved.
79  *
80  * Written by Frank van der Linden for Wasabi Systems, Inc.
81  *
82  * Redistribution and use in source and binary forms, with or without
83  * modification, are permitted provided that the following conditions
84  * are met:
85  * 1. Redistributions of source code must retain the above copyright
86  *    notice, this list of conditions and the following disclaimer.
87  * 2. Redistributions in binary form must reproduce the above copyright
88  *    notice, this list of conditions and the following disclaimer in the
89  *    documentation and/or other materials provided with the distribution.
90  * 3. All advertising materials mentioning features or use of this software
91  *    must display the following acknowledgement:
92  *      This product includes software developed for the NetBSD Project by
93  *      Wasabi Systems, Inc.
94  * 4. The name of Wasabi Systems, Inc. may not be used to endorse
95  *    or promote products derived from this software without specific prior
96  *    written permission.
97  *
98  * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
99  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
100  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
101  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL WASABI SYSTEMS, INC
102  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
103  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
104  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
105  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
106  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
107  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
108  * POSSIBILITY OF SUCH DAMAGE.
109  */
110 
111 /*
112  * This is the i386 pmap modified and generalized to support x86-64
113  * as well. The idea is to hide the upper N levels of the page tables
114  * inside pmap_get_ptp, pmap_free_ptp and pmap_growkernel. The rest
115  * is mostly untouched, except that it uses some more generalized
116  * macros and interfaces.
117  *
118  * This pmap has been tested on the i386 as well, and it can be easily
119  * adapted to PAE.
120  *
121  * fvdl@wasabisystems.com 18-Jun-2001
122  */
123 
124 /*
125  * pmap.c: i386 pmap module rewrite
126  * Chuck Cranor <chuck@ccrc.wustl.edu>
127  * 11-Aug-97
128  *
129  * history of this pmap module: in addition to my own input, i used
130  *    the following references for this rewrite of the i386 pmap:
131  *
132  * [1] the NetBSD i386 pmap.   this pmap appears to be based on the
133  *     BSD hp300 pmap done by Mike Hibler at University of Utah.
134  *     it was then ported to the i386 by William Jolitz of UUNET
135  *     Technologies, Inc.   Then Charles M. Hannum of the NetBSD
136  *     project fixed some bugs and provided some speed ups.
137  *
138  * [2] the FreeBSD i386 pmap.   this pmap seems to be the
139  *     Hibler/Jolitz pmap, as modified for FreeBSD by John S. Dyson
140  *     and David Greenman.
141  *
142  * [3] the Mach pmap.   this pmap, from CMU, seems to have migrated
143  *     between several processors.   the VAX version was done by
144  *     Avadis Tevanian, Jr., and Michael Wayne Young.    the i386
145  *     version was done by Lance Berc, Mike Kupfer, Bob Baron,
146  *     David Golub, and Richard Draves.    the alpha version was
147  *     done by Alessandro Forin (CMU/Mach) and Chris Demetriou
148  *     (NetBSD/alpha).
149  */
150 
151 #include <sys/cdefs.h>
152 __KERNEL_RCSID(0, "$NetBSD: pmap.c,v 1.105 2010/02/26 19:25:07 jym Exp $");
153 
154 #include "opt_user_ldt.h"
155 #include "opt_lockdebug.h"
156 #include "opt_multiprocessor.h"
157 #include "opt_xen.h"
158 #if !defined(__x86_64__)
159 #include "opt_kstack_dr0.h"
160 #endif /* !defined(__x86_64__) */
161 
162 #include <sys/param.h>
163 #include <sys/systm.h>
164 #include <sys/proc.h>
165 #include <sys/pool.h>
166 #include <sys/kernel.h>
167 #include <sys/atomic.h>
168 #include <sys/cpu.h>
169 #include <sys/intr.h>
170 #include <sys/xcall.h>
171 
172 #include <uvm/uvm.h>
173 
174 #include <dev/isa/isareg.h>
175 
176 #include <machine/specialreg.h>
177 #include <machine/gdt.h>
178 #include <machine/isa_machdep.h>
179 #include <machine/cpuvar.h>
180 
181 #include <x86/pmap.h>
182 #include <x86/pmap_pv.h>
183 
184 #include <x86/i82489reg.h>
185 #include <x86/i82489var.h>
186 
187 #ifdef XEN
188 #include <xen/xen3-public/xen.h>
189 #include <xen/hypervisor.h>
190 #endif
191 
192 /* flag to be used for kernel mappings: PG_u on Xen/amd64, 0 otherwise */
193 #if defined(XEN) && defined(__x86_64__)
194 #define PG_k PG_u
195 #else
196 #define PG_k 0
197 #endif
198 
199 /*
200  * general info:
201  *
202  *  - for an explanation of how the i386 MMU hardware works see
203  *    the comments in <machine/pte.h>.
204  *
205  *  - for an explanation of the general memory structure used by
206  *    this pmap (including the recursive mapping), see the comments
207  *    in <machine/pmap.h>.
208  *
209  * this file contains the code for the "pmap module."   the module's
210  * job is to manage the hardware's virtual to physical address mappings.
211  * note that there are two levels of mapping in the VM system:
212  *
213  *  [1] the upper layer of the VM system uses vm_map's and vm_map_entry's
214  *      to map ranges of virtual address space to objects/files.  for
215  *      example, the vm_map may say: "map VA 0x1000 to 0x22000 read-only
216  *      to the file /bin/ls starting at offset zero."   note that
217  *      the upper layer mapping is not concerned with how individual
218  *      vm_pages are mapped.
219  *
220  *  [2] the lower layer of the VM system (the pmap) maintains the mappings
221  *      from virtual addresses to physical pages.   it is concerned with
222  *      which vm_page is mapped where.   for example, when you run /bin/ls
223  *      and start at page 0x1000, the fault routine may look up the correct
224  *      page of the /bin/ls file and then ask the pmap layer to establish
225  *      a mapping for it.
226  *
227  * note that information in the lower layer of the VM system can be
228  * thrown away since it can easily be reconstructed from the info
229  * in the upper layer.
230  *
231  * data structures we use include:
232  *
233  *  - struct pmap: describes the address space of one thread
234  *  - struct pv_entry: describes one <PMAP,VA> mapping of a PA
235  *  - struct pv_head: there is one pv_head per managed page of
236  *	physical memory.   the pv_head points to a list of pv_entry
237  *	structures which describe all the <PMAP,VA> pairs that this
238  *      page is mapped in.    this is critical for page based operations
239  *      such as pmap_page_protect() [change protection on _all_ mappings
240  *      of a page]
241  */
242 
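/*
 * Illustrative sketch (not part of the build): how the two layers
 * interact.  A fault on a managed page ends with the upper layer asking
 * this module to establish the VA => vm_page translation.  The wrapper
 * name and its error handling are hypothetical; pmap_enter() and
 * pmap_update() are the real interfaces involved.
 */
#if 0	/* example only */
static int
example_map_fault_page(struct vm_map *map, vaddr_t va, struct vm_page *pg,
    vm_prot_t prot)
{
	struct pmap *pmap = vm_map_pmap(map);
	int error;

	/* lower layer: enter one VA -> vm_page translation */
	error = pmap_enter(pmap, va, VM_PAGE_TO_PHYS(pg), prot,
	    prot | PMAP_CANFAIL);
	if (error) {
		/* out of resources; the upper layer may retry later */
		return error;
	}

	/* flush deferred TLB work before relying on the new mapping */
	pmap_update(pmap);
	return 0;
}
#endif	/* example only */
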
243 /*
244  * memory allocation
245  *
246  *  - there are three data structures that we must dynamically allocate:
247  *
248  * [A] new process' page directory page (PDP)
249  *	- allocated at pmap_create() time: we use
250  *	  uvm_km_alloc(kernel_map, PAGE_SIZE)  [fka kmem_alloc] to do this
251  *	  allocation.
252  *
253  * if we are low in free physical memory then we sleep in
254  * uvm_km_alloc -- in this case this is ok since we are creating
255  * a new pmap and should not be holding any locks.
256  *
257  * if the kernel is totally out of virtual space
258  * (i.e. uvm_km_alloc returns NULL), then we panic.
259  *
260  * [B] new page tables pages (PTP)
261  * 	- call uvm_pagealloc()
262  * 		=> success: zero page, add to pm_pdir
263  * 		=> failure: we are out of free vm_pages, let pmap_enter()
264  *		   tell UVM about it.
265  *
266  * note: for kernel PTPs, we start with NKPTP of them.   as we map
267  * kernel memory (at uvm_map time) we check to see if we've grown
268  * the kernel pmap.   if so, we call the optional function
269  * pmap_growkernel() to grow the kernel PTPs in advance.
270  *
271  * [C] pv_entry structures - allocated from the pmap_pv_cache pool cache
272  */
273 
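/*
 * Illustrative sketch (not part of the build) of the PTP allocation
 * described in [B] above, roughly what pmap_get_ptp() does.  The wrapper
 * function is hypothetical and the object locking is omitted.
 */
#if 0	/* example only */
static struct vm_page *
example_alloc_ptp(struct pmap *pmap, vaddr_t va, int level)
{
	struct vm_page *ptp;
	const int lidx = level - 1;

	ptp = uvm_pagealloc(&pmap->pm_obj[lidx], ptp_va2o(va, level),
	    NULL, UVM_PGA_USERESERVE | UVM_PGA_ZERO);
	if (ptp == NULL) {
		/* out of free vm_pages: pmap_enter() lets UVM know */
		return NULL;
	}
	ptp->flags &= ~PG_BUSY;		/* never busy */
	ptp->wire_count = 1;		/* no mappings yet */
	return ptp;
}
#endif	/* example only */
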
274 /*
275  * locking
276  *
277  * we have the following locks that we must contend with:
278  *
279  * mutexes:
280  *
281  * - pmap lock (per pmap, part of uvm_object)
282  *   this lock protects the fields in the pmap structure including
283  *   the non-kernel PDEs in the PDP, and the PTEs.  it also locks
284  *   in the alternate PTE space (since that is determined by the
285  *   entry in the PDP).
286  *
287  * - pvh_lock (per pv_head)
288  *   this lock protects the pv_entry list which is chained off the
289  *   pv_head structure for a specific managed PA.   it is locked
290  *   when traversing the list (e.g. adding/removing mappings,
291  *   syncing R/M bits, etc.)
292  *
293  * - pmaps_lock
294  *   this lock protects the list of active pmaps (headed by "pmaps").
295  *   we lock it when adding or removing pmaps from this list.
296  *
297  * tlb shootdown
298  *
299  * tlb shootdowns are hard interrupts that operate outside the spl
300  * framework: they don't need to be blocked provided that the pmap module
301  * gets the order of events correct.  the calls are made by talking directly
302  * to the lapic.  the stubs to handle the interrupts are quite short and do
303  * one of the following: invalidate a single page, a range of pages, all
304  * user tlb entries or the entire tlb.
305  *
306  * the cpus synchronize with each other using pmap_mbox structures which are
307  * aligned on 64-byte cache lines.  tlb shootdowns against the kernel pmap
308  * use a global mailbox and are generated using a broadcast ipi (broadcast
309  * to all but the sending cpu).  shootdowns against regular pmaps use
310  * per-cpu mailboxes and are multicast.  kernel and user shootdowns can
311  * execute simultaneously, as can shootdowns within different multithreaded
312  * processes.  TODO:
313  *
314  *   1. figure out which waitpoints can be deferred to pmap_update().
315  *   2. see if there is a cheap way to batch some updates.
316  */
317 
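/*
 * Illustrative sketch (not part of the build) of the shootdown protocol
 * described above: post an invalidation for a single page covered by an
 * old PTE and, when the wait cannot be deferred to pmap_update(), spin
 * until the other CPUs have acknowledged.  Both calls are used
 * throughout this file; "pmap", "va" and "opte" are assumed in scope.
 */
#if 0	/* example only */
	kpreempt_disable();
	pmap_tlb_shootdown(pmap, va, 0, opte);	/* post IPI via the mailbox */
	pmap_tlb_shootwait();			/* wait for all CPUs to ack */
	kpreempt_enable();
#endif	/* example only */
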
318 const vaddr_t ptp_masks[] = PTP_MASK_INITIALIZER;
319 const int ptp_shifts[] = PTP_SHIFT_INITIALIZER;
320 const long nkptpmax[] = NKPTPMAX_INITIALIZER;
321 const long nbpd[] = NBPD_INITIALIZER;
322 pd_entry_t * const normal_pdes[] = PDES_INITIALIZER;
323 pd_entry_t * const alternate_pdes[] = APDES_INITIALIZER;
324 
325 long nkptp[] = NKPTP_INITIALIZER;
326 
327 static kmutex_t pmaps_lock;
328 
329 static vaddr_t pmap_maxkvaddr;
330 
331 #define COUNT(x)	/* nothing */
332 
333 /*
334  * XXX kludge: dummy locking to make KASSERTs in uvm_page.c comfortable.
335  * actual locking is done by pm_lock.
336  */
337 #if defined(DIAGNOSTIC)
338 #define	PMAP_SUBOBJ_LOCK(pm, idx)	do { \
339 	KASSERT(mutex_owned(&(pm)->pm_lock)); \
340 	if ((idx) != 0) \
341 		mutex_enter(&(pm)->pm_obj[(idx)].vmobjlock); } while (0)
342 #define	PMAP_SUBOBJ_UNLOCK(pm, idx)	do { \
343 	KASSERT(mutex_owned(&(pm)->pm_lock)); \
344 	if ((idx) != 0) \
345 		mutex_exit(&(pm)->pm_obj[(idx)].vmobjlock); } while (0)
346 #else /* defined(DIAGNOSTIC) */
347 #define	PMAP_SUBOBJ_LOCK(pm, idx)	/* nothing */
348 #define	PMAP_SUBOBJ_UNLOCK(pm, idx)	/* nothing */
349 #endif /* defined(DIAGNOSTIC) */
350 
351 /*
352  * Misc. event counters.
353  */
354 struct evcnt pmap_iobmp_evcnt;
355 struct evcnt pmap_ldt_evcnt;
356 
357 /*
358  * Global TLB shootdown mailbox.
359  */
360 struct evcnt pmap_tlb_evcnt __aligned(64);
361 struct pmap_mbox pmap_mbox __aligned(64);
362 
363 /*
364  * Per-CPU data.  The pmap mailbox is cache-intensive, so it gets its
365  * own cache line.  Note that the mailbox must be the first item.
366  */
367 struct pmap_cpu {
368 	/* TLB shootdown */
369 	struct pmap_mbox pc_mbox;
370 };
371 
372 union {
373 	struct pmap_cpu pc;
374 	uint8_t padding[64];
375 } pmap_cpu[MAXCPUS] __aligned(64);
376 
377 /*
378  * global data structures
379  */
380 
381 static struct pmap kernel_pmap_store;	/* the kernel's pmap (proc0) */
382 struct pmap *const kernel_pmap_ptr = &kernel_pmap_store;
383 
384 /*
385  * pmap_pg_g: if our processor supports PG_G in the PTE then we
386  * set pmap_pg_g to PG_G (otherwise it is zero).
387  */
388 
389 int pmap_pg_g = 0;
390 
391 /*
392  * pmap_largepages: if our processor supports PG_PS and we are
393  * using it, this is set to true.
394  */
395 
396 int pmap_largepages;
397 
398 /*
399  * i386 physical memory comes in a big contig chunk with a small
400  * hole toward the front of it...  the following two paddr_t's
401  * (shared with machdep.c) describe the physical address space
402  * of this machine.
403  */
404 paddr_t avail_start;	/* PA of first available physical page */
405 paddr_t avail_end;	/* PA of last available physical page */
406 
407 #ifdef XEN
408 #ifdef __x86_64__
409 /* Dummy PGD for user cr3, used between pmap_deactivate() and pmap_activate() */
410 static paddr_t xen_dummy_user_pgd;
411 /* Currently active user PGD (can't use rcr3()) */
412 static paddr_t xen_current_user_pgd = 0;
413 #endif /* __x86_64__ */
414 paddr_t pmap_pa_start; /* PA of first physical page for this domain */
415 paddr_t pmap_pa_end;   /* PA of last physical page for this domain */
416 #endif /* XEN */
417 
418 #define	VM_PAGE_TO_PP(pg)	(&(pg)->mdpage.mp_pp)
419 
420 #define	pp_lock(pp)	mutex_spin_enter(&(pp)->pp_lock)
421 #define	pp_unlock(pp)	mutex_spin_exit(&(pp)->pp_lock)
422 #define	pp_locked(pp)	mutex_owned(&(pp)->pp_lock)
423 
424 #define	PV_HASH_SIZE		32768
425 #define	PV_HASH_LOCK_CNT	32
426 
427 struct pv_hash_lock {
428 	kmutex_t lock;
429 } __aligned(CACHE_LINE_SIZE) pv_hash_locks[PV_HASH_LOCK_CNT]
430     __aligned(CACHE_LINE_SIZE);
431 
432 struct pv_hash_head {
433 	SLIST_HEAD(, pv_entry) hh_list;
434 } pv_hash_heads[PV_HASH_SIZE];
435 
436 static u_int
437 pvhash_hash(struct vm_page *ptp, vaddr_t va)
438 {
439 
440 	return (uintptr_t)ptp / sizeof(*ptp) + (va >> PAGE_SHIFT);
441 }
442 
443 static struct pv_hash_head *
444 pvhash_head(u_int hash)
445 {
446 
447 	return &pv_hash_heads[hash % PV_HASH_SIZE];
448 }
449 
450 static kmutex_t *
451 pvhash_lock(u_int hash)
452 {
453 
454 	return &pv_hash_locks[hash % PV_HASH_LOCK_CNT].lock;
455 }
456 
457 static struct pv_entry *
458 pvhash_remove(struct pv_hash_head *hh, struct vm_page *ptp, vaddr_t va)
459 {
460 	struct pv_entry *pve;
461 	struct pv_entry *prev;
462 
463 	prev = NULL;
464 	SLIST_FOREACH(pve, &hh->hh_list, pve_hash) {
465 		if (pve->pve_pte.pte_ptp == ptp &&
466 		    pve->pve_pte.pte_va == va) {
467 			if (prev != NULL) {
468 				SLIST_REMOVE_AFTER(prev, pve_hash);
469 			} else {
470 				SLIST_REMOVE_HEAD(&hh->hh_list, pve_hash);
471 			}
472 			break;
473 		}
474 		prev = pve;
475 	}
476 	return pve;
477 }
478 
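/*
 * Illustrative sketch (not part of the build): how the hash helpers
 * above are used together when hanging a pv_entry off a page.  This is
 * the pattern insert_pv() follows later in this file; "pve", "ptp" and
 * "va" are assumed to be in scope.
 */
#if 0	/* example only */
	u_int hash = pvhash_hash(ptp, va);
	kmutex_t *lock = pvhash_lock(hash);
	struct pv_hash_head *hh = pvhash_head(hash);

	mutex_spin_enter(lock);
	SLIST_INSERT_HEAD(&hh->hh_list, pve, pve_hash);
	mutex_spin_exit(lock);
#endif	/* example only */
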
479 /*
480  * other data structures
481  */
482 
483 static pt_entry_t protection_codes[8];	/* maps MI prot to i386 prot code */
484 static bool pmap_initialized = false;	/* pmap_init done yet? */
485 
486 /*
487  * the following two vaddr_t's are used during system startup
488  * to keep track of how much of the kernel's VM space we have used.
489  * once the system is started, the management of the remaining kernel
490  * VM space is turned over to the kernel_map vm_map.
491  */
492 
493 static vaddr_t virtual_avail;	/* VA of first free KVA */
494 static vaddr_t virtual_end;	/* VA of last free KVA */
495 
496 /*
497  * linked list of all non-kernel pmaps
498  */
499 
500 static struct pmap_head pmaps;
501 
502 /*
503  * pool that pmap structures are allocated from
504  */
505 
506 static struct pool_cache pmap_cache;
507 
508 /*
509  * pv_entry cache
510  */
511 
512 static struct pool_cache pmap_pv_cache;
513 
514 /*
515  * MULTIPROCESSOR: special VAs/PTEs are actually allocated inside a
516  * maxcpus*NPTECL array of PTE's, to avoid cache line thrashing
517  * due to false sharing.
518  */
519 
520 #ifdef MULTIPROCESSOR
521 #define PTESLEW(pte, id) ((pte)+(id)*NPTECL)
522 #define VASLEW(va,id) ((va)+(id)*NPTECL*PAGE_SIZE)
523 #else
524 #define PTESLEW(pte, id) (pte)
525 #define VASLEW(va,id) (va)
526 #endif
527 
528 /*
529  * special VAs and the PTEs that map them
530  */
531 static pt_entry_t *csrc_pte, *cdst_pte, *zero_pte, *ptp_pte, *early_zero_pte;
532 static char *csrcp, *cdstp, *zerop, *ptpp, *early_zerop;
533 
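/*
 * Illustrative sketch (not part of the build): zeroing a physical page
 * through the per-CPU slot of the slewed window, in the style of the
 * page zero/copy helpers later in this file.  "pa" is assumed to be in
 * scope and preemption to be disabled.
 */
#if 0	/* example only */
	const int id = cpu_number();
	pt_entry_t *zpte = PTESLEW(zero_pte, id);
	void *zva = VASLEW(zerop, id);

	pmap_pte_set(zpte, pmap_pa2pte(pa) | PG_V | PG_RW | PG_k);
	pmap_pte_flush();
	pmap_update_pg((vaddr_t)zva);		/* flush a stale TLB entry */
	memset(zva, 0, PAGE_SIZE);
	pmap_pte_set(zpte, 0);			/* tear the window down */
	pmap_pte_flush();
#endif	/* example only */
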
534 /*
535  * pool and cache that PDPs are allocated from
536  */
537 
538 static struct pool_cache pmap_pdp_cache;
539 int	pmap_pdp_ctor(void *, void *, int);
540 void	pmap_pdp_dtor(void *, void *);
541 #ifdef PAE
542 /* need to allocate items of 4 pages */
543 void *pmap_pdp_alloc(struct pool *, int);
544 void pmap_pdp_free(struct pool *, void *);
545 static struct pool_allocator pmap_pdp_allocator = {
546 	.pa_alloc = pmap_pdp_alloc,
547 	.pa_free = pmap_pdp_free,
548 	.pa_pagesz = PAGE_SIZE * PDP_SIZE,
549 };
550 #endif /* PAE */
551 
552 void *vmmap; /* XXX: used by mem.c... it should really uvm_map_reserve it */
553 
554 extern vaddr_t idt_vaddr;			/* we allocate IDT early */
555 extern paddr_t idt_paddr;
556 
557 #ifdef _LP64
558 extern vaddr_t lo32_vaddr;
559 extern vaddr_t lo32_paddr;
560 #endif
561 
562 extern int end;
563 
564 #ifdef i386
565 /* stuff to fix the pentium f00f bug */
566 extern vaddr_t pentium_idt_vaddr;
567 #endif
568 
569 
570 /*
571  * local prototypes
572  */
573 
574 static struct vm_page	*pmap_get_ptp(struct pmap *, vaddr_t,
575 				      pd_entry_t * const *);
576 static struct vm_page	*pmap_find_ptp(struct pmap *, vaddr_t, paddr_t, int);
577 static void		 pmap_freepage(struct pmap *, struct vm_page *, int);
578 static void		 pmap_free_ptp(struct pmap *, struct vm_page *,
579 				       vaddr_t, pt_entry_t *,
580 				       pd_entry_t * const *);
581 static bool		 pmap_is_curpmap(struct pmap *);
582 static bool		 pmap_is_active(struct pmap *, struct cpu_info *, bool);
583 static void		 pmap_map_ptes(struct pmap *, struct pmap **,
584 				       pt_entry_t **, pd_entry_t * const **);
585 static bool		 pmap_remove_pte(struct pmap *, struct vm_page *,
586 					 pt_entry_t *, vaddr_t,
587 					 struct pv_entry **);
588 static pt_entry_t	 pmap_remove_ptes(struct pmap *, struct vm_page *,
589 					  vaddr_t, vaddr_t, vaddr_t,
590 					  struct pv_entry **);
591 
592 static void		 pmap_unmap_ptes(struct pmap *, struct pmap *);
593 static void		 pmap_unmap_apdp(void);
594 static bool		 pmap_get_physpage(vaddr_t, int, paddr_t *);
595 static int		 pmap_pdes_invalid(vaddr_t, pd_entry_t * const *,
596 					   pd_entry_t *);
597 #define	pmap_pdes_valid(va, pdes, lastpde)	\
598 	(pmap_pdes_invalid((va), (pdes), (lastpde)) == 0)
599 static void		 pmap_alloc_level(pd_entry_t * const *, vaddr_t, int,
600 					  long *);
601 
602 static bool		 pmap_reactivate(struct pmap *);
603 
604 /*
605  * p m a p   h e l p e r   f u n c t i o n s
606  */
607 
608 static inline void
609 pmap_stats_update(struct pmap *pmap, int resid_diff, int wired_diff)
610 {
611 
612 	if (pmap == pmap_kernel()) {
613 		atomic_add_long(&pmap->pm_stats.resident_count, resid_diff);
614 		atomic_add_long(&pmap->pm_stats.wired_count, wired_diff);
615 	} else {
616 		KASSERT(mutex_owned(&pmap->pm_lock));
617 		pmap->pm_stats.resident_count += resid_diff;
618 		pmap->pm_stats.wired_count += wired_diff;
619 	}
620 }
621 
622 static inline void
623 pmap_stats_update_bypte(struct pmap *pmap, pt_entry_t npte, pt_entry_t opte)
624 {
625 	int resid_diff = ((npte & PG_V) ? 1 : 0) - ((opte & PG_V) ? 1 : 0);
626 	int wired_diff = ((npte & PG_W) ? 1 : 0) - ((opte & PG_W) ? 1 : 0);
627 
628 	KASSERT((npte & (PG_V | PG_W)) != PG_W);
629 	KASSERT((opte & (PG_V | PG_W)) != PG_W);
630 
631 	pmap_stats_update(pmap, resid_diff, wired_diff);
632 }
633 
634 /*
635  * ptp_to_pmap: lookup pmap by ptp
636  */
637 
638 static struct pmap *
639 ptp_to_pmap(struct vm_page *ptp)
640 {
641 	struct pmap *pmap;
642 
643 	if (ptp == NULL) {
644 		return pmap_kernel();
645 	}
646 	pmap = (struct pmap *)ptp->uobject;
647 	KASSERT(pmap != NULL);
648 	KASSERT(&pmap->pm_obj[0] == ptp->uobject);
649 	return pmap;
650 }
651 
652 static inline struct pv_pte *
653 pve_to_pvpte(struct pv_entry *pve)
654 {
655 
656 	KASSERT((void *)&pve->pve_pte == (void *)pve);
657 	return &pve->pve_pte;
658 }
659 
660 static inline struct pv_entry *
661 pvpte_to_pve(struct pv_pte *pvpte)
662 {
663 	struct pv_entry *pve = (void *)pvpte;
664 
665 	KASSERT(pve_to_pvpte(pve) == pvpte);
666 	return pve;
667 }
668 
669 /*
670  * pv_pte_first, pv_pte_next: PV list iterator.
671  */
672 
673 static struct pv_pte *
674 pv_pte_first(struct pmap_page *pp)
675 {
676 
677 	KASSERT(pp_locked(pp));
678 	if ((pp->pp_flags & PP_EMBEDDED) != 0) {
679 		return &pp->pp_pte;
680 	}
681 	return pve_to_pvpte(LIST_FIRST(&pp->pp_head.pvh_list));
682 }
683 
684 static struct pv_pte *
685 pv_pte_next(struct pmap_page *pp, struct pv_pte *pvpte)
686 {
687 
688 	KASSERT(pvpte != NULL);
689 	KASSERT(pp_locked(pp));
690 	if (pvpte == &pp->pp_pte) {
691 		KASSERT((pp->pp_flags & PP_EMBEDDED) != 0);
692 		return NULL;
693 	}
694 	KASSERT((pp->pp_flags & PP_EMBEDDED) == 0);
695 	return pve_to_pvpte(LIST_NEXT(pvpte_to_pve(pvpte), pve_list));
696 }
697 
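/*
 * Illustrative sketch (not part of the build): walking every <pmap,va>
 * mapping of a managed page with the iterator above.  The function name
 * is hypothetical; the attribute and removal code later in this file
 * iterates the same way.
 */
#if 0	/* example only */
static u_int
example_count_mappings(struct vm_page *pg)
{
	struct pmap_page *pp = VM_PAGE_TO_PP(pg);
	struct pv_pte *pvpte;
	u_int n = 0;

	pp_lock(pp);
	for (pvpte = pv_pte_first(pp); pvpte != NULL;
	    pvpte = pv_pte_next(pp, pvpte)) {
		n++;		/* one <pmap,va> pair mapping this page */
	}
	pp_unlock(pp);
	return n;
}
#endif	/* example only */
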
698 /*
699  * pmap_is_curpmap: is this pmap the one currently loaded [in %cr3]?
700  *		of course the kernel is always loaded
701  */
702 
703 inline static bool
704 pmap_is_curpmap(struct pmap *pmap)
705 {
706 #if defined(XEN) && defined(__x86_64__)
707 	/*
708 	 * Only kernel pmap is physically loaded.
709 	 * User PGD may be active, but TLB will be flushed
710 	 * with HYPERVISOR_iret anyway, so let's say no
711 	 */
712 	return(pmap == pmap_kernel());
713 #else /* XEN && __x86_64__*/
714 	return((pmap == pmap_kernel()) ||
715 	       (pmap == curcpu()->ci_pmap));
716 #endif
717 }
718 
719 /*
720  * pmap_is_active: is this pmap loaded into the specified processor's %cr3?
721  */
722 
723 inline static bool
724 pmap_is_active(struct pmap *pmap, struct cpu_info *ci, bool kernel)
725 {
726 
727 	return (pmap == pmap_kernel() ||
728 	    (pmap->pm_cpus & ci->ci_cpumask) != 0 ||
729 	    (kernel && (pmap->pm_kernel_cpus & ci->ci_cpumask) != 0));
730 }
731 
732 static void
733 pmap_apte_flush(struct pmap *pmap)
734 {
735 
736 	KASSERT(kpreempt_disabled());
737 
738 	/*
739 	 * Flush the APTE mapping from all other CPUs that
740 	 * are using the pmap we are using (whose APTE space
741 	 * is the one we've just modified).
742 	 *
743 	 * XXXthorpej -- find a way to defer the IPI.
744 	 */
745 	pmap_tlb_shootdown(pmap, (vaddr_t)-1LL, 0, 0);
746 	pmap_tlb_shootwait();
747 }
748 
749 /*
750  * Unmap the content of APDP PDEs
751  */
752 static void
753 pmap_unmap_apdp(void) {
754 	int i;
755 
756 	for (i = 0; i < PDP_SIZE; i++) {
757 		pmap_pte_set(APDP_PDE+i, 0);
758 #if defined (XEN) && defined (PAE)
759 		/* clear shadow entries too */
760 		pmap_pte_set(APDP_PDE_SHADOW+i, 0);
761 #endif
762 	}
763 }
764 
765 /*
766  *	Add a reference to the specified pmap.
767  */
768 
769 inline void
770 pmap_reference(struct pmap *pmap)
771 {
772 
773 	atomic_inc_uint(&pmap->pm_obj[0].uo_refs);
774 }
775 
776 /*
777  * pmap_map_ptes: map a pmap's PTEs into KVM and lock them in
778  *
779  * => we lock enough pmaps to keep things locked in
780  * => must be undone with pmap_unmap_ptes before returning
781  */
782 
783 static void
784 pmap_map_ptes(struct pmap *pmap, struct pmap **pmap2,
785     pd_entry_t **ptepp, pd_entry_t * const **pdeppp)
786 {
787 	pd_entry_t opde, npde;
788 	struct pmap *ourpmap;
789 	struct cpu_info *ci;
790 	struct lwp *l;
791 	bool iscurrent;
792 	uint64_t ncsw;
793 #ifdef XEN
794 	int s;
795 #endif
796 
797 	/* the kernel's pmap is always accessible */
798 	if (pmap == pmap_kernel()) {
799 		*pmap2 = NULL;
800 		*ptepp = PTE_BASE;
801 		*pdeppp = normal_pdes;
802 		return;
803 	}
804 	KASSERT(kpreempt_disabled());
805 
806  retry:
807 	l = curlwp;
808 	ncsw = l->l_ncsw;
809  	ourpmap = NULL;
810 	ci = curcpu();
811 #if defined(XEN) && defined(__x86_64__)
812 	/*
813 	 * curpmap can only be pmap_kernel() so at this point
814 	 * pmap_is_curpmap() is always false
815 	 */
816 	iscurrent = 0;
817 	ourpmap = pmap_kernel();
818 #else /* XEN && __x86_64__*/
819 	if (ci->ci_want_pmapload &&
820 	    vm_map_pmap(&l->l_proc->p_vmspace->vm_map) == pmap) {
821 		pmap_load();
822 		if (l->l_ncsw != ncsw)
823 			goto retry;
824 	}
825 	iscurrent = pmap_is_curpmap(pmap);
826 	/* if curpmap then we are always mapped */
827 	if (iscurrent) {
828 		mutex_enter(&pmap->pm_lock);
829 		*pmap2 = NULL;
830 		*ptepp = PTE_BASE;
831 		*pdeppp = normal_pdes;
832 		goto out;
833 	}
834 	ourpmap = ci->ci_pmap;
835 #endif /* XEN && __x86_64__ */
836 
837 	/* need to lock both curpmap and pmap: use ordered locking */
838 	pmap_reference(ourpmap);
839 	if ((uintptr_t) pmap < (uintptr_t) ourpmap) {
840 		mutex_enter(&pmap->pm_lock);
841 		mutex_enter(&ourpmap->pm_lock);
842 	} else {
843 		mutex_enter(&ourpmap->pm_lock);
844 		mutex_enter(&pmap->pm_lock);
845 	}
846 
847 	if (l->l_ncsw != ncsw)
848 		goto unlock_and_retry;
849 
850 	/* need to load a new alternate pt space into curpmap? */
851 	COUNT(apdp_pde_map);
852 	opde = *APDP_PDE;
853 	if (!pmap_valid_entry(opde) ||
854 	    pmap_pte2pa(opde) != pmap_pdirpa(pmap, 0)) {
855 #ifdef XEN
856 		int i;
857 		s = splvm();
858 		/* Make recursive entry usable in user PGD */
859 		for (i = 0; i < PDP_SIZE; i++) {
860 			npde = pmap_pa2pte(
861 			    pmap_pdirpa(pmap, i * NPDPG)) | PG_k | PG_V;
862 			xpq_queue_pte_update(
863 			    xpmap_ptom(pmap_pdirpa(pmap, PDIR_SLOT_PTE + i)),
864 			    npde);
865 			xpq_queue_pte_update(xpmap_ptetomach(&APDP_PDE[i]),
866 			    npde);
867 #ifdef PAE
868 			/* update shadow entry too */
869 			xpq_queue_pte_update(
870 			    xpmap_ptetomach(&APDP_PDE_SHADOW[i]), npde);
871 #endif /* PAE */
872 			xpq_queue_invlpg(
873 			    (vaddr_t)&pmap->pm_pdir[PDIR_SLOT_PTE + i]);
874 		}
875 		if (pmap_valid_entry(opde))
876 			pmap_apte_flush(ourpmap);
877 		splx(s);
878 #else /* XEN */
879 		int i;
880 		for (i = 0; i < PDP_SIZE; i++) {
881 			npde = pmap_pa2pte(
882 			    pmap_pdirpa(pmap, i * NPDPG)) | PG_RW | PG_V;
883 			pmap_pte_set(APDP_PDE+i, npde);
884 		}
885 		pmap_pte_flush();
886 		if (pmap_valid_entry(opde))
887 			pmap_apte_flush(ourpmap);
888 #endif /* XEN */
889 	}
890 	*pmap2 = ourpmap;
891 	*ptepp = APTE_BASE;
892 	*pdeppp = alternate_pdes;
893 	KASSERT(l->l_ncsw == ncsw);
894 #if !defined(XEN) || !defined(__x86_64__)
895  out:
896 #endif
897  	/*
898  	 * might have blocked, need to retry?
899  	 */
900 	if (l->l_ncsw != ncsw) {
901  unlock_and_retry:
902 	    	if (ourpmap != NULL) {
903 			mutex_exit(&ourpmap->pm_lock);
904 			pmap_destroy(ourpmap);
905 		}
906 		mutex_exit(&pmap->pm_lock);
907 		goto retry;
908 	}
909 
910 	return;
911 }
912 
913 /*
914  * pmap_unmap_ptes: unlock the PTE mapping of "pmap"
915  */
916 
917 static void
918 pmap_unmap_ptes(struct pmap *pmap, struct pmap *pmap2)
919 {
920 
921 	if (pmap == pmap_kernel()) {
922 		return;
923 	}
924 	KASSERT(kpreempt_disabled());
925 	if (pmap2 == NULL) {
926 		mutex_exit(&pmap->pm_lock);
927 	} else {
928 #if defined(XEN) && defined(__x86_64__)
929 		KASSERT(pmap2 == pmap_kernel());
930 #else
931 		KASSERT(curcpu()->ci_pmap == pmap2);
932 #endif
933 #if defined(MULTIPROCESSOR)
934 		pmap_unmap_apdp();
935 		pmap_pte_flush();
936 		pmap_apte_flush(pmap2);
937 #endif
938 		COUNT(apdp_pde_unmap);
939 		mutex_exit(&pmap->pm_lock);
940 		mutex_exit(&pmap2->pm_lock);
941 		pmap_destroy(pmap2);
942 	}
943 }
944 
945 inline static void
946 pmap_exec_account(struct pmap *pm, vaddr_t va, pt_entry_t opte, pt_entry_t npte)
947 {
948 
949 #if !defined(__x86_64__)
950 	if (curproc == NULL || curproc->p_vmspace == NULL ||
951 	    pm != vm_map_pmap(&curproc->p_vmspace->vm_map))
952 		return;
953 
954 	if ((opte ^ npte) & PG_X)
955 		pmap_update_pg(va);
956 
957 	/*
958 	 * Executability was removed on the last executable change.
959 	 * Reset the code segment to something conservative and
960 	 * let the trap handler deal with setting the right limit.
961 	 * We can't set the right limit here due to vm map locking constraints.
962 	 */
963 
964 	if ((opte & PG_X) && (npte & PG_X) == 0 && va == pm->pm_hiexec) {
965 		struct trapframe *tf = curlwp->l_md.md_regs;
966 
967 		tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL);
968 		pm->pm_hiexec = I386_MAX_EXE_ADDR;
969 	}
970 #endif /* !defined(__x86_64__) */
971 }
972 
973 #if !defined(__x86_64__)
974 /*
975  * Fixup the code segment to cover all potential executable mappings.
976  * returns 0 if no changes to the code segment were made.
977  */
978 
979 int
980 pmap_exec_fixup(struct vm_map *map, struct trapframe *tf, struct pcb *pcb)
981 {
982 	struct vm_map_entry *ent;
983 	struct pmap *pm = vm_map_pmap(map);
984 	vaddr_t va = 0;
985 
986 	vm_map_lock_read(map);
987 	for (ent = (&map->header)->next; ent != &map->header; ent = ent->next) {
988 
989 		/*
990 		 * This entry has greater va than the entries before.
991 		 * We need to make it point to the last page, not past it.
992 		 */
993 
994 		if (ent->protection & VM_PROT_EXECUTE)
995 			va = trunc_page(ent->end) - PAGE_SIZE;
996 	}
997 	vm_map_unlock_read(map);
998 	if (va == pm->pm_hiexec && tf->tf_cs == GSEL(GUCODEBIG_SEL, SEL_UPL))
999 		return (0);
1000 
1001 	pm->pm_hiexec = va;
1002 	if (pm->pm_hiexec > I386_MAX_EXE_ADDR) {
1003 		tf->tf_cs = GSEL(GUCODEBIG_SEL, SEL_UPL);
1004 	} else {
1005 		tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL);
1006 		return (0);
1007 	}
1008 	return (1);
1009 }
1010 #endif /* !defined(__x86_64__) */
1011 
1012 /*
1013  * p m a p   k e n t e r   f u n c t i o n s
1014  *
1015  * functions to quickly enter/remove pages from the kernel address
1016  * space.   pmap_kremove is exported to MI kernel.  we make use of
1017  * the recursive PTE mappings.
1018  */
1019 
1020 /*
1021  * pmap_kenter_pa: enter a kernel mapping without R/M (pv_entry) tracking
1022  *
1023  * => no need to lock anything, assume va is already allocated
1024  * => should be faster than normal pmap enter function
1025  */
1026 
1027 void
1028 pmap_kenter_pa(vaddr_t va, paddr_t pa, vm_prot_t prot, u_int flags)
1029 {
1030 	pt_entry_t *pte, opte, npte;
1031 
1032 	KASSERT(!(prot & ~VM_PROT_ALL));
1033 
1034 	if (va < VM_MIN_KERNEL_ADDRESS)
1035 		pte = vtopte(va);
1036 	else
1037 		pte = kvtopte(va);
1038 #ifdef DOM0OPS
1039 	if (pa < pmap_pa_start || pa >= pmap_pa_end) {
1040 #ifdef DEBUG
1041 		printk("pmap_kenter_pa: pa 0x%" PRIx64 " for va 0x%" PRIx64
1042 		    " outside range\n", (int64_t)pa, (int64_t)va);
1043 #endif /* DEBUG */
1044 		npte = pa;
1045 	} else
1046 #endif /* DOM0OPS */
1047 		npte = pmap_pa2pte(pa);
1048 	npte |= protection_codes[prot] | PG_k | PG_V | pmap_pg_g;
1049 	if (flags & PMAP_NOCACHE)
1050 		npte |= PG_N;
1051 	opte = pmap_pte_testset(pte, npte); /* zap! */
1052 #if defined(DIAGNOSTIC)
1053 	/* XXX For now... */
1054 	if (opte & PG_PS)
1055 		panic("pmap_kenter_pa: PG_PS");
1056 #endif
1057 	if ((opte & (PG_V | PG_U)) == (PG_V | PG_U)) {
1058 		/* This should not happen, so no need to batch updates. */
1059 		kpreempt_disable();
1060 		pmap_tlb_shootdown(pmap_kernel(), va, 0, opte);
1061 		kpreempt_enable();
1062 	}
1063 }
1064 
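/*
 * Illustrative sketch (not part of the build): the usual way MI code
 * pairs the kenter/kremove interfaces.  The wrapper is hypothetical;
 * note the pmap_update() required before the VA can be reused, as
 * documented at pmap_kremove() below.
 */
#if 0	/* example only */
static void
example_kernel_window(vaddr_t va, paddr_t pa)
{

	/* establish an unmanaged kernel-only mapping (no pv tracking) */
	pmap_kenter_pa(va, pa, VM_PROT_READ | VM_PROT_WRITE, 0);

	/* ... use the mapping at va ... */

	/* tear it down and flush deferred shootdowns before reusing va */
	pmap_kremove(va, PAGE_SIZE);
	pmap_update(pmap_kernel());
}
#endif	/* example only */
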
1065 void
1066 pmap_emap_enter(vaddr_t va, paddr_t pa, vm_prot_t prot)
1067 {
1068 	pt_entry_t *pte, opte, npte;
1069 
1070 	KASSERT((prot & ~VM_PROT_ALL) == 0);
1071 	pte = (va < VM_MIN_KERNEL_ADDRESS) ? vtopte(va) : kvtopte(va);
1072 
1073 #ifdef DOM0OPS
1074 	if (pa < pmap_pa_start || pa >= pmap_pa_end) {
1075 		npte = pa;
1076 	} else
1077 #endif
1078 		npte = pmap_pa2pte(pa);
1079 
1082 	opte = pmap_pte_testset(pte, npte);
1083 }
1084 
1085 /*
1086  * pmap_emap_sync: perform TLB flush or pmap load, if it was deferred.
1087  */
1088 void
1089 pmap_emap_sync(bool canload)
1090 {
1091 	struct cpu_info *ci = curcpu();
1092 	struct pmap *pmap;
1093 
1094 	KASSERT(kpreempt_disabled());
1095 	if (__predict_true(ci->ci_want_pmapload && canload)) {
1096 		/*
1097 		 * XXX: Hint for pmap_reactivate(), which might suggest to
1098 		 * not perform TLB flush, if state has not changed.
1099 		 */
1100 		pmap = vm_map_pmap(&curlwp->l_proc->p_vmspace->vm_map);
1101 		if (__predict_false(pmap == ci->ci_pmap)) {
1102 			const uint32_t cpumask = ci->ci_cpumask;
1103 			atomic_and_32(&pmap->pm_cpus, ~cpumask);
1104 		}
1105 		pmap_load();
1106 		KASSERT(ci->ci_want_pmapload == 0);
1107 	} else {
1108 		tlbflush();
1109 	}
1110 
1111 }
1112 
1113 void
1114 pmap_emap_remove(vaddr_t sva, vsize_t len)
1115 {
1116 	pt_entry_t *pte, xpte = 0;
1117 	vaddr_t va, eva = sva + len;
1118 
1119 	for (va = sva; va < eva; va += PAGE_SIZE) {
1120 		pte = (va < VM_MIN_KERNEL_ADDRESS) ? vtopte(va) : kvtopte(va);
1121 		xpte |= pmap_pte_testset(pte, 0);
1122 	}
1123 }
1124 
1125 #ifdef XEN
1126 /*
1127  * pmap_kenter_ma: enter a kernel mapping without R/M (pv_entry) tracking
1128  *
1129  * => no need to lock anything, assume va is already allocated
1130  * => should be faster than normal pmap enter function
1131  * => we expect a MACHINE address
1132  */
1133 
1134 void
1135 pmap_kenter_ma(vaddr_t va, paddr_t ma, vm_prot_t prot, u_int flags)
1136 {
1137 	pt_entry_t *pte, opte, npte;
1138 
1139 	if (va < VM_MIN_KERNEL_ADDRESS)
1140 		pte = vtopte(va);
1141 	else
1142 		pte = kvtopte(va);
1143 
1144 	npte = ma | ((prot & VM_PROT_WRITE) ? PG_RW : PG_RO) |
1145 	     PG_V | PG_k;
1146 	if (flags & PMAP_NOCACHE)
1147 		npte |= PG_N;
1148 
1149 #ifndef XEN
1150 	if ((cpu_feature & CPUID_NOX) && !(prot & VM_PROT_EXECUTE))
1151 		npte |= PG_NX;
1152 #endif
1153 	opte = pmap_pte_testset (pte, npte); /* zap! */
1154 
1155 	if (pmap_valid_entry(opte)) {
1156 #if defined(MULTIPROCESSOR)
1157 		kpreempt_disable();
1158 		pmap_tlb_shootdown(pmap_kernel(), va, 0, opte);
1159 		kpreempt_enable();
1160 #else
1161 		/* Don't bother deferring in the single CPU case. */
1162 		pmap_update_pg(va);
1163 #endif
1164 	}
1165 }
1166 #endif	/* XEN */
1167 
1168 #if defined(__x86_64__)
1169 /*
1170  * Change protection for a virtual address. Local for a CPU only, don't
1171  * care about TLB shootdowns.
1172  *
1173  * => must be called with preemption disabled
1174  */
1175 void
1176 pmap_changeprot_local(vaddr_t va, vm_prot_t prot)
1177 {
1178 	pt_entry_t *pte, opte, npte;
1179 
1180 	KASSERT(kpreempt_disabled());
1181 
1182 	if (va < VM_MIN_KERNEL_ADDRESS)
1183 		pte = vtopte(va);
1184 	else
1185 		pte = kvtopte(va);
1186 
1187 	npte = opte = *pte;
1188 
1189 	if ((prot & VM_PROT_WRITE) != 0)
1190 		npte |= PG_RW;
1191 	else
1192 		npte &= ~PG_RW;
1193 
1194 	if (opte != npte) {
1195 		pmap_pte_set(pte, npte);
1196 		pmap_pte_flush();
1197 		invlpg(va);
1198 	}
1199 }
1200 #endif /* defined(__x86_64__) */
1201 
1202 /*
1203  * pmap_kremove: remove a kernel mapping(s) without R/M (pv_entry) tracking
1204  *
1205  * => no need to lock anything
1206  * => caller must dispose of any vm_page mapped in the va range
1207  * => note: not an inline function
1208  * => we assume the va is page aligned and the len is a multiple of PAGE_SIZE
1209  * => we assume kernel only unmaps valid addresses and thus don't bother
1210  *    checking the valid bit before doing TLB flushing
1211  * => must be followed by call to pmap_update() before reuse of page
1212  */
1213 
1214 void
1215 pmap_kremove(vaddr_t sva, vsize_t len)
1216 {
1217 	pt_entry_t *pte, xpte;
1218 	vaddr_t va, eva;
1219 
1220 	eva = sva + len;
1221 	xpte = 0;
1222 
1223 	for (va = sva; va < eva; va += PAGE_SIZE) {
1224 		if (va < VM_MIN_KERNEL_ADDRESS)
1225 			pte = vtopte(va);
1226 		else
1227 			pte = kvtopte(va);
1228 		xpte |= pmap_pte_testset(pte, 0); /* zap! */
1229 #if defined(DIAGNOSTIC)
1230 		/* XXX For now... */
1231 		if (xpte & PG_PS)
1232 			panic("pmap_kremove: PG_PS");
1233 		if (xpte & PG_PVLIST)
1234 			panic("pmap_kremove: PG_PVLIST mapping for 0x%lx",
1235 			      va);
1236 #endif
1237 	}
1238 	if ((xpte & (PG_V | PG_U)) == (PG_V | PG_U)) {
1239 		kpreempt_disable();
1240 		pmap_tlb_shootdown(pmap_kernel(), sva, eva, xpte);
1241 		kpreempt_enable();
1242 	}
1243 }
1244 
1245 /*
1246  * p m a p   i n i t   f u n c t i o n s
1247  *
1248  * pmap_bootstrap and pmap_init are called during system startup
1249  * to init the pmap module.   pmap_bootstrap() does a low level
1250  * init just to get things rolling.   pmap_init() finishes the job.
1251  */
1252 
1253 /*
1254  * pmap_bootstrap: get the system in a state where it can run with VM
1255  *	properly enabled (called before main()).   the VM system is
1256  *      fully init'd later...
1257  *
1258  * => on i386, locore.s has already enabled the MMU by allocating
1259  *	a PDP for the kernel, and nkpde PTP's for the kernel.
1260  * => kva_start is the first free virtual address in kernel space
1261  */
1262 
1263 void
1264 pmap_bootstrap(vaddr_t kva_start)
1265 {
1266 	struct pmap *kpm;
1267 	pt_entry_t *pte;
1268 	struct pcb *pcb;
1269 	int i;
1270 	vaddr_t kva;
1271 #ifdef XEN
1272 	pt_entry_t pg_nx = 0;
1273 #else
1274 	unsigned long p1i;
1275 	vaddr_t kva_end;
1276 	pt_entry_t pg_nx = (cpu_feature & CPUID_NOX ? PG_NX : 0);
1277 #endif
1278 
1279 	/*
1280 	 * set up our local static global vars that keep track of the
1281 	 * usage of KVM before kernel_map is set up
1282 	 */
1283 
1284 	virtual_avail = kva_start;		/* first free KVA */
1285 	virtual_end = VM_MAX_KERNEL_ADDRESS;	/* last KVA */
1286 
1287 	/*
1288 	 * set up protection_codes: we need to be able to convert from
1289 	 * an MI protection code (some combo of VM_PROT...) to something
1290 	 * we can jam into an i386 PTE.
1291 	 */
1292 
1293 	protection_codes[VM_PROT_NONE] = pg_nx;			/* --- */
1294 	protection_codes[VM_PROT_EXECUTE] = PG_RO | PG_X;	/* --x */
1295 	protection_codes[VM_PROT_READ] = PG_RO | pg_nx;		/* -r- */
1296 	protection_codes[VM_PROT_READ|VM_PROT_EXECUTE] = PG_RO | PG_X;/* -rx */
1297 	protection_codes[VM_PROT_WRITE] = PG_RW | pg_nx;	/* w-- */
1298 	protection_codes[VM_PROT_WRITE|VM_PROT_EXECUTE] = PG_RW | PG_X;/* w-x */
1299 	protection_codes[VM_PROT_WRITE|VM_PROT_READ] = PG_RW | pg_nx;
1300 								/* wr- */
1301 	protection_codes[VM_PROT_ALL] = PG_RW | PG_X;		/* wrx */
1302 
1303 	/*
1304 	 * now we init the kernel's pmap
1305 	 *
1306 	 * the kernel pmap's pm_obj is not used for much.   however, in
1307 	 * user pmaps the pm_obj contains the list of active PTPs.
1308 	 * the pm_obj currently does not have a pager.   it might be possible
1309 	 * to add a pager that would allow a process to read-only mmap its
1310 	 * own page tables (fast user level vtophys?).   this may or may not
1311 	 * be useful.
1312 	 */
1313 
1314 	kpm = pmap_kernel();
1315 	for (i = 0; i < PTP_LEVELS - 1; i++) {
1316 		UVM_OBJ_INIT(&kpm->pm_obj[i], NULL, 1);
1317 		kpm->pm_ptphint[i] = NULL;
1318 	}
1319 	memset(&kpm->pm_list, 0, sizeof(kpm->pm_list));  /* pm_list not used */
1320 	pcb = lwp_getpcb(&lwp0);
1321 	kpm->pm_pdir = (pd_entry_t *)(pcb->pcb_cr3 + KERNBASE);
1322 #ifdef PAE
1323 	for (i = 0; i < PDP_SIZE; i++)
1324 		kpm->pm_pdirpa[i] = (paddr_t)pcb->pcb_cr3 + PAGE_SIZE * i;
1325 #else
1326 	kpm->pm_pdirpa = (paddr_t)pcb->pcb_cr3;
1327 #endif
1328 	kpm->pm_stats.wired_count = kpm->pm_stats.resident_count =
1329 		x86_btop(kva_start - VM_MIN_KERNEL_ADDRESS);
1330 
1331 	/*
1332 	 * the above is just a rough estimate and not critical to the proper
1333 	 * operation of the system.
1334 	 */
1335 
1336 #ifndef XEN
1337 	/*
1338 	 * Begin to enable global TLB entries if they are supported.
1339 	 * The G bit has no effect until the CR4_PGE bit is set in CR4,
1340 	 * which happens in cpu_init(), which is run on each cpu
1341 	 * (and happens later)
1342 	 */
1343 
1344 	if (cpu_feature & CPUID_PGE) {
1345 		pmap_pg_g = PG_G;		/* enable software */
1346 
1347 		/* add PG_G attribute to already mapped kernel pages */
1348 		if (KERNBASE == VM_MIN_KERNEL_ADDRESS) {
1349 			kva_end = virtual_avail;
1350 		} else {
1351 			extern vaddr_t eblob, esym;
1352 			kva_end = (vaddr_t)&end;
1353 			if (esym > kva_end)
1354 				kva_end = esym;
1355 			if (eblob > kva_end)
1356 				kva_end = eblob;
1357 			kva_end = roundup(kva_end, PAGE_SIZE);
1358 		}
1359 		for (kva = KERNBASE; kva < kva_end; kva += PAGE_SIZE) {
1360 			p1i = pl1_i(kva);
1361 			if (pmap_valid_entry(PTE_BASE[p1i]))
1362 				PTE_BASE[p1i] |= PG_G;
1363 		}
1364 	}
1365 
1366 	/*
1367 	 * enable large pages if they are supported.
1368 	 */
1369 
1370 	if (cpu_feature & CPUID_PSE) {
1371 		paddr_t pa;
1372 		pd_entry_t *pde;
1373 		extern char __data_start;
1374 
1375 		lcr4(rcr4() | CR4_PSE);	/* enable hardware (via %cr4) */
1376 		pmap_largepages = 1;	/* enable software */
1377 
1378 		/*
1379 		 * the TLB must be flushed after enabling large pages
1380 		 * on Pentium CPUs, according to section 3.6.2.2 of
1381 		 * "Intel Architecture Software Developer's Manual,
1382 		 * Volume 3: System Programming".
1383 		 */
1384 		tlbflush();
1385 
1386 		/*
1387 		 * now, remap the kernel text using large pages.  we
1388 		 * assume that the linker has properly aligned the
1389 		 * .data segment to a NBPD_L2 boundary.
1390 		 */
1391 		kva_end = rounddown((vaddr_t)&__data_start, NBPD_L1);
1392 		for (pa = 0, kva = KERNBASE; kva + NBPD_L2 <= kva_end;
1393 		     kva += NBPD_L2, pa += NBPD_L2) {
1394 			pde = &L2_BASE[pl2_i(kva)];
1395 			*pde = pa | pmap_pg_g | PG_PS |
1396 			    PG_KR | PG_V;	/* zap! */
1397 			tlbflush();
1398 		}
1399 #if defined(DEBUG)
1400 		aprint_normal("kernel text is mapped with %" PRIuPSIZE " large "
1401 		    "pages and %" PRIuPSIZE " normal pages\n",
1402 		    howmany(kva - KERNBASE, NBPD_L2),
1403 		    howmany((vaddr_t)&__data_start - kva, NBPD_L1));
1404 #endif /* defined(DEBUG) */
1405 	}
1406 #endif /* !XEN */
1407 
1408 	if (VM_MIN_KERNEL_ADDRESS != KERNBASE) {
1409 		/*
1410 		 * zero_pte is stuck at the end of mapped space for the kernel
1411 	 * image (disjoint from kva space). This is done so that it
1412 		 * can safely be used in pmap_growkernel (pmap_get_physpage),
1413 		 * when it's called for the first time.
1414 		 * XXXfvdl fix this for MULTIPROCESSOR later.
1415 		 */
1416 
1417 		early_zerop = (void *)(KERNBASE + NKL2_KIMG_ENTRIES * NBPD_L2);
1418 		early_zero_pte = PTE_BASE + pl1_i((vaddr_t)early_zerop);
1419 	}
1420 
1421 	/*
1422 	 * now we allocate the "special" VAs which are used for tmp mappings
1423 	 * by the pmap (and other modules).    we allocate the VAs by advancing
1424 	 * virtual_avail (note that there are no pages mapped at these VAs).
1425 	 * we find the PTE that maps the allocated VA via the linear PTE
1426 	 * mapping.
1427 	 */
1428 
1429 	pte = PTE_BASE + pl1_i(virtual_avail);
1430 
1431 #ifdef MULTIPROCESSOR
1432 	/*
1433 	 * Waste some VA space to avoid false sharing of cache lines
1434 	 * for page table pages: Give each possible CPU a cache line
1435 	 * of PTE's (8) to play with, though we only need 4.  We could
1436 	 * recycle some of this waste by putting the idle stacks here
1437 	 * as well; we could waste less space if we knew the largest
1438 	 * CPU ID beforehand.
1439 	 */
1440 	csrcp = (char *) virtual_avail;  csrc_pte = pte;
1441 
1442 	cdstp = (char *) virtual_avail+PAGE_SIZE;  cdst_pte = pte+1;
1443 
1444 	zerop = (char *) virtual_avail+PAGE_SIZE*2;  zero_pte = pte+2;
1445 
1446 	ptpp = (char *) virtual_avail+PAGE_SIZE*3;  ptp_pte = pte+3;
1447 
1448 	virtual_avail += PAGE_SIZE * maxcpus * NPTECL;
1449 	pte += maxcpus * NPTECL;
1450 #else
1451 	csrcp = (void *) virtual_avail;  csrc_pte = pte;	/* allocate */
1452 	virtual_avail += PAGE_SIZE; pte++;			/* advance */
1453 
1454 	cdstp = (void *) virtual_avail;  cdst_pte = pte;
1455 	virtual_avail += PAGE_SIZE; pte++;
1456 
1457 	zerop = (void *) virtual_avail;  zero_pte = pte;
1458 	virtual_avail += PAGE_SIZE; pte++;
1459 
1460 	ptpp = (void *) virtual_avail;  ptp_pte = pte;
1461 	virtual_avail += PAGE_SIZE; pte++;
1462 #endif
1463 
1464 	if (VM_MIN_KERNEL_ADDRESS == KERNBASE) {
1465 		early_zerop = zerop;
1466 		early_zero_pte = zero_pte;
1467 	}
1468 
1469 	/*
1470 	 * Nothing after this point actually needs pte;
1471 	 */
1472 	pte = (void *)0xdeadbeef;
1473 
1474 	/* XXX: vmmap used by mem.c... should be uvm_map_reserve */
1475 	/* XXXfvdl PTEs not needed here */
1476 	vmmap = (char *)virtual_avail;			/* don't need pte */
1477 	virtual_avail += PAGE_SIZE; pte++;
1478 
1479 #ifdef XEN
1480 #ifdef __x86_64__
1481 	/*
1482 	 * We want a dummy page directory for Xen:
1483 	 * when we deactivate a pmap, Xen still considers it active.
1484 	 * So we point the user PGD at this dummy one to lift all protection
1485 	 * from the now-inactive set of page tables.
1486 	 */
1487 	xen_dummy_user_pgd = avail_start;
1488 	avail_start += PAGE_SIZE;
1489 
1490 	/* Zero-fill it; the fewer checks Xen has to make, the better */
1491 	memset((void *) (xen_dummy_user_pgd + KERNBASE), 0, PAGE_SIZE);
1492 	/* Mark read-only */
1493 	HYPERVISOR_update_va_mapping(xen_dummy_user_pgd + KERNBASE,
1494 	    pmap_pa2pte(xen_dummy_user_pgd) | PG_u | PG_V, UVMF_INVLPG);
1495 	/* Pin as L4 */
1496 	xpq_queue_pin_table(xpmap_ptom_masked(xen_dummy_user_pgd));
1497 #endif /* __x86_64__ */
1498 	idt_vaddr = virtual_avail;                      /* don't need pte */
1499 	idt_paddr = avail_start;                        /* steal a page */
1500 	/*
1501 	 * Xen requires one more page, as we can't store the
1502 	 * GDT and LDT on the same page.
1503 	 */
1504 	virtual_avail += 3 * PAGE_SIZE;
1505 	avail_start += 3 * PAGE_SIZE;
1506 #else /* XEN */
1507 	idt_vaddr = virtual_avail;			/* don't need pte */
1508 	idt_paddr = avail_start;			/* steal a page */
1509 #if defined(__x86_64__)
1510 	virtual_avail += 2 * PAGE_SIZE; pte += 2;
1511 	avail_start += 2 * PAGE_SIZE;
1512 #else /* defined(__x86_64__) */
1513 	virtual_avail += PAGE_SIZE; pte++;
1514 	avail_start += PAGE_SIZE;
1515 	/* pentium f00f bug stuff */
1516 	pentium_idt_vaddr = virtual_avail;		/* don't need pte */
1517 	virtual_avail += PAGE_SIZE; pte++;
1518 #endif /* defined(__x86_64__) */
1519 #endif /* XEN */
1520 
1521 #ifdef _LP64
1522 	/*
1523 	 * Grab a page below 4G for things that need it (i.e.
1524 	 * having an initial %cr3 for the MP trampoline).
1525 	 */
1526 	lo32_vaddr = virtual_avail;
1527 	virtual_avail += PAGE_SIZE; pte++;
1528 	lo32_paddr = avail_start;
1529 	avail_start += PAGE_SIZE;
1530 #endif
1531 
1532 	/*
1533 	 * now we reserve some VM for mapping pages when doing a crash dump
1534 	 */
1535 
1536 	virtual_avail = reserve_dumppages(virtual_avail);
1537 
1538 	/*
1539 	 * init the static-global locks and global lists.
1540 	 *
1541 	 * => pventry::pvh_lock (initialized elsewhere) must also be
1542 	 *      a spin lock, again at IPL_VM to prevent deadlock, and
1543 	 *	again is never taken from interrupt context.
1544 	 */
1545 
1546 	mutex_init(&pmaps_lock, MUTEX_DEFAULT, IPL_NONE);
1547 	LIST_INIT(&pmaps);
1548 	pmap_cpu_init_early(curcpu());
1549 
1550 	/*
1551 	 * initialize caches.
1552 	 */
1553 
1554 	pool_cache_bootstrap(&pmap_cache, sizeof(struct pmap), 0, 0, 0,
1555 	    "pmappl", NULL, IPL_NONE, NULL, NULL, NULL);
1556 #ifdef PAE
1557 	pool_cache_bootstrap(&pmap_pdp_cache, PAGE_SIZE * PDP_SIZE, 0, 0, 0,
1558 	    "pdppl", &pmap_pdp_allocator, IPL_NONE,
1559 	    pmap_pdp_ctor, pmap_pdp_dtor, NULL);
1560 #else /* PAE */
1561 	pool_cache_bootstrap(&pmap_pdp_cache, PAGE_SIZE, 0, 0, 0,
1562 	    "pdppl", NULL, IPL_NONE, pmap_pdp_ctor, pmap_pdp_dtor, NULL);
1563 #endif /* PAE */
1564 	pool_cache_bootstrap(&pmap_pv_cache, sizeof(struct pv_entry), 0, 0,
1565 	    PR_LARGECACHE, "pvpl", &pool_allocator_meta, IPL_NONE, NULL,
1566 	    NULL, NULL);
1567 
1568 	/*
1569 	 * ensure the TLB is sync'd with reality by flushing it...
1570 	 */
1571 
1572 	tlbflush();
1573 
1574 	/*
1575 	 * calculate pmap_maxkvaddr from nkptp[].
1576 	 */
1577 
1578 	kva = VM_MIN_KERNEL_ADDRESS;
1579 	for (i = PTP_LEVELS - 1; i >= 1; i--) {
1580 		kva += nkptp[i] * nbpd[i];
1581 	}
1582 	pmap_maxkvaddr = kva;
1583 }
1584 
1585 #if defined(__x86_64__)
1586 /*
1587  * Pre-allocate PTPs for low memory, so that 1:1 mappings for various
1588  * trampoline code can be entered.
1589  */
1590 void
1591 pmap_prealloc_lowmem_ptps(void)
1592 {
1593 #ifdef XEN
1594 	int level;
1595 	paddr_t newp;
1596 	paddr_t pdes_pa;
1597 
1598 	pdes_pa = pmap_kernel()->pm_pdirpa;
1599 	level = PTP_LEVELS;
1600 	for (;;) {
1601 		newp = avail_start;
1602 		avail_start += PAGE_SIZE;
1603 		HYPERVISOR_update_va_mapping ((vaddr_t)early_zerop,
1604 		    xpmap_ptom_masked(newp) | PG_u | PG_V | PG_RW, UVMF_INVLPG);
1605 		memset((void *)early_zerop, 0, PAGE_SIZE);
1606 		/* Mark R/O before installing */
1607 		HYPERVISOR_update_va_mapping ((vaddr_t)early_zerop,
1608 		    xpmap_ptom_masked(newp) | PG_u | PG_V, UVMF_INVLPG);
1609 		if (newp < (NKL2_KIMG_ENTRIES * NBPD_L2))
1610 			HYPERVISOR_update_va_mapping (newp + KERNBASE,
1611 			    xpmap_ptom_masked(newp) | PG_u | PG_V, UVMF_INVLPG);
1612 		xpq_queue_pte_update (
1613 			xpmap_ptom_masked(pdes_pa)
1614 			+ (pl_i(0, level) * sizeof (pd_entry_t)),
1615 			xpmap_ptom_masked(newp) | PG_RW | PG_u | PG_V);
1616 		level--;
1617 		if (level <= 1)
1618 			break;
1619 		pdes_pa = newp;
1620 	}
1621 #else /* XEN */
1622 	pd_entry_t *pdes;
1623 	int level;
1624 	paddr_t newp;
1625 
1626 	pdes = pmap_kernel()->pm_pdir;
1627 	level = PTP_LEVELS;
1628 	for (;;) {
1629 		newp = avail_start;
1630 		avail_start += PAGE_SIZE;
1631 		*early_zero_pte = (newp & PG_FRAME) | PG_V | PG_RW;
1632 		pmap_update_pg((vaddr_t)early_zerop);
1633 		memset(early_zerop, 0, PAGE_SIZE);
1634 		pdes[pl_i(0, level)] = (newp & PG_FRAME) | PG_V | PG_RW;
1635 		level--;
1636 		if (level <= 1)
1637 			break;
1638 		pdes = normal_pdes[level - 2];
1639 	}
1640 #endif /* XEN */
1641 }
1642 #endif /* defined(__x86_64__) */
1643 
1644 /*
1645  * pmap_init: called from uvm_init, our job is to get the pmap
1646  * system ready to manage mappings...
1647  */
1648 
1649 void
1650 pmap_init(void)
1651 {
1652 	int i;
1653 
1654 	for (i = 0; i < PV_HASH_SIZE; i++) {
1655 		SLIST_INIT(&pv_hash_heads[i].hh_list);
1656 	}
1657 	for (i = 0; i < PV_HASH_LOCK_CNT; i++) {
1658 		mutex_init(&pv_hash_locks[i].lock, MUTEX_NODEBUG, IPL_VM);
1659 	}
1660 
1661 	/*
1662 	 * done: pmap module is up (and ready for business)
1663 	 */
1664 
1665 	pmap_initialized = true;
1666 }
1667 
1668 /*
1669  * pmap_cpu_init_early: perform early per-CPU initialization.
1670  */
1671 
1672 void
1673 pmap_cpu_init_early(struct cpu_info *ci)
1674 {
1675 	struct pmap_cpu *pc;
1676 	static uint8_t pmap_cpu_alloc;
1677 
1678 	pc = &pmap_cpu[pmap_cpu_alloc++].pc;
1679 	ci->ci_pmap_cpu = pc;
1680 }
1681 
1682 /*
1683  * pmap_cpu_init_late: perform late per-CPU initialization.
1684  */
1685 
1686 void
1687 pmap_cpu_init_late(struct cpu_info *ci)
1688 {
1689 
1690 	if (ci == &cpu_info_primary) {
1691 		evcnt_attach_dynamic(&pmap_tlb_evcnt, EVCNT_TYPE_INTR,
1692 		    NULL, "global", "TLB IPI");
1693 		evcnt_attach_dynamic(&pmap_iobmp_evcnt, EVCNT_TYPE_MISC,
1694 		    NULL, "x86", "io bitmap copy");
1695 		evcnt_attach_dynamic(&pmap_ldt_evcnt, EVCNT_TYPE_MISC,
1696 		    NULL, "x86", "ldt sync");
1697 	}
1698 
1699 	evcnt_attach_dynamic(&ci->ci_tlb_evcnt, EVCNT_TYPE_MISC,
1700 	    NULL, device_xname(ci->ci_dev), "TLB IPI");
1701 }
1702 
1703 /*
1704  * p v _ e n t r y   f u n c t i o n s
1705  */
1706 
1707 /*
1708  * pmap_free_pvs: free a list of pv_entrys
1709  */
1710 
1711 static void
1712 pmap_free_pvs(struct pv_entry *pve)
1713 {
1714 	struct pv_entry *next;
1715 
1716 	for ( /* null */ ; pve != NULL ; pve = next) {
1717 		next = pve->pve_next;
1718 		pool_cache_put(&pmap_pv_cache, pve);
1719 	}
1720 }
1721 
1722 /*
1723  * main pv_entry manipulation functions:
1724  *   pmap_enter_pv: enter a mapping onto a pv_head list
1725  *   pmap_remove_pv: remove a mapping from a pv_head list
1726  *
1727  * NOTE: Both pmap_enter_pv and pmap_remove_pv expect the caller to lock
1728  *       the pvh before calling
1729  */
1730 
1731 /*
1732  * insert_pv: a helper of pmap_enter_pv
1733  */
1734 
1735 static void
1736 insert_pv(struct pmap_page *pp, struct pv_entry *pve)
1737 {
1738 	struct pv_hash_head *hh;
1739 	kmutex_t *lock;
1740 	u_int hash;
1741 
1742 	KASSERT(pp_locked(pp));
1743 
1744 	hash = pvhash_hash(pve->pve_pte.pte_ptp, pve->pve_pte.pte_va);
1745 	lock = pvhash_lock(hash);
1746 	hh = pvhash_head(hash);
1747 	mutex_spin_enter(lock);
1748 	SLIST_INSERT_HEAD(&hh->hh_list, pve, pve_hash);
1749 	mutex_spin_exit(lock);
1750 
1751 	LIST_INSERT_HEAD(&pp->pp_head.pvh_list, pve, pve_list);
1752 }
1753 
1754 /*
1755  * pmap_enter_pv: enter a mapping onto a pv_head list
1756  *
1757  * => caller should have the pp_lock locked
1758  * => caller should adjust ptp's wire_count before calling
1759  */
1760 
1761 static struct pv_entry *
1762 pmap_enter_pv(struct pmap_page *pp,
1763 	      struct pv_entry *pve,	/* preallocated pve for us to use */
1764 	      struct pv_entry **sparepve,
1765 	      struct vm_page *ptp,
1766 	      vaddr_t va)
1767 {
1768 
1769 	KASSERT(ptp == NULL || ptp->wire_count >= 2);
1770 	KASSERT(ptp == NULL || ptp->uobject != NULL);
1771 	KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset);
1772 	KASSERT(pp_locked(pp));
1773 
1774 	if ((pp->pp_flags & PP_EMBEDDED) == 0) {
1775 		if (LIST_EMPTY(&pp->pp_head.pvh_list)) {
1776 			pp->pp_flags |= PP_EMBEDDED;
1777 			pp->pp_pte.pte_ptp = ptp;
1778 			pp->pp_pte.pte_va = va;
1779 
1780 			return pve;
1781 		}
1782 	} else {
1783 		struct pv_entry *pve2;
1784 
1785 		pve2 = *sparepve;
1786 		*sparepve = NULL;
1787 
1788 		pve2->pve_pte = pp->pp_pte;
1789 		pp->pp_flags &= ~PP_EMBEDDED;
1790 		LIST_INIT(&pp->pp_head.pvh_list);
1791 		insert_pv(pp, pve2);
1792 	}
1793 
1794 	pve->pve_pte.pte_ptp = ptp;
1795 	pve->pve_pte.pte_va = va;
1796 	insert_pv(pp, pve);
1797 
1798 	return NULL;
1799 }
1800 
1801 /*
1802  * pmap_remove_pv: try to remove a mapping from a pv_list
1803  *
1804  * => caller should hold pp_lock [so that attrs can be adjusted]
1805  * => caller should adjust ptp's wire_count and free PTP if needed
1806  * => we return the removed pve
1807  */
1808 
1809 static struct pv_entry *
1810 pmap_remove_pv(struct pmap_page *pp, struct vm_page *ptp, vaddr_t va)
1811 {
1812 	struct pv_hash_head *hh;
1813 	struct pv_entry *pve;
1814 	kmutex_t *lock;
1815 	u_int hash;
1816 
1817 	KASSERT(ptp == NULL || ptp->uobject != NULL);
1818 	KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset);
1819 	KASSERT(pp_locked(pp));
1820 
1821 	if ((pp->pp_flags & PP_EMBEDDED) != 0) {
1822 		KASSERT(pp->pp_pte.pte_ptp == ptp);
1823 		KASSERT(pp->pp_pte.pte_va == va);
1824 
1825 		pp->pp_flags &= ~PP_EMBEDDED;
1826 		LIST_INIT(&pp->pp_head.pvh_list);
1827 
1828 		return NULL;
1829 	}
1830 
1831 	hash = pvhash_hash(ptp, va);
1832 	lock = pvhash_lock(hash);
1833 	hh = pvhash_head(hash);
1834 	mutex_spin_enter(lock);
1835 	pve = pvhash_remove(hh, ptp, va);
1836 	mutex_spin_exit(lock);
1837 
1838 	LIST_REMOVE(pve, pve_list);
1839 
1840 	return pve;
1841 }
1842 
1843 /*
1844  * p t p   f u n c t i o n s
1845  */
1846 
1847 static inline struct vm_page *
1848 pmap_find_ptp(struct pmap *pmap, vaddr_t va, paddr_t pa, int level)
1849 {
1850 	int lidx = level - 1;
1851 	struct vm_page *pg;
1852 
1853 	KASSERT(mutex_owned(&pmap->pm_lock));
1854 
1855 	if (pa != (paddr_t)-1 && pmap->pm_ptphint[lidx] &&
1856 	    pa == VM_PAGE_TO_PHYS(pmap->pm_ptphint[lidx])) {
1857 		return (pmap->pm_ptphint[lidx]);
1858 	}
1859 	PMAP_SUBOBJ_LOCK(pmap, lidx);
1860 	pg = uvm_pagelookup(&pmap->pm_obj[lidx], ptp_va2o(va, level));
1861 	PMAP_SUBOBJ_UNLOCK(pmap, lidx);
1862 
1863 	KASSERT(pg == NULL || pg->wire_count >= 1);
1864 	return pg;
1865 }
1866 
1867 static inline void
1868 pmap_freepage(struct pmap *pmap, struct vm_page *ptp, int level)
1869 {
1870 	int lidx;
1871 	struct uvm_object *obj;
1872 
1873 	KASSERT(ptp->wire_count == 1);
1874 
1875 	lidx = level - 1;
1876 
1877 	obj = &pmap->pm_obj[lidx];
1878 	pmap_stats_update(pmap, -1, 0);
1879 	if (lidx != 0)
1880 		mutex_enter(&obj->vmobjlock);
1881 	if (pmap->pm_ptphint[lidx] == ptp)
1882 		pmap->pm_ptphint[lidx] = TAILQ_FIRST(&obj->memq);
1883 	ptp->wire_count = 0;
1884 	uvm_pagerealloc(ptp, NULL, 0);
1885 	VM_PAGE_TO_PP(ptp)->pp_link = curlwp->l_md.md_gc_ptp;
1886 	curlwp->l_md.md_gc_ptp = ptp;
1887 	if (lidx != 0)
1888 		mutex_exit(&obj->vmobjlock);
1889 }
1890 
1891 static void
1892 pmap_free_ptp(struct pmap *pmap, struct vm_page *ptp, vaddr_t va,
1893 	      pt_entry_t *ptes, pd_entry_t * const *pdes)
1894 {
1895 	unsigned long index;
1896 	int level;
1897 	vaddr_t invaladdr;
1898 #ifdef MULTIPROCESSOR
1899 	vaddr_t invaladdr2;
1900 #endif
1901 	pd_entry_t opde;
1902 	struct pmap *curpmap = vm_map_pmap(&curlwp->l_proc->p_vmspace->vm_map);
1903 
1904 	KASSERT(pmap != pmap_kernel());
1905 	KASSERT(mutex_owned(&pmap->pm_lock));
1906 	KASSERT(kpreempt_disabled());
1907 
1908 	level = 1;
1909 	do {
1910 		index = pl_i(va, level + 1);
1911 		opde = pmap_pte_testset(&pdes[level - 1][index], 0);
1912 #if defined(XEN) && defined(__x86_64__)
1913 		/*
1914 		 * If ptp is an L3 currently mapped in kernel space,
1915 		 * clear it before freeing
1916 		 */
1917 		if (pmap->pm_pdirpa == xen_current_user_pgd
1918 		    && level == PTP_LEVELS - 1)
1919 			pmap_pte_set(&pmap_kernel()->pm_pdir[index], 0);
1920 #endif /* XEN && __x86_64__ */
1921 		pmap_freepage(pmap, ptp, level);
1922 		invaladdr = level == 1 ? (vaddr_t)ptes :
1923 		    (vaddr_t)pdes[level - 2];
1924 		pmap_tlb_shootdown(curpmap, invaladdr + index * PAGE_SIZE,
1925 		    0, opde);
1926 #if defined(MULTIPROCESSOR)
1927 		invaladdr2 = level == 1 ? (vaddr_t)PTE_BASE :
1928 		    (vaddr_t)normal_pdes[level - 2];
1929 		if (pmap != curpmap || invaladdr != invaladdr2) {
1930 			pmap_tlb_shootdown(pmap, invaladdr2 + index * PAGE_SIZE,
1931 			    0, opde);
1932 		}
1933 #endif
1934 		if (level < PTP_LEVELS - 1) {
1935 			ptp = pmap_find_ptp(pmap, va, (paddr_t)-1, level + 1);
1936 			ptp->wire_count--;
1937 			if (ptp->wire_count > 1)
1938 				break;
1939 		}
1940 	} while (++level < PTP_LEVELS);
1941 	pmap_pte_flush();
1942 }
1943 
1944 /*
1945  * pmap_get_ptp: get a PTP (if there isn't one, allocate a new one)
1946  *
1947  * => pmap should NOT be pmap_kernel()
1948  * => pmap should be locked
1949  * => preemption should be disabled
1950  */
1951 
1952 static struct vm_page *
1953 pmap_get_ptp(struct pmap *pmap, vaddr_t va, pd_entry_t * const *pdes)
1954 {
1955 	struct vm_page *ptp, *pptp;
1956 	int i;
1957 	unsigned long index;
1958 	pd_entry_t *pva;
1959 	paddr_t ppa, pa;
1960 	struct uvm_object *obj;
1961 
1962 	KASSERT(pmap != pmap_kernel());
1963 	KASSERT(mutex_owned(&pmap->pm_lock));
1964 	KASSERT(kpreempt_disabled());
1965 
1966 	ptp = NULL;
1967 	pa = (paddr_t)-1;
1968 
1969 	/*
1970 	 * Loop through all page table levels seeing if we need to
1971 	 * add a new page to that level.
1972 	 */
1973 	for (i = PTP_LEVELS; i > 1; i--) {
1974 		/*
1975 		 * Save values from previous round.
1976 		 */
1977 		pptp = ptp;
1978 		ppa = pa;
1979 
1980 		index = pl_i(va, i);
1981 		pva = pdes[i - 2];
1982 
1983 		if (pmap_valid_entry(pva[index])) {
1984 			ppa = pmap_pte2pa(pva[index]);
1985 			ptp = NULL;
1986 			continue;
1987 		}
1988 
1989 		obj = &pmap->pm_obj[i-2];
1990 		PMAP_SUBOBJ_LOCK(pmap, i - 2);
1991 		ptp = uvm_pagealloc(obj, ptp_va2o(va, i - 1), NULL,
1992 		    UVM_PGA_USERESERVE|UVM_PGA_ZERO);
1993 		PMAP_SUBOBJ_UNLOCK(pmap, i - 2);
1994 
1995 		if (ptp == NULL)
1996 			return NULL;
1997 
1998 		ptp->flags &= ~PG_BUSY; /* never busy */
1999 		ptp->wire_count = 1;
2000 		pmap->pm_ptphint[i - 2] = ptp;
2001 		pa = VM_PAGE_TO_PHYS(ptp);
2002 		pmap_pte_set(&pva[index], (pd_entry_t)
2003 		        (pmap_pa2pte(pa) | PG_u | PG_RW | PG_V));
2004 #if defined(XEN) && defined(__x86_64__)
2005 		/*
2006 		 * In Xen we must enter the mapping in kernel map too
2007 		 * if pmap is curmap and modifying top level (PGD)
2008 		 */
2009 		if(i == PTP_LEVELS && pmap != pmap_kernel()) {
2010 		        pmap_pte_set(&pmap_kernel()->pm_pdir[index],
2011 		                (pd_entry_t) (pmap_pa2pte(pa)
2012 		                        | PG_u | PG_RW | PG_V));
2013 		}
2014 #endif /* XEN && __x86_64__ */
2015 		pmap_pte_flush();
2016 		pmap_stats_update(pmap, 1, 0);
2017 		/*
2018 		 * If we're not in the top level, increase the
2019 		 * wire count of the parent page.
2020 		 */
2021 		if (i < PTP_LEVELS) {
2022 			if (pptp == NULL)
2023 				pptp = pmap_find_ptp(pmap, va, ppa, i);
2024 #ifdef DIAGNOSTIC
2025 			if (pptp == NULL)
2026 				panic("pde page disappeared");
2027 #endif
2028 			pptp->wire_count++;
2029 		}
2030 	}
2031 
2032 	/*
2033 	 * ptp is not NULL if we just allocated a new ptp. If it's
2034 	 * still NULL, we must look up the existing one.
2035 	 */
2036 	if (ptp == NULL) {
2037 		ptp = pmap_find_ptp(pmap, va, ppa, 1);
2038 #ifdef DIAGNOSTIC
2039 		if (ptp == NULL) {
2040 			printf("va %" PRIxVADDR " ppa %" PRIxPADDR "\n",
2041 			    va, ppa);
2042 			panic("pmap_get_ptp: unmanaged user PTP");
2043 		}
2044 #endif
2045 	}
2046 
2047 	pmap->pm_ptphint[0] = ptp;
2048 	return(ptp);
2049 }
2050 
2051 /*
2052  * p m a p  l i f e c y c l e   f u n c t i o n s
2053  */
2054 
2055 /*
2056  * pmap_pdp_ctor: constructor for the PDP cache.
2057  */
2058 
2059 int
2060 pmap_pdp_ctor(void *arg, void *v, int flags)
2061 {
2062 	pd_entry_t *pdir = v;
2063 	paddr_t pdirpa = 0;	/* XXX: GCC */
2064 	vaddr_t object;
2065 	int i;
2066 
2067 #if !defined(XEN) || !defined(__x86_64__)
2068 	int npde;
2069 #endif
2070 #ifdef XEN
2071 	int s;
2072 #endif
2073 
2074 	/*
2075 	 * NOTE: The `pmap_lock' is held when the PDP is allocated.
2076 	 */
2077 
2078 #if defined(XEN) && defined(__x86_64__)
2079 	/* fetch the physical address of the page directory. */
2080 	(void) pmap_extract(pmap_kernel(), (vaddr_t) pdir, &pdirpa);
2081 
2082 	/* zero init area */
2083 	memset (pdir, 0, PAGE_SIZE); /* Xen wants a clean page */
2084 	/*
2085 	 * this pdir will NEVER be active in kernel mode
2086 	 * so mark recursive entry invalid
2087 	 */
2088 	pdir[PDIR_SLOT_PTE] = pmap_pa2pte(pdirpa) | PG_u;
2089 	/*
2090 	 * A PDP constructed this way will never be active for the kernel,
2091 	 * so we don't enter the kernel mappings here on Xen.
2092 	 * But we need to keep pmap_create() happy, so put a dummy (non-PG_V)
2093 	 * value in the right slot.
2094 	 */
2095 	pdir[PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1] - 1] =
2096 	     (pd_entry_t)-1 & PG_FRAME;
2097 #else /* XEN && __x86_64__*/
2098 	/* zero init area */
2099 	memset(pdir, 0, PDIR_SLOT_PTE * sizeof(pd_entry_t));
2100 
2101 	object = (vaddr_t)v;
2102 	for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) {
2103 		/* fetch the physical address of the page directory. */
2104 		(void) pmap_extract(pmap_kernel(), object, &pdirpa);
2105 		/* put in recursive PDE to map the PTEs */
2106 		pdir[PDIR_SLOT_PTE + i] = pmap_pa2pte(pdirpa) | PG_V;
2107 #ifndef XEN
2108 		pdir[PDIR_SLOT_PTE + i] |= PG_KW;
2109 #endif
2110 	}
2111 
2112 	/* copy kernel's PDE */
2113 	npde = nkptp[PTP_LEVELS - 1];
2114 
2115 	memcpy(&pdir[PDIR_SLOT_KERN], &PDP_BASE[PDIR_SLOT_KERN],
2116 	    npde * sizeof(pd_entry_t));
2117 
2118 	/* zero the rest */
2119 	memset(&pdir[PDIR_SLOT_KERN + npde], 0,
2120 	    (NTOPLEVEL_PDES - (PDIR_SLOT_KERN + npde)) * sizeof(pd_entry_t));
2121 
2122 	if (VM_MIN_KERNEL_ADDRESS != KERNBASE) {
2123 		int idx = pl_i(KERNBASE, PTP_LEVELS);
2124 
2125 		pdir[idx] = PDP_BASE[idx];
2126 	}
2127 #endif /* XEN  && __x86_64__*/
2128 #ifdef XEN
2129 	s = splvm();
2130 	object = (vaddr_t)v;
2131 	for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) {
2132 		(void) pmap_extract(pmap_kernel(), object, &pdirpa);
2133 		/* remap this page RO */
2134 		pmap_kenter_pa(object, pdirpa, VM_PROT_READ, 0);
2135 		pmap_update(pmap_kernel());
2136 		/*
2137 		 * pin as L2/L4 page, we have to do the page with the
2138 		 * PDIR_SLOT_PTE entries last
2139 		 */
2140 #ifdef PAE
2141 		if (i == l2tol3(PDIR_SLOT_PTE))
2142 			continue;
2143 #endif
2144 		xpq_queue_pin_table(xpmap_ptom_masked(pdirpa));
2145 	}
2146 #ifdef PAE
2147 	object = ((vaddr_t)pdir) + PAGE_SIZE  * l2tol3(PDIR_SLOT_PTE);
2148 	(void)pmap_extract(pmap_kernel(), object, &pdirpa);
2149 	xpq_queue_pin_table(xpmap_ptom_masked(pdirpa));
2150 #endif
2151 	splx(s);
2152 #endif /* XEN */
2153 
2154 	return (0);
2155 }
2156 
2157 /*
2158  * pmap_pdp_dtor: destructor for the PDP cache.
2159  */
2160 
2161 void
2162 pmap_pdp_dtor(void *arg, void *v)
2163 {
2164 #ifdef XEN
2165 	paddr_t pdirpa = 0;	/* XXX: GCC */
2166 	vaddr_t object = (vaddr_t)v;
2167 	int i;
2168 	int s = splvm();
2169 	pt_entry_t *pte;
2170 
2171 	for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) {
2172 		/* fetch the physical address of the page directory. */
2173 		(void) pmap_extract(pmap_kernel(), object, &pdirpa);
2174 		/* unpin page table */
2175 		xpq_queue_unpin_table(xpmap_ptom_masked(pdirpa));
2176 	}
2177 	object = (vaddr_t)v;
2178 	for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) {
2179 		/* Set page RW again */
2180 		pte = kvtopte(object);
2181 		xpq_queue_pte_update(xpmap_ptetomach(pte), *pte | PG_RW);
2182 		xpq_queue_invlpg((vaddr_t)object);
2183 	}
2184 	splx(s);
2185 #endif  /* XEN */
2186 }
2187 
2188 #ifdef PAE
2189 
2190 /* pmap_pdp_alloc: Allocate a page for the pdp memory pool. */
2191 
2192 void *
2193 pmap_pdp_alloc(struct pool *pp, int flags)
2194 {
2195 	return (void *)uvm_km_alloc(kernel_map,
2196 	    PAGE_SIZE * PDP_SIZE, PAGE_SIZE * PDP_SIZE,
2197 	    ((flags & PR_WAITOK) ? 0 : UVM_KMF_NOWAIT | UVM_KMF_TRYLOCK)
2198 	    | UVM_KMF_WIRED);
2199 }
2200 
2201 /*
2202  * pmap_pdp_free: free a PDP
2203  */
2204 
2205 void
2206 pmap_pdp_free(struct pool *pp, void *v)
2207 {
2208 	uvm_km_free(kernel_map, (vaddr_t)v, PAGE_SIZE * PDP_SIZE,
2209 	    UVM_KMF_WIRED);
2210 }
2211 #endif /* PAE */
2212 
2213 /*
2214  * pmap_create: create a pmap
2215  *
2216  * => note: the old pmap interface took a "size" arg which allowed for
2217  *	the creation of "software only" pmaps (not in bsd).
2218  */
2219 
2220 struct pmap *
2221 pmap_create(void)
2222 {
2223 	struct pmap *pmap;
2224 	int i;
2225 
2226 	pmap = pool_cache_get(&pmap_cache, PR_WAITOK);
2227 
2228 	/* init uvm_object */
2229 	for (i = 0; i < PTP_LEVELS - 1; i++) {
2230 		UVM_OBJ_INIT(&pmap->pm_obj[i], NULL, 1);
2231 		pmap->pm_ptphint[i] = NULL;
2232 	}
2233 	pmap->pm_stats.wired_count = 0;
2234 	/* count the PDP allocd below */
2235 	pmap->pm_stats.resident_count = PDP_SIZE;
2236 #if !defined(__x86_64__)
2237 	pmap->pm_hiexec = 0;
2238 #endif /* !defined(__x86_64__) */
2239 	pmap->pm_flags = 0;
2240 	pmap->pm_cpus = 0;
2241 	pmap->pm_kernel_cpus = 0;
2242 
2243 	/* init the LDT */
2244 	pmap->pm_ldt = NULL;
2245 	pmap->pm_ldt_len = 0;
2246 	pmap->pm_ldt_sel = GSYSSEL(GLDT_SEL, SEL_KPL);
2247 
2248 	/* allocate PDP */
2249  try_again:
2250 	pmap->pm_pdir = pool_cache_get(&pmap_pdp_cache, PR_WAITOK);
2251 
2252 	mutex_enter(&pmaps_lock);
2253 
2254 	if (pmap->pm_pdir[PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1] - 1] == 0) {
2255 		mutex_exit(&pmaps_lock);
2256 		pool_cache_destruct_object(&pmap_pdp_cache, pmap->pm_pdir);
2257 		goto try_again;
2258 	}
2259 
2260 #ifdef PAE
2261 	for (i = 0; i < PDP_SIZE; i++)
2262 		pmap->pm_pdirpa[i] =
2263 		    pmap_pte2pa(pmap->pm_pdir[PDIR_SLOT_PTE + i]);
2264 #else
2265 	pmap->pm_pdirpa = pmap_pte2pa(pmap->pm_pdir[PDIR_SLOT_PTE]);
2266 #endif
2267 
2268 	LIST_INSERT_HEAD(&pmaps, pmap, pm_list);
2269 
2270 	mutex_exit(&pmaps_lock);
2271 
2272 	return (pmap);
2273 }
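
/*
 * Editorial sketch (not part of the original source): the pmap returned
 * here is reference counted via pm_obj[0].uo_refs, so any holder beyond
 * the creating vmspace pairs pmap_reference() with a later
 * pmap_destroy(), e.g.:
 *
 *	struct pmap *pm = pmap_create();
 *	...
 *	pmap_reference(pm);	(take an extra reference, cf. pmap_load())
 *	...
 *	pmap_destroy(pm);	(drops the extra reference)
 *	pmap_destroy(pm);	(last reference: PDP, PTPs etc. are freed)
 */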
2274 
2275 /*
2276  * pmap_destroy: drop reference count on pmap.   free pmap if
2277  *	reference count goes to zero.
2278  */
2279 
2280 void
2281 pmap_destroy(struct pmap *pmap)
2282 {
2283 	int i;
2284 #ifdef DIAGNOSTIC
2285 	struct cpu_info *ci;
2286 	CPU_INFO_ITERATOR cii;
2287 #endif /* DIAGNOSTIC */
2288 
2289 	/*
2290 	 * if we have torn down this pmap, process deferred frees and
2291 	 * invalidations now.
2292 	 */
2293 	if (__predict_false(curlwp->l_md.md_gc_pmap == pmap)) {
2294 		pmap_update(pmap);
2295 	}
2296 
2297 	/*
2298 	 * drop reference count
2299 	 */
2300 
2301 	if (atomic_dec_uint_nv(&pmap->pm_obj[0].uo_refs) > 0) {
2302 		return;
2303 	}
2304 
2305 #ifdef DIAGNOSTIC
2306 	for (CPU_INFO_FOREACH(cii, ci))
2307 		if (ci->ci_pmap == pmap)
2308 			panic("destroying pmap being used");
2309 #endif /* DIAGNOSTIC */
2310 
2311 	/*
2312 	 * reference count is zero, free pmap resources and then free pmap.
2313 	 */
2314 #ifdef XEN
2315 	/*
2316 	 * Xen lazy APDP handling:
2317 	 * clear APDP_PDE if this pmap is the one currently mapped there
2318 	 */
2319 	if (xpmap_ptom_masked(pmap_pdirpa(pmap, 0)) == (*APDP_PDE & PG_FRAME)) {
2320 		kpreempt_disable();
2321 		pmap_unmap_apdp();
2322 		pmap_pte_flush();
2323 	        pmap_apte_flush(pmap_kernel());
2324 	        kpreempt_enable();
2325 	}
2326 #endif
2327 
2328 	/*
2329 	 * remove it from global list of pmaps
2330 	 */
2331 
2332 	mutex_enter(&pmaps_lock);
2333 	LIST_REMOVE(pmap, pm_list);
2334 	mutex_exit(&pmaps_lock);
2335 
2336 	/*
2337 	 * destroyed pmap shouldn't have remaining PTPs
2338 	 */
2339 
2340 	for (i = 0; i < PTP_LEVELS - 1; i++) {
2341 		KASSERT(pmap->pm_obj[i].uo_npages == 0);
2342 		KASSERT(TAILQ_EMPTY(&pmap->pm_obj[i].memq));
2343 	}
2344 
2345 	/*
2346 	 * MULTIPROCESSOR -- no need to flush out of other processors'
2347 	 * APTE space because we do that in pmap_unmap_ptes().
2348 	 */
2349 	pool_cache_put(&pmap_pdp_cache, pmap->pm_pdir);
2350 
2351 #ifdef USER_LDT
2352 	if (pmap->pm_ldt != NULL) {
2353 		/*
2354 		 * no need to switch the LDT; this address space is gone,
2355 		 * nothing is using it.
2356 		 *
2357 		 * No need to lock the pmap for ldt_free (or anything else),
2358 		 * we're the last one to use it.
2359 		 */
2360 		mutex_enter(&cpu_lock);
2361 		ldt_free(pmap->pm_ldt_sel);
2362 		mutex_exit(&cpu_lock);
2363 		uvm_km_free(kernel_map, (vaddr_t)pmap->pm_ldt,
2364 		    pmap->pm_ldt_len, UVM_KMF_WIRED);
2365 	}
2366 #endif
2367 
2368 	for (i = 0; i < PTP_LEVELS - 1; i++)
2369 		mutex_destroy(&pmap->pm_obj[i].vmobjlock);
2370 	pool_cache_put(&pmap_cache, pmap);
2371 }
2372 
2373 /*
2374  * pmap_remove_all: pmap is being torn down by the current thread.
2375  * avoid unnecessary invalidations.
2376  */
2377 
2378 void
2379 pmap_remove_all(struct pmap *pmap)
2380 {
2381 	lwp_t *l = curlwp;
2382 
2383 	KASSERT(l->l_md.md_gc_pmap == NULL);
2384 
2385 	l->l_md.md_gc_pmap = pmap;
2386 }
2387 
2388 #if defined(PMAP_FORK)
2389 /*
2390  * pmap_fork: perform any necessary data structure manipulation when
2391  * a VM space is forked.
2392  */
2393 
2394 void
2395 pmap_fork(struct pmap *pmap1, struct pmap *pmap2)
2396 {
2397 #ifdef USER_LDT
2398 	union descriptor *new_ldt;
2399 	size_t len;
2400 	int sel;
2401 
2402 	if (__predict_true(pmap1->pm_ldt == NULL)) {
2403 		return;
2404 	}
2405 
2406  retry:
2407 	if (pmap1->pm_ldt != NULL) {
2408 		len = pmap1->pm_ldt_len;
2409 		new_ldt = (union descriptor *)uvm_km_alloc(kernel_map, len, 0,
2410 		    UVM_KMF_WIRED);
2411 		mutex_enter(&cpu_lock);
2412 		sel = ldt_alloc(new_ldt, len);
2413 		if (sel == -1) {
2414 			mutex_exit(&cpu_lock);
2415 			uvm_km_free(kernel_map, (vaddr_t)new_ldt, len,
2416 			    UVM_KMF_WIRED);
2417 			printf("WARNING: pmap_fork: unable to allocate LDT\n");
2418 			return;
2419 		}
2420 	} else {
2421 		len = -1;
2422 		new_ldt = NULL;
2423 		sel = -1;
2424 		mutex_enter(&cpu_lock);
2425 	}
2426 
2427  	/* Copy the LDT, if necessary. */
2428  	if (pmap1->pm_ldt != NULL) {
2429 		if (len != pmap1->pm_ldt_len) {
2430 			if (len != -1) {
2431 				ldt_free(sel);
2432 				uvm_km_free(kernel_map, (vaddr_t)new_ldt,
2433 				    len, UVM_KMF_WIRED);
2434 			}
2435 			mutex_exit(&cpu_lock);
2436 			goto retry;
2437 		}
2438 
2439 		memcpy(new_ldt, pmap1->pm_ldt, len);
2440 		pmap2->pm_ldt = new_ldt;
2441 		pmap2->pm_ldt_len = pmap1->pm_ldt_len;
2442 		pmap2->pm_ldt_sel = sel;
2443 		len = -1;
2444 	}
2445 
2446 	if (len != -1) {
2447 		ldt_free(sel);
2448 		uvm_km_free(kernel_map, (vaddr_t)new_ldt, len,
2449 		    UVM_KMF_WIRED);
2450 	}
2451 	mutex_exit(&cpu_lock);
2452 #endif /* USER_LDT */
2453 }
2454 #endif /* PMAP_FORK */
2455 
2456 #ifdef USER_LDT
2457 
2458 /*
2459  * pmap_ldt_xcall: cross call used by pmap_ldt_sync.  if the named pmap
2460  * is active, reload LDTR.
2461  */
2462 static void
2463 pmap_ldt_xcall(void *arg1, void *arg2)
2464 {
2465 	struct pmap *pm;
2466 
2467 	kpreempt_disable();
2468 	pm = arg1;
2469 	if (curcpu()->ci_pmap == pm) {
2470 		lldt(pm->pm_ldt_sel);
2471 	}
2472 	kpreempt_enable();
2473 }
2474 
2475 /*
2476  * pmap_ldt_sync: LDT selector for the named pmap is changing.  swap
2477  * in the new selector on all CPUs.
2478  */
2479 void
2480 pmap_ldt_sync(struct pmap *pm)
2481 {
2482 	uint64_t where;
2483 
2484 	KASSERT(mutex_owned(&cpu_lock));
2485 
2486 	pmap_ldt_evcnt.ev_count++;
2487 	where = xc_broadcast(0, pmap_ldt_xcall, pm, NULL);
2488 	xc_wait(where);
2489 }
2490 
2491 /*
2492  * pmap_ldt_cleanup: if the pmap has a local LDT, deallocate it, and
2493  * restore the default.
2494  */
2495 
2496 void
2497 pmap_ldt_cleanup(struct lwp *l)
2498 {
2499 	pmap_t pmap = l->l_proc->p_vmspace->vm_map.pmap;
2500 	union descriptor *dp = NULL;
2501 	size_t len = 0;
2502 	int sel = -1;
2503 
2504 	if (__predict_true(pmap->pm_ldt == NULL)) {
2505 		return;
2506 	}
2507 
2508 	mutex_enter(&cpu_lock);
2509 	if (pmap->pm_ldt != NULL) {
2510 		sel = pmap->pm_ldt_sel;
2511 		dp = pmap->pm_ldt;
2512 		len = pmap->pm_ldt_len;
2513 		pmap->pm_ldt_sel = GSYSSEL(GLDT_SEL, SEL_KPL);
2514 		pmap->pm_ldt = NULL;
2515 		pmap->pm_ldt_len = 0;
2516 		pmap_ldt_sync(pmap);
2517 		ldt_free(sel);
2518 		uvm_km_free(kernel_map, (vaddr_t)dp, len, UVM_KMF_WIRED);
2519 	}
2520 	mutex_exit(&cpu_lock);
2521 }
2522 #endif /* USER_LDT */
2523 
2524 /*
2525  * pmap_activate: activate a process' pmap
2526  *
2527  * => must be called with kernel preemption disabled
2528  * => if lwp is the curlwp, then set ci_want_pmapload so that
2529  *    actual MMU context switch will be done by pmap_load() later
2530  */
2531 
2532 void
2533 pmap_activate(struct lwp *l)
2534 {
2535 	struct cpu_info *ci;
2536 	struct pmap *pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map);
2537 
2538 	KASSERT(kpreempt_disabled());
2539 
2540 	ci = curcpu();
2541 
2542 	if (l == ci->ci_curlwp) {
2543 		struct pcb *pcb;
2544 
2545 		KASSERT(ci->ci_want_pmapload == 0);
2546 		KASSERT(ci->ci_tlbstate != TLBSTATE_VALID);
2547 #ifdef KSTACK_CHECK_DR0
2548 		/*
2549 		 * setup breakpoint on the top of stack
2550 		 */
2551 		if (l == &lwp0)
2552 			dr0(0, 0, 0, 0);
2553 		else
2554 			dr0(KSTACK_LOWEST_ADDR(l), 1, 3, 1);
2555 #endif
2556 
2557 		/*
2558 		 * no need to switch to kernel vmspace because
2559 		 * it's a subset of any vmspace.
2560 		 */
2561 
2562 		if (pmap == pmap_kernel()) {
2563 			ci->ci_want_pmapload = 0;
2564 			return;
2565 		}
2566 
2567 		pcb = lwp_getpcb(l);
2568 		ci->ci_want_pmapload = 1;
2569 
2570 #if defined(__x86_64__)
2571 		if (pcb->pcb_flags & PCB_GS64)
2572 			wrmsr(MSR_KERNELGSBASE, pcb->pcb_gs);
2573 		if (pcb->pcb_flags & PCB_FS64)
2574 			wrmsr(MSR_FSBASE, pcb->pcb_fs);
2575 #endif /* defined(__x86_64__) */
2576 	}
2577 }
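
/*
 * Editorial sketch (not part of the original source): the intended call
 * sequence around a context switch is roughly the following; the exact
 * call sites live in MD code and are an assumption here:
 *
 *	kpreempt_disable();
 *	pmap_deactivate(oldlwp);	(TLBSTATE_VALID -> TLBSTATE_LAZY)
 *	pmap_activate(newlwp);		(may set ci_want_pmapload)
 *	kpreempt_enable();
 *	...
 *	if (curcpu()->ci_want_pmapload)
 *		pmap_load();		(%cr3 and LDT actually reloaded here)
 */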
2578 
2579 /*
2580  * pmap_reactivate: try to regain reference to the pmap.
2581  *
2582  * => must be called with kernel preemption disabled
2583  */
2584 
2585 static bool
2586 pmap_reactivate(struct pmap *pmap)
2587 {
2588 	struct cpu_info *ci;
2589 	uint32_t cpumask;
2590 	bool result;
2591 	uint32_t oldcpus;
2592 
2593 	ci = curcpu();
2594 	cpumask = ci->ci_cpumask;
2595 
2596 	KASSERT(kpreempt_disabled());
2597 #if defined(XEN) && defined(__x86_64__)
2598 	KASSERT(pmap->pm_pdirpa == xen_current_user_pgd);
2599 #elif defined(PAE)
2600 	KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(pmap_l3pd[0]));
2601 #elif !defined(XEN)
2602 	KASSERT(pmap->pm_pdirpa == pmap_pte2pa(rcr3()));
2603 #endif
2604 
2605 	/*
2606 	 * if we still have a lazy reference to this pmap,
2607 	 * we can assume that there was no tlb shootdown
2608 	 * for this pmap in the meantime.
2609 	 *
2610 	 * the order of events here is important as we must
2611 	 * synchronize with TLB shootdown interrupts.  declare
2612 	 * interest in invalidations (TLBSTATE_VALID) and then
2613 	 * check the cpumask, which the IPIs can change only
2614 	 * when the state is TLBSTATE_LAZY.
2615 	 */
2616 
2617 	ci->ci_tlbstate = TLBSTATE_VALID;
2618 	oldcpus = pmap->pm_cpus;
2619 	KASSERT((pmap->pm_kernel_cpus & cpumask) != 0);
2620 	if (oldcpus & cpumask) {
2621 		/* got it */
2622 		result = true;
2623 	} else {
2624 		/* must reload */
2625 		atomic_or_32(&pmap->pm_cpus, cpumask);
2626 		result = false;
2627 	}
2628 
2629 	return result;
2630 }
2631 
2632 /*
2633  * pmap_load: actually switch pmap.  (fill in %cr3 and LDT info)
2634  */
2635 
2636 void
2637 pmap_load(void)
2638 {
2639 	struct cpu_info *ci;
2640 	uint32_t cpumask;
2641 	struct pmap *pmap;
2642 	struct pmap *oldpmap;
2643 	struct lwp *l;
2644 	struct pcb *pcb;
2645 	uint64_t ncsw;
2646 
2647 	kpreempt_disable();
2648  retry:
2649 	ci = curcpu();
2650 	if (!ci->ci_want_pmapload) {
2651 		kpreempt_enable();
2652 		return;
2653 	}
2654 	cpumask = ci->ci_cpumask;
2655 	l = ci->ci_curlwp;
2656 	ncsw = l->l_ncsw;
2657 
2658 	/* should be able to take ipis. */
2659 	KASSERT(ci->ci_ilevel < IPL_HIGH);
2660 #ifdef XEN
2661 	/* XXX not yet KASSERT(x86_read_psl() != 0); */
2662 #else
2663 	KASSERT((x86_read_psl() & PSL_I) != 0);
2664 #endif
2665 
2666 	KASSERT(l != NULL);
2667 	pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map);
2668 	KASSERT(pmap != pmap_kernel());
2669 	oldpmap = ci->ci_pmap;
2670 	pcb = lwp_getpcb(l);
2671 
2672 	if (pmap == oldpmap) {
2673 		if (!pmap_reactivate(pmap)) {
2674 			u_int gen = uvm_emap_gen_return();
2675 
2676 			/*
2677 			 * the pmap was changed while we were deactivated,
2678 			 * so our TLB may be stale.
2679 			 */
2680 
2681 			tlbflush();
2682 			uvm_emap_update(gen);
2683 		}
2684 
2685 		ci->ci_want_pmapload = 0;
2686 		kpreempt_enable();
2687 		return;
2688 	}
2689 
2690 	/*
2691 	 * grab a reference to the new pmap.
2692 	 */
2693 
2694 	pmap_reference(pmap);
2695 
2696 	/*
2697 	 * actually switch pmap.
2698 	 */
2699 
2700 	atomic_and_32(&oldpmap->pm_cpus, ~cpumask);
2701 	atomic_and_32(&oldpmap->pm_kernel_cpus, ~cpumask);
2702 
2703 #if defined(XEN) && defined(__x86_64__)
2704 	KASSERT(oldpmap->pm_pdirpa == xen_current_user_pgd ||
2705 	    oldpmap == pmap_kernel());
2706 #elif defined(PAE)
2707 	KASSERT(pmap_pdirpa(oldpmap, 0) == pmap_pte2pa(pmap_l3pd[0]));
2708 #elif !defined(XEN)
2709 	KASSERT(oldpmap->pm_pdirpa == pmap_pte2pa(rcr3()));
2710 #endif
2711 	KASSERT((pmap->pm_cpus & cpumask) == 0);
2712 	KASSERT((pmap->pm_kernel_cpus & cpumask) == 0);
2713 
2714 	/*
2715 	 * mark the pmap in use by this processor.  again we must
2716 	 * synchronize with TLB shootdown interrupts, so set the
2717 	 * state VALID first, then register us for shootdown events
2718 	 * on this pmap.
2719 	 */
2720 
2721 	ci->ci_tlbstate = TLBSTATE_VALID;
2722 	atomic_or_32(&pmap->pm_cpus, cpumask);
2723 	atomic_or_32(&pmap->pm_kernel_cpus, cpumask);
2724 	ci->ci_pmap = pmap;
2725 
2726 	/*
2727 	 * update tss.  now that we have registered for invalidations
2728 	 * from other CPUs, we're good to load the page tables.
2729 	 */
2730 #ifdef PAE
2731 	pcb->pcb_cr3 = pmap_l3paddr;
2732 #else
2733 	pcb->pcb_cr3 = pmap->pm_pdirpa;
2734 #endif
2735 #if defined(XEN) && defined(__x86_64__)
2736 	/* kernel pmap always in cr3 and should never go in user cr3 */
2737 	if (pmap_pdirpa(pmap, 0) != pmap_pdirpa(pmap_kernel(), 0)) {
2738 		/*
2739 		 * Map user space address in kernel space and load
2740 		 * user cr3
2741 		 */
2742 		int i, s;
2743 		pd_entry_t *old_pgd, *new_pgd;
2744 		paddr_t addr;
2745 		s = splvm();
2746 		new_pgd  = pmap->pm_pdir;
2747 		old_pgd = pmap_kernel()->pm_pdir;
2748 		addr = xpmap_ptom(pmap_pdirpa(pmap_kernel(), 0));
2749 		for (i = 0; i < PDIR_SLOT_PTE;
2750 		    i++, addr += sizeof(pd_entry_t)) {
2751 			if ((new_pgd[i] & PG_V) || (old_pgd[i] & PG_V))
2752 				xpq_queue_pte_update(addr, new_pgd[i]);
2753 		}
2754 		tlbflush();
2755 		xen_set_user_pgd(pmap_pdirpa(pmap, 0));
2756 		xen_current_user_pgd = pmap_pdirpa(pmap, 0);
2757 		splx(s);
2758 	}
2759 #else /* XEN && x86_64 */
2760 #if defined(XEN)
2761 	/*
2762 	 * clear APDP slot, in case it points to a page table that has
2763 	 * been freed
2764 	 */
2765 	if (*APDP_PDE) {
2766 		pmap_unmap_apdp();
2767 	}
2768 	/* lldt() does pmap_pte_flush() */
2769 #else /* XEN */
2770 #if defined(i386)
2771 	ci->ci_tss.tss_ldt = pmap->pm_ldt_sel;
2772 	ci->ci_tss.tss_cr3 = pcb->pcb_cr3;
2773 #endif
2774 #endif /* XEN */
2775 	lldt(pmap->pm_ldt_sel);
2776 #ifdef PAE
2777 	{
2778 	paddr_t l3_pd = xpmap_ptom_masked(pmap_l3paddr);
2779 	int i;
2780 	int s = splvm();
2781 	/* don't update the kernel L3 slot */
2782 	for (i = 0 ; i < PDP_SIZE - 1; i++, l3_pd += sizeof(pd_entry_t)) {
2783 		xpq_queue_pte_update(l3_pd,
2784 		    xpmap_ptom(pmap->pm_pdirpa[i]) | PG_V);
2785 	}
2786 	tlbflush();
2787 	splx(s);
2788 	}
2789 #else /* PAE */
2790 	{
2791 	u_int gen = uvm_emap_gen_return();
2792 	lcr3(pcb->pcb_cr3);
2793 	uvm_emap_update(gen);
2794 	}
2795 #endif /* PAE */
2796 #endif /* XEN && x86_64 */
2797 
2798 	ci->ci_want_pmapload = 0;
2799 
2800 	/*
2801 	 * we're now running with the new pmap.  drop the reference
2802 	 * to the old pmap.  if we block, we need to go around again.
2803 	 */
2804 
2805 	pmap_destroy(oldpmap);
2806 	if (l->l_ncsw != ncsw) {
2807 		goto retry;
2808 	}
2809 
2810 	kpreempt_enable();
2811 }
2812 
2813 /*
2814  * pmap_deactivate: deactivate a process' pmap
2815  *
2816  * => must be called with kernel preemption disabled (high SPL is enough)
2817  */
2818 
2819 void
2820 pmap_deactivate(struct lwp *l)
2821 {
2822 	struct pmap *pmap;
2823 	struct cpu_info *ci;
2824 
2825 	KASSERT(kpreempt_disabled());
2826 
2827 	if (l != curlwp) {
2828 		return;
2829 	}
2830 
2831 	/*
2832 	 * wait for pending TLB shootdowns to complete.  necessary
2833 	 * because TLB shootdown state is per-CPU, and the LWP may
2834 	 * be coming off the CPU before it has a chance to call
2835 	 * pmap_update().
2836 	 */
2837 	pmap_tlb_shootwait();
2838 
2839 	ci = curcpu();
2840 
2841 	if (ci->ci_want_pmapload) {
2842 		KASSERT(vm_map_pmap(&l->l_proc->p_vmspace->vm_map)
2843 		    != pmap_kernel());
2844 		KASSERT(vm_map_pmap(&l->l_proc->p_vmspace->vm_map)
2845 		    != ci->ci_pmap || ci->ci_tlbstate != TLBSTATE_VALID);
2846 
2847 		/*
2848 		 * userspace has not been touched.
2849 		 * nothing to do here.
2850 		 */
2851 
2852 		ci->ci_want_pmapload = 0;
2853 		return;
2854 	}
2855 
2856 	pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map);
2857 
2858 	if (pmap == pmap_kernel()) {
2859 		return;
2860 	}
2861 
2862 #if defined(XEN) && defined(__x86_64__)
2863 	KASSERT(pmap->pm_pdirpa == xen_current_user_pgd);
2864 #elif defined(PAE)
2865 	KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(pmap_l3pd[0]));
2866 #elif !defined(XEN)
2867 	KASSERT(pmap->pm_pdirpa == pmap_pte2pa(rcr3()));
2868 #endif
2869 	KASSERT(ci->ci_pmap == pmap);
2870 
2871 	/*
2872 	 * we aren't interested in TLB invalidations for this pmap,
2873 	 * at least for the time being.
2874 	 */
2875 
2876 	KASSERT(ci->ci_tlbstate == TLBSTATE_VALID);
2877 	ci->ci_tlbstate = TLBSTATE_LAZY;
2878 }
2879 
2880 /*
2881  * end of lifecycle functions
2882  */
2883 
2884 /*
2885  * some misc. functions
2886  */
2887 
2888 static int
2889 pmap_pdes_invalid(vaddr_t va, pd_entry_t * const *pdes, pd_entry_t *lastpde)
2890 {
2891 	int i;
2892 	unsigned long index;
2893 	pd_entry_t pde;
2894 
2895 	for (i = PTP_LEVELS; i > 1; i--) {
2896 		index = pl_i(va, i);
2897 		pde = pdes[i - 2][index];
2898 		if ((pde & PG_V) == 0)
2899 			return i;
2900 	}
2901 	if (lastpde != NULL)
2902 		*lastpde = pde;
2903 	return 0;
2904 }
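
/*
 * Editorial note: the pmap_pdes_valid() used by the callers below is
 * presumably just the boolean complement of pmap_pdes_invalid(),
 * defined in a header along the lines of (assumption, not verified
 * against the headers):
 *
 *	#define pmap_pdes_valid(va, pdes, lastpde) \
 *		(pmap_pdes_invalid((va), (pdes), (lastpde)) == 0)
 */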
2905 
2906 /*
2907  * pmap_extract: extract a PA for the given VA
2908  */
2909 
2910 bool
2911 pmap_extract(struct pmap *pmap, vaddr_t va, paddr_t *pap)
2912 {
2913 	pt_entry_t *ptes, pte;
2914 	pd_entry_t pde;
2915 	pd_entry_t * const *pdes;
2916 	struct pmap *pmap2;
2917 	struct cpu_info *ci;
2918 	paddr_t pa;
2919 	lwp_t *l;
2920 	bool hard, rv;
2921 
2922 	rv = false;
2923 	pa = 0;
2924 	l = curlwp;
2925 
2926 	KPREEMPT_DISABLE(l);
2927 	ci = l->l_cpu;
2928 	if (__predict_true(!ci->ci_want_pmapload && ci->ci_pmap == pmap) ||
2929 	    pmap == pmap_kernel()) {
2930 		/*
2931 		 * no need to lock, because it's pmap_kernel() or our
2932 		 * own pmap and is active.  if a user pmap, the caller
2933 		 * will hold the vm_map write/read locked and so prevent
2934 		 * entries from disappearing while we are here.  ptps
2935 		 * can disappear via pmap_remove() and pmap_protect(),
2936 		 * but they are called with the vm_map write locked.
2937 		 */
2938 		hard = false;
2939 		ptes = PTE_BASE;
2940 		pdes = normal_pdes;
2941 	} else {
2942 		/* we lose, do it the hard way. */
2943 		hard = true;
2944 		pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
2945 	}
2946 	if (pmap_pdes_valid(va, pdes, &pde)) {
2947 		pte = ptes[pl1_i(va)];
2948 		if (pde & PG_PS) {
2949 			pa = (pde & PG_LGFRAME) | (va & (NBPD_L2 - 1));
2950 			rv = true;
2951 		} else if (__predict_true((pte & PG_V) != 0)) {
2952 			pa = pmap_pte2pa(pte) | (va & (NBPD_L1 - 1));
2953 			rv = true;
2954 		}
2955 	}
2956 	if (__predict_false(hard)) {
2957 		pmap_unmap_ptes(pmap, pmap2);
2958 	}
2959 	KPREEMPT_ENABLE(l);
2960 	if (pap != NULL) {
2961 		*pap = pa;
2962 	}
2963 	return rv;
2964 }
2965 
2966 
2967 /*
2968  * vtophys: virtual address to physical address.  For use by
2969  * machine-dependent code only.
2970  */
2971 
2972 paddr_t
2973 vtophys(vaddr_t va)
2974 {
2975 	paddr_t pa;
2976 
2977 	if (pmap_extract(pmap_kernel(), va, &pa) == true)
2978 		return (pa);
2979 	return (0);
2980 }
2981 
2982 #ifdef XEN
2983 /*
2984  * pmap_extract_ma: extract a MA for the given VA
2985  */
2986 
2987 bool
2988 pmap_extract_ma(struct pmap *pmap, vaddr_t va, paddr_t *pap)
2989 {
2990 	pt_entry_t *ptes, pte;
2991 	pd_entry_t pde;
2992 	pd_entry_t * const *pdes;
2993 	struct pmap *pmap2;
2994 
2995 	kpreempt_disable();
2996 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
2997 	if (!pmap_pdes_valid(va, pdes, &pde)) {
2998 		pmap_unmap_ptes(pmap, pmap2);
2999 		kpreempt_enable();
3000 		return false;
3001 	}
3002 
3003 	pte = ptes[pl1_i(va)];
3004 	pmap_unmap_ptes(pmap, pmap2);
3005 	kpreempt_enable();
3006 
3007 	if (__predict_true((pte & PG_V) != 0)) {
3008 		if (pap != NULL)
3009 			*pap = (pte & PG_FRAME) | (va & (NBPD_L1 - 1));
3010 		return true;
3011 	}
3012 
3013 	return false;
3014 }
3015 
3016 /*
3017  * vtomach: virtual address to machine address.  For use by
3018  * machine-dependent code only.
3019  */
3020 
3021 paddr_t
3022 vtomach(vaddr_t va)
3023 {
3024 	paddr_t pa;
3025 
3026 	if (pmap_extract_ma(pmap_kernel(), va, &pa) == true)
3027 		return (pa);
3028 	return (0);
3029 }
3030 
3031 #endif /* XEN */
3032 
3033 
3034 
3035 /*
3036  * pmap_virtual_space: used during bootup [pmap_steal_memory] to
3037  *	determine the bounds of the kernel virtual addess space.
3038  *	determine the bounds of the kernel virtual address space.
3039 
3040 void
3041 pmap_virtual_space(vaddr_t *startp, vaddr_t *endp)
3042 {
3043 	*startp = virtual_avail;
3044 	*endp = virtual_end;
3045 }
3046 
3047 /*
3048  * pmap_map: map a range of PAs into kvm.
3049  *
3050  * => used during crash dump
3051  * => XXX: pmap_map() should be phased out?
3052  */
3053 
3054 vaddr_t
3055 pmap_map(vaddr_t va, paddr_t spa, paddr_t epa, vm_prot_t prot)
3056 {
3057 	while (spa < epa) {
3058 		pmap_kenter_pa(va, spa, prot, 0);
3059 		va += PAGE_SIZE;
3060 		spa += PAGE_SIZE;
3061 	}
3062 	pmap_update(pmap_kernel());
3063 	return va;
3064 }
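
/*
 * Editorial usage sketch (not part of the original source): a crash
 * dump routine might wire a physical range into KVA with something
 * like the following, relying on the pmap_update() already done by
 * pmap_map() itself before touching the mapping:
 *
 *	va = pmap_map(va, pa, pa + PAGE_SIZE,
 *	    VM_PROT_READ | VM_PROT_WRITE);
 */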
3065 
3066 /*
3067  * pmap_zero_page: zero a page
3068  */
3069 
3070 void
3071 pmap_zero_page(paddr_t pa)
3072 {
3073 	pt_entry_t *zpte;
3074 	void *zerova;
3075 	int id;
3076 
3077 	kpreempt_disable();
3078 	id = cpu_number();
3079 	zpte = PTESLEW(zero_pte, id);
3080 	zerova = VASLEW(zerop, id);
3081 
3082 #ifdef DIAGNOSTIC
3083 	if (*zpte)
3084 		panic("pmap_zero_page: lock botch");
3085 #endif
3086 
3087 	pmap_pte_set(zpte, pmap_pa2pte(pa) | PG_V | PG_RW | PG_M | PG_U | PG_k);
3088 	pmap_pte_flush();
3089 	pmap_update_pg((vaddr_t)zerova);		/* flush TLB */
3090 
3091 	memset(zerova, 0, PAGE_SIZE);
3092 
3093 #if defined(DIAGNOSTIC) || defined(XEN)
3094 	pmap_pte_set(zpte, 0);				/* zap ! */
3095 	pmap_pte_flush();
3096 #endif
3097 	kpreempt_enable();
3098 }
3099 
3100 /*
3101  * pmap_pagezeroidle: the same, for the idle loop page zero'er.
3102  * pmap_pageidlezero: the same, for the idle-loop page zeroer.
3103  * some reason.
3104  */
3105 
3106 bool
3107 pmap_pageidlezero(paddr_t pa)
3108 {
3109 	pt_entry_t *zpte;
3110 	void *zerova;
3111 	bool rv;
3112 	int id;
3113 
3114 	id = cpu_number();
3115 	zpte = PTESLEW(zero_pte, id);
3116 	zerova = VASLEW(zerop, id);
3117 
3118 	KASSERT(cpu_feature & CPUID_SSE2);
3119 	KASSERT(*zpte == 0);
3120 
3121 	pmap_pte_set(zpte, pmap_pa2pte(pa) | PG_V | PG_RW | PG_M | PG_U | PG_k);
3122 	pmap_pte_flush();
3123 	pmap_update_pg((vaddr_t)zerova);		/* flush TLB */
3124 
3125 	rv = sse2_idlezero_page(zerova);
3126 
3127 #if defined(DIAGNOSTIC) || defined(XEN)
3128 	pmap_pte_set(zpte, 0);				/* zap ! */
3129 	pmap_pte_flush();
3130 #endif
3131 
3132 	return rv;
3133 }
3134 
3135 /*
3136  * pmap_copy_page: copy a page
3137  */
3138 
3139 void
3140 pmap_copy_page(paddr_t srcpa, paddr_t dstpa)
3141 {
3142 	pt_entry_t *spte;
3143 	pt_entry_t *dpte;
3144 	void *csrcva;
3145 	void *cdstva;
3146 	int id;
3147 
3148 	kpreempt_disable();
3149 	id = cpu_number();
3150 	spte = PTESLEW(csrc_pte,id);
3151 	dpte = PTESLEW(cdst_pte,id);
3152 	csrcva = VASLEW(csrcp, id);
3153 	cdstva = VASLEW(cdstp, id);
3154 
3155 	KASSERT(*spte == 0 && *dpte == 0);
3156 
3157 	pmap_pte_set(spte, pmap_pa2pte(srcpa) | PG_V | PG_RW | PG_U | PG_k);
3158 	pmap_pte_set(dpte,
3159 	    pmap_pa2pte(dstpa) | PG_V | PG_RW | PG_M | PG_U | PG_k);
3160 	pmap_pte_flush();
3161 	pmap_update_2pg((vaddr_t)csrcva, (vaddr_t)cdstva);
3162 
3163 	memcpy(cdstva, csrcva, PAGE_SIZE);
3164 
3165 #if defined(DIAGNOSTIC) || defined(XEN)
3166 	pmap_pte_set(spte, 0);
3167 	pmap_pte_set(dpte, 0);
3168 	pmap_pte_flush();
3169 #endif
3170 	kpreempt_enable();
3171 }
3172 
3173 static pt_entry_t *
3174 pmap_map_ptp(struct vm_page *ptp)
3175 {
3176 	pt_entry_t *ptppte;
3177 	void *ptpva;
3178 	int id;
3179 
3180 	KASSERT(kpreempt_disabled());
3181 
3182 	id = cpu_number();
3183 	ptppte = PTESLEW(ptp_pte, id);
3184 	ptpva = VASLEW(ptpp, id);
3185 #if !defined(XEN)
3186 	pmap_pte_set(ptppte, pmap_pa2pte(VM_PAGE_TO_PHYS(ptp)) | PG_V | PG_M |
3187 	    PG_RW | PG_U | PG_k);
3188 #else
3189 	pmap_pte_set(ptppte, pmap_pa2pte(VM_PAGE_TO_PHYS(ptp)) | PG_V | PG_M |
3190 	    PG_U | PG_k);
3191 #endif
3192 	pmap_pte_flush();
3193 	pmap_update_pg((vaddr_t)ptpva);
3194 
3195 	return (pt_entry_t *)ptpva;
3196 }
3197 
3198 static void
3199 pmap_unmap_ptp(void)
3200 {
3201 #if defined(DIAGNOSTIC) || defined(XEN)
3202 	pt_entry_t *pte;
3203 
3204 	KASSERT(kpreempt_disabled());
3205 
3206 	pte = PTESLEW(ptp_pte, cpu_number());
3207 	if (*pte != 0) {
3208 		pmap_pte_set(pte, 0);
3209 		pmap_pte_flush();
3210 	}
3211 #endif
3212 }
3213 
3214 static pt_entry_t *
3215 pmap_map_pte(struct pmap *pmap, struct vm_page *ptp, vaddr_t va)
3216 {
3217 
3218 	KASSERT(kpreempt_disabled());
3219 	if (pmap_is_curpmap(pmap)) {
3220 		return &PTE_BASE[pl1_i(va)]; /* (k)vtopte */
3221 	}
3222 	KASSERT(ptp != NULL);
3223 	return pmap_map_ptp(ptp) + pl1_pi(va);
3224 }
3225 
3226 static void
3227 pmap_unmap_pte(void)
3228 {
3229 
3230 	KASSERT(kpreempt_disabled());
3231 
3232 	pmap_unmap_ptp();
3233 }
3234 
3235 /*
3236  * p m a p   r e m o v e   f u n c t i o n s
3237  *
3238  * functions that remove mappings
3239  */
3240 
3241 /*
3242  * pmap_remove_ptes: remove PTEs from a PTP
3243  *
3244  * => must have proper locking on pmap_master_lock
3245  * => caller must hold pmap's lock
3246  * => PTP must be mapped into KVA
3247  * => PTP should be null if pmap == pmap_kernel()
3248  * => must be called with kernel preemption disabled
3249  * => returns composite pte if at least one page should be shot down
3250  */
3251 
3252 static pt_entry_t
3253 pmap_remove_ptes(struct pmap *pmap, struct vm_page *ptp, vaddr_t ptpva,
3254 		 vaddr_t startva, vaddr_t endva, struct pv_entry **pv_tofree)
3255 {
3256 	struct pv_entry *pve;
3257 	pt_entry_t *pte = (pt_entry_t *) ptpva;
3258 	pt_entry_t opte, xpte = 0;
3259 
3260 	KASSERT(pmap == pmap_kernel() || mutex_owned(&pmap->pm_lock));
3261 	KASSERT(kpreempt_disabled());
3262 
3263 	/*
3264 	 * note that ptpva points to the PTE that maps startva.   this may
3265 	 * or may not be the first PTE in the PTP.
3266 	 *
3267 	 * we loop through the PTP while there are still PTEs to look at
3268 	 * and the wire_count is greater than 1 (because we use the wire_count
3269 	 * to keep track of the number of real PTEs in the PTP).
3270 	 */
3271 
3272 	for (/*null*/; startva < endva && (ptp == NULL || ptp->wire_count > 1)
3273 			     ; pte++, startva += PAGE_SIZE) {
3274 		struct vm_page *pg;
3275 		struct pmap_page *pp;
3276 
3277 		if (!pmap_valid_entry(*pte))
3278 			continue;			/* VA not mapped */
3279 
3280 		/* atomically save the old PTE and zap! it */
3281 		opte = pmap_pte_testset(pte, 0);
3282 		if (!pmap_valid_entry(opte)) {
3283 			continue;
3284 		}
3285 
3286 		pmap_exec_account(pmap, startva, opte, 0);
3287 		pmap_stats_update_bypte(pmap, 0, opte);
3288 		xpte |= opte;
3289 
3290 		if (ptp) {
3291 			ptp->wire_count--;		/* dropping a PTE */
3292 			/* Make sure that the PDE is flushed */
3293 			if (ptp->wire_count <= 1)
3294 				xpte |= PG_U;
3295 		}
3296 
3297 		/*
3298 		 * if we are not on a pv_head list we are done.
3299 		 */
3300 
3301 		if ((opte & PG_PVLIST) == 0) {
3302 #if defined(DIAGNOSTIC) && !defined(DOM0OPS)
3303 			if (PHYS_TO_VM_PAGE(pmap_pte2pa(opte)) != NULL)
3304 				panic("pmap_remove_ptes: managed page without "
3305 				      "PG_PVLIST for %#" PRIxVADDR, startva);
3306 #endif
3307 			continue;
3308 		}
3309 
3310 		pg = PHYS_TO_VM_PAGE(pmap_pte2pa(opte));
3311 #ifdef DIAGNOSTIC
3312 		if (pg == NULL)
3313 			panic("pmap_remove_ptes: unmanaged page marked "
3314 			      "PG_PVLIST, va = %#" PRIxVADDR ", "
3315 			      "pa = %#" PRIxPADDR,
3316 			      startva, (paddr_t)pmap_pte2pa(opte));
3317 #endif
3318 
3319 		/* sync R/M bits */
3320 		pp = VM_PAGE_TO_PP(pg);
3321 		pp_lock(pp);
3322 		pp->pp_attrs |= opte;
3323 		pve = pmap_remove_pv(pp, ptp, startva);
3324 		pp_unlock(pp);
3325 
3326 		if (pve != NULL) {
3327 			pve->pve_next = *pv_tofree;
3328 			*pv_tofree = pve;
3329 		}
3330 
3331 		/* end of "for" loop: time for next pte */
3332 	}
3333 
3334 	return xpte;
3335 }
3336 
3337 
3338 /*
3339  * pmap_remove_pte: remove a single PTE from a PTP
3340  *
3341  * => must have proper locking on pmap_master_lock
3342  * => caller must hold pmap's lock
3343  * => PTP must be mapped into KVA
3344  * => PTP should be null if pmap == pmap_kernel()
3345  * => returns true if we removed a mapping
3346  * => must be called with kernel preemption disabled
3347  */
3348 
3349 static bool
3350 pmap_remove_pte(struct pmap *pmap, struct vm_page *ptp, pt_entry_t *pte,
3351 		vaddr_t va, struct pv_entry **pv_tofree)
3352 {
3353 	pt_entry_t opte;
3354 	struct pv_entry *pve;
3355 	struct vm_page *pg;
3356 	struct pmap_page *pp;
3357 
3358 	KASSERT(pmap == pmap_kernel() || mutex_owned(&pmap->pm_lock));
3359 	KASSERT(pmap == pmap_kernel() || kpreempt_disabled());
3360 
3361 	if (!pmap_valid_entry(*pte))
3362 		return(false);		/* VA not mapped */
3363 
3364 	/* atomically save the old PTE and zap! it */
3365 	opte = pmap_pte_testset(pte, 0);
3366 	if (!pmap_valid_entry(opte)) {
3367 		return false;
3368 	}
3369 
3370 	pmap_exec_account(pmap, va, opte, 0);
3371 	pmap_stats_update_bypte(pmap, 0, opte);
3372 
3373 	if (opte & PG_U)
3374 		pmap_tlb_shootdown(pmap, va, 0, opte);
3375 
3376 	if (ptp) {
3377 		ptp->wire_count--;		/* dropping a PTE */
3378 		/* Make sure that the PDE is flushed */
3379 		if ((ptp->wire_count <= 1) && !(opte & PG_U))
3380 			pmap_tlb_shootdown(pmap, va, 0, opte);
3381 	}
3382 
3383 	/*
3384 	 * if we are not on a pv_head list we are done.
3385 	 */
3386 
3387 	if ((opte & PG_PVLIST) == 0) {
3388 #if defined(DIAGNOSTIC) && !defined(DOM0OPS)
3389 		if (PHYS_TO_VM_PAGE(pmap_pte2pa(opte)) != NULL)
3390 			panic("pmap_remove_pte: managed page without "
3391 			      "PG_PVLIST for %#" PRIxVADDR, va);
3392 #endif
3393 		return(true);
3394 	}
3395 
3396 	pg = PHYS_TO_VM_PAGE(pmap_pte2pa(opte));
3397 #ifdef DIAGNOSTIC
3398 	if (pg == NULL)
3399 		panic("pmap_remove_pte: unmanaged page marked "
3400 		    "PG_PVLIST, va = %#" PRIxVADDR ", pa = %#" PRIxPADDR,
3401 		    va, (paddr_t)pmap_pte2pa(opte));
3402 #endif
3403 
3404 	/* sync R/M bits */
3405 	pp = VM_PAGE_TO_PP(pg);
3406 	pp_lock(pp);
3407 	pp->pp_attrs |= opte;
3408 	pve = pmap_remove_pv(pp, ptp, va);
3409 	pp_unlock(pp);
3410 
3411 	if (pve) {
3412 		pve->pve_next = *pv_tofree;
3413 		*pv_tofree = pve;
3414 	}
3415 
3416 	return(true);
3417 }
3418 
3419 /*
3420  * pmap_remove: mapping removal function.
3421  *
3422  * => caller should not be holding any pmap locks
3423  */
3424 
3425 void
3426 pmap_remove(struct pmap *pmap, vaddr_t sva, vaddr_t eva)
3427 {
3428 	pt_entry_t *ptes, xpte = 0;
3429 	pd_entry_t pde;
3430 	pd_entry_t * const *pdes;
3431 	struct pv_entry *pv_tofree = NULL;
3432 	bool result;
3433 	paddr_t ptppa;
3434 	vaddr_t blkendva, va = sva;
3435 	struct vm_page *ptp;
3436 	struct pmap *pmap2;
3437 
3438 	kpreempt_disable();
3439 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);	/* locks pmap */
3440 
3441 	/*
3442 	 * removing one page?  take shortcut function.
3443 	 */
3444 
3445 	if (va + PAGE_SIZE == eva) {
3446 		if (pmap_pdes_valid(va, pdes, &pde)) {
3447 
3448 			/* PA of the PTP */
3449 			ptppa = pmap_pte2pa(pde);
3450 
3451 			/* get PTP if non-kernel mapping */
3452 			if (pmap == pmap_kernel()) {
3453 				/* we never free kernel PTPs */
3454 				ptp = NULL;
3455 			} else {
3456 				ptp = pmap_find_ptp(pmap, va, ptppa, 1);
3457 #ifdef DIAGNOSTIC
3458 				if (ptp == NULL)
3459 					panic("pmap_remove: unmanaged "
3460 					      "PTP detected");
3461 #endif
3462 			}
3463 
3464 			/* do it! */
3465 			result = pmap_remove_pte(pmap, ptp,
3466 			    &ptes[pl1_i(va)], va, &pv_tofree);
3467 
3468 			/*
3469 			 * if mapping removed and the PTP is no longer
3470 			 * being used, free it!
3471 			 */
3472 
3473 			if (result && ptp && ptp->wire_count <= 1)
3474 				pmap_free_ptp(pmap, ptp, va, ptes, pdes);
3475 		}
3476 	} else for (/* null */ ; va < eva ; va = blkendva) {
3477 		int lvl;
3478 
3479 		/* determine range of block */
3480 		blkendva = x86_round_pdr(va+1);
3481 		if (blkendva > eva)
3482 			blkendva = eva;
3483 
3484 		/*
3485 		 * XXXCDC: our PTE mappings should never be removed
3486 		 * with pmap_remove!  if we allow this (and why would
3487 		 * we?) then we end up freeing the pmap's page
3488 		 * directory page (PDP) before we are finished using
3489 		 * it when we hit it in the recursive mapping.  this
3490 		 * is BAD.
3491 		 *
3492 		 * long term solution is to move the PTEs out of user
3493 		 * address space and into kernel address space (up
3494 		 * with APTE).  then we can set VM_MAXUSER_ADDRESS to
3495 		 * be VM_MAX_ADDRESS.
3496 		 */
3497 
3498 		if (pl_i(va, PTP_LEVELS) == PDIR_SLOT_PTE)
3499 			/* XXXCDC: ugly hack to avoid freeing PDP here */
3500 			continue;
3501 
3502 		lvl = pmap_pdes_invalid(va, pdes, &pde);
3503 		if (lvl != 0) {
3504 			/*
3505 			 * skip a range corresponding to an invalid pde.
3506 			 */
3507 			blkendva = (va & ptp_masks[lvl - 1]) + nbpd[lvl - 1];
3508  			continue;
3509 		}
3510 
3511 		/* PA of the PTP */
3512 		ptppa = pmap_pte2pa(pde);
3513 
3514 		/* get PTP if non-kernel mapping */
3515 		if (pmap == pmap_kernel()) {
3516 			/* we never free kernel PTPs */
3517 			ptp = NULL;
3518 		} else {
3519 			ptp = pmap_find_ptp(pmap, va, ptppa, 1);
3520 #ifdef DIAGNOSTIC
3521 			if (ptp == NULL)
3522 				panic("pmap_remove: unmanaged PTP "
3523 				      "detected");
3524 #endif
3525 		}
3526 		xpte |= pmap_remove_ptes(pmap, ptp,
3527 		    (vaddr_t)&ptes[pl1_i(va)], va, blkendva, &pv_tofree);
3528 
3529 		/* if PTP is no longer being used, free it! */
3530 		if (ptp && ptp->wire_count <= 1) {
3531 			pmap_free_ptp(pmap, ptp, va, ptes, pdes);
3532 		}
3533 		if ((xpte & PG_U) != 0)
3534 			pmap_tlb_shootdown(pmap, sva, eva, xpte);
3535 	}
3536 	pmap_unmap_ptes(pmap, pmap2);		/* unlock pmap */
3537 	kpreempt_enable();
3538 
3539 	/* Now we free unused PVs */
3540 	if (pv_tofree)
3541 		pmap_free_pvs(pv_tofree);
3542 }
3543 
3544 /*
3545  * pmap_sync_pv: clear pte bits and return the old value of the pte.
3546  *
3547  * => called with pp_lock held. (thus preemption disabled)
3548  * => issues tlb shootdowns if necessary.
3549  */
3550 
3551 static int
3552 pmap_sync_pv(struct pv_pte *pvpte, pt_entry_t expect, int clearbits,
3553     pt_entry_t *optep)
3554 {
3555 	struct pmap *pmap;
3556 	struct vm_page *ptp;
3557 	vaddr_t va;
3558 	pt_entry_t *ptep;
3559 	pt_entry_t opte;
3560 	pt_entry_t npte;
3561 	bool need_shootdown;
3562 
3563 	ptp = pvpte->pte_ptp;
3564 	va = pvpte->pte_va;
3565 	KASSERT(ptp == NULL || ptp->uobject != NULL);
3566 	KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset);
3567 	pmap = ptp_to_pmap(ptp);
3568 
3569 	KASSERT((expect & ~(PG_FRAME | PG_V)) == 0);
3570 	KASSERT((expect & PG_V) != 0);
3571 	KASSERT(clearbits == ~0 || (clearbits & ~(PG_M | PG_U | PG_RW)) == 0);
3572 	KASSERT(kpreempt_disabled());
3573 
3574 	ptep = pmap_map_pte(pmap, ptp, va);
3575 	do {
3576 		opte = *ptep;
3577 		KASSERT((opte & (PG_M | PG_U)) != PG_M);
3578 		KASSERT((opte & (PG_U | PG_V)) != PG_U);
3579 		KASSERT(opte == 0 || (opte & PG_V) != 0);
3580 		if ((opte & (PG_FRAME | PG_V)) != expect) {
3581 
3582 			/*
3583 			 * we lost a race with a V->P operation like
3584 			 * pmap_remove().  wait for the competitor to
3585 			 * reflect the pte bits into pp_attrs.
3586 			 *
3587 			 * issue a redundant TLB shootdown so that
3588 			 * we can wait for its completion.
3589 			 */
3590 
3591 			pmap_unmap_pte();
3592 			if (clearbits != 0) {
3593 				pmap_tlb_shootdown(pmap, va, 0,
3594 				    (pmap == pmap_kernel() ? PG_G : 0));
3595 			}
3596 			return EAGAIN;
3597 		}
3598 
3599 		/*
3600 		 * check if there's anything to do on this pte.
3601 		 */
3602 
3603 		if ((opte & clearbits) == 0) {
3604 			need_shootdown = false;
3605 			break;
3606 		}
3607 
3608 		/*
3609 		 * we need a shootdown if the pte is cached. (PG_U)
3610 		 *
3611 		 * ...unless we are clearing only the PG_RW bit and
3612 		 * it isn't cached as RW. (PG_M)
3613 		 */
3614 
3615 		need_shootdown = (opte & PG_U) != 0 &&
3616 		    !(clearbits == PG_RW && (opte & PG_M) == 0);
3617 
3618 		npte = opte & ~clearbits;
3619 
3620 		/*
3621 		 * if we need a shootdown anyway, clear PG_U and PG_M.
3622 		 */
3623 
3624 		if (need_shootdown) {
3625 			npte &= ~(PG_U | PG_M);
3626 		}
3627 		KASSERT((npte & (PG_M | PG_U)) != PG_M);
3628 		KASSERT((npte & (PG_U | PG_V)) != PG_U);
3629 		KASSERT(npte == 0 || (opte & PG_V) != 0);
3630 	} while (pmap_pte_cas(ptep, opte, npte) != opte);
3631 
3632 	if (need_shootdown) {
3633 		pmap_tlb_shootdown(pmap, va, 0, opte);
3634 	}
3635 	pmap_unmap_pte();
3636 
3637 	*optep = opte;
3638 	return 0;
3639 }
3640 
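
/*
 * Editorial sketch (not part of the original source): callers drive
 * pmap_sync_pv() with a backoff-and-retry loop, exactly as
 * pmap_page_remove() and pmap_clear_attrs() do below:
 *
 *	count = SPINLOCK_BACKOFF_MIN;
 * startover:
 *	pp_lock(pp);
 *	error = pmap_sync_pv(pvpte, expect, clearbits, &opte);
 *	if (error == EAGAIN) {
 *		pp_unlock(pp);
 *		KERNEL_UNLOCK_ALL(curlwp, &hold_count);
 *		SPINLOCK_BACKOFF(count);
 *		KERNEL_LOCK(hold_count, curlwp);
 *		goto startover;
 *	}
 *	pp->pp_attrs |= opte;
 *	pp_unlock(pp);
 */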
3641 /*
3642  * pmap_page_remove: remove a managed vm_page from all pmaps that map it
3643  *
3644  * => R/M bits are sync'd back to attrs
3645  */
3646 
3647 void
3648 pmap_page_remove(struct vm_page *pg)
3649 {
3650 	struct pmap_page *pp;
3651 	struct pv_pte *pvpte;
3652 	struct pv_entry *killlist = NULL;
3653 	struct vm_page *ptp;
3654 	pt_entry_t expect;
3655 	lwp_t *l;
3656 	int count;
3657 
3658 	l = curlwp;
3659 	pp = VM_PAGE_TO_PP(pg);
3660 	expect = pmap_pa2pte(VM_PAGE_TO_PHYS(pg)) | PG_V;
3661 	count = SPINLOCK_BACKOFF_MIN;
3662 	kpreempt_disable();
3663 startover:
3664 	pp_lock(pp);
3665 	while ((pvpte = pv_pte_first(pp)) != NULL) {
3666 		struct pmap *pmap;
3667 		struct pv_entry *pve;
3668 		pt_entry_t opte;
3669 		vaddr_t va;
3670 		int error;
3671 
3672 		/*
3673 		 * add a reference to the pmap before clearing the pte.
3674 		 * otherwise the pmap can disappear behind us.
3675 		 */
3676 
3677 		ptp = pvpte->pte_ptp;
3678 		pmap = ptp_to_pmap(ptp);
3679 		if (ptp != NULL) {
3680 			pmap_reference(pmap);
3681 		}
3682 
3683 		error = pmap_sync_pv(pvpte, expect, ~0, &opte);
3684 		if (error == EAGAIN) {
3685 			int hold_count;
3686 			pp_unlock(pp);
3687 			KERNEL_UNLOCK_ALL(curlwp, &hold_count);
3688 			if (ptp != NULL) {
3689 				pmap_destroy(pmap);
3690 			}
3691 			SPINLOCK_BACKOFF(count);
3692 			KERNEL_LOCK(hold_count, curlwp);
3693 			goto startover;
3694 		}
3695 
3696 		pp->pp_attrs |= opte;
3697 		va = pvpte->pte_va;
3698 		pve = pmap_remove_pv(pp, ptp, va);
3699 		pp_unlock(pp);
3700 
3701 		/* update the PTP reference count.  free if last reference. */
3702 		if (ptp != NULL) {
3703 			struct pmap *pmap2;
3704 			pt_entry_t *ptes;
3705 			pd_entry_t * const *pdes;
3706 
3707 			KASSERT(pmap != pmap_kernel());
3708 
3709 			pmap_tlb_shootwait();
3710 			pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
3711 			pmap_stats_update_bypte(pmap, 0, opte);
3712 			ptp->wire_count--;
3713 			if (ptp->wire_count <= 1) {
3714 				pmap_free_ptp(pmap, ptp, va, ptes, pdes);
3715 			}
3716 			pmap_unmap_ptes(pmap, pmap2);
3717 			pmap_destroy(pmap);
3718 		} else {
3719 			KASSERT(pmap == pmap_kernel());
3720 			pmap_stats_update_bypte(pmap, 0, opte);
3721 		}
3722 
3723 		if (pve != NULL) {
3724 			pve->pve_next = killlist;	/* mark it for death */
3725 			killlist = pve;
3726 		}
3727 		pp_lock(pp);
3728 	}
3729 	pp_unlock(pp);
3730 	kpreempt_enable();
3731 
3732 	/* Now free unused pvs. */
3733 	pmap_free_pvs(killlist);
3734 }
3735 
3736 /*
3737  * p m a p   a t t r i b u t e  f u n c t i o n s
3738  * functions that test/change managed page's attributes
3739  * since a page can be mapped multiple times we must check each PTE that
3740  * maps it by going down the pv lists.
3741  */
3742 
3743 /*
3744  * pmap_test_attrs: test a page's attributes
3745  */
3746 
3747 bool
3748 pmap_test_attrs(struct vm_page *pg, unsigned testbits)
3749 {
3750 	struct pmap_page *pp;
3751 	struct pv_pte *pvpte;
3752 	pt_entry_t expect;
3753 	u_int result;
3754 
3755 	pp = VM_PAGE_TO_PP(pg);
3756 	if ((pp->pp_attrs & testbits) != 0) {
3757 		return true;
3758 	}
3759 	expect = pmap_pa2pte(VM_PAGE_TO_PHYS(pg)) | PG_V;
3760 	pp_lock(pp);
3761 	for (pvpte = pv_pte_first(pp); pvpte; pvpte = pv_pte_next(pp, pvpte)) {
3762 		pt_entry_t opte;
3763 		int error;
3764 
3765 		if ((pp->pp_attrs & testbits) != 0) {
3766 			break;
3767 		}
3768 		error = pmap_sync_pv(pvpte, expect, 0, &opte);
3769 		if (error == 0) {
3770 			pp->pp_attrs |= opte;
3771 		}
3772 	}
3773 	result = pp->pp_attrs & testbits;
3774 	pp_unlock(pp);
3775 
3776 	/*
3777 	 * note that we exit the for loop early, with pvpte non-NULL,
3778 	 * as soon as pp_attrs contains the bits we are testing for.
3779 	 */
3780 
3781 	return result != 0;
3782 }
3783 
3784 /*
3785  * pmap_clear_attrs: clear the specified attribute for a page.
3786  *
3787  * => we return true if we cleared one of the bits we were asked to
3788  */
3789 
3790 bool
3791 pmap_clear_attrs(struct vm_page *pg, unsigned clearbits)
3792 {
3793 	struct pmap_page *pp;
3794 	struct pv_pte *pvpte;
3795 	u_int result;
3796 	pt_entry_t expect;
3797 	int count;
3798 
3799 	pp = VM_PAGE_TO_PP(pg);
3800 	expect = pmap_pa2pte(VM_PAGE_TO_PHYS(pg)) | PG_V;
3801 	count = SPINLOCK_BACKOFF_MIN;
3802 	kpreempt_disable();
3803 startover:
3804 	pp_lock(pp);
3805 	for (pvpte = pv_pte_first(pp); pvpte; pvpte = pv_pte_next(pp, pvpte)) {
3806 		pt_entry_t opte;
3807 		int error;
3808 
3809 		error = pmap_sync_pv(pvpte, expect, clearbits, &opte);
3810 		if (error == EAGAIN) {
3811 			int hold_count;
3812 			pp_unlock(pp);
3813 			KERNEL_UNLOCK_ALL(curlwp, &hold_count);
3814 			SPINLOCK_BACKOFF(count);
3815 			KERNEL_LOCK(hold_count, curlwp);
3816 			goto startover;
3817 		}
3818 		pp->pp_attrs |= opte;
3819 	}
3820 	result = pp->pp_attrs & clearbits;
3821 	pp->pp_attrs &= ~clearbits;
3822 	pp_unlock(pp);
3823 	kpreempt_enable();
3824 
3825 	return result != 0;
3826 }
3827 
3828 
3829 /*
3830  * p m a p   p r o t e c t i o n   f u n c t i o n s
3831  */
3832 
3833 /*
3834  * pmap_page_protect: change the protection of all recorded mappings
3835  *	of a managed page
3836  *
3837  * => NOTE: this is an inline function in pmap.h
3838  */
3839 
3840 /* see pmap.h */
3841 
3842 /*
3843  * pmap_protect: set the protection of the pages in a pmap
3844  *
3845  * => NOTE: this is an inline function in pmap.h
3846  */
3847 
3848 /* see pmap.h */
3849 
3850 /*
3851  * pmap_write_protect: write-protect pages in a pmap
3852  */
3853 
3854 void
3855 pmap_write_protect(struct pmap *pmap, vaddr_t sva, vaddr_t eva, vm_prot_t prot)
3856 {
3857 	pt_entry_t *ptes, *epte;
3858 	pt_entry_t *spte;
3859 	pd_entry_t * const *pdes;
3860 	vaddr_t blockend, va;
3861 	pt_entry_t opte;
3862 	struct pmap *pmap2;
3863 
3864 	KASSERT(curlwp->l_md.md_gc_pmap != pmap);
3865 
3866 	kpreempt_disable();
3867 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);	/* locks pmap */
3868 
3869 	/* should be ok, but just in case ... */
3870 	sva &= PG_FRAME;
3871 	eva &= PG_FRAME;
3872 
3873 	for (va = sva ; va < eva ; va = blockend) {
3874 
3875 		blockend = (va & L2_FRAME) + NBPD_L2;
3876 		if (blockend > eva)
3877 			blockend = eva;
3878 
3879 		/*
3880 		 * XXXCDC: our PTE mappings should never be write-protected!
3881 		 *
3882 		 * long term solution is to move the PTEs out of user
3883 		 * the long-term solution is to move the PTEs out of user
3884 		 * address space and into kernel address space (up with
3885 		 * APTE); then we can set VM_MAXUSER_ADDRESS to be
3886 		 * VM_MAX_ADDRESS.
3887 
3888 		/* XXXCDC: ugly hack to avoid freeing PDP here */
3889 		if (pl_i(va, PTP_LEVELS) == PDIR_SLOT_PTE)
3890 			continue;
3891 
3892 		/* empty block? */
3893 		if (!pmap_pdes_valid(va, pdes, NULL))
3894 			continue;
3895 
3896 #ifdef DIAGNOSTIC
3897 		if (va >= VM_MAXUSER_ADDRESS &&
3898 		    va < VM_MAX_ADDRESS)
3899 			panic("pmap_write_protect: PTE space");
3900 #endif
3901 
3902 		spte = &ptes[pl1_i(va)];
3903 		epte = &ptes[pl1_i(blockend)];
3904 
3905 		for (/* null */; spte < epte ; spte++) {
3906 			pt_entry_t npte;
3907 
3908 			do {
3909 				opte = *spte;
3910 				if ((~opte & (PG_RW | PG_V)) != 0) {
3911 					goto next;
3912 				}
3913 				npte = opte & ~PG_RW;
3914 			} while (pmap_pte_cas(spte, opte, npte) != opte);
3915 			if ((opte & PG_M) != 0) {
3916 				vaddr_t tva;
3917 
3918 				tva = x86_ptob(spte - ptes);
3919 				pmap_tlb_shootdown(pmap, tva, 0, opte);
3920 			}
3921 next:;
3922 		}
3923 	}
3924 
3925 	pmap_unmap_ptes(pmap, pmap2);	/* unlocks pmap */
3926 	kpreempt_enable();
3927 }
3928 
3929 /*
3930  * end of protection functions
3931  */
3932 
3933 /*
3934  * pmap_unwire: clear the wired bit in the PTE
3935  *
3936  * => mapping should already be in map
3937  */
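/*
 * Illustrative sketch: dropping the wiring on a mapping that was
 * previously entered with the PMAP_WIRED flag:
 *
 *	pmap_unwire(pmap, va);
 */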
3938 
3939 void
3940 pmap_unwire(struct pmap *pmap, vaddr_t va)
3941 {
3942 	pt_entry_t *ptes;
3943 	pd_entry_t * const *pdes;
3944 	struct pmap *pmap2;
3945 
3946 	kpreempt_disable();
3947 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);	/* locks pmap */
3948 
3949 	if (pmap_pdes_valid(va, pdes, NULL)) {
3950 		pt_entry_t *ptep = &ptes[pl1_i(va)];
3951 		pt_entry_t opte = *ptep;
3952 
3953 #ifdef DIAGNOSTIC
3954 		if (!pmap_valid_entry(opte))
3955 			panic("pmap_unwire: invalid (unmapped) va 0x%lx", va);
3956 #endif
3957 		if ((opte & PG_W) != 0) {
3958 			pt_entry_t npte = opte & ~PG_W;
3959 
3960 			opte = pmap_pte_testset(ptep, npte);
3961 			pmap_stats_update_bypte(pmap, npte, opte);
3962 		}
3963 #ifdef DIAGNOSTIC
3964 		else {
3965 			printf("pmap_unwire: wiring for pmap %p va 0x%lx "
3966 			       "didn't change!\n", pmap, va);
3967 		}
3968 #endif
3969 		pmap_unmap_ptes(pmap, pmap2);		/* unlocks map */
3970 	}
3971 #ifdef DIAGNOSTIC
3972 	else {
3973 		panic("pmap_unwire: invalid PDE");
3974 	}
3975 #endif
3976 	kpreempt_enable();
3977 }
3978 
3979 /*
3980  * pmap_copy: copy mappings from one pmap to another
3981  *
3982  * => optional function
3983  * void pmap_copy(dst_pmap, src_pmap, dst_addr, len, src_addr)
3984  */
3985 
3986 /*
3987  * defined as macro in pmap.h
3988  */
3989 
3990 /*
3991  * pmap_enter: enter a mapping into a pmap
3992  *
3993  * => must be done "now" ... no lazy-evaluation
3994  * => we set pmap => pv_head locking
3995  */
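/*
 * Illustrative sketch: entering a writable, wired mapping and letting
 * the caller cope with allocation failure.  The flags combine the
 * access type with PMAP_* modifiers, as checked below.
 *
 *	error = pmap_enter(pmap, va, pa, VM_PROT_READ | VM_PROT_WRITE,
 *	    VM_PROT_WRITE | PMAP_WIRED | PMAP_CANFAIL);
 *	if (error == 0)
 *		pmap_update(pmap);
 */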
3996 #ifdef XEN
3997 int
3998 pmap_enter_ma(struct pmap *pmap, vaddr_t va, paddr_t ma, paddr_t pa,
3999 	   vm_prot_t prot, u_int flags, int domid)
4000 {
4001 #else /* XEN */
4002 int
4003 pmap_enter(struct pmap *pmap, vaddr_t va, paddr_t pa, vm_prot_t prot,
4004 	   u_int flags)
4005 {
4006 	paddr_t ma = pa;
4007 #endif /* XEN */
4008 	pt_entry_t *ptes, opte, npte;
4009 	pt_entry_t *ptep;
4010 	pd_entry_t * const *pdes;
4011 	struct vm_page *ptp, *pg;
4012 	struct pmap_page *new_pp;
4013 	struct pmap_page *old_pp;
4014 	struct pv_entry *old_pve = NULL;
4015 	struct pv_entry *new_pve;
4016 	struct pv_entry *new_pve2;
4017 	int error;
4018 	bool wired = (flags & PMAP_WIRED) != 0;
4019 	struct pmap *pmap2;
4020 
4021 	KASSERT(pmap_initialized);
4022 	KASSERT(curlwp->l_md.md_gc_pmap != pmap);
4023 
4024 #ifdef DIAGNOSTIC
4025 	/* sanity check: totally out of range? */
4026 	if (va >= VM_MAX_KERNEL_ADDRESS)
4027 		panic("pmap_enter: too big");
4028 
4029 	if (va == (vaddr_t) PDP_BASE || va == (vaddr_t) APDP_BASE)
4030 		panic("pmap_enter: trying to map over PDP/APDP!");
4031 
4032 	/* sanity check: kernel PTPs should already have been pre-allocated */
4033 	if (va >= VM_MIN_KERNEL_ADDRESS &&
4034 	    !pmap_valid_entry(pmap->pm_pdir[pl_i(va, PTP_LEVELS)]))
4035 		panic("pmap_enter: missing kernel PTP for va %lx!", va);
4036 #endif /* DIAGNOSTIC */
4037 #ifdef XEN
4038 	KASSERT(domid == DOMID_SELF || pa == 0);
4039 #endif /* XEN */
4040 
4041 	npte = ma | protection_codes[prot] | PG_V;
4042 	if (wired)
4043 	        npte |= PG_W;
4044 	if (flags & PMAP_NOCACHE)
4045 		npte |= PG_N;
4046 	if (va < VM_MAXUSER_ADDRESS)
4047 		npte |= PG_u;
4048 	else if (va < VM_MAX_ADDRESS)
4049 		npte |= (PG_u | PG_RW);	/* XXXCDC: no longer needed? */
4050 	else
4051 		npte |= PG_k;
4052 	if (pmap == pmap_kernel())
4053 		npte |= pmap_pg_g;
4054 	if (flags & VM_PROT_ALL) {
4055 		npte |= PG_U;
4056 		if (flags & VM_PROT_WRITE) {
4057 			KASSERT((npte & PG_RW) != 0);
4058 			npte |= PG_M;
4059 		}
4060 	}
4061 
4062 #ifdef XEN
4063 	if (domid != DOMID_SELF)
4064 		pg = NULL;
4065 	else
4066 #endif
4067 		pg = PHYS_TO_VM_PAGE(pa);
4068 	if (pg != NULL) {
4069 		/* This is a managed page */
4070 		npte |= PG_PVLIST;
4071 		new_pp = VM_PAGE_TO_PP(pg);
4072 	} else {
4073 		new_pp = NULL;
4074 	}
4075 
4076 	/* get pves. */
4077 	new_pve = pool_cache_get(&pmap_pv_cache, PR_NOWAIT);
4078 	new_pve2 = pool_cache_get(&pmap_pv_cache, PR_NOWAIT);
4079 	if (new_pve == NULL || new_pve2 == NULL) {
4080 		if (flags & PMAP_CANFAIL) {
4081 			error = ENOMEM;
4082 			goto out2;
4083 		}
4084 		panic("pmap_enter: pve allocation failed");
4085 	}
4086 
4087 	kpreempt_disable();
4088 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);	/* locks pmap */
4089 	if (pmap == pmap_kernel()) {
4090 		ptp = NULL;
4091 	} else {
4092 		ptp = pmap_get_ptp(pmap, va, pdes);
4093 		if (ptp == NULL) {
4094 			pmap_unmap_ptes(pmap, pmap2);
4095 			if (flags & PMAP_CANFAIL) {
4096 				error = ENOMEM;
4097 				goto out;
4098 			}
4099 			panic("pmap_enter: get ptp failed");
4100 		}
4101 	}
4102 
4103 	/*
4104 	 * update the pte.
4105 	 */
4106 
4107 	ptep = &ptes[pl1_i(va)];
4108 	do {
4109 		opte = *ptep;
4110 
4111 		/*
4112 		 * if the same page, inherit PG_U and PG_M.
4113 		 */
4114 		if (((opte ^ npte) & (PG_FRAME | PG_V)) == 0) {
4115 			npte |= opte & (PG_U | PG_M);
4116 		}
4117 #if defined(XEN)
4118 		if (domid != DOMID_SELF) {
4119 			/* pmap_pte_cas with error handling */
4120 			int s = splvm();
4121 			if (opte != *ptep) {
4122 				splx(s);
4123 				continue;
4124 			}
4125 			error = xpq_update_foreign(
4126 			    vtomach((vaddr_t)ptep), npte, domid);
4127 			splx(s);
4128 			if (error) {
4129 				if (ptp != NULL && ptp->wire_count <= 1) {
4130 					pmap_free_ptp(pmap, ptp, va, ptes, pdes);
4131 				}
4132 				pmap_unmap_ptes(pmap, pmap2);
4133 				goto out;
4134 			}
4135 			break;
4136 		}
4137 #endif /* defined(XEN) */
4138 	} while (pmap_pte_cas(ptep, opte, npte) != opte);
4139 
4140 	/*
4141 	 * update statistics and PTP's reference count.
4142 	 */
4143 
4144 	pmap_stats_update_bypte(pmap, npte, opte);
4145 	if (ptp != NULL && !pmap_valid_entry(opte)) {
4146 		ptp->wire_count++;
4147 	}
4148 	KASSERT(ptp == NULL || ptp->wire_count > 1);
4149 
4150 	/*
4151 	 * if the same page, we can skip pv_entry handling.
4152 	 */
4153 
4154 	if (((opte ^ npte) & (PG_FRAME | PG_V)) == 0) {
4155 		KASSERT(((opte ^ npte) & PG_PVLIST) == 0);
4156 		goto same_pa;
4157 	}
4158 
4159 	/*
4160 	 * if old page is managed, remove pv_entry from its list.
4161 	 */
4162 
4163 	if ((~opte & (PG_V | PG_PVLIST)) == 0) {
4164 		pg = PHYS_TO_VM_PAGE(pmap_pte2pa(opte));
4165 #ifdef DIAGNOSTIC
4166 		if (pg == NULL)
4167 			panic("pmap_enter: PG_PVLIST mapping with "
4168 			      "unmanaged page "
4169 			      "pa = 0x%" PRIx64 " (0x%" PRIx64 ")",
4170 			      (int64_t)pa, (int64_t)atop(pa));
4171 #endif
4172 		old_pp = VM_PAGE_TO_PP(pg);
4173 
4174 		pp_lock(old_pp);
4175 		old_pve = pmap_remove_pv(old_pp, ptp, va);
4176 		old_pp->pp_attrs |= opte;
4177 		pp_unlock(old_pp);
4178 	}
4179 
4180 	/*
4181 	 * if new page is managed, insert pv_entry into its list.
4182 	 */
4183 
4184 	if (new_pp) {
4185 		pp_lock(new_pp);
4186 		new_pve = pmap_enter_pv(new_pp, new_pve, &new_pve2, ptp, va);
4187 		pp_unlock(new_pp);
4188 	}
4189 
4190 same_pa:
4191 	pmap_unmap_ptes(pmap, pmap2);
4192 
4193 	/*
4194 	 * shootdown tlb if necessary.
4195 	 */
4196 
4197 	if ((~opte & (PG_V | PG_U)) == 0 &&
4198 	    ((opte ^ npte) & (PG_FRAME | PG_RW)) != 0) {
4199 		pmap_tlb_shootdown(pmap, va, 0, opte);
4200 	}
4201 
4202 	error = 0;
4203 out:
4204 	kpreempt_enable();
4205 out2:
4206 	if (old_pve != NULL) {
4207 		pool_cache_put(&pmap_pv_cache, old_pve);
4208 	}
4209 	if (new_pve != NULL) {
4210 		pool_cache_put(&pmap_pv_cache, new_pve);
4211 	}
4212 	if (new_pve2 != NULL) {
4213 		pool_cache_put(&pmap_pv_cache, new_pve2);
4214 	}
4215 
4216 	return error;
4217 }
4218 
4219 #ifdef XEN
4220 int
4221 pmap_enter(struct pmap *pmap, vaddr_t va, paddr_t pa, vm_prot_t prot, u_int flags)
4222 {
4223         paddr_t ma;
4224 
4225 	if (__predict_false(pa < pmap_pa_start || pmap_pa_end <= pa)) {
4226 		ma = pa; /* XXX hack */
4227 	} else {
4228 		ma = xpmap_ptom(pa);
4229 	}
4230 
4231 	return pmap_enter_ma(pmap, va, ma, pa, prot, flags, DOMID_SELF);
4232 }
4233 #endif /* XEN */
4234 
4235 static bool
4236 pmap_get_physpage(vaddr_t va, int level, paddr_t *paddrp)
4237 {
4238 	struct vm_page *ptp;
4239 	struct pmap *kpm = pmap_kernel();
4240 
4241 	if (uvm.page_init_done == false) {
4242 		/*
4243 		 * we're growing the kernel pmap early (from
4244 		 * uvm_pageboot_alloc()).  this case must be
4245 		 * handled a little differently.
4246 		 */
4247 
4248 		if (uvm_page_physget(paddrp) == false)
4249 			panic("pmap_get_physpage: out of memory");
4250 		kpreempt_disable();
4251 		pmap_pte_set(early_zero_pte,
4252 		    pmap_pa2pte(*paddrp) | PG_V | PG_RW | PG_k);
4253 		pmap_pte_flush();
4254 		pmap_update_pg((vaddr_t)early_zerop);
4255 		memset(early_zerop, 0, PAGE_SIZE);
4256 #if defined(DIAGNOSTIC) || defined(XEN)
4257 		pmap_pte_set(early_zero_pte, 0);
4258 		pmap_pte_flush();
4259 #endif /* defined(DIAGNOSTIC) || defined(XEN) */
4260 		kpreempt_enable();
4261 	} else {
4262 		/* XXX */
4263 		PMAP_SUBOBJ_LOCK(kpm, level - 1);
4264 		ptp = uvm_pagealloc(&kpm->pm_obj[level - 1],
4265 				    ptp_va2o(va, level), NULL,
4266 				    UVM_PGA_USERESERVE|UVM_PGA_ZERO);
4267 		PMAP_SUBOBJ_UNLOCK(kpm, level - 1);
4268 		if (ptp == NULL)
4269 			panic("pmap_get_physpage: out of memory");
4270 		ptp->flags &= ~PG_BUSY;
4271 		ptp->wire_count = 1;
4272 		*paddrp = VM_PAGE_TO_PHYS(ptp);
4273 	}
4274 	pmap_stats_update(kpm, 1, 0);
4275 	return true;
4276 }
4277 
4278 /*
4279  * Allocate the specified number of PTPs for a PTP level, and populate
4280  * all levels below accordingly, mapping virtual addresses starting at
4281  * kva.
4282  *
4283  * Used by pmap_growkernel.
4284  */
4285 static void
4286 pmap_alloc_level(pd_entry_t * const *pdes, vaddr_t kva, int lvl,
4287     long *needed_ptps)
4288 {
4289 	unsigned long i;
4290 	vaddr_t va;
4291 	paddr_t pa;
4292 	unsigned long index, endindex;
4293 	int level;
4294 	pd_entry_t *pdep;
4295 #ifdef XEN
4296 	int s = splvm(); /* protect xpq_* */
4297 #endif
4298 
4299 	for (level = lvl; level > 1; level--) {
4300 		if (level == PTP_LEVELS)
4301 			pdep = pmap_kernel()->pm_pdir;
4302 		else
4303 			pdep = pdes[level - 2];
4304 		va = kva;
4305 		index = pl_i_roundup(kva, level);
4306 		endindex = index + needed_ptps[level - 1] - 1;
4307 
4308 
4309 		for (i = index; i <= endindex; i++) {
4310 			KASSERT(!pmap_valid_entry(pdep[i]));
4311 			pmap_get_physpage(va, level - 1, &pa);
4312 #ifdef XEN
4313 			xpq_queue_pte_update((level == PTP_LEVELS) ?
4314 			    xpmap_ptom(pmap_pdirpa(pmap_kernel(), i)) :
4315 			    xpmap_ptetomach(&pdep[i]),
4316 			    pmap_pa2pte(pa) | PG_k | PG_V | PG_RW);
4317 #ifdef PAE
4318 			if (level == PTP_LEVELS &&  i > L2_SLOT_KERN) {
4319 				/* update real kernel PD too */
4320 				xpq_queue_pte_update(
4321 				    xpmap_ptetomach(&pmap_kl2pd[l2tol2(i)]),
4322 				    pmap_pa2pte(pa) | PG_k | PG_V | PG_RW);
4323 			}
4324 #endif
4325 #else /* XEN */
4326 			pdep[i] = pa | PG_RW | PG_V;
4327 #endif /* XEN */
4328 			KASSERT(level != PTP_LEVELS || nkptp[level - 1] +
4329 			    pl_i(VM_MIN_KERNEL_ADDRESS, level) == i);
4330 			nkptp[level - 1]++;
4331 			va += nbpd[level - 1];
4332 		}
4333 		pmap_pte_flush();
4334 	}
4335 #ifdef XEN
4336 	splx(s);
4337 #endif
4338 }
4339 
4340 /*
4341  * pmap_growkernel: increase usage of KVM space
4342  *
4343  * => we allocate new PTPs for the kernel and install them in all
4344  *	the pmaps on the system.
4345  */
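/*
 * Illustrative sketch: a kernel VA allocator would grow the managed
 * range before handing out addresses above pmap_maxkvaddr.  'new_end'
 * is an assumed local naming the highest address about to be used.
 *
 *	if (new_end > pmap_maxkvaddr)
 *		(void)pmap_growkernel(new_end);
 */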
4346 
4347 vaddr_t
4348 pmap_growkernel(vaddr_t maxkvaddr)
4349 {
4350 	struct pmap *kpm = pmap_kernel();
4351 #if !defined(XEN) || !defined(__x86_64__)
4352 	struct pmap *pm;
4353 #endif
4354 	int s, i;
4355 	long needed_kptp[PTP_LEVELS], target_nptp, old;
4356 	bool invalidate = false;
4357 
4358 	s = splvm();	/* to be safe */
4359 	mutex_enter(&kpm->pm_lock);
4360 
4361 	if (maxkvaddr <= pmap_maxkvaddr) {
4362 		mutex_exit(&kpm->pm_lock);
4363 		splx(s);
4364 		return pmap_maxkvaddr;
4365 	}
4366 
4367 	maxkvaddr = x86_round_pdr(maxkvaddr);
4368 	old = nkptp[PTP_LEVELS - 1];
4369 	/*
4370 	 * This loop could be optimized more, but pmap_growkernel()
4371 	 * is called infrequently.
4372 	 */
4373 	for (i = PTP_LEVELS - 1; i >= 1; i--) {
4374 		target_nptp = pl_i_roundup(maxkvaddr, i + 1) -
4375 		    pl_i_roundup(VM_MIN_KERNEL_ADDRESS, i + 1);
4376 		/*
4377 		 * XXX only need to check toplevel.
4378 		 */
4379 		if (target_nptp > nkptpmax[i])
4380 			panic("out of KVA space");
4381 		KASSERT(target_nptp >= nkptp[i]);
4382 		needed_kptp[i] = target_nptp - nkptp[i];
4383 	}
4384 
4385 	pmap_alloc_level(normal_pdes, pmap_maxkvaddr, PTP_LEVELS, needed_kptp);
4386 
4387 	/*
4388 	 * If the number of top level entries changed, update all
4389 	 * pmaps.
4390 	 */
4391 	if (needed_kptp[PTP_LEVELS - 1] != 0) {
4392 #ifdef XEN
4393 #ifdef __x86_64__
4394 		/* nothing, kernel entries are never entered in user pmap */
4395 #else /* __x86_64__ */
4396 		mutex_enter(&pmaps_lock);
4397 		LIST_FOREACH(pm, &pmaps, pm_list) {
4398 			int pdkidx;
4399 			for (pdkidx =  PDIR_SLOT_KERN + old;
4400 			    pdkidx < PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1];
4401 			    pdkidx++) {
4402 				xpq_queue_pte_update(
4403 				    xpmap_ptom(pmap_pdirpa(pm, pdkidx)),
4404 				    kpm->pm_pdir[pdkidx]);
4405 			}
4406 			xpq_flush_queue();
4407 		}
4408 		mutex_exit(&pmaps_lock);
4409 #endif /* __x86_64__ */
4410 #else /* XEN */
4411 		unsigned newpdes;
4412 		newpdes = nkptp[PTP_LEVELS - 1] - old;
4413 		mutex_enter(&pmaps_lock);
4414 		LIST_FOREACH(pm, &pmaps, pm_list) {
4415 			memcpy(&pm->pm_pdir[PDIR_SLOT_KERN + old],
4416 			       &kpm->pm_pdir[PDIR_SLOT_KERN + old],
4417 			       newpdes * sizeof (pd_entry_t));
4418 		}
4419 		mutex_exit(&pmaps_lock);
4420 #endif
4421 		invalidate = true;
4422 	}
4423 	pmap_maxkvaddr = maxkvaddr;
4424 	mutex_exit(&kpm->pm_lock);
4425 	splx(s);
4426 
4427 	if (invalidate) {
4428 		/* Invalidate the PDP cache. */
4429 		pool_cache_invalidate(&pmap_pdp_cache);
4430 	}
4431 
4432 	return maxkvaddr;
4433 }
4434 
4435 #ifdef DEBUG
4436 void pmap_dump(struct pmap *, vaddr_t, vaddr_t);
4437 
4438 /*
4439  * pmap_dump: dump all the mappings from a pmap
4440  *
4441  * => caller should not be holding any pmap locks
4442  */
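/*
 * Illustrative sketch (DEBUG only): dumping the user mappings of the
 * current process, assuming the usual curproc/vmspace accessors:
 *
 *	pmap_dump(curproc->p_vmspace->vm_map.pmap, 0, VM_MAXUSER_ADDRESS);
 */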
4443 
4444 void
4445 pmap_dump(struct pmap *pmap, vaddr_t sva, vaddr_t eva)
4446 {
4447 	pt_entry_t *ptes, *pte;
4448 	pd_entry_t * const *pdes;
4449 	struct pmap *pmap2;
4450 	vaddr_t blkendva;
4451 
4452 	/*
4453 	 * if end is out of range, truncate it.
4454 	 * if end <= start, update end to the maximum.
4455 	 */
4456 
4457 	if (eva > VM_MAXUSER_ADDRESS || eva <= sva)
4458 		eva = VM_MAXUSER_ADDRESS;
4459 
4460 	/*
4461 	 * we lock in the pmap => pv_head direction
4462 	 */
4463 
4464 	kpreempt_disable();
4465 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);	/* locks pmap */
4466 
4467 	/*
4468 	 * dumping a range of pages: we dump in PTP-sized blocks
4469 	 */
4470 
4471 	for (/* null */ ; sva < eva ; sva = blkendva) {
4472 
4473 		/* determine range of block */
4474 		blkendva = x86_round_pdr(sva+1);
4475 		if (blkendva > eva)
4476 			blkendva = eva;
4477 
4478 		/* valid block? */
4479 		if (!pmap_pdes_valid(sva, pdes, NULL))
4480 			continue;
4481 
4482 		pte = &ptes[pl1_i(sva)];
4483 		for (/* null */; sva < blkendva ; sva += PAGE_SIZE, pte++) {
4484 			if (!pmap_valid_entry(*pte))
4485 				continue;
4486 			printf("va %#" PRIxVADDR " -> pa %#" PRIxPADDR
4487 			    " (pte=%#" PRIxPADDR ")\n",
4488 			    sva, (paddr_t)pmap_pte2pa(*pte), (paddr_t)*pte);
4489 		}
4490 	}
4491 	pmap_unmap_ptes(pmap, pmap2);
4492 	kpreempt_enable();
4493 }
4494 #endif
4495 
4496 /*
4497  * pmap_tlb_shootdown: invalidate pages on all CPUs using pmap 'pm'
4498  *
4499  * => always invalidates locally before returning
4500  * => returns before remote CPUs have invalidated
4501  * => must be called with preemption disabled
4502  */
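/*
 * Illustrative sketch: the usual pattern in this file is to modify a
 * PTE, queue a shootdown for the old entry and, if the caller needs
 * the remote CPUs to be up to date, wait for completion:
 *
 *	kpreempt_disable();
 *	opte = pmap_pte_testset(ptep, npte);
 *	pmap_tlb_shootdown(pmap, va, 0, opte);
 *	pmap_tlb_shootwait();
 *	kpreempt_enable();
 */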
4503 
4504 void
4505 pmap_tlb_shootdown(struct pmap *pm, vaddr_t sva, vaddr_t eva, pt_entry_t pte)
4506 {
4507 #ifdef MULTIPROCESSOR
4508 	extern bool x86_mp_online;
4509 	struct cpu_info *ci;
4510 	struct pmap_mbox *mb, *selfmb;
4511 	CPU_INFO_ITERATOR cii;
4512 	uintptr_t head;
4513 	u_int count;
4514 	int s;
4515 #endif	/* MULTIPROCESSOR */
4516 	struct cpu_info *self;
4517 	bool kernel;
4518 
4519 	KASSERT(eva == 0 || eva >= sva);
4520 	KASSERT(kpreempt_disabled());
4521 
4522 	if (pte & PG_PS)
4523 		sva &= PG_LGFRAME;
4524 	pte &= PG_G;
4525 	self = curcpu();
4526 
4527 	if (sva == (vaddr_t)-1LL) {
4528 		kernel = true;
4529 	} else {
4530 		if (eva == 0)
4531 			eva = sva + PAGE_SIZE;
4532 		kernel = sva >= VM_MAXUSER_ADDRESS;
4533 		KASSERT(kernel == (eva > VM_MAXUSER_ADDRESS));
4534 	}
4535 
4536 	/*
4537 	 * if tearing down the pmap, do nothing.  we'll flush later
4538 	 * when we're ready to recycle/destroy it.
4539 	 */
4540 	if (__predict_false(curlwp->l_md.md_gc_pmap == pm)) {
4541 		return;
4542 	}
4543 
4544 	/*
4545 	 * If the range is larger than 32 pages, then invalidate
4546 	 * everything.
4547 	 */
4548 	if (sva != (vaddr_t)-1LL && eva - sva > (32 * PAGE_SIZE)) {
4549 		sva = (vaddr_t)-1LL;
4550 		eva = sva;
4551 	}
4552 
4553 #ifdef MULTIPROCESSOR
4554 	if (ncpu > 1 && x86_mp_online) {
4555 		selfmb = &self->ci_pmap_cpu->pc_mbox;
4556 
4557 		/*
4558 		 * If the CPUs have no notion of global pages then
4559 		 * a reload of %cr3 is sufficient.
4560 		 */
4561 		if (pte != 0 && (cpu_feature & CPUID_PGE) == 0)
4562 			pte = 0;
4563 
4564 		if (pm == pmap_kernel()) {
4565 			/*
4566 			 * Mapped on all CPUs: use the broadcast mechanism.
4567 			 * Once we have the lock, increment the counter.
4568 			 */
4569 			s = splvm();
4570 			mb = &pmap_mbox;
4571 			count = SPINLOCK_BACKOFF_MIN;
4572 			do {
4573 				if ((head = mb->mb_head) != mb->mb_tail) {
4574 					splx(s);
4575 					while ((head = mb->mb_head) !=
4576 					    mb->mb_tail)
4577 						SPINLOCK_BACKOFF(count);
4578 					s = splvm();
4579 				}
4580 			} while (atomic_cas_ulong(
4581 			    (volatile u_long *)&mb->mb_head,
4582 			    head, head + ncpu - 1) != head);
4583 
4584 			/*
4585 			 * Once underway we must stay at IPL_VM until the
4586 			 * IPI is dispatched.  Otherwise interrupt handlers
4587 			 * on this CPU can deadlock against us.
4588 			 */
4589 			pmap_tlb_evcnt.ev_count++;
4590 			mb->mb_pointer = self;
4591 			mb->mb_addr1 = sva;
4592 			mb->mb_addr2 = eva;
4593 			mb->mb_global = pte;
4594 			x86_ipi(LAPIC_TLB_BCAST_VECTOR, LAPIC_DEST_ALLEXCL,
4595 			    LAPIC_DLMODE_FIXED);
4596 			self->ci_need_tlbwait = 1;
4597 			splx(s);
4598 		} else if ((pm->pm_cpus & ~self->ci_cpumask) != 0 ||
4599 		    (kernel && (pm->pm_kernel_cpus & ~self->ci_cpumask) != 0)) {
4600 			/*
4601 			 * We don't bother traversing the CPU list if only
4602 			 * used by this CPU.
4603 			 *
4604 			 * We can't do global flushes with the multicast
4605 			 * mechanism.
4606 			 */
4607 			KASSERT(pte == 0);
4608 
4609 			/*
4610 			 * Take ownership of the shootdown mailbox on each
4611 			 * CPU, fill the details and fire it off.
4612 			 */
4613 			s = splvm();
4614 			for (CPU_INFO_FOREACH(cii, ci)) {
4615 				if (ci == self ||
4616 				    !pmap_is_active(pm, ci, kernel) ||
4617 				    !(ci->ci_flags & CPUF_RUNNING))
4618 					continue;
4619 				selfmb->mb_head++;
4620 				mb = &ci->ci_pmap_cpu->pc_mbox;
4621 				count = SPINLOCK_BACKOFF_MIN;
4622 				while (atomic_cas_ulong(
4623 				    (u_long *)&mb->mb_pointer,
4624 				    0, (u_long)&selfmb->mb_tail) != 0) {
4625 				    	splx(s);
4626 					while (mb->mb_pointer != 0)
4627 						SPINLOCK_BACKOFF(count);
4628 					s = splvm();
4629 				}
4630 				mb->mb_addr1 = sva;
4631 				mb->mb_addr2 = eva;
4632 				mb->mb_global = pte;
4633 				if (x86_ipi(LAPIC_TLB_MCAST_VECTOR,
4634 				    ci->ci_cpuid, LAPIC_DLMODE_FIXED))
4635 					panic("pmap_tlb_shootdown: ipi failed");
4636 			}
4637 			self->ci_need_tlbwait = 1;
4638 			splx(s);
4639 		}
4640 	}
4641 #endif	/* MULTIPROCESSOR */
4642 
4643 	/* Update the current CPU before waiting for others. */
4644 	if (!pmap_is_active(pm, self, kernel))
4645 		return;
4646 
4647 	if (sva == (vaddr_t)-1LL) {
4648 		u_int gen = uvm_emap_gen_return();
4649 		if (pte != 0) {
4650 			tlbflushg();
4651 		} else {
4652 			tlbflush();
4653 		}
4654 		uvm_emap_update(gen);
4655 	} else {
4656 		do {
4657 			pmap_update_pg(sva);
4658 			sva += PAGE_SIZE;
4659 		} while (sva < eva);
4660 	}
4661 }
4662 
4663 /*
4664  * pmap_tlb_shootwait: wait for pending TLB shootdowns to complete
4665  *
4666  * => only waits for operations generated by the current CPU
4667  * => must be called with preemption disabled
4668  */
4669 
4670 void
4671 pmap_tlb_shootwait(void)
4672 {
4673 	struct cpu_info *self;
4674 	struct pmap_mbox *mb;
4675 
4676 	KASSERT(kpreempt_disabled());
4677 
4678 	/*
4679 	 * Anything to do?  XXX Really we want to avoid touching the cache
4680 	 * lines of the two mailboxes, but the processor may read ahead.
4681 	 */
4682 	self = curcpu();
4683 	if (!self->ci_need_tlbwait)
4684 		return;
4685 	self->ci_need_tlbwait = 0;
4686 
4687 	/* If we own the global mailbox, wait for it to drain. */
4688 	mb = &pmap_mbox;
4689 	while (mb->mb_pointer == self && mb->mb_head != mb->mb_tail)
4690 		x86_pause();
4691 
4692 	/* If we own other CPUs' mailboxes, wait for them to drain. */
4693 	mb = &self->ci_pmap_cpu->pc_mbox;
4694 	KASSERT(mb->mb_pointer != &mb->mb_tail);
4695 	while (mb->mb_head != mb->mb_tail)
4696 		x86_pause();
4697 }
4698 
4699 /*
4700  * pmap_update: process deferred invalidations
4701  */
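/*
 * Illustrative sketch: callers batch their mapping changes and then
 * call pmap_update() once, as done in pmap_init_tmp_pgtbl() below:
 *
 *	pmap_kenter_pa(va, pa, VM_PROT_READ | VM_PROT_WRITE, 0);
 *	pmap_update(pmap_kernel());
 */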
4702 
4703 void
4704 pmap_update(struct pmap *pmap)
4705 {
4706 	struct vm_page *ptp, *empty_ptps;
4707 	struct pmap_page *pp;
4708 	lwp_t *l;
4709 
4710 	/*
4711 	 * if we have torn down this pmap, invalidate non-global TLB
4712 	 * entries on any processors using it.
4713 	 */
4714 	l = curlwp;
4715 	if (__predict_false(l->l_md.md_gc_pmap == pmap)) {
4716 		l->l_md.md_gc_pmap = NULL;
4717 		KPREEMPT_DISABLE(l);
4718 		pmap_tlb_shootdown(pmap, -1, -1, 0);
4719 		KPREEMPT_ENABLE(l);
4720 	}
4721 
4722 	/*
4723 	 * wait for tlb shootdowns to complete before returning control
4724 	 * to the caller.
4725 	 */
4726 	kpreempt_disable();
4727 	pmap_tlb_shootwait();
4728 	kpreempt_enable();
4729 
4730 	/*
4731 	 * now that shootdowns are complete, process deferred frees,
4732 	 * but not from interrupt context.
4733 	 */
4734 	if (l->l_md.md_gc_ptp != NULL) {
4735 		if (cpu_intr_p() || (l->l_pflag & LP_INTR) != 0) {
4736 			return;
4737 		}
4738 
4739 		empty_ptps = l->l_md.md_gc_ptp;
4740 		l->l_md.md_gc_ptp = NULL;
4741 
4742 		while ((ptp = empty_ptps) != NULL) {
4743 			ptp->flags |= PG_ZERO;
4744 			pp = VM_PAGE_TO_PP(ptp);
4745 			empty_ptps = pp->pp_link;
4746 			LIST_INIT(&pp->pp_head.pvh_list);
4747 			uvm_pagefree(ptp);
4748 		}
4749 	}
4750 }
4751 
4752 #if PTP_LEVELS > 4
4753 #error "Unsupported number of page table mappings"
4754 #endif
4755 
4756 paddr_t
4757 pmap_init_tmp_pgtbl(paddr_t pg)
4758 {
4759 	static bool maps_loaded;
4760 	static const paddr_t x86_tmp_pml_paddr[] = {
4761 	    4 * PAGE_SIZE,
4762 	    5 * PAGE_SIZE,
4763 	    6 * PAGE_SIZE,
4764 	    7 * PAGE_SIZE
4765 	};
4766 	static vaddr_t x86_tmp_pml_vaddr[] = { 0, 0, 0, 0 };
4767 
4768 	pd_entry_t *tmp_pml, *kernel_pml;
4769 
4770 	int level;
4771 
4772 	if (!maps_loaded) {
4773 		for (level = 0; level < PTP_LEVELS; ++level) {
4774 			x86_tmp_pml_vaddr[level] =
4775 			    uvm_km_alloc(kernel_map, PAGE_SIZE, 0,
4776 			    UVM_KMF_VAONLY);
4777 
4778 			if (x86_tmp_pml_vaddr[level] == 0)
4779 				panic("mapping of real mode PML failed\n");
4780 			pmap_kenter_pa(x86_tmp_pml_vaddr[level],
4781 			    x86_tmp_pml_paddr[level],
4782 			    VM_PROT_READ | VM_PROT_WRITE, 0);
4783 			pmap_update(pmap_kernel());
4784 		}
4785 		maps_loaded = true;
4786 	}
4787 
4788 	/* Zero levels 1-3 */
4789 	for (level = 0; level < PTP_LEVELS - 1; ++level) {
4790 		tmp_pml = (void *)x86_tmp_pml_vaddr[level];
4791 		memset(tmp_pml, 0, PAGE_SIZE);
4792 	}
4793 
4794 	/* Copy PML4 */
4795 	kernel_pml = pmap_kernel()->pm_pdir;
4796 	tmp_pml = (void *)x86_tmp_pml_vaddr[PTP_LEVELS - 1];
4797 	memcpy(tmp_pml, kernel_pml, PAGE_SIZE);
4798 
4799 	for (level = PTP_LEVELS - 1; level > 0; --level) {
4800 		tmp_pml = (void *)x86_tmp_pml_vaddr[level];
4801 
4802 		tmp_pml[pl_i(pg, level + 1)] =
4803 		    (x86_tmp_pml_paddr[level - 1] & PG_FRAME) | PG_RW | PG_V;
4804 	}
4805 
4806 	tmp_pml = (void *)x86_tmp_pml_vaddr[0];
4807 	tmp_pml[pl_i(pg, 1)] = (pg & PG_FRAME) | PG_RW | PG_V;
4808 
4809 	return x86_tmp_pml_paddr[PTP_LEVELS - 1];
4810 }
4811