xref: /netbsd-src/sys/arch/x86/x86/pmap.c (revision 7fa608457b817eca6e0977b37f758ae064f3c99c)
1 /*	$NetBSD: pmap.c,v 1.5 2007/11/11 01:30:55 ad Exp $	*/
2 
3 /*
4  *
5  * Copyright (c) 1997 Charles D. Cranor and Washington University.
6  * All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  * 3. All advertising materials mentioning features or use of this software
17  *    must display the following acknowledgement:
18  *      This product includes software developed by Charles D. Cranor and
19  *      Washington University.
20  * 4. The name of the author may not be used to endorse or promote products
21  *    derived from this software without specific prior written permission.
22  *
23  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
24  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
25  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
26  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
27  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
28  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
29  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
30  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
31  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
32  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33  */
34 
35 /*
36  * Copyright 2001 (c) Wasabi Systems, Inc.
37  * All rights reserved.
38  *
39  * Written by Frank van der Linden for Wasabi Systems, Inc.
40  *
41  * Redistribution and use in source and binary forms, with or without
42  * modification, are permitted provided that the following conditions
43  * are met:
44  * 1. Redistributions of source code must retain the above copyright
45  *    notice, this list of conditions and the following disclaimer.
46  * 2. Redistributions in binary form must reproduce the above copyright
47  *    notice, this list of conditions and the following disclaimer in the
48  *    documentation and/or other materials provided with the distribution.
49  * 3. All advertising materials mentioning features or use of this software
50  *    must display the following acknowledgement:
51  *      This product includes software developed for the NetBSD Project by
52  *      Wasabi Systems, Inc.
53  * 4. The name of Wasabi Systems, Inc. may not be used to endorse
54  *    or promote products derived from this software without specific prior
55  *    written permission.
56  *
57  * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
58  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
59  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
60  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL WASABI SYSTEMS, INC
61  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
62  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
63  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
64  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
65  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
66  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
67  * POSSIBILITY OF SUCH DAMAGE.
68  */
69 
70 /*
71  * This is the i386 pmap modified and generalized to support x86-64
72  * as well. The idea is to hide the upper N levels of the page tables
73  * inside pmap_get_ptp, pmap_free_ptp and pmap_growkernel. The rest
74  * is mostly untouched, except that it uses some more generalized
75  * macros and interfaces.
76  *
77  * This pmap has been tested on the i386 as well, and it can be easily
78  * adapted to PAE.
79  *
80  * fvdl@wasabisystems.com 18-Jun-2001
81  */
82 
83 /*
84  * pmap.c: i386 pmap module rewrite
85  * Chuck Cranor <chuck@ccrc.wustl.edu>
86  * 11-Aug-97
87  *
88  * history of this pmap module: in addition to my own input, i used
89  *    the following references for this rewrite of the i386 pmap:
90  *
91  * [1] the NetBSD i386 pmap.   this pmap appears to be based on the
92  *     BSD hp300 pmap done by Mike Hibler at University of Utah.
93  *     it was then ported to the i386 by William Jolitz of UUNET
94  *     Technologies, Inc.   Then Charles M. Hannum of the NetBSD
95  *     project fixed some bugs and provided some speed ups.
96  *
97  * [2] the FreeBSD i386 pmap.   this pmap seems to be the
98  *     Hibler/Jolitz pmap, as modified for FreeBSD by John S. Dyson
99  *     and David Greenman.
100  *
101  * [3] the Mach pmap.   this pmap, from CMU, seems to have migrated
102  *     between several processors.   the VAX version was done by
103  *     Avadis Tevanian, Jr., and Michael Wayne Young.    the i386
104  *     version was done by Lance Berc, Mike Kupfer, Bob Baron,
105  *     David Golub, and Richard Draves.    the alpha version was
106  *     done by Alessandro Forin (CMU/Mach) and Chris Demetriou
107  *     (NetBSD/alpha).
108  */
109 
110 #include <sys/cdefs.h>
111 __KERNEL_RCSID(0, "$NetBSD: pmap.c,v 1.5 2007/11/11 01:30:55 ad Exp $");
112 
113 #ifndef __x86_64__
114 #include "opt_cputype.h"
115 #endif
116 #include "opt_user_ldt.h"
117 #include "opt_lockdebug.h"
118 #include "opt_multiprocessor.h"
119 #if !defined(__x86_64__)
120 #include "opt_kstack_dr0.h"
121 #endif /* !defined(__x86_64__) */
122 
123 #include <sys/param.h>
124 #include <sys/systm.h>
125 #include <sys/proc.h>
126 #include <sys/malloc.h>
127 #include <sys/pool.h>
128 #include <sys/user.h>
129 #include <sys/kernel.h>
130 
131 #include <uvm/uvm.h>
132 
133 #include <dev/isa/isareg.h>
134 
135 #include <machine/atomic.h>
136 #include <machine/cpu.h>
137 #include <machine/specialreg.h>
138 #include <machine/gdt.h>
139 #include <machine/intr.h>
140 #include <machine/isa_machdep.h>
141 #include <machine/cpuvar.h>
142 
143 #include <x86/i82489reg.h>
144 #include <x86/i82489var.h>
145 
146 /* XXX */
147 void		atomic_inc_uint(volatile unsigned int *);
148 unsigned int	atomic_dec_uint_nv(volatile unsigned int *);
149 
150 /*
151  * general info:
152  *
153  *  - for an explanation of how the i386 MMU hardware works see
154  *    the comments in <machine/pte.h>.
155  *
156  *  - for an explanation of the general memory structure used by
157  *    this pmap (including the recursive mapping), see the comments
158  *    in <machine/pmap.h>.
159  *
160  * this file contains the code for the "pmap module."   the module's
161  * job is to manage the hardware's virtual to physical address mappings.
162  * note that there are two levels of mapping in the VM system:
163  *
164  *  [1] the upper layer of the VM system uses vm_map's and vm_map_entry's
165  *      to map ranges of virtual address space to objects/files.  for
166  *      example, the vm_map may say: "map VA 0x1000 to 0x22000 read-only
167  *      to the file /bin/ls starting at offset zero."   note that
168  *      the upper layer mapping is not concerned with how individual
169  *      vm_pages are mapped.
170  *
171  *  [2] the lower layer of the VM system (the pmap) maintains the mappings
172  *      from virtual addresses.   it is concerned with which vm_page is
173  *      mapped where.   for example, when you run /bin/ls and start
174  *      at page 0x1000 the fault routine may lookup the correct page
175  *      of the /bin/ls file and then ask the pmap layer to establish
176  *      a mapping for it.
177  *
178  * note that information in the lower layer of the VM system can be
179  * thrown away since it can easily be reconstructed from the info
180  * in the upper layer.
181  *
182  * data structures we use include:
183  *
184  *  - struct pmap: describes the address space of one thread
185  *  - struct pv_entry: describes one <PMAP,VA> mapping of a PA
186  *  - struct pv_head: there is one pv_head per managed page of
187  *	physical memory.   the pv_head points to a list of pv_entry
188  *	structures which describe all the <PMAP,VA> pairs that this
189  *      page is mapped in.    this is critical for page based operations
190  *      such as pmap_page_protect() [change protection on _all_ mappings
191  *      of a page]
192  *  - pv_page/pv_page_info: pv_entry's are allocated out of pv_page's.
193  *      if we run out of pv_entry's we allocate a new pv_page and free
194  *      its pv_entrys.
195  */
196 
197 /*
198  * memory allocation
199  *
200  *  - there are three data structures that we must dynamically allocate:
201  *
202  * [A] new process' page directory page (PDP)
203  *	- plan 1: done at pmap_create() we use
204  *	  uvm_km_alloc(kernel_map, PAGE_SIZE)  [fka kmem_alloc] to do this
205  *	  allocation.
206  *
207  * if we are low in free physical memory then we sleep in
208  * uvm_km_alloc -- in this case this is ok since we are creating
209  * a new pmap and should not be holding any locks.
210  *
211  * if the kernel is totally out of virtual space
212  * (i.e. uvm_km_alloc returns NULL), then we panic.
213  *
214  * XXX: the fork code currently has no way to return an "out of
215  * memory, try again" error code since uvm_fork [fka vm_fork]
216  * is a void function.
217  *
218  * [B] new page tables pages (PTP)
219  * 	- call uvm_pagealloc()
220  * 		=> success: zero page, add to pm_pdir
221  * 		=> failure: we are out of free vm_pages, let pmap_enter()
222  *		   tell UVM about it.
223  *
224  * note: for kernel PTPs, we start with NKPTP of them.   as we map
225  * kernel memory (at uvm_map time) we check to see if we've grown
226  * the kernel pmap.   if so, we call the optional function
227  * pmap_growkernel() to grow the kernel PTPs in advance.
228  *
229  * [C] pv_entry structures
230  *	- plan 1: try to allocate one off the free list
231  *		=> success: done!
232  *		=> failure: no more free pv_entrys on the list
233  *	- plan 2: try to allocate a new pv_page to add a chunk of
234  *	pv_entrys to the free list
235  *		[a] obtain a free, unmapped, VA in kmem_map.  either
236  *		we have one saved from a previous call, or we allocate
237  *		one now using a "vm_map_lock_try" in uvm_map
238  *		=> success: we have an unmapped VA, continue to [b]
239  *		=> failure: unable to lock kmem_map or out of VA in it.
240  *			move on to plan 3.
241  *		[b] allocate a page for the VA
242  *		=> success: map it in, free the pv_entry's, DONE!
243  *		=> failure: no free vm_pages, etc.
244  *			save VA for later call to [a], go to plan 3.
245  *	If we fail, we simply let pmap_enter() tell UVM about it.
246  */
247 
248 /*
249  * locking
250  *
251  * we have the following locks that we must contend with:
252  *
253  * RW locks:
254  *
255  *  - pmap_main_lock
256  *    this lock is used to prevent deadlock and/or provide mutex
257  *    access to the pmap system.   most operations lock the pmap
258  *    structure first, then they lock the pv_lists (if needed).
259  *    however, some operations such as pmap_page_protect lock
260  *    the pv_lists and then lock pmaps.   in order to prevent a
261  *    cycle, we require a mutex lock when locking the pv_lists
262  *    first.   thus, the "pmap = >pv_list" lockers must gain a
263  *    read-lock on pmap_main_lock before locking the pmap.   and
264  *    the "pv_list => pmap" lockers must gain a write-lock on
265  *    pmap_main_lock before locking.    since only one thread
266  *    can write-lock a lock at a time, this provides mutex.
267  *
268  * mutexes:
269  *
270  * - pmap lock (per pmap, part of uvm_object)
271  *   this lock protects the fields in the pmap structure including
272  *   the non-kernel PDEs in the PDP, and the PTEs.  it also locks
273  *   in the alternate PTE space (since that is determined by the
274  *   entry in the PDP).
275  *
276  * - pvh_lock (per pv_head)
277  *   this lock protects the pv_entry list which is chained off the
278  *   pv_head structure for a specific managed PA.   it is locked
279  *   when traversing the list (e.g. adding/removing mappings,
280  *   syncing R/M bits, etc.)
281  *
282  * - pmaps_lock
283  *   this lock protects the list of active pmaps (headed by "pmaps").
284  *   we lock it when adding or removing pmaps from this list.
285  *
286  * tlb shootdown
287  *
288  * tlb shootdowns are hard interrupts that operate outside the spl
289  * framework: they don't need to be blocked provided that the pmap module
290  * gets the order of events correct.  the calls are made by talking directly
291  * to the lapic.  the stubs to handle the interrupts are quite short and do
292  * one of the following: invalidate a single page, a range of pages, all
293  * user tlb entries or the entire tlb.
294  *
295  * the cpus synchronize with each other using pmap_mbox structures which are
296  * aligned on 64-byte cache lines.  tlb shootdowns against the kernel pmap
297  * use a global mailbox and are generated using a broadcast ipi (broadcast
298  * to all but the sending cpu).  shootdowns against regular pmaps use
299  * per-cpu mailboxes and are multicast.  kernel and user shootdowns can
300  * execute simultaneously, as can shootdowns within different multithreaded
301  * processes.  TODO:
302  *
303  *   1. figure out which waitpoints can be deferered to pmap_update().
304  *   2. see if there is a cheap way to batch some updates.
305  */
306 
307 vaddr_t ptp_masks[] = PTP_MASK_INITIALIZER;
308 int ptp_shifts[] = PTP_SHIFT_INITIALIZER;
309 long nkptp[] = NKPTP_INITIALIZER;
310 long nkptpmax[] = NKPTPMAX_INITIALIZER;
311 long nbpd[] = NBPD_INITIALIZER;
312 pd_entry_t *normal_pdes[] = PDES_INITIALIZER;
313 pd_entry_t *alternate_pdes[] = APDES_INITIALIZER;
314 
315 /*
316  * locking data structures.  to enable the locks, changes from the
317  * 'vmlocking' cvs branch are required.  for now, just stub them out.
318  */
319 
320 #define rw_enter(a, b)		/* nothing */
321 #define	rw_exit(a)		/* nothing */
322 #define	mutex_enter(a)		simple_lock(a)
323 #define	mutex_exit(a)		simple_unlock(a)
324 #define	mutex_init(a, b, c)	simple_lock_init(a)
325 #define	mutex_owned(a)		(1)
326 #define	mutex_destroy(a)	/* nothing */
327 #define kmutex_t		struct simplelock
328 
329 static kmutex_t pmaps_lock;
330 static krwlock_t pmap_main_lock;
331 
332 static vaddr_t pmap_maxkvaddr;
333 
334 #define COUNT(x)	/* nothing */
335 
336 TAILQ_HEAD(pv_pagelist, pv_page);
337 typedef struct pv_pagelist pv_pagelist_t;
338 
339 /*
340  * Global TLB shootdown mailbox.
341  */
342 struct evcnt pmap_tlb_evcnt __aligned(64);
343 struct pmap_mbox pmap_mbox __aligned(64);
344 
345 /*
346  * Per-CPU data.  The pmap mailbox is cache intensive so gets its
347  * own line.  Note that the mailbox must be the first item.
348  */
349 struct pmap_cpu {
350 	/* TLB shootdown */
351 	struct pmap_mbox pc_mbox;
352 };
353 
354 union {
355 	struct pmap_cpu pc;
356 	uint8_t padding[128];
357 } pmap_cpu[X86_MAXPROCS] __aligned(64);
358 
359 /*
360  * global data structures
361  */
362 
363 struct pmap kernel_pmap_store;	/* the kernel's pmap (proc0) */
364 
365 /*
366  * pmap_pg_g: if our processor supports PG_G in the PTE then we
367  * set pmap_pg_g to PG_G (otherwise it is zero).
368  */
369 
370 int pmap_pg_g = 0;
371 
372 /*
373  * pmap_largepages: if our processor supports PG_PS and we are
374  * using it, this is set to true.
375  */
376 
377 int pmap_largepages;
378 
379 /*
380  * i386 physical memory comes in a big contig chunk with a small
381  * hole toward the front of it...  the following two paddr_t's
382  * (shared with machdep.c) describe the physical address space
383  * of this machine.
384  */
385 paddr_t avail_start;	/* PA of first available physical page */
386 paddr_t avail_end;	/* PA of last available physical page */
387 
388 /*
389  * other data structures
390  */
391 
392 static pt_entry_t protection_codes[8];	/* maps MI prot to i386 prot code */
393 static bool pmap_initialized = false;	/* pmap_init done yet? */
394 
395 /*
396  * the following two vaddr_t's are used during system startup
397  * to keep track of how much of the kernel's VM space we have used.
398  * once the system is started, the management of the remaining kernel
399  * VM space is turned over to the kernel_map vm_map.
400  */
401 
402 static vaddr_t virtual_avail;	/* VA of first free KVA */
403 static vaddr_t virtual_end;	/* VA of last free KVA */
404 
405 /*
406  * pv_page management structures
407  */
408 
409 #define PVE_LOWAT (PVE_PER_PVPAGE / 2)	/* free pv_entry low water mark */
410 #define PVE_HIWAT (PVE_LOWAT + (PVE_PER_PVPAGE * 2))
411 					/* high water mark */
412 
413 static inline int
414 pv_compare(struct pv_entry *a, struct pv_entry *b)
415 {
416 
417 	if (a->pv_pmap < b->pv_pmap)
418 		return (-1);
419 	else if (a->pv_pmap > b->pv_pmap)
420 		return (1);
421 	else if (a->pv_va < b->pv_va)
422 		return (-1);
423 	else if (a->pv_va > b->pv_va)
424 		return (1);
425 	else
426 		return (0);
427 }
428 
429 SPLAY_PROTOTYPE(pvtree, pv_entry, pv_node, pv_compare);
430 SPLAY_GENERATE(pvtree, pv_entry, pv_node, pv_compare);
431 
432 /*
433  * linked list of all non-kernel pmaps
434  */
435 
436 static struct pmap_head pmaps;
437 
438 /*
439  * pool that pmap structures are allocated from
440  */
441 
442 static struct pool_cache pmap_cache;
443 
444 /*
445  * pv_entry cache
446  */
447 
448 struct pool_cache pmap_pv_cache;
449 
450 /*
451  * MULTIPROCESSOR: special VA's/ PTE's are actually allocated inside a
452  * X86_MAXPROCS*NPTECL array of PTE's, to avoid cache line thrashing
453  * due to false sharing.
454  */
455 
456 #ifdef MULTIPROCESSOR
457 #define PTESLEW(pte, id) ((pte)+(id)*NPTECL)
458 #define VASLEW(va,id) ((va)+(id)*NPTECL*PAGE_SIZE)
459 #else
460 #define PTESLEW(pte, id) (pte)
461 #define VASLEW(va,id) (va)
462 #endif
463 
464 /*
465  * special VAs and the PTEs that map them
466  */
467 static pt_entry_t *csrc_pte, *cdst_pte, *zero_pte, *ptp_pte, *early_zero_pte;
468 static char *csrcp, *cdstp, *zerop, *ptpp, *early_zerop;
469 
470 /*
471  * pool and cache that PDPs are allocated from
472  */
473 
474 static struct pool_cache pmap_pdp_cache;
475 
476 int	pmap_pdp_ctor(void *, void *, int);
477 
478 void *vmmap; /* XXX: used by mem.c... it should really uvm_map_reserve it */
479 
480 extern vaddr_t idt_vaddr;			/* we allocate IDT early */
481 extern paddr_t idt_paddr;
482 
483 #ifdef _LP64
484 extern vaddr_t lo32_vaddr;
485 extern vaddr_t lo32_paddr;
486 #endif
487 
488 extern int end;
489 
490 #if defined(I586_CPU)
491 /* stuff to fix the pentium f00f bug */
492 extern vaddr_t pentium_idt_vaddr;
493 #endif
494 
495 
496 /*
497  * local prototypes
498  */
499 
500 static struct vm_page	*pmap_get_ptp(struct pmap *, vaddr_t, pd_entry_t **);
501 static struct vm_page	*pmap_find_ptp(struct pmap *, vaddr_t, paddr_t, int);
502 static void		 pmap_freepage(struct pmap *, struct vm_page *, int,
503 				       struct vm_page **);
504 static void		 pmap_free_ptp(struct pmap *, struct vm_page *,
505 				       vaddr_t, pt_entry_t *, pd_entry_t **,
506 				       struct vm_page **);
507 static bool		 pmap_is_curpmap(struct pmap *);
508 static bool		 pmap_is_active(struct pmap *, struct cpu_info *, bool);
509 static void		 pmap_map_ptes(struct pmap *, struct pmap **,
510 				       pt_entry_t **, pd_entry_t ***);
511 static struct pv_entry	*pmap_remove_pv(struct pv_head *, struct pmap *,
512 					vaddr_t);
513 static void		 pmap_do_remove(struct pmap *, vaddr_t, vaddr_t, int);
514 static bool		 pmap_remove_pte(struct pmap *, struct vm_page *,
515 					 pt_entry_t *, vaddr_t, int,
516 					 struct pv_entry **);
517 static pt_entry_t	 pmap_remove_ptes(struct pmap *, struct vm_page *,
518 					  vaddr_t, vaddr_t, vaddr_t, int,
519 					  struct pv_entry **);
520 #define PMAP_REMOVE_ALL		0	/* remove all mappings */
521 #define PMAP_REMOVE_SKIPWIRED	1	/* skip wired mappings */
522 
523 static void		 pmap_unmap_ptes(struct pmap *, struct pmap *);
524 static bool		 pmap_get_physpage(vaddr_t, int, paddr_t *);
525 static int		 pmap_pdes_invalid(vaddr_t, pd_entry_t **,
526 					   pd_entry_t *);
527 #define	pmap_pdes_valid(va, pdes, lastpde)	\
528 	(pmap_pdes_invalid((va), (pdes), (lastpde)) == 0)
529 static void		 pmap_alloc_level(pd_entry_t **, vaddr_t, int, long *);
530 
531 static bool		 pmap_reactivate(struct pmap *);
532 
533 /*
534  * p m a p   h e l p e r   f u n c t i o n s
535  */
536 
537 /*
538  * pmap_is_curpmap: is this pmap the one currently loaded [in %cr3]?
539  *		of course the kernel is always loaded
540  */
541 
542 inline static bool
543 pmap_is_curpmap(struct pmap *pmap)
544 {
545 
546 	return((pmap == pmap_kernel()) ||
547 	       (pmap == curcpu()->ci_pmap));
548 }
549 
550 /*
551  * pmap_is_active: is this pmap loaded into the specified processor's %cr3?
552  */
553 
554 inline static bool
555 pmap_is_active(struct pmap *pmap, struct cpu_info *ci, bool kernel)
556 {
557 
558 	return (pmap == pmap_kernel() ||
559 	    (pmap->pm_cpus & ci->ci_cpumask) != 0 ||
560 	    (kernel && (pmap->pm_kernel_cpus & ci->ci_cpumask) != 0));
561 }
562 
563 static void
564 pmap_apte_flush(struct pmap *pmap)
565 {
566 
567 	/*
568 	 * Flush the APTE mapping from all other CPUs that
569 	 * are using the pmap we are using (who's APTE space
570 	 * is the one we've just modified).
571 	 *
572 	 * XXXthorpej -- find a way to defer the IPI.
573 	 */
574 	pmap_tlb_shootdown(pmap, (vaddr_t)-1LL, 0, 0);
575 	pmap_tlb_shootwait();
576 }
577 
578 /*
579  * pmap_map_ptes: map a pmap's PTEs into KVM and lock them in
580  *
581  * => we lock enough pmaps to keep things locked in
582  * => must be undone with pmap_unmap_ptes before returning
583  */
584 
585 static void
586 pmap_map_ptes(struct pmap *pmap, struct pmap **pmap2,
587     pd_entry_t **ptepp, pd_entry_t ***pdeppp)
588 {
589 	pd_entry_t opde;
590 	struct pmap *ourpmap;
591 	struct cpu_info *ci;
592 	struct lwp *l;
593 	bool iscurrent;
594 	uint64_t ncsw;
595 
596 	/* the kernel's pmap is always accessible */
597 	if (pmap == pmap_kernel()) {
598 		*pmap2 = NULL;
599 		*ptepp = PTE_BASE;
600 		*pdeppp = normal_pdes;
601 		return;
602 	}
603 
604  retry:
605 	crit_enter();
606 	l = curlwp;
607 	ncsw = l->l_ncsw;
608  	ourpmap = NULL;
609 	ci = curcpu();
610 	if (ci->ci_want_pmapload &&
611 	    vm_map_pmap(&l->l_proc->p_vmspace->vm_map) == pmap) {
612 		pmap_load();
613 		if (l->l_ncsw != ncsw) {
614 			crit_exit();
615 			goto retry;
616 		}
617 	}
618 	iscurrent = pmap_is_curpmap(pmap);
619 
620 	/* if curpmap then we are always mapped */
621 	if (iscurrent) {
622 		mutex_enter(&pmap->pm_lock);
623 		*pmap2 = NULL;
624 		*ptepp = PTE_BASE;
625 		*pdeppp = normal_pdes;
626 		goto out;
627 	}
628 
629 	ourpmap = ci->ci_pmap;
630 
631 	/* need to lock both curpmap and pmap: use ordered locking */
632 	if ((uintptr_t) pmap < (uintptr_t) ourpmap) {
633 		mutex_enter(&pmap->pm_lock);
634 		mutex_enter(&ourpmap->pm_lock);
635 	} else {
636 		mutex_enter(&ourpmap->pm_lock);
637 		mutex_enter(&pmap->pm_lock);
638 	}
639 
640 	if (l->l_ncsw != ncsw)
641 		goto unlock_and_retry;
642 
643 	/* need to load a new alternate pt space into curpmap? */
644 	COUNT(apdp_pde_map);
645 	opde = *APDP_PDE;
646 	if (!pmap_valid_entry(opde) || (opde & PG_FRAME) != pmap->pm_pdirpa) {
647 		*APDP_PDE = (pd_entry_t) (pmap->pm_pdirpa | PG_RW | PG_V);
648 		if (pmap_valid_entry(opde))
649 			pmap_apte_flush(ourpmap);
650 	}
651 
652 	*pmap2 = ourpmap;
653 	*ptepp = APTE_BASE;
654 	*pdeppp = alternate_pdes;
655 	KASSERT(l->l_ncsw == ncsw);
656  out:
657  	/*
658  	 * might have blocked, need to retry?
659  	 */
660 	if (l->l_ncsw != ncsw) {
661  unlock_and_retry:
662 		crit_exit();
663 	    	if (ourpmap != NULL)
664 			mutex_exit(&ourpmap->pm_lock);
665 		mutex_exit(&pmap->pm_lock);
666 		goto retry;
667 	}
668 
669 	return;
670 }
671 
672 /*
673  * pmap_unmap_ptes: unlock the PTE mapping of "pmap"
674  */
675 
676 static void
677 pmap_unmap_ptes(struct pmap *pmap, struct pmap *pmap2)
678 {
679 
680 	if (pmap == pmap_kernel()) {
681 		return;
682 	}
683 	if (pmap2 == NULL) {
684 		mutex_exit(&pmap->pm_lock);
685 	} else {
686 		KASSERT(curcpu()->ci_pmap == pmap2);
687 #if defined(MULTIPROCESSOR)
688 		*APDP_PDE = 0;
689 		pmap_apte_flush(pmap2);
690 #endif
691 		COUNT(apdp_pde_unmap);
692 		mutex_exit(&pmap->pm_lock);
693 		mutex_exit(&pmap2->pm_lock);
694 	}
695 
696 	/* re-enable preemption */
697 	crit_exit();
698 }
699 
700 inline static void
701 pmap_exec_account(struct pmap *pm, vaddr_t va, pt_entry_t opte, pt_entry_t npte)
702 {
703 
704 #if !defined(__x86_64__)
705 	if (curproc == NULL || curproc->p_vmspace == NULL ||
706 	    pm != vm_map_pmap(&curproc->p_vmspace->vm_map))
707 		return;
708 
709 	if ((opte ^ npte) & PG_X)
710 		pmap_update_pg(va);
711 
712 	/*
713 	 * Executability was removed on the last executable change.
714 	 * Reset the code segment to something conservative and
715 	 * let the trap handler deal with setting the right limit.
716 	 * We can't do that because of locking constraints on the vm map.
717 	 */
718 
719 	if ((opte & PG_X) && (npte & PG_X) == 0 && va == pm->pm_hiexec) {
720 		struct trapframe *tf = curlwp->l_md.md_regs;
721 		struct pcb *pcb = &curlwp->l_addr->u_pcb;
722 
723 		pcb->pcb_cs = tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL);
724 		pm->pm_hiexec = I386_MAX_EXE_ADDR;
725 	}
726 #endif /* !defined(__x86_64__) */
727 }
728 
729 #if !defined(__x86_64__)
730 /*
731  * Fixup the code segment to cover all potential executable mappings.
732  * returns 0 if no changes to the code segment were made.
733  */
734 
735 int
736 pmap_exec_fixup(struct vm_map *map, struct trapframe *tf, struct pcb *pcb)
737 {
738 	struct vm_map_entry *ent;
739 	struct pmap *pm = vm_map_pmap(map);
740 	vaddr_t va = 0;
741 
742 	vm_map_lock_read(map);
743 	for (ent = (&map->header)->next; ent != &map->header; ent = ent->next) {
744 
745 		/*
746 		 * This entry has greater va than the entries before.
747 		 * We need to make it point to the last page, not past it.
748 		 */
749 
750 		if (ent->protection & VM_PROT_EXECUTE)
751 			va = trunc_page(ent->end) - PAGE_SIZE;
752 	}
753 	vm_map_unlock_read(map);
754 	if (va == pm->pm_hiexec && tf->tf_cs == GSEL(GUCODEBIG_SEL, SEL_UPL))
755 		return (0);
756 
757 	pm->pm_hiexec = va;
758 	if (pm->pm_hiexec > I386_MAX_EXE_ADDR) {
759 		pcb->pcb_cs = tf->tf_cs = GSEL(GUCODEBIG_SEL, SEL_UPL);
760 	} else {
761 		pcb->pcb_cs = tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL);
762 		return (0);
763 	}
764 	return (1);
765 }
766 #endif /* !defined(__x86_64__) */
767 
768 /*
769  * p m a p   k e n t e r   f u n c t i o n s
770  *
771  * functions to quickly enter/remove pages from the kernel address
772  * space.   pmap_kremove is exported to MI kernel.  we make use of
773  * the recursive PTE mappings.
774  */
775 
776 /*
777  * pmap_kenter_pa: enter a kernel mapping without R/M (pv_entry) tracking
778  *
779  * => no need to lock anything, assume va is already allocated
780  * => should be faster than normal pmap enter function
781  */
782 
783 void
784 pmap_kenter_pa(vaddr_t va, paddr_t pa, vm_prot_t prot)
785 {
786 	pt_entry_t *pte, opte, npte;
787 
788 	if (va < VM_MIN_KERNEL_ADDRESS)
789 		pte = vtopte(va);
790 	else
791 		pte = kvtopte(va);
792 
793 	npte = pa | protection_codes[prot] | PG_V | pmap_pg_g;
794 	opte = pmap_pte_set(pte, npte); /* zap! */
795 #if defined(DIAGNOSTIC)
796 	/* XXX For now... */
797 	if (opte & PG_PS)
798 		panic("pmap_kenter_pa: PG_PS");
799 #endif
800 	if ((opte & (PG_V | PG_U)) == (PG_V | PG_U)) {
801 		/* This should not happen, so no need to batch updates. */
802 		crit_enter();
803 		pmap_tlb_shootdown(pmap_kernel(), va, 0, opte);
804 		crit_exit();
805 	}
806 }
807 
808 #if defined(__x86_64__)
809 /*
810  * Change protection for a virtual address. Local for a CPU only, don't
811  * care about TLB shootdowns.
812  */
813 void
814 pmap_changeprot_local(vaddr_t va, vm_prot_t prot)
815 {
816 	pt_entry_t *pte, opte;
817 
818 	if (va < VM_MIN_KERNEL_ADDRESS)
819 		pte = vtopte(va);
820 	else
821 		pte = kvtopte(va);
822 
823 	opte = *pte;
824 
825 	if ((prot & VM_PROT_WRITE) != 0)
826 		*pte |= PG_RW;
827 	else
828 		*pte &= ~PG_RW;
829 
830 	if (opte != *pte)
831 		invlpg(va);
832 }
833 #endif /* defined(__x86_64__) */
834 
835 /*
836  * pmap_kremove: remove a kernel mapping(s) without R/M (pv_entry) tracking
837  *
838  * => no need to lock anything
839  * => caller must dispose of any vm_page mapped in the va range
840  * => note: not an inline function
841  * => we assume the va is page aligned and the len is a multiple of PAGE_SIZE
842  * => we assume kernel only unmaps valid addresses and thus don't bother
843  *    checking the valid bit before doing TLB flushing
844  * => must be followed by call to pmap_update() before reuse of page
845  */
846 
847 void
848 pmap_kremove(vaddr_t sva, vsize_t len)
849 {
850 	pt_entry_t *pte, xpte;
851 	vaddr_t va, eva;
852 
853 	eva = sva + len;
854 	xpte = 0;
855 
856 	for (va = sva; va < eva; va += PAGE_SIZE) {
857 		if (va < VM_MIN_KERNEL_ADDRESS)
858 			pte = vtopte(va);
859 		else
860 			pte = kvtopte(va);
861 		xpte |= pmap_pte_set(pte, 0); /* zap! */
862 #if defined(DIAGNOSTIC)
863 		/* XXX For now... */
864 		if (xpte & PG_PS)
865 			panic("pmap_kremove: PG_PS");
866 		if (xpte & PG_PVLIST)
867 			panic("pmap_kremove: PG_PVLIST mapping for 0x%lx",
868 			      va);
869 #endif
870 	}
871 	if ((xpte & (PG_V | PG_U)) == (PG_V | PG_U)) {
872 		crit_enter();
873 		pmap_tlb_shootdown(pmap_kernel(), sva, eva, xpte);
874 		crit_exit();
875 	}
876 }
877 
878 /*
879  * p m a p   i n i t   f u n c t i o n s
880  *
881  * pmap_bootstrap and pmap_init are called during system startup
882  * to init the pmap module.   pmap_bootstrap() does a low level
883  * init just to get things rolling.   pmap_init() finishes the job.
884  */
885 
886 /*
887  * pmap_bootstrap: get the system in a state where it can run with VM
888  *	properly enabled (called before main()).   the VM system is
889  *      fully init'd later...
890  *
891  * => on i386, locore.s has already enabled the MMU by allocating
892  *	a PDP for the kernel, and nkpde PTP's for the kernel.
893  * => kva_start is the first free virtual address in kernel space
894  */
895 
896 void
897 pmap_bootstrap(vaddr_t kva_start)
898 {
899 	vaddr_t kva;
900 	vaddr_t kva_end;
901 	struct pmap *kpm;
902 	pt_entry_t *pte;
903 	int i;
904 	unsigned long p1i;
905 	pt_entry_t pg_nx = (cpu_feature & CPUID_NOX ? PG_NX : 0);
906 
907 	/*
908 	 * set up our local static global vars that keep track of the
909 	 * usage of KVM before kernel_map is set up
910 	 */
911 
912 	virtual_avail = kva_start;		/* first free KVA */
913 	virtual_end = VM_MAX_KERNEL_ADDRESS;	/* last KVA */
914 
915 	/*
916 	 * set up protection_codes: we need to be able to convert from
917 	 * a MI protection code (some combo of VM_PROT...) to something
918 	 * we can jam into a i386 PTE.
919 	 */
920 
921 	protection_codes[VM_PROT_NONE] = pg_nx;			/* --- */
922 	protection_codes[VM_PROT_EXECUTE] = PG_RO | PG_X;	/* --x */
923 	protection_codes[VM_PROT_READ] = PG_RO | pg_nx;		/* -r- */
924 	protection_codes[VM_PROT_READ|VM_PROT_EXECUTE] = PG_RO | PG_X;/* -rx */
925 	protection_codes[VM_PROT_WRITE] = PG_RW | pg_nx;	/* w-- */
926 	protection_codes[VM_PROT_WRITE|VM_PROT_EXECUTE] = PG_RW | PG_X;/* w-x */
927 	protection_codes[VM_PROT_WRITE|VM_PROT_READ] = PG_RW | pg_nx;
928 								/* wr- */
929 	protection_codes[VM_PROT_ALL] = PG_RW | PG_X;		/* wrx */
930 
931 	/*
932 	 * now we init the kernel's pmap
933 	 *
934 	 * the kernel pmap's pm_obj is not used for much.   however, in
935 	 * user pmaps the pm_obj contains the list of active PTPs.
936 	 * the pm_obj currently does not have a pager.   it might be possible
937 	 * to add a pager that would allow a process to read-only mmap its
938 	 * own page tables (fast user level vtophys?).   this may or may not
939 	 * be useful.
940 	 */
941 
942 	kpm = pmap_kernel();
943 	for (i = 0; i < PTP_LEVELS - 1; i++) {
944 		UVM_OBJ_INIT(&kpm->pm_obj[i], NULL, 1);
945 		kpm->pm_ptphint[i] = NULL;
946 	}
947 	memset(&kpm->pm_list, 0, sizeof(kpm->pm_list));  /* pm_list not used */
948 	kpm->pm_pdir = (pd_entry_t *)(lwp0.l_addr->u_pcb.pcb_cr3 + KERNBASE);
949 	kpm->pm_pdirpa = (paddr_t) lwp0.l_addr->u_pcb.pcb_cr3;
950 	kpm->pm_stats.wired_count = kpm->pm_stats.resident_count =
951 		x86_btop(kva_start - VM_MIN_KERNEL_ADDRESS);
952 
953 	/*
954 	 * the above is just a rough estimate and not critical to the proper
955 	 * operation of the system.
956 	 */
957 
958 	/*
959 	 * Begin to enable global TLB entries if they are supported.
960 	 * The G bit has no effect until the CR4_PGE bit is set in CR4,
961 	 * which happens in cpu_init(), which is run on each cpu
962 	 * (and happens later)
963 	 */
964 
965 	if (cpu_feature & CPUID_PGE) {
966 		pmap_pg_g = PG_G;		/* enable software */
967 
968 		/* add PG_G attribute to already mapped kernel pages */
969 		if (KERNBASE == VM_MIN_KERNEL_ADDRESS) {
970 			kva_end = virtual_avail;
971 		} else {
972 			kva_end = roundup((vaddr_t)&end, PAGE_SIZE);
973 		}
974 		for (kva = KERNBASE; kva < kva_end; kva += PAGE_SIZE) {
975 			p1i = pl1_i(kva);
976 			if (pmap_valid_entry(PTE_BASE[p1i]))
977 				PTE_BASE[p1i] |= PG_G;
978 		}
979 	}
980 
981 	/*
982 	 * enable large pages if they are supported.
983 	 */
984 
985 	if (cpu_feature & CPUID_PSE) {
986 		paddr_t pa;
987 		pd_entry_t *pde;
988 		extern char __data_start;
989 
990 		lcr4(rcr4() | CR4_PSE);	/* enable hardware (via %cr4) */
991 		pmap_largepages = 1;	/* enable software */
992 
993 		/*
994 		 * the TLB must be flushed after enabling large pages
995 		 * on Pentium CPUs, according to section 3.6.2.2 of
996 		 * "Intel Architecture Software Developer's Manual,
997 		 * Volume 3: System Programming".
998 		 */
999 		tlbflush();
1000 
1001 		/*
1002 		 * now, remap the kernel text using large pages.  we
1003 		 * assume that the linker has properly aligned the
1004 		 * .data segment to a NBPD_L2 boundary.
1005 		 */
1006 		kva_end = rounddown((vaddr_t)&__data_start, NBPD_L1);
1007 		for (pa = 0, kva = KERNBASE; kva + NBPD_L2 <= kva_end;
1008 		     kva += NBPD_L2, pa += NBPD_L2) {
1009 			pde = &L2_BASE[pl2_i(kva)];
1010 			*pde = pa | pmap_pg_g | PG_PS |
1011 			    PG_KR | PG_V;	/* zap! */
1012 			tlbflush();
1013 		}
1014 #if defined(DEBUG)
1015 		printf("kernel text is mapped with "
1016 		    "%lu large pages and %lu normal pages\n",
1017 		    (unsigned long)howmany(kva - KERNBASE, NBPD_L2),
1018 		    (unsigned long)howmany((vaddr_t)&__data_start - kva,
1019 		    NBPD_L1));
1020 #endif /* defined(DEBUG) */
1021 	}
1022 
1023 	if (VM_MIN_KERNEL_ADDRESS != KERNBASE) {
1024 		/*
1025 		 * zero_pte is stuck at the end of mapped space for the kernel
1026 		 * image (disjunct from kva space). This is done so that it
1027 		 * can safely be used in pmap_growkernel (pmap_get_physpage),
1028 		 * when it's called for the first time.
1029 		 * XXXfvdl fix this for MULTIPROCESSOR later.
1030 		 */
1031 
1032 		early_zerop = (void *)(KERNBASE + NKL2_KIMG_ENTRIES * NBPD_L2);
1033 		early_zero_pte = PTE_BASE + pl1_i((unsigned long)early_zerop);
1034 	}
1035 
1036 	/*
1037 	 * now we allocate the "special" VAs which are used for tmp mappings
1038 	 * by the pmap (and other modules).    we allocate the VAs by advancing
1039 	 * virtual_avail (note that there are no pages mapped at these VAs).
1040 	 * we find the PTE that maps the allocated VA via the linear PTE
1041 	 * mapping.
1042 	 */
1043 
1044 	pte = PTE_BASE + pl1_i(virtual_avail);
1045 
1046 #ifdef MULTIPROCESSOR
1047 	/*
1048 	 * Waste some VA space to avoid false sharing of cache lines
1049 	 * for page table pages: Give each possible CPU a cache line
1050 	 * of PTE's (8) to play with, though we only need 4.  We could
1051 	 * recycle some of this waste by putting the idle stacks here
1052 	 * as well; we could waste less space if we knew the largest
1053 	 * CPU ID beforehand.
1054 	 */
1055 	csrcp = (char *) virtual_avail;  csrc_pte = pte;
1056 
1057 	cdstp = (char *) virtual_avail+PAGE_SIZE;  cdst_pte = pte+1;
1058 
1059 	zerop = (char *) virtual_avail+PAGE_SIZE*2;  zero_pte = pte+2;
1060 
1061 	ptpp = (char *) virtual_avail+PAGE_SIZE*3;  ptp_pte = pte+3;
1062 
1063 	virtual_avail += PAGE_SIZE * X86_MAXPROCS * NPTECL;
1064 	pte += X86_MAXPROCS * NPTECL;
1065 #else
1066 	csrcp = (void *) virtual_avail;  csrc_pte = pte;	/* allocate */
1067 	virtual_avail += PAGE_SIZE; pte++;			/* advance */
1068 
1069 	cdstp = (void *) virtual_avail;  cdst_pte = pte;
1070 	virtual_avail += PAGE_SIZE; pte++;
1071 
1072 	zerop = (void *) virtual_avail;  zero_pte = pte;
1073 	virtual_avail += PAGE_SIZE; pte++;
1074 
1075 	ptpp = (void *) virtual_avail;  ptp_pte = pte;
1076 	virtual_avail += PAGE_SIZE; pte++;
1077 #endif
1078 
1079 	if (VM_MIN_KERNEL_ADDRESS == KERNBASE) {
1080 		early_zerop = zerop;
1081 		early_zero_pte = zero_pte;
1082 	}
1083 
1084 	/*
1085 	 * Nothing after this point actually needs pte;
1086 	 */
1087 	pte = (void *)0xdeadbeef;
1088 
1089 	/* XXX: vmmap used by mem.c... should be uvm_map_reserve */
1090 	/* XXXfvdl PTEs not needed here */
1091 	vmmap = (char *)virtual_avail;			/* don't need pte */
1092 	virtual_avail += PAGE_SIZE; pte++;
1093 
1094 	idt_vaddr = virtual_avail;			/* don't need pte */
1095 	idt_paddr = avail_start;			/* steal a page */
1096 #if defined(__x86_64__)
1097 	virtual_avail += 2 * PAGE_SIZE; pte += 2;
1098 	avail_start += 2 * PAGE_SIZE;
1099 #else /* defined(__x86_64__) */
1100 	virtual_avail += PAGE_SIZE; pte++;
1101 	avail_start += PAGE_SIZE;
1102 #endif /* defined(__x86_64__) */
1103 
1104 #if defined(I586_CPU)
1105 	/* pentium f00f bug stuff */
1106 	pentium_idt_vaddr = virtual_avail;		/* don't need pte */
1107 	virtual_avail += PAGE_SIZE; pte++;
1108 #endif
1109 
1110 #ifdef _LP64
1111 	/*
1112 	 * Grab a page below 4G for things that need it (i.e.
1113 	 * having an initial %cr3 for the MP trampoline).
1114 	 */
1115 	lo32_vaddr = virtual_avail;
1116 	virtual_avail += PAGE_SIZE; pte++;
1117 	lo32_paddr = avail_start;
1118 	avail_start += PAGE_SIZE;
1119 #endif
1120 
1121 	/*
1122 	 * now we reserve some VM for mapping pages when doing a crash dump
1123 	 */
1124 
1125 	virtual_avail = reserve_dumppages(virtual_avail);
1126 
1127 	/*
1128 	 * init the static-global locks and global lists.
1129 	 *
1130 	 * => pventry::pvh_lock (initialized elsewhere) must also be
1131 	 *      a spin lock, again at IPL_VM to prevent deadlock, and
1132 	 *	again is never taken from interrupt context.
1133 	 */
1134 
1135 	rw_init(&pmap_main_lock);
1136 	mutex_init(&pmaps_lock, MUTEX_DEFAULT, IPL_NONE);
1137 	LIST_INIT(&pmaps);
1138 	pmap_cpu_init_early(curcpu());
1139 
1140 	/*
1141 	 * initialize caches.
1142 	 */
1143 
1144 	pool_cache_bootstrap(&pmap_cache, sizeof(struct pmap), 0, 0, 0, "pmappl",
1145 	    &pool_allocator_nointr, IPL_NONE, NULL, NULL, NULL);
1146 	pool_cache_bootstrap(&pmap_pdp_cache, PAGE_SIZE, 0, 0, 0, "pdppl",
1147 	    &pool_allocator_nointr, IPL_NONE, pmap_pdp_ctor, NULL, NULL);
1148 	pool_cache_bootstrap(&pmap_pv_cache, sizeof(struct pv_entry), 0, 0, 0,
1149 	    "pvpl", &pool_allocator_meta, IPL_NONE, NULL, NULL, NULL);
1150 
1151 	/*
1152 	 * ensure the TLB is sync'd with reality by flushing it...
1153 	 */
1154 
1155 	tlbflush();
1156 
1157 	/*
1158 	 * calculate pmap_maxkvaddr from nkptp[].
1159 	 */
1160 
1161 	kva = VM_MIN_KERNEL_ADDRESS;
1162 	for (i = PTP_LEVELS - 1; i >= 1; i--) {
1163 		kva += nkptp[i] * nbpd[i];
1164 	}
1165 	pmap_maxkvaddr = kva;
1166 }
1167 
1168 #if defined(__x86_64__)
1169 /*
1170  * Pre-allocate PTPs for low memory, so that 1:1 mappings for various
1171  * trampoline code can be entered.
1172  */
1173 void
1174 pmap_prealloc_lowmem_ptps(void)
1175 {
1176 	pd_entry_t *pdes;
1177 	int level;
1178 	paddr_t newp;
1179 
1180 	pdes = pmap_kernel()->pm_pdir;
1181 	level = PTP_LEVELS;
1182 	for (;;) {
1183 		newp = avail_start;
1184 		avail_start += PAGE_SIZE;
1185 		*early_zero_pte = (newp & PG_FRAME) | PG_V | PG_RW;
1186 		pmap_update_pg((vaddr_t)early_zerop);
1187 		memset(early_zerop, 0, PAGE_SIZE);
1188 		pdes[pl_i(0, level)] = (newp & PG_FRAME) | PG_V | PG_RW;
1189 		level--;
1190 		if (level <= 1)
1191 			break;
1192 		pdes = normal_pdes[level - 2];
1193 	}
1194 }
1195 #endif /* defined(__x86_64__) */
1196 
1197 /*
1198  * pmap_init: called from uvm_init, our job is to get the pmap
1199  * system ready to manage mappings...
1200  */
1201 
1202 void
1203 pmap_init(void)
1204 {
1205 
1206 	/*
1207 	 * done: pmap module is up (and ready for business)
1208 	 */
1209 
1210 	pmap_initialized = true;
1211 }
1212 
1213 /*
1214  * pmap_cpu_init_early: perform early per-CPU initialization.
1215  */
1216 
1217 void
1218 pmap_cpu_init_early(struct cpu_info *ci)
1219 {
1220 	struct pmap_cpu *pc;
1221 	static uint8_t pmap_cpu_alloc;
1222 
1223 	pc = &pmap_cpu[pmap_cpu_alloc++].pc;
1224 	ci->ci_pmap_cpu = pc;
1225 }
1226 
1227 /*
1228  * pmap_cpu_init_late: perform late per-CPU initialization.
1229  */
1230 
1231 void
1232 pmap_cpu_init_late(struct cpu_info *ci)
1233 {
1234 
1235 	if (ci == &cpu_info_primary)
1236 		evcnt_attach_dynamic(&pmap_tlb_evcnt, EVCNT_TYPE_INTR,
1237 		    NULL, "global", "TLB IPI");
1238 	evcnt_attach_dynamic(&ci->ci_tlb_evcnt, EVCNT_TYPE_INTR,
1239 	    NULL, ci->ci_dev->dv_xname, "TLB IPI");
1240 }
1241 
1242 /*
1243  * p v _ e n t r y   f u n c t i o n s
1244  */
1245 
1246 /*
1247  * pmap_free_pvs: free a list of pv_entrys
1248  */
1249 
1250 static void
1251 pmap_free_pvs(struct pv_entry *pv)
1252 {
1253 	struct pv_entry *next;
1254 
1255 	for ( /* null */ ; pv != NULL ; pv = next) {
1256 		next = SPLAY_RIGHT(pv, pv_node);
1257 		pool_cache_put(&pmap_pv_cache, pv);
1258 	}
1259 }
1260 
1261 /*
1262  * pmap_lock_pvhs: Lock pvh1 and optional pvh2
1263  *                 Observe locking order when locking both pvhs
1264  */
1265 
1266 static void
1267 pmap_lock_pvhs(struct pv_head *pvh1, struct pv_head *pvh2)
1268 {
1269 
1270 	if (pvh2 == NULL) {
1271 		mutex_spin_enter(&pvh1->pvh_lock);
1272 		return;
1273 	}
1274 
1275 	if (pvh1 < pvh2) {
1276 		mutex_spin_enter(&pvh1->pvh_lock);
1277 		mutex_spin_enter(&pvh2->pvh_lock);
1278 	} else {
1279 		mutex_spin_enter(&pvh2->pvh_lock);
1280 		mutex_spin_enter(&pvh1->pvh_lock);
1281 	}
1282 }
1283 
1284 
1285 /*
1286  * main pv_entry manipulation functions:
1287  *   pmap_enter_pv: enter a mapping onto a pv_head list
1288  *   pmap_remove_pv: remove a mappiing from a pv_head list
1289  *
1290  * NOTE: Both pmap_enter_pv and pmap_remove_pv expect the caller to lock
1291  *       the pvh before calling
1292  */
1293 
1294 /*
1295  * pmap_enter_pv: enter a mapping onto a pv_head lst
1296  *
1297  * => caller should hold the proper lock on pmap_main_lock
1298  * => caller should have pmap locked
1299  * => caller should have the pv_head locked
1300  * => caller should adjust ptp's wire_count before calling
1301  */
1302 
1303 static void
1304 pmap_enter_pv(struct pv_head *pvh,
1305 	      struct pv_entry *pve,	/* preallocated pve for us to use */
1306 	      struct pmap *pmap,
1307 	      vaddr_t va,
1308 	      struct vm_page *ptp)	/* PTP in pmap that maps this VA */
1309 {
1310 	pve->pv_pmap = pmap;
1311 	pve->pv_va = va;
1312 	pve->pv_ptp = ptp;			/* NULL for kernel pmap */
1313 	SPLAY_INSERT(pvtree, &pvh->pvh_root, pve); /* add to locked list */
1314 }
1315 
1316 /*
1317  * pmap_remove_pv: try to remove a mapping from a pv_list
1318  *
1319  * => caller should hold proper lock on pmap_main_lock
1320  * => pmap should be locked
1321  * => caller should hold lock on pv_head [so that attrs can be adjusted]
1322  * => caller should adjust ptp's wire_count and free PTP if needed
1323  * => we return the removed pve
1324  */
1325 
1326 static struct pv_entry *
1327 pmap_remove_pv(struct pv_head *pvh, struct pmap *pmap, vaddr_t va)
1328 {
1329 	struct pv_entry tmp, *pve;
1330 
1331 	tmp.pv_pmap = pmap;
1332 	tmp.pv_va = va;
1333 	pve = SPLAY_FIND(pvtree, &pvh->pvh_root, &tmp);
1334 	if (pve == NULL)
1335 		return (NULL);
1336 	SPLAY_REMOVE(pvtree, &pvh->pvh_root, pve);
1337 	return(pve);				/* return removed pve */
1338 }
1339 
1340 /*
1341  * p t p   f u n c t i o n s
1342  */
1343 
1344 static inline struct vm_page *
1345 pmap_find_ptp(struct pmap *pmap, vaddr_t va, paddr_t pa, int level)
1346 {
1347 	int lidx = level - 1;
1348 	struct vm_page *pg;
1349 
1350 	if (pa != (paddr_t)-1 && pmap->pm_ptphint[lidx] &&
1351 	    pa == VM_PAGE_TO_PHYS(pmap->pm_ptphint[lidx])) {
1352 		return (pmap->pm_ptphint[lidx]);
1353 	}
1354 	if (lidx == 0)
1355 		pg = uvm_pagelookup(&pmap->pm_obj[lidx], ptp_va2o(va, level));
1356 	else {
1357 		mutex_enter(&pmap->pm_obj[lidx].vmobjlock);
1358 		pg = uvm_pagelookup(&pmap->pm_obj[lidx], ptp_va2o(va, level));
1359 		mutex_exit(&pmap->pm_obj[lidx].vmobjlock);
1360 	}
1361 	return pg;
1362 }
1363 
1364 static inline void
1365 pmap_freepage(struct pmap *pmap, struct vm_page *ptp, int level,
1366     struct vm_page **empty_ptps)
1367 {
1368 	int lidx;
1369 	struct uvm_object *obj;
1370 
1371 	lidx = level - 1;
1372 
1373 	obj = &pmap->pm_obj[lidx];
1374 	pmap->pm_stats.resident_count--;
1375 	if (lidx != 0)
1376 		mutex_enter(&obj->vmobjlock);
1377 	if (pmap->pm_ptphint[lidx] == ptp)
1378 		pmap->pm_ptphint[lidx] = TAILQ_FIRST(&obj->memq);
1379 	ptp->wire_count = 0;
1380 	uvm_pagerealloc(ptp, NULL, 0);
1381 	ptp->flags |= PG_ZERO;
1382 	ptp->mdpage.mp_link = *empty_ptps;
1383 	*empty_ptps = ptp;
1384 	if (lidx != 0)
1385 		mutex_exit(&obj->vmobjlock);
1386 }
1387 
1388 static void
1389 pmap_free_ptp(struct pmap *pmap, struct vm_page *ptp, vaddr_t va,
1390 	      pt_entry_t *ptes, pd_entry_t **pdes, struct vm_page **empty_ptps)
1391 {
1392 	unsigned long index;
1393 	int level;
1394 	vaddr_t invaladdr;
1395 	pd_entry_t opde;
1396 	struct pmap *curpmap = vm_map_pmap(&curlwp->l_proc->p_vmspace->vm_map);
1397 
1398 	level = 1;
1399 	do {
1400 		pmap_freepage(pmap, ptp, level, empty_ptps);
1401 		index = pl_i(va, level + 1);
1402 		opde = pmap_pte_set(&pdes[level - 1][index], 0);
1403 		invaladdr = level == 1 ? (vaddr_t)ptes :
1404 		    (vaddr_t)pdes[level - 2];
1405 		pmap_tlb_shootdown(curpmap, invaladdr + index * PAGE_SIZE,
1406 		    0, opde);
1407 #if defined(MULTIPROCESSOR)
1408 		invaladdr = level == 1 ? (vaddr_t)PTE_BASE :
1409 		    (vaddr_t)normal_pdes[level - 2];
1410 		pmap_tlb_shootdown(pmap, invaladdr + index * PAGE_SIZE,
1411 		    0, opde);
1412 #endif
1413 		if (level < PTP_LEVELS - 1) {
1414 			ptp = pmap_find_ptp(pmap, va, (paddr_t)-1, level + 1);
1415 			ptp->wire_count--;
1416 			if (ptp->wire_count > 1)
1417 				break;
1418 		}
1419 	} while (++level < PTP_LEVELS);
1420 }
1421 
1422 /*
1423  * pmap_get_ptp: get a PTP (if there isn't one, allocate a new one)
1424  *
1425  * => pmap should NOT be pmap_kernel()
1426  * => pmap should be locked
1427  */
1428 
1429 static struct vm_page *
1430 pmap_get_ptp(struct pmap *pmap, vaddr_t va, pd_entry_t **pdes)
1431 {
1432 	struct vm_page *ptp, *pptp;
1433 	int i;
1434 	unsigned long index;
1435 	pd_entry_t *pva;
1436 	paddr_t ppa, pa;
1437 	struct uvm_object *obj;
1438 
1439 	ptp = NULL;
1440 	pa = (paddr_t)-1;
1441 
1442 	/*
1443 	 * Loop through all page table levels seeing if we need to
1444 	 * add a new page to that level.
1445 	 */
1446 	for (i = PTP_LEVELS; i > 1; i--) {
1447 		/*
1448 		 * Save values from previous round.
1449 		 */
1450 		pptp = ptp;
1451 		ppa = pa;
1452 
1453 		index = pl_i(va, i);
1454 		pva = pdes[i - 2];
1455 
1456 		if (pmap_valid_entry(pva[index])) {
1457 			ppa = pva[index] & PG_FRAME;
1458 			ptp = NULL;
1459 			continue;
1460 		}
1461 
1462 		obj = &pmap->pm_obj[i-2];
1463 		/*
1464 		 * XXX pm_obj[0] is pm_lock, which is already locked.
1465 		 */
1466 		if (i != 2)
1467 			mutex_enter(&obj->vmobjlock);
1468 		ptp = uvm_pagealloc(obj, ptp_va2o(va, i - 1), NULL,
1469 		    UVM_PGA_USERESERVE|UVM_PGA_ZERO);
1470 		if (i != 2)
1471 			mutex_exit(&obj->vmobjlock);
1472 
1473 		if (ptp == NULL)
1474 			return NULL;
1475 
1476 		ptp->flags &= ~PG_BUSY; /* never busy */
1477 		ptp->wire_count = 1;
1478 		pmap->pm_ptphint[i - 2] = ptp;
1479 		pa = VM_PAGE_TO_PHYS(ptp);
1480 		pva[index] = (pd_entry_t) (pa | PG_u | PG_RW | PG_V);
1481 		pmap->pm_stats.resident_count++;
1482 		/*
1483 		 * If we're not in the top level, increase the
1484 		 * wire count of the parent page.
1485 		 */
1486 		if (i < PTP_LEVELS) {
1487 			if (pptp == NULL)
1488 				pptp = pmap_find_ptp(pmap, va, ppa, i);
1489 #ifdef DIAGNOSTIC
1490 			if (pptp == NULL)
1491 				panic("pde page disappeared");
1492 #endif
1493 			pptp->wire_count++;
1494 		}
1495 	}
1496 
1497 	/*
1498 	 * ptp is not NULL if we just allocated a new ptp. If it's
1499 	 * still NULL, we must look up the existing one.
1500 	 */
1501 	if (ptp == NULL) {
1502 		ptp = pmap_find_ptp(pmap, va, ppa, 1);
1503 #ifdef DIAGNOSTIC
1504 		if (ptp == NULL) {
1505 			printf("va %lx ppa %lx\n", (unsigned long)va,
1506 			    (unsigned long)ppa);
1507 			panic("pmap_get_ptp: unmanaged user PTP");
1508 		}
1509 #endif
1510 	}
1511 
1512 	pmap->pm_ptphint[0] = ptp;
1513 	return(ptp);
1514 }
1515 
1516 /*
1517  * p m a p  l i f e c y c l e   f u n c t i o n s
1518  */
1519 
1520 /*
1521  * pmap_pdp_ctor: constructor for the PDP cache.
1522  */
1523 
1524 int
1525 pmap_pdp_ctor(void *arg, void *object, int flags)
1526 {
1527 	pd_entry_t *pdir = object;
1528 	paddr_t pdirpa = 0;	/* XXX: GCC */
1529 	int npde;
1530 
1531 	/*
1532 	 * NOTE: The `pmap_lock' is held when the PDP is allocated.
1533 	 */
1534 
1535 	/* fetch the physical address of the page directory. */
1536 	(void) pmap_extract(pmap_kernel(), (vaddr_t) pdir, &pdirpa);
1537 
1538 	/* zero init area */
1539 	memset(pdir, 0, PDIR_SLOT_PTE * sizeof(pd_entry_t));
1540 
1541 	/* put in recursive PDE to map the PTEs */
1542 	pdir[PDIR_SLOT_PTE] = pdirpa | PG_V | PG_KW;
1543 
1544 	npde = nkptp[PTP_LEVELS - 1];
1545 
1546 	/* put in kernel VM PDEs */
1547 	memcpy(&pdir[PDIR_SLOT_KERN], &PDP_BASE[PDIR_SLOT_KERN],
1548 	    npde * sizeof(pd_entry_t));
1549 
1550 	/* zero the rest */
1551 	memset(&pdir[PDIR_SLOT_KERN + npde], 0,
1552 	    (NTOPLEVEL_PDES - (PDIR_SLOT_KERN + npde)) * sizeof(pd_entry_t));
1553 
1554 	if (VM_MIN_KERNEL_ADDRESS != KERNBASE) {
1555 		int idx = pl_i(KERNBASE, PTP_LEVELS);
1556 
1557 		pdir[idx] = PDP_BASE[idx];
1558 	}
1559 
1560 	return (0);
1561 }
1562 
1563 /*
1564  * pmap_create: create a pmap
1565  *
1566  * => note: old pmap interface took a "size" args which allowed for
1567  *	the creation of "software only" pmaps (not in bsd).
1568  */
1569 
1570 struct pmap *
1571 pmap_create(void)
1572 {
1573 	struct pmap *pmap;
1574 	int i;
1575 
1576 	pmap = pool_cache_get(&pmap_cache, PR_WAITOK);
1577 
1578 	/* init uvm_object */
1579 	for (i = 0; i < PTP_LEVELS - 1; i++) {
1580 		UVM_OBJ_INIT(&pmap->pm_obj[i], NULL, 1);
1581 		pmap->pm_ptphint[i] = NULL;
1582 	}
1583 	pmap->pm_stats.wired_count = 0;
1584 	pmap->pm_stats.resident_count = 1;	/* count the PDP allocd below */
1585 #if !defined(__x86_64__)
1586 	pmap->pm_hiexec = 0;
1587 #endif /* !defined(__x86_64__) */
1588 	pmap->pm_flags = 0;
1589 	pmap->pm_cpus = 0;
1590 	pmap->pm_kernel_cpus = 0;
1591 
1592 	/* init the LDT */
1593 	pmap->pm_ldt = NULL;
1594 	pmap->pm_ldt_len = 0;
1595 	pmap->pm_ldt_sel = GSYSSEL(GLDT_SEL, SEL_KPL);
1596 
1597 	/* allocate PDP */
1598  try_again:
1599 	pmap->pm_pdir = pool_cache_get(&pmap_pdp_cache, PR_WAITOK);
1600 
1601 	mutex_enter(&pmaps_lock);
1602 
1603 	if (pmap->pm_pdir[PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1] - 1] == 0) {
1604 		mutex_exit(&pmaps_lock);
1605 		pool_cache_destruct_object(&pmap_pdp_cache, pmap->pm_pdir);
1606 		goto try_again;
1607 	}
1608 
1609 	pmap->pm_pdirpa = pmap->pm_pdir[PDIR_SLOT_PTE] & PG_FRAME;
1610 
1611 	LIST_INSERT_HEAD(&pmaps, pmap, pm_list);
1612 
1613 	mutex_exit(&pmaps_lock);
1614 
1615 	return (pmap);
1616 }
1617 
1618 /*
1619  * pmap_destroy: drop reference count on pmap.   free pmap if
1620  *	reference count goes to zero.
1621  */
1622 
1623 void
1624 pmap_destroy(struct pmap *pmap)
1625 {
1626 	int i;
1627 #ifdef DIAGNOSTIC
1628 	struct cpu_info *ci;
1629 	CPU_INFO_ITERATOR cii;
1630 #endif /* DIAGNOSTIC */
1631 
1632 	/*
1633 	 * drop reference count
1634 	 */
1635 
1636 	if (atomic_dec_uint_nv((unsigned *)&pmap->pm_obj[0].uo_refs) > 0) {
1637 		return;
1638 	}
1639 
1640 #ifdef DIAGNOSTIC
1641 	for (CPU_INFO_FOREACH(cii, ci))
1642 		if (ci->ci_pmap == pmap)
1643 			panic("destroying pmap being used");
1644 #endif /* DIAGNOSTIC */
1645 
1646 	/*
1647 	 * reference count is zero, free pmap resources and then free pmap.
1648 	 */
1649 
1650 	/*
1651 	 * remove it from global list of pmaps
1652 	 */
1653 
1654 	KERNEL_LOCK(1, NULL);
1655 
1656 	mutex_enter(&pmaps_lock);
1657 	LIST_REMOVE(pmap, pm_list);
1658 	mutex_exit(&pmaps_lock);
1659 
1660 	/*
1661 	 * destroyed pmap shouldn't have remaining PTPs
1662 	 */
1663 
1664 	for (i = 0; i < PTP_LEVELS - 1; i++) {
1665 		KASSERT(pmap->pm_obj[i].uo_npages == 0);
1666 		KASSERT(TAILQ_EMPTY(&pmap->pm_obj[i].memq));
1667 	}
1668 
1669 	/*
1670 	 * MULTIPROCESSOR -- no need to flush out of other processors'
1671 	 * APTE space because we do that in pmap_unmap_ptes().
1672 	 */
1673 	pool_cache_put(&pmap_pdp_cache, pmap->pm_pdir);
1674 
1675 #ifdef USER_LDT
1676 	if (pmap->pm_flags & PMF_USER_LDT) {
1677 		/*
1678 		 * no need to switch the LDT; this address space is gone,
1679 		 * nothing is using it.
1680 		 *
1681 		 * No need to lock the pmap for ldt_free (or anything else),
1682 		 * we're the last one to use it.
1683 		 */
1684 		ldt_free(pmap->pm_ldt_sel);
1685 		uvm_km_free(kernel_map, (vaddr_t)pmap->pm_ldt,
1686 		    pmap->pm_ldt_len * sizeof(union descriptor), UVM_KMF_WIRED);
1687 	}
1688 #endif
1689 
1690 	for (i = 0; i < PTP_LEVELS - 1; i++)
1691 		mutex_destroy(&pmap->pm_obj[i].vmobjlock);
1692 	pool_cache_put(&pmap_cache, pmap);
1693 
1694 	KERNEL_UNLOCK_ONE(NULL);
1695 }
1696 
1697 /*
1698  *	Add a reference to the specified pmap.
1699  */
1700 
1701 inline void
1702 pmap_reference(struct pmap *pmap)
1703 {
1704 
1705 	atomic_inc_uint((unsigned *)&pmap->pm_obj[0].uo_refs);
1706 }
1707 
1708 #if defined(PMAP_FORK)
1709 /*
1710  * pmap_fork: perform any necessary data structure manipulation when
1711  * a VM space is forked.
1712  */
1713 
1714 void
1715 pmap_fork(struct pmap *pmap1, struct pmap *pmap2)
1716 {
1717 #ifdef USER_LDT
1718 	union descriptor *new_ldt;
1719 	size_t len;
1720 	int sel;
1721 
1722  retry:
1723 	if (pmap1->pm_flags & PMF_USER_LDT) {
1724 		len = pmap1->pm_ldt_len * sizeof(union descriptor);
1725 		new_ldt = (union descriptor *)uvm_km_alloc(kernel_map,
1726 		    len, 0, UVM_KMF_WIRED);
1727 		sel = ldt_alloc(new_ldt, len);
1728 	} else {
1729 		len = -1;
1730 		new_ldt = NULL;
1731 		sel = -1;
1732 	}
1733 
1734 	if ((uintptr_t) pmap1 < (uintptr_t) pmap2) {
1735 		mutex_enter(&pmap1->pm_obj.vmobjlock);
1736 		mutex_enter(&pmap2->pm_obj.vmobjlock);
1737 	} else {
1738 		mutex_enter(&pmap2->pm_obj.vmobjlock);
1739 		mutex_enter(&pmap1->pm_obj.vmobjlock);
1740 	}
1741 
1742  	/* Copy the LDT, if necessary. */
1743  	if (pmap1->pm_flags & PMF_USER_LDT) {
1744 		if (len != pmap1->pm_ldt_len * sizeof(union descriptor)) {
1745 			mutex_exit(&pmap2->pm_obj.vmobjlock);
1746 			mutex_exit(&pmap1->pm_obj.vmobjlock);
1747 			if (len != -1) {
1748 				ldt_free(sel);
1749 				uvm_km_free(kernel_map, (vaddr_t)new_ldt,
1750 				    len, UVM_KMF_WIRED);
1751 			}
1752 			goto retry;
1753 		}
1754 
1755 		memcpy(new_ldt, pmap1->pm_ldt, len);
1756 		pmap2->pm_ldt = new_ldt;
1757 		pmap2->pm_ldt_len = pmap1->pm_ldt_len;
1758 		pmap2->pm_flags |= PMF_USER_LDT;
1759 		pmap2->pm_ldt_sel = sel;
1760 		len = -1;
1761 	}
1762 
1763 	mutex_exit(&pmap2->pm_obj.vmobjlock);
1764 	mutex_exit(&pmap1->pm_obj.vmobjlock);
1765 
1766 	if (len != -1) {
1767 		ldt_free(sel);
1768 		uvm_km_free(kernel_map, (vaddr_t)new_ldt, len,
1769 		    UVM_KMF_WIRED);
1770 	}
1771 #endif /* USER_LDT */
1772 }
1773 #endif /* PMAP_FORK */
1774 
1775 #ifdef USER_LDT
1776 /*
1777  * pmap_ldt_cleanup: if the pmap has a local LDT, deallocate it, and
1778  * restore the default.
1779  */
1780 
1781 void
1782 pmap_ldt_cleanup(struct lwp *l)
1783 {
1784 	struct pcb *pcb = &l->l_addr->u_pcb;
1785 	pmap_t pmap = l->l_proc->p_vmspace->vm_map.pmap;
1786 	union descriptor *old_ldt = NULL;
1787 	size_t len = 0;
1788 	int sel = -1;
1789 
1790 	mutex_enter(&pmap->pm_lock);
1791 
1792 	if (pmap->pm_flags & PMF_USER_LDT) {
1793 		sel = pmap->pm_ldt_sel;
1794 		pmap->pm_ldt_sel = GSYSSEL(GLDT_SEL, SEL_KPL);
1795 		pcb->pcb_ldt_sel = pmap->pm_ldt_sel;
1796 		if (l == curlwp)
1797 			lldt(pcb->pcb_ldt_sel);
1798 		old_ldt = pmap->pm_ldt;
1799 		len = pmap->pm_ldt_len * sizeof(union descriptor);
1800 		pmap->pm_ldt = NULL;
1801 		pmap->pm_ldt_len = 0;
1802 		pmap->pm_flags &= ~PMF_USER_LDT;
1803 	}
1804 
1805 	mutex_exit(&pmap->pm_lock);
1806 
1807 	if (sel != -1)
1808 		ldt_free(sel);
1809 	if (old_ldt != NULL)
1810 		uvm_km_free(kernel_map, (vaddr_t)old_ldt, len, UVM_KMF_WIRED);
1811 }
1812 #endif /* USER_LDT */
1813 
1814 /*
1815  * pmap_activate: activate a process' pmap
1816  *
1817  * => must be called with kernel preemption disabled
1818  * => if lwp is the curlwp, then set ci_want_pmapload so that
1819  *    actual MMU context switch will be done by pmap_load() later
1820  */
1821 
1822 void
1823 pmap_activate(struct lwp *l)
1824 {
1825 	struct cpu_info *ci;
1826 	struct pmap *pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map);
1827 
1828 	ci = curcpu();
1829 
1830 	if (l == ci->ci_curlwp) {
1831 		struct pcb *pcb;
1832 
1833 		KASSERT(ci->ci_want_pmapload == 0);
1834 		KASSERT(ci->ci_tlbstate != TLBSTATE_VALID);
1835 #ifdef KSTACK_CHECK_DR0
1836 		/*
1837 		 * setup breakpoint on the top of stack
1838 		 */
1839 		if (l == &lwp0)
1840 			dr0(0, 0, 0, 0);
1841 		else
1842 			dr0(KSTACK_LOWEST_ADDR(l), 1, 3, 1);
1843 #endif
1844 
1845 		/*
1846 		 * no need to switch to kernel vmspace because
1847 		 * it's a subset of any vmspace.
1848 		 */
1849 
1850 		if (pmap == pmap_kernel()) {
1851 			ci->ci_want_pmapload = 0;
1852 			return;
1853 		}
1854 
1855 		pcb = &l->l_addr->u_pcb;
1856 		pcb->pcb_ldt_sel = pmap->pm_ldt_sel;
1857 
1858 		ci->ci_want_pmapload = 1;
1859 
1860 #if defined(__x86_64__)
1861 		if (pcb->pcb_flags & PCB_GS64)
1862 			wrmsr(MSR_KERNELGSBASE, pcb->pcb_gs);
1863 		if (pcb->pcb_flags & PCB_FS64)
1864 			wrmsr(MSR_FSBASE, pcb->pcb_fs);
1865 #endif /* defined(__x86_64__) */
1866 	}
1867 }
1868 
1869 /*
1870  * pmap_reactivate: try to regain reference to the pmap.
1871  *
1872  * => must be called with kernel preemption disabled
1873  */
1874 
1875 static bool
1876 pmap_reactivate(struct pmap *pmap)
1877 {
1878 	struct cpu_info *ci;
1879 	uint32_t cpumask;
1880 	bool result;
1881 	uint32_t oldcpus;
1882 
1883 	ci = curcpu();
1884 	cpumask = ci->ci_cpumask;
1885 
1886 	KASSERT(pmap->pm_pdirpa == rcr3());
1887 
1888 	/*
1889 	 * if we still have a lazy reference to this pmap,
1890 	 * we can assume that there was no tlb shootdown
1891 	 * for this pmap in the meantime.
1892 	 *
1893 	 * the order of events here is important as we must
1894 	 * synchronize with TLB shootdown interrupts.  declare
1895 	 * interest in invalidations (TLBSTATE_VALID) and then
1896 	 * check the cpumask, which the IPIs can change only
1897 	 * when the state is !TLBSTATE_VALID.
1898 	 */
1899 
1900 	ci->ci_tlbstate = TLBSTATE_VALID;
1901 	oldcpus = pmap->pm_cpus;
1902 	x86_atomic_setbits_l(&pmap->pm_cpus, cpumask);
1903 	KASSERT((pmap->pm_kernel_cpus & cpumask) != 0);
1904 	if (oldcpus & cpumask) {
1905 		/* got it */
1906 		result = true;
1907 	} else {
1908 		result = false;
1909 	}
1910 
1911 	return result;
1912 }
1913 
1914 /*
1915  * pmap_load: actually switch pmap.  (fill in %cr3 and LDT info)
1916  */
1917 
1918 void
1919 pmap_load(void)
1920 {
1921 	struct cpu_info *ci;
1922 	uint32_t cpumask;
1923 	struct pmap *pmap;
1924 	struct pmap *oldpmap;
1925 	struct lwp *l;
1926 	struct pcb *pcb;
1927 	uint64_t ncsw;
1928 
1929 	crit_enter();
1930 	KASSERT(curcpu()->ci_want_pmapload);
1931  retry:
1932 	ci = curcpu();
1933 	cpumask = ci->ci_cpumask;
1934 
1935 	/* should be able to take ipis. */
1936 	KASSERT(ci->ci_ilevel < IPL_IPI);
1937 	KASSERT((x86_read_psl() & PSL_I) != 0);
1938 
1939 	l = ci->ci_curlwp;
1940 	KASSERT(l != NULL);
1941 	pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map);
1942 	KASSERT(pmap != pmap_kernel());
1943 	oldpmap = ci->ci_pmap;
1944 
1945 	pcb = &l->l_addr->u_pcb;
1946 	/* loaded by pmap_activate */
1947 	KASSERT(pcb->pcb_ldt_sel == pmap->pm_ldt_sel);
1948 
1949 	if (pmap == oldpmap) {
1950 		if (!pmap_reactivate(pmap)) {
1951 
1952 			/*
1953 			 * pmap has been changed during deactivated.
1954 			 * our tlb may be stale.
1955 			 */
1956 
1957 			tlbflush();
1958 		}
1959 
1960 		ci->ci_want_pmapload = 0;
1961 		crit_exit();
1962 		return;
1963 	}
1964 
1965 	/*
1966 	 * grab a reference to the new pmap.
1967 	 */
1968 
1969 	pmap_reference(pmap);
1970 
1971 	/*
1972 	 * actually switch pmap.
1973 	 */
1974 
1975 	x86_atomic_clearbits_l(&oldpmap->pm_cpus, cpumask);
1976 	x86_atomic_clearbits_l(&oldpmap->pm_kernel_cpus, cpumask);
1977 
1978 	KASSERT(oldpmap->pm_pdirpa == rcr3());
1979 	KASSERT((pmap->pm_cpus & cpumask) == 0);
1980 	KASSERT((pmap->pm_kernel_cpus & cpumask) == 0);
1981 
1982 	/*
1983 	 * mark the pmap in use by this processor.  again we must
1984 	 * synchronize with TLB shootdown interrupts, so set the
1985 	 * state VALID first, then register us for shootdown events
1986 	 * on this pmap.
1987 	 */
1988 
1989 	ci->ci_tlbstate = TLBSTATE_VALID;
1990 	x86_atomic_setbits_l(&pmap->pm_cpus, cpumask);
1991 	x86_atomic_setbits_l(&pmap->pm_kernel_cpus, cpumask);
1992 	ci->ci_pmap = pmap;
1993 
1994 	/*
1995 	 * update tss.  now that we have registered for invalidations
1996 	 * from other CPUs, we're good to load the page tables.
1997 	 */
1998 
1999 	lldt(pcb->pcb_ldt_sel);
2000 	pcb->pcb_cr3 = pmap->pm_pdirpa;
2001 	lcr3(pcb->pcb_cr3);
2002 
2003 	ci->ci_want_pmapload = 0;
2004 
2005 	/*
2006 	 * we're now running with the new pmap.  drop the reference
2007 	 * to the old pmap.  if we block, we need to go around again.
2008 	 */
2009 
2010 	ncsw = l->l_ncsw;
2011 	pmap_destroy(oldpmap);
2012 	if (l->l_ncsw != ncsw) {
2013 		goto retry;
2014 	}
2015 
2016 	crit_exit();
2017 }
2018 
2019 /*
2020  * pmap_deactivate: deactivate a process' pmap
2021  *
2022  * => must be called with kernel preemption disabled (high SPL is enough)
2023  */
2024 
2025 void
2026 pmap_deactivate(struct lwp *l)
2027 {
2028 	struct pmap *pmap;
2029 	struct cpu_info *ci;
2030 
2031 	if (l != curlwp) {
2032 		return;
2033 	}
2034 
2035 	/*
2036 	 * wait for pending TLB shootdowns to complete.  necessary
2037 	 * because TLB shootdown state is per-CPU, and the LWP may
2038 	 * be coming off the CPU before it has a chance to call
2039 	 * pmap_update().
2040 	 */
2041 	pmap_tlb_shootwait();
2042 
2043 	ci = curcpu();
2044 
2045 	if (ci->ci_want_pmapload) {
2046 		KASSERT(vm_map_pmap(&l->l_proc->p_vmspace->vm_map)
2047 		    != pmap_kernel());
2048 		KASSERT(vm_map_pmap(&l->l_proc->p_vmspace->vm_map)
2049 		    != ci->ci_pmap || ci->ci_tlbstate != TLBSTATE_VALID);
2050 
2051 		/*
2052 		 * userspace has not been touched.
2053 		 * nothing to do here.
2054 		 */
2055 
2056 		ci->ci_want_pmapload = 0;
2057 		return;
2058 	}
2059 
2060 	pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map);
2061 
2062 	if (pmap == pmap_kernel()) {
2063 		return;
2064 	}
2065 
2066 	KASSERT(pmap->pm_pdirpa == rcr3());
2067 	KASSERT(ci->ci_pmap == pmap);
2068 
2069 	/*
2070 	 * we aren't interested in TLB invalidations for this pmap,
2071 	 * at least for the time being.
2072 	 */
2073 
2074 	KASSERT(ci->ci_tlbstate == TLBSTATE_VALID);
2075 	ci->ci_tlbstate = TLBSTATE_LAZY;
2076 }
2077 
2078 /*
2079  * end of lifecycle functions
2080  */
2081 
2082 /*
2083  * some misc. functions
2084  */
2085 
2086 static int
2087 pmap_pdes_invalid(vaddr_t va, pd_entry_t **pdes, pd_entry_t *lastpde)
2088 {
2089 	int i;
2090 	unsigned long index;
2091 	pd_entry_t pde;
2092 
2093 	for (i = PTP_LEVELS; i > 1; i--) {
2094 		index = pl_i(va, i);
2095 		pde = pdes[i - 2][index];
2096 		if ((pde & PG_V) == 0)
2097 			return i;
2098 	}
2099 	if (lastpde != NULL)
2100 		*lastpde = pde;
2101 	return 0;
2102 }
2103 
2104 /*
2105  * pmap_extract: extract a PA for the given VA
2106  */
2107 
2108 bool
2109 pmap_extract(struct pmap *pmap, vaddr_t va, paddr_t *pap)
2110 {
2111 	pt_entry_t *ptes, pte;
2112 	pd_entry_t pde, **pdes;
2113 	struct pmap *pmap2;
2114 
2115 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
2116 	if (!pmap_pdes_valid(va, pdes, &pde)) {
2117 		pmap_unmap_ptes(pmap, pmap2);
2118 		return false;
2119 	}
2120 	pte = ptes[pl1_i(va)];
2121 	pmap_unmap_ptes(pmap, pmap2);
2122 
2123 	if (pde & PG_PS) {
2124 		if (pap != NULL)
2125 			*pap = (pde & PG_LGFRAME) | (va & (NBPD_L2 - 1));
2126 		return (true);
2127 	}
2128 
2129 	if (__predict_true((pte & PG_V) != 0)) {
2130 		if (pap != NULL)
2131 			*pap = (pte & PG_FRAME) | (va & (NBPD_L1 - 1));
2132 		return (true);
2133 	}
2134 
2135 	return false;
2136 }
2137 
2138 
2139 /*
2140  * vtophys: virtual address to physical address.  For use by
2141  * machine-dependent code only.
2142  */
2143 
2144 paddr_t
2145 vtophys(vaddr_t va)
2146 {
2147 	paddr_t pa;
2148 
2149 	if (pmap_extract(pmap_kernel(), va, &pa) == true)
2150 		return (pa);
2151 	return (0);
2152 }
2153 
2154 
2155 /*
2156  * pmap_virtual_space: used during bootup [pmap_steal_memory] to
2157  *	determine the bounds of the kernel virtual addess space.
2158  */
2159 
2160 void
2161 pmap_virtual_space(vaddr_t *startp, vaddr_t *endp)
2162 {
2163 	*startp = virtual_avail;
2164 	*endp = virtual_end;
2165 }
2166 
2167 /*
2168  * pmap_map: map a range of PAs into kvm.
2169  *
2170  * => used during crash dump
2171  * => XXX: pmap_map() should be phased out?
2172  */
2173 
2174 vaddr_t
2175 pmap_map(vaddr_t va, paddr_t spa, paddr_t epa, vm_prot_t prot)
2176 {
2177 	while (spa < epa) {
2178 		pmap_enter(pmap_kernel(), va, spa, prot, 0);
2179 		va += PAGE_SIZE;
2180 		spa += PAGE_SIZE;
2181 	}
2182 	pmap_update(pmap_kernel());
2183 	return va;
2184 }
2185 
2186 /*
2187  * pmap_zero_page: zero a page
2188  */
2189 
2190 void
2191 pmap_zero_page(paddr_t pa)
2192 {
2193 #ifdef MULTIPROCESSOR
2194 	int id = cpu_number();
2195 #endif
2196 	pt_entry_t *zpte = PTESLEW(zero_pte, id);
2197 	void *zerova = VASLEW(zerop, id);
2198 
2199 #ifdef DIAGNOSTIC
2200 	if (*zpte)
2201 		panic("pmap_zero_page: lock botch");
2202 #endif
2203 
2204 	*zpte = (pa & PG_FRAME) | PG_V | PG_RW | PG_M | PG_U; /* map in */
2205 	pmap_update_pg((vaddr_t)zerova);		/* flush TLB */
2206 
2207 	if (cpu_feature & CPUID_SSE2)
2208 		sse2_zero_page(zerova);
2209 	else
2210 		memset(zerova, 0, PAGE_SIZE);
2211 
2212 #ifdef DIAGNOSTIC
2213 	*zpte = 0;					/* zap! */
2214 #endif
2215 }
2216 
2217 /*
2218  * pmap_pagezeroidle: the same, for the idle loop page zero'er.
2219  * Returns true if the page was zero'd, false if we aborted for
2220  * some reason.
2221  */
2222 
2223 bool
2224 pmap_pageidlezero(paddr_t pa)
2225 {
2226 
2227 	pmap_zero_page(pa);
2228 	return true;
2229 }
2230 
2231 /*
2232  * pmap_copy_page: copy a page
2233  */
2234 
2235 void
2236 pmap_copy_page(paddr_t srcpa, paddr_t dstpa)
2237 {
2238 #ifdef MULTIPROCESSOR
2239 	int id = cpu_number();
2240 #endif
2241 	pt_entry_t *spte = PTESLEW(csrc_pte,id);
2242 	pt_entry_t *dpte = PTESLEW(cdst_pte,id);
2243 	void *csrcva = VASLEW(csrcp, id);
2244 	void *cdstva = VASLEW(cdstp, id);
2245 
2246 #ifdef DIAGNOSTIC
2247 	if (*spte || *dpte)
2248 		panic("pmap_copy_page: lock botch");
2249 #endif
2250 
2251 	*spte = (srcpa & PG_FRAME) | PG_V | PG_RW | PG_U;
2252 	*dpte = (dstpa & PG_FRAME) | PG_V | PG_RW | PG_M | PG_U;
2253 	pmap_update_2pg((vaddr_t)csrcva, (vaddr_t)cdstva);
2254 	if (cpu_feature & CPUID_SSE2)
2255 		sse2_copy_page(csrcva, cdstva);
2256 	else
2257 		memcpy(cdstva, csrcva, PAGE_SIZE);
2258 #ifdef DIAGNOSTIC
2259 	*spte = *dpte = 0;			/* zap! */
2260 #endif
2261 }
2262 
2263 /*
2264  * p m a p   r e m o v e   f u n c t i o n s
2265  *
2266  * functions that remove mappings
2267  */
2268 
2269 /*
2270  * pmap_remove_ptes: remove PTEs from a PTP
2271  *
2272  * => must have proper locking on pmap_master_lock
2273  * => caller must hold pmap's lock
2274  * => PTP must be mapped into KVA
2275  * => PTP should be null if pmap == pmap_kernel()
2276  * => must be called with kernel preemption disabled
2277  * => returns composite pte if at least one page should be shot down
2278  */
2279 
2280 static pt_entry_t
2281 pmap_remove_ptes(struct pmap *pmap, struct vm_page *ptp, vaddr_t ptpva,
2282 		 vaddr_t startva, vaddr_t endva, int flags,
2283 		 struct pv_entry **pv_tofree)
2284 {
2285 	struct pv_entry *pve;
2286 	pt_entry_t *pte = (pt_entry_t *) ptpva;
2287 	pt_entry_t opte, xpte = 0;
2288 
2289 	/*
2290 	 * note that ptpva points to the PTE that maps startva.   this may
2291 	 * or may not be the first PTE in the PTP.
2292 	 *
2293 	 * we loop through the PTP while there are still PTEs to look at
2294 	 * and the wire_count is greater than 1 (because we use the wire_count
2295 	 * to keep track of the number of real PTEs in the PTP).
2296 	 */
2297 
2298 	for (/*null*/; startva < endva && (ptp == NULL || ptp->wire_count > 1)
2299 			     ; pte++, startva += PAGE_SIZE) {
2300 		struct vm_page *pg;
2301 		struct vm_page_md *mdpg;
2302 
2303 		if (!pmap_valid_entry(*pte))
2304 			continue;			/* VA not mapped */
2305 		if ((flags & PMAP_REMOVE_SKIPWIRED) && (*pte & PG_W)) {
2306 			continue;
2307 		}
2308 
2309 		/* atomically save the old PTE and zap! it */
2310 		opte = pmap_pte_set(pte, 0);
2311 		pmap_exec_account(pmap, startva, opte, 0);
2312 		KASSERT(pmap_valid_entry(opte));
2313 
2314 		if (opte & PG_W)
2315 			pmap->pm_stats.wired_count--;
2316 		pmap->pm_stats.resident_count--;
2317 		xpte |= opte;
2318 
2319 		if (ptp) {
2320 			ptp->wire_count--;		/* dropping a PTE */
2321 			/* Make sure that the PDE is flushed */
2322 			if (ptp->wire_count <= 1)
2323 				xpte |= PG_U;
2324 		}
2325 
2326 		/*
2327 		 * if we are not on a pv_head list we are done.
2328 		 */
2329 
2330 		if ((opte & PG_PVLIST) == 0) {
2331 #ifdef DIAGNOSTIC
2332 			if (PHYS_TO_VM_PAGE(opte & PG_FRAME) != NULL)
2333 				panic("pmap_remove_ptes: managed page without "
2334 				      "PG_PVLIST for 0x%lx", startva);
2335 #endif
2336 			continue;
2337 		}
2338 
2339 		pg = PHYS_TO_VM_PAGE(opte & PG_FRAME);
2340 #ifdef DIAGNOSTIC
2341 		if (pg == NULL)
2342 			panic("pmap_remove_ptes: unmanaged page marked "
2343 			      "PG_PVLIST, va = 0x%lx, pa = 0x%lx",
2344 			      startva, (u_long)(opte & PG_FRAME));
2345 #endif
2346 		mdpg = &pg->mdpage;
2347 
2348 		/* sync R/M bits */
2349 		mutex_spin_enter(&mdpg->mp_pvhead.pvh_lock);
2350 		mdpg->mp_attrs |= (opte & (PG_U|PG_M));
2351 		pve = pmap_remove_pv(&mdpg->mp_pvhead, pmap, startva);
2352 		mutex_spin_exit(&mdpg->mp_pvhead.pvh_lock);
2353 
2354 		if (pve) {
2355 			SPLAY_RIGHT(pve, pv_node) = *pv_tofree;
2356 			*pv_tofree = pve;
2357 		}
2358 
2359 		/* end of "for" loop: time for next pte */
2360 	}
2361 
2362 	return xpte;
2363 }
2364 
2365 
2366 /*
2367  * pmap_remove_pte: remove a single PTE from a PTP
2368  *
2369  * => must have proper locking on pmap_master_lock
2370  * => caller must hold pmap's lock
2371  * => PTP must be mapped into KVA
2372  * => PTP should be null if pmap == pmap_kernel()
2373  * => returns true if we removed a mapping
2374  * => must be called with kernel preemption disabled
2375  */
2376 
2377 static bool
2378 pmap_remove_pte(struct pmap *pmap, struct vm_page *ptp, pt_entry_t *pte,
2379 		vaddr_t va, int flags, struct pv_entry **pv_tofree)
2380 {
2381 	pt_entry_t opte;
2382 	struct pv_entry *pve;
2383 	struct vm_page *pg;
2384 	struct vm_page_md *mdpg;
2385 
2386 	if (!pmap_valid_entry(*pte))
2387 		return(false);		/* VA not mapped */
2388 	if ((flags & PMAP_REMOVE_SKIPWIRED) && (*pte & PG_W)) {
2389 		return(false);
2390 	}
2391 
2392 	/* atomically save the old PTE and zap! it */
2393 	opte = pmap_pte_set(pte, 0);
2394 	pmap_exec_account(pmap, va, opte, 0);
2395 	KASSERT(pmap_valid_entry(opte));
2396 
2397 	if (opte & PG_W)
2398 		pmap->pm_stats.wired_count--;
2399 	pmap->pm_stats.resident_count--;
2400 
2401 	if (opte & PG_U)
2402 		pmap_tlb_shootdown(pmap, va, 0, opte);
2403 
2404 	if (ptp) {
2405 		ptp->wire_count--;		/* dropping a PTE */
2406 		/* Make sure that the PDE is flushed */
2407 		if ((ptp->wire_count <= 1) && !(opte & PG_U))
2408 			pmap_tlb_shootdown(pmap, va, 0, opte);
2409 	}
2410 
2411 	/*
2412 	 * if we are not on a pv_head list we are done.
2413 	 */
2414 
2415 	if ((opte & PG_PVLIST) == 0) {
2416 #ifdef DIAGNOSTIC
2417 		if (PHYS_TO_VM_PAGE(opte & PG_FRAME) != NULL)
2418 			panic("pmap_remove_pte: managed page without "
2419 			      "PG_PVLIST for 0x%lx", va);
2420 #endif
2421 		return(true);
2422 	}
2423 
2424 	pg = PHYS_TO_VM_PAGE(opte & PG_FRAME);
2425 #ifdef DIAGNOSTIC
2426 	if (pg == NULL)
2427 		panic("pmap_remove_pte: unmanaged page marked "
2428 		    "PG_PVLIST, va = 0x%lx, pa = 0x%lx", va,
2429 		    (u_long)(opte & PG_FRAME));
2430 #endif
2431 	mdpg = &pg->mdpage;
2432 
2433 	/* sync R/M bits */
2434 	mutex_spin_enter(&mdpg->mp_pvhead.pvh_lock);
2435 	mdpg->mp_attrs |= (opte & (PG_U|PG_M));
2436 	pve = pmap_remove_pv(&mdpg->mp_pvhead, pmap, va);
2437 	mutex_spin_exit(&mdpg->mp_pvhead.pvh_lock);
2438 
2439 	if (pve) {
2440 		SPLAY_RIGHT(pve, pv_node) = *pv_tofree;
2441 		*pv_tofree = pve;
2442 	}
2443 
2444 	return(true);
2445 }
2446 
2447 /*
2448  * pmap_remove: top level mapping removal function
2449  *
2450  * => caller should not be holding any pmap locks
2451  */
2452 
2453 void
2454 pmap_remove(struct pmap *pmap, vaddr_t sva, vaddr_t eva)
2455 {
2456 	pmap_do_remove(pmap, sva, eva, PMAP_REMOVE_ALL);
2457 }
2458 
2459 /*
2460  * pmap_do_remove: mapping removal guts
2461  *
2462  * => caller should not be holding any pmap locks
2463  */
2464 
2465 static void
2466 pmap_do_remove(struct pmap *pmap, vaddr_t sva, vaddr_t eva, int flags)
2467 {
2468 	pt_entry_t *ptes, xpte = 0;
2469 	pd_entry_t **pdes, pde;
2470 	struct pv_entry *pv_tofree = NULL;
2471 	bool result;
2472 	paddr_t ptppa;
2473 	vaddr_t blkendva, va = sva;
2474 	struct vm_page *ptp, *empty_ptps = NULL;
2475 	struct pmap *pmap2;
2476 
2477 	/*
2478 	 * we lock in the pmap => pv_head direction
2479 	 */
2480 
2481 	rw_enter(&pmap_main_lock, RW_READER);
2482 
2483 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);	/* locks pmap */
2484 
2485 	/*
2486 	 * removing one page?  take shortcut function.
2487 	 */
2488 
2489 	if (va + PAGE_SIZE == eva) {
2490 		if (pmap_pdes_valid(va, pdes, &pde)) {
2491 
2492 			/* PA of the PTP */
2493 			ptppa = pde & PG_FRAME;
2494 
2495 			/* get PTP if non-kernel mapping */
2496 			if (pmap == pmap_kernel()) {
2497 				/* we never free kernel PTPs */
2498 				ptp = NULL;
2499 			} else {
2500 				ptp = pmap_find_ptp(pmap, va, ptppa, 1);
2501 #ifdef DIAGNOSTIC
2502 				if (ptp == NULL)
2503 					panic("pmap_remove: unmanaged "
2504 					      "PTP detected");
2505 #endif
2506 			}
2507 
2508 			/* do it! */
2509 			result = pmap_remove_pte(pmap, ptp,
2510 			    &ptes[pl1_i(va)], va, flags, &pv_tofree);
2511 
2512 			/*
2513 			 * if mapping removed and the PTP is no longer
2514 			 * being used, free it!
2515 			 */
2516 
2517 			if (result && ptp && ptp->wire_count <= 1)
2518 				pmap_free_ptp(pmap, ptp, va, ptes, pdes,
2519 				    &empty_ptps);
2520 		}
2521 	} else for (/* null */ ; va < eva ; va = blkendva) {
2522 		int lvl;
2523 
2524 		/* determine range of block */
2525 		blkendva = x86_round_pdr(va+1);
2526 		if (blkendva > eva)
2527 			blkendva = eva;
2528 
2529 		/*
2530 		 * XXXCDC: our PTE mappings should never be removed
2531 		 * with pmap_remove!  if we allow this (and why would
2532 		 * we?) then we end up freeing the pmap's page
2533 		 * directory page (PDP) before we are finished using
2534 		 * it when we hit in in the recursive mapping.  this
2535 		 * is BAD.
2536 		 *
2537 		 * long term solution is to move the PTEs out of user
2538 		 * address space.  and into kernel address space (up
2539 		 * with APTE).  then we can set VM_MAXUSER_ADDRESS to
2540 		 * be VM_MAX_ADDRESS.
2541 		 */
2542 
2543 		if (pl_i(va, PTP_LEVELS) == PDIR_SLOT_PTE)
2544 			/* XXXCDC: ugly hack to avoid freeing PDP here */
2545 			continue;
2546 
2547 		lvl = pmap_pdes_invalid(va, pdes, &pde);
2548 		if (lvl != 0) {
2549 			/*
2550 			 * skip a range corresponding to an invalid pde.
2551 			 */
2552 			blkendva = (va & ptp_masks[lvl - 1]) + nbpd[lvl - 1];
2553  			continue;
2554 		}
2555 
2556 		/* PA of the PTP */
2557 		ptppa = pde & PG_FRAME;
2558 
2559 		/* get PTP if non-kernel mapping */
2560 		if (pmap == pmap_kernel()) {
2561 			/* we never free kernel PTPs */
2562 			ptp = NULL;
2563 		} else {
2564 			ptp = pmap_find_ptp(pmap, va, ptppa, 1);
2565 #ifdef DIAGNOSTIC
2566 			if (ptp == NULL)
2567 				panic("pmap_remove: unmanaged PTP "
2568 				      "detected");
2569 #endif
2570 		}
2571 		xpte |= pmap_remove_ptes(pmap, ptp,
2572 		    (vaddr_t)&ptes[pl1_i(va)], va, blkendva,
2573 		    flags, &pv_tofree);
2574 
2575 		/* if PTP is no longer being used, free it! */
2576 		if (ptp && ptp->wire_count <= 1) {
2577 			pmap_free_ptp(pmap, ptp, va, ptes, pdes, &empty_ptps);
2578 		}
2579 		if ((xpte & PG_U) != 0)
2580 			pmap_tlb_shootdown(pmap, sva, eva, xpte);
2581 	}
2582 	pmap_tlb_shootwait();
2583 	pmap_unmap_ptes(pmap, pmap2);		/* unlock pmap */
2584 	rw_exit(&pmap_main_lock);
2585 
2586 	/* Now we can free unused PVs and ptps */
2587 	if (pv_tofree)
2588 		pmap_free_pvs(pv_tofree);
2589 	for (ptp = empty_ptps; ptp != NULL; ptp = empty_ptps) {
2590 		empty_ptps = ptp->mdpage.mp_link;
2591 		uvm_pagefree(ptp);
2592 	}
2593 }
2594 
2595 /*
2596  * pmap_page_remove: remove a managed vm_page from all pmaps that map it
2597  *
2598  * => we set pv_head => pmap locking
2599  * => R/M bits are sync'd back to attrs
2600  */
2601 
2602 void
2603 pmap_page_remove(struct vm_page *pg)
2604 {
2605 	struct pv_head *pvh;
2606 	struct pv_entry *pve, *npve, *killlist = NULL;
2607 	pt_entry_t *ptes, opte;
2608 	pd_entry_t **pdes;
2609 #ifdef DIAGNOSTIC
2610 	pd_entry_t pde;
2611 #endif
2612 	struct vm_page *empty_ptps = NULL;
2613 	struct vm_page *ptp;
2614 	struct pmap *pmap2;
2615 
2616 #ifdef DIAGNOSTIC
2617 	int bank, off;
2618 
2619 	bank = vm_physseg_find(atop(VM_PAGE_TO_PHYS(pg)), &off);
2620 	if (bank == -1)
2621 		panic("pmap_page_remove: unmanaged page?");
2622 #endif
2623 
2624 	pvh = &pg->mdpage.mp_pvhead;
2625 	if (SPLAY_ROOT(&pvh->pvh_root) == NULL) {
2626 		return;
2627 	}
2628 
2629 	/* set pv_head => pmap locking */
2630 	rw_enter(&pmap_main_lock, RW_WRITER);
2631 
2632 	for (pve = SPLAY_MIN(pvtree, &pvh->pvh_root); pve != NULL; pve = npve) {
2633 		npve = SPLAY_NEXT(pvtree, &pvh->pvh_root, pve);
2634 
2635 		/* locks pmap */
2636 		pmap_map_ptes(pve->pv_pmap, &pmap2, &ptes, &pdes);
2637 
2638 #ifdef DIAGNOSTIC
2639 		if (pve->pv_ptp && pmap_pdes_valid(pve->pv_va, pdes, &pde) &&
2640 		   (pde & PG_FRAME) != VM_PAGE_TO_PHYS(pve->pv_ptp)) {
2641 			printf("pmap_page_remove: pg=%p: va=%lx, pv_ptp=%p\n",
2642 			       pg, pve->pv_va, pve->pv_ptp);
2643 			printf("pmap_page_remove: PTP's phys addr: "
2644 			       "actual=%lx, recorded=%lx\n",
2645 			       (unsigned long)(pde & PG_FRAME),
2646 			       (unsigned long)VM_PAGE_TO_PHYS(pve->pv_ptp));
2647 			panic("pmap_page_remove: mapped managed page has "
2648 			      "invalid pv_ptp field");
2649 		}
2650 #endif
2651 
2652 		/* atomically save the old PTE and zap! it */
2653 		opte = pmap_pte_set(&ptes[pl1_i(pve->pv_va)], 0);
2654 		KASSERT(pmap_valid_entry(opte));
2655 		KDASSERT((opte & PG_FRAME) == VM_PAGE_TO_PHYS(pg));
2656 
2657 		if (opte & PG_W)
2658 			pve->pv_pmap->pm_stats.wired_count--;
2659 		pve->pv_pmap->pm_stats.resident_count--;
2660 
2661 		/* Shootdown only if referenced */
2662 		if (opte & PG_U)
2663 			pmap_tlb_shootdown(pve->pv_pmap, pve->pv_va, 0, opte);
2664 
2665 		/* sync R/M bits */
2666 		pg->mdpage.mp_attrs |= (opte & (PG_U|PG_M));
2667 
2668 		/* update the PTP reference count.  free if last reference. */
2669 		if (pve->pv_ptp) {
2670 			pve->pv_ptp->wire_count--;
2671 			if (pve->pv_ptp->wire_count <= 1) {
2672 				pmap_free_ptp(pve->pv_pmap, pve->pv_ptp,
2673 				    pve->pv_va, ptes, pdes, &empty_ptps);
2674 			}
2675 		}
2676 
2677 		pmap_unmap_ptes(pve->pv_pmap, pmap2);	/* unlocks pmap */
2678 		SPLAY_REMOVE(pvtree, &pvh->pvh_root, pve); /* remove it */
2679 		SPLAY_RIGHT(pve, pv_node) = killlist;	/* mark it for death */
2680 		killlist = pve;
2681 	}
2682 	rw_exit(&pmap_main_lock);
2683 
2684 	crit_enter();
2685 	pmap_tlb_shootwait();
2686 	crit_exit();
2687 
2688 	/* Now we can free unused pvs and ptps. */
2689 	pmap_free_pvs(killlist);
2690 	for (ptp = empty_ptps; ptp != NULL; ptp = empty_ptps) {
2691 		empty_ptps = ptp->mdpage.mp_link;
2692 		uvm_pagefree(ptp);
2693 	}
2694 }
2695 
2696 /*
2697  * p m a p   a t t r i b u t e  f u n c t i o n s
2698  * functions that test/change managed page's attributes
2699  * since a page can be mapped multiple times we must check each PTE that
2700  * maps it by going down the pv lists.
2701  */
2702 
2703 /*
2704  * pmap_test_attrs: test a page's attributes
2705  *
2706  * => we set pv_head => pmap locking
2707  */
2708 
2709 bool
2710 pmap_test_attrs(struct vm_page *pg, unsigned testbits)
2711 {
2712 	struct vm_page_md *mdpg;
2713 	int *myattrs;
2714 	struct pv_head *pvh;
2715 	struct pv_entry *pve;
2716 	struct pmap *pmap2;
2717 	pt_entry_t pte;
2718 
2719 #if DIAGNOSTIC
2720 	int bank, off;
2721 
2722 	bank = vm_physseg_find(atop(VM_PAGE_TO_PHYS(pg)), &off);
2723 	if (bank == -1)
2724 		panic("pmap_test_attrs: unmanaged page?");
2725 #endif
2726 	mdpg = &pg->mdpage;
2727 
2728 	/*
2729 	 * before locking: see if attributes are already set and if so,
2730 	 * return!
2731 	 */
2732 
2733 	myattrs = &mdpg->mp_attrs;
2734 	if (*myattrs & testbits)
2735 		return(true);
2736 
2737 	/* test to see if there is a list before bothering to lock */
2738 	pvh = &mdpg->mp_pvhead;
2739 	if (SPLAY_ROOT(&pvh->pvh_root) == NULL) {
2740 		return(false);
2741 	}
2742 
2743 	/* nope, gonna have to do it the hard way */
2744 	rw_enter(&pmap_main_lock, RW_WRITER);
2745 
2746 	for (pve = SPLAY_MIN(pvtree, &pvh->pvh_root);
2747 	     pve != NULL && (*myattrs & testbits) == 0;
2748 	     pve = SPLAY_NEXT(pvtree, &pvh->pvh_root, pve)) {
2749 		pt_entry_t *ptes;
2750 		pd_entry_t **pdes;
2751 
2752 		pmap_map_ptes(pve->pv_pmap, &pmap2, &ptes, &pdes);
2753 		pte = ptes[pl1_i(pve->pv_va)];
2754 		pmap_unmap_ptes(pve->pv_pmap, pmap2);
2755 		*myattrs |= pte;
2756 	}
2757 
2758 	/*
2759 	 * note that we will exit the for loop with a non-null pve if
2760 	 * we have found the bits we are testing for.
2761 	 */
2762 
2763 	rw_exit(&pmap_main_lock);
2764 	return((*myattrs & testbits) != 0);
2765 }
2766 
2767 /*
2768  * pmap_clear_attrs: clear the specified attribute for a page.
2769  *
2770  * => we set pv_head => pmap locking
2771  * => we return true if we cleared one of the bits we were asked to
2772  */
2773 
2774 bool
2775 pmap_clear_attrs(struct vm_page *pg, unsigned clearbits)
2776 {
2777 	struct vm_page_md *mdpg;
2778 	uint32_t result;
2779 	struct pv_head *pvh;
2780 	struct pv_entry *pve;
2781 	pt_entry_t *ptes, opte;
2782 	int *myattrs;
2783 	struct pmap *pmap2;
2784 
2785 #ifdef DIAGNOSTIC
2786 	int bank, off;
2787 
2788 	bank = vm_physseg_find(atop(VM_PAGE_TO_PHYS(pg)), &off);
2789 	if (bank == -1)
2790 		panic("pmap_change_attrs: unmanaged page?");
2791 #endif
2792 	mdpg = &pg->mdpage;
2793 
2794 	rw_enter(&pmap_main_lock, RW_WRITER);
2795 	pvh = &mdpg->mp_pvhead;
2796 
2797 	myattrs = &mdpg->mp_attrs;
2798 	result = *myattrs & clearbits;
2799 	*myattrs &= ~clearbits;
2800 
2801 	SPLAY_FOREACH(pve, pvtree, &pvh->pvh_root) {
2802 		pt_entry_t *ptep;
2803 		pd_entry_t **pdes;
2804 
2805 		/* locks pmap */
2806 		pmap_map_ptes(pve->pv_pmap, &pmap2, &ptes, &pdes);
2807 #ifdef DIAGNOSTIC
2808 		if (!pmap_pdes_valid(pve->pv_va, pdes, NULL))
2809 			panic("pmap_change_attrs: mapping without PTP "
2810 			      "detected");
2811 #endif
2812 		ptep = &ptes[pl1_i(pve->pv_va)];
2813 		opte = *ptep;
2814 		KASSERT(pmap_valid_entry(opte));
2815 		KDASSERT((opte & PG_FRAME) == VM_PAGE_TO_PHYS(pg));
2816 		if (opte & clearbits) {
2817 			/* We need to do something */
2818 			if (clearbits == PG_RW) {
2819 				result |= PG_RW;
2820 
2821 				/*
2822 				 * On write protect we might not need to flush
2823 				 * the TLB
2824 				 */
2825 
2826 				/* First zap the RW bit! */
2827 				pmap_pte_clearbits(ptep, PG_RW);
2828 				opte = *ptep;
2829 
2830 				/*
2831 				 * Then test if it is not cached as RW the TLB
2832 				 */
2833 				if (!(opte & PG_M))
2834 					goto no_tlb_shootdown;
2835 			}
2836 
2837 			/*
2838 			 * Since we need a shootdown we might as well
2839 			 * always clear PG_U AND PG_M.
2840 			 */
2841 
2842 			/* zap! */
2843 			opte = pmap_pte_set(ptep, (opte & ~(PG_U | PG_M)));
2844 
2845 			result |= (opte & clearbits);
2846 			*myattrs |= (opte & ~(clearbits));
2847 
2848 			pmap_tlb_shootdown(pve->pv_pmap, pve->pv_va, 0, opte);
2849 		}
2850 no_tlb_shootdown:
2851 		pmap_unmap_ptes(pve->pv_pmap, pmap2);	/* unlocks pmap */
2852 	}
2853 
2854 	rw_exit(&pmap_main_lock);
2855 
2856 	crit_enter();
2857 	pmap_tlb_shootwait();
2858 	crit_exit();
2859 
2860 	return(result != 0);
2861 }
2862 
2863 
2864 /*
2865  * p m a p   p r o t e c t i o n   f u n c t i o n s
2866  */
2867 
2868 /*
2869  * pmap_page_protect: change the protection of all recorded mappings
2870  *	of a managed page
2871  *
2872  * => NOTE: this is an inline function in pmap.h
2873  */
2874 
2875 /* see pmap.h */
2876 
2877 /*
2878  * pmap_protect: set the protection in of the pages in a pmap
2879  *
2880  * => NOTE: this is an inline function in pmap.h
2881  */
2882 
2883 /* see pmap.h */
2884 
2885 /*
2886  * pmap_write_protect: write-protect pages in a pmap
2887  */
2888 
2889 void
2890 pmap_write_protect(struct pmap *pmap, vaddr_t sva, vaddr_t eva, vm_prot_t prot)
2891 {
2892 	pt_entry_t *ptes, *epte, xpte;
2893 	volatile pt_entry_t *spte;
2894 	pd_entry_t **pdes;
2895 	vaddr_t blockend, va, tva;
2896 	pt_entry_t opte;
2897 	struct pmap *pmap2;
2898 
2899 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);	/* locks pmap */
2900 
2901 	/* should be ok, but just in case ... */
2902 	sva &= PG_FRAME;
2903 	eva &= PG_FRAME;
2904 	xpte = 0;
2905 
2906 	for (va = sva ; va < eva ; va = blockend) {
2907 
2908 		blockend = (va & L2_FRAME) + NBPD_L2;
2909 		if (blockend > eva)
2910 			blockend = eva;
2911 
2912 		/*
2913 		 * XXXCDC: our PTE mappings should never be write-protected!
2914 		 *
2915 		 * long term solution is to move the PTEs out of user
2916 		 * address space.  and into kernel address space (up
2917 		 * with APTE).  then we can set VM_MAXUSER_ADDRESS to
2918 		 * be VM_MAX_ADDRESS.
2919 		 */
2920 
2921 		/* XXXCDC: ugly hack to avoid freeing PDP here */
2922 		if (pl_i(va, PTP_LEVELS) == PDIR_SLOT_PTE)
2923 			continue;
2924 
2925 		/* empty block? */
2926 		if (!pmap_pdes_valid(va, pdes, NULL))
2927 			continue;
2928 
2929 #ifdef DIAGNOSTIC
2930 		if (va >= VM_MAXUSER_ADDRESS &&
2931 		    va < VM_MAX_ADDRESS)
2932 			panic("pmap_write_protect: PTE space");
2933 #endif
2934 
2935 		spte = &ptes[pl1_i(va)];
2936 		epte = &ptes[pl1_i(blockend)];
2937 
2938 		for (/*null */; spte < epte ; spte++) {
2939 			opte = *spte;
2940 			xpte |= opte;
2941 			if ((opte & (PG_RW|PG_V)) == (PG_RW|PG_V)) {
2942 				pmap_pte_clearbits(spte, PG_RW); /* zap! */
2943 				if (*spte & PG_M) {
2944 					tva = x86_ptob(spte - ptes);
2945 					pmap_tlb_shootdown(pmap, tva, 0, opte);
2946 				}
2947 			}
2948 		}
2949 	}
2950 
2951 	/*
2952 	 * if we kept a removal record and removed some pages update the TLB
2953 	 */
2954 	pmap_tlb_shootdown(pmap, sva, eva, xpte);
2955 	pmap_tlb_shootwait();
2956 	pmap_unmap_ptes(pmap, pmap2);	/* unlocks pmap */
2957 }
2958 
2959 /*
2960  * end of protection functions
2961  */
2962 
2963 /*
2964  * pmap_unwire: clear the wired bit in the PTE
2965  *
2966  * => mapping should already be in map
2967  */
2968 
2969 void
2970 pmap_unwire(struct pmap *pmap, vaddr_t va)
2971 {
2972 	pt_entry_t *ptes;
2973 	pd_entry_t **pdes;
2974 	struct pmap *pmap2;
2975 
2976 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);	/* locks pmap */
2977 
2978 	if (pmap_pdes_valid(va, pdes, NULL)) {
2979 
2980 #ifdef DIAGNOSTIC
2981 		if (!pmap_valid_entry(ptes[pl1_i(va)]))
2982 			panic("pmap_unwire: invalid (unmapped) va 0x%lx", va);
2983 #endif
2984 		if ((ptes[pl1_i(va)] & PG_W) != 0) {
2985 			pmap_pte_clearbits(&ptes[pl1_i(va)], PG_W);
2986 			pmap->pm_stats.wired_count--;
2987 		}
2988 #ifdef DIAGNOSTIC
2989 		else {
2990 			printf("pmap_unwire: wiring for pmap %p va 0x%lx "
2991 			       "didn't change!\n", pmap, va);
2992 		}
2993 #endif
2994 		pmap_unmap_ptes(pmap, pmap2);		/* unlocks map */
2995 	}
2996 #ifdef DIAGNOSTIC
2997 	else {
2998 		panic("pmap_unwire: invalid PDE");
2999 	}
3000 #endif
3001 }
3002 
3003 /*
3004  * pmap_collect: free resources held by a pmap
3005  *
3006  * => optional function.
3007  * => called when a process is swapped out to free memory.
3008  */
3009 
3010 void
3011 pmap_collect(struct pmap *pmap)
3012 {
3013 	/*
3014 	 * free all of the pt pages by removing the physical mappings
3015 	 * for its entire address space.
3016 	 */
3017 
3018 	pmap_do_remove(pmap, VM_MIN_ADDRESS, VM_MAXUSER_ADDRESS,
3019 	    PMAP_REMOVE_SKIPWIRED);
3020 }
3021 
3022 /*
3023  * pmap_copy: copy mappings from one pmap to another
3024  *
3025  * => optional function
3026  * void pmap_copy(dst_pmap, src_pmap, dst_addr, len, src_addr)
3027  */
3028 
3029 /*
3030  * defined as macro in pmap.h
3031  */
3032 
3033 /*
3034  * pmap_enter: enter a mapping into a pmap
3035  *
3036  * => must be done "now" ... no lazy-evaluation
3037  * => we set pmap => pv_head locking
3038  */
3039 
3040 int
3041 pmap_enter(struct pmap *pmap, vaddr_t va, paddr_t pa, vm_prot_t prot,
3042 	   int flags)
3043 {
3044 	pt_entry_t *ptes, opte, npte;
3045 	pt_entry_t *ptep;
3046 	pd_entry_t **pdes;
3047 	struct vm_page *ptp, *pg;
3048 	struct vm_page_md *mdpg;
3049 	struct pv_head *old_pvh, *new_pvh;
3050 	struct pv_entry *pve = NULL, *freepve, *freepve2 = NULL;
3051 	int error;
3052 	bool wired = (flags & PMAP_WIRED) != 0;
3053 	struct pmap *pmap2;
3054 
3055 	KASSERT(pmap_initialized);
3056 
3057 #ifdef DIAGNOSTIC
3058 	/* sanity check: totally out of range? */
3059 	if (va >= VM_MAX_KERNEL_ADDRESS)
3060 		panic("pmap_enter: too big");
3061 
3062 	if (va == (vaddr_t) PDP_BASE || va == (vaddr_t) APDP_BASE)
3063 		panic("pmap_enter: trying to map over PDP/APDP!");
3064 
3065 	/* sanity check: kernel PTPs should already have been pre-allocated */
3066 	if (va >= VM_MIN_KERNEL_ADDRESS &&
3067 	    !pmap_valid_entry(pmap->pm_pdir[pl_i(va, PTP_LEVELS)]))
3068 		panic("pmap_enter: missing kernel PTP for va %lx!", va);
3069 #endif
3070 
3071 	npte = pa | protection_codes[prot] | PG_V;
3072 	if (wired)
3073 	        npte |= PG_W;
3074 	if (va < VM_MAXUSER_ADDRESS)
3075 		npte |= PG_u;
3076 	else if (va < VM_MAX_ADDRESS)
3077 		npte |= (PG_u | PG_RW);	/* XXXCDC: no longer needed? */
3078 	if (pmap == pmap_kernel())
3079 		npte |= pmap_pg_g;
3080 	if (flags & VM_PROT_ALL) {
3081 		npte |= PG_U;
3082 		if (flags & VM_PROT_WRITE)
3083 			npte |= PG_M;
3084 	}
3085 
3086 	/* get a pve. */
3087 	freepve = pool_cache_get(&pmap_pv_cache, PR_NOWAIT);
3088 
3089 	/* get lock */
3090 	rw_enter(&pmap_main_lock, RW_READER);
3091 
3092 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);	/* locks pmap */
3093 	if (pmap == pmap_kernel()) {
3094 		ptp = NULL;
3095 	} else {
3096 		ptp = pmap_get_ptp(pmap, va, pdes);
3097 		if (ptp == NULL) {
3098 			if (flags & PMAP_CANFAIL) {
3099 				error = ENOMEM;
3100 				goto out;
3101 			}
3102 			panic("pmap_enter: get ptp failed");
3103 		}
3104 	}
3105 
3106 	/*
3107 	 * Get first view on old PTE
3108 	 * on SMP the PTE might gain PG_U and PG_M flags
3109 	 * before we zap it later
3110 	 */
3111 	ptep = &ptes[pl1_i(va)];
3112 	opte = *ptep;		/* old PTE */
3113 
3114 	/*
3115 	 * is there currently a valid mapping at our VA and does it
3116 	 * map to the same PA as the one we want to map ?
3117 	 */
3118 
3119 	if (pmap_valid_entry(opte) && ((opte & PG_FRAME) == pa)) {
3120 
3121 		/*
3122 		 * first, calculate pm_stats updates.  resident count will not
3123 		 * change since we are replacing/changing a valid mapping.
3124 		 * wired count might change...
3125 		 */
3126 		pmap->pm_stats.wired_count +=
3127 		    ((npte & PG_W) ? 1 : 0 - (opte & PG_W) ? 1 : 0);
3128 
3129 		npte |= (opte & PG_PVLIST);
3130 
3131 		/* zap! */
3132 		opte = pmap_pte_set(ptep, npte);
3133 
3134 		/*
3135 		 * if this is on the PVLIST, sync R/M bit
3136 		 */
3137 		if (opte & PG_PVLIST) {
3138 			pg = PHYS_TO_VM_PAGE(pa);
3139 #ifdef DIAGNOSTIC
3140 			if (pg == NULL)
3141 				panic("pmap_enter: same pa PG_PVLIST "
3142 				      "mapping with unmanaged page "
3143 				      "pa = 0x%lx (0x%lx)", pa,
3144 				      atop(pa));
3145 #endif
3146 			mdpg = &pg->mdpage;
3147 			old_pvh = &mdpg->mp_pvhead;
3148 			mutex_spin_enter(&old_pvh->pvh_lock);
3149 			mdpg->mp_attrs |= opte;
3150 			mutex_spin_exit(&old_pvh->pvh_lock);
3151 		}
3152 		goto shootdown_now;
3153 	}
3154 
3155 	pg = PHYS_TO_VM_PAGE(pa);
3156 	if (pg != NULL) {
3157 		/* This is a managed page */
3158 		npte |= PG_PVLIST;
3159 		mdpg = &pg->mdpage;
3160 		new_pvh = &mdpg->mp_pvhead;
3161 		if ((opte & (PG_PVLIST | PG_V)) != (PG_PVLIST | PG_V)) {
3162 			/* We can not steal a pve - allocate one */
3163 			pve = freepve;
3164 			freepve = NULL;
3165 			if (pve == NULL) {
3166 				if (!(flags & PMAP_CANFAIL))
3167 					panic("pmap_enter: "
3168 					    "no pv entries available");
3169 				error = ENOMEM;
3170 				goto out;
3171 			}
3172   		}
3173 	} else {
3174 		new_pvh = NULL;
3175 	}
3176 
3177 	/*
3178 	 * is there currently a valid mapping at our VA?
3179 	 */
3180 
3181 	if (pmap_valid_entry(opte)) {
3182 
3183 		/*
3184 		 * changing PAs: we must remove the old one first
3185 		 */
3186 
3187 		/*
3188 		 * first, calculate pm_stats updates.  resident count will not
3189 		 * change since we are replacing/changing a valid mapping.
3190 		 * wired count might change...
3191 		 */
3192 		pmap->pm_stats.wired_count +=
3193 		    ((npte & PG_W) ? 1 : 0 - (opte & PG_W) ? 1 : 0);
3194 
3195 		if (opte & PG_PVLIST) {
3196 			pg = PHYS_TO_VM_PAGE(opte & PG_FRAME);
3197 #ifdef DIAGNOSTIC
3198 			if (pg == NULL)
3199 				panic("pmap_enter: PG_PVLIST mapping with "
3200 				      "unmanaged page "
3201 				      "pa = 0x%lx (0x%lx)", pa, atop(pa));
3202 #endif
3203 			mdpg = &pg->mdpage;
3204 			old_pvh = &mdpg->mp_pvhead;
3205 
3206 			/* new_pvh is NULL if page will not be managed */
3207 			pmap_lock_pvhs(old_pvh, new_pvh);
3208 
3209 			/* zap! */
3210 			opte = pmap_pte_set(ptep, npte);
3211 
3212 			pve = pmap_remove_pv(old_pvh, pmap, va);
3213 			KASSERT(pve != 0);
3214 			mdpg->mp_attrs |= opte;
3215 
3216 			if (new_pvh != NULL) {
3217 				pmap_enter_pv(new_pvh, pve, pmap, va, ptp);
3218 				mutex_spin_exit(&new_pvh->pvh_lock);
3219 			}
3220 			mutex_spin_exit(&old_pvh->pvh_lock);
3221 			if (new_pvh == NULL)
3222 				freepve2 = pve;
3223 			goto shootdown_test;
3224 		}
3225 	} else {	/* opte not valid */
3226 		pmap->pm_stats.resident_count++;
3227 		if (wired)
3228 			pmap->pm_stats.wired_count++;
3229 		if (ptp)
3230 			ptp->wire_count++;
3231 	}
3232 
3233 	if (new_pvh) {
3234 		mutex_spin_enter(&new_pvh->pvh_lock);
3235 		pmap_enter_pv(new_pvh, pve, pmap, va, ptp);
3236 		mutex_spin_exit(&new_pvh->pvh_lock);
3237 	}
3238 
3239 	opte = pmap_pte_set(ptep, npte);   /* zap! */
3240 
3241 shootdown_test:
3242 	/* Update page attributes if needed */
3243 	if ((opte & (PG_V | PG_U)) == (PG_V | PG_U)) {
3244 shootdown_now:
3245 		pmap_tlb_shootdown(pmap, va, 0, opte);
3246 		pmap_tlb_shootwait();
3247 	}
3248 
3249 	error = 0;
3250 
3251 out:
3252 	pmap_unmap_ptes(pmap, pmap2);
3253 	rw_exit(&pmap_main_lock);
3254 
3255 	if (freepve != NULL) {
3256 		/* put back the pv, we don't need it. */
3257 		pool_cache_put(&pmap_pv_cache, freepve);
3258 	}
3259 	if (freepve2 != NULL)
3260 		pool_cache_put(&pmap_pv_cache, freepve2);
3261 
3262 	return error;
3263 }
3264 
3265 static bool
3266 pmap_get_physpage(vaddr_t va, int level, paddr_t *paddrp)
3267 {
3268 	struct vm_page *ptp;
3269 	struct pmap *kpm = pmap_kernel();
3270 
3271 	if (uvm.page_init_done == false) {
3272 		/*
3273 		 * we're growing the kernel pmap early (from
3274 		 * uvm_pageboot_alloc()).  this case must be
3275 		 * handled a little differently.
3276 		 */
3277 
3278 		if (uvm_page_physget(paddrp) == false)
3279 			panic("pmap_get_physpage: out of memory");
3280 		*early_zero_pte = (*paddrp & PG_FRAME) | PG_V | PG_RW;
3281 		pmap_update_pg((vaddr_t)early_zerop);
3282 		memset(early_zerop, 0, PAGE_SIZE);
3283 #if defined(DIAGNOSTIC)
3284 		*early_zero_pte = 0;
3285 #endif /* defined(DIAGNOSTIC) */
3286 	} else {
3287 		/* XXX */
3288 		if (level != 1)
3289 			mutex_enter(&kpm->pm_obj[level - 1].vmobjlock);
3290 		ptp = uvm_pagealloc(&kpm->pm_obj[level - 1],
3291 				    ptp_va2o(va, level), NULL,
3292 				    UVM_PGA_USERESERVE|UVM_PGA_ZERO);
3293 		if (level != 1)
3294 			mutex_exit(&kpm->pm_obj[level - 1].vmobjlock);
3295 		if (ptp == NULL)
3296 			panic("pmap_get_physpage: out of memory");
3297 		ptp->flags &= ~PG_BUSY;
3298 		ptp->wire_count = 1;
3299 		*paddrp = VM_PAGE_TO_PHYS(ptp);
3300 	}
3301 	kpm->pm_stats.resident_count++;
3302 	return true;
3303 }
3304 
3305 /*
3306  * Allocate the amount of specified ptps for a ptp level, and populate
3307  * all levels below accordingly, mapping virtual addresses starting at
3308  * kva.
3309  *
3310  * Used by pmap_growkernel.
3311  */
3312 static void
3313 pmap_alloc_level(pd_entry_t **pdes, vaddr_t kva, int lvl, long *needed_ptps)
3314 {
3315 	unsigned long i;
3316 	vaddr_t va;
3317 	paddr_t pa;
3318 	unsigned long index, endindex;
3319 	int level;
3320 	pd_entry_t *pdep;
3321 
3322 	for (level = lvl; level > 1; level--) {
3323 		if (level == PTP_LEVELS)
3324 			pdep = pmap_kernel()->pm_pdir;
3325 		else
3326 			pdep = pdes[level - 2];
3327 		va = kva;
3328 		index = pl_i_roundup(kva, level);
3329 		endindex = index + needed_ptps[level - 1] - 1;
3330 
3331 		for (i = index; i <= endindex; i++) {
3332 			KASSERT(!pmap_valid_entry(pdep[i]));
3333 			pmap_get_physpage(va, level - 1, &pa);
3334 			pdep[i] = pa | PG_RW | PG_V;
3335 			KASSERT(level != PTP_LEVELS || nkptp[level - 1] +
3336 			    pl_i(VM_MIN_KERNEL_ADDRESS, level) == i);
3337 			nkptp[level - 1]++;
3338 			va += nbpd[level - 1];
3339 		}
3340 	}
3341 
3342 	/* For nkptp vs pmap_pdp_cache. */
3343 	mb_write();
3344 }
3345 
3346 /*
3347  * pmap_growkernel: increase usage of KVM space
3348  *
3349  * => we allocate new PTPs for the kernel and install them in all
3350  *	the pmaps on the system.
3351  */
3352 
3353 vaddr_t
3354 pmap_growkernel(vaddr_t maxkvaddr)
3355 {
3356 	struct pmap *kpm = pmap_kernel(), *pm;
3357 	int s, i;
3358 	unsigned newpdes;
3359 	long needed_kptp[PTP_LEVELS], target_nptp, old;
3360 	bool invalidate = false;
3361 
3362 	s = splvm();	/* to be safe */
3363 	mutex_enter(&kpm->pm_lock);
3364 
3365 	if (maxkvaddr <= pmap_maxkvaddr) {
3366 		mutex_exit(&kpm->pm_lock);
3367 		splx(s);
3368 		return pmap_maxkvaddr;
3369 	}
3370 
3371 	maxkvaddr = x86_round_pdr(maxkvaddr);
3372 	old = nkptp[PTP_LEVELS - 1];
3373 	/*
3374 	 * This loop could be optimized more, but pmap_growkernel()
3375 	 * is called infrequently.
3376 	 */
3377 	for (i = PTP_LEVELS - 1; i >= 1; i--) {
3378 		target_nptp = pl_i_roundup(maxkvaddr, i + 1) -
3379 		    pl_i_roundup(VM_MIN_KERNEL_ADDRESS, i + 1);
3380 		/*
3381 		 * XXX only need to check toplevel.
3382 		 */
3383 		if (target_nptp > nkptpmax[i])
3384 			panic("out of KVA space");
3385 		KASSERT(target_nptp >= nkptp[i]);
3386 		needed_kptp[i] = target_nptp - nkptp[i];
3387 	}
3388 
3389 	pmap_alloc_level(normal_pdes, pmap_maxkvaddr, PTP_LEVELS, needed_kptp);
3390 
3391 	/*
3392 	 * If the number of top level entries changed, update all
3393 	 * pmaps.
3394 	 */
3395 	if (needed_kptp[PTP_LEVELS - 1] != 0) {
3396 		newpdes = nkptp[PTP_LEVELS - 1] - old;
3397 		mutex_enter(&pmaps_lock);
3398 		LIST_FOREACH(pm, &pmaps, pm_list) {
3399 			memcpy(&pm->pm_pdir[PDIR_SLOT_KERN + old],
3400 			       &kpm->pm_pdir[PDIR_SLOT_KERN + old],
3401 			       newpdes * sizeof (pd_entry_t));
3402 		}
3403 		mutex_exit(&pmaps_lock);
3404 		invalidate = true;
3405 	}
3406 	pmap_maxkvaddr = maxkvaddr;
3407 	mutex_exit(&kpm->pm_lock);
3408 	splx(s);
3409 
3410 	if (invalidate) {
3411 		/* Invalidate the PDP cache. */
3412 		pool_cache_invalidate(&pmap_pdp_cache);
3413 	}
3414 
3415 	return maxkvaddr;
3416 }
3417 
3418 #ifdef DEBUG
3419 void pmap_dump(struct pmap *, vaddr_t, vaddr_t);
3420 
3421 /*
3422  * pmap_dump: dump all the mappings from a pmap
3423  *
3424  * => caller should not be holding any pmap locks
3425  */
3426 
3427 void
3428 pmap_dump(struct pmap *pmap, vaddr_t sva, vaddr_t eva)
3429 {
3430 	pt_entry_t *ptes, *pte;
3431 	pd_entry_t **pdes;
3432 	struct pmap *pmap2;
3433 	vaddr_t blkendva;
3434 
3435 	/*
3436 	 * if end is out of range truncate.
3437 	 * if (end == start) update to max.
3438 	 */
3439 
3440 	if (eva > VM_MAXUSER_ADDRESS || eva <= sva)
3441 		eva = VM_MAXUSER_ADDRESS;
3442 
3443 	/*
3444 	 * we lock in the pmap => pv_head direction
3445 	 */
3446 
3447 	rw_enter(&pmap_main_lock, RW_READER);
3448 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);	/* locks pmap */
3449 
3450 	/*
3451 	 * dumping a range of pages: we dump in PTP sized blocks (4MB)
3452 	 */
3453 
3454 	for (/* null */ ; sva < eva ; sva = blkendva) {
3455 
3456 		/* determine range of block */
3457 		blkendva = x86_round_pdr(sva+1);
3458 		if (blkendva > eva)
3459 			blkendva = eva;
3460 
3461 		/* valid block? */
3462 		if (!pmap_pdes_valid(sva, pdes, NULL))
3463 			continue;
3464 
3465 		pte = &ptes[pl1_i(sva)];
3466 		for (/* null */; sva < blkendva ; sva += PAGE_SIZE, pte++) {
3467 			if (!pmap_valid_entry(*pte))
3468 				continue;
3469 			printf("va %#lx -> pa %#lx (pte=%#lx)\n",
3470 			       sva, (unsigned long)*pte,
3471 			       (unsigned long)*pte & PG_FRAME);
3472 		}
3473 	}
3474 	pmap_unmap_ptes(pmap, pmap2);
3475 	rw_exit(&pmap_main_lock);
3476 }
3477 #endif
3478 
3479 /*
3480  * pmap_tlb_shootdown: invalidate pages on all CPUs using pmap 'pm'
3481  *
3482  * => always invalidates locally before returning
3483  * => returns before remote CPUs have invalidated
3484  * => must be called with preemption disabled
3485  */
3486 
3487 void
3488 pmap_tlb_shootdown(struct pmap *pm, vaddr_t sva, vaddr_t eva, pt_entry_t pte)
3489 {
3490 #ifdef MULTIPROCESSOR
3491 	extern int _lock_cas(volatile uintptr_t *, uintptr_t, uintptr_t);
3492 	extern bool x86_mp_online;
3493 	struct cpu_info *ci;
3494 	struct pmap_mbox *mb, *selfmb;
3495 	CPU_INFO_ITERATOR cii;
3496 	uintptr_t head;
3497 	u_int count;
3498 	int s;
3499 #endif	/* MULTIPROCESSOR */
3500 	struct cpu_info *self;
3501 	bool kernel;
3502 
3503 	KASSERT(eva == 0 || eva >= sva);
3504 
3505 	if (pte & PG_PS)
3506 		sva &= PG_LGFRAME;
3507 	pte &= PG_G;
3508 	self = curcpu();
3509 
3510 	if (sva == (vaddr_t)-1LL) {
3511 		kernel = true;
3512 	} else {
3513 		if (eva == 0)
3514 			eva = sva + PAGE_SIZE;
3515 		kernel = sva >= VM_MAXUSER_ADDRESS;
3516 		KASSERT(kernel == (eva > VM_MAXUSER_ADDRESS));
3517 	}
3518 
3519 	/*
3520 	 * If the range is larger than 32 pages, then invalidate
3521 	 * everything.
3522 	 */
3523 	if (sva != (vaddr_t)-1LL && eva - sva > (32 * PAGE_SIZE)) {
3524 		sva = (vaddr_t)-1LL;
3525 		eva = sva;
3526 	}
3527 
3528 #ifdef MULTIPROCESSOR
3529 	if (ncpu > 1 && x86_mp_online) {
3530 		selfmb = &self->ci_pmap_cpu->pc_mbox;
3531 
3532 		/*
3533 		 * If the CPUs have no notion of global pages then
3534 		 * reload of %cr3 is sufficient.
3535 		 */
3536 		if (pte != 0 && (cpu_feature & CPUID_PGE) == 0)
3537 			pte = 0;
3538 
3539 		if (pm == pmap_kernel()) {
3540 			/*
3541 			 * Mapped on all CPUs: use the broadcast mechanism.
3542 			 * Once we have the lock, increment the counter.
3543 			 */
3544 			s = splvm();
3545 			mb = &pmap_mbox;
3546 			count = SPINLOCK_BACKOFF_MIN;
3547 			do {
3548 				if ((head = mb->mb_head) != mb->mb_tail) {
3549 					splx(s);
3550 					while ((head = mb->mb_head) !=
3551 					    mb->mb_tail)
3552 						SPINLOCK_BACKOFF(count);
3553 					s = splvm();
3554 				}
3555 			} while (!_lock_cas(&mb->mb_head, head,
3556 			    head + ncpu - 1));
3557 
3558 			/*
3559 			 * Once underway we must stay at IPL_VM until the
3560 			 * IPI is dispatched.  Otherwise interrupt handlers
3561 			 * on this CPU can deadlock against us.
3562 			 */
3563 			pmap_tlb_evcnt.ev_count++;
3564 			mb->mb_pointer = self;
3565 			mb->mb_addr1 = sva;
3566 			mb->mb_addr2 = eva;
3567 			mb->mb_global = pte;
3568 			x86_ipi(LAPIC_TLB_BCAST_VECTOR, LAPIC_DEST_ALLEXCL,
3569 			    LAPIC_DLMODE_FIXED);
3570 			self->ci_need_tlbwait = 1;
3571 			splx(s);
3572 		} else if ((pm->pm_cpus & ~self->ci_cpumask) != 0 ||
3573 		    (kernel && (pm->pm_kernel_cpus & ~self->ci_cpumask) != 0)) {
3574 			/*
3575 			 * We don't bother traversing the CPU list if only
3576 			 * used by this CPU.
3577 			 *
3578 			 * We can't do global flushes with the multicast
3579 			 * mechanism.
3580 			 */
3581 			KASSERT(pte == 0);
3582 
3583 			/*
3584 			 * Take ownership of the shootdown mailbox on each
3585 			 * CPU, fill the details and fire it off.
3586 			 */
3587 			s = splvm();
3588 			for (CPU_INFO_FOREACH(cii, ci)) {
3589 				if (ci == self ||
3590 				    !pmap_is_active(pm, ci, kernel) ||
3591 				    !(ci->ci_flags & CPUF_RUNNING))
3592 					continue;
3593 				selfmb->mb_head++;
3594 				mb = &ci->ci_pmap_cpu->pc_mbox;
3595 				count = SPINLOCK_BACKOFF_MIN;
3596 				while (!_lock_cas((uintptr_t *)&mb->mb_pointer,
3597 				    0, (uintptr_t)&selfmb->mb_tail)) {
3598 				    	splx(s);
3599 					while (mb->mb_pointer != 0)
3600 						SPINLOCK_BACKOFF(count);
3601 					s = splvm();
3602 				}
3603 				mb->mb_addr1 = sva;
3604 				mb->mb_addr2 = eva;
3605 				mb->mb_global = pte;
3606 				if (x86_ipi(LAPIC_TLB_MCAST_VECTOR,
3607 				    ci->ci_apicid,
3608 				    LAPIC_DLMODE_FIXED))
3609 					panic("pmap_tlb_shootdown: ipi failed");
3610 			}
3611 			self->ci_need_tlbwait = 1;
3612 			splx(s);
3613 		}
3614 	}
3615 #endif	/* MULTIPROCESSOR */
3616 
3617 	/* Update the current CPU before waiting for others. */
3618 	if (!pmap_is_active(pm, self, kernel))
3619 		return;
3620 
3621 	if (sva == (vaddr_t)-1LL) {
3622 		if (pte != 0)
3623 			tlbflushg();
3624 		else
3625 			tlbflush();
3626 	} else {
3627 		do {
3628 			pmap_update_pg(sva);
3629 			sva += PAGE_SIZE;
3630 		} while (sva < eva);
3631 	}
3632 }
3633 
3634 /*
3635  * pmap_tlb_shootwait: wait for pending TLB shootdowns to complete
3636  *
3637  * => only waits for operations generated by the current CPU
3638  * => must be called with preemption disabled
3639  */
3640 
3641 void
3642 pmap_tlb_shootwait(void)
3643 {
3644 	struct cpu_info *self;
3645 	struct pmap_mbox *mb;
3646 
3647 	/*
3648 	 * Anything to do?  XXX Really we want to avoid touching the cache
3649 	 * lines of the two mailboxes, but the processor may read ahead.
3650 	 */
3651 	self = curcpu();
3652 	if (!self->ci_need_tlbwait)
3653 		return;
3654 	self->ci_need_tlbwait = 0;
3655 
3656 	/* If we own the global mailbox, wait for it to drain. */
3657 	mb = &pmap_mbox;
3658 	while (mb->mb_pointer == self && mb->mb_head != mb->mb_tail)
3659 		x86_pause();
3660 
3661 	/* If we own other CPU's mailboxes, wait for them to drain. */
3662 	mb = &self->ci_pmap_cpu->pc_mbox;
3663 	KASSERT(mb->mb_pointer != &mb->mb_tail);
3664 	while (mb->mb_head != mb->mb_tail)
3665 		x86_pause();
3666 }
3667 
3668 /*
3669  * pmap_update: process deferred invalidations
3670  */
3671 
3672 void
3673 pmap_update(struct pmap *pm)
3674 {
3675 
3676 	crit_enter();
3677 	pmap_tlb_shootwait();
3678 	crit_exit();
3679 }
3680