1 /* $NetBSD: pmap.c,v 1.276 2021/04/03 15:29:02 thorpej Exp $ */
2 
3 /*-
4  * Copyright (c) 1998, 1999, 2000, 2001, 2007, 2008, 2020
5  * 	The NetBSD Foundation, Inc.
6  * All rights reserved.
7  *
8  * This code is derived from software contributed to The NetBSD Foundation
9  * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
10  * NASA Ames Research Center, by Andrew Doran and Mindaugas Rasiukevicius,
11  * and by Chris G. Demetriou.
12  *
13  * Redistribution and use in source and binary forms, with or without
14  * modification, are permitted provided that the following conditions
15  * are met:
16  * 1. Redistributions of source code must retain the above copyright
17  *    notice, this list of conditions and the following disclaimer.
18  * 2. Redistributions in binary form must reproduce the above copyright
19  *    notice, this list of conditions and the following disclaimer in the
20  *    documentation and/or other materials provided with the distribution.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
23  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
24  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
25  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
26  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
27  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
28  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
29  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
30  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
31  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
32  * POSSIBILITY OF SUCH DAMAGE.
33  */
34 
35 /*
36  * Copyright (c) 1991, 1993
37  *	The Regents of the University of California.  All rights reserved.
38  *
39  * This code is derived from software contributed to Berkeley by
40  * the Systems Programming Group of the University of Utah Computer
41  * Science Department.
42  *
43  * Redistribution and use in source and binary forms, with or without
44  * modification, are permitted provided that the following conditions
45  * are met:
46  * 1. Redistributions of source code must retain the above copyright
47  *    notice, this list of conditions and the following disclaimer.
48  * 2. Redistributions in binary form must reproduce the above copyright
49  *    notice, this list of conditions and the following disclaimer in the
50  *    documentation and/or other materials provided with the distribution.
51  * 3. Neither the name of the University nor the names of its contributors
52  *    may be used to endorse or promote products derived from this software
53  *    without specific prior written permission.
54  *
55  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
56  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
57  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
58  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
59  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
60  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
61  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
62  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
63  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
64  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
65  * SUCH DAMAGE.
66  *
67  *	@(#)pmap.c	8.6 (Berkeley) 5/27/94
68  */
69 
70 /*
71  * DEC Alpha physical map management code.
72  *
73  * History:
74  *
75  *	This pmap started life as a Motorola 68851/68030 pmap,
76  *	written by Mike Hibler at the University of Utah.
77  *
78  *	It was modified for the DEC Alpha by Chris Demetriou
79  *	at Carnegie Mellon University.
80  *
81  *	Support for non-contiguous physical memory was added by
82  *	Jason R. Thorpe of the Numerical Aerospace Simulation
83  *	Facility, NASA Ames Research Center and Chris Demetriou.
84  *
85  *	Page table management and a major cleanup were undertaken
86  *	by Jason R. Thorpe, with lots of help from Ross Harvey of
87  *	Avalon Computer Systems and from Chris Demetriou.
88  *
89  *	Support for the new UVM pmap interface was written by
90  *	Jason R. Thorpe.
91  *
92  *	Support for ASNs was written by Jason R. Thorpe, again
93  *	with help from Chris Demetriou and Ross Harvey.
94  *
95  *	The locking protocol was written by Jason R. Thorpe,
96  *	using Chuck Cranor's i386 pmap for UVM as a model.
97  *
98  *	TLB shootdown code was written (and then subsequently
99  *	rewritten some years later, borrowing some ideas from
100  *	the x86 pmap) by Jason R. Thorpe.
101  *
102  *	Multiprocessor modifications by Andrew Doran and
103  *	Jason R. Thorpe.
104  *
105  * Notes:
106  *
107  *	All user page table access is done via K0SEG.  Kernel
108  *	page table access is done via the recursive Virtual Page
109  *	Table becase kernel PT pages are pre-allocated and never
110  *	Table because kernel PT pages are pre-allocated and never
111  *	freed, so no VPT fault handling is required.
112 
113 /*
114  *	Manages physical address maps.
115  *
116  *	Since the information managed by this module is
117  *	also stored by the logical address mapping module,
118  *	this module may throw away valid virtual-to-physical
119  *	mappings at almost any time.  However, invalidations
120  *	of virtual-to-physical mappings must be done as
121  *	requested.
122  *
123  *	In order to cope with hardware architectures which
124  *	make virtual-to-physical map invalidates expensive,
125  *	this module may delay invalidation or reduced-protection
126  *	operations until such time as they are actually
127  *	necessary.  This module is given full information as
128  *	to which processors are currently using which maps,
129  *	and to when physical maps must be made correct.
130  */
131 
132 #include "opt_lockdebug.h"
133 #include "opt_sysv.h"
134 #include "opt_multiprocessor.h"
135 
136 #include <sys/cdefs.h>			/* RCS ID & Copyright macro defns */
137 
138 __KERNEL_RCSID(0, "$NetBSD: pmap.c,v 1.276 2021/04/03 15:29:02 thorpej Exp $");
139 
140 #include <sys/param.h>
141 #include <sys/systm.h>
142 #include <sys/kernel.h>
143 #include <sys/proc.h>
144 #include <sys/malloc.h>
145 #include <sys/pool.h>
146 #include <sys/buf.h>
147 #include <sys/evcnt.h>
148 #include <sys/atomic.h>
149 #include <sys/cpu.h>
150 
151 #include <uvm/uvm.h>
152 
153 #if defined(MULTIPROCESSOR)
154 #include <machine/rpb.h>
155 #endif
156 
157 #ifdef DEBUG
158 #define	PDB_FOLLOW	0x0001
159 #define	PDB_INIT	0x0002
160 #define	PDB_ENTER	0x0004
161 #define	PDB_REMOVE	0x0008
162 #define	PDB_CREATE	0x0010
163 #define	PDB_PTPAGE	0x0020
164 #define	PDB_ASN		0x0040
165 #define	PDB_BITS	0x0080
166 #define	PDB_COLLECT	0x0100
167 #define	PDB_PROTECT	0x0200
168 #define	PDB_BOOTSTRAP	0x1000
169 #define	PDB_PARANOIA	0x2000
170 #define	PDB_WIRING	0x4000
171 #define	PDB_PVDUMP	0x8000
172 
173 int debugmap = 0;
174 int pmapdebug = PDB_PARANOIA;
175 #endif
176 
177 #if defined(MULTIPROCESSOR)
178 #define	PMAP_MP(x)	x
179 #else
180 #define	PMAP_MP(x)	__nothing
181 #endif /* MULTIPROCESSOR */
182 
183 /*
184  * Given a map and a machine independent protection code,
185  * convert to an alpha protection code.
186  */
187 #define pte_prot(m, p)	(protection_codes[m == pmap_kernel() ? 0 : 1][p])
188 static int	protection_codes[2][8] __read_mostly;
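/*
 * Illustrative note: the table is filled in by alpha_protection_init().
 * Roughly, row 0 (kernel pmap) holds PG_K* access bits and row 1 (user
 * pmaps) holds PG_U* access bits, indexed by the VM_PROT_* code.
 */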
189 
190 /*
191  * kernel_lev1map:
192  *
193  *	Kernel level 1 page table.  This maps all kernel level 2
194  *	page table pages, and is used as a template for all user
195  *	pmap level 1 page tables.  When a new user level 1 page
196  *	table is allocated, all kernel_lev1map PTEs for kernel
197  *	addresses are copied to the new map.
198  *
199  *	The kernel also has an initial set of kernel level 2 page
200  *	table pages.  These map the kernel level 3 page table pages.
201  *	As kernel level 3 page table pages are added, more level 2
202  *	page table pages may be added to map them.  These pages are
203  *	never freed.
204  *
205  *	Finally, the kernel also has an initial set of kernel level
206  *	3 page table pages.  These map pages in K1SEG.  More level
207  *	3 page table pages may be added at run-time if additional
208  *	K1SEG address space is required.  These pages are never freed.
209  *
210  * NOTE: When mappings are inserted into the kernel pmap, all
211  * level 2 and level 3 page table pages must already be allocated
212  * and mapped into the parent page table.
213  */
214 pt_entry_t	*kernel_lev1map __read_mostly;
215 
216 /*
217  * Virtual Page Table.
218  */
219 static pt_entry_t *VPT __read_mostly;
220 
221 static struct {
222 	struct pmap k_pmap;
223 } kernel_pmap_store __cacheline_aligned;
224 
225 struct pmap *const kernel_pmap_ptr = &kernel_pmap_store.k_pmap;
226 
227 /* PA of first available physical page */
228 paddr_t    	avail_start __read_mostly;
229 
230 /* PA of last available physical page */
231 paddr_t		avail_end __read_mostly;
232 
233 /* VA of last avail page (end of kernel AS) */
234 static vaddr_t	virtual_end __read_mostly;
235 
236 /* Has pmap_init completed? */
237 static bool pmap_initialized __read_mostly;
238 
239 /* Instrumentation */
240 u_long		pmap_pages_stolen __read_mostly;
241 
242 /*
243  * This variable contains the number of CPU IDs we need to allocate
244  * space for when allocating the pmap structure.  It is used to
245  * size a per-CPU array of ASN and ASN-generation numbers.
246  */
247 static u_long 	pmap_ncpuids __read_mostly;
248 
249 #ifndef PMAP_PV_LOWAT
250 #define	PMAP_PV_LOWAT	16
251 #endif
252 int		pmap_pv_lowat __read_mostly = PMAP_PV_LOWAT;
253 
254 /*
255  * List of all pmaps, used to update them when e.g. additional kernel
256  * page tables are allocated.  This list is kept LRU-ordered by
257  * pmap_activate().
258  */
259 static TAILQ_HEAD(, pmap) pmap_all_pmaps __cacheline_aligned;
260 
261 /*
262  * The pools from which pmap structures and sub-structures are allocated.
263  */
264 static struct pool_cache pmap_pmap_cache __read_mostly;
265 static struct pool_cache pmap_l1pt_cache __read_mostly;
266 static struct pool_cache pmap_pv_cache __read_mostly;
267 
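/*
 * Compile-time sanity checks: each per-CPU ASN info slot in struct pmap
 * must begin at, and occupy exactly, one coherency unit (so different
 * CPUs' slots never share a cache line), and a pmap sized for
 * ALPHA_MAXPROCS CPUs must still fit within a single page.
 */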
268 CTASSERT(offsetof(struct pmap, pm_asni[0]) == COHERENCY_UNIT);
269 CTASSERT(PMAP_SIZEOF(ALPHA_MAXPROCS) < ALPHA_PGBYTES);
270 CTASSERT(sizeof(struct pmap_asn_info) == COHERENCY_UNIT);
271 
272 /*
273  * Address Space Numbers.
274  *
275  * On many implementations of the Alpha architecture, the TLB entries and
276  * I-cache blocks are tagged with a unique number within an implementation-
277  * specified range.  When a process context becomes active, the ASN is used
278  * to match TLB entries; if a TLB entry for a particular VA does not match
279  * the current ASN, it is ignored (one could think of the processor as
280  * having a collection of <max ASN> separate TLBs).  This allows operating
281  * system software to skip the TLB flush that would otherwise be necessary
282  * at context switch time.
283  *
284  * Alpha PTEs have a bit in them (PG_ASM - Address Space Match) that
285  * causes TLB entries to match any ASN.  The PALcode also provides
286  * a TBI (Translation Buffer Invalidate) operation that flushes all
287  * TLB entries that _do not_ have PG_ASM.  We use this bit for kernel
288  * mappings, so that invalidation of all user mappings does not invalidate
289  * kernel mappings (which are consistent across all processes).
290  *
291  * pmap_next_asn always indicates the next ASN to use.  When
292  * pmap_next_asn exceeds pmap_max_asn, we start a new ASN generation.
293  *
294  * When a new ASN generation is created, the per-process (i.e. non-PG_ASM)
295  * TLB entries and the I-cache are flushed, the generation number is bumped,
296  * and pmap_next_asn is changed to indicate the first non-reserved ASN.
297  *
298  * We reserve ASN #0 for pmaps that use the global kernel_lev1map.  This
299  * ensures that LWPs using the kernel pmap never make accidental
300  * accesses to user space.  This is important because
301  * the PALcode may use the recursive VPT to service TLB misses.
302  *
303  * By reserving an ASN for the kernel, we are guaranteeing that an lwp
304  * will not see any valid user space TLB entries until it passes through
305  * pmap_activate() for the first time.
306  *
307  * On processors that do not support ASNs, the PALcode invalidates
308  * non-ASM TLB entries automatically on swpctx.  We completely skip
309  * the ASN machinery in this case because the PALcode neither reads
310  * nor writes that field of the HWPCB.
311  */
312 
313 /* max ASN supported by the system */
314 static u_int	pmap_max_asn __read_mostly;
315 
316 /*
317  * Locking:
318  *
319  *	READ/WRITE LOCKS
320  *	----------------
321  *
322  *	* pmap_main_lock - This lock is used to prevent deadlock and/or
323  *	  provide mutex access to the pmap module.  Most operations lock
324  *	  the pmap first, then PV lists as needed.  However, some operations,
325  *	  such as pmap_page_protect(), lock the PV lists before locking
326  *	  the pmaps.  To prevent deadlock, we require a mutex lock on the
327  *	  pmap module if locking in the PV->pmap direction.  This is
328  *	  implemented by acquiring a (shared) read lock on pmap_main_lock
329  *	  if locking pmap->PV and an (exclusive) write lock if locking in
330  *	  the PV->pmap direction.  Since only one thread can hold a write
331  *	  lock at a time, this provides the mutex.
332  *
333  *	MUTEXES
334  *	-------
335  *
336  *	* pmap lock (global hash) - These locks protect the pmap structures.
337  *
338  *	* pmap activation lock (global hash) - These IPL_SCHED spin locks
339  *	  synchronize pmap_activate() and TLB shootdowns.  This has a lock
340  *	  ordering constraint with the tlb_lock:
341  *
342  *		tlb_lock -> pmap activation lock
343  *
344  *	* pvh_lock (global hash) - These locks protect the PV lists for
345  *	  managed pages.
346  *
347  *	* tlb_lock - This IPL_VM lock serializes local and remote TLB
348  *	  invalidation.
349  *
350  *	* pmap_all_pmaps_lock - This lock protects the global list of
351  *	  all pmaps.
352  *
353  *	* pmap_growkernel_lock - This lock protects pmap_growkernel()
354  *	  and the virtual_end variable.
355  *
356  *	  There is a lock ordering constraint for pmap_growkernel_lock.
357  *	  pmap_growkernel() acquires the locks in the following order:
358  *
359  *		pmap_growkernel_lock (write) -> pmap_all_pmaps_lock ->
360  *		    pmap lock
361  *
362  *	  We need to ensure consistency between user pmaps and the
363  *	  kernel_lev1map.  For this reason, pmap_growkernel_lock must
364  *	  be held to prevent kernel_lev1map from changing while pmaps are
365  *	  added to or removed from the global pmaps list.
366  *
367  *	Address space number management (global ASN counters and per-pmap
368  *	ASN state) is not locked; the state is kept in arrays of values indexed
369  *	per-processor.
370  *
371  *	All internal functions which operate on a pmap are called
372  *	with the pmap already locked by the caller (which will be
373  *	an interface function).
374  */
375 static krwlock_t pmap_main_lock __cacheline_aligned;
376 static kmutex_t pmap_all_pmaps_lock __cacheline_aligned;
377 static krwlock_t pmap_growkernel_lock __cacheline_aligned;
378 
379 #define	PMAP_MAP_TO_HEAD_LOCK()		rw_enter(&pmap_main_lock, RW_READER)
380 #define	PMAP_MAP_TO_HEAD_UNLOCK()	rw_exit(&pmap_main_lock)
381 #define	PMAP_HEAD_TO_MAP_LOCK()		rw_enter(&pmap_main_lock, RW_WRITER)
382 #define	PMAP_HEAD_TO_MAP_UNLOCK()	rw_exit(&pmap_main_lock)
383 
384 static union {
385 	kmutex_t	lock;
386 	uint8_t		pad[COHERENCY_UNIT];
387 } pmap_pvh_locks[64] __cacheline_aligned;
388 
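/*
 * Hash a vm_page pointer to one of the 64 PV-list locks above.  The
 * shift by 6 discards the low-order pointer bits so that vm_page
 * structures adjacent in memory tend to land on different locks.
 */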
389 #define	PVH_LOCK_HASH(pg)						\
390 	((((uintptr_t)(pg)) >> 6) & 63)
391 
392 static inline kmutex_t *
393 pmap_pvh_lock(struct vm_page *pg)
394 {
395 	return &pmap_pvh_locks[PVH_LOCK_HASH(pg)].lock;
396 }
397 
398 static union {
399 	struct {
400 		kmutex_t	lock;
401 		kmutex_t	activation_lock;
402 	} locks;
403 	uint8_t		pad[COHERENCY_UNIT];
404 } pmap_pmap_locks[64] __cacheline_aligned;
405 
406 #define	PMAP_LOCK_HASH(pm)						\
407 	((((uintptr_t)(pm)) >> 6) & 63)
408 
409 static inline kmutex_t *
410 pmap_pmap_lock(pmap_t const pmap)
411 {
412 	return &pmap_pmap_locks[PMAP_LOCK_HASH(pmap)].locks.lock;
413 }
414 
415 static inline kmutex_t *
416 pmap_activation_lock(pmap_t const pmap)
417 {
418 	return &pmap_pmap_locks[PMAP_LOCK_HASH(pmap)].locks.activation_lock;
419 }
420 
421 #define	PMAP_LOCK(pmap)		mutex_enter(pmap_pmap_lock(pmap))
422 #define	PMAP_UNLOCK(pmap)	mutex_exit(pmap_pmap_lock(pmap))
423 
424 #define	PMAP_ACT_LOCK(pmap)	mutex_spin_enter(pmap_activation_lock(pmap))
425 #define	PMAP_ACT_TRYLOCK(pmap)	mutex_tryenter(pmap_activation_lock(pmap))
426 #define	PMAP_ACT_UNLOCK(pmap)	mutex_spin_exit(pmap_activation_lock(pmap))
427 
428 #if defined(MULTIPROCESSOR)
429 #define	pmap_all_cpus()		cpus_running
430 #else
431 #define	pmap_all_cpus()		~0UL
432 #endif /* MULTIPROCESSOR */
433 
434 /*
435  * TLB management.
436  *
437  * TLB invalidations need to be performed on local and remote CPUs
438  * whenever parts of the PTE that the hardware or PALcode understands
439  * change.  In order to amortize the cost of these operations, we will
440  * queue up to 8 addresses to invalidate in a batch.  Any more than
441  * that, and we will hit the entire TLB.
442  *
443  * Some things that add complexity:
444  *
445  * ==> ASNs. A CPU may have valid TLB entries for other than the current
446  *     address space.  We can only invalidate TLB entries for the current
447  *     address space, so when asked to invalidate a VA for the non-current
448  *     pmap on a given CPU, we simply invalidate the ASN for that (pmap, CPU)
449  *     tuple so that a new one is allocated on the next activation on that
450  *     CPU.  N.B. that for CPUs that don't implement ASNs, SWPCTX does all
451  *     the work necessary, so we can skip some work in the pmap module
452  *     itself.
453  *
454  *     When a pmap is activated on a given CPU, we set a corresponding
455  *     bit in pmap::pm_cpus, indicating that it potentially has valid
456  *     TLB entries for that address space.  This bitmap is then used to
457  *     determine which remote CPUs need to be notified of invalidations.
458  *     The bit is cleared when the ASN is invalidated on that CPU.
459  *
460  *     In order to serialize with activating an address space on a
461  *     given CPU (that we can reliably send notifications only to
462  *     given CPU (so that we can reliably send notifications only to
463  *     and also hold the lock while remote shootdowns take place.
464  *     This does not apply to the kernel pmap; all CPUs are notified about
465  *     invalidations for the kernel pmap, and the pmap lock is not held
466  *     in pmap_activate() for the kernel pmap.
467  *
468  * ==> P->V operations (e.g. pmap_page_protect()) may require sending
469  *     invalidations for multiple address spaces.  We only track one
470  *     address space at a time, and if we encounter more than one, then
471  *     the notification each CPU gets is to hit the entire TLB.  Note
472  *     also that we can't serialize with pmap_activate() in this case,
473  *     so all CPUs will get the notification, and they check when
474  *     processing the notification if the pmap is current on that CPU.
475  *
476  * Invalidation information is gathered into a pmap_tlb_context structure
477  * that includes room for 8 VAs, the pmap the VAs belong to, a bitmap of
478  * CPUs to be notified, and a list for PT pages that are freed during
479  *     CPUs to be notified, and a list of PT pages that are freed during
480  *     removal of mappings.  The count of valid addresses and the flags
481  *     are squeezed into the lower bits of the first two VAs.
482  *     Storage for this structure is allocated on the stack.  We need to be
483  *     careful to keep the size of this structure under control.
484  * When notifying remote CPUs, we acquire the tlb_lock (which also
485  * blocks IPIs), record the pointer to our context structure, set a
486  * global bitmap of CPUs to be notified, and then send the IPIs to
487  * each victim.  While the other CPUs are in-flight, we then perform
488  * any invalidations necessary on the local CPU.  Once that is done,
489  * we wait for the global context pointer to be cleared, which
490  * will be done by the final remote CPU to complete its work.  This
491  * method reduces cache line contention during processing.
492  *
493  * When removing mappings in user pmaps, this implementation frees page
494  * table pages back to the VM system once they contain no valid mappings.
495  * As we do this, we must invalidate any TLB entries that the
496  * CPU might hold for the respective recursive VPT mappings.  This must
497  * be done whenever an L1 or L2 PTE is invalidated.  Until these VPT
498  * translations are invalidated, the PT pages must not be reused.  For
499  * this reason, we keep a list of freed PT pages in the context structure
500  * and drain them off once all invalidations are complete.
501  *
502  * NOTE: The value of TLB_CTX_MAXVA is tuned to accommodate the UBC
503  * window size (defined as 64KB on alpha in <machine/vmparam.h>).
504  */
505 
506 #define	TLB_CTX_MAXVA		8
507 #define	TLB_CTX_ALLVA		PAGE_MASK
508 
509 #define	TLB_CTX_F_ASM		__BIT(0)
510 #define	TLB_CTX_F_IMB		__BIT(1)
511 #define	TLB_CTX_F_KIMB		__BIT(2)
512 #define	TLB_CTX_F_PV		__BIT(3)
513 #define	TLB_CTX_F_MULTI		__BIT(4)
514 
515 #define	TLB_CTX_COUNT(ctx)	((ctx)->t_addrdata[0] & PAGE_MASK)
516 #define	TLB_CTX_INC_COUNT(ctx)	 (ctx)->t_addrdata[0]++
517 #define	TLB_CTX_SET_ALLVA(ctx)	 (ctx)->t_addrdata[0] |= TLB_CTX_ALLVA
518 
519 #define	TLB_CTX_FLAGS(ctx)	((ctx)->t_addrdata[1] & PAGE_MASK)
520 #define	TLB_CTX_SET_FLAG(ctx, f) (ctx)->t_addrdata[1] |= (f)
521 
522 #define	TLB_CTX_VA(ctx, i)	((ctx)->t_addrdata[(i)] & ~PAGE_MASK)
523 #define	TLB_CTX_SETVA(ctx, i, va)					\
524 	(ctx)->t_addrdata[(i)] = (va) | ((ctx)->t_addrdata[(i)] & PAGE_MASK)
525 
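/*
 * Layout of pmap_tlb_context::t_addrdata[] (the VAs are page-aligned,
 * so their low PAGE_MASK bits are available for bookkeeping):
 *
 *	t_addrdata[0] = VA0 | count of valid VAs
 *	t_addrdata[1] = VA1 | TLB_CTX_F_* flags
 *	t_addrdata[2..7] = VA2 .. VA7
 *
 * A count of TLB_CTX_ALLVA means "invalidate all entries".
 */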
526 struct pmap_tlb_context {
527 	uintptr_t	t_addrdata[TLB_CTX_MAXVA];
528 	pmap_t		t_pmap;
529 	LIST_HEAD(, vm_page) t_freeptq;
530 };
531 
532 static struct {
533 	kmutex_t	lock;
534 	struct evcnt	events;
535 } tlb_shootdown __cacheline_aligned;
536 #define	tlb_lock	tlb_shootdown.lock
537 #define	tlb_evcnt	tlb_shootdown.events
538 #if defined(MULTIPROCESSOR)
539 static const struct pmap_tlb_context *tlb_context __cacheline_aligned;
540 static unsigned long tlb_pending __cacheline_aligned;
541 #endif /* MULTIPROCESSOR */
542 
543 #if defined(TLB_STATS)
544 #define	TLB_COUNT_DECL(cnt)	static struct evcnt tlb_stat_##cnt
545 #define	TLB_COUNT(cnt)		atomic_inc_64(&tlb_stat_##cnt .ev_count)
546 #define	TLB_COUNT_ATTACH(cnt)						\
547 	evcnt_attach_dynamic_nozero(&tlb_stat_##cnt, EVCNT_TYPE_MISC,	\
548 	    NULL, "TLB", #cnt)
549 
550 TLB_COUNT_DECL(invalidate_multi_tbia);
551 TLB_COUNT_DECL(invalidate_multi_tbiap);
552 TLB_COUNT_DECL(invalidate_multi_imb);
553 
554 TLB_COUNT_DECL(invalidate_kern_tbia);
555 TLB_COUNT_DECL(invalidate_kern_tbis);
556 TLB_COUNT_DECL(invalidate_kern_imb);
557 
558 TLB_COUNT_DECL(invalidate_user_not_current);
559 TLB_COUNT_DECL(invalidate_user_lazy_imb);
560 TLB_COUNT_DECL(invalidate_user_tbiap);
561 TLB_COUNT_DECL(invalidate_user_tbis);
562 
563 TLB_COUNT_DECL(shootdown_kernel);
564 TLB_COUNT_DECL(shootdown_user);
565 TLB_COUNT_DECL(shootdown_imb);
566 TLB_COUNT_DECL(shootdown_kimb);
567 TLB_COUNT_DECL(shootdown_overflow);
568 
569 TLB_COUNT_DECL(shootdown_all_user);
570 TLB_COUNT_DECL(shootdown_all_user_imb);
571 
572 TLB_COUNT_DECL(shootdown_pv);
573 TLB_COUNT_DECL(shootdown_pv_multi);
574 
575 TLB_COUNT_DECL(shootnow_over_notify);
576 TLB_COUNT_DECL(shootnow_remote);
577 
578 TLB_COUNT_DECL(reason_remove_kernel);
579 TLB_COUNT_DECL(reason_remove_user);
580 TLB_COUNT_DECL(reason_page_protect_read);
581 TLB_COUNT_DECL(reason_page_protect_none);
582 TLB_COUNT_DECL(reason_protect);
583 TLB_COUNT_DECL(reason_enter_kernel);
584 TLB_COUNT_DECL(reason_enter_user);
585 TLB_COUNT_DECL(reason_kenter);
586 TLB_COUNT_DECL(reason_enter_l2pt_delref);
587 TLB_COUNT_DECL(reason_enter_l3pt_delref);
588 TLB_COUNT_DECL(reason_kremove);
589 TLB_COUNT_DECL(reason_clear_modify);
590 TLB_COUNT_DECL(reason_clear_reference);
591 TLB_COUNT_DECL(reason_emulate_reference);
592 
593 TLB_COUNT_DECL(asn_reuse);
594 TLB_COUNT_DECL(asn_newgen);
595 TLB_COUNT_DECL(asn_assign);
596 
597 TLB_COUNT_DECL(activate_both_change);
598 TLB_COUNT_DECL(activate_asn_change);
599 TLB_COUNT_DECL(activate_ptbr_change);
600 TLB_COUNT_DECL(activate_swpctx);
601 TLB_COUNT_DECL(activate_skip_swpctx);
602 
603 #else /* ! TLB_STATS */
604 #define	TLB_COUNT(cnt)		__nothing
605 #define	TLB_COUNT_ATTACH(cnt)	__nothing
606 #endif /* TLB_STATS */
607 
608 static void
609 pmap_tlb_init(void)
610 {
611 	/* mutex is initialized in pmap_bootstrap(). */
612 
613 	evcnt_attach_dynamic_nozero(&tlb_evcnt, EVCNT_TYPE_MISC,
614 	    NULL, "TLB", "shootdown");
615 
616 	TLB_COUNT_ATTACH(invalidate_multi_tbia);
617 	TLB_COUNT_ATTACH(invalidate_multi_tbiap);
618 	TLB_COUNT_ATTACH(invalidate_multi_imb);
619 
620 	TLB_COUNT_ATTACH(invalidate_kern_tbia);
621 	TLB_COUNT_ATTACH(invalidate_kern_tbis);
622 	TLB_COUNT_ATTACH(invalidate_kern_imb);
623 
624 	TLB_COUNT_ATTACH(invalidate_user_not_current);
625 	TLB_COUNT_ATTACH(invalidate_user_lazy_imb);
626 	TLB_COUNT_ATTACH(invalidate_user_tbiap);
627 	TLB_COUNT_ATTACH(invalidate_user_tbis);
628 
629 	TLB_COUNT_ATTACH(shootdown_kernel);
630 	TLB_COUNT_ATTACH(shootdown_user);
631 	TLB_COUNT_ATTACH(shootdown_imb);
632 	TLB_COUNT_ATTACH(shootdown_kimb);
633 	TLB_COUNT_ATTACH(shootdown_overflow);
634 
635 	TLB_COUNT_ATTACH(shootdown_all_user);
636 	TLB_COUNT_ATTACH(shootdown_all_user_imb);
637 
638 	TLB_COUNT_ATTACH(shootdown_pv);
639 	TLB_COUNT_ATTACH(shootdown_pv_multi);
640 
641 	TLB_COUNT_ATTACH(shootnow_over_notify);
642 	TLB_COUNT_ATTACH(shootnow_remote);
643 
644 	TLB_COUNT_ATTACH(reason_remove_kernel);
645 	TLB_COUNT_ATTACH(reason_remove_user);
646 	TLB_COUNT_ATTACH(reason_page_protect_read);
647 	TLB_COUNT_ATTACH(reason_page_protect_none);
648 	TLB_COUNT_ATTACH(reason_protect);
649 	TLB_COUNT_ATTACH(reason_enter_kernel);
650 	TLB_COUNT_ATTACH(reason_enter_user);
651 	TLB_COUNT_ATTACH(reason_kenter);
652 	TLB_COUNT_ATTACH(reason_enter_l2pt_delref);
653 	TLB_COUNT_ATTACH(reason_enter_l3pt_delref);
654 	TLB_COUNT_ATTACH(reason_kremove);
655 	TLB_COUNT_ATTACH(reason_clear_modify);
656 	TLB_COUNT_ATTACH(reason_clear_reference);
657 
658 	TLB_COUNT_ATTACH(asn_reuse);
659 	TLB_COUNT_ATTACH(asn_newgen);
660 	TLB_COUNT_ATTACH(asn_assign);
661 
662 	TLB_COUNT_ATTACH(activate_both_change);
663 	TLB_COUNT_ATTACH(activate_asn_change);
664 	TLB_COUNT_ATTACH(activate_ptbr_change);
665 	TLB_COUNT_ATTACH(activate_swpctx);
666 	TLB_COUNT_ATTACH(activate_skip_swpctx);
667 }
668 
669 static inline void
670 pmap_tlb_context_init(struct pmap_tlb_context * const tlbctx)
671 {
672 	/* Initialize the minimum number of fields. */
673 	tlbctx->t_addrdata[0] = 0;
674 	tlbctx->t_addrdata[1] = 0;
675 	tlbctx->t_pmap = NULL;
676 	LIST_INIT(&tlbctx->t_freeptq);
677 }
678 
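/*
 * Sketch of the caller pattern for the shootdown machinery below
 * (compare pmap_remove_internal() later in this file):
 *
 *	struct pmap_tlb_context tlbctx;
 *
 *	pmap_tlb_context_init(&tlbctx);
 *	... for each PTE being torn down ...
 *		pmap_tlb_shootdown(pmap, va, pte_bits, &tlbctx);
 *	pmap_tlb_shootnow(&tlbctx);
 *	pmap_tlb_ptpage_drain(&tlbctx);
 */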
679 static void
680 pmap_tlb_shootdown(pmap_t const pmap, vaddr_t const va,
681     pt_entry_t const pte_bits, struct pmap_tlb_context * const tlbctx)
682 {
683 	KASSERT(pmap != NULL);
684 	KASSERT((va & PAGE_MASK) == 0);
685 
686 	/*
687 	 * Figure out who needs to hear about this, and the scope
688 	 * of an all-entries invalidate.
689 	 */
690 	if (pmap == pmap_kernel()) {
691 		TLB_COUNT(shootdown_kernel);
692 		KASSERT(pte_bits & PG_ASM);
693 		TLB_CTX_SET_FLAG(tlbctx, TLB_CTX_F_ASM);
694 
695 		/* Note if an I-stream sync is also needed. */
696 		if (pte_bits & PG_EXEC) {
697 			TLB_COUNT(shootdown_kimb);
698 			TLB_CTX_SET_FLAG(tlbctx, TLB_CTX_F_KIMB);
699 		}
700 	} else {
701 		TLB_COUNT(shootdown_user);
702 		KASSERT((pte_bits & PG_ASM) == 0);
703 
704 		/* Note if an I-stream sync is also needed. */
705 		if (pte_bits & PG_EXEC) {
706 			TLB_COUNT(shootdown_imb);
707 			TLB_CTX_SET_FLAG(tlbctx, TLB_CTX_F_IMB);
708 		}
709 	}
710 
711 	KASSERT(tlbctx->t_pmap == NULL || tlbctx->t_pmap == pmap);
712 	tlbctx->t_pmap = pmap;
713 
714 	/*
715 	 * If we're already at the max, just tell each active CPU
716 	 * to nail everything.
717 	 */
718 	const uintptr_t count = TLB_CTX_COUNT(tlbctx);
719 	if (count > TLB_CTX_MAXVA) {
720 		return;
721 	}
722 	if (count == TLB_CTX_MAXVA) {
723 		TLB_COUNT(shootdown_overflow);
724 		TLB_CTX_SET_ALLVA(tlbctx);
725 		return;
726 	}
727 
728 	TLB_CTX_SETVA(tlbctx, count, va);
729 	TLB_CTX_INC_COUNT(tlbctx);
730 }
731 
732 static void
733 pmap_tlb_shootdown_all_user(pmap_t const pmap, pt_entry_t const pte_bits,
734     struct pmap_tlb_context * const tlbctx)
735 {
736 	KASSERT(pmap != pmap_kernel());
737 
738 	TLB_COUNT(shootdown_all_user);
739 
740 	/* Note if an I-stream sync is also needed. */
741 	if (pte_bits & PG_EXEC) {
742 		TLB_COUNT(shootdown_all_user_imb);
743 		TLB_CTX_SET_FLAG(tlbctx, TLB_CTX_F_IMB);
744 	}
745 
746 	TLB_CTX_SET_ALLVA(tlbctx);
747 }
748 
749 static void
750 pmap_tlb_shootdown_pv(const pv_entry_t pv, pt_entry_t const pte_bits,
751     struct pmap_tlb_context * const tlbctx)
752 {
753 	uintptr_t flags = TLB_CTX_F_PV;
754 
755 	TLB_COUNT(shootdown_pv);
756 
757 	if (tlbctx->t_pmap == NULL || tlbctx->t_pmap == pv->pv_pmap) {
758 		if (tlbctx->t_pmap == NULL) {
759 			pmap_reference(pv->pv_pmap);
760 		}
761 		pmap_tlb_shootdown(pv->pv_pmap, pv->pv_va, pte_bits, tlbctx);
762 	} else {
763 		TLB_COUNT(shootdown_pv_multi);
764 		flags |= TLB_CTX_F_MULTI;
765 		if (pv->pv_pmap == pmap_kernel()) {
766 			KASSERT(pte_bits & PG_ASM);
767 			flags |= TLB_CTX_F_ASM;
768 		} else {
769 			KASSERT((pte_bits & PG_ASM) == 0);
770 		}
771 
772 		/*
773 		 * No need to distinguish between kernel and user IMB
774 		 * here; see pmap_tlb_invalidate_multi().
775 		 */
776 		if (pte_bits & PG_EXEC) {
777 			flags |= TLB_CTX_F_IMB;
778 		}
779 		TLB_CTX_SET_ALLVA(tlbctx);
780 	}
781 	TLB_CTX_SET_FLAG(tlbctx, flags);
782 }
783 
784 static void
785 pmap_tlb_invalidate_multi(const struct pmap_tlb_context * const tlbctx)
786 {
787 	if (TLB_CTX_FLAGS(tlbctx) & TLB_CTX_F_ASM) {
788 		TLB_COUNT(invalidate_multi_tbia);
789 		ALPHA_TBIA();
790 	} else {
791 		TLB_COUNT(invalidate_multi_tbiap);
792 		ALPHA_TBIAP();
793 	}
794 	if (TLB_CTX_FLAGS(tlbctx) & (TLB_CTX_F_IMB | TLB_CTX_F_KIMB)) {
795 		TLB_COUNT(invalidate_multi_imb);
796 		alpha_pal_imb();
797 	}
798 }
799 
800 static void
801 pmap_tlb_invalidate_kernel(const struct pmap_tlb_context * const tlbctx)
802 {
803 	const uintptr_t count = TLB_CTX_COUNT(tlbctx);
804 
805 	if (count == TLB_CTX_ALLVA) {
806 		TLB_COUNT(invalidate_kern_tbia);
807 		ALPHA_TBIA();
808 	} else {
809 		TLB_COUNT(invalidate_kern_tbis);
810 		for (uintptr_t i = 0; i < count; i++) {
811 			ALPHA_TBIS(TLB_CTX_VA(tlbctx, i));
812 		}
813 	}
814 	if (TLB_CTX_FLAGS(tlbctx) & TLB_CTX_F_KIMB) {
815 		TLB_COUNT(invalidate_kern_imb);
816 		alpha_pal_imb();
817 	}
818 }
819 
820 static void
821 pmap_tlb_invalidate(const struct pmap_tlb_context * const tlbctx,
822     const struct cpu_info * const ci)
823 {
824 	const uintptr_t count = TLB_CTX_COUNT(tlbctx);
825 
826 	if (TLB_CTX_FLAGS(tlbctx) & TLB_CTX_F_MULTI) {
827 		pmap_tlb_invalidate_multi(tlbctx);
828 		return;
829 	}
830 
831 	if (TLB_CTX_FLAGS(tlbctx) & TLB_CTX_F_ASM) {
832 		pmap_tlb_invalidate_kernel(tlbctx);
833 		return;
834 	}
835 
836 	KASSERT(kpreempt_disabled());
837 
838 	pmap_t const pmap = tlbctx->t_pmap;
839 	KASSERT(pmap != NULL);
840 
841 	const u_long cpu_mask = 1UL << ci->ci_cpuid;
842 
843 	if (__predict_false(pmap != ci->ci_pmap)) {
844 		TLB_COUNT(invalidate_user_not_current);
845 
846 		/*
847 		 * For CPUs that don't implement ASNs, the SWPCTX call
848 		 * does all of the TLB invalidation work for us.
849 		 */
850 		if (__predict_false(pmap_max_asn == 0)) {
851 			return;
852 		}
853 
854 		/*
855 		 * We cannot directly invalidate the TLB in this case,
856 		 * so force allocation of a new ASN when the pmap becomes
857 		 * active again.
858 		 */
859 		pmap->pm_asni[ci->ci_cpuid].pma_asngen = PMAP_ASNGEN_INVALID;
860 		atomic_and_ulong(&pmap->pm_cpus, ~cpu_mask);
861 
862 		/*
863 		 * This isn't strictly necessary; when we allocate a
864 		 * new ASN, we're going to clear this bit and skip
865 		 * syncing the I-stream.  But we will keep this bit
866 		 * of accounting for internal consistency.
867 		 */
868 		if (TLB_CTX_FLAGS(tlbctx) & TLB_CTX_F_IMB) {
869 			atomic_or_ulong(&pmap->pm_needisync, cpu_mask);
870 		}
871 		return;
872 	}
873 
874 	if (TLB_CTX_FLAGS(tlbctx) & TLB_CTX_F_IMB) {
875 		TLB_COUNT(invalidate_user_lazy_imb);
876 		atomic_or_ulong(&pmap->pm_needisync, cpu_mask);
877 	}
878 
879 	if (count == TLB_CTX_ALLVA) {
880 		/*
881 		 * Another option here for CPUs that implement ASNs is
882 		 * to allocate a new ASN and do a SWPCTX.  That's almost
883 		 * certainly faster than a TBIAP, but would require us
884 		 * to synchronize against IPIs in pmap_activate().
885 		 */
886 		TLB_COUNT(invalidate_user_tbiap);
887 		KASSERT((TLB_CTX_FLAGS(tlbctx) & TLB_CTX_F_ASM) == 0);
888 		ALPHA_TBIAP();
889 	} else {
890 		TLB_COUNT(invalidate_user_tbis);
891 		for (uintptr_t i = 0; i < count; i++) {
892 			ALPHA_TBIS(TLB_CTX_VA(tlbctx, i));
893 		}
894 	}
895 }
896 
897 static void
898 pmap_tlb_shootnow(const struct pmap_tlb_context * const tlbctx)
899 {
900 
901 	if (TLB_CTX_COUNT(tlbctx) == 0) {
902 		/* No work to do. */
903 		return;
904 	}
905 
906 	/*
907 	 * Acquire the shootdown mutex.  This will also block IPL_VM
908 	 * interrupts and disable preemption.  It is critically important
909 	 * that IPIs not be blocked in this routine.
910 	 */
911 	KASSERT((alpha_pal_rdps() & ALPHA_PSL_IPL_MASK) < ALPHA_PSL_IPL_CLOCK);
912 	mutex_spin_enter(&tlb_lock);
913 	tlb_evcnt.ev_count++;
914 
915 	const struct cpu_info *ci = curcpu();
916 	const u_long this_cpu = 1UL << ci->ci_cpuid;
917 	u_long active_cpus;
918 	bool activation_locked, activation_lock_tried;
919 
920 	/*
921 	 * Figure out who to notify.  If it's for the kernel or
922 	 * multiple address spaces, we notify everybody.  If
923 	 * it's a single user pmap, then we try to acquire the
924 	 * activation lock so we can get an accurate accounting
925 	 * of who needs to be notified.  If we can't acquire
926 	 * the activation lock, then just notify everyone and
927 	 * let them sort it out when they process the IPI.
928 	 */
929 	if (TLB_CTX_FLAGS(tlbctx) & (TLB_CTX_F_ASM | TLB_CTX_F_MULTI)) {
930 		active_cpus = pmap_all_cpus();
931 		activation_locked = false;
932 		activation_lock_tried = false;
933 	} else {
934 		KASSERT(tlbctx->t_pmap != NULL);
935 		activation_locked = PMAP_ACT_TRYLOCK(tlbctx->t_pmap);
936 		if (__predict_true(activation_locked)) {
937 			active_cpus = tlbctx->t_pmap->pm_cpus;
938 		} else {
939 			TLB_COUNT(shootnow_over_notify);
940 			active_cpus = pmap_all_cpus();
941 		}
942 		activation_lock_tried = true;
943 	}
944 
945 #if defined(MULTIPROCESSOR)
946 	/*
947 	 * If there are remote CPUs that need to do work, get them
948 	 * started now.
949 	 */
950 	const u_long remote_cpus = active_cpus & ~this_cpu;
951 	KASSERT(tlb_context == NULL);
952 	if (remote_cpus) {
953 		TLB_COUNT(shootnow_remote);
954 		tlb_context = tlbctx;
955 		tlb_pending = remote_cpus;
956 		alpha_multicast_ipi(remote_cpus, ALPHA_IPI_SHOOTDOWN);
957 	}
958 #endif /* MULTIPROCESSOR */
959 
960 	/*
961 	 * Now that the remotes have been notified, release the
962 	 * activation lock.
963 	 */
964 	if (activation_lock_tried) {
965 		if (activation_locked) {
966 			KASSERT(tlbctx->t_pmap != NULL);
967 			PMAP_ACT_UNLOCK(tlbctx->t_pmap);
968 		}
969 		/*
970 		 * When we tried to acquire the activation lock, we
971 		 * raised IPL to IPL_SCHED (even if we ultimately
972 		 * failed to acquire the lock), which blocks out IPIs.
973 		 * Force our IPL back down to IPL_VM so that we can
974 		 * receive IPIs.
975 		 */
976 		alpha_pal_swpipl(IPL_VM);
977 	}
978 
979 	/*
980 	 * Do any work that we might need to do.  We don't need to
981 	 * synchronize with activation here because we know that
982 	 * for the current CPU, activation status will not change.
983 	 */
984 	if (active_cpus & this_cpu) {
985 		pmap_tlb_invalidate(tlbctx, ci);
986 	}
987 
988 #if defined(MULTIPROCESSOR)
989 	/* Wait for remote CPUs to finish. */
990 	if (remote_cpus) {
991 		int backoff = SPINLOCK_BACKOFF_MIN;
992 		u_int spins = 0;
993 
994 		while (atomic_load_acquire(&tlb_context) != NULL) {
995 			SPINLOCK_BACKOFF(backoff);
996 			if (spins++ > 0x0fffffff) {
997 				printf("TLB LOCAL MASK  = 0x%016lx\n",
998 				    this_cpu);
999 				printf("TLB REMOTE MASK = 0x%016lx\n",
1000 				    remote_cpus);
1001 				printf("TLB REMOTE PENDING = 0x%016lx\n",
1002 				    tlb_pending);
1003 				printf("TLB CONTEXT = %p\n", tlb_context);
1004 				printf("TLB LOCAL IPL = %lu\n",
1005 				    alpha_pal_rdps() & ALPHA_PSL_IPL_MASK);
1006 				panic("pmap_tlb_shootnow");
1007 			}
1008 		}
1009 	}
1010 	KASSERT(tlb_context == NULL);
1011 #endif /* MULTIPROCESSOR */
1012 
1013 	mutex_spin_exit(&tlb_lock);
1014 
1015 	if (__predict_false(TLB_CTX_FLAGS(tlbctx) & TLB_CTX_F_PV)) {
1016 		/*
1017 		 * P->V TLB operations may operate on multiple pmaps.
1018 		 * The shootdown takes a reference on the first pmap it
1019 		 * encounters, in order to prevent it from disappearing,
1020 		 * in the hope that we end up with a single-pmap P->V
1021 		 * operation (instrumentation shows this is not rare).
1022 		 *
1023 		 * Once this shootdown is finished globally, we need to
1024 		 * release this extra reference.
1025 		 */
1026 		KASSERT(tlbctx->t_pmap != NULL);
1027 		pmap_destroy(tlbctx->t_pmap);
1028 	}
1029 }
1030 
1031 #if defined(MULTIPROCESSOR)
1032 void
1033 pmap_tlb_shootdown_ipi(struct cpu_info * const ci,
1034     struct trapframe * const tf __unused)
1035 {
1036 	KASSERT(tlb_context != NULL);
1037 	pmap_tlb_invalidate(tlb_context, ci);
1038 	if (atomic_and_ulong_nv(&tlb_pending, ~(1UL << ci->ci_cpuid)) == 0) {
1039 		atomic_store_release(&tlb_context, NULL);
1040 	}
1041 }
1042 #endif /* MULTIPROCESSOR */
1043 
1044 static void
1045 pmap_tlb_physpage_free(paddr_t const ptpa,
1046     struct pmap_tlb_context * const tlbctx)
1047 {
1048 	struct vm_page * const pg = PHYS_TO_VM_PAGE(ptpa);
1049 
1050 	KASSERT(pg != NULL);
1051 
1052 #ifdef DEBUG
1053 	struct vm_page_md * const md = VM_PAGE_TO_MD(pg);
1054 	KDASSERT(md->pvh_refcnt == 0);
1055 #endif
1056 
1057 	LIST_INSERT_HEAD(&tlbctx->t_freeptq, pg, pageq.list);
1058 }
1059 
1060 static void
1061 pmap_tlb_ptpage_drain(struct pmap_tlb_context * const tlbctx)
1062 {
1063 	struct vm_page *pg;
1064 
1065 	while ((pg = LIST_FIRST(&tlbctx->t_freeptq)) != NULL) {
1066 		LIST_REMOVE(pg, pageq.list);
1067 		uvm_pagefree(pg);
1068 	}
1069 }
1070 
1071 /*
1072  * Internal routines
1073  */
1074 static void	alpha_protection_init(void);
1075 static pt_entry_t pmap_remove_mapping(pmap_t, vaddr_t, pt_entry_t *, bool,
1076 				      pv_entry_t *,
1077 				      struct pmap_tlb_context *);
1078 static void	pmap_changebit(struct vm_page *, pt_entry_t, pt_entry_t,
1079 			       struct pmap_tlb_context *);
1080 
1081 /*
1082  * PT page management functions.
1083  */
1084 static int	pmap_ptpage_alloc(pt_entry_t *, int);
1085 static void	pmap_ptpage_free(pt_entry_t *, struct pmap_tlb_context *);
1086 static void	pmap_l3pt_delref(pmap_t, vaddr_t, pt_entry_t *,
1087 		     struct pmap_tlb_context *);
1088 static void	pmap_l2pt_delref(pmap_t, pt_entry_t *, pt_entry_t *,
1089 		     struct pmap_tlb_context *);
1090 static void	pmap_l1pt_delref(pmap_t, pt_entry_t *);
1091 
1092 static void	*pmap_l1pt_alloc(struct pool *, int);
1093 static void	pmap_l1pt_free(struct pool *, void *);
1094 
1095 static struct pool_allocator pmap_l1pt_allocator = {
1096 	pmap_l1pt_alloc, pmap_l1pt_free, 0,
1097 };
1098 
1099 static int	pmap_l1pt_ctor(void *, void *, int);
1100 
1101 /*
1102  * PV table management functions.
1103  */
1104 static int	pmap_pv_enter(pmap_t, struct vm_page *, vaddr_t, pt_entry_t *,
1105 			      bool, pv_entry_t);
1106 static void	pmap_pv_remove(pmap_t, struct vm_page *, vaddr_t, bool,
1107 			       pv_entry_t *);
1108 static void	*pmap_pv_page_alloc(struct pool *, int);
1109 static void	pmap_pv_page_free(struct pool *, void *);
1110 
1111 static struct pool_allocator pmap_pv_page_allocator = {
1112 	pmap_pv_page_alloc, pmap_pv_page_free, 0,
1113 };
1114 
1115 #ifdef DEBUG
1116 void	pmap_pv_dump(paddr_t);
1117 #endif
1118 
1119 #define	pmap_pv_alloc()		pool_cache_get(&pmap_pv_cache, PR_NOWAIT)
1120 #define	pmap_pv_free(pv)	pool_cache_put(&pmap_pv_cache, (pv))
1121 
1122 /*
1123  * ASN management functions.
1124  */
1125 static u_int	pmap_asn_alloc(pmap_t, struct cpu_info *);
1126 
1127 /*
1128  * Misc. functions.
1129  */
1130 static bool	pmap_physpage_alloc(int, paddr_t *);
1131 static void	pmap_physpage_free(paddr_t);
1132 static int	pmap_physpage_addref(void *);
1133 static int	pmap_physpage_delref(void *);
1134 
1135 static bool	vtophys_internal(vaddr_t, paddr_t *p);
1136 
1137 /*
1138  * PMAP_KERNEL_PTE:
1139  *
1140  *	Get a kernel PTE.
1141  *
1142  *	If debugging, do a table walk.  If not debugging, just use
1143  *	the Virtual Page Table, since all kernel page tables are
1144  *	pre-allocated and mapped in.
1145  */
1146 #ifdef DEBUG
1147 #define	PMAP_KERNEL_PTE(va)						\
1148 ({									\
1149 	pt_entry_t *l1pte_, *l2pte_;					\
1150 									\
1151 	l1pte_ = pmap_l1pte(pmap_kernel(), va);				\
1152 	if (pmap_pte_v(l1pte_) == 0) {					\
1153 		printf("kernel level 1 PTE not valid, va 0x%lx "	\
1154 		    "(line %d)\n", (va), __LINE__);			\
1155 		panic("PMAP_KERNEL_PTE");				\
1156 	}								\
1157 	l2pte_ = pmap_l2pte(pmap_kernel(), va, l1pte_);			\
1158 	if (pmap_pte_v(l2pte_) == 0) {					\
1159 		printf("kernel level 2 PTE not valid, va 0x%lx "	\
1160 		    "(line %d)\n", (va), __LINE__);			\
1161 		panic("PMAP_KERNEL_PTE");				\
1162 	}								\
1163 	pmap_l3pte(pmap_kernel(), va, l2pte_);				\
1164 })
1165 #else
1166 #define	PMAP_KERNEL_PTE(va)	(&VPT[VPT_INDEX((va))])
1167 #endif
1168 
1169 /*
1170  * PMAP_STAT_{INCR,DECR}:
1171  *
1172  *	Increment or decrement a pmap statistic.
1173  */
1174 #define	PMAP_STAT_INCR(s, v)	atomic_add_long((unsigned long *)(&(s)), (v))
1175 #define	PMAP_STAT_DECR(s, v)	atomic_add_long((unsigned long *)(&(s)), -(v))
1176 
1177 /*
1178  * pmap_init_cpu:
1179  *
1180  *	Initialize pmap data in the cpu_info.
1181  */
1182 void
1183 pmap_init_cpu(struct cpu_info * const ci)
1184 {
1185 	pmap_t const pmap = pmap_kernel();
1186 
1187 	/* All CPUs start out using the kernel pmap. */
1188 	atomic_or_ulong(&pmap->pm_cpus, 1UL << ci->ci_cpuid);
1189 	pmap_reference(pmap);
1190 	ci->ci_pmap = pmap;
1191 
1192 	/* Initialize ASN allocation logic. */
1193 	ci->ci_next_asn = PMAP_ASN_FIRST_USER;
1194 	ci->ci_asn_gen = PMAP_ASNGEN_INITIAL;
1195 }
1196 
1197 /*
1198  * pmap_bootstrap:
1199  *
1200  *	Bootstrap the system to run with virtual memory.
1201  *
1202  *	Note: no locking is necessary in this function.
1203  */
1204 void
1205 pmap_bootstrap(paddr_t ptaddr, u_int maxasn, u_long ncpuids)
1206 {
1207 	vsize_t lev2mapsize, lev3mapsize;
1208 	pt_entry_t *lev2map, *lev3map;
1209 	pt_entry_t pte;
1210 	vsize_t bufsz;
1211 	struct pcb *pcb;
1212 	int i;
1213 
1214 #ifdef DEBUG
1215 	if (pmapdebug & (PDB_FOLLOW|PDB_BOOTSTRAP))
1216 		printf("pmap_bootstrap(0x%lx, %u)\n", ptaddr, maxasn);
1217 #endif
1218 
1219 	/*
1220 	 * Compute the number of pages kmem_arena will have.
1221 	 */
1222 	kmeminit_nkmempages();
1223 
1224 	/*
1225 	 * Figure out how many initial PTE's are necessary to map the
1226 	 * kernel.  We also reserve space for kmem_alloc_pageable()
1227 	 * for vm_fork().
1228 	 */
1229 
1230 	/* Get size of buffer cache and set an upper limit */
1231 	bufsz = buf_memcalc();
1232 	buf_setvalimit(bufsz);
1233 
1234 	lev3mapsize =
1235 		(VM_PHYS_SIZE + (ubc_nwins << ubc_winshift) +
1236 		 bufsz + 16 * NCARGS + pager_map_size) / PAGE_SIZE +
1237 		(maxproc * UPAGES) + nkmempages;
1238 
1239 	lev3mapsize = roundup(lev3mapsize, NPTEPG);
1240 
1241 	/*
1242 	 * Initialize `FYI' variables.  Note we're relying on
1243 	 * the fact that BSEARCH sorts the vm_physmem[] array
1244 	 * for us.
1245 	 */
1246 	avail_start = ptoa(uvm_physseg_get_avail_start(uvm_physseg_get_first()));
1247 	avail_end = ptoa(uvm_physseg_get_avail_end(uvm_physseg_get_last()));
1248 	virtual_end = VM_MIN_KERNEL_ADDRESS + lev3mapsize * PAGE_SIZE;
1249 
1250 #if 0
1251 	printf("avail_start = 0x%lx\n", avail_start);
1252 	printf("avail_end = 0x%lx\n", avail_end);
1253 	printf("virtual_end = 0x%lx\n", virtual_end);
1254 #endif
1255 
1256 	/*
1257 	 * Allocate a level 1 PTE table for the kernel.
1258 	 * This is always one page long.
1259 	 * IF THIS IS NOT A MULTIPLE OF PAGE_SIZE, ALL WILL GO TO HELL.
1260 	 */
1261 	kernel_lev1map = (pt_entry_t *)
1262 	    uvm_pageboot_alloc(sizeof(pt_entry_t) * NPTEPG);
1263 
1264 	/*
1265 	 * Allocate a level 2 PTE table for the kernel.
1266 	 * These must map all of the level3 PTEs.
1267 	 * IF THIS IS NOT A MULTIPLE OF PAGE_SIZE, ALL WILL GO TO HELL.
1268 	 */
1269 	lev2mapsize = roundup(howmany(lev3mapsize, NPTEPG), NPTEPG);
1270 	lev2map = (pt_entry_t *)
1271 	    uvm_pageboot_alloc(sizeof(pt_entry_t) * lev2mapsize);
1272 
1273 	/*
1274 	 * Allocate a level 3 PTE table for the kernel.
1275 	 * Contains lev3mapsize PTEs.
1276 	 */
1277 	lev3map = (pt_entry_t *)
1278 	    uvm_pageboot_alloc(sizeof(pt_entry_t) * lev3mapsize);
1279 
1280 	/*
1281 	 * Set up level 1 page table
1282 	 */
1283 
1284 	/* Map all of the level 2 pte pages */
1285 	for (i = 0; i < howmany(lev2mapsize, NPTEPG); i++) {
1286 		pte = (ALPHA_K0SEG_TO_PHYS(((vaddr_t)lev2map) +
1287 		    (i*PAGE_SIZE)) >> PGSHIFT) << PG_SHIFT;
1288 		pte |= PG_V | PG_ASM | PG_KRE | PG_KWE | PG_WIRED;
1289 		kernel_lev1map[l1pte_index(VM_MIN_KERNEL_ADDRESS +
1290 		    (i*PAGE_SIZE*NPTEPG*NPTEPG))] = pte;
1291 	}
1292 
1293 	/* Map the virtual page table */
1294 	pte = (ALPHA_K0SEG_TO_PHYS((vaddr_t)kernel_lev1map) >> PGSHIFT)
1295 	    << PG_SHIFT;
1296 	pte |= PG_V | PG_KRE | PG_KWE; /* NOTE NO ASM */
1297 	kernel_lev1map[l1pte_index(VPTBASE)] = pte;
1298 	VPT = (pt_entry_t *)VPTBASE;
1299 
1300 	/*
1301 	 * Set up level 2 page table.
1302 	 */
1303 	/* Map all of the level 3 pte pages */
1304 	for (i = 0; i < howmany(lev3mapsize, NPTEPG); i++) {
1305 		pte = (ALPHA_K0SEG_TO_PHYS(((vaddr_t)lev3map) +
1306 		    (i*PAGE_SIZE)) >> PGSHIFT) << PG_SHIFT;
1307 		pte |= PG_V | PG_ASM | PG_KRE | PG_KWE | PG_WIRED;
1308 		lev2map[l2pte_index(VM_MIN_KERNEL_ADDRESS+
1309 		    (i*PAGE_SIZE*NPTEPG))] = pte;
1310 	}
1311 
1312 	/* Initialize the pmap_growkernel_lock. */
1313 	rw_init(&pmap_growkernel_lock);
1314 
1315 	/*
1316 	 * Set up level three page table (lev3map)
1317 	 */
1318 	/* Nothing to do; it's already zero'd */
1319 
1320 	/*
1321 	 * Initialize the pmap pools and list.
1322 	 */
1323 	pmap_ncpuids = ncpuids;
1324 	pool_cache_bootstrap(&pmap_pmap_cache, PMAP_SIZEOF(pmap_ncpuids),
1325 	    COHERENCY_UNIT, 0, 0, "pmap", NULL, IPL_NONE, NULL, NULL, NULL);
1326 	pool_cache_bootstrap(&pmap_l1pt_cache, PAGE_SIZE, 0, 0, 0, "pmapl1pt",
1327 	    &pmap_l1pt_allocator, IPL_NONE, pmap_l1pt_ctor, NULL, NULL);
1328 	pool_cache_bootstrap(&pmap_pv_cache, sizeof(struct pv_entry), 0, 0,
1329 	    PR_LARGECACHE, "pmappv", &pmap_pv_page_allocator, IPL_NONE, NULL,
1330 	    NULL, NULL);
1331 
1332 	TAILQ_INIT(&pmap_all_pmaps);
1333 
1334 	/* Initialize the ASN logic.  See also pmap_init_cpu(). */
1335 	pmap_max_asn = maxasn;
1336 
1337 	/*
1338 	 * Initialize the locks.
1339 	 */
1340 	rw_init(&pmap_main_lock);
1341 	mutex_init(&pmap_all_pmaps_lock, MUTEX_DEFAULT, IPL_NONE);
1342 	for (i = 0; i < __arraycount(pmap_pvh_locks); i++) {
1343 		mutex_init(&pmap_pvh_locks[i].lock, MUTEX_DEFAULT, IPL_NONE);
1344 	}
1345 	for (i = 0; i < __arraycount(pmap_pmap_locks); i++) {
1346 		mutex_init(&pmap_pmap_locks[i].locks.lock,
1347 		    MUTEX_DEFAULT, IPL_NONE);
1348 		mutex_init(&pmap_pmap_locks[i].locks.activation_lock,
1349 		    MUTEX_SPIN, IPL_SCHED);
1350 	}
1351 
1352 	/*
1353 	 * This must block any interrupt from which a TLB shootdown
1354 	 * could be issued, but must NOT block IPIs.
1355 	 */
1356 	mutex_init(&tlb_lock, MUTEX_SPIN, IPL_VM);
1357 
1358 	/*
1359 	 * Initialize kernel pmap.  Note that all kernel mappings
1360 	 * have PG_ASM set, so the ASN doesn't really matter for
1361 	 * the kernel pmap.  Also, since the kernel pmap always
1362 	 * references kernel_lev1map, it always has an invalid ASN
1363 	 * generation.
1364 	 */
1365 	memset(pmap_kernel(), 0, sizeof(struct pmap));
1366 	pmap_kernel()->pm_lev1map = kernel_lev1map;
1367 	pmap_kernel()->pm_count = 1;
1368 	/* Kernel pmap does not have ASN info. */
1369 	TAILQ_INSERT_TAIL(&pmap_all_pmaps, pmap_kernel(), pm_list);
1370 
1371 	/*
1372 	 * Set up lwp0's PCB such that the ptbr points to the right place
1373 	 * and has the kernel pmap's (really unused) ASN.
1374 	 */
1375 	pcb = lwp_getpcb(&lwp0);
1376 	pcb->pcb_hw.apcb_ptbr =
1377 	    ALPHA_K0SEG_TO_PHYS((vaddr_t)kernel_lev1map) >> PGSHIFT;
1378 	pcb->pcb_hw.apcb_asn = PMAP_ASN_KERNEL;
1379 
1380 	struct cpu_info * const ci = curcpu();
1381 	pmap_init_cpu(ci);
1382 }
1383 
1384 /*
1385  * pmap_virtual_space:		[ INTERFACE ]
1386  *
1387  *	Define the initial bounds of the kernel virtual address space.
1388  */
1389 void
1390 pmap_virtual_space(vaddr_t *vstartp, vaddr_t *vendp)
1391 {
1392 
1393 	*vstartp = VM_MIN_KERNEL_ADDRESS;	/* kernel is in K0SEG */
1394 	*vendp = VM_MAX_KERNEL_ADDRESS;		/* we use pmap_growkernel */
1395 }
1396 
1397 /*
1398  * pmap_steal_memory:		[ INTERFACE ]
1399  *
1400  *	Bootstrap memory allocator (alternative to vm_bootstrap_steal_memory()).
1401  *	This function allows for early dynamic memory allocation until the
1402  *	virtual memory system has been bootstrapped.  After that point, either
1403  *	kmem_alloc or malloc should be used.  This function works by stealing
1404  *	pages from the (to be) managed page pool, then implicitly mapping the
1405  *	pages (by using their k0seg addresses) and zeroing them.
1406  *
1407  *	It may be used once the physical memory segments have been pre-loaded
1408  *	into the vm_physmem[] array.  Early memory allocation MUST use this
1409  *	interface!  This cannot be used after vm_page_startup(), and will
1410  *	generate a panic if tried.
1411  *
1412  *	Note that this memory will never be freed, and in essence it is wired
1413  *	down.
1414  *
1415  *	We must adjust *vstartp and/or *vendp iff we use address space
1416  *	from the kernel virtual address range defined by pmap_virtual_space().
1417  *
1418  *	Note: no locking is necessary in this function.
1419  */
1420 vaddr_t
1421 pmap_steal_memory(vsize_t size, vaddr_t *vstartp, vaddr_t *vendp)
1422 {
1423 	int npgs;
1424 	vaddr_t va;
1425 	paddr_t pa;
1426 
1427 	uvm_physseg_t bank;
1428 
1429 	size = round_page(size);
1430 	npgs = atop(size);
1431 
1432 #if 0
1433 	printf("PSM: size 0x%lx (npgs 0x%x)\n", size, npgs);
1434 #endif
1435 
1436 	for (bank = uvm_physseg_get_first();
1437 	     uvm_physseg_valid_p(bank);
1438 	     bank = uvm_physseg_get_next(bank)) {
1439 		if (uvm.page_init_done == true)
1440 			panic("pmap_steal_memory: called _after_ bootstrap");
1441 
1442 #if 0
1443 		printf("     bank %d: avail_start 0x%"PRIxPADDR", start 0x%"PRIxPADDR", "
1444 		    "avail_end 0x%"PRIxPADDR"\n", bank, uvm_physseg_get_avail_start(bank),
1445 		    uvm_physseg_get_start(bank), uvm_physseg_get_avail_end(bank));
1446 #endif
1447 
1448 		if (uvm_physseg_get_avail_start(bank) != uvm_physseg_get_start(bank) ||
1449 		    uvm_physseg_get_avail_start(bank) >= uvm_physseg_get_avail_end(bank))
1450 			continue;
1451 
1452 #if 0
1453 		printf("             avail_end - avail_start = 0x%"PRIxPADDR"\n",
1454 		    uvm_physseg_get_avail_end(bank) - uvm_physseg_get_avail_start(bank));
1455 #endif
1456 
1457 		if (uvm_physseg_get_avail_end(bank) - uvm_physseg_get_avail_start(bank)
1458 		    < npgs)
1459 			continue;
1460 
1461 		/*
1462 		 * There are enough pages here; steal them!
1463 		 */
1464 		pa = ptoa(uvm_physseg_get_start(bank));
1465 		uvm_physseg_unplug(atop(pa), npgs);
1466 
1467 		va = ALPHA_PHYS_TO_K0SEG(pa);
1468 		memset((void *)va, 0, size);
1469 		pmap_pages_stolen += npgs;
1470 		return (va);
1471 	}
1472 
1473 	/*
1474 	 * If we got here, there was no memory left.
1475 	 */
1476 	panic("pmap_steal_memory: no memory to steal");
1477 }
1478 
1479 /*
1480  * pmap_init:			[ INTERFACE ]
1481  *
1482  *	Initialize the pmap module.  Called by vm_init(), to initialize any
1483  *	structures that the pmap system needs to map virtual memory.
1484  *
1485  *	Note: no locking is necessary in this function.
1486  */
1487 void
1488 pmap_init(void)
1489 {
1490 
1491 #ifdef DEBUG
1492 	if (pmapdebug & PDB_FOLLOW)
1493 	        printf("pmap_init()\n");
1494 #endif
1495 
1496 	/* initialize protection array */
1497 	alpha_protection_init();
1498 
1499 	/* Initialize TLB handling. */
1500 	pmap_tlb_init();
1501 
1502 	/*
1503 	 * Set a low water mark on the pv_entry pool, so that we are
1504 	 * more likely to have these around even in extreme memory
1505 	 * starvation.
1506 	 */
1507 	pool_cache_setlowat(&pmap_pv_cache, pmap_pv_lowat);
1508 
1509 	/*
1510 	 * Now it is safe to enable pv entry recording.
1511 	 */
1512 	pmap_initialized = true;
1513 
1514 #if 0
1515 	for (uvm_physseg_t bank = uvm_physseg_get_first();
1516 	    uvm_physseg_valid_p(bank);
1517 	    bank = uvm_physseg_get_next(bank)) {
1518 		printf("bank %d\n", bank);
1519 		printf("\tstart = 0x%lx\n", ptoa(uvm_physseg_get_start(bank)));
1520 		printf("\tend = 0x%lx\n", ptoa(uvm_physseg_get_end(bank)));
1521 		printf("\tavail_start = 0x%lx\n",
1522 		    ptoa(uvm_physseg_get_avail_start(bank)));
1523 		printf("\tavail_end = 0x%lx\n",
1524 		    ptoa(uvm_physseg_get_avail_end(bank)));
1525 	}
1526 #endif
1527 }
1528 
1529 /*
1530  * pmap_create:			[ INTERFACE ]
1531  *
1532  *	Create and return a physical map.
1533  *
1534  *	Note: no locking is necessary in this function.
1535  */
1536 pmap_t
1537 pmap_create(void)
1538 {
1539 	pmap_t pmap;
1540 	int i;
1541 
1542 #ifdef DEBUG
1543 	if (pmapdebug & (PDB_FOLLOW|PDB_CREATE))
1544 		printf("pmap_create()\n");
1545 #endif
1546 
1547 	pmap = pool_cache_get(&pmap_pmap_cache, PR_WAITOK);
1548 	memset(pmap, 0, sizeof(*pmap));
1549 
1550 	pmap->pm_count = 1;
1551 
1552 	/*
1553 	 * There are only kernel mappings at this point; give the pmap
1554 	 * the kernel ASN.  This will be initialized to correct values
1555 	 * when the pmap is activated.
1556 	 */
1557 	for (i = 0; i < pmap_ncpuids; i++) {
1558 		pmap->pm_asni[i].pma_asn = PMAP_ASN_KERNEL;
1559 		pmap->pm_asni[i].pma_asngen = PMAP_ASNGEN_INVALID;
1560 	}
1561 
1562  try_again:
1563 	rw_enter(&pmap_growkernel_lock, RW_READER);
1564 
1565 	pmap->pm_lev1map = pool_cache_get(&pmap_l1pt_cache, PR_NOWAIT);
1566 	if (__predict_false(pmap->pm_lev1map == NULL)) {
1567 		rw_exit(&pmap_growkernel_lock);
1568 		(void) kpause("pmap_create", false, hz >> 2, NULL);
1569 		goto try_again;
1570 	}
1571 
1572 	mutex_enter(&pmap_all_pmaps_lock);
1573 	TAILQ_INSERT_TAIL(&pmap_all_pmaps, pmap, pm_list);
1574 	mutex_exit(&pmap_all_pmaps_lock);
1575 
1576 	rw_exit(&pmap_growkernel_lock);
1577 
1578 	return (pmap);
1579 }
1580 
1581 /*
1582  * pmap_destroy:		[ INTERFACE ]
1583  *
1584  *	Drop the reference count on the specified pmap, releasing
1585  *	all resources if the reference count drops to zero.
1586  */
1587 void
1588 pmap_destroy(pmap_t pmap)
1589 {
1590 
1591 #ifdef DEBUG
1592 	if (pmapdebug & PDB_FOLLOW)
1593 		printf("pmap_destroy(%p)\n", pmap);
1594 #endif
1595 
1596 	PMAP_MP(membar_exit());
1597 	if (atomic_dec_ulong_nv(&pmap->pm_count) > 0)
1598 		return;
1599 
1600 	rw_enter(&pmap_growkernel_lock, RW_READER);
1601 
1602 	/*
1603 	 * Remove it from the global list of all pmaps.
1604 	 */
1605 	mutex_enter(&pmap_all_pmaps_lock);
1606 	TAILQ_REMOVE(&pmap_all_pmaps, pmap, pm_list);
1607 	mutex_exit(&pmap_all_pmaps_lock);
1608 
1609 	pool_cache_put(&pmap_l1pt_cache, pmap->pm_lev1map);
1610 	pmap->pm_lev1map = NULL;
1611 
1612 	rw_exit(&pmap_growkernel_lock);
1613 
1614 	pool_cache_put(&pmap_pmap_cache, pmap);
1615 }
1616 
1617 /*
1618  * pmap_reference:		[ INTERFACE ]
1619  *
1620  *	Add a reference to the specified pmap.
1621  */
1622 void
1623 pmap_reference(pmap_t pmap)
1624 {
1625 
1626 #ifdef DEBUG
1627 	if (pmapdebug & PDB_FOLLOW)
1628 		printf("pmap_reference(%p)\n", pmap);
1629 #endif
1630 
1631 	atomic_inc_ulong(&pmap->pm_count);
1632 	PMAP_MP(membar_enter());
1633 }
1634 
1635 /*
1636  * pmap_remove:			[ INTERFACE ]
1637  *
1638  *	Remove the given range of addresses from the specified map.
1639  *
1640  *	It is assumed that the start and end are properly
1641  *	rounded to the page size.
1642  */
1643 static void
1644 pmap_remove_internal(pmap_t pmap, vaddr_t sva, vaddr_t eva,
1645     struct pmap_tlb_context * const tlbctx)
1646 {
1647 	pt_entry_t *l1pte, *l2pte, *l3pte;
1648 	pt_entry_t *saved_l2pte, *saved_l3pte;
1649 	vaddr_t l1eva, l2eva, l3vptva;
1650 	pt_entry_t pte_bits;
1651 
1652 #ifdef DEBUG
1653 	if (pmapdebug & (PDB_FOLLOW|PDB_REMOVE|PDB_PROTECT))
1654 		printf("pmap_remove(%p, %lx, %lx)\n", pmap, sva, eva);
1655 #endif
1656 
1657 	/*
1658 	 * If this is the kernel pmap, we can use a faster method
1659 	 * for accessing the PTEs (since the PT pages are always
1660 	 * resident).
1661 	 *
1662 	 * Note that this routine should NEVER be called from an
1663 	 * interrupt context; pmap_kremove() is used for that.
1664 	 */
1665 	if (pmap == pmap_kernel()) {
1666 		PMAP_MAP_TO_HEAD_LOCK();
1667 		PMAP_LOCK(pmap);
1668 
1669 		while (sva < eva) {
1670 			l3pte = PMAP_KERNEL_PTE(sva);
1671 			if (pmap_pte_v(l3pte)) {
1672 				pte_bits = pmap_remove_mapping(pmap, sva,
1673 				    l3pte, true, NULL, tlbctx);
1674 				pmap_tlb_shootdown(pmap, sva, pte_bits,
1675 				    tlbctx);
1676 			}
1677 			sva += PAGE_SIZE;
1678 		}
1679 
1680 		PMAP_MAP_TO_HEAD_UNLOCK();
1681 		PMAP_UNLOCK(pmap);
1682 		pmap_tlb_shootnow(tlbctx);
1683 		pmap_tlb_ptpage_drain(tlbctx);
1684 		TLB_COUNT(reason_remove_kernel);
1685 
1686 		return;
1687 	}
1688 
1689 	KASSERT(sva < VM_MAXUSER_ADDRESS);
1690 	KASSERT(eva <= VM_MAXUSER_ADDRESS);
1691 	KASSERT(pmap->pm_lev1map != kernel_lev1map);
1692 
1693 	PMAP_MAP_TO_HEAD_LOCK();
1694 	PMAP_LOCK(pmap);
1695 
1696 	l1pte = pmap_l1pte(pmap, sva);
1697 
1698 	for (; sva < eva; sva = l1eva, l1pte++) {
1699 		l1eva = alpha_trunc_l1seg(sva) + ALPHA_L1SEG_SIZE;
1700 		if (pmap_pte_v(l1pte)) {
1701 			saved_l2pte = l2pte = pmap_l2pte(pmap, sva, l1pte);
1702 
1703 			/*
1704 			 * Add a reference to the L2 table so it won't
1705 			 * get removed from under us.
1706 			 */
1707 			pmap_physpage_addref(saved_l2pte);
1708 
1709 			for (; sva < l1eva && sva < eva; sva = l2eva, l2pte++) {
1710 				l2eva =
1711 				    alpha_trunc_l2seg(sva) + ALPHA_L2SEG_SIZE;
1712 				if (pmap_pte_v(l2pte)) {
1713 					saved_l3pte = l3pte =
1714 					    pmap_l3pte(pmap, sva, l2pte);
1715 
1716 					/*
1717 					 * Add a reference to the L3 table so
1718 					 * it won't get removed from under us.
1719 					 */
1720 					pmap_physpage_addref(saved_l3pte);
1721 
1722 					/*
1723 					 * Remember this sva; if the L3 table
1724 					 * gets removed, we need to invalidate
1725 					 * the VPT TLB entry for it.
1726 					 */
1727 					l3vptva = sva;
1728 
1729 					for (; sva < l2eva && sva < eva;
1730 					     sva += PAGE_SIZE, l3pte++) {
1731 						if (!pmap_pte_v(l3pte)) {
1732 							continue;
1733 						}
1734 						pte_bits =
1735 						    pmap_remove_mapping(
1736 							pmap, sva,
1737 							l3pte, true,
1738 							NULL, tlbctx);
1739 						pmap_tlb_shootdown(pmap,
1740 						    sva, pte_bits, tlbctx);
1741 					}
1742 
1743 					/*
1744 					 * Remove the reference to the L3
1745 					 * table that we added above.  This
1746 					 * may free the L3 table.
1747 					 */
1748 					pmap_l3pt_delref(pmap, l3vptva,
1749 					    saved_l3pte, tlbctx);
1750 				}
1751 			}
1752 
1753 			/*
1754 			 * Remove the reference to the L2 table that we
1755 			 * added above.  This may free the L2 table.
1756 			 */
1757 			pmap_l2pt_delref(pmap, l1pte, saved_l2pte, tlbctx);
1758 		}
1759 	}
1760 
1761 	PMAP_MAP_TO_HEAD_UNLOCK();
1762 	PMAP_UNLOCK(pmap);
1763 	pmap_tlb_shootnow(tlbctx);
1764 	pmap_tlb_ptpage_drain(tlbctx);
1765 	TLB_COUNT(reason_remove_user);
1766 }
1767 
1768 void
1769 pmap_remove(pmap_t pmap, vaddr_t sva, vaddr_t eva)
1770 {
1771 	struct pmap_tlb_context tlbctx;
1772 
1773 	pmap_tlb_context_init(&tlbctx);
1774 	pmap_remove_internal(pmap, sva, eva, &tlbctx);
1775 }
1776 
1777 /*
1778  * pmap_page_protect:		[ INTERFACE ]
1779  *
1780  *	Lower the permission for all mappings to a given page to
1781  *	the permissions specified.
1782  */
1783 void
1784 pmap_page_protect(struct vm_page *pg, vm_prot_t prot)
1785 {
1786 	struct vm_page_md * const md = VM_PAGE_TO_MD(pg);
1787 	pv_entry_t pv, nextpv;
1788 	pt_entry_t opte;
1789 	kmutex_t *lock;
1790 	struct pmap_tlb_context tlbctx;
1791 
1792 #ifdef DEBUG
1793 	paddr_t pa = VM_PAGE_TO_PHYS(pg);
1794 
1796 	if ((pmapdebug & (PDB_FOLLOW|PDB_PROTECT)) ||
1797 	    (prot == VM_PROT_NONE && (pmapdebug & PDB_REMOVE)))
1798 		printf("pmap_page_protect(%p, %x)\n", pg, prot);
1799 #endif
1800 
1801 	pmap_tlb_context_init(&tlbctx);
1802 
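	/*
	 * If write permission is being retained, there is nothing to
	 * do.  For a read-only (or read/execute) request, strip the
	 * write-enable bits from every mapping of the page.  Anything
	 * more restrictive removes the mappings entirely.
	 */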
1803 	switch (prot) {
1804 	case VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE:
1805 	case VM_PROT_READ|VM_PROT_WRITE:
1806 		return;
1807 
1808 	/* copy_on_write */
1809 	case VM_PROT_READ|VM_PROT_EXECUTE:
1810 	case VM_PROT_READ:
1811 		PMAP_HEAD_TO_MAP_LOCK();
1812 		lock = pmap_pvh_lock(pg);
1813 		mutex_enter(lock);
1814 		for (pv = md->pvh_list; pv != NULL; pv = pv->pv_next) {
1815 			PMAP_LOCK(pv->pv_pmap);
1816 			opte = atomic_load_relaxed(pv->pv_pte);
1817 			if (opte & (PG_KWE | PG_UWE)) {
1818 				atomic_store_relaxed(pv->pv_pte,
1819 				    opte & ~(PG_KWE | PG_UWE));
1820 				pmap_tlb_shootdown_pv(pv, opte, &tlbctx);
1821 			}
1822 			PMAP_UNLOCK(pv->pv_pmap);
1823 		}
1824 		mutex_exit(lock);
1825 		PMAP_HEAD_TO_MAP_UNLOCK();
1826 		pmap_tlb_shootnow(&tlbctx);
1827 		TLB_COUNT(reason_page_protect_read);
1828 		return;
1829 
1830 	/* remove_all */
1831 	default:
1832 		break;
1833 	}
1834 
1835 	PMAP_HEAD_TO_MAP_LOCK();
1836 	lock = pmap_pvh_lock(pg);
1837 	mutex_enter(lock);
1838 	for (pv = md->pvh_list; pv != NULL; pv = nextpv) {
1839 		pt_entry_t pte_bits;
1840 
1841 		nextpv = pv->pv_next;
1842 
1843 		PMAP_LOCK(pv->pv_pmap);
1844 		pte_bits = pmap_remove_mapping(pv->pv_pmap, pv->pv_va,
1845 		    pv->pv_pte, false, NULL, &tlbctx);
1846 		pmap_tlb_shootdown_pv(pv, pte_bits, &tlbctx);
1847 		PMAP_UNLOCK(pv->pv_pmap);
1848 	}
1849 	mutex_exit(lock);
1850 	PMAP_HEAD_TO_MAP_UNLOCK();
1851 	pmap_tlb_shootnow(&tlbctx);
1852 	pmap_tlb_ptpage_drain(&tlbctx);
1853 	TLB_COUNT(reason_page_protect_none);
1854 }
1855 
1856 /*
1857  * pmap_protect:		[ INTERFACE ]
1858  *
1859  *	Set the physical protection on the specified range of this map
1860  *	as requested.
1861  */
1862 void
1863 pmap_protect(pmap_t pmap, vaddr_t sva, vaddr_t eva, vm_prot_t prot)
1864 {
1865 	pt_entry_t *l1pte, *l2pte, *l3pte, opte;
1866 	vaddr_t l1eva, l2eva;
1867 	struct pmap_tlb_context tlbctx;
1868 
1869 #ifdef DEBUG
1870 	if (pmapdebug & (PDB_FOLLOW|PDB_PROTECT))
1871 		printf("pmap_protect(%p, %lx, %lx, %x)\n",
1872 		    pmap, sva, eva, prot);
1873 #endif
1874 
1875 	pmap_tlb_context_init(&tlbctx);
1876 
1877 	if ((prot & VM_PROT_READ) == VM_PROT_NONE) {
1878 		pmap_remove_internal(pmap, sva, eva, &tlbctx);
1879 		return;
1880 	}
1881 
1882 	const pt_entry_t bits = pte_prot(pmap, prot);
1883 
1884 	PMAP_LOCK(pmap);
1885 
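	/*
	 * Walk the three-level page table: for each valid L1 entry,
	 * visit its L2 entries, and for each valid L2 entry visit the
	 * L3 PTEs that cover [sva, eva), downgrading the protection and
	 * queueing a TLB shootdown wherever a PTE actually changes.
	 */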
1886 	l1pte = pmap_l1pte(pmap, sva);
1887 	for (; sva < eva; sva = l1eva, l1pte++) {
1888 		l1eva = alpha_trunc_l1seg(sva) + ALPHA_L1SEG_SIZE;
1889 		if (pmap_pte_v(l1pte)) {
1890 			l2pte = pmap_l2pte(pmap, sva, l1pte);
1891 			for (; sva < l1eva && sva < eva; sva = l2eva, l2pte++) {
1892 				l2eva =
1893 				    alpha_trunc_l2seg(sva) + ALPHA_L2SEG_SIZE;
1894 				if (pmap_pte_v(l2pte)) {
1895 					l3pte = pmap_l3pte(pmap, sva, l2pte);
1896 					for (; sva < l2eva && sva < eva;
1897 					     sva += PAGE_SIZE, l3pte++) {
1898 						if (pmap_pte_v(l3pte) &&
1899 						    pmap_pte_prot_chg(l3pte,
1900 								      bits)) {
1901 							opte = atomic_load_relaxed(l3pte);
1902 							pmap_pte_set_prot(l3pte,
1903 							   bits);
1904 							pmap_tlb_shootdown(pmap,
1905 							    sva, opte, &tlbctx);
1906 						}
1907 					}
1908 				}
1909 			}
1910 		}
1911 	}
1912 
1913 	PMAP_UNLOCK(pmap);
1914 	pmap_tlb_shootnow(&tlbctx);
1915 	TLB_COUNT(reason_protect);
1916 }
1917 
1918 /*
1919  * pmap_enter_tlb_shootdown:
1920  *
1921  *	Carry out a TLB shootdown on behalf of a pmap_enter()
1922  *	or a pmap_kenter_pa().  This is factored out separately
1923  *	because we expect it to be not a common case.
1924  */
1925 static void __noinline
1926 pmap_enter_tlb_shootdown(pmap_t const pmap, vaddr_t const va,
1927     pt_entry_t const pte_bits, bool locked)
1928 {
1929 	struct pmap_tlb_context tlbctx;
1930 
1931 	pmap_tlb_context_init(&tlbctx);
1932 	pmap_tlb_shootdown(pmap, va, pte_bits, &tlbctx);
1933 	if (locked) {
1934 		PMAP_UNLOCK(pmap);
1935 	}
1936 	pmap_tlb_shootnow(&tlbctx);
1937 }
1938 
1939 /*
1940  * pmap_enter_l2pt_delref:
1941  *
1942  *	Release a reference on an L2 PT page for pmap_enter().
1943  *	This is factored out separately becacause we expect it
1944  *	to be a rare case.
1945  */
1946 static void __noinline
1947 pmap_enter_l2pt_delref(pmap_t const pmap, pt_entry_t * const l1pte,
1948     pt_entry_t * const l2pte)
1949 {
1950 	struct pmap_tlb_context tlbctx;
1951 
1952 	/*
1953 	 * PALcode may have tried to service a TLB miss with
1954 	 * this L2 PTE, so we need to make sure we don't actually
1955 	 * free the PT page until we've shot down any TLB entries
1956 	 * for this VPT index.
1957 	 */
1958 
1959 	pmap_tlb_context_init(&tlbctx);
1960 	pmap_l2pt_delref(pmap, l1pte, l2pte, &tlbctx);
1961 	PMAP_UNLOCK(pmap);
1962 	pmap_tlb_shootnow(&tlbctx);
1963 	pmap_tlb_ptpage_drain(&tlbctx);
1964 	TLB_COUNT(reason_enter_l2pt_delref);
1965 }
1966 
1967 /*
1968  * pmap_enter_l3pt_delref:
1969  *
1970  *	Release a reference on an L3 PT page for pmap_enter().
1971  *	This is factored out separately because we expect it
1972  *	to be a rare case.
1973  */
1974 static void __noinline
1975 pmap_enter_l3pt_delref(pmap_t const pmap, vaddr_t const va,
1976     pt_entry_t * const pte)
1977 {
1978 	struct pmap_tlb_context tlbctx;
1979 
1980 	/*
1981 	 * PALcode may have tried to service a TLB miss with
1982 	 * this PTE, so we need to make sure we don't actually
1983 	 * free the PT page until we've shot down any TLB entries
1984 	 * for this VPT index.
1985 	 */
1986 
1987 	pmap_tlb_context_init(&tlbctx);
1988 	pmap_l3pt_delref(pmap, va, pte, &tlbctx);
1989 	PMAP_UNLOCK(pmap);
1990 	pmap_tlb_shootnow(&tlbctx);
1991 	pmap_tlb_ptpage_drain(&tlbctx);
1992 	TLB_COUNT(reason_enter_l3pt_delref);
1993 }
1994 
1995 /*
1996  * pmap_enter:			[ INTERFACE ]
1997  *
1998  *	Insert the given physical page (p) at
1999  *	the specified virtual address (v) in the
2000  *	target physical map with the protection requested.
2001  *
2002  *	If specified, the page will be wired down, meaning
2003  *	that the related pte can not be reclaimed.
2004  *
2005  *	Note:  This is the only routine which MAY NOT lazy-evaluate
2006  *	or lose information.  That is, this routine must actually
2007  *	insert this page into the given map NOW.
2008  */
2009 int
2010 pmap_enter(pmap_t pmap, vaddr_t va, paddr_t pa, vm_prot_t prot, u_int flags)
2011 {
2012 	pt_entry_t *pte, npte, opte;
2013 	pv_entry_t opv = NULL;
2014 	paddr_t opa;
2015 	bool tflush = false;
2016 	int error = 0;
2017 	kmutex_t *lock;
2018 
2019 #ifdef DEBUG
2020 	if (pmapdebug & (PDB_FOLLOW|PDB_ENTER))
2021 		printf("pmap_enter(%p, %lx, %lx, %x, %x)\n",
2022 		       pmap, va, pa, prot, flags);
2023 #endif
2024 	struct vm_page * const pg = PHYS_TO_VM_PAGE(pa);
2025 	const bool wired = (flags & PMAP_WIRED) != 0;
2026 
2027 	PMAP_MAP_TO_HEAD_LOCK();
2028 	PMAP_LOCK(pmap);
2029 
2030 	if (pmap == pmap_kernel()) {
2031 		KASSERT(va >= VM_MIN_KERNEL_ADDRESS);
2032 		pte = PMAP_KERNEL_PTE(va);
2033 	} else {
2034 		pt_entry_t *l1pte, *l2pte;
2035 
2036 		KASSERT(va < VM_MAXUSER_ADDRESS);
2037 		KASSERT(pmap->pm_lev1map != kernel_lev1map);
2038 
2039 		/*
2040 		 * Check to see if the level 1 PTE is valid, and
2041 		 * allocate a new level 2 page table page if it's not.
2042 		 * A reference will be added to the level 2 table when
2043 		 * the level 3 table is created.
2044 		 */
2045 		l1pte = pmap_l1pte(pmap, va);
2046 		if (pmap_pte_v(l1pte) == 0) {
2047 			pmap_physpage_addref(l1pte);
2048 			error = pmap_ptpage_alloc(l1pte, PGU_L2PT);
2049 			if (error) {
2050 				pmap_l1pt_delref(pmap, l1pte);
2051 				if (flags & PMAP_CANFAIL)
2052 					goto out;
2053 				panic("pmap_enter: unable to create L2 PT "
2054 				    "page");
2055 			}
2056 #ifdef DEBUG
2057 			if (pmapdebug & PDB_PTPAGE)
2058 				printf("pmap_enter: new level 2 table at "
2059 				    "0x%lx\n", pmap_pte_pa(l1pte));
2060 #endif
2061 		}
2062 
2063 		/*
2064 		 * Check to see if the level 2 PTE is valid, and
2065 		 * allocate a new level 3 page table page if it's not.
2066 		 * A reference will be added to the level 3 table when
2067 		 * the mapping is validated.
2068 		 */
2069 		l2pte = pmap_l2pte(pmap, va, l1pte);
2070 		if (pmap_pte_v(l2pte) == 0) {
2071 			pmap_physpage_addref(l2pte);
2072 			error = pmap_ptpage_alloc(l2pte, PGU_L3PT);
2073 			if (error) {
2074 				/* unlocks pmap */
2075 				pmap_enter_l2pt_delref(pmap, l1pte, l2pte);
2076 				if (flags & PMAP_CANFAIL) {
2077 					PMAP_LOCK(pmap);
2078 					goto out;
2079 				}
2080 				panic("pmap_enter: unable to create L3 PT "
2081 				    "page");
2082 			}
2083 #ifdef DEBUG
2084 			if (pmapdebug & PDB_PTPAGE)
2085 				printf("pmap_enter: new level 3 table at "
2086 				    "0x%lx\n", pmap_pte_pa(l2pte));
2087 #endif
2088 		}
2089 
2090 		/*
2091 		 * Get the PTE that will map the page.
2092 		 */
2093 		pte = pmap_l3pte(pmap, va, l2pte);
2094 	}
2095 
2096 	/* Remember all of the old PTE; used for TBI check later. */
2097 	opte = atomic_load_relaxed(pte);
2098 
2099 	/*
2100 	 * Check to see if the old mapping is valid.  If not, validate the
2101 	 * new one immediately.
2102 	 */
2103 	if ((opte & PG_V) == 0) {
2104 		/* No TLB invalidations needed for new mappings. */
2105 
2106 		if (pmap != pmap_kernel()) {
2107 			/*
2108 			 * New mappings gain a reference on the level 3
2109 			 * table.
2110 			 */
2111 			pmap_physpage_addref(pte);
2112 		}
2113 		goto validate_enterpv;
2114 	}
2115 
2116 	opa = pmap_pte_pa(pte);
2117 
2118 	if (opa == pa) {
2119 		/*
2120 		 * Mapping has not changed; must be a protection or
2121 		 * wiring change.
2122 		 */
2123 		if (pmap_pte_w_chg(pte, wired ? PG_WIRED : 0)) {
2124 #ifdef DEBUG
2125 			if (pmapdebug & PDB_ENTER)
2126 				printf("pmap_enter: wiring change -> %d\n",
2127 				    wired);
2128 #endif
2129 			/* Adjust the wiring count. */
2130 			if (wired)
2131 				PMAP_STAT_INCR(pmap->pm_stats.wired_count, 1);
2132 			else
2133 				PMAP_STAT_DECR(pmap->pm_stats.wired_count, 1);
2134 		}
2135 
2136 		/* Set the PTE. */
2137 		goto validate;
2138 	}
2139 
2140 	/*
2141 	 * The mapping has changed.  We need to invalidate the
2142 	 * old mapping before creating the new one.
2143 	 */
2144 #ifdef DEBUG
2145 	if (pmapdebug & PDB_ENTER)
2146 		printf("pmap_enter: removing old mapping 0x%lx\n", va);
2147 #endif
2148 	if (pmap != pmap_kernel()) {
2149 		/*
2150 		 * Gain an extra reference on the level 3 table.
2151 		 * pmap_remove_mapping() will delete a reference,
2152 		 * and we don't want the table to be erroneously
2153 		 * freed.
2154 		 */
2155 		pmap_physpage_addref(pte);
2156 	}
2157 	/* Already have the bits from opte above. */
2158 	(void) pmap_remove_mapping(pmap, va, pte, true, &opv, NULL);
2159 
2160  validate_enterpv:
2161 	/* Enter the mapping into the pv_table if appropriate. */
2162 	if (pg != NULL) {
2163 		error = pmap_pv_enter(pmap, pg, va, pte, true, opv);
2164 		if (error) {
2165 			/* This can only fail if opv == NULL */
2166 			KASSERT(opv == NULL);
2167 
2168 			/* unlocks pmap */
2169 			pmap_enter_l3pt_delref(pmap, va, pte);
2170 			if (flags & PMAP_CANFAIL) {
2171 				PMAP_LOCK(pmap);
2172 				goto out;
2173 			}
2174 			panic("pmap_enter: unable to enter mapping in PV "
2175 			    "table");
2176 		}
2177 		opv = NULL;
2178 	}
2179 
2180 	/* Increment counters. */
2181 	PMAP_STAT_INCR(pmap->pm_stats.resident_count, 1);
2182 	if (wired)
2183 		PMAP_STAT_INCR(pmap->pm_stats.wired_count, 1);
2184 
2185  validate:
2186 	/* Build the new PTE. */
2187 	npte = ((pa >> PGSHIFT) << PG_SHIFT) | pte_prot(pmap, prot) | PG_V;
2188 	if (pg != NULL) {
2189 		struct vm_page_md * const md = VM_PAGE_TO_MD(pg);
2190 		int attrs;
2191 
2192 		KASSERT(((flags & VM_PROT_ALL) & ~prot) == 0);
2193 
2194 		lock = pmap_pvh_lock(pg);
2195 		mutex_enter(lock);
2196 		if (flags & VM_PROT_WRITE)
2197 			md->pvh_attrs |= (PGA_REFERENCED|PGA_MODIFIED);
2198 		else if (flags & VM_PROT_ALL)
2199 			md->pvh_attrs |= PGA_REFERENCED;
2200 		attrs = md->pvh_attrs;
2201 		mutex_exit(lock);
2202 
2203 		/* Set up referenced/modified emulation for new mapping. */
2204 		if ((attrs & PGA_REFERENCED) == 0)
2205 			npte |= PG_FOR | PG_FOW | PG_FOE;
2206 		else if ((attrs & PGA_MODIFIED) == 0)
2207 			npte |= PG_FOW;
2208 
2209 		/*
2210 		 * Mapping was entered on PV list.
2211 		 */
2212 		npte |= PG_PVLIST;
2213 	}
2214 	if (wired)
2215 		npte |= PG_WIRED;
2216 #ifdef DEBUG
2217 	if (pmapdebug & PDB_ENTER)
2218 		printf("pmap_enter: new pte = 0x%lx\n", npte);
2219 #endif
2220 
2221 	/*
2222 	 * If the HW / PALcode portion of the new PTE is the same as the
2223 	 * old PTE, no TBI is necessary.
2224 	 */
2225 	if (opte & PG_V) {
2226 		tflush = PG_PALCODE(opte) != PG_PALCODE(npte);
2227 	}
2228 
2229 	/* Set the new PTE. */
2230 	atomic_store_relaxed(pte, npte);
2231 
2232 out:
2233 	PMAP_MAP_TO_HEAD_UNLOCK();
2234 
2235 	/*
2236 	 * Invalidate the TLB entry for this VA and any appropriate
2237 	 * caches.
2238 	 */
2239 	if (tflush) {
2240 		/* unlocks pmap */
2241 		pmap_enter_tlb_shootdown(pmap, va, opte, true);
2242 		if (pmap == pmap_kernel()) {
2243 			TLB_COUNT(reason_enter_kernel);
2244 		} else {
2245 			TLB_COUNT(reason_enter_user);
2246 		}
2247 	} else {
2248 		PMAP_UNLOCK(pmap);
2249 	}
2250 
2251 	if (opv)
2252 		pmap_pv_free(opv);
2253 
2254 	return error;
2255 }
2256 
2257 /*
2258  * pmap_kenter_pa:		[ INTERFACE ]
2259  *
2260  *	Enter a va -> pa mapping into the kernel pmap without any
2261  *	physical->virtual tracking.
2262  *
2263  *	Note: no locking is necessary in this function.
2264  */
2265 void
2266 pmap_kenter_pa(vaddr_t va, paddr_t pa, vm_prot_t prot, u_int flags)
2267 {
2268 	pmap_t const pmap = pmap_kernel();
2269 
2270 #ifdef DEBUG
2271 	if (pmapdebug & (PDB_FOLLOW|PDB_ENTER))
2272 		printf("pmap_kenter_pa(%lx, %lx, %x)\n",
2273 		    va, pa, prot);
2274 #endif
2275 
2276 	KASSERT(va >= VM_MIN_KERNEL_ADDRESS);
2277 
2278 	pt_entry_t * const pte = PMAP_KERNEL_PTE(va);
2279 
2280 	/* Build the new PTE. */
2281 	const pt_entry_t npte =
2282 	    ((pa >> PGSHIFT) << PG_SHIFT) | pte_prot(pmap_kernel(), prot) |
2283 	    PG_V | PG_WIRED;
2284 
2285 	/* Set the new PTE. */
2286 	const pt_entry_t opte = atomic_load_relaxed(pte);
2287 	atomic_store_relaxed(pte, npte);
2288 
2289 	PMAP_STAT_INCR(pmap->pm_stats.resident_count, 1);
2290 	PMAP_STAT_INCR(pmap->pm_stats.wired_count, 1);
2291 
2292 	/*
2293 	 * There should not have been anything here, previously,
2294 	 * so we can skip TLB shootdowns, etc. in the common case.
2295 	 */
2296 	if (__predict_false(opte & PG_V)) {
2297 		const pt_entry_t diff = npte ^ opte;
2298 
2299 		printf_nolog("%s: mapping already present\n", __func__);
2300 		PMAP_STAT_DECR(pmap->pm_stats.resident_count, 1);
2301 		if (diff & PG_WIRED)
2302 			PMAP_STAT_DECR(pmap->pm_stats.wired_count, 1);
2303 		/* XXX Can't handle this case. */
2304 		if (diff & PG_PVLIST)
2305 			panic("pmap_kenter_pa: old mapping was managed");
2306 
2307 		pmap_enter_tlb_shootdown(pmap_kernel(), va, opte, false);
2308 		TLB_COUNT(reason_kenter);
2309 	}
2310 }
2311 
2312 /*
2313  * pmap_kremove:		[ INTERFACE ]
2314  *
2315  *	Remove a mapping entered with pmap_kenter_pa() starting at va,
2316  *	for size bytes (assumed to be page rounded).
2317  */
2318 void
2319 pmap_kremove(vaddr_t va, vsize_t size)
2320 {
2321 	pt_entry_t *pte, opte;
2322 	pmap_t const pmap = pmap_kernel();
2323 	struct pmap_tlb_context tlbctx;
2324 	int count = 0;
2325 
2326 #ifdef DEBUG
2327 	if (pmapdebug & (PDB_FOLLOW|PDB_ENTER))
2328 		printf("pmap_kremove(%lx, %lx)\n",
2329 		    va, size);
2330 #endif
2331 
2332 	pmap_tlb_context_init(&tlbctx);
2333 
2334 	KASSERT(va >= VM_MIN_KERNEL_ADDRESS);
2335 
2336 	for (; size != 0; size -= PAGE_SIZE, va += PAGE_SIZE) {
2337 		pte = PMAP_KERNEL_PTE(va);
2338 		opte = atomic_load_relaxed(pte);
2339 		if (opte & PG_V) {
2340 			KASSERT((opte & PG_PVLIST) == 0);
2341 
2342 			/* Zap the mapping. */
2343 			atomic_store_relaxed(pte, PG_NV);
2344 			pmap_tlb_shootdown(pmap, va, opte, &tlbctx);
2345 
2346 			count++;
2347 		}
2348 	}
2349 
2350 	/* Update stats. */
2351 	if (__predict_true(count != 0)) {
2352 		PMAP_STAT_DECR(pmap->pm_stats.resident_count, count);
2353 		PMAP_STAT_DECR(pmap->pm_stats.wired_count, count);
2354 	}
2355 
2356 	pmap_tlb_shootnow(&tlbctx);
2357 	TLB_COUNT(reason_kremove);
2358 }
2359 
2360 /*
2361  * pmap_unwire:			[ INTERFACE ]
2362  *
2363  *	Clear the wired attribute for a map/virtual-address pair.
2364  *
2365  *	The mapping must already exist in the pmap.
2366  */
2367 void
2368 pmap_unwire(pmap_t pmap, vaddr_t va)
2369 {
2370 	pt_entry_t *pte;
2371 
2372 #ifdef DEBUG
2373 	if (pmapdebug & PDB_FOLLOW)
2374 		printf("pmap_unwire(%p, %lx)\n", pmap, va);
2375 #endif
2376 
2377 	PMAP_LOCK(pmap);
2378 
2379 	pte = pmap_l3pte(pmap, va, NULL);
2380 
2381 	KASSERT(pte != NULL);
2382 	KASSERT(pmap_pte_v(pte));
2383 
2384 	/*
2385 	 * If wiring actually changed (always?) clear the wire bit and
2386 	 * update the wire count.  Note that wiring is not a hardware
2387 	 * characteristic so there is no need to invalidate the TLB.
2388 	 */
2389 	if (pmap_pte_w_chg(pte, 0)) {
2390 		pmap_pte_set_w(pte, false);
2391 		PMAP_STAT_DECR(pmap->pm_stats.wired_count, 1);
2392 	}
2393 #ifdef DEBUG
2394 	else {
2395 		printf("pmap_unwire: wiring for pmap %p va 0x%lx "
2396 		    "didn't change!\n", pmap, va);
2397 	}
2398 #endif
2399 
2400 	PMAP_UNLOCK(pmap);
2401 }
2402 
2403 /*
2404  * pmap_extract:		[ INTERFACE ]
2405  *
2406  *	Extract the physical address associated with the given
2407  *	pmap/virtual address pair.
2408  */
2409 bool
2410 pmap_extract(pmap_t pmap, vaddr_t va, paddr_t *pap)
2411 {
2412 	pt_entry_t *l1pte, *l2pte, *l3pte;
2413 	paddr_t pa;
2414 
2415 #ifdef DEBUG
2416 	if (pmapdebug & PDB_FOLLOW)
2417 		printf("pmap_extract(%p, %lx) -> ", pmap, va);
2418 #endif
2419 
2420 	/*
2421 	 * Take a faster path for the kernel pmap.  Avoids locking,
2422 	 * handles K0SEG.
2423 	 */
2424 	if (__predict_true(pmap == pmap_kernel())) {
2425 		if (__predict_true(vtophys_internal(va, pap))) {
2426 #ifdef DEBUG
2427 			if (pmapdebug & PDB_FOLLOW)
2428 				printf("0x%lx (kernel vtophys)\n", pap != NULL ? *pap : 0UL);
2429 #endif
2430 			return true;
2431 		}
2432 #ifdef DEBUG
2433 		if (pmapdebug & PDB_FOLLOW)
2434 			printf("failed (kernel vtophys)\n");
2435 #endif
2436 		return false;
2437 	}
2438 
2439 	PMAP_LOCK(pmap);
2440 
2441 	l1pte = pmap_l1pte(pmap, va);
2442 	if (pmap_pte_v(l1pte) == 0)
2443 		goto out;
2444 
2445 	l2pte = pmap_l2pte(pmap, va, l1pte);
2446 	if (pmap_pte_v(l2pte) == 0)
2447 		goto out;
2448 
2449 	l3pte = pmap_l3pte(pmap, va, l2pte);
2450 	if (pmap_pte_v(l3pte) == 0)
2451 		goto out;
2452 
2453 	pa = pmap_pte_pa(l3pte) | (va & PGOFSET);
2454 	PMAP_UNLOCK(pmap);
2455 	if (pap != NULL)
2456 		*pap = pa;
2457 #ifdef DEBUG
2458 	if (pmapdebug & PDB_FOLLOW)
2459 		printf("0x%lx\n", pa);
2460 #endif
2461 	return (true);
2462 
2463  out:
2464 	PMAP_UNLOCK(pmap);
2465 #ifdef DEBUG
2466 	if (pmapdebug & PDB_FOLLOW)
2467 		printf("failed\n");
2468 #endif
2469 	return (false);
2470 }
2471 
2472 /*
2473  * pmap_copy:			[ INTERFACE ]
2474  *
2475  *	Copy the mapping range specified by src_addr/len
2476  *	from the source map to the range dst_addr/len
2477  *	in the destination map.
2478  *
2479  *	This routine is only advisory and need not do anything.
2480  */
2481 /* call deleted in <machine/pmap.h> */
2482 
2483 /*
2484  * pmap_update:			[ INTERFACE ]
2485  *
2486  *	Require that all active physical maps contain no
2487  *	incorrect entries NOW, by processing any deferred
2488  *	pmap operations.
2489  */
2490 /* call deleted in <machine/pmap.h> */
2491 
2492 /*
2493  * pmap_activate:		[ INTERFACE ]
2494  *
2495  *	Activate the pmap used by the specified process.  This includes
2496  *	reloading the MMU context of the current process, and marking
2497  *	the pmap in use by the processor.
2498  */
2499 void
2500 pmap_activate(struct lwp *l)
2501 {
2502 	struct pmap * const pmap = l->l_proc->p_vmspace->vm_map.pmap;
2503 	struct pcb * const pcb = lwp_getpcb(l);
2504 
2505 #ifdef DEBUG
2506 	if (pmapdebug & PDB_FOLLOW)
2507 		printf("pmap_activate(%p)\n", l);
2508 #endif
2509 
2510 	KASSERT(kpreempt_disabled());
2511 
2512 	struct cpu_info * const ci = curcpu();
2513 
2514 	KASSERT(l == ci->ci_curlwp);
2515 
2516 	u_long const old_ptbr = pcb->pcb_hw.apcb_ptbr;
2517 	u_int const old_asn = pcb->pcb_hw.apcb_asn;
2518 
2519 	/*
2520 	 * We hold the activation lock to synchronize with TLB shootdown.
2521 	 * The kernel pmap does not require this because shootdowns
2522 	 * for the kernel pmap are always sent to all CPUs.
2523 	 */
2524 	if (pmap != pmap_kernel()) {
2525 		PMAP_ACT_LOCK(pmap);
2526 		pcb->pcb_hw.apcb_asn = pmap_asn_alloc(pmap, ci);
2527 		atomic_or_ulong(&pmap->pm_cpus, (1UL << ci->ci_cpuid));
2528 	} else {
2529 		pcb->pcb_hw.apcb_asn = PMAP_ASN_KERNEL;
2530 	}
2531 	pcb->pcb_hw.apcb_ptbr =
2532 	    ALPHA_K0SEG_TO_PHYS((vaddr_t)pmap->pm_lev1map) >> PGSHIFT;
2533 
2534 	/*
2535 	 * Check to see if the ASN or page table base has changed; if
2536 	 * so, switch to our own context again so that it will take
2537 	 * effect.
2538 	 *
2539 	 * We test ASN first because it's the most likely value to change.
2540 	 */
2541 	if (old_asn != pcb->pcb_hw.apcb_asn ||
2542 	    old_ptbr != pcb->pcb_hw.apcb_ptbr) {
2543 		if (old_asn != pcb->pcb_hw.apcb_asn &&
2544 		    old_ptbr != pcb->pcb_hw.apcb_ptbr) {
2545 			TLB_COUNT(activate_both_change);
2546 		} else if (old_asn != pcb->pcb_hw.apcb_asn) {
2547 			TLB_COUNT(activate_asn_change);
2548 		} else {
2549 			TLB_COUNT(activate_ptbr_change);
2550 		}
2551 		(void) alpha_pal_swpctx((u_long)l->l_md.md_pcbpaddr);
2552 		TLB_COUNT(activate_swpctx);
2553 	} else {
2554 		TLB_COUNT(activate_skip_swpctx);
2555 	}
2556 
2557 	pmap_reference(pmap);
2558 	ci->ci_pmap = pmap;
2559 
2560 	if (pmap != pmap_kernel()) {
2561 		PMAP_ACT_UNLOCK(pmap);
2562 	}
2563 }
2564 
2565 /*
2566  * pmap_deactivate:		[ INTERFACE ]
2567  *
2568  *	Mark that the pmap used by the specified process is no longer
2569  *	in use by the processor.
2570  */
2571 void
2572 pmap_deactivate(struct lwp *l)
2573 {
2574 	struct pmap * const pmap = l->l_proc->p_vmspace->vm_map.pmap;
2575 
2576 #ifdef DEBUG
2577 	if (pmapdebug & PDB_FOLLOW)
2578 		printf("pmap_deactivate(%p)\n", l);
2579 #endif
2580 
2581 	KASSERT(kpreempt_disabled());
2582 
2583 	struct cpu_info * const ci = curcpu();
2584 
2585 	KASSERT(l == ci->ci_curlwp);
2586 	KASSERT(pmap == ci->ci_pmap);
2587 
2588 	/*
2589 	 * There is no need to switch to a different PTBR here,
2590 	 * because a pmap_activate() or SWPCTX is guaranteed
2591 	 * before whatever lev1map we're on now is invalidated
2592 	 * or before user space is accessed again.
2593 	 *
2594 	 * Because only kernel mappings will be accessed before the
2595 	 * next pmap_activate() call, we consider our CPU to be on
2596 	 * the kernel pmap.
2597 	 */
2598 	ci->ci_pmap = pmap_kernel();
2599 	KASSERT(atomic_load_relaxed(&pmap->pm_count) > 1);
2600 	pmap_destroy(pmap);
2601 }
2602 
2603 /*
2604  * pmap_zero_page:		[ INTERFACE ]
2605  *
2606  *	Zero the specified (machine independent) page by mapping the page
2607  *	into virtual memory and clearing its contents, one machine dependent
2608  *	page at a time.
2609  *
2610  *	Note: no locking is necessary in this function.
2611  */
2612 void
2613 pmap_zero_page(paddr_t phys)
2614 {
2615 	u_long *p0, *p1, *pend;
2616 
2617 #ifdef DEBUG
2618 	if (pmapdebug & PDB_FOLLOW)
2619 		printf("pmap_zero_page(%lx)\n", phys);
2620 #endif
2621 
2622 	p0 = (u_long *)ALPHA_PHYS_TO_K0SEG(phys);
2623 	p1 = NULL;
2624 	pend = (u_long *)((u_long)p0 + PAGE_SIZE);
2625 
2626 	/*
2627 	 * Unroll the loop a bit, doing 16 quadwords per iteration.
2628 	 * Do only 8 back-to-back stores, and alternate registers.
2629 	 */
2630 	do {
2631 		__asm volatile(
2632 		"# BEGIN loop body\n"
2633 		"	addq	%2, (8 * 8), %1		\n"
2634 		"	stq	$31, (0 * 8)(%0)	\n"
2635 		"	stq	$31, (1 * 8)(%0)	\n"
2636 		"	stq	$31, (2 * 8)(%0)	\n"
2637 		"	stq	$31, (3 * 8)(%0)	\n"
2638 		"	stq	$31, (4 * 8)(%0)	\n"
2639 		"	stq	$31, (5 * 8)(%0)	\n"
2640 		"	stq	$31, (6 * 8)(%0)	\n"
2641 		"	stq	$31, (7 * 8)(%0)	\n"
2642 		"					\n"
2643 		"	addq	%3, (8 * 8), %0		\n"
2644 		"	stq	$31, (0 * 8)(%1)	\n"
2645 		"	stq	$31, (1 * 8)(%1)	\n"
2646 		"	stq	$31, (2 * 8)(%1)	\n"
2647 		"	stq	$31, (3 * 8)(%1)	\n"
2648 		"	stq	$31, (4 * 8)(%1)	\n"
2649 		"	stq	$31, (5 * 8)(%1)	\n"
2650 		"	stq	$31, (6 * 8)(%1)	\n"
2651 		"	stq	$31, (7 * 8)(%1)	\n"
2652 		"	# END loop body"
2653 		: "=r" (p0), "=r" (p1)
2654 		: "0" (p0), "1" (p1)
2655 		: "memory");
2656 	} while (p0 < pend);
2657 }
2658 
2659 /*
2660  * pmap_copy_page:		[ INTERFACE ]
2661  *
2662  *	Copy the specified (machine independent) page by mapping the page
2663  *	into virtual memory and using memcpy to copy the page, one machine
2664  *	dependent page at a time.
2665  *
2666  *	Note: no locking is necessary in this function.
2667  */
2668 void
2669 pmap_copy_page(paddr_t src, paddr_t dst)
2670 {
2671 	const void *s;
2672 	void *d;
2673 
2674 #ifdef DEBUG
2675 	if (pmapdebug & PDB_FOLLOW)
2676 		printf("pmap_copy_page(%lx, %lx)\n", src, dst);
2677 #endif
2678 	s = (const void *)ALPHA_PHYS_TO_K0SEG(src);
2679 	d = (void *)ALPHA_PHYS_TO_K0SEG(dst);
2680 	memcpy(d, s, PAGE_SIZE);
2681 }
2682 
2683 /*
2684  * pmap_pageidlezero:		[ INTERFACE ]
2685  *
2686  *	Page zeroer for the idle loop.  Returns true if the
2687  *	page was zeroed, false if we aborted for some reason.
2688  */
2689 bool
2690 pmap_pageidlezero(paddr_t pa)
2691 {
2692 	u_long *ptr;
2693 	int i, cnt = PAGE_SIZE / sizeof(u_long);
2694 
2695 	for (i = 0, ptr = (u_long *) ALPHA_PHYS_TO_K0SEG(pa); i < cnt; i++) {
2696 		if (sched_curcpu_runnable_p()) {
2697 			/*
2698 			 * An LWP has become ready.  Abort now,
2699 			 * so we don't keep it waiting while we
2700 			 * finish zeroing the page.
2701 			 */
2702 			return (false);
2703 		}
2704 		*ptr++ = 0;
2705 	}
2706 
2707 	return (true);
2708 }
2709 
2710 /*
2711  * pmap_clear_modify:		[ INTERFACE ]
2712  *
2713  *	Clear the modify bits on the specified physical page.
2714  */
2715 bool
2716 pmap_clear_modify(struct vm_page *pg)
2717 {
2718 	struct vm_page_md * const md = VM_PAGE_TO_MD(pg);
2719 	bool rv = false;
2720 	kmutex_t *lock;
2721 	struct pmap_tlb_context tlbctx;
2722 
2723 #ifdef DEBUG
2724 	if (pmapdebug & PDB_FOLLOW)
2725 		printf("pmap_clear_modify(%p)\n", pg);
2726 #endif
2727 
2728 	pmap_tlb_context_init(&tlbctx);
2729 
2730 	PMAP_HEAD_TO_MAP_LOCK();
2731 	lock = pmap_pvh_lock(pg);
2732 	mutex_enter(lock);
2733 
2734 	if (md->pvh_attrs & PGA_MODIFIED) {
2735 		rv = true;
2736 		pmap_changebit(pg, PG_FOW, ~0UL, &tlbctx);
2737 		md->pvh_attrs &= ~PGA_MODIFIED;
2738 	}
2739 
2740 	mutex_exit(lock);
2741 	PMAP_HEAD_TO_MAP_UNLOCK();
2742 
2743 	pmap_tlb_shootnow(&tlbctx);
2744 	TLB_COUNT(reason_clear_modify);
2745 
2746 	return (rv);
2747 }
2748 
2749 /*
2750  * pmap_clear_reference:	[ INTERFACE ]
2751  *
2752  *	Clear the reference bit on the specified physical page.
2753  */
2754 bool
2755 pmap_clear_reference(struct vm_page *pg)
2756 {
2757 	struct vm_page_md * const md = VM_PAGE_TO_MD(pg);
2758 	bool rv = false;
2759 	kmutex_t *lock;
2760 	struct pmap_tlb_context tlbctx;
2761 
2762 #ifdef DEBUG
2763 	if (pmapdebug & PDB_FOLLOW)
2764 		printf("pmap_clear_reference(%p)\n", pg);
2765 #endif
2766 
2767 	pmap_tlb_context_init(&tlbctx);
2768 
2769 	PMAP_HEAD_TO_MAP_LOCK();
2770 	lock = pmap_pvh_lock(pg);
2771 	mutex_enter(lock);
2772 
2773 	if (md->pvh_attrs & PGA_REFERENCED) {
2774 		rv = true;
2775 		pmap_changebit(pg, PG_FOR | PG_FOW | PG_FOE, ~0UL, &tlbctx);
2776 		md->pvh_attrs &= ~PGA_REFERENCED;
2777 	}
2778 
2779 	mutex_exit(lock);
2780 	PMAP_HEAD_TO_MAP_UNLOCK();
2781 
2782 	pmap_tlb_shootnow(&tlbctx);
2783 	TLB_COUNT(reason_clear_reference);
2784 
2785 	return (rv);
2786 }
2787 
2788 /*
2789  * pmap_is_referenced:		[ INTERFACE ]
2790  *
2791  *	Return whether or not the specified physical page is referenced
2792  *	by any physical maps.
2793  */
2794 /* See <machine/pmap.h> */
2795 
2796 /*
2797  * pmap_is_modified:		[ INTERFACE ]
2798  *
2799  *	Return whether or not the specified physical page is modified
2800  *	by any physical maps.
2801  */
2802 /* See <machine/pmap.h> */
2803 
2804 /*
2805  * pmap_phys_address:		[ INTERFACE ]
2806  *
2807  *	Return the physical address corresponding to the specified
2808  *	cookie.  Used by the device pager to decode a device driver's
2809  *	mmap entry point return value.
2810  *
2811  *	Note: no locking is necessary in this function.
2812  */
2813 paddr_t
2814 pmap_phys_address(paddr_t ppn)
2815 {
2816 
2817 	return (alpha_ptob(ppn));
2818 }
2819 
2820 /*
2821  * Miscellaneous support routines follow
2822  */
2823 
2824 /*
2825  * alpha_protection_init:
2826  *
2827  *	Initialize Alpha protection code array.
2828  *
2829  *	Note: no locking is necessary in this function.
2830  */
2831 static void
2832 alpha_protection_init(void)
2833 {
2834 	int prot, *kp, *up;
2835 
2836 	kp = protection_codes[0];
2837 	up = protection_codes[1];
2838 
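	/*
	 * protection_codes[0] holds the kernel-mode PTE protection bits
	 * (always including PG_ASM), protection_codes[1] the user-mode
	 * bits; each array is indexed by a VM_PROT_* combination.
	 * Mappings without execute permission get PG_FOE so that
	 * instruction fetches fault and can be rejected in
	 * pmap_emulate_reference().
	 */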
2839 	for (prot = 0; prot < 8; prot++) {
2840 		kp[prot] = PG_ASM;
2841 		up[prot] = 0;
2842 
2843 		if (prot & VM_PROT_READ) {
2844 			kp[prot] |= PG_KRE;
2845 			up[prot] |= PG_KRE | PG_URE;
2846 		}
2847 		if (prot & VM_PROT_WRITE) {
2848 			kp[prot] |= PG_KWE;
2849 			up[prot] |= PG_KWE | PG_UWE;
2850 		}
2851 		if (prot & VM_PROT_EXECUTE) {
2852 			kp[prot] |= PG_EXEC | PG_KRE;
2853 			up[prot] |= PG_EXEC | PG_KRE | PG_URE;
2854 		} else {
2855 			kp[prot] |= PG_FOE;
2856 			up[prot] |= PG_FOE;
2857 		}
2858 	}
2859 }
2860 
2861 /*
2862  * pmap_remove_mapping:
2863  *
2864  *	Invalidate a single page denoted by pmap/va.
2865  *
2866  *	If (pte != NULL), it is the already computed PTE for the page.
2867  *
2868  *	Note: locking in this function is complicated by the fact
2869 	 *	that we can be called when the PV list is already locked
2870 	 *	(by pmap_page_protect()).  In this case, the caller must be
2871  *	careful to get the next PV entry while we remove this entry
2872  *	from beneath it.  We assume that the pmap itself is already
2873  *	locked; dolock applies only to the PV list.
2874  *
2875  *	Returns important PTE bits that the caller needs to check for
2876  *	TLB / I-stream invalidation purposes.
2877  */
2878 static pt_entry_t
2879 pmap_remove_mapping(pmap_t pmap, vaddr_t va, pt_entry_t *pte,
2880     bool dolock, pv_entry_t *opvp, struct pmap_tlb_context * const tlbctx)
2881 {
2882 	pt_entry_t opte;
2883 	paddr_t pa;
2884 	struct vm_page *pg;		/* if != NULL, page is managed */
2885 
2886 #ifdef DEBUG
2887 	if (pmapdebug & (PDB_FOLLOW|PDB_REMOVE|PDB_PROTECT))
2888 		printf("pmap_remove_mapping(%p, %lx, %p, %d, %p)\n",
2889 		       pmap, va, pte, dolock, opvp);
2890 #endif
2891 
2892 	/*
2893 	 * PTE not provided, compute it from pmap and va.
2894 	 */
2895 	if (pte == NULL) {
2896 		pte = pmap_l3pte(pmap, va, NULL);
2897 		if (pmap_pte_v(pte) == 0)
2898 			return 0;
2899 	}
2900 
2901 	opte = *pte;
2902 
2903 	pa = PG_PFNUM(opte) << PGSHIFT;
2904 
2905 	/*
2906 	 * Update statistics
2907 	 */
2908 	if (pmap_pte_w(pte))
2909 		PMAP_STAT_DECR(pmap->pm_stats.wired_count, 1);
2910 	PMAP_STAT_DECR(pmap->pm_stats.resident_count, 1);
2911 
2912 	/*
2913 	 * Invalidate the PTE after saving the reference modify info.
2914 	 */
2915 #ifdef DEBUG
2916 	if (pmapdebug & PDB_REMOVE)
2917 		printf("remove: invalidating pte at %p\n", pte);
2918 #endif
2919 	atomic_store_relaxed(pte, PG_NV);
2920 
2921 	/*
2922 	 * If we're removing a user mapping, check to see if we
2923 	 * can free page table pages.
2924 	 */
2925 	if (pmap != pmap_kernel()) {
2926 		/*
2927 		 * Delete the reference on the level 3 table.  It will
2928 		 * delete references on the level 2 and 1 tables as
2929 		 * appropriate.
2930 		 */
2931 		pmap_l3pt_delref(pmap, va, pte, tlbctx);
2932 	}
2933 
2934 	if (opte & PG_PVLIST) {
2935 		/*
2936 		 * Remove it from the PV table.
2937 		 */
2938 		pg = PHYS_TO_VM_PAGE(pa);
2939 		KASSERT(pg != NULL);
2940 		pmap_pv_remove(pmap, pg, va, dolock, opvp);
2941 		KASSERT(opvp == NULL || *opvp != NULL);
2942 	}
2943 
2944 	return opte & (PG_V | PG_ASM | PG_EXEC);
2945 }
2946 
2947 /*
2948  * pmap_changebit:
2949  *
2950  *	Set or clear the specified PTE bits for all mappings on the
2951  *	specified page.
2952  *
2953  *	Note: we assume that the pv_head is already locked, and that
2954  *	the caller has acquired a PV->pmap mutex so that we can lock
2955  *	the pmaps as we encounter them.
2956  */
2957 static void
2958 pmap_changebit(struct vm_page *pg, pt_entry_t set, pt_entry_t mask,
2959     struct pmap_tlb_context * const tlbctx)
2960 {
2961 	struct vm_page_md * const md = VM_PAGE_TO_MD(pg);
2962 	pv_entry_t pv;
2963 	pt_entry_t *pte, npte, opte;
2964 
2965 #ifdef DEBUG
2966 	if (pmapdebug & PDB_BITS)
2967 		printf("pmap_changebit(%p, 0x%lx, 0x%lx)\n",
2968 		    pg, set, mask);
2969 #endif
2970 
2971 	/*
2972 	 * Loop over all current mappings, setting/clearing as appropriate.
2973 	 */
2974 	for (pv = md->pvh_list; pv != NULL; pv = pv->pv_next) {
2975 		PMAP_LOCK(pv->pv_pmap);
2976 
2977 		pte = pv->pv_pte;
2978 
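		/*
		 * The new PTE is (old | set) & mask: bits in 'set' are
		 * turned on, bits clear in 'mask' are turned off.  Only
		 * write the PTE and queue a shootdown if it actually
		 * changed.
		 */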
2979 		opte = atomic_load_relaxed(pte);
2980 		npte = (opte | set) & mask;
2981 		if (npte != opte) {
2982 			atomic_store_relaxed(pte, npte);
2983 			pmap_tlb_shootdown_pv(pv, opte, tlbctx);
2984 		}
2985 		PMAP_UNLOCK(pv->pv_pmap);
2986 	}
2987 }
2988 
2989 /*
2990  * pmap_emulate_reference:
2991  *
2992  *	Emulate reference and/or modified bit hits.
2993  *	Return 1 if this was an execute fault on a non-exec mapping,
2994  *	otherwise return 0.
2995  */
2996 int
2997 pmap_emulate_reference(struct lwp *l, vaddr_t v, int user, int type)
2998 {
2999 	struct pmap *pmap = l->l_proc->p_vmspace->vm_map.pmap;
3000 	pt_entry_t faultoff, *pte;
3001 	struct vm_page *pg;
3002 	paddr_t pa;
3003 	bool didlock = false;
3004 	bool exec = false;
3005 	kmutex_t *lock;
3006 
3007 #ifdef DEBUG
3008 	if (pmapdebug & PDB_FOLLOW)
3009 		printf("pmap_emulate_reference: %p, 0x%lx, %d, %d\n",
3010 		    l, v, user, type);
3011 #endif
3012 
3013 	/*
3014 	 * Convert process and virtual address to physical address.
3015 	 */
3016 	if (v >= VM_MIN_KERNEL_ADDRESS) {
3017 		if (user)
3018 			panic("pmap_emulate_reference: user ref to kernel");
3019 		/*
3020 		 * No need to lock here; kernel PT pages never go away.
3021 		 */
3022 		pte = PMAP_KERNEL_PTE(v);
3023 	} else {
3024 #ifdef DIAGNOSTIC
3025 		if (l == NULL)
3026 			panic("pmap_emulate_reference: bad proc");
3027 		if (l->l_proc->p_vmspace == NULL)
3028 			panic("pmap_emulate_reference: bad p_vmspace");
3029 #endif
3030 		PMAP_LOCK(pmap);
3031 		didlock = true;
3032 		pte = pmap_l3pte(pmap, v, NULL);
3033 		/*
3034 		 * We'll unlock below where we're done with the PTE.
3035 		 */
3036 	}
3037 	exec = pmap_pte_exec(pte);
3038 	if (!exec && type == ALPHA_MMCSR_FOE) {
3039 		if (didlock)
3040 			PMAP_UNLOCK(pmap);
3041 		return (1);
3042 	}
3043 #ifdef DEBUG
3044 	if (pmapdebug & PDB_FOLLOW) {
3045 		printf("\tpte = %p, ", pte);
3046 		printf("*pte = 0x%lx\n", *pte);
3047 	}
3048 #endif
3049 
3050 	pa = pmap_pte_pa(pte);
3051 
3052 	/*
3053 	 * We're now done with the PTE.  If it was a user pmap, unlock
3054 	 * it now.
3055 	 */
3056 	if (didlock)
3057 		PMAP_UNLOCK(pmap);
3058 
3059 #ifdef DEBUG
3060 	if (pmapdebug & PDB_FOLLOW)
3061 		printf("\tpa = 0x%lx\n", pa);
3062 #endif
3063 #ifdef DIAGNOSTIC
3064 	if (!uvm_pageismanaged(pa))
3065 		panic("pmap_emulate_reference(%p, 0x%lx, %d, %d): "
3066 		      "pa 0x%lx not managed", l, v, user, type, pa);
3067 #endif
3068 
3069 	/*
3070 	 * Twiddle the appropriate bits to reflect the reference
3071 	 * and/or modification.
3072 	 *
3073 	 * The rules:
3074 	 * 	(1) always mark page as used, and
3075 	 *	(2) if it was a write fault, mark page as modified.
3076 	 */
3077 	pg = PHYS_TO_VM_PAGE(pa);
3078 	struct vm_page_md * const md = VM_PAGE_TO_MD(pg);
3079 	struct pmap_tlb_context tlbctx;
3080 
3081 	pmap_tlb_context_init(&tlbctx);
3082 
3083 	PMAP_HEAD_TO_MAP_LOCK();
3084 	lock = pmap_pvh_lock(pg);
3085 	mutex_enter(lock);
3086 
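	/*
	 * Compute which fault-on bits to clear now that the reference
	 * and/or modify state has been recorded: PG_FOR always, PG_FOW
	 * as well for a write fault, and PG_FOE only for a non-write
	 * fault on an executable mapping, so that execute references
	 * to non-executable pages continue to fault.
	 */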
3087 	if (type == ALPHA_MMCSR_FOW) {
3088 		md->pvh_attrs |= (PGA_REFERENCED|PGA_MODIFIED);
3089 		faultoff = PG_FOR | PG_FOW;
3090 	} else {
3091 		md->pvh_attrs |= PGA_REFERENCED;
3092 		faultoff = PG_FOR;
3093 		if (exec) {
3094 			faultoff |= PG_FOE;
3095 		}
3096 	}
3097 	pmap_changebit(pg, 0, ~faultoff, &tlbctx);
3098 
3099 	mutex_exit(lock);
3100 	PMAP_HEAD_TO_MAP_UNLOCK();
3101 
3102 	pmap_tlb_shootnow(&tlbctx);
3103 	TLB_COUNT(reason_emulate_reference);
3104 
3105 	return (0);
3106 }
3107 
3108 #ifdef DEBUG
3109 /*
3110  * pmap_pv_dump:
3111  *
3112  *	Dump the physical->virtual data for the specified page.
3113  */
3114 void
3115 pmap_pv_dump(paddr_t pa)
3116 {
3117 	struct vm_page *pg;
3118 	struct vm_page_md *md;
3119 	pv_entry_t pv;
3120 	kmutex_t *lock;
3121 
3122 	pg = PHYS_TO_VM_PAGE(pa);
3123 	md = VM_PAGE_TO_MD(pg);
3124 
3125 	lock = pmap_pvh_lock(pg);
3126 	mutex_enter(lock);
3127 
3128 	printf("pa 0x%lx (attrs = 0x%x):\n", pa, md->pvh_attrs);
3129 	for (pv = md->pvh_list; pv != NULL; pv = pv->pv_next)
3130 		printf("     pmap %p, va 0x%lx\n",
3131 		    pv->pv_pmap, pv->pv_va);
3132 	printf("\n");
3133 
3134 	mutex_exit(lock);
3135 }
3136 #endif
3137 
3138 /*
3139  * vtophys:
3140  *
3141  *	Return the physical address corresponding to the K0SEG or
3142  *	K1SEG address provided.
3143  *
3144  *	Note: no locking is necessary in this function.
3145  */
3146 static bool
3147 vtophys_internal(vaddr_t const vaddr, paddr_t * const pap)
3148 {
3149 	paddr_t pa;
3150 
3151 	KASSERT(vaddr >= ALPHA_K0SEG_BASE);
3152 
3153 	if (vaddr <= ALPHA_K0SEG_END) {
3154 		pa = ALPHA_K0SEG_TO_PHYS(vaddr);
3155 	} else {
3156 		pt_entry_t * const pte = PMAP_KERNEL_PTE(vaddr);
3157 		if (__predict_false(! pmap_pte_v(pte))) {
3158 			return false;
3159 		}
3160 		pa = pmap_pte_pa(pte) | (vaddr & PGOFSET);
3161 	}
3162 
3163 	if (pap != NULL) {
3164 		*pap = pa;
3165 	}
3166 
3167 	return true;
3168 }
3169 
3170 paddr_t
3171 vtophys(vaddr_t const vaddr)
3172 {
3173 	paddr_t pa;
3174 
3175 	if (__predict_false(! vtophys_internal(vaddr, &pa)))
3176 		pa = 0;
3177 	return pa;
3178 }
3179 
3180 /******************** pv_entry management ********************/
3181 
3182 /*
3183  * pmap_pv_enter:
3184  *
3185  *	Add a physical->virtual entry to the pv_table.
3186  */
3187 static int
3188 pmap_pv_enter(pmap_t pmap, struct vm_page *pg, vaddr_t va, pt_entry_t *pte,
3189     bool dolock, pv_entry_t newpv)
3190 {
3191 	struct vm_page_md * const md = VM_PAGE_TO_MD(pg);
3192 	kmutex_t *lock;
3193 
3194 	/*
3195 	 * Allocate and fill in the new pv_entry.
3196 	 */
3197 	if (newpv == NULL) {
3198 		newpv = pmap_pv_alloc();
3199 		if (newpv == NULL)
3200 			return ENOMEM;
3201 	}
3202 	newpv->pv_va = va;
3203 	newpv->pv_pmap = pmap;
3204 	newpv->pv_pte = pte;
3205 
3206 	if (dolock) {
3207 		lock = pmap_pvh_lock(pg);
3208 		mutex_enter(lock);
3209 	}
3210 
3211 #ifdef DEBUG
3212     {
3213 	pv_entry_t pv;
3214 	/*
3215 	 * Make sure the entry doesn't already exist.
3216 	 */
3217 	for (pv = md->pvh_list; pv != NULL; pv = pv->pv_next) {
3218 		if (pmap == pv->pv_pmap && va == pv->pv_va) {
3219 			printf("pmap = %p, va = 0x%lx\n", pmap, va);
3220 			panic("pmap_pv_enter: already in pv table");
3221 		}
3222 	}
3223     }
3224 #endif
3225 
3226 	/*
3227 	 * ...and put it in the list.
3228 	 */
3229 	newpv->pv_next = md->pvh_list;
3230 	md->pvh_list = newpv;
3231 
3232 	if (dolock) {
3233 		mutex_exit(lock);
3234 	}
3235 
3236 	return 0;
3237 }
3238 
3239 /*
3240  * pmap_pv_remove:
3241  *
3242  *	Remove a physical->virtual entry from the pv_table.
3243  */
3244 static void
3245 pmap_pv_remove(pmap_t pmap, struct vm_page *pg, vaddr_t va, bool dolock,
3246 	pv_entry_t *opvp)
3247 {
3248 	struct vm_page_md * const md = VM_PAGE_TO_MD(pg);
3249 	pv_entry_t pv, *pvp;
3250 	kmutex_t *lock;
3251 
3252 	if (dolock) {
3253 		lock = pmap_pvh_lock(pg);
3254 		mutex_enter(lock);
3255 	} else {
3256 		lock = NULL; /* XXX stupid gcc */
3257 	}
3258 
3259 	/*
3260 	 * Find the entry to remove.
3261 	 */
3262 	for (pvp = &md->pvh_list, pv = *pvp;
3263 	     pv != NULL; pvp = &pv->pv_next, pv = *pvp)
3264 		if (pmap == pv->pv_pmap && va == pv->pv_va)
3265 			break;
3266 
3267 #ifdef DEBUG
3268 	if (pv == NULL)
3269 		panic("pmap_pv_remove: not in pv table");
3270 #endif
3271 
3272 	*pvp = pv->pv_next;
3273 
3274 	if (dolock) {
3275 		mutex_exit(lock);
3276 	}
3277 
3278 	if (opvp != NULL)
3279 		*opvp = pv;
3280 	else
3281 		pmap_pv_free(pv);
3282 }
3283 
3284 /*
3285  * pmap_pv_page_alloc:
3286  *
3287  *	Allocate a page for the pv_entry pool.
3288  */
3289 static void *
3290 pmap_pv_page_alloc(struct pool *pp, int flags)
3291 {
3292 	paddr_t pg;
3293 
3294 	if (pmap_physpage_alloc(PGU_PVENT, &pg))
3295 		return ((void *)ALPHA_PHYS_TO_K0SEG(pg));
3296 	return (NULL);
3297 }
3298 
3299 /*
3300  * pmap_pv_page_free:
3301  *
3302  *	Free a pv_entry pool page.
3303  */
3304 static void
3305 pmap_pv_page_free(struct pool *pp, void *v)
3306 {
3307 
3308 	pmap_physpage_free(ALPHA_K0SEG_TO_PHYS((vaddr_t)v));
3309 }
3310 
3311 /******************** misc. functions ********************/
3312 
3313 /*
3314  * pmap_physpage_alloc:
3315  *
3316  *	Allocate a single page from the VM system and return the
3317  *	physical address for that page.
3318  */
3319 static bool
3320 pmap_physpage_alloc(int usage, paddr_t *pap)
3321 {
3322 	struct vm_page *pg;
3323 	paddr_t pa;
3324 
3325 	/*
3326 	 * Don't ask for a zero'd page in the L1PT case -- we will
3327 	 * properly initialize it in the constructor.
3328 	 */
3329 
3330 	pg = uvm_pagealloc(NULL, 0, NULL, usage == PGU_L1PT ?
3331 	    UVM_PGA_USERESERVE : UVM_PGA_USERESERVE|UVM_PGA_ZERO);
3332 	if (pg != NULL) {
3333 		pa = VM_PAGE_TO_PHYS(pg);
3334 #ifdef DEBUG
3335 		struct vm_page_md * const md = VM_PAGE_TO_MD(pg);
3336 		if (md->pvh_refcnt != 0) {
3337 			printf("pmap_physpage_alloc: page 0x%lx has "
3338 			    "%d references\n", pa, md->pvh_refcnt);
3339 			panic("pmap_physpage_alloc");
3340 		}
3341 #endif
3342 		*pap = pa;
3343 		return (true);
3344 	}
3345 	return (false);
3346 }
3347 
3348 /*
3349  * pmap_physpage_free:
3350  *
3351  *	Free the single page table page at the specified physical address.
3352  */
3353 static void
3354 pmap_physpage_free(paddr_t pa)
3355 {
3356 	struct vm_page *pg;
3357 
3358 	if ((pg = PHYS_TO_VM_PAGE(pa)) == NULL)
3359 		panic("pmap_physpage_free: bogus physical page address");
3360 
3361 #ifdef DEBUG
3362 	struct vm_page_md * const md = VM_PAGE_TO_MD(pg);
3363 	if (md->pvh_refcnt != 0)
3364 		panic("pmap_physpage_free: page still has references");
3365 #endif
3366 
3367 	uvm_pagefree(pg);
3368 }
3369 
3370 /*
3371  * pmap_physpage_addref:
3372  *
3373  *	Add a reference to the specified special use page.
3374  */
3375 static int
3376 pmap_physpage_addref(void *kva)
3377 {
3378 	struct vm_page *pg;
3379 	struct vm_page_md *md;
3380 	paddr_t pa;
3381 
3382 	pa = ALPHA_K0SEG_TO_PHYS(trunc_page((vaddr_t)kva));
3383 	pg = PHYS_TO_VM_PAGE(pa);
3384 	md = VM_PAGE_TO_MD(pg);
3385 
3386 	KASSERT((int)md->pvh_refcnt >= 0);
3387 
3388 	return atomic_inc_uint_nv(&md->pvh_refcnt);
3389 }
3390 
3391 /*
3392  * pmap_physpage_delref:
3393  *
3394  *	Delete a reference to the specified special use page.
3395  */
3396 static int
3397 pmap_physpage_delref(void *kva)
3398 {
3399 	struct vm_page *pg;
3400 	struct vm_page_md *md;
3401 	paddr_t pa;
3402 
3403 	pa = ALPHA_K0SEG_TO_PHYS(trunc_page((vaddr_t)kva));
3404 	pg = PHYS_TO_VM_PAGE(pa);
3405 	md = VM_PAGE_TO_MD(pg);
3406 
3407 	KASSERT((int)md->pvh_refcnt > 0);
3408 
3409 	return atomic_dec_uint_nv(&md->pvh_refcnt);
3410 }
3411 
3412 /******************** page table page management ********************/
3413 
3414 static bool
3415 pmap_kptpage_alloc(paddr_t *pap)
3416 {
3417 	if (uvm.page_init_done == false) {
3418 		/*
3419 		 * We're growing the kernel pmap early (from
3420 		 * uvm_pageboot_alloc()).  This case must
3421 		 * be handled a little differently.
3422 		 */
3423 		*pap = ALPHA_K0SEG_TO_PHYS(
3424 		    pmap_steal_memory(PAGE_SIZE, NULL, NULL));
3425 		return true;
3426 	}
3427 
3428 	return pmap_physpage_alloc(PGU_NORMAL, pap);
3429 }
3430 
3431 /*
3432  * pmap_growkernel:		[ INTERFACE ]
3433  *
3434  *	Grow the kernel address space.  This is a hint from the
3435  *	upper layer to pre-allocate more kernel PT pages.
3436  */
3437 vaddr_t
3438 pmap_growkernel(vaddr_t maxkvaddr)
3439 {
3440 	struct pmap *kpm = pmap_kernel(), *pm;
3441 	paddr_t ptaddr;
3442 	pt_entry_t *l1pte, *l2pte, pte;
3443 	vaddr_t va;
3444 	int l1idx;
3445 
3446 	rw_enter(&pmap_growkernel_lock, RW_WRITER);
3447 
3448 	if (maxkvaddr <= virtual_end)
3449 		goto out;		/* we are OK */
3450 
3451 	va = virtual_end;
3452 
3453 	while (va < maxkvaddr) {
3454 		/*
3455 		 * If there is no valid L1 PTE (i.e. no L2 PT page),
3456 		 * allocate a new L2 PT page and insert it into the
3457 		 * L1 map.
3458 		 */
3459 		l1pte = pmap_l1pte(kpm, va);
3460 		if (pmap_pte_v(l1pte) == 0) {
3461 			if (!pmap_kptpage_alloc(&ptaddr))
3462 				goto die;
3463 			pte = (atop(ptaddr) << PG_SHIFT) |
3464 			    PG_V | PG_ASM | PG_KRE | PG_KWE | PG_WIRED;
3465 			*l1pte = pte;
3466 
3467 			l1idx = l1pte_index(va);
3468 
3469 			/* Update all the user pmaps. */
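			/*
			 * Each user level 1 table contains its own copy of
			 * the kernel L1 PTEs (see pmap_l1pt_ctor()), so the
			 * new entry must be propagated to every user pmap.
			 */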
3470 			mutex_enter(&pmap_all_pmaps_lock);
3471 			for (pm = TAILQ_FIRST(&pmap_all_pmaps);
3472 			     pm != NULL; pm = TAILQ_NEXT(pm, pm_list)) {
3473 				/* Skip the kernel pmap. */
3474 				if (pm == pmap_kernel())
3475 					continue;
3476 
3477 				/*
3478 				 * Any pmaps published on the global list
3479 				 * should never be referencing kernel_lev1map.
3480 				 */
3481 				KASSERT(pm->pm_lev1map != kernel_lev1map);
3482 
3483 				PMAP_LOCK(pm);
3484 				pm->pm_lev1map[l1idx] = pte;
3485 				PMAP_UNLOCK(pm);
3486 			}
3487 			mutex_exit(&pmap_all_pmaps_lock);
3488 		}
3489 
3490 		/*
3491 		 * Have an L2 PT page now, add the L3 PT page.
3492 		 */
3493 		l2pte = pmap_l2pte(kpm, va, l1pte);
3494 		KASSERT(pmap_pte_v(l2pte) == 0);
3495 		if (!pmap_kptpage_alloc(&ptaddr))
3496 			goto die;
3497 		*l2pte = (atop(ptaddr) << PG_SHIFT) |
3498 		    PG_V | PG_ASM | PG_KRE | PG_KWE | PG_WIRED;
3499 		va += ALPHA_L2SEG_SIZE;
3500 	}
3501 
3502 	/* Invalidate the L1 PT cache. */
3503 	pool_cache_invalidate(&pmap_l1pt_cache);
3504 
3505 	virtual_end = va;
3506 
3507  out:
3508 	rw_exit(&pmap_growkernel_lock);
3509 
3510 	return (virtual_end);
3511 
3512  die:
3513 	panic("pmap_growkernel: out of memory");
3514 }
3515 
3516 /*
3517  * pmap_l1pt_ctor:
3518  *
3519  *	Pool cache constructor for L1 PT pages.
3520  *
3521  *	Note: The growkernel lock is held across allocations
3522  *	from our pool_cache, so we don't need to acquire it
3523  *	ourselves.
3524  */
3525 static int
3526 pmap_l1pt_ctor(void *arg, void *object, int flags)
3527 {
3528 	pt_entry_t *l1pt = object, pte;
3529 	int i;
3530 
3531 	/*
3532 	 * Initialize the new level 1 table by zeroing the
3533 	 * user portion and copying the kernel mappings into
3534 	 * the kernel portion.
3535 	 */
3536 	for (i = 0; i < l1pte_index(VM_MIN_KERNEL_ADDRESS); i++)
3537 		l1pt[i] = 0;
3538 
3539 	for (i = l1pte_index(VM_MIN_KERNEL_ADDRESS);
3540 	     i <= l1pte_index(VM_MAX_KERNEL_ADDRESS); i++)
3541 		l1pt[i] = kernel_lev1map[i];
3542 
3543 	/*
3544 	 * Now, map the new virtual page table.  NOTE: NO ASM!
3545 	 */
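	/*
	 * (The VPT mapping is different for every address space, so it
	 * must not be entered with PG_ASM; otherwise a stale virtual
	 * page table translation could survive a context switch.)
	 */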
3546 	pte = ((ALPHA_K0SEG_TO_PHYS((vaddr_t) l1pt) >> PGSHIFT) << PG_SHIFT) |
3547 	    PG_V | PG_KRE | PG_KWE;
3548 	l1pt[l1pte_index(VPTBASE)] = pte;
3549 
3550 	return (0);
3551 }
3552 
3553 /*
3554  * pmap_l1pt_alloc:
3555  *
3556  *	Page allocator for L1 PT pages.
3557  */
3558 static void *
3559 pmap_l1pt_alloc(struct pool *pp, int flags)
3560 {
3561 	paddr_t ptpa;
3562 
3563 	/*
3564 	 * Attempt to allocate a free page.
3565 	 */
3566 	if (pmap_physpage_alloc(PGU_L1PT, &ptpa) == false)
3567 		return (NULL);
3568 
3569 	return ((void *) ALPHA_PHYS_TO_K0SEG(ptpa));
3570 }
3571 
3572 /*
3573  * pmap_l1pt_free:
3574  *
3575  *	Page freer for L1 PT pages.
3576  */
3577 static void
3578 pmap_l1pt_free(struct pool *pp, void *v)
3579 {
3580 
3581 	pmap_physpage_free(ALPHA_K0SEG_TO_PHYS((vaddr_t) v));
3582 }
3583 
3584 /*
3585  * pmap_ptpage_alloc:
3586  *
3587  *	Allocate a level 2 or level 3 page table page for a user
3588  *	pmap, and initialize the PTE that references it.
3589  *
3590  *	Note: the pmap must already be locked.
3591  */
3592 static int
3593 pmap_ptpage_alloc(pt_entry_t * const pte, int const usage)
3594 {
3595 	paddr_t ptpa;
3596 
3597 	/*
3598 	 * Allocate the page table page.
3599 	 */
3600 	if (pmap_physpage_alloc(usage, &ptpa) == false)
3601 		return (ENOMEM);
3602 
3603 	/*
3604 	 * Initialize the referencing PTE.
3605 	 */
3606 	const pt_entry_t npte = ((ptpa >> PGSHIFT) << PG_SHIFT) |
3607 	    PG_V | PG_KRE | PG_KWE | PG_WIRED;
3608 
3609 	atomic_store_relaxed(pte, npte);
3610 
3611 	return (0);
3612 }
3613 
3614 /*
3615  * pmap_ptpage_free:
3616  *
3617  *	Free the level 2 or level 3 page table page referenced
3618  *	by the provided PTE.
3619  *
3620  *	Note: the pmap must already be locked.
3621  */
3622 static void
3623 pmap_ptpage_free(pt_entry_t * const pte, struct pmap_tlb_context * const tlbctx)
3624 {
3625 
3626 	/*
3627 	 * Extract the physical address of the page from the PTE
3628 	 * and clear the entry.
3629 	 */
3630 	const paddr_t ptpa = pmap_pte_pa(pte);
3631 	atomic_store_relaxed(pte, PG_NV);
3632 
3633 #ifdef DEBUG
3634 	pmap_zero_page(ptpa);
3635 #endif
3636 	pmap_tlb_physpage_free(ptpa, tlbctx);
3637 }
3638 
3639 /*
3640  * pmap_l3pt_delref:
3641  *
3642  *	Delete a reference on a level 3 PT page.  If the reference drops
3643  *	to zero, free it.
3644  *
3645  *	Note: the pmap must already be locked.
3646  */
3647 static void
3648 pmap_l3pt_delref(pmap_t pmap, vaddr_t va, pt_entry_t *l3pte,
3649     struct pmap_tlb_context * const tlbctx)
3650 {
3651 	pt_entry_t *l1pte, *l2pte;
3652 
3653 	l1pte = pmap_l1pte(pmap, va);
3654 	l2pte = pmap_l2pte(pmap, va, l1pte);
3655 
3656 #ifdef DIAGNOSTIC
3657 	if (pmap == pmap_kernel())
3658 		panic("pmap_l3pt_delref: kernel pmap");
3659 #endif
3660 
3661 	if (pmap_physpage_delref(l3pte) == 0) {
3662 		/*
3663 		 * No more mappings; we can free the level 3 table.
3664 		 */
3665 #ifdef DEBUG
3666 		if (pmapdebug & PDB_PTPAGE)
3667 			printf("pmap_l3pt_delref: freeing level 3 table at "
3668 			    "0x%lx\n", pmap_pte_pa(l2pte));
3669 #endif
3670 		/*
3671 		 * You can pass NULL if you know the last reference won't
3672 		 * be dropped.
3673 		 */
3674 		KASSERT(tlbctx != NULL);
3675 		pmap_ptpage_free(l2pte, tlbctx);
3676 
3677 		/*
3678 		 * We've freed a level 3 table, so we must invalidate
3679 		 * any now-stale TLB entries for the corresponding VPT
3680 		 * VA range.  Easiest way to guarantee this is to hit
3681 		 * all of the user TLB entries.
3682 		 */
3683 		pmap_tlb_shootdown_all_user(pmap, PG_V, tlbctx);
3684 
3685 		/*
3686 		 * We've freed a level 3 table, so delete the reference
3687 		 * on the level 2 table.
3688 		 */
3689 		pmap_l2pt_delref(pmap, l1pte, l2pte, tlbctx);
3690 	}
3691 }
3692 
3693 /*
3694  * pmap_l2pt_delref:
3695  *
3696  *	Delete a reference on a level 2 PT page.  If the reference drops
3697  *	to zero, free it.
3698  *
3699  *	Note: the pmap must already be locked.
3700  */
3701 static void
3702 pmap_l2pt_delref(pmap_t pmap, pt_entry_t *l1pte, pt_entry_t *l2pte,
3703     struct pmap_tlb_context * const tlbctx)
3704 {
3705 
3706 #ifdef DIAGNOSTIC
3707 	if (pmap == pmap_kernel())
3708 		panic("pmap_l2pt_delref: kernel pmap");
3709 #endif
3710 
3711 	if (pmap_physpage_delref(l2pte) == 0) {
3712 		/*
3713 		 * No more mappings in this segment; we can free the
3714 		 * level 2 table.
3715 		 */
3716 #ifdef DEBUG
3717 		if (pmapdebug & PDB_PTPAGE)
3718 			printf("pmap_l2pt_delref: freeing level 2 table at "
3719 			    "0x%lx\n", pmap_pte_pa(l1pte));
3720 #endif
3721 		/*
3722 		 * You can pass NULL if you know the last reference won't
3723 		 * be dropped.
3724 		 */
3725 		KASSERT(tlbctx != NULL);
3726 		pmap_ptpage_free(l1pte, tlbctx);
3727 
3728 		/*
3729 		 * We've freed a level 2 table, so we must invalidate
3730 		 * any now-stale TLB entries for the corresponding VPT
3731 		 * VA range.  Easiest way to guarantee this is to hit
3732 		 * all of the user TLB entries.
3733 		 */
3734 		pmap_tlb_shootdown_all_user(pmap, PG_V, tlbctx);
3735 
3736 		/*
3737 		 * We've freed a level 2 table, so delete the reference
3738 		 * on the level 1 table.
3739 		 */
3740 		pmap_l1pt_delref(pmap, l1pte);
3741 	}
3742 }
3743 
3744 /*
3745  * pmap_l1pt_delref:
3746  *
3747  *	Delete a reference on a level 1 PT page.
3748  */
3749 static void
3750 pmap_l1pt_delref(pmap_t pmap, pt_entry_t *l1pte)
3751 {
3752 
3753 	KASSERT(pmap != pmap_kernel());
3754 
3755 	(void)pmap_physpage_delref(l1pte);
3756 }
3757 
3758 /******************** Address Space Number management ********************/
3759 
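/*
 * Overview: TLB entries on the Alpha are tagged with an Address Space
 * Number, which lets several user pmaps keep entries resident without
 * a full TLB flush on every context switch.  Each CPU hands out ASNs
 * sequentially; when it runs out, it invalidates all non-PG_ASM TLB
 * entries and bumps a per-CPU generation number, so any pmap whose
 * recorded generation no longer matches simply gets a fresh ASN the
 * next time it is activated on that CPU.
 */
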
3760 /*
3761  * pmap_asn_alloc:
3762  *
3763  *	Allocate and assign an ASN to the specified pmap.
3764  *
3765  *	Note: the pmap must already be locked.  This may be called from
3766  *	an interprocessor interrupt, and in that case, the sender of
3767  *	the IPI has the pmap lock.
3768  */
3769 static u_int
3770 pmap_asn_alloc(pmap_t const pmap, struct cpu_info * const ci)
3771 {
3772 
3773 #ifdef DEBUG
3774 	if (pmapdebug & (PDB_FOLLOW|PDB_ASN))
3775 		printf("pmap_asn_alloc(%p)\n", pmap);
3776 #endif
3777 
3778 	KASSERT(pmap != pmap_kernel());
3779 	KASSERT(pmap->pm_lev1map != kernel_lev1map);
3780 	KASSERT(kpreempt_disabled());
3781 
3782 	/* No work to do if the CPU does not implement ASNs. */
3783 	if (pmap_max_asn == 0)
3784 		return 0;
3785 
3786 	struct pmap_asn_info * const pma = &pmap->pm_asni[ci->ci_cpuid];
3787 
3788 	/*
3789 	 * Hopefully, we can continue using the one we have...
3790 	 *
3791 	 * N.B. the generation check will fail the first time
3792 	 * any pmap is activated on a given CPU, because we start
3793 	 * the generation counter at 1, but initialize pmaps with
3794 	 * 0; this forces the first ASN allocation to occur.
3795 	 */
3796 	if (pma->pma_asngen == ci->ci_asn_gen) {
3797 #ifdef DEBUG
3798 		if (pmapdebug & PDB_ASN)
3799 			printf("pmap_asn_alloc: same generation, keeping %u\n",
3800 			    pma->pma_asn);
3801 #endif
3802 		TLB_COUNT(asn_reuse);
3803 		return pma->pma_asn;
3804 	}
3805 
3806 	/*
3807 	 * Need to assign a new ASN.  Grab the next one, incrementing
3808 	 * the generation number if we have to.
3809 	 */
3810 	if (ci->ci_next_asn > pmap_max_asn) {
3811 		/*
3812 		 * Invalidate all non-PG_ASM TLB entries and the
3813 		 * I-cache, and bump the generation number.
3814 		 */
3815 		ALPHA_TBIAP();
3816 		alpha_pal_imb();
3817 
3818 		ci->ci_next_asn = PMAP_ASN_FIRST_USER;
3819 		ci->ci_asn_gen++;
3820 		TLB_COUNT(asn_newgen);
3821 
3822 		/*
3823 		 * Make sure the generation number doesn't wrap.  We could
3824 		 * handle this scenario by traversing all of the pmaps,
3825 		 * and invalidating the generation number on those which
3826 		 * are not currently in use by this processor.
3827 		 *
3828 		 * However... considering that we're using an unsigned 64-bit
3829 		 * integer for generation numbers, we won't wrap for
3830 		 * approximately 75 billion years on a 128-ASN CPU
3831 		 * (assuming 1000 switch operations per second).
3832 		 *
3833 		 * So, we don't bother.
3834 		 */
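		/*
		 * (Rough arithmetic: 2^64 generations times ~127 ASNs per
		 * generation, at 1000 allocations per second, is about
		 * 2.3e18 seconds, i.e. on the order of 74 billion years.)
		 */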
3835 		KASSERT(ci->ci_asn_gen != PMAP_ASNGEN_INVALID);
3836 #ifdef DEBUG
3837 		if (pmapdebug & PDB_ASN)
3838 			printf("pmap_asn_alloc: generation bumped to %lu\n",
3839 			    ci->ci_asn_gen);
3840 #endif
3841 	}
3842 
3843 	/*
3844 	 * Assign the new ASN and validate the generation number.
3845 	 */
3846 	pma->pma_asn = ci->ci_next_asn++;
3847 	pma->pma_asngen = ci->ci_asn_gen;
3848 	TLB_COUNT(asn_assign);
3849 
3850 	/*
3851 	 * We have a new ASN, so we can skip any pending I-stream sync
3852 	 * on the way back out to user space.
3853 	 */
3854 	atomic_and_ulong(&pmap->pm_needisync, ~(1UL << ci->ci_cpuid));
3855 
3856 #ifdef DEBUG
3857 	if (pmapdebug & PDB_ASN)
3858 		printf("pmap_asn_alloc: assigning %u to pmap %p\n",
3859 		    pma->pma_asn, pmap);
3860 #endif
3861 	return pma->pma_asn;
3862 }
3863