xref: /netbsd-src/sys/arch/alpha/alpha/pmap.c (revision 19bb83c8e54d1afcbe2e9711a8c4c5db06c58800)
1 /* $NetBSD: pmap.c,v 1.308 2023/12/30 23:07:42 thorpej Exp $ */
2 
3 /*-
4  * Copyright (c) 1998, 1999, 2000, 2001, 2007, 2008, 2020
5  * 	The NetBSD Foundation, Inc.
6  * All rights reserved.
7  *
8  * This code is derived from software contributed to The NetBSD Foundation
9  * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
10  * NASA Ames Research Center, by Andrew Doran and Mindaugas Rasiukevicius,
11  * and by Chris G. Demetriou.
12  *
13  * Redistribution and use in source and binary forms, with or without
14  * modification, are permitted provided that the following conditions
15  * are met:
16  * 1. Redistributions of source code must retain the above copyright
17  *    notice, this list of conditions and the following disclaimer.
18  * 2. Redistributions in binary form must reproduce the above copyright
19  *    notice, this list of conditions and the following disclaimer in the
20  *    documentation and/or other materials provided with the distribution.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
23  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
24  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
25  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
26  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
27  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
28  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
29  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
30  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
31  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
32  * POSSIBILITY OF SUCH DAMAGE.
33  */
34 
35 /*
36  * Copyright (c) 1991, 1993
37  *	The Regents of the University of California.  All rights reserved.
38  *
39  * This code is derived from software contributed to Berkeley by
40  * the Systems Programming Group of the University of Utah Computer
41  * Science Department.
42  *
43  * Redistribution and use in source and binary forms, with or without
44  * modification, are permitted provided that the following conditions
45  * are met:
46  * 1. Redistributions of source code must retain the above copyright
47  *    notice, this list of conditions and the following disclaimer.
48  * 2. Redistributions in binary form must reproduce the above copyright
49  *    notice, this list of conditions and the following disclaimer in the
50  *    documentation and/or other materials provided with the distribution.
51  * 3. Neither the name of the University nor the names of its contributors
52  *    may be used to endorse or promote products derived from this software
53  *    without specific prior written permission.
54  *
55  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
56  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
57  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
58  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
59  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
60  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
61  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
62  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
63  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
64  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
65  * SUCH DAMAGE.
66  *
67  *	@(#)pmap.c	8.6 (Berkeley) 5/27/94
68  */
69 
70 /*
71  * DEC Alpha physical map management code.
72  *
73  * History:
74  *
75  *	This pmap started life as a Motorola 68851/68030 pmap,
76  *	written by Mike Hibler at the University of Utah.
77  *
78  *	It was modified for the DEC Alpha by Chris Demetriou
79  *	at Carnegie Mellon University.
80  *
81  *	Support for non-contiguous physical memory was added by
82  *	Jason R. Thorpe of the Numerical Aerospace Simulation
83  *	Facility, NASA Ames Research Center and Chris Demetriou.
84  *
85  *	Page table management and a major cleanup were undertaken
86  *	by Jason R. Thorpe, with lots of help from Ross Harvey of
87  *	Avalon Computer Systems and from Chris Demetriou.
88  *
89  *	Support for the new UVM pmap interface was written by
90  *	Jason R. Thorpe.
91  *
92  *	Support for ASNs was written by Jason R. Thorpe, again
93  *	with help from Chris Demetriou and Ross Harvey.
94  *
95  *	The locking protocol was written by Jason R. Thorpe,
96  *	using Chuck Cranor's i386 pmap for UVM as a model.
97  *
98  *	TLB shootdown code was written (and then subsequently
99  *	rewritten some years later, borrowing some ideas from
100  *	the x86 pmap) by Jason R. Thorpe.
101  *
102  *	Multiprocessor modifications by Andrew Doran and
103  *	Jason R. Thorpe.
104  *
105  * Notes:
106  *
107  *	All user page table access is done via K0SEG.  Kernel
108  *	page table access is done via the recursive Virtual Page
109  *	Table because kernel PT pages are pre-allocated and never
110  *	freed, so no VPT fault handling is required.
111  */
112 
113 /*
114  *	Manages physical address maps.
115  *
116  *	Since the information managed by this module is
117  *	also stored by the logical address mapping module,
118  *	this module may throw away valid virtual-to-physical
119  *	mappings at almost any time.  However, invalidations
120  *	of virtual-to-physical mappings must be done as
121  *	requested.
122  *
123  *	In order to cope with hardware architectures which
124  *	make virtual-to-physical map invalidates expensive,
125  *	this module may delay invalidate or reduced protection
126  *	operations until such time as they are actually
127  *	necessary.  This module is given full information as
128  *	to which processors are currently using which maps,
129  *	and to when physical maps must be made correct.
130  */
131 
132 #include "opt_lockdebug.h"
133 #include "opt_sysv.h"
134 #include "opt_multiprocessor.h"
135 
136 #include <sys/cdefs.h>			/* RCS ID & Copyright macro defns */
137 
138 __KERNEL_RCSID(0, "$NetBSD: pmap.c,v 1.308 2023/12/30 23:07:42 thorpej Exp $");
139 
140 #include <sys/param.h>
141 #include <sys/systm.h>
142 #include <sys/kernel.h>
143 #include <sys/proc.h>
144 #include <sys/pool.h>
145 #include <sys/buf.h>
146 #include <sys/evcnt.h>
147 #include <sys/atomic.h>
148 #include <sys/cpu.h>
149 
150 #include <uvm/uvm.h>
151 
152 #if defined(MULTIPROCESSOR)
153 #include <machine/rpb.h>
154 #endif
155 
156 #ifdef DEBUG
157 #define	PDB_FOLLOW	0x0001
158 #define	PDB_INIT	0x0002
159 #define	PDB_ENTER	0x0004
160 #define	PDB_REMOVE	0x0008
161 #define	PDB_CREATE	0x0010
162 #define	PDB_PTPAGE	0x0020
163 #define	PDB_ASN		0x0040
164 #define	PDB_BITS	0x0080
165 #define	PDB_COLLECT	0x0100
166 #define	PDB_PROTECT	0x0200
167 #define	PDB_BOOTSTRAP	0x1000
168 #define	PDB_PARANOIA	0x2000
169 #define	PDB_WIRING	0x4000
170 #define	PDB_PVDUMP	0x8000
171 
172 int debugmap = 0;
173 int pmapdebug = PDB_PARANOIA;
174 #endif
175 
176 #if defined(MULTIPROCESSOR)
177 #define	PMAP_MP(x)	x
178 #else
179 #define	PMAP_MP(x)	__nothing
180 #endif /* MULTIPROCESSOR */
181 
182 /*
183  * Given a map and a machine independent protection code,
184  * convert to an alpha protection code.
185  */
186 #define pte_prot(m, p)	(protection_codes[m == pmap_kernel() ? 0 : 1][p])
187 static int	protection_codes[2][8] __read_mostly;
188 
189 /*
190  * kernel_lev1map:
191  *
192  *	Kernel level 1 page table.  This maps all kernel level 2
193  *	page table pages, and is used as a template for all user
194  *	pmap level 1 page tables.  When a new user level 1 page
195  *	table is allocated, all kernel_lev1map PTEs for kernel
196  *	addresses are copied to the new map.
197  *
198  *	The kernel also has an initial set of kernel level 2 page
199  *	table pages.  These map the kernel level 3 page table pages.
200  *	As kernel level 3 page table pages are added, more level 2
201  *	page table pages may be added to map them.  These pages are
202  *	never freed.
203  *
204  *	Finally, the kernel also has an initial set of kernel level
205  *	3 page table pages.  These map pages in K1SEG.  More level
206  *	3 page table pages may be added at run-time if additional
207  *	K1SEG address space is required.  These pages are never freed.
208  *
209  * NOTE: When mappings are inserted into the kernel pmap, all
210  * level 2 and level 3 page table pages must already be allocated
211  * and mapped into the parent page table.
212  */
213 pt_entry_t	*kernel_lev1map __read_mostly;
214 
215 /*
216  * Virtual Page Table.
217  */
218 static pt_entry_t *VPT __read_mostly;
219 
220 static struct {
221 	struct pmap k_pmap;
222 } kernel_pmap_store __cacheline_aligned;
223 
224 struct pmap *const kernel_pmap_ptr = &kernel_pmap_store.k_pmap;
225 
226 /* PA of first available physical page */
227 paddr_t    	avail_start __read_mostly;
228 
229 /* PA of last available physical page */
230 paddr_t		avail_end __read_mostly;
231 
232 /* VA of last avail page (end of kernel AS) */
233 static vaddr_t	virtual_end __read_mostly;
234 
235 /* Has pmap_init completed? */
236 static bool pmap_initialized __read_mostly;
237 
238 /* Instrumentation */
239 u_long		pmap_pages_stolen __read_mostly;
240 
241 /*
242  * This variable contains the number of CPU IDs we need to allocate
243  * space for when allocating the pmap structure.  It is used to
244  * size a per-CPU array of ASN and ASN Generation number.
245  */
246 static u_long 	pmap_ncpuids __read_mostly;
247 
248 #ifndef PMAP_PV_LOWAT
249 #define	PMAP_PV_LOWAT	16
250 #endif
251 int		pmap_pv_lowat __read_mostly = PMAP_PV_LOWAT;
252 
253 /*
254  * List of all pmaps, used to update them when e.g. additional kernel
255  * page tables are allocated.  This list is kept LRU-ordered by
256  * pmap_activate().
257  */
258 static TAILQ_HEAD(, pmap) pmap_all_pmaps __cacheline_aligned;
259 
260 /*
261  * Instrument the number of calls to pmap_growkernel().
262  */
263 static struct evcnt pmap_growkernel_evcnt __read_mostly;
264 
265 /*
266  * The pools from which pmap structures and sub-structures are allocated.
267  */
268 static struct pool_cache pmap_pmap_cache __read_mostly;
269 static struct pool_cache pmap_l1pt_cache __read_mostly;
270 static struct pool_cache pmap_pv_cache __read_mostly;
271 
272 CTASSERT(offsetof(struct pmap, pm_percpu[0]) == COHERENCY_UNIT);
273 CTASSERT(PMAP_SIZEOF(ALPHA_MAXPROCS) < ALPHA_PGBYTES);
274 CTASSERT(sizeof(struct pmap_percpu) == COHERENCY_UNIT);
275 
276 /*
277  * Address Space Numbers.
278  *
279  * On many implementations of the Alpha architecture, the TLB entries and
280  * I-cache blocks are tagged with a unique number within an implementation-
281  * specified range.  When a process context becomes active, the ASN is used
282  * to match TLB entries; if a TLB entry for a particular VA does not match
283  * the current ASN, it is ignored (one could think of the processor as
284  * having a collection of <max ASN> separate TLBs).  This allows operating
285  * system software to skip the TLB flush that would otherwise be necessary
286  * at context switch time.
287  *
288  * Alpha PTEs have a bit in them (PG_ASM - Address Space Match) that
289  * causes TLB entries to match any ASN.  The PALcode also provides
290  * a TBI (Translation Buffer Invalidate) operation that flushes all
291  * TLB entries that _do not_ have PG_ASM.  We use this bit for kernel
292  * mappings, so that invalidation of all user mappings does not invalidate
293  * kernel mappings (which are consistent across all processes).
294  *
295  * pmap_next_asn always indicates to the next ASN to use.  When
296  * pmap_next_asn exceeds pmap_max_asn, we start a new ASN generation.
297  *
298  * When a new ASN generation is created, the per-process (i.e. non-PG_ASM)
299  * TLB entries and the I-cache are flushed, the generation number is bumped,
300  * and pmap_next_asn is changed to indicate the first non-reserved ASN.
301  *
302  * We reserve ASN #0 for pmaps that use the global kernel_lev1map.  This
303  * prevents the following scenario to ensure no accidental accesses to
304  * user space for LWPs using the kernel pmap.  This is important because
305  * the PALcode may use the recursive VPT to service TLB misses.
306  *
307  * By reserving an ASN for the kernel, we are guaranteeing that an lwp
308  * will not see any valid user space TLB entries until it passes through
309  * pmap_activate() for the first time.
310  *
311  * On processors that do not support ASNs, the PALcode invalidates
312  * non-ASM TLB entries automatically on swpctx.  We completely skip
313  * the ASN machinery in this case because the PALcode neither reads
314  * nor writes that field of the HWPCB.
315  */
316 
317 /* max ASN supported by the system */
318 static u_int	pmap_max_asn __read_mostly;
319 
320 /*
321  * Locking:
322  *
323  *	READ/WRITE LOCKS
324  *	----------------
325  *
326  *	* pmap_main_lock - This lock is used to prevent deadlock and/or
327  *	  provide mutex access to the pmap module.  Most operations lock
328  *	  the pmap first, then PV lists as needed.  However, some operations,
329  *	  such as pmap_page_protect(), lock the PV lists before locking
330  *	  the pmaps.  To prevent deadlock, we require a mutex lock on the
331  *	  pmap module if locking in the PV->pmap direction.  This is
332  *	  implemented by acquiring a (shared) read lock on pmap_main_lock
333  *	  if locking pmap->PV and a (exclusive) write lock if locking in
334  *	  the PV->pmap direction.  Since only one thread can hold a write
335  *	  lock at a time, this provides the mutex.
336  *
337  *	MUTEXES
338  *	-------
339  *
340  *	* pmap lock (global hash) - These locks protect the pmap structures.
341  *
342  *	* pmap activation lock (global hash) - These IPL_SCHED spin locks
343  *	  synchronize pmap_activate() and TLB shootdowns.  This has a lock
344  *	  ordering constraint with the tlb_lock:
345  *
346  *		tlb_lock -> pmap activation lock
347  *
348  *	* pvh_lock (global hash) - These locks protect the PV lists for
349  *	  managed pages.
350  *
351  *	* tlb_lock - This IPL_VM lock serializes local and remote TLB
352  *	  invalidation.
353  *
354  *	* pmap_all_pmaps_lock - This lock protects the global list of
355  *	  all pmaps.
356  *
357  *	* pmap_growkernel_lock - This lock protects pmap_growkernel()
358  *	  and the virtual_end variable.
359  *
360  *	  There is a lock ordering constraint for pmap_growkernel_lock.
361  *	  pmap_growkernel() acquires the locks in the following order:
362  *
363  *		pmap_growkernel_lock (write) -> pmap_all_pmaps_lock ->
364  *		    pmap lock
365  *
366  *	  We need to ensure consistency between user pmaps and the
367  *	  kernel_lev1map.  For this reason, pmap_growkernel_lock must
368  *	  be held to prevent kernel_lev1map changing across pmaps
369  *	  being added to / removed from the global pmaps list.
370  *
371  *	Address space number management (global ASN counters and per-pmap
372  *	ASN state) are not locked; they use arrays of values indexed
373  *	per-processor.
374  *
375  *	All internal functions which operate on a pmap are called
376  *	with the pmap already locked by the caller (which will be
377  *	an interface function).
378  */
379 static krwlock_t pmap_main_lock __cacheline_aligned;
380 static kmutex_t pmap_all_pmaps_lock __cacheline_aligned;
381 static krwlock_t pmap_growkernel_lock __cacheline_aligned;
382 
383 #define	PMAP_MAP_TO_HEAD_LOCK()		rw_enter(&pmap_main_lock, RW_READER)
384 #define	PMAP_MAP_TO_HEAD_UNLOCK()	rw_exit(&pmap_main_lock)
385 #define	PMAP_HEAD_TO_MAP_LOCK()		rw_enter(&pmap_main_lock, RW_WRITER)
386 #define	PMAP_HEAD_TO_MAP_UNLOCK()	rw_exit(&pmap_main_lock)
387 
388 static union {
389 	kmutex_t	lock;
390 	uint8_t		pad[COHERENCY_UNIT];
391 } pmap_pvh_locks[64] __cacheline_aligned;
392 
393 #define	PVH_LOCK_HASH(pg)						\
394 	((((uintptr_t)(pg)) >> 6) & 63)
395 
396 static inline kmutex_t *
pmap_pvh_lock(struct vm_page * pg)397 pmap_pvh_lock(struct vm_page *pg)
398 {
399 	return &pmap_pvh_locks[PVH_LOCK_HASH(pg)].lock;
400 }
401 
402 static union {
403 	struct {
404 		kmutex_t	lock;
405 		kmutex_t	activation_lock;
406 	} locks;
407 	uint8_t		pad[COHERENCY_UNIT];
408 } pmap_pmap_locks[64] __cacheline_aligned;
409 
410 #define	PMAP_LOCK_HASH(pm)						\
411 	((((uintptr_t)(pm)) >> 6) & 63)
412 
413 static inline kmutex_t *
pmap_pmap_lock(pmap_t const pmap)414 pmap_pmap_lock(pmap_t const pmap)
415 {
416 	return &pmap_pmap_locks[PMAP_LOCK_HASH(pmap)].locks.lock;
417 }
418 
419 static inline kmutex_t *
pmap_activation_lock(pmap_t const pmap)420 pmap_activation_lock(pmap_t const pmap)
421 {
422 	return &pmap_pmap_locks[PMAP_LOCK_HASH(pmap)].locks.activation_lock;
423 }
424 
425 #define	PMAP_LOCK(pmap)		mutex_enter(pmap_pmap_lock(pmap))
426 #define	PMAP_UNLOCK(pmap)	mutex_exit(pmap_pmap_lock(pmap))
427 
428 #define	PMAP_ACT_LOCK(pmap)	mutex_spin_enter(pmap_activation_lock(pmap))
429 #define	PMAP_ACT_TRYLOCK(pmap)	mutex_tryenter(pmap_activation_lock(pmap))
430 #define	PMAP_ACT_UNLOCK(pmap)	mutex_spin_exit(pmap_activation_lock(pmap))
431 
432 #if defined(MULTIPROCESSOR)
433 #define	pmap_all_cpus()		cpus_running
434 #else
435 #define	pmap_all_cpus()		~0UL
436 #endif /* MULTIPROCESSOR */
437 
438 /*
439  * TLB context structure; see description in "TLB management" section
440  * below.
441  */
442 #define	TLB_CTX_MAXVA		8
443 #define	TLB_CTX_ALLVA		PAGE_MASK
444 struct pmap_tlb_context {
445 	uintptr_t		t_addrdata[TLB_CTX_MAXVA];
446 	pmap_t			t_pmap;
447 	struct pmap_pagelist	t_freeptq;
448 	struct pmap_pvlist	t_freepvq;
449 };
450 
451 /*
452  * Internal routines
453  */
454 static void	alpha_protection_init(void);
455 static pt_entry_t pmap_remove_mapping(pmap_t, vaddr_t, pt_entry_t *, bool,
456 				      pv_entry_t *,
457 				      struct pmap_tlb_context *);
458 static void	pmap_changebit(struct vm_page *, pt_entry_t, pt_entry_t,
459 			       struct pmap_tlb_context *);
460 
461 /*
462  * PT page management functions.
463  */
464 static int	pmap_ptpage_alloc(pmap_t, pt_entry_t *, int);
465 static void	pmap_ptpage_free(pmap_t, pt_entry_t *,
466 				 struct pmap_tlb_context *);
467 static void	pmap_l3pt_delref(pmap_t, vaddr_t, pt_entry_t *,
468 		     struct pmap_tlb_context *);
469 static void	pmap_l2pt_delref(pmap_t, pt_entry_t *, pt_entry_t *,
470 		     struct pmap_tlb_context *);
471 static void	pmap_l1pt_delref(pmap_t, pt_entry_t *);
472 
473 static void	*pmap_l1pt_alloc(struct pool *, int);
474 static void	pmap_l1pt_free(struct pool *, void *);
475 
476 static struct pool_allocator pmap_l1pt_allocator = {
477 	pmap_l1pt_alloc, pmap_l1pt_free, 0,
478 };
479 
480 static int	pmap_l1pt_ctor(void *, void *, int);
481 
482 /*
483  * PV table management functions.
484  */
485 static int	pmap_pv_enter(pmap_t, struct vm_page *, vaddr_t, pt_entry_t *,
486 			      bool, pv_entry_t);
487 static void	pmap_pv_remove(pmap_t, struct vm_page *, vaddr_t, bool,
488 			       pv_entry_t *, struct pmap_tlb_context *);
489 static void	*pmap_pv_page_alloc(struct pool *, int);
490 static void	pmap_pv_page_free(struct pool *, void *);
491 
492 static struct pool_allocator pmap_pv_page_allocator = {
493 	pmap_pv_page_alloc, pmap_pv_page_free, 0,
494 };
495 
496 #ifdef DEBUG
497 void	pmap_pv_dump(paddr_t);
498 #endif
499 
500 #define	pmap_pv_alloc()		pool_cache_get(&pmap_pv_cache, PR_NOWAIT)
501 #define	pmap_pv_free(pv)	pool_cache_put(&pmap_pv_cache, (pv))
502 
503 /*
504  * Generic routine for freeing pages on a pmap_pagelist back to
505  * the system.
506  */
507 static void
pmap_pagelist_free(struct pmap_pagelist * const list)508 pmap_pagelist_free(struct pmap_pagelist * const list)
509 {
510 	struct vm_page *pg;
511 
512 	while ((pg = LIST_FIRST(list)) != NULL) {
513 		LIST_REMOVE(pg, pageq.list);
514 		/* Fix up ref count; it's not always 0 when we get here. */
515 		PHYSPAGE_REFCNT_SET(pg, 0);
516 		uvm_pagefree(pg);
517 	}
518 }
519 
520 /*
521  * Generic routine for freeing a list of PV entries back to the
522  * system.
523  */
524 static void
pmap_pvlist_free(struct pmap_pvlist * const list)525 pmap_pvlist_free(struct pmap_pvlist * const list)
526 {
527 	pv_entry_t pv;
528 
529 	while ((pv = LIST_FIRST(list)) != NULL) {
530 		LIST_REMOVE(pv, pv_link);
531 		pmap_pv_free(pv);
532 	}
533 }
534 
535 /*
536  * TLB management.
537  *
538  * TLB invalidations need to be performed on local and remote CPUs
539  * whenever parts of the PTE that the hardware or PALcode understands
540  * changes.  In order amortize the cost of these operations, we will
541  * queue up to 8 addresses to invalidate in a batch.  Any more than
542  * that, and we will hit the entire TLB.
543  *
544  * Some things that add complexity:
545  *
546  * ==> ASNs. A CPU may have valid TLB entries for other than the current
547  *     address space.  We can only invalidate TLB entries for the current
548  *     address space, so when asked to invalidate a VA for the non-current
549  *     pmap on a given CPU, we simply invalidate the ASN for that pmap,CPU
550  *     tuple so that new one is allocated on the next activation on that
551  *     CPU.  N.B. that for CPUs that don't implement ASNs, SWPCTX does all
552  *     the work necessary, so we can skip some work in the pmap module
553  *     itself.
554  *
555  *     When a pmap is activated on a given CPU, we set a corresponding
556  *     bit in pmap::pm_cpus, indicating that it potentially has valid
557  *     TLB entries for that address space.  This bitmap is then used to
558  *     determine which remote CPUs need to be notified of invalidations.
559  *     The bit is cleared when the ASN is invalidated on that CPU.
560  *
561  *     In order to serialize with activating an address space on a
562  *     given CPU (that we can reliably send notifications only to
563  *     relevant remote CPUs), we acquire the pmap lock in pmap_activate()
564  *     and also hold the lock while remote shootdowns take place.
565  *     This does not apply to the kernel pmap; all CPUs are notified about
566  *     invalidations for the kernel pmap, and the pmap lock is not held
567  *     in pmap_activate() for the kernel pmap.
568  *
569  * ==> P->V operations (e.g. pmap_page_protect()) may require sending
570  *     invalidations for multiple address spaces.  We only track one
571  *     address space at a time, and if we encounter more than one, then
572  *     the notification each CPU gets is to hit the entire TLB.  Note
573  *     also that we can't serialize with pmap_activate() in this case,
574  *     so all CPUs will get the notification, and they check when
575  *     processing the notification if the pmap is current on that CPU.
576  *
577  * Invalidation information is gathered into a pmap_tlb_context structure
578  * that includes room for 8 VAs, the pmap the VAs belong to, a bitmap of
579  * CPUs to be notified, and a list for PT pages that are freed during
580  * removal off mappings.  The number of valid addresses in the list as
581  * well as flags are squeezed into the lower bits of the first two VAs.
582  * Storage for this structure is allocated on the stack.  We need to be
583  * careful to keep the size of this structure under control.
584  *
585  * When notifying remote CPUs, we acquire the tlb_lock (which also
586  * blocks IPIs), record the pointer to our context structure, set a
587  * global bitmap off CPUs to be notified, and then send the IPIs to
588  * each victim.  While the other CPUs are in-flight, we then perform
589  * any invalidations necessary on the local CPU.  Once that is done,
590  * we then wait the global context pointer to be cleared, which
591  * will be done by the final remote CPU to complete their work. This
592  * method reduces cache line contention during processing.
593  *
594  * When removing mappings in user pmaps, this implementation frees page
595  * table pages back to the VM system once they contain no valid mappings.
596  * As we do this, we must ensure to invalidate TLB entries that the
597  * CPU might hold for the respective recursive VPT mappings.  This must
598  * be done whenever an L1 or L2 PTE is invalidated.  Until these VPT
599  * translations are invalidated, the PT pages must not be reused.  For
600  * this reason, we keep a list of freed PT pages in the context structure
601  * and drain them off once all invalidations are complete.
602  *
603  * NOTE: The value of TLB_CTX_MAXVA is tuned to accommodate the UBC
604  * window size (defined as 64KB on alpha in <machine/vmparam.h>).
605  */
606 
607 #define	TLB_CTX_F_ASM		__BIT(0)
608 #define	TLB_CTX_F_IMB		__BIT(1)
609 #define	TLB_CTX_F_KIMB		__BIT(2)
610 #define	TLB_CTX_F_PV		__BIT(3)
611 #define	TLB_CTX_F_MULTI		__BIT(4)
612 
613 #define	TLB_CTX_COUNT(ctx)	((ctx)->t_addrdata[0] & PAGE_MASK)
614 #define	TLB_CTX_INC_COUNT(ctx)	 (ctx)->t_addrdata[0]++
615 #define	TLB_CTX_SET_ALLVA(ctx)	 (ctx)->t_addrdata[0] |= TLB_CTX_ALLVA
616 
617 #define	TLB_CTX_FLAGS(ctx)	((ctx)->t_addrdata[1] & PAGE_MASK)
618 #define	TLB_CTX_SET_FLAG(ctx, f) (ctx)->t_addrdata[1] |= (f)
619 
620 #define	TLB_CTX_VA(ctx, i)	((ctx)->t_addrdata[(i)] & ~PAGE_MASK)
621 #define	TLB_CTX_SETVA(ctx, i, va)					\
622 	(ctx)->t_addrdata[(i)] = (va) | ((ctx)->t_addrdata[(i)] & PAGE_MASK)
623 
624 static struct {
625 	kmutex_t	lock;
626 	struct evcnt	events;
627 } tlb_shootdown __cacheline_aligned;
628 #define	tlb_lock	tlb_shootdown.lock
629 #define	tlb_evcnt	tlb_shootdown.events
630 #if defined(MULTIPROCESSOR)
631 static const struct pmap_tlb_context *tlb_context __cacheline_aligned;
632 static unsigned long tlb_pending __cacheline_aligned;
633 #endif /* MULTIPROCESSOR */
634 
635 #if defined(TLB_STATS)
636 #define	TLB_COUNT_DECL(cnt)	static struct evcnt tlb_stat_##cnt
637 #define	TLB_COUNT(cnt)		atomic_inc_64(&tlb_stat_##cnt .ev_count)
638 #define	TLB_COUNT_ATTACH(cnt)						\
639 	evcnt_attach_dynamic_nozero(&tlb_stat_##cnt, EVCNT_TYPE_MISC,	\
640 	    NULL, "TLB", #cnt)
641 
642 TLB_COUNT_DECL(invalidate_multi_tbia);
643 TLB_COUNT_DECL(invalidate_multi_tbiap);
644 TLB_COUNT_DECL(invalidate_multi_imb);
645 
646 TLB_COUNT_DECL(invalidate_kern_tbia);
647 TLB_COUNT_DECL(invalidate_kern_tbis);
648 TLB_COUNT_DECL(invalidate_kern_imb);
649 
650 TLB_COUNT_DECL(invalidate_user_not_current);
651 TLB_COUNT_DECL(invalidate_user_lazy_imb);
652 TLB_COUNT_DECL(invalidate_user_tbiap);
653 TLB_COUNT_DECL(invalidate_user_tbis);
654 
655 TLB_COUNT_DECL(shootdown_kernel);
656 TLB_COUNT_DECL(shootdown_user);
657 TLB_COUNT_DECL(shootdown_imb);
658 TLB_COUNT_DECL(shootdown_kimb);
659 TLB_COUNT_DECL(shootdown_overflow);
660 
661 TLB_COUNT_DECL(shootdown_all_user);
662 TLB_COUNT_DECL(shootdown_all_user_imb);
663 
664 TLB_COUNT_DECL(shootdown_pv);
665 TLB_COUNT_DECL(shootdown_pv_multi);
666 
667 TLB_COUNT_DECL(shootnow_over_notify);
668 TLB_COUNT_DECL(shootnow_remote);
669 
670 TLB_COUNT_DECL(reason_remove_kernel);
671 TLB_COUNT_DECL(reason_remove_user);
672 TLB_COUNT_DECL(reason_remove_all_user);
673 TLB_COUNT_DECL(reason_page_protect_read);
674 TLB_COUNT_DECL(reason_page_protect_none);
675 TLB_COUNT_DECL(reason_protect);
676 TLB_COUNT_DECL(reason_enter_kernel);
677 TLB_COUNT_DECL(reason_enter_user);
678 TLB_COUNT_DECL(reason_kenter);
679 TLB_COUNT_DECL(reason_enter_l2pt_delref);
680 TLB_COUNT_DECL(reason_enter_l3pt_delref);
681 TLB_COUNT_DECL(reason_kremove);
682 TLB_COUNT_DECL(reason_clear_modify);
683 TLB_COUNT_DECL(reason_clear_reference);
684 TLB_COUNT_DECL(reason_emulate_reference);
685 
686 TLB_COUNT_DECL(asn_reuse);
687 TLB_COUNT_DECL(asn_newgen);
688 TLB_COUNT_DECL(asn_assign);
689 
690 TLB_COUNT_DECL(activate_both_change);
691 TLB_COUNT_DECL(activate_asn_change);
692 TLB_COUNT_DECL(activate_ptbr_change);
693 TLB_COUNT_DECL(activate_swpctx);
694 TLB_COUNT_DECL(activate_skip_swpctx);
695 
696 #else /* ! TLB_STATS */
697 #define	TLB_COUNT(cnt)		__nothing
698 #define	TLB_COUNT_ATTACH(cnt)	__nothing
699 #endif /* TLB_STATS */
700 
701 static void
pmap_tlb_init(void)702 pmap_tlb_init(void)
703 {
704 	/* mutex is initialized in pmap_bootstrap(). */
705 
706 	evcnt_attach_dynamic_nozero(&tlb_evcnt, EVCNT_TYPE_MISC,
707 	    NULL, "TLB", "shootdown");
708 
709 	TLB_COUNT_ATTACH(invalidate_multi_tbia);
710 	TLB_COUNT_ATTACH(invalidate_multi_tbiap);
711 	TLB_COUNT_ATTACH(invalidate_multi_imb);
712 
713 	TLB_COUNT_ATTACH(invalidate_kern_tbia);
714 	TLB_COUNT_ATTACH(invalidate_kern_tbis);
715 	TLB_COUNT_ATTACH(invalidate_kern_imb);
716 
717 	TLB_COUNT_ATTACH(invalidate_user_not_current);
718 	TLB_COUNT_ATTACH(invalidate_user_lazy_imb);
719 	TLB_COUNT_ATTACH(invalidate_user_tbiap);
720 	TLB_COUNT_ATTACH(invalidate_user_tbis);
721 
722 	TLB_COUNT_ATTACH(shootdown_kernel);
723 	TLB_COUNT_ATTACH(shootdown_user);
724 	TLB_COUNT_ATTACH(shootdown_imb);
725 	TLB_COUNT_ATTACH(shootdown_kimb);
726 	TLB_COUNT_ATTACH(shootdown_overflow);
727 
728 	TLB_COUNT_ATTACH(shootdown_all_user);
729 	TLB_COUNT_ATTACH(shootdown_all_user_imb);
730 
731 	TLB_COUNT_ATTACH(shootdown_pv);
732 	TLB_COUNT_ATTACH(shootdown_pv_multi);
733 
734 	TLB_COUNT_ATTACH(shootnow_over_notify);
735 	TLB_COUNT_ATTACH(shootnow_remote);
736 
737 	TLB_COUNT_ATTACH(reason_remove_kernel);
738 	TLB_COUNT_ATTACH(reason_remove_user);
739 	TLB_COUNT_ATTACH(reason_remove_all_user);
740 	TLB_COUNT_ATTACH(reason_page_protect_read);
741 	TLB_COUNT_ATTACH(reason_page_protect_none);
742 	TLB_COUNT_ATTACH(reason_protect);
743 	TLB_COUNT_ATTACH(reason_enter_kernel);
744 	TLB_COUNT_ATTACH(reason_enter_user);
745 	TLB_COUNT_ATTACH(reason_kenter);
746 	TLB_COUNT_ATTACH(reason_enter_l2pt_delref);
747 	TLB_COUNT_ATTACH(reason_enter_l3pt_delref);
748 	TLB_COUNT_ATTACH(reason_kremove);
749 	TLB_COUNT_ATTACH(reason_clear_modify);
750 	TLB_COUNT_ATTACH(reason_clear_reference);
751 
752 	TLB_COUNT_ATTACH(asn_reuse);
753 	TLB_COUNT_ATTACH(asn_newgen);
754 	TLB_COUNT_ATTACH(asn_assign);
755 
756 	TLB_COUNT_ATTACH(activate_both_change);
757 	TLB_COUNT_ATTACH(activate_asn_change);
758 	TLB_COUNT_ATTACH(activate_ptbr_change);
759 	TLB_COUNT_ATTACH(activate_swpctx);
760 	TLB_COUNT_ATTACH(activate_skip_swpctx);
761 }
762 
763 static inline void
pmap_tlb_context_init(struct pmap_tlb_context * const tlbctx,uintptr_t flags)764 pmap_tlb_context_init(struct pmap_tlb_context * const tlbctx, uintptr_t flags)
765 {
766 	/* Initialize the minimum number of fields. */
767 	tlbctx->t_addrdata[0] = 0;
768 	tlbctx->t_addrdata[1] = flags;
769 	tlbctx->t_pmap = NULL;
770 	LIST_INIT(&tlbctx->t_freeptq);
771 	LIST_INIT(&tlbctx->t_freepvq);
772 }
773 
774 static void
pmap_tlb_shootdown_internal(pmap_t const pmap,vaddr_t const va,pt_entry_t const pte_bits,struct pmap_tlb_context * const tlbctx)775 pmap_tlb_shootdown_internal(pmap_t const pmap, vaddr_t const va,
776     pt_entry_t const pte_bits, struct pmap_tlb_context * const tlbctx)
777 {
778 	KASSERT(pmap != NULL);
779 	KASSERT((va & PAGE_MASK) == 0);
780 
781 	/*
782 	 * Figure out who needs to hear about this, and the scope
783 	 * of an all-entries invalidate.
784 	 */
785 	if (pmap == pmap_kernel()) {
786 		TLB_COUNT(shootdown_kernel);
787 		KASSERT(pte_bits & PG_ASM);
788 		TLB_CTX_SET_FLAG(tlbctx, TLB_CTX_F_ASM);
789 
790 		/* Note if an I-stream sync is also needed. */
791 		if (pte_bits & PG_EXEC) {
792 			TLB_COUNT(shootdown_kimb);
793 			TLB_CTX_SET_FLAG(tlbctx, TLB_CTX_F_KIMB);
794 		}
795 	} else {
796 		TLB_COUNT(shootdown_user);
797 		KASSERT((pte_bits & PG_ASM) == 0);
798 
799 		/* Note if an I-stream sync is also needed. */
800 		if (pte_bits & PG_EXEC) {
801 			TLB_COUNT(shootdown_imb);
802 			TLB_CTX_SET_FLAG(tlbctx, TLB_CTX_F_IMB);
803 		}
804 	}
805 
806 	KASSERT(tlbctx->t_pmap == NULL || tlbctx->t_pmap == pmap);
807 	tlbctx->t_pmap = pmap;
808 
809 	/*
810 	 * If we're already at the max, just tell each active CPU
811 	 * to nail everything.
812 	 */
813 	const uintptr_t count = TLB_CTX_COUNT(tlbctx);
814 	if (count > TLB_CTX_MAXVA) {
815 		return;
816 	}
817 	if (count == TLB_CTX_MAXVA) {
818 		TLB_COUNT(shootdown_overflow);
819 		TLB_CTX_SET_ALLVA(tlbctx);
820 		return;
821 	}
822 
823 	TLB_CTX_SETVA(tlbctx, count, va);
824 	TLB_CTX_INC_COUNT(tlbctx);
825 }
826 
827 static void
pmap_tlb_shootdown(pmap_t const pmap,vaddr_t const va,pt_entry_t const pte_bits,struct pmap_tlb_context * const tlbctx)828 pmap_tlb_shootdown(pmap_t const pmap, vaddr_t const va,
829     pt_entry_t const pte_bits, struct pmap_tlb_context * const tlbctx)
830 {
831 	KASSERT((TLB_CTX_FLAGS(tlbctx) & TLB_CTX_F_PV) == 0);
832 	pmap_tlb_shootdown_internal(pmap, va, pte_bits, tlbctx);
833 }
834 
835 static void
pmap_tlb_shootdown_all_user(pmap_t const pmap,pt_entry_t const pte_bits,struct pmap_tlb_context * const tlbctx)836 pmap_tlb_shootdown_all_user(pmap_t const pmap, pt_entry_t const pte_bits,
837     struct pmap_tlb_context * const tlbctx)
838 {
839 	KASSERT(pmap != pmap_kernel());
840 
841 	TLB_COUNT(shootdown_all_user);
842 
843 	/* Note if an I-stream sync is also needed. */
844 	if (pte_bits & PG_EXEC) {
845 		TLB_COUNT(shootdown_all_user_imb);
846 		TLB_CTX_SET_FLAG(tlbctx, TLB_CTX_F_IMB);
847 	}
848 
849 	if (TLB_CTX_FLAGS(tlbctx) & TLB_CTX_F_PV) {
850 		if (tlbctx->t_pmap == NULL || tlbctx->t_pmap == pmap) {
851 			if (tlbctx->t_pmap == NULL) {
852 				pmap_reference(pmap);
853 				tlbctx->t_pmap = pmap;
854 			}
855 		} else {
856 			TLB_CTX_SET_FLAG(tlbctx, TLB_CTX_F_MULTI);
857 		}
858 	} else {
859 		KASSERT(tlbctx->t_pmap == NULL || tlbctx->t_pmap == pmap);
860 		tlbctx->t_pmap = pmap;
861 	}
862 
863 	TLB_CTX_SET_ALLVA(tlbctx);
864 }
865 
866 static void
pmap_tlb_shootdown_pv(pmap_t const pmap,vaddr_t const va,pt_entry_t const pte_bits,struct pmap_tlb_context * const tlbctx)867 pmap_tlb_shootdown_pv(pmap_t const pmap, vaddr_t const va,
868     pt_entry_t const pte_bits, struct pmap_tlb_context * const tlbctx)
869 {
870 
871 	KASSERT(TLB_CTX_FLAGS(tlbctx) & TLB_CTX_F_PV);
872 
873 	TLB_COUNT(shootdown_pv);
874 
875 	if (tlbctx->t_pmap == NULL || tlbctx->t_pmap == pmap) {
876 		if (tlbctx->t_pmap == NULL) {
877 			pmap_reference(pmap);
878 			tlbctx->t_pmap = pmap;
879 		}
880 		pmap_tlb_shootdown_internal(pmap, va, pte_bits, tlbctx);
881 	} else {
882 		TLB_COUNT(shootdown_pv_multi);
883 		uintptr_t flags = TLB_CTX_F_MULTI;
884 		if (pmap == pmap_kernel()) {
885 			KASSERT(pte_bits & PG_ASM);
886 			flags |= TLB_CTX_F_ASM;
887 		} else {
888 			KASSERT((pte_bits & PG_ASM) == 0);
889 		}
890 
891 		/*
892 		 * No need to distinguish between kernel and user IMB
893 		 * here; see pmap_tlb_invalidate_multi().
894 		 */
895 		if (pte_bits & PG_EXEC) {
896 			flags |= TLB_CTX_F_IMB;
897 		}
898 		TLB_CTX_SET_ALLVA(tlbctx);
899 		TLB_CTX_SET_FLAG(tlbctx, flags);
900 	}
901 }
902 
903 static void
pmap_tlb_invalidate_multi(const struct pmap_tlb_context * const tlbctx)904 pmap_tlb_invalidate_multi(const struct pmap_tlb_context * const tlbctx)
905 {
906 	if (TLB_CTX_FLAGS(tlbctx) & TLB_CTX_F_ASM) {
907 		TLB_COUNT(invalidate_multi_tbia);
908 		ALPHA_TBIA();
909 	} else {
910 		TLB_COUNT(invalidate_multi_tbiap);
911 		ALPHA_TBIAP();
912 	}
913 	if (TLB_CTX_FLAGS(tlbctx) & (TLB_CTX_F_IMB | TLB_CTX_F_KIMB)) {
914 		TLB_COUNT(invalidate_multi_imb);
915 		alpha_pal_imb();
916 	}
917 }
918 
919 static void
pmap_tlb_invalidate_kernel(const struct pmap_tlb_context * const tlbctx)920 pmap_tlb_invalidate_kernel(const struct pmap_tlb_context * const tlbctx)
921 {
922 	const uintptr_t count = TLB_CTX_COUNT(tlbctx);
923 
924 	if (count == TLB_CTX_ALLVA) {
925 		TLB_COUNT(invalidate_kern_tbia);
926 		ALPHA_TBIA();
927 	} else {
928 		TLB_COUNT(invalidate_kern_tbis);
929 		for (uintptr_t i = 0; i < count; i++) {
930 			ALPHA_TBIS(TLB_CTX_VA(tlbctx, i));
931 		}
932 	}
933 	if (TLB_CTX_FLAGS(tlbctx) & TLB_CTX_F_KIMB) {
934 		TLB_COUNT(invalidate_kern_imb);
935 		alpha_pal_imb();
936 	}
937 }
938 
939 static void
pmap_tlb_invalidate(const struct pmap_tlb_context * const tlbctx,const struct cpu_info * const ci)940 pmap_tlb_invalidate(const struct pmap_tlb_context * const tlbctx,
941     const struct cpu_info * const ci)
942 {
943 	const uintptr_t count = TLB_CTX_COUNT(tlbctx);
944 
945 	if (TLB_CTX_FLAGS(tlbctx) & TLB_CTX_F_MULTI) {
946 		pmap_tlb_invalidate_multi(tlbctx);
947 		return;
948 	}
949 
950 	if (TLB_CTX_FLAGS(tlbctx) & TLB_CTX_F_ASM) {
951 		pmap_tlb_invalidate_kernel(tlbctx);
952 		return;
953 	}
954 
955 	KASSERT(kpreempt_disabled());
956 
957 	pmap_t const pmap = tlbctx->t_pmap;
958 	KASSERT(pmap != NULL);
959 
960 	if (__predict_false(pmap != ci->ci_pmap)) {
961 		TLB_COUNT(invalidate_user_not_current);
962 
963 		/*
964 		 * For CPUs that don't implement ASNs, the SWPCTX call
965 		 * does all of the TLB invalidation work for us.
966 		 */
967 		if (__predict_false(pmap_max_asn == 0)) {
968 			return;
969 		}
970 
971 		const u_long cpu_mask = 1UL << ci->ci_cpuid;
972 
973 		/*
974 		 * We cannot directly invalidate the TLB in this case,
975 		 * so force allocation of a new ASN when the pmap becomes
976 		 * active again.
977 		 */
978 		pmap->pm_percpu[ci->ci_cpuid].pmc_asngen = PMAP_ASNGEN_INVALID;
979 		atomic_and_ulong(&pmap->pm_cpus, ~cpu_mask);
980 
981 		/*
982 		 * This isn't strictly necessary; when we allocate a
983 		 * new ASN, we're going to clear this bit and skip
984 		 * syncing the I-stream.  But we will keep this bit
985 		 * of accounting for internal consistency.
986 		 */
987 		if (TLB_CTX_FLAGS(tlbctx) & TLB_CTX_F_IMB) {
988 			pmap->pm_percpu[ci->ci_cpuid].pmc_needisync = 1;
989 		}
990 		return;
991 	}
992 
993 	if (TLB_CTX_FLAGS(tlbctx) & TLB_CTX_F_IMB) {
994 		TLB_COUNT(invalidate_user_lazy_imb);
995 		pmap->pm_percpu[ci->ci_cpuid].pmc_needisync = 1;
996 	}
997 
998 	if (count == TLB_CTX_ALLVA) {
999 		/*
1000 		 * Another option here for CPUs that implement ASNs is
1001 		 * to allocate a new ASN and do a SWPCTX.  That's almost
1002 		 * certainly faster than a TBIAP, but would require us
1003 		 * to synchronize against IPIs in pmap_activate().
1004 		 */
1005 		TLB_COUNT(invalidate_user_tbiap);
1006 		KASSERT((TLB_CTX_FLAGS(tlbctx) & TLB_CTX_F_ASM) == 0);
1007 		ALPHA_TBIAP();
1008 	} else {
1009 		TLB_COUNT(invalidate_user_tbis);
1010 		for (uintptr_t i = 0; i < count; i++) {
1011 			ALPHA_TBIS(TLB_CTX_VA(tlbctx, i));
1012 		}
1013 	}
1014 }
1015 
1016 static void
pmap_tlb_shootnow(const struct pmap_tlb_context * const tlbctx)1017 pmap_tlb_shootnow(const struct pmap_tlb_context * const tlbctx)
1018 {
1019 
1020 	if (TLB_CTX_COUNT(tlbctx) == 0) {
1021 		/* No work to do. */
1022 		return;
1023 	}
1024 
1025 	/*
1026 	 * Acquire the shootdown mutex.  This will also block IPL_VM
1027 	 * interrupts and disable preemption.  It is critically important
1028 	 * that IPIs not be blocked in this routine.
1029 	 */
1030 	KASSERT(alpha_pal_rdps() < ALPHA_PSL_IPL_CLOCK);
1031 	mutex_spin_enter(&tlb_lock);
1032 	tlb_evcnt.ev_count++;
1033 
1034 	const struct cpu_info *ci = curcpu();
1035 	const u_long this_cpu = 1UL << ci->ci_cpuid;
1036 	u_long active_cpus;
1037 	bool activation_locked, activation_lock_tried;
1038 
1039 	/*
1040 	 * Figure out who to notify.  If it's for the kernel or
1041 	 * multiple address spaces, we notify everybody.  If
1042 	 * it's a single user pmap, then we try to acquire the
1043 	 * activation lock so we can get an accurate accounting
1044 	 * of who needs to be notified.  If we can't acquire
1045 	 * the activation lock, then just notify everyone and
1046 	 * let them sort it out when they process the IPI.
1047 	 */
1048 	if (TLB_CTX_FLAGS(tlbctx) & (TLB_CTX_F_ASM | TLB_CTX_F_MULTI)) {
1049 		active_cpus = pmap_all_cpus();
1050 		activation_locked = false;
1051 		activation_lock_tried = false;
1052 	} else {
1053 		KASSERT(tlbctx->t_pmap != NULL);
1054 		activation_locked = PMAP_ACT_TRYLOCK(tlbctx->t_pmap);
1055 		if (__predict_true(activation_locked)) {
1056 			active_cpus = tlbctx->t_pmap->pm_cpus;
1057 		} else {
1058 			TLB_COUNT(shootnow_over_notify);
1059 			active_cpus = pmap_all_cpus();
1060 		}
1061 		activation_lock_tried = true;
1062 	}
1063 
1064 #if defined(MULTIPROCESSOR)
1065 	/*
1066 	 * If there are remote CPUs that need to do work, get them
1067 	 * started now.
1068 	 */
1069 	const u_long remote_cpus = active_cpus & ~this_cpu;
1070 	KASSERT(tlb_context == NULL);
1071 	if (remote_cpus) {
1072 		TLB_COUNT(shootnow_remote);
1073 		tlb_context = tlbctx;
1074 		tlb_pending = remote_cpus;
1075 		alpha_multicast_ipi(remote_cpus, ALPHA_IPI_SHOOTDOWN);
1076 	}
1077 #endif /* MULTIPROCESSOR */
1078 
1079 	/*
1080 	 * Now that the remotes have been notified, release the
1081 	 * activation lock.
1082 	 */
1083 	if (activation_lock_tried) {
1084 		if (activation_locked) {
1085 			KASSERT(tlbctx->t_pmap != NULL);
1086 			PMAP_ACT_UNLOCK(tlbctx->t_pmap);
1087 		}
1088 		/*
1089 		 * When we tried to acquire the activation lock, we
1090 		 * raised IPL to IPL_SCHED (even if we ultimately
1091 		 * failed to acquire the lock), which blocks out IPIs.
1092 		 * Force our IPL back down to IPL_VM so that we can
1093 		 * receive IPIs.
1094 		 */
1095 		alpha_pal_swpipl(IPL_VM);
1096 	}
1097 
1098 	/*
1099 	 * Do any work that we might need to do.  We don't need to
1100 	 * synchronize with activation here because we know that
1101 	 * for the current CPU, activation status will not change.
1102 	 */
1103 	if (active_cpus & this_cpu) {
1104 		pmap_tlb_invalidate(tlbctx, ci);
1105 	}
1106 
1107 #if defined(MULTIPROCESSOR)
1108 	/* Wait for remote CPUs to finish. */
1109 	if (remote_cpus) {
1110 		int backoff = SPINLOCK_BACKOFF_MIN;
1111 		u_int spins = 0;
1112 
1113 		while (atomic_load_acquire(&tlb_context) != NULL) {
1114 			SPINLOCK_BACKOFF(backoff);
1115 			if (spins++ > 0x0fffffff) {
1116 				printf("TLB LOCAL MASK  = 0x%016lx\n",
1117 				    this_cpu);
1118 				printf("TLB REMOTE MASK = 0x%016lx\n",
1119 				    remote_cpus);
1120 				printf("TLB REMOTE PENDING = 0x%016lx\n",
1121 				    tlb_pending);
1122 				printf("TLB CONTEXT = %p\n", tlb_context);
1123 				printf("TLB LOCAL IPL = %lu\n",
1124 				    alpha_pal_rdps());
1125 				panic("pmap_tlb_shootnow");
1126 			}
1127 		}
1128 	}
1129 	KASSERT(tlb_context == NULL);
1130 #endif /* MULTIPROCESSOR */
1131 
1132 	mutex_spin_exit(&tlb_lock);
1133 
1134 	if (__predict_false(TLB_CTX_FLAGS(tlbctx) & TLB_CTX_F_PV)) {
1135 		/*
1136 		 * P->V TLB operations may operate on multiple pmaps.
1137 		 * The shootdown takes a reference on the first pmap it
1138 		 * encounters, in order to prevent it from disappearing,
1139 		 * in the hope that we end up with a single-pmap P->V
1140 		 * operation (instrumentation shows this is not rare).
1141 		 *
1142 		 * Once this shootdown is finished globally, we need to
1143 		 * release this extra reference.
1144 		 */
1145 		KASSERT(tlbctx->t_pmap != NULL);
1146 		pmap_destroy(tlbctx->t_pmap);
1147 	}
1148 }
1149 
1150 #if defined(MULTIPROCESSOR)
1151 void
pmap_tlb_shootdown_ipi(struct cpu_info * const ci,struct trapframe * const tf __unused)1152 pmap_tlb_shootdown_ipi(struct cpu_info * const ci,
1153 
1154     struct trapframe * const tf __unused)
1155 {
1156 	KASSERT(tlb_context != NULL);
1157 	pmap_tlb_invalidate(tlb_context, ci);
1158 	if (atomic_and_ulong_nv(&tlb_pending, ~(1UL << ci->ci_cpuid)) == 0) {
1159 		atomic_store_release(&tlb_context, NULL);
1160 	}
1161 }
1162 #endif /* MULTIPROCESSOR */
1163 
1164 static inline void
pmap_tlb_context_drain(struct pmap_tlb_context * const tlbctx)1165 pmap_tlb_context_drain(struct pmap_tlb_context * const tlbctx)
1166 {
1167 	if (! LIST_EMPTY(&tlbctx->t_freeptq)) {
1168 		pmap_pagelist_free(&tlbctx->t_freeptq);
1169 	}
1170 	if (! LIST_EMPTY(&tlbctx->t_freepvq)) {
1171 		pmap_pvlist_free(&tlbctx->t_freepvq);
1172 	}
1173 }
1174 
1175 /*
1176  * ASN management functions.
1177  */
1178 static u_int	pmap_asn_alloc(pmap_t, struct cpu_info *);
1179 
1180 /*
1181  * Misc. functions.
1182  */
1183 static struct vm_page *pmap_physpage_alloc(int);
1184 static void	pmap_physpage_free(paddr_t);
1185 static int	pmap_physpage_addref(void *);
1186 static int	pmap_physpage_delref(void *);
1187 
1188 static bool	vtophys_internal(vaddr_t, paddr_t *p);
1189 
1190 /*
1191  * PMAP_KERNEL_PTE:
1192  *
1193  *	Get a kernel PTE.
1194  *
1195  *	If debugging, do a table walk.  If not debugging, just use
1196  *	the Virtual Page Table, since all kernel page tables are
1197  *	pre-allocated and mapped in.
1198  */
1199 #ifdef DEBUG
1200 #define	PMAP_KERNEL_PTE(va)						\
1201 ({									\
1202 	pt_entry_t *l1pte_, *l2pte_;					\
1203 									\
1204 	l1pte_ = pmap_l1pte(kernel_lev1map, va);			\
1205 	if (pmap_pte_v(l1pte_) == 0) {					\
1206 		printf("kernel level 1 PTE not valid, va 0x%lx "	\
1207 		    "(line %d) pte=%p *pte=0x%016lx\n", (va), __LINE__,	\
1208 		    l1pte_, *l1pte_);					\
1209 		panic("PMAP_KERNEL_PTE");				\
1210 	}								\
1211 	l2pte_ = pmap_l2pte(kernel_lev1map, va, l1pte_);		\
1212 	if (pmap_pte_v(l2pte_) == 0) {					\
1213 		printf("kernel level 2 PTE not valid, va 0x%lx "	\
1214 		    "(line %d) pte=%p *pte=0x%016lx\n", (va), __LINE__,	\
1215 		    l2pte_, *l2pte_);					\
1216 		panic("PMAP_KERNEL_PTE");				\
1217 	}								\
1218 	pmap_l3pte(kernel_lev1map, va, l2pte_);				\
1219 })
1220 #else
1221 #define	PMAP_KERNEL_PTE(va)	(&VPT[VPT_INDEX((va))])
1222 #endif
1223 
1224 /*
1225  * PMAP_STAT_{INCR,DECR}:
1226  *
1227  *	Increment or decrement a pmap statistic.
1228  */
1229 #define	PMAP_STAT_INCR(s, v)	atomic_add_long((unsigned long *)(&(s)), (v))
1230 #define	PMAP_STAT_DECR(s, v)	atomic_add_long((unsigned long *)(&(s)), -(v))
1231 
1232 /*
1233  * pmap_init_cpu:
1234  *
1235  *	Initilize pmap data in the cpu_info.
1236  */
1237 void
pmap_init_cpu(struct cpu_info * const ci)1238 pmap_init_cpu(struct cpu_info * const ci)
1239 {
1240 	pmap_t const pmap = pmap_kernel();
1241 
1242 	/* All CPUs start out using the kernel pmap. */
1243 	atomic_or_ulong(&pmap->pm_cpus, 1UL << ci->ci_cpuid);
1244 	pmap_reference(pmap);
1245 	ci->ci_pmap = pmap;
1246 
1247 	/* Initialize ASN allocation logic. */
1248 	ci->ci_next_asn = PMAP_ASN_FIRST_USER;
1249 	ci->ci_asn_gen = PMAP_ASNGEN_INITIAL;
1250 }
1251 
1252 /*
1253  * pmap_bootstrap:
1254  *
1255  *	Bootstrap the system to run with virtual memory.
1256  *
1257  *	Note: no locking is necessary in this function.
1258  */
1259 void
pmap_bootstrap(paddr_t ptaddr,u_int maxasn,u_long ncpuids)1260 pmap_bootstrap(paddr_t ptaddr, u_int maxasn, u_long ncpuids)
1261 {
1262 	vsize_t lev2mapsize, lev3mapsize;
1263 	pt_entry_t *lev2map, *lev3map;
1264 	pt_entry_t pte;
1265 	vsize_t bufsz;
1266 	struct pcb *pcb;
1267 	int i;
1268 
1269 #ifdef DEBUG
1270 	if (pmapdebug & (PDB_FOLLOW|PDB_BOOTSTRAP))
1271 		printf("pmap_bootstrap(0x%lx, %u)\n", ptaddr, maxasn);
1272 #endif
1273 
1274 	/*
1275 	 * Compute the number of pages kmem_arena will have.
1276 	 */
1277 	kmeminit_nkmempages();
1278 
1279 	/*
1280 	 * Figure out how many initial PTE's are necessary to map the
1281 	 * kernel.  We also reserve space for kmem_alloc_pageable()
1282 	 * for vm_fork().
1283 	 */
1284 
1285 	/* Get size of buffer cache and set an upper limit */
1286 	bufsz = buf_memcalc();
1287 	buf_setvalimit(bufsz);
1288 
1289 	lev3mapsize =
1290 		(VM_PHYS_SIZE + (ubc_nwins << ubc_winshift) +
1291 		 bufsz + 16 * NCARGS + pager_map_size) / PAGE_SIZE +
1292 		(maxproc * UPAGES) + nkmempages;
1293 
1294 	lev3mapsize = roundup(lev3mapsize, NPTEPG);
1295 
1296 	/*
1297 	 * Initialize `FYI' variables.  Note we're relying on
1298 	 * the fact that BSEARCH sorts the vm_physmem[] array
1299 	 * for us.
1300 	 */
1301 	avail_start = ptoa(uvm_physseg_get_avail_start(uvm_physseg_get_first()));
1302 	avail_end = ptoa(uvm_physseg_get_avail_end(uvm_physseg_get_last()));
1303 	virtual_end = VM_MIN_KERNEL_ADDRESS + lev3mapsize * PAGE_SIZE;
1304 
1305 #if 0
1306 	printf("avail_start = 0x%lx\n", avail_start);
1307 	printf("avail_end = 0x%lx\n", avail_end);
1308 	printf("virtual_end = 0x%lx\n", virtual_end);
1309 #endif
1310 
1311 	/*
1312 	 * Allocate a level 1 PTE table for the kernel.
1313 	 * This is always one page long.
1314 	 * IF THIS IS NOT A MULTIPLE OF PAGE_SIZE, ALL WILL GO TO HELL.
1315 	 */
1316 	kernel_lev1map = (pt_entry_t *)
1317 	    uvm_pageboot_alloc(sizeof(pt_entry_t) * NPTEPG);
1318 
1319 	/*
1320 	 * Allocate a level 2 PTE table for the kernel.
1321 	 * These must map all of the level3 PTEs.
1322 	 * IF THIS IS NOT A MULTIPLE OF PAGE_SIZE, ALL WILL GO TO HELL.
1323 	 */
1324 	lev2mapsize = roundup(howmany(lev3mapsize, NPTEPG), NPTEPG);
1325 	lev2map = (pt_entry_t *)
1326 	    uvm_pageboot_alloc(sizeof(pt_entry_t) * lev2mapsize);
1327 
1328 	/*
1329 	 * Allocate a level 3 PTE table for the kernel.
1330 	 * Contains lev3mapsize PTEs.
1331 	 */
1332 	lev3map = (pt_entry_t *)
1333 	    uvm_pageboot_alloc(sizeof(pt_entry_t) * lev3mapsize);
1334 
1335 	/*
1336 	 * Set up level 1 page table
1337 	 */
1338 
1339 	/* Map all of the level 2 pte pages */
1340 	for (i = 0; i < howmany(lev2mapsize, NPTEPG); i++) {
1341 		pte = (ALPHA_K0SEG_TO_PHYS(((vaddr_t)lev2map) +
1342 		    (i*PAGE_SIZE)) >> PGSHIFT) << PG_SHIFT;
1343 		pte |= PG_V | PG_ASM | PG_KRE | PG_KWE | PG_WIRED;
1344 		kernel_lev1map[l1pte_index(VM_MIN_KERNEL_ADDRESS +
1345 		    (i*PAGE_SIZE*NPTEPG*NPTEPG))] = pte;
1346 	}
1347 
1348 	/* Map the virtual page table */
1349 	pte = (ALPHA_K0SEG_TO_PHYS((vaddr_t)kernel_lev1map) >> PGSHIFT)
1350 	    << PG_SHIFT;
1351 	pte |= PG_V | PG_KRE | PG_KWE; /* NOTE NO ASM */
1352 	kernel_lev1map[l1pte_index(VPTBASE)] = pte;
1353 	VPT = (pt_entry_t *)VPTBASE;
1354 
1355 	/*
1356 	 * Set up level 2 page table.
1357 	 */
1358 	/* Map all of the level 3 pte pages */
1359 	for (i = 0; i < howmany(lev3mapsize, NPTEPG); i++) {
1360 		pte = (ALPHA_K0SEG_TO_PHYS(((vaddr_t)lev3map) +
1361 		    (i*PAGE_SIZE)) >> PGSHIFT) << PG_SHIFT;
1362 		pte |= PG_V | PG_ASM | PG_KRE | PG_KWE | PG_WIRED;
1363 		/*
1364 		 * No need to use l2pte_index() here; it's equivalent
1365 		 * to just indexing with our loop variable i, but will
1366 		 * fall over if we end up with more than 1 L2 PT page.
1367 		 *
1368 		 * In other words:
1369 		 *
1370 		 *	l2pte_index(VM_MIN_KERNEL_ADDRESS +
1371 		 *	            (i*PAGE_SIZE*NPTEPG))
1372 		 *
1373 		 * ...is the same as 'i' so long as i stays below 1024.
1374 		 */
1375 		lev2map[i] = pte;
1376 	}
1377 
1378 	/* Initialize the pmap_growkernel_lock. */
1379 	rw_init(&pmap_growkernel_lock);
1380 
1381 	/*
1382 	 * Set up level three page table (lev3map)
1383 	 */
1384 	/* Nothing to do; it's already zero'd */
1385 
1386 	/*
1387 	 * Initialize the pmap pools and list.
1388 	 */
1389 	pmap_ncpuids = ncpuids;
1390 	pool_cache_bootstrap(&pmap_pmap_cache, PMAP_SIZEOF(pmap_ncpuids),
1391 	    COHERENCY_UNIT, 0, 0, "pmap", NULL, IPL_NONE, NULL, NULL, NULL);
1392 	pool_cache_bootstrap(&pmap_l1pt_cache, PAGE_SIZE, 0, 0, 0, "pmapl1pt",
1393 	    &pmap_l1pt_allocator, IPL_NONE, pmap_l1pt_ctor, NULL, NULL);
1394 	pool_cache_bootstrap(&pmap_pv_cache, sizeof(struct pv_entry), 0, 0,
1395 	    PR_LARGECACHE, "pmappv", &pmap_pv_page_allocator, IPL_NONE, NULL,
1396 	    NULL, NULL);
1397 
1398 	TAILQ_INIT(&pmap_all_pmaps);
1399 
1400 	/* Initialize the ASN logic.  See also pmap_init_cpu(). */
1401 	pmap_max_asn = maxasn;
1402 
1403 	/*
1404 	 * Initialize the locks.
1405 	 */
1406 	rw_init(&pmap_main_lock);
1407 	mutex_init(&pmap_all_pmaps_lock, MUTEX_DEFAULT, IPL_NONE);
1408 	for (i = 0; i < __arraycount(pmap_pvh_locks); i++) {
1409 		mutex_init(&pmap_pvh_locks[i].lock, MUTEX_DEFAULT, IPL_NONE);
1410 	}
1411 	for (i = 0; i < __arraycount(pmap_pvh_locks); i++) {
1412 		mutex_init(&pmap_pmap_locks[i].locks.lock,
1413 		    MUTEX_DEFAULT, IPL_NONE);
1414 		mutex_init(&pmap_pmap_locks[i].locks.activation_lock,
1415 		    MUTEX_SPIN, IPL_SCHED);
1416 	}
1417 
1418 	/*
1419 	 * This must block any interrupt from which a TLB shootdown
1420 	 * could be issued, but must NOT block IPIs.
1421 	 */
1422 	mutex_init(&tlb_lock, MUTEX_SPIN, IPL_VM);
1423 
1424 	/*
1425 	 * Initialize kernel pmap.  Note that all kernel mappings
1426 	 * have PG_ASM set, so the ASN doesn't really matter for
1427 	 * the kernel pmap.  Also, since the kernel pmap always
1428 	 * references kernel_lev1map, it always has an invalid ASN
1429 	 * generation.
1430 	 */
1431 	memset(pmap_kernel(), 0, sizeof(struct pmap));
1432 	LIST_INIT(&pmap_kernel()->pm_ptpages);
1433 	LIST_INIT(&pmap_kernel()->pm_pvents);
1434 	atomic_store_relaxed(&pmap_kernel()->pm_count, 1);
1435 	/* Kernel pmap does not have per-CPU info. */
1436 	TAILQ_INSERT_TAIL(&pmap_all_pmaps, pmap_kernel(), pm_list);
1437 
1438 	/*
1439 	 * Set up lwp0's PCB such that the ptbr points to the right place
1440 	 * and has the kernel pmap's (really unused) ASN.
1441 	 */
1442 	pcb = lwp_getpcb(&lwp0);
1443 	pcb->pcb_hw.apcb_ptbr =
1444 	    ALPHA_K0SEG_TO_PHYS((vaddr_t)kernel_lev1map) >> PGSHIFT;
1445 	pcb->pcb_hw.apcb_asn = PMAP_ASN_KERNEL;
1446 
1447 	struct cpu_info * const ci = curcpu();
1448 	pmap_init_cpu(ci);
1449 }
1450 
1451 /*
1452  * pmap_virtual_space:		[ INTERFACE ]
1453  *
1454  *	Define the initial bounds of the kernel virtual address space.
1455  */
1456 void
pmap_virtual_space(vaddr_t * vstartp,vaddr_t * vendp)1457 pmap_virtual_space(vaddr_t *vstartp, vaddr_t *vendp)
1458 {
1459 
1460 	*vstartp = VM_MIN_KERNEL_ADDRESS;	/* kernel is in K0SEG */
1461 	*vendp = VM_MAX_KERNEL_ADDRESS;		/* we use pmap_growkernel */
1462 }
1463 
1464 /*
1465  * pmap_steal_memory:		[ INTERFACE ]
1466  *
1467  *	Bootstrap memory allocator (alternative to vm_bootstrap_steal_memory()).
1468  *	This function allows for early dynamic memory allocation until the
1469  *	virtual memory system has been bootstrapped.  After that point, either
1470  *	kmem_alloc or malloc should be used.  This function works by stealing
1471  *	pages from the (to be) managed page pool, then implicitly mapping the
1472  *	pages (by using their k0seg addresses) and zeroing them.
1473  *
1474  *	It may be used once the physical memory segments have been pre-loaded
1475  *	into the vm_physmem[] array.  Early memory allocation MUST use this
1476  *	interface!  This cannot be used after vm_page_startup(), and will
1477  *	generate a panic if tried.
1478  *
1479  *	Note that this memory will never be freed, and in essence it is wired
1480  *	down.
1481  *
1482  *	We must adjust *vstartp and/or *vendp iff we use address space
1483  *	from the kernel virtual address range defined by pmap_virtual_space().
1484  *
1485  *	Note: no locking is necessary in this function.
1486  */
1487 vaddr_t
pmap_steal_memory(vsize_t size,vaddr_t * vstartp,vaddr_t * vendp)1488 pmap_steal_memory(vsize_t size, vaddr_t *vstartp, vaddr_t *vendp)
1489 {
1490 	int npgs;
1491 	vaddr_t va;
1492 	paddr_t pa;
1493 
1494 	uvm_physseg_t bank;
1495 
1496 	size = round_page(size);
1497 	npgs = atop(size);
1498 
1499 #if 0
1500 	printf("PSM: size 0x%lx (npgs 0x%x)\n", size, npgs);
1501 #endif
1502 
1503 	for (bank = uvm_physseg_get_first();
1504 	     uvm_physseg_valid_p(bank);
1505 	     bank = uvm_physseg_get_next(bank)) {
1506 		if (uvm.page_init_done == true)
1507 			panic("pmap_steal_memory: called _after_ bootstrap");
1508 
1509 #if 0
1510 		printf("     bank %d: avail_start 0x%"PRIxPADDR", start 0x%"PRIxPADDR", "
1511 		    "avail_end 0x%"PRIxPADDR"\n", bank, uvm_physseg_get_avail_start(bank),
1512 		    uvm_physseg_get_start(bank), uvm_physseg_get_avail_end(bank));
1513 #endif
1514 
1515 		if (uvm_physseg_get_avail_start(bank) != uvm_physseg_get_start(bank) ||
1516 		    uvm_physseg_get_avail_start(bank) >= uvm_physseg_get_avail_end(bank))
1517 			continue;
1518 
1519 #if 0
1520 		printf("             avail_end - avail_start = 0x%"PRIxPADDR"\n",
1521 		    uvm_physseg_get_avail_end(bank) - uvm_physseg_get_avail_start(bank));
1522 #endif
1523 
1524 		if (uvm_physseg_get_avail_end(bank) - uvm_physseg_get_avail_start(bank)
1525 		    < npgs)
1526 			continue;
1527 
1528 		/*
1529 		 * There are enough pages here; steal them!
1530 		 */
1531 		pa = ptoa(uvm_physseg_get_start(bank));
1532 		uvm_physseg_unplug(atop(pa), npgs);
1533 
1534 		va = ALPHA_PHYS_TO_K0SEG(pa);
1535 		memset((void *)va, 0, size);
1536 		pmap_pages_stolen += npgs;
1537 		return (va);
1538 	}
1539 
1540 	/*
1541 	 * If we got here, this was no memory left.
1542 	 */
1543 	panic("pmap_steal_memory: no memory to steal");
1544 }
1545 
1546 /*
1547  * pmap_init:			[ INTERFACE ]
1548  *
1549  *	Initialize the pmap module.  Called by vm_init(), to initialize any
1550  *	structures that the pmap system needs to map virtual memory.
1551  *
1552  *	Note: no locking is necessary in this function.
1553  */
1554 void
pmap_init(void)1555 pmap_init(void)
1556 {
1557 
1558 #ifdef DEBUG
1559 	if (pmapdebug & PDB_FOLLOW)
1560 	        printf("pmap_init()\n");
1561 #endif
1562 
1563 	/* initialize protection array */
1564 	alpha_protection_init();
1565 
1566 	/* Initialize TLB handling. */
1567 	pmap_tlb_init();
1568 
1569 	/* Instrument pmap_growkernel(). */
1570 	evcnt_attach_dynamic_nozero(&pmap_growkernel_evcnt, EVCNT_TYPE_MISC,
1571 	    NULL, "pmap", "growkernel");
1572 
1573 	/*
1574 	 * Set a low water mark on the pv_entry pool, so that we are
1575 	 * more likely to have these around even in extreme memory
1576 	 * starvation.
1577 	 */
1578 	pool_cache_setlowat(&pmap_pv_cache, pmap_pv_lowat);
1579 
1580 	/*
1581 	 * Now it is safe to enable pv entry recording.
1582 	 */
1583 	pmap_initialized = true;
1584 
1585 #if 0
1586 	for (uvm_physseg_t bank = uvm_physseg_get_first();
1587 	    uvm_physseg_valid_p(bank);
1588 	    bank = uvm_physseg_get_next(bank)) {
1589 		printf("bank %d\n", bank);
1590 		printf("\tstart = 0x%lx\n", ptoa(uvm_physseg_get_start(bank)));
1591 		printf("\tend = 0x%lx\n", ptoa(uvm_physseg_get_end(bank)));
1592 		printf("\tavail_start = 0x%lx\n",
1593 		    ptoa(uvm_physseg_get_avail_start(bank)));
1594 		printf("\tavail_end = 0x%lx\n",
1595 		    ptoa(uvm_physseg_get_avail_end(bank)));
1596 	}
1597 #endif
1598 }
1599 
1600 /*
1601  * pmap_create:			[ INTERFACE ]
1602  *
1603  *	Create and return a physical map.
1604  *
1605  *	Note: no locking is necessary in this function.
1606  */
1607 pmap_t
pmap_create(void)1608 pmap_create(void)
1609 {
1610 	pmap_t pmap;
1611 	pt_entry_t *lev1map;
1612 	int i;
1613 
1614 #ifdef DEBUG
1615 	if (pmapdebug & (PDB_FOLLOW|PDB_CREATE))
1616 		printf("pmap_create()\n");
1617 #endif
1618 
1619 	pmap = pool_cache_get(&pmap_pmap_cache, PR_WAITOK);
1620 	memset(pmap, 0, sizeof(*pmap));
1621 	LIST_INIT(&pmap->pm_ptpages);
1622 	LIST_INIT(&pmap->pm_pvents);
1623 
1624 	atomic_store_relaxed(&pmap->pm_count, 1);
1625 
1626  try_again:
1627 	rw_enter(&pmap_growkernel_lock, RW_READER);
1628 
1629 	lev1map = pool_cache_get(&pmap_l1pt_cache, PR_NOWAIT);
1630 	if (__predict_false(lev1map == NULL)) {
1631 		rw_exit(&pmap_growkernel_lock);
1632 		(void) kpause("pmap_create", false, hz >> 2, NULL);
1633 		goto try_again;
1634 	}
1635 
1636 	/*
1637 	 * There are only kernel mappings at this point; give the pmap
1638 	 * the kernel ASN.  This will be initialized to correct values
1639 	 * when the pmap is activated.
1640 	 *
1641 	 * We stash a pointer to the pmap's lev1map in each CPU's
1642 	 * private data.  It remains constant for the life of the
1643 	 * pmap, and gives us more room in the shared pmap structure.
1644 	 */
1645 	for (i = 0; i < pmap_ncpuids; i++) {
1646 		pmap->pm_percpu[i].pmc_asn = PMAP_ASN_KERNEL;
1647 		pmap->pm_percpu[i].pmc_asngen = PMAP_ASNGEN_INVALID;
1648 		pmap->pm_percpu[i].pmc_lev1map = lev1map;
1649 	}
1650 
1651 	mutex_enter(&pmap_all_pmaps_lock);
1652 	TAILQ_INSERT_TAIL(&pmap_all_pmaps, pmap, pm_list);
1653 	mutex_exit(&pmap_all_pmaps_lock);
1654 
1655 	rw_exit(&pmap_growkernel_lock);
1656 
1657 	return (pmap);
1658 }
1659 
1660 /*
1661  * pmap_destroy:		[ INTERFACE ]
1662  *
1663  *	Drop the reference count on the specified pmap, releasing
1664  *	all resources if the reference count drops to zero.
1665  */
1666 void
pmap_destroy(pmap_t pmap)1667 pmap_destroy(pmap_t pmap)
1668 {
1669 
1670 #ifdef DEBUG
1671 	if (pmapdebug & PDB_FOLLOW)
1672 		printf("pmap_destroy(%p)\n", pmap);
1673 #endif
1674 
1675 	PMAP_MP(membar_release());
1676 	KASSERT(atomic_load_relaxed(&pmap->pm_count) > 0);
1677 	if (atomic_dec_uint_nv(&pmap->pm_count) > 0)
1678 		return;
1679 	PMAP_MP(membar_acquire());
1680 
1681 	pt_entry_t *lev1map = pmap_lev1map(pmap);
1682 
1683 	rw_enter(&pmap_growkernel_lock, RW_READER);
1684 
1685 	/*
1686 	 * Remove it from the global list of all pmaps.
1687 	 */
1688 	mutex_enter(&pmap_all_pmaps_lock);
1689 	TAILQ_REMOVE(&pmap_all_pmaps, pmap, pm_list);
1690 	mutex_exit(&pmap_all_pmaps_lock);
1691 
1692 	pool_cache_put(&pmap_l1pt_cache, lev1map);
1693 #ifdef DIAGNOSTIC
1694 	int i;
1695 	for (i = 0; i < pmap_ncpuids; i++) {
1696 		pmap->pm_percpu[i].pmc_lev1map = (pt_entry_t *)0xdeadbeefUL;
1697 	}
1698 #endif /* DIAGNOSTIC */
1699 
1700 	rw_exit(&pmap_growkernel_lock);
1701 
1702 	pool_cache_put(&pmap_pmap_cache, pmap);
1703 }
1704 
1705 /*
1706  * pmap_reference:		[ INTERFACE ]
1707  *
1708  *	Add a reference to the specified pmap.
1709  */
1710 void
pmap_reference(pmap_t pmap)1711 pmap_reference(pmap_t pmap)
1712 {
1713 	unsigned int newcount __diagused;
1714 
1715 #ifdef DEBUG
1716 	if (pmapdebug & PDB_FOLLOW)
1717 		printf("pmap_reference(%p)\n", pmap);
1718 #endif
1719 
1720 	newcount = atomic_inc_uint_nv(&pmap->pm_count);
1721 	KASSERT(newcount != 0);
1722 }
1723 
1724 /*
1725  * pmap_remove:			[ INTERFACE ]
1726  *
1727  *	Remove the given range of addresses from the specified map.
1728  *
1729  *	It is assumed that the start and end are properly
1730  *	rounded to the page size.
1731  */
1732 static void
pmap_remove_internal(pmap_t pmap,vaddr_t sva,vaddr_t eva,struct pmap_tlb_context * const tlbctx)1733 pmap_remove_internal(pmap_t pmap, vaddr_t sva, vaddr_t eva,
1734     struct pmap_tlb_context * const tlbctx)
1735 {
1736 	pt_entry_t *l1pte, *l2pte, *l3pte;
1737 	pt_entry_t *saved_l2pte, *saved_l3pte;
1738 	vaddr_t l1eva, l2eva, l3vptva;
1739 	pt_entry_t pte_bits;
1740 
1741 #ifdef DEBUG
1742 	if (pmapdebug & (PDB_FOLLOW|PDB_REMOVE|PDB_PROTECT))
1743 		printf("pmap_remove(%p, %lx, %lx)\n", pmap, sva, eva);
1744 #endif
1745 
1746 	/*
1747 	 * If this is the kernel pmap, we can use a faster method
1748 	 * for accessing the PTEs (since the PT pages are always
1749 	 * resident).
1750 	 *
1751 	 * Note that this routine should NEVER be called from an
1752 	 * interrupt context; pmap_kremove() is used for that.
1753 	 */
1754 	if (pmap == pmap_kernel()) {
1755 		PMAP_MAP_TO_HEAD_LOCK();
1756 		PMAP_LOCK(pmap);
1757 
1758 		while (sva < eva) {
1759 			l3pte = PMAP_KERNEL_PTE(sva);
1760 			if (pmap_pte_v(l3pte)) {
1761 				pte_bits = pmap_remove_mapping(pmap, sva,
1762 				    l3pte, true, NULL, tlbctx);
1763 				pmap_tlb_shootdown(pmap, sva, pte_bits,
1764 				    tlbctx);
1765 			}
1766 			sva += PAGE_SIZE;
1767 		}
1768 
1769 		PMAP_MAP_TO_HEAD_UNLOCK();
1770 		PMAP_UNLOCK(pmap);
1771 		pmap_tlb_shootnow(tlbctx);
1772 		/* kernel PT pages are never freed. */
1773 		KASSERT(LIST_EMPTY(&tlbctx->t_freeptq));
1774 		/* ...but we might have freed PV entries. */
1775 		pmap_tlb_context_drain(tlbctx);
1776 		TLB_COUNT(reason_remove_kernel);
1777 
1778 		return;
1779 	}
1780 
1781 	pt_entry_t * const lev1map = pmap_lev1map(pmap);
1782 
1783 	KASSERT(sva < VM_MAXUSER_ADDRESS);
1784 	KASSERT(eva <= VM_MAXUSER_ADDRESS);
1785 	KASSERT(lev1map != kernel_lev1map);
1786 
1787 	PMAP_MAP_TO_HEAD_LOCK();
1788 	PMAP_LOCK(pmap);
1789 
1790 	l1pte = pmap_l1pte(lev1map, sva);
1791 
1792 	for (; sva < eva; sva = l1eva, l1pte++) {
1793 		l1eva = alpha_trunc_l1seg(sva) + ALPHA_L1SEG_SIZE;
1794 		if (pmap_pte_v(l1pte)) {
1795 			saved_l2pte = l2pte = pmap_l2pte(lev1map, sva, l1pte);
1796 
1797 			/*
1798 			 * Add a reference to the L2 table so it won't
1799 			 * get removed from under us.
1800 			 */
1801 			pmap_physpage_addref(saved_l2pte);
1802 
1803 			for (; sva < l1eva && sva < eva; sva = l2eva, l2pte++) {
1804 				l2eva =
1805 				    alpha_trunc_l2seg(sva) + ALPHA_L2SEG_SIZE;
1806 				if (pmap_pte_v(l2pte)) {
1807 					saved_l3pte = l3pte =
1808 					    pmap_l3pte(lev1map, sva, l2pte);
1809 
1810 					/*
1811 					 * Add a reference to the L3 table so
1812 					 * it won't get removed from under us.
1813 					 */
1814 					pmap_physpage_addref(saved_l3pte);
1815 
1816 					/*
1817 					 * Remember this sva; if the L3 table
1818 					 * gets removed, we need to invalidate
1819 					 * the VPT TLB entry for it.
1820 					 */
1821 					l3vptva = sva;
1822 
1823 					for (; sva < l2eva && sva < eva;
1824 					     sva += PAGE_SIZE, l3pte++) {
1825 						if (!pmap_pte_v(l3pte)) {
1826 							continue;
1827 						}
1828 						pte_bits =
1829 						    pmap_remove_mapping(
1830 							pmap, sva,
1831 							l3pte, true,
1832 							NULL, tlbctx);
1833 						pmap_tlb_shootdown(pmap,
1834 						    sva, pte_bits, tlbctx);
1835 					}
1836 
1837 					/*
1838 					 * Remove the reference to the L3
1839 					 * table that we added above.  This
1840 					 * may free the L3 table.
1841 					 */
1842 					pmap_l3pt_delref(pmap, l3vptva,
1843 					    saved_l3pte, tlbctx);
1844 				}
1845 			}
1846 
1847 			/*
1848 			 * Remove the reference to the L2 table that we
1849 			 * added above.  This may free the L2 table.
1850 			 */
1851 			pmap_l2pt_delref(pmap, l1pte, saved_l2pte, tlbctx);
1852 		}
1853 	}
1854 
1855 	PMAP_MAP_TO_HEAD_UNLOCK();
1856 	PMAP_UNLOCK(pmap);
1857 	pmap_tlb_shootnow(tlbctx);
1858 	pmap_tlb_context_drain(tlbctx);
1859 	TLB_COUNT(reason_remove_user);
1860 }
1861 
1862 void
pmap_remove(pmap_t pmap,vaddr_t sva,vaddr_t eva)1863 pmap_remove(pmap_t pmap, vaddr_t sva, vaddr_t eva)
1864 {
1865 	struct pmap_tlb_context tlbctx;
1866 
1867 	pmap_tlb_context_init(&tlbctx, 0);
1868 	pmap_remove_internal(pmap, sva, eva, &tlbctx);
1869 }
1870 
1871 /*
1872  * pmap_remove_all:		[ INTERFACE ]
1873  *
1874  *	Remove all mappings from a pmap in bulk.  This is only called
1875  *	when it's known that the address space is no longer visible to
1876  *	any user process (e.g. during exit or exec).
1877  */
1878 bool
pmap_remove_all(pmap_t pmap)1879 pmap_remove_all(pmap_t pmap)
1880 {
1881 	struct pmap_tlb_context tlbctx;
1882 	struct vm_page *pg;
1883 	pv_entry_t pv;
1884 
1885 	KASSERT(pmap != pmap_kernel());
1886 
1887 	/*
1888 	 * This process is pretty simple:
1889 	 *
1890 	 * ==> (1) Zero out the user-space portion of the lev1map.
1891 	 *
1892 	 * ==> (2) Copy the PT page list to the tlbctx and re-init.
1893 	 *
1894 	 * ==> (3) Walk the PV entry list and remove each entry.
1895 	 *
1896 	 * ==> (4) Zero the wired and resident count.
1897 	 *
1898 	 * Once we've done that, we just need to free everything
1899 	 * back to the system.
1900 	 */
1901 
1902 	pmap_tlb_context_init(&tlbctx, 0);
1903 
1904 	PMAP_MAP_TO_HEAD_LOCK();
1905 	PMAP_LOCK(pmap);
1906 
1907 	/* Step 1 */
1908 	pt_entry_t * const lev1map = pmap_lev1map(pmap);
1909 	memset(lev1map, 0,
1910 	       l1pte_index(VM_MAXUSER_ADDRESS) * sizeof(pt_entry_t));
1911 
1912 	/* Step 2 */
1913 	LIST_MOVE(&pmap->pm_ptpages, &tlbctx.t_freeptq, pageq.list);
1914 
1915 	/* Fix up the reference count on the lev1map page. */
1916 	pg = PHYS_TO_VM_PAGE(ALPHA_K0SEG_TO_PHYS((vaddr_t)lev1map));
1917 	PHYSPAGE_REFCNT_SET(pg, 0);
1918 
1919 	/* Step 3 */
1920 	while ((pv = LIST_FIRST(&pmap->pm_pvents)) != NULL) {
1921 		KASSERT(pv->pv_pmap == pmap);
1922 		pmap_pv_remove(pmap, PHYS_TO_VM_PAGE(pmap_pte_pa(pv->pv_pte)),
1923 		    pv->pv_va, true, NULL, &tlbctx);
1924 	}
1925 
1926 	/* Step 4 */
1927 	atomic_store_relaxed(&pmap->pm_stats.wired_count, 0);
1928 	atomic_store_relaxed(&pmap->pm_stats.resident_count, 0);
1929 
1930 	pmap_tlb_shootdown_all_user(pmap, PG_EXEC, &tlbctx);
1931 
1932 	PMAP_UNLOCK(pmap);
1933 	PMAP_MAP_TO_HEAD_UNLOCK();
1934 
1935 	pmap_tlb_shootnow(&tlbctx);
1936 	pmap_tlb_context_drain(&tlbctx);
1937 	TLB_COUNT(reason_remove_all_user);
1938 
1939 	return true;
1940 }
1941 
1942 /*
1943  * pmap_page_protect:		[ INTERFACE ]
1944  *
1945  *	Lower the permission for all mappings to a given page to
1946  *	the permissions specified.
1947  */
1948 void
pmap_page_protect(struct vm_page * pg,vm_prot_t prot)1949 pmap_page_protect(struct vm_page *pg, vm_prot_t prot)
1950 {
1951 	pv_entry_t pv, nextpv;
1952 	pt_entry_t opte;
1953 	kmutex_t *lock;
1954 	struct pmap_tlb_context tlbctx;
1955 
1956 #ifdef DEBUG
1957 	if ((pmapdebug & (PDB_FOLLOW|PDB_PROTECT)) ||
1958 	    (prot == VM_PROT_NONE && (pmapdebug & PDB_REMOVE)))
1959 		printf("pmap_page_protect(%p, %x)\n", pg, prot);
1960 #endif
1961 
1962 	pmap_tlb_context_init(&tlbctx, TLB_CTX_F_PV);
1963 
1964 	switch (prot) {
1965 	case VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE:
1966 	case VM_PROT_READ|VM_PROT_WRITE:
1967 		return;
1968 
1969 	/* copy_on_write */
1970 	case VM_PROT_READ|VM_PROT_EXECUTE:
1971 	case VM_PROT_READ:
1972 		PMAP_HEAD_TO_MAP_LOCK();
1973 		lock = pmap_pvh_lock(pg);
1974 		mutex_enter(lock);
1975 		for (pv = VM_MDPAGE_PVS(pg); pv != NULL; pv = pv->pv_next) {
1976 			PMAP_LOCK(pv->pv_pmap);
1977 			opte = atomic_load_relaxed(pv->pv_pte);
1978 			if (opte & (PG_KWE | PG_UWE)) {
1979 				atomic_store_relaxed(pv->pv_pte,
1980 				    opte & ~(PG_KWE | PG_UWE));
1981 				pmap_tlb_shootdown_pv(pv->pv_pmap, pv->pv_va,
1982 				    opte, &tlbctx);
1983 			}
1984 			PMAP_UNLOCK(pv->pv_pmap);
1985 		}
1986 		mutex_exit(lock);
1987 		PMAP_HEAD_TO_MAP_UNLOCK();
1988 		pmap_tlb_shootnow(&tlbctx);
1989 		TLB_COUNT(reason_page_protect_read);
1990 		return;
1991 
1992 	/* remove_all */
1993 	default:
1994 		break;
1995 	}
1996 
1997 	PMAP_HEAD_TO_MAP_LOCK();
1998 	lock = pmap_pvh_lock(pg);
1999 	mutex_enter(lock);
2000 	for (pv = VM_MDPAGE_PVS(pg); pv != NULL; pv = nextpv) {
2001 		pt_entry_t pte_bits;
2002 		pmap_t pmap;
2003 		vaddr_t va;
2004 
2005 		nextpv = pv->pv_next;
2006 
2007 		PMAP_LOCK(pv->pv_pmap);
2008 		pmap = pv->pv_pmap;
2009 		va = pv->pv_va;
2010 		pte_bits = pmap_remove_mapping(pmap, va, pv->pv_pte,
2011 		    false, NULL, &tlbctx);
2012 		pmap_tlb_shootdown_pv(pmap, va, pte_bits, &tlbctx);
2013 		PMAP_UNLOCK(pv->pv_pmap);
2014 	}
2015 	mutex_exit(lock);
2016 	PMAP_HEAD_TO_MAP_UNLOCK();
2017 	pmap_tlb_shootnow(&tlbctx);
2018 	pmap_tlb_context_drain(&tlbctx);
2019 	TLB_COUNT(reason_page_protect_none);
2020 }
2021 
2022 /*
2023  * pmap_protect:		[ INTERFACE ]
2024  *
2025  *	Set the physical protection on the specified range of this map
2026  *	as requested.
2027  */
2028 void
pmap_protect(pmap_t pmap,vaddr_t sva,vaddr_t eva,vm_prot_t prot)2029 pmap_protect(pmap_t pmap, vaddr_t sva, vaddr_t eva, vm_prot_t prot)
2030 {
2031 	pt_entry_t *l1pte, *l2pte, *l3pte, opte;
2032 	vaddr_t l1eva, l2eva;
2033 	struct pmap_tlb_context tlbctx;
2034 
2035 #ifdef DEBUG
2036 	if (pmapdebug & (PDB_FOLLOW|PDB_PROTECT))
2037 		printf("pmap_protect(%p, %lx, %lx, %x)\n",
2038 		    pmap, sva, eva, prot);
2039 #endif
2040 
2041 	pmap_tlb_context_init(&tlbctx, 0);
2042 
2043 	if ((prot & VM_PROT_READ) == VM_PROT_NONE) {
2044 		pmap_remove_internal(pmap, sva, eva, &tlbctx);
2045 		return;
2046 	}
2047 
2048 	const pt_entry_t bits = pte_prot(pmap, prot);
2049 	pt_entry_t * const lev1map = pmap_lev1map(pmap);
2050 
2051 	PMAP_LOCK(pmap);
2052 
2053 	l1pte = pmap_l1pte(lev1map, sva);
2054 	for (; sva < eva; sva = l1eva, l1pte++) {
2055 		l1eva = alpha_trunc_l1seg(sva) + ALPHA_L1SEG_SIZE;
2056 		if (pmap_pte_v(l1pte)) {
2057 			l2pte = pmap_l2pte(lev1map, sva, l1pte);
2058 			for (; sva < l1eva && sva < eva; sva = l2eva, l2pte++) {
2059 				l2eva =
2060 				    alpha_trunc_l2seg(sva) + ALPHA_L2SEG_SIZE;
2061 				if (pmap_pte_v(l2pte)) {
2062 					l3pte = pmap_l3pte(lev1map, sva, l2pte);
2063 					for (; sva < l2eva && sva < eva;
2064 					     sva += PAGE_SIZE, l3pte++) {
2065 						if (pmap_pte_v(l3pte) &&
2066 						    pmap_pte_prot_chg(l3pte,
2067 								      bits)) {
2068 							opte = atomic_load_relaxed(l3pte);
2069 							pmap_pte_set_prot(l3pte,
2070 							   bits);
2071 							pmap_tlb_shootdown(pmap,
2072 							    sva, opte, &tlbctx);
2073 						}
2074 					}
2075 				}
2076 			}
2077 		}
2078 	}
2079 
2080 	PMAP_UNLOCK(pmap);
2081 	pmap_tlb_shootnow(&tlbctx);
2082 	TLB_COUNT(reason_protect);
2083 }
2084 
2085 /*
2086  * pmap_enter_tlb_shootdown:
2087  *
2088  *	Carry out a TLB shootdown on behalf of a pmap_enter()
2089  *	or a pmap_kenter_pa().  This is factored out separately
2090  *	because we expect it to be not a common case.
2091  */
2092 static void __noinline
pmap_enter_tlb_shootdown(pmap_t const pmap,vaddr_t const va,pt_entry_t const pte_bits,bool locked)2093 pmap_enter_tlb_shootdown(pmap_t const pmap, vaddr_t const va,
2094     pt_entry_t const pte_bits, bool locked)
2095 {
2096 	struct pmap_tlb_context tlbctx;
2097 
2098 	pmap_tlb_context_init(&tlbctx, 0);
2099 	pmap_tlb_shootdown(pmap, va, pte_bits, &tlbctx);
2100 	if (locked) {
2101 		PMAP_UNLOCK(pmap);
2102 	}
2103 	pmap_tlb_shootnow(&tlbctx);
2104 }
2105 
2106 /*
2107  * pmap_enter_l2pt_delref:
2108  *
2109  *	Release a reference on an L2 PT page for pmap_enter().
2110  *	This is factored out separately because we expect it
2111  *	to be a rare case.
2112  */
2113 static void __noinline
pmap_enter_l2pt_delref(pmap_t const pmap,pt_entry_t * const l1pte,pt_entry_t * const l2pte)2114 pmap_enter_l2pt_delref(pmap_t const pmap, pt_entry_t * const l1pte,
2115     pt_entry_t * const l2pte)
2116 {
2117 	struct pmap_tlb_context tlbctx;
2118 
2119 	/*
2120 	 * PALcode may have tried to service a TLB miss with
2121 	 * this L2 PTE, so we need to make sure we don't actually
2122 	 * free the PT page until we've shot down any TLB entries
2123 	 * for this VPT index.
2124 	 */
2125 
2126 	pmap_tlb_context_init(&tlbctx, 0);
2127 	pmap_l2pt_delref(pmap, l1pte, l2pte, &tlbctx);
2128 	PMAP_UNLOCK(pmap);
2129 	pmap_tlb_shootnow(&tlbctx);
2130 	pmap_tlb_context_drain(&tlbctx);
2131 	TLB_COUNT(reason_enter_l2pt_delref);
2132 }
2133 
2134 /*
2135  * pmap_enter_l3pt_delref:
2136  *
2137  *	Release a reference on an L3 PT page for pmap_enter().
2138  *	This is factored out separately because we expect it
2139  *	to be a rare case.
2140  */
2141 static void __noinline
pmap_enter_l3pt_delref(pmap_t const pmap,vaddr_t const va,pt_entry_t * const pte)2142 pmap_enter_l3pt_delref(pmap_t const pmap, vaddr_t const va,
2143     pt_entry_t * const pte)
2144 {
2145 	struct pmap_tlb_context tlbctx;
2146 
2147 	/*
2148 	 * PALcode may have tried to service a TLB miss with
2149 	 * this PTE, so we need to make sure we don't actually
2150 	 * free the PT page until we've shot down any TLB entries
2151 	 * for this VPT index.
2152 	 */
2153 
2154 	pmap_tlb_context_init(&tlbctx, 0);
2155 	pmap_l3pt_delref(pmap, va, pte, &tlbctx);
2156 	PMAP_UNLOCK(pmap);
2157 	pmap_tlb_shootnow(&tlbctx);
2158 	pmap_tlb_context_drain(&tlbctx);
2159 	TLB_COUNT(reason_enter_l3pt_delref);
2160 }
2161 
2162 /*
2163  * pmap_enter:			[ INTERFACE ]
2164  *
2165  *	Insert the given physical page (p) at
2166  *	the specified virtual address (v) in the
2167  *	target physical map with the protection requested.
2168  *
2169  *	If specified, the page will be wired down, meaning
2170  *	that the related pte can not be reclaimed.
2171  *
2172  *	Note:  This is the only routine which MAY NOT lazy-evaluate
2173  *	or lose information.  That is, this routine must actually
2174  *	insert this page into the given map NOW.
2175  */
2176 int
pmap_enter(pmap_t pmap,vaddr_t va,paddr_t pa,vm_prot_t prot,u_int flags)2177 pmap_enter(pmap_t pmap, vaddr_t va, paddr_t pa, vm_prot_t prot, u_int flags)
2178 {
2179 	pt_entry_t *pte, npte, opte;
2180 	pv_entry_t opv = NULL;
2181 	paddr_t opa;
2182 	bool tflush = false;
2183 	int error = 0;
2184 	kmutex_t *lock;
2185 
2186 #ifdef DEBUG
2187 	if (pmapdebug & (PDB_FOLLOW|PDB_ENTER))
2188 		printf("pmap_enter(%p, %lx, %lx, %x, %x)\n",
2189 		       pmap, va, pa, prot, flags);
2190 #endif
2191 	struct vm_page * const pg = PHYS_TO_VM_PAGE(pa);
2192 	const bool wired = (flags & PMAP_WIRED) != 0;
2193 
2194 	PMAP_MAP_TO_HEAD_LOCK();
2195 	PMAP_LOCK(pmap);
2196 
2197 	if (pmap == pmap_kernel()) {
2198 		KASSERT(va >= VM_MIN_KERNEL_ADDRESS);
2199 		pte = PMAP_KERNEL_PTE(va);
2200 	} else {
2201 		pt_entry_t *l1pte, *l2pte;
2202 		pt_entry_t * const lev1map = pmap_lev1map(pmap);
2203 
2204 		KASSERT(va < VM_MAXUSER_ADDRESS);
2205 		KASSERT(lev1map != kernel_lev1map);
2206 
2207 		/*
2208 		 * Check to see if the level 1 PTE is valid, and
2209 		 * allocate a new level 2 page table page if it's not.
2210 		 * A reference will be added to the level 2 table when
2211 		 * the level 3 table is created.
2212 		 */
2213 		l1pte = pmap_l1pte(lev1map, va);
2214 		if (pmap_pte_v(l1pte) == 0) {
2215 			pmap_physpage_addref(l1pte);
2216 			error = pmap_ptpage_alloc(pmap, l1pte, PGU_L2PT);
2217 			if (error) {
2218 				pmap_l1pt_delref(pmap, l1pte);
2219 				if (flags & PMAP_CANFAIL)
2220 					goto out;
2221 				panic("pmap_enter: unable to create L2 PT "
2222 				    "page");
2223 			}
2224 #ifdef DEBUG
2225 			if (pmapdebug & PDB_PTPAGE)
2226 				printf("pmap_enter: new level 2 table at "
2227 				    "0x%lx\n", pmap_pte_pa(l1pte));
2228 #endif
2229 		}
2230 
2231 		/*
2232 		 * Check to see if the level 2 PTE is valid, and
2233 		 * allocate a new level 3 page table page if it's not.
2234 		 * A reference will be added to the level 3 table when
2235 		 * the mapping is validated.
2236 		 */
2237 		l2pte = pmap_l2pte(lev1map, va, l1pte);
2238 		if (pmap_pte_v(l2pte) == 0) {
2239 			pmap_physpage_addref(l2pte);
2240 			error = pmap_ptpage_alloc(pmap, l2pte, PGU_L3PT);
2241 			if (error) {
2242 				/* unlocks pmap */
2243 				pmap_enter_l2pt_delref(pmap, l1pte, l2pte);
2244 				if (flags & PMAP_CANFAIL) {
2245 					PMAP_LOCK(pmap);
2246 					goto out;
2247 				}
2248 				panic("pmap_enter: unable to create L3 PT "
2249 				    "page");
2250 			}
2251 #ifdef DEBUG
2252 			if (pmapdebug & PDB_PTPAGE)
2253 				printf("pmap_enter: new level 3 table at "
2254 				    "0x%lx\n", pmap_pte_pa(l2pte));
2255 #endif
2256 		}
2257 
2258 		/*
2259 		 * Get the PTE that will map the page.
2260 		 */
2261 		pte = pmap_l3pte(lev1map, va, l2pte);
2262 	}
2263 
2264 	/* Remember all of the old PTE; used for TBI check later. */
2265 	opte = atomic_load_relaxed(pte);
2266 
2267 	/*
2268 	 * Check to see if the old mapping is valid.  If not, validate the
2269 	 * new one immediately.
2270 	 */
2271 	if ((opte & PG_V) == 0) {
2272 		/* No TLB invalidations needed for new mappings. */
2273 
2274 		if (pmap != pmap_kernel()) {
2275 			/*
2276 			 * New mappings gain a reference on the level 3
2277 			 * table.
2278 			 */
2279 			pmap_physpage_addref(pte);
2280 		}
2281 		goto validate_enterpv;
2282 	}
2283 
2284 	opa = pmap_pte_pa(pte);
2285 
2286 	if (opa == pa) {
2287 		/*
2288 		 * Mapping has not changed; must be a protection or
2289 		 * wiring change.
2290 		 */
2291 		if (pmap_pte_w_chg(pte, wired ? PG_WIRED : 0)) {
2292 #ifdef DEBUG
2293 			if (pmapdebug & PDB_ENTER)
2294 				printf("pmap_enter: wiring change -> %d\n",
2295 				    wired);
2296 #endif
2297 			/* Adjust the wiring count. */
2298 			if (wired)
2299 				PMAP_STAT_INCR(pmap->pm_stats.wired_count, 1);
2300 			else
2301 				PMAP_STAT_DECR(pmap->pm_stats.wired_count, 1);
2302 		}
2303 
2304 		/* Set the PTE. */
2305 		goto validate;
2306 	}
2307 
2308 	/*
2309 	 * The mapping has changed.  We need to invalidate the
2310 	 * old mapping before creating the new one.
2311 	 */
2312 #ifdef DEBUG
2313 	if (pmapdebug & PDB_ENTER)
2314 		printf("pmap_enter: removing old mapping 0x%lx\n", va);
2315 #endif
2316 	if (pmap != pmap_kernel()) {
2317 		/*
2318 		 * Gain an extra reference on the level 3 table.
2319 		 * pmap_remove_mapping() will delete a reference,
2320 		 * and we don't want the table to be erroneously
2321 		 * freed.
2322 		 */
2323 		pmap_physpage_addref(pte);
2324 	}
2325 	/* Already have the bits from opte above. */
2326 	(void) pmap_remove_mapping(pmap, va, pte, true, &opv, NULL);
2327 
2328  validate_enterpv:
2329 	/* Enter the mapping into the pv_table if appropriate. */
2330 	if (pg != NULL) {
2331 		error = pmap_pv_enter(pmap, pg, va, pte, true, opv);
2332 		if (error) {
2333 			/* This can only fail if opv == NULL */
2334 			KASSERT(opv == NULL);
2335 
2336 			/* unlocks pmap */
2337 			pmap_enter_l3pt_delref(pmap, va, pte);
2338 			if (flags & PMAP_CANFAIL) {
2339 				PMAP_LOCK(pmap);
2340 				goto out;
2341 			}
2342 			panic("pmap_enter: unable to enter mapping in PV "
2343 			    "table");
2344 		}
2345 		opv = NULL;
2346 	}
2347 
2348 	/* Increment counters. */
2349 	PMAP_STAT_INCR(pmap->pm_stats.resident_count, 1);
2350 	if (wired)
2351 		PMAP_STAT_INCR(pmap->pm_stats.wired_count, 1);
2352 
2353  validate:
2354 	/* Build the new PTE. */
2355 	npte = ((pa >> PGSHIFT) << PG_SHIFT) | pte_prot(pmap, prot) | PG_V;
2356 	if (pg != NULL) {
2357 		struct vm_page_md * const md = VM_PAGE_TO_MD(pg);
2358 		uintptr_t attrs = 0;
2359 
2360 		KASSERT(((flags & VM_PROT_ALL) & ~prot) == 0);
2361 
2362 		if (flags & VM_PROT_WRITE)
2363 			attrs |= (PGA_REFERENCED|PGA_MODIFIED);
2364 		else if (flags & VM_PROT_ALL)
2365 			attrs |= PGA_REFERENCED;
2366 
2367 		lock = pmap_pvh_lock(pg);
2368 		mutex_enter(lock);
2369 		attrs = (md->pvh_listx |= attrs);
2370 		mutex_exit(lock);
2371 
2372 		/* Set up referenced/modified emulation for new mapping. */
2373 		if ((attrs & PGA_REFERENCED) == 0)
2374 			npte |= PG_FOR | PG_FOW | PG_FOE;
2375 		else if ((attrs & PGA_MODIFIED) == 0)
2376 			npte |= PG_FOW;
2377 
2378 		/*
2379 		 * Mapping was entered on PV list.
2380 		 */
2381 		npte |= PG_PVLIST;
2382 	}
2383 	if (wired)
2384 		npte |= PG_WIRED;
2385 #ifdef DEBUG
2386 	if (pmapdebug & PDB_ENTER)
2387 		printf("pmap_enter: new pte = 0x%lx\n", npte);
2388 #endif
2389 
2390 	/*
2391 	 * If the HW / PALcode portion of the new PTE is the same as the
2392 	 * old PTE, no TBI is necessary.
2393 	 */
2394 	if (opte & PG_V) {
2395 		tflush = PG_PALCODE(opte) != PG_PALCODE(npte);
2396 	}
2397 
2398 	/* Set the new PTE. */
2399 	atomic_store_relaxed(pte, npte);
2400 
2401 out:
2402 	PMAP_MAP_TO_HEAD_UNLOCK();
2403 
2404 	/*
2405 	 * Invalidate the TLB entry for this VA and any appropriate
2406 	 * caches.
2407 	 */
2408 	if (tflush) {
2409 		/* unlocks pmap */
2410 		pmap_enter_tlb_shootdown(pmap, va, opte, true);
2411 		if (pmap == pmap_kernel()) {
2412 			TLB_COUNT(reason_enter_kernel);
2413 		} else {
2414 			TLB_COUNT(reason_enter_user);
2415 		}
2416 	} else {
2417 		PMAP_UNLOCK(pmap);
2418 	}
2419 
2420 	if (opv)
2421 		pmap_pv_free(opv);
2422 
2423 	return error;
2424 }
2425 
2426 /*
2427  * pmap_kenter_pa:		[ INTERFACE ]
2428  *
2429  *	Enter a va -> pa mapping into the kernel pmap without any
2430  *	physical->virtual tracking.
2431  *
2432  *	Note: no locking is necessary in this function.
2433  */
2434 void
pmap_kenter_pa(vaddr_t va,paddr_t pa,vm_prot_t prot,u_int flags)2435 pmap_kenter_pa(vaddr_t va, paddr_t pa, vm_prot_t prot, u_int flags)
2436 {
2437 	pmap_t const pmap = pmap_kernel();
2438 
2439 #ifdef DEBUG
2440 	if (pmapdebug & (PDB_FOLLOW|PDB_ENTER))
2441 		printf("pmap_kenter_pa(%lx, %lx, %x)\n",
2442 		    va, pa, prot);
2443 #endif
2444 
2445 	KASSERT(va >= VM_MIN_KERNEL_ADDRESS);
2446 
2447 	pt_entry_t * const pte = PMAP_KERNEL_PTE(va);
2448 
2449 	/* Build the new PTE. */
2450 	const pt_entry_t npte =
2451 	    ((pa >> PGSHIFT) << PG_SHIFT) | pte_prot(pmap_kernel(), prot) |
2452 	    PG_V | PG_WIRED;
2453 
2454 	/* Set the new PTE. */
2455 	const pt_entry_t opte = atomic_load_relaxed(pte);
2456 	atomic_store_relaxed(pte, npte);
2457 
2458 	PMAP_STAT_INCR(pmap->pm_stats.resident_count, 1);
2459 	PMAP_STAT_INCR(pmap->pm_stats.wired_count, 1);
2460 
2461 	/*
2462 	 * There should not have been anything here, previously,
2463 	 * so we can skip TLB shootdowns, etc. in the common case.
2464 	 */
2465 	if (__predict_false(opte & PG_V)) {
2466 		const pt_entry_t diff = npte ^ opte;
2467 
2468 		printf_nolog("%s: mapping already present\n", __func__);
2469 		PMAP_STAT_DECR(pmap->pm_stats.resident_count, 1);
2470 		if (diff & PG_WIRED)
2471 			PMAP_STAT_DECR(pmap->pm_stats.wired_count, 1);
2472 		/* XXX Can't handle this case. */
2473 		if (diff & PG_PVLIST)
2474 			panic("pmap_kenter_pa: old mapping was managed");
2475 
2476 		pmap_enter_tlb_shootdown(pmap_kernel(), va, opte, false);
2477 		TLB_COUNT(reason_kenter);
2478 	}
2479 }
2480 
2481 /*
2482  * pmap_kremove:		[ INTERFACE ]
2483  *
2484  *	Remove a mapping entered with pmap_kenter_pa() starting at va,
2485  *	for size bytes (assumed to be page rounded).
2486  */
2487 void
pmap_kremove(vaddr_t va,vsize_t size)2488 pmap_kremove(vaddr_t va, vsize_t size)
2489 {
2490 	pt_entry_t *pte, opte;
2491 	pmap_t const pmap = pmap_kernel();
2492 	struct pmap_tlb_context tlbctx;
2493 	int count = 0;
2494 
2495 #ifdef DEBUG
2496 	if (pmapdebug & (PDB_FOLLOW|PDB_ENTER))
2497 		printf("pmap_kremove(%lx, %lx)\n",
2498 		    va, size);
2499 #endif
2500 
2501 	pmap_tlb_context_init(&tlbctx, 0);
2502 
2503 	KASSERT(va >= VM_MIN_KERNEL_ADDRESS);
2504 
2505 	for (; size != 0; size -= PAGE_SIZE, va += PAGE_SIZE) {
2506 		pte = PMAP_KERNEL_PTE(va);
2507 		opte = atomic_load_relaxed(pte);
2508 		if (opte & PG_V) {
2509 			KASSERT((opte & PG_PVLIST) == 0);
2510 
2511 			/* Zap the mapping. */
2512 			atomic_store_relaxed(pte, PG_NV);
2513 			pmap_tlb_shootdown(pmap, va, opte, &tlbctx);
2514 
2515 			count++;
2516 		}
2517 	}
2518 
2519 	/* Update stats. */
2520 	if (__predict_true(count != 0)) {
2521 		PMAP_STAT_DECR(pmap->pm_stats.resident_count, count);
2522 		PMAP_STAT_DECR(pmap->pm_stats.wired_count, count);
2523 	}
2524 
2525 	pmap_tlb_shootnow(&tlbctx);
2526 	TLB_COUNT(reason_kremove);
2527 }
2528 
2529 /*
2530  * pmap_unwire:			[ INTERFACE ]
2531  *
2532  *	Clear the wired attribute for a map/virtual-address pair.
2533  *
2534  *	The mapping must already exist in the pmap.
2535  */
2536 void
pmap_unwire(pmap_t pmap,vaddr_t va)2537 pmap_unwire(pmap_t pmap, vaddr_t va)
2538 {
2539 	pt_entry_t *pte;
2540 
2541 #ifdef DEBUG
2542 	if (pmapdebug & PDB_FOLLOW)
2543 		printf("pmap_unwire(%p, %lx)\n", pmap, va);
2544 #endif
2545 
2546 	PMAP_LOCK(pmap);
2547 
2548 	pte = pmap_l3pte(pmap_lev1map(pmap), va, NULL);
2549 
2550 	KASSERT(pte != NULL);
2551 	KASSERT(pmap_pte_v(pte));
2552 
2553 	/*
2554 	 * If wiring actually changed (always?) clear the wire bit and
2555 	 * update the wire count.  Note that wiring is not a hardware
2556 	 * characteristic so there is no need to invalidate the TLB.
2557 	 */
2558 	if (pmap_pte_w_chg(pte, 0)) {
2559 		pmap_pte_set_w(pte, false);
2560 		PMAP_STAT_DECR(pmap->pm_stats.wired_count, 1);
2561 	}
2562 #ifdef DEBUG
2563 	else {
2564 		printf("pmap_unwire: wiring for pmap %p va 0x%lx "
2565 		    "didn't change!\n", pmap, va);
2566 	}
2567 #endif
2568 
2569 	PMAP_UNLOCK(pmap);
2570 }
2571 
2572 /*
2573  * pmap_extract:		[ INTERFACE ]
2574  *
2575  *	Extract the physical address associated with the given
2576  *	pmap/virtual address pair.
2577  */
2578 bool
pmap_extract(pmap_t pmap,vaddr_t va,paddr_t * pap)2579 pmap_extract(pmap_t pmap, vaddr_t va, paddr_t *pap)
2580 {
2581 	pt_entry_t *l1pte, *l2pte, *l3pte;
2582 	paddr_t pa;
2583 
2584 #ifdef DEBUG
2585 	if (pmapdebug & PDB_FOLLOW)
2586 		printf("pmap_extract(%p, %lx) -> ", pmap, va);
2587 #endif
2588 
2589 	/*
2590 	 * Take a faster path for the kernel pmap.  Avoids locking,
2591 	 * handles K0SEG.
2592 	 */
2593 	if (__predict_true(pmap == pmap_kernel())) {
2594 #ifdef DEBUG
2595 		bool address_is_valid = vtophys_internal(va, pap);
2596 		if (pmapdebug & PDB_FOLLOW) {
2597 			if (address_is_valid) {
2598 				printf("0x%lx (kernel vtophys)\n", *pap);
2599 			} else {
2600 				printf("failed (kernel vtophys)\n");
2601 			}
2602 		}
2603 		return address_is_valid;
2604 #else
2605 		return vtophys_internal(va, pap);
2606 #endif
2607 	}
2608 
2609 	pt_entry_t * const lev1map = pmap_lev1map(pmap);
2610 
2611 	PMAP_LOCK(pmap);
2612 
2613 	l1pte = pmap_l1pte(lev1map, va);
2614 	if (pmap_pte_v(l1pte) == 0)
2615 		goto out;
2616 
2617 	l2pte = pmap_l2pte(lev1map, va, l1pte);
2618 	if (pmap_pte_v(l2pte) == 0)
2619 		goto out;
2620 
2621 	l3pte = pmap_l3pte(lev1map, va, l2pte);
2622 	if (pmap_pte_v(l3pte) == 0)
2623 		goto out;
2624 
2625 	pa = pmap_pte_pa(l3pte) | (va & PGOFSET);
2626 	PMAP_UNLOCK(pmap);
2627 	if (pap != NULL)
2628 		*pap = pa;
2629 #ifdef DEBUG
2630 	if (pmapdebug & PDB_FOLLOW)
2631 		printf("0x%lx\n", pa);
2632 #endif
2633 	return (true);
2634 
2635  out:
2636 	PMAP_UNLOCK(pmap);
2637 #ifdef DEBUG
2638 	if (pmapdebug & PDB_FOLLOW)
2639 		printf("failed\n");
2640 #endif
2641 	return (false);
2642 }
2643 
2644 /*
2645  * pmap_copy:			[ INTERFACE ]
2646  *
2647  *	Copy the mapping range specified by src_addr/len
2648  *	from the source map to the range dst_addr/len
2649  *	in the destination map.
2650  *
2651  *	This routine is only advisory and need not do anything.
2652  */
2653 /* call deleted in <machine/pmap.h> */
2654 
2655 /*
2656  * pmap_update:			[ INTERFACE ]
2657  *
2658  *	Require that all active physical maps contain no
2659  *	incorrect entries NOW, by processing any deferred
2660  *	pmap operations.
2661  */
2662 /* call deleted in <machine/pmap.h> */
2663 
2664 /*
2665  * pmap_activate:		[ INTERFACE ]
2666  *
2667  *	Activate the pmap used by the specified process.  This includes
2668  *	reloading the MMU context of the current process, and marking
2669  *	the pmap in use by the processor.
2670  */
2671 void
pmap_activate(struct lwp * l)2672 pmap_activate(struct lwp *l)
2673 {
2674 	struct pmap * const pmap = l->l_proc->p_vmspace->vm_map.pmap;
2675 	struct pcb * const pcb = lwp_getpcb(l);
2676 
2677 #ifdef DEBUG
2678 	if (pmapdebug & PDB_FOLLOW)
2679 		printf("pmap_activate(%p)\n", l);
2680 #endif
2681 
2682 	KASSERT(kpreempt_disabled());
2683 
2684 	struct cpu_info * const ci = curcpu();
2685 
2686 	KASSERT(l == ci->ci_curlwp);
2687 
2688 	u_long const old_ptbr = pcb->pcb_hw.apcb_ptbr;
2689 	u_int const old_asn = pcb->pcb_hw.apcb_asn;
2690 
2691 	/*
2692 	 * We hold the activation lock to synchronize with TLB shootdown.
2693 	 * The kernel pmap does not require those tests because shootdowns
2694 	 * for the kernel pmap are always sent to all CPUs.
2695 	 */
2696 	if (pmap != pmap_kernel()) {
2697 		PMAP_ACT_LOCK(pmap);
2698 		pcb->pcb_hw.apcb_asn = pmap_asn_alloc(pmap, ci);
2699 		atomic_or_ulong(&pmap->pm_cpus, (1UL << ci->ci_cpuid));
2700 	} else {
2701 		pcb->pcb_hw.apcb_asn = PMAP_ASN_KERNEL;
2702 	}
2703 	pcb->pcb_hw.apcb_ptbr =
2704 	    ALPHA_K0SEG_TO_PHYS((vaddr_t)pmap_lev1map(pmap)) >> PGSHIFT;
2705 
2706 	/*
2707 	 * Check to see if the ASN or page table base has changed; if
2708 	 * so, switch to our own context again so that it will take
2709 	 * effect.
2710 	 *
2711 	 * We test ASN first because it's the most likely value to change.
2712 	 */
2713 	if (old_asn != pcb->pcb_hw.apcb_asn ||
2714 	    old_ptbr != pcb->pcb_hw.apcb_ptbr) {
2715 		if (old_asn != pcb->pcb_hw.apcb_asn &&
2716 		    old_ptbr != pcb->pcb_hw.apcb_ptbr) {
2717 			TLB_COUNT(activate_both_change);
2718 		} else if (old_asn != pcb->pcb_hw.apcb_asn) {
2719 			TLB_COUNT(activate_asn_change);
2720 		} else {
2721 			TLB_COUNT(activate_ptbr_change);
2722 		}
2723 		(void) alpha_pal_swpctx((u_long)l->l_md.md_pcbpaddr);
2724 		TLB_COUNT(activate_swpctx);
2725 	} else {
2726 		TLB_COUNT(activate_skip_swpctx);
2727 	}
2728 
2729 	pmap_reference(pmap);
2730 	ci->ci_pmap = pmap;
2731 
2732 	if (pmap != pmap_kernel()) {
2733 		PMAP_ACT_UNLOCK(pmap);
2734 	}
2735 }
2736 
2737 /*
2738  * pmap_deactivate:		[ INTERFACE ]
2739  *
2740  *	Mark that the pmap used by the specified process is no longer
2741  *	in use by the processor.
2742  */
2743 void
pmap_deactivate(struct lwp * l)2744 pmap_deactivate(struct lwp *l)
2745 {
2746 	struct pmap * const pmap = l->l_proc->p_vmspace->vm_map.pmap;
2747 
2748 #ifdef DEBUG
2749 	if (pmapdebug & PDB_FOLLOW)
2750 		printf("pmap_deactivate(%p)\n", l);
2751 #endif
2752 
2753 	KASSERT(kpreempt_disabled());
2754 
2755 	struct cpu_info * const ci = curcpu();
2756 
2757 	KASSERT(l == ci->ci_curlwp);
2758 	KASSERT(pmap == ci->ci_pmap);
2759 
2760 	/*
2761 	 * There is no need to switch to a different PTBR here,
2762 	 * because a pmap_activate() or SWPCTX is guaranteed
2763 	 * before whatever lev1map we're on now is invalidated
2764 	 * or before user space is accessed again.
2765 	 *
2766 	 * Because only kernel mappings will be accessed before the
2767 	 * next pmap_activate() call, we consider our CPU to be on
2768 	 * the kernel pmap.
2769 	 */
2770 	ci->ci_pmap = pmap_kernel();
2771 	KASSERT(atomic_load_relaxed(&pmap->pm_count) > 1);
2772 	pmap_destroy(pmap);
2773 }
2774 
2775 /* pmap_zero_page() is in pmap_subr.s */
2776 
2777 /* pmap_copy_page() is in pmap_subr.s */
2778 
2779 /*
2780  * pmap_pageidlezero:		[ INTERFACE ]
2781  *
2782  *	Page zero'er for the idle loop.  Returns true if the
2783  *	page was zero'd, FALSE if we aborted for some reason.
2784  */
2785 bool
pmap_pageidlezero(paddr_t pa)2786 pmap_pageidlezero(paddr_t pa)
2787 {
2788 	u_long *ptr;
2789 	int i, cnt = PAGE_SIZE / sizeof(u_long);
2790 
2791 	for (i = 0, ptr = (u_long *) ALPHA_PHYS_TO_K0SEG(pa); i < cnt; i++) {
2792 		if (sched_curcpu_runnable_p()) {
2793 			/*
2794 			 * An LWP has become ready.  Abort now,
2795 			 * so we don't keep it waiting while we
2796 			 * finish zeroing the page.
2797 			 */
2798 			return (false);
2799 		}
2800 		*ptr++ = 0;
2801 	}
2802 
2803 	return (true);
2804 }
2805 
2806 /*
2807  * pmap_clear_modify:		[ INTERFACE ]
2808  *
2809  *	Clear the modify bits on the specified physical page.
2810  */
2811 bool
pmap_clear_modify(struct vm_page * pg)2812 pmap_clear_modify(struct vm_page *pg)
2813 {
2814 	struct vm_page_md * const md = VM_PAGE_TO_MD(pg);
2815 	bool rv = false;
2816 	kmutex_t *lock;
2817 	struct pmap_tlb_context tlbctx;
2818 
2819 #ifdef DEBUG
2820 	if (pmapdebug & PDB_FOLLOW)
2821 		printf("pmap_clear_modify(%p)\n", pg);
2822 #endif
2823 
2824 	pmap_tlb_context_init(&tlbctx, TLB_CTX_F_PV);
2825 
2826 	PMAP_HEAD_TO_MAP_LOCK();
2827 	lock = pmap_pvh_lock(pg);
2828 	mutex_enter(lock);
2829 
2830 	if (md->pvh_listx & PGA_MODIFIED) {
2831 		rv = true;
2832 		pmap_changebit(pg, PG_FOW, ~0UL, &tlbctx);
2833 		md->pvh_listx &= ~PGA_MODIFIED;
2834 	}
2835 
2836 	mutex_exit(lock);
2837 	PMAP_HEAD_TO_MAP_UNLOCK();
2838 
2839 	pmap_tlb_shootnow(&tlbctx);
2840 	TLB_COUNT(reason_clear_modify);
2841 
2842 	return (rv);
2843 }
2844 
2845 /*
2846  * pmap_clear_reference:	[ INTERFACE ]
2847  *
2848  *	Clear the reference bit on the specified physical page.
2849  */
2850 bool
pmap_clear_reference(struct vm_page * pg)2851 pmap_clear_reference(struct vm_page *pg)
2852 {
2853 	struct vm_page_md * const md = VM_PAGE_TO_MD(pg);
2854 	bool rv = false;
2855 	kmutex_t *lock;
2856 	struct pmap_tlb_context tlbctx;
2857 
2858 #ifdef DEBUG
2859 	if (pmapdebug & PDB_FOLLOW)
2860 		printf("pmap_clear_reference(%p)\n", pg);
2861 #endif
2862 
2863 	pmap_tlb_context_init(&tlbctx, TLB_CTX_F_PV);
2864 
2865 	PMAP_HEAD_TO_MAP_LOCK();
2866 	lock = pmap_pvh_lock(pg);
2867 	mutex_enter(lock);
2868 
2869 	if (md->pvh_listx & PGA_REFERENCED) {
2870 		rv = true;
2871 		pmap_changebit(pg, PG_FOR | PG_FOW | PG_FOE, ~0UL, &tlbctx);
2872 		md->pvh_listx &= ~PGA_REFERENCED;
2873 	}
2874 
2875 	mutex_exit(lock);
2876 	PMAP_HEAD_TO_MAP_UNLOCK();
2877 
2878 	pmap_tlb_shootnow(&tlbctx);
2879 	TLB_COUNT(reason_clear_reference);
2880 
2881 	return (rv);
2882 }
2883 
2884 /*
2885  * pmap_is_referenced:		[ INTERFACE ]
2886  *
2887  *	Return whether or not the specified physical page is referenced
2888  *	by any physical maps.
2889  */
2890 /* See <machine/pmap.h> */
2891 
2892 /*
2893  * pmap_is_modified:		[ INTERFACE ]
2894  *
2895  *	Return whether or not the specified physical page is modified
2896  *	by any physical maps.
2897  */
2898 /* See <machine/pmap.h> */
2899 
2900 /*
2901  * pmap_phys_address:		[ INTERFACE ]
2902  *
2903  *	Return the physical address corresponding to the specified
2904  *	cookie.  Used by the device pager to decode a device driver's
2905  *	mmap entry point return value.
2906  *
2907  *	Note: no locking is necessary in this function.
2908  */
2909 paddr_t
pmap_phys_address(paddr_t ppn)2910 pmap_phys_address(paddr_t ppn)
2911 {
2912 
2913 	return (alpha_ptob(ppn));
2914 }
2915 
2916 /*
2917  * Miscellaneous support routines follow
2918  */
2919 
2920 /*
2921  * alpha_protection_init:
2922  *
2923  *	Initialize Alpha protection code array.
2924  *
2925  *	Note: no locking is necessary in this function.
2926  */
2927 static void
alpha_protection_init(void)2928 alpha_protection_init(void)
2929 {
2930 	int prot, *kp, *up;
2931 
2932 	kp = protection_codes[0];
2933 	up = protection_codes[1];
2934 
2935 	for (prot = 0; prot < 8; prot++) {
2936 		kp[prot] = PG_ASM;
2937 		up[prot] = 0;
2938 
2939 		if (prot & VM_PROT_READ) {
2940 			kp[prot] |= PG_KRE;
2941 			up[prot] |= PG_KRE | PG_URE;
2942 		}
2943 		if (prot & VM_PROT_WRITE) {
2944 			kp[prot] |= PG_KWE;
2945 			up[prot] |= PG_KWE | PG_UWE;
2946 		}
2947 		if (prot & VM_PROT_EXECUTE) {
2948 			kp[prot] |= PG_EXEC | PG_KRE;
2949 			up[prot] |= PG_EXEC | PG_KRE | PG_URE;
2950 		} else {
2951 			kp[prot] |= PG_FOE;
2952 			up[prot] |= PG_FOE;
2953 		}
2954 	}
2955 }
2956 
2957 /*
2958  * pmap_remove_mapping:
2959  *
2960  *	Invalidate a single page denoted by pmap/va.
2961  *
2962  *	If (pte != NULL), it is the already computed PTE for the page.
2963  *
2964  *	Note: locking in this function is complicated by the fact
2965  *	that it can be called when the PV list is already locked.
2966  *	(pmap_page_protect()).  In this case, the caller must be
2967  *	careful to get the next PV entry while we remove this entry
2968  *	from beneath it.  We assume that the pmap itself is already
2969  *	locked; dolock applies only to the PV list.
2970  *
2971  *	Returns important PTE bits that the caller needs to check for
2972  *	TLB / I-stream invalidation purposes.
2973  */
2974 static pt_entry_t
pmap_remove_mapping(pmap_t pmap,vaddr_t va,pt_entry_t * pte,bool dolock,pv_entry_t * opvp,struct pmap_tlb_context * const tlbctx)2975 pmap_remove_mapping(pmap_t pmap, vaddr_t va, pt_entry_t *pte,
2976     bool dolock, pv_entry_t *opvp, struct pmap_tlb_context * const tlbctx)
2977 {
2978 	pt_entry_t opte;
2979 	paddr_t pa;
2980 	struct vm_page *pg;		/* if != NULL, page is managed */
2981 
2982 #ifdef DEBUG
2983 	if (pmapdebug & (PDB_FOLLOW|PDB_REMOVE|PDB_PROTECT))
2984 		printf("pmap_remove_mapping(%p, %lx, %p, %d, %p, %p)\n",
2985 		       pmap, va, pte, dolock, opvp, tlbctx);
2986 #endif
2987 
2988 	/*
2989 	 * PTE not provided, compute it from pmap and va.
2990 	 */
2991 	if (pte == NULL) {
2992 		pte = pmap_l3pte(pmap_lev1map(pmap), va, NULL);
2993 		if (pmap_pte_v(pte) == 0)
2994 			return 0;
2995 	}
2996 
2997 	opte = *pte;
2998 
2999 	pa = PG_PFNUM(opte) << PGSHIFT;
3000 
3001 	/*
3002 	 * Update statistics
3003 	 */
3004 	if (pmap_pte_w(pte))
3005 		PMAP_STAT_DECR(pmap->pm_stats.wired_count, 1);
3006 	PMAP_STAT_DECR(pmap->pm_stats.resident_count, 1);
3007 
3008 	/*
3009 	 * Invalidate the PTE after saving the reference modify info.
3010 	 */
3011 #ifdef DEBUG
3012 	if (pmapdebug & PDB_REMOVE)
3013 		printf("remove: invalidating pte at %p\n", pte);
3014 #endif
3015 	atomic_store_relaxed(pte, PG_NV);
3016 
3017 	/*
3018 	 * If we're removing a user mapping, check to see if we
3019 	 * can free page table pages.
3020 	 */
3021 	if (pmap != pmap_kernel()) {
3022 		/*
3023 		 * Delete the reference on the level 3 table.  It will
3024 		 * delete references on the level 2 and 1 tables as
3025 		 * appropriate.
3026 		 */
3027 		pmap_l3pt_delref(pmap, va, pte, tlbctx);
3028 	}
3029 
3030 	if (opte & PG_PVLIST) {
3031 		/*
3032 		 * Remove it from the PV table.
3033 		 */
3034 		pg = PHYS_TO_VM_PAGE(pa);
3035 		KASSERT(pg != NULL);
3036 		pmap_pv_remove(pmap, pg, va, dolock, opvp, tlbctx);
3037 		KASSERT(opvp == NULL || *opvp != NULL);
3038 	}
3039 
3040 	return opte & (PG_V | PG_ASM | PG_EXEC);
3041 }
3042 
3043 /*
3044  * pmap_changebit:
3045  *
3046  *	Set or clear the specified PTE bits for all mappings on the
3047  *	specified page.
3048  *
3049  *	Note: we assume that the pv_head is already locked, and that
3050  *	the caller has acquired a PV->pmap mutex so that we can lock
3051  *	the pmaps as we encounter them.
3052  */
3053 static void
pmap_changebit(struct vm_page * pg,pt_entry_t set,pt_entry_t mask,struct pmap_tlb_context * const tlbctx)3054 pmap_changebit(struct vm_page *pg, pt_entry_t set, pt_entry_t mask,
3055     struct pmap_tlb_context * const tlbctx)
3056 {
3057 	pv_entry_t pv;
3058 	pt_entry_t *pte, npte, opte;
3059 
3060 #ifdef DEBUG
3061 	if (pmapdebug & PDB_BITS)
3062 		printf("pmap_changebit(%p, 0x%lx, 0x%lx)\n",
3063 		    pg, set, mask);
3064 #endif
3065 
3066 	/*
3067 	 * Loop over all current mappings setting/clearing as apropos.
3068 	 */
3069 	for (pv = VM_MDPAGE_PVS(pg); pv != NULL; pv = pv->pv_next) {
3070 		PMAP_LOCK(pv->pv_pmap);
3071 
3072 		pte = pv->pv_pte;
3073 
3074 		opte = atomic_load_relaxed(pte);
3075 		npte = (opte | set) & mask;
3076 		if (npte != opte) {
3077 			atomic_store_relaxed(pte, npte);
3078 			pmap_tlb_shootdown_pv(pv->pv_pmap, pv->pv_va,
3079 			    opte, tlbctx);
3080 		}
3081 		PMAP_UNLOCK(pv->pv_pmap);
3082 	}
3083 }
3084 
3085 /*
3086  * pmap_emulate_reference:
3087  *
3088  *	Emulate reference and/or modified bit hits.
3089  *	Return 1 if this was an execute fault on a non-exec mapping,
3090  *	otherwise return 0.
3091  */
3092 int
pmap_emulate_reference(struct lwp * l,vaddr_t v,int user,int type)3093 pmap_emulate_reference(struct lwp *l, vaddr_t v, int user, int type)
3094 {
3095 	struct pmap *pmap = l->l_proc->p_vmspace->vm_map.pmap;
3096 	pt_entry_t faultoff, *pte;
3097 	struct vm_page *pg;
3098 	paddr_t pa;
3099 	bool didlock = false;
3100 	bool exec = false;
3101 	kmutex_t *lock;
3102 
3103 #ifdef DEBUG
3104 	if (pmapdebug & PDB_FOLLOW)
3105 		printf("pmap_emulate_reference: %p, 0x%lx, %d, %d\n",
3106 		    l, v, user, type);
3107 #endif
3108 
3109 	/*
3110 	 * Convert process and virtual address to physical address.
3111 	 */
3112 	if (v >= VM_MIN_KERNEL_ADDRESS) {
3113 		if (user)
3114 			panic("pmap_emulate_reference: user ref to kernel");
3115 		/*
3116 		 * No need to lock here; kernel PT pages never go away.
3117 		 */
3118 		pte = PMAP_KERNEL_PTE(v);
3119 	} else {
3120 #ifdef DIAGNOSTIC
3121 		if (l == NULL)
3122 			panic("pmap_emulate_reference: bad proc");
3123 		if (l->l_proc->p_vmspace == NULL)
3124 			panic("pmap_emulate_reference: bad p_vmspace");
3125 #endif
3126 		PMAP_LOCK(pmap);
3127 		didlock = true;
3128 		pte = pmap_l3pte(pmap_lev1map(pmap), v, NULL);
3129 		/*
3130 		 * We'll unlock below where we're done with the PTE.
3131 		 */
3132 	}
3133 	exec = pmap_pte_exec(pte);
3134 	if (!exec && type == ALPHA_MMCSR_FOE) {
3135 		if (didlock)
3136 			PMAP_UNLOCK(pmap);
3137 	       return (1);
3138 	}
3139 #ifdef DEBUG
3140 	if (pmapdebug & PDB_FOLLOW) {
3141 		printf("\tpte = %p, ", pte);
3142 		printf("*pte = 0x%lx\n", *pte);
3143 	}
3144 #endif
3145 
3146 	pa = pmap_pte_pa(pte);
3147 
3148 	/*
3149 	 * We're now done with the PTE.  If it was a user pmap, unlock
3150 	 * it now.
3151 	 */
3152 	if (didlock)
3153 		PMAP_UNLOCK(pmap);
3154 
3155 #ifdef DEBUG
3156 	if (pmapdebug & PDB_FOLLOW)
3157 		printf("\tpa = 0x%lx\n", pa);
3158 #endif
3159 #ifdef DIAGNOSTIC
3160 	if (!uvm_pageismanaged(pa))
3161 		panic("pmap_emulate_reference(%p, 0x%lx, %d, %d): "
3162 		      "pa 0x%lx not managed", l, v, user, type, pa);
3163 #endif
3164 
3165 	/*
3166 	 * Twiddle the appropriate bits to reflect the reference
3167 	 * and/or modification..
3168 	 *
3169 	 * The rules:
3170 	 * 	(1) always mark page as used, and
3171 	 *	(2) if it was a write fault, mark page as modified.
3172 	 */
3173 	pg = PHYS_TO_VM_PAGE(pa);
3174 	struct vm_page_md * const md = VM_PAGE_TO_MD(pg);
3175 	struct pmap_tlb_context tlbctx;
3176 
3177 	pmap_tlb_context_init(&tlbctx, TLB_CTX_F_PV);
3178 
3179 	PMAP_HEAD_TO_MAP_LOCK();
3180 	lock = pmap_pvh_lock(pg);
3181 	mutex_enter(lock);
3182 
3183 	if (type == ALPHA_MMCSR_FOW) {
3184 		md->pvh_listx |= (PGA_REFERENCED|PGA_MODIFIED);
3185 		faultoff = PG_FOR | PG_FOW;
3186 	} else {
3187 		md->pvh_listx |= PGA_REFERENCED;
3188 		faultoff = PG_FOR;
3189 		if (exec) {
3190 			faultoff |= PG_FOE;
3191 		}
3192 	}
3193 	pmap_changebit(pg, 0, ~faultoff, &tlbctx);
3194 
3195 	mutex_exit(lock);
3196 	PMAP_HEAD_TO_MAP_UNLOCK();
3197 
3198 	pmap_tlb_shootnow(&tlbctx);
3199 	TLB_COUNT(reason_emulate_reference);
3200 
3201 	return (0);
3202 }
3203 
3204 #ifdef DEBUG
3205 /*
3206  * pmap_pv_dump:
3207  *
3208  *	Dump the physical->virtual data for the specified page.
3209  */
3210 void
pmap_pv_dump(paddr_t pa)3211 pmap_pv_dump(paddr_t pa)
3212 {
3213 	struct vm_page *pg;
3214 	struct vm_page_md *md;
3215 	pv_entry_t pv;
3216 	kmutex_t *lock;
3217 
3218 	pg = PHYS_TO_VM_PAGE(pa);
3219 	md = VM_PAGE_TO_MD(pg);
3220 
3221 	lock = pmap_pvh_lock(pg);
3222 	mutex_enter(lock);
3223 
3224 	printf("pa 0x%lx (attrs = 0x%lx):\n", pa, md->pvh_listx & PGA_ATTRS);
3225 	for (pv = VM_MDPAGE_PVS(pg); pv != NULL; pv = pv->pv_next)
3226 		printf("     pmap %p, va 0x%lx\n",
3227 		    pv->pv_pmap, pv->pv_va);
3228 	printf("\n");
3229 
3230 	mutex_exit(lock);
3231 }
3232 #endif
3233 
3234 /*
3235  * vtophys:
3236  *
3237  *	Return the physical address corresponding to the K0SEG or
3238  *	K1SEG address provided.
3239  *
3240  *	Note: no locking is necessary in this function.
3241  */
3242 static bool
vtophys_internal(vaddr_t const vaddr,paddr_t * const pap)3243 vtophys_internal(vaddr_t const vaddr, paddr_t * const pap)
3244 {
3245 	paddr_t pa;
3246 
3247 	KASSERT(vaddr >= ALPHA_K0SEG_BASE);
3248 
3249 	if (vaddr <= ALPHA_K0SEG_END) {
3250 		pa = ALPHA_K0SEG_TO_PHYS(vaddr);
3251 	} else {
3252 		pt_entry_t * const pte = PMAP_KERNEL_PTE(vaddr);
3253 		if (__predict_false(! pmap_pte_v(pte))) {
3254 			return false;
3255 		}
3256 		pa = pmap_pte_pa(pte) | (vaddr & PGOFSET);
3257 	}
3258 
3259 	if (pap != NULL) {
3260 		*pap = pa;
3261 	}
3262 
3263 	return true;
3264 }
3265 
3266 paddr_t
vtophys(vaddr_t const vaddr)3267 vtophys(vaddr_t const vaddr)
3268 {
3269 	paddr_t pa;
3270 
3271 	if (__predict_false(! vtophys_internal(vaddr, &pa)))
3272 		pa = 0;
3273 	return pa;
3274 }
3275 
3276 /******************** pv_entry management ********************/
3277 
3278 /*
3279  * pmap_pv_enter:
3280  *
3281  *	Add a physical->virtual entry to the pv_table.
3282  */
3283 static int
pmap_pv_enter(pmap_t pmap,struct vm_page * pg,vaddr_t va,pt_entry_t * pte,bool dolock,pv_entry_t newpv)3284 pmap_pv_enter(pmap_t pmap, struct vm_page *pg, vaddr_t va, pt_entry_t *pte,
3285     bool dolock, pv_entry_t newpv)
3286 {
3287 	struct vm_page_md * const md = VM_PAGE_TO_MD(pg);
3288 	kmutex_t *lock;
3289 
3290 	/*
3291 	 * Allocate and fill in the new pv_entry.
3292 	 */
3293 	if (newpv == NULL) {
3294 		newpv = pmap_pv_alloc();
3295 		if (newpv == NULL)
3296 			return ENOMEM;
3297 	}
3298 	newpv->pv_va = va;
3299 	newpv->pv_pmap = pmap;
3300 	newpv->pv_pte = pte;
3301 
3302 	if (dolock) {
3303 		lock = pmap_pvh_lock(pg);
3304 		mutex_enter(lock);
3305 	}
3306 
3307 #ifdef DEBUG
3308     {
3309 	pv_entry_t pv;
3310 	/*
3311 	 * Make sure the entry doesn't already exist.
3312 	 */
3313 	for (pv = VM_MDPAGE_PVS(pg); pv != NULL; pv = pv->pv_next) {
3314 		if (pmap == pv->pv_pmap && va == pv->pv_va) {
3315 			printf("pmap = %p, va = 0x%lx\n", pmap, va);
3316 			panic("pmap_pv_enter: already in pv table");
3317 		}
3318 	}
3319     }
3320 #endif
3321 
3322 	/*
3323 	 * ...and put it in the list.
3324 	 */
3325 	uintptr_t const attrs = md->pvh_listx & PGA_ATTRS;
3326 	newpv->pv_next = (struct pv_entry *)(md->pvh_listx & ~PGA_ATTRS);
3327 	md->pvh_listx = (uintptr_t)newpv | attrs;
3328 	LIST_INSERT_HEAD(&pmap->pm_pvents, newpv, pv_link);
3329 
3330 	if (dolock) {
3331 		mutex_exit(lock);
3332 	}
3333 
3334 	return 0;
3335 }
3336 
3337 /*
3338  * pmap_pv_remove:
3339  *
3340  *	Remove a physical->virtual entry from the pv_table.
3341  */
3342 static void
pmap_pv_remove(pmap_t pmap,struct vm_page * pg,vaddr_t va,bool dolock,pv_entry_t * opvp,struct pmap_tlb_context * const tlbctx)3343 pmap_pv_remove(pmap_t pmap, struct vm_page *pg, vaddr_t va, bool dolock,
3344     pv_entry_t *opvp, struct pmap_tlb_context * const tlbctx)
3345 {
3346 	struct vm_page_md * const md = VM_PAGE_TO_MD(pg);
3347 	pv_entry_t pv, *pvp;
3348 	kmutex_t *lock;
3349 
3350 	if (dolock) {
3351 		lock = pmap_pvh_lock(pg);
3352 		mutex_enter(lock);
3353 	} else {
3354 		lock = NULL; /* XXX stupid gcc */
3355 	}
3356 
3357 	/*
3358 	 * Find the entry to remove.
3359 	 */
3360 	for (pvp = (struct pv_entry **)&md->pvh_listx, pv = VM_MDPAGE_PVS(pg);
3361 	     pv != NULL; pvp = &pv->pv_next, pv = *pvp)
3362 		if (pmap == pv->pv_pmap && va == pv->pv_va)
3363 			break;
3364 
3365 	KASSERT(pv != NULL);
3366 
3367 	/*
3368 	 * The page attributes are in the lower 2 bits of the first
3369 	 * PV entry pointer.  Rather than comparing the pointer address
3370 	 * and branching, we just always preserve what might be there
3371 	 * (either attribute bits or zero bits).
3372 	 */
3373 	*pvp = (pv_entry_t)((uintptr_t)pv->pv_next |
3374 			    (((uintptr_t)*pvp) & PGA_ATTRS));
3375 	LIST_REMOVE(pv, pv_link);
3376 
3377 	if (dolock) {
3378 		mutex_exit(lock);
3379 	}
3380 
3381 	if (opvp != NULL) {
3382 		*opvp = pv;
3383 	} else {
3384 		KASSERT(tlbctx != NULL);
3385 		LIST_INSERT_HEAD(&tlbctx->t_freepvq, pv, pv_link);
3386 	}
3387 }
3388 
3389 /*
3390  * pmap_pv_page_alloc:
3391  *
3392  *	Allocate a page for the pv_entry pool.
3393  */
3394 static void *
pmap_pv_page_alloc(struct pool * pp,int flags)3395 pmap_pv_page_alloc(struct pool *pp, int flags)
3396 {
3397 	struct vm_page * const pg = pmap_physpage_alloc(PGU_PVENT);
3398 	if (__predict_false(pg == NULL)) {
3399 		return NULL;
3400 	}
3401 	return (void *)ALPHA_PHYS_TO_K0SEG(VM_PAGE_TO_PHYS(pg));
3402 }
3403 
3404 /*
3405  * pmap_pv_page_free:
3406  *
3407  *	Free a pv_entry pool page.
3408  */
3409 static void
pmap_pv_page_free(struct pool * pp,void * v)3410 pmap_pv_page_free(struct pool *pp, void *v)
3411 {
3412 
3413 	pmap_physpage_free(ALPHA_K0SEG_TO_PHYS((vaddr_t)v));
3414 }
3415 
3416 /******************** misc. functions ********************/
3417 
3418 /*
3419  * pmap_physpage_alloc:
3420  *
3421  *	Allocate a single page from the VM system and return the
3422  *	physical address for that page.
3423  */
3424 static struct vm_page *
pmap_physpage_alloc(int usage)3425 pmap_physpage_alloc(int usage)
3426 {
3427 	struct vm_page *pg;
3428 
3429 	/*
3430 	 * Don't ask for a zero'd page in the L1PT case -- we will
3431 	 * properly initialize it in the constructor.
3432 	 */
3433 
3434 	pg = uvm_pagealloc(NULL, 0, NULL, usage == PGU_L1PT ?
3435 	    UVM_PGA_USERESERVE : UVM_PGA_USERESERVE|UVM_PGA_ZERO);
3436 	if (pg != NULL) {
3437 		KASSERT(PHYSPAGE_REFCNT(pg) == 0);
3438 	}
3439 	return pg;
3440 }
3441 
3442 /*
3443  * pmap_physpage_free:
3444  *
3445  *	Free the single page table page at the specified physical address.
3446  */
3447 static void
pmap_physpage_free(paddr_t pa)3448 pmap_physpage_free(paddr_t pa)
3449 {
3450 	struct vm_page *pg;
3451 
3452 	if ((pg = PHYS_TO_VM_PAGE(pa)) == NULL)
3453 		panic("pmap_physpage_free: bogus physical page address");
3454 
3455 	KASSERT(PHYSPAGE_REFCNT(pg) == 0);
3456 
3457 	uvm_pagefree(pg);
3458 }
3459 
3460 /*
3461  * pmap_physpage_addref:
3462  *
3463  *	Add a reference to the specified special use page.
3464  */
3465 static int
pmap_physpage_addref(void * kva)3466 pmap_physpage_addref(void *kva)
3467 {
3468 	struct vm_page *pg;
3469 	paddr_t pa;
3470 
3471 	pa = ALPHA_K0SEG_TO_PHYS(trunc_page((vaddr_t)kva));
3472 	pg = PHYS_TO_VM_PAGE(pa);
3473 
3474 	KASSERT(PHYSPAGE_REFCNT(pg) < UINT32_MAX);
3475 
3476 	return PHYSPAGE_REFCNT_INC(pg);
3477 }
3478 
3479 /*
3480  * pmap_physpage_delref:
3481  *
3482  *	Delete a reference to the specified special use page.
3483  */
3484 static int
pmap_physpage_delref(void * kva)3485 pmap_physpage_delref(void *kva)
3486 {
3487 	struct vm_page *pg;
3488 	paddr_t pa;
3489 
3490 	pa = ALPHA_K0SEG_TO_PHYS(trunc_page((vaddr_t)kva));
3491 	pg = PHYS_TO_VM_PAGE(pa);
3492 
3493 	KASSERT(PHYSPAGE_REFCNT(pg) != 0);
3494 
3495 	return PHYSPAGE_REFCNT_DEC(pg);
3496 }
3497 
3498 /******************** page table page management ********************/
3499 
3500 static bool
pmap_kptpage_alloc(paddr_t * pap)3501 pmap_kptpage_alloc(paddr_t *pap)
3502 {
3503 	if (uvm.page_init_done == false) {
3504 		/*
3505 		 * We're growing the kernel pmap early (from
3506 		 * uvm_pageboot_alloc()).  This case must
3507 		 * be handled a little differently.
3508 		 */
3509 		*pap = ALPHA_K0SEG_TO_PHYS(
3510 		    pmap_steal_memory(PAGE_SIZE, NULL, NULL));
3511 		return true;
3512 	}
3513 
3514 	struct vm_page * const pg = pmap_physpage_alloc(PGU_NORMAL);
3515 	if (__predict_true(pg != NULL)) {
3516 		*pap = VM_PAGE_TO_PHYS(pg);
3517 		return true;
3518 	}
3519 	return false;
3520 }
3521 
3522 /*
3523  * pmap_growkernel:		[ INTERFACE ]
3524  *
3525  *	Grow the kernel address space.  This is a hint from the
3526  *	upper layer to pre-allocate more kernel PT pages.
3527  */
3528 vaddr_t
pmap_growkernel(vaddr_t maxkvaddr)3529 pmap_growkernel(vaddr_t maxkvaddr)
3530 {
3531 	struct pmap *pm;
3532 	paddr_t ptaddr;
3533 	pt_entry_t *l1pte, *l2pte, pte;
3534 	pt_entry_t *lev1map;
3535 	vaddr_t va;
3536 	int l1idx;
3537 
3538 	rw_enter(&pmap_growkernel_lock, RW_WRITER);
3539 
3540 	if (maxkvaddr <= virtual_end)
3541 		goto out;		/* we are OK */
3542 
3543 	pmap_growkernel_evcnt.ev_count++;
3544 
3545 	va = virtual_end;
3546 
3547 	while (va < maxkvaddr) {
3548 		/*
3549 		 * If there is no valid L1 PTE (i.e. no L2 PT page),
3550 		 * allocate a new L2 PT page and insert it into the
3551 		 * L1 map.
3552 		 */
3553 		l1pte = pmap_l1pte(kernel_lev1map, va);
3554 		if (pmap_pte_v(l1pte) == 0) {
3555 			if (!pmap_kptpage_alloc(&ptaddr))
3556 				goto die;
3557 			pte = (atop(ptaddr) << PG_SHIFT) |
3558 			    PG_V | PG_ASM | PG_KRE | PG_KWE | PG_WIRED;
3559 			*l1pte = pte;
3560 
3561 			l1idx = l1pte_index(va);
3562 
3563 			/* Update all the user pmaps. */
3564 			mutex_enter(&pmap_all_pmaps_lock);
3565 			for (pm = TAILQ_FIRST(&pmap_all_pmaps);
3566 			     pm != NULL; pm = TAILQ_NEXT(pm, pm_list)) {
3567 				/* Skip the kernel pmap. */
3568 				if (pm == pmap_kernel())
3569 					continue;
3570 
3571 				/*
3572 				 * Any pmaps published on the global list
3573 				 * should never be referencing kernel_lev1map.
3574 				 */
3575 				lev1map = pmap_lev1map(pm);
3576 				KASSERT(lev1map != kernel_lev1map);
3577 
3578 				PMAP_LOCK(pm);
3579 				lev1map[l1idx] = pte;
3580 				PMAP_UNLOCK(pm);
3581 			}
3582 			mutex_exit(&pmap_all_pmaps_lock);
3583 		}
3584 
3585 		/*
3586 		 * Have an L2 PT page now, add the L3 PT page.
3587 		 */
3588 		l2pte = pmap_l2pte(kernel_lev1map, va, l1pte);
3589 		KASSERT(pmap_pte_v(l2pte) == 0);
3590 		if (!pmap_kptpage_alloc(&ptaddr))
3591 			goto die;
3592 		*l2pte = (atop(ptaddr) << PG_SHIFT) |
3593 		    PG_V | PG_ASM | PG_KRE | PG_KWE | PG_WIRED;
3594 		va += ALPHA_L2SEG_SIZE;
3595 	}
3596 
3597 	/* Invalidate the L1 PT cache. */
3598 	pool_cache_invalidate(&pmap_l1pt_cache);
3599 
3600 	virtual_end = va;
3601 
3602  out:
3603 	rw_exit(&pmap_growkernel_lock);
3604 
3605 	return (virtual_end);
3606 
3607  die:
3608 	panic("pmap_growkernel: out of memory");
3609 }
3610 
3611 /*
3612  * pmap_l1pt_ctor:
3613  *
3614  *	Pool cache constructor for L1 PT pages.
3615  *
3616  *	Note: The growkernel lock is held across allocations
3617  *	from our pool_cache, so we don't need to acquire it
3618  *	ourselves.
3619  */
3620 static int
pmap_l1pt_ctor(void * arg,void * object,int flags)3621 pmap_l1pt_ctor(void *arg, void *object, int flags)
3622 {
3623 	pt_entry_t *l1pt = object, pte;
3624 	int i;
3625 
3626 	/*
3627 	 * Initialize the new level 1 table by zeroing the
3628 	 * user portion and copying the kernel mappings into
3629 	 * the kernel portion.
3630 	 */
3631 	for (i = 0; i < l1pte_index(VM_MIN_KERNEL_ADDRESS); i++)
3632 		l1pt[i] = 0;
3633 
3634 	for (i = l1pte_index(VM_MIN_KERNEL_ADDRESS);
3635 	     i <= l1pte_index(VM_MAX_KERNEL_ADDRESS); i++)
3636 		l1pt[i] = kernel_lev1map[i];
3637 
3638 	/*
3639 	 * Now, map the new virtual page table.  NOTE: NO ASM!
3640 	 */
3641 	pte = ((ALPHA_K0SEG_TO_PHYS((vaddr_t) l1pt) >> PGSHIFT) << PG_SHIFT) |
3642 	    PG_V | PG_KRE | PG_KWE;
3643 	l1pt[l1pte_index(VPTBASE)] = pte;
3644 
3645 	return (0);
3646 }
3647 
3648 /*
3649  * pmap_l1pt_alloc:
3650  *
3651  *	Page allocator for L1 PT pages.
3652  */
3653 static void *
pmap_l1pt_alloc(struct pool * pp,int flags)3654 pmap_l1pt_alloc(struct pool *pp, int flags)
3655 {
3656 	/*
3657 	 * Attempt to allocate a free page.
3658 	 */
3659 	struct vm_page * const pg = pmap_physpage_alloc(PGU_L1PT);
3660 	if (__predict_false(pg == NULL)) {
3661 		return NULL;
3662 	}
3663 	return (void *)ALPHA_PHYS_TO_K0SEG(VM_PAGE_TO_PHYS(pg));
3664 }
3665 
3666 /*
3667  * pmap_l1pt_free:
3668  *
3669  *	Page freer for L1 PT pages.
3670  */
3671 static void
pmap_l1pt_free(struct pool * pp,void * v)3672 pmap_l1pt_free(struct pool *pp, void *v)
3673 {
3674 
3675 	pmap_physpage_free(ALPHA_K0SEG_TO_PHYS((vaddr_t) v));
3676 }
3677 
3678 /*
3679  * pmap_ptpage_alloc:
3680  *
3681  *	Allocate a level 2 or level 3 page table page for a user
3682  *	pmap, and initialize the PTE that references it.
3683  *
3684  *	Note: the pmap must already be locked.
3685  */
3686 static int
pmap_ptpage_alloc(pmap_t pmap,pt_entry_t * const pte,int const usage)3687 pmap_ptpage_alloc(pmap_t pmap, pt_entry_t * const pte, int const usage)
3688 {
3689 	/*
3690 	 * Allocate the page table page.
3691 	 */
3692 	struct vm_page * const pg = pmap_physpage_alloc(usage);
3693 	if (__predict_false(pg == NULL)) {
3694 		return ENOMEM;
3695 	}
3696 
3697 	LIST_INSERT_HEAD(&pmap->pm_ptpages, pg, pageq.list);
3698 
3699 	/*
3700 	 * Initialize the referencing PTE.
3701 	 */
3702 	const pt_entry_t npte = ((VM_PAGE_TO_PHYS(pg) >> PGSHIFT) << PG_SHIFT) |
3703 	    PG_V | PG_KRE | PG_KWE | PG_WIRED;
3704 
3705 	atomic_store_relaxed(pte, npte);
3706 
3707 	return (0);
3708 }
3709 
3710 /*
3711  * pmap_ptpage_free:
3712  *
3713  *	Free the level 2 or level 3 page table page referenced
3714  *	be the provided PTE.
3715  *
3716  *	Note: the pmap must already be locked.
3717  */
3718 static void
pmap_ptpage_free(pmap_t pmap,pt_entry_t * const pte,struct pmap_tlb_context * const tlbctx)3719 pmap_ptpage_free(pmap_t pmap, pt_entry_t * const pte,
3720     struct pmap_tlb_context * const tlbctx)
3721 {
3722 
3723 	/*
3724 	 * Extract the physical address of the page from the PTE
3725 	 * and clear the entry.
3726 	 */
3727 	const paddr_t ptpa = pmap_pte_pa(pte);
3728 	atomic_store_relaxed(pte, PG_NV);
3729 
3730 	struct vm_page * const pg = PHYS_TO_VM_PAGE(ptpa);
3731 	KASSERT(pg != NULL);
3732 
3733 	KASSERT(PHYSPAGE_REFCNT(pg) == 0);
3734 #ifdef DEBUG
3735 	pmap_zero_page(ptpa);
3736 #endif
3737 
3738 	LIST_REMOVE(pg, pageq.list);
3739 	LIST_INSERT_HEAD(&tlbctx->t_freeptq, pg, pageq.list);
3740 }
3741 
3742 /*
3743  * pmap_l3pt_delref:
3744  *
3745  *	Delete a reference on a level 3 PT page.  If the reference drops
3746  *	to zero, free it.
3747  *
3748  *	Note: the pmap must already be locked.
3749  */
3750 static void
pmap_l3pt_delref(pmap_t pmap,vaddr_t va,pt_entry_t * l3pte,struct pmap_tlb_context * const tlbctx)3751 pmap_l3pt_delref(pmap_t pmap, vaddr_t va, pt_entry_t *l3pte,
3752     struct pmap_tlb_context * const tlbctx)
3753 {
3754 	pt_entry_t *l1pte, *l2pte;
3755 	pt_entry_t * const lev1map = pmap_lev1map(pmap);
3756 
3757 	l1pte = pmap_l1pte(lev1map, va);
3758 	l2pte = pmap_l2pte(lev1map, va, l1pte);
3759 
3760 #ifdef DIAGNOSTIC
3761 	if (pmap == pmap_kernel())
3762 		panic("pmap_l3pt_delref: kernel pmap");
3763 #endif
3764 
3765 	if (pmap_physpage_delref(l3pte) == 0) {
3766 		/*
3767 		 * No more mappings; we can free the level 3 table.
3768 		 */
3769 #ifdef DEBUG
3770 		if (pmapdebug & PDB_PTPAGE)
3771 			printf("pmap_l3pt_delref: freeing level 3 table at "
3772 			    "0x%lx\n", pmap_pte_pa(l2pte));
3773 #endif
3774 		/*
3775 		 * You can pass NULL if you know the last reference won't
3776 		 * be dropped.
3777 		 */
3778 		KASSERT(tlbctx != NULL);
3779 		pmap_ptpage_free(pmap, l2pte, tlbctx);
3780 
3781 		/*
3782 		 * We've freed a level 3 table, so we must invalidate
3783 		 * any now-stale TLB entries for the corresponding VPT
3784 		 * VA range.  Easiest way to guarantee this is to hit
3785 		 * all of the user TLB entries.
3786 		 */
3787 		pmap_tlb_shootdown_all_user(pmap, PG_V, tlbctx);
3788 
3789 		/*
3790 		 * We've freed a level 3 table, so delete the reference
3791 		 * on the level 2 table.
3792 		 */
3793 		pmap_l2pt_delref(pmap, l1pte, l2pte, tlbctx);
3794 	}
3795 }
3796 
3797 /*
3798  * pmap_l2pt_delref:
3799  *
3800  *	Delete a reference on a level 2 PT page.  If the reference drops
3801  *	to zero, free it.
3802  *
3803  *	Note: the pmap must already be locked.
3804  */
3805 static void
pmap_l2pt_delref(pmap_t pmap,pt_entry_t * l1pte,pt_entry_t * l2pte,struct pmap_tlb_context * const tlbctx)3806 pmap_l2pt_delref(pmap_t pmap, pt_entry_t *l1pte, pt_entry_t *l2pte,
3807     struct pmap_tlb_context * const tlbctx)
3808 {
3809 
3810 #ifdef DIAGNOSTIC
3811 	if (pmap == pmap_kernel())
3812 		panic("pmap_l2pt_delref: kernel pmap");
3813 #endif
3814 
3815 	if (pmap_physpage_delref(l2pte) == 0) {
3816 		/*
3817 		 * No more mappings in this segment; we can free the
3818 		 * level 2 table.
3819 		 */
3820 #ifdef DEBUG
3821 		if (pmapdebug & PDB_PTPAGE)
3822 			printf("pmap_l2pt_delref: freeing level 2 table at "
3823 			    "0x%lx\n", pmap_pte_pa(l1pte));
3824 #endif
3825 		/*
3826 		 * You can pass NULL if you know the last reference won't
3827 		 * be dropped.
3828 		 */
3829 		KASSERT(tlbctx != NULL);
3830 		pmap_ptpage_free(pmap, l1pte, tlbctx);
3831 
3832 		/*
3833 		 * We've freed a level 2 table, so we must invalidate
3834 		 * any now-stale TLB entries for the corresponding VPT
3835 		 * VA range.  Easiest way to guarantee this is to hit
3836 		 * all of the user TLB entries.
3837 		 */
3838 		pmap_tlb_shootdown_all_user(pmap, PG_V, tlbctx);
3839 
3840 		/*
3841 		 * We've freed a level 2 table, so delete the reference
3842 		 * on the level 1 table.
3843 		 */
3844 		pmap_l1pt_delref(pmap, l1pte);
3845 	}
3846 }
3847 
3848 /*
3849  * pmap_l1pt_delref:
3850  *
3851  *	Delete a reference on a level 1 PT page.
3852  */
3853 static void
pmap_l1pt_delref(pmap_t pmap,pt_entry_t * l1pte)3854 pmap_l1pt_delref(pmap_t pmap, pt_entry_t *l1pte)
3855 {
3856 
3857 	KASSERT(pmap != pmap_kernel());
3858 
3859 	(void)pmap_physpage_delref(l1pte);
3860 }
3861 
3862 /******************** Address Space Number management ********************/
3863 
3864 /*
3865  * pmap_asn_alloc:
3866  *
3867  *	Allocate and assign an ASN to the specified pmap.
3868  *
3869  *	Note: the pmap must already be locked.  This may be called from
3870  *	an interprocessor interrupt, and in that case, the sender of
3871  *	the IPI has the pmap lock.
3872  */
3873 static u_int
pmap_asn_alloc(pmap_t const pmap,struct cpu_info * const ci)3874 pmap_asn_alloc(pmap_t const pmap, struct cpu_info * const ci)
3875 {
3876 
3877 #ifdef DEBUG
3878 	if (pmapdebug & (PDB_FOLLOW|PDB_ASN))
3879 		printf("pmap_asn_alloc(%p)\n", pmap);
3880 #endif
3881 
3882 	KASSERT(pmap != pmap_kernel());
3883 	KASSERT(pmap->pm_percpu[ci->ci_cpuid].pmc_lev1map != kernel_lev1map);
3884 	KASSERT(kpreempt_disabled());
3885 
3886 	/* No work to do if the CPU does not implement ASNs. */
3887 	if (pmap_max_asn == 0)
3888 		return 0;
3889 
3890 	struct pmap_percpu * const pmc = &pmap->pm_percpu[ci->ci_cpuid];
3891 
3892 	/*
3893 	 * Hopefully, we can continue using the one we have...
3894 	 *
3895 	 * N.B. the generation check will fail the first time
3896 	 * any pmap is activated on a given CPU, because we start
3897 	 * the generation counter at 1, but initialize pmaps with
3898 	 * 0; this forces the first ASN allocation to occur.
3899 	 */
3900 	if (pmc->pmc_asngen == ci->ci_asn_gen) {
3901 #ifdef DEBUG
3902 		if (pmapdebug & PDB_ASN)
3903 			printf("pmap_asn_alloc: same generation, keeping %u\n",
3904 			    pmc->pmc_asn);
3905 #endif
3906 		TLB_COUNT(asn_reuse);
3907 		return pmc->pmc_asn;
3908 	}
3909 
3910 	/*
3911 	 * Need to assign a new ASN.  Grab the next one, incrementing
3912 	 * the generation number if we have to.
3913 	 */
3914 	if (ci->ci_next_asn > pmap_max_asn) {
3915 		/*
3916 		 * Invalidate all non-PG_ASM TLB entries and the
3917 		 * I-cache, and bump the generation number.
3918 		 */
3919 		ALPHA_TBIAP();
3920 		alpha_pal_imb();
3921 
3922 		ci->ci_next_asn = PMAP_ASN_FIRST_USER;
3923 		ci->ci_asn_gen++;
3924 		TLB_COUNT(asn_newgen);
3925 
3926 		/*
3927 		 * Make sure the generation number doesn't wrap.  We could
3928 		 * handle this scenario by traversing all of the pmaps,
3929 		 * and invalidating the generation number on those which
3930 		 * are not currently in use by this processor.
3931 		 *
3932 		 * However... considering that we're using an unsigned 64-bit
3933 		 * integer for generation numbers, on non-ASN CPUs, we won't
3934 		 * wrap for approximately 75 billion years on a 128-ASN CPU
3935 		 * (assuming 1000 switch * operations per second).
3936 		 *
3937 		 * So, we don't bother.
3938 		 */
3939 		KASSERT(ci->ci_asn_gen != PMAP_ASNGEN_INVALID);
3940 #ifdef DEBUG
3941 		if (pmapdebug & PDB_ASN)
3942 			printf("pmap_asn_alloc: generation bumped to %lu\n",
3943 			    ci->ci_asn_gen);
3944 #endif
3945 	}
3946 
3947 	/*
3948 	 * Assign the new ASN and validate the generation number.
3949 	 */
3950 	pmc->pmc_asn = ci->ci_next_asn++;
3951 	pmc->pmc_asngen = ci->ci_asn_gen;
3952 	TLB_COUNT(asn_assign);
3953 
3954 	/*
3955 	 * We have a new ASN, so we can skip any pending I-stream sync
3956 	 * on the way back out to user space.
3957 	 */
3958 	pmc->pmc_needisync = 0;
3959 
3960 #ifdef DEBUG
3961 	if (pmapdebug & PDB_ASN)
3962 		printf("pmap_asn_alloc: assigning %u to pmap %p\n",
3963 		    pmc->pmc_asn, pmap);
3964 #endif
3965 	return pmc->pmc_asn;
3966 }
3967