xref: /openbsd-src/sys/arch/arm64/arm64/pmap.c (revision 6f6231dc4f0a07fe7b8a4824090617757514f823)
1 /* $OpenBSD: pmap.c,v 1.107 2025/01/25 12:29:35 kettenis Exp $ */
2 /*
3  * Copyright (c) 2008-2009,2014-2016 Dale Rahn <drahn@dalerahn.com>
4  *
5  * Permission to use, copy, modify, and distribute this software for any
6  * purpose with or without fee is hereby granted, provided that the above
7  * copyright notice and this permission notice appear in all copies.
8  *
9  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
10  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
11  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
12  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
13  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
14  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
15  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
16  */
17 
18 #include <sys/param.h>
19 #include <sys/systm.h>
20 #include <sys/atomic.h>
21 #include <sys/pool.h>
22 #include <sys/proc.h>
23 
24 #include <uvm/uvm.h>
25 
26 #include <machine/cpufunc.h>
27 #include <machine/pmap.h>
28 
29 #include <machine/db_machdep.h>
30 #include <ddb/db_extern.h>
31 #include <ddb/db_output.h>
32 
33 void pmap_setttb(struct proc *p);
34 void pmap_allocate_asid(pmap_t);
35 void pmap_free_asid(pmap_t pm);
36 
37 /* We run userland code with ASIDs that have the low bit set. */
38 #define ASID_USER	1
39 
40 static inline void
41 ttlb_flush(pmap_t pm, vaddr_t va)
42 {
43 	vaddr_t resva;
44 
45 	resva = ((va >> PAGE_SHIFT) & ((1ULL << 44) - 1));
46 	if (pm == pmap_kernel()) {
47 		cpu_tlb_flush_all_asid(resva);
48 	} else {
49 		resva |= (uint64_t)pm->pm_asid << 48;
50 		cpu_tlb_flush_asid(resva);
51 		resva |= (uint64_t)ASID_USER << 48;
52 		cpu_tlb_flush_asid(resva);
53 	}
54 }
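
/*
 * A rough sketch of what ttlb_flush() hands to the TLB maintenance
 * helpers, assuming the architectural TLBI VAE1IS operand layout
 * (VA[55:12] in bits [43:0], ASID in bits [63:48]): for a user pmap
 * with pm_asid 42 and va 0x12345000,
 *
 *	resva = (0x12345000 >> PAGE_SHIFT) & ((1ULL << 44) - 1);
 *	cpu_tlb_flush_asid(resva | (42ULL << 48));	kernel-side ASID
 *	cpu_tlb_flush_asid(resva | (43ULL << 48));	user side (42 | ASID_USER)
 *
 * so both halves of the ASID pair are invalidated for that page.  Any
 * generation bits stored above bit 15 of pm_asid simply shift out of
 * the top of the 64-bit operand.
 */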
55 
56 struct pmap kernel_pmap_;
57 struct pmap pmap_tramp;
58 
59 LIST_HEAD(pted_pv_head, pte_desc);
60 
61 struct pte_desc {
62 	LIST_ENTRY(pte_desc) pted_pv_list;
63 	uint64_t pted_pte;
64 	pmap_t pted_pmap;
65 	vaddr_t pted_va;
66 };
67 
68 struct pmapvp0 {
69 	uint64_t l0[VP_IDX0_CNT];
70 	struct pmapvp1 *vp[VP_IDX0_CNT];
71 };
72 
73 struct pmapvp1 {
74 	uint64_t l1[VP_IDX1_CNT];
75 	struct pmapvp2 *vp[VP_IDX1_CNT];
76 };
77 
78 struct pmapvp2 {
79 	uint64_t l2[VP_IDX2_CNT];
80 	struct pmapvp3 *vp[VP_IDX2_CNT];
81 };
82 
83 struct pmapvp3 {
84 	uint64_t l3[VP_IDX3_CNT];
85 	struct pte_desc *vp[VP_IDX3_CNT];
86 };
87 CTASSERT(sizeof(struct pmapvp0) == sizeof(struct pmapvp1));
88 CTASSERT(sizeof(struct pmapvp0) == sizeof(struct pmapvp2));
89 CTASSERT(sizeof(struct pmapvp0) == sizeof(struct pmapvp3));
90 
91 void	pmap_vp_destroy(pmap_t pm);
92 
93 /* Allocator for VP pool. */
94 void	*pmap_vp_page_alloc(struct pool *, int, int *);
95 void	pmap_vp_page_free(struct pool *, void *);
96 
97 struct pool_allocator pmap_vp_allocator = {
98 	pmap_vp_page_alloc, pmap_vp_page_free, sizeof(struct pmapvp0)
99 };
100 
101 void	pmap_remove_pted(pmap_t, struct pte_desc *);
102 void	pmap_kremove_pg(vaddr_t);
103 void	pmap_set_l1(struct pmap *, uint64_t, struct pmapvp1 *);
104 void	pmap_set_l2(struct pmap *, uint64_t, struct pmapvp1 *,
105 	    struct pmapvp2 *);
106 void	pmap_set_l3(struct pmap *, uint64_t, struct pmapvp2 *,
107 	    struct pmapvp3 *);
108 
109 void	pmap_fill_pte(pmap_t, vaddr_t, paddr_t, struct pte_desc *,
110 	    vm_prot_t, int, int);
111 void	pmap_icache_sync_page(struct pmap *, paddr_t);
112 void	pmap_pte_insert(struct pte_desc *);
113 void	pmap_pte_remove(struct pte_desc *, int);
114 void	pmap_pte_update(struct pte_desc *, uint64_t *);
115 void	pmap_release(pmap_t);
116 paddr_t	pmap_steal_avail(size_t, int, void **);
117 void	pmap_remove_avail(paddr_t, paddr_t);
118 vaddr_t	pmap_map_stolen(vaddr_t);
119 
120 vaddr_t vmmap;
121 vaddr_t zero_page;
122 vaddr_t copy_src_page;
123 vaddr_t copy_dst_page;
124 
125 struct pool pmap_pmap_pool;
126 struct pool pmap_pted_pool;
127 struct pool pmap_vp_pool;
128 
129 int pmap_initialized = 0;
130 
131 struct mem_region {
132 	vaddr_t start;
133 	vsize_t size;
134 };
135 
136 struct mem_region pmap_avail_regions[10];
137 struct mem_region pmap_allocated_regions[10];
138 struct mem_region *pmap_avail = &pmap_avail_regions[0];
139 struct mem_region *pmap_allocated = &pmap_allocated_regions[0];
140 int pmap_cnt_avail, pmap_cnt_allocated;
141 uint64_t pmap_avail_kvo;
142 
143 static inline void
144 pmap_lock(struct pmap *pmap)
145 {
146 	if (pmap != pmap_kernel())
147 		mtx_enter(&pmap->pm_mtx);
148 }
149 
150 static inline void
151 pmap_unlock(struct pmap *pmap)
152 {
153 	if (pmap != pmap_kernel())
154 		mtx_leave(&pmap->pm_mtx);
155 }
156 
157 #define PMAP_ASSERT_LOCKED(pmap) 			\
158 	if ((pmap) != pmap_kernel()) 			\
159 		MUTEX_ASSERT_LOCKED(&(pmap)->pm_mtx);
160 
161 /* virtual address to VP table index helpers */
162 static inline int
163 VP_IDX0(vaddr_t va)
164 {
165 	return (va >> VP_IDX0_POS) & VP_IDX0_MASK;
166 }
167 
168 static inline int
169 VP_IDX1(vaddr_t va)
170 {
171 	return (va >> VP_IDX1_POS) & VP_IDX1_MASK;
172 }
173 
174 static inline int
175 VP_IDX2(vaddr_t va)
176 {
177 	return (va >> VP_IDX2_POS) & VP_IDX2_MASK;
178 }
179 
180 static inline int
181 VP_IDX3(vaddr_t va)
182 {
183 	return (va >> VP_IDX3_POS) & VP_IDX3_MASK;
184 }
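
/*
 * Illustrative decomposition, assuming the usual 4 KiB granule
 * constants from pmap.h (VP_IDX3_POS 12, VP_IDX2_POS 21, VP_IDX1_POS 30,
 * VP_IDX0_POS 39, 9-bit index masks): for va == 0x40201000,
 *
 *	VP_IDX0(va) == 0	VP_IDX1(va) == 1
 *	VP_IDX2(va) == 1	VP_IDX3(va) == 1
 *
 * i.e. each level picks one of 512 slots and the remaining low 12 bits
 * are the offset within the page.
 */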
185 
186 const uint64_t ap_bits_user[8] = {
187 	[PROT_NONE]				= 0,
188 	[PROT_READ]				= ATTR_PXN|ATTR_UXN|ATTR_AF|ATTR_AP(3),
189 	[PROT_WRITE]				= ATTR_PXN|ATTR_UXN|ATTR_AF|ATTR_AP(1),
190 	[PROT_WRITE|PROT_READ]			= ATTR_PXN|ATTR_UXN|ATTR_AF|ATTR_AP(1),
191 	[PROT_EXEC]				= ATTR_PXN|ATTR_AF|ATTR_AP(2),
192 	[PROT_EXEC|PROT_READ]			= ATTR_PXN|ATTR_AF|ATTR_AP(3),
193 	[PROT_EXEC|PROT_WRITE]			= ATTR_PXN|ATTR_AF|ATTR_AP(1),
194 	[PROT_EXEC|PROT_WRITE|PROT_READ]	= ATTR_PXN|ATTR_AF|ATTR_AP(1),
195 };
196 
197 const uint64_t ap_bits_kern[8] = {
198 	[PROT_NONE]				= 0,
199 	[PROT_READ]				= ATTR_PXN|ATTR_UXN|ATTR_AF|ATTR_AP(2),
200 	[PROT_WRITE]				= ATTR_PXN|ATTR_UXN|ATTR_AF|ATTR_AP(0),
201 	[PROT_WRITE|PROT_READ]			= ATTR_PXN|ATTR_UXN|ATTR_AF|ATTR_AP(0),
202 	[PROT_EXEC]				= ATTR_UXN|ATTR_AF|ATTR_AP(2),
203 	[PROT_EXEC|PROT_READ]			= ATTR_UXN|ATTR_AF|ATTR_AP(2),
204 	[PROT_EXEC|PROT_WRITE]			= ATTR_UXN|ATTR_AF|ATTR_AP(0),
205 	[PROT_EXEC|PROT_WRITE|PROT_READ]	= ATTR_UXN|ATTR_AF|ATTR_AP(0),
206 };
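
/*
 * These tables are indexed by the PROT_* bits kept in pted_pte (see
 * pmap_pte_update() below).  For example, a kernel read-only mapping
 * ends up with
 *
 *	access_bits = ap_bits_kern[PROT_READ];
 *		    = ATTR_PXN | ATTR_UXN | ATTR_AF | ATTR_AP(2);
 *
 * i.e. no execute at any level, access flag preset and, assuming the
 * usual ARMv8 AP encoding, privileged read-only access.
 */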
207 
208 /*
209  * We allocate ASIDs in pairs.  The first ASID is used to run the
210  * kernel and has both userland and the full kernel mapped.  The
211  * second ASID is used for running userland and has only the
212  * trampoline page mapped in addition to userland.
213  */
214 
215 #define PMAP_MAX_NASID	(1 << 16)
216 #define PMAP_ASID_MASK	(PMAP_MAX_NASID - 1)
217 int pmap_nasid = (1 << 8);
218 
219 uint32_t pmap_asid[PMAP_MAX_NASID / 32];
220 unsigned long pmap_asid_gen = PMAP_MAX_NASID;
221 struct mutex pmap_asid_mtx = MUTEX_INITIALIZER(IPL_HIGH);
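
/*
 * Sketch of the pm_asid layout used below: the bits covered by
 * PMAP_ASID_MASK hold the hardware ASID (always even; the matching
 * userland ASID is pm_asid | ASID_USER), and the bits above it hold
 * the rollover generation taken from pmap_asid_gen:
 *
 *	pm->pm_asid = asid | pmap_asid_gen;
 *	asid = pm->pm_asid & PMAP_ASID_MASK;
 *	gen  = pm->pm_asid & ~PMAP_ASID_MASK;
 */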
222 
223 int
224 pmap_find_asid(pmap_t pm)
225 {
226 	uint32_t bits;
227 	int asid, bit;
228 	int retry;
229 
230 	MUTEX_ASSERT_LOCKED(&pmap_asid_mtx);
231 
232 	/* Attempt to re-use the old ASID. */
233 	asid = pm->pm_asid & PMAP_ASID_MASK;
234 	bit = asid & (32 - 1);
235 	bits = pmap_asid[asid / 32];
236 	if ((bits & (3U << bit)) == 0)
237 		return asid;
238 
239 	/* Attempt to obtain a random ASID. */
240 	for (retry = 5; retry > 0; retry--) {
241 		asid = arc4random() & (pmap_nasid - 2);
242 		bit = (asid & (32 - 1));
243 		bits = pmap_asid[asid / 32];
244 		if ((bits & (3U << bit)) == 0)
245 			return asid;
246 	}
247 
248 	/* Do a linear search if that fails. */
249 	for (asid = 0; asid < pmap_nasid; asid += 32) {
250 		bits = pmap_asid[asid / 32];
251 		if (bits == ~0)
252 			continue;
253 		for (bit = 0; bit < 32; bit += 2) {
254 			if ((bits & (3U << bit)) == 0)
255 				return asid + bit;
256 		}
257 	}
258 
259 	return -1;
260 }
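
/*
 * Worked example of the pair bookkeeping above: ASIDs are reserved two
 * at a time, so each test looks at a pair of bits.  For asid == 42:
 *
 *	bit  = 42 & (32 - 1);		-> 10
 *	bits = pmap_asid[42 / 32];	-> word 1 of the bitmap
 *	(bits & (3U << bit)) == 0	-> ASIDs 42 and 43 are both free
 *
 * The random candidate "arc4random() & (pmap_nasid - 2)" is always
 * even, so a free pair always starts on the kernel-side ASID.
 */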
261 
262 int
263 pmap_rollover_asid(pmap_t pm)
264 {
265 	struct cpu_info *ci;
266 	CPU_INFO_ITERATOR cii;
267 	unsigned long gen;
268 	int asid, bit;
269 
270 	MUTEX_ASSERT_LOCKED(&pmap_asid_mtx);
271 
272 	/* Start a new generation.  Mark ASID 0 as in-use again. */
273 	gen = atomic_add_long_nv(&pmap_asid_gen, PMAP_MAX_NASID);
274 	memset(pmap_asid, 0, (pmap_nasid / 32) * sizeof(uint32_t));
275 	pmap_asid[0] |= (3U << 0);
276 
277 	/*
278 	 * Carry over all the ASIDs that are currently active into the
279 	 * new generation and reserve them.
280 	 * CPUs in cpu_switchto() will spin in pmap_setttb() waiting for
281 	 * the mutex. In that case an old ASID will be carried over but
282 	 * that is not problematic.
283 	 */
284 	CPU_INFO_FOREACH(cii, ci) {
285 		asid = ci->ci_curpm->pm_asid & PMAP_ASID_MASK;
286 		ci->ci_curpm->pm_asid = asid | gen;
287 		bit = (asid & (32 - 1));
288 		pmap_asid[asid / 32] |= (3U << bit);
289 	}
290 
291 	/* Flush the TLBs on all CPUs. */
292 	cpu_tlb_flush();
293 
294 	if ((pm->pm_asid & ~PMAP_ASID_MASK) == gen)
295 		return pm->pm_asid & PMAP_ASID_MASK;
296 
297 	return pmap_find_asid(pm);
298 }
299 
300 void
301 pmap_allocate_asid(pmap_t pm)
302 {
303 	int asid, bit;
304 
305 	mtx_enter(&pmap_asid_mtx);
306 	asid = pmap_find_asid(pm);
307 	if (asid == -1) {
308 		/*
309 		 * We have no free ASIDs.  Do a rollover to clear all
310 		 * inactive ASIDs and pick a fresh one.
311 		 */
312 		asid = pmap_rollover_asid(pm);
313 	}
314 	KASSERT(asid > 0 && asid < pmap_nasid);
315 	bit = asid & (32 - 1);
316 	pmap_asid[asid / 32] |= (3U << bit);
317 	pm->pm_asid = asid | pmap_asid_gen;
318 	mtx_leave(&pmap_asid_mtx);
319 }
320 
321 void
322 pmap_free_asid(pmap_t pm)
323 {
324 	int asid, bit;
325 
326 	KASSERT(pm != curcpu()->ci_curpm);
327 	cpu_tlb_flush_asid_all((uint64_t)pm->pm_asid << 48);
328 	cpu_tlb_flush_asid_all((uint64_t)(pm->pm_asid | ASID_USER) << 48);
329 
330 	mtx_enter(&pmap_asid_mtx);
331 	if ((pm->pm_asid & ~PMAP_ASID_MASK) == pmap_asid_gen) {
332 		asid = pm->pm_asid & PMAP_ASID_MASK;
333 		bit = (asid & (32 - 1));
334 		pmap_asid[asid / 32] &= ~(3U << bit);
335 	}
336 	mtx_leave(&pmap_asid_mtx);
337 }
338 
339 /*
340  * This is used for pmap_kernel() mappings: they are not to be removed
341  * from the vp table because they were statically initialized during
342  * the initial pmap setup.  This way no memory allocation is necessary
343  * for pmap_kernel() mappings; otherwise bad race conditions could
344  * appear.
345  */
346 struct pte_desc *
347 pmap_vp_lookup(pmap_t pm, vaddr_t va, uint64_t **pl3entry)
348 {
349 	struct pmapvp1 *vp1;
350 	struct pmapvp2 *vp2;
351 	struct pmapvp3 *vp3;
352 	struct pte_desc *pted;
353 
354 	if (pm->have_4_level_pt) {
355 		if (pm->pm_vp.l0 == NULL) {
356 			return NULL;
357 		}
358 		vp1 = pm->pm_vp.l0->vp[VP_IDX0(va)];
359 	} else {
360 		vp1 = pm->pm_vp.l1;
361 	}
362 	if (vp1 == NULL) {
363 		return NULL;
364 	}
365 
366 	vp2 = vp1->vp[VP_IDX1(va)];
367 	if (vp2 == NULL) {
368 		return NULL;
369 	}
370 
371 	vp3 = vp2->vp[VP_IDX2(va)];
372 	if (vp3 == NULL) {
373 		return NULL;
374 	}
375 
376 	pted = vp3->vp[VP_IDX3(va)];
377 	if (pl3entry != NULL)
378 		*pl3entry = &(vp3->l3[VP_IDX3(va)]);
379 
380 	return pted;
381 }
382 
383 /*
384  * Create a V -> P mapping for the given pmap and virtual address
385  * with reference to the pte descriptor that is used to map the page.
386  * This code should track allocations of vp table pages
387  * so they can be freed efficiently.
388  *
389  * XXX it may be possible to save some bits of the count in the
390  * upper address bits of the pa or the pte entry.
391  * However, that does make populating the other bits more tricky.
392  * Each level has 512 entries, so that means 9 bits to store;
393  * stash 3 bits each in the first 3 entries?
394  */
395 int
396 pmap_vp_enter(pmap_t pm, vaddr_t va, struct pte_desc *pted, int flags)
397 {
398 	struct pmapvp1 *vp1;
399 	struct pmapvp2 *vp2;
400 	struct pmapvp3 *vp3;
401 
402 	PMAP_ASSERT_LOCKED(pm);
403 
404 	if (pm->have_4_level_pt) {
405 		vp1 = pm->pm_vp.l0->vp[VP_IDX0(va)];
406 		if (vp1 == NULL) {
407 			vp1 = pool_get(&pmap_vp_pool, PR_NOWAIT | PR_ZERO);
408 			if (vp1 == NULL) {
409 				if ((flags & PMAP_CANFAIL) == 0)
410 					panic("%s: unable to allocate L1",
411 					    __func__);
412 				return ENOMEM;
413 			}
414 			pmap_set_l1(pm, va, vp1);
415 		}
416 	} else {
417 		vp1 = pm->pm_vp.l1;
418 	}
419 
420 	vp2 = vp1->vp[VP_IDX1(va)];
421 	if (vp2 == NULL) {
422 		vp2 = pool_get(&pmap_vp_pool, PR_NOWAIT | PR_ZERO);
423 		if (vp2 == NULL) {
424 			if ((flags & PMAP_CANFAIL) == 0)
425 				panic("%s: unable to allocate L2", __func__);
426 			return ENOMEM;
427 		}
428 		pmap_set_l2(pm, va, vp1, vp2);
429 	}
430 
431 	vp3 = vp2->vp[VP_IDX2(va)];
432 	if (vp3 == NULL) {
433 		vp3 = pool_get(&pmap_vp_pool, PR_NOWAIT | PR_ZERO);
434 		if (vp3 == NULL) {
435 			if ((flags & PMAP_CANFAIL) == 0)
436 				panic("%s: unable to allocate L3", __func__);
437 			return ENOMEM;
438 		}
439 		pmap_set_l3(pm, va, vp2, vp3);
440 	}
441 
442 	vp3->vp[VP_IDX3(va)] = pted;
443 	return 0;
444 }
445 
446 void
447 pmap_vp_populate(pmap_t pm, vaddr_t va)
448 {
449 	struct pte_desc *pted;
450 	struct pmapvp1 *vp1;
451 	struct pmapvp2 *vp2;
452 	struct pmapvp3 *vp3;
453 	void *vp;
454 
455 	pted = pool_get(&pmap_pted_pool, PR_WAITOK | PR_ZERO);
456 	vp = pool_get(&pmap_vp_pool, PR_WAITOK | PR_ZERO);
457 
458 	pmap_lock(pm);
459 
460 	if (pm->have_4_level_pt) {
461 		vp1 = pm->pm_vp.l0->vp[VP_IDX0(va)];
462 		if (vp1 == NULL) {
463 			vp1 = vp; vp = NULL;
464 			pmap_set_l1(pm, va, vp1);
465 		}
466 	} else {
467 		vp1 = pm->pm_vp.l1;
468 	}
469 
470 	if (vp == NULL) {
471 		pmap_unlock(pm);
472 		vp = pool_get(&pmap_vp_pool, PR_WAITOK | PR_ZERO);
473 		pmap_lock(pm);
474 	}
475 
476 	vp2 = vp1->vp[VP_IDX1(va)];
477 	if (vp2 == NULL) {
478 		vp2 = vp; vp = NULL;
479 		pmap_set_l2(pm, va, vp1, vp2);
480 	}
481 
482 	if (vp == NULL) {
483 		pmap_unlock(pm);
484 		vp = pool_get(&pmap_vp_pool, PR_WAITOK | PR_ZERO);
485 		pmap_lock(pm);
486 	}
487 
488 	vp3 = vp2->vp[VP_IDX2(va)];
489 	if (vp3 == NULL) {
490 		vp3 = vp; vp = NULL;
491 		pmap_set_l3(pm, va, vp2, vp3);
492 	}
493 
494 	if (vp3->vp[VP_IDX3(va)] == NULL) {
495 		vp3->vp[VP_IDX3(va)] = pted;
496 		pted = NULL;
497 	}
498 
499 	pmap_unlock(pm);
500 
501 	if (vp)
502 		pool_put(&pmap_vp_pool, vp);
503 	if (pted)
504 		pool_put(&pmap_pted_pool, pted);
505 }
506 
507 void *
508 pmap_vp_page_alloc(struct pool *pp, int flags, int *slowdown)
509 {
510 	struct kmem_dyn_mode kd = KMEM_DYN_INITIALIZER;
511 
512 	kd.kd_waitok = ISSET(flags, PR_WAITOK);
513 	kd.kd_trylock = ISSET(flags, PR_NOWAIT);
514 	kd.kd_slowdown = slowdown;
515 
516 	return km_alloc(pp->pr_pgsize, &kv_any, &kp_dirty, &kd);
517 }
518 
519 void
520 pmap_vp_page_free(struct pool *pp, void *v)
521 {
522 	km_free(v, pp->pr_pgsize, &kv_any, &kp_dirty);
523 }
524 
525 static inline u_int32_t
526 PTED_MANAGED(struct pte_desc *pted)
527 {
528 	return (pted->pted_va & PTED_VA_MANAGED_M);
529 }
530 
531 static inline u_int32_t
532 PTED_WIRED(struct pte_desc *pted)
533 {
534 	return (pted->pted_va & PTED_VA_WIRED_M);
535 }
536 
537 static inline u_int32_t
538 PTED_VALID(struct pte_desc *pted)
539 {
540 	return (pted->pted_pte != 0);
541 }
542 
543 /*
544  * PV entries -
545  * manipulate the physical to virtual translations for the entire system.
546  *
547  * QUESTION: should all mapped memory be stored in PV tables?  Or
548  * is it alright to only store "ram" memory?  Currently device mappings
549  * are not stored.
550  * It makes sense to pre-allocate mappings for all of "ram" memory, since
551  * it is likely that it will be mapped at some point, but would it also
552  * make sense to use a tree/table like the one used for pmap to store
553  * device mappings?
554  * Further notes: it seems that the PV table is only used for pmap_protect
555  * and other paging related operations.  Given this, it is not necessary
556  * to store any pmap_kernel() entries in PV tables and it does not make
557  * sense to store device mappings in PV either.
558  *
559  * Note: unlike other powerpc pmap designs, the array is only an array
560  * of pointers.  The same structure is used for holding information
561  * in the VP table, the PV table and for kernel mappings (the wired
562  * entries), so one data structure holds all of the info instead of
563  * replicating it multiple times.
564  *
565  * One issue with making this a single data structure is that two pointers
566  * are wasted for every page which does not map ram (device mappings).
567  * This should be a low percentage of mapped pages in the system, so it
568  * should not cause noticeable unnecessary ram consumption.
569  */
570 
571 void
572 pmap_enter_pv(struct pte_desc *pted, struct vm_page *pg)
573 {
574 	/*
575 	 * XXX does this test mean that some pages try to be managed,
576 	 * but this is called too soon?
577 	 */
578 	if (__predict_false(!pmap_initialized))
579 		return;
580 
581 	mtx_enter(&pg->mdpage.pv_mtx);
582 	LIST_INSERT_HEAD(&(pg->mdpage.pv_list), pted, pted_pv_list);
583 	pted->pted_va |= PTED_VA_MANAGED_M;
584 	mtx_leave(&pg->mdpage.pv_mtx);
585 }
586 
587 void
588 pmap_remove_pv(struct pte_desc *pted)
589 {
590 	struct vm_page *pg = PHYS_TO_VM_PAGE(pted->pted_pte & PTE_RPGN);
591 
592 	mtx_enter(&pg->mdpage.pv_mtx);
593 	LIST_REMOVE(pted, pted_pv_list);
594 	mtx_leave(&pg->mdpage.pv_mtx);
595 }
596 
597 int
598 pmap_enter(pmap_t pm, vaddr_t va, paddr_t pa, vm_prot_t prot, int flags)
599 {
600 	struct pte_desc *pted;
601 	struct vm_page *pg;
602 	int error;
603 	int cache = PMAP_CACHE_WB;
604 
605 	if (pa & PMAP_NOCACHE)
606 		cache = PMAP_CACHE_CI;
607 	if (pa & PMAP_DEVICE)
608 		cache = PMAP_CACHE_DEV_NGNRNE;
609 	pg = PHYS_TO_VM_PAGE(pa);
610 
611 	pmap_lock(pm);
612 	pted = pmap_vp_lookup(pm, va, NULL);
613 	if (pted && PTED_VALID(pted)) {
614 		pmap_remove_pted(pm, pted);
615 		/* we lost our pted if it was user */
616 		if (pm != pmap_kernel())
617 			pted = pmap_vp_lookup(pm, va, NULL);
618 	}
619 
620 	pm->pm_stats.resident_count++;
621 
622 	/* No pted for this VA yet; get one and put it in the VP table. */
623 	if (pted == NULL) {
624 		pted = pool_get(&pmap_pted_pool, PR_NOWAIT | PR_ZERO);
625 		if (pted == NULL) {
626 			if ((flags & PMAP_CANFAIL) == 0)
627 				panic("%s: failed to allocate pted", __func__);
628 			error = ENOMEM;
629 			goto out;
630 		}
631 		if (pmap_vp_enter(pm, va, pted, flags)) {
632 			if ((flags & PMAP_CANFAIL) == 0)
633 				panic("%s: failed to allocate L2/L3", __func__);
634 			error = ENOMEM;
635 			pool_put(&pmap_pted_pool, pted);
636 			goto out;
637 		}
638 	}
639 
640 	/*
641 	 * If it should be enabled _right now_, we can skip doing ref/mod
642 	 * emulation. Any access includes reference, modified only by write.
643 	 */
644 	if (pg != NULL &&
645 	    ((flags & PROT_MASK) || (pg->pg_flags & PG_PMAP_REF))) {
646 		atomic_setbits_int(&pg->pg_flags, PG_PMAP_REF);
647 		if ((prot & PROT_WRITE) && (flags & PROT_WRITE)) {
648 			atomic_setbits_int(&pg->pg_flags, PG_PMAP_MOD);
649 			atomic_clearbits_int(&pg->pg_flags, PG_PMAP_EXE);
650 		}
651 	}
652 
653 	pmap_fill_pte(pm, va, pa, pted, prot, flags, cache);
654 
655 	if (pg != NULL) {
656 		pmap_enter_pv(pted, pg); /* only managed mem */
657 	}
658 
659 	if (pg != NULL && (flags & PROT_EXEC)) {
660 		if ((pg->pg_flags & PG_PMAP_EXE) == 0)
661 			pmap_icache_sync_page(pm, pa);
662 		atomic_setbits_int(&pg->pg_flags, PG_PMAP_EXE);
663 	}
664 
665 	/*
666 	 * Insert into the table if this mapping said it needed to be mapped
667 	 * now.
668 	 */
669 	if (flags & (PROT_READ|PROT_WRITE|PROT_EXEC|PMAP_WIRED)) {
670 		pmap_pte_insert(pted);
671 		ttlb_flush(pm, va & ~PAGE_MASK);
672 	}
673 
674 	error = 0;
675 out:
676 	pmap_unlock(pm);
677 	return error;
678 }
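
/*
 * Caller-side sketch (all names are the real ones above): "prot" is the
 * maximum protection of the mapping, while the PROT_* bits in "flags"
 * describe the access that triggered the call, so
 *
 *	pmap_enter(pm, va, pa, PROT_READ|PROT_WRITE, PROT_READ);
 *
 * installs a read-only PTE immediately and leaves write access to be
 * enabled later by pmap_fault_fixup() when modified emulation kicks in.
 */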
679 
680 void
681 pmap_populate(pmap_t pm, vaddr_t va)
682 {
683 	pmap_vp_populate(pm, va);
684 }
685 
686 /*
687  * Remove the given range of mapping entries.
688  */
689 void
690 pmap_remove(pmap_t pm, vaddr_t sva, vaddr_t eva)
691 {
692 	struct pte_desc *pted;
693 	vaddr_t va;
694 
695 	pmap_lock(pm);
696 	for (va = sva; va < eva; va += PAGE_SIZE) {
697 		pted = pmap_vp_lookup(pm, va, NULL);
698 
699 		if (pted == NULL)
700 			continue;
701 
702 		if (PTED_WIRED(pted)) {
703 			pm->pm_stats.wired_count--;
704 			pted->pted_va &= ~PTED_VA_WIRED_M;
705 		}
706 
707 		if (PTED_VALID(pted))
708 			pmap_remove_pted(pm, pted);
709 	}
710 	pmap_unlock(pm);
711 }
712 
713 /*
714  * Remove a single mapping; note that this code is O(1).
715  */
716 void
717 pmap_remove_pted(pmap_t pm, struct pte_desc *pted)
718 {
719 	pm->pm_stats.resident_count--;
720 
721 	if (PTED_WIRED(pted)) {
722 		pm->pm_stats.wired_count--;
723 		pted->pted_va &= ~PTED_VA_WIRED_M;
724 	}
725 
726 	pmap_pte_remove(pted, pm != pmap_kernel());
727 	ttlb_flush(pm, pted->pted_va & ~PAGE_MASK);
728 
729 	if (pted->pted_va & PTED_VA_EXEC_M) {
730 		pted->pted_va &= ~PTED_VA_EXEC_M;
731 	}
732 
733 	if (PTED_MANAGED(pted))
734 		pmap_remove_pv(pted);
735 
736 	pted->pted_pte = 0;
737 	pted->pted_va = 0;
738 
739 	if (pm != pmap_kernel())
740 		pool_put(&pmap_pted_pool, pted);
741 }
742 
743 
744 /*
745  * Enter a kernel mapping for the given page.
746  * Kernel mappings have a larger set of prerequisites than normal mappings:
747  *
748  * 1. no memory should be allocated to create a kernel mapping,
749  * 2. a vp mapping should already exist, even if invalid (see 1),
750  * 3. all vp tree mappings should already exist (see 1).
751  *
752  */
753 void
754 _pmap_kenter_pa(vaddr_t va, paddr_t pa, vm_prot_t prot, int flags, int cache)
755 {
756 	pmap_t pm = pmap_kernel();
757 	struct pte_desc *pted;
758 	struct vm_page *pg;
759 
760 	pted = pmap_vp_lookup(pm, va, NULL);
761 	if (pted == NULL) {
762 		panic("pted not preallocated in pmap_kernel() va %lx pa %lx",
763 		    va, pa);
764 	}
765 
766 	if (pted && PTED_VALID(pted))
767 		pmap_kremove_pg(va); /* pted is reused */
768 
769 	pm->pm_stats.resident_count++;
770 
771 	flags |= PMAP_WIRED; /* kernel mappings are always wired. */
772 	/* Calculate PTE */
773 	pmap_fill_pte(pm, va, pa, pted, prot, flags, cache);
774 
775 	/* Insert into table */
776 	pmap_pte_insert(pted);
777 	ttlb_flush(pm, va & ~PAGE_MASK);
778 
779 	pg = PHYS_TO_VM_PAGE(pted->pted_pte & PTE_RPGN);
780 	if (pg && (cache == PMAP_CACHE_CI || cache == PMAP_CACHE_DEV_NGNRNE))
781 		cpu_idcache_wbinv_range(va & ~PAGE_MASK, PAGE_SIZE);
782 }
783 
784 void
785 pmap_kenter_pa(vaddr_t va, paddr_t pa, vm_prot_t prot)
786 {
787 	_pmap_kenter_pa(va, pa, prot, prot,
788 	    (pa & PMAP_NOCACHE) ? PMAP_CACHE_CI : PMAP_CACHE_WB);
789 }
790 
791 void
792 pmap_kenter_cache(vaddr_t va, paddr_t pa, vm_prot_t prot, int cacheable)
793 {
794 	_pmap_kenter_pa(va, pa, prot, prot, cacheable);
795 }
796 
797 /*
798  * remove kernel (pmap_kernel()) mapping, one page
799  */
800 void
801 pmap_kremove_pg(vaddr_t va)
802 {
803 	pmap_t pm = pmap_kernel();
804 	struct pte_desc *pted;
805 	int s;
806 
807 	pted = pmap_vp_lookup(pm, va, NULL);
808 	if (pted == NULL)
809 		return;
810 
811 	if (!PTED_VALID(pted))
812 		return; /* not mapped */
813 
814 	s = splvm();
815 
816 	pm->pm_stats.resident_count--;
817 
818 	pmap_pte_remove(pted, 0);
819 	ttlb_flush(pm, pted->pted_va & ~PAGE_MASK);
820 
821 	if (pted->pted_va & PTED_VA_EXEC_M)
822 		pted->pted_va &= ~PTED_VA_EXEC_M;
823 
824 	if (PTED_MANAGED(pted))
825 		pmap_remove_pv(pted);
826 
827 	if (PTED_WIRED(pted))
828 		pm->pm_stats.wired_count--;
829 
830 	/* invalidate pted; */
831 	pted->pted_pte = 0;
832 	pted->pted_va = 0;
833 
834 	splx(s);
835 }
836 
837 /*
838  * remove kernel (pmap_kernel()) mappings
839  */
840 void
841 pmap_kremove(vaddr_t va, vsize_t len)
842 {
843 	for (len >>= PAGE_SHIFT; len > 0; len--, va += PAGE_SIZE)
844 		pmap_kremove_pg(va);
845 }
846 
847 void
848 pmap_fill_pte(pmap_t pm, vaddr_t va, paddr_t pa, struct pte_desc *pted,
849     vm_prot_t prot, int flags, int cache)
850 {
851 	pted->pted_va = va;
852 	pted->pted_pmap = pm;
853 
854 	switch (cache) {
855 	case PMAP_CACHE_WB:
856 		break;
857 	case PMAP_CACHE_WT:
858 		break;
859 	case PMAP_CACHE_CI:
860 		break;
861 	case PMAP_CACHE_DEV_NGNRNE:
862 		break;
863 	case PMAP_CACHE_DEV_NGNRE:
864 		break;
865 	default:
866 		panic("%s: invalid cache mode", __func__);
867 	}
868 	pted->pted_va |= cache;
869 
870 	pted->pted_va |= prot & (PROT_READ|PROT_WRITE|PROT_EXEC);
871 
872 	if (flags & PMAP_WIRED) {
873 		pted->pted_va |= PTED_VA_WIRED_M;
874 		pm->pm_stats.wired_count++;
875 	}
876 
877 	pted->pted_pte = pa & PTE_RPGN;
878 	pted->pted_pte |= flags & (PROT_READ|PROT_WRITE|PROT_EXEC);
879 }
880 
881 /*
882  * Fill the given physical page with zeros.
883  */
884 void
885 pmap_zero_page(struct vm_page *pg)
886 {
887 	paddr_t pa = VM_PAGE_TO_PHYS(pg);
888 	vaddr_t va = zero_page + cpu_number() * PAGE_SIZE;
889 
890 	KASSERT(curcpu()->ci_idepth == 0);
891 
892 	pmap_kenter_pa(va, pa, PROT_READ|PROT_WRITE);
893 	pagezero_cache(va);
894 	pmap_kremove_pg(va);
895 }
896 
897 /*
898  * Copy the given physical page.
899  */
900 void
901 pmap_copy_page(struct vm_page *srcpg, struct vm_page *dstpg)
902 {
903 	paddr_t srcpa = VM_PAGE_TO_PHYS(srcpg);
904 	paddr_t dstpa = VM_PAGE_TO_PHYS(dstpg);
905 	vaddr_t srcva = copy_src_page + cpu_number() * PAGE_SIZE;
906 	vaddr_t dstva = copy_dst_page + cpu_number() * PAGE_SIZE;
907 	int s;
908 
909 	/*
910 	 * XXX The buffer flipper (incorrectly?) uses pmap_copy_page()
911 	 * (from uvm_pagerealloc_multi()) from interrupt context!
912 	 */
913 	s = splbio();
914 	pmap_kenter_pa(srcva, srcpa, PROT_READ);
915 	pmap_kenter_pa(dstva, dstpa, PROT_READ|PROT_WRITE);
916 	memcpy((void *)dstva, (void *)srcva, PAGE_SIZE);
917 	pmap_kremove_pg(srcva);
918 	pmap_kremove_pg(dstva);
919 	splx(s);
920 }
921 
922 void
923 pmap_pinit(pmap_t pm)
924 {
925 	vaddr_t l0va;
926 
927 	/* Allocate a full L0/L1 table. */
928 	if (pm->have_4_level_pt) {
929 		while (pm->pm_vp.l0 == NULL) {
930 			pm->pm_vp.l0 = pool_get(&pmap_vp_pool,
931 			    PR_WAITOK | PR_ZERO);
932 		}
933 		l0va = (vaddr_t)pm->pm_vp.l0->l0; /* top level is l0 */
934 	} else {
935 		while (pm->pm_vp.l1 == NULL) {
936 
937 			pm->pm_vp.l1 = pool_get(&pmap_vp_pool,
938 			    PR_WAITOK | PR_ZERO);
939 		}
940 		l0va = (vaddr_t)pm->pm_vp.l1->l1; /* top level is l1 */
941 
942 	}
943 
944 	pmap_extract(pmap_kernel(), l0va, (paddr_t *)&pm->pm_pt0pa);
945 
946 	pmap_reference(pm);
947 }
948 
949 int pmap_vp_poolcache = 0; /* force vp poolcache to allocate late */
950 
951 /*
952  * Create and return a physical map.
953  */
954 pmap_t
955 pmap_create(void)
956 {
957 	pmap_t pmap;
958 
959 	pmap = pool_get(&pmap_pmap_pool, PR_WAITOK | PR_ZERO);
960 
961 	mtx_init(&pmap->pm_mtx, IPL_VM);
962 
963 	pmap_pinit(pmap);
964 	if (pmap_vp_poolcache == 0) {
965 		pool_setlowat(&pmap_vp_pool, 20);
966 		pmap_vp_poolcache = 20;
967 	}
968 	return (pmap);
969 }
970 
971 /*
972  * Add a reference to a given pmap.
973  */
974 void
975 pmap_reference(pmap_t pm)
976 {
977 	atomic_inc_int(&pm->pm_refs);
978 }
979 
980 /*
981  * Retire the given pmap from service.
982  * Should only be called if the map contains no valid mappings.
983  */
984 void
985 pmap_destroy(pmap_t pm)
986 {
987 	int refs;
988 
989 	refs = atomic_dec_int_nv(&pm->pm_refs);
990 	if (refs > 0)
991 		return;
992 
993 	/*
994 	 * reference count is zero, free pmap resources and free pmap.
995 	 */
996 	pmap_release(pm);
997 	pmap_free_asid(pm);
998 	pool_put(&pmap_pmap_pool, pm);
999 }
1000 
1001 /*
1002  * Release any resources held by the given physical map.
1003  * Called when a pmap initialized by pmap_pinit is being released.
1004  */
1005 void
1006 pmap_release(pmap_t pm)
1007 {
1008 	pmap_vp_destroy(pm);
1009 }
1010 
1011 void
1012 pmap_vp_destroy_l2_l3(pmap_t pm, struct pmapvp1 *vp1)
1013 {
1014 	struct pmapvp2 *vp2;
1015 	struct pmapvp3 *vp3;
1016 	struct pte_desc *pted;
1017 	int j, k, l;
1018 
1019 	for (j = 0; j < VP_IDX1_CNT; j++) {
1020 		vp2 = vp1->vp[j];
1021 		if (vp2 == NULL)
1022 			continue;
1023 		vp1->vp[j] = NULL;
1024 
1025 		for (k = 0; k < VP_IDX2_CNT; k++) {
1026 			vp3 = vp2->vp[k];
1027 			if (vp3 == NULL)
1028 				continue;
1029 			vp2->vp[k] = NULL;
1030 
1031 			for (l = 0; l < VP_IDX3_CNT; l++) {
1032 				pted = vp3->vp[l];
1033 				if (pted == NULL)
1034 					continue;
1035 				vp3->vp[l] = NULL;
1036 
1037 				pool_put(&pmap_pted_pool, pted);
1038 			}
1039 			pool_put(&pmap_vp_pool, vp3);
1040 		}
1041 		pool_put(&pmap_vp_pool, vp2);
1042 	}
1043 }
1044 
1045 void
1046 pmap_vp_destroy(pmap_t pm)
1047 {
1048 	struct pmapvp0 *vp0;
1049 	struct pmapvp1 *vp1;
1050 	int i;
1051 
1052 	/*
1053 	 * XXX Is there a better way to share this code between 3 and
1054 	 * 4 level tables?  Split the lower levels into a different
1055 	 * function?
1056 	 */
1057 	if (!pm->have_4_level_pt) {
1058 		pmap_vp_destroy_l2_l3(pm, pm->pm_vp.l1);
1059 		pool_put(&pmap_vp_pool, pm->pm_vp.l1);
1060 		pm->pm_vp.l1 = NULL;
1061 		return;
1062 	}
1063 
1064 	vp0 = pm->pm_vp.l0;
1065 	for (i = 0; i < VP_IDX0_CNT; i++) {
1066 		vp1 = vp0->vp[i];
1067 		if (vp1 == NULL)
1068 			continue;
1069 		vp0->vp[i] = NULL;
1070 
1071 		pmap_vp_destroy_l2_l3(pm, vp1);
1072 		pool_put(&pmap_vp_pool, vp1);
1073 	}
1074 	pool_put(&pmap_vp_pool, vp0);
1075 	pm->pm_vp.l0 = NULL;
1076 }
1077 
1078 vaddr_t virtual_avail;
1079 int	pmap_virtual_space_called;
1080 
1081 static inline uint64_t
1082 VP_Lx(paddr_t pa)
1083 {
1084 	/*
1085 	 * This function takes the given pa and converts it into the form
1086 	 * that should be inserted into the page tables.
1087 	 */
1088 	return pa | Lx_TYPE_PT;
1089 }
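
/*
 * For example, pmap_set_l2() below stores the KVA of a freshly
 * allocated table in vp1->vp[] for the software walk and VP_Lx() of
 * its physical address in vp1->l1[] for the hardware walker; the two
 * arrays are always kept in sync.
 */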
1090 
1091 /*
1092  * In pmap_bootstrap() we allocate the page tables for the first GB
1093  * of the kernel address space.
1094  */
1095 vaddr_t pmap_maxkvaddr = VM_MIN_KERNEL_ADDRESS + 1024 * 1024 * 1024;
1096 
1097 /*
1098  * Allocator for growing the kernel page tables.  We use a dedicated
1099  * submap to make sure we have the space to map them as we are called
1100  * when address space is tight!
1101  */
1102 
1103 struct vm_map *pmap_kvp_map;
1104 
1105 const struct kmem_va_mode kv_kvp = {
1106 	.kv_map = &pmap_kvp_map,
1107 	.kv_wait = 0
1108 };
1109 
1110 void *
1111 pmap_kvp_alloc(void)
1112 {
1113 	void *kvp;
1114 
1115 	if (!uvm.page_init_done && !pmap_virtual_space_called) {
1116 		paddr_t pa[2];
1117 		vaddr_t va;
1118 
1119 		if (!uvm_page_physget(&pa[0]) || !uvm_page_physget(&pa[1]))
1120 			panic("%s: out of memory", __func__);
1121 
1122 		va = virtual_avail;
1123 		virtual_avail += 2 * PAGE_SIZE;
1124 		KASSERT(virtual_avail <= pmap_maxkvaddr);
1125 		kvp = (void *)va;
1126 
1127 		pmap_kenter_pa(va, pa[0], PROT_READ|PROT_WRITE);
1128 		pmap_kenter_pa(va + PAGE_SIZE, pa[1], PROT_READ|PROT_WRITE);
1129 		pagezero_cache(va);
1130 		pagezero_cache(va + PAGE_SIZE);
1131 	} else {
1132 		kvp = km_alloc(sizeof(struct pmapvp0), &kv_kvp, &kp_zero,
1133 		    &kd_nowait);
1134 	}
1135 
1136 	return kvp;
1137 }
1138 
1139 struct pte_desc *
1140 pmap_kpted_alloc(void)
1141 {
1142 	static struct pte_desc *pted;
1143 	static int npted;
1144 
1145 	if (npted == 0) {
1146 		if (!uvm.page_init_done && !pmap_virtual_space_called) {
1147 			paddr_t pa;
1148 			vaddr_t va;
1149 
1150 			if (!uvm_page_physget(&pa))
1151 				panic("%s: out of memory", __func__);
1152 
1153 			va = virtual_avail;
1154 			virtual_avail += PAGE_SIZE;
1155 			KASSERT(virtual_avail <= pmap_maxkvaddr);
1156 			pted = (struct pte_desc *)va;
1157 
1158 			pmap_kenter_pa(va, pa, PROT_READ|PROT_WRITE);
1159 			pagezero_cache(va);
1160 		} else {
1161 			pted = km_alloc(PAGE_SIZE, &kv_kvp, &kp_zero,
1162 			    &kd_nowait);
1163 			if (pted == NULL)
1164 				return NULL;
1165 		}
1166 
1167 		npted = PAGE_SIZE / sizeof(struct pte_desc);
1168 	}
1169 
1170 	npted--;
1171 	return pted++;
1172 }
1173 
1174 vaddr_t
1175 pmap_growkernel(vaddr_t maxkvaddr)
1176 {
1177 	struct pmapvp1 *vp1 = pmap_kernel()->pm_vp.l1;
1178 	struct pmapvp2 *vp2;
1179 	struct pmapvp3 *vp3;
1180 	struct pte_desc *pted;
1181 	paddr_t pa;
1182 	int lb_idx2, ub_idx2;
1183 	int i, j, k;
1184 	int s;
1185 
1186 	if (maxkvaddr <= pmap_maxkvaddr)
1187 		return pmap_maxkvaddr;
1188 
1189 	/*
1190 	 * Not strictly necessary, but we use an interrupt-safe map
1191 	 * and uvm asserts that we're at IPL_VM.
1192 	 */
1193 	s = splvm();
1194 
1195 	for (i = VP_IDX1(pmap_maxkvaddr); i <= VP_IDX1(maxkvaddr - 1); i++) {
1196 		vp2 = vp1->vp[i];
1197 		if (vp2 == NULL) {
1198 			vp2 = pmap_kvp_alloc();
1199 			if (vp2 == NULL)
1200 				goto fail;
1201 			pmap_extract(pmap_kernel(), (vaddr_t)vp2, &pa);
1202 			vp1->vp[i] = vp2;
1203 			vp1->l1[i] = VP_Lx(pa);
1204 		}
1205 
1206 		if (i == VP_IDX1(pmap_maxkvaddr)) {
1207 			lb_idx2 = VP_IDX2(pmap_maxkvaddr);
1208 		} else {
1209 			lb_idx2 = 0;
1210 		}
1211 
1212 		if (i == VP_IDX1(maxkvaddr - 1)) {
1213 			ub_idx2 = VP_IDX2(maxkvaddr - 1);
1214 		} else {
1215 			ub_idx2 = VP_IDX2_CNT - 1;
1216 		}
1217 
1218 		for (j = lb_idx2; j <= ub_idx2; j++) {
1219 			vp3 = vp2->vp[j];
1220 			if (vp3 == NULL) {
1221 				vp3 = pmap_kvp_alloc();
1222 				if (vp3 == NULL)
1223 					goto fail;
1224 				pmap_extract(pmap_kernel(), (vaddr_t)vp3, &pa);
1225 				vp2->vp[j] = vp3;
1226 				vp2->l2[j] = VP_Lx(pa);
1227 			}
1228 
1229 			for (k = 0; k <= VP_IDX3_CNT - 1; k++) {
1230 				if (vp3->vp[k] == NULL) {
1231 					pted = pmap_kpted_alloc();
1232 					if (pted == NULL)
1233 						goto fail;
1234 					vp3->vp[k] = pted;
1235 					pmap_maxkvaddr += PAGE_SIZE;
1236 				}
1237 			}
1238 		}
1239 	}
1240 	KASSERT(pmap_maxkvaddr >= maxkvaddr);
1241 
1242 fail:
1243 	splx(s);
1244 
1245 	return pmap_maxkvaddr;
1246 }
1247 
1248 void pmap_setup_avail(uint64_t ram_start, uint64_t ram_end, uint64_t kvo);
1249 
1250 /*
1251  * Initialize pmap setup.
1252  * ALL of the code which deals with avail needs to be rewritten as an
1253  * actual memory allocation.
1254  */
1255 CTASSERT(sizeof(struct pmapvp0) == 2 * PAGE_SIZE);
1256 
1257 int mappings_allocated = 0;
1258 int pted_allocated = 0;
1259 
1260 extern char __text_start[], _etext[];
1261 extern char __rodata_start[], _erodata[];
1262 
1263 vaddr_t
1264 pmap_bootstrap(long kvo, paddr_t lpt1, long kernelstart, long kernelend,
1265     long ram_start, long ram_end)
1266 {
1267 	void  *va;
1268 	paddr_t pa, pt1pa;
1269 	struct pmapvp1 *vp1;
1270 	struct pmapvp2 *vp2;
1271 	struct pmapvp3 *vp3;
1272 	struct pte_desc *pted;
1273 	vaddr_t vstart;
1274 	uint64_t id_aa64mmfr0;
1275 	int i, j, k;
1276 	int lb_idx2, ub_idx2;
1277 
1278 	pmap_setup_avail(ram_start, ram_end, kvo);
1279 
1280 	/*
1281 	 * In theory we could start with just the memory in the
1282 	 * kernel; however, this could 'allocate' the bootloader and
1283 	 * bootstrap vm table, which we may need to preserve until
1284 	 * later.
1285 	 */
1286 	printf("removing %lx-%lx\n", ram_start, kernelstart+kvo);
1287 	pmap_remove_avail(ram_start, kernelstart+kvo);
1288 	printf("removing %lx-%lx\n", kernelstart+kvo, kernelend+kvo);
1289 	pmap_remove_avail(kernelstart+kvo, kernelend+kvo);
1290 
1291 	/*
1292 	 * The kernel is assumed to use 39 bits (or less) of virtual
1293 	 * address space, so start from L1, not L0.  Also, the kernel
1294 	 * mappings may not cover enough ram to bootstrap, so all accesses
1295 	 * initializing tables must be done via physical pointers.
1296 	 */
1297 
1298 	pt1pa = pmap_steal_avail(2 * sizeof(struct pmapvp1), Lx_TABLE_ALIGN,
1299 	    &va);
1300 	vp1 = (struct pmapvp1 *)pt1pa;
1301 	pmap_kernel()->pm_vp.l1 = (struct pmapvp1 *)va;
1302 	pmap_kernel()->pm_privileged = 1;
1303 	pmap_kernel()->pm_guarded = ATTR_GP;
1304 	pmap_kernel()->pm_asid = 0;
1305 
1306 	mtx_init(&pmap_tramp.pm_mtx, IPL_VM);
1307 	pmap_tramp.pm_vp.l1 = (struct pmapvp1 *)va + 1;
1308 	pmap_tramp.pm_privileged = 1;
1309 	pmap_tramp.pm_guarded = ATTR_GP;
1310 	pmap_tramp.pm_asid = 0;
1311 
1312 	/* Mark ASID 0 as in-use. */
1313 	pmap_asid[0] |= (3U << 0);
1314 
1315 	/* allocate Lx entries */
1316 	for (i = VP_IDX1(VM_MIN_KERNEL_ADDRESS);
1317 	    i <= VP_IDX1(pmap_maxkvaddr - 1);
1318 	    i++) {
1319 		mappings_allocated++;
1320 		pa = pmap_steal_avail(sizeof(struct pmapvp2), Lx_TABLE_ALIGN,
1321 		    &va);
1322 		vp2 = (struct pmapvp2 *)pa; /* indexed physically */
1323 		vp1->vp[i] = va;
1324 		vp1->l1[i] = VP_Lx(pa);
1325 
1326 		if (i == VP_IDX1(VM_MIN_KERNEL_ADDRESS)) {
1327 			lb_idx2 = VP_IDX2(VM_MIN_KERNEL_ADDRESS);
1328 		} else {
1329 			lb_idx2 = 0;
1330 		}
1331 		if (i == VP_IDX1(pmap_maxkvaddr - 1)) {
1332 			ub_idx2 = VP_IDX2(pmap_maxkvaddr - 1);
1333 		} else {
1334 			ub_idx2 = VP_IDX2_CNT - 1;
1335 		}
1336 		for (j = lb_idx2; j <= ub_idx2; j++) {
1337 			mappings_allocated++;
1338 			pa = pmap_steal_avail(sizeof(struct pmapvp3),
1339 			    Lx_TABLE_ALIGN, &va);
1340 			vp3 = (struct pmapvp3 *)pa; /* indexed physically */
1341 			vp2->vp[j] = va;
1342 			vp2->l2[j] = VP_Lx(pa);
1343 
1344 		}
1345 	}
1346 	/* allocate pte_desc entries */
1347 	for (i = VP_IDX1(VM_MIN_KERNEL_ADDRESS);
1348 	    i <= VP_IDX1(pmap_maxkvaddr - 1);
1349 	    i++) {
1350 		/* access must be performed physically */
1351 		vp2 = (void *)((long)vp1->vp[i] + kvo);
1352 
1353 		if (i == VP_IDX1(VM_MIN_KERNEL_ADDRESS)) {
1354 			lb_idx2 = VP_IDX2(VM_MIN_KERNEL_ADDRESS);
1355 		} else {
1356 			lb_idx2 = 0;
1357 		}
1358 		if (i == VP_IDX1(pmap_maxkvaddr - 1)) {
1359 			ub_idx2 = VP_IDX2(pmap_maxkvaddr - 1);
1360 		} else {
1361 			ub_idx2 = VP_IDX2_CNT - 1;
1362 		}
1363 		for (j = lb_idx2; j <= ub_idx2; j++) {
1364 			/* access must be performed physically */
1365 			vp3 = (void *)((long)vp2->vp[j] + kvo);
1366 
1367 			for (k = 0; k <= VP_IDX3_CNT - 1; k++) {
1368 				pted_allocated++;
1369 				pa = pmap_steal_avail(sizeof(struct pte_desc),
1370 				    4, &va);
1371 				pted = va;
1372 				vp3->vp[k] = pted;
1373 			}
1374 		}
1375 	}
1376 
1377 	pa = pmap_steal_avail(Lx_TABLE_ALIGN, Lx_TABLE_ALIGN, &va);
1378 	memset((void *)pa, 0, Lx_TABLE_ALIGN);
1379 	pmap_kernel()->pm_pt0pa = pa;
1380 
1381 	pmap_avail_fixup();
1382 
1383 	/*
1384 	 * At this point we are still running on the bootstrap page
1385 	 * tables however all memory for the final page tables is
1386 	 * 'allocated' and should now be mapped.  This means we are
1387 	 * able to use the virtual addressing to enter the final
1388 	 * mappings into the new mapping tables.
1389 	 */
1390 	vstart = pmap_map_stolen(kernelstart);
1391 
1392 	void (switch_mmu_kernel)(long);
1393 	void (*switch_mmu_kernel_table)(long) =
1394 	    (void *)((long)&switch_mmu_kernel + kvo);
1395 	switch_mmu_kernel_table(pt1pa);
1396 
1397 	printf("all mapped\n");
1398 
1399 	curcpu()->ci_curpm = pmap_kernel();
1400 
1401 	id_aa64mmfr0 = READ_SPECIALREG(id_aa64mmfr0_el1);
1402 	if (ID_AA64MMFR0_ASID_BITS(id_aa64mmfr0) == ID_AA64MMFR0_ASID_BITS_16)
1403 		pmap_nasid = (1 << 16);
1404 
1405 	vmmap = vstart;
1406 	vstart += PAGE_SIZE;
1407 
1408 	return vstart;
1409 }
1410 
1411 void
1412 pmap_set_l1(struct pmap *pm, uint64_t va, struct pmapvp1 *l1_va)
1413 {
1414 	uint64_t pg_entry;
1415 	paddr_t l1_pa;
1416 	int idx0;
1417 
1418 	if (pmap_extract(pmap_kernel(), (vaddr_t)l1_va, &l1_pa) == 0)
1419 		panic("unable to find vp pa mapping %p", l1_va);
1420 
1421 	if (l1_pa & (Lx_TABLE_ALIGN-1))
1422 		panic("misaligned L2 table");
1423 
1424 	pg_entry = VP_Lx(l1_pa);
1425 
1426 	idx0 = VP_IDX0(va);
1427 	pm->pm_vp.l0->vp[idx0] = l1_va;
1428 	pm->pm_vp.l0->l0[idx0] = pg_entry;
1429 }
1430 
1431 void
1432 pmap_set_l2(struct pmap *pm, uint64_t va, struct pmapvp1 *vp1,
1433     struct pmapvp2 *l2_va)
1434 {
1435 	uint64_t pg_entry;
1436 	paddr_t l2_pa;
1437 	int idx1;
1438 
1439 	if (pmap_extract(pmap_kernel(), (vaddr_t)l2_va, &l2_pa) == 0)
1440 		panic("unable to find vp pa mapping %p", l2_va);
1441 
1442 	if (l2_pa & (Lx_TABLE_ALIGN-1))
1443 		panic("misaligned L2 table");
1444 
1445 	pg_entry = VP_Lx(l2_pa);
1446 
1447 	idx1 = VP_IDX1(va);
1448 	vp1->vp[idx1] = l2_va;
1449 	vp1->l1[idx1] = pg_entry;
1450 }
1451 
1452 void
1453 pmap_set_l3(struct pmap *pm, uint64_t va, struct pmapvp2 *vp2,
1454     struct pmapvp3 *l3_va)
1455 {
1456 	uint64_t pg_entry;
1457 	paddr_t l3_pa;
1458 	int idx2;
1459 
1460 	if (pmap_extract(pmap_kernel(), (vaddr_t)l3_va, &l3_pa) == 0)
1461 		panic("unable to find vp pa mapping %p", l3_va);
1462 
1463 	if (l3_pa & (Lx_TABLE_ALIGN-1))
1464 		panic("misaligned L2 table");
1465 
1466 	pg_entry = VP_Lx(l3_pa);
1467 
1468 	idx2 = VP_IDX2(va);
1469 	vp2->vp[idx2] = l3_va;
1470 	vp2->l2[idx2] = pg_entry;
1471 }
1472 
1473 /*
1474  * activate a pmap entry
1475  */
1476 void
1477 pmap_activate(struct proc *p)
1478 {
1479 	pmap_t pm = p->p_vmspace->vm_map.pmap;
1480 
1481 	if (p == curproc && pm != curcpu()->ci_curpm)
1482 		pmap_setttb(p);
1483 }
1484 
1485 /*
1486  * deactivate a pmap entry
1487  */
1488 void
1489 pmap_deactivate(struct proc *p)
1490 {
1491 }
1492 
1493 /*
1494  * Get the physical page address for the given pmap/virtual address.
1495  */
1496 int
1497 pmap_extract(pmap_t pm, vaddr_t va, paddr_t *pap)
1498 {
1499 	struct pte_desc *pted;
1500 
1501 	pmap_lock(pm);
1502 	pted = pmap_vp_lookup(pm, va, NULL);
1503 	if (!pted || !PTED_VALID(pted)) {
1504 		pmap_unlock(pm);
1505 		return 0;
1506 	}
1507 	if (pap != NULL)
1508 		*pap = (pted->pted_pte & PTE_RPGN) | (va & PAGE_MASK);
1509 	pmap_unlock(pm);
1510 
1511 	return 1;
1512 }
1513 
1514 void
1515 pmap_page_ro(pmap_t pm, vaddr_t va, vm_prot_t prot)
1516 {
1517 	struct pte_desc *pted;
1518 	uint64_t *pl3;
1519 
1520 	/* Every VA needs a pted, even unmanaged ones. */
1521 	pted = pmap_vp_lookup(pm, va, &pl3);
1522 	if (!pted || !PTED_VALID(pted)) {
1523 		return;
1524 	}
1525 
1526 	pted->pted_va &= ~PROT_WRITE;
1527 	pted->pted_pte &= ~PROT_WRITE;
1528 	if ((prot & PROT_READ) == 0) {
1529 		pted->pted_va &= ~PROT_READ;
1530 		pted->pted_pte &= ~PROT_READ;
1531 	}
1532 	if ((prot & PROT_EXEC) == 0) {
1533 		pted->pted_va &= ~PROT_EXEC;
1534 		pted->pted_pte &= ~PROT_EXEC;
1535 	}
1536 	pmap_pte_update(pted, pl3);
1537 	ttlb_flush(pm, pted->pted_va & ~PAGE_MASK);
1538 }
1539 
1540 #ifdef DDB
1541 void
1542 pmap_page_rw(pmap_t pm, vaddr_t va)
1543 {
1544 	struct pte_desc *pted;
1545 	uint64_t *pl3;
1546 
1547 	/* Every VA needs a pted, even unmanaged ones. */
1548 	pted = pmap_vp_lookup(pm, va, &pl3);
1549 	if (!pted || !PTED_VALID(pted)) {
1550 		return;
1551 	}
1552 
1553 	pted->pted_va |= PROT_WRITE;
1554 	pted->pted_pte |= PROT_WRITE;
1555 	pmap_pte_update(pted, pl3);
1556 	ttlb_flush(pm, pted->pted_va & ~PAGE_MASK);
1557 }
1558 #endif /* DDB */
1559 
1560 /*
1561  * Lower the protection on the specified physical page.
1562  */
1563 void
1564 pmap_page_protect(struct vm_page *pg, vm_prot_t prot)
1565 {
1566 	struct pte_desc *pted;
1567 	struct pmap *pm;
1568 
1569 	if (prot != PROT_NONE) {
1570 		mtx_enter(&pg->mdpage.pv_mtx);
1571 		LIST_FOREACH(pted, &(pg->mdpage.pv_list), pted_pv_list) {
1572 			pmap_page_ro(pted->pted_pmap, pted->pted_va, prot);
1573 		}
1574 		mtx_leave(&pg->mdpage.pv_mtx);
1575 		return;
1576 	}
1577 
1578 	mtx_enter(&pg->mdpage.pv_mtx);
1579 	while ((pted = LIST_FIRST(&(pg->mdpage.pv_list))) != NULL) {
1580 		pmap_reference(pted->pted_pmap);
1581 		pm = pted->pted_pmap;
1582 		mtx_leave(&pg->mdpage.pv_mtx);
1583 
1584 		pmap_lock(pm);
1585 
1586 		/*
1587 		 * We dropped the pvlist lock before grabbing the pmap
1588 		 * lock to avoid lock ordering problems.  This means
1589 		 * we have to check the pvlist again since somebody
1590 		 * else might have modified it.  All we care about is
1591 		 * that the pvlist entry matches the pmap we just
1592 		 * locked.  If it doesn't, unlock the pmap and try
1593 		 * again.
1594 		 */
1595 		mtx_enter(&pg->mdpage.pv_mtx);
1596 		pted = LIST_FIRST(&(pg->mdpage.pv_list));
1597 		if (pted == NULL || pted->pted_pmap != pm) {
1598 			mtx_leave(&pg->mdpage.pv_mtx);
1599 			pmap_unlock(pm);
1600 			pmap_destroy(pm);
1601 			mtx_enter(&pg->mdpage.pv_mtx);
1602 			continue;
1603 		}
1604 		mtx_leave(&pg->mdpage.pv_mtx);
1605 
1606 		pmap_remove_pted(pm, pted);
1607 		pmap_unlock(pm);
1608 		pmap_destroy(pm);
1609 
1610 		mtx_enter(&pg->mdpage.pv_mtx);
1611 	}
1612 	/* page is being reclaimed, sync icache next use */
1613 	atomic_clearbits_int(&pg->pg_flags, PG_PMAP_EXE);
1614 	mtx_leave(&pg->mdpage.pv_mtx);
1615 }
1616 
1617 void
1618 pmap_protect(pmap_t pm, vaddr_t sva, vaddr_t eva, vm_prot_t prot)
1619 {
1620 	if (prot & (PROT_READ | PROT_EXEC)) {
1621 		pmap_lock(pm);
1622 		while (sva < eva) {
1623 			pmap_page_ro(pm, sva, prot);
1624 			sva += PAGE_SIZE;
1625 		}
1626 		pmap_unlock(pm);
1627 		return;
1628 	}
1629 	pmap_remove(pm, sva, eva);
1630 }
1631 
1632 void
1633 pmap_init(void)
1634 {
1635 	uint64_t tcr;
1636 
1637 	/*
1638 	 * Now that we are in virtual address space we don't need
1639 	 * the identity mapping in TTBR0 and can set the TCR to a
1640 	 * more useful value.
1641 	 */
1642 	WRITE_SPECIALREG(ttbr0_el1, pmap_kernel()->pm_pt0pa);
1643 	__asm volatile("isb");
1644 	tcr = READ_SPECIALREG(tcr_el1);
1645 	tcr &= ~TCR_T0SZ(0x3f);
1646 	tcr |= TCR_T0SZ(64 - USER_SPACE_BITS);
1647 	tcr |= TCR_A1;
1648 	WRITE_SPECIALREG(tcr_el1, tcr);
1649 	cpu_tlb_flush();
1650 
1651 	pool_init(&pmap_pmap_pool, sizeof(struct pmap), 0, IPL_NONE, 0,
1652 	    "pmap", NULL);
1653 	pool_setlowat(&pmap_pmap_pool, 2);
1654 	pool_init(&pmap_pted_pool, sizeof(struct pte_desc), 0, IPL_VM, 0,
1655 	    "pted", NULL);
1656 	pool_setlowat(&pmap_pted_pool, 20);
1657 	pool_init(&pmap_vp_pool, sizeof(struct pmapvp0), PAGE_SIZE, IPL_VM, 0,
1658 	    "vp", &pmap_vp_allocator);
1659 	/* pool_setlowat(&pmap_vp_pool, 20); */
1660 
1661 	pmap_initialized = 1;
1662 }
1663 
1664 void
1665 pmap_proc_iflush(struct process *pr, vaddr_t va, vsize_t len)
1666 {
1667 	struct pmap *pm = vm_map_pmap(&pr->ps_vmspace->vm_map);
1668 	vaddr_t kva = zero_page + cpu_number() * PAGE_SIZE;
1669 	paddr_t pa;
1670 	vsize_t clen;
1671 	vsize_t off;
1672 
1673 	/*
1674 	 * If we're called for the current process, we can simply
1675 	 * flush the data cache to the point of unification and
1676 	 * invalidate the instruction cache.
1677 	 */
1678 	if (pr == curproc->p_p) {
1679 		cpu_icache_sync_range(va, len);
1680 		return;
1681 	}
1682 
1683 	/*
1684 	 * Flush and invalidate through an aliased mapping.  This
1685 	 * assumes the instruction cache is PIPT.  That is only true
1686 	 * for some of the hardware we run on.
1687 	 */
1688 	while (len > 0) {
1689 		/* add one to always round up to the next page */
1690 		clen = round_page(va + 1) - va;
1691 		if (clen > len)
1692 			clen = len;
1693 
1694 		off = va - trunc_page(va);
1695 		if (pmap_extract(pm, trunc_page(va), &pa)) {
1696 			pmap_kenter_pa(kva, pa, PROT_READ|PROT_WRITE);
1697 			cpu_icache_sync_range(kva + off, clen);
1698 			pmap_kremove_pg(kva);
1699 		}
1700 
1701 		len -= clen;
1702 		va += clen;
1703 	}
1704 }
1705 
1706 void
1707 pmap_icache_sync_page(struct pmap *pm, paddr_t pa)
1708 {
1709 	vaddr_t kva = zero_page + cpu_number() * PAGE_SIZE;
1710 
1711 	pmap_kenter_pa(kva, pa, PROT_READ|PROT_WRITE);
1712 	cpu_icache_sync_range(kva, PAGE_SIZE);
1713 	pmap_kremove_pg(kva);
1714 }
1715 
1716 void
1717 pmap_pte_insert(struct pte_desc *pted)
1718 {
1719 	pmap_t pm = pted->pted_pmap;
1720 	uint64_t *pl3;
1721 
1722 	if (pmap_vp_lookup(pm, pted->pted_va, &pl3) == NULL) {
1723 		panic("%s: have a pted, but missing a vp"
1724 		    " for %lx va pmap %p", __func__, pted->pted_va, pm);
1725 	}
1726 
1727 	pmap_pte_update(pted, pl3);
1728 }
1729 
1730 void
1731 pmap_pte_update(struct pte_desc *pted, uint64_t *pl3)
1732 {
1733 	uint64_t pte, access_bits;
1734 	pmap_t pm = pted->pted_pmap;
1735 	uint64_t attr = ATTR_nG;
1736 
1737 	/* see mair in locore.S */
1738 	switch (pted->pted_va & PMAP_CACHE_BITS) {
1739 	case PMAP_CACHE_WB:
1740 		/* inner and outer writeback */
1741 		attr |= ATTR_IDX(PTE_ATTR_WB);
1742 		attr |= ATTR_SH(SH_INNER);
1743 		break;
1744 	case PMAP_CACHE_WT:
1745 		/* inner and outer writethrough */
1746 		attr |= ATTR_IDX(PTE_ATTR_WT);
1747 		attr |= ATTR_SH(SH_INNER);
1748 		break;
1749 	case PMAP_CACHE_CI:
1750 		attr |= ATTR_IDX(PTE_ATTR_CI);
1751 		attr |= ATTR_SH(SH_INNER);
1752 		break;
1753 	case PMAP_CACHE_DEV_NGNRNE:
1754 		attr |= ATTR_IDX(PTE_ATTR_DEV_NGNRNE);
1755 		attr |= ATTR_SH(SH_INNER);
1756 		break;
1757 	case PMAP_CACHE_DEV_NGNRE:
1758 		attr |= ATTR_IDX(PTE_ATTR_DEV_NGNRE);
1759 		attr |= ATTR_SH(SH_INNER);
1760 		break;
1761 	default:
1762 		panic("%s: invalid cache mode", __func__);
1763 	}
1764 
1765 	if (pm->pm_privileged)
1766 		access_bits = ap_bits_kern[pted->pted_pte & PROT_MASK];
1767 	else
1768 		access_bits = ap_bits_user[pted->pted_pte & PROT_MASK];
1769 
1770 #ifndef SMALL_KERNEL
1771 	access_bits |= pm->pm_guarded;
1772 #endif
1773 
1774 	pte = (pted->pted_pte & PTE_RPGN) | attr | access_bits | L3_P;
1775 	*pl3 = access_bits ? pte : 0;
1776 }
1777 
1778 void
1779 pmap_pte_remove(struct pte_desc *pted, int remove_pted)
1780 {
1781 	struct pmapvp1 *vp1;
1782 	struct pmapvp2 *vp2;
1783 	struct pmapvp3 *vp3;
1784 	pmap_t pm = pted->pted_pmap;
1785 
1786 	if (pm->have_4_level_pt)
1787 		vp1 = pm->pm_vp.l0->vp[VP_IDX0(pted->pted_va)];
1788 	else
1789 		vp1 = pm->pm_vp.l1;
1790 	if (vp1 == NULL) {
1791 		panic("have a pted, but missing the l1 for %lx va pmap %p",
1792 		    pted->pted_va, pm);
1793 	}
1794 	vp2 = vp1->vp[VP_IDX1(pted->pted_va)];
1795 	if (vp2 == NULL) {
1796 		panic("have a pted, but missing the l2 for %lx va pmap %p",
1797 		    pted->pted_va, pm);
1798 	}
1799 	vp3 = vp2->vp[VP_IDX2(pted->pted_va)];
1800 	if (vp3 == NULL) {
1801 		panic("have a pted, but missing the l3 for %lx va pmap %p",
1802 		    pted->pted_va, pm);
1803 	}
1804 	vp3->l3[VP_IDX3(pted->pted_va)] = 0;
1805 	if (remove_pted)
1806 		vp3->vp[VP_IDX3(pted->pted_va)] = NULL;
1807 }
1808 
1809 /*
1810  * This function exists to do software referenced/modified emulation.
1811  * Its purpose is to tell the caller that a fault was generated either
1812  * for this emulation, or to tell the caller that it's a legit fault.
1813  */
1814 int
1815 pmap_fault_fixup(pmap_t pm, vaddr_t va, vm_prot_t ftype)
1816 {
1817 	struct pte_desc *pted;
1818 	struct vm_page *pg;
1819 	paddr_t pa;
1820 	uint64_t *pl3 = NULL;
1821 	int retcode = 0;
1822 
1823 	pmap_lock(pm);
1824 
1825 	/* Every VA needs a pted, even unmanaged ones. */
1826 	pted = pmap_vp_lookup(pm, va, &pl3);
1827 	if (!pted || !PTED_VALID(pted))
1828 		goto done;
1829 
1830 	/* There has to be a PA for the VA, get it. */
1831 	pa = (pted->pted_pte & PTE_RPGN);
1832 
1833 	/* If it's unmanaged, it must not fault. */
1834 	pg = PHYS_TO_VM_PAGE(pa);
1835 	if (pg == NULL)
1836 		goto done;
1837 
1838 	/*
1839 	 * Check the fault types to find out if we were doing
1840 	 * any mod/ref emulation and fixup the PTE if we were.
1841 	 */
1842 	if ((ftype & PROT_WRITE) && /* fault caused by a write */
1843 	    !(pted->pted_pte & PROT_WRITE) && /* and write is disabled now */
1844 	    (pted->pted_va & PROT_WRITE)) { /* but is supposedly allowed */
1845 
1846 		/*
1847 		 * Page modified emulation. A write always includes
1848 		 * a reference.  This means that we can enable read and
1849 		 * exec as well, akin to the page reference emulation.
1850 		 */
1851 		atomic_setbits_int(&pg->pg_flags, PG_PMAP_MOD|PG_PMAP_REF);
1852 		atomic_clearbits_int(&pg->pg_flags, PG_PMAP_EXE);
1853 
1854 		/* Thus, enable read, write and exec. */
1855 		pted->pted_pte |=
1856 		    (pted->pted_va & (PROT_READ|PROT_WRITE|PROT_EXEC));
1857 	} else if ((ftype & PROT_EXEC) && /* fault caused by an exec */
1858 	    !(pted->pted_pte & PROT_EXEC) && /* and exec is disabled now */
1859 	    (pted->pted_va & PROT_EXEC)) { /* but is supposedly allowed */
1860 
1861 		/*
1862 		 * Exec always includes a reference. Since we now know
1863 		 * the page has been accessed, we can enable read as well
1864 		 * if UVM allows it.
1865 		 */
1866 		atomic_setbits_int(&pg->pg_flags, PG_PMAP_REF);
1867 
1868 		/* Thus, enable read and exec. */
1869 		pted->pted_pte |= (pted->pted_va & (PROT_READ|PROT_EXEC));
1870 	} else if ((ftype & PROT_READ) && /* fault caused by a read */
1871 	    !(pted->pted_pte & PROT_READ) && /* and read is disabled now */
1872 	    (pted->pted_va & PROT_READ)) { /* but is supposedly allowed */
1873 
1874 		/*
1875 		 * Page referenced emulation. Since we now know the page
1876 		 * has been accessed, we can enable exec as well if UVM
1877 		 * allows it.
1878 		 */
1879 		atomic_setbits_int(&pg->pg_flags, PG_PMAP_REF);
1880 
1881 		/* Thus, enable read and exec. */
1882 		pted->pted_pte |= (pted->pted_va & (PROT_READ|PROT_EXEC));
1883 	} else {
1884 		/* didn't catch it, so probably broken */
1885 		goto done;
1886 	}
1887 
1888 	/*
1889 	 * If this is a page that can be executed, make sure to invalidate
1890 	 * the instruction cache if the page has been modified or not used
1891 	 * yet.
1892 	 */
1893 	if (pted->pted_va & PROT_EXEC) {
1894 		if ((pg->pg_flags & PG_PMAP_EXE) == 0)
1895 			pmap_icache_sync_page(pm, pa);
1896 		atomic_setbits_int(&pg->pg_flags, PG_PMAP_EXE);
1897 	}
1898 
1899 	/* We actually made a change, so flush it and sync. */
1900 	pmap_pte_update(pted, pl3);
1901 	ttlb_flush(pm, va & ~PAGE_MASK);
1902 
1903 	retcode = 1;
1904 done:
1905 	pmap_unlock(pm);
1906 	return retcode;
1907 }
1908 
1909 void
1910 pmap_postinit(void)
1911 {
1912 	extern char trampoline_vectors[];
1913 	extern char trampoline_vectors_end[];
1914 	paddr_t pa;
1915 	vaddr_t minaddr, maxaddr;
1916 	u_long npteds, npages;
1917 
1918 	memset(pmap_tramp.pm_vp.l1, 0, sizeof(struct pmapvp1));
1919 	pmap_extract(pmap_kernel(), (vaddr_t)trampoline_vectors, &pa);
1920 	minaddr = (vaddr_t)trampoline_vectors;
1921 	maxaddr = (vaddr_t)trampoline_vectors_end;
1922 	while (minaddr < maxaddr) {
1923 		pmap_enter(&pmap_tramp, minaddr, pa,
1924 		    PROT_READ | PROT_EXEC, PROT_READ | PROT_EXEC | PMAP_WIRED);
1925 		minaddr += PAGE_SIZE;
1926 		pa += PAGE_SIZE;
1927 	}
1928 
1929 	/*
1930 	 * Reserve enough virtual address space to grow the kernel
1931 	 * page tables.  We need a descriptor for each page as well as
1932 	 * an extra page for level 1/2/3 page tables for management.
1933 	 * To simplify the code, we always allocate full tables at
1934 	 * level 3, so take that into account.
1935 	 */
1936 	npteds = (VM_MAX_KERNEL_ADDRESS - pmap_maxkvaddr + 1) / PAGE_SIZE;
1937 	npteds = roundup(npteds, VP_IDX3_CNT);
1938 	npages = howmany(npteds, PAGE_SIZE / (sizeof(struct pte_desc)));
1939 	npages += 2 * howmany(npteds, VP_IDX3_CNT);
1940 	npages += 2 * howmany(npteds, VP_IDX3_CNT * VP_IDX2_CNT);
1941 	npages += 2 * howmany(npteds, VP_IDX3_CNT * VP_IDX2_CNT * VP_IDX1_CNT);
1942 
1943 	/*
1944 	 * Use an interrupt safe map such that we don't recurse into
1945 	 * uvm_map() to allocate map entries.
1946 	 */
1947 	minaddr = vm_map_min(kernel_map);
1948 	pmap_kvp_map = uvm_km_suballoc(kernel_map, &minaddr, &maxaddr,
1949 	    npages * PAGE_SIZE, VM_MAP_INTRSAFE, FALSE, NULL);
1950 }
1951 
1952 void
1953 pmap_init_percpu(void)
1954 {
1955 	pool_cache_init(&pmap_pted_pool);
1956 	pool_cache_init(&pmap_vp_pool);
1957 }
1958 
1959 void
1960 pmap_update(pmap_t pm)
1961 {
1962 }
1963 
1964 int
1965 pmap_is_referenced(struct vm_page *pg)
1966 {
1967 	return ((pg->pg_flags & PG_PMAP_REF) != 0);
1968 }
1969 
1970 int
1971 pmap_is_modified(struct vm_page *pg)
1972 {
1973 	return ((pg->pg_flags & PG_PMAP_MOD) != 0);
1974 }
1975 
1976 int
1977 pmap_clear_modify(struct vm_page *pg)
1978 {
1979 	struct pte_desc *pted;
1980 
1981 	atomic_clearbits_int(&pg->pg_flags, PG_PMAP_MOD);
1982 
1983 	mtx_enter(&pg->mdpage.pv_mtx);
1984 	LIST_FOREACH(pted, &(pg->mdpage.pv_list), pted_pv_list) {
1985 		pted->pted_pte &= ~PROT_WRITE;
1986 		pmap_pte_insert(pted);
1987 		ttlb_flush(pted->pted_pmap, pted->pted_va & ~PAGE_MASK);
1988 	}
1989 	mtx_leave(&pg->mdpage.pv_mtx);
1990 
1991 	return 0;
1992 }
1993 
1994 /*
1995  * When this turns off read permissions it also disables write permissions
1996  * so that mod is correctly tracked after clear_ref; FAULT_READ; FAULT_WRITE;
1997  */
1998 int
1999 pmap_clear_reference(struct vm_page *pg)
2000 {
2001 	struct pte_desc *pted;
2002 
2003 	atomic_clearbits_int(&pg->pg_flags, PG_PMAP_REF);
2004 
2005 	mtx_enter(&pg->mdpage.pv_mtx);
2006 	LIST_FOREACH(pted, &(pg->mdpage.pv_list), pted_pv_list) {
2007 		pted->pted_pte &= ~PROT_MASK;
2008 		pmap_pte_insert(pted);
2009 		ttlb_flush(pted->pted_pmap, pted->pted_va & ~PAGE_MASK);
2010 	}
2011 	mtx_leave(&pg->mdpage.pv_mtx);
2012 
2013 	return 0;
2014 }
2015 
2016 void
2017 pmap_unwire(pmap_t pm, vaddr_t va)
2018 {
2019 	struct pte_desc *pted;
2020 
2021 	pmap_lock(pm);
2022 	pted = pmap_vp_lookup(pm, va, NULL);
2023 	if (pted != NULL && PTED_WIRED(pted)) {
2024 		pm->pm_stats.wired_count--;
2025 		pted->pted_va &= ~PTED_VA_WIRED_M;
2026 	}
2027 	pmap_unlock(pm);
2028 }
2029 
2030 void
2031 pmap_remove_holes(struct vmspace *vm)
2032 {
2033 	/* NOOP */
2034 }
2035 
2036 void
2037 pmap_virtual_space(vaddr_t *start, vaddr_t *end)
2038 {
2039 	*start = virtual_avail;
2040 	*end = VM_MAX_KERNEL_ADDRESS;
2041 
2042 	/* Prevent further KVA stealing. */
2043 	pmap_virtual_space_called = 1;
2044 }
2045 
2046 void
2047 pmap_setup_avail(uint64_t ram_start, uint64_t ram_end, uint64_t kvo)
2048 {
2049 	/*
2050 	 * This makes several assumptions:
2051 	 * 1) the kernel will be located 'low' in memory
2052 	 * 2) memory will not start at VM_MIN_KERNEL_ADDRESS
2053 	 * 3) several MB of memory starting just after the kernel will
2054 	 *    be premapped at the kernel address in the bootstrap mappings
2055 	 * 4) kvo is the 64-bit offset to subtract from a ram (physical)
2056 	 *    address to obtain its kernel virtual address; KVO == PA - VA
2057 	 * 5) these translations are expected to be page aligned at the
2058 	 *    very least, and typically 'section' sized or larger.
2059 	 */
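	/*
	 * In other words, a physical address pa stolen from these regions
	 * is reachable at virtual address pa - kvo; see pmap_steal_avail()
	 * and pmap_map_stolen() below.
	 */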
2060 
2061 	pmap_avail_kvo = kvo;
2062 	pmap_avail[0].start = ram_start;
2063 	pmap_avail[0].size = ram_end-ram_start;
2064 
2065 	/* XXX - multiple sections */
2066 	physmem = atop(pmap_avail[0].size);
2067 
2068 	pmap_cnt_avail = 1;
2069 
2070 	pmap_avail_fixup();
2071 }
2072 
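/*
 * Trim every pmap_avail entry to whole pages: round a partial page at
 * the start of a region up and a partial page at the end down, removing
 * the trimmed fragments from the available list.
 */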
2073 void
2074 pmap_avail_fixup(void)
2075 {
2076 	struct mem_region *mp;
2077 	vaddr_t align;
2078 	vaddr_t end;
2079 
2080 	mp = pmap_avail;
2081 	while (mp->size != 0) {
2082 		align = round_page(mp->start);
2083 		if (mp->start != align) {
2084 			pmap_remove_avail(mp->start, align);
2085 			mp = pmap_avail;
2086 			continue;
2087 		}
2088 		end = mp->start+mp->size;
2089 		align = trunc_page(end);
2090 		if (end != align) {
2091 			pmap_remove_avail(align, end);
2092 			mp = pmap_avail;
2093 			continue;
2094 		}
2095 		mp++;
2096 	}
2097 }
2098 
2099 /* remove a given region from avail memory */
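/*
 * Depending on the overlap this trims the front or back of an entry or
 * splits it in two.  The removed range is also merged into the
 * pmap_allocated list so that pmap_map_stolen() can map it later.
 */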
2100 void
2101 pmap_remove_avail(paddr_t base, paddr_t end)
2102 {
2103 	struct mem_region *mp;
2104 	int i;
2105 	long mpend;
2106 
2107 	/* remove given region from available */
2108 	for (mp = pmap_avail; mp->size; mp++) {
2109 		/*
2110 		 * Check how the range being removed overlaps this region.
2111 		 */
2112 		mpend = mp->start + mp->size;
2113 		if (base > mpend) {
2114 			continue;
2115 		}
2116 		if (base <= mp->start) {
2117 			if (end <= mp->start)
2118 				break; /* range to remove lies below this region */
2119 
2120 			if (end >= mpend) {
2121 				/* covers the whole region: remove it */
2122 				/* and shift later entries down */
2123 				for (i = mp - pmap_avail;
2124 				    i < pmap_cnt_avail;
2125 				    i++) {
2126 					pmap_avail[i] = pmap_avail[i+1];
2127 				}
2128 				pmap_cnt_avail--;
2129 				pmap_avail[pmap_cnt_avail].size = 0;
2130 			} else {
2131 				mp->start = end;
2132 				mp->size = mpend - end;
2133 			}
2134 		} else {
2135 			/* start after the beginning */
2136 			if (end >= mpend) {
2137 				/* just truncate */
2138 				mp->size = base - mp->start;
2139 			} else {
2140 				/* split */
2141 				for (i = pmap_cnt_avail;
2142 				    i > (mp - pmap_avail);
2143 				    i--) {
2144 					pmap_avail[i] = pmap_avail[i - 1];
2145 				}
2146 				pmap_cnt_avail++;
2147 				mp->size = base - mp->start;
2148 				mp++;
2149 				mp->start = end;
2150 				mp->size = mpend - end;
2151 			}
2152 		}
2153 	}
2154 	for (mp = pmap_allocated; mp->size != 0; mp++) {
2155 		if (base < mp->start) {
2156 			if (end == mp->start) {
2157 				mp->start = base;
2158 				mp->size += end - base;
2159 				break;
2160 			}
2161 			/* lengthen */
2162 			for (i = pmap_cnt_allocated; i > (mp - pmap_allocated);
2163 			    i--) {
2164 				pmap_allocated[i] = pmap_allocated[i - 1];
2165 			}
2166 			pmap_cnt_allocated++;
2167 			mp->start = base;
2168 			mp->size = end - base;
2169 			return;
2170 		}
2171 		if (base == (mp->start + mp->size)) {
2172 			mp->size += end - base;
2173 			return;
2174 		}
2175 	}
2176 	if (mp->size == 0) {
2177 		mp->start = base;
2178 		mp->size  = end - base;
2179 		pmap_cnt_allocated++;
2180 	}
2181 }
2182 
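/*
 * Bootstrap allocator: steal physical memory from the available regions
 * before uvm is up.  The stolen memory is zeroed; its physical address
 * is returned and, if kva is not NULL, its bootstrap virtual address is
 * stored in *kva.
 */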
2183 /* XXX - this zeros pages via their physical address */
2184 paddr_t
2185 pmap_steal_avail(size_t size, int align, void **kva)
2186 {
2187 	struct mem_region *mp;
2188 	long start;
2189 	long remsize;
2190 
2191 	for (mp = pmap_avail; mp->size; mp++) {
2192 		if (mp->size > size) {
2193 			start = (mp->start + (align - 1)) & ~(align - 1);
2194 			remsize = mp->size - (start - mp->start);
2195 			if (remsize >= size) {
2196 				pmap_remove_avail(start, start + size);
2197 				if (kva != NULL) {
2198 					*kva = (void *)(start - pmap_avail_kvo);
2199 				}
2200 				bzero((void *)start, size);
2201 				return start;
2202 			}
2203 		}
2204 	}
2205 	panic("unable to allocate region with size %lx align %x",
2206 	    size, align);
2207 }
2208 
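/*
 * Enter kernel mappings for the memory stolen during bootstrap (the
 * pmap_allocated regions): kernel text is mapped read/execute, rodata
 * read-only and everything else read/write.
 */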
2209 vaddr_t
2210 pmap_map_stolen(vaddr_t kernel_start)
2211 {
2212 	struct mem_region *mp;
2213 	paddr_t pa;
2214 	vaddr_t va;
2215 	uint64_t e;
2216 
2217 	for (mp = pmap_allocated; mp->size; mp++) {
2218 		for (e = 0; e < mp->size; e += PAGE_SIZE) {
2219 			int prot = PROT_READ | PROT_WRITE;
2220 
2221 			pa = mp->start + e;
2222 			va = pa - pmap_avail_kvo;
2223 
2224 			if (va < VM_MIN_KERNEL_ADDRESS ||
2225 			    va >= VM_MAX_KERNEL_ADDRESS)
2226 				continue;
2227 
2228 			if (va >= (vaddr_t)__text_start &&
2229 			    va < (vaddr_t)_etext)
2230 				prot = PROT_READ | PROT_EXEC;
2231 			else if (va >= (vaddr_t)__rodata_start &&
2232 			    va < (vaddr_t)_erodata)
2233 				prot = PROT_READ;
2234 
2235 			pmap_kenter_cache(va, pa, prot, PMAP_CACHE_WB);
2236 		}
2237 	}
2238 
2239 	return va + PAGE_SIZE;
2240 }
2241 
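/*
 * Hand the remaining available memory regions to uvm, trimming them to
 * page boundaries and skipping regions smaller than a page.
 */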
2242 void
2243 pmap_physload_avail(void)
2244 {
2245 	struct mem_region *mp;
2246 	uint64_t start, end;
2247 
2248 	for (mp = pmap_avail; mp->size; mp++) {
2249 		if (mp->size < PAGE_SIZE) {
2250 			printf(" skipped - too small\n");
2251 			continue;
2252 		}
2253 		start = mp->start;
2254 		if (start & PAGE_MASK) {
2255 			start = PAGE_SIZE + (start & PMAP_PA_MASK);
2256 		}
2257 		end = mp->start + mp->size;
2258 		if (end & PAGE_MASK) {
2259 			end = (end & PMAP_PA_MASK);
2260 		}
2261 		uvm_page_physload(atop(start), atop(end),
2262 		    atop(start), atop(end), 0);
2263 
2264 	}
2265 }
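/*
 * Debug helper: walk the page table hierarchy for the given virtual
 * address and print the table pointers and descriptors found at each
 * level, along with ttbr0_el1 and tcr_el1.
 */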
2266 
2267 void
2268 pmap_show_mapping(uint64_t va)
2269 {
2270 	struct pmapvp1 *vp1;
2271 	struct pmapvp2 *vp2;
2272 	struct pmapvp3 *vp3;
2273 	struct pte_desc *pted;
2274 	struct pmap *pm;
2275 	uint64_t ttbr0, tcr;
2276 
2277 	printf("showing mapping of %llx\n", va);
2278 
2279 	if (va & 1ULL << 63)
2280 		pm = pmap_kernel();
2281 	else
2282 		pm = curproc->p_vmspace->vm_map.pmap;
2283 
2284 	if (pm->have_4_level_pt) {
2285 		printf("  vp0 = %p off %x\n", pm->pm_vp.l0, VP_IDX0(va) * 8);
2286 		vp1 = pm->pm_vp.l0->vp[VP_IDX0(va)];
2287 		if (vp1 == NULL)
2288 			return;
2289 	} else {
2290 		vp1 = pm->pm_vp.l1;
2291 	}
2292 
2293 	__asm volatile ("mrs     %x0, ttbr0_el1" : "=r"(ttbr0));
2294 	__asm volatile ("mrs     %x0, tcr_el1" : "=r"(tcr));
2295 	printf("  ttbr0 %llx %llx tcr %llx\n", ttbr0, pm->pm_pt0pa, tcr);
2296 	printf("  vp1 = %p\n", vp1);
2297 
2298 	vp2 = vp1->vp[VP_IDX1(va)];
2299 	printf("  vp2 = %p lp2 = %llx idx1 off %x\n",
2300 	    vp2, vp1->l1[VP_IDX1(va)], VP_IDX1(va) * 8);
2301 	if (vp2 == NULL)
2302 		return;
2303 
2304 	vp3 = vp2->vp[VP_IDX2(va)];
2305 	printf("  vp3 = %p lp3 = %llx idx2 off %x\n",
2306 	    vp3, vp2->l2[VP_IDX2(va)], VP_IDX2(va) * 8);
2307 	if (vp3 == NULL)
2308 		return;
2309 
2310 	pted = vp3->vp[VP_IDX3(va)];
2311 	printf("  pted = %p lp3 = %llx idx3 off %x\n",
2312 	    pted, vp3->l3[VP_IDX3(va)], VP_IDX3(va) * 8);
2313 }
2314 
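/*
 * Load the per-pmap pointer authentication keys into the EL1 key
 * registers: the IA/DA/IB/DB keys if address authentication is
 * implemented, and the GA key if generic authentication is implemented.
 */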
2315 __attribute__((target("+pauth")))
2316 void
2317 pmap_setpauthkeys(struct pmap *pm)
2318 {
2319 	if (ID_AA64ISAR1_APA(cpu_id_aa64isar1) >= ID_AA64ISAR1_APA_PAC ||
2320 	    ID_AA64ISAR1_API(cpu_id_aa64isar1) >= ID_AA64ISAR1_API_PAC) {
2321 		__asm volatile ("msr apiakeylo_el1, %0"
2322 		    :: "r"(pm->pm_apiakey[0]));
2323 		__asm volatile ("msr apiakeyhi_el1, %0"
2324 		    :: "r"(pm->pm_apiakey[1]));
2325 		__asm volatile ("msr apdakeylo_el1, %0"
2326 		    :: "r"(pm->pm_apdakey[0]));
2327 		__asm volatile ("msr apdakeyhi_el1, %0"
2328 		    :: "r"(pm->pm_apdakey[1]));
2329 		__asm volatile ("msr apibkeylo_el1, %0"
2330 		    :: "r"(pm->pm_apibkey[0]));
2331 		__asm volatile ("msr apibkeyhi_el1, %0"
2332 		    :: "r"(pm->pm_apibkey[1]));
2333 		__asm volatile ("msr apdbkeylo_el1, %0"
2334 		    :: "r"(pm->pm_apdbkey[0]));
2335 		__asm volatile ("msr apdbkeyhi_el1, %0"
2336 		    :: "r"(pm->pm_apdbkey[1]));
2337 	}
2338 
2339 	if (ID_AA64ISAR1_GPA(cpu_id_aa64isar1) >= ID_AA64ISAR1_GPA_IMPL ||
2340 	    ID_AA64ISAR1_GPI(cpu_id_aa64isar1) >= ID_AA64ISAR1_GPI_IMPL) {
2341 		__asm volatile ("msr apgakeylo_el1, %0"
2342 		    :: "r"(pm->pm_apgakey[0]));
2343 		__asm volatile ("msr apgakeyhi_el1, %0"
2344 		    :: "r"(pm->pm_apgakey[1]));
2345 	}
2346 }
2347 
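/*
 * Make the given process' pmap the active user pmap on this CPU:
 * refresh its ASID if the generation is stale, load its pointer
 * authentication keys, then install the new ASID and translation table
 * base with cpu_setttb() and flush the branch predictor.
 */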
2348 void
2349 pmap_setttb(struct proc *p)
2350 {
2351 	struct cpu_info *ci = curcpu();
2352 	pmap_t pm = p->p_vmspace->vm_map.pmap;
2353 
2354 	/*
2355 	 * If the generation of the ASID for the new pmap doesn't
2356 	 * match the current generation, allocate a new ASID.
2357 	 */
2358 	if (pm != pmap_kernel() &&
2359 	    (pm->pm_asid & ~PMAP_ASID_MASK) != READ_ONCE(pmap_asid_gen))
2360 		pmap_allocate_asid(pm);
2361 
2362 	if (pm != pmap_kernel())
2363 		pmap_setpauthkeys(pm);
2364 
2365 	WRITE_SPECIALREG(ttbr0_el1, pmap_kernel()->pm_pt0pa);
2366 	__asm volatile("isb");
2367 	cpu_setttb(pm->pm_asid, pm->pm_pt0pa);
2368 	ci->ci_curpm = pm;
2369 	ci->ci_flush_bp();
2370 }
2371