/*-
 * Copyright (c) 1991 Regents of the University of California.
 * All rights reserved.
 * Copyright (c) 1994 John S. Dyson
 * All rights reserved.
 * Copyright (c) 1994 David Greenman
 * All rights reserved.
 * Copyright (c) 2003 Peter Wemm
 * All rights reserved.
 * Copyright (c) 2005-2010 Alan L. Cox <alc@cs.rice.edu>
 * All rights reserved.
 * Copyright (c) 2014 Andrew Turner
 * All rights reserved.
 * Copyright (c) 2014-2016 The FreeBSD Foundation
 * All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * the Systems Programming Group of the University of Utah Computer
 * Science Department and William Jolitz of UUNET Technologies Inc.
 *
 * This software was developed by Andrew Turner under sponsorship from
 * the FreeBSD Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
/*-
 * Copyright (c) 2003 Networks Associates Technology, Inc.
 * All rights reserved.
 *
 * This software was developed for the FreeBSD Project by Jake Burkholder,
 * Safeport Network Services, and Network Associates Laboratories, the
 * Security Research Division of Network Associates, Inc. under
 * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA
 * CHATS research program.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
/*
 * Manages physical address maps.
 *
 * Since the information managed by this module is
 * also stored by the logical address mapping module,
 * this module may throw away valid virtual-to-physical
 * mappings at almost any time.  However, invalidations
 * of virtual-to-physical mappings must be done as
 * requested.
 *
 * In order to cope with hardware architectures which
 * make virtual-to-physical map invalidates expensive,
 * this module may delay invalidate or reduced protection
 * operations until such time as they are actually
 * necessary.  This module is given full information as
 * to which processors are currently using which maps,
 * and to when physical maps must be made correct.
 */

#include "opt_vm.h"

#include <sys/param.h>
#include <sys/asan.h>
#include <sys/bitstring.h>
#include <sys/bus.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mman.h>
#include <sys/msan.h>
#include <sys/msgbuf.h>
#include <sys/mutex.h>
#include <sys/physmem.h>
#include <sys/proc.h>
#include <sys/rangeset.h>
#include <sys/rwlock.h>
#include <sys/sbuf.h>
#include <sys/sx.h>
#include <sys/vmem.h>
#include <sys/vmmeter.h>
#include <sys/sched.h>
#include <sys/sysctl.h>
#include <sys/_unrhdr.h>
#include <sys/smp.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_kern.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_extern.h>
#include <vm/vm_pageout.h>
#include <vm/vm_pager.h>
#include <vm/vm_phys.h>
#include <vm/vm_radix.h>
#include <vm/vm_reserv.h>
#include <vm/vm_dumpset.h>
#include <vm/uma.h>

#include <machine/asan.h>
#include <machine/cpu_feat.h>
#include <machine/machdep.h>
#include <machine/md_var.h>
#include <machine/pcb.h>

#ifdef NUMA
#define	PMAP_MEMDOM	MAXMEMDOM
#else
#define	PMAP_MEMDOM	1
#endif

#define	PMAP_ASSERT_STAGE1(pmap)	MPASS((pmap)->pm_stage == PM_STAGE1)
#define	PMAP_ASSERT_STAGE2(pmap)	MPASS((pmap)->pm_stage == PM_STAGE2)

#define	NL0PG		(PAGE_SIZE/(sizeof (pd_entry_t)))
#define	NL1PG		(PAGE_SIZE/(sizeof (pd_entry_t)))
#define	NL2PG		(PAGE_SIZE/(sizeof (pd_entry_t)))
#define	NL3PG		(PAGE_SIZE/(sizeof (pt_entry_t)))

#define	NUL0E		L0_ENTRIES
#define	NUL1E		(NUL0E * NL1PG)
#define	NUL2E		(NUL1E * NL2PG)

#ifdef PV_STATS
#define	PV_STAT(x)	do { x ; } while (0)
#define	__pvused
#else
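/* Without PV_STATS, the statistics updates compile away entirely. */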
#define	PV_STAT(x)	do { } while (0)
#define	__pvused	__unused
#endif

#define	pmap_l0_pindex(v)	(NUL2E + NUL1E + ((v) >> L0_SHIFT))
#define	pmap_l1_pindex(v)	(NUL2E + ((v) >> L1_SHIFT))
#define	pmap_l2_pindex(v)	((v) >> L2_SHIFT)

#ifdef __ARM_FEATURE_BTI_DEFAULT
#define	ATTR_KERN_GP		ATTR_S1_GP
#else
#define	ATTR_KERN_GP		0
#endif
#define	PMAP_SAN_PTE_BITS	(ATTR_AF | ATTR_S1_XN | pmap_sh_attr |	\
    ATTR_KERN_GP | ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK) | ATTR_S1_AP(ATTR_S1_AP_RW))

struct pmap_large_md_page {
	struct rwlock   pv_lock;
	struct md_page  pv_page;
	/* Pad to a power of 2, see pmap_init_pv_table(). */
	int		pv_pad[2];
};

__exclusive_cache_line static struct pmap_large_md_page pv_dummy_large;
#define pv_dummy pv_dummy_large.pv_page
__read_mostly static struct pmap_large_md_page *pv_table;

static struct pmap_large_md_page *
_pa_to_pmdp(vm_paddr_t pa)
{
	struct vm_phys_seg *seg;

	if ((seg = vm_phys_paddr_to_seg(pa)) != NULL)
		return ((struct pmap_large_md_page *)seg->md_first +
		    pmap_l2_pindex(pa) - pmap_l2_pindex(seg->start));
	return (NULL);
}

static struct pmap_large_md_page *
pa_to_pmdp(vm_paddr_t pa)
{
	struct pmap_large_md_page *pvd;

	pvd = _pa_to_pmdp(pa);
	if (pvd == NULL)
		panic("pa 0x%jx not within vm_phys_segs", (uintmax_t)pa);
	return (pvd);
}

static struct pmap_large_md_page *
page_to_pmdp(vm_page_t m)
{
	struct vm_phys_seg *seg;

	seg = &vm_phys_segs[m->segind];
	return ((struct pmap_large_md_page *)seg->md_first +
	    pmap_l2_pindex(VM_PAGE_TO_PHYS(m)) - pmap_l2_pindex(seg->start));
}

#define	pa_to_pvh(pa)	(&(pa_to_pmdp(pa)->pv_page))
#define	page_to_pvh(m)	(&(page_to_pmdp(m)->pv_page))

#define	PHYS_TO_PV_LIST_LOCK(pa)	({			\
	struct pmap_large_md_page *_pvd;			\
	struct rwlock *_lock;					\
	_pvd = _pa_to_pmdp(pa);					\
	if (__predict_false(_pvd == NULL))			\
		_lock = &pv_dummy_large.pv_lock;		\
	else							\
		_lock = &(_pvd->pv_lock);			\
	_lock;							\
})

static struct rwlock *
VM_PAGE_TO_PV_LIST_LOCK(vm_page_t m)
{
	if ((m->flags & PG_FICTITIOUS) == 0)
		return (&page_to_pmdp(m)->pv_lock);
	else
		return (&pv_dummy_large.pv_lock);
}

#define	CHANGE_PV_LIST_LOCK(lockp, new_lock)	do {	\
	struct rwlock **_lockp = (lockp);		\
	struct rwlock *_new_lock = (new_lock);		\
							\
	if (_new_lock != *_lockp) {			\
		if (*_lockp != NULL)			\
			rw_wunlock(*_lockp);		\
		*_lockp = _new_lock;			\
		rw_wlock(*_lockp);			\
	}						\
} while (0)

#define	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa)		\
			CHANGE_PV_LIST_LOCK(lockp, PHYS_TO_PV_LIST_LOCK(pa))

#define	CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m)	\
			CHANGE_PV_LIST_LOCK(lockp, VM_PAGE_TO_PV_LIST_LOCK(m))

#define	RELEASE_PV_LIST_LOCK(lockp)		do {	\
	struct rwlock **_lockp = (lockp);		\
							\
	if (*_lockp != NULL) {				\
		rw_wunlock(*_lockp);			\
		*_lockp = NULL;				\
	}						\
} while (0)

#define	PTE_TO_VM_PAGE(pte)	PHYS_TO_VM_PAGE(PTE_TO_PHYS(pte))
#define	VM_PAGE_TO_PTE(m)	PHYS_TO_PTE(VM_PAGE_TO_PHYS(m))

/*
 * The presence of this flag indicates that the mapping is writeable.
 * If the ATTR_S1_AP_RO bit is also set, then the mapping is clean, otherwise
 * it is dirty.  This flag may only be set on managed mappings.
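 *
 * For a managed, writeable stage 1 mapping the states are therefore:
 *	ATTR_SW_DBM set, ATTR_S1_AP_RO set	-> clean
 *	ATTR_SW_DBM set, ATTR_S1_AP_RO clear	-> dirty
 * pmap_pte_dirty() below relies on exactly this encoding.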
 *
 * The DBM bit is reserved on ARMv8.0 but it seems we can safely treat it
 * as a software managed bit.
 */
#define	ATTR_SW_DBM	ATTR_DBM

struct pmap kernel_pmap_store;

/* Used for mapping ACPI memory before VM is initialized */
#define	PMAP_PREINIT_MAPPING_COUNT	32
#define	PMAP_PREINIT_MAPPING_SIZE	(PMAP_PREINIT_MAPPING_COUNT * L2_SIZE)
static vm_offset_t preinit_map_va;	/* Start VA of pre-init mapping space */
static int vm_initialized = 0;		/* No need to use pre-init maps when set */

/*
 * Reserve a few L2 blocks starting from 'preinit_map_va' pointer.
 * Always map entire L2 block for simplicity.
 * VA of L2 block = preinit_map_va + i * L2_SIZE
 */
static struct pmap_preinit_mapping {
	vm_paddr_t	pa;
	vm_offset_t	va;
	vm_size_t	size;
} pmap_preinit_mapping[PMAP_PREINIT_MAPPING_COUNT];

vm_offset_t virtual_avail;	/* VA of first avail page (after kernel bss) */
vm_offset_t virtual_end;	/* VA of last avail page (end of kernel AS) */
vm_offset_t kernel_vm_end = 0;

/*
 * Data for the pv entry allocation mechanism.
 */
#ifdef NUMA
static __inline int
pc_to_domain(struct pv_chunk *pc)
{
	return (vm_phys_domain(DMAP_TO_PHYS((vm_offset_t)pc)));
}
#else
static __inline int
pc_to_domain(struct pv_chunk *pc __unused)
{
	return (0);
}
#endif

struct pv_chunks_list {
	struct mtx pvc_lock;
	TAILQ_HEAD(pch, pv_chunk) pvc_list;
	int active_reclaims;
} __aligned(CACHE_LINE_SIZE);

struct pv_chunks_list __exclusive_cache_line pv_chunks[PMAP_MEMDOM];

vm_paddr_t dmap_phys_base;	/* The start of the dmap region */
vm_paddr_t dmap_phys_max;	/* The limit of the dmap region */
vm_offset_t dmap_max_addr;	/* The virtual address limit of the dmap */

extern pt_entry_t pagetable_l0_ttbr1[];

#define	PHYSMAP_SIZE	(2 * (VM_PHYSSEG_MAX - 1))
static vm_paddr_t physmap[PHYSMAP_SIZE];
static u_int physmap_idx;

static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
    "VM/pmap parameters");

bool pmap_lpa_enabled __read_mostly = false;
pt_entry_t pmap_sh_attr __read_mostly = ATTR_SH(ATTR_SH_IS);

#if PAGE_SIZE == PAGE_SIZE_4K
#define	L1_BLOCKS_SUPPORTED	1
#else
#define	L1_BLOCKS_SUPPORTED	(pmap_lpa_enabled)
#endif

#define	PMAP_ASSERT_L1_BLOCKS_SUPPORTED	MPASS(L1_BLOCKS_SUPPORTED)

static bool pmap_l1_supported __read_mostly = false;

/*
 * This ASID allocator uses a bit vector ("asid_set") to remember which ASIDs
 * it has currently allocated to a pmap, a cursor ("asid_next") to
 * optimize its search for a free ASID in the bit vector, and an epoch number
 * ("asid_epoch") to indicate when it has reclaimed all previously allocated
 * ASIDs that are not currently active on a processor.
 *
 * The current epoch number is always in the range [0, INT_MAX).  Negative
 * numbers and INT_MAX are reserved for special cases that are described
 * below.
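 *
 * When the search wraps without finding a free ASID, the allocator bumps
 * the epoch, invalidates the TLB, and reclaims every ASID that is not
 * active on some CPU; see pmap_reset_asid_set().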
 */
struct asid_set {
	int asid_bits;
	bitstr_t *asid_set;
	int asid_set_size;
	int asid_next;
	int asid_epoch;
	struct mtx asid_set_mutex;
};

static struct asid_set asids;
static struct asid_set vmids;

static SYSCTL_NODE(_vm_pmap, OID_AUTO, asid, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
    "ASID allocator");
SYSCTL_INT(_vm_pmap_asid, OID_AUTO, bits, CTLFLAG_RD, &asids.asid_bits, 0,
    "The number of bits in an ASID");
SYSCTL_INT(_vm_pmap_asid, OID_AUTO, next, CTLFLAG_RD, &asids.asid_next, 0,
    "The last allocated ASID plus one");
SYSCTL_INT(_vm_pmap_asid, OID_AUTO, epoch, CTLFLAG_RD, &asids.asid_epoch, 0,
    "The current epoch number");

static SYSCTL_NODE(_vm_pmap, OID_AUTO, vmid, CTLFLAG_RD, 0, "VMID allocator");
SYSCTL_INT(_vm_pmap_vmid, OID_AUTO, bits, CTLFLAG_RD, &vmids.asid_bits, 0,
    "The number of bits in a VMID");
SYSCTL_INT(_vm_pmap_vmid, OID_AUTO, next, CTLFLAG_RD, &vmids.asid_next, 0,
    "The last allocated VMID plus one");
SYSCTL_INT(_vm_pmap_vmid, OID_AUTO, epoch, CTLFLAG_RD, &vmids.asid_epoch, 0,
    "The current epoch number");

void (*pmap_clean_stage2_tlbi)(void);
void (*pmap_stage2_invalidate_range)(uint64_t, vm_offset_t, vm_offset_t, bool);
void (*pmap_stage2_invalidate_all)(uint64_t);

/*
 * A pmap's cookie encodes an ASID and epoch number.  Cookies for reserved
 * ASIDs have a negative epoch number, specifically, INT_MIN.  Cookies for
 * dynamically allocated ASIDs have a non-negative epoch number.
 *
 * An invalid ASID is represented by -1.
 *
 * There are two special-case cookie values: (1) COOKIE_FROM(-1, INT_MIN),
 * which indicates that an ASID should never be allocated to the pmap, and
 * (2) COOKIE_FROM(-1, INT_MAX), which indicates that an ASID should be
 * allocated when the pmap is next activated.
 */
#define	COOKIE_FROM(asid, epoch)	((long)((u_int)(asid) |	\
					    ((u_long)(epoch) << 32)))
#define	COOKIE_TO_ASID(cookie)		((int)(cookie))
#define	COOKIE_TO_EPOCH(cookie)		((int)((u_long)(cookie) >> 32))

#define	TLBI_VA_SHIFT			12
#define	TLBI_VA_MASK			((1ul << 44) - 1)
#define	TLBI_VA(addr)			(((addr) >> TLBI_VA_SHIFT) & TLBI_VA_MASK)

static int __read_frequently superpages_enabled = 1;
SYSCTL_INT(_vm_pmap, OID_AUTO, superpages_enabled,
    CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &superpages_enabled, 0,
    "Are large page mappings enabled?");

/*
 * True when Branch Target Identification should be used by userspace.  This
 * allows pmap to mark pages as guarded with ATTR_S1_GP.
 */
__read_mostly static bool pmap_bti_support = false;

/*
 * Internal flags for pmap_enter()'s helper functions.
 */
#define	PMAP_ENTER_NORECLAIM	0x1000000	/* Don't reclaim PV entries. */
#define	PMAP_ENTER_NOREPLACE	0x2000000	/* Don't replace mappings. */
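/* Both values sit above the MI PMAP_ENTER_* flag bits defined in vm/pmap.h. */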

TAILQ_HEAD(pv_chunklist, pv_chunk);

static void	free_pv_chunk(struct pv_chunk *pc);
static void	free_pv_chunk_batch(struct pv_chunklist *batch);
static void	free_pv_entry(pmap_t pmap, pv_entry_t pv);
static pv_entry_t get_pv_entry(pmap_t pmap, struct rwlock **lockp);
static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp);
static void	pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va);
static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap,
		    vm_offset_t va);

static void pmap_abort_ptp(pmap_t pmap, vm_offset_t va, vm_page_t mpte);
static bool pmap_activate_int(pmap_t pmap);
static void pmap_alloc_asid(pmap_t pmap);
static int pmap_change_props_locked(vm_offset_t va, vm_size_t size,
    vm_prot_t prot, int mode, bool skip_unmapped);
static bool pmap_copy_l3c(pmap_t pmap, pt_entry_t *l3p, vm_offset_t va,
    pt_entry_t l3e, vm_page_t ml3, struct rwlock **lockp);
static pt_entry_t *pmap_demote_l1(pmap_t pmap, pt_entry_t *l1, vm_offset_t va);
static pt_entry_t *pmap_demote_l2_locked(pmap_t pmap, pt_entry_t *l2,
    vm_offset_t va, struct rwlock **lockp);
static pt_entry_t *pmap_demote_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va);
static bool pmap_demote_l2c(pmap_t pmap, pt_entry_t *l2p, vm_offset_t va);
static bool pmap_demote_l3c(pmap_t pmap, pt_entry_t *l3p, vm_offset_t va);
static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va,
    vm_page_t m, vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp);
static int pmap_enter_l2(pmap_t pmap, vm_offset_t va, pd_entry_t new_l2,
    u_int flags, vm_page_t m, struct rwlock **lockp);
static int pmap_enter_l3c(pmap_t pmap, vm_offset_t va, pt_entry_t l3e, u_int flags,
    vm_page_t m, vm_page_t *ml3p, struct rwlock **lockp);
static bool pmap_every_pte_zero(vm_paddr_t pa);
static int pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte, bool promoted,
    bool all_l3e_AF_set);
static pt_entry_t pmap_load_l3c(pt_entry_t *l3p);
static void pmap_mask_set_l3c(pmap_t pmap, pt_entry_t *l3p, vm_offset_t va,
    vm_offset_t *vap, vm_offset_t va_next, pt_entry_t mask, pt_entry_t nbits);
static bool pmap_pv_insert_l3c(pmap_t pmap, vm_offset_t va, vm_page_t m,
    struct rwlock **lockp);
static void pmap_remove_kernel_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va);
static int pmap_remove_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva,
    pd_entry_t l1e, struct spglist *free, struct rwlock **lockp);
static int pmap_remove_l3(pmap_t pmap, pt_entry_t *l3, vm_offset_t sva,
    pd_entry_t l2e, struct spglist *free, struct rwlock **lockp);
static bool pmap_remove_l3c(pmap_t pmap, pt_entry_t *l3p, vm_offset_t va,
    vm_offset_t *vap, vm_offset_t va_next, vm_page_t ml3, struct spglist *free,
    struct rwlock **lockp);
static void pmap_reset_asid_set(pmap_t pmap);
static bool pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va,
    vm_page_t m, struct rwlock **lockp);

static vm_page_t _pmap_alloc_l3(pmap_t pmap, vm_pindex_t ptepindex,
		struct rwlock **lockp);

static void _pmap_unwire_l3(pmap_t pmap, vm_offset_t va, vm_page_t m,
    struct spglist *free);
static int pmap_unuse_pt(pmap_t, vm_offset_t, pd_entry_t, struct spglist *);
static void pmap_update_entry(pmap_t pmap, pd_entry_t *pte, pd_entry_t newpte,
    vm_offset_t va, vm_size_t size);
static __inline vm_page_t pmap_remove_pt_page(pmap_t pmap, vm_offset_t va);
static uma_zone_t pmap_bti_ranges_zone;
static bool pmap_bti_same(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
    pt_entry_t *pte);
static pt_entry_t pmap_pte_bti(pmap_t pmap, vm_offset_t va);
static void pmap_bti_on_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva);
static void *bti_dup_range(void *ctx, void *data);
static void bti_free_range(void *ctx, void *node);
static int pmap_bti_copy(pmap_t dst_pmap, pmap_t src_pmap);
static void pmap_bti_deassign_all(pmap_t pmap);

/*
 * These load the old table data and store the new value.
 * They need to be atomic as the System MMU may write to the table at
 * the same time as the CPU.
 */
#define	pmap_clear(table)		atomic_store_64(table, 0)
#define	pmap_clear_bits(table, bits)	atomic_clear_64(table, bits)
#define	pmap_load(table)		(*table)
#define	pmap_load_clear(table)		atomic_swap_64(table, 0)
#define	pmap_load_store(table, entry)	atomic_swap_64(table, entry)
#define	pmap_set_bits(table, bits)	atomic_set_64(table, bits)
#define	pmap_store(table, entry)	atomic_store_64(table, entry)

/********************/
/* Inline functions */
/********************/

static __inline void
pagecopy(void *s, void *d)
{

	memcpy(d, s, PAGE_SIZE);
}

static __inline pd_entry_t *
pmap_l0(pmap_t pmap, vm_offset_t va)
{

	return (&pmap->pm_l0[pmap_l0_index(va)]);
}

static __inline pd_entry_t *
pmap_l0_to_l1(pd_entry_t *l0, vm_offset_t va)
{
	pd_entry_t *l1;

	l1 = (pd_entry_t *)PHYS_TO_DMAP(PTE_TO_PHYS(pmap_load(l0)));
	return (&l1[pmap_l1_index(va)]);
}

static __inline pd_entry_t *
pmap_l1(pmap_t pmap, vm_offset_t va)
{
	pd_entry_t *l0;

	l0 = pmap_l0(pmap, va);
	if ((pmap_load(l0) & ATTR_DESCR_MASK) != L0_TABLE)
		return (NULL);

	return (pmap_l0_to_l1(l0, va));
}

static __inline pd_entry_t *
pmap_l1_to_l2(pd_entry_t *l1p, vm_offset_t va)
{
	pd_entry_t l1, *l2p;

	l1 = pmap_load(l1p);

	KASSERT(ADDR_IS_CANONICAL(va),
	    ("%s: Address not in canonical form: %lx", __func__, va));
	/*
	 * The valid bit may be clear if pmap_update_entry() is concurrently
	 * modifying the entry, so for KVA only the entry type may be checked.
	 */
	KASSERT(ADDR_IS_KERNEL(va) || (l1 & ATTR_DESCR_VALID) != 0,
	    ("%s: L1 entry %#lx for %#lx is invalid", __func__, l1, va));
	KASSERT((l1 & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_TABLE,
	    ("%s: L1 entry %#lx for %#lx is a leaf", __func__, l1, va));
	l2p = (pd_entry_t *)PHYS_TO_DMAP(PTE_TO_PHYS(l1));
	return (&l2p[pmap_l2_index(va)]);
}

static __inline pd_entry_t *
pmap_l2(pmap_t pmap, vm_offset_t va)
{
	pd_entry_t *l1;

	l1 = pmap_l1(pmap, va);
	if ((pmap_load(l1) & ATTR_DESCR_MASK) != L1_TABLE)
		return (NULL);

	return (pmap_l1_to_l2(l1, va));
}

static __inline pt_entry_t *
pmap_l2_to_l3(pd_entry_t *l2p, vm_offset_t va)
{
	pd_entry_t l2;
	pt_entry_t *l3p;

	l2 = pmap_load(l2p);

	KASSERT(ADDR_IS_CANONICAL(va),
	    ("%s: Address not in canonical form: %lx", __func__, va));
	/*
	 * The valid bit may be clear if pmap_update_entry() is concurrently
	 * modifying the entry, so for KVA only the entry type may be checked.
	 */
	KASSERT(ADDR_IS_KERNEL(va) || (l2 & ATTR_DESCR_VALID) != 0,
	    ("%s: L2 entry %#lx for %#lx is invalid", __func__, l2, va));
	KASSERT((l2 & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_TABLE,
	    ("%s: L2 entry %#lx for %#lx is a leaf", __func__, l2, va));
	l3p = (pt_entry_t *)PHYS_TO_DMAP(PTE_TO_PHYS(l2));
	return (&l3p[pmap_l3_index(va)]);
}

/*
 * Returns the lowest valid pde for a given virtual address.
 * The next level may or may not point to a valid page or block.
 */
static __inline pd_entry_t *
pmap_pde(pmap_t pmap, vm_offset_t va, int *level)
{
	pd_entry_t *l0, *l1, *l2, desc;

	l0 = pmap_l0(pmap, va);
	desc = pmap_load(l0) & ATTR_DESCR_MASK;
	if (desc != L0_TABLE) {
		*level = -1;
		return (NULL);
	}

	l1 = pmap_l0_to_l1(l0, va);
	desc = pmap_load(l1) & ATTR_DESCR_MASK;
	if (desc != L1_TABLE) {
		*level = 0;
		return (l0);
	}

	l2 = pmap_l1_to_l2(l1, va);
	desc = pmap_load(l2) & ATTR_DESCR_MASK;
	if (desc != L2_TABLE) {
		*level = 1;
		return (l1);
	}

	*level = 2;
	return (l2);
}

/*
 * Returns the lowest valid pte block or table entry for a given virtual
 * address.  If there are no valid entries return NULL and set the level to
 * the first invalid level.
 */
static __inline pt_entry_t *
pmap_pte(pmap_t pmap, vm_offset_t va, int *level)
{
	pd_entry_t *l1, *l2, desc;
	pt_entry_t *l3;

	l1 = pmap_l1(pmap, va);
	if (l1 == NULL) {
		*level = 0;
		return (NULL);
	}
	desc = pmap_load(l1) & ATTR_DESCR_MASK;
	if (desc == L1_BLOCK) {
		PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
		*level = 1;
		return (l1);
	}

	if (desc != L1_TABLE) {
		*level = 1;
		return (NULL);
	}

	l2 = pmap_l1_to_l2(l1, va);
	desc = pmap_load(l2) & ATTR_DESCR_MASK;
	if (desc == L2_BLOCK) {
		*level = 2;
		return (l2);
	}

	if (desc != L2_TABLE) {
		*level = 2;
		return (NULL);
	}

	*level = 3;
	l3 = pmap_l2_to_l3(l2, va);
	if ((pmap_load(l3) & ATTR_DESCR_MASK) != L3_PAGE)
		return (NULL);

	return (l3);
}

/*
 * If the given pmap has an L{1,2}_BLOCK or L3_PAGE entry at the specified
 * level that maps the specified virtual address, then a pointer to that entry
 * is returned.  Otherwise, NULL is returned, unless INVARIANTS are enabled
 * and a diagnostic message is provided, in which case this function panics.
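 * Callers that can tolerate a missing mapping pass a NULL "diag" string and
 * simply test the return value for NULL.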
 */
static __always_inline pt_entry_t *
pmap_pte_exists(pmap_t pmap, vm_offset_t va, int level, const char *diag)
{
	pd_entry_t *l0p, *l1p, *l2p;
	pt_entry_t desc, *l3p;
	int walk_level __diagused;

	KASSERT(level >= 0 && level < 4,
	    ("%s: %s passed an out-of-range level (%d)", __func__, diag,
	    level));
	l0p = pmap_l0(pmap, va);
	desc = pmap_load(l0p) & ATTR_DESCR_MASK;
	if (desc == L0_TABLE && level > 0) {
		l1p = pmap_l0_to_l1(l0p, va);
		desc = pmap_load(l1p) & ATTR_DESCR_MASK;
		if (desc == L1_BLOCK && level == 1) {
			PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
			return (l1p);
		}
		if (desc == L1_TABLE && level > 1) {
			l2p = pmap_l1_to_l2(l1p, va);
			desc = pmap_load(l2p) & ATTR_DESCR_MASK;
			if (desc == L2_BLOCK && level == 2)
				return (l2p);
			else if (desc == L2_TABLE && level > 2) {
				l3p = pmap_l2_to_l3(l2p, va);
				desc = pmap_load(l3p) & ATTR_DESCR_MASK;
				if (desc == L3_PAGE && level == 3)
					return (l3p);
				else
					walk_level = 3;
			} else
				walk_level = 2;
		} else
			walk_level = 1;
	} else
		walk_level = 0;
	KASSERT(diag == NULL,
	    ("%s: va %#lx not mapped at level %d, desc %ld at level %d",
	    diag, va, level, desc, walk_level));
	return (NULL);
}

bool
pmap_ps_enabled(pmap_t pmap)
{
	/*
	 * Promotion requires a hypervisor call when the kernel is running
	 * in EL1.  To stop this, disable superpage support on non-stage 1
	 * pmaps for now.
	 */
	if (pmap->pm_stage != PM_STAGE1)
		return (false);

#ifdef KMSAN
	/*
	 * The break-before-make in pmap_update_entry() results in a situation
	 * where a CPU may call into the KMSAN runtime while the entry is
	 * invalid.  If the entry is used to map the current thread structure,
	 * then the runtime will attempt to access unmapped memory.  Avoid this
	 * by simply disabling superpage promotion for the kernel map.
	 */
	if (pmap == kernel_pmap)
		return (false);
#endif

	return (superpages_enabled != 0);
}

bool
pmap_get_tables(pmap_t pmap, vm_offset_t va, pd_entry_t **l0, pd_entry_t **l1,
    pd_entry_t **l2, pt_entry_t **l3)
{
	pd_entry_t *l0p, *l1p, *l2p;

	if (pmap->pm_l0 == NULL)
		return (false);

	l0p = pmap_l0(pmap, va);
	*l0 = l0p;

	if ((pmap_load(l0p) & ATTR_DESCR_MASK) != L0_TABLE)
		return (false);

	l1p = pmap_l0_to_l1(l0p, va);
	*l1 = l1p;

	if ((pmap_load(l1p) & ATTR_DESCR_MASK) == L1_BLOCK) {
		PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
		*l2 = NULL;
		*l3 = NULL;
		return (true);
	}

	if ((pmap_load(l1p) & ATTR_DESCR_MASK) != L1_TABLE)
		return (false);

	l2p = pmap_l1_to_l2(l1p, va);
	*l2 = l2p;

	if ((pmap_load(l2p) & ATTR_DESCR_MASK) == L2_BLOCK) {
		*l3 = NULL;
		return (true);
	}

	if ((pmap_load(l2p) & ATTR_DESCR_MASK) != L2_TABLE)
		return (false);

	*l3 = pmap_l2_to_l3(l2p, va);

	return (true);
}

static __inline int
pmap_l3_valid(pt_entry_t l3)
{

	return ((l3 & ATTR_DESCR_MASK) == L3_PAGE);
}

CTASSERT(L1_BLOCK == L2_BLOCK);

static pt_entry_t
pmap_pte_memattr(pmap_t pmap, vm_memattr_t memattr)
{
	pt_entry_t val;

	if (pmap->pm_stage == PM_STAGE1) {
		val = ATTR_S1_IDX(memattr);
		if (memattr == VM_MEMATTR_DEVICE)
			val |= ATTR_S1_XN;
		return (val);
	}

	val = 0;

	switch (memattr) {
	case VM_MEMATTR_DEVICE:
		return (ATTR_S2_MEMATTR(ATTR_S2_MEMATTR_DEVICE_nGnRnE) |
		    ATTR_S2_XN(ATTR_S2_XN_ALL));
	case VM_MEMATTR_UNCACHEABLE:
		return (ATTR_S2_MEMATTR(ATTR_S2_MEMATTR_NC));
	case VM_MEMATTR_WRITE_BACK:
		return (ATTR_S2_MEMATTR(ATTR_S2_MEMATTR_WB));
	case VM_MEMATTR_WRITE_THROUGH:
		return (ATTR_S2_MEMATTR(ATTR_S2_MEMATTR_WT));
	default:
		panic("%s: invalid memory attribute %x", __func__, memattr);
	}
}

static pt_entry_t
pmap_pte_prot(pmap_t pmap, vm_prot_t prot)
{
	pt_entry_t val;

	val = 0;
	if (pmap->pm_stage == PM_STAGE1) {
		if ((prot & VM_PROT_EXECUTE) == 0)
			val |= ATTR_S1_XN;
		if ((prot & VM_PROT_WRITE) == 0)
			val |= ATTR_S1_AP(ATTR_S1_AP_RO);
	} else {
		if ((prot & VM_PROT_WRITE) != 0)
			val |= ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE);
		if ((prot & VM_PROT_READ) != 0)
			val |= ATTR_S2_S2AP(ATTR_S2_S2AP_READ);
		if ((prot & VM_PROT_EXECUTE) == 0)
			val |= ATTR_S2_XN(ATTR_S2_XN_ALL);
	}

	return (val);
}

/*
 * Checks if the PTE is dirty.
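 * For stage 1 a dirty PTE is writeable (ATTR_S1_AP_RW) with ATTR_SW_DBM set,
 * while for stage 2 the S2AP write permission alone encodes dirtiness.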
 */
static inline int
pmap_pte_dirty(pmap_t pmap, pt_entry_t pte)
{

	KASSERT((pte & ATTR_SW_MANAGED) != 0, ("pte %#lx is unmanaged", pte));

	if (pmap->pm_stage == PM_STAGE1) {
		KASSERT((pte & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) != 0,
		    ("pte %#lx is writeable and missing ATTR_SW_DBM", pte));

		return ((pte & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) ==
		    (ATTR_S1_AP(ATTR_S1_AP_RW) | ATTR_SW_DBM));
	}

	return ((pte & ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE)) ==
	    ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE));
}

static __inline void
pmap_resident_count_inc(pmap_t pmap, int count)
{

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	pmap->pm_stats.resident_count += count;
}

static __inline void
pmap_resident_count_dec(pmap_t pmap, int count)
{

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	KASSERT(pmap->pm_stats.resident_count >= count,
	    ("pmap %p resident count underflow %ld %d", pmap,
	    pmap->pm_stats.resident_count, count));
	pmap->pm_stats.resident_count -= count;
}

static vm_paddr_t
pmap_early_vtophys(vm_offset_t va)
{
	vm_paddr_t pa_page;

	pa_page = arm64_address_translate_s1e1r(va) & PAR_PA_MASK;
	return (pa_page | (va & PAR_LOW_MASK));
}

/* State of the bootstrapped DMAP page tables */
struct pmap_bootstrap_state {
	pt_entry_t	*l1;
	pt_entry_t	*l2;
	pt_entry_t	*l3;
	vm_offset_t	freemempos;
	vm_offset_t	va;
	vm_paddr_t	pa;
	pt_entry_t	table_attrs;
	u_int		l0_slot;
	u_int		l1_slot;
	u_int		l2_slot;
	bool		dmap_valid;
};

/* The bootstrap state */
static struct pmap_bootstrap_state bs_state = {
	.l1 = NULL,
	.l2 = NULL,
	.l3 = NULL,
	.table_attrs = TATTR_PXN_TABLE,
	.l0_slot = L0_ENTRIES,
	.l1_slot = Ln_ENTRIES,
	.l2_slot = Ln_ENTRIES,
	.dmap_valid = false,
};

static void
pmap_bootstrap_l0_table(struct pmap_bootstrap_state *state)
{
	vm_paddr_t l1_pa;
	pd_entry_t l0e;
	u_int l0_slot;

	/* Link the level 0 table to a level 1 table */
	l0_slot = pmap_l0_index(state->va);
	if (l0_slot != state->l0_slot) {
		/*
		 * Make sure we move from a low address to high address
		 * before the DMAP region is ready.  This ensures we never
		 * modify an existing mapping until we can map from a
		 * physical address to a virtual address.
		 */
		MPASS(state->l0_slot < l0_slot ||
		    state->l0_slot == L0_ENTRIES ||
		    state->dmap_valid);

		/* Reset lower levels */
		state->l2 = NULL;
		state->l3 = NULL;
		state->l1_slot = Ln_ENTRIES;
		state->l2_slot = Ln_ENTRIES;

		/* Check the existing L0 entry */
		state->l0_slot = l0_slot;
		if (state->dmap_valid) {
			l0e = pagetable_l0_ttbr1[l0_slot];
			if ((l0e & ATTR_DESCR_VALID) != 0) {
				MPASS((l0e & ATTR_DESCR_MASK) == L0_TABLE);
				l1_pa = PTE_TO_PHYS(l0e);
				state->l1 = (pt_entry_t *)PHYS_TO_DMAP(l1_pa);
				return;
			}
		}

		/* Create a new L0 table entry */
		state->l1 = (pt_entry_t *)state->freemempos;
		memset(state->l1, 0, PAGE_SIZE);
		state->freemempos += PAGE_SIZE;

		l1_pa = pmap_early_vtophys((vm_offset_t)state->l1);
		MPASS((l1_pa & Ln_TABLE_MASK) == 0);
		MPASS(pagetable_l0_ttbr1[l0_slot] == 0);
		pmap_store(&pagetable_l0_ttbr1[l0_slot], PHYS_TO_PTE(l1_pa) |
		    TATTR_UXN_TABLE | TATTR_AP_TABLE_NO_EL0 | L0_TABLE);
	}
	KASSERT(state->l1 != NULL, ("%s: NULL l1", __func__));
}

static void
pmap_bootstrap_l1_table(struct pmap_bootstrap_state *state)
{
	vm_paddr_t l2_pa;
	pd_entry_t l1e;
	u_int l1_slot;

	/* Make sure there is a valid L0 -> L1 table */
	pmap_bootstrap_l0_table(state);

	/* Link the level 1 table to a level 2 table */
	l1_slot = pmap_l1_index(state->va);
	if (l1_slot != state->l1_slot) {
		/* See pmap_bootstrap_l0_table for a description */
		MPASS(state->l1_slot < l1_slot ||
		    state->l1_slot == Ln_ENTRIES ||
		    state->dmap_valid);

		/* Reset lower levels */
		state->l3 = NULL;
		state->l2_slot = Ln_ENTRIES;

		/* Check the existing L1 entry */
		state->l1_slot = l1_slot;
		if (state->dmap_valid) {
			l1e = state->l1[l1_slot];
			if ((l1e & ATTR_DESCR_VALID) != 0) {
				MPASS((l1e & ATTR_DESCR_MASK) == L1_TABLE);
				l2_pa = PTE_TO_PHYS(l1e);
				state->l2 = (pt_entry_t *)PHYS_TO_DMAP(l2_pa);
				return;
			}
		}

		/* Create a new L1 table entry */
		state->l2 = (pt_entry_t *)state->freemempos;
		memset(state->l2, 0, PAGE_SIZE);
		state->freemempos += PAGE_SIZE;

		l2_pa = pmap_early_vtophys((vm_offset_t)state->l2);
		MPASS((l2_pa & Ln_TABLE_MASK) == 0);
		MPASS(state->l1[l1_slot] == 0);
		pmap_store(&state->l1[l1_slot], PHYS_TO_PTE(l2_pa) |
		    state->table_attrs | L1_TABLE);
	}
	KASSERT(state->l2 != NULL, ("%s: NULL l2", __func__));
}

static void
pmap_bootstrap_l2_table(struct pmap_bootstrap_state *state)
{
	vm_paddr_t l3_pa;
	pd_entry_t l2e;
	u_int l2_slot;

	/* Make sure there is a valid L1 -> L2 table */
	pmap_bootstrap_l1_table(state);

	/* Link the level 2 table to a level 3 table */
	l2_slot = pmap_l2_index(state->va);
	if (l2_slot != state->l2_slot) {
		/* See pmap_bootstrap_l0_table for a description */
		MPASS(state->l2_slot < l2_slot ||
		    state->l2_slot == Ln_ENTRIES ||
		    state->dmap_valid);

		/* Check the existing L2 entry */
		state->l2_slot = l2_slot;
		if (state->dmap_valid) {
			l2e = state->l2[l2_slot];
			if ((l2e & ATTR_DESCR_VALID) != 0) {
				MPASS((l2e & ATTR_DESCR_MASK) == L2_TABLE);
				l3_pa = PTE_TO_PHYS(l2e);
				state->l3 = (pt_entry_t *)PHYS_TO_DMAP(l3_pa);
				return;
			}
		}

		/* Create a new L2 table entry */
		state->l3 = (pt_entry_t *)state->freemempos;
		memset(state->l3, 0, PAGE_SIZE);
		state->freemempos += PAGE_SIZE;

		l3_pa = pmap_early_vtophys((vm_offset_t)state->l3);
		MPASS((l3_pa & Ln_TABLE_MASK) == 0);
		MPASS(state->l2[l2_slot] == 0);
		pmap_store(&state->l2[l2_slot], PHYS_TO_PTE(l3_pa) |
		    state->table_attrs | L2_TABLE);
	}
	KASSERT(state->l3 != NULL, ("%s: NULL l3", __func__));
}

static void
pmap_bootstrap_l2_block(struct pmap_bootstrap_state *state, int i)
{
	pt_entry_t contig;
	u_int l2_slot;
	bool first;

	if ((physmap[i + 1] - state->pa) < L2_SIZE)
		return;

	/* Make sure there is a valid L1 table */
	pmap_bootstrap_l1_table(state);

	MPASS((state->va & L2_OFFSET) == 0);
	for (first = true, contig = 0;
	    state->va < DMAP_MAX_ADDRESS &&
	    (physmap[i + 1] - state->pa) >= L2_SIZE;
	    state->va += L2_SIZE, state->pa += L2_SIZE) {
		/*
		 * Stop if we are about to walk off the end of what the
		 * current L1 slot can address.
		 */
		if (!first && (state->pa & L1_OFFSET) == 0)
			break;

		/*
		 * If we have an aligned, contiguous chunk of L2C_ENTRIES
		 * L2 blocks, set the contiguous bit within each PTE so that
		 * the chunk can be cached using only one TLB entry.
		 */
		if ((state->pa & L2C_OFFSET) == 0) {
			if (state->va + L2C_SIZE < DMAP_MAX_ADDRESS &&
			    physmap[i + 1] - state->pa >= L2C_SIZE) {
				contig = ATTR_CONTIGUOUS;
			} else {
				contig = 0;
			}
		}

		first = false;
		l2_slot = pmap_l2_index(state->va);
		MPASS((state->pa & L2_OFFSET) == 0);
		MPASS(state->l2[l2_slot] == 0);
		pmap_store(&state->l2[l2_slot], PHYS_TO_PTE(state->pa) |
		    ATTR_AF | pmap_sh_attr | ATTR_S1_XN | ATTR_KERN_GP |
		    ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK) | contig | L2_BLOCK);
	}
	MPASS(state->va == (state->pa - dmap_phys_base + DMAP_MIN_ADDRESS));
}

static void
pmap_bootstrap_l3_page(struct pmap_bootstrap_state *state, int i)
{
	pt_entry_t contig;
	u_int l3_slot;
	bool first;

	if (physmap[i + 1] - state->pa < L3_SIZE)
		return;

	/* Make sure there is a valid L2 table */
	pmap_bootstrap_l2_table(state);

	MPASS((state->va & L3_OFFSET) == 0);
	for (first = true, contig = 0;
	    state->va < DMAP_MAX_ADDRESS &&
	    physmap[i + 1] - state->pa >= L3_SIZE;
	    state->va += L3_SIZE, state->pa += L3_SIZE) {
		/*
		 * Stop if we are about to walk off the end of what the
		 * current L2 slot can address.
		 */
		if (!first && (state->pa & L2_OFFSET) == 0)
			break;

		/*
		 * If we have an aligned, contiguous chunk of L3C_ENTRIES
		 * L3 pages, set the contiguous bit within each PTE so that
		 * the chunk can be cached using only one TLB entry.
		 */
		if ((state->pa & L3C_OFFSET) == 0) {
			if (state->va + L3C_SIZE < DMAP_MAX_ADDRESS &&
			    physmap[i + 1] - state->pa >= L3C_SIZE) {
				contig = ATTR_CONTIGUOUS;
			} else {
				contig = 0;
			}
		}

		first = false;
		l3_slot = pmap_l3_index(state->va);
		MPASS((state->pa & L3_OFFSET) == 0);
		MPASS(state->l3[l3_slot] == 0);
		pmap_store(&state->l3[l3_slot], PHYS_TO_PTE(state->pa) |
		    ATTR_AF | pmap_sh_attr | ATTR_S1_XN | ATTR_KERN_GP |
		    ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK) | contig | L3_PAGE);
	}
	MPASS(state->va == (state->pa - dmap_phys_base + DMAP_MIN_ADDRESS));
}

static void
pmap_bootstrap_dmap(void)
{
	int i;

	/* Fill in physmap array. */
	physmap_idx = physmem_avail(physmap, nitems(physmap));

	dmap_phys_base = physmap[0] & ~L1_OFFSET;
	dmap_phys_max = 0;
	dmap_max_addr = 0;

	for (i = 0; i < physmap_idx; i += 2) {
		bs_state.pa = physmap[i] & ~L3_OFFSET;
		bs_state.va = bs_state.pa - dmap_phys_base + DMAP_MIN_ADDRESS;

		/* Create L3 mappings at the start of the region */
		if ((bs_state.pa & L2_OFFSET) != 0)
			pmap_bootstrap_l3_page(&bs_state, i);
		MPASS(bs_state.pa <= physmap[i + 1]);

		if (L1_BLOCKS_SUPPORTED) {
			/* Create L2 mappings at the start of the region */
			if ((bs_state.pa & L1_OFFSET) != 0)
				pmap_bootstrap_l2_block(&bs_state, i);
			MPASS(bs_state.pa <= physmap[i + 1]);

			/* Create the main L1 block mappings */
			for (; bs_state.va < DMAP_MAX_ADDRESS &&
			    (physmap[i + 1] - bs_state.pa) >= L1_SIZE;
			    bs_state.va += L1_SIZE, bs_state.pa += L1_SIZE) {
				/* Make sure there is a valid L1 table */
				pmap_bootstrap_l0_table(&bs_state);
				MPASS((bs_state.pa & L1_OFFSET) == 0);
				pmap_store(
				    &bs_state.l1[pmap_l1_index(bs_state.va)],
				    PHYS_TO_PTE(bs_state.pa) | ATTR_AF |
				    pmap_sh_attr |
				    ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK) |
				    ATTR_S1_XN | ATTR_KERN_GP | L1_BLOCK);
			}
			MPASS(bs_state.pa <= physmap[i + 1]);

			/* Create L2 mappings at the end of the region */
			pmap_bootstrap_l2_block(&bs_state, i);
		} else {
			while (bs_state.va < DMAP_MAX_ADDRESS &&
			    (physmap[i + 1] - bs_state.pa) >= L2_SIZE) {
				pmap_bootstrap_l2_block(&bs_state, i);
			}
		}
		MPASS(bs_state.pa <= physmap[i + 1]);

		/* Create L3 mappings at the end of the region */
		pmap_bootstrap_l3_page(&bs_state, i);
		MPASS(bs_state.pa == physmap[i + 1]);

		if (bs_state.pa > dmap_phys_max) {
			dmap_phys_max = bs_state.pa;
			dmap_max_addr = bs_state.va;
		}
	}

	cpu_tlb_flushID();
}

static void
pmap_bootstrap_l2(vm_offset_t va)
{
	KASSERT((va & L1_OFFSET) == 0, ("Invalid virtual address"));

	/* Leave bs_state.pa as it's only needed to bootstrap blocks and pages */
	bs_state.va = va;

	for (; bs_state.va < VM_MAX_KERNEL_ADDRESS; bs_state.va += L1_SIZE)
		pmap_bootstrap_l1_table(&bs_state);
}

static void
pmap_bootstrap_l3(vm_offset_t va)
{
	KASSERT((va & L2_OFFSET) == 0, ("Invalid virtual address"));

	/* Leave bs_state.pa as it's only needed to bootstrap blocks and pages */
	bs_state.va = va;

	for (; bs_state.va < VM_MAX_KERNEL_ADDRESS; bs_state.va += L2_SIZE)
		pmap_bootstrap_l2_table(&bs_state);
}

/*
 * Bootstrap the system enough to run with virtual memory.
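 *
 * This sets up the kernel pmap fields, builds the DMAP, creates the L2/L3
 * tables covering the kernel VA range and the early devmap, and carves out
 * the per-CPU area and message buffer from the memory following the kernel.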
 */
void
pmap_bootstrap(vm_size_t kernlen)
{
	vm_offset_t dpcpu, msgbufpv;
	vm_paddr_t start_pa, pa;
	uint64_t tcr;

	tcr = READ_SPECIALREG(tcr_el1);

	/* Verify that the ASID is set through TTBR0. */
	KASSERT((tcr & TCR_A1) == 0, ("pmap_bootstrap: TCR_EL1.A1 != 0"));

	if ((tcr & TCR_DS) != 0)
		pmap_lpa_enabled = true;

	pmap_l1_supported = L1_BLOCKS_SUPPORTED;

	/* Set this early so we can use the pagetable walking functions */
	kernel_pmap_store.pm_l0 = pagetable_l0_ttbr1;
	PMAP_LOCK_INIT(kernel_pmap);
	kernel_pmap->pm_l0_paddr =
	    pmap_early_vtophys((vm_offset_t)kernel_pmap_store.pm_l0);
	TAILQ_INIT(&kernel_pmap->pm_pvchunk);
	vm_radix_init(&kernel_pmap->pm_root);
	kernel_pmap->pm_cookie = COOKIE_FROM(-1, INT_MIN);
	kernel_pmap->pm_stage = PM_STAGE1;
	kernel_pmap->pm_levels = 4;
	kernel_pmap->pm_ttbr = kernel_pmap->pm_l0_paddr;
	kernel_pmap->pm_asid_set = &asids;

	bs_state.freemempos = KERNBASE + kernlen;
	bs_state.freemempos = roundup2(bs_state.freemempos, PAGE_SIZE);

	/* Create a direct map region early so we can use it for pa -> va */
	pmap_bootstrap_dmap();
	bs_state.dmap_valid = true;

	/*
	 * We only use PXN when we know nothing will be executed from it, e.g.
	 * the DMAP region.
	 */
	bs_state.table_attrs &= ~TATTR_PXN_TABLE;

	start_pa = pa = pmap_early_vtophys(KERNBASE);

	/*
	 * Create the l2 tables up to VM_MAX_KERNEL_ADDRESS.  We assume that the
	 * loader allocated the first and only l2 page table page used to map
	 * the kernel, preloaded files and module metadata.
	 */
	pmap_bootstrap_l2(KERNBASE + L1_SIZE);
	/* And the l3 tables for the early devmap */
	pmap_bootstrap_l3(VM_MAX_KERNEL_ADDRESS - (PMAP_MAPDEV_EARLY_SIZE));

	cpu_tlb_flushID();

#define alloc_pages(var, np)						\
	(var) = bs_state.freemempos;					\
	bs_state.freemempos += (np * PAGE_SIZE);			\
	memset((char *)(var), 0, ((np) * PAGE_SIZE));

	/* Allocate dynamic per-cpu area. */
	alloc_pages(dpcpu, DPCPU_SIZE / PAGE_SIZE);
	dpcpu_init((void *)dpcpu, 0);

	/* Allocate memory for the msgbuf, e.g. for /sbin/dmesg */
	alloc_pages(msgbufpv, round_page(msgbufsize) / PAGE_SIZE);
	msgbufp = (void *)msgbufpv;

	/* Reserve some VA space for early BIOS/ACPI mapping */
	preinit_map_va = roundup2(bs_state.freemempos, L2_SIZE);

	virtual_avail = preinit_map_va + PMAP_PREINIT_MAPPING_SIZE;
	virtual_avail = roundup2(virtual_avail, L1_SIZE);
	virtual_end = VM_MAX_KERNEL_ADDRESS - (PMAP_MAPDEV_EARLY_SIZE);
	kernel_vm_end = virtual_avail;

	pa = pmap_early_vtophys(bs_state.freemempos);

	physmem_exclude_region(start_pa, pa - start_pa, EXFLAG_NOALLOC);

	cpu_tlb_flushID();
}

#if defined(KASAN) || defined(KMSAN)
static void
pmap_bootstrap_allocate_san_l2(vm_paddr_t start_pa, vm_paddr_t end_pa,
    vm_offset_t *vap, vm_offset_t eva)
{
	vm_paddr_t pa;
	vm_offset_t va;
	pd_entry_t *l2;

	va = *vap;
	pa = rounddown2(end_pa - L2_SIZE, L2_SIZE);
	for (; pa >= start_pa && va < eva; va += L2_SIZE, pa -= L2_SIZE) {
		l2 = pmap_l2(kernel_pmap, va);

		/*
		 * KASAN stack checking results in us having already allocated
		 * part of our shadow map, so we can just skip those segments.
		 */
		if ((pmap_load(l2) & ATTR_DESCR_VALID) != 0) {
			pa += L2_SIZE;
			continue;
		}

		bzero((void *)PHYS_TO_DMAP(pa), L2_SIZE);
		physmem_exclude_region(pa, L2_SIZE, EXFLAG_NOALLOC);
		pmap_store(l2, PHYS_TO_PTE(pa) | PMAP_SAN_PTE_BITS | L2_BLOCK);
	}
	*vap = va;
}

/*
 * Finish constructing the initial shadow map:
 * - Count how many pages from KERNBASE to virtual_avail (scaled for
 *   shadow map)
 * - Map that entire range using L2 superpages.
 */
static void
pmap_bootstrap_san1(vm_offset_t va, int scale)
{
	vm_offset_t eva;
	vm_paddr_t kernstart;
	int i;

	kernstart = pmap_early_vtophys(KERNBASE);

	/*
	 * Rebuild physmap one more time; we may have excluded more regions
	 * from allocation since pmap_bootstrap().
	 */
	physmap_idx = physmem_avail(physmap, nitems(physmap));

	eva = va + (virtual_avail - VM_MIN_KERNEL_ADDRESS) / scale;

	/*
	 * Find a slot in the physmap large enough for what we need.  We try
	 * to put the shadow map as high up as we can to avoid depleting the
	 * lower 4GB in case it's needed for, e.g., an xhci controller that
	 * can only do 32-bit DMA.
	 */
	for (i = physmap_idx - 2; i >= 0; i -= 2) {
		vm_paddr_t plow, phigh;

		/* L2 mappings must be backed by memory that is L2-aligned */
		plow = roundup2(physmap[i], L2_SIZE);
		phigh = physmap[i + 1];
		if (plow >= phigh)
			continue;
		if (kernstart >= plow && kernstart < phigh)
			phigh = kernstart;
		if (phigh - plow >= L2_SIZE) {
			pmap_bootstrap_allocate_san_l2(plow, phigh, &va, eva);
			if (va >= eva)
				break;
		}
	}
	if (i < 0)
		panic("Could not find phys region for shadow map");

	/*
	 * Done.  We should now have a valid shadow address mapped for all KVA
	 * that has been mapped so far, i.e., KERNBASE to virtual_avail.  Thus,
	 * shadow accesses by the sanitizer runtime will succeed for this range.
	 * When the kernel virtual address range is later expanded, as will
	 * happen in vm_mem_init(), the shadow map will be grown as well.  This
	 * is handled by pmap_san_enter().
	 */
}

void
pmap_bootstrap_san(void)
{
#ifdef KASAN
	pmap_bootstrap_san1(KASAN_MIN_ADDRESS, KASAN_SHADOW_SCALE);
#else
	static uint8_t kmsan_shad_ptp[PAGE_SIZE * 2] __aligned(PAGE_SIZE);
	static uint8_t kmsan_orig_ptp[PAGE_SIZE * 2] __aligned(PAGE_SIZE);
	pd_entry_t *l0, *l1;

	if (virtual_avail - VM_MIN_KERNEL_ADDRESS > L1_SIZE)
		panic("initial kernel map is too large");

	l0 = pmap_l0(kernel_pmap, KMSAN_SHAD_MIN_ADDRESS);
	pmap_store(l0, L0_TABLE | PHYS_TO_PTE(
	    pmap_early_vtophys((vm_offset_t)kmsan_shad_ptp)));
	l1 = pmap_l0_to_l1(l0, KMSAN_SHAD_MIN_ADDRESS);
	pmap_store(l1, L1_TABLE | PHYS_TO_PTE(
	    pmap_early_vtophys((vm_offset_t)kmsan_shad_ptp + PAGE_SIZE)));
	pmap_bootstrap_san1(KMSAN_SHAD_MIN_ADDRESS, 1);

	l0 = pmap_l0(kernel_pmap, KMSAN_ORIG_MIN_ADDRESS);
	pmap_store(l0, L0_TABLE | PHYS_TO_PTE(
	    pmap_early_vtophys((vm_offset_t)kmsan_orig_ptp)));
	l1 = pmap_l0_to_l1(l0, KMSAN_ORIG_MIN_ADDRESS);
	pmap_store(l1, L1_TABLE | PHYS_TO_PTE(
	    pmap_early_vtophys((vm_offset_t)kmsan_orig_ptp + PAGE_SIZE)));
	pmap_bootstrap_san1(KMSAN_ORIG_MIN_ADDRESS, 1);
#endif
}
#endif

/*
 * Initialize a vm_page's machine-dependent fields.
 */
void
pmap_page_init(vm_page_t m)
{

	TAILQ_INIT(&m->md.pv_list);
	m->md.pv_memattr = VM_MEMATTR_WRITE_BACK;
}

static void
pmap_init_asids(struct asid_set *set, int bits)
{
	int i;

	set->asid_bits = bits;

	/*
	 * We may be too early in the overall initialization process to use
	 * bit_alloc().
	 */
	set->asid_set_size = 1 << set->asid_bits;
	set->asid_set = kmem_malloc(bitstr_size(set->asid_set_size),
	    M_WAITOK | M_ZERO);
	for (i = 0; i < ASID_FIRST_AVAILABLE; i++)
		bit_set(set->asid_set, i);
	set->asid_next = ASID_FIRST_AVAILABLE;
	mtx_init(&set->asid_set_mutex, "asid set", NULL, MTX_SPIN);
}

static void
pmap_init_pv_table(void)
{
	struct vm_phys_seg *seg, *next_seg;
	struct pmap_large_md_page *pvd;
	vm_size_t s;
	int domain, i, j, pages;

	/*
	 * We depend on the size being evenly divisible into a page so
	 * that the pv_table array can be indexed directly while
	 * safely spanning multiple pages from different domains.
	 */
	CTASSERT(PAGE_SIZE % sizeof(*pvd) == 0);

	/*
	 * Calculate the size of the array.
	 */
	s = 0;
	for (i = 0; i < vm_phys_nsegs; i++) {
		seg = &vm_phys_segs[i];
		pages = pmap_l2_pindex(roundup2(seg->end, L2_SIZE)) -
		    pmap_l2_pindex(seg->start);
		s += round_page(pages * sizeof(*pvd));
	}
	pv_table = (struct pmap_large_md_page *)kva_alloc(s);
	if (pv_table == NULL)
		panic("%s: kva_alloc failed\n", __func__);

	/*
	 * Iterate physical segments to allocate domain-local memory for PV
	 * list headers.
	 */
	pvd = pv_table;
	for (i = 0; i < vm_phys_nsegs; i++) {
		seg = &vm_phys_segs[i];
		pages = pmap_l2_pindex(roundup2(seg->end, L2_SIZE)) -
		    pmap_l2_pindex(seg->start);
		domain = seg->domain;

		s = round_page(pages * sizeof(*pvd));

		for (j = 0; j < s; j += PAGE_SIZE) {
			vm_page_t m = vm_page_alloc_noobj_domain(domain,
			    VM_ALLOC_ZERO);
			if (m == NULL)
				panic("failed to allocate PV table page");
			pmap_qenter((vm_offset_t)pvd + j, &m, 1);
		}

		for (j = 0; j < s / sizeof(*pvd); j++) {
			rw_init_flags(&pvd->pv_lock, "pmap pv list", RW_NEW);
			TAILQ_INIT(&pvd->pv_page.pv_list);
			pvd++;
		}
	}
	pvd = &pv_dummy_large;
	memset(pvd, 0, sizeof(*pvd));
	rw_init_flags(&pvd->pv_lock, "pmap pv list dummy", RW_NEW);
	TAILQ_INIT(&pvd->pv_page.pv_list);

	/*
	 * Set pointers from vm_phys_segs to pv_table.
	 */
	for (i = 0, pvd = pv_table; i < vm_phys_nsegs; i++) {
		seg = &vm_phys_segs[i];
		seg->md_first = pvd;
		pvd += pmap_l2_pindex(roundup2(seg->end, L2_SIZE)) -
		    pmap_l2_pindex(seg->start);

		/*
		 * If there is a following segment, and the final
		 * superpage of this segment and the initial superpage
		 * of the next segment are the same then adjust the
		 * pv_table entry for that next segment down by one so
		 * that the pv_table entries will be shared.
		 */
		if (i + 1 < vm_phys_nsegs) {
			next_seg = &vm_phys_segs[i + 1];
			if (pmap_l2_pindex(roundup2(seg->end, L2_SIZE)) - 1 ==
			    pmap_l2_pindex(next_seg->start)) {
				pvd--;
			}
		}
	}
}

static bool
pmap_dbm_check(const struct cpu_feat *feat __unused, u_int midr __unused)
{
	uint64_t id_aa64mmfr1;

	id_aa64mmfr1 = READ_SPECIALREG(id_aa64mmfr1_el1);
	return (ID_AA64MMFR1_HAFDBS_VAL(id_aa64mmfr1) >=
	    ID_AA64MMFR1_HAFDBS_AF_DBS);
}

static bool
pmap_dbm_has_errata(const struct cpu_feat *feat __unused, u_int midr,
    u_int **errata_list, u_int *errata_count)
{
	/* Disable on Cortex-A55 for erratum 1024718 - all revisions */
	if (CPU_MATCH(CPU_IMPL_MASK | CPU_PART_MASK, CPU_IMPL_ARM,
	    CPU_PART_CORTEX_A55, 0, 0)) {
		static u_int errata_id = 1024718;

		*errata_list = &errata_id;
		*errata_count = 1;
		return (true);
	}

	/* Disable on Cortex-A510 for erratum 2051678 - r0p0 to r0p2 */
	if (CPU_MATCH(CPU_IMPL_MASK | CPU_PART_MASK | CPU_VAR_MASK,
	    CPU_IMPL_ARM, CPU_PART_CORTEX_A510, 0, 0)) {
		if (CPU_REV(PCPU_GET(midr)) < 3) {
			static u_int errata_id = 2051678;

			*errata_list = &errata_id;
			*errata_count = 1;
			return (true);
		}
	}

	return (false);
}

static void
pmap_dbm_enable(const struct cpu_feat *feat __unused,
    cpu_feat_errata errata_status, u_int *errata_list __unused,
    u_int errata_count)
{
	uint64_t tcr;

	/* Skip if there is an erratum affecting DBM */
	if (errata_status != ERRATA_NONE)
		return;

	tcr = READ_SPECIALREG(tcr_el1) | TCR_HD;
	WRITE_SPECIALREG(tcr_el1, tcr);
	isb();
	/* Flush the local TLB for the TCR_HD flag change */
	dsb(nshst);
	__asm __volatile("tlbi vmalle1");
	dsb(nsh);
	isb();
}

static struct cpu_feat feat_dbm = {
	.feat_name		= "FEAT_HAFDBS (DBM)",
	.feat_check		= pmap_dbm_check,
	.feat_has_errata	= pmap_dbm_has_errata,
	.feat_enable		= pmap_dbm_enable,
	.feat_flags		= CPU_FEAT_AFTER_DEV | CPU_FEAT_PER_CPU,
};
DATA_SET(cpu_feat_set, feat_dbm);

/*
 * Initialize the pmap module.
 *
 * Called by vm_mem_init(), to initialize any structures that the pmap
 * system needs to map virtual memory.
 */
void
pmap_init(void)
{
	uint64_t mmfr1;
	int i, vmid_bits;

	/*
	 * Are large page mappings enabled?
	 */
	TUNABLE_INT_FETCH("vm.pmap.superpages_enabled", &superpages_enabled);
	if (superpages_enabled) {
		KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0,
		    ("pmap_init: can't assign to pagesizes[1]"));
		pagesizes[1] = L3C_SIZE;
		KASSERT(MAXPAGESIZES > 2 && pagesizes[2] == 0,
		    ("pmap_init: can't assign to pagesizes[2]"));
		pagesizes[2] = L2_SIZE;
		if (L1_BLOCKS_SUPPORTED) {
			KASSERT(MAXPAGESIZES > 3 && pagesizes[3] == 0,
			    ("pmap_init: can't assign to pagesizes[3]"));
			pagesizes[3] = L1_SIZE;
		}
	}

	/*
	 * Initialize the ASID allocator.
	 */
	pmap_init_asids(&asids,
	    (READ_SPECIALREG(tcr_el1) & TCR_ASID_16) != 0 ? 16 : 8);

	if (has_hyp()) {
		mmfr1 = READ_SPECIALREG(id_aa64mmfr1_el1);
		vmid_bits = 8;

		if (ID_AA64MMFR1_VMIDBits_VAL(mmfr1) ==
		    ID_AA64MMFR1_VMIDBits_16)
			vmid_bits = 16;
		pmap_init_asids(&vmids, vmid_bits);
	}

	/*
	 * Initialize pv chunk lists.
	 */
	for (i = 0; i < PMAP_MEMDOM; i++) {
		mtx_init(&pv_chunks[i].pvc_lock, "pmap pv chunk list", NULL,
		    MTX_DEF);
		TAILQ_INIT(&pv_chunks[i].pvc_list);
	}
	pmap_init_pv_table();

	vm_initialized = 1;
}

static SYSCTL_NODE(_vm_pmap, OID_AUTO, l1, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
    "L1 (1GB/64GB) page mapping counters");

static COUNTER_U64_DEFINE_EARLY(pmap_l1_demotions);
SYSCTL_COUNTER_U64(_vm_pmap_l1, OID_AUTO, demotions, CTLFLAG_RD,
    &pmap_l1_demotions, "L1 (1GB/64GB) page demotions");

SYSCTL_BOOL(_vm_pmap_l1, OID_AUTO, supported, CTLFLAG_RD, &pmap_l1_supported,
    0, "L1 blocks are supported");

static SYSCTL_NODE(_vm_pmap, OID_AUTO, l2c, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
    "L2C (32MB/1GB) page mapping counters");

static COUNTER_U64_DEFINE_EARLY(pmap_l2c_demotions);
SYSCTL_COUNTER_U64(_vm_pmap_l2c, OID_AUTO, demotions, CTLFLAG_RD,
    &pmap_l2c_demotions, "L2C (32MB/1GB) page demotions");

static SYSCTL_NODE(_vm_pmap, OID_AUTO, l2, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
    "2MB page mapping counters");

static u_long pmap_l2_demotions;
SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, demotions, CTLFLAG_RD,
    &pmap_l2_demotions, 0, "2MB page demotions");

static u_long pmap_l2_mappings;
SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, mappings, CTLFLAG_RD,
    &pmap_l2_mappings, 0, "2MB page mappings");

static u_long pmap_l2_p_failures;
SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, p_failures, CTLFLAG_RD,
    &pmap_l2_p_failures, 0, "2MB page promotion failures");

static u_long pmap_l2_promotions;
SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, promotions, CTLFLAG_RD,
    &pmap_l2_promotions, 0, "2MB page promotions");

static SYSCTL_NODE(_vm_pmap, OID_AUTO, l3c, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
    "L3C (64KB/2MB) page mapping counters");

static COUNTER_U64_DEFINE_EARLY(pmap_l3c_demotions);
SYSCTL_COUNTER_U64(_vm_pmap_l3c, OID_AUTO, demotions, CTLFLAG_RD,
    &pmap_l3c_demotions, "L3C (64KB/2MB) page demotions");

static COUNTER_U64_DEFINE_EARLY(pmap_l3c_mappings);
SYSCTL_COUNTER_U64(_vm_pmap_l3c, OID_AUTO, mappings, CTLFLAG_RD,
    &pmap_l3c_mappings, "L3C (64KB/2MB) page mappings");

static COUNTER_U64_DEFINE_EARLY(pmap_l3c_p_failures);
SYSCTL_COUNTER_U64(_vm_pmap_l3c, OID_AUTO, p_failures, CTLFLAG_RD,
    &pmap_l3c_p_failures, "L3C (64KB/2MB) page promotion failures");

static COUNTER_U64_DEFINE_EARLY(pmap_l3c_promotions);
SYSCTL_COUNTER_U64(_vm_pmap_l3c, OID_AUTO, promotions, CTLFLAG_RD,
    &pmap_l3c_promotions, "L3C (64KB/2MB) page promotions");

/*
 * If the given value for "final_only" is false, then any cached intermediate-
 * level entries, i.e., L{0,1,2}_TABLE entries, are invalidated in addition to
 * any cached final-level entry, i.e., either an L{1,2}_BLOCK or L3_PAGE entry.
 * Otherwise, just the cached final-level entry is invalidated.
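 *
 * In terms of the TLBI instructions used below, "final only" maps to the
 * last-level variants (vaale1is/vale1is) and the non-final case to the
 * all-level variants (vaae1is/vae1is).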
1815 */ 1816 static __inline void 1817 pmap_s1_invalidate_kernel(uint64_t r, bool final_only) 1818 { 1819 if (final_only) 1820 __asm __volatile("tlbi vaale1is, %0" : : "r" (r)); 1821 else 1822 __asm __volatile("tlbi vaae1is, %0" : : "r" (r)); 1823 } 1824 1825 static __inline void 1826 pmap_s1_invalidate_user(uint64_t r, bool final_only) 1827 { 1828 if (final_only) 1829 __asm __volatile("tlbi vale1is, %0" : : "r" (r)); 1830 else 1831 __asm __volatile("tlbi vae1is, %0" : : "r" (r)); 1832 } 1833 1834 /* 1835 * Invalidates any cached final- and optionally intermediate-level TLB entries 1836 * for the specified virtual address in the given virtual address space. 1837 */ 1838 static __inline void 1839 pmap_s1_invalidate_page(pmap_t pmap, vm_offset_t va, bool final_only) 1840 { 1841 uint64_t r; 1842 1843 PMAP_ASSERT_STAGE1(pmap); 1844 1845 dsb(ishst); 1846 r = TLBI_VA(va); 1847 if (pmap == kernel_pmap) { 1848 pmap_s1_invalidate_kernel(r, final_only); 1849 } else { 1850 r |= ASID_TO_OPERAND(COOKIE_TO_ASID(pmap->pm_cookie)); 1851 pmap_s1_invalidate_user(r, final_only); 1852 } 1853 dsb(ish); 1854 isb(); 1855 } 1856 1857 static __inline void 1858 pmap_s2_invalidate_page(pmap_t pmap, vm_offset_t va, bool final_only) 1859 { 1860 PMAP_ASSERT_STAGE2(pmap); 1861 MPASS(pmap_stage2_invalidate_range != NULL); 1862 pmap_stage2_invalidate_range(pmap_to_ttbr0(pmap), va, va + PAGE_SIZE, 1863 final_only); 1864 } 1865 1866 static __inline void 1867 pmap_invalidate_page(pmap_t pmap, vm_offset_t va, bool final_only) 1868 { 1869 if (pmap->pm_stage == PM_STAGE1) 1870 pmap_s1_invalidate_page(pmap, va, final_only); 1871 else 1872 pmap_s2_invalidate_page(pmap, va, final_only); 1873 } 1874 1875 /* 1876 * Use stride L{1,2}_SIZE when invalidating the TLB entries for L{1,2}_BLOCK 1877 * mappings. Otherwise, use stride L3_SIZE. 1878 */ 1879 static __inline void 1880 pmap_s1_invalidate_strided(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, 1881 vm_offset_t stride, bool final_only) 1882 { 1883 uint64_t end, r, start; 1884 1885 PMAP_ASSERT_STAGE1(pmap); 1886 1887 dsb(ishst); 1888 if (pmap == kernel_pmap) { 1889 start = TLBI_VA(sva); 1890 end = TLBI_VA(eva); 1891 for (r = start; r < end; r += TLBI_VA(stride)) 1892 pmap_s1_invalidate_kernel(r, final_only); 1893 } else { 1894 start = end = ASID_TO_OPERAND(COOKIE_TO_ASID(pmap->pm_cookie)); 1895 start |= TLBI_VA(sva); 1896 end |= TLBI_VA(eva); 1897 for (r = start; r < end; r += TLBI_VA(stride)) 1898 pmap_s1_invalidate_user(r, final_only); 1899 } 1900 dsb(ish); 1901 isb(); 1902 } 1903 1904 /* 1905 * Invalidates any cached final- and optionally intermediate-level TLB entries 1906 * for the specified virtual address range in the given virtual address space. 
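 *
 * The strided helper above issues one TLBI per "stride" within a single
 * dsb(ishst)/dsb(ish) pair, so a range invalidation pays the barrier cost
 * only once regardless of how many pages are covered.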
1907 */ 1908 static __inline void 1909 pmap_s1_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, 1910 bool final_only) 1911 { 1912 pmap_s1_invalidate_strided(pmap, sva, eva, L3_SIZE, final_only); 1913 } 1914 1915 static __inline void 1916 pmap_s2_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, 1917 bool final_only) 1918 { 1919 PMAP_ASSERT_STAGE2(pmap); 1920 MPASS(pmap_stage2_invalidate_range != NULL); 1921 pmap_stage2_invalidate_range(pmap_to_ttbr0(pmap), sva, eva, final_only); 1922 } 1923 1924 static __inline void 1925 pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, 1926 bool final_only) 1927 { 1928 if (pmap->pm_stage == PM_STAGE1) 1929 pmap_s1_invalidate_range(pmap, sva, eva, final_only); 1930 else 1931 pmap_s2_invalidate_range(pmap, sva, eva, final_only); 1932 } 1933 1934 /* 1935 * Invalidates all cached intermediate- and final-level TLB entries for the 1936 * given virtual address space. 1937 */ 1938 static __inline void 1939 pmap_s1_invalidate_all(pmap_t pmap) 1940 { 1941 uint64_t r; 1942 1943 PMAP_ASSERT_STAGE1(pmap); 1944 1945 dsb(ishst); 1946 if (pmap == kernel_pmap) { 1947 __asm __volatile("tlbi vmalle1is"); 1948 } else { 1949 r = ASID_TO_OPERAND(COOKIE_TO_ASID(pmap->pm_cookie)); 1950 __asm __volatile("tlbi aside1is, %0" : : "r" (r)); 1951 } 1952 dsb(ish); 1953 isb(); 1954 } 1955 1956 static __inline void 1957 pmap_s2_invalidate_all(pmap_t pmap) 1958 { 1959 PMAP_ASSERT_STAGE2(pmap); 1960 MPASS(pmap_stage2_invalidate_all != NULL); 1961 pmap_stage2_invalidate_all(pmap_to_ttbr0(pmap)); 1962 } 1963 1964 static __inline void 1965 pmap_invalidate_all(pmap_t pmap) 1966 { 1967 if (pmap->pm_stage == PM_STAGE1) 1968 pmap_s1_invalidate_all(pmap); 1969 else 1970 pmap_s2_invalidate_all(pmap); 1971 } 1972 1973 /* 1974 * Routine: pmap_extract 1975 * Function: 1976 * Extract the physical page address associated 1977 * with the given map/virtual_address pair. 1978 */ 1979 vm_paddr_t 1980 pmap_extract(pmap_t pmap, vm_offset_t va) 1981 { 1982 pt_entry_t *pte, tpte; 1983 vm_paddr_t pa; 1984 int lvl; 1985 1986 pa = 0; 1987 PMAP_LOCK(pmap); 1988 /* 1989 * Find the block or page map for this virtual address. pmap_pte 1990 * will return either a valid block/page entry, or NULL. 1991 */ 1992 pte = pmap_pte(pmap, va, &lvl); 1993 if (pte != NULL) { 1994 tpte = pmap_load(pte); 1995 pa = PTE_TO_PHYS(tpte); 1996 switch(lvl) { 1997 case 1: 1998 PMAP_ASSERT_L1_BLOCKS_SUPPORTED; 1999 KASSERT((tpte & ATTR_DESCR_MASK) == L1_BLOCK, 2000 ("pmap_extract: Invalid L1 pte found: %lx", 2001 tpte & ATTR_DESCR_MASK)); 2002 pa |= (va & L1_OFFSET); 2003 break; 2004 case 2: 2005 KASSERT((tpte & ATTR_DESCR_MASK) == L2_BLOCK, 2006 ("pmap_extract: Invalid L2 pte found: %lx", 2007 tpte & ATTR_DESCR_MASK)); 2008 pa |= (va & L2_OFFSET); 2009 break; 2010 case 3: 2011 KASSERT((tpte & ATTR_DESCR_MASK) == L3_PAGE, 2012 ("pmap_extract: Invalid L3 pte found: %lx", 2013 tpte & ATTR_DESCR_MASK)); 2014 pa |= (va & L3_OFFSET); 2015 break; 2016 } 2017 } 2018 PMAP_UNLOCK(pmap); 2019 return (pa); 2020 } 2021 2022 /* 2023 * Routine: pmap_extract_and_hold 2024 * Function: 2025 * Atomically extract and hold the physical page 2026 * with the given pmap and virtual address pair 2027 * if that mapping permits the given protection. 
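 *
 *	On success the page has been wired via vm_page_wire_mapped(), so the
 *	caller is responsible for releasing that wiring.  An illustrative
 *	(hypothetical) caller might look like:
 *
 *		m = pmap_extract_and_hold(pmap, va, VM_PROT_WRITE);
 *		if (m != NULL) {
 *			... use the page ...
 *			vm_page_unwire(m, PQ_ACTIVE);
 *		}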
2028 */ 2029 vm_page_t 2030 pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot) 2031 { 2032 pt_entry_t *pte, tpte; 2033 vm_offset_t off; 2034 vm_page_t m; 2035 int lvl; 2036 bool use; 2037 2038 m = NULL; 2039 PMAP_LOCK(pmap); 2040 pte = pmap_pte(pmap, va, &lvl); 2041 if (pte != NULL) { 2042 tpte = pmap_load(pte); 2043 2044 KASSERT(lvl > 0 && lvl <= 3, 2045 ("pmap_extract_and_hold: Invalid level %d", lvl)); 2046 /* 2047 * Check that the pte is either a L3 page, or a L1 or L2 block 2048 * entry. We can assume L1_BLOCK == L2_BLOCK. 2049 */ 2050 KASSERT((lvl == 3 && (tpte & ATTR_DESCR_MASK) == L3_PAGE) || 2051 (lvl < 3 && (tpte & ATTR_DESCR_MASK) == L1_BLOCK), 2052 ("pmap_extract_and_hold: Invalid pte at L%d: %lx", lvl, 2053 tpte & ATTR_DESCR_MASK)); 2054 2055 use = false; 2056 if ((prot & VM_PROT_WRITE) == 0) 2057 use = true; 2058 else if (pmap->pm_stage == PM_STAGE1 && 2059 (tpte & ATTR_S1_AP_RW_BIT) == ATTR_S1_AP(ATTR_S1_AP_RW)) 2060 use = true; 2061 else if (pmap->pm_stage == PM_STAGE2 && 2062 ((tpte & ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE)) == 2063 ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE))) 2064 use = true; 2065 2066 if (use) { 2067 switch (lvl) { 2068 case 1: 2069 off = va & L1_OFFSET; 2070 break; 2071 case 2: 2072 off = va & L2_OFFSET; 2073 break; 2074 case 3: 2075 default: 2076 off = 0; 2077 } 2078 m = PHYS_TO_VM_PAGE(PTE_TO_PHYS(tpte) | off); 2079 if (m != NULL && !vm_page_wire_mapped(m)) 2080 m = NULL; 2081 } 2082 } 2083 PMAP_UNLOCK(pmap); 2084 return (m); 2085 } 2086 2087 /* 2088 * Walks the page tables to translate a kernel virtual address to a 2089 * physical address. Returns true if the kva is valid and stores the 2090 * physical address in pa if it is not NULL. 2091 * 2092 * See the comment above data_abort() for the rationale for specifying 2093 * NO_PERTHREAD_SSP here. 2094 */ 2095 bool NO_PERTHREAD_SSP 2096 pmap_klookup(vm_offset_t va, vm_paddr_t *pa) 2097 { 2098 pt_entry_t *pte, tpte; 2099 register_t intr; 2100 uint64_t par; 2101 2102 /* 2103 * Disable interrupts so we don't get interrupted between asking 2104 * for address translation, and getting the result back. 2105 */ 2106 intr = intr_disable(); 2107 par = arm64_address_translate_s1e1r(va); 2108 intr_restore(intr); 2109 2110 if (PAR_SUCCESS(par)) { 2111 if (pa != NULL) 2112 *pa = (par & PAR_PA_MASK) | (va & PAR_LOW_MASK); 2113 return (true); 2114 } 2115 2116 /* 2117 * Fall back to walking the page table. The address translation 2118 * instruction may fail when the page is in a break-before-make 2119 * sequence. As we only clear the valid bit in said sequence we 2120 * can walk the page table to find the physical address. 2121 */ 2122 2123 pte = pmap_l1(kernel_pmap, va); 2124 if (pte == NULL) 2125 return (false); 2126 2127 /* 2128 * A concurrent pmap_update_entry() will clear the entry's valid bit 2129 * but leave the rest of the entry unchanged. Therefore, we treat a 2130 * non-zero entry as being valid, and we ignore the valid bit when 2131 * determining whether the entry maps a block, page, or table. 
2132 */ 2133 tpte = pmap_load(pte); 2134 if (tpte == 0) 2135 return (false); 2136 if ((tpte & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_BLOCK) { 2137 if (pa != NULL) 2138 *pa = PTE_TO_PHYS(tpte) | (va & L1_OFFSET); 2139 return (true); 2140 } 2141 pte = pmap_l1_to_l2(&tpte, va); 2142 tpte = pmap_load(pte); 2143 if (tpte == 0) 2144 return (false); 2145 if ((tpte & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_BLOCK) { 2146 if (pa != NULL) 2147 *pa = PTE_TO_PHYS(tpte) | (va & L2_OFFSET); 2148 return (true); 2149 } 2150 pte = pmap_l2_to_l3(&tpte, va); 2151 tpte = pmap_load(pte); 2152 if (tpte == 0) 2153 return (false); 2154 if (pa != NULL) 2155 *pa = PTE_TO_PHYS(tpte) | (va & L3_OFFSET); 2156 return (true); 2157 } 2158 2159 /* 2160 * Routine: pmap_kextract 2161 * Function: 2162 * Extract the physical page address associated with the given kernel 2163 * virtual address. 2164 */ 2165 vm_paddr_t 2166 pmap_kextract(vm_offset_t va) 2167 { 2168 vm_paddr_t pa; 2169 2170 if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) 2171 return (DMAP_TO_PHYS(va)); 2172 2173 if (pmap_klookup(va, &pa) == false) 2174 return (0); 2175 return (pa); 2176 } 2177 2178 /*************************************************** 2179 * Low level mapping routines..... 2180 ***************************************************/ 2181 2182 void 2183 pmap_kenter(vm_offset_t sva, vm_size_t size, vm_paddr_t pa, int mode) 2184 { 2185 pd_entry_t *pde; 2186 pt_entry_t attr, old_l3e, *pte; 2187 vm_offset_t va; 2188 vm_page_t mpte; 2189 int error, lvl; 2190 2191 KASSERT((pa & L3_OFFSET) == 0, 2192 ("pmap_kenter: Invalid physical address")); 2193 KASSERT((sva & L3_OFFSET) == 0, 2194 ("pmap_kenter: Invalid virtual address")); 2195 KASSERT((size & PAGE_MASK) == 0, 2196 ("pmap_kenter: Mapping is not page-sized")); 2197 2198 attr = ATTR_AF | pmap_sh_attr | ATTR_S1_AP(ATTR_S1_AP_RW) | 2199 ATTR_S1_XN | ATTR_KERN_GP | ATTR_S1_IDX(mode); 2200 old_l3e = 0; 2201 va = sva; 2202 while (size != 0) { 2203 pde = pmap_pde(kernel_pmap, va, &lvl); 2204 KASSERT(pde != NULL, 2205 ("pmap_kenter: Invalid page entry, va: 0x%lx", va)); 2206 KASSERT(lvl == 2, ("pmap_kenter: Invalid level %d", lvl)); 2207 2208 /* 2209 * If we have an aligned, contiguous chunk of L2_SIZE, try 2210 * to create an L2_BLOCK mapping. 2211 */ 2212 if ((va & L2_OFFSET) == 0 && size >= L2_SIZE && 2213 (pa & L2_OFFSET) == 0 && vm_initialized) { 2214 mpte = PTE_TO_VM_PAGE(pmap_load(pde)); 2215 KASSERT(pmap_every_pte_zero(VM_PAGE_TO_PHYS(mpte)), 2216 ("pmap_kenter: Unexpected mapping")); 2217 PMAP_LOCK(kernel_pmap); 2218 error = pmap_insert_pt_page(kernel_pmap, mpte, false, 2219 false); 2220 if (error == 0) { 2221 attr &= ~ATTR_CONTIGUOUS; 2222 2223 /* 2224 * Although the page table page "mpte" should 2225 * be devoid of mappings, the TLB might hold 2226 * intermediate entries that reference it, so 2227 * we perform a single-page invalidation. 2228 */ 2229 pmap_update_entry(kernel_pmap, pde, 2230 PHYS_TO_PTE(pa) | attr | L2_BLOCK, va, 2231 PAGE_SIZE); 2232 } 2233 PMAP_UNLOCK(kernel_pmap); 2234 if (error == 0) { 2235 va += L2_SIZE; 2236 pa += L2_SIZE; 2237 size -= L2_SIZE; 2238 continue; 2239 } 2240 } 2241 2242 /* 2243 * If we have an aligned, contiguous chunk of L3C_ENTRIES 2244 * L3 pages, set the contiguous bit within each PTE so that 2245 * the chunk can be cached using only one TLB entry. 
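		 *
		 * For example, with 4 KB base pages L3C_SIZE is 64 KB, i.e.,
		 * L3C_ENTRIES is 16, matching the "L3C (64KB/2MB)" sysctl
		 * labels above, so a 64 KB-aligned run of physical memory
		 * occupies a single TLB entry instead of 16.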
2246 */ 2247 if ((va & L3C_OFFSET) == 0 && (pa & L3C_OFFSET) == 0) { 2248 if (size >= L3C_SIZE) 2249 attr |= ATTR_CONTIGUOUS; 2250 else 2251 attr &= ~ATTR_CONTIGUOUS; 2252 } 2253 2254 pte = pmap_l2_to_l3(pde, va); 2255 old_l3e |= pmap_load_store(pte, PHYS_TO_PTE(pa) | attr | 2256 L3_PAGE); 2257 2258 va += PAGE_SIZE; 2259 pa += PAGE_SIZE; 2260 size -= PAGE_SIZE; 2261 } 2262 if ((old_l3e & ATTR_DESCR_VALID) != 0) 2263 pmap_s1_invalidate_range(kernel_pmap, sva, va, true); 2264 else { 2265 /* 2266 * Because the old entries were invalid and the new mappings 2267 * are not executable, an isb is not required. 2268 */ 2269 dsb(ishst); 2270 } 2271 } 2272 2273 void 2274 pmap_kenter_device(vm_offset_t sva, vm_size_t size, vm_paddr_t pa) 2275 { 2276 2277 pmap_kenter(sva, size, pa, VM_MEMATTR_DEVICE); 2278 } 2279 2280 /* 2281 * Remove a page from the kernel pagetables. 2282 */ 2283 void 2284 pmap_kremove(vm_offset_t va) 2285 { 2286 pt_entry_t *pte; 2287 2288 pte = pmap_pte_exists(kernel_pmap, va, 3, __func__); 2289 KASSERT((pmap_load(pte) & ATTR_CONTIGUOUS) == 0, 2290 ("pmap_kremove: unexpected ATTR_CONTIGUOUS")); 2291 pmap_clear(pte); 2292 pmap_s1_invalidate_page(kernel_pmap, va, true); 2293 } 2294 2295 /* 2296 * Remove the specified range of mappings from the kernel address space. 2297 * 2298 * Should only be applied to mappings that were created by pmap_kenter() or 2299 * pmap_kenter_device(). Nothing about this function is actually specific 2300 * to device mappings. 2301 */ 2302 void 2303 pmap_kremove_device(vm_offset_t sva, vm_size_t size) 2304 { 2305 pt_entry_t *ptep, *ptep_end; 2306 vm_offset_t va; 2307 int lvl; 2308 2309 KASSERT((sva & L3_OFFSET) == 0, 2310 ("pmap_kremove_device: Invalid virtual address")); 2311 KASSERT((size & PAGE_MASK) == 0, 2312 ("pmap_kremove_device: Mapping is not page-sized")); 2313 2314 va = sva; 2315 while (size != 0) { 2316 ptep = pmap_pte(kernel_pmap, va, &lvl); 2317 KASSERT(ptep != NULL, ("Invalid page table, va: 0x%lx", va)); 2318 switch (lvl) { 2319 case 2: 2320 KASSERT((va & L2_OFFSET) == 0, 2321 ("Unaligned virtual address")); 2322 KASSERT(size >= L2_SIZE, ("Insufficient size")); 2323 2324 if (va != sva) { 2325 pmap_s1_invalidate_range(kernel_pmap, sva, va, 2326 true); 2327 } 2328 pmap_clear(ptep); 2329 pmap_s1_invalidate_page(kernel_pmap, va, true); 2330 PMAP_LOCK(kernel_pmap); 2331 pmap_remove_kernel_l2(kernel_pmap, ptep, va); 2332 PMAP_UNLOCK(kernel_pmap); 2333 2334 va += L2_SIZE; 2335 sva = va; 2336 size -= L2_SIZE; 2337 break; 2338 case 3: 2339 if ((pmap_load(ptep) & ATTR_CONTIGUOUS) != 0) { 2340 KASSERT((va & L3C_OFFSET) == 0, 2341 ("Unaligned L3C virtual address")); 2342 KASSERT(size >= L3C_SIZE, 2343 ("Insufficient L3C size")); 2344 2345 ptep_end = ptep + L3C_ENTRIES; 2346 for (; ptep < ptep_end; ptep++) 2347 pmap_clear(ptep); 2348 2349 va += L3C_SIZE; 2350 size -= L3C_SIZE; 2351 break; 2352 } 2353 pmap_clear(ptep); 2354 2355 va += PAGE_SIZE; 2356 size -= PAGE_SIZE; 2357 break; 2358 default: 2359 __assert_unreachable(); 2360 break; 2361 } 2362 } 2363 if (va != sva) 2364 pmap_s1_invalidate_range(kernel_pmap, sva, va, true); 2365 } 2366 2367 /* 2368 * Used to map a range of physical addresses into kernel 2369 * virtual address space. 2370 * 2371 * The value passed in '*virt' is a suggested virtual address for 2372 * the mapping. Architectures which can support a direct-mapped 2373 * physical to virtual region can return the appropriate address 2374 * within that region, leaving '*virt' unchanged. 
Other 2375 * architectures should map the pages starting at '*virt' and 2376 * update '*virt' with the first usable address after the mapped 2377 * region. 2378 */ 2379 vm_offset_t 2380 pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot) 2381 { 2382 return PHYS_TO_DMAP(start); 2383 } 2384 2385 /* 2386 * Add a list of wired pages to the kva 2387 * this routine is only used for temporary 2388 * kernel mappings that do not need to have 2389 * page modification or references recorded. 2390 * Note that old mappings are simply written 2391 * over. The page *must* be wired. 2392 * Note: SMP coherent. Uses a ranged shootdown IPI. 2393 */ 2394 void 2395 pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count) 2396 { 2397 pd_entry_t *pde; 2398 pt_entry_t attr, old_l3e, *pte; 2399 vm_offset_t va; 2400 vm_page_t m; 2401 int i, lvl; 2402 2403 old_l3e = 0; 2404 va = sva; 2405 for (i = 0; i < count; i++) { 2406 pde = pmap_pde(kernel_pmap, va, &lvl); 2407 KASSERT(pde != NULL, 2408 ("pmap_qenter: Invalid page entry, va: 0x%lx", va)); 2409 KASSERT(lvl == 2, 2410 ("pmap_qenter: Invalid level %d", lvl)); 2411 2412 m = ma[i]; 2413 attr = ATTR_AF | pmap_sh_attr | 2414 ATTR_S1_AP(ATTR_S1_AP_RW) | ATTR_S1_XN | 2415 ATTR_KERN_GP | ATTR_S1_IDX(m->md.pv_memattr) | L3_PAGE; 2416 pte = pmap_l2_to_l3(pde, va); 2417 old_l3e |= pmap_load_store(pte, VM_PAGE_TO_PTE(m) | attr); 2418 2419 va += L3_SIZE; 2420 } 2421 if ((old_l3e & ATTR_DESCR_VALID) != 0) 2422 pmap_s1_invalidate_range(kernel_pmap, sva, va, true); 2423 else { 2424 /* 2425 * Because the old entries were invalid and the new mappings 2426 * are not executable, an isb is not required. 2427 */ 2428 dsb(ishst); 2429 } 2430 } 2431 2432 /* 2433 * This routine tears out page mappings from the 2434 * kernel -- it is meant only for temporary mappings. 2435 */ 2436 void 2437 pmap_qremove(vm_offset_t sva, int count) 2438 { 2439 pt_entry_t *pte; 2440 vm_offset_t va; 2441 2442 KASSERT(ADDR_IS_CANONICAL(sva), 2443 ("%s: Address not in canonical form: %lx", __func__, sva)); 2444 KASSERT(ADDR_IS_KERNEL(sva), ("usermode va %lx", sva)); 2445 2446 va = sva; 2447 while (count-- > 0) { 2448 pte = pmap_pte_exists(kernel_pmap, va, 3, NULL); 2449 if (pte != NULL) { 2450 pmap_clear(pte); 2451 } 2452 2453 va += PAGE_SIZE; 2454 } 2455 pmap_s1_invalidate_range(kernel_pmap, sva, va, true); 2456 } 2457 2458 /*************************************************** 2459 * Page table page management routines..... 2460 ***************************************************/ 2461 /* 2462 * Schedule the specified unused page table page to be freed. Specifically, 2463 * add the page to the specified list of pages that will be released to the 2464 * physical memory manager after the TLB has been updated. 2465 */ 2466 static __inline void 2467 pmap_add_delayed_free_list(vm_page_t m, struct spglist *free, bool set_PG_ZERO) 2468 { 2469 2470 if (set_PG_ZERO) 2471 m->flags |= PG_ZERO; 2472 else 2473 m->flags &= ~PG_ZERO; 2474 SLIST_INSERT_HEAD(free, m, plinks.s.ss); 2475 } 2476 2477 /* 2478 * Decrements a page table page's reference count, which is used to record the 2479 * number of valid page table entries within the page. If the reference count 2480 * drops to zero, then the page table page is unmapped. Returns true if the 2481 * page table page was unmapped and false otherwise. 
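 *
 * Each valid entry installed in a page table page holds a reference on that
 * page.  When the count drops to zero, _pmap_unwire_l3() unlinks the page
 * from its parent table, drops the parent's reference in turn, and queues
 * the page on "free" so that it is only returned to the physical memory
 * allocator after the TLB shootdown has completed.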
2482 */ 2483 static inline bool 2484 pmap_unwire_l3(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free) 2485 { 2486 2487 --m->ref_count; 2488 if (m->ref_count == 0) { 2489 _pmap_unwire_l3(pmap, va, m, free); 2490 return (true); 2491 } else 2492 return (false); 2493 } 2494 2495 static void 2496 _pmap_unwire_l3(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free) 2497 { 2498 2499 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2500 /* 2501 * unmap the page table page 2502 */ 2503 if (m->pindex >= (NUL2E + NUL1E)) { 2504 /* l1 page */ 2505 pd_entry_t *l0; 2506 2507 l0 = pmap_l0(pmap, va); 2508 pmap_clear(l0); 2509 } else if (m->pindex >= NUL2E) { 2510 /* l2 page */ 2511 pd_entry_t *l1; 2512 2513 l1 = pmap_l1(pmap, va); 2514 pmap_clear(l1); 2515 } else { 2516 /* l3 page */ 2517 pd_entry_t *l2; 2518 2519 l2 = pmap_l2(pmap, va); 2520 pmap_clear(l2); 2521 } 2522 pmap_resident_count_dec(pmap, 1); 2523 if (m->pindex < NUL2E) { 2524 /* We just released an l3, unhold the matching l2 */ 2525 pd_entry_t *l1, tl1; 2526 vm_page_t l2pg; 2527 2528 l1 = pmap_l1(pmap, va); 2529 tl1 = pmap_load(l1); 2530 l2pg = PTE_TO_VM_PAGE(tl1); 2531 pmap_unwire_l3(pmap, va, l2pg, free); 2532 } else if (m->pindex < (NUL2E + NUL1E)) { 2533 /* We just released an l2, unhold the matching l1 */ 2534 pd_entry_t *l0, tl0; 2535 vm_page_t l1pg; 2536 2537 l0 = pmap_l0(pmap, va); 2538 tl0 = pmap_load(l0); 2539 l1pg = PTE_TO_VM_PAGE(tl0); 2540 pmap_unwire_l3(pmap, va, l1pg, free); 2541 } 2542 pmap_invalidate_page(pmap, va, false); 2543 2544 /* 2545 * Put page on a list so that it is released after 2546 * *ALL* TLB shootdown is done 2547 */ 2548 pmap_add_delayed_free_list(m, free, true); 2549 } 2550 2551 /* 2552 * After removing a page table entry, this routine is used to 2553 * conditionally free the page, and manage the reference count. 2554 */ 2555 static int 2556 pmap_unuse_pt(pmap_t pmap, vm_offset_t va, pd_entry_t ptepde, 2557 struct spglist *free) 2558 { 2559 vm_page_t mpte; 2560 2561 KASSERT(ADDR_IS_CANONICAL(va), 2562 ("%s: Address not in canonical form: %lx", __func__, va)); 2563 if (ADDR_IS_KERNEL(va)) 2564 return (0); 2565 KASSERT(ptepde != 0, ("pmap_unuse_pt: ptepde != 0")); 2566 mpte = PTE_TO_VM_PAGE(ptepde); 2567 return (pmap_unwire_l3(pmap, va, mpte, free)); 2568 } 2569 2570 /* 2571 * Release a page table page reference after a failed attempt to create a 2572 * mapping. 
2573 */ 2574 static void 2575 pmap_abort_ptp(pmap_t pmap, vm_offset_t va, vm_page_t mpte) 2576 { 2577 struct spglist free; 2578 2579 SLIST_INIT(&free); 2580 if (pmap_unwire_l3(pmap, va, mpte, &free)) 2581 vm_page_free_pages_toq(&free, true); 2582 } 2583 2584 void 2585 pmap_pinit0(pmap_t pmap) 2586 { 2587 2588 PMAP_LOCK_INIT(pmap); 2589 bzero(&pmap->pm_stats, sizeof(pmap->pm_stats)); 2590 pmap->pm_l0_paddr = READ_SPECIALREG(ttbr0_el1); 2591 pmap->pm_l0 = (pd_entry_t *)PHYS_TO_DMAP(pmap->pm_l0_paddr); 2592 TAILQ_INIT(&pmap->pm_pvchunk); 2593 vm_radix_init(&pmap->pm_root); 2594 pmap->pm_cookie = COOKIE_FROM(ASID_RESERVED_FOR_PID_0, INT_MIN); 2595 pmap->pm_stage = PM_STAGE1; 2596 pmap->pm_levels = 4; 2597 pmap->pm_ttbr = pmap->pm_l0_paddr; 2598 pmap->pm_asid_set = &asids; 2599 pmap->pm_bti = NULL; 2600 2601 PCPU_SET(curpmap, pmap); 2602 } 2603 2604 int 2605 pmap_pinit_stage(pmap_t pmap, enum pmap_stage stage, int levels) 2606 { 2607 vm_page_t m; 2608 2609 /* 2610 * allocate the l0 page 2611 */ 2612 m = vm_page_alloc_noobj(VM_ALLOC_WAITOK | VM_ALLOC_WIRED | 2613 VM_ALLOC_ZERO); 2614 pmap->pm_l0_paddr = VM_PAGE_TO_PHYS(m); 2615 pmap->pm_l0 = (pd_entry_t *)PHYS_TO_DMAP(pmap->pm_l0_paddr); 2616 2617 TAILQ_INIT(&pmap->pm_pvchunk); 2618 vm_radix_init(&pmap->pm_root); 2619 bzero(&pmap->pm_stats, sizeof(pmap->pm_stats)); 2620 pmap->pm_cookie = COOKIE_FROM(-1, INT_MAX); 2621 2622 MPASS(levels == 3 || levels == 4); 2623 pmap->pm_levels = levels; 2624 pmap->pm_stage = stage; 2625 pmap->pm_bti = NULL; 2626 switch (stage) { 2627 case PM_STAGE1: 2628 pmap->pm_asid_set = &asids; 2629 if (pmap_bti_support) { 2630 pmap->pm_bti = malloc(sizeof(struct rangeset), M_DEVBUF, 2631 M_ZERO | M_WAITOK); 2632 rangeset_init(pmap->pm_bti, bti_dup_range, 2633 bti_free_range, pmap, M_NOWAIT); 2634 } 2635 break; 2636 case PM_STAGE2: 2637 pmap->pm_asid_set = &vmids; 2638 break; 2639 default: 2640 panic("%s: Invalid pmap type %d", __func__, stage); 2641 break; 2642 } 2643 2644 /* XXX Temporarily disable deferred ASID allocation. */ 2645 pmap_alloc_asid(pmap); 2646 2647 /* 2648 * Allocate the level 1 entry to use as the root. This will increase 2649 * the refcount on the level 1 page so it won't be removed until 2650 * pmap_release() is called. 2651 */ 2652 if (pmap->pm_levels == 3) { 2653 PMAP_LOCK(pmap); 2654 m = _pmap_alloc_l3(pmap, NUL2E + NUL1E, NULL); 2655 PMAP_UNLOCK(pmap); 2656 } 2657 pmap->pm_ttbr = VM_PAGE_TO_PHYS(m); 2658 2659 return (1); 2660 } 2661 2662 int 2663 pmap_pinit(pmap_t pmap) 2664 { 2665 2666 return (pmap_pinit_stage(pmap, PM_STAGE1, 4)); 2667 } 2668 2669 /* 2670 * This routine is called if the desired page table page does not exist. 2671 * 2672 * If page table page allocation fails, this routine may sleep before 2673 * returning NULL. It sleeps only if a lock pointer was given. 2674 * 2675 * Note: If a page allocation fails at page table level two or three, 2676 * one or two pages may be held during the wait, only to be released 2677 * afterwards. This conservative approach is easily argued to avoid 2678 * race conditions. 2679 */ 2680 static vm_page_t 2681 _pmap_alloc_l3(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp) 2682 { 2683 vm_page_t m, l1pg, l2pg; 2684 2685 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2686 2687 /* 2688 * Allocate a page table page. 
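	 *
	 * If this fails and a lock pointer was supplied, the locks are
	 * dropped around vm_wait() and NULL is returned; callers such as
	 * pmap_alloc_l2() and pmap_alloc_l3() treat that as "retry", since
	 * the needed page table page may have appeared while sleeping.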
2689 */ 2690 if ((m = vm_page_alloc_noobj(VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) { 2691 if (lockp != NULL) { 2692 RELEASE_PV_LIST_LOCK(lockp); 2693 PMAP_UNLOCK(pmap); 2694 vm_wait(NULL); 2695 PMAP_LOCK(pmap); 2696 } 2697 2698 /* 2699 * Indicate the need to retry. While waiting, the page table 2700 * page may have been allocated. 2701 */ 2702 return (NULL); 2703 } 2704 m->pindex = ptepindex; 2705 2706 /* 2707 * Because of AArch64's weak memory consistency model, we must have a 2708 * barrier here to ensure that the stores for zeroing "m", whether by 2709 * pmap_zero_page() or an earlier function, are visible before adding 2710 * "m" to the page table. Otherwise, a page table walk by another 2711 * processor's MMU could see the mapping to "m" and a stale, non-zero 2712 * PTE within "m". 2713 */ 2714 dmb(ishst); 2715 2716 /* 2717 * Map the pagetable page into the process address space, if 2718 * it isn't already there. 2719 */ 2720 2721 if (ptepindex >= (NUL2E + NUL1E)) { 2722 pd_entry_t *l0p, l0e; 2723 vm_pindex_t l0index; 2724 2725 l0index = ptepindex - (NUL2E + NUL1E); 2726 l0p = &pmap->pm_l0[l0index]; 2727 KASSERT((pmap_load(l0p) & ATTR_DESCR_VALID) == 0, 2728 ("%s: L0 entry %#lx is valid", __func__, pmap_load(l0p))); 2729 l0e = VM_PAGE_TO_PTE(m) | L0_TABLE; 2730 2731 /* 2732 * Mark all kernel memory as not accessible from userspace 2733 * and userspace memory as not executable from the kernel. 2734 * This has been done for the bootstrap L0 entries in 2735 * locore.S. 2736 */ 2737 if (pmap == kernel_pmap) 2738 l0e |= TATTR_UXN_TABLE | TATTR_AP_TABLE_NO_EL0; 2739 else 2740 l0e |= TATTR_PXN_TABLE; 2741 pmap_store(l0p, l0e); 2742 } else if (ptepindex >= NUL2E) { 2743 vm_pindex_t l0index, l1index; 2744 pd_entry_t *l0, *l1; 2745 pd_entry_t tl0; 2746 2747 l1index = ptepindex - NUL2E; 2748 l0index = l1index >> Ln_ENTRIES_SHIFT; 2749 2750 l0 = &pmap->pm_l0[l0index]; 2751 tl0 = pmap_load(l0); 2752 if (tl0 == 0) { 2753 /* recurse for allocating page dir */ 2754 if (_pmap_alloc_l3(pmap, NUL2E + NUL1E + l0index, 2755 lockp) == NULL) { 2756 vm_page_unwire_noq(m); 2757 vm_page_free_zero(m); 2758 return (NULL); 2759 } 2760 } else { 2761 l1pg = PTE_TO_VM_PAGE(tl0); 2762 l1pg->ref_count++; 2763 } 2764 2765 l1 = (pd_entry_t *)PHYS_TO_DMAP(PTE_TO_PHYS(pmap_load(l0))); 2766 l1 = &l1[ptepindex & Ln_ADDR_MASK]; 2767 KASSERT((pmap_load(l1) & ATTR_DESCR_VALID) == 0, 2768 ("%s: L1 entry %#lx is valid", __func__, pmap_load(l1))); 2769 pmap_store(l1, VM_PAGE_TO_PTE(m) | L1_TABLE); 2770 } else { 2771 vm_pindex_t l0index, l1index; 2772 pd_entry_t *l0, *l1, *l2; 2773 pd_entry_t tl0, tl1; 2774 2775 l1index = ptepindex >> Ln_ENTRIES_SHIFT; 2776 l0index = l1index >> Ln_ENTRIES_SHIFT; 2777 2778 l0 = &pmap->pm_l0[l0index]; 2779 tl0 = pmap_load(l0); 2780 if (tl0 == 0) { 2781 /* recurse for allocating page dir */ 2782 if (_pmap_alloc_l3(pmap, NUL2E + l1index, 2783 lockp) == NULL) { 2784 vm_page_unwire_noq(m); 2785 vm_page_free_zero(m); 2786 return (NULL); 2787 } 2788 tl0 = pmap_load(l0); 2789 l1 = (pd_entry_t *)PHYS_TO_DMAP(PTE_TO_PHYS(tl0)); 2790 l1 = &l1[l1index & Ln_ADDR_MASK]; 2791 } else { 2792 l1 = (pd_entry_t *)PHYS_TO_DMAP(PTE_TO_PHYS(tl0)); 2793 l1 = &l1[l1index & Ln_ADDR_MASK]; 2794 tl1 = pmap_load(l1); 2795 if (tl1 == 0) { 2796 /* recurse for allocating page dir */ 2797 if (_pmap_alloc_l3(pmap, NUL2E + l1index, 2798 lockp) == NULL) { 2799 vm_page_unwire_noq(m); 2800 vm_page_free_zero(m); 2801 return (NULL); 2802 } 2803 } else { 2804 l2pg = PTE_TO_VM_PAGE(tl1); 2805 l2pg->ref_count++; 2806 } 2807 } 2808 2809 l2 
= (pd_entry_t *)PHYS_TO_DMAP(PTE_TO_PHYS(pmap_load(l1))); 2810 l2 = &l2[ptepindex & Ln_ADDR_MASK]; 2811 KASSERT((pmap_load(l2) & ATTR_DESCR_VALID) == 0, 2812 ("%s: L2 entry %#lx is valid", __func__, pmap_load(l2))); 2813 pmap_store(l2, VM_PAGE_TO_PTE(m) | L2_TABLE); 2814 } 2815 2816 pmap_resident_count_inc(pmap, 1); 2817 2818 return (m); 2819 } 2820 2821 static pd_entry_t * 2822 pmap_alloc_l2(pmap_t pmap, vm_offset_t va, vm_page_t *l2pgp, 2823 struct rwlock **lockp) 2824 { 2825 pd_entry_t *l1, *l2; 2826 vm_page_t l2pg; 2827 vm_pindex_t l2pindex; 2828 2829 KASSERT(ADDR_IS_CANONICAL(va), 2830 ("%s: Address not in canonical form: %lx", __func__, va)); 2831 2832 retry: 2833 l1 = pmap_l1(pmap, va); 2834 if (l1 != NULL && (pmap_load(l1) & ATTR_DESCR_MASK) == L1_TABLE) { 2835 l2 = pmap_l1_to_l2(l1, va); 2836 if (!ADDR_IS_KERNEL(va)) { 2837 /* Add a reference to the L2 page. */ 2838 l2pg = PTE_TO_VM_PAGE(pmap_load(l1)); 2839 l2pg->ref_count++; 2840 } else 2841 l2pg = NULL; 2842 } else if (!ADDR_IS_KERNEL(va)) { 2843 /* Allocate a L2 page. */ 2844 l2pindex = pmap_l2_pindex(va) >> Ln_ENTRIES_SHIFT; 2845 l2pg = _pmap_alloc_l3(pmap, NUL2E + l2pindex, lockp); 2846 if (l2pg == NULL) { 2847 if (lockp != NULL) 2848 goto retry; 2849 else 2850 return (NULL); 2851 } 2852 l2 = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(l2pg)); 2853 l2 = &l2[pmap_l2_index(va)]; 2854 } else 2855 panic("pmap_alloc_l2: missing page table page for va %#lx", 2856 va); 2857 *l2pgp = l2pg; 2858 return (l2); 2859 } 2860 2861 static vm_page_t 2862 pmap_alloc_l3(pmap_t pmap, vm_offset_t va, struct rwlock **lockp) 2863 { 2864 vm_pindex_t ptepindex; 2865 pd_entry_t *pde, tpde; 2866 #ifdef INVARIANTS 2867 pt_entry_t *pte; 2868 #endif 2869 vm_page_t m; 2870 int lvl; 2871 2872 /* 2873 * Calculate pagetable page index 2874 */ 2875 ptepindex = pmap_l2_pindex(va); 2876 retry: 2877 /* 2878 * Get the page directory entry 2879 */ 2880 pde = pmap_pde(pmap, va, &lvl); 2881 2882 /* 2883 * If the page table page is mapped, we just increment the hold count, 2884 * and activate it. If we get a level 2 pde it will point to a level 3 2885 * table. 2886 */ 2887 switch (lvl) { 2888 case -1: 2889 break; 2890 case 0: 2891 #ifdef INVARIANTS 2892 pte = pmap_l0_to_l1(pde, va); 2893 KASSERT(pmap_load(pte) == 0, 2894 ("pmap_alloc_l3: TODO: l0 superpages")); 2895 #endif 2896 break; 2897 case 1: 2898 #ifdef INVARIANTS 2899 pte = pmap_l1_to_l2(pde, va); 2900 KASSERT(pmap_load(pte) == 0, 2901 ("pmap_alloc_l3: TODO: l1 superpages")); 2902 #endif 2903 break; 2904 case 2: 2905 tpde = pmap_load(pde); 2906 if (tpde != 0) { 2907 m = PTE_TO_VM_PAGE(tpde); 2908 m->ref_count++; 2909 return (m); 2910 } 2911 break; 2912 default: 2913 panic("pmap_alloc_l3: Invalid level %d", lvl); 2914 } 2915 2916 /* 2917 * Here if the pte page isn't mapped, or if it has been deallocated. 2918 */ 2919 m = _pmap_alloc_l3(pmap, ptepindex, lockp); 2920 if (m == NULL && lockp != NULL) 2921 goto retry; 2922 2923 return (m); 2924 } 2925 2926 /*************************************************** 2927 * Pmap allocation/deallocation routines. 2928 ***************************************************/ 2929 2930 /* 2931 * Release any resources held by the given physical map. 2932 * Called when a pmap initialized by pmap_pinit is being released. 2933 * Should only be called if the map contains no valid mappings. 
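 *
 * For a 3-level (stage 2) pmap, pmap_pinit_stage() took an extra reference
 * on the level 1 page that serves as the root, so that reference is dropped
 * here before the level 0 page itself is unwired and freed.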
2934 */ 2935 void 2936 pmap_release(pmap_t pmap) 2937 { 2938 bool rv __diagused; 2939 struct spglist freelist; 2940 struct asid_set *set; 2941 vm_page_t m; 2942 int asid; 2943 2944 if (pmap->pm_levels != 4) { 2945 PMAP_ASSERT_STAGE2(pmap); 2946 KASSERT(pmap->pm_stats.resident_count == 1, 2947 ("pmap_release: pmap resident count %ld != 0", 2948 pmap->pm_stats.resident_count)); 2949 KASSERT((pmap->pm_l0[0] & ATTR_DESCR_VALID) == ATTR_DESCR_VALID, 2950 ("pmap_release: Invalid l0 entry: %lx", pmap->pm_l0[0])); 2951 2952 SLIST_INIT(&freelist); 2953 m = PHYS_TO_VM_PAGE(pmap->pm_ttbr); 2954 PMAP_LOCK(pmap); 2955 rv = pmap_unwire_l3(pmap, 0, m, &freelist); 2956 PMAP_UNLOCK(pmap); 2957 MPASS(rv == true); 2958 vm_page_free_pages_toq(&freelist, true); 2959 } 2960 2961 KASSERT(pmap->pm_stats.resident_count == 0, 2962 ("pmap_release: pmap resident count %ld != 0", 2963 pmap->pm_stats.resident_count)); 2964 KASSERT(vm_radix_is_empty(&pmap->pm_root), 2965 ("pmap_release: pmap has reserved page table page(s)")); 2966 2967 set = pmap->pm_asid_set; 2968 KASSERT(set != NULL, ("%s: NULL asid set", __func__)); 2969 2970 /* 2971 * Allow the ASID to be reused. In stage 2 VMIDs we don't invalidate 2972 * the entries when removing them so rely on a later tlb invalidation. 2973 * this will happen when updating the VMID generation. Because of this 2974 * we don't reuse VMIDs within a generation. 2975 */ 2976 if (pmap->pm_stage == PM_STAGE1) { 2977 mtx_lock_spin(&set->asid_set_mutex); 2978 if (COOKIE_TO_EPOCH(pmap->pm_cookie) == set->asid_epoch) { 2979 asid = COOKIE_TO_ASID(pmap->pm_cookie); 2980 KASSERT(asid >= ASID_FIRST_AVAILABLE && 2981 asid < set->asid_set_size, 2982 ("pmap_release: pmap cookie has out-of-range asid")); 2983 bit_clear(set->asid_set, asid); 2984 } 2985 mtx_unlock_spin(&set->asid_set_mutex); 2986 2987 if (pmap->pm_bti != NULL) { 2988 rangeset_fini(pmap->pm_bti); 2989 free(pmap->pm_bti, M_DEVBUF); 2990 } 2991 } 2992 2993 m = PHYS_TO_VM_PAGE(pmap->pm_l0_paddr); 2994 vm_page_unwire_noq(m); 2995 vm_page_free_zero(m); 2996 } 2997 2998 static int 2999 kvm_size(SYSCTL_HANDLER_ARGS) 3000 { 3001 unsigned long ksize = VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS; 3002 3003 return sysctl_handle_long(oidp, &ksize, 0, req); 3004 } 3005 SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG | CTLFLAG_RD | CTLFLAG_MPSAFE, 3006 0, 0, kvm_size, "LU", 3007 "Size of KVM"); 3008 3009 static int 3010 kvm_free(SYSCTL_HANDLER_ARGS) 3011 { 3012 unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end; 3013 3014 return sysctl_handle_long(oidp, &kfree, 0, req); 3015 } 3016 SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG | CTLFLAG_RD | CTLFLAG_MPSAFE, 3017 0, 0, kvm_free, "LU", 3018 "Amount of KVM free"); 3019 3020 /* 3021 * grow the number of kernel page table entries, if needed 3022 */ 3023 void 3024 pmap_growkernel(vm_offset_t addr) 3025 { 3026 vm_page_t nkpg; 3027 pd_entry_t *l0, *l1, *l2; 3028 3029 mtx_assert(&kernel_map->system_mtx, MA_OWNED); 3030 3031 addr = roundup2(addr, L2_SIZE); 3032 if (addr - 1 >= vm_map_max(kernel_map)) 3033 addr = vm_map_max(kernel_map); 3034 if (kernel_vm_end < addr) { 3035 kasan_shadow_map(kernel_vm_end, addr - kernel_vm_end); 3036 kmsan_shadow_map(kernel_vm_end, addr - kernel_vm_end); 3037 } 3038 while (kernel_vm_end < addr) { 3039 l0 = pmap_l0(kernel_pmap, kernel_vm_end); 3040 KASSERT(pmap_load(l0) != 0, 3041 ("pmap_growkernel: No level 0 kernel entry")); 3042 3043 l1 = pmap_l0_to_l1(l0, kernel_vm_end); 3044 if (pmap_load(l1) == 0) { 3045 /* We need a new PDP entry */ 3046 nkpg = 
vm_page_alloc_noobj(VM_ALLOC_INTERRUPT | 3047 VM_ALLOC_NOFREE | VM_ALLOC_WIRED | VM_ALLOC_ZERO); 3048 if (nkpg == NULL) 3049 panic("pmap_growkernel: no memory to grow kernel"); 3050 nkpg->pindex = pmap_l1_pindex(kernel_vm_end); 3051 /* See the dmb() in _pmap_alloc_l3(). */ 3052 dmb(ishst); 3053 pmap_store(l1, VM_PAGE_TO_PTE(nkpg) | L1_TABLE); 3054 continue; /* try again */ 3055 } 3056 l2 = pmap_l1_to_l2(l1, kernel_vm_end); 3057 if (pmap_load(l2) != 0) { 3058 kernel_vm_end = (kernel_vm_end + L2_SIZE) & ~L2_OFFSET; 3059 if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) { 3060 kernel_vm_end = vm_map_max(kernel_map); 3061 break; 3062 } 3063 continue; 3064 } 3065 3066 nkpg = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT | 3067 VM_ALLOC_NOFREE | VM_ALLOC_WIRED | VM_ALLOC_ZERO); 3068 if (nkpg == NULL) 3069 panic("pmap_growkernel: no memory to grow kernel"); 3070 nkpg->pindex = pmap_l2_pindex(kernel_vm_end); 3071 /* See the dmb() in _pmap_alloc_l3(). */ 3072 dmb(ishst); 3073 pmap_store(l2, VM_PAGE_TO_PTE(nkpg) | L2_TABLE); 3074 3075 kernel_vm_end = (kernel_vm_end + L2_SIZE) & ~L2_OFFSET; 3076 if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) { 3077 kernel_vm_end = vm_map_max(kernel_map); 3078 break; 3079 } 3080 } 3081 } 3082 3083 /*************************************************** 3084 * page management routines. 3085 ***************************************************/ 3086 3087 static const uint64_t pc_freemask[_NPCM] = { 3088 [0 ... _NPCM - 2] = PC_FREEN, 3089 [_NPCM - 1] = PC_FREEL 3090 }; 3091 3092 #ifdef PV_STATS 3093 static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail; 3094 3095 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0, 3096 "Current number of pv entry chunks"); 3097 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0, 3098 "Current number of pv entry chunks allocated"); 3099 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0, 3100 "Current number of pv entry chunks frees"); 3101 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0, 3102 "Number of times tried to get a chunk page but failed."); 3103 3104 static long pv_entry_frees, pv_entry_allocs, pv_entry_count; 3105 static int pv_entry_spare; 3106 3107 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0, 3108 "Current number of pv entry frees"); 3109 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0, 3110 "Current number of pv entry allocs"); 3111 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0, 3112 "Current number of pv entries"); 3113 SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0, 3114 "Current number of spare pv entries"); 3115 #endif 3116 3117 /* 3118 * We are in a serious low memory condition. Resort to 3119 * drastic measures to free some pages so we can allocate 3120 * another pv entry chunk. 3121 * 3122 * Returns NULL if PV entries were reclaimed from the specified pmap. 3123 * 3124 * We do not, however, unmap 2mpages because subsequent accesses will 3125 * allocate per-page pv entries until repromotion occurs, thereby 3126 * exacerbating the shortage of free pv entries. 
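 *
 * On success, the page returned is either the page backing a pv chunk that
 * became entirely free or a recycled page table page; the caller uses it to
 * back a new pv chunk.  A NULL return means that no such page was obtained,
 * typically because the entries were freed in "locked_pmap" itself, and the
 * caller simply retries its allocation.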
3127 */ 3128 static vm_page_t 3129 reclaim_pv_chunk_domain(pmap_t locked_pmap, struct rwlock **lockp, int domain) 3130 { 3131 struct pv_chunks_list *pvc; 3132 struct pv_chunk *pc, *pc_marker, *pc_marker_end; 3133 struct pv_chunk_header pc_marker_b, pc_marker_end_b; 3134 struct md_page *pvh; 3135 pd_entry_t *pde; 3136 pmap_t next_pmap, pmap; 3137 pt_entry_t *pte, tpte; 3138 pv_entry_t pv; 3139 vm_offset_t va; 3140 vm_page_t m, m_pc; 3141 struct spglist free; 3142 uint64_t inuse; 3143 int bit, field, freed, lvl; 3144 3145 PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED); 3146 KASSERT(lockp != NULL, ("reclaim_pv_chunk: lockp is NULL")); 3147 3148 pmap = NULL; 3149 m_pc = NULL; 3150 SLIST_INIT(&free); 3151 bzero(&pc_marker_b, sizeof(pc_marker_b)); 3152 bzero(&pc_marker_end_b, sizeof(pc_marker_end_b)); 3153 pc_marker = (struct pv_chunk *)&pc_marker_b; 3154 pc_marker_end = (struct pv_chunk *)&pc_marker_end_b; 3155 3156 pvc = &pv_chunks[domain]; 3157 mtx_lock(&pvc->pvc_lock); 3158 pvc->active_reclaims++; 3159 TAILQ_INSERT_HEAD(&pvc->pvc_list, pc_marker, pc_lru); 3160 TAILQ_INSERT_TAIL(&pvc->pvc_list, pc_marker_end, pc_lru); 3161 while ((pc = TAILQ_NEXT(pc_marker, pc_lru)) != pc_marker_end && 3162 SLIST_EMPTY(&free)) { 3163 next_pmap = pc->pc_pmap; 3164 if (next_pmap == NULL) { 3165 /* 3166 * The next chunk is a marker. However, it is 3167 * not our marker, so active_reclaims must be 3168 * > 1. Consequently, the next_chunk code 3169 * will not rotate the pv_chunks list. 3170 */ 3171 goto next_chunk; 3172 } 3173 mtx_unlock(&pvc->pvc_lock); 3174 3175 /* 3176 * A pv_chunk can only be removed from the pc_lru list 3177 * when both pvc->pvc_lock is owned and the 3178 * corresponding pmap is locked. 3179 */ 3180 if (pmap != next_pmap) { 3181 if (pmap != NULL && pmap != locked_pmap) 3182 PMAP_UNLOCK(pmap); 3183 pmap = next_pmap; 3184 /* Avoid deadlock and lock recursion. */ 3185 if (pmap > locked_pmap) { 3186 RELEASE_PV_LIST_LOCK(lockp); 3187 PMAP_LOCK(pmap); 3188 mtx_lock(&pvc->pvc_lock); 3189 continue; 3190 } else if (pmap != locked_pmap) { 3191 if (PMAP_TRYLOCK(pmap)) { 3192 mtx_lock(&pvc->pvc_lock); 3193 continue; 3194 } else { 3195 pmap = NULL; /* pmap is not locked */ 3196 mtx_lock(&pvc->pvc_lock); 3197 pc = TAILQ_NEXT(pc_marker, pc_lru); 3198 if (pc == NULL || 3199 pc->pc_pmap != next_pmap) 3200 continue; 3201 goto next_chunk; 3202 } 3203 } 3204 } 3205 3206 /* 3207 * Destroy every non-wired, 4 KB page mapping in the chunk. 
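		 *
		 * Wired mappings are skipped, and a mapping that is part of
		 * an L3C (ATTR_CONTIGUOUS) run is first demoted by
		 * pmap_demote_l3c() so that a single 4 KB PTE can be cleared.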
3208 */ 3209 freed = 0; 3210 for (field = 0; field < _NPCM; field++) { 3211 for (inuse = ~pc->pc_map[field] & pc_freemask[field]; 3212 inuse != 0; inuse &= ~(1UL << bit)) { 3213 bit = ffsl(inuse) - 1; 3214 pv = &pc->pc_pventry[field * 64 + bit]; 3215 va = pv->pv_va; 3216 pde = pmap_pde(pmap, va, &lvl); 3217 if (lvl != 2) 3218 continue; 3219 pte = pmap_l2_to_l3(pde, va); 3220 tpte = pmap_load(pte); 3221 if ((tpte & ATTR_SW_WIRED) != 0) 3222 continue; 3223 if ((tpte & ATTR_CONTIGUOUS) != 0) 3224 (void)pmap_demote_l3c(pmap, pte, va); 3225 tpte = pmap_load_clear(pte); 3226 m = PTE_TO_VM_PAGE(tpte); 3227 if (pmap_pte_dirty(pmap, tpte)) 3228 vm_page_dirty(m); 3229 if ((tpte & ATTR_AF) != 0) { 3230 pmap_s1_invalidate_page(pmap, va, true); 3231 vm_page_aflag_set(m, PGA_REFERENCED); 3232 } 3233 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); 3234 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 3235 m->md.pv_gen++; 3236 if (TAILQ_EMPTY(&m->md.pv_list) && 3237 (m->flags & PG_FICTITIOUS) == 0) { 3238 pvh = page_to_pvh(m); 3239 if (TAILQ_EMPTY(&pvh->pv_list)) { 3240 vm_page_aflag_clear(m, 3241 PGA_WRITEABLE); 3242 } 3243 } 3244 pc->pc_map[field] |= 1UL << bit; 3245 pmap_unuse_pt(pmap, va, pmap_load(pde), &free); 3246 freed++; 3247 } 3248 } 3249 if (freed == 0) { 3250 mtx_lock(&pvc->pvc_lock); 3251 goto next_chunk; 3252 } 3253 /* Every freed mapping is for a 4 KB page. */ 3254 pmap_resident_count_dec(pmap, freed); 3255 PV_STAT(atomic_add_long(&pv_entry_frees, freed)); 3256 PV_STAT(atomic_add_int(&pv_entry_spare, freed)); 3257 PV_STAT(atomic_subtract_long(&pv_entry_count, freed)); 3258 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 3259 if (pc_is_free(pc)) { 3260 PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV)); 3261 PV_STAT(atomic_subtract_int(&pc_chunk_count, 1)); 3262 PV_STAT(atomic_add_int(&pc_chunk_frees, 1)); 3263 /* Entire chunk is free; return it. */ 3264 m_pc = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc)); 3265 dump_drop_page(m_pc->phys_addr); 3266 mtx_lock(&pvc->pvc_lock); 3267 TAILQ_REMOVE(&pvc->pvc_list, pc, pc_lru); 3268 break; 3269 } 3270 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); 3271 mtx_lock(&pvc->pvc_lock); 3272 /* One freed pv entry in locked_pmap is sufficient. */ 3273 if (pmap == locked_pmap) 3274 break; 3275 3276 next_chunk: 3277 TAILQ_REMOVE(&pvc->pvc_list, pc_marker, pc_lru); 3278 TAILQ_INSERT_AFTER(&pvc->pvc_list, pc, pc_marker, pc_lru); 3279 if (pvc->active_reclaims == 1 && pmap != NULL) { 3280 /* 3281 * Rotate the pv chunks list so that we do not 3282 * scan the same pv chunks that could not be 3283 * freed (because they contained a wired 3284 * and/or superpage mapping) on every 3285 * invocation of reclaim_pv_chunk(). 3286 */ 3287 while ((pc = TAILQ_FIRST(&pvc->pvc_list)) != pc_marker){ 3288 MPASS(pc->pc_pmap != NULL); 3289 TAILQ_REMOVE(&pvc->pvc_list, pc, pc_lru); 3290 TAILQ_INSERT_TAIL(&pvc->pvc_list, pc, pc_lru); 3291 } 3292 } 3293 } 3294 TAILQ_REMOVE(&pvc->pvc_list, pc_marker, pc_lru); 3295 TAILQ_REMOVE(&pvc->pvc_list, pc_marker_end, pc_lru); 3296 pvc->active_reclaims--; 3297 mtx_unlock(&pvc->pvc_lock); 3298 if (pmap != NULL && pmap != locked_pmap) 3299 PMAP_UNLOCK(pmap); 3300 if (m_pc == NULL && !SLIST_EMPTY(&free)) { 3301 m_pc = SLIST_FIRST(&free); 3302 SLIST_REMOVE_HEAD(&free, plinks.s.ss); 3303 /* Recycle a freed page table page. 
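		 * Resetting its reference count to 1 lets the caller hand it
		 * straight back out as the backing page for a new pv chunk.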
*/ 3304 m_pc->ref_count = 1; 3305 } 3306 vm_page_free_pages_toq(&free, true); 3307 return (m_pc); 3308 } 3309 3310 static vm_page_t 3311 reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp) 3312 { 3313 vm_page_t m; 3314 int i, domain; 3315 3316 domain = PCPU_GET(domain); 3317 for (i = 0; i < vm_ndomains; i++) { 3318 m = reclaim_pv_chunk_domain(locked_pmap, lockp, domain); 3319 if (m != NULL) 3320 break; 3321 domain = (domain + 1) % vm_ndomains; 3322 } 3323 3324 return (m); 3325 } 3326 3327 /* 3328 * free the pv_entry back to the free list 3329 */ 3330 static void 3331 free_pv_entry(pmap_t pmap, pv_entry_t pv) 3332 { 3333 struct pv_chunk *pc; 3334 int idx, field, bit; 3335 3336 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3337 PV_STAT(atomic_add_long(&pv_entry_frees, 1)); 3338 PV_STAT(atomic_add_int(&pv_entry_spare, 1)); 3339 PV_STAT(atomic_subtract_long(&pv_entry_count, 1)); 3340 pc = pv_to_chunk(pv); 3341 idx = pv - &pc->pc_pventry[0]; 3342 field = idx / 64; 3343 bit = idx % 64; 3344 pc->pc_map[field] |= 1ul << bit; 3345 if (!pc_is_free(pc)) { 3346 /* 98% of the time, pc is already at the head of the list. */ 3347 if (__predict_false(pc != TAILQ_FIRST(&pmap->pm_pvchunk))) { 3348 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 3349 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); 3350 } 3351 return; 3352 } 3353 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 3354 free_pv_chunk(pc); 3355 } 3356 3357 static void 3358 free_pv_chunk_dequeued(struct pv_chunk *pc) 3359 { 3360 vm_page_t m; 3361 3362 PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV)); 3363 PV_STAT(atomic_subtract_int(&pc_chunk_count, 1)); 3364 PV_STAT(atomic_add_int(&pc_chunk_frees, 1)); 3365 /* entire chunk is free, return it */ 3366 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc)); 3367 dump_drop_page(m->phys_addr); 3368 vm_page_unwire_noq(m); 3369 vm_page_free(m); 3370 } 3371 3372 static void 3373 free_pv_chunk(struct pv_chunk *pc) 3374 { 3375 struct pv_chunks_list *pvc; 3376 3377 pvc = &pv_chunks[pc_to_domain(pc)]; 3378 mtx_lock(&pvc->pvc_lock); 3379 TAILQ_REMOVE(&pvc->pvc_list, pc, pc_lru); 3380 mtx_unlock(&pvc->pvc_lock); 3381 free_pv_chunk_dequeued(pc); 3382 } 3383 3384 static void 3385 free_pv_chunk_batch(struct pv_chunklist *batch) 3386 { 3387 struct pv_chunks_list *pvc; 3388 struct pv_chunk *pc, *npc; 3389 int i; 3390 3391 for (i = 0; i < vm_ndomains; i++) { 3392 if (TAILQ_EMPTY(&batch[i])) 3393 continue; 3394 pvc = &pv_chunks[i]; 3395 mtx_lock(&pvc->pvc_lock); 3396 TAILQ_FOREACH(pc, &batch[i], pc_list) { 3397 TAILQ_REMOVE(&pvc->pvc_list, pc, pc_lru); 3398 } 3399 mtx_unlock(&pvc->pvc_lock); 3400 } 3401 3402 for (i = 0; i < vm_ndomains; i++) { 3403 TAILQ_FOREACH_SAFE(pc, &batch[i], pc_list, npc) { 3404 free_pv_chunk_dequeued(pc); 3405 } 3406 } 3407 } 3408 3409 /* 3410 * Returns a new PV entry, allocating a new PV chunk from the system when 3411 * needed. If this PV chunk allocation fails and a PV list lock pointer was 3412 * given, a PV chunk is reclaimed from an arbitrary pmap. Otherwise, NULL is 3413 * returned. 3414 * 3415 * The given PV list lock may be released. 
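 *
 * Within a chunk, free slots are tracked by the pc_map[] bitmaps: ffsl()
 * locates the first free slot, and a chunk with no remaining free slots is
 * moved to the tail of pm_pvchunk so that chunks with free slots stay at the
 * head of the list.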
3416 */ 3417 static pv_entry_t 3418 get_pv_entry(pmap_t pmap, struct rwlock **lockp) 3419 { 3420 struct pv_chunks_list *pvc; 3421 int bit, field; 3422 pv_entry_t pv; 3423 struct pv_chunk *pc; 3424 vm_page_t m; 3425 3426 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3427 PV_STAT(atomic_add_long(&pv_entry_allocs, 1)); 3428 retry: 3429 pc = TAILQ_FIRST(&pmap->pm_pvchunk); 3430 if (pc != NULL) { 3431 for (field = 0; field < _NPCM; field++) { 3432 if (pc->pc_map[field]) { 3433 bit = ffsl(pc->pc_map[field]) - 1; 3434 break; 3435 } 3436 } 3437 if (field < _NPCM) { 3438 pv = &pc->pc_pventry[field * 64 + bit]; 3439 pc->pc_map[field] &= ~(1ul << bit); 3440 /* If this was the last item, move it to tail */ 3441 if (pc_is_full(pc)) { 3442 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 3443 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, 3444 pc_list); 3445 } 3446 PV_STAT(atomic_add_long(&pv_entry_count, 1)); 3447 PV_STAT(atomic_subtract_int(&pv_entry_spare, 1)); 3448 return (pv); 3449 } 3450 } 3451 /* No free items, allocate another chunk */ 3452 m = vm_page_alloc_noobj(VM_ALLOC_WIRED); 3453 if (m == NULL) { 3454 if (lockp == NULL) { 3455 PV_STAT(pc_chunk_tryfail++); 3456 return (NULL); 3457 } 3458 m = reclaim_pv_chunk(pmap, lockp); 3459 if (m == NULL) 3460 goto retry; 3461 } 3462 PV_STAT(atomic_add_int(&pc_chunk_count, 1)); 3463 PV_STAT(atomic_add_int(&pc_chunk_allocs, 1)); 3464 dump_add_page(m->phys_addr); 3465 pc = (void *)PHYS_TO_DMAP(m->phys_addr); 3466 pc->pc_pmap = pmap; 3467 memcpy(pc->pc_map, pc_freemask, sizeof(pc_freemask)); 3468 pc->pc_map[0] &= ~1ul; /* preallocated bit 0 */ 3469 pvc = &pv_chunks[vm_page_domain(m)]; 3470 mtx_lock(&pvc->pvc_lock); 3471 TAILQ_INSERT_TAIL(&pvc->pvc_list, pc, pc_lru); 3472 mtx_unlock(&pvc->pvc_lock); 3473 pv = &pc->pc_pventry[0]; 3474 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); 3475 PV_STAT(atomic_add_long(&pv_entry_count, 1)); 3476 PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV - 1)); 3477 return (pv); 3478 } 3479 3480 /* 3481 * Ensure that the number of spare PV entries in the specified pmap meets or 3482 * exceeds the given count, "needed". 3483 * 3484 * The given PV list lock may be released. 3485 */ 3486 static void 3487 reserve_pv_entries(pmap_t pmap, int needed, struct rwlock **lockp) 3488 { 3489 struct pv_chunks_list *pvc; 3490 struct pch new_tail[PMAP_MEMDOM]; 3491 struct pv_chunk *pc; 3492 vm_page_t m; 3493 int avail, free, i; 3494 bool reclaimed; 3495 3496 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3497 KASSERT(lockp != NULL, ("reserve_pv_entries: lockp is NULL")); 3498 3499 /* 3500 * Newly allocated PV chunks must be stored in a private list until 3501 * the required number of PV chunks have been allocated. Otherwise, 3502 * reclaim_pv_chunk() could recycle one of these chunks. In 3503 * contrast, these chunks must be added to the pmap upon allocation. 
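	 *
	 * In other words, each new chunk is linked into the pmap's own
	 * pm_pvchunk list right away, but it is kept on the private
	 * "new_tail" list and only concatenated onto the global per-domain
	 * pc_lru lists, which reclaim_pv_chunk() scans, once the full
	 * reservation has been made.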
3504 */ 3505 for (i = 0; i < PMAP_MEMDOM; i++) 3506 TAILQ_INIT(&new_tail[i]); 3507 retry: 3508 avail = 0; 3509 TAILQ_FOREACH(pc, &pmap->pm_pvchunk, pc_list) { 3510 bit_count((bitstr_t *)pc->pc_map, 0, 3511 sizeof(pc->pc_map) * NBBY, &free); 3512 if (free == 0) 3513 break; 3514 avail += free; 3515 if (avail >= needed) 3516 break; 3517 } 3518 for (reclaimed = false; avail < needed; avail += _NPCPV) { 3519 m = vm_page_alloc_noobj(VM_ALLOC_WIRED); 3520 if (m == NULL) { 3521 m = reclaim_pv_chunk(pmap, lockp); 3522 if (m == NULL) 3523 goto retry; 3524 reclaimed = true; 3525 } 3526 PV_STAT(atomic_add_int(&pc_chunk_count, 1)); 3527 PV_STAT(atomic_add_int(&pc_chunk_allocs, 1)); 3528 dump_add_page(m->phys_addr); 3529 pc = (void *)PHYS_TO_DMAP(m->phys_addr); 3530 pc->pc_pmap = pmap; 3531 memcpy(pc->pc_map, pc_freemask, sizeof(pc_freemask)); 3532 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); 3533 TAILQ_INSERT_TAIL(&new_tail[vm_page_domain(m)], pc, pc_lru); 3534 PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV)); 3535 3536 /* 3537 * The reclaim might have freed a chunk from the current pmap. 3538 * If that chunk contained available entries, we need to 3539 * re-count the number of available entries. 3540 */ 3541 if (reclaimed) 3542 goto retry; 3543 } 3544 for (i = 0; i < vm_ndomains; i++) { 3545 if (TAILQ_EMPTY(&new_tail[i])) 3546 continue; 3547 pvc = &pv_chunks[i]; 3548 mtx_lock(&pvc->pvc_lock); 3549 TAILQ_CONCAT(&pvc->pvc_list, &new_tail[i], pc_lru); 3550 mtx_unlock(&pvc->pvc_lock); 3551 } 3552 } 3553 3554 /* 3555 * First find and then remove the pv entry for the specified pmap and virtual 3556 * address from the specified pv list. Returns the pv entry if found and NULL 3557 * otherwise. This operation can be performed on pv lists for either 4KB or 3558 * 2MB page mappings. 3559 */ 3560 static __inline pv_entry_t 3561 pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va) 3562 { 3563 pv_entry_t pv; 3564 3565 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 3566 if (pmap == PV_PMAP(pv) && va == pv->pv_va) { 3567 TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); 3568 pvh->pv_gen++; 3569 break; 3570 } 3571 } 3572 return (pv); 3573 } 3574 3575 /* 3576 * After demotion from a 2MB page mapping to 512 4KB page mappings, 3577 * destroy the pv entry for the 2MB page mapping and reinstantiate the pv 3578 * entries for each of the 4KB page mappings. 3579 */ 3580 static void 3581 pmap_pv_demote_l2(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, 3582 struct rwlock **lockp) 3583 { 3584 struct md_page *pvh; 3585 struct pv_chunk *pc; 3586 pv_entry_t pv; 3587 vm_offset_t va_last; 3588 vm_page_t m; 3589 int bit, field; 3590 3591 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3592 KASSERT((va & L2_OFFSET) == 0, 3593 ("pmap_pv_demote_l2: va is not 2mpage aligned")); 3594 KASSERT((pa & L2_OFFSET) == 0, 3595 ("pmap_pv_demote_l2: pa is not 2mpage aligned")); 3596 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); 3597 3598 /* 3599 * Transfer the 2mpage's pv entry for this mapping to the first 3600 * page's pv list. Once this transfer begins, the pv list lock 3601 * must not be released until the last pv entry is reinstantiated. 3602 */ 3603 pvh = pa_to_pvh(pa); 3604 pv = pmap_pvh_remove(pvh, pmap, va); 3605 KASSERT(pv != NULL, ("pmap_pv_demote_l2: pv not found")); 3606 m = PHYS_TO_VM_PAGE(pa); 3607 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 3608 m->md.pv_gen++; 3609 /* Instantiate the remaining Ln_ENTRIES - 1 pv entries. 
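	 * The caller is expected to have reserved these entries beforehand
	 * (e.g., via reserve_pv_entries()), so the loop below never needs to
	 * reclaim; the "missing spare" assertion enforces this.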
*/ 3610 PV_STAT(atomic_add_long(&pv_entry_allocs, Ln_ENTRIES - 1)); 3611 va_last = va + L2_SIZE - PAGE_SIZE; 3612 for (;;) { 3613 pc = TAILQ_FIRST(&pmap->pm_pvchunk); 3614 KASSERT(!pc_is_full(pc), ("pmap_pv_demote_l2: missing spare")); 3615 for (field = 0; field < _NPCM; field++) { 3616 while (pc->pc_map[field]) { 3617 bit = ffsl(pc->pc_map[field]) - 1; 3618 pc->pc_map[field] &= ~(1ul << bit); 3619 pv = &pc->pc_pventry[field * 64 + bit]; 3620 va += PAGE_SIZE; 3621 pv->pv_va = va; 3622 m++; 3623 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 3624 ("pmap_pv_demote_l2: page %p is not managed", m)); 3625 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 3626 m->md.pv_gen++; 3627 if (va == va_last) 3628 goto out; 3629 } 3630 } 3631 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 3632 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); 3633 } 3634 out: 3635 if (pc_is_full(pc)) { 3636 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 3637 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); 3638 } 3639 PV_STAT(atomic_add_long(&pv_entry_count, Ln_ENTRIES - 1)); 3640 PV_STAT(atomic_subtract_int(&pv_entry_spare, Ln_ENTRIES - 1)); 3641 } 3642 3643 /* 3644 * First find and then destroy the pv entry for the specified pmap and virtual 3645 * address. This operation can be performed on pv lists for either 4KB or 2MB 3646 * page mappings. 3647 */ 3648 static void 3649 pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va) 3650 { 3651 pv_entry_t pv; 3652 3653 pv = pmap_pvh_remove(pvh, pmap, va); 3654 KASSERT(pv != NULL, ("pmap_pvh_free: pv not found")); 3655 free_pv_entry(pmap, pv); 3656 } 3657 3658 /* 3659 * Conditionally create the PV entry for a 4KB page mapping if the required 3660 * memory can be allocated without resorting to reclamation. 3661 */ 3662 static bool 3663 pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m, 3664 struct rwlock **lockp) 3665 { 3666 pv_entry_t pv; 3667 3668 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3669 /* Pass NULL instead of the lock pointer to disable reclamation. */ 3670 if ((pv = get_pv_entry(pmap, NULL)) != NULL) { 3671 pv->pv_va = va; 3672 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); 3673 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 3674 m->md.pv_gen++; 3675 return (true); 3676 } else 3677 return (false); 3678 } 3679 3680 /* 3681 * Create the PV entry for a 2MB page mapping. Always returns true unless the 3682 * flag PMAP_ENTER_NORECLAIM is specified. If that flag is specified, returns 3683 * false if the PV entry cannot be allocated without resorting to reclamation. 3684 */ 3685 static bool 3686 pmap_pv_insert_l2(pmap_t pmap, vm_offset_t va, pd_entry_t l2e, u_int flags, 3687 struct rwlock **lockp) 3688 { 3689 struct md_page *pvh; 3690 pv_entry_t pv; 3691 vm_paddr_t pa; 3692 3693 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3694 /* Pass NULL instead of the lock pointer to disable reclamation. */ 3695 if ((pv = get_pv_entry(pmap, (flags & PMAP_ENTER_NORECLAIM) != 0 ? 3696 NULL : lockp)) == NULL) 3697 return (false); 3698 pv->pv_va = va; 3699 pa = PTE_TO_PHYS(l2e); 3700 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); 3701 pvh = pa_to_pvh(pa); 3702 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); 3703 pvh->pv_gen++; 3704 return (true); 3705 } 3706 3707 /* 3708 * Conditionally creates the PV entries for a L3C superpage mapping if 3709 * the required memory can be allocated without resorting to reclamation. 
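 *
 * If any of the L3C_ENTRIES allocations fails, the entries created so far
 * are torn down again and false is returned, leaving the pv lists as they
 * were found.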
3710 */ 3711 static bool 3712 pmap_pv_insert_l3c(pmap_t pmap, vm_offset_t va, vm_page_t m, 3713 struct rwlock **lockp) 3714 { 3715 pv_entry_t pv; 3716 vm_offset_t tva; 3717 vm_paddr_t pa __diagused; 3718 vm_page_t mt; 3719 3720 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3721 KASSERT((va & L3C_OFFSET) == 0, 3722 ("pmap_pv_insert_l3c: va is not aligned")); 3723 pa = VM_PAGE_TO_PHYS(m); 3724 KASSERT((pa & L3C_OFFSET) == 0, 3725 ("pmap_pv_insert_l3c: pa is not aligned")); 3726 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); 3727 for (mt = m, tva = va; mt < &m[L3C_ENTRIES]; mt++, tva += L3_SIZE) { 3728 /* Pass NULL instead of lockp to disable reclamation. */ 3729 pv = get_pv_entry(pmap, NULL); 3730 if (__predict_false(pv == NULL)) { 3731 while (tva > va) { 3732 mt--; 3733 tva -= L3_SIZE; 3734 pmap_pvh_free(&mt->md, pmap, tva); 3735 } 3736 return (false); 3737 } 3738 pv->pv_va = tva; 3739 TAILQ_INSERT_TAIL(&mt->md.pv_list, pv, pv_next); 3740 mt->md.pv_gen++; 3741 } 3742 return (true); 3743 } 3744 3745 static void 3746 pmap_remove_kernel_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va) 3747 { 3748 pt_entry_t newl2, oldl2 __diagused; 3749 vm_page_t ml3; 3750 vm_paddr_t ml3pa; 3751 3752 KASSERT(!VIRT_IN_DMAP(va), ("removing direct mapping of %#lx", va)); 3753 KASSERT(pmap == kernel_pmap, ("pmap %p is not kernel_pmap", pmap)); 3754 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3755 3756 ml3 = pmap_remove_pt_page(pmap, va); 3757 if (ml3 == NULL) 3758 panic("pmap_remove_kernel_l2: Missing pt page"); 3759 3760 ml3pa = VM_PAGE_TO_PHYS(ml3); 3761 newl2 = PHYS_TO_PTE(ml3pa) | L2_TABLE; 3762 3763 /* 3764 * If this page table page was unmapped by a promotion, then it 3765 * contains valid mappings. Zero it to invalidate those mappings. 3766 */ 3767 if (vm_page_any_valid(ml3)) 3768 pagezero((void *)PHYS_TO_DMAP(ml3pa)); 3769 3770 /* 3771 * Demote the mapping. The caller must have already invalidated the 3772 * mapping (i.e., the "break" in break-before-make). 3773 */ 3774 oldl2 = pmap_load_store(l2, newl2); 3775 KASSERT(oldl2 == 0, ("%s: found existing mapping at %p: %#lx", 3776 __func__, l2, oldl2)); 3777 } 3778 3779 /* 3780 * pmap_remove_l2: Do the things to unmap a level 2 superpage. 3781 */ 3782 static int 3783 pmap_remove_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva, 3784 pd_entry_t l1e, struct spglist *free, struct rwlock **lockp) 3785 { 3786 struct md_page *pvh; 3787 pt_entry_t old_l2; 3788 vm_page_t m, ml3, mt; 3789 3790 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3791 KASSERT((sva & L2_OFFSET) == 0, ("pmap_remove_l2: sva is not aligned")); 3792 old_l2 = pmap_load_clear(l2); 3793 KASSERT((old_l2 & ATTR_DESCR_MASK) == L2_BLOCK, 3794 ("pmap_remove_l2: L2e %lx is not a block mapping", old_l2)); 3795 3796 /* 3797 * Since a promotion must break the 4KB page mappings before making 3798 * the 2MB page mapping, a pmap_s1_invalidate_page() suffices. 
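 * In other words, because promotion follows a break-before-make
 * sequence, the TLB can never have held 4KB entries for this range
 * at the same time as the 2MB block entry, so invalidating the single
 * block entry's address below is sufficient.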
3799 */ 3800 pmap_s1_invalidate_page(pmap, sva, true); 3801 3802 if (old_l2 & ATTR_SW_WIRED) 3803 pmap->pm_stats.wired_count -= L2_SIZE / PAGE_SIZE; 3804 pmap_resident_count_dec(pmap, L2_SIZE / PAGE_SIZE); 3805 if (old_l2 & ATTR_SW_MANAGED) { 3806 m = PTE_TO_VM_PAGE(old_l2); 3807 pvh = page_to_pvh(m); 3808 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); 3809 pmap_pvh_free(pvh, pmap, sva); 3810 for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++) { 3811 if (pmap_pte_dirty(pmap, old_l2)) 3812 vm_page_dirty(mt); 3813 if (old_l2 & ATTR_AF) 3814 vm_page_aflag_set(mt, PGA_REFERENCED); 3815 if (TAILQ_EMPTY(&mt->md.pv_list) && 3816 TAILQ_EMPTY(&pvh->pv_list)) 3817 vm_page_aflag_clear(mt, PGA_WRITEABLE); 3818 } 3819 } 3820 if (pmap == kernel_pmap) { 3821 pmap_remove_kernel_l2(pmap, l2, sva); 3822 } else { 3823 ml3 = pmap_remove_pt_page(pmap, sva); 3824 if (ml3 != NULL) { 3825 KASSERT(vm_page_any_valid(ml3), 3826 ("pmap_remove_l2: l3 page not promoted")); 3827 pmap_resident_count_dec(pmap, 1); 3828 KASSERT(ml3->ref_count == NL3PG, 3829 ("pmap_remove_l2: l3 page ref count error")); 3830 ml3->ref_count = 0; 3831 pmap_add_delayed_free_list(ml3, free, false); 3832 } 3833 } 3834 return (pmap_unuse_pt(pmap, sva, l1e, free)); 3835 } 3836 3837 /* 3838 * pmap_remove_l3: do the things to unmap a page in a process 3839 */ 3840 static int 3841 pmap_remove_l3(pmap_t pmap, pt_entry_t *l3, vm_offset_t va, 3842 pd_entry_t l2e, struct spglist *free, struct rwlock **lockp) 3843 { 3844 struct md_page *pvh; 3845 pt_entry_t old_l3; 3846 vm_page_t m; 3847 3848 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3849 old_l3 = pmap_load(l3); 3850 if ((old_l3 & ATTR_CONTIGUOUS) != 0) 3851 (void)pmap_demote_l3c(pmap, l3, va); 3852 old_l3 = pmap_load_clear(l3); 3853 pmap_s1_invalidate_page(pmap, va, true); 3854 if (old_l3 & ATTR_SW_WIRED) 3855 pmap->pm_stats.wired_count -= 1; 3856 pmap_resident_count_dec(pmap, 1); 3857 if (old_l3 & ATTR_SW_MANAGED) { 3858 m = PTE_TO_VM_PAGE(old_l3); 3859 if (pmap_pte_dirty(pmap, old_l3)) 3860 vm_page_dirty(m); 3861 if (old_l3 & ATTR_AF) 3862 vm_page_aflag_set(m, PGA_REFERENCED); 3863 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); 3864 pmap_pvh_free(&m->md, pmap, va); 3865 if (TAILQ_EMPTY(&m->md.pv_list) && 3866 (m->flags & PG_FICTITIOUS) == 0) { 3867 pvh = page_to_pvh(m); 3868 if (TAILQ_EMPTY(&pvh->pv_list)) 3869 vm_page_aflag_clear(m, PGA_WRITEABLE); 3870 } 3871 } 3872 return (pmap_unuse_pt(pmap, va, l2e, free)); 3873 } 3874 3875 /* 3876 * Removes the specified L3C superpage mapping. Requests TLB invalidations 3877 * to be performed by the caller through the returned "*vap". Returns true 3878 * if the level 3 table "ml3" was unmapped and added to the spglist "free". 3879 * Otherwise, returns false. 3880 */ 3881 static bool 3882 pmap_remove_l3c(pmap_t pmap, pt_entry_t *l3p, vm_offset_t va, vm_offset_t *vap, 3883 vm_offset_t va_next, vm_page_t ml3, struct spglist *free, 3884 struct rwlock **lockp) 3885 { 3886 struct md_page *pvh; 3887 struct rwlock *new_lock; 3888 pt_entry_t first_l3e, l3e, *tl3p; 3889 vm_offset_t tva; 3890 vm_page_t m, mt; 3891 3892 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3893 KASSERT(((uintptr_t)l3p & ((L3C_ENTRIES * sizeof(pt_entry_t)) - 1)) == 3894 0, ("pmap_remove_l3c: l3p is not aligned")); 3895 KASSERT((va & L3C_OFFSET) == 0, 3896 ("pmap_remove_l3c: va is not aligned")); 3897 3898 /* 3899 * Hardware accessed and dirty bit maintenance might only update a 3900 * single L3 entry, so we must combine the accessed and dirty bits 3901 * from this entire set of contiguous L3 entries. 
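 * In effect the first entry serves as a summary for the whole set:
 * after the loop below it is marked referenced (ATTR_AF) if any
 * constituent entry is referenced, and appears dirty if any
 * constituent entry is a dirty writable mapping, so the vm_page
 * updates further down see the combined state.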
3902 */ 3903 first_l3e = pmap_load_clear(l3p); 3904 for (tl3p = l3p + 1; tl3p < &l3p[L3C_ENTRIES]; tl3p++) { 3905 l3e = pmap_load_clear(tl3p); 3906 KASSERT((l3e & ATTR_CONTIGUOUS) != 0, 3907 ("pmap_remove_l3c: l3e is missing ATTR_CONTIGUOUS")); 3908 if ((l3e & (ATTR_SW_DBM | ATTR_S1_AP_RW_BIT)) == 3909 (ATTR_SW_DBM | ATTR_S1_AP(ATTR_S1_AP_RW))) 3910 first_l3e &= ~ATTR_S1_AP_RW_BIT; 3911 first_l3e |= l3e & ATTR_AF; 3912 } 3913 if ((first_l3e & ATTR_SW_WIRED) != 0) 3914 pmap->pm_stats.wired_count -= L3C_ENTRIES; 3915 pmap_resident_count_dec(pmap, L3C_ENTRIES); 3916 if ((first_l3e & ATTR_SW_MANAGED) != 0) { 3917 m = PTE_TO_VM_PAGE(first_l3e); 3918 new_lock = VM_PAGE_TO_PV_LIST_LOCK(m); 3919 if (new_lock != *lockp) { 3920 if (*lockp != NULL) { 3921 /* 3922 * Pending TLB invalidations must be 3923 * performed before the PV list lock is 3924 * released. Otherwise, a concurrent 3925 * pmap_remove_all() on a physical page 3926 * could return while a stale TLB entry 3927 * still provides access to that page. 3928 */ 3929 if (*vap != va_next) { 3930 pmap_invalidate_range(pmap, *vap, va, 3931 true); 3932 *vap = va_next; 3933 } 3934 rw_wunlock(*lockp); 3935 } 3936 *lockp = new_lock; 3937 rw_wlock(*lockp); 3938 } 3939 pvh = page_to_pvh(m); 3940 for (mt = m, tva = va; mt < &m[L3C_ENTRIES]; mt++, tva += 3941 L3_SIZE) { 3942 if (pmap_pte_dirty(pmap, first_l3e)) 3943 vm_page_dirty(mt); 3944 if ((first_l3e & ATTR_AF) != 0) 3945 vm_page_aflag_set(mt, PGA_REFERENCED); 3946 pmap_pvh_free(&mt->md, pmap, tva); 3947 if (TAILQ_EMPTY(&mt->md.pv_list) && 3948 TAILQ_EMPTY(&pvh->pv_list)) 3949 vm_page_aflag_clear(mt, PGA_WRITEABLE); 3950 } 3951 } 3952 if (*vap == va_next) 3953 *vap = va; 3954 if (ml3 != NULL) { 3955 ml3->ref_count -= L3C_ENTRIES; 3956 if (ml3->ref_count == 0) { 3957 _pmap_unwire_l3(pmap, va, ml3, free); 3958 return (true); 3959 } 3960 } 3961 return (false); 3962 } 3963 3964 /* 3965 * Remove the specified range of addresses from the L3 page table that is 3966 * identified by the given L2 entry. 3967 */ 3968 static void 3969 pmap_remove_l3_range(pmap_t pmap, pd_entry_t l2e, vm_offset_t sva, 3970 vm_offset_t eva, struct spglist *free, struct rwlock **lockp) 3971 { 3972 struct md_page *pvh; 3973 struct rwlock *new_lock; 3974 pt_entry_t *l3, old_l3; 3975 vm_offset_t va; 3976 vm_page_t l3pg, m; 3977 3978 KASSERT(ADDR_IS_CANONICAL(sva), 3979 ("%s: Start address not in canonical form: %lx", __func__, sva)); 3980 KASSERT(ADDR_IS_CANONICAL(eva) || eva == VM_MAX_USER_ADDRESS, 3981 ("%s: End address not in canonical form: %lx", __func__, eva)); 3982 3983 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3984 KASSERT(rounddown2(sva, L2_SIZE) + L2_SIZE == roundup2(eva, L2_SIZE), 3985 ("pmap_remove_l3_range: range crosses an L3 page table boundary")); 3986 l3pg = !ADDR_IS_KERNEL(sva) ? PTE_TO_VM_PAGE(l2e) : NULL; 3987 va = eva; 3988 for (l3 = pmap_l2_to_l3(&l2e, sva); sva != eva; l3++, sva += L3_SIZE) { 3989 old_l3 = pmap_load(l3); 3990 if (!pmap_l3_valid(old_l3)) { 3991 if (va != eva) { 3992 pmap_invalidate_range(pmap, va, sva, true); 3993 va = eva; 3994 } 3995 continue; 3996 } 3997 if ((old_l3 & ATTR_CONTIGUOUS) != 0) { 3998 /* 3999 * Is this entire set of contiguous L3 entries being 4000 * removed? Handle the possibility that "eva" is zero 4001 * because of address wraparound. 4002 */ 4003 if ((sva & L3C_OFFSET) == 0 && 4004 sva + L3C_OFFSET <= eva - 1) { 4005 if (pmap_remove_l3c(pmap, l3, sva, &va, eva, 4006 l3pg, free, lockp)) { 4007 /* The L3 table was unmapped. 
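 * That is, pmap_remove_l3c() freed the level 3 page table page
 * itself, so no further L3 entries from this table remain to be
 * visited; advance "sva" past the L3C range and stop scanning this
 * table.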
*/ 4008 sva += L3C_SIZE; 4009 break; 4010 } 4011 l3 += L3C_ENTRIES - 1; 4012 sva += L3C_SIZE - L3_SIZE; 4013 continue; 4014 } 4015 4016 (void)pmap_demote_l3c(pmap, l3, sva); 4017 } 4018 old_l3 = pmap_load_clear(l3); 4019 if ((old_l3 & ATTR_SW_WIRED) != 0) 4020 pmap->pm_stats.wired_count--; 4021 pmap_resident_count_dec(pmap, 1); 4022 if ((old_l3 & ATTR_SW_MANAGED) != 0) { 4023 m = PTE_TO_VM_PAGE(old_l3); 4024 if (pmap_pte_dirty(pmap, old_l3)) 4025 vm_page_dirty(m); 4026 if ((old_l3 & ATTR_AF) != 0) 4027 vm_page_aflag_set(m, PGA_REFERENCED); 4028 new_lock = VM_PAGE_TO_PV_LIST_LOCK(m); 4029 if (new_lock != *lockp) { 4030 if (*lockp != NULL) { 4031 /* 4032 * Pending TLB invalidations must be 4033 * performed before the PV list lock is 4034 * released. Otherwise, a concurrent 4035 * pmap_remove_all() on a physical page 4036 * could return while a stale TLB entry 4037 * still provides access to that page. 4038 */ 4039 if (va != eva) { 4040 pmap_invalidate_range(pmap, va, 4041 sva, true); 4042 va = eva; 4043 } 4044 rw_wunlock(*lockp); 4045 } 4046 *lockp = new_lock; 4047 rw_wlock(*lockp); 4048 } 4049 pmap_pvh_free(&m->md, pmap, sva); 4050 if (TAILQ_EMPTY(&m->md.pv_list) && 4051 (m->flags & PG_FICTITIOUS) == 0) { 4052 pvh = page_to_pvh(m); 4053 if (TAILQ_EMPTY(&pvh->pv_list)) 4054 vm_page_aflag_clear(m, PGA_WRITEABLE); 4055 } 4056 } 4057 if (l3pg != NULL && pmap_unwire_l3(pmap, sva, l3pg, free)) { 4058 /* 4059 * _pmap_unwire_l3() has already invalidated the TLB 4060 * entries at all levels for "sva". So, we need not 4061 * perform "sva += L3_SIZE;" here. Moreover, we need 4062 * not perform "va = sva;" if "sva" is at the start 4063 * of a new valid range consisting of a single page. 4064 */ 4065 break; 4066 } 4067 if (va == eva) 4068 va = sva; 4069 } 4070 if (va != eva) 4071 pmap_invalidate_range(pmap, va, sva, true); 4072 } 4073 4074 static void 4075 pmap_remove1(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, bool map_delete) 4076 { 4077 struct rwlock *lock; 4078 vm_offset_t va_next; 4079 pd_entry_t *l0, *l1, *l2; 4080 pt_entry_t l3_paddr; 4081 struct spglist free; 4082 4083 /* 4084 * Perform an unsynchronized read. This is, however, safe. 4085 */ 4086 if (pmap->pm_stats.resident_count == 0) 4087 return; 4088 4089 SLIST_INIT(&free); 4090 4091 PMAP_LOCK(pmap); 4092 if (map_delete) 4093 pmap_bti_on_remove(pmap, sva, eva); 4094 4095 lock = NULL; 4096 for (; sva < eva; sva = va_next) { 4097 if (pmap->pm_stats.resident_count == 0) 4098 break; 4099 4100 l0 = pmap_l0(pmap, sva); 4101 if (pmap_load(l0) == 0) { 4102 va_next = (sva + L0_SIZE) & ~L0_OFFSET; 4103 if (va_next < sva) 4104 va_next = eva; 4105 continue; 4106 } 4107 4108 va_next = (sva + L1_SIZE) & ~L1_OFFSET; 4109 if (va_next < sva) 4110 va_next = eva; 4111 l1 = pmap_l0_to_l1(l0, sva); 4112 if (pmap_load(l1) == 0) 4113 continue; 4114 if ((pmap_load(l1) & ATTR_DESCR_MASK) == L1_BLOCK) { 4115 PMAP_ASSERT_L1_BLOCKS_SUPPORTED; 4116 KASSERT(va_next <= eva, 4117 ("partial update of non-transparent 1G page " 4118 "l1 %#lx sva %#lx eva %#lx va_next %#lx", 4119 pmap_load(l1), sva, eva, va_next)); 4120 MPASS(pmap != kernel_pmap); 4121 MPASS((pmap_load(l1) & ATTR_SW_MANAGED) == 0); 4122 pmap_clear(l1); 4123 pmap_s1_invalidate_page(pmap, sva, true); 4124 pmap_resident_count_dec(pmap, L1_SIZE / PAGE_SIZE); 4125 pmap_unuse_pt(pmap, sva, pmap_load(l0), &free); 4126 continue; 4127 } 4128 4129 /* 4130 * Calculate index for next page table. 
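 * "va_next" is rounded up to the next 2MB (L2) boundary, e.g. for a
 * hypothetical sva of 0x12345000:
 *
 *	(0x12345000 + L2_SIZE) & ~L2_OFFSET == 0x12400000
 *
 * The "va_next < sva" test below catches the addition wrapping past
 * the top of the address space, in which case the scan is clamped
 * to "eva".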
4131 */ 4132 va_next = (sva + L2_SIZE) & ~L2_OFFSET; 4133 if (va_next < sva) 4134 va_next = eva; 4135 4136 l2 = pmap_l1_to_l2(l1, sva); 4137 if (l2 == NULL) 4138 continue; 4139 4140 l3_paddr = pmap_load(l2); 4141 4142 if ((l3_paddr & ATTR_DESCR_MASK) == L2_BLOCK) { 4143 if (sva + L2_SIZE == va_next && eva >= va_next) { 4144 pmap_remove_l2(pmap, l2, sva, pmap_load(l1), 4145 &free, &lock); 4146 continue; 4147 } else if (pmap_demote_l2_locked(pmap, l2, sva, 4148 &lock) == NULL) 4149 continue; 4150 l3_paddr = pmap_load(l2); 4151 } 4152 4153 /* 4154 * Weed out invalid mappings. 4155 */ 4156 if ((l3_paddr & ATTR_DESCR_MASK) != L2_TABLE) 4157 continue; 4158 4159 /* 4160 * Limit our scan to either the end of the va represented 4161 * by the current page table page, or to the end of the 4162 * range being removed. 4163 */ 4164 if (va_next > eva) 4165 va_next = eva; 4166 4167 pmap_remove_l3_range(pmap, l3_paddr, sva, va_next, &free, 4168 &lock); 4169 } 4170 if (lock != NULL) 4171 rw_wunlock(lock); 4172 PMAP_UNLOCK(pmap); 4173 vm_page_free_pages_toq(&free, true); 4174 } 4175 4176 /* 4177 * Remove the given range of addresses from the specified map. 4178 * 4179 * It is assumed that the start and end are properly 4180 * rounded to the page size. 4181 */ 4182 void 4183 pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 4184 { 4185 pmap_remove1(pmap, sva, eva, false); 4186 } 4187 4188 /* 4189 * Remove the given range of addresses as part of a logical unmap 4190 * operation. This has the effect of calling pmap_remove(), but 4191 * also clears any metadata that should persist for the lifetime 4192 * of a logical mapping. 4193 */ 4194 void 4195 pmap_map_delete(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 4196 { 4197 pmap_remove1(pmap, sva, eva, true); 4198 } 4199 4200 /* 4201 * Routine: pmap_remove_all 4202 * Function: 4203 * Removes this physical page from 4204 * all physical maps in which it resides. 4205 * Reflects back modify bits to the pager. 4206 * 4207 * Notes: 4208 * Original versions of this routine were very 4209 * inefficient because they iteratively called 4210 * pmap_remove (slow...) 4211 */ 4212 4213 void 4214 pmap_remove_all(vm_page_t m) 4215 { 4216 struct md_page *pvh; 4217 pv_entry_t pv; 4218 pmap_t pmap; 4219 struct rwlock *lock; 4220 pd_entry_t *pde, tpde; 4221 pt_entry_t *pte, tpte; 4222 vm_offset_t va; 4223 struct spglist free; 4224 int lvl, pvh_gen, md_gen; 4225 4226 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 4227 ("pmap_remove_all: page %p is not managed", m)); 4228 SLIST_INIT(&free); 4229 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 4230 pvh = (m->flags & PG_FICTITIOUS) != 0 ? 
&pv_dummy : page_to_pvh(m); 4231 rw_wlock(lock); 4232 retry: 4233 while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) { 4234 pmap = PV_PMAP(pv); 4235 if (!PMAP_TRYLOCK(pmap)) { 4236 pvh_gen = pvh->pv_gen; 4237 rw_wunlock(lock); 4238 PMAP_LOCK(pmap); 4239 rw_wlock(lock); 4240 if (pvh_gen != pvh->pv_gen) { 4241 PMAP_UNLOCK(pmap); 4242 goto retry; 4243 } 4244 } 4245 va = pv->pv_va; 4246 pte = pmap_pte_exists(pmap, va, 2, __func__); 4247 pmap_demote_l2_locked(pmap, pte, va, &lock); 4248 PMAP_UNLOCK(pmap); 4249 } 4250 while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { 4251 pmap = PV_PMAP(pv); 4252 if (!PMAP_TRYLOCK(pmap)) { 4253 pvh_gen = pvh->pv_gen; 4254 md_gen = m->md.pv_gen; 4255 rw_wunlock(lock); 4256 PMAP_LOCK(pmap); 4257 rw_wlock(lock); 4258 if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { 4259 PMAP_UNLOCK(pmap); 4260 goto retry; 4261 } 4262 } 4263 pmap_resident_count_dec(pmap, 1); 4264 4265 pde = pmap_pde(pmap, pv->pv_va, &lvl); 4266 KASSERT(pde != NULL, 4267 ("pmap_remove_all: no page directory entry found")); 4268 KASSERT(lvl == 2, 4269 ("pmap_remove_all: invalid pde level %d", lvl)); 4270 tpde = pmap_load(pde); 4271 4272 pte = pmap_l2_to_l3(pde, pv->pv_va); 4273 tpte = pmap_load(pte); 4274 if ((tpte & ATTR_CONTIGUOUS) != 0) 4275 (void)pmap_demote_l3c(pmap, pte, pv->pv_va); 4276 tpte = pmap_load_clear(pte); 4277 if (tpte & ATTR_SW_WIRED) 4278 pmap->pm_stats.wired_count--; 4279 if ((tpte & ATTR_AF) != 0) { 4280 pmap_invalidate_page(pmap, pv->pv_va, true); 4281 vm_page_aflag_set(m, PGA_REFERENCED); 4282 } 4283 4284 /* 4285 * Update the vm_page_t clean and reference bits. 4286 */ 4287 if (pmap_pte_dirty(pmap, tpte)) 4288 vm_page_dirty(m); 4289 pmap_unuse_pt(pmap, pv->pv_va, tpde, &free); 4290 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 4291 m->md.pv_gen++; 4292 free_pv_entry(pmap, pv); 4293 PMAP_UNLOCK(pmap); 4294 } 4295 vm_page_aflag_clear(m, PGA_WRITEABLE); 4296 rw_wunlock(lock); 4297 vm_page_free_pages_toq(&free, true); 4298 } 4299 4300 /* 4301 * Masks and sets bits in a level 2 page table entries in the specified pmap 4302 */ 4303 static void 4304 pmap_protect_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva, pt_entry_t mask, 4305 pt_entry_t nbits) 4306 { 4307 pd_entry_t old_l2; 4308 vm_page_t m, mt; 4309 4310 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4311 PMAP_ASSERT_STAGE1(pmap); 4312 KASSERT((sva & L2_OFFSET) == 0, 4313 ("pmap_protect_l2: sva is not 2mpage aligned")); 4314 old_l2 = pmap_load(l2); 4315 KASSERT((old_l2 & ATTR_DESCR_MASK) == L2_BLOCK, 4316 ("pmap_protect_l2: L2e %lx is not a block mapping", old_l2)); 4317 4318 /* 4319 * Return if the L2 entry already has the desired access restrictions 4320 * in place. 4321 */ 4322 if ((old_l2 & mask) == nbits) 4323 return; 4324 4325 while (!atomic_fcmpset_64(l2, &old_l2, (old_l2 & ~mask) | nbits)) 4326 cpu_spinwait(); 4327 4328 /* 4329 * When a dirty read/write superpage mapping is write protected, 4330 * update the dirty field of each of the superpage's constituent 4KB 4331 * pages. 4332 */ 4333 if ((old_l2 & ATTR_SW_MANAGED) != 0 && 4334 (nbits & ATTR_S1_AP(ATTR_S1_AP_RO)) != 0 && 4335 pmap_pte_dirty(pmap, old_l2)) { 4336 m = PTE_TO_VM_PAGE(old_l2); 4337 for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++) 4338 vm_page_dirty(mt); 4339 } 4340 4341 /* 4342 * Since a promotion must break the 4KB page mappings before making 4343 * the 2MB page mapping, a pmap_s1_invalidate_page() suffices. 4344 */ 4345 pmap_s1_invalidate_page(pmap, sva, true); 4346 } 4347 4348 /* 4349 * Masks and sets bits in the specified L3C superpage mapping. 
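 * All L3C_ENTRIES constituent page table entries are updated as a
 * unit, mirroring pmap_protect_l2() above but for the smaller
 * ATTR_CONTIGUOUS superpage size.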
4350 * 4351 * Requests TLB invalidations to be performed by the caller through the 4352 * returned "*vap". 4353 */ 4354 static void 4355 pmap_mask_set_l3c(pmap_t pmap, pt_entry_t *l3p, vm_offset_t va, 4356 vm_offset_t *vap, vm_offset_t va_next, pt_entry_t mask, pt_entry_t nbits) 4357 { 4358 pt_entry_t l3e, *tl3p; 4359 vm_page_t m, mt; 4360 bool dirty; 4361 4362 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4363 KASSERT(((uintptr_t)l3p & ((L3C_ENTRIES * sizeof(pt_entry_t)) - 1)) == 4364 0, ("pmap_mask_set_l3c: l3p is not aligned")); 4365 KASSERT((va & L3C_OFFSET) == 0, 4366 ("pmap_mask_set_l3c: va is not aligned")); 4367 dirty = false; 4368 for (tl3p = l3p; tl3p < &l3p[L3C_ENTRIES]; tl3p++) { 4369 l3e = pmap_load(tl3p); 4370 KASSERT((l3e & ATTR_CONTIGUOUS) != 0, 4371 ("pmap_mask_set_l3c: l3e is missing ATTR_CONTIGUOUS")); 4372 while (!atomic_fcmpset_64(tl3p, &l3e, (l3e & ~mask) | nbits)) 4373 cpu_spinwait(); 4374 if ((l3e & (ATTR_SW_DBM | ATTR_S1_AP_RW_BIT)) == 4375 (ATTR_SW_DBM | ATTR_S1_AP(ATTR_S1_AP_RW))) 4376 dirty = true; 4377 } 4378 4379 /* 4380 * When a dirty read/write superpage mapping is write protected, 4381 * update the dirty field of each of the superpage's constituent 4KB 4382 * pages. 4383 */ 4384 if ((l3e & ATTR_SW_MANAGED) != 0 && 4385 (nbits & ATTR_S1_AP(ATTR_S1_AP_RO)) != 0 && 4386 dirty) { 4387 m = PTE_TO_VM_PAGE(pmap_load(l3p)); 4388 for (mt = m; mt < &m[L3C_ENTRIES]; mt++) 4389 vm_page_dirty(mt); 4390 } 4391 4392 if (*vap == va_next) 4393 *vap = va; 4394 } 4395 4396 /* 4397 * Masks and sets bits in last level page table entries in the specified 4398 * pmap and range 4399 */ 4400 static void 4401 pmap_mask_set_locked(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, pt_entry_t mask, 4402 pt_entry_t nbits, bool invalidate) 4403 { 4404 vm_offset_t va, va_next; 4405 pd_entry_t *l0, *l1, *l2; 4406 pt_entry_t *l3p, l3; 4407 4408 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4409 for (; sva < eva; sva = va_next) { 4410 l0 = pmap_l0(pmap, sva); 4411 if (pmap_load(l0) == 0) { 4412 va_next = (sva + L0_SIZE) & ~L0_OFFSET; 4413 if (va_next < sva) 4414 va_next = eva; 4415 continue; 4416 } 4417 4418 va_next = (sva + L1_SIZE) & ~L1_OFFSET; 4419 if (va_next < sva) 4420 va_next = eva; 4421 l1 = pmap_l0_to_l1(l0, sva); 4422 if (pmap_load(l1) == 0) 4423 continue; 4424 if ((pmap_load(l1) & ATTR_DESCR_MASK) == L1_BLOCK) { 4425 PMAP_ASSERT_L1_BLOCKS_SUPPORTED; 4426 KASSERT(va_next <= eva, 4427 ("partial update of non-transparent 1G page " 4428 "l1 %#lx sva %#lx eva %#lx va_next %#lx", 4429 pmap_load(l1), sva, eva, va_next)); 4430 MPASS((pmap_load(l1) & ATTR_SW_MANAGED) == 0); 4431 if ((pmap_load(l1) & mask) != nbits) { 4432 pmap_store(l1, (pmap_load(l1) & ~mask) | nbits); 4433 if (invalidate) 4434 pmap_s1_invalidate_page(pmap, sva, true); 4435 } 4436 continue; 4437 } 4438 4439 va_next = (sva + L2_SIZE) & ~L2_OFFSET; 4440 if (va_next < sva) 4441 va_next = eva; 4442 4443 l2 = pmap_l1_to_l2(l1, sva); 4444 if (pmap_load(l2) == 0) 4445 continue; 4446 4447 if ((pmap_load(l2) & ATTR_DESCR_MASK) == L2_BLOCK) { 4448 if (sva + L2_SIZE == va_next && eva >= va_next) { 4449 pmap_protect_l2(pmap, l2, sva, mask, nbits); 4450 continue; 4451 } else if ((pmap_load(l2) & mask) == nbits || 4452 pmap_demote_l2(pmap, l2, sva) == NULL) 4453 continue; 4454 } 4455 KASSERT((pmap_load(l2) & ATTR_DESCR_MASK) == L2_TABLE, 4456 ("pmap_protect: Invalid L2 entry after demotion")); 4457 4458 if (va_next > eva) 4459 va_next = eva; 4460 4461 va = va_next; 4462 for (l3p = pmap_l2_to_l3(l2, sva); sva != va_next; l3p++, 4463 sva += L3_SIZE) { 4464 l3 = 
pmap_load(l3p); 4465 4466 /* 4467 * Go to the next L3 entry if the current one is 4468 * invalid or already has the desired access 4469 * restrictions in place. (The latter case occurs 4470 * frequently. For example, in a "buildworld" 4471 * workload, almost 1 out of 4 L3 entries already 4472 * have the desired restrictions.) 4473 */ 4474 if (!pmap_l3_valid(l3) || (l3 & mask) == nbits) { 4475 if (va != va_next) { 4476 if (invalidate) 4477 pmap_s1_invalidate_range(pmap, 4478 va, sva, true); 4479 va = va_next; 4480 } 4481 if ((l3 & ATTR_CONTIGUOUS) != 0) { 4482 /* 4483 * Does this L3C page extend beyond 4484 * the requested range? Handle the 4485 * possibility that "va_next" is zero. 4486 */ 4487 if ((sva | L3C_OFFSET) > va_next - 1) 4488 break; 4489 4490 /* 4491 * Skip ahead to the last L3_PAGE 4492 * within this L3C page. 4493 */ 4494 l3p = (pt_entry_t *)((uintptr_t)l3p | 4495 ((L3C_ENTRIES - 1) * 4496 sizeof(pt_entry_t))); 4497 sva |= L3C_SIZE - L3_SIZE; 4498 } 4499 continue; 4500 } 4501 4502 if ((l3 & ATTR_CONTIGUOUS) != 0) { 4503 /* 4504 * Is this entire set of contiguous L3 entries 4505 * being protected? Handle the possibility 4506 * that "va_next" is zero because of address 4507 * wraparound. 4508 */ 4509 if ((sva & L3C_OFFSET) == 0 && 4510 sva + L3C_OFFSET <= va_next - 1) { 4511 pmap_mask_set_l3c(pmap, l3p, sva, &va, 4512 va_next, mask, nbits); 4513 l3p += L3C_ENTRIES - 1; 4514 sva += L3C_SIZE - L3_SIZE; 4515 continue; 4516 } 4517 4518 (void)pmap_demote_l3c(pmap, l3p, sva); 4519 4520 /* 4521 * The L3 entry's accessed bit may have changed. 4522 */ 4523 l3 = pmap_load(l3p); 4524 } 4525 while (!atomic_fcmpset_64(l3p, &l3, (l3 & ~mask) | 4526 nbits)) 4527 cpu_spinwait(); 4528 4529 /* 4530 * When a dirty read/write mapping is write protected, 4531 * update the page's dirty field. 4532 */ 4533 if ((l3 & ATTR_SW_MANAGED) != 0 && 4534 (nbits & ATTR_S1_AP(ATTR_S1_AP_RO)) != 0 && 4535 pmap_pte_dirty(pmap, l3)) 4536 vm_page_dirty(PTE_TO_VM_PAGE(l3)); 4537 4538 if (va == va_next) 4539 va = sva; 4540 } 4541 if (va != va_next && invalidate) 4542 pmap_s1_invalidate_range(pmap, va, sva, true); 4543 } 4544 } 4545 4546 static void 4547 pmap_mask_set(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, pt_entry_t mask, 4548 pt_entry_t nbits, bool invalidate) 4549 { 4550 PMAP_LOCK(pmap); 4551 pmap_mask_set_locked(pmap, sva, eva, mask, nbits, invalidate); 4552 PMAP_UNLOCK(pmap); 4553 } 4554 4555 /* 4556 * Set the physical protection on the 4557 * specified range of this map as requested. 
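 *
 * Protection can only be reduced here.  As the body below shows,
 * removing VM_PROT_WRITE translates into setting
 * ATTR_S1_AP(ATTR_S1_AP_RO) and clearing ATTR_SW_DBM, removing
 * VM_PROT_EXECUTE into setting ATTR_S1_XN, and revoking all
 * permissions is forwarded to pmap_remove() instead.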
4558 */ 4559 void 4560 pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) 4561 { 4562 pt_entry_t mask, nbits; 4563 4564 PMAP_ASSERT_STAGE1(pmap); 4565 KASSERT((prot & ~VM_PROT_ALL) == 0, ("invalid prot %x", prot)); 4566 if (prot == VM_PROT_NONE) { 4567 pmap_remove(pmap, sva, eva); 4568 return; 4569 } 4570 4571 mask = nbits = 0; 4572 if ((prot & VM_PROT_WRITE) == 0) { 4573 mask |= ATTR_S1_AP_RW_BIT | ATTR_SW_DBM; 4574 nbits |= ATTR_S1_AP(ATTR_S1_AP_RO); 4575 } 4576 if ((prot & VM_PROT_EXECUTE) == 0) { 4577 mask |= ATTR_S1_XN; 4578 nbits |= ATTR_S1_XN; 4579 } 4580 if (pmap == kernel_pmap) { 4581 mask |= ATTR_KERN_GP; 4582 nbits |= ATTR_KERN_GP; 4583 } 4584 if (mask == 0) 4585 return; 4586 4587 pmap_mask_set(pmap, sva, eva, mask, nbits, true); 4588 } 4589 4590 void 4591 pmap_disable_promotion(vm_offset_t sva, vm_size_t size) 4592 { 4593 4594 MPASS((sva & L3_OFFSET) == 0); 4595 MPASS(((sva + size) & L3_OFFSET) == 0); 4596 4597 pmap_mask_set(kernel_pmap, sva, sva + size, ATTR_SW_NO_PROMOTE, 4598 ATTR_SW_NO_PROMOTE, false); 4599 } 4600 4601 /* 4602 * Inserts the specified page table page into the specified pmap's collection 4603 * of idle page table pages. Each of a pmap's page table pages is responsible 4604 * for mapping a distinct range of virtual addresses. The pmap's collection is 4605 * ordered by this virtual address range. 4606 * 4607 * If "promoted" is false, then the page table page "mpte" must be zero filled; 4608 * "mpte"'s valid field will be set to 0. 4609 * 4610 * If "promoted" is true and "all_l3e_AF_set" is false, then "mpte" must 4611 * contain valid mappings with identical attributes except for ATTR_AF; 4612 * "mpte"'s valid field will be set to 1. 4613 * 4614 * If "promoted" and "all_l3e_AF_set" are both true, then "mpte" must contain 4615 * valid mappings with identical attributes including ATTR_AF; "mpte"'s valid 4616 * field will be set to VM_PAGE_BITS_ALL. 4617 */ 4618 static __inline int 4619 pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte, bool promoted, 4620 bool all_l3e_AF_set) 4621 { 4622 4623 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4624 KASSERT(promoted || !all_l3e_AF_set, 4625 ("a zero-filled PTP can't have ATTR_AF set in every PTE")); 4626 mpte->valid = promoted ? (all_l3e_AF_set ? VM_PAGE_BITS_ALL : 1) : 0; 4627 return (vm_radix_insert(&pmap->pm_root, mpte)); 4628 } 4629 4630 /* 4631 * Removes the page table page mapping the specified virtual address from the 4632 * specified pmap's collection of idle page table pages, and returns it. 4633 * Otherwise, returns NULL if there is no page table page corresponding to the 4634 * specified virtual address. 4635 */ 4636 static __inline vm_page_t 4637 pmap_remove_pt_page(pmap_t pmap, vm_offset_t va) 4638 { 4639 4640 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4641 return (vm_radix_remove(&pmap->pm_root, pmap_l2_pindex(va))); 4642 } 4643 4644 /* 4645 * Performs a break-before-make update of a pmap entry. This is needed when 4646 * either promoting or demoting pages to ensure the TLB doesn't get into an 4647 * inconsistent state. 4648 */ 4649 static void 4650 pmap_update_entry(pmap_t pmap, pd_entry_t *ptep, pd_entry_t newpte, 4651 vm_offset_t va, vm_size_t size) 4652 { 4653 register_t intr; 4654 4655 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4656 KASSERT((newpte & ATTR_SW_NO_PROMOTE) == 0, 4657 ("%s: Updating non-promote pte", __func__)); 4658 4659 /* 4660 * Ensure we don't get switched out with the page table in an 4661 * inconsistent state. 
We also need to ensure no interrupts fire 4662 * as they may make use of an address we are about to invalidate. 4663 */ 4664 intr = intr_disable(); 4665 4666 /* 4667 * Clear the old mapping's valid bit, but leave the rest of the entry 4668 * unchanged, so that a lockless, concurrent pmap_kextract() can still 4669 * lookup the physical address. 4670 */ 4671 pmap_clear_bits(ptep, ATTR_DESCR_VALID); 4672 4673 /* 4674 * When promoting, the L{1,2}_TABLE entry that is being replaced might 4675 * be cached, so we invalidate intermediate entries as well as final 4676 * entries. 4677 */ 4678 pmap_s1_invalidate_range(pmap, va, va + size, false); 4679 4680 /* Create the new mapping */ 4681 pmap_store(ptep, newpte); 4682 dsb(ishst); 4683 4684 intr_restore(intr); 4685 } 4686 4687 /* 4688 * Performs a break-before-make update of an ATTR_CONTIGUOUS mapping. 4689 */ 4690 static void __nosanitizecoverage 4691 pmap_update_strided(pmap_t pmap, pd_entry_t *ptep, pd_entry_t *ptep_end, 4692 pd_entry_t newpte, vm_offset_t va, vm_offset_t stride, vm_size_t size) 4693 { 4694 pd_entry_t *lip; 4695 register_t intr; 4696 4697 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4698 KASSERT((newpte & ATTR_SW_NO_PROMOTE) == 0, 4699 ("%s: Updating non-promote pte", __func__)); 4700 4701 /* 4702 * Ensure we don't get switched out with the page table in an 4703 * inconsistent state. We also need to ensure no interrupts fire 4704 * as they may make use of an address we are about to invalidate. 4705 */ 4706 intr = intr_disable(); 4707 4708 /* 4709 * Clear the old mapping's valid bits, but leave the rest of each 4710 * entry unchanged, so that a lockless, concurrent pmap_kextract() can 4711 * still lookup the physical address. 4712 */ 4713 for (lip = ptep; lip < ptep_end; lip++) 4714 pmap_clear_bits(lip, ATTR_DESCR_VALID); 4715 4716 /* Only final entries are changing. */ 4717 pmap_s1_invalidate_strided(pmap, va, va + size, stride, true); 4718 4719 /* Create the new mapping. */ 4720 for (lip = ptep; lip < ptep_end; lip++) { 4721 pmap_store(lip, newpte); 4722 newpte += stride; 4723 } 4724 dsb(ishst); 4725 4726 intr_restore(intr); 4727 } 4728 4729 #if VM_NRESERVLEVEL > 0 4730 /* 4731 * After promotion from 512 4KB page mappings to a single 2MB page mapping, 4732 * replace the many pv entries for the 4KB page mappings by a single pv entry 4733 * for the 2MB page mapping. 4734 */ 4735 static void 4736 pmap_pv_promote_l2(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, 4737 struct rwlock **lockp) 4738 { 4739 struct md_page *pvh; 4740 pv_entry_t pv; 4741 vm_offset_t va_last; 4742 vm_page_t m; 4743 4744 KASSERT((pa & L2_OFFSET) == 0, 4745 ("pmap_pv_promote_l2: pa is not 2mpage aligned")); 4746 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); 4747 4748 /* 4749 * Transfer the first page's pv entry for this mapping to the 2mpage's 4750 * pv list. Aside from avoiding the cost of a call to get_pv_entry(), 4751 * a transfer avoids the possibility that get_pv_entry() calls 4752 * reclaim_pv_chunk() and that reclaim_pv_chunk() removes one of the 4753 * mappings that is being promoted. 4754 */ 4755 m = PHYS_TO_VM_PAGE(pa); 4756 va = va & ~L2_OFFSET; 4757 pv = pmap_pvh_remove(&m->md, pmap, va); 4758 KASSERT(pv != NULL, ("pmap_pv_promote_l2: pv not found")); 4759 pvh = page_to_pvh(m); 4760 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); 4761 pvh->pv_gen++; 4762 /* Free the remaining NPTEPG - 1 pv entries. 
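 * (That is, the 511 pv entries for the other 4KB mappings, which are
 * superseded by the single 2MB pv entry created above.)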
*/ 4763 va_last = va + L2_SIZE - PAGE_SIZE; 4764 do { 4765 m++; 4766 va += PAGE_SIZE; 4767 pmap_pvh_free(&m->md, pmap, va); 4768 } while (va < va_last); 4769 } 4770 4771 /* 4772 * Tries to promote the 512, contiguous 4KB page mappings that are within a 4773 * single level 2 table entry to a single 2MB page mapping. For promotion 4774 * to occur, two conditions must be met: (1) the 4KB page mappings must map 4775 * aligned, contiguous physical memory and (2) the 4KB page mappings must have 4776 * identical characteristics. 4777 */ 4778 static bool 4779 pmap_promote_l2(pmap_t pmap, pd_entry_t *l2, vm_offset_t va, vm_page_t mpte, 4780 struct rwlock **lockp) 4781 { 4782 pt_entry_t all_l3e_AF, *firstl3, *l3, newl2, oldl3, pa; 4783 4784 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4785 4786 /* 4787 * Currently, this function only supports promotion on stage 1 pmaps 4788 * because it tests stage 1 specific fields and performs a break- 4789 * before-make sequence that is incorrect for stage 2 pmaps. 4790 */ 4791 if (pmap->pm_stage != PM_STAGE1 || !pmap_ps_enabled(pmap)) 4792 return (false); 4793 4794 /* 4795 * Examine the first L3E in the specified PTP. Abort if this L3E is 4796 * ineligible for promotion... 4797 */ 4798 firstl3 = (pt_entry_t *)PHYS_TO_DMAP(PTE_TO_PHYS(pmap_load(l2))); 4799 newl2 = pmap_load(firstl3); 4800 if ((newl2 & ATTR_SW_NO_PROMOTE) != 0) 4801 return (false); 4802 /* ... is not the first physical page within an L2 block */ 4803 if ((PTE_TO_PHYS(newl2) & L2_OFFSET) != 0 || 4804 ((newl2 & ATTR_DESCR_MASK) != L3_PAGE)) { /* ... or is invalid */ 4805 atomic_add_long(&pmap_l2_p_failures, 1); 4806 CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx" 4807 " in pmap %p", va, pmap); 4808 return (false); 4809 } 4810 4811 /* 4812 * Both here and in the below "for" loop, to allow for repromotion 4813 * after MADV_FREE, conditionally write protect a clean L3E before 4814 * possibly aborting the promotion due to other L3E attributes. Why? 4815 * Suppose that MADV_FREE is applied to a part of a superpage, the 4816 * address range [S, E). pmap_advise() will demote the superpage 4817 * mapping, destroy the 4KB page mapping at the end of [S, E), and 4818 * set AP_RO and clear AF in the L3Es for the rest of [S, E). Later, 4819 * imagine that the memory in [S, E) is recycled, but the last 4KB 4820 * page in [S, E) is not the last to be rewritten, or simply accessed. 4821 * In other words, there is still a 4KB page in [S, E), call it P, 4822 * that is writeable but AP_RO is set and AF is clear in P's L3E. 4823 * Unless we write protect P before aborting the promotion, if and 4824 * when P is finally rewritten, there won't be a page fault to trigger 4825 * repromotion. 4826 */ 4827 setl2: 4828 if ((newl2 & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) == 4829 (ATTR_S1_AP(ATTR_S1_AP_RO) | ATTR_SW_DBM)) { 4830 /* 4831 * When the mapping is clean, i.e., ATTR_S1_AP_RO is set, 4832 * ATTR_SW_DBM can be cleared without a TLB invalidation. 4833 */ 4834 if (!atomic_fcmpset_64(firstl3, &newl2, newl2 & ~ATTR_SW_DBM)) 4835 goto setl2; 4836 newl2 &= ~ATTR_SW_DBM; 4837 CTR2(KTR_PMAP, "pmap_promote_l2: protect for va %#lx" 4838 " in pmap %p", va & ~L2_OFFSET, pmap); 4839 } 4840 4841 /* 4842 * Examine each of the other L3Es in the specified PTP. Abort if this 4843 * L3E maps an unexpected 4KB physical page or does not have identical 4844 * characteristics to the first L3E. If ATTR_AF is not set in every 4845 * PTE, then request that the PTP be refilled on demotion. 
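 * The loop below walks from the last L3E in the PTP down to the
 * second, checking that each entry maps the expected physical page
 * ("pa" steps down by PAGE_SIZE from the end of the 2MB range) and
 * matches the first L3E in every attribute covered by ATTR_PROMOTE,
 * while "all_l3e_AF" accumulates the logical AND of the ATTR_AF bits.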
4846 */ 4847 all_l3e_AF = newl2 & ATTR_AF; 4848 pa = (PTE_TO_PHYS(newl2) | (newl2 & ATTR_DESCR_MASK)) 4849 + L2_SIZE - PAGE_SIZE; 4850 for (l3 = firstl3 + NL3PG - 1; l3 > firstl3; l3--) { 4851 oldl3 = pmap_load(l3); 4852 if ((PTE_TO_PHYS(oldl3) | (oldl3 & ATTR_DESCR_MASK)) != pa) { 4853 atomic_add_long(&pmap_l2_p_failures, 1); 4854 CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx" 4855 " in pmap %p", va, pmap); 4856 return (false); 4857 } 4858 setl3: 4859 if ((oldl3 & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) == 4860 (ATTR_S1_AP(ATTR_S1_AP_RO) | ATTR_SW_DBM)) { 4861 /* 4862 * When the mapping is clean, i.e., ATTR_S1_AP_RO is 4863 * set, ATTR_SW_DBM can be cleared without a TLB 4864 * invalidation. 4865 */ 4866 if (!atomic_fcmpset_64(l3, &oldl3, oldl3 & 4867 ~ATTR_SW_DBM)) 4868 goto setl3; 4869 oldl3 &= ~ATTR_SW_DBM; 4870 } 4871 if ((oldl3 & ATTR_PROMOTE) != (newl2 & ATTR_PROMOTE)) { 4872 atomic_add_long(&pmap_l2_p_failures, 1); 4873 CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx" 4874 " in pmap %p", va, pmap); 4875 return (false); 4876 } 4877 all_l3e_AF &= oldl3; 4878 pa -= PAGE_SIZE; 4879 } 4880 4881 /* 4882 * Unless all PTEs have ATTR_AF set, clear it from the superpage 4883 * mapping, so that promotions triggered by speculative mappings, 4884 * such as pmap_enter_quick(), don't automatically mark the 4885 * underlying pages as referenced. 4886 */ 4887 newl2 &= ~(ATTR_CONTIGUOUS | ATTR_AF | ATTR_DESCR_MASK) | all_l3e_AF; 4888 4889 /* 4890 * Save the page table page in its current state until the L2 4891 * mapping the superpage is demoted by pmap_demote_l2() or 4892 * destroyed by pmap_remove_l3(). 4893 */ 4894 if (mpte == NULL) 4895 mpte = PTE_TO_VM_PAGE(pmap_load(l2)); 4896 KASSERT(mpte >= vm_page_array && 4897 mpte < &vm_page_array[vm_page_array_size], 4898 ("pmap_promote_l2: page table page is out of range")); 4899 KASSERT(mpte->pindex == pmap_l2_pindex(va), 4900 ("pmap_promote_l2: page table page's pindex is wrong")); 4901 if (pmap_insert_pt_page(pmap, mpte, true, all_l3e_AF != 0)) { 4902 atomic_add_long(&pmap_l2_p_failures, 1); 4903 CTR2(KTR_PMAP, 4904 "pmap_promote_l2: failure for va %#lx in pmap %p", va, 4905 pmap); 4906 return (false); 4907 } 4908 4909 if ((newl2 & ATTR_SW_MANAGED) != 0) 4910 pmap_pv_promote_l2(pmap, va, PTE_TO_PHYS(newl2), lockp); 4911 4912 pmap_update_entry(pmap, l2, newl2 | L2_BLOCK, va & ~L2_OFFSET, L2_SIZE); 4913 4914 atomic_add_long(&pmap_l2_promotions, 1); 4915 CTR2(KTR_PMAP, "pmap_promote_l2: success for va %#lx in pmap %p", va, 4916 pmap); 4917 return (true); 4918 } 4919 4920 /* 4921 * Tries to promote an aligned, contiguous set of base page mappings to a 4922 * single L3C page mapping. For promotion to occur, two conditions must be 4923 * met: (1) the base page mappings must map aligned, contiguous physical 4924 * memory and (2) the base page mappings must have identical characteristics 4925 * except for the accessed flag. 4926 */ 4927 static bool 4928 pmap_promote_l3c(pmap_t pmap, pd_entry_t *l3p, vm_offset_t va) 4929 { 4930 pd_entry_t all_l3e_AF, firstl3c, *l3, oldl3, pa; 4931 4932 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4933 4934 /* 4935 * Currently, this function only supports promotion on stage 1 pmaps 4936 * because it tests stage 1 specific fields and performs a break- 4937 * before-make sequence that is incorrect for stage 2 pmaps. 4938 */ 4939 if (pmap->pm_stage != PM_STAGE1 || !pmap_ps_enabled(pmap)) 4940 return (false); 4941 4942 /* 4943 * Compute the address of the first L3 entry in the superpage 4944 * candidate. 
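 * Rounding the pointer down to a multiple of
 * L3C_ENTRIES * sizeof(pt_entry_t) lands on the first entry of the
 * naturally aligned group; with the usual 4KB granule that is 16
 * eight-byte entries, i.e. a 128-byte aligned position within the
 * page table page.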
4945 */ 4946 l3p = (pt_entry_t *)((uintptr_t)l3p & ~((L3C_ENTRIES * 4947 sizeof(pt_entry_t)) - 1)); 4948 4949 firstl3c = pmap_load(l3p); 4950 4951 /* 4952 * Examine the first L3 entry. Abort if this L3E is ineligible for 4953 * promotion... 4954 */ 4955 if ((firstl3c & ATTR_SW_NO_PROMOTE) != 0) 4956 return (false); 4957 /* ...is not properly aligned... */ 4958 if ((PTE_TO_PHYS(firstl3c) & L3C_OFFSET) != 0 || 4959 (firstl3c & ATTR_DESCR_MASK) != L3_PAGE) { /* ...or is invalid. */ 4960 counter_u64_add(pmap_l3c_p_failures, 1); 4961 CTR2(KTR_PMAP, "pmap_promote_l3c: failure for va %#lx" 4962 " in pmap %p", va, pmap); 4963 return (false); 4964 } 4965 4966 /* 4967 * If the first L3 entry is a clean read-write mapping, convert it 4968 * to a read-only mapping. See pmap_promote_l2() for the rationale. 4969 */ 4970 set_first: 4971 if ((firstl3c & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) == 4972 (ATTR_S1_AP(ATTR_S1_AP_RO) | ATTR_SW_DBM)) { 4973 /* 4974 * When the mapping is clean, i.e., ATTR_S1_AP_RO is set, 4975 * ATTR_SW_DBM can be cleared without a TLB invalidation. 4976 */ 4977 if (!atomic_fcmpset_64(l3p, &firstl3c, firstl3c & ~ATTR_SW_DBM)) 4978 goto set_first; 4979 firstl3c &= ~ATTR_SW_DBM; 4980 CTR2(KTR_PMAP, "pmap_promote_l3c: protect for va %#lx" 4981 " in pmap %p", va & ~L3C_OFFSET, pmap); 4982 } 4983 4984 /* 4985 * Check that the rest of the L3 entries are compatible with the first, 4986 * and convert clean read-write mappings to read-only mappings. 4987 */ 4988 all_l3e_AF = firstl3c & ATTR_AF; 4989 pa = (PTE_TO_PHYS(firstl3c) | (firstl3c & ATTR_DESCR_MASK)) + 4990 L3C_SIZE - PAGE_SIZE; 4991 for (l3 = l3p + L3C_ENTRIES - 1; l3 > l3p; l3--) { 4992 oldl3 = pmap_load(l3); 4993 if ((PTE_TO_PHYS(oldl3) | (oldl3 & ATTR_DESCR_MASK)) != pa) { 4994 counter_u64_add(pmap_l3c_p_failures, 1); 4995 CTR2(KTR_PMAP, "pmap_promote_l3c: failure for va %#lx" 4996 " in pmap %p", va, pmap); 4997 return (false); 4998 } 4999 set_l3: 5000 if ((oldl3 & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) == 5001 (ATTR_S1_AP(ATTR_S1_AP_RO) | ATTR_SW_DBM)) { 5002 /* 5003 * When the mapping is clean, i.e., ATTR_S1_AP_RO is 5004 * set, ATTR_SW_DBM can be cleared without a TLB 5005 * invalidation. 5006 */ 5007 if (!atomic_fcmpset_64(l3, &oldl3, oldl3 & 5008 ~ATTR_SW_DBM)) 5009 goto set_l3; 5010 oldl3 &= ~ATTR_SW_DBM; 5011 CTR2(KTR_PMAP, "pmap_promote_l3c: protect for va %#lx" 5012 " in pmap %p", (oldl3 & ~ATTR_MASK & L3C_OFFSET) | 5013 (va & ~L3C_OFFSET), pmap); 5014 } 5015 if ((oldl3 & ATTR_PROMOTE) != (firstl3c & ATTR_PROMOTE)) { 5016 counter_u64_add(pmap_l3c_p_failures, 1); 5017 CTR2(KTR_PMAP, "pmap_promote_l3c: failure for va %#lx" 5018 " in pmap %p", va, pmap); 5019 return (false); 5020 } 5021 all_l3e_AF &= oldl3; 5022 pa -= PAGE_SIZE; 5023 } 5024 5025 /* 5026 * Unless all PTEs have ATTR_AF set, clear it from the superpage 5027 * mapping, so that promotions triggered by speculative mappings, 5028 * such as pmap_enter_quick(), don't automatically mark the 5029 * underlying pages as referenced. 5030 */ 5031 firstl3c &= ~ATTR_AF | all_l3e_AF; 5032 5033 /* 5034 * Remake the mappings with the contiguous bit set. 
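 * pmap_update_strided() performs the break-before-make for the whole
 * set: it clears ATTR_DESCR_VALID in each of the L3C_ENTRIES entries,
 * invalidates the range, and then rewrites the entries starting from
 * "firstl3c | ATTR_CONTIGUOUS", stepping the physical address by
 * L3_SIZE per entry.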
5035 */ 5036 pmap_update_strided(pmap, l3p, l3p + L3C_ENTRIES, firstl3c | 5037 ATTR_CONTIGUOUS, va & ~L3C_OFFSET, L3_SIZE, L3C_SIZE); 5038 5039 counter_u64_add(pmap_l3c_promotions, 1); 5040 CTR2(KTR_PMAP, "pmap_promote_l3c: success for va %#lx in pmap %p", va, 5041 pmap); 5042 return (true); 5043 } 5044 #endif /* VM_NRESERVLEVEL > 0 */ 5045 5046 static int 5047 pmap_enter_largepage(pmap_t pmap, vm_offset_t va, pt_entry_t pte, int flags, 5048 int psind) 5049 { 5050 pd_entry_t *l0p, *l1p, *l2p, *l3p, newpte, origpte, *tl3p; 5051 vm_page_t mp; 5052 5053 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 5054 KASSERT(psind > 0 && psind < MAXPAGESIZES, 5055 ("psind %d unexpected", psind)); 5056 KASSERT((PTE_TO_PHYS(pte) & (pagesizes[psind] - 1)) == 0, 5057 ("unaligned phys address %#lx pte %#lx psind %d", 5058 PTE_TO_PHYS(pte), pte, psind)); 5059 5060 restart: 5061 newpte = pte; 5062 if (!pmap_bti_same(pmap, va, va + pagesizes[psind], &newpte)) 5063 return (KERN_PROTECTION_FAILURE); 5064 if (psind == 3) { 5065 PMAP_ASSERT_L1_BLOCKS_SUPPORTED; 5066 5067 KASSERT(pagesizes[psind] == L1_SIZE, 5068 ("pagesizes[%d] != L1_SIZE", psind)); 5069 l0p = pmap_l0(pmap, va); 5070 if ((pmap_load(l0p) & ATTR_DESCR_VALID) == 0) { 5071 mp = _pmap_alloc_l3(pmap, pmap_l0_pindex(va), NULL); 5072 if (mp == NULL) { 5073 if ((flags & PMAP_ENTER_NOSLEEP) != 0) 5074 return (KERN_RESOURCE_SHORTAGE); 5075 PMAP_UNLOCK(pmap); 5076 vm_wait(NULL); 5077 PMAP_LOCK(pmap); 5078 goto restart; 5079 } 5080 l1p = pmap_l0_to_l1(l0p, va); 5081 KASSERT(l1p != NULL, ("va %#lx lost l1 entry", va)); 5082 origpte = pmap_load(l1p); 5083 } else { 5084 l1p = pmap_l0_to_l1(l0p, va); 5085 KASSERT(l1p != NULL, ("va %#lx lost l1 entry", va)); 5086 origpte = pmap_load(l1p); 5087 if ((origpte & ATTR_DESCR_VALID) == 0) { 5088 mp = PTE_TO_VM_PAGE(pmap_load(l0p)); 5089 mp->ref_count++; 5090 } 5091 } 5092 KASSERT((PTE_TO_PHYS(origpte) == PTE_TO_PHYS(newpte) && 5093 (origpte & ATTR_DESCR_MASK) == L1_BLOCK) || 5094 (origpte & ATTR_DESCR_VALID) == 0, 5095 ("va %#lx changing 1G phys page l1 %#lx newpte %#lx", 5096 va, origpte, newpte)); 5097 pmap_store(l1p, newpte); 5098 } else if (psind == 2) { 5099 KASSERT(pagesizes[psind] == L2_SIZE, 5100 ("pagesizes[%d] != L2_SIZE", psind)); 5101 l2p = pmap_l2(pmap, va); 5102 if (l2p == NULL) { 5103 mp = _pmap_alloc_l3(pmap, pmap_l1_pindex(va), NULL); 5104 if (mp == NULL) { 5105 if ((flags & PMAP_ENTER_NOSLEEP) != 0) 5106 return (KERN_RESOURCE_SHORTAGE); 5107 PMAP_UNLOCK(pmap); 5108 vm_wait(NULL); 5109 PMAP_LOCK(pmap); 5110 goto restart; 5111 } 5112 l2p = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mp)); 5113 l2p = &l2p[pmap_l2_index(va)]; 5114 origpte = pmap_load(l2p); 5115 } else { 5116 l1p = pmap_l1(pmap, va); 5117 origpte = pmap_load(l2p); 5118 if ((origpte & ATTR_DESCR_VALID) == 0) { 5119 mp = PTE_TO_VM_PAGE(pmap_load(l1p)); 5120 mp->ref_count++; 5121 } 5122 } 5123 KASSERT((origpte & ATTR_DESCR_VALID) == 0 || 5124 ((origpte & ATTR_DESCR_MASK) == L2_BLOCK && 5125 PTE_TO_PHYS(origpte) == PTE_TO_PHYS(newpte)), 5126 ("va %#lx changing 2M phys page l2 %#lx newpte %#lx", 5127 va, origpte, newpte)); 5128 pmap_store(l2p, newpte); 5129 } else /* (psind == 1) */ { 5130 KASSERT(pagesizes[psind] == L3C_SIZE, 5131 ("pagesizes[%d] != L3C_SIZE", psind)); 5132 l2p = pmap_l2(pmap, va); 5133 if (l2p == NULL || (pmap_load(l2p) & ATTR_DESCR_VALID) == 0) { 5134 mp = _pmap_alloc_l3(pmap, pmap_l2_pindex(va), NULL); 5135 if (mp == NULL) { 5136 if ((flags & PMAP_ENTER_NOSLEEP) != 0) 5137 return (KERN_RESOURCE_SHORTAGE); 5138 PMAP_UNLOCK(pmap); 5139 
vm_wait(NULL); 5140 PMAP_LOCK(pmap); 5141 goto restart; 5142 } 5143 mp->ref_count += L3C_ENTRIES - 1; 5144 l3p = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mp)); 5145 l3p = &l3p[pmap_l3_index(va)]; 5146 } else { 5147 l3p = pmap_l2_to_l3(l2p, va); 5148 if ((pmap_load(l3p) & ATTR_DESCR_VALID) == 0) { 5149 mp = PTE_TO_VM_PAGE(pmap_load(l2p)); 5150 mp->ref_count += L3C_ENTRIES; 5151 } 5152 } 5153 for (tl3p = l3p; tl3p < &l3p[L3C_ENTRIES]; tl3p++) { 5154 origpte = pmap_load(tl3p); 5155 KASSERT((origpte & ATTR_DESCR_VALID) == 0 || 5156 ((origpte & ATTR_CONTIGUOUS) != 0 && 5157 PTE_TO_PHYS(origpte) == PTE_TO_PHYS(newpte)), 5158 ("va %#lx changing 64K phys page l3 %#lx newpte %#lx", 5159 va, origpte, newpte)); 5160 pmap_store(tl3p, newpte); 5161 newpte += L3_SIZE; 5162 } 5163 } 5164 dsb(ishst); 5165 5166 if ((origpte & ATTR_DESCR_VALID) == 0) 5167 pmap_resident_count_inc(pmap, pagesizes[psind] / PAGE_SIZE); 5168 if ((newpte & ATTR_SW_WIRED) != 0 && (origpte & ATTR_SW_WIRED) == 0) 5169 pmap->pm_stats.wired_count += pagesizes[psind] / PAGE_SIZE; 5170 else if ((newpte & ATTR_SW_WIRED) == 0 && 5171 (origpte & ATTR_SW_WIRED) != 0) 5172 pmap->pm_stats.wired_count -= pagesizes[psind] / PAGE_SIZE; 5173 5174 return (KERN_SUCCESS); 5175 } 5176 5177 /* 5178 * Insert the given physical page (p) at 5179 * the specified virtual address (v) in the 5180 * target physical map with the protection requested. 5181 * 5182 * If specified, the page will be wired down, meaning 5183 * that the related pte can not be reclaimed. 5184 * 5185 * NB: This is the only routine which MAY NOT lazy-evaluate 5186 * or lose information. That is, this routine must actually 5187 * insert this page into the given map NOW. 5188 */ 5189 int 5190 pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, 5191 u_int flags, int8_t psind) 5192 { 5193 struct rwlock *lock; 5194 pd_entry_t *pde; 5195 pt_entry_t new_l3, orig_l3; 5196 pt_entry_t *l2, *l3; 5197 pv_entry_t pv; 5198 vm_paddr_t opa, pa; 5199 vm_page_t mpte, om; 5200 bool nosleep; 5201 int full_lvl, lvl, rv; 5202 5203 KASSERT(ADDR_IS_CANONICAL(va), 5204 ("%s: Address not in canonical form: %lx", __func__, va)); 5205 5206 va = trunc_page(va); 5207 if ((m->oflags & VPO_UNMANAGED) == 0) 5208 VM_PAGE_OBJECT_BUSY_ASSERT(m); 5209 pa = VM_PAGE_TO_PHYS(m); 5210 new_l3 = (pt_entry_t)(PHYS_TO_PTE(pa) | ATTR_AF | pmap_sh_attr | 5211 L3_PAGE); 5212 new_l3 |= pmap_pte_memattr(pmap, m->md.pv_memattr); 5213 new_l3 |= pmap_pte_prot(pmap, prot); 5214 if ((flags & PMAP_ENTER_WIRED) != 0) 5215 new_l3 |= ATTR_SW_WIRED; 5216 if (pmap->pm_stage == PM_STAGE1) { 5217 if (!ADDR_IS_KERNEL(va)) 5218 new_l3 |= ATTR_S1_AP(ATTR_S1_AP_USER) | ATTR_S1_PXN; 5219 else 5220 new_l3 |= ATTR_S1_UXN; 5221 if (pmap != kernel_pmap) 5222 new_l3 |= ATTR_S1_nG; 5223 } else { 5224 /* 5225 * Clear the access flag on executable mappings, this will be 5226 * set later when the page is accessed. The fault handler is 5227 * required to invalidate the I-cache. 5228 * 5229 * TODO: Switch to the valid flag to allow hardware management 5230 * of the access flag. Much of the pmap code assumes the 5231 * valid flag is set and fails to destroy the old page tables 5232 * correctly if it is clear. 
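 *
 * For now, clearing ATTR_AF below means that the first access to the
 * page takes an access flag fault, which gives that fault handler the
 * opportunity to invalidate the I-cache before any instructions are
 * fetched from the page.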
5233 */ 5234 if (prot & VM_PROT_EXECUTE) 5235 new_l3 &= ~ATTR_AF; 5236 } 5237 if ((m->oflags & VPO_UNMANAGED) == 0) { 5238 new_l3 |= ATTR_SW_MANAGED; 5239 if ((prot & VM_PROT_WRITE) != 0) { 5240 new_l3 |= ATTR_SW_DBM; 5241 if ((flags & VM_PROT_WRITE) == 0) { 5242 if (pmap->pm_stage == PM_STAGE1) 5243 new_l3 |= ATTR_S1_AP(ATTR_S1_AP_RO); 5244 else 5245 new_l3 &= 5246 ~ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE); 5247 } 5248 } 5249 } 5250 5251 CTR2(KTR_PMAP, "pmap_enter: %.16lx -> %.16lx", va, pa); 5252 5253 lock = NULL; 5254 PMAP_LOCK(pmap); 5255 if ((flags & PMAP_ENTER_LARGEPAGE) != 0) { 5256 KASSERT((m->oflags & VPO_UNMANAGED) != 0, 5257 ("managed largepage va %#lx flags %#x", va, flags)); 5258 if (psind == 3) { 5259 PMAP_ASSERT_L1_BLOCKS_SUPPORTED; 5260 new_l3 &= ~L3_PAGE; 5261 new_l3 |= L1_BLOCK; 5262 } else if (psind == 2) { 5263 new_l3 &= ~L3_PAGE; 5264 new_l3 |= L2_BLOCK; 5265 } else /* (psind == 1) */ 5266 new_l3 |= ATTR_CONTIGUOUS; 5267 rv = pmap_enter_largepage(pmap, va, new_l3, flags, psind); 5268 goto out; 5269 } 5270 if (psind == 2) { 5271 /* Assert the required virtual and physical alignment. */ 5272 KASSERT((va & L2_OFFSET) == 0, ("pmap_enter: va unaligned")); 5273 KASSERT(m->psind > 1, ("pmap_enter: m->psind < psind")); 5274 rv = pmap_enter_l2(pmap, va, (new_l3 & ~L3_PAGE) | L2_BLOCK, 5275 flags, m, &lock); 5276 goto out; 5277 } 5278 mpte = NULL; 5279 if (psind == 1) { 5280 KASSERT((va & L3C_OFFSET) == 0, ("pmap_enter: va unaligned")); 5281 KASSERT(m->psind > 0, ("pmap_enter: m->psind < psind")); 5282 rv = pmap_enter_l3c(pmap, va, new_l3 | ATTR_CONTIGUOUS, flags, 5283 m, &mpte, &lock); 5284 #if VM_NRESERVLEVEL > 0 5285 /* 5286 * Attempt L2 promotion, if both the PTP and a level 1 5287 * reservation are fully populated. 5288 */ 5289 if (rv == KERN_SUCCESS && 5290 (mpte == NULL || mpte->ref_count == NL3PG) && 5291 (m->flags & PG_FICTITIOUS) == 0 && 5292 vm_reserv_level_iffullpop(m) == 1) { 5293 pde = pmap_l2(pmap, va); 5294 (void)pmap_promote_l2(pmap, pde, va, mpte, &lock); 5295 } 5296 #endif 5297 goto out; 5298 } 5299 5300 /* 5301 * In the case that a page table page is not 5302 * resident, we are creating it here. 5303 */ 5304 retry: 5305 pde = pmap_pde(pmap, va, &lvl); 5306 if (pde != NULL && lvl == 2) { 5307 l3 = pmap_l2_to_l3(pde, va); 5308 if (!ADDR_IS_KERNEL(va) && mpte == NULL) { 5309 mpte = PTE_TO_VM_PAGE(pmap_load(pde)); 5310 mpte->ref_count++; 5311 } 5312 goto havel3; 5313 } else if (pde != NULL && lvl == 1) { 5314 l2 = pmap_l1_to_l2(pde, va); 5315 if ((pmap_load(l2) & ATTR_DESCR_MASK) == L2_BLOCK && 5316 (l3 = pmap_demote_l2_locked(pmap, l2, va, &lock)) != NULL) { 5317 l3 = &l3[pmap_l3_index(va)]; 5318 if (!ADDR_IS_KERNEL(va)) { 5319 mpte = PTE_TO_VM_PAGE(pmap_load(l2)); 5320 mpte->ref_count++; 5321 } 5322 goto havel3; 5323 } 5324 /* We need to allocate an L3 table. */ 5325 } 5326 if (!ADDR_IS_KERNEL(va)) { 5327 nosleep = (flags & PMAP_ENTER_NOSLEEP) != 0; 5328 5329 /* 5330 * We use _pmap_alloc_l3() instead of pmap_alloc_l3() in order 5331 * to handle the possibility that a superpage mapping for "va" 5332 * was created while we slept. 5333 */ 5334 mpte = _pmap_alloc_l3(pmap, pmap_l2_pindex(va), 5335 nosleep ? 
NULL : &lock); 5336 if (mpte == NULL && nosleep) { 5337 CTR0(KTR_PMAP, "pmap_enter: mpte == NULL"); 5338 rv = KERN_RESOURCE_SHORTAGE; 5339 goto out; 5340 } 5341 goto retry; 5342 } else 5343 panic("pmap_enter: missing L3 table for kernel va %#lx", va); 5344 5345 havel3: 5346 orig_l3 = pmap_load(l3); 5347 opa = PTE_TO_PHYS(orig_l3); 5348 pv = NULL; 5349 new_l3 |= pmap_pte_bti(pmap, va); 5350 5351 /* 5352 * Is the specified virtual address already mapped? 5353 */ 5354 if (pmap_l3_valid(orig_l3)) { 5355 /* 5356 * Wiring change, just update stats. We don't worry about 5357 * wiring PT pages as they remain resident as long as there 5358 * are valid mappings in them. Hence, if a user page is wired, 5359 * the PT page will be also. 5360 */ 5361 if ((flags & PMAP_ENTER_WIRED) != 0 && 5362 (orig_l3 & ATTR_SW_WIRED) == 0) 5363 pmap->pm_stats.wired_count++; 5364 else if ((flags & PMAP_ENTER_WIRED) == 0 && 5365 (orig_l3 & ATTR_SW_WIRED) != 0) 5366 pmap->pm_stats.wired_count--; 5367 5368 /* 5369 * Remove the extra PT page reference. 5370 */ 5371 if (mpte != NULL) { 5372 mpte->ref_count--; 5373 KASSERT(mpte->ref_count > 0, 5374 ("pmap_enter: missing reference to page table page," 5375 " va: 0x%lx", va)); 5376 } 5377 5378 /* 5379 * Has the physical page changed? 5380 */ 5381 if (opa == pa) { 5382 /* 5383 * No, might be a protection or wiring change. 5384 */ 5385 if ((orig_l3 & ATTR_SW_MANAGED) != 0 && 5386 (new_l3 & ATTR_SW_DBM) != 0) 5387 vm_page_aflag_set(m, PGA_WRITEABLE); 5388 goto validate; 5389 } 5390 5391 /* 5392 * The physical page has changed. Temporarily invalidate 5393 * the mapping. 5394 */ 5395 if ((orig_l3 & ATTR_CONTIGUOUS) != 0) 5396 (void)pmap_demote_l3c(pmap, l3, va); 5397 orig_l3 = pmap_load_clear(l3); 5398 KASSERT(PTE_TO_PHYS(orig_l3) == opa, 5399 ("pmap_enter: unexpected pa update for %#lx", va)); 5400 if ((orig_l3 & ATTR_SW_MANAGED) != 0) { 5401 om = PHYS_TO_VM_PAGE(opa); 5402 5403 /* 5404 * The pmap lock is sufficient to synchronize with 5405 * concurrent calls to pmap_page_test_mappings() and 5406 * pmap_ts_referenced(). 5407 */ 5408 if (pmap_pte_dirty(pmap, orig_l3)) 5409 vm_page_dirty(om); 5410 if ((orig_l3 & ATTR_AF) != 0) { 5411 pmap_invalidate_page(pmap, va, true); 5412 vm_page_aflag_set(om, PGA_REFERENCED); 5413 } 5414 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, om); 5415 pv = pmap_pvh_remove(&om->md, pmap, va); 5416 if ((m->oflags & VPO_UNMANAGED) != 0) 5417 free_pv_entry(pmap, pv); 5418 if ((om->a.flags & PGA_WRITEABLE) != 0 && 5419 TAILQ_EMPTY(&om->md.pv_list) && 5420 ((om->flags & PG_FICTITIOUS) != 0 || 5421 TAILQ_EMPTY(&page_to_pvh(om)->pv_list))) 5422 vm_page_aflag_clear(om, PGA_WRITEABLE); 5423 } else { 5424 KASSERT((orig_l3 & ATTR_AF) != 0, 5425 ("pmap_enter: unmanaged mapping lacks ATTR_AF")); 5426 pmap_invalidate_page(pmap, va, true); 5427 } 5428 orig_l3 = 0; 5429 } else { 5430 /* 5431 * Increment the counters. 5432 */ 5433 if ((new_l3 & ATTR_SW_WIRED) != 0) 5434 pmap->pm_stats.wired_count++; 5435 pmap_resident_count_inc(pmap, 1); 5436 } 5437 /* 5438 * Enter on the PV list if part of our managed memory. 
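 * Note that "pv" may already be non-NULL here: if the old mapping was
 * managed and the physical page changed, its pv entry was detached
 * above and is reused for the new page, avoiding a free/allocate
 * pair.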
*/ 5440 if ((m->oflags & VPO_UNMANAGED) == 0) { 5441 if (pv == NULL) { 5442 pv = get_pv_entry(pmap, &lock); 5443 pv->pv_va = va; 5444 } 5445 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m); 5446 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 5447 m->md.pv_gen++; 5448 if ((new_l3 & ATTR_SW_DBM) != 0) 5449 vm_page_aflag_set(m, PGA_WRITEABLE); 5450 } 5451 5452 validate: 5453 if (pmap->pm_stage == PM_STAGE1) { 5454 /* 5455 * Sync the icache if the mapping has exec permission and its 5456 * attribute is VM_MEMATTR_WRITE_BACK. Do it now, before the mapping 5457 * is stored and made valid for the hardware table walk. If it were 5458 * done later, another thread could access this page before the 5459 * caches are properly synced. Don't do it for kernel memory, which 5460 * is mapped with exec permission even if the memory isn't going 5461 * to hold executable code. The only time an icache sync is 5462 * needed is after a kernel module is loaded and its relocation 5463 * info is processed; that is done in elf_cpu_load_file(). 5464 */ 5465 if ((prot & VM_PROT_EXECUTE) && pmap != kernel_pmap && 5466 m->md.pv_memattr == VM_MEMATTR_WRITE_BACK && 5467 (opa != pa || (orig_l3 & ATTR_S1_XN))) { 5468 PMAP_ASSERT_STAGE1(pmap); 5469 cpu_icache_sync_range((void *)PHYS_TO_DMAP(pa), 5470 PAGE_SIZE); 5471 } 5472 } else { 5473 cpu_dcache_wb_range((void *)PHYS_TO_DMAP(pa), PAGE_SIZE); 5474 } 5475 5476 /* 5477 * Update the L3 entry. 5478 */ 5479 if (pmap_l3_valid(orig_l3)) { 5480 KASSERT(opa == pa, ("pmap_enter: invalid update")); 5481 if ((orig_l3 & ~ATTR_AF) != (new_l3 & ~ATTR_AF)) { 5482 /* same PA, different attributes */ 5483 if ((orig_l3 & ATTR_CONTIGUOUS) != 0) 5484 (void)pmap_demote_l3c(pmap, l3, va); 5485 orig_l3 = pmap_load_store(l3, new_l3); 5486 pmap_invalidate_page(pmap, va, true); 5487 if ((orig_l3 & ATTR_SW_MANAGED) != 0 && 5488 pmap_pte_dirty(pmap, orig_l3)) 5489 vm_page_dirty(m); 5490 } else { 5491 /* 5492 * orig_l3 == new_l3 5493 * This can happen if multiple threads simultaneously 5494 * access a not yet mapped page. It is bad for performance 5495 * since it can cause a full demotion-NOP-promotion 5496 * cycle. 5497 * Other possible reasons are: 5498 * - the VM and pmap memory layouts have diverged 5499 * - a TLB flush is missing somewhere and the CPU doesn't 5500 * see the actual mapping. 5501 */ 5502 CTR4(KTR_PMAP, "%s: already mapped page - " 5503 "pmap %p va 0x%#lx pte 0x%lx", 5504 __func__, pmap, va, new_l3); 5505 } 5506 } else { 5507 /* New mapping */ 5508 pmap_store(l3, new_l3); 5509 dsb(ishst); 5510 } 5511 5512 #if VM_NRESERVLEVEL > 0 5513 /* 5514 * First, attempt L3C promotion, if the virtual and physical addresses 5515 * are aligned with each other and an underlying reservation has the 5516 * neighboring L3 pages allocated. The first condition is simply an 5517 * optimization that recognizes some eventual promotion failures early 5518 * at a lower run-time cost. Then, if both a level 1 reservation and 5519 * the PTP are fully populated, attempt L2 promotion. 5520 */ 5521 if ((va & L3C_OFFSET) == (pa & L3C_OFFSET) && 5522 (m->flags & PG_FICTITIOUS) == 0 && 5523 (full_lvl = vm_reserv_level_iffullpop(m)) >= 0 && 5524 pmap_promote_l3c(pmap, l3, va) && 5525 full_lvl == 1 && (mpte == NULL || mpte->ref_count == NL3PG)) 5526 (void)pmap_promote_l2(pmap, pde, va, mpte, &lock); 5527 #endif 5528 5529 rv = KERN_SUCCESS; 5530 out: 5531 if (lock != NULL) 5532 rw_wunlock(lock); 5533 PMAP_UNLOCK(pmap); 5534 return (rv); 5535 } 5536 5537 /* 5538 * Tries to create a read- and/or execute-only L2 page mapping. Returns 5539 * KERN_SUCCESS if the mapping was created.
Otherwise, returns an error 5540 * value. See pmap_enter_l2() for the possible error values when "no sleep", 5541 * "no replace", and "no reclaim" are specified. 5542 */ 5543 static int 5544 pmap_enter_l2_rx(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, 5545 struct rwlock **lockp) 5546 { 5547 pd_entry_t new_l2; 5548 5549 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 5550 PMAP_ASSERT_STAGE1(pmap); 5551 KASSERT(ADDR_IS_CANONICAL(va), 5552 ("%s: Address not in canonical form: %lx", __func__, va)); 5553 5554 new_l2 = (pd_entry_t)(VM_PAGE_TO_PTE(m) | pmap_sh_attr | 5555 ATTR_S1_IDX(m->md.pv_memattr) | ATTR_S1_AP(ATTR_S1_AP_RO) | 5556 L2_BLOCK); 5557 if ((m->oflags & VPO_UNMANAGED) == 0) 5558 new_l2 |= ATTR_SW_MANAGED; 5559 else 5560 new_l2 |= ATTR_AF; 5561 if ((prot & VM_PROT_EXECUTE) == 0 || 5562 m->md.pv_memattr == VM_MEMATTR_DEVICE) 5563 new_l2 |= ATTR_S1_XN; 5564 if (!ADDR_IS_KERNEL(va)) 5565 new_l2 |= ATTR_S1_AP(ATTR_S1_AP_USER) | ATTR_S1_PXN; 5566 else 5567 new_l2 |= ATTR_S1_UXN; 5568 if (pmap != kernel_pmap) 5569 new_l2 |= ATTR_S1_nG; 5570 return (pmap_enter_l2(pmap, va, new_l2, PMAP_ENTER_NOSLEEP | 5571 PMAP_ENTER_NOREPLACE | PMAP_ENTER_NORECLAIM, m, lockp)); 5572 } 5573 5574 /* 5575 * Returns true if every page table entry in the specified page table is 5576 * zero. 5577 */ 5578 static bool 5579 pmap_every_pte_zero(vm_paddr_t pa) 5580 { 5581 pt_entry_t *pt_end, *pte; 5582 5583 KASSERT((pa & PAGE_MASK) == 0, ("pa is misaligned")); 5584 pte = (pt_entry_t *)PHYS_TO_DMAP(pa); 5585 for (pt_end = pte + Ln_ENTRIES; pte < pt_end; pte++) { 5586 if (*pte != 0) 5587 return (false); 5588 } 5589 return (true); 5590 } 5591 5592 /* 5593 * Tries to create the specified L2 page mapping. Returns KERN_SUCCESS if 5594 * the mapping was created, and one of KERN_FAILURE, KERN_NO_SPACE, or 5595 * KERN_RESOURCE_SHORTAGE otherwise. Returns KERN_FAILURE if 5596 * PMAP_ENTER_NOREPLACE was specified and a base page mapping already exists 5597 * within the L2 virtual address range starting at the specified virtual 5598 * address. Returns KERN_NO_SPACE if PMAP_ENTER_NOREPLACE was specified and a 5599 * L2 page mapping already exists at the specified virtual address. Returns 5600 * KERN_RESOURCE_SHORTAGE if either (1) PMAP_ENTER_NOSLEEP was specified and a 5601 * page table page allocation failed or (2) PMAP_ENTER_NORECLAIM was specified 5602 * and a PV entry allocation failed. 5603 */ 5604 static int 5605 pmap_enter_l2(pmap_t pmap, vm_offset_t va, pd_entry_t new_l2, u_int flags, 5606 vm_page_t m, struct rwlock **lockp) 5607 { 5608 struct spglist free; 5609 pd_entry_t *l2, old_l2; 5610 vm_page_t l2pg, mt; 5611 vm_page_t uwptpg; 5612 5613 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 5614 KASSERT(ADDR_IS_CANONICAL(va), 5615 ("%s: Address not in canonical form: %lx", __func__, va)); 5616 5617 if ((l2 = pmap_alloc_l2(pmap, va, &l2pg, (flags & 5618 PMAP_ENTER_NOSLEEP) != 0 ? NULL : lockp)) == NULL) { 5619 CTR2(KTR_PMAP, "pmap_enter_l2: failure for va %#lx in pmap %p", 5620 va, pmap); 5621 return (KERN_RESOURCE_SHORTAGE); 5622 } 5623 5624 /* 5625 * If bti is not the same for the whole l2 range, return failure 5626 * and let vm_fault() cope. Check after l2 allocation, since 5627 * it could sleep. 5628 */ 5629 if (!pmap_bti_same(pmap, va, va + L2_SIZE, &new_l2)) { 5630 KASSERT(l2pg != NULL, ("pmap_enter_l2: missing L2 PTP")); 5631 pmap_abort_ptp(pmap, va, l2pg); 5632 return (KERN_PROTECTION_FAILURE); 5633 } 5634 5635 /* 5636 * If there are existing mappings, either abort or remove them. 
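 * Under PMAP_ENTER_NOREPLACE, an existing L2 block mapping results in
 * KERN_NO_SPACE and a populated (or user-space) page table in
 * KERN_FAILURE; only an empty kernel page table is replaced.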
5637 */ 5638 if ((old_l2 = pmap_load(l2)) != 0) { 5639 KASSERT(l2pg == NULL || l2pg->ref_count > 1, 5640 ("pmap_enter_l2: l2pg's ref count is too low")); 5641 if ((flags & PMAP_ENTER_NOREPLACE) != 0) { 5642 if ((old_l2 & ATTR_DESCR_MASK) == L2_BLOCK) { 5643 if (l2pg != NULL) 5644 l2pg->ref_count--; 5645 CTR2(KTR_PMAP, 5646 "pmap_enter_l2: no space for va %#lx" 5647 " in pmap %p", va, pmap); 5648 return (KERN_NO_SPACE); 5649 } else if (!ADDR_IS_KERNEL(va) || 5650 !pmap_every_pte_zero(PTE_TO_PHYS(old_l2))) { 5651 if (l2pg != NULL) 5652 l2pg->ref_count--; 5653 CTR2(KTR_PMAP, 5654 "pmap_enter_l2: failure for va %#lx" 5655 " in pmap %p", va, pmap); 5656 return (KERN_FAILURE); 5657 } 5658 } 5659 SLIST_INIT(&free); 5660 if ((old_l2 & ATTR_DESCR_MASK) == L2_BLOCK) 5661 (void)pmap_remove_l2(pmap, l2, va, 5662 pmap_load(pmap_l1(pmap, va)), &free, lockp); 5663 else 5664 pmap_remove_l3_range(pmap, old_l2, va, va + L2_SIZE, 5665 &free, lockp); 5666 if (!ADDR_IS_KERNEL(va)) { 5667 vm_page_free_pages_toq(&free, true); 5668 KASSERT(pmap_load(l2) == 0, 5669 ("pmap_enter_l2: non-zero L2 entry %p", l2)); 5670 } else { 5671 KASSERT(SLIST_EMPTY(&free), 5672 ("pmap_enter_l2: freed kernel page table page")); 5673 5674 /* 5675 * Both pmap_remove_l2() and pmap_remove_l3_range() 5676 * will leave the kernel page table page zero filled. 5677 * Nonetheless, the TLB could have an intermediate 5678 * entry for the kernel page table page, so request 5679 * an invalidation at all levels after clearing 5680 * the L2_TABLE entry. 5681 */ 5682 mt = PTE_TO_VM_PAGE(pmap_load(l2)); 5683 if (pmap_insert_pt_page(pmap, mt, false, false)) 5684 panic("pmap_enter_l2: trie insert failed"); 5685 pmap_clear(l2); 5686 pmap_s1_invalidate_page(pmap, va, false); 5687 } 5688 } 5689 5690 /* 5691 * Allocate leaf ptpage for wired userspace pages. 5692 */ 5693 uwptpg = NULL; 5694 if ((new_l2 & ATTR_SW_WIRED) != 0 && pmap != kernel_pmap) { 5695 uwptpg = vm_page_alloc_noobj(VM_ALLOC_WIRED); 5696 if (uwptpg == NULL) { 5697 pmap_abort_ptp(pmap, va, l2pg); 5698 return (KERN_RESOURCE_SHORTAGE); 5699 } 5700 uwptpg->pindex = pmap_l2_pindex(va); 5701 if (pmap_insert_pt_page(pmap, uwptpg, true, false)) { 5702 vm_page_unwire_noq(uwptpg); 5703 vm_page_free(uwptpg); 5704 pmap_abort_ptp(pmap, va, l2pg); 5705 return (KERN_RESOURCE_SHORTAGE); 5706 } 5707 pmap_resident_count_inc(pmap, 1); 5708 uwptpg->ref_count = NL3PG; 5709 } 5710 if ((new_l2 & ATTR_SW_MANAGED) != 0) { 5711 /* 5712 * Abort this mapping if its PV entry could not be created. 5713 */ 5714 if (!pmap_pv_insert_l2(pmap, va, new_l2, flags, lockp)) { 5715 if (l2pg != NULL) 5716 pmap_abort_ptp(pmap, va, l2pg); 5717 if (uwptpg != NULL) { 5718 mt = pmap_remove_pt_page(pmap, va); 5719 KASSERT(mt == uwptpg, 5720 ("removed pt page %p, expected %p", mt, 5721 uwptpg)); 5722 pmap_resident_count_dec(pmap, 1); 5723 uwptpg->ref_count = 1; 5724 vm_page_unwire_noq(uwptpg); 5725 vm_page_free(uwptpg); 5726 } 5727 CTR2(KTR_PMAP, 5728 "pmap_enter_l2: failure for va %#lx in pmap %p", 5729 va, pmap); 5730 return (KERN_RESOURCE_SHORTAGE); 5731 } 5732 if ((new_l2 & ATTR_SW_DBM) != 0) 5733 for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++) 5734 vm_page_aflag_set(mt, PGA_WRITEABLE); 5735 } 5736 5737 /* 5738 * Increment counters. 5739 */ 5740 if ((new_l2 & ATTR_SW_WIRED) != 0) 5741 pmap->pm_stats.wired_count += L2_SIZE / PAGE_SIZE; 5742 pmap->pm_stats.resident_count += L2_SIZE / PAGE_SIZE; 5743 5744 /* 5745 * Conditionally sync the icache. See pmap_enter() for details. 
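 * The sync is limited to executable, write-back user mappings whose
 * physical address or XN state has changed.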
5746 */ 5747 if ((new_l2 & ATTR_S1_XN) == 0 && (PTE_TO_PHYS(new_l2) != 5748 PTE_TO_PHYS(old_l2) || (old_l2 & ATTR_S1_XN) != 0) && 5749 pmap != kernel_pmap && m->md.pv_memattr == VM_MEMATTR_WRITE_BACK) { 5750 cpu_icache_sync_range((void *)PHYS_TO_DMAP(PTE_TO_PHYS(new_l2)), 5751 L2_SIZE); 5752 } 5753 5754 /* 5755 * Map the superpage. 5756 */ 5757 pmap_store(l2, new_l2); 5758 dsb(ishst); 5759 5760 atomic_add_long(&pmap_l2_mappings, 1); 5761 CTR2(KTR_PMAP, "pmap_enter_l2: success for va %#lx in pmap %p", 5762 va, pmap); 5763 5764 return (KERN_SUCCESS); 5765 } 5766 5767 /* 5768 * Tries to create a read- and/or execute-only L3C page mapping. Returns 5769 * KERN_SUCCESS if the mapping was created. Otherwise, returns an error 5770 * value. 5771 */ 5772 static int 5773 pmap_enter_l3c_rx(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_page_t *ml3p, 5774 vm_prot_t prot, struct rwlock **lockp) 5775 { 5776 pt_entry_t l3e; 5777 5778 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 5779 PMAP_ASSERT_STAGE1(pmap); 5780 KASSERT(ADDR_IS_CANONICAL(va), 5781 ("%s: Address not in canonical form: %lx", __func__, va)); 5782 5783 l3e = VM_PAGE_TO_PTE(m) | pmap_sh_attr | 5784 ATTR_S1_IDX(m->md.pv_memattr) | ATTR_S1_AP(ATTR_S1_AP_RO) | 5785 ATTR_CONTIGUOUS | L3_PAGE; 5786 if ((m->oflags & VPO_UNMANAGED) == 0) 5787 l3e |= ATTR_SW_MANAGED; 5788 else 5789 l3e |= ATTR_AF; 5790 if ((prot & VM_PROT_EXECUTE) == 0 || 5791 m->md.pv_memattr == VM_MEMATTR_DEVICE) 5792 l3e |= ATTR_S1_XN; 5793 if (!ADDR_IS_KERNEL(va)) 5794 l3e |= ATTR_S1_AP(ATTR_S1_AP_USER) | ATTR_S1_PXN; 5795 else 5796 l3e |= ATTR_S1_UXN; 5797 if (pmap != kernel_pmap) 5798 l3e |= ATTR_S1_nG; 5799 return (pmap_enter_l3c(pmap, va, l3e, PMAP_ENTER_NOSLEEP | 5800 PMAP_ENTER_NOREPLACE | PMAP_ENTER_NORECLAIM, m, ml3p, lockp)); 5801 } 5802 5803 static int 5804 pmap_enter_l3c(pmap_t pmap, vm_offset_t va, pt_entry_t l3e, u_int flags, 5805 vm_page_t m, vm_page_t *ml3p, struct rwlock **lockp) 5806 { 5807 pd_entry_t *l2p, *pde; 5808 pt_entry_t *l3p, *tl3p; 5809 vm_page_t mt; 5810 vm_paddr_t pa; 5811 vm_pindex_t l2pindex; 5812 int lvl; 5813 5814 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 5815 KASSERT((va & L3C_OFFSET) == 0, 5816 ("pmap_enter_l3c: va is not aligned")); 5817 KASSERT(!VA_IS_CLEANMAP(va) || (l3e & ATTR_SW_MANAGED) == 0, 5818 ("pmap_enter_l3c: managed mapping within the clean submap")); 5819 KASSERT((l3e & ATTR_CONTIGUOUS) != 0, 5820 ("pmap_enter_l3c: l3e is missing ATTR_CONTIGUOUS")); 5821 5822 /* 5823 * If the L3 PTP is not resident, we attempt to create it here. 5824 */ 5825 if (!ADDR_IS_KERNEL(va)) { 5826 /* 5827 * Were we given the correct L3 PTP? If so, we can simply 5828 * increment its ref count. 5829 */ 5830 l2pindex = pmap_l2_pindex(va); 5831 if (*ml3p != NULL && (*ml3p)->pindex == l2pindex) { 5832 (*ml3p)->ref_count += L3C_ENTRIES; 5833 } else { 5834 retry: 5835 /* 5836 * Get the L2 entry. 5837 */ 5838 pde = pmap_pde(pmap, va, &lvl); 5839 5840 /* 5841 * If the L2 entry is a superpage, we either abort or 5842 * demote depending on the given flags. 5843 */ 5844 if (lvl == 1) { 5845 l2p = pmap_l1_to_l2(pde, va); 5846 if ((pmap_load(l2p) & ATTR_DESCR_MASK) == 5847 L2_BLOCK) { 5848 if ((flags & PMAP_ENTER_NOREPLACE) != 0) 5849 return (KERN_FAILURE); 5850 l3p = pmap_demote_l2_locked(pmap, l2p, 5851 va, lockp); 5852 if (l3p != NULL) { 5853 *ml3p = PTE_TO_VM_PAGE( 5854 pmap_load(l2p)); 5855 (*ml3p)->ref_count += 5856 L3C_ENTRIES; 5857 goto have_l3p; 5858 } 5859 } 5860 /* We need to allocate an L3 PTP. 
*/ 5861 } 5862 5863 /* 5864 * If the L3 PTP is mapped, we just increment its ref 5865 * count. Otherwise, we attempt to allocate it. 5866 */ 5867 if (lvl == 2 && pmap_load(pde) != 0) { 5868 *ml3p = PTE_TO_VM_PAGE(pmap_load(pde)); 5869 (*ml3p)->ref_count += L3C_ENTRIES; 5870 } else { 5871 *ml3p = _pmap_alloc_l3(pmap, l2pindex, (flags & 5872 PMAP_ENTER_NOSLEEP) != 0 ? NULL : lockp); 5873 if (*ml3p == NULL) { 5874 if ((flags & PMAP_ENTER_NOSLEEP) != 0) 5875 return (KERN_FAILURE); 5876 5877 /* 5878 * The page table may have changed 5879 * while we slept. 5880 */ 5881 goto retry; 5882 } 5883 (*ml3p)->ref_count += L3C_ENTRIES - 1; 5884 } 5885 } 5886 l3p = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(*ml3p)); 5887 } else { 5888 *ml3p = NULL; 5889 5890 /* 5891 * If the L2 entry is a superpage, we either abort or demote 5892 * depending on the given flags. 5893 */ 5894 pde = pmap_pde(kernel_pmap, va, &lvl); 5895 if (lvl == 1) { 5896 l2p = pmap_l1_to_l2(pde, va); 5897 KASSERT((pmap_load(l2p) & ATTR_DESCR_MASK) == L2_BLOCK, 5898 ("pmap_enter_l3c: missing L2 block")); 5899 if ((flags & PMAP_ENTER_NOREPLACE) != 0) 5900 return (KERN_FAILURE); 5901 l3p = pmap_demote_l2_locked(pmap, l2p, va, lockp); 5902 } else { 5903 KASSERT(lvl == 2, 5904 ("pmap_enter_l3c: Invalid level %d", lvl)); 5905 l3p = (pt_entry_t *)PHYS_TO_DMAP(PTE_TO_PHYS( 5906 pmap_load(pde))); 5907 } 5908 } 5909 have_l3p: 5910 l3p = &l3p[pmap_l3_index(va)]; 5911 5912 /* 5913 * If bti is not the same for the whole L3C range, return failure 5914 * and let vm_fault() cope. Check after L3 allocation, since 5915 * it could sleep. 5916 */ 5917 if (!pmap_bti_same(pmap, va, va + L3C_SIZE, &l3e)) { 5918 KASSERT(*ml3p != NULL, ("pmap_enter_l3c: missing L3 PTP")); 5919 (*ml3p)->ref_count -= L3C_ENTRIES - 1; 5920 pmap_abort_ptp(pmap, va, *ml3p); 5921 *ml3p = NULL; 5922 return (KERN_PROTECTION_FAILURE); 5923 } 5924 5925 /* 5926 * If there are existing mappings, either abort or remove them. 5927 */ 5928 if ((flags & PMAP_ENTER_NOREPLACE) != 0) { 5929 for (tl3p = l3p; tl3p < &l3p[L3C_ENTRIES]; tl3p++) { 5930 if (pmap_load(tl3p) != 0) { 5931 if (*ml3p != NULL) 5932 (*ml3p)->ref_count -= L3C_ENTRIES; 5933 return (KERN_FAILURE); 5934 } 5935 } 5936 } else { 5937 /* 5938 * Because we increment the L3 page's reference count above, 5939 * it is guaranteed not to be freed here and we can pass NULL 5940 * instead of a valid free list. 5941 */ 5942 pmap_remove_l3_range(pmap, pmap_load(pmap_l2(pmap, va)), va, 5943 va + L3C_SIZE, NULL, lockp); 5944 } 5945 5946 /* 5947 * Enter on the PV list if part of our managed memory. 5948 */ 5949 if ((l3e & ATTR_SW_MANAGED) != 0) { 5950 if (!pmap_pv_insert_l3c(pmap, va, m, lockp)) { 5951 if (*ml3p != NULL) { 5952 (*ml3p)->ref_count -= L3C_ENTRIES - 1; 5953 pmap_abort_ptp(pmap, va, *ml3p); 5954 *ml3p = NULL; 5955 } 5956 return (KERN_RESOURCE_SHORTAGE); 5957 } 5958 if ((l3e & ATTR_SW_DBM) != 0) 5959 for (mt = m; mt < &m[L3C_ENTRIES]; mt++) 5960 vm_page_aflag_set(mt, PGA_WRITEABLE); 5961 } 5962 5963 /* 5964 * Increment counters. 5965 */ 5966 if ((l3e & ATTR_SW_WIRED) != 0) 5967 pmap->pm_stats.wired_count += L3C_ENTRIES; 5968 pmap_resident_count_inc(pmap, L3C_ENTRIES); 5969 5970 pa = VM_PAGE_TO_PHYS(m); 5971 KASSERT((pa & L3C_OFFSET) == 0, ("pmap_enter_l3c: pa is not aligned")); 5972 5973 /* 5974 * Sync the icache before the mapping is stored. 
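 * As in pmap_enter(), this is only needed for executable, write-back
 * user mappings.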
5975 */ 5976 if ((l3e & ATTR_S1_XN) == 0 && pmap != kernel_pmap && 5977 m->md.pv_memattr == VM_MEMATTR_WRITE_BACK) 5978 cpu_icache_sync_range((void *)PHYS_TO_DMAP(pa), L3C_SIZE); 5979 5980 /* 5981 * Map the superpage. 5982 */ 5983 for (tl3p = l3p; tl3p < &l3p[L3C_ENTRIES]; tl3p++) { 5984 pmap_store(tl3p, l3e); 5985 l3e += L3_SIZE; 5986 } 5987 dsb(ishst); 5988 5989 counter_u64_add(pmap_l3c_mappings, 1); 5990 CTR2(KTR_PMAP, "pmap_enter_l3c: success for va %#lx in pmap %p", 5991 va, pmap); 5992 return (KERN_SUCCESS); 5993 } 5994 5995 /* 5996 * Maps a sequence of resident pages belonging to the same object. 5997 * The sequence begins with the given page m_start. This page is 5998 * mapped at the given virtual address start. Each subsequent page is 5999 * mapped at a virtual address that is offset from start by the same 6000 * amount as the page is offset from m_start within the object. The 6001 * last page in the sequence is the page with the largest offset from 6002 * m_start that can be mapped at a virtual address less than the given 6003 * virtual address end. Not every virtual page between start and end 6004 * is mapped; only those for which a resident page exists with the 6005 * corresponding offset from m_start are mapped. 6006 */ 6007 void 6008 pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end, 6009 vm_page_t m_start, vm_prot_t prot) 6010 { 6011 struct rwlock *lock; 6012 vm_offset_t va; 6013 vm_page_t m, mpte; 6014 vm_pindex_t diff, psize; 6015 int rv; 6016 6017 VM_OBJECT_ASSERT_LOCKED(m_start->object); 6018 6019 psize = atop(end - start); 6020 mpte = NULL; 6021 m = m_start; 6022 lock = NULL; 6023 PMAP_LOCK(pmap); 6024 while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) { 6025 va = start + ptoa(diff); 6026 if ((va & L2_OFFSET) == 0 && va + L2_SIZE <= end && 6027 m->psind == 2 && pmap_ps_enabled(pmap) && 6028 ((rv = pmap_enter_l2_rx(pmap, va, m, prot, &lock)) == 6029 KERN_SUCCESS || rv == KERN_NO_SPACE)) 6030 m = &m[L2_SIZE / PAGE_SIZE - 1]; 6031 else if ((va & L3C_OFFSET) == 0 && va + L3C_SIZE <= end && 6032 m->psind >= 1 && pmap_ps_enabled(pmap) && 6033 ((rv = pmap_enter_l3c_rx(pmap, va, m, &mpte, prot, 6034 &lock)) == KERN_SUCCESS || rv == KERN_NO_SPACE)) 6035 m = &m[L3C_ENTRIES - 1]; 6036 else { 6037 /* 6038 * In general, if a superpage mapping were possible, 6039 * it would have been created above. That said, if 6040 * start and end are not superpage aligned, then 6041 * promotion might be possible at the ends of [start, 6042 * end). However, in practice, those promotion 6043 * attempts are so unlikely to succeed that they are 6044 * not worth trying. 6045 */ 6046 mpte = pmap_enter_quick_locked(pmap, va, m, prot | 6047 VM_PROT_NO_PROMOTE, mpte, &lock); 6048 } 6049 m = TAILQ_NEXT(m, listq); 6050 } 6051 if (lock != NULL) 6052 rw_wunlock(lock); 6053 PMAP_UNLOCK(pmap); 6054 } 6055 6056 /* 6057 * this code makes some *MAJOR* assumptions: 6058 * 1. Current pmap & pmap exists. 6059 * 2. Not wired. 6060 * 3. Read access. 6061 * 4. No page table pages. 6062 * but is *MUCH* faster than pmap_enter... 
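 * The mapping is always created read-only and unwired; callers that
 * need write access or wiring must use pmap_enter().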
6063 */ 6064 6065 void 6066 pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot) 6067 { 6068 struct rwlock *lock; 6069 6070 lock = NULL; 6071 PMAP_LOCK(pmap); 6072 (void)pmap_enter_quick_locked(pmap, va, m, prot, NULL, &lock); 6073 if (lock != NULL) 6074 rw_wunlock(lock); 6075 PMAP_UNLOCK(pmap); 6076 } 6077 6078 static vm_page_t 6079 pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, 6080 vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp) 6081 { 6082 pt_entry_t *l1, *l2, *l3, l3_val; 6083 vm_paddr_t pa; 6084 int full_lvl, lvl; 6085 6086 KASSERT(!VA_IS_CLEANMAP(va) || 6087 (m->oflags & VPO_UNMANAGED) != 0, 6088 ("pmap_enter_quick_locked: managed mapping within the clean submap")); 6089 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 6090 PMAP_ASSERT_STAGE1(pmap); 6091 KASSERT(ADDR_IS_CANONICAL(va), 6092 ("%s: Address not in canonical form: %lx", __func__, va)); 6093 l2 = NULL; 6094 6095 CTR2(KTR_PMAP, "pmap_enter_quick_locked: %p %lx", pmap, va); 6096 /* 6097 * In the case that a page table page is not 6098 * resident, we are creating it here. 6099 */ 6100 if (!ADDR_IS_KERNEL(va)) { 6101 vm_pindex_t l2pindex; 6102 6103 /* 6104 * Calculate pagetable page index 6105 */ 6106 l2pindex = pmap_l2_pindex(va); 6107 if (mpte && (mpte->pindex == l2pindex)) { 6108 mpte->ref_count++; 6109 } else { 6110 /* 6111 * If the page table page is mapped, we just increment 6112 * the hold count, and activate it. Otherwise, we 6113 * attempt to allocate a page table page, passing NULL 6114 * instead of the PV list lock pointer because we don't 6115 * intend to sleep. If this attempt fails, we don't 6116 * retry. Instead, we give up. 6117 */ 6118 l1 = pmap_l1(pmap, va); 6119 if (l1 != NULL && pmap_load(l1) != 0) { 6120 if ((pmap_load(l1) & ATTR_DESCR_MASK) == 6121 L1_BLOCK) 6122 return (NULL); 6123 l2 = pmap_l1_to_l2(l1, va); 6124 if (pmap_load(l2) != 0) { 6125 if ((pmap_load(l2) & ATTR_DESCR_MASK) == 6126 L2_BLOCK) 6127 return (NULL); 6128 mpte = PTE_TO_VM_PAGE(pmap_load(l2)); 6129 mpte->ref_count++; 6130 } else { 6131 mpte = _pmap_alloc_l3(pmap, l2pindex, 6132 NULL); 6133 if (mpte == NULL) 6134 return (mpte); 6135 } 6136 } else { 6137 mpte = _pmap_alloc_l3(pmap, l2pindex, NULL); 6138 if (mpte == NULL) 6139 return (mpte); 6140 } 6141 } 6142 l3 = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpte)); 6143 l3 = &l3[pmap_l3_index(va)]; 6144 } else { 6145 mpte = NULL; 6146 l2 = pmap_pde(kernel_pmap, va, &lvl); 6147 KASSERT(l2 != NULL, 6148 ("pmap_enter_quick_locked: Invalid page entry, va: 0x%lx", 6149 va)); 6150 KASSERT(lvl == 2, 6151 ("pmap_enter_quick_locked: Invalid level %d", lvl)); 6152 l3 = pmap_l2_to_l3(l2, va); 6153 } 6154 6155 /* 6156 * Abort if a mapping already exists. 6157 */ 6158 if (pmap_load(l3) != 0) { 6159 if (mpte != NULL) 6160 mpte->ref_count--; 6161 return (NULL); 6162 } 6163 6164 /* 6165 * Enter on the PV list if part of our managed memory. 
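 * If a PV entry cannot be allocated, the mapping is simply abandoned
 * rather than retried.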
6166 */ 6167 if ((m->oflags & VPO_UNMANAGED) == 0 && 6168 !pmap_try_insert_pv_entry(pmap, va, m, lockp)) { 6169 if (mpte != NULL) 6170 pmap_abort_ptp(pmap, va, mpte); 6171 return (NULL); 6172 } 6173 6174 /* 6175 * Increment counters 6176 */ 6177 pmap_resident_count_inc(pmap, 1); 6178 6179 pa = VM_PAGE_TO_PHYS(m); 6180 l3_val = PHYS_TO_PTE(pa) | pmap_sh_attr | 6181 ATTR_S1_IDX(m->md.pv_memattr) | ATTR_S1_AP(ATTR_S1_AP_RO) | L3_PAGE; 6182 l3_val |= pmap_pte_bti(pmap, va); 6183 if ((prot & VM_PROT_EXECUTE) == 0 || 6184 m->md.pv_memattr == VM_MEMATTR_DEVICE) 6185 l3_val |= ATTR_S1_XN; 6186 if (!ADDR_IS_KERNEL(va)) 6187 l3_val |= ATTR_S1_AP(ATTR_S1_AP_USER) | ATTR_S1_PXN; 6188 else 6189 l3_val |= ATTR_S1_UXN; 6190 if (pmap != kernel_pmap) 6191 l3_val |= ATTR_S1_nG; 6192 6193 /* 6194 * Now validate mapping with RO protection 6195 */ 6196 if ((m->oflags & VPO_UNMANAGED) == 0) 6197 l3_val |= ATTR_SW_MANAGED; 6198 else 6199 l3_val |= ATTR_AF; 6200 6201 /* Sync icache before the mapping is stored to PTE */ 6202 if ((prot & VM_PROT_EXECUTE) && pmap != kernel_pmap && 6203 m->md.pv_memattr == VM_MEMATTR_WRITE_BACK) 6204 cpu_icache_sync_range((void *)PHYS_TO_DMAP(pa), PAGE_SIZE); 6205 6206 pmap_store(l3, l3_val); 6207 dsb(ishst); 6208 6209 #if VM_NRESERVLEVEL > 0 6210 /* 6211 * First, attempt L3C promotion, if the virtual and physical addresses 6212 * are aligned with each other and an underlying reservation has the 6213 * neighboring L3 pages allocated. The first condition is simply an 6214 * optimization that recognizes some eventual promotion failures early 6215 * at a lower run-time cost. Then, attempt L2 promotion, if both a 6216 * level 1 reservation and the PTP are fully populated. 6217 */ 6218 if ((prot & VM_PROT_NO_PROMOTE) == 0 && 6219 (va & L3C_OFFSET) == (pa & L3C_OFFSET) && 6220 (m->flags & PG_FICTITIOUS) == 0 && 6221 (full_lvl = vm_reserv_level_iffullpop(m)) >= 0 && 6222 pmap_promote_l3c(pmap, l3, va) && 6223 full_lvl == 1 && (mpte == NULL || mpte->ref_count == NL3PG)) { 6224 if (l2 == NULL) 6225 l2 = pmap_l2(pmap, va); 6226 6227 /* 6228 * If promotion succeeds, then the next call to this function 6229 * should not be given the unmapped PTP as a hint. 6230 */ 6231 if (pmap_promote_l2(pmap, l2, va, mpte, lockp)) 6232 mpte = NULL; 6233 } 6234 #endif 6235 6236 return (mpte); 6237 } 6238 6239 /* 6240 * This code maps large physical mmap regions into the 6241 * processor address space. Note that some shortcuts 6242 * are taken, but the code works. 6243 */ 6244 void 6245 pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object, 6246 vm_pindex_t pindex, vm_size_t size) 6247 { 6248 6249 VM_OBJECT_ASSERT_WLOCKED(object); 6250 KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG, 6251 ("pmap_object_init_pt: non-device object")); 6252 } 6253 6254 /* 6255 * Clear the wired attribute from the mappings for the specified range of 6256 * addresses in the given pmap. Every valid mapping within that range 6257 * must have the wired attribute set. In contrast, invalid mappings 6258 * cannot have the wired attribute set, so they are ignored. 6259 * 6260 * The wired attribute of the page table entry is not a hardware feature, 6261 * so there is no need to invalidate any TLB entries. 
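 * (ATTR_SW_WIRED is one of the software-managed attribute bits in the
 * page table entry.)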
6262 */ 6263 void 6264 pmap_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 6265 { 6266 vm_offset_t va_next; 6267 pd_entry_t *l0, *l1, *l2; 6268 pt_entry_t *l3; 6269 bool partial_l3c; 6270 6271 PMAP_LOCK(pmap); 6272 for (; sva < eva; sva = va_next) { 6273 l0 = pmap_l0(pmap, sva); 6274 if (pmap_load(l0) == 0) { 6275 va_next = (sva + L0_SIZE) & ~L0_OFFSET; 6276 if (va_next < sva) 6277 va_next = eva; 6278 continue; 6279 } 6280 6281 l1 = pmap_l0_to_l1(l0, sva); 6282 va_next = (sva + L1_SIZE) & ~L1_OFFSET; 6283 if (va_next < sva) 6284 va_next = eva; 6285 if (pmap_load(l1) == 0) 6286 continue; 6287 6288 if ((pmap_load(l1) & ATTR_DESCR_MASK) == L1_BLOCK) { 6289 PMAP_ASSERT_L1_BLOCKS_SUPPORTED; 6290 KASSERT(va_next <= eva, 6291 ("partial update of non-transparent 1G page " 6292 "l1 %#lx sva %#lx eva %#lx va_next %#lx", 6293 pmap_load(l1), sva, eva, va_next)); 6294 MPASS(pmap != kernel_pmap); 6295 MPASS((pmap_load(l1) & (ATTR_SW_MANAGED | 6296 ATTR_SW_WIRED)) == ATTR_SW_WIRED); 6297 pmap_clear_bits(l1, ATTR_SW_WIRED); 6298 pmap->pm_stats.wired_count -= L1_SIZE / PAGE_SIZE; 6299 continue; 6300 } 6301 6302 va_next = (sva + L2_SIZE) & ~L2_OFFSET; 6303 if (va_next < sva) 6304 va_next = eva; 6305 6306 l2 = pmap_l1_to_l2(l1, sva); 6307 if (pmap_load(l2) == 0) 6308 continue; 6309 6310 if ((pmap_load(l2) & ATTR_DESCR_MASK) == L2_BLOCK) { 6311 if ((pmap_load(l2) & ATTR_SW_WIRED) == 0) 6312 panic("pmap_unwire: l2 %#jx is missing " 6313 "ATTR_SW_WIRED", (uintmax_t)pmap_load(l2)); 6314 6315 /* 6316 * Are we unwiring the entire large page? If not, 6317 * demote the mapping and fall through. 6318 */ 6319 if (sva + L2_SIZE == va_next && eva >= va_next) { 6320 pmap_clear_bits(l2, ATTR_SW_WIRED); 6321 pmap->pm_stats.wired_count -= L2_SIZE / 6322 PAGE_SIZE; 6323 continue; 6324 } else if (pmap_demote_l2(pmap, l2, sva) == NULL) 6325 panic("pmap_unwire: demotion failed"); 6326 } 6327 KASSERT((pmap_load(l2) & ATTR_DESCR_MASK) == L2_TABLE, 6328 ("pmap_unwire: Invalid l2 entry after demotion")); 6329 6330 if (va_next > eva) 6331 va_next = eva; 6332 for (partial_l3c = true, l3 = pmap_l2_to_l3(l2, sva); 6333 sva != va_next; l3++, sva += L3_SIZE) { 6334 if (pmap_load(l3) == 0) 6335 continue; 6336 if ((pmap_load(l3) & ATTR_CONTIGUOUS) != 0) { 6337 /* 6338 * Avoid demotion for whole-page unwiring. 6339 */ 6340 if ((sva & L3C_OFFSET) == 0) { 6341 /* 6342 * Handle the possibility that 6343 * "va_next" is zero because of 6344 * address wraparound. 6345 */ 6346 partial_l3c = sva + L3C_OFFSET > 6347 va_next - 1; 6348 } 6349 if (partial_l3c) 6350 (void)pmap_demote_l3c(pmap, l3, sva); 6351 } 6352 if ((pmap_load(l3) & ATTR_SW_WIRED) == 0) 6353 panic("pmap_unwire: l3 %#jx is missing " 6354 "ATTR_SW_WIRED", (uintmax_t)pmap_load(l3)); 6355 6356 /* 6357 * ATTR_SW_WIRED must be cleared atomically. Although 6358 * the pmap lock synchronizes access to ATTR_SW_WIRED, 6359 * the System MMU may write to the entry concurrently. 6360 */ 6361 pmap_clear_bits(l3, ATTR_SW_WIRED); 6362 pmap->pm_stats.wired_count--; 6363 } 6364 } 6365 PMAP_UNLOCK(pmap); 6366 } 6367 6368 /* 6369 * This function requires that the caller has already added one to ml3's 6370 * ref_count in anticipation of creating a 4KB page mapping. 
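 * On success, the remaining L3C_ENTRIES - 1 references are taken here.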
6371 */ 6372 static bool 6373 pmap_copy_l3c(pmap_t pmap, pt_entry_t *l3p, vm_offset_t va, pt_entry_t l3e, 6374 vm_page_t ml3, struct rwlock **lockp) 6375 { 6376 pt_entry_t *tl3p; 6377 6378 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 6379 KASSERT((va & L3C_OFFSET) == 0, 6380 ("pmap_copy_l3c: va is not aligned")); 6381 KASSERT((l3e & ATTR_SW_MANAGED) != 0, 6382 ("pmap_copy_l3c: l3e is not managed")); 6383 6384 /* 6385 * Abort if a mapping already exists. 6386 */ 6387 for (tl3p = l3p; tl3p < &l3p[L3C_ENTRIES]; tl3p++) 6388 if (pmap_load(tl3p) != 0) { 6389 if (ml3 != NULL) 6390 ml3->ref_count--; 6391 return (false); 6392 } 6393 6394 if (!pmap_pv_insert_l3c(pmap, va, PTE_TO_VM_PAGE(l3e), lockp)) { 6395 if (ml3 != NULL) 6396 pmap_abort_ptp(pmap, va, ml3); 6397 return (false); 6398 } 6399 ml3->ref_count += L3C_ENTRIES - 1; 6400 6401 /* 6402 * Clear the wired and accessed bits. However, leave the dirty bit 6403 * unchanged because read/write superpage mappings are required to be 6404 * dirty. 6405 */ 6406 l3e &= ~(ATTR_SW_WIRED | ATTR_AF); 6407 6408 for (tl3p = l3p; tl3p < &l3p[L3C_ENTRIES]; tl3p++) { 6409 pmap_store(tl3p, l3e); 6410 l3e += L3_SIZE; 6411 } 6412 pmap_resident_count_inc(pmap, L3C_ENTRIES); 6413 counter_u64_add(pmap_l3c_mappings, 1); 6414 CTR2(KTR_PMAP, "pmap_copy_l3c: success for va %#lx in pmap %p", 6415 va, pmap); 6416 return (true); 6417 } 6418 6419 /* 6420 * Copy the range specified by src_addr/len 6421 * from the source map to the range dst_addr/len 6422 * in the destination map. 6423 * 6424 * This routine is only advisory and need not do anything. 6425 * 6426 * Because the executable mappings created by this routine are copied, 6427 * it should not have to flush the instruction cache. 6428 */ 6429 void 6430 pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len, 6431 vm_offset_t src_addr) 6432 { 6433 struct rwlock *lock; 6434 pd_entry_t *l0, *l1, *l2, srcptepaddr; 6435 pt_entry_t *dst_pte, mask, nbits, ptetemp, *src_pte; 6436 vm_offset_t addr, end_addr, va_next; 6437 vm_page_t dst_m, dstmpte, srcmpte; 6438 6439 PMAP_ASSERT_STAGE1(dst_pmap); 6440 PMAP_ASSERT_STAGE1(src_pmap); 6441 6442 if (dst_addr != src_addr) 6443 return; 6444 end_addr = src_addr + len; 6445 lock = NULL; 6446 if (dst_pmap < src_pmap) { 6447 PMAP_LOCK(dst_pmap); 6448 PMAP_LOCK(src_pmap); 6449 } else { 6450 PMAP_LOCK(src_pmap); 6451 PMAP_LOCK(dst_pmap); 6452 } 6453 for (addr = src_addr; addr < end_addr; addr = va_next) { 6454 l0 = pmap_l0(src_pmap, addr); 6455 if (pmap_load(l0) == 0) { 6456 va_next = (addr + L0_SIZE) & ~L0_OFFSET; 6457 if (va_next < addr) 6458 va_next = end_addr; 6459 continue; 6460 } 6461 6462 va_next = (addr + L1_SIZE) & ~L1_OFFSET; 6463 if (va_next < addr) 6464 va_next = end_addr; 6465 l1 = pmap_l0_to_l1(l0, addr); 6466 if (pmap_load(l1) == 0) 6467 continue; 6468 if ((pmap_load(l1) & ATTR_DESCR_MASK) == L1_BLOCK) { 6469 PMAP_ASSERT_L1_BLOCKS_SUPPORTED; 6470 KASSERT(va_next <= end_addr, 6471 ("partial update of non-transparent 1G page " 6472 "l1 %#lx addr %#lx end_addr %#lx va_next %#lx", 6473 pmap_load(l1), addr, end_addr, va_next)); 6474 srcptepaddr = pmap_load(l1); 6475 l1 = pmap_l1(dst_pmap, addr); 6476 if (l1 == NULL) { 6477 if (_pmap_alloc_l3(dst_pmap, 6478 pmap_l0_pindex(addr), NULL) == NULL) 6479 break; 6480 l1 = pmap_l1(dst_pmap, addr); 6481 } else { 6482 l0 = pmap_l0(dst_pmap, addr); 6483 dst_m = PTE_TO_VM_PAGE(pmap_load(l0)); 6484 dst_m->ref_count++; 6485 } 6486 KASSERT(pmap_load(l1) == 0, 6487 ("1G mapping present in dst pmap " 6488 "l1 %#lx addr %#lx end_addr %#lx 
va_next %#lx", 6489 pmap_load(l1), addr, end_addr, va_next)); 6490 pmap_store(l1, srcptepaddr & ~ATTR_SW_WIRED); 6491 pmap_resident_count_inc(dst_pmap, L1_SIZE / PAGE_SIZE); 6492 continue; 6493 } 6494 6495 va_next = (addr + L2_SIZE) & ~L2_OFFSET; 6496 if (va_next < addr) 6497 va_next = end_addr; 6498 l2 = pmap_l1_to_l2(l1, addr); 6499 srcptepaddr = pmap_load(l2); 6500 if (srcptepaddr == 0) 6501 continue; 6502 if ((srcptepaddr & ATTR_DESCR_MASK) == L2_BLOCK) { 6503 /* 6504 * We can only virtual copy whole superpages. 6505 */ 6506 if ((addr & L2_OFFSET) != 0 || 6507 addr + L2_SIZE > end_addr) 6508 continue; 6509 l2 = pmap_alloc_l2(dst_pmap, addr, &dst_m, NULL); 6510 if (l2 == NULL) 6511 break; 6512 if (pmap_load(l2) == 0 && 6513 ((srcptepaddr & ATTR_SW_MANAGED) == 0 || 6514 pmap_pv_insert_l2(dst_pmap, addr, srcptepaddr, 6515 PMAP_ENTER_NORECLAIM, &lock))) { 6516 /* 6517 * We leave the dirty bit unchanged because 6518 * managed read/write superpage mappings are 6519 * required to be dirty. However, managed 6520 * superpage mappings are not required to 6521 * have their accessed bit set, so we clear 6522 * it because we don't know if this mapping 6523 * will be used. 6524 */ 6525 srcptepaddr &= ~ATTR_SW_WIRED; 6526 if ((srcptepaddr & ATTR_SW_MANAGED) != 0) 6527 srcptepaddr &= ~ATTR_AF; 6528 pmap_store(l2, srcptepaddr); 6529 pmap_resident_count_inc(dst_pmap, L2_SIZE / 6530 PAGE_SIZE); 6531 atomic_add_long(&pmap_l2_mappings, 1); 6532 } else 6533 pmap_abort_ptp(dst_pmap, addr, dst_m); 6534 continue; 6535 } 6536 KASSERT((srcptepaddr & ATTR_DESCR_MASK) == L2_TABLE, 6537 ("pmap_copy: invalid L2 entry")); 6538 srcmpte = PTE_TO_VM_PAGE(srcptepaddr); 6539 KASSERT(srcmpte->ref_count > 0, 6540 ("pmap_copy: source page table page is unused")); 6541 if (va_next > end_addr) 6542 va_next = end_addr; 6543 src_pte = (pt_entry_t *)PHYS_TO_DMAP(PTE_TO_PHYS(srcptepaddr)); 6544 src_pte = &src_pte[pmap_l3_index(addr)]; 6545 dstmpte = NULL; 6546 for (; addr < va_next; addr += PAGE_SIZE, src_pte++) { 6547 ptetemp = pmap_load(src_pte); 6548 6549 /* 6550 * We only virtual copy managed pages. 6551 */ 6552 if ((ptetemp & ATTR_SW_MANAGED) == 0) 6553 continue; 6554 6555 if (dstmpte != NULL) { 6556 KASSERT(dstmpte->pindex == pmap_l2_pindex(addr), 6557 ("dstmpte pindex/addr mismatch")); 6558 dstmpte->ref_count++; 6559 } else if ((dstmpte = pmap_alloc_l3(dst_pmap, addr, 6560 NULL)) == NULL) 6561 goto out; 6562 dst_pte = (pt_entry_t *) 6563 PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dstmpte)); 6564 dst_pte = &dst_pte[pmap_l3_index(addr)]; 6565 if ((ptetemp & ATTR_CONTIGUOUS) != 0 && (addr & 6566 L3C_OFFSET) == 0 && addr + L3C_OFFSET <= 6567 va_next - 1) { 6568 if (!pmap_copy_l3c(dst_pmap, dst_pte, addr, 6569 ptetemp, dstmpte, &lock)) 6570 goto out; 6571 addr += L3C_SIZE - PAGE_SIZE; 6572 src_pte += L3C_ENTRIES - 1; 6573 } else if (pmap_load(dst_pte) == 0 && 6574 pmap_try_insert_pv_entry(dst_pmap, addr, 6575 PTE_TO_VM_PAGE(ptetemp), &lock)) { 6576 /* 6577 * Clear the wired, contiguous, modified, and 6578 * accessed bits from the destination PTE. 6579 * The contiguous bit is cleared because we 6580 * are not copying the entire L3C superpage. 6581 */ 6582 mask = ATTR_SW_WIRED | ATTR_CONTIGUOUS | 6583 ATTR_AF; 6584 nbits = 0; 6585 if ((ptetemp & ATTR_SW_DBM) != 0) 6586 nbits |= ATTR_S1_AP_RW_BIT; 6587 pmap_store(dst_pte, (ptetemp & ~mask) | nbits); 6588 pmap_resident_count_inc(dst_pmap, 1); 6589 } else { 6590 pmap_abort_ptp(dst_pmap, addr, dstmpte); 6591 goto out; 6592 } 6593 /* Have we copied all of the valid mappings? 
*/ 6594 if (dstmpte->ref_count >= srcmpte->ref_count) 6595 break; 6596 } 6597 } 6598 out: 6599 /* 6600 * XXX This barrier may not be needed because the destination pmap is 6601 * not active. 6602 */ 6603 dsb(ishst); 6604 6605 if (lock != NULL) 6606 rw_wunlock(lock); 6607 PMAP_UNLOCK(src_pmap); 6608 PMAP_UNLOCK(dst_pmap); 6609 } 6610 6611 int 6612 pmap_vmspace_copy(pmap_t dst_pmap, pmap_t src_pmap) 6613 { 6614 int error; 6615 6616 if (dst_pmap->pm_stage != src_pmap->pm_stage) 6617 return (EINVAL); 6618 6619 if (dst_pmap->pm_stage != PM_STAGE1 || src_pmap->pm_bti == NULL) 6620 return (0); 6621 6622 for (;;) { 6623 if (dst_pmap < src_pmap) { 6624 PMAP_LOCK(dst_pmap); 6625 PMAP_LOCK(src_pmap); 6626 } else { 6627 PMAP_LOCK(src_pmap); 6628 PMAP_LOCK(dst_pmap); 6629 } 6630 error = pmap_bti_copy(dst_pmap, src_pmap); 6631 /* Clean up partial copy on failure due to no memory. */ 6632 if (error == ENOMEM) 6633 pmap_bti_deassign_all(dst_pmap); 6634 PMAP_UNLOCK(src_pmap); 6635 PMAP_UNLOCK(dst_pmap); 6636 if (error != ENOMEM) 6637 break; 6638 vm_wait(NULL); 6639 } 6640 return (error); 6641 } 6642 6643 /* 6644 * pmap_zero_page zeros the specified hardware page by mapping 6645 * the page into KVM and using bzero to clear its contents. 6646 */ 6647 void 6648 pmap_zero_page(vm_page_t m) 6649 { 6650 vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); 6651 6652 pagezero((void *)va); 6653 } 6654 6655 /* 6656 * pmap_zero_page_area zeros the specified hardware page by mapping 6657 * the page into KVM and using bzero to clear its contents. 6658 * 6659 * off and size may not cover an area beyond a single hardware page. 6660 */ 6661 void 6662 pmap_zero_page_area(vm_page_t m, int off, int size) 6663 { 6664 vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); 6665 6666 if (off == 0 && size == PAGE_SIZE) 6667 pagezero((void *)va); 6668 else 6669 bzero((char *)va + off, size); 6670 } 6671 6672 /* 6673 * pmap_copy_page copies the specified (machine independent) 6674 * page by mapping the page into virtual memory and using 6675 * bcopy to copy the page, one machine dependent page at a 6676 * time. 
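 * On arm64 both pages are already accessible through the direct map,
 * so no temporary mappings are needed.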
6677 */ 6678 void 6679 pmap_copy_page(vm_page_t msrc, vm_page_t mdst) 6680 { 6681 vm_offset_t src = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(msrc)); 6682 vm_offset_t dst = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mdst)); 6683 6684 pagecopy((void *)src, (void *)dst); 6685 } 6686 6687 int unmapped_buf_allowed = 1; 6688 6689 void 6690 pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[], 6691 vm_offset_t b_offset, int xfersize) 6692 { 6693 void *a_cp, *b_cp; 6694 vm_page_t m_a, m_b; 6695 vm_paddr_t p_a, p_b; 6696 vm_offset_t a_pg_offset, b_pg_offset; 6697 int cnt; 6698 6699 while (xfersize > 0) { 6700 a_pg_offset = a_offset & PAGE_MASK; 6701 m_a = ma[a_offset >> PAGE_SHIFT]; 6702 p_a = m_a->phys_addr; 6703 b_pg_offset = b_offset & PAGE_MASK; 6704 m_b = mb[b_offset >> PAGE_SHIFT]; 6705 p_b = m_b->phys_addr; 6706 cnt = min(xfersize, PAGE_SIZE - a_pg_offset); 6707 cnt = min(cnt, PAGE_SIZE - b_pg_offset); 6708 if (__predict_false(!PHYS_IN_DMAP(p_a))) { 6709 panic("!DMAP a %lx", p_a); 6710 } else { 6711 a_cp = (char *)PHYS_TO_DMAP(p_a) + a_pg_offset; 6712 } 6713 if (__predict_false(!PHYS_IN_DMAP(p_b))) { 6714 panic("!DMAP b %lx", p_b); 6715 } else { 6716 b_cp = (char *)PHYS_TO_DMAP(p_b) + b_pg_offset; 6717 } 6718 bcopy(a_cp, b_cp, cnt); 6719 a_offset += cnt; 6720 b_offset += cnt; 6721 xfersize -= cnt; 6722 } 6723 } 6724 6725 vm_offset_t 6726 pmap_quick_enter_page(vm_page_t m) 6727 { 6728 6729 return (PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m))); 6730 } 6731 6732 void 6733 pmap_quick_remove_page(vm_offset_t addr) 6734 { 6735 } 6736 6737 /* 6738 * Returns true if the pmap's pv is one of the first 6739 * 16 pvs linked to from this page. This count may 6740 * be changed upwards or downwards in the future; it 6741 * is only necessary that true be returned for a small 6742 * subset of pmaps for proper page aging. 6743 */ 6744 bool 6745 pmap_page_exists_quick(pmap_t pmap, vm_page_t m) 6746 { 6747 struct md_page *pvh; 6748 struct rwlock *lock; 6749 pv_entry_t pv; 6750 int loops = 0; 6751 bool rv; 6752 6753 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 6754 ("pmap_page_exists_quick: page %p is not managed", m)); 6755 rv = false; 6756 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 6757 rw_rlock(lock); 6758 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 6759 if (PV_PMAP(pv) == pmap) { 6760 rv = true; 6761 break; 6762 } 6763 loops++; 6764 if (loops >= 16) 6765 break; 6766 } 6767 if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) { 6768 pvh = page_to_pvh(m); 6769 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 6770 if (PV_PMAP(pv) == pmap) { 6771 rv = true; 6772 break; 6773 } 6774 loops++; 6775 if (loops >= 16) 6776 break; 6777 } 6778 } 6779 rw_runlock(lock); 6780 return (rv); 6781 } 6782 6783 /* 6784 * pmap_page_wired_mappings: 6785 * 6786 * Return the number of managed mappings to the given physical page 6787 * that are wired. 
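 * Both 4KB (L3) and 2MB (L2) mappings are counted.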
6788 */ 6789 int 6790 pmap_page_wired_mappings(vm_page_t m) 6791 { 6792 struct rwlock *lock; 6793 struct md_page *pvh; 6794 pmap_t pmap; 6795 pt_entry_t *pte; 6796 pv_entry_t pv; 6797 int count, md_gen, pvh_gen; 6798 6799 if ((m->oflags & VPO_UNMANAGED) != 0) 6800 return (0); 6801 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 6802 rw_rlock(lock); 6803 restart: 6804 count = 0; 6805 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 6806 pmap = PV_PMAP(pv); 6807 if (!PMAP_TRYLOCK(pmap)) { 6808 md_gen = m->md.pv_gen; 6809 rw_runlock(lock); 6810 PMAP_LOCK(pmap); 6811 rw_rlock(lock); 6812 if (md_gen != m->md.pv_gen) { 6813 PMAP_UNLOCK(pmap); 6814 goto restart; 6815 } 6816 } 6817 pte = pmap_pte_exists(pmap, pv->pv_va, 3, __func__); 6818 if ((pmap_load(pte) & ATTR_SW_WIRED) != 0) 6819 count++; 6820 PMAP_UNLOCK(pmap); 6821 } 6822 if ((m->flags & PG_FICTITIOUS) == 0) { 6823 pvh = page_to_pvh(m); 6824 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 6825 pmap = PV_PMAP(pv); 6826 if (!PMAP_TRYLOCK(pmap)) { 6827 md_gen = m->md.pv_gen; 6828 pvh_gen = pvh->pv_gen; 6829 rw_runlock(lock); 6830 PMAP_LOCK(pmap); 6831 rw_rlock(lock); 6832 if (md_gen != m->md.pv_gen || 6833 pvh_gen != pvh->pv_gen) { 6834 PMAP_UNLOCK(pmap); 6835 goto restart; 6836 } 6837 } 6838 pte = pmap_pte_exists(pmap, pv->pv_va, 2, __func__); 6839 if ((pmap_load(pte) & ATTR_SW_WIRED) != 0) 6840 count++; 6841 PMAP_UNLOCK(pmap); 6842 } 6843 } 6844 rw_runlock(lock); 6845 return (count); 6846 } 6847 6848 /* 6849 * Returns true if the given page is mapped individually or as part of 6850 * a 2mpage. Otherwise, returns false. 6851 */ 6852 bool 6853 pmap_page_is_mapped(vm_page_t m) 6854 { 6855 struct rwlock *lock; 6856 bool rv; 6857 6858 if ((m->oflags & VPO_UNMANAGED) != 0) 6859 return (false); 6860 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 6861 rw_rlock(lock); 6862 rv = !TAILQ_EMPTY(&m->md.pv_list) || 6863 ((m->flags & PG_FICTITIOUS) == 0 && 6864 !TAILQ_EMPTY(&page_to_pvh(m)->pv_list)); 6865 rw_runlock(lock); 6866 return (rv); 6867 } 6868 6869 /* 6870 * Destroy all managed, non-wired mappings in the given user-space 6871 * pmap. This pmap cannot be active on any processor besides the 6872 * caller. 6873 * 6874 * This function cannot be applied to the kernel pmap. Moreover, it 6875 * is not intended for general use. It is only to be used during 6876 * process termination. Consequently, it can be implemented in ways 6877 * that make it faster than pmap_remove(). First, it can more quickly 6878 * destroy mappings by iterating over the pmap's collection of PV 6879 * entries, rather than searching the page table. Second, it doesn't 6880 * have to test and clear the page table entries atomically, because 6881 * no processor is currently accessing the user address space. In 6882 * particular, a page table entry's dirty bit won't change state once 6883 * this function starts. 
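 * Note that the pmap lock is nonetheless held while the PV chunks are
 * torn down.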
6884 */ 6885 void 6886 pmap_remove_pages(pmap_t pmap) 6887 { 6888 pd_entry_t *pde; 6889 pt_entry_t *pte, tpte; 6890 struct spglist free; 6891 struct pv_chunklist free_chunks[PMAP_MEMDOM]; 6892 vm_page_t m, ml3, mt; 6893 pv_entry_t pv; 6894 struct md_page *pvh; 6895 struct pv_chunk *pc, *npc; 6896 struct rwlock *lock; 6897 int64_t bit; 6898 uint64_t inuse, bitmask; 6899 int allfree, field, i, idx, lvl; 6900 int freed __pvused; 6901 vm_paddr_t pa; 6902 6903 lock = NULL; 6904 6905 for (i = 0; i < PMAP_MEMDOM; i++) 6906 TAILQ_INIT(&free_chunks[i]); 6907 SLIST_INIT(&free); 6908 PMAP_LOCK(pmap); 6909 TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) { 6910 allfree = 1; 6911 freed = 0; 6912 for (field = 0; field < _NPCM; field++) { 6913 inuse = ~pc->pc_map[field] & pc_freemask[field]; 6914 while (inuse != 0) { 6915 bit = ffsl(inuse) - 1; 6916 bitmask = 1UL << bit; 6917 idx = field * 64 + bit; 6918 pv = &pc->pc_pventry[idx]; 6919 inuse &= ~bitmask; 6920 6921 pde = pmap_pde(pmap, pv->pv_va, &lvl); 6922 KASSERT(pde != NULL, 6923 ("Attempting to remove an unmapped page")); 6924 6925 switch(lvl) { 6926 case 1: 6927 pte = pmap_l1_to_l2(pde, pv->pv_va); 6928 tpte = pmap_load(pte); 6929 KASSERT((tpte & ATTR_DESCR_MASK) == 6930 L2_BLOCK, 6931 ("Attempting to remove an invalid " 6932 "block: %lx", tpte)); 6933 break; 6934 case 2: 6935 pte = pmap_l2_to_l3(pde, pv->pv_va); 6936 tpte = pmap_load(pte); 6937 KASSERT((tpte & ATTR_DESCR_MASK) == 6938 L3_PAGE, 6939 ("Attempting to remove an invalid " 6940 "page: %lx", tpte)); 6941 break; 6942 default: 6943 panic( 6944 "Invalid page directory level: %d", 6945 lvl); 6946 } 6947 6948 /* 6949 * We cannot remove wired mappings at this time. 6950 * 6951 * For L3C superpages, all of the constituent PTEs 6952 * should have the wired bit set, so we don't 6953 * check for ATTR_CONTIGUOUS here. 6954 */ 6955 if (tpte & ATTR_SW_WIRED) { 6956 allfree = 0; 6957 continue; 6958 } 6959 6960 /* Mark free */ 6961 pc->pc_map[field] |= bitmask; 6962 6963 /* 6964 * Because this pmap is not active on other 6965 * processors, the dirty bit cannot have 6966 * changed state since we last loaded pte. 6967 */ 6968 pmap_clear(pte); 6969 6970 pa = PTE_TO_PHYS(tpte); 6971 6972 m = PHYS_TO_VM_PAGE(pa); 6973 KASSERT(m->phys_addr == pa, 6974 ("vm_page_t %p phys_addr mismatch %016jx %016jx", 6975 m, (uintmax_t)m->phys_addr, 6976 (uintmax_t)tpte)); 6977 6978 KASSERT((m->flags & PG_FICTITIOUS) != 0 || 6979 m < &vm_page_array[vm_page_array_size], 6980 ("pmap_remove_pages: bad pte %#jx", 6981 (uintmax_t)tpte)); 6982 6983 /* 6984 * Update the vm_page_t clean/reference bits. 6985 * 6986 * We don't check for ATTR_CONTIGUOUS here 6987 * because writeable L3C superpages are expected 6988 * to be dirty, i.e., every constituent PTE 6989 * should be dirty. 
6990 */ 6991 if (pmap_pte_dirty(pmap, tpte)) { 6992 switch (lvl) { 6993 case 1: 6994 for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++) 6995 vm_page_dirty(mt); 6996 break; 6997 case 2: 6998 vm_page_dirty(m); 6999 break; 7000 } 7001 } 7002 7003 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m); 7004 7005 switch (lvl) { 7006 case 1: 7007 pmap_resident_count_dec(pmap, 7008 L2_SIZE / PAGE_SIZE); 7009 pvh = page_to_pvh(m); 7010 TAILQ_REMOVE(&pvh->pv_list, pv,pv_next); 7011 pvh->pv_gen++; 7012 if (TAILQ_EMPTY(&pvh->pv_list)) { 7013 for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++) 7014 if ((mt->a.flags & PGA_WRITEABLE) != 0 && 7015 TAILQ_EMPTY(&mt->md.pv_list)) 7016 vm_page_aflag_clear(mt, PGA_WRITEABLE); 7017 } 7018 ml3 = pmap_remove_pt_page(pmap, 7019 pv->pv_va); 7020 if (ml3 != NULL) { 7021 KASSERT(vm_page_any_valid(ml3), 7022 ("pmap_remove_pages: l3 page not promoted")); 7023 pmap_resident_count_dec(pmap,1); 7024 KASSERT(ml3->ref_count == NL3PG, 7025 ("pmap_remove_pages: l3 page ref count error")); 7026 ml3->ref_count = 0; 7027 pmap_add_delayed_free_list(ml3, 7028 &free, false); 7029 } 7030 break; 7031 case 2: 7032 pmap_resident_count_dec(pmap, 1); 7033 TAILQ_REMOVE(&m->md.pv_list, pv, 7034 pv_next); 7035 m->md.pv_gen++; 7036 if ((m->a.flags & PGA_WRITEABLE) != 0 && 7037 TAILQ_EMPTY(&m->md.pv_list) && 7038 (m->flags & PG_FICTITIOUS) == 0) { 7039 pvh = page_to_pvh(m); 7040 if (TAILQ_EMPTY(&pvh->pv_list)) 7041 vm_page_aflag_clear(m, 7042 PGA_WRITEABLE); 7043 } 7044 break; 7045 } 7046 pmap_unuse_pt(pmap, pv->pv_va, pmap_load(pde), 7047 &free); 7048 freed++; 7049 } 7050 } 7051 PV_STAT(atomic_add_long(&pv_entry_frees, freed)); 7052 PV_STAT(atomic_add_int(&pv_entry_spare, freed)); 7053 PV_STAT(atomic_subtract_long(&pv_entry_count, freed)); 7054 if (allfree) { 7055 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 7056 TAILQ_INSERT_TAIL(&free_chunks[pc_to_domain(pc)], pc, 7057 pc_list); 7058 } 7059 } 7060 if (lock != NULL) 7061 rw_wunlock(lock); 7062 pmap_invalidate_all(pmap); 7063 pmap_bti_deassign_all(pmap); 7064 free_pv_chunk_batch(free_chunks); 7065 PMAP_UNLOCK(pmap); 7066 vm_page_free_pages_toq(&free, true); 7067 } 7068 7069 /* 7070 * This is used to check if a page has been accessed or modified. 
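 * For stage 1 mappings, "accessed" is tested via ATTR_AF and "modified"
 * via the entry being writeable (ATTR_S1_AP_RW).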
7071 */ 7072 static bool 7073 pmap_page_test_mappings(vm_page_t m, bool accessed, bool modified) 7074 { 7075 struct rwlock *lock; 7076 pv_entry_t pv; 7077 struct md_page *pvh; 7078 pt_entry_t l3e, mask, *pte, value; 7079 pmap_t pmap; 7080 int md_gen, pvh_gen; 7081 bool rv; 7082 7083 rv = false; 7084 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 7085 rw_rlock(lock); 7086 restart: 7087 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 7088 pmap = PV_PMAP(pv); 7089 PMAP_ASSERT_STAGE1(pmap); 7090 if (!PMAP_TRYLOCK(pmap)) { 7091 md_gen = m->md.pv_gen; 7092 rw_runlock(lock); 7093 PMAP_LOCK(pmap); 7094 rw_rlock(lock); 7095 if (md_gen != m->md.pv_gen) { 7096 PMAP_UNLOCK(pmap); 7097 goto restart; 7098 } 7099 } 7100 pte = pmap_pte_exists(pmap, pv->pv_va, 3, __func__); 7101 mask = 0; 7102 value = 0; 7103 if (modified) { 7104 mask |= ATTR_S1_AP_RW_BIT; 7105 value |= ATTR_S1_AP(ATTR_S1_AP_RW); 7106 } 7107 if (accessed) { 7108 mask |= ATTR_AF | ATTR_DESCR_MASK; 7109 value |= ATTR_AF | L3_PAGE; 7110 } 7111 l3e = pmap_load(pte); 7112 if ((l3e & ATTR_CONTIGUOUS) != 0) 7113 l3e = pmap_load_l3c(pte); 7114 PMAP_UNLOCK(pmap); 7115 rv = (l3e & mask) == value; 7116 if (rv) 7117 goto out; 7118 } 7119 if ((m->flags & PG_FICTITIOUS) == 0) { 7120 pvh = page_to_pvh(m); 7121 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 7122 pmap = PV_PMAP(pv); 7123 PMAP_ASSERT_STAGE1(pmap); 7124 if (!PMAP_TRYLOCK(pmap)) { 7125 md_gen = m->md.pv_gen; 7126 pvh_gen = pvh->pv_gen; 7127 rw_runlock(lock); 7128 PMAP_LOCK(pmap); 7129 rw_rlock(lock); 7130 if (md_gen != m->md.pv_gen || 7131 pvh_gen != pvh->pv_gen) { 7132 PMAP_UNLOCK(pmap); 7133 goto restart; 7134 } 7135 } 7136 pte = pmap_pte_exists(pmap, pv->pv_va, 2, __func__); 7137 mask = 0; 7138 value = 0; 7139 if (modified) { 7140 mask |= ATTR_S1_AP_RW_BIT; 7141 value |= ATTR_S1_AP(ATTR_S1_AP_RW); 7142 } 7143 if (accessed) { 7144 mask |= ATTR_AF | ATTR_DESCR_MASK; 7145 value |= ATTR_AF | L2_BLOCK; 7146 } 7147 rv = (pmap_load(pte) & mask) == value; 7148 PMAP_UNLOCK(pmap); 7149 if (rv) 7150 goto out; 7151 } 7152 } 7153 out: 7154 rw_runlock(lock); 7155 return (rv); 7156 } 7157 7158 /* 7159 * pmap_is_modified: 7160 * 7161 * Return whether or not the specified physical page was modified 7162 * in any physical maps. 7163 */ 7164 bool 7165 pmap_is_modified(vm_page_t m) 7166 { 7167 7168 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 7169 ("pmap_is_modified: page %p is not managed", m)); 7170 7171 /* 7172 * If the page is not busied then this check is racy. 7173 */ 7174 if (!pmap_page_is_write_mapped(m)) 7175 return (false); 7176 return (pmap_page_test_mappings(m, false, true)); 7177 } 7178 7179 /* 7180 * pmap_is_prefaultable: 7181 * 7182 * Return whether or not the specified virtual address is eligible 7183 * for prefault. 7184 */ 7185 bool 7186 pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr) 7187 { 7188 pd_entry_t *pde; 7189 pt_entry_t *pte; 7190 bool rv; 7191 int lvl; 7192 7193 /* 7194 * Return true if and only if the L3 entry for the specified virtual 7195 * address is allocated but invalid. 7196 */ 7197 rv = false; 7198 PMAP_LOCK(pmap); 7199 pde = pmap_pde(pmap, addr, &lvl); 7200 if (pde != NULL && lvl == 2) { 7201 pte = pmap_l2_to_l3(pde, addr); 7202 rv = pmap_load(pte) == 0; 7203 } 7204 PMAP_UNLOCK(pmap); 7205 return (rv); 7206 } 7207 7208 /* 7209 * pmap_is_referenced: 7210 * 7211 * Return whether or not the specified physical page was referenced 7212 * in any physical maps. 
7213 */ 7214 bool 7215 pmap_is_referenced(vm_page_t m) 7216 { 7217 7218 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 7219 ("pmap_is_referenced: page %p is not managed", m)); 7220 return (pmap_page_test_mappings(m, true, false)); 7221 } 7222 7223 /* 7224 * Clear the write and modified bits in each of the given page's mappings. 7225 */ 7226 void 7227 pmap_remove_write(vm_page_t m) 7228 { 7229 struct md_page *pvh; 7230 pmap_t pmap; 7231 struct rwlock *lock; 7232 pv_entry_t next_pv, pv; 7233 pt_entry_t oldpte, *pte, set, clear, mask, val; 7234 vm_offset_t va; 7235 int md_gen, pvh_gen; 7236 7237 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 7238 ("pmap_remove_write: page %p is not managed", m)); 7239 vm_page_assert_busied(m); 7240 7241 if (!pmap_page_is_write_mapped(m)) 7242 return; 7243 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 7244 pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : page_to_pvh(m); 7245 rw_wlock(lock); 7246 retry: 7247 TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { 7248 pmap = PV_PMAP(pv); 7249 PMAP_ASSERT_STAGE1(pmap); 7250 if (!PMAP_TRYLOCK(pmap)) { 7251 pvh_gen = pvh->pv_gen; 7252 rw_wunlock(lock); 7253 PMAP_LOCK(pmap); 7254 rw_wlock(lock); 7255 if (pvh_gen != pvh->pv_gen) { 7256 PMAP_UNLOCK(pmap); 7257 goto retry; 7258 } 7259 } 7260 va = pv->pv_va; 7261 pte = pmap_pte_exists(pmap, va, 2, __func__); 7262 if ((pmap_load(pte) & ATTR_SW_DBM) != 0) 7263 (void)pmap_demote_l2_locked(pmap, pte, va, &lock); 7264 KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m), 7265 ("inconsistent pv lock %p %p for page %p", 7266 lock, VM_PAGE_TO_PV_LIST_LOCK(m), m)); 7267 PMAP_UNLOCK(pmap); 7268 } 7269 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 7270 pmap = PV_PMAP(pv); 7271 if (!PMAP_TRYLOCK(pmap)) { 7272 pvh_gen = pvh->pv_gen; 7273 md_gen = m->md.pv_gen; 7274 rw_wunlock(lock); 7275 PMAP_LOCK(pmap); 7276 rw_wlock(lock); 7277 if (pvh_gen != pvh->pv_gen || 7278 md_gen != m->md.pv_gen) { 7279 PMAP_UNLOCK(pmap); 7280 goto retry; 7281 } 7282 } 7283 pte = pmap_pte_exists(pmap, pv->pv_va, 3, __func__); 7284 oldpte = pmap_load(pte); 7285 if ((oldpte & ATTR_SW_DBM) != 0) { 7286 if ((oldpte & ATTR_CONTIGUOUS) != 0) { 7287 (void)pmap_demote_l3c(pmap, pte, pv->pv_va); 7288 7289 /* 7290 * The L3 entry's accessed bit may have 7291 * changed. 7292 */ 7293 oldpte = pmap_load(pte); 7294 } 7295 if (pmap->pm_stage == PM_STAGE1) { 7296 set = ATTR_S1_AP_RW_BIT; 7297 clear = 0; 7298 mask = ATTR_S1_AP_RW_BIT; 7299 val = ATTR_S1_AP(ATTR_S1_AP_RW); 7300 } else { 7301 set = 0; 7302 clear = ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE); 7303 mask = ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE); 7304 val = ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE); 7305 } 7306 clear |= ATTR_SW_DBM; 7307 while (!atomic_fcmpset_64(pte, &oldpte, 7308 (oldpte | set) & ~clear)) 7309 cpu_spinwait(); 7310 7311 if ((oldpte & mask) == val) 7312 vm_page_dirty(m); 7313 pmap_invalidate_page(pmap, pv->pv_va, true); 7314 } 7315 PMAP_UNLOCK(pmap); 7316 } 7317 rw_wunlock(lock); 7318 vm_page_aflag_clear(m, PGA_WRITEABLE); 7319 } 7320 7321 /* 7322 * pmap_ts_referenced: 7323 * 7324 * Return a count of reference bits for a page, clearing those bits. 7325 * It is not necessary for every reference bit to be cleared, but it 7326 * is necessary that 0 only be returned when there are truly no 7327 * reference bits set. 7328 * 7329 * As an optimization, update the page's dirty field if a modified bit is 7330 * found while counting reference bits. This opportunistic update can be 7331 * performed at low cost and can eliminate the need for some future calls 7332 * to pmap_is_modified(). 
However, since this function stops after 7333 * finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some 7334 * dirty pages. Those dirty pages will only be detected by a future call 7335 * to pmap_is_modified(). 7336 */ 7337 int 7338 pmap_ts_referenced(vm_page_t m) 7339 { 7340 struct md_page *pvh; 7341 pv_entry_t pv, pvf; 7342 pmap_t pmap; 7343 struct rwlock *lock; 7344 pt_entry_t *pte, tpte; 7345 vm_offset_t va; 7346 vm_paddr_t pa; 7347 int cleared, md_gen, not_cleared, pvh_gen; 7348 struct spglist free; 7349 7350 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 7351 ("pmap_ts_referenced: page %p is not managed", m)); 7352 SLIST_INIT(&free); 7353 cleared = 0; 7354 pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : page_to_pvh(m); 7355 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 7356 rw_wlock(lock); 7357 retry: 7358 not_cleared = 0; 7359 if ((pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL) 7360 goto small_mappings; 7361 pv = pvf; 7362 do { 7363 if (pvf == NULL) 7364 pvf = pv; 7365 pmap = PV_PMAP(pv); 7366 if (!PMAP_TRYLOCK(pmap)) { 7367 pvh_gen = pvh->pv_gen; 7368 rw_wunlock(lock); 7369 PMAP_LOCK(pmap); 7370 rw_wlock(lock); 7371 if (pvh_gen != pvh->pv_gen) { 7372 PMAP_UNLOCK(pmap); 7373 goto retry; 7374 } 7375 } 7376 va = pv->pv_va; 7377 pte = pmap_pte_exists(pmap, va, 2, __func__); 7378 tpte = pmap_load(pte); 7379 if (pmap_pte_dirty(pmap, tpte)) { 7380 /* 7381 * Although "tpte" is mapping a 2MB page, because 7382 * this function is called at a 4KB page granularity, 7383 * we only update the 4KB page under test. 7384 */ 7385 vm_page_dirty(m); 7386 } 7387 if ((tpte & ATTR_AF) != 0) { 7388 pa = VM_PAGE_TO_PHYS(m); 7389 7390 /* 7391 * Since this reference bit is shared by 512 4KB pages, 7392 * it should not be cleared every time it is tested. 7393 * Apply a simple "hash" function on the physical page 7394 * number, the virtual superpage number, and the pmap 7395 * address to select one 4KB page out of the 512 on 7396 * which testing the reference bit will result in 7397 * clearing that reference bit. This function is 7398 * designed to avoid the selection of the same 4KB page 7399 * for every 2MB page mapping. 7400 * 7401 * On demotion, a mapping that hasn't been referenced 7402 * is simply destroyed. To avoid the possibility of a 7403 * subsequent page fault on a demoted wired mapping, 7404 * always leave its reference bit set. Moreover, 7405 * since the superpage is wired, the current state of 7406 * its reference bit won't affect page replacement. 7407 */ 7408 if ((((pa >> PAGE_SHIFT) ^ (va >> L2_SHIFT) ^ 7409 (uintptr_t)pmap) & (Ln_ENTRIES - 1)) == 0 && 7410 (tpte & ATTR_SW_WIRED) == 0) { 7411 pmap_clear_bits(pte, ATTR_AF); 7412 pmap_invalidate_page(pmap, va, true); 7413 cleared++; 7414 } else 7415 not_cleared++; 7416 } 7417 PMAP_UNLOCK(pmap); 7418 /* Rotate the PV list if it has more than one entry. 
*/ 7419 if (TAILQ_NEXT(pv, pv_next) != NULL) { 7420 TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); 7421 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); 7422 pvh->pv_gen++; 7423 } 7424 if (cleared + not_cleared >= PMAP_TS_REFERENCED_MAX) 7425 goto out; 7426 } while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf); 7427 small_mappings: 7428 if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL) 7429 goto out; 7430 pv = pvf; 7431 do { 7432 if (pvf == NULL) 7433 pvf = pv; 7434 pmap = PV_PMAP(pv); 7435 if (!PMAP_TRYLOCK(pmap)) { 7436 pvh_gen = pvh->pv_gen; 7437 md_gen = m->md.pv_gen; 7438 rw_wunlock(lock); 7439 PMAP_LOCK(pmap); 7440 rw_wlock(lock); 7441 if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { 7442 PMAP_UNLOCK(pmap); 7443 goto retry; 7444 } 7445 } 7446 pte = pmap_pte_exists(pmap, pv->pv_va, 3, __func__); 7447 tpte = pmap_load(pte); 7448 if (pmap_pte_dirty(pmap, tpte)) 7449 vm_page_dirty(m); 7450 if ((tpte & ATTR_AF) != 0) { 7451 if ((tpte & ATTR_SW_WIRED) == 0) { 7452 /* 7453 * Clear the accessed bit in this L3 entry 7454 * regardless of the contiguous bit. 7455 */ 7456 pmap_clear_bits(pte, ATTR_AF); 7457 pmap_invalidate_page(pmap, pv->pv_va, true); 7458 cleared++; 7459 } else 7460 not_cleared++; 7461 } else if ((tpte & ATTR_CONTIGUOUS) != 0 && 7462 (pmap_load_l3c(pte) & ATTR_AF) != 0) { 7463 /* 7464 * An L3C superpage mapping is regarded as accessed 7465 * until the accessed bit has been cleared in all 7466 * of its constituent entries. 7467 */ 7468 not_cleared++; 7469 } 7470 PMAP_UNLOCK(pmap); 7471 /* Rotate the PV list if it has more than one entry. */ 7472 if (TAILQ_NEXT(pv, pv_next) != NULL) { 7473 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 7474 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 7475 m->md.pv_gen++; 7476 } 7477 } while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && cleared + 7478 not_cleared < PMAP_TS_REFERENCED_MAX); 7479 out: 7480 rw_wunlock(lock); 7481 vm_page_free_pages_toq(&free, true); 7482 return (cleared + not_cleared); 7483 } 7484 7485 /* 7486 * Apply the given advice to the specified range of addresses within the 7487 * given pmap. Depending on the advice, clear the referenced and/or 7488 * modified flags in each mapping and set the mapped page's dirty field. 
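 * Only MADV_DONTNEED and MADV_FREE are handled; other advice values
 * are ignored.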
7489 */ 7490 void 7491 pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice) 7492 { 7493 struct rwlock *lock; 7494 vm_offset_t va, va_next, dva; 7495 vm_page_t m; 7496 pd_entry_t *l0, *l1, *l2, oldl2; 7497 pt_entry_t *l3, *dl3, oldl3; 7498 7499 PMAP_ASSERT_STAGE1(pmap); 7500 7501 if (advice != MADV_DONTNEED && advice != MADV_FREE) 7502 return; 7503 7504 PMAP_LOCK(pmap); 7505 for (; sva < eva; sva = va_next) { 7506 l0 = pmap_l0(pmap, sva); 7507 if (pmap_load(l0) == 0) { 7508 va_next = (sva + L0_SIZE) & ~L0_OFFSET; 7509 if (va_next < sva) 7510 va_next = eva; 7511 continue; 7512 } 7513 7514 va_next = (sva + L1_SIZE) & ~L1_OFFSET; 7515 if (va_next < sva) 7516 va_next = eva; 7517 l1 = pmap_l0_to_l1(l0, sva); 7518 if (pmap_load(l1) == 0) 7519 continue; 7520 if ((pmap_load(l1) & ATTR_DESCR_MASK) == L1_BLOCK) { 7521 PMAP_ASSERT_L1_BLOCKS_SUPPORTED; 7522 continue; 7523 } 7524 7525 va_next = (sva + L2_SIZE) & ~L2_OFFSET; 7526 if (va_next < sva) 7527 va_next = eva; 7528 l2 = pmap_l1_to_l2(l1, sva); 7529 oldl2 = pmap_load(l2); 7530 if (oldl2 == 0) 7531 continue; 7532 if ((oldl2 & ATTR_DESCR_MASK) == L2_BLOCK) { 7533 if ((oldl2 & ATTR_SW_MANAGED) == 0) 7534 continue; 7535 lock = NULL; 7536 if (!pmap_demote_l2_locked(pmap, l2, sva, &lock)) { 7537 if (lock != NULL) 7538 rw_wunlock(lock); 7539 7540 /* 7541 * The 2MB page mapping was destroyed. 7542 */ 7543 continue; 7544 } 7545 7546 /* 7547 * Unless the page mappings are wired, remove the 7548 * mapping to a single page so that a subsequent 7549 * access may repromote. Choosing the last page 7550 * within the address range [sva, min(va_next, eva)) 7551 * generally results in more repromotions. Since the 7552 * underlying page table page is fully populated, this 7553 * removal never frees a page table page. 7554 */ 7555 if ((oldl2 & ATTR_SW_WIRED) == 0) { 7556 va = eva; 7557 if (va > va_next) 7558 va = va_next; 7559 va -= PAGE_SIZE; 7560 KASSERT(va >= sva, 7561 ("pmap_advise: no address gap")); 7562 l3 = pmap_l2_to_l3(l2, va); 7563 KASSERT(pmap_load(l3) != 0, 7564 ("pmap_advise: invalid PTE")); 7565 pmap_remove_l3(pmap, l3, va, pmap_load(l2), 7566 NULL, &lock); 7567 } 7568 if (lock != NULL) 7569 rw_wunlock(lock); 7570 } 7571 KASSERT((pmap_load(l2) & ATTR_DESCR_MASK) == L2_TABLE, 7572 ("pmap_advise: invalid L2 entry after demotion")); 7573 if (va_next > eva) 7574 va_next = eva; 7575 va = va_next; 7576 for (l3 = pmap_l2_to_l3(l2, sva); sva != va_next; l3++, 7577 sva += L3_SIZE) { 7578 oldl3 = pmap_load(l3); 7579 if ((oldl3 & (ATTR_SW_MANAGED | ATTR_DESCR_MASK)) != 7580 (ATTR_SW_MANAGED | L3_PAGE)) 7581 goto maybe_invlrng; 7582 else if (pmap_pte_dirty(pmap, oldl3)) { 7583 if (advice == MADV_DONTNEED) { 7584 /* 7585 * Future calls to pmap_is_modified() 7586 * can be avoided by making the page 7587 * dirty now. 7588 */ 7589 m = PTE_TO_VM_PAGE(oldl3); 7590 vm_page_dirty(m); 7591 } 7592 if ((oldl3 & ATTR_CONTIGUOUS) != 0) { 7593 /* 7594 * Unconditionally demote the L3C 7595 * superpage because we do not allow 7596 * writeable, clean superpages. 7597 */ 7598 (void)pmap_demote_l3c(pmap, l3, sva); 7599 7600 /* 7601 * Destroy the final mapping before the 7602 * next L3C boundary or va_next, 7603 * whichever comes first, so that a 7604 * subsequent access may act as a 7605 * repromotion trigger. 
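 * For example, with 4KB base pages an L3C superpage covers 16 L3
 * entries (64KB), so the entry chosen for removal maps the last 4KB
 * page below the next 64KB boundary, or below va_next if that is
 * closer.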
7606 */ 7607 if ((oldl3 & ATTR_SW_WIRED) == 0) { 7608 dva = MIN((sva & ~L3C_OFFSET) + 7609 L3C_SIZE - PAGE_SIZE, 7610 va_next - PAGE_SIZE); 7611 dl3 = pmap_l2_to_l3(l2, dva); 7612 KASSERT(pmap_load(dl3) != 0, 7613 ("pmap_advise: invalid PTE")); 7614 lock = NULL; 7615 pmap_remove_l3(pmap, dl3, dva, 7616 pmap_load(l2), NULL, &lock); 7617 if (lock != NULL) 7618 rw_wunlock(lock); 7619 } 7620 7621 /* 7622 * The L3 entry's accessed bit may have 7623 * changed. 7624 */ 7625 oldl3 = pmap_load(l3); 7626 } 7627 7628 /* 7629 * Check that we did not just destroy this entry so 7630 * we avoid corrupting the page able. 7631 */ 7632 if (oldl3 != 0) { 7633 while (!atomic_fcmpset_long(l3, &oldl3, 7634 (oldl3 & ~ATTR_AF) | 7635 ATTR_S1_AP(ATTR_S1_AP_RO))) 7636 cpu_spinwait(); 7637 } 7638 } else if ((oldl3 & ATTR_AF) != 0) { 7639 /* 7640 * Clear the accessed bit in this L3 entry 7641 * regardless of the contiguous bit. 7642 */ 7643 pmap_clear_bits(l3, ATTR_AF); 7644 } else 7645 goto maybe_invlrng; 7646 if (va == va_next) 7647 va = sva; 7648 continue; 7649 maybe_invlrng: 7650 if (va != va_next) { 7651 pmap_s1_invalidate_range(pmap, va, sva, true); 7652 va = va_next; 7653 } 7654 } 7655 if (va != va_next) 7656 pmap_s1_invalidate_range(pmap, va, sva, true); 7657 } 7658 PMAP_UNLOCK(pmap); 7659 } 7660 7661 /* 7662 * Clear the modify bits on the specified physical page. 7663 */ 7664 void 7665 pmap_clear_modify(vm_page_t m) 7666 { 7667 struct md_page *pvh; 7668 struct rwlock *lock; 7669 pmap_t pmap; 7670 pv_entry_t next_pv, pv; 7671 pd_entry_t *l2, oldl2; 7672 pt_entry_t *l3, oldl3; 7673 vm_offset_t va; 7674 int md_gen, pvh_gen; 7675 7676 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 7677 ("pmap_clear_modify: page %p is not managed", m)); 7678 vm_page_assert_busied(m); 7679 7680 if (!pmap_page_is_write_mapped(m)) 7681 return; 7682 pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : page_to_pvh(m); 7683 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 7684 rw_wlock(lock); 7685 restart: 7686 TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { 7687 pmap = PV_PMAP(pv); 7688 PMAP_ASSERT_STAGE1(pmap); 7689 if (!PMAP_TRYLOCK(pmap)) { 7690 pvh_gen = pvh->pv_gen; 7691 rw_wunlock(lock); 7692 PMAP_LOCK(pmap); 7693 rw_wlock(lock); 7694 if (pvh_gen != pvh->pv_gen) { 7695 PMAP_UNLOCK(pmap); 7696 goto restart; 7697 } 7698 } 7699 va = pv->pv_va; 7700 l2 = pmap_l2(pmap, va); 7701 oldl2 = pmap_load(l2); 7702 /* If oldl2 has ATTR_SW_DBM set, then it is also dirty. */ 7703 if ((oldl2 & ATTR_SW_DBM) != 0 && 7704 pmap_demote_l2_locked(pmap, l2, va, &lock) && 7705 (oldl2 & ATTR_SW_WIRED) == 0) { 7706 /* 7707 * Write protect the mapping to a single page so that 7708 * a subsequent write access may repromote. 
7709 */ 7710 va += VM_PAGE_TO_PHYS(m) - PTE_TO_PHYS(oldl2); 7711 l3 = pmap_l2_to_l3(l2, va); 7712 oldl3 = pmap_load(l3); 7713 while (!atomic_fcmpset_long(l3, &oldl3, 7714 (oldl3 & ~ATTR_SW_DBM) | ATTR_S1_AP(ATTR_S1_AP_RO))) 7715 cpu_spinwait(); 7716 vm_page_dirty(m); 7717 pmap_s1_invalidate_page(pmap, va, true); 7718 } 7719 PMAP_UNLOCK(pmap); 7720 } 7721 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 7722 pmap = PV_PMAP(pv); 7723 PMAP_ASSERT_STAGE1(pmap); 7724 if (!PMAP_TRYLOCK(pmap)) { 7725 md_gen = m->md.pv_gen; 7726 pvh_gen = pvh->pv_gen; 7727 rw_wunlock(lock); 7728 PMAP_LOCK(pmap); 7729 rw_wlock(lock); 7730 if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { 7731 PMAP_UNLOCK(pmap); 7732 goto restart; 7733 } 7734 } 7735 l2 = pmap_l2(pmap, pv->pv_va); 7736 l3 = pmap_l2_to_l3(l2, pv->pv_va); 7737 oldl3 = pmap_load(l3); 7738 KASSERT((oldl3 & ATTR_CONTIGUOUS) == 0 || 7739 (oldl3 & (ATTR_SW_DBM | ATTR_S1_AP_RW_BIT)) != 7740 (ATTR_SW_DBM | ATTR_S1_AP(ATTR_S1_AP_RO)), 7741 ("writeable L3C superpage not dirty")); 7742 if ((oldl3 & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) == ATTR_SW_DBM) { 7743 if ((oldl3 & ATTR_CONTIGUOUS) != 0) 7744 (void)pmap_demote_l3c(pmap, l3, pv->pv_va); 7745 pmap_set_bits(l3, ATTR_S1_AP(ATTR_S1_AP_RO)); 7746 pmap_s1_invalidate_page(pmap, pv->pv_va, true); 7747 } 7748 PMAP_UNLOCK(pmap); 7749 } 7750 rw_wunlock(lock); 7751 } 7752 7753 void * 7754 pmap_mapbios(vm_paddr_t pa, vm_size_t size) 7755 { 7756 struct pmap_preinit_mapping *ppim; 7757 vm_offset_t va, offset; 7758 pd_entry_t old_l2e, *pde; 7759 pt_entry_t *l2; 7760 int i, lvl, l2_blocks, free_l2_count, start_idx; 7761 7762 if (!vm_initialized) { 7763 /* 7764 * No L3 ptables so map entire L2 blocks where start VA is: 7765 * preinit_map_va + start_idx * L2_SIZE 7766 * There may be duplicate mappings (multiple VA -> same PA) but 7767 * ARM64 dcache is always PIPT so that's acceptable. 
7768 */ 7769 if (size == 0) 7770 return (NULL); 7771 7772 /* Calculate how many L2 blocks are needed for the mapping */ 7773 l2_blocks = (roundup2(pa + size, L2_SIZE) - 7774 rounddown2(pa, L2_SIZE)) >> L2_SHIFT; 7775 7776 offset = pa & L2_OFFSET; 7777 7778 if (preinit_map_va == 0) 7779 return (NULL); 7780 7781 /* Map 2MiB L2 blocks from reserved VA space */ 7782 7783 free_l2_count = 0; 7784 start_idx = -1; 7785 /* Find enough free contiguous VA space */ 7786 for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { 7787 ppim = pmap_preinit_mapping + i; 7788 if (free_l2_count > 0 && ppim->pa != 0) { 7789 /* Not enough space here */ 7790 free_l2_count = 0; 7791 start_idx = -1; 7792 continue; 7793 } 7794 7795 if (ppim->pa == 0) { 7796 /* Free L2 block */ 7797 if (start_idx == -1) 7798 start_idx = i; 7799 free_l2_count++; 7800 if (free_l2_count == l2_blocks) 7801 break; 7802 } 7803 } 7804 if (free_l2_count != l2_blocks) 7805 panic("%s: too many preinit mappings", __func__); 7806 7807 va = preinit_map_va + (start_idx * L2_SIZE); 7808 for (i = start_idx; i < start_idx + l2_blocks; i++) { 7809 /* Mark entries as allocated */ 7810 ppim = pmap_preinit_mapping + i; 7811 ppim->pa = pa; 7812 ppim->va = va + offset; 7813 ppim->size = size; 7814 } 7815 7816 /* Map L2 blocks */ 7817 pa = rounddown2(pa, L2_SIZE); 7818 old_l2e = 0; 7819 for (i = 0; i < l2_blocks; i++) { 7820 pde = pmap_pde(kernel_pmap, va, &lvl); 7821 KASSERT(pde != NULL, 7822 ("pmap_mapbios: Invalid page entry, va: 0x%lx", 7823 va)); 7824 KASSERT(lvl == 1, 7825 ("pmap_mapbios: Invalid level %d", lvl)); 7826 7827 /* Insert L2_BLOCK */ 7828 l2 = pmap_l1_to_l2(pde, va); 7829 old_l2e |= pmap_load_store(l2, 7830 PHYS_TO_PTE(pa) | ATTR_AF | pmap_sh_attr | 7831 ATTR_S1_XN | ATTR_KERN_GP | 7832 ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK) | L2_BLOCK); 7833 7834 va += L2_SIZE; 7835 pa += L2_SIZE; 7836 } 7837 if ((old_l2e & ATTR_DESCR_VALID) != 0) 7838 pmap_s1_invalidate_all(kernel_pmap); 7839 else { 7840 /* 7841 * Because the old entries were invalid and the new 7842 * mappings are not executable, an isb is not required. 
7843 */ 7844 dsb(ishst); 7845 } 7846 7847 va = preinit_map_va + (start_idx * L2_SIZE); 7848 7849 } else { 7850 /* kva_alloc may be used to map the pages */ 7851 offset = pa & PAGE_MASK; 7852 size = round_page(offset + size); 7853 7854 va = kva_alloc(size); 7855 if (va == 0) 7856 panic("%s: Couldn't allocate KVA", __func__); 7857 7858 pde = pmap_pde(kernel_pmap, va, &lvl); 7859 KASSERT(lvl == 2, ("pmap_mapbios: Invalid level %d", lvl)); 7860 7861 /* L3 table is linked */ 7862 va = trunc_page(va); 7863 pa = trunc_page(pa); 7864 pmap_kenter(va, size, pa, memory_mapping_mode(pa)); 7865 } 7866 7867 return ((void *)(va + offset)); 7868 } 7869 7870 void 7871 pmap_unmapbios(void *p, vm_size_t size) 7872 { 7873 struct pmap_preinit_mapping *ppim; 7874 vm_offset_t offset, va, va_trunc; 7875 pd_entry_t *pde; 7876 pt_entry_t *l2; 7877 int i, lvl, l2_blocks, block; 7878 bool preinit_map; 7879 7880 va = (vm_offset_t)p; 7881 l2_blocks = 7882 (roundup2(va + size, L2_SIZE) - rounddown2(va, L2_SIZE)) >> L2_SHIFT; 7883 KASSERT(l2_blocks > 0, ("pmap_unmapbios: invalid size %lx", size)); 7884 7885 /* Remove preinit mapping */ 7886 preinit_map = false; 7887 block = 0; 7888 for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { 7889 ppim = pmap_preinit_mapping + i; 7890 if (ppim->va == va) { 7891 KASSERT(ppim->size == size, 7892 ("pmap_unmapbios: size mismatch")); 7893 ppim->va = 0; 7894 ppim->pa = 0; 7895 ppim->size = 0; 7896 preinit_map = true; 7897 offset = block * L2_SIZE; 7898 va_trunc = rounddown2(va, L2_SIZE) + offset; 7899 7900 /* Remove L2_BLOCK */ 7901 pde = pmap_pde(kernel_pmap, va_trunc, &lvl); 7902 KASSERT(pde != NULL, 7903 ("pmap_unmapbios: Invalid page entry, va: 0x%lx", 7904 va_trunc)); 7905 l2 = pmap_l1_to_l2(pde, va_trunc); 7906 pmap_clear(l2); 7907 7908 if (block == (l2_blocks - 1)) 7909 break; 7910 block++; 7911 } 7912 } 7913 if (preinit_map) { 7914 pmap_s1_invalidate_all(kernel_pmap); 7915 return; 7916 } 7917 7918 /* Unmap the pages reserved with kva_alloc. */ 7919 if (vm_initialized) { 7920 offset = va & PAGE_MASK; 7921 size = round_page(offset + size); 7922 va = trunc_page(va); 7923 7924 /* Unmap and invalidate the pages */ 7925 pmap_kremove_device(va, size); 7926 7927 kva_free(va, size); 7928 } 7929 } 7930 7931 /* 7932 * Sets the memory attribute for the specified page. 7933 */ 7934 void 7935 pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma) 7936 { 7937 7938 m->md.pv_memattr = ma; 7939 7940 /* 7941 * If "m" is a normal page, update its direct mapping. This update 7942 * can be relied upon to perform any cache operations that are 7943 * required for data coherence. 7944 */ 7945 if ((m->flags & PG_FICTITIOUS) == 0 && 7946 pmap_change_attr(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)), PAGE_SIZE, 7947 m->md.pv_memattr) != 0) 7948 panic("memory attribute change on the direct map failed"); 7949 } 7950 7951 /* 7952 * Changes the specified virtual address range's memory type to that given by 7953 * the parameter "mode". The specified virtual address range must be 7954 * completely contained within either the direct map or the kernel map. If 7955 * the virtual address range is contained within the kernel map, then the 7956 * memory type for each of the corresponding ranges of the direct map is also 7957 * changed. (The corresponding ranges of the direct map are those ranges that 7958 * map the same physical pages as the specified virtual address range.) 
These 7959 * changes to the direct map are necessary because Intel describes the 7960 * behavior of their processors as "undefined" if two or more mappings to the 7961 * same physical page have different memory types. 7962 * 7963 * Returns zero if the change completed successfully, and either EINVAL or 7964 * ENOMEM if the change failed. Specifically, EINVAL is returned if some part 7965 * of the virtual address range was not mapped, and ENOMEM is returned if 7966 * there was insufficient memory available to complete the change. In the 7967 * latter case, the memory type may have been changed on some part of the 7968 * virtual address range or the direct map. 7969 */ 7970 int 7971 pmap_change_attr(vm_offset_t va, vm_size_t size, int mode) 7972 { 7973 int error; 7974 7975 PMAP_LOCK(kernel_pmap); 7976 error = pmap_change_props_locked(va, size, PROT_NONE, mode, false); 7977 PMAP_UNLOCK(kernel_pmap); 7978 return (error); 7979 } 7980 7981 /* 7982 * Changes the specified virtual address range's protections to those 7983 * specified by "prot". Like pmap_change_attr(), protections for aliases 7984 * in the direct map are updated as well. Protections on aliasing mappings may 7985 * be a subset of the requested protections; for example, mappings in the direct 7986 * map are never executable. 7987 */ 7988 int 7989 pmap_change_prot(vm_offset_t va, vm_size_t size, vm_prot_t prot) 7990 { 7991 int error; 7992 7993 /* Only supported within the kernel map. */ 7994 if (va < VM_MIN_KERNEL_ADDRESS) 7995 return (EINVAL); 7996 7997 PMAP_LOCK(kernel_pmap); 7998 error = pmap_change_props_locked(va, size, prot, -1, false); 7999 PMAP_UNLOCK(kernel_pmap); 8000 return (error); 8001 } 8002 8003 static int 8004 pmap_change_props_locked(vm_offset_t va, vm_size_t size, vm_prot_t prot, 8005 int mode, bool skip_unmapped) 8006 { 8007 vm_offset_t base, offset, tmpva; 8008 vm_size_t pte_size; 8009 vm_paddr_t pa; 8010 pt_entry_t pte, *ptep, *newpte; 8011 pt_entry_t bits, mask; 8012 int lvl, rv; 8013 8014 PMAP_LOCK_ASSERT(kernel_pmap, MA_OWNED); 8015 base = trunc_page(va); 8016 offset = va & PAGE_MASK; 8017 size = round_page(offset + size); 8018 8019 if (!VIRT_IN_DMAP(base) && 8020 !(base >= VM_MIN_KERNEL_ADDRESS && base < VM_MAX_KERNEL_ADDRESS)) 8021 return (EINVAL); 8022 8023 bits = 0; 8024 mask = 0; 8025 if (mode != -1) { 8026 bits = ATTR_S1_IDX(mode); 8027 mask = ATTR_S1_IDX_MASK; 8028 if (mode == VM_MEMATTR_DEVICE) { 8029 mask |= ATTR_S1_XN; 8030 bits |= ATTR_S1_XN; 8031 } 8032 } 8033 if (prot != VM_PROT_NONE) { 8034 /* Don't mark the DMAP as executable. It never is on arm64. */ 8035 if (VIRT_IN_DMAP(base)) { 8036 prot &= ~VM_PROT_EXECUTE; 8037 /* 8038 * XXX Mark the DMAP as writable for now. We rely 8039 * on this in ddb & dtrace to insert breakpoint 8040 * instructions. 8041 */ 8042 prot |= VM_PROT_WRITE; 8043 } 8044 8045 if ((prot & VM_PROT_WRITE) == 0) { 8046 bits |= ATTR_S1_AP(ATTR_S1_AP_RO); 8047 } 8048 if ((prot & VM_PROT_EXECUTE) == 0) { 8049 bits |= ATTR_S1_PXN; 8050 } 8051 bits |= ATTR_S1_UXN; 8052 mask |= ATTR_S1_AP_MASK | ATTR_S1_XN; 8053 } 8054 8055 for (tmpva = base; tmpva < base + size; ) { 8056 ptep = pmap_pte(kernel_pmap, tmpva, &lvl); 8057 if (ptep == NULL && !skip_unmapped) { 8058 return (EINVAL); 8059 } else if ((ptep == NULL && skip_unmapped) || 8060 (pmap_load(ptep) & mask) == bits) { 8061 /* 8062 * We already have the correct attribute or there 8063 * is no memory mapped at this address and we are 8064 * skipping unmapped memory. 
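 * In either case, tmpva is simply advanced to the next boundary for
 * the level at which the lookup stopped.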
8065 */ 8066 switch (lvl) { 8067 default: 8068 panic("Invalid DMAP table level: %d\n", lvl); 8069 case 1: 8070 tmpva = (tmpva & ~L1_OFFSET) + L1_SIZE; 8071 break; 8072 case 2: 8073 tmpva = (tmpva & ~L2_OFFSET) + L2_SIZE; 8074 break; 8075 case 3: 8076 tmpva += PAGE_SIZE; 8077 break; 8078 } 8079 } else { 8080 /* We can't demote/promote this entry */ 8081 MPASS((pmap_load(ptep) & ATTR_SW_NO_PROMOTE) == 0); 8082 8083 /* 8084 * Find the entry and demote it if the requested change 8085 * only applies to part of the address range mapped by 8086 * the entry. 8087 */ 8088 switch (lvl) { 8089 default: 8090 panic("Invalid DMAP table level: %d\n", lvl); 8091 case 1: 8092 PMAP_ASSERT_L1_BLOCKS_SUPPORTED; 8093 if ((tmpva & L1_OFFSET) == 0 && 8094 (base + size - tmpva) >= L1_SIZE) { 8095 pte_size = L1_SIZE; 8096 break; 8097 } 8098 newpte = pmap_demote_l1(kernel_pmap, ptep, 8099 tmpva & ~L1_OFFSET); 8100 if (newpte == NULL) 8101 return (EINVAL); 8102 ptep = pmap_l1_to_l2(ptep, tmpva); 8103 /* FALLTHROUGH */ 8104 case 2: 8105 if ((pmap_load(ptep) & ATTR_CONTIGUOUS) != 0) { 8106 if ((tmpva & L2C_OFFSET) == 0 && 8107 (base + size - tmpva) >= L2C_SIZE) { 8108 pte_size = L2C_SIZE; 8109 break; 8110 } 8111 if (!pmap_demote_l2c(kernel_pmap, ptep, 8112 tmpva)) 8113 return (EINVAL); 8114 } 8115 if ((tmpva & L2_OFFSET) == 0 && 8116 (base + size - tmpva) >= L2_SIZE) { 8117 pte_size = L2_SIZE; 8118 break; 8119 } 8120 newpte = pmap_demote_l2(kernel_pmap, ptep, 8121 tmpva); 8122 if (newpte == NULL) 8123 return (EINVAL); 8124 ptep = pmap_l2_to_l3(ptep, tmpva); 8125 /* FALLTHROUGH */ 8126 case 3: 8127 if ((pmap_load(ptep) & ATTR_CONTIGUOUS) != 0) { 8128 if ((tmpva & L3C_OFFSET) == 0 && 8129 (base + size - tmpva) >= L3C_SIZE) { 8130 pte_size = L3C_SIZE; 8131 break; 8132 } 8133 if (!pmap_demote_l3c(kernel_pmap, ptep, 8134 tmpva)) 8135 return (EINVAL); 8136 } 8137 pte_size = PAGE_SIZE; 8138 break; 8139 } 8140 8141 /* Update the entry */ 8142 pte = pmap_load(ptep); 8143 pte &= ~mask; 8144 pte |= bits; 8145 8146 switch (pte_size) { 8147 case L2C_SIZE: 8148 pmap_update_strided(kernel_pmap, ptep, ptep + 8149 L2C_ENTRIES, pte, tmpva, L2_SIZE, L2C_SIZE); 8150 break; 8151 case L3C_SIZE: 8152 pmap_update_strided(kernel_pmap, ptep, ptep + 8153 L3C_ENTRIES, pte, tmpva, L3_SIZE, L3C_SIZE); 8154 break; 8155 default: 8156 /* 8157 * We are updating a single block or page entry, 8158 * so regardless of pte_size pass PAGE_SIZE in 8159 * order that a single TLB invalidation is 8160 * performed. 8161 */ 8162 pmap_update_entry(kernel_pmap, ptep, pte, tmpva, 8163 PAGE_SIZE); 8164 break; 8165 } 8166 8167 pa = PTE_TO_PHYS(pte); 8168 if (!VIRT_IN_DMAP(tmpva) && PHYS_IN_DMAP(pa)) { 8169 /* 8170 * Keep the DMAP memory in sync. 8171 */ 8172 rv = pmap_change_props_locked( 8173 PHYS_TO_DMAP(pa), pte_size, 8174 prot, mode, true); 8175 if (rv != 0) 8176 return (rv); 8177 } 8178 8179 /* 8180 * If moving to a non-cacheable entry flush 8181 * the cache. 8182 */ 8183 if (mode == VM_MEMATTR_UNCACHEABLE) 8184 cpu_dcache_wbinv_range((void *)tmpva, pte_size); 8185 tmpva += pte_size; 8186 } 8187 } 8188 8189 return (0); 8190 } 8191 8192 /* 8193 * Create an L2 table to map all addresses within an L1 mapping. 
8194 */ 8195 static pt_entry_t * 8196 pmap_demote_l1(pmap_t pmap, pt_entry_t *l1, vm_offset_t va) 8197 { 8198 pt_entry_t *l2, newl2, oldl1; 8199 vm_offset_t tmpl1; 8200 vm_paddr_t l2phys, phys; 8201 vm_page_t ml2; 8202 int i; 8203 8204 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 8205 oldl1 = pmap_load(l1); 8206 PMAP_ASSERT_L1_BLOCKS_SUPPORTED; 8207 KASSERT((oldl1 & ATTR_DESCR_MASK) == L1_BLOCK, 8208 ("pmap_demote_l1: Demoting a non-block entry")); 8209 KASSERT((va & L1_OFFSET) == 0, 8210 ("pmap_demote_l1: Invalid virtual address %#lx", va)); 8211 KASSERT((oldl1 & ATTR_SW_MANAGED) == 0, 8212 ("pmap_demote_l1: Level 1 table shouldn't be managed")); 8213 KASSERT((oldl1 & ATTR_SW_NO_PROMOTE) == 0, 8214 ("pmap_demote_l1: Demoting entry with no-demote flag set")); 8215 8216 tmpl1 = 0; 8217 if (va <= (vm_offset_t)l1 && va + L1_SIZE > (vm_offset_t)l1) { 8218 tmpl1 = kva_alloc(PAGE_SIZE); 8219 if (tmpl1 == 0) 8220 return (NULL); 8221 } 8222 8223 if ((ml2 = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT | VM_ALLOC_WIRED)) == 8224 NULL) { 8225 CTR2(KTR_PMAP, "pmap_demote_l1: failure for va %#lx" 8226 " in pmap %p", va, pmap); 8227 l2 = NULL; 8228 goto fail; 8229 } 8230 8231 l2phys = VM_PAGE_TO_PHYS(ml2); 8232 l2 = (pt_entry_t *)PHYS_TO_DMAP(l2phys); 8233 8234 /* Address the range points at */ 8235 phys = PTE_TO_PHYS(oldl1); 8236 /* The attributed from the old l1 table to be copied */ 8237 newl2 = oldl1 & ATTR_MASK; 8238 8239 /* Create the new entries */ 8240 newl2 |= ATTR_CONTIGUOUS; 8241 for (i = 0; i < Ln_ENTRIES; i++) { 8242 l2[i] = newl2 | phys; 8243 phys += L2_SIZE; 8244 } 8245 KASSERT(l2[0] == (ATTR_CONTIGUOUS | (oldl1 & ~ATTR_DESCR_MASK) | 8246 L2_BLOCK), ("Invalid l2 page (%lx != %lx)", l2[0], 8247 ATTR_CONTIGUOUS | (oldl1 & ~ATTR_DESCR_MASK) | L2_BLOCK)); 8248 8249 if (tmpl1 != 0) { 8250 pmap_kenter(tmpl1, PAGE_SIZE, 8251 DMAP_TO_PHYS((vm_offset_t)l1) & ~L3_OFFSET, 8252 VM_MEMATTR_WRITE_BACK); 8253 l1 = (pt_entry_t *)(tmpl1 + ((vm_offset_t)l1 & PAGE_MASK)); 8254 } 8255 8256 pmap_update_entry(pmap, l1, l2phys | L1_TABLE, va, PAGE_SIZE); 8257 8258 counter_u64_add(pmap_l1_demotions, 1); 8259 fail: 8260 if (tmpl1 != 0) { 8261 pmap_kremove(tmpl1); 8262 kva_free(tmpl1, PAGE_SIZE); 8263 } 8264 8265 return (l2); 8266 } 8267 8268 static void 8269 pmap_fill_l3(pt_entry_t *firstl3, pt_entry_t newl3) 8270 { 8271 pt_entry_t *l3; 8272 8273 for (l3 = firstl3; l3 - firstl3 < Ln_ENTRIES; l3++) { 8274 *l3 = newl3; 8275 newl3 += L3_SIZE; 8276 } 8277 } 8278 8279 static void 8280 pmap_demote_l2_check(pt_entry_t *firstl3p __unused, pt_entry_t newl3e __unused) 8281 { 8282 #ifdef INVARIANTS 8283 #ifdef DIAGNOSTIC 8284 pt_entry_t *xl3p, *yl3p; 8285 8286 for (xl3p = firstl3p; xl3p < firstl3p + Ln_ENTRIES; 8287 xl3p++, newl3e += PAGE_SIZE) { 8288 if (PTE_TO_PHYS(pmap_load(xl3p)) != PTE_TO_PHYS(newl3e)) { 8289 printf("pmap_demote_l2: xl3e %zd and newl3e map " 8290 "different pages: found %#lx, expected %#lx\n", 8291 xl3p - firstl3p, pmap_load(xl3p), newl3e); 8292 printf("page table dump\n"); 8293 for (yl3p = firstl3p; yl3p < firstl3p + Ln_ENTRIES; 8294 yl3p++) { 8295 printf("%zd %#lx\n", yl3p - firstl3p, 8296 pmap_load(yl3p)); 8297 } 8298 panic("firstpte"); 8299 } 8300 } 8301 #else 8302 KASSERT(PTE_TO_PHYS(pmap_load(firstl3p)) == PTE_TO_PHYS(newl3e), 8303 ("pmap_demote_l2: firstl3 and newl3e map different physical" 8304 " addresses")); 8305 #endif 8306 #endif 8307 } 8308 8309 static void 8310 pmap_demote_l2_abort(pmap_t pmap, vm_offset_t va, pt_entry_t *l2, 8311 struct rwlock **lockp) 8312 { 8313 struct spglist free; 8314 8315 
SLIST_INIT(&free); 8316 (void)pmap_remove_l2(pmap, l2, va, pmap_load(pmap_l1(pmap, va)), &free, 8317 lockp); 8318 vm_page_free_pages_toq(&free, true); 8319 } 8320 8321 /* 8322 * Create an L3 table to map all addresses within an L2 mapping. 8323 */ 8324 static pt_entry_t * 8325 pmap_demote_l2_locked(pmap_t pmap, pt_entry_t *l2, vm_offset_t va, 8326 struct rwlock **lockp) 8327 { 8328 pt_entry_t *l3, newl3, oldl2; 8329 vm_offset_t tmpl2; 8330 vm_paddr_t l3phys; 8331 vm_page_t ml3; 8332 8333 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 8334 PMAP_ASSERT_STAGE1(pmap); 8335 KASSERT(ADDR_IS_CANONICAL(va), 8336 ("%s: Address not in canonical form: %lx", __func__, va)); 8337 8338 l3 = NULL; 8339 oldl2 = pmap_load(l2); 8340 KASSERT((oldl2 & ATTR_DESCR_MASK) == L2_BLOCK, 8341 ("pmap_demote_l2: Demoting a non-block entry")); 8342 KASSERT((oldl2 & ATTR_SW_NO_PROMOTE) == 0, 8343 ("pmap_demote_l2: Demoting entry with no-demote flag set")); 8344 va &= ~L2_OFFSET; 8345 8346 tmpl2 = 0; 8347 if (va <= (vm_offset_t)l2 && va + L2_SIZE > (vm_offset_t)l2) { 8348 tmpl2 = kva_alloc(PAGE_SIZE); 8349 if (tmpl2 == 0) 8350 return (NULL); 8351 } 8352 8353 /* 8354 * Invalidate the 2MB page mapping and return "failure" if the 8355 * mapping was never accessed. 8356 */ 8357 if ((oldl2 & ATTR_AF) == 0) { 8358 KASSERT((oldl2 & ATTR_SW_WIRED) == 0, 8359 ("pmap_demote_l2: a wired mapping is missing ATTR_AF")); 8360 pmap_demote_l2_abort(pmap, va, l2, lockp); 8361 CTR2(KTR_PMAP, "pmap_demote_l2: failure for va %#lx in pmap %p", 8362 va, pmap); 8363 goto fail; 8364 } 8365 8366 if ((ml3 = pmap_remove_pt_page(pmap, va)) == NULL) { 8367 KASSERT((oldl2 & ATTR_SW_WIRED) == 0, 8368 ("pmap_demote_l2: page table page for a wired mapping" 8369 " is missing")); 8370 8371 /* 8372 * If the page table page is missing and the mapping 8373 * is for a kernel address, the mapping must belong to 8374 * either the direct map or the early kernel memory. 8375 * Page table pages are preallocated for every other 8376 * part of the kernel address space, so the direct map 8377 * region and early kernel memory are the only parts of the 8378 * kernel address space that must be handled here. 8379 */ 8380 KASSERT(!ADDR_IS_KERNEL(va) || VIRT_IN_DMAP(va) || 8381 (va >= VM_MIN_KERNEL_ADDRESS && va < kernel_vm_end), 8382 ("pmap_demote_l2: No saved mpte for va %#lx", va)); 8383 8384 /* 8385 * If the 2MB page mapping belongs to the direct map 8386 * region of the kernel's address space, then the page 8387 * allocation request specifies the highest possible 8388 * priority (VM_ALLOC_INTERRUPT). Otherwise, the 8389 * priority is normal. 8390 */ 8391 ml3 = vm_page_alloc_noobj( 8392 (VIRT_IN_DMAP(va) ? VM_ALLOC_INTERRUPT : 0) | 8393 VM_ALLOC_WIRED); 8394 8395 /* 8396 * If the allocation of the new page table page fails, 8397 * invalidate the 2MB page mapping and return "failure". 
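 * Destroying the mapping, rather than leaving it in place, means that
 * a later page fault can recreate the mapping with 4KB pages once
 * memory becomes available.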
8398 */ 8399 if (ml3 == NULL) { 8400 pmap_demote_l2_abort(pmap, va, l2, lockp); 8401 CTR2(KTR_PMAP, "pmap_demote_l2: failure for va %#lx" 8402 " in pmap %p", va, pmap); 8403 goto fail; 8404 } 8405 ml3->pindex = pmap_l2_pindex(va); 8406 8407 if (!ADDR_IS_KERNEL(va)) { 8408 ml3->ref_count = NL3PG; 8409 pmap_resident_count_inc(pmap, 1); 8410 } 8411 } 8412 l3phys = VM_PAGE_TO_PHYS(ml3); 8413 l3 = (pt_entry_t *)PHYS_TO_DMAP(l3phys); 8414 newl3 = ATTR_CONTIGUOUS | (oldl2 & ~ATTR_DESCR_MASK) | L3_PAGE; 8415 KASSERT((oldl2 & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) != 8416 (ATTR_S1_AP(ATTR_S1_AP_RO) | ATTR_SW_DBM), 8417 ("pmap_demote_l2: L2 entry is writeable but not dirty")); 8418 8419 /* 8420 * If the PTP is not leftover from an earlier promotion or it does not 8421 * have ATTR_AF set in every L3E, then fill it. The new L3Es will all 8422 * have ATTR_AF set. 8423 * 8424 * When pmap_update_entry() clears the old L2 mapping, it (indirectly) 8425 * performs a dsb(). That dsb() ensures that the stores for filling 8426 * "l3" are visible before "l3" is added to the page table. 8427 */ 8428 if (!vm_page_all_valid(ml3)) 8429 pmap_fill_l3(l3, newl3); 8430 8431 pmap_demote_l2_check(l3, newl3); 8432 8433 /* 8434 * If the mapping has changed attributes, update the L3Es. 8435 */ 8436 if ((pmap_load(l3) & ATTR_PROMOTE) != (newl3 & ATTR_PROMOTE)) 8437 pmap_fill_l3(l3, newl3); 8438 8439 /* 8440 * Map the temporary page so we don't lose access to the l2 table. 8441 */ 8442 if (tmpl2 != 0) { 8443 pmap_kenter(tmpl2, PAGE_SIZE, 8444 DMAP_TO_PHYS((vm_offset_t)l2) & ~L3_OFFSET, 8445 VM_MEMATTR_WRITE_BACK); 8446 l2 = (pt_entry_t *)(tmpl2 + ((vm_offset_t)l2 & PAGE_MASK)); 8447 } 8448 8449 /* 8450 * The spare PV entries must be reserved prior to demoting the 8451 * mapping, that is, prior to changing the PDE. Otherwise, the state 8452 * of the L2 and the PV lists will be inconsistent, which can result 8453 * in reclaim_pv_chunk() attempting to remove a PV entry from the 8454 * wrong PV list and pmap_pv_demote_l2() failing to find the expected 8455 * PV entry for the 2MB page mapping that is being demoted. 8456 */ 8457 if ((oldl2 & ATTR_SW_MANAGED) != 0) 8458 reserve_pv_entries(pmap, Ln_ENTRIES - 1, lockp); 8459 8460 /* 8461 * Pass PAGE_SIZE so that a single TLB invalidation is performed on 8462 * the 2MB page mapping. 8463 */ 8464 pmap_update_entry(pmap, l2, l3phys | L2_TABLE, va, PAGE_SIZE); 8465 8466 /* 8467 * Demote the PV entry. 8468 */ 8469 if ((oldl2 & ATTR_SW_MANAGED) != 0) 8470 pmap_pv_demote_l2(pmap, va, PTE_TO_PHYS(oldl2), lockp); 8471 8472 atomic_add_long(&pmap_l2_demotions, 1); 8473 CTR3(KTR_PMAP, "pmap_demote_l2: success for va %#lx" 8474 " in pmap %p %lx", va, pmap, l3[0]); 8475 8476 fail: 8477 if (tmpl2 != 0) { 8478 pmap_kremove(tmpl2); 8479 kva_free(tmpl2, PAGE_SIZE); 8480 } 8481 8482 return (l3); 8483 8484 } 8485 8486 static pt_entry_t * 8487 pmap_demote_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va) 8488 { 8489 struct rwlock *lock; 8490 pt_entry_t *l3; 8491 8492 lock = NULL; 8493 l3 = pmap_demote_l2_locked(pmap, l2, va, &lock); 8494 if (lock != NULL) 8495 rw_wunlock(lock); 8496 return (l3); 8497 } 8498 8499 /* 8500 * Demote an L2C superpage mapping to L2C_ENTRIES L2 block mappings. 
8501 */ 8502 static bool 8503 pmap_demote_l2c(pmap_t pmap, pt_entry_t *l2p, vm_offset_t va) 8504 { 8505 pd_entry_t *l2c_end, *l2c_start, l2e, mask, nbits, *tl2p; 8506 vm_offset_t tmpl3; 8507 register_t intr; 8508 8509 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 8510 PMAP_ASSERT_STAGE1(pmap); 8511 l2c_start = (pd_entry_t *)((uintptr_t)l2p & ~((L2C_ENTRIES * 8512 sizeof(pd_entry_t)) - 1)); 8513 l2c_end = l2c_start + L2C_ENTRIES; 8514 tmpl3 = 0; 8515 if ((va & ~L2C_OFFSET) < (vm_offset_t)l2c_end && 8516 (vm_offset_t)l2c_start < (va & ~L2C_OFFSET) + L2C_SIZE) { 8517 tmpl3 = kva_alloc(PAGE_SIZE); 8518 if (tmpl3 == 0) 8519 return (false); 8520 pmap_kenter(tmpl3, PAGE_SIZE, 8521 DMAP_TO_PHYS((vm_offset_t)l2c_start) & ~L3_OFFSET, 8522 VM_MEMATTR_WRITE_BACK); 8523 l2c_start = (pd_entry_t *)(tmpl3 + 8524 ((vm_offset_t)l2c_start & PAGE_MASK)); 8525 l2c_end = (pd_entry_t *)(tmpl3 + 8526 ((vm_offset_t)l2c_end & PAGE_MASK)); 8527 } 8528 mask = 0; 8529 nbits = ATTR_DESCR_VALID; 8530 intr = intr_disable(); 8531 8532 /* 8533 * Break the mappings. 8534 */ 8535 for (tl2p = l2c_start; tl2p < l2c_end; tl2p++) { 8536 /* 8537 * Clear the mapping's contiguous and valid bits, but leave 8538 * the rest of the entry unchanged, so that a lockless, 8539 * concurrent pmap_kextract() can still lookup the physical 8540 * address. 8541 */ 8542 l2e = pmap_load(tl2p); 8543 KASSERT((l2e & ATTR_CONTIGUOUS) != 0, 8544 ("pmap_demote_l2c: missing ATTR_CONTIGUOUS")); 8545 KASSERT((l2e & (ATTR_SW_DBM | ATTR_S1_AP_RW_BIT)) != 8546 (ATTR_SW_DBM | ATTR_S1_AP(ATTR_S1_AP_RO)), 8547 ("pmap_demote_l2c: missing ATTR_S1_AP_RW")); 8548 while (!atomic_fcmpset_64(tl2p, &l2e, l2e & ~(ATTR_CONTIGUOUS | 8549 ATTR_DESCR_VALID))) 8550 cpu_spinwait(); 8551 8552 /* 8553 * Hardware accessed and dirty bit maintenance might only 8554 * update a single L2 entry, so we must combine the accessed 8555 * and dirty bits from this entire set of contiguous L2 8556 * entries. 8557 */ 8558 if ((l2e & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) == 8559 (ATTR_S1_AP(ATTR_S1_AP_RW) | ATTR_SW_DBM)) 8560 mask = ATTR_S1_AP_RW_BIT; 8561 nbits |= l2e & ATTR_AF; 8562 } 8563 if ((nbits & ATTR_AF) != 0) { 8564 pmap_s1_invalidate_strided(pmap, va & ~L2C_OFFSET, (va + 8565 L2C_SIZE) & ~L2C_OFFSET, L2_SIZE, true); 8566 } 8567 8568 /* 8569 * Remake the mappings, updating the accessed and dirty bits. 8570 */ 8571 for (tl2p = l2c_start; tl2p < l2c_end; tl2p++) { 8572 l2e = pmap_load(tl2p); 8573 while (!atomic_fcmpset_64(tl2p, &l2e, (l2e & ~mask) | nbits)) 8574 cpu_spinwait(); 8575 } 8576 dsb(ishst); 8577 8578 intr_restore(intr); 8579 if (tmpl3 != 0) { 8580 pmap_kremove(tmpl3); 8581 kva_free(tmpl3, PAGE_SIZE); 8582 } 8583 counter_u64_add(pmap_l2c_demotions, 1); 8584 CTR2(KTR_PMAP, "pmap_demote_l2c: success for va %#lx in pmap %p", 8585 va, pmap); 8586 return (true); 8587 } 8588 8589 /* 8590 * Demote a L3C superpage mapping to L3C_ENTRIES 4KB page mappings. 
8591 */ 8592 static bool 8593 pmap_demote_l3c(pmap_t pmap, pt_entry_t *l3p, vm_offset_t va) 8594 { 8595 pt_entry_t *l3c_end, *l3c_start, l3e, mask, nbits, *tl3p; 8596 vm_offset_t tmpl3; 8597 register_t intr; 8598 8599 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 8600 l3c_start = (pt_entry_t *)((uintptr_t)l3p & ~((L3C_ENTRIES * 8601 sizeof(pt_entry_t)) - 1)); 8602 l3c_end = l3c_start + L3C_ENTRIES; 8603 tmpl3 = 0; 8604 if ((va & ~L3C_OFFSET) < (vm_offset_t)l3c_end && 8605 (vm_offset_t)l3c_start < (va & ~L3C_OFFSET) + L3C_SIZE) { 8606 tmpl3 = kva_alloc(PAGE_SIZE); 8607 if (tmpl3 == 0) 8608 return (false); 8609 pmap_kenter(tmpl3, PAGE_SIZE, 8610 DMAP_TO_PHYS((vm_offset_t)l3c_start) & ~L3_OFFSET, 8611 VM_MEMATTR_WRITE_BACK); 8612 l3c_start = (pt_entry_t *)(tmpl3 + 8613 ((vm_offset_t)l3c_start & PAGE_MASK)); 8614 l3c_end = (pt_entry_t *)(tmpl3 + 8615 ((vm_offset_t)l3c_end & PAGE_MASK)); 8616 } 8617 mask = 0; 8618 nbits = ATTR_DESCR_VALID; 8619 intr = intr_disable(); 8620 8621 /* 8622 * Break the mappings. 8623 */ 8624 for (tl3p = l3c_start; tl3p < l3c_end; tl3p++) { 8625 /* 8626 * Clear the mapping's contiguous and valid bits, but leave 8627 * the rest of the entry unchanged, so that a lockless, 8628 * concurrent pmap_kextract() can still lookup the physical 8629 * address. 8630 */ 8631 l3e = pmap_load(tl3p); 8632 KASSERT((l3e & ATTR_CONTIGUOUS) != 0, 8633 ("pmap_demote_l3c: missing ATTR_CONTIGUOUS")); 8634 KASSERT((l3e & (ATTR_SW_DBM | ATTR_S1_AP_RW_BIT)) != 8635 (ATTR_SW_DBM | ATTR_S1_AP(ATTR_S1_AP_RO)), 8636 ("pmap_demote_l3c: missing ATTR_S1_AP_RW")); 8637 while (!atomic_fcmpset_64(tl3p, &l3e, l3e & ~(ATTR_CONTIGUOUS | 8638 ATTR_DESCR_VALID))) 8639 cpu_spinwait(); 8640 8641 /* 8642 * Hardware accessed and dirty bit maintenance might only 8643 * update a single L3 entry, so we must combine the accessed 8644 * and dirty bits from this entire set of contiguous L3 8645 * entries. 8646 */ 8647 if ((l3e & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) == 8648 (ATTR_S1_AP(ATTR_S1_AP_RW) | ATTR_SW_DBM)) 8649 mask = ATTR_S1_AP_RW_BIT; 8650 nbits |= l3e & ATTR_AF; 8651 } 8652 if ((nbits & ATTR_AF) != 0) { 8653 pmap_invalidate_range(pmap, va & ~L3C_OFFSET, (va + L3C_SIZE) & 8654 ~L3C_OFFSET, true); 8655 } 8656 8657 /* 8658 * Remake the mappings, updating the accessed and dirty bits. 8659 */ 8660 for (tl3p = l3c_start; tl3p < l3c_end; tl3p++) { 8661 l3e = pmap_load(tl3p); 8662 while (!atomic_fcmpset_64(tl3p, &l3e, (l3e & ~mask) | nbits)) 8663 cpu_spinwait(); 8664 } 8665 dsb(ishst); 8666 8667 intr_restore(intr); 8668 if (tmpl3 != 0) { 8669 pmap_kremove(tmpl3); 8670 kva_free(tmpl3, PAGE_SIZE); 8671 } 8672 counter_u64_add(pmap_l3c_demotions, 1); 8673 CTR2(KTR_PMAP, "pmap_demote_l3c: success for va %#lx in pmap %p", 8674 va, pmap); 8675 return (true); 8676 } 8677 8678 /* 8679 * Accumulate the accessed and dirty bits within a L3C superpage and 8680 * return the specified PTE with them applied correctly. 8681 */ 8682 static pt_entry_t 8683 pmap_load_l3c(pt_entry_t *l3p) 8684 { 8685 pt_entry_t *l3c_end, *l3c_start, l3e, mask, nbits, *tl3p; 8686 8687 l3c_start = (pt_entry_t *)((uintptr_t)l3p & ~((L3C_ENTRIES * 8688 sizeof(pt_entry_t)) - 1)); 8689 l3c_end = l3c_start + L3C_ENTRIES; 8690 mask = 0; 8691 nbits = 0; 8692 /* Iterate over each mapping in the superpage. */ 8693 for (tl3p = l3c_start; tl3p < l3c_end; tl3p++) { 8694 l3e = pmap_load(tl3p); 8695 KASSERT((l3e & ATTR_CONTIGUOUS) != 0, 8696 ("pmap_load_l3c: missing ATTR_CONTIGUOUS")); 8697 /* Update mask if the current page has its dirty bit set. 
*/ 8698 if ((l3e & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) == 8699 (ATTR_S1_AP(ATTR_S1_AP_RW) | ATTR_SW_DBM)) 8700 mask = ATTR_S1_AP_RW_BIT; 8701 /* Update nbits if the accessed bit is set. */ 8702 nbits |= l3e & ATTR_AF; 8703 } 8704 return ((pmap_load(l3p) & ~mask) | nbits); 8705 } 8706 8707 /* 8708 * Perform the pmap work for mincore(2). If the page is not both referenced and 8709 * modified by this pmap, returns its physical address so that the caller can 8710 * find other mappings. 8711 */ 8712 int 8713 pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *pap) 8714 { 8715 pt_entry_t *pte, tpte; 8716 vm_paddr_t mask, pa; 8717 int lvl, psind, val; 8718 bool managed; 8719 8720 PMAP_ASSERT_STAGE1(pmap); 8721 PMAP_LOCK(pmap); 8722 pte = pmap_pte(pmap, addr, &lvl); 8723 if (pte != NULL) { 8724 tpte = pmap_load(pte); 8725 8726 switch (lvl) { 8727 case 3: 8728 mask = L3_OFFSET; 8729 psind = (tpte & ATTR_CONTIGUOUS) != 0 ? 1 : 0; 8730 break; 8731 case 2: 8732 mask = L2_OFFSET; 8733 psind = 2; 8734 break; 8735 case 1: 8736 mask = L1_OFFSET; 8737 psind = 3; 8738 break; 8739 default: 8740 panic("pmap_mincore: invalid level %d", lvl); 8741 } 8742 8743 managed = (tpte & ATTR_SW_MANAGED) != 0; 8744 val = MINCORE_INCORE | MINCORE_PSIND(psind); 8745 if ((managed && pmap_pte_dirty(pmap, tpte)) || (!managed && 8746 (tpte & ATTR_S1_AP_RW_BIT) == ATTR_S1_AP(ATTR_S1_AP_RW))) 8747 val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER; 8748 if ((tpte & ATTR_AF) == ATTR_AF) 8749 val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER; 8750 8751 pa = PTE_TO_PHYS(tpte) | (addr & mask); 8752 } else { 8753 managed = false; 8754 val = 0; 8755 } 8756 8757 if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) != 8758 (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) && managed) { 8759 *pap = pa; 8760 } 8761 PMAP_UNLOCK(pmap); 8762 return (val); 8763 } 8764 8765 /* 8766 * Garbage collect every ASID that is neither active on a processor nor 8767 * reserved. 8768 */ 8769 static void 8770 pmap_reset_asid_set(pmap_t pmap) 8771 { 8772 pmap_t curpmap; 8773 int asid, cpuid, epoch; 8774 struct asid_set *set; 8775 enum pmap_stage stage; 8776 8777 set = pmap->pm_asid_set; 8778 stage = pmap->pm_stage; 8779 8780 set = pmap->pm_asid_set; 8781 KASSERT(set != NULL, ("%s: NULL asid set", __func__)); 8782 mtx_assert(&set->asid_set_mutex, MA_OWNED); 8783 8784 /* 8785 * Ensure that the store to asid_epoch is globally visible before the 8786 * loads from pc_curpmap are performed. 
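 * Any pmap whose cookie still carries the old epoch will be given a
 * fresh ASID the next time it is activated; see pmap_activate_int().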
8787 */ 8788 epoch = set->asid_epoch + 1; 8789 if (epoch == INT_MAX) 8790 epoch = 0; 8791 set->asid_epoch = epoch; 8792 dsb(ishst); 8793 if (stage == PM_STAGE1) { 8794 __asm __volatile("tlbi vmalle1is"); 8795 } else { 8796 KASSERT(pmap_clean_stage2_tlbi != NULL, 8797 ("%s: Unset stage 2 tlb invalidation callback\n", 8798 __func__)); 8799 pmap_clean_stage2_tlbi(); 8800 } 8801 dsb(ish); 8802 bit_nclear(set->asid_set, ASID_FIRST_AVAILABLE, 8803 set->asid_set_size - 1); 8804 CPU_FOREACH(cpuid) { 8805 if (cpuid == curcpu) 8806 continue; 8807 if (stage == PM_STAGE1) { 8808 curpmap = pcpu_find(cpuid)->pc_curpmap; 8809 PMAP_ASSERT_STAGE1(pmap); 8810 } else { 8811 curpmap = pcpu_find(cpuid)->pc_curvmpmap; 8812 if (curpmap == NULL) 8813 continue; 8814 PMAP_ASSERT_STAGE2(pmap); 8815 } 8816 KASSERT(curpmap->pm_asid_set == set, ("Incorrect set")); 8817 asid = COOKIE_TO_ASID(curpmap->pm_cookie); 8818 if (asid == -1) 8819 continue; 8820 bit_set(set->asid_set, asid); 8821 curpmap->pm_cookie = COOKIE_FROM(asid, epoch); 8822 } 8823 } 8824 8825 /* 8826 * Allocate a new ASID for the specified pmap. 8827 */ 8828 static void 8829 pmap_alloc_asid(pmap_t pmap) 8830 { 8831 struct asid_set *set; 8832 int new_asid; 8833 8834 set = pmap->pm_asid_set; 8835 KASSERT(set != NULL, ("%s: NULL asid set", __func__)); 8836 8837 mtx_lock_spin(&set->asid_set_mutex); 8838 8839 /* 8840 * While this processor was waiting to acquire the asid set mutex, 8841 * pmap_reset_asid_set() running on another processor might have 8842 * updated this pmap's cookie to the current epoch. In which case, we 8843 * don't need to allocate a new ASID. 8844 */ 8845 if (COOKIE_TO_EPOCH(pmap->pm_cookie) == set->asid_epoch) 8846 goto out; 8847 8848 bit_ffc_at(set->asid_set, set->asid_next, set->asid_set_size, 8849 &new_asid); 8850 if (new_asid == -1) { 8851 bit_ffc_at(set->asid_set, ASID_FIRST_AVAILABLE, 8852 set->asid_next, &new_asid); 8853 if (new_asid == -1) { 8854 pmap_reset_asid_set(pmap); 8855 bit_ffc_at(set->asid_set, ASID_FIRST_AVAILABLE, 8856 set->asid_set_size, &new_asid); 8857 KASSERT(new_asid != -1, ("ASID allocation failure")); 8858 } 8859 } 8860 bit_set(set->asid_set, new_asid); 8861 set->asid_next = new_asid + 1; 8862 pmap->pm_cookie = COOKIE_FROM(new_asid, set->asid_epoch); 8863 out: 8864 mtx_unlock_spin(&set->asid_set_mutex); 8865 } 8866 8867 static uint64_t __read_mostly ttbr_flags; 8868 8869 /* 8870 * Compute the value that should be stored in ttbr0 to activate the specified 8871 * pmap. This value may change from time to time. 8872 */ 8873 uint64_t 8874 pmap_to_ttbr0(pmap_t pmap) 8875 { 8876 uint64_t ttbr; 8877 8878 ttbr = pmap->pm_ttbr; 8879 ttbr |= ASID_TO_OPERAND(COOKIE_TO_ASID(pmap->pm_cookie)); 8880 ttbr |= ttbr_flags; 8881 8882 return (ttbr); 8883 } 8884 8885 static void 8886 pmap_set_cnp(void *arg) 8887 { 8888 uint64_t ttbr0, ttbr1; 8889 u_int cpuid; 8890 8891 cpuid = *(u_int *)arg; 8892 if (cpuid == curcpu) { 8893 /* 8894 * Set the flags while all CPUs are handling the 8895 * smp_rendezvous so will not call pmap_to_ttbr0. Any calls 8896 * to pmap_to_ttbr0 after this will have the CnP flag set. 8897 * The dsb after invalidating the TLB will act as a barrier 8898 * to ensure all CPUs can observe this change. 
8899 */ 8900 ttbr_flags |= TTBR_CnP; 8901 } 8902 8903 ttbr0 = READ_SPECIALREG(ttbr0_el1); 8904 ttbr0 |= TTBR_CnP; 8905 8906 ttbr1 = READ_SPECIALREG(ttbr1_el1); 8907 ttbr1 |= TTBR_CnP; 8908 8909 /* Update ttbr{0,1}_el1 with the CnP flag */ 8910 WRITE_SPECIALREG(ttbr0_el1, ttbr0); 8911 WRITE_SPECIALREG(ttbr1_el1, ttbr1); 8912 isb(); 8913 __asm __volatile("tlbi vmalle1is"); 8914 dsb(ish); 8915 isb(); 8916 } 8917 8918 /* 8919 * Defer enabling some features until we have read the ID registers to know 8920 * if they are supported on all CPUs. 8921 */ 8922 static void 8923 pmap_init_mp(void *dummy __unused) 8924 { 8925 uint64_t reg; 8926 8927 if (get_kernel_reg(ID_AA64PFR1_EL1, ®)) { 8928 if (ID_AA64PFR1_BT_VAL(reg) != ID_AA64PFR1_BT_NONE) { 8929 if (bootverbose) 8930 printf("Enabling BTI\n"); 8931 pmap_bti_support = true; 8932 8933 pmap_bti_ranges_zone = uma_zcreate("BTI ranges", 8934 sizeof(struct rs_el), NULL, NULL, NULL, NULL, 8935 UMA_ALIGN_PTR, 0); 8936 } 8937 } 8938 } 8939 SYSINIT(pmap_init_mp, SI_SUB_CPU, SI_ORDER_ANY, pmap_init_mp, NULL); 8940 8941 /* 8942 * Defer enabling CnP until we have read the ID registers to know if it's 8943 * supported on all CPUs. 8944 */ 8945 static void 8946 pmap_init_cnp(void *dummy __unused) 8947 { 8948 uint64_t reg; 8949 u_int cpuid; 8950 8951 if (!get_kernel_reg(ID_AA64MMFR2_EL1, ®)) 8952 return; 8953 8954 if (ID_AA64MMFR2_CnP_VAL(reg) != ID_AA64MMFR2_CnP_NONE) { 8955 if (bootverbose) 8956 printf("Enabling CnP\n"); 8957 cpuid = curcpu; 8958 smp_rendezvous(NULL, pmap_set_cnp, NULL, &cpuid); 8959 } 8960 8961 } 8962 SYSINIT(pmap_init_cnp, SI_SUB_SMP, SI_ORDER_ANY, pmap_init_cnp, NULL); 8963 8964 static bool 8965 pmap_activate_int(pmap_t pmap) 8966 { 8967 struct asid_set *set; 8968 int epoch; 8969 8970 KASSERT(PCPU_GET(curpmap) != NULL, ("no active pmap")); 8971 KASSERT(pmap != kernel_pmap, ("kernel pmap activation")); 8972 8973 if ((pmap->pm_stage == PM_STAGE1 && pmap == PCPU_GET(curpmap)) || 8974 (pmap->pm_stage == PM_STAGE2 && pmap == PCPU_GET(curvmpmap))) { 8975 /* 8976 * Handle the possibility that the old thread was preempted 8977 * after an "ic" or "tlbi" instruction but before it performed 8978 * a "dsb" instruction. If the old thread migrates to a new 8979 * processor, its completion of a "dsb" instruction on that 8980 * new processor does not guarantee that the "ic" or "tlbi" 8981 * instructions performed on the old processor have completed. 8982 */ 8983 dsb(ish); 8984 return (false); 8985 } 8986 8987 set = pmap->pm_asid_set; 8988 KASSERT(set != NULL, ("%s: NULL asid set", __func__)); 8989 8990 /* 8991 * Ensure that the store to curpmap is globally visible before the 8992 * load from asid_epoch is performed. 
8993 */ 8994 if (pmap->pm_stage == PM_STAGE1) 8995 PCPU_SET(curpmap, pmap); 8996 else 8997 PCPU_SET(curvmpmap, pmap); 8998 dsb(ish); 8999 epoch = COOKIE_TO_EPOCH(pmap->pm_cookie); 9000 if (epoch >= 0 && epoch != set->asid_epoch) 9001 pmap_alloc_asid(pmap); 9002 9003 if (pmap->pm_stage == PM_STAGE1) { 9004 set_ttbr0(pmap_to_ttbr0(pmap)); 9005 if (PCPU_GET(bcast_tlbi_workaround) != 0) 9006 invalidate_local_icache(); 9007 } 9008 return (true); 9009 } 9010 9011 void 9012 pmap_activate_vm(pmap_t pmap) 9013 { 9014 9015 PMAP_ASSERT_STAGE2(pmap); 9016 9017 (void)pmap_activate_int(pmap); 9018 } 9019 9020 void 9021 pmap_activate(struct thread *td) 9022 { 9023 pmap_t pmap; 9024 9025 pmap = vmspace_pmap(td->td_proc->p_vmspace); 9026 PMAP_ASSERT_STAGE1(pmap); 9027 critical_enter(); 9028 (void)pmap_activate_int(pmap); 9029 critical_exit(); 9030 } 9031 9032 /* 9033 * Activate the thread we are switching to. 9034 * To simplify the assembly in cpu_throw return the new threads pcb. 9035 */ 9036 struct pcb * 9037 pmap_switch(struct thread *new) 9038 { 9039 pcpu_bp_harden bp_harden; 9040 struct pcb *pcb; 9041 9042 /* Store the new curthread */ 9043 PCPU_SET(curthread, new); 9044 9045 /* And the new pcb */ 9046 pcb = new->td_pcb; 9047 PCPU_SET(curpcb, pcb); 9048 9049 /* 9050 * TODO: We may need to flush the cache here if switching 9051 * to a user process. 9052 */ 9053 9054 if (pmap_activate_int(vmspace_pmap(new->td_proc->p_vmspace))) { 9055 /* 9056 * Stop userspace from training the branch predictor against 9057 * other processes. This will call into a CPU specific 9058 * function that clears the branch predictor state. 9059 */ 9060 bp_harden = PCPU_GET(bp_harden); 9061 if (bp_harden != NULL) 9062 bp_harden(); 9063 } 9064 9065 return (pcb); 9066 } 9067 9068 void 9069 pmap_sync_icache(pmap_t pmap, vm_offset_t va, vm_size_t sz) 9070 { 9071 9072 PMAP_ASSERT_STAGE1(pmap); 9073 KASSERT(ADDR_IS_CANONICAL(va), 9074 ("%s: Address not in canonical form: %lx", __func__, va)); 9075 9076 if (ADDR_IS_KERNEL(va)) { 9077 cpu_icache_sync_range((void *)va, sz); 9078 } else { 9079 u_int len, offset; 9080 vm_paddr_t pa; 9081 9082 /* Find the length of data in this page to flush */ 9083 offset = va & PAGE_MASK; 9084 len = imin(PAGE_SIZE - offset, sz); 9085 9086 while (sz != 0) { 9087 /* Extract the physical address & find it in the DMAP */ 9088 pa = pmap_extract(pmap, va); 9089 if (pa != 0) 9090 cpu_icache_sync_range((void *)PHYS_TO_DMAP(pa), 9091 len); 9092 9093 /* Move to the next page */ 9094 sz -= len; 9095 va += len; 9096 /* Set the length for the next iteration */ 9097 len = imin(PAGE_SIZE, sz); 9098 } 9099 } 9100 } 9101 9102 static int 9103 pmap_stage2_fault(pmap_t pmap, uint64_t esr, uint64_t far) 9104 { 9105 pd_entry_t *pdep; 9106 pt_entry_t *ptep, pte; 9107 int rv, lvl, dfsc; 9108 9109 PMAP_ASSERT_STAGE2(pmap); 9110 rv = KERN_FAILURE; 9111 9112 /* Data and insn aborts use same encoding for FSC field. 
*/ 9113 dfsc = esr & ISS_DATA_DFSC_MASK; 9114 switch (dfsc) { 9115 case ISS_DATA_DFSC_TF_L0: 9116 case ISS_DATA_DFSC_TF_L1: 9117 case ISS_DATA_DFSC_TF_L2: 9118 case ISS_DATA_DFSC_TF_L3: 9119 PMAP_LOCK(pmap); 9120 pdep = pmap_pde(pmap, far, &lvl); 9121 if (pdep == NULL || lvl != (dfsc - ISS_DATA_DFSC_TF_L1)) { 9122 PMAP_UNLOCK(pmap); 9123 break; 9124 } 9125 9126 switch (lvl) { 9127 case 0: 9128 ptep = pmap_l0_to_l1(pdep, far); 9129 break; 9130 case 1: 9131 ptep = pmap_l1_to_l2(pdep, far); 9132 break; 9133 case 2: 9134 ptep = pmap_l2_to_l3(pdep, far); 9135 break; 9136 default: 9137 panic("%s: Invalid pde level %d", __func__,lvl); 9138 } 9139 goto fault_exec; 9140 9141 case ISS_DATA_DFSC_AFF_L1: 9142 case ISS_DATA_DFSC_AFF_L2: 9143 case ISS_DATA_DFSC_AFF_L3: 9144 PMAP_LOCK(pmap); 9145 ptep = pmap_pte(pmap, far, &lvl); 9146 fault_exec: 9147 if (ptep != NULL && (pte = pmap_load(ptep)) != 0) { 9148 /* 9149 * If accessing an executable page invalidate 9150 * the I-cache so it will be valid when we 9151 * continue execution in the guest. The D-cache 9152 * is assumed to already be clean to the Point 9153 * of Coherency. 9154 */ 9155 if ((pte & ATTR_S2_XN_MASK) != 9156 ATTR_S2_XN(ATTR_S2_XN_NONE)) { 9157 invalidate_icache(); 9158 } 9159 pmap_set_bits(ptep, ATTR_AF | ATTR_DESCR_VALID); 9160 rv = KERN_SUCCESS; 9161 } 9162 PMAP_UNLOCK(pmap); 9163 break; 9164 } 9165 9166 return (rv); 9167 } 9168 9169 int 9170 pmap_fault(pmap_t pmap, uint64_t esr, uint64_t far) 9171 { 9172 pt_entry_t pte, *ptep; 9173 register_t intr; 9174 uint64_t ec, par; 9175 int lvl, rv; 9176 9177 rv = KERN_FAILURE; 9178 9179 ec = ESR_ELx_EXCEPTION(esr); 9180 switch (ec) { 9181 case EXCP_INSN_ABORT_L: 9182 case EXCP_INSN_ABORT: 9183 case EXCP_DATA_ABORT_L: 9184 case EXCP_DATA_ABORT: 9185 break; 9186 default: 9187 return (rv); 9188 } 9189 9190 if (pmap->pm_stage == PM_STAGE2) 9191 return (pmap_stage2_fault(pmap, esr, far)); 9192 9193 /* Data and insn aborts use same encoding for FSC field. */ 9194 switch (esr & ISS_DATA_DFSC_MASK) { 9195 case ISS_DATA_DFSC_AFF_L1: 9196 case ISS_DATA_DFSC_AFF_L2: 9197 case ISS_DATA_DFSC_AFF_L3: 9198 PMAP_LOCK(pmap); 9199 ptep = pmap_pte(pmap, far, &lvl); 9200 if (ptep != NULL) { 9201 pmap_set_bits(ptep, ATTR_AF); 9202 rv = KERN_SUCCESS; 9203 /* 9204 * XXXMJ as an optimization we could mark the entry 9205 * dirty if this is a write fault. 9206 */ 9207 } 9208 PMAP_UNLOCK(pmap); 9209 break; 9210 case ISS_DATA_DFSC_PF_L1: 9211 case ISS_DATA_DFSC_PF_L2: 9212 case ISS_DATA_DFSC_PF_L3: 9213 if ((ec != EXCP_DATA_ABORT_L && ec != EXCP_DATA_ABORT) || 9214 (esr & ISS_DATA_WnR) == 0) 9215 return (rv); 9216 PMAP_LOCK(pmap); 9217 ptep = pmap_pte(pmap, far, &lvl); 9218 if (ptep != NULL && 9219 ((pte = pmap_load(ptep)) & ATTR_SW_DBM) != 0) { 9220 if ((pte & ATTR_S1_AP_RW_BIT) == 9221 ATTR_S1_AP(ATTR_S1_AP_RO)) { 9222 pmap_clear_bits(ptep, ATTR_S1_AP_RW_BIT); 9223 pmap_s1_invalidate_page(pmap, far, true); 9224 } 9225 rv = KERN_SUCCESS; 9226 } 9227 PMAP_UNLOCK(pmap); 9228 break; 9229 case ISS_DATA_DFSC_TF_L0: 9230 case ISS_DATA_DFSC_TF_L1: 9231 case ISS_DATA_DFSC_TF_L2: 9232 case ISS_DATA_DFSC_TF_L3: 9233 /* 9234 * Retry the translation. A break-before-make sequence can 9235 * produce a transient fault. 9236 */ 9237 if (pmap == kernel_pmap) { 9238 /* 9239 * The translation fault may have occurred within a 9240 * critical section. Therefore, we must check the 9241 * address without acquiring the kernel pmap's lock. 
9242 */ 9243 if (pmap_klookup(far, NULL)) 9244 rv = KERN_SUCCESS; 9245 } else { 9246 bool owned; 9247 9248 /* 9249 * In the EFIRT driver we lock the pmap before 9250 * calling into the runtime service. As the lock 9251 * is already owned by the current thread skip 9252 * locking it again. 9253 */ 9254 owned = PMAP_OWNED(pmap); 9255 if (!owned) 9256 PMAP_LOCK(pmap); 9257 /* Ask the MMU to check the address. */ 9258 intr = intr_disable(); 9259 par = arm64_address_translate_s1e0r(far); 9260 intr_restore(intr); 9261 if (!owned) 9262 PMAP_UNLOCK(pmap); 9263 9264 /* 9265 * If the translation was successful, then we can 9266 * return success to the trap handler. 9267 */ 9268 if (PAR_SUCCESS(par)) 9269 rv = KERN_SUCCESS; 9270 } 9271 break; 9272 } 9273 9274 return (rv); 9275 } 9276 9277 /* 9278 * Increase the starting virtual address of the given mapping if a 9279 * different alignment might result in more superpage mappings. 9280 */ 9281 void 9282 pmap_align_superpage(vm_object_t object, vm_ooffset_t offset, 9283 vm_offset_t *addr, vm_size_t size) 9284 { 9285 vm_offset_t superpage_offset; 9286 9287 if (size < L3C_SIZE) 9288 return; 9289 if (object != NULL && (object->flags & OBJ_COLORED) != 0) 9290 offset += ptoa(object->pg_color); 9291 9292 /* 9293 * Considering the object's physical alignment, is the mapping large 9294 * enough to encompass an L2 (2MB/32MB) superpage ... 9295 */ 9296 superpage_offset = offset & L2_OFFSET; 9297 if (size - ((L2_SIZE - superpage_offset) & L2_OFFSET) >= L2_SIZE) { 9298 /* 9299 * If the virtual and physical alignments differ, then 9300 * increase the virtual address so that the alignments match. 9301 */ 9302 if ((*addr & L2_OFFSET) < superpage_offset) 9303 *addr = (*addr & ~L2_OFFSET) + superpage_offset; 9304 else if ((*addr & L2_OFFSET) > superpage_offset) 9305 *addr = ((*addr + L2_OFFSET) & ~L2_OFFSET) + 9306 superpage_offset; 9307 return; 9308 } 9309 /* ... or an L3C (64KB/2MB) superpage? */ 9310 superpage_offset = offset & L3C_OFFSET; 9311 if (size - ((L3C_SIZE - superpage_offset) & L3C_OFFSET) >= L3C_SIZE) { 9312 if ((*addr & L3C_OFFSET) < superpage_offset) 9313 *addr = (*addr & ~L3C_OFFSET) + superpage_offset; 9314 else if ((*addr & L3C_OFFSET) > superpage_offset) 9315 *addr = ((*addr + L3C_OFFSET) & ~L3C_OFFSET) + 9316 superpage_offset; 9317 } 9318 } 9319 9320 /** 9321 * Get the kernel virtual address of a set of physical pages. If there are 9322 * physical addresses not covered by the DMAP perform a transient mapping 9323 * that will be removed when calling pmap_unmap_io_transient. 9324 * 9325 * \param page The pages the caller wishes to obtain the virtual 9326 * address on the kernel memory map. 9327 * \param vaddr On return contains the kernel virtual memory address 9328 * of the pages passed in the page parameter. 9329 * \param count Number of pages passed in. 9330 * \param can_fault true if the thread using the mapped pages can take 9331 * page faults, false otherwise. 9332 * 9333 * \returns true if the caller must call pmap_unmap_io_transient when 9334 * finished or false otherwise. 9335 * 9336 */ 9337 bool 9338 pmap_map_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count, 9339 bool can_fault) 9340 { 9341 vm_paddr_t paddr; 9342 bool needs_mapping; 9343 int error __diagused, i; 9344 9345 /* 9346 * Allocate any KVA space that we need, this is done in a separate 9347 * loop to prevent calling vmem_alloc while pinned. 
9348 */ 9349 needs_mapping = false; 9350 for (i = 0; i < count; i++) { 9351 paddr = VM_PAGE_TO_PHYS(page[i]); 9352 if (__predict_false(!PHYS_IN_DMAP(paddr))) { 9353 error = vmem_alloc(kernel_arena, PAGE_SIZE, 9354 M_BESTFIT | M_WAITOK, &vaddr[i]); 9355 KASSERT(error == 0, ("vmem_alloc failed: %d", error)); 9356 needs_mapping = true; 9357 } else { 9358 vaddr[i] = PHYS_TO_DMAP(paddr); 9359 } 9360 } 9361 9362 /* Exit early if everything is covered by the DMAP */ 9363 if (!needs_mapping) 9364 return (false); 9365 9366 if (!can_fault) 9367 sched_pin(); 9368 for (i = 0; i < count; i++) { 9369 paddr = VM_PAGE_TO_PHYS(page[i]); 9370 if (!PHYS_IN_DMAP(paddr)) { 9371 panic( 9372 "pmap_map_io_transient: TODO: Map out of DMAP data"); 9373 } 9374 } 9375 9376 return (needs_mapping); 9377 } 9378 9379 void 9380 pmap_unmap_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count, 9381 bool can_fault) 9382 { 9383 vm_paddr_t paddr; 9384 int i; 9385 9386 if (!can_fault) 9387 sched_unpin(); 9388 for (i = 0; i < count; i++) { 9389 paddr = VM_PAGE_TO_PHYS(page[i]); 9390 if (!PHYS_IN_DMAP(paddr)) { 9391 panic("ARM64TODO: pmap_unmap_io_transient: Unmap data"); 9392 } 9393 } 9394 } 9395 9396 bool 9397 pmap_is_valid_memattr(pmap_t pmap __unused, vm_memattr_t mode) 9398 { 9399 9400 return (mode >= VM_MEMATTR_DEVICE && mode <= VM_MEMATTR_WRITE_THROUGH); 9401 } 9402 9403 static void * 9404 bti_dup_range(void *ctx __unused, void *data) 9405 { 9406 struct rs_el *node, *new_node; 9407 9408 new_node = uma_zalloc(pmap_bti_ranges_zone, M_NOWAIT); 9409 if (new_node == NULL) 9410 return (NULL); 9411 node = data; 9412 memcpy(new_node, node, sizeof(*node)); 9413 return (new_node); 9414 } 9415 9416 static void 9417 bti_free_range(void *ctx __unused, void *node) 9418 { 9419 9420 uma_zfree(pmap_bti_ranges_zone, node); 9421 } 9422 9423 static int 9424 pmap_bti_assign(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 9425 { 9426 struct rs_el *rs; 9427 int error; 9428 9429 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 9430 PMAP_ASSERT_STAGE1(pmap); 9431 MPASS(pmap->pm_bti != NULL); 9432 rs = uma_zalloc(pmap_bti_ranges_zone, M_NOWAIT); 9433 if (rs == NULL) 9434 return (ENOMEM); 9435 error = rangeset_insert(pmap->pm_bti, sva, eva, rs); 9436 if (error != 0) 9437 uma_zfree(pmap_bti_ranges_zone, rs); 9438 return (error); 9439 } 9440 9441 static void 9442 pmap_bti_deassign_all(pmap_t pmap) 9443 { 9444 9445 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 9446 if (pmap->pm_bti != NULL) 9447 rangeset_remove_all(pmap->pm_bti); 9448 } 9449 9450 /* 9451 * Returns true if the BTI setting is the same across the specified address 9452 * range, and false otherwise. When returning true, updates the referenced PTE 9453 * to reflect the BTI setting. 9454 * 9455 * Only stage 1 pmaps support BTI. The kernel pmap is always a stage 1 pmap 9456 * that has the same BTI setting implicitly across its entire address range. 
9457 */ 9458 static bool 9459 pmap_bti_same(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, pt_entry_t *pte) 9460 { 9461 struct rs_el *rs; 9462 vm_offset_t va; 9463 9464 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 9465 KASSERT(ADDR_IS_CANONICAL(sva), 9466 ("%s: Start address not in canonical form: %lx", __func__, sva)); 9467 KASSERT(ADDR_IS_CANONICAL(eva), 9468 ("%s: End address not in canonical form: %lx", __func__, eva)); 9469 KASSERT((*pte & ATTR_S1_GP) == 0, 9470 ("%s: pte %lx has ATTR_S1_GP preset", __func__, *pte)); 9471 9472 if (pmap == kernel_pmap) { 9473 *pte |= ATTR_KERN_GP; 9474 return (true); 9475 } 9476 if (pmap->pm_bti == NULL) 9477 return (true); 9478 PMAP_ASSERT_STAGE1(pmap); 9479 rs = rangeset_containing(pmap->pm_bti, sva); 9480 if (rs == NULL) 9481 return (rangeset_empty(pmap->pm_bti, sva, eva)); 9482 while ((va = rs->re_end) < eva) { 9483 if ((rs = rangeset_beginning(pmap->pm_bti, va)) == NULL) 9484 return (false); 9485 } 9486 *pte |= ATTR_S1_GP; 9487 return (true); 9488 } 9489 9490 static pt_entry_t 9491 pmap_pte_bti(pmap_t pmap, vm_offset_t va) 9492 { 9493 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 9494 MPASS(ADDR_IS_CANONICAL(va)); 9495 9496 if (pmap->pm_stage != PM_STAGE1) 9497 return (0); 9498 if (pmap == kernel_pmap) 9499 return (ATTR_KERN_GP); 9500 if (pmap->pm_bti != NULL && 9501 rangeset_containing(pmap->pm_bti, va) != NULL) 9502 return (ATTR_S1_GP); 9503 return (0); 9504 } 9505 9506 static void 9507 pmap_bti_on_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 9508 { 9509 9510 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 9511 if (pmap->pm_bti != NULL) 9512 rangeset_remove(pmap->pm_bti, sva, eva); 9513 } 9514 9515 static int 9516 pmap_bti_copy(pmap_t dst_pmap, pmap_t src_pmap) 9517 { 9518 9519 PMAP_LOCK_ASSERT(dst_pmap, MA_OWNED); 9520 PMAP_LOCK_ASSERT(src_pmap, MA_OWNED); 9521 MPASS(src_pmap->pm_stage == dst_pmap->pm_stage); 9522 MPASS(src_pmap->pm_bti != NULL); 9523 MPASS(dst_pmap->pm_bti != NULL); 9524 if (src_pmap->pm_bti->rs_data_ctx == NULL) 9525 return (0); 9526 return (rangeset_copy(dst_pmap->pm_bti, src_pmap->pm_bti)); 9527 } 9528 9529 static void 9530 pmap_bti_update_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, bool set) 9531 { 9532 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 9533 PMAP_ASSERT_STAGE1(pmap); 9534 9535 pmap_mask_set_locked(pmap, sva, eva, ATTR_S1_GP, set ? 
ATTR_S1_GP : 0, 9536 true); 9537 } 9538 9539 int 9540 pmap_bti_set(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 9541 { 9542 int error; 9543 9544 if (pmap->pm_bti == NULL) 9545 return (0); 9546 if (!ADDR_IS_CANONICAL(sva) || !ADDR_IS_CANONICAL(eva)) 9547 return (EINVAL); 9548 if (pmap->pm_stage != PM_STAGE1) 9549 return (EINVAL); 9550 if (eva <= sva || ADDR_IS_KERNEL(eva)) 9551 return (EFAULT); 9552 9553 sva = trunc_page(sva); 9554 eva = round_page(eva); 9555 for (;;) { 9556 PMAP_LOCK(pmap); 9557 error = pmap_bti_assign(pmap, sva, eva); 9558 if (error == 0) 9559 pmap_bti_update_range(pmap, sva, eva, true); 9560 PMAP_UNLOCK(pmap); 9561 if (error != ENOMEM) 9562 break; 9563 vm_wait(NULL); 9564 } 9565 return (error); 9566 } 9567 9568 #if defined(KASAN) || defined(KMSAN) 9569 static pd_entry_t *pmap_san_early_l2; 9570 9571 #define SAN_BOOTSTRAP_L2_SIZE (1 * L2_SIZE) 9572 #define SAN_BOOTSTRAP_SIZE (2 * PAGE_SIZE) 9573 static vm_offset_t __nosanitizeaddress 9574 pmap_san_enter_bootstrap_alloc_l2(void) 9575 { 9576 static uint8_t bootstrap_data[SAN_BOOTSTRAP_L2_SIZE] __aligned(L2_SIZE); 9577 static size_t offset = 0; 9578 vm_offset_t addr; 9579 9580 if (offset + L2_SIZE > sizeof(bootstrap_data)) { 9581 panic("%s: out of memory for the bootstrap shadow map L2 entries", 9582 __func__); 9583 } 9584 9585 addr = (uintptr_t)&bootstrap_data[offset]; 9586 offset += L2_SIZE; 9587 return (addr); 9588 } 9589 9590 /* 9591 * SAN L1 + L2 pages, maybe L3 entries later? 9592 */ 9593 static vm_offset_t __nosanitizeaddress 9594 pmap_san_enter_bootstrap_alloc_pages(int npages) 9595 { 9596 static uint8_t bootstrap_data[SAN_BOOTSTRAP_SIZE] __aligned(PAGE_SIZE); 9597 static size_t offset = 0; 9598 vm_offset_t addr; 9599 9600 if (offset + (npages * PAGE_SIZE) > sizeof(bootstrap_data)) { 9601 panic("%s: out of memory for the bootstrap shadow map", 9602 __func__); 9603 } 9604 9605 addr = (uintptr_t)&bootstrap_data[offset]; 9606 offset += (npages * PAGE_SIZE); 9607 return (addr); 9608 } 9609 9610 static void __nosanitizeaddress 9611 pmap_san_enter_bootstrap(void) 9612 { 9613 vm_offset_t freemempos; 9614 9615 /* L1, L2 */ 9616 freemempos = pmap_san_enter_bootstrap_alloc_pages(2); 9617 bs_state.freemempos = freemempos; 9618 bs_state.va = KASAN_MIN_ADDRESS; 9619 pmap_bootstrap_l1_table(&bs_state); 9620 pmap_san_early_l2 = bs_state.l2; 9621 } 9622 9623 static vm_page_t 9624 pmap_san_enter_alloc_l3(void) 9625 { 9626 vm_page_t m; 9627 9628 m = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT | VM_ALLOC_WIRED | 9629 VM_ALLOC_ZERO); 9630 if (m == NULL) 9631 panic("%s: no memory to grow shadow map", __func__); 9632 return (m); 9633 } 9634 9635 static vm_page_t 9636 pmap_san_enter_alloc_l2(void) 9637 { 9638 return (vm_page_alloc_noobj_contig(VM_ALLOC_WIRED | VM_ALLOC_ZERO, 9639 Ln_ENTRIES, 0, ~0ul, L2_SIZE, 0, VM_MEMATTR_DEFAULT)); 9640 } 9641 9642 void __nosanitizeaddress __nosanitizememory 9643 pmap_san_enter(vm_offset_t va) 9644 { 9645 pd_entry_t *l1, *l2; 9646 pt_entry_t *l3; 9647 vm_page_t m; 9648 9649 if (virtual_avail == 0) { 9650 vm_offset_t block; 9651 int slot; 9652 bool first; 9653 9654 /* Temporary shadow map prior to pmap_bootstrap(). 
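 * The VM system is not available this early, so the shadow is backed by
 * the statically allocated bootstrap arrays above and mapped with L2
 * block entries.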
*/ 9655 first = pmap_san_early_l2 == NULL; 9656 if (first) 9657 pmap_san_enter_bootstrap(); 9658 9659 l2 = pmap_san_early_l2; 9660 slot = pmap_l2_index(va); 9661 9662 if ((pmap_load(&l2[slot]) & ATTR_DESCR_VALID) == 0) { 9663 MPASS(first); 9664 block = pmap_san_enter_bootstrap_alloc_l2(); 9665 pmap_store(&l2[slot], 9666 PHYS_TO_PTE(pmap_early_vtophys(block)) | 9667 PMAP_SAN_PTE_BITS | L2_BLOCK); 9668 dmb(ishst); 9669 } 9670 9671 return; 9672 } 9673 9674 mtx_assert(&kernel_map->system_mtx, MA_OWNED); 9675 l1 = pmap_l1(kernel_pmap, va); 9676 MPASS(l1 != NULL); 9677 if ((pmap_load(l1) & ATTR_DESCR_VALID) == 0) { 9678 m = pmap_san_enter_alloc_l3(); 9679 pmap_store(l1, VM_PAGE_TO_PTE(m) | L1_TABLE); 9680 } 9681 l2 = pmap_l1_to_l2(l1, va); 9682 if ((pmap_load(l2) & ATTR_DESCR_VALID) == 0) { 9683 m = pmap_san_enter_alloc_l2(); 9684 if (m != NULL) { 9685 pmap_store(l2, VM_PAGE_TO_PTE(m) | 9686 PMAP_SAN_PTE_BITS | L2_BLOCK); 9687 } else { 9688 m = pmap_san_enter_alloc_l3(); 9689 pmap_store(l2, VM_PAGE_TO_PTE(m) | L2_TABLE); 9690 } 9691 dmb(ishst); 9692 } 9693 if ((pmap_load(l2) & ATTR_DESCR_MASK) == L2_BLOCK) 9694 return; 9695 l3 = pmap_l2_to_l3(l2, va); 9696 if ((pmap_load(l3) & ATTR_DESCR_VALID) != 0) 9697 return; 9698 m = pmap_san_enter_alloc_l3(); 9699 pmap_store(l3, VM_PAGE_TO_PTE(m) | PMAP_SAN_PTE_BITS | L3_PAGE); 9700 dmb(ishst); 9701 } 9702 #endif /* KASAN || KMSAN */ 9703 9704 /* 9705 * Track a range of the kernel's virtual address space that is contiguous 9706 * in various mapping attributes. 9707 */ 9708 struct pmap_kernel_map_range { 9709 vm_offset_t sva; 9710 pt_entry_t attrs; 9711 int l3pages; 9712 int l3contig; 9713 int l2blocks; 9714 int l2contig; 9715 int l1blocks; 9716 }; 9717 9718 static void 9719 sysctl_kmaps_dump(struct sbuf *sb, struct pmap_kernel_map_range *range, 9720 vm_offset_t eva) 9721 { 9722 const char *mode; 9723 int index; 9724 9725 if (eva <= range->sva) 9726 return; 9727 9728 index = range->attrs & ATTR_S1_IDX_MASK; 9729 switch (index) { 9730 case ATTR_S1_IDX(VM_MEMATTR_DEVICE_NP): 9731 mode = "DEV-NP"; 9732 break; 9733 case ATTR_S1_IDX(VM_MEMATTR_DEVICE): 9734 mode = "DEV"; 9735 break; 9736 case ATTR_S1_IDX(VM_MEMATTR_UNCACHEABLE): 9737 mode = "UC"; 9738 break; 9739 case ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK): 9740 mode = "WB"; 9741 break; 9742 case ATTR_S1_IDX(VM_MEMATTR_WRITE_THROUGH): 9743 mode = "WT"; 9744 break; 9745 default: 9746 printf( 9747 "%s: unknown memory type %x for range 0x%016lx-0x%016lx\n", 9748 __func__, index, range->sva, eva); 9749 mode = "??"; 9750 break; 9751 } 9752 9753 sbuf_printf(sb, "0x%016lx-0x%016lx r%c%c%c%c%c %6s %d %d %d %d %d\n", 9754 range->sva, eva, 9755 (range->attrs & ATTR_S1_AP_RW_BIT) == ATTR_S1_AP_RW ? 'w' : '-', 9756 (range->attrs & ATTR_S1_PXN) != 0 ? '-' : 'x', 9757 (range->attrs & ATTR_S1_UXN) != 0 ? '-' : 'X', 9758 (range->attrs & ATTR_S1_AP(ATTR_S1_AP_USER)) != 0 ? 'u' : 's', 9759 (range->attrs & ATTR_S1_GP) != 0 ? 'g' : '-', 9760 mode, range->l1blocks, range->l2contig, range->l2blocks, 9761 range->l3contig, range->l3pages); 9762 9763 /* Reset to sentinel value. */ 9764 range->sva = 0xfffffffffffffffful; 9765 } 9766 9767 /* 9768 * Determine whether the attributes specified by a page table entry match those 9769 * being tracked by the current range. 
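 * Runs are extended only on an exact match of every tracked attribute
 * bit; any difference starts a new range.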
9770 */ 9771 static bool 9772 sysctl_kmaps_match(struct pmap_kernel_map_range *range, pt_entry_t attrs) 9773 { 9774 9775 return (range->attrs == attrs); 9776 } 9777 9778 static void 9779 sysctl_kmaps_reinit(struct pmap_kernel_map_range *range, vm_offset_t va, 9780 pt_entry_t attrs) 9781 { 9782 9783 memset(range, 0, sizeof(*range)); 9784 range->sva = va; 9785 range->attrs = attrs; 9786 } 9787 9788 /* Get the block/page attributes that correspond to the table attributes */ 9789 static pt_entry_t 9790 sysctl_kmaps_table_attrs(pd_entry_t table) 9791 { 9792 pt_entry_t attrs; 9793 9794 attrs = 0; 9795 if ((table & TATTR_UXN_TABLE) != 0) 9796 attrs |= ATTR_S1_UXN; 9797 if ((table & TATTR_PXN_TABLE) != 0) 9798 attrs |= ATTR_S1_PXN; 9799 if ((table & TATTR_AP_TABLE_RO) != 0) 9800 attrs |= ATTR_S1_AP(ATTR_S1_AP_RO); 9801 9802 return (attrs); 9803 } 9804 9805 /* Read the block/page attributes we care about */ 9806 static pt_entry_t 9807 sysctl_kmaps_block_attrs(pt_entry_t block) 9808 { 9809 return (block & (ATTR_S1_AP_MASK | ATTR_S1_XN | ATTR_S1_IDX_MASK | 9810 ATTR_S1_GP)); 9811 } 9812 9813 /* 9814 * Given a leaf PTE, derive the mapping's attributes. If they do not match 9815 * those of the current run, dump the address range and its attributes, and 9816 * begin a new run. 9817 */ 9818 static void 9819 sysctl_kmaps_check(struct sbuf *sb, struct pmap_kernel_map_range *range, 9820 vm_offset_t va, pd_entry_t l0e, pd_entry_t l1e, pd_entry_t l2e, 9821 pt_entry_t l3e) 9822 { 9823 pt_entry_t attrs; 9824 9825 attrs = sysctl_kmaps_table_attrs(l0e); 9826 9827 if ((l1e & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_BLOCK) { 9828 attrs |= sysctl_kmaps_block_attrs(l1e); 9829 goto done; 9830 } 9831 attrs |= sysctl_kmaps_table_attrs(l1e); 9832 9833 if ((l2e & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_BLOCK) { 9834 attrs |= sysctl_kmaps_block_attrs(l2e); 9835 goto done; 9836 } 9837 attrs |= sysctl_kmaps_table_attrs(l2e); 9838 attrs |= sysctl_kmaps_block_attrs(l3e); 9839 9840 done: 9841 if (range->sva > va || !sysctl_kmaps_match(range, attrs)) { 9842 sysctl_kmaps_dump(sb, range, va); 9843 sysctl_kmaps_reinit(range, va, attrs); 9844 } 9845 } 9846 9847 static int 9848 sysctl_kmaps(SYSCTL_HANDLER_ARGS) 9849 { 9850 struct pmap_kernel_map_range range; 9851 struct sbuf sbuf, *sb; 9852 pd_entry_t l0e, *l1, l1e, *l2, l2e; 9853 pt_entry_t *l3, l3e; 9854 vm_offset_t sva; 9855 vm_paddr_t pa; 9856 int error, i, j, k, l; 9857 9858 error = sysctl_wire_old_buffer(req, 0); 9859 if (error != 0) 9860 return (error); 9861 sb = &sbuf; 9862 sbuf_new_for_sysctl(sb, NULL, PAGE_SIZE, req); 9863 9864 /* Sentinel value. */ 9865 range.sva = 0xfffffffffffffffful; 9866 9867 /* 9868 * Iterate over the kernel page tables without holding the kernel pmap 9869 * lock. Kernel page table pages are never freed, so at worst we will 9870 * observe inconsistencies in the output. 
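 * The intermediate table pages reached through the DMAP therefore remain
 * safe to dereference even if an entry is modified concurrently; only
 * the reported attributes and counts may be stale.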
9871 */ 9872 for (sva = 0xffff000000000000ul, i = pmap_l0_index(sva); i < Ln_ENTRIES; 9873 i++) { 9874 if (i == pmap_l0_index(DMAP_MIN_ADDRESS)) 9875 sbuf_printf(sb, "\nDirect map:\n"); 9876 else if (i == pmap_l0_index(VM_MIN_KERNEL_ADDRESS)) 9877 sbuf_printf(sb, "\nKernel map:\n"); 9878 #ifdef KASAN 9879 else if (i == pmap_l0_index(KASAN_MIN_ADDRESS)) 9880 sbuf_printf(sb, "\nKASAN shadow map:\n"); 9881 #endif 9882 #ifdef KMSAN 9883 else if (i == pmap_l0_index(KMSAN_SHAD_MIN_ADDRESS)) 9884 sbuf_printf(sb, "\nKMSAN shadow map:\n"); 9885 else if (i == pmap_l0_index(KMSAN_ORIG_MIN_ADDRESS)) 9886 sbuf_printf(sb, "\nKMSAN origin map:\n"); 9887 #endif 9888 9889 l0e = kernel_pmap->pm_l0[i]; 9890 if ((l0e & ATTR_DESCR_VALID) == 0) { 9891 sysctl_kmaps_dump(sb, &range, sva); 9892 sva += L0_SIZE; 9893 continue; 9894 } 9895 pa = PTE_TO_PHYS(l0e); 9896 l1 = (pd_entry_t *)PHYS_TO_DMAP(pa); 9897 9898 for (j = pmap_l1_index(sva); j < Ln_ENTRIES; j++) { 9899 l1e = l1[j]; 9900 if ((l1e & ATTR_DESCR_VALID) == 0) { 9901 sysctl_kmaps_dump(sb, &range, sva); 9902 sva += L1_SIZE; 9903 continue; 9904 } 9905 if ((l1e & ATTR_DESCR_MASK) == L1_BLOCK) { 9906 PMAP_ASSERT_L1_BLOCKS_SUPPORTED; 9907 sysctl_kmaps_check(sb, &range, sva, l0e, l1e, 9908 0, 0); 9909 range.l1blocks++; 9910 sva += L1_SIZE; 9911 continue; 9912 } 9913 pa = PTE_TO_PHYS(l1e); 9914 l2 = (pd_entry_t *)PHYS_TO_DMAP(pa); 9915 9916 for (k = pmap_l2_index(sva); k < Ln_ENTRIES; k++) { 9917 l2e = l2[k]; 9918 if ((l2e & ATTR_DESCR_VALID) == 0) { 9919 sysctl_kmaps_dump(sb, &range, sva); 9920 sva += L2_SIZE; 9921 continue; 9922 } 9923 if ((l2e & ATTR_DESCR_MASK) == L2_BLOCK) { 9924 sysctl_kmaps_check(sb, &range, sva, 9925 l0e, l1e, l2e, 0); 9926 if ((l2e & ATTR_CONTIGUOUS) != 0) 9927 range.l2contig += 9928 k % L2C_ENTRIES == 0 ? 9929 1 : 0; 9930 else 9931 range.l2blocks++; 9932 sva += L2_SIZE; 9933 continue; 9934 } 9935 pa = PTE_TO_PHYS(l2e); 9936 l3 = (pt_entry_t *)PHYS_TO_DMAP(pa); 9937 9938 for (l = pmap_l3_index(sva); l < Ln_ENTRIES; 9939 l++, sva += L3_SIZE) { 9940 l3e = l3[l]; 9941 if ((l3e & ATTR_DESCR_VALID) == 0) { 9942 sysctl_kmaps_dump(sb, &range, 9943 sva); 9944 continue; 9945 } 9946 sysctl_kmaps_check(sb, &range, sva, 9947 l0e, l1e, l2e, l3e); 9948 if ((l3e & ATTR_CONTIGUOUS) != 0) 9949 range.l3contig += 9950 l % L3C_ENTRIES == 0 ? 9951 1 : 0; 9952 else 9953 range.l3pages++; 9954 } 9955 } 9956 } 9957 } 9958 9959 error = sbuf_finish(sb); 9960 sbuf_delete(sb); 9961 return (error); 9962 } 9963 SYSCTL_OID(_vm_pmap, OID_AUTO, kernel_maps, 9964 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE | CTLFLAG_SKIP, 9965 NULL, 0, sysctl_kmaps, "A", 9966 "Dump kernel address layout"); 9967
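
/*
 * The report generated above is exported as the string sysctl
 * "vm.pmap.kernel_maps" (marked CTLFLAG_SKIP, so it is hidden from
 * "sysctl -a" but can be read by name).  Each line emitted by
 * sysctl_kmaps_dump() describes one run of identical attributes:
 *
 *   <start>-<end> r[w|-][x|-][X|-][u|s][g|-] <memattr>
 *       <L1 blocks> <L2 contig runs> <L2 blocks> <L3 contig runs> <L3 pages>
 *
 * where 'w' is writable, 'x'/'X' mean PXN/UXN is clear (kernel/user
 * executable), 'u' marks a user-accessible mapping, and 'g' a BTI-guarded
 * one.
 */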