1 /*- 2 * SPDX-License-Identifier: BSD-4-Clause 3 * 4 * Copyright (c) 1991 Regents of the University of California. 5 * All rights reserved. 6 * Copyright (c) 1994 John S. Dyson 7 * All rights reserved. 8 * Copyright (c) 1994 David Greenman 9 * All rights reserved. 10 * Copyright (c) 2003 Peter Wemm 11 * All rights reserved. 12 * Copyright (c) 2005-2010 Alan L. Cox <alc@cs.rice.edu> 13 * All rights reserved. 14 * Copyright (c) 2014 Andrew Turner 15 * All rights reserved. 16 * Copyright (c) 2014 The FreeBSD Foundation 17 * All rights reserved. 18 * Copyright (c) 2015-2018 Ruslan Bukin <br@bsdpad.com> 19 * All rights reserved. 20 * 21 * This code is derived from software contributed to Berkeley by 22 * the Systems Programming Group of the University of Utah Computer 23 * Science Department and William Jolitz of UUNET Technologies Inc. 24 * 25 * Portions of this software were developed by Andrew Turner under 26 * sponsorship from The FreeBSD Foundation. 27 * 28 * Portions of this software were developed by SRI International and the 29 * University of Cambridge Computer Laboratory under DARPA/AFRL contract 30 * FA8750-10-C-0237 ("CTSRD"), as part of the DARPA CRASH research programme. 31 * 32 * Portions of this software were developed by the University of Cambridge 33 * Computer Laboratory as part of the CTSRD Project, with support from the 34 * UK Higher Education Innovation Fund (HEIF). 35 * 36 * Redistribution and use in source and binary forms, with or without 37 * modification, are permitted provided that the following conditions 38 * are met: 39 * 1. Redistributions of source code must retain the above copyright 40 * notice, this list of conditions and the following disclaimer. 41 * 2. Redistributions in binary form must reproduce the above copyright 42 * notice, this list of conditions and the following disclaimer in the 43 * documentation and/or other materials provided with the distribution. 44 * 3. All advertising materials mentioning features or use of this software 45 * must display the following acknowledgement: 46 * This product includes software developed by the University of 47 * California, Berkeley and its contributors. 48 * 4. Neither the name of the University nor the names of its contributors 49 * may be used to endorse or promote products derived from this software 50 * without specific prior written permission. 51 * 52 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 53 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 54 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 55 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 56 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 57 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 58 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 59 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 60 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 61 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 62 * SUCH DAMAGE. 63 */ 64 /*- 65 * Copyright (c) 2003 Networks Associates Technology, Inc. 66 * All rights reserved. 67 * 68 * This software was developed for the FreeBSD Project by Jake Burkholder, 69 * Safeport Network Services, and Network Associates Laboratories, the 70 * Security Research Division of Network Associates, Inc. 
 * under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA
 * CHATS research program.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * Manages physical address maps.
 *
 * Since the information managed by this module is
 * also stored by the logical address mapping module,
 * this module may throw away valid virtual-to-physical
 * mappings at almost any time.  However, invalidations
 * of virtual-to-physical mappings must be done as
 * requested.
 *
 * In order to cope with hardware architectures which
 * make virtual-to-physical map invalidates expensive,
 * this module may delay invalidate or reduced protection
 * operations until such time as they are actually
 * necessary.  This module is given full information as
 * to which processors are currently using which maps,
 * and to when physical maps must be made correct.
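 *
 * (On RISC-V an invalidation is ultimately a local sfence.vma, plus an SBI
 * remote fence for the other harts in the SMP case; see the
 * pmap_invalidate_*() helpers below.)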
113 */ 114 115 #include "opt_pmap.h" 116 117 #include <sys/param.h> 118 #include <sys/systm.h> 119 #include <sys/bitstring.h> 120 #include <sys/bus.h> 121 #include <sys/cpuset.h> 122 #include <sys/kernel.h> 123 #include <sys/ktr.h> 124 #include <sys/lock.h> 125 #include <sys/malloc.h> 126 #include <sys/mman.h> 127 #include <sys/msgbuf.h> 128 #include <sys/mutex.h> 129 #include <sys/physmem.h> 130 #include <sys/proc.h> 131 #include <sys/rwlock.h> 132 #include <sys/sbuf.h> 133 #include <sys/sx.h> 134 #include <sys/vmem.h> 135 #include <sys/vmmeter.h> 136 #include <sys/sched.h> 137 #include <sys/sysctl.h> 138 #include <sys/smp.h> 139 140 #include <vm/vm.h> 141 #include <vm/vm_param.h> 142 #include <vm/vm_kern.h> 143 #include <vm/vm_page.h> 144 #include <vm/vm_map.h> 145 #include <vm/vm_object.h> 146 #include <vm/vm_extern.h> 147 #include <vm/vm_pageout.h> 148 #include <vm/vm_pager.h> 149 #include <vm/vm_phys.h> 150 #include <vm/vm_radix.h> 151 #include <vm/vm_reserv.h> 152 #include <vm/vm_dumpset.h> 153 #include <vm/uma.h> 154 155 #include <machine/machdep.h> 156 #include <machine/md_var.h> 157 #include <machine/pcb.h> 158 #include <machine/sbi.h> 159 #include <machine/thead.h> 160 161 /* 162 * Boundary values for the page table page index space: 163 * 164 * L3 pages: [0, NUL2E) 165 * L2 pages: [NUL2E, NUL2E + NUL1E) 166 * L1 pages: [NUL2E + NUL1E, NUL2E + NUL1E + NUL0E) 167 * 168 * Note that these ranges are used in both SV39 and SV48 mode. In SV39 mode the 169 * ranges are not fully populated since there are at most Ln_ENTRIES^2 L3 pages 170 * in a set of page tables. 171 */ 172 #define NUL0E Ln_ENTRIES 173 #define NUL1E (Ln_ENTRIES * NUL0E) 174 #define NUL2E (Ln_ENTRIES * NUL1E) 175 176 #ifdef PV_STATS 177 #define PV_STAT(x) do { x ; } while (0) 178 #define __pv_stat_used 179 #else 180 #define PV_STAT(x) do { } while (0) 181 #define __pv_stat_used __unused 182 #endif 183 184 #define pmap_l1_pindex(v) (NUL2E + ((v) >> L1_SHIFT)) 185 #define pmap_l2_pindex(v) ((v) >> L2_SHIFT) 186 #define pa_to_pvh(pa) (&pv_table[pa_index(pa)]) 187 188 #define NPV_LIST_LOCKS MAXCPU 189 190 #define PHYS_TO_PV_LIST_LOCK(pa) \ 191 (&pv_list_locks[pmap_l2_pindex(pa) % NPV_LIST_LOCKS]) 192 193 #define CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa) do { \ 194 struct rwlock **_lockp = (lockp); \ 195 struct rwlock *_new_lock; \ 196 \ 197 _new_lock = PHYS_TO_PV_LIST_LOCK(pa); \ 198 if (_new_lock != *_lockp) { \ 199 if (*_lockp != NULL) \ 200 rw_wunlock(*_lockp); \ 201 *_lockp = _new_lock; \ 202 rw_wlock(*_lockp); \ 203 } \ 204 } while (0) 205 206 #define CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m) \ 207 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, VM_PAGE_TO_PHYS(m)) 208 209 #define RELEASE_PV_LIST_LOCK(lockp) do { \ 210 struct rwlock **_lockp = (lockp); \ 211 \ 212 if (*_lockp != NULL) { \ 213 rw_wunlock(*_lockp); \ 214 *_lockp = NULL; \ 215 } \ 216 } while (0) 217 218 #define VM_PAGE_TO_PV_LIST_LOCK(m) \ 219 PHYS_TO_PV_LIST_LOCK(VM_PAGE_TO_PHYS(m)) 220 221 static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 222 "VM/pmap parameters"); 223 224 /* The list of all the user pmaps */ 225 LIST_HEAD(pmaplist, pmap); 226 static struct pmaplist allpmaps = LIST_HEAD_INITIALIZER(); 227 228 enum pmap_mode __read_frequently pmap_mode = PMAP_MODE_SV39; 229 SYSCTL_INT(_vm_pmap, OID_AUTO, mode, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, 230 &pmap_mode, 0, 231 "translation mode, 0 = SV39, 1 = SV48"); 232 233 struct pmap kernel_pmap_store; 234 235 vm_offset_t virtual_avail; /* VA of first avail page (after kernel bss) */ 236 vm_offset_t 
virtual_end;	/* VA of last avail page (end of kernel AS) */
vm_offset_t kernel_vm_end = 0;

vm_paddr_t dmap_phys_base;	/* The start of the dmap region */
vm_paddr_t dmap_phys_max;	/* The limit of the dmap region */
vm_offset_t dmap_max_addr;	/* The virtual address limit of the dmap */

/* This code assumes all L1 DMAP entries will be used */
CTASSERT((DMAP_MIN_ADDRESS & ~L1_OFFSET) == DMAP_MIN_ADDRESS);
CTASSERT((DMAP_MAX_ADDRESS & ~L1_OFFSET) == DMAP_MAX_ADDRESS);

/*
 * This code assumes that the early DEVMAP is L2_SIZE aligned.
 */
CTASSERT((PMAP_MAPDEV_EARLY_SIZE & L2_OFFSET) == 0);

static struct rwlock_padalign pvh_global_lock;
static struct mtx_padalign allpmaps_lock;

static int __read_frequently superpages_enabled = 1;
SYSCTL_INT(_vm_pmap, OID_AUTO, superpages_enabled,
    CTLFLAG_RDTUN, &superpages_enabled, 0,
    "Enable support for transparent superpages");

static SYSCTL_NODE(_vm_pmap, OID_AUTO, l2, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
    "2MB page mapping counters");

static u_long pmap_l2_demotions;
SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, demotions, CTLFLAG_RD,
    &pmap_l2_demotions, 0,
    "2MB page demotions");

static u_long pmap_l2_mappings;
SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, mappings, CTLFLAG_RD,
    &pmap_l2_mappings, 0,
    "2MB page mappings");

static u_long pmap_l2_p_failures;
SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, p_failures, CTLFLAG_RD,
    &pmap_l2_p_failures, 0,
    "2MB page promotion failures");

static u_long pmap_l2_promotions;
SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, promotions, CTLFLAG_RD,
    &pmap_l2_promotions, 0,
    "2MB page promotions");

static SYSCTL_NODE(_vm_pmap, OID_AUTO, l1, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
    "L1 (1GB) page mapping counters");

static COUNTER_U64_DEFINE_EARLY(pmap_l1_demotions);
SYSCTL_COUNTER_U64(_vm_pmap_l1, OID_AUTO, demotions, CTLFLAG_RD,
    &pmap_l1_demotions, "L1 (1GB) page demotions");

/*
 * Data for the pv entry allocation mechanism
 */
static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks);
static struct mtx pv_chunks_mutex;
static struct rwlock pv_list_locks[NPV_LIST_LOCKS];
static struct md_page *pv_table;
static struct md_page pv_dummy;

extern cpuset_t all_harts;

/*
 * Internal flags for pmap_enter()'s helper functions.
 */
#define	PMAP_ENTER_NORECLAIM	0x1000000	/* Don't reclaim PV entries. */
#define	PMAP_ENTER_NOREPLACE	0x2000000	/* Don't replace mappings. */
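
/*
 * Illustrative sketch (assumed caller, not shown in this section): a
 * superpage-creating path would combine these with the machine-independent
 * PMAP_ENTER_* flags when invoking pmap_enter_l2(), e.g.
 *
 *	(void)pmap_enter_l2(pmap, va, new_l2, PMAP_ENTER_NOSLEEP |
 *	    PMAP_ENTER_NOREPLACE | PMAP_ENTER_NORECLAIM, m, &lock);
 */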

static void	free_pv_chunk(struct pv_chunk *pc);
static void	free_pv_entry(pmap_t pmap, pv_entry_t pv);
static pv_entry_t get_pv_entry(pmap_t pmap, struct rwlock **lockp);
static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp);
static void	pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va);
static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap,
		    vm_offset_t va);
static bool	pmap_demote_l1(pmap_t pmap, pd_entry_t *l1, vm_offset_t va);
static bool	pmap_demote_l2(pmap_t pmap, pd_entry_t *l2, vm_offset_t va);
static bool	pmap_demote_l2_locked(pmap_t pmap, pd_entry_t *l2,
		    vm_offset_t va, struct rwlock **lockp);
static int	pmap_enter_l2(pmap_t pmap, vm_offset_t va, pd_entry_t new_l2,
		    u_int flags, vm_page_t m, struct rwlock **lockp);
static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va,
		    vm_page_t m, vm_prot_t prot, vm_page_t mpte,
		    struct rwlock **lockp);
static int	pmap_remove_l3(pmap_t pmap, pt_entry_t *l3, vm_offset_t sva,
		    pd_entry_t ptepde, struct spglist *free,
		    struct rwlock **lockp);
static bool	pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va,
		    vm_page_t m, struct rwlock **lockp);

static vm_page_t _pmap_alloc_l3(pmap_t pmap, vm_pindex_t ptepindex,
		    struct rwlock **lockp);

static void	_pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m,
		    struct spglist *free);
static int	pmap_unuse_pt(pmap_t, vm_offset_t, pd_entry_t, struct spglist *);

static int	pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode);

static uint64_t pmap_satp_mode(void);

#define	pmap_clear(pte)			pmap_store(pte, 0)
#define	pmap_clear_bits(pte, bits)	atomic_clear_64(pte, bits)
#define	pmap_load_store(pte, entry)	atomic_swap_64(pte, entry)
#define	pmap_load_clear(pte)		pmap_load_store(pte, 0)
#define	pmap_load(pte)			atomic_load_64(pte)
#define	pmap_store(pte, entry)		atomic_store_64(pte, entry)
#define	pmap_store_bits(pte, bits)	atomic_set_64(pte, bits)

/********************/
/* Inline functions */
/********************/

static __inline void
pagecopy(void *s, void *d)
{

	memcpy(d, s, PAGE_SIZE);
}

static __inline void
pagezero(void *p)
{

	bzero(p, PAGE_SIZE);
}

#define	pmap_l0_index(va)	(((va) >> L0_SHIFT) & Ln_ADDR_MASK)
#define	pmap_l1_index(va)	(((va) >> L1_SHIFT) & Ln_ADDR_MASK)
#define	pmap_l2_index(va)	(((va) >> L2_SHIFT) & Ln_ADDR_MASK)
#define	pmap_l3_index(va)	(((va) >> L3_SHIFT) & Ln_ADDR_MASK)

#define	PTE_TO_PHYS(pte) \
    ((((pte) & ~PTE_HI_MASK) >> PTE_PPN0_S) * PAGE_SIZE)
#define	L2PTE_TO_PHYS(l2) \
    ((((l2) & ~PTE_HI_MASK) >> PTE_PPN1_S) << L2_SHIFT)
#define	L1PTE_TO_PHYS(l1) \
    ((((l1) & ~PTE_HI_MASK) >> PTE_PPN2_S) << L1_SHIFT)
#define	PTE_TO_VM_PAGE(pte)	PHYS_TO_VM_PAGE(PTE_TO_PHYS(pte))

/*
 * Construct a page table entry of the specified level pointing to physical
 * address pa, with PTE bits 'bits'.
 *
 * A leaf PTE of any level must point to an address matching its alignment,
 * e.g. L2 pages must be 2MB aligned in memory.
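 *
 * For example (illustrative), a 2MB kernel mapping of a suitably aligned
 * physical address pa could be constructed and installed with:
 *
 *	pmap_store(&l2[pmap_l2_index(va)], L2_PTE(pa, PTE_KERN));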
383 */ 384 #define L1_PTE(pa, bits) ((((pa) >> L1_SHIFT) << PTE_PPN2_S) | (bits)) 385 #define L2_PTE(pa, bits) ((((pa) >> L2_SHIFT) << PTE_PPN1_S) | (bits)) 386 #define L3_PTE(pa, bits) ((((pa) >> L3_SHIFT) << PTE_PPN0_S) | (bits)) 387 388 /* 389 * Construct a page directory entry (PDE), pointing to next level entry at pa, 390 * with PTE bits 'bits'. 391 * 392 * Unlike PTEs, page directory entries can point to any 4K-aligned physical 393 * address. 394 */ 395 #define L0_PDE(pa, bits) L3_PTE(pa, bits) 396 #define L1_PDE(pa, bits) L3_PTE(pa, bits) 397 #define L2_PDE(pa, bits) L3_PTE(pa, bits) 398 399 static __inline pd_entry_t * 400 pmap_l0(pmap_t pmap, vm_offset_t va) 401 { 402 KASSERT(pmap_mode != PMAP_MODE_SV39, ("%s: in SV39 mode", __func__)); 403 KASSERT(VIRT_IS_VALID(va), 404 ("%s: malformed virtual address %#lx", __func__, va)); 405 return (&pmap->pm_top[pmap_l0_index(va)]); 406 } 407 408 static __inline pd_entry_t * 409 pmap_l0_to_l1(pd_entry_t *l0, vm_offset_t va) 410 { 411 vm_paddr_t phys; 412 pd_entry_t *l1; 413 414 KASSERT(pmap_mode != PMAP_MODE_SV39, ("%s: in SV39 mode", __func__)); 415 phys = PTE_TO_PHYS(pmap_load(l0)); 416 l1 = (pd_entry_t *)PHYS_TO_DMAP(phys); 417 418 return (&l1[pmap_l1_index(va)]); 419 } 420 421 static __inline pd_entry_t * 422 pmap_l1(pmap_t pmap, vm_offset_t va) 423 { 424 pd_entry_t *l0; 425 426 KASSERT(VIRT_IS_VALID(va), 427 ("%s: malformed virtual address %#lx", __func__, va)); 428 if (pmap_mode == PMAP_MODE_SV39) { 429 return (&pmap->pm_top[pmap_l1_index(va)]); 430 } else { 431 l0 = pmap_l0(pmap, va); 432 if ((pmap_load(l0) & PTE_V) == 0) 433 return (NULL); 434 if ((pmap_load(l0) & PTE_RX) != 0) 435 return (NULL); 436 return (pmap_l0_to_l1(l0, va)); 437 } 438 } 439 440 static __inline pd_entry_t * 441 pmap_l1_to_l2(pd_entry_t *l1, vm_offset_t va) 442 { 443 vm_paddr_t phys; 444 pd_entry_t *l2; 445 446 phys = PTE_TO_PHYS(pmap_load(l1)); 447 l2 = (pd_entry_t *)PHYS_TO_DMAP(phys); 448 449 return (&l2[pmap_l2_index(va)]); 450 } 451 452 static __inline pd_entry_t * 453 pmap_l2(pmap_t pmap, vm_offset_t va) 454 { 455 pd_entry_t *l1; 456 457 l1 = pmap_l1(pmap, va); 458 if (l1 == NULL) 459 return (NULL); 460 if ((pmap_load(l1) & PTE_V) == 0) 461 return (NULL); 462 if ((pmap_load(l1) & PTE_RX) != 0) 463 return (NULL); 464 465 return (pmap_l1_to_l2(l1, va)); 466 } 467 468 static __inline pt_entry_t * 469 pmap_l2_to_l3(pd_entry_t *l2, vm_offset_t va) 470 { 471 vm_paddr_t phys; 472 pt_entry_t *l3; 473 474 phys = PTE_TO_PHYS(pmap_load(l2)); 475 l3 = (pd_entry_t *)PHYS_TO_DMAP(phys); 476 477 return (&l3[pmap_l3_index(va)]); 478 } 479 480 static __inline pt_entry_t * 481 pmap_l3(pmap_t pmap, vm_offset_t va) 482 { 483 pd_entry_t *l2; 484 485 l2 = pmap_l2(pmap, va); 486 if (l2 == NULL) 487 return (NULL); 488 if ((pmap_load(l2) & PTE_V) == 0) 489 return (NULL); 490 if ((pmap_load(l2) & PTE_RX) != 0) 491 return (NULL); 492 493 return (pmap_l2_to_l3(l2, va)); 494 } 495 496 static __inline void 497 pmap_resident_count_inc(pmap_t pmap, int count) 498 { 499 500 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 501 pmap->pm_stats.resident_count += count; 502 } 503 504 static __inline void 505 pmap_resident_count_dec(pmap_t pmap, int count) 506 { 507 508 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 509 KASSERT(pmap->pm_stats.resident_count >= count, 510 ("pmap %p resident count underflow %ld %d", pmap, 511 pmap->pm_stats.resident_count, count)); 512 pmap->pm_stats.resident_count -= count; 513 } 514 515 static void 516 pmap_distribute_l1(struct pmap *pmap, vm_pindex_t l1index, 517 pt_entry_t entry) 518 { 
519 struct pmap *user_pmap; 520 pd_entry_t *l1; 521 522 /* 523 * Distribute new kernel L1 entry to all the user pmaps. This is only 524 * necessary with three-level paging configured: with four-level paging 525 * the kernel's half of the top-level page table page is static and can 526 * simply be copied at pmap initialization time. 527 */ 528 if (pmap != kernel_pmap || pmap_mode != PMAP_MODE_SV39) 529 return; 530 531 mtx_lock(&allpmaps_lock); 532 LIST_FOREACH(user_pmap, &allpmaps, pm_list) { 533 l1 = &user_pmap->pm_top[l1index]; 534 pmap_store(l1, entry); 535 } 536 mtx_unlock(&allpmaps_lock); 537 } 538 539 /* 540 * Holds the PTE mode bits (defined in pte.h) for defining e.g. cacheability. 541 * 542 * The indices correspond to the VM_MEMATTR_* defines in riscv/include/vm.h. 543 * 544 * The array will be empty if no mode bits are supported by the CPU, e.g. when 545 * lacking the Svpbmt extension. 546 */ 547 static __read_frequently pt_entry_t memattr_bits[VM_MEMATTR_TOTAL]; 548 static __read_frequently pt_entry_t memattr_mask; 549 550 static __inline pt_entry_t 551 pmap_memattr_bits(vm_memattr_t mode) 552 { 553 KASSERT(pmap_is_valid_memattr(kernel_pmap, mode), 554 ("invalid memory mode %u\n", mode)); 555 return (memattr_bits[(int)mode]); 556 } 557 558 /* 559 * This should only be used during pmap bootstrap e.g. by 560 * pmap_create_pagetables(). 561 */ 562 static pt_entry_t * 563 pmap_early_alloc_tables(vm_paddr_t *freemempos, int npages) 564 { 565 pt_entry_t *pt; 566 567 pt = (pt_entry_t *)*freemempos; 568 *freemempos += npages * PAGE_SIZE; 569 bzero(pt, npages * PAGE_SIZE); 570 571 return (pt); 572 } 573 574 /* 575 * Construct the direct map -- a linear mapping of physical memory into 576 * the kernel address space. 577 * 578 * We walk the list of physical memory segments (of arbitrary size and 579 * address) mapping each appropriately using L2 and L1 superpages. 580 * Consequently, the DMAP address space will have unmapped regions 581 * corresponding to any holes between physical memory segments. 582 * 583 * The lowest usable physical address will always be mapped to 584 * DMAP_MIN_ADDRESS. 585 */ 586 static vm_paddr_t 587 pmap_bootstrap_dmap(pd_entry_t *l1, vm_paddr_t freemempos) 588 { 589 vm_paddr_t physmap[PHYS_AVAIL_ENTRIES]; 590 vm_offset_t va; 591 vm_paddr_t min_pa, max_pa, pa, endpa; 592 pd_entry_t *l2; 593 pt_entry_t memattr; 594 u_int l1slot, l2slot; 595 int physmap_idx; 596 597 physmap_idx = physmem_avail(physmap, nitems(physmap)); 598 min_pa = physmap[0]; 599 max_pa = physmap[physmap_idx - 1]; 600 601 printf("physmap_idx %u\n", physmap_idx); 602 printf("min_pa %lx\n", min_pa); 603 printf("max_pa %lx\n", max_pa); 604 605 /* Set the limits of the DMAP region. */ 606 dmap_phys_base = rounddown(min_pa, L1_SIZE); 607 dmap_phys_max = max_pa; 608 609 memattr = pmap_memattr_bits(VM_MEMATTR_DEFAULT); 610 611 /* Walk the physmap table. */ 612 l2 = NULL; 613 l1slot = Ln_ENTRIES; /* sentinel value */ 614 for (int idx = 0; idx < physmap_idx; idx += 2) { 615 pa = rounddown(physmap[idx], L2_SIZE); 616 endpa = physmap[idx + 1]; 617 618 /* Virtual address for this range. */ 619 va = PHYS_TO_DMAP(pa); 620 621 /* Any 1GB possible for this range? */ 622 if (roundup(pa, L1_SIZE) + L1_SIZE > endpa) 623 goto l2end; 624 625 /* Loop until the next 1GB boundary. */ 626 while ((pa & L1_OFFSET) != 0) { 627 if (l2 == NULL || pmap_l1_index(va) != l1slot) { 628 /* Need to alloc another page table. */ 629 l2 = pmap_early_alloc_tables(&freemempos, 1); 630 631 /* Link it. 
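				 * The new L2 table is installed in the L1
				 * table as a non-leaf PDE (PTE_V only, no
				 * R/W/X bits), so the hardware walker
				 * descends into it instead of treating the
				 * entry as a 1GB leaf.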
*/ 632 l1slot = pmap_l1_index(va); 633 pmap_store(&l1[l1slot], 634 L1_PDE((vm_paddr_t)l2, PTE_V)); 635 } 636 637 /* map l2 pages */ 638 l2slot = pmap_l2_index(va); 639 pmap_store(&l2[l2slot], L2_PTE(pa, PTE_KERN | memattr)); 640 641 pa += L2_SIZE; 642 va += L2_SIZE; 643 } 644 645 /* Map what we can with 1GB superpages. */ 646 while (pa + L1_SIZE - 1 < endpa) { 647 /* map l1 pages */ 648 l1slot = pmap_l1_index(va); 649 pmap_store(&l1[l1slot], L1_PTE(pa, PTE_KERN | memattr)); 650 651 pa += L1_SIZE; 652 va += L1_SIZE; 653 } 654 655 l2end: 656 while (pa < endpa) { 657 if (l2 == NULL || pmap_l1_index(va) != l1slot) { 658 /* Need to alloc another page table. */ 659 l2 = pmap_early_alloc_tables(&freemempos, 1); 660 661 /* Link it. */ 662 l1slot = pmap_l1_index(va); 663 pmap_store(&l1[l1slot], 664 L1_PDE((vm_paddr_t)l2, PTE_V)); 665 } 666 667 /* map l2 pages */ 668 l2slot = pmap_l2_index(va); 669 pmap_store(&l2[l2slot], L2_PTE(pa, PTE_KERN | memattr)); 670 671 pa += L2_SIZE; 672 va += L2_SIZE; 673 } 674 } 675 676 /* And finally, the limit on DMAP VA. */ 677 dmap_max_addr = va; 678 679 return (freemempos); 680 } 681 682 /* 683 * Create a new set of pagetables to run the kernel with. 684 * 685 * An initial, temporary setup was created in locore.S, which serves well 686 * enough to get us this far. It mapped kernstart -> KERNBASE, using 2MB 687 * superpages, and created a 1GB identity map, which allows this function 688 * to dereference physical addresses. 689 * 690 * The memory backing these page tables is allocated in the space 691 * immediately following the kernel's preload area. Depending on the size 692 * of this area, some, all, or none of these pages can be implicitly 693 * mapped by the kernel's 2MB mappings. This memory will only ever be 694 * accessed through the direct map, however. 695 */ 696 static vm_paddr_t 697 pmap_create_pagetables(vm_paddr_t kernstart, vm_size_t kernlen, 698 vm_paddr_t *root_pt_phys) 699 { 700 pt_entry_t *l0, *l1, *kern_l2, *kern_l3, *devmap_l3; 701 pt_entry_t memattr; 702 pd_entry_t *devmap_l2; 703 vm_paddr_t kernend, freemempos, pa; 704 int nkernl2, nkernl3, ndevmapl3; 705 int i, slot; 706 int mode; 707 708 kernend = kernstart + kernlen; 709 710 /* Static allocations begin after the kernel staging area. */ 711 freemempos = roundup2(kernend, PAGE_SIZE); 712 713 /* Detect Sv48 mode. */ 714 mode = PMAP_MODE_SV39; 715 TUNABLE_INT_FETCH("vm.pmap.mode", &mode); 716 717 if (mode == PMAP_MODE_SV48 && (mmu_caps & MMU_SV48) != 0) { 718 /* 719 * Sv48 mode: allocate an L0 page table to be the root. The 720 * layout of KVA is otherwise identical to Sv39. 721 */ 722 l0 = pmap_early_alloc_tables(&freemempos, 1); 723 *root_pt_phys = (vm_paddr_t)l0; 724 pmap_mode = PMAP_MODE_SV48; 725 } else { 726 l0 = NULL; 727 } 728 729 /* 730 * Allocate an L1 page table. 731 */ 732 l1 = pmap_early_alloc_tables(&freemempos, 1); 733 if (pmap_mode == PMAP_MODE_SV39) 734 *root_pt_phys = (vm_paddr_t)l1; 735 736 /* 737 * Allocate a set of L2 page tables for KVA. Most likely, only 1 is 738 * needed. 739 */ 740 nkernl2 = howmany(howmany(kernlen, L2_SIZE), Ln_ENTRIES); 741 kern_l2 = pmap_early_alloc_tables(&freemempos, nkernl2); 742 743 /* 744 * Allocate an L2 page table for the static devmap, located at the end 745 * of KVA. We can expect that the devmap will always be less than 1GB 746 * in size. 747 */ 748 devmap_l2 = pmap_early_alloc_tables(&freemempos, 1); 749 750 /* Allocate L3 page tables for the devmap. 
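	 * One L3 page holds Ln_ENTRIES 4KB PTEs and so covers 2MB of VA;
	 * ndevmapl3 below is therefore just the size of the early devmap
	 * in 2MB units, rounded up.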
*/ 751 ndevmapl3 = howmany(howmany(PMAP_MAPDEV_EARLY_SIZE, L3_SIZE), 752 Ln_ENTRIES); 753 devmap_l3 = pmap_early_alloc_tables(&freemempos, ndevmapl3); 754 755 /* 756 * Allocate some L3 bootstrap pages, for early KVA allocations before 757 * vm_mem_init() has run. For example, the message buffer. 758 * 759 * A somewhat arbitrary choice of 32MB. This should be more than enough 760 * for any early allocations. There is no need to worry about waste, as 761 * whatever is not used will be consumed by later calls to 762 * pmap_growkernel(). 763 */ 764 nkernl3 = 16; 765 kern_l3 = pmap_early_alloc_tables(&freemempos, nkernl3); 766 767 /* Bootstrap the direct map. */ 768 freemempos = pmap_bootstrap_dmap(l1, freemempos); 769 770 /* Allocations are done. */ 771 if (freemempos < roundup2(kernend, L2_SIZE)) 772 freemempos = roundup2(kernend, L2_SIZE); 773 774 /* Memory attributes for standard/main memory. */ 775 memattr = pmap_memattr_bits(VM_MEMATTR_DEFAULT); 776 777 /* 778 * Map the kernel (and preloaded modules or data) using L2 superpages. 779 * 780 * kernstart is 2MB-aligned. This is enforced by loader(8) and required 781 * by locore assembly. 782 * 783 * TODO: eventually, this should be done with proper permissions for 784 * each segment, rather than mapping the entire kernel and preloaded 785 * modules RWX. 786 */ 787 slot = pmap_l2_index(KERNBASE); 788 for (pa = kernstart; pa < kernend; pa += L2_SIZE, slot++) { 789 pmap_store(&kern_l2[slot], 790 L2_PTE(pa, PTE_KERN | PTE_X | memattr)); 791 } 792 793 /* 794 * Connect the L3 bootstrap pages to the kernel L2 table. The L3 PTEs 795 * themselves are invalid. 796 */ 797 slot = pmap_l2_index(freemempos - kernstart + KERNBASE); 798 for (i = 0; i < nkernl3; i++, slot++) { 799 pa = (vm_paddr_t)kern_l3 + ptoa(i); 800 pmap_store(&kern_l2[slot], L2_PDE(pa, PTE_V)); 801 } 802 803 /* Connect the L2 tables to the L1 table. */ 804 slot = pmap_l1_index(KERNBASE); 805 for (i = 0; i < nkernl2; i++, slot++) { 806 pa = (vm_paddr_t)kern_l2 + ptoa(i); 807 pmap_store(&l1[slot], L1_PDE(pa, PTE_V)); 808 } 809 810 /* Connect the L1 table to L0, if in use. */ 811 if (pmap_mode == PMAP_MODE_SV48) { 812 slot = pmap_l0_index(KERNBASE); 813 pmap_store(&l0[slot], L0_PDE((vm_paddr_t)l1, PTE_V)); 814 } 815 816 /* 817 * Connect the devmap L3 pages to the L2 table. The devmap PTEs 818 * themselves are invalid. 819 */ 820 slot = pmap_l2_index(DEVMAP_MIN_VADDR); 821 for (i = 0; i < ndevmapl3; i++, slot++) { 822 pa = (vm_paddr_t)devmap_l3 + ptoa(i); 823 pmap_store(&devmap_l2[slot], L2_PDE(pa, PTE_V)); 824 } 825 826 /* Connect the devmap L2 pages to the L1 table. */ 827 slot = pmap_l1_index(DEVMAP_MIN_VADDR); 828 pa = (vm_paddr_t)devmap_l2; 829 pmap_store(&l1[slot], L1_PDE(pa, PTE_V)); 830 831 /* Return the next position of free memory */ 832 return (freemempos); 833 } 834 835 /* 836 * Bootstrap the system enough to run with virtual memory. 837 */ 838 void 839 pmap_bootstrap(vm_paddr_t kernstart, vm_size_t kernlen) 840 { 841 vm_paddr_t freemempos, pa; 842 vm_paddr_t root_pt_phys; 843 vm_offset_t freeva; 844 vm_offset_t dpcpu, msgbufpv; 845 pt_entry_t *pte; 846 int i; 847 848 printf("pmap_bootstrap %lx %lx\n", kernstart, kernlen); 849 850 PMAP_LOCK_INIT(kernel_pmap); 851 TAILQ_INIT(&kernel_pmap->pm_pvchunk); 852 vm_radix_init(&kernel_pmap->pm_root); 853 854 rw_init(&pvh_global_lock, "pmap pv global"); 855 856 /* 857 * Set the current CPU as active in the kernel pmap. Secondary cores 858 * will add themselves later in init_secondary(). 
The SBI firmware 859 * may rely on this mask being precise, so CPU_FILL() is not used. 860 */ 861 CPU_SET(PCPU_GET(hart), &kernel_pmap->pm_active); 862 863 /* 864 * Set up the memory attribute bits. 865 */ 866 if (has_svpbmt) { 867 memattr_bits[VM_MEMATTR_PMA] = PTE_MA_NONE; 868 memattr_bits[VM_MEMATTR_UNCACHEABLE] = PTE_MA_NC; 869 memattr_bits[VM_MEMATTR_DEVICE] = PTE_MA_IO; 870 memattr_mask = PTE_MA_MASK; 871 } else if (has_errata_thead_pbmt) { 872 memattr_bits[VM_MEMATTR_PMA] = PTE_THEAD_MA_NONE; 873 memattr_bits[VM_MEMATTR_UNCACHEABLE] = PTE_THEAD_MA_NC; 874 memattr_bits[VM_MEMATTR_DEVICE] = PTE_THEAD_MA_IO; 875 memattr_mask = PTE_THEAD_MA_MASK; 876 } 877 878 /* Create a new set of pagetables to run the kernel in. */ 879 freemempos = pmap_create_pagetables(kernstart, kernlen, &root_pt_phys); 880 881 /* Switch to the newly created page tables. */ 882 kernel_pmap->pm_stage = PM_STAGE1; 883 kernel_pmap->pm_top = (pd_entry_t *)PHYS_TO_DMAP(root_pt_phys); 884 kernel_pmap->pm_satp = atop(root_pt_phys) | pmap_satp_mode(); 885 csr_write(satp, kernel_pmap->pm_satp); 886 sfence_vma(); 887 888 /* 889 * Now, we need to make a few more static reservations from KVA. 890 * 891 * Set freeva to freemempos virtual address, and be sure to advance 892 * them together. 893 */ 894 freeva = freemempos - kernstart + KERNBASE; 895 #define reserve_space(var, pa, size) \ 896 do { \ 897 var = freeva; \ 898 pa = freemempos; \ 899 freeva += size; \ 900 freemempos += size; \ 901 } while (0) 902 903 /* Allocate the dynamic per-cpu area. */ 904 reserve_space(dpcpu, pa, DPCPU_SIZE); 905 906 /* Map it. */ 907 pte = pmap_l3(kernel_pmap, dpcpu); 908 KASSERT(pte != NULL, ("Bootstrap pages missing")); 909 for (i = 0; i < howmany(DPCPU_SIZE, PAGE_SIZE); i++) 910 pmap_store(&pte[i], L3_PTE(pa + ptoa(i), PTE_KERN | 911 pmap_memattr_bits(VM_MEMATTR_DEFAULT))); 912 913 /* Now, it can be initialized. */ 914 dpcpu_init((void *)dpcpu, 0); 915 916 /* Allocate memory for the msgbuf, e.g. for /sbin/dmesg */ 917 reserve_space(msgbufpv, pa, round_page(msgbufsize)); 918 msgbufp = (void *)msgbufpv; 919 920 /* Map it. */ 921 pte = pmap_l3(kernel_pmap, msgbufpv); 922 KASSERT(pte != NULL, ("Bootstrap pages missing")); 923 for (i = 0; i < howmany(msgbufsize, PAGE_SIZE); i++) 924 pmap_store(&pte[i], L3_PTE(pa + ptoa(i), PTE_KERN | 925 pmap_memattr_bits(VM_MEMATTR_DEFAULT))); 926 927 #undef reserve_space 928 929 /* Mark the bounds of our available virtual address space */ 930 virtual_avail = kernel_vm_end = freeva; 931 virtual_end = DEVMAP_MIN_VADDR; 932 933 /* Exclude the reserved physical memory from allocations. */ 934 physmem_exclude_region(kernstart, freemempos - kernstart, 935 EXFLAG_NOALLOC); 936 } 937 938 /* 939 * Initialize a vm_page's machine-dependent fields. 940 */ 941 void 942 pmap_page_init(vm_page_t m) 943 { 944 945 TAILQ_INIT(&m->md.pv_list); 946 m->md.pv_memattr = VM_MEMATTR_DEFAULT; 947 } 948 949 /* 950 * Initialize the pmap module. 951 * 952 * Called by vm_mem_init(), to initialize any structures that the pmap 953 * system needs to map virtual memory. 954 */ 955 void 956 pmap_init(void) 957 { 958 vm_size_t s; 959 int i, pv_npg; 960 961 /* 962 * Initialize the pv chunk and pmap list mutexes. 963 */ 964 mtx_init(&pv_chunks_mutex, "pmap pv chunk list", NULL, MTX_DEF); 965 mtx_init(&allpmaps_lock, "allpmaps", NULL, MTX_DEF); 966 967 /* 968 * Initialize the pool of pv list locks. 
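	 *
	 * A page's lock is chosen by hashing its physical address:
	 * PHYS_TO_PV_LIST_LOCK() indexes pv_list_locks[] with
	 * pmap_l2_pindex(pa) % NPV_LIST_LOCKS, so all 4KB pages within a
	 * given 2MB region share the same lock.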
969 */ 970 for (i = 0; i < NPV_LIST_LOCKS; i++) 971 rw_init(&pv_list_locks[i], "pmap pv list"); 972 973 /* 974 * Calculate the size of the pv head table for superpages. 975 */ 976 pv_npg = howmany(vm_phys_segs[vm_phys_nsegs - 1].end, L2_SIZE); 977 978 /* 979 * Allocate memory for the pv head table for superpages. 980 */ 981 s = (vm_size_t)(pv_npg * sizeof(struct md_page)); 982 s = round_page(s); 983 pv_table = kmem_malloc(s, M_WAITOK | M_ZERO); 984 for (i = 0; i < pv_npg; i++) 985 TAILQ_INIT(&pv_table[i].pv_list); 986 TAILQ_INIT(&pv_dummy.pv_list); 987 988 if (superpages_enabled) 989 pagesizes[1] = L2_SIZE; 990 } 991 992 #ifdef SMP 993 /* 994 * For SMP, these functions have to use IPIs for coherence. 995 * 996 * In general, the calling thread uses a plain fence to order the 997 * writes to the page tables before invoking an SBI callback to invoke 998 * sfence_vma() on remote CPUs. 999 */ 1000 static void 1001 pmap_invalidate_page(pmap_t pmap, vm_offset_t va) 1002 { 1003 cpuset_t mask; 1004 1005 sched_pin(); 1006 mask = pmap->pm_active; 1007 CPU_CLR(PCPU_GET(hart), &mask); 1008 fence(); 1009 if (!CPU_EMPTY(&mask) && smp_started) 1010 sbi_remote_sfence_vma(mask.__bits, va, 1); 1011 sfence_vma_page(va); 1012 sched_unpin(); 1013 } 1014 1015 static void 1016 pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 1017 { 1018 cpuset_t mask; 1019 1020 sched_pin(); 1021 mask = pmap->pm_active; 1022 CPU_CLR(PCPU_GET(hart), &mask); 1023 fence(); 1024 if (!CPU_EMPTY(&mask) && smp_started) 1025 sbi_remote_sfence_vma(mask.__bits, sva, eva - sva + 1); 1026 1027 /* 1028 * Might consider a loop of sfence_vma_page() for a small 1029 * number of pages in the future. 1030 */ 1031 sfence_vma(); 1032 sched_unpin(); 1033 } 1034 1035 static void 1036 pmap_invalidate_all(pmap_t pmap) 1037 { 1038 cpuset_t mask; 1039 1040 sched_pin(); 1041 mask = pmap->pm_active; 1042 CPU_CLR(PCPU_GET(hart), &mask); 1043 1044 /* 1045 * XXX: The SBI doc doesn't detail how to specify x0 as the 1046 * address to perform a global fence. BBL currently treats 1047 * all sfence_vma requests as global however. 1048 */ 1049 fence(); 1050 if (!CPU_EMPTY(&mask) && smp_started) 1051 sbi_remote_sfence_vma(mask.__bits, 0, 0); 1052 sfence_vma(); 1053 sched_unpin(); 1054 } 1055 #else 1056 /* 1057 * Normal, non-SMP, invalidation functions. 1058 * We inline these within pmap.c for speed. 1059 */ 1060 static __inline void 1061 pmap_invalidate_page(pmap_t pmap, vm_offset_t va) 1062 { 1063 1064 sfence_vma_page(va); 1065 } 1066 1067 static __inline void 1068 pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 1069 { 1070 1071 /* 1072 * Might consider a loop of sfence_vma_page() for a small 1073 * number of pages in the future. 1074 */ 1075 sfence_vma(); 1076 } 1077 1078 static __inline void 1079 pmap_invalidate_all(pmap_t pmap) 1080 { 1081 1082 sfence_vma(); 1083 } 1084 #endif 1085 1086 /* 1087 * Routine: pmap_extract 1088 * Function: 1089 * Extract the physical page address associated 1090 * with the given map/virtual_address pair. 1091 */ 1092 vm_paddr_t 1093 pmap_extract(pmap_t pmap, vm_offset_t va) 1094 { 1095 pd_entry_t *l2p, l2; 1096 pt_entry_t *l3p; 1097 vm_paddr_t pa; 1098 1099 pa = 0; 1100 1101 /* 1102 * Start with an L2 lookup, L1 superpages are currently not implemented. 
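	 *
	 * A valid L2 entry with any of the R/W/X bits set is itself a 2MB
	 * leaf; otherwise it points to an L3 table and the final 4KB PTE
	 * provides the physical address.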
	 */
	PMAP_LOCK(pmap);
	l2p = pmap_l2(pmap, va);
	if (l2p != NULL && ((l2 = pmap_load(l2p)) & PTE_V) != 0) {
		if ((l2 & PTE_RWX) == 0) {
			l3p = pmap_l2_to_l3(l2p, va);
			pa = PTE_TO_PHYS(pmap_load(l3p));
			pa |= (va & L3_OFFSET);
		} else {
			/* L2 is a superpage mapping. */
			pa = L2PTE_TO_PHYS(l2);
			pa |= (va & L2_OFFSET);
		}
	}
	PMAP_UNLOCK(pmap);
	return (pa);
}

/*
 * Routine:	pmap_extract_and_hold
 * Function:
 *	Atomically extract and hold the physical page
 *	with the given pmap and virtual address pair
 *	if that mapping permits the given protection.
 */
vm_page_t
pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot)
{
	pt_entry_t *l3p, l3;
	vm_page_t m;

	m = NULL;
	PMAP_LOCK(pmap);
	l3p = pmap_l3(pmap, va);
	if (l3p != NULL && (l3 = pmap_load(l3p)) != 0) {
		if ((l3 & PTE_W) != 0 || (prot & VM_PROT_WRITE) == 0) {
			m = PTE_TO_VM_PAGE(l3);
			if (!vm_page_wire_mapped(m))
				m = NULL;
		}
	}
	PMAP_UNLOCK(pmap);
	return (m);
}

/*
 * Routine:	pmap_kextract
 * Function:
 *	Extract the physical page address associated with the given kernel
 *	virtual address.
 */
vm_paddr_t
pmap_kextract(vm_offset_t va)
{
	pd_entry_t *l2, l2e;
	pt_entry_t *l3;
	vm_paddr_t pa;

	if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) {
		pa = DMAP_TO_PHYS(va);
	} else {
		l2 = pmap_l2(kernel_pmap, va);
		if (l2 == NULL)
			panic("pmap_kextract: No l2");
		l2e = pmap_load(l2);
		/*
		 * Beware of concurrent promotion and demotion! We must
		 * use l2e rather than loading from l2 multiple times to
		 * ensure we see a consistent state, including the
		 * implicit load in pmap_l2_to_l3.  It is, however, safe
		 * to use an old l2e because the L3 page is preserved by
		 * promotion.
		 */
		if ((l2e & PTE_RX) != 0) {
			/* superpages */
			pa = L2PTE_TO_PHYS(l2e);
			pa |= (va & L2_OFFSET);
			return (pa);
		}

		l3 = pmap_l2_to_l3(&l2e, va);
		pa = PTE_TO_PHYS(pmap_load(l3));
		pa |= (va & PAGE_MASK);
	}
	return (pa);
}

/***************************************************
 * Low level mapping routines.....
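 *
 * These create and destroy wired kernel mappings without PV list
 * bookkeeping.  An illustrative pairing (hypothetical caller, not taken
 * from this file) for a page-aligned device region would be:
 *
 *	pmap_kenter_device(va, size, pa);
 *	...
 *	pmap_kremove_device(va, size);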
1192 ***************************************************/ 1193 1194 void 1195 pmap_kenter(vm_offset_t sva, vm_size_t size, vm_paddr_t pa, int mode) 1196 { 1197 pt_entry_t entry; 1198 pt_entry_t *l3; 1199 pt_entry_t memattr; 1200 vm_offset_t va; 1201 pn_t pn; 1202 1203 KASSERT((pa & L3_OFFSET) == 0, 1204 ("pmap_kenter_device: Invalid physical address")); 1205 KASSERT((sva & L3_OFFSET) == 0, 1206 ("pmap_kenter_device: Invalid virtual address")); 1207 KASSERT((size & PAGE_MASK) == 0, 1208 ("pmap_kenter_device: Mapping is not page-sized")); 1209 1210 memattr = pmap_memattr_bits(mode); 1211 va = sva; 1212 while (size != 0) { 1213 l3 = pmap_l3(kernel_pmap, va); 1214 KASSERT(l3 != NULL, ("Invalid page table, va: 0x%lx", va)); 1215 1216 pn = (pa / PAGE_SIZE); 1217 entry = PTE_KERN; 1218 entry |= memattr; 1219 entry |= (pn << PTE_PPN0_S); 1220 pmap_store(l3, entry); 1221 1222 va += PAGE_SIZE; 1223 pa += PAGE_SIZE; 1224 size -= PAGE_SIZE; 1225 } 1226 pmap_invalidate_range(kernel_pmap, sva, va); 1227 } 1228 1229 void 1230 pmap_kenter_device(vm_offset_t sva, vm_size_t size, vm_paddr_t pa) 1231 { 1232 pmap_kenter(sva, size, pa, VM_MEMATTR_DEVICE); 1233 } 1234 1235 /* 1236 * Remove a page from the kernel pagetables. 1237 * Note: not SMP coherent. 1238 */ 1239 void 1240 pmap_kremove(vm_offset_t va) 1241 { 1242 pt_entry_t *l3; 1243 1244 l3 = pmap_l3(kernel_pmap, va); 1245 KASSERT(l3 != NULL, ("pmap_kremove: Invalid address")); 1246 1247 pmap_clear(l3); 1248 sfence_vma(); 1249 } 1250 1251 void 1252 pmap_kremove_device(vm_offset_t sva, vm_size_t size) 1253 { 1254 pt_entry_t *l3; 1255 vm_offset_t va; 1256 1257 KASSERT((sva & L3_OFFSET) == 0, 1258 ("pmap_kremove_device: Invalid virtual address")); 1259 KASSERT((size & PAGE_MASK) == 0, 1260 ("pmap_kremove_device: Mapping is not page-sized")); 1261 1262 va = sva; 1263 while (size != 0) { 1264 l3 = pmap_l3(kernel_pmap, va); 1265 KASSERT(l3 != NULL, ("Invalid page table, va: 0x%lx", va)); 1266 pmap_clear(l3); 1267 1268 va += PAGE_SIZE; 1269 size -= PAGE_SIZE; 1270 } 1271 1272 pmap_invalidate_range(kernel_pmap, sva, va); 1273 } 1274 1275 /* 1276 * Used to map a range of physical addresses into kernel 1277 * virtual address space. 1278 * 1279 * The value passed in '*virt' is a suggested virtual address for 1280 * the mapping. Architectures which can support a direct-mapped 1281 * physical to virtual region can return the appropriate address 1282 * within that region, leaving '*virt' unchanged. Other 1283 * architectures should map the pages starting at '*virt' and 1284 * update '*virt' with the first usable address after the mapped 1285 * region. 1286 */ 1287 vm_offset_t 1288 pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot) 1289 { 1290 1291 return PHYS_TO_DMAP(start); 1292 } 1293 1294 /* 1295 * Add a list of wired pages to the kva 1296 * this routine is only used for temporary 1297 * kernel mappings that do not need to have 1298 * page modification or references recorded. 1299 * Note that old mappings are simply written 1300 * over. The page *must* be wired. 1301 * Note: SMP coherent. Uses a ranged shootdown IPI. 
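 *
 * For example (hypothetical caller), wiring a buffer's pages into a
 * reserved KVA range and later releasing it:
 *
 *	pmap_qenter(sva, ma, count);
 *	...
 *	pmap_qremove(sva, count);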
1302 */ 1303 void 1304 pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count) 1305 { 1306 pt_entry_t *l3; 1307 vm_paddr_t pa; 1308 vm_offset_t va; 1309 vm_page_t m; 1310 pt_entry_t entry; 1311 pn_t pn; 1312 int i; 1313 1314 va = sva; 1315 for (i = 0; i < count; i++) { 1316 m = ma[i]; 1317 pa = VM_PAGE_TO_PHYS(m); 1318 pn = (pa / PAGE_SIZE); 1319 l3 = pmap_l3(kernel_pmap, va); 1320 1321 entry = PTE_KERN; 1322 entry |= pmap_memattr_bits(m->md.pv_memattr); 1323 entry |= (pn << PTE_PPN0_S); 1324 pmap_store(l3, entry); 1325 1326 va += L3_SIZE; 1327 } 1328 pmap_invalidate_range(kernel_pmap, sva, va); 1329 } 1330 1331 /* 1332 * This routine tears out page mappings from the 1333 * kernel -- it is meant only for temporary mappings. 1334 * Note: SMP coherent. Uses a ranged shootdown IPI. 1335 */ 1336 void 1337 pmap_qremove(vm_offset_t sva, int count) 1338 { 1339 pt_entry_t *l3; 1340 vm_offset_t va; 1341 1342 KASSERT(sva >= VM_MIN_KERNEL_ADDRESS, ("usermode va %lx", sva)); 1343 1344 for (va = sva; count-- > 0; va += PAGE_SIZE) { 1345 l3 = pmap_l3(kernel_pmap, va); 1346 KASSERT(l3 != NULL, ("pmap_kremove: Invalid address")); 1347 pmap_clear(l3); 1348 } 1349 pmap_invalidate_range(kernel_pmap, sva, va); 1350 } 1351 1352 bool 1353 pmap_ps_enabled(pmap_t pmap __unused) 1354 { 1355 1356 return (superpages_enabled); 1357 } 1358 1359 /*************************************************** 1360 * Page table page management routines..... 1361 ***************************************************/ 1362 /* 1363 * Schedule the specified unused page table page to be freed. Specifically, 1364 * add the page to the specified list of pages that will be released to the 1365 * physical memory manager after the TLB has been updated. 1366 */ 1367 static __inline void 1368 pmap_add_delayed_free_list(vm_page_t m, struct spglist *free, bool set_PG_ZERO) 1369 { 1370 1371 if (set_PG_ZERO) 1372 m->flags |= PG_ZERO; 1373 else 1374 m->flags &= ~PG_ZERO; 1375 SLIST_INSERT_HEAD(free, m, plinks.s.ss); 1376 } 1377 1378 /* 1379 * Inserts the specified page table page into the specified pmap's collection 1380 * of idle page table pages. Each of a pmap's page table pages is responsible 1381 * for mapping a distinct range of virtual addresses. The pmap's collection is 1382 * ordered by this virtual address range. 1383 * 1384 * If "promoted" is false, then the page table page "mpte" must be zero filled; 1385 * "mpte"'s valid field will be set to 0. 1386 * 1387 * If "promoted" is true and "all_l3e_PTE_A_set" is false, then "mpte" must 1388 * contain valid mappings with identical attributes except for PTE_A; 1389 * "mpte"'s valid field will be set to 1. 1390 * 1391 * If "promoted" and "all_l3e_PTE_A_set" are both true, then "mpte" must contain 1392 * valid mappings with identical attributes including PTE_A; "mpte"'s valid 1393 * field will be set to VM_PAGE_BITS_ALL. 1394 */ 1395 static __inline int 1396 pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte, bool promoted, 1397 bool all_l3e_PTE_A_set) 1398 { 1399 1400 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 1401 KASSERT(promoted || !all_l3e_PTE_A_set, 1402 ("a zero-filled PTP can't have PTE_A set in every PTE")); 1403 mpte->valid = promoted ? (all_l3e_PTE_A_set ? VM_PAGE_BITS_ALL : 1) : 0; 1404 return (vm_radix_insert(&pmap->pm_root, mpte)); 1405 } 1406 1407 /* 1408 * Removes the page table page mapping the specified virtual address from the 1409 * specified pmap's collection of idle page table pages, and returns it. 
1410 * Otherwise, returns NULL if there is no page table page corresponding to the 1411 * specified virtual address. 1412 */ 1413 static __inline vm_page_t 1414 pmap_remove_pt_page(pmap_t pmap, vm_offset_t va) 1415 { 1416 1417 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 1418 return (vm_radix_remove(&pmap->pm_root, pmap_l2_pindex(va))); 1419 } 1420 1421 /* 1422 * Decrements a page table page's reference count, which is used to record the 1423 * number of valid page table entries within the page. If the reference count 1424 * drops to zero, then the page table page is unmapped. Returns true if the 1425 * page table page was unmapped and false otherwise. 1426 */ 1427 static inline bool 1428 pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free) 1429 { 1430 KASSERT(m->ref_count > 0, 1431 ("%s: page %p ref count underflow", __func__, m)); 1432 1433 --m->ref_count; 1434 if (m->ref_count == 0) { 1435 _pmap_unwire_ptp(pmap, va, m, free); 1436 return (true); 1437 } else { 1438 return (false); 1439 } 1440 } 1441 1442 static void 1443 _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free) 1444 { 1445 1446 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 1447 if (m->pindex >= NUL2E + NUL1E) { 1448 pd_entry_t *l0; 1449 l0 = pmap_l0(pmap, va); 1450 pmap_clear(l0); 1451 } else if (m->pindex >= NUL2E) { 1452 pd_entry_t *l1; 1453 l1 = pmap_l1(pmap, va); 1454 pmap_clear(l1); 1455 pmap_distribute_l1(pmap, pmap_l1_index(va), 0); 1456 } else { 1457 pd_entry_t *l2; 1458 l2 = pmap_l2(pmap, va); 1459 pmap_clear(l2); 1460 } 1461 pmap_resident_count_dec(pmap, 1); 1462 if (m->pindex < NUL2E) { 1463 pd_entry_t *l1; 1464 vm_page_t pdpg; 1465 1466 l1 = pmap_l1(pmap, va); 1467 pdpg = PTE_TO_VM_PAGE(pmap_load(l1)); 1468 pmap_unwire_ptp(pmap, va, pdpg, free); 1469 } else if (m->pindex < NUL2E + NUL1E && pmap_mode != PMAP_MODE_SV39) { 1470 pd_entry_t *l0; 1471 vm_page_t pdpg; 1472 1473 l0 = pmap_l0(pmap, va); 1474 pdpg = PTE_TO_VM_PAGE(pmap_load(l0)); 1475 pmap_unwire_ptp(pmap, va, pdpg, free); 1476 } 1477 pmap_invalidate_page(pmap, va); 1478 1479 vm_wire_sub(1); 1480 1481 /* 1482 * Put page on a list so that it is released after 1483 * *ALL* TLB shootdown is done 1484 */ 1485 pmap_add_delayed_free_list(m, free, true); 1486 } 1487 1488 /* 1489 * After removing a page table entry, this routine is used to 1490 * conditionally free the page, and manage the reference count. 1491 */ 1492 static int 1493 pmap_unuse_pt(pmap_t pmap, vm_offset_t va, pd_entry_t ptepde, 1494 struct spglist *free) 1495 { 1496 vm_page_t mpte; 1497 1498 if (va >= VM_MAXUSER_ADDRESS) 1499 return (0); 1500 KASSERT(ptepde != 0, ("pmap_unuse_pt: ptepde != 0")); 1501 mpte = PTE_TO_VM_PAGE(ptepde); 1502 return (pmap_unwire_ptp(pmap, va, mpte, free)); 1503 } 1504 1505 static uint64_t 1506 pmap_satp_mode(void) 1507 { 1508 return (pmap_mode == PMAP_MODE_SV39 ? SATP_MODE_SV39 : SATP_MODE_SV48); 1509 } 1510 1511 void 1512 pmap_pinit0(pmap_t pmap) 1513 { 1514 PMAP_LOCK_INIT(pmap); 1515 bzero(&pmap->pm_stats, sizeof(pmap->pm_stats)); 1516 pmap->pm_stage = PM_STAGE1; 1517 pmap->pm_top = kernel_pmap->pm_top; 1518 pmap->pm_satp = pmap_satp_mode() | 1519 (vtophys(pmap->pm_top) >> PAGE_SHIFT); 1520 CPU_ZERO(&pmap->pm_active); 1521 TAILQ_INIT(&pmap->pm_pvchunk); 1522 vm_radix_init(&pmap->pm_root); 1523 pmap_activate_boot(pmap); 1524 } 1525 1526 int 1527 pmap_pinit_stage(pmap_t pmap, enum pmap_stage stage) 1528 { 1529 vm_paddr_t topphys; 1530 vm_page_t m; 1531 size_t i; 1532 1533 /* 1534 * Top directory is 4 pages in hypervisor case. 
1535 * Current address space layout makes 3 of them unused. 1536 */ 1537 if (stage == PM_STAGE1) 1538 m = vm_page_alloc_noobj(VM_ALLOC_WIRED | VM_ALLOC_ZERO | 1539 VM_ALLOC_WAITOK); 1540 else 1541 m = vm_page_alloc_noobj_contig(VM_ALLOC_WIRED | VM_ALLOC_ZERO, 1542 4, 0, ~0ul, L2_SIZE, 0, VM_MEMATTR_DEFAULT); 1543 1544 topphys = VM_PAGE_TO_PHYS(m); 1545 pmap->pm_top = (pd_entry_t *)PHYS_TO_DMAP(topphys); 1546 pmap->pm_satp = pmap_satp_mode() | (topphys >> PAGE_SHIFT); 1547 pmap->pm_stage = stage; 1548 1549 bzero(&pmap->pm_stats, sizeof(pmap->pm_stats)); 1550 1551 CPU_ZERO(&pmap->pm_active); 1552 1553 if (stage == PM_STAGE2) 1554 goto finish; 1555 1556 if (pmap_mode == PMAP_MODE_SV39) { 1557 /* 1558 * Copy L1 entries from the kernel pmap. This must be done with 1559 * the allpmaps lock held to avoid races with 1560 * pmap_distribute_l1(). 1561 */ 1562 mtx_lock(&allpmaps_lock); 1563 LIST_INSERT_HEAD(&allpmaps, pmap, pm_list); 1564 for (i = pmap_l1_index(VM_MIN_KERNEL_ADDRESS); 1565 i < pmap_l1_index(VM_MAX_KERNEL_ADDRESS); i++) 1566 pmap->pm_top[i] = kernel_pmap->pm_top[i]; 1567 for (i = pmap_l1_index(DMAP_MIN_ADDRESS); 1568 i < pmap_l1_index(DMAP_MAX_ADDRESS); i++) 1569 pmap->pm_top[i] = kernel_pmap->pm_top[i]; 1570 mtx_unlock(&allpmaps_lock); 1571 } else { 1572 i = pmap_l0_index(VM_MIN_KERNEL_ADDRESS); 1573 pmap->pm_top[i] = kernel_pmap->pm_top[i]; 1574 } 1575 1576 finish: 1577 TAILQ_INIT(&pmap->pm_pvchunk); 1578 vm_radix_init(&pmap->pm_root); 1579 1580 return (1); 1581 } 1582 1583 int 1584 pmap_pinit(pmap_t pmap) 1585 { 1586 1587 return (pmap_pinit_stage(pmap, PM_STAGE1)); 1588 } 1589 1590 /* 1591 * This routine is called if the desired page table page does not exist. 1592 * 1593 * If page table page allocation fails, this routine may sleep before 1594 * returning NULL. It sleeps only if a lock pointer was given. 1595 * 1596 * Note: If a page allocation fails at page table level two or three, 1597 * one or two pages may be held during the wait, only to be released 1598 * afterwards. This conservative approach is easily argued to avoid 1599 * race conditions. 1600 */ 1601 static vm_page_t 1602 _pmap_alloc_l3(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp) 1603 { 1604 vm_page_t m, pdpg; 1605 pt_entry_t entry; 1606 vm_paddr_t phys; 1607 pn_t pn; 1608 1609 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 1610 1611 /* 1612 * Allocate a page table page. 1613 */ 1614 m = vm_page_alloc_noobj(VM_ALLOC_WIRED | VM_ALLOC_ZERO); 1615 if (m == NULL) { 1616 if (lockp != NULL) { 1617 RELEASE_PV_LIST_LOCK(lockp); 1618 PMAP_UNLOCK(pmap); 1619 rw_runlock(&pvh_global_lock); 1620 vm_wait(NULL); 1621 rw_rlock(&pvh_global_lock); 1622 PMAP_LOCK(pmap); 1623 } 1624 1625 /* 1626 * Indicate the need to retry. While waiting, the page table 1627 * page may have been allocated. 1628 */ 1629 return (NULL); 1630 } 1631 m->pindex = ptepindex; 1632 1633 /* 1634 * Map the pagetable page into the process address space, if 1635 * it isn't already there. 
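	 *
	 * The pindex encodes the level being allocated: indices of at least
	 * NUL2E + NUL1E name L1 table pages (linked from L0), indices of at
	 * least NUL2E name L2 table pages (linked from L1), and smaller
	 * indices name L3 table pages (linked from L2); see the index-space
	 * comment near the top of the file.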
1636 */ 1637 pn = VM_PAGE_TO_PHYS(m) >> PAGE_SHIFT; 1638 if (ptepindex >= NUL2E + NUL1E) { 1639 pd_entry_t *l0; 1640 vm_pindex_t l0index; 1641 1642 KASSERT(pmap_mode != PMAP_MODE_SV39, 1643 ("%s: pindex %#lx in SV39 mode", __func__, ptepindex)); 1644 KASSERT(ptepindex < NUL2E + NUL1E + NUL0E, 1645 ("%s: pindex %#lx out of range", __func__, ptepindex)); 1646 1647 l0index = ptepindex - (NUL2E + NUL1E); 1648 l0 = &pmap->pm_top[l0index]; 1649 KASSERT((pmap_load(l0) & PTE_V) == 0, 1650 ("%s: L0 entry %#lx is valid", __func__, pmap_load(l0))); 1651 1652 entry = PTE_V | (pn << PTE_PPN0_S); 1653 pmap_store(l0, entry); 1654 } else if (ptepindex >= NUL2E) { 1655 pd_entry_t *l0, *l1; 1656 vm_pindex_t l0index, l1index; 1657 1658 l1index = ptepindex - NUL2E; 1659 if (pmap_mode == PMAP_MODE_SV39) { 1660 l1 = &pmap->pm_top[l1index]; 1661 } else { 1662 l0index = l1index >> Ln_ENTRIES_SHIFT; 1663 l0 = &pmap->pm_top[l0index]; 1664 if (pmap_load(l0) == 0) { 1665 /* Recurse to allocate the L1 page. */ 1666 if (_pmap_alloc_l3(pmap, 1667 NUL2E + NUL1E + l0index, lockp) == NULL) 1668 goto fail; 1669 phys = PTE_TO_PHYS(pmap_load(l0)); 1670 } else { 1671 phys = PTE_TO_PHYS(pmap_load(l0)); 1672 pdpg = PHYS_TO_VM_PAGE(phys); 1673 pdpg->ref_count++; 1674 } 1675 l1 = (pd_entry_t *)PHYS_TO_DMAP(phys); 1676 l1 = &l1[ptepindex & Ln_ADDR_MASK]; 1677 } 1678 KASSERT((pmap_load(l1) & PTE_V) == 0, 1679 ("%s: L1 entry %#lx is valid", __func__, pmap_load(l1))); 1680 1681 entry = PTE_V | (pn << PTE_PPN0_S); 1682 pmap_store(l1, entry); 1683 pmap_distribute_l1(pmap, l1index, entry); 1684 } else { 1685 vm_pindex_t l0index, l1index; 1686 pd_entry_t *l0, *l1, *l2; 1687 1688 l1index = ptepindex >> (L1_SHIFT - L2_SHIFT); 1689 if (pmap_mode == PMAP_MODE_SV39) { 1690 l1 = &pmap->pm_top[l1index]; 1691 if (pmap_load(l1) == 0) { 1692 /* recurse for allocating page dir */ 1693 if (_pmap_alloc_l3(pmap, NUL2E + l1index, 1694 lockp) == NULL) 1695 goto fail; 1696 } else { 1697 pdpg = PTE_TO_VM_PAGE(pmap_load(l1)); 1698 pdpg->ref_count++; 1699 } 1700 } else { 1701 l0index = l1index >> Ln_ENTRIES_SHIFT; 1702 l0 = &pmap->pm_top[l0index]; 1703 if (pmap_load(l0) == 0) { 1704 /* Recurse to allocate the L1 entry. */ 1705 if (_pmap_alloc_l3(pmap, NUL2E + l1index, 1706 lockp) == NULL) 1707 goto fail; 1708 phys = PTE_TO_PHYS(pmap_load(l0)); 1709 l1 = (pd_entry_t *)PHYS_TO_DMAP(phys); 1710 l1 = &l1[l1index & Ln_ADDR_MASK]; 1711 } else { 1712 phys = PTE_TO_PHYS(pmap_load(l0)); 1713 l1 = (pd_entry_t *)PHYS_TO_DMAP(phys); 1714 l1 = &l1[l1index & Ln_ADDR_MASK]; 1715 if (pmap_load(l1) == 0) { 1716 /* Recurse to allocate the L2 page. 
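					 * (L2 table pages live at pindex
					 * NUL2E + l1index; see the pindex
					 * layout described near the top of
					 * the file.)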
*/ 1717 if (_pmap_alloc_l3(pmap, 1718 NUL2E + l1index, lockp) == NULL) 1719 goto fail; 1720 } else { 1721 pdpg = PTE_TO_VM_PAGE(pmap_load(l1)); 1722 pdpg->ref_count++; 1723 } 1724 } 1725 } 1726 1727 phys = PTE_TO_PHYS(pmap_load(l1)); 1728 l2 = (pd_entry_t *)PHYS_TO_DMAP(phys); 1729 l2 = &l2[ptepindex & Ln_ADDR_MASK]; 1730 KASSERT((pmap_load(l2) & PTE_V) == 0, 1731 ("%s: L2 entry %#lx is valid", __func__, pmap_load(l2))); 1732 1733 entry = PTE_V | (pn << PTE_PPN0_S); 1734 pmap_store(l2, entry); 1735 } 1736 1737 pmap_resident_count_inc(pmap, 1); 1738 1739 return (m); 1740 1741 fail: 1742 vm_page_unwire_noq(m); 1743 vm_page_free_zero(m); 1744 return (NULL); 1745 } 1746 1747 static vm_page_t 1748 pmap_alloc_l2(pmap_t pmap, vm_offset_t va, struct rwlock **lockp) 1749 { 1750 pd_entry_t *l1; 1751 vm_page_t l2pg; 1752 vm_pindex_t pindex; 1753 1754 retry: 1755 l1 = pmap_l1(pmap, va); 1756 if (l1 != NULL && (pmap_load(l1) & PTE_V) != 0) { 1757 KASSERT((pmap_load(l1) & PTE_RWX) == 0, 1758 ("%s: L1 entry %#lx for VA %#lx is a leaf", __func__, 1759 pmap_load(l1), va)); 1760 /* Add a reference to the L2 page. */ 1761 l2pg = PTE_TO_VM_PAGE(pmap_load(l1)); 1762 l2pg->ref_count++; 1763 } else { 1764 /* Allocate a L2 page. */ 1765 pindex = pmap_l1_pindex(va); 1766 l2pg = _pmap_alloc_l3(pmap, pindex, lockp); 1767 if (l2pg == NULL && lockp != NULL) 1768 goto retry; 1769 } 1770 return (l2pg); 1771 } 1772 1773 static vm_page_t 1774 pmap_alloc_l3(pmap_t pmap, vm_offset_t va, struct rwlock **lockp) 1775 { 1776 vm_pindex_t ptepindex; 1777 pd_entry_t *l2; 1778 vm_page_t m; 1779 1780 /* 1781 * Calculate pagetable page index 1782 */ 1783 ptepindex = pmap_l2_pindex(va); 1784 retry: 1785 /* 1786 * Get the page directory entry 1787 */ 1788 l2 = pmap_l2(pmap, va); 1789 1790 /* 1791 * If the page table page is mapped, we just increment the 1792 * hold count, and activate it. 1793 */ 1794 if (l2 != NULL && pmap_load(l2) != 0) { 1795 m = PTE_TO_VM_PAGE(pmap_load(l2)); 1796 m->ref_count++; 1797 } else { 1798 /* 1799 * Here if the pte page isn't mapped, or if it has been 1800 * deallocated. 1801 */ 1802 m = _pmap_alloc_l3(pmap, ptepindex, lockp); 1803 if (m == NULL && lockp != NULL) 1804 goto retry; 1805 } 1806 return (m); 1807 } 1808 1809 /*************************************************** 1810 * Pmap allocation/deallocation routines. 1811 ***************************************************/ 1812 1813 /* 1814 * Release any resources held by the given physical map. 1815 * Called when a pmap initialized by pmap_pinit is being released. 1816 * Should only be called if the map contains no valid mappings. 1817 */ 1818 void 1819 pmap_release(pmap_t pmap) 1820 { 1821 vm_page_t m; 1822 int npages; 1823 int i; 1824 1825 KASSERT(pmap->pm_stats.resident_count == 0, 1826 ("pmap_release: pmap resident count %ld != 0", 1827 pmap->pm_stats.resident_count)); 1828 KASSERT(CPU_EMPTY(&pmap->pm_active), 1829 ("releasing active pmap %p", pmap)); 1830 1831 if (pmap->pm_stage == PM_STAGE2) 1832 goto finish; 1833 1834 if (pmap_mode == PMAP_MODE_SV39) { 1835 mtx_lock(&allpmaps_lock); 1836 LIST_REMOVE(pmap, pm_list); 1837 mtx_unlock(&allpmaps_lock); 1838 } 1839 1840 finish: 1841 npages = pmap->pm_stage == PM_STAGE2 ? 
4 : 1; 1842 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pmap->pm_top)); 1843 for (i = 0; i < npages; i++) { 1844 vm_page_unwire_noq(m); 1845 vm_page_free(m); 1846 m++; 1847 } 1848 } 1849 1850 static int 1851 kvm_size(SYSCTL_HANDLER_ARGS) 1852 { 1853 unsigned long ksize = VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS; 1854 1855 return sysctl_handle_long(oidp, &ksize, 0, req); 1856 } 1857 SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG | CTLFLAG_RD | CTLFLAG_MPSAFE, 1858 0, 0, kvm_size, "LU", 1859 "Size of KVM"); 1860 1861 static int 1862 kvm_free(SYSCTL_HANDLER_ARGS) 1863 { 1864 unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end; 1865 1866 return sysctl_handle_long(oidp, &kfree, 0, req); 1867 } 1868 SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG | CTLFLAG_RD | CTLFLAG_MPSAFE, 1869 0, 0, kvm_free, "LU", 1870 "Amount of KVM free"); 1871 1872 /* 1873 * grow the number of kernel page table entries, if needed 1874 */ 1875 void 1876 pmap_growkernel(vm_offset_t addr) 1877 { 1878 vm_paddr_t paddr; 1879 vm_page_t nkpg; 1880 pd_entry_t *l1, *l2; 1881 pt_entry_t entry; 1882 pn_t pn; 1883 1884 mtx_assert(&kernel_map->system_mtx, MA_OWNED); 1885 1886 addr = roundup2(addr, L2_SIZE); 1887 if (addr - 1 >= vm_map_max(kernel_map)) 1888 addr = vm_map_max(kernel_map); 1889 while (kernel_vm_end < addr) { 1890 l1 = pmap_l1(kernel_pmap, kernel_vm_end); 1891 if (pmap_load(l1) == 0) { 1892 /* We need a new PDP entry */ 1893 nkpg = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT | 1894 VM_ALLOC_NOFREE | VM_ALLOC_WIRED | VM_ALLOC_ZERO); 1895 if (nkpg == NULL) 1896 panic("%s: no memory to grow kernel", __func__); 1897 nkpg->pindex = pmap_l1_pindex(kernel_vm_end); 1898 paddr = VM_PAGE_TO_PHYS(nkpg); 1899 1900 pn = (paddr / PAGE_SIZE); 1901 entry = (PTE_V); 1902 entry |= (pn << PTE_PPN0_S); 1903 pmap_store(l1, entry); 1904 pmap_distribute_l1(kernel_pmap, 1905 pmap_l1_index(kernel_vm_end), entry); 1906 continue; /* try again */ 1907 } 1908 l2 = pmap_l1_to_l2(l1, kernel_vm_end); 1909 if ((pmap_load(l2) & PTE_V) != 0 && 1910 (pmap_load(l2) & PTE_RWX) == 0) { 1911 kernel_vm_end = (kernel_vm_end + L2_SIZE) & ~L2_OFFSET; 1912 if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) { 1913 kernel_vm_end = vm_map_max(kernel_map); 1914 break; 1915 } 1916 continue; 1917 } 1918 1919 nkpg = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT | 1920 VM_ALLOC_NOFREE | VM_ALLOC_WIRED | VM_ALLOC_ZERO); 1921 if (nkpg == NULL) 1922 panic("%s: no memory to grow kernel", __func__); 1923 nkpg->pindex = pmap_l2_pindex(kernel_vm_end); 1924 paddr = VM_PAGE_TO_PHYS(nkpg); 1925 1926 pn = (paddr / PAGE_SIZE); 1927 entry = (PTE_V); 1928 entry |= (pn << PTE_PPN0_S); 1929 pmap_store(l2, entry); 1930 1931 pmap_invalidate_page(kernel_pmap, kernel_vm_end); 1932 1933 kernel_vm_end = (kernel_vm_end + L2_SIZE) & ~L2_OFFSET; 1934 if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) { 1935 kernel_vm_end = vm_map_max(kernel_map); 1936 break; 1937 } 1938 } 1939 } 1940 1941 /*************************************************** 1942 * page management routines. 1943 ***************************************************/ 1944 1945 static const uint64_t pc_freemask[_NPCM] = { 1946 [0 ... 
_NPCM - 2] = PC_FREEN, 1947 [_NPCM - 1] = PC_FREEL 1948 }; 1949 1950 #ifdef PV_STATS 1951 static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail; 1952 1953 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0, 1954 "Current number of pv entry chunks"); 1955 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0, 1956 "Current number of pv entry chunks allocated"); 1957 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0, 1958 "Current number of pv entry chunks frees"); 1959 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0, 1960 "Number of times tried to get a chunk page but failed."); 1961 1962 static long pv_entry_frees, pv_entry_allocs, pv_entry_count; 1963 static int pv_entry_spare; 1964 1965 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0, 1966 "Current number of pv entry frees"); 1967 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0, 1968 "Current number of pv entry allocs"); 1969 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0, 1970 "Current number of pv entries"); 1971 SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0, 1972 "Current number of spare pv entries"); 1973 #endif 1974 1975 /* 1976 * We are in a serious low memory condition. Resort to 1977 * drastic measures to free some pages so we can allocate 1978 * another pv entry chunk. 1979 * 1980 * Returns NULL if PV entries were reclaimed from the specified pmap. 1981 * 1982 * We do not, however, unmap 2mpages because subsequent accesses will 1983 * allocate per-page pv entries until repromotion occurs, thereby 1984 * exacerbating the shortage of free pv entries. 1985 */ 1986 static vm_page_t 1987 reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp) 1988 { 1989 1990 panic("RISCVTODO: reclaim_pv_chunk"); 1991 } 1992 1993 /* 1994 * free the pv_entry back to the free list 1995 */ 1996 static void 1997 free_pv_entry(pmap_t pmap, pv_entry_t pv) 1998 { 1999 struct pv_chunk *pc; 2000 int idx, field, bit; 2001 2002 rw_assert(&pvh_global_lock, RA_LOCKED); 2003 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2004 PV_STAT(atomic_add_long(&pv_entry_frees, 1)); 2005 PV_STAT(atomic_add_int(&pv_entry_spare, 1)); 2006 PV_STAT(atomic_subtract_long(&pv_entry_count, 1)); 2007 pc = pv_to_chunk(pv); 2008 idx = pv - &pc->pc_pventry[0]; 2009 field = idx / 64; 2010 bit = idx % 64; 2011 pc->pc_map[field] |= 1ul << bit; 2012 if (!pc_is_free(pc)) { 2013 /* 98% of the time, pc is already at the head of the list. 
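		 * If it is not, move it to the head so that get_pv_entry()
		 * finds a chunk with free entries at the front of the list.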
*/ 2014 if (__predict_false(pc != TAILQ_FIRST(&pmap->pm_pvchunk))) { 2015 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 2016 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); 2017 } 2018 return; 2019 } 2020 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 2021 free_pv_chunk(pc); 2022 } 2023 2024 static void 2025 free_pv_chunk(struct pv_chunk *pc) 2026 { 2027 vm_page_t m; 2028 2029 mtx_lock(&pv_chunks_mutex); 2030 TAILQ_REMOVE(&pv_chunks, pc, pc_lru); 2031 mtx_unlock(&pv_chunks_mutex); 2032 PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV)); 2033 PV_STAT(atomic_subtract_int(&pc_chunk_count, 1)); 2034 PV_STAT(atomic_add_int(&pc_chunk_frees, 1)); 2035 /* entire chunk is free, return it */ 2036 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc)); 2037 dump_drop_page(m->phys_addr); 2038 vm_page_unwire_noq(m); 2039 vm_page_free(m); 2040 } 2041 2042 /* 2043 * Returns a new PV entry, allocating a new PV chunk from the system when 2044 * needed. If this PV chunk allocation fails and a PV list lock pointer was 2045 * given, a PV chunk is reclaimed from an arbitrary pmap. Otherwise, NULL is 2046 * returned. 2047 * 2048 * The given PV list lock may be released. 2049 */ 2050 static pv_entry_t 2051 get_pv_entry(pmap_t pmap, struct rwlock **lockp) 2052 { 2053 int bit, field; 2054 pv_entry_t pv; 2055 struct pv_chunk *pc; 2056 vm_page_t m; 2057 2058 rw_assert(&pvh_global_lock, RA_LOCKED); 2059 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2060 PV_STAT(atomic_add_long(&pv_entry_allocs, 1)); 2061 retry: 2062 pc = TAILQ_FIRST(&pmap->pm_pvchunk); 2063 if (pc != NULL) { 2064 for (field = 0; field < _NPCM; field++) { 2065 if (pc->pc_map[field]) { 2066 bit = ffsl(pc->pc_map[field]) - 1; 2067 break; 2068 } 2069 } 2070 if (field < _NPCM) { 2071 pv = &pc->pc_pventry[field * 64 + bit]; 2072 pc->pc_map[field] &= ~(1ul << bit); 2073 /* If this was the last item, move it to tail */ 2074 if (pc_is_full(pc)) { 2075 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 2076 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, 2077 pc_list); 2078 } 2079 PV_STAT(atomic_add_long(&pv_entry_count, 1)); 2080 PV_STAT(atomic_subtract_int(&pv_entry_spare, 1)); 2081 return (pv); 2082 } 2083 } 2084 /* No free items, allocate another chunk */ 2085 m = vm_page_alloc_noobj(VM_ALLOC_WIRED); 2086 if (m == NULL) { 2087 if (lockp == NULL) { 2088 PV_STAT(pc_chunk_tryfail++); 2089 return (NULL); 2090 } 2091 m = reclaim_pv_chunk(pmap, lockp); 2092 if (m == NULL) 2093 goto retry; 2094 } 2095 PV_STAT(atomic_add_int(&pc_chunk_count, 1)); 2096 PV_STAT(atomic_add_int(&pc_chunk_allocs, 1)); 2097 dump_add_page(m->phys_addr); 2098 pc = (void *)PHYS_TO_DMAP(m->phys_addr); 2099 pc->pc_pmap = pmap; 2100 pc->pc_map[0] = PC_FREEN & ~1ul; /* preallocated bit 0 */ 2101 pc->pc_map[1] = PC_FREEN; 2102 pc->pc_map[2] = PC_FREEL; 2103 mtx_lock(&pv_chunks_mutex); 2104 TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru); 2105 mtx_unlock(&pv_chunks_mutex); 2106 pv = &pc->pc_pventry[0]; 2107 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); 2108 PV_STAT(atomic_add_long(&pv_entry_count, 1)); 2109 PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV - 1)); 2110 return (pv); 2111 } 2112 2113 /* 2114 * Ensure that the number of spare PV entries in the specified pmap meets or 2115 * exceeds the given count, "needed". 2116 * 2117 * The given PV list lock may be released. 
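 *
 * For example, pmap_demote_l2_locked() reserves Ln_ENTRIES - 1 spare entries
 * before demoting a managed 2MB mapping.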
2118 */ 2119 static void 2120 reserve_pv_entries(pmap_t pmap, int needed, struct rwlock **lockp) 2121 { 2122 struct pch new_tail; 2123 struct pv_chunk *pc; 2124 vm_page_t m; 2125 int avail, free; 2126 bool reclaimed; 2127 2128 rw_assert(&pvh_global_lock, RA_LOCKED); 2129 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2130 KASSERT(lockp != NULL, ("reserve_pv_entries: lockp is NULL")); 2131 2132 /* 2133 * Newly allocated PV chunks must be stored in a private list until 2134 * the required number of PV chunks have been allocated. Otherwise, 2135 * reclaim_pv_chunk() could recycle one of these chunks. In 2136 * contrast, these chunks must be added to the pmap upon allocation. 2137 */ 2138 TAILQ_INIT(&new_tail); 2139 retry: 2140 avail = 0; 2141 TAILQ_FOREACH(pc, &pmap->pm_pvchunk, pc_list) { 2142 bit_count((bitstr_t *)pc->pc_map, 0, 2143 sizeof(pc->pc_map) * NBBY, &free); 2144 if (free == 0) 2145 break; 2146 avail += free; 2147 if (avail >= needed) 2148 break; 2149 } 2150 for (reclaimed = false; avail < needed; avail += _NPCPV) { 2151 m = vm_page_alloc_noobj(VM_ALLOC_WIRED); 2152 if (m == NULL) { 2153 m = reclaim_pv_chunk(pmap, lockp); 2154 if (m == NULL) 2155 goto retry; 2156 reclaimed = true; 2157 } 2158 PV_STAT(atomic_add_int(&pc_chunk_count, 1)); 2159 PV_STAT(atomic_add_int(&pc_chunk_allocs, 1)); 2160 dump_add_page(m->phys_addr); 2161 pc = (void *)PHYS_TO_DMAP(m->phys_addr); 2162 pc->pc_pmap = pmap; 2163 pc->pc_map[0] = PC_FREEN; 2164 pc->pc_map[1] = PC_FREEN; 2165 pc->pc_map[2] = PC_FREEL; 2166 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); 2167 TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru); 2168 PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV)); 2169 2170 /* 2171 * The reclaim might have freed a chunk from the current pmap. 2172 * If that chunk contained available entries, we need to 2173 * re-count the number of available entries. 2174 */ 2175 if (reclaimed) 2176 goto retry; 2177 } 2178 if (!TAILQ_EMPTY(&new_tail)) { 2179 mtx_lock(&pv_chunks_mutex); 2180 TAILQ_CONCAT(&pv_chunks, &new_tail, pc_lru); 2181 mtx_unlock(&pv_chunks_mutex); 2182 } 2183 } 2184 2185 /* 2186 * First find and then remove the pv entry for the specified pmap and virtual 2187 * address from the specified pv list. Returns the pv entry if found and NULL 2188 * otherwise. This operation can be performed on pv lists for either 4KB or 2189 * 2MB page mappings. 2190 */ 2191 static __inline pv_entry_t 2192 pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va) 2193 { 2194 pv_entry_t pv; 2195 2196 rw_assert(&pvh_global_lock, RA_LOCKED); 2197 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 2198 if (pmap == PV_PMAP(pv) && va == pv->pv_va) { 2199 TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); 2200 pvh->pv_gen++; 2201 break; 2202 } 2203 } 2204 return (pv); 2205 } 2206 2207 /* 2208 * First find and then destroy the pv entry for the specified pmap and virtual 2209 * address. This operation can be performed on pv lists for either 4KB or 2MB 2210 * page mappings. 2211 */ 2212 static void 2213 pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va) 2214 { 2215 pv_entry_t pv; 2216 2217 pv = pmap_pvh_remove(pvh, pmap, va); 2218 2219 KASSERT(pv != NULL, ("pmap_pvh_free: pv not found for %#lx", va)); 2220 free_pv_entry(pmap, pv); 2221 } 2222 2223 /* 2224 * Conditionally create the PV entry for a 4KB page mapping if the required 2225 * memory can be allocated without resorting to reclamation. 
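 * Returns true if the PV entry was created and false otherwise.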
2226 */ 2227 static bool 2228 pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m, 2229 struct rwlock **lockp) 2230 { 2231 pv_entry_t pv; 2232 2233 rw_assert(&pvh_global_lock, RA_LOCKED); 2234 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2235 /* Pass NULL instead of the lock pointer to disable reclamation. */ 2236 if ((pv = get_pv_entry(pmap, NULL)) != NULL) { 2237 pv->pv_va = va; 2238 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); 2239 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 2240 m->md.pv_gen++; 2241 return (true); 2242 } else 2243 return (false); 2244 } 2245 2246 /* 2247 * After demotion from a 2MB page mapping to 512 4KB page mappings, 2248 * destroy the pv entry for the 2MB page mapping and reinstantiate the pv 2249 * entries for each of the 4KB page mappings. 2250 */ 2251 static void __unused 2252 pmap_pv_demote_l2(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, 2253 struct rwlock **lockp) 2254 { 2255 struct md_page *pvh; 2256 struct pv_chunk *pc; 2257 pv_entry_t pv; 2258 vm_page_t m; 2259 vm_offset_t va_last; 2260 int bit, field; 2261 2262 rw_assert(&pvh_global_lock, RA_LOCKED); 2263 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2264 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); 2265 2266 /* 2267 * Transfer the 2mpage's pv entry for this mapping to the first 2268 * page's pv list. Once this transfer begins, the pv list lock 2269 * must not be released until the last pv entry is reinstantiated. 2270 */ 2271 pvh = pa_to_pvh(pa); 2272 va &= ~L2_OFFSET; 2273 pv = pmap_pvh_remove(pvh, pmap, va); 2274 KASSERT(pv != NULL, ("pmap_pv_demote_l2: pv not found")); 2275 m = PHYS_TO_VM_PAGE(pa); 2276 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 2277 m->md.pv_gen++; 2278 /* Instantiate the remaining 511 pv entries. */ 2279 PV_STAT(atomic_add_long(&pv_entry_allocs, Ln_ENTRIES - 1)); 2280 va_last = va + L2_SIZE - PAGE_SIZE; 2281 for (;;) { 2282 pc = TAILQ_FIRST(&pmap->pm_pvchunk); 2283 KASSERT(!pc_is_full(pc), ("pmap_pv_demote_l2: missing spare")); 2284 for (field = 0; field < _NPCM; field++) { 2285 while (pc->pc_map[field] != 0) { 2286 bit = ffsl(pc->pc_map[field]) - 1; 2287 pc->pc_map[field] &= ~(1ul << bit); 2288 pv = &pc->pc_pventry[field * 64 + bit]; 2289 va += PAGE_SIZE; 2290 pv->pv_va = va; 2291 m++; 2292 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 2293 ("pmap_pv_demote_l2: page %p is not managed", m)); 2294 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 2295 m->md.pv_gen++; 2296 if (va == va_last) 2297 goto out; 2298 } 2299 } 2300 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 2301 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); 2302 } 2303 out: 2304 if (pc_is_full(pc)) { 2305 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 2306 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); 2307 } 2308 PV_STAT(atomic_add_long(&pv_entry_count, Ln_ENTRIES - 1)); 2309 PV_STAT(atomic_add_int(&pv_entry_spare, -(Ln_ENTRIES - 1))); 2310 } 2311 2312 #if VM_NRESERVLEVEL > 0 2313 static void 2314 pmap_pv_promote_l2(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, 2315 struct rwlock **lockp) 2316 { 2317 struct md_page *pvh; 2318 pv_entry_t pv; 2319 vm_page_t m; 2320 vm_offset_t va_last; 2321 2322 rw_assert(&pvh_global_lock, RA_LOCKED); 2323 KASSERT((pa & L2_OFFSET) == 0, 2324 ("pmap_pv_promote_l2: misaligned pa %#lx", pa)); 2325 2326 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); 2327 2328 m = PHYS_TO_VM_PAGE(pa); 2329 va = va & ~L2_OFFSET; 2330 pv = pmap_pvh_remove(&m->md, pmap, va); 2331 KASSERT(pv != NULL, ("pmap_pv_promote_l2: pv for %#lx not found", va)); 2332 pvh = pa_to_pvh(pa); 2333 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); 
2334 pvh->pv_gen++; 2335 2336 va_last = va + L2_SIZE - PAGE_SIZE; 2337 do { 2338 m++; 2339 va += PAGE_SIZE; 2340 pmap_pvh_free(&m->md, pmap, va); 2341 } while (va < va_last); 2342 } 2343 #endif /* VM_NRESERVLEVEL > 0 */ 2344 2345 /* 2346 * Create the PV entry for a 2MB page mapping. Always returns true unless the 2347 * flag PMAP_ENTER_NORECLAIM is specified. If that flag is specified, returns 2348 * false if the PV entry cannot be allocated without resorting to reclamation. 2349 */ 2350 static bool 2351 pmap_pv_insert_l2(pmap_t pmap, vm_offset_t va, pd_entry_t l2e, u_int flags, 2352 struct rwlock **lockp) 2353 { 2354 struct md_page *pvh; 2355 pv_entry_t pv; 2356 vm_paddr_t pa; 2357 2358 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2359 /* Pass NULL instead of the lock pointer to disable reclamation. */ 2360 if ((pv = get_pv_entry(pmap, (flags & PMAP_ENTER_NORECLAIM) != 0 ? 2361 NULL : lockp)) == NULL) 2362 return (false); 2363 pv->pv_va = va; 2364 pa = PTE_TO_PHYS(l2e); 2365 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); 2366 pvh = pa_to_pvh(pa); 2367 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); 2368 pvh->pv_gen++; 2369 return (true); 2370 } 2371 2372 static void 2373 pmap_remove_kernel_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va) 2374 { 2375 pt_entry_t newl2, oldl2 __diagused; 2376 vm_page_t ml3; 2377 vm_paddr_t ml3pa; 2378 2379 KASSERT(!VIRT_IN_DMAP(va), ("removing direct mapping of %#lx", va)); 2380 KASSERT(pmap == kernel_pmap, ("pmap %p is not kernel_pmap", pmap)); 2381 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2382 2383 ml3 = pmap_remove_pt_page(pmap, va); 2384 if (ml3 == NULL) 2385 panic("pmap_remove_kernel_l2: Missing pt page"); 2386 2387 ml3pa = VM_PAGE_TO_PHYS(ml3); 2388 newl2 = ml3pa | PTE_V; 2389 2390 /* 2391 * If this page table page was unmapped by a promotion, then it 2392 * contains valid mappings. Zero it to invalidate those mappings. 2393 */ 2394 if (vm_page_any_valid(ml3)) 2395 pagezero((void *)PHYS_TO_DMAP(ml3pa)); 2396 2397 /* 2398 * Demote the mapping. 2399 */ 2400 oldl2 = pmap_load_store(l2, newl2); 2401 KASSERT(oldl2 == 0, ("%s: found existing mapping at %p: %#lx", 2402 __func__, l2, oldl2)); 2403 } 2404 2405 /* 2406 * pmap_remove_l2: Do the things to unmap a level 2 superpage. 2407 */ 2408 static int 2409 pmap_remove_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva, 2410 pd_entry_t l1e, struct spglist *free, struct rwlock **lockp) 2411 { 2412 struct md_page *pvh; 2413 pt_entry_t oldl2; 2414 vm_offset_t eva, va; 2415 vm_page_t m, ml3; 2416 2417 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2418 KASSERT((sva & L2_OFFSET) == 0, ("pmap_remove_l2: sva is not aligned")); 2419 oldl2 = pmap_load_clear(l2); 2420 KASSERT((oldl2 & PTE_RWX) != 0, 2421 ("pmap_remove_l2: L2e %lx is not a superpage mapping", oldl2)); 2422 2423 /* 2424 * The sfence.vma documentation states that it is sufficient to specify 2425 * a single address within a superpage mapping. However, since we do 2426 * not perform any invalidation upon promotion, TLBs may still be 2427 * caching 4KB mappings within the superpage, so we must invalidate the 2428 * entire range. 
2429 */ 2430 pmap_invalidate_range(pmap, sva, sva + L2_SIZE); 2431 if ((oldl2 & PTE_SW_WIRED) != 0) 2432 pmap->pm_stats.wired_count -= L2_SIZE / PAGE_SIZE; 2433 pmap_resident_count_dec(pmap, L2_SIZE / PAGE_SIZE); 2434 if ((oldl2 & PTE_SW_MANAGED) != 0) { 2435 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, PTE_TO_PHYS(oldl2)); 2436 pvh = pa_to_pvh(PTE_TO_PHYS(oldl2)); 2437 pmap_pvh_free(pvh, pmap, sva); 2438 eva = sva + L2_SIZE; 2439 for (va = sva, m = PTE_TO_VM_PAGE(oldl2); 2440 va < eva; va += PAGE_SIZE, m++) { 2441 if ((oldl2 & PTE_D) != 0) 2442 vm_page_dirty(m); 2443 if ((oldl2 & PTE_A) != 0) 2444 vm_page_aflag_set(m, PGA_REFERENCED); 2445 if (TAILQ_EMPTY(&m->md.pv_list) && 2446 TAILQ_EMPTY(&pvh->pv_list)) 2447 vm_page_aflag_clear(m, PGA_WRITEABLE); 2448 } 2449 } 2450 if (pmap == kernel_pmap) { 2451 pmap_remove_kernel_l2(pmap, l2, sva); 2452 } else { 2453 ml3 = pmap_remove_pt_page(pmap, sva); 2454 if (ml3 != NULL) { 2455 KASSERT(vm_page_any_valid(ml3), 2456 ("pmap_remove_l2: l3 page not promoted")); 2457 pmap_resident_count_dec(pmap, 1); 2458 KASSERT(ml3->ref_count == Ln_ENTRIES, 2459 ("pmap_remove_l2: l3 page ref count error")); 2460 ml3->ref_count = 1; 2461 vm_page_unwire_noq(ml3); 2462 pmap_add_delayed_free_list(ml3, free, false); 2463 } 2464 } 2465 return (pmap_unuse_pt(pmap, sva, l1e, free)); 2466 } 2467 2468 /* 2469 * pmap_remove_l3: do the things to unmap a page in a process 2470 */ 2471 static int 2472 pmap_remove_l3(pmap_t pmap, pt_entry_t *l3, vm_offset_t va, 2473 pd_entry_t l2e, struct spglist *free, struct rwlock **lockp) 2474 { 2475 struct md_page *pvh; 2476 pt_entry_t old_l3; 2477 vm_page_t m; 2478 2479 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2480 old_l3 = pmap_load_clear(l3); 2481 pmap_invalidate_page(pmap, va); 2482 if (old_l3 & PTE_SW_WIRED) 2483 pmap->pm_stats.wired_count -= 1; 2484 pmap_resident_count_dec(pmap, 1); 2485 if (old_l3 & PTE_SW_MANAGED) { 2486 m = PTE_TO_VM_PAGE(old_l3); 2487 if ((old_l3 & PTE_D) != 0) 2488 vm_page_dirty(m); 2489 if (old_l3 & PTE_A) 2490 vm_page_aflag_set(m, PGA_REFERENCED); 2491 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); 2492 pmap_pvh_free(&m->md, pmap, va); 2493 if (TAILQ_EMPTY(&m->md.pv_list) && 2494 (m->flags & PG_FICTITIOUS) == 0) { 2495 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 2496 if (TAILQ_EMPTY(&pvh->pv_list)) 2497 vm_page_aflag_clear(m, PGA_WRITEABLE); 2498 } 2499 } 2500 2501 return (pmap_unuse_pt(pmap, va, l2e, free)); 2502 } 2503 2504 /* 2505 * Remove the given range of addresses from the specified map. 2506 * 2507 * It is assumed that the start and end are properly 2508 * rounded to the page size. 2509 */ 2510 void 2511 pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 2512 { 2513 struct spglist free; 2514 struct rwlock *lock; 2515 vm_offset_t va, va_next; 2516 pd_entry_t *l0, *l1, *l2, l2e; 2517 pt_entry_t *l3; 2518 2519 /* 2520 * Perform an unsynchronized read. This is, however, safe. 
2521 */ 2522 if (pmap->pm_stats.resident_count == 0) 2523 return; 2524 2525 SLIST_INIT(&free); 2526 2527 rw_rlock(&pvh_global_lock); 2528 PMAP_LOCK(pmap); 2529 2530 lock = NULL; 2531 for (; sva < eva; sva = va_next) { 2532 if (pmap->pm_stats.resident_count == 0) 2533 break; 2534 2535 if (pmap_mode == PMAP_MODE_SV48) { 2536 l0 = pmap_l0(pmap, sva); 2537 if (pmap_load(l0) == 0) { 2538 va_next = (sva + L0_SIZE) & ~L0_OFFSET; 2539 if (va_next < sva) 2540 va_next = eva; 2541 continue; 2542 } 2543 l1 = pmap_l0_to_l1(l0, sva); 2544 } else { 2545 l1 = pmap_l1(pmap, sva); 2546 } 2547 2548 if (pmap_load(l1) == 0) { 2549 va_next = (sva + L1_SIZE) & ~L1_OFFSET; 2550 if (va_next < sva) 2551 va_next = eva; 2552 continue; 2553 } 2554 2555 /* 2556 * Calculate index for next page table. 2557 */ 2558 va_next = (sva + L2_SIZE) & ~L2_OFFSET; 2559 if (va_next < sva) 2560 va_next = eva; 2561 2562 l2 = pmap_l1_to_l2(l1, sva); 2563 if ((l2e = pmap_load(l2)) == 0) 2564 continue; 2565 if ((l2e & PTE_RWX) != 0) { 2566 if (sva + L2_SIZE == va_next && eva >= va_next) { 2567 (void)pmap_remove_l2(pmap, l2, sva, 2568 pmap_load(l1), &free, &lock); 2569 continue; 2570 } else if (!pmap_demote_l2_locked(pmap, l2, sva, 2571 &lock)) { 2572 /* 2573 * The large page mapping was destroyed. 2574 */ 2575 continue; 2576 } 2577 l2e = pmap_load(l2); 2578 } 2579 2580 /* 2581 * Limit our scan to either the end of the va represented 2582 * by the current page table page, or to the end of the 2583 * range being removed. 2584 */ 2585 if (va_next > eva) 2586 va_next = eva; 2587 2588 va = va_next; 2589 for (l3 = pmap_l2_to_l3(l2, sva); sva != va_next; l3++, 2590 sva += L3_SIZE) { 2591 if (pmap_load(l3) == 0) { 2592 if (va != va_next) { 2593 pmap_invalidate_range(pmap, va, sva); 2594 va = va_next; 2595 } 2596 continue; 2597 } 2598 if (va == va_next) 2599 va = sva; 2600 if (pmap_remove_l3(pmap, l3, sva, l2e, &free, &lock)) { 2601 sva += L3_SIZE; 2602 break; 2603 } 2604 } 2605 if (va != va_next) 2606 pmap_invalidate_range(pmap, va, sva); 2607 } 2608 if (lock != NULL) 2609 rw_wunlock(lock); 2610 rw_runlock(&pvh_global_lock); 2611 PMAP_UNLOCK(pmap); 2612 vm_page_free_pages_toq(&free, false); 2613 } 2614 2615 /* 2616 * Routine: pmap_remove_all 2617 * Function: 2618 * Removes this physical page from 2619 * all physical maps in which it resides. 2620 * Reflects back modify bits to the pager. 2621 * 2622 * Notes: 2623 * Original versions of this routine were very 2624 * inefficient because they iteratively called 2625 * pmap_remove (slow...) 2626 */ 2627 2628 void 2629 pmap_remove_all(vm_page_t m) 2630 { 2631 struct spglist free; 2632 struct md_page *pvh; 2633 pmap_t pmap; 2634 pt_entry_t *l3, l3e; 2635 pd_entry_t *l2, l2e __diagused; 2636 pv_entry_t pv; 2637 vm_offset_t va; 2638 2639 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 2640 ("pmap_remove_all: page %p is not managed", m)); 2641 SLIST_INIT(&free); 2642 pvh = (m->flags & PG_FICTITIOUS) != 0 ? 
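	    /* Fictitious pages are not covered by pa_to_pvh(); use the always-empty dummy list. */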
&pv_dummy : 2643 pa_to_pvh(VM_PAGE_TO_PHYS(m)); 2644 2645 rw_wlock(&pvh_global_lock); 2646 while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) { 2647 pmap = PV_PMAP(pv); 2648 PMAP_LOCK(pmap); 2649 va = pv->pv_va; 2650 l2 = pmap_l2(pmap, va); 2651 (void)pmap_demote_l2(pmap, l2, va); 2652 PMAP_UNLOCK(pmap); 2653 } 2654 while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { 2655 pmap = PV_PMAP(pv); 2656 PMAP_LOCK(pmap); 2657 pmap_resident_count_dec(pmap, 1); 2658 l2 = pmap_l2(pmap, pv->pv_va); 2659 KASSERT(l2 != NULL, ("pmap_remove_all: no l2 table found")); 2660 l2e = pmap_load(l2); 2661 2662 KASSERT((l2e & PTE_RX) == 0, 2663 ("pmap_remove_all: found a superpage in %p's pv list", m)); 2664 2665 l3 = pmap_l2_to_l3(l2, pv->pv_va); 2666 l3e = pmap_load_clear(l3); 2667 pmap_invalidate_page(pmap, pv->pv_va); 2668 if (l3e & PTE_SW_WIRED) 2669 pmap->pm_stats.wired_count--; 2670 if ((l3e & PTE_A) != 0) 2671 vm_page_aflag_set(m, PGA_REFERENCED); 2672 2673 /* 2674 * Update the vm_page_t clean and reference bits. 2675 */ 2676 if ((l3e & PTE_D) != 0) 2677 vm_page_dirty(m); 2678 pmap_unuse_pt(pmap, pv->pv_va, pmap_load(l2), &free); 2679 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 2680 m->md.pv_gen++; 2681 free_pv_entry(pmap, pv); 2682 PMAP_UNLOCK(pmap); 2683 } 2684 vm_page_aflag_clear(m, PGA_WRITEABLE); 2685 rw_wunlock(&pvh_global_lock); 2686 vm_page_free_pages_toq(&free, false); 2687 } 2688 2689 /* 2690 * Set the physical protection on the 2691 * specified range of this map as requested. 2692 */ 2693 void 2694 pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) 2695 { 2696 pd_entry_t *l0, *l1, *l2, l2e; 2697 pt_entry_t *l3, l3e, mask; 2698 vm_page_t m, mt; 2699 vm_offset_t va_next; 2700 bool anychanged, pv_lists_locked; 2701 2702 if ((prot & VM_PROT_READ) == VM_PROT_NONE) { 2703 pmap_remove(pmap, sva, eva); 2704 return; 2705 } 2706 2707 if ((prot & (VM_PROT_WRITE | VM_PROT_EXECUTE)) == 2708 (VM_PROT_WRITE | VM_PROT_EXECUTE)) 2709 return; 2710 2711 anychanged = false; 2712 pv_lists_locked = false; 2713 mask = 0; 2714 if ((prot & VM_PROT_WRITE) == 0) 2715 mask |= PTE_W | PTE_D; 2716 if ((prot & VM_PROT_EXECUTE) == 0) 2717 mask |= PTE_X; 2718 resume: 2719 PMAP_LOCK(pmap); 2720 for (; sva < eva; sva = va_next) { 2721 if (pmap_mode == PMAP_MODE_SV48) { 2722 l0 = pmap_l0(pmap, sva); 2723 if (pmap_load(l0) == 0) { 2724 va_next = (sva + L0_SIZE) & ~L0_OFFSET; 2725 if (va_next < sva) 2726 va_next = eva; 2727 continue; 2728 } 2729 l1 = pmap_l0_to_l1(l0, sva); 2730 } else { 2731 l1 = pmap_l1(pmap, sva); 2732 } 2733 2734 if (pmap_load(l1) == 0) { 2735 va_next = (sva + L1_SIZE) & ~L1_OFFSET; 2736 if (va_next < sva) 2737 va_next = eva; 2738 continue; 2739 } 2740 2741 va_next = (sva + L2_SIZE) & ~L2_OFFSET; 2742 if (va_next < sva) 2743 va_next = eva; 2744 2745 l2 = pmap_l1_to_l2(l1, sva); 2746 if ((l2e = pmap_load(l2)) == 0) 2747 continue; 2748 if ((l2e & PTE_RWX) != 0) { 2749 if (sva + L2_SIZE == va_next && eva >= va_next) { 2750 retryl2: 2751 if ((prot & VM_PROT_WRITE) == 0 && 2752 (l2e & (PTE_SW_MANAGED | PTE_D)) == 2753 (PTE_SW_MANAGED | PTE_D)) { 2754 m = PTE_TO_VM_PAGE(l2e); 2755 for (mt = m; mt < &m[Ln_ENTRIES]; mt++) 2756 vm_page_dirty(mt); 2757 } 2758 if (!atomic_fcmpset_long(l2, &l2e, l2e & ~mask)) 2759 goto retryl2; 2760 anychanged = true; 2761 continue; 2762 } else { 2763 if (!pv_lists_locked) { 2764 pv_lists_locked = true; 2765 if (!rw_try_rlock(&pvh_global_lock)) { 2766 if (anychanged) 2767 pmap_invalidate_all( 2768 pmap); 2769 PMAP_UNLOCK(pmap); 2770 rw_rlock(&pvh_global_lock); 2771 
goto resume; 2772 } 2773 } 2774 if (!pmap_demote_l2(pmap, l2, sva)) { 2775 /* 2776 * The large page mapping was destroyed. 2777 */ 2778 continue; 2779 } 2780 } 2781 } 2782 2783 if (va_next > eva) 2784 va_next = eva; 2785 2786 for (l3 = pmap_l2_to_l3(l2, sva); sva != va_next; l3++, 2787 sva += L3_SIZE) { 2788 l3e = pmap_load(l3); 2789 retryl3: 2790 if ((l3e & PTE_V) == 0) 2791 continue; 2792 if ((prot & VM_PROT_WRITE) == 0 && 2793 (l3e & (PTE_SW_MANAGED | PTE_D)) == 2794 (PTE_SW_MANAGED | PTE_D)) { 2795 m = PTE_TO_VM_PAGE(l3e); 2796 vm_page_dirty(m); 2797 } 2798 if (!atomic_fcmpset_long(l3, &l3e, l3e & ~mask)) 2799 goto retryl3; 2800 anychanged = true; 2801 } 2802 } 2803 if (anychanged) 2804 pmap_invalidate_all(pmap); 2805 if (pv_lists_locked) 2806 rw_runlock(&pvh_global_lock); 2807 PMAP_UNLOCK(pmap); 2808 } 2809 2810 int 2811 pmap_fault(pmap_t pmap, vm_offset_t va, vm_prot_t ftype) 2812 { 2813 pd_entry_t *l2, l2e; 2814 pt_entry_t bits, *pte, oldpte; 2815 int rv; 2816 2817 KASSERT(VIRT_IS_VALID(va), ("pmap_fault: invalid va %#lx", va)); 2818 2819 rv = 0; 2820 PMAP_LOCK(pmap); 2821 l2 = pmap_l2(pmap, va); 2822 if (l2 == NULL || ((l2e = pmap_load(l2)) & PTE_V) == 0) 2823 goto done; 2824 if ((l2e & PTE_RWX) == 0) { 2825 pte = pmap_l2_to_l3(l2, va); 2826 if (((oldpte = pmap_load(pte)) & PTE_V) == 0) 2827 goto done; 2828 } else { 2829 pte = l2; 2830 oldpte = l2e; 2831 } 2832 2833 if ((pmap != kernel_pmap && (oldpte & PTE_U) == 0) || 2834 (ftype == VM_PROT_WRITE && (oldpte & PTE_W) == 0) || 2835 (ftype == VM_PROT_EXECUTE && (oldpte & PTE_X) == 0) || 2836 (ftype == VM_PROT_READ && (oldpte & PTE_R) == 0)) 2837 goto done; 2838 2839 bits = PTE_A; 2840 if (ftype == VM_PROT_WRITE) 2841 bits |= PTE_D; 2842 2843 /* 2844 * Spurious faults can occur if the implementation caches invalid 2845 * entries in the TLB, or if simultaneous accesses on multiple CPUs 2846 * race with each other. 2847 */ 2848 if ((oldpte & bits) != bits) 2849 pmap_store_bits(pte, bits); 2850 sfence_vma(); 2851 rv = 1; 2852 done: 2853 PMAP_UNLOCK(pmap); 2854 return (rv); 2855 } 2856 2857 /* 2858 * Demote the specified L1 page to separate L2 pages. 2859 * Currently only used for DMAP entries. 2860 */ 2861 static bool 2862 pmap_demote_l1(pmap_t pmap, pd_entry_t *l1, vm_offset_t va) 2863 { 2864 vm_page_t m; 2865 pt_entry_t *l2, oldl1, newl2; 2866 pd_entry_t newl1; 2867 vm_paddr_t l2phys; 2868 2869 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2870 2871 oldl1 = pmap_load(l1); 2872 KASSERT((oldl1 & PTE_RWX) != 0, 2873 ("pmap_demote_l1: oldl1 is not a leaf PTE")); 2874 KASSERT((oldl1 & PTE_A) != 0, 2875 ("pmap_demote_l1: oldl1 is missing PTE_A")); 2876 KASSERT((oldl1 & (PTE_D | PTE_W)) != PTE_W, 2877 ("pmap_demote_l1: not dirty!")); 2878 KASSERT((oldl1 & PTE_SW_MANAGED) == 0, 2879 ("pmap_demote_l1: L1 table shouldn't be managed")); 2880 KASSERT(VIRT_IN_DMAP(va), 2881 ("pmap_demote_l1: is unsupported for non-DMAP va=%#lx", va)); 2882 2883 /* Demoting L1 means we need to allocate a new page-table page. */ 2884 m = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT | VM_ALLOC_WIRED); 2885 if (m == NULL) { 2886 CTR2(KTR_PMAP, "pmap_demote_l1: failure for va %#lx in pmap %p", 2887 va, pmap); 2888 return (false); 2889 } 2890 2891 l2phys = VM_PAGE_TO_PHYS(m); 2892 l2 = (pt_entry_t *)PHYS_TO_DMAP(l2phys); 2893 2894 /* 2895 * Create new entries, relying on the fact that only the low bits 2896 * (index) of the physical address are changing. 
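	 * Entry i differs from the original L1 entry only in PPN[1]: it maps
	 * the 2MB block at byte offset i * L2_SIZE within the original 1GB
	 * mapping.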
2897 */ 2898 newl2 = oldl1; 2899 for (int i = 0; i < Ln_ENTRIES; i++) 2900 pmap_store(&l2[i], newl2 | (i << PTE_PPN1_S)); 2901 2902 /* 2903 * And update the L1 entry. 2904 * 2905 * NB: flushing the TLB is the responsibility of the caller. Cached 2906 * translations are still "correct" for demoted mappings until some 2907 * subset of the demoted range is modified. 2908 */ 2909 newl1 = ((l2phys / PAGE_SIZE) << PTE_PPN0_S) | PTE_V; 2910 pmap_store(l1, newl1); 2911 2912 counter_u64_add(pmap_l1_demotions, 1); 2913 CTR2(KTR_PMAP, "pmap_demote_l1: success for va %#lx in pmap %p", 2914 va, pmap); 2915 return (true); 2916 } 2917 2918 static bool 2919 pmap_demote_l2(pmap_t pmap, pd_entry_t *l2, vm_offset_t va) 2920 { 2921 struct rwlock *lock; 2922 bool rv; 2923 2924 lock = NULL; 2925 rv = pmap_demote_l2_locked(pmap, l2, va, &lock); 2926 if (lock != NULL) 2927 rw_wunlock(lock); 2928 return (rv); 2929 } 2930 2931 /* 2932 * Tries to demote a 2MB page mapping. If demotion fails, the 2MB page 2933 * mapping is invalidated. 2934 */ 2935 static bool 2936 pmap_demote_l2_locked(pmap_t pmap, pd_entry_t *l2, vm_offset_t va, 2937 struct rwlock **lockp) 2938 { 2939 struct spglist free; 2940 vm_page_t mpte; 2941 pd_entry_t newl2, oldl2; 2942 pt_entry_t *firstl3, newl3; 2943 vm_paddr_t mptepa; 2944 int i; 2945 2946 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2947 2948 oldl2 = pmap_load(l2); 2949 KASSERT((oldl2 & PTE_RWX) != 0, 2950 ("pmap_demote_l2_locked: oldl2 is not a leaf entry")); 2951 if ((oldl2 & PTE_A) == 0 || (mpte = pmap_remove_pt_page(pmap, va)) == 2952 NULL) { 2953 KASSERT((oldl2 & PTE_SW_WIRED) == 0, 2954 ("pmap_demote_l2_locked: page table page for a wired mapping is missing")); 2955 if ((oldl2 & PTE_A) == 0 || (mpte = vm_page_alloc_noobj( 2956 (VIRT_IN_DMAP(va) ? VM_ALLOC_INTERRUPT : 0) | 2957 VM_ALLOC_WIRED)) == NULL) { 2958 SLIST_INIT(&free); 2959 (void)pmap_remove_l2(pmap, l2, va & ~L2_OFFSET, 2960 pmap_load(pmap_l1(pmap, va)), &free, lockp); 2961 vm_page_free_pages_toq(&free, true); 2962 CTR2(KTR_PMAP, "pmap_demote_l2_locked: " 2963 "failure for va %#lx in pmap %p", va, pmap); 2964 return (false); 2965 } 2966 mpte->pindex = pmap_l2_pindex(va); 2967 if (va < VM_MAXUSER_ADDRESS) { 2968 mpte->ref_count = Ln_ENTRIES; 2969 pmap_resident_count_inc(pmap, 1); 2970 } 2971 } 2972 mptepa = VM_PAGE_TO_PHYS(mpte); 2973 firstl3 = (pt_entry_t *)PHYS_TO_DMAP(mptepa); 2974 newl2 = ((mptepa / PAGE_SIZE) << PTE_PPN0_S) | PTE_V; 2975 KASSERT((oldl2 & PTE_A) != 0, 2976 ("pmap_demote_l2_locked: oldl2 is missing PTE_A")); 2977 KASSERT((oldl2 & (PTE_D | PTE_W)) != PTE_W, 2978 ("pmap_demote_l2_locked: oldl2 is missing PTE_D")); 2979 newl3 = oldl2; 2980 2981 /* 2982 * If the page table page is not leftover from an earlier promotion, 2983 * initialize it. 2984 */ 2985 if (!vm_page_all_valid(mpte)) { 2986 for (i = 0; i < Ln_ENTRIES; i++) 2987 pmap_store(firstl3 + i, newl3 + (i << PTE_PPN0_S)); 2988 } 2989 KASSERT(PTE_TO_PHYS(pmap_load(firstl3)) == PTE_TO_PHYS(newl3), 2990 ("pmap_demote_l2_locked: firstl3 and newl3 map different physical " 2991 "addresses")); 2992 2993 /* 2994 * If the mapping has changed attributes, update the PTEs. 2995 */ 2996 if ((pmap_load(firstl3) & PTE_PROMOTE) != (newl3 & PTE_PROMOTE)) 2997 for (i = 0; i < Ln_ENTRIES; i++) 2998 pmap_store(firstl3 + i, newl3 + (i << PTE_PPN0_S)); 2999 3000 /* 3001 * The spare PV entries must be reserved prior to demoting the 3002 * mapping, that is, prior to changing the L2 entry. 
Otherwise, the 3003 * state of the L2 entry and the PV lists will be inconsistent, which 3004 * can result in reclaim_pv_chunk() attempting to remove a PV entry from 3005 * the wrong PV list and pmap_pv_demote_l2() failing to find the 3006 * expected PV entry for the 2MB page mapping that is being demoted. 3007 */ 3008 if ((oldl2 & PTE_SW_MANAGED) != 0) 3009 reserve_pv_entries(pmap, Ln_ENTRIES - 1, lockp); 3010 3011 /* 3012 * Demote the mapping. 3013 */ 3014 pmap_store(l2, newl2); 3015 3016 /* 3017 * Demote the PV entry. 3018 */ 3019 if ((oldl2 & PTE_SW_MANAGED) != 0) 3020 pmap_pv_demote_l2(pmap, va, PTE_TO_PHYS(oldl2), lockp); 3021 3022 atomic_add_long(&pmap_l2_demotions, 1); 3023 CTR2(KTR_PMAP, "pmap_demote_l2_locked: success for va %#lx in pmap %p", 3024 va, pmap); 3025 return (true); 3026 } 3027 3028 #if VM_NRESERVLEVEL > 0 3029 static bool 3030 pmap_promote_l2(pmap_t pmap, pd_entry_t *l2, vm_offset_t va, vm_page_t ml3, 3031 struct rwlock **lockp) 3032 { 3033 pt_entry_t all_l3e_PTE_A, *firstl3, firstl3e, *l3, l3e; 3034 vm_paddr_t pa; 3035 3036 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3037 if (!pmap_ps_enabled(pmap)) 3038 return (false); 3039 3040 KASSERT((pmap_load(l2) & PTE_RWX) == 0, 3041 ("pmap_promote_l2: invalid l2 entry %p", l2)); 3042 3043 /* 3044 * Examine the first L3E in the specified PTP. Abort if this L3E is 3045 * ineligible for promotion or does not map the first 4KB physical page 3046 * within a 2MB page. 3047 */ 3048 firstl3 = (pt_entry_t *)PHYS_TO_DMAP(PTE_TO_PHYS(pmap_load(l2))); 3049 firstl3e = pmap_load(firstl3); 3050 pa = PTE_TO_PHYS(firstl3e); 3051 if ((pa & L2_OFFSET) != 0) { 3052 CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx pmap %p", 3053 va, pmap); 3054 atomic_add_long(&pmap_l2_p_failures, 1); 3055 return (false); 3056 } 3057 3058 /* 3059 * Downgrade a clean, writable mapping to read-only to ensure that the 3060 * hardware does not set PTE_D while we are comparing PTEs. 3061 * 3062 * Upon a write access to a clean mapping, the implementation will 3063 * either atomically check protections and set PTE_D, or raise a page 3064 * fault. In the latter case, the pmap lock provides atomicity. Thus, 3065 * we do not issue an sfence.vma here and instead rely on pmap_fault() 3066 * to do so lazily. 3067 */ 3068 while ((firstl3e & (PTE_W | PTE_D)) == PTE_W) { 3069 if (atomic_fcmpset_64(firstl3, &firstl3e, firstl3e & ~PTE_W)) { 3070 firstl3e &= ~PTE_W; 3071 break; 3072 } 3073 } 3074 3075 /* 3076 * Examine each of the other PTEs in the specified PTP. Abort if this 3077 * PTE maps an unexpected 4KB physical page or does not have identical 3078 * characteristics to the first PTE. 
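	 * The scan runs from the last PTE in the PTP down to the second, with
	 * the expected physical address decreasing by PAGE_SIZE at each step.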
3079 */ 3080 all_l3e_PTE_A = firstl3e & PTE_A; 3081 pa += L2_SIZE - PAGE_SIZE; 3082 for (l3 = firstl3 + Ln_ENTRIES - 1; l3 > firstl3; l3--) { 3083 l3e = pmap_load(l3); 3084 if (PTE_TO_PHYS(l3e) != pa) { 3085 CTR2(KTR_PMAP, 3086 "pmap_promote_l2: failure for va %#lx pmap %p", 3087 va, pmap); 3088 atomic_add_long(&pmap_l2_p_failures, 1); 3089 return (false); 3090 } 3091 while ((l3e & (PTE_W | PTE_D)) == PTE_W) { 3092 if (atomic_fcmpset_64(l3, &l3e, l3e & ~PTE_W)) { 3093 l3e &= ~PTE_W; 3094 break; 3095 } 3096 } 3097 if ((l3e & PTE_PROMOTE) != (firstl3e & PTE_PROMOTE)) { 3098 CTR2(KTR_PMAP, 3099 "pmap_promote_l2: failure for va %#lx pmap %p", 3100 va, pmap); 3101 atomic_add_long(&pmap_l2_p_failures, 1); 3102 return (false); 3103 } 3104 all_l3e_PTE_A &= l3e; 3105 pa -= PAGE_SIZE; 3106 } 3107 3108 /* 3109 * Unless all PTEs have PTE_A set, clear it from the superpage 3110 * mapping, so that promotions triggered by speculative mappings, 3111 * such as pmap_enter_quick(), don't automatically mark the 3112 * underlying pages as referenced. 3113 */ 3114 firstl3e &= ~PTE_A | all_l3e_PTE_A; 3115 3116 /* 3117 * Save the page table page in its current state until the L2 3118 * mapping the superpage is demoted by pmap_demote_l2() or 3119 * destroyed by pmap_remove_l3(). 3120 */ 3121 if (ml3 == NULL) 3122 ml3 = PTE_TO_VM_PAGE(pmap_load(l2)); 3123 KASSERT(ml3->pindex == pmap_l2_pindex(va), 3124 ("pmap_promote_l2: page table page's pindex is wrong")); 3125 if (pmap_insert_pt_page(pmap, ml3, true, all_l3e_PTE_A != 0)) { 3126 CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx pmap %p", 3127 va, pmap); 3128 atomic_add_long(&pmap_l2_p_failures, 1); 3129 return (false); 3130 } 3131 3132 if ((firstl3e & PTE_SW_MANAGED) != 0) 3133 pmap_pv_promote_l2(pmap, va, PTE_TO_PHYS(firstl3e), lockp); 3134 3135 pmap_store(l2, firstl3e); 3136 3137 atomic_add_long(&pmap_l2_promotions, 1); 3138 CTR2(KTR_PMAP, "pmap_promote_l2: success for va %#lx in pmap %p", va, 3139 pmap); 3140 return (true); 3141 } 3142 #endif 3143 3144 /* 3145 * Insert the given physical page (p) at 3146 * the specified virtual address (v) in the 3147 * target physical map with the protection requested. 3148 * 3149 * If specified, the page will be wired down, meaning 3150 * that the related pte can not be reclaimed. 3151 * 3152 * NB: This is the only routine which MAY NOT lazy-evaluate 3153 * or lose information. That is, this routine must actually 3154 * insert this page into the given map NOW. 3155 */ 3156 int 3157 pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, 3158 u_int flags, int8_t psind) 3159 { 3160 struct rwlock *lock; 3161 pd_entry_t *l2, l2e; 3162 pt_entry_t new_l3, orig_l3; 3163 pt_entry_t *l3; 3164 pv_entry_t pv; 3165 vm_paddr_t opa, pa; 3166 vm_page_t mpte, om; 3167 pn_t pn; 3168 int rv; 3169 bool nosleep; 3170 3171 va = trunc_page(va); 3172 if ((m->oflags & VPO_UNMANAGED) == 0) 3173 VM_PAGE_OBJECT_BUSY_ASSERT(m); 3174 pa = VM_PAGE_TO_PHYS(m); 3175 pn = (pa / PAGE_SIZE); 3176 3177 new_l3 = PTE_V | PTE_R | PTE_A; 3178 if (prot & VM_PROT_EXECUTE) 3179 new_l3 |= PTE_X; 3180 if (flags & VM_PROT_WRITE) 3181 new_l3 |= PTE_D; 3182 if (prot & VM_PROT_WRITE) 3183 new_l3 |= PTE_W; 3184 if (va < VM_MAX_USER_ADDRESS) 3185 new_l3 |= PTE_U; 3186 3187 new_l3 |= (pn << PTE_PPN0_S); 3188 if ((flags & PMAP_ENTER_WIRED) != 0) 3189 new_l3 |= PTE_SW_WIRED; 3190 new_l3 |= pmap_memattr_bits(m->md.pv_memattr); 3191 3192 /* 3193 * Set modified bit gratuitously for writeable mappings if 3194 * the page is unmanaged. 
We do not want to take a fault 3195 * to do the dirty bit accounting for these mappings. 3196 */ 3197 if ((m->oflags & VPO_UNMANAGED) != 0) { 3198 if (prot & VM_PROT_WRITE) 3199 new_l3 |= PTE_D; 3200 } else 3201 new_l3 |= PTE_SW_MANAGED; 3202 3203 CTR2(KTR_PMAP, "pmap_enter: %.16lx -> %.16lx", va, pa); 3204 3205 lock = NULL; 3206 mpte = NULL; 3207 rw_rlock(&pvh_global_lock); 3208 PMAP_LOCK(pmap); 3209 if (psind == 1) { 3210 /* Assert the required virtual and physical alignment. */ 3211 KASSERT((va & L2_OFFSET) == 0, 3212 ("pmap_enter: va %#lx unaligned", va)); 3213 KASSERT(m->psind > 0, ("pmap_enter: m->psind < psind")); 3214 rv = pmap_enter_l2(pmap, va, new_l3, flags, m, &lock); 3215 goto out; 3216 } 3217 3218 l2 = pmap_l2(pmap, va); 3219 if (l2 != NULL && ((l2e = pmap_load(l2)) & PTE_V) != 0 && 3220 ((l2e & PTE_RWX) == 0 || pmap_demote_l2_locked(pmap, l2, 3221 va, &lock))) { 3222 l3 = pmap_l2_to_l3(l2, va); 3223 if (va < VM_MAXUSER_ADDRESS) { 3224 mpte = PTE_TO_VM_PAGE(pmap_load(l2)); 3225 mpte->ref_count++; 3226 } 3227 } else if (va < VM_MAXUSER_ADDRESS) { 3228 nosleep = (flags & PMAP_ENTER_NOSLEEP) != 0; 3229 mpte = pmap_alloc_l3(pmap, va, nosleep ? NULL : &lock); 3230 if (mpte == NULL && nosleep) { 3231 CTR0(KTR_PMAP, "pmap_enter: mpte == NULL"); 3232 if (lock != NULL) 3233 rw_wunlock(lock); 3234 rw_runlock(&pvh_global_lock); 3235 PMAP_UNLOCK(pmap); 3236 return (KERN_RESOURCE_SHORTAGE); 3237 } 3238 l3 = pmap_l3(pmap, va); 3239 } else { 3240 panic("pmap_enter: missing L3 table for kernel va %#lx", va); 3241 } 3242 3243 orig_l3 = pmap_load(l3); 3244 opa = PTE_TO_PHYS(orig_l3); 3245 pv = NULL; 3246 3247 /* 3248 * Is the specified virtual address already mapped? 3249 */ 3250 if ((orig_l3 & PTE_V) != 0) { 3251 /* 3252 * Wiring change, just update stats. We don't worry about 3253 * wiring PT pages as they remain resident as long as there 3254 * are valid mappings in them. Hence, if a user page is wired, 3255 * the PT page will be also. 3256 */ 3257 if ((flags & PMAP_ENTER_WIRED) != 0 && 3258 (orig_l3 & PTE_SW_WIRED) == 0) 3259 pmap->pm_stats.wired_count++; 3260 else if ((flags & PMAP_ENTER_WIRED) == 0 && 3261 (orig_l3 & PTE_SW_WIRED) != 0) 3262 pmap->pm_stats.wired_count--; 3263 3264 /* 3265 * Remove the extra PT page reference. 3266 */ 3267 if (mpte != NULL) { 3268 mpte->ref_count--; 3269 KASSERT(mpte->ref_count > 0, 3270 ("pmap_enter: missing reference to page table page," 3271 " va: 0x%lx", va)); 3272 } 3273 3274 /* 3275 * Has the physical page changed? 3276 */ 3277 if (opa == pa) { 3278 /* 3279 * No, might be a protection or wiring change. 3280 */ 3281 if ((orig_l3 & PTE_SW_MANAGED) != 0 && 3282 (new_l3 & PTE_W) != 0) 3283 vm_page_aflag_set(m, PGA_WRITEABLE); 3284 goto validate; 3285 } 3286 3287 /* 3288 * The physical page has changed. Temporarily invalidate 3289 * the mapping. This ensures that all threads sharing the 3290 * pmap keep a consistent view of the mapping, which is 3291 * necessary for the correct handling of COW faults. It 3292 * also permits reuse of the old mapping's PV entry, 3293 * avoiding an allocation. 3294 * 3295 * For consistency, handle unmanaged mappings the same way. 3296 */ 3297 orig_l3 = pmap_load_clear(l3); 3298 KASSERT(PTE_TO_PHYS(orig_l3) == opa, 3299 ("pmap_enter: unexpected pa update for %#lx", va)); 3300 if ((orig_l3 & PTE_SW_MANAGED) != 0) { 3301 om = PHYS_TO_VM_PAGE(opa); 3302 3303 /* 3304 * The pmap lock is sufficient to synchronize with 3305 * concurrent calls to pmap_page_test_mappings() and 3306 * pmap_ts_referenced(). 
3307 */ 3308 if ((orig_l3 & PTE_D) != 0) 3309 vm_page_dirty(om); 3310 if ((orig_l3 & PTE_A) != 0) 3311 vm_page_aflag_set(om, PGA_REFERENCED); 3312 CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, opa); 3313 pv = pmap_pvh_remove(&om->md, pmap, va); 3314 KASSERT(pv != NULL, 3315 ("pmap_enter: no PV entry for %#lx", va)); 3316 if ((new_l3 & PTE_SW_MANAGED) == 0) 3317 free_pv_entry(pmap, pv); 3318 if ((om->a.flags & PGA_WRITEABLE) != 0 && 3319 TAILQ_EMPTY(&om->md.pv_list) && 3320 ((om->flags & PG_FICTITIOUS) != 0 || 3321 TAILQ_EMPTY(&pa_to_pvh(opa)->pv_list))) 3322 vm_page_aflag_clear(om, PGA_WRITEABLE); 3323 } 3324 pmap_invalidate_page(pmap, va); 3325 orig_l3 = 0; 3326 } else { 3327 /* 3328 * Increment the counters. 3329 */ 3330 if ((new_l3 & PTE_SW_WIRED) != 0) 3331 pmap->pm_stats.wired_count++; 3332 pmap_resident_count_inc(pmap, 1); 3333 } 3334 /* 3335 * Enter on the PV list if part of our managed memory. 3336 */ 3337 if ((new_l3 & PTE_SW_MANAGED) != 0) { 3338 if (pv == NULL) { 3339 pv = get_pv_entry(pmap, &lock); 3340 pv->pv_va = va; 3341 } 3342 CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, pa); 3343 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 3344 m->md.pv_gen++; 3345 if ((new_l3 & PTE_W) != 0) 3346 vm_page_aflag_set(m, PGA_WRITEABLE); 3347 } 3348 3349 validate: 3350 /* 3351 * Sync the i-cache on all harts before updating the PTE 3352 * if the new PTE is executable. 3353 */ 3354 if (prot & VM_PROT_EXECUTE) 3355 pmap_sync_icache(pmap, va, PAGE_SIZE); 3356 3357 /* 3358 * Update the L3 entry. 3359 */ 3360 if (orig_l3 != 0) { 3361 orig_l3 = pmap_load_store(l3, new_l3); 3362 pmap_invalidate_page(pmap, va); 3363 KASSERT(PTE_TO_PHYS(orig_l3) == pa, 3364 ("pmap_enter: invalid update")); 3365 if ((orig_l3 & (PTE_D | PTE_SW_MANAGED)) == 3366 (PTE_D | PTE_SW_MANAGED)) 3367 vm_page_dirty(m); 3368 } else { 3369 pmap_store(l3, new_l3); 3370 } 3371 3372 #if VM_NRESERVLEVEL > 0 3373 if (mpte != NULL && mpte->ref_count == Ln_ENTRIES && 3374 (m->flags & PG_FICTITIOUS) == 0 && 3375 vm_reserv_level_iffullpop(m) == 0) 3376 (void)pmap_promote_l2(pmap, l2, va, mpte, &lock); 3377 #endif 3378 3379 rv = KERN_SUCCESS; 3380 out: 3381 if (lock != NULL) 3382 rw_wunlock(lock); 3383 rw_runlock(&pvh_global_lock); 3384 PMAP_UNLOCK(pmap); 3385 return (rv); 3386 } 3387 3388 /* 3389 * Release a page table page reference after a failed attempt to create a 3390 * mapping. 3391 */ 3392 static void 3393 pmap_abort_ptp(pmap_t pmap, vm_offset_t va, vm_page_t l2pg) 3394 { 3395 struct spglist free; 3396 3397 SLIST_INIT(&free); 3398 if (pmap_unwire_ptp(pmap, va, l2pg, &free)) { 3399 /* 3400 * Although "va" is not mapped, paging-structure 3401 * caches could nonetheless have entries that 3402 * refer to the freed page table pages. 3403 * Invalidate those entries. 3404 */ 3405 pmap_invalidate_page(pmap, va); 3406 vm_page_free_pages_toq(&free, true); 3407 } 3408 } 3409 3410 /* 3411 * Tries to create a read- and/or execute-only 2MB page mapping. Returns 3412 * KERN_SUCCESS if the mapping was created. Otherwise, returns an error 3413 * value. See pmap_enter_l2() for the possible error values when "no sleep", 3414 * "no replace", and "no reclaim" are specified. 
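 *
 * This is used by pmap_enter_object() to opportunistically map 2MB-aligned,
 * superpage-backed ranges.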
3415 */ 3416 static int 3417 pmap_enter_2mpage(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, 3418 struct rwlock **lockp) 3419 { 3420 pd_entry_t new_l2; 3421 pn_t pn; 3422 3423 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3424 3425 pn = VM_PAGE_TO_PHYS(m) / PAGE_SIZE; 3426 new_l2 = (pd_entry_t)((pn << PTE_PPN0_S) | PTE_R | PTE_V | 3427 pmap_memattr_bits(m->md.pv_memattr)); 3428 if ((m->oflags & VPO_UNMANAGED) == 0) 3429 new_l2 |= PTE_SW_MANAGED; 3430 if ((prot & VM_PROT_EXECUTE) != 0) 3431 new_l2 |= PTE_X; 3432 if (va < VM_MAXUSER_ADDRESS) 3433 new_l2 |= PTE_U; 3434 return (pmap_enter_l2(pmap, va, new_l2, PMAP_ENTER_NOSLEEP | 3435 PMAP_ENTER_NOREPLACE | PMAP_ENTER_NORECLAIM, NULL, lockp)); 3436 } 3437 3438 /* 3439 * Returns true if every page table entry in the specified page table is 3440 * zero. 3441 */ 3442 static bool 3443 pmap_every_pte_zero(vm_paddr_t pa) 3444 { 3445 pt_entry_t *pt_end, *pte; 3446 3447 KASSERT((pa & PAGE_MASK) == 0, ("pa is misaligned")); 3448 pte = (pt_entry_t *)PHYS_TO_DMAP(pa); 3449 for (pt_end = pte + Ln_ENTRIES; pte < pt_end; pte++) { 3450 if (*pte != 0) 3451 return (false); 3452 } 3453 return (true); 3454 } 3455 3456 /* 3457 * Tries to create the specified 2MB page mapping. Returns KERN_SUCCESS if 3458 * the mapping was created, and one of KERN_FAILURE, KERN_NO_SPACE, or 3459 * KERN_RESOURCE_SHORTAGE otherwise. Returns KERN_FAILURE if 3460 * PMAP_ENTER_NOREPLACE was specified and a 4KB page mapping already exists 3461 * within the 2MB virtual address range starting at the specified virtual 3462 * address. Returns KERN_NO_SPACE if PMAP_ENTER_NOREPLACE was specified and a 3463 * 2MB page mapping already exists at the specified virtual address. Returns 3464 * KERN_RESOURCE_SHORTAGE if either (1) PMAP_ENTER_NOSLEEP was specified and a 3465 * page table page allocation failed or (2) PMAP_ENTER_NORECLAIM was specified 3466 * and a PV entry allocation failed. 3467 * 3468 * The parameter "m" is only used when creating a managed, writeable mapping. 3469 */ 3470 static int 3471 pmap_enter_l2(pmap_t pmap, vm_offset_t va, pd_entry_t new_l2, u_int flags, 3472 vm_page_t m, struct rwlock **lockp) 3473 { 3474 struct spglist free; 3475 pd_entry_t *l2, *l3, oldl2; 3476 vm_offset_t sva; 3477 vm_page_t l2pg, mt; 3478 vm_page_t uwptpg; 3479 3480 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3481 3482 if ((l2pg = pmap_alloc_l2(pmap, va, (flags & PMAP_ENTER_NOSLEEP) != 0 ? 
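	    /*
	     * With PMAP_ENTER_NOSLEEP, pass a NULL lock pointer so that the
	     * allocation fails instead of sleeping and retrying.
	     */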
3483 NULL : lockp)) == NULL) { 3484 CTR2(KTR_PMAP, "pmap_enter_l2: failed to allocate PT page" 3485 " for va %#lx in pmap %p", va, pmap); 3486 return (KERN_RESOURCE_SHORTAGE); 3487 } 3488 3489 l2 = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(l2pg)); 3490 l2 = &l2[pmap_l2_index(va)]; 3491 if ((oldl2 = pmap_load(l2)) != 0) { 3492 KASSERT(l2pg->ref_count > 1, 3493 ("pmap_enter_l2: l2pg's ref count is too low")); 3494 if ((flags & PMAP_ENTER_NOREPLACE) != 0) { 3495 if ((oldl2 & PTE_RWX) != 0) { 3496 l2pg->ref_count--; 3497 CTR2(KTR_PMAP, 3498 "pmap_enter_l2: no space for va %#lx" 3499 " in pmap %p", va, pmap); 3500 return (KERN_NO_SPACE); 3501 } else if (va < VM_MAXUSER_ADDRESS || 3502 !pmap_every_pte_zero(L2PTE_TO_PHYS(oldl2))) { 3503 l2pg->ref_count--; 3504 CTR2(KTR_PMAP, "pmap_enter_l2:" 3505 " failed to replace existing mapping" 3506 " for va %#lx in pmap %p", va, pmap); 3507 return (KERN_FAILURE); 3508 } 3509 } 3510 SLIST_INIT(&free); 3511 if ((oldl2 & PTE_RWX) != 0) 3512 (void)pmap_remove_l2(pmap, l2, va, 3513 pmap_load(pmap_l1(pmap, va)), &free, lockp); 3514 else 3515 for (sva = va; sva < va + L2_SIZE; sva += PAGE_SIZE) { 3516 l3 = pmap_l2_to_l3(l2, sva); 3517 if ((pmap_load(l3) & PTE_V) != 0 && 3518 pmap_remove_l3(pmap, l3, sva, oldl2, &free, 3519 lockp) != 0) 3520 break; 3521 } 3522 vm_page_free_pages_toq(&free, true); 3523 if (va >= VM_MAXUSER_ADDRESS) { 3524 /* 3525 * Both pmap_remove_l2() and pmap_remove_l3() will 3526 * leave the kernel page table page zero filled. 3527 */ 3528 mt = PTE_TO_VM_PAGE(pmap_load(l2)); 3529 if (pmap_insert_pt_page(pmap, mt, false, false)) 3530 panic("pmap_enter_l2: trie insert failed"); 3531 } else 3532 KASSERT(pmap_load(l2) == 0, 3533 ("pmap_enter_l2: non-zero L2 entry %p", l2)); 3534 } 3535 3536 /* 3537 * Allocate leaf ptpage for wired userspace pages. 3538 */ 3539 uwptpg = NULL; 3540 if ((new_l2 & PTE_SW_WIRED) != 0 && pmap != kernel_pmap) { 3541 uwptpg = vm_page_alloc_noobj(VM_ALLOC_WIRED); 3542 if (uwptpg == NULL) { 3543 pmap_abort_ptp(pmap, va, l2pg); 3544 return (KERN_RESOURCE_SHORTAGE); 3545 } 3546 uwptpg->pindex = pmap_l2_pindex(va); 3547 if (pmap_insert_pt_page(pmap, uwptpg, true, false)) { 3548 vm_page_unwire_noq(uwptpg); 3549 vm_page_free(uwptpg); 3550 pmap_abort_ptp(pmap, va, l2pg); 3551 return (KERN_RESOURCE_SHORTAGE); 3552 } 3553 pmap_resident_count_inc(pmap, 1); 3554 uwptpg->ref_count = Ln_ENTRIES; 3555 } 3556 if ((new_l2 & PTE_SW_MANAGED) != 0) { 3557 /* 3558 * Abort this mapping if its PV entry could not be created. 3559 */ 3560 if (!pmap_pv_insert_l2(pmap, va, new_l2, flags, lockp)) { 3561 pmap_abort_ptp(pmap, va, l2pg); 3562 if (uwptpg != NULL) { 3563 mt = pmap_remove_pt_page(pmap, va); 3564 KASSERT(mt == uwptpg, 3565 ("removed pt page %p, expected %p", mt, 3566 uwptpg)); 3567 pmap_resident_count_dec(pmap, 1); 3568 uwptpg->ref_count = 1; 3569 vm_page_unwire_noq(uwptpg); 3570 vm_page_free(uwptpg); 3571 } 3572 CTR2(KTR_PMAP, 3573 "pmap_enter_l2: failed to create PV entry" 3574 " for va %#lx in pmap %p", va, pmap); 3575 return (KERN_RESOURCE_SHORTAGE); 3576 } 3577 if ((new_l2 & PTE_W) != 0) 3578 for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++) 3579 vm_page_aflag_set(mt, PGA_WRITEABLE); 3580 } 3581 3582 /* 3583 * Increment counters. 3584 */ 3585 if ((new_l2 & PTE_SW_WIRED) != 0) 3586 pmap->pm_stats.wired_count += L2_SIZE / PAGE_SIZE; 3587 pmap->pm_stats.resident_count += L2_SIZE / PAGE_SIZE; 3588 3589 /* 3590 * Map the superpage. 
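	 * No explicit TLB invalidation is issued here; any prior mappings in
	 * the range were invalidated when they were removed above, and a
	 * cached invalid entry is handled as a spurious fault (see
	 * pmap_fault()).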
3591 */ 3592 pmap_store(l2, new_l2); 3593 3594 atomic_add_long(&pmap_l2_mappings, 1); 3595 CTR2(KTR_PMAP, "pmap_enter_l2: success for va %#lx in pmap %p", 3596 va, pmap); 3597 3598 return (KERN_SUCCESS); 3599 } 3600 3601 /* 3602 * Maps a sequence of resident pages belonging to the same object. 3603 * The sequence begins with the given page m_start. This page is 3604 * mapped at the given virtual address start. Each subsequent page is 3605 * mapped at a virtual address that is offset from start by the same 3606 * amount as the page is offset from m_start within the object. The 3607 * last page in the sequence is the page with the largest offset from 3608 * m_start that can be mapped at a virtual address less than the given 3609 * virtual address end. Not every virtual page between start and end 3610 * is mapped; only those for which a resident page exists with the 3611 * corresponding offset from m_start are mapped. 3612 */ 3613 void 3614 pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end, 3615 vm_page_t m_start, vm_prot_t prot) 3616 { 3617 struct rwlock *lock; 3618 vm_offset_t va; 3619 vm_page_t m, mpte; 3620 vm_pindex_t diff, psize; 3621 int rv; 3622 3623 VM_OBJECT_ASSERT_LOCKED(m_start->object); 3624 3625 psize = atop(end - start); 3626 mpte = NULL; 3627 m = m_start; 3628 lock = NULL; 3629 rw_rlock(&pvh_global_lock); 3630 PMAP_LOCK(pmap); 3631 while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) { 3632 va = start + ptoa(diff); 3633 if ((va & L2_OFFSET) == 0 && va + L2_SIZE <= end && 3634 m->psind == 1 && pmap_ps_enabled(pmap) && 3635 ((rv = pmap_enter_2mpage(pmap, va, m, prot, &lock)) == 3636 KERN_SUCCESS || rv == KERN_NO_SPACE)) 3637 m = &m[L2_SIZE / PAGE_SIZE - 1]; 3638 else 3639 mpte = pmap_enter_quick_locked(pmap, va, m, prot, mpte, 3640 &lock); 3641 m = TAILQ_NEXT(m, listq); 3642 } 3643 if (lock != NULL) 3644 rw_wunlock(lock); 3645 rw_runlock(&pvh_global_lock); 3646 PMAP_UNLOCK(pmap); 3647 } 3648 3649 /* 3650 * this code makes some *MAJOR* assumptions: 3651 * 1. Current pmap & pmap exists. 3652 * 2. Not wired. 3653 * 3. Read access. 3654 * 4. No page table pages. 3655 * but is *MUCH* faster than pmap_enter... 3656 */ 3657 3658 void 3659 pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot) 3660 { 3661 struct rwlock *lock; 3662 3663 lock = NULL; 3664 rw_rlock(&pvh_global_lock); 3665 PMAP_LOCK(pmap); 3666 (void)pmap_enter_quick_locked(pmap, va, m, prot, NULL, &lock); 3667 if (lock != NULL) 3668 rw_wunlock(lock); 3669 rw_runlock(&pvh_global_lock); 3670 PMAP_UNLOCK(pmap); 3671 } 3672 3673 static vm_page_t 3674 pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, 3675 vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp) 3676 { 3677 struct spglist free; 3678 pd_entry_t *l2; 3679 pt_entry_t *l3, newl3; 3680 3681 KASSERT(!VA_IS_CLEANMAP(va) || 3682 (m->oflags & VPO_UNMANAGED) != 0, 3683 ("pmap_enter_quick_locked: managed mapping within the clean submap")); 3684 rw_assert(&pvh_global_lock, RA_LOCKED); 3685 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3686 l2 = NULL; 3687 3688 CTR2(KTR_PMAP, "pmap_enter_quick_locked: %p %lx", pmap, va); 3689 /* 3690 * In the case that a page table page is not 3691 * resident, we are creating it here. 
3692 */ 3693 if (va < VM_MAXUSER_ADDRESS) { 3694 vm_pindex_t l2pindex; 3695 3696 /* 3697 * Calculate pagetable page index 3698 */ 3699 l2pindex = pmap_l2_pindex(va); 3700 if (mpte && (mpte->pindex == l2pindex)) { 3701 mpte->ref_count++; 3702 } else { 3703 /* 3704 * Get the l2 entry 3705 */ 3706 l2 = pmap_l2(pmap, va); 3707 3708 /* 3709 * If the page table page is mapped, we just increment 3710 * the hold count, and activate it. Otherwise, we 3711 * attempt to allocate a page table page. If this 3712 * attempt fails, we don't retry. Instead, we give up. 3713 */ 3714 if (l2 != NULL && pmap_load(l2) != 0) { 3715 if ((pmap_load(l2) & PTE_RWX) != 0) 3716 return (NULL); 3717 mpte = PTE_TO_VM_PAGE(pmap_load(l2)); 3718 mpte->ref_count++; 3719 } else { 3720 /* 3721 * Pass NULL instead of the PV list lock 3722 * pointer, because we don't intend to sleep. 3723 */ 3724 mpte = _pmap_alloc_l3(pmap, l2pindex, NULL); 3725 if (mpte == NULL) 3726 return (mpte); 3727 } 3728 } 3729 l3 = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpte)); 3730 l3 = &l3[pmap_l3_index(va)]; 3731 } else { 3732 mpte = NULL; 3733 l3 = pmap_l3(kernel_pmap, va); 3734 } 3735 if (l3 == NULL) 3736 panic("pmap_enter_quick_locked: No l3"); 3737 if (pmap_load(l3) != 0) { 3738 if (mpte != NULL) 3739 mpte->ref_count--; 3740 return (NULL); 3741 } 3742 3743 /* 3744 * Enter on the PV list if part of our managed memory. 3745 */ 3746 if ((m->oflags & VPO_UNMANAGED) == 0 && 3747 !pmap_try_insert_pv_entry(pmap, va, m, lockp)) { 3748 if (mpte != NULL) { 3749 SLIST_INIT(&free); 3750 if (pmap_unwire_ptp(pmap, va, mpte, &free)) 3751 vm_page_free_pages_toq(&free, false); 3752 } 3753 return (NULL); 3754 } 3755 3756 /* 3757 * Increment counters 3758 */ 3759 pmap_resident_count_inc(pmap, 1); 3760 3761 newl3 = ((VM_PAGE_TO_PHYS(m) / PAGE_SIZE) << PTE_PPN0_S) | 3762 PTE_V | PTE_R | pmap_memattr_bits(m->md.pv_memattr); 3763 if ((prot & VM_PROT_EXECUTE) != 0) 3764 newl3 |= PTE_X; 3765 if ((m->oflags & VPO_UNMANAGED) == 0) 3766 newl3 |= PTE_SW_MANAGED; 3767 if (va < VM_MAX_USER_ADDRESS) 3768 newl3 |= PTE_U; 3769 3770 /* 3771 * Sync the i-cache on all harts before updating the PTE 3772 * if the new PTE is executable. 3773 */ 3774 if (prot & VM_PROT_EXECUTE) 3775 pmap_sync_icache(pmap, va, PAGE_SIZE); 3776 3777 pmap_store(l3, newl3); 3778 3779 #if VM_NRESERVLEVEL > 0 3780 /* 3781 * If both the PTP and the reservation are fully populated, then attempt 3782 * promotion. 3783 */ 3784 if ((prot & VM_PROT_NO_PROMOTE) == 0 && 3785 (mpte == NULL || mpte->ref_count == Ln_ENTRIES) && 3786 (m->flags & PG_FICTITIOUS) == 0 && 3787 vm_reserv_level_iffullpop(m) == 0) { 3788 if (l2 == NULL) 3789 l2 = pmap_l2(pmap, va); 3790 3791 /* 3792 * If promotion succeeds, then the next call to this function 3793 * should not be given the unmapped PTP as a hint. 3794 */ 3795 if (pmap_promote_l2(pmap, l2, va, mpte, lockp)) 3796 mpte = NULL; 3797 } 3798 #endif 3799 3800 return (mpte); 3801 } 3802 3803 /* 3804 * This code maps large physical mmap regions into the 3805 * processor address space. Note that some shortcuts 3806 * are taken, but the code works. 3807 */ 3808 void 3809 pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object, 3810 vm_pindex_t pindex, vm_size_t size) 3811 { 3812 3813 VM_OBJECT_ASSERT_WLOCKED(object); 3814 KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG, 3815 ("pmap_object_init_pt: non-device object")); 3816 } 3817 3818 /* 3819 * Clear the wired attribute from the mappings for the specified range of 3820 * addresses in the given pmap. 
Every valid mapping within that range 3821 * must have the wired attribute set. In contrast, invalid mappings 3822 * cannot have the wired attribute set, so they are ignored. 3823 * 3824 * The wired attribute of the page table entry is not a hardware feature, 3825 * so there is no need to invalidate any TLB entries. 3826 */ 3827 void 3828 pmap_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 3829 { 3830 vm_offset_t va_next; 3831 pd_entry_t *l0, *l1, *l2, l2e; 3832 pt_entry_t *l3, l3e; 3833 bool pv_lists_locked; 3834 3835 pv_lists_locked = false; 3836 retry: 3837 PMAP_LOCK(pmap); 3838 for (; sva < eva; sva = va_next) { 3839 if (pmap_mode == PMAP_MODE_SV48) { 3840 l0 = pmap_l0(pmap, sva); 3841 if (pmap_load(l0) == 0) { 3842 va_next = (sva + L0_SIZE) & ~L0_OFFSET; 3843 if (va_next < sva) 3844 va_next = eva; 3845 continue; 3846 } 3847 l1 = pmap_l0_to_l1(l0, sva); 3848 } else { 3849 l1 = pmap_l1(pmap, sva); 3850 } 3851 3852 if (pmap_load(l1) == 0) { 3853 va_next = (sva + L1_SIZE) & ~L1_OFFSET; 3854 if (va_next < sva) 3855 va_next = eva; 3856 continue; 3857 } 3858 3859 va_next = (sva + L2_SIZE) & ~L2_OFFSET; 3860 if (va_next < sva) 3861 va_next = eva; 3862 3863 l2 = pmap_l1_to_l2(l1, sva); 3864 if ((l2e = pmap_load(l2)) == 0) 3865 continue; 3866 if ((l2e & PTE_RWX) != 0) { 3867 if (sva + L2_SIZE == va_next && eva >= va_next) { 3868 if ((l2e & PTE_SW_WIRED) == 0) 3869 panic("pmap_unwire: l2 %#jx is missing " 3870 "PTE_SW_WIRED", (uintmax_t)l2e); 3871 pmap_clear_bits(l2, PTE_SW_WIRED); 3872 continue; 3873 } else { 3874 if (!pv_lists_locked) { 3875 pv_lists_locked = true; 3876 if (!rw_try_rlock(&pvh_global_lock)) { 3877 PMAP_UNLOCK(pmap); 3878 rw_rlock(&pvh_global_lock); 3879 /* Repeat sva. */ 3880 goto retry; 3881 } 3882 } 3883 if (!pmap_demote_l2(pmap, l2, sva)) 3884 panic("pmap_unwire: demotion failed"); 3885 } 3886 } 3887 3888 if (va_next > eva) 3889 va_next = eva; 3890 for (l3 = pmap_l2_to_l3(l2, sva); sva != va_next; l3++, 3891 sva += L3_SIZE) { 3892 if ((l3e = pmap_load(l3)) == 0) 3893 continue; 3894 if ((l3e & PTE_SW_WIRED) == 0) 3895 panic("pmap_unwire: l3 %#jx is missing " 3896 "PTE_SW_WIRED", (uintmax_t)l3e); 3897 3898 /* 3899 * PG_W must be cleared atomically. Although the pmap 3900 * lock synchronizes access to PG_W, another processor 3901 * could be setting PG_M and/or PG_A concurrently. 3902 */ 3903 pmap_clear_bits(l3, PTE_SW_WIRED); 3904 pmap->pm_stats.wired_count--; 3905 } 3906 } 3907 if (pv_lists_locked) 3908 rw_runlock(&pvh_global_lock); 3909 PMAP_UNLOCK(pmap); 3910 } 3911 3912 /* 3913 * Copy the range specified by src_addr/len 3914 * from the source map to the range dst_addr/len 3915 * in the destination map. 3916 * 3917 * This routine is only advisory and need not do anything. 3918 */ 3919 3920 void 3921 pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len, 3922 vm_offset_t src_addr) 3923 { 3924 3925 } 3926 3927 /* 3928 * pmap_zero_page zeros the specified hardware page by mapping 3929 * the page into KVM and using bzero to clear its contents. 3930 */ 3931 void 3932 pmap_zero_page(vm_page_t m) 3933 { 3934 vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); 3935 3936 pagezero((void *)va); 3937 } 3938 3939 /* 3940 * pmap_zero_page_area zeros the specified hardware page by mapping 3941 * the page into KVM and using bzero to clear its contents. 3942 * 3943 * off and size may not cover an area beyond a single hardware page. 
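 *	When the request covers the whole page, the page-sized pagezero()
 *	routine is used; otherwise only the requested subrange is cleared
 *	with bzero().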
3944 */ 3945 void 3946 pmap_zero_page_area(vm_page_t m, int off, int size) 3947 { 3948 vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); 3949 3950 if (off == 0 && size == PAGE_SIZE) 3951 pagezero((void *)va); 3952 else 3953 bzero((char *)va + off, size); 3954 } 3955 3956 /* 3957 * pmap_copy_page copies the specified (machine independent) 3958 * page by mapping the page into virtual memory and using 3959 * bcopy to copy the page, one machine dependent page at a 3960 * time. 3961 */ 3962 void 3963 pmap_copy_page(vm_page_t msrc, vm_page_t mdst) 3964 { 3965 vm_offset_t src = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(msrc)); 3966 vm_offset_t dst = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mdst)); 3967 3968 pagecopy((void *)src, (void *)dst); 3969 } 3970 3971 int unmapped_buf_allowed = 1; 3972 3973 void 3974 pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[], 3975 vm_offset_t b_offset, int xfersize) 3976 { 3977 void *a_cp, *b_cp; 3978 vm_page_t m_a, m_b; 3979 vm_paddr_t p_a, p_b; 3980 vm_offset_t a_pg_offset, b_pg_offset; 3981 int cnt; 3982 3983 while (xfersize > 0) { 3984 a_pg_offset = a_offset & PAGE_MASK; 3985 m_a = ma[a_offset >> PAGE_SHIFT]; 3986 p_a = m_a->phys_addr; 3987 b_pg_offset = b_offset & PAGE_MASK; 3988 m_b = mb[b_offset >> PAGE_SHIFT]; 3989 p_b = m_b->phys_addr; 3990 cnt = min(xfersize, PAGE_SIZE - a_pg_offset); 3991 cnt = min(cnt, PAGE_SIZE - b_pg_offset); 3992 if (__predict_false(!PHYS_IN_DMAP(p_a))) { 3993 panic("!DMAP a %lx", p_a); 3994 } else { 3995 a_cp = (char *)PHYS_TO_DMAP(p_a) + a_pg_offset; 3996 } 3997 if (__predict_false(!PHYS_IN_DMAP(p_b))) { 3998 panic("!DMAP b %lx", p_b); 3999 } else { 4000 b_cp = (char *)PHYS_TO_DMAP(p_b) + b_pg_offset; 4001 } 4002 bcopy(a_cp, b_cp, cnt); 4003 a_offset += cnt; 4004 b_offset += cnt; 4005 xfersize -= cnt; 4006 } 4007 } 4008 4009 vm_offset_t 4010 pmap_quick_enter_page(vm_page_t m) 4011 { 4012 4013 return (PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m))); 4014 } 4015 4016 void 4017 pmap_quick_remove_page(vm_offset_t addr) 4018 { 4019 } 4020 4021 /* 4022 * Returns true if the pmap's pv is one of the first 4023 * 16 pvs linked to from this page. This count may 4024 * be changed upwards or downwards in the future; it 4025 * is only necessary that true be returned for a small 4026 * subset of pmaps for proper page aging. 4027 */ 4028 bool 4029 pmap_page_exists_quick(pmap_t pmap, vm_page_t m) 4030 { 4031 struct md_page *pvh; 4032 struct rwlock *lock; 4033 pv_entry_t pv; 4034 int loops = 0; 4035 bool rv; 4036 4037 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 4038 ("pmap_page_exists_quick: page %p is not managed", m)); 4039 rv = false; 4040 rw_rlock(&pvh_global_lock); 4041 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 4042 rw_rlock(lock); 4043 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 4044 if (PV_PMAP(pv) == pmap) { 4045 rv = true; 4046 break; 4047 } 4048 loops++; 4049 if (loops >= 16) 4050 break; 4051 } 4052 if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) { 4053 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 4054 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 4055 if (PV_PMAP(pv) == pmap) { 4056 rv = true; 4057 break; 4058 } 4059 loops++; 4060 if (loops >= 16) 4061 break; 4062 } 4063 } 4064 rw_runlock(lock); 4065 rw_runlock(&pvh_global_lock); 4066 return (rv); 4067 } 4068 4069 /* 4070 * pmap_page_wired_mappings: 4071 * 4072 * Return the number of managed mappings to the given physical page 4073 * that are wired. 
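 *	Both 4KB mappings on the page's own pv list and 2MB mappings on
 *	the pv list of the containing superpage are checked for
 *	PTE_SW_WIRED.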
4074 */ 4075 int 4076 pmap_page_wired_mappings(vm_page_t m) 4077 { 4078 struct md_page *pvh; 4079 struct rwlock *lock; 4080 pmap_t pmap; 4081 pd_entry_t *l2; 4082 pt_entry_t *l3; 4083 pv_entry_t pv; 4084 int count, md_gen, pvh_gen; 4085 4086 if ((m->oflags & VPO_UNMANAGED) != 0) 4087 return (0); 4088 rw_rlock(&pvh_global_lock); 4089 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 4090 rw_rlock(lock); 4091 restart: 4092 count = 0; 4093 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 4094 pmap = PV_PMAP(pv); 4095 if (!PMAP_TRYLOCK(pmap)) { 4096 md_gen = m->md.pv_gen; 4097 rw_runlock(lock); 4098 PMAP_LOCK(pmap); 4099 rw_rlock(lock); 4100 if (md_gen != m->md.pv_gen) { 4101 PMAP_UNLOCK(pmap); 4102 goto restart; 4103 } 4104 } 4105 l2 = pmap_l2(pmap, pv->pv_va); 4106 KASSERT((pmap_load(l2) & PTE_RWX) == 0, 4107 ("%s: found a 2mpage in page %p's pv list", __func__, m)); 4108 l3 = pmap_l2_to_l3(l2, pv->pv_va); 4109 if ((pmap_load(l3) & PTE_SW_WIRED) != 0) 4110 count++; 4111 PMAP_UNLOCK(pmap); 4112 } 4113 if ((m->flags & PG_FICTITIOUS) == 0) { 4114 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 4115 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 4116 pmap = PV_PMAP(pv); 4117 if (!PMAP_TRYLOCK(pmap)) { 4118 md_gen = m->md.pv_gen; 4119 pvh_gen = pvh->pv_gen; 4120 rw_runlock(lock); 4121 PMAP_LOCK(pmap); 4122 rw_rlock(lock); 4123 if (md_gen != m->md.pv_gen || 4124 pvh_gen != pvh->pv_gen) { 4125 PMAP_UNLOCK(pmap); 4126 goto restart; 4127 } 4128 } 4129 l2 = pmap_l2(pmap, pv->pv_va); 4130 if ((pmap_load(l2) & PTE_SW_WIRED) != 0) 4131 count++; 4132 PMAP_UNLOCK(pmap); 4133 } 4134 } 4135 rw_runlock(lock); 4136 rw_runlock(&pvh_global_lock); 4137 return (count); 4138 } 4139 4140 /* 4141 * Returns true if the given page is mapped individually or as part of 4142 * a 2mpage. Otherwise, returns false. 4143 */ 4144 bool 4145 pmap_page_is_mapped(vm_page_t m) 4146 { 4147 struct rwlock *lock; 4148 bool rv; 4149 4150 if ((m->oflags & VPO_UNMANAGED) != 0) 4151 return (false); 4152 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 4153 rw_rlock(lock); 4154 rv = !TAILQ_EMPTY(&m->md.pv_list) || 4155 ((m->flags & PG_FICTITIOUS) == 0 && 4156 !TAILQ_EMPTY(&pa_to_pvh(VM_PAGE_TO_PHYS(m))->pv_list)); 4157 rw_runlock(lock); 4158 return (rv); 4159 } 4160 4161 static void 4162 pmap_remove_pages_pv(pmap_t pmap, vm_page_t m, pv_entry_t pv, 4163 struct spglist *free, bool superpage) 4164 { 4165 struct md_page *pvh; 4166 vm_page_t mpte, mt; 4167 4168 if (superpage) { 4169 pmap_resident_count_dec(pmap, Ln_ENTRIES); 4170 pvh = pa_to_pvh(m->phys_addr); 4171 TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); 4172 pvh->pv_gen++; 4173 if (TAILQ_EMPTY(&pvh->pv_list)) { 4174 for (mt = m; mt < &m[Ln_ENTRIES]; mt++) 4175 if (TAILQ_EMPTY(&mt->md.pv_list) && 4176 (mt->a.flags & PGA_WRITEABLE) != 0) 4177 vm_page_aflag_clear(mt, PGA_WRITEABLE); 4178 } 4179 mpte = pmap_remove_pt_page(pmap, pv->pv_va); 4180 if (mpte != NULL) { 4181 KASSERT(vm_page_any_valid(mpte), 4182 ("pmap_remove_pages: pte page not promoted")); 4183 pmap_resident_count_dec(pmap, 1); 4184 KASSERT(mpte->ref_count == Ln_ENTRIES, 4185 ("pmap_remove_pages: pte page ref count error")); 4186 mpte->ref_count = 0; 4187 pmap_add_delayed_free_list(mpte, free, false); 4188 } 4189 } else { 4190 pmap_resident_count_dec(pmap, 1); 4191 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 4192 m->md.pv_gen++; 4193 if (TAILQ_EMPTY(&m->md.pv_list) && 4194 (m->a.flags & PGA_WRITEABLE) != 0) { 4195 pvh = pa_to_pvh(m->phys_addr); 4196 if (TAILQ_EMPTY(&pvh->pv_list)) 4197 vm_page_aflag_clear(m, PGA_WRITEABLE); 4198 } 4199 } 4200 } 4201 4202 /* 4203 * Destroy all 
managed, non-wired mappings in the given user-space 4204 * pmap. This pmap cannot be active on any processor besides the 4205 * caller. 4206 * 4207 * This function cannot be applied to the kernel pmap. Moreover, it 4208 * is not intended for general use. It is only to be used during 4209 * process termination. Consequently, it can be implemented in ways 4210 * that make it faster than pmap_remove(). First, it can more quickly 4211 * destroy mappings by iterating over the pmap's collection of PV 4212 * entries, rather than searching the page table. Second, it doesn't 4213 * have to test and clear the page table entries atomically, because 4214 * no processor is currently accessing the user address space. In 4215 * particular, a page table entry's dirty bit won't change state once 4216 * this function starts. 4217 */ 4218 void 4219 pmap_remove_pages(pmap_t pmap) 4220 { 4221 struct spglist free; 4222 pd_entry_t ptepde; 4223 pt_entry_t *pte, tpte; 4224 vm_page_t m, mt; 4225 pv_entry_t pv; 4226 struct pv_chunk *pc, *npc; 4227 struct rwlock *lock; 4228 int64_t bit; 4229 uint64_t inuse, bitmask; 4230 int allfree, field, freed __pv_stat_used, idx; 4231 bool superpage; 4232 4233 lock = NULL; 4234 4235 SLIST_INIT(&free); 4236 rw_rlock(&pvh_global_lock); 4237 PMAP_LOCK(pmap); 4238 TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) { 4239 allfree = 1; 4240 freed = 0; 4241 for (field = 0; field < _NPCM; field++) { 4242 inuse = ~pc->pc_map[field] & pc_freemask[field]; 4243 while (inuse != 0) { 4244 bit = ffsl(inuse) - 1; 4245 bitmask = 1UL << bit; 4246 idx = field * 64 + bit; 4247 pv = &pc->pc_pventry[idx]; 4248 inuse &= ~bitmask; 4249 4250 pte = pmap_l1(pmap, pv->pv_va); 4251 ptepde = pmap_load(pte); 4252 pte = pmap_l1_to_l2(pte, pv->pv_va); 4253 tpte = pmap_load(pte); 4254 4255 KASSERT((tpte & PTE_V) != 0, 4256 ("L2 PTE is invalid... bogus PV entry? " 4257 "va=%#lx, pte=%#lx", pv->pv_va, tpte)); 4258 if ((tpte & PTE_RWX) != 0) { 4259 superpage = true; 4260 } else { 4261 ptepde = tpte; 4262 pte = pmap_l2_to_l3(pte, pv->pv_va); 4263 tpte = pmap_load(pte); 4264 superpage = false; 4265 } 4266 4267 /* 4268 * We cannot remove wired pages from a 4269 * process' mapping at this time. 4270 */ 4271 if (tpte & PTE_SW_WIRED) { 4272 allfree = 0; 4273 continue; 4274 } 4275 4276 m = PTE_TO_VM_PAGE(tpte); 4277 KASSERT((m->flags & PG_FICTITIOUS) != 0 || 4278 m < &vm_page_array[vm_page_array_size], 4279 ("pmap_remove_pages: bad pte %#jx", 4280 (uintmax_t)tpte)); 4281 4282 pmap_clear(pte); 4283 4284 /* 4285 * Update the vm_page_t clean/reference bits. 
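				 * A mapping dirties the page only when both
				 * PTE_D and PTE_W are set; for a superpage
				 * mapping, every constituent 4KB page is
				 * dirtied.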
4286 */ 4287 if ((tpte & (PTE_D | PTE_W)) == 4288 (PTE_D | PTE_W)) { 4289 if (superpage) 4290 for (mt = m; 4291 mt < &m[Ln_ENTRIES]; mt++) 4292 vm_page_dirty(mt); 4293 else 4294 vm_page_dirty(m); 4295 } 4296 4297 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m); 4298 4299 /* Mark free */ 4300 pc->pc_map[field] |= bitmask; 4301 4302 pmap_remove_pages_pv(pmap, m, pv, &free, 4303 superpage); 4304 pmap_unuse_pt(pmap, pv->pv_va, ptepde, &free); 4305 freed++; 4306 } 4307 } 4308 PV_STAT(atomic_add_long(&pv_entry_frees, freed)); 4309 PV_STAT(atomic_add_int(&pv_entry_spare, freed)); 4310 PV_STAT(atomic_subtract_long(&pv_entry_count, freed)); 4311 if (allfree) { 4312 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 4313 free_pv_chunk(pc); 4314 } 4315 } 4316 if (lock != NULL) 4317 rw_wunlock(lock); 4318 pmap_invalidate_all(pmap); 4319 rw_runlock(&pvh_global_lock); 4320 PMAP_UNLOCK(pmap); 4321 vm_page_free_pages_toq(&free, false); 4322 } 4323 4324 static bool 4325 pmap_page_test_mappings(vm_page_t m, bool accessed, bool modified) 4326 { 4327 struct md_page *pvh; 4328 struct rwlock *lock; 4329 pd_entry_t *l2; 4330 pt_entry_t *l3, mask; 4331 pv_entry_t pv; 4332 pmap_t pmap; 4333 int md_gen, pvh_gen; 4334 bool rv; 4335 4336 mask = 0; 4337 if (modified) 4338 mask |= PTE_D; 4339 if (accessed) 4340 mask |= PTE_A; 4341 4342 rv = false; 4343 rw_rlock(&pvh_global_lock); 4344 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 4345 rw_rlock(lock); 4346 restart: 4347 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 4348 pmap = PV_PMAP(pv); 4349 if (!PMAP_TRYLOCK(pmap)) { 4350 md_gen = m->md.pv_gen; 4351 rw_runlock(lock); 4352 PMAP_LOCK(pmap); 4353 rw_rlock(lock); 4354 if (md_gen != m->md.pv_gen) { 4355 PMAP_UNLOCK(pmap); 4356 goto restart; 4357 } 4358 } 4359 l2 = pmap_l2(pmap, pv->pv_va); 4360 KASSERT((pmap_load(l2) & PTE_RWX) == 0, 4361 ("%s: found a 2mpage in page %p's pv list", __func__, m)); 4362 l3 = pmap_l2_to_l3(l2, pv->pv_va); 4363 rv = (pmap_load(l3) & mask) == mask; 4364 PMAP_UNLOCK(pmap); 4365 if (rv) 4366 goto out; 4367 } 4368 if ((m->flags & PG_FICTITIOUS) == 0) { 4369 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 4370 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 4371 pmap = PV_PMAP(pv); 4372 if (!PMAP_TRYLOCK(pmap)) { 4373 md_gen = m->md.pv_gen; 4374 pvh_gen = pvh->pv_gen; 4375 rw_runlock(lock); 4376 PMAP_LOCK(pmap); 4377 rw_rlock(lock); 4378 if (md_gen != m->md.pv_gen || 4379 pvh_gen != pvh->pv_gen) { 4380 PMAP_UNLOCK(pmap); 4381 goto restart; 4382 } 4383 } 4384 l2 = pmap_l2(pmap, pv->pv_va); 4385 rv = (pmap_load(l2) & mask) == mask; 4386 PMAP_UNLOCK(pmap); 4387 if (rv) 4388 goto out; 4389 } 4390 } 4391 out: 4392 rw_runlock(lock); 4393 rw_runlock(&pvh_global_lock); 4394 return (rv); 4395 } 4396 4397 /* 4398 * pmap_is_modified: 4399 * 4400 * Return whether or not the specified physical page was modified 4401 * in any physical maps. 4402 */ 4403 bool 4404 pmap_is_modified(vm_page_t m) 4405 { 4406 4407 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 4408 ("pmap_is_modified: page %p is not managed", m)); 4409 4410 /* 4411 * If the page is not busied then this check is racy. 4412 */ 4413 if (!pmap_page_is_write_mapped(m)) 4414 return (false); 4415 return (pmap_page_test_mappings(m, false, true)); 4416 } 4417 4418 /* 4419 * pmap_is_prefaultable: 4420 * 4421 * Return whether or not the specified virtual address is eligible 4422 * for prefault. 
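 *	An address is eligible when its L3 page table page is resident
 *	but the L3 entry itself is still invalid.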
4423 */ 4424 bool 4425 pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr) 4426 { 4427 pt_entry_t *l3; 4428 bool rv; 4429 4430 /* 4431 * Return true if and only if the L3 entry for the specified virtual 4432 * address is allocated but invalid. 4433 */ 4434 rv = false; 4435 PMAP_LOCK(pmap); 4436 l3 = pmap_l3(pmap, addr); 4437 if (l3 != NULL && pmap_load(l3) == 0) { 4438 rv = true; 4439 } 4440 PMAP_UNLOCK(pmap); 4441 return (rv); 4442 } 4443 4444 /* 4445 * pmap_is_referenced: 4446 * 4447 * Return whether or not the specified physical page was referenced 4448 * in any physical maps. 4449 */ 4450 bool 4451 pmap_is_referenced(vm_page_t m) 4452 { 4453 4454 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 4455 ("pmap_is_referenced: page %p is not managed", m)); 4456 return (pmap_page_test_mappings(m, true, false)); 4457 } 4458 4459 /* 4460 * Clear the write and modified bits in each of the given page's mappings. 4461 */ 4462 void 4463 pmap_remove_write(vm_page_t m) 4464 { 4465 struct md_page *pvh; 4466 struct rwlock *lock; 4467 pmap_t pmap; 4468 pd_entry_t *l2; 4469 pt_entry_t *l3, oldl3, newl3; 4470 pv_entry_t next_pv, pv; 4471 vm_offset_t va; 4472 int md_gen, pvh_gen; 4473 4474 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 4475 ("pmap_remove_write: page %p is not managed", m)); 4476 vm_page_assert_busied(m); 4477 4478 if (!pmap_page_is_write_mapped(m)) 4479 return; 4480 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 4481 pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : 4482 pa_to_pvh(VM_PAGE_TO_PHYS(m)); 4483 rw_rlock(&pvh_global_lock); 4484 retry_pv_loop: 4485 rw_wlock(lock); 4486 TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { 4487 pmap = PV_PMAP(pv); 4488 if (!PMAP_TRYLOCK(pmap)) { 4489 pvh_gen = pvh->pv_gen; 4490 rw_wunlock(lock); 4491 PMAP_LOCK(pmap); 4492 rw_wlock(lock); 4493 if (pvh_gen != pvh->pv_gen) { 4494 PMAP_UNLOCK(pmap); 4495 rw_wunlock(lock); 4496 goto retry_pv_loop; 4497 } 4498 } 4499 va = pv->pv_va; 4500 l2 = pmap_l2(pmap, va); 4501 if ((pmap_load(l2) & PTE_W) != 0) 4502 (void)pmap_demote_l2_locked(pmap, l2, va, &lock); 4503 KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m), 4504 ("inconsistent pv lock %p %p for page %p", 4505 lock, VM_PAGE_TO_PV_LIST_LOCK(m), m)); 4506 PMAP_UNLOCK(pmap); 4507 } 4508 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 4509 pmap = PV_PMAP(pv); 4510 if (!PMAP_TRYLOCK(pmap)) { 4511 pvh_gen = pvh->pv_gen; 4512 md_gen = m->md.pv_gen; 4513 rw_wunlock(lock); 4514 PMAP_LOCK(pmap); 4515 rw_wlock(lock); 4516 if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { 4517 PMAP_UNLOCK(pmap); 4518 rw_wunlock(lock); 4519 goto retry_pv_loop; 4520 } 4521 } 4522 l2 = pmap_l2(pmap, pv->pv_va); 4523 KASSERT((pmap_load(l2) & PTE_RWX) == 0, 4524 ("%s: found a 2mpage in page %p's pv list", __func__, m)); 4525 l3 = pmap_l2_to_l3(l2, pv->pv_va); 4526 oldl3 = pmap_load(l3); 4527 retry: 4528 if ((oldl3 & PTE_W) != 0) { 4529 newl3 = oldl3 & ~(PTE_D | PTE_W); 4530 if (!atomic_fcmpset_long(l3, &oldl3, newl3)) 4531 goto retry; 4532 if ((oldl3 & PTE_D) != 0) 4533 vm_page_dirty(m); 4534 pmap_invalidate_page(pmap, pv->pv_va); 4535 } 4536 PMAP_UNLOCK(pmap); 4537 } 4538 rw_wunlock(lock); 4539 vm_page_aflag_clear(m, PGA_WRITEABLE); 4540 rw_runlock(&pvh_global_lock); 4541 } 4542 4543 /* 4544 * pmap_ts_referenced: 4545 * 4546 * Return a count of reference bits for a page, clearing those bits. 4547 * It is not necessary for every reference bit to be cleared, but it 4548 * is necessary that 0 only be returned when there are truly no 4549 * reference bits set. 
4550 * 4551 * As an optimization, update the page's dirty field if a modified bit is 4552 * found while counting reference bits. This opportunistic update can be 4553 * performed at low cost and can eliminate the need for some future calls 4554 * to pmap_is_modified(). However, since this function stops after 4555 * finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some 4556 * dirty pages. Those dirty pages will only be detected by a future call 4557 * to pmap_is_modified(). 4558 */ 4559 int 4560 pmap_ts_referenced(vm_page_t m) 4561 { 4562 struct spglist free; 4563 struct md_page *pvh; 4564 struct rwlock *lock; 4565 pv_entry_t pv, pvf; 4566 pmap_t pmap; 4567 pd_entry_t *l2, l2e; 4568 pt_entry_t *l3, l3e; 4569 vm_paddr_t pa; 4570 vm_offset_t va; 4571 int cleared, md_gen, not_cleared, pvh_gen; 4572 4573 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 4574 ("pmap_ts_referenced: page %p is not managed", m)); 4575 SLIST_INIT(&free); 4576 cleared = 0; 4577 pa = VM_PAGE_TO_PHYS(m); 4578 pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : pa_to_pvh(pa); 4579 4580 lock = PHYS_TO_PV_LIST_LOCK(pa); 4581 rw_rlock(&pvh_global_lock); 4582 rw_wlock(lock); 4583 retry: 4584 not_cleared = 0; 4585 if ((pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL) 4586 goto small_mappings; 4587 pv = pvf; 4588 do { 4589 pmap = PV_PMAP(pv); 4590 if (!PMAP_TRYLOCK(pmap)) { 4591 pvh_gen = pvh->pv_gen; 4592 rw_wunlock(lock); 4593 PMAP_LOCK(pmap); 4594 rw_wlock(lock); 4595 if (pvh_gen != pvh->pv_gen) { 4596 PMAP_UNLOCK(pmap); 4597 goto retry; 4598 } 4599 } 4600 va = pv->pv_va; 4601 l2 = pmap_l2(pmap, va); 4602 l2e = pmap_load(l2); 4603 if ((l2e & (PTE_W | PTE_D)) == (PTE_W | PTE_D)) { 4604 /* 4605 * Although l2e is mapping a 2MB page, because 4606 * this function is called at a 4KB page granularity, 4607 * we only update the 4KB page under test. 4608 */ 4609 vm_page_dirty(m); 4610 } 4611 if ((l2e & PTE_A) != 0) { 4612 /* 4613 * Since this reference bit is shared by 512 4KB 4614 * pages, it should not be cleared every time it is 4615 * tested. Apply a simple "hash" function on the 4616 * physical page number, the virtual superpage number, 4617 * and the pmap address to select one 4KB page out of 4618 * the 512 on which testing the reference bit will 4619 * result in clearing that reference bit. This 4620 * function is designed to avoid the selection of the 4621 * same 4KB page for every 2MB page mapping. 4622 * 4623 * On demotion, a mapping that hasn't been referenced 4624 * is simply destroyed. To avoid the possibility of a 4625 * subsequent page fault on a demoted wired mapping, 4626 * always leave its reference bit set. Moreover, 4627 * since the superpage is wired, the current state of 4628 * its reference bit won't affect page replacement. 4629 */ 4630 if ((((pa >> PAGE_SHIFT) ^ (pv->pv_va >> L2_SHIFT) ^ 4631 (uintptr_t)pmap) & (Ln_ENTRIES - 1)) == 0 && 4632 (l2e & PTE_SW_WIRED) == 0) { 4633 pmap_clear_bits(l2, PTE_A); 4634 pmap_invalidate_page(pmap, va); 4635 cleared++; 4636 } else 4637 not_cleared++; 4638 } 4639 PMAP_UNLOCK(pmap); 4640 /* Rotate the PV list if it has more than one entry. 
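		 * Moving the just-examined entry to the tail spreads
		 * reference-bit sampling across all of the page's mappings
		 * on successive calls.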
*/ 4641 if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) { 4642 TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); 4643 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); 4644 pvh->pv_gen++; 4645 } 4646 if (cleared + not_cleared >= PMAP_TS_REFERENCED_MAX) 4647 goto out; 4648 } while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf); 4649 small_mappings: 4650 if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL) 4651 goto out; 4652 pv = pvf; 4653 do { 4654 pmap = PV_PMAP(pv); 4655 if (!PMAP_TRYLOCK(pmap)) { 4656 pvh_gen = pvh->pv_gen; 4657 md_gen = m->md.pv_gen; 4658 rw_wunlock(lock); 4659 PMAP_LOCK(pmap); 4660 rw_wlock(lock); 4661 if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { 4662 PMAP_UNLOCK(pmap); 4663 goto retry; 4664 } 4665 } 4666 l2 = pmap_l2(pmap, pv->pv_va); 4667 4668 KASSERT((pmap_load(l2) & PTE_RX) == 0, 4669 ("pmap_ts_referenced: found an invalid l2 table")); 4670 4671 l3 = pmap_l2_to_l3(l2, pv->pv_va); 4672 l3e = pmap_load(l3); 4673 if ((l3e & PTE_D) != 0) 4674 vm_page_dirty(m); 4675 if ((l3e & PTE_A) != 0) { 4676 if ((l3e & PTE_SW_WIRED) == 0) { 4677 /* 4678 * Wired pages cannot be paged out so 4679 * doing accessed bit emulation for 4680 * them is wasted effort. We do the 4681 * hard work for unwired pages only. 4682 */ 4683 pmap_clear_bits(l3, PTE_A); 4684 pmap_invalidate_page(pmap, pv->pv_va); 4685 cleared++; 4686 } else 4687 not_cleared++; 4688 } 4689 PMAP_UNLOCK(pmap); 4690 /* Rotate the PV list if it has more than one entry. */ 4691 if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) { 4692 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 4693 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 4694 m->md.pv_gen++; 4695 } 4696 } while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && cleared + 4697 not_cleared < PMAP_TS_REFERENCED_MAX); 4698 out: 4699 rw_wunlock(lock); 4700 rw_runlock(&pvh_global_lock); 4701 vm_page_free_pages_toq(&free, false); 4702 return (cleared + not_cleared); 4703 } 4704 4705 /* 4706 * Apply the given advice to the specified range of addresses within the 4707 * given pmap. Depending on the advice, clear the referenced and/or 4708 * modified flags in each mapping and set the mapped page's dirty field. 4709 */ 4710 void 4711 pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice) 4712 { 4713 } 4714 4715 /* 4716 * Clear the modify bits on the specified physical page. 4717 */ 4718 void 4719 pmap_clear_modify(vm_page_t m) 4720 { 4721 struct md_page *pvh; 4722 struct rwlock *lock; 4723 pmap_t pmap; 4724 pv_entry_t next_pv, pv; 4725 pd_entry_t *l2, oldl2; 4726 pt_entry_t *l3; 4727 vm_offset_t va; 4728 int md_gen, pvh_gen; 4729 4730 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 4731 ("%s: page %p is not managed", __func__, m)); 4732 vm_page_assert_busied(m); 4733 4734 if (!pmap_page_is_write_mapped(m)) 4735 return; 4736 4737 pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : 4738 pa_to_pvh(VM_PAGE_TO_PHYS(m)); 4739 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 4740 rw_rlock(&pvh_global_lock); 4741 rw_wlock(lock); 4742 restart: 4743 TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { 4744 pmap = PV_PMAP(pv); 4745 if (!PMAP_TRYLOCK(pmap)) { 4746 pvh_gen = pvh->pv_gen; 4747 rw_wunlock(lock); 4748 PMAP_LOCK(pmap); 4749 rw_wlock(lock); 4750 if (pvh_gen != pvh->pv_gen) { 4751 PMAP_UNLOCK(pmap); 4752 goto restart; 4753 } 4754 } 4755 va = pv->pv_va; 4756 l2 = pmap_l2(pmap, va); 4757 oldl2 = pmap_load(l2); 4758 /* If oldl2 has PTE_W set, then it also has PTE_D set. 
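		 * pmap_promote_l2() removes write access from clean, writable
		 * 4KB mappings before promoting, so a writable superpage
		 * mapping is always dirty.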
*/ 4759 if ((oldl2 & PTE_W) != 0 && 4760 pmap_demote_l2_locked(pmap, l2, va, &lock) && 4761 (oldl2 & PTE_SW_WIRED) == 0) { 4762 /* 4763 * Write protect the mapping to a single page so that 4764 * a subsequent write access may repromote. 4765 */ 4766 va += VM_PAGE_TO_PHYS(m) - PTE_TO_PHYS(oldl2); 4767 l3 = pmap_l2_to_l3(l2, va); 4768 pmap_clear_bits(l3, PTE_D | PTE_W); 4769 vm_page_dirty(m); 4770 pmap_invalidate_page(pmap, va); 4771 } 4772 PMAP_UNLOCK(pmap); 4773 } 4774 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 4775 pmap = PV_PMAP(pv); 4776 if (!PMAP_TRYLOCK(pmap)) { 4777 md_gen = m->md.pv_gen; 4778 pvh_gen = pvh->pv_gen; 4779 rw_wunlock(lock); 4780 PMAP_LOCK(pmap); 4781 rw_wlock(lock); 4782 if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { 4783 PMAP_UNLOCK(pmap); 4784 goto restart; 4785 } 4786 } 4787 l2 = pmap_l2(pmap, pv->pv_va); 4788 KASSERT((pmap_load(l2) & PTE_RWX) == 0, 4789 ("%s: found a 2mpage in page %p's pv list", __func__, m)); 4790 l3 = pmap_l2_to_l3(l2, pv->pv_va); 4791 if ((pmap_load(l3) & (PTE_D | PTE_W)) == (PTE_D | PTE_W)) { 4792 pmap_clear_bits(l3, PTE_D | PTE_W); 4793 pmap_invalidate_page(pmap, pv->pv_va); 4794 } 4795 PMAP_UNLOCK(pmap); 4796 } 4797 rw_wunlock(lock); 4798 rw_runlock(&pvh_global_lock); 4799 } 4800 4801 void * 4802 pmap_mapbios(vm_paddr_t pa, vm_size_t size) 4803 { 4804 4805 return ((void *)PHYS_TO_DMAP(pa)); 4806 } 4807 4808 void 4809 pmap_unmapbios(void *p, vm_size_t size) 4810 { 4811 } 4812 4813 /* 4814 * Sets the memory attribute for the specified page. 4815 */ 4816 void 4817 pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma) 4818 { 4819 4820 m->md.pv_memattr = ma; 4821 4822 /* 4823 * If "m" is a normal page, update its direct mapping. This update 4824 * can be relied upon to perform any cache operations that are 4825 * required for data coherence. 4826 */ 4827 if ((m->flags & PG_FICTITIOUS) == 0 && 4828 pmap_change_attr(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)), PAGE_SIZE, 4829 m->md.pv_memattr) != 0) 4830 panic("memory attribute change on the direct map failed"); 4831 } 4832 4833 /* 4834 * Changes the specified virtual address range's memory type to that given by 4835 * the parameter "mode". The specified virtual address range must be 4836 * completely contained within either the direct map or the kernel map. 4837 * 4838 * Returns zero if the change completed successfully, and either EINVAL or 4839 * ENOMEM if the change failed. Specifically, EINVAL is returned if some part 4840 * of the virtual address range was not mapped, and ENOMEM is returned if 4841 * there was insufficient memory available to complete the change. In the 4842 * latter case, the memory type may have been changed on some part of the 4843 * virtual address range. 
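 *
 * The change is applied in two passes: the first validates the range and
 * demotes any 1GB or 2MB mapping that only partially overlaps it, and the
 * second rewrites the attribute bits, recursing so that the direct map
 * aliases of kernel mappings stay consistent.
 *
 * For example, a caller that wants an uncached view of a physical range
 * already covered by the direct map might use:
 *
 *	error = pmap_change_attr(PHYS_TO_DMAP(pa), size,
 *	    VM_MEMATTR_UNCACHEABLE);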
4844 */ 4845 int 4846 pmap_change_attr(vm_offset_t va, vm_size_t size, int mode) 4847 { 4848 int error; 4849 4850 PMAP_LOCK(kernel_pmap); 4851 error = pmap_change_attr_locked(va, size, mode); 4852 PMAP_UNLOCK(kernel_pmap); 4853 return (error); 4854 } 4855 4856 static int 4857 pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode) 4858 { 4859 vm_offset_t base, offset, tmpva; 4860 vm_paddr_t phys; 4861 pd_entry_t *l1, l1e; 4862 pd_entry_t *l2, l2e; 4863 pt_entry_t *l3, l3e; 4864 pt_entry_t bits, mask; 4865 bool anychanged = false; 4866 int error = 0; 4867 4868 PMAP_LOCK_ASSERT(kernel_pmap, MA_OWNED); 4869 base = trunc_page(va); 4870 offset = va & PAGE_MASK; 4871 size = round_page(offset + size); 4872 4873 if (!VIRT_IN_DMAP(base) && 4874 !(base >= VM_MIN_KERNEL_ADDRESS && base < VM_MAX_KERNEL_ADDRESS)) 4875 return (EINVAL); 4876 4877 bits = pmap_memattr_bits(mode); 4878 mask = memattr_mask; 4879 4880 /* First loop: perform PTE validation and demotions as necessary. */ 4881 for (tmpva = base; tmpva < base + size; ) { 4882 l1 = pmap_l1(kernel_pmap, tmpva); 4883 if (l1 == NULL || ((l1e = pmap_load(l1)) & PTE_V) == 0) 4884 return (EINVAL); 4885 if ((l1e & PTE_RWX) != 0) { 4886 /* 4887 * If the existing PTE has the correct attributes, then 4888 * no need to demote. 4889 */ 4890 if ((l1e & mask) == bits) { 4891 tmpva = (tmpva & ~L1_OFFSET) + L1_SIZE; 4892 continue; 4893 } 4894 4895 /* 4896 * If the 1GB page fits in the remaining range, we 4897 * don't need to demote. 4898 */ 4899 if ((tmpva & L1_OFFSET) == 0 && 4900 tmpva + L1_SIZE <= base + size) { 4901 tmpva += L1_SIZE; 4902 continue; 4903 } 4904 4905 if (!pmap_demote_l1(kernel_pmap, l1, tmpva)) 4906 return (EINVAL); 4907 } 4908 l2 = pmap_l1_to_l2(l1, tmpva); 4909 if (((l2e = pmap_load(l2)) & PTE_V) == 0) 4910 return (EINVAL); 4911 if ((l2e & PTE_RWX) != 0) { 4912 /* 4913 * If the existing PTE has the correct attributes, then 4914 * no need to demote. 4915 */ 4916 if ((l2e & mask) == bits) { 4917 tmpva = (tmpva & ~L2_OFFSET) + L2_SIZE; 4918 continue; 4919 } 4920 4921 /* 4922 * If the 2MB page fits in the remaining range, we 4923 * don't need to demote. 4924 */ 4925 if ((tmpva & L2_OFFSET) == 0 && 4926 tmpva + L2_SIZE <= base + size) { 4927 tmpva += L2_SIZE; 4928 continue; 4929 } 4930 4931 if (!pmap_demote_l2(kernel_pmap, l2, tmpva)) 4932 panic("l2 demotion failed"); 4933 } 4934 l3 = pmap_l2_to_l3(l2, tmpva); 4935 if (((l3e = pmap_load(l3)) & PTE_V) == 0) 4936 return (EINVAL); 4937 4938 tmpva += PAGE_SIZE; 4939 } 4940 4941 /* Second loop: perform PTE updates. */ 4942 for (tmpva = base; tmpva < base + size; ) { 4943 l1 = pmap_l1(kernel_pmap, tmpva); 4944 l1e = pmap_load(l1); 4945 if ((l1e & PTE_RWX) != 0) { 4946 /* Unchanged. */ 4947 if ((l1e & mask) == bits) { 4948 tmpva += L1_SIZE; 4949 continue; 4950 } 4951 4952 l1e &= ~mask; 4953 l1e |= bits; 4954 pmap_store(l1, l1e); 4955 anychanged = true; 4956 4957 /* Update corresponding DMAP entry */ 4958 phys = L1PTE_TO_PHYS(l1e); 4959 if (!VIRT_IN_DMAP(tmpva) && PHYS_IN_DMAP(phys)) { 4960 error = pmap_change_attr_locked( 4961 PHYS_TO_DMAP(phys), L1_SIZE, mode); 4962 if (error != 0) 4963 break; 4964 } 4965 tmpva += L1_SIZE; 4966 continue; 4967 } 4968 4969 l2 = pmap_l1_to_l2(l1, tmpva); 4970 l2e = pmap_load(l2); 4971 if ((l2e & PTE_RWX) != 0) { 4972 /* Unchanged. 
*/ 4973 if ((l2e & mask) == bits) { 4974 tmpva += L2_SIZE; 4975 continue; 4976 } 4977 4978 l2e &= ~mask; 4979 l2e |= bits; 4980 pmap_store(l2, l2e); 4981 anychanged = true; 4982 4983 /* Update corresponding DMAP entry */ 4984 phys = L2PTE_TO_PHYS(l2e); 4985 if (!VIRT_IN_DMAP(tmpva) && PHYS_IN_DMAP(phys)) { 4986 error = pmap_change_attr_locked( 4987 PHYS_TO_DMAP(phys), L2_SIZE, mode); 4988 if (error != 0) 4989 break; 4990 } 4991 tmpva += L2_SIZE; 4992 continue; 4993 } 4994 4995 l3 = pmap_l2_to_l3(l2, tmpva); 4996 l3e = pmap_load(l3); 4997 4998 /* Unchanged. */ 4999 if ((l3e & mask) == bits) { 5000 tmpva += PAGE_SIZE; 5001 continue; 5002 } 5003 5004 l3e &= ~mask; 5005 l3e |= bits; 5006 pmap_store(l3, l3e); 5007 anychanged = true; 5008 5009 phys = PTE_TO_PHYS(l3e); 5010 if (!VIRT_IN_DMAP(tmpva) && PHYS_IN_DMAP(phys)) { 5011 error = pmap_change_attr_locked(PHYS_TO_DMAP(phys), 5012 L3_SIZE, mode); 5013 if (error != 0) 5014 break; 5015 } 5016 tmpva += PAGE_SIZE; 5017 } 5018 5019 if (anychanged) { 5020 pmap_invalidate_range(kernel_pmap, base, tmpva); 5021 if (mode == VM_MEMATTR_UNCACHEABLE) 5022 cpu_dcache_wbinv_range(base, size); 5023 } 5024 5025 return (error); 5026 } 5027 5028 /* 5029 * Perform the pmap work for mincore(2). If the page is not both referenced and 5030 * modified by this pmap, returns its physical address so that the caller can 5031 * find other mappings. 5032 */ 5033 int 5034 pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *pap) 5035 { 5036 pt_entry_t *l2, *l3, tpte; 5037 vm_paddr_t pa; 5038 int val; 5039 bool managed; 5040 5041 PMAP_LOCK(pmap); 5042 l2 = pmap_l2(pmap, addr); 5043 if (l2 != NULL && ((tpte = pmap_load(l2)) & PTE_V) != 0) { 5044 if ((tpte & PTE_RWX) != 0) { 5045 pa = PTE_TO_PHYS(tpte) | (addr & L2_OFFSET); 5046 val = MINCORE_INCORE | MINCORE_PSIND(1); 5047 } else { 5048 l3 = pmap_l2_to_l3(l2, addr); 5049 tpte = pmap_load(l3); 5050 if ((tpte & PTE_V) == 0) { 5051 PMAP_UNLOCK(pmap); 5052 return (0); 5053 } 5054 pa = PTE_TO_PHYS(tpte) | (addr & L3_OFFSET); 5055 val = MINCORE_INCORE; 5056 } 5057 5058 if ((tpte & PTE_D) != 0) 5059 val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER; 5060 if ((tpte & PTE_A) != 0) 5061 val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER; 5062 managed = (tpte & PTE_SW_MANAGED) == PTE_SW_MANAGED; 5063 } else { 5064 managed = false; 5065 val = 0; 5066 } 5067 if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) != 5068 (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) && managed) { 5069 *pap = pa; 5070 } 5071 PMAP_UNLOCK(pmap); 5072 return (val); 5073 } 5074 5075 void 5076 pmap_activate_sw(struct thread *td) 5077 { 5078 pmap_t oldpmap, pmap; 5079 u_int hart; 5080 5081 oldpmap = PCPU_GET(curpmap); 5082 pmap = vmspace_pmap(td->td_proc->p_vmspace); 5083 if (pmap == oldpmap) 5084 return; 5085 csr_write(satp, pmap->pm_satp); 5086 5087 hart = PCPU_GET(hart); 5088 #ifdef SMP 5089 CPU_SET_ATOMIC(hart, &pmap->pm_active); 5090 CPU_CLR_ATOMIC(hart, &oldpmap->pm_active); 5091 #else 5092 CPU_SET(hart, &pmap->pm_active); 5093 CPU_CLR(hart, &oldpmap->pm_active); 5094 #endif 5095 PCPU_SET(curpmap, pmap); 5096 5097 sfence_vma(); 5098 } 5099 5100 void 5101 pmap_activate(struct thread *td) 5102 { 5103 5104 critical_enter(); 5105 pmap_activate_sw(td); 5106 critical_exit(); 5107 } 5108 5109 void 5110 pmap_activate_boot(pmap_t pmap) 5111 { 5112 u_int hart; 5113 5114 hart = PCPU_GET(hart); 5115 #ifdef SMP 5116 CPU_SET_ATOMIC(hart, &pmap->pm_active); 5117 #else 5118 CPU_SET(hart, &pmap->pm_active); 5119 #endif 5120 PCPU_SET(curpmap, pmap); 5121 } 
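/*
 * Report the set of harts on which the given pmap is active.
 */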
5122 5123 void 5124 pmap_active_cpus(pmap_t pmap, cpuset_t *res) 5125 { 5126 *res = pmap->pm_active; 5127 } 5128 5129 void 5130 pmap_sync_icache(pmap_t pmap, vm_offset_t va, vm_size_t sz) 5131 { 5132 cpuset_t mask; 5133 5134 /* 5135 * From the RISC-V User-Level ISA V2.2: 5136 * 5137 * "To make a store to instruction memory visible to all 5138 * RISC-V harts, the writing hart has to execute a data FENCE 5139 * before requesting that all remote RISC-V harts execute a 5140 * FENCE.I." 5141 * 5142 * However, this is slightly misleading; we still need to 5143 * perform a FENCE.I for the local hart, as FENCE does nothing 5144 * for its icache. FENCE.I alone is also sufficient for the 5145 * local hart. 5146 */ 5147 sched_pin(); 5148 mask = all_harts; 5149 CPU_CLR(PCPU_GET(hart), &mask); 5150 fence_i(); 5151 if (!CPU_EMPTY(&mask) && smp_started) { 5152 fence(); 5153 sbi_remote_fence_i(mask.__bits); 5154 } 5155 sched_unpin(); 5156 } 5157 5158 /* 5159 * Increase the starting virtual address of the given mapping if a 5160 * different alignment might result in more superpage mappings. 5161 */ 5162 void 5163 pmap_align_superpage(vm_object_t object, vm_ooffset_t offset, 5164 vm_offset_t *addr, vm_size_t size) 5165 { 5166 vm_offset_t superpage_offset; 5167 5168 if (size < L2_SIZE) 5169 return; 5170 if (object != NULL && (object->flags & OBJ_COLORED) != 0) 5171 offset += ptoa(object->pg_color); 5172 superpage_offset = offset & L2_OFFSET; 5173 if (size - ((L2_SIZE - superpage_offset) & L2_OFFSET) < L2_SIZE || 5174 (*addr & L2_OFFSET) == superpage_offset) 5175 return; 5176 if ((*addr & L2_OFFSET) < superpage_offset) 5177 *addr = (*addr & ~L2_OFFSET) + superpage_offset; 5178 else 5179 *addr = ((*addr + L2_OFFSET) & ~L2_OFFSET) + superpage_offset; 5180 } 5181 5182 /** 5183 * Get the kernel virtual address of a set of physical pages. If there are 5184 * physical addresses not covered by the DMAP perform a transient mapping 5185 * that will be removed when calling pmap_unmap_io_transient. 5186 * 5187 * \param page The pages the caller wishes to obtain the virtual 5188 * address on the kernel memory map. 5189 * \param vaddr On return contains the kernel virtual memory address 5190 * of the pages passed in the page parameter. 5191 * \param count Number of pages passed in. 5192 * \param can_fault true if the thread using the mapped pages can take 5193 * page faults, false otherwise. 5194 * 5195 * \returns true if the caller must call pmap_unmap_io_transient when 5196 * finished or false otherwise. 5197 * 5198 */ 5199 bool 5200 pmap_map_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count, 5201 bool can_fault) 5202 { 5203 vm_paddr_t paddr; 5204 bool needs_mapping; 5205 int error __diagused, i; 5206 5207 /* 5208 * Allocate any KVA space that we need, this is done in a separate 5209 * loop to prevent calling vmem_alloc while pinned. 
5210 */ 5211 needs_mapping = false; 5212 for (i = 0; i < count; i++) { 5213 paddr = VM_PAGE_TO_PHYS(page[i]); 5214 if (__predict_false(paddr >= DMAP_MAX_PHYSADDR)) { 5215 error = vmem_alloc(kernel_arena, PAGE_SIZE, 5216 M_BESTFIT | M_WAITOK, &vaddr[i]); 5217 KASSERT(error == 0, ("vmem_alloc failed: %d", error)); 5218 needs_mapping = true; 5219 } else { 5220 vaddr[i] = PHYS_TO_DMAP(paddr); 5221 } 5222 } 5223 5224 /* Exit early if everything is covered by the DMAP */ 5225 if (!needs_mapping) 5226 return (false); 5227 5228 if (!can_fault) 5229 sched_pin(); 5230 for (i = 0; i < count; i++) { 5231 paddr = VM_PAGE_TO_PHYS(page[i]); 5232 if (paddr >= DMAP_MAX_PHYSADDR) { 5233 panic( 5234 "pmap_map_io_transient: TODO: Map out of DMAP data"); 5235 } 5236 } 5237 5238 return (needs_mapping); 5239 } 5240 5241 void 5242 pmap_unmap_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count, 5243 bool can_fault) 5244 { 5245 vm_paddr_t paddr; 5246 int i; 5247 5248 if (!can_fault) 5249 sched_unpin(); 5250 for (i = 0; i < count; i++) { 5251 paddr = VM_PAGE_TO_PHYS(page[i]); 5252 if (paddr >= DMAP_MAX_PHYSADDR) { 5253 panic("RISCVTODO: pmap_unmap_io_transient: Unmap data"); 5254 } 5255 } 5256 } 5257 5258 bool 5259 pmap_is_valid_memattr(pmap_t pmap __unused, vm_memattr_t mode) 5260 { 5261 5262 return (mode >= VM_MEMATTR_DEFAULT && mode <= VM_MEMATTR_LAST); 5263 } 5264 5265 bool 5266 pmap_get_tables(pmap_t pmap, vm_offset_t va, pd_entry_t **l1, pd_entry_t **l2, 5267 pt_entry_t **l3) 5268 { 5269 pd_entry_t *l1p, *l2p; 5270 5271 /* Get l1 directory entry. */ 5272 l1p = pmap_l1(pmap, va); 5273 *l1 = l1p; 5274 5275 if (l1p == NULL || (pmap_load(l1p) & PTE_V) == 0) 5276 return (false); 5277 5278 if ((pmap_load(l1p) & PTE_RX) != 0) { 5279 *l2 = NULL; 5280 *l3 = NULL; 5281 return (true); 5282 } 5283 5284 /* Get l2 directory entry. */ 5285 l2p = pmap_l1_to_l2(l1p, va); 5286 *l2 = l2p; 5287 5288 if (l2p == NULL || (pmap_load(l2p) & PTE_V) == 0) 5289 return (false); 5290 5291 if ((pmap_load(l2p) & PTE_RX) != 0) { 5292 *l3 = NULL; 5293 return (true); 5294 } 5295 5296 /* Get l3 page table entry. */ 5297 *l3 = pmap_l2_to_l3(l2p, va); 5298 5299 return (true); 5300 } 5301 5302 /* 5303 * Track a range of the kernel's virtual address space that is contiguous 5304 * in various mapping attributes. 5305 */ 5306 struct pmap_kernel_map_range { 5307 vm_offset_t sva; 5308 pt_entry_t attrs; 5309 int l3pages; 5310 int l2pages; 5311 int l1pages; 5312 }; 5313 5314 static void 5315 sysctl_kmaps_dump(struct sbuf *sb, struct pmap_kernel_map_range *range, 5316 vm_offset_t eva) 5317 { 5318 char *mode; 5319 int i; 5320 5321 if (eva <= range->sva) 5322 return; 5323 5324 for (i = 0; i < nitems(memattr_bits); i++) 5325 if ((range->attrs & memattr_mask) == memattr_bits[i]) 5326 break; 5327 5328 switch (i) { 5329 case VM_MEMATTR_PMA: 5330 mode = "PMA"; 5331 break; 5332 case VM_MEMATTR_UNCACHEABLE: 5333 mode = "NC "; 5334 break; 5335 case VM_MEMATTR_DEVICE: 5336 mode = "IO "; 5337 break; 5338 default: 5339 mode = "???"; 5340 break; 5341 } 5342 5343 sbuf_printf(sb, "0x%016lx-0x%016lx r%c%c%c%c %s %d %d %d\n", 5344 range->sva, eva, 5345 (range->attrs & PTE_W) == PTE_W ? 'w' : '-', 5346 (range->attrs & PTE_X) == PTE_X ? 'x' : '-', 5347 (range->attrs & PTE_U) == PTE_U ? 'u' : 's', 5348 (range->attrs & PTE_G) == PTE_G ? 'g' : '-', 5349 mode, range->l1pages, range->l2pages, range->l3pages); 5350 5351 /* Reset to sentinel value. 
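	 * With the sentinel in place, the next leaf PTE visited will start
	 * a new range in sysctl_kmaps_check().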
*/ 5352 range->sva = 0xfffffffffffffffful; 5353 } 5354 5355 /* 5356 * Determine whether the attributes specified by a page table entry match those 5357 * being tracked by the current range. 5358 */ 5359 static bool 5360 sysctl_kmaps_match(struct pmap_kernel_map_range *range, pt_entry_t attrs) 5361 { 5362 5363 return (range->attrs == attrs); 5364 } 5365 5366 static void 5367 sysctl_kmaps_reinit(struct pmap_kernel_map_range *range, vm_offset_t va, 5368 pt_entry_t attrs) 5369 { 5370 5371 memset(range, 0, sizeof(*range)); 5372 range->sva = va; 5373 range->attrs = attrs; 5374 } 5375 5376 /* 5377 * Given a leaf PTE, derive the mapping's attributes. If they do not match 5378 * those of the current run, dump the address range and its attributes, and 5379 * begin a new run. 5380 */ 5381 static void 5382 sysctl_kmaps_check(struct sbuf *sb, struct pmap_kernel_map_range *range, 5383 vm_offset_t va, pd_entry_t l1e, pd_entry_t l2e, pt_entry_t l3e) 5384 { 5385 pt_entry_t attrs; 5386 5387 /* The PTE global bit is inherited by lower levels. */ 5388 attrs = l1e & PTE_G; 5389 if ((l1e & PTE_RWX) != 0) { 5390 attrs |= l1e & (PTE_RWX | PTE_U); 5391 attrs |= l1e & memattr_mask; 5392 } else if (l2e != 0) 5393 attrs |= l2e & PTE_G; 5394 5395 if ((l2e & PTE_RWX) != 0) { 5396 attrs |= l2e & (PTE_RWX | PTE_U); 5397 attrs |= l2e & memattr_mask; 5398 } else if (l3e != 0) { 5399 attrs |= l3e & (PTE_RWX | PTE_U | PTE_G); 5400 attrs |= l3e & memattr_mask; 5401 } 5402 5403 if (range->sva > va || !sysctl_kmaps_match(range, attrs)) { 5404 sysctl_kmaps_dump(sb, range, va); 5405 sysctl_kmaps_reinit(range, va, attrs); 5406 } 5407 } 5408 5409 static int 5410 sysctl_kmaps(SYSCTL_HANDLER_ARGS) 5411 { 5412 struct pmap_kernel_map_range range; 5413 struct sbuf sbuf, *sb; 5414 pd_entry_t *l1, l1e, *l2, l2e; 5415 pt_entry_t *l3, l3e; 5416 vm_offset_t sva; 5417 vm_paddr_t pa; 5418 int error, i, j, k; 5419 5420 error = sysctl_wire_old_buffer(req, 0); 5421 if (error != 0) 5422 return (error); 5423 sb = &sbuf; 5424 sbuf_new_for_sysctl(sb, NULL, PAGE_SIZE, req); 5425 5426 /* Sentinel value. */ 5427 range.sva = 0xfffffffffffffffful; 5428 5429 /* 5430 * Iterate over the kernel page tables without holding the kernel pmap 5431 * lock. Kernel page table pages are never freed, so at worst we will 5432 * observe inconsistencies in the output. 
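	 * The walk starts at VM_MIN_KERNEL_ADDRESS and coalesces leaf
	 * mappings with identical attributes into ranges; the result is
	 * exported as the vm.pmap.kernel_maps sysctl.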
5433 */ 5434 sva = VM_MIN_KERNEL_ADDRESS; 5435 for (i = pmap_l1_index(sva); i < Ln_ENTRIES; i++) { 5436 if (i == pmap_l1_index(DMAP_MIN_ADDRESS)) 5437 sbuf_printf(sb, "\nDirect map:\n"); 5438 else if (i == pmap_l1_index(VM_MIN_KERNEL_ADDRESS)) 5439 sbuf_printf(sb, "\nKernel map:\n"); 5440 5441 l1 = pmap_l1(kernel_pmap, sva); 5442 l1e = pmap_load(l1); 5443 if ((l1e & PTE_V) == 0) { 5444 sysctl_kmaps_dump(sb, &range, sva); 5445 sva += L1_SIZE; 5446 continue; 5447 } 5448 if ((l1e & PTE_RWX) != 0) { 5449 sysctl_kmaps_check(sb, &range, sva, l1e, 0, 0); 5450 range.l1pages++; 5451 sva += L1_SIZE; 5452 continue; 5453 } 5454 pa = PTE_TO_PHYS(l1e); 5455 l2 = (pd_entry_t *)PHYS_TO_DMAP(pa); 5456 5457 for (j = pmap_l2_index(sva); j < Ln_ENTRIES; j++) { 5458 l2e = l2[j]; 5459 if ((l2e & PTE_V) == 0) { 5460 sysctl_kmaps_dump(sb, &range, sva); 5461 sva += L2_SIZE; 5462 continue; 5463 } 5464 if ((l2e & PTE_RWX) != 0) { 5465 sysctl_kmaps_check(sb, &range, sva, l1e, l2e, 0); 5466 range.l2pages++; 5467 sva += L2_SIZE; 5468 continue; 5469 } 5470 pa = PTE_TO_PHYS(l2e); 5471 l3 = (pd_entry_t *)PHYS_TO_DMAP(pa); 5472 5473 for (k = pmap_l3_index(sva); k < Ln_ENTRIES; k++, 5474 sva += L3_SIZE) { 5475 l3e = l3[k]; 5476 if ((l3e & PTE_V) == 0) { 5477 sysctl_kmaps_dump(sb, &range, sva); 5478 continue; 5479 } 5480 sysctl_kmaps_check(sb, &range, sva, 5481 l1e, l2e, l3e); 5482 range.l3pages++; 5483 } 5484 } 5485 } 5486 5487 error = sbuf_finish(sb); 5488 sbuf_delete(sb); 5489 return (error); 5490 } 5491 SYSCTL_OID(_vm_pmap, OID_AUTO, kernel_maps, 5492 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE | CTLFLAG_SKIP, 5493 NULL, 0, sysctl_kmaps, "A", 5494 "Dump kernel address layout"); 5495