1 /*- 2 * SPDX-License-Identifier: BSD-4-Clause 3 * 4 * Copyright (c) 1991 Regents of the University of California. 5 * All rights reserved. 6 * Copyright (c) 1994 John S. Dyson 7 * All rights reserved. 8 * Copyright (c) 1994 David Greenman 9 * All rights reserved. 10 * Copyright (c) 2003 Peter Wemm 11 * All rights reserved. 12 * Copyright (c) 2005-2010 Alan L. Cox <alc@cs.rice.edu> 13 * All rights reserved. 14 * Copyright (c) 2014 Andrew Turner 15 * All rights reserved. 16 * Copyright (c) 2014 The FreeBSD Foundation 17 * All rights reserved. 18 * Copyright (c) 2015-2018 Ruslan Bukin <br@bsdpad.com> 19 * All rights reserved. 20 * 21 * This code is derived from software contributed to Berkeley by 22 * the Systems Programming Group of the University of Utah Computer 23 * Science Department and William Jolitz of UUNET Technologies Inc. 24 * 25 * Portions of this software were developed by Andrew Turner under 26 * sponsorship from The FreeBSD Foundation. 27 * 28 * Portions of this software were developed by SRI International and the 29 * University of Cambridge Computer Laboratory under DARPA/AFRL contract 30 * FA8750-10-C-0237 ("CTSRD"), as part of the DARPA CRASH research programme. 31 * 32 * Portions of this software were developed by the University of Cambridge 33 * Computer Laboratory as part of the CTSRD Project, with support from the 34 * UK Higher Education Innovation Fund (HEIF). 35 * 36 * Redistribution and use in source and binary forms, with or without 37 * modification, are permitted provided that the following conditions 38 * are met: 39 * 1. Redistributions of source code must retain the above copyright 40 * notice, this list of conditions and the following disclaimer. 41 * 2. Redistributions in binary form must reproduce the above copyright 42 * notice, this list of conditions and the following disclaimer in the 43 * documentation and/or other materials provided with the distribution. 44 * 3. All advertising materials mentioning features or use of this software 45 * must display the following acknowledgement: 46 * This product includes software developed by the University of 47 * California, Berkeley and its contributors. 48 * 4. Neither the name of the University nor the names of its contributors 49 * may be used to endorse or promote products derived from this software 50 * without specific prior written permission. 51 * 52 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 53 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 54 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 55 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 56 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 57 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 58 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 59 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 60 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 61 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 62 * SUCH DAMAGE. 63 */ 64 /*- 65 * Copyright (c) 2003 Networks Associates Technology, Inc. 66 * All rights reserved. 67 * 68 * This software was developed for the FreeBSD Project by Jake Burkholder, 69 * Safeport Network Services, and Network Associates Laboratories, the 70 * Security Research Division of Network Associates, Inc. 
under
 * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA
 * CHATS research program.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * Manages physical address maps.
 *
 * Since the information managed by this module is
 * also stored by the logical address mapping module,
 * this module may throw away valid virtual-to-physical
 * mappings at almost any time.  However, invalidations
 * of virtual-to-physical mappings must be done as
 * requested.
 *
 * In order to cope with hardware architectures that
 * make virtual-to-physical map invalidations expensive,
 * this module may delay invalidation or protection-reduction
 * operations until such time as they are actually
 * necessary.  This module is given full information as
 * to which processors are currently using which maps,
 * and to when physical maps must be made correct.
 */

#include "opt_pmap.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bitstring.h>
#include <sys/bus.h>
#include <sys/cpuset.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mman.h>
#include <sys/msgbuf.h>
#include <sys/mutex.h>
#include <sys/physmem.h>
#include <sys/proc.h>
#include <sys/rwlock.h>
#include <sys/sbuf.h>
#include <sys/sx.h>
#include <sys/vmem.h>
#include <sys/vmmeter.h>
#include <sys/sched.h>
#include <sys/sysctl.h>
#include <sys/smp.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_kern.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_extern.h>
#include <vm/vm_pageout.h>
#include <vm/vm_pager.h>
#include <vm/vm_phys.h>
#include <vm/vm_radix.h>
#include <vm/vm_reserv.h>
#include <vm/vm_dumpset.h>
#include <vm/uma.h>

#include <machine/machdep.h>
#include <machine/md_var.h>
#include <machine/pcb.h>
#include <machine/sbi.h>

/*
 * Boundary values for the page table page index space:
 *
 *	L3 pages: [0, NUL2E)
 *	L2 pages: [NUL2E, NUL2E + NUL1E)
 *	L1 pages: [NUL2E + NUL1E, NUL2E + NUL1E + NUL0E)
 *
 * Note that these ranges are used in both SV39 and SV48 mode.  In SV39 mode
 * the ranges are not fully populated since there are at most Ln_ENTRIES^2
 * L3 pages in a set of page tables.
 */
#define	NUL0E		Ln_ENTRIES
#define	NUL1E		(Ln_ENTRIES * NUL0E)
#define	NUL2E		(Ln_ENTRIES * NUL1E)

#ifdef PV_STATS
#define	PV_STAT(x)		do { x ; } while (0)
#define	__pv_stat_used
#else
#define	PV_STAT(x)		do { } while (0)
#define	__pv_stat_used		__unused
#endif

#define	pmap_l1_pindex(v)	(NUL2E + ((v) >> L1_SHIFT))
#define	pmap_l2_pindex(v)	((v) >> L2_SHIFT)
#define	pa_to_pvh(pa)		(&pv_table[pa_index(pa)])
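/*
 * A worked example for the bounds above, assuming Ln_ENTRIES == 512 (4KB
 * pages with 8-byte PTEs):
 *
 *	NUL0E = 512				L1 page table pages
 *	NUL1E = 512 * 512       =    262,144	L2 page table pages
 *	NUL2E = 512 * 512 * 512 = 134,217,728	L3 page table pages
 *
 * pmap_l2_pindex(va) gives the index of the L3 page table page (the page of
 * leaf PTEs) mapping va, while pmap_l1_pindex(va) gives the index of the
 * corresponding L2 page, offset past the whole L3 range.  Indices at or
 * above NUL2E + NUL1E only arise in SV48 mode, where L1 page table pages are
 * themselves allocated dynamically.
 */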
#define	NPV_LIST_LOCKS	MAXCPU

#define	PHYS_TO_PV_LIST_LOCK(pa)	\
	(&pv_list_locks[pmap_l2_pindex(pa) % NPV_LIST_LOCKS])

#define	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa)	do {	\
	struct rwlock **_lockp = (lockp);		\
	struct rwlock *_new_lock;			\
							\
	_new_lock = PHYS_TO_PV_LIST_LOCK(pa);		\
	if (_new_lock != *_lockp) {			\
		if (*_lockp != NULL)			\
			rw_wunlock(*_lockp);		\
		*_lockp = _new_lock;			\
		rw_wlock(*_lockp);			\
	}						\
} while (0)

#define	CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m)	\
	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, VM_PAGE_TO_PHYS(m))

#define	RELEASE_PV_LIST_LOCK(lockp)		do {	\
	struct rwlock **_lockp = (lockp);		\
							\
	if (*_lockp != NULL) {				\
		rw_wunlock(*_lockp);			\
		*_lockp = NULL;				\
	}						\
} while (0)

#define	VM_PAGE_TO_PV_LIST_LOCK(m)	\
	PHYS_TO_PV_LIST_LOCK(VM_PAGE_TO_PHYS(m))
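/*
 * A minimal usage sketch for the macros above (illustrative only; the real
 * callers appear later in this file): code that iterates over mappings
 * holds at most one PV list lock at a time, switching locks only when it
 * crosses into a 2MB region that hashes to a different lock.
 *
 *	struct rwlock *lock = NULL;
 *
 *	CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m);
 *	... manipulate m's PV list ...
 *	RELEASE_PV_LIST_LOCK(&lock);
 */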
static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
    "VM/pmap parameters");

/* The list of all the user pmaps */
LIST_HEAD(pmaplist, pmap);
static struct pmaplist allpmaps = LIST_HEAD_INITIALIZER();

enum pmap_mode __read_frequently pmap_mode = PMAP_MODE_SV39;
SYSCTL_INT(_vm_pmap, OID_AUTO, mode, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
    &pmap_mode, 0,
    "translation mode, 0 = SV39, 1 = SV48");

struct pmap kernel_pmap_store;

vm_offset_t virtual_avail;	/* VA of first avail page (after kernel bss) */
vm_offset_t virtual_end;	/* VA of last avail page (end of kernel AS) */
vm_offset_t kernel_vm_end = 0;

vm_paddr_t dmap_phys_base;	/* The start of the dmap region */
vm_paddr_t dmap_phys_max;	/* The limit of the dmap region */
vm_offset_t dmap_max_addr;	/* The virtual address limit of the dmap */

/* This code assumes all L1 DMAP entries will be used */
CTASSERT((DMAP_MIN_ADDRESS & ~L1_OFFSET) == DMAP_MIN_ADDRESS);
CTASSERT((DMAP_MAX_ADDRESS & ~L1_OFFSET) == DMAP_MAX_ADDRESS);

/*
 * This code assumes that the early DEVMAP is L2_SIZE aligned.
 */
CTASSERT((PMAP_MAPDEV_EARLY_SIZE & L2_OFFSET) == 0);

static struct rwlock_padalign pvh_global_lock;
static struct mtx_padalign allpmaps_lock;

static int __read_frequently superpages_enabled = 1;
SYSCTL_INT(_vm_pmap, OID_AUTO, superpages_enabled,
    CTLFLAG_RDTUN, &superpages_enabled, 0,
    "Enable support for transparent superpages");

static SYSCTL_NODE(_vm_pmap, OID_AUTO, l2, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
    "2MB page mapping counters");

static u_long pmap_l2_demotions;
SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, demotions, CTLFLAG_RD,
    &pmap_l2_demotions, 0,
    "2MB page demotions");

static u_long pmap_l2_mappings;
SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, mappings, CTLFLAG_RD,
    &pmap_l2_mappings, 0,
    "2MB page mappings");

static u_long pmap_l2_p_failures;
SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, p_failures, CTLFLAG_RD,
    &pmap_l2_p_failures, 0,
    "2MB page promotion failures");

static u_long pmap_l2_promotions;
SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, promotions, CTLFLAG_RD,
    &pmap_l2_promotions, 0,
    "2MB page promotions");

static SYSCTL_NODE(_vm_pmap, OID_AUTO, l1, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
    "L1 (1GB) page mapping counters");

static COUNTER_U64_DEFINE_EARLY(pmap_l1_demotions);
SYSCTL_COUNTER_U64(_vm_pmap_l1, OID_AUTO, demotions, CTLFLAG_RD,
    &pmap_l1_demotions, "L1 (1GB) page demotions");

/*
 * Data for the pv entry allocation mechanism
 */
static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks);
static struct mtx pv_chunks_mutex;
static struct rwlock pv_list_locks[NPV_LIST_LOCKS];
static struct md_page *pv_table;
static struct md_page pv_dummy;

extern cpuset_t all_harts;

/*
 * Internal flags for pmap_enter()'s helper functions.
 */
#define	PMAP_ENTER_NORECLAIM	0x1000000	/* Don't reclaim PV entries. */
#define	PMAP_ENTER_NOREPLACE	0x2000000	/* Don't replace mappings. */

static void	free_pv_chunk(struct pv_chunk *pc);
static void	free_pv_entry(pmap_t pmap, pv_entry_t pv);
static pv_entry_t get_pv_entry(pmap_t pmap, struct rwlock **lockp);
static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp);
static void	pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va);
static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap,
		    vm_offset_t va);
static bool	pmap_demote_l1(pmap_t pmap, pd_entry_t *l1, vm_offset_t va);
static bool	pmap_demote_l2(pmap_t pmap, pd_entry_t *l2, vm_offset_t va);
static bool	pmap_demote_l2_locked(pmap_t pmap, pd_entry_t *l2,
		    vm_offset_t va, struct rwlock **lockp);
static int	pmap_enter_l2(pmap_t pmap, vm_offset_t va, pd_entry_t new_l2,
		    u_int flags, vm_page_t m, struct rwlock **lockp);
static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va,
		    vm_page_t m, vm_prot_t prot, vm_page_t mpte,
		    struct rwlock **lockp);
static int	pmap_remove_l3(pmap_t pmap, pt_entry_t *l3, vm_offset_t sva,
		    pd_entry_t ptepde, struct spglist *free,
		    struct rwlock **lockp);
static bool	pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va,
		    vm_page_t m, struct rwlock **lockp);

static vm_page_t _pmap_alloc_l3(pmap_t pmap, vm_pindex_t ptepindex,
		    struct rwlock **lockp);

static void	_pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m,
		    struct spglist *free);
static int	pmap_unuse_pt(pmap_t, vm_offset_t, pd_entry_t, struct spglist *);

static int	pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode);

static uint64_t	pmap_satp_mode(void);

#define	pmap_clear(pte)			pmap_store(pte, 0)
#define	pmap_clear_bits(pte, bits)	atomic_clear_64(pte, bits)
#define	pmap_load_store(pte, entry)	atomic_swap_64(pte, entry)
#define	pmap_load_clear(pte)		pmap_load_store(pte, 0)
#define	pmap_load(pte)			atomic_load_64(pte)
#define	pmap_store(pte, entry)		atomic_store_64(pte, entry)
#define	pmap_store_bits(pte, bits)	atomic_set_64(pte, bits)

/********************/
/* Inline functions */
/********************/

static __inline void
pagecopy(void *s, void *d)
{

	memcpy(d, s, PAGE_SIZE);
}

static __inline void
pagezero(void *p)
{

	bzero(p, PAGE_SIZE);
}

#define	pmap_l0_index(va)	(((va) >> L0_SHIFT) & Ln_ADDR_MASK)
#define	pmap_l1_index(va)	(((va) >> L1_SHIFT) & Ln_ADDR_MASK)
#define	pmap_l2_index(va)	(((va) >> L2_SHIFT) & Ln_ADDR_MASK)
#define	pmap_l3_index(va)	(((va) >> L3_SHIFT) & Ln_ADDR_MASK)

#define	PTE_TO_PHYS(pte) \
    ((((pte) & ~PTE_HI_MASK) >> PTE_PPN0_S) * PAGE_SIZE)
#define	L2PTE_TO_PHYS(l2) \
    ((((l2) & ~PTE_HI_MASK) >> PTE_PPN1_S) << L2_SHIFT)
#define	L1PTE_TO_PHYS(l1) \
    ((((l1) & ~PTE_HI_MASK) >> PTE_PPN2_S) << L1_SHIFT)
#define	PTE_TO_VM_PAGE(pte)	PHYS_TO_VM_PAGE(PTE_TO_PHYS(pte))

/*
 * Construct a page table entry of the specified level pointing to physical
 * address pa, with PTE bits 'bits'.
 *
 * A leaf PTE of any level must point to an address matching its alignment,
 * e.g. L2 pages must be 2MB aligned in memory.
 */
#define	L1_PTE(pa, bits)	((((pa) >> L1_SHIFT) << PTE_PPN2_S) | (bits))
#define	L2_PTE(pa, bits)	((((pa) >> L2_SHIFT) << PTE_PPN1_S) | (bits))
#define	L3_PTE(pa, bits)	((((pa) >> L3_SHIFT) << PTE_PPN0_S) | (bits))

/*
 * Construct a page directory entry (PDE), pointing to next level entry at pa,
 * with PTE bits 'bits'.
 *
 * Unlike PTEs, page directory entries can point to any 4K-aligned physical
 * address.
 */
#define	L0_PDE(pa, bits)	L3_PTE(pa, bits)
#define	L1_PDE(pa, bits)	L3_PTE(pa, bits)
#define	L2_PDE(pa, bits)	L3_PTE(pa, bits)
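/*
 * A short worked example of the encodings above, assuming the standard
 * Sv39/Sv48 PTE layout (PPN[0] starting at bit PTE_PPN0_S):
 *
 *	pa  = 0x80200000;		a 2MB-aligned physical address
 *	l2e = L2_PTE(pa, PTE_KERN);	((pa >> L2_SHIFT) << PTE_PPN1_S) | bits
 *	L2PTE_TO_PHYS(l2e) == 0x80200000
 *
 * PTE_TO_PHYS() also recovers pa from a leaf of any level, since a properly
 * aligned superpage leaf has zero low PPN bits; the LnPTE_TO_PHYS() variants
 * simply discard the offset bits appropriate to that level.
 */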
static __inline pd_entry_t *
pmap_l0(pmap_t pmap, vm_offset_t va)
{
	KASSERT(pmap_mode != PMAP_MODE_SV39, ("%s: in SV39 mode", __func__));
	KASSERT(VIRT_IS_VALID(va),
	    ("%s: malformed virtual address %#lx", __func__, va));
	return (&pmap->pm_top[pmap_l0_index(va)]);
}

static __inline pd_entry_t *
pmap_l0_to_l1(pd_entry_t *l0, vm_offset_t va)
{
	vm_paddr_t phys;
	pd_entry_t *l1;

	KASSERT(pmap_mode != PMAP_MODE_SV39, ("%s: in SV39 mode", __func__));
	phys = PTE_TO_PHYS(pmap_load(l0));
	l1 = (pd_entry_t *)PHYS_TO_DMAP(phys);

	return (&l1[pmap_l1_index(va)]);
}

static __inline pd_entry_t *
pmap_l1(pmap_t pmap, vm_offset_t va)
{
	pd_entry_t *l0;

	KASSERT(VIRT_IS_VALID(va),
	    ("%s: malformed virtual address %#lx", __func__, va));
	if (pmap_mode == PMAP_MODE_SV39) {
		return (&pmap->pm_top[pmap_l1_index(va)]);
	} else {
		l0 = pmap_l0(pmap, va);
		if ((pmap_load(l0) & PTE_V) == 0)
			return (NULL);
		if ((pmap_load(l0) & PTE_RX) != 0)
			return (NULL);
		return (pmap_l0_to_l1(l0, va));
	}
}

static __inline pd_entry_t *
pmap_l1_to_l2(pd_entry_t *l1, vm_offset_t va)
{
	vm_paddr_t phys;
	pd_entry_t *l2;

	phys = PTE_TO_PHYS(pmap_load(l1));
	l2 = (pd_entry_t *)PHYS_TO_DMAP(phys);

	return (&l2[pmap_l2_index(va)]);
}

static __inline pd_entry_t *
pmap_l2(pmap_t pmap, vm_offset_t va)
{
	pd_entry_t *l1;

	l1 = pmap_l1(pmap, va);
	if (l1 == NULL)
		return (NULL);
	if ((pmap_load(l1) & PTE_V) == 0)
		return (NULL);
	if ((pmap_load(l1) & PTE_RX) != 0)
		return (NULL);

	return (pmap_l1_to_l2(l1, va));
}

static __inline pt_entry_t *
pmap_l2_to_l3(pd_entry_t *l2, vm_offset_t va)
{
	vm_paddr_t phys;
	pt_entry_t *l3;

	phys = PTE_TO_PHYS(pmap_load(l2));
	l3 = (pd_entry_t *)PHYS_TO_DMAP(phys);

	return (&l3[pmap_l3_index(va)]);
}

static __inline pt_entry_t *
pmap_l3(pmap_t pmap, vm_offset_t va)
{
	pd_entry_t *l2;

	l2 = pmap_l2(pmap, va);
	if (l2 == NULL)
		return (NULL);
	if ((pmap_load(l2) & PTE_V) == 0)
		return (NULL);
	if ((pmap_load(l2) & PTE_RX) != 0)
		return (NULL);

	return (pmap_l2_to_l3(l2, va));
}
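/*
 * Taken together, the helpers above implement a software page table walk.
 * A condensed sketch of translating a virtual address, essentially the body
 * of pmap_extract() below:
 *
 *	l2p = pmap_l2(pmap, va);
 *	if (l2p != NULL && ((l2 = pmap_load(l2p)) & PTE_V) != 0) {
 *		if ((l2 & PTE_RWX) == 0)
 *			pa = PTE_TO_PHYS(pmap_load(pmap_l2_to_l3(l2p, va))) |
 *			    (va & L3_OFFSET);
 *		else
 *			pa = L2PTE_TO_PHYS(l2) | (va & L2_OFFSET);
 *	}
 *
 * Each pmap_lN() returns NULL if the walk terminates early, either because
 * an intermediate entry is invalid or because a higher level already holds
 * a leaf mapping (PTE_RX set).
 */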
static __inline void
pmap_resident_count_inc(pmap_t pmap, int count)
{

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	pmap->pm_stats.resident_count += count;
}

static __inline void
pmap_resident_count_dec(pmap_t pmap, int count)
{

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	KASSERT(pmap->pm_stats.resident_count >= count,
	    ("pmap %p resident count underflow %ld %d", pmap,
	    pmap->pm_stats.resident_count, count));
	pmap->pm_stats.resident_count -= count;
}

static void
pmap_distribute_l1(struct pmap *pmap, vm_pindex_t l1index,
    pt_entry_t entry)
{
	struct pmap *user_pmap;
	pd_entry_t *l1;

	/*
	 * Distribute new kernel L1 entry to all the user pmaps.  This is only
	 * necessary with three-level paging configured: with four-level paging
	 * the kernel's half of the top-level page table page is static and can
	 * simply be copied at pmap initialization time.
	 */
	if (pmap != kernel_pmap || pmap_mode != PMAP_MODE_SV39)
		return;

	mtx_lock(&allpmaps_lock);
	LIST_FOREACH(user_pmap, &allpmaps, pm_list) {
		l1 = &user_pmap->pm_top[l1index];
		pmap_store(l1, entry);
	}
	mtx_unlock(&allpmaps_lock);
}

/*
 * Holds the PTE mode bits (defined in pte.h) for defining e.g. cacheability.
 *
 * The indices correspond to the VM_MEMATTR_* defines in riscv/include/vm.h.
 *
 * The array will be empty if no mode bits are supported by the CPU, e.g. when
 * lacking the Svpbmt extension.
 */
static __read_frequently pt_entry_t memattr_bits[VM_MEMATTR_TOTAL];
static __read_frequently pt_entry_t memattr_mask;

static __inline pt_entry_t
pmap_memattr_bits(vm_memattr_t mode)
{
	KASSERT(pmap_is_valid_memattr(kernel_pmap, mode),
	    ("invalid memory mode %u\n", mode));
	return (memattr_bits[(int)mode]);
}

/*
 * This should only be used during pmap bootstrap e.g. by
 * pmap_create_pagetables().
 */
static pt_entry_t *
pmap_early_alloc_tables(vm_paddr_t *freemempos, int npages)
{
	pt_entry_t *pt;

	pt = (pt_entry_t *)*freemempos;
	*freemempos += npages * PAGE_SIZE;
	bzero(pt, npages * PAGE_SIZE);

	return (pt);
}

/*
 * Construct the direct map -- a linear mapping of physical memory into
 * the kernel address space.
 *
 * We walk the list of physical memory segments (of arbitrary size and
 * address) mapping each appropriately using L2 and L1 superpages.
 * Consequently, the DMAP address space will have unmapped regions
 * corresponding to any holes between physical memory segments.
 *
 * The lowest usable physical address will always be mapped to
 * DMAP_MIN_ADDRESS.
 */
static vm_paddr_t
pmap_bootstrap_dmap(pd_entry_t *l1, vm_paddr_t freemempos)
{
	vm_paddr_t physmap[PHYS_AVAIL_ENTRIES];
	vm_offset_t va;
	vm_paddr_t min_pa, max_pa, pa, endpa;
	pd_entry_t *l2;
	pt_entry_t memattr;
	u_int l1slot, l2slot;
	int physmap_idx;

	physmap_idx = physmem_avail(physmap, nitems(physmap));
	min_pa = physmap[0];
	max_pa = physmap[physmap_idx - 1];

	printf("physmap_idx %u\n", physmap_idx);
	printf("min_pa %lx\n", min_pa);
	printf("max_pa %lx\n", max_pa);

	/* Set the limits of the DMAP region. */
	dmap_phys_base = rounddown(min_pa, L1_SIZE);
	dmap_phys_max = max_pa;

	memattr = pmap_memattr_bits(VM_MEMATTR_DEFAULT);

	/* Walk the physmap table. */
	l2 = NULL;
	l1slot = Ln_ENTRIES;	/* sentinel value */
	for (int idx = 0; idx < physmap_idx; idx += 2) {
		pa = rounddown(physmap[idx], L2_SIZE);
		endpa = physmap[idx + 1];

		/* Virtual address for this range. */
		va = PHYS_TO_DMAP(pa);

		/* Any 1GB possible for this range? */
		if (roundup(pa, L1_SIZE) + L1_SIZE > endpa)
			goto l2end;

		/* Loop until the next 1GB boundary. */
		while ((pa & L1_OFFSET) != 0) {
			if (l2 == NULL || pmap_l1_index(va) != l1slot) {
				/* Need to alloc another page table. */
				l2 = pmap_early_alloc_tables(&freemempos, 1);

				/* Link it.
*/ 631 l1slot = pmap_l1_index(va); 632 pmap_store(&l1[l1slot], 633 L1_PDE((vm_paddr_t)l2, PTE_V)); 634 } 635 636 /* map l2 pages */ 637 l2slot = pmap_l2_index(va); 638 pmap_store(&l2[l2slot], L2_PTE(pa, PTE_KERN | memattr)); 639 640 pa += L2_SIZE; 641 va += L2_SIZE; 642 } 643 644 /* Map what we can with 1GB superpages. */ 645 while (pa + L1_SIZE - 1 < endpa) { 646 /* map l1 pages */ 647 l1slot = pmap_l1_index(va); 648 pmap_store(&l1[l1slot], L1_PTE(pa, PTE_KERN | memattr)); 649 650 pa += L1_SIZE; 651 va += L1_SIZE; 652 } 653 654 l2end: 655 while (pa < endpa) { 656 if (l2 == NULL || pmap_l1_index(va) != l1slot) { 657 /* Need to alloc another page table. */ 658 l2 = pmap_early_alloc_tables(&freemempos, 1); 659 660 /* Link it. */ 661 l1slot = pmap_l1_index(va); 662 pmap_store(&l1[l1slot], 663 L1_PDE((vm_paddr_t)l2, PTE_V)); 664 } 665 666 /* map l2 pages */ 667 l2slot = pmap_l2_index(va); 668 pmap_store(&l2[l2slot], L2_PTE(pa, PTE_KERN | memattr)); 669 670 pa += L2_SIZE; 671 va += L2_SIZE; 672 } 673 } 674 675 /* And finally, the limit on DMAP VA. */ 676 dmap_max_addr = va; 677 678 return (freemempos); 679 } 680 681 /* 682 * Create a new set of pagetables to run the kernel with. 683 * 684 * An initial, temporary setup was created in locore.S, which serves well 685 * enough to get us this far. It mapped kernstart -> KERNBASE, using 2MB 686 * superpages, and created a 1GB identity map, which allows this function 687 * to dereference physical addresses. 688 * 689 * The memory backing these page tables is allocated in the space 690 * immediately following the kernel's preload area. Depending on the size 691 * of this area, some, all, or none of these pages can be implicitly 692 * mapped by the kernel's 2MB mappings. This memory will only ever be 693 * accessed through the direct map, however. 694 */ 695 static vm_paddr_t 696 pmap_create_pagetables(vm_paddr_t kernstart, vm_size_t kernlen, 697 vm_paddr_t *root_pt_phys) 698 { 699 pt_entry_t *l0, *l1, *kern_l2, *kern_l3, *devmap_l3; 700 pt_entry_t memattr; 701 pd_entry_t *devmap_l2; 702 vm_paddr_t kernend, freemempos, pa; 703 int nkernl2, nkernl3, ndevmapl3; 704 int i, slot; 705 int mode; 706 707 kernend = kernstart + kernlen; 708 709 /* Static allocations begin after the kernel staging area. */ 710 freemempos = roundup2(kernend, PAGE_SIZE); 711 712 /* Detect Sv48 mode. */ 713 mode = PMAP_MODE_SV39; 714 TUNABLE_INT_FETCH("vm.pmap.mode", &mode); 715 716 if (mode == PMAP_MODE_SV48 && (mmu_caps & MMU_SV48) != 0) { 717 /* 718 * Sv48 mode: allocate an L0 page table to be the root. The 719 * layout of KVA is otherwise identical to Sv39. 720 */ 721 l0 = pmap_early_alloc_tables(&freemempos, 1); 722 *root_pt_phys = (vm_paddr_t)l0; 723 pmap_mode = PMAP_MODE_SV48; 724 } else { 725 l0 = NULL; 726 } 727 728 /* 729 * Allocate an L1 page table. 730 */ 731 l1 = pmap_early_alloc_tables(&freemempos, 1); 732 if (pmap_mode == PMAP_MODE_SV39) 733 *root_pt_phys = (vm_paddr_t)l1; 734 735 /* 736 * Allocate a set of L2 page tables for KVA. Most likely, only 1 is 737 * needed. 738 */ 739 nkernl2 = howmany(howmany(kernlen, L2_SIZE), Ln_ENTRIES); 740 kern_l2 = pmap_early_alloc_tables(&freemempos, nkernl2); 741 742 /* 743 * Allocate an L2 page table for the static devmap, located at the end 744 * of KVA. We can expect that the devmap will always be less than 1GB 745 * in size. 746 */ 747 devmap_l2 = pmap_early_alloc_tables(&freemempos, 1); 748 749 /* Allocate L3 page tables for the devmap. 
*/ 750 ndevmapl3 = howmany(howmany(PMAP_MAPDEV_EARLY_SIZE, L3_SIZE), 751 Ln_ENTRIES); 752 devmap_l3 = pmap_early_alloc_tables(&freemempos, ndevmapl3); 753 754 /* 755 * Allocate some L3 bootstrap pages, for early KVA allocations before 756 * vm_mem_init() has run. For example, the message buffer. 757 * 758 * A somewhat arbitrary choice of 32MB. This should be more than enough 759 * for any early allocations. There is no need to worry about waste, as 760 * whatever is not used will be consumed by later calls to 761 * pmap_growkernel(). 762 */ 763 nkernl3 = 16; 764 kern_l3 = pmap_early_alloc_tables(&freemempos, nkernl3); 765 766 /* Bootstrap the direct map. */ 767 freemempos = pmap_bootstrap_dmap(l1, freemempos); 768 769 /* Allocations are done. */ 770 if (freemempos < roundup2(kernend, L2_SIZE)) 771 freemempos = roundup2(kernend, L2_SIZE); 772 773 /* Memory attributes for standard/main memory. */ 774 memattr = pmap_memattr_bits(VM_MEMATTR_DEFAULT); 775 776 /* 777 * Map the kernel (and preloaded modules or data) using L2 superpages. 778 * 779 * kernstart is 2MB-aligned. This is enforced by loader(8) and required 780 * by locore assembly. 781 * 782 * TODO: eventually, this should be done with proper permissions for 783 * each segment, rather than mapping the entire kernel and preloaded 784 * modules RWX. 785 */ 786 slot = pmap_l2_index(KERNBASE); 787 for (pa = kernstart; pa < kernend; pa += L2_SIZE, slot++) { 788 pmap_store(&kern_l2[slot], 789 L2_PTE(pa, PTE_KERN | PTE_X | memattr)); 790 } 791 792 /* 793 * Connect the L3 bootstrap pages to the kernel L2 table. The L3 PTEs 794 * themselves are invalid. 795 */ 796 slot = pmap_l2_index(freemempos - kernstart + KERNBASE); 797 for (i = 0; i < nkernl3; i++, slot++) { 798 pa = (vm_paddr_t)kern_l3 + ptoa(i); 799 pmap_store(&kern_l2[slot], L2_PDE(pa, PTE_V)); 800 } 801 802 /* Connect the L2 tables to the L1 table. */ 803 slot = pmap_l1_index(KERNBASE); 804 for (i = 0; i < nkernl2; i++, slot++) { 805 pa = (vm_paddr_t)kern_l2 + ptoa(i); 806 pmap_store(&l1[slot], L1_PDE(pa, PTE_V)); 807 } 808 809 /* Connect the L1 table to L0, if in use. */ 810 if (pmap_mode == PMAP_MODE_SV48) { 811 slot = pmap_l0_index(KERNBASE); 812 pmap_store(&l0[slot], L0_PDE((vm_paddr_t)l1, PTE_V)); 813 } 814 815 /* 816 * Connect the devmap L3 pages to the L2 table. The devmap PTEs 817 * themselves are invalid. 818 */ 819 slot = pmap_l2_index(DEVMAP_MIN_VADDR); 820 for (i = 0; i < ndevmapl3; i++, slot++) { 821 pa = (vm_paddr_t)devmap_l3 + ptoa(i); 822 pmap_store(&devmap_l2[slot], L2_PDE(pa, PTE_V)); 823 } 824 825 /* Connect the devmap L2 pages to the L1 table. */ 826 slot = pmap_l1_index(DEVMAP_MIN_VADDR); 827 pa = (vm_paddr_t)devmap_l2; 828 pmap_store(&l1[slot], L1_PDE(pa, PTE_V)); 829 830 /* Return the next position of free memory */ 831 return (freemempos); 832 } 833 834 /* 835 * Bootstrap the system enough to run with virtual memory. 836 */ 837 void 838 pmap_bootstrap(vm_paddr_t kernstart, vm_size_t kernlen) 839 { 840 vm_paddr_t freemempos, pa; 841 vm_paddr_t root_pt_phys; 842 vm_offset_t freeva; 843 vm_offset_t dpcpu, msgbufpv; 844 pt_entry_t *pte; 845 int i; 846 847 printf("pmap_bootstrap %lx %lx\n", kernstart, kernlen); 848 849 PMAP_LOCK_INIT(kernel_pmap); 850 TAILQ_INIT(&kernel_pmap->pm_pvchunk); 851 vm_radix_init(&kernel_pmap->pm_root); 852 853 rw_init(&pvh_global_lock, "pmap pv global"); 854 855 /* 856 * Set the current CPU as active in the kernel pmap. Secondary cores 857 * will add themselves later in init_secondary(). 
The SBI firmware 858 * may rely on this mask being precise, so CPU_FILL() is not used. 859 */ 860 CPU_SET(PCPU_GET(hart), &kernel_pmap->pm_active); 861 862 /* 863 * Set up the memory attribute bits. 864 */ 865 if (has_svpbmt) { 866 memattr_bits[VM_MEMATTR_PMA] = PTE_MA_NONE; 867 memattr_bits[VM_MEMATTR_UNCACHEABLE] = PTE_MA_NC; 868 memattr_bits[VM_MEMATTR_DEVICE] = PTE_MA_IO; 869 memattr_mask = PTE_MA_MASK; 870 } 871 872 /* Create a new set of pagetables to run the kernel in. */ 873 freemempos = pmap_create_pagetables(kernstart, kernlen, &root_pt_phys); 874 875 /* Switch to the newly created page tables. */ 876 kernel_pmap->pm_stage = PM_STAGE1; 877 kernel_pmap->pm_top = (pd_entry_t *)PHYS_TO_DMAP(root_pt_phys); 878 kernel_pmap->pm_satp = atop(root_pt_phys) | pmap_satp_mode(); 879 csr_write(satp, kernel_pmap->pm_satp); 880 sfence_vma(); 881 882 /* 883 * Now, we need to make a few more static reservations from KVA. 884 * 885 * Set freeva to freemempos virtual address, and be sure to advance 886 * them together. 887 */ 888 freeva = freemempos - kernstart + KERNBASE; 889 #define reserve_space(var, pa, size) \ 890 do { \ 891 var = freeva; \ 892 pa = freemempos; \ 893 freeva += size; \ 894 freemempos += size; \ 895 } while (0) 896 897 /* Allocate the dynamic per-cpu area. */ 898 reserve_space(dpcpu, pa, DPCPU_SIZE); 899 900 /* Map it. */ 901 pte = pmap_l3(kernel_pmap, dpcpu); 902 KASSERT(pte != NULL, ("Bootstrap pages missing")); 903 for (i = 0; i < howmany(DPCPU_SIZE, PAGE_SIZE); i++) 904 pmap_store(&pte[i], L3_PTE(pa + ptoa(i), PTE_KERN | 905 pmap_memattr_bits(VM_MEMATTR_DEFAULT))); 906 907 /* Now, it can be initialized. */ 908 dpcpu_init((void *)dpcpu, 0); 909 910 /* Allocate memory for the msgbuf, e.g. for /sbin/dmesg */ 911 reserve_space(msgbufpv, pa, round_page(msgbufsize)); 912 msgbufp = (void *)msgbufpv; 913 914 /* Map it. */ 915 pte = pmap_l3(kernel_pmap, msgbufpv); 916 KASSERT(pte != NULL, ("Bootstrap pages missing")); 917 for (i = 0; i < howmany(msgbufsize, PAGE_SIZE); i++) 918 pmap_store(&pte[i], L3_PTE(pa + ptoa(i), PTE_KERN | 919 pmap_memattr_bits(VM_MEMATTR_DEFAULT))); 920 921 #undef reserve_space 922 923 /* Mark the bounds of our available virtual address space */ 924 virtual_avail = kernel_vm_end = freeva; 925 virtual_end = DEVMAP_MIN_VADDR; 926 927 /* Exclude the reserved physical memory from allocations. */ 928 physmem_exclude_region(kernstart, freemempos - kernstart, 929 EXFLAG_NOALLOC); 930 } 931 932 /* 933 * Initialize a vm_page's machine-dependent fields. 934 */ 935 void 936 pmap_page_init(vm_page_t m) 937 { 938 939 TAILQ_INIT(&m->md.pv_list); 940 m->md.pv_memattr = VM_MEMATTR_DEFAULT; 941 } 942 943 /* 944 * Initialize the pmap module. 945 * 946 * Called by vm_mem_init(), to initialize any structures that the pmap 947 * system needs to map virtual memory. 948 */ 949 void 950 pmap_init(void) 951 { 952 vm_size_t s; 953 int i, pv_npg; 954 955 /* 956 * Initialize the pv chunk and pmap list mutexes. 957 */ 958 mtx_init(&pv_chunks_mutex, "pmap pv chunk list", NULL, MTX_DEF); 959 mtx_init(&allpmaps_lock, "allpmaps", NULL, MTX_DEF); 960 961 /* 962 * Initialize the pool of pv list locks. 963 */ 964 for (i = 0; i < NPV_LIST_LOCKS; i++) 965 rw_init(&pv_list_locks[i], "pmap pv list"); 966 967 /* 968 * Calculate the size of the pv head table for superpages. 969 */ 970 pv_npg = howmany(vm_phys_segs[vm_phys_nsegs - 1].end, L2_SIZE); 971 972 /* 973 * Allocate memory for the pv head table for superpages. 
974 */ 975 s = (vm_size_t)(pv_npg * sizeof(struct md_page)); 976 s = round_page(s); 977 pv_table = kmem_malloc(s, M_WAITOK | M_ZERO); 978 for (i = 0; i < pv_npg; i++) 979 TAILQ_INIT(&pv_table[i].pv_list); 980 TAILQ_INIT(&pv_dummy.pv_list); 981 982 if (superpages_enabled) 983 pagesizes[1] = L2_SIZE; 984 } 985 986 #ifdef SMP 987 /* 988 * For SMP, these functions have to use IPIs for coherence. 989 * 990 * In general, the calling thread uses a plain fence to order the 991 * writes to the page tables before invoking an SBI callback to invoke 992 * sfence_vma() on remote CPUs. 993 */ 994 static void 995 pmap_invalidate_page(pmap_t pmap, vm_offset_t va) 996 { 997 cpuset_t mask; 998 999 sched_pin(); 1000 mask = pmap->pm_active; 1001 CPU_CLR(PCPU_GET(hart), &mask); 1002 fence(); 1003 if (!CPU_EMPTY(&mask) && smp_started) 1004 sbi_remote_sfence_vma(mask.__bits, va, 1); 1005 sfence_vma_page(va); 1006 sched_unpin(); 1007 } 1008 1009 static void 1010 pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 1011 { 1012 cpuset_t mask; 1013 1014 sched_pin(); 1015 mask = pmap->pm_active; 1016 CPU_CLR(PCPU_GET(hart), &mask); 1017 fence(); 1018 if (!CPU_EMPTY(&mask) && smp_started) 1019 sbi_remote_sfence_vma(mask.__bits, sva, eva - sva + 1); 1020 1021 /* 1022 * Might consider a loop of sfence_vma_page() for a small 1023 * number of pages in the future. 1024 */ 1025 sfence_vma(); 1026 sched_unpin(); 1027 } 1028 1029 static void 1030 pmap_invalidate_all(pmap_t pmap) 1031 { 1032 cpuset_t mask; 1033 1034 sched_pin(); 1035 mask = pmap->pm_active; 1036 CPU_CLR(PCPU_GET(hart), &mask); 1037 1038 /* 1039 * XXX: The SBI doc doesn't detail how to specify x0 as the 1040 * address to perform a global fence. BBL currently treats 1041 * all sfence_vma requests as global however. 1042 */ 1043 fence(); 1044 if (!CPU_EMPTY(&mask) && smp_started) 1045 sbi_remote_sfence_vma(mask.__bits, 0, 0); 1046 sfence_vma(); 1047 sched_unpin(); 1048 } 1049 #else 1050 /* 1051 * Normal, non-SMP, invalidation functions. 1052 * We inline these within pmap.c for speed. 1053 */ 1054 static __inline void 1055 pmap_invalidate_page(pmap_t pmap, vm_offset_t va) 1056 { 1057 1058 sfence_vma_page(va); 1059 } 1060 1061 static __inline void 1062 pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 1063 { 1064 1065 /* 1066 * Might consider a loop of sfence_vma_page() for a small 1067 * number of pages in the future. 1068 */ 1069 sfence_vma(); 1070 } 1071 1072 static __inline void 1073 pmap_invalidate_all(pmap_t pmap) 1074 { 1075 1076 sfence_vma(); 1077 } 1078 #endif 1079 1080 /* 1081 * Routine: pmap_extract 1082 * Function: 1083 * Extract the physical page address associated 1084 * with the given map/virtual_address pair. 1085 */ 1086 vm_paddr_t 1087 pmap_extract(pmap_t pmap, vm_offset_t va) 1088 { 1089 pd_entry_t *l2p, l2; 1090 pt_entry_t *l3p; 1091 vm_paddr_t pa; 1092 1093 pa = 0; 1094 1095 /* 1096 * Start with an L2 lookup, L1 superpages are currently not implemented. 1097 */ 1098 PMAP_LOCK(pmap); 1099 l2p = pmap_l2(pmap, va); 1100 if (l2p != NULL && ((l2 = pmap_load(l2p)) & PTE_V) != 0) { 1101 if ((l2 & PTE_RWX) == 0) { 1102 l3p = pmap_l2_to_l3(l2p, va); 1103 pa = PTE_TO_PHYS(pmap_load(l3p)); 1104 pa |= (va & L3_OFFSET); 1105 } else { 1106 /* L2 is a superpage mapping. 
*/ 1107 pa = L2PTE_TO_PHYS(l2); 1108 pa |= (va & L2_OFFSET); 1109 } 1110 } 1111 PMAP_UNLOCK(pmap); 1112 return (pa); 1113 } 1114 1115 /* 1116 * Routine: pmap_extract_and_hold 1117 * Function: 1118 * Atomically extract and hold the physical page 1119 * with the given pmap and virtual address pair 1120 * if that mapping permits the given protection. 1121 */ 1122 vm_page_t 1123 pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot) 1124 { 1125 pt_entry_t *l3p, l3; 1126 vm_page_t m; 1127 1128 m = NULL; 1129 PMAP_LOCK(pmap); 1130 l3p = pmap_l3(pmap, va); 1131 if (l3p != NULL && (l3 = pmap_load(l3p)) != 0) { 1132 if ((l3 & PTE_W) != 0 || (prot & VM_PROT_WRITE) == 0) { 1133 m = PTE_TO_VM_PAGE(l3); 1134 if (!vm_page_wire_mapped(m)) 1135 m = NULL; 1136 } 1137 } 1138 PMAP_UNLOCK(pmap); 1139 return (m); 1140 } 1141 1142 /* 1143 * Routine: pmap_kextract 1144 * Function: 1145 * Extract the physical page address associated with the given kernel 1146 * virtual address. 1147 */ 1148 vm_paddr_t 1149 pmap_kextract(vm_offset_t va) 1150 { 1151 pd_entry_t *l2, l2e; 1152 pt_entry_t *l3; 1153 vm_paddr_t pa; 1154 1155 if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) { 1156 pa = DMAP_TO_PHYS(va); 1157 } else { 1158 l2 = pmap_l2(kernel_pmap, va); 1159 if (l2 == NULL) 1160 panic("pmap_kextract: No l2"); 1161 l2e = pmap_load(l2); 1162 /* 1163 * Beware of concurrent promotion and demotion! We must 1164 * use l2e rather than loading from l2 multiple times to 1165 * ensure we see a consistent state, including the 1166 * implicit load in pmap_l2_to_l3. It is, however, safe 1167 * to use an old l2e because the L3 page is preserved by 1168 * promotion. 1169 */ 1170 if ((l2e & PTE_RX) != 0) { 1171 /* superpages */ 1172 pa = L2PTE_TO_PHYS(l2e); 1173 pa |= (va & L2_OFFSET); 1174 return (pa); 1175 } 1176 1177 l3 = pmap_l2_to_l3(&l2e, va); 1178 pa = PTE_TO_PHYS(pmap_load(l3)); 1179 pa |= (va & PAGE_MASK); 1180 } 1181 return (pa); 1182 } 1183 1184 /*************************************************** 1185 * Low level mapping routines..... 1186 ***************************************************/ 1187 1188 void 1189 pmap_kenter(vm_offset_t sva, vm_size_t size, vm_paddr_t pa, int mode) 1190 { 1191 pt_entry_t entry; 1192 pt_entry_t *l3; 1193 pt_entry_t memattr; 1194 vm_offset_t va; 1195 pn_t pn; 1196 1197 KASSERT((pa & L3_OFFSET) == 0, 1198 ("pmap_kenter_device: Invalid physical address")); 1199 KASSERT((sva & L3_OFFSET) == 0, 1200 ("pmap_kenter_device: Invalid virtual address")); 1201 KASSERT((size & PAGE_MASK) == 0, 1202 ("pmap_kenter_device: Mapping is not page-sized")); 1203 1204 memattr = pmap_memattr_bits(mode); 1205 va = sva; 1206 while (size != 0) { 1207 l3 = pmap_l3(kernel_pmap, va); 1208 KASSERT(l3 != NULL, ("Invalid page table, va: 0x%lx", va)); 1209 1210 pn = (pa / PAGE_SIZE); 1211 entry = PTE_KERN; 1212 entry |= memattr; 1213 entry |= (pn << PTE_PPN0_S); 1214 pmap_store(l3, entry); 1215 1216 va += PAGE_SIZE; 1217 pa += PAGE_SIZE; 1218 size -= PAGE_SIZE; 1219 } 1220 pmap_invalidate_range(kernel_pmap, sva, va); 1221 } 1222 1223 void 1224 pmap_kenter_device(vm_offset_t sva, vm_size_t size, vm_paddr_t pa) 1225 { 1226 pmap_kenter(sva, size, pa, VM_MEMATTR_DEVICE); 1227 } 1228 1229 /* 1230 * Remove a page from the kernel pagetables. 1231 * Note: not SMP coherent. 
1232 */ 1233 void 1234 pmap_kremove(vm_offset_t va) 1235 { 1236 pt_entry_t *l3; 1237 1238 l3 = pmap_l3(kernel_pmap, va); 1239 KASSERT(l3 != NULL, ("pmap_kremove: Invalid address")); 1240 1241 pmap_clear(l3); 1242 sfence_vma(); 1243 } 1244 1245 void 1246 pmap_kremove_device(vm_offset_t sva, vm_size_t size) 1247 { 1248 pt_entry_t *l3; 1249 vm_offset_t va; 1250 1251 KASSERT((sva & L3_OFFSET) == 0, 1252 ("pmap_kremove_device: Invalid virtual address")); 1253 KASSERT((size & PAGE_MASK) == 0, 1254 ("pmap_kremove_device: Mapping is not page-sized")); 1255 1256 va = sva; 1257 while (size != 0) { 1258 l3 = pmap_l3(kernel_pmap, va); 1259 KASSERT(l3 != NULL, ("Invalid page table, va: 0x%lx", va)); 1260 pmap_clear(l3); 1261 1262 va += PAGE_SIZE; 1263 size -= PAGE_SIZE; 1264 } 1265 1266 pmap_invalidate_range(kernel_pmap, sva, va); 1267 } 1268 1269 /* 1270 * Used to map a range of physical addresses into kernel 1271 * virtual address space. 1272 * 1273 * The value passed in '*virt' is a suggested virtual address for 1274 * the mapping. Architectures which can support a direct-mapped 1275 * physical to virtual region can return the appropriate address 1276 * within that region, leaving '*virt' unchanged. Other 1277 * architectures should map the pages starting at '*virt' and 1278 * update '*virt' with the first usable address after the mapped 1279 * region. 1280 */ 1281 vm_offset_t 1282 pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot) 1283 { 1284 1285 return PHYS_TO_DMAP(start); 1286 } 1287 1288 /* 1289 * Add a list of wired pages to the kva 1290 * this routine is only used for temporary 1291 * kernel mappings that do not need to have 1292 * page modification or references recorded. 1293 * Note that old mappings are simply written 1294 * over. The page *must* be wired. 1295 * Note: SMP coherent. Uses a ranged shootdown IPI. 1296 */ 1297 void 1298 pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count) 1299 { 1300 pt_entry_t *l3; 1301 vm_paddr_t pa; 1302 vm_offset_t va; 1303 vm_page_t m; 1304 pt_entry_t entry; 1305 pn_t pn; 1306 int i; 1307 1308 va = sva; 1309 for (i = 0; i < count; i++) { 1310 m = ma[i]; 1311 pa = VM_PAGE_TO_PHYS(m); 1312 pn = (pa / PAGE_SIZE); 1313 l3 = pmap_l3(kernel_pmap, va); 1314 1315 entry = PTE_KERN; 1316 entry |= pmap_memattr_bits(m->md.pv_memattr); 1317 entry |= (pn << PTE_PPN0_S); 1318 pmap_store(l3, entry); 1319 1320 va += L3_SIZE; 1321 } 1322 pmap_invalidate_range(kernel_pmap, sva, va); 1323 } 1324 1325 /* 1326 * This routine tears out page mappings from the 1327 * kernel -- it is meant only for temporary mappings. 1328 * Note: SMP coherent. Uses a ranged shootdown IPI. 1329 */ 1330 void 1331 pmap_qremove(vm_offset_t sva, int count) 1332 { 1333 pt_entry_t *l3; 1334 vm_offset_t va; 1335 1336 KASSERT(sva >= VM_MIN_KERNEL_ADDRESS, ("usermode va %lx", sva)); 1337 1338 for (va = sva; count-- > 0; va += PAGE_SIZE) { 1339 l3 = pmap_l3(kernel_pmap, va); 1340 KASSERT(l3 != NULL, ("pmap_kremove: Invalid address")); 1341 pmap_clear(l3); 1342 } 1343 pmap_invalidate_range(kernel_pmap, sva, va); 1344 } 1345 1346 bool 1347 pmap_ps_enabled(pmap_t pmap __unused) 1348 { 1349 1350 return (superpages_enabled); 1351 } 1352 1353 /*************************************************** 1354 * Page table page management routines..... 1355 ***************************************************/ 1356 /* 1357 * Schedule the specified unused page table page to be freed. 
Specifically, 1358 * add the page to the specified list of pages that will be released to the 1359 * physical memory manager after the TLB has been updated. 1360 */ 1361 static __inline void 1362 pmap_add_delayed_free_list(vm_page_t m, struct spglist *free, bool set_PG_ZERO) 1363 { 1364 1365 if (set_PG_ZERO) 1366 m->flags |= PG_ZERO; 1367 else 1368 m->flags &= ~PG_ZERO; 1369 SLIST_INSERT_HEAD(free, m, plinks.s.ss); 1370 } 1371 1372 /* 1373 * Inserts the specified page table page into the specified pmap's collection 1374 * of idle page table pages. Each of a pmap's page table pages is responsible 1375 * for mapping a distinct range of virtual addresses. The pmap's collection is 1376 * ordered by this virtual address range. 1377 * 1378 * If "promoted" is false, then the page table page "mpte" must be zero filled; 1379 * "mpte"'s valid field will be set to 0. 1380 * 1381 * If "promoted" is true and "all_l3e_PTE_A_set" is false, then "mpte" must 1382 * contain valid mappings with identical attributes except for PTE_A; 1383 * "mpte"'s valid field will be set to 1. 1384 * 1385 * If "promoted" and "all_l3e_PTE_A_set" are both true, then "mpte" must contain 1386 * valid mappings with identical attributes including PTE_A; "mpte"'s valid 1387 * field will be set to VM_PAGE_BITS_ALL. 1388 */ 1389 static __inline int 1390 pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte, bool promoted, 1391 bool all_l3e_PTE_A_set) 1392 { 1393 1394 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 1395 KASSERT(promoted || !all_l3e_PTE_A_set, 1396 ("a zero-filled PTP can't have PTE_A set in every PTE")); 1397 mpte->valid = promoted ? (all_l3e_PTE_A_set ? VM_PAGE_BITS_ALL : 1) : 0; 1398 return (vm_radix_insert(&pmap->pm_root, mpte)); 1399 } 1400 1401 /* 1402 * Removes the page table page mapping the specified virtual address from the 1403 * specified pmap's collection of idle page table pages, and returns it. 1404 * Otherwise, returns NULL if there is no page table page corresponding to the 1405 * specified virtual address. 1406 */ 1407 static __inline vm_page_t 1408 pmap_remove_pt_page(pmap_t pmap, vm_offset_t va) 1409 { 1410 1411 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 1412 return (vm_radix_remove(&pmap->pm_root, pmap_l2_pindex(va))); 1413 } 1414 1415 /* 1416 * Decrements a page table page's reference count, which is used to record the 1417 * number of valid page table entries within the page. If the reference count 1418 * drops to zero, then the page table page is unmapped. Returns true if the 1419 * page table page was unmapped and false otherwise. 
1420 */ 1421 static inline bool 1422 pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free) 1423 { 1424 KASSERT(m->ref_count > 0, 1425 ("%s: page %p ref count underflow", __func__, m)); 1426 1427 --m->ref_count; 1428 if (m->ref_count == 0) { 1429 _pmap_unwire_ptp(pmap, va, m, free); 1430 return (true); 1431 } else { 1432 return (false); 1433 } 1434 } 1435 1436 static void 1437 _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free) 1438 { 1439 1440 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 1441 if (m->pindex >= NUL2E + NUL1E) { 1442 pd_entry_t *l0; 1443 l0 = pmap_l0(pmap, va); 1444 pmap_clear(l0); 1445 } else if (m->pindex >= NUL2E) { 1446 pd_entry_t *l1; 1447 l1 = pmap_l1(pmap, va); 1448 pmap_clear(l1); 1449 pmap_distribute_l1(pmap, pmap_l1_index(va), 0); 1450 } else { 1451 pd_entry_t *l2; 1452 l2 = pmap_l2(pmap, va); 1453 pmap_clear(l2); 1454 } 1455 pmap_resident_count_dec(pmap, 1); 1456 if (m->pindex < NUL2E) { 1457 pd_entry_t *l1; 1458 vm_page_t pdpg; 1459 1460 l1 = pmap_l1(pmap, va); 1461 pdpg = PTE_TO_VM_PAGE(pmap_load(l1)); 1462 pmap_unwire_ptp(pmap, va, pdpg, free); 1463 } else if (m->pindex < NUL2E + NUL1E && pmap_mode != PMAP_MODE_SV39) { 1464 pd_entry_t *l0; 1465 vm_page_t pdpg; 1466 1467 MPASS(pmap_mode != PMAP_MODE_SV39); 1468 l0 = pmap_l0(pmap, va); 1469 pdpg = PTE_TO_VM_PAGE(pmap_load(l0)); 1470 pmap_unwire_ptp(pmap, va, pdpg, free); 1471 } 1472 pmap_invalidate_page(pmap, va); 1473 1474 vm_wire_sub(1); 1475 1476 /* 1477 * Put page on a list so that it is released after 1478 * *ALL* TLB shootdown is done 1479 */ 1480 pmap_add_delayed_free_list(m, free, true); 1481 } 1482 1483 /* 1484 * After removing a page table entry, this routine is used to 1485 * conditionally free the page, and manage the reference count. 1486 */ 1487 static int 1488 pmap_unuse_pt(pmap_t pmap, vm_offset_t va, pd_entry_t ptepde, 1489 struct spglist *free) 1490 { 1491 vm_page_t mpte; 1492 1493 if (va >= VM_MAXUSER_ADDRESS) 1494 return (0); 1495 KASSERT(ptepde != 0, ("pmap_unuse_pt: ptepde != 0")); 1496 mpte = PTE_TO_VM_PAGE(ptepde); 1497 return (pmap_unwire_ptp(pmap, va, mpte, free)); 1498 } 1499 1500 static uint64_t 1501 pmap_satp_mode(void) 1502 { 1503 return (pmap_mode == PMAP_MODE_SV39 ? SATP_MODE_SV39 : SATP_MODE_SV48); 1504 } 1505 1506 void 1507 pmap_pinit0(pmap_t pmap) 1508 { 1509 PMAP_LOCK_INIT(pmap); 1510 bzero(&pmap->pm_stats, sizeof(pmap->pm_stats)); 1511 pmap->pm_stage = PM_STAGE1; 1512 pmap->pm_top = kernel_pmap->pm_top; 1513 pmap->pm_satp = pmap_satp_mode() | 1514 (vtophys(pmap->pm_top) >> PAGE_SHIFT); 1515 CPU_ZERO(&pmap->pm_active); 1516 TAILQ_INIT(&pmap->pm_pvchunk); 1517 vm_radix_init(&pmap->pm_root); 1518 pmap_activate_boot(pmap); 1519 } 1520 1521 int 1522 pmap_pinit_stage(pmap_t pmap, enum pmap_stage stage) 1523 { 1524 vm_paddr_t topphys; 1525 vm_page_t m; 1526 size_t i; 1527 1528 /* 1529 * Top directory is 4 pages in hypervisor case. 1530 * Current address space layout makes 3 of them unused. 
1531 */ 1532 if (stage == PM_STAGE1) 1533 m = vm_page_alloc_noobj(VM_ALLOC_WIRED | VM_ALLOC_ZERO | 1534 VM_ALLOC_WAITOK); 1535 else 1536 m = vm_page_alloc_noobj_contig(VM_ALLOC_WIRED | VM_ALLOC_ZERO, 1537 4, 0, ~0ul, L2_SIZE, 0, VM_MEMATTR_DEFAULT); 1538 1539 topphys = VM_PAGE_TO_PHYS(m); 1540 pmap->pm_top = (pd_entry_t *)PHYS_TO_DMAP(topphys); 1541 pmap->pm_satp = pmap_satp_mode() | (topphys >> PAGE_SHIFT); 1542 pmap->pm_stage = stage; 1543 1544 bzero(&pmap->pm_stats, sizeof(pmap->pm_stats)); 1545 1546 CPU_ZERO(&pmap->pm_active); 1547 1548 if (stage == PM_STAGE2) 1549 goto finish; 1550 1551 if (pmap_mode == PMAP_MODE_SV39) { 1552 /* 1553 * Copy L1 entries from the kernel pmap. This must be done with 1554 * the allpmaps lock held to avoid races with 1555 * pmap_distribute_l1(). 1556 */ 1557 mtx_lock(&allpmaps_lock); 1558 LIST_INSERT_HEAD(&allpmaps, pmap, pm_list); 1559 for (i = pmap_l1_index(VM_MIN_KERNEL_ADDRESS); 1560 i < pmap_l1_index(VM_MAX_KERNEL_ADDRESS); i++) 1561 pmap->pm_top[i] = kernel_pmap->pm_top[i]; 1562 for (i = pmap_l1_index(DMAP_MIN_ADDRESS); 1563 i < pmap_l1_index(DMAP_MAX_ADDRESS); i++) 1564 pmap->pm_top[i] = kernel_pmap->pm_top[i]; 1565 mtx_unlock(&allpmaps_lock); 1566 } else { 1567 i = pmap_l0_index(VM_MIN_KERNEL_ADDRESS); 1568 pmap->pm_top[i] = kernel_pmap->pm_top[i]; 1569 } 1570 1571 finish: 1572 TAILQ_INIT(&pmap->pm_pvchunk); 1573 vm_radix_init(&pmap->pm_root); 1574 1575 return (1); 1576 } 1577 1578 int 1579 pmap_pinit(pmap_t pmap) 1580 { 1581 1582 return (pmap_pinit_stage(pmap, PM_STAGE1)); 1583 } 1584 1585 /* 1586 * This routine is called if the desired page table page does not exist. 1587 * 1588 * If page table page allocation fails, this routine may sleep before 1589 * returning NULL. It sleeps only if a lock pointer was given. 1590 * 1591 * Note: If a page allocation fails at page table level two or three, 1592 * one or two pages may be held during the wait, only to be released 1593 * afterwards. This conservative approach is easily argued to avoid 1594 * race conditions. 1595 */ 1596 static vm_page_t 1597 _pmap_alloc_l3(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp) 1598 { 1599 vm_page_t m, pdpg; 1600 pt_entry_t entry; 1601 vm_paddr_t phys; 1602 pn_t pn; 1603 1604 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 1605 1606 /* 1607 * Allocate a page table page. 1608 */ 1609 m = vm_page_alloc_noobj(VM_ALLOC_WIRED | VM_ALLOC_ZERO); 1610 if (m == NULL) { 1611 if (lockp != NULL) { 1612 RELEASE_PV_LIST_LOCK(lockp); 1613 PMAP_UNLOCK(pmap); 1614 rw_runlock(&pvh_global_lock); 1615 vm_wait(NULL); 1616 rw_rlock(&pvh_global_lock); 1617 PMAP_LOCK(pmap); 1618 } 1619 1620 /* 1621 * Indicate the need to retry. While waiting, the page table 1622 * page may have been allocated. 1623 */ 1624 return (NULL); 1625 } 1626 m->pindex = ptepindex; 1627 1628 /* 1629 * Map the pagetable page into the process address space, if 1630 * it isn't already there. 
1631 */ 1632 pn = VM_PAGE_TO_PHYS(m) >> PAGE_SHIFT; 1633 if (ptepindex >= NUL2E + NUL1E) { 1634 pd_entry_t *l0; 1635 vm_pindex_t l0index; 1636 1637 KASSERT(pmap_mode != PMAP_MODE_SV39, 1638 ("%s: pindex %#lx in SV39 mode", __func__, ptepindex)); 1639 KASSERT(ptepindex < NUL2E + NUL1E + NUL0E, 1640 ("%s: pindex %#lx out of range", __func__, ptepindex)); 1641 1642 l0index = ptepindex - (NUL2E + NUL1E); 1643 l0 = &pmap->pm_top[l0index]; 1644 KASSERT((pmap_load(l0) & PTE_V) == 0, 1645 ("%s: L0 entry %#lx is valid", __func__, pmap_load(l0))); 1646 1647 entry = PTE_V | (pn << PTE_PPN0_S); 1648 pmap_store(l0, entry); 1649 } else if (ptepindex >= NUL2E) { 1650 pd_entry_t *l0, *l1; 1651 vm_pindex_t l0index, l1index; 1652 1653 l1index = ptepindex - NUL2E; 1654 if (pmap_mode == PMAP_MODE_SV39) { 1655 l1 = &pmap->pm_top[l1index]; 1656 } else { 1657 l0index = l1index >> Ln_ENTRIES_SHIFT; 1658 l0 = &pmap->pm_top[l0index]; 1659 if (pmap_load(l0) == 0) { 1660 /* Recurse to allocate the L1 page. */ 1661 if (_pmap_alloc_l3(pmap, 1662 NUL2E + NUL1E + l0index, lockp) == NULL) 1663 goto fail; 1664 phys = PTE_TO_PHYS(pmap_load(l0)); 1665 } else { 1666 phys = PTE_TO_PHYS(pmap_load(l0)); 1667 pdpg = PHYS_TO_VM_PAGE(phys); 1668 pdpg->ref_count++; 1669 } 1670 l1 = (pd_entry_t *)PHYS_TO_DMAP(phys); 1671 l1 = &l1[ptepindex & Ln_ADDR_MASK]; 1672 } 1673 KASSERT((pmap_load(l1) & PTE_V) == 0, 1674 ("%s: L1 entry %#lx is valid", __func__, pmap_load(l1))); 1675 1676 entry = PTE_V | (pn << PTE_PPN0_S); 1677 pmap_store(l1, entry); 1678 pmap_distribute_l1(pmap, l1index, entry); 1679 } else { 1680 vm_pindex_t l0index, l1index; 1681 pd_entry_t *l0, *l1, *l2; 1682 1683 l1index = ptepindex >> (L1_SHIFT - L2_SHIFT); 1684 if (pmap_mode == PMAP_MODE_SV39) { 1685 l1 = &pmap->pm_top[l1index]; 1686 if (pmap_load(l1) == 0) { 1687 /* recurse for allocating page dir */ 1688 if (_pmap_alloc_l3(pmap, NUL2E + l1index, 1689 lockp) == NULL) 1690 goto fail; 1691 } else { 1692 pdpg = PTE_TO_VM_PAGE(pmap_load(l1)); 1693 pdpg->ref_count++; 1694 } 1695 } else { 1696 l0index = l1index >> Ln_ENTRIES_SHIFT; 1697 l0 = &pmap->pm_top[l0index]; 1698 if (pmap_load(l0) == 0) { 1699 /* Recurse to allocate the L1 entry. */ 1700 if (_pmap_alloc_l3(pmap, NUL2E + l1index, 1701 lockp) == NULL) 1702 goto fail; 1703 phys = PTE_TO_PHYS(pmap_load(l0)); 1704 l1 = (pd_entry_t *)PHYS_TO_DMAP(phys); 1705 l1 = &l1[l1index & Ln_ADDR_MASK]; 1706 } else { 1707 phys = PTE_TO_PHYS(pmap_load(l0)); 1708 l1 = (pd_entry_t *)PHYS_TO_DMAP(phys); 1709 l1 = &l1[l1index & Ln_ADDR_MASK]; 1710 if (pmap_load(l1) == 0) { 1711 /* Recurse to allocate the L2 page. 
*/ 1712 if (_pmap_alloc_l3(pmap, 1713 NUL2E + l1index, lockp) == NULL) 1714 goto fail; 1715 } else { 1716 pdpg = PTE_TO_VM_PAGE(pmap_load(l1)); 1717 pdpg->ref_count++; 1718 } 1719 } 1720 } 1721 1722 phys = PTE_TO_PHYS(pmap_load(l1)); 1723 l2 = (pd_entry_t *)PHYS_TO_DMAP(phys); 1724 l2 = &l2[ptepindex & Ln_ADDR_MASK]; 1725 KASSERT((pmap_load(l2) & PTE_V) == 0, 1726 ("%s: L2 entry %#lx is valid", __func__, pmap_load(l2))); 1727 1728 entry = PTE_V | (pn << PTE_PPN0_S); 1729 pmap_store(l2, entry); 1730 } 1731 1732 pmap_resident_count_inc(pmap, 1); 1733 1734 return (m); 1735 1736 fail: 1737 vm_page_unwire_noq(m); 1738 vm_page_free_zero(m); 1739 return (NULL); 1740 } 1741 1742 static vm_page_t 1743 pmap_alloc_l2(pmap_t pmap, vm_offset_t va, struct rwlock **lockp) 1744 { 1745 pd_entry_t *l1; 1746 vm_page_t l2pg; 1747 vm_pindex_t l2pindex; 1748 1749 retry: 1750 l1 = pmap_l1(pmap, va); 1751 if (l1 != NULL && (pmap_load(l1) & PTE_V) != 0) { 1752 KASSERT((pmap_load(l1) & PTE_RWX) == 0, 1753 ("%s: L1 entry %#lx for VA %#lx is a leaf", __func__, 1754 pmap_load(l1), va)); 1755 /* Add a reference to the L2 page. */ 1756 l2pg = PTE_TO_VM_PAGE(pmap_load(l1)); 1757 l2pg->ref_count++; 1758 } else { 1759 /* Allocate a L2 page. */ 1760 l2pindex = pmap_l2_pindex(va) >> Ln_ENTRIES_SHIFT; 1761 l2pg = _pmap_alloc_l3(pmap, NUL2E + l2pindex, lockp); 1762 if (l2pg == NULL && lockp != NULL) 1763 goto retry; 1764 } 1765 return (l2pg); 1766 } 1767 1768 static vm_page_t 1769 pmap_alloc_l3(pmap_t pmap, vm_offset_t va, struct rwlock **lockp) 1770 { 1771 vm_pindex_t ptepindex; 1772 pd_entry_t *l2; 1773 vm_page_t m; 1774 1775 /* 1776 * Calculate pagetable page index 1777 */ 1778 ptepindex = pmap_l2_pindex(va); 1779 retry: 1780 /* 1781 * Get the page directory entry 1782 */ 1783 l2 = pmap_l2(pmap, va); 1784 1785 /* 1786 * If the page table page is mapped, we just increment the 1787 * hold count, and activate it. 1788 */ 1789 if (l2 != NULL && pmap_load(l2) != 0) { 1790 m = PTE_TO_VM_PAGE(pmap_load(l2)); 1791 m->ref_count++; 1792 } else { 1793 /* 1794 * Here if the pte page isn't mapped, or if it has been 1795 * deallocated. 1796 */ 1797 m = _pmap_alloc_l3(pmap, ptepindex, lockp); 1798 if (m == NULL && lockp != NULL) 1799 goto retry; 1800 } 1801 return (m); 1802 } 1803 1804 /*************************************************** 1805 * Pmap allocation/deallocation routines. 1806 ***************************************************/ 1807 1808 /* 1809 * Release any resources held by the given physical map. 1810 * Called when a pmap initialized by pmap_pinit is being released. 1811 * Should only be called if the map contains no valid mappings. 1812 */ 1813 void 1814 pmap_release(pmap_t pmap) 1815 { 1816 vm_page_t m; 1817 int npages; 1818 int i; 1819 1820 KASSERT(pmap->pm_stats.resident_count == 0, 1821 ("pmap_release: pmap resident count %ld != 0", 1822 pmap->pm_stats.resident_count)); 1823 KASSERT(CPU_EMPTY(&pmap->pm_active), 1824 ("releasing active pmap %p", pmap)); 1825 1826 if (pmap->pm_stage == PM_STAGE2) 1827 goto finish; 1828 1829 if (pmap_mode == PMAP_MODE_SV39) { 1830 mtx_lock(&allpmaps_lock); 1831 LIST_REMOVE(pmap, pm_list); 1832 mtx_unlock(&allpmaps_lock); 1833 } 1834 1835 finish: 1836 npages = pmap->pm_stage == PM_STAGE2 ? 
4 : 1;
	m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pmap->pm_top));
	for (i = 0; i < npages; i++) {
		vm_page_unwire_noq(m);
		vm_page_free(m);
		m++;
	}
}

static int
kvm_size(SYSCTL_HANDLER_ARGS)
{
	unsigned long ksize = VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS;

	return sysctl_handle_long(oidp, &ksize, 0, req);
}
SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG | CTLFLAG_RD | CTLFLAG_MPSAFE,
    0, 0, kvm_size, "LU",
    "Size of KVM");

static int
kvm_free(SYSCTL_HANDLER_ARGS)
{
	unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end;

	return sysctl_handle_long(oidp, &kfree, 0, req);
}
SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG | CTLFLAG_RD | CTLFLAG_MPSAFE,
    0, 0, kvm_free, "LU",
    "Amount of KVM free");

/*
 * grow the number of kernel page table entries, if needed
 */
void
pmap_growkernel(vm_offset_t addr)
{
	vm_paddr_t paddr;
	vm_page_t nkpg;
	pd_entry_t *l1, *l2;
	pt_entry_t entry;
	pn_t pn;

	mtx_assert(&kernel_map->system_mtx, MA_OWNED);

	addr = roundup2(addr, L2_SIZE);
	if (addr - 1 >= vm_map_max(kernel_map))
		addr = vm_map_max(kernel_map);
	while (kernel_vm_end < addr) {
		l1 = pmap_l1(kernel_pmap, kernel_vm_end);
		if (pmap_load(l1) == 0) {
			/* We need a new PDP entry */
			nkpg = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT |
			    VM_ALLOC_NOFREE | VM_ALLOC_WIRED | VM_ALLOC_ZERO);
			if (nkpg == NULL)
				panic("pmap_growkernel: no memory to grow kernel");
			nkpg->pindex = kernel_vm_end >> L1_SHIFT;
			paddr = VM_PAGE_TO_PHYS(nkpg);

			pn = (paddr / PAGE_SIZE);
			entry = (PTE_V);
			entry |= (pn << PTE_PPN0_S);
			pmap_store(l1, entry);
			pmap_distribute_l1(kernel_pmap,
			    pmap_l1_index(kernel_vm_end), entry);
			continue; /* try again */
		}
		l2 = pmap_l1_to_l2(l1, kernel_vm_end);
		if ((pmap_load(l2) & PTE_V) != 0 &&
		    (pmap_load(l2) & PTE_RWX) == 0) {
			kernel_vm_end = (kernel_vm_end + L2_SIZE) & ~L2_OFFSET;
			if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) {
				kernel_vm_end = vm_map_max(kernel_map);
				break;
			}
			continue;
		}

		nkpg = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT |
		    VM_ALLOC_NOFREE | VM_ALLOC_WIRED | VM_ALLOC_ZERO);
		if (nkpg == NULL)
			panic("pmap_growkernel: no memory to grow kernel");
		nkpg->pindex = kernel_vm_end >> L2_SHIFT;
		paddr = VM_PAGE_TO_PHYS(nkpg);

		pn = (paddr / PAGE_SIZE);
		entry = (PTE_V);
		entry |= (pn << PTE_PPN0_S);
		pmap_store(l2, entry);

		pmap_invalidate_page(kernel_pmap, kernel_vm_end);

		kernel_vm_end = (kernel_vm_end + L2_SIZE) & ~L2_OFFSET;
		if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) {
			kernel_vm_end = vm_map_max(kernel_map);
			break;
		}
	}
}
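/*
 * A worked example of the growth step above, assuming 4KB pages and
 * 512-entry page tables: each new L3 page table page installed here extends
 * the mapped KVA range by L2_SIZE (2MB), and a single L1 entry spans
 * Ln_ENTRIES * L2_SIZE = 1GB, so a new L1 ("PDP") entry is needed only once
 * per 1GB of growth.  kernel_vm_end is always advanced in L2_SIZE-aligned
 * steps.
 */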
_NPCM - 2] = PC_FREEN, 1942 [_NPCM - 1] = PC_FREEL 1943 }; 1944 1945 #ifdef PV_STATS 1946 static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail; 1947 1948 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0, 1949 "Current number of pv entry chunks"); 1950 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0, 1951 "Current number of pv entry chunks allocated"); 1952 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0, 1953 "Current number of pv entry chunks frees"); 1954 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0, 1955 "Number of times tried to get a chunk page but failed."); 1956 1957 static long pv_entry_frees, pv_entry_allocs, pv_entry_count; 1958 static int pv_entry_spare; 1959 1960 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0, 1961 "Current number of pv entry frees"); 1962 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0, 1963 "Current number of pv entry allocs"); 1964 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0, 1965 "Current number of pv entries"); 1966 SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0, 1967 "Current number of spare pv entries"); 1968 #endif 1969 1970 /* 1971 * We are in a serious low memory condition. Resort to 1972 * drastic measures to free some pages so we can allocate 1973 * another pv entry chunk. 1974 * 1975 * Returns NULL if PV entries were reclaimed from the specified pmap. 1976 * 1977 * We do not, however, unmap 2mpages because subsequent accesses will 1978 * allocate per-page pv entries until repromotion occurs, thereby 1979 * exacerbating the shortage of free pv entries. 1980 */ 1981 static vm_page_t 1982 reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp) 1983 { 1984 1985 panic("RISCVTODO: reclaim_pv_chunk"); 1986 } 1987 1988 /* 1989 * free the pv_entry back to the free list 1990 */ 1991 static void 1992 free_pv_entry(pmap_t pmap, pv_entry_t pv) 1993 { 1994 struct pv_chunk *pc; 1995 int idx, field, bit; 1996 1997 rw_assert(&pvh_global_lock, RA_LOCKED); 1998 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 1999 PV_STAT(atomic_add_long(&pv_entry_frees, 1)); 2000 PV_STAT(atomic_add_int(&pv_entry_spare, 1)); 2001 PV_STAT(atomic_subtract_long(&pv_entry_count, 1)); 2002 pc = pv_to_chunk(pv); 2003 idx = pv - &pc->pc_pventry[0]; 2004 field = idx / 64; 2005 bit = idx % 64; 2006 pc->pc_map[field] |= 1ul << bit; 2007 if (!pc_is_free(pc)) { 2008 /* 98% of the time, pc is already at the head of the list. 
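		 * Re-inserting it at the head below keeps get_pv_entry(),
		 * which always allocates from the front of pm_pvchunk,
		 * pointed at chunks that still have free slots.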
*/ 2009 if (__predict_false(pc != TAILQ_FIRST(&pmap->pm_pvchunk))) { 2010 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 2011 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); 2012 } 2013 return; 2014 } 2015 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 2016 free_pv_chunk(pc); 2017 } 2018 2019 static void 2020 free_pv_chunk(struct pv_chunk *pc) 2021 { 2022 vm_page_t m; 2023 2024 mtx_lock(&pv_chunks_mutex); 2025 TAILQ_REMOVE(&pv_chunks, pc, pc_lru); 2026 mtx_unlock(&pv_chunks_mutex); 2027 PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV)); 2028 PV_STAT(atomic_subtract_int(&pc_chunk_count, 1)); 2029 PV_STAT(atomic_add_int(&pc_chunk_frees, 1)); 2030 /* entire chunk is free, return it */ 2031 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc)); 2032 dump_drop_page(m->phys_addr); 2033 vm_page_unwire_noq(m); 2034 vm_page_free(m); 2035 } 2036 2037 /* 2038 * Returns a new PV entry, allocating a new PV chunk from the system when 2039 * needed. If this PV chunk allocation fails and a PV list lock pointer was 2040 * given, a PV chunk is reclaimed from an arbitrary pmap. Otherwise, NULL is 2041 * returned. 2042 * 2043 * The given PV list lock may be released. 2044 */ 2045 static pv_entry_t 2046 get_pv_entry(pmap_t pmap, struct rwlock **lockp) 2047 { 2048 int bit, field; 2049 pv_entry_t pv; 2050 struct pv_chunk *pc; 2051 vm_page_t m; 2052 2053 rw_assert(&pvh_global_lock, RA_LOCKED); 2054 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2055 PV_STAT(atomic_add_long(&pv_entry_allocs, 1)); 2056 retry: 2057 pc = TAILQ_FIRST(&pmap->pm_pvchunk); 2058 if (pc != NULL) { 2059 for (field = 0; field < _NPCM; field++) { 2060 if (pc->pc_map[field]) { 2061 bit = ffsl(pc->pc_map[field]) - 1; 2062 break; 2063 } 2064 } 2065 if (field < _NPCM) { 2066 pv = &pc->pc_pventry[field * 64 + bit]; 2067 pc->pc_map[field] &= ~(1ul << bit); 2068 /* If this was the last item, move it to tail */ 2069 if (pc_is_full(pc)) { 2070 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 2071 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, 2072 pc_list); 2073 } 2074 PV_STAT(atomic_add_long(&pv_entry_count, 1)); 2075 PV_STAT(atomic_subtract_int(&pv_entry_spare, 1)); 2076 return (pv); 2077 } 2078 } 2079 /* No free items, allocate another chunk */ 2080 m = vm_page_alloc_noobj(VM_ALLOC_WIRED); 2081 if (m == NULL) { 2082 if (lockp == NULL) { 2083 PV_STAT(pc_chunk_tryfail++); 2084 return (NULL); 2085 } 2086 m = reclaim_pv_chunk(pmap, lockp); 2087 if (m == NULL) 2088 goto retry; 2089 } 2090 PV_STAT(atomic_add_int(&pc_chunk_count, 1)); 2091 PV_STAT(atomic_add_int(&pc_chunk_allocs, 1)); 2092 dump_add_page(m->phys_addr); 2093 pc = (void *)PHYS_TO_DMAP(m->phys_addr); 2094 pc->pc_pmap = pmap; 2095 pc->pc_map[0] = PC_FREEN & ~1ul; /* preallocated bit 0 */ 2096 pc->pc_map[1] = PC_FREEN; 2097 pc->pc_map[2] = PC_FREEL; 2098 mtx_lock(&pv_chunks_mutex); 2099 TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru); 2100 mtx_unlock(&pv_chunks_mutex); 2101 pv = &pc->pc_pventry[0]; 2102 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); 2103 PV_STAT(atomic_add_long(&pv_entry_count, 1)); 2104 PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV - 1)); 2105 return (pv); 2106 } 2107 2108 /* 2109 * Ensure that the number of spare PV entries in the specified pmap meets or 2110 * exceeds the given count, "needed". 2111 * 2112 * The given PV list lock may be released. 
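 *
 * As a rough illustration of the sizing involved: demoting one 2MB
 * mapping asks for Ln_ENTRIES - 1 (511) spare entries, so with _NPCPV
 * entries per chunk this routine may need to allocate several fresh
 * chunks on top of whatever partially full chunks the pmap already has.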
2113 */ 2114 static void 2115 reserve_pv_entries(pmap_t pmap, int needed, struct rwlock **lockp) 2116 { 2117 struct pch new_tail; 2118 struct pv_chunk *pc; 2119 vm_page_t m; 2120 int avail, free; 2121 bool reclaimed; 2122 2123 rw_assert(&pvh_global_lock, RA_LOCKED); 2124 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2125 KASSERT(lockp != NULL, ("reserve_pv_entries: lockp is NULL")); 2126 2127 /* 2128 * Newly allocated PV chunks must be stored in a private list until 2129 * the required number of PV chunks have been allocated. Otherwise, 2130 * reclaim_pv_chunk() could recycle one of these chunks. In 2131 * contrast, these chunks must be added to the pmap upon allocation. 2132 */ 2133 TAILQ_INIT(&new_tail); 2134 retry: 2135 avail = 0; 2136 TAILQ_FOREACH(pc, &pmap->pm_pvchunk, pc_list) { 2137 bit_count((bitstr_t *)pc->pc_map, 0, 2138 sizeof(pc->pc_map) * NBBY, &free); 2139 if (free == 0) 2140 break; 2141 avail += free; 2142 if (avail >= needed) 2143 break; 2144 } 2145 for (reclaimed = false; avail < needed; avail += _NPCPV) { 2146 m = vm_page_alloc_noobj(VM_ALLOC_WIRED); 2147 if (m == NULL) { 2148 m = reclaim_pv_chunk(pmap, lockp); 2149 if (m == NULL) 2150 goto retry; 2151 reclaimed = true; 2152 } 2153 PV_STAT(atomic_add_int(&pc_chunk_count, 1)); 2154 PV_STAT(atomic_add_int(&pc_chunk_allocs, 1)); 2155 dump_add_page(m->phys_addr); 2156 pc = (void *)PHYS_TO_DMAP(m->phys_addr); 2157 pc->pc_pmap = pmap; 2158 pc->pc_map[0] = PC_FREEN; 2159 pc->pc_map[1] = PC_FREEN; 2160 pc->pc_map[2] = PC_FREEL; 2161 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); 2162 TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru); 2163 PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV)); 2164 2165 /* 2166 * The reclaim might have freed a chunk from the current pmap. 2167 * If that chunk contained available entries, we need to 2168 * re-count the number of available entries. 2169 */ 2170 if (reclaimed) 2171 goto retry; 2172 } 2173 if (!TAILQ_EMPTY(&new_tail)) { 2174 mtx_lock(&pv_chunks_mutex); 2175 TAILQ_CONCAT(&pv_chunks, &new_tail, pc_lru); 2176 mtx_unlock(&pv_chunks_mutex); 2177 } 2178 } 2179 2180 /* 2181 * First find and then remove the pv entry for the specified pmap and virtual 2182 * address from the specified pv list. Returns the pv entry if found and NULL 2183 * otherwise. This operation can be performed on pv lists for either 4KB or 2184 * 2MB page mappings. 2185 */ 2186 static __inline pv_entry_t 2187 pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va) 2188 { 2189 pv_entry_t pv; 2190 2191 rw_assert(&pvh_global_lock, RA_LOCKED); 2192 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 2193 if (pmap == PV_PMAP(pv) && va == pv->pv_va) { 2194 TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); 2195 pvh->pv_gen++; 2196 break; 2197 } 2198 } 2199 return (pv); 2200 } 2201 2202 /* 2203 * First find and then destroy the pv entry for the specified pmap and virtual 2204 * address. This operation can be performed on pv lists for either 4KB or 2MB 2205 * page mappings. 2206 */ 2207 static void 2208 pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va) 2209 { 2210 pv_entry_t pv; 2211 2212 pv = pmap_pvh_remove(pvh, pmap, va); 2213 2214 KASSERT(pv != NULL, ("pmap_pvh_free: pv not found for %#lx", va)); 2215 free_pv_entry(pmap, pv); 2216 } 2217 2218 /* 2219 * Conditionally create the PV entry for a 4KB page mapping if the required 2220 * memory can be allocated without resorting to reclamation. 
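 *
 * The "without reclamation" part comes from handing get_pv_entry() a
 * NULL lock pointer: a failed chunk allocation then returns NULL right
 * away instead of calling reclaim_pv_chunk().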
2221 */ 2222 static bool 2223 pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m, 2224 struct rwlock **lockp) 2225 { 2226 pv_entry_t pv; 2227 2228 rw_assert(&pvh_global_lock, RA_LOCKED); 2229 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2230 /* Pass NULL instead of the lock pointer to disable reclamation. */ 2231 if ((pv = get_pv_entry(pmap, NULL)) != NULL) { 2232 pv->pv_va = va; 2233 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); 2234 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 2235 m->md.pv_gen++; 2236 return (true); 2237 } else 2238 return (false); 2239 } 2240 2241 /* 2242 * After demotion from a 2MB page mapping to 512 4KB page mappings, 2243 * destroy the pv entry for the 2MB page mapping and reinstantiate the pv 2244 * entries for each of the 4KB page mappings. 2245 */ 2246 static void __unused 2247 pmap_pv_demote_l2(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, 2248 struct rwlock **lockp) 2249 { 2250 struct md_page *pvh; 2251 struct pv_chunk *pc; 2252 pv_entry_t pv; 2253 vm_page_t m; 2254 vm_offset_t va_last; 2255 int bit, field; 2256 2257 rw_assert(&pvh_global_lock, RA_LOCKED); 2258 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2259 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); 2260 2261 /* 2262 * Transfer the 2mpage's pv entry for this mapping to the first 2263 * page's pv list. Once this transfer begins, the pv list lock 2264 * must not be released until the last pv entry is reinstantiated. 2265 */ 2266 pvh = pa_to_pvh(pa); 2267 va &= ~L2_OFFSET; 2268 pv = pmap_pvh_remove(pvh, pmap, va); 2269 KASSERT(pv != NULL, ("pmap_pv_demote_l2: pv not found")); 2270 m = PHYS_TO_VM_PAGE(pa); 2271 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 2272 m->md.pv_gen++; 2273 /* Instantiate the remaining 511 pv entries. */ 2274 PV_STAT(atomic_add_long(&pv_entry_allocs, Ln_ENTRIES - 1)); 2275 va_last = va + L2_SIZE - PAGE_SIZE; 2276 for (;;) { 2277 pc = TAILQ_FIRST(&pmap->pm_pvchunk); 2278 KASSERT(!pc_is_full(pc), ("pmap_pv_demote_l2: missing spare")); 2279 for (field = 0; field < _NPCM; field++) { 2280 while (pc->pc_map[field] != 0) { 2281 bit = ffsl(pc->pc_map[field]) - 1; 2282 pc->pc_map[field] &= ~(1ul << bit); 2283 pv = &pc->pc_pventry[field * 64 + bit]; 2284 va += PAGE_SIZE; 2285 pv->pv_va = va; 2286 m++; 2287 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 2288 ("pmap_pv_demote_l2: page %p is not managed", m)); 2289 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 2290 m->md.pv_gen++; 2291 if (va == va_last) 2292 goto out; 2293 } 2294 } 2295 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 2296 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); 2297 } 2298 out: 2299 if (pc_is_full(pc)) { 2300 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 2301 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); 2302 } 2303 PV_STAT(atomic_add_long(&pv_entry_count, Ln_ENTRIES - 1)); 2304 PV_STAT(atomic_add_int(&pv_entry_spare, -(Ln_ENTRIES - 1))); 2305 } 2306 2307 #if VM_NRESERVLEVEL > 0 2308 static void 2309 pmap_pv_promote_l2(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, 2310 struct rwlock **lockp) 2311 { 2312 struct md_page *pvh; 2313 pv_entry_t pv; 2314 vm_page_t m; 2315 vm_offset_t va_last; 2316 2317 rw_assert(&pvh_global_lock, RA_LOCKED); 2318 KASSERT((pa & L2_OFFSET) == 0, 2319 ("pmap_pv_promote_l2: misaligned pa %#lx", pa)); 2320 2321 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); 2322 2323 m = PHYS_TO_VM_PAGE(pa); 2324 va = va & ~L2_OFFSET; 2325 pv = pmap_pvh_remove(&m->md, pmap, va); 2326 KASSERT(pv != NULL, ("pmap_pv_promote_l2: pv for %#lx not found", va)); 2327 pvh = pa_to_pvh(pa); 2328 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); 
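	/*
	 * The generation bump just below lets pv list walkers that drop the
	 * lock notice this change, and the loop that follows releases the
	 * now-redundant pv entries for the remaining Ln_ENTRIES - 1 4KB
	 * pages of the superpage.
	 */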
2329 pvh->pv_gen++; 2330 2331 va_last = va + L2_SIZE - PAGE_SIZE; 2332 do { 2333 m++; 2334 va += PAGE_SIZE; 2335 pmap_pvh_free(&m->md, pmap, va); 2336 } while (va < va_last); 2337 } 2338 #endif /* VM_NRESERVLEVEL > 0 */ 2339 2340 /* 2341 * Create the PV entry for a 2MB page mapping. Always returns true unless the 2342 * flag PMAP_ENTER_NORECLAIM is specified. If that flag is specified, returns 2343 * false if the PV entry cannot be allocated without resorting to reclamation. 2344 */ 2345 static bool 2346 pmap_pv_insert_l2(pmap_t pmap, vm_offset_t va, pd_entry_t l2e, u_int flags, 2347 struct rwlock **lockp) 2348 { 2349 struct md_page *pvh; 2350 pv_entry_t pv; 2351 vm_paddr_t pa; 2352 2353 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2354 /* Pass NULL instead of the lock pointer to disable reclamation. */ 2355 if ((pv = get_pv_entry(pmap, (flags & PMAP_ENTER_NORECLAIM) != 0 ? 2356 NULL : lockp)) == NULL) 2357 return (false); 2358 pv->pv_va = va; 2359 pa = PTE_TO_PHYS(l2e); 2360 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); 2361 pvh = pa_to_pvh(pa); 2362 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); 2363 pvh->pv_gen++; 2364 return (true); 2365 } 2366 2367 static void 2368 pmap_remove_kernel_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va) 2369 { 2370 pt_entry_t newl2, oldl2 __diagused; 2371 vm_page_t ml3; 2372 vm_paddr_t ml3pa; 2373 2374 KASSERT(!VIRT_IN_DMAP(va), ("removing direct mapping of %#lx", va)); 2375 KASSERT(pmap == kernel_pmap, ("pmap %p is not kernel_pmap", pmap)); 2376 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2377 2378 ml3 = pmap_remove_pt_page(pmap, va); 2379 if (ml3 == NULL) 2380 panic("pmap_remove_kernel_l2: Missing pt page"); 2381 2382 ml3pa = VM_PAGE_TO_PHYS(ml3); 2383 newl2 = ml3pa | PTE_V; 2384 2385 /* 2386 * If this page table page was unmapped by a promotion, then it 2387 * contains valid mappings. Zero it to invalidate those mappings. 2388 */ 2389 if (vm_page_any_valid(ml3)) 2390 pagezero((void *)PHYS_TO_DMAP(ml3pa)); 2391 2392 /* 2393 * Demote the mapping. 2394 */ 2395 oldl2 = pmap_load_store(l2, newl2); 2396 KASSERT(oldl2 == 0, ("%s: found existing mapping at %p: %#lx", 2397 __func__, l2, oldl2)); 2398 } 2399 2400 /* 2401 * pmap_remove_l2: Do the things to unmap a level 2 superpage. 2402 */ 2403 static int 2404 pmap_remove_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva, 2405 pd_entry_t l1e, struct spglist *free, struct rwlock **lockp) 2406 { 2407 struct md_page *pvh; 2408 pt_entry_t oldl2; 2409 vm_offset_t eva, va; 2410 vm_page_t m, ml3; 2411 2412 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2413 KASSERT((sva & L2_OFFSET) == 0, ("pmap_remove_l2: sva is not aligned")); 2414 oldl2 = pmap_load_clear(l2); 2415 KASSERT((oldl2 & PTE_RWX) != 0, 2416 ("pmap_remove_l2: L2e %lx is not a superpage mapping", oldl2)); 2417 2418 /* 2419 * The sfence.vma documentation states that it is sufficient to specify 2420 * a single address within a superpage mapping. However, since we do 2421 * not perform any invalidation upon promotion, TLBs may still be 2422 * caching 4KB mappings within the superpage, so we must invalidate the 2423 * entire range. 
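	 * Hence the call just below spans the full L2_SIZE, i.e. all
	 * Ln_ENTRIES base pages, matching the wired and resident count
	 * adjustments that follow it.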
2424 */ 2425 pmap_invalidate_range(pmap, sva, sva + L2_SIZE); 2426 if ((oldl2 & PTE_SW_WIRED) != 0) 2427 pmap->pm_stats.wired_count -= L2_SIZE / PAGE_SIZE; 2428 pmap_resident_count_dec(pmap, L2_SIZE / PAGE_SIZE); 2429 if ((oldl2 & PTE_SW_MANAGED) != 0) { 2430 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, PTE_TO_PHYS(oldl2)); 2431 pvh = pa_to_pvh(PTE_TO_PHYS(oldl2)); 2432 pmap_pvh_free(pvh, pmap, sva); 2433 eva = sva + L2_SIZE; 2434 for (va = sva, m = PTE_TO_VM_PAGE(oldl2); 2435 va < eva; va += PAGE_SIZE, m++) { 2436 if ((oldl2 & PTE_D) != 0) 2437 vm_page_dirty(m); 2438 if ((oldl2 & PTE_A) != 0) 2439 vm_page_aflag_set(m, PGA_REFERENCED); 2440 if (TAILQ_EMPTY(&m->md.pv_list) && 2441 TAILQ_EMPTY(&pvh->pv_list)) 2442 vm_page_aflag_clear(m, PGA_WRITEABLE); 2443 } 2444 } 2445 if (pmap == kernel_pmap) { 2446 pmap_remove_kernel_l2(pmap, l2, sva); 2447 } else { 2448 ml3 = pmap_remove_pt_page(pmap, sva); 2449 if (ml3 != NULL) { 2450 KASSERT(vm_page_any_valid(ml3), 2451 ("pmap_remove_l2: l3 page not promoted")); 2452 pmap_resident_count_dec(pmap, 1); 2453 KASSERT(ml3->ref_count == Ln_ENTRIES, 2454 ("pmap_remove_l2: l3 page ref count error")); 2455 ml3->ref_count = 1; 2456 vm_page_unwire_noq(ml3); 2457 pmap_add_delayed_free_list(ml3, free, false); 2458 } 2459 } 2460 return (pmap_unuse_pt(pmap, sva, l1e, free)); 2461 } 2462 2463 /* 2464 * pmap_remove_l3: do the things to unmap a page in a process 2465 */ 2466 static int 2467 pmap_remove_l3(pmap_t pmap, pt_entry_t *l3, vm_offset_t va, 2468 pd_entry_t l2e, struct spglist *free, struct rwlock **lockp) 2469 { 2470 struct md_page *pvh; 2471 pt_entry_t old_l3; 2472 vm_page_t m; 2473 2474 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2475 old_l3 = pmap_load_clear(l3); 2476 pmap_invalidate_page(pmap, va); 2477 if (old_l3 & PTE_SW_WIRED) 2478 pmap->pm_stats.wired_count -= 1; 2479 pmap_resident_count_dec(pmap, 1); 2480 if (old_l3 & PTE_SW_MANAGED) { 2481 m = PTE_TO_VM_PAGE(old_l3); 2482 if ((old_l3 & PTE_D) != 0) 2483 vm_page_dirty(m); 2484 if (old_l3 & PTE_A) 2485 vm_page_aflag_set(m, PGA_REFERENCED); 2486 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); 2487 pmap_pvh_free(&m->md, pmap, va); 2488 if (TAILQ_EMPTY(&m->md.pv_list) && 2489 (m->flags & PG_FICTITIOUS) == 0) { 2490 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 2491 if (TAILQ_EMPTY(&pvh->pv_list)) 2492 vm_page_aflag_clear(m, PGA_WRITEABLE); 2493 } 2494 } 2495 2496 return (pmap_unuse_pt(pmap, va, l2e, free)); 2497 } 2498 2499 /* 2500 * Remove the given range of addresses from the specified map. 2501 * 2502 * It is assumed that the start and end are properly 2503 * rounded to the page size. 2504 */ 2505 void 2506 pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 2507 { 2508 struct spglist free; 2509 struct rwlock *lock; 2510 vm_offset_t va, va_next; 2511 pd_entry_t *l0, *l1, *l2, l2e; 2512 pt_entry_t *l3; 2513 2514 /* 2515 * Perform an unsynchronized read. This is, however, safe. 
2516 */ 2517 if (pmap->pm_stats.resident_count == 0) 2518 return; 2519 2520 SLIST_INIT(&free); 2521 2522 rw_rlock(&pvh_global_lock); 2523 PMAP_LOCK(pmap); 2524 2525 lock = NULL; 2526 for (; sva < eva; sva = va_next) { 2527 if (pmap->pm_stats.resident_count == 0) 2528 break; 2529 2530 if (pmap_mode == PMAP_MODE_SV48) { 2531 l0 = pmap_l0(pmap, sva); 2532 if (pmap_load(l0) == 0) { 2533 va_next = (sva + L0_SIZE) & ~L0_OFFSET; 2534 if (va_next < sva) 2535 va_next = eva; 2536 continue; 2537 } 2538 l1 = pmap_l0_to_l1(l0, sva); 2539 } else { 2540 l1 = pmap_l1(pmap, sva); 2541 } 2542 2543 if (pmap_load(l1) == 0) { 2544 va_next = (sva + L1_SIZE) & ~L1_OFFSET; 2545 if (va_next < sva) 2546 va_next = eva; 2547 continue; 2548 } 2549 2550 /* 2551 * Calculate index for next page table. 2552 */ 2553 va_next = (sva + L2_SIZE) & ~L2_OFFSET; 2554 if (va_next < sva) 2555 va_next = eva; 2556 2557 l2 = pmap_l1_to_l2(l1, sva); 2558 if ((l2e = pmap_load(l2)) == 0) 2559 continue; 2560 if ((l2e & PTE_RWX) != 0) { 2561 if (sva + L2_SIZE == va_next && eva >= va_next) { 2562 (void)pmap_remove_l2(pmap, l2, sva, 2563 pmap_load(l1), &free, &lock); 2564 continue; 2565 } else if (!pmap_demote_l2_locked(pmap, l2, sva, 2566 &lock)) { 2567 /* 2568 * The large page mapping was destroyed. 2569 */ 2570 continue; 2571 } 2572 l2e = pmap_load(l2); 2573 } 2574 2575 /* 2576 * Limit our scan to either the end of the va represented 2577 * by the current page table page, or to the end of the 2578 * range being removed. 2579 */ 2580 if (va_next > eva) 2581 va_next = eva; 2582 2583 va = va_next; 2584 for (l3 = pmap_l2_to_l3(l2, sva); sva != va_next; l3++, 2585 sva += L3_SIZE) { 2586 if (pmap_load(l3) == 0) { 2587 if (va != va_next) { 2588 pmap_invalidate_range(pmap, va, sva); 2589 va = va_next; 2590 } 2591 continue; 2592 } 2593 if (va == va_next) 2594 va = sva; 2595 if (pmap_remove_l3(pmap, l3, sva, l2e, &free, &lock)) { 2596 sva += L3_SIZE; 2597 break; 2598 } 2599 } 2600 if (va != va_next) 2601 pmap_invalidate_range(pmap, va, sva); 2602 } 2603 if (lock != NULL) 2604 rw_wunlock(lock); 2605 rw_runlock(&pvh_global_lock); 2606 PMAP_UNLOCK(pmap); 2607 vm_page_free_pages_toq(&free, false); 2608 } 2609 2610 /* 2611 * Routine: pmap_remove_all 2612 * Function: 2613 * Removes this physical page from 2614 * all physical maps in which it resides. 2615 * Reflects back modify bits to the pager. 2616 * 2617 * Notes: 2618 * Original versions of this routine were very 2619 * inefficient because they iteratively called 2620 * pmap_remove (slow...) 2621 */ 2622 2623 void 2624 pmap_remove_all(vm_page_t m) 2625 { 2626 struct spglist free; 2627 struct md_page *pvh; 2628 pmap_t pmap; 2629 pt_entry_t *l3, l3e; 2630 pd_entry_t *l2, l2e __diagused; 2631 pv_entry_t pv; 2632 vm_offset_t va; 2633 2634 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 2635 ("pmap_remove_all: page %p is not managed", m)); 2636 SLIST_INIT(&free); 2637 pvh = (m->flags & PG_FICTITIOUS) != 0 ? 
&pv_dummy : 2638 pa_to_pvh(VM_PAGE_TO_PHYS(m)); 2639 2640 rw_wlock(&pvh_global_lock); 2641 while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) { 2642 pmap = PV_PMAP(pv); 2643 PMAP_LOCK(pmap); 2644 va = pv->pv_va; 2645 l2 = pmap_l2(pmap, va); 2646 (void)pmap_demote_l2(pmap, l2, va); 2647 PMAP_UNLOCK(pmap); 2648 } 2649 while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { 2650 pmap = PV_PMAP(pv); 2651 PMAP_LOCK(pmap); 2652 pmap_resident_count_dec(pmap, 1); 2653 l2 = pmap_l2(pmap, pv->pv_va); 2654 KASSERT(l2 != NULL, ("pmap_remove_all: no l2 table found")); 2655 l2e = pmap_load(l2); 2656 2657 KASSERT((l2e & PTE_RX) == 0, 2658 ("pmap_remove_all: found a superpage in %p's pv list", m)); 2659 2660 l3 = pmap_l2_to_l3(l2, pv->pv_va); 2661 l3e = pmap_load_clear(l3); 2662 pmap_invalidate_page(pmap, pv->pv_va); 2663 if (l3e & PTE_SW_WIRED) 2664 pmap->pm_stats.wired_count--; 2665 if ((l3e & PTE_A) != 0) 2666 vm_page_aflag_set(m, PGA_REFERENCED); 2667 2668 /* 2669 * Update the vm_page_t clean and reference bits. 2670 */ 2671 if ((l3e & PTE_D) != 0) 2672 vm_page_dirty(m); 2673 pmap_unuse_pt(pmap, pv->pv_va, pmap_load(l2), &free); 2674 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 2675 m->md.pv_gen++; 2676 free_pv_entry(pmap, pv); 2677 PMAP_UNLOCK(pmap); 2678 } 2679 vm_page_aflag_clear(m, PGA_WRITEABLE); 2680 rw_wunlock(&pvh_global_lock); 2681 vm_page_free_pages_toq(&free, false); 2682 } 2683 2684 /* 2685 * Set the physical protection on the 2686 * specified range of this map as requested. 2687 */ 2688 void 2689 pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) 2690 { 2691 pd_entry_t *l0, *l1, *l2, l2e; 2692 pt_entry_t *l3, l3e, mask; 2693 vm_page_t m, mt; 2694 vm_offset_t va_next; 2695 bool anychanged, pv_lists_locked; 2696 2697 if ((prot & VM_PROT_READ) == VM_PROT_NONE) { 2698 pmap_remove(pmap, sva, eva); 2699 return; 2700 } 2701 2702 if ((prot & (VM_PROT_WRITE | VM_PROT_EXECUTE)) == 2703 (VM_PROT_WRITE | VM_PROT_EXECUTE)) 2704 return; 2705 2706 anychanged = false; 2707 pv_lists_locked = false; 2708 mask = 0; 2709 if ((prot & VM_PROT_WRITE) == 0) 2710 mask |= PTE_W | PTE_D; 2711 if ((prot & VM_PROT_EXECUTE) == 0) 2712 mask |= PTE_X; 2713 resume: 2714 PMAP_LOCK(pmap); 2715 for (; sva < eva; sva = va_next) { 2716 if (pmap_mode == PMAP_MODE_SV48) { 2717 l0 = pmap_l0(pmap, sva); 2718 if (pmap_load(l0) == 0) { 2719 va_next = (sva + L0_SIZE) & ~L0_OFFSET; 2720 if (va_next < sva) 2721 va_next = eva; 2722 continue; 2723 } 2724 l1 = pmap_l0_to_l1(l0, sva); 2725 } else { 2726 l1 = pmap_l1(pmap, sva); 2727 } 2728 2729 if (pmap_load(l1) == 0) { 2730 va_next = (sva + L1_SIZE) & ~L1_OFFSET; 2731 if (va_next < sva) 2732 va_next = eva; 2733 continue; 2734 } 2735 2736 va_next = (sva + L2_SIZE) & ~L2_OFFSET; 2737 if (va_next < sva) 2738 va_next = eva; 2739 2740 l2 = pmap_l1_to_l2(l1, sva); 2741 if ((l2e = pmap_load(l2)) == 0) 2742 continue; 2743 if ((l2e & PTE_RWX) != 0) { 2744 if (sva + L2_SIZE == va_next && eva >= va_next) { 2745 retryl2: 2746 if ((prot & VM_PROT_WRITE) == 0 && 2747 (l2e & (PTE_SW_MANAGED | PTE_D)) == 2748 (PTE_SW_MANAGED | PTE_D)) { 2749 m = PTE_TO_VM_PAGE(l2e); 2750 for (mt = m; mt < &m[Ln_ENTRIES]; mt++) 2751 vm_page_dirty(mt); 2752 } 2753 if (!atomic_fcmpset_long(l2, &l2e, l2e & ~mask)) 2754 goto retryl2; 2755 anychanged = true; 2756 continue; 2757 } else { 2758 if (!pv_lists_locked) { 2759 pv_lists_locked = true; 2760 if (!rw_try_rlock(&pvh_global_lock)) { 2761 if (anychanged) 2762 pmap_invalidate_all( 2763 pmap); 2764 PMAP_UNLOCK(pmap); 2765 rw_rlock(&pvh_global_lock); 2766 
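						/*
						 * pvh_global_lock is taken before the pmap lock
						 * elsewhere, so the pmap lock was dropped above;
						 * "resume" retakes it and rescans from the
						 * current sva.
						 */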
goto resume; 2767 } 2768 } 2769 if (!pmap_demote_l2(pmap, l2, sva)) { 2770 /* 2771 * The large page mapping was destroyed. 2772 */ 2773 continue; 2774 } 2775 } 2776 } 2777 2778 if (va_next > eva) 2779 va_next = eva; 2780 2781 for (l3 = pmap_l2_to_l3(l2, sva); sva != va_next; l3++, 2782 sva += L3_SIZE) { 2783 l3e = pmap_load(l3); 2784 retryl3: 2785 if ((l3e & PTE_V) == 0) 2786 continue; 2787 if ((prot & VM_PROT_WRITE) == 0 && 2788 (l3e & (PTE_SW_MANAGED | PTE_D)) == 2789 (PTE_SW_MANAGED | PTE_D)) { 2790 m = PTE_TO_VM_PAGE(l3e); 2791 vm_page_dirty(m); 2792 } 2793 if (!atomic_fcmpset_long(l3, &l3e, l3e & ~mask)) 2794 goto retryl3; 2795 anychanged = true; 2796 } 2797 } 2798 if (anychanged) 2799 pmap_invalidate_all(pmap); 2800 if (pv_lists_locked) 2801 rw_runlock(&pvh_global_lock); 2802 PMAP_UNLOCK(pmap); 2803 } 2804 2805 int 2806 pmap_fault(pmap_t pmap, vm_offset_t va, vm_prot_t ftype) 2807 { 2808 pd_entry_t *l2, l2e; 2809 pt_entry_t bits, *pte, oldpte; 2810 int rv; 2811 2812 KASSERT(VIRT_IS_VALID(va), ("pmap_fault: invalid va %#lx", va)); 2813 2814 rv = 0; 2815 PMAP_LOCK(pmap); 2816 l2 = pmap_l2(pmap, va); 2817 if (l2 == NULL || ((l2e = pmap_load(l2)) & PTE_V) == 0) 2818 goto done; 2819 if ((l2e & PTE_RWX) == 0) { 2820 pte = pmap_l2_to_l3(l2, va); 2821 if (((oldpte = pmap_load(pte)) & PTE_V) == 0) 2822 goto done; 2823 } else { 2824 pte = l2; 2825 oldpte = l2e; 2826 } 2827 2828 if ((pmap != kernel_pmap && (oldpte & PTE_U) == 0) || 2829 (ftype == VM_PROT_WRITE && (oldpte & PTE_W) == 0) || 2830 (ftype == VM_PROT_EXECUTE && (oldpte & PTE_X) == 0) || 2831 (ftype == VM_PROT_READ && (oldpte & PTE_R) == 0)) 2832 goto done; 2833 2834 bits = PTE_A; 2835 if (ftype == VM_PROT_WRITE) 2836 bits |= PTE_D; 2837 2838 /* 2839 * Spurious faults can occur if the implementation caches invalid 2840 * entries in the TLB, or if simultaneous accesses on multiple CPUs 2841 * race with each other. 2842 */ 2843 if ((oldpte & bits) != bits) 2844 pmap_store_bits(pte, bits); 2845 sfence_vma(); 2846 rv = 1; 2847 done: 2848 PMAP_UNLOCK(pmap); 2849 return (rv); 2850 } 2851 2852 /* 2853 * Demote the specified L1 page to separate L2 pages. 2854 * Currently only used for DMAP entries. 2855 */ 2856 static bool 2857 pmap_demote_l1(pmap_t pmap, pd_entry_t *l1, vm_offset_t va) 2858 { 2859 vm_page_t m; 2860 pt_entry_t *l2, oldl1, newl2; 2861 pd_entry_t newl1; 2862 vm_paddr_t l2phys; 2863 2864 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2865 2866 oldl1 = pmap_load(l1); 2867 KASSERT((oldl1 & PTE_RWX) != 0, 2868 ("pmap_demote_l1: oldl1 is not a leaf PTE")); 2869 KASSERT((oldl1 & PTE_A) != 0, 2870 ("pmap_demote_l1: oldl1 is missing PTE_A")); 2871 KASSERT((oldl1 & (PTE_D | PTE_W)) != PTE_W, 2872 ("pmap_demote_l1: not dirty!")); 2873 KASSERT((oldl1 & PTE_SW_MANAGED) == 0, 2874 ("pmap_demote_l1: L1 table shouldn't be managed")); 2875 KASSERT(VIRT_IN_DMAP(va), 2876 ("pmap_demote_l1: is unsupported for non-DMAP va=%#lx", va)); 2877 2878 /* Demoting L1 means we need to allocate a new page-table page. */ 2879 m = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT | VM_ALLOC_WIRED); 2880 if (m == NULL) { 2881 CTR2(KTR_PMAP, "pmap_demote_l1: failure for va %#lx in pmap %p", 2882 va, pmap); 2883 return (false); 2884 } 2885 2886 l2phys = VM_PAGE_TO_PHYS(m); 2887 l2 = (pt_entry_t *)PHYS_TO_DMAP(l2phys); 2888 2889 /* 2890 * Create new entries, relying on the fact that only the low bits 2891 * (index) of the physical address are changing. 
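	 * Concretely, each of the Ln_ENTRIES stores below reuses oldl1's
	 * permission and attribute bits and only ORs in i << PTE_PPN1_S;
	 * because a valid 1GB leaf must have zero low PPN fields, that
	 * yields the physical address of the i-th 2MB slice.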
2892 */ 2893 newl2 = oldl1; 2894 for (int i = 0; i < Ln_ENTRIES; i++) 2895 pmap_store(&l2[i], newl2 | (i << PTE_PPN1_S)); 2896 2897 /* 2898 * And update the L1 entry. 2899 * 2900 * NB: flushing the TLB is the responsibility of the caller. Cached 2901 * translations are still "correct" for demoted mappings until some 2902 * subset of the demoted range is modified. 2903 */ 2904 newl1 = ((l2phys / PAGE_SIZE) << PTE_PPN0_S) | PTE_V; 2905 pmap_store(l1, newl1); 2906 2907 counter_u64_add(pmap_l1_demotions, 1); 2908 CTR2(KTR_PMAP, "pmap_demote_l1: success for va %#lx in pmap %p", 2909 va, pmap); 2910 return (true); 2911 } 2912 2913 static bool 2914 pmap_demote_l2(pmap_t pmap, pd_entry_t *l2, vm_offset_t va) 2915 { 2916 struct rwlock *lock; 2917 bool rv; 2918 2919 lock = NULL; 2920 rv = pmap_demote_l2_locked(pmap, l2, va, &lock); 2921 if (lock != NULL) 2922 rw_wunlock(lock); 2923 return (rv); 2924 } 2925 2926 /* 2927 * Tries to demote a 2MB page mapping. If demotion fails, the 2MB page 2928 * mapping is invalidated. 2929 */ 2930 static bool 2931 pmap_demote_l2_locked(pmap_t pmap, pd_entry_t *l2, vm_offset_t va, 2932 struct rwlock **lockp) 2933 { 2934 struct spglist free; 2935 vm_page_t mpte; 2936 pd_entry_t newl2, oldl2; 2937 pt_entry_t *firstl3, newl3; 2938 vm_paddr_t mptepa; 2939 int i; 2940 2941 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2942 2943 oldl2 = pmap_load(l2); 2944 KASSERT((oldl2 & PTE_RWX) != 0, 2945 ("pmap_demote_l2_locked: oldl2 is not a leaf entry")); 2946 if ((oldl2 & PTE_A) == 0 || (mpte = pmap_remove_pt_page(pmap, va)) == 2947 NULL) { 2948 KASSERT((oldl2 & PTE_SW_WIRED) == 0, 2949 ("pmap_demote_l2_locked: page table page for a wired mapping is missing")); 2950 if ((oldl2 & PTE_A) == 0 || (mpte = vm_page_alloc_noobj( 2951 (VIRT_IN_DMAP(va) ? VM_ALLOC_INTERRUPT : 0) | 2952 VM_ALLOC_WIRED)) == NULL) { 2953 SLIST_INIT(&free); 2954 (void)pmap_remove_l2(pmap, l2, va & ~L2_OFFSET, 2955 pmap_load(pmap_l1(pmap, va)), &free, lockp); 2956 vm_page_free_pages_toq(&free, true); 2957 CTR2(KTR_PMAP, "pmap_demote_l2_locked: " 2958 "failure for va %#lx in pmap %p", va, pmap); 2959 return (false); 2960 } 2961 mpte->pindex = pmap_l2_pindex(va); 2962 if (va < VM_MAXUSER_ADDRESS) { 2963 mpte->ref_count = Ln_ENTRIES; 2964 pmap_resident_count_inc(pmap, 1); 2965 } 2966 } 2967 mptepa = VM_PAGE_TO_PHYS(mpte); 2968 firstl3 = (pt_entry_t *)PHYS_TO_DMAP(mptepa); 2969 newl2 = ((mptepa / PAGE_SIZE) << PTE_PPN0_S) | PTE_V; 2970 KASSERT((oldl2 & PTE_A) != 0, 2971 ("pmap_demote_l2_locked: oldl2 is missing PTE_A")); 2972 KASSERT((oldl2 & (PTE_D | PTE_W)) != PTE_W, 2973 ("pmap_demote_l2_locked: oldl2 is missing PTE_D")); 2974 newl3 = oldl2; 2975 2976 /* 2977 * If the page table page is not leftover from an earlier promotion, 2978 * initialize it. 2979 */ 2980 if (!vm_page_all_valid(mpte)) { 2981 for (i = 0; i < Ln_ENTRIES; i++) 2982 pmap_store(firstl3 + i, newl3 + (i << PTE_PPN0_S)); 2983 } 2984 KASSERT(PTE_TO_PHYS(pmap_load(firstl3)) == PTE_TO_PHYS(newl3), 2985 ("pmap_demote_l2_locked: firstl3 and newl3 map different physical " 2986 "addresses")); 2987 2988 /* 2989 * If the mapping has changed attributes, update the PTEs. 2990 */ 2991 if ((pmap_load(firstl3) & PTE_PROMOTE) != (newl3 & PTE_PROMOTE)) 2992 for (i = 0; i < Ln_ENTRIES; i++) 2993 pmap_store(firstl3 + i, newl3 + (i << PTE_PPN0_S)); 2994 2995 /* 2996 * The spare PV entries must be reserved prior to demoting the 2997 * mapping, that is, prior to changing the L2 entry. 
Otherwise, the 2998 * state of the L2 entry and the PV lists will be inconsistent, which 2999 * can result in reclaim_pv_chunk() attempting to remove a PV entry from 3000 * the wrong PV list and pmap_pv_demote_l2() failing to find the 3001 * expected PV entry for the 2MB page mapping that is being demoted. 3002 */ 3003 if ((oldl2 & PTE_SW_MANAGED) != 0) 3004 reserve_pv_entries(pmap, Ln_ENTRIES - 1, lockp); 3005 3006 /* 3007 * Demote the mapping. 3008 */ 3009 pmap_store(l2, newl2); 3010 3011 /* 3012 * Demote the PV entry. 3013 */ 3014 if ((oldl2 & PTE_SW_MANAGED) != 0) 3015 pmap_pv_demote_l2(pmap, va, PTE_TO_PHYS(oldl2), lockp); 3016 3017 atomic_add_long(&pmap_l2_demotions, 1); 3018 CTR2(KTR_PMAP, "pmap_demote_l2_locked: success for va %#lx in pmap %p", 3019 va, pmap); 3020 return (true); 3021 } 3022 3023 #if VM_NRESERVLEVEL > 0 3024 static bool 3025 pmap_promote_l2(pmap_t pmap, pd_entry_t *l2, vm_offset_t va, vm_page_t ml3, 3026 struct rwlock **lockp) 3027 { 3028 pt_entry_t all_l3e_PTE_A, *firstl3, firstl3e, *l3, l3e; 3029 vm_paddr_t pa; 3030 3031 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3032 if (!pmap_ps_enabled(pmap)) 3033 return (false); 3034 3035 KASSERT((pmap_load(l2) & PTE_RWX) == 0, 3036 ("pmap_promote_l2: invalid l2 entry %p", l2)); 3037 3038 /* 3039 * Examine the first L3E in the specified PTP. Abort if this L3E is 3040 * ineligible for promotion or does not map the first 4KB physical page 3041 * within a 2MB page. 3042 */ 3043 firstl3 = (pt_entry_t *)PHYS_TO_DMAP(PTE_TO_PHYS(pmap_load(l2))); 3044 firstl3e = pmap_load(firstl3); 3045 pa = PTE_TO_PHYS(firstl3e); 3046 if ((pa & L2_OFFSET) != 0) { 3047 CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx pmap %p", 3048 va, pmap); 3049 atomic_add_long(&pmap_l2_p_failures, 1); 3050 return (false); 3051 } 3052 3053 /* 3054 * Downgrade a clean, writable mapping to read-only to ensure that the 3055 * hardware does not set PTE_D while we are comparing PTEs. 3056 * 3057 * Upon a write access to a clean mapping, the implementation will 3058 * either atomically check protections and set PTE_D, or raise a page 3059 * fault. In the latter case, the pmap lock provides atomicity. Thus, 3060 * we do not issue an sfence.vma here and instead rely on pmap_fault() 3061 * to do so lazily. 3062 */ 3063 while ((firstl3e & (PTE_W | PTE_D)) == PTE_W) { 3064 if (atomic_fcmpset_64(firstl3, &firstl3e, firstl3e & ~PTE_W)) { 3065 firstl3e &= ~PTE_W; 3066 break; 3067 } 3068 } 3069 3070 /* 3071 * Examine each of the other PTEs in the specified PTP. Abort if this 3072 * PTE maps an unexpected 4KB physical page or does not have identical 3073 * characteristics to the first PTE. 
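	 *
	 * The loop below walks from the last PTE down to the second,
	 * stepping the expected physical address back by PAGE_SIZE each
	 * time, applying the same clean-and-writable downgrade as above,
	 * and accumulating PTE_A across the entries.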
3074 */ 3075 all_l3e_PTE_A = firstl3e & PTE_A; 3076 pa += L2_SIZE - PAGE_SIZE; 3077 for (l3 = firstl3 + Ln_ENTRIES - 1; l3 > firstl3; l3--) { 3078 l3e = pmap_load(l3); 3079 if (PTE_TO_PHYS(l3e) != pa) { 3080 CTR2(KTR_PMAP, 3081 "pmap_promote_l2: failure for va %#lx pmap %p", 3082 va, pmap); 3083 atomic_add_long(&pmap_l2_p_failures, 1); 3084 return (false); 3085 } 3086 while ((l3e & (PTE_W | PTE_D)) == PTE_W) { 3087 if (atomic_fcmpset_64(l3, &l3e, l3e & ~PTE_W)) { 3088 l3e &= ~PTE_W; 3089 break; 3090 } 3091 } 3092 if ((l3e & PTE_PROMOTE) != (firstl3e & PTE_PROMOTE)) { 3093 CTR2(KTR_PMAP, 3094 "pmap_promote_l2: failure for va %#lx pmap %p", 3095 va, pmap); 3096 atomic_add_long(&pmap_l2_p_failures, 1); 3097 return (false); 3098 } 3099 all_l3e_PTE_A &= l3e; 3100 pa -= PAGE_SIZE; 3101 } 3102 3103 /* 3104 * Unless all PTEs have PTE_A set, clear it from the superpage 3105 * mapping, so that promotions triggered by speculative mappings, 3106 * such as pmap_enter_quick(), don't automatically mark the 3107 * underlying pages as referenced. 3108 */ 3109 firstl3e &= ~PTE_A | all_l3e_PTE_A; 3110 3111 /* 3112 * Save the page table page in its current state until the L2 3113 * mapping the superpage is demoted by pmap_demote_l2() or 3114 * destroyed by pmap_remove_l3(). 3115 */ 3116 if (ml3 == NULL) 3117 ml3 = PTE_TO_VM_PAGE(pmap_load(l2)); 3118 KASSERT(ml3->pindex == pmap_l2_pindex(va), 3119 ("pmap_promote_l2: page table page's pindex is wrong")); 3120 if (pmap_insert_pt_page(pmap, ml3, true, all_l3e_PTE_A != 0)) { 3121 CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx pmap %p", 3122 va, pmap); 3123 atomic_add_long(&pmap_l2_p_failures, 1); 3124 return (false); 3125 } 3126 3127 if ((firstl3e & PTE_SW_MANAGED) != 0) 3128 pmap_pv_promote_l2(pmap, va, PTE_TO_PHYS(firstl3e), lockp); 3129 3130 pmap_store(l2, firstl3e); 3131 3132 atomic_add_long(&pmap_l2_promotions, 1); 3133 CTR2(KTR_PMAP, "pmap_promote_l2: success for va %#lx in pmap %p", va, 3134 pmap); 3135 return (true); 3136 } 3137 #endif 3138 3139 /* 3140 * Insert the given physical page (p) at 3141 * the specified virtual address (v) in the 3142 * target physical map with the protection requested. 3143 * 3144 * If specified, the page will be wired down, meaning 3145 * that the related pte can not be reclaimed. 3146 * 3147 * NB: This is the only routine which MAY NOT lazy-evaluate 3148 * or lose information. That is, this routine must actually 3149 * insert this page into the given map NOW. 3150 */ 3151 int 3152 pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, 3153 u_int flags, int8_t psind) 3154 { 3155 struct rwlock *lock; 3156 pd_entry_t *l2, l2e; 3157 pt_entry_t new_l3, orig_l3; 3158 pt_entry_t *l3; 3159 pv_entry_t pv; 3160 vm_paddr_t opa, pa; 3161 vm_page_t mpte, om; 3162 pn_t pn; 3163 int rv; 3164 bool nosleep; 3165 3166 va = trunc_page(va); 3167 if ((m->oflags & VPO_UNMANAGED) == 0) 3168 VM_PAGE_OBJECT_BUSY_ASSERT(m); 3169 pa = VM_PAGE_TO_PHYS(m); 3170 pn = (pa / PAGE_SIZE); 3171 3172 new_l3 = PTE_V | PTE_R | PTE_A; 3173 if (prot & VM_PROT_EXECUTE) 3174 new_l3 |= PTE_X; 3175 if (flags & VM_PROT_WRITE) 3176 new_l3 |= PTE_D; 3177 if (prot & VM_PROT_WRITE) 3178 new_l3 |= PTE_W; 3179 if (va < VM_MAX_USER_ADDRESS) 3180 new_l3 |= PTE_U; 3181 3182 new_l3 |= (pn << PTE_PPN0_S); 3183 if ((flags & PMAP_ENTER_WIRED) != 0) 3184 new_l3 |= PTE_SW_WIRED; 3185 new_l3 |= pmap_memattr_bits(m->md.pv_memattr); 3186 3187 /* 3188 * Set modified bit gratuitously for writeable mappings if 3189 * the page is unmanaged. 
We do not want to take a fault 3190 * to do the dirty bit accounting for these mappings. 3191 */ 3192 if ((m->oflags & VPO_UNMANAGED) != 0) { 3193 if (prot & VM_PROT_WRITE) 3194 new_l3 |= PTE_D; 3195 } else 3196 new_l3 |= PTE_SW_MANAGED; 3197 3198 CTR2(KTR_PMAP, "pmap_enter: %.16lx -> %.16lx", va, pa); 3199 3200 lock = NULL; 3201 mpte = NULL; 3202 rw_rlock(&pvh_global_lock); 3203 PMAP_LOCK(pmap); 3204 if (psind == 1) { 3205 /* Assert the required virtual and physical alignment. */ 3206 KASSERT((va & L2_OFFSET) == 0, 3207 ("pmap_enter: va %#lx unaligned", va)); 3208 KASSERT(m->psind > 0, ("pmap_enter: m->psind < psind")); 3209 rv = pmap_enter_l2(pmap, va, new_l3, flags, m, &lock); 3210 goto out; 3211 } 3212 3213 l2 = pmap_l2(pmap, va); 3214 if (l2 != NULL && ((l2e = pmap_load(l2)) & PTE_V) != 0 && 3215 ((l2e & PTE_RWX) == 0 || pmap_demote_l2_locked(pmap, l2, 3216 va, &lock))) { 3217 l3 = pmap_l2_to_l3(l2, va); 3218 if (va < VM_MAXUSER_ADDRESS) { 3219 mpte = PTE_TO_VM_PAGE(pmap_load(l2)); 3220 mpte->ref_count++; 3221 } 3222 } else if (va < VM_MAXUSER_ADDRESS) { 3223 nosleep = (flags & PMAP_ENTER_NOSLEEP) != 0; 3224 mpte = pmap_alloc_l3(pmap, va, nosleep ? NULL : &lock); 3225 if (mpte == NULL && nosleep) { 3226 CTR0(KTR_PMAP, "pmap_enter: mpte == NULL"); 3227 if (lock != NULL) 3228 rw_wunlock(lock); 3229 rw_runlock(&pvh_global_lock); 3230 PMAP_UNLOCK(pmap); 3231 return (KERN_RESOURCE_SHORTAGE); 3232 } 3233 l3 = pmap_l3(pmap, va); 3234 } else { 3235 panic("pmap_enter: missing L3 table for kernel va %#lx", va); 3236 } 3237 3238 orig_l3 = pmap_load(l3); 3239 opa = PTE_TO_PHYS(orig_l3); 3240 pv = NULL; 3241 3242 /* 3243 * Is the specified virtual address already mapped? 3244 */ 3245 if ((orig_l3 & PTE_V) != 0) { 3246 /* 3247 * Wiring change, just update stats. We don't worry about 3248 * wiring PT pages as they remain resident as long as there 3249 * are valid mappings in them. Hence, if a user page is wired, 3250 * the PT page will be also. 3251 */ 3252 if ((flags & PMAP_ENTER_WIRED) != 0 && 3253 (orig_l3 & PTE_SW_WIRED) == 0) 3254 pmap->pm_stats.wired_count++; 3255 else if ((flags & PMAP_ENTER_WIRED) == 0 && 3256 (orig_l3 & PTE_SW_WIRED) != 0) 3257 pmap->pm_stats.wired_count--; 3258 3259 /* 3260 * Remove the extra PT page reference. 3261 */ 3262 if (mpte != NULL) { 3263 mpte->ref_count--; 3264 KASSERT(mpte->ref_count > 0, 3265 ("pmap_enter: missing reference to page table page," 3266 " va: 0x%lx", va)); 3267 } 3268 3269 /* 3270 * Has the physical page changed? 3271 */ 3272 if (opa == pa) { 3273 /* 3274 * No, might be a protection or wiring change. 3275 */ 3276 if ((orig_l3 & PTE_SW_MANAGED) != 0 && 3277 (new_l3 & PTE_W) != 0) 3278 vm_page_aflag_set(m, PGA_WRITEABLE); 3279 goto validate; 3280 } 3281 3282 /* 3283 * The physical page has changed. Temporarily invalidate 3284 * the mapping. This ensures that all threads sharing the 3285 * pmap keep a consistent view of the mapping, which is 3286 * necessary for the correct handling of COW faults. It 3287 * also permits reuse of the old mapping's PV entry, 3288 * avoiding an allocation. 3289 * 3290 * For consistency, handle unmanaged mappings the same way. 3291 */ 3292 orig_l3 = pmap_load_clear(l3); 3293 KASSERT(PTE_TO_PHYS(orig_l3) == opa, 3294 ("pmap_enter: unexpected pa update for %#lx", va)); 3295 if ((orig_l3 & PTE_SW_MANAGED) != 0) { 3296 om = PHYS_TO_VM_PAGE(opa); 3297 3298 /* 3299 * The pmap lock is sufficient to synchronize with 3300 * concurrent calls to pmap_page_test_mappings() and 3301 * pmap_ts_referenced(). 
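			 * (Both of those lock each pmap found through the pv
			 * list before they inspect its PTEs.)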
3302 */ 3303 if ((orig_l3 & PTE_D) != 0) 3304 vm_page_dirty(om); 3305 if ((orig_l3 & PTE_A) != 0) 3306 vm_page_aflag_set(om, PGA_REFERENCED); 3307 CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, opa); 3308 pv = pmap_pvh_remove(&om->md, pmap, va); 3309 KASSERT(pv != NULL, 3310 ("pmap_enter: no PV entry for %#lx", va)); 3311 if ((new_l3 & PTE_SW_MANAGED) == 0) 3312 free_pv_entry(pmap, pv); 3313 if ((om->a.flags & PGA_WRITEABLE) != 0 && 3314 TAILQ_EMPTY(&om->md.pv_list) && 3315 ((om->flags & PG_FICTITIOUS) != 0 || 3316 TAILQ_EMPTY(&pa_to_pvh(opa)->pv_list))) 3317 vm_page_aflag_clear(om, PGA_WRITEABLE); 3318 } 3319 pmap_invalidate_page(pmap, va); 3320 orig_l3 = 0; 3321 } else { 3322 /* 3323 * Increment the counters. 3324 */ 3325 if ((new_l3 & PTE_SW_WIRED) != 0) 3326 pmap->pm_stats.wired_count++; 3327 pmap_resident_count_inc(pmap, 1); 3328 } 3329 /* 3330 * Enter on the PV list if part of our managed memory. 3331 */ 3332 if ((new_l3 & PTE_SW_MANAGED) != 0) { 3333 if (pv == NULL) { 3334 pv = get_pv_entry(pmap, &lock); 3335 pv->pv_va = va; 3336 } 3337 CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, pa); 3338 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 3339 m->md.pv_gen++; 3340 if ((new_l3 & PTE_W) != 0) 3341 vm_page_aflag_set(m, PGA_WRITEABLE); 3342 } 3343 3344 validate: 3345 /* 3346 * Sync the i-cache on all harts before updating the PTE 3347 * if the new PTE is executable. 3348 */ 3349 if (prot & VM_PROT_EXECUTE) 3350 pmap_sync_icache(pmap, va, PAGE_SIZE); 3351 3352 /* 3353 * Update the L3 entry. 3354 */ 3355 if (orig_l3 != 0) { 3356 orig_l3 = pmap_load_store(l3, new_l3); 3357 pmap_invalidate_page(pmap, va); 3358 KASSERT(PTE_TO_PHYS(orig_l3) == pa, 3359 ("pmap_enter: invalid update")); 3360 if ((orig_l3 & (PTE_D | PTE_SW_MANAGED)) == 3361 (PTE_D | PTE_SW_MANAGED)) 3362 vm_page_dirty(m); 3363 } else { 3364 pmap_store(l3, new_l3); 3365 } 3366 3367 #if VM_NRESERVLEVEL > 0 3368 if (mpte != NULL && mpte->ref_count == Ln_ENTRIES && 3369 (m->flags & PG_FICTITIOUS) == 0 && 3370 vm_reserv_level_iffullpop(m) == 0) 3371 (void)pmap_promote_l2(pmap, l2, va, mpte, &lock); 3372 #endif 3373 3374 rv = KERN_SUCCESS; 3375 out: 3376 if (lock != NULL) 3377 rw_wunlock(lock); 3378 rw_runlock(&pvh_global_lock); 3379 PMAP_UNLOCK(pmap); 3380 return (rv); 3381 } 3382 3383 /* 3384 * Release a page table page reference after a failed attempt to create a 3385 * mapping. 3386 */ 3387 static void 3388 pmap_abort_ptp(pmap_t pmap, vm_offset_t va, vm_page_t l2pg) 3389 { 3390 struct spglist free; 3391 3392 SLIST_INIT(&free); 3393 if (pmap_unwire_ptp(pmap, va, l2pg, &free)) { 3394 /* 3395 * Although "va" is not mapped, paging-structure 3396 * caches could nonetheless have entries that 3397 * refer to the freed page table pages. 3398 * Invalidate those entries. 3399 */ 3400 pmap_invalidate_page(pmap, va); 3401 vm_page_free_pages_toq(&free, true); 3402 } 3403 } 3404 3405 /* 3406 * Tries to create a read- and/or execute-only 2MB page mapping. Returns 3407 * KERN_SUCCESS if the mapping was created. Otherwise, returns an error 3408 * value. See pmap_enter_l2() for the possible error values when "no sleep", 3409 * "no replace", and "no reclaim" are specified. 
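 *
 * This is the opportunistic path used by pmap_enter_object(), which is
 * why it passes PMAP_ENTER_NOSLEEP, PMAP_ENTER_NOREPLACE and
 * PMAP_ENTER_NORECLAIM: a failure here should be cheap and must not
 * disturb existing mappings.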
3410 */ 3411 static int 3412 pmap_enter_2mpage(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, 3413 struct rwlock **lockp) 3414 { 3415 pd_entry_t new_l2; 3416 pn_t pn; 3417 3418 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3419 3420 pn = VM_PAGE_TO_PHYS(m) / PAGE_SIZE; 3421 new_l2 = (pd_entry_t)((pn << PTE_PPN0_S) | PTE_R | PTE_V | 3422 pmap_memattr_bits(m->md.pv_memattr)); 3423 if ((m->oflags & VPO_UNMANAGED) == 0) 3424 new_l2 |= PTE_SW_MANAGED; 3425 if ((prot & VM_PROT_EXECUTE) != 0) 3426 new_l2 |= PTE_X; 3427 if (va < VM_MAXUSER_ADDRESS) 3428 new_l2 |= PTE_U; 3429 return (pmap_enter_l2(pmap, va, new_l2, PMAP_ENTER_NOSLEEP | 3430 PMAP_ENTER_NOREPLACE | PMAP_ENTER_NORECLAIM, NULL, lockp)); 3431 } 3432 3433 /* 3434 * Returns true if every page table entry in the specified page table is 3435 * zero. 3436 */ 3437 static bool 3438 pmap_every_pte_zero(vm_paddr_t pa) 3439 { 3440 pt_entry_t *pt_end, *pte; 3441 3442 KASSERT((pa & PAGE_MASK) == 0, ("pa is misaligned")); 3443 pte = (pt_entry_t *)PHYS_TO_DMAP(pa); 3444 for (pt_end = pte + Ln_ENTRIES; pte < pt_end; pte++) { 3445 if (*pte != 0) 3446 return (false); 3447 } 3448 return (true); 3449 } 3450 3451 /* 3452 * Tries to create the specified 2MB page mapping. Returns KERN_SUCCESS if 3453 * the mapping was created, and one of KERN_FAILURE, KERN_NO_SPACE, or 3454 * KERN_RESOURCE_SHORTAGE otherwise. Returns KERN_FAILURE if 3455 * PMAP_ENTER_NOREPLACE was specified and a 4KB page mapping already exists 3456 * within the 2MB virtual address range starting at the specified virtual 3457 * address. Returns KERN_NO_SPACE if PMAP_ENTER_NOREPLACE was specified and a 3458 * 2MB page mapping already exists at the specified virtual address. Returns 3459 * KERN_RESOURCE_SHORTAGE if either (1) PMAP_ENTER_NOSLEEP was specified and a 3460 * page table page allocation failed or (2) PMAP_ENTER_NORECLAIM was specified 3461 * and a PV entry allocation failed. 3462 * 3463 * The parameter "m" is only used when creating a managed, writeable mapping. 3464 */ 3465 static int 3466 pmap_enter_l2(pmap_t pmap, vm_offset_t va, pd_entry_t new_l2, u_int flags, 3467 vm_page_t m, struct rwlock **lockp) 3468 { 3469 struct spglist free; 3470 pd_entry_t *l2, *l3, oldl2; 3471 vm_offset_t sva; 3472 vm_page_t l2pg, mt; 3473 vm_page_t uwptpg; 3474 3475 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3476 3477 if ((l2pg = pmap_alloc_l2(pmap, va, (flags & PMAP_ENTER_NOSLEEP) != 0 ? 
3478 NULL : lockp)) == NULL) { 3479 CTR2(KTR_PMAP, "pmap_enter_l2: failed to allocate PT page" 3480 " for va %#lx in pmap %p", va, pmap); 3481 return (KERN_RESOURCE_SHORTAGE); 3482 } 3483 3484 l2 = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(l2pg)); 3485 l2 = &l2[pmap_l2_index(va)]; 3486 if ((oldl2 = pmap_load(l2)) != 0) { 3487 KASSERT(l2pg->ref_count > 1, 3488 ("pmap_enter_l2: l2pg's ref count is too low")); 3489 if ((flags & PMAP_ENTER_NOREPLACE) != 0) { 3490 if ((oldl2 & PTE_RWX) != 0) { 3491 l2pg->ref_count--; 3492 CTR2(KTR_PMAP, 3493 "pmap_enter_l2: no space for va %#lx" 3494 " in pmap %p", va, pmap); 3495 return (KERN_NO_SPACE); 3496 } else if (va < VM_MAXUSER_ADDRESS || 3497 !pmap_every_pte_zero(L2PTE_TO_PHYS(oldl2))) { 3498 l2pg->ref_count--; 3499 CTR2(KTR_PMAP, "pmap_enter_l2:" 3500 " failed to replace existing mapping" 3501 " for va %#lx in pmap %p", va, pmap); 3502 return (KERN_FAILURE); 3503 } 3504 } 3505 SLIST_INIT(&free); 3506 if ((oldl2 & PTE_RWX) != 0) 3507 (void)pmap_remove_l2(pmap, l2, va, 3508 pmap_load(pmap_l1(pmap, va)), &free, lockp); 3509 else 3510 for (sva = va; sva < va + L2_SIZE; sva += PAGE_SIZE) { 3511 l3 = pmap_l2_to_l3(l2, sva); 3512 if ((pmap_load(l3) & PTE_V) != 0 && 3513 pmap_remove_l3(pmap, l3, sva, oldl2, &free, 3514 lockp) != 0) 3515 break; 3516 } 3517 vm_page_free_pages_toq(&free, true); 3518 if (va >= VM_MAXUSER_ADDRESS) { 3519 /* 3520 * Both pmap_remove_l2() and pmap_remove_l3() will 3521 * leave the kernel page table page zero filled. 3522 */ 3523 mt = PTE_TO_VM_PAGE(pmap_load(l2)); 3524 if (pmap_insert_pt_page(pmap, mt, false, false)) 3525 panic("pmap_enter_l2: trie insert failed"); 3526 } else 3527 KASSERT(pmap_load(l2) == 0, 3528 ("pmap_enter_l2: non-zero L2 entry %p", l2)); 3529 } 3530 3531 /* 3532 * Allocate leaf ptpage for wired userspace pages. 3533 */ 3534 uwptpg = NULL; 3535 if ((new_l2 & PTE_SW_WIRED) != 0 && pmap != kernel_pmap) { 3536 uwptpg = vm_page_alloc_noobj(VM_ALLOC_WIRED); 3537 if (uwptpg == NULL) { 3538 pmap_abort_ptp(pmap, va, l2pg); 3539 return (KERN_RESOURCE_SHORTAGE); 3540 } 3541 uwptpg->pindex = pmap_l2_pindex(va); 3542 if (pmap_insert_pt_page(pmap, uwptpg, true, false)) { 3543 vm_page_unwire_noq(uwptpg); 3544 vm_page_free(uwptpg); 3545 pmap_abort_ptp(pmap, va, l2pg); 3546 return (KERN_RESOURCE_SHORTAGE); 3547 } 3548 pmap_resident_count_inc(pmap, 1); 3549 uwptpg->ref_count = Ln_ENTRIES; 3550 } 3551 if ((new_l2 & PTE_SW_MANAGED) != 0) { 3552 /* 3553 * Abort this mapping if its PV entry could not be created. 3554 */ 3555 if (!pmap_pv_insert_l2(pmap, va, new_l2, flags, lockp)) { 3556 pmap_abort_ptp(pmap, va, l2pg); 3557 if (uwptpg != NULL) { 3558 mt = pmap_remove_pt_page(pmap, va); 3559 KASSERT(mt == uwptpg, 3560 ("removed pt page %p, expected %p", mt, 3561 uwptpg)); 3562 pmap_resident_count_dec(pmap, 1); 3563 uwptpg->ref_count = 1; 3564 vm_page_unwire_noq(uwptpg); 3565 vm_page_free(uwptpg); 3566 } 3567 CTR2(KTR_PMAP, 3568 "pmap_enter_l2: failed to create PV entry" 3569 " for va %#lx in pmap %p", va, pmap); 3570 return (KERN_RESOURCE_SHORTAGE); 3571 } 3572 if ((new_l2 & PTE_W) != 0) 3573 for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++) 3574 vm_page_aflag_set(mt, PGA_WRITEABLE); 3575 } 3576 3577 /* 3578 * Increment counters. 3579 */ 3580 if ((new_l2 & PTE_SW_WIRED) != 0) 3581 pmap->pm_stats.wired_count += L2_SIZE / PAGE_SIZE; 3582 pmap->pm_stats.resident_count += L2_SIZE / PAGE_SIZE; 3583 3584 /* 3585 * Map the superpage. 
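	 *
	 * A single store of the leaf L2 entry publishes the whole 2MB
	 * mapping.  No sfence.vma is issued here: any old leaf mappings
	 * were invalidated when they were removed above, and a translation
	 * cached while the entry was invalid should at worst cause a
	 * spurious fault that pmap_fault() resolves.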
3586 */ 3587 pmap_store(l2, new_l2); 3588 3589 atomic_add_long(&pmap_l2_mappings, 1); 3590 CTR2(KTR_PMAP, "pmap_enter_l2: success for va %#lx in pmap %p", 3591 va, pmap); 3592 3593 return (KERN_SUCCESS); 3594 } 3595 3596 /* 3597 * Maps a sequence of resident pages belonging to the same object. 3598 * The sequence begins with the given page m_start. This page is 3599 * mapped at the given virtual address start. Each subsequent page is 3600 * mapped at a virtual address that is offset from start by the same 3601 * amount as the page is offset from m_start within the object. The 3602 * last page in the sequence is the page with the largest offset from 3603 * m_start that can be mapped at a virtual address less than the given 3604 * virtual address end. Not every virtual page between start and end 3605 * is mapped; only those for which a resident page exists with the 3606 * corresponding offset from m_start are mapped. 3607 */ 3608 void 3609 pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end, 3610 vm_page_t m_start, vm_prot_t prot) 3611 { 3612 struct rwlock *lock; 3613 vm_offset_t va; 3614 vm_page_t m, mpte; 3615 vm_pindex_t diff, psize; 3616 int rv; 3617 3618 VM_OBJECT_ASSERT_LOCKED(m_start->object); 3619 3620 psize = atop(end - start); 3621 mpte = NULL; 3622 m = m_start; 3623 lock = NULL; 3624 rw_rlock(&pvh_global_lock); 3625 PMAP_LOCK(pmap); 3626 while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) { 3627 va = start + ptoa(diff); 3628 if ((va & L2_OFFSET) == 0 && va + L2_SIZE <= end && 3629 m->psind == 1 && pmap_ps_enabled(pmap) && 3630 ((rv = pmap_enter_2mpage(pmap, va, m, prot, &lock)) == 3631 KERN_SUCCESS || rv == KERN_NO_SPACE)) 3632 m = &m[L2_SIZE / PAGE_SIZE - 1]; 3633 else 3634 mpte = pmap_enter_quick_locked(pmap, va, m, prot, mpte, 3635 &lock); 3636 m = TAILQ_NEXT(m, listq); 3637 } 3638 if (lock != NULL) 3639 rw_wunlock(lock); 3640 rw_runlock(&pvh_global_lock); 3641 PMAP_UNLOCK(pmap); 3642 } 3643 3644 /* 3645 * this code makes some *MAJOR* assumptions: 3646 * 1. Current pmap & pmap exists. 3647 * 2. Not wired. 3648 * 3. Read access. 3649 * 4. No page table pages. 3650 * but is *MUCH* faster than pmap_enter... 3651 */ 3652 3653 void 3654 pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot) 3655 { 3656 struct rwlock *lock; 3657 3658 lock = NULL; 3659 rw_rlock(&pvh_global_lock); 3660 PMAP_LOCK(pmap); 3661 (void)pmap_enter_quick_locked(pmap, va, m, prot, NULL, &lock); 3662 if (lock != NULL) 3663 rw_wunlock(lock); 3664 rw_runlock(&pvh_global_lock); 3665 PMAP_UNLOCK(pmap); 3666 } 3667 3668 static vm_page_t 3669 pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, 3670 vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp) 3671 { 3672 struct spglist free; 3673 pd_entry_t *l2; 3674 pt_entry_t *l3, newl3; 3675 3676 KASSERT(!VA_IS_CLEANMAP(va) || 3677 (m->oflags & VPO_UNMANAGED) != 0, 3678 ("pmap_enter_quick_locked: managed mapping within the clean submap")); 3679 rw_assert(&pvh_global_lock, RA_LOCKED); 3680 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3681 l2 = NULL; 3682 3683 CTR2(KTR_PMAP, "pmap_enter_quick_locked: %p %lx", pmap, va); 3684 /* 3685 * In the case that a page table page is not 3686 * resident, we are creating it here. 
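	 * The mpte argument serves as a hint: when consecutive calls fall
	 * in the same 2MB region, the pindex check below reuses the page
	 * table page from the previous call instead of re-walking the
	 * tree.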
3687 */ 3688 if (va < VM_MAXUSER_ADDRESS) { 3689 vm_pindex_t l2pindex; 3690 3691 /* 3692 * Calculate pagetable page index 3693 */ 3694 l2pindex = pmap_l2_pindex(va); 3695 if (mpte && (mpte->pindex == l2pindex)) { 3696 mpte->ref_count++; 3697 } else { 3698 /* 3699 * Get the l2 entry 3700 */ 3701 l2 = pmap_l2(pmap, va); 3702 3703 /* 3704 * If the page table page is mapped, we just increment 3705 * the hold count, and activate it. Otherwise, we 3706 * attempt to allocate a page table page. If this 3707 * attempt fails, we don't retry. Instead, we give up. 3708 */ 3709 if (l2 != NULL && pmap_load(l2) != 0) { 3710 if ((pmap_load(l2) & PTE_RWX) != 0) 3711 return (NULL); 3712 mpte = PTE_TO_VM_PAGE(pmap_load(l2)); 3713 mpte->ref_count++; 3714 } else { 3715 /* 3716 * Pass NULL instead of the PV list lock 3717 * pointer, because we don't intend to sleep. 3718 */ 3719 mpte = _pmap_alloc_l3(pmap, l2pindex, NULL); 3720 if (mpte == NULL) 3721 return (mpte); 3722 } 3723 } 3724 l3 = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpte)); 3725 l3 = &l3[pmap_l3_index(va)]; 3726 } else { 3727 mpte = NULL; 3728 l3 = pmap_l3(kernel_pmap, va); 3729 } 3730 if (l3 == NULL) 3731 panic("pmap_enter_quick_locked: No l3"); 3732 if (pmap_load(l3) != 0) { 3733 if (mpte != NULL) 3734 mpte->ref_count--; 3735 return (NULL); 3736 } 3737 3738 /* 3739 * Enter on the PV list if part of our managed memory. 3740 */ 3741 if ((m->oflags & VPO_UNMANAGED) == 0 && 3742 !pmap_try_insert_pv_entry(pmap, va, m, lockp)) { 3743 if (mpte != NULL) { 3744 SLIST_INIT(&free); 3745 if (pmap_unwire_ptp(pmap, va, mpte, &free)) 3746 vm_page_free_pages_toq(&free, false); 3747 } 3748 return (NULL); 3749 } 3750 3751 /* 3752 * Increment counters 3753 */ 3754 pmap_resident_count_inc(pmap, 1); 3755 3756 newl3 = ((VM_PAGE_TO_PHYS(m) / PAGE_SIZE) << PTE_PPN0_S) | 3757 PTE_V | PTE_R | pmap_memattr_bits(m->md.pv_memattr); 3758 if ((prot & VM_PROT_EXECUTE) != 0) 3759 newl3 |= PTE_X; 3760 if ((m->oflags & VPO_UNMANAGED) == 0) 3761 newl3 |= PTE_SW_MANAGED; 3762 if (va < VM_MAX_USER_ADDRESS) 3763 newl3 |= PTE_U; 3764 3765 /* 3766 * Sync the i-cache on all harts before updating the PTE 3767 * if the new PTE is executable. 3768 */ 3769 if (prot & VM_PROT_EXECUTE) 3770 pmap_sync_icache(pmap, va, PAGE_SIZE); 3771 3772 pmap_store(l3, newl3); 3773 3774 #if VM_NRESERVLEVEL > 0 3775 /* 3776 * If both the PTP and the reservation are fully populated, then attempt 3777 * promotion. 3778 */ 3779 if ((prot & VM_PROT_NO_PROMOTE) == 0 && 3780 (mpte == NULL || mpte->ref_count == Ln_ENTRIES) && 3781 (m->flags & PG_FICTITIOUS) == 0 && 3782 vm_reserv_level_iffullpop(m) == 0) { 3783 if (l2 == NULL) 3784 l2 = pmap_l2(pmap, va); 3785 3786 /* 3787 * If promotion succeeds, then the next call to this function 3788 * should not be given the unmapped PTP as a hint. 3789 */ 3790 if (pmap_promote_l2(pmap, l2, va, mpte, lockp)) 3791 mpte = NULL; 3792 } 3793 #endif 3794 3795 return (mpte); 3796 } 3797 3798 /* 3799 * This code maps large physical mmap regions into the 3800 * processor address space. Note that some shortcuts 3801 * are taken, but the code works. 3802 */ 3803 void 3804 pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object, 3805 vm_pindex_t pindex, vm_size_t size) 3806 { 3807 3808 VM_OBJECT_ASSERT_WLOCKED(object); 3809 KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG, 3810 ("pmap_object_init_pt: non-device object")); 3811 } 3812 3813 /* 3814 * Clear the wired attribute from the mappings for the specified range of 3815 * addresses in the given pmap. 
Every valid mapping within that range 3816 * must have the wired attribute set. In contrast, invalid mappings 3817 * cannot have the wired attribute set, so they are ignored. 3818 * 3819 * The wired attribute of the page table entry is not a hardware feature, 3820 * so there is no need to invalidate any TLB entries. 3821 */ 3822 void 3823 pmap_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 3824 { 3825 vm_offset_t va_next; 3826 pd_entry_t *l0, *l1, *l2, l2e; 3827 pt_entry_t *l3, l3e; 3828 bool pv_lists_locked; 3829 3830 pv_lists_locked = false; 3831 retry: 3832 PMAP_LOCK(pmap); 3833 for (; sva < eva; sva = va_next) { 3834 if (pmap_mode == PMAP_MODE_SV48) { 3835 l0 = pmap_l0(pmap, sva); 3836 if (pmap_load(l0) == 0) { 3837 va_next = (sva + L0_SIZE) & ~L0_OFFSET; 3838 if (va_next < sva) 3839 va_next = eva; 3840 continue; 3841 } 3842 l1 = pmap_l0_to_l1(l0, sva); 3843 } else { 3844 l1 = pmap_l1(pmap, sva); 3845 } 3846 3847 if (pmap_load(l1) == 0) { 3848 va_next = (sva + L1_SIZE) & ~L1_OFFSET; 3849 if (va_next < sva) 3850 va_next = eva; 3851 continue; 3852 } 3853 3854 va_next = (sva + L2_SIZE) & ~L2_OFFSET; 3855 if (va_next < sva) 3856 va_next = eva; 3857 3858 l2 = pmap_l1_to_l2(l1, sva); 3859 if ((l2e = pmap_load(l2)) == 0) 3860 continue; 3861 if ((l2e & PTE_RWX) != 0) { 3862 if (sva + L2_SIZE == va_next && eva >= va_next) { 3863 if ((l2e & PTE_SW_WIRED) == 0) 3864 panic("pmap_unwire: l2 %#jx is missing " 3865 "PTE_SW_WIRED", (uintmax_t)l2e); 3866 pmap_clear_bits(l2, PTE_SW_WIRED); 3867 continue; 3868 } else { 3869 if (!pv_lists_locked) { 3870 pv_lists_locked = true; 3871 if (!rw_try_rlock(&pvh_global_lock)) { 3872 PMAP_UNLOCK(pmap); 3873 rw_rlock(&pvh_global_lock); 3874 /* Repeat sva. */ 3875 goto retry; 3876 } 3877 } 3878 if (!pmap_demote_l2(pmap, l2, sva)) 3879 panic("pmap_unwire: demotion failed"); 3880 } 3881 } 3882 3883 if (va_next > eva) 3884 va_next = eva; 3885 for (l3 = pmap_l2_to_l3(l2, sva); sva != va_next; l3++, 3886 sva += L3_SIZE) { 3887 if ((l3e = pmap_load(l3)) == 0) 3888 continue; 3889 if ((l3e & PTE_SW_WIRED) == 0) 3890 panic("pmap_unwire: l3 %#jx is missing " 3891 "PTE_SW_WIRED", (uintmax_t)l3e); 3892 3893 /* 3894 * PG_W must be cleared atomically. Although the pmap 3895 * lock synchronizes access to PG_W, another processor 3896 * could be setting PG_M and/or PG_A concurrently. 3897 */ 3898 pmap_clear_bits(l3, PTE_SW_WIRED); 3899 pmap->pm_stats.wired_count--; 3900 } 3901 } 3902 if (pv_lists_locked) 3903 rw_runlock(&pvh_global_lock); 3904 PMAP_UNLOCK(pmap); 3905 } 3906 3907 /* 3908 * Copy the range specified by src_addr/len 3909 * from the source map to the range dst_addr/len 3910 * in the destination map. 3911 * 3912 * This routine is only advisory and need not do anything. 3913 */ 3914 3915 void 3916 pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len, 3917 vm_offset_t src_addr) 3918 { 3919 3920 } 3921 3922 /* 3923 * pmap_zero_page zeros the specified hardware page by mapping 3924 * the page into KVM and using bzero to clear its contents. 3925 */ 3926 void 3927 pmap_zero_page(vm_page_t m) 3928 { 3929 vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); 3930 3931 pagezero((void *)va); 3932 } 3933 3934 /* 3935 * pmap_zero_page_area zeros the specified hardware page by mapping 3936 * the page into KVM and using bzero to clear its contents. 3937 * 3938 * off and size may not cover an area beyond a single hardware page. 
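 *
 * On RISC-V the page is addressed through the direct map, so no
 * temporary mapping is actually created; a full page is cleared
 * with pagezero() and a partial page with bzero().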
3939 */ 3940 void 3941 pmap_zero_page_area(vm_page_t m, int off, int size) 3942 { 3943 vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); 3944 3945 if (off == 0 && size == PAGE_SIZE) 3946 pagezero((void *)va); 3947 else 3948 bzero((char *)va + off, size); 3949 } 3950 3951 /* 3952 * pmap_copy_page copies the specified (machine independent) 3953 * page by mapping the page into virtual memory and using 3954 * bcopy to copy the page, one machine dependent page at a 3955 * time. 3956 */ 3957 void 3958 pmap_copy_page(vm_page_t msrc, vm_page_t mdst) 3959 { 3960 vm_offset_t src = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(msrc)); 3961 vm_offset_t dst = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mdst)); 3962 3963 pagecopy((void *)src, (void *)dst); 3964 } 3965 3966 int unmapped_buf_allowed = 1; 3967 3968 void 3969 pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[], 3970 vm_offset_t b_offset, int xfersize) 3971 { 3972 void *a_cp, *b_cp; 3973 vm_page_t m_a, m_b; 3974 vm_paddr_t p_a, p_b; 3975 vm_offset_t a_pg_offset, b_pg_offset; 3976 int cnt; 3977 3978 while (xfersize > 0) { 3979 a_pg_offset = a_offset & PAGE_MASK; 3980 m_a = ma[a_offset >> PAGE_SHIFT]; 3981 p_a = m_a->phys_addr; 3982 b_pg_offset = b_offset & PAGE_MASK; 3983 m_b = mb[b_offset >> PAGE_SHIFT]; 3984 p_b = m_b->phys_addr; 3985 cnt = min(xfersize, PAGE_SIZE - a_pg_offset); 3986 cnt = min(cnt, PAGE_SIZE - b_pg_offset); 3987 if (__predict_false(!PHYS_IN_DMAP(p_a))) { 3988 panic("!DMAP a %lx", p_a); 3989 } else { 3990 a_cp = (char *)PHYS_TO_DMAP(p_a) + a_pg_offset; 3991 } 3992 if (__predict_false(!PHYS_IN_DMAP(p_b))) { 3993 panic("!DMAP b %lx", p_b); 3994 } else { 3995 b_cp = (char *)PHYS_TO_DMAP(p_b) + b_pg_offset; 3996 } 3997 bcopy(a_cp, b_cp, cnt); 3998 a_offset += cnt; 3999 b_offset += cnt; 4000 xfersize -= cnt; 4001 } 4002 } 4003 4004 vm_offset_t 4005 pmap_quick_enter_page(vm_page_t m) 4006 { 4007 4008 return (PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m))); 4009 } 4010 4011 void 4012 pmap_quick_remove_page(vm_offset_t addr) 4013 { 4014 } 4015 4016 /* 4017 * Returns true if the pmap's pv is one of the first 4018 * 16 pvs linked to from this page. This count may 4019 * be changed upwards or downwards in the future; it 4020 * is only necessary that true be returned for a small 4021 * subset of pmaps for proper page aging. 4022 */ 4023 bool 4024 pmap_page_exists_quick(pmap_t pmap, vm_page_t m) 4025 { 4026 struct md_page *pvh; 4027 struct rwlock *lock; 4028 pv_entry_t pv; 4029 int loops = 0; 4030 bool rv; 4031 4032 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 4033 ("pmap_page_exists_quick: page %p is not managed", m)); 4034 rv = false; 4035 rw_rlock(&pvh_global_lock); 4036 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 4037 rw_rlock(lock); 4038 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 4039 if (PV_PMAP(pv) == pmap) { 4040 rv = true; 4041 break; 4042 } 4043 loops++; 4044 if (loops >= 16) 4045 break; 4046 } 4047 if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) { 4048 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 4049 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 4050 if (PV_PMAP(pv) == pmap) { 4051 rv = true; 4052 break; 4053 } 4054 loops++; 4055 if (loops >= 16) 4056 break; 4057 } 4058 } 4059 rw_runlock(lock); 4060 rw_runlock(&pvh_global_lock); 4061 return (rv); 4062 } 4063 4064 /* 4065 * pmap_page_wired_mappings: 4066 * 4067 * Return the number of managed mappings to the given physical page 4068 * that are wired. 
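 *
 * Both the 4KB mappings on the page's own PV list and, for
 * non-fictitious pages, the 2MB mappings on the corresponding
 * pa_to_pvh() list are counted.  When a pmap lock cannot be taken
 * without blocking, the PV list lock is dropped while it is
 * acquired, and the scan restarts if the generation counts show
 * that the PV lists may have changed.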
4069 */ 4070 int 4071 pmap_page_wired_mappings(vm_page_t m) 4072 { 4073 struct md_page *pvh; 4074 struct rwlock *lock; 4075 pmap_t pmap; 4076 pd_entry_t *l2; 4077 pt_entry_t *l3; 4078 pv_entry_t pv; 4079 int count, md_gen, pvh_gen; 4080 4081 if ((m->oflags & VPO_UNMANAGED) != 0) 4082 return (0); 4083 rw_rlock(&pvh_global_lock); 4084 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 4085 rw_rlock(lock); 4086 restart: 4087 count = 0; 4088 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 4089 pmap = PV_PMAP(pv); 4090 if (!PMAP_TRYLOCK(pmap)) { 4091 md_gen = m->md.pv_gen; 4092 rw_runlock(lock); 4093 PMAP_LOCK(pmap); 4094 rw_rlock(lock); 4095 if (md_gen != m->md.pv_gen) { 4096 PMAP_UNLOCK(pmap); 4097 goto restart; 4098 } 4099 } 4100 l2 = pmap_l2(pmap, pv->pv_va); 4101 KASSERT((pmap_load(l2) & PTE_RWX) == 0, 4102 ("%s: found a 2mpage in page %p's pv list", __func__, m)); 4103 l3 = pmap_l2_to_l3(l2, pv->pv_va); 4104 if ((pmap_load(l3) & PTE_SW_WIRED) != 0) 4105 count++; 4106 PMAP_UNLOCK(pmap); 4107 } 4108 if ((m->flags & PG_FICTITIOUS) == 0) { 4109 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 4110 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 4111 pmap = PV_PMAP(pv); 4112 if (!PMAP_TRYLOCK(pmap)) { 4113 md_gen = m->md.pv_gen; 4114 pvh_gen = pvh->pv_gen; 4115 rw_runlock(lock); 4116 PMAP_LOCK(pmap); 4117 rw_rlock(lock); 4118 if (md_gen != m->md.pv_gen || 4119 pvh_gen != pvh->pv_gen) { 4120 PMAP_UNLOCK(pmap); 4121 goto restart; 4122 } 4123 } 4124 l2 = pmap_l2(pmap, pv->pv_va); 4125 if ((pmap_load(l2) & PTE_SW_WIRED) != 0) 4126 count++; 4127 PMAP_UNLOCK(pmap); 4128 } 4129 } 4130 rw_runlock(lock); 4131 rw_runlock(&pvh_global_lock); 4132 return (count); 4133 } 4134 4135 /* 4136 * Returns true if the given page is mapped individually or as part of 4137 * a 2mpage. Otherwise, returns false. 4138 */ 4139 bool 4140 pmap_page_is_mapped(vm_page_t m) 4141 { 4142 struct rwlock *lock; 4143 bool rv; 4144 4145 if ((m->oflags & VPO_UNMANAGED) != 0) 4146 return (false); 4147 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 4148 rw_rlock(lock); 4149 rv = !TAILQ_EMPTY(&m->md.pv_list) || 4150 ((m->flags & PG_FICTITIOUS) == 0 && 4151 !TAILQ_EMPTY(&pa_to_pvh(VM_PAGE_TO_PHYS(m))->pv_list)); 4152 rw_runlock(lock); 4153 return (rv); 4154 } 4155 4156 static void 4157 pmap_remove_pages_pv(pmap_t pmap, vm_page_t m, pv_entry_t pv, 4158 struct spglist *free, bool superpage) 4159 { 4160 struct md_page *pvh; 4161 vm_page_t mpte, mt; 4162 4163 if (superpage) { 4164 pmap_resident_count_dec(pmap, Ln_ENTRIES); 4165 pvh = pa_to_pvh(m->phys_addr); 4166 TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); 4167 pvh->pv_gen++; 4168 if (TAILQ_EMPTY(&pvh->pv_list)) { 4169 for (mt = m; mt < &m[Ln_ENTRIES]; mt++) 4170 if (TAILQ_EMPTY(&mt->md.pv_list) && 4171 (mt->a.flags & PGA_WRITEABLE) != 0) 4172 vm_page_aflag_clear(mt, PGA_WRITEABLE); 4173 } 4174 mpte = pmap_remove_pt_page(pmap, pv->pv_va); 4175 if (mpte != NULL) { 4176 KASSERT(vm_page_any_valid(mpte), 4177 ("pmap_remove_pages: pte page not promoted")); 4178 pmap_resident_count_dec(pmap, 1); 4179 KASSERT(mpte->ref_count == Ln_ENTRIES, 4180 ("pmap_remove_pages: pte page ref count error")); 4181 mpte->ref_count = 0; 4182 pmap_add_delayed_free_list(mpte, free, false); 4183 } 4184 } else { 4185 pmap_resident_count_dec(pmap, 1); 4186 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 4187 m->md.pv_gen++; 4188 if (TAILQ_EMPTY(&m->md.pv_list) && 4189 (m->a.flags & PGA_WRITEABLE) != 0) { 4190 pvh = pa_to_pvh(m->phys_addr); 4191 if (TAILQ_EMPTY(&pvh->pv_list)) 4192 vm_page_aflag_clear(m, PGA_WRITEABLE); 4193 } 4194 } 4195 } 4196 4197 /* 4198 * Destroy all 
managed, non-wired mappings in the given user-space 4199 * pmap. This pmap cannot be active on any processor besides the 4200 * caller. 4201 * 4202 * This function cannot be applied to the kernel pmap. Moreover, it 4203 * is not intended for general use. It is only to be used during 4204 * process termination. Consequently, it can be implemented in ways 4205 * that make it faster than pmap_remove(). First, it can more quickly 4206 * destroy mappings by iterating over the pmap's collection of PV 4207 * entries, rather than searching the page table. Second, it doesn't 4208 * have to test and clear the page table entries atomically, because 4209 * no processor is currently accessing the user address space. In 4210 * particular, a page table entry's dirty bit won't change state once 4211 * this function starts. 4212 */ 4213 void 4214 pmap_remove_pages(pmap_t pmap) 4215 { 4216 struct spglist free; 4217 pd_entry_t ptepde; 4218 pt_entry_t *pte, tpte; 4219 vm_page_t m, mt; 4220 pv_entry_t pv; 4221 struct pv_chunk *pc, *npc; 4222 struct rwlock *lock; 4223 int64_t bit; 4224 uint64_t inuse, bitmask; 4225 int allfree, field, freed __pv_stat_used, idx; 4226 bool superpage; 4227 4228 lock = NULL; 4229 4230 SLIST_INIT(&free); 4231 rw_rlock(&pvh_global_lock); 4232 PMAP_LOCK(pmap); 4233 TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) { 4234 allfree = 1; 4235 freed = 0; 4236 for (field = 0; field < _NPCM; field++) { 4237 inuse = ~pc->pc_map[field] & pc_freemask[field]; 4238 while (inuse != 0) { 4239 bit = ffsl(inuse) - 1; 4240 bitmask = 1UL << bit; 4241 idx = field * 64 + bit; 4242 pv = &pc->pc_pventry[idx]; 4243 inuse &= ~bitmask; 4244 4245 pte = pmap_l1(pmap, pv->pv_va); 4246 ptepde = pmap_load(pte); 4247 pte = pmap_l1_to_l2(pte, pv->pv_va); 4248 tpte = pmap_load(pte); 4249 4250 KASSERT((tpte & PTE_V) != 0, 4251 ("L2 PTE is invalid... bogus PV entry? " 4252 "va=%#lx, pte=%#lx", pv->pv_va, tpte)); 4253 if ((tpte & PTE_RWX) != 0) { 4254 superpage = true; 4255 } else { 4256 ptepde = tpte; 4257 pte = pmap_l2_to_l3(pte, pv->pv_va); 4258 tpte = pmap_load(pte); 4259 superpage = false; 4260 } 4261 4262 /* 4263 * We cannot remove wired pages from a 4264 * process' mapping at this time. 4265 */ 4266 if (tpte & PTE_SW_WIRED) { 4267 allfree = 0; 4268 continue; 4269 } 4270 4271 m = PTE_TO_VM_PAGE(tpte); 4272 KASSERT((m->flags & PG_FICTITIOUS) != 0 || 4273 m < &vm_page_array[vm_page_array_size], 4274 ("pmap_remove_pages: bad pte %#jx", 4275 (uintmax_t)tpte)); 4276 4277 pmap_clear(pte); 4278 4279 /* 4280 * Update the vm_page_t clean/reference bits. 
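 *
 * A mapping that is both writable and dirty (PTE_W | PTE_D set)
 * transfers its modified state to the vm_page; for a 2MB superpage
 * mapping, every 4KB page within the superpage is dirtied.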
4281 */ 4282 if ((tpte & (PTE_D | PTE_W)) == 4283 (PTE_D | PTE_W)) { 4284 if (superpage) 4285 for (mt = m; 4286 mt < &m[Ln_ENTRIES]; mt++) 4287 vm_page_dirty(mt); 4288 else 4289 vm_page_dirty(m); 4290 } 4291 4292 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m); 4293 4294 /* Mark free */ 4295 pc->pc_map[field] |= bitmask; 4296 4297 pmap_remove_pages_pv(pmap, m, pv, &free, 4298 superpage); 4299 pmap_unuse_pt(pmap, pv->pv_va, ptepde, &free); 4300 freed++; 4301 } 4302 } 4303 PV_STAT(atomic_add_long(&pv_entry_frees, freed)); 4304 PV_STAT(atomic_add_int(&pv_entry_spare, freed)); 4305 PV_STAT(atomic_subtract_long(&pv_entry_count, freed)); 4306 if (allfree) { 4307 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 4308 free_pv_chunk(pc); 4309 } 4310 } 4311 if (lock != NULL) 4312 rw_wunlock(lock); 4313 pmap_invalidate_all(pmap); 4314 rw_runlock(&pvh_global_lock); 4315 PMAP_UNLOCK(pmap); 4316 vm_page_free_pages_toq(&free, false); 4317 } 4318 4319 static bool 4320 pmap_page_test_mappings(vm_page_t m, bool accessed, bool modified) 4321 { 4322 struct md_page *pvh; 4323 struct rwlock *lock; 4324 pd_entry_t *l2; 4325 pt_entry_t *l3, mask; 4326 pv_entry_t pv; 4327 pmap_t pmap; 4328 int md_gen, pvh_gen; 4329 bool rv; 4330 4331 mask = 0; 4332 if (modified) 4333 mask |= PTE_D; 4334 if (accessed) 4335 mask |= PTE_A; 4336 4337 rv = false; 4338 rw_rlock(&pvh_global_lock); 4339 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 4340 rw_rlock(lock); 4341 restart: 4342 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 4343 pmap = PV_PMAP(pv); 4344 if (!PMAP_TRYLOCK(pmap)) { 4345 md_gen = m->md.pv_gen; 4346 rw_runlock(lock); 4347 PMAP_LOCK(pmap); 4348 rw_rlock(lock); 4349 if (md_gen != m->md.pv_gen) { 4350 PMAP_UNLOCK(pmap); 4351 goto restart; 4352 } 4353 } 4354 l2 = pmap_l2(pmap, pv->pv_va); 4355 KASSERT((pmap_load(l2) & PTE_RWX) == 0, 4356 ("%s: found a 2mpage in page %p's pv list", __func__, m)); 4357 l3 = pmap_l2_to_l3(l2, pv->pv_va); 4358 rv = (pmap_load(l3) & mask) == mask; 4359 PMAP_UNLOCK(pmap); 4360 if (rv) 4361 goto out; 4362 } 4363 if ((m->flags & PG_FICTITIOUS) == 0) { 4364 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 4365 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 4366 pmap = PV_PMAP(pv); 4367 if (!PMAP_TRYLOCK(pmap)) { 4368 md_gen = m->md.pv_gen; 4369 pvh_gen = pvh->pv_gen; 4370 rw_runlock(lock); 4371 PMAP_LOCK(pmap); 4372 rw_rlock(lock); 4373 if (md_gen != m->md.pv_gen || 4374 pvh_gen != pvh->pv_gen) { 4375 PMAP_UNLOCK(pmap); 4376 goto restart; 4377 } 4378 } 4379 l2 = pmap_l2(pmap, pv->pv_va); 4380 rv = (pmap_load(l2) & mask) == mask; 4381 PMAP_UNLOCK(pmap); 4382 if (rv) 4383 goto out; 4384 } 4385 } 4386 out: 4387 rw_runlock(lock); 4388 rw_runlock(&pvh_global_lock); 4389 return (rv); 4390 } 4391 4392 /* 4393 * pmap_is_modified: 4394 * 4395 * Return whether or not the specified physical page was modified 4396 * in any physical maps. 4397 */ 4398 bool 4399 pmap_is_modified(vm_page_t m) 4400 { 4401 4402 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 4403 ("pmap_is_modified: page %p is not managed", m)); 4404 4405 /* 4406 * If the page is not busied then this check is racy. 4407 */ 4408 if (!pmap_page_is_write_mapped(m)) 4409 return (false); 4410 return (pmap_page_test_mappings(m, false, true)); 4411 } 4412 4413 /* 4414 * pmap_is_prefaultable: 4415 * 4416 * Return whether or not the specified virtual address is eligible 4417 * for prefault. 
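 *
 * In this pmap, an address is considered prefaultable when its L3
 * page table exists but the L3 entry is clear, so a mapping could
 * be installed without allocating a new page table page.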
4418 */ 4419 bool 4420 pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr) 4421 { 4422 pt_entry_t *l3; 4423 bool rv; 4424 4425 /* 4426 * Return true if and only if the L3 entry for the specified virtual 4427 * address is allocated but invalid. 4428 */ 4429 rv = false; 4430 PMAP_LOCK(pmap); 4431 l3 = pmap_l3(pmap, addr); 4432 if (l3 != NULL && pmap_load(l3) == 0) { 4433 rv = true; 4434 } 4435 PMAP_UNLOCK(pmap); 4436 return (rv); 4437 } 4438 4439 /* 4440 * pmap_is_referenced: 4441 * 4442 * Return whether or not the specified physical page was referenced 4443 * in any physical maps. 4444 */ 4445 bool 4446 pmap_is_referenced(vm_page_t m) 4447 { 4448 4449 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 4450 ("pmap_is_referenced: page %p is not managed", m)); 4451 return (pmap_page_test_mappings(m, true, false)); 4452 } 4453 4454 /* 4455 * Clear the write and modified bits in each of the given page's mappings. 4456 */ 4457 void 4458 pmap_remove_write(vm_page_t m) 4459 { 4460 struct md_page *pvh; 4461 struct rwlock *lock; 4462 pmap_t pmap; 4463 pd_entry_t *l2; 4464 pt_entry_t *l3, oldl3, newl3; 4465 pv_entry_t next_pv, pv; 4466 vm_offset_t va; 4467 int md_gen, pvh_gen; 4468 4469 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 4470 ("pmap_remove_write: page %p is not managed", m)); 4471 vm_page_assert_busied(m); 4472 4473 if (!pmap_page_is_write_mapped(m)) 4474 return; 4475 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 4476 pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : 4477 pa_to_pvh(VM_PAGE_TO_PHYS(m)); 4478 rw_rlock(&pvh_global_lock); 4479 retry_pv_loop: 4480 rw_wlock(lock); 4481 TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { 4482 pmap = PV_PMAP(pv); 4483 if (!PMAP_TRYLOCK(pmap)) { 4484 pvh_gen = pvh->pv_gen; 4485 rw_wunlock(lock); 4486 PMAP_LOCK(pmap); 4487 rw_wlock(lock); 4488 if (pvh_gen != pvh->pv_gen) { 4489 PMAP_UNLOCK(pmap); 4490 rw_wunlock(lock); 4491 goto retry_pv_loop; 4492 } 4493 } 4494 va = pv->pv_va; 4495 l2 = pmap_l2(pmap, va); 4496 if ((pmap_load(l2) & PTE_W) != 0) 4497 (void)pmap_demote_l2_locked(pmap, l2, va, &lock); 4498 KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m), 4499 ("inconsistent pv lock %p %p for page %p", 4500 lock, VM_PAGE_TO_PV_LIST_LOCK(m), m)); 4501 PMAP_UNLOCK(pmap); 4502 } 4503 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 4504 pmap = PV_PMAP(pv); 4505 if (!PMAP_TRYLOCK(pmap)) { 4506 pvh_gen = pvh->pv_gen; 4507 md_gen = m->md.pv_gen; 4508 rw_wunlock(lock); 4509 PMAP_LOCK(pmap); 4510 rw_wlock(lock); 4511 if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { 4512 PMAP_UNLOCK(pmap); 4513 rw_wunlock(lock); 4514 goto retry_pv_loop; 4515 } 4516 } 4517 l2 = pmap_l2(pmap, pv->pv_va); 4518 KASSERT((pmap_load(l2) & PTE_RWX) == 0, 4519 ("%s: found a 2mpage in page %p's pv list", __func__, m)); 4520 l3 = pmap_l2_to_l3(l2, pv->pv_va); 4521 oldl3 = pmap_load(l3); 4522 retry: 4523 if ((oldl3 & PTE_W) != 0) { 4524 newl3 = oldl3 & ~(PTE_D | PTE_W); 4525 if (!atomic_fcmpset_long(l3, &oldl3, newl3)) 4526 goto retry; 4527 if ((oldl3 & PTE_D) != 0) 4528 vm_page_dirty(m); 4529 pmap_invalidate_page(pmap, pv->pv_va); 4530 } 4531 PMAP_UNLOCK(pmap); 4532 } 4533 rw_wunlock(lock); 4534 vm_page_aflag_clear(m, PGA_WRITEABLE); 4535 rw_runlock(&pvh_global_lock); 4536 } 4537 4538 /* 4539 * pmap_ts_referenced: 4540 * 4541 * Return a count of reference bits for a page, clearing those bits. 4542 * It is not necessary for every reference bit to be cleared, but it 4543 * is necessary that 0 only be returned when there are truly no 4544 * reference bits set. 
4545 * 4546 * As an optimization, update the page's dirty field if a modified bit is 4547 * found while counting reference bits. This opportunistic update can be 4548 * performed at low cost and can eliminate the need for some future calls 4549 * to pmap_is_modified(). However, since this function stops after 4550 * finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some 4551 * dirty pages. Those dirty pages will only be detected by a future call 4552 * to pmap_is_modified(). 4553 */ 4554 int 4555 pmap_ts_referenced(vm_page_t m) 4556 { 4557 struct spglist free; 4558 struct md_page *pvh; 4559 struct rwlock *lock; 4560 pv_entry_t pv, pvf; 4561 pmap_t pmap; 4562 pd_entry_t *l2, l2e; 4563 pt_entry_t *l3, l3e; 4564 vm_paddr_t pa; 4565 vm_offset_t va; 4566 int cleared, md_gen, not_cleared, pvh_gen; 4567 4568 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 4569 ("pmap_ts_referenced: page %p is not managed", m)); 4570 SLIST_INIT(&free); 4571 cleared = 0; 4572 pa = VM_PAGE_TO_PHYS(m); 4573 pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : pa_to_pvh(pa); 4574 4575 lock = PHYS_TO_PV_LIST_LOCK(pa); 4576 rw_rlock(&pvh_global_lock); 4577 rw_wlock(lock); 4578 retry: 4579 not_cleared = 0; 4580 if ((pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL) 4581 goto small_mappings; 4582 pv = pvf; 4583 do { 4584 pmap = PV_PMAP(pv); 4585 if (!PMAP_TRYLOCK(pmap)) { 4586 pvh_gen = pvh->pv_gen; 4587 rw_wunlock(lock); 4588 PMAP_LOCK(pmap); 4589 rw_wlock(lock); 4590 if (pvh_gen != pvh->pv_gen) { 4591 PMAP_UNLOCK(pmap); 4592 goto retry; 4593 } 4594 } 4595 va = pv->pv_va; 4596 l2 = pmap_l2(pmap, va); 4597 l2e = pmap_load(l2); 4598 if ((l2e & (PTE_W | PTE_D)) == (PTE_W | PTE_D)) { 4599 /* 4600 * Although l2e is mapping a 2MB page, because 4601 * this function is called at a 4KB page granularity, 4602 * we only update the 4KB page under test. 4603 */ 4604 vm_page_dirty(m); 4605 } 4606 if ((l2e & PTE_A) != 0) { 4607 /* 4608 * Since this reference bit is shared by 512 4KB 4609 * pages, it should not be cleared every time it is 4610 * tested. Apply a simple "hash" function on the 4611 * physical page number, the virtual superpage number, 4612 * and the pmap address to select one 4KB page out of 4613 * the 512 on which testing the reference bit will 4614 * result in clearing that reference bit. This 4615 * function is designed to avoid the selection of the 4616 * same 4KB page for every 2MB page mapping. 4617 * 4618 * On demotion, a mapping that hasn't been referenced 4619 * is simply destroyed. To avoid the possibility of a 4620 * subsequent page fault on a demoted wired mapping, 4621 * always leave its reference bit set. Moreover, 4622 * since the superpage is wired, the current state of 4623 * its reference bit won't affect page replacement. 4624 */ 4625 if ((((pa >> PAGE_SHIFT) ^ (pv->pv_va >> L2_SHIFT) ^ 4626 (uintptr_t)pmap) & (Ln_ENTRIES - 1)) == 0 && 4627 (l2e & PTE_SW_WIRED) == 0) { 4628 pmap_clear_bits(l2, PTE_A); 4629 pmap_invalidate_page(pmap, va); 4630 cleared++; 4631 } else 4632 not_cleared++; 4633 } 4634 PMAP_UNLOCK(pmap); 4635 /* Rotate the PV list if it has more than one entry. 
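 * Moving the entry to the tail spreads the reference-bit sampling
 * over different mappings on successive calls and, together with
 * the TAILQ_FIRST() != pvf loop condition, bounds the scan to a
 * single pass over the list.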
*/ 4636 if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) { 4637 TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); 4638 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); 4639 pvh->pv_gen++; 4640 } 4641 if (cleared + not_cleared >= PMAP_TS_REFERENCED_MAX) 4642 goto out; 4643 } while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf); 4644 small_mappings: 4645 if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL) 4646 goto out; 4647 pv = pvf; 4648 do { 4649 pmap = PV_PMAP(pv); 4650 if (!PMAP_TRYLOCK(pmap)) { 4651 pvh_gen = pvh->pv_gen; 4652 md_gen = m->md.pv_gen; 4653 rw_wunlock(lock); 4654 PMAP_LOCK(pmap); 4655 rw_wlock(lock); 4656 if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { 4657 PMAP_UNLOCK(pmap); 4658 goto retry; 4659 } 4660 } 4661 l2 = pmap_l2(pmap, pv->pv_va); 4662 4663 KASSERT((pmap_load(l2) & PTE_RX) == 0, 4664 ("pmap_ts_referenced: found an invalid l2 table")); 4665 4666 l3 = pmap_l2_to_l3(l2, pv->pv_va); 4667 l3e = pmap_load(l3); 4668 if ((l3e & PTE_D) != 0) 4669 vm_page_dirty(m); 4670 if ((l3e & PTE_A) != 0) { 4671 if ((l3e & PTE_SW_WIRED) == 0) { 4672 /* 4673 * Wired pages cannot be paged out so 4674 * doing accessed bit emulation for 4675 * them is wasted effort. We do the 4676 * hard work for unwired pages only. 4677 */ 4678 pmap_clear_bits(l3, PTE_A); 4679 pmap_invalidate_page(pmap, pv->pv_va); 4680 cleared++; 4681 } else 4682 not_cleared++; 4683 } 4684 PMAP_UNLOCK(pmap); 4685 /* Rotate the PV list if it has more than one entry. */ 4686 if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) { 4687 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 4688 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 4689 m->md.pv_gen++; 4690 } 4691 } while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && cleared + 4692 not_cleared < PMAP_TS_REFERENCED_MAX); 4693 out: 4694 rw_wunlock(lock); 4695 rw_runlock(&pvh_global_lock); 4696 vm_page_free_pages_toq(&free, false); 4697 return (cleared + not_cleared); 4698 } 4699 4700 /* 4701 * Apply the given advice to the specified range of addresses within the 4702 * given pmap. Depending on the advice, clear the referenced and/or 4703 * modified flags in each mapping and set the mapped page's dirty field. 4704 */ 4705 void 4706 pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice) 4707 { 4708 } 4709 4710 /* 4711 * Clear the modify bits on the specified physical page. 4712 */ 4713 void 4714 pmap_clear_modify(vm_page_t m) 4715 { 4716 struct md_page *pvh; 4717 struct rwlock *lock; 4718 pmap_t pmap; 4719 pv_entry_t next_pv, pv; 4720 pd_entry_t *l2, oldl2; 4721 pt_entry_t *l3; 4722 vm_offset_t va; 4723 int md_gen, pvh_gen; 4724 4725 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 4726 ("%s: page %p is not managed", __func__, m)); 4727 vm_page_assert_busied(m); 4728 4729 if (!pmap_page_is_write_mapped(m)) 4730 return; 4731 4732 pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : 4733 pa_to_pvh(VM_PAGE_TO_PHYS(m)); 4734 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 4735 rw_rlock(&pvh_global_lock); 4736 rw_wlock(lock); 4737 restart: 4738 TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { 4739 pmap = PV_PMAP(pv); 4740 if (!PMAP_TRYLOCK(pmap)) { 4741 pvh_gen = pvh->pv_gen; 4742 rw_wunlock(lock); 4743 PMAP_LOCK(pmap); 4744 rw_wlock(lock); 4745 if (pvh_gen != pvh->pv_gen) { 4746 PMAP_UNLOCK(pmap); 4747 goto restart; 4748 } 4749 } 4750 va = pv->pv_va; 4751 l2 = pmap_l2(pmap, va); 4752 oldl2 = pmap_load(l2); 4753 /* If oldl2 has PTE_W set, then it also has PTE_D set. 
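 * Hence a writable superpage is also dirty.  The code below demotes
 * it and, provided the mapping is not wired, write-protects and
 * cleans only the 4KB mapping of the page under test, calling
 * vm_page_dirty() so the modified state is not lost before the
 * hardware bits are cleared.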
*/ 4754 if ((oldl2 & PTE_W) != 0 && 4755 pmap_demote_l2_locked(pmap, l2, va, &lock) && 4756 (oldl2 & PTE_SW_WIRED) == 0) { 4757 /* 4758 * Write protect the mapping to a single page so that 4759 * a subsequent write access may repromote. 4760 */ 4761 va += VM_PAGE_TO_PHYS(m) - PTE_TO_PHYS(oldl2); 4762 l3 = pmap_l2_to_l3(l2, va); 4763 pmap_clear_bits(l3, PTE_D | PTE_W); 4764 vm_page_dirty(m); 4765 pmap_invalidate_page(pmap, va); 4766 } 4767 PMAP_UNLOCK(pmap); 4768 } 4769 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 4770 pmap = PV_PMAP(pv); 4771 if (!PMAP_TRYLOCK(pmap)) { 4772 md_gen = m->md.pv_gen; 4773 pvh_gen = pvh->pv_gen; 4774 rw_wunlock(lock); 4775 PMAP_LOCK(pmap); 4776 rw_wlock(lock); 4777 if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { 4778 PMAP_UNLOCK(pmap); 4779 goto restart; 4780 } 4781 } 4782 l2 = pmap_l2(pmap, pv->pv_va); 4783 KASSERT((pmap_load(l2) & PTE_RWX) == 0, 4784 ("%s: found a 2mpage in page %p's pv list", __func__, m)); 4785 l3 = pmap_l2_to_l3(l2, pv->pv_va); 4786 if ((pmap_load(l3) & (PTE_D | PTE_W)) == (PTE_D | PTE_W)) { 4787 pmap_clear_bits(l3, PTE_D | PTE_W); 4788 pmap_invalidate_page(pmap, pv->pv_va); 4789 } 4790 PMAP_UNLOCK(pmap); 4791 } 4792 rw_wunlock(lock); 4793 rw_runlock(&pvh_global_lock); 4794 } 4795 4796 void * 4797 pmap_mapbios(vm_paddr_t pa, vm_size_t size) 4798 { 4799 4800 return ((void *)PHYS_TO_DMAP(pa)); 4801 } 4802 4803 void 4804 pmap_unmapbios(void *p, vm_size_t size) 4805 { 4806 } 4807 4808 /* 4809 * Sets the memory attribute for the specified page. 4810 */ 4811 void 4812 pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma) 4813 { 4814 4815 m->md.pv_memattr = ma; 4816 4817 /* 4818 * If "m" is a normal page, update its direct mapping. This update 4819 * can be relied upon to perform any cache operations that are 4820 * required for data coherence. 4821 */ 4822 if ((m->flags & PG_FICTITIOUS) == 0 && 4823 pmap_change_attr(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)), PAGE_SIZE, 4824 m->md.pv_memattr) != 0) 4825 panic("memory attribute change on the direct map failed"); 4826 } 4827 4828 /* 4829 * Changes the specified virtual address range's memory type to that given by 4830 * the parameter "mode". The specified virtual address range must be 4831 * completely contained within either the direct map or the kernel map. 4832 * 4833 * Returns zero if the change completed successfully, and either EINVAL or 4834 * ENOMEM if the change failed. Specifically, EINVAL is returned if some part 4835 * of the virtual address range was not mapped, and ENOMEM is returned if 4836 * there was insufficient memory available to complete the change. In the 4837 * latter case, the memory type may have been changed on some part of the 4838 * virtual address range. 
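 *
 * Internally the work is done in two passes over the range: the
 * first pass validates every mapping and demotes any 1GB or 2MB
 * mapping whose attributes must change but which is not fully
 * covered by the range, and the second pass rewrites the attribute
 * bits, recursing into the direct map alias of any page whose
 * attributes are changed through the kernel map.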
4839 */ 4840 int 4841 pmap_change_attr(vm_offset_t va, vm_size_t size, int mode) 4842 { 4843 int error; 4844 4845 PMAP_LOCK(kernel_pmap); 4846 error = pmap_change_attr_locked(va, size, mode); 4847 PMAP_UNLOCK(kernel_pmap); 4848 return (error); 4849 } 4850 4851 static int 4852 pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode) 4853 { 4854 vm_offset_t base, offset, tmpva; 4855 vm_paddr_t phys; 4856 pd_entry_t *l1, l1e; 4857 pd_entry_t *l2, l2e; 4858 pt_entry_t *l3, l3e; 4859 pt_entry_t bits, mask; 4860 bool anychanged = false; 4861 int error = 0; 4862 4863 PMAP_LOCK_ASSERT(kernel_pmap, MA_OWNED); 4864 base = trunc_page(va); 4865 offset = va & PAGE_MASK; 4866 size = round_page(offset + size); 4867 4868 if (!VIRT_IN_DMAP(base) && 4869 !(base >= VM_MIN_KERNEL_ADDRESS && base < VM_MAX_KERNEL_ADDRESS)) 4870 return (EINVAL); 4871 4872 bits = pmap_memattr_bits(mode); 4873 mask = memattr_mask; 4874 4875 /* First loop: perform PTE validation and demotions as necessary. */ 4876 for (tmpva = base; tmpva < base + size; ) { 4877 l1 = pmap_l1(kernel_pmap, tmpva); 4878 if (l1 == NULL || ((l1e = pmap_load(l1)) & PTE_V) == 0) 4879 return (EINVAL); 4880 if ((l1e & PTE_RWX) != 0) { 4881 /* 4882 * If the existing PTE has the correct attributes, then 4883 * no need to demote. 4884 */ 4885 if ((l1e & mask) == bits) { 4886 tmpva = (tmpva & ~L1_OFFSET) + L1_SIZE; 4887 continue; 4888 } 4889 4890 /* 4891 * If the 1GB page fits in the remaining range, we 4892 * don't need to demote. 4893 */ 4894 if ((tmpva & L1_OFFSET) == 0 && 4895 tmpva + L1_SIZE <= base + size) { 4896 tmpva += L1_SIZE; 4897 continue; 4898 } 4899 4900 if (!pmap_demote_l1(kernel_pmap, l1, tmpva)) 4901 return (EINVAL); 4902 } 4903 l2 = pmap_l1_to_l2(l1, tmpva); 4904 if (((l2e = pmap_load(l2)) & PTE_V) == 0) 4905 return (EINVAL); 4906 if ((l2e & PTE_RWX) != 0) { 4907 /* 4908 * If the existing PTE has the correct attributes, then 4909 * no need to demote. 4910 */ 4911 if ((l2e & mask) == bits) { 4912 tmpva = (tmpva & ~L2_OFFSET) + L2_SIZE; 4913 continue; 4914 } 4915 4916 /* 4917 * If the 2MB page fits in the remaining range, we 4918 * don't need to demote. 4919 */ 4920 if ((tmpva & L2_OFFSET) == 0 && 4921 tmpva + L2_SIZE <= base + size) { 4922 tmpva += L2_SIZE; 4923 continue; 4924 } 4925 4926 if (!pmap_demote_l2(kernel_pmap, l2, tmpva)) 4927 panic("l2 demotion failed"); 4928 } 4929 l3 = pmap_l2_to_l3(l2, tmpva); 4930 if (((l3e = pmap_load(l3)) & PTE_V) == 0) 4931 return (EINVAL); 4932 4933 tmpva += PAGE_SIZE; 4934 } 4935 4936 /* Second loop: perform PTE updates. */ 4937 for (tmpva = base; tmpva < base + size; ) { 4938 l1 = pmap_l1(kernel_pmap, tmpva); 4939 l1e = pmap_load(l1); 4940 if ((l1e & PTE_RWX) != 0) { 4941 /* Unchanged. */ 4942 if ((l1e & mask) == bits) { 4943 tmpva += L1_SIZE; 4944 continue; 4945 } 4946 4947 l1e &= ~mask; 4948 l1e |= bits; 4949 pmap_store(l1, l1e); 4950 anychanged = true; 4951 4952 /* Update corresponding DMAP entry */ 4953 phys = L1PTE_TO_PHYS(l1e); 4954 if (!VIRT_IN_DMAP(tmpva) && PHYS_IN_DMAP(phys)) { 4955 error = pmap_change_attr_locked( 4956 PHYS_TO_DMAP(phys), L1_SIZE, mode); 4957 if (error != 0) 4958 break; 4959 } 4960 tmpva += L1_SIZE; 4961 continue; 4962 } 4963 4964 l2 = pmap_l1_to_l2(l1, tmpva); 4965 l2e = pmap_load(l2); 4966 if ((l2e & PTE_RWX) != 0) { 4967 /* Unchanged. 
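 * The 2MB mapping already carries the requested memory attribute,
 * so it is left untouched.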
*/ 4968 if ((l2e & mask) == bits) { 4969 tmpva += L2_SIZE; 4970 continue; 4971 } 4972 4973 l2e &= ~mask; 4974 l2e |= bits; 4975 pmap_store(l2, l2e); 4976 anychanged = true; 4977 4978 /* Update corresponding DMAP entry */ 4979 phys = L2PTE_TO_PHYS(l2e); 4980 if (!VIRT_IN_DMAP(tmpva) && PHYS_IN_DMAP(phys)) { 4981 error = pmap_change_attr_locked( 4982 PHYS_TO_DMAP(phys), L2_SIZE, mode); 4983 if (error != 0) 4984 break; 4985 } 4986 tmpva += L2_SIZE; 4987 continue; 4988 } 4989 4990 l3 = pmap_l2_to_l3(l2, tmpva); 4991 l3e = pmap_load(l3); 4992 4993 /* Unchanged. */ 4994 if ((l3e & mask) == bits) { 4995 tmpva += PAGE_SIZE; 4996 continue; 4997 } 4998 4999 l3e &= ~mask; 5000 l3e |= bits; 5001 pmap_store(l3, l3e); 5002 anychanged = true; 5003 5004 phys = PTE_TO_PHYS(l3e); 5005 if (!VIRT_IN_DMAP(tmpva) && PHYS_IN_DMAP(phys)) { 5006 error = pmap_change_attr_locked(PHYS_TO_DMAP(phys), 5007 L3_SIZE, mode); 5008 if (error != 0) 5009 break; 5010 } 5011 tmpva += PAGE_SIZE; 5012 } 5013 5014 if (anychanged) { 5015 pmap_invalidate_range(kernel_pmap, base, tmpva); 5016 if (mode == VM_MEMATTR_UNCACHEABLE) 5017 cpu_dcache_wbinv_range((void *)base, size); 5018 } 5019 5020 return (error); 5021 } 5022 5023 /* 5024 * Perform the pmap work for mincore(2). If the page is not both referenced and 5025 * modified by this pmap, returns its physical address so that the caller can 5026 * find other mappings. 5027 */ 5028 int 5029 pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *pap) 5030 { 5031 pt_entry_t *l2, *l3, tpte; 5032 vm_paddr_t pa; 5033 int val; 5034 bool managed; 5035 5036 PMAP_LOCK(pmap); 5037 l2 = pmap_l2(pmap, addr); 5038 if (l2 != NULL && ((tpte = pmap_load(l2)) & PTE_V) != 0) { 5039 if ((tpte & PTE_RWX) != 0) { 5040 pa = PTE_TO_PHYS(tpte) | (addr & L2_OFFSET); 5041 val = MINCORE_INCORE | MINCORE_PSIND(1); 5042 } else { 5043 l3 = pmap_l2_to_l3(l2, addr); 5044 tpte = pmap_load(l3); 5045 if ((tpte & PTE_V) == 0) { 5046 PMAP_UNLOCK(pmap); 5047 return (0); 5048 } 5049 pa = PTE_TO_PHYS(tpte) | (addr & L3_OFFSET); 5050 val = MINCORE_INCORE; 5051 } 5052 5053 if ((tpte & PTE_D) != 0) 5054 val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER; 5055 if ((tpte & PTE_A) != 0) 5056 val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER; 5057 managed = (tpte & PTE_SW_MANAGED) == PTE_SW_MANAGED; 5058 } else { 5059 managed = false; 5060 val = 0; 5061 } 5062 if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) != 5063 (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) && managed) { 5064 *pap = pa; 5065 } 5066 PMAP_UNLOCK(pmap); 5067 return (val); 5068 } 5069 5070 void 5071 pmap_activate_sw(struct thread *td) 5072 { 5073 pmap_t oldpmap, pmap; 5074 u_int hart; 5075 5076 oldpmap = PCPU_GET(curpmap); 5077 pmap = vmspace_pmap(td->td_proc->p_vmspace); 5078 if (pmap == oldpmap) 5079 return; 5080 csr_write(satp, pmap->pm_satp); 5081 5082 hart = PCPU_GET(hart); 5083 #ifdef SMP 5084 CPU_SET_ATOMIC(hart, &pmap->pm_active); 5085 CPU_CLR_ATOMIC(hart, &oldpmap->pm_active); 5086 #else 5087 CPU_SET(hart, &pmap->pm_active); 5088 CPU_CLR(hart, &oldpmap->pm_active); 5089 #endif 5090 PCPU_SET(curpmap, pmap); 5091 5092 sfence_vma(); 5093 } 5094 5095 void 5096 pmap_activate(struct thread *td) 5097 { 5098 5099 critical_enter(); 5100 pmap_activate_sw(td); 5101 critical_exit(); 5102 } 5103 5104 void 5105 pmap_activate_boot(pmap_t pmap) 5106 { 5107 u_int hart; 5108 5109 hart = PCPU_GET(hart); 5110 #ifdef SMP 5111 CPU_SET_ATOMIC(hart, &pmap->pm_active); 5112 #else 5113 CPU_SET(hart, &pmap->pm_active); 5114 #endif 5115 PCPU_SET(curpmap, 
pmap); 5116 } 5117 5118 void 5119 pmap_active_cpus(pmap_t pmap, cpuset_t *res) 5120 { 5121 *res = pmap->pm_active; 5122 } 5123 5124 void 5125 pmap_sync_icache(pmap_t pmap, vm_offset_t va, vm_size_t sz) 5126 { 5127 cpuset_t mask; 5128 5129 /* 5130 * From the RISC-V User-Level ISA V2.2: 5131 * 5132 * "To make a store to instruction memory visible to all 5133 * RISC-V harts, the writing hart has to execute a data FENCE 5134 * before requesting that all remote RISC-V harts execute a 5135 * FENCE.I." 5136 * 5137 * However, this is slightly misleading; we still need to 5138 * perform a FENCE.I for the local hart, as FENCE does nothing 5139 * for its icache. FENCE.I alone is also sufficient for the 5140 * local hart. 5141 */ 5142 sched_pin(); 5143 mask = all_harts; 5144 CPU_CLR(PCPU_GET(hart), &mask); 5145 fence_i(); 5146 if (!CPU_EMPTY(&mask) && smp_started) { 5147 fence(); 5148 sbi_remote_fence_i(mask.__bits); 5149 } 5150 sched_unpin(); 5151 } 5152 5153 /* 5154 * Increase the starting virtual address of the given mapping if a 5155 * different alignment might result in more superpage mappings. 5156 */ 5157 void 5158 pmap_align_superpage(vm_object_t object, vm_ooffset_t offset, 5159 vm_offset_t *addr, vm_size_t size) 5160 { 5161 vm_offset_t superpage_offset; 5162 5163 if (size < L2_SIZE) 5164 return; 5165 if (object != NULL && (object->flags & OBJ_COLORED) != 0) 5166 offset += ptoa(object->pg_color); 5167 superpage_offset = offset & L2_OFFSET; 5168 if (size - ((L2_SIZE - superpage_offset) & L2_OFFSET) < L2_SIZE || 5169 (*addr & L2_OFFSET) == superpage_offset) 5170 return; 5171 if ((*addr & L2_OFFSET) < superpage_offset) 5172 *addr = (*addr & ~L2_OFFSET) + superpage_offset; 5173 else 5174 *addr = ((*addr + L2_OFFSET) & ~L2_OFFSET) + superpage_offset; 5175 } 5176 5177 /** 5178 * Get the kernel virtual address of a set of physical pages. If there are 5179 * physical addresses not covered by the DMAP perform a transient mapping 5180 * that will be removed when calling pmap_unmap_io_transient. 5181 * 5182 * \param page The pages the caller wishes to obtain the virtual 5183 * address on the kernel memory map. 5184 * \param vaddr On return contains the kernel virtual memory address 5185 * of the pages passed in the page parameter. 5186 * \param count Number of pages passed in. 5187 * \param can_fault true if the thread using the mapped pages can take 5188 * page faults, false otherwise. 5189 * 5190 * \returns true if the caller must call pmap_unmap_io_transient when 5191 * finished or false otherwise. 5192 * 5193 */ 5194 bool 5195 pmap_map_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count, 5196 bool can_fault) 5197 { 5198 vm_paddr_t paddr; 5199 bool needs_mapping; 5200 int error __diagused, i; 5201 5202 /* 5203 * Allocate any KVA space that we need, this is done in a separate 5204 * loop to prevent calling vmem_alloc while pinned. 
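 * vmem_alloc() with M_WAITOK may sleep, which we prefer to avoid
 * while the thread is pinned by sched_pin() below.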
5205 */ 5206 needs_mapping = false; 5207 for (i = 0; i < count; i++) { 5208 paddr = VM_PAGE_TO_PHYS(page[i]); 5209 if (__predict_false(paddr >= DMAP_MAX_PHYSADDR)) { 5210 error = vmem_alloc(kernel_arena, PAGE_SIZE, 5211 M_BESTFIT | M_WAITOK, &vaddr[i]); 5212 KASSERT(error == 0, ("vmem_alloc failed: %d", error)); 5213 needs_mapping = true; 5214 } else { 5215 vaddr[i] = PHYS_TO_DMAP(paddr); 5216 } 5217 } 5218 5219 /* Exit early if everything is covered by the DMAP */ 5220 if (!needs_mapping) 5221 return (false); 5222 5223 if (!can_fault) 5224 sched_pin(); 5225 for (i = 0; i < count; i++) { 5226 paddr = VM_PAGE_TO_PHYS(page[i]); 5227 if (paddr >= DMAP_MAX_PHYSADDR) { 5228 panic( 5229 "pmap_map_io_transient: TODO: Map out of DMAP data"); 5230 } 5231 } 5232 5233 return (needs_mapping); 5234 } 5235 5236 void 5237 pmap_unmap_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count, 5238 bool can_fault) 5239 { 5240 vm_paddr_t paddr; 5241 int i; 5242 5243 if (!can_fault) 5244 sched_unpin(); 5245 for (i = 0; i < count; i++) { 5246 paddr = VM_PAGE_TO_PHYS(page[i]); 5247 if (paddr >= DMAP_MAX_PHYSADDR) { 5248 panic("RISCVTODO: pmap_unmap_io_transient: Unmap data"); 5249 } 5250 } 5251 } 5252 5253 bool 5254 pmap_is_valid_memattr(pmap_t pmap __unused, vm_memattr_t mode) 5255 { 5256 5257 return (mode >= VM_MEMATTR_DEFAULT && mode <= VM_MEMATTR_LAST); 5258 } 5259 5260 bool 5261 pmap_get_tables(pmap_t pmap, vm_offset_t va, pd_entry_t **l1, pd_entry_t **l2, 5262 pt_entry_t **l3) 5263 { 5264 pd_entry_t *l1p, *l2p; 5265 5266 /* Get l1 directory entry. */ 5267 l1p = pmap_l1(pmap, va); 5268 *l1 = l1p; 5269 5270 if (l1p == NULL || (pmap_load(l1p) & PTE_V) == 0) 5271 return (false); 5272 5273 if ((pmap_load(l1p) & PTE_RX) != 0) { 5274 *l2 = NULL; 5275 *l3 = NULL; 5276 return (true); 5277 } 5278 5279 /* Get l2 directory entry. */ 5280 l2p = pmap_l1_to_l2(l1p, va); 5281 *l2 = l2p; 5282 5283 if (l2p == NULL || (pmap_load(l2p) & PTE_V) == 0) 5284 return (false); 5285 5286 if ((pmap_load(l2p) & PTE_RX) != 0) { 5287 *l3 = NULL; 5288 return (true); 5289 } 5290 5291 /* Get l3 page table entry. */ 5292 *l3 = pmap_l2_to_l3(l2p, va); 5293 5294 return (true); 5295 } 5296 5297 /* 5298 * Track a range of the kernel's virtual address space that is contiguous 5299 * in various mapping attributes. 5300 */ 5301 struct pmap_kernel_map_range { 5302 vm_offset_t sva; 5303 pt_entry_t attrs; 5304 int l3pages; 5305 int l2pages; 5306 int l1pages; 5307 }; 5308 5309 static void 5310 sysctl_kmaps_dump(struct sbuf *sb, struct pmap_kernel_map_range *range, 5311 vm_offset_t eva) 5312 { 5313 char *mode; 5314 int i; 5315 5316 if (eva <= range->sva) 5317 return; 5318 5319 for (i = 0; i < nitems(memattr_bits); i++) 5320 if ((range->attrs & memattr_mask) == memattr_bits[i]) 5321 break; 5322 5323 switch (i) { 5324 case VM_MEMATTR_PMA: 5325 mode = "PMA"; 5326 break; 5327 case VM_MEMATTR_UNCACHEABLE: 5328 mode = "NC "; 5329 break; 5330 case VM_MEMATTR_DEVICE: 5331 mode = "IO "; 5332 break; 5333 default: 5334 mode = "???"; 5335 break; 5336 } 5337 5338 sbuf_printf(sb, "0x%016lx-0x%016lx r%c%c%c%c %s %d %d %d\n", 5339 range->sva, eva, 5340 (range->attrs & PTE_W) == PTE_W ? 'w' : '-', 5341 (range->attrs & PTE_X) == PTE_X ? 'x' : '-', 5342 (range->attrs & PTE_U) == PTE_U ? 'u' : 's', 5343 (range->attrs & PTE_G) == PTE_G ? 'g' : '-', 5344 mode, range->l1pages, range->l2pages, range->l3pages); 5345 5346 /* Reset to sentinel value. 
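 * The sentinel is larger than any kernel virtual address, so the
 * next call to sysctl_kmaps_check() is guaranteed to start a new
 * range.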
*/ 5347 range->sva = 0xfffffffffffffffful; 5348 } 5349 5350 /* 5351 * Determine whether the attributes specified by a page table entry match those 5352 * being tracked by the current range. 5353 */ 5354 static bool 5355 sysctl_kmaps_match(struct pmap_kernel_map_range *range, pt_entry_t attrs) 5356 { 5357 5358 return (range->attrs == attrs); 5359 } 5360 5361 static void 5362 sysctl_kmaps_reinit(struct pmap_kernel_map_range *range, vm_offset_t va, 5363 pt_entry_t attrs) 5364 { 5365 5366 memset(range, 0, sizeof(*range)); 5367 range->sva = va; 5368 range->attrs = attrs; 5369 } 5370 5371 /* 5372 * Given a leaf PTE, derive the mapping's attributes. If they do not match 5373 * those of the current run, dump the address range and its attributes, and 5374 * begin a new run. 5375 */ 5376 static void 5377 sysctl_kmaps_check(struct sbuf *sb, struct pmap_kernel_map_range *range, 5378 vm_offset_t va, pd_entry_t l1e, pd_entry_t l2e, pt_entry_t l3e) 5379 { 5380 pt_entry_t attrs; 5381 5382 /* The PTE global bit is inherited by lower levels. */ 5383 attrs = l1e & PTE_G; 5384 if ((l1e & PTE_RWX) != 0) { 5385 attrs |= l1e & (PTE_RWX | PTE_U); 5386 attrs |= l1e & memattr_mask; 5387 } else if (l2e != 0) 5388 attrs |= l2e & PTE_G; 5389 5390 if ((l2e & PTE_RWX) != 0) { 5391 attrs |= l2e & (PTE_RWX | PTE_U); 5392 attrs |= l2e & memattr_mask; 5393 } else if (l3e != 0) { 5394 attrs |= l3e & (PTE_RWX | PTE_U | PTE_G); 5395 attrs |= l3e & memattr_mask; 5396 } 5397 5398 if (range->sva > va || !sysctl_kmaps_match(range, attrs)) { 5399 sysctl_kmaps_dump(sb, range, va); 5400 sysctl_kmaps_reinit(range, va, attrs); 5401 } 5402 } 5403 5404 static int 5405 sysctl_kmaps(SYSCTL_HANDLER_ARGS) 5406 { 5407 struct pmap_kernel_map_range range; 5408 struct sbuf sbuf, *sb; 5409 pd_entry_t *l1, l1e, *l2, l2e; 5410 pt_entry_t *l3, l3e; 5411 vm_offset_t sva; 5412 vm_paddr_t pa; 5413 int error, i, j, k; 5414 5415 error = sysctl_wire_old_buffer(req, 0); 5416 if (error != 0) 5417 return (error); 5418 sb = &sbuf; 5419 sbuf_new_for_sysctl(sb, NULL, PAGE_SIZE, req); 5420 5421 /* Sentinel value. */ 5422 range.sva = 0xfffffffffffffffful; 5423 5424 /* 5425 * Iterate over the kernel page tables without holding the kernel pmap 5426 * lock. Kernel page table pages are never freed, so at worst we will 5427 * observe inconsistencies in the output. 
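 *
 * The walk proceeds top-down through the L1, L2, and L3 tables,
 * coalescing leaf mappings with identical attributes into ranges
 * that are reported by sysctl_kmaps_check() and sysctl_kmaps_dump().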
5428 */ 5429 sva = VM_MIN_KERNEL_ADDRESS; 5430 for (i = pmap_l1_index(sva); i < Ln_ENTRIES; i++) { 5431 if (i == pmap_l1_index(DMAP_MIN_ADDRESS)) 5432 sbuf_printf(sb, "\nDirect map:\n"); 5433 else if (i == pmap_l1_index(VM_MIN_KERNEL_ADDRESS)) 5434 sbuf_printf(sb, "\nKernel map:\n"); 5435 5436 l1 = pmap_l1(kernel_pmap, sva); 5437 l1e = pmap_load(l1); 5438 if ((l1e & PTE_V) == 0) { 5439 sysctl_kmaps_dump(sb, &range, sva); 5440 sva += L1_SIZE; 5441 continue; 5442 } 5443 if ((l1e & PTE_RWX) != 0) { 5444 sysctl_kmaps_check(sb, &range, sva, l1e, 0, 0); 5445 range.l1pages++; 5446 sva += L1_SIZE; 5447 continue; 5448 } 5449 pa = PTE_TO_PHYS(l1e); 5450 l2 = (pd_entry_t *)PHYS_TO_DMAP(pa); 5451 5452 for (j = pmap_l2_index(sva); j < Ln_ENTRIES; j++) { 5453 l2e = l2[j]; 5454 if ((l2e & PTE_V) == 0) { 5455 sysctl_kmaps_dump(sb, &range, sva); 5456 sva += L2_SIZE; 5457 continue; 5458 } 5459 if ((l2e & PTE_RWX) != 0) { 5460 sysctl_kmaps_check(sb, &range, sva, l1e, l2e, 0); 5461 range.l2pages++; 5462 sva += L2_SIZE; 5463 continue; 5464 } 5465 pa = PTE_TO_PHYS(l2e); 5466 l3 = (pd_entry_t *)PHYS_TO_DMAP(pa); 5467 5468 for (k = pmap_l3_index(sva); k < Ln_ENTRIES; k++, 5469 sva += L3_SIZE) { 5470 l3e = l3[k]; 5471 if ((l3e & PTE_V) == 0) { 5472 sysctl_kmaps_dump(sb, &range, sva); 5473 continue; 5474 } 5475 sysctl_kmaps_check(sb, &range, sva, 5476 l1e, l2e, l3e); 5477 range.l3pages++; 5478 } 5479 } 5480 } 5481 5482 error = sbuf_finish(sb); 5483 sbuf_delete(sb); 5484 return (error); 5485 } 5486 SYSCTL_OID(_vm_pmap, OID_AUTO, kernel_maps, 5487 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE | CTLFLAG_SKIP, 5488 NULL, 0, sysctl_kmaps, "A", 5489 "Dump kernel address layout"); 5490
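/*
 * The layout reported above can be retrieved from userland by name,
 * e.g. "sysctl vm.pmap.kernel_maps"; CTLFLAG_SKIP only hides the
 * node from a plain "sysctl -a" listing.  Each output line gives the
 * virtual address range, its permissions (write/execute, user or
 * supervisor, global), the memory attribute, and the counts of 1GB,
 * 2MB, and 4KB mappings backing the range.
 */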