/*-
 * SPDX-License-Identifier: BSD-4-Clause
 *
 * Copyright (c) 1991 Regents of the University of California.
 *	All rights reserved.
 * Copyright (c) 1994 John S. Dyson
 *	All rights reserved.
 * Copyright (c) 1994 David Greenman
 *	All rights reserved.
 * Copyright (c) 2003 Peter Wemm
 *	All rights reserved.
 * Copyright (c) 2005-2010 Alan L. Cox <alc@cs.rice.edu>
 *	All rights reserved.
 * Copyright (c) 2014 Andrew Turner
 *	All rights reserved.
 * Copyright (c) 2014 The FreeBSD Foundation
 *	All rights reserved.
 * Copyright (c) 2015-2018 Ruslan Bukin <br@bsdpad.com>
 *	All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * the Systems Programming Group of the University of Utah Computer
 * Science Department and William Jolitz of UUNET Technologies Inc.
 *
 * Portions of this software were developed by Andrew Turner under
 * sponsorship from The FreeBSD Foundation.
 *
 * Portions of this software were developed by SRI International and the
 * University of Cambridge Computer Laboratory under DARPA/AFRL contract
 * FA8750-10-C-0237 ("CTSRD"), as part of the DARPA CRASH research programme.
 *
 * Portions of this software were developed by the University of Cambridge
 * Computer Laboratory as part of the CTSRD Project, with support from the
 * UK Higher Education Innovation Fund (HEIF).
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	from: @(#)pmap.c	7.7 (Berkeley)	5/12/91
 */
/*-
 * Copyright (c) 2003 Networks Associates Technology, Inc.
 * All rights reserved.
 *
 * This software was developed for the FreeBSD Project by Jake Burkholder,
 * Safeport Network Services, and Network Associates Laboratories, the
 * Security Research Division of Network Associates, Inc. under
 * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA
 * CHATS research program.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

/*
 * Manages physical address maps.
 *
 * Since the information managed by this module is
 * also stored by the logical address mapping module,
 * this module may throw away valid virtual-to-physical
 * mappings at almost any time.  However, invalidations
 * of virtual-to-physical mappings must be done as
 * requested.
 *
 * In order to cope with hardware architectures which
 * make virtual-to-physical map invalidates expensive,
 * this module may delay invalidation or reduced-protection
 * operations until such time as they are actually
 * necessary.  This module is given full information as
 * to which processors are currently using which maps,
 * and to when physical maps must be made correct.
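 *
 * For example, the batch routines below (pmap_kenter(), pmap_qenter())
 * update every PTE first and then issue one ranged invalidation, and
 * page table pages freed during removal are parked on a delayed-free
 * list so they are returned to the physical memory allocator only
 * after the TLB has been updated.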
118 */ 119 120 #include <sys/param.h> 121 #include <sys/systm.h> 122 #include <sys/bitstring.h> 123 #include <sys/bus.h> 124 #include <sys/cpuset.h> 125 #include <sys/kernel.h> 126 #include <sys/ktr.h> 127 #include <sys/lock.h> 128 #include <sys/malloc.h> 129 #include <sys/mman.h> 130 #include <sys/msgbuf.h> 131 #include <sys/mutex.h> 132 #include <sys/physmem.h> 133 #include <sys/proc.h> 134 #include <sys/rwlock.h> 135 #include <sys/sbuf.h> 136 #include <sys/sx.h> 137 #include <sys/vmem.h> 138 #include <sys/vmmeter.h> 139 #include <sys/sched.h> 140 #include <sys/sysctl.h> 141 #include <sys/smp.h> 142 143 #include <vm/vm.h> 144 #include <vm/vm_param.h> 145 #include <vm/vm_kern.h> 146 #include <vm/vm_page.h> 147 #include <vm/vm_map.h> 148 #include <vm/vm_object.h> 149 #include <vm/vm_extern.h> 150 #include <vm/vm_pageout.h> 151 #include <vm/vm_pager.h> 152 #include <vm/vm_phys.h> 153 #include <vm/vm_radix.h> 154 #include <vm/vm_reserv.h> 155 #include <vm/vm_dumpset.h> 156 #include <vm/uma.h> 157 158 #include <machine/machdep.h> 159 #include <machine/md_var.h> 160 #include <machine/pcb.h> 161 #include <machine/sbi.h> 162 163 #define NUL1E (Ln_ENTRIES * Ln_ENTRIES) 164 #define NUL2E (Ln_ENTRIES * NUL1E) 165 166 #if !defined(DIAGNOSTIC) 167 #ifdef __GNUC_GNU_INLINE__ 168 #define PMAP_INLINE __attribute__((__gnu_inline__)) inline 169 #else 170 #define PMAP_INLINE extern inline 171 #endif 172 #else 173 #define PMAP_INLINE 174 #endif 175 176 #ifdef PV_STATS 177 #define PV_STAT(x) do { x ; } while (0) 178 #else 179 #define PV_STAT(x) do { } while (0) 180 #endif 181 182 #define pmap_l2_pindex(v) ((v) >> L2_SHIFT) 183 #define pa_to_pvh(pa) (&pv_table[pa_index(pa)]) 184 185 #define NPV_LIST_LOCKS MAXCPU 186 187 #define PHYS_TO_PV_LIST_LOCK(pa) \ 188 (&pv_list_locks[pmap_l2_pindex(pa) % NPV_LIST_LOCKS]) 189 190 #define CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa) do { \ 191 struct rwlock **_lockp = (lockp); \ 192 struct rwlock *_new_lock; \ 193 \ 194 _new_lock = PHYS_TO_PV_LIST_LOCK(pa); \ 195 if (_new_lock != *_lockp) { \ 196 if (*_lockp != NULL) \ 197 rw_wunlock(*_lockp); \ 198 *_lockp = _new_lock; \ 199 rw_wlock(*_lockp); \ 200 } \ 201 } while (0) 202 203 #define CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m) \ 204 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, VM_PAGE_TO_PHYS(m)) 205 206 #define RELEASE_PV_LIST_LOCK(lockp) do { \ 207 struct rwlock **_lockp = (lockp); \ 208 \ 209 if (*_lockp != NULL) { \ 210 rw_wunlock(*_lockp); \ 211 *_lockp = NULL; \ 212 } \ 213 } while (0) 214 215 #define VM_PAGE_TO_PV_LIST_LOCK(m) \ 216 PHYS_TO_PV_LIST_LOCK(VM_PAGE_TO_PHYS(m)) 217 218 /* The list of all the user pmaps */ 219 LIST_HEAD(pmaplist, pmap); 220 static struct pmaplist allpmaps = LIST_HEAD_INITIALIZER(); 221 222 struct pmap kernel_pmap_store; 223 224 vm_offset_t virtual_avail; /* VA of first avail page (after kernel bss) */ 225 vm_offset_t virtual_end; /* VA of last avail page (end of kernel AS) */ 226 vm_offset_t kernel_vm_end = 0; 227 228 vm_paddr_t dmap_phys_base; /* The start of the dmap region */ 229 vm_paddr_t dmap_phys_max; /* The limit of the dmap region */ 230 vm_offset_t dmap_max_addr; /* The virtual address limit of the dmap */ 231 232 /* This code assumes all L1 DMAP entries will be used */ 233 CTASSERT((DMAP_MIN_ADDRESS & ~L1_OFFSET) == DMAP_MIN_ADDRESS); 234 CTASSERT((DMAP_MAX_ADDRESS & ~L1_OFFSET) == DMAP_MAX_ADDRESS); 235 236 static struct rwlock_padalign pvh_global_lock; 237 static struct mtx_padalign allpmaps_lock; 238 239 static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 240 "VM/pmap 
parameters"); 241 242 static int superpages_enabled = 1; 243 SYSCTL_INT(_vm_pmap, OID_AUTO, superpages_enabled, 244 CTLFLAG_RDTUN, &superpages_enabled, 0, 245 "Enable support for transparent superpages"); 246 247 static SYSCTL_NODE(_vm_pmap, OID_AUTO, l2, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 248 "2MB page mapping counters"); 249 250 static u_long pmap_l2_demotions; 251 SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, demotions, CTLFLAG_RD, 252 &pmap_l2_demotions, 0, 253 "2MB page demotions"); 254 255 static u_long pmap_l2_mappings; 256 SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, mappings, CTLFLAG_RD, 257 &pmap_l2_mappings, 0, 258 "2MB page mappings"); 259 260 static u_long pmap_l2_p_failures; 261 SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, p_failures, CTLFLAG_RD, 262 &pmap_l2_p_failures, 0, 263 "2MB page promotion failures"); 264 265 static u_long pmap_l2_promotions; 266 SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, promotions, CTLFLAG_RD, 267 &pmap_l2_promotions, 0, 268 "2MB page promotions"); 269 270 /* 271 * Data for the pv entry allocation mechanism 272 */ 273 static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks); 274 static struct mtx pv_chunks_mutex; 275 static struct rwlock pv_list_locks[NPV_LIST_LOCKS]; 276 static struct md_page *pv_table; 277 static struct md_page pv_dummy; 278 279 extern cpuset_t all_harts; 280 281 /* 282 * Internal flags for pmap_enter()'s helper functions. 283 */ 284 #define PMAP_ENTER_NORECLAIM 0x1000000 /* Don't reclaim PV entries. */ 285 #define PMAP_ENTER_NOREPLACE 0x2000000 /* Don't replace mappings. */ 286 287 static void free_pv_chunk(struct pv_chunk *pc); 288 static void free_pv_entry(pmap_t pmap, pv_entry_t pv); 289 static pv_entry_t get_pv_entry(pmap_t pmap, struct rwlock **lockp); 290 static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp); 291 static void pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va); 292 static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, 293 vm_offset_t va); 294 static bool pmap_demote_l2(pmap_t pmap, pd_entry_t *l2, vm_offset_t va); 295 static bool pmap_demote_l2_locked(pmap_t pmap, pd_entry_t *l2, 296 vm_offset_t va, struct rwlock **lockp); 297 static int pmap_enter_l2(pmap_t pmap, vm_offset_t va, pd_entry_t new_l2, 298 u_int flags, vm_page_t m, struct rwlock **lockp); 299 static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, 300 vm_page_t m, vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp); 301 static int pmap_remove_l3(pmap_t pmap, pt_entry_t *l3, vm_offset_t sva, 302 pd_entry_t ptepde, struct spglist *free, struct rwlock **lockp); 303 static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, 304 vm_page_t m, struct rwlock **lockp); 305 306 static vm_page_t _pmap_alloc_l3(pmap_t pmap, vm_pindex_t ptepindex, 307 struct rwlock **lockp); 308 309 static void _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, 310 struct spglist *free); 311 static int pmap_unuse_pt(pmap_t, vm_offset_t, pd_entry_t, struct spglist *); 312 313 static int pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode); 314 315 #define pmap_clear(pte) pmap_store(pte, 0) 316 #define pmap_clear_bits(pte, bits) atomic_clear_64(pte, bits) 317 #define pmap_load_store(pte, entry) atomic_swap_64(pte, entry) 318 #define pmap_load_clear(pte) pmap_load_store(pte, 0) 319 #define pmap_load(pte) atomic_load_64(pte) 320 #define pmap_store(pte, entry) atomic_store_64(pte, entry) 321 #define pmap_store_bits(pte, bits) atomic_set_64(pte, bits) 322 323 /********************/ 324 /* Inline 
functions */ 325 /********************/ 326 327 static __inline void 328 pagecopy(void *s, void *d) 329 { 330 331 memcpy(d, s, PAGE_SIZE); 332 } 333 334 static __inline void 335 pagezero(void *p) 336 { 337 338 bzero(p, PAGE_SIZE); 339 } 340 341 #define pmap_l1_index(va) (((va) >> L1_SHIFT) & Ln_ADDR_MASK) 342 #define pmap_l2_index(va) (((va) >> L2_SHIFT) & Ln_ADDR_MASK) 343 #define pmap_l3_index(va) (((va) >> L3_SHIFT) & Ln_ADDR_MASK) 344 345 #define PTE_TO_PHYS(pte) \ 346 ((((pte) & ~PTE_HI_MASK) >> PTE_PPN0_S) * PAGE_SIZE) 347 #define L2PTE_TO_PHYS(l2) \ 348 ((((l2) & ~PTE_HI_MASK) >> PTE_PPN1_S) << L2_SHIFT) 349 350 static __inline pd_entry_t * 351 pmap_l1(pmap_t pmap, vm_offset_t va) 352 { 353 354 KASSERT(VIRT_IS_VALID(va), 355 ("%s: malformed virtual address %#lx", __func__, va)); 356 return (&pmap->pm_l1[pmap_l1_index(va)]); 357 } 358 359 static __inline pd_entry_t * 360 pmap_l1_to_l2(pd_entry_t *l1, vm_offset_t va) 361 { 362 vm_paddr_t phys; 363 pd_entry_t *l2; 364 365 phys = PTE_TO_PHYS(pmap_load(l1)); 366 l2 = (pd_entry_t *)PHYS_TO_DMAP(phys); 367 368 return (&l2[pmap_l2_index(va)]); 369 } 370 371 static __inline pd_entry_t * 372 pmap_l2(pmap_t pmap, vm_offset_t va) 373 { 374 pd_entry_t *l1; 375 376 l1 = pmap_l1(pmap, va); 377 if ((pmap_load(l1) & PTE_V) == 0) 378 return (NULL); 379 if ((pmap_load(l1) & PTE_RX) != 0) 380 return (NULL); 381 382 return (pmap_l1_to_l2(l1, va)); 383 } 384 385 static __inline pt_entry_t * 386 pmap_l2_to_l3(pd_entry_t *l2, vm_offset_t va) 387 { 388 vm_paddr_t phys; 389 pt_entry_t *l3; 390 391 phys = PTE_TO_PHYS(pmap_load(l2)); 392 l3 = (pd_entry_t *)PHYS_TO_DMAP(phys); 393 394 return (&l3[pmap_l3_index(va)]); 395 } 396 397 static __inline pt_entry_t * 398 pmap_l3(pmap_t pmap, vm_offset_t va) 399 { 400 pd_entry_t *l2; 401 402 l2 = pmap_l2(pmap, va); 403 if (l2 == NULL) 404 return (NULL); 405 if ((pmap_load(l2) & PTE_V) == 0) 406 return (NULL); 407 if ((pmap_load(l2) & PTE_RX) != 0) 408 return (NULL); 409 410 return (pmap_l2_to_l3(l2, va)); 411 } 412 413 static __inline void 414 pmap_resident_count_inc(pmap_t pmap, int count) 415 { 416 417 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 418 pmap->pm_stats.resident_count += count; 419 } 420 421 static __inline void 422 pmap_resident_count_dec(pmap_t pmap, int count) 423 { 424 425 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 426 KASSERT(pmap->pm_stats.resident_count >= count, 427 ("pmap %p resident count underflow %ld %d", pmap, 428 pmap->pm_stats.resident_count, count)); 429 pmap->pm_stats.resident_count -= count; 430 } 431 432 static void 433 pmap_distribute_l1(struct pmap *pmap, vm_pindex_t l1index, 434 pt_entry_t entry) 435 { 436 struct pmap *user_pmap; 437 pd_entry_t *l1; 438 439 /* Distribute new kernel L1 entry to all the user pmaps */ 440 if (pmap != kernel_pmap) 441 return; 442 443 mtx_lock(&allpmaps_lock); 444 LIST_FOREACH(user_pmap, &allpmaps, pm_list) { 445 l1 = &user_pmap->pm_l1[l1index]; 446 pmap_store(l1, entry); 447 } 448 mtx_unlock(&allpmaps_lock); 449 } 450 451 static pt_entry_t * 452 pmap_early_page_idx(vm_offset_t l1pt, vm_offset_t va, u_int *l1_slot, 453 u_int *l2_slot) 454 { 455 pt_entry_t *l2; 456 pd_entry_t *l1; 457 458 l1 = (pd_entry_t *)l1pt; 459 *l1_slot = (va >> L1_SHIFT) & Ln_ADDR_MASK; 460 461 /* Check locore has used a table L1 map */ 462 KASSERT((l1[*l1_slot] & PTE_RX) == 0, 463 ("Invalid bootstrap L1 table")); 464 465 /* Find the address of the L2 table */ 466 l2 = (pt_entry_t *)init_pt_va; 467 *l2_slot = pmap_l2_index(va); 468 469 return (l2); 470 } 471 472 static vm_paddr_t 473 
pmap_early_vtophys(vm_offset_t l1pt, vm_offset_t va) 474 { 475 u_int l1_slot, l2_slot; 476 pt_entry_t *l2; 477 vm_paddr_t ret; 478 479 l2 = pmap_early_page_idx(l1pt, va, &l1_slot, &l2_slot); 480 481 /* Check locore has used L2 superpages */ 482 KASSERT((l2[l2_slot] & PTE_RX) != 0, 483 ("Invalid bootstrap L2 table")); 484 485 /* L2 is superpages */ 486 ret = L2PTE_TO_PHYS(l2[l2_slot]); 487 ret += (va & L2_OFFSET); 488 489 return (ret); 490 } 491 492 static void 493 pmap_bootstrap_dmap(vm_offset_t kern_l1, vm_paddr_t min_pa, vm_paddr_t max_pa) 494 { 495 vm_offset_t va; 496 vm_paddr_t pa; 497 pd_entry_t *l1; 498 u_int l1_slot; 499 pt_entry_t entry; 500 pn_t pn; 501 502 pa = dmap_phys_base = min_pa & ~L1_OFFSET; 503 va = DMAP_MIN_ADDRESS; 504 l1 = (pd_entry_t *)kern_l1; 505 l1_slot = pmap_l1_index(DMAP_MIN_ADDRESS); 506 507 for (; va < DMAP_MAX_ADDRESS && pa < max_pa; 508 pa += L1_SIZE, va += L1_SIZE, l1_slot++) { 509 KASSERT(l1_slot < Ln_ENTRIES, ("Invalid L1 index")); 510 511 /* superpages */ 512 pn = (pa / PAGE_SIZE); 513 entry = PTE_KERN; 514 entry |= (pn << PTE_PPN0_S); 515 pmap_store(&l1[l1_slot], entry); 516 } 517 518 /* Set the upper limit of the DMAP region */ 519 dmap_phys_max = pa; 520 dmap_max_addr = va; 521 522 sfence_vma(); 523 } 524 525 static vm_offset_t 526 pmap_bootstrap_l3(vm_offset_t l1pt, vm_offset_t va, vm_offset_t l3_start) 527 { 528 vm_offset_t l3pt; 529 pt_entry_t entry; 530 pd_entry_t *l2; 531 vm_paddr_t pa; 532 u_int l2_slot; 533 pn_t pn; 534 535 KASSERT((va & L2_OFFSET) == 0, ("Invalid virtual address")); 536 537 l2 = pmap_l2(kernel_pmap, va); 538 l2 = (pd_entry_t *)((uintptr_t)l2 & ~(PAGE_SIZE - 1)); 539 l2_slot = pmap_l2_index(va); 540 l3pt = l3_start; 541 542 for (; va < VM_MAX_KERNEL_ADDRESS; l2_slot++, va += L2_SIZE) { 543 KASSERT(l2_slot < Ln_ENTRIES, ("Invalid L2 index")); 544 545 pa = pmap_early_vtophys(l1pt, l3pt); 546 pn = (pa / PAGE_SIZE); 547 entry = (PTE_V); 548 entry |= (pn << PTE_PPN0_S); 549 pmap_store(&l2[l2_slot], entry); 550 l3pt += PAGE_SIZE; 551 } 552 553 /* Clean the L2 page table */ 554 memset((void *)l3_start, 0, l3pt - l3_start); 555 556 return (l3pt); 557 } 558 559 /* 560 * Bootstrap the system enough to run with virtual memory. 561 */ 562 void 563 pmap_bootstrap(vm_offset_t l1pt, vm_paddr_t kernstart, vm_size_t kernlen) 564 { 565 u_int l1_slot, l2_slot; 566 vm_offset_t freemempos; 567 vm_offset_t dpcpu, msgbufpv; 568 vm_paddr_t max_pa, min_pa, pa; 569 pt_entry_t *l2p; 570 int i; 571 572 printf("pmap_bootstrap %lx %lx %lx\n", l1pt, kernstart, kernlen); 573 574 /* Set this early so we can use the pagetable walking functions */ 575 kernel_pmap_store.pm_l1 = (pd_entry_t *)l1pt; 576 PMAP_LOCK_INIT(kernel_pmap); 577 578 rw_init(&pvh_global_lock, "pmap pv global"); 579 580 /* 581 * Set the current CPU as active in the kernel pmap. Secondary cores 582 * will add themselves later in init_secondary(). The SBI firmware 583 * may rely on this mask being precise, so CPU_FILL() is not used. 584 */ 585 CPU_SET(PCPU_GET(hart), &kernel_pmap->pm_active); 586 587 /* Assume the address we were loaded to is a valid physical address. */ 588 min_pa = max_pa = kernstart; 589 590 physmap_idx = physmem_avail(physmap, nitems(physmap)); 591 physmap_idx /= 2; 592 593 /* 594 * Find the minimum physical address. physmap is sorted, 595 * but may contain empty ranges. 
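 * (physmap[] is laid out as { start, end } pairs, which is why
 * physmap_idx is halved above and the loop below steps by two and
 * skips pairs whose start equals their end.  As a hypothetical
 * example, a single-range map { 0x80000000, 0xc0000000 } yields
 * min_pa = 0x80000000 and max_pa = 0xc0000000, assuming the kernel
 * was loaded inside that range.)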
596 */ 597 for (i = 0; i < physmap_idx * 2; i += 2) { 598 if (physmap[i] == physmap[i + 1]) 599 continue; 600 if (physmap[i] <= min_pa) 601 min_pa = physmap[i]; 602 if (physmap[i + 1] > max_pa) 603 max_pa = physmap[i + 1]; 604 } 605 printf("physmap_idx %u\n", physmap_idx); 606 printf("min_pa %lx\n", min_pa); 607 printf("max_pa %lx\n", max_pa); 608 609 /* Create a direct map region early so we can use it for pa -> va */ 610 pmap_bootstrap_dmap(l1pt, min_pa, max_pa); 611 612 /* 613 * Read the page table to find out what is already mapped. 614 * This assumes we have mapped a block of memory from KERNBASE 615 * using a single L1 entry. 616 */ 617 (void)pmap_early_page_idx(l1pt, KERNBASE, &l1_slot, &l2_slot); 618 619 /* Sanity check the index, KERNBASE should be the first VA */ 620 KASSERT(l2_slot == 0, ("The L2 index is non-zero")); 621 622 freemempos = roundup2(KERNBASE + kernlen, PAGE_SIZE); 623 624 /* Create the l3 tables for the early devmap */ 625 freemempos = pmap_bootstrap_l3(l1pt, 626 VM_MAX_KERNEL_ADDRESS - L2_SIZE, freemempos); 627 628 /* 629 * Invalidate the mapping we created for the DTB. At this point a copy 630 * has been created, and we no longer need it. We want to avoid the 631 * possibility of an aliased mapping in the future. 632 */ 633 l2p = pmap_l2(kernel_pmap, VM_EARLY_DTB_ADDRESS); 634 if ((pmap_load(l2p) & PTE_V) != 0) 635 pmap_clear(l2p); 636 637 sfence_vma(); 638 639 #define alloc_pages(var, np) \ 640 (var) = freemempos; \ 641 freemempos += (np * PAGE_SIZE); \ 642 memset((char *)(var), 0, ((np) * PAGE_SIZE)); 643 644 /* Allocate dynamic per-cpu area. */ 645 alloc_pages(dpcpu, DPCPU_SIZE / PAGE_SIZE); 646 dpcpu_init((void *)dpcpu, 0); 647 648 /* Allocate memory for the msgbuf, e.g. for /sbin/dmesg */ 649 alloc_pages(msgbufpv, round_page(msgbufsize) / PAGE_SIZE); 650 msgbufp = (void *)msgbufpv; 651 652 virtual_avail = roundup2(freemempos, L2_SIZE); 653 virtual_end = VM_MAX_KERNEL_ADDRESS - L2_SIZE; 654 kernel_vm_end = virtual_avail; 655 656 pa = pmap_early_vtophys(l1pt, freemempos); 657 658 physmem_exclude_region(kernstart, pa - kernstart, EXFLAG_NOALLOC); 659 } 660 661 /* 662 * Initialize a vm_page's machine-dependent fields. 663 */ 664 void 665 pmap_page_init(vm_page_t m) 666 { 667 668 TAILQ_INIT(&m->md.pv_list); 669 m->md.pv_memattr = VM_MEMATTR_WRITE_BACK; 670 } 671 672 /* 673 * Initialize the pmap module. 674 * Called by vm_init, to initialize any structures that the pmap 675 * system needs to map virtual memory. 676 */ 677 void 678 pmap_init(void) 679 { 680 vm_size_t s; 681 int i, pv_npg; 682 683 /* 684 * Initialize the pv chunk and pmap list mutexes. 685 */ 686 mtx_init(&pv_chunks_mutex, "pmap pv chunk list", NULL, MTX_DEF); 687 mtx_init(&allpmaps_lock, "allpmaps", NULL, MTX_DEF); 688 689 /* 690 * Initialize the pool of pv list locks. 691 */ 692 for (i = 0; i < NPV_LIST_LOCKS; i++) 693 rw_init(&pv_list_locks[i], "pmap pv list"); 694 695 /* 696 * Calculate the size of the pv head table for superpages. 697 */ 698 pv_npg = howmany(vm_phys_segs[vm_phys_nsegs - 1].end, L2_SIZE); 699 700 /* 701 * Allocate memory for the pv head table for superpages. 702 */ 703 s = (vm_size_t)(pv_npg * sizeof(struct md_page)); 704 s = round_page(s); 705 pv_table = (struct md_page *)kmem_malloc(s, M_WAITOK | M_ZERO); 706 for (i = 0; i < pv_npg; i++) 707 TAILQ_INIT(&pv_table[i].pv_list); 708 TAILQ_INIT(&pv_dummy.pv_list); 709 710 if (superpages_enabled) 711 pagesizes[1] = L2_SIZE; 712 } 713 714 #ifdef SMP 715 /* 716 * For SMP, these functions have to use IPIs for coherence. 
 *
 * In general, the calling thread uses a plain fence to order the
 * writes to the page tables before invoking an SBI callback to invoke
 * sfence_vma() on remote CPUs.
 */
static void
pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
{
	cpuset_t mask;

	sched_pin();
	mask = pmap->pm_active;
	CPU_CLR(PCPU_GET(hart), &mask);
	fence();
	if (!CPU_EMPTY(&mask) && smp_started)
		sbi_remote_sfence_vma(mask.__bits, va, 1);
	sfence_vma_page(va);
	sched_unpin();
}

static void
pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
{
	cpuset_t mask;

	sched_pin();
	mask = pmap->pm_active;
	CPU_CLR(PCPU_GET(hart), &mask);
	fence();
	if (!CPU_EMPTY(&mask) && smp_started)
		sbi_remote_sfence_vma(mask.__bits, sva, eva - sva + 1);

	/*
	 * Might consider a loop of sfence_vma_page() for a small
	 * number of pages in the future.
	 */
	sfence_vma();
	sched_unpin();
}

static void
pmap_invalidate_all(pmap_t pmap)
{
	cpuset_t mask;

	sched_pin();
	mask = pmap->pm_active;
	CPU_CLR(PCPU_GET(hart), &mask);

	/*
	 * XXX: The SBI doc doesn't detail how to specify x0 as the
	 * address to perform a global fence.  BBL currently treats
	 * all sfence_vma requests as global however.
	 */
	fence();
	if (!CPU_EMPTY(&mask) && smp_started)
		sbi_remote_sfence_vma(mask.__bits, 0, 0);
	sfence_vma();
	sched_unpin();
}
#else
/*
 * Normal, non-SMP, invalidation functions.
 * We inline these within pmap.c for speed.
 */
static __inline void
pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
{

	sfence_vma_page(va);
}

static __inline void
pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
{

	/*
	 * Might consider a loop of sfence_vma_page() for a small
	 * number of pages in the future.
	 */
	sfence_vma();
}

static __inline void
pmap_invalidate_all(pmap_t pmap)
{

	sfence_vma();
}
#endif

/*
 * Routine:	pmap_extract
 * Function:
 *	Extract the physical page address associated
 *	with the given map/virtual_address pair.
 */
vm_paddr_t
pmap_extract(pmap_t pmap, vm_offset_t va)
{
	pd_entry_t *l2p, l2;
	pt_entry_t *l3p, l3;
	vm_paddr_t pa;

	pa = 0;
	PMAP_LOCK(pmap);
	/*
	 * Start with the l2 table.  We are unable to allocate
	 * pages in the l1 table.
	 */
	l2p = pmap_l2(pmap, va);
	if (l2p != NULL) {
		l2 = pmap_load(l2p);
		if ((l2 & PTE_RX) == 0) {
			l3p = pmap_l2_to_l3(l2p, va);
			if (l3p != NULL) {
				l3 = pmap_load(l3p);
				pa = PTE_TO_PHYS(l3);
				pa |= (va & L3_OFFSET);
			}
		} else {
			/* L2 is a superpage mapping. */
			pa = L2PTE_TO_PHYS(l2);
			pa |= (va & L2_OFFSET);
		}
	}
	PMAP_UNLOCK(pmap);
	return (pa);
}

/*
 * Routine:	pmap_extract_and_hold
 * Function:
 *	Atomically extract and hold the physical page
 *	with the given pmap and virtual address pair
 *	if that mapping permits the given protection.
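 *
 *	A sketch of the expected caller pattern (illustrative only): the
 *	page comes back wired, so the caller drops the wiring when done,
 *	e.g.
 *
 *		m = pmap_extract_and_hold(pmap, va, VM_PROT_READ);
 *		if (m != NULL) {
 *			... access the page ...
 *			vm_page_unwire(m, PQ_ACTIVE);
 *		}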
 */
vm_page_t
pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot)
{
	pt_entry_t *l3p, l3;
	vm_paddr_t phys;
	vm_page_t m;

	m = NULL;
	PMAP_LOCK(pmap);
	l3p = pmap_l3(pmap, va);
	if (l3p != NULL && (l3 = pmap_load(l3p)) != 0) {
		if ((l3 & PTE_W) != 0 || (prot & VM_PROT_WRITE) == 0) {
			phys = PTE_TO_PHYS(l3);
			m = PHYS_TO_VM_PAGE(phys);
			if (!vm_page_wire_mapped(m))
				m = NULL;
		}
	}
	PMAP_UNLOCK(pmap);
	return (m);
}

vm_paddr_t
pmap_kextract(vm_offset_t va)
{
	pd_entry_t *l2, l2e;
	pt_entry_t *l3;
	vm_paddr_t pa;

	if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) {
		pa = DMAP_TO_PHYS(va);
	} else {
		l2 = pmap_l2(kernel_pmap, va);
		if (l2 == NULL)
			panic("pmap_kextract: No l2");
		l2e = pmap_load(l2);
		/*
		 * Beware of concurrent promotion and demotion! We must
		 * use l2e rather than loading from l2 multiple times to
		 * ensure we see a consistent state, including the
		 * implicit load in pmap_l2_to_l3.  It is, however, safe
		 * to use an old l2e because the L3 page is preserved by
		 * promotion.
		 */
		if ((l2e & PTE_RX) != 0) {
			/* superpages */
			pa = L2PTE_TO_PHYS(l2e);
			pa |= (va & L2_OFFSET);
			return (pa);
		}

		l3 = pmap_l2_to_l3(&l2e, va);
		if (l3 == NULL)
			panic("pmap_kextract: No l3...");
		pa = PTE_TO_PHYS(pmap_load(l3));
		pa |= (va & PAGE_MASK);
	}
	return (pa);
}

/***************************************************
 * Low level mapping routines.....
 ***************************************************/

void
pmap_kenter(vm_offset_t sva, vm_size_t size, vm_paddr_t pa, int mode __unused)
{
	pt_entry_t entry;
	pt_entry_t *l3;
	vm_offset_t va;
	pn_t pn;

	KASSERT((pa & L3_OFFSET) == 0,
	    ("pmap_kenter_device: Invalid physical address"));
	KASSERT((sva & L3_OFFSET) == 0,
	    ("pmap_kenter_device: Invalid virtual address"));
	KASSERT((size & PAGE_MASK) == 0,
	    ("pmap_kenter_device: Mapping is not page-sized"));

	va = sva;
	while (size != 0) {
		l3 = pmap_l3(kernel_pmap, va);
		KASSERT(l3 != NULL, ("Invalid page table, va: 0x%lx", va));

		pn = (pa / PAGE_SIZE);
		entry = PTE_KERN;
		entry |= (pn << PTE_PPN0_S);
		pmap_store(l3, entry);

		va += PAGE_SIZE;
		pa += PAGE_SIZE;
		size -= PAGE_SIZE;
	}
	pmap_invalidate_range(kernel_pmap, sva, va);
}

void
pmap_kenter_device(vm_offset_t sva, vm_size_t size, vm_paddr_t pa)
{
	pmap_kenter(sva, size, pa, VM_MEMATTR_DEVICE);
}

/*
 * Remove a page from the kernel pagetables.
 * Note: not SMP coherent.
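 * (Only a local sfence.vma is issued below; pmap_kremove_device()
 * instead performs an SMP-coherent ranged invalidation via
 * pmap_invalidate_range().)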
959 */ 960 PMAP_INLINE void 961 pmap_kremove(vm_offset_t va) 962 { 963 pt_entry_t *l3; 964 965 l3 = pmap_l3(kernel_pmap, va); 966 KASSERT(l3 != NULL, ("pmap_kremove: Invalid address")); 967 968 pmap_clear(l3); 969 sfence_vma(); 970 } 971 972 void 973 pmap_kremove_device(vm_offset_t sva, vm_size_t size) 974 { 975 pt_entry_t *l3; 976 vm_offset_t va; 977 978 KASSERT((sva & L3_OFFSET) == 0, 979 ("pmap_kremove_device: Invalid virtual address")); 980 KASSERT((size & PAGE_MASK) == 0, 981 ("pmap_kremove_device: Mapping is not page-sized")); 982 983 va = sva; 984 while (size != 0) { 985 l3 = pmap_l3(kernel_pmap, va); 986 KASSERT(l3 != NULL, ("Invalid page table, va: 0x%lx", va)); 987 pmap_clear(l3); 988 989 va += PAGE_SIZE; 990 size -= PAGE_SIZE; 991 } 992 993 pmap_invalidate_range(kernel_pmap, sva, va); 994 } 995 996 /* 997 * Used to map a range of physical addresses into kernel 998 * virtual address space. 999 * 1000 * The value passed in '*virt' is a suggested virtual address for 1001 * the mapping. Architectures which can support a direct-mapped 1002 * physical to virtual region can return the appropriate address 1003 * within that region, leaving '*virt' unchanged. Other 1004 * architectures should map the pages starting at '*virt' and 1005 * update '*virt' with the first usable address after the mapped 1006 * region. 1007 */ 1008 vm_offset_t 1009 pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot) 1010 { 1011 1012 return PHYS_TO_DMAP(start); 1013 } 1014 1015 /* 1016 * Add a list of wired pages to the kva 1017 * this routine is only used for temporary 1018 * kernel mappings that do not need to have 1019 * page modification or references recorded. 1020 * Note that old mappings are simply written 1021 * over. The page *must* be wired. 1022 * Note: SMP coherent. Uses a ranged shootdown IPI. 1023 */ 1024 void 1025 pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count) 1026 { 1027 pt_entry_t *l3, pa; 1028 vm_offset_t va; 1029 vm_page_t m; 1030 pt_entry_t entry; 1031 pn_t pn; 1032 int i; 1033 1034 va = sva; 1035 for (i = 0; i < count; i++) { 1036 m = ma[i]; 1037 pa = VM_PAGE_TO_PHYS(m); 1038 pn = (pa / PAGE_SIZE); 1039 l3 = pmap_l3(kernel_pmap, va); 1040 1041 entry = PTE_KERN; 1042 entry |= (pn << PTE_PPN0_S); 1043 pmap_store(l3, entry); 1044 1045 va += L3_SIZE; 1046 } 1047 pmap_invalidate_range(kernel_pmap, sva, va); 1048 } 1049 1050 /* 1051 * This routine tears out page mappings from the 1052 * kernel -- it is meant only for temporary mappings. 1053 * Note: SMP coherent. Uses a ranged shootdown IPI. 1054 */ 1055 void 1056 pmap_qremove(vm_offset_t sva, int count) 1057 { 1058 pt_entry_t *l3; 1059 vm_offset_t va; 1060 1061 KASSERT(sva >= VM_MIN_KERNEL_ADDRESS, ("usermode va %lx", sva)); 1062 1063 for (va = sva; count-- > 0; va += PAGE_SIZE) { 1064 l3 = pmap_l3(kernel_pmap, va); 1065 KASSERT(l3 != NULL, ("pmap_kremove: Invalid address")); 1066 pmap_clear(l3); 1067 } 1068 pmap_invalidate_range(kernel_pmap, sva, va); 1069 } 1070 1071 bool 1072 pmap_ps_enabled(pmap_t pmap __unused) 1073 { 1074 1075 return (superpages_enabled); 1076 } 1077 1078 /*************************************************** 1079 * Page table page management routines..... 1080 ***************************************************/ 1081 /* 1082 * Schedule the specified unused page table page to be freed. Specifically, 1083 * add the page to the specified list of pages that will be released to the 1084 * physical memory manager after the TLB has been updated. 
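 *
 * Illustrative usage (a sketch, not a fixed contract):
 *
 *	struct spglist free;
 *
 *	SLIST_INIT(&free);
 *	... unmap entries, adding any page table page whose last entry
 *	    was removed to "free" via this function ...
 *	... issue the TLB invalidation ...
 *	... only then return the pages on "free" to the VM system ...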
1085 */ 1086 static __inline void 1087 pmap_add_delayed_free_list(vm_page_t m, struct spglist *free, 1088 boolean_t set_PG_ZERO) 1089 { 1090 1091 if (set_PG_ZERO) 1092 m->flags |= PG_ZERO; 1093 else 1094 m->flags &= ~PG_ZERO; 1095 SLIST_INSERT_HEAD(free, m, plinks.s.ss); 1096 } 1097 1098 /* 1099 * Inserts the specified page table page into the specified pmap's collection 1100 * of idle page table pages. Each of a pmap's page table pages is responsible 1101 * for mapping a distinct range of virtual addresses. The pmap's collection is 1102 * ordered by this virtual address range. 1103 * 1104 * If "promoted" is false, then the page table page "ml3" must be zero filled. 1105 */ 1106 static __inline int 1107 pmap_insert_pt_page(pmap_t pmap, vm_page_t ml3, bool promoted) 1108 { 1109 1110 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 1111 ml3->valid = promoted ? VM_PAGE_BITS_ALL : 0; 1112 return (vm_radix_insert(&pmap->pm_root, ml3)); 1113 } 1114 1115 /* 1116 * Removes the page table page mapping the specified virtual address from the 1117 * specified pmap's collection of idle page table pages, and returns it. 1118 * Otherwise, returns NULL if there is no page table page corresponding to the 1119 * specified virtual address. 1120 */ 1121 static __inline vm_page_t 1122 pmap_remove_pt_page(pmap_t pmap, vm_offset_t va) 1123 { 1124 1125 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 1126 return (vm_radix_remove(&pmap->pm_root, pmap_l2_pindex(va))); 1127 } 1128 1129 /* 1130 * Decrements a page table page's reference count, which is used to record the 1131 * number of valid page table entries within the page. If the reference count 1132 * drops to zero, then the page table page is unmapped. Returns TRUE if the 1133 * page table page was unmapped and FALSE otherwise. 1134 */ 1135 static inline boolean_t 1136 pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free) 1137 { 1138 1139 --m->ref_count; 1140 if (m->ref_count == 0) { 1141 _pmap_unwire_ptp(pmap, va, m, free); 1142 return (TRUE); 1143 } else { 1144 return (FALSE); 1145 } 1146 } 1147 1148 static void 1149 _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free) 1150 { 1151 vm_paddr_t phys; 1152 1153 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 1154 if (m->pindex >= NUL2E) { 1155 pd_entry_t *l1; 1156 l1 = pmap_l1(pmap, va); 1157 pmap_clear(l1); 1158 pmap_distribute_l1(pmap, pmap_l1_index(va), 0); 1159 } else { 1160 pd_entry_t *l2; 1161 l2 = pmap_l2(pmap, va); 1162 pmap_clear(l2); 1163 } 1164 pmap_resident_count_dec(pmap, 1); 1165 if (m->pindex < NUL2E) { 1166 pd_entry_t *l1; 1167 vm_page_t pdpg; 1168 1169 l1 = pmap_l1(pmap, va); 1170 phys = PTE_TO_PHYS(pmap_load(l1)); 1171 pdpg = PHYS_TO_VM_PAGE(phys); 1172 pmap_unwire_ptp(pmap, va, pdpg, free); 1173 } 1174 pmap_invalidate_page(pmap, va); 1175 1176 vm_wire_sub(1); 1177 1178 /* 1179 * Put page on a list so that it is released after 1180 * *ALL* TLB shootdown is done 1181 */ 1182 pmap_add_delayed_free_list(m, free, TRUE); 1183 } 1184 1185 /* 1186 * After removing a page table entry, this routine is used to 1187 * conditionally free the page, and manage the reference count. 
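 * "ptepde" is the page directory entry that mapped the affected page
 * table page; it is used to locate that page so its reference count can
 * be dropped.  Kernel addresses are skipped (see the
 * VM_MAXUSER_ADDRESS check below) because kernel page table pages are
 * never freed.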
1188 */ 1189 static int 1190 pmap_unuse_pt(pmap_t pmap, vm_offset_t va, pd_entry_t ptepde, 1191 struct spglist *free) 1192 { 1193 vm_page_t mpte; 1194 1195 if (va >= VM_MAXUSER_ADDRESS) 1196 return (0); 1197 KASSERT(ptepde != 0, ("pmap_unuse_pt: ptepde != 0")); 1198 mpte = PHYS_TO_VM_PAGE(PTE_TO_PHYS(ptepde)); 1199 return (pmap_unwire_ptp(pmap, va, mpte, free)); 1200 } 1201 1202 void 1203 pmap_pinit0(pmap_t pmap) 1204 { 1205 1206 PMAP_LOCK_INIT(pmap); 1207 bzero(&pmap->pm_stats, sizeof(pmap->pm_stats)); 1208 pmap->pm_l1 = kernel_pmap->pm_l1; 1209 pmap->pm_satp = SATP_MODE_SV39 | (vtophys(pmap->pm_l1) >> PAGE_SHIFT); 1210 CPU_ZERO(&pmap->pm_active); 1211 pmap_activate_boot(pmap); 1212 } 1213 1214 int 1215 pmap_pinit(pmap_t pmap) 1216 { 1217 vm_paddr_t l1phys; 1218 vm_page_t l1pt; 1219 1220 /* 1221 * allocate the l1 page 1222 */ 1223 l1pt = vm_page_alloc_noobj(VM_ALLOC_WIRED | VM_ALLOC_ZERO | 1224 VM_ALLOC_WAITOK); 1225 1226 l1phys = VM_PAGE_TO_PHYS(l1pt); 1227 pmap->pm_l1 = (pd_entry_t *)PHYS_TO_DMAP(l1phys); 1228 pmap->pm_satp = SATP_MODE_SV39 | (l1phys >> PAGE_SHIFT); 1229 1230 bzero(&pmap->pm_stats, sizeof(pmap->pm_stats)); 1231 1232 CPU_ZERO(&pmap->pm_active); 1233 1234 /* Install kernel pagetables */ 1235 memcpy(pmap->pm_l1, kernel_pmap->pm_l1, PAGE_SIZE); 1236 1237 /* Add to the list of all user pmaps */ 1238 mtx_lock(&allpmaps_lock); 1239 LIST_INSERT_HEAD(&allpmaps, pmap, pm_list); 1240 mtx_unlock(&allpmaps_lock); 1241 1242 vm_radix_init(&pmap->pm_root); 1243 1244 return (1); 1245 } 1246 1247 /* 1248 * This routine is called if the desired page table page does not exist. 1249 * 1250 * If page table page allocation fails, this routine may sleep before 1251 * returning NULL. It sleeps only if a lock pointer was given. 1252 * 1253 * Note: If a page allocation fails at page table level two or three, 1254 * one or two pages may be held during the wait, only to be released 1255 * afterwards. This conservative approach is easily argued to avoid 1256 * race conditions. 1257 */ 1258 static vm_page_t 1259 _pmap_alloc_l3(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp) 1260 { 1261 vm_page_t m, /*pdppg, */pdpg; 1262 pt_entry_t entry; 1263 vm_paddr_t phys; 1264 pn_t pn; 1265 1266 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 1267 1268 /* 1269 * Allocate a page table page. 1270 */ 1271 m = vm_page_alloc_noobj(VM_ALLOC_WIRED | VM_ALLOC_ZERO); 1272 if (m == NULL) { 1273 if (lockp != NULL) { 1274 RELEASE_PV_LIST_LOCK(lockp); 1275 PMAP_UNLOCK(pmap); 1276 rw_runlock(&pvh_global_lock); 1277 vm_wait(NULL); 1278 rw_rlock(&pvh_global_lock); 1279 PMAP_LOCK(pmap); 1280 } 1281 1282 /* 1283 * Indicate the need to retry. While waiting, the page table 1284 * page may have been allocated. 1285 */ 1286 return (NULL); 1287 } 1288 m->pindex = ptepindex; 1289 1290 /* 1291 * Map the pagetable page into the process address space, if 1292 * it isn't already there. 
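 *
 * The pindex name space is split (a summary of the code below): indices
 * below NUL2E name L3 (leaf) page table pages, one per 2MB region
 * (pmap_l2_pindex(va)), while indices of NUL2E and above name L2 pages,
 * keyed as NUL2E plus the L1 slot.  For example, a VA in the third 2MB
 * region of user space has pmap_l2_pindex(va) == 2, so its L3 page has
 * pindex 2 and the L2 page above it has pindex NUL2E + 0 (because
 * 2 >> Ln_ENTRIES_SHIFT == 0).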
1293 */ 1294 1295 if (ptepindex >= NUL2E) { 1296 pd_entry_t *l1; 1297 vm_pindex_t l1index; 1298 1299 l1index = ptepindex - NUL2E; 1300 l1 = &pmap->pm_l1[l1index]; 1301 KASSERT((pmap_load(l1) & PTE_V) == 0, 1302 ("%s: L1 entry %#lx is valid", __func__, pmap_load(l1))); 1303 1304 pn = (VM_PAGE_TO_PHYS(m) / PAGE_SIZE); 1305 entry = (PTE_V); 1306 entry |= (pn << PTE_PPN0_S); 1307 pmap_store(l1, entry); 1308 pmap_distribute_l1(pmap, l1index, entry); 1309 } else { 1310 vm_pindex_t l1index; 1311 pd_entry_t *l1, *l2; 1312 1313 l1index = ptepindex >> (L1_SHIFT - L2_SHIFT); 1314 l1 = &pmap->pm_l1[l1index]; 1315 if (pmap_load(l1) == 0) { 1316 /* recurse for allocating page dir */ 1317 if (_pmap_alloc_l3(pmap, NUL2E + l1index, 1318 lockp) == NULL) { 1319 vm_page_unwire_noq(m); 1320 vm_page_free_zero(m); 1321 return (NULL); 1322 } 1323 } else { 1324 phys = PTE_TO_PHYS(pmap_load(l1)); 1325 pdpg = PHYS_TO_VM_PAGE(phys); 1326 pdpg->ref_count++; 1327 } 1328 1329 phys = PTE_TO_PHYS(pmap_load(l1)); 1330 l2 = (pd_entry_t *)PHYS_TO_DMAP(phys); 1331 l2 = &l2[ptepindex & Ln_ADDR_MASK]; 1332 KASSERT((pmap_load(l2) & PTE_V) == 0, 1333 ("%s: L2 entry %#lx is valid", __func__, pmap_load(l2))); 1334 1335 pn = (VM_PAGE_TO_PHYS(m) / PAGE_SIZE); 1336 entry = (PTE_V); 1337 entry |= (pn << PTE_PPN0_S); 1338 pmap_store(l2, entry); 1339 } 1340 1341 pmap_resident_count_inc(pmap, 1); 1342 1343 return (m); 1344 } 1345 1346 static vm_page_t 1347 pmap_alloc_l2(pmap_t pmap, vm_offset_t va, struct rwlock **lockp) 1348 { 1349 pd_entry_t *l1; 1350 vm_page_t l2pg; 1351 vm_pindex_t l2pindex; 1352 1353 retry: 1354 l1 = pmap_l1(pmap, va); 1355 if (l1 != NULL && (pmap_load(l1) & PTE_V) != 0) { 1356 KASSERT((pmap_load(l1) & PTE_RWX) == 0, 1357 ("%s: L1 entry %#lx for VA %#lx is a leaf", __func__, 1358 pmap_load(l1), va)); 1359 /* Add a reference to the L2 page. */ 1360 l2pg = PHYS_TO_VM_PAGE(PTE_TO_PHYS(pmap_load(l1))); 1361 l2pg->ref_count++; 1362 } else { 1363 /* Allocate a L2 page. */ 1364 l2pindex = pmap_l2_pindex(va) >> Ln_ENTRIES_SHIFT; 1365 l2pg = _pmap_alloc_l3(pmap, NUL2E + l2pindex, lockp); 1366 if (l2pg == NULL && lockp != NULL) 1367 goto retry; 1368 } 1369 return (l2pg); 1370 } 1371 1372 static vm_page_t 1373 pmap_alloc_l3(pmap_t pmap, vm_offset_t va, struct rwlock **lockp) 1374 { 1375 vm_pindex_t ptepindex; 1376 pd_entry_t *l2; 1377 vm_paddr_t phys; 1378 vm_page_t m; 1379 1380 /* 1381 * Calculate pagetable page index 1382 */ 1383 ptepindex = pmap_l2_pindex(va); 1384 retry: 1385 /* 1386 * Get the page directory entry 1387 */ 1388 l2 = pmap_l2(pmap, va); 1389 1390 /* 1391 * If the page table page is mapped, we just increment the 1392 * hold count, and activate it. 1393 */ 1394 if (l2 != NULL && pmap_load(l2) != 0) { 1395 phys = PTE_TO_PHYS(pmap_load(l2)); 1396 m = PHYS_TO_VM_PAGE(phys); 1397 m->ref_count++; 1398 } else { 1399 /* 1400 * Here if the pte page isn't mapped, or if it has been 1401 * deallocated. 1402 */ 1403 m = _pmap_alloc_l3(pmap, ptepindex, lockp); 1404 if (m == NULL && lockp != NULL) 1405 goto retry; 1406 } 1407 return (m); 1408 } 1409 1410 /*************************************************** 1411 * Pmap allocation/deallocation routines. 1412 ***************************************************/ 1413 1414 /* 1415 * Release any resources held by the given physical map. 1416 * Called when a pmap initialized by pmap_pinit is being released. 1417 * Should only be called if the map contains no valid mappings. 
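 * (At this point only the kernel L1 entries copied in by pmap_pinit()
 * should remain in the L1 page, so the function below simply removes
 * the pmap from the allpmaps list and frees that single page.)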
1418 */ 1419 void 1420 pmap_release(pmap_t pmap) 1421 { 1422 vm_page_t m; 1423 1424 KASSERT(pmap->pm_stats.resident_count == 0, 1425 ("pmap_release: pmap resident count %ld != 0", 1426 pmap->pm_stats.resident_count)); 1427 KASSERT(CPU_EMPTY(&pmap->pm_active), 1428 ("releasing active pmap %p", pmap)); 1429 1430 mtx_lock(&allpmaps_lock); 1431 LIST_REMOVE(pmap, pm_list); 1432 mtx_unlock(&allpmaps_lock); 1433 1434 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pmap->pm_l1)); 1435 vm_page_unwire_noq(m); 1436 vm_page_free(m); 1437 } 1438 1439 static int 1440 kvm_size(SYSCTL_HANDLER_ARGS) 1441 { 1442 unsigned long ksize = VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS; 1443 1444 return sysctl_handle_long(oidp, &ksize, 0, req); 1445 } 1446 SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG | CTLFLAG_RD | CTLFLAG_MPSAFE, 1447 0, 0, kvm_size, "LU", 1448 "Size of KVM"); 1449 1450 static int 1451 kvm_free(SYSCTL_HANDLER_ARGS) 1452 { 1453 unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end; 1454 1455 return sysctl_handle_long(oidp, &kfree, 0, req); 1456 } 1457 SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG | CTLFLAG_RD | CTLFLAG_MPSAFE, 1458 0, 0, kvm_free, "LU", 1459 "Amount of KVM free"); 1460 1461 /* 1462 * grow the number of kernel page table entries, if needed 1463 */ 1464 void 1465 pmap_growkernel(vm_offset_t addr) 1466 { 1467 vm_paddr_t paddr; 1468 vm_page_t nkpg; 1469 pd_entry_t *l1, *l2; 1470 pt_entry_t entry; 1471 pn_t pn; 1472 1473 mtx_assert(&kernel_map->system_mtx, MA_OWNED); 1474 1475 addr = roundup2(addr, L2_SIZE); 1476 if (addr - 1 >= vm_map_max(kernel_map)) 1477 addr = vm_map_max(kernel_map); 1478 while (kernel_vm_end < addr) { 1479 l1 = pmap_l1(kernel_pmap, kernel_vm_end); 1480 if (pmap_load(l1) == 0) { 1481 /* We need a new PDP entry */ 1482 nkpg = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT | 1483 VM_ALLOC_WIRED | VM_ALLOC_ZERO); 1484 if (nkpg == NULL) 1485 panic("pmap_growkernel: no memory to grow kernel"); 1486 nkpg->pindex = kernel_vm_end >> L1_SHIFT; 1487 paddr = VM_PAGE_TO_PHYS(nkpg); 1488 1489 pn = (paddr / PAGE_SIZE); 1490 entry = (PTE_V); 1491 entry |= (pn << PTE_PPN0_S); 1492 pmap_store(l1, entry); 1493 pmap_distribute_l1(kernel_pmap, 1494 pmap_l1_index(kernel_vm_end), entry); 1495 continue; /* try again */ 1496 } 1497 l2 = pmap_l1_to_l2(l1, kernel_vm_end); 1498 if ((pmap_load(l2) & PTE_V) != 0 && 1499 (pmap_load(l2) & PTE_RWX) == 0) { 1500 kernel_vm_end = (kernel_vm_end + L2_SIZE) & ~L2_OFFSET; 1501 if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) { 1502 kernel_vm_end = vm_map_max(kernel_map); 1503 break; 1504 } 1505 continue; 1506 } 1507 1508 nkpg = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT | VM_ALLOC_WIRED | 1509 VM_ALLOC_ZERO); 1510 if (nkpg == NULL) 1511 panic("pmap_growkernel: no memory to grow kernel"); 1512 nkpg->pindex = kernel_vm_end >> L2_SHIFT; 1513 paddr = VM_PAGE_TO_PHYS(nkpg); 1514 1515 pn = (paddr / PAGE_SIZE); 1516 entry = (PTE_V); 1517 entry |= (pn << PTE_PPN0_S); 1518 pmap_store(l2, entry); 1519 1520 pmap_invalidate_page(kernel_pmap, kernel_vm_end); 1521 1522 kernel_vm_end = (kernel_vm_end + L2_SIZE) & ~L2_OFFSET; 1523 if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) { 1524 kernel_vm_end = vm_map_max(kernel_map); 1525 break; 1526 } 1527 } 1528 } 1529 1530 /*************************************************** 1531 * page management routines. 
1532 ***************************************************/ 1533 1534 CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE); 1535 CTASSERT(_NPCM == 3); 1536 CTASSERT(_NPCPV == 168); 1537 1538 static __inline struct pv_chunk * 1539 pv_to_chunk(pv_entry_t pv) 1540 { 1541 1542 return ((struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK)); 1543 } 1544 1545 #define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap) 1546 1547 #define PC_FREE0 0xfffffffffffffffful 1548 #define PC_FREE1 0xfffffffffffffffful 1549 #define PC_FREE2 0x000000fffffffffful 1550 1551 static const uint64_t pc_freemask[_NPCM] = { PC_FREE0, PC_FREE1, PC_FREE2 }; 1552 1553 #if 0 1554 #ifdef PV_STATS 1555 static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail; 1556 1557 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0, 1558 "Current number of pv entry chunks"); 1559 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0, 1560 "Current number of pv entry chunks allocated"); 1561 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0, 1562 "Current number of pv entry chunks frees"); 1563 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0, 1564 "Number of times tried to get a chunk page but failed."); 1565 1566 static long pv_entry_frees, pv_entry_allocs, pv_entry_count; 1567 static int pv_entry_spare; 1568 1569 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0, 1570 "Current number of pv entry frees"); 1571 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0, 1572 "Current number of pv entry allocs"); 1573 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0, 1574 "Current number of pv entries"); 1575 SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0, 1576 "Current number of spare pv entries"); 1577 #endif 1578 #endif /* 0 */ 1579 1580 /* 1581 * We are in a serious low memory condition. Resort to 1582 * drastic measures to free some pages so we can allocate 1583 * another pv entry chunk. 1584 * 1585 * Returns NULL if PV entries were reclaimed from the specified pmap. 1586 * 1587 * We do not, however, unmap 2mpages because subsequent accesses will 1588 * allocate per-page pv entries until repromotion occurs, thereby 1589 * exacerbating the shortage of free pv entries. 1590 */ 1591 static vm_page_t 1592 reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp) 1593 { 1594 1595 panic("RISCVTODO: reclaim_pv_chunk"); 1596 } 1597 1598 /* 1599 * free the pv_entry back to the free list 1600 */ 1601 static void 1602 free_pv_entry(pmap_t pmap, pv_entry_t pv) 1603 { 1604 struct pv_chunk *pc; 1605 int idx, field, bit; 1606 1607 rw_assert(&pvh_global_lock, RA_LOCKED); 1608 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 1609 PV_STAT(atomic_add_long(&pv_entry_frees, 1)); 1610 PV_STAT(atomic_add_int(&pv_entry_spare, 1)); 1611 PV_STAT(atomic_subtract_long(&pv_entry_count, 1)); 1612 pc = pv_to_chunk(pv); 1613 idx = pv - &pc->pc_pventry[0]; 1614 field = idx / 64; 1615 bit = idx % 64; 1616 pc->pc_map[field] |= 1ul << bit; 1617 if (pc->pc_map[0] != PC_FREE0 || pc->pc_map[1] != PC_FREE1 || 1618 pc->pc_map[2] != PC_FREE2) { 1619 /* 98% of the time, pc is already at the head of the list. 
*/ 1620 if (__predict_false(pc != TAILQ_FIRST(&pmap->pm_pvchunk))) { 1621 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 1622 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); 1623 } 1624 return; 1625 } 1626 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 1627 free_pv_chunk(pc); 1628 } 1629 1630 static void 1631 free_pv_chunk(struct pv_chunk *pc) 1632 { 1633 vm_page_t m; 1634 1635 mtx_lock(&pv_chunks_mutex); 1636 TAILQ_REMOVE(&pv_chunks, pc, pc_lru); 1637 mtx_unlock(&pv_chunks_mutex); 1638 PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV)); 1639 PV_STAT(atomic_subtract_int(&pc_chunk_count, 1)); 1640 PV_STAT(atomic_add_int(&pc_chunk_frees, 1)); 1641 /* entire chunk is free, return it */ 1642 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc)); 1643 dump_drop_page(m->phys_addr); 1644 vm_page_unwire_noq(m); 1645 vm_page_free(m); 1646 } 1647 1648 /* 1649 * Returns a new PV entry, allocating a new PV chunk from the system when 1650 * needed. If this PV chunk allocation fails and a PV list lock pointer was 1651 * given, a PV chunk is reclaimed from an arbitrary pmap. Otherwise, NULL is 1652 * returned. 1653 * 1654 * The given PV list lock may be released. 1655 */ 1656 static pv_entry_t 1657 get_pv_entry(pmap_t pmap, struct rwlock **lockp) 1658 { 1659 int bit, field; 1660 pv_entry_t pv; 1661 struct pv_chunk *pc; 1662 vm_page_t m; 1663 1664 rw_assert(&pvh_global_lock, RA_LOCKED); 1665 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 1666 PV_STAT(atomic_add_long(&pv_entry_allocs, 1)); 1667 retry: 1668 pc = TAILQ_FIRST(&pmap->pm_pvchunk); 1669 if (pc != NULL) { 1670 for (field = 0; field < _NPCM; field++) { 1671 if (pc->pc_map[field]) { 1672 bit = ffsl(pc->pc_map[field]) - 1; 1673 break; 1674 } 1675 } 1676 if (field < _NPCM) { 1677 pv = &pc->pc_pventry[field * 64 + bit]; 1678 pc->pc_map[field] &= ~(1ul << bit); 1679 /* If this was the last item, move it to tail */ 1680 if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 && 1681 pc->pc_map[2] == 0) { 1682 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 1683 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, 1684 pc_list); 1685 } 1686 PV_STAT(atomic_add_long(&pv_entry_count, 1)); 1687 PV_STAT(atomic_subtract_int(&pv_entry_spare, 1)); 1688 return (pv); 1689 } 1690 } 1691 /* No free items, allocate another chunk */ 1692 m = vm_page_alloc_noobj(VM_ALLOC_WIRED); 1693 if (m == NULL) { 1694 if (lockp == NULL) { 1695 PV_STAT(pc_chunk_tryfail++); 1696 return (NULL); 1697 } 1698 m = reclaim_pv_chunk(pmap, lockp); 1699 if (m == NULL) 1700 goto retry; 1701 } 1702 PV_STAT(atomic_add_int(&pc_chunk_count, 1)); 1703 PV_STAT(atomic_add_int(&pc_chunk_allocs, 1)); 1704 dump_add_page(m->phys_addr); 1705 pc = (void *)PHYS_TO_DMAP(m->phys_addr); 1706 pc->pc_pmap = pmap; 1707 pc->pc_map[0] = PC_FREE0 & ~1ul; /* preallocated bit 0 */ 1708 pc->pc_map[1] = PC_FREE1; 1709 pc->pc_map[2] = PC_FREE2; 1710 mtx_lock(&pv_chunks_mutex); 1711 TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru); 1712 mtx_unlock(&pv_chunks_mutex); 1713 pv = &pc->pc_pventry[0]; 1714 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); 1715 PV_STAT(atomic_add_long(&pv_entry_count, 1)); 1716 PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV - 1)); 1717 return (pv); 1718 } 1719 1720 /* 1721 * Ensure that the number of spare PV entries in the specified pmap meets or 1722 * exceeds the given count, "needed". 1723 * 1724 * The given PV list lock may be released. 
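 *
 * A note on the accounting below: each pv_chunk is one page holding
 * _NPCPV (168) entries, tracked by the three-word bitmap pc_map[].  A
 * fully free chunk therefore has 64 + 64 + 40 bits set, which is where
 * the PC_FREE0/1/2 masks above come from (the last being
 * 0x000000fffffffffful), and bit_count() over pc_map[] yields the
 * number of free entries in a chunk.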
1725 */ 1726 static void 1727 reserve_pv_entries(pmap_t pmap, int needed, struct rwlock **lockp) 1728 { 1729 struct pch new_tail; 1730 struct pv_chunk *pc; 1731 vm_page_t m; 1732 int avail, free; 1733 bool reclaimed; 1734 1735 rw_assert(&pvh_global_lock, RA_LOCKED); 1736 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 1737 KASSERT(lockp != NULL, ("reserve_pv_entries: lockp is NULL")); 1738 1739 /* 1740 * Newly allocated PV chunks must be stored in a private list until 1741 * the required number of PV chunks have been allocated. Otherwise, 1742 * reclaim_pv_chunk() could recycle one of these chunks. In 1743 * contrast, these chunks must be added to the pmap upon allocation. 1744 */ 1745 TAILQ_INIT(&new_tail); 1746 retry: 1747 avail = 0; 1748 TAILQ_FOREACH(pc, &pmap->pm_pvchunk, pc_list) { 1749 bit_count((bitstr_t *)pc->pc_map, 0, 1750 sizeof(pc->pc_map) * NBBY, &free); 1751 if (free == 0) 1752 break; 1753 avail += free; 1754 if (avail >= needed) 1755 break; 1756 } 1757 for (reclaimed = false; avail < needed; avail += _NPCPV) { 1758 m = vm_page_alloc_noobj(VM_ALLOC_WIRED); 1759 if (m == NULL) { 1760 m = reclaim_pv_chunk(pmap, lockp); 1761 if (m == NULL) 1762 goto retry; 1763 reclaimed = true; 1764 } 1765 /* XXX PV STATS */ 1766 #if 0 1767 dump_add_page(m->phys_addr); 1768 #endif 1769 pc = (void *)PHYS_TO_DMAP(m->phys_addr); 1770 pc->pc_pmap = pmap; 1771 pc->pc_map[0] = PC_FREE0; 1772 pc->pc_map[1] = PC_FREE1; 1773 pc->pc_map[2] = PC_FREE2; 1774 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); 1775 TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru); 1776 1777 /* 1778 * The reclaim might have freed a chunk from the current pmap. 1779 * If that chunk contained available entries, we need to 1780 * re-count the number of available entries. 1781 */ 1782 if (reclaimed) 1783 goto retry; 1784 } 1785 if (!TAILQ_EMPTY(&new_tail)) { 1786 mtx_lock(&pv_chunks_mutex); 1787 TAILQ_CONCAT(&pv_chunks, &new_tail, pc_lru); 1788 mtx_unlock(&pv_chunks_mutex); 1789 } 1790 } 1791 1792 /* 1793 * First find and then remove the pv entry for the specified pmap and virtual 1794 * address from the specified pv list. Returns the pv entry if found and NULL 1795 * otherwise. This operation can be performed on pv lists for either 4KB or 1796 * 2MB page mappings. 1797 */ 1798 static __inline pv_entry_t 1799 pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va) 1800 { 1801 pv_entry_t pv; 1802 1803 rw_assert(&pvh_global_lock, RA_LOCKED); 1804 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 1805 if (pmap == PV_PMAP(pv) && va == pv->pv_va) { 1806 TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); 1807 pvh->pv_gen++; 1808 break; 1809 } 1810 } 1811 return (pv); 1812 } 1813 1814 /* 1815 * First find and then destroy the pv entry for the specified pmap and virtual 1816 * address. This operation can be performed on pv lists for either 4KB or 2MB 1817 * page mappings. 1818 */ 1819 static void 1820 pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va) 1821 { 1822 pv_entry_t pv; 1823 1824 pv = pmap_pvh_remove(pvh, pmap, va); 1825 1826 KASSERT(pv != NULL, ("pmap_pvh_free: pv not found for %#lx", va)); 1827 free_pv_entry(pmap, pv); 1828 } 1829 1830 /* 1831 * Conditionally create the PV entry for a 4KB page mapping if the required 1832 * memory can be allocated without resorting to reclamation. 
1833 */ 1834 static boolean_t 1835 pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m, 1836 struct rwlock **lockp) 1837 { 1838 pv_entry_t pv; 1839 1840 rw_assert(&pvh_global_lock, RA_LOCKED); 1841 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 1842 /* Pass NULL instead of the lock pointer to disable reclamation. */ 1843 if ((pv = get_pv_entry(pmap, NULL)) != NULL) { 1844 pv->pv_va = va; 1845 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); 1846 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 1847 m->md.pv_gen++; 1848 return (TRUE); 1849 } else 1850 return (FALSE); 1851 } 1852 1853 /* 1854 * After demotion from a 2MB page mapping to 512 4KB page mappings, 1855 * destroy the pv entry for the 2MB page mapping and reinstantiate the pv 1856 * entries for each of the 4KB page mappings. 1857 */ 1858 static void __unused 1859 pmap_pv_demote_l2(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, 1860 struct rwlock **lockp) 1861 { 1862 struct md_page *pvh; 1863 struct pv_chunk *pc; 1864 pv_entry_t pv; 1865 vm_page_t m; 1866 vm_offset_t va_last; 1867 int bit, field; 1868 1869 rw_assert(&pvh_global_lock, RA_LOCKED); 1870 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 1871 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); 1872 1873 /* 1874 * Transfer the 2mpage's pv entry for this mapping to the first 1875 * page's pv list. Once this transfer begins, the pv list lock 1876 * must not be released until the last pv entry is reinstantiated. 1877 */ 1878 pvh = pa_to_pvh(pa); 1879 va &= ~L2_OFFSET; 1880 pv = pmap_pvh_remove(pvh, pmap, va); 1881 KASSERT(pv != NULL, ("pmap_pv_demote_l2: pv not found")); 1882 m = PHYS_TO_VM_PAGE(pa); 1883 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 1884 m->md.pv_gen++; 1885 /* Instantiate the remaining 511 pv entries. */ 1886 va_last = va + L2_SIZE - PAGE_SIZE; 1887 for (;;) { 1888 pc = TAILQ_FIRST(&pmap->pm_pvchunk); 1889 KASSERT(pc->pc_map[0] != 0 || pc->pc_map[1] != 0 || 1890 pc->pc_map[2] != 0, ("pmap_pv_demote_l2: missing spare")); 1891 for (field = 0; field < _NPCM; field++) { 1892 while (pc->pc_map[field] != 0) { 1893 bit = ffsl(pc->pc_map[field]) - 1; 1894 pc->pc_map[field] &= ~(1ul << bit); 1895 pv = &pc->pc_pventry[field * 64 + bit]; 1896 va += PAGE_SIZE; 1897 pv->pv_va = va; 1898 m++; 1899 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 1900 ("pmap_pv_demote_l2: page %p is not managed", m)); 1901 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 1902 m->md.pv_gen++; 1903 if (va == va_last) 1904 goto out; 1905 } 1906 } 1907 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 1908 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); 1909 } 1910 out: 1911 if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 && pc->pc_map[2] == 0) { 1912 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 1913 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); 1914 } 1915 /* XXX PV stats */ 1916 } 1917 1918 #if VM_NRESERVLEVEL > 0 1919 static void 1920 pmap_pv_promote_l2(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, 1921 struct rwlock **lockp) 1922 { 1923 struct md_page *pvh; 1924 pv_entry_t pv; 1925 vm_page_t m; 1926 vm_offset_t va_last; 1927 1928 rw_assert(&pvh_global_lock, RA_LOCKED); 1929 KASSERT((va & L2_OFFSET) == 0, 1930 ("pmap_pv_promote_l2: misaligned va %#lx", va)); 1931 1932 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); 1933 1934 m = PHYS_TO_VM_PAGE(pa); 1935 pv = pmap_pvh_remove(&m->md, pmap, va); 1936 KASSERT(pv != NULL, ("pmap_pv_promote_l2: pv for %#lx not found", va)); 1937 pvh = pa_to_pvh(pa); 1938 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); 1939 pvh->pv_gen++; 1940 1941 va_last = va + L2_SIZE - PAGE_SIZE; 1942 do { 1943 m++; 1944 va += 
PAGE_SIZE; 1945 pmap_pvh_free(&m->md, pmap, va); 1946 } while (va < va_last); 1947 } 1948 #endif /* VM_NRESERVLEVEL > 0 */ 1949 1950 /* 1951 * Create the PV entry for a 2MB page mapping. Always returns true unless the 1952 * flag PMAP_ENTER_NORECLAIM is specified. If that flag is specified, returns 1953 * false if the PV entry cannot be allocated without resorting to reclamation. 1954 */ 1955 static bool 1956 pmap_pv_insert_l2(pmap_t pmap, vm_offset_t va, pd_entry_t l2e, u_int flags, 1957 struct rwlock **lockp) 1958 { 1959 struct md_page *pvh; 1960 pv_entry_t pv; 1961 vm_paddr_t pa; 1962 1963 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 1964 /* Pass NULL instead of the lock pointer to disable reclamation. */ 1965 if ((pv = get_pv_entry(pmap, (flags & PMAP_ENTER_NORECLAIM) != 0 ? 1966 NULL : lockp)) == NULL) 1967 return (false); 1968 pv->pv_va = va; 1969 pa = PTE_TO_PHYS(l2e); 1970 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); 1971 pvh = pa_to_pvh(pa); 1972 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); 1973 pvh->pv_gen++; 1974 return (true); 1975 } 1976 1977 static void 1978 pmap_remove_kernel_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va) 1979 { 1980 pt_entry_t newl2, oldl2; 1981 vm_page_t ml3; 1982 vm_paddr_t ml3pa; 1983 1984 KASSERT(!VIRT_IN_DMAP(va), ("removing direct mapping of %#lx", va)); 1985 KASSERT(pmap == kernel_pmap, ("pmap %p is not kernel_pmap", pmap)); 1986 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 1987 1988 ml3 = pmap_remove_pt_page(pmap, va); 1989 if (ml3 == NULL) 1990 panic("pmap_remove_kernel_l2: Missing pt page"); 1991 1992 ml3pa = VM_PAGE_TO_PHYS(ml3); 1993 newl2 = ml3pa | PTE_V; 1994 1995 /* 1996 * If this page table page was unmapped by a promotion, then it 1997 * contains valid mappings. Zero it to invalidate those mappings. 1998 */ 1999 if (ml3->valid != 0) 2000 pagezero((void *)PHYS_TO_DMAP(ml3pa)); 2001 2002 /* 2003 * Demote the mapping. 2004 */ 2005 oldl2 = pmap_load_store(l2, newl2); 2006 KASSERT(oldl2 == 0, ("%s: found existing mapping at %p: %#lx", 2007 __func__, l2, oldl2)); 2008 } 2009 2010 /* 2011 * pmap_remove_l2: Do the things to unmap a level 2 superpage. 2012 */ 2013 static int 2014 pmap_remove_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva, 2015 pd_entry_t l1e, struct spglist *free, struct rwlock **lockp) 2016 { 2017 struct md_page *pvh; 2018 pt_entry_t oldl2; 2019 vm_offset_t eva, va; 2020 vm_page_t m, ml3; 2021 2022 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2023 KASSERT((sva & L2_OFFSET) == 0, ("pmap_remove_l2: sva is not aligned")); 2024 oldl2 = pmap_load_clear(l2); 2025 KASSERT((oldl2 & PTE_RWX) != 0, 2026 ("pmap_remove_l2: L2e %lx is not a superpage mapping", oldl2)); 2027 2028 /* 2029 * The sfence.vma documentation states that it is sufficient to specify 2030 * a single address within a superpage mapping. However, since we do 2031 * not perform any invalidation upon promotion, TLBs may still be 2032 * caching 4KB mappings within the superpage, so we must invalidate the 2033 * entire range. 
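 *
 * In other words, flushing a single address here would only be
 * guaranteed to remove the 2MB translation itself; stale 4KB
 * translations left over from before the promotion could survive it,
 * which is why the full [sva, sva + L2_SIZE) range is passed to
 * pmap_invalidate_range() below.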
2034 */ 2035 pmap_invalidate_range(pmap, sva, sva + L2_SIZE); 2036 if ((oldl2 & PTE_SW_WIRED) != 0) 2037 pmap->pm_stats.wired_count -= L2_SIZE / PAGE_SIZE; 2038 pmap_resident_count_dec(pmap, L2_SIZE / PAGE_SIZE); 2039 if ((oldl2 & PTE_SW_MANAGED) != 0) { 2040 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, PTE_TO_PHYS(oldl2)); 2041 pvh = pa_to_pvh(PTE_TO_PHYS(oldl2)); 2042 pmap_pvh_free(pvh, pmap, sva); 2043 eva = sva + L2_SIZE; 2044 for (va = sva, m = PHYS_TO_VM_PAGE(PTE_TO_PHYS(oldl2)); 2045 va < eva; va += PAGE_SIZE, m++) { 2046 if ((oldl2 & PTE_D) != 0) 2047 vm_page_dirty(m); 2048 if ((oldl2 & PTE_A) != 0) 2049 vm_page_aflag_set(m, PGA_REFERENCED); 2050 if (TAILQ_EMPTY(&m->md.pv_list) && 2051 TAILQ_EMPTY(&pvh->pv_list)) 2052 vm_page_aflag_clear(m, PGA_WRITEABLE); 2053 } 2054 } 2055 if (pmap == kernel_pmap) { 2056 pmap_remove_kernel_l2(pmap, l2, sva); 2057 } else { 2058 ml3 = pmap_remove_pt_page(pmap, sva); 2059 if (ml3 != NULL) { 2060 KASSERT(ml3->valid == VM_PAGE_BITS_ALL, 2061 ("pmap_remove_l2: l3 page not promoted")); 2062 pmap_resident_count_dec(pmap, 1); 2063 KASSERT(ml3->ref_count == Ln_ENTRIES, 2064 ("pmap_remove_l2: l3 page ref count error")); 2065 ml3->ref_count = 1; 2066 vm_page_unwire_noq(ml3); 2067 pmap_add_delayed_free_list(ml3, free, FALSE); 2068 } 2069 } 2070 return (pmap_unuse_pt(pmap, sva, l1e, free)); 2071 } 2072 2073 /* 2074 * pmap_remove_l3: do the things to unmap a page in a process 2075 */ 2076 static int 2077 pmap_remove_l3(pmap_t pmap, pt_entry_t *l3, vm_offset_t va, 2078 pd_entry_t l2e, struct spglist *free, struct rwlock **lockp) 2079 { 2080 struct md_page *pvh; 2081 pt_entry_t old_l3; 2082 vm_paddr_t phys; 2083 vm_page_t m; 2084 2085 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2086 old_l3 = pmap_load_clear(l3); 2087 pmap_invalidate_page(pmap, va); 2088 if (old_l3 & PTE_SW_WIRED) 2089 pmap->pm_stats.wired_count -= 1; 2090 pmap_resident_count_dec(pmap, 1); 2091 if (old_l3 & PTE_SW_MANAGED) { 2092 phys = PTE_TO_PHYS(old_l3); 2093 m = PHYS_TO_VM_PAGE(phys); 2094 if ((old_l3 & PTE_D) != 0) 2095 vm_page_dirty(m); 2096 if (old_l3 & PTE_A) 2097 vm_page_aflag_set(m, PGA_REFERENCED); 2098 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); 2099 pmap_pvh_free(&m->md, pmap, va); 2100 if (TAILQ_EMPTY(&m->md.pv_list) && 2101 (m->flags & PG_FICTITIOUS) == 0) { 2102 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 2103 if (TAILQ_EMPTY(&pvh->pv_list)) 2104 vm_page_aflag_clear(m, PGA_WRITEABLE); 2105 } 2106 } 2107 2108 return (pmap_unuse_pt(pmap, va, l2e, free)); 2109 } 2110 2111 /* 2112 * Remove the given range of addresses from the specified map. 2113 * 2114 * It is assumed that the start and end are properly 2115 * rounded to the page size. 2116 */ 2117 void 2118 pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 2119 { 2120 struct spglist free; 2121 struct rwlock *lock; 2122 vm_offset_t va, va_next; 2123 pd_entry_t *l1, *l2, l2e; 2124 pt_entry_t *l3; 2125 2126 /* 2127 * Perform an unsynchronized read. This is, however, safe. 2128 */ 2129 if (pmap->pm_stats.resident_count == 0) 2130 return; 2131 2132 SLIST_INIT(&free); 2133 2134 rw_rlock(&pvh_global_lock); 2135 PMAP_LOCK(pmap); 2136 2137 lock = NULL; 2138 for (; sva < eva; sva = va_next) { 2139 if (pmap->pm_stats.resident_count == 0) 2140 break; 2141 2142 l1 = pmap_l1(pmap, sva); 2143 if (pmap_load(l1) == 0) { 2144 va_next = (sva + L1_SIZE) & ~L1_OFFSET; 2145 if (va_next < sva) 2146 va_next = eva; 2147 continue; 2148 } 2149 2150 /* 2151 * Calculate index for next page table. 
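 *
 * For example, with 2MB L2 pages, sva == 0x40201000 gives
 * va_next == (0x40201000 + 0x200000) & ~0x1fffff == 0x40400000,
 * the base of the next L2-aligned region; the (va_next < sva)
 * check below catches overflow at the very top of the address space.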
2152 */ 2153 va_next = (sva + L2_SIZE) & ~L2_OFFSET; 2154 if (va_next < sva) 2155 va_next = eva; 2156 2157 l2 = pmap_l1_to_l2(l1, sva); 2158 if (l2 == NULL) 2159 continue; 2160 if ((l2e = pmap_load(l2)) == 0) 2161 continue; 2162 if ((l2e & PTE_RWX) != 0) { 2163 if (sva + L2_SIZE == va_next && eva >= va_next) { 2164 (void)pmap_remove_l2(pmap, l2, sva, 2165 pmap_load(l1), &free, &lock); 2166 continue; 2167 } else if (!pmap_demote_l2_locked(pmap, l2, sva, 2168 &lock)) { 2169 /* 2170 * The large page mapping was destroyed. 2171 */ 2172 continue; 2173 } 2174 l2e = pmap_load(l2); 2175 } 2176 2177 /* 2178 * Limit our scan to either the end of the va represented 2179 * by the current page table page, or to the end of the 2180 * range being removed. 2181 */ 2182 if (va_next > eva) 2183 va_next = eva; 2184 2185 va = va_next; 2186 for (l3 = pmap_l2_to_l3(l2, sva); sva != va_next; l3++, 2187 sva += L3_SIZE) { 2188 if (pmap_load(l3) == 0) { 2189 if (va != va_next) { 2190 pmap_invalidate_range(pmap, va, sva); 2191 va = va_next; 2192 } 2193 continue; 2194 } 2195 if (va == va_next) 2196 va = sva; 2197 if (pmap_remove_l3(pmap, l3, sva, l2e, &free, &lock)) { 2198 sva += L3_SIZE; 2199 break; 2200 } 2201 } 2202 if (va != va_next) 2203 pmap_invalidate_range(pmap, va, sva); 2204 } 2205 if (lock != NULL) 2206 rw_wunlock(lock); 2207 rw_runlock(&pvh_global_lock); 2208 PMAP_UNLOCK(pmap); 2209 vm_page_free_pages_toq(&free, false); 2210 } 2211 2212 /* 2213 * Routine: pmap_remove_all 2214 * Function: 2215 * Removes this physical page from 2216 * all physical maps in which it resides. 2217 * Reflects back modify bits to the pager. 2218 * 2219 * Notes: 2220 * Original versions of this routine were very 2221 * inefficient because they iteratively called 2222 * pmap_remove (slow...) 2223 */ 2224 2225 void 2226 pmap_remove_all(vm_page_t m) 2227 { 2228 struct spglist free; 2229 struct md_page *pvh; 2230 pmap_t pmap; 2231 pt_entry_t *l3, l3e; 2232 pd_entry_t *l2, l2e; 2233 pv_entry_t pv; 2234 vm_offset_t va; 2235 2236 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 2237 ("pmap_remove_all: page %p is not managed", m)); 2238 SLIST_INIT(&free); 2239 pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : 2240 pa_to_pvh(VM_PAGE_TO_PHYS(m)); 2241 2242 rw_wlock(&pvh_global_lock); 2243 while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) { 2244 pmap = PV_PMAP(pv); 2245 PMAP_LOCK(pmap); 2246 va = pv->pv_va; 2247 l2 = pmap_l2(pmap, va); 2248 (void)pmap_demote_l2(pmap, l2, va); 2249 PMAP_UNLOCK(pmap); 2250 } 2251 while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { 2252 pmap = PV_PMAP(pv); 2253 PMAP_LOCK(pmap); 2254 pmap_resident_count_dec(pmap, 1); 2255 l2 = pmap_l2(pmap, pv->pv_va); 2256 KASSERT(l2 != NULL, ("pmap_remove_all: no l2 table found")); 2257 l2e = pmap_load(l2); 2258 2259 KASSERT((l2e & PTE_RX) == 0, 2260 ("pmap_remove_all: found a superpage in %p's pv list", m)); 2261 2262 l3 = pmap_l2_to_l3(l2, pv->pv_va); 2263 l3e = pmap_load_clear(l3); 2264 pmap_invalidate_page(pmap, pv->pv_va); 2265 if (l3e & PTE_SW_WIRED) 2266 pmap->pm_stats.wired_count--; 2267 if ((l3e & PTE_A) != 0) 2268 vm_page_aflag_set(m, PGA_REFERENCED); 2269 2270 /* 2271 * Update the vm_page_t clean and reference bits. 
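 *
 * A set PTE_D means the page may have been written through this
 * mapping, so the modification must be recorded in the vm_page
 * before the pv entry is freed; the referenced state was already
 * transferred via PGA_REFERENCED above.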
2272 */ 2273 if ((l3e & PTE_D) != 0) 2274 vm_page_dirty(m); 2275 pmap_unuse_pt(pmap, pv->pv_va, pmap_load(l2), &free); 2276 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 2277 m->md.pv_gen++; 2278 free_pv_entry(pmap, pv); 2279 PMAP_UNLOCK(pmap); 2280 } 2281 vm_page_aflag_clear(m, PGA_WRITEABLE); 2282 rw_wunlock(&pvh_global_lock); 2283 vm_page_free_pages_toq(&free, false); 2284 } 2285 2286 /* 2287 * Set the physical protection on the 2288 * specified range of this map as requested. 2289 */ 2290 void 2291 pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) 2292 { 2293 pd_entry_t *l1, *l2, l2e; 2294 pt_entry_t *l3, l3e, mask; 2295 vm_page_t m, mt; 2296 vm_paddr_t pa; 2297 vm_offset_t va_next; 2298 bool anychanged, pv_lists_locked; 2299 2300 if ((prot & VM_PROT_READ) == VM_PROT_NONE) { 2301 pmap_remove(pmap, sva, eva); 2302 return; 2303 } 2304 2305 if ((prot & (VM_PROT_WRITE | VM_PROT_EXECUTE)) == 2306 (VM_PROT_WRITE | VM_PROT_EXECUTE)) 2307 return; 2308 2309 anychanged = false; 2310 pv_lists_locked = false; 2311 mask = 0; 2312 if ((prot & VM_PROT_WRITE) == 0) 2313 mask |= PTE_W | PTE_D; 2314 if ((prot & VM_PROT_EXECUTE) == 0) 2315 mask |= PTE_X; 2316 resume: 2317 PMAP_LOCK(pmap); 2318 for (; sva < eva; sva = va_next) { 2319 l1 = pmap_l1(pmap, sva); 2320 if (pmap_load(l1) == 0) { 2321 va_next = (sva + L1_SIZE) & ~L1_OFFSET; 2322 if (va_next < sva) 2323 va_next = eva; 2324 continue; 2325 } 2326 2327 va_next = (sva + L2_SIZE) & ~L2_OFFSET; 2328 if (va_next < sva) 2329 va_next = eva; 2330 2331 l2 = pmap_l1_to_l2(l1, sva); 2332 if (l2 == NULL || (l2e = pmap_load(l2)) == 0) 2333 continue; 2334 if ((l2e & PTE_RWX) != 0) { 2335 if (sva + L2_SIZE == va_next && eva >= va_next) { 2336 retryl2: 2337 if ((prot & VM_PROT_WRITE) == 0 && 2338 (l2e & (PTE_SW_MANAGED | PTE_D)) == 2339 (PTE_SW_MANAGED | PTE_D)) { 2340 pa = PTE_TO_PHYS(l2e); 2341 m = PHYS_TO_VM_PAGE(pa); 2342 for (mt = m; mt < &m[Ln_ENTRIES]; mt++) 2343 vm_page_dirty(mt); 2344 } 2345 if (!atomic_fcmpset_long(l2, &l2e, l2e & ~mask)) 2346 goto retryl2; 2347 anychanged = true; 2348 continue; 2349 } else { 2350 if (!pv_lists_locked) { 2351 pv_lists_locked = true; 2352 if (!rw_try_rlock(&pvh_global_lock)) { 2353 if (anychanged) 2354 pmap_invalidate_all( 2355 pmap); 2356 PMAP_UNLOCK(pmap); 2357 rw_rlock(&pvh_global_lock); 2358 goto resume; 2359 } 2360 } 2361 if (!pmap_demote_l2(pmap, l2, sva)) { 2362 /* 2363 * The large page mapping was destroyed. 
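 *
 * Demotion fails only when the 2MB mapping was never accessed
 * (PTE_A clear) or a page table page could not be allocated; in
 * either case pmap_demote_l2() has already removed the mapping
 * entirely, so there is nothing left here to write-protect.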
2364 */ 2365 continue; 2366 } 2367 } 2368 } 2369 2370 if (va_next > eva) 2371 va_next = eva; 2372 2373 for (l3 = pmap_l2_to_l3(l2, sva); sva != va_next; l3++, 2374 sva += L3_SIZE) { 2375 l3e = pmap_load(l3); 2376 retryl3: 2377 if ((l3e & PTE_V) == 0) 2378 continue; 2379 if ((prot & VM_PROT_WRITE) == 0 && 2380 (l3e & (PTE_SW_MANAGED | PTE_D)) == 2381 (PTE_SW_MANAGED | PTE_D)) { 2382 m = PHYS_TO_VM_PAGE(PTE_TO_PHYS(l3e)); 2383 vm_page_dirty(m); 2384 } 2385 if (!atomic_fcmpset_long(l3, &l3e, l3e & ~mask)) 2386 goto retryl3; 2387 anychanged = true; 2388 } 2389 } 2390 if (anychanged) 2391 pmap_invalidate_all(pmap); 2392 if (pv_lists_locked) 2393 rw_runlock(&pvh_global_lock); 2394 PMAP_UNLOCK(pmap); 2395 } 2396 2397 int 2398 pmap_fault(pmap_t pmap, vm_offset_t va, vm_prot_t ftype) 2399 { 2400 pd_entry_t *l2, l2e; 2401 pt_entry_t bits, *pte, oldpte; 2402 int rv; 2403 2404 rv = 0; 2405 PMAP_LOCK(pmap); 2406 l2 = pmap_l2(pmap, va); 2407 if (l2 == NULL || ((l2e = pmap_load(l2)) & PTE_V) == 0) 2408 goto done; 2409 if ((l2e & PTE_RWX) == 0) { 2410 pte = pmap_l2_to_l3(l2, va); 2411 if (pte == NULL || ((oldpte = pmap_load(pte)) & PTE_V) == 0) 2412 goto done; 2413 } else { 2414 pte = l2; 2415 oldpte = l2e; 2416 } 2417 2418 if ((pmap != kernel_pmap && (oldpte & PTE_U) == 0) || 2419 (ftype == VM_PROT_WRITE && (oldpte & PTE_W) == 0) || 2420 (ftype == VM_PROT_EXECUTE && (oldpte & PTE_X) == 0) || 2421 (ftype == VM_PROT_READ && (oldpte & PTE_R) == 0)) 2422 goto done; 2423 2424 bits = PTE_A; 2425 if (ftype == VM_PROT_WRITE) 2426 bits |= PTE_D; 2427 2428 /* 2429 * Spurious faults can occur if the implementation caches invalid 2430 * entries in the TLB, or if simultaneous accesses on multiple CPUs 2431 * race with each other. 2432 */ 2433 if ((oldpte & bits) != bits) 2434 pmap_store_bits(pte, bits); 2435 sfence_vma(); 2436 rv = 1; 2437 done: 2438 PMAP_UNLOCK(pmap); 2439 return (rv); 2440 } 2441 2442 static bool 2443 pmap_demote_l2(pmap_t pmap, pd_entry_t *l2, vm_offset_t va) 2444 { 2445 struct rwlock *lock; 2446 bool rv; 2447 2448 lock = NULL; 2449 rv = pmap_demote_l2_locked(pmap, l2, va, &lock); 2450 if (lock != NULL) 2451 rw_wunlock(lock); 2452 return (rv); 2453 } 2454 2455 /* 2456 * Tries to demote a 2MB page mapping. If demotion fails, the 2MB page 2457 * mapping is invalidated. 2458 */ 2459 static bool 2460 pmap_demote_l2_locked(pmap_t pmap, pd_entry_t *l2, vm_offset_t va, 2461 struct rwlock **lockp) 2462 { 2463 struct spglist free; 2464 vm_page_t mpte; 2465 pd_entry_t newl2, oldl2; 2466 pt_entry_t *firstl3, newl3; 2467 vm_paddr_t mptepa; 2468 int i; 2469 2470 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2471 2472 oldl2 = pmap_load(l2); 2473 KASSERT((oldl2 & PTE_RWX) != 0, 2474 ("pmap_demote_l2_locked: oldl2 is not a leaf entry")); 2475 if ((oldl2 & PTE_A) == 0 || (mpte = pmap_remove_pt_page(pmap, va)) == 2476 NULL) { 2477 if ((oldl2 & PTE_A) == 0 || (mpte = vm_page_alloc_noobj( 2478 (VIRT_IN_DMAP(va) ? 
VM_ALLOC_INTERRUPT : 0) | 2479 VM_ALLOC_WIRED)) == NULL) { 2480 SLIST_INIT(&free); 2481 (void)pmap_remove_l2(pmap, l2, va & ~L2_OFFSET, 2482 pmap_load(pmap_l1(pmap, va)), &free, lockp); 2483 vm_page_free_pages_toq(&free, true); 2484 CTR2(KTR_PMAP, "pmap_demote_l2_locked: " 2485 "failure for va %#lx in pmap %p", va, pmap); 2486 return (false); 2487 } 2488 mpte->pindex = pmap_l2_pindex(va); 2489 if (va < VM_MAXUSER_ADDRESS) { 2490 mpte->ref_count = Ln_ENTRIES; 2491 pmap_resident_count_inc(pmap, 1); 2492 } 2493 } 2494 mptepa = VM_PAGE_TO_PHYS(mpte); 2495 firstl3 = (pt_entry_t *)PHYS_TO_DMAP(mptepa); 2496 newl2 = ((mptepa / PAGE_SIZE) << PTE_PPN0_S) | PTE_V; 2497 KASSERT((oldl2 & PTE_A) != 0, 2498 ("pmap_demote_l2_locked: oldl2 is missing PTE_A")); 2499 KASSERT((oldl2 & (PTE_D | PTE_W)) != PTE_W, 2500 ("pmap_demote_l2_locked: oldl2 is missing PTE_D")); 2501 newl3 = oldl2; 2502 2503 /* 2504 * If the page table page is not leftover from an earlier promotion, 2505 * initialize it. 2506 */ 2507 if (mpte->valid == 0) { 2508 for (i = 0; i < Ln_ENTRIES; i++) 2509 pmap_store(firstl3 + i, newl3 + (i << PTE_PPN0_S)); 2510 } 2511 KASSERT(PTE_TO_PHYS(pmap_load(firstl3)) == PTE_TO_PHYS(newl3), 2512 ("pmap_demote_l2_locked: firstl3 and newl3 map different physical " 2513 "addresses")); 2514 2515 /* 2516 * If the mapping has changed attributes, update the page table 2517 * entries. 2518 */ 2519 if ((pmap_load(firstl3) & PTE_PROMOTE) != (newl3 & PTE_PROMOTE)) 2520 for (i = 0; i < Ln_ENTRIES; i++) 2521 pmap_store(firstl3 + i, newl3 + (i << PTE_PPN0_S)); 2522 2523 /* 2524 * The spare PV entries must be reserved prior to demoting the 2525 * mapping, that is, prior to changing the L2 entry. Otherwise, the 2526 * state of the L2 entry and the PV lists will be inconsistent, which 2527 * can result in reclaim_pv_chunk() attempting to remove a PV entry from 2528 * the wrong PV list and pmap_pv_demote_l2() failing to find the 2529 * expected PV entry for the 2MB page mapping that is being demoted. 2530 */ 2531 if ((oldl2 & PTE_SW_MANAGED) != 0) 2532 reserve_pv_entries(pmap, Ln_ENTRIES - 1, lockp); 2533 2534 /* 2535 * Demote the mapping. 2536 */ 2537 pmap_store(l2, newl2); 2538 2539 /* 2540 * Demote the PV entry. 2541 */ 2542 if ((oldl2 & PTE_SW_MANAGED) != 0) 2543 pmap_pv_demote_l2(pmap, va, PTE_TO_PHYS(oldl2), lockp); 2544 2545 atomic_add_long(&pmap_l2_demotions, 1); 2546 CTR2(KTR_PMAP, "pmap_demote_l2_locked: success for va %#lx in pmap %p", 2547 va, pmap); 2548 return (true); 2549 } 2550 2551 #if VM_NRESERVLEVEL > 0 2552 static void 2553 pmap_promote_l2(pmap_t pmap, pd_entry_t *l2, vm_offset_t va, 2554 struct rwlock **lockp) 2555 { 2556 pt_entry_t *firstl3, firstl3e, *l3, l3e; 2557 vm_paddr_t pa; 2558 vm_page_t ml3; 2559 2560 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2561 2562 va &= ~L2_OFFSET; 2563 KASSERT((pmap_load(l2) & PTE_RWX) == 0, 2564 ("pmap_promote_l2: invalid l2 entry %p", l2)); 2565 2566 firstl3 = (pt_entry_t *)PHYS_TO_DMAP(PTE_TO_PHYS(pmap_load(l2))); 2567 firstl3e = pmap_load(firstl3); 2568 pa = PTE_TO_PHYS(firstl3e); 2569 if ((pa & L2_OFFSET) != 0) { 2570 CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx pmap %p", 2571 va, pmap); 2572 atomic_add_long(&pmap_l2_p_failures, 1); 2573 return; 2574 } 2575 2576 /* 2577 * Downgrade a clean, writable mapping to read-only to ensure that the 2578 * hardware does not set PTE_D while we are comparing PTEs. 
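 * Otherwise a concurrent store could set PTE_D in one of the 4KB
 * PTEs after it has been examined here, and the promoted 2MB entry
 * would be created without PTE_D, silently losing that modification.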
2579 * 2580 * Upon a write access to a clean mapping, the implementation will 2581 * either atomically check protections and set PTE_D, or raise a page 2582 * fault. In the latter case, the pmap lock provides atomicity. Thus, 2583 * we do not issue an sfence.vma here and instead rely on pmap_fault() 2584 * to do so lazily. 2585 */ 2586 while ((firstl3e & (PTE_W | PTE_D)) == PTE_W) { 2587 if (atomic_fcmpset_64(firstl3, &firstl3e, firstl3e & ~PTE_W)) { 2588 firstl3e &= ~PTE_W; 2589 break; 2590 } 2591 } 2592 2593 pa += PAGE_SIZE; 2594 for (l3 = firstl3 + 1; l3 < firstl3 + Ln_ENTRIES; l3++) { 2595 l3e = pmap_load(l3); 2596 if (PTE_TO_PHYS(l3e) != pa) { 2597 CTR2(KTR_PMAP, 2598 "pmap_promote_l2: failure for va %#lx pmap %p", 2599 va, pmap); 2600 atomic_add_long(&pmap_l2_p_failures, 1); 2601 return; 2602 } 2603 while ((l3e & (PTE_W | PTE_D)) == PTE_W) { 2604 if (atomic_fcmpset_64(l3, &l3e, l3e & ~PTE_W)) { 2605 l3e &= ~PTE_W; 2606 break; 2607 } 2608 } 2609 if ((l3e & PTE_PROMOTE) != (firstl3e & PTE_PROMOTE)) { 2610 CTR2(KTR_PMAP, 2611 "pmap_promote_l2: failure for va %#lx pmap %p", 2612 va, pmap); 2613 atomic_add_long(&pmap_l2_p_failures, 1); 2614 return; 2615 } 2616 pa += PAGE_SIZE; 2617 } 2618 2619 ml3 = PHYS_TO_VM_PAGE(PTE_TO_PHYS(pmap_load(l2))); 2620 KASSERT(ml3->pindex == pmap_l2_pindex(va), 2621 ("pmap_promote_l2: page table page's pindex is wrong")); 2622 if (pmap_insert_pt_page(pmap, ml3, true)) { 2623 CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx pmap %p", 2624 va, pmap); 2625 atomic_add_long(&pmap_l2_p_failures, 1); 2626 return; 2627 } 2628 2629 if ((firstl3e & PTE_SW_MANAGED) != 0) 2630 pmap_pv_promote_l2(pmap, va, PTE_TO_PHYS(firstl3e), lockp); 2631 2632 pmap_store(l2, firstl3e); 2633 2634 atomic_add_long(&pmap_l2_promotions, 1); 2635 CTR2(KTR_PMAP, "pmap_promote_l2: success for va %#lx in pmap %p", va, 2636 pmap); 2637 } 2638 #endif 2639 2640 /* 2641 * Insert the given physical page (p) at 2642 * the specified virtual address (v) in the 2643 * target physical map with the protection requested. 2644 * 2645 * If specified, the page will be wired down, meaning 2646 * that the related pte can not be reclaimed. 2647 * 2648 * NB: This is the only routine which MAY NOT lazy-evaluate 2649 * or lose information. That is, this routine must actually 2650 * insert this page into the given map NOW. 2651 */ 2652 int 2653 pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, 2654 u_int flags, int8_t psind) 2655 { 2656 struct rwlock *lock; 2657 pd_entry_t *l1, *l2, l2e; 2658 pt_entry_t new_l3, orig_l3; 2659 pt_entry_t *l3; 2660 pv_entry_t pv; 2661 vm_paddr_t opa, pa, l2_pa, l3_pa; 2662 vm_page_t mpte, om, l2_m, l3_m; 2663 pt_entry_t entry; 2664 pn_t l2_pn, l3_pn, pn; 2665 int rv; 2666 bool nosleep; 2667 2668 va = trunc_page(va); 2669 if ((m->oflags & VPO_UNMANAGED) == 0) 2670 VM_PAGE_OBJECT_BUSY_ASSERT(m); 2671 pa = VM_PAGE_TO_PHYS(m); 2672 pn = (pa / PAGE_SIZE); 2673 2674 new_l3 = PTE_V | PTE_R | PTE_A; 2675 if (prot & VM_PROT_EXECUTE) 2676 new_l3 |= PTE_X; 2677 if (flags & VM_PROT_WRITE) 2678 new_l3 |= PTE_D; 2679 if (prot & VM_PROT_WRITE) 2680 new_l3 |= PTE_W; 2681 if (va < VM_MAX_USER_ADDRESS) 2682 new_l3 |= PTE_U; 2683 2684 new_l3 |= (pn << PTE_PPN0_S); 2685 if ((flags & PMAP_ENTER_WIRED) != 0) 2686 new_l3 |= PTE_SW_WIRED; 2687 2688 /* 2689 * Set modified bit gratuitously for writeable mappings if 2690 * the page is unmanaged. We do not want to take a fault 2691 * to do the dirty bit accounting for these mappings. 
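 *
 * Unmanaged pages have no pv list and their modified state is not
 * tracked in the vm_page, so a soft dirty-bit fault would be pure
 * overhead; presetting PTE_D here lets such mappings be written
 * without a trip through pmap_fault().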
2692 */ 2693 if ((m->oflags & VPO_UNMANAGED) != 0) { 2694 if (prot & VM_PROT_WRITE) 2695 new_l3 |= PTE_D; 2696 } else 2697 new_l3 |= PTE_SW_MANAGED; 2698 2699 CTR2(KTR_PMAP, "pmap_enter: %.16lx -> %.16lx", va, pa); 2700 2701 lock = NULL; 2702 mpte = NULL; 2703 rw_rlock(&pvh_global_lock); 2704 PMAP_LOCK(pmap); 2705 if (psind == 1) { 2706 /* Assert the required virtual and physical alignment. */ 2707 KASSERT((va & L2_OFFSET) == 0, 2708 ("pmap_enter: va %#lx unaligned", va)); 2709 KASSERT(m->psind > 0, ("pmap_enter: m->psind < psind")); 2710 rv = pmap_enter_l2(pmap, va, new_l3, flags, m, &lock); 2711 goto out; 2712 } 2713 2714 l2 = pmap_l2(pmap, va); 2715 if (l2 != NULL && ((l2e = pmap_load(l2)) & PTE_V) != 0 && 2716 ((l2e & PTE_RWX) == 0 || pmap_demote_l2_locked(pmap, l2, 2717 va, &lock))) { 2718 l3 = pmap_l2_to_l3(l2, va); 2719 if (va < VM_MAXUSER_ADDRESS) { 2720 mpte = PHYS_TO_VM_PAGE(PTE_TO_PHYS(pmap_load(l2))); 2721 mpte->ref_count++; 2722 } 2723 } else if (va < VM_MAXUSER_ADDRESS) { 2724 nosleep = (flags & PMAP_ENTER_NOSLEEP) != 0; 2725 mpte = pmap_alloc_l3(pmap, va, nosleep ? NULL : &lock); 2726 if (mpte == NULL && nosleep) { 2727 CTR0(KTR_PMAP, "pmap_enter: mpte == NULL"); 2728 if (lock != NULL) 2729 rw_wunlock(lock); 2730 rw_runlock(&pvh_global_lock); 2731 PMAP_UNLOCK(pmap); 2732 return (KERN_RESOURCE_SHORTAGE); 2733 } 2734 l3 = pmap_l3(pmap, va); 2735 } else { 2736 l3 = pmap_l3(pmap, va); 2737 /* TODO: This is not optimal, but should mostly work */ 2738 if (l3 == NULL) { 2739 if (l2 == NULL) { 2740 l2_m = vm_page_alloc_noobj(VM_ALLOC_WIRED | 2741 VM_ALLOC_ZERO); 2742 if (l2_m == NULL) 2743 panic("pmap_enter: l2 pte_m == NULL"); 2744 2745 l2_pa = VM_PAGE_TO_PHYS(l2_m); 2746 l2_pn = (l2_pa / PAGE_SIZE); 2747 2748 l1 = pmap_l1(pmap, va); 2749 entry = (PTE_V); 2750 entry |= (l2_pn << PTE_PPN0_S); 2751 pmap_store(l1, entry); 2752 pmap_distribute_l1(pmap, pmap_l1_index(va), entry); 2753 l2 = pmap_l1_to_l2(l1, va); 2754 } 2755 2756 l3_m = vm_page_alloc_noobj(VM_ALLOC_WIRED | 2757 VM_ALLOC_ZERO); 2758 if (l3_m == NULL) 2759 panic("pmap_enter: l3 pte_m == NULL"); 2760 2761 l3_pa = VM_PAGE_TO_PHYS(l3_m); 2762 l3_pn = (l3_pa / PAGE_SIZE); 2763 entry = (PTE_V); 2764 entry |= (l3_pn << PTE_PPN0_S); 2765 pmap_store(l2, entry); 2766 l3 = pmap_l2_to_l3(l2, va); 2767 } 2768 pmap_invalidate_page(pmap, va); 2769 } 2770 2771 orig_l3 = pmap_load(l3); 2772 opa = PTE_TO_PHYS(orig_l3); 2773 pv = NULL; 2774 2775 /* 2776 * Is the specified virtual address already mapped? 2777 */ 2778 if ((orig_l3 & PTE_V) != 0) { 2779 /* 2780 * Wiring change, just update stats. We don't worry about 2781 * wiring PT pages as they remain resident as long as there 2782 * are valid mappings in them. Hence, if a user page is wired, 2783 * the PT page will be also. 2784 */ 2785 if ((flags & PMAP_ENTER_WIRED) != 0 && 2786 (orig_l3 & PTE_SW_WIRED) == 0) 2787 pmap->pm_stats.wired_count++; 2788 else if ((flags & PMAP_ENTER_WIRED) == 0 && 2789 (orig_l3 & PTE_SW_WIRED) != 0) 2790 pmap->pm_stats.wired_count--; 2791 2792 /* 2793 * Remove the extra PT page reference. 2794 */ 2795 if (mpte != NULL) { 2796 mpte->ref_count--; 2797 KASSERT(mpte->ref_count > 0, 2798 ("pmap_enter: missing reference to page table page," 2799 " va: 0x%lx", va)); 2800 } 2801 2802 /* 2803 * Has the physical page changed? 2804 */ 2805 if (opa == pa) { 2806 /* 2807 * No, might be a protection or wiring change. 
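 *
 * Since the physical page is unchanged, the existing PV entry (if
 * the mapping is managed) remains valid and only the PTE bits need
 * to be updated, which happens at "validate" below.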
2808 */ 2809 if ((orig_l3 & PTE_SW_MANAGED) != 0 && 2810 (new_l3 & PTE_W) != 0) 2811 vm_page_aflag_set(m, PGA_WRITEABLE); 2812 goto validate; 2813 } 2814 2815 /* 2816 * The physical page has changed. Temporarily invalidate 2817 * the mapping. This ensures that all threads sharing the 2818 * pmap keep a consistent view of the mapping, which is 2819 * necessary for the correct handling of COW faults. It 2820 * also permits reuse of the old mapping's PV entry, 2821 * avoiding an allocation. 2822 * 2823 * For consistency, handle unmanaged mappings the same way. 2824 */ 2825 orig_l3 = pmap_load_clear(l3); 2826 KASSERT(PTE_TO_PHYS(orig_l3) == opa, 2827 ("pmap_enter: unexpected pa update for %#lx", va)); 2828 if ((orig_l3 & PTE_SW_MANAGED) != 0) { 2829 om = PHYS_TO_VM_PAGE(opa); 2830 2831 /* 2832 * The pmap lock is sufficient to synchronize with 2833 * concurrent calls to pmap_page_test_mappings() and 2834 * pmap_ts_referenced(). 2835 */ 2836 if ((orig_l3 & PTE_D) != 0) 2837 vm_page_dirty(om); 2838 if ((orig_l3 & PTE_A) != 0) 2839 vm_page_aflag_set(om, PGA_REFERENCED); 2840 CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, opa); 2841 pv = pmap_pvh_remove(&om->md, pmap, va); 2842 KASSERT(pv != NULL, 2843 ("pmap_enter: no PV entry for %#lx", va)); 2844 if ((new_l3 & PTE_SW_MANAGED) == 0) 2845 free_pv_entry(pmap, pv); 2846 if ((om->a.flags & PGA_WRITEABLE) != 0 && 2847 TAILQ_EMPTY(&om->md.pv_list) && 2848 ((om->flags & PG_FICTITIOUS) != 0 || 2849 TAILQ_EMPTY(&pa_to_pvh(opa)->pv_list))) 2850 vm_page_aflag_clear(om, PGA_WRITEABLE); 2851 } 2852 pmap_invalidate_page(pmap, va); 2853 orig_l3 = 0; 2854 } else { 2855 /* 2856 * Increment the counters. 2857 */ 2858 if ((new_l3 & PTE_SW_WIRED) != 0) 2859 pmap->pm_stats.wired_count++; 2860 pmap_resident_count_inc(pmap, 1); 2861 } 2862 /* 2863 * Enter on the PV list if part of our managed memory. 2864 */ 2865 if ((new_l3 & PTE_SW_MANAGED) != 0) { 2866 if (pv == NULL) { 2867 pv = get_pv_entry(pmap, &lock); 2868 pv->pv_va = va; 2869 } 2870 CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, pa); 2871 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 2872 m->md.pv_gen++; 2873 if ((new_l3 & PTE_W) != 0) 2874 vm_page_aflag_set(m, PGA_WRITEABLE); 2875 } 2876 2877 validate: 2878 /* 2879 * Sync the i-cache on all harts before updating the PTE 2880 * if the new PTE is executable. 2881 */ 2882 if (prot & VM_PROT_EXECUTE) 2883 pmap_sync_icache(pmap, va, PAGE_SIZE); 2884 2885 /* 2886 * Update the L3 entry. 2887 */ 2888 if (orig_l3 != 0) { 2889 orig_l3 = pmap_load_store(l3, new_l3); 2890 pmap_invalidate_page(pmap, va); 2891 KASSERT(PTE_TO_PHYS(orig_l3) == pa, 2892 ("pmap_enter: invalid update")); 2893 if ((orig_l3 & (PTE_D | PTE_SW_MANAGED)) == 2894 (PTE_D | PTE_SW_MANAGED)) 2895 vm_page_dirty(m); 2896 } else { 2897 pmap_store(l3, new_l3); 2898 } 2899 2900 #if VM_NRESERVLEVEL > 0 2901 if (mpte != NULL && mpte->ref_count == Ln_ENTRIES && 2902 pmap_ps_enabled(pmap) && 2903 (m->flags & PG_FICTITIOUS) == 0 && 2904 vm_reserv_level_iffullpop(m) == 0) 2905 pmap_promote_l2(pmap, l2, va, &lock); 2906 #endif 2907 2908 rv = KERN_SUCCESS; 2909 out: 2910 if (lock != NULL) 2911 rw_wunlock(lock); 2912 rw_runlock(&pvh_global_lock); 2913 PMAP_UNLOCK(pmap); 2914 return (rv); 2915 } 2916 2917 /* 2918 * Tries to create a read- and/or execute-only 2MB page mapping. Returns true 2919 * if successful. 
Returns false if (1) a page table page cannot be allocated 2920 * without sleeping, (2) a mapping already exists at the specified virtual 2921 * address, or (3) a PV entry cannot be allocated without reclaiming another 2922 * PV entry. 2923 */ 2924 static bool 2925 pmap_enter_2mpage(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, 2926 struct rwlock **lockp) 2927 { 2928 pd_entry_t new_l2; 2929 pn_t pn; 2930 2931 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2932 2933 pn = VM_PAGE_TO_PHYS(m) / PAGE_SIZE; 2934 new_l2 = (pd_entry_t)((pn << PTE_PPN0_S) | PTE_R | PTE_V); 2935 if ((m->oflags & VPO_UNMANAGED) == 0) 2936 new_l2 |= PTE_SW_MANAGED; 2937 if ((prot & VM_PROT_EXECUTE) != 0) 2938 new_l2 |= PTE_X; 2939 if (va < VM_MAXUSER_ADDRESS) 2940 new_l2 |= PTE_U; 2941 return (pmap_enter_l2(pmap, va, new_l2, PMAP_ENTER_NOSLEEP | 2942 PMAP_ENTER_NOREPLACE | PMAP_ENTER_NORECLAIM, NULL, lockp) == 2943 KERN_SUCCESS); 2944 } 2945 2946 /* 2947 * Tries to create the specified 2MB page mapping. Returns KERN_SUCCESS if 2948 * the mapping was created, and either KERN_FAILURE or KERN_RESOURCE_SHORTAGE 2949 * otherwise. Returns KERN_FAILURE if PMAP_ENTER_NOREPLACE was specified and 2950 * a mapping already exists at the specified virtual address. Returns 2951 * KERN_RESOURCE_SHORTAGE if PMAP_ENTER_NOSLEEP was specified and a page table 2952 * page allocation failed. Returns KERN_RESOURCE_SHORTAGE if 2953 * PMAP_ENTER_NORECLAIM was specified and a PV entry allocation failed. 2954 * 2955 * The parameter "m" is only used when creating a managed, writeable mapping. 2956 */ 2957 static int 2958 pmap_enter_l2(pmap_t pmap, vm_offset_t va, pd_entry_t new_l2, u_int flags, 2959 vm_page_t m, struct rwlock **lockp) 2960 { 2961 struct spglist free; 2962 pd_entry_t *l2, *l3, oldl2; 2963 vm_offset_t sva; 2964 vm_page_t l2pg, mt; 2965 2966 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2967 2968 if ((l2pg = pmap_alloc_l2(pmap, va, (flags & PMAP_ENTER_NOSLEEP) != 0 ? 2969 NULL : lockp)) == NULL) { 2970 CTR2(KTR_PMAP, "pmap_enter_l2: failure for va %#lx in pmap %p", 2971 va, pmap); 2972 return (KERN_RESOURCE_SHORTAGE); 2973 } 2974 2975 l2 = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(l2pg)); 2976 l2 = &l2[pmap_l2_index(va)]; 2977 if ((oldl2 = pmap_load(l2)) != 0) { 2978 KASSERT(l2pg->ref_count > 1, 2979 ("pmap_enter_l2: l2pg's ref count is too low")); 2980 if ((flags & PMAP_ENTER_NOREPLACE) != 0) { 2981 l2pg->ref_count--; 2982 CTR2(KTR_PMAP, 2983 "pmap_enter_l2: failure for va %#lx in pmap %p", 2984 va, pmap); 2985 return (KERN_FAILURE); 2986 } 2987 SLIST_INIT(&free); 2988 if ((oldl2 & PTE_RWX) != 0) 2989 (void)pmap_remove_l2(pmap, l2, va, 2990 pmap_load(pmap_l1(pmap, va)), &free, lockp); 2991 else 2992 for (sva = va; sva < va + L2_SIZE; sva += PAGE_SIZE) { 2993 l3 = pmap_l2_to_l3(l2, sva); 2994 if ((pmap_load(l3) & PTE_V) != 0 && 2995 pmap_remove_l3(pmap, l3, sva, oldl2, &free, 2996 lockp) != 0) 2997 break; 2998 } 2999 vm_page_free_pages_toq(&free, true); 3000 if (va >= VM_MAXUSER_ADDRESS) { 3001 /* 3002 * Both pmap_remove_l2() and pmap_remove_l3() will 3003 * leave the kernel page table page zero filled. 3004 */ 3005 mt = PHYS_TO_VM_PAGE(PTE_TO_PHYS(pmap_load(l2))); 3006 if (pmap_insert_pt_page(pmap, mt, false)) 3007 panic("pmap_enter_l2: trie insert failed"); 3008 } else 3009 KASSERT(pmap_load(l2) == 0, 3010 ("pmap_enter_l2: non-zero L2 entry %p", l2)); 3011 } 3012 3013 if ((new_l2 & PTE_SW_MANAGED) != 0) { 3014 /* 3015 * Abort this mapping if its PV entry could not be created. 
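 *
 * pmap_pv_insert_l2() can only fail here when PMAP_ENTER_NORECLAIM
 * was passed (as pmap_enter_2mpage() does); without that flag it
 * would reclaim a PV chunk rather than fail. pmap_enter_2mpage()
 * treats the resulting shortage as recoverable, and
 * pmap_enter_object() then simply falls back to 4KB mappings.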
3016 */ 3017 if (!pmap_pv_insert_l2(pmap, va, new_l2, flags, lockp)) { 3018 SLIST_INIT(&free); 3019 if (pmap_unwire_ptp(pmap, va, l2pg, &free)) { 3020 /* 3021 * Although "va" is not mapped, paging-structure 3022 * caches could nonetheless have entries that 3023 * refer to the freed page table pages. 3024 * Invalidate those entries. 3025 */ 3026 pmap_invalidate_page(pmap, va); 3027 vm_page_free_pages_toq(&free, true); 3028 } 3029 CTR2(KTR_PMAP, 3030 "pmap_enter_l2: failure for va %#lx in pmap %p", 3031 va, pmap); 3032 return (KERN_RESOURCE_SHORTAGE); 3033 } 3034 if ((new_l2 & PTE_W) != 0) 3035 for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++) 3036 vm_page_aflag_set(mt, PGA_WRITEABLE); 3037 } 3038 3039 /* 3040 * Increment counters. 3041 */ 3042 if ((new_l2 & PTE_SW_WIRED) != 0) 3043 pmap->pm_stats.wired_count += L2_SIZE / PAGE_SIZE; 3044 pmap->pm_stats.resident_count += L2_SIZE / PAGE_SIZE; 3045 3046 /* 3047 * Map the superpage. 3048 */ 3049 pmap_store(l2, new_l2); 3050 3051 atomic_add_long(&pmap_l2_mappings, 1); 3052 CTR2(KTR_PMAP, "pmap_enter_l2: success for va %#lx in pmap %p", 3053 va, pmap); 3054 3055 return (KERN_SUCCESS); 3056 } 3057 3058 /* 3059 * Maps a sequence of resident pages belonging to the same object. 3060 * The sequence begins with the given page m_start. This page is 3061 * mapped at the given virtual address start. Each subsequent page is 3062 * mapped at a virtual address that is offset from start by the same 3063 * amount as the page is offset from m_start within the object. The 3064 * last page in the sequence is the page with the largest offset from 3065 * m_start that can be mapped at a virtual address less than the given 3066 * virtual address end. Not every virtual page between start and end 3067 * is mapped; only those for which a resident page exists with the 3068 * corresponding offset from m_start are mapped. 3069 */ 3070 void 3071 pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end, 3072 vm_page_t m_start, vm_prot_t prot) 3073 { 3074 struct rwlock *lock; 3075 vm_offset_t va; 3076 vm_page_t m, mpte; 3077 vm_pindex_t diff, psize; 3078 3079 VM_OBJECT_ASSERT_LOCKED(m_start->object); 3080 3081 psize = atop(end - start); 3082 mpte = NULL; 3083 m = m_start; 3084 lock = NULL; 3085 rw_rlock(&pvh_global_lock); 3086 PMAP_LOCK(pmap); 3087 while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) { 3088 va = start + ptoa(diff); 3089 if ((va & L2_OFFSET) == 0 && va + L2_SIZE <= end && 3090 m->psind == 1 && pmap_ps_enabled(pmap) && 3091 pmap_enter_2mpage(pmap, va, m, prot, &lock)) 3092 m = &m[L2_SIZE / PAGE_SIZE - 1]; 3093 else 3094 mpte = pmap_enter_quick_locked(pmap, va, m, prot, mpte, 3095 &lock); 3096 m = TAILQ_NEXT(m, listq); 3097 } 3098 if (lock != NULL) 3099 rw_wunlock(lock); 3100 rw_runlock(&pvh_global_lock); 3101 PMAP_UNLOCK(pmap); 3102 } 3103 3104 /* 3105 * this code makes some *MAJOR* assumptions: 3106 * 1. Current pmap & pmap exists. 3107 * 2. Not wired. 3108 * 3. Read access. 3109 * 4. No page table pages. 3110 * but is *MUCH* faster than pmap_enter... 
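 *
 * In practice this means pmap_enter_quick() never sleeps, never
 * replaces an existing mapping (it simply returns if the L3 entry is
 * already valid), and gives up instead of reclaiming a PV entry;
 * callers tolerate failure, since they only use this path for
 * speculative, prefault-style mappings.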
3111 */ 3112 3113 void 3114 pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot) 3115 { 3116 struct rwlock *lock; 3117 3118 lock = NULL; 3119 rw_rlock(&pvh_global_lock); 3120 PMAP_LOCK(pmap); 3121 (void)pmap_enter_quick_locked(pmap, va, m, prot, NULL, &lock); 3122 if (lock != NULL) 3123 rw_wunlock(lock); 3124 rw_runlock(&pvh_global_lock); 3125 PMAP_UNLOCK(pmap); 3126 } 3127 3128 static vm_page_t 3129 pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, 3130 vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp) 3131 { 3132 struct spglist free; 3133 vm_paddr_t phys; 3134 pd_entry_t *l2; 3135 pt_entry_t *l3, newl3; 3136 3137 KASSERT(!VA_IS_CLEANMAP(va) || 3138 (m->oflags & VPO_UNMANAGED) != 0, 3139 ("pmap_enter_quick_locked: managed mapping within the clean submap")); 3140 rw_assert(&pvh_global_lock, RA_LOCKED); 3141 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3142 3143 CTR2(KTR_PMAP, "pmap_enter_quick_locked: %p %lx", pmap, va); 3144 /* 3145 * In the case that a page table page is not 3146 * resident, we are creating it here. 3147 */ 3148 if (va < VM_MAXUSER_ADDRESS) { 3149 vm_pindex_t l2pindex; 3150 3151 /* 3152 * Calculate pagetable page index 3153 */ 3154 l2pindex = pmap_l2_pindex(va); 3155 if (mpte && (mpte->pindex == l2pindex)) { 3156 mpte->ref_count++; 3157 } else { 3158 /* 3159 * Get the l2 entry 3160 */ 3161 l2 = pmap_l2(pmap, va); 3162 3163 /* 3164 * If the page table page is mapped, we just increment 3165 * the hold count, and activate it. Otherwise, we 3166 * attempt to allocate a page table page. If this 3167 * attempt fails, we don't retry. Instead, we give up. 3168 */ 3169 if (l2 != NULL && pmap_load(l2) != 0) { 3170 phys = PTE_TO_PHYS(pmap_load(l2)); 3171 mpte = PHYS_TO_VM_PAGE(phys); 3172 mpte->ref_count++; 3173 } else { 3174 /* 3175 * Pass NULL instead of the PV list lock 3176 * pointer, because we don't intend to sleep. 3177 */ 3178 mpte = _pmap_alloc_l3(pmap, l2pindex, NULL); 3179 if (mpte == NULL) 3180 return (mpte); 3181 } 3182 } 3183 l3 = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpte)); 3184 l3 = &l3[pmap_l3_index(va)]; 3185 } else { 3186 mpte = NULL; 3187 l3 = pmap_l3(kernel_pmap, va); 3188 } 3189 if (l3 == NULL) 3190 panic("pmap_enter_quick_locked: No l3"); 3191 if (pmap_load(l3) != 0) { 3192 if (mpte != NULL) { 3193 mpte->ref_count--; 3194 mpte = NULL; 3195 } 3196 return (mpte); 3197 } 3198 3199 /* 3200 * Enter on the PV list if part of our managed memory. 3201 */ 3202 if ((m->oflags & VPO_UNMANAGED) == 0 && 3203 !pmap_try_insert_pv_entry(pmap, va, m, lockp)) { 3204 if (mpte != NULL) { 3205 SLIST_INIT(&free); 3206 if (pmap_unwire_ptp(pmap, va, mpte, &free)) { 3207 pmap_invalidate_page(pmap, va); 3208 vm_page_free_pages_toq(&free, false); 3209 } 3210 mpte = NULL; 3211 } 3212 return (mpte); 3213 } 3214 3215 /* 3216 * Increment counters 3217 */ 3218 pmap_resident_count_inc(pmap, 1); 3219 3220 newl3 = ((VM_PAGE_TO_PHYS(m) / PAGE_SIZE) << PTE_PPN0_S) | 3221 PTE_V | PTE_R; 3222 if ((prot & VM_PROT_EXECUTE) != 0) 3223 newl3 |= PTE_X; 3224 if ((m->oflags & VPO_UNMANAGED) == 0) 3225 newl3 |= PTE_SW_MANAGED; 3226 if (va < VM_MAX_USER_ADDRESS) 3227 newl3 |= PTE_U; 3228 3229 /* 3230 * Sync the i-cache on all harts before updating the PTE 3231 * if the new PTE is executable. 
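 *
 * A plain fence.i only orders instruction fetches on the local hart;
 * making the new code visible everywhere requires a remote fence on
 * the other harts, which pmap_sync_icache() is expected to arrange
 * before the PTE is published.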
3232 */ 3233 if (prot & VM_PROT_EXECUTE) 3234 pmap_sync_icache(pmap, va, PAGE_SIZE); 3235 3236 pmap_store(l3, newl3); 3237 3238 pmap_invalidate_page(pmap, va); 3239 return (mpte); 3240 } 3241 3242 /* 3243 * This code maps large physical mmap regions into the 3244 * processor address space. Note that some shortcuts 3245 * are taken, but the code works. 3246 */ 3247 void 3248 pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object, 3249 vm_pindex_t pindex, vm_size_t size) 3250 { 3251 3252 VM_OBJECT_ASSERT_WLOCKED(object); 3253 KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG, 3254 ("pmap_object_init_pt: non-device object")); 3255 } 3256 3257 /* 3258 * Clear the wired attribute from the mappings for the specified range of 3259 * addresses in the given pmap. Every valid mapping within that range 3260 * must have the wired attribute set. In contrast, invalid mappings 3261 * cannot have the wired attribute set, so they are ignored. 3262 * 3263 * The wired attribute of the page table entry is not a hardware feature, 3264 * so there is no need to invalidate any TLB entries. 3265 */ 3266 void 3267 pmap_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 3268 { 3269 vm_offset_t va_next; 3270 pd_entry_t *l1, *l2, l2e; 3271 pt_entry_t *l3, l3e; 3272 bool pv_lists_locked; 3273 3274 pv_lists_locked = false; 3275 retry: 3276 PMAP_LOCK(pmap); 3277 for (; sva < eva; sva = va_next) { 3278 l1 = pmap_l1(pmap, sva); 3279 if (pmap_load(l1) == 0) { 3280 va_next = (sva + L1_SIZE) & ~L1_OFFSET; 3281 if (va_next < sva) 3282 va_next = eva; 3283 continue; 3284 } 3285 3286 va_next = (sva + L2_SIZE) & ~L2_OFFSET; 3287 if (va_next < sva) 3288 va_next = eva; 3289 3290 l2 = pmap_l1_to_l2(l1, sva); 3291 if ((l2e = pmap_load(l2)) == 0) 3292 continue; 3293 if ((l2e & PTE_RWX) != 0) { 3294 if (sva + L2_SIZE == va_next && eva >= va_next) { 3295 if ((l2e & PTE_SW_WIRED) == 0) 3296 panic("pmap_unwire: l2 %#jx is missing " 3297 "PTE_SW_WIRED", (uintmax_t)l2e); 3298 pmap_clear_bits(l2, PTE_SW_WIRED); 3299 continue; 3300 } else { 3301 if (!pv_lists_locked) { 3302 pv_lists_locked = true; 3303 if (!rw_try_rlock(&pvh_global_lock)) { 3304 PMAP_UNLOCK(pmap); 3305 rw_rlock(&pvh_global_lock); 3306 /* Repeat sva. */ 3307 goto retry; 3308 } 3309 } 3310 if (!pmap_demote_l2(pmap, l2, sva)) 3311 panic("pmap_unwire: demotion failed"); 3312 } 3313 } 3314 3315 if (va_next > eva) 3316 va_next = eva; 3317 for (l3 = pmap_l2_to_l3(l2, sva); sva != va_next; l3++, 3318 sva += L3_SIZE) { 3319 if ((l3e = pmap_load(l3)) == 0) 3320 continue; 3321 if ((l3e & PTE_SW_WIRED) == 0) 3322 panic("pmap_unwire: l3 %#jx is missing " 3323 "PTE_SW_WIRED", (uintmax_t)l3e); 3324 3325 /* 3326 * PG_W must be cleared atomically. Although the pmap 3327 * lock synchronizes access to PG_W, another processor 3328 * could be setting PG_M and/or PG_A concurrently. 3329 */ 3330 pmap_clear_bits(l3, PTE_SW_WIRED); 3331 pmap->pm_stats.wired_count--; 3332 } 3333 } 3334 if (pv_lists_locked) 3335 rw_runlock(&pvh_global_lock); 3336 PMAP_UNLOCK(pmap); 3337 } 3338 3339 /* 3340 * Copy the range specified by src_addr/len 3341 * from the source map to the range dst_addr/len 3342 * in the destination map. 3343 * 3344 * This routine is only advisory and need not do anything. 
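 *
 * This implementation leaves pmap_copy() empty: copying the parent's
 * page tables on fork() is purely an optimization, and any mapping
 * that is not copied here is recreated on demand by vm_fault().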
3345 */ 3346 3347 void 3348 pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len, 3349 vm_offset_t src_addr) 3350 { 3351 3352 } 3353 3354 /* 3355 * pmap_zero_page zeros the specified hardware page by mapping 3356 * the page into KVM and using bzero to clear its contents. 3357 */ 3358 void 3359 pmap_zero_page(vm_page_t m) 3360 { 3361 vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); 3362 3363 pagezero((void *)va); 3364 } 3365 3366 /* 3367 * pmap_zero_page_area zeros the specified hardware page by mapping 3368 * the page into KVM and using bzero to clear its contents. 3369 * 3370 * off and size may not cover an area beyond a single hardware page. 3371 */ 3372 void 3373 pmap_zero_page_area(vm_page_t m, int off, int size) 3374 { 3375 vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); 3376 3377 if (off == 0 && size == PAGE_SIZE) 3378 pagezero((void *)va); 3379 else 3380 bzero((char *)va + off, size); 3381 } 3382 3383 /* 3384 * pmap_copy_page copies the specified (machine independent) 3385 * page by mapping the page into virtual memory and using 3386 * bcopy to copy the page, one machine dependent page at a 3387 * time. 3388 */ 3389 void 3390 pmap_copy_page(vm_page_t msrc, vm_page_t mdst) 3391 { 3392 vm_offset_t src = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(msrc)); 3393 vm_offset_t dst = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mdst)); 3394 3395 pagecopy((void *)src, (void *)dst); 3396 } 3397 3398 int unmapped_buf_allowed = 1; 3399 3400 void 3401 pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[], 3402 vm_offset_t b_offset, int xfersize) 3403 { 3404 void *a_cp, *b_cp; 3405 vm_page_t m_a, m_b; 3406 vm_paddr_t p_a, p_b; 3407 vm_offset_t a_pg_offset, b_pg_offset; 3408 int cnt; 3409 3410 while (xfersize > 0) { 3411 a_pg_offset = a_offset & PAGE_MASK; 3412 m_a = ma[a_offset >> PAGE_SHIFT]; 3413 p_a = m_a->phys_addr; 3414 b_pg_offset = b_offset & PAGE_MASK; 3415 m_b = mb[b_offset >> PAGE_SHIFT]; 3416 p_b = m_b->phys_addr; 3417 cnt = min(xfersize, PAGE_SIZE - a_pg_offset); 3418 cnt = min(cnt, PAGE_SIZE - b_pg_offset); 3419 if (__predict_false(!PHYS_IN_DMAP(p_a))) { 3420 panic("!DMAP a %lx", p_a); 3421 } else { 3422 a_cp = (char *)PHYS_TO_DMAP(p_a) + a_pg_offset; 3423 } 3424 if (__predict_false(!PHYS_IN_DMAP(p_b))) { 3425 panic("!DMAP b %lx", p_b); 3426 } else { 3427 b_cp = (char *)PHYS_TO_DMAP(p_b) + b_pg_offset; 3428 } 3429 bcopy(a_cp, b_cp, cnt); 3430 a_offset += cnt; 3431 b_offset += cnt; 3432 xfersize -= cnt; 3433 } 3434 } 3435 3436 vm_offset_t 3437 pmap_quick_enter_page(vm_page_t m) 3438 { 3439 3440 return (PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m))); 3441 } 3442 3443 void 3444 pmap_quick_remove_page(vm_offset_t addr) 3445 { 3446 } 3447 3448 /* 3449 * Returns true if the pmap's pv is one of the first 3450 * 16 pvs linked to from this page. This count may 3451 * be changed upwards or downwards in the future; it 3452 * is only necessary that true be returned for a small 3453 * subset of pmaps for proper page aging. 
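 *
 * The 16-entry limit bounds the time spent holding the page's pv
 * list lock. Both the 4KB pv list and, for ordinary pages, the
 * containing 2MB page's pv list are consulted, since a superpage
 * mapping is recorded only in the latter.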
3454 */ 3455 boolean_t 3456 pmap_page_exists_quick(pmap_t pmap, vm_page_t m) 3457 { 3458 struct md_page *pvh; 3459 struct rwlock *lock; 3460 pv_entry_t pv; 3461 int loops = 0; 3462 boolean_t rv; 3463 3464 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 3465 ("pmap_page_exists_quick: page %p is not managed", m)); 3466 rv = FALSE; 3467 rw_rlock(&pvh_global_lock); 3468 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 3469 rw_rlock(lock); 3470 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 3471 if (PV_PMAP(pv) == pmap) { 3472 rv = TRUE; 3473 break; 3474 } 3475 loops++; 3476 if (loops >= 16) 3477 break; 3478 } 3479 if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) { 3480 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 3481 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 3482 if (PV_PMAP(pv) == pmap) { 3483 rv = TRUE; 3484 break; 3485 } 3486 loops++; 3487 if (loops >= 16) 3488 break; 3489 } 3490 } 3491 rw_runlock(lock); 3492 rw_runlock(&pvh_global_lock); 3493 return (rv); 3494 } 3495 3496 /* 3497 * pmap_page_wired_mappings: 3498 * 3499 * Return the number of managed mappings to the given physical page 3500 * that are wired. 3501 */ 3502 int 3503 pmap_page_wired_mappings(vm_page_t m) 3504 { 3505 struct md_page *pvh; 3506 struct rwlock *lock; 3507 pmap_t pmap; 3508 pd_entry_t *l2; 3509 pt_entry_t *l3; 3510 pv_entry_t pv; 3511 int count, md_gen, pvh_gen; 3512 3513 if ((m->oflags & VPO_UNMANAGED) != 0) 3514 return (0); 3515 rw_rlock(&pvh_global_lock); 3516 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 3517 rw_rlock(lock); 3518 restart: 3519 count = 0; 3520 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 3521 pmap = PV_PMAP(pv); 3522 if (!PMAP_TRYLOCK(pmap)) { 3523 md_gen = m->md.pv_gen; 3524 rw_runlock(lock); 3525 PMAP_LOCK(pmap); 3526 rw_rlock(lock); 3527 if (md_gen != m->md.pv_gen) { 3528 PMAP_UNLOCK(pmap); 3529 goto restart; 3530 } 3531 } 3532 l2 = pmap_l2(pmap, pv->pv_va); 3533 KASSERT((pmap_load(l2) & PTE_RWX) == 0, 3534 ("%s: found a 2mpage in page %p's pv list", __func__, m)); 3535 l3 = pmap_l2_to_l3(l2, pv->pv_va); 3536 if ((pmap_load(l3) & PTE_SW_WIRED) != 0) 3537 count++; 3538 PMAP_UNLOCK(pmap); 3539 } 3540 if ((m->flags & PG_FICTITIOUS) == 0) { 3541 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 3542 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 3543 pmap = PV_PMAP(pv); 3544 if (!PMAP_TRYLOCK(pmap)) { 3545 md_gen = m->md.pv_gen; 3546 pvh_gen = pvh->pv_gen; 3547 rw_runlock(lock); 3548 PMAP_LOCK(pmap); 3549 rw_rlock(lock); 3550 if (md_gen != m->md.pv_gen || 3551 pvh_gen != pvh->pv_gen) { 3552 PMAP_UNLOCK(pmap); 3553 goto restart; 3554 } 3555 } 3556 l2 = pmap_l2(pmap, pv->pv_va); 3557 if ((pmap_load(l2) & PTE_SW_WIRED) != 0) 3558 count++; 3559 PMAP_UNLOCK(pmap); 3560 } 3561 } 3562 rw_runlock(lock); 3563 rw_runlock(&pvh_global_lock); 3564 return (count); 3565 } 3566 3567 /* 3568 * Returns true if the given page is mapped individually or as part of 3569 * a 2mpage. Otherwise, returns false. 
3570 */ 3571 bool 3572 pmap_page_is_mapped(vm_page_t m) 3573 { 3574 struct rwlock *lock; 3575 bool rv; 3576 3577 if ((m->oflags & VPO_UNMANAGED) != 0) 3578 return (false); 3579 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 3580 rw_rlock(lock); 3581 rv = !TAILQ_EMPTY(&m->md.pv_list) || 3582 ((m->flags & PG_FICTITIOUS) == 0 && 3583 !TAILQ_EMPTY(&pa_to_pvh(VM_PAGE_TO_PHYS(m))->pv_list)); 3584 rw_runlock(lock); 3585 return (rv); 3586 } 3587 3588 static void 3589 pmap_remove_pages_pv(pmap_t pmap, vm_page_t m, pv_entry_t pv, 3590 struct spglist *free, bool superpage) 3591 { 3592 struct md_page *pvh; 3593 vm_page_t mpte, mt; 3594 3595 if (superpage) { 3596 pmap_resident_count_dec(pmap, Ln_ENTRIES); 3597 pvh = pa_to_pvh(m->phys_addr); 3598 TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); 3599 pvh->pv_gen++; 3600 if (TAILQ_EMPTY(&pvh->pv_list)) { 3601 for (mt = m; mt < &m[Ln_ENTRIES]; mt++) 3602 if (TAILQ_EMPTY(&mt->md.pv_list) && 3603 (mt->a.flags & PGA_WRITEABLE) != 0) 3604 vm_page_aflag_clear(mt, PGA_WRITEABLE); 3605 } 3606 mpte = pmap_remove_pt_page(pmap, pv->pv_va); 3607 if (mpte != NULL) { 3608 KASSERT(mpte->valid == VM_PAGE_BITS_ALL, 3609 ("pmap_remove_pages: pte page not promoted")); 3610 pmap_resident_count_dec(pmap, 1); 3611 KASSERT(mpte->ref_count == Ln_ENTRIES, 3612 ("pmap_remove_pages: pte page ref count error")); 3613 mpte->ref_count = 0; 3614 pmap_add_delayed_free_list(mpte, free, FALSE); 3615 } 3616 } else { 3617 pmap_resident_count_dec(pmap, 1); 3618 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 3619 m->md.pv_gen++; 3620 if (TAILQ_EMPTY(&m->md.pv_list) && 3621 (m->a.flags & PGA_WRITEABLE) != 0) { 3622 pvh = pa_to_pvh(m->phys_addr); 3623 if (TAILQ_EMPTY(&pvh->pv_list)) 3624 vm_page_aflag_clear(m, PGA_WRITEABLE); 3625 } 3626 } 3627 } 3628 3629 /* 3630 * Destroy all managed, non-wired mappings in the given user-space 3631 * pmap. This pmap cannot be active on any processor besides the 3632 * caller. 3633 * 3634 * This function cannot be applied to the kernel pmap. Moreover, it 3635 * is not intended for general use. It is only to be used during 3636 * process termination. Consequently, it can be implemented in ways 3637 * that make it faster than pmap_remove(). First, it can more quickly 3638 * destroy mappings by iterating over the pmap's collection of PV 3639 * entries, rather than searching the page table. Second, it doesn't 3640 * have to test and clear the page table entries atomically, because 3641 * no processor is currently accessing the user address space. In 3642 * particular, a page table entry's dirty bit won't change state once 3643 * this function starts. 
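 *
 * A further benefit of walking the PV chunks directly is that any
 * chunk whose entries are all freed here can be released at once
 * via free_pv_chunk(), instead of one PV entry at a time.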
3644 */ 3645 void 3646 pmap_remove_pages(pmap_t pmap) 3647 { 3648 struct spglist free; 3649 pd_entry_t ptepde; 3650 pt_entry_t *pte, tpte; 3651 vm_page_t m, mt; 3652 pv_entry_t pv; 3653 struct pv_chunk *pc, *npc; 3654 struct rwlock *lock; 3655 int64_t bit; 3656 uint64_t inuse, bitmask; 3657 int allfree, field, freed, idx; 3658 bool superpage; 3659 3660 lock = NULL; 3661 3662 SLIST_INIT(&free); 3663 rw_rlock(&pvh_global_lock); 3664 PMAP_LOCK(pmap); 3665 TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) { 3666 allfree = 1; 3667 freed = 0; 3668 for (field = 0; field < _NPCM; field++) { 3669 inuse = ~pc->pc_map[field] & pc_freemask[field]; 3670 while (inuse != 0) { 3671 bit = ffsl(inuse) - 1; 3672 bitmask = 1UL << bit; 3673 idx = field * 64 + bit; 3674 pv = &pc->pc_pventry[idx]; 3675 inuse &= ~bitmask; 3676 3677 pte = pmap_l1(pmap, pv->pv_va); 3678 ptepde = pmap_load(pte); 3679 pte = pmap_l1_to_l2(pte, pv->pv_va); 3680 tpte = pmap_load(pte); 3681 if ((tpte & PTE_RWX) != 0) { 3682 superpage = true; 3683 } else { 3684 ptepde = tpte; 3685 pte = pmap_l2_to_l3(pte, pv->pv_va); 3686 tpte = pmap_load(pte); 3687 superpage = false; 3688 } 3689 3690 /* 3691 * We cannot remove wired pages from a 3692 * process' mapping at this time. 3693 */ 3694 if (tpte & PTE_SW_WIRED) { 3695 allfree = 0; 3696 continue; 3697 } 3698 3699 m = PHYS_TO_VM_PAGE(PTE_TO_PHYS(tpte)); 3700 KASSERT((m->flags & PG_FICTITIOUS) != 0 || 3701 m < &vm_page_array[vm_page_array_size], 3702 ("pmap_remove_pages: bad pte %#jx", 3703 (uintmax_t)tpte)); 3704 3705 pmap_clear(pte); 3706 3707 /* 3708 * Update the vm_page_t clean/reference bits. 3709 */ 3710 if ((tpte & (PTE_D | PTE_W)) == 3711 (PTE_D | PTE_W)) { 3712 if (superpage) 3713 for (mt = m; 3714 mt < &m[Ln_ENTRIES]; mt++) 3715 vm_page_dirty(mt); 3716 else 3717 vm_page_dirty(m); 3718 } 3719 3720 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m); 3721 3722 /* Mark free */ 3723 pc->pc_map[field] |= bitmask; 3724 3725 pmap_remove_pages_pv(pmap, m, pv, &free, 3726 superpage); 3727 pmap_unuse_pt(pmap, pv->pv_va, ptepde, &free); 3728 freed++; 3729 } 3730 } 3731 PV_STAT(atomic_add_long(&pv_entry_frees, freed)); 3732 PV_STAT(atomic_add_int(&pv_entry_spare, freed)); 3733 PV_STAT(atomic_subtract_long(&pv_entry_count, freed)); 3734 if (allfree) { 3735 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 3736 free_pv_chunk(pc); 3737 } 3738 } 3739 if (lock != NULL) 3740 rw_wunlock(lock); 3741 pmap_invalidate_all(pmap); 3742 rw_runlock(&pvh_global_lock); 3743 PMAP_UNLOCK(pmap); 3744 vm_page_free_pages_toq(&free, false); 3745 } 3746 3747 static bool 3748 pmap_page_test_mappings(vm_page_t m, boolean_t accessed, boolean_t modified) 3749 { 3750 struct md_page *pvh; 3751 struct rwlock *lock; 3752 pd_entry_t *l2; 3753 pt_entry_t *l3, mask; 3754 pv_entry_t pv; 3755 pmap_t pmap; 3756 int md_gen, pvh_gen; 3757 bool rv; 3758 3759 mask = 0; 3760 if (modified) 3761 mask |= PTE_D; 3762 if (accessed) 3763 mask |= PTE_A; 3764 3765 rv = FALSE; 3766 rw_rlock(&pvh_global_lock); 3767 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 3768 rw_rlock(lock); 3769 restart: 3770 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 3771 pmap = PV_PMAP(pv); 3772 if (!PMAP_TRYLOCK(pmap)) { 3773 md_gen = m->md.pv_gen; 3774 rw_runlock(lock); 3775 PMAP_LOCK(pmap); 3776 rw_rlock(lock); 3777 if (md_gen != m->md.pv_gen) { 3778 PMAP_UNLOCK(pmap); 3779 goto restart; 3780 } 3781 } 3782 l2 = pmap_l2(pmap, pv->pv_va); 3783 KASSERT((pmap_load(l2) & PTE_RWX) == 0, 3784 ("%s: found a 2mpage in page %p's pv list", __func__, m)); 3785 l3 = pmap_l2_to_l3(l2, pv->pv_va); 3786 rv 
= (pmap_load(l3) & mask) == mask; 3787 PMAP_UNLOCK(pmap); 3788 if (rv) 3789 goto out; 3790 } 3791 if ((m->flags & PG_FICTITIOUS) == 0) { 3792 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 3793 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 3794 pmap = PV_PMAP(pv); 3795 if (!PMAP_TRYLOCK(pmap)) { 3796 md_gen = m->md.pv_gen; 3797 pvh_gen = pvh->pv_gen; 3798 rw_runlock(lock); 3799 PMAP_LOCK(pmap); 3800 rw_rlock(lock); 3801 if (md_gen != m->md.pv_gen || 3802 pvh_gen != pvh->pv_gen) { 3803 PMAP_UNLOCK(pmap); 3804 goto restart; 3805 } 3806 } 3807 l2 = pmap_l2(pmap, pv->pv_va); 3808 rv = (pmap_load(l2) & mask) == mask; 3809 PMAP_UNLOCK(pmap); 3810 if (rv) 3811 goto out; 3812 } 3813 } 3814 out: 3815 rw_runlock(lock); 3816 rw_runlock(&pvh_global_lock); 3817 return (rv); 3818 } 3819 3820 /* 3821 * pmap_is_modified: 3822 * 3823 * Return whether or not the specified physical page was modified 3824 * in any physical maps. 3825 */ 3826 boolean_t 3827 pmap_is_modified(vm_page_t m) 3828 { 3829 3830 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 3831 ("pmap_is_modified: page %p is not managed", m)); 3832 3833 /* 3834 * If the page is not busied then this check is racy. 3835 */ 3836 if (!pmap_page_is_write_mapped(m)) 3837 return (FALSE); 3838 return (pmap_page_test_mappings(m, FALSE, TRUE)); 3839 } 3840 3841 /* 3842 * pmap_is_prefaultable: 3843 * 3844 * Return whether or not the specified virtual address is eligible 3845 * for prefault. 3846 */ 3847 boolean_t 3848 pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr) 3849 { 3850 pt_entry_t *l3; 3851 boolean_t rv; 3852 3853 /* 3854 * Return TRUE if and only if the L3 entry for the specified virtual 3855 * address is allocated but invalid. 3856 */ 3857 rv = FALSE; 3858 PMAP_LOCK(pmap); 3859 l3 = pmap_l3(pmap, addr); 3860 if (l3 != NULL && pmap_load(l3) == 0) { 3861 rv = TRUE; 3862 } 3863 PMAP_UNLOCK(pmap); 3864 return (rv); 3865 } 3866 3867 /* 3868 * pmap_is_referenced: 3869 * 3870 * Return whether or not the specified physical page was referenced 3871 * in any physical maps. 3872 */ 3873 boolean_t 3874 pmap_is_referenced(vm_page_t m) 3875 { 3876 3877 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 3878 ("pmap_is_referenced: page %p is not managed", m)); 3879 return (pmap_page_test_mappings(m, TRUE, FALSE)); 3880 } 3881 3882 /* 3883 * Clear the write and modified bits in each of the given page's mappings. 3884 */ 3885 void 3886 pmap_remove_write(vm_page_t m) 3887 { 3888 struct md_page *pvh; 3889 struct rwlock *lock; 3890 pmap_t pmap; 3891 pd_entry_t *l2; 3892 pt_entry_t *l3, oldl3, newl3; 3893 pv_entry_t next_pv, pv; 3894 vm_offset_t va; 3895 int md_gen, pvh_gen; 3896 3897 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 3898 ("pmap_remove_write: page %p is not managed", m)); 3899 vm_page_assert_busied(m); 3900 3901 if (!pmap_page_is_write_mapped(m)) 3902 return; 3903 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 3904 pvh = (m->flags & PG_FICTITIOUS) != 0 ? 
&pv_dummy : 3905 pa_to_pvh(VM_PAGE_TO_PHYS(m)); 3906 rw_rlock(&pvh_global_lock); 3907 retry_pv_loop: 3908 rw_wlock(lock); 3909 TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { 3910 pmap = PV_PMAP(pv); 3911 if (!PMAP_TRYLOCK(pmap)) { 3912 pvh_gen = pvh->pv_gen; 3913 rw_wunlock(lock); 3914 PMAP_LOCK(pmap); 3915 rw_wlock(lock); 3916 if (pvh_gen != pvh->pv_gen) { 3917 PMAP_UNLOCK(pmap); 3918 rw_wunlock(lock); 3919 goto retry_pv_loop; 3920 } 3921 } 3922 va = pv->pv_va; 3923 l2 = pmap_l2(pmap, va); 3924 if ((pmap_load(l2) & PTE_W) != 0) 3925 (void)pmap_demote_l2_locked(pmap, l2, va, &lock); 3926 KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m), 3927 ("inconsistent pv lock %p %p for page %p", 3928 lock, VM_PAGE_TO_PV_LIST_LOCK(m), m)); 3929 PMAP_UNLOCK(pmap); 3930 } 3931 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 3932 pmap = PV_PMAP(pv); 3933 if (!PMAP_TRYLOCK(pmap)) { 3934 pvh_gen = pvh->pv_gen; 3935 md_gen = m->md.pv_gen; 3936 rw_wunlock(lock); 3937 PMAP_LOCK(pmap); 3938 rw_wlock(lock); 3939 if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { 3940 PMAP_UNLOCK(pmap); 3941 rw_wunlock(lock); 3942 goto retry_pv_loop; 3943 } 3944 } 3945 l2 = pmap_l2(pmap, pv->pv_va); 3946 KASSERT((pmap_load(l2) & PTE_RWX) == 0, 3947 ("%s: found a 2mpage in page %p's pv list", __func__, m)); 3948 l3 = pmap_l2_to_l3(l2, pv->pv_va); 3949 oldl3 = pmap_load(l3); 3950 retry: 3951 if ((oldl3 & PTE_W) != 0) { 3952 newl3 = oldl3 & ~(PTE_D | PTE_W); 3953 if (!atomic_fcmpset_long(l3, &oldl3, newl3)) 3954 goto retry; 3955 if ((oldl3 & PTE_D) != 0) 3956 vm_page_dirty(m); 3957 pmap_invalidate_page(pmap, pv->pv_va); 3958 } 3959 PMAP_UNLOCK(pmap); 3960 } 3961 rw_wunlock(lock); 3962 vm_page_aflag_clear(m, PGA_WRITEABLE); 3963 rw_runlock(&pvh_global_lock); 3964 } 3965 3966 /* 3967 * pmap_ts_referenced: 3968 * 3969 * Return a count of reference bits for a page, clearing those bits. 3970 * It is not necessary for every reference bit to be cleared, but it 3971 * is necessary that 0 only be returned when there are truly no 3972 * reference bits set. 3973 * 3974 * As an optimization, update the page's dirty field if a modified bit is 3975 * found while counting reference bits. This opportunistic update can be 3976 * performed at low cost and can eliminate the need for some future calls 3977 * to pmap_is_modified(). However, since this function stops after 3978 * finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some 3979 * dirty pages. Those dirty pages will only be detected by a future call 3980 * to pmap_is_modified(). 3981 */ 3982 int 3983 pmap_ts_referenced(vm_page_t m) 3984 { 3985 struct spglist free; 3986 struct md_page *pvh; 3987 struct rwlock *lock; 3988 pv_entry_t pv, pvf; 3989 pmap_t pmap; 3990 pd_entry_t *l2, l2e; 3991 pt_entry_t *l3, l3e; 3992 vm_paddr_t pa; 3993 vm_offset_t va; 3994 int cleared, md_gen, not_cleared, pvh_gen; 3995 3996 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 3997 ("pmap_ts_referenced: page %p is not managed", m)); 3998 SLIST_INIT(&free); 3999 cleared = 0; 4000 pa = VM_PAGE_TO_PHYS(m); 4001 pvh = (m->flags & PG_FICTITIOUS) != 0 ? 
&pv_dummy : pa_to_pvh(pa); 4002 4003 lock = PHYS_TO_PV_LIST_LOCK(pa); 4004 rw_rlock(&pvh_global_lock); 4005 rw_wlock(lock); 4006 retry: 4007 not_cleared = 0; 4008 if ((pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL) 4009 goto small_mappings; 4010 pv = pvf; 4011 do { 4012 pmap = PV_PMAP(pv); 4013 if (!PMAP_TRYLOCK(pmap)) { 4014 pvh_gen = pvh->pv_gen; 4015 rw_wunlock(lock); 4016 PMAP_LOCK(pmap); 4017 rw_wlock(lock); 4018 if (pvh_gen != pvh->pv_gen) { 4019 PMAP_UNLOCK(pmap); 4020 goto retry; 4021 } 4022 } 4023 va = pv->pv_va; 4024 l2 = pmap_l2(pmap, va); 4025 l2e = pmap_load(l2); 4026 if ((l2e & (PTE_W | PTE_D)) == (PTE_W | PTE_D)) { 4027 /* 4028 * Although l2e is mapping a 2MB page, because 4029 * this function is called at a 4KB page granularity, 4030 * we only update the 4KB page under test. 4031 */ 4032 vm_page_dirty(m); 4033 } 4034 if ((l2e & PTE_A) != 0) { 4035 /* 4036 * Since this reference bit is shared by 512 4KB 4037 * pages, it should not be cleared every time it is 4038 * tested. Apply a simple "hash" function on the 4039 * physical page number, the virtual superpage number, 4040 * and the pmap address to select one 4KB page out of 4041 * the 512 on which testing the reference bit will 4042 * result in clearing that reference bit. This 4043 * function is designed to avoid the selection of the 4044 * same 4KB page for every 2MB page mapping. 4045 * 4046 * On demotion, a mapping that hasn't been referenced 4047 * is simply destroyed. To avoid the possibility of a 4048 * subsequent page fault on a demoted wired mapping, 4049 * always leave its reference bit set. Moreover, 4050 * since the superpage is wired, the current state of 4051 * its reference bit won't affect page replacement. 4052 */ 4053 if ((((pa >> PAGE_SHIFT) ^ (pv->pv_va >> L2_SHIFT) ^ 4054 (uintptr_t)pmap) & (Ln_ENTRIES - 1)) == 0 && 4055 (l2e & PTE_SW_WIRED) == 0) { 4056 pmap_clear_bits(l2, PTE_A); 4057 pmap_invalidate_page(pmap, va); 4058 cleared++; 4059 } else 4060 not_cleared++; 4061 } 4062 PMAP_UNLOCK(pmap); 4063 /* Rotate the PV list if it has more than one entry. */ 4064 if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) { 4065 TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); 4066 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); 4067 pvh->pv_gen++; 4068 } 4069 if (cleared + not_cleared >= PMAP_TS_REFERENCED_MAX) 4070 goto out; 4071 } while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf); 4072 small_mappings: 4073 if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL) 4074 goto out; 4075 pv = pvf; 4076 do { 4077 pmap = PV_PMAP(pv); 4078 if (!PMAP_TRYLOCK(pmap)) { 4079 pvh_gen = pvh->pv_gen; 4080 md_gen = m->md.pv_gen; 4081 rw_wunlock(lock); 4082 PMAP_LOCK(pmap); 4083 rw_wlock(lock); 4084 if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { 4085 PMAP_UNLOCK(pmap); 4086 goto retry; 4087 } 4088 } 4089 l2 = pmap_l2(pmap, pv->pv_va); 4090 4091 KASSERT((pmap_load(l2) & PTE_RX) == 0, 4092 ("pmap_ts_referenced: found an invalid l2 table")); 4093 4094 l3 = pmap_l2_to_l3(l2, pv->pv_va); 4095 l3e = pmap_load(l3); 4096 if ((l3e & PTE_D) != 0) 4097 vm_page_dirty(m); 4098 if ((l3e & PTE_A) != 0) { 4099 if ((l3e & PTE_SW_WIRED) == 0) { 4100 /* 4101 * Wired pages cannot be paged out so 4102 * doing accessed bit emulation for 4103 * them is wasted effort. We do the 4104 * hard work for unwired pages only. 4105 */ 4106 pmap_clear_bits(l3, PTE_A); 4107 pmap_invalidate_page(pmap, pv->pv_va); 4108 cleared++; 4109 } else 4110 not_cleared++; 4111 } 4112 PMAP_UNLOCK(pmap); 4113 /* Rotate the PV list if it has more than one entry. 
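 * Moving the entry just examined to the tail means that a later call to
 * pmap_ts_referenced() on this page starts with a different mapping, so
 * the sampling of reference bits is spread across all of the page's
 * mappings rather than repeatedly stopping on the same ones once
 * PMAP_TS_REFERENCED_MAX is reached.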
*/ 4114 if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) { 4115 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 4116 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 4117 m->md.pv_gen++; 4118 } 4119 } while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && cleared + 4120 not_cleared < PMAP_TS_REFERENCED_MAX); 4121 out: 4122 rw_wunlock(lock); 4123 rw_runlock(&pvh_global_lock); 4124 vm_page_free_pages_toq(&free, false); 4125 return (cleared + not_cleared); 4126 } 4127 4128 /* 4129 * Apply the given advice to the specified range of addresses within the 4130 * given pmap. Depending on the advice, clear the referenced and/or 4131 * modified flags in each mapping and set the mapped page's dirty field. 4132 */ 4133 void 4134 pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice) 4135 { 4136 } 4137 4138 /* 4139 * Clear the modify bits on the specified physical page. 4140 */ 4141 void 4142 pmap_clear_modify(vm_page_t m) 4143 { 4144 struct md_page *pvh; 4145 struct rwlock *lock; 4146 pmap_t pmap; 4147 pv_entry_t next_pv, pv; 4148 pd_entry_t *l2, oldl2; 4149 pt_entry_t *l3; 4150 vm_offset_t va; 4151 int md_gen, pvh_gen; 4152 4153 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 4154 ("pmap_clear_modify: page %p is not managed", m)); 4155 vm_page_assert_busied(m); 4156 4157 if (!pmap_page_is_write_mapped(m)) 4158 return; 4159 4160 /* 4161 * If the page is not PGA_WRITEABLE, then no PTEs can have PG_M set. 4162 * If the object containing the page is locked and the page is not 4163 * exclusive busied, then PGA_WRITEABLE cannot be concurrently set. 4164 */ 4165 if ((m->a.flags & PGA_WRITEABLE) == 0) 4166 return; 4167 pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : 4168 pa_to_pvh(VM_PAGE_TO_PHYS(m)); 4169 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 4170 rw_rlock(&pvh_global_lock); 4171 rw_wlock(lock); 4172 restart: 4173 TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { 4174 pmap = PV_PMAP(pv); 4175 if (!PMAP_TRYLOCK(pmap)) { 4176 pvh_gen = pvh->pv_gen; 4177 rw_wunlock(lock); 4178 PMAP_LOCK(pmap); 4179 rw_wlock(lock); 4180 if (pvh_gen != pvh->pv_gen) { 4181 PMAP_UNLOCK(pmap); 4182 goto restart; 4183 } 4184 } 4185 va = pv->pv_va; 4186 l2 = pmap_l2(pmap, va); 4187 oldl2 = pmap_load(l2); 4188 /* If oldl2 has PTE_W set, then it also has PTE_D set. */ 4189 if ((oldl2 & PTE_W) != 0 && 4190 pmap_demote_l2_locked(pmap, l2, va, &lock) && 4191 (oldl2 & PTE_SW_WIRED) == 0) { 4192 /* 4193 * Write protect the mapping to a single page so that 4194 * a subsequent write access may repromote. 
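 * The demoted range still spans 2MB, so the adjustment of "va" below adds
 * m's offset from the superpage's physical base; only the 4KB entry that
 * maps m is then stripped of PTE_D and PTE_W, leaving the other entries
 * writeable and the range eligible for repromotion after a later write
 * fault.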
4195 */ 4196 va += VM_PAGE_TO_PHYS(m) - PTE_TO_PHYS(oldl2); 4197 l3 = pmap_l2_to_l3(l2, va); 4198 pmap_clear_bits(l3, PTE_D | PTE_W); 4199 vm_page_dirty(m); 4200 pmap_invalidate_page(pmap, va); 4201 } 4202 PMAP_UNLOCK(pmap); 4203 } 4204 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 4205 pmap = PV_PMAP(pv); 4206 if (!PMAP_TRYLOCK(pmap)) { 4207 md_gen = m->md.pv_gen; 4208 pvh_gen = pvh->pv_gen; 4209 rw_wunlock(lock); 4210 PMAP_LOCK(pmap); 4211 rw_wlock(lock); 4212 if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { 4213 PMAP_UNLOCK(pmap); 4214 goto restart; 4215 } 4216 } 4217 l2 = pmap_l2(pmap, pv->pv_va); 4218 KASSERT((pmap_load(l2) & PTE_RWX) == 0, 4219 ("%s: found a 2mpage in page %p's pv list", __func__, m)); 4220 l3 = pmap_l2_to_l3(l2, pv->pv_va); 4221 if ((pmap_load(l3) & (PTE_D | PTE_W)) == (PTE_D | PTE_W)) { 4222 pmap_clear_bits(l3, PTE_D | PTE_W); 4223 pmap_invalidate_page(pmap, pv->pv_va); 4224 } 4225 PMAP_UNLOCK(pmap); 4226 } 4227 rw_wunlock(lock); 4228 rw_runlock(&pvh_global_lock); 4229 } 4230 4231 void * 4232 pmap_mapbios(vm_paddr_t pa, vm_size_t size) 4233 { 4234 4235 return ((void *)PHYS_TO_DMAP(pa)); 4236 } 4237 4238 void 4239 pmap_unmapbios(vm_paddr_t pa, vm_size_t size) 4240 { 4241 } 4242 4243 /* 4244 * Sets the memory attribute for the specified page. 4245 */ 4246 void 4247 pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma) 4248 { 4249 4250 m->md.pv_memattr = ma; 4251 4252 /* 4253 * If "m" is a normal page, update its direct mapping. This update 4254 * can be relied upon to perform any cache operations that are 4255 * required for data coherence. 4256 */ 4257 if ((m->flags & PG_FICTITIOUS) == 0 && 4258 pmap_change_attr(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)), PAGE_SIZE, 4259 m->md.pv_memattr) != 0) 4260 panic("memory attribute change on the direct map failed"); 4261 } 4262 4263 /* 4264 * Changes the specified virtual address range's memory type to that given by 4265 * the parameter "mode". The specified virtual address range must be 4266 * completely contained within either the direct map or the kernel map. 4267 * 4268 * Returns zero if the change completed successfully, and either EINVAL or 4269 * ENOMEM if the change failed. Specifically, EINVAL is returned if some part 4270 * of the virtual address range was not mapped, and ENOMEM is returned if 4271 * there was insufficient memory available to complete the change. In the 4272 * latter case, the memory type may have been changed on some part of the 4273 * virtual address range. 
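 *
 * As a sketch only (pa and len stand for a caller's physical address and
 * length; the memory attribute chosen is arbitrary), a user of this
 * interface on a direct-mapped range might do:
 *
 *	error = pmap_change_attr(PHYS_TO_DMAP(pa), len, VM_MEMATTR_DEVICE);
 *	if (error != 0)
 *		printf("pmap_change_attr failed: %d\n", error);
 *
 * Until the Svpbmt extension is supported (see the TODOs in
 * pmap_change_attr_locked()), the locked helper only validates that the
 * range is mapped; it does not yet rewrite any PTE attributes.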
4274 */ 4275 int 4276 pmap_change_attr(vm_offset_t va, vm_size_t size, int mode) 4277 { 4278 int error; 4279 4280 PMAP_LOCK(kernel_pmap); 4281 error = pmap_change_attr_locked(va, size, mode); 4282 PMAP_UNLOCK(kernel_pmap); 4283 return (error); 4284 } 4285 4286 static int 4287 pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode) 4288 { 4289 vm_offset_t base, offset, tmpva; 4290 pd_entry_t *l1, l1e; 4291 pd_entry_t *l2, l2e; 4292 pt_entry_t *l3, l3e; 4293 4294 PMAP_LOCK_ASSERT(kernel_pmap, MA_OWNED); 4295 base = trunc_page(va); 4296 offset = va & PAGE_MASK; 4297 size = round_page(offset + size); 4298 4299 if (!VIRT_IN_DMAP(base) && 4300 !(base >= VM_MIN_KERNEL_ADDRESS && base < VM_MAX_KERNEL_ADDRESS)) 4301 return (EINVAL); 4302 4303 for (tmpva = base; tmpva < base + size; ) { 4304 l1 = pmap_l1(kernel_pmap, tmpva); 4305 if (l1 == NULL || ((l1e = pmap_load(l1)) & PTE_V) == 0) 4306 return (EINVAL); 4307 if ((l1e & PTE_RWX) != 0) { 4308 /* 4309 * TODO: Demote if attributes don't match and there 4310 * isn't an L1 page left in the range, and update the 4311 * L1 entry if the attributes don't match but there is 4312 * an L1 page left in the range, once we support the 4313 * upcoming Svpbmt extension. 4314 */ 4315 tmpva = (tmpva & ~L1_OFFSET) + L1_SIZE; 4316 continue; 4317 } 4318 l2 = pmap_l1_to_l2(l1, tmpva); 4319 if (l2 == NULL || ((l2e = pmap_load(l2)) & PTE_V) == 0) 4320 return (EINVAL); 4321 if ((l2e & PTE_RWX) != 0) { 4322 /* 4323 * TODO: Demote if attributes don't match and there 4324 * isn't an L2 page left in the range, and update the 4325 * L2 entry if the attributes don't match but there is 4326 * an L2 page left in the range, once we support the 4327 * upcoming Svpbmt extension. 4328 */ 4329 tmpva = (tmpva & ~L2_OFFSET) + L2_SIZE; 4330 continue; 4331 } 4332 l3 = pmap_l2_to_l3(l2, tmpva); 4333 if (l3 == NULL || ((l3e = pmap_load(l3)) & PTE_V) == 0) 4334 return (EINVAL); 4335 /* 4336 * TODO: Update the L3 entry if the attributes don't match once 4337 * we support the upcoming Svpbmt extension. 4338 */ 4339 tmpva += PAGE_SIZE; 4340 } 4341 4342 return (0); 4343 } 4344 4345 /* 4346 * Perform the pmap work for mincore(2). If the page is not both referenced and 4347 * modified by this pmap, returns its physical address so that the caller can 4348 * find other mappings. 
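 * A 2MB superpage mapping is reported as MINCORE_INCORE | MINCORE_PSIND(1),
 * and *pap is only written for managed mappings, since only managed pages
 * have the pv lists needed to locate their other mappings.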
4349 */ 4350 int 4351 pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *pap) 4352 { 4353 pt_entry_t *l2, *l3, tpte; 4354 vm_paddr_t pa; 4355 int val; 4356 bool managed; 4357 4358 PMAP_LOCK(pmap); 4359 l2 = pmap_l2(pmap, addr); 4360 if (l2 != NULL && ((tpte = pmap_load(l2)) & PTE_V) != 0) { 4361 if ((tpte & PTE_RWX) != 0) { 4362 pa = PTE_TO_PHYS(tpte) | (addr & L2_OFFSET); 4363 val = MINCORE_INCORE | MINCORE_PSIND(1); 4364 } else { 4365 l3 = pmap_l2_to_l3(l2, addr); 4366 tpte = pmap_load(l3); 4367 if ((tpte & PTE_V) == 0) { 4368 PMAP_UNLOCK(pmap); 4369 return (0); 4370 } 4371 pa = PTE_TO_PHYS(tpte) | (addr & L3_OFFSET); 4372 val = MINCORE_INCORE; 4373 } 4374 4375 if ((tpte & PTE_D) != 0) 4376 val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER; 4377 if ((tpte & PTE_A) != 0) 4378 val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER; 4379 managed = (tpte & PTE_SW_MANAGED) == PTE_SW_MANAGED; 4380 } else { 4381 managed = false; 4382 val = 0; 4383 } 4384 if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) != 4385 (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) && managed) { 4386 *pap = pa; 4387 } 4388 PMAP_UNLOCK(pmap); 4389 return (val); 4390 } 4391 4392 void 4393 pmap_activate_sw(struct thread *td) 4394 { 4395 pmap_t oldpmap, pmap; 4396 u_int hart; 4397 4398 oldpmap = PCPU_GET(curpmap); 4399 pmap = vmspace_pmap(td->td_proc->p_vmspace); 4400 if (pmap == oldpmap) 4401 return; 4402 load_satp(pmap->pm_satp); 4403 4404 hart = PCPU_GET(hart); 4405 #ifdef SMP 4406 CPU_SET_ATOMIC(hart, &pmap->pm_active); 4407 CPU_CLR_ATOMIC(hart, &oldpmap->pm_active); 4408 #else 4409 CPU_SET(hart, &pmap->pm_active); 4410 CPU_CLR(hart, &oldpmap->pm_active); 4411 #endif 4412 PCPU_SET(curpmap, pmap); 4413 4414 sfence_vma(); 4415 } 4416 4417 void 4418 pmap_activate(struct thread *td) 4419 { 4420 4421 critical_enter(); 4422 pmap_activate_sw(td); 4423 critical_exit(); 4424 } 4425 4426 void 4427 pmap_activate_boot(pmap_t pmap) 4428 { 4429 u_int hart; 4430 4431 hart = PCPU_GET(hart); 4432 #ifdef SMP 4433 CPU_SET_ATOMIC(hart, &pmap->pm_active); 4434 #else 4435 CPU_SET(hart, &pmap->pm_active); 4436 #endif 4437 PCPU_SET(curpmap, pmap); 4438 } 4439 4440 void 4441 pmap_sync_icache(pmap_t pmap, vm_offset_t va, vm_size_t sz) 4442 { 4443 cpuset_t mask; 4444 4445 /* 4446 * From the RISC-V User-Level ISA V2.2: 4447 * 4448 * "To make a store to instruction memory visible to all 4449 * RISC-V harts, the writing hart has to execute a data FENCE 4450 * before requesting that all remote RISC-V harts execute a 4451 * FENCE.I." 4452 * 4453 * However, this is slightly misleading; we still need to 4454 * perform a FENCE.I for the local hart, as FENCE does nothing 4455 * for its icache. FENCE.I alone is also sufficient for the 4456 * local hart. 4457 */ 4458 sched_pin(); 4459 mask = all_harts; 4460 CPU_CLR(PCPU_GET(hart), &mask); 4461 fence_i(); 4462 if (!CPU_EMPTY(&mask) && smp_started) { 4463 fence(); 4464 sbi_remote_fence_i(mask.__bits); 4465 } 4466 sched_unpin(); 4467 } 4468 4469 /* 4470 * Increase the starting virtual address of the given mapping if a 4471 * different alignment might result in more superpage mappings. 
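 * For example, with L2_SIZE == 2MB, suppose 4MB is being mapped and
 * (offset & L2_OFFSET) == 0x100000.  If (*addr & L2_OFFSET) is already
 * 0x100000 the address is left alone; if *addr is 2MB aligned it is
 * advanced by 0x100000, so that object offsets and virtual addresses
 * agree modulo L2_SIZE and fully backed 2MB chunks can later be promoted
 * to superpage mappings.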
4472 */ 4473 void 4474 pmap_align_superpage(vm_object_t object, vm_ooffset_t offset, 4475 vm_offset_t *addr, vm_size_t size) 4476 { 4477 vm_offset_t superpage_offset; 4478 4479 if (size < L2_SIZE) 4480 return; 4481 if (object != NULL && (object->flags & OBJ_COLORED) != 0) 4482 offset += ptoa(object->pg_color); 4483 superpage_offset = offset & L2_OFFSET; 4484 if (size - ((L2_SIZE - superpage_offset) & L2_OFFSET) < L2_SIZE || 4485 (*addr & L2_OFFSET) == superpage_offset) 4486 return; 4487 if ((*addr & L2_OFFSET) < superpage_offset) 4488 *addr = (*addr & ~L2_OFFSET) + superpage_offset; 4489 else 4490 *addr = ((*addr + L2_OFFSET) & ~L2_OFFSET) + superpage_offset; 4491 } 4492 4493 /** 4494 * Get the kernel virtual address of a set of physical pages. If there are 4495 * physical addresses not covered by the DMAP perform a transient mapping 4496 * that will be removed when calling pmap_unmap_io_transient. 4497 * 4498 * \param page The pages the caller wishes to obtain the virtual 4499 * address on the kernel memory map. 4500 * \param vaddr On return contains the kernel virtual memory address 4501 * of the pages passed in the page parameter. 4502 * \param count Number of pages passed in. 4503 * \param can_fault TRUE if the thread using the mapped pages can take 4504 * page faults, FALSE otherwise. 4505 * 4506 * \returns TRUE if the caller must call pmap_unmap_io_transient when 4507 * finished or FALSE otherwise. 4508 * 4509 */ 4510 boolean_t 4511 pmap_map_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count, 4512 boolean_t can_fault) 4513 { 4514 vm_paddr_t paddr; 4515 boolean_t needs_mapping; 4516 int error, i; 4517 4518 /* 4519 * Allocate any KVA space that we need, this is done in a separate 4520 * loop to prevent calling vmem_alloc while pinned. 4521 */ 4522 needs_mapping = FALSE; 4523 for (i = 0; i < count; i++) { 4524 paddr = VM_PAGE_TO_PHYS(page[i]); 4525 if (__predict_false(paddr >= DMAP_MAX_PHYSADDR)) { 4526 error = vmem_alloc(kernel_arena, PAGE_SIZE, 4527 M_BESTFIT | M_WAITOK, &vaddr[i]); 4528 KASSERT(error == 0, ("vmem_alloc failed: %d", error)); 4529 needs_mapping = TRUE; 4530 } else { 4531 vaddr[i] = PHYS_TO_DMAP(paddr); 4532 } 4533 } 4534 4535 /* Exit early if everything is covered by the DMAP */ 4536 if (!needs_mapping) 4537 return (FALSE); 4538 4539 if (!can_fault) 4540 sched_pin(); 4541 for (i = 0; i < count; i++) { 4542 paddr = VM_PAGE_TO_PHYS(page[i]); 4543 if (paddr >= DMAP_MAX_PHYSADDR) { 4544 panic( 4545 "pmap_map_io_transient: TODO: Map out of DMAP data"); 4546 } 4547 } 4548 4549 return (needs_mapping); 4550 } 4551 4552 void 4553 pmap_unmap_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count, 4554 boolean_t can_fault) 4555 { 4556 vm_paddr_t paddr; 4557 int i; 4558 4559 if (!can_fault) 4560 sched_unpin(); 4561 for (i = 0; i < count; i++) { 4562 paddr = VM_PAGE_TO_PHYS(page[i]); 4563 if (paddr >= DMAP_MAX_PHYSADDR) { 4564 panic("RISCVTODO: pmap_unmap_io_transient: Unmap data"); 4565 } 4566 } 4567 } 4568 4569 boolean_t 4570 pmap_is_valid_memattr(pmap_t pmap __unused, vm_memattr_t mode) 4571 { 4572 4573 return (mode >= VM_MEMATTR_DEVICE && mode <= VM_MEMATTR_WRITE_BACK); 4574 } 4575 4576 bool 4577 pmap_get_tables(pmap_t pmap, vm_offset_t va, pd_entry_t **l1, pd_entry_t **l2, 4578 pt_entry_t **l3) 4579 { 4580 pd_entry_t *l1p, *l2p; 4581 4582 /* Get l1 directory entry. 
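 * A leaf encountered at this level or at L2 (PTE_RX != 0) ends the walk
 * early, with the remaining output pointers set to NULL so the caller can
 * distinguish superpage mappings from 4KB ones.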
*/ 4583 l1p = pmap_l1(pmap, va); 4584 *l1 = l1p; 4585 4586 if (l1p == NULL || (pmap_load(l1p) & PTE_V) == 0) 4587 return (false); 4588 4589 if ((pmap_load(l1p) & PTE_RX) != 0) { 4590 *l2 = NULL; 4591 *l3 = NULL; 4592 return (true); 4593 } 4594 4595 /* Get l2 directory entry. */ 4596 l2p = pmap_l1_to_l2(l1p, va); 4597 *l2 = l2p; 4598 4599 if (l2p == NULL || (pmap_load(l2p) & PTE_V) == 0) 4600 return (false); 4601 4602 if ((pmap_load(l2p) & PTE_RX) != 0) { 4603 *l3 = NULL; 4604 return (true); 4605 } 4606 4607 /* Get l3 page table entry. */ 4608 *l3 = pmap_l2_to_l3(l2p, va); 4609 4610 return (true); 4611 } 4612 4613 /* 4614 * Track a range of the kernel's virtual address space that is contiguous 4615 * in various mapping attributes. 4616 */ 4617 struct pmap_kernel_map_range { 4618 vm_offset_t sva; 4619 pt_entry_t attrs; 4620 int l3pages; 4621 int l2pages; 4622 int l1pages; 4623 }; 4624 4625 static void 4626 sysctl_kmaps_dump(struct sbuf *sb, struct pmap_kernel_map_range *range, 4627 vm_offset_t eva) 4628 { 4629 4630 if (eva <= range->sva) 4631 return; 4632 4633 sbuf_printf(sb, "0x%016lx-0x%016lx r%c%c%c%c %d %d %d\n", 4634 range->sva, eva, 4635 (range->attrs & PTE_W) == PTE_W ? 'w' : '-', 4636 (range->attrs & PTE_X) == PTE_X ? 'x' : '-', 4637 (range->attrs & PTE_U) == PTE_U ? 'u' : 's', 4638 (range->attrs & PTE_G) == PTE_G ? 'g' : '-', 4639 range->l1pages, range->l2pages, range->l3pages); 4640 4641 /* Reset to sentinel value. */ 4642 range->sva = 0xfffffffffffffffful; 4643 } 4644 4645 /* 4646 * Determine whether the attributes specified by a page table entry match those 4647 * being tracked by the current range. 4648 */ 4649 static bool 4650 sysctl_kmaps_match(struct pmap_kernel_map_range *range, pt_entry_t attrs) 4651 { 4652 4653 return (range->attrs == attrs); 4654 } 4655 4656 static void 4657 sysctl_kmaps_reinit(struct pmap_kernel_map_range *range, vm_offset_t va, 4658 pt_entry_t attrs) 4659 { 4660 4661 memset(range, 0, sizeof(*range)); 4662 range->sva = va; 4663 range->attrs = attrs; 4664 } 4665 4666 /* 4667 * Given a leaf PTE, derive the mapping's attributes. If they do not match 4668 * those of the current run, dump the address range and its attributes, and 4669 * begin a new run. 4670 */ 4671 static void 4672 sysctl_kmaps_check(struct sbuf *sb, struct pmap_kernel_map_range *range, 4673 vm_offset_t va, pd_entry_t l1e, pd_entry_t l2e, pt_entry_t l3e) 4674 { 4675 pt_entry_t attrs; 4676 4677 /* The PTE global bit is inherited by lower levels. */ 4678 attrs = l1e & PTE_G; 4679 if ((l1e & PTE_RWX) != 0) 4680 attrs |= l1e & (PTE_RWX | PTE_U); 4681 else if (l2e != 0) 4682 attrs |= l2e & PTE_G; 4683 if ((l2e & PTE_RWX) != 0) 4684 attrs |= l2e & (PTE_RWX | PTE_U); 4685 else if (l3e != 0) 4686 attrs |= l3e & (PTE_RWX | PTE_U | PTE_G); 4687 4688 if (range->sva > va || !sysctl_kmaps_match(range, attrs)) { 4689 sysctl_kmaps_dump(sb, range, va); 4690 sysctl_kmaps_reinit(range, va, attrs); 4691 } 4692 } 4693 4694 static int 4695 sysctl_kmaps(SYSCTL_HANDLER_ARGS) 4696 { 4697 struct pmap_kernel_map_range range; 4698 struct sbuf sbuf, *sb; 4699 pd_entry_t l1e, *l2, l2e; 4700 pt_entry_t *l3, l3e; 4701 vm_offset_t sva; 4702 vm_paddr_t pa; 4703 int error, i, j, k; 4704 4705 error = sysctl_wire_old_buffer(req, 0); 4706 if (error != 0) 4707 return (error); 4708 sb = &sbuf; 4709 sbuf_new_for_sysctl(sb, NULL, PAGE_SIZE, req); 4710 4711 /* Sentinel value. */ 4712 range.sva = 0xfffffffffffffffful; 4713 4714 /* 4715 * Iterate over the kernel page tables without holding the kernel pmap 4716 * lock. 
Kernel page table pages are never freed, so at worst we will 4717 * observe inconsistencies in the output. 4718 */ 4719 sva = VM_MIN_KERNEL_ADDRESS; 4720 for (i = pmap_l1_index(sva); i < Ln_ENTRIES; i++) { 4721 if (i == pmap_l1_index(DMAP_MIN_ADDRESS)) 4722 sbuf_printf(sb, "\nDirect map:\n"); 4723 else if (i == pmap_l1_index(VM_MIN_KERNEL_ADDRESS)) 4724 sbuf_printf(sb, "\nKernel map:\n"); 4725 4726 l1e = kernel_pmap->pm_l1[i]; 4727 if ((l1e & PTE_V) == 0) { 4728 sysctl_kmaps_dump(sb, &range, sva); 4729 sva += L1_SIZE; 4730 continue; 4731 } 4732 if ((l1e & PTE_RWX) != 0) { 4733 sysctl_kmaps_check(sb, &range, sva, l1e, 0, 0); 4734 range.l1pages++; 4735 sva += L1_SIZE; 4736 continue; 4737 } 4738 pa = PTE_TO_PHYS(l1e); 4739 l2 = (pd_entry_t *)PHYS_TO_DMAP(pa); 4740 4741 for (j = pmap_l2_index(sva); j < Ln_ENTRIES; j++) { 4742 l2e = l2[j]; 4743 if ((l2e & PTE_V) == 0) { 4744 sysctl_kmaps_dump(sb, &range, sva); 4745 sva += L2_SIZE; 4746 continue; 4747 } 4748 if ((l2e & PTE_RWX) != 0) { 4749 sysctl_kmaps_check(sb, &range, sva, l1e, l2e, 0); 4750 range.l2pages++; 4751 sva += L2_SIZE; 4752 continue; 4753 } 4754 pa = PTE_TO_PHYS(l2e); 4755 l3 = (pd_entry_t *)PHYS_TO_DMAP(pa); 4756 4757 for (k = pmap_l3_index(sva); k < Ln_ENTRIES; k++, 4758 sva += L3_SIZE) { 4759 l3e = l3[k]; 4760 if ((l3e & PTE_V) == 0) { 4761 sysctl_kmaps_dump(sb, &range, sva); 4762 continue; 4763 } 4764 sysctl_kmaps_check(sb, &range, sva, 4765 l1e, l2e, l3e); 4766 range.l3pages++; 4767 } 4768 } 4769 } 4770 4771 error = sbuf_finish(sb); 4772 sbuf_delete(sb); 4773 return (error); 4774 } 4775 SYSCTL_OID(_vm_pmap, OID_AUTO, kernel_maps, 4776 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE | CTLFLAG_SKIP, 4777 NULL, 0, sysctl_kmaps, "A", 4778 "Dump kernel address layout"); 4779
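/*
 * The vm.pmap.kernel_maps sysctl defined above is marked CTLFLAG_SKIP, so it
 * is omitted from sysctl listings but should still be queryable by name,
 * e.g. (for illustration) "sysctl vm.pmap.kernel_maps".  Each line of output
 * has the form produced by sysctl_kmaps_dump():
 *
 *	<sva>-<eva> r<w><x><u/s><g> <l1pages> <l2pages> <l3pages>
 *
 * i.e. the address range, its protection/user/global attributes, and the
 * number of 1GB, 2MB, and 4KB pages backing it.
 */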