1 /*- 2 * SPDX-License-Identifier: BSD-4-Clause 3 * 4 * Copyright (c) 1991 Regents of the University of California. 5 * All rights reserved. 6 * Copyright (c) 1994 John S. Dyson 7 * All rights reserved. 8 * Copyright (c) 1994 David Greenman 9 * All rights reserved. 10 * Copyright (c) 2003 Peter Wemm 11 * All rights reserved. 12 * Copyright (c) 2005-2010 Alan L. Cox <alc@cs.rice.edu> 13 * All rights reserved. 14 * 15 * This code is derived from software contributed to Berkeley by 16 * the Systems Programming Group of the University of Utah Computer 17 * Science Department and William Jolitz of UUNET Technologies Inc. 18 * 19 * Redistribution and use in source and binary forms, with or without 20 * modification, are permitted provided that the following conditions 21 * are met: 22 * 1. Redistributions of source code must retain the above copyright 23 * notice, this list of conditions and the following disclaimer. 24 * 2. Redistributions in binary form must reproduce the above copyright 25 * notice, this list of conditions and the following disclaimer in the 26 * documentation and/or other materials provided with the distribution. 27 * 3. All advertising materials mentioning features or use of this software 28 * must display the following acknowledgement: 29 * This product includes software developed by the University of 30 * California, Berkeley and its contributors. 31 * 4. Neither the name of the University nor the names of its contributors 32 * may be used to endorse or promote products derived from this software 33 * without specific prior written permission. 34 * 35 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 36 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 37 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 38 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 39 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 40 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 41 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 42 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 43 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 44 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 45 * SUCH DAMAGE. 46 */ 47 /*- 48 * Copyright (c) 2003 Networks Associates Technology, Inc. 49 * Copyright (c) 2014-2020 The FreeBSD Foundation 50 * All rights reserved. 51 * 52 * This software was developed for the FreeBSD Project by Jake Burkholder, 53 * Safeport Network Services, and Network Associates Laboratories, the 54 * Security Research Division of Network Associates, Inc. under 55 * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA 56 * CHATS research program. 57 * 58 * Portions of this software were developed by 59 * Konstantin Belousov <kib@FreeBSD.org> under sponsorship from 60 * the FreeBSD Foundation. 61 * 62 * Redistribution and use in source and binary forms, with or without 63 * modification, are permitted provided that the following conditions 64 * are met: 65 * 1. Redistributions of source code must retain the above copyright 66 * notice, this list of conditions and the following disclaimer. 67 * 2. Redistributions in binary form must reproduce the above copyright 68 * notice, this list of conditions and the following disclaimer in the 69 * documentation and/or other materials provided with the distribution. 
70 * 71 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 72 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 73 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 74 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 75 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 76 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 77 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 78 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 79 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 80 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 81 * SUCH DAMAGE. 82 */ 83 84 #define AMD64_NPT_AWARE 85 86 #include <sys/cdefs.h> 87 /* 88 * Manages physical address maps. 89 * 90 * Since the information managed by this module is 91 * also stored by the logical address mapping module, 92 * this module may throw away valid virtual-to-physical 93 * mappings at almost any time. However, invalidations 94 * of virtual-to-physical mappings must be done as 95 * requested. 96 * 97 * In order to cope with hardware architectures which 98 * make virtual-to-physical map invalidates expensive, 99 * this module may delay invalidate or reduced protection 100 * operations until such time as they are actually 101 * necessary. This module is given full information as 102 * to which processors are currently using which maps, 103 * and to when physical maps must be made correct. 104 */ 105 106 #include "opt_ddb.h" 107 #include "opt_pmap.h" 108 #include "opt_vm.h" 109 110 #include <sys/param.h> 111 #include <sys/asan.h> 112 #include <sys/bitstring.h> 113 #include <sys/bus.h> 114 #include <sys/systm.h> 115 #include <sys/counter.h> 116 #include <sys/kernel.h> 117 #include <sys/ktr.h> 118 #include <sys/lock.h> 119 #include <sys/malloc.h> 120 #include <sys/mman.h> 121 #include <sys/msan.h> 122 #include <sys/mutex.h> 123 #include <sys/proc.h> 124 #include <sys/rangeset.h> 125 #include <sys/rwlock.h> 126 #include <sys/sbuf.h> 127 #include <sys/smr.h> 128 #include <sys/sx.h> 129 #include <sys/turnstile.h> 130 #include <sys/vmem.h> 131 #include <sys/vmmeter.h> 132 #include <sys/sched.h> 133 #include <sys/sysctl.h> 134 #include <sys/smp.h> 135 #ifdef DDB 136 #include <sys/kdb.h> 137 #include <ddb/ddb.h> 138 #endif 139 140 #include <vm/vm.h> 141 #include <vm/vm_param.h> 142 #include <vm/vm_kern.h> 143 #include <vm/vm_page.h> 144 #include <vm/vm_map.h> 145 #include <vm/vm_object.h> 146 #include <vm/vm_extern.h> 147 #include <vm/vm_pageout.h> 148 #include <vm/vm_pager.h> 149 #include <vm/vm_phys.h> 150 #include <vm/vm_radix.h> 151 #include <vm/vm_reserv.h> 152 #include <vm/vm_dumpset.h> 153 #include <vm/uma.h> 154 155 #include <machine/asan.h> 156 #include <machine/intr_machdep.h> 157 #include <x86/apicvar.h> 158 #include <x86/ifunc.h> 159 #include <machine/cpu.h> 160 #include <machine/cputypes.h> 161 #include <machine/md_var.h> 162 #include <machine/msan.h> 163 #include <machine/pcb.h> 164 #include <machine/specialreg.h> 165 #ifdef SMP 166 #include <machine/smp.h> 167 #endif 168 #include <machine/sysarch.h> 169 #include <machine/tss.h> 170 171 #ifdef NUMA 172 #define PMAP_MEMDOM MAXMEMDOM 173 #else 174 #define PMAP_MEMDOM 1 175 #endif 176 177 static __inline bool 178 pmap_type_guest(pmap_t pmap) 179 { 180 181 return ((pmap->pm_type == PT_EPT) || (pmap->pm_type == PT_RVI)); 182 } 183 184 static __inline 
bool
pmap_emulate_ad_bits(pmap_t pmap)
{

	return ((pmap->pm_flags & PMAP_EMULATE_AD_BITS) != 0);
}

static __inline pt_entry_t
pmap_valid_bit(pmap_t pmap)
{
	pt_entry_t mask;

	switch (pmap->pm_type) {
	case PT_X86:
	case PT_RVI:
		mask = X86_PG_V;
		break;
	case PT_EPT:
		if (pmap_emulate_ad_bits(pmap))
			mask = EPT_PG_EMUL_V;
		else
			mask = EPT_PG_READ;
		break;
	default:
		panic("pmap_valid_bit: invalid pm_type %d", pmap->pm_type);
	}

	return (mask);
}

static __inline pt_entry_t
pmap_rw_bit(pmap_t pmap)
{
	pt_entry_t mask;

	switch (pmap->pm_type) {
	case PT_X86:
	case PT_RVI:
		mask = X86_PG_RW;
		break;
	case PT_EPT:
		if (pmap_emulate_ad_bits(pmap))
			mask = EPT_PG_EMUL_RW;
		else
			mask = EPT_PG_WRITE;
		break;
	default:
		panic("pmap_rw_bit: invalid pm_type %d", pmap->pm_type);
	}

	return (mask);
}

static pt_entry_t pg_g;

static __inline pt_entry_t
pmap_global_bit(pmap_t pmap)
{
	pt_entry_t mask;

	switch (pmap->pm_type) {
	case PT_X86:
		mask = pg_g;
		break;
	case PT_RVI:
	case PT_EPT:
		mask = 0;
		break;
	default:
		panic("pmap_global_bit: invalid pm_type %d", pmap->pm_type);
	}

	return (mask);
}

static __inline pt_entry_t
pmap_accessed_bit(pmap_t pmap)
{
	pt_entry_t mask;

	switch (pmap->pm_type) {
	case PT_X86:
	case PT_RVI:
		mask = X86_PG_A;
		break;
	case PT_EPT:
		if (pmap_emulate_ad_bits(pmap))
			mask = EPT_PG_READ;
		else
			mask = EPT_PG_A;
		break;
	default:
		panic("pmap_accessed_bit: invalid pm_type %d", pmap->pm_type);
	}

	return (mask);
}

static __inline pt_entry_t
pmap_modified_bit(pmap_t pmap)
{
	pt_entry_t mask;

	switch (pmap->pm_type) {
	case PT_X86:
	case PT_RVI:
		mask = X86_PG_M;
		break;
	case PT_EPT:
		if (pmap_emulate_ad_bits(pmap))
			mask = EPT_PG_WRITE;
		else
			mask = EPT_PG_M;
		break;
	default:
		panic("pmap_modified_bit: invalid pm_type %d", pmap->pm_type);
	}

	return (mask);
}

static __inline pt_entry_t
pmap_pku_mask_bit(pmap_t pmap)
{

	return (pmap->pm_type == PT_X86 ? X86_PG_PKU_MASK : 0);
}

static __inline bool
safe_to_clear_referenced(pmap_t pmap, pt_entry_t pte)
{

	if (!pmap_emulate_ad_bits(pmap))
		return (true);

	KASSERT(pmap->pm_type == PT_EPT, ("invalid pm_type %d", pmap->pm_type));

	/*
	 * XWR = 010 or 110 will cause an unconditional EPT misconfiguration,
	 * so we don't allow the referenced (aka EPT_PG_READ) bit to be
	 * cleared if the EPT_PG_WRITE bit is set.
	 */
	if ((pte & EPT_PG_WRITE) != 0)
		return (false);

	/*
	 * XWR = 100 is allowed only if PMAP_SUPPORTS_EXEC_ONLY is set.
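	 *
	 * (Added note: if EPT_PG_EXECUTE is also clear, clearing the
	 * emulated referenced bit yields XWR = 000, i.e. a not-present
	 * PTE, which is always safe; only the exec-only XWR = 100 case
	 * needs PMAP_SUPPORTS_EXEC_ONLY.)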
	 */
	if ((pte & EPT_PG_EXECUTE) == 0 ||
	    ((pmap->pm_flags & PMAP_SUPPORTS_EXEC_ONLY) != 0))
		return (true);
	else
		return (false);
}

#ifdef PV_STATS
#define PV_STAT(x)	do { x ; } while (0)
#else
#define PV_STAT(x)	do { } while (0)
#endif

#undef pa_index
#ifdef NUMA
#define	pa_index(pa)	({					\
	KASSERT((pa) <= vm_phys_segs[vm_phys_nsegs - 1].end,	\
	    ("address %lx beyond the last segment", (pa)));	\
	(pa) >> PDRSHIFT;					\
})
#define	pa_to_pmdp(pa)	(&pv_table[pa_index(pa)])
#define	pa_to_pvh(pa)	(&(pa_to_pmdp(pa)->pv_page))
#define	PHYS_TO_PV_LIST_LOCK(pa)	({			\
	struct rwlock *_lock;					\
	if (__predict_false((pa) > pmap_last_pa))		\
		_lock = &pv_dummy_large.pv_lock;		\
	else							\
		_lock = &(pa_to_pmdp(pa)->pv_lock);		\
	_lock;							\
})
#else
#define	pa_index(pa)	((pa) >> PDRSHIFT)
#define	pa_to_pvh(pa)	(&pv_table[pa_index(pa)])

#define	NPV_LIST_LOCKS	MAXCPU

#define	PHYS_TO_PV_LIST_LOCK(pa)	\
			(&pv_list_locks[pa_index(pa) % NPV_LIST_LOCKS])
#endif

#define	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa)	do {	\
	struct rwlock **_lockp = (lockp);		\
	struct rwlock *_new_lock;			\
							\
	_new_lock = PHYS_TO_PV_LIST_LOCK(pa);		\
	if (_new_lock != *_lockp) {			\
		if (*_lockp != NULL)			\
			rw_wunlock(*_lockp);		\
		*_lockp = _new_lock;			\
		rw_wlock(*_lockp);			\
	}						\
} while (0)

#define	CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m)	\
			CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, VM_PAGE_TO_PHYS(m))

#define	RELEASE_PV_LIST_LOCK(lockp)		do {	\
	struct rwlock **_lockp = (lockp);		\
							\
	if (*_lockp != NULL) {				\
		rw_wunlock(*_lockp);			\
		*_lockp = NULL;				\
	}						\
} while (0)

#define	VM_PAGE_TO_PV_LIST_LOCK(m)	\
			PHYS_TO_PV_LIST_LOCK(VM_PAGE_TO_PHYS(m))

/*
 * Statically allocate kernel pmap memory.  However, memory for
 * pm_pcids is obtained after the dynamic allocator is operational.
 * Initialize it with a non-canonical pointer to catch early accesses
 * regardless of the active mapping.
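 *
 * (Added note: any dereference of a non-canonical address faults, so a
 * stray access through pm_pcidp before the real storage is allocated is
 * caught immediately rather than silently reading whatever happens to
 * be mapped.)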
405 */ 406 struct pmap kernel_pmap_store = { 407 .pm_pcidp = (void *)0xdeadbeefdeadbeef, 408 }; 409 410 vm_offset_t virtual_avail; /* VA of first avail page (after kernel bss) */ 411 vm_offset_t virtual_end; /* VA of last avail page (end of kernel AS) */ 412 413 int nkpt; 414 SYSCTL_INT(_machdep, OID_AUTO, nkpt, CTLFLAG_RD, &nkpt, 0, 415 "Number of kernel page table pages allocated on bootup"); 416 417 static int ndmpdp; 418 vm_paddr_t dmaplimit; 419 vm_offset_t kernel_vm_end = VM_MIN_KERNEL_ADDRESS; 420 pt_entry_t pg_nx; 421 422 static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 423 "VM/pmap parameters"); 424 425 static int __read_frequently pg_ps_enabled = 1; 426 SYSCTL_INT(_vm_pmap, OID_AUTO, pg_ps_enabled, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, 427 &pg_ps_enabled, 0, "Are large page mappings enabled?"); 428 429 int __read_frequently la57 = 0; 430 SYSCTL_INT(_vm_pmap, OID_AUTO, la57, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, 431 &la57, 0, 432 "5-level paging for host is enabled"); 433 434 static bool 435 pmap_is_la57(pmap_t pmap) 436 { 437 if (pmap->pm_type == PT_X86) 438 return (la57); 439 return (false); /* XXXKIB handle EPT */ 440 } 441 442 #define PAT_INDEX_SIZE 8 443 static int pat_index[PAT_INDEX_SIZE]; /* cache mode to PAT index conversion */ 444 445 static u_int64_t KPTphys; /* phys addr of kernel level 1 */ 446 static u_int64_t KPDphys; /* phys addr of kernel level 2 */ 447 static u_int64_t KPDPphys; /* phys addr of kernel level 3 */ 448 u_int64_t KPML4phys; /* phys addr of kernel level 4 */ 449 u_int64_t KPML5phys; /* phys addr of kernel level 5, 450 if supported */ 451 452 #ifdef KASAN 453 static uint64_t KASANPDPphys; 454 #endif 455 #ifdef KMSAN 456 static uint64_t KMSANSHADPDPphys; 457 static uint64_t KMSANORIGPDPphys; 458 459 /* 460 * To support systems with large amounts of memory, it is necessary to extend 461 * the maximum size of the direct map. This could eat into the space reserved 462 * for the shadow map. 463 */ 464 _Static_assert(DMPML4I + NDMPML4E <= KMSANSHADPML4I, "direct map overflow"); 465 #endif 466 467 static pml4_entry_t *kernel_pml4; 468 static u_int64_t DMPDphys; /* phys addr of direct mapped level 2 */ 469 static u_int64_t DMPDPphys; /* phys addr of direct mapped level 3 */ 470 static int ndmpdpphys; /* number of DMPDPphys pages */ 471 472 vm_paddr_t kernphys; /* phys addr of start of bootstrap data */ 473 vm_paddr_t KERNend; /* and the end */ 474 475 /* 476 * pmap_mapdev support pre initialization (i.e. console) 477 */ 478 #define PMAP_PREINIT_MAPPING_COUNT 8 479 static struct pmap_preinit_mapping { 480 vm_paddr_t pa; 481 vm_offset_t va; 482 vm_size_t sz; 483 int mode; 484 } pmap_preinit_mapping[PMAP_PREINIT_MAPPING_COUNT]; 485 static int pmap_initialized; 486 487 /* 488 * Data for the pv entry allocation mechanism. 489 * Updates to pv_invl_gen are protected by the pv list lock but reads are not. 
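 *
 * (Added note: in the NUMA case each pv_invl_gen sits in a struct
 * pmap_large_md_page next to its pv_lock and pv_page; in the non-NUMA
 * case a separate pv_invl_gen[] array parallels pv_list_locks[].  See
 * the definitions below.)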
490 */ 491 #ifdef NUMA 492 static __inline int 493 pc_to_domain(struct pv_chunk *pc) 494 { 495 496 return (vm_phys_domain(DMAP_TO_PHYS((vm_offset_t)pc))); 497 } 498 #else 499 static __inline int 500 pc_to_domain(struct pv_chunk *pc __unused) 501 { 502 503 return (0); 504 } 505 #endif 506 507 struct pv_chunks_list { 508 struct mtx pvc_lock; 509 TAILQ_HEAD(pch, pv_chunk) pvc_list; 510 int active_reclaims; 511 } __aligned(CACHE_LINE_SIZE); 512 513 struct pv_chunks_list __exclusive_cache_line pv_chunks[PMAP_MEMDOM]; 514 515 #ifdef NUMA 516 struct pmap_large_md_page { 517 struct rwlock pv_lock; 518 struct md_page pv_page; 519 u_long pv_invl_gen; 520 }; 521 __exclusive_cache_line static struct pmap_large_md_page pv_dummy_large; 522 #define pv_dummy pv_dummy_large.pv_page 523 __read_mostly static struct pmap_large_md_page *pv_table; 524 __read_mostly vm_paddr_t pmap_last_pa; 525 #else 526 static struct rwlock __exclusive_cache_line pv_list_locks[NPV_LIST_LOCKS]; 527 static u_long pv_invl_gen[NPV_LIST_LOCKS]; 528 static struct md_page *pv_table; 529 static struct md_page pv_dummy; 530 #endif 531 532 /* 533 * All those kernel PT submaps that BSD is so fond of 534 */ 535 pt_entry_t *CMAP1 = NULL; 536 caddr_t CADDR1 = 0; 537 static vm_offset_t qframe = 0; 538 static struct mtx qframe_mtx; 539 540 static int pmap_flags = PMAP_PDE_SUPERPAGE; /* flags for x86 pmaps */ 541 542 static vmem_t *large_vmem; 543 static u_int lm_ents; 544 #define PMAP_ADDRESS_IN_LARGEMAP(va) ((va) >= LARGEMAP_MIN_ADDRESS && \ 545 (va) < LARGEMAP_MIN_ADDRESS + NBPML4 * (u_long)lm_ents) 546 547 int pmap_pcid_enabled = 1; 548 SYSCTL_INT(_vm_pmap, OID_AUTO, pcid_enabled, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, 549 &pmap_pcid_enabled, 0, "Is TLB Context ID enabled ?"); 550 int invpcid_works = 0; 551 SYSCTL_INT(_vm_pmap, OID_AUTO, invpcid_works, CTLFLAG_RD, &invpcid_works, 0, 552 "Is the invpcid instruction available ?"); 553 int invlpgb_works; 554 SYSCTL_INT(_vm_pmap, OID_AUTO, invlpgb_works, CTLFLAG_RD, &invlpgb_works, 0, 555 "Is the invlpgb instruction available?"); 556 int invlpgb_maxcnt; 557 int pmap_pcid_invlpg_workaround = 0; 558 SYSCTL_INT(_vm_pmap, OID_AUTO, pcid_invlpg_workaround, 559 CTLFLAG_RDTUN | CTLFLAG_NOFETCH, 560 &pmap_pcid_invlpg_workaround, 0, 561 "Enable small core PCID/INVLPG workaround"); 562 int pmap_pcid_invlpg_workaround_uena = 1; 563 564 int __read_frequently pti = 0; 565 SYSCTL_INT(_vm_pmap, OID_AUTO, pti, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, 566 &pti, 0, 567 "Page Table Isolation enabled"); 568 static vm_object_t pti_obj; 569 static pml4_entry_t *pti_pml4; 570 static vm_pindex_t pti_pg_idx; 571 static bool pti_finalized; 572 573 struct pmap_pkru_range { 574 struct rs_el pkru_rs_el; 575 u_int pkru_keyidx; 576 int pkru_flags; 577 }; 578 579 static uma_zone_t pmap_pkru_ranges_zone; 580 static bool pmap_pkru_same(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, 581 pt_entry_t *pte); 582 static pt_entry_t pmap_pkru_get(pmap_t pmap, vm_offset_t va); 583 static void pmap_pkru_on_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva); 584 static void *pkru_dup_range(void *ctx, void *data); 585 static void pkru_free_range(void *ctx, void *node); 586 static int pmap_pkru_copy(pmap_t dst_pmap, pmap_t src_pmap); 587 static int pmap_pkru_deassign(pmap_t pmap, vm_offset_t sva, vm_offset_t eva); 588 static void pmap_pkru_deassign_all(pmap_t pmap); 589 590 static COUNTER_U64_DEFINE_EARLY(pcid_save_cnt); 591 SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, pcid_save_cnt, CTLFLAG_RD, 592 &pcid_save_cnt, "Count of saved TLB context on switch"); 
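
/*
 * The state below implements delayed invalidation (DI) blocks.  As an
 * illustrative sketch added for orientation (see the comments on
 * pmap_delayed_invl_start() and pmap_delayed_invl_wait() below for the
 * authoritative description), a remove-style caller uses the interface
 * roughly as follows:
 *
 *	pmap_delayed_invl_start();
 *	...destroy PTEs and PV entries, calling
 *	   pmap_delayed_invl_page(m) for each affected page m...
 *	...issue the required TLB flushes...
 *	pmap_delayed_invl_finish();
 *
 * while a consumer such as pmap_remove_all() or pmap_remove_write()
 * calls pmap_delayed_invl_wait(m) to block until every DI block that
 * marked m has finished.
 */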
593 594 static LIST_HEAD(, pmap_invl_gen) pmap_invl_gen_tracker = 595 LIST_HEAD_INITIALIZER(&pmap_invl_gen_tracker); 596 static struct mtx invl_gen_mtx; 597 /* Fake lock object to satisfy turnstiles interface. */ 598 static struct lock_object invl_gen_ts = { 599 .lo_name = "invlts", 600 }; 601 static struct pmap_invl_gen pmap_invl_gen_head = { 602 .gen = 1, 603 .next = NULL, 604 }; 605 static u_long pmap_invl_gen = 1; 606 static int pmap_invl_waiters; 607 static struct callout pmap_invl_callout; 608 static bool pmap_invl_callout_inited; 609 610 #define PMAP_ASSERT_NOT_IN_DI() \ 611 KASSERT(pmap_not_in_di(), ("DI already started")) 612 613 static bool 614 pmap_di_locked(void) 615 { 616 int tun; 617 618 if ((cpu_feature2 & CPUID2_CX16) == 0) 619 return (true); 620 tun = 0; 621 TUNABLE_INT_FETCH("vm.pmap.di_locked", &tun); 622 return (tun != 0); 623 } 624 625 static int 626 sysctl_pmap_di_locked(SYSCTL_HANDLER_ARGS) 627 { 628 int locked; 629 630 locked = pmap_di_locked(); 631 return (sysctl_handle_int(oidp, &locked, 0, req)); 632 } 633 SYSCTL_PROC(_vm_pmap, OID_AUTO, di_locked, CTLTYPE_INT | CTLFLAG_RDTUN | 634 CTLFLAG_MPSAFE, 0, 0, sysctl_pmap_di_locked, "", 635 "Locked delayed invalidation"); 636 637 static bool pmap_not_in_di_l(void); 638 static bool pmap_not_in_di_u(void); 639 DEFINE_IFUNC(, bool, pmap_not_in_di, (void)) 640 { 641 642 return (pmap_di_locked() ? pmap_not_in_di_l : pmap_not_in_di_u); 643 } 644 645 static bool 646 pmap_not_in_di_l(void) 647 { 648 struct pmap_invl_gen *invl_gen; 649 650 invl_gen = &curthread->td_md.md_invl_gen; 651 return (invl_gen->gen == 0); 652 } 653 654 static void 655 pmap_thread_init_invl_gen_l(struct thread *td) 656 { 657 struct pmap_invl_gen *invl_gen; 658 659 invl_gen = &td->td_md.md_invl_gen; 660 invl_gen->gen = 0; 661 } 662 663 static void 664 pmap_delayed_invl_wait_block(u_long *m_gen, u_long *invl_gen) 665 { 666 struct turnstile *ts; 667 668 ts = turnstile_trywait(&invl_gen_ts); 669 if (*m_gen > atomic_load_long(invl_gen)) 670 turnstile_wait(ts, NULL, TS_SHARED_QUEUE); 671 else 672 turnstile_cancel(ts); 673 } 674 675 static void 676 pmap_delayed_invl_finish_unblock(u_long new_gen) 677 { 678 struct turnstile *ts; 679 680 turnstile_chain_lock(&invl_gen_ts); 681 ts = turnstile_lookup(&invl_gen_ts); 682 if (new_gen != 0) 683 pmap_invl_gen = new_gen; 684 if (ts != NULL) { 685 turnstile_broadcast(ts, TS_SHARED_QUEUE); 686 turnstile_unpend(ts); 687 } 688 turnstile_chain_unlock(&invl_gen_ts); 689 } 690 691 /* 692 * Start a new Delayed Invalidation (DI) block of code, executed by 693 * the current thread. Within a DI block, the current thread may 694 * destroy both the page table and PV list entries for a mapping and 695 * then release the corresponding PV list lock before ensuring that 696 * the mapping is flushed from the TLBs of any processors with the 697 * pmap active. 698 */ 699 static void 700 pmap_delayed_invl_start_l(void) 701 { 702 struct pmap_invl_gen *invl_gen; 703 u_long currgen; 704 705 invl_gen = &curthread->td_md.md_invl_gen; 706 PMAP_ASSERT_NOT_IN_DI(); 707 mtx_lock(&invl_gen_mtx); 708 if (LIST_EMPTY(&pmap_invl_gen_tracker)) 709 currgen = pmap_invl_gen; 710 else 711 currgen = LIST_FIRST(&pmap_invl_gen_tracker)->gen; 712 invl_gen->gen = currgen + 1; 713 LIST_INSERT_HEAD(&pmap_invl_gen_tracker, invl_gen, link); 714 mtx_unlock(&invl_gen_mtx); 715 } 716 717 /* 718 * Finish the DI block, previously started by the current thread. 
All 719 * required TLB flushes for the pages marked by 720 * pmap_delayed_invl_page() must be finished before this function is 721 * called. 722 * 723 * This function works by bumping the global DI generation number to 724 * the generation number of the current thread's DI, unless there is a 725 * pending DI that started earlier. In the latter case, bumping the 726 * global DI generation number would incorrectly signal that the 727 * earlier DI had finished. Instead, this function bumps the earlier 728 * DI's generation number to match the generation number of the 729 * current thread's DI. 730 */ 731 static void 732 pmap_delayed_invl_finish_l(void) 733 { 734 struct pmap_invl_gen *invl_gen, *next; 735 736 invl_gen = &curthread->td_md.md_invl_gen; 737 KASSERT(invl_gen->gen != 0, ("missed invl_start")); 738 mtx_lock(&invl_gen_mtx); 739 next = LIST_NEXT(invl_gen, link); 740 if (next == NULL) 741 pmap_delayed_invl_finish_unblock(invl_gen->gen); 742 else 743 next->gen = invl_gen->gen; 744 LIST_REMOVE(invl_gen, link); 745 mtx_unlock(&invl_gen_mtx); 746 invl_gen->gen = 0; 747 } 748 749 static bool 750 pmap_not_in_di_u(void) 751 { 752 struct pmap_invl_gen *invl_gen; 753 754 invl_gen = &curthread->td_md.md_invl_gen; 755 return (((uintptr_t)invl_gen->next & PMAP_INVL_GEN_NEXT_INVALID) != 0); 756 } 757 758 static void 759 pmap_thread_init_invl_gen_u(struct thread *td) 760 { 761 struct pmap_invl_gen *invl_gen; 762 763 invl_gen = &td->td_md.md_invl_gen; 764 invl_gen->gen = 0; 765 invl_gen->next = (void *)PMAP_INVL_GEN_NEXT_INVALID; 766 } 767 768 static bool 769 pmap_di_load_invl(struct pmap_invl_gen *ptr, struct pmap_invl_gen *out) 770 { 771 uint64_t new_high, new_low, old_high, old_low; 772 char res; 773 774 old_low = new_low = 0; 775 old_high = new_high = (uintptr_t)0; 776 777 __asm volatile("lock;cmpxchg16b\t%1" 778 : "=@cce" (res), "+m" (*ptr), "+a" (old_low), "+d" (old_high) 779 : "b"(new_low), "c" (new_high) 780 : "memory", "cc"); 781 if (res == 0) { 782 if ((old_high & PMAP_INVL_GEN_NEXT_INVALID) != 0) 783 return (false); 784 out->gen = old_low; 785 out->next = (void *)old_high; 786 } else { 787 out->gen = new_low; 788 out->next = (void *)new_high; 789 } 790 return (true); 791 } 792 793 static bool 794 pmap_di_store_invl(struct pmap_invl_gen *ptr, struct pmap_invl_gen *old_val, 795 struct pmap_invl_gen *new_val) 796 { 797 uint64_t new_high, new_low, old_high, old_low; 798 char res; 799 800 new_low = new_val->gen; 801 new_high = (uintptr_t)new_val->next; 802 old_low = old_val->gen; 803 old_high = (uintptr_t)old_val->next; 804 805 __asm volatile("lock;cmpxchg16b\t%1" 806 : "=@cce" (res), "+m" (*ptr), "+a" (old_low), "+d" (old_high) 807 : "b"(new_low), "c" (new_high) 808 : "memory", "cc"); 809 return (res); 810 } 811 812 static COUNTER_U64_DEFINE_EARLY(pv_page_count); 813 SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, pv_page_count, CTLFLAG_RD, 814 &pv_page_count, "Current number of allocated pv pages"); 815 816 static COUNTER_U64_DEFINE_EARLY(user_pt_page_count); 817 SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, user_pt_page_count, CTLFLAG_RD, 818 &user_pt_page_count, 819 "Current number of allocated page table pages for userspace"); 820 821 static COUNTER_U64_DEFINE_EARLY(kernel_pt_page_count); 822 SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, kernel_pt_page_count, CTLFLAG_RD, 823 &kernel_pt_page_count, 824 "Current number of allocated page table pages for the kernel"); 825 826 #ifdef PV_STATS 827 828 static COUNTER_U64_DEFINE_EARLY(invl_start_restart); 829 SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, invl_start_restart, 
830 CTLFLAG_RD, &invl_start_restart, 831 "Number of delayed TLB invalidation request restarts"); 832 833 static COUNTER_U64_DEFINE_EARLY(invl_finish_restart); 834 SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, invl_finish_restart, CTLFLAG_RD, 835 &invl_finish_restart, 836 "Number of delayed TLB invalidation completion restarts"); 837 838 static int invl_max_qlen; 839 SYSCTL_INT(_vm_pmap, OID_AUTO, invl_max_qlen, CTLFLAG_RD, 840 &invl_max_qlen, 0, 841 "Maximum delayed TLB invalidation request queue length"); 842 #endif 843 844 #define di_delay locks_delay 845 846 static void 847 pmap_delayed_invl_start_u(void) 848 { 849 struct pmap_invl_gen *invl_gen, *p, prev, new_prev; 850 struct thread *td; 851 struct lock_delay_arg lda; 852 uintptr_t prevl; 853 u_char pri; 854 #ifdef PV_STATS 855 int i, ii; 856 #endif 857 858 td = curthread; 859 invl_gen = &td->td_md.md_invl_gen; 860 PMAP_ASSERT_NOT_IN_DI(); 861 lock_delay_arg_init(&lda, &di_delay); 862 invl_gen->saved_pri = 0; 863 pri = td->td_base_pri; 864 if (pri > PVM) { 865 thread_lock(td); 866 pri = td->td_base_pri; 867 if (pri > PVM) { 868 invl_gen->saved_pri = pri; 869 sched_prio(td, PVM); 870 } 871 thread_unlock(td); 872 } 873 again: 874 PV_STAT(i = 0); 875 for (p = &pmap_invl_gen_head;; p = prev.next) { 876 PV_STAT(i++); 877 prevl = (uintptr_t)atomic_load_ptr(&p->next); 878 if ((prevl & PMAP_INVL_GEN_NEXT_INVALID) != 0) { 879 PV_STAT(counter_u64_add(invl_start_restart, 1)); 880 lock_delay(&lda); 881 goto again; 882 } 883 if (prevl == 0) 884 break; 885 prev.next = (void *)prevl; 886 } 887 #ifdef PV_STATS 888 if ((ii = invl_max_qlen) < i) 889 atomic_cmpset_int(&invl_max_qlen, ii, i); 890 #endif 891 892 if (!pmap_di_load_invl(p, &prev) || prev.next != NULL) { 893 PV_STAT(counter_u64_add(invl_start_restart, 1)); 894 lock_delay(&lda); 895 goto again; 896 } 897 898 new_prev.gen = prev.gen; 899 new_prev.next = invl_gen; 900 invl_gen->gen = prev.gen + 1; 901 902 /* Formal fence between store to invl->gen and updating *p. */ 903 atomic_thread_fence_rel(); 904 905 /* 906 * After inserting an invl_gen element with invalid bit set, 907 * this thread blocks any other thread trying to enter the 908 * delayed invalidation block. Do not allow to remove us from 909 * the CPU, because it causes starvation for other threads. 910 */ 911 critical_enter(); 912 913 /* 914 * ABA for *p is not possible there, since p->gen can only 915 * increase. So if the *p thread finished its di, then 916 * started a new one and got inserted into the list at the 917 * same place, its gen will appear greater than the previously 918 * read gen. 919 */ 920 if (!pmap_di_store_invl(p, &prev, &new_prev)) { 921 critical_exit(); 922 PV_STAT(counter_u64_add(invl_start_restart, 1)); 923 lock_delay(&lda); 924 goto again; 925 } 926 927 /* 928 * There we clear PMAP_INVL_GEN_NEXT_INVALID in 929 * invl_gen->next, allowing other threads to iterate past us. 930 * pmap_di_store_invl() provides fence between the generation 931 * write and the update of next. 932 */ 933 invl_gen->next = NULL; 934 critical_exit(); 935 } 936 937 static bool 938 pmap_delayed_invl_finish_u_crit(struct pmap_invl_gen *invl_gen, 939 struct pmap_invl_gen *p) 940 { 941 struct pmap_invl_gen prev, new_prev; 942 u_long mygen; 943 944 /* 945 * Load invl_gen->gen after setting invl_gen->next 946 * PMAP_INVL_GEN_NEXT_INVALID. This prevents larger 947 * generations to propagate to our invl_gen->gen. Lock prefix 948 * in atomic_set_ptr() worked as seq_cst fence. 
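	 *
	 * (Added note: the atomic_set_ptr() referred to is the one
	 * performed by pmap_delayed_invl_finish_u() immediately before
	 * this function is called.)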
949 */ 950 mygen = atomic_load_long(&invl_gen->gen); 951 952 if (!pmap_di_load_invl(p, &prev) || prev.next != invl_gen) 953 return (false); 954 955 KASSERT(prev.gen < mygen, 956 ("invalid di gen sequence %lu %lu", prev.gen, mygen)); 957 new_prev.gen = mygen; 958 new_prev.next = (void *)((uintptr_t)invl_gen->next & 959 ~PMAP_INVL_GEN_NEXT_INVALID); 960 961 /* Formal fence between load of prev and storing update to it. */ 962 atomic_thread_fence_rel(); 963 964 return (pmap_di_store_invl(p, &prev, &new_prev)); 965 } 966 967 static void 968 pmap_delayed_invl_finish_u(void) 969 { 970 struct pmap_invl_gen *invl_gen, *p; 971 struct thread *td; 972 struct lock_delay_arg lda; 973 uintptr_t prevl; 974 975 td = curthread; 976 invl_gen = &td->td_md.md_invl_gen; 977 KASSERT(invl_gen->gen != 0, ("missed invl_start: gen 0")); 978 KASSERT(((uintptr_t)invl_gen->next & PMAP_INVL_GEN_NEXT_INVALID) == 0, 979 ("missed invl_start: INVALID")); 980 lock_delay_arg_init(&lda, &di_delay); 981 982 again: 983 for (p = &pmap_invl_gen_head; p != NULL; p = (void *)prevl) { 984 prevl = (uintptr_t)atomic_load_ptr(&p->next); 985 if ((prevl & PMAP_INVL_GEN_NEXT_INVALID) != 0) { 986 PV_STAT(counter_u64_add(invl_finish_restart, 1)); 987 lock_delay(&lda); 988 goto again; 989 } 990 if ((void *)prevl == invl_gen) 991 break; 992 } 993 994 /* 995 * It is legitimate to not find ourself on the list if a 996 * thread before us finished its DI and started it again. 997 */ 998 if (__predict_false(p == NULL)) { 999 PV_STAT(counter_u64_add(invl_finish_restart, 1)); 1000 lock_delay(&lda); 1001 goto again; 1002 } 1003 1004 critical_enter(); 1005 atomic_set_ptr((uintptr_t *)&invl_gen->next, 1006 PMAP_INVL_GEN_NEXT_INVALID); 1007 if (!pmap_delayed_invl_finish_u_crit(invl_gen, p)) { 1008 atomic_clear_ptr((uintptr_t *)&invl_gen->next, 1009 PMAP_INVL_GEN_NEXT_INVALID); 1010 critical_exit(); 1011 PV_STAT(counter_u64_add(invl_finish_restart, 1)); 1012 lock_delay(&lda); 1013 goto again; 1014 } 1015 critical_exit(); 1016 if (atomic_load_int(&pmap_invl_waiters) > 0) 1017 pmap_delayed_invl_finish_unblock(0); 1018 if (invl_gen->saved_pri != 0) { 1019 thread_lock(td); 1020 sched_prio(td, invl_gen->saved_pri); 1021 thread_unlock(td); 1022 } 1023 } 1024 1025 #ifdef DDB 1026 DB_SHOW_COMMAND(di_queue, pmap_di_queue) 1027 { 1028 struct pmap_invl_gen *p, *pn; 1029 struct thread *td; 1030 uintptr_t nextl; 1031 bool first; 1032 1033 for (p = &pmap_invl_gen_head, first = true; p != NULL; p = pn, 1034 first = false) { 1035 nextl = (uintptr_t)atomic_load_ptr(&p->next); 1036 pn = (void *)(nextl & ~PMAP_INVL_GEN_NEXT_INVALID); 1037 td = first ? NULL : __containerof(p, struct thread, 1038 td_md.md_invl_gen); 1039 db_printf("gen %lu inv %d td %p tid %d\n", p->gen, 1040 (nextl & PMAP_INVL_GEN_NEXT_INVALID) != 0, td, 1041 td != NULL ? 
td->td_tid : -1); 1042 } 1043 } 1044 #endif 1045 1046 #ifdef PV_STATS 1047 static COUNTER_U64_DEFINE_EARLY(invl_wait); 1048 SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, invl_wait, 1049 CTLFLAG_RD, &invl_wait, 1050 "Number of times DI invalidation blocked pmap_remove_all/write"); 1051 1052 static COUNTER_U64_DEFINE_EARLY(invl_wait_slow); 1053 SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, invl_wait_slow, CTLFLAG_RD, 1054 &invl_wait_slow, "Number of slow invalidation waits for lockless DI"); 1055 1056 #endif 1057 1058 #ifdef NUMA 1059 static u_long * 1060 pmap_delayed_invl_genp(vm_page_t m) 1061 { 1062 vm_paddr_t pa; 1063 u_long *gen; 1064 1065 pa = VM_PAGE_TO_PHYS(m); 1066 if (__predict_false((pa) > pmap_last_pa)) 1067 gen = &pv_dummy_large.pv_invl_gen; 1068 else 1069 gen = &(pa_to_pmdp(pa)->pv_invl_gen); 1070 1071 return (gen); 1072 } 1073 #else 1074 static u_long * 1075 pmap_delayed_invl_genp(vm_page_t m) 1076 { 1077 1078 return (&pv_invl_gen[pa_index(VM_PAGE_TO_PHYS(m)) % NPV_LIST_LOCKS]); 1079 } 1080 #endif 1081 1082 static void 1083 pmap_delayed_invl_callout_func(void *arg __unused) 1084 { 1085 1086 if (atomic_load_int(&pmap_invl_waiters) == 0) 1087 return; 1088 pmap_delayed_invl_finish_unblock(0); 1089 } 1090 1091 static void 1092 pmap_delayed_invl_callout_init(void *arg __unused) 1093 { 1094 1095 if (pmap_di_locked()) 1096 return; 1097 callout_init(&pmap_invl_callout, 1); 1098 pmap_invl_callout_inited = true; 1099 } 1100 SYSINIT(pmap_di_callout, SI_SUB_CPU + 1, SI_ORDER_ANY, 1101 pmap_delayed_invl_callout_init, NULL); 1102 1103 /* 1104 * Ensure that all currently executing DI blocks, that need to flush 1105 * TLB for the given page m, actually flushed the TLB at the time the 1106 * function returned. If the page m has an empty PV list and we call 1107 * pmap_delayed_invl_wait(), upon its return we know that no CPU has a 1108 * valid mapping for the page m in either its page table or TLB. 1109 * 1110 * This function works by blocking until the global DI generation 1111 * number catches up with the generation number associated with the 1112 * given page m and its PV list. Since this function's callers 1113 * typically own an object lock and sometimes own a page lock, it 1114 * cannot sleep. Instead, it blocks on a turnstile to relinquish the 1115 * processor. 1116 */ 1117 static void 1118 pmap_delayed_invl_wait_l(vm_page_t m) 1119 { 1120 u_long *m_gen; 1121 #ifdef PV_STATS 1122 bool accounted = false; 1123 #endif 1124 1125 m_gen = pmap_delayed_invl_genp(m); 1126 while (*m_gen > pmap_invl_gen) { 1127 #ifdef PV_STATS 1128 if (!accounted) { 1129 counter_u64_add(invl_wait, 1); 1130 accounted = true; 1131 } 1132 #endif 1133 pmap_delayed_invl_wait_block(m_gen, &pmap_invl_gen); 1134 } 1135 } 1136 1137 static void 1138 pmap_delayed_invl_wait_u(vm_page_t m) 1139 { 1140 u_long *m_gen; 1141 struct lock_delay_arg lda; 1142 bool fast; 1143 1144 fast = true; 1145 m_gen = pmap_delayed_invl_genp(m); 1146 lock_delay_arg_init(&lda, &di_delay); 1147 while (*m_gen > atomic_load_long(&pmap_invl_gen_head.gen)) { 1148 if (fast || !pmap_invl_callout_inited) { 1149 PV_STAT(counter_u64_add(invl_wait, 1)); 1150 lock_delay(&lda); 1151 fast = false; 1152 } else { 1153 /* 1154 * The page's invalidation generation number 1155 * is still below the current thread's number. 1156 * Prepare to block so that we do not waste 1157 * CPU cycles or worse, suffer livelock. 
1158 * 1159 * Since it is impossible to block without 1160 * racing with pmap_delayed_invl_finish_u(), 1161 * prepare for the race by incrementing 1162 * pmap_invl_waiters and arming a 1-tick 1163 * callout which will unblock us if we lose 1164 * the race. 1165 */ 1166 atomic_add_int(&pmap_invl_waiters, 1); 1167 1168 /* 1169 * Re-check the current thread's invalidation 1170 * generation after incrementing 1171 * pmap_invl_waiters, so that there is no race 1172 * with pmap_delayed_invl_finish_u() setting 1173 * the page generation and checking 1174 * pmap_invl_waiters. The only race allowed 1175 * is for a missed unblock, which is handled 1176 * by the callout. 1177 */ 1178 if (*m_gen > 1179 atomic_load_long(&pmap_invl_gen_head.gen)) { 1180 callout_reset(&pmap_invl_callout, 1, 1181 pmap_delayed_invl_callout_func, NULL); 1182 PV_STAT(counter_u64_add(invl_wait_slow, 1)); 1183 pmap_delayed_invl_wait_block(m_gen, 1184 &pmap_invl_gen_head.gen); 1185 } 1186 atomic_add_int(&pmap_invl_waiters, -1); 1187 } 1188 } 1189 } 1190 1191 DEFINE_IFUNC(, void, pmap_thread_init_invl_gen, (struct thread *)) 1192 { 1193 1194 return (pmap_di_locked() ? pmap_thread_init_invl_gen_l : 1195 pmap_thread_init_invl_gen_u); 1196 } 1197 1198 DEFINE_IFUNC(static, void, pmap_delayed_invl_start, (void)) 1199 { 1200 1201 return (pmap_di_locked() ? pmap_delayed_invl_start_l : 1202 pmap_delayed_invl_start_u); 1203 } 1204 1205 DEFINE_IFUNC(static, void, pmap_delayed_invl_finish, (void)) 1206 { 1207 1208 return (pmap_di_locked() ? pmap_delayed_invl_finish_l : 1209 pmap_delayed_invl_finish_u); 1210 } 1211 1212 DEFINE_IFUNC(static, void, pmap_delayed_invl_wait, (vm_page_t)) 1213 { 1214 1215 return (pmap_di_locked() ? pmap_delayed_invl_wait_l : 1216 pmap_delayed_invl_wait_u); 1217 } 1218 1219 /* 1220 * Mark the page m's PV list as participating in the current thread's 1221 * DI block. Any threads concurrently using m's PV list to remove or 1222 * restrict all mappings to m will wait for the current thread's DI 1223 * block to complete before proceeding. 1224 * 1225 * The function works by setting the DI generation number for m's PV 1226 * list to at least the DI generation number of the current thread. 1227 * This forces a caller of pmap_delayed_invl_wait() to block until 1228 * current thread calls pmap_delayed_invl_finish(). 1229 */ 1230 static void 1231 pmap_delayed_invl_page(vm_page_t m) 1232 { 1233 u_long gen, *m_gen; 1234 1235 rw_assert(VM_PAGE_TO_PV_LIST_LOCK(m), RA_WLOCKED); 1236 gen = curthread->td_md.md_invl_gen.gen; 1237 if (gen == 0) 1238 return; 1239 m_gen = pmap_delayed_invl_genp(m); 1240 if (*m_gen < gen) 1241 *m_gen = gen; 1242 } 1243 1244 /* 1245 * Crashdump maps. 1246 */ 1247 static caddr_t crashdumpmap; 1248 1249 /* 1250 * Internal flags for pmap_enter()'s helper functions. 1251 */ 1252 #define PMAP_ENTER_NORECLAIM 0x1000000 /* Don't reclaim PV entries. */ 1253 #define PMAP_ENTER_NOREPLACE 0x2000000 /* Don't replace mappings. */ 1254 1255 /* 1256 * Internal flags for pmap_mapdev_internal() and 1257 * pmap_change_props_locked(). 1258 */ 1259 #define MAPDEV_FLUSHCACHE 0x00000001 /* Flush cache after mapping. */ 1260 #define MAPDEV_SETATTR 0x00000002 /* Modify existing attrs. */ 1261 #define MAPDEV_ASSERTVALID 0x00000004 /* Assert mapping validity. 
*/ 1262 1263 TAILQ_HEAD(pv_chunklist, pv_chunk); 1264 1265 static void free_pv_chunk(struct pv_chunk *pc); 1266 static void free_pv_chunk_batch(struct pv_chunklist *batch); 1267 static void free_pv_entry(pmap_t pmap, pv_entry_t pv); 1268 static pv_entry_t get_pv_entry(pmap_t pmap, struct rwlock **lockp); 1269 static int popcnt_pc_map_pq(uint64_t *map); 1270 static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp); 1271 static void reserve_pv_entries(pmap_t pmap, int needed, 1272 struct rwlock **lockp); 1273 static void pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, 1274 struct rwlock **lockp); 1275 static bool pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, pd_entry_t pde, 1276 u_int flags, struct rwlock **lockp); 1277 #if VM_NRESERVLEVEL > 0 1278 static void pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, 1279 struct rwlock **lockp); 1280 #endif 1281 static void pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va); 1282 static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, 1283 vm_offset_t va); 1284 1285 static void pmap_abort_ptp(pmap_t pmap, vm_offset_t va, vm_page_t mpte); 1286 static int pmap_change_props_locked(vm_offset_t va, vm_size_t size, 1287 vm_prot_t prot, int mode, int flags); 1288 static bool pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va); 1289 static bool pmap_demote_pde_locked(pmap_t pmap, pd_entry_t *pde, 1290 vm_offset_t va, struct rwlock **lockp); 1291 static bool pmap_demote_pdpe(pmap_t pmap, pdp_entry_t *pdpe, 1292 vm_offset_t va); 1293 static int pmap_enter_2mpage(pmap_t pmap, vm_offset_t va, vm_page_t m, 1294 vm_prot_t prot, struct rwlock **lockp); 1295 static int pmap_enter_pde(pmap_t pmap, vm_offset_t va, pd_entry_t newpde, 1296 u_int flags, vm_page_t m, struct rwlock **lockp); 1297 static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, 1298 vm_page_t m, vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp); 1299 static void pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte); 1300 static int pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte, bool promoted, 1301 bool allpte_PG_A_set); 1302 static void pmap_invalidate_cache_range_selfsnoop(vm_offset_t sva, 1303 vm_offset_t eva); 1304 static void pmap_invalidate_cache_range_all(vm_offset_t sva, 1305 vm_offset_t eva); 1306 static void pmap_invalidate_pde_page(pmap_t pmap, vm_offset_t va, 1307 pd_entry_t pde); 1308 static void pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode); 1309 static vm_page_t pmap_large_map_getptp_unlocked(void); 1310 static vm_paddr_t pmap_large_map_kextract(vm_offset_t va); 1311 #if VM_NRESERVLEVEL > 0 1312 static bool pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va, 1313 vm_page_t mpte, struct rwlock **lockp); 1314 #endif 1315 static bool pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva, 1316 vm_prot_t prot); 1317 static void pmap_pte_props(pt_entry_t *pte, u_long bits, u_long mask); 1318 static void pmap_pti_add_kva_locked(vm_offset_t sva, vm_offset_t eva, 1319 bool exec); 1320 static pdp_entry_t *pmap_pti_pdpe(vm_offset_t va); 1321 static pd_entry_t *pmap_pti_pde(vm_offset_t va); 1322 static void pmap_pti_wire_pte(void *pte); 1323 static int pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva, 1324 struct spglist *free, struct rwlock **lockp); 1325 static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva, 1326 pd_entry_t ptepde, struct spglist *free, struct rwlock **lockp); 1327 static vm_page_t pmap_remove_pt_page(pmap_t 
pmap, vm_offset_t va); 1328 static void pmap_remove_page(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, 1329 struct spglist *free); 1330 static bool pmap_remove_ptes(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, 1331 pd_entry_t *pde, struct spglist *free, 1332 struct rwlock **lockp); 1333 static bool pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, 1334 vm_page_t m, struct rwlock **lockp); 1335 static void pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, 1336 pd_entry_t newpde); 1337 static void pmap_update_pde_invalidate(pmap_t, vm_offset_t va, pd_entry_t pde); 1338 1339 static pd_entry_t *pmap_alloc_pde(pmap_t pmap, vm_offset_t va, vm_page_t *pdpgp, 1340 struct rwlock **lockp); 1341 static vm_page_t pmap_allocpte_alloc(pmap_t pmap, vm_pindex_t ptepindex, 1342 struct rwlock **lockp, vm_offset_t va); 1343 static vm_page_t pmap_allocpte_nosleep(pmap_t pmap, vm_pindex_t ptepindex, 1344 struct rwlock **lockp, vm_offset_t va); 1345 static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va, 1346 struct rwlock **lockp); 1347 1348 static void _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, 1349 struct spglist *free); 1350 static int pmap_unuse_pt(pmap_t, vm_offset_t, pd_entry_t, struct spglist *); 1351 1352 static vm_page_t pmap_alloc_pt_page(pmap_t, vm_pindex_t, int); 1353 static void pmap_free_pt_page(pmap_t, vm_page_t, bool); 1354 1355 /********************/ 1356 /* Inline functions */ 1357 /********************/ 1358 1359 /* 1360 * Return a non-clipped indexes for a given VA, which are page table 1361 * pages indexes at the corresponding level. 1362 */ 1363 static __inline vm_pindex_t 1364 pmap_pde_pindex(vm_offset_t va) 1365 { 1366 return (va >> PDRSHIFT); 1367 } 1368 1369 static __inline vm_pindex_t 1370 pmap_pdpe_pindex(vm_offset_t va) 1371 { 1372 return (NUPDE + (va >> PDPSHIFT)); 1373 } 1374 1375 static __inline vm_pindex_t 1376 pmap_pml4e_pindex(vm_offset_t va) 1377 { 1378 return (NUPDE + NUPDPE + (va >> PML4SHIFT)); 1379 } 1380 1381 static __inline vm_pindex_t 1382 pmap_pml5e_pindex(vm_offset_t va) 1383 { 1384 return (NUPDE + NUPDPE + NUPML4E + (va >> PML5SHIFT)); 1385 } 1386 1387 static __inline pml4_entry_t * 1388 pmap_pml5e(pmap_t pmap, vm_offset_t va) 1389 { 1390 1391 MPASS(pmap_is_la57(pmap)); 1392 return (&pmap->pm_pmltop[pmap_pml5e_index(va)]); 1393 } 1394 1395 static __inline pml4_entry_t * 1396 pmap_pml5e_u(pmap_t pmap, vm_offset_t va) 1397 { 1398 1399 MPASS(pmap_is_la57(pmap)); 1400 return (&pmap->pm_pmltopu[pmap_pml5e_index(va)]); 1401 } 1402 1403 static __inline pml4_entry_t * 1404 pmap_pml5e_to_pml4e(pml5_entry_t *pml5e, vm_offset_t va) 1405 { 1406 pml4_entry_t *pml4e; 1407 1408 /* XXX MPASS(pmap_is_la57(pmap); */ 1409 pml4e = (pml4_entry_t *)PHYS_TO_DMAP(*pml5e & PG_FRAME); 1410 return (&pml4e[pmap_pml4e_index(va)]); 1411 } 1412 1413 /* Return a pointer to the PML4 slot that corresponds to a VA */ 1414 static __inline pml4_entry_t * 1415 pmap_pml4e(pmap_t pmap, vm_offset_t va) 1416 { 1417 pml5_entry_t *pml5e; 1418 pml4_entry_t *pml4e; 1419 pt_entry_t PG_V; 1420 1421 if (pmap_is_la57(pmap)) { 1422 pml5e = pmap_pml5e(pmap, va); 1423 PG_V = pmap_valid_bit(pmap); 1424 if ((*pml5e & PG_V) == 0) 1425 return (NULL); 1426 pml4e = (pml4_entry_t *)PHYS_TO_DMAP(*pml5e & PG_FRAME); 1427 } else { 1428 pml4e = pmap->pm_pmltop; 1429 } 1430 return (&pml4e[pmap_pml4e_index(va)]); 1431 } 1432 1433 static __inline pml4_entry_t * 1434 pmap_pml4e_u(pmap_t pmap, vm_offset_t va) 1435 { 1436 MPASS(!pmap_is_la57(pmap)); 1437 return 
(&pmap->pm_pmltopu[pmap_pml4e_index(va)]); 1438 } 1439 1440 /* Return a pointer to the PDP slot that corresponds to a VA */ 1441 static __inline pdp_entry_t * 1442 pmap_pml4e_to_pdpe(pml4_entry_t *pml4e, vm_offset_t va) 1443 { 1444 pdp_entry_t *pdpe; 1445 1446 pdpe = (pdp_entry_t *)PHYS_TO_DMAP(*pml4e & PG_FRAME); 1447 return (&pdpe[pmap_pdpe_index(va)]); 1448 } 1449 1450 /* Return a pointer to the PDP slot that corresponds to a VA */ 1451 static __inline pdp_entry_t * 1452 pmap_pdpe(pmap_t pmap, vm_offset_t va) 1453 { 1454 pml4_entry_t *pml4e; 1455 pt_entry_t PG_V; 1456 1457 PG_V = pmap_valid_bit(pmap); 1458 pml4e = pmap_pml4e(pmap, va); 1459 if (pml4e == NULL || (*pml4e & PG_V) == 0) 1460 return (NULL); 1461 return (pmap_pml4e_to_pdpe(pml4e, va)); 1462 } 1463 1464 /* Return a pointer to the PD slot that corresponds to a VA */ 1465 static __inline pd_entry_t * 1466 pmap_pdpe_to_pde(pdp_entry_t *pdpe, vm_offset_t va) 1467 { 1468 pd_entry_t *pde; 1469 1470 KASSERT((*pdpe & PG_PS) == 0, 1471 ("%s: pdpe %#lx is a leaf", __func__, *pdpe)); 1472 pde = (pd_entry_t *)PHYS_TO_DMAP(*pdpe & PG_FRAME); 1473 return (&pde[pmap_pde_index(va)]); 1474 } 1475 1476 /* Return a pointer to the PD slot that corresponds to a VA */ 1477 static __inline pd_entry_t * 1478 pmap_pde(pmap_t pmap, vm_offset_t va) 1479 { 1480 pdp_entry_t *pdpe; 1481 pt_entry_t PG_V; 1482 1483 PG_V = pmap_valid_bit(pmap); 1484 pdpe = pmap_pdpe(pmap, va); 1485 if (pdpe == NULL || (*pdpe & PG_V) == 0) 1486 return (NULL); 1487 KASSERT((*pdpe & PG_PS) == 0, 1488 ("pmap_pde for 1G page, pmap %p va %#lx", pmap, va)); 1489 return (pmap_pdpe_to_pde(pdpe, va)); 1490 } 1491 1492 /* Return a pointer to the PT slot that corresponds to a VA */ 1493 static __inline pt_entry_t * 1494 pmap_pde_to_pte(pd_entry_t *pde, vm_offset_t va) 1495 { 1496 pt_entry_t *pte; 1497 1498 KASSERT((*pde & PG_PS) == 0, 1499 ("%s: pde %#lx is a leaf", __func__, *pde)); 1500 pte = (pt_entry_t *)PHYS_TO_DMAP(*pde & PG_FRAME); 1501 return (&pte[pmap_pte_index(va)]); 1502 } 1503 1504 /* Return a pointer to the PT slot that corresponds to a VA */ 1505 static __inline pt_entry_t * 1506 pmap_pte(pmap_t pmap, vm_offset_t va) 1507 { 1508 pd_entry_t *pde; 1509 pt_entry_t PG_V; 1510 1511 PG_V = pmap_valid_bit(pmap); 1512 pde = pmap_pde(pmap, va); 1513 if (pde == NULL || (*pde & PG_V) == 0) 1514 return (NULL); 1515 if ((*pde & PG_PS) != 0) /* compat with i386 pmap_pte() */ 1516 return ((pt_entry_t *)pde); 1517 return (pmap_pde_to_pte(pde, va)); 1518 } 1519 1520 static __inline void 1521 pmap_resident_count_adj(pmap_t pmap, int count) 1522 { 1523 1524 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 1525 KASSERT(pmap->pm_stats.resident_count + count >= 0, 1526 ("pmap %p resident count underflow %ld %d", pmap, 1527 pmap->pm_stats.resident_count, count)); 1528 pmap->pm_stats.resident_count += count; 1529 } 1530 1531 static __inline void 1532 pmap_pt_page_count_pinit(pmap_t pmap, int count) 1533 { 1534 KASSERT(pmap->pm_stats.resident_count + count >= 0, 1535 ("pmap %p resident count underflow %ld %d", pmap, 1536 pmap->pm_stats.resident_count, count)); 1537 pmap->pm_stats.resident_count += count; 1538 } 1539 1540 static __inline void 1541 pmap_pt_page_count_adj(pmap_t pmap, int count) 1542 { 1543 if (pmap == kernel_pmap) 1544 counter_u64_add(kernel_pt_page_count, count); 1545 else { 1546 if (pmap != NULL) 1547 pmap_resident_count_adj(pmap, count); 1548 counter_u64_add(user_pt_page_count, count); 1549 } 1550 } 1551 1552 pt_entry_t vtoptem __read_mostly = ((1ul << (NPTEPGSHIFT + NPDEPGSHIFT + 1553 
NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1) << 3; 1554 vm_offset_t PTmap __read_mostly = (vm_offset_t)P4Tmap; 1555 1556 pt_entry_t * 1557 vtopte(vm_offset_t va) 1558 { 1559 KASSERT(va >= VM_MAXUSER_ADDRESS, ("vtopte on a uva/gpa 0x%0lx", va)); 1560 1561 return ((pt_entry_t *)(PTmap + ((va >> (PAGE_SHIFT - 3)) & vtoptem))); 1562 } 1563 1564 pd_entry_t vtopdem __read_mostly = ((1ul << (NPDEPGSHIFT + NPDPEPGSHIFT + 1565 NPML4EPGSHIFT)) - 1) << 3; 1566 vm_offset_t PDmap __read_mostly = (vm_offset_t)P4Dmap; 1567 1568 static __inline pd_entry_t * 1569 vtopde(vm_offset_t va) 1570 { 1571 KASSERT(va >= VM_MAXUSER_ADDRESS, ("vtopde on a uva/gpa 0x%0lx", va)); 1572 1573 return ((pt_entry_t *)(PDmap + ((va >> (PDRSHIFT - 3)) & vtopdem))); 1574 } 1575 1576 static u_int64_t 1577 allocpages(vm_paddr_t *firstaddr, int n) 1578 { 1579 u_int64_t ret; 1580 1581 ret = *firstaddr; 1582 bzero((void *)ret, n * PAGE_SIZE); 1583 *firstaddr += n * PAGE_SIZE; 1584 return (ret); 1585 } 1586 1587 CTASSERT(powerof2(NDMPML4E)); 1588 1589 /* number of kernel PDP slots */ 1590 #define NKPDPE(ptpgs) howmany(ptpgs, NPDEPG) 1591 1592 static void 1593 nkpt_init(vm_paddr_t addr) 1594 { 1595 int pt_pages; 1596 1597 #ifdef NKPT 1598 pt_pages = NKPT; 1599 #else 1600 pt_pages = howmany(addr - kernphys, NBPDR) + 1; /* +1 for 2M hole @0 */ 1601 pt_pages += NKPDPE(pt_pages); 1602 1603 /* 1604 * Add some slop beyond the bare minimum required for bootstrapping 1605 * the kernel. 1606 * 1607 * This is quite important when allocating KVA for kernel modules. 1608 * The modules are required to be linked in the negative 2GB of 1609 * the address space. If we run out of KVA in this region then 1610 * pmap_growkernel() will need to allocate page table pages to map 1611 * the entire 512GB of KVA space which is an unnecessary tax on 1612 * physical memory. 1613 * 1614 * Secondly, device memory mapped as part of setting up the low- 1615 * level console(s) is taken from KVA, starting at virtual_avail. 1616 * This is because cninit() is called after pmap_bootstrap() but 1617 * before vm_mem_init() and pmap_init(). 20MB for a frame buffer 1618 * is not uncommon. 1619 */ 1620 pt_pages += 32; /* 64MB additional slop. */ 1621 #endif 1622 nkpt = pt_pages; 1623 } 1624 1625 /* 1626 * Returns the proper write/execute permission for a physical page that is 1627 * part of the initial boot allocations. 1628 * 1629 * If the page has kernel text, it is marked as read-only. If the page has 1630 * kernel read-only data, it is marked as read-only/not-executable. If the 1631 * page has only read-write data, it is marked as read-write/not-executable. 1632 * If the page is below/above the kernel range, it is marked as read-write. 1633 * 1634 * This function operates on 2M pages, since we map the kernel space that 1635 * way. 1636 */ 1637 static inline pt_entry_t 1638 bootaddr_rwx(vm_paddr_t pa) 1639 { 1640 /* 1641 * The kernel is loaded at a 2MB-aligned address, and memory below that 1642 * need not be executable. The .bss section is padded to a 2MB 1643 * boundary, so memory following the kernel need not be executable 1644 * either. Preloaded kernel modules have their mapping permissions 1645 * fixed up by the linker. 1646 */ 1647 if (pa < trunc_2mpage(kernphys + btext - KERNSTART) || 1648 pa >= trunc_2mpage(kernphys + _end - KERNSTART)) 1649 return (X86_PG_RW | pg_nx); 1650 1651 /* 1652 * The linker should ensure that the read-only and read-write 1653 * portions don't share the same 2M page, so this shouldn't 1654 * impact read-only data. 
However, in any case, any page with 1655 * read-write data needs to be read-write. 1656 */ 1657 if (pa >= trunc_2mpage(kernphys + brwsection - KERNSTART)) 1658 return (X86_PG_RW | pg_nx); 1659 1660 /* 1661 * Mark any 2M page containing kernel text as read-only. Mark 1662 * other pages with read-only data as read-only and not executable. 1663 * (It is likely a small portion of the read-only data section will 1664 * be marked as read-only, but executable. This should be acceptable 1665 * since the read-only protection will keep the data from changing.) 1666 * Note that fixups to the .text section will still work until we 1667 * set CR0.WP. 1668 */ 1669 if (pa < round_2mpage(kernphys + etext - KERNSTART)) 1670 return (0); 1671 return (pg_nx); 1672 } 1673 1674 static void 1675 create_pagetables(vm_paddr_t *firstaddr) 1676 { 1677 pd_entry_t *pd_p; 1678 pdp_entry_t *pdp_p; 1679 pml4_entry_t *p4_p; 1680 uint64_t DMPDkernphys; 1681 vm_paddr_t pax; 1682 #ifdef KASAN 1683 pt_entry_t *pt_p; 1684 uint64_t KASANPDphys, KASANPTphys, KASANphys; 1685 vm_offset_t kasankernbase; 1686 int kasankpdpi, kasankpdi, nkasanpte; 1687 #endif 1688 int i, j, ndm1g, nkpdpe, nkdmpde; 1689 1690 TSENTER(); 1691 /* Allocate page table pages for the direct map */ 1692 ndmpdp = howmany(ptoa(Maxmem), NBPDP); 1693 if (ndmpdp < 4) /* Minimum 4GB of dirmap */ 1694 ndmpdp = 4; 1695 ndmpdpphys = howmany(ndmpdp, NPDPEPG); 1696 if (ndmpdpphys > NDMPML4E) { 1697 /* 1698 * Each NDMPML4E allows 512 GB, so limit to that, 1699 * and then readjust ndmpdp and ndmpdpphys. 1700 */ 1701 printf("NDMPML4E limits system to %d GB\n", NDMPML4E * 512); 1702 Maxmem = atop(NDMPML4E * NBPML4); 1703 ndmpdpphys = NDMPML4E; 1704 ndmpdp = NDMPML4E * NPDEPG; 1705 } 1706 DMPDPphys = allocpages(firstaddr, ndmpdpphys); 1707 ndm1g = 0; 1708 if ((amd_feature & AMDID_PAGE1GB) != 0) { 1709 /* 1710 * Calculate the number of 1G pages that will fully fit in 1711 * Maxmem. 1712 */ 1713 ndm1g = ptoa(Maxmem) >> PDPSHIFT; 1714 1715 /* 1716 * Allocate 2M pages for the kernel. These will be used in 1717 * place of the one or more 1G pages from ndm1g that maps 1718 * kernel memory into DMAP. 1719 */ 1720 nkdmpde = howmany((vm_offset_t)brwsection - KERNSTART + 1721 kernphys - rounddown2(kernphys, NBPDP), NBPDP); 1722 DMPDkernphys = allocpages(firstaddr, nkdmpde); 1723 } 1724 if (ndm1g < ndmpdp) 1725 DMPDphys = allocpages(firstaddr, ndmpdp - ndm1g); 1726 dmaplimit = (vm_paddr_t)ndmpdp << PDPSHIFT; 1727 1728 /* Allocate pages. */ 1729 KPML4phys = allocpages(firstaddr, 1); 1730 KPDPphys = allocpages(firstaddr, NKPML4E); 1731 #ifdef KASAN 1732 KASANPDPphys = allocpages(firstaddr, NKASANPML4E); 1733 KASANPDphys = allocpages(firstaddr, 1); 1734 #endif 1735 #ifdef KMSAN 1736 /* 1737 * The KMSAN shadow maps are initially left unpopulated, since there is 1738 * no need to shadow memory above KERNBASE. 1739 */ 1740 KMSANSHADPDPphys = allocpages(firstaddr, NKMSANSHADPML4E); 1741 KMSANORIGPDPphys = allocpages(firstaddr, NKMSANORIGPML4E); 1742 #endif 1743 1744 /* 1745 * Allocate the initial number of kernel page table pages required to 1746 * bootstrap. We defer this until after all memory-size dependent 1747 * allocations are done (e.g. direct map), so that we don't have to 1748 * build in too much slop in our estimate. 1749 * 1750 * Note that when NKPML4E > 1, we have an empty page underneath 1751 * all but the KPML4I'th one, so we need NKPML4E-1 extra (zeroed) 1752 * pages. (pmap_enter requires a PD page to exist for each KPML4E.) 
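	 *
	 * (Added note: nkpt_init() below sizes nkpt from the bootstrap
	 * footprint plus slop, and NKPDPE(nkpt) == howmany(nkpt, NPDEPG)
	 * PD pages are then allocated to hold the entries for those
	 * page table pages.)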
1753 */ 1754 nkpt_init(*firstaddr); 1755 nkpdpe = NKPDPE(nkpt); 1756 1757 KPTphys = allocpages(firstaddr, nkpt); 1758 KPDphys = allocpages(firstaddr, nkpdpe); 1759 1760 #ifdef KASAN 1761 nkasanpte = howmany(nkpt, KASAN_SHADOW_SCALE); 1762 KASANPTphys = allocpages(firstaddr, nkasanpte); 1763 KASANphys = allocpages(firstaddr, nkasanpte * NPTEPG); 1764 #endif 1765 1766 /* 1767 * Connect the zero-filled PT pages to their PD entries. This 1768 * implicitly maps the PT pages at their correct locations within 1769 * the PTmap. 1770 */ 1771 pd_p = (pd_entry_t *)KPDphys; 1772 for (i = 0; i < nkpt; i++) 1773 pd_p[i] = (KPTphys + ptoa(i)) | X86_PG_RW | X86_PG_V; 1774 1775 /* 1776 * Map from start of the kernel in physical memory (staging 1777 * area) to the end of loader preallocated memory using 2MB 1778 * pages. This replaces some of the PD entries created above. 1779 * For compatibility, identity map 2M at the start. 1780 */ 1781 pd_p[0] = X86_PG_V | PG_PS | pg_g | X86_PG_M | X86_PG_A | 1782 X86_PG_RW | pg_nx; 1783 for (i = 1, pax = kernphys; pax < KERNend; i++, pax += NBPDR) { 1784 /* Preset PG_M and PG_A because demotion expects it. */ 1785 pd_p[i] = pax | X86_PG_V | PG_PS | pg_g | X86_PG_M | 1786 X86_PG_A | bootaddr_rwx(pax); 1787 } 1788 1789 /* 1790 * Because we map the physical blocks in 2M pages, adjust firstaddr 1791 * to record the physical blocks we've actually mapped into kernel 1792 * virtual address space. 1793 */ 1794 if (*firstaddr < round_2mpage(KERNend)) 1795 *firstaddr = round_2mpage(KERNend); 1796 1797 /* And connect up the PD to the PDP (leaving room for L4 pages) */ 1798 pdp_p = (pdp_entry_t *)(KPDPphys + ptoa(KPML4I - KPML4BASE)); 1799 for (i = 0; i < nkpdpe; i++) 1800 pdp_p[i + KPDPI] = (KPDphys + ptoa(i)) | X86_PG_RW | X86_PG_V; 1801 1802 #ifdef KASAN 1803 kasankernbase = kasan_md_addr_to_shad(KERNBASE); 1804 kasankpdpi = pmap_pdpe_index(kasankernbase); 1805 kasankpdi = pmap_pde_index(kasankernbase); 1806 1807 pdp_p = (pdp_entry_t *)KASANPDPphys; 1808 pdp_p[kasankpdpi] = (KASANPDphys | X86_PG_RW | X86_PG_V | pg_nx); 1809 1810 pd_p = (pd_entry_t *)KASANPDphys; 1811 for (i = 0; i < nkasanpte; i++) 1812 pd_p[i + kasankpdi] = (KASANPTphys + ptoa(i)) | X86_PG_RW | 1813 X86_PG_V | pg_nx; 1814 1815 pt_p = (pt_entry_t *)KASANPTphys; 1816 for (i = 0; i < nkasanpte * NPTEPG; i++) 1817 pt_p[i] = (KASANphys + ptoa(i)) | X86_PG_RW | X86_PG_V | 1818 X86_PG_M | X86_PG_A | pg_nx; 1819 #endif 1820 1821 /* 1822 * Now, set up the direct map region using 2MB and/or 1GB pages. If 1823 * the end of physical memory is not aligned to a 1GB page boundary, 1824 * then the residual physical memory is mapped with 2MB pages. Later, 1825 * if pmap_mapdev{_attr}() uses the direct map for non-write-back 1826 * memory, pmap_change_attr() will demote any 2MB or 1GB page mappings 1827 * that are partially used. 1828 */ 1829 pd_p = (pd_entry_t *)DMPDphys; 1830 for (i = NPDEPG * ndm1g, j = 0; i < NPDEPG * ndmpdp; i++, j++) { 1831 pd_p[j] = (vm_paddr_t)i << PDRSHIFT; 1832 /* Preset PG_M and PG_A because demotion expects it. */ 1833 pd_p[j] |= X86_PG_RW | X86_PG_V | PG_PS | pg_g | 1834 X86_PG_M | X86_PG_A | pg_nx; 1835 } 1836 pdp_p = (pdp_entry_t *)DMPDPphys; 1837 for (i = 0; i < ndm1g; i++) { 1838 pdp_p[i] = (vm_paddr_t)i << PDPSHIFT; 1839 /* Preset PG_M and PG_A because demotion expects it. 
*/ 1840 pdp_p[i] |= X86_PG_RW | X86_PG_V | PG_PS | pg_g | 1841 X86_PG_M | X86_PG_A | pg_nx; 1842 } 1843 for (j = 0; i < ndmpdp; i++, j++) { 1844 pdp_p[i] = DMPDphys + ptoa(j); 1845 pdp_p[i] |= X86_PG_RW | X86_PG_V | pg_nx; 1846 } 1847 1848 /* 1849 * Instead of using a 1G page for the memory containing the kernel, 1850 * use 2M pages with read-only and no-execute permissions. (If using 1G 1851 * pages, this will partially overwrite the PDPEs above.) 1852 */ 1853 if (ndm1g > 0) { 1854 pd_p = (pd_entry_t *)DMPDkernphys; 1855 for (i = 0, pax = rounddown2(kernphys, NBPDP); 1856 i < NPDEPG * nkdmpde; i++, pax += NBPDR) { 1857 pd_p[i] = pax | X86_PG_V | PG_PS | pg_g | X86_PG_M | 1858 X86_PG_A | pg_nx | bootaddr_rwx(pax); 1859 } 1860 j = rounddown2(kernphys, NBPDP) >> PDPSHIFT; 1861 for (i = 0; i < nkdmpde; i++) { 1862 pdp_p[i + j] = (DMPDkernphys + ptoa(i)) | 1863 X86_PG_RW | X86_PG_V | pg_nx; 1864 } 1865 } 1866 1867 /* And recursively map PML4 to itself in order to get PTmap */ 1868 p4_p = (pml4_entry_t *)KPML4phys; 1869 p4_p[PML4PML4I] = KPML4phys; 1870 p4_p[PML4PML4I] |= X86_PG_RW | X86_PG_V | pg_nx; 1871 1872 #ifdef KASAN 1873 /* Connect the KASAN shadow map slots up to the PML4. */ 1874 for (i = 0; i < NKASANPML4E; i++) { 1875 p4_p[KASANPML4I + i] = KASANPDPphys + ptoa(i); 1876 p4_p[KASANPML4I + i] |= X86_PG_RW | X86_PG_V | pg_nx; 1877 } 1878 #endif 1879 1880 #ifdef KMSAN 1881 /* Connect the KMSAN shadow map slots up to the PML4. */ 1882 for (i = 0; i < NKMSANSHADPML4E; i++) { 1883 p4_p[KMSANSHADPML4I + i] = KMSANSHADPDPphys + ptoa(i); 1884 p4_p[KMSANSHADPML4I + i] |= X86_PG_RW | X86_PG_V | pg_nx; 1885 } 1886 1887 /* Connect the KMSAN origin map slots up to the PML4. */ 1888 for (i = 0; i < NKMSANORIGPML4E; i++) { 1889 p4_p[KMSANORIGPML4I + i] = KMSANORIGPDPphys + ptoa(i); 1890 p4_p[KMSANORIGPML4I + i] |= X86_PG_RW | X86_PG_V | pg_nx; 1891 } 1892 #endif 1893 1894 /* Connect the Direct Map slots up to the PML4. */ 1895 for (i = 0; i < ndmpdpphys; i++) { 1896 p4_p[DMPML4I + i] = DMPDPphys + ptoa(i); 1897 p4_p[DMPML4I + i] |= X86_PG_RW | X86_PG_V | pg_nx; 1898 } 1899 1900 /* Connect the KVA slots up to the PML4 */ 1901 for (i = 0; i < NKPML4E; i++) { 1902 p4_p[KPML4BASE + i] = KPDPphys + ptoa(i); 1903 p4_p[KPML4BASE + i] |= X86_PG_RW | X86_PG_V; 1904 } 1905 1906 kernel_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(KPML4phys); 1907 TSEXIT(); 1908 } 1909 1910 /* 1911 * Bootstrap the system enough to run with virtual memory. 1912 * 1913 * On amd64 this is called after mapping has already been enabled 1914 * and just syncs the pmap module with what has already been done. 1915 * [We can't call it easily with mapping off since the kernel is not 1916 * mapped with PA == VA, hence we would have to relocate every address 1917 * from the linked base (virtual) address "KERNBASE" to the actual 1918 * (physical) address starting relative to 0] 1919 */ 1920 void 1921 pmap_bootstrap(vm_paddr_t *firstaddr) 1922 { 1923 vm_offset_t va; 1924 pt_entry_t *pte, *pcpu_pte; 1925 struct region_descriptor r_gdt; 1926 uint64_t cr4, pcpu0_phys; 1927 u_long res; 1928 int i; 1929 1930 TSENTER(); 1931 KERNend = *firstaddr; 1932 res = atop(KERNend - (vm_paddr_t)kernphys); 1933 1934 if (!pti) 1935 pg_g = X86_PG_G; 1936 1937 /* 1938 * Create an initial set of page tables to run the kernel in. 
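 * create_pagetables(), defined above, builds the initial kernel text and
 * data mappings, the direct map, and (when compiled in) the KASAN/KMSAN
 * shadow map skeletons, all rooted at KPML4phys.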
1939 */ 1940 create_pagetables(firstaddr); 1941 1942 pcpu0_phys = allocpages(firstaddr, 1); 1943 1944 /* 1945 * Add a physical memory segment (vm_phys_seg) corresponding to the 1946 * preallocated kernel page table pages so that vm_page structures 1947 * representing these pages will be created. The vm_page structures 1948 * are required for promotion of the corresponding kernel virtual 1949 * addresses to superpage mappings. 1950 */ 1951 vm_phys_early_add_seg(KPTphys, KPTphys + ptoa(nkpt)); 1952 1953 /* 1954 * Account for the virtual addresses mapped by create_pagetables(). 1955 */ 1956 virtual_avail = (vm_offset_t)KERNSTART + round_2mpage(KERNend - 1957 (vm_paddr_t)kernphys); 1958 virtual_end = VM_MAX_KERNEL_ADDRESS; 1959 1960 /* 1961 * Enable PG_G global pages, then switch to the kernel page 1962 * table from the bootstrap page table. After the switch, it 1963 * is possible to enable SMEP and SMAP since PG_U bits are 1964 * correct now. 1965 */ 1966 cr4 = rcr4(); 1967 cr4 |= CR4_PGE; 1968 load_cr4(cr4); 1969 load_cr3(KPML4phys); 1970 if (cpu_stdext_feature & CPUID_STDEXT_SMEP) 1971 cr4 |= CR4_SMEP; 1972 if (cpu_stdext_feature & CPUID_STDEXT_SMAP) 1973 cr4 |= CR4_SMAP; 1974 load_cr4(cr4); 1975 1976 /* 1977 * Initialize the kernel pmap (which is statically allocated). 1978 * Count bootstrap data as being resident in case any of this data is 1979 * later unmapped (using pmap_remove()) and freed. 1980 */ 1981 PMAP_LOCK_INIT(kernel_pmap); 1982 kernel_pmap->pm_pmltop = kernel_pml4; 1983 kernel_pmap->pm_cr3 = KPML4phys; 1984 kernel_pmap->pm_ucr3 = PMAP_NO_CR3; 1985 TAILQ_INIT(&kernel_pmap->pm_pvchunk); 1986 kernel_pmap->pm_stats.resident_count = res; 1987 vm_radix_init(&kernel_pmap->pm_root); 1988 kernel_pmap->pm_flags = pmap_flags; 1989 if ((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0) { 1990 rangeset_init(&kernel_pmap->pm_pkru, pkru_dup_range, 1991 pkru_free_range, kernel_pmap, M_NOWAIT); 1992 } 1993 1994 /* 1995 * The kernel pmap is always active on all CPUs. Once CPUs are 1996 * enumerated, the mask will be set equal to all_cpus. 1997 */ 1998 CPU_FILL(&kernel_pmap->pm_active); 1999 2000 /* 2001 * Initialize the TLB invalidations generation number lock. 2002 */ 2003 mtx_init(&invl_gen_mtx, "invlgn", NULL, MTX_DEF); 2004 2005 /* 2006 * Reserve some special page table entries/VA space for temporary 2007 * mapping of pages. 2008 */ 2009 #define SYSMAP(c, p, v, n) \ 2010 v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n); 2011 2012 va = virtual_avail; 2013 pte = vtopte(va); 2014 2015 /* 2016 * Crashdump maps. The first page is reused as CMAP1 for the 2017 * memory test. 2018 */ 2019 SYSMAP(caddr_t, CMAP1, crashdumpmap, MAXDUMPPGS) 2020 CADDR1 = crashdumpmap; 2021 2022 SYSMAP(struct pcpu *, pcpu_pte, __pcpu, MAXCPU); 2023 virtual_avail = va; 2024 2025 /* 2026 * Map the BSP PCPU now, the rest of the PCPUs are mapped by 2027 * amd64_mp_alloc_pcpu()/start_all_aps() when we know the 2028 * number of CPUs and NUMA affinity. 2029 */ 2030 pcpu_pte[0] = pcpu0_phys | X86_PG_V | X86_PG_RW | pg_g | pg_nx | 2031 X86_PG_M | X86_PG_A; 2032 for (i = 1; i < MAXCPU; i++) 2033 pcpu_pte[i] = 0; 2034 2035 /* 2036 * Re-initialize PCPU area for BSP after switching. 2037 * Make hardware use gdt and common_tss from the new PCPU. 
2038 */ 2039 STAILQ_INIT(&cpuhead); 2040 wrmsr(MSR_GSBASE, (uint64_t)&__pcpu[0]); 2041 pcpu_init(&__pcpu[0], 0, sizeof(struct pcpu)); 2042 amd64_bsp_pcpu_init1(&__pcpu[0]); 2043 amd64_bsp_ist_init(&__pcpu[0]); 2044 __pcpu[0].pc_common_tss.tss_iobase = sizeof(struct amd64tss) + 2045 IOPERM_BITMAP_SIZE; 2046 memcpy(__pcpu[0].pc_gdt, temp_bsp_pcpu.pc_gdt, NGDT * 2047 sizeof(struct user_segment_descriptor)); 2048 gdt_segs[GPROC0_SEL].ssd_base = (uintptr_t)&__pcpu[0].pc_common_tss; 2049 ssdtosyssd(&gdt_segs[GPROC0_SEL], 2050 (struct system_segment_descriptor *)&__pcpu[0].pc_gdt[GPROC0_SEL]); 2051 r_gdt.rd_limit = NGDT * sizeof(struct user_segment_descriptor) - 1; 2052 r_gdt.rd_base = (long)__pcpu[0].pc_gdt; 2053 lgdt(&r_gdt); 2054 wrmsr(MSR_GSBASE, (uint64_t)&__pcpu[0]); 2055 ltr(GSEL(GPROC0_SEL, SEL_KPL)); 2056 __pcpu[0].pc_dynamic = temp_bsp_pcpu.pc_dynamic; 2057 __pcpu[0].pc_acpi_id = temp_bsp_pcpu.pc_acpi_id; 2058 2059 /* 2060 * Initialize the PAT MSR. 2061 * pmap_init_pat() clears and sets CR4_PGE, which, as a 2062 * side-effect, invalidates stale PG_G TLB entries that might 2063 * have been created in our pre-boot environment. 2064 */ 2065 pmap_init_pat(); 2066 2067 /* Initialize TLB Context Id. */ 2068 if (pmap_pcid_enabled) { 2069 kernel_pmap->pm_pcidp = (void *)(uintptr_t) 2070 offsetof(struct pcpu, pc_kpmap_store); 2071 2072 PCPU_SET(kpmap_store.pm_pcid, PMAP_PCID_KERN); 2073 PCPU_SET(kpmap_store.pm_gen, 1); 2074 2075 /* 2076 * PMAP_PCID_KERN + 1 is used for initialization of 2077 * proc0 pmap. The pmap' pcid state might be used by 2078 * EFIRT entry before first context switch, so it 2079 * needs to be valid. 2080 */ 2081 PCPU_SET(pcid_next, PMAP_PCID_KERN + 2); 2082 PCPU_SET(pcid_gen, 1); 2083 2084 /* 2085 * pcpu area for APs is zeroed during AP startup. 2086 * pc_pcid_next and pc_pcid_gen are initialized by AP 2087 * during pcpu setup. 2088 */ 2089 load_cr4(rcr4() | CR4_PCIDE); 2090 } 2091 TSEXIT(); 2092 } 2093 2094 /* 2095 * Setup the PAT MSR. 2096 */ 2097 void 2098 pmap_init_pat(void) 2099 { 2100 uint64_t pat_msr; 2101 u_long cr0, cr4; 2102 int i; 2103 2104 /* Bail if this CPU doesn't implement PAT. */ 2105 if ((cpu_feature & CPUID_PAT) == 0) 2106 panic("no PAT??"); 2107 2108 /* Set default PAT index table. */ 2109 for (i = 0; i < PAT_INDEX_SIZE; i++) 2110 pat_index[i] = -1; 2111 pat_index[PAT_WRITE_BACK] = 0; 2112 pat_index[PAT_WRITE_THROUGH] = 1; 2113 pat_index[PAT_UNCACHEABLE] = 3; 2114 pat_index[PAT_WRITE_COMBINING] = 6; 2115 pat_index[PAT_WRITE_PROTECTED] = 5; 2116 pat_index[PAT_UNCACHED] = 2; 2117 2118 /* 2119 * Initialize default PAT entries. 2120 * Leave the indices 0-3 at the default of WB, WT, UC-, and UC. 2121 * Program 5 and 6 as WP and WC. 2122 * 2123 * Leave 4 and 7 as WB and UC. Note that a recursive page table 2124 * mapping for a 2M page uses a PAT value with the bit 3 set due 2125 * to its overload with PG_PS. 2126 */ 2127 pat_msr = PAT_VALUE(0, PAT_WRITE_BACK) | 2128 PAT_VALUE(1, PAT_WRITE_THROUGH) | 2129 PAT_VALUE(2, PAT_UNCACHED) | 2130 PAT_VALUE(3, PAT_UNCACHEABLE) | 2131 PAT_VALUE(4, PAT_WRITE_BACK) | 2132 PAT_VALUE(5, PAT_WRITE_PROTECTED) | 2133 PAT_VALUE(6, PAT_WRITE_COMBINING) | 2134 PAT_VALUE(7, PAT_UNCACHEABLE); 2135 2136 /* Disable PGE. */ 2137 cr4 = rcr4(); 2138 load_cr4(cr4 & ~CR4_PGE); 2139 2140 /* Disable caches (CD = 1, NW = 0). */ 2141 cr0 = rcr0(); 2142 load_cr0((cr0 & ~CR0_NW) | CR0_CD); 2143 2144 /* Flushes caches and TLBs. */ 2145 wbinvd(); 2146 invltlb(); 2147 2148 /* Update PAT and index table. 
*/ 2149 wrmsr(MSR_PAT, pat_msr); 2150 2151 /* Flush caches and TLBs again. */ 2152 wbinvd(); 2153 invltlb(); 2154 2155 /* Restore caches and PGE. */ 2156 load_cr0(cr0); 2157 load_cr4(cr4); 2158 } 2159 2160 vm_page_t 2161 pmap_page_alloc_below_4g(bool zeroed) 2162 { 2163 return (vm_page_alloc_noobj_contig((zeroed ? VM_ALLOC_ZERO : 0), 2164 1, 0, (1ULL << 32), PAGE_SIZE, 0, VM_MEMATTR_DEFAULT)); 2165 } 2166 2167 extern const char la57_trampoline[], la57_trampoline_gdt_desc[], 2168 la57_trampoline_gdt[], la57_trampoline_end[]; 2169 2170 static void 2171 pmap_bootstrap_la57(void *arg __unused) 2172 { 2173 char *v_code; 2174 pml5_entry_t *v_pml5; 2175 pml4_entry_t *v_pml4; 2176 pdp_entry_t *v_pdp; 2177 pd_entry_t *v_pd; 2178 pt_entry_t *v_pt; 2179 vm_page_t m_code, m_pml4, m_pdp, m_pd, m_pt, m_pml5; 2180 void (*la57_tramp)(uint64_t pml5); 2181 struct region_descriptor r_gdt; 2182 2183 if ((cpu_stdext_feature2 & CPUID_STDEXT2_LA57) == 0) 2184 return; 2185 la57 = 1; 2186 TUNABLE_INT_FETCH("vm.pmap.la57", &la57); 2187 if (!la57) 2188 return; 2189 2190 r_gdt.rd_limit = NGDT * sizeof(struct user_segment_descriptor) - 1; 2191 r_gdt.rd_base = (long)__pcpu[0].pc_gdt; 2192 2193 m_code = pmap_page_alloc_below_4g(true); 2194 v_code = (char *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_code)); 2195 m_pml5 = pmap_page_alloc_below_4g(true); 2196 KPML5phys = VM_PAGE_TO_PHYS(m_pml5); 2197 v_pml5 = (pml5_entry_t *)PHYS_TO_DMAP(KPML5phys); 2198 m_pml4 = pmap_page_alloc_below_4g(true); 2199 v_pml4 = (pdp_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_pml4)); 2200 m_pdp = pmap_page_alloc_below_4g(true); 2201 v_pdp = (pdp_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_pdp)); 2202 m_pd = pmap_page_alloc_below_4g(true); 2203 v_pd = (pdp_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_pd)); 2204 m_pt = pmap_page_alloc_below_4g(true); 2205 v_pt = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_pt)); 2206 2207 /* 2208 * Map m_code 1:1, it appears below 4G in KVA due to physical 2209 * address being below 4G. Since kernel KVA is in upper half, 2210 * the pml4e should be zero and free for temporary use. 2211 */ 2212 kernel_pmap->pm_pmltop[pmap_pml4e_index(VM_PAGE_TO_PHYS(m_code))] = 2213 VM_PAGE_TO_PHYS(m_pdp) | X86_PG_V | X86_PG_RW | X86_PG_A | 2214 X86_PG_M; 2215 v_pdp[pmap_pdpe_index(VM_PAGE_TO_PHYS(m_code))] = 2216 VM_PAGE_TO_PHYS(m_pd) | X86_PG_V | X86_PG_RW | X86_PG_A | 2217 X86_PG_M; 2218 v_pd[pmap_pde_index(VM_PAGE_TO_PHYS(m_code))] = 2219 VM_PAGE_TO_PHYS(m_pt) | X86_PG_V | X86_PG_RW | X86_PG_A | 2220 X86_PG_M; 2221 v_pt[pmap_pte_index(VM_PAGE_TO_PHYS(m_code))] = 2222 VM_PAGE_TO_PHYS(m_code) | X86_PG_V | X86_PG_RW | X86_PG_A | 2223 X86_PG_M; 2224 2225 /* 2226 * Add pml5 entry at top of KVA pointing to existing pml4 table, 2227 * entering all existing kernel mappings into level 5 table. 2228 */ 2229 v_pml5[pmap_pml5e_index(UPT_MAX_ADDRESS)] = KPML4phys | X86_PG_V | 2230 X86_PG_RW | X86_PG_A | X86_PG_M; 2231 2232 /* 2233 * Add pml5 entry for 1:1 trampoline mapping after LA57 is turned on. 2234 */ 2235 v_pml5[pmap_pml5e_index(VM_PAGE_TO_PHYS(m_code))] = 2236 VM_PAGE_TO_PHYS(m_pml4) | X86_PG_V | X86_PG_RW | X86_PG_A | 2237 X86_PG_M; 2238 v_pml4[pmap_pml4e_index(VM_PAGE_TO_PHYS(m_code))] = 2239 VM_PAGE_TO_PHYS(m_pdp) | X86_PG_V | X86_PG_RW | X86_PG_A | 2240 X86_PG_M; 2241 2242 /* 2243 * Copy and call the 48->57 trampoline, hope we return there, alive. 
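 * The trampoline must execute from an identity-mapped page because
 * changing CR4.LA57 requires paging to be temporarily disabled.  The
 * store below rewrites the base in the trampoline's GDT descriptor to
 * the physical address of the GDT embedded in the copy, and KPML5phys
 * is passed as the new top-level page table.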
2244 */ 2245 bcopy(la57_trampoline, v_code, la57_trampoline_end - la57_trampoline); 2246 *(u_long *)(v_code + 2 + (la57_trampoline_gdt_desc - la57_trampoline)) = 2247 la57_trampoline_gdt - la57_trampoline + VM_PAGE_TO_PHYS(m_code); 2248 la57_tramp = (void (*)(uint64_t))VM_PAGE_TO_PHYS(m_code); 2249 pmap_invalidate_all(kernel_pmap); 2250 if (bootverbose) { 2251 printf("entering LA57 trampoline at %#lx\n", 2252 (vm_offset_t)la57_tramp); 2253 } 2254 la57_tramp(KPML5phys); 2255 2256 /* 2257 * gdt was necessary reset, switch back to our gdt. 2258 */ 2259 lgdt(&r_gdt); 2260 wrmsr(MSR_GSBASE, (uint64_t)&__pcpu[0]); 2261 load_ds(_udatasel); 2262 load_es(_udatasel); 2263 load_fs(_ufssel); 2264 ssdtosyssd(&gdt_segs[GPROC0_SEL], 2265 (struct system_segment_descriptor *)&__pcpu[0].pc_gdt[GPROC0_SEL]); 2266 ltr(GSEL(GPROC0_SEL, SEL_KPL)); 2267 lidt(&r_idt); 2268 2269 if (bootverbose) 2270 printf("LA57 trampoline returned, CR4 %#lx\n", rcr4()); 2271 2272 /* 2273 * Now unmap the trampoline, and free the pages. 2274 * Clear pml5 entry used for 1:1 trampoline mapping. 2275 */ 2276 pte_clear(&v_pml5[pmap_pml5e_index(VM_PAGE_TO_PHYS(m_code))]); 2277 invlpg((vm_offset_t)v_code); 2278 vm_page_free(m_code); 2279 vm_page_free(m_pdp); 2280 vm_page_free(m_pd); 2281 vm_page_free(m_pt); 2282 2283 /* 2284 * Recursively map PML5 to itself in order to get PTmap and 2285 * PDmap. 2286 */ 2287 v_pml5[PML5PML5I] = KPML5phys | X86_PG_RW | X86_PG_V | pg_nx; 2288 2289 vtoptem = ((1ul << (NPTEPGSHIFT + NPDEPGSHIFT + NPDPEPGSHIFT + 2290 NPML4EPGSHIFT + NPML5EPGSHIFT)) - 1) << 3; 2291 PTmap = (vm_offset_t)P5Tmap; 2292 vtopdem = ((1ul << (NPDEPGSHIFT + NPDPEPGSHIFT + 2293 NPML4EPGSHIFT + NPML5EPGSHIFT)) - 1) << 3; 2294 PDmap = (vm_offset_t)P5Dmap; 2295 2296 kernel_pmap->pm_cr3 = KPML5phys; 2297 kernel_pmap->pm_pmltop = v_pml5; 2298 pmap_pt_page_count_adj(kernel_pmap, 1); 2299 } 2300 SYSINIT(la57, SI_SUB_KMEM, SI_ORDER_ANY, pmap_bootstrap_la57, NULL); 2301 2302 /* 2303 * Initialize a vm_page's machine-dependent fields. 2304 */ 2305 void 2306 pmap_page_init(vm_page_t m) 2307 { 2308 2309 TAILQ_INIT(&m->md.pv_list); 2310 m->md.pat_mode = PAT_WRITE_BACK; 2311 } 2312 2313 static int pmap_allow_2m_x_ept; 2314 SYSCTL_INT(_vm_pmap, OID_AUTO, allow_2m_x_ept, CTLFLAG_RWTUN | CTLFLAG_NOFETCH, 2315 &pmap_allow_2m_x_ept, 0, 2316 "Allow executable superpage mappings in EPT"); 2317 2318 void 2319 pmap_allow_2m_x_ept_recalculate(void) 2320 { 2321 /* 2322 * SKL002, SKL012S. Since the EPT format is only used by 2323 * Intel CPUs, the vendor check is merely a formality. 
2324 */ 2325 if (!(cpu_vendor_id != CPU_VENDOR_INTEL || 2326 (cpu_ia32_arch_caps & IA32_ARCH_CAP_IF_PSCHANGE_MC_NO) != 0 || 2327 (CPUID_TO_FAMILY(cpu_id) == 0x6 && 2328 (CPUID_TO_MODEL(cpu_id) == 0x26 || /* Atoms */ 2329 CPUID_TO_MODEL(cpu_id) == 0x27 || 2330 CPUID_TO_MODEL(cpu_id) == 0x35 || 2331 CPUID_TO_MODEL(cpu_id) == 0x36 || 2332 CPUID_TO_MODEL(cpu_id) == 0x37 || 2333 CPUID_TO_MODEL(cpu_id) == 0x86 || 2334 CPUID_TO_MODEL(cpu_id) == 0x1c || 2335 CPUID_TO_MODEL(cpu_id) == 0x4a || 2336 CPUID_TO_MODEL(cpu_id) == 0x4c || 2337 CPUID_TO_MODEL(cpu_id) == 0x4d || 2338 CPUID_TO_MODEL(cpu_id) == 0x5a || 2339 CPUID_TO_MODEL(cpu_id) == 0x5c || 2340 CPUID_TO_MODEL(cpu_id) == 0x5d || 2341 CPUID_TO_MODEL(cpu_id) == 0x5f || 2342 CPUID_TO_MODEL(cpu_id) == 0x6e || 2343 CPUID_TO_MODEL(cpu_id) == 0x7a || 2344 CPUID_TO_MODEL(cpu_id) == 0x57 || /* Knights */ 2345 CPUID_TO_MODEL(cpu_id) == 0x85)))) 2346 pmap_allow_2m_x_ept = 1; 2347 #ifndef BURN_BRIDGES 2348 TUNABLE_INT_FETCH("hw.allow_2m_x_ept", &pmap_allow_2m_x_ept); 2349 #endif 2350 TUNABLE_INT_FETCH("vm.pmap.allow_2m_x_ept", &pmap_allow_2m_x_ept); 2351 } 2352 2353 static bool 2354 pmap_allow_2m_x_page(pmap_t pmap, bool executable) 2355 { 2356 2357 return (pmap->pm_type != PT_EPT || !executable || 2358 !pmap_allow_2m_x_ept); 2359 } 2360 2361 #ifdef NUMA 2362 static void 2363 pmap_init_pv_table(void) 2364 { 2365 struct pmap_large_md_page *pvd; 2366 vm_size_t s; 2367 long start, end, highest, pv_npg; 2368 int domain, i, j, pages; 2369 2370 /* 2371 * For correctness we depend on the size being evenly divisible into a 2372 * page. As a tradeoff between performance and total memory use, the 2373 * entry is 64 bytes (aka one cacheline) in size. Not being smaller 2374 * avoids false-sharing, but not being 128 bytes potentially allows for 2375 * avoidable traffic due to adjacent cacheline prefetcher. 2376 * 2377 * Assert the size so that accidental changes fail to compile. 2378 */ 2379 CTASSERT((sizeof(*pvd) == 64)); 2380 2381 /* 2382 * Calculate the size of the array. 2383 */ 2384 pmap_last_pa = vm_phys_segs[vm_phys_nsegs - 1].end; 2385 pv_npg = howmany(pmap_last_pa, NBPDR); 2386 s = (vm_size_t)pv_npg * sizeof(struct pmap_large_md_page); 2387 s = round_page(s); 2388 pv_table = (struct pmap_large_md_page *)kva_alloc(s); 2389 if (pv_table == NULL) 2390 panic("%s: kva_alloc failed\n", __func__); 2391 2392 /* 2393 * Iterate physical segments to allocate space for respective pages. 
2394 */ 2395 highest = -1; 2396 s = 0; 2397 for (i = 0; i < vm_phys_nsegs; i++) { 2398 end = vm_phys_segs[i].end / NBPDR; 2399 domain = vm_phys_segs[i].domain; 2400 2401 if (highest >= end) 2402 continue; 2403 2404 start = highest + 1; 2405 pvd = &pv_table[start]; 2406 2407 pages = end - start + 1; 2408 s = round_page(pages * sizeof(*pvd)); 2409 highest = start + (s / sizeof(*pvd)) - 1; 2410 2411 for (j = 0; j < s; j += PAGE_SIZE) { 2412 vm_page_t m = vm_page_alloc_noobj_domain(domain, 0); 2413 if (m == NULL) 2414 panic("failed to allocate PV table page"); 2415 pmap_qenter((vm_offset_t)pvd + j, &m, 1); 2416 } 2417 2418 for (j = 0; j < s / sizeof(*pvd); j++) { 2419 rw_init_flags(&pvd->pv_lock, "pmap pv list", RW_NEW); 2420 TAILQ_INIT(&pvd->pv_page.pv_list); 2421 pvd->pv_page.pv_gen = 0; 2422 pvd->pv_page.pat_mode = 0; 2423 pvd->pv_invl_gen = 0; 2424 pvd++; 2425 } 2426 } 2427 pvd = &pv_dummy_large; 2428 rw_init_flags(&pvd->pv_lock, "pmap pv list dummy", RW_NEW); 2429 TAILQ_INIT(&pvd->pv_page.pv_list); 2430 pvd->pv_page.pv_gen = 0; 2431 pvd->pv_page.pat_mode = 0; 2432 pvd->pv_invl_gen = 0; 2433 } 2434 #else 2435 static void 2436 pmap_init_pv_table(void) 2437 { 2438 vm_size_t s; 2439 long i, pv_npg; 2440 2441 /* 2442 * Initialize the pool of pv list locks. 2443 */ 2444 for (i = 0; i < NPV_LIST_LOCKS; i++) 2445 rw_init(&pv_list_locks[i], "pmap pv list"); 2446 2447 /* 2448 * Calculate the size of the pv head table for superpages. 2449 */ 2450 pv_npg = howmany(vm_phys_segs[vm_phys_nsegs - 1].end, NBPDR); 2451 2452 /* 2453 * Allocate memory for the pv head table for superpages. 2454 */ 2455 s = (vm_size_t)pv_npg * sizeof(struct md_page); 2456 s = round_page(s); 2457 pv_table = kmem_malloc(s, M_WAITOK | M_ZERO); 2458 for (i = 0; i < pv_npg; i++) 2459 TAILQ_INIT(&pv_table[i].pv_list); 2460 TAILQ_INIT(&pv_dummy.pv_list); 2461 } 2462 #endif 2463 2464 /* 2465 * Initialize the pmap module. 2466 * 2467 * Called by vm_mem_init(), to initialize any structures that the pmap 2468 * system needs to map virtual memory. 2469 */ 2470 void 2471 pmap_init(void) 2472 { 2473 struct pmap_preinit_mapping *ppim; 2474 vm_page_t m, mpte; 2475 int error, i, ret, skz63; 2476 2477 /* L1TF, reserve page @0 unconditionally */ 2478 vm_page_blacklist_add(0, bootverbose); 2479 2480 /* Detect bare-metal Skylake Server and Skylake-X. */ 2481 if (vm_guest == VM_GUEST_NO && cpu_vendor_id == CPU_VENDOR_INTEL && 2482 CPUID_TO_FAMILY(cpu_id) == 0x6 && CPUID_TO_MODEL(cpu_id) == 0x55) { 2483 /* 2484 * Skylake-X errata SKZ63. Processor May Hang When 2485 * Executing Code In an HLE Transaction Region between 2486 * 40000000H and 403FFFFFH. 2487 * 2488 * Mark the pages in the range as preallocated. It 2489 * seems to be impossible to distinguish between 2490 * Skylake Server and Skylake X. 2491 */ 2492 skz63 = 1; 2493 TUNABLE_INT_FETCH("hw.skz63_enable", &skz63); 2494 if (skz63 != 0) { 2495 if (bootverbose) 2496 printf("SKZ63: skipping 4M RAM starting " 2497 "at physical 1G\n"); 2498 for (i = 0; i < atop(0x400000); i++) { 2499 ret = vm_page_blacklist_add(0x40000000 + 2500 ptoa(i), false); 2501 if (!ret && bootverbose) 2502 printf("page at %#x already used\n", 2503 0x40000000 + ptoa(i)); 2504 } 2505 } 2506 } 2507 2508 /* IFU */ 2509 pmap_allow_2m_x_ept_recalculate(); 2510 2511 /* 2512 * Initialize the vm page array entries for the kernel pmap's 2513 * page table pages. 
2514 */ 2515 PMAP_LOCK(kernel_pmap); 2516 for (i = 0; i < nkpt; i++) { 2517 mpte = PHYS_TO_VM_PAGE(KPTphys + (i << PAGE_SHIFT)); 2518 KASSERT(mpte >= vm_page_array && 2519 mpte < &vm_page_array[vm_page_array_size], 2520 ("pmap_init: page table page is out of range")); 2521 mpte->pindex = pmap_pde_pindex(KERNBASE) + i; 2522 mpte->phys_addr = KPTphys + (i << PAGE_SHIFT); 2523 mpte->ref_count = 1; 2524 2525 /* 2526 * Collect the page table pages that were replaced by a 2MB 2527 * page in create_pagetables(). They are zero filled. 2528 */ 2529 if ((i == 0 || 2530 kernphys + ((vm_paddr_t)(i - 1) << PDRSHIFT) < KERNend) && 2531 pmap_insert_pt_page(kernel_pmap, mpte, false, false)) 2532 panic("pmap_init: pmap_insert_pt_page failed"); 2533 } 2534 PMAP_UNLOCK(kernel_pmap); 2535 vm_wire_add(nkpt); 2536 2537 /* 2538 * If the kernel is running on a virtual machine, then it must assume 2539 * that MCA is enabled by the hypervisor. Moreover, the kernel must 2540 * be prepared for the hypervisor changing the vendor and family that 2541 * are reported by CPUID. Consequently, the workaround for AMD Family 2542 * 10h Erratum 383 is enabled if the processor's feature set does not 2543 * include at least one feature that is only supported by older Intel 2544 * or newer AMD processors. 2545 */ 2546 if (vm_guest != VM_GUEST_NO && (cpu_feature & CPUID_SS) == 0 && 2547 (cpu_feature2 & (CPUID2_SSSE3 | CPUID2_SSE41 | CPUID2_AESNI | 2548 CPUID2_AVX | CPUID2_XSAVE)) == 0 && (amd_feature2 & (AMDID2_XOP | 2549 AMDID2_FMA4)) == 0) 2550 workaround_erratum383 = 1; 2551 2552 /* 2553 * Are large page mappings enabled? 2554 */ 2555 TUNABLE_INT_FETCH("vm.pmap.pg_ps_enabled", &pg_ps_enabled); 2556 if (pg_ps_enabled) { 2557 KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0, 2558 ("pmap_init: can't assign to pagesizes[1]")); 2559 pagesizes[1] = NBPDR; 2560 if ((amd_feature & AMDID_PAGE1GB) != 0) { 2561 KASSERT(MAXPAGESIZES > 2 && pagesizes[2] == 0, 2562 ("pmap_init: can't assign to pagesizes[2]")); 2563 pagesizes[2] = NBPDP; 2564 } 2565 } 2566 2567 /* 2568 * Initialize pv chunk lists. 
2569 */ 2570 for (i = 0; i < PMAP_MEMDOM; i++) { 2571 mtx_init(&pv_chunks[i].pvc_lock, "pmap pv chunk list", NULL, MTX_DEF); 2572 TAILQ_INIT(&pv_chunks[i].pvc_list); 2573 } 2574 pmap_init_pv_table(); 2575 2576 pmap_initialized = 1; 2577 for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { 2578 ppim = pmap_preinit_mapping + i; 2579 if (ppim->va == 0) 2580 continue; 2581 /* Make the direct map consistent */ 2582 if (ppim->pa < dmaplimit && ppim->pa + ppim->sz <= dmaplimit) { 2583 (void)pmap_change_attr(PHYS_TO_DMAP(ppim->pa), 2584 ppim->sz, ppim->mode); 2585 } 2586 if (!bootverbose) 2587 continue; 2588 printf("PPIM %u: PA=%#lx, VA=%#lx, size=%#lx, mode=%#x\n", i, 2589 ppim->pa, ppim->va, ppim->sz, ppim->mode); 2590 } 2591 2592 mtx_init(&qframe_mtx, "qfrmlk", NULL, MTX_SPIN); 2593 error = vmem_alloc(kernel_arena, PAGE_SIZE, M_BESTFIT | M_WAITOK, 2594 (vmem_addr_t *)&qframe); 2595 if (error != 0) 2596 panic("qframe allocation failed"); 2597 2598 lm_ents = 8; 2599 TUNABLE_INT_FETCH("vm.pmap.large_map_pml4_entries", &lm_ents); 2600 if (lm_ents > LMEPML4I - LMSPML4I + 1) 2601 lm_ents = LMEPML4I - LMSPML4I + 1; 2602 #ifdef KMSAN 2603 if (lm_ents > KMSANORIGPML4I - LMSPML4I) { 2604 printf( 2605 "pmap: shrinking large map for KMSAN (%d slots to %ld slots)\n", 2606 lm_ents, KMSANORIGPML4I - LMSPML4I); 2607 lm_ents = KMSANORIGPML4I - LMSPML4I; 2608 } 2609 #endif 2610 if (bootverbose) 2611 printf("pmap: large map %u PML4 slots (%lu GB)\n", 2612 lm_ents, (u_long)lm_ents * (NBPML4 / 1024 / 1024 / 1024)); 2613 if (lm_ents != 0) { 2614 large_vmem = vmem_create("large", LARGEMAP_MIN_ADDRESS, 2615 (vmem_size_t)lm_ents * NBPML4, PAGE_SIZE, 0, M_WAITOK); 2616 if (large_vmem == NULL) { 2617 printf("pmap: cannot create large map\n"); 2618 lm_ents = 0; 2619 } 2620 for (i = 0; i < lm_ents; i++) { 2621 m = pmap_large_map_getptp_unlocked(); 2622 /* XXXKIB la57 */ 2623 kernel_pml4[LMSPML4I + i] = X86_PG_V | 2624 X86_PG_RW | X86_PG_A | X86_PG_M | pg_nx | 2625 VM_PAGE_TO_PHYS(m); 2626 } 2627 } 2628 } 2629 2630 SYSCTL_UINT(_vm_pmap, OID_AUTO, large_map_pml4_entries, 2631 CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &lm_ents, 0, 2632 "Maximum number of PML4 entries for use by large map (tunable). " 2633 "Each entry corresponds to 512GB of address space."); 2634 2635 static SYSCTL_NODE(_vm_pmap, OID_AUTO, pde, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 2636 "2MB page mapping counters"); 2637 2638 static COUNTER_U64_DEFINE_EARLY(pmap_pde_demotions); 2639 SYSCTL_COUNTER_U64(_vm_pmap_pde, OID_AUTO, demotions, 2640 CTLFLAG_RD, &pmap_pde_demotions, "2MB page demotions"); 2641 2642 static COUNTER_U64_DEFINE_EARLY(pmap_pde_mappings); 2643 SYSCTL_COUNTER_U64(_vm_pmap_pde, OID_AUTO, mappings, CTLFLAG_RD, 2644 &pmap_pde_mappings, "2MB page mappings"); 2645 2646 static COUNTER_U64_DEFINE_EARLY(pmap_pde_p_failures); 2647 SYSCTL_COUNTER_U64(_vm_pmap_pde, OID_AUTO, p_failures, CTLFLAG_RD, 2648 &pmap_pde_p_failures, "2MB page promotion failures"); 2649 2650 static COUNTER_U64_DEFINE_EARLY(pmap_pde_promotions); 2651 SYSCTL_COUNTER_U64(_vm_pmap_pde, OID_AUTO, promotions, CTLFLAG_RD, 2652 &pmap_pde_promotions, "2MB page promotions"); 2653 2654 static SYSCTL_NODE(_vm_pmap, OID_AUTO, pdpe, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 2655 "1GB page mapping counters"); 2656 2657 static COUNTER_U64_DEFINE_EARLY(pmap_pdpe_demotions); 2658 SYSCTL_COUNTER_U64(_vm_pmap_pdpe, OID_AUTO, demotions, CTLFLAG_RD, 2659 &pmap_pdpe_demotions, "1GB page demotions"); 2660 2661 /*************************************************** 2662 * Low level helper routines..... 
2663 ***************************************************/ 2664 2665 static pt_entry_t 2666 pmap_swap_pat(pmap_t pmap, pt_entry_t entry) 2667 { 2668 int x86_pat_bits = X86_PG_PTE_PAT | X86_PG_PDE_PAT; 2669 2670 switch (pmap->pm_type) { 2671 case PT_X86: 2672 case PT_RVI: 2673 /* Verify that both PAT bits are not set at the same time */ 2674 KASSERT((entry & x86_pat_bits) != x86_pat_bits, 2675 ("Invalid PAT bits in entry %#lx", entry)); 2676 2677 /* Swap the PAT bits if one of them is set */ 2678 if ((entry & x86_pat_bits) != 0) 2679 entry ^= x86_pat_bits; 2680 break; 2681 case PT_EPT: 2682 /* 2683 * Nothing to do - the memory attributes are represented 2684 * the same way for regular pages and superpages. 2685 */ 2686 break; 2687 default: 2688 panic("pmap_switch_pat_bits: bad pm_type %d", pmap->pm_type); 2689 } 2690 2691 return (entry); 2692 } 2693 2694 bool 2695 pmap_is_valid_memattr(pmap_t pmap __unused, vm_memattr_t mode) 2696 { 2697 2698 return (mode >= 0 && mode < PAT_INDEX_SIZE && 2699 pat_index[(int)mode] >= 0); 2700 } 2701 2702 /* 2703 * Determine the appropriate bits to set in a PTE or PDE for a specified 2704 * caching mode. 2705 */ 2706 int 2707 pmap_cache_bits(pmap_t pmap, int mode, bool is_pde) 2708 { 2709 int cache_bits, pat_flag, pat_idx; 2710 2711 if (!pmap_is_valid_memattr(pmap, mode)) 2712 panic("Unknown caching mode %d\n", mode); 2713 2714 switch (pmap->pm_type) { 2715 case PT_X86: 2716 case PT_RVI: 2717 /* The PAT bit is different for PTE's and PDE's. */ 2718 pat_flag = is_pde ? X86_PG_PDE_PAT : X86_PG_PTE_PAT; 2719 2720 /* Map the caching mode to a PAT index. */ 2721 pat_idx = pat_index[mode]; 2722 2723 /* Map the 3-bit index value into the PAT, PCD, and PWT bits. */ 2724 cache_bits = 0; 2725 if (pat_idx & 0x4) 2726 cache_bits |= pat_flag; 2727 if (pat_idx & 0x2) 2728 cache_bits |= PG_NC_PCD; 2729 if (pat_idx & 0x1) 2730 cache_bits |= PG_NC_PWT; 2731 break; 2732 2733 case PT_EPT: 2734 cache_bits = EPT_PG_IGNORE_PAT | EPT_PG_MEMORY_TYPE(mode); 2735 break; 2736 2737 default: 2738 panic("unsupported pmap type %d", pmap->pm_type); 2739 } 2740 2741 return (cache_bits); 2742 } 2743 2744 static int 2745 pmap_cache_mask(pmap_t pmap, bool is_pde) 2746 { 2747 int mask; 2748 2749 switch (pmap->pm_type) { 2750 case PT_X86: 2751 case PT_RVI: 2752 mask = is_pde ? X86_PG_PDE_CACHE : X86_PG_PTE_CACHE; 2753 break; 2754 case PT_EPT: 2755 mask = EPT_PG_IGNORE_PAT | EPT_PG_MEMORY_TYPE(0x7); 2756 break; 2757 default: 2758 panic("pmap_cache_mask: invalid pm_type %d", pmap->pm_type); 2759 } 2760 2761 return (mask); 2762 } 2763 2764 static int 2765 pmap_pat_index(pmap_t pmap, pt_entry_t pte, bool is_pde) 2766 { 2767 int pat_flag, pat_idx; 2768 2769 pat_idx = 0; 2770 switch (pmap->pm_type) { 2771 case PT_X86: 2772 case PT_RVI: 2773 /* The PAT bit is different for PTE's and PDE's. */ 2774 pat_flag = is_pde ? X86_PG_PDE_PAT : X86_PG_PTE_PAT; 2775 2776 if ((pte & pat_flag) != 0) 2777 pat_idx |= 0x4; 2778 if ((pte & PG_NC_PCD) != 0) 2779 pat_idx |= 0x2; 2780 if ((pte & PG_NC_PWT) != 0) 2781 pat_idx |= 0x1; 2782 break; 2783 case PT_EPT: 2784 if ((pte & EPT_PG_IGNORE_PAT) != 0) 2785 panic("EPT PTE %#lx has no PAT memory type", pte); 2786 pat_idx = (pte & EPT_PG_MEMORY_TYPE(0x7)) >> 3; 2787 break; 2788 } 2789 2790 /* See pmap_init_pat(). 
*/ 2791 if (pat_idx == 4) 2792 pat_idx = 0; 2793 if (pat_idx == 7) 2794 pat_idx = 3; 2795 2796 return (pat_idx); 2797 } 2798 2799 bool 2800 pmap_ps_enabled(pmap_t pmap) 2801 { 2802 2803 return (pg_ps_enabled && (pmap->pm_flags & PMAP_PDE_SUPERPAGE) != 0); 2804 } 2805 2806 static void 2807 pmap_update_pde_store(pmap_t pmap, pd_entry_t *pde, pd_entry_t newpde) 2808 { 2809 2810 switch (pmap->pm_type) { 2811 case PT_X86: 2812 break; 2813 case PT_RVI: 2814 case PT_EPT: 2815 /* 2816 * XXX 2817 * This is a little bogus since the generation number is 2818 * supposed to be bumped up when a region of the address 2819 * space is invalidated in the page tables. 2820 * 2821 * In this case the old PDE entry is valid but yet we want 2822 * to make sure that any mappings using the old entry are 2823 * invalidated in the TLB. 2824 * 2825 * The reason this works as expected is because we rendezvous 2826 * "all" host cpus and force any vcpu context to exit as a 2827 * side-effect. 2828 */ 2829 atomic_add_long(&pmap->pm_eptgen, 1); 2830 break; 2831 default: 2832 panic("pmap_update_pde_store: bad pm_type %d", pmap->pm_type); 2833 } 2834 pde_store(pde, newpde); 2835 } 2836 2837 /* 2838 * After changing the page size for the specified virtual address in the page 2839 * table, flush the corresponding entries from the processor's TLB. Only the 2840 * calling processor's TLB is affected. 2841 * 2842 * The calling thread must be pinned to a processor. 2843 */ 2844 static void 2845 pmap_update_pde_invalidate(pmap_t pmap, vm_offset_t va, pd_entry_t newpde) 2846 { 2847 pt_entry_t PG_G; 2848 2849 if (pmap_type_guest(pmap)) 2850 return; 2851 2852 KASSERT(pmap->pm_type == PT_X86, 2853 ("pmap_update_pde_invalidate: invalid type %d", pmap->pm_type)); 2854 2855 PG_G = pmap_global_bit(pmap); 2856 2857 if ((newpde & PG_PS) == 0) 2858 /* Demotion: flush a specific 2MB page mapping. */ 2859 pmap_invlpg(pmap, va); 2860 else if ((newpde & PG_G) == 0) 2861 /* 2862 * Promotion: flush every 4KB page mapping from the TLB 2863 * because there are too many to flush individually. 2864 */ 2865 invltlb(); 2866 else { 2867 /* 2868 * Promotion: flush every 4KB page mapping from the TLB, 2869 * including any global (PG_G) mappings. 2870 */ 2871 invltlb_glob(); 2872 } 2873 } 2874 2875 /* 2876 * The amd64 pmap uses different approaches to TLB invalidation 2877 * depending on the kernel configuration, available hardware features, 2878 * and known hardware errata. The kernel configuration option that 2879 * has the greatest operational impact on TLB invalidation is PTI, 2880 * which is enabled automatically on affected Intel CPUs. The most 2881 * impactful hardware features are first PCID, and then INVPCID 2882 * instruction presence. PCID usage is quite different for PTI 2883 * vs. non-PTI. 2884 * 2885 * * Kernel Page Table Isolation (PTI or KPTI) is used to mitigate 2886 * the Meltdown bug in some Intel CPUs. Under PTI, each user address 2887 * space is served by two page tables, user and kernel. The user 2888 * page table only maps user space and a kernel trampoline. The 2889 * kernel trampoline includes the entirety of the kernel text but 2890 * only the kernel data that is needed to switch from user to kernel 2891 * mode. The kernel page table maps the user and kernel address 2892 * spaces in their entirety. It is identical to the per-process 2893 * page table used in non-PTI mode. 2894 * 2895 * User page tables are only used when the CPU is in user mode. 
2896 * Consequently, some TLB invalidations can be postponed until the 2897 * switch from kernel to user mode. In contrast, the user 2898 * space part of the kernel page table is used for copyout(9), so 2899 * TLB invalidations on this page table cannot be similarly postponed. 2900 * 2901 * The existence of a user mode page table for the given pmap is 2902 * indicated by a pm_ucr3 value that differs from PMAP_NO_CR3, in 2903 * which case pm_ucr3 contains the %cr3 register value for the user 2904 * mode page table's root. 2905 * 2906 * * The pm_active bitmask indicates which CPUs currently have the 2907 * pmap active. A CPU's bit is set on context switch to the pmap, and 2908 * cleared on switching off this CPU. For the kernel page table, 2909 * the pm_active field is immutable and contains all CPUs. The 2910 * kernel page table is always logically active on every processor, 2911 * but not necessarily in use by the hardware, e.g., in PTI mode. 2912 * 2913 * When requesting invalidation of virtual addresses with 2914 * pmap_invalidate_XXX() functions, the pmap sends shootdown IPIs to 2915 * all CPUs recorded as active in pm_active. Updates to and reads 2916 * from pm_active are not synchronized, and so they may race with 2917 * each other. Shootdown handlers are prepared to handle the race. 2918 * 2919 * * PCID is an optional feature of the long mode x86 MMU where TLB 2920 * entries are tagged with the 'Process ID' of the address space 2921 * they belong to. This feature provides a limited namespace for 2922 * process identifiers, 12 bits, supporting 4095 simultaneous IDs 2923 * total. 2924 * 2925 * Allocation of a PCID to a pmap is done by an algorithm described 2926 * in section 15.12, "Other TLB Consistency Algorithms", of 2927 * Vahalia's book "Unix Internals". A PCID cannot be allocated for 2928 * the whole lifetime of a pmap in pmap_pinit() due to the limited 2929 * namespace. Instead, a per-CPU, per-pmap PCID is assigned when 2930 * the CPU is about to start caching TLB entries from a pmap, 2931 * i.e., on the context switch that activates the pmap on the CPU. 2932 * 2933 * The PCID allocator maintains a per-CPU, per-pmap generation 2934 * count, pm_gen, which is incremented each time a new PCID is 2935 * allocated. On TLB invalidation, the generation counters for the 2936 * pmap are zeroed, which signals the context switch code that the 2937 * previously allocated PCID is no longer valid. Effectively, 2938 * zeroing any of these counters triggers a TLB shootdown for the 2939 * given CPU/address space, due to the allocation of a new PCID. 2940 * 2941 * Zeroing can be performed remotely. Consequently, if a pmap is 2942 * inactive on a CPU, then a TLB shootdown for that pmap and CPU can 2943 * be initiated by an ordinary memory access to reset the target 2944 * CPU's generation count within the pmap. The CPU initiating the 2945 * TLB shootdown does not need to send an IPI to the target CPU. 2946 * 2947 * * PTI + PCID. The available PCIDs are divided into two sets: PCIDs 2948 * for complete (kernel) page tables, and PCIDs for user mode page 2949 * tables. A user PCID value is obtained from the kernel PCID value 2950 * by setting the highest bit, 11, to 1 (0x800 == PMAP_PCID_USER_PT). 2951 * 2952 * User space page tables are activated on return to user mode, by 2953 * loading pm_ucr3 into %cr3. If the PCPU(ucr3_load_mask) requests 2954 * clearing bit 63 of the loaded ucr3, this effectively causes 2955 * complete invalidation of the user mode TLB entries for the 2956 * current pmap. 
In which case, local invalidations of individual 2957 * pages in the user page table are skipped. 2958 * 2959 * * Local invalidation, all modes. If the requested invalidation is 2960 * for a specific address or the total invalidation of a currently 2961 * active pmap, then the TLB is flushed using INVLPG for a kernel 2962 * page table, and INVPCID(INVPCID_CTXGLOB)/invltlb_glob() for a 2963 * user space page table(s). 2964 * 2965 * If the INVPCID instruction is available, it is used to flush user 2966 * entries from the kernel page table. 2967 * 2968 * When PCID is enabled, the INVLPG instruction invalidates all TLB 2969 * entries for the given page that either match the current PCID or 2970 * are global. Since TLB entries for the same page under different 2971 * PCIDs are unaffected, kernel pages which reside in all address 2972 * spaces could be problematic. We avoid the problem by creating 2973 * all kernel PTEs with the global flag (PG_G) set, when PTI is 2974 * disabled. 2975 * 2976 * * mode: PTI disabled, PCID present. The kernel reserves PCID 0 for its 2977 * address space, all other 4095 PCIDs are used for user mode spaces 2978 * as described above. A context switch allocates a new PCID if 2979 * the recorded PCID is zero or the recorded generation does not match 2980 * the CPU's generation, effectively flushing the TLB for this address space. 2981 * Total remote invalidation is performed by zeroing pm_gen for all CPUs. 2982 * local user page: INVLPG 2983 * local kernel page: INVLPG 2984 * local user total: INVPCID(CTX) 2985 * local kernel total: INVPCID(CTXGLOB) or invltlb_glob() 2986 * remote user page, inactive pmap: zero pm_gen 2987 * remote user page, active pmap: zero pm_gen + IPI:INVLPG 2988 * (Both actions are required to handle the aforementioned pm_active races.) 2989 * remote kernel page: IPI:INVLPG 2990 * remote user total, inactive pmap: zero pm_gen 2991 * remote user total, active pmap: zero pm_gen + IPI:(INVPCID(CTX) or 2992 * reload %cr3) 2993 * (See note above about pm_active races.) 2994 * remote kernel total: IPI:(INVPCID(CTXGLOB) or invltlb_glob()) 2995 * 2996 * PTI enabled, PCID present. 2997 * local user page: INVLPG for kpt, INVPCID(ADDR) or (INVLPG for ucr3) 2998 * for upt 2999 * local kernel page: INVLPG 3000 * local user total: INVPCID(CTX) or reload %cr3 for kpt, clear PCID_SAVE 3001 * on loading UCR3 into %cr3 for upt 3002 * local kernel total: INVPCID(CTXGLOB) or invltlb_glob() 3003 * remote user page, inactive pmap: zero pm_gen 3004 * remote user page, active pmap: zero pm_gen + IPI:(INVLPG for kpt, 3005 * INVPCID(ADDR) for upt) 3006 * remote kernel page: IPI:INVLPG 3007 * remote user total, inactive pmap: zero pm_gen 3008 * remote user total, active pmap: zero pm_gen + IPI:(INVPCID(CTX) for kpt, 3009 * clear PCID_SAVE on loading UCR3 into $cr3 for upt) 3010 * remote kernel total: IPI:(INVPCID(CTXGLOB) or invltlb_glob()) 3011 * 3012 * No PCID. 3013 * local user page: INVLPG 3014 * local kernel page: INVLPG 3015 * local user total: reload %cr3 3016 * local kernel total: invltlb_glob() 3017 * remote user page, inactive pmap: - 3018 * remote user page, active pmap: IPI:INVLPG 3019 * remote kernel page: IPI:INVLPG 3020 * remote user total, inactive pmap: - 3021 * remote user total, active pmap: IPI:(reload %cr3) 3022 * remote kernel total: IPI:invltlb_glob() 3023 * Since on return to user mode, the reload of %cr3 with ucr3 causes 3024 * TLB invalidation, no specific action is required for user page table. 3025 * 3026 * EPT. 
EPT pmaps do not map KVA, all mappings are userspace. 3027 * XXX TODO 3028 */ 3029 3030 #ifdef SMP 3031 /* 3032 * Interrupt the cpus that are executing in the guest context. 3033 * This will force the vcpu to exit and the cached EPT mappings 3034 * will be invalidated by the host before the next vmresume. 3035 */ 3036 static __inline void 3037 pmap_invalidate_ept(pmap_t pmap) 3038 { 3039 smr_seq_t goal; 3040 int ipinum; 3041 3042 sched_pin(); 3043 KASSERT(!CPU_ISSET(curcpu, &pmap->pm_active), 3044 ("pmap_invalidate_ept: absurd pm_active")); 3045 3046 /* 3047 * The TLB mappings associated with a vcpu context are not 3048 * flushed each time a different vcpu is chosen to execute. 3049 * 3050 * This is in contrast with a process's vtop mappings that 3051 * are flushed from the TLB on each context switch. 3052 * 3053 * Therefore we need to do more than just a TLB shootdown on 3054 * the active cpus in 'pmap->pm_active'. To do this we keep 3055 * track of the number of invalidations performed on this pmap. 3056 * 3057 * Each vcpu keeps a cache of this counter and compares it 3058 * just before a vmresume. If the counter is out-of-date an 3059 * invept will be done to flush stale mappings from the TLB. 3060 * 3061 * To ensure that all vCPU threads have observed the new counter 3062 * value before returning, we use SMR. Ordering is important here: 3063 * the VMM enters an SMR read section before loading the counter 3064 * and after updating the pm_active bit set. Thus, pm_active is 3065 * a superset of active readers, and any reader that has observed 3066 * the goal has observed the new counter value. 3067 */ 3068 atomic_add_long(&pmap->pm_eptgen, 1); 3069 3070 goal = smr_advance(pmap->pm_eptsmr); 3071 3072 /* 3073 * Force the vcpu to exit and trap back into the hypervisor. 3074 */ 3075 ipinum = pmap->pm_flags & PMAP_NESTED_IPIMASK; 3076 ipi_selected(pmap->pm_active, ipinum); 3077 sched_unpin(); 3078 3079 /* 3080 * Ensure that all active vCPUs will observe the new generation counter 3081 * value before executing any more guest instructions. 3082 */ 3083 smr_wait(pmap->pm_eptsmr, goal); 3084 } 3085 3086 static inline void 3087 pmap_invalidate_preipi_pcid(pmap_t pmap) 3088 { 3089 struct pmap_pcid *pcidp; 3090 u_int cpuid, i; 3091 3092 sched_pin(); 3093 3094 cpuid = PCPU_GET(cpuid); 3095 if (pmap != PCPU_GET(curpmap)) 3096 cpuid = 0xffffffff; /* An impossible value */ 3097 3098 CPU_FOREACH(i) { 3099 if (cpuid != i) { 3100 pcidp = zpcpu_get_cpu(pmap->pm_pcidp, i); 3101 pcidp->pm_gen = 0; 3102 } 3103 } 3104 3105 /* 3106 * The fence is between stores to pm_gen and the read of the 3107 * pm_active mask. We need to ensure that it is impossible 3108 * for us to miss the bit update in pm_active and 3109 * simultaneously observe a non-zero pm_gen in 3110 * pmap_activate_sw(), otherwise TLB update is missed. 3111 * Without the fence, IA32 allows such an outcome. Note that 3112 * pm_active is updated by a locked operation, which provides 3113 * the reciprocal fence. 3114 */ 3115 atomic_thread_fence_seq_cst(); 3116 } 3117 3118 static void 3119 pmap_invalidate_preipi_nopcid(pmap_t pmap __unused) 3120 { 3121 sched_pin(); 3122 } 3123 3124 DEFINE_IFUNC(static, void, pmap_invalidate_preipi, (pmap_t)) 3125 { 3126 return (pmap_pcid_enabled ? 
pmap_invalidate_preipi_pcid : 3127 pmap_invalidate_preipi_nopcid); 3128 } 3129 3130 static inline void 3131 pmap_invalidate_page_pcid_cb(pmap_t pmap, vm_offset_t va, 3132 const bool invpcid_works1) 3133 { 3134 struct invpcid_descr d; 3135 uint64_t kcr3, ucr3; 3136 uint32_t pcid; 3137 3138 /* 3139 * Because pm_pcid is recalculated on a context switch, we 3140 * must ensure there is no preemption, not just pinning. 3141 * Otherwise, we might use a stale value below. 3142 */ 3143 CRITICAL_ASSERT(curthread); 3144 3145 /* 3146 * No need to do anything with user page tables invalidation 3147 * if there is no user page table, or invalidation is deferred 3148 * until the return to userspace. ucr3_load_mask is stable 3149 * because we have preemption disabled. 3150 */ 3151 if (pmap->pm_ucr3 == PMAP_NO_CR3 || 3152 PCPU_GET(ucr3_load_mask) != PMAP_UCR3_NOMASK) 3153 return; 3154 3155 pcid = pmap_get_pcid(pmap); 3156 if (invpcid_works1) { 3157 d.pcid = pcid | PMAP_PCID_USER_PT; 3158 d.pad = 0; 3159 d.addr = va; 3160 invpcid(&d, INVPCID_ADDR); 3161 } else { 3162 kcr3 = pmap->pm_cr3 | pcid | CR3_PCID_SAVE; 3163 ucr3 = pmap->pm_ucr3 | pcid | PMAP_PCID_USER_PT | CR3_PCID_SAVE; 3164 pmap_pti_pcid_invlpg(ucr3, kcr3, va); 3165 } 3166 } 3167 3168 static void 3169 pmap_invalidate_page_pcid_invpcid_cb(pmap_t pmap, vm_offset_t va) 3170 { 3171 pmap_invalidate_page_pcid_cb(pmap, va, true); 3172 } 3173 3174 static void 3175 pmap_invalidate_page_pcid_noinvpcid_cb(pmap_t pmap, vm_offset_t va) 3176 { 3177 pmap_invalidate_page_pcid_cb(pmap, va, false); 3178 } 3179 3180 static void 3181 pmap_invalidate_page_nopcid_cb(pmap_t pmap __unused, vm_offset_t va __unused) 3182 { 3183 } 3184 3185 DEFINE_IFUNC(static, void, pmap_invalidate_page_cb, (pmap_t, vm_offset_t)) 3186 { 3187 if (pmap_pcid_enabled) 3188 return (invpcid_works ? 
pmap_invalidate_page_pcid_invpcid_cb : 3189 pmap_invalidate_page_pcid_noinvpcid_cb); 3190 return (pmap_invalidate_page_nopcid_cb); 3191 } 3192 3193 static void 3194 pmap_invalidate_page_curcpu_cb(pmap_t pmap, vm_offset_t va, 3195 vm_offset_t addr2 __unused) 3196 { 3197 if (pmap == kernel_pmap) { 3198 pmap_invlpg(kernel_pmap, va); 3199 } else if (pmap == PCPU_GET(curpmap)) { 3200 invlpg(va); 3201 pmap_invalidate_page_cb(pmap, va); 3202 } 3203 } 3204 3205 void 3206 pmap_invalidate_page(pmap_t pmap, vm_offset_t va) 3207 { 3208 if (pmap_type_guest(pmap)) { 3209 pmap_invalidate_ept(pmap); 3210 return; 3211 } 3212 3213 KASSERT(pmap->pm_type == PT_X86, 3214 ("pmap_invalidate_page: invalid type %d", pmap->pm_type)); 3215 3216 pmap_invalidate_preipi(pmap); 3217 smp_masked_invlpg(va, pmap, pmap_invalidate_page_curcpu_cb); 3218 } 3219 3220 /* 4k PTEs -- Chosen to exceed the total size of Broadwell L2 TLB */ 3221 #define PMAP_INVLPG_THRESHOLD (4 * 1024 * PAGE_SIZE) 3222 3223 static void 3224 pmap_invalidate_range_pcid_cb(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, 3225 const bool invpcid_works1) 3226 { 3227 struct invpcid_descr d; 3228 uint64_t kcr3, ucr3; 3229 uint32_t pcid; 3230 3231 CRITICAL_ASSERT(curthread); 3232 3233 if (pmap != PCPU_GET(curpmap) || 3234 pmap->pm_ucr3 == PMAP_NO_CR3 || 3235 PCPU_GET(ucr3_load_mask) != PMAP_UCR3_NOMASK) 3236 return; 3237 3238 pcid = pmap_get_pcid(pmap); 3239 if (invpcid_works1) { 3240 d.pcid = pcid | PMAP_PCID_USER_PT; 3241 d.pad = 0; 3242 for (d.addr = sva; d.addr < eva; d.addr += PAGE_SIZE) 3243 invpcid(&d, INVPCID_ADDR); 3244 } else { 3245 kcr3 = pmap->pm_cr3 | pcid | CR3_PCID_SAVE; 3246 ucr3 = pmap->pm_ucr3 | pcid | PMAP_PCID_USER_PT | CR3_PCID_SAVE; 3247 pmap_pti_pcid_invlrng(ucr3, kcr3, sva, eva); 3248 } 3249 } 3250 3251 static void 3252 pmap_invalidate_range_pcid_invpcid_cb(pmap_t pmap, vm_offset_t sva, 3253 vm_offset_t eva) 3254 { 3255 pmap_invalidate_range_pcid_cb(pmap, sva, eva, true); 3256 } 3257 3258 static void 3259 pmap_invalidate_range_pcid_noinvpcid_cb(pmap_t pmap, vm_offset_t sva, 3260 vm_offset_t eva) 3261 { 3262 pmap_invalidate_range_pcid_cb(pmap, sva, eva, false); 3263 } 3264 3265 static void 3266 pmap_invalidate_range_nopcid_cb(pmap_t pmap __unused, vm_offset_t sva __unused, 3267 vm_offset_t eva __unused) 3268 { 3269 } 3270 3271 DEFINE_IFUNC(static, void, pmap_invalidate_range_cb, (pmap_t, vm_offset_t, 3272 vm_offset_t)) 3273 { 3274 if (pmap_pcid_enabled) 3275 return (invpcid_works ? 
pmap_invalidate_range_pcid_invpcid_cb : 3276 pmap_invalidate_range_pcid_noinvpcid_cb); 3277 return (pmap_invalidate_range_nopcid_cb); 3278 } 3279 3280 static void 3281 pmap_invalidate_range_curcpu_cb(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 3282 { 3283 vm_offset_t addr; 3284 3285 if (pmap == kernel_pmap) { 3286 if (PCPU_GET(pcid_invlpg_workaround)) { 3287 struct invpcid_descr d = { 0 }; 3288 3289 invpcid(&d, INVPCID_CTXGLOB); 3290 } else { 3291 for (addr = sva; addr < eva; addr += PAGE_SIZE) 3292 invlpg(addr); 3293 } 3294 } else if (pmap == PCPU_GET(curpmap)) { 3295 for (addr = sva; addr < eva; addr += PAGE_SIZE) 3296 invlpg(addr); 3297 pmap_invalidate_range_cb(pmap, sva, eva); 3298 } 3299 } 3300 3301 void 3302 pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 3303 { 3304 if (eva - sva >= PMAP_INVLPG_THRESHOLD) { 3305 pmap_invalidate_all(pmap); 3306 return; 3307 } 3308 3309 if (pmap_type_guest(pmap)) { 3310 pmap_invalidate_ept(pmap); 3311 return; 3312 } 3313 3314 KASSERT(pmap->pm_type == PT_X86, 3315 ("pmap_invalidate_range: invalid type %d", pmap->pm_type)); 3316 3317 pmap_invalidate_preipi(pmap); 3318 smp_masked_invlpg_range(sva, eva, pmap, 3319 pmap_invalidate_range_curcpu_cb); 3320 } 3321 3322 static inline void 3323 pmap_invalidate_all_pcid_cb(pmap_t pmap, bool invpcid_works1) 3324 { 3325 struct invpcid_descr d; 3326 uint64_t kcr3; 3327 uint32_t pcid; 3328 3329 if (pmap == kernel_pmap) { 3330 if (invpcid_works1) { 3331 bzero(&d, sizeof(d)); 3332 invpcid(&d, INVPCID_CTXGLOB); 3333 } else { 3334 invltlb_glob(); 3335 } 3336 } else if (pmap == PCPU_GET(curpmap)) { 3337 CRITICAL_ASSERT(curthread); 3338 3339 pcid = pmap_get_pcid(pmap); 3340 if (invpcid_works1) { 3341 d.pcid = pcid; 3342 d.pad = 0; 3343 d.addr = 0; 3344 invpcid(&d, INVPCID_CTX); 3345 } else { 3346 kcr3 = pmap->pm_cr3 | pcid; 3347 load_cr3(kcr3); 3348 } 3349 if (pmap->pm_ucr3 != PMAP_NO_CR3) 3350 PCPU_SET(ucr3_load_mask, ~CR3_PCID_SAVE); 3351 } 3352 } 3353 3354 static void 3355 pmap_invalidate_all_pcid_invpcid_cb(pmap_t pmap) 3356 { 3357 pmap_invalidate_all_pcid_cb(pmap, true); 3358 } 3359 3360 static void 3361 pmap_invalidate_all_pcid_noinvpcid_cb(pmap_t pmap) 3362 { 3363 pmap_invalidate_all_pcid_cb(pmap, false); 3364 } 3365 3366 static void 3367 pmap_invalidate_all_nopcid_cb(pmap_t pmap) 3368 { 3369 if (pmap == kernel_pmap) 3370 invltlb_glob(); 3371 else if (pmap == PCPU_GET(curpmap)) 3372 invltlb(); 3373 } 3374 3375 DEFINE_IFUNC(static, void, pmap_invalidate_all_cb, (pmap_t)) 3376 { 3377 if (pmap_pcid_enabled) 3378 return (invpcid_works ? 
pmap_invalidate_all_pcid_invpcid_cb : 3379 pmap_invalidate_all_pcid_noinvpcid_cb); 3380 return (pmap_invalidate_all_nopcid_cb); 3381 } 3382 3383 static void 3384 pmap_invalidate_all_curcpu_cb(pmap_t pmap, vm_offset_t addr1 __unused, 3385 vm_offset_t addr2 __unused) 3386 { 3387 pmap_invalidate_all_cb(pmap); 3388 } 3389 3390 void 3391 pmap_invalidate_all(pmap_t pmap) 3392 { 3393 if (pmap_type_guest(pmap)) { 3394 pmap_invalidate_ept(pmap); 3395 return; 3396 } 3397 3398 KASSERT(pmap->pm_type == PT_X86, 3399 ("pmap_invalidate_all: invalid type %d", pmap->pm_type)); 3400 3401 pmap_invalidate_preipi(pmap); 3402 smp_masked_invltlb(pmap, pmap_invalidate_all_curcpu_cb); 3403 } 3404 3405 static void 3406 pmap_invalidate_cache_curcpu_cb(pmap_t pmap __unused, vm_offset_t va __unused, 3407 vm_offset_t addr2 __unused) 3408 { 3409 wbinvd(); 3410 } 3411 3412 void 3413 pmap_invalidate_cache(void) 3414 { 3415 sched_pin(); 3416 smp_cache_flush(pmap_invalidate_cache_curcpu_cb); 3417 } 3418 3419 struct pde_action { 3420 cpuset_t invalidate; /* processors that invalidate their TLB */ 3421 pmap_t pmap; 3422 vm_offset_t va; 3423 pd_entry_t *pde; 3424 pd_entry_t newpde; 3425 u_int store; /* processor that updates the PDE */ 3426 }; 3427 3428 static void 3429 pmap_update_pde_action(void *arg) 3430 { 3431 struct pde_action *act = arg; 3432 3433 if (act->store == PCPU_GET(cpuid)) 3434 pmap_update_pde_store(act->pmap, act->pde, act->newpde); 3435 } 3436 3437 static void 3438 pmap_update_pde_teardown(void *arg) 3439 { 3440 struct pde_action *act = arg; 3441 3442 if (CPU_ISSET(PCPU_GET(cpuid), &act->invalidate)) 3443 pmap_update_pde_invalidate(act->pmap, act->va, act->newpde); 3444 } 3445 3446 /* 3447 * Change the page size for the specified virtual address in a way that 3448 * prevents any possibility of the TLB ever having two entries that map the 3449 * same virtual address using different page sizes. This is the recommended 3450 * workaround for Erratum 383 on AMD Family 10h processors. It prevents a 3451 * machine check exception for a TLB state that is improperly diagnosed as a 3452 * hardware error. 3453 */ 3454 static void 3455 pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde) 3456 { 3457 struct pde_action act; 3458 cpuset_t active, other_cpus; 3459 u_int cpuid; 3460 3461 sched_pin(); 3462 cpuid = PCPU_GET(cpuid); 3463 other_cpus = all_cpus; 3464 CPU_CLR(cpuid, &other_cpus); 3465 if (pmap == kernel_pmap || pmap_type_guest(pmap)) 3466 active = all_cpus; 3467 else { 3468 active = pmap->pm_active; 3469 } 3470 if (CPU_OVERLAP(&active, &other_cpus)) { 3471 act.store = cpuid; 3472 act.invalidate = active; 3473 act.va = va; 3474 act.pmap = pmap; 3475 act.pde = pde; 3476 act.newpde = newpde; 3477 CPU_SET(cpuid, &active); 3478 smp_rendezvous_cpus(active, 3479 smp_no_rendezvous_barrier, pmap_update_pde_action, 3480 pmap_update_pde_teardown, &act); 3481 } else { 3482 pmap_update_pde_store(pmap, pde, newpde); 3483 if (CPU_ISSET(cpuid, &active)) 3484 pmap_update_pde_invalidate(pmap, va, newpde); 3485 } 3486 sched_unpin(); 3487 } 3488 #else /* !SMP */ 3489 /* 3490 * Normal, non-SMP, invalidation functions. 
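 * These perform the same PCID/PTI bookkeeping as the SMP variants
 * above, but only ever act on the local CPU's TLB and pm_gen.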
3491 */ 3492 void 3493 pmap_invalidate_page(pmap_t pmap, vm_offset_t va) 3494 { 3495 struct invpcid_descr d; 3496 struct pmap_pcid *pcidp; 3497 uint64_t kcr3, ucr3; 3498 uint32_t pcid; 3499 3500 if (pmap->pm_type == PT_RVI || pmap->pm_type == PT_EPT) { 3501 pmap->pm_eptgen++; 3502 return; 3503 } 3504 KASSERT(pmap->pm_type == PT_X86, 3505 ("pmap_invalidate_range: unknown type %d", pmap->pm_type)); 3506 3507 if (pmap == kernel_pmap || pmap == PCPU_GET(curpmap)) { 3508 invlpg(va); 3509 if (pmap == PCPU_GET(curpmap) && pmap_pcid_enabled && 3510 pmap->pm_ucr3 != PMAP_NO_CR3) { 3511 critical_enter(); 3512 pcid = pmap_get_pcid(pmap); 3513 if (invpcid_works) { 3514 d.pcid = pcid | PMAP_PCID_USER_PT; 3515 d.pad = 0; 3516 d.addr = va; 3517 invpcid(&d, INVPCID_ADDR); 3518 } else { 3519 kcr3 = pmap->pm_cr3 | pcid | CR3_PCID_SAVE; 3520 ucr3 = pmap->pm_ucr3 | pcid | 3521 PMAP_PCID_USER_PT | CR3_PCID_SAVE; 3522 pmap_pti_pcid_invlpg(ucr3, kcr3, va); 3523 } 3524 critical_exit(); 3525 } 3526 } else if (pmap_pcid_enabled) { 3527 pcidp = zpcpu_get(pmap->pm_pcidp); 3528 pcidp->pm_gen = 0; 3529 } 3530 } 3531 3532 void 3533 pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 3534 { 3535 struct invpcid_descr d; 3536 struct pmap_pcid *pcidp; 3537 vm_offset_t addr; 3538 uint64_t kcr3, ucr3; 3539 uint32_t pcid; 3540 3541 if (pmap->pm_type == PT_RVI || pmap->pm_type == PT_EPT) { 3542 pmap->pm_eptgen++; 3543 return; 3544 } 3545 KASSERT(pmap->pm_type == PT_X86, 3546 ("pmap_invalidate_range: unknown type %d", pmap->pm_type)); 3547 3548 if (pmap == kernel_pmap || pmap == PCPU_GET(curpmap)) { 3549 for (addr = sva; addr < eva; addr += PAGE_SIZE) 3550 invlpg(addr); 3551 if (pmap == PCPU_GET(curpmap) && pmap_pcid_enabled && 3552 pmap->pm_ucr3 != PMAP_NO_CR3) { 3553 critical_enter(); 3554 pcid = pmap_get_pcid(pmap); 3555 if (invpcid_works) { 3556 d.pcid = pcid | PMAP_PCID_USER_PT; 3557 d.pad = 0; 3558 d.addr = sva; 3559 for (; d.addr < eva; d.addr += PAGE_SIZE) 3560 invpcid(&d, INVPCID_ADDR); 3561 } else { 3562 kcr3 = pmap->pm_cr3 | pcid | CR3_PCID_SAVE; 3563 ucr3 = pmap->pm_ucr3 | pcid | 3564 PMAP_PCID_USER_PT | CR3_PCID_SAVE; 3565 pmap_pti_pcid_invlrng(ucr3, kcr3, sva, eva); 3566 } 3567 critical_exit(); 3568 } 3569 } else if (pmap_pcid_enabled) { 3570 pcidp = zpcpu_get(pmap->pm_pcidp); 3571 pcidp->pm_gen = 0; 3572 } 3573 } 3574 3575 void 3576 pmap_invalidate_all(pmap_t pmap) 3577 { 3578 struct invpcid_descr d; 3579 struct pmap_pcid *pcidp; 3580 uint64_t kcr3, ucr3; 3581 uint32_t pcid; 3582 3583 if (pmap->pm_type == PT_RVI || pmap->pm_type == PT_EPT) { 3584 pmap->pm_eptgen++; 3585 return; 3586 } 3587 KASSERT(pmap->pm_type == PT_X86, 3588 ("pmap_invalidate_all: unknown type %d", pmap->pm_type)); 3589 3590 if (pmap == kernel_pmap) { 3591 if (pmap_pcid_enabled && invpcid_works) { 3592 bzero(&d, sizeof(d)); 3593 invpcid(&d, INVPCID_CTXGLOB); 3594 } else { 3595 invltlb_glob(); 3596 } 3597 } else if (pmap == PCPU_GET(curpmap)) { 3598 if (pmap_pcid_enabled) { 3599 critical_enter(); 3600 pcid = pmap_get_pcid(pmap); 3601 if (invpcid_works) { 3602 d.pcid = pcid; 3603 d.pad = 0; 3604 d.addr = 0; 3605 invpcid(&d, INVPCID_CTX); 3606 if (pmap->pm_ucr3 != PMAP_NO_CR3) { 3607 d.pcid |= PMAP_PCID_USER_PT; 3608 invpcid(&d, INVPCID_CTX); 3609 } 3610 } else { 3611 kcr3 = pmap->pm_cr3 | pcid; 3612 if (pmap->pm_ucr3 != PMAP_NO_CR3) { 3613 ucr3 = pmap->pm_ucr3 | pcid | 3614 PMAP_PCID_USER_PT; 3615 pmap_pti_pcid_invalidate(ucr3, kcr3); 3616 } else 3617 load_cr3(kcr3); 3618 } 3619 critical_exit(); 3620 } else { 3621 invltlb(); 3622 
} 3623 } else if (pmap_pcid_enabled) { 3624 pcidp = zpcpu_get(pmap->pm_pcidp); 3625 pcidp->pm_gen = 0; 3626 } 3627 } 3628 3629 void 3630 pmap_invalidate_cache(void) 3631 { 3632 3633 wbinvd(); 3634 } 3635 3636 static void 3637 pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde) 3638 { 3639 struct pmap_pcid *pcidp; 3640 3641 pmap_update_pde_store(pmap, pde, newpde); 3642 if (pmap == kernel_pmap || pmap == PCPU_GET(curpmap)) 3643 pmap_update_pde_invalidate(pmap, va, newpde); 3644 else { 3645 pcidp = zpcpu_get(pmap->pm_pcidp); 3646 pcidp->pm_gen = 0; 3647 } 3648 } 3649 #endif /* !SMP */ 3650 3651 static void 3652 pmap_invalidate_pde_page(pmap_t pmap, vm_offset_t va, pd_entry_t pde) 3653 { 3654 3655 /* 3656 * When the PDE has PG_PROMOTED set, the 2MB page mapping was created 3657 * by a promotion that did not invalidate the 512 4KB page mappings 3658 * that might exist in the TLB. Consequently, at this point, the TLB 3659 * may hold both 4KB and 2MB page mappings for the address range [va, 3660 * va + NBPDR). Therefore, the entire range must be invalidated here. 3661 * In contrast, when PG_PROMOTED is clear, the TLB will not hold any 3662 * 4KB page mappings for the address range [va, va + NBPDR), and so a 3663 * single INVLPG suffices to invalidate the 2MB page mapping from the 3664 * TLB. 3665 */ 3666 if ((pde & PG_PROMOTED) != 0) 3667 pmap_invalidate_range(pmap, va, va + NBPDR - 1); 3668 else 3669 pmap_invalidate_page(pmap, va); 3670 } 3671 3672 DEFINE_IFUNC(, void, pmap_invalidate_cache_range, 3673 (vm_offset_t sva, vm_offset_t eva)) 3674 { 3675 3676 if ((cpu_feature & CPUID_SS) != 0) 3677 return (pmap_invalidate_cache_range_selfsnoop); 3678 if ((cpu_feature & CPUID_CLFSH) != 0) 3679 return (pmap_force_invalidate_cache_range); 3680 return (pmap_invalidate_cache_range_all); 3681 } 3682 3683 #define PMAP_CLFLUSH_THRESHOLD (2 * 1024 * 1024) 3684 3685 static void 3686 pmap_invalidate_cache_range_check_align(vm_offset_t sva, vm_offset_t eva) 3687 { 3688 3689 KASSERT((sva & PAGE_MASK) == 0, 3690 ("pmap_invalidate_cache_range: sva not page-aligned")); 3691 KASSERT((eva & PAGE_MASK) == 0, 3692 ("pmap_invalidate_cache_range: eva not page-aligned")); 3693 } 3694 3695 static void 3696 pmap_invalidate_cache_range_selfsnoop(vm_offset_t sva, vm_offset_t eva) 3697 { 3698 3699 pmap_invalidate_cache_range_check_align(sva, eva); 3700 } 3701 3702 void 3703 pmap_force_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva) 3704 { 3705 3706 sva &= ~(vm_offset_t)(cpu_clflush_line_size - 1); 3707 3708 /* 3709 * XXX: Some CPUs fault, hang, or trash the local APIC 3710 * registers if we use CLFLUSH on the local APIC range. The 3711 * local APIC is always uncached, so we don't need to flush 3712 * for that range anyway. 3713 */ 3714 if (pmap_kextract(sva) == lapic_paddr) 3715 return; 3716 3717 if ((cpu_stdext_feature & CPUID_STDEXT_CLFLUSHOPT) != 0) { 3718 /* 3719 * Do per-cache line flush. Use a locked 3720 * instruction to insure that previous stores are 3721 * included in the write-back. The processor 3722 * propagates flush to other processors in the cache 3723 * coherence domain. 3724 */ 3725 atomic_thread_fence_seq_cst(); 3726 for (; sva < eva; sva += cpu_clflush_line_size) 3727 clflushopt(sva); 3728 atomic_thread_fence_seq_cst(); 3729 } else { 3730 /* 3731 * Writes are ordered by CLFLUSH on Intel CPUs. 
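 * On other vendors, CLFLUSH is not ordered with respect to other
 * stores, so MFENCE is used before and after the flush loop below
 * to provide the required ordering.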
3732 */ 3733 if (cpu_vendor_id != CPU_VENDOR_INTEL) 3734 mfence(); 3735 for (; sva < eva; sva += cpu_clflush_line_size) 3736 clflush(sva); 3737 if (cpu_vendor_id != CPU_VENDOR_INTEL) 3738 mfence(); 3739 } 3740 } 3741 3742 static void 3743 pmap_invalidate_cache_range_all(vm_offset_t sva, vm_offset_t eva) 3744 { 3745 3746 pmap_invalidate_cache_range_check_align(sva, eva); 3747 pmap_invalidate_cache(); 3748 } 3749 3750 /* 3751 * Remove the specified set of pages from the data and instruction caches. 3752 * 3753 * In contrast to pmap_invalidate_cache_range(), this function does not 3754 * rely on the CPU's self-snoop feature, because it is intended for use 3755 * when moving pages into a different cache domain. 3756 */ 3757 void 3758 pmap_invalidate_cache_pages(vm_page_t *pages, int count) 3759 { 3760 vm_offset_t daddr, eva; 3761 int i; 3762 bool useclflushopt; 3763 3764 useclflushopt = (cpu_stdext_feature & CPUID_STDEXT_CLFLUSHOPT) != 0; 3765 if (count >= PMAP_CLFLUSH_THRESHOLD / PAGE_SIZE || 3766 ((cpu_feature & CPUID_CLFSH) == 0 && !useclflushopt)) 3767 pmap_invalidate_cache(); 3768 else { 3769 if (useclflushopt) 3770 atomic_thread_fence_seq_cst(); 3771 else if (cpu_vendor_id != CPU_VENDOR_INTEL) 3772 mfence(); 3773 for (i = 0; i < count; i++) { 3774 daddr = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pages[i])); 3775 eva = daddr + PAGE_SIZE; 3776 for (; daddr < eva; daddr += cpu_clflush_line_size) { 3777 if (useclflushopt) 3778 clflushopt(daddr); 3779 else 3780 clflush(daddr); 3781 } 3782 } 3783 if (useclflushopt) 3784 atomic_thread_fence_seq_cst(); 3785 else if (cpu_vendor_id != CPU_VENDOR_INTEL) 3786 mfence(); 3787 } 3788 } 3789 3790 void 3791 pmap_flush_cache_range(vm_offset_t sva, vm_offset_t eva) 3792 { 3793 3794 pmap_invalidate_cache_range_check_align(sva, eva); 3795 3796 if ((cpu_stdext_feature & CPUID_STDEXT_CLWB) == 0) { 3797 pmap_force_invalidate_cache_range(sva, eva); 3798 return; 3799 } 3800 3801 /* See comment in pmap_force_invalidate_cache_range(). */ 3802 if (pmap_kextract(sva) == lapic_paddr) 3803 return; 3804 3805 atomic_thread_fence_seq_cst(); 3806 for (; sva < eva; sva += cpu_clflush_line_size) 3807 clwb(sva); 3808 atomic_thread_fence_seq_cst(); 3809 } 3810 3811 void 3812 pmap_flush_cache_phys_range(vm_paddr_t spa, vm_paddr_t epa, vm_memattr_t mattr) 3813 { 3814 pt_entry_t *pte; 3815 vm_offset_t vaddr; 3816 int error __diagused; 3817 int pte_bits; 3818 3819 KASSERT((spa & PAGE_MASK) == 0, 3820 ("pmap_flush_cache_phys_range: spa not page-aligned")); 3821 KASSERT((epa & PAGE_MASK) == 0, 3822 ("pmap_flush_cache_phys_range: epa not page-aligned")); 3823 3824 if (spa < dmaplimit) { 3825 pmap_flush_cache_range(PHYS_TO_DMAP(spa), PHYS_TO_DMAP(MIN( 3826 dmaplimit, epa))); 3827 if (dmaplimit >= epa) 3828 return; 3829 spa = dmaplimit; 3830 } 3831 3832 pte_bits = pmap_cache_bits(kernel_pmap, mattr, false) | X86_PG_RW | 3833 X86_PG_V; 3834 error = vmem_alloc(kernel_arena, PAGE_SIZE, M_BESTFIT | M_WAITOK, 3835 &vaddr); 3836 KASSERT(error == 0, ("vmem_alloc failed: %d", error)); 3837 pte = vtopte(vaddr); 3838 for (; spa < epa; spa += PAGE_SIZE) { 3839 sched_pin(); 3840 pte_store(pte, spa | pte_bits); 3841 pmap_invlpg(kernel_pmap, vaddr); 3842 /* XXXKIB atomic inside flush_cache_range are excessive */ 3843 pmap_flush_cache_range(vaddr, vaddr + PAGE_SIZE); 3844 sched_unpin(); 3845 } 3846 vmem_free(kernel_arena, vaddr, PAGE_SIZE); 3847 } 3848 3849 /* 3850 * Routine: pmap_extract 3851 * Function: 3852 * Extract the physical page address associated 3853 * with the given map/virtual_address pair. 
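 * A return value of 0 indicates that no valid mapping exists at
 * "va".  The lookup handles 1GB and 2MB superpage mappings as well
 * as ordinary 4KB mappings.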
3854 */ 3855 vm_paddr_t 3856 pmap_extract(pmap_t pmap, vm_offset_t va) 3857 { 3858 pdp_entry_t *pdpe; 3859 pd_entry_t *pde; 3860 pt_entry_t *pte, PG_V; 3861 vm_paddr_t pa; 3862 3863 pa = 0; 3864 PG_V = pmap_valid_bit(pmap); 3865 PMAP_LOCK(pmap); 3866 pdpe = pmap_pdpe(pmap, va); 3867 if (pdpe != NULL && (*pdpe & PG_V) != 0) { 3868 if ((*pdpe & PG_PS) != 0) 3869 pa = (*pdpe & PG_PS_FRAME) | (va & PDPMASK); 3870 else { 3871 pde = pmap_pdpe_to_pde(pdpe, va); 3872 if ((*pde & PG_V) != 0) { 3873 if ((*pde & PG_PS) != 0) { 3874 pa = (*pde & PG_PS_FRAME) | 3875 (va & PDRMASK); 3876 } else { 3877 pte = pmap_pde_to_pte(pde, va); 3878 pa = (*pte & PG_FRAME) | 3879 (va & PAGE_MASK); 3880 } 3881 } 3882 } 3883 } 3884 PMAP_UNLOCK(pmap); 3885 return (pa); 3886 } 3887 3888 /* 3889 * Routine: pmap_extract_and_hold 3890 * Function: 3891 * Atomically extract and hold the physical page 3892 * with the given pmap and virtual address pair 3893 * if that mapping permits the given protection. 3894 */ 3895 vm_page_t 3896 pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot) 3897 { 3898 pdp_entry_t pdpe, *pdpep; 3899 pd_entry_t pde, *pdep; 3900 pt_entry_t pte, PG_RW, PG_V; 3901 vm_page_t m; 3902 3903 m = NULL; 3904 PG_RW = pmap_rw_bit(pmap); 3905 PG_V = pmap_valid_bit(pmap); 3906 PMAP_LOCK(pmap); 3907 3908 pdpep = pmap_pdpe(pmap, va); 3909 if (pdpep == NULL || ((pdpe = *pdpep) & PG_V) == 0) 3910 goto out; 3911 if ((pdpe & PG_PS) != 0) { 3912 if ((pdpe & PG_RW) == 0 && (prot & VM_PROT_WRITE) != 0) 3913 goto out; 3914 m = PHYS_TO_VM_PAGE((pdpe & PG_PS_FRAME) | (va & PDPMASK)); 3915 goto check_page; 3916 } 3917 3918 pdep = pmap_pdpe_to_pde(pdpep, va); 3919 if (pdep == NULL || ((pde = *pdep) & PG_V) == 0) 3920 goto out; 3921 if ((pde & PG_PS) != 0) { 3922 if ((pde & PG_RW) == 0 && (prot & VM_PROT_WRITE) != 0) 3923 goto out; 3924 m = PHYS_TO_VM_PAGE((pde & PG_PS_FRAME) | (va & PDRMASK)); 3925 goto check_page; 3926 } 3927 3928 pte = *pmap_pde_to_pte(pdep, va); 3929 if ((pte & PG_V) == 0 || 3930 ((pte & PG_RW) == 0 && (prot & VM_PROT_WRITE) != 0)) 3931 goto out; 3932 m = PHYS_TO_VM_PAGE(pte & PG_FRAME); 3933 3934 check_page: 3935 if (m != NULL && !vm_page_wire_mapped(m)) 3936 m = NULL; 3937 out: 3938 PMAP_UNLOCK(pmap); 3939 return (m); 3940 } 3941 3942 /* 3943 * Routine: pmap_kextract 3944 * Function: 3945 * Extract the physical page address associated with the given kernel 3946 * virtual address. 3947 */ 3948 vm_paddr_t 3949 pmap_kextract(vm_offset_t va) 3950 { 3951 pd_entry_t pde; 3952 vm_paddr_t pa; 3953 3954 if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) { 3955 pa = DMAP_TO_PHYS(va); 3956 } else if (PMAP_ADDRESS_IN_LARGEMAP(va)) { 3957 pa = pmap_large_map_kextract(va); 3958 } else { 3959 pde = *vtopde(va); 3960 if (pde & PG_PS) { 3961 pa = (pde & PG_PS_FRAME) | (va & PDRMASK); 3962 } else { 3963 /* 3964 * Beware of a concurrent promotion that changes the 3965 * PDE at this point! For example, vtopte() must not 3966 * be used to access the PTE because it would use the 3967 * new PDE. It is, however, safe to use the old PDE 3968 * because the page table page is preserved by the 3969 * promotion. 3970 */ 3971 pa = *pmap_pde_to_pte(&pde, va); 3972 pa = (pa & PG_FRAME) | (va & PAGE_MASK); 3973 } 3974 } 3975 return (pa); 3976 } 3977 3978 /*************************************************** 3979 * Low level mapping routines..... 3980 ***************************************************/ 3981 3982 /* 3983 * Add a wired page to the kva. 3984 * Note: not SMP coherent. 
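 * The caller is responsible for any TLB invalidation that the new
 * mapping requires.  An illustrative (not prescriptive) pattern for
 * a temporary scratch mapping:
 *
 *	pmap_kenter(va, pa);
 *	pmap_invalidate_page(kernel_pmap, va);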
3985 */ 3986 void 3987 pmap_kenter(vm_offset_t va, vm_paddr_t pa) 3988 { 3989 pt_entry_t *pte; 3990 3991 pte = vtopte(va); 3992 pte_store(pte, pa | pg_g | pg_nx | X86_PG_A | X86_PG_M | 3993 X86_PG_RW | X86_PG_V); 3994 } 3995 3996 static __inline void 3997 pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode) 3998 { 3999 pt_entry_t *pte; 4000 int cache_bits; 4001 4002 pte = vtopte(va); 4003 cache_bits = pmap_cache_bits(kernel_pmap, mode, false); 4004 pte_store(pte, pa | pg_g | pg_nx | X86_PG_A | X86_PG_M | 4005 X86_PG_RW | X86_PG_V | cache_bits); 4006 } 4007 4008 /* 4009 * Remove a page from the kernel pagetables. 4010 * Note: not SMP coherent. 4011 */ 4012 void 4013 pmap_kremove(vm_offset_t va) 4014 { 4015 pt_entry_t *pte; 4016 4017 pte = vtopte(va); 4018 pte_clear(pte); 4019 } 4020 4021 /* 4022 * Used to map a range of physical addresses into kernel 4023 * virtual address space. 4024 * 4025 * The value passed in '*virt' is a suggested virtual address for 4026 * the mapping. Architectures which can support a direct-mapped 4027 * physical to virtual region can return the appropriate address 4028 * within that region, leaving '*virt' unchanged. Other 4029 * architectures should map the pages starting at '*virt' and 4030 * update '*virt' with the first usable address after the mapped 4031 * region. 4032 */ 4033 vm_offset_t 4034 pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot) 4035 { 4036 return PHYS_TO_DMAP(start); 4037 } 4038 4039 /* 4040 * Add a list of wired pages to the kva 4041 * this routine is only used for temporary 4042 * kernel mappings that do not need to have 4043 * page modification or references recorded. 4044 * Note that old mappings are simply written 4045 * over. The page *must* be wired. 4046 * Note: SMP coherent. Uses a ranged shootdown IPI. 4047 */ 4048 void 4049 pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count) 4050 { 4051 pt_entry_t *endpte, oldpte, pa, *pte; 4052 vm_page_t m; 4053 int cache_bits; 4054 4055 oldpte = 0; 4056 pte = vtopte(sva); 4057 endpte = pte + count; 4058 while (pte < endpte) { 4059 m = *ma++; 4060 cache_bits = pmap_cache_bits(kernel_pmap, m->md.pat_mode, false); 4061 pa = VM_PAGE_TO_PHYS(m) | cache_bits; 4062 if ((*pte & (PG_FRAME | X86_PG_PTE_CACHE)) != pa) { 4063 oldpte |= *pte; 4064 pte_store(pte, pa | pg_g | pg_nx | X86_PG_A | 4065 X86_PG_M | X86_PG_RW | X86_PG_V); 4066 } 4067 pte++; 4068 } 4069 if (__predict_false((oldpte & X86_PG_V) != 0)) 4070 pmap_invalidate_range(kernel_pmap, sva, sva + count * 4071 PAGE_SIZE); 4072 } 4073 4074 /* 4075 * This routine tears out page mappings from the 4076 * kernel -- it is meant only for temporary mappings. 4077 * Note: SMP coherent. Uses a ranged shootdown IPI. 4078 */ 4079 void 4080 pmap_qremove(vm_offset_t sva, int count) 4081 { 4082 vm_offset_t va; 4083 4084 va = sva; 4085 while (count-- > 0) { 4086 /* 4087 * pmap_enter() calls within the kernel virtual 4088 * address space happen on virtual addresses from 4089 * subarenas that import superpage-sized and -aligned 4090 * address ranges. So, the virtual address that we 4091 * allocate to use with pmap_qenter() can't be close 4092 * enough to one of those pmap_enter() calls for it to 4093 * be caught up in a promotion. 
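 * The assertions below check both properties: the address must be
 * a kernel address, and it must not be mapped by a promoted (2MB)
 * page directory entry.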
4094 */ 4095 KASSERT(va >= VM_MIN_KERNEL_ADDRESS, ("usermode va %lx", va)); 4096 KASSERT((*vtopde(va) & X86_PG_PS) == 0, 4097 ("pmap_qremove on promoted va %#lx", va)); 4098 4099 pmap_kremove(va); 4100 va += PAGE_SIZE; 4101 } 4102 pmap_invalidate_range(kernel_pmap, sva, va); 4103 } 4104 4105 /*************************************************** 4106 * Page table page management routines..... 4107 ***************************************************/ 4108 /* 4109 * Schedule the specified unused page table page to be freed. Specifically, 4110 * add the page to the specified list of pages that will be released to the 4111 * physical memory manager after the TLB has been updated. 4112 */ 4113 static __inline void 4114 pmap_add_delayed_free_list(vm_page_t m, struct spglist *free, bool set_PG_ZERO) 4115 { 4116 4117 if (set_PG_ZERO) 4118 m->flags |= PG_ZERO; 4119 else 4120 m->flags &= ~PG_ZERO; 4121 SLIST_INSERT_HEAD(free, m, plinks.s.ss); 4122 } 4123 4124 /* 4125 * Inserts the specified page table page into the specified pmap's collection 4126 * of idle page table pages. Each of a pmap's page table pages is responsible 4127 * for mapping a distinct range of virtual addresses. The pmap's collection is 4128 * ordered by this virtual address range. 4129 * 4130 * If "promoted" is false, then the page table page "mpte" must be zero filled; 4131 * "mpte"'s valid field will be set to 0. 4132 * 4133 * If "promoted" is true and "allpte_PG_A_set" is false, then "mpte" must 4134 * contain valid mappings with identical attributes except for PG_A; "mpte"'s 4135 * valid field will be set to 1. 4136 * 4137 * If "promoted" and "allpte_PG_A_set" are both true, then "mpte" must contain 4138 * valid mappings with identical attributes including PG_A; "mpte"'s valid 4139 * field will be set to VM_PAGE_BITS_ALL. 4140 */ 4141 static __inline int 4142 pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte, bool promoted, 4143 bool allpte_PG_A_set) 4144 { 4145 4146 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4147 KASSERT(promoted || !allpte_PG_A_set, 4148 ("a zero-filled PTP can't have PG_A set in every PTE")); 4149 mpte->valid = promoted ? (allpte_PG_A_set ? VM_PAGE_BITS_ALL : 1) : 0; 4150 return (vm_radix_insert(&pmap->pm_root, mpte)); 4151 } 4152 4153 /* 4154 * Removes the page table page mapping the specified virtual address from the 4155 * specified pmap's collection of idle page table pages, and returns it. 4156 * Otherwise, returns NULL if there is no page table page corresponding to the 4157 * specified virtual address. 4158 */ 4159 static __inline vm_page_t 4160 pmap_remove_pt_page(pmap_t pmap, vm_offset_t va) 4161 { 4162 4163 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4164 return (vm_radix_remove(&pmap->pm_root, pmap_pde_pindex(va))); 4165 } 4166 4167 /* 4168 * Decrements a page table page's reference count, which is used to record the 4169 * number of valid page table entries within the page. If the reference count 4170 * drops to zero, then the page table page is unmapped. Returns true if the 4171 * page table page was unmapped and false otherwise. 
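 * The caller is responsible for invalidating the TLB and any
 * paging-structure caches and for freeing the pages accumulated in
 * "free" once that invalidation is complete.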
4172 */ 4173 static inline bool 4174 pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free) 4175 { 4176 4177 --m->ref_count; 4178 if (m->ref_count == 0) { 4179 _pmap_unwire_ptp(pmap, va, m, free); 4180 return (true); 4181 } else 4182 return (false); 4183 } 4184 4185 static void 4186 _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free) 4187 { 4188 pml5_entry_t *pml5; 4189 pml4_entry_t *pml4; 4190 pdp_entry_t *pdp; 4191 pd_entry_t *pd; 4192 vm_page_t pdpg, pdppg, pml4pg; 4193 4194 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4195 4196 /* 4197 * unmap the page table page 4198 */ 4199 if (m->pindex >= NUPDE + NUPDPE + NUPML4E) { 4200 /* PML4 page */ 4201 MPASS(pmap_is_la57(pmap)); 4202 pml5 = pmap_pml5e(pmap, va); 4203 *pml5 = 0; 4204 if (pmap->pm_pmltopu != NULL && va <= VM_MAXUSER_ADDRESS) { 4205 pml5 = pmap_pml5e_u(pmap, va); 4206 *pml5 = 0; 4207 } 4208 } else if (m->pindex >= NUPDE + NUPDPE) { 4209 /* PDP page */ 4210 pml4 = pmap_pml4e(pmap, va); 4211 *pml4 = 0; 4212 if (!pmap_is_la57(pmap) && pmap->pm_pmltopu != NULL && 4213 va <= VM_MAXUSER_ADDRESS) { 4214 pml4 = pmap_pml4e_u(pmap, va); 4215 *pml4 = 0; 4216 } 4217 } else if (m->pindex >= NUPDE) { 4218 /* PD page */ 4219 pdp = pmap_pdpe(pmap, va); 4220 *pdp = 0; 4221 } else { 4222 /* PTE page */ 4223 pd = pmap_pde(pmap, va); 4224 *pd = 0; 4225 } 4226 if (m->pindex < NUPDE) { 4227 /* We just released a PT, unhold the matching PD */ 4228 pdpg = PHYS_TO_VM_PAGE(*pmap_pdpe(pmap, va) & PG_FRAME); 4229 pmap_unwire_ptp(pmap, va, pdpg, free); 4230 } else if (m->pindex < NUPDE + NUPDPE) { 4231 /* We just released a PD, unhold the matching PDP */ 4232 pdppg = PHYS_TO_VM_PAGE(*pmap_pml4e(pmap, va) & PG_FRAME); 4233 pmap_unwire_ptp(pmap, va, pdppg, free); 4234 } else if (m->pindex < NUPDE + NUPDPE + NUPML4E && pmap_is_la57(pmap)) { 4235 /* We just released a PDP, unhold the matching PML4 */ 4236 pml4pg = PHYS_TO_VM_PAGE(*pmap_pml5e(pmap, va) & PG_FRAME); 4237 pmap_unwire_ptp(pmap, va, pml4pg, free); 4238 } 4239 4240 pmap_pt_page_count_adj(pmap, -1); 4241 4242 /* 4243 * Put page on a list so that it is released after 4244 * *ALL* TLB shootdown is done 4245 */ 4246 pmap_add_delayed_free_list(m, free, true); 4247 } 4248 4249 /* 4250 * After removing a page table entry, this routine is used to 4251 * conditionally free the page, and manage the reference count. 4252 */ 4253 static int 4254 pmap_unuse_pt(pmap_t pmap, vm_offset_t va, pd_entry_t ptepde, 4255 struct spglist *free) 4256 { 4257 vm_page_t mpte; 4258 4259 if (va >= VM_MAXUSER_ADDRESS) 4260 return (0); 4261 KASSERT(ptepde != 0, ("pmap_unuse_pt: ptepde != 0")); 4262 mpte = PHYS_TO_VM_PAGE(ptepde & PG_FRAME); 4263 return (pmap_unwire_ptp(pmap, va, mpte, free)); 4264 } 4265 4266 /* 4267 * Release a page table page reference after a failed attempt to create a 4268 * mapping. 4269 */ 4270 static void 4271 pmap_abort_ptp(pmap_t pmap, vm_offset_t va, vm_page_t mpte) 4272 { 4273 struct spglist free; 4274 4275 SLIST_INIT(&free); 4276 if (pmap_unwire_ptp(pmap, va, mpte, &free)) { 4277 /* 4278 * Although "va" was never mapped, paging-structure caches 4279 * could nonetheless have entries that refer to the freed 4280 * page table pages. Invalidate those entries. 
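 * (Processors may cache upper-level paging entries separately from
 * leaf TLB entries, so those cached intermediate entries must be
 * flushed even though no leaf mapping at "va" was created.)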
4281 */ 4282 pmap_invalidate_page(pmap, va); 4283 vm_page_free_pages_toq(&free, true); 4284 } 4285 } 4286 4287 static void 4288 pmap_pinit_pcids(pmap_t pmap, uint32_t pcid, int gen) 4289 { 4290 struct pmap_pcid *pcidp; 4291 int i; 4292 4293 CPU_FOREACH(i) { 4294 pcidp = zpcpu_get_cpu(pmap->pm_pcidp, i); 4295 pcidp->pm_pcid = pcid; 4296 pcidp->pm_gen = gen; 4297 } 4298 } 4299 4300 void 4301 pmap_pinit0(pmap_t pmap) 4302 { 4303 struct proc *p; 4304 struct thread *td; 4305 4306 PMAP_LOCK_INIT(pmap); 4307 pmap->pm_pmltop = kernel_pmap->pm_pmltop; 4308 pmap->pm_pmltopu = NULL; 4309 pmap->pm_cr3 = kernel_pmap->pm_cr3; 4310 /* hack to keep pmap_pti_pcid_invalidate() alive */ 4311 pmap->pm_ucr3 = PMAP_NO_CR3; 4312 vm_radix_init(&pmap->pm_root); 4313 CPU_ZERO(&pmap->pm_active); 4314 TAILQ_INIT(&pmap->pm_pvchunk); 4315 bzero(&pmap->pm_stats, sizeof pmap->pm_stats); 4316 pmap->pm_flags = pmap_flags; 4317 pmap->pm_pcidp = uma_zalloc_pcpu(pcpu_zone_8, M_WAITOK); 4318 pmap_pinit_pcids(pmap, PMAP_PCID_KERN + 1, 1); 4319 pmap_activate_boot(pmap); 4320 td = curthread; 4321 if (pti) { 4322 p = td->td_proc; 4323 PROC_LOCK(p); 4324 p->p_md.md_flags |= P_MD_KPTI; 4325 PROC_UNLOCK(p); 4326 } 4327 pmap_thread_init_invl_gen(td); 4328 4329 if ((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0) { 4330 pmap_pkru_ranges_zone = uma_zcreate("pkru ranges", 4331 sizeof(struct pmap_pkru_range), NULL, NULL, NULL, NULL, 4332 UMA_ALIGN_PTR, 0); 4333 } 4334 } 4335 4336 void 4337 pmap_pinit_pml4(vm_page_t pml4pg) 4338 { 4339 pml4_entry_t *pm_pml4; 4340 int i; 4341 4342 pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml4pg)); 4343 4344 /* Wire in kernel global address entries. */ 4345 for (i = 0; i < NKPML4E; i++) { 4346 pm_pml4[KPML4BASE + i] = (KPDPphys + ptoa(i)) | X86_PG_RW | 4347 X86_PG_V; 4348 } 4349 #ifdef KASAN 4350 for (i = 0; i < NKASANPML4E; i++) { 4351 pm_pml4[KASANPML4I + i] = (KASANPDPphys + ptoa(i)) | X86_PG_RW | 4352 X86_PG_V | pg_nx; 4353 } 4354 #endif 4355 #ifdef KMSAN 4356 for (i = 0; i < NKMSANSHADPML4E; i++) { 4357 pm_pml4[KMSANSHADPML4I + i] = (KMSANSHADPDPphys + ptoa(i)) | 4358 X86_PG_RW | X86_PG_V | pg_nx; 4359 } 4360 for (i = 0; i < NKMSANORIGPML4E; i++) { 4361 pm_pml4[KMSANORIGPML4I + i] = (KMSANORIGPDPphys + ptoa(i)) | 4362 X86_PG_RW | X86_PG_V | pg_nx; 4363 } 4364 #endif 4365 for (i = 0; i < ndmpdpphys; i++) { 4366 pm_pml4[DMPML4I + i] = (DMPDPphys + ptoa(i)) | X86_PG_RW | 4367 X86_PG_V; 4368 } 4369 4370 /* install self-referential address mapping entry(s) */ 4371 pm_pml4[PML4PML4I] = VM_PAGE_TO_PHYS(pml4pg) | X86_PG_V | X86_PG_RW | 4372 X86_PG_A | X86_PG_M; 4373 4374 /* install large map entries if configured */ 4375 for (i = 0; i < lm_ents; i++) 4376 pm_pml4[LMSPML4I + i] = kernel_pmap->pm_pmltop[LMSPML4I + i]; 4377 } 4378 4379 void 4380 pmap_pinit_pml5(vm_page_t pml5pg) 4381 { 4382 pml5_entry_t *pm_pml5; 4383 4384 pm_pml5 = (pml5_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml5pg)); 4385 4386 /* 4387 * Add pml5 entry at top of KVA pointing to existing pml4 table, 4388 * entering all existing kernel mappings into level 5 table. 4389 */ 4390 pm_pml5[pmap_pml5e_index(UPT_MAX_ADDRESS)] = KPML4phys | X86_PG_V | 4391 X86_PG_RW | X86_PG_A | X86_PG_M; 4392 4393 /* 4394 * Install self-referential address mapping entry. 
4395 */ 4396 pm_pml5[PML5PML5I] = VM_PAGE_TO_PHYS(pml5pg) | 4397 X86_PG_RW | X86_PG_V | X86_PG_M | X86_PG_A; 4398 } 4399 4400 static void 4401 pmap_pinit_pml4_pti(vm_page_t pml4pgu) 4402 { 4403 pml4_entry_t *pm_pml4u; 4404 int i; 4405 4406 pm_pml4u = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml4pgu)); 4407 for (i = 0; i < NPML4EPG; i++) 4408 pm_pml4u[i] = pti_pml4[i]; 4409 } 4410 4411 static void 4412 pmap_pinit_pml5_pti(vm_page_t pml5pgu) 4413 { 4414 pml5_entry_t *pm_pml5u; 4415 4416 pm_pml5u = (pml5_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml5pgu)); 4417 pagezero(pm_pml5u); 4418 4419 /* 4420 * Add pml5 entry at top of KVA pointing to existing pml4 pti 4421 * table, entering all kernel mappings needed for usermode 4422 * into level 5 table. 4423 */ 4424 pm_pml5u[pmap_pml5e_index(UPT_MAX_ADDRESS)] = 4425 pmap_kextract((vm_offset_t)pti_pml4) | 4426 X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M; 4427 } 4428 4429 /* Allocate a page table page and do related bookkeeping */ 4430 static vm_page_t 4431 pmap_alloc_pt_page(pmap_t pmap, vm_pindex_t pindex, int flags) 4432 { 4433 vm_page_t m; 4434 4435 m = vm_page_alloc_noobj(flags); 4436 if (__predict_false(m == NULL)) 4437 return (NULL); 4438 m->pindex = pindex; 4439 pmap_pt_page_count_adj(pmap, 1); 4440 return (m); 4441 } 4442 4443 static void 4444 pmap_free_pt_page(pmap_t pmap, vm_page_t m, bool zerofilled) 4445 { 4446 /* 4447 * This function assumes the page will need to be unwired, 4448 * even though the counterpart allocation in pmap_alloc_pt_page() 4449 * doesn't enforce VM_ALLOC_WIRED. However, all current uses 4450 * of pmap_free_pt_page() require unwiring. The case in which 4451 * a PT page doesn't require unwiring because its ref_count has 4452 * naturally reached 0 is handled through _pmap_unwire_ptp(). 4453 */ 4454 vm_page_unwire_noq(m); 4455 if (zerofilled) 4456 vm_page_free_zero(m); 4457 else 4458 vm_page_free(m); 4459 4460 pmap_pt_page_count_adj(pmap, -1); 4461 } 4462 4463 _Static_assert(sizeof(struct pmap_pcid) == 8, "Fix pcpu zone for pm_pcidp"); 4464 4465 /* 4466 * Initialize a preallocated and zeroed pmap structure, 4467 * such as one in a vmspace structure. 4468 */ 4469 int 4470 pmap_pinit_type(pmap_t pmap, enum pmap_type pm_type, int flags) 4471 { 4472 vm_page_t pmltop_pg, pmltop_pgu; 4473 vm_paddr_t pmltop_phys; 4474 4475 bzero(&pmap->pm_stats, sizeof pmap->pm_stats); 4476 4477 /* 4478 * Allocate the page directory page. Pass NULL instead of a 4479 * pointer to the pmap here to avoid calling 4480 * pmap_resident_count_adj() through pmap_pt_page_count_adj(), 4481 * since that requires pmap lock. Instead do the accounting 4482 * manually. 4483 * 4484 * Note that final call to pmap_remove() optimization that 4485 * checks for zero resident_count is basically disabled by 4486 * accounting for top-level page. But the optimization was 4487 * not effective since we started using non-managed mapping of 4488 * the shared page. 
4489 */ 4490 pmltop_pg = pmap_alloc_pt_page(NULL, 0, VM_ALLOC_WIRED | VM_ALLOC_ZERO | 4491 VM_ALLOC_WAITOK); 4492 pmap_pt_page_count_pinit(pmap, 1); 4493 4494 pmltop_phys = VM_PAGE_TO_PHYS(pmltop_pg); 4495 pmap->pm_pmltop = (pml5_entry_t *)PHYS_TO_DMAP(pmltop_phys); 4496 4497 if (pmap_pcid_enabled) { 4498 if (pmap->pm_pcidp == NULL) 4499 pmap->pm_pcidp = uma_zalloc_pcpu(pcpu_zone_8, 4500 M_WAITOK); 4501 pmap_pinit_pcids(pmap, PMAP_PCID_NONE, 0); 4502 } 4503 pmap->pm_cr3 = PMAP_NO_CR3; /* initialize to an invalid value */ 4504 pmap->pm_ucr3 = PMAP_NO_CR3; 4505 pmap->pm_pmltopu = NULL; 4506 4507 pmap->pm_type = pm_type; 4508 4509 /* 4510 * Do not install the host kernel mappings in the nested page 4511 * tables. These mappings are meaningless in the guest physical 4512 * address space. 4513 * Install minimal kernel mappings in PTI case. 4514 */ 4515 switch (pm_type) { 4516 case PT_X86: 4517 pmap->pm_cr3 = pmltop_phys; 4518 if (pmap_is_la57(pmap)) 4519 pmap_pinit_pml5(pmltop_pg); 4520 else 4521 pmap_pinit_pml4(pmltop_pg); 4522 if ((curproc->p_md.md_flags & P_MD_KPTI) != 0) { 4523 /* 4524 * As with pmltop_pg, pass NULL instead of a 4525 * pointer to the pmap to ensure that the PTI 4526 * page counted explicitly. 4527 */ 4528 pmltop_pgu = pmap_alloc_pt_page(NULL, 0, 4529 VM_ALLOC_WIRED | VM_ALLOC_WAITOK); 4530 pmap_pt_page_count_pinit(pmap, 1); 4531 pmap->pm_pmltopu = (pml4_entry_t *)PHYS_TO_DMAP( 4532 VM_PAGE_TO_PHYS(pmltop_pgu)); 4533 if (pmap_is_la57(pmap)) 4534 pmap_pinit_pml5_pti(pmltop_pgu); 4535 else 4536 pmap_pinit_pml4_pti(pmltop_pgu); 4537 pmap->pm_ucr3 = VM_PAGE_TO_PHYS(pmltop_pgu); 4538 } 4539 if ((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0) { 4540 rangeset_init(&pmap->pm_pkru, pkru_dup_range, 4541 pkru_free_range, pmap, M_NOWAIT); 4542 } 4543 break; 4544 case PT_EPT: 4545 case PT_RVI: 4546 pmap->pm_eptsmr = smr_create("pmap", 0, 0); 4547 break; 4548 } 4549 4550 vm_radix_init(&pmap->pm_root); 4551 CPU_ZERO(&pmap->pm_active); 4552 TAILQ_INIT(&pmap->pm_pvchunk); 4553 pmap->pm_flags = flags; 4554 pmap->pm_eptgen = 0; 4555 4556 return (1); 4557 } 4558 4559 int 4560 pmap_pinit(pmap_t pmap) 4561 { 4562 4563 return (pmap_pinit_type(pmap, PT_X86, pmap_flags)); 4564 } 4565 4566 static void 4567 pmap_allocpte_free_unref(pmap_t pmap, vm_offset_t va, pt_entry_t *pte) 4568 { 4569 vm_page_t mpg; 4570 struct spglist free; 4571 4572 mpg = PHYS_TO_VM_PAGE(*pte & PG_FRAME); 4573 if (mpg->ref_count != 0) 4574 return; 4575 SLIST_INIT(&free); 4576 _pmap_unwire_ptp(pmap, va, mpg, &free); 4577 pmap_invalidate_page(pmap, va); 4578 vm_page_free_pages_toq(&free, true); 4579 } 4580 4581 static pml4_entry_t * 4582 pmap_allocpte_getpml4(pmap_t pmap, struct rwlock **lockp, vm_offset_t va, 4583 bool addref) 4584 { 4585 vm_pindex_t pml5index; 4586 pml5_entry_t *pml5; 4587 pml4_entry_t *pml4; 4588 vm_page_t pml4pg; 4589 pt_entry_t PG_V; 4590 bool allocated; 4591 4592 if (!pmap_is_la57(pmap)) 4593 return (&pmap->pm_pmltop[pmap_pml4e_index(va)]); 4594 4595 PG_V = pmap_valid_bit(pmap); 4596 pml5index = pmap_pml5e_index(va); 4597 pml5 = &pmap->pm_pmltop[pml5index]; 4598 if ((*pml5 & PG_V) == 0) { 4599 if (pmap_allocpte_nosleep(pmap, pmap_pml5e_pindex(va), lockp, 4600 va) == NULL) 4601 return (NULL); 4602 allocated = true; 4603 } else { 4604 allocated = false; 4605 } 4606 pml4 = (pml4_entry_t *)PHYS_TO_DMAP(*pml5 & PG_FRAME); 4607 pml4 = &pml4[pmap_pml4e_index(va)]; 4608 if ((*pml4 & PG_V) == 0) { 4609 pml4pg = PHYS_TO_VM_PAGE(*pml5 & PG_FRAME); 4610 if (allocated && !addref) 4611 pml4pg->ref_count--; 4612 else if 
(!allocated && addref) 4613 pml4pg->ref_count++; 4614 } 4615 return (pml4); 4616 } 4617 4618 static pdp_entry_t * 4619 pmap_allocpte_getpdp(pmap_t pmap, struct rwlock **lockp, vm_offset_t va, 4620 bool addref) 4621 { 4622 vm_page_t pdppg; 4623 pml4_entry_t *pml4; 4624 pdp_entry_t *pdp; 4625 pt_entry_t PG_V; 4626 bool allocated; 4627 4628 PG_V = pmap_valid_bit(pmap); 4629 4630 pml4 = pmap_allocpte_getpml4(pmap, lockp, va, false); 4631 if (pml4 == NULL) 4632 return (NULL); 4633 4634 if ((*pml4 & PG_V) == 0) { 4635 /* Have to allocate a new pdp, recurse */ 4636 if (pmap_allocpte_nosleep(pmap, pmap_pml4e_pindex(va), lockp, 4637 va) == NULL) { 4638 if (pmap_is_la57(pmap)) 4639 pmap_allocpte_free_unref(pmap, va, 4640 pmap_pml5e(pmap, va)); 4641 return (NULL); 4642 } 4643 allocated = true; 4644 } else { 4645 allocated = false; 4646 } 4647 pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME); 4648 pdp = &pdp[pmap_pdpe_index(va)]; 4649 if ((*pdp & PG_V) == 0) { 4650 pdppg = PHYS_TO_VM_PAGE(*pml4 & PG_FRAME); 4651 if (allocated && !addref) 4652 pdppg->ref_count--; 4653 else if (!allocated && addref) 4654 pdppg->ref_count++; 4655 } 4656 return (pdp); 4657 } 4658 4659 /* 4660 * The ptepindexes, i.e. page indices, of the page table pages encountered 4661 * while translating virtual address va are defined as follows: 4662 * - for the page table page (last level), 4663 * ptepindex = pmap_pde_pindex(va) = va >> PDRSHIFT, 4664 * in other words, it is just the index of the PDE that maps the page 4665 * table page. 4666 * - for the page directory page, 4667 * ptepindex = NUPDE (number of userland PD entries) + 4668 * (pmap_pde_index(va) >> NPDEPGSHIFT) 4669 * i.e. index of PDPE is put after the last index of PDE, 4670 * - for the page directory pointer page, 4671 * ptepindex = NUPDE + NUPDPE + (pmap_pde_index(va) >> (NPDEPGSHIFT + 4672 * NPML4EPGSHIFT), 4673 * i.e. index of pml4e is put after the last index of PDPE, 4674 * - for the PML4 page (if LA57 mode is enabled), 4675 * ptepindex = NUPDE + NUPDPE + NUPML4E + (pmap_pde_index(va) >> 4676 * (NPDEPGSHIFT + NPML4EPGSHIFT + NPML5EPGSHIFT), 4677 * i.e. index of pml5e is put after the last index of PML4E. 4678 * 4679 * Define an order on the paging entries, where all entries of the 4680 * same height are put together, then heights are put from deepest to 4681 * root. Then ptexpindex is the sequential number of the 4682 * corresponding paging entry in this order. 4683 * 4684 * The values of NUPDE, NUPDPE, and NUPML4E are determined by the size of 4685 * LA57 paging structures even in LA48 paging mode. Moreover, the 4686 * ptepindexes are calculated as if the paging structures were 5-level 4687 * regardless of the actual mode of operation. 4688 * 4689 * The root page at PML4/PML5 does not participate in this indexing scheme, 4690 * since it is statically allocated by pmap_pinit() and not by pmap_allocpte(). 4691 */ 4692 static vm_page_t 4693 pmap_allocpte_nosleep(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp, 4694 vm_offset_t va) 4695 { 4696 vm_pindex_t pml5index, pml4index; 4697 pml5_entry_t *pml5, *pml5u; 4698 pml4_entry_t *pml4, *pml4u; 4699 pdp_entry_t *pdp; 4700 pd_entry_t *pd; 4701 vm_page_t m, pdpg; 4702 pt_entry_t PG_A, PG_M, PG_RW, PG_V; 4703 4704 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4705 4706 PG_A = pmap_accessed_bit(pmap); 4707 PG_M = pmap_modified_bit(pmap); 4708 PG_V = pmap_valid_bit(pmap); 4709 PG_RW = pmap_rw_bit(pmap); 4710 4711 /* 4712 * Allocate a page table page. 
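 * The page is allocated wired and zero filled.  If the allocation
 * fails, NULL is returned and the caller decides whether to sleep
 * and retry (see pmap_allocpte_alloc()).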
4713 */ 4714 m = pmap_alloc_pt_page(pmap, ptepindex, 4715 VM_ALLOC_WIRED | VM_ALLOC_ZERO); 4716 if (m == NULL) 4717 return (NULL); 4718 4719 /* 4720 * Map the pagetable page into the process address space, if 4721 * it isn't already there. 4722 */ 4723 if (ptepindex >= NUPDE + NUPDPE + NUPML4E) { 4724 MPASS(pmap_is_la57(pmap)); 4725 4726 pml5index = pmap_pml5e_index(va); 4727 pml5 = &pmap->pm_pmltop[pml5index]; 4728 KASSERT((*pml5 & PG_V) == 0, 4729 ("pmap %p va %#lx pml5 %#lx", pmap, va, *pml5)); 4730 *pml5 = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M; 4731 4732 if (pmap->pm_pmltopu != NULL && pml5index < NUPML5E) { 4733 MPASS(pmap->pm_ucr3 != PMAP_NO_CR3); 4734 *pml5 |= pg_nx; 4735 4736 pml5u = &pmap->pm_pmltopu[pml5index]; 4737 *pml5u = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | 4738 PG_A | PG_M; 4739 } 4740 } else if (ptepindex >= NUPDE + NUPDPE) { 4741 pml4index = pmap_pml4e_index(va); 4742 /* Wire up a new PDPE page */ 4743 pml4 = pmap_allocpte_getpml4(pmap, lockp, va, true); 4744 if (pml4 == NULL) { 4745 pmap_free_pt_page(pmap, m, true); 4746 return (NULL); 4747 } 4748 KASSERT((*pml4 & PG_V) == 0, 4749 ("pmap %p va %#lx pml4 %#lx", pmap, va, *pml4)); 4750 *pml4 = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M; 4751 4752 if (!pmap_is_la57(pmap) && pmap->pm_pmltopu != NULL && 4753 pml4index < NUPML4E) { 4754 MPASS(pmap->pm_ucr3 != PMAP_NO_CR3); 4755 4756 /* 4757 * PTI: Make all user-space mappings in the 4758 * kernel-mode page table no-execute so that 4759 * we detect any programming errors that leave 4760 * the kernel-mode page table active on return 4761 * to user space. 4762 */ 4763 *pml4 |= pg_nx; 4764 4765 pml4u = &pmap->pm_pmltopu[pml4index]; 4766 *pml4u = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | 4767 PG_A | PG_M; 4768 } 4769 } else if (ptepindex >= NUPDE) { 4770 /* Wire up a new PDE page */ 4771 pdp = pmap_allocpte_getpdp(pmap, lockp, va, true); 4772 if (pdp == NULL) { 4773 pmap_free_pt_page(pmap, m, true); 4774 return (NULL); 4775 } 4776 KASSERT((*pdp & PG_V) == 0, 4777 ("pmap %p va %#lx pdp %#lx", pmap, va, *pdp)); 4778 *pdp = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M; 4779 } else { 4780 /* Wire up a new PTE page */ 4781 pdp = pmap_allocpte_getpdp(pmap, lockp, va, false); 4782 if (pdp == NULL) { 4783 pmap_free_pt_page(pmap, m, true); 4784 return (NULL); 4785 } 4786 if ((*pdp & PG_V) == 0) { 4787 /* Have to allocate a new pd, recurse */ 4788 if (pmap_allocpte_nosleep(pmap, pmap_pdpe_pindex(va), 4789 lockp, va) == NULL) { 4790 pmap_allocpte_free_unref(pmap, va, 4791 pmap_pml4e(pmap, va)); 4792 pmap_free_pt_page(pmap, m, true); 4793 return (NULL); 4794 } 4795 } else { 4796 /* Add reference to the pd page */ 4797 pdpg = PHYS_TO_VM_PAGE(*pdp & PG_FRAME); 4798 pdpg->ref_count++; 4799 } 4800 pd = (pd_entry_t *)PHYS_TO_DMAP(*pdp & PG_FRAME); 4801 4802 /* Now we know where the page directory page is */ 4803 pd = &pd[pmap_pde_index(va)]; 4804 KASSERT((*pd & PG_V) == 0, 4805 ("pmap %p va %#lx pd %#lx", pmap, va, *pd)); 4806 *pd = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M; 4807 } 4808 4809 return (m); 4810 } 4811 4812 /* 4813 * This routine is called if the desired page table page does not exist. 4814 * 4815 * If page table page allocation fails, this routine may sleep before 4816 * returning NULL. It sleeps only if a lock pointer was given. Sleep 4817 * occurs right before returning to the caller. 
This way, we never 4818 * drop pmap lock to sleep while a page table page has ref_count == 0, 4819 * which prevents the page from being freed under us. 4820 */ 4821 static vm_page_t 4822 pmap_allocpte_alloc(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp, 4823 vm_offset_t va) 4824 { 4825 vm_page_t m; 4826 4827 m = pmap_allocpte_nosleep(pmap, ptepindex, lockp, va); 4828 if (m == NULL && lockp != NULL) { 4829 RELEASE_PV_LIST_LOCK(lockp); 4830 PMAP_UNLOCK(pmap); 4831 PMAP_ASSERT_NOT_IN_DI(); 4832 vm_wait(NULL); 4833 PMAP_LOCK(pmap); 4834 } 4835 return (m); 4836 } 4837 4838 static pd_entry_t * 4839 pmap_alloc_pde(pmap_t pmap, vm_offset_t va, vm_page_t *pdpgp, 4840 struct rwlock **lockp) 4841 { 4842 pdp_entry_t *pdpe, PG_V; 4843 pd_entry_t *pde; 4844 vm_page_t pdpg; 4845 vm_pindex_t pdpindex; 4846 4847 PG_V = pmap_valid_bit(pmap); 4848 4849 retry: 4850 pdpe = pmap_pdpe(pmap, va); 4851 if (pdpe != NULL && (*pdpe & PG_V) != 0) { 4852 pde = pmap_pdpe_to_pde(pdpe, va); 4853 if (va < VM_MAXUSER_ADDRESS) { 4854 /* Add a reference to the pd page. */ 4855 pdpg = PHYS_TO_VM_PAGE(*pdpe & PG_FRAME); 4856 pdpg->ref_count++; 4857 } else 4858 pdpg = NULL; 4859 } else if (va < VM_MAXUSER_ADDRESS) { 4860 /* Allocate a pd page. */ 4861 pdpindex = pmap_pde_pindex(va) >> NPDPEPGSHIFT; 4862 pdpg = pmap_allocpte_alloc(pmap, NUPDE + pdpindex, lockp, va); 4863 if (pdpg == NULL) { 4864 if (lockp != NULL) 4865 goto retry; 4866 else 4867 return (NULL); 4868 } 4869 pde = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pdpg)); 4870 pde = &pde[pmap_pde_index(va)]; 4871 } else 4872 panic("pmap_alloc_pde: missing page table page for va %#lx", 4873 va); 4874 *pdpgp = pdpg; 4875 return (pde); 4876 } 4877 4878 static vm_page_t 4879 pmap_allocpte(pmap_t pmap, vm_offset_t va, struct rwlock **lockp) 4880 { 4881 vm_pindex_t ptepindex; 4882 pd_entry_t *pd, PG_V; 4883 vm_page_t m; 4884 4885 PG_V = pmap_valid_bit(pmap); 4886 4887 /* 4888 * Calculate pagetable page index 4889 */ 4890 ptepindex = pmap_pde_pindex(va); 4891 retry: 4892 /* 4893 * Get the page directory entry 4894 */ 4895 pd = pmap_pde(pmap, va); 4896 4897 /* 4898 * This supports switching from a 2MB page to a 4899 * normal 4K page. 4900 */ 4901 if (pd != NULL && (*pd & (PG_PS | PG_V)) == (PG_PS | PG_V)) { 4902 if (!pmap_demote_pde_locked(pmap, pd, va, lockp)) { 4903 /* 4904 * Invalidation of the 2MB page mapping may have caused 4905 * the deallocation of the underlying PD page. 4906 */ 4907 pd = NULL; 4908 } 4909 } 4910 4911 /* 4912 * If the page table page is mapped, we just increment the 4913 * hold count, and activate it. 4914 */ 4915 if (pd != NULL && (*pd & PG_V) != 0) { 4916 m = PHYS_TO_VM_PAGE(*pd & PG_FRAME); 4917 m->ref_count++; 4918 } else { 4919 /* 4920 * Here if the pte page isn't mapped, or if it has been 4921 * deallocated. 4922 */ 4923 m = pmap_allocpte_alloc(pmap, ptepindex, lockp, va); 4924 if (m == NULL && lockp != NULL) 4925 goto retry; 4926 } 4927 return (m); 4928 } 4929 4930 /*************************************************** 4931 * Pmap allocation/deallocation routines. 4932 ***************************************************/ 4933 4934 /* 4935 * Release any resources held by the given physical map. 4936 * Called when a pmap initialized by pmap_pinit is being released. 4937 * Should only be called if the map contains no valid mappings. 
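 * At this point only the shared kernel entries and the
 * self-referential entry installed at pinit time remain in the
 * top-level page; they are cleared below before the page is freed.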
4938 */ 4939 void 4940 pmap_release(pmap_t pmap) 4941 { 4942 vm_page_t m; 4943 int i; 4944 4945 KASSERT(vm_radix_is_empty(&pmap->pm_root), 4946 ("pmap_release: pmap %p has reserved page table page(s)", 4947 pmap)); 4948 KASSERT(CPU_EMPTY(&pmap->pm_active), 4949 ("releasing active pmap %p", pmap)); 4950 4951 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pmap->pm_pmltop)); 4952 4953 if (pmap_is_la57(pmap)) { 4954 pmap->pm_pmltop[pmap_pml5e_index(UPT_MAX_ADDRESS)] = 0; 4955 pmap->pm_pmltop[PML5PML5I] = 0; 4956 } else { 4957 for (i = 0; i < NKPML4E; i++) /* KVA */ 4958 pmap->pm_pmltop[KPML4BASE + i] = 0; 4959 #ifdef KASAN 4960 for (i = 0; i < NKASANPML4E; i++) /* KASAN shadow map */ 4961 pmap->pm_pmltop[KASANPML4I + i] = 0; 4962 #endif 4963 #ifdef KMSAN 4964 for (i = 0; i < NKMSANSHADPML4E; i++) /* KMSAN shadow map */ 4965 pmap->pm_pmltop[KMSANSHADPML4I + i] = 0; 4966 for (i = 0; i < NKMSANORIGPML4E; i++) /* KMSAN shadow map */ 4967 pmap->pm_pmltop[KMSANORIGPML4I + i] = 0; 4968 #endif 4969 for (i = 0; i < ndmpdpphys; i++)/* Direct Map */ 4970 pmap->pm_pmltop[DMPML4I + i] = 0; 4971 pmap->pm_pmltop[PML4PML4I] = 0; /* Recursive Mapping */ 4972 for (i = 0; i < lm_ents; i++) /* Large Map */ 4973 pmap->pm_pmltop[LMSPML4I + i] = 0; 4974 } 4975 4976 pmap_free_pt_page(NULL, m, true); 4977 pmap_pt_page_count_pinit(pmap, -1); 4978 4979 if (pmap->pm_pmltopu != NULL) { 4980 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pmap-> 4981 pm_pmltopu)); 4982 pmap_free_pt_page(NULL, m, false); 4983 pmap_pt_page_count_pinit(pmap, -1); 4984 } 4985 if (pmap->pm_type == PT_X86 && 4986 (cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0) 4987 rangeset_fini(&pmap->pm_pkru); 4988 4989 KASSERT(pmap->pm_stats.resident_count == 0, 4990 ("pmap_release: pmap %p resident count %ld != 0", 4991 pmap, pmap->pm_stats.resident_count)); 4992 } 4993 4994 static int 4995 kvm_size(SYSCTL_HANDLER_ARGS) 4996 { 4997 unsigned long ksize = VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS; 4998 4999 return sysctl_handle_long(oidp, &ksize, 0, req); 5000 } 5001 SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG | CTLFLAG_RD | CTLFLAG_MPSAFE, 5002 0, 0, kvm_size, "LU", 5003 "Size of KVM"); 5004 5005 static int 5006 kvm_free(SYSCTL_HANDLER_ARGS) 5007 { 5008 unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end; 5009 5010 return sysctl_handle_long(oidp, &kfree, 0, req); 5011 } 5012 SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG | CTLFLAG_RD | CTLFLAG_MPSAFE, 5013 0, 0, kvm_free, "LU", 5014 "Amount of KVM free"); 5015 5016 #ifdef KMSAN 5017 static void 5018 pmap_kmsan_shadow_map_page_array(vm_paddr_t pdppa, vm_size_t size) 5019 { 5020 pdp_entry_t *pdpe; 5021 pd_entry_t *pde; 5022 pt_entry_t *pte; 5023 vm_paddr_t dummypa, dummypd, dummypt; 5024 int i, npde, npdpg; 5025 5026 npdpg = howmany(size, NBPDP); 5027 npde = size / NBPDR; 5028 5029 dummypa = vm_phys_early_alloc(-1, PAGE_SIZE); 5030 pagezero((void *)PHYS_TO_DMAP(dummypa)); 5031 5032 dummypt = vm_phys_early_alloc(-1, PAGE_SIZE); 5033 pagezero((void *)PHYS_TO_DMAP(dummypt)); 5034 dummypd = vm_phys_early_alloc(-1, PAGE_SIZE * npdpg); 5035 for (i = 0; i < npdpg; i++) 5036 pagezero((void *)PHYS_TO_DMAP(dummypd + ptoa(i))); 5037 5038 pte = (pt_entry_t *)PHYS_TO_DMAP(dummypt); 5039 for (i = 0; i < NPTEPG; i++) 5040 pte[i] = (pt_entry_t)(dummypa | X86_PG_V | X86_PG_RW | 5041 X86_PG_A | X86_PG_M | pg_nx); 5042 5043 pde = (pd_entry_t *)PHYS_TO_DMAP(dummypd); 5044 for (i = 0; i < npde; i++) 5045 pde[i] = (pd_entry_t)(dummypt | X86_PG_V | X86_PG_RW | pg_nx); 5046 5047 pdpe = (pdp_entry_t 
*)PHYS_TO_DMAP(pdppa); 5048 for (i = 0; i < npdpg; i++) 5049 pdpe[i] = (pdp_entry_t)(dummypd + ptoa(i) | X86_PG_V | 5050 X86_PG_RW | pg_nx); 5051 } 5052 5053 static void 5054 pmap_kmsan_page_array_startup(vm_offset_t start, vm_offset_t end) 5055 { 5056 vm_size_t size; 5057 5058 KASSERT(start % NBPDP == 0, ("unaligned page array start address")); 5059 5060 /* 5061 * The end of the page array's KVA region is 2MB aligned, see 5062 * kmem_init(). 5063 */ 5064 size = round_2mpage(end) - start; 5065 pmap_kmsan_shadow_map_page_array(KMSANSHADPDPphys, size); 5066 pmap_kmsan_shadow_map_page_array(KMSANORIGPDPphys, size); 5067 } 5068 #endif 5069 5070 /* 5071 * Allocate physical memory for the vm_page array and map it into KVA, 5072 * attempting to back the vm_pages with domain-local memory. 5073 */ 5074 void 5075 pmap_page_array_startup(long pages) 5076 { 5077 pdp_entry_t *pdpe; 5078 pd_entry_t *pde, newpdir; 5079 vm_offset_t va, start, end; 5080 vm_paddr_t pa; 5081 long pfn; 5082 int domain, i; 5083 5084 vm_page_array_size = pages; 5085 5086 start = VM_MIN_KERNEL_ADDRESS; 5087 end = start + pages * sizeof(struct vm_page); 5088 for (va = start; va < end; va += NBPDR) { 5089 pfn = first_page + (va - start) / sizeof(struct vm_page); 5090 domain = vm_phys_domain(ptoa(pfn)); 5091 pdpe = pmap_pdpe(kernel_pmap, va); 5092 if ((*pdpe & X86_PG_V) == 0) { 5093 pa = vm_phys_early_alloc(domain, PAGE_SIZE); 5094 dump_add_page(pa); 5095 pagezero((void *)PHYS_TO_DMAP(pa)); 5096 *pdpe = (pdp_entry_t)(pa | X86_PG_V | X86_PG_RW | 5097 X86_PG_A | X86_PG_M); 5098 } 5099 pde = pmap_pdpe_to_pde(pdpe, va); 5100 if ((*pde & X86_PG_V) != 0) 5101 panic("Unexpected pde"); 5102 pa = vm_phys_early_alloc(domain, NBPDR); 5103 for (i = 0; i < NPDEPG; i++) 5104 dump_add_page(pa + i * PAGE_SIZE); 5105 newpdir = (pd_entry_t)(pa | X86_PG_V | X86_PG_RW | X86_PG_A | 5106 X86_PG_M | PG_PS | pg_g | pg_nx); 5107 pde_store(pde, newpdir); 5108 } 5109 vm_page_array = (vm_page_t)start; 5110 5111 #ifdef KMSAN 5112 pmap_kmsan_page_array_startup(start, end); 5113 #endif 5114 } 5115 5116 /* 5117 * grow the number of kernel page table entries, if needed 5118 */ 5119 void 5120 pmap_growkernel(vm_offset_t addr) 5121 { 5122 vm_paddr_t paddr; 5123 vm_page_t nkpg; 5124 pd_entry_t *pde, newpdir; 5125 pdp_entry_t *pdpe; 5126 vm_offset_t end; 5127 5128 TSENTER(); 5129 mtx_assert(&kernel_map->system_mtx, MA_OWNED); 5130 5131 /* 5132 * The kernel map covers two distinct regions of KVA: that used 5133 * for dynamic kernel memory allocations, and the uppermost 2GB 5134 * of the virtual address space. The latter is used to map the 5135 * kernel and loadable kernel modules. This scheme enables the 5136 * use of a special code generation model for kernel code which 5137 * takes advantage of compact addressing modes in machine code. 5138 * 5139 * Both regions grow upwards; to avoid wasting memory, the gap 5140 * in between is unmapped. If "addr" is above "KERNBASE", the 5141 * kernel's region is grown, otherwise the kmem region is grown. 5142 * 5143 * The correctness of this action is based on the following 5144 * argument: vm_map_insert() allocates contiguous ranges of the 5145 * kernel virtual address space. It calls this function if a range 5146 * ends after "kernel_vm_end". If the kernel is mapped between 5147 * "kernel_vm_end" and "addr", then the range cannot begin at 5148 * "kernel_vm_end". In fact, its beginning address cannot be less 5149 * than the kernel. 
Thus, there is no immediate need to allocate 5150 * any new kernel page table pages between "kernel_vm_end" and 5151 * "KERNBASE". 5152 */ 5153 if (KERNBASE < addr) { 5154 end = KERNBASE + nkpt * NBPDR; 5155 if (end == 0) { 5156 TSEXIT(); 5157 return; 5158 } 5159 } else { 5160 end = kernel_vm_end; 5161 } 5162 5163 addr = roundup2(addr, NBPDR); 5164 if (addr - 1 >= vm_map_max(kernel_map)) 5165 addr = vm_map_max(kernel_map); 5166 if (addr <= end) { 5167 /* 5168 * The grown region is already mapped, so there is 5169 * nothing to do. 5170 */ 5171 TSEXIT(); 5172 return; 5173 } 5174 5175 kasan_shadow_map(end, addr - end); 5176 kmsan_shadow_map(end, addr - end); 5177 while (end < addr) { 5178 pdpe = pmap_pdpe(kernel_pmap, end); 5179 if ((*pdpe & X86_PG_V) == 0) { 5180 nkpg = pmap_alloc_pt_page(kernel_pmap, 5181 pmap_pdpe_pindex(end), VM_ALLOC_INTERRUPT | 5182 VM_ALLOC_NOFREE | VM_ALLOC_WIRED | VM_ALLOC_ZERO); 5183 if (nkpg == NULL) 5184 panic("pmap_growkernel: no memory to grow kernel"); 5185 paddr = VM_PAGE_TO_PHYS(nkpg); 5186 *pdpe = (pdp_entry_t)(paddr | X86_PG_V | X86_PG_RW | 5187 X86_PG_A | X86_PG_M); 5188 continue; /* try again */ 5189 } 5190 pde = pmap_pdpe_to_pde(pdpe, end); 5191 if ((*pde & X86_PG_V) != 0) { 5192 end = (end + NBPDR) & ~PDRMASK; 5193 if (end - 1 >= vm_map_max(kernel_map)) { 5194 end = vm_map_max(kernel_map); 5195 break; 5196 } 5197 continue; 5198 } 5199 5200 nkpg = pmap_alloc_pt_page(kernel_pmap, pmap_pde_pindex(end), 5201 VM_ALLOC_INTERRUPT | VM_ALLOC_NOFREE | VM_ALLOC_WIRED | 5202 VM_ALLOC_ZERO); 5203 if (nkpg == NULL) 5204 panic("pmap_growkernel: no memory to grow kernel"); 5205 paddr = VM_PAGE_TO_PHYS(nkpg); 5206 newpdir = paddr | X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M; 5207 pde_store(pde, newpdir); 5208 5209 end = (end + NBPDR) & ~PDRMASK; 5210 if (end - 1 >= vm_map_max(kernel_map)) { 5211 end = vm_map_max(kernel_map); 5212 break; 5213 } 5214 } 5215 5216 if (end <= KERNBASE) 5217 kernel_vm_end = end; 5218 else 5219 nkpt = howmany(end - KERNBASE, NBPDR); 5220 TSEXIT(); 5221 } 5222 5223 /*************************************************** 5224 * page management routines. 5225 ***************************************************/ 5226 5227 static const uint64_t pc_freemask[_NPCM] = { 5228 [0 ... 
_NPCM - 2] = PC_FREEN, 5229 [_NPCM - 1] = PC_FREEL 5230 }; 5231 5232 #ifdef PV_STATS 5233 5234 static COUNTER_U64_DEFINE_EARLY(pc_chunk_count); 5235 SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, 5236 &pc_chunk_count, "Current number of pv entry chunks"); 5237 5238 static COUNTER_U64_DEFINE_EARLY(pc_chunk_allocs); 5239 SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, 5240 &pc_chunk_allocs, "Total number of pv entry chunks allocated"); 5241 5242 static COUNTER_U64_DEFINE_EARLY(pc_chunk_frees); 5243 SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, 5244 &pc_chunk_frees, "Total number of pv entry chunks freed"); 5245 5246 static COUNTER_U64_DEFINE_EARLY(pc_chunk_tryfail); 5247 SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, 5248 &pc_chunk_tryfail, 5249 "Number of failed attempts to get a pv entry chunk page"); 5250 5251 static COUNTER_U64_DEFINE_EARLY(pv_entry_frees); 5252 SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, 5253 &pv_entry_frees, "Total number of pv entries freed"); 5254 5255 static COUNTER_U64_DEFINE_EARLY(pv_entry_allocs); 5256 SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, 5257 &pv_entry_allocs, "Total number of pv entries allocated"); 5258 5259 static COUNTER_U64_DEFINE_EARLY(pv_entry_count); 5260 SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, 5261 &pv_entry_count, "Current number of pv entries"); 5262 5263 static COUNTER_U64_DEFINE_EARLY(pv_entry_spare); 5264 SYSCTL_COUNTER_U64(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, 5265 &pv_entry_spare, "Current number of spare pv entries"); 5266 #endif 5267 5268 static void 5269 reclaim_pv_chunk_leave_pmap(pmap_t pmap, pmap_t locked_pmap, bool start_di) 5270 { 5271 5272 if (pmap == NULL) 5273 return; 5274 pmap_invalidate_all(pmap); 5275 if (pmap != locked_pmap) 5276 PMAP_UNLOCK(pmap); 5277 if (start_di) 5278 pmap_delayed_invl_finish(); 5279 } 5280 5281 /* 5282 * We are in a serious low memory condition. Resort to 5283 * drastic measures to free some pages so we can allocate 5284 * another pv entry chunk. 5285 * 5286 * Returns NULL if PV entries were reclaimed from the specified pmap. 5287 * 5288 * We do not, however, unmap 2mpages because subsequent accesses will 5289 * allocate per-page pv entries until repromotion occurs, thereby 5290 * exacerbating the shortage of free pv entries.
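 * On success, the returned page may be reused by the caller as a
 * new pv chunk page; it is either the page backing a completely
 * freed chunk or a page table page freed as a side effect of the
 * reclamation.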
5291 */ 5292 static vm_page_t 5293 reclaim_pv_chunk_domain(pmap_t locked_pmap, struct rwlock **lockp, int domain) 5294 { 5295 struct pv_chunks_list *pvc; 5296 struct pv_chunk *pc, *pc_marker, *pc_marker_end; 5297 struct pv_chunk_header pc_marker_b, pc_marker_end_b; 5298 struct md_page *pvh; 5299 pd_entry_t *pde; 5300 pmap_t next_pmap, pmap; 5301 pt_entry_t *pte, tpte; 5302 pt_entry_t PG_G, PG_A, PG_M, PG_RW; 5303 pv_entry_t pv; 5304 vm_offset_t va; 5305 vm_page_t m, m_pc; 5306 struct spglist free; 5307 uint64_t inuse; 5308 int bit, field, freed; 5309 bool start_di, restart; 5310 5311 PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED); 5312 KASSERT(lockp != NULL, ("reclaim_pv_chunk: lockp is NULL")); 5313 pmap = NULL; 5314 m_pc = NULL; 5315 PG_G = PG_A = PG_M = PG_RW = 0; 5316 SLIST_INIT(&free); 5317 bzero(&pc_marker_b, sizeof(pc_marker_b)); 5318 bzero(&pc_marker_end_b, sizeof(pc_marker_end_b)); 5319 pc_marker = (struct pv_chunk *)&pc_marker_b; 5320 pc_marker_end = (struct pv_chunk *)&pc_marker_end_b; 5321 5322 /* 5323 * A delayed invalidation block should already be active if 5324 * pmap_advise() or pmap_remove() called this function by way 5325 * of pmap_demote_pde_locked(). 5326 */ 5327 start_di = pmap_not_in_di(); 5328 5329 pvc = &pv_chunks[domain]; 5330 mtx_lock(&pvc->pvc_lock); 5331 pvc->active_reclaims++; 5332 TAILQ_INSERT_HEAD(&pvc->pvc_list, pc_marker, pc_lru); 5333 TAILQ_INSERT_TAIL(&pvc->pvc_list, pc_marker_end, pc_lru); 5334 while ((pc = TAILQ_NEXT(pc_marker, pc_lru)) != pc_marker_end && 5335 SLIST_EMPTY(&free)) { 5336 next_pmap = pc->pc_pmap; 5337 if (next_pmap == NULL) { 5338 /* 5339 * The next chunk is a marker. However, it is 5340 * not our marker, so active_reclaims must be 5341 * > 1. Consequently, the next_chunk code 5342 * will not rotate the pv_chunks list. 5343 */ 5344 goto next_chunk; 5345 } 5346 mtx_unlock(&pvc->pvc_lock); 5347 5348 /* 5349 * A pv_chunk can only be removed from the pc_lru list 5350 * when both pc_chunks_mutex is owned and the 5351 * corresponding pmap is locked. 5352 */ 5353 if (pmap != next_pmap) { 5354 restart = false; 5355 reclaim_pv_chunk_leave_pmap(pmap, locked_pmap, 5356 start_di); 5357 pmap = next_pmap; 5358 /* Avoid deadlock and lock recursion. */ 5359 if (pmap > locked_pmap) { 5360 RELEASE_PV_LIST_LOCK(lockp); 5361 PMAP_LOCK(pmap); 5362 if (start_di) 5363 pmap_delayed_invl_start(); 5364 mtx_lock(&pvc->pvc_lock); 5365 restart = true; 5366 } else if (pmap != locked_pmap) { 5367 if (PMAP_TRYLOCK(pmap)) { 5368 if (start_di) 5369 pmap_delayed_invl_start(); 5370 mtx_lock(&pvc->pvc_lock); 5371 restart = true; 5372 } else { 5373 pmap = NULL; /* pmap is not locked */ 5374 mtx_lock(&pvc->pvc_lock); 5375 pc = TAILQ_NEXT(pc_marker, pc_lru); 5376 if (pc == NULL || 5377 pc->pc_pmap != next_pmap) 5378 continue; 5379 goto next_chunk; 5380 } 5381 } else if (start_di) 5382 pmap_delayed_invl_start(); 5383 PG_G = pmap_global_bit(pmap); 5384 PG_A = pmap_accessed_bit(pmap); 5385 PG_M = pmap_modified_bit(pmap); 5386 PG_RW = pmap_rw_bit(pmap); 5387 if (restart) 5388 continue; 5389 } 5390 5391 /* 5392 * Destroy every non-wired, 4 KB page mapping in the chunk. 
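 * Wired mappings and 2MB mappings are skipped.  Dirty and accessed
 * bits from each destroyed PTE are transferred to the vm_page
 * before its pv entry is released.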
5393 */ 5394 freed = 0; 5395 for (field = 0; field < _NPCM; field++) { 5396 for (inuse = ~pc->pc_map[field] & pc_freemask[field]; 5397 inuse != 0; inuse &= ~(1UL << bit)) { 5398 bit = bsfq(inuse); 5399 pv = &pc->pc_pventry[field * 64 + bit]; 5400 va = pv->pv_va; 5401 pde = pmap_pde(pmap, va); 5402 if ((*pde & PG_PS) != 0) 5403 continue; 5404 pte = pmap_pde_to_pte(pde, va); 5405 if ((*pte & PG_W) != 0) 5406 continue; 5407 tpte = pte_load_clear(pte); 5408 if ((tpte & PG_G) != 0) 5409 pmap_invalidate_page(pmap, va); 5410 m = PHYS_TO_VM_PAGE(tpte & PG_FRAME); 5411 if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) 5412 vm_page_dirty(m); 5413 if ((tpte & PG_A) != 0) 5414 vm_page_aflag_set(m, PGA_REFERENCED); 5415 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); 5416 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 5417 m->md.pv_gen++; 5418 if (TAILQ_EMPTY(&m->md.pv_list) && 5419 (m->flags & PG_FICTITIOUS) == 0) { 5420 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 5421 if (TAILQ_EMPTY(&pvh->pv_list)) { 5422 vm_page_aflag_clear(m, 5423 PGA_WRITEABLE); 5424 } 5425 } 5426 pmap_delayed_invl_page(m); 5427 pc->pc_map[field] |= 1UL << bit; 5428 pmap_unuse_pt(pmap, va, *pde, &free); 5429 freed++; 5430 } 5431 } 5432 if (freed == 0) { 5433 mtx_lock(&pvc->pvc_lock); 5434 goto next_chunk; 5435 } 5436 /* Every freed mapping is for a 4 KB page. */ 5437 pmap_resident_count_adj(pmap, -freed); 5438 PV_STAT(counter_u64_add(pv_entry_frees, freed)); 5439 PV_STAT(counter_u64_add(pv_entry_spare, freed)); 5440 PV_STAT(counter_u64_add(pv_entry_count, -freed)); 5441 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 5442 if (pc_is_free(pc)) { 5443 PV_STAT(counter_u64_add(pv_entry_spare, -_NPCPV)); 5444 PV_STAT(counter_u64_add(pc_chunk_count, -1)); 5445 PV_STAT(counter_u64_add(pc_chunk_frees, 1)); 5446 /* Entire chunk is free; return it. */ 5447 m_pc = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc)); 5448 dump_drop_page(m_pc->phys_addr); 5449 mtx_lock(&pvc->pvc_lock); 5450 TAILQ_REMOVE(&pvc->pvc_list, pc, pc_lru); 5451 break; 5452 } 5453 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); 5454 mtx_lock(&pvc->pvc_lock); 5455 /* One freed pv entry in locked_pmap is sufficient. */ 5456 if (pmap == locked_pmap) 5457 break; 5458 next_chunk: 5459 TAILQ_REMOVE(&pvc->pvc_list, pc_marker, pc_lru); 5460 TAILQ_INSERT_AFTER(&pvc->pvc_list, pc, pc_marker, pc_lru); 5461 if (pvc->active_reclaims == 1 && pmap != NULL) { 5462 /* 5463 * Rotate the pv chunks list so that we do not 5464 * scan the same pv chunks that could not be 5465 * freed (because they contained a wired 5466 * and/or superpage mapping) on every 5467 * invocation of reclaim_pv_chunk(). 5468 */ 5469 while ((pc = TAILQ_FIRST(&pvc->pvc_list)) != pc_marker) { 5470 MPASS(pc->pc_pmap != NULL); 5471 TAILQ_REMOVE(&pvc->pvc_list, pc, pc_lru); 5472 TAILQ_INSERT_TAIL(&pvc->pvc_list, pc, pc_lru); 5473 } 5474 } 5475 } 5476 TAILQ_REMOVE(&pvc->pvc_list, pc_marker, pc_lru); 5477 TAILQ_REMOVE(&pvc->pvc_list, pc_marker_end, pc_lru); 5478 pvc->active_reclaims--; 5479 mtx_unlock(&pvc->pvc_lock); 5480 reclaim_pv_chunk_leave_pmap(pmap, locked_pmap, start_di); 5481 if (m_pc == NULL && !SLIST_EMPTY(&free)) { 5482 m_pc = SLIST_FIRST(&free); 5483 SLIST_REMOVE_HEAD(&free, plinks.s.ss); 5484 /* Recycle a freed page table page. 
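Setting its reference count to 1 lets the caller treat it like a freshly allocated, wired page.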
*/ 5485 m_pc->ref_count = 1; 5486 } 5487 vm_page_free_pages_toq(&free, true); 5488 return (m_pc); 5489 } 5490 5491 static vm_page_t 5492 reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp) 5493 { 5494 vm_page_t m; 5495 int i, domain; 5496 5497 domain = PCPU_GET(domain); 5498 for (i = 0; i < vm_ndomains; i++) { 5499 m = reclaim_pv_chunk_domain(locked_pmap, lockp, domain); 5500 if (m != NULL) 5501 break; 5502 domain = (domain + 1) % vm_ndomains; 5503 } 5504 5505 return (m); 5506 } 5507 5508 /* 5509 * free the pv_entry back to the free list 5510 */ 5511 static void 5512 free_pv_entry(pmap_t pmap, pv_entry_t pv) 5513 { 5514 struct pv_chunk *pc; 5515 int idx, field, bit; 5516 5517 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 5518 PV_STAT(counter_u64_add(pv_entry_frees, 1)); 5519 PV_STAT(counter_u64_add(pv_entry_spare, 1)); 5520 PV_STAT(counter_u64_add(pv_entry_count, -1)); 5521 pc = pv_to_chunk(pv); 5522 idx = pv - &pc->pc_pventry[0]; 5523 field = idx / 64; 5524 bit = idx % 64; 5525 pc->pc_map[field] |= 1ul << bit; 5526 if (!pc_is_free(pc)) { 5527 /* 98% of the time, pc is already at the head of the list. */ 5528 if (__predict_false(pc != TAILQ_FIRST(&pmap->pm_pvchunk))) { 5529 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 5530 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); 5531 } 5532 return; 5533 } 5534 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 5535 free_pv_chunk(pc); 5536 } 5537 5538 static void 5539 free_pv_chunk_dequeued(struct pv_chunk *pc) 5540 { 5541 vm_page_t m; 5542 5543 PV_STAT(counter_u64_add(pv_entry_spare, -_NPCPV)); 5544 PV_STAT(counter_u64_add(pc_chunk_count, -1)); 5545 PV_STAT(counter_u64_add(pc_chunk_frees, 1)); 5546 counter_u64_add(pv_page_count, -1); 5547 /* entire chunk is free, return it */ 5548 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc)); 5549 dump_drop_page(m->phys_addr); 5550 vm_page_unwire_noq(m); 5551 vm_page_free(m); 5552 } 5553 5554 static void 5555 free_pv_chunk(struct pv_chunk *pc) 5556 { 5557 struct pv_chunks_list *pvc; 5558 5559 pvc = &pv_chunks[pc_to_domain(pc)]; 5560 mtx_lock(&pvc->pvc_lock); 5561 TAILQ_REMOVE(&pvc->pvc_list, pc, pc_lru); 5562 mtx_unlock(&pvc->pvc_lock); 5563 free_pv_chunk_dequeued(pc); 5564 } 5565 5566 static void 5567 free_pv_chunk_batch(struct pv_chunklist *batch) 5568 { 5569 struct pv_chunks_list *pvc; 5570 struct pv_chunk *pc, *npc; 5571 int i; 5572 5573 for (i = 0; i < vm_ndomains; i++) { 5574 if (TAILQ_EMPTY(&batch[i])) 5575 continue; 5576 pvc = &pv_chunks[i]; 5577 mtx_lock(&pvc->pvc_lock); 5578 TAILQ_FOREACH(pc, &batch[i], pc_list) { 5579 TAILQ_REMOVE(&pvc->pvc_list, pc, pc_lru); 5580 } 5581 mtx_unlock(&pvc->pvc_lock); 5582 } 5583 5584 for (i = 0; i < vm_ndomains; i++) { 5585 TAILQ_FOREACH_SAFE(pc, &batch[i], pc_list, npc) { 5586 free_pv_chunk_dequeued(pc); 5587 } 5588 } 5589 } 5590 5591 /* 5592 * Returns a new PV entry, allocating a new PV chunk from the system when 5593 * needed. If this PV chunk allocation fails and a PV list lock pointer was 5594 * given, a PV chunk is reclaimed from an arbitrary pmap. Otherwise, NULL is 5595 * returned. 5596 * 5597 * The given PV list lock may be released. 
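 *
 * A minimal caller sketch (hypothetical caller, mirroring the use in
 * pmap_enter() below), assuming the pmap lock is held and "lock" is
 * the caller's PV list lock pointer:
 *
 *	pv = get_pv_entry(pmap, &lock);
 *	pv->pv_va = va;
 *	CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, pa);
 *	TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
 *	m->md.pv_gen++;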
5598 */ 5599 static pv_entry_t 5600 get_pv_entry(pmap_t pmap, struct rwlock **lockp) 5601 { 5602 struct pv_chunks_list *pvc; 5603 int bit, field; 5604 pv_entry_t pv; 5605 struct pv_chunk *pc; 5606 vm_page_t m; 5607 5608 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 5609 PV_STAT(counter_u64_add(pv_entry_allocs, 1)); 5610 retry: 5611 pc = TAILQ_FIRST(&pmap->pm_pvchunk); 5612 if (pc != NULL) { 5613 for (field = 0; field < _NPCM; field++) { 5614 if (pc->pc_map[field]) { 5615 bit = bsfq(pc->pc_map[field]); 5616 break; 5617 } 5618 } 5619 if (field < _NPCM) { 5620 pv = &pc->pc_pventry[field * 64 + bit]; 5621 pc->pc_map[field] &= ~(1ul << bit); 5622 /* If this was the last item, move it to tail */ 5623 if (pc_is_full(pc)) { 5624 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 5625 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, 5626 pc_list); 5627 } 5628 PV_STAT(counter_u64_add(pv_entry_count, 1)); 5629 PV_STAT(counter_u64_add(pv_entry_spare, -1)); 5630 return (pv); 5631 } 5632 } 5633 /* No free items, allocate another chunk */ 5634 m = vm_page_alloc_noobj(VM_ALLOC_WIRED); 5635 if (m == NULL) { 5636 if (lockp == NULL) { 5637 PV_STAT(counter_u64_add(pc_chunk_tryfail, 1)); 5638 return (NULL); 5639 } 5640 m = reclaim_pv_chunk(pmap, lockp); 5641 if (m == NULL) 5642 goto retry; 5643 } else 5644 counter_u64_add(pv_page_count, 1); 5645 PV_STAT(counter_u64_add(pc_chunk_count, 1)); 5646 PV_STAT(counter_u64_add(pc_chunk_allocs, 1)); 5647 dump_add_page(m->phys_addr); 5648 pc = (void *)PHYS_TO_DMAP(m->phys_addr); 5649 pc->pc_pmap = pmap; 5650 pc->pc_map[0] = PC_FREEN & ~1ul; /* preallocated bit 0 */ 5651 pc->pc_map[1] = PC_FREEN; 5652 pc->pc_map[2] = PC_FREEL; 5653 pvc = &pv_chunks[vm_page_domain(m)]; 5654 mtx_lock(&pvc->pvc_lock); 5655 TAILQ_INSERT_TAIL(&pvc->pvc_list, pc, pc_lru); 5656 mtx_unlock(&pvc->pvc_lock); 5657 pv = &pc->pc_pventry[0]; 5658 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); 5659 PV_STAT(counter_u64_add(pv_entry_count, 1)); 5660 PV_STAT(counter_u64_add(pv_entry_spare, _NPCPV - 1)); 5661 return (pv); 5662 } 5663 5664 /* 5665 * Returns the number of one bits within the given PV chunk map. 5666 * 5667 * The erratas for Intel processors state that "POPCNT Instruction May 5668 * Take Longer to Execute Than Expected". It is believed that the 5669 * issue is the spurious dependency on the destination register. 5670 * Provide a hint to the register rename logic that the destination 5671 * value is overwritten, by clearing it, as suggested in the 5672 * optimization manual. It should be cheap for unaffected processors 5673 * as well. 5674 * 5675 * Reference numbers for erratas are 5676 * 4th Gen Core: HSD146 5677 * 5th Gen Core: BDM85 5678 * 6th Gen Core: SKL029 5679 */ 5680 static int 5681 popcnt_pc_map_pq(uint64_t *map) 5682 { 5683 u_long result, tmp; 5684 5685 __asm __volatile("xorl %k0,%k0;popcntq %2,%0;" 5686 "xorl %k1,%k1;popcntq %3,%1;addl %k1,%k0;" 5687 "xorl %k1,%k1;popcntq %4,%1;addl %k1,%k0" 5688 : "=&r" (result), "=&r" (tmp) 5689 : "m" (map[0]), "m" (map[1]), "m" (map[2])); 5690 return (result); 5691 } 5692 5693 /* 5694 * Ensure that the number of spare PV entries in the specified pmap meets or 5695 * exceeds the given count, "needed". 5696 * 5697 * The given PV list lock may be released. 
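 *
 * A worked example (hypothetical numbers): pmap_demote_pde_locked()
 * calls this function with needed = NPTEPG - 1 = 511.  If the pmap's
 * chunks currently hold 100 spare entries, the allocation loop below
 * adds whole chunks of _NPCPV (168 on amd64) entries each, so three
 * new chunks are allocated: 100 + 3 * 168 = 604 >= 511.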
5698 */ 5699 static void 5700 reserve_pv_entries(pmap_t pmap, int needed, struct rwlock **lockp) 5701 { 5702 struct pv_chunks_list *pvc; 5703 struct pch new_tail[PMAP_MEMDOM]; 5704 struct pv_chunk *pc; 5705 vm_page_t m; 5706 int avail, free, i; 5707 bool reclaimed; 5708 5709 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 5710 KASSERT(lockp != NULL, ("reserve_pv_entries: lockp is NULL")); 5711 5712 /* 5713 * Newly allocated PV chunks must be stored in a private list until 5714 * the required number of PV chunks have been allocated. Otherwise, 5715 * reclaim_pv_chunk() could recycle one of these chunks. In 5716 * contrast, these chunks must be added to the pmap upon allocation. 5717 */ 5718 for (i = 0; i < PMAP_MEMDOM; i++) 5719 TAILQ_INIT(&new_tail[i]); 5720 retry: 5721 avail = 0; 5722 TAILQ_FOREACH(pc, &pmap->pm_pvchunk, pc_list) { 5723 #ifndef __POPCNT__ 5724 if ((cpu_feature2 & CPUID2_POPCNT) == 0) 5725 bit_count((bitstr_t *)pc->pc_map, 0, 5726 sizeof(pc->pc_map) * NBBY, &free); 5727 else 5728 #endif 5729 free = popcnt_pc_map_pq(pc->pc_map); 5730 if (free == 0) 5731 break; 5732 avail += free; 5733 if (avail >= needed) 5734 break; 5735 } 5736 for (reclaimed = false; avail < needed; avail += _NPCPV) { 5737 m = vm_page_alloc_noobj(VM_ALLOC_WIRED); 5738 if (m == NULL) { 5739 m = reclaim_pv_chunk(pmap, lockp); 5740 if (m == NULL) 5741 goto retry; 5742 reclaimed = true; 5743 } else 5744 counter_u64_add(pv_page_count, 1); 5745 PV_STAT(counter_u64_add(pc_chunk_count, 1)); 5746 PV_STAT(counter_u64_add(pc_chunk_allocs, 1)); 5747 dump_add_page(m->phys_addr); 5748 pc = (void *)PHYS_TO_DMAP(m->phys_addr); 5749 pc->pc_pmap = pmap; 5750 pc->pc_map[0] = PC_FREEN; 5751 pc->pc_map[1] = PC_FREEN; 5752 pc->pc_map[2] = PC_FREEL; 5753 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); 5754 TAILQ_INSERT_TAIL(&new_tail[vm_page_domain(m)], pc, pc_lru); 5755 PV_STAT(counter_u64_add(pv_entry_spare, _NPCPV)); 5756 5757 /* 5758 * The reclaim might have freed a chunk from the current pmap. 5759 * If that chunk contained available entries, we need to 5760 * re-count the number of available entries. 5761 */ 5762 if (reclaimed) 5763 goto retry; 5764 } 5765 for (i = 0; i < vm_ndomains; i++) { 5766 if (TAILQ_EMPTY(&new_tail[i])) 5767 continue; 5768 pvc = &pv_chunks[i]; 5769 mtx_lock(&pvc->pvc_lock); 5770 TAILQ_CONCAT(&pvc->pvc_list, &new_tail[i], pc_lru); 5771 mtx_unlock(&pvc->pvc_lock); 5772 } 5773 } 5774 5775 /* 5776 * First find and then remove the pv entry for the specified pmap and virtual 5777 * address from the specified pv list. Returns the pv entry if found and NULL 5778 * otherwise. This operation can be performed on pv lists for either 4KB or 5779 * 2MB page mappings. 5780 */ 5781 static __inline pv_entry_t 5782 pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va) 5783 { 5784 pv_entry_t pv; 5785 5786 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 5787 if (pmap == PV_PMAP(pv) && va == pv->pv_va) { 5788 TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); 5789 pvh->pv_gen++; 5790 break; 5791 } 5792 } 5793 return (pv); 5794 } 5795 5796 /* 5797 * After demotion from a 2MB page mapping to 512 4KB page mappings, 5798 * destroy the pv entry for the 2MB page mapping and reinstantiate the pv 5799 * entries for each of the 4KB page mappings. 
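 *
 * In practice the 2MB pv entry is not freed and reallocated; it is
 * transferred to the first 4KB page's pv list, and the remaining
 * NPTEPG - 1 (511) entries are carved out of the spare chunks that
 * the caller reserved with reserve_pv_entries().  The net bookkeeping
 * for one demotion is therefore:
 *
 *	pv_entry_count += 511	(one entry reused, 511 created)
 *	pv_entry_spare -= 511	(taken from the reserved chunks)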
5800 */ 5801 static void 5802 pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, 5803 struct rwlock **lockp) 5804 { 5805 struct md_page *pvh; 5806 struct pv_chunk *pc; 5807 pv_entry_t pv; 5808 vm_offset_t va_last; 5809 vm_page_t m; 5810 int bit, field; 5811 5812 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 5813 KASSERT((pa & PDRMASK) == 0, 5814 ("pmap_pv_demote_pde: pa is not 2mpage aligned")); 5815 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); 5816 5817 /* 5818 * Transfer the 2mpage's pv entry for this mapping to the first 5819 * page's pv list. Once this transfer begins, the pv list lock 5820 * must not be released until the last pv entry is reinstantiated. 5821 */ 5822 pvh = pa_to_pvh(pa); 5823 va = trunc_2mpage(va); 5824 pv = pmap_pvh_remove(pvh, pmap, va); 5825 KASSERT(pv != NULL, ("pmap_pv_demote_pde: pv not found")); 5826 m = PHYS_TO_VM_PAGE(pa); 5827 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 5828 m->md.pv_gen++; 5829 /* Instantiate the remaining NPTEPG - 1 pv entries. */ 5830 PV_STAT(counter_u64_add(pv_entry_allocs, NPTEPG - 1)); 5831 va_last = va + NBPDR - PAGE_SIZE; 5832 for (;;) { 5833 pc = TAILQ_FIRST(&pmap->pm_pvchunk); 5834 KASSERT(!pc_is_full(pc), ("pmap_pv_demote_pde: missing spare")); 5835 for (field = 0; field < _NPCM; field++) { 5836 while (pc->pc_map[field]) { 5837 bit = bsfq(pc->pc_map[field]); 5838 pc->pc_map[field] &= ~(1ul << bit); 5839 pv = &pc->pc_pventry[field * 64 + bit]; 5840 va += PAGE_SIZE; 5841 pv->pv_va = va; 5842 m++; 5843 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 5844 ("pmap_pv_demote_pde: page %p is not managed", m)); 5845 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 5846 m->md.pv_gen++; 5847 if (va == va_last) 5848 goto out; 5849 } 5850 } 5851 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 5852 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); 5853 } 5854 out: 5855 if (pc_is_full(pc)) { 5856 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 5857 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); 5858 } 5859 PV_STAT(counter_u64_add(pv_entry_count, NPTEPG - 1)); 5860 PV_STAT(counter_u64_add(pv_entry_spare, -(NPTEPG - 1))); 5861 } 5862 5863 #if VM_NRESERVLEVEL > 0 5864 /* 5865 * After promotion from 512 4KB page mappings to a single 2MB page mapping, 5866 * replace the many pv entries for the 4KB page mappings by a single pv entry 5867 * for the 2MB page mapping. 5868 */ 5869 static void 5870 pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, 5871 struct rwlock **lockp) 5872 { 5873 struct md_page *pvh; 5874 pv_entry_t pv; 5875 vm_offset_t va_last; 5876 vm_page_t m; 5877 5878 KASSERT((pa & PDRMASK) == 0, 5879 ("pmap_pv_promote_pde: pa is not 2mpage aligned")); 5880 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); 5881 5882 /* 5883 * Transfer the first page's pv entry for this mapping to the 2mpage's 5884 * pv list. Aside from avoiding the cost of a call to get_pv_entry(), 5885 * a transfer avoids the possibility that get_pv_entry() calls 5886 * reclaim_pv_chunk() and that reclaim_pv_chunk() removes one of the 5887 * mappings that is being promoted. 5888 */ 5889 m = PHYS_TO_VM_PAGE(pa); 5890 va = trunc_2mpage(va); 5891 pv = pmap_pvh_remove(&m->md, pmap, va); 5892 KASSERT(pv != NULL, ("pmap_pv_promote_pde: pv not found")); 5893 pvh = pa_to_pvh(pa); 5894 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); 5895 pvh->pv_gen++; 5896 /* Free the remaining NPTEPG - 1 pv entries. 
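 * The first 4KB page's entry was transferred above, so the loop that
 * follows frees the entries for pages 2 through 512 of the 2MB range.
 * For a hypothetical va of 0x200000, va_last is
 * 0x200000 + NBPDR - PAGE_SIZE = 0x3ff000, and pmap_pvh_free() runs
 * 511 times, for 0x201000, 0x202000, ..., 0x3ff000.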
*/ 5897 va_last = va + NBPDR - PAGE_SIZE; 5898 do { 5899 m++; 5900 va += PAGE_SIZE; 5901 pmap_pvh_free(&m->md, pmap, va); 5902 } while (va < va_last); 5903 } 5904 #endif /* VM_NRESERVLEVEL > 0 */ 5905 5906 /* 5907 * First find and then destroy the pv entry for the specified pmap and virtual 5908 * address. This operation can be performed on pv lists for either 4KB or 2MB 5909 * page mappings. 5910 */ 5911 static void 5912 pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va) 5913 { 5914 pv_entry_t pv; 5915 5916 pv = pmap_pvh_remove(pvh, pmap, va); 5917 KASSERT(pv != NULL, ("pmap_pvh_free: pv not found")); 5918 free_pv_entry(pmap, pv); 5919 } 5920 5921 /* 5922 * Conditionally create the PV entry for a 4KB page mapping if the required 5923 * memory can be allocated without resorting to reclamation. 5924 */ 5925 static bool 5926 pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m, 5927 struct rwlock **lockp) 5928 { 5929 pv_entry_t pv; 5930 5931 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 5932 /* Pass NULL instead of the lock pointer to disable reclamation. */ 5933 if ((pv = get_pv_entry(pmap, NULL)) != NULL) { 5934 pv->pv_va = va; 5935 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); 5936 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 5937 m->md.pv_gen++; 5938 return (true); 5939 } else 5940 return (false); 5941 } 5942 5943 /* 5944 * Create the PV entry for a 2MB page mapping. Always returns true unless the 5945 * flag PMAP_ENTER_NORECLAIM is specified. If that flag is specified, returns 5946 * false if the PV entry cannot be allocated without resorting to reclamation. 5947 */ 5948 static bool 5949 pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, pd_entry_t pde, u_int flags, 5950 struct rwlock **lockp) 5951 { 5952 struct md_page *pvh; 5953 pv_entry_t pv; 5954 vm_paddr_t pa; 5955 5956 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 5957 /* Pass NULL instead of the lock pointer to disable reclamation. */ 5958 if ((pv = get_pv_entry(pmap, (flags & PMAP_ENTER_NORECLAIM) != 0 ? 5959 NULL : lockp)) == NULL) 5960 return (false); 5961 pv->pv_va = va; 5962 pa = pde & PG_PS_FRAME; 5963 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); 5964 pvh = pa_to_pvh(pa); 5965 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); 5966 pvh->pv_gen++; 5967 return (true); 5968 } 5969 5970 /* 5971 * Fills a page table page with mappings to consecutive physical pages. 5972 */ 5973 static void 5974 pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte) 5975 { 5976 pt_entry_t *pte; 5977 5978 for (pte = firstpte; pte < firstpte + NPTEPG; pte++) { 5979 *pte = newpte; 5980 newpte += PAGE_SIZE; 5981 } 5982 } 5983 5984 /* 5985 * Tries to demote a 2MB page mapping. If demotion fails, the 2MB page 5986 * mapping is invalidated. 
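 *
 * A typical caller pattern (hypothetical sketch; pmap_protect() below
 * uses it) is to demote before operating on only part of a 2MB range
 * and to skip the range if demotion destroyed the mapping:
 *
 *	if ((*pde & PG_PS) != 0 && !pmap_demote_pde(pmap, pde, sva))
 *		continue;	(the 2MB mapping no longer exists)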
5987 */ 5988 static bool 5989 pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va) 5990 { 5991 struct rwlock *lock; 5992 bool rv; 5993 5994 lock = NULL; 5995 rv = pmap_demote_pde_locked(pmap, pde, va, &lock); 5996 if (lock != NULL) 5997 rw_wunlock(lock); 5998 return (rv); 5999 } 6000 6001 static void 6002 pmap_demote_pde_check(pt_entry_t *firstpte __unused, pt_entry_t newpte __unused) 6003 { 6004 #ifdef INVARIANTS 6005 #ifdef DIAGNOSTIC 6006 pt_entry_t *xpte, *ypte; 6007 6008 for (xpte = firstpte; xpte < firstpte + NPTEPG; 6009 xpte++, newpte += PAGE_SIZE) { 6010 if ((*xpte & PG_FRAME) != (newpte & PG_FRAME)) { 6011 printf("pmap_demote_pde: xpte %zd and newpte map " 6012 "different pages: found %#lx, expected %#lx\n", 6013 xpte - firstpte, *xpte, newpte); 6014 printf("page table dump\n"); 6015 for (ypte = firstpte; ypte < firstpte + NPTEPG; ypte++) 6016 printf("%zd %#lx\n", ypte - firstpte, *ypte); 6017 panic("firstpte"); 6018 } 6019 } 6020 #else 6021 KASSERT((*firstpte & PG_FRAME) == (newpte & PG_FRAME), 6022 ("pmap_demote_pde: firstpte and newpte map different physical" 6023 " addresses")); 6024 #endif 6025 #endif 6026 } 6027 6028 static void 6029 pmap_demote_pde_abort(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, 6030 pd_entry_t oldpde, struct rwlock **lockp) 6031 { 6032 struct spglist free; 6033 vm_offset_t sva; 6034 6035 SLIST_INIT(&free); 6036 sva = trunc_2mpage(va); 6037 pmap_remove_pde(pmap, pde, sva, &free, lockp); 6038 if ((oldpde & pmap_global_bit(pmap)) == 0) 6039 pmap_invalidate_pde_page(pmap, sva, oldpde); 6040 vm_page_free_pages_toq(&free, true); 6041 CTR2(KTR_PMAP, "pmap_demote_pde: failure for va %#lx in pmap %p", 6042 va, pmap); 6043 } 6044 6045 static bool 6046 pmap_demote_pde_locked(pmap_t pmap, pd_entry_t *pde, vm_offset_t va, 6047 struct rwlock **lockp) 6048 { 6049 pd_entry_t newpde, oldpde; 6050 pt_entry_t *firstpte, newpte; 6051 pt_entry_t PG_A, PG_G, PG_M, PG_PKU_MASK, PG_RW, PG_V; 6052 vm_paddr_t mptepa; 6053 vm_page_t mpte; 6054 int PG_PTE_CACHE; 6055 bool in_kernel; 6056 6057 PG_A = pmap_accessed_bit(pmap); 6058 PG_G = pmap_global_bit(pmap); 6059 PG_M = pmap_modified_bit(pmap); 6060 PG_RW = pmap_rw_bit(pmap); 6061 PG_V = pmap_valid_bit(pmap); 6062 PG_PTE_CACHE = pmap_cache_mask(pmap, false); 6063 PG_PKU_MASK = pmap_pku_mask_bit(pmap); 6064 6065 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 6066 in_kernel = va >= VM_MAXUSER_ADDRESS; 6067 oldpde = *pde; 6068 KASSERT((oldpde & (PG_PS | PG_V)) == (PG_PS | PG_V), 6069 ("pmap_demote_pde: oldpde is missing PG_PS and/or PG_V")); 6070 6071 /* 6072 * Invalidate the 2MB page mapping and return "failure" if the 6073 * mapping was never accessed. 6074 */ 6075 if ((oldpde & PG_A) == 0) { 6076 KASSERT((oldpde & PG_W) == 0, 6077 ("pmap_demote_pde: a wired mapping is missing PG_A")); 6078 pmap_demote_pde_abort(pmap, va, pde, oldpde, lockp); 6079 return (false); 6080 } 6081 6082 mpte = pmap_remove_pt_page(pmap, va); 6083 if (mpte == NULL) { 6084 KASSERT((oldpde & PG_W) == 0, 6085 ("pmap_demote_pde: page table page for a wired mapping" 6086 " is missing")); 6087 6088 /* 6089 * If the page table page is missing and the mapping 6090 * is for a kernel address, the mapping must belong to 6091 * the direct map. Page table pages are preallocated 6092 * for every other part of the kernel address space, 6093 * so the direct map region is the only part of the 6094 * kernel address space that must be handled here. 
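 * For example, the direct map is created at boot from 2MB (and
 * 1GB) mappings with no 4KB page table pages behind them, so a
 * later pmap_change_attr() on a subrange of such a mapping arrives
 * here without a saved mpte and must allocate one below.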
6095 */ 6096 KASSERT(!in_kernel || (va >= DMAP_MIN_ADDRESS && 6097 va < DMAP_MAX_ADDRESS), 6098 ("pmap_demote_pde: No saved mpte for va %#lx", va)); 6099 6100 /* 6101 * If the 2MB page mapping belongs to the direct map 6102 * region of the kernel's address space, then the page 6103 * allocation request specifies the highest possible 6104 * priority (VM_ALLOC_INTERRUPT). Otherwise, the 6105 * priority is normal. 6106 */ 6107 mpte = pmap_alloc_pt_page(pmap, pmap_pde_pindex(va), 6108 (in_kernel ? VM_ALLOC_INTERRUPT : 0) | VM_ALLOC_WIRED); 6109 6110 /* 6111 * If the allocation of the new page table page fails, 6112 * invalidate the 2MB page mapping and return "failure". 6113 */ 6114 if (mpte == NULL) { 6115 pmap_demote_pde_abort(pmap, va, pde, oldpde, lockp); 6116 return (false); 6117 } 6118 6119 if (!in_kernel) 6120 mpte->ref_count = NPTEPG; 6121 } 6122 mptepa = VM_PAGE_TO_PHYS(mpte); 6123 firstpte = (pt_entry_t *)PHYS_TO_DMAP(mptepa); 6124 newpde = mptepa | PG_M | PG_A | (oldpde & PG_U) | PG_RW | PG_V; 6125 KASSERT((oldpde & (PG_M | PG_RW)) != PG_RW, 6126 ("pmap_demote_pde: oldpde is missing PG_M")); 6127 newpte = oldpde & ~PG_PS; 6128 newpte = pmap_swap_pat(pmap, newpte); 6129 6130 /* 6131 * If the PTP is not leftover from an earlier promotion or it does not 6132 * have PG_A set in every PTE, then fill it. The new PTEs will all 6133 * have PG_A set. 6134 */ 6135 if (!vm_page_all_valid(mpte)) 6136 pmap_fill_ptp(firstpte, newpte); 6137 6138 pmap_demote_pde_check(firstpte, newpte); 6139 6140 /* 6141 * If the mapping has changed attributes, update the PTEs. 6142 */ 6143 if ((*firstpte & PG_PTE_PROMOTE) != (newpte & PG_PTE_PROMOTE)) 6144 pmap_fill_ptp(firstpte, newpte); 6145 6146 /* 6147 * The spare PV entries must be reserved prior to demoting the 6148 * mapping, that is, prior to changing the PDE. Otherwise, the state 6149 * of the PDE and the PV lists will be inconsistent, which can result 6150 * in reclaim_pv_chunk() attempting to remove a PV entry from the 6151 * wrong PV list and pmap_pv_demote_pde() failing to find the expected 6152 * PV entry for the 2MB page mapping that is being demoted. 6153 */ 6154 if ((oldpde & PG_MANAGED) != 0) 6155 reserve_pv_entries(pmap, NPTEPG - 1, lockp); 6156 6157 /* 6158 * Demote the mapping. This pmap is locked. The old PDE has 6159 * PG_A set. If the old PDE has PG_RW set, it also has PG_M 6160 * set. Thus, there is no danger of a race with another 6161 * processor changing the setting of PG_A and/or PG_M between 6162 * the read above and the store below. 6163 */ 6164 if (workaround_erratum383) 6165 pmap_update_pde(pmap, va, pde, newpde); 6166 else 6167 pde_store(pde, newpde); 6168 6169 /* 6170 * Invalidate a stale recursive mapping of the page table page. 6171 */ 6172 if (in_kernel) 6173 pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va)); 6174 6175 /* 6176 * Demote the PV entry. 6177 */ 6178 if ((oldpde & PG_MANAGED) != 0) 6179 pmap_pv_demote_pde(pmap, va, oldpde & PG_PS_FRAME, lockp); 6180 6181 counter_u64_add(pmap_pde_demotions, 1); 6182 CTR2(KTR_PMAP, "pmap_demote_pde: success for va %#lx in pmap %p", 6183 va, pmap); 6184 return (true); 6185 } 6186 6187 /* 6188 * pmap_remove_kernel_pde: Remove a kernel superpage mapping. 
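 *
 * Unlike the user case, the PDE is not simply cleared: the function
 * below reinstalls a PDE that points at the (re-zeroed) page table
 * page, e.g.
 *
 *	newpde = mptepa | X86_PG_M | X86_PG_A | X86_PG_RW | X86_PG_V;
 *
 * so the kernel address range stays backed by a page table page
 * instead of becoming unmapped at the PDE level.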
6189 */ 6190 static void 6191 pmap_remove_kernel_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va) 6192 { 6193 pd_entry_t newpde; 6194 vm_paddr_t mptepa; 6195 vm_page_t mpte; 6196 6197 KASSERT(pmap == kernel_pmap, ("pmap %p is not kernel_pmap", pmap)); 6198 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 6199 mpte = pmap_remove_pt_page(pmap, va); 6200 if (mpte == NULL) 6201 panic("pmap_remove_kernel_pde: Missing pt page."); 6202 6203 mptepa = VM_PAGE_TO_PHYS(mpte); 6204 newpde = mptepa | X86_PG_M | X86_PG_A | X86_PG_RW | X86_PG_V; 6205 6206 /* 6207 * If this page table page was unmapped by a promotion, then it 6208 * contains valid mappings. Zero it to invalidate those mappings. 6209 */ 6210 if (vm_page_any_valid(mpte)) 6211 pagezero((void *)PHYS_TO_DMAP(mptepa)); 6212 6213 /* 6214 * Demote the mapping. 6215 */ 6216 if (workaround_erratum383) 6217 pmap_update_pde(pmap, va, pde, newpde); 6218 else 6219 pde_store(pde, newpde); 6220 6221 /* 6222 * Invalidate a stale recursive mapping of the page table page. 6223 */ 6224 pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va)); 6225 } 6226 6227 /* 6228 * pmap_remove_pde: do the things to unmap a superpage in a process 6229 */ 6230 static int 6231 pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva, 6232 struct spglist *free, struct rwlock **lockp) 6233 { 6234 struct md_page *pvh; 6235 pd_entry_t oldpde; 6236 vm_offset_t eva, va; 6237 vm_page_t m, mpte; 6238 pt_entry_t PG_G, PG_A, PG_M, PG_RW; 6239 6240 PG_G = pmap_global_bit(pmap); 6241 PG_A = pmap_accessed_bit(pmap); 6242 PG_M = pmap_modified_bit(pmap); 6243 PG_RW = pmap_rw_bit(pmap); 6244 6245 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 6246 KASSERT((sva & PDRMASK) == 0, 6247 ("pmap_remove_pde: sva is not 2mpage aligned")); 6248 oldpde = pte_load_clear(pdq); 6249 if (oldpde & PG_W) 6250 pmap->pm_stats.wired_count -= NBPDR / PAGE_SIZE; 6251 if ((oldpde & PG_G) != 0) 6252 pmap_invalidate_pde_page(kernel_pmap, sva, oldpde); 6253 pmap_resident_count_adj(pmap, -NBPDR / PAGE_SIZE); 6254 if (oldpde & PG_MANAGED) { 6255 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, oldpde & PG_PS_FRAME); 6256 pvh = pa_to_pvh(oldpde & PG_PS_FRAME); 6257 pmap_pvh_free(pvh, pmap, sva); 6258 eva = sva + NBPDR; 6259 for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME); 6260 va < eva; va += PAGE_SIZE, m++) { 6261 if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW)) 6262 vm_page_dirty(m); 6263 if (oldpde & PG_A) 6264 vm_page_aflag_set(m, PGA_REFERENCED); 6265 if (TAILQ_EMPTY(&m->md.pv_list) && 6266 TAILQ_EMPTY(&pvh->pv_list)) 6267 vm_page_aflag_clear(m, PGA_WRITEABLE); 6268 pmap_delayed_invl_page(m); 6269 } 6270 } 6271 if (pmap == kernel_pmap) { 6272 pmap_remove_kernel_pde(pmap, pdq, sva); 6273 } else { 6274 mpte = pmap_remove_pt_page(pmap, sva); 6275 if (mpte != NULL) { 6276 KASSERT(vm_page_any_valid(mpte), 6277 ("pmap_remove_pde: pte page not promoted")); 6278 pmap_pt_page_count_adj(pmap, -1); 6279 KASSERT(mpte->ref_count == NPTEPG, 6280 ("pmap_remove_pde: pte page ref count error")); 6281 mpte->ref_count = 0; 6282 pmap_add_delayed_free_list(mpte, free, false); 6283 } 6284 } 6285 return (pmap_unuse_pt(pmap, sva, *pmap_pdpe(pmap, sva), free)); 6286 } 6287 6288 /* 6289 * pmap_remove_pte: do the things to unmap a page in a process 6290 */ 6291 static int 6292 pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va, 6293 pd_entry_t ptepde, struct spglist *free, struct rwlock **lockp) 6294 { 6295 struct md_page *pvh; 6296 pt_entry_t oldpte, PG_A, PG_M, PG_RW; 6297 vm_page_t m; 6298 6299 PG_A = pmap_accessed_bit(pmap); 6300 PG_M = 
pmap_modified_bit(pmap); 6301 PG_RW = pmap_rw_bit(pmap); 6302 6303 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 6304 oldpte = pte_load_clear(ptq); 6305 if (oldpte & PG_W) 6306 pmap->pm_stats.wired_count -= 1; 6307 pmap_resident_count_adj(pmap, -1); 6308 if (oldpte & PG_MANAGED) { 6309 m = PHYS_TO_VM_PAGE(oldpte & PG_FRAME); 6310 if ((oldpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) 6311 vm_page_dirty(m); 6312 if (oldpte & PG_A) 6313 vm_page_aflag_set(m, PGA_REFERENCED); 6314 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); 6315 pmap_pvh_free(&m->md, pmap, va); 6316 if (TAILQ_EMPTY(&m->md.pv_list) && 6317 (m->flags & PG_FICTITIOUS) == 0) { 6318 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 6319 if (TAILQ_EMPTY(&pvh->pv_list)) 6320 vm_page_aflag_clear(m, PGA_WRITEABLE); 6321 } 6322 pmap_delayed_invl_page(m); 6323 } 6324 return (pmap_unuse_pt(pmap, va, ptepde, free)); 6325 } 6326 6327 /* 6328 * Remove a single page from a process address space 6329 */ 6330 static void 6331 pmap_remove_page(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, 6332 struct spglist *free) 6333 { 6334 struct rwlock *lock; 6335 pt_entry_t *pte, PG_V; 6336 6337 PG_V = pmap_valid_bit(pmap); 6338 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 6339 if ((*pde & PG_V) == 0) 6340 return; 6341 pte = pmap_pde_to_pte(pde, va); 6342 if ((*pte & PG_V) == 0) 6343 return; 6344 lock = NULL; 6345 pmap_remove_pte(pmap, pte, va, *pde, free, &lock); 6346 if (lock != NULL) 6347 rw_wunlock(lock); 6348 pmap_invalidate_page(pmap, va); 6349 } 6350 6351 /* 6352 * Removes the specified range of addresses from the page table page. 6353 */ 6354 static bool 6355 pmap_remove_ptes(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, 6356 pd_entry_t *pde, struct spglist *free, struct rwlock **lockp) 6357 { 6358 pt_entry_t PG_G, *pte; 6359 vm_offset_t va; 6360 bool anyvalid; 6361 6362 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 6363 PG_G = pmap_global_bit(pmap); 6364 anyvalid = false; 6365 va = eva; 6366 for (pte = pmap_pde_to_pte(pde, sva); sva != eva; pte++, 6367 sva += PAGE_SIZE) { 6368 if (*pte == 0) { 6369 if (va != eva) { 6370 pmap_invalidate_range(pmap, va, sva); 6371 va = eva; 6372 } 6373 continue; 6374 } 6375 if ((*pte & PG_G) == 0) 6376 anyvalid = true; 6377 else if (va == eva) 6378 va = sva; 6379 if (pmap_remove_pte(pmap, pte, sva, *pde, free, lockp)) { 6380 sva += PAGE_SIZE; 6381 break; 6382 } 6383 } 6384 if (va != eva) 6385 pmap_invalidate_range(pmap, va, sva); 6386 return (anyvalid); 6387 } 6388 6389 static void 6390 pmap_remove1(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, bool map_delete) 6391 { 6392 struct rwlock *lock; 6393 vm_page_t mt; 6394 vm_offset_t va_next; 6395 pml5_entry_t *pml5e; 6396 pml4_entry_t *pml4e; 6397 pdp_entry_t *pdpe; 6398 pd_entry_t ptpaddr, *pde; 6399 pt_entry_t PG_G, PG_V; 6400 struct spglist free; 6401 int anyvalid; 6402 6403 PG_G = pmap_global_bit(pmap); 6404 PG_V = pmap_valid_bit(pmap); 6405 6406 /* 6407 * If there are no resident pages besides the top level page 6408 * table page(s), there is nothing to do. Kernel pmap always 6409 * accounts whole preloaded area as resident, which makes its 6410 * resident count > 2. 6411 * Perform an unsynchronized read. This is, however, safe. 6412 */ 6413 if (pmap->pm_stats.resident_count <= 1 + (pmap->pm_pmltopu != NULL ? 6414 1 : 0)) 6415 return; 6416 6417 anyvalid = 0; 6418 SLIST_INIT(&free); 6419 6420 pmap_delayed_invl_start(); 6421 PMAP_LOCK(pmap); 6422 if (map_delete) 6423 pmap_pkru_on_remove(pmap, sva, eva); 6424 6425 /* 6426 * special handling of removing one page. 
a very 6427 * common operation and easy to short circuit some 6428 * code. 6429 */ 6430 if (sva + PAGE_SIZE == eva) { 6431 pde = pmap_pde(pmap, sva); 6432 if (pde && (*pde & PG_PS) == 0) { 6433 pmap_remove_page(pmap, sva, pde, &free); 6434 goto out; 6435 } 6436 } 6437 6438 lock = NULL; 6439 for (; sva < eva; sva = va_next) { 6440 if (pmap->pm_stats.resident_count == 0) 6441 break; 6442 6443 if (pmap_is_la57(pmap)) { 6444 pml5e = pmap_pml5e(pmap, sva); 6445 if ((*pml5e & PG_V) == 0) { 6446 va_next = (sva + NBPML5) & ~PML5MASK; 6447 if (va_next < sva) 6448 va_next = eva; 6449 continue; 6450 } 6451 pml4e = pmap_pml5e_to_pml4e(pml5e, sva); 6452 } else { 6453 pml4e = pmap_pml4e(pmap, sva); 6454 } 6455 if ((*pml4e & PG_V) == 0) { 6456 va_next = (sva + NBPML4) & ~PML4MASK; 6457 if (va_next < sva) 6458 va_next = eva; 6459 continue; 6460 } 6461 6462 va_next = (sva + NBPDP) & ~PDPMASK; 6463 if (va_next < sva) 6464 va_next = eva; 6465 pdpe = pmap_pml4e_to_pdpe(pml4e, sva); 6466 if ((*pdpe & PG_V) == 0) 6467 continue; 6468 if ((*pdpe & PG_PS) != 0) { 6469 KASSERT(va_next <= eva, 6470 ("partial update of non-transparent 1G mapping " 6471 "pdpe %#lx sva %#lx eva %#lx va_next %#lx", 6472 *pdpe, sva, eva, va_next)); 6473 MPASS(pmap != kernel_pmap); /* XXXKIB */ 6474 MPASS((*pdpe & (PG_MANAGED | PG_G)) == 0); 6475 anyvalid = 1; 6476 *pdpe = 0; 6477 pmap_resident_count_adj(pmap, -NBPDP / PAGE_SIZE); 6478 mt = PHYS_TO_VM_PAGE(*pmap_pml4e(pmap, sva) & PG_FRAME); 6479 pmap_unwire_ptp(pmap, sva, mt, &free); 6480 continue; 6481 } 6482 6483 /* 6484 * Calculate index for next page table. 6485 */ 6486 va_next = (sva + NBPDR) & ~PDRMASK; 6487 if (va_next < sva) 6488 va_next = eva; 6489 6490 pde = pmap_pdpe_to_pde(pdpe, sva); 6491 ptpaddr = *pde; 6492 6493 /* 6494 * Weed out invalid mappings. 6495 */ 6496 if (ptpaddr == 0) 6497 continue; 6498 6499 /* 6500 * Check for large page. 6501 */ 6502 if ((ptpaddr & PG_PS) != 0) { 6503 /* 6504 * Are we removing the entire large page? If not, 6505 * demote the mapping and fall through. 6506 */ 6507 if (sva + NBPDR == va_next && eva >= va_next) { 6508 /* 6509 * The TLB entry for a PG_G mapping is 6510 * invalidated by pmap_remove_pde(). 6511 */ 6512 if ((ptpaddr & PG_G) == 0) 6513 anyvalid = 1; 6514 pmap_remove_pde(pmap, pde, sva, &free, &lock); 6515 continue; 6516 } else if (!pmap_demote_pde_locked(pmap, pde, sva, 6517 &lock)) { 6518 /* The large page mapping was destroyed. */ 6519 continue; 6520 } else 6521 ptpaddr = *pde; 6522 } 6523 6524 /* 6525 * Limit our scan to either the end of the va represented 6526 * by the current page table page, or to the end of the 6527 * range being removed. 6528 */ 6529 if (va_next > eva) 6530 va_next = eva; 6531 6532 if (pmap_remove_ptes(pmap, sva, va_next, pde, &free, &lock)) 6533 anyvalid = 1; 6534 } 6535 if (lock != NULL) 6536 rw_wunlock(lock); 6537 out: 6538 if (anyvalid) 6539 pmap_invalidate_all(pmap); 6540 PMAP_UNLOCK(pmap); 6541 pmap_delayed_invl_finish(); 6542 vm_page_free_pages_toq(&free, true); 6543 } 6544 6545 /* 6546 * Remove the given range of addresses from the specified map. 6547 * 6548 * It is assumed that the start and end are properly 6549 * rounded to the page size. 6550 */ 6551 void 6552 pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 6553 { 6554 pmap_remove1(pmap, sva, eva, false); 6555 } 6556 6557 /* 6558 * Remove the given range of addresses as part of a logical unmap 6559 * operation. 
This has the effect of calling pmap_remove(), but 6560 * also clears any metadata that should persist for the lifetime 6561 * of a logical mapping. 6562 */ 6563 void 6564 pmap_map_delete(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 6565 { 6566 pmap_remove1(pmap, sva, eva, true); 6567 } 6568 6569 /* 6570 * Routine: pmap_remove_all 6571 * Function: 6572 * Removes this physical page from 6573 * all physical maps in which it resides. 6574 * Reflects back modify bits to the pager. 6575 * 6576 * Notes: 6577 * Original versions of this routine were very 6578 * inefficient because they iteratively called 6579 * pmap_remove (slow...) 6580 */ 6581 6582 void 6583 pmap_remove_all(vm_page_t m) 6584 { 6585 struct md_page *pvh; 6586 pv_entry_t pv; 6587 pmap_t pmap; 6588 struct rwlock *lock; 6589 pt_entry_t *pte, tpte, PG_A, PG_M, PG_RW; 6590 pd_entry_t *pde; 6591 vm_offset_t va; 6592 struct spglist free; 6593 int pvh_gen, md_gen; 6594 6595 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 6596 ("pmap_remove_all: page %p is not managed", m)); 6597 SLIST_INIT(&free); 6598 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 6599 pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : 6600 pa_to_pvh(VM_PAGE_TO_PHYS(m)); 6601 rw_wlock(lock); 6602 retry: 6603 while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) { 6604 pmap = PV_PMAP(pv); 6605 if (!PMAP_TRYLOCK(pmap)) { 6606 pvh_gen = pvh->pv_gen; 6607 rw_wunlock(lock); 6608 PMAP_LOCK(pmap); 6609 rw_wlock(lock); 6610 if (pvh_gen != pvh->pv_gen) { 6611 PMAP_UNLOCK(pmap); 6612 goto retry; 6613 } 6614 } 6615 va = pv->pv_va; 6616 pde = pmap_pde(pmap, va); 6617 (void)pmap_demote_pde_locked(pmap, pde, va, &lock); 6618 PMAP_UNLOCK(pmap); 6619 } 6620 while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { 6621 pmap = PV_PMAP(pv); 6622 if (!PMAP_TRYLOCK(pmap)) { 6623 pvh_gen = pvh->pv_gen; 6624 md_gen = m->md.pv_gen; 6625 rw_wunlock(lock); 6626 PMAP_LOCK(pmap); 6627 rw_wlock(lock); 6628 if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { 6629 PMAP_UNLOCK(pmap); 6630 goto retry; 6631 } 6632 } 6633 PG_A = pmap_accessed_bit(pmap); 6634 PG_M = pmap_modified_bit(pmap); 6635 PG_RW = pmap_rw_bit(pmap); 6636 pmap_resident_count_adj(pmap, -1); 6637 pde = pmap_pde(pmap, pv->pv_va); 6638 KASSERT((*pde & PG_PS) == 0, ("pmap_remove_all: found" 6639 " a 2mpage in page %p's pv list", m)); 6640 pte = pmap_pde_to_pte(pde, pv->pv_va); 6641 tpte = pte_load_clear(pte); 6642 if (tpte & PG_W) 6643 pmap->pm_stats.wired_count--; 6644 if (tpte & PG_A) 6645 vm_page_aflag_set(m, PGA_REFERENCED); 6646 6647 /* 6648 * Update the vm_page_t clean and reference bits. 
6649 */ 6650 if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) 6651 vm_page_dirty(m); 6652 pmap_unuse_pt(pmap, pv->pv_va, *pde, &free); 6653 pmap_invalidate_page(pmap, pv->pv_va); 6654 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 6655 m->md.pv_gen++; 6656 free_pv_entry(pmap, pv); 6657 PMAP_UNLOCK(pmap); 6658 } 6659 vm_page_aflag_clear(m, PGA_WRITEABLE); 6660 rw_wunlock(lock); 6661 pmap_delayed_invl_wait(m); 6662 vm_page_free_pages_toq(&free, true); 6663 } 6664 6665 /* 6666 * pmap_protect_pde: do the things to protect a 2mpage in a process 6667 */ 6668 static bool 6669 pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva, vm_prot_t prot) 6670 { 6671 pd_entry_t newpde, oldpde; 6672 vm_page_t m, mt; 6673 bool anychanged; 6674 pt_entry_t PG_G, PG_M, PG_RW; 6675 6676 PG_G = pmap_global_bit(pmap); 6677 PG_M = pmap_modified_bit(pmap); 6678 PG_RW = pmap_rw_bit(pmap); 6679 6680 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 6681 KASSERT((sva & PDRMASK) == 0, 6682 ("pmap_protect_pde: sva is not 2mpage aligned")); 6683 anychanged = false; 6684 retry: 6685 oldpde = newpde = *pde; 6686 if ((prot & VM_PROT_WRITE) == 0) { 6687 if ((oldpde & (PG_MANAGED | PG_M | PG_RW)) == 6688 (PG_MANAGED | PG_M | PG_RW)) { 6689 m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME); 6690 for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++) 6691 vm_page_dirty(mt); 6692 } 6693 newpde &= ~(PG_RW | PG_M); 6694 } 6695 if ((prot & VM_PROT_EXECUTE) == 0) 6696 newpde |= pg_nx; 6697 if (newpde != oldpde) { 6698 /* 6699 * As an optimization to future operations on this PDE, clear 6700 * PG_PROMOTED. The impending invalidation will remove any 6701 * lingering 4KB page mappings from the TLB. 6702 */ 6703 if (!atomic_cmpset_long(pde, oldpde, newpde & ~PG_PROMOTED)) 6704 goto retry; 6705 if ((oldpde & PG_G) != 0) 6706 pmap_invalidate_pde_page(kernel_pmap, sva, oldpde); 6707 else 6708 anychanged = true; 6709 } 6710 return (anychanged); 6711 } 6712 6713 /* 6714 * Set the physical protection on the 6715 * specified range of this map as requested. 6716 */ 6717 void 6718 pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) 6719 { 6720 vm_page_t m; 6721 vm_offset_t va_next; 6722 pml4_entry_t *pml4e; 6723 pdp_entry_t *pdpe; 6724 pd_entry_t ptpaddr, *pde; 6725 pt_entry_t *pte, PG_G, PG_M, PG_RW, PG_V; 6726 pt_entry_t obits, pbits; 6727 bool anychanged; 6728 6729 KASSERT((prot & ~VM_PROT_ALL) == 0, ("invalid prot %x", prot)); 6730 if (prot == VM_PROT_NONE) { 6731 pmap_remove(pmap, sva, eva); 6732 return; 6733 } 6734 6735 if ((prot & (VM_PROT_WRITE|VM_PROT_EXECUTE)) == 6736 (VM_PROT_WRITE|VM_PROT_EXECUTE)) 6737 return; 6738 6739 PG_G = pmap_global_bit(pmap); 6740 PG_M = pmap_modified_bit(pmap); 6741 PG_V = pmap_valid_bit(pmap); 6742 PG_RW = pmap_rw_bit(pmap); 6743 anychanged = false; 6744 6745 /* 6746 * Although this function delays and batches the invalidation 6747 * of stale TLB entries, it does not need to call 6748 * pmap_delayed_invl_start() and 6749 * pmap_delayed_invl_finish(), because it does not 6750 * ordinarily destroy mappings. Stale TLB entries from 6751 * protection-only changes need only be invalidated before the 6752 * pmap lock is released, because protection-only changes do 6753 * not destroy PV entries. Even operations that iterate over 6754 * a physical page's PV list of mappings, like 6755 * pmap_remove_write(), acquire the pmap lock for each 6756 * mapping. Consequently, for protection-only changes, the 6757 * pmap lock suffices to synchronize both page table and TLB 6758 * updates. 
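 * For example, a concurrent pmap_remove_write() on a page mapped in
 * this range must take this pmap's lock before inspecting the PTE,
 * so it observes either the PTE as it was before this function ran
 * or the already-protected PTE after the TLB has been invalidated.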
6759 * 6760 * This function only destroys a mapping if pmap_demote_pde() 6761 * fails. In that case, stale TLB entries are immediately 6762 * invalidated. 6763 */ 6764 6765 PMAP_LOCK(pmap); 6766 for (; sva < eva; sva = va_next) { 6767 pml4e = pmap_pml4e(pmap, sva); 6768 if (pml4e == NULL || (*pml4e & PG_V) == 0) { 6769 va_next = (sva + NBPML4) & ~PML4MASK; 6770 if (va_next < sva) 6771 va_next = eva; 6772 continue; 6773 } 6774 6775 va_next = (sva + NBPDP) & ~PDPMASK; 6776 if (va_next < sva) 6777 va_next = eva; 6778 pdpe = pmap_pml4e_to_pdpe(pml4e, sva); 6779 if ((*pdpe & PG_V) == 0) 6780 continue; 6781 if ((*pdpe & PG_PS) != 0) { 6782 KASSERT(va_next <= eva, 6783 ("partial update of non-transparent 1G mapping " 6784 "pdpe %#lx sva %#lx eva %#lx va_next %#lx", 6785 *pdpe, sva, eva, va_next)); 6786 retry_pdpe: 6787 obits = pbits = *pdpe; 6788 MPASS((pbits & (PG_MANAGED | PG_G)) == 0); 6789 MPASS(pmap != kernel_pmap); /* XXXKIB */ 6790 if ((prot & VM_PROT_WRITE) == 0) 6791 pbits &= ~(PG_RW | PG_M); 6792 if ((prot & VM_PROT_EXECUTE) == 0) 6793 pbits |= pg_nx; 6794 6795 if (pbits != obits) { 6796 if (!atomic_cmpset_long(pdpe, obits, pbits)) 6797 /* PG_PS cannot be cleared under us, */ 6798 goto retry_pdpe; 6799 anychanged = true; 6800 } 6801 continue; 6802 } 6803 6804 va_next = (sva + NBPDR) & ~PDRMASK; 6805 if (va_next < sva) 6806 va_next = eva; 6807 6808 pde = pmap_pdpe_to_pde(pdpe, sva); 6809 ptpaddr = *pde; 6810 6811 /* 6812 * Weed out invalid mappings. 6813 */ 6814 if (ptpaddr == 0) 6815 continue; 6816 6817 /* 6818 * Check for large page. 6819 */ 6820 if ((ptpaddr & PG_PS) != 0) { 6821 /* 6822 * Are we protecting the entire large page? 6823 */ 6824 if (sva + NBPDR == va_next && eva >= va_next) { 6825 /* 6826 * The TLB entry for a PG_G mapping is 6827 * invalidated by pmap_protect_pde(). 6828 */ 6829 if (pmap_protect_pde(pmap, pde, sva, prot)) 6830 anychanged = true; 6831 continue; 6832 } 6833 6834 /* 6835 * Does the large page mapping need to change? If so, 6836 * demote it and fall through. 6837 */ 6838 pbits = ptpaddr; 6839 if ((prot & VM_PROT_WRITE) == 0) 6840 pbits &= ~(PG_RW | PG_M); 6841 if ((prot & VM_PROT_EXECUTE) == 0) 6842 pbits |= pg_nx; 6843 if (ptpaddr == pbits || !pmap_demote_pde(pmap, pde, 6844 sva)) { 6845 /* 6846 * Either the large page mapping doesn't need 6847 * to change, or it was destroyed during 6848 * demotion. 
6849 */ 6850 continue; 6851 } 6852 } 6853 6854 if (va_next > eva) 6855 va_next = eva; 6856 6857 for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++, 6858 sva += PAGE_SIZE) { 6859 retry: 6860 obits = pbits = *pte; 6861 if ((pbits & PG_V) == 0) 6862 continue; 6863 6864 if ((prot & VM_PROT_WRITE) == 0) { 6865 if ((pbits & (PG_MANAGED | PG_M | PG_RW)) == 6866 (PG_MANAGED | PG_M | PG_RW)) { 6867 m = PHYS_TO_VM_PAGE(pbits & PG_FRAME); 6868 vm_page_dirty(m); 6869 } 6870 pbits &= ~(PG_RW | PG_M); 6871 } 6872 if ((prot & VM_PROT_EXECUTE) == 0) 6873 pbits |= pg_nx; 6874 6875 if (pbits != obits) { 6876 if (!atomic_cmpset_long(pte, obits, pbits)) 6877 goto retry; 6878 if (obits & PG_G) 6879 pmap_invalidate_page(pmap, sva); 6880 else 6881 anychanged = true; 6882 } 6883 } 6884 } 6885 if (anychanged) 6886 pmap_invalidate_all(pmap); 6887 PMAP_UNLOCK(pmap); 6888 } 6889 6890 static bool 6891 pmap_pde_ept_executable(pmap_t pmap, pd_entry_t pde) 6892 { 6893 6894 if (pmap->pm_type != PT_EPT) 6895 return (false); 6896 return ((pde & EPT_PG_EXECUTE) != 0); 6897 } 6898 6899 #if VM_NRESERVLEVEL > 0 6900 /* 6901 * Tries to promote the 512, contiguous 4KB page mappings that are within a 6902 * single page table page (PTP) to a single 2MB page mapping. For promotion 6903 * to occur, two conditions must be met: (1) the 4KB page mappings must map 6904 * aligned, contiguous physical memory and (2) the 4KB page mappings must have 6905 * identical characteristics. 6906 */ 6907 static bool 6908 pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va, vm_page_t mpte, 6909 struct rwlock **lockp) 6910 { 6911 pd_entry_t newpde; 6912 pt_entry_t *firstpte, oldpte, pa, *pte; 6913 pt_entry_t allpte_PG_A, PG_A, PG_G, PG_M, PG_PKU_MASK, PG_RW, PG_V; 6914 int PG_PTE_CACHE; 6915 6916 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 6917 if (!pmap_ps_enabled(pmap)) 6918 return (false); 6919 6920 PG_A = pmap_accessed_bit(pmap); 6921 PG_G = pmap_global_bit(pmap); 6922 PG_M = pmap_modified_bit(pmap); 6923 PG_V = pmap_valid_bit(pmap); 6924 PG_RW = pmap_rw_bit(pmap); 6925 PG_PKU_MASK = pmap_pku_mask_bit(pmap); 6926 PG_PTE_CACHE = pmap_cache_mask(pmap, false); 6927 6928 /* 6929 * Examine the first PTE in the specified PTP. Abort if this PTE is 6930 * ineligible for promotion due to hardware errata, invalid, or does 6931 * not map the first 4KB physical page within a 2MB page. 6932 */ 6933 firstpte = (pt_entry_t *)PHYS_TO_DMAP(*pde & PG_FRAME); 6934 newpde = *firstpte; 6935 if (!pmap_allow_2m_x_page(pmap, pmap_pde_ept_executable(pmap, newpde))) 6936 return (false); 6937 if ((newpde & ((PG_FRAME & PDRMASK) | PG_V)) != PG_V) { 6938 counter_u64_add(pmap_pde_p_failures, 1); 6939 CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx" 6940 " in pmap %p", va, pmap); 6941 return (false); 6942 } 6943 6944 /* 6945 * Both here and in the below "for" loop, to allow for repromotion 6946 * after MADV_FREE, conditionally write protect a clean PTE before 6947 * possibly aborting the promotion due to other PTE attributes. Why? 6948 * Suppose that MADV_FREE is applied to a part of a superpage, the 6949 * address range [S, E). pmap_advise() will demote the superpage 6950 * mapping, destroy the 4KB page mapping at the end of [S, E), and 6951 * clear PG_M and PG_A in the PTEs for the rest of [S, E). Later, 6952 * imagine that the memory in [S, E) is recycled, but the last 4KB 6953 * page in [S, E) is not the last to be rewritten, or simply accessed. 
6954 * In other words, there is still a 4KB page in [S, E), call it P, 6955 * that is writeable but PG_M and PG_A are clear in P's PTE. Unless 6956 * we write protect P before aborting the promotion, if and when P is 6957 * finally rewritten, there won't be a page fault to trigger 6958 * repromotion. 6959 */ 6960 setpde: 6961 if ((newpde & (PG_M | PG_RW)) == PG_RW) { 6962 /* 6963 * When PG_M is already clear, PG_RW can be cleared without 6964 * a TLB invalidation. 6965 */ 6966 if (!atomic_fcmpset_long(firstpte, &newpde, newpde & ~PG_RW)) 6967 goto setpde; 6968 newpde &= ~PG_RW; 6969 CTR2(KTR_PMAP, "pmap_promote_pde: protect for va %#lx" 6970 " in pmap %p", va & ~PDRMASK, pmap); 6971 } 6972 6973 /* 6974 * Examine each of the other PTEs in the specified PTP. Abort if this 6975 * PTE maps an unexpected 4KB physical page or does not have identical 6976 * characteristics to the first PTE. 6977 */ 6978 allpte_PG_A = newpde & PG_A; 6979 pa = (newpde & (PG_PS_FRAME | PG_V)) + NBPDR - PAGE_SIZE; 6980 for (pte = firstpte + NPTEPG - 1; pte > firstpte; pte--) { 6981 oldpte = *pte; 6982 if ((oldpte & (PG_FRAME | PG_V)) != pa) { 6983 counter_u64_add(pmap_pde_p_failures, 1); 6984 CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx" 6985 " in pmap %p", va, pmap); 6986 return (false); 6987 } 6988 setpte: 6989 if ((oldpte & (PG_M | PG_RW)) == PG_RW) { 6990 /* 6991 * When PG_M is already clear, PG_RW can be cleared 6992 * without a TLB invalidation. 6993 */ 6994 if (!atomic_fcmpset_long(pte, &oldpte, oldpte & ~PG_RW)) 6995 goto setpte; 6996 oldpte &= ~PG_RW; 6997 CTR2(KTR_PMAP, "pmap_promote_pde: protect for va %#lx" 6998 " in pmap %p", (oldpte & PG_FRAME & PDRMASK) | 6999 (va & ~PDRMASK), pmap); 7000 } 7001 if ((oldpte & PG_PTE_PROMOTE) != (newpde & PG_PTE_PROMOTE)) { 7002 counter_u64_add(pmap_pde_p_failures, 1); 7003 CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx" 7004 " in pmap %p", va, pmap); 7005 return (false); 7006 } 7007 allpte_PG_A &= oldpte; 7008 pa -= PAGE_SIZE; 7009 } 7010 7011 /* 7012 * Unless all PTEs have PG_A set, clear it from the superpage mapping, 7013 * so that promotions triggered by speculative mappings, such as 7014 * pmap_enter_quick(), don't automatically mark the underlying pages 7015 * as referenced. 7016 */ 7017 newpde &= ~PG_A | allpte_PG_A; 7018 7019 /* 7020 * EPT PTEs with PG_M set and PG_A clear are not supported by early 7021 * MMUs supporting EPT. 7022 */ 7023 KASSERT((newpde & PG_A) != 0 || safe_to_clear_referenced(pmap, newpde), 7024 ("unsupported EPT PTE")); 7025 7026 /* 7027 * Save the PTP in its current state until the PDE mapping the 7028 * superpage is demoted by pmap_demote_pde() or destroyed by 7029 * pmap_remove_pde(). If PG_A is not set in every PTE, then request 7030 * that the PTP be refilled on demotion. 7031 */ 7032 if (mpte == NULL) 7033 mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME); 7034 KASSERT(mpte >= vm_page_array && 7035 mpte < &vm_page_array[vm_page_array_size], 7036 ("pmap_promote_pde: page table page is out of range")); 7037 KASSERT(mpte->pindex == pmap_pde_pindex(va), 7038 ("pmap_promote_pde: page table page's pindex is wrong " 7039 "mpte %p pidx %#lx va %#lx va pde pidx %#lx", 7040 mpte, mpte->pindex, va, pmap_pde_pindex(va))); 7041 if (pmap_insert_pt_page(pmap, mpte, true, allpte_PG_A != 0)) { 7042 counter_u64_add(pmap_pde_p_failures, 1); 7043 CTR2(KTR_PMAP, 7044 "pmap_promote_pde: failure for va %#lx in pmap %p", va, 7045 pmap); 7046 return (false); 7047 } 7048 7049 /* 7050 * Promote the pv entries. 
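 * For a managed mapping this replaces the 512 individual 4KB pv
 * entries with a single pv entry on the 2MB page's pv list; the
 * other 511 entries are freed back to the pmap's pv chunks by
 * pmap_pv_promote_pde() above.  An unmanaged mapping carries no pv
 * entries, hence the PG_MANAGED test below.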
7051 */ 7052 if ((newpde & PG_MANAGED) != 0) 7053 pmap_pv_promote_pde(pmap, va, newpde & PG_PS_FRAME, lockp); 7054 7055 /* 7056 * Propagate the PAT index to its proper position. 7057 */ 7058 newpde = pmap_swap_pat(pmap, newpde); 7059 7060 /* 7061 * Map the superpage. 7062 */ 7063 if (workaround_erratum383) 7064 pmap_update_pde(pmap, va, pde, PG_PS | newpde); 7065 else 7066 pde_store(pde, PG_PROMOTED | PG_PS | newpde); 7067 7068 counter_u64_add(pmap_pde_promotions, 1); 7069 CTR2(KTR_PMAP, "pmap_promote_pde: success for va %#lx" 7070 " in pmap %p", va, pmap); 7071 return (true); 7072 } 7073 #endif /* VM_NRESERVLEVEL > 0 */ 7074 7075 static int 7076 pmap_enter_largepage(pmap_t pmap, vm_offset_t va, pt_entry_t newpte, int flags, 7077 int psind) 7078 { 7079 vm_page_t mp; 7080 pt_entry_t origpte, *pml4e, *pdpe, *pde, pten, PG_V; 7081 7082 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 7083 KASSERT(psind > 0 && psind < MAXPAGESIZES && pagesizes[psind] != 0, 7084 ("psind %d unexpected", psind)); 7085 KASSERT(((newpte & PG_FRAME) & (pagesizes[psind] - 1)) == 0, 7086 ("unaligned phys address %#lx newpte %#lx psind %d", 7087 newpte & PG_FRAME, newpte, psind)); 7088 KASSERT((va & (pagesizes[psind] - 1)) == 0, 7089 ("unaligned va %#lx psind %d", va, psind)); 7090 KASSERT(va < VM_MAXUSER_ADDRESS, 7091 ("kernel mode non-transparent superpage")); /* XXXKIB */ 7092 KASSERT(va + pagesizes[psind] < VM_MAXUSER_ADDRESS, 7093 ("overflowing user map va %#lx psind %d", va, psind)); /* XXXKIB */ 7094 7095 PG_V = pmap_valid_bit(pmap); 7096 7097 restart: 7098 pten = newpte; 7099 if (!pmap_pkru_same(pmap, va, va + pagesizes[psind], &pten)) 7100 return (KERN_PROTECTION_FAILURE); 7101 7102 if (psind == 2) { /* 1G */ 7103 pml4e = pmap_pml4e(pmap, va); 7104 if (pml4e == NULL || (*pml4e & PG_V) == 0) { 7105 mp = pmap_allocpte_alloc(pmap, pmap_pml4e_pindex(va), 7106 NULL, va); 7107 if (mp == NULL) 7108 goto allocf; 7109 pdpe = (pdp_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mp)); 7110 pdpe = &pdpe[pmap_pdpe_index(va)]; 7111 origpte = *pdpe; 7112 MPASS(origpte == 0); 7113 } else { 7114 pdpe = pmap_pml4e_to_pdpe(pml4e, va); 7115 KASSERT(pdpe != NULL, ("va %#lx lost pdpe", va)); 7116 origpte = *pdpe; 7117 if ((origpte & PG_V) == 0) { 7118 mp = PHYS_TO_VM_PAGE(*pml4e & PG_FRAME); 7119 mp->ref_count++; 7120 } 7121 } 7122 *pdpe = pten; 7123 } else /* (psind == 1) */ { /* 2M */ 7124 pde = pmap_pde(pmap, va); 7125 if (pde == NULL) { 7126 mp = pmap_allocpte_alloc(pmap, pmap_pdpe_pindex(va), 7127 NULL, va); 7128 if (mp == NULL) 7129 goto allocf; 7130 pde = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mp)); 7131 pde = &pde[pmap_pde_index(va)]; 7132 origpte = *pde; 7133 MPASS(origpte == 0); 7134 } else { 7135 origpte = *pde; 7136 if ((origpte & PG_V) == 0) { 7137 pdpe = pmap_pdpe(pmap, va); 7138 MPASS(pdpe != NULL && (*pdpe & PG_V) != 0); 7139 mp = PHYS_TO_VM_PAGE(*pdpe & PG_FRAME); 7140 mp->ref_count++; 7141 } 7142 } 7143 *pde = pten; 7144 } 7145 KASSERT((origpte & PG_V) == 0 || ((origpte & PG_PS) != 0 && 7146 (origpte & PG_PS_FRAME) == (pten & PG_PS_FRAME)), 7147 ("va %#lx changing %s phys page origpte %#lx pten %#lx", 7148 va, psind == 2 ? 
"1G" : "2M", origpte, pten)); 7149 if ((pten & PG_W) != 0 && (origpte & PG_W) == 0) 7150 pmap->pm_stats.wired_count += pagesizes[psind] / PAGE_SIZE; 7151 else if ((pten & PG_W) == 0 && (origpte & PG_W) != 0) 7152 pmap->pm_stats.wired_count -= pagesizes[psind] / PAGE_SIZE; 7153 if ((origpte & PG_V) == 0) 7154 pmap_resident_count_adj(pmap, pagesizes[psind] / PAGE_SIZE); 7155 7156 return (KERN_SUCCESS); 7157 7158 allocf: 7159 if ((flags & PMAP_ENTER_NOSLEEP) != 0) 7160 return (KERN_RESOURCE_SHORTAGE); 7161 PMAP_UNLOCK(pmap); 7162 vm_wait(NULL); 7163 PMAP_LOCK(pmap); 7164 goto restart; 7165 } 7166 7167 /* 7168 * Insert the given physical page (p) at 7169 * the specified virtual address (v) in the 7170 * target physical map with the protection requested. 7171 * 7172 * If specified, the page will be wired down, meaning 7173 * that the related pte can not be reclaimed. 7174 * 7175 * NB: This is the only routine which MAY NOT lazy-evaluate 7176 * or lose information. That is, this routine must actually 7177 * insert this page into the given map NOW. 7178 * 7179 * When destroying both a page table and PV entry, this function 7180 * performs the TLB invalidation before releasing the PV list 7181 * lock, so we do not need pmap_delayed_invl_page() calls here. 7182 */ 7183 int 7184 pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, 7185 u_int flags, int8_t psind) 7186 { 7187 struct rwlock *lock; 7188 pd_entry_t *pde; 7189 pt_entry_t *pte, PG_G, PG_A, PG_M, PG_RW, PG_V; 7190 pt_entry_t newpte, origpte; 7191 pv_entry_t pv; 7192 vm_paddr_t opa, pa; 7193 vm_page_t mpte, om; 7194 int rv; 7195 bool nosleep; 7196 7197 PG_A = pmap_accessed_bit(pmap); 7198 PG_G = pmap_global_bit(pmap); 7199 PG_M = pmap_modified_bit(pmap); 7200 PG_V = pmap_valid_bit(pmap); 7201 PG_RW = pmap_rw_bit(pmap); 7202 7203 va = trunc_page(va); 7204 KASSERT(va <= VM_MAX_KERNEL_ADDRESS, ("pmap_enter: toobig")); 7205 KASSERT(va < UPT_MIN_ADDRESS || va >= UPT_MAX_ADDRESS, 7206 ("pmap_enter: invalid to pmap_enter page table pages (va: 0x%lx)", 7207 va)); 7208 KASSERT((m->oflags & VPO_UNMANAGED) != 0 || !VA_IS_CLEANMAP(va), 7209 ("pmap_enter: managed mapping within the clean submap")); 7210 if ((m->oflags & VPO_UNMANAGED) == 0) 7211 VM_PAGE_OBJECT_BUSY_ASSERT(m); 7212 KASSERT((flags & PMAP_ENTER_RESERVED) == 0, 7213 ("pmap_enter: flags %u has reserved bits set", flags)); 7214 pa = VM_PAGE_TO_PHYS(m); 7215 newpte = (pt_entry_t)(pa | PG_A | PG_V); 7216 if ((flags & VM_PROT_WRITE) != 0) 7217 newpte |= PG_M; 7218 if ((prot & VM_PROT_WRITE) != 0) 7219 newpte |= PG_RW; 7220 KASSERT((newpte & (PG_M | PG_RW)) != PG_M, 7221 ("pmap_enter: flags includes VM_PROT_WRITE but prot doesn't")); 7222 if ((prot & VM_PROT_EXECUTE) == 0) 7223 newpte |= pg_nx; 7224 if ((flags & PMAP_ENTER_WIRED) != 0) 7225 newpte |= PG_W; 7226 if (va < VM_MAXUSER_ADDRESS) 7227 newpte |= PG_U; 7228 if (pmap == kernel_pmap) 7229 newpte |= PG_G; 7230 newpte |= pmap_cache_bits(pmap, m->md.pat_mode, psind > 0); 7231 7232 /* 7233 * Set modified bit gratuitously for writeable mappings if 7234 * the page is unmanaged. We do not want to take a fault 7235 * to do the dirty bit accounting for these mappings. 
7236 */ 7237 if ((m->oflags & VPO_UNMANAGED) != 0) { 7238 if ((newpte & PG_RW) != 0) 7239 newpte |= PG_M; 7240 } else 7241 newpte |= PG_MANAGED; 7242 7243 lock = NULL; 7244 PMAP_LOCK(pmap); 7245 if ((flags & PMAP_ENTER_LARGEPAGE) != 0) { 7246 KASSERT((m->oflags & VPO_UNMANAGED) != 0, 7247 ("managed largepage va %#lx flags %#x", va, flags)); 7248 rv = pmap_enter_largepage(pmap, va, newpte | PG_PS, flags, 7249 psind); 7250 goto out; 7251 } 7252 if (psind == 1) { 7253 /* Assert the required virtual and physical alignment. */ 7254 KASSERT((va & PDRMASK) == 0, ("pmap_enter: va unaligned")); 7255 KASSERT(m->psind > 0, ("pmap_enter: m->psind < psind")); 7256 rv = pmap_enter_pde(pmap, va, newpte | PG_PS, flags, m, &lock); 7257 goto out; 7258 } 7259 mpte = NULL; 7260 7261 /* 7262 * In the case that a page table page is not 7263 * resident, we are creating it here. 7264 */ 7265 retry: 7266 pde = pmap_pde(pmap, va); 7267 if (pde != NULL && (*pde & PG_V) != 0 && ((*pde & PG_PS) == 0 || 7268 pmap_demote_pde_locked(pmap, pde, va, &lock))) { 7269 pte = pmap_pde_to_pte(pde, va); 7270 if (va < VM_MAXUSER_ADDRESS && mpte == NULL) { 7271 mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME); 7272 mpte->ref_count++; 7273 } 7274 } else if (va < VM_MAXUSER_ADDRESS) { 7275 /* 7276 * Here if the pte page isn't mapped, or if it has been 7277 * deallocated. 7278 */ 7279 nosleep = (flags & PMAP_ENTER_NOSLEEP) != 0; 7280 mpte = pmap_allocpte_alloc(pmap, pmap_pde_pindex(va), 7281 nosleep ? NULL : &lock, va); 7282 if (mpte == NULL && nosleep) { 7283 rv = KERN_RESOURCE_SHORTAGE; 7284 goto out; 7285 } 7286 goto retry; 7287 } else 7288 panic("pmap_enter: invalid page directory va=%#lx", va); 7289 7290 origpte = *pte; 7291 pv = NULL; 7292 if (va < VM_MAXUSER_ADDRESS && pmap->pm_type == PT_X86) 7293 newpte |= pmap_pkru_get(pmap, va); 7294 7295 /* 7296 * Is the specified virtual address already mapped? 7297 */ 7298 if ((origpte & PG_V) != 0) { 7299 /* 7300 * Wiring change, just update stats. We don't worry about 7301 * wiring PT pages as they remain resident as long as there 7302 * are valid mappings in them. Hence, if a user page is wired, 7303 * the PT page will be also. 7304 */ 7305 if ((newpte & PG_W) != 0 && (origpte & PG_W) == 0) 7306 pmap->pm_stats.wired_count++; 7307 else if ((newpte & PG_W) == 0 && (origpte & PG_W) != 0) 7308 pmap->pm_stats.wired_count--; 7309 7310 /* 7311 * Remove the extra PT page reference. 7312 */ 7313 if (mpte != NULL) { 7314 mpte->ref_count--; 7315 KASSERT(mpte->ref_count > 0, 7316 ("pmap_enter: missing reference to page table page," 7317 " va: 0x%lx", va)); 7318 } 7319 7320 /* 7321 * Has the physical page changed? 7322 */ 7323 opa = origpte & PG_FRAME; 7324 if (opa == pa) { 7325 /* 7326 * No, might be a protection or wiring change. 7327 */ 7328 if ((origpte & PG_MANAGED) != 0 && 7329 (newpte & PG_RW) != 0) 7330 vm_page_aflag_set(m, PGA_WRITEABLE); 7331 if (((origpte ^ newpte) & ~(PG_M | PG_A)) == 0) 7332 goto unchanged; 7333 goto validate; 7334 } 7335 7336 /* 7337 * The physical page has changed. Temporarily invalidate 7338 * the mapping. This ensures that all threads sharing the 7339 * pmap keep a consistent view of the mapping, which is 7340 * necessary for the correct handling of COW faults. It 7341 * also permits reuse of the old mapping's PV entry, 7342 * avoiding an allocation. 7343 * 7344 * For consistency, handle unmanaged mappings the same way. 
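 * (The PV entry reuse mentioned above happens below: pmap_pvh_remove()
 * detaches the old page's pv entry and, when the new mapping is also
 * managed, that same entry, with pv_va unchanged, is reinserted on the
 * new page's pv list further down instead of calling get_pv_entry().)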
7345 */ 7346 origpte = pte_load_clear(pte); 7347 KASSERT((origpte & PG_FRAME) == opa, 7348 ("pmap_enter: unexpected pa update for %#lx", va)); 7349 if ((origpte & PG_MANAGED) != 0) { 7350 om = PHYS_TO_VM_PAGE(opa); 7351 7352 /* 7353 * The pmap lock is sufficient to synchronize with 7354 * concurrent calls to pmap_page_test_mappings() and 7355 * pmap_ts_referenced(). 7356 */ 7357 if ((origpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) 7358 vm_page_dirty(om); 7359 if ((origpte & PG_A) != 0) { 7360 pmap_invalidate_page(pmap, va); 7361 vm_page_aflag_set(om, PGA_REFERENCED); 7362 } 7363 CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, opa); 7364 pv = pmap_pvh_remove(&om->md, pmap, va); 7365 KASSERT(pv != NULL, 7366 ("pmap_enter: no PV entry for %#lx", va)); 7367 if ((newpte & PG_MANAGED) == 0) 7368 free_pv_entry(pmap, pv); 7369 if ((om->a.flags & PGA_WRITEABLE) != 0 && 7370 TAILQ_EMPTY(&om->md.pv_list) && 7371 ((om->flags & PG_FICTITIOUS) != 0 || 7372 TAILQ_EMPTY(&pa_to_pvh(opa)->pv_list))) 7373 vm_page_aflag_clear(om, PGA_WRITEABLE); 7374 } else { 7375 /* 7376 * Since this mapping is unmanaged, assume that PG_A 7377 * is set. 7378 */ 7379 pmap_invalidate_page(pmap, va); 7380 } 7381 origpte = 0; 7382 } else { 7383 /* 7384 * Increment the counters. 7385 */ 7386 if ((newpte & PG_W) != 0) 7387 pmap->pm_stats.wired_count++; 7388 pmap_resident_count_adj(pmap, 1); 7389 } 7390 7391 /* 7392 * Enter on the PV list if part of our managed memory. 7393 */ 7394 if ((newpte & PG_MANAGED) != 0) { 7395 if (pv == NULL) { 7396 pv = get_pv_entry(pmap, &lock); 7397 pv->pv_va = va; 7398 } 7399 CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, pa); 7400 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 7401 m->md.pv_gen++; 7402 if ((newpte & PG_RW) != 0) 7403 vm_page_aflag_set(m, PGA_WRITEABLE); 7404 } 7405 7406 /* 7407 * Update the PTE. 7408 */ 7409 if ((origpte & PG_V) != 0) { 7410 validate: 7411 origpte = pte_load_store(pte, newpte); 7412 KASSERT((origpte & PG_FRAME) == pa, 7413 ("pmap_enter: unexpected pa update for %#lx", va)); 7414 if ((newpte & PG_M) == 0 && (origpte & (PG_M | PG_RW)) == 7415 (PG_M | PG_RW)) { 7416 if ((origpte & PG_MANAGED) != 0) 7417 vm_page_dirty(m); 7418 7419 /* 7420 * Although the PTE may still have PG_RW set, TLB 7421 * invalidation may nonetheless be required because 7422 * the PTE no longer has PG_M set. 7423 */ 7424 } else if ((origpte & PG_NX) != 0 || (newpte & PG_NX) == 0) { 7425 /* 7426 * This PTE change does not require TLB invalidation. 7427 */ 7428 goto unchanged; 7429 } 7430 if ((origpte & PG_A) != 0) 7431 pmap_invalidate_page(pmap, va); 7432 } else 7433 pte_store(pte, newpte); 7434 7435 unchanged: 7436 7437 #if VM_NRESERVLEVEL > 0 7438 /* 7439 * If both the page table page and the reservation are fully 7440 * populated, then attempt promotion. 7441 */ 7442 if ((mpte == NULL || mpte->ref_count == NPTEPG) && 7443 (m->flags & PG_FICTITIOUS) == 0 && 7444 vm_reserv_level_iffullpop(m) == 0) 7445 (void)pmap_promote_pde(pmap, pde, va, mpte, &lock); 7446 #endif 7447 7448 rv = KERN_SUCCESS; 7449 out: 7450 if (lock != NULL) 7451 rw_wunlock(lock); 7452 PMAP_UNLOCK(pmap); 7453 return (rv); 7454 } 7455 7456 /* 7457 * Tries to create a read- and/or execute-only 2MB page mapping. Returns 7458 * KERN_SUCCESS if the mapping was created. Otherwise, returns an error 7459 * value. See pmap_enter_pde() for the possible error values when "no sleep", 7460 * "no replace", and "no reclaim" are specified. 
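 *
 * As a rough illustration (mirroring pmap_enter_object() below), a
 * caller typically treats KERN_SUCCESS and KERN_NO_SPACE alike,
 * since both mean the 2MB range needs no further work, and falls
 * back to 4KB mappings on any other error:
 *
 *	rv = pmap_enter_2mpage(pmap, va, m, prot, &lock);
 *	if (rv == KERN_SUCCESS || rv == KERN_NO_SPACE)
 *		m = &m[NBPDR / PAGE_SIZE - 1];
 *	else
 *		mpte = pmap_enter_quick_locked(pmap, va, m, prot, mpte,
 *		    &lock);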
7461 */ 7462 static int 7463 pmap_enter_2mpage(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, 7464 struct rwlock **lockp) 7465 { 7466 pd_entry_t newpde; 7467 pt_entry_t PG_V; 7468 7469 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 7470 PG_V = pmap_valid_bit(pmap); 7471 newpde = VM_PAGE_TO_PHYS(m) | 7472 pmap_cache_bits(pmap, m->md.pat_mode, true) | PG_PS | PG_V; 7473 if ((m->oflags & VPO_UNMANAGED) == 0) 7474 newpde |= PG_MANAGED; 7475 if ((prot & VM_PROT_EXECUTE) == 0) 7476 newpde |= pg_nx; 7477 if (va < VM_MAXUSER_ADDRESS) 7478 newpde |= PG_U; 7479 return (pmap_enter_pde(pmap, va, newpde, PMAP_ENTER_NOSLEEP | 7480 PMAP_ENTER_NOREPLACE | PMAP_ENTER_NORECLAIM, NULL, lockp)); 7481 } 7482 7483 /* 7484 * Returns true if every page table entry in the specified page table page is 7485 * zero. 7486 */ 7487 static bool 7488 pmap_every_pte_zero(vm_paddr_t pa) 7489 { 7490 pt_entry_t *pt_end, *pte; 7491 7492 KASSERT((pa & PAGE_MASK) == 0, ("pa is misaligned")); 7493 pte = (pt_entry_t *)PHYS_TO_DMAP(pa); 7494 for (pt_end = pte + NPTEPG; pte < pt_end; pte++) { 7495 if (*pte != 0) 7496 return (false); 7497 } 7498 return (true); 7499 } 7500 7501 /* 7502 * Tries to create the specified 2MB page mapping. Returns KERN_SUCCESS if 7503 * the mapping was created, and one of KERN_FAILURE, KERN_NO_SPACE, 7504 * KERN_PROTECTION_FAILURE, or KERN_RESOURCE_SHORTAGE otherwise. Returns 7505 * KERN_FAILURE if either (1) PMAP_ENTER_NOREPLACE was specified and a 4KB 7506 * page mapping already exists within the 2MB virtual address range starting 7507 * at the specified virtual address or (2) the requested 2MB page mapping is 7508 * not supported due to hardware errata. Returns KERN_NO_SPACE if 7509 * PMAP_ENTER_NOREPLACE was specified and a 2MB page mapping already exists at 7510 * the specified virtual address. Returns KERN_PROTECTION_FAILURE if the PKRU 7511 * settings are not the same across the 2MB virtual address range starting at 7512 * the specified virtual address. Returns KERN_RESOURCE_SHORTAGE if either 7513 * (1) PMAP_ENTER_NOSLEEP was specified and a page table page allocation 7514 * failed or (2) PMAP_ENTER_NORECLAIM was specified and a PV entry allocation 7515 * failed. 7516 * 7517 * The parameter "m" is only used when creating a managed, writeable mapping. 7518 */ 7519 static int 7520 pmap_enter_pde(pmap_t pmap, vm_offset_t va, pd_entry_t newpde, u_int flags, 7521 vm_page_t m, struct rwlock **lockp) 7522 { 7523 struct spglist free; 7524 pd_entry_t oldpde, *pde; 7525 pt_entry_t PG_G, PG_RW, PG_V; 7526 vm_page_t mt, pdpg; 7527 vm_page_t uwptpg; 7528 7529 PG_G = pmap_global_bit(pmap); 7530 PG_RW = pmap_rw_bit(pmap); 7531 KASSERT((newpde & (pmap_modified_bit(pmap) | PG_RW)) != PG_RW, 7532 ("pmap_enter_pde: newpde is missing PG_M")); 7533 PG_V = pmap_valid_bit(pmap); 7534 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 7535 7536 if (!pmap_allow_2m_x_page(pmap, pmap_pde_ept_executable(pmap, 7537 newpde))) { 7538 CTR2(KTR_PMAP, "pmap_enter_pde: 2m x blocked for va %#lx" 7539 " in pmap %p", va, pmap); 7540 return (KERN_FAILURE); 7541 } 7542 if ((pde = pmap_alloc_pde(pmap, va, &pdpg, (flags & 7543 PMAP_ENTER_NOSLEEP) != 0 ? NULL : lockp)) == NULL) { 7544 CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx" 7545 " in pmap %p", va, pmap); 7546 return (KERN_RESOURCE_SHORTAGE); 7547 } 7548 7549 /* 7550 * If pkru is not same for the whole pde range, return failure 7551 * and let vm_fault() cope. Check after pde allocation, since 7552 * it could sleep. 
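 *
 * For example, if only part of [va, va + NBPDR) has been assigned a
 * protection key through the pmap's PKRU ranges, no single PDE can
 * express the mapping, so KERN_PROTECTION_FAILURE is returned and
 * vm_fault() will create 4KB mappings instead.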
7553 */ 7554 if (!pmap_pkru_same(pmap, va, va + NBPDR, &newpde)) { 7555 pmap_abort_ptp(pmap, va, pdpg); 7556 return (KERN_PROTECTION_FAILURE); 7557 } 7558 7559 /* 7560 * If there are existing mappings, either abort or remove them. 7561 */ 7562 oldpde = *pde; 7563 if ((oldpde & PG_V) != 0) { 7564 KASSERT(pdpg == NULL || pdpg->ref_count > 1, 7565 ("pmap_enter_pde: pdpg's reference count is too low")); 7566 if ((flags & PMAP_ENTER_NOREPLACE) != 0) { 7567 if ((oldpde & PG_PS) != 0) { 7568 if (pdpg != NULL) 7569 pdpg->ref_count--; 7570 CTR2(KTR_PMAP, 7571 "pmap_enter_pde: no space for va %#lx" 7572 " in pmap %p", va, pmap); 7573 return (KERN_NO_SPACE); 7574 } else if (va < VM_MAXUSER_ADDRESS || 7575 !pmap_every_pte_zero(oldpde & PG_FRAME)) { 7576 if (pdpg != NULL) 7577 pdpg->ref_count--; 7578 CTR2(KTR_PMAP, 7579 "pmap_enter_pde: failure for va %#lx" 7580 " in pmap %p", va, pmap); 7581 return (KERN_FAILURE); 7582 } 7583 } 7584 /* Break the existing mapping(s). */ 7585 SLIST_INIT(&free); 7586 if ((oldpde & PG_PS) != 0) { 7587 /* 7588 * The reference to the PD page that was acquired by 7589 * pmap_alloc_pde() ensures that it won't be freed. 7590 * However, if the PDE resulted from a promotion, then 7591 * a reserved PT page could be freed. 7592 */ 7593 (void)pmap_remove_pde(pmap, pde, va, &free, lockp); 7594 if ((oldpde & PG_G) == 0) 7595 pmap_invalidate_pde_page(pmap, va, oldpde); 7596 } else { 7597 pmap_delayed_invl_start(); 7598 if (pmap_remove_ptes(pmap, va, va + NBPDR, pde, &free, 7599 lockp)) 7600 pmap_invalidate_all(pmap); 7601 pmap_delayed_invl_finish(); 7602 } 7603 if (va < VM_MAXUSER_ADDRESS) { 7604 vm_page_free_pages_toq(&free, true); 7605 KASSERT(*pde == 0, ("pmap_enter_pde: non-zero pde %p", 7606 pde)); 7607 } else { 7608 KASSERT(SLIST_EMPTY(&free), 7609 ("pmap_enter_pde: freed kernel page table page")); 7610 7611 /* 7612 * Both pmap_remove_pde() and pmap_remove_ptes() will 7613 * leave the kernel page table page zero filled. 7614 */ 7615 mt = PHYS_TO_VM_PAGE(*pde & PG_FRAME); 7616 if (pmap_insert_pt_page(pmap, mt, false, false)) 7617 panic("pmap_enter_pde: trie insert failed"); 7618 } 7619 } 7620 7621 /* 7622 * Allocate leaf ptpage for wired userspace pages. 7623 */ 7624 uwptpg = NULL; 7625 if ((newpde & PG_W) != 0 && pmap != kernel_pmap) { 7626 uwptpg = pmap_alloc_pt_page(pmap, pmap_pde_pindex(va), 7627 VM_ALLOC_WIRED); 7628 if (uwptpg == NULL) { 7629 pmap_abort_ptp(pmap, va, pdpg); 7630 return (KERN_RESOURCE_SHORTAGE); 7631 } 7632 if (pmap_insert_pt_page(pmap, uwptpg, true, false)) { 7633 pmap_free_pt_page(pmap, uwptpg, false); 7634 pmap_abort_ptp(pmap, va, pdpg); 7635 return (KERN_RESOURCE_SHORTAGE); 7636 } 7637 7638 uwptpg->ref_count = NPTEPG; 7639 } 7640 if ((newpde & PG_MANAGED) != 0) { 7641 /* 7642 * Abort this mapping if its PV entry could not be created. 7643 */ 7644 if (!pmap_pv_insert_pde(pmap, va, newpde, flags, lockp)) { 7645 if (pdpg != NULL) 7646 pmap_abort_ptp(pmap, va, pdpg); 7647 if (uwptpg != NULL) { 7648 mt = pmap_remove_pt_page(pmap, va); 7649 KASSERT(mt == uwptpg, 7650 ("removed pt page %p, expected %p", mt, 7651 uwptpg)); 7652 uwptpg->ref_count = 1; 7653 pmap_free_pt_page(pmap, uwptpg, false); 7654 } 7655 CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx" 7656 " in pmap %p", va, pmap); 7657 return (KERN_RESOURCE_SHORTAGE); 7658 } 7659 if ((newpde & PG_RW) != 0) { 7660 for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++) 7661 vm_page_aflag_set(mt, PGA_WRITEABLE); 7662 } 7663 } 7664 7665 /* 7666 * Increment counters. 
7667 */ 7668 if ((newpde & PG_W) != 0) 7669 pmap->pm_stats.wired_count += NBPDR / PAGE_SIZE; 7670 pmap_resident_count_adj(pmap, NBPDR / PAGE_SIZE); 7671 7672 /* 7673 * Map the superpage. (This is not a promoted mapping; there will not 7674 * be any lingering 4KB page mappings in the TLB.) 7675 */ 7676 pde_store(pde, newpde); 7677 7678 counter_u64_add(pmap_pde_mappings, 1); 7679 CTR2(KTR_PMAP, "pmap_enter_pde: success for va %#lx in pmap %p", 7680 va, pmap); 7681 return (KERN_SUCCESS); 7682 } 7683 7684 /* 7685 * Maps a sequence of resident pages belonging to the same object. 7686 * The sequence begins with the given page m_start. This page is 7687 * mapped at the given virtual address start. Each subsequent page is 7688 * mapped at a virtual address that is offset from start by the same 7689 * amount as the page is offset from m_start within the object. The 7690 * last page in the sequence is the page with the largest offset from 7691 * m_start that can be mapped at a virtual address less than the given 7692 * virtual address end. Not every virtual page between start and end 7693 * is mapped; only those for which a resident page exists with the 7694 * corresponding offset from m_start are mapped. 7695 */ 7696 void 7697 pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end, 7698 vm_page_t m_start, vm_prot_t prot) 7699 { 7700 struct rwlock *lock; 7701 vm_offset_t va; 7702 vm_page_t m, mpte; 7703 vm_pindex_t diff, psize; 7704 int rv; 7705 7706 VM_OBJECT_ASSERT_LOCKED(m_start->object); 7707 7708 psize = atop(end - start); 7709 mpte = NULL; 7710 m = m_start; 7711 lock = NULL; 7712 PMAP_LOCK(pmap); 7713 while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) { 7714 va = start + ptoa(diff); 7715 if ((va & PDRMASK) == 0 && va + NBPDR <= end && 7716 m->psind == 1 && pmap_ps_enabled(pmap) && 7717 ((rv = pmap_enter_2mpage(pmap, va, m, prot, &lock)) == 7718 KERN_SUCCESS || rv == KERN_NO_SPACE)) 7719 m = &m[NBPDR / PAGE_SIZE - 1]; 7720 else 7721 mpte = pmap_enter_quick_locked(pmap, va, m, prot, 7722 mpte, &lock); 7723 m = TAILQ_NEXT(m, listq); 7724 } 7725 if (lock != NULL) 7726 rw_wunlock(lock); 7727 PMAP_UNLOCK(pmap); 7728 } 7729 7730 /* 7731 * this code makes some *MAJOR* assumptions: 7732 * 1. Current pmap & pmap exists. 7733 * 2. Not wired. 7734 * 3. Read access. 7735 * 4. No page table pages. 7736 * but is *MUCH* faster than pmap_enter... 7737 */ 7738 7739 void 7740 pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot) 7741 { 7742 struct rwlock *lock; 7743 7744 lock = NULL; 7745 PMAP_LOCK(pmap); 7746 (void)pmap_enter_quick_locked(pmap, va, m, prot, NULL, &lock); 7747 if (lock != NULL) 7748 rw_wunlock(lock); 7749 PMAP_UNLOCK(pmap); 7750 } 7751 7752 static vm_page_t 7753 pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, 7754 vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp) 7755 { 7756 pd_entry_t *pde; 7757 pt_entry_t newpte, *pte, PG_V; 7758 7759 KASSERT(!VA_IS_CLEANMAP(va) || 7760 (m->oflags & VPO_UNMANAGED) != 0, 7761 ("pmap_enter_quick_locked: managed mapping within the clean submap")); 7762 PG_V = pmap_valid_bit(pmap); 7763 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 7764 pde = NULL; 7765 7766 /* 7767 * In the case that a page table page is not 7768 * resident, we are creating it here. 
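 *
 * Every failure on this path is benign: if a 1GB or 2MB mapping
 * already covers "va", if the page table page cannot be allocated
 * without sleeping, if a 4KB mapping is already present, or if no PV
 * entry can be allocated, we simply return NULL and the caller loses
 * nothing more than a speculative (prefault-style) mapping.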
7769 */ 7770 if (va < VM_MAXUSER_ADDRESS) { 7771 pdp_entry_t *pdpe; 7772 vm_pindex_t ptepindex; 7773 7774 /* 7775 * Calculate pagetable page index 7776 */ 7777 ptepindex = pmap_pde_pindex(va); 7778 if (mpte && (mpte->pindex == ptepindex)) { 7779 mpte->ref_count++; 7780 } else { 7781 /* 7782 * If the page table page is mapped, we just increment 7783 * the hold count, and activate it. Otherwise, we 7784 * attempt to allocate a page table page, passing NULL 7785 * instead of the PV list lock pointer because we don't 7786 * intend to sleep. If this attempt fails, we don't 7787 * retry. Instead, we give up. 7788 */ 7789 pdpe = pmap_pdpe(pmap, va); 7790 if (pdpe != NULL && (*pdpe & PG_V) != 0) { 7791 if ((*pdpe & PG_PS) != 0) 7792 return (NULL); 7793 pde = pmap_pdpe_to_pde(pdpe, va); 7794 if ((*pde & PG_V) != 0) { 7795 if ((*pde & PG_PS) != 0) 7796 return (NULL); 7797 mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME); 7798 mpte->ref_count++; 7799 } else { 7800 mpte = pmap_allocpte_alloc(pmap, 7801 ptepindex, NULL, va); 7802 if (mpte == NULL) 7803 return (NULL); 7804 } 7805 } else { 7806 mpte = pmap_allocpte_alloc(pmap, ptepindex, 7807 NULL, va); 7808 if (mpte == NULL) 7809 return (NULL); 7810 } 7811 } 7812 pte = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpte)); 7813 pte = &pte[pmap_pte_index(va)]; 7814 } else { 7815 mpte = NULL; 7816 pte = vtopte(va); 7817 } 7818 if (*pte) { 7819 if (mpte != NULL) 7820 mpte->ref_count--; 7821 return (NULL); 7822 } 7823 7824 /* 7825 * Enter on the PV list if part of our managed memory. 7826 */ 7827 if ((m->oflags & VPO_UNMANAGED) == 0 && 7828 !pmap_try_insert_pv_entry(pmap, va, m, lockp)) { 7829 if (mpte != NULL) 7830 pmap_abort_ptp(pmap, va, mpte); 7831 return (NULL); 7832 } 7833 7834 /* 7835 * Increment counters 7836 */ 7837 pmap_resident_count_adj(pmap, 1); 7838 7839 newpte = VM_PAGE_TO_PHYS(m) | PG_V | 7840 pmap_cache_bits(pmap, m->md.pat_mode, false); 7841 if ((m->oflags & VPO_UNMANAGED) == 0) 7842 newpte |= PG_MANAGED; 7843 if ((prot & VM_PROT_EXECUTE) == 0) 7844 newpte |= pg_nx; 7845 if (va < VM_MAXUSER_ADDRESS) 7846 newpte |= PG_U | pmap_pkru_get(pmap, va); 7847 pte_store(pte, newpte); 7848 7849 #if VM_NRESERVLEVEL > 0 7850 /* 7851 * If both the PTP and the reservation are fully populated, then 7852 * attempt promotion. 7853 */ 7854 if ((prot & VM_PROT_NO_PROMOTE) == 0 && 7855 (mpte == NULL || mpte->ref_count == NPTEPG) && 7856 (m->flags & PG_FICTITIOUS) == 0 && 7857 vm_reserv_level_iffullpop(m) == 0) { 7858 if (pde == NULL) 7859 pde = pmap_pde(pmap, va); 7860 7861 /* 7862 * If promotion succeeds, then the next call to this function 7863 * should not be given the unmapped PTP as a hint. 7864 */ 7865 if (pmap_promote_pde(pmap, pde, va, mpte, lockp)) 7866 mpte = NULL; 7867 } 7868 #endif 7869 7870 return (mpte); 7871 } 7872 7873 /* 7874 * Make a temporary mapping for a physical address. This is only intended 7875 * to be used for panic dumps. 7876 */ 7877 void * 7878 pmap_kenter_temporary(vm_paddr_t pa, int i) 7879 { 7880 vm_offset_t va; 7881 7882 va = (vm_offset_t)crashdumpmap + (i * PAGE_SIZE); 7883 pmap_kenter(va, pa); 7884 pmap_invlpg(kernel_pmap, va); 7885 return ((void *)crashdumpmap); 7886 } 7887 7888 /* 7889 * This code maps large physical mmap regions into the 7890 * processor address space. Note that some shortcuts 7891 * are taken, but the code works. 
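 *
 * In outline, the 2MB fast path below applies only when, among other
 * checks, all of the following hold; otherwise the function silently
 * returns and the range is instead populated by ordinary 4KB faults
 * later:
 *
 *	(addr & (NBPDR - 1)) == 0 && (size & (NBPDR - 1)) == 0
 *	pmap_ps_enabled(pmap)
 *	the first page's physical address is 2MB aligned
 *	the remaining pages are physically contiguous and share
 *	    one pat_mode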
7892 */ 7893 void 7894 pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object, 7895 vm_pindex_t pindex, vm_size_t size) 7896 { 7897 pd_entry_t *pde; 7898 pt_entry_t PG_A, PG_M, PG_RW, PG_V; 7899 vm_paddr_t pa, ptepa; 7900 vm_page_t p, pdpg; 7901 int pat_mode; 7902 7903 PG_A = pmap_accessed_bit(pmap); 7904 PG_M = pmap_modified_bit(pmap); 7905 PG_V = pmap_valid_bit(pmap); 7906 PG_RW = pmap_rw_bit(pmap); 7907 7908 VM_OBJECT_ASSERT_WLOCKED(object); 7909 KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG, 7910 ("pmap_object_init_pt: non-device object")); 7911 if ((addr & (NBPDR - 1)) == 0 && (size & (NBPDR - 1)) == 0) { 7912 if (!pmap_ps_enabled(pmap)) 7913 return; 7914 if (!vm_object_populate(object, pindex, pindex + atop(size))) 7915 return; 7916 p = vm_page_lookup(object, pindex); 7917 KASSERT(vm_page_all_valid(p), 7918 ("pmap_object_init_pt: invalid page %p", p)); 7919 pat_mode = p->md.pat_mode; 7920 7921 /* 7922 * Abort the mapping if the first page is not physically 7923 * aligned to a 2MB page boundary. 7924 */ 7925 ptepa = VM_PAGE_TO_PHYS(p); 7926 if (ptepa & (NBPDR - 1)) 7927 return; 7928 7929 /* 7930 * Skip the first page. Abort the mapping if the rest of 7931 * the pages are not physically contiguous or have differing 7932 * memory attributes. 7933 */ 7934 p = TAILQ_NEXT(p, listq); 7935 for (pa = ptepa + PAGE_SIZE; pa < ptepa + size; 7936 pa += PAGE_SIZE) { 7937 KASSERT(vm_page_all_valid(p), 7938 ("pmap_object_init_pt: invalid page %p", p)); 7939 if (pa != VM_PAGE_TO_PHYS(p) || 7940 pat_mode != p->md.pat_mode) 7941 return; 7942 p = TAILQ_NEXT(p, listq); 7943 } 7944 7945 /* 7946 * Map using 2MB pages. Since "ptepa" is 2M aligned and 7947 * "size" is a multiple of 2M, adding the PAT setting to "pa" 7948 * will not affect the termination of this loop. 7949 */ 7950 PMAP_LOCK(pmap); 7951 for (pa = ptepa | pmap_cache_bits(pmap, pat_mode, true); 7952 pa < ptepa + size; pa += NBPDR) { 7953 pde = pmap_alloc_pde(pmap, addr, &pdpg, NULL); 7954 if (pde == NULL) { 7955 /* 7956 * The creation of mappings below is only an 7957 * optimization. If a page directory page 7958 * cannot be allocated without blocking, 7959 * continue on to the next mapping rather than 7960 * blocking. 7961 */ 7962 addr += NBPDR; 7963 continue; 7964 } 7965 if ((*pde & PG_V) == 0) { 7966 pde_store(pde, pa | PG_PS | PG_M | PG_A | 7967 PG_U | PG_RW | PG_V); 7968 pmap_resident_count_adj(pmap, NBPDR / PAGE_SIZE); 7969 counter_u64_add(pmap_pde_mappings, 1); 7970 } else { 7971 /* Continue on if the PDE is already valid. */ 7972 pdpg->ref_count--; 7973 KASSERT(pdpg->ref_count > 0, 7974 ("pmap_object_init_pt: missing reference " 7975 "to page directory page, va: 0x%lx", addr)); 7976 } 7977 addr += NBPDR; 7978 } 7979 PMAP_UNLOCK(pmap); 7980 } 7981 } 7982 7983 /* 7984 * Clear the wired attribute from the mappings for the specified range of 7985 * addresses in the given pmap. Every valid mapping within that range 7986 * must have the wired attribute set. In contrast, invalid mappings 7987 * cannot have the wired attribute set, so they are ignored. 7988 * 7989 * The wired attribute of the page table entry is not a hardware 7990 * feature, so there is no need to invalidate any TLB entries. 7991 * Since pmap_demote_pde() for the wired entry must never fail, 7992 * pmap_delayed_invl_start()/finish() calls around the 7993 * function are not needed. 
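 *
 * For example, unwiring only [va, va + PAGE_SIZE) out of a wired 2MB
 * mapping does not unwire the whole superpage: the PDE is demoted
 * (which must succeed for a wired mapping) and only the covered 4KB
 * PTE has PG_W cleared, so pm_stats.wired_count drops by one rather
 * than by NBPDR / PAGE_SIZE.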
7994 */ 7995 void 7996 pmap_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 7997 { 7998 vm_offset_t va_next; 7999 pml4_entry_t *pml4e; 8000 pdp_entry_t *pdpe; 8001 pd_entry_t *pde; 8002 pt_entry_t *pte, PG_V, PG_G __diagused; 8003 8004 PG_V = pmap_valid_bit(pmap); 8005 PG_G = pmap_global_bit(pmap); 8006 PMAP_LOCK(pmap); 8007 for (; sva < eva; sva = va_next) { 8008 pml4e = pmap_pml4e(pmap, sva); 8009 if (pml4e == NULL || (*pml4e & PG_V) == 0) { 8010 va_next = (sva + NBPML4) & ~PML4MASK; 8011 if (va_next < sva) 8012 va_next = eva; 8013 continue; 8014 } 8015 8016 va_next = (sva + NBPDP) & ~PDPMASK; 8017 if (va_next < sva) 8018 va_next = eva; 8019 pdpe = pmap_pml4e_to_pdpe(pml4e, sva); 8020 if ((*pdpe & PG_V) == 0) 8021 continue; 8022 if ((*pdpe & PG_PS) != 0) { 8023 KASSERT(va_next <= eva, 8024 ("partial update of non-transparent 1G mapping " 8025 "pdpe %#lx sva %#lx eva %#lx va_next %#lx", 8026 *pdpe, sva, eva, va_next)); 8027 MPASS(pmap != kernel_pmap); /* XXXKIB */ 8028 MPASS((*pdpe & (PG_MANAGED | PG_G)) == 0); 8029 atomic_clear_long(pdpe, PG_W); 8030 pmap->pm_stats.wired_count -= NBPDP / PAGE_SIZE; 8031 continue; 8032 } 8033 8034 va_next = (sva + NBPDR) & ~PDRMASK; 8035 if (va_next < sva) 8036 va_next = eva; 8037 pde = pmap_pdpe_to_pde(pdpe, sva); 8038 if ((*pde & PG_V) == 0) 8039 continue; 8040 if ((*pde & PG_PS) != 0) { 8041 if ((*pde & PG_W) == 0) 8042 panic("pmap_unwire: pde %#jx is missing PG_W", 8043 (uintmax_t)*pde); 8044 8045 /* 8046 * Are we unwiring the entire large page? If not, 8047 * demote the mapping and fall through. 8048 */ 8049 if (sva + NBPDR == va_next && eva >= va_next) { 8050 atomic_clear_long(pde, PG_W); 8051 pmap->pm_stats.wired_count -= NBPDR / 8052 PAGE_SIZE; 8053 continue; 8054 } else if (!pmap_demote_pde(pmap, pde, sva)) 8055 panic("pmap_unwire: demotion failed"); 8056 } 8057 if (va_next > eva) 8058 va_next = eva; 8059 for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++, 8060 sva += PAGE_SIZE) { 8061 if ((*pte & PG_V) == 0) 8062 continue; 8063 if ((*pte & PG_W) == 0) 8064 panic("pmap_unwire: pte %#jx is missing PG_W", 8065 (uintmax_t)*pte); 8066 8067 /* 8068 * PG_W must be cleared atomically. Although the pmap 8069 * lock synchronizes access to PG_W, another processor 8070 * could be setting PG_M and/or PG_A concurrently. 8071 */ 8072 atomic_clear_long(pte, PG_W); 8073 pmap->pm_stats.wired_count--; 8074 } 8075 } 8076 PMAP_UNLOCK(pmap); 8077 } 8078 8079 /* 8080 * Copy the range specified by src_addr/len 8081 * from the source map to the range dst_addr/len 8082 * in the destination map. 8083 * 8084 * This routine is only advisory and need not do anything. 8085 */ 8086 void 8087 pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len, 8088 vm_offset_t src_addr) 8089 { 8090 struct rwlock *lock; 8091 pml4_entry_t *pml4e; 8092 pdp_entry_t *pdpe; 8093 pd_entry_t *pde, srcptepaddr; 8094 pt_entry_t *dst_pte, PG_A, PG_M, PG_V, ptetemp, *src_pte; 8095 vm_offset_t addr, end_addr, va_next; 8096 vm_page_t dst_pdpg, dstmpte, srcmpte; 8097 8098 if (dst_addr != src_addr) 8099 return; 8100 8101 if (dst_pmap->pm_type != src_pmap->pm_type) 8102 return; 8103 8104 /* 8105 * EPT page table entries that require emulation of A/D bits are 8106 * sensitive to clearing the PG_A bit (aka EPT_PG_READ). Although 8107 * we clear PG_M (aka EPT_PG_WRITE) concomitantly, the PG_U bit 8108 * (aka EPT_PG_EXECUTE) could still be set. 
Since some EPT 8109 * implementations flag an EPT misconfiguration for exec-only 8110 * mappings we skip this function entirely for emulated pmaps. 8111 */ 8112 if (pmap_emulate_ad_bits(dst_pmap)) 8113 return; 8114 8115 end_addr = src_addr + len; 8116 lock = NULL; 8117 if (dst_pmap < src_pmap) { 8118 PMAP_LOCK(dst_pmap); 8119 PMAP_LOCK(src_pmap); 8120 } else { 8121 PMAP_LOCK(src_pmap); 8122 PMAP_LOCK(dst_pmap); 8123 } 8124 8125 PG_A = pmap_accessed_bit(dst_pmap); 8126 PG_M = pmap_modified_bit(dst_pmap); 8127 PG_V = pmap_valid_bit(dst_pmap); 8128 8129 for (addr = src_addr; addr < end_addr; addr = va_next) { 8130 KASSERT(addr < UPT_MIN_ADDRESS, 8131 ("pmap_copy: invalid to pmap_copy page tables")); 8132 8133 pml4e = pmap_pml4e(src_pmap, addr); 8134 if (pml4e == NULL || (*pml4e & PG_V) == 0) { 8135 va_next = (addr + NBPML4) & ~PML4MASK; 8136 if (va_next < addr) 8137 va_next = end_addr; 8138 continue; 8139 } 8140 8141 va_next = (addr + NBPDP) & ~PDPMASK; 8142 if (va_next < addr) 8143 va_next = end_addr; 8144 pdpe = pmap_pml4e_to_pdpe(pml4e, addr); 8145 if ((*pdpe & PG_V) == 0) 8146 continue; 8147 if ((*pdpe & PG_PS) != 0) { 8148 KASSERT(va_next <= end_addr, 8149 ("partial update of non-transparent 1G mapping " 8150 "pdpe %#lx sva %#lx eva %#lx va_next %#lx", 8151 *pdpe, addr, end_addr, va_next)); 8152 MPASS((addr & PDPMASK) == 0); 8153 MPASS((*pdpe & PG_MANAGED) == 0); 8154 srcptepaddr = *pdpe; 8155 pdpe = pmap_pdpe(dst_pmap, addr); 8156 if (pdpe == NULL) { 8157 if (pmap_allocpte_alloc(dst_pmap, 8158 pmap_pml4e_pindex(addr), NULL, addr) == 8159 NULL) 8160 break; 8161 pdpe = pmap_pdpe(dst_pmap, addr); 8162 } else { 8163 pml4e = pmap_pml4e(dst_pmap, addr); 8164 dst_pdpg = PHYS_TO_VM_PAGE(*pml4e & PG_FRAME); 8165 dst_pdpg->ref_count++; 8166 } 8167 KASSERT(*pdpe == 0, 8168 ("1G mapping present in dst pmap " 8169 "pdpe %#lx sva %#lx eva %#lx va_next %#lx", 8170 *pdpe, addr, end_addr, va_next)); 8171 *pdpe = srcptepaddr & ~PG_W; 8172 pmap_resident_count_adj(dst_pmap, NBPDP / PAGE_SIZE); 8173 continue; 8174 } 8175 8176 va_next = (addr + NBPDR) & ~PDRMASK; 8177 if (va_next < addr) 8178 va_next = end_addr; 8179 8180 pde = pmap_pdpe_to_pde(pdpe, addr); 8181 srcptepaddr = *pde; 8182 if (srcptepaddr == 0) 8183 continue; 8184 8185 if (srcptepaddr & PG_PS) { 8186 /* 8187 * We can only virtual copy whole superpages. 8188 */ 8189 if ((addr & PDRMASK) != 0 || addr + NBPDR > end_addr) 8190 continue; 8191 pde = pmap_alloc_pde(dst_pmap, addr, &dst_pdpg, NULL); 8192 if (pde == NULL) 8193 break; 8194 if (*pde == 0 && ((srcptepaddr & PG_MANAGED) == 0 || 8195 pmap_pv_insert_pde(dst_pmap, addr, srcptepaddr, 8196 PMAP_ENTER_NORECLAIM, &lock))) { 8197 /* 8198 * We leave the dirty bit unchanged because 8199 * managed read/write superpage mappings are 8200 * required to be dirty. However, managed 8201 * superpage mappings are not required to 8202 * have their accessed bit set, so we clear 8203 * it because we don't know if this mapping 8204 * will be used. 
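 *
 * One visible consequence, e.g. after an address space is duplicated
 * for fork(), is that the new superpage mapping starts out "not
 * referenced" for page-aging purposes even though the dirty state of
 * the underlying pages is preserved.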
8205 */ 8206 srcptepaddr &= ~PG_W; 8207 if ((srcptepaddr & PG_MANAGED) != 0) 8208 srcptepaddr &= ~PG_A; 8209 *pde = srcptepaddr; 8210 pmap_resident_count_adj(dst_pmap, NBPDR / 8211 PAGE_SIZE); 8212 counter_u64_add(pmap_pde_mappings, 1); 8213 } else 8214 pmap_abort_ptp(dst_pmap, addr, dst_pdpg); 8215 continue; 8216 } 8217 8218 srcptepaddr &= PG_FRAME; 8219 srcmpte = PHYS_TO_VM_PAGE(srcptepaddr); 8220 KASSERT(srcmpte->ref_count > 0, 8221 ("pmap_copy: source page table page is unused")); 8222 8223 if (va_next > end_addr) 8224 va_next = end_addr; 8225 8226 src_pte = (pt_entry_t *)PHYS_TO_DMAP(srcptepaddr); 8227 src_pte = &src_pte[pmap_pte_index(addr)]; 8228 dstmpte = NULL; 8229 for (; addr < va_next; addr += PAGE_SIZE, src_pte++) { 8230 ptetemp = *src_pte; 8231 8232 /* 8233 * We only virtual copy managed pages. 8234 */ 8235 if ((ptetemp & PG_MANAGED) == 0) 8236 continue; 8237 8238 if (dstmpte != NULL) { 8239 KASSERT(dstmpte->pindex == 8240 pmap_pde_pindex(addr), 8241 ("dstmpte pindex/addr mismatch")); 8242 dstmpte->ref_count++; 8243 } else if ((dstmpte = pmap_allocpte(dst_pmap, addr, 8244 NULL)) == NULL) 8245 goto out; 8246 dst_pte = (pt_entry_t *) 8247 PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dstmpte)); 8248 dst_pte = &dst_pte[pmap_pte_index(addr)]; 8249 if (*dst_pte == 0 && 8250 pmap_try_insert_pv_entry(dst_pmap, addr, 8251 PHYS_TO_VM_PAGE(ptetemp & PG_FRAME), &lock)) { 8252 /* 8253 * Clear the wired, modified, and accessed 8254 * (referenced) bits during the copy. 8255 */ 8256 *dst_pte = ptetemp & ~(PG_W | PG_M | PG_A); 8257 pmap_resident_count_adj(dst_pmap, 1); 8258 } else { 8259 pmap_abort_ptp(dst_pmap, addr, dstmpte); 8260 goto out; 8261 } 8262 /* Have we copied all of the valid mappings? */ 8263 if (dstmpte->ref_count >= srcmpte->ref_count) 8264 break; 8265 } 8266 } 8267 out: 8268 if (lock != NULL) 8269 rw_wunlock(lock); 8270 PMAP_UNLOCK(src_pmap); 8271 PMAP_UNLOCK(dst_pmap); 8272 } 8273 8274 int 8275 pmap_vmspace_copy(pmap_t dst_pmap, pmap_t src_pmap) 8276 { 8277 int error; 8278 8279 if (dst_pmap->pm_type != src_pmap->pm_type || 8280 dst_pmap->pm_type != PT_X86 || 8281 (cpu_stdext_feature2 & CPUID_STDEXT2_PKU) == 0) 8282 return (0); 8283 for (;;) { 8284 if (dst_pmap < src_pmap) { 8285 PMAP_LOCK(dst_pmap); 8286 PMAP_LOCK(src_pmap); 8287 } else { 8288 PMAP_LOCK(src_pmap); 8289 PMAP_LOCK(dst_pmap); 8290 } 8291 error = pmap_pkru_copy(dst_pmap, src_pmap); 8292 /* Clean up partial copy on failure due to no memory. */ 8293 if (error == ENOMEM) 8294 pmap_pkru_deassign_all(dst_pmap); 8295 PMAP_UNLOCK(src_pmap); 8296 PMAP_UNLOCK(dst_pmap); 8297 if (error != ENOMEM) 8298 break; 8299 vm_wait(NULL); 8300 } 8301 return (error); 8302 } 8303 8304 /* 8305 * Zero the specified hardware page. 8306 */ 8307 void 8308 pmap_zero_page(vm_page_t m) 8309 { 8310 vm_offset_t va; 8311 8312 #ifdef TSLOG_PAGEZERO 8313 TSENTER(); 8314 #endif 8315 va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); 8316 pagezero((void *)va); 8317 #ifdef TSLOG_PAGEZERO 8318 TSEXIT(); 8319 #endif 8320 } 8321 8322 /* 8323 * Zero an area within a single hardware page. off and size must not 8324 * cover an area beyond a single hardware page. 8325 */ 8326 void 8327 pmap_zero_page_area(vm_page_t m, int off, int size) 8328 { 8329 vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); 8330 8331 if (off == 0 && size == PAGE_SIZE) 8332 pagezero((void *)va); 8333 else 8334 bzero((char *)va + off, size); 8335 } 8336 8337 /* 8338 * Copy 1 specified hardware page to another. 
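 *
 * Both pages are reached through the direct map, so no transient
 * kernel mappings or TLB shootdowns are needed here; contrast
 * pmap_copy_pages() below, which falls back to
 * pmap_map_io_transient() in the unlikely case that a page is not
 * covered by the direct map.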
8339 */ 8340 void 8341 pmap_copy_page(vm_page_t msrc, vm_page_t mdst) 8342 { 8343 vm_offset_t src = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(msrc)); 8344 vm_offset_t dst = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mdst)); 8345 8346 pagecopy((void *)src, (void *)dst); 8347 } 8348 8349 int unmapped_buf_allowed = 1; 8350 8351 void 8352 pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[], 8353 vm_offset_t b_offset, int xfersize) 8354 { 8355 void *a_cp, *b_cp; 8356 vm_page_t pages[2]; 8357 vm_offset_t vaddr[2], a_pg_offset, b_pg_offset; 8358 int cnt; 8359 bool mapped; 8360 8361 while (xfersize > 0) { 8362 a_pg_offset = a_offset & PAGE_MASK; 8363 pages[0] = ma[a_offset >> PAGE_SHIFT]; 8364 b_pg_offset = b_offset & PAGE_MASK; 8365 pages[1] = mb[b_offset >> PAGE_SHIFT]; 8366 cnt = min(xfersize, PAGE_SIZE - a_pg_offset); 8367 cnt = min(cnt, PAGE_SIZE - b_pg_offset); 8368 mapped = pmap_map_io_transient(pages, vaddr, 2, false); 8369 a_cp = (char *)vaddr[0] + a_pg_offset; 8370 b_cp = (char *)vaddr[1] + b_pg_offset; 8371 bcopy(a_cp, b_cp, cnt); 8372 if (__predict_false(mapped)) 8373 pmap_unmap_io_transient(pages, vaddr, 2, false); 8374 a_offset += cnt; 8375 b_offset += cnt; 8376 xfersize -= cnt; 8377 } 8378 } 8379 8380 /* 8381 * Returns true if the pmap's pv is one of the first 8382 * 16 pvs linked to from this page. This count may 8383 * be changed upwards or downwards in the future; it 8384 * is only necessary that true be returned for a small 8385 * subset of pmaps for proper page aging. 8386 */ 8387 bool 8388 pmap_page_exists_quick(pmap_t pmap, vm_page_t m) 8389 { 8390 struct md_page *pvh; 8391 struct rwlock *lock; 8392 pv_entry_t pv; 8393 int loops = 0; 8394 bool rv; 8395 8396 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 8397 ("pmap_page_exists_quick: page %p is not managed", m)); 8398 rv = false; 8399 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 8400 rw_rlock(lock); 8401 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 8402 if (PV_PMAP(pv) == pmap) { 8403 rv = true; 8404 break; 8405 } 8406 loops++; 8407 if (loops >= 16) 8408 break; 8409 } 8410 if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) { 8411 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 8412 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 8413 if (PV_PMAP(pv) == pmap) { 8414 rv = true; 8415 break; 8416 } 8417 loops++; 8418 if (loops >= 16) 8419 break; 8420 } 8421 } 8422 rw_runlock(lock); 8423 return (rv); 8424 } 8425 8426 /* 8427 * pmap_page_wired_mappings: 8428 * 8429 * Return the number of managed mappings to the given physical page 8430 * that are wired. 
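 *
 * Note the locking pattern used below (and in several other
 * functions that walk a page's PV lists): the pmap lock may not be
 * acquired by blocking while the PV list lock is held, so on a
 * failed PMAP_TRYLOCK() the generation count is sampled, the PV list
 * lock is dropped, both locks are retaken, and the walk restarts if
 * the list changed in the meantime:
 *
 *	if (!PMAP_TRYLOCK(pmap)) {
 *		md_gen = m->md.pv_gen;
 *		rw_runlock(lock);
 *		PMAP_LOCK(pmap);
 *		rw_rlock(lock);
 *		if (md_gen != m->md.pv_gen) {
 *			PMAP_UNLOCK(pmap);
 *			goto restart;
 *		}
 *	}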
8431 */ 8432 int 8433 pmap_page_wired_mappings(vm_page_t m) 8434 { 8435 struct rwlock *lock; 8436 struct md_page *pvh; 8437 pmap_t pmap; 8438 pt_entry_t *pte; 8439 pv_entry_t pv; 8440 int count, md_gen, pvh_gen; 8441 8442 if ((m->oflags & VPO_UNMANAGED) != 0) 8443 return (0); 8444 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 8445 rw_rlock(lock); 8446 restart: 8447 count = 0; 8448 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 8449 pmap = PV_PMAP(pv); 8450 if (!PMAP_TRYLOCK(pmap)) { 8451 md_gen = m->md.pv_gen; 8452 rw_runlock(lock); 8453 PMAP_LOCK(pmap); 8454 rw_rlock(lock); 8455 if (md_gen != m->md.pv_gen) { 8456 PMAP_UNLOCK(pmap); 8457 goto restart; 8458 } 8459 } 8460 pte = pmap_pte(pmap, pv->pv_va); 8461 if ((*pte & PG_W) != 0) 8462 count++; 8463 PMAP_UNLOCK(pmap); 8464 } 8465 if ((m->flags & PG_FICTITIOUS) == 0) { 8466 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 8467 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 8468 pmap = PV_PMAP(pv); 8469 if (!PMAP_TRYLOCK(pmap)) { 8470 md_gen = m->md.pv_gen; 8471 pvh_gen = pvh->pv_gen; 8472 rw_runlock(lock); 8473 PMAP_LOCK(pmap); 8474 rw_rlock(lock); 8475 if (md_gen != m->md.pv_gen || 8476 pvh_gen != pvh->pv_gen) { 8477 PMAP_UNLOCK(pmap); 8478 goto restart; 8479 } 8480 } 8481 pte = pmap_pde(pmap, pv->pv_va); 8482 if ((*pte & PG_W) != 0) 8483 count++; 8484 PMAP_UNLOCK(pmap); 8485 } 8486 } 8487 rw_runlock(lock); 8488 return (count); 8489 } 8490 8491 /* 8492 * Returns true if the given page is mapped individually or as part of 8493 * a 2mpage. Otherwise, returns false. 8494 */ 8495 bool 8496 pmap_page_is_mapped(vm_page_t m) 8497 { 8498 struct rwlock *lock; 8499 bool rv; 8500 8501 if ((m->oflags & VPO_UNMANAGED) != 0) 8502 return (false); 8503 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 8504 rw_rlock(lock); 8505 rv = !TAILQ_EMPTY(&m->md.pv_list) || 8506 ((m->flags & PG_FICTITIOUS) == 0 && 8507 !TAILQ_EMPTY(&pa_to_pvh(VM_PAGE_TO_PHYS(m))->pv_list)); 8508 rw_runlock(lock); 8509 return (rv); 8510 } 8511 8512 /* 8513 * Destroy all managed, non-wired mappings in the given user-space 8514 * pmap. This pmap cannot be active on any processor besides the 8515 * caller. 8516 * 8517 * This function cannot be applied to the kernel pmap. Moreover, it 8518 * is not intended for general use. It is only to be used during 8519 * process termination. Consequently, it can be implemented in ways 8520 * that make it faster than pmap_remove(). First, it can more quickly 8521 * destroy mappings by iterating over the pmap's collection of PV 8522 * entries, rather than searching the page table. Second, it doesn't 8523 * have to test and clear the page table entries atomically, because 8524 * no processor is currently accessing the user address space. In 8525 * particular, a page table entry's dirty bit won't change state once 8526 * this function starts. 8527 * 8528 * Although this function destroys all of the pmap's managed, 8529 * non-wired mappings, it can delay and batch the invalidation of TLB 8530 * entries without calling pmap_delayed_invl_start() and 8531 * pmap_delayed_invl_finish(). Because the pmap is not active on 8532 * any other processor, none of these TLB entries will ever be used 8533 * before their eventual invalidation. Consequently, there is no need 8534 * for either pmap_remove_all() or pmap_remove_write() to wait for 8535 * that eventual TLB invalidation. 
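 *
 * In outline, the scan below walks the pmap's PV chunks instead of
 * its page tables: each chunk's pc_map[] bitmap has a clear bit for
 * every allocated PV entry, so the in-use entries are recovered with
 *
 *	inuse = ~pc->pc_map[field] & pc_freemask[field];
 *	bit = bsfq(inuse);
 *
 * and each corresponding mapping is then destroyed unless it is
 * wired.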
8536 */ 8537 void 8538 pmap_remove_pages(pmap_t pmap) 8539 { 8540 pd_entry_t ptepde; 8541 pt_entry_t *pte, tpte; 8542 pt_entry_t PG_M, PG_RW, PG_V; 8543 struct spglist free; 8544 struct pv_chunklist free_chunks[PMAP_MEMDOM]; 8545 vm_page_t m, mpte, mt; 8546 pv_entry_t pv; 8547 struct md_page *pvh; 8548 struct pv_chunk *pc, *npc; 8549 struct rwlock *lock; 8550 int64_t bit; 8551 uint64_t inuse, bitmask; 8552 int allfree, field, i, idx; 8553 #ifdef PV_STATS 8554 int freed; 8555 #endif 8556 bool superpage; 8557 vm_paddr_t pa; 8558 8559 /* 8560 * Assert that the given pmap is only active on the current 8561 * CPU. Unfortunately, we cannot block another CPU from 8562 * activating the pmap while this function is executing. 8563 */ 8564 KASSERT(pmap == PCPU_GET(curpmap), ("non-current pmap %p", pmap)); 8565 #ifdef INVARIANTS 8566 { 8567 cpuset_t other_cpus; 8568 8569 other_cpus = all_cpus; 8570 critical_enter(); 8571 CPU_CLR(PCPU_GET(cpuid), &other_cpus); 8572 CPU_AND(&other_cpus, &other_cpus, &pmap->pm_active); 8573 critical_exit(); 8574 KASSERT(CPU_EMPTY(&other_cpus), ("pmap active %p", pmap)); 8575 } 8576 #endif 8577 8578 lock = NULL; 8579 PG_M = pmap_modified_bit(pmap); 8580 PG_V = pmap_valid_bit(pmap); 8581 PG_RW = pmap_rw_bit(pmap); 8582 8583 for (i = 0; i < PMAP_MEMDOM; i++) 8584 TAILQ_INIT(&free_chunks[i]); 8585 SLIST_INIT(&free); 8586 PMAP_LOCK(pmap); 8587 TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) { 8588 allfree = 1; 8589 #ifdef PV_STATS 8590 freed = 0; 8591 #endif 8592 for (field = 0; field < _NPCM; field++) { 8593 inuse = ~pc->pc_map[field] & pc_freemask[field]; 8594 while (inuse != 0) { 8595 bit = bsfq(inuse); 8596 bitmask = 1UL << bit; 8597 idx = field * 64 + bit; 8598 pv = &pc->pc_pventry[idx]; 8599 inuse &= ~bitmask; 8600 8601 pte = pmap_pdpe(pmap, pv->pv_va); 8602 ptepde = *pte; 8603 pte = pmap_pdpe_to_pde(pte, pv->pv_va); 8604 tpte = *pte; 8605 if ((tpte & (PG_PS | PG_V)) == PG_V) { 8606 superpage = false; 8607 ptepde = tpte; 8608 pte = (pt_entry_t *)PHYS_TO_DMAP(tpte & 8609 PG_FRAME); 8610 pte = &pte[pmap_pte_index(pv->pv_va)]; 8611 tpte = *pte; 8612 } else { 8613 /* 8614 * Keep track whether 'tpte' is a 8615 * superpage explicitly instead of 8616 * relying on PG_PS being set. 8617 * 8618 * This is because PG_PS is numerically 8619 * identical to PG_PTE_PAT and thus a 8620 * regular page could be mistaken for 8621 * a superpage. 8622 */ 8623 superpage = true; 8624 } 8625 8626 if ((tpte & PG_V) == 0) { 8627 panic("bad pte va %lx pte %lx", 8628 pv->pv_va, tpte); 8629 } 8630 8631 /* 8632 * We cannot remove wired pages from a process' mapping at this time 8633 */ 8634 if (tpte & PG_W) { 8635 allfree = 0; 8636 continue; 8637 } 8638 8639 /* Mark free */ 8640 pc->pc_map[field] |= bitmask; 8641 8642 /* 8643 * Because this pmap is not active on other 8644 * processors, the dirty bit cannot have 8645 * changed state since we last loaded pte. 8646 */ 8647 pte_clear(pte); 8648 8649 if (superpage) 8650 pa = tpte & PG_PS_FRAME; 8651 else 8652 pa = tpte & PG_FRAME; 8653 8654 m = PHYS_TO_VM_PAGE(pa); 8655 KASSERT(m->phys_addr == pa, 8656 ("vm_page_t %p phys_addr mismatch %016jx %016jx", 8657 m, (uintmax_t)m->phys_addr, 8658 (uintmax_t)tpte)); 8659 8660 KASSERT((m->flags & PG_FICTITIOUS) != 0 || 8661 m < &vm_page_array[vm_page_array_size], 8662 ("pmap_remove_pages: bad tpte %#jx", 8663 (uintmax_t)tpte)); 8664 8665 /* 8666 * Update the vm_page_t clean/reference bits. 
8667 */ 8668 if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { 8669 if (superpage) { 8670 for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++) 8671 vm_page_dirty(mt); 8672 } else 8673 vm_page_dirty(m); 8674 } 8675 8676 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m); 8677 8678 if (superpage) { 8679 pmap_resident_count_adj(pmap, -NBPDR / PAGE_SIZE); 8680 pvh = pa_to_pvh(tpte & PG_PS_FRAME); 8681 TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); 8682 pvh->pv_gen++; 8683 if (TAILQ_EMPTY(&pvh->pv_list)) { 8684 for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++) 8685 if ((mt->a.flags & PGA_WRITEABLE) != 0 && 8686 TAILQ_EMPTY(&mt->md.pv_list)) 8687 vm_page_aflag_clear(mt, PGA_WRITEABLE); 8688 } 8689 mpte = pmap_remove_pt_page(pmap, pv->pv_va); 8690 if (mpte != NULL) { 8691 KASSERT(vm_page_any_valid(mpte), 8692 ("pmap_remove_pages: pte page not promoted")); 8693 pmap_pt_page_count_adj(pmap, -1); 8694 KASSERT(mpte->ref_count == NPTEPG, 8695 ("pmap_remove_pages: pte page reference count error")); 8696 mpte->ref_count = 0; 8697 pmap_add_delayed_free_list(mpte, &free, false); 8698 } 8699 } else { 8700 pmap_resident_count_adj(pmap, -1); 8701 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 8702 m->md.pv_gen++; 8703 if ((m->a.flags & PGA_WRITEABLE) != 0 && 8704 TAILQ_EMPTY(&m->md.pv_list) && 8705 (m->flags & PG_FICTITIOUS) == 0) { 8706 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 8707 if (TAILQ_EMPTY(&pvh->pv_list)) 8708 vm_page_aflag_clear(m, PGA_WRITEABLE); 8709 } 8710 } 8711 pmap_unuse_pt(pmap, pv->pv_va, ptepde, &free); 8712 #ifdef PV_STATS 8713 freed++; 8714 #endif 8715 } 8716 } 8717 PV_STAT(counter_u64_add(pv_entry_frees, freed)); 8718 PV_STAT(counter_u64_add(pv_entry_spare, freed)); 8719 PV_STAT(counter_u64_add(pv_entry_count, -freed)); 8720 if (allfree) { 8721 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 8722 TAILQ_INSERT_TAIL(&free_chunks[pc_to_domain(pc)], pc, pc_list); 8723 } 8724 } 8725 if (lock != NULL) 8726 rw_wunlock(lock); 8727 pmap_invalidate_all(pmap); 8728 pmap_pkru_deassign_all(pmap); 8729 free_pv_chunk_batch((struct pv_chunklist *)&free_chunks); 8730 PMAP_UNLOCK(pmap); 8731 vm_page_free_pages_toq(&free, true); 8732 } 8733 8734 static bool 8735 pmap_page_test_mappings(vm_page_t m, bool accessed, bool modified) 8736 { 8737 struct rwlock *lock; 8738 pv_entry_t pv; 8739 struct md_page *pvh; 8740 pt_entry_t *pte, mask; 8741 pt_entry_t PG_A, PG_M, PG_RW, PG_V; 8742 pmap_t pmap; 8743 int md_gen, pvh_gen; 8744 bool rv; 8745 8746 rv = false; 8747 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 8748 rw_rlock(lock); 8749 restart: 8750 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 8751 pmap = PV_PMAP(pv); 8752 if (!PMAP_TRYLOCK(pmap)) { 8753 md_gen = m->md.pv_gen; 8754 rw_runlock(lock); 8755 PMAP_LOCK(pmap); 8756 rw_rlock(lock); 8757 if (md_gen != m->md.pv_gen) { 8758 PMAP_UNLOCK(pmap); 8759 goto restart; 8760 } 8761 } 8762 pte = pmap_pte(pmap, pv->pv_va); 8763 mask = 0; 8764 if (modified) { 8765 PG_M = pmap_modified_bit(pmap); 8766 PG_RW = pmap_rw_bit(pmap); 8767 mask |= PG_RW | PG_M; 8768 } 8769 if (accessed) { 8770 PG_A = pmap_accessed_bit(pmap); 8771 PG_V = pmap_valid_bit(pmap); 8772 mask |= PG_V | PG_A; 8773 } 8774 rv = (*pte & mask) == mask; 8775 PMAP_UNLOCK(pmap); 8776 if (rv) 8777 goto out; 8778 } 8779 if ((m->flags & PG_FICTITIOUS) == 0) { 8780 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 8781 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 8782 pmap = PV_PMAP(pv); 8783 if (!PMAP_TRYLOCK(pmap)) { 8784 md_gen = m->md.pv_gen; 8785 pvh_gen = pvh->pv_gen; 8786 rw_runlock(lock); 8787 PMAP_LOCK(pmap); 8788 rw_rlock(lock); 8789 if (md_gen != m->md.pv_gen 
|| 8790 pvh_gen != pvh->pv_gen) { 8791 PMAP_UNLOCK(pmap); 8792 goto restart; 8793 } 8794 } 8795 pte = pmap_pde(pmap, pv->pv_va); 8796 mask = 0; 8797 if (modified) { 8798 PG_M = pmap_modified_bit(pmap); 8799 PG_RW = pmap_rw_bit(pmap); 8800 mask |= PG_RW | PG_M; 8801 } 8802 if (accessed) { 8803 PG_A = pmap_accessed_bit(pmap); 8804 PG_V = pmap_valid_bit(pmap); 8805 mask |= PG_V | PG_A; 8806 } 8807 rv = (*pte & mask) == mask; 8808 PMAP_UNLOCK(pmap); 8809 if (rv) 8810 goto out; 8811 } 8812 } 8813 out: 8814 rw_runlock(lock); 8815 return (rv); 8816 } 8817 8818 /* 8819 * pmap_is_modified: 8820 * 8821 * Return whether or not the specified physical page was modified 8822 * in any physical maps. 8823 */ 8824 bool 8825 pmap_is_modified(vm_page_t m) 8826 { 8827 8828 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 8829 ("pmap_is_modified: page %p is not managed", m)); 8830 8831 /* 8832 * If the page is not busied then this check is racy. 8833 */ 8834 if (!pmap_page_is_write_mapped(m)) 8835 return (false); 8836 return (pmap_page_test_mappings(m, false, true)); 8837 } 8838 8839 /* 8840 * pmap_is_prefaultable: 8841 * 8842 * Return whether or not the specified virtual address is eligible 8843 * for prefault. 8844 */ 8845 bool 8846 pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr) 8847 { 8848 pd_entry_t *pde; 8849 pt_entry_t *pte, PG_V; 8850 bool rv; 8851 8852 PG_V = pmap_valid_bit(pmap); 8853 8854 /* 8855 * Return true if and only if the PTE for the specified virtual 8856 * address is allocated but invalid. 8857 */ 8858 rv = false; 8859 PMAP_LOCK(pmap); 8860 pde = pmap_pde(pmap, addr); 8861 if (pde != NULL && (*pde & (PG_PS | PG_V)) == PG_V) { 8862 pte = pmap_pde_to_pte(pde, addr); 8863 rv = (*pte & PG_V) == 0; 8864 } 8865 PMAP_UNLOCK(pmap); 8866 return (rv); 8867 } 8868 8869 /* 8870 * pmap_is_referenced: 8871 * 8872 * Return whether or not the specified physical page was referenced 8873 * in any physical maps. 8874 */ 8875 bool 8876 pmap_is_referenced(vm_page_t m) 8877 { 8878 8879 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 8880 ("pmap_is_referenced: page %p is not managed", m)); 8881 return (pmap_page_test_mappings(m, true, false)); 8882 } 8883 8884 /* 8885 * Clear the write and modified bits in each of the given page's mappings. 8886 */ 8887 void 8888 pmap_remove_write(vm_page_t m) 8889 { 8890 struct md_page *pvh; 8891 pmap_t pmap; 8892 struct rwlock *lock; 8893 pv_entry_t next_pv, pv; 8894 pd_entry_t *pde; 8895 pt_entry_t oldpte, *pte, PG_M, PG_RW; 8896 vm_offset_t va; 8897 int pvh_gen, md_gen; 8898 8899 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 8900 ("pmap_remove_write: page %p is not managed", m)); 8901 8902 vm_page_assert_busied(m); 8903 if (!pmap_page_is_write_mapped(m)) 8904 return; 8905 8906 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 8907 pvh = (m->flags & PG_FICTITIOUS) != 0 ? 
&pv_dummy : 8908 pa_to_pvh(VM_PAGE_TO_PHYS(m)); 8909 rw_wlock(lock); 8910 retry: 8911 TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { 8912 pmap = PV_PMAP(pv); 8913 if (!PMAP_TRYLOCK(pmap)) { 8914 pvh_gen = pvh->pv_gen; 8915 rw_wunlock(lock); 8916 PMAP_LOCK(pmap); 8917 rw_wlock(lock); 8918 if (pvh_gen != pvh->pv_gen) { 8919 PMAP_UNLOCK(pmap); 8920 goto retry; 8921 } 8922 } 8923 PG_RW = pmap_rw_bit(pmap); 8924 va = pv->pv_va; 8925 pde = pmap_pde(pmap, va); 8926 if ((*pde & PG_RW) != 0) 8927 (void)pmap_demote_pde_locked(pmap, pde, va, &lock); 8928 KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m), 8929 ("inconsistent pv lock %p %p for page %p", 8930 lock, VM_PAGE_TO_PV_LIST_LOCK(m), m)); 8931 PMAP_UNLOCK(pmap); 8932 } 8933 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 8934 pmap = PV_PMAP(pv); 8935 if (!PMAP_TRYLOCK(pmap)) { 8936 pvh_gen = pvh->pv_gen; 8937 md_gen = m->md.pv_gen; 8938 rw_wunlock(lock); 8939 PMAP_LOCK(pmap); 8940 rw_wlock(lock); 8941 if (pvh_gen != pvh->pv_gen || 8942 md_gen != m->md.pv_gen) { 8943 PMAP_UNLOCK(pmap); 8944 goto retry; 8945 } 8946 } 8947 PG_M = pmap_modified_bit(pmap); 8948 PG_RW = pmap_rw_bit(pmap); 8949 pde = pmap_pde(pmap, pv->pv_va); 8950 KASSERT((*pde & PG_PS) == 0, 8951 ("pmap_remove_write: found a 2mpage in page %p's pv list", 8952 m)); 8953 pte = pmap_pde_to_pte(pde, pv->pv_va); 8954 oldpte = *pte; 8955 if (oldpte & PG_RW) { 8956 while (!atomic_fcmpset_long(pte, &oldpte, oldpte & 8957 ~(PG_RW | PG_M))) 8958 cpu_spinwait(); 8959 if ((oldpte & PG_M) != 0) 8960 vm_page_dirty(m); 8961 pmap_invalidate_page(pmap, pv->pv_va); 8962 } 8963 PMAP_UNLOCK(pmap); 8964 } 8965 rw_wunlock(lock); 8966 vm_page_aflag_clear(m, PGA_WRITEABLE); 8967 pmap_delayed_invl_wait(m); 8968 } 8969 8970 /* 8971 * pmap_ts_referenced: 8972 * 8973 * Return a count of reference bits for a page, clearing those bits. 8974 * It is not necessary for every reference bit to be cleared, but it 8975 * is necessary that 0 only be returned when there are truly no 8976 * reference bits set. 8977 * 8978 * As an optimization, update the page's dirty field if a modified bit is 8979 * found while counting reference bits. This opportunistic update can be 8980 * performed at low cost and can eliminate the need for some future calls 8981 * to pmap_is_modified(). However, since this function stops after 8982 * finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some 8983 * dirty pages. Those dirty pages will only be detected by a future call 8984 * to pmap_is_modified(). 8985 * 8986 * A DI block is not needed within this function, because 8987 * invalidations are performed before the PV list lock is 8988 * released. 8989 */ 8990 int 8991 pmap_ts_referenced(vm_page_t m) 8992 { 8993 struct md_page *pvh; 8994 pv_entry_t pv, pvf; 8995 pmap_t pmap; 8996 struct rwlock *lock; 8997 pd_entry_t oldpde, *pde; 8998 pt_entry_t *pte, PG_A, PG_M, PG_RW; 8999 vm_offset_t va; 9000 vm_paddr_t pa; 9001 int cleared, md_gen, not_cleared, pvh_gen; 9002 struct spglist free; 9003 bool demoted; 9004 9005 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 9006 ("pmap_ts_referenced: page %p is not managed", m)); 9007 SLIST_INIT(&free); 9008 cleared = 0; 9009 pa = VM_PAGE_TO_PHYS(m); 9010 lock = PHYS_TO_PV_LIST_LOCK(pa); 9011 pvh = (m->flags & PG_FICTITIOUS) != 0 ? 
&pv_dummy : pa_to_pvh(pa); 9012 rw_wlock(lock); 9013 retry: 9014 not_cleared = 0; 9015 if ((pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL) 9016 goto small_mappings; 9017 pv = pvf; 9018 do { 9019 if (pvf == NULL) 9020 pvf = pv; 9021 pmap = PV_PMAP(pv); 9022 if (!PMAP_TRYLOCK(pmap)) { 9023 pvh_gen = pvh->pv_gen; 9024 rw_wunlock(lock); 9025 PMAP_LOCK(pmap); 9026 rw_wlock(lock); 9027 if (pvh_gen != pvh->pv_gen) { 9028 PMAP_UNLOCK(pmap); 9029 goto retry; 9030 } 9031 } 9032 PG_A = pmap_accessed_bit(pmap); 9033 PG_M = pmap_modified_bit(pmap); 9034 PG_RW = pmap_rw_bit(pmap); 9035 va = pv->pv_va; 9036 pde = pmap_pde(pmap, pv->pv_va); 9037 oldpde = *pde; 9038 if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW)) { 9039 /* 9040 * Although "oldpde" is mapping a 2MB page, because 9041 * this function is called at a 4KB page granularity, 9042 * we only update the 4KB page under test. 9043 */ 9044 vm_page_dirty(m); 9045 } 9046 if ((oldpde & PG_A) != 0) { 9047 /* 9048 * Since this reference bit is shared by 512 4KB 9049 * pages, it should not be cleared every time it is 9050 * tested. Apply a simple "hash" function on the 9051 * physical page number, the virtual superpage number, 9052 * and the pmap address to select one 4KB page out of 9053 * the 512 on which testing the reference bit will 9054 * result in clearing that reference bit. This 9055 * function is designed to avoid the selection of the 9056 * same 4KB page for every 2MB page mapping. 9057 * 9058 * On demotion, a mapping that hasn't been referenced 9059 * is simply destroyed. To avoid the possibility of a 9060 * subsequent page fault on a demoted wired mapping, 9061 * always leave its reference bit set. Moreover, 9062 * since the superpage is wired, the current state of 9063 * its reference bit won't affect page replacement. 9064 */ 9065 if ((((pa >> PAGE_SHIFT) ^ (pv->pv_va >> PDRSHIFT) ^ 9066 (uintptr_t)pmap) & (NPTEPG - 1)) == 0 && 9067 (oldpde & PG_W) == 0) { 9068 if (safe_to_clear_referenced(pmap, oldpde)) { 9069 atomic_clear_long(pde, PG_A); 9070 pmap_invalidate_page(pmap, pv->pv_va); 9071 demoted = false; 9072 } else if (pmap_demote_pde_locked(pmap, pde, 9073 pv->pv_va, &lock)) { 9074 /* 9075 * Remove the mapping to a single page 9076 * so that a subsequent access may 9077 * repromote. Since the underlying 9078 * page table page is fully populated, 9079 * this removal never frees a page 9080 * table page. 9081 */ 9082 demoted = true; 9083 va += VM_PAGE_TO_PHYS(m) - (oldpde & 9084 PG_PS_FRAME); 9085 pte = pmap_pde_to_pte(pde, va); 9086 pmap_remove_pte(pmap, pte, va, *pde, 9087 NULL, &lock); 9088 pmap_invalidate_page(pmap, va); 9089 } else 9090 demoted = true; 9091 9092 if (demoted) { 9093 /* 9094 * The superpage mapping was removed 9095 * entirely and therefore 'pv' is no 9096 * longer valid. 9097 */ 9098 if (pvf == pv) 9099 pvf = NULL; 9100 pv = NULL; 9101 } 9102 cleared++; 9103 KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m), 9104 ("inconsistent pv lock %p %p for page %p", 9105 lock, VM_PAGE_TO_PV_LIST_LOCK(m), m)); 9106 } else 9107 not_cleared++; 9108 } 9109 PMAP_UNLOCK(pmap); 9110 /* Rotate the PV list if it has more than one entry. 
*/ 9111 if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) { 9112 TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); 9113 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); 9114 pvh->pv_gen++; 9115 } 9116 if (cleared + not_cleared >= PMAP_TS_REFERENCED_MAX) 9117 goto out; 9118 } while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf); 9119 small_mappings: 9120 if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL) 9121 goto out; 9122 pv = pvf; 9123 do { 9124 if (pvf == NULL) 9125 pvf = pv; 9126 pmap = PV_PMAP(pv); 9127 if (!PMAP_TRYLOCK(pmap)) { 9128 pvh_gen = pvh->pv_gen; 9129 md_gen = m->md.pv_gen; 9130 rw_wunlock(lock); 9131 PMAP_LOCK(pmap); 9132 rw_wlock(lock); 9133 if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { 9134 PMAP_UNLOCK(pmap); 9135 goto retry; 9136 } 9137 } 9138 PG_A = pmap_accessed_bit(pmap); 9139 PG_M = pmap_modified_bit(pmap); 9140 PG_RW = pmap_rw_bit(pmap); 9141 pde = pmap_pde(pmap, pv->pv_va); 9142 KASSERT((*pde & PG_PS) == 0, 9143 ("pmap_ts_referenced: found a 2mpage in page %p's pv list", 9144 m)); 9145 pte = pmap_pde_to_pte(pde, pv->pv_va); 9146 if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) 9147 vm_page_dirty(m); 9148 if ((*pte & PG_A) != 0) { 9149 if (safe_to_clear_referenced(pmap, *pte)) { 9150 atomic_clear_long(pte, PG_A); 9151 pmap_invalidate_page(pmap, pv->pv_va); 9152 cleared++; 9153 } else if ((*pte & PG_W) == 0) { 9154 /* 9155 * Wired pages cannot be paged out so 9156 * doing accessed bit emulation for 9157 * them is wasted effort. We do the 9158 * hard work for unwired pages only. 9159 */ 9160 pmap_remove_pte(pmap, pte, pv->pv_va, 9161 *pde, &free, &lock); 9162 pmap_invalidate_page(pmap, pv->pv_va); 9163 cleared++; 9164 if (pvf == pv) 9165 pvf = NULL; 9166 pv = NULL; 9167 KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m), 9168 ("inconsistent pv lock %p %p for page %p", 9169 lock, VM_PAGE_TO_PV_LIST_LOCK(m), m)); 9170 } else 9171 not_cleared++; 9172 } 9173 PMAP_UNLOCK(pmap); 9174 /* Rotate the PV list if it has more than one entry. */ 9175 if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) { 9176 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 9177 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 9178 m->md.pv_gen++; 9179 } 9180 } while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && cleared + 9181 not_cleared < PMAP_TS_REFERENCED_MAX); 9182 out: 9183 rw_wunlock(lock); 9184 vm_page_free_pages_toq(&free, true); 9185 return (cleared + not_cleared); 9186 } 9187 9188 /* 9189 * Apply the given advice to the specified range of addresses within the 9190 * given pmap. Depending on the advice, clear the referenced and/or 9191 * modified flags in each mapping and set the mapped page's dirty field. 9192 */ 9193 void 9194 pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice) 9195 { 9196 struct rwlock *lock; 9197 pml4_entry_t *pml4e; 9198 pdp_entry_t *pdpe; 9199 pd_entry_t oldpde, *pde; 9200 pt_entry_t *pte, PG_A, PG_G, PG_M, PG_RW, PG_V; 9201 vm_offset_t va, va_next; 9202 vm_page_t m; 9203 bool anychanged; 9204 9205 if (advice != MADV_DONTNEED && advice != MADV_FREE) 9206 return; 9207 9208 /* 9209 * A/D bit emulation requires an alternate code path when clearing 9210 * the modified and accessed bits below. Since this function is 9211 * advisory in nature we skip it entirely for pmaps that require 9212 * A/D bit emulation. 
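 *
 * In outline, for each resident managed 4KB mapping in the range the
 * loop below clears PG_A (and, if applicable, PG_M); for
 * MADV_DONTNEED the modified state is first transferred to the page
 * with vm_page_dirty(), while for MADV_FREE the PTE's modified bit
 * is dropped without dirtying the page:
 *
 *	if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
 *		if (advice == MADV_DONTNEED)
 *			vm_page_dirty(m);
 *		atomic_clear_long(pte, PG_M | PG_A);
 *	} else if ((*pte & PG_A) != 0)
 *		atomic_clear_long(pte, PG_A);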
9213 */ 9214 if (pmap_emulate_ad_bits(pmap)) 9215 return; 9216 9217 PG_A = pmap_accessed_bit(pmap); 9218 PG_G = pmap_global_bit(pmap); 9219 PG_M = pmap_modified_bit(pmap); 9220 PG_V = pmap_valid_bit(pmap); 9221 PG_RW = pmap_rw_bit(pmap); 9222 anychanged = false; 9223 pmap_delayed_invl_start(); 9224 PMAP_LOCK(pmap); 9225 for (; sva < eva; sva = va_next) { 9226 pml4e = pmap_pml4e(pmap, sva); 9227 if (pml4e == NULL || (*pml4e & PG_V) == 0) { 9228 va_next = (sva + NBPML4) & ~PML4MASK; 9229 if (va_next < sva) 9230 va_next = eva; 9231 continue; 9232 } 9233 9234 va_next = (sva + NBPDP) & ~PDPMASK; 9235 if (va_next < sva) 9236 va_next = eva; 9237 pdpe = pmap_pml4e_to_pdpe(pml4e, sva); 9238 if ((*pdpe & PG_V) == 0) 9239 continue; 9240 if ((*pdpe & PG_PS) != 0) 9241 continue; 9242 9243 va_next = (sva + NBPDR) & ~PDRMASK; 9244 if (va_next < sva) 9245 va_next = eva; 9246 pde = pmap_pdpe_to_pde(pdpe, sva); 9247 oldpde = *pde; 9248 if ((oldpde & PG_V) == 0) 9249 continue; 9250 else if ((oldpde & PG_PS) != 0) { 9251 if ((oldpde & PG_MANAGED) == 0) 9252 continue; 9253 lock = NULL; 9254 if (!pmap_demote_pde_locked(pmap, pde, sva, &lock)) { 9255 if (lock != NULL) 9256 rw_wunlock(lock); 9257 9258 /* 9259 * The large page mapping was destroyed. 9260 */ 9261 continue; 9262 } 9263 9264 /* 9265 * Unless the page mappings are wired, remove the 9266 * mapping to a single page so that a subsequent 9267 * access may repromote. Choosing the last page 9268 * within the address range [sva, min(va_next, eva)) 9269 * generally results in more repromotions. Since the 9270 * underlying page table page is fully populated, this 9271 * removal never frees a page table page. 9272 */ 9273 if ((oldpde & PG_W) == 0) { 9274 va = eva; 9275 if (va > va_next) 9276 va = va_next; 9277 va -= PAGE_SIZE; 9278 KASSERT(va >= sva, 9279 ("pmap_advise: no address gap")); 9280 pte = pmap_pde_to_pte(pde, va); 9281 KASSERT((*pte & PG_V) != 0, 9282 ("pmap_advise: invalid PTE")); 9283 pmap_remove_pte(pmap, pte, va, *pde, NULL, 9284 &lock); 9285 anychanged = true; 9286 } 9287 if (lock != NULL) 9288 rw_wunlock(lock); 9289 } 9290 if (va_next > eva) 9291 va_next = eva; 9292 va = va_next; 9293 for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++, 9294 sva += PAGE_SIZE) { 9295 if ((*pte & (PG_MANAGED | PG_V)) != (PG_MANAGED | PG_V)) 9296 goto maybe_invlrng; 9297 else if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { 9298 if (advice == MADV_DONTNEED) { 9299 /* 9300 * Future calls to pmap_is_modified() 9301 * can be avoided by making the page 9302 * dirty now. 9303 */ 9304 m = PHYS_TO_VM_PAGE(*pte & PG_FRAME); 9305 vm_page_dirty(m); 9306 } 9307 atomic_clear_long(pte, PG_M | PG_A); 9308 } else if ((*pte & PG_A) != 0) 9309 atomic_clear_long(pte, PG_A); 9310 else 9311 goto maybe_invlrng; 9312 9313 if ((*pte & PG_G) != 0) { 9314 if (va == va_next) 9315 va = sva; 9316 } else 9317 anychanged = true; 9318 continue; 9319 maybe_invlrng: 9320 if (va != va_next) { 9321 pmap_invalidate_range(pmap, va, sva); 9322 va = va_next; 9323 } 9324 } 9325 if (va != va_next) 9326 pmap_invalidate_range(pmap, va, sva); 9327 } 9328 if (anychanged) 9329 pmap_invalidate_all(pmap); 9330 PMAP_UNLOCK(pmap); 9331 pmap_delayed_invl_finish(); 9332 } 9333 9334 /* 9335 * Clear the modify bits on the specified physical page. 
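 *
 * For a writeable 2MB mapping the code below relies on the invariant
 * that PG_RW implies PG_M for superpages: the mapping is demoted and
 * only the single 4KB PTE covering this page is write protected and
 * cleaned, so that a later write access may repromote the range.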
9336 */ 9337 void 9338 pmap_clear_modify(vm_page_t m) 9339 { 9340 struct md_page *pvh; 9341 pmap_t pmap; 9342 pv_entry_t next_pv, pv; 9343 pd_entry_t oldpde, *pde; 9344 pt_entry_t *pte, PG_M, PG_RW; 9345 struct rwlock *lock; 9346 vm_offset_t va; 9347 int md_gen, pvh_gen; 9348 9349 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 9350 ("pmap_clear_modify: page %p is not managed", m)); 9351 vm_page_assert_busied(m); 9352 9353 if (!pmap_page_is_write_mapped(m)) 9354 return; 9355 pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : 9356 pa_to_pvh(VM_PAGE_TO_PHYS(m)); 9357 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 9358 rw_wlock(lock); 9359 restart: 9360 TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { 9361 pmap = PV_PMAP(pv); 9362 if (!PMAP_TRYLOCK(pmap)) { 9363 pvh_gen = pvh->pv_gen; 9364 rw_wunlock(lock); 9365 PMAP_LOCK(pmap); 9366 rw_wlock(lock); 9367 if (pvh_gen != pvh->pv_gen) { 9368 PMAP_UNLOCK(pmap); 9369 goto restart; 9370 } 9371 } 9372 PG_M = pmap_modified_bit(pmap); 9373 PG_RW = pmap_rw_bit(pmap); 9374 va = pv->pv_va; 9375 pde = pmap_pde(pmap, va); 9376 oldpde = *pde; 9377 /* If oldpde has PG_RW set, then it also has PG_M set. */ 9378 if ((oldpde & PG_RW) != 0 && 9379 pmap_demote_pde_locked(pmap, pde, va, &lock) && 9380 (oldpde & PG_W) == 0) { 9381 /* 9382 * Write protect the mapping to a single page so that 9383 * a subsequent write access may repromote. 9384 */ 9385 va += VM_PAGE_TO_PHYS(m) - (oldpde & PG_PS_FRAME); 9386 pte = pmap_pde_to_pte(pde, va); 9387 atomic_clear_long(pte, PG_M | PG_RW); 9388 vm_page_dirty(m); 9389 pmap_invalidate_page(pmap, va); 9390 } 9391 PMAP_UNLOCK(pmap); 9392 } 9393 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 9394 pmap = PV_PMAP(pv); 9395 if (!PMAP_TRYLOCK(pmap)) { 9396 md_gen = m->md.pv_gen; 9397 pvh_gen = pvh->pv_gen; 9398 rw_wunlock(lock); 9399 PMAP_LOCK(pmap); 9400 rw_wlock(lock); 9401 if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { 9402 PMAP_UNLOCK(pmap); 9403 goto restart; 9404 } 9405 } 9406 PG_M = pmap_modified_bit(pmap); 9407 PG_RW = pmap_rw_bit(pmap); 9408 pde = pmap_pde(pmap, pv->pv_va); 9409 KASSERT((*pde & PG_PS) == 0, ("pmap_clear_modify: found" 9410 " a 2mpage in page %p's pv list", m)); 9411 pte = pmap_pde_to_pte(pde, pv->pv_va); 9412 if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { 9413 atomic_clear_long(pte, PG_M); 9414 pmap_invalidate_page(pmap, pv->pv_va); 9415 } 9416 PMAP_UNLOCK(pmap); 9417 } 9418 rw_wunlock(lock); 9419 } 9420 9421 /* 9422 * Miscellaneous support routines follow 9423 */ 9424 9425 /* Adjust the properties for a leaf page table entry. */ 9426 static __inline void 9427 pmap_pte_props(pt_entry_t *pte, u_long bits, u_long mask) 9428 { 9429 u_long opte, npte; 9430 9431 opte = *(u_long *)pte; 9432 do { 9433 npte = opte & ~mask; 9434 npte |= bits; 9435 } while (npte != opte && !atomic_fcmpset_long((u_long *)pte, &opte, 9436 npte)); 9437 } 9438 9439 /* 9440 * Map a set of physical memory pages into the kernel virtual 9441 * address space. Return a pointer to where it is mapped. This 9442 * routine is intended to be used for mapping device memory, 9443 * NOT real memory. 
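 *
 * A minimal usage sketch (the names "bar_pa" and "bar_len" are
 * illustrative only, not real symbols):
 *
 *	void *regs = pmap_mapdev(bar_pa, bar_len);
 *	...
 *	pmap_unmapdev(regs, bar_len);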
9444 */ 9445 static void * 9446 pmap_mapdev_internal(vm_paddr_t pa, vm_size_t size, int mode, int flags) 9447 { 9448 struct pmap_preinit_mapping *ppim; 9449 vm_offset_t va, offset; 9450 vm_size_t tmpsize; 9451 int i; 9452 9453 offset = pa & PAGE_MASK; 9454 size = round_page(offset + size); 9455 pa = trunc_page(pa); 9456 9457 if (!pmap_initialized) { 9458 va = 0; 9459 for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { 9460 ppim = pmap_preinit_mapping + i; 9461 if (ppim->va == 0) { 9462 ppim->pa = pa; 9463 ppim->sz = size; 9464 ppim->mode = mode; 9465 ppim->va = virtual_avail; 9466 virtual_avail += size; 9467 va = ppim->va; 9468 break; 9469 } 9470 } 9471 if (va == 0) 9472 panic("%s: too many preinit mappings", __func__); 9473 } else { 9474 /* 9475 * If we have a preinit mapping, reuse it. 9476 */ 9477 for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { 9478 ppim = pmap_preinit_mapping + i; 9479 if (ppim->pa == pa && ppim->sz == size && 9480 (ppim->mode == mode || 9481 (flags & MAPDEV_SETATTR) == 0)) 9482 return ((void *)(ppim->va + offset)); 9483 } 9484 /* 9485 * If the specified range of physical addresses fits within 9486 * the direct map window, use the direct map. 9487 */ 9488 if (pa < dmaplimit && pa + size <= dmaplimit) { 9489 va = PHYS_TO_DMAP(pa); 9490 if ((flags & MAPDEV_SETATTR) != 0) { 9491 PMAP_LOCK(kernel_pmap); 9492 i = pmap_change_props_locked(va, size, 9493 PROT_NONE, mode, flags); 9494 PMAP_UNLOCK(kernel_pmap); 9495 } else 9496 i = 0; 9497 if (!i) 9498 return ((void *)(va + offset)); 9499 } 9500 va = kva_alloc(size); 9501 if (va == 0) 9502 panic("%s: Couldn't allocate KVA", __func__); 9503 } 9504 for (tmpsize = 0; tmpsize < size; tmpsize += PAGE_SIZE) 9505 pmap_kenter_attr(va + tmpsize, pa + tmpsize, mode); 9506 pmap_invalidate_range(kernel_pmap, va, va + tmpsize); 9507 if ((flags & MAPDEV_FLUSHCACHE) != 0) 9508 pmap_invalidate_cache_range(va, va + tmpsize); 9509 return ((void *)(va + offset)); 9510 } 9511 9512 void * 9513 pmap_mapdev_attr(vm_paddr_t pa, vm_size_t size, int mode) 9514 { 9515 9516 return (pmap_mapdev_internal(pa, size, mode, MAPDEV_FLUSHCACHE | 9517 MAPDEV_SETATTR)); 9518 } 9519 9520 void * 9521 pmap_mapdev(vm_paddr_t pa, vm_size_t size) 9522 { 9523 9524 return (pmap_mapdev_attr(pa, size, PAT_UNCACHEABLE)); 9525 } 9526 9527 void * 9528 pmap_mapdev_pciecfg(vm_paddr_t pa, vm_size_t size) 9529 { 9530 9531 return (pmap_mapdev_internal(pa, size, PAT_UNCACHEABLE, 9532 MAPDEV_SETATTR)); 9533 } 9534 9535 void * 9536 pmap_mapbios(vm_paddr_t pa, vm_size_t size) 9537 { 9538 9539 return (pmap_mapdev_internal(pa, size, PAT_WRITE_BACK, 9540 MAPDEV_FLUSHCACHE)); 9541 } 9542 9543 void 9544 pmap_unmapdev(void *p, vm_size_t size) 9545 { 9546 struct pmap_preinit_mapping *ppim; 9547 vm_offset_t offset, va; 9548 int i; 9549 9550 va = (vm_offset_t)p; 9551 9552 /* If we gave a direct map region in pmap_mapdev, do nothing */ 9553 if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) 9554 return; 9555 offset = va & PAGE_MASK; 9556 size = round_page(offset + size); 9557 va = trunc_page(va); 9558 for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { 9559 ppim = pmap_preinit_mapping + i; 9560 if (ppim->va == va && ppim->sz == size) { 9561 if (pmap_initialized) 9562 return; 9563 ppim->pa = 0; 9564 ppim->va = 0; 9565 ppim->sz = 0; 9566 ppim->mode = 0; 9567 if (va + size == virtual_avail) 9568 virtual_avail = va; 9569 return; 9570 } 9571 } 9572 if (pmap_initialized) { 9573 pmap_qremove(va, atop(size)); 9574 kva_free(va, size); 9575 } 9576 } 9577 9578 /* 9579 * Tries to demote a 1GB page 
mapping. 9580 */ 9581 static bool 9582 pmap_demote_pdpe(pmap_t pmap, pdp_entry_t *pdpe, vm_offset_t va) 9583 { 9584 pdp_entry_t newpdpe, oldpdpe; 9585 pd_entry_t *firstpde, newpde, *pde; 9586 pt_entry_t PG_A, PG_M, PG_RW, PG_V; 9587 vm_paddr_t pdpgpa; 9588 vm_page_t pdpg; 9589 9590 PG_A = pmap_accessed_bit(pmap); 9591 PG_M = pmap_modified_bit(pmap); 9592 PG_V = pmap_valid_bit(pmap); 9593 PG_RW = pmap_rw_bit(pmap); 9594 9595 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 9596 oldpdpe = *pdpe; 9597 KASSERT((oldpdpe & (PG_PS | PG_V)) == (PG_PS | PG_V), 9598 ("pmap_demote_pdpe: oldpdpe is missing PG_PS and/or PG_V")); 9599 pdpg = pmap_alloc_pt_page(pmap, va >> PDPSHIFT, 9600 VM_ALLOC_WIRED | VM_ALLOC_INTERRUPT); 9601 if (pdpg == NULL) { 9602 CTR2(KTR_PMAP, "pmap_demote_pdpe: failure for va %#lx" 9603 " in pmap %p", va, pmap); 9604 return (false); 9605 } 9606 pdpgpa = VM_PAGE_TO_PHYS(pdpg); 9607 firstpde = (pd_entry_t *)PHYS_TO_DMAP(pdpgpa); 9608 newpdpe = pdpgpa | PG_M | PG_A | (oldpdpe & PG_U) | PG_RW | PG_V; 9609 KASSERT((oldpdpe & PG_A) != 0, 9610 ("pmap_demote_pdpe: oldpdpe is missing PG_A")); 9611 KASSERT((oldpdpe & (PG_M | PG_RW)) != PG_RW, 9612 ("pmap_demote_pdpe: oldpdpe is missing PG_M")); 9613 newpde = oldpdpe; 9614 9615 /* 9616 * Initialize the page directory page. 9617 */ 9618 for (pde = firstpde; pde < firstpde + NPDEPG; pde++) { 9619 *pde = newpde; 9620 newpde += NBPDR; 9621 } 9622 9623 /* 9624 * Demote the mapping. 9625 */ 9626 *pdpe = newpdpe; 9627 9628 /* 9629 * Invalidate a stale recursive mapping of the page directory page. 9630 */ 9631 pmap_invalidate_page(pmap, (vm_offset_t)vtopde(va)); 9632 9633 counter_u64_add(pmap_pdpe_demotions, 1); 9634 CTR2(KTR_PMAP, "pmap_demote_pdpe: success for va %#lx" 9635 " in pmap %p", va, pmap); 9636 return (true); 9637 } 9638 9639 /* 9640 * Sets the memory attribute for the specified page. 9641 */ 9642 void 9643 pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma) 9644 { 9645 9646 m->md.pat_mode = ma; 9647 9648 /* 9649 * If "m" is a normal page, update its direct mapping. This update 9650 * can be relied upon to perform any cache operations that are 9651 * required for data coherence. 9652 */ 9653 if ((m->flags & PG_FICTITIOUS) == 0 && 9654 pmap_change_attr(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)), PAGE_SIZE, 9655 m->md.pat_mode)) 9656 panic("memory attribute change on the direct map failed"); 9657 } 9658 9659 void 9660 pmap_page_set_memattr_noflush(vm_page_t m, vm_memattr_t ma) 9661 { 9662 int error; 9663 9664 m->md.pat_mode = ma; 9665 9666 if ((m->flags & PG_FICTITIOUS) != 0) 9667 return; 9668 PMAP_LOCK(kernel_pmap); 9669 error = pmap_change_props_locked(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)), 9670 PAGE_SIZE, PROT_NONE, m->md.pat_mode, 0); 9671 PMAP_UNLOCK(kernel_pmap); 9672 if (error != 0) 9673 panic("memory attribute change on the direct map failed"); 9674 } 9675 9676 /* 9677 * Changes the specified virtual address range's memory type to that given by 9678 * the parameter "mode". The specified virtual address range must be 9679 * completely contained within either the direct map or the kernel map. If 9680 * the virtual address range is contained within the kernel map, then the 9681 * memory type for each of the corresponding ranges of the direct map is also 9682 * changed. (The corresponding ranges of the direct map are those ranges that 9683 * map the same physical pages as the specified virtual address range.) 
These 9684 * changes to the direct map are necessary because Intel describes the 9685 * behavior of their processors as "undefined" if two or more mappings to the 9686 * same physical page have different memory types. 9687 * 9688 * Returns zero if the change completed successfully, and either EINVAL or 9689 * ENOMEM if the change failed. Specifically, EINVAL is returned if some part 9690 * of the virtual address range was not mapped, and ENOMEM is returned if 9691 * there was insufficient memory available to complete the change. In the 9692 * latter case, the memory type may have been changed on some part of the 9693 * virtual address range or the direct map. 9694 */ 9695 int 9696 pmap_change_attr(vm_offset_t va, vm_size_t size, int mode) 9697 { 9698 int error; 9699 9700 PMAP_LOCK(kernel_pmap); 9701 error = pmap_change_props_locked(va, size, PROT_NONE, mode, 9702 MAPDEV_FLUSHCACHE); 9703 PMAP_UNLOCK(kernel_pmap); 9704 return (error); 9705 } 9706 9707 /* 9708 * Changes the specified virtual address range's protections to those 9709 * specified by "prot". Like pmap_change_attr(), protections for aliases 9710 * in the direct map are updated as well. Protections on aliasing mappings may 9711 * be a subset of the requested protections; for example, mappings in the direct 9712 * map are never executable. 9713 */ 9714 int 9715 pmap_change_prot(vm_offset_t va, vm_size_t size, vm_prot_t prot) 9716 { 9717 int error; 9718 9719 /* Only supported within the kernel map. */ 9720 if (va < VM_MIN_KERNEL_ADDRESS) 9721 return (EINVAL); 9722 9723 PMAP_LOCK(kernel_pmap); 9724 error = pmap_change_props_locked(va, size, prot, -1, 9725 MAPDEV_ASSERTVALID); 9726 PMAP_UNLOCK(kernel_pmap); 9727 return (error); 9728 } 9729 9730 static int 9731 pmap_change_props_locked(vm_offset_t va, vm_size_t size, vm_prot_t prot, 9732 int mode, int flags) 9733 { 9734 vm_offset_t base, offset, tmpva; 9735 vm_paddr_t pa_start, pa_end, pa_end1; 9736 pdp_entry_t *pdpe; 9737 pd_entry_t *pde, pde_bits, pde_mask; 9738 pt_entry_t *pte, pte_bits, pte_mask; 9739 int error; 9740 bool changed; 9741 9742 PMAP_LOCK_ASSERT(kernel_pmap, MA_OWNED); 9743 base = trunc_page(va); 9744 offset = va & PAGE_MASK; 9745 size = round_page(offset + size); 9746 9747 /* 9748 * Only supported on kernel virtual addresses, including the direct 9749 * map but excluding the recursive map. 9750 */ 9751 if (base < DMAP_MIN_ADDRESS) 9752 return (EINVAL); 9753 9754 /* 9755 * Construct our flag sets and masks. "bits" is the subset of 9756 * "mask" that will be set in each modified PTE. 9757 * 9758 * Mappings in the direct map are never allowed to be executable. 9759 */ 9760 pde_bits = pte_bits = 0; 9761 pde_mask = pte_mask = 0; 9762 if (mode != -1) { 9763 pde_bits |= pmap_cache_bits(kernel_pmap, mode, true); 9764 pde_mask |= X86_PG_PDE_CACHE; 9765 pte_bits |= pmap_cache_bits(kernel_pmap, mode, false); 9766 pte_mask |= X86_PG_PTE_CACHE; 9767 } 9768 if (prot != VM_PROT_NONE) { 9769 if ((prot & VM_PROT_WRITE) != 0) { 9770 pde_bits |= X86_PG_RW; 9771 pte_bits |= X86_PG_RW; 9772 } 9773 if ((prot & VM_PROT_EXECUTE) == 0 || 9774 va < VM_MIN_KERNEL_ADDRESS) { 9775 pde_bits |= pg_nx; 9776 pte_bits |= pg_nx; 9777 } 9778 pde_mask |= X86_PG_RW | pg_nx; 9779 pte_mask |= X86_PG_RW | pg_nx; 9780 } 9781 9782 /* 9783 * Pages that aren't mapped aren't supported. Also break down 2MB pages 9784 * into 4KB pages if required. 
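 * (1GB pages are likewise broken down into 2MB pages first.  This pass
 * only validates the range and performs the demotions; the actual
 * property updates happen in the second pass below.)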
9785 */ 9786 for (tmpva = base; tmpva < base + size; ) { 9787 pdpe = pmap_pdpe(kernel_pmap, tmpva); 9788 if (pdpe == NULL || *pdpe == 0) { 9789 KASSERT((flags & MAPDEV_ASSERTVALID) == 0, 9790 ("%s: addr %#lx is not mapped", __func__, tmpva)); 9791 return (EINVAL); 9792 } 9793 if (*pdpe & PG_PS) { 9794 /* 9795 * If the current 1GB page already has the required 9796 * properties, then we need not demote this page. Just 9797 * increment tmpva to the next 1GB page frame. 9798 */ 9799 if ((*pdpe & pde_mask) == pde_bits) { 9800 tmpva = trunc_1gpage(tmpva) + NBPDP; 9801 continue; 9802 } 9803 9804 /* 9805 * If the current offset aligns with a 1GB page frame 9806 * and there is at least 1GB left within the range, then 9807 * we need not break down this page into 2MB pages. 9808 */ 9809 if ((tmpva & PDPMASK) == 0 && 9810 tmpva + PDPMASK < base + size) { 9811 tmpva += NBPDP; 9812 continue; 9813 } 9814 if (!pmap_demote_pdpe(kernel_pmap, pdpe, tmpva)) 9815 return (ENOMEM); 9816 } 9817 pde = pmap_pdpe_to_pde(pdpe, tmpva); 9818 if (*pde == 0) { 9819 KASSERT((flags & MAPDEV_ASSERTVALID) == 0, 9820 ("%s: addr %#lx is not mapped", __func__, tmpva)); 9821 return (EINVAL); 9822 } 9823 if (*pde & PG_PS) { 9824 /* 9825 * If the current 2MB page already has the required 9826 * properties, then we need not demote this page. Just 9827 * increment tmpva to the next 2MB page frame. 9828 */ 9829 if ((*pde & pde_mask) == pde_bits) { 9830 tmpva = trunc_2mpage(tmpva) + NBPDR; 9831 continue; 9832 } 9833 9834 /* 9835 * If the current offset aligns with a 2MB page frame 9836 * and there is at least 2MB left within the range, then 9837 * we need not break down this page into 4KB pages. 9838 */ 9839 if ((tmpva & PDRMASK) == 0 && 9840 tmpva + PDRMASK < base + size) { 9841 tmpva += NBPDR; 9842 continue; 9843 } 9844 if (!pmap_demote_pde(kernel_pmap, pde, tmpva)) 9845 return (ENOMEM); 9846 } 9847 pte = pmap_pde_to_pte(pde, tmpva); 9848 if (*pte == 0) { 9849 KASSERT((flags & MAPDEV_ASSERTVALID) == 0, 9850 ("%s: addr %#lx is not mapped", __func__, tmpva)); 9851 return (EINVAL); 9852 } 9853 tmpva += PAGE_SIZE; 9854 } 9855 error = 0; 9856 9857 /* 9858 * Ok, all the pages exist, so run through them updating their 9859 * properties if required. 9860 */ 9861 changed = false; 9862 pa_start = pa_end = 0; 9863 for (tmpva = base; tmpva < base + size; ) { 9864 pdpe = pmap_pdpe(kernel_pmap, tmpva); 9865 if (*pdpe & PG_PS) { 9866 if ((*pdpe & pde_mask) != pde_bits) { 9867 pmap_pte_props(pdpe, pde_bits, pde_mask); 9868 changed = true; 9869 } 9870 if (tmpva >= VM_MIN_KERNEL_ADDRESS && 9871 (*pdpe & PG_PS_FRAME) < dmaplimit) { 9872 if (pa_start == pa_end) { 9873 /* Start physical address run. */ 9874 pa_start = *pdpe & PG_PS_FRAME; 9875 pa_end = pa_start + NBPDP; 9876 } else if (pa_end == (*pdpe & PG_PS_FRAME)) 9877 pa_end += NBPDP; 9878 else { 9879 /* Run ended, update direct map. */ 9880 error = pmap_change_props_locked( 9881 PHYS_TO_DMAP(pa_start), 9882 pa_end - pa_start, prot, mode, 9883 flags); 9884 if (error != 0) 9885 break; 9886 /* Start physical address run. */ 9887 pa_start = *pdpe & PG_PS_FRAME; 9888 pa_end = pa_start + NBPDP; 9889 } 9890 } 9891 tmpva = trunc_1gpage(tmpva) + NBPDP; 9892 continue; 9893 } 9894 pde = pmap_pdpe_to_pde(pdpe, tmpva); 9895 if (*pde & PG_PS) { 9896 if ((*pde & pde_mask) != pde_bits) { 9897 pmap_pte_props(pde, pde_bits, pde_mask); 9898 changed = true; 9899 } 9900 if (tmpva >= VM_MIN_KERNEL_ADDRESS && 9901 (*pde & PG_PS_FRAME) < dmaplimit) { 9902 if (pa_start == pa_end) { 9903 /* Start physical address run. 
*/ 9904 pa_start = *pde & PG_PS_FRAME; 9905 pa_end = pa_start + NBPDR; 9906 } else if (pa_end == (*pde & PG_PS_FRAME)) 9907 pa_end += NBPDR; 9908 else { 9909 /* Run ended, update direct map. */ 9910 error = pmap_change_props_locked( 9911 PHYS_TO_DMAP(pa_start), 9912 pa_end - pa_start, prot, mode, 9913 flags); 9914 if (error != 0) 9915 break; 9916 /* Start physical address run. */ 9917 pa_start = *pde & PG_PS_FRAME; 9918 pa_end = pa_start + NBPDR; 9919 } 9920 } 9921 tmpva = trunc_2mpage(tmpva) + NBPDR; 9922 } else { 9923 pte = pmap_pde_to_pte(pde, tmpva); 9924 if ((*pte & pte_mask) != pte_bits) { 9925 pmap_pte_props(pte, pte_bits, pte_mask); 9926 changed = true; 9927 } 9928 if (tmpva >= VM_MIN_KERNEL_ADDRESS && 9929 (*pte & PG_FRAME) < dmaplimit) { 9930 if (pa_start == pa_end) { 9931 /* Start physical address run. */ 9932 pa_start = *pte & PG_FRAME; 9933 pa_end = pa_start + PAGE_SIZE; 9934 } else if (pa_end == (*pte & PG_FRAME)) 9935 pa_end += PAGE_SIZE; 9936 else { 9937 /* Run ended, update direct map. */ 9938 error = pmap_change_props_locked( 9939 PHYS_TO_DMAP(pa_start), 9940 pa_end - pa_start, prot, mode, 9941 flags); 9942 if (error != 0) 9943 break; 9944 /* Start physical address run. */ 9945 pa_start = *pte & PG_FRAME; 9946 pa_end = pa_start + PAGE_SIZE; 9947 } 9948 } 9949 tmpva += PAGE_SIZE; 9950 } 9951 } 9952 if (error == 0 && pa_start != pa_end && pa_start < dmaplimit) { 9953 pa_end1 = MIN(pa_end, dmaplimit); 9954 if (pa_start != pa_end1) 9955 error = pmap_change_props_locked(PHYS_TO_DMAP(pa_start), 9956 pa_end1 - pa_start, prot, mode, flags); 9957 } 9958 9959 /* 9960 * Flush CPU caches if required to make sure any data isn't cached that 9961 * shouldn't be, etc. 9962 */ 9963 if (changed) { 9964 pmap_invalidate_range(kernel_pmap, base, tmpva); 9965 if ((flags & MAPDEV_FLUSHCACHE) != 0) 9966 pmap_invalidate_cache_range(base, tmpva); 9967 } 9968 return (error); 9969 } 9970 9971 /* 9972 * Demotes any mapping within the direct map region that covers more than the 9973 * specified range of physical addresses. This range's size must be a power 9974 * of two and its starting address must be a multiple of its size. Since the 9975 * demotion does not change any attributes of the mapping, a TLB invalidation 9976 * is not mandatory. The caller may, however, request a TLB invalidation. 
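 *
 * For example, a hypothetical caller wanting 4KB granularity for a
 * single page-aligned physical address "pa" (sketch only) could use:
 *
 *	pmap_demote_DMAP(pa, PAGE_SIZE, true);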
9977 */ 9978 void 9979 pmap_demote_DMAP(vm_paddr_t base, vm_size_t len, bool invalidate) 9980 { 9981 pdp_entry_t *pdpe; 9982 pd_entry_t *pde; 9983 vm_offset_t va; 9984 bool changed; 9985 9986 if (len == 0) 9987 return; 9988 KASSERT(powerof2(len), ("pmap_demote_DMAP: len is not a power of 2")); 9989 KASSERT((base & (len - 1)) == 0, 9990 ("pmap_demote_DMAP: base is not a multiple of len")); 9991 if (len < NBPDP && base < dmaplimit) { 9992 va = PHYS_TO_DMAP(base); 9993 changed = false; 9994 PMAP_LOCK(kernel_pmap); 9995 pdpe = pmap_pdpe(kernel_pmap, va); 9996 if ((*pdpe & X86_PG_V) == 0) 9997 panic("pmap_demote_DMAP: invalid PDPE"); 9998 if ((*pdpe & PG_PS) != 0) { 9999 if (!pmap_demote_pdpe(kernel_pmap, pdpe, va)) 10000 panic("pmap_demote_DMAP: PDPE failed"); 10001 changed = true; 10002 } 10003 if (len < NBPDR) { 10004 pde = pmap_pdpe_to_pde(pdpe, va); 10005 if ((*pde & X86_PG_V) == 0) 10006 panic("pmap_demote_DMAP: invalid PDE"); 10007 if ((*pde & PG_PS) != 0) { 10008 if (!pmap_demote_pde(kernel_pmap, pde, va)) 10009 panic("pmap_demote_DMAP: PDE failed"); 10010 changed = true; 10011 } 10012 } 10013 if (changed && invalidate) 10014 pmap_invalidate_page(kernel_pmap, va); 10015 PMAP_UNLOCK(kernel_pmap); 10016 } 10017 } 10018 10019 /* 10020 * Perform the pmap work for mincore(2). If the page is not both referenced and 10021 * modified by this pmap, returns its physical address so that the caller can 10022 * find other mappings. 10023 */ 10024 int 10025 pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *pap) 10026 { 10027 pdp_entry_t *pdpe; 10028 pd_entry_t *pdep; 10029 pt_entry_t pte, PG_A, PG_M, PG_RW, PG_V; 10030 vm_paddr_t pa; 10031 int val; 10032 10033 PG_A = pmap_accessed_bit(pmap); 10034 PG_M = pmap_modified_bit(pmap); 10035 PG_V = pmap_valid_bit(pmap); 10036 PG_RW = pmap_rw_bit(pmap); 10037 10038 PMAP_LOCK(pmap); 10039 pte = 0; 10040 pa = 0; 10041 val = 0; 10042 pdpe = pmap_pdpe(pmap, addr); 10043 if (pdpe == NULL) 10044 goto out; 10045 if ((*pdpe & PG_V) != 0) { 10046 if ((*pdpe & PG_PS) != 0) { 10047 pte = *pdpe; 10048 pa = ((pte & PG_PS_PDP_FRAME) | (addr & PDPMASK)) & 10049 PG_FRAME; 10050 val = MINCORE_PSIND(2); 10051 } else { 10052 pdep = pmap_pde(pmap, addr); 10053 if (pdep != NULL && (*pdep & PG_V) != 0) { 10054 if ((*pdep & PG_PS) != 0) { 10055 pte = *pdep; 10056 /* Compute the physical address of the 4KB page. */ 10057 pa = ((pte & PG_PS_FRAME) | (addr & 10058 PDRMASK)) & PG_FRAME; 10059 val = MINCORE_PSIND(1); 10060 } else { 10061 pte = *pmap_pde_to_pte(pdep, addr); 10062 pa = pte & PG_FRAME; 10063 val = 0; 10064 } 10065 } 10066 } 10067 } 10068 if ((pte & PG_V) != 0) { 10069 val |= MINCORE_INCORE; 10070 if ((pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) 10071 val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER; 10072 if ((pte & PG_A) != 0) 10073 val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER; 10074 } 10075 if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) != 10076 (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) && 10077 (pte & (PG_MANAGED | PG_V)) == (PG_MANAGED | PG_V)) { 10078 *pap = pa; 10079 } 10080 out: 10081 PMAP_UNLOCK(pmap); 10082 return (val); 10083 } 10084 10085 static uint64_t 10086 pmap_pcid_alloc(pmap_t pmap, struct pmap_pcid *pcidp) 10087 { 10088 uint32_t gen, new_gen, pcid_next; 10089 10090 CRITICAL_ASSERT(curthread); 10091 gen = PCPU_GET(pcid_gen); 10092 if (pcidp->pm_pcid == PMAP_PCID_KERN) 10093 return (pti ? 
0 : CR3_PCID_SAVE); 10094 if (pcidp->pm_gen == gen) 10095 return (CR3_PCID_SAVE); 10096 pcid_next = PCPU_GET(pcid_next); 10097 KASSERT((!pti && pcid_next <= PMAP_PCID_OVERMAX) || 10098 (pti && pcid_next <= PMAP_PCID_OVERMAX_KERN), 10099 ("cpu %d pcid_next %#x", PCPU_GET(cpuid), pcid_next)); 10100 if ((!pti && pcid_next == PMAP_PCID_OVERMAX) || 10101 (pti && pcid_next == PMAP_PCID_OVERMAX_KERN)) { 10102 new_gen = gen + 1; 10103 if (new_gen == 0) 10104 new_gen = 1; 10105 PCPU_SET(pcid_gen, new_gen); 10106 pcid_next = PMAP_PCID_KERN + 1; 10107 } else { 10108 new_gen = gen; 10109 } 10110 pcidp->pm_pcid = pcid_next; 10111 pcidp->pm_gen = new_gen; 10112 PCPU_SET(pcid_next, pcid_next + 1); 10113 return (0); 10114 } 10115 10116 static uint64_t 10117 pmap_pcid_alloc_checked(pmap_t pmap, struct pmap_pcid *pcidp) 10118 { 10119 uint64_t cached; 10120 10121 cached = pmap_pcid_alloc(pmap, pcidp); 10122 KASSERT(pcidp->pm_pcid < PMAP_PCID_OVERMAX, 10123 ("pmap %p cpu %d pcid %#x", pmap, PCPU_GET(cpuid), pcidp->pm_pcid)); 10124 KASSERT(pcidp->pm_pcid != PMAP_PCID_KERN || pmap == kernel_pmap, 10125 ("non-kernel pmap pmap %p cpu %d pcid %#x", 10126 pmap, PCPU_GET(cpuid), pcidp->pm_pcid)); 10127 return (cached); 10128 } 10129 10130 static void 10131 pmap_activate_sw_pti_post(struct thread *td, pmap_t pmap) 10132 { 10133 10134 PCPU_GET(tssp)->tss_rsp0 = pmap->pm_ucr3 != PMAP_NO_CR3 ? 10135 PCPU_GET(pti_rsp0) : (uintptr_t)td->td_md.md_stack_base; 10136 } 10137 10138 static void 10139 pmap_activate_sw_pcid_pti(struct thread *td, pmap_t pmap, u_int cpuid) 10140 { 10141 pmap_t old_pmap; 10142 struct pmap_pcid *pcidp, *old_pcidp; 10143 uint64_t cached, cr3, kcr3, ucr3; 10144 10145 KASSERT((read_rflags() & PSL_I) == 0, 10146 ("PCID needs interrupts disabled in pmap_activate_sw()")); 10147 10148 /* See the comment in pmap_invalidate_page_pcid(). 
*/ 10149 if (PCPU_GET(ucr3_load_mask) != PMAP_UCR3_NOMASK) { 10150 PCPU_SET(ucr3_load_mask, PMAP_UCR3_NOMASK); 10151 old_pmap = PCPU_GET(curpmap); 10152 MPASS(old_pmap->pm_ucr3 != PMAP_NO_CR3); 10153 old_pcidp = zpcpu_get_cpu(old_pmap->pm_pcidp, cpuid); 10154 old_pcidp->pm_gen = 0; 10155 } 10156 10157 pcidp = zpcpu_get_cpu(pmap->pm_pcidp, cpuid); 10158 cached = pmap_pcid_alloc_checked(pmap, pcidp); 10159 cr3 = rcr3(); 10160 if ((cr3 & ~CR3_PCID_MASK) != pmap->pm_cr3) 10161 load_cr3(pmap->pm_cr3 | pcidp->pm_pcid); 10162 PCPU_SET(curpmap, pmap); 10163 kcr3 = pmap->pm_cr3 | pcidp->pm_pcid; 10164 ucr3 = pmap->pm_ucr3 | pcidp->pm_pcid | PMAP_PCID_USER_PT; 10165 10166 if (!cached && pmap->pm_ucr3 != PMAP_NO_CR3) 10167 PCPU_SET(ucr3_load_mask, ~CR3_PCID_SAVE); 10168 10169 PCPU_SET(kcr3, kcr3 | CR3_PCID_SAVE); 10170 PCPU_SET(ucr3, ucr3 | CR3_PCID_SAVE); 10171 if (cached) 10172 counter_u64_add(pcid_save_cnt, 1); 10173 10174 pmap_activate_sw_pti_post(td, pmap); 10175 } 10176 10177 static void 10178 pmap_activate_sw_pcid_nopti(struct thread *td __unused, pmap_t pmap, 10179 u_int cpuid) 10180 { 10181 struct pmap_pcid *pcidp; 10182 uint64_t cached, cr3; 10183 10184 KASSERT((read_rflags() & PSL_I) == 0, 10185 ("PCID needs interrupts disabled in pmap_activate_sw()")); 10186 10187 pcidp = zpcpu_get_cpu(pmap->pm_pcidp, cpuid); 10188 cached = pmap_pcid_alloc_checked(pmap, pcidp); 10189 cr3 = rcr3(); 10190 if (!cached || (cr3 & ~CR3_PCID_MASK) != pmap->pm_cr3) 10191 load_cr3(pmap->pm_cr3 | pcidp->pm_pcid | cached); 10192 PCPU_SET(curpmap, pmap); 10193 if (cached) 10194 counter_u64_add(pcid_save_cnt, 1); 10195 } 10196 10197 static void 10198 pmap_activate_sw_nopcid_nopti(struct thread *td __unused, pmap_t pmap, 10199 u_int cpuid __unused) 10200 { 10201 10202 load_cr3(pmap->pm_cr3); 10203 PCPU_SET(curpmap, pmap); 10204 } 10205 10206 static void 10207 pmap_activate_sw_nopcid_pti(struct thread *td, pmap_t pmap, 10208 u_int cpuid __unused) 10209 { 10210 10211 pmap_activate_sw_nopcid_nopti(td, pmap, cpuid); 10212 PCPU_SET(kcr3, pmap->pm_cr3); 10213 PCPU_SET(ucr3, pmap->pm_ucr3); 10214 pmap_activate_sw_pti_post(td, pmap); 10215 } 10216 10217 DEFINE_IFUNC(static, void, pmap_activate_sw_mode, (struct thread *, pmap_t, 10218 u_int)) 10219 { 10220 10221 if (pmap_pcid_enabled && pti) 10222 return (pmap_activate_sw_pcid_pti); 10223 else if (pmap_pcid_enabled && !pti) 10224 return (pmap_activate_sw_pcid_nopti); 10225 else if (!pmap_pcid_enabled && pti) 10226 return (pmap_activate_sw_nopcid_pti); 10227 else /* if (!pmap_pcid_enabled && !pti) */ 10228 return (pmap_activate_sw_nopcid_nopti); 10229 } 10230 10231 void 10232 pmap_activate_sw(struct thread *td) 10233 { 10234 pmap_t oldpmap, pmap; 10235 u_int cpuid; 10236 10237 oldpmap = PCPU_GET(curpmap); 10238 pmap = vmspace_pmap(td->td_proc->p_vmspace); 10239 if (oldpmap == pmap) { 10240 if (cpu_vendor_id != CPU_VENDOR_INTEL) 10241 mfence(); 10242 return; 10243 } 10244 cpuid = PCPU_GET(cpuid); 10245 #ifdef SMP 10246 CPU_SET_ATOMIC(cpuid, &pmap->pm_active); 10247 #else 10248 CPU_SET(cpuid, &pmap->pm_active); 10249 #endif 10250 pmap_activate_sw_mode(td, pmap, cpuid); 10251 #ifdef SMP 10252 CPU_CLR_ATOMIC(cpuid, &oldpmap->pm_active); 10253 #else 10254 CPU_CLR(cpuid, &oldpmap->pm_active); 10255 #endif 10256 } 10257 10258 void 10259 pmap_activate(struct thread *td) 10260 { 10261 /* 10262 * invltlb_{invpcid,}_pcid_handler() is used to handle an 10263 * invalidate_all IPI, which checks for curpmap == 10264 * smp_tlb_pmap. 
The below sequence of operations has a 10265 * window where %CR3 is loaded with the new pmap's PML4 10266 * address, but the curpmap value has not yet been updated. 10267 * This causes the invltlb IPI handler, which is called 10268 * between the updates, to execute as a NOP, which leaves 10269 * stale TLB entries. 10270 * 10271 * Note that the most common use of pmap_activate_sw(), from 10272 * a context switch, is immune to this race, because 10273 * interrupts are disabled (while the thread lock is owned), 10274 * so the IPI is delayed until after curpmap is updated. Protect 10275 * other callers in a similar way, by disabling interrupts 10276 * around the %cr3 register reload and curpmap assignment. 10277 */ 10278 spinlock_enter(); 10279 pmap_activate_sw(td); 10280 spinlock_exit(); 10281 } 10282 10283 void 10284 pmap_activate_boot(pmap_t pmap) 10285 { 10286 uint64_t kcr3; 10287 u_int cpuid; 10288 10289 /* 10290 * kernel_pmap must be never deactivated, and we ensure that 10291 * by never activating it at all. 10292 */ 10293 MPASS(pmap != kernel_pmap); 10294 10295 cpuid = PCPU_GET(cpuid); 10296 #ifdef SMP 10297 CPU_SET_ATOMIC(cpuid, &pmap->pm_active); 10298 #else 10299 CPU_SET(cpuid, &pmap->pm_active); 10300 #endif 10301 PCPU_SET(curpmap, pmap); 10302 if (pti) { 10303 kcr3 = pmap->pm_cr3; 10304 if (pmap_pcid_enabled) 10305 kcr3 |= pmap_get_pcid(pmap) | CR3_PCID_SAVE; 10306 } else { 10307 kcr3 = PMAP_NO_CR3; 10308 } 10309 PCPU_SET(kcr3, kcr3); 10310 PCPU_SET(ucr3, PMAP_NO_CR3); 10311 } 10312 10313 void 10314 pmap_active_cpus(pmap_t pmap, cpuset_t *res) 10315 { 10316 *res = pmap->pm_active; 10317 } 10318 10319 void 10320 pmap_sync_icache(pmap_t pm, vm_offset_t va, vm_size_t sz) 10321 { 10322 } 10323 10324 /* 10325 * Increase the starting virtual address of the given mapping if a 10326 * different alignment might result in more superpage mappings. 
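 *
 * For example (illustrative numbers): for a 4MB mapping of an object at
 * offset 0, rounding *addr up to the next 2MB boundary allows both 2MB
 * halves of the mapping to be served by superpage mappings.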
10327 */ 10328 void 10329 pmap_align_superpage(vm_object_t object, vm_ooffset_t offset, 10330 vm_offset_t *addr, vm_size_t size) 10331 { 10332 vm_offset_t superpage_offset; 10333 10334 if (size < NBPDR) 10335 return; 10336 if (object != NULL && (object->flags & OBJ_COLORED) != 0) 10337 offset += ptoa(object->pg_color); 10338 superpage_offset = offset & PDRMASK; 10339 if (size - ((NBPDR - superpage_offset) & PDRMASK) < NBPDR || 10340 (*addr & PDRMASK) == superpage_offset) 10341 return; 10342 if ((*addr & PDRMASK) < superpage_offset) 10343 *addr = (*addr & ~PDRMASK) + superpage_offset; 10344 else 10345 *addr = ((*addr + PDRMASK) & ~PDRMASK) + superpage_offset; 10346 } 10347 10348 #ifdef INVARIANTS 10349 static unsigned long num_dirty_emulations; 10350 SYSCTL_ULONG(_vm_pmap, OID_AUTO, num_dirty_emulations, CTLFLAG_RW, 10351 &num_dirty_emulations, 0, NULL); 10352 10353 static unsigned long num_accessed_emulations; 10354 SYSCTL_ULONG(_vm_pmap, OID_AUTO, num_accessed_emulations, CTLFLAG_RW, 10355 &num_accessed_emulations, 0, NULL); 10356 10357 static unsigned long num_superpage_accessed_emulations; 10358 SYSCTL_ULONG(_vm_pmap, OID_AUTO, num_superpage_accessed_emulations, CTLFLAG_RW, 10359 &num_superpage_accessed_emulations, 0, NULL); 10360 10361 static unsigned long ad_emulation_superpage_promotions; 10362 SYSCTL_ULONG(_vm_pmap, OID_AUTO, ad_emulation_superpage_promotions, CTLFLAG_RW, 10363 &ad_emulation_superpage_promotions, 0, NULL); 10364 #endif /* INVARIANTS */ 10365 10366 int 10367 pmap_emulate_accessed_dirty(pmap_t pmap, vm_offset_t va, int ftype) 10368 { 10369 int rv; 10370 struct rwlock *lock; 10371 #if VM_NRESERVLEVEL > 0 10372 vm_page_t m, mpte; 10373 #endif 10374 pd_entry_t *pde; 10375 pt_entry_t *pte, PG_A, PG_M, PG_RW, PG_V; 10376 10377 KASSERT(ftype == VM_PROT_READ || ftype == VM_PROT_WRITE, 10378 ("pmap_emulate_accessed_dirty: invalid fault type %d", ftype)); 10379 10380 if (!pmap_emulate_ad_bits(pmap)) 10381 return (-1); 10382 10383 PG_A = pmap_accessed_bit(pmap); 10384 PG_M = pmap_modified_bit(pmap); 10385 PG_V = pmap_valid_bit(pmap); 10386 PG_RW = pmap_rw_bit(pmap); 10387 10388 rv = -1; 10389 lock = NULL; 10390 PMAP_LOCK(pmap); 10391 10392 pde = pmap_pde(pmap, va); 10393 if (pde == NULL || (*pde & PG_V) == 0) 10394 goto done; 10395 10396 if ((*pde & PG_PS) != 0) { 10397 if (ftype == VM_PROT_READ) { 10398 #ifdef INVARIANTS 10399 atomic_add_long(&num_superpage_accessed_emulations, 1); 10400 #endif 10401 *pde |= PG_A; 10402 rv = 0; 10403 } 10404 goto done; 10405 } 10406 10407 pte = pmap_pde_to_pte(pde, va); 10408 if ((*pte & PG_V) == 0) 10409 goto done; 10410 10411 if (ftype == VM_PROT_WRITE) { 10412 if ((*pte & PG_RW) == 0) 10413 goto done; 10414 /* 10415 * Set the modified and accessed bits simultaneously. 10416 * 10417 * Intel EPT PTEs that do software emulation of A/D bits map 10418 * PG_A and PG_M to EPT_PG_READ and EPT_PG_WRITE respectively. 10419 * An EPT misconfiguration is triggered if the PTE is writable 10420 * but not readable (WR=10). This is avoided by setting PG_A 10421 * and PG_M simultaneously. 
10422 */ 10423 *pte |= PG_M | PG_A; 10424 } else { 10425 *pte |= PG_A; 10426 } 10427 10428 #if VM_NRESERVLEVEL > 0 10429 /* try to promote the mapping */ 10430 if (va < VM_MAXUSER_ADDRESS) 10431 mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME); 10432 else 10433 mpte = NULL; 10434 10435 m = PHYS_TO_VM_PAGE(*pte & PG_FRAME); 10436 10437 if ((mpte == NULL || mpte->ref_count == NPTEPG) && 10438 (m->flags & PG_FICTITIOUS) == 0 && 10439 vm_reserv_level_iffullpop(m) == 0 && 10440 pmap_promote_pde(pmap, pde, va, mpte, &lock)) { 10441 #ifdef INVARIANTS 10442 atomic_add_long(&ad_emulation_superpage_promotions, 1); 10443 #endif 10444 } 10445 #endif 10446 10447 #ifdef INVARIANTS 10448 if (ftype == VM_PROT_WRITE) 10449 atomic_add_long(&num_dirty_emulations, 1); 10450 else 10451 atomic_add_long(&num_accessed_emulations, 1); 10452 #endif 10453 rv = 0; /* success */ 10454 done: 10455 if (lock != NULL) 10456 rw_wunlock(lock); 10457 PMAP_UNLOCK(pmap); 10458 return (rv); 10459 } 10460 10461 void 10462 pmap_get_mapping(pmap_t pmap, vm_offset_t va, uint64_t *ptr, int *num) 10463 { 10464 pml4_entry_t *pml4; 10465 pdp_entry_t *pdp; 10466 pd_entry_t *pde; 10467 pt_entry_t *pte, PG_V; 10468 int idx; 10469 10470 idx = 0; 10471 PG_V = pmap_valid_bit(pmap); 10472 PMAP_LOCK(pmap); 10473 10474 pml4 = pmap_pml4e(pmap, va); 10475 if (pml4 == NULL) 10476 goto done; 10477 ptr[idx++] = *pml4; 10478 if ((*pml4 & PG_V) == 0) 10479 goto done; 10480 10481 pdp = pmap_pml4e_to_pdpe(pml4, va); 10482 ptr[idx++] = *pdp; 10483 if ((*pdp & PG_V) == 0 || (*pdp & PG_PS) != 0) 10484 goto done; 10485 10486 pde = pmap_pdpe_to_pde(pdp, va); 10487 ptr[idx++] = *pde; 10488 if ((*pde & PG_V) == 0 || (*pde & PG_PS) != 0) 10489 goto done; 10490 10491 pte = pmap_pde_to_pte(pde, va); 10492 ptr[idx++] = *pte; 10493 10494 done: 10495 PMAP_UNLOCK(pmap); 10496 *num = idx; 10497 } 10498 10499 /** 10500 * Get the kernel virtual address of a set of physical pages. If there are 10501 * physical addresses not covered by the DMAP perform a transient mapping 10502 * that will be removed when calling pmap_unmap_io_transient. 10503 * 10504 * \param page The pages the caller wishes to obtain the virtual 10505 * address on the kernel memory map. 10506 * \param vaddr On return contains the kernel virtual memory address 10507 * of the pages passed in the page parameter. 10508 * \param count Number of pages passed in. 10509 * \param can_fault true if the thread using the mapped pages can take 10510 * page faults, false otherwise. 10511 * 10512 * \returns true if the caller must call pmap_unmap_io_transient when 10513 * finished or false otherwise. 10514 * 10515 */ 10516 bool 10517 pmap_map_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count, 10518 bool can_fault) 10519 { 10520 vm_paddr_t paddr; 10521 bool needs_mapping; 10522 int error __unused, i; 10523 10524 /* 10525 * Allocate any KVA space that we need, this is done in a separate 10526 * loop to prevent calling vmem_alloc while pinned. 
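 * (sched_pin() is only called in the mapping loop below, so the
 * potentially-sleeping M_WAITOK allocation here is not performed while
 * this function holds the thread pinned.)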
10527 */ 10528 needs_mapping = false; 10529 for (i = 0; i < count; i++) { 10530 paddr = VM_PAGE_TO_PHYS(page[i]); 10531 if (__predict_false(paddr >= dmaplimit)) { 10532 error = vmem_alloc(kernel_arena, PAGE_SIZE, 10533 M_BESTFIT | M_WAITOK, &vaddr[i]); 10534 KASSERT(error == 0, ("vmem_alloc failed: %d", error)); 10535 needs_mapping = true; 10536 } else { 10537 vaddr[i] = PHYS_TO_DMAP(paddr); 10538 } 10539 } 10540 10541 /* Exit early if everything is covered by the DMAP */ 10542 if (!needs_mapping) 10543 return (false); 10544 10545 /* 10546 * NB: The sequence of updating a page table followed by accesses 10547 * to the corresponding pages used in the !DMAP case is subject to 10548 * the situation described in the "AMD64 Architecture Programmer's 10549 * Manual Volume 2: System Programming" rev. 3.23, "7.3.1 Special 10550 * Coherency Considerations". Therefore, issuing the INVLPG right 10551 * after modifying the PTE bits is crucial. 10552 */ 10553 if (!can_fault) 10554 sched_pin(); 10555 for (i = 0; i < count; i++) { 10556 paddr = VM_PAGE_TO_PHYS(page[i]); 10557 if (paddr >= dmaplimit) { 10558 if (can_fault) { 10559 /* 10560 * Slow path, since we can get page faults 10561 * while mappings are active don't pin the 10562 * thread to the CPU and instead add a global 10563 * mapping visible to all CPUs. 10564 */ 10565 pmap_qenter(vaddr[i], &page[i], 1); 10566 } else { 10567 pmap_kenter_attr(vaddr[i], paddr, 10568 page[i]->md.pat_mode); 10569 pmap_invlpg(kernel_pmap, vaddr[i]); 10570 } 10571 } 10572 } 10573 10574 return (needs_mapping); 10575 } 10576 10577 void 10578 pmap_unmap_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count, 10579 bool can_fault) 10580 { 10581 vm_paddr_t paddr; 10582 int i; 10583 10584 if (!can_fault) 10585 sched_unpin(); 10586 for (i = 0; i < count; i++) { 10587 paddr = VM_PAGE_TO_PHYS(page[i]); 10588 if (paddr >= dmaplimit) { 10589 if (can_fault) 10590 pmap_qremove(vaddr[i], 1); 10591 vmem_free(kernel_arena, vaddr[i], PAGE_SIZE); 10592 } 10593 } 10594 } 10595 10596 vm_offset_t 10597 pmap_quick_enter_page(vm_page_t m) 10598 { 10599 vm_paddr_t paddr; 10600 10601 paddr = VM_PAGE_TO_PHYS(m); 10602 if (paddr < dmaplimit) 10603 return (PHYS_TO_DMAP(paddr)); 10604 mtx_lock_spin(&qframe_mtx); 10605 KASSERT(*vtopte(qframe) == 0, ("qframe busy")); 10606 10607 /* 10608 * Since qframe is exclusively mapped by us, and we do not set 10609 * PG_G, we can use INVLPG here. 10610 */ 10611 invlpg(qframe); 10612 10613 pte_store(vtopte(qframe), paddr | X86_PG_RW | X86_PG_V | X86_PG_A | 10614 X86_PG_M | pmap_cache_bits(kernel_pmap, m->md.pat_mode, false)); 10615 return (qframe); 10616 } 10617 10618 void 10619 pmap_quick_remove_page(vm_offset_t addr) 10620 { 10621 10622 if (addr != qframe) 10623 return; 10624 pte_store(vtopte(qframe), 0); 10625 mtx_unlock_spin(&qframe_mtx); 10626 } 10627 10628 /* 10629 * Pdp pages from the large map are managed differently from either 10630 * kernel or user page table pages. They are permanently allocated at 10631 * initialization time, and their reference count is permanently set to 10632 * zero. The pml4 entries pointing to those pages are copied into 10633 * each allocated pmap. 10634 * 10635 * In contrast, pd and pt pages are managed like user page table 10636 * pages. They are dynamically allocated, and their reference count 10637 * represents the number of valid entries within the page. 
10638 */ 10639 static vm_page_t 10640 pmap_large_map_getptp_unlocked(void) 10641 { 10642 return (pmap_alloc_pt_page(kernel_pmap, 0, VM_ALLOC_ZERO)); 10643 } 10644 10645 static vm_page_t 10646 pmap_large_map_getptp(void) 10647 { 10648 vm_page_t m; 10649 10650 PMAP_LOCK_ASSERT(kernel_pmap, MA_OWNED); 10651 m = pmap_large_map_getptp_unlocked(); 10652 if (m == NULL) { 10653 PMAP_UNLOCK(kernel_pmap); 10654 vm_wait(NULL); 10655 PMAP_LOCK(kernel_pmap); 10656 /* Callers retry. */ 10657 } 10658 return (m); 10659 } 10660 10661 static pdp_entry_t * 10662 pmap_large_map_pdpe(vm_offset_t va) 10663 { 10664 vm_pindex_t pml4_idx; 10665 vm_paddr_t mphys; 10666 10667 pml4_idx = pmap_pml4e_index(va); 10668 KASSERT(LMSPML4I <= pml4_idx && pml4_idx < LMSPML4I + lm_ents, 10669 ("pmap_large_map_pdpe: va %#jx out of range idx %#jx LMSPML4I " 10670 "%#jx lm_ents %d", 10671 (uintmax_t)va, (uintmax_t)pml4_idx, LMSPML4I, lm_ents)); 10672 KASSERT((kernel_pml4[pml4_idx] & X86_PG_V) != 0, 10673 ("pmap_large_map_pdpe: invalid pml4 for va %#jx idx %#jx " 10674 "LMSPML4I %#jx lm_ents %d", 10675 (uintmax_t)va, (uintmax_t)pml4_idx, LMSPML4I, lm_ents)); 10676 mphys = kernel_pml4[pml4_idx] & PG_FRAME; 10677 return ((pdp_entry_t *)PHYS_TO_DMAP(mphys) + pmap_pdpe_index(va)); 10678 } 10679 10680 static pd_entry_t * 10681 pmap_large_map_pde(vm_offset_t va) 10682 { 10683 pdp_entry_t *pdpe; 10684 vm_page_t m; 10685 vm_paddr_t mphys; 10686 10687 retry: 10688 pdpe = pmap_large_map_pdpe(va); 10689 if (*pdpe == 0) { 10690 m = pmap_large_map_getptp(); 10691 if (m == NULL) 10692 goto retry; 10693 mphys = VM_PAGE_TO_PHYS(m); 10694 *pdpe = mphys | X86_PG_A | X86_PG_RW | X86_PG_V | pg_nx; 10695 } else { 10696 MPASS((*pdpe & X86_PG_PS) == 0); 10697 mphys = *pdpe & PG_FRAME; 10698 } 10699 return ((pd_entry_t *)PHYS_TO_DMAP(mphys) + pmap_pde_index(va)); 10700 } 10701 10702 static pt_entry_t * 10703 pmap_large_map_pte(vm_offset_t va) 10704 { 10705 pd_entry_t *pde; 10706 vm_page_t m; 10707 vm_paddr_t mphys; 10708 10709 retry: 10710 pde = pmap_large_map_pde(va); 10711 if (*pde == 0) { 10712 m = pmap_large_map_getptp(); 10713 if (m == NULL) 10714 goto retry; 10715 mphys = VM_PAGE_TO_PHYS(m); 10716 *pde = mphys | X86_PG_A | X86_PG_RW | X86_PG_V | pg_nx; 10717 PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pde))->ref_count++; 10718 } else { 10719 MPASS((*pde & X86_PG_PS) == 0); 10720 mphys = *pde & PG_FRAME; 10721 } 10722 return ((pt_entry_t *)PHYS_TO_DMAP(mphys) + pmap_pte_index(va)); 10723 } 10724 10725 static vm_paddr_t 10726 pmap_large_map_kextract(vm_offset_t va) 10727 { 10728 pdp_entry_t *pdpe, pdp; 10729 pd_entry_t *pde, pd; 10730 pt_entry_t *pte, pt; 10731 10732 KASSERT(PMAP_ADDRESS_IN_LARGEMAP(va), 10733 ("not largemap range %#lx", (u_long)va)); 10734 pdpe = pmap_large_map_pdpe(va); 10735 pdp = *pdpe; 10736 KASSERT((pdp & X86_PG_V) != 0, 10737 ("invalid pdp va %#lx pdpe %#lx pdp %#lx", va, 10738 (u_long)pdpe, pdp)); 10739 if ((pdp & X86_PG_PS) != 0) { 10740 KASSERT((amd_feature & AMDID_PAGE1GB) != 0, 10741 ("no 1G pages, va %#lx pdpe %#lx pdp %#lx", va, 10742 (u_long)pdpe, pdp)); 10743 return ((pdp & PG_PS_PDP_FRAME) | (va & PDPMASK)); 10744 } 10745 pde = pmap_pdpe_to_pde(pdpe, va); 10746 pd = *pde; 10747 KASSERT((pd & X86_PG_V) != 0, 10748 ("invalid pd va %#lx pde %#lx pd %#lx", va, (u_long)pde, pd)); 10749 if ((pd & X86_PG_PS) != 0) 10750 return ((pd & PG_PS_FRAME) | (va & PDRMASK)); 10751 pte = pmap_pde_to_pte(pde, va); 10752 pt = *pte; 10753 KASSERT((pt & X86_PG_V) != 0, 10754 ("invalid pte va %#lx pte %#lx pt %#lx", va, (u_long)pte, pt)); 
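/* A 4KB mapping: combine the 4KB frame with the page offset. */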
10755 return ((pt & PG_FRAME) | (va & PAGE_MASK)); 10756 } 10757 10758 static int 10759 pmap_large_map_getva(vm_size_t len, vm_offset_t align, vm_offset_t phase, 10760 vmem_addr_t *vmem_res) 10761 { 10762 10763 /* 10764 * Large mappings are all but static. Consequently, there 10765 * is no point in waiting for an earlier allocation to be 10766 * freed. 10767 */ 10768 return (vmem_xalloc(large_vmem, len, align, phase, 0, VMEM_ADDR_MIN, 10769 VMEM_ADDR_MAX, M_NOWAIT | M_BESTFIT, vmem_res)); 10770 } 10771 10772 int 10773 pmap_large_map(vm_paddr_t spa, vm_size_t len, void **addr, 10774 vm_memattr_t mattr) 10775 { 10776 pdp_entry_t *pdpe; 10777 pd_entry_t *pde; 10778 pt_entry_t *pte; 10779 vm_offset_t va, inc; 10780 vmem_addr_t vmem_res; 10781 vm_paddr_t pa; 10782 int error; 10783 10784 if (len == 0 || spa + len < spa) 10785 return (EINVAL); 10786 10787 /* See if DMAP can serve. */ 10788 if (spa + len <= dmaplimit) { 10789 va = PHYS_TO_DMAP(spa); 10790 *addr = (void *)va; 10791 return (pmap_change_attr(va, len, mattr)); 10792 } 10793 10794 /* 10795 * No, allocate KVA. Fit the address with best possible 10796 * alignment for superpages. Fall back to worse align if 10797 * failed. 10798 */ 10799 error = ENOMEM; 10800 if ((amd_feature & AMDID_PAGE1GB) != 0 && rounddown2(spa + len, 10801 NBPDP) >= roundup2(spa, NBPDP) + NBPDP) 10802 error = pmap_large_map_getva(len, NBPDP, spa & PDPMASK, 10803 &vmem_res); 10804 if (error != 0 && rounddown2(spa + len, NBPDR) >= roundup2(spa, 10805 NBPDR) + NBPDR) 10806 error = pmap_large_map_getva(len, NBPDR, spa & PDRMASK, 10807 &vmem_res); 10808 if (error != 0) 10809 error = pmap_large_map_getva(len, PAGE_SIZE, 0, &vmem_res); 10810 if (error != 0) 10811 return (error); 10812 10813 /* 10814 * Fill pagetable. PG_M is not pre-set, we scan modified bits 10815 * in the pagetable to minimize flushing. No need to 10816 * invalidate TLB, since we only update invalid entries. 
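 * (The PG_M bits set by later stores through this mapping are used by
 * pmap_large_map_wb_large() to decide which ranges need a cache
 * write-back.)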
10817 */ 10818 PMAP_LOCK(kernel_pmap); 10819 for (pa = spa, va = vmem_res; len > 0; pa += inc, va += inc, 10820 len -= inc) { 10821 if ((amd_feature & AMDID_PAGE1GB) != 0 && len >= NBPDP && 10822 (pa & PDPMASK) == 0 && (va & PDPMASK) == 0) { 10823 pdpe = pmap_large_map_pdpe(va); 10824 MPASS(*pdpe == 0); 10825 *pdpe = pa | pg_g | X86_PG_PS | X86_PG_RW | 10826 X86_PG_V | X86_PG_A | pg_nx | 10827 pmap_cache_bits(kernel_pmap, mattr, true); 10828 inc = NBPDP; 10829 } else if (len >= NBPDR && (pa & PDRMASK) == 0 && 10830 (va & PDRMASK) == 0) { 10831 pde = pmap_large_map_pde(va); 10832 MPASS(*pde == 0); 10833 *pde = pa | pg_g | X86_PG_PS | X86_PG_RW | 10834 X86_PG_V | X86_PG_A | pg_nx | 10835 pmap_cache_bits(kernel_pmap, mattr, true); 10836 PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pde))-> 10837 ref_count++; 10838 inc = NBPDR; 10839 } else { 10840 pte = pmap_large_map_pte(va); 10841 MPASS(*pte == 0); 10842 *pte = pa | pg_g | X86_PG_RW | X86_PG_V | 10843 X86_PG_A | pg_nx | pmap_cache_bits(kernel_pmap, 10844 mattr, false); 10845 PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pte))-> 10846 ref_count++; 10847 inc = PAGE_SIZE; 10848 } 10849 } 10850 PMAP_UNLOCK(kernel_pmap); 10851 MPASS(len == 0); 10852 10853 *addr = (void *)vmem_res; 10854 return (0); 10855 } 10856 10857 void 10858 pmap_large_unmap(void *svaa, vm_size_t len) 10859 { 10860 vm_offset_t sva, va; 10861 vm_size_t inc; 10862 pdp_entry_t *pdpe, pdp; 10863 pd_entry_t *pde, pd; 10864 pt_entry_t *pte; 10865 vm_page_t m; 10866 struct spglist spgf; 10867 10868 sva = (vm_offset_t)svaa; 10869 if (len == 0 || sva + len < sva || (sva >= DMAP_MIN_ADDRESS && 10870 sva + len <= DMAP_MIN_ADDRESS + dmaplimit)) 10871 return; 10872 10873 SLIST_INIT(&spgf); 10874 KASSERT(PMAP_ADDRESS_IN_LARGEMAP(sva) && 10875 PMAP_ADDRESS_IN_LARGEMAP(sva + len - 1), 10876 ("not largemap range %#lx %#lx", (u_long)svaa, (u_long)svaa + len)); 10877 PMAP_LOCK(kernel_pmap); 10878 for (va = sva; va < sva + len; va += inc) { 10879 pdpe = pmap_large_map_pdpe(va); 10880 pdp = *pdpe; 10881 KASSERT((pdp & X86_PG_V) != 0, 10882 ("invalid pdp va %#lx pdpe %#lx pdp %#lx", va, 10883 (u_long)pdpe, pdp)); 10884 if ((pdp & X86_PG_PS) != 0) { 10885 KASSERT((amd_feature & AMDID_PAGE1GB) != 0, 10886 ("no 1G pages, va %#lx pdpe %#lx pdp %#lx", va, 10887 (u_long)pdpe, pdp)); 10888 KASSERT((va & PDPMASK) == 0, 10889 ("PDPMASK bit set, va %#lx pdpe %#lx pdp %#lx", va, 10890 (u_long)pdpe, pdp)); 10891 KASSERT(va + NBPDP <= sva + len, 10892 ("unmap covers partial 1GB page, sva %#lx va %#lx " 10893 "pdpe %#lx pdp %#lx len %#lx", sva, va, 10894 (u_long)pdpe, pdp, len)); 10895 *pdpe = 0; 10896 inc = NBPDP; 10897 continue; 10898 } 10899 pde = pmap_pdpe_to_pde(pdpe, va); 10900 pd = *pde; 10901 KASSERT((pd & X86_PG_V) != 0, 10902 ("invalid pd va %#lx pde %#lx pd %#lx", va, 10903 (u_long)pde, pd)); 10904 if ((pd & X86_PG_PS) != 0) { 10905 KASSERT((va & PDRMASK) == 0, 10906 ("PDRMASK bit set, va %#lx pde %#lx pd %#lx", va, 10907 (u_long)pde, pd)); 10908 KASSERT(va + NBPDR <= sva + len, 10909 ("unmap covers partial 2MB page, sva %#lx va %#lx " 10910 "pde %#lx pd %#lx len %#lx", sva, va, (u_long)pde, 10911 pd, len)); 10912 pde_store(pde, 0); 10913 inc = NBPDR; 10914 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pde)); 10915 m->ref_count--; 10916 if (m->ref_count == 0) { 10917 *pdpe = 0; 10918 SLIST_INSERT_HEAD(&spgf, m, plinks.s.ss); 10919 } 10920 continue; 10921 } 10922 pte = pmap_pde_to_pte(pde, va); 10923 KASSERT((*pte & X86_PG_V) != 0, 10924 ("invalid pte va %#lx pte %#lx pt %#lx", va, 10925 (u_long)pte, *pte)); 
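/*
 * A 4KB mapping: clear the PTE and drop the reference that it held on
 * its page table page (and, transitively, on the page directory page if
 * the page table page becomes empty).
 */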
10926 pte_clear(pte); 10927 inc = PAGE_SIZE; 10928 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pte)); 10929 m->ref_count--; 10930 if (m->ref_count == 0) { 10931 *pde = 0; 10932 SLIST_INSERT_HEAD(&spgf, m, plinks.s.ss); 10933 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pde)); 10934 m->ref_count--; 10935 if (m->ref_count == 0) { 10936 *pdpe = 0; 10937 SLIST_INSERT_HEAD(&spgf, m, plinks.s.ss); 10938 } 10939 } 10940 } 10941 pmap_invalidate_range(kernel_pmap, sva, sva + len); 10942 PMAP_UNLOCK(kernel_pmap); 10943 vm_page_free_pages_toq(&spgf, false); 10944 vmem_free(large_vmem, sva, len); 10945 } 10946 10947 static void 10948 pmap_large_map_wb_fence_mfence(void) 10949 { 10950 10951 mfence(); 10952 } 10953 10954 static void 10955 pmap_large_map_wb_fence_atomic(void) 10956 { 10957 10958 atomic_thread_fence_seq_cst(); 10959 } 10960 10961 static void 10962 pmap_large_map_wb_fence_nop(void) 10963 { 10964 } 10965 10966 DEFINE_IFUNC(static, void, pmap_large_map_wb_fence, (void)) 10967 { 10968 10969 if (cpu_vendor_id != CPU_VENDOR_INTEL) 10970 return (pmap_large_map_wb_fence_mfence); 10971 else if ((cpu_stdext_feature & (CPUID_STDEXT_CLWB | 10972 CPUID_STDEXT_CLFLUSHOPT)) == 0) 10973 return (pmap_large_map_wb_fence_atomic); 10974 else 10975 /* clflush is strongly enough ordered */ 10976 return (pmap_large_map_wb_fence_nop); 10977 } 10978 10979 static void 10980 pmap_large_map_flush_range_clwb(vm_offset_t va, vm_size_t len) 10981 { 10982 10983 for (; len > 0; len -= cpu_clflush_line_size, 10984 va += cpu_clflush_line_size) 10985 clwb(va); 10986 } 10987 10988 static void 10989 pmap_large_map_flush_range_clflushopt(vm_offset_t va, vm_size_t len) 10990 { 10991 10992 for (; len > 0; len -= cpu_clflush_line_size, 10993 va += cpu_clflush_line_size) 10994 clflushopt(va); 10995 } 10996 10997 static void 10998 pmap_large_map_flush_range_clflush(vm_offset_t va, vm_size_t len) 10999 { 11000 11001 for (; len > 0; len -= cpu_clflush_line_size, 11002 va += cpu_clflush_line_size) 11003 clflush(va); 11004 } 11005 11006 static void 11007 pmap_large_map_flush_range_nop(vm_offset_t sva __unused, vm_size_t len __unused) 11008 { 11009 } 11010 11011 DEFINE_IFUNC(static, void, pmap_large_map_flush_range, (vm_offset_t, vm_size_t)) 11012 { 11013 11014 if ((cpu_stdext_feature & CPUID_STDEXT_CLWB) != 0) 11015 return (pmap_large_map_flush_range_clwb); 11016 else if ((cpu_stdext_feature & CPUID_STDEXT_CLFLUSHOPT) != 0) 11017 return (pmap_large_map_flush_range_clflushopt); 11018 else if ((cpu_feature & CPUID_CLFSH) != 0) 11019 return (pmap_large_map_flush_range_clflush); 11020 else 11021 return (pmap_large_map_flush_range_nop); 11022 } 11023 11024 static void 11025 pmap_large_map_wb_large(vm_offset_t sva, vm_offset_t eva) 11026 { 11027 volatile u_long *pe; 11028 u_long p; 11029 vm_offset_t va; 11030 vm_size_t inc; 11031 bool seen_other; 11032 11033 for (va = sva; va < eva; va += inc) { 11034 inc = 0; 11035 if ((amd_feature & AMDID_PAGE1GB) != 0) { 11036 pe = (volatile u_long *)pmap_large_map_pdpe(va); 11037 p = *pe; 11038 if ((p & X86_PG_PS) != 0) 11039 inc = NBPDP; 11040 } 11041 if (inc == 0) { 11042 pe = (volatile u_long *)pmap_large_map_pde(va); 11043 p = *pe; 11044 if ((p & X86_PG_PS) != 0) 11045 inc = NBPDR; 11046 } 11047 if (inc == 0) { 11048 pe = (volatile u_long *)pmap_large_map_pte(va); 11049 p = *pe; 11050 inc = PAGE_SIZE; 11051 } 11052 seen_other = false; 11053 for (;;) { 11054 if ((p & X86_PG_AVAIL1) != 0) { 11055 /* 11056 * Spin-wait for the end of a parallel 11057 * write-back. 
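 * (X86_PG_AVAIL1 serves as a software "write-back in progress"
 * marker: it is set below before flushing a range and cleared
 * once the flush completes.)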
11058 */ 11059 cpu_spinwait(); 11060 p = *pe; 11061 11062 /* 11063 * If we saw other write-back 11064 * occurring, we cannot rely on PG_M to 11065 * indicate state of the cache. The 11066 * PG_M bit is cleared before the 11067 * flush to avoid ignoring new writes, 11068 * and writes which are relevant for 11069 * us might happen after. 11070 */ 11071 seen_other = true; 11072 continue; 11073 } 11074 11075 if ((p & X86_PG_M) != 0 || seen_other) { 11076 if (!atomic_fcmpset_long(pe, &p, 11077 (p & ~X86_PG_M) | X86_PG_AVAIL1)) 11078 /* 11079 * If we saw PG_M without 11080 * PG_AVAIL1, and then on the 11081 * next attempt we do not 11082 * observe either PG_M or 11083 * PG_AVAIL1, the other 11084 * write-back started after us 11085 * and finished before us. We 11086 * can rely on it doing our 11087 * work. 11088 */ 11089 continue; 11090 pmap_large_map_flush_range(va, inc); 11091 atomic_clear_long(pe, X86_PG_AVAIL1); 11092 } 11093 break; 11094 } 11095 maybe_yield(); 11096 } 11097 } 11098 11099 /* 11100 * Write-back cache lines for the given address range. 11101 * 11102 * Must be called only on the range or sub-range returned from 11103 * pmap_large_map(). Must not be called on the coalesced ranges. 11104 * 11105 * Does nothing on CPUs without CLWB, CLFLUSHOPT, or CLFLUSH 11106 * instructions support. 11107 */ 11108 void 11109 pmap_large_map_wb(void *svap, vm_size_t len) 11110 { 11111 vm_offset_t eva, sva; 11112 11113 sva = (vm_offset_t)svap; 11114 eva = sva + len; 11115 pmap_large_map_wb_fence(); 11116 if (sva >= DMAP_MIN_ADDRESS && eva <= DMAP_MIN_ADDRESS + dmaplimit) { 11117 pmap_large_map_flush_range(sva, len); 11118 } else { 11119 KASSERT(sva >= LARGEMAP_MIN_ADDRESS && 11120 eva <= LARGEMAP_MIN_ADDRESS + lm_ents * NBPML4, 11121 ("pmap_large_map_wb: not largemap %#lx %#lx", sva, len)); 11122 pmap_large_map_wb_large(sva, eva); 11123 } 11124 pmap_large_map_wb_fence(); 11125 } 11126 11127 static vm_page_t 11128 pmap_pti_alloc_page(void) 11129 { 11130 vm_page_t m; 11131 11132 VM_OBJECT_ASSERT_WLOCKED(pti_obj); 11133 m = vm_page_grab(pti_obj, pti_pg_idx++, VM_ALLOC_WIRED | VM_ALLOC_ZERO); 11134 return (m); 11135 } 11136 11137 static bool 11138 pmap_pti_free_page(vm_page_t m) 11139 { 11140 if (!vm_page_unwire_noq(m)) 11141 return (false); 11142 vm_page_xbusy_claim(m); 11143 vm_page_free_zero(m); 11144 return (true); 11145 } 11146 11147 static void 11148 pmap_pti_init(void) 11149 { 11150 vm_page_t pml4_pg; 11151 pdp_entry_t *pdpe; 11152 vm_offset_t va; 11153 int i; 11154 11155 if (!pti) 11156 return; 11157 pti_obj = vm_pager_allocate(OBJT_PHYS, NULL, 0, VM_PROT_ALL, 0, NULL); 11158 VM_OBJECT_WLOCK(pti_obj); 11159 pml4_pg = pmap_pti_alloc_page(); 11160 pti_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml4_pg)); 11161 for (va = VM_MIN_KERNEL_ADDRESS; va <= VM_MAX_KERNEL_ADDRESS && 11162 va >= VM_MIN_KERNEL_ADDRESS && va > NBPML4; va += NBPML4) { 11163 pdpe = pmap_pti_pdpe(va); 11164 pmap_pti_wire_pte(pdpe); 11165 } 11166 pmap_pti_add_kva_locked((vm_offset_t)&__pcpu[0], 11167 (vm_offset_t)&__pcpu[0] + sizeof(__pcpu[0]) * MAXCPU, false); 11168 pmap_pti_add_kva_locked((vm_offset_t)idt, (vm_offset_t)idt + 11169 sizeof(struct gate_descriptor) * NIDT, false); 11170 CPU_FOREACH(i) { 11171 /* Doublefault stack IST 1 */ 11172 va = __pcpu[i].pc_common_tss.tss_ist1 + sizeof(struct nmi_pcpu); 11173 pmap_pti_add_kva_locked(va - DBLFAULT_STACK_SIZE, va, false); 11174 /* NMI stack IST 2 */ 11175 va = __pcpu[i].pc_common_tss.tss_ist2 + sizeof(struct nmi_pcpu); 11176 pmap_pti_add_kva_locked(va - NMI_STACK_SIZE, 
va, false); 11177 /* MC# stack IST 3 */ 11178 va = __pcpu[i].pc_common_tss.tss_ist3 + 11179 sizeof(struct nmi_pcpu); 11180 pmap_pti_add_kva_locked(va - MCE_STACK_SIZE, va, false); 11181 /* DB# stack IST 4 */ 11182 va = __pcpu[i].pc_common_tss.tss_ist4 + sizeof(struct nmi_pcpu); 11183 pmap_pti_add_kva_locked(va - DBG_STACK_SIZE, va, false); 11184 } 11185 pmap_pti_add_kva_locked((vm_offset_t)KERNSTART, (vm_offset_t)etext, 11186 true); 11187 pti_finalized = true; 11188 VM_OBJECT_WUNLOCK(pti_obj); 11189 } 11190 11191 static void 11192 pmap_cpu_init(void *arg __unused) 11193 { 11194 CPU_COPY(&all_cpus, &kernel_pmap->pm_active); 11195 pmap_pti_init(); 11196 } 11197 SYSINIT(pmap_cpu, SI_SUB_CPU + 1, SI_ORDER_ANY, pmap_cpu_init, NULL); 11198 11199 static pdp_entry_t * 11200 pmap_pti_pdpe(vm_offset_t va) 11201 { 11202 pml4_entry_t *pml4e; 11203 pdp_entry_t *pdpe; 11204 vm_page_t m; 11205 vm_pindex_t pml4_idx; 11206 vm_paddr_t mphys; 11207 11208 VM_OBJECT_ASSERT_WLOCKED(pti_obj); 11209 11210 pml4_idx = pmap_pml4e_index(va); 11211 pml4e = &pti_pml4[pml4_idx]; 11212 m = NULL; 11213 if (*pml4e == 0) { 11214 if (pti_finalized) 11215 panic("pml4 alloc after finalization\n"); 11216 m = pmap_pti_alloc_page(); 11217 if (*pml4e != 0) { 11218 pmap_pti_free_page(m); 11219 mphys = *pml4e & ~PAGE_MASK; 11220 } else { 11221 mphys = VM_PAGE_TO_PHYS(m); 11222 *pml4e = mphys | X86_PG_RW | X86_PG_V; 11223 } 11224 } else { 11225 mphys = *pml4e & ~PAGE_MASK; 11226 } 11227 pdpe = (pdp_entry_t *)PHYS_TO_DMAP(mphys) + pmap_pdpe_index(va); 11228 return (pdpe); 11229 } 11230 11231 static void 11232 pmap_pti_wire_pte(void *pte) 11233 { 11234 vm_page_t m; 11235 11236 VM_OBJECT_ASSERT_WLOCKED(pti_obj); 11237 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pte)); 11238 m->ref_count++; 11239 } 11240 11241 static void 11242 pmap_pti_unwire_pde(void *pde, bool only_ref) 11243 { 11244 vm_page_t m; 11245 11246 VM_OBJECT_ASSERT_WLOCKED(pti_obj); 11247 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pde)); 11248 MPASS(only_ref || m->ref_count > 1); 11249 pmap_pti_free_page(m); 11250 } 11251 11252 static void 11253 pmap_pti_unwire_pte(void *pte, vm_offset_t va) 11254 { 11255 vm_page_t m; 11256 pd_entry_t *pde; 11257 11258 VM_OBJECT_ASSERT_WLOCKED(pti_obj); 11259 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pte)); 11260 if (pmap_pti_free_page(m)) { 11261 pde = pmap_pti_pde(va); 11262 MPASS((*pde & (X86_PG_PS | X86_PG_V)) == X86_PG_V); 11263 *pde = 0; 11264 pmap_pti_unwire_pde(pde, false); 11265 } 11266 } 11267 11268 static pd_entry_t * 11269 pmap_pti_pde(vm_offset_t va) 11270 { 11271 pdp_entry_t *pdpe; 11272 pd_entry_t *pde; 11273 vm_page_t m; 11274 vm_pindex_t pd_idx; 11275 vm_paddr_t mphys; 11276 11277 VM_OBJECT_ASSERT_WLOCKED(pti_obj); 11278 11279 pdpe = pmap_pti_pdpe(va); 11280 if (*pdpe == 0) { 11281 m = pmap_pti_alloc_page(); 11282 if (*pdpe != 0) { 11283 pmap_pti_free_page(m); 11284 MPASS((*pdpe & X86_PG_PS) == 0); 11285 mphys = *pdpe & ~PAGE_MASK; 11286 } else { 11287 mphys = VM_PAGE_TO_PHYS(m); 11288 *pdpe = mphys | X86_PG_RW | X86_PG_V; 11289 } 11290 } else { 11291 MPASS((*pdpe & X86_PG_PS) == 0); 11292 mphys = *pdpe & ~PAGE_MASK; 11293 } 11294 11295 pde = (pd_entry_t *)PHYS_TO_DMAP(mphys); 11296 pd_idx = pmap_pde_index(va); 11297 pde += pd_idx; 11298 return (pde); 11299 } 11300 11301 static pt_entry_t * 11302 pmap_pti_pte(vm_offset_t va, bool *unwire_pde) 11303 { 11304 pd_entry_t *pde; 11305 pt_entry_t *pte; 11306 vm_page_t m; 11307 vm_paddr_t mphys; 11308 11309 VM_OBJECT_ASSERT_WLOCKED(pti_obj); 11310 11311 pde = pmap_pti_pde(va); 
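/*
 * Optionally pre-wire the page directory page.  The extra reference is
 * kept only if a new page table page is installed below; otherwise
 * *unwire_pde tells the caller to drop it again.
 */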
11312 if (unwire_pde != NULL) { 11313 *unwire_pde = true; 11314 pmap_pti_wire_pte(pde); 11315 } 11316 if (*pde == 0) { 11317 m = pmap_pti_alloc_page(); 11318 if (*pde != 0) { 11319 pmap_pti_free_page(m); 11320 MPASS((*pde & X86_PG_PS) == 0); 11321 mphys = *pde & ~(PAGE_MASK | pg_nx); 11322 } else { 11323 mphys = VM_PAGE_TO_PHYS(m); 11324 *pde = mphys | X86_PG_RW | X86_PG_V; 11325 if (unwire_pde != NULL) 11326 *unwire_pde = false; 11327 } 11328 } else { 11329 MPASS((*pde & X86_PG_PS) == 0); 11330 mphys = *pde & ~(PAGE_MASK | pg_nx); 11331 } 11332 11333 pte = (pt_entry_t *)PHYS_TO_DMAP(mphys); 11334 pte += pmap_pte_index(va); 11335 11336 return (pte); 11337 } 11338 11339 static void 11340 pmap_pti_add_kva_locked(vm_offset_t sva, vm_offset_t eva, bool exec) 11341 { 11342 vm_paddr_t pa; 11343 pd_entry_t *pde; 11344 pt_entry_t *pte, ptev; 11345 bool unwire_pde; 11346 11347 VM_OBJECT_ASSERT_WLOCKED(pti_obj); 11348 11349 sva = trunc_page(sva); 11350 MPASS(sva > VM_MAXUSER_ADDRESS); 11351 eva = round_page(eva); 11352 MPASS(sva < eva); 11353 for (; sva < eva; sva += PAGE_SIZE) { 11354 pte = pmap_pti_pte(sva, &unwire_pde); 11355 pa = pmap_kextract(sva); 11356 ptev = pa | X86_PG_RW | X86_PG_V | X86_PG_A | X86_PG_G | 11357 (exec ? 0 : pg_nx) | pmap_cache_bits(kernel_pmap, 11358 VM_MEMATTR_DEFAULT, false); 11359 if (*pte == 0) { 11360 pte_store(pte, ptev); 11361 pmap_pti_wire_pte(pte); 11362 } else { 11363 KASSERT(!pti_finalized, 11364 ("pti overlap after fin %#lx %#lx %#lx", 11365 sva, *pte, ptev)); 11366 KASSERT(*pte == ptev, 11367 ("pti non-identical pte after fin %#lx %#lx %#lx", 11368 sva, *pte, ptev)); 11369 } 11370 if (unwire_pde) { 11371 pde = pmap_pti_pde(sva); 11372 pmap_pti_unwire_pde(pde, true); 11373 } 11374 } 11375 } 11376 11377 void 11378 pmap_pti_add_kva(vm_offset_t sva, vm_offset_t eva, bool exec) 11379 { 11380 11381 if (!pti) 11382 return; 11383 VM_OBJECT_WLOCK(pti_obj); 11384 pmap_pti_add_kva_locked(sva, eva, exec); 11385 VM_OBJECT_WUNLOCK(pti_obj); 11386 } 11387 11388 void 11389 pmap_pti_remove_kva(vm_offset_t sva, vm_offset_t eva) 11390 { 11391 pt_entry_t *pte; 11392 vm_offset_t va; 11393 11394 if (!pti) 11395 return; 11396 sva = rounddown2(sva, PAGE_SIZE); 11397 MPASS(sva > VM_MAXUSER_ADDRESS); 11398 eva = roundup2(eva, PAGE_SIZE); 11399 MPASS(sva < eva); 11400 VM_OBJECT_WLOCK(pti_obj); 11401 for (va = sva; va < eva; va += PAGE_SIZE) { 11402 pte = pmap_pti_pte(va, NULL); 11403 KASSERT((*pte & X86_PG_V) != 0, 11404 ("invalid pte va %#lx pte %#lx pt %#lx", va, 11405 (u_long)pte, *pte)); 11406 pte_clear(pte); 11407 pmap_pti_unwire_pte(pte, va); 11408 } 11409 pmap_invalidate_range(kernel_pmap, sva, eva); 11410 VM_OBJECT_WUNLOCK(pti_obj); 11411 } 11412 11413 static void * 11414 pkru_dup_range(void *ctx __unused, void *data) 11415 { 11416 struct pmap_pkru_range *node, *new_node; 11417 11418 new_node = uma_zalloc(pmap_pkru_ranges_zone, M_NOWAIT); 11419 if (new_node == NULL) 11420 return (NULL); 11421 node = data; 11422 memcpy(new_node, node, sizeof(*node)); 11423 return (new_node); 11424 } 11425 11426 static void 11427 pkru_free_range(void *ctx __unused, void *node) 11428 { 11429 11430 uma_zfree(pmap_pkru_ranges_zone, node); 11431 } 11432 11433 static int 11434 pmap_pkru_assign(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, u_int keyidx, 11435 int flags) 11436 { 11437 struct pmap_pkru_range *ppr; 11438 int error; 11439 11440 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 11441 MPASS(pmap->pm_type == PT_X86); 11442 MPASS((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0); 11443 if ((flags & 
AMD64_PKRU_EXCL) != 0 && 11444 !rangeset_check_empty(&pmap->pm_pkru, sva, eva)) 11445 return (EBUSY); 11446 ppr = uma_zalloc(pmap_pkru_ranges_zone, M_NOWAIT); 11447 if (ppr == NULL) 11448 return (ENOMEM); 11449 ppr->pkru_keyidx = keyidx; 11450 ppr->pkru_flags = flags & AMD64_PKRU_PERSIST; 11451 error = rangeset_insert(&pmap->pm_pkru, sva, eva, ppr); 11452 if (error != 0) 11453 uma_zfree(pmap_pkru_ranges_zone, ppr); 11454 return (error); 11455 } 11456 11457 static int 11458 pmap_pkru_deassign(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 11459 { 11460 11461 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 11462 MPASS(pmap->pm_type == PT_X86); 11463 MPASS((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0); 11464 return (rangeset_remove(&pmap->pm_pkru, sva, eva)); 11465 } 11466 11467 static void 11468 pmap_pkru_deassign_all(pmap_t pmap) 11469 { 11470 11471 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 11472 if (pmap->pm_type == PT_X86 && 11473 (cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0) 11474 rangeset_remove_all(&pmap->pm_pkru); 11475 } 11476 11477 /* 11478 * Returns true if the PKU setting is the same across the specified address 11479 * range, and false otherwise. When returning true, updates the referenced PTE 11480 * to reflect the PKU setting. 11481 */ 11482 static bool 11483 pmap_pkru_same(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, pt_entry_t *pte) 11484 { 11485 struct pmap_pkru_range *ppr; 11486 vm_offset_t va; 11487 u_int keyidx; 11488 11489 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 11490 KASSERT(pmap->pm_type != PT_X86 || (*pte & X86_PG_PKU_MASK) == 0, 11491 ("pte %p has unexpected PKU %ld", pte, *pte & X86_PG_PKU_MASK)); 11492 if (pmap->pm_type != PT_X86 || 11493 (cpu_stdext_feature2 & CPUID_STDEXT2_PKU) == 0 || 11494 sva >= VM_MAXUSER_ADDRESS) 11495 return (true); 11496 MPASS(eva <= VM_MAXUSER_ADDRESS); 11497 ppr = rangeset_containing(&pmap->pm_pkru, sva); 11498 if (ppr == NULL) 11499 return (rangeset_empty(&pmap->pm_pkru, sva, eva)); 11500 keyidx = ppr->pkru_keyidx; 11501 while ((va = ppr->pkru_rs_el.re_end) < eva) { 11502 if ((ppr = rangeset_beginning(&pmap->pm_pkru, va)) == NULL || 11503 keyidx != ppr->pkru_keyidx) 11504 return (false); 11505 } 11506 *pte |= X86_PG_PKU(keyidx); 11507 return (true); 11508 } 11509 11510 static pt_entry_t 11511 pmap_pkru_get(pmap_t pmap, vm_offset_t va) 11512 { 11513 struct pmap_pkru_range *ppr; 11514 11515 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 11516 if (pmap->pm_type != PT_X86 || 11517 (cpu_stdext_feature2 & CPUID_STDEXT2_PKU) == 0 || 11518 va >= VM_MAXUSER_ADDRESS) 11519 return (0); 11520 ppr = rangeset_containing(&pmap->pm_pkru, va); 11521 if (ppr != NULL) 11522 return (X86_PG_PKU(ppr->pkru_keyidx)); 11523 return (0); 11524 } 11525 11526 static bool 11527 pred_pkru_on_remove(void *ctx __unused, void *r) 11528 { 11529 struct pmap_pkru_range *ppr; 11530 11531 ppr = r; 11532 return ((ppr->pkru_flags & AMD64_PKRU_PERSIST) == 0); 11533 } 11534 11535 static void 11536 pmap_pkru_on_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 11537 { 11538 11539 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 11540 if (pmap->pm_type == PT_X86 && 11541 (cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0) { 11542 rangeset_remove_pred(&pmap->pm_pkru, sva, eva, 11543 pred_pkru_on_remove); 11544 } 11545 } 11546 11547 static int 11548 pmap_pkru_copy(pmap_t dst_pmap, pmap_t src_pmap) 11549 { 11550 11551 PMAP_LOCK_ASSERT(dst_pmap, MA_OWNED); 11552 PMAP_LOCK_ASSERT(src_pmap, MA_OWNED); 11553 MPASS(dst_pmap->pm_type == PT_X86); 11554 MPASS(src_pmap->pm_type == PT_X86); 11555 MPASS((cpu_stdext_feature2 & 
CPUID_STDEXT2_PKU) != 0); 11556 if (src_pmap->pm_pkru.rs_data_ctx == NULL) 11557 return (0); 11558 return (rangeset_copy(&dst_pmap->pm_pkru, &src_pmap->pm_pkru)); 11559 } 11560 11561 static void 11562 pmap_pkru_update_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, 11563 u_int keyidx) 11564 { 11565 pml4_entry_t *pml4e; 11566 pdp_entry_t *pdpe; 11567 pd_entry_t newpde, ptpaddr, *pde; 11568 pt_entry_t newpte, *ptep, pte; 11569 vm_offset_t va, va_next; 11570 bool changed; 11571 11572 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 11573 MPASS(pmap->pm_type == PT_X86); 11574 MPASS(keyidx <= PMAP_MAX_PKRU_IDX); 11575 11576 for (changed = false, va = sva; va < eva; va = va_next) { 11577 pml4e = pmap_pml4e(pmap, va); 11578 if (pml4e == NULL || (*pml4e & X86_PG_V) == 0) { 11579 va_next = (va + NBPML4) & ~PML4MASK; 11580 if (va_next < va) 11581 va_next = eva; 11582 continue; 11583 } 11584 11585 pdpe = pmap_pml4e_to_pdpe(pml4e, va); 11586 if ((*pdpe & X86_PG_V) == 0) { 11587 va_next = (va + NBPDP) & ~PDPMASK; 11588 if (va_next < va) 11589 va_next = eva; 11590 continue; 11591 } 11592 11593 va_next = (va + NBPDR) & ~PDRMASK; 11594 if (va_next < va) 11595 va_next = eva; 11596 11597 pde = pmap_pdpe_to_pde(pdpe, va); 11598 ptpaddr = *pde; 11599 if (ptpaddr == 0) 11600 continue; 11601 11602 MPASS((ptpaddr & X86_PG_V) != 0); 11603 if ((ptpaddr & PG_PS) != 0) { 11604 if (va + NBPDR == va_next && eva >= va_next) { 11605 newpde = (ptpaddr & ~X86_PG_PKU_MASK) | 11606 X86_PG_PKU(keyidx); 11607 if (newpde != ptpaddr) { 11608 *pde = newpde; 11609 changed = true; 11610 } 11611 continue; 11612 } else if (!pmap_demote_pde(pmap, pde, va)) { 11613 continue; 11614 } 11615 } 11616 11617 if (va_next > eva) 11618 va_next = eva; 11619 11620 for (ptep = pmap_pde_to_pte(pde, va); va != va_next; 11621 ptep++, va += PAGE_SIZE) { 11622 pte = *ptep; 11623 if ((pte & X86_PG_V) == 0) 11624 continue; 11625 newpte = (pte & ~X86_PG_PKU_MASK) | X86_PG_PKU(keyidx); 11626 if (newpte != pte) { 11627 *ptep = newpte; 11628 changed = true; 11629 } 11630 } 11631 } 11632 if (changed) 11633 pmap_invalidate_range(pmap, sva, eva); 11634 } 11635 11636 static int 11637 pmap_pkru_check_uargs(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, 11638 u_int keyidx, int flags) 11639 { 11640 11641 if (pmap->pm_type != PT_X86 || keyidx > PMAP_MAX_PKRU_IDX || 11642 (flags & ~(AMD64_PKRU_PERSIST | AMD64_PKRU_EXCL)) != 0) 11643 return (EINVAL); 11644 if (eva <= sva || eva > VM_MAXUSER_ADDRESS) 11645 return (EFAULT); 11646 if ((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) == 0) 11647 return (ENOTSUP); 11648 return (0); 11649 } 11650 11651 int 11652 pmap_pkru_set(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, u_int keyidx, 11653 int flags) 11654 { 11655 int error; 11656 11657 sva = trunc_page(sva); 11658 eva = round_page(eva); 11659 error = pmap_pkru_check_uargs(pmap, sva, eva, keyidx, flags); 11660 if (error != 0) 11661 return (error); 11662 for (;;) { 11663 PMAP_LOCK(pmap); 11664 error = pmap_pkru_assign(pmap, sva, eva, keyidx, flags); 11665 if (error == 0) 11666 pmap_pkru_update_range(pmap, sva, eva, keyidx); 11667 PMAP_UNLOCK(pmap); 11668 if (error != ENOMEM) 11669 break; 11670 vm_wait(NULL); 11671 } 11672 return (error); 11673 } 11674 11675 int 11676 pmap_pkru_clear(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 11677 { 11678 int error; 11679 11680 sva = trunc_page(sva); 11681 eva = round_page(eva); 11682 error = pmap_pkru_check_uargs(pmap, sva, eva, 0, 0); 11683 if (error != 0) 11684 return (error); 11685 for (;;) { 11686 PMAP_LOCK(pmap); 11687 error = 
pmap_pkru_deassign(pmap, sva, eva);
		if (error == 0)
			pmap_pkru_update_range(pmap, sva, eva, 0);
		PMAP_UNLOCK(pmap);
		if (error != ENOMEM)
			break;
		vm_wait(NULL);
	}
	return (error);
}

#if defined(KASAN) || defined(KMSAN)

/*
 * Reserve enough memory to:
 * 1) allocate PDP pages for the shadow map(s), and
 * 2) shadow the boot stack of KSTACK_PAGES pages,
 * assuming that the kernel stack does not cross a 1GB boundary, so we need
 * one or two PD pages, one or two PT pages, and KSTACK_PAGES shadow pages
 * per shadow map.
 */
#ifdef KASAN
#define	SAN_EARLY_PAGES	\
	(NKASANPML4E + 2 + 2 + howmany(KSTACK_PAGES, KASAN_SHADOW_SCALE))
#else
#define	SAN_EARLY_PAGES	\
	(NKMSANSHADPML4E + NKMSANORIGPML4E + 2 * (2 + 2 + KSTACK_PAGES))
#endif

static uint64_t __nosanitizeaddress __nosanitizememory
pmap_san_enter_early_alloc_4k(uint64_t pabase)
{
	static uint8_t data[PAGE_SIZE * SAN_EARLY_PAGES] __aligned(PAGE_SIZE);
	static size_t offset = 0;
	uint64_t pa;

	if (offset == sizeof(data)) {
		panic("%s: ran out of memory for the bootstrap shadow map",
		    __func__);
	}

	pa = pabase + ((vm_offset_t)&data[offset] - KERNSTART);
	offset += PAGE_SIZE;
	return (pa);
}

/*
 * Map a shadow page, before the kernel has bootstrapped its page tables.  This
 * is currently only used to shadow the temporary boot stack set up by locore.
 */
static void __nosanitizeaddress __nosanitizememory
pmap_san_enter_early(vm_offset_t va)
{
	static bool first = true;
	pml4_entry_t *pml4e;
	pdp_entry_t *pdpe;
	pd_entry_t *pde;
	pt_entry_t *pte;
	uint64_t cr3, pa, base;
	int i;

	base = amd64_loadaddr();
	cr3 = rcr3();

	if (first) {
		/*
		 * If this is the first call, we need to allocate new PML4Es
		 * for the bootstrap shadow map(s).  We don't know how the
		 * PML4 page was initialized by the boot loader, so we can't
		 * simply test whether the shadow map's PML4Es are zero.
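		 * Instead, the first call installs fresh PML4Es, backed by
		 * the statically reserved early pages, into every shadow
		 * region slot, overwriting whatever the loader put there.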
11757 */ 11758 first = false; 11759 #ifdef KASAN 11760 for (i = 0; i < NKASANPML4E; i++) { 11761 pa = pmap_san_enter_early_alloc_4k(base); 11762 11763 pml4e = (pml4_entry_t *)cr3 + 11764 pmap_pml4e_index(KASAN_MIN_ADDRESS + i * NBPML4); 11765 *pml4e = (pml4_entry_t)(pa | X86_PG_RW | X86_PG_V); 11766 } 11767 #else 11768 for (i = 0; i < NKMSANORIGPML4E; i++) { 11769 pa = pmap_san_enter_early_alloc_4k(base); 11770 11771 pml4e = (pml4_entry_t *)cr3 + 11772 pmap_pml4e_index(KMSAN_ORIG_MIN_ADDRESS + 11773 i * NBPML4); 11774 *pml4e = (pml4_entry_t)(pa | X86_PG_RW | X86_PG_V); 11775 } 11776 for (i = 0; i < NKMSANSHADPML4E; i++) { 11777 pa = pmap_san_enter_early_alloc_4k(base); 11778 11779 pml4e = (pml4_entry_t *)cr3 + 11780 pmap_pml4e_index(KMSAN_SHAD_MIN_ADDRESS + 11781 i * NBPML4); 11782 *pml4e = (pml4_entry_t)(pa | X86_PG_RW | X86_PG_V); 11783 } 11784 #endif 11785 } 11786 pml4e = (pml4_entry_t *)cr3 + pmap_pml4e_index(va); 11787 pdpe = (pdp_entry_t *)(*pml4e & PG_FRAME) + pmap_pdpe_index(va); 11788 if (*pdpe == 0) { 11789 pa = pmap_san_enter_early_alloc_4k(base); 11790 *pdpe = (pdp_entry_t)(pa | X86_PG_RW | X86_PG_V); 11791 } 11792 pde = (pd_entry_t *)(*pdpe & PG_FRAME) + pmap_pde_index(va); 11793 if (*pde == 0) { 11794 pa = pmap_san_enter_early_alloc_4k(base); 11795 *pde = (pd_entry_t)(pa | X86_PG_RW | X86_PG_V); 11796 } 11797 pte = (pt_entry_t *)(*pde & PG_FRAME) + pmap_pte_index(va); 11798 if (*pte != 0) 11799 panic("%s: PTE for %#lx is already initialized", __func__, va); 11800 pa = pmap_san_enter_early_alloc_4k(base); 11801 *pte = (pt_entry_t)(pa | X86_PG_A | X86_PG_M | X86_PG_RW | X86_PG_V); 11802 } 11803 11804 static vm_page_t 11805 pmap_san_enter_alloc_4k(void) 11806 { 11807 vm_page_t m; 11808 11809 m = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT | VM_ALLOC_WIRED | 11810 VM_ALLOC_ZERO); 11811 if (m == NULL) 11812 panic("%s: no memory to grow shadow map", __func__); 11813 return (m); 11814 } 11815 11816 static vm_page_t 11817 pmap_san_enter_alloc_2m(void) 11818 { 11819 return (vm_page_alloc_noobj_contig(VM_ALLOC_WIRED | VM_ALLOC_ZERO, 11820 NPTEPG, 0, ~0ul, NBPDR, 0, VM_MEMATTR_DEFAULT)); 11821 } 11822 11823 /* 11824 * Grow a shadow map by at least one 4KB page at the specified address. Use 2MB 11825 * pages when possible. 11826 */ 11827 void __nosanitizeaddress __nosanitizememory 11828 pmap_san_enter(vm_offset_t va) 11829 { 11830 pdp_entry_t *pdpe; 11831 pd_entry_t *pde; 11832 pt_entry_t *pte; 11833 vm_page_t m; 11834 11835 if (kernphys == 0) { 11836 /* 11837 * We're creating a temporary shadow map for the boot stack. 
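		 * The kernel page tables have not been bootstrapped yet, so
		 * the shadow is assembled from the statically reserved
		 * SAN_EARLY_PAGES region by pmap_san_enter_early().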
11838 */ 11839 pmap_san_enter_early(va); 11840 return; 11841 } 11842 11843 mtx_assert(&kernel_map->system_mtx, MA_OWNED); 11844 11845 pdpe = pmap_pdpe(kernel_pmap, va); 11846 if ((*pdpe & X86_PG_V) == 0) { 11847 m = pmap_san_enter_alloc_4k(); 11848 *pdpe = (pdp_entry_t)(VM_PAGE_TO_PHYS(m) | X86_PG_RW | 11849 X86_PG_V | pg_nx); 11850 } 11851 pde = pmap_pdpe_to_pde(pdpe, va); 11852 if ((*pde & X86_PG_V) == 0) { 11853 m = pmap_san_enter_alloc_2m(); 11854 if (m != NULL) { 11855 *pde = (pd_entry_t)(VM_PAGE_TO_PHYS(m) | X86_PG_RW | 11856 X86_PG_PS | X86_PG_V | X86_PG_A | X86_PG_M | pg_nx); 11857 } else { 11858 m = pmap_san_enter_alloc_4k(); 11859 *pde = (pd_entry_t)(VM_PAGE_TO_PHYS(m) | X86_PG_RW | 11860 X86_PG_V | pg_nx); 11861 } 11862 } 11863 if ((*pde & X86_PG_PS) != 0) 11864 return; 11865 pte = pmap_pde_to_pte(pde, va); 11866 if ((*pte & X86_PG_V) != 0) 11867 return; 11868 m = pmap_san_enter_alloc_4k(); 11869 *pte = (pt_entry_t)(VM_PAGE_TO_PHYS(m) | X86_PG_RW | X86_PG_V | 11870 X86_PG_M | X86_PG_A | pg_nx); 11871 } 11872 #endif 11873 11874 /* 11875 * Track a range of the kernel's virtual address space that is contiguous 11876 * in various mapping attributes. 11877 */ 11878 struct pmap_kernel_map_range { 11879 vm_offset_t sva; 11880 pt_entry_t attrs; 11881 int ptes; 11882 int pdes; 11883 int pdpes; 11884 }; 11885 11886 static void 11887 sysctl_kmaps_dump(struct sbuf *sb, struct pmap_kernel_map_range *range, 11888 vm_offset_t eva) 11889 { 11890 const char *mode; 11891 int i, pat_idx; 11892 11893 if (eva <= range->sva) 11894 return; 11895 11896 pat_idx = pmap_pat_index(kernel_pmap, range->attrs, true); 11897 for (i = 0; i < PAT_INDEX_SIZE; i++) 11898 if (pat_index[i] == pat_idx) 11899 break; 11900 11901 switch (i) { 11902 case PAT_WRITE_BACK: 11903 mode = "WB"; 11904 break; 11905 case PAT_WRITE_THROUGH: 11906 mode = "WT"; 11907 break; 11908 case PAT_UNCACHEABLE: 11909 mode = "UC"; 11910 break; 11911 case PAT_UNCACHED: 11912 mode = "U-"; 11913 break; 11914 case PAT_WRITE_PROTECTED: 11915 mode = "WP"; 11916 break; 11917 case PAT_WRITE_COMBINING: 11918 mode = "WC"; 11919 break; 11920 default: 11921 printf("%s: unknown PAT mode %#x for range 0x%016lx-0x%016lx\n", 11922 __func__, pat_idx, range->sva, eva); 11923 mode = "??"; 11924 break; 11925 } 11926 11927 sbuf_printf(sb, "0x%016lx-0x%016lx r%c%c%c%c %s %d %d %d\n", 11928 range->sva, eva, 11929 (range->attrs & X86_PG_RW) != 0 ? 'w' : '-', 11930 (range->attrs & pg_nx) != 0 ? '-' : 'x', 11931 (range->attrs & X86_PG_U) != 0 ? 'u' : 's', 11932 (range->attrs & X86_PG_G) != 0 ? 'g' : '-', 11933 mode, range->pdpes, range->pdes, range->ptes); 11934 11935 /* Reset to sentinel value. */ 11936 range->sva = la57 ? KV5ADDR(NPML5EPG - 1, NPML4EPG - 1, NPDPEPG - 1, 11937 NPDEPG - 1, NPTEPG - 1) : KV4ADDR(NPML4EPG - 1, NPDPEPG - 1, 11938 NPDEPG - 1, NPTEPG - 1); 11939 } 11940 11941 /* 11942 * Determine whether the attributes specified by a page table entry match those 11943 * being tracked by the current range. This is not quite as simple as a direct 11944 * flag comparison since some PAT modes have multiple representations. 
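 * (The PAT index is formed from the PAT, PCD, and PWT bits, and the PAT MSR
 * may be programmed so that more than one encoding names the same memory
 * type.)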
11945 */ 11946 static bool 11947 sysctl_kmaps_match(struct pmap_kernel_map_range *range, pt_entry_t attrs) 11948 { 11949 pt_entry_t diff, mask; 11950 11951 mask = X86_PG_G | X86_PG_RW | X86_PG_U | X86_PG_PDE_CACHE | pg_nx; 11952 diff = (range->attrs ^ attrs) & mask; 11953 if (diff == 0) 11954 return (true); 11955 if ((diff & ~X86_PG_PDE_PAT) == 0 && 11956 pmap_pat_index(kernel_pmap, range->attrs, true) == 11957 pmap_pat_index(kernel_pmap, attrs, true)) 11958 return (true); 11959 return (false); 11960 } 11961 11962 static void 11963 sysctl_kmaps_reinit(struct pmap_kernel_map_range *range, vm_offset_t va, 11964 pt_entry_t attrs) 11965 { 11966 11967 memset(range, 0, sizeof(*range)); 11968 range->sva = va; 11969 range->attrs = attrs; 11970 } 11971 11972 /* 11973 * Given a leaf PTE, derive the mapping's attributes. If they do not match 11974 * those of the current run, dump the address range and its attributes, and 11975 * begin a new run. 11976 */ 11977 static void 11978 sysctl_kmaps_check(struct sbuf *sb, struct pmap_kernel_map_range *range, 11979 vm_offset_t va, pml4_entry_t pml4e, pdp_entry_t pdpe, pd_entry_t pde, 11980 pt_entry_t pte) 11981 { 11982 pt_entry_t attrs; 11983 11984 attrs = pml4e & (X86_PG_RW | X86_PG_U | pg_nx); 11985 11986 attrs |= pdpe & pg_nx; 11987 attrs &= pg_nx | (pdpe & (X86_PG_RW | X86_PG_U)); 11988 if ((pdpe & PG_PS) != 0) { 11989 attrs |= pdpe & (X86_PG_G | X86_PG_PDE_CACHE); 11990 } else if (pde != 0) { 11991 attrs |= pde & pg_nx; 11992 attrs &= pg_nx | (pde & (X86_PG_RW | X86_PG_U)); 11993 } 11994 if ((pde & PG_PS) != 0) { 11995 attrs |= pde & (X86_PG_G | X86_PG_PDE_CACHE); 11996 } else if (pte != 0) { 11997 attrs |= pte & pg_nx; 11998 attrs &= pg_nx | (pte & (X86_PG_RW | X86_PG_U)); 11999 attrs |= pte & (X86_PG_G | X86_PG_PTE_CACHE); 12000 12001 /* Canonicalize by always using the PDE PAT bit. */ 12002 if ((attrs & X86_PG_PTE_PAT) != 0) 12003 attrs ^= X86_PG_PDE_PAT | X86_PG_PTE_PAT; 12004 } 12005 12006 if (range->sva > va || !sysctl_kmaps_match(range, attrs)) { 12007 sysctl_kmaps_dump(sb, range, va); 12008 sysctl_kmaps_reinit(range, va, attrs); 12009 } 12010 } 12011 12012 static int 12013 sysctl_kmaps(SYSCTL_HANDLER_ARGS) 12014 { 12015 struct pmap_kernel_map_range range; 12016 struct sbuf sbuf, *sb; 12017 pml4_entry_t pml4e; 12018 pdp_entry_t *pdp, pdpe; 12019 pd_entry_t *pd, pde; 12020 pt_entry_t *pt, pte; 12021 vm_offset_t sva; 12022 vm_paddr_t pa; 12023 int error, i, j, k, l; 12024 12025 error = sysctl_wire_old_buffer(req, 0); 12026 if (error != 0) 12027 return (error); 12028 sb = &sbuf; 12029 sbuf_new_for_sysctl(sb, NULL, PAGE_SIZE, req); 12030 12031 /* Sentinel value. */ 12032 range.sva = la57 ? KV5ADDR(NPML5EPG - 1, NPML4EPG - 1, NPDPEPG - 1, 12033 NPDEPG - 1, NPTEPG - 1) : KV4ADDR(NPML4EPG - 1, NPDPEPG - 1, 12034 NPDEPG - 1, NPTEPG - 1); 12035 12036 /* 12037 * Iterate over the kernel page tables without holding the kernel pmap 12038 * lock. Outside of the large map, kernel page table pages are never 12039 * freed, so at worst we will observe inconsistencies in the output. 12040 * Within the large map, ensure that PDP and PD page addresses are 12041 * valid before descending. 
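	 * Each output line describes one contiguous range as "start-end"
	 * followed by the read/write/execute, user/supervisor, and global
	 * flags, the cache mode, and the counts of 1GB, 2MB, and 4KB
	 * mappings it contains.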
12042 */ 12043 for (sva = 0, i = pmap_pml4e_index(sva); i < NPML4EPG; i++) { 12044 switch (i) { 12045 case PML4PML4I: 12046 sbuf_printf(sb, "\nRecursive map:\n"); 12047 break; 12048 case DMPML4I: 12049 sbuf_printf(sb, "\nDirect map:\n"); 12050 break; 12051 #ifdef KASAN 12052 case KASANPML4I: 12053 sbuf_printf(sb, "\nKASAN shadow map:\n"); 12054 break; 12055 #endif 12056 #ifdef KMSAN 12057 case KMSANSHADPML4I: 12058 sbuf_printf(sb, "\nKMSAN shadow map:\n"); 12059 break; 12060 case KMSANORIGPML4I: 12061 sbuf_printf(sb, "\nKMSAN origin map:\n"); 12062 break; 12063 #endif 12064 case KPML4BASE: 12065 sbuf_printf(sb, "\nKernel map:\n"); 12066 break; 12067 case LMSPML4I: 12068 sbuf_printf(sb, "\nLarge map:\n"); 12069 break; 12070 } 12071 12072 /* Convert to canonical form. */ 12073 if (sva == 1ul << 47) 12074 sva |= -1ul << 48; 12075 12076 restart: 12077 pml4e = kernel_pml4[i]; 12078 if ((pml4e & X86_PG_V) == 0) { 12079 sva = rounddown2(sva, NBPML4); 12080 sysctl_kmaps_dump(sb, &range, sva); 12081 sva += NBPML4; 12082 continue; 12083 } 12084 pa = pml4e & PG_FRAME; 12085 pdp = (pdp_entry_t *)PHYS_TO_DMAP(pa); 12086 12087 for (j = pmap_pdpe_index(sva); j < NPDPEPG; j++) { 12088 pdpe = pdp[j]; 12089 if ((pdpe & X86_PG_V) == 0) { 12090 sva = rounddown2(sva, NBPDP); 12091 sysctl_kmaps_dump(sb, &range, sva); 12092 sva += NBPDP; 12093 continue; 12094 } 12095 pa = pdpe & PG_FRAME; 12096 if ((pdpe & PG_PS) != 0) { 12097 sva = rounddown2(sva, NBPDP); 12098 sysctl_kmaps_check(sb, &range, sva, pml4e, pdpe, 12099 0, 0); 12100 range.pdpes++; 12101 sva += NBPDP; 12102 continue; 12103 } 12104 if (PMAP_ADDRESS_IN_LARGEMAP(sva) && 12105 vm_phys_paddr_to_vm_page(pa) == NULL) { 12106 /* 12107 * Page table pages for the large map may be 12108 * freed. Validate the next-level address 12109 * before descending. 12110 */ 12111 goto restart; 12112 } 12113 pd = (pd_entry_t *)PHYS_TO_DMAP(pa); 12114 12115 for (k = pmap_pde_index(sva); k < NPDEPG; k++) { 12116 pde = pd[k]; 12117 if ((pde & X86_PG_V) == 0) { 12118 sva = rounddown2(sva, NBPDR); 12119 sysctl_kmaps_dump(sb, &range, sva); 12120 sva += NBPDR; 12121 continue; 12122 } 12123 pa = pde & PG_FRAME; 12124 if ((pde & PG_PS) != 0) { 12125 sva = rounddown2(sva, NBPDR); 12126 sysctl_kmaps_check(sb, &range, sva, 12127 pml4e, pdpe, pde, 0); 12128 range.pdes++; 12129 sva += NBPDR; 12130 continue; 12131 } 12132 if (PMAP_ADDRESS_IN_LARGEMAP(sva) && 12133 vm_phys_paddr_to_vm_page(pa) == NULL) { 12134 /* 12135 * Page table pages for the large map 12136 * may be freed. Validate the 12137 * next-level address before descending. 
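					 * The walk then restarts at the
					 * current PML4 entry.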
12138 */ 12139 goto restart; 12140 } 12141 pt = (pt_entry_t *)PHYS_TO_DMAP(pa); 12142 12143 for (l = pmap_pte_index(sva); l < NPTEPG; l++, 12144 sva += PAGE_SIZE) { 12145 pte = pt[l]; 12146 if ((pte & X86_PG_V) == 0) { 12147 sysctl_kmaps_dump(sb, &range, 12148 sva); 12149 continue; 12150 } 12151 sysctl_kmaps_check(sb, &range, sva, 12152 pml4e, pdpe, pde, pte); 12153 range.ptes++; 12154 } 12155 } 12156 } 12157 } 12158 12159 error = sbuf_finish(sb); 12160 sbuf_delete(sb); 12161 return (error); 12162 } 12163 SYSCTL_OID(_vm_pmap, OID_AUTO, kernel_maps, 12164 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE | CTLFLAG_SKIP, 12165 NULL, 0, sysctl_kmaps, "A", 12166 "Dump kernel address layout"); 12167 12168 #ifdef DDB 12169 DB_SHOW_COMMAND(pte, pmap_print_pte) 12170 { 12171 pmap_t pmap; 12172 pml5_entry_t *pml5; 12173 pml4_entry_t *pml4; 12174 pdp_entry_t *pdp; 12175 pd_entry_t *pde; 12176 pt_entry_t *pte, PG_V; 12177 vm_offset_t va; 12178 12179 if (!have_addr) { 12180 db_printf("show pte addr\n"); 12181 return; 12182 } 12183 va = (vm_offset_t)addr; 12184 12185 if (kdb_thread != NULL) 12186 pmap = vmspace_pmap(kdb_thread->td_proc->p_vmspace); 12187 else 12188 pmap = PCPU_GET(curpmap); 12189 12190 PG_V = pmap_valid_bit(pmap); 12191 db_printf("VA 0x%016lx", va); 12192 12193 if (pmap_is_la57(pmap)) { 12194 pml5 = pmap_pml5e(pmap, va); 12195 db_printf(" pml5e 0x%016lx", *pml5); 12196 if ((*pml5 & PG_V) == 0) { 12197 db_printf("\n"); 12198 return; 12199 } 12200 pml4 = pmap_pml5e_to_pml4e(pml5, va); 12201 } else { 12202 pml4 = pmap_pml4e(pmap, va); 12203 } 12204 db_printf(" pml4e 0x%016lx", *pml4); 12205 if ((*pml4 & PG_V) == 0) { 12206 db_printf("\n"); 12207 return; 12208 } 12209 pdp = pmap_pml4e_to_pdpe(pml4, va); 12210 db_printf(" pdpe 0x%016lx", *pdp); 12211 if ((*pdp & PG_V) == 0 || (*pdp & PG_PS) != 0) { 12212 db_printf("\n"); 12213 return; 12214 } 12215 pde = pmap_pdpe_to_pde(pdp, va); 12216 db_printf(" pde 0x%016lx", *pde); 12217 if ((*pde & PG_V) == 0 || (*pde & PG_PS) != 0) { 12218 db_printf("\n"); 12219 return; 12220 } 12221 pte = pmap_pde_to_pte(pde, va); 12222 db_printf(" pte 0x%016lx\n", *pte); 12223 } 12224 12225 DB_SHOW_COMMAND(phys2dmap, pmap_phys2dmap) 12226 { 12227 vm_paddr_t a; 12228 12229 if (have_addr) { 12230 a = (vm_paddr_t)addr; 12231 db_printf("0x%jx\n", (uintmax_t)PHYS_TO_DMAP(a)); 12232 } else { 12233 db_printf("show phys2dmap addr\n"); 12234 } 12235 } 12236 12237 static void 12238 ptpages_show_page(int level, int idx, vm_page_t pg) 12239 { 12240 db_printf("l %d i %d pg %p phys %#lx ref %x\n", 12241 level, idx, pg, VM_PAGE_TO_PHYS(pg), pg->ref_count); 12242 } 12243 12244 static void 12245 ptpages_show_complain(int level, int idx, uint64_t pte) 12246 { 12247 db_printf("l %d i %d pte %#lx\n", level, idx, pte); 12248 } 12249 12250 static void 12251 ptpages_show_pml4(vm_page_t pg4, int num_entries, uint64_t PG_V) 12252 { 12253 vm_page_t pg3, pg2, pg1; 12254 pml4_entry_t *pml4; 12255 pdp_entry_t *pdp; 12256 pd_entry_t *pd; 12257 int i4, i3, i2; 12258 12259 pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pg4)); 12260 for (i4 = 0; i4 < num_entries; i4++) { 12261 if ((pml4[i4] & PG_V) == 0) 12262 continue; 12263 pg3 = PHYS_TO_VM_PAGE(pml4[i4] & PG_FRAME); 12264 if (pg3 == NULL) { 12265 ptpages_show_complain(3, i4, pml4[i4]); 12266 continue; 12267 } 12268 ptpages_show_page(3, i4, pg3); 12269 pdp = (pdp_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pg3)); 12270 for (i3 = 0; i3 < NPDPEPG; i3++) { 12271 if ((pdp[i3] & PG_V) == 0) 12272 continue; 12273 pg2 = PHYS_TO_VM_PAGE(pdp[i3] & PG_FRAME); 
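			/*
			 * PHYS_TO_VM_PAGE() yields NULL when the page table
			 * page is not covered by the vm_page array; only the
			 * raw entry can be reported then.
			 */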
			if (pg2 == NULL) {
				ptpages_show_complain(2, i3, pdp[i3]);
				continue;
			}
			ptpages_show_page(2, i3, pg2);
			pd = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pg2));
			for (i2 = 0; i2 < NPDEPG; i2++) {
				if ((pd[i2] & PG_V) == 0)
					continue;
				pg1 = PHYS_TO_VM_PAGE(pd[i2] & PG_FRAME);
				if (pg1 == NULL) {
					ptpages_show_complain(1, i2, pd[i2]);
					continue;
				}
				ptpages_show_page(1, i2, pg1);
			}
		}
	}
}

DB_SHOW_COMMAND(ptpages, pmap_ptpages)
{
	pmap_t pmap;
	vm_page_t pg;
	pml5_entry_t *pml5;
	uint64_t PG_V;
	int i5;

	if (have_addr)
		pmap = (pmap_t)addr;
	else
		pmap = PCPU_GET(curpmap);

	PG_V = pmap_valid_bit(pmap);

	if (pmap_is_la57(pmap)) {
		pml5 = pmap->pm_pmltop;
		for (i5 = 0; i5 < NUPML5E; i5++) {
			if ((pml5[i5] & PG_V) == 0)
				continue;
			pg = PHYS_TO_VM_PAGE(pml5[i5] & PG_FRAME);
			if (pg == NULL) {
				ptpages_show_complain(4, i5, pml5[i5]);
				continue;
			}
			ptpages_show_page(4, i5, pg);
			ptpages_show_pml4(pg, NPML4EPG, PG_V);
		}
	} else {
		ptpages_show_pml4(PHYS_TO_VM_PAGE(DMAP_TO_PHYS(
		    (vm_offset_t)pmap->pm_pmltop)), NUP4ML4E, PG_V);
	}
}
#endif
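
/*
 * Example usage of the introspection hooks defined above.  The kernel address
 * layout can be dumped with "sysctl vm.pmap.kernel_maps"; the OID is marked
 * CTLFLAG_SKIP, so it does not show up in a plain "sysctl vm.pmap" listing.
 * From DDB, the walkers above are invoked as "show pte <va>",
 * "show phys2dmap <pa>", and "show ptpages [<pmap address>]".
 *
 * A minimal userland sketch (not part of this file) that fetches the dump by
 * name, using only sysctlbyname(3), might look like:
 *
 *	#include <sys/types.h>
 *	#include <sys/sysctl.h>
 *	#include <stdio.h>
 *	#include <stdlib.h>
 *
 *	int
 *	main(void)
 *	{
 *		char *buf;
 *		size_t len;
 *
 *		if (sysctlbyname("vm.pmap.kernel_maps", NULL, &len,
 *		    NULL, 0) != 0)
 *			return (1);
 *		buf = malloc(len);
 *		if (buf == NULL || sysctlbyname("vm.pmap.kernel_maps",
 *		    buf, &len, NULL, 0) != 0)
 *			return (1);
 *		fwrite(buf, 1, len, stdout);
 *		free(buf);
 *		return (0);
 *	}
 */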