1 /* $OpenBSD: pmap.c,v 1.178 2024/11/02 07:58:58 mpi Exp $ */ 2 /* $NetBSD: pmap.c,v 1.3 2003/05/08 18:13:13 thorpej Exp $ */ 3 4 /* 5 * Copyright (c) 1997 Charles D. Cranor and Washington University. 6 * All rights reserved. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 18 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 19 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 20 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 21 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 22 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 23 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 24 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 26 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 27 */ 28 29 /* 30 * Copyright 2001 (c) Wasabi Systems, Inc. 31 * All rights reserved. 32 * 33 * Written by Frank van der Linden for Wasabi Systems, Inc. 34 * 35 * Redistribution and use in source and binary forms, with or without 36 * modification, are permitted provided that the following conditions 37 * are met: 38 * 1. Redistributions of source code must retain the above copyright 39 * notice, this list of conditions and the following disclaimer. 40 * 2. Redistributions in binary form must reproduce the above copyright 41 * notice, this list of conditions and the following disclaimer in the 42 * documentation and/or other materials provided with the distribution. 43 * 3. All advertising materials mentioning features or use of this software 44 * must display the following acknowledgement: 45 * This product includes software developed for the NetBSD Project by 46 * Wasabi Systems, Inc. 47 * 4. The name of Wasabi Systems, Inc. may not be used to endorse 48 * or promote products derived from this software without specific prior 49 * written permission. 50 * 51 * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND 52 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 53 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 54 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL WASABI SYSTEMS, INC 55 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 56 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 57 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 58 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 59 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 60 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 61 * POSSIBILITY OF SUCH DAMAGE. 62 */ 63 64 /* 65 * This is the i386 pmap modified and generalized to support x86-64 66 * as well. 
The idea is to hide the upper N levels of the page tables 67 * inside pmap_get_ptp, pmap_free_ptp and pmap_growkernel. The rest 68 * is mostly untouched, except that it uses some more generalized 69 * macros and interfaces. 70 * 71 * This pmap has been tested on the i386 as well, and it can be easily 72 * adapted to PAE. 73 * 74 * fvdl@wasabisystems.com 18-Jun-2001 75 */ 76 77 /* 78 * pmap.c: i386 pmap module rewrite 79 * Chuck Cranor <chuck@ccrc.wustl.edu> 80 * 11-Aug-97 81 * 82 * history of this pmap module: in addition to my own input, i used 83 * the following references for this rewrite of the i386 pmap: 84 * 85 * [1] the NetBSD i386 pmap. this pmap appears to be based on the 86 * BSD hp300 pmap done by Mike Hibler at University of Utah. 87 * it was then ported to the i386 by William Jolitz of UUNET 88 * Technologies, Inc. Then Charles M. Hannum of the NetBSD 89 * project fixed some bugs and provided some speed ups. 90 * 91 * [2] the FreeBSD i386 pmap. this pmap seems to be the 92 * Hibler/Jolitz pmap, as modified for FreeBSD by John S. Dyson 93 * and David Greenman. 94 * 95 * [3] the Mach pmap. this pmap, from CMU, seems to have migrated 96 * between several processors. the VAX version was done by 97 * Avadis Tevanian, Jr., and Michael Wayne Young. the i386 98 * version was done by Lance Berc, Mike Kupfer, Bob Baron, 99 * David Golub, and Richard Draves. the alpha version was 100 * done by Alessandro Forin (CMU/Mach) and Chris Demetriou 101 * (NetBSD/alpha). 102 */ 103 104 #include <sys/param.h> 105 #include <sys/systm.h> 106 #include <sys/atomic.h> 107 #include <sys/proc.h> 108 #include <sys/pool.h> 109 #include <sys/user.h> 110 #include <sys/mutex.h> 111 112 #include <uvm/uvm.h> 113 114 #include <machine/cpu.h> 115 #ifdef MULTIPROCESSOR 116 #include <machine/i82489reg.h> 117 #include <machine/i82489var.h> 118 #endif 119 120 #include "vmm.h" 121 122 #if NVMM > 0 123 #include <machine/vmmvar.h> 124 #endif /* NVMM > 0 */ 125 126 #include "acpi.h" 127 128 /* #define PMAP_DEBUG */ 129 130 #ifdef PMAP_DEBUG 131 #define DPRINTF(x...) do { printf(x); } while(0) 132 #else 133 #define DPRINTF(x...) 134 #endif /* PMAP_DEBUG */ 135 136 137 /* 138 * general info: 139 * 140 * - for an explanation of how the i386 MMU hardware works see 141 * the comments in <machine/pte.h>. 142 * 143 * - for an explanation of the general memory structure used by 144 * this pmap (including the recursive mapping), see the comments 145 * in <machine/pmap.h>. 146 * 147 * this file contains the code for the "pmap module." the module's 148 * job is to manage the hardware's virtual to physical address mappings. 149 * note that there are two levels of mapping in the VM system: 150 * 151 * [1] the upper layer of the VM system uses vm_map's and vm_map_entry's 152 * to map ranges of virtual address space to objects/files. for 153 * example, the vm_map may say: "map VA 0x1000 to 0x22000 read-only 154 * to the file /bin/ls starting at offset zero." note that 155 * the upper layer mapping is not concerned with how individual 156 * vm_pages are mapped. 157 * 158 * [2] the lower layer of the VM system (the pmap) maintains the mappings 159 * from virtual addresses. it is concerned with which vm_page is 160 * mapped where. for example, when you run /bin/ls and start 161 * at page 0x1000 the fault routine may lookup the correct page 162 * of the /bin/ls file and then ask the pmap layer to establish 163 * a mapping for it. 
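 *
 *      a minimal sketch of that hand-off (illustrative only, not a
 *      verbatim quote of the fault path; "map", "pg" and the flags are
 *      placeholder names here):
 *
 *          error = pmap_enter(map->pmap, va, VM_PAGE_TO_PHYS(pg),
 *              prot, PMAP_CANFAIL);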
164 * 165 * note that information in the lower layer of the VM system can be 166 * thrown away since it can easily be reconstructed from the info 167 * in the upper layer. 168 * 169 * data structures we use include: 170 * - struct pmap: describes the address space of one process 171 * - struct pv_entry: describes one <PMAP,VA> mapping of a PA 172 * - struct pg_to_free: a list of virtual addresses whose mappings 173 * have been changed. used for TLB flushing. 174 */ 175 176 /* 177 * memory allocation 178 * 179 * - there are three data structures that we must dynamically allocate: 180 * 181 * [A] new process' page directory page (PDP) 182 * - plan 1: done at pmap_create() we use 183 * pool_get(&pmap_pmap_pool, PR_WAITOK) to do this allocation. 184 * 185 * if we are low in free physical memory then we sleep in 186 * pool_get() -- in this case this is ok since we are creating 187 * a new pmap and should not be holding any locks. 188 * 189 * XXX: the fork code currently has no way to return an "out of 190 * memory, try again" error code since uvm_fork [fka vm_fork] 191 * is a void function. 192 * 193 * [B] new page tables pages (PTP) 194 * call uvm_pagealloc() 195 * => success: zero page, add to pm_pdir 196 * => failure: we are out of free vm_pages, let pmap_enter() 197 * tell UVM about it. 198 * 199 * note: for kernel PTPs, we start with NKPTP of them. as we map 200 * kernel memory (at uvm_map time) we check to see if we've grown 201 * the kernel pmap. if so, we call the optional function 202 * pmap_growkernel() to grow the kernel PTPs in advance. 203 * 204 * [C] pv_entry structures 205 * - try to allocate one from the pool. 206 * If we fail, we simply let pmap_enter() tell UVM about it. 207 */ 208 209 long nkptp[] = NKPTP_INITIALIZER; 210 211 const vaddr_t ptp_masks[] = PTP_MASK_INITIALIZER; 212 const int ptp_shifts[] = PTP_SHIFT_INITIALIZER; 213 const long nkptpmax[] = NKPTPMAX_INITIALIZER; 214 const long nbpd[] = NBPD_INITIALIZER; 215 pd_entry_t *const normal_pdes[] = PDES_INITIALIZER; 216 217 #define pmap_pte_set(p, n) atomic_swap_64(p, n) 218 #define pmap_pte_clearbits(p, b) x86_atomic_clearbits_u64(p, b) 219 #define pmap_pte_setbits(p, b) x86_atomic_setbits_u64(p, b) 220 221 /* 222 * global data structures 223 */ 224 225 struct pmap kernel_pmap_store; /* the kernel's pmap (proc0) */ 226 227 /* 228 * pg_nx: NX PTE bit (if CPU supports) 229 * pg_g_kern: PG_G if global pages should be used in kernel mappings, 230 * 0 otherwise (for insecure CPUs) 231 */ 232 pt_entry_t pg_nx = 0; 233 pt_entry_t pg_g_kern = 0; 234 235 /* pg_xo: XO PTE bits, set to PKU key1 (if cpu supports PKU) */ 236 pt_entry_t pg_xo; 237 238 /* pg_crypt, pg_frame, pg_lgframe: will be derived from CPUID */ 239 pt_entry_t pg_crypt = 0; 240 pt_entry_t pg_frame = PG_FRAME; 241 pt_entry_t pg_lgframe = PG_LGFRAME; 242 243 /* 244 * pmap_pg_wc: if our processor supports PAT then we set this 245 * to be the pte bits for Write Combining. Else we fall back to 246 * UC- so mtrrs can override the cacheability; 247 */ 248 int pmap_pg_wc = PG_UCMINUS; 249 250 /* 251 * pmap_use_pcid: nonzero if PCID use is enabled (currently we require INVPCID) 252 * 253 * The next three are zero unless and until PCID support is enabled so code 254 * can just 'or' them in as needed without tests. 
255 * cr3_pcid: CR3_REUSE_PCID 256 * cr3_pcid_proc and cr3_pcid_temp: PCID_PROC and PCID_TEMP 257 */ 258 #if PCID_KERN != 0 259 # error "pmap.c assumes PCID_KERN is zero" 260 #endif 261 int pmap_use_pcid; 262 static u_int cr3_pcid_proc; 263 static u_int cr3_pcid_temp; 264 /* these two are accessed from locore.o */ 265 paddr_t cr3_reuse_pcid; 266 paddr_t cr3_pcid_proc_intel; 267 268 /* 269 * other data structures 270 */ 271 272 pt_entry_t protection_codes[8]; /* maps MI prot to i386 prot code */ 273 int pmap_initialized = 0; /* pmap_init done yet? */ 274 275 /* 276 * pv management structures. 277 */ 278 struct pool pmap_pv_pool; 279 280 /* 281 * linked list of all non-kernel pmaps 282 */ 283 284 struct pmap_head pmaps; 285 struct mutex pmaps_lock = MUTEX_INITIALIZER(IPL_VM); 286 287 /* 288 * pool that pmap structures are allocated from 289 */ 290 291 struct pool pmap_pmap_pool; 292 293 /* 294 * When we're freeing a ptp, we need to delay the freeing until all 295 * tlb shootdown has been done. This is the list of the to-be-freed pages. 296 */ 297 TAILQ_HEAD(pg_to_free, vm_page); 298 299 /* 300 * pool that PDPs are allocated from 301 */ 302 303 struct pool pmap_pdp_pool; 304 void pmap_pdp_ctor(pd_entry_t *); 305 void pmap_pdp_ctor_intel(pd_entry_t *); 306 307 extern vaddr_t msgbuf_vaddr; 308 extern paddr_t msgbuf_paddr; 309 310 extern vaddr_t idt_vaddr; /* we allocate IDT early */ 311 extern paddr_t idt_paddr; 312 313 extern vaddr_t lo32_vaddr; 314 extern vaddr_t lo32_paddr; 315 316 vaddr_t virtual_avail; 317 extern int end; 318 319 /* 320 * local prototypes 321 */ 322 323 void pmap_enter_pv(struct vm_page *, struct pv_entry *, struct pmap *, 324 vaddr_t, struct vm_page *); 325 struct vm_page *pmap_get_ptp(struct pmap *, vaddr_t); 326 struct vm_page *pmap_find_ptp(struct pmap *, vaddr_t, paddr_t, int); 327 int pmap_find_pte_direct(struct pmap *pm, vaddr_t va, pt_entry_t **pd, int *offs); 328 void pmap_free_ptp(struct pmap *, struct vm_page *, 329 vaddr_t, struct pg_to_free *); 330 void pmap_freepage(struct pmap *, struct vm_page *, int, struct pg_to_free *); 331 #ifdef MULTIPROCESSOR 332 static int pmap_is_active(struct pmap *, struct cpu_info *); 333 #endif 334 paddr_t pmap_map_ptes(struct pmap *); 335 struct pv_entry *pmap_remove_pv(struct vm_page *, struct pmap *, vaddr_t); 336 void pmap_do_remove(struct pmap *, vaddr_t, vaddr_t, int); 337 #if NVMM > 0 338 void pmap_remove_ept(struct pmap *, vaddr_t, vaddr_t); 339 void pmap_do_remove_ept(struct pmap *, vaddr_t); 340 int pmap_enter_ept(struct pmap *, vaddr_t, paddr_t, vm_prot_t); 341 void pmap_shootept(struct pmap *, int); 342 #endif /* NVMM > 0 */ 343 int pmap_remove_pte(struct pmap *, struct vm_page *, pt_entry_t *, 344 vaddr_t, int, struct pv_entry **); 345 void pmap_remove_ptes(struct pmap *, struct vm_page *, vaddr_t, 346 vaddr_t, vaddr_t, int, struct pv_entry **); 347 #define PMAP_REMOVE_ALL 0 /* remove all mappings */ 348 #define PMAP_REMOVE_SKIPWIRED 1 /* skip wired mappings */ 349 350 void pmap_unmap_ptes(struct pmap *, paddr_t); 351 int pmap_get_physpage(vaddr_t, int, paddr_t *); 352 int pmap_pdes_valid(vaddr_t, pd_entry_t *); 353 void pmap_alloc_level(vaddr_t, int, long *); 354 355 static inline 356 void pmap_sync_flags_pte(struct vm_page *, u_long); 357 358 void pmap_tlb_shootpage(struct pmap *, vaddr_t, int); 359 void pmap_tlb_shootrange(struct pmap *, vaddr_t, vaddr_t, int); 360 void pmap_tlb_shoottlb(struct pmap *, int); 361 #ifdef MULTIPROCESSOR 362 void pmap_tlb_shootwait(void); 363 #else 364 #define pmap_tlb_shootwait() do 
{ } while (0) 365 #endif 366 367 /* 368 * p m a p i n l i n e h e l p e r f u n c t i o n s 369 */ 370 371 /* 372 * pmap_is_curpmap: is this pmap the one currently loaded [in %cr3]? 373 * of course the kernel is always loaded 374 */ 375 376 static inline int 377 pmap_is_curpmap(struct pmap *pmap) 378 { 379 return((pmap == pmap_kernel()) || 380 (pmap->pm_pdirpa == (rcr3() & CR3_PADDR))); 381 } 382 383 /* 384 * pmap_is_active: is this pmap loaded into the specified processor's %cr3? 385 */ 386 387 #ifdef MULTIPROCESSOR 388 static inline int 389 pmap_is_active(struct pmap *pmap, struct cpu_info *ci) 390 { 391 return (pmap == pmap_kernel() || pmap == ci->ci_proc_pmap 392 #if NVMM > 0 393 || (pmap_is_ept(pmap) && pmap == ci->ci_ept_pmap) 394 #endif /* NVMM > 0 */ 395 ); 396 } 397 #endif 398 399 static inline u_int 400 pmap_pte2flags(u_long pte) 401 { 402 return (((pte & PG_U) ? PG_PMAP_REF : 0) | 403 ((pte & PG_M) ? PG_PMAP_MOD : 0)); 404 } 405 406 static inline void 407 pmap_sync_flags_pte(struct vm_page *pg, u_long pte) 408 { 409 if (pte & (PG_U|PG_M)) { 410 atomic_setbits_int(&pg->pg_flags, pmap_pte2flags(pte)); 411 } 412 } 413 414 /* 415 * pmap_map_ptes: map a pmap's PTEs into KVM 416 * 417 * This should not be done for EPT pmaps 418 */ 419 paddr_t 420 pmap_map_ptes(struct pmap *pmap) 421 { 422 paddr_t cr3; 423 424 KASSERT(!pmap_is_ept(pmap)); 425 426 /* the kernel's pmap is always accessible */ 427 if (pmap == pmap_kernel()) 428 return 0; 429 430 /* 431 * Lock the target map before switching to its page tables to 432 * guarantee other CPUs have finished changing the tables before 433 * we potentially start caching table and TLB entries. 434 */ 435 mtx_enter(&pmap->pm_mtx); 436 437 cr3 = rcr3(); 438 KASSERT((cr3 & CR3_PCID) == PCID_KERN || 439 (cr3 & CR3_PCID) == PCID_PROC); 440 if (pmap->pm_pdirpa == (cr3 & CR3_PADDR)) 441 cr3 = 0; 442 else { 443 cr3 |= cr3_reuse_pcid; 444 lcr3(pmap->pm_pdirpa | cr3_pcid_temp); 445 } 446 447 return cr3; 448 } 449 450 void 451 pmap_unmap_ptes(struct pmap *pmap, paddr_t save_cr3) 452 { 453 if (pmap != pmap_kernel()) 454 mtx_leave(&pmap->pm_mtx); 455 456 if (save_cr3 != 0) 457 lcr3(save_cr3); 458 } 459 460 int 461 pmap_find_pte_direct(struct pmap *pm, vaddr_t va, pt_entry_t **pd, int *offs) 462 { 463 u_long mask, shift; 464 pd_entry_t pde; 465 paddr_t pdpa; 466 int lev; 467 468 pdpa = pm->pm_pdirpa; 469 shift = L4_SHIFT; 470 mask = L4_MASK; 471 for (lev = PTP_LEVELS; lev > 0; lev--) { 472 *pd = (pd_entry_t *)PMAP_DIRECT_MAP(pdpa); 473 *offs = (VA_SIGN_POS(va) & mask) >> shift; 474 pde = (*pd)[*offs]; 475 476 /* Large pages are different, break early if we run into one. */ 477 if ((pde & (PG_PS|PG_V)) != PG_V) 478 return (lev - 1); 479 480 pdpa = ((*pd)[*offs] & pg_frame); 481 /* 4096/8 == 512 == 2^9 entries per level */ 482 shift -= 9; 483 mask >>= 9; 484 } 485 486 return (0); 487 } 488 489 /* 490 * p m a p k e n t e r f u n c t i o n s 491 * 492 * functions to quickly enter/remove pages from the kernel address 493 * space. pmap_kremove is exported to MI kernel. we make use of 494 * the recursive PTE mappings. 495 */ 496 497 /* 498 * pmap_kenter_pa: enter a kernel mapping without R/M (pv_entry) tracking 499 * 500 * => no need to lock anything, assume va is already allocated 501 * => should be faster than normal pmap enter function 502 */ 503 504 void 505 pmap_kenter_pa(vaddr_t va, paddr_t pa, vm_prot_t prot) 506 { 507 pt_entry_t *pte, opte, npte; 508 509 pte = kvtopte(va); 510 511 npte = (pa & PMAP_PA_MASK) | ((prot & PROT_WRITE) ? 
PG_RW : PG_RO) | 512 ((pa & PMAP_NOCACHE) ? PG_N : 0) | 513 ((pa & PMAP_WC) ? pmap_pg_wc : 0) | PG_V | 514 ((pa & PMAP_NOCRYPT) ? 0 : pg_crypt); 515 516 /* special 1:1 mappings in the first 2MB must not be global */ 517 if (va >= (vaddr_t)NBPD_L2) 518 npte |= pg_g_kern; 519 520 if (!(prot & PROT_EXEC)) 521 npte |= pg_nx; 522 opte = pmap_pte_set(pte, npte); 523 #ifdef LARGEPAGES 524 /* XXX For now... */ 525 if (opte & PG_PS) 526 panic("%s: PG_PS", __func__); 527 #endif 528 if (pmap_valid_entry(opte)) { 529 if ((pa & PMAP_NOCACHE && (opte & PG_N) == 0) || 530 (pa & PMAP_NOCRYPT)) 531 wbinvd_on_all_cpus(); 532 /* This shouldn't happen */ 533 pmap_tlb_shootpage(pmap_kernel(), va, 1); 534 pmap_tlb_shootwait(); 535 } 536 } 537 538 /* 539 * pmap_kremove: remove a kernel mapping(s) without R/M (pv_entry) tracking 540 * 541 * => no need to lock anything 542 * => caller must dispose of any vm_page mapped in the va range 543 * => note: not an inline function 544 * => we assume the va is page aligned and the len is a multiple of PAGE_SIZE 545 * => we assume kernel only unmaps valid addresses and thus don't bother 546 * checking the valid bit before doing TLB flushing 547 */ 548 549 void 550 pmap_kremove(vaddr_t sva, vsize_t len) 551 { 552 pt_entry_t *pte, opte; 553 vaddr_t va, eva; 554 555 eva = sva + len; 556 557 for (va = sva; va != eva; va += PAGE_SIZE) { 558 pte = kvtopte(va); 559 560 opte = pmap_pte_set(pte, 0); 561 #ifdef LARGEPAGES 562 KASSERT((opte & PG_PS) == 0); 563 #endif 564 KASSERT((opte & PG_PVLIST) == 0); 565 } 566 567 pmap_tlb_shootrange(pmap_kernel(), sva, eva, 1); 568 pmap_tlb_shootwait(); 569 } 570 571 /* 572 * pmap_set_pml4_early 573 * 574 * Utility function to map 2GB of 2MB pages to 'pa'. The VA that is assigned 575 * is the pml4 entry for 'early mappings' (see pmap.h). This function is used 576 * by display drivers that need to map their framebuffers early, before the 577 * pmap is fully initialized (eg, to show panic messages). 578 * 579 * Users of this function must call pmap_clear_pml4_early to remove the 580 * mapping when finished. 581 * 582 * Parameters: 583 * pa: phys addr to map 584 * 585 * Return value: 586 * VA mapping to 'pa'. This mapping is 2GB in size and starts at the base 587 * of the 2MB region containing 'va'. 588 */ 589 vaddr_t 590 pmap_set_pml4_early(paddr_t pa) 591 { 592 extern paddr_t early_pte_pages; 593 pt_entry_t *pml4e, *pte; 594 int i, j, off; 595 paddr_t curpa; 596 vaddr_t va; 597 598 pml4e = (pt_entry_t *)(proc0.p_addr->u_pcb.pcb_cr3 + KERNBASE); 599 pml4e[PDIR_SLOT_EARLY] = (pd_entry_t)early_pte_pages | PG_V | PG_RW | 600 pg_crypt; 601 602 off = pa & PAGE_MASK_L2; 603 curpa = pa & L2_FRAME; 604 605 pte = (pt_entry_t *)PMAP_DIRECT_MAP(early_pte_pages); 606 memset(pte, 0, 3 * NBPG); 607 608 pte[0] = (early_pte_pages + NBPG) | PG_V | PG_RW | pg_crypt; 609 pte[1] = (early_pte_pages + 2 * NBPG) | PG_V | PG_RW | pg_crypt; 610 611 pte = (pt_entry_t *)PMAP_DIRECT_MAP(early_pte_pages + NBPG); 612 for (i = 0; i < 2; i++) { 613 /* 2 early pages of mappings */ 614 for (j = 0; j < 512; j++) { 615 /* j[0..511] : 2MB mappings per page */ 616 pte[(i * 512) + j] = curpa | PG_V | PG_RW | PG_PS | 617 pg_crypt; 618 curpa += (2 * 1024 * 1024); 619 } 620 } 621 622 va = (vaddr_t)((PDIR_SLOT_EARLY * 512ULL) << L3_SHIFT) + off; 623 return VA_SIGN_NEG(va); 624 } 625 626 /* 627 * pmap_clear_pml4_early 628 * 629 * Clears the mapping previously established with pmap_set_pml4_early. 
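 *
 * Typical usage by such a driver (a sketch under the assumption that
 * 'fb_pa' is the framebuffer's physical address; not a verbatim caller
 * from the tree):
 *
 *	va = pmap_set_pml4_early(fb_pa);
 *	... write early/panic output through 'va' ...
 *	pmap_clear_pml4_early();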
630 */ 631 void 632 pmap_clear_pml4_early(void) 633 { 634 extern paddr_t early_pte_pages; 635 pt_entry_t *pml4e, *pte; 636 637 pte = (pt_entry_t *)PMAP_DIRECT_MAP(early_pte_pages); 638 memset(pte, 0, 3 * NBPG); 639 640 pml4e = (pd_entry_t *)pmap_kernel()->pm_pdir; 641 pml4e[PDIR_SLOT_EARLY] = 0; 642 tlbflush(); 643 } 644 645 /* 646 * p m a p i n i t f u n c t i o n s 647 * 648 * pmap_bootstrap and pmap_init are called during system startup 649 * to init the pmap module. pmap_bootstrap() does a low level 650 * init just to get things rolling. pmap_init() finishes the job. 651 */ 652 653 /* 654 * pmap_bootstrap: get the system in a state where it can run with VM 655 * properly enabled (called before main()). the VM system is 656 * fully init'd later... 657 */ 658 659 paddr_t 660 pmap_bootstrap(paddr_t first_avail, paddr_t max_pa) 661 { 662 vaddr_t kva_start = VM_MIN_KERNEL_ADDRESS; 663 struct pmap *kpm; 664 int curslot, i, j, p; 665 long ndmpdp; 666 paddr_t dmpd, dmpdp, start_cur, cur_pa; 667 vaddr_t kva, kva_end; 668 pt_entry_t *pml3, *pml2; 669 670 KASSERT(((0x1000ULL | pg_crypt) & pg_frame) == 0x1000ULL); 671 672 /* 673 * define the boundaries of the managed kernel virtual address 674 * space. 675 */ 676 677 virtual_avail = kva_start; /* first free KVA */ 678 679 /* 680 * If PKU is available, initialize PROT_EXEC entry correctly, 681 * and enable the feature before it gets used 682 * XXX Some Hypervisors forget to save/restore PKU 683 */ 684 if (cpuid_level >= 0x7) { 685 uint32_t ecx, dummy; 686 687 CPUID_LEAF(0x7, 0, dummy, dummy, ecx, dummy); 688 if (ecx & SEFF0ECX_PKU) { 689 lcr4(rcr4() | CR4_PKE); 690 pg_xo = PG_XO; 691 } 692 } 693 694 /* 695 * set up protection_codes: we need to be able to convert from 696 * a MI protection code (some combo of VM_PROT...) to something 697 * we can jam into a i386 PTE. 698 */ 699 700 protection_codes[PROT_NONE] = pg_nx; /* --- */ 701 protection_codes[PROT_EXEC] = pg_xo; /* --x */ 702 protection_codes[PROT_READ] = PG_RO | pg_nx; /* -r- */ 703 protection_codes[PROT_READ | PROT_EXEC] = PG_RO; /* -rx */ 704 protection_codes[PROT_WRITE] = PG_RW | pg_nx; /* w-- */ 705 protection_codes[PROT_WRITE | PROT_EXEC] = PG_RW; /* w-x */ 706 protection_codes[PROT_WRITE | PROT_READ] = PG_RW | pg_nx; /* wr- */ 707 protection_codes[PROT_READ | PROT_WRITE | PROT_EXEC] = PG_RW; /* wrx */ 708 709 /* 710 * now we init the kernel's pmap 711 * 712 * the kernel pmap's pm_obj is not used for much. however, in 713 * user pmaps the pm_obj contains the list of active PTPs. 714 * the pm_obj currently does not have a pager. 715 */ 716 717 kpm = pmap_kernel(); 718 for (i = 0; i < PTP_LEVELS - 1; i++) { 719 uvm_obj_init(&kpm->pm_obj[i], &pmap_pager, 1); 720 kpm->pm_ptphint[i] = NULL; 721 } 722 memset(&kpm->pm_list, 0, sizeof(kpm->pm_list)); /* pm_list not used */ 723 kpm->pm_pdir = (pd_entry_t *)(proc0.p_addr->u_pcb.pcb_cr3 + KERNBASE); 724 kpm->pm_pdirpa = proc0.p_addr->u_pcb.pcb_cr3; 725 kpm->pm_stats.wired_count = kpm->pm_stats.resident_count = 726 atop(kva_start - VM_MIN_KERNEL_ADDRESS); 727 /* 728 * the above is just a rough estimate and not critical to the proper 729 * operation of the system. 730 */ 731 732 kpm->pm_type = PMAP_TYPE_NORMAL; 733 734 curpcb->pcb_pmap = kpm; /* proc0's pcb */ 735 736 /* 737 * Configure and enable PCID use if supported. 738 * Currently we require INVPCID support. 
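 *
 * Once enabled, %cr3 loads elsewhere in this file just OR the relevant
 * PCID into the page directory PA, e.g. (sketch of the pattern used by
 * pmap_map_ptes() and pmap_activate() below):
 *
 *	lcr3(pmap->pm_pdirpa | cr3_pcid_temp);
 *	pcb->pcb_cr3 = pmap->pm_pdirpa | cr3_pcid_proc;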
739 */ 740 if ((cpu_ecxfeature & CPUIDECX_PCID) && cpuid_level >= 0x07) { 741 uint32_t ebx, dummy; 742 CPUID_LEAF(0x7, 0, dummy, ebx, dummy, dummy); 743 if (ebx & SEFF0EBX_INVPCID) { 744 pmap_use_pcid = 1; 745 /* 746 * We cannot use global mappings because 747 * invpcid function 0 does not invalidate global 748 * mappings. The hardware can cache kernel 749 * mappings based on PCID_KERN, i.e. there is no 750 * need for global mappings. 751 */ 752 pg_g_kern = 0; 753 lcr4( rcr4() | CR4_PCIDE ); 754 cr3_pcid_proc = PCID_PROC; 755 cr3_pcid_temp = PCID_TEMP; 756 cr3_reuse_pcid = CR3_REUSE_PCID; 757 cr3_pcid_proc_intel = PCID_PROC_INTEL; 758 } 759 } 760 761 /* 762 * Add PG_G attribute to already mapped kernel pages. pg_g_kern 763 * is calculated in locore0.S and may be set to: 764 * 765 * 0 if this CPU does not safely support global pages in the kernel 766 * (Intel/Meltdown) 767 * PG_G if this CPU does safely support global pages in the kernel 768 * (AMD) 769 */ 770 #if KERNBASE == VM_MIN_KERNEL_ADDRESS 771 for (kva = VM_MIN_KERNEL_ADDRESS ; kva < virtual_avail ; 772 #else 773 kva_end = roundup((vaddr_t)&end, PAGE_SIZE); 774 for (kva = KERNBASE; kva < kva_end ; 775 #endif 776 kva += PAGE_SIZE) { 777 unsigned long p1i = pl1_i(kva); 778 if (pmap_valid_entry(PTE_BASE[p1i])) 779 PTE_BASE[p1i] |= pg_g_kern; 780 } 781 782 /* 783 * Map the direct map. The first 4GB were mapped in locore, here 784 * we map the rest if it exists. We actually use the direct map 785 * here to set up the page tables, we're assuming that we're still 786 * operating in the lower 4GB of memory. 787 * 788 * Map (up to) the first 512GB of physical memory first. This part 789 * is handled differently than physical memory > 512GB since we have 790 * already mapped part of this range in locore0. 
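 *
 * Worked example of the sizing below (illustrative numbers): with
 * L3_SHIFT = 30 (1GB per L3 entry), a machine with max_pa = 16GB gets
 * ndmpdp = 16, i.e. sixteen pages of L2 entries, each filled with
 * 512 2MB PG_PS mappings.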
791 */ 792 ndmpdp = (max_pa + NBPD_L3 - 1) >> L3_SHIFT; 793 if (ndmpdp < NDML2_ENTRIES) 794 ndmpdp = NDML2_ENTRIES; /* At least 4GB */ 795 if (ndmpdp > 512) 796 ndmpdp = 512; /* At most 512GB */ 797 798 dmpdp = kpm->pm_pdir[PDIR_SLOT_DIRECT] & pg_frame; 799 800 dmpd = first_avail; first_avail += ndmpdp * PAGE_SIZE; 801 802 for (i = NDML2_ENTRIES; i < NPDPG * ndmpdp; i++) { 803 paddr_t pdp; 804 vaddr_t va; 805 806 pdp = (paddr_t)&(((pd_entry_t *)dmpd)[i]); 807 va = PMAP_DIRECT_MAP(pdp); 808 809 *((pd_entry_t *)va) = ((paddr_t)i << L2_SHIFT); 810 *((pd_entry_t *)va) |= PG_RW | PG_V | PG_PS | pg_g_kern | PG_U | 811 PG_M | pg_nx | pg_crypt; 812 } 813 814 for (i = NDML2_ENTRIES; i < ndmpdp; i++) { 815 paddr_t pdp; 816 vaddr_t va; 817 818 pdp = (paddr_t)&(((pd_entry_t *)dmpdp)[i]); 819 va = PMAP_DIRECT_MAP(pdp); 820 821 *((pd_entry_t *)va) = dmpd + (i << PAGE_SHIFT); 822 *((pd_entry_t *)va) |= PG_RW | PG_V | PG_U | PG_M | pg_nx | 823 pg_crypt; 824 } 825 826 kpm->pm_pdir[PDIR_SLOT_DIRECT] = dmpdp | PG_V | PG_KW | PG_U | 827 PG_M | pg_nx | pg_crypt; 828 829 /* Map any remaining physical memory > 512GB */ 830 for (curslot = 1 ; curslot < NUM_L4_SLOT_DIRECT ; curslot++) { 831 /* 832 * Start of current range starts at PA (curslot) * 512GB 833 */ 834 start_cur = (paddr_t)(curslot * NBPD_L4); 835 if (max_pa > start_cur) { 836 /* Next 512GB, new PML4e and L3(512GB) page */ 837 dmpd = first_avail; first_avail += PAGE_SIZE; 838 pml3 = (pt_entry_t *)PMAP_DIRECT_MAP(dmpd); 839 kpm->pm_pdir[PDIR_SLOT_DIRECT + curslot] = dmpd | 840 PG_KW | PG_V | PG_U | PG_M | pg_nx | pg_crypt; 841 842 /* Calculate full 1GB pages in this 512GB region */ 843 p = ((max_pa - start_cur) >> L3_SHIFT); 844 845 /* Check if a partial (<1GB) page remains */ 846 if (max_pa & L2_MASK) 847 p++; 848 849 /* 850 * Handle the case where this range is full and there 851 * is still more memory after (p would be > 512). 852 */ 853 if (p > NPDPG) 854 p = NPDPG; 855 856 /* Allocate 'p' L2(1GB) pages and populate */ 857 for (i = 0; i < p; i++) { 858 dmpd = first_avail; first_avail += PAGE_SIZE; 859 pml2 = (pt_entry_t *)PMAP_DIRECT_MAP(dmpd); 860 pml3[i] = dmpd | 861 PG_RW | PG_V | PG_U | PG_M | pg_nx | 862 pg_crypt; 863 864 cur_pa = start_cur + (i << L3_SHIFT); 865 j = 0; 866 867 while (cur_pa < max_pa && j < NPDPG) { 868 pml2[j] = curslot * NBPD_L4 + 869 (uint64_t)i * NBPD_L3 + 870 (uint64_t)j * NBPD_L2; 871 pml2[j] |= PG_RW | PG_V | pg_g_kern | 872 PG_U | PG_M | pg_nx | PG_PS | 873 pg_crypt; 874 cur_pa += NBPD_L2; 875 j++; 876 } 877 } 878 } 879 } 880 881 tlbflush(); 882 883 msgbuf_vaddr = virtual_avail; 884 virtual_avail += round_page(MSGBUFSIZE); 885 886 idt_vaddr = virtual_avail; 887 virtual_avail += 2 * PAGE_SIZE; 888 idt_paddr = first_avail; /* steal a page */ 889 first_avail += 2 * PAGE_SIZE; 890 891 #if defined(MULTIPROCESSOR) || \ 892 (NACPI > 0 && !defined(SMALL_KERNEL)) 893 /* 894 * Grab a page below 4G for things that need it (i.e. 895 * having an initial %cr3 for the MP trampoline). 896 */ 897 lo32_vaddr = virtual_avail; 898 virtual_avail += PAGE_SIZE; 899 lo32_paddr = first_avail; 900 first_avail += PAGE_SIZE; 901 #endif 902 903 /* 904 * init the global lists. 905 */ 906 LIST_INIT(&pmaps); 907 908 /* 909 * initialize the pmap pools. 910 */ 911 912 pool_init(&pmap_pmap_pool, sizeof(struct pmap), 0, IPL_VM, 0, 913 "pmappl", NULL); 914 pool_init(&pmap_pv_pool, sizeof(struct pv_entry), 0, IPL_VM, 0, 915 "pvpl", &pool_allocator_single); 916 pool_sethiwat(&pmap_pv_pool, 32 * 1024); 917 918 /* 919 * initialize the PDE pool. 
920 */ 921 922 pool_init(&pmap_pdp_pool, PAGE_SIZE, 0, IPL_VM, 0, 923 "pdppl", &pool_allocator_single); 924 925 kpm->pm_pdir_intel = NULL; 926 kpm->pm_pdirpa_intel = 0; 927 928 /* 929 * ensure the TLB is sync'd with reality by flushing it... 930 */ 931 932 tlbflush(); 933 934 return first_avail; 935 } 936 937 void 938 pmap_init_percpu(void) 939 { 940 pool_cache_init(&pmap_pv_pool); 941 } 942 943 /* 944 * pmap_randomize 945 * 946 * Randomizes the location of the kernel pmap 947 */ 948 void 949 pmap_randomize(void) 950 { 951 pd_entry_t *pml4va, *oldpml4va; 952 paddr_t pml4pa; 953 int i; 954 955 pml4va = km_alloc(PAGE_SIZE, &kv_page, &kp_zero, &kd_nowait); 956 if (pml4va == NULL) 957 panic("%s: km_alloc failed", __func__); 958 959 /* Copy old PML4 page to new one */ 960 oldpml4va = pmap_kernel()->pm_pdir; 961 memcpy(pml4va, oldpml4va, PAGE_SIZE); 962 963 /* Switch to new PML4 */ 964 pmap_extract(pmap_kernel(), (vaddr_t)pml4va, &pml4pa); 965 lcr3(pml4pa); 966 967 /* Fixup pmap_kernel and proc0's %cr3 */ 968 pmap_kernel()->pm_pdirpa = pml4pa; 969 pmap_kernel()->pm_pdir = pml4va; 970 proc0.p_addr->u_pcb.pcb_cr3 = pml4pa; 971 972 /* Fixup recursive PTE PML4E slot. We are only changing the PA */ 973 pml4va[PDIR_SLOT_PTE] = pml4pa | (pml4va[PDIR_SLOT_PTE] & ~pg_frame); 974 975 for (i = 0; i < NPDPG; i++) { 976 /* PTE slot already handled earlier */ 977 if (i == PDIR_SLOT_PTE) 978 continue; 979 980 if (pml4va[i] & pg_frame) 981 pmap_randomize_level(&pml4va[i], 3); 982 } 983 984 /* Wipe out bootstrap PML4 */ 985 memset(oldpml4va, 0, PAGE_SIZE); 986 tlbflush(); 987 } 988 989 void 990 pmap_randomize_level(pd_entry_t *pde, int level) 991 { 992 pd_entry_t *new_pd_va; 993 paddr_t old_pd_pa, new_pd_pa; 994 vaddr_t old_pd_va; 995 struct vm_page *pg; 996 int i; 997 998 if (level == 0) 999 return; 1000 1001 if (level < PTP_LEVELS - 1 && (*pde & PG_PS)) 1002 return; 1003 1004 new_pd_va = km_alloc(PAGE_SIZE, &kv_page, &kp_zero, &kd_nowait); 1005 if (new_pd_va == NULL) 1006 panic("%s: cannot allocate page for L%d page directory", 1007 __func__, level); 1008 1009 old_pd_pa = *pde & pg_frame; 1010 old_pd_va = PMAP_DIRECT_MAP(old_pd_pa); 1011 pmap_extract(pmap_kernel(), (vaddr_t)new_pd_va, &new_pd_pa); 1012 memcpy(new_pd_va, (void *)old_pd_va, PAGE_SIZE); 1013 *pde = new_pd_pa | (*pde & ~pg_frame); 1014 1015 tlbflush(); 1016 memset((void *)old_pd_va, 0, PAGE_SIZE); 1017 1018 pg = PHYS_TO_VM_PAGE(old_pd_pa); 1019 if (pg != NULL) { 1020 pg->wire_count--; 1021 pmap_kernel()->pm_stats.resident_count--; 1022 if (pg->wire_count <= 1) 1023 uvm_pagefree(pg); 1024 } 1025 1026 for (i = 0; i < NPDPG; i++) 1027 if (new_pd_va[i] & pg_frame) 1028 pmap_randomize_level(&new_pd_va[i], level - 1); 1029 } 1030 1031 /* 1032 * Pre-allocate PTPs for low memory, so that 1:1 mappings for various 1033 * trampoline code can be entered. 
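 *
 * One page of boot memory is consumed for each level of page table
 * that has to be created, and the advanced allocation pointer is
 * returned, so the boot path can chain it:
 * first_avail = pmap_prealloc_lowmem_ptps(first_avail);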
1034 */ 1035 paddr_t 1036 pmap_prealloc_lowmem_ptps(paddr_t first_avail) 1037 { 1038 pd_entry_t *pdes; 1039 int level; 1040 paddr_t newp; 1041 1042 pdes = pmap_kernel()->pm_pdir; 1043 level = PTP_LEVELS; 1044 for (;;) { 1045 newp = first_avail; first_avail += PAGE_SIZE; 1046 memset((void *)PMAP_DIRECT_MAP(newp), 0, PAGE_SIZE); 1047 pdes[pl_i(0, level)] = 1048 (newp & pg_frame) | PG_V | PG_RW | pg_crypt; 1049 level--; 1050 if (level <= 1) 1051 break; 1052 pdes = normal_pdes[level - 2]; 1053 } 1054 1055 return first_avail; 1056 } 1057 1058 /* 1059 * pmap_init: no further initialization required on this platform 1060 */ 1061 void 1062 pmap_init(void) 1063 { 1064 pmap_initialized = 1; 1065 } 1066 1067 /* 1068 * p v _ e n t r y f u n c t i o n s 1069 */ 1070 1071 /* 1072 * main pv_entry manipulation functions: 1073 * pmap_enter_pv: enter a mapping onto a pv list 1074 * pmap_remove_pv: remove a mapping from a pv list 1075 */ 1076 1077 /* 1078 * pmap_enter_pv: enter a mapping onto a pv list 1079 * 1080 * => caller should adjust ptp's wire_count before calling 1081 * 1082 * pve: preallocated pve for us to use 1083 * ptp: PTP in pmap that maps this VA 1084 */ 1085 1086 void 1087 pmap_enter_pv(struct vm_page *pg, struct pv_entry *pve, struct pmap *pmap, 1088 vaddr_t va, struct vm_page *ptp) 1089 { 1090 pve->pv_pmap = pmap; 1091 pve->pv_va = va; 1092 pve->pv_ptp = ptp; /* NULL for kernel pmap */ 1093 mtx_enter(&pg->mdpage.pv_mtx); 1094 pve->pv_next = pg->mdpage.pv_list; /* add to ... */ 1095 pg->mdpage.pv_list = pve; /* ... list */ 1096 mtx_leave(&pg->mdpage.pv_mtx); 1097 } 1098 1099 /* 1100 * pmap_remove_pv: try to remove a mapping from a pv_list 1101 * 1102 * => caller should adjust ptp's wire_count and free PTP if needed 1103 * => we return the removed pve 1104 */ 1105 1106 struct pv_entry * 1107 pmap_remove_pv(struct vm_page *pg, struct pmap *pmap, vaddr_t va) 1108 { 1109 struct pv_entry *pve, **prevptr; 1110 1111 mtx_enter(&pg->mdpage.pv_mtx); 1112 prevptr = &pg->mdpage.pv_list; 1113 while ((pve = *prevptr) != NULL) { 1114 if (pve->pv_pmap == pmap && pve->pv_va == va) { /* match? */ 1115 *prevptr = pve->pv_next; /* remove it! 
*/ 1116 break; 1117 } 1118 prevptr = &pve->pv_next; /* previous pointer */ 1119 } 1120 mtx_leave(&pg->mdpage.pv_mtx); 1121 return(pve); /* return removed pve */ 1122 } 1123 1124 /* 1125 * p t p f u n c t i o n s 1126 */ 1127 1128 struct vm_page * 1129 pmap_find_ptp(struct pmap *pmap, vaddr_t va, paddr_t pa, int level) 1130 { 1131 int lidx = level - 1; 1132 struct vm_page *pg; 1133 1134 if (pa != (paddr_t)-1 && pmap->pm_ptphint[lidx] && 1135 pa == VM_PAGE_TO_PHYS(pmap->pm_ptphint[lidx])) 1136 return (pmap->pm_ptphint[lidx]); 1137 1138 pg = uvm_pagelookup(&pmap->pm_obj[lidx], ptp_va2o(va, level)); 1139 1140 return pg; 1141 } 1142 1143 void 1144 pmap_freepage(struct pmap *pmap, struct vm_page *ptp, int level, 1145 struct pg_to_free *pagelist) 1146 { 1147 int lidx; 1148 struct uvm_object *obj; 1149 1150 lidx = level - 1; 1151 1152 obj = &pmap->pm_obj[lidx]; 1153 pmap->pm_stats.resident_count--; 1154 if (pmap->pm_ptphint[lidx] == ptp) 1155 pmap->pm_ptphint[lidx] = RBT_ROOT(uvm_objtree, &obj->memt); 1156 ptp->wire_count = 0; 1157 uvm_pagerealloc(ptp, NULL, 0); 1158 TAILQ_INSERT_TAIL(pagelist, ptp, pageq); 1159 } 1160 1161 void 1162 pmap_free_ptp(struct pmap *pmap, struct vm_page *ptp, vaddr_t va, 1163 struct pg_to_free *pagelist) 1164 { 1165 unsigned long index; 1166 int level; 1167 vaddr_t invaladdr; 1168 1169 level = 1; 1170 do { 1171 pmap_freepage(pmap, ptp, level, pagelist); 1172 index = pl_i(va, level + 1); 1173 pmap_pte_set(&normal_pdes[level - 1][index], 0); 1174 if (level == PTP_LEVELS - 1 && pmap->pm_pdir_intel != NULL) { 1175 /* Zap special meltdown PML4e */ 1176 pmap_pte_set(&pmap->pm_pdir_intel[index], 0); 1177 DPRINTF("%s: cleared meltdown PML4e @ index %lu " 1178 "(va range start 0x%llx)\n", __func__, index, 1179 (uint64_t)(index << L4_SHIFT)); 1180 } 1181 invaladdr = level == 1 ? (vaddr_t)PTE_BASE : 1182 (vaddr_t)normal_pdes[level - 2]; 1183 pmap_tlb_shootpage(pmap, invaladdr + index * PAGE_SIZE, 1184 pmap_is_curpmap(curpcb->pcb_pmap)); 1185 if (level < PTP_LEVELS - 1) { 1186 ptp = pmap_find_ptp(pmap, va, (paddr_t)-1, level + 1); 1187 ptp->wire_count--; 1188 if (ptp->wire_count > 1) 1189 break; 1190 } 1191 } while (++level < PTP_LEVELS); 1192 } 1193 1194 /* 1195 * pmap_get_ptp: get a PTP (if there isn't one, allocate a new one) 1196 * 1197 * => pmap should NOT be pmap_kernel() 1198 */ 1199 1200 struct vm_page * 1201 pmap_get_ptp(struct pmap *pmap, vaddr_t va) 1202 { 1203 struct vm_page *ptp, *pptp; 1204 int i; 1205 unsigned long index; 1206 pd_entry_t *pva, *pva_intel; 1207 paddr_t ppa, pa; 1208 struct uvm_object *obj; 1209 1210 ptp = NULL; 1211 pa = (paddr_t)-1; 1212 1213 /* 1214 * Loop through all page table levels seeing if we need to 1215 * add a new page to that level. 1216 */ 1217 for (i = PTP_LEVELS; i > 1; i--) { 1218 /* 1219 * Save values from previous round. 
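		 * ('pptp'/'ppa' end up naming the page table page one
		 * level above the one handled in this iteration; they
		 * are used further down to bump the parent's wire_count
		 * when a new PTP is hung below it.)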
1220 */ 1221 pptp = ptp; 1222 ppa = pa; 1223 1224 index = pl_i(va, i); 1225 pva = normal_pdes[i - 2]; 1226 1227 if (pmap_valid_entry(pva[index])) { 1228 ppa = pva[index] & pg_frame; 1229 ptp = NULL; 1230 continue; 1231 } 1232 1233 obj = &pmap->pm_obj[i-2]; 1234 ptp = uvm_pagealloc(obj, ptp_va2o(va, i - 1), NULL, 1235 UVM_PGA_USERESERVE|UVM_PGA_ZERO); 1236 1237 if (ptp == NULL) 1238 return NULL; 1239 1240 atomic_clearbits_int(&ptp->pg_flags, PG_BUSY); 1241 ptp->wire_count = 1; 1242 pmap->pm_ptphint[i - 2] = ptp; 1243 pa = VM_PAGE_TO_PHYS(ptp); 1244 pva[index] = (pd_entry_t) (pa | PG_u | PG_RW | PG_V | pg_crypt); 1245 1246 /* 1247 * Meltdown Special case - if we are adding a new PML4e for 1248 * usermode addresses, just copy the PML4e to the U-K page 1249 * table. 1250 */ 1251 if (pmap->pm_pdir_intel != NULL && i == PTP_LEVELS && 1252 va < VM_MAXUSER_ADDRESS) { 1253 pva_intel = pmap->pm_pdir_intel; 1254 pva_intel[index] = pva[index]; 1255 DPRINTF("%s: copying usermode PML4e (content=0x%llx) " 1256 "from 0x%llx -> 0x%llx\n", __func__, pva[index], 1257 (uint64_t)&pva[index], (uint64_t)&pva_intel[index]); 1258 } 1259 1260 pmap->pm_stats.resident_count++; 1261 /* 1262 * If we're not in the top level, increase the 1263 * wire count of the parent page. 1264 */ 1265 if (i < PTP_LEVELS) { 1266 if (pptp == NULL) 1267 pptp = pmap_find_ptp(pmap, va, ppa, i); 1268 #ifdef DIAGNOSTIC 1269 if (pptp == NULL) 1270 panic("%s: pde page disappeared", __func__); 1271 #endif 1272 pptp->wire_count++; 1273 } 1274 } 1275 1276 /* 1277 * ptp is not NULL if we just allocated a new ptp. If it's 1278 * still NULL, we must look up the existing one. 1279 */ 1280 if (ptp == NULL) { 1281 ptp = pmap_find_ptp(pmap, va, ppa, 1); 1282 #ifdef DIAGNOSTIC 1283 if (ptp == NULL) { 1284 printf("va %lx ppa %lx\n", (unsigned long)va, 1285 (unsigned long)ppa); 1286 panic("%s: unmanaged user PTP", __func__); 1287 } 1288 #endif 1289 } 1290 1291 pmap->pm_ptphint[0] = ptp; 1292 return(ptp); 1293 } 1294 1295 /* 1296 * p m a p l i f e c y c l e f u n c t i o n s 1297 */ 1298 1299 /* 1300 * pmap_pdp_ctor: constructor for the PDP cache. 1301 */ 1302 1303 void 1304 pmap_pdp_ctor(pd_entry_t *pdir) 1305 { 1306 paddr_t pdirpa; 1307 int npde, i; 1308 struct pmap *kpm = pmap_kernel(); 1309 1310 /* fetch the physical address of the page directory. 
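	 * ('pdir' comes out of pmap_pdp_pool, i.e. it is an ordinary
	 * kernel virtual address, which is why the kernel pmap is asked
	 * for the translation.)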
*/ 1311 (void) pmap_extract(kpm, (vaddr_t) pdir, &pdirpa); 1312 1313 /* zero init area */ 1314 memset(pdir, 0, PDIR_SLOT_PTE * sizeof(pd_entry_t)); 1315 1316 /* put in recursive PDE to map the PTEs */ 1317 pdir[PDIR_SLOT_PTE] = pdirpa | PG_V | PG_KW | pg_nx | pg_crypt; 1318 1319 npde = nkptp[PTP_LEVELS - 1]; 1320 1321 /* put in kernel VM PDEs */ 1322 memcpy(&pdir[PDIR_SLOT_KERN], &PDP_BASE[PDIR_SLOT_KERN], 1323 npde * sizeof(pd_entry_t)); 1324 1325 /* zero the rest */ 1326 memset(&pdir[PDIR_SLOT_KERN + npde], 0, 1327 (NTOPLEVEL_PDES - (PDIR_SLOT_KERN + npde)) * sizeof(pd_entry_t)); 1328 1329 for (i = 0; i < NUM_L4_SLOT_DIRECT; i++) 1330 pdir[PDIR_SLOT_DIRECT + i] = kpm->pm_pdir[PDIR_SLOT_DIRECT + i]; 1331 1332 #if VM_MIN_KERNEL_ADDRESS != KERNBASE 1333 pdir[pl4_pi(KERNBASE)] = PDP_BASE[pl4_pi(KERNBASE)]; 1334 #endif 1335 } 1336 1337 void 1338 pmap_pdp_ctor_intel(pd_entry_t *pdir) 1339 { 1340 struct pmap *kpm = pmap_kernel(); 1341 1342 /* Copy PML4es from pmap_kernel's U-K view */ 1343 memcpy(pdir, kpm->pm_pdir_intel, PAGE_SIZE); 1344 } 1345 1346 /* 1347 * pmap_create: create a pmap 1348 * 1349 * => note: old pmap interface took a "size" args which allowed for 1350 * the creation of "software only" pmaps (not in bsd). 1351 */ 1352 1353 struct pmap * 1354 pmap_create(void) 1355 { 1356 struct pmap *pmap; 1357 int i; 1358 1359 pmap = pool_get(&pmap_pmap_pool, PR_WAITOK); 1360 1361 mtx_init(&pmap->pm_mtx, IPL_VM); 1362 1363 /* init uvm_object */ 1364 for (i = 0; i < PTP_LEVELS - 1; i++) { 1365 uvm_obj_init(&pmap->pm_obj[i], &pmap_pager, 1); 1366 pmap->pm_ptphint[i] = NULL; 1367 } 1368 pmap->pm_stats.wired_count = 0; 1369 pmap->pm_stats.resident_count = 1; /* count the PDP allocd below */ 1370 pmap->pm_type = PMAP_TYPE_NORMAL; 1371 pmap->eptp = 0; 1372 1373 /* allocate PDP */ 1374 1375 /* 1376 * note that there is no need to splvm to protect us from 1377 * malloc since malloc allocates out of a submap and we should 1378 * have already allocated kernel PTPs to cover the range... 1379 */ 1380 1381 pmap->pm_pdir = pool_get(&pmap_pdp_pool, PR_WAITOK); 1382 pmap_pdp_ctor(pmap->pm_pdir); 1383 1384 pmap->pm_pdirpa = pmap->pm_pdir[PDIR_SLOT_PTE] & pg_frame; 1385 1386 /* 1387 * Intel CPUs need a special page table to be used during usermode 1388 * execution, one that lacks all kernel mappings. 1389 */ 1390 if (cpu_meltdown) { 1391 pmap->pm_pdir_intel = pool_get(&pmap_pdp_pool, PR_WAITOK); 1392 pmap_pdp_ctor_intel(pmap->pm_pdir_intel); 1393 pmap->pm_stats.resident_count++; 1394 if (!pmap_extract(pmap_kernel(), (vaddr_t)pmap->pm_pdir_intel, 1395 &pmap->pm_pdirpa_intel)) 1396 panic("%s: unknown PA mapping for meltdown PML4", 1397 __func__); 1398 } else { 1399 pmap->pm_pdir_intel = NULL; 1400 pmap->pm_pdirpa_intel = 0; 1401 } 1402 1403 mtx_enter(&pmaps_lock); 1404 LIST_INSERT_HEAD(&pmaps, pmap, pm_list); 1405 mtx_leave(&pmaps_lock); 1406 return (pmap); 1407 } 1408 1409 /* 1410 * pmap_destroy: drop reference count on pmap. free pmap if 1411 * reference count goes to zero. 
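 *
 * The counterpart of pmap_reference(); for example, pmap_page_remove()
 * below takes a reference on pve->pv_pmap before dropping the pv list
 * lock and releases it here once it is done with that pmap.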
1412 */ 1413 1414 void 1415 pmap_destroy(struct pmap *pmap) 1416 { 1417 struct vm_page *pg; 1418 int refs; 1419 int i; 1420 1421 /* 1422 * drop reference count 1423 */ 1424 1425 refs = atomic_dec_int_nv(&pmap->pm_obj[0].uo_refs); 1426 if (refs > 0) { 1427 return; 1428 } 1429 1430 /* 1431 * remove it from global list of pmaps 1432 */ 1433 mtx_enter(&pmaps_lock); 1434 LIST_REMOVE(pmap, pm_list); 1435 mtx_leave(&pmaps_lock); 1436 1437 /* 1438 * free any remaining PTPs 1439 */ 1440 1441 for (i = 0; i < PTP_LEVELS - 1; i++) { 1442 while ((pg = RBT_ROOT(uvm_objtree, 1443 &pmap->pm_obj[i].memt)) != NULL) { 1444 KASSERT((pg->pg_flags & PG_BUSY) == 0); 1445 1446 pg->wire_count = 0; 1447 pmap->pm_stats.resident_count--; 1448 1449 uvm_pagefree(pg); 1450 } 1451 } 1452 1453 pool_put(&pmap_pdp_pool, pmap->pm_pdir); 1454 1455 if (pmap->pm_pdir_intel != NULL) { 1456 pmap->pm_stats.resident_count--; 1457 pool_put(&pmap_pdp_pool, pmap->pm_pdir_intel); 1458 } 1459 1460 pool_put(&pmap_pmap_pool, pmap); 1461 } 1462 1463 /* 1464 * Add a reference to the specified pmap. 1465 */ 1466 1467 void 1468 pmap_reference(struct pmap *pmap) 1469 { 1470 atomic_inc_int(&pmap->pm_obj[0].uo_refs); 1471 } 1472 1473 /* 1474 * pmap_activate: activate a process' pmap (fill in %cr3) 1475 * 1476 * => called from cpu_fork() and when switching pmaps during exec 1477 * => if p is the curproc, then load it into the MMU 1478 */ 1479 1480 void 1481 pmap_activate(struct proc *p) 1482 { 1483 struct pcb *pcb = &p->p_addr->u_pcb; 1484 struct pmap *pmap = p->p_vmspace->vm_map.pmap; 1485 1486 pcb->pcb_pmap = pmap; 1487 pcb->pcb_cr3 = pmap->pm_pdirpa; 1488 pcb->pcb_cr3 |= (pmap != pmap_kernel()) ? cr3_pcid_proc : 1489 (PCID_KERN | cr3_reuse_pcid); 1490 1491 if (p != curproc) 1492 return; 1493 1494 if ((p->p_flag & P_SYSTEM) == 0) { 1495 struct cpu_info *self = curcpu(); 1496 1497 /* mark the pmap in use by this processor */ 1498 self->ci_proc_pmap = pmap; 1499 1500 /* in case we return to userspace without context switching */ 1501 if (cpu_meltdown) { 1502 self->ci_kern_cr3 = pcb->pcb_cr3 | cr3_reuse_pcid; 1503 self->ci_user_cr3 = pmap->pm_pdirpa_intel | 1504 cr3_pcid_proc_intel; 1505 } 1506 } 1507 1508 lcr3(pcb->pcb_cr3); 1509 } 1510 1511 /* 1512 * pmap_deactivate: deactivate a process' pmap 1513 */ 1514 1515 void 1516 pmap_deactivate(struct proc *p) 1517 { 1518 if ((p->p_flag & P_SYSTEM) == 0) { 1519 struct cpu_info *self = curcpu(); 1520 1521 /* 1522 * mark the pmap no longer in use by this processor. 1523 */ 1524 KASSERT(self->ci_proc_pmap == p->p_vmspace->vm_map.pmap); 1525 self->ci_proc_pmap = NULL; 1526 } 1527 } 1528 1529 /* 1530 * end of lifecycle functions 1531 */ 1532 1533 /* 1534 * some misc. 
functions 1535 */ 1536 1537 int 1538 pmap_pdes_valid(vaddr_t va, pd_entry_t *lastpde) 1539 { 1540 int i; 1541 unsigned long index; 1542 pd_entry_t pde; 1543 1544 for (i = PTP_LEVELS; i > 1; i--) { 1545 index = pl_i(va, i); 1546 pde = normal_pdes[i - 2][index]; 1547 if (!pmap_valid_entry(pde)) 1548 return 0; 1549 } 1550 if (lastpde != NULL) 1551 *lastpde = pde; 1552 return 1; 1553 } 1554 1555 /* 1556 * pmap_extract: extract a PA for the given VA 1557 */ 1558 1559 int 1560 pmap_extract(struct pmap *pmap, vaddr_t va, paddr_t *pap) 1561 { 1562 pt_entry_t *ptes, pte; 1563 int level, offs; 1564 1565 if (pmap == pmap_kernel() && va >= PMAP_DIRECT_BASE && 1566 va < PMAP_DIRECT_END) { 1567 *pap = va - PMAP_DIRECT_BASE; 1568 return 1; 1569 } 1570 1571 if (pmap != pmap_kernel()) 1572 mtx_enter(&pmap->pm_mtx); 1573 1574 level = pmap_find_pte_direct(pmap, va, &ptes, &offs); 1575 pte = ptes[offs]; 1576 1577 if (pmap != pmap_kernel()) 1578 mtx_leave(&pmap->pm_mtx); 1579 1580 if (__predict_true(level == 0 && pmap_valid_entry(pte))) { 1581 if (pap != NULL) 1582 *pap = (pte & pg_frame) | (va & PAGE_MASK); 1583 return 1; 1584 } 1585 if (level == 1 && (pte & (PG_PS|PG_V)) == (PG_PS|PG_V)) { 1586 if (pap != NULL) 1587 *pap = (pte & pg_lgframe) | (va & PAGE_MASK_L2); 1588 return 1; 1589 } 1590 1591 return 0; 1592 } 1593 1594 /* 1595 * pmap_zero_page: zero a page 1596 */ 1597 1598 void 1599 pmap_zero_page(struct vm_page *pg) 1600 { 1601 pagezero(pmap_map_direct(pg)); 1602 } 1603 1604 /* 1605 * pmap_flush_cache: flush the cache for a virtual address. 1606 */ 1607 void 1608 pmap_flush_cache(vaddr_t addr, vsize_t len) 1609 { 1610 vaddr_t i; 1611 1612 if (curcpu()->ci_cflushsz == 0) { 1613 wbinvd_on_all_cpus(); 1614 return; 1615 } 1616 1617 /* all cpus that have clflush also have mfence. */ 1618 mfence(); 1619 for (i = addr; i < addr + len; i += curcpu()->ci_cflushsz) 1620 clflush(i); 1621 mfence(); 1622 } 1623 1624 /* 1625 * pmap_copy_page: copy a page 1626 */ 1627 1628 void 1629 pmap_copy_page(struct vm_page *srcpg, struct vm_page *dstpg) 1630 { 1631 vaddr_t srcva = pmap_map_direct(srcpg); 1632 vaddr_t dstva = pmap_map_direct(dstpg); 1633 1634 memcpy((void *)dstva, (void *)srcva, PAGE_SIZE); 1635 } 1636 1637 /* 1638 * p m a p r e m o v e f u n c t i o n s 1639 * 1640 * functions that remove mappings 1641 */ 1642 1643 /* 1644 * pmap_remove_ptes: remove PTEs from a PTP 1645 * 1646 * => PTP must be mapped into KVA 1647 * => PTP should be null if pmap == pmap_kernel() 1648 */ 1649 1650 void 1651 pmap_remove_ptes(struct pmap *pmap, struct vm_page *ptp, vaddr_t ptpva, 1652 vaddr_t startva, vaddr_t endva, int flags, struct pv_entry **free_pvs) 1653 { 1654 struct pv_entry *pve; 1655 pt_entry_t *pte = (pt_entry_t *) ptpva; 1656 struct vm_page *pg; 1657 pt_entry_t opte; 1658 1659 /* 1660 * note that ptpva points to the PTE that maps startva. this may 1661 * or may not be the first PTE in the PTP. 1662 * 1663 * we loop through the PTP while there are still PTEs to look at 1664 * and the wire_count is greater than 1 (because we use the wire_count 1665 * to keep track of the number of real PTEs in the PTP). 1666 */ 1667 1668 for (/*null*/; startva < endva && (ptp == NULL || ptp->wire_count > 1) 1669 ; pte++, startva += PAGE_SIZE) { 1670 if (!pmap_valid_entry(*pte)) 1671 continue; /* VA not mapped */ 1672 if ((flags & PMAP_REMOVE_SKIPWIRED) && (*pte & PG_W)) { 1673 continue; 1674 } 1675 1676 /* atomically save the old PTE and zap! 
it */ 1677 opte = pmap_pte_set(pte, 0); 1678 1679 if (opte & PG_W) 1680 pmap->pm_stats.wired_count--; 1681 pmap->pm_stats.resident_count--; 1682 1683 if (ptp != NULL) 1684 ptp->wire_count--; /* dropping a PTE */ 1685 1686 pg = PHYS_TO_VM_PAGE(opte & pg_frame); 1687 1688 /* 1689 * if we are not on a pv list we are done. 1690 */ 1691 1692 if ((opte & PG_PVLIST) == 0) { 1693 #ifdef DIAGNOSTIC 1694 if (pg != NULL) 1695 panic("%s: managed page without PG_PVLIST: " 1696 "va 0x%lx, opte 0x%llx", __func__, 1697 startva, opte); 1698 #endif 1699 continue; 1700 } 1701 1702 #ifdef DIAGNOSTIC 1703 if (pg == NULL) 1704 panic("%s: unmanaged page marked PG_PVLIST: " 1705 "va 0x%lx, opte 0x%llx", __func__, 1706 startva, opte); 1707 #endif 1708 1709 /* sync R/M bits */ 1710 pmap_sync_flags_pte(pg, opte); 1711 pve = pmap_remove_pv(pg, pmap, startva); 1712 if (pve != NULL) { 1713 pve->pv_next = *free_pvs; 1714 *free_pvs = pve; 1715 } 1716 1717 /* end of "for" loop: time for next pte */ 1718 } 1719 } 1720 1721 /* 1722 * pmap_remove_pte: remove a single PTE from a PTP 1723 * 1724 * => PTP must be mapped into KVA 1725 * => PTP should be null if pmap == pmap_kernel() 1726 * => returns true if we removed a mapping 1727 */ 1728 1729 int 1730 pmap_remove_pte(struct pmap *pmap, struct vm_page *ptp, pt_entry_t *pte, 1731 vaddr_t va, int flags, struct pv_entry **free_pvs) 1732 { 1733 struct pv_entry *pve; 1734 struct vm_page *pg; 1735 pt_entry_t opte; 1736 1737 if (!pmap_valid_entry(*pte)) 1738 return 0; /* VA not mapped */ 1739 if ((flags & PMAP_REMOVE_SKIPWIRED) && (*pte & PG_W)) { 1740 return 0; 1741 } 1742 1743 /* atomically save the old PTE and zap! it */ 1744 opte = pmap_pte_set(pte, 0); 1745 1746 if (opte & PG_W) 1747 pmap->pm_stats.wired_count--; 1748 pmap->pm_stats.resident_count--; 1749 1750 if (ptp != NULL) 1751 ptp->wire_count--; /* dropping a PTE */ 1752 1753 pg = PHYS_TO_VM_PAGE(opte & pg_frame); 1754 1755 /* 1756 * if we are not on a pv list we are done. 
1757 */ 1758 if ((opte & PG_PVLIST) == 0) { 1759 #ifdef DIAGNOSTIC 1760 if (pg != NULL) 1761 panic("%s: managed page without PG_PVLIST: " 1762 "va 0x%lx, opte 0x%llx", __func__, va, opte); 1763 #endif 1764 return 1; 1765 } 1766 1767 #ifdef DIAGNOSTIC 1768 if (pg == NULL) 1769 panic("%s: unmanaged page marked PG_PVLIST: " 1770 "va 0x%lx, opte 0x%llx", __func__, va, opte); 1771 #endif 1772 1773 /* sync R/M bits */ 1774 pmap_sync_flags_pte(pg, opte); 1775 pve = pmap_remove_pv(pg, pmap, va); 1776 if (pve != NULL) { 1777 pve->pv_next = *free_pvs; 1778 *free_pvs = pve; 1779 } 1780 1781 return 1; 1782 } 1783 1784 /* 1785 * pmap_remove: top level mapping removal function 1786 * 1787 * => caller should not be holding any pmap locks 1788 */ 1789 1790 void 1791 pmap_remove(struct pmap *pmap, vaddr_t sva, vaddr_t eva) 1792 { 1793 #if NVMM > 0 1794 if (pmap_is_ept(pmap)) 1795 pmap_remove_ept(pmap, sva, eva); 1796 else 1797 #endif /* NVMM > 0 */ 1798 pmap_do_remove(pmap, sva, eva, PMAP_REMOVE_ALL); 1799 } 1800 1801 /* 1802 * pmap_do_remove: mapping removal guts 1803 * 1804 * => caller should not be holding any pmap locks 1805 */ 1806 1807 void 1808 pmap_do_remove(struct pmap *pmap, vaddr_t sva, vaddr_t eva, int flags) 1809 { 1810 pd_entry_t pde; 1811 int result; 1812 paddr_t ptppa; 1813 vaddr_t blkendva; 1814 struct vm_page *ptp; 1815 struct pv_entry *pve; 1816 struct pv_entry *free_pvs = NULL; 1817 vaddr_t va; 1818 int shootall = 0, shootself; 1819 struct pg_to_free empty_ptps; 1820 paddr_t scr3; 1821 1822 TAILQ_INIT(&empty_ptps); 1823 1824 scr3 = pmap_map_ptes(pmap); 1825 shootself = (scr3 == 0); 1826 1827 /* 1828 * removing one page? take shortcut function. 1829 */ 1830 1831 if (sva + PAGE_SIZE == eva) { 1832 if (pmap_pdes_valid(sva, &pde)) { 1833 1834 /* PA of the PTP */ 1835 ptppa = pde & pg_frame; 1836 1837 /* get PTP if non-kernel mapping */ 1838 1839 if (pmap == pmap_kernel()) { 1840 /* we never free kernel PTPs */ 1841 ptp = NULL; 1842 } else { 1843 ptp = pmap_find_ptp(pmap, sva, ptppa, 1); 1844 #ifdef DIAGNOSTIC 1845 if (ptp == NULL) 1846 panic("%s: unmanaged PTP detected " 1847 "in shortcut path", __func__); 1848 #endif 1849 } 1850 1851 /* do it! */ 1852 result = pmap_remove_pte(pmap, ptp, 1853 &PTE_BASE[pl1_i(sva)], sva, flags, &free_pvs); 1854 1855 /* 1856 * if mapping removed and the PTP is no longer 1857 * being used, free it! 1858 */ 1859 1860 if (result && ptp && ptp->wire_count <= 1) 1861 pmap_free_ptp(pmap, ptp, sva, &empty_ptps); 1862 pmap_tlb_shootpage(pmap, sva, shootself); 1863 pmap_unmap_ptes(pmap, scr3); 1864 pmap_tlb_shootwait(); 1865 } else { 1866 pmap_unmap_ptes(pmap, scr3); 1867 } 1868 1869 goto cleanup; 1870 } 1871 1872 if ((eva - sva > 32 * PAGE_SIZE) && sva < VM_MIN_KERNEL_ADDRESS) 1873 shootall = 1; 1874 1875 for (va = sva; va < eva; va = blkendva) { 1876 /* determine range of block */ 1877 blkendva = x86_round_pdr(va + 1); 1878 if (blkendva > eva) 1879 blkendva = eva; 1880 1881 /* 1882 * XXXCDC: our PTE mappings should never be removed 1883 * with pmap_remove! if we allow this (and why would 1884 * we?) then we end up freeing the pmap's page 1885 * directory page (PDP) before we are finished using 1886 * it when we hit it in the recursive mapping. this 1887 * is BAD. 1888 * 1889 * long term solution is to move the PTEs out of user 1890 * address space. and into kernel address space (up 1891 * with APTE). then we can set VM_MAXUSER_ADDRESS to 1892 * be VM_MAX_ADDRESS. 
1893 */ 1894 1895 if (pl_i(va, PTP_LEVELS) == PDIR_SLOT_PTE) 1896 /* XXXCDC: ugly hack to avoid freeing PDP here */ 1897 continue; 1898 1899 if (!pmap_pdes_valid(va, &pde)) 1900 continue; 1901 1902 /* PA of the PTP */ 1903 ptppa = pde & pg_frame; 1904 1905 /* get PTP if non-kernel mapping */ 1906 if (pmap == pmap_kernel()) { 1907 /* we never free kernel PTPs */ 1908 ptp = NULL; 1909 } else { 1910 ptp = pmap_find_ptp(pmap, va, ptppa, 1); 1911 #ifdef DIAGNOSTIC 1912 if (ptp == NULL) 1913 panic("%s: unmanaged PTP detected", __func__); 1914 #endif 1915 } 1916 pmap_remove_ptes(pmap, ptp, (vaddr_t)&PTE_BASE[pl1_i(va)], 1917 va, blkendva, flags, &free_pvs); 1918 1919 /* if PTP is no longer being used, free it! */ 1920 if (ptp && ptp->wire_count <= 1) { 1921 pmap_free_ptp(pmap, ptp, va, &empty_ptps); 1922 } 1923 } 1924 1925 if (shootall) 1926 pmap_tlb_shoottlb(pmap, shootself); 1927 else 1928 pmap_tlb_shootrange(pmap, sva, eva, shootself); 1929 1930 pmap_unmap_ptes(pmap, scr3); 1931 pmap_tlb_shootwait(); 1932 1933 cleanup: 1934 while ((pve = free_pvs) != NULL) { 1935 free_pvs = pve->pv_next; 1936 pool_put(&pmap_pv_pool, pve); 1937 } 1938 1939 while ((ptp = TAILQ_FIRST(&empty_ptps)) != NULL) { 1940 TAILQ_REMOVE(&empty_ptps, ptp, pageq); 1941 uvm_pagefree(ptp); 1942 } 1943 } 1944 1945 /* 1946 * pmap_page_remove: remove a managed vm_page from all pmaps that map it 1947 * 1948 * => R/M bits are sync'd back to attrs 1949 */ 1950 1951 void 1952 pmap_page_remove(struct vm_page *pg) 1953 { 1954 struct pv_entry *pve; 1955 struct pmap *pm; 1956 pt_entry_t opte; 1957 #ifdef DIAGNOSTIC 1958 pd_entry_t pde; 1959 #endif 1960 struct pg_to_free empty_ptps; 1961 struct vm_page *ptp; 1962 paddr_t scr3; 1963 int shootself; 1964 1965 TAILQ_INIT(&empty_ptps); 1966 1967 mtx_enter(&pg->mdpage.pv_mtx); 1968 while ((pve = pg->mdpage.pv_list) != NULL) { 1969 pmap_reference(pve->pv_pmap); 1970 pm = pve->pv_pmap; 1971 mtx_leave(&pg->mdpage.pv_mtx); 1972 1973 /* XXX use direct map? */ 1974 scr3 = pmap_map_ptes(pm); /* locks pmap */ 1975 shootself = (scr3 == 0); 1976 1977 /* 1978 * We dropped the pvlist lock before grabbing the pmap 1979 * lock to avoid lock ordering problems. This means 1980 * we have to check the pvlist again since somebody 1981 * else might have modified it. All we care about is 1982 * that the pvlist entry matches the pmap we just 1983 * locked. If it doesn't, unlock the pmap and try 1984 * again. 
1985 */ 1986 mtx_enter(&pg->mdpage.pv_mtx); 1987 if ((pve = pg->mdpage.pv_list) == NULL || 1988 pve->pv_pmap != pm) { 1989 mtx_leave(&pg->mdpage.pv_mtx); 1990 pmap_unmap_ptes(pm, scr3); /* unlocks pmap */ 1991 pmap_destroy(pm); 1992 mtx_enter(&pg->mdpage.pv_mtx); 1993 continue; 1994 } 1995 1996 pg->mdpage.pv_list = pve->pv_next; 1997 mtx_leave(&pg->mdpage.pv_mtx); 1998 1999 #ifdef DIAGNOSTIC 2000 if (pve->pv_ptp != NULL && pmap_pdes_valid(pve->pv_va, &pde) && 2001 (pde & pg_frame) != VM_PAGE_TO_PHYS(pve->pv_ptp)) { 2002 printf("%s: pg=%p: va=%lx, pv_ptp=%p\n", __func__, 2003 pg, pve->pv_va, pve->pv_ptp); 2004 printf("%s: PTP's phys addr: " 2005 "actual=%lx, recorded=%lx\n", __func__, 2006 (unsigned long)(pde & pg_frame), 2007 VM_PAGE_TO_PHYS(pve->pv_ptp)); 2008 panic("%s: mapped managed page has " 2009 "invalid pv_ptp field", __func__); 2010 } 2011 #endif 2012 2013 /* atomically save the old PTE and zap it */ 2014 opte = pmap_pte_set(&PTE_BASE[pl1_i(pve->pv_va)], 0); 2015 2016 if (opte & PG_W) 2017 pve->pv_pmap->pm_stats.wired_count--; 2018 pve->pv_pmap->pm_stats.resident_count--; 2019 2020 pmap_tlb_shootpage(pve->pv_pmap, pve->pv_va, shootself); 2021 2022 pmap_sync_flags_pte(pg, opte); 2023 2024 /* update the PTP reference count. free if last reference. */ 2025 if (pve->pv_ptp != NULL) { 2026 pve->pv_ptp->wire_count--; 2027 if (pve->pv_ptp->wire_count <= 1) { 2028 pmap_free_ptp(pve->pv_pmap, pve->pv_ptp, 2029 pve->pv_va, &empty_ptps); 2030 } 2031 } 2032 pmap_unmap_ptes(pve->pv_pmap, scr3); /* unlocks pmap */ 2033 pmap_destroy(pve->pv_pmap); 2034 pool_put(&pmap_pv_pool, pve); 2035 mtx_enter(&pg->mdpage.pv_mtx); 2036 } 2037 mtx_leave(&pg->mdpage.pv_mtx); 2038 2039 pmap_tlb_shootwait(); 2040 2041 while ((ptp = TAILQ_FIRST(&empty_ptps)) != NULL) { 2042 TAILQ_REMOVE(&empty_ptps, ptp, pageq); 2043 uvm_pagefree(ptp); 2044 } 2045 } 2046 2047 /* 2048 * p m a p a t t r i b u t e f u n c t i o n s 2049 * functions that test/change managed page's attributes 2050 * since a page can be mapped multiple times we must check each PTE that 2051 * maps it by going down the pv lists. 
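 *
 * for example (sketch; the MI wrappers such as pmap_is_modified() and
 * pmap_clear_modify() in pmap.h are thin shims around these):
 *	pmap_test_attrs(pg, PG_M) reports whether any mapping has
 *	dirtied the page, and pmap_clear_attrs(pg, PG_M) clears the
 *	modified bit in every PTE that maps it.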
2052 */ 2053 2054 /* 2055 * pmap_test_attrs: test a page's attributes 2056 */ 2057 2058 int 2059 pmap_test_attrs(struct vm_page *pg, unsigned int testbits) 2060 { 2061 struct pv_entry *pve; 2062 pt_entry_t *ptes; 2063 int level, offs; 2064 u_long mybits, testflags; 2065 2066 testflags = pmap_pte2flags(testbits); 2067 2068 if (pg->pg_flags & testflags) 2069 return 1; 2070 2071 mybits = 0; 2072 mtx_enter(&pg->mdpage.pv_mtx); 2073 for (pve = pg->mdpage.pv_list; pve != NULL && mybits == 0; 2074 pve = pve->pv_next) { 2075 level = pmap_find_pte_direct(pve->pv_pmap, pve->pv_va, &ptes, 2076 &offs); 2077 mybits |= (ptes[offs] & testbits); 2078 } 2079 mtx_leave(&pg->mdpage.pv_mtx); 2080 2081 if (mybits == 0) 2082 return 0; 2083 2084 atomic_setbits_int(&pg->pg_flags, pmap_pte2flags(mybits)); 2085 2086 return 1; 2087 } 2088 2089 /* 2090 * pmap_clear_attrs: change a page's attributes 2091 * 2092 * => we return 1 if we cleared one of the bits we were asked to 2093 */ 2094 2095 int 2096 pmap_clear_attrs(struct vm_page *pg, unsigned long clearbits) 2097 { 2098 struct pv_entry *pve; 2099 pt_entry_t *ptes, opte; 2100 u_long clearflags; 2101 int result, level, offs; 2102 2103 clearflags = pmap_pte2flags(clearbits); 2104 2105 result = pg->pg_flags & clearflags; 2106 if (result) 2107 atomic_clearbits_int(&pg->pg_flags, clearflags); 2108 2109 mtx_enter(&pg->mdpage.pv_mtx); 2110 for (pve = pg->mdpage.pv_list; pve != NULL; pve = pve->pv_next) { 2111 level = pmap_find_pte_direct(pve->pv_pmap, pve->pv_va, &ptes, 2112 &offs); 2113 opte = ptes[offs]; 2114 if (opte & clearbits) { 2115 result = 1; 2116 pmap_pte_clearbits(&ptes[offs], (opte & clearbits)); 2117 pmap_tlb_shootpage(pve->pv_pmap, pve->pv_va, 2118 pmap_is_curpmap(pve->pv_pmap)); 2119 } 2120 } 2121 mtx_leave(&pg->mdpage.pv_mtx); 2122 2123 pmap_tlb_shootwait(); 2124 2125 return (result != 0); 2126 } 2127 2128 /* 2129 * p m a p p r o t e c t i o n f u n c t i o n s 2130 */ 2131 2132 /* 2133 * pmap_page_protect: change the protection of all recorded mappings 2134 * of a managed page 2135 * 2136 * => NOTE: this is an inline function in pmap.h 2137 */ 2138 2139 /* see pmap.h */ 2140 2141 /* 2142 * pmap_protect: set the protection in of the pages in a pmap 2143 * 2144 * => NOTE: this is an inline function in pmap.h 2145 */ 2146 2147 /* see pmap.h */ 2148 2149 /* 2150 * pmap_write_protect: write-protect pages in a pmap 2151 */ 2152 2153 void 2154 pmap_write_protect(struct pmap *pmap, vaddr_t sva, vaddr_t eva, vm_prot_t prot) 2155 { 2156 pt_entry_t *spte, *epte; 2157 pt_entry_t clear = 0, set = 0; 2158 vaddr_t blockend; 2159 int shootall = 0, shootself; 2160 vaddr_t va; 2161 paddr_t scr3; 2162 2163 scr3 = pmap_map_ptes(pmap); 2164 shootself = (scr3 == 0); 2165 2166 /* should be ok, but just in case ... */ 2167 sva &= PG_FRAME; 2168 eva &= PG_FRAME; 2169 2170 if (!(prot & PROT_READ)) 2171 set |= pg_xo; 2172 if (!(prot & PROT_WRITE)) 2173 clear = PG_RW; 2174 if (!(prot & PROT_EXEC)) 2175 set |= pg_nx; 2176 2177 if ((eva - sva > 32 * PAGE_SIZE) && sva < VM_MIN_KERNEL_ADDRESS) 2178 shootall = 1; 2179 2180 for (va = sva; va < eva ; va = blockend) { 2181 blockend = (va & L2_FRAME) + NBPD_L2; 2182 if (blockend > eva) 2183 blockend = eva; 2184 2185 /* 2186 * XXXCDC: our PTE mappings should never be write-protected! 2187 * 2188 * long term solution is to move the PTEs out of user 2189 * address space. and into kernel address space (up 2190 * with APTE). then we can set VM_MAXUSER_ADDRESS to 2191 * be VM_MAX_ADDRESS. 
2192 */ 2193 2194 /* XXXCDC: ugly hack to avoid freeing PDP here */ 2195 if (pl_i(va, PTP_LEVELS) == PDIR_SLOT_PTE) 2196 continue; 2197 2198 /* empty block? */ 2199 if (!pmap_pdes_valid(va, NULL)) 2200 continue; 2201 2202 #ifdef DIAGNOSTIC 2203 if (va >= VM_MAXUSER_ADDRESS && va < VM_MAX_ADDRESS) 2204 panic("%s: PTE space", __func__); 2205 #endif 2206 2207 spte = &PTE_BASE[pl1_i(va)]; 2208 epte = &PTE_BASE[pl1_i(blockend)]; 2209 2210 for (/*null */; spte < epte ; spte++) { 2211 if (!pmap_valid_entry(*spte)) 2212 continue; 2213 pmap_pte_clearbits(spte, clear); 2214 pmap_pte_setbits(spte, set); 2215 } 2216 } 2217 2218 if (shootall) 2219 pmap_tlb_shoottlb(pmap, shootself); 2220 else 2221 pmap_tlb_shootrange(pmap, sva, eva, shootself); 2222 2223 pmap_unmap_ptes(pmap, scr3); 2224 pmap_tlb_shootwait(); 2225 } 2226 2227 /* 2228 * end of protection functions 2229 */ 2230 2231 /* 2232 * pmap_unwire: clear the wired bit in the PTE 2233 * 2234 * => mapping should already be in map 2235 */ 2236 2237 void 2238 pmap_unwire(struct pmap *pmap, vaddr_t va) 2239 { 2240 pt_entry_t *ptes; 2241 int level, offs; 2242 2243 level = pmap_find_pte_direct(pmap, va, &ptes, &offs); 2244 2245 if (level == 0) { 2246 2247 #ifdef DIAGNOSTIC 2248 if (!pmap_valid_entry(ptes[offs])) 2249 panic("%s: invalid (unmapped) va 0x%lx", __func__, va); 2250 #endif 2251 if (__predict_true((ptes[offs] & PG_W) != 0)) { 2252 pmap_pte_clearbits(&ptes[offs], PG_W); 2253 pmap->pm_stats.wired_count--; 2254 } 2255 #ifdef DIAGNOSTIC 2256 else { 2257 printf("%s: wiring for pmap %p va 0x%lx " 2258 "didn't change!\n", __func__, pmap, va); 2259 } 2260 #endif 2261 } 2262 #ifdef DIAGNOSTIC 2263 else { 2264 panic("%s: invalid PDE", __func__); 2265 } 2266 #endif 2267 } 2268 2269 void 2270 pmap_enter_special(vaddr_t va, paddr_t pa, vm_prot_t prot) 2271 { 2272 uint64_t l4idx, l3idx, l2idx, l1idx; 2273 pd_entry_t *pd, *ptp; 2274 paddr_t npa; 2275 struct pmap *pmap = pmap_kernel(); 2276 pt_entry_t *ptes; 2277 int level, offs; 2278 2279 /* If CPU is secure, no need to do anything */ 2280 if (!cpu_meltdown) 2281 return; 2282 2283 /* Must be kernel VA */ 2284 if (va < VM_MIN_KERNEL_ADDRESS) 2285 panic("%s: invalid special mapping va 0x%lx requested", 2286 __func__, va); 2287 2288 if (pmap->pm_pdir_intel == NULL) 2289 pmap->pm_pdir_intel = pool_get(&pmap_pdp_pool, 2290 PR_WAITOK | PR_ZERO); 2291 2292 l4idx = (va & L4_MASK) >> L4_SHIFT; /* PML4E idx */ 2293 l3idx = (va & L3_MASK) >> L3_SHIFT; /* PDPTE idx */ 2294 l2idx = (va & L2_MASK) >> L2_SHIFT; /* PDE idx */ 2295 l1idx = (va & L1_MASK) >> L1_SHIFT; /* PTE idx */ 2296 2297 DPRINTF("%s: va=0x%llx pa=0x%llx l4idx=%lld l3idx=%lld " 2298 "l2idx=%lld l1idx=%lld\n", __func__, (uint64_t)va, 2299 (uint64_t)pa, l4idx, l3idx, l2idx, l1idx); 2300 2301 /* Start at PML4 / top level */ 2302 pd = pmap->pm_pdir_intel; 2303 2304 if (pd == NULL) 2305 panic("%s: PML4 not initialized for pmap @ %p", __func__, 2306 pmap); 2307 2308 /* npa = physaddr of PDPT */ 2309 npa = pd[l4idx] & PMAP_PA_MASK; 2310 2311 /* Valid PML4e for the 512GB region containing va? 
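 * If not, the allocate-if-missing pattern below takes a zeroed page
 * from pmap_pdp_pool, looks up its physical address with
 * pmap_extract() and installs it here; the same pattern repeats for
 * the PD and PT levels further down.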
*/ 2312 if (!npa) { 2313 /* No valid PML4E - allocate PDPT page and set PML4E */ 2314 2315 ptp = pool_get(&pmap_pdp_pool, PR_WAITOK | PR_ZERO); 2316 2317 if (!pmap_extract(pmap, (vaddr_t)ptp, &npa)) 2318 panic("%s: can't locate PDPT page", __func__); 2319 2320 pd[l4idx] = (npa | PG_RW | PG_V | pg_crypt); 2321 2322 DPRINTF("%s: allocated new PDPT page at phys 0x%llx, " 2323 "setting PML4e[%lld] = 0x%llx\n", __func__, 2324 (uint64_t)npa, l4idx, pd[l4idx]); 2325 } 2326 2327 pd = (pd_entry_t *)PMAP_DIRECT_MAP(npa); 2328 if (pd == NULL) 2329 panic("%s: can't locate PDPT @ pa=0x%llx", __func__, 2330 (uint64_t)npa); 2331 2332 /* npa = physaddr of PD page */ 2333 npa = pd[l3idx] & PMAP_PA_MASK; 2334 2335 /* Valid PDPTe for the 1GB region containing va? */ 2336 if (!npa) { 2337 /* No valid PDPTe - allocate PD page and set PDPTe */ 2338 2339 ptp = pool_get(&pmap_pdp_pool, PR_WAITOK | PR_ZERO); 2340 2341 if (!pmap_extract(pmap, (vaddr_t)ptp, &npa)) 2342 panic("%s: can't locate PD page", __func__); 2343 2344 pd[l3idx] = (npa | PG_RW | PG_V | pg_crypt); 2345 2346 DPRINTF("%s: allocated new PD page at phys 0x%llx, " 2347 "setting PDPTe[%lld] = 0x%llx\n", __func__, 2348 (uint64_t)npa, l3idx, pd[l3idx]); 2349 } 2350 2351 pd = (pd_entry_t *)PMAP_DIRECT_MAP(npa); 2352 if (pd == NULL) 2353 panic("%s: can't locate PD page @ pa=0x%llx", __func__, 2354 (uint64_t)npa); 2355 2356 /* npa = physaddr of PT page */ 2357 npa = pd[l2idx] & PMAP_PA_MASK; 2358 2359 /* Valid PDE for the 2MB region containing va? */ 2360 if (!npa) { 2361 /* No valid PDE - allocate PT page and set PDE */ 2362 2363 ptp = pool_get(&pmap_pdp_pool, PR_WAITOK | PR_ZERO); 2364 2365 if (!pmap_extract(pmap, (vaddr_t)ptp, &npa)) 2366 panic("%s: can't locate PT page", __func__); 2367 2368 pd[l2idx] = (npa | PG_RW | PG_V | pg_crypt); 2369 2370 DPRINTF("%s: allocated new PT page at phys 0x%llx, " 2371 "setting PDE[%lld] = 0x%llx\n", __func__, 2372 (uint64_t)npa, l2idx, pd[l2idx]); 2373 } 2374 2375 pd = (pd_entry_t *)PMAP_DIRECT_MAP(npa); 2376 if (pd == NULL) 2377 panic("%s: can't locate PT page @ pa=0x%llx", __func__, 2378 (uint64_t)npa); 2379 2380 DPRINTF("%s: setting PTE, PT page @ phys 0x%llx virt 0x%llx prot " 2381 "0x%llx was 0x%llx\n", __func__, (uint64_t)npa, (uint64_t)pd, 2382 (uint64_t)prot, (uint64_t)pd[l1idx]); 2383 2384 pd[l1idx] = pa | protection_codes[prot] | PG_V | PG_W | pg_crypt; 2385 2386 /* 2387 * Look up the corresponding U+K entry. If we're installing the 2388 * same PA into the U-K map then set the PG_G bit on both and copy 2389 * the cache-control bits from the U+K entry to the U-K entry. 2390 */ 2391 level = pmap_find_pte_direct(pmap, va, &ptes, &offs); 2392 if (__predict_true(level == 0 && pmap_valid_entry(ptes[offs]))) { 2393 if (((pd[l1idx] ^ ptes[offs]) & pg_frame) == 0) { 2394 pd[l1idx] |= PG_G | (ptes[offs] & (PG_N | PG_WT)); 2395 ptes[offs] |= PG_G; 2396 } else { 2397 DPRINTF("%s: special diffing mapping at %llx\n", 2398 __func__, (long long)va); 2399 } 2400 } else 2401 DPRINTF("%s: no U+K mapping for special mapping?\n", __func__); 2402 2403 DPRINTF("%s: setting PTE[%lld] = 0x%llx\n", __func__, l1idx, pd[l1idx]); 2404 } 2405 2406 #if NVMM > 0 2407 /* 2408 * pmap_convert 2409 * 2410 * Converts 'pmap' to the new 'mode'. 
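 * When converting to an EPT pmap this clears the existing PML4 and
 * releases any Meltdown (U-K) page directory, since the guest-physical
 * tables are built separately by pmap_enter_ept().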
2411 * 2412 * Parameters: 2413 * pmap: the pmap to convert 2414 * mode: the new mode (see pmap.h, PMAP_TYPE_xxx) 2415 */ 2416 void 2417 pmap_convert(struct pmap *pmap, int mode) 2418 { 2419 pt_entry_t *pte; 2420 2421 mtx_enter(&pmap->pm_mtx); 2422 pmap->pm_type = mode; 2423 2424 if (pmap_is_ept(pmap)) { 2425 /* Clear PML4 */ 2426 pte = (pt_entry_t *)pmap->pm_pdir; 2427 memset(pte, 0, PAGE_SIZE); 2428 2429 /* Give back the meltdown pdir */ 2430 if (pmap->pm_pdir_intel != NULL) { 2431 pool_put(&pmap_pdp_pool, pmap->pm_pdir_intel); 2432 pmap->pm_pdir_intel = NULL; 2433 } 2434 } 2435 mtx_leave(&pmap->pm_mtx); 2436 } 2437 2438 void 2439 pmap_remove_ept(struct pmap *pmap, vaddr_t sgpa, vaddr_t egpa) 2440 { 2441 vaddr_t v; 2442 2443 mtx_enter(&pmap->pm_mtx); 2444 2445 DPRINTF("%s: sgpa=0x%llx egpa=0x%llx\n", __func__, (uint64_t)sgpa, 2446 (uint64_t)egpa); 2447 for (v = sgpa; v < egpa + PAGE_SIZE; v += PAGE_SIZE) 2448 pmap_do_remove_ept(pmap, v); 2449 2450 pmap_shootept(pmap, 1); 2451 2452 mtx_leave(&pmap->pm_mtx); 2453 2454 pmap_tlb_shootwait(); 2455 } 2456 2457 void 2458 pmap_do_remove_ept(struct pmap *pmap, paddr_t gpa) 2459 { 2460 uint64_t l4idx, l3idx, l2idx, l1idx; 2461 struct vm_page *pg3, *pg2, *pg1; 2462 paddr_t npa3, npa2, npa1; 2463 pd_entry_t *pd4, *pd3, *pd2, *pd1; 2464 pd_entry_t *pptes; 2465 2466 MUTEX_ASSERT_LOCKED(&pmap->pm_mtx); 2467 2468 l4idx = (gpa & L4_MASK) >> L4_SHIFT; /* PML4E idx */ 2469 l3idx = (gpa & L3_MASK) >> L3_SHIFT; /* PDPTE idx */ 2470 l2idx = (gpa & L2_MASK) >> L2_SHIFT; /* PDE idx */ 2471 l1idx = (gpa & L1_MASK) >> L1_SHIFT; /* PTE idx */ 2472 2473 /* Start at PML4 / top level */ 2474 pd4 = (pd_entry_t *)pmap->pm_pdir; 2475 2476 if (pd4 == NULL) 2477 return; 2478 2479 /* npa3 = physaddr of PDPT */ 2480 npa3 = pd4[l4idx] & PMAP_PA_MASK; 2481 if (!npa3) 2482 return; 2483 pd3 = (pd_entry_t *)PMAP_DIRECT_MAP(npa3); 2484 pg3 = PHYS_TO_VM_PAGE(npa3); 2485 2486 /* npa2 = physaddr of PD page */ 2487 npa2 = pd3[l3idx] & PMAP_PA_MASK; 2488 if (!npa2) 2489 return; 2490 pd2 = (pd_entry_t *)PMAP_DIRECT_MAP(npa2); 2491 pg2 = PHYS_TO_VM_PAGE(npa2); 2492 2493 /* npa1 = physaddr of PT page */ 2494 npa1 = pd2[l2idx] & PMAP_PA_MASK; 2495 if (!npa1) 2496 return; 2497 pd1 = (pd_entry_t *)PMAP_DIRECT_MAP(npa1); 2498 pg1 = PHYS_TO_VM_PAGE(npa1); 2499 2500 if (pd1[l1idx] == 0) 2501 return; 2502 2503 pd1[l1idx] = 0; 2504 pg1->wire_count--; 2505 pmap->pm_stats.resident_count--; 2506 2507 if (pg1->wire_count > 1) 2508 return; 2509 2510 pg1->wire_count = 0; 2511 pptes = (pd_entry_t *)PMAP_DIRECT_MAP(npa2); 2512 pptes[l2idx] = 0; 2513 uvm_pagefree(pg1); 2514 pmap->pm_stats.resident_count--; 2515 2516 pg2->wire_count--; 2517 if (pg2->wire_count > 1) 2518 return; 2519 2520 pg2->wire_count = 0; 2521 pptes = (pd_entry_t *)PMAP_DIRECT_MAP(npa3); 2522 pptes[l3idx] = 0; 2523 uvm_pagefree(pg2); 2524 pmap->pm_stats.resident_count--; 2525 2526 pg3->wire_count--; 2527 if (pg3->wire_count > 1) 2528 return; 2529 2530 pg3->wire_count = 0; 2531 pptes = pd4; 2532 pptes[l4idx] = 0; 2533 uvm_pagefree(pg3); 2534 pmap->pm_stats.resident_count--; 2535 } 2536 2537 int 2538 pmap_enter_ept(struct pmap *pmap, paddr_t gpa, paddr_t hpa, vm_prot_t prot) 2539 { 2540 uint64_t l4idx, l3idx, l2idx, l1idx; 2541 pd_entry_t *pd, npte; 2542 struct vm_page *ptp, *pptp; 2543 paddr_t npa; 2544 struct uvm_object *obj; 2545 int ret = 0; 2546 2547 if (gpa > MAXDSIZ) 2548 return ENOMEM; 2549 2550 l4idx = (gpa & L4_MASK) >> L4_SHIFT; /* PML4E idx */ 2551 l3idx = (gpa & L3_MASK) >> L3_SHIFT; /* PDPTE idx */ 2552 l2idx = (gpa & 
L2_MASK) >> L2_SHIFT; /* PDE idx */ 2553 l1idx = (gpa & L1_MASK) >> L1_SHIFT; /* PTE idx */ 2554 2555 mtx_enter(&pmap->pm_mtx); 2556 2557 /* Start at PML4 / top level */ 2558 pd = (pd_entry_t *)pmap->pm_pdir; 2559 2560 if (pd == NULL) { 2561 ret = ENOMEM; 2562 goto unlock; 2563 } 2564 2565 /* npa = physaddr of PDPT */ 2566 npa = pd[l4idx] & PMAP_PA_MASK; 2567 2568 /* Valid PML4e for the 512GB region containing gpa? */ 2569 if (!npa) { 2570 /* No valid PML4e - allocate PDPT page and set PML4e */ 2571 obj = &pmap->pm_obj[2]; /* PML4 UVM object */ 2572 ptp = uvm_pagealloc(obj, ptp_va2o(gpa, 3), NULL, 2573 UVM_PGA_USERESERVE|UVM_PGA_ZERO); 2574 2575 if (ptp == NULL) { 2576 ret = ENOMEM; 2577 goto unlock; 2578 } 2579 atomic_clearbits_int(&ptp->pg_flags, PG_BUSY); 2580 2581 /* 2582 * New PDPT page - we are setting the first entry, so set 2583 * the wired count to 1 2584 */ 2585 ptp->wire_count = 1; 2586 2587 /* Calculate phys address of this new PDPT page */ 2588 npa = VM_PAGE_TO_PHYS(ptp); 2589 2590 /* 2591 * Higher levels get full perms; specific permissions are 2592 * entered at the lowest level. 2593 */ 2594 pd[l4idx] = (npa | EPT_R | EPT_W | EPT_X); 2595 2596 pmap->pm_stats.resident_count++; 2597 2598 pptp = ptp; 2599 } else { 2600 /* Already allocated PML4e */ 2601 pptp = PHYS_TO_VM_PAGE(npa); 2602 } 2603 2604 pd = (pd_entry_t *)PMAP_DIRECT_MAP(npa); 2605 if (pd == NULL) 2606 panic("%s: can't locate PDPT @ pa=0x%llx", __func__, 2607 (uint64_t)npa); 2608 2609 /* npa = physaddr of PD page */ 2610 npa = pd[l3idx] & PMAP_PA_MASK; 2611 2612 /* Valid PDPTe for the 1GB region containing gpa? */ 2613 if (!npa) { 2614 /* No valid PDPTe - allocate PD page and set PDPTe */ 2615 obj = &pmap->pm_obj[1]; /* PDPT UVM object */ 2616 ptp = uvm_pagealloc(obj, ptp_va2o(gpa, 2), NULL, 2617 UVM_PGA_USERESERVE|UVM_PGA_ZERO); 2618 2619 if (ptp == NULL) { 2620 ret = ENOMEM; 2621 goto unlock; 2622 } 2623 atomic_clearbits_int(&ptp->pg_flags, PG_BUSY); 2624 2625 /* 2626 * New PD page - we are setting the first entry, so set 2627 * the wired count to 1 2628 */ 2629 ptp->wire_count = 1; 2630 pptp->wire_count++; 2631 2632 npa = VM_PAGE_TO_PHYS(ptp); 2633 2634 /* 2635 * Higher levels get full perms; specific permissions are 2636 * entered at the lowest level. 2637 */ 2638 pd[l3idx] = (npa | EPT_R | EPT_W | EPT_X); 2639 2640 pmap->pm_stats.resident_count++; 2641 2642 pptp = ptp; 2643 } else { 2644 /* Already allocated PDPTe */ 2645 pptp = PHYS_TO_VM_PAGE(npa); 2646 } 2647 2648 pd = (pd_entry_t *)PMAP_DIRECT_MAP(npa); 2649 if (pd == NULL) 2650 panic("%s: can't locate PD page @ pa=0x%llx", __func__, 2651 (uint64_t)npa); 2652 2653 /* npa = physaddr of PT page */ 2654 npa = pd[l2idx] & PMAP_PA_MASK; 2655 2656 /* Valid PDE for the 2MB region containing gpa? */ 2657 if (!npa) { 2658 /* No valid PDE - allocate PT page and set PDE */ 2659 obj = &pmap->pm_obj[0]; /* PDE UVM object */ 2660 ptp = uvm_pagealloc(obj, ptp_va2o(gpa, 1), NULL, 2661 UVM_PGA_USERESERVE|UVM_PGA_ZERO); 2662 2663 if (ptp == NULL) { 2664 ret = ENOMEM; 2665 goto unlock; 2666 } 2667 atomic_clearbits_int(&ptp->pg_flags, PG_BUSY); 2668 2669 ptp->wire_count = 1; 2670 pptp->wire_count++; 2671 2672 npa = VM_PAGE_TO_PHYS(ptp); 2673 2674 /* 2675 * Higher level get full perms; specific permissions are 2676 * entered at the lowest level. 
2677 */ 2678 pd[l2idx] = (npa | EPT_R | EPT_W | EPT_X); 2679 2680 pmap->pm_stats.resident_count++; 2681 2682 } else { 2683 /* Find final ptp */ 2684 ptp = PHYS_TO_VM_PAGE(npa); 2685 if (ptp == NULL) 2686 panic("%s: ptp page vanished?", __func__); 2687 } 2688 2689 pd = (pd_entry_t *)PMAP_DIRECT_MAP(npa); 2690 if (pd == NULL) 2691 panic("%s: can't locate PT page @ pa=0x%llx", __func__, 2692 (uint64_t)npa); 2693 2694 npte = hpa | EPT_WB; 2695 if (prot & PROT_READ) 2696 npte |= EPT_R; 2697 if (prot & PROT_WRITE) 2698 npte |= EPT_W; 2699 if (prot & PROT_EXEC) 2700 npte |= EPT_X; 2701 2702 if (pd[l1idx] == 0) { 2703 ptp->wire_count++; 2704 pmap->pm_stats.resident_count++; 2705 } else { 2706 /* XXX flush ept */ 2707 } 2708 2709 pd[l1idx] = npte; 2710 2711 unlock: 2712 mtx_leave(&pmap->pm_mtx); 2713 2714 return ret; 2715 } 2716 #endif /* NVMM > 0 */ 2717 2718 /* 2719 * pmap_enter: enter a mapping into a pmap 2720 * 2721 * => must be done "now" ... no lazy-evaluation 2722 */ 2723 2724 int 2725 pmap_enter(struct pmap *pmap, vaddr_t va, paddr_t pa, vm_prot_t prot, int flags) 2726 { 2727 pt_entry_t opte, npte; 2728 struct vm_page *ptp, *pg = NULL; 2729 struct pv_entry *pve, *opve = NULL; 2730 int ptpdelta, wireddelta, resdelta; 2731 int wired = (flags & PMAP_WIRED) != 0; 2732 int crypt = (flags & PMAP_NOCRYPT) == 0; 2733 int nocache = (pa & PMAP_NOCACHE) != 0; 2734 int wc = (pa & PMAP_WC) != 0; 2735 int error, shootself; 2736 paddr_t scr3; 2737 2738 #if NVMM > 0 2739 if (pmap_is_ept(pmap)) 2740 return pmap_enter_ept(pmap, va, pa, prot); 2741 #endif /* NVMM > 0 */ 2742 2743 KASSERT(!(wc && nocache)); 2744 pa &= PMAP_PA_MASK; 2745 2746 #ifdef DIAGNOSTIC 2747 if (va == (vaddr_t) PDP_BASE) 2748 panic("%s: trying to map over PDP!", __func__); 2749 2750 /* sanity check: kernel PTPs should already have been pre-allocated */ 2751 if (va >= VM_MIN_KERNEL_ADDRESS && 2752 !pmap_valid_entry(pmap->pm_pdir[pl_i(va, PTP_LEVELS)])) 2753 panic("%s: missing kernel PTP for va %lx!", __func__, va); 2754 2755 #endif 2756 2757 pve = pool_get(&pmap_pv_pool, PR_NOWAIT); 2758 if (pve == NULL) { 2759 if (flags & PMAP_CANFAIL) { 2760 error = ENOMEM; 2761 goto out; 2762 } 2763 panic("%s: no pv entries available", __func__); 2764 } 2765 2766 /* 2767 * map in ptes and get a pointer to our PTP (unless we are the kernel) 2768 */ 2769 2770 scr3 = pmap_map_ptes(pmap); 2771 shootself = (scr3 == 0); 2772 if (pmap == pmap_kernel()) { 2773 ptp = NULL; 2774 } else { 2775 ptp = pmap_get_ptp(pmap, va); 2776 if (ptp == NULL) { 2777 if (flags & PMAP_CANFAIL) { 2778 pmap_unmap_ptes(pmap, scr3); 2779 error = ENOMEM; 2780 goto out; 2781 } 2782 panic("%s: get ptp failed", __func__); 2783 } 2784 } 2785 opte = PTE_BASE[pl1_i(va)]; /* old PTE */ 2786 2787 /* 2788 * is there currently a valid mapping at our VA? 2789 */ 2790 2791 if (pmap_valid_entry(opte)) { 2792 /* 2793 * first, calculate pm_stats updates. resident count will not 2794 * change since we are replacing/changing a valid mapping. 2795 * wired count might change... 2796 */ 2797 2798 resdelta = 0; 2799 if (wired && (opte & PG_W) == 0) 2800 wireddelta = 1; 2801 else if (!wired && (opte & PG_W) != 0) 2802 wireddelta = -1; 2803 else 2804 wireddelta = 0; 2805 ptpdelta = 0; 2806 2807 /* 2808 * is the currently mapped PA the same as the one we 2809 * want to map? 
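 * if so, only the protection/wiring/cache attributes are changing:
 * we sync the R/M bits for a managed page and jump straight to
 * enter_now without touching the pv list.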
2810 */ 2811 2812 if ((opte & pg_frame) == pa) { 2813 2814 /* if this is on the PVLIST, sync R/M bit */ 2815 if (opte & PG_PVLIST) { 2816 pg = PHYS_TO_VM_PAGE(pa); 2817 #ifdef DIAGNOSTIC 2818 if (pg == NULL) 2819 panic("%s: same pa, PG_PVLIST " 2820 "mapping with unmanaged page: " 2821 "va 0x%lx, opte 0x%llx, pa 0x%lx", 2822 __func__, va, opte, pa); 2823 #endif 2824 pmap_sync_flags_pte(pg, opte); 2825 } else { 2826 #ifdef DIAGNOSTIC 2827 if (PHYS_TO_VM_PAGE(pa) != NULL) 2828 panic("%s: same pa, no PG_PVLIST " 2829 "mapping with managed page: " 2830 "va 0x%lx, opte 0x%llx, pa 0x%lx", 2831 __func__, va, opte, pa); 2832 #endif 2833 } 2834 goto enter_now; 2835 } 2836 2837 /* 2838 * changing PAs: we must remove the old one first 2839 */ 2840 2841 /* 2842 * if current mapping is on a pvlist, 2843 * remove it (sync R/M bits) 2844 */ 2845 2846 if (opte & PG_PVLIST) { 2847 pg = PHYS_TO_VM_PAGE(opte & pg_frame); 2848 #ifdef DIAGNOSTIC 2849 if (pg == NULL) 2850 panic("%s: PG_PVLIST mapping with unmanaged " 2851 "page: va 0x%lx, opte 0x%llx, pa 0x%lx", 2852 __func__, va, opte, pa); 2853 #endif 2854 pmap_sync_flags_pte(pg, opte); 2855 opve = pmap_remove_pv(pg, pmap, va); 2856 pg = NULL; /* This is not the page we are looking for */ 2857 } 2858 } else { /* opte not valid */ 2859 resdelta = 1; 2860 if (wired) 2861 wireddelta = 1; 2862 else 2863 wireddelta = 0; 2864 if (ptp != NULL) 2865 ptpdelta = 1; 2866 else 2867 ptpdelta = 0; 2868 } 2869 2870 /* 2871 * pve is either NULL or points to a now-free pv_entry structure 2872 * (the latter case is if we called pmap_remove_pv above). 2873 * 2874 * if this entry is to be on a pvlist, enter it now. 2875 */ 2876 2877 if (pmap_initialized) 2878 pg = PHYS_TO_VM_PAGE(pa); 2879 2880 if (pg != NULL) { 2881 pmap_enter_pv(pg, pve, pmap, va, ptp); 2882 pve = NULL; 2883 } 2884 2885 enter_now: 2886 /* 2887 * at this point pg is !NULL if we want the PG_PVLIST bit set 2888 */ 2889 2890 pmap->pm_stats.resident_count += resdelta; 2891 pmap->pm_stats.wired_count += wireddelta; 2892 if (ptp != NULL) 2893 ptp->wire_count += ptpdelta; 2894 2895 KASSERT(pg == PHYS_TO_VM_PAGE(pa)); 2896 2897 npte = pa | protection_codes[prot] | PG_V; 2898 if (pg != NULL) { 2899 npte |= PG_PVLIST; 2900 /* 2901 * make sure that if the page is write combined all 2902 * instances of pmap_enter make it so. 2903 */ 2904 if (pg->pg_flags & PG_PMAP_WC) { 2905 KASSERT(nocache == 0); 2906 wc = 1; 2907 } 2908 } 2909 if (wc) 2910 npte |= pmap_pg_wc; 2911 if (wired) 2912 npte |= PG_W; 2913 if (nocache) 2914 npte |= PG_N; 2915 if (va < VM_MAXUSER_ADDRESS) 2916 npte |= ((flags & PMAP_EFI) ? 0 : PG_u); 2917 else if (va < VM_MAX_ADDRESS) 2918 npte |= (PG_u | PG_RW); /* XXXCDC: no longer needed? */ 2919 if (pmap == pmap_kernel()) 2920 npte |= pg_g_kern; 2921 if (crypt) 2922 npte |= pg_crypt; 2923 2924 /* 2925 * If the old entry wasn't valid, we can just update it and 2926 * go. If it was valid, and this isn't a read->write 2927 * transition, then we can safely just update it and flush 2928 * any old TLB entries. 2929 * 2930 * If it _was_ valid and this _is_ a read->write transition, 2931 * then this could be a CoW resolution and we need to make 2932 * sure no CPU can see the new writable mapping while another 2933 * still has the old mapping in its TLB, so insert a correct 2934 * but unwritable mapping, flush any old TLB entries, then 2935 * make it writable. 2936 */ 2937 if (! 
pmap_valid_entry(opte)) { 2938 PTE_BASE[pl1_i(va)] = npte; 2939 } else if ((opte | (npte ^ PG_RW)) & PG_RW) { 2940 /* previously writable or not making writable */ 2941 PTE_BASE[pl1_i(va)] = npte; 2942 if (nocache && (opte & PG_N) == 0) 2943 wbinvd_on_all_cpus(); 2944 pmap_tlb_shootpage(pmap, va, shootself); 2945 } else { 2946 PTE_BASE[pl1_i(va)] = npte ^ PG_RW; 2947 if (nocache && (opte & PG_N) == 0) /* XXX impossible? */ 2948 wbinvd_on_all_cpus(); 2949 pmap_tlb_shootpage(pmap, va, shootself); 2950 pmap_tlb_shootwait(); 2951 PTE_BASE[pl1_i(va)] = npte; 2952 } 2953 2954 pmap_unmap_ptes(pmap, scr3); 2955 pmap_tlb_shootwait(); 2956 2957 error = 0; 2958 2959 out: 2960 if (pve != NULL) 2961 pool_put(&pmap_pv_pool, pve); 2962 if (opve != NULL) 2963 pool_put(&pmap_pv_pool, opve); 2964 2965 return error; 2966 } 2967 2968 int 2969 pmap_get_physpage(vaddr_t va, int level, paddr_t *paddrp) 2970 { 2971 struct vm_page *ptp; 2972 struct pmap *kpm = pmap_kernel(); 2973 2974 if (uvm.page_init_done == 0) { 2975 vaddr_t va; 2976 2977 /* 2978 * we're growing the kernel pmap early (from 2979 * uvm_pageboot_alloc()). this case must be 2980 * handled a little differently. 2981 */ 2982 2983 va = pmap_steal_memory(PAGE_SIZE, NULL, NULL); 2984 *paddrp = PMAP_DIRECT_UNMAP(va); 2985 } else { 2986 ptp = uvm_pagealloc(&kpm->pm_obj[level - 1], 2987 ptp_va2o(va, level), NULL, 2988 UVM_PGA_USERESERVE|UVM_PGA_ZERO); 2989 if (ptp == NULL) 2990 panic("%s: out of memory", __func__); 2991 atomic_clearbits_int(&ptp->pg_flags, PG_BUSY); 2992 ptp->wire_count = 1; 2993 *paddrp = VM_PAGE_TO_PHYS(ptp); 2994 } 2995 kpm->pm_stats.resident_count++; 2996 return 1; 2997 } 2998 2999 /* 3000 * Allocate the amount of specified ptps for a ptp level, and populate 3001 * all levels below accordingly, mapping virtual addresses starting at 3002 * kva. 3003 * 3004 * Used by pmap_growkernel. 3005 */ 3006 void 3007 pmap_alloc_level(vaddr_t kva, int lvl, long *needed_ptps) 3008 { 3009 unsigned long i; 3010 vaddr_t va; 3011 paddr_t pa; 3012 unsigned long index, endindex; 3013 int level; 3014 pd_entry_t *pdep; 3015 3016 for (level = lvl; level > 1; level--) { 3017 if (level == PTP_LEVELS) 3018 pdep = pmap_kernel()->pm_pdir; 3019 else 3020 pdep = normal_pdes[level - 2]; 3021 va = kva; 3022 index = pl_i(kva, level); 3023 endindex = index + needed_ptps[level - 1]; 3024 /* 3025 * XXX special case for first time call. 3026 */ 3027 if (nkptp[level - 1] != 0) 3028 index++; 3029 else 3030 endindex--; 3031 3032 for (i = index; i <= endindex; i++) { 3033 pmap_get_physpage(va, level - 1, &pa); 3034 pdep[i] = pa | PG_RW | PG_V | pg_nx | pg_crypt; 3035 nkptp[level - 1]++; 3036 va += nbpd[level - 1]; 3037 } 3038 } 3039 } 3040 3041 /* 3042 * pmap_growkernel: increase usage of KVM space 3043 * 3044 * => we allocate new PTPs for the kernel and install them in all 3045 * the pmaps on the system. 3046 */ 3047 3048 static vaddr_t pmap_maxkvaddr = VM_MIN_KERNEL_ADDRESS; 3049 3050 vaddr_t 3051 pmap_growkernel(vaddr_t maxkvaddr) 3052 { 3053 struct pmap *kpm = pmap_kernel(), *pm; 3054 int s, i; 3055 unsigned newpdes; 3056 long needed_kptp[PTP_LEVELS], target_nptp, old; 3057 3058 if (maxkvaddr <= pmap_maxkvaddr) 3059 return pmap_maxkvaddr; 3060 3061 maxkvaddr = x86_round_pdr(maxkvaddr); 3062 old = nkptp[PTP_LEVELS - 1]; 3063 /* 3064 * This loop could be optimized more, but pmap_growkernel() 3065 * is called infrequently. 
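 * It works out, per level, how many PTPs are needed to map everything
 * from VM_MIN_KERNEL_ADDRESS up to the rounded maxkvaddr;
 * pmap_alloc_level() then allocates and installs them, and any new
 * top-level entries are copied into every pmap on the pmaps list.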
3066 */ 3067 for (i = PTP_LEVELS - 1; i >= 1; i--) { 3068 target_nptp = pl_i(maxkvaddr, i + 1) - 3069 pl_i(VM_MIN_KERNEL_ADDRESS, i + 1); 3070 /* 3071 * XXX only need to check toplevel. 3072 */ 3073 if (target_nptp > nkptpmax[i]) 3074 panic("%s: out of KVA space", __func__); 3075 needed_kptp[i] = target_nptp - nkptp[i] + 1; 3076 } 3077 3078 3079 s = splhigh(); /* to be safe */ 3080 pmap_alloc_level(pmap_maxkvaddr, PTP_LEVELS, needed_kptp); 3081 3082 /* 3083 * If the number of top level entries changed, update all 3084 * pmaps. 3085 */ 3086 if (needed_kptp[PTP_LEVELS - 1] != 0) { 3087 newpdes = nkptp[PTP_LEVELS - 1] - old; 3088 mtx_enter(&pmaps_lock); 3089 LIST_FOREACH(pm, &pmaps, pm_list) { 3090 memcpy(&pm->pm_pdir[PDIR_SLOT_KERN + old], 3091 &kpm->pm_pdir[PDIR_SLOT_KERN + old], 3092 newpdes * sizeof (pd_entry_t)); 3093 } 3094 mtx_leave(&pmaps_lock); 3095 } 3096 pmap_maxkvaddr = maxkvaddr; 3097 splx(s); 3098 3099 return maxkvaddr; 3100 } 3101 3102 vaddr_t 3103 pmap_steal_memory(vsize_t size, vaddr_t *start, vaddr_t *end) 3104 { 3105 int segno; 3106 u_int npg; 3107 vaddr_t va; 3108 paddr_t pa; 3109 struct vm_physseg *seg; 3110 3111 size = round_page(size); 3112 npg = atop(size); 3113 3114 for (segno = 0, seg = vm_physmem; segno < vm_nphysseg; segno++, seg++) { 3115 if (seg->avail_end - seg->avail_start < npg) 3116 continue; 3117 /* 3118 * We can only steal at an ``unused'' segment boundary, 3119 * i.e. either at the start or at the end. 3120 */ 3121 if (seg->avail_start == seg->start || 3122 seg->avail_end == seg->end) 3123 break; 3124 } 3125 if (segno == vm_nphysseg) { 3126 panic("%s: out of memory", __func__); 3127 } else { 3128 if (seg->avail_start == seg->start) { 3129 pa = ptoa(seg->avail_start); 3130 seg->avail_start += npg; 3131 seg->start += npg; 3132 } else { 3133 pa = ptoa(seg->avail_end) - size; 3134 seg->avail_end -= npg; 3135 seg->end -= npg; 3136 } 3137 /* 3138 * If all the segment has been consumed now, remove it. 3139 * Note that the crash dump code still knows about it 3140 * and will dump it correctly. 3141 */ 3142 if (seg->start == seg->end) { 3143 if (vm_nphysseg-- == 1) 3144 panic("%s: out of memory", __func__); 3145 while (segno < vm_nphysseg) { 3146 seg[0] = seg[1]; /* struct copy */ 3147 seg++; 3148 segno++; 3149 } 3150 } 3151 3152 va = PMAP_DIRECT_MAP(pa); 3153 memset((void *)va, 0, size); 3154 } 3155 3156 if (start != NULL) 3157 *start = virtual_avail; 3158 if (end != NULL) 3159 *end = VM_MAX_KERNEL_ADDRESS; 3160 3161 return (va); 3162 } 3163 3164 #ifdef MULTIPROCESSOR 3165 /* 3166 * Locking for tlb shootdown. 3167 * 3168 * We lock by setting tlb_shoot_wait to the number of cpus that will 3169 * receive our tlb shootdown. After sending the IPIs, we don't need to 3170 * worry about locking order or interrupts spinning for the lock because 3171 * the call that grabs the "lock" isn't the one that releases it. And 3172 * there is nothing that can block the IPI that releases the lock. 3173 * 3174 * The functions are organized so that we first count the number of 3175 * cpus we need to send the IPI to, then we grab the counter, then 3176 * we send the IPIs, then we finally do our own shootdown. 3177 * 3178 * Our shootdown is last to make it parallel with the other cpus 3179 * to shorten the spin time. 3180 * 3181 * Notice that we depend on failures to send IPIs only being able to 3182 * happen during boot. 
If they happen later, the above assumption 3183 * doesn't hold since we can end up in situations where noone will 3184 * release the lock if we get an interrupt in a bad moment. 3185 */ 3186 #ifdef MP_LOCKDEBUG 3187 #include <ddb/db_output.h> 3188 extern int __mp_lock_spinout; 3189 #endif 3190 3191 volatile long tlb_shoot_wait __attribute__((section(".kudata"))); 3192 3193 volatile vaddr_t tlb_shoot_addr1 __attribute__((section(".kudata"))); 3194 volatile vaddr_t tlb_shoot_addr2 __attribute__((section(".kudata"))); 3195 volatile int tlb_shoot_first_pcid __attribute__((section(".kudata"))); 3196 3197 #if NVMM > 0 3198 #include <amd64/vmmvar.h> 3199 volatile uint64_t ept_shoot_mode __attribute__((section(".kudata"))); 3200 volatile struct vmx_invept_descriptor ept_shoot_vid 3201 __attribute__((section(".kudata"))); 3202 #endif /* NVMM > 0 */ 3203 3204 /* Obtain the "lock" for TLB shooting */ 3205 static inline int 3206 pmap_start_tlb_shoot(long wait, const char *func) 3207 { 3208 int s = splvm(); 3209 3210 while (atomic_cas_ulong(&tlb_shoot_wait, 0, wait) != 0) { 3211 #ifdef MP_LOCKDEBUG 3212 int nticks = __mp_lock_spinout; 3213 #endif 3214 while (tlb_shoot_wait != 0) { 3215 CPU_BUSY_CYCLE(); 3216 #ifdef MP_LOCKDEBUG 3217 if (--nticks <= 0) { 3218 db_printf("%s: spun out", func); 3219 db_enter(); 3220 nticks = __mp_lock_spinout; 3221 } 3222 #endif 3223 } 3224 } 3225 3226 return s; 3227 } 3228 3229 void 3230 pmap_tlb_shootpage(struct pmap *pm, vaddr_t va, int shootself) 3231 { 3232 struct cpu_info *ci, *self = curcpu(); 3233 CPU_INFO_ITERATOR cii; 3234 long wait = 0; 3235 u_int64_t mask = 0; 3236 int is_kva = va >= VM_MIN_KERNEL_ADDRESS; 3237 3238 CPU_INFO_FOREACH(cii, ci) { 3239 if (ci == self || !(ci->ci_flags & CPUF_RUNNING)) 3240 continue; 3241 if (!is_kva && !pmap_is_active(pm, ci)) 3242 continue; 3243 mask |= (1ULL << ci->ci_cpuid); 3244 wait++; 3245 } 3246 3247 if (wait > 0) { 3248 int s = pmap_start_tlb_shoot(wait, __func__); 3249 3250 tlb_shoot_first_pcid = is_kva ? PCID_KERN : PCID_PROC; 3251 tlb_shoot_addr1 = va; 3252 CPU_INFO_FOREACH(cii, ci) { 3253 if ((mask & (1ULL << ci->ci_cpuid)) == 0) 3254 continue; 3255 if (x86_fast_ipi(ci, LAPIC_IPI_INVLPG) != 0) 3256 panic("%s: ipi failed", __func__); 3257 } 3258 splx(s); 3259 } 3260 3261 if (!pmap_use_pcid) { 3262 if (shootself) 3263 pmap_update_pg(va); 3264 } else if (is_kva) { 3265 invpcid(INVPCID_ADDR, PCID_PROC, va); 3266 invpcid(INVPCID_ADDR, PCID_KERN, va); 3267 } else if (shootself) { 3268 invpcid(INVPCID_ADDR, PCID_PROC, va); 3269 if (cpu_meltdown) 3270 invpcid(INVPCID_ADDR, PCID_PROC_INTEL, va); 3271 } 3272 } 3273 3274 void 3275 pmap_tlb_shootrange(struct pmap *pm, vaddr_t sva, vaddr_t eva, int shootself) 3276 { 3277 struct cpu_info *ci, *self = curcpu(); 3278 CPU_INFO_ITERATOR cii; 3279 long wait = 0; 3280 u_int64_t mask = 0; 3281 int is_kva = sva >= VM_MIN_KERNEL_ADDRESS; 3282 vaddr_t va; 3283 3284 CPU_INFO_FOREACH(cii, ci) { 3285 if (ci == self || !(ci->ci_flags & CPUF_RUNNING)) 3286 continue; 3287 if (!is_kva && !pmap_is_active(pm, ci)) 3288 continue; 3289 mask |= (1ULL << ci->ci_cpuid); 3290 wait++; 3291 } 3292 3293 if (wait > 0) { 3294 int s = pmap_start_tlb_shoot(wait, __func__); 3295 3296 tlb_shoot_first_pcid = is_kva ? 
PCID_KERN : PCID_PROC; 3297 tlb_shoot_addr1 = sva; 3298 tlb_shoot_addr2 = eva; 3299 CPU_INFO_FOREACH(cii, ci) { 3300 if ((mask & (1ULL << ci->ci_cpuid)) == 0) 3301 continue; 3302 if (x86_fast_ipi(ci, LAPIC_IPI_INVLRANGE) != 0) 3303 panic("%s: ipi failed", __func__); 3304 } 3305 splx(s); 3306 } 3307 3308 if (!pmap_use_pcid) { 3309 if (shootself) { 3310 for (va = sva; va < eva; va += PAGE_SIZE) 3311 pmap_update_pg(va); 3312 } 3313 } else if (is_kva) { 3314 for (va = sva; va < eva; va += PAGE_SIZE) { 3315 invpcid(INVPCID_ADDR, PCID_PROC, va); 3316 invpcid(INVPCID_ADDR, PCID_KERN, va); 3317 } 3318 } else if (shootself) { 3319 if (cpu_meltdown) { 3320 for (va = sva; va < eva; va += PAGE_SIZE) { 3321 invpcid(INVPCID_ADDR, PCID_PROC, va); 3322 invpcid(INVPCID_ADDR, PCID_PROC_INTEL, va); 3323 } 3324 } else { 3325 for (va = sva; va < eva; va += PAGE_SIZE) 3326 invpcid(INVPCID_ADDR, PCID_PROC, va); 3327 } 3328 } 3329 } 3330 3331 void 3332 pmap_tlb_shoottlb(struct pmap *pm, int shootself) 3333 { 3334 struct cpu_info *ci, *self = curcpu(); 3335 CPU_INFO_ITERATOR cii; 3336 long wait = 0; 3337 u_int64_t mask = 0; 3338 3339 KASSERT(pm != pmap_kernel()); 3340 3341 CPU_INFO_FOREACH(cii, ci) { 3342 if (ci == self || !pmap_is_active(pm, ci) || 3343 !(ci->ci_flags & CPUF_RUNNING)) 3344 continue; 3345 mask |= (1ULL << ci->ci_cpuid); 3346 wait++; 3347 } 3348 3349 if (wait) { 3350 int s = pmap_start_tlb_shoot(wait, __func__); 3351 CPU_INFO_FOREACH(cii, ci) { 3352 if ((mask & (1ULL << ci->ci_cpuid)) == 0) 3353 continue; 3354 if (x86_fast_ipi(ci, LAPIC_IPI_INVLTLB) != 0) 3355 panic("%s: ipi failed", __func__); 3356 } 3357 splx(s); 3358 } 3359 3360 if (shootself) { 3361 if (!pmap_use_pcid) 3362 tlbflush(); 3363 else { 3364 invpcid(INVPCID_PCID, PCID_PROC, 0); 3365 if (cpu_meltdown) 3366 invpcid(INVPCID_PCID, PCID_PROC_INTEL, 0); 3367 } 3368 } 3369 } 3370 3371 #if NVMM > 0 3372 /* 3373 * pmap_shootept: similar to pmap_tlb_shoottlb, but for remotely invalidating 3374 * EPT using invept. 
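 * Only CPUs that are running this pmap and are in VMM mode (CPUF_VMM)
 * are targeted; the invept mode and descriptor are published in
 * ept_shoot_mode/ept_shoot_vid for the IPI handler, and the local CPU
 * invalidates directly when shootself is set.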
3375 */ 3376 void 3377 pmap_shootept(struct pmap *pm, int shootself) 3378 { 3379 struct cpu_info *ci, *self = curcpu(); 3380 struct vmx_invept_descriptor vid; 3381 CPU_INFO_ITERATOR cii; 3382 long wait = 0; 3383 u_int64_t mask = 0; 3384 3385 KASSERT(pmap_is_ept(pm)); 3386 3387 CPU_INFO_FOREACH(cii, ci) { 3388 if (ci == self || !pmap_is_active(pm, ci) || 3389 !(ci->ci_flags & CPUF_RUNNING) || 3390 !(ci->ci_flags & CPUF_VMM)) 3391 continue; 3392 mask |= (1ULL << ci->ci_cpuid); 3393 wait++; 3394 } 3395 3396 if (wait) { 3397 int s = pmap_start_tlb_shoot(wait, __func__); 3398 3399 ept_shoot_mode = self->ci_vmm_cap.vcc_vmx.vmx_invept_mode; 3400 ept_shoot_vid.vid_eptp = pm->eptp; 3401 ept_shoot_vid.vid_reserved = 0; 3402 3403 CPU_INFO_FOREACH(cii, ci) { 3404 if ((mask & (1ULL << ci->ci_cpuid)) == 0) 3405 continue; 3406 if (x86_fast_ipi(ci, LAPIC_IPI_INVEPT) != 0) 3407 panic("%s: ipi failed", __func__); 3408 } 3409 3410 splx(s); 3411 } 3412 3413 if (shootself && (self->ci_flags & CPUF_VMM)) { 3414 vid.vid_eptp = pm->eptp; 3415 vid.vid_reserved = 0; 3416 invept(self->ci_vmm_cap.vcc_vmx.vmx_invept_mode, &vid); 3417 } 3418 } 3419 #endif /* NVMM > 0 */ 3420 3421 void 3422 pmap_tlb_shootwait(void) 3423 { 3424 #ifdef MP_LOCKDEBUG 3425 int nticks = __mp_lock_spinout; 3426 #endif 3427 while (tlb_shoot_wait != 0) { 3428 CPU_BUSY_CYCLE(); 3429 #ifdef MP_LOCKDEBUG 3430 if (--nticks <= 0) { 3431 db_printf("%s: spun out", __func__); 3432 db_enter(); 3433 nticks = __mp_lock_spinout; 3434 } 3435 #endif 3436 } 3437 } 3438 3439 #else /* MULTIPROCESSOR */ 3440 3441 void 3442 pmap_tlb_shootpage(struct pmap *pm, vaddr_t va, int shootself) 3443 { 3444 if (!pmap_use_pcid) { 3445 if (shootself) 3446 pmap_update_pg(va); 3447 } else if (va >= VM_MIN_KERNEL_ADDRESS) { 3448 invpcid(INVPCID_ADDR, PCID_PROC, va); 3449 invpcid(INVPCID_ADDR, PCID_KERN, va); 3450 } else if (shootself) { 3451 invpcid(INVPCID_ADDR, PCID_PROC, va); 3452 if (cpu_meltdown) 3453 invpcid(INVPCID_ADDR, PCID_PROC_INTEL, va); 3454 } 3455 } 3456 3457 void 3458 pmap_tlb_shootrange(struct pmap *pm, vaddr_t sva, vaddr_t eva, int shootself) 3459 { 3460 vaddr_t va; 3461 3462 if (!pmap_use_pcid) { 3463 if (shootself) { 3464 for (va = sva; va < eva; va += PAGE_SIZE) 3465 pmap_update_pg(va); 3466 } 3467 } else if (sva >= VM_MIN_KERNEL_ADDRESS) { 3468 for (va = sva; va < eva; va += PAGE_SIZE) { 3469 invpcid(INVPCID_ADDR, PCID_PROC, va); 3470 invpcid(INVPCID_ADDR, PCID_KERN, va); 3471 } 3472 } else if (shootself) { 3473 if (cpu_meltdown) { 3474 for (va = sva; va < eva; va += PAGE_SIZE) { 3475 invpcid(INVPCID_ADDR, PCID_PROC, va); 3476 invpcid(INVPCID_ADDR, PCID_PROC_INTEL, va); 3477 } 3478 } else { 3479 for (va = sva; va < eva; va += PAGE_SIZE) 3480 invpcid(INVPCID_ADDR, PCID_PROC, va); 3481 } 3482 } 3483 } 3484 3485 void 3486 pmap_tlb_shoottlb(struct pmap *pm, int shootself) 3487 { 3488 if (shootself) { 3489 if (!pmap_use_pcid) 3490 tlbflush(); 3491 else { 3492 invpcid(INVPCID_PCID, PCID_PROC, 0); 3493 if (cpu_meltdown) 3494 invpcid(INVPCID_PCID, PCID_PROC_INTEL, 0); 3495 } 3496 } 3497 } 3498 3499 #if NVMM > 0 3500 void 3501 pmap_shootept(struct pmap *pm, int shootself) 3502 { 3503 struct cpu_info *self = curcpu(); 3504 struct vmx_invept_descriptor vid; 3505 3506 KASSERT(pmap_is_ept(pm)); 3507 3508 if (shootself && (self->ci_flags & CPUF_VMM)) { 3509 vid.vid_eptp = pm->eptp; 3510 vid.vid_reserved = 0; 3511 invept(self->ci_vmm_cap.vcc_vmx.vmx_invept_mode, &vid); 3512 } 3513 } 3514 #endif /* NVMM > 0 */ 3515 3516 #endif /* MULTIPROCESSOR */ 3517