1 /* $OpenBSD: pmap.c,v 1.226 2024/11/08 13:18:29 jsg Exp $ */ 2 /* $NetBSD: pmap.c,v 1.91 2000/06/02 17:46:37 thorpej Exp $ */ 3 4 /* 5 * Copyright (c) 1997 Charles D. Cranor and Washington University. 6 * All rights reserved. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 18 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 19 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 20 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 21 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 22 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 23 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 24 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 26 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 27 */ 28 29 /* 30 * pmap.c: i386 pmap module rewrite 31 * Chuck Cranor <chuck@ccrc.wustl.edu> 32 * 11-Aug-97 33 * 34 * history of this pmap module: in addition to my own input, i used 35 * the following references for this rewrite of the i386 pmap: 36 * 37 * [1] the NetBSD i386 pmap. this pmap appears to be based on the 38 * BSD hp300 pmap done by Mike Hibler at University of Utah. 39 * it was then ported to the i386 by William Jolitz of UUNET 40 * Technologies, Inc. Then Charles M. Hannum of the NetBSD 41 * project fixed some bugs and provided some speed ups. 42 * 43 * [2] the FreeBSD i386 pmap. this pmap seems to be the 44 * Hibler/Jolitz pmap, as modified for FreeBSD by John S. Dyson 45 * and David Greenman. 46 * 47 * [3] the Mach pmap. this pmap, from CMU, seems to have migrated 48 * between several processors. the VAX version was done by 49 * Avadis Tevanian, Jr., and Michael Wayne Young. the i386 50 * version was done by Lance Berc, Mike Kupfer, Bob Baron, 51 * David Golub, and Richard Draves. the alpha version was 52 * done by Alessandro Forin (CMU/Mach) and Chris Demetriou 53 * (NetBSD/alpha). 54 */ 55 56 #include <sys/param.h> 57 #include <sys/systm.h> 58 #include <sys/atomic.h> 59 #include <sys/proc.h> 60 #include <sys/pool.h> 61 #include <sys/user.h> 62 #include <sys/mutex.h> 63 64 #include <uvm/uvm.h> 65 66 #include <machine/specialreg.h> 67 68 #include <sys/msgbuf.h> 69 #include <stand/boot/bootarg.h> 70 71 /* #define PMAP_DEBUG */ 72 73 #ifdef PMAP_DEBUG 74 #define DPRINTF(x...) do { printf(x); } while(0) 75 #else 76 #define DPRINTF(x...) 77 #endif /* PMAP_DEBUG */ 78 79 /* 80 * this file contains the code for the "pmap module." the module's 81 * job is to manage the hardware's virtual to physical address mappings. 82 * note that there are two levels of mapping in the VM system: 83 * 84 * [1] the upper layer of the VM system uses vm_map's and vm_map_entry's 85 * to map ranges of virtual address space to objects/files. 
for 86 * example, the vm_map may say: "map VA 0x1000 to 0x22000 read-only 87 * to the file /bin/ls starting at offset zero." note that 88 * the upper layer mapping is not concerned with how individual 89 * vm_pages are mapped. 90 * 91 * [2] the lower layer of the VM system (the pmap) maintains the mappings 92 * from virtual addresses. it is concerned with which vm_page is 93 * mapped where. for example, when you run /bin/ls and start 94 * at page 0x1000 the fault routine may lookup the correct page 95 * of the /bin/ls file and then ask the pmap layer to establish 96 * a mapping for it. 97 * 98 * note that information in the lower layer of the VM system can be 99 * thrown away since it can easily be reconstructed from the info 100 * in the upper layer. 101 * 102 * data structures we use include: 103 * 104 * - struct pmap: describes the address space of one thread 105 * - struct pv_entry: describes one <PMAP,VA> mapping of a PA 106 * - struct pv_head: there is one pv_head per managed page of 107 * physical memory. the pv_head points to a list of pv_entry 108 * structures which describe all the <PMAP,VA> pairs that this 109 * page is mapped in. this is critical for page based operations 110 * such as pmap_page_protect() [change protection on _all_ mappings 111 * of a page] 112 */ 113 /* 114 * i386 MMU hardware structure: 115 * 116 * the i386 MMU is a two-level MMU which maps 4GB of virtual memory. 117 * the pagesize is 4K (4096 [0x1000] bytes), although newer pentium 118 * processors can support a 4MB pagesize as well. 119 * 120 * the first level table (segment table?) is called a "page directory" 121 * and it contains 1024 page directory entries (PDEs). each PDE is 122 * 4 bytes (an int), so a PD fits in a single 4K page. this page is 123 * the page directory page (PDP). each PDE in a PDP maps 4MB of space 124 * (1024 * 4MB = 4GB). a PDE contains the physical address of the 125 * second level table: the page table. or, if 4MB pages are being used, 126 * then the PDE contains the PA of the 4MB page being mapped. 127 * 128 * a page table consists of 1024 page table entries (PTEs). each PTE is 129 * 4 bytes (an int), so a page table also fits in a single 4K page. a 130 * 4K page being used as a page table is called a page table page (PTP). 131 * each PTE in a PTP maps one 4K page (1024 * 4K = 4MB). a PTE contains 132 * the physical address of the page it maps and some flag bits (described 133 * below). 134 * 135 * the processor has a special register, "cr3", which points to the 136 * the PDP which is currently controlling the mappings of the virtual 137 * address space. 138 * 139 * the following picture shows the translation process for a 4K page: 140 * 141 * %cr3 register [PA of PDP] 142 * | 143 * | 144 * | bits <31-22> of VA bits <21-12> of VA bits <11-0> 145 * | index the PDP (0 - 1023) index the PTP are the page offset 146 * | | | | 147 * | v | | 148 * +--->+----------+ | | 149 * | PD Page | PA of v | 150 * | |---PTP-------->+------------+ | 151 * | 1024 PDE | | page table |--PTE--+ | 152 * | entries | | (aka PTP) | | | 153 * +----------+ | 1024 PTE | | | 154 * | entries | | | 155 * +------------+ | | 156 * | | 157 * bits <31-12> bits <11-0> 158 * p h y s i c a l a d d r 159 * 160 * the i386 caches PTEs in a TLB. it is important to flush out old 161 * TLB mappings when making a change to a mapping. writing to the 162 * %cr3 will flush the entire TLB. 
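 *
 * the translation that the TLB caches can be written out in rough C
 * terms as follows (an illustrative sketch only; "va" is the virtual
 * address, "pd" stands for the PDP and "pt" for the PTP selected by
 * the PDE):
 *
 *	pde = pd[va >> 22]			bits <31-22>: PDE index
 *	pt  = pde & 0xfffff000			PA of the PTP
 *	pte = pt[(va >> 12) & 0x3ff]		bits <21-12>: PTE index
 *	pa  = (pte & 0xfffff000) | (va & 0xfff)	bits <11-0>: page offset
 *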
newer processors also have an 163 * instruction that will invalidate the mapping of a single page (which 164 * is useful if you are changing a single mapping because it preserves 165 * all the cached TLB entries). 166 * 167 * as shows, bits 31-12 of the PTE contain PA of the page being mapped. 168 * the rest of the PTE is defined as follows: 169 * bit# name use 170 * 11 n/a available for OS use, hardware ignores it 171 * 10 n/a available for OS use, hardware ignores it 172 * 9 n/a available for OS use, hardware ignores it 173 * 8 G global bit (see discussion below) 174 * 7 PS page size [for PDEs] (0=4k, 1=4M <if supported>) 175 * 6 D dirty (modified) page 176 * 5 A accessed (referenced) page 177 * 4 PCD cache disable 178 * 3 PWT prevent write through (cache) 179 * 2 U/S user/supervisor bit (0=supervisor only, 1=both u&s) 180 * 1 R/W read/write bit (0=read only, 1=read-write) 181 * 0 P present (valid) 182 * 183 * notes: 184 * - on the i386 the R/W bit is ignored if processor is in supervisor 185 * state (bug!) 186 * - PS is only supported on newer processors 187 * - PTEs with the G bit are global in the sense that they are not 188 * flushed from the TLB when %cr3 is written (to flush, use the 189 * "flush single page" instruction). this is only supported on 190 * newer processors. this bit can be used to keep the kernel's 191 * TLB entries around while context switching. since the kernel 192 * is mapped into all processes at the same place it does not make 193 * sense to flush these entries when switching from one process' 194 * pmap to another. 195 */ 196 /* 197 * A pmap describes a process' 4GB virtual address space. This 198 * virtual address space can be broken up into 1024 4MB regions which 199 * are described by PDEs in the PDP. The PDEs are defined as follows: 200 * 201 * Ranges are inclusive -> exclusive, just like vm_map_entry start/end. 202 * The following assumes that KERNBASE is 0xd0000000. 203 * 204 * PDE#s VA range Usage 205 * 0->831 0x0 -> 0xcfc00000 user address space, note that the 206 * max user address is 0xcfbfe000 207 * the final two pages in the last 4MB 208 * used to be reserved for the UAREA 209 * but now are no longer used. 210 * 831 0xcfc00000-> recursive mapping of PDP (used for 211 * 0xd0000000 linear mapping of PTPs). 212 * 832->1023 0xd0000000-> kernel address space (constant 213 * 0xffc00000 across all pmaps/processes). 214 * 1023 0xffc00000-> "alternate" recursive PDP mapping 215 * <end> (for other pmaps). 216 * 217 * 218 * Note: A recursive PDP mapping provides a way to map all the PTEs for 219 * a 4GB address space into a linear chunk of virtual memory. In other 220 * words, the PTE for page 0 is the first int mapped into the 4MB recursive 221 * area. The PTE for page 1 is the second int. The very last int in the 222 * 4MB range is the PTE that maps VA 0xffffe000 (the last page in a 4GB 223 * address). 224 * 225 * All pmaps' PDs must have the same values in slots 832->1023 so that 226 * the kernel is always mapped in every process. These values are loaded 227 * into the PD at pmap creation time. 228 * 229 * At any one time only one pmap can be active on a processor. This is 230 * the pmap whose PDP is pointed to by processor register %cr3. This pmap 231 * will have all its PTEs mapped into memory at the recursive mapping 232 * point (slot #831 as show above). 
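 *
 * (as a quick sanity check of the table above: a PDE slot covers 4MB,
 * so its number is simply VA >> 22; 0xd0000000 >> 22 == 832, the first
 * kernel slot, and 0xcfc00000 >> 22 == 831, the recursive slot)
 *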
When the pmap code wants to find the 233 * PTE for a virtual address, all it has to do is the following: 234 * 235 * Address of PTE = (831 * 4MB) + (VA / PAGE_SIZE) * sizeof(pt_entry_t) 236 * = 0xcfc00000 + (VA / 4096) * 4 237 * 238 * What happens if the pmap layer is asked to perform an operation 239 * on a pmap that is not the one which is currently active? In that 240 * case we take the PA of the PDP of the non-active pmap and put it in 241 * slot 1023 of the active pmap. This causes the non-active pmap's 242 * PTEs to get mapped in the final 4MB of the 4GB address space 243 * (e.g. starting at 0xffc00000). 244 * 245 * The following figure shows the effects of the recursive PDP mapping: 246 * 247 * PDP (%cr3) 248 * +----+ 249 * | 0| -> PTP#0 that maps VA 0x0 -> 0x400000 250 * | | 251 * | | 252 * | 831| -> points back to PDP (%cr3) mapping VA 0xcfc00000 -> 0xd0000000 253 * | 832| -> first kernel PTP (maps 0xd0000000 -> 0xe0400000) 254 * | | 255 * |1023| -> points to alternate pmap's PDP (maps 0xffc00000 -> end) 256 * +----+ 257 * 258 * Note that the PDE#831 VA (0xcfc00000) is defined as "PTE_BASE". 259 * Note that the PDE#1023 VA (0xffc00000) is defined as "APTE_BASE". 260 * 261 * Starting at VA 0xcfc00000 the current active PDP (%cr3) acts as a 262 * PTP: 263 * 264 * PTP#831 == PDP(%cr3) => maps VA 0xcfc00000 -> 0xd0000000 265 * +----+ 266 * | 0| -> maps the contents of PTP#0 at VA 0xcfc00000->0xcfc01000 267 * | | 268 * | | 269 * | 831| -> maps the contents of PTP#831 (the PDP) at VA 0xcff3f000 270 * | 832| -> maps the contents of first kernel PTP 271 * | | 272 * |1023| 273 * +----+ 274 * 275 * Note that mapping of the PDP at PTP#831's VA (0xcff3f000) is 276 * defined as "PDP_BASE".... within that mapping there are two 277 * defines: 278 * "PDP_PDE" (0xcff3fcfc) is the VA of the PDE in the PDP 279 * which points back to itself. 280 * "APDP_PDE" (0xcff3fffc) is the VA of the PDE in the PDP which 281 * establishes the recursive mapping of the alternate pmap. 282 * To set the alternate PDP, one just has to put the correct 283 * PA info in *APDP_PDE. 284 * 285 * Note that in the APTE_BASE space, the APDP appears at VA 286 * "APDP_BASE" (0xfffff000). 
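 *
 * A worked example of the "Address of PTE" formula above (still
 * assuming KERNBASE is 0xd0000000): for VA 0xd0001000,
 *
 *	0xcfc00000 + (0xd0001000 / 4096) * 4 == 0xcfc00000 + 0x340004
 *					     == 0xcff40004
 *
 * which is the second entry of the first kernel PTP (PTP#832) as seen
 * through the recursive mapping.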
287 */ 288 #define PG_FRAME 0xfffff000 /* page frame mask */ 289 #define PG_LGFRAME 0xffc00000 /* large (4M) page frame mask */ 290 291 /* 292 * The following defines give the virtual addresses of various MMU 293 * data structures: 294 * PTE_BASE and APTE_BASE: the base VA of the linear PTE mappings 295 * PDP_PDE and APDP_PDE: the VA of the PDE that points back to the PDP/APDP 296 */ 297 #define PTE_BASE ((pt_entry_t *) (PDSLOT_PTE * NBPD)) 298 #define APTE_BASE ((pt_entry_t *) (PDSLOT_APTE * NBPD)) 299 #define PDP_BASE ((pd_entry_t *)(((char *)PTE_BASE) + (PDSLOT_PTE * NBPG))) 300 #define APDP_BASE ((pd_entry_t *)(((char *)APTE_BASE) + (PDSLOT_APTE * NBPG))) 301 #define PDP_PDE (PDP_BASE + PDSLOT_PTE) 302 #define APDP_PDE (PDP_BASE + PDSLOT_APTE) 303 304 /* 305 * pdei/ptei: generate index into PDP/PTP from a VA 306 */ 307 #define PD_MASK 0xffc00000 /* page directory address bits */ 308 #define PT_MASK 0x003ff000 /* page table address bits */ 309 #define pdei(VA) (((VA) & PD_MASK) >> PDSHIFT) 310 #define ptei(VA) (((VA) & PT_MASK) >> PGSHIFT) 311 312 /* 313 * Mach derived conversion macros 314 */ 315 #define i386_round_pdr(x) ((((unsigned)(x)) + ~PD_MASK) & PD_MASK) 316 317 /* 318 * various address macros 319 * 320 * vtopte: return a pointer to the PTE mapping a VA 321 */ 322 #define vtopte(VA) (PTE_BASE + atop((vaddr_t)VA)) 323 324 /* 325 * PTP macros: 326 * A PTP's index is the PD index of the PDE that points to it. 327 * A PTP's offset is the byte-offset in the PTE space that this PTP is at. 328 * A PTP's VA is the first VA mapped by that PTP. 329 * 330 * Note that NBPG == number of bytes in a PTP (4096 bytes == 1024 entries) 331 * NBPD == number of bytes a PTP can map (4MB) 332 */ 333 334 #define ptp_i2o(I) ((I) * NBPG) /* index => offset */ 335 #define ptp_o2i(O) ((O) / NBPG) /* offset => index */ 336 #define ptp_i2v(I) ((I) * NBPD) /* index => VA */ 337 #define ptp_v2i(V) ((V) / NBPD) /* VA => index (same as pdei) */ 338 339 /* 340 * Access PD and PT 341 */ 342 #define PDE(pm,i) (((pd_entry_t *)(pm)->pm_pdir)[(i)]) 343 344 /* 345 * here we define the data types for PDEs and PTEs 346 */ 347 typedef u_int32_t pd_entry_t; /* PDE */ 348 typedef u_int32_t pt_entry_t; /* PTE */ 349 350 /* 351 * Number of PTEs per cache line. 4 byte pte, 64-byte cache line 352 * Used to avoid false sharing of cache lines. 353 */ 354 #define NPTECL 16 355 356 /* 357 * global data structures 358 */ 359 360 /* The kernel's pmap (proc0), 32 byte aligned in case we are using PAE */ 361 struct pmap __attribute__ ((aligned (32))) kernel_pmap_store; 362 363 /* 364 * nkpde is the number of kernel PTPs allocated for the kernel at 365 * boot time (NKPTP is a compile time override). this number can 366 * grow dynamically as needed (but once allocated, we never free 367 * kernel PTPs). 368 */ 369 370 int nkpde = NKPTP; 371 int nkptp_max = 1024 - (KERNBASE / NBPD) - 1; 372 373 /* 374 * pg_g_kern: if CPU is affected by Meltdown pg_g_kern is 0, 375 * otherwise it is set to PG_G. pmap_pg_g will be derived 376 * from pg_g_kern, see pmap_bootstrap(). 377 */ 378 extern int pg_g_kern; 379 380 /* 381 * pmap_pg_g: if our processor supports PG_G in the PTE then we 382 * set pmap_pg_g to PG_G (otherwise it is zero). 383 */ 384 385 int pmap_pg_g = 0; 386 387 /* 388 * pmap_pg_wc: if our processor supports PAT then we set this 389 * to be the pte bits for Write Combining. 
Else we fall back to 390 * UC- so mtrrs can override the cacheability 391 */ 392 int pmap_pg_wc = PG_UCMINUS; 393 394 /* 395 * other data structures 396 */ 397 398 uint32_t protection_codes[8]; /* maps MI prot to i386 prot code */ 399 int pmap_initialized = 0; /* pmap_init done yet? */ 400 401 /* 402 * MULTIPROCESSOR: special VAs/ PTEs are actually allocated inside a 403 * MAXCPUS*NPTECL array of PTEs, to avoid cache line thrashing 404 * due to false sharing. 405 */ 406 407 #ifdef MULTIPROCESSOR 408 #define PTESLEW(pte, id) ((pte)+(id)*NPTECL) 409 #define VASLEW(va,id) ((va)+(id)*NPTECL*NBPG) 410 #else 411 #define PTESLEW(pte, id) (pte) 412 #define VASLEW(va,id) (va) 413 #endif 414 415 /* 416 * pv management structures. 417 */ 418 struct pool pmap_pv_pool; 419 420 #define PVE_LOWAT (PVE_PER_PVPAGE / 2) /* free pv_entry low water mark */ 421 #define PVE_HIWAT (PVE_LOWAT + (PVE_PER_PVPAGE * 2)) 422 /* high water mark */ 423 424 /* 425 * the following two vaddr_t's are used during system startup 426 * to keep track of how much of the kernel's VM space we have used. 427 * once the system is started, the management of the remaining kernel 428 * VM space is turned over to the kernel_map vm_map. 429 */ 430 431 static vaddr_t virtual_avail; /* VA of first free KVA */ 432 static vaddr_t virtual_end; /* VA of last free KVA */ 433 434 /* 435 * linked list of all non-kernel pmaps 436 */ 437 438 struct pmap_head pmaps; 439 struct mutex pmaps_lock = MUTEX_INITIALIZER(IPL_VM); 440 441 /* 442 * pool that pmap structures are allocated from 443 */ 444 445 struct pool pmap_pmap_pool; 446 447 /* 448 * special VAs and the PTEs that map them 449 */ 450 451 pt_entry_t *csrc_pte, *cdst_pte, *zero_pte, *ptp_pte, *flsh_pte; 452 caddr_t pmap_csrcp, pmap_cdstp, pmap_zerop, pmap_ptpp, pmap_flshp; 453 caddr_t vmmap; /* XXX: used by mem.c... it should really uvm_map_reserve it */ 454 455 extern uint32_t cpu_meltdown; 456 457 /* 458 * local prototypes 459 */ 460 struct vm_page *pmap_alloc_ptp_86(struct pmap *, int, pt_entry_t); 461 struct vm_page *pmap_get_ptp_86(struct pmap *, int); 462 pt_entry_t *pmap_map_ptes_86(struct pmap *); 463 void pmap_unmap_ptes_86(struct pmap *); 464 void pmap_do_remove_86(struct pmap *, vaddr_t, vaddr_t, int); 465 void pmap_remove_ptes_86(struct pmap *, struct vm_page *, vaddr_t, 466 vaddr_t, vaddr_t, int, struct pv_entry **); 467 void *pmap_pv_page_alloc(struct pool *, int, int *); 468 void pmap_pv_page_free(struct pool *, void *); 469 470 struct pool_allocator pmap_pv_page_allocator = { 471 pmap_pv_page_alloc, pmap_pv_page_free, 472 }; 473 474 void pmap_sync_flags_pte_86(struct vm_page *, pt_entry_t); 475 476 void pmap_drop_ptp_86(struct pmap *, vaddr_t, struct vm_page *, 477 pt_entry_t *); 478 479 void setcslimit(struct pmap *, struct trapframe *, struct pcb *, 480 vaddr_t); 481 void pmap_pinit_pd_86(struct pmap *); 482 483 static __inline u_int 484 pmap_pte2flags(pt_entry_t pte) 485 { 486 return (((pte & PG_U) ? PG_PMAP_REF : 0) | 487 ((pte & PG_M) ? 
PG_PMAP_MOD : 0)); 488 } 489 490 void 491 pmap_sync_flags_pte_86(struct vm_page *pg, pt_entry_t pte) 492 { 493 if (pte & (PG_U|PG_M)) { 494 atomic_setbits_int(&pg->pg_flags, pmap_pte2flags(pte)); 495 } 496 } 497 498 void 499 pmap_apte_flush(void) 500 { 501 pmap_tlb_shoottlb(); 502 pmap_tlb_shootwait(); 503 } 504 505 /* 506 * pmap_map_ptes: map a pmap's PTEs into KVM and lock them in 507 * 508 * => we lock enough pmaps to keep things locked in 509 * => must be undone with pmap_unmap_ptes before returning 510 */ 511 512 pt_entry_t * 513 pmap_map_ptes_86(struct pmap *pmap) 514 { 515 pd_entry_t opde; 516 517 /* the kernel's pmap is always accessible */ 518 if (pmap == pmap_kernel()) { 519 return(PTE_BASE); 520 } 521 522 mtx_enter(&pmap->pm_mtx); 523 524 /* if curpmap then we are always mapped */ 525 if (pmap_is_curpmap(pmap)) { 526 return(PTE_BASE); 527 } 528 529 mtx_enter(&curcpu()->ci_curpmap->pm_apte_mtx); 530 531 /* need to load a new alternate pt space into curpmap? */ 532 opde = *APDP_PDE; 533 #if defined(MULTIPROCESSOR) && defined(DIAGNOSTIC) 534 if (pmap_valid_entry(opde)) 535 panic("pmap_map_ptes_86: APTE valid"); 536 #endif 537 if (!pmap_valid_entry(opde) || (opde & PG_FRAME) != pmap->pm_pdirpa) { 538 *APDP_PDE = (pd_entry_t) (pmap->pm_pdirpa | PG_RW | PG_V | 539 PG_U | PG_M); 540 if (pmap_valid_entry(opde)) 541 pmap_apte_flush(); 542 } 543 return(APTE_BASE); 544 } 545 546 /* 547 * pmap_unmap_ptes: unlock the PTE mapping of "pmap" 548 */ 549 550 void 551 pmap_unmap_ptes_86(struct pmap *pmap) 552 { 553 if (pmap == pmap_kernel()) 554 return; 555 556 if (!pmap_is_curpmap(pmap)) { 557 #if defined(MULTIPROCESSOR) 558 *APDP_PDE = 0; 559 pmap_apte_flush(); 560 #endif 561 mtx_leave(&curcpu()->ci_curpmap->pm_apte_mtx); 562 } 563 564 mtx_leave(&pmap->pm_mtx); 565 } 566 567 void 568 pmap_exec_account(struct pmap *pm, vaddr_t va, 569 uint32_t opte, uint32_t npte) 570 { 571 if (pm == pmap_kernel()) 572 return; 573 574 if (curproc->p_vmspace == NULL || 575 pm != vm_map_pmap(&curproc->p_vmspace->vm_map)) 576 return; 577 578 if ((opte ^ npte) & PG_X) 579 pmap_tlb_shootpage(pm, va); 580 581 if (cpu_pae) 582 return; 583 584 /* 585 * Executability was removed on the last executable change. 586 * Reset the code segment to something conservative and 587 * let the trap handler deal with setting the right limit. 588 * We can't do that because of locking constraints on the vm map. 589 * 590 * XXX - floating cs - set this _really_ low. 591 */ 592 if ((opte & PG_X) && (npte & PG_X) == 0 && va == pm->pm_hiexec) { 593 struct trapframe *tf = curproc->p_md.md_regs; 594 struct pcb *pcb = &curproc->p_addr->u_pcb; 595 596 KERNEL_LOCK(); 597 pm->pm_hiexec = I386_MAX_EXE_ADDR; 598 setcslimit(pm, tf, pcb, I386_MAX_EXE_ADDR); 599 KERNEL_UNLOCK(); 600 } 601 } 602 603 /* 604 * Fixup the code segment to cover all potential executable mappings. 605 * Called by kernel SEGV trap handler. 606 * returns 0 if no changes to the code segment were made. 607 */ 608 int 609 pmap_exec_fixup(struct vm_map *map, struct trapframe *tf, vaddr_t gdt_cs, 610 struct pcb *pcb) 611 { 612 struct vm_map_entry *ent; 613 struct pmap *pm = vm_map_pmap(map); 614 vaddr_t va = 0; 615 vaddr_t pm_cs; 616 617 KERNEL_LOCK(); 618 619 vm_map_lock(map); 620 RBT_FOREACH_REVERSE(ent, uvm_map_addr, &map->addr) { 621 if (ent->protection & PROT_EXEC) 622 break; 623 } 624 /* 625 * This entry has greater va than the entries before. 626 * We need to make it point to the last page, not past it. 
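	 * For example, if ent->end were 0x0804a000 (an arbitrary address),
	 * trunc_page(ent->end - 1) yields 0x08049000, the start of the
	 * last mapped page.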
627 */ 628 if (ent) 629 va = trunc_page(ent->end - 1); 630 vm_map_unlock(map); 631 632 KERNEL_ASSERT_LOCKED(); 633 634 pm_cs = SEGDESC_LIMIT(pm->pm_codeseg); 635 636 /* 637 * Another thread running on another cpu can change 638 * pm_hiexec and pm_codeseg. If this has happened 639 * during our timeslice, our gdt code segment will 640 * be stale. So only allow the fault through if the 641 * faulting address is less then pm_hiexec and our 642 * gdt code segment is not stale. 643 */ 644 if (va <= pm->pm_hiexec && pm_cs == pm->pm_hiexec && 645 gdt_cs == pm->pm_hiexec) { 646 KERNEL_UNLOCK(); 647 return (0); 648 } 649 650 pm->pm_hiexec = va; 651 652 /* 653 * We have a new 'highest executable' va, so we need to update 654 * the value for the code segment limit, which is stored in the 655 * PCB. 656 */ 657 setcslimit(pm, tf, pcb, va); 658 659 KERNEL_UNLOCK(); 660 return (1); 661 } 662 663 u_int32_t 664 pmap_pte_set_86(vaddr_t va, paddr_t pa, u_int32_t bits) 665 { 666 pt_entry_t pte, *ptep = vtopte(va); 667 668 pa &= PMAP_PA_MASK; 669 670 pte = i386_atomic_testset_ul(ptep, pa | bits); /* zap! */ 671 return (pte & ~PG_FRAME); 672 } 673 674 u_int32_t 675 pmap_pte_setbits_86(vaddr_t va, u_int32_t set, u_int32_t clr) 676 { 677 pt_entry_t *ptep = vtopte(va); 678 pt_entry_t pte = *ptep; 679 680 *ptep = (pte | set) & ~clr; 681 return (pte & ~PG_FRAME); 682 } 683 684 u_int32_t 685 pmap_pte_bits_86(vaddr_t va) 686 { 687 pt_entry_t *ptep = vtopte(va); 688 689 return (*ptep & ~PG_FRAME); 690 } 691 692 paddr_t 693 pmap_pte_paddr_86(vaddr_t va) 694 { 695 pt_entry_t *ptep = vtopte(va); 696 697 return (*ptep & PG_FRAME); 698 } 699 700 /* 701 * pmap_tmpmap_pa: map a page in for tmp usage 702 */ 703 704 vaddr_t 705 pmap_tmpmap_pa_86(paddr_t pa) 706 { 707 #ifdef MULTIPROCESSOR 708 int id = cpu_number(); 709 #endif 710 pt_entry_t *ptpte; 711 caddr_t ptpva; 712 713 ptpte = PTESLEW(ptp_pte, id); 714 ptpva = VASLEW(pmap_ptpp, id); 715 716 #if defined(DIAGNOSTIC) 717 if (*ptpte) 718 panic("pmap_tmpmap_pa: ptp_pte in use?"); 719 #endif 720 *ptpte = PG_V | PG_RW | pa; /* always a new mapping */ 721 return((vaddr_t)ptpva); 722 } 723 724 725 vaddr_t 726 pmap_tmpmap_pa(paddr_t pa) 727 { 728 if (cpu_pae) 729 return pmap_tmpmap_pa_pae(pa); 730 731 return pmap_tmpmap_pa_86(pa); 732 } 733 734 /* 735 * pmap_tmpunmap_pa: unmap a tmp use page (undoes pmap_tmpmap_pa) 736 */ 737 738 void 739 pmap_tmpunmap_pa_86(void) 740 { 741 #ifdef MULTIPROCESSOR 742 int id = cpu_number(); 743 #endif 744 pt_entry_t *ptpte; 745 caddr_t ptpva; 746 747 ptpte = PTESLEW(ptp_pte, id); 748 ptpva = VASLEW(pmap_ptpp, id); 749 750 #if defined(DIAGNOSTIC) 751 if (!pmap_valid_entry(*ptpte)) 752 panic("pmap_tmpunmap_pa: our pte invalid?"); 753 #endif 754 755 *ptpte = 0; 756 pmap_update_pg((vaddr_t)ptpva); 757 #ifdef MULTIPROCESSOR 758 /* 759 * No need for tlb shootdown here, since ptp_pte is per-CPU. 760 */ 761 #endif 762 } 763 764 void 765 pmap_tmpunmap_pa(void) 766 { 767 if (cpu_pae) { 768 pmap_tmpunmap_pa_pae(); 769 return; 770 } 771 772 pmap_tmpunmap_pa_86(); 773 } 774 775 paddr_t 776 vtophys(vaddr_t va) 777 { 778 if (cpu_pae) 779 return vtophys_pae(va); 780 else 781 return ((*vtopte(va) & PG_FRAME) | (va & ~PG_FRAME)); 782 } 783 784 void 785 setcslimit(struct pmap *pm, struct trapframe *tf, struct pcb *pcb, 786 vaddr_t limit) 787 { 788 /* 789 * Called when we have a new 'highest executable' va, so we need 790 * to update the value for the code segment limit, which is stored 791 * in the PCB. 
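	 *
	 * The descriptor is page-granular, so the limit ends up stored
	 * as a page count: picking an arbitrary address, a limit of
	 * 0x0804a000 becomes atop(0x0804a000) == 0x804a pages, and %cs
	 * then covers VAs 0 through 0x0804afff.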
792 * 793 * There are no caching issues to be concerned with: the 794 * processor reads the whole descriptor from the GDT when the 795 * appropriate selector is loaded into a segment register, and 796 * this only happens on the return to userland. 797 * 798 * This also works in the MP case, since whichever CPU gets to 799 * run the process will pick up the right descriptor value from 800 * the PCB. 801 */ 802 limit = min(limit, VM_MAXUSER_ADDRESS - 1); 803 804 setsegment(&pm->pm_codeseg, 0, atop(limit), 805 SDT_MEMERA, SEL_UPL, 1, 1); 806 807 /* And update the GDT since we may be called by the 808 * trap handler (cpu_switch won't get a chance). 809 */ 810 curcpu()->ci_gdt[GUCODE_SEL].sd = pm->pm_codeseg; 811 812 tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL); 813 } 814 815 /* 816 * p m a p k e n t e r f u n c t i o n s 817 * 818 * functions to quickly enter/remove pages from the kernel address 819 * space. pmap_kremove is exported to MI kernel. we make use of 820 * the recursive PTE mappings. 821 */ 822 823 /* 824 * pmap_kenter_pa: enter a kernel mapping without R/M (pv_entry) tracking 825 * 826 * => no need to lock anything, assume va is already allocated 827 * => should be faster than normal pmap enter function 828 */ 829 830 void 831 pmap_kenter_pa(vaddr_t va, paddr_t pa, vm_prot_t prot) 832 { 833 uint32_t bits; 834 uint32_t global = 0; 835 836 /* special 1:1 mappings in the first large page must not be global */ 837 if (!cpu_pae) { 838 if (va >= (vaddr_t)NBPD) /* 4MB pages on non-PAE */ 839 global = pmap_pg_g; 840 } else { 841 if (va >= (vaddr_t)NBPD / 2) /* 2MB pages on PAE */ 842 global = pmap_pg_g; 843 } 844 845 bits = pmap_pte_set(va, pa, ((prot & PROT_WRITE) ? PG_RW : PG_RO) | 846 PG_V | global | PG_U | PG_M | 847 ((prot & PROT_EXEC) ? PG_X : 0) | 848 ((pa & PMAP_NOCACHE) ? PG_N : 0) | 849 ((pa & PMAP_WC) ? pmap_pg_wc : 0)); 850 if (pmap_valid_entry(bits)) { 851 if (pa & PMAP_NOCACHE && (bits & PG_N) == 0) 852 wbinvd_on_all_cpus(); 853 /* NB. - this should not happen. */ 854 pmap_tlb_shootpage(pmap_kernel(), va); 855 pmap_tlb_shootwait(); 856 } 857 } 858 859 /* 860 * pmap_kremove: remove a kernel mapping(s) without R/M (pv_entry) tracking 861 * 862 * => no need to lock anything 863 * => caller must dispose of any vm_page mapped in the va range 864 * => note: not an inline function 865 * => we assume the va is page aligned and the len is a multiple of PAGE_SIZE 866 */ 867 868 void 869 pmap_kremove(vaddr_t sva, vsize_t len) 870 { 871 uint32_t bits; 872 vaddr_t va, eva; 873 874 eva = sva + len; 875 876 for (va = sva; va != eva; va += PAGE_SIZE) { 877 bits = pmap_pte_set(va, 0, 0); 878 #ifdef DIAGNOSTIC 879 if (bits & PG_PVLIST) 880 panic("pmap_kremove: PG_PVLIST mapping for 0x%lx", va); 881 #endif 882 } 883 pmap_tlb_shootrange(pmap_kernel(), sva, eva); 884 pmap_tlb_shootwait(); 885 } 886 887 /* 888 * Allocate a new PD for Intel's U-K. 889 */ 890 void 891 pmap_alloc_pdir_intel_x86(struct pmap *pmap) 892 { 893 vaddr_t va; 894 895 KASSERT(pmap->pm_pdir_intel == 0); 896 897 va = (vaddr_t)km_alloc(NBPG, &kv_any, &kp_zero, &kd_waitok); 898 if (va == 0) 899 panic("kernel_map out of virtual space"); 900 pmap->pm_pdir_intel = va; 901 if (!pmap_extract(pmap_kernel(), (vaddr_t)pmap->pm_pdir_intel, 902 &pmap->pm_pdirpa_intel)) 903 panic("can't locate PD page"); 904 } 905 906 /* 907 * p m a p i n i t f u n c t i o n s 908 * 909 * pmap_bootstrap and pmap_init are called during system startup 910 * to init the pmap module. pmap_bootstrap() does a low level 911 * init just to get things rolling. 
pmap_init() finishes the job. 912 */ 913 914 /* 915 * pmap_bootstrap: get the system in a state where it can run with VM 916 * properly enabled (called before main()). the VM system is 917 * fully init'd later... 918 * 919 * => on i386, locore.s has already enabled the MMU by allocating 920 * a PDP for the kernel, and nkpde PTPs for the kernel. 921 * => kva_start is the first free virtual address in kernel space 922 */ 923 924 void 925 pmap_bootstrap(vaddr_t kva_start) 926 { 927 struct pmap *kpm; 928 vaddr_t kva; 929 pt_entry_t *pte; 930 931 /* 932 * set the page size (default value is 4K which is ok) 933 */ 934 935 uvm_setpagesize(); 936 937 /* 938 * a quick sanity check 939 */ 940 941 if (PAGE_SIZE != NBPG) 942 panic("pmap_bootstrap: PAGE_SIZE != NBPG"); 943 944 /* 945 * set up our local static global vars that keep track of the 946 * usage of KVM before kernel_map is set up 947 */ 948 949 virtual_avail = kva_start; /* first free KVA */ 950 virtual_end = VM_MAX_KERNEL_ADDRESS; /* last KVA */ 951 952 /* 953 * set up protection_codes: we need to be able to convert from 954 * a MI protection code (some combo of VM_PROT...) to something 955 * we can jam into a i386 PTE. 956 */ 957 958 protection_codes[PROT_NONE] = 0; /* --- */ 959 protection_codes[PROT_EXEC] = PG_X; /* --x */ 960 protection_codes[PROT_READ] = PG_RO; /* -r- */ 961 protection_codes[PROT_READ | PROT_EXEC] = PG_X; /* -rx */ 962 protection_codes[PROT_WRITE] = PG_RW; /* w-- */ 963 protection_codes[PROT_WRITE | PROT_EXEC] = PG_RW|PG_X; /* w-x */ 964 protection_codes[PROT_READ | PROT_WRITE] = PG_RW; /* wr- */ 965 protection_codes[PROT_READ | PROT_WRITE | PROT_EXEC] = PG_RW|PG_X; /* wrx */ 966 967 /* 968 * now we init the kernel's pmap 969 * 970 * the kernel pmap's pm_obj is not used for much. however, in 971 * user pmaps the pm_obj contains the list of active PTPs. 972 * the pm_obj currently does not have a pager. it might be possible 973 * to add a pager that would allow a process to read-only mmap its 974 * own page tables (fast user level vtophys?). this may or may not 975 * be useful. 976 */ 977 978 kpm = pmap_kernel(); 979 mtx_init(&kpm->pm_mtx, -1); /* must not be used */ 980 mtx_init(&kpm->pm_apte_mtx, IPL_VM); 981 uvm_obj_init(&kpm->pm_obj, &pmap_pager, 1); 982 bzero(&kpm->pm_list, sizeof(kpm->pm_list)); /* pm_list not used */ 983 kpm->pm_pdir = (vaddr_t)(proc0.p_addr->u_pcb.pcb_cr3 + KERNBASE); 984 kpm->pm_pdirpa = proc0.p_addr->u_pcb.pcb_cr3; 985 kpm->pm_pdir_intel = 0; 986 kpm->pm_pdirpa_intel = 0; 987 kpm->pm_stats.wired_count = kpm->pm_stats.resident_count = 988 atop(kva_start - VM_MIN_KERNEL_ADDRESS); 989 990 /* 991 * the above is just a rough estimate and not critical to the proper 992 * operation of the system. 993 */ 994 995 /* 996 * enable global TLB entries if they are supported and the 997 * CPU is not affected by Meltdown. 998 */ 999 1000 if (cpu_feature & CPUID_PGE) { 1001 lcr4(rcr4() | CR4_PGE); /* enable hardware (via %cr4) */ 1002 pmap_pg_g = pg_g_kern; /* if safe to use, enable software */ 1003 1004 /* add PG_G attribute to already mapped kernel pages */ 1005 for (kva = VM_MIN_KERNEL_ADDRESS; kva < virtual_avail; 1006 kva += PAGE_SIZE) 1007 if (pmap_valid_entry(PTE_BASE[atop(kva)])) 1008 PTE_BASE[atop(kva)] |= pmap_pg_g; 1009 } 1010 1011 /* 1012 * now we allocate the "special" VAs which are used for tmp mappings 1013 * by the pmap (and other modules). we allocate the VAs by advancing 1014 * virtual_avail (note that there are no pages mapped at these VAs). 
1015 * we find the PTE that maps the allocated VA via the linear PTE 1016 * mapping. 1017 */ 1018 1019 pte = PTE_BASE + atop(virtual_avail); 1020 1021 #ifdef MULTIPROCESSOR 1022 /* 1023 * Waste some VA space to avoid false sharing of cache lines 1024 * for page table pages: Give each possible CPU a cache line 1025 * of PTEs (16) to play with, though we only need 4. We could 1026 * recycle some of this waste by putting the idle stacks here 1027 * as well; we could waste less space if we knew the largest 1028 * CPU ID beforehand. 1029 */ 1030 pmap_csrcp = (caddr_t) virtual_avail; csrc_pte = pte; 1031 1032 pmap_cdstp = (caddr_t) virtual_avail+PAGE_SIZE; cdst_pte = pte+1; 1033 1034 pmap_zerop = (caddr_t) virtual_avail+PAGE_SIZE*2; zero_pte = pte+2; 1035 1036 pmap_ptpp = (caddr_t) virtual_avail+PAGE_SIZE*3; ptp_pte = pte+3; 1037 1038 pmap_flshp = (caddr_t) virtual_avail+PAGE_SIZE*4; flsh_pte = pte+4; 1039 1040 virtual_avail += PAGE_SIZE * MAXCPUS * NPTECL; 1041 pte += MAXCPUS * NPTECL; 1042 #else 1043 pmap_csrcp = (caddr_t) virtual_avail; csrc_pte = pte; /* allocate */ 1044 virtual_avail += PAGE_SIZE; pte++; /* advance */ 1045 1046 pmap_cdstp = (caddr_t) virtual_avail; cdst_pte = pte; 1047 virtual_avail += PAGE_SIZE; pte++; 1048 1049 pmap_zerop = (caddr_t) virtual_avail; zero_pte = pte; 1050 virtual_avail += PAGE_SIZE; pte++; 1051 1052 pmap_ptpp = (caddr_t) virtual_avail; ptp_pte = pte; 1053 virtual_avail += PAGE_SIZE; pte++; 1054 1055 pmap_flshp = (caddr_t) virtual_avail; flsh_pte = pte; 1056 virtual_avail += PAGE_SIZE; pte++; 1057 #endif 1058 1059 /* XXX: vmmap used by mem.c... should be uvm_map_reserve */ 1060 vmmap = (char *)virtual_avail; /* don't need pte */ 1061 virtual_avail += PAGE_SIZE; 1062 1063 msgbufp = (struct msgbuf *)virtual_avail; /* don't need pte */ 1064 virtual_avail += round_page(MSGBUFSIZE); pte++; 1065 1066 bootargp = (bootarg_t *)virtual_avail; 1067 virtual_avail += round_page(bootargc); pte++; 1068 1069 /* 1070 * now we reserve some VM for mapping pages when doing a crash dump 1071 */ 1072 1073 virtual_avail = reserve_dumppages(virtual_avail); 1074 1075 /* 1076 * init the static-global locks and global lists. 1077 */ 1078 1079 LIST_INIT(&pmaps); 1080 1081 /* 1082 * initialize the pmap pool. 1083 */ 1084 1085 pool_init(&pmap_pmap_pool, sizeof(struct pmap), 32, IPL_NONE, 0, 1086 "pmappl", NULL); 1087 pool_init(&pmap_pv_pool, sizeof(struct pv_entry), 0, IPL_VM, 0, 1088 "pvpl", &pmap_pv_page_allocator); 1089 1090 /* 1091 * ensure the TLB is sync'd with reality by flushing it... 1092 */ 1093 1094 tlbflush(); 1095 } 1096 1097 /* 1098 * Pre-allocate PTP 0 for low memory, so that 1:1 mappings for various 1099 * trampoline code can be entered. 1100 */ 1101 void 1102 pmap_prealloc_lowmem_ptp(void) 1103 { 1104 pt_entry_t *pte, npte; 1105 vaddr_t ptpva = (vaddr_t)vtopte(0); 1106 1107 /* If PAE, use the PAE-specific preallocator */ 1108 if (cpu_pae) { 1109 pmap_prealloc_lowmem_ptp_pae(); 1110 return; 1111 } 1112 1113 /* enter pa for pte 0 into recursive map */ 1114 pte = vtopte(ptpva); 1115 npte = PTP0_PA | PG_RW | PG_V | PG_U | PG_M; 1116 1117 i386_atomic_testset_ul(pte, npte); 1118 1119 /* make sure it is clean before using */ 1120 memset((void *)ptpva, 0, NBPG); 1121 } 1122 1123 /* 1124 * pmap_init: called from uvm_init, our job is to get the pmap 1125 * system ready to manage mappings... this mainly means initing 1126 * the pv_entry stuff. 
1127 */ 1128 1129 void 1130 pmap_init(void) 1131 { 1132 /* 1133 * prime the pool with pv_entry structures to allow us to get 1134 * the kmem_map allocated and inited (done after this function 1135 * is finished). we do this by setting a low water mark such 1136 * that we are more likely to have these around in extreme 1137 * memory starvation. 1138 */ 1139 1140 pool_setlowat(&pmap_pv_pool, PVE_LOWAT); 1141 pool_sethiwat(&pmap_pv_pool, PVE_HIWAT); 1142 1143 /* 1144 * done: pmap module is up (and ready for business) 1145 */ 1146 1147 pmap_initialized = 1; 1148 } 1149 1150 /* 1151 * p v _ e n t r y f u n c t i o n s 1152 */ 1153 1154 void * 1155 pmap_pv_page_alloc(struct pool *pp, int flags, int *slowdown) 1156 { 1157 struct kmem_dyn_mode kd = KMEM_DYN_INITIALIZER; 1158 1159 kd.kd_waitok = ISSET(flags, PR_WAITOK); 1160 kd.kd_slowdown = slowdown; 1161 1162 return (km_alloc(pp->pr_pgsize, 1163 pmap_initialized ? &kv_page : &kv_any, pp->pr_crange, &kd)); 1164 } 1165 1166 void 1167 pmap_pv_page_free(struct pool *pp, void *v) 1168 { 1169 km_free(v, pp->pr_pgsize, &kv_page, pp->pr_crange); 1170 } 1171 1172 /* 1173 * main pv_entry manipulation functions: 1174 * pmap_enter_pv: enter a mapping onto a pv list 1175 * pmap_remove_pv: remove a mapping from a pv list 1176 */ 1177 1178 /* 1179 * pmap_enter_pv: enter a mapping onto a pv list 1180 * 1181 * => caller should have pmap locked 1182 * => we will gain the lock on the pv and allocate the new pv_entry 1183 * => caller should adjust ptp's wire_count before calling 1184 * 1185 * pve: preallocated pve for us to use 1186 * ptp: PTP in pmap that maps this VA 1187 */ 1188 1189 void 1190 pmap_enter_pv(struct vm_page *pg, struct pv_entry *pve, struct pmap *pmap, 1191 vaddr_t va, struct vm_page *ptp) 1192 { 1193 pve->pv_pmap = pmap; 1194 pve->pv_va = va; 1195 pve->pv_ptp = ptp; /* NULL for kernel pmap */ 1196 mtx_enter(&pg->mdpage.pv_mtx); 1197 pve->pv_next = pg->mdpage.pv_list; /* add to ... */ 1198 pg->mdpage.pv_list = pve; /* ... locked list */ 1199 mtx_leave(&pg->mdpage.pv_mtx); 1200 } 1201 1202 /* 1203 * pmap_remove_pv: try to remove a mapping from a pv_list 1204 * 1205 * => pmap should be locked 1206 * => caller should hold lock on pv [so that attrs can be adjusted] 1207 * => caller should adjust ptp's wire_count and free PTP if needed 1208 * => we return the removed pve 1209 */ 1210 1211 struct pv_entry * 1212 pmap_remove_pv(struct vm_page *pg, struct pmap *pmap, vaddr_t va) 1213 { 1214 struct pv_entry *pve, **prevptr; 1215 1216 mtx_enter(&pg->mdpage.pv_mtx); 1217 prevptr = &pg->mdpage.pv_list; /* previous pv_entry pointer */ 1218 while ((pve = *prevptr) != NULL) { 1219 if (pve->pv_pmap == pmap && pve->pv_va == va) { /* match? */ 1220 *prevptr = pve->pv_next; /* remove it! 
*/ 1221 break; 1222 } 1223 prevptr = &pve->pv_next; /* previous pointer */ 1224 } 1225 mtx_leave(&pg->mdpage.pv_mtx); 1226 return(pve); /* return removed pve */ 1227 } 1228 1229 /* 1230 * p t p f u n c t i o n s 1231 */ 1232 1233 /* 1234 * pmap_alloc_ptp: allocate a PTP for a PMAP 1235 * 1236 * => pmap should already be locked by caller 1237 * => we use the ptp's wire_count to count the number of active mappings 1238 * in the PTP (we start it at one to prevent any chance this PTP 1239 * will ever leak onto the active/inactive queues) 1240 */ 1241 1242 struct vm_page * 1243 pmap_alloc_ptp_86(struct pmap *pmap, int pde_index, pt_entry_t pde_flags) 1244 { 1245 struct vm_page *ptp; 1246 pd_entry_t *pva_intel; 1247 1248 ptp = uvm_pagealloc(&pmap->pm_obj, ptp_i2o(pde_index), NULL, 1249 UVM_PGA_USERESERVE|UVM_PGA_ZERO); 1250 if (ptp == NULL) 1251 return (NULL); 1252 1253 /* got one! */ 1254 atomic_clearbits_int(&ptp->pg_flags, PG_BUSY); 1255 ptp->wire_count = 1; /* no mappings yet */ 1256 PDE(pmap, pde_index) = (pd_entry_t)(VM_PAGE_TO_PHYS(ptp) | 1257 PG_RW | PG_V | PG_M | PG_U | pde_flags); 1258 1259 /* 1260 * Meltdown special case - if we are adding a new PDE for 1261 * usermode addresses, just copy the PDE to the U-K page 1262 * table. 1263 */ 1264 if (pmap->pm_pdir_intel && ptp_i2v(pde_index) < VM_MAXUSER_ADDRESS) { 1265 pva_intel = (pd_entry_t *)pmap->pm_pdir_intel; 1266 pva_intel[pde_index] = PDE(pmap, pde_index); 1267 DPRINTF("%s: copying usermode PDE (content=0x%x) pde_index %d " 1268 "from 0x%x -> 0x%x\n", __func__, PDE(pmap, pde_index), 1269 pde_index, (uint32_t)&PDE(pmap, pde_index), 1270 (uint32_t)&(pva_intel[pde_index])); 1271 } 1272 1273 pmap->pm_stats.resident_count++; /* count PTP as resident */ 1274 pmap->pm_ptphint = ptp; 1275 return(ptp); 1276 } 1277 1278 /* 1279 * pmap_get_ptp: get a PTP (if there isn't one, allocate a new one) 1280 * 1281 * => pmap should NOT be pmap_kernel() 1282 * => pmap should be locked 1283 */ 1284 1285 struct vm_page * 1286 pmap_get_ptp_86(struct pmap *pmap, int pde_index) 1287 { 1288 struct vm_page *ptp; 1289 1290 if (pmap_valid_entry(PDE(pmap, pde_index))) { 1291 /* valid... check hint (saves us a PA->PG lookup) */ 1292 if (pmap->pm_ptphint && 1293 (PDE(pmap, pde_index) & PG_FRAME) == 1294 VM_PAGE_TO_PHYS(pmap->pm_ptphint)) 1295 return(pmap->pm_ptphint); 1296 1297 ptp = uvm_pagelookup(&pmap->pm_obj, ptp_i2o(pde_index)); 1298 #ifdef DIAGNOSTIC 1299 if (ptp == NULL) 1300 panic("pmap_get_ptp_86: unmanaged user PTP"); 1301 #endif 1302 pmap->pm_ptphint = ptp; 1303 return(ptp); 1304 } 1305 1306 /* allocate a new PTP (updates ptphint) */ 1307 return (pmap_alloc_ptp_86(pmap, pde_index, PG_u)); 1308 } 1309 1310 void 1311 pmap_drop_ptp_86(struct pmap *pm, vaddr_t va, struct vm_page *ptp, 1312 pt_entry_t *ptes) 1313 { 1314 pd_entry_t *pva_intel; 1315 1316 i386_atomic_testset_ul(&PDE(pm, pdei(va)), 0); 1317 pmap_tlb_shootpage(curcpu()->ci_curpmap, ((vaddr_t)ptes) + ptp->offset); 1318 #ifdef MULTIPROCESSOR 1319 /* 1320 * Always shoot down the other pmap's 1321 * self-mapping of the PTP. 1322 */ 1323 pmap_tlb_shootpage(pm, ((vaddr_t)PTE_BASE) + ptp->offset); 1324 #endif 1325 pm->pm_stats.resident_count--; 1326 /* update hint */ 1327 if (pm->pm_ptphint == ptp) 1328 pm->pm_ptphint = RBT_ROOT(uvm_objtree, &pm->pm_obj.memt); 1329 ptp->wire_count = 0; 1330 /* Postpone free to after shootdown. 
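	 * The caller is expected to queue the PTP on a local list and
	 * uvm_pagefree() it only after pmap_tlb_shootwait(); see
	 * pmap_do_remove_86() and pmap_page_remove_86() below.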
*/ 1331 uvm_pagerealloc(ptp, NULL, 0); 1332 1333 if (pm->pm_pdir_intel) { 1334 KASSERT(va < VM_MAXUSER_ADDRESS); 1335 /* Zap special meltdown PDE */ 1336 pva_intel = (pd_entry_t *)pm->pm_pdir_intel; 1337 i386_atomic_testset_ul(&pva_intel[pdei(va)], 0); 1338 DPRINTF("%s: cleared meltdown PDE @ index %lu " 1339 "(va range start 0x%x)\n", __func__, pdei(va), 1340 (uint32_t)va); 1341 } 1342 } 1343 1344 /* 1345 * p m a p l i f e c y c l e f u n c t i o n s 1346 */ 1347 1348 /* 1349 * pmap_create: create a pmap 1350 * 1351 * => note: old pmap interface took a "size" args which allowed for 1352 * the creation of "software only" pmaps (not in bsd). 1353 */ 1354 1355 struct pmap * 1356 pmap_create(void) 1357 { 1358 struct pmap *pmap; 1359 1360 pmap = pool_get(&pmap_pmap_pool, PR_WAITOK); 1361 1362 mtx_init(&pmap->pm_mtx, IPL_VM); 1363 mtx_init(&pmap->pm_apte_mtx, IPL_VM); 1364 1365 /* init uvm_object */ 1366 uvm_obj_init(&pmap->pm_obj, &pmap_pager, 1); 1367 pmap->pm_stats.wired_count = 0; 1368 pmap->pm_stats.resident_count = 1; /* count the PDP allocd below */ 1369 pmap->pm_ptphint = NULL; 1370 pmap->pm_hiexec = 0; 1371 pmap->pm_flags = 0; 1372 pmap->pm_pdir_intel = 0; 1373 pmap->pm_pdirpa_intel = 0; 1374 1375 initcodesegment(&pmap->pm_codeseg); 1376 1377 pmap_pinit_pd(pmap); 1378 return (pmap); 1379 } 1380 1381 void 1382 pmap_pinit_pd_86(struct pmap *pmap) 1383 { 1384 /* allocate PDP */ 1385 pmap->pm_pdir = (vaddr_t)km_alloc(NBPG, &kv_any, &kp_dirty, &kd_waitok); 1386 if (pmap->pm_pdir == 0) 1387 panic("kernel_map out of virtual space"); 1388 pmap_extract(pmap_kernel(), (vaddr_t)pmap->pm_pdir, 1389 &pmap->pm_pdirpa); 1390 pmap->pm_pdirsize = NBPG; 1391 1392 /* init PDP */ 1393 /* zero init area */ 1394 bzero((void *)pmap->pm_pdir, PDSLOT_PTE * sizeof(pd_entry_t)); 1395 /* put in recursive PDE to map the PTEs */ 1396 PDE(pmap, PDSLOT_PTE) = pmap->pm_pdirpa | PG_V | PG_KW | PG_U | PG_M; 1397 PDE(pmap, PDSLOT_PTE + 1) = 0; 1398 1399 /* 1400 * we need to lock pmaps_lock to prevent nkpde from changing on 1401 * us. note that there is no need to splvm to protect us from 1402 * malloc since malloc allocates out of a submap and we should have 1403 * already allocated kernel PTPs to cover the range... 1404 */ 1405 /* put in kernel VM PDEs */ 1406 bcopy(&PDP_BASE[PDSLOT_KERN], &PDE(pmap, PDSLOT_KERN), 1407 nkpde * sizeof(pd_entry_t)); 1408 /* zero the rest */ 1409 bzero(&PDE(pmap, PDSLOT_KERN + nkpde), 1410 NBPG - ((PDSLOT_KERN + nkpde) * sizeof(pd_entry_t))); 1411 1412 /* 1413 * Intel CPUs need a special page table to be used during usermode 1414 * execution, one that lacks all kernel mappings. 1415 */ 1416 if (cpu_meltdown) { 1417 pmap_alloc_pdir_intel_x86(pmap); 1418 1419 /* Copy PDEs from pmap_kernel's U-K view */ 1420 bcopy((void *)pmap_kernel()->pm_pdir_intel, 1421 (void *)pmap->pm_pdir_intel, NBPG); 1422 1423 DPRINTF("%s: pmap %p pm_pdir 0x%lx pm_pdirpa 0x%lx " 1424 "pdir_intel 0x%lx pdirpa_intel 0x%lx\n", 1425 __func__, pmap, pmap->pm_pdir, pmap->pm_pdirpa, 1426 pmap->pm_pdir_intel, pmap->pm_pdirpa_intel); 1427 } 1428 1429 mtx_enter(&pmaps_lock); 1430 LIST_INSERT_HEAD(&pmaps, pmap, pm_list); 1431 mtx_leave(&pmaps_lock); 1432 } 1433 1434 /* 1435 * pmap_destroy: drop reference count on pmap. free pmap if 1436 * reference count goes to zero. 
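 *
 * (pmap_create() starts the count at one; any other code that holds
 * on to a pmap pointer, e.g. pmap_page_remove_86() below, takes its
 * own reference with pmap_reference() and drops it with
 * pmap_destroy())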
1437 */ 1438 1439 void 1440 pmap_destroy(struct pmap *pmap) 1441 { 1442 struct vm_page *pg; 1443 int refs; 1444 1445 refs = atomic_dec_int_nv(&pmap->pm_obj.uo_refs); 1446 if (refs > 0) 1447 return; 1448 1449 #ifdef MULTIPROCESSOR 1450 pmap_tlb_droppmap(pmap); 1451 #endif 1452 1453 mtx_enter(&pmaps_lock); 1454 LIST_REMOVE(pmap, pm_list); 1455 mtx_leave(&pmaps_lock); 1456 1457 /* Free any remaining PTPs. */ 1458 while ((pg = RBT_ROOT(uvm_objtree, &pmap->pm_obj.memt)) != NULL) { 1459 pg->wire_count = 0; 1460 uvm_pagefree(pg); 1461 } 1462 1463 km_free((void *)pmap->pm_pdir, pmap->pm_pdirsize, &kv_any, &kp_dirty); 1464 pmap->pm_pdir = 0; 1465 1466 if (pmap->pm_pdir_intel) { 1467 km_free((void *)pmap->pm_pdir_intel, pmap->pm_pdirsize, 1468 &kv_any, &kp_dirty); 1469 pmap->pm_pdir_intel = 0; 1470 } 1471 1472 pool_put(&pmap_pmap_pool, pmap); 1473 } 1474 1475 1476 /* 1477 * Add a reference to the specified pmap. 1478 */ 1479 1480 void 1481 pmap_reference(struct pmap *pmap) 1482 { 1483 atomic_inc_int(&pmap->pm_obj.uo_refs); 1484 } 1485 1486 void 1487 pmap_activate(struct proc *p) 1488 { 1489 KASSERT(curproc == p); 1490 KASSERT(&p->p_addr->u_pcb == curpcb); 1491 pmap_switch(NULL, p); 1492 } 1493 1494 int nlazy_cr3_hit; 1495 int nlazy_cr3; 1496 1497 void 1498 pmap_switch(struct proc *o, struct proc *p) 1499 { 1500 struct pcb *pcb = &p->p_addr->u_pcb; 1501 struct pmap *pmap, *opmap; 1502 struct cpu_info *self = curcpu(); 1503 1504 pmap = p->p_vmspace->vm_map.pmap; 1505 opmap = self->ci_curpmap; 1506 1507 pcb->pcb_pmap = pmap; 1508 pcb->pcb_cr3 = pmap->pm_pdirpa; 1509 1510 if (opmap == pmap) { 1511 if (pmap != pmap_kernel()) 1512 nlazy_cr3_hit++; 1513 } else if (o != NULL && pmap == pmap_kernel()) { 1514 nlazy_cr3++; 1515 } else { 1516 self->ci_curpmap = pmap; 1517 lcr3(pmap->pm_pdirpa); 1518 } 1519 1520 /* 1521 * Meltdown: iff we're doing separate U+K and U-K page tables, 1522 * then record them in cpu_info for easy access in syscall and 1523 * interrupt trampolines. 1524 */ 1525 if (pmap->pm_pdirpa_intel) { 1526 self->ci_kern_cr3 = pmap->pm_pdirpa; 1527 self->ci_user_cr3 = pmap->pm_pdirpa_intel; 1528 } 1529 1530 /* 1531 * Set the correct descriptor value (i.e. with the 1532 * correct code segment X limit) in the GDT. 1533 */ 1534 self->ci_gdt[GUCODE_SEL].sd = pmap->pm_codeseg; 1535 self->ci_gdt[GUFS_SEL].sd = pcb->pcb_threadsegs[TSEG_FS]; 1536 self->ci_gdt[GUGS_SEL].sd = pcb->pcb_threadsegs[TSEG_GS]; 1537 } 1538 1539 void 1540 pmap_deactivate(struct proc *p) 1541 { 1542 } 1543 1544 /* 1545 * pmap_extract: extract a PA for the given VA 1546 */ 1547 1548 int 1549 pmap_extract_86(struct pmap *pmap, vaddr_t va, paddr_t *pap) 1550 { 1551 pt_entry_t *ptes, pte; 1552 1553 ptes = pmap_map_ptes_86(pmap); 1554 if (pmap_valid_entry(PDE(pmap, pdei(va)))) { 1555 pte = ptes[atop(va)]; 1556 pmap_unmap_ptes_86(pmap); 1557 if (!pmap_valid_entry(pte)) 1558 return 0; 1559 if (pap != NULL) 1560 *pap = (pte & PG_FRAME) | (va & ~PG_FRAME); 1561 return 1; 1562 } 1563 pmap_unmap_ptes_86(pmap); 1564 return 0; 1565 } 1566 1567 /* 1568 * pmap_virtual_space: used during bootup [uvm_pageboot_alloc] to 1569 * determine the bounds of the kernel virtual address space. 
1570 */ 1571 1572 void 1573 pmap_virtual_space(vaddr_t *startp, vaddr_t *endp) 1574 { 1575 *startp = virtual_avail; 1576 *endp = virtual_end; 1577 } 1578 1579 /* 1580 * pmap_zero_page: zero a page 1581 */ 1582 void (*pagezero)(void *, size_t) = bzero; 1583 1584 void 1585 pmap_zero_page(struct vm_page *pg) 1586 { 1587 pmap_zero_phys(VM_PAGE_TO_PHYS(pg)); 1588 } 1589 1590 /* 1591 * pmap_zero_phys: same as pmap_zero_page, but for use before vm_pages are 1592 * initialized. 1593 */ 1594 void 1595 pmap_zero_phys_86(paddr_t pa) 1596 { 1597 #ifdef MULTIPROCESSOR 1598 int id = cpu_number(); 1599 #endif 1600 pt_entry_t *zpte = PTESLEW(zero_pte, id); 1601 caddr_t zerova = VASLEW(pmap_zerop, id); 1602 1603 #ifdef DIAGNOSTIC 1604 if (*zpte) 1605 panic("pmap_zero_phys_86: lock botch"); 1606 #endif 1607 1608 *zpte = (pa & PG_FRAME) | PG_V | PG_RW; /* map in */ 1609 pmap_update_pg((vaddr_t)zerova); /* flush TLB */ 1610 pagezero(zerova, PAGE_SIZE); /* zero */ 1611 *zpte = 0; 1612 } 1613 1614 /* 1615 * pmap_flush_cache: flush the cache for a virtual address. 1616 */ 1617 void 1618 pmap_flush_cache(vaddr_t addr, vsize_t len) 1619 { 1620 vaddr_t i; 1621 1622 if (curcpu()->ci_cflushsz == 0) { 1623 wbinvd_on_all_cpus(); 1624 return; 1625 } 1626 1627 mfence(); 1628 for (i = addr; i < addr + len; i += curcpu()->ci_cflushsz) 1629 clflush(i); 1630 mfence(); 1631 } 1632 1633 void 1634 pmap_flush_page(paddr_t pa) 1635 { 1636 #ifdef MULTIPROCESSOR 1637 int id = cpu_number(); 1638 #endif 1639 pt_entry_t *pte; 1640 caddr_t va; 1641 1642 KDASSERT(PHYS_TO_VM_PAGE(pa) != NULL); 1643 1644 if (cpu_pae) { 1645 pmap_flush_page_pae(pa); 1646 return; 1647 } 1648 1649 pte = PTESLEW(flsh_pte, id); 1650 va = VASLEW(pmap_flshp, id); 1651 1652 #ifdef DIAGNOSTIC 1653 if (*pte) 1654 panic("pmap_flush_page: lock botch"); 1655 #endif 1656 1657 *pte = (pa & PG_FRAME) | PG_V | PG_RW; 1658 pmap_update_pg(va); 1659 pmap_flush_cache((vaddr_t)va, PAGE_SIZE); 1660 *pte = 0; 1661 pmap_update_pg(va); 1662 } 1663 1664 /* 1665 * pmap_copy_page: copy a page 1666 */ 1667 1668 void 1669 pmap_copy_page_86(struct vm_page *srcpg, struct vm_page *dstpg) 1670 { 1671 paddr_t srcpa = VM_PAGE_TO_PHYS(srcpg); 1672 paddr_t dstpa = VM_PAGE_TO_PHYS(dstpg); 1673 #ifdef MULTIPROCESSOR 1674 int id = cpu_number(); 1675 #endif 1676 pt_entry_t *spte = PTESLEW(csrc_pte, id); 1677 pt_entry_t *dpte = PTESLEW(cdst_pte, id); 1678 caddr_t csrcva = VASLEW(pmap_csrcp, id); 1679 caddr_t cdstva = VASLEW(pmap_cdstp, id); 1680 1681 #ifdef DIAGNOSTIC 1682 if (*spte || *dpte) 1683 panic("pmap_copy_page_86: lock botch"); 1684 #endif 1685 1686 *spte = (srcpa & PG_FRAME) | PG_V | PG_RW; 1687 *dpte = (dstpa & PG_FRAME) | PG_V | PG_RW; 1688 pmap_update_2pg((vaddr_t)csrcva, (vaddr_t)cdstva); 1689 bcopy(csrcva, cdstva, PAGE_SIZE); 1690 *spte = *dpte = 0; 1691 pmap_update_2pg((vaddr_t)csrcva, (vaddr_t)cdstva); 1692 } 1693 1694 /* 1695 * p m a p r e m o v e f u n c t i o n s 1696 * 1697 * functions that remove mappings 1698 */ 1699 1700 /* 1701 * pmap_remove_ptes: remove PTEs from a PTP 1702 * 1703 * => caller must hold pmap's lock 1704 * => PTP must be mapped into KVA 1705 * => PTP should be null if pmap == pmap_kernel() 1706 */ 1707 1708 void 1709 pmap_remove_ptes_86(struct pmap *pmap, struct vm_page *ptp, vaddr_t ptpva, 1710 vaddr_t startva, vaddr_t endva, int flags, struct pv_entry **free_pvs) 1711 { 1712 struct pv_entry *pve; 1713 pt_entry_t *pte = (pt_entry_t *) ptpva; 1714 struct vm_page *pg; 1715 pt_entry_t opte; 1716 1717 /* 1718 * note that ptpva points to the PTE that maps 
startva. this may 1719 * or may not be the first PTE in the PTP. 1720 * 1721 * we loop through the PTP while there are still PTEs to look at 1722 * and the wire_count is greater than 1 (because we use the wire_count 1723 * to keep track of the number of real PTEs in the PTP). 1724 */ 1725 1726 for (/*null*/; startva < endva && (ptp == NULL || ptp->wire_count > 1) 1727 ; pte++, startva += NBPG) { 1728 if (!pmap_valid_entry(*pte)) 1729 continue; /* VA not mapped */ 1730 1731 if ((flags & PMAP_REMOVE_SKIPWIRED) && (*pte & PG_W)) 1732 continue; 1733 1734 /* atomically save the old PTE and zero it */ 1735 opte = i386_atomic_testset_ul(pte, 0); 1736 1737 if (opte & PG_W) 1738 pmap->pm_stats.wired_count--; 1739 pmap->pm_stats.resident_count--; 1740 1741 if (ptp) 1742 ptp->wire_count--; /* dropping a PTE */ 1743 1744 /* 1745 * Unnecessary work if not PG_PVLIST. 1746 */ 1747 pg = PHYS_TO_VM_PAGE(opte & PG_FRAME); 1748 1749 /* 1750 * if we are not on a pv list we are done. 1751 */ 1752 if ((opte & PG_PVLIST) == 0) { 1753 #ifdef DIAGNOSTIC 1754 if (pg != NULL) 1755 panic("pmap_remove_ptes_86: managed page " 1756 "without PG_PVLIST for 0x%lx", startva); 1757 #endif 1758 continue; 1759 } 1760 1761 #ifdef DIAGNOSTIC 1762 if (pg == NULL) 1763 panic("pmap_remove_ptes_86: unmanaged page marked " 1764 "PG_PVLIST, va = 0x%lx, pa = 0x%lx", 1765 startva, (u_long)(opte & PG_FRAME)); 1766 #endif 1767 1768 /* sync R/M bits */ 1769 pmap_sync_flags_pte_86(pg, opte); 1770 pve = pmap_remove_pv(pg, pmap, startva); 1771 if (pve) { 1772 pve->pv_next = *free_pvs; 1773 *free_pvs = pve; 1774 } 1775 1776 /* end of "for" loop: time for next pte */ 1777 } 1778 } 1779 1780 /* 1781 * pmap_remove: top level mapping removal function 1782 * 1783 * => caller should not be holding any pmap locks 1784 */ 1785 1786 void 1787 pmap_remove(struct pmap *pmap, vaddr_t sva, vaddr_t eva) 1788 { 1789 pmap_do_remove(pmap, sva, eva, PMAP_REMOVE_ALL); 1790 } 1791 1792 void 1793 pmap_do_remove_86(struct pmap *pmap, vaddr_t sva, vaddr_t eva, int flags) 1794 { 1795 pt_entry_t *ptes; 1796 paddr_t ptppa; 1797 vaddr_t blkendva; 1798 struct vm_page *ptp; 1799 struct pv_entry *pve; 1800 struct pv_entry *free_pvs = NULL; 1801 TAILQ_HEAD(, vm_page) empty_ptps; 1802 int shootall; 1803 vaddr_t va; 1804 1805 TAILQ_INIT(&empty_ptps); 1806 1807 ptes = pmap_map_ptes_86(pmap); /* locks pmap */ 1808 1809 /* 1810 * Decide if we want to shoot the whole tlb or just the range. 1811 * Right now, we simply shoot everything when we remove more 1812 * than 32 pages, but never in the kernel pmap. XXX - tune. 1813 */ 1814 if ((eva - sva > 32 * PAGE_SIZE) && pmap != pmap_kernel()) 1815 shootall = 1; 1816 else 1817 shootall = 0; 1818 1819 for (va = sva ; va < eva ; va = blkendva) { 1820 /* determine range of block */ 1821 blkendva = i386_round_pdr(va + 1); 1822 if (blkendva > eva) 1823 blkendva = eva; 1824 1825 /* 1826 * XXXCDC: our PTE mappings should never be removed 1827 * with pmap_remove! if we allow this (and why would 1828 * we?) then we end up freeing the pmap's page 1829 * directory page (PDP) before we are finished using 1830 * it when we hit it in the recursive mapping. this 1831 * is BAD. 1832 * 1833 * long term solution is to move the PTEs out of user 1834 * address space. and into kernel address space (up 1835 * with APTE). then we can set VM_MAXUSER_ADDRESS to 1836 * be VM_MAX_ADDRESS. 
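		 *
		 * (with KERNBASE at 0xd0000000, PDSLOT_PTE is slot 831,
		 * so the check below silently skips the 4MB range
		 * 0xcfc00000-0xcfffffff that holds the recursive mapping)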
1837 */ 1838 1839 if (pdei(va) == PDSLOT_PTE) 1840 /* XXXCDC: ugly hack to avoid freeing PDP here */ 1841 continue; 1842 1843 if (!pmap_valid_entry(PDE(pmap, pdei(va)))) 1844 /* valid block? */ 1845 continue; 1846 1847 /* PA of the PTP */ 1848 ptppa = PDE(pmap, pdei(va)) & PG_FRAME; 1849 1850 /* get PTP if non-kernel mapping */ 1851 if (pmap == pmap_kernel()) { 1852 /* we never free kernel PTPs */ 1853 ptp = NULL; 1854 } else { 1855 if (pmap->pm_ptphint && 1856 VM_PAGE_TO_PHYS(pmap->pm_ptphint) == ptppa) { 1857 ptp = pmap->pm_ptphint; 1858 } else { 1859 ptp = PHYS_TO_VM_PAGE(ptppa); 1860 #ifdef DIAGNOSTIC 1861 if (ptp == NULL) 1862 panic("pmap_do_remove_86: unmanaged " 1863 "PTP detected"); 1864 #endif 1865 } 1866 } 1867 pmap_remove_ptes_86(pmap, ptp, (vaddr_t)&ptes[atop(va)], 1868 va, blkendva, flags, &free_pvs); 1869 1870 /* If PTP is no longer being used, free it. */ 1871 if (ptp && ptp->wire_count <= 1) { 1872 pmap_drop_ptp_86(pmap, va, ptp, ptes); 1873 TAILQ_INSERT_TAIL(&empty_ptps, ptp, pageq); 1874 } 1875 1876 if (!shootall) 1877 pmap_tlb_shootrange(pmap, va, blkendva); 1878 } 1879 1880 if (shootall) 1881 pmap_tlb_shoottlb(); 1882 1883 pmap_unmap_ptes_86(pmap); 1884 pmap_tlb_shootwait(); 1885 1886 while ((pve = free_pvs) != NULL) { 1887 free_pvs = pve->pv_next; 1888 pool_put(&pmap_pv_pool, pve); 1889 } 1890 1891 while ((ptp = TAILQ_FIRST(&empty_ptps)) != NULL) { 1892 TAILQ_REMOVE(&empty_ptps, ptp, pageq); 1893 uvm_pagefree(ptp); 1894 } 1895 } 1896 1897 /* 1898 * pmap_page_remove: remove a managed vm_page from all pmaps that map it 1899 * 1900 * => R/M bits are sync'd back to attrs 1901 */ 1902 1903 void 1904 pmap_page_remove_86(struct vm_page *pg) 1905 { 1906 struct pv_entry *pve; 1907 struct pmap *pm; 1908 pt_entry_t *ptes, opte; 1909 TAILQ_HEAD(, vm_page) empty_ptps; 1910 struct vm_page *ptp; 1911 1912 if (pg->mdpage.pv_list == NULL) 1913 return; 1914 1915 TAILQ_INIT(&empty_ptps); 1916 1917 mtx_enter(&pg->mdpage.pv_mtx); 1918 while ((pve = pg->mdpage.pv_list) != NULL) { 1919 pmap_reference(pve->pv_pmap); 1920 pm = pve->pv_pmap; 1921 mtx_leave(&pg->mdpage.pv_mtx); 1922 1923 ptes = pmap_map_ptes_86(pm); /* locks pmap */ 1924 1925 /* 1926 * We dropped the pvlist lock before grabbing the pmap 1927 * lock to avoid lock ordering problems. This means 1928 * we have to check the pvlist again since somebody 1929 * else might have modified it. All we care about is 1930 * that the pvlist entry matches the pmap we just 1931 * locked. If it doesn't, unlock the pmap and try 1932 * again. 
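		 * (the pmap_destroy() in that retry path just drops the
		 * reference taken with pmap_reference() above)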
1933 */ 1934 mtx_enter(&pg->mdpage.pv_mtx); 1935 if ((pve = pg->mdpage.pv_list) == NULL || 1936 pve->pv_pmap != pm) { 1937 mtx_leave(&pg->mdpage.pv_mtx); 1938 pmap_unmap_ptes_86(pm); /* unlocks pmap */ 1939 pmap_destroy(pm); 1940 mtx_enter(&pg->mdpage.pv_mtx); 1941 continue; 1942 } 1943 1944 pg->mdpage.pv_list = pve->pv_next; 1945 mtx_leave(&pg->mdpage.pv_mtx); 1946 1947 #ifdef DIAGNOSTIC 1948 if (pve->pv_ptp && (PDE(pve->pv_pmap, pdei(pve->pv_va)) & 1949 PG_FRAME) 1950 != VM_PAGE_TO_PHYS(pve->pv_ptp)) { 1951 printf("pmap_page_remove_86: pg=%p: va=%lx, " 1952 "pv_ptp=%p\n", 1953 pg, pve->pv_va, pve->pv_ptp); 1954 printf("pmap_page_remove_86: PTP's phys addr: " 1955 "actual=%x, recorded=%lx\n", 1956 (PDE(pve->pv_pmap, pdei(pve->pv_va)) & 1957 PG_FRAME), VM_PAGE_TO_PHYS(pve->pv_ptp)); 1958 panic("pmap_page_remove_86: mapped managed page has " 1959 "invalid pv_ptp field"); 1960 } 1961 #endif 1962 opte = i386_atomic_testset_ul(&ptes[atop(pve->pv_va)], 0); 1963 1964 if (opte & PG_W) 1965 pve->pv_pmap->pm_stats.wired_count--; 1966 pve->pv_pmap->pm_stats.resident_count--; 1967 1968 /* sync R/M bits */ 1969 pmap_sync_flags_pte_86(pg, opte); 1970 1971 /* update the PTP reference count. free if last reference. */ 1972 if (pve->pv_ptp && --pve->pv_ptp->wire_count <= 1) { 1973 pmap_drop_ptp_86(pve->pv_pmap, pve->pv_va, 1974 pve->pv_ptp, ptes); 1975 TAILQ_INSERT_TAIL(&empty_ptps, pve->pv_ptp, pageq); 1976 } 1977 1978 pmap_tlb_shootpage(pve->pv_pmap, pve->pv_va); 1979 1980 pmap_unmap_ptes_86(pve->pv_pmap); /* unlocks pmap */ 1981 pmap_destroy(pve->pv_pmap); 1982 pool_put(&pmap_pv_pool, pve); 1983 mtx_enter(&pg->mdpage.pv_mtx); 1984 } 1985 mtx_leave(&pg->mdpage.pv_mtx); 1986 1987 pmap_tlb_shootwait(); 1988 1989 while ((ptp = TAILQ_FIRST(&empty_ptps)) != NULL) { 1990 TAILQ_REMOVE(&empty_ptps, ptp, pageq); 1991 uvm_pagefree(ptp); 1992 } 1993 } 1994 1995 /* 1996 * p m a p a t t r i b u t e f u n c t i o n s 1997 * functions that test/change managed page's attributes 1998 * since a page can be mapped multiple times we must check each PTE that 1999 * maps it by going down the pv lists. 
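 *
 * a sketch of the walk both functions below perform (illustrative
 * only; see pmap_test_attrs_86() and pmap_clear_attrs_86()):
 *
 *	mtx_enter(&pg->mdpage.pv_mtx);
 *	for (pve = pg->mdpage.pv_list; pve != NULL; pve = pve->pv_next) {
 *		ptes = pmap_tmpmap_pa(PDE(pve->pv_pmap,
 *		    pdei(pve->pv_va)) & PG_FRAME);
 *		test (or clear) bits in ptes[ptei(pve->pv_va)];
 *		pmap_tmpunmap_pa();
 *	}
 *	mtx_leave(&pg->mdpage.pv_mtx);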
2000 */ 2001 2002 /* 2003 * pmap_test_attrs: test a page's attributes 2004 */ 2005 2006 int 2007 pmap_test_attrs_86(struct vm_page *pg, int testbits) 2008 { 2009 struct pv_entry *pve; 2010 pt_entry_t *ptes, pte; 2011 u_long mybits, testflags; 2012 paddr_t ptppa; 2013 2014 testflags = pmap_pte2flags(testbits); 2015 2016 if (pg->pg_flags & testflags) 2017 return 1; 2018 2019 mybits = 0; 2020 mtx_enter(&pg->mdpage.pv_mtx); 2021 for (pve = pg->mdpage.pv_list; pve != NULL && mybits == 0; 2022 pve = pve->pv_next) { 2023 ptppa = PDE(pve->pv_pmap, pdei(pve->pv_va)) & PG_FRAME; 2024 ptes = (pt_entry_t *)pmap_tmpmap_pa(ptppa); 2025 pte = ptes[ptei(pve->pv_va)]; 2026 pmap_tmpunmap_pa(); 2027 mybits |= (pte & testbits); 2028 } 2029 mtx_leave(&pg->mdpage.pv_mtx); 2030 2031 if (mybits == 0) 2032 return 0; 2033 2034 atomic_setbits_int(&pg->pg_flags, pmap_pte2flags(mybits)); 2035 2036 return 1; 2037 } 2038 2039 /* 2040 * pmap_clear_attrs: change a page's attributes 2041 * 2042 * => we return 1 if we cleared one of the bits we were asked to 2043 */ 2044 2045 int 2046 pmap_clear_attrs_86(struct vm_page *pg, int clearbits) 2047 { 2048 struct pv_entry *pve; 2049 pt_entry_t *ptes, opte; 2050 u_long clearflags; 2051 paddr_t ptppa; 2052 int result; 2053 2054 clearflags = pmap_pte2flags(clearbits); 2055 2056 result = pg->pg_flags & clearflags; 2057 if (result) 2058 atomic_clearbits_int(&pg->pg_flags, clearflags); 2059 2060 mtx_enter(&pg->mdpage.pv_mtx); 2061 for (pve = pg->mdpage.pv_list; pve != NULL; pve = pve->pv_next) { 2062 ptppa = PDE(pve->pv_pmap, pdei(pve->pv_va)) & PG_FRAME; 2063 ptes = (pt_entry_t *)pmap_tmpmap_pa(ptppa); 2064 #ifdef DIAGNOSTIC 2065 if (!pmap_valid_entry(PDE(pve->pv_pmap, pdei(pve->pv_va)))) 2066 panic("pmap_clear_attrs_86: mapping without PTP " 2067 "detected"); 2068 #endif 2069 2070 opte = ptes[ptei(pve->pv_va)]; 2071 if (opte & clearbits) { 2072 result = 1; 2073 i386_atomic_clearbits_l(&ptes[ptei(pve->pv_va)], 2074 (opte & clearbits)); 2075 pmap_tlb_shootpage(pve->pv_pmap, pve->pv_va); 2076 } 2077 pmap_tmpunmap_pa(); 2078 } 2079 mtx_leave(&pg->mdpage.pv_mtx); 2080 2081 pmap_tlb_shootwait(); 2082 2083 return (result != 0); 2084 } 2085 2086 /* 2087 * p m a p p r o t e c t i o n f u n c t i o n s 2088 */ 2089 2090 /* 2091 * pmap_page_protect: change the protection of all recorded mappings 2092 * of a managed page 2093 * 2094 * => NOTE: this is an inline function in pmap.h 2095 */ 2096 2097 /* see pmap.h */ 2098 2099 /* 2100 * pmap_protect: set the protection in of the pages in a pmap 2101 * 2102 * => NOTE: this is an inline function in pmap.h 2103 */ 2104 2105 /* see pmap.h */ 2106 2107 /* 2108 * pmap_write_protect: write-protect pages in a pmap 2109 */ 2110 2111 void 2112 pmap_write_protect_86(struct pmap *pmap, vaddr_t sva, vaddr_t eva, 2113 vm_prot_t prot) 2114 { 2115 pt_entry_t *ptes, *spte, *epte, npte, opte; 2116 vaddr_t blockend; 2117 u_int32_t md_prot; 2118 vaddr_t va; 2119 int shootall = 0; 2120 2121 ptes = pmap_map_ptes_86(pmap); /* locks pmap */ 2122 2123 /* should be ok, but just in case ... */ 2124 sva &= PG_FRAME; 2125 eva &= PG_FRAME; 2126 2127 if ((eva - sva > 32 * PAGE_SIZE) && pmap != pmap_kernel()) 2128 shootall = 1; 2129 2130 for (va = sva; va < eva; va = blockend) { 2131 blockend = (va & PD_MASK) + NBPD; 2132 if (blockend > eva) 2133 blockend = eva; 2134 2135 /* 2136 * XXXCDC: our PTE mappings should never be write-protected! 2137 * 2138 * long term solution is to move the PTEs out of user 2139 * address space. and into kernel address space (up 2140 * with APTE). 
then we can set VM_MAXUSER_ADDRESS to 2141 * be VM_MAX_ADDRESS. 2142 */ 2143 2144 /* XXXCDC: ugly hack to avoid freeing PDP here */ 2145 if (pdei(va) == PDSLOT_PTE) 2146 continue; 2147 2148 /* empty block? */ 2149 if (!pmap_valid_entry(PDE(pmap, pdei(va)))) 2150 continue; 2151 2152 md_prot = protection_codes[prot]; 2153 if (va < VM_MAXUSER_ADDRESS) 2154 md_prot |= PG_u; 2155 else if (va < VM_MAX_ADDRESS) 2156 /* XXX: write-prot our PTES? never! */ 2157 md_prot |= PG_RW; 2158 2159 spte = &ptes[atop(va)]; 2160 epte = &ptes[atop(blockend)]; 2161 2162 for (/*null */; spte < epte ; spte++, va += PAGE_SIZE) { 2163 2164 if (!pmap_valid_entry(*spte)) /* no mapping? */ 2165 continue; 2166 2167 opte = *spte; 2168 npte = (opte & ~PG_PROT) | md_prot; 2169 2170 if (npte != opte) { 2171 pmap_exec_account(pmap, va, *spte, npte); 2172 i386_atomic_clearbits_l(spte, 2173 (~md_prot & opte) & PG_PROT); 2174 i386_atomic_setbits_l(spte, md_prot); 2175 } 2176 } 2177 } 2178 if (shootall) 2179 pmap_tlb_shoottlb(); 2180 else 2181 pmap_tlb_shootrange(pmap, sva, eva); 2182 2183 pmap_unmap_ptes_86(pmap); /* unlocks pmap */ 2184 pmap_tlb_shootwait(); 2185 } 2186 2187 /* 2188 * end of protection functions 2189 */ 2190 2191 /* 2192 * pmap_unwire: clear the wired bit in the PTE 2193 * 2194 * => mapping should already be in map 2195 */ 2196 2197 void 2198 pmap_unwire_86(struct pmap *pmap, vaddr_t va) 2199 { 2200 pt_entry_t *ptes; 2201 2202 if (pmap_valid_entry(PDE(pmap, pdei(va)))) { 2203 ptes = pmap_map_ptes_86(pmap); /* locks pmap */ 2204 2205 #ifdef DIAGNOSTIC 2206 if (!pmap_valid_entry(ptes[atop(va)])) 2207 panic("pmap_unwire_86: invalid (unmapped) va " 2208 "0x%lx", va); 2209 #endif 2210 2211 if ((ptes[atop(va)] & PG_W) != 0) { 2212 i386_atomic_clearbits_l(&ptes[atop(va)], PG_W); 2213 pmap->pm_stats.wired_count--; 2214 } 2215 #ifdef DIAGNOSTIC 2216 else { 2217 printf("pmap_unwire_86: wiring for pmap %p va 0x%lx " 2218 "didn't change!\n", pmap, va); 2219 } 2220 #endif 2221 pmap_unmap_ptes_86(pmap); /* unlocks map */ 2222 } 2223 #ifdef DIAGNOSTIC 2224 else { 2225 panic("pmap_unwire_86: invalid PDE"); 2226 } 2227 #endif 2228 } 2229 2230 /* 2231 * pmap_enter: enter a mapping into a pmap 2232 * 2233 * => must be done "now" ... no lazy-evaluation 2234 */ 2235 2236 int 2237 pmap_enter_86(struct pmap *pmap, vaddr_t va, paddr_t pa, 2238 vm_prot_t prot, int flags) 2239 { 2240 pt_entry_t *ptes, opte, npte; 2241 struct vm_page *ptp; 2242 struct pv_entry *pve, *opve = NULL; 2243 int wired = (flags & PMAP_WIRED) != 0; 2244 int nocache = (pa & PMAP_NOCACHE) != 0; 2245 int wc = (pa & PMAP_WC) != 0; 2246 struct vm_page *pg = NULL; 2247 int error, wired_count, resident_count, ptp_count; 2248 2249 KASSERT(!(wc && nocache)); 2250 pa &= PMAP_PA_MASK; /* nuke flags from pa */ 2251 2252 #ifdef DIAGNOSTIC 2253 /* sanity check: totally out of range? 
*/ 2254 if (va >= VM_MAX_KERNEL_ADDRESS) 2255 panic("pmap_enter_86: too big"); 2256 2257 if (va == (vaddr_t) PDP_BASE || va == (vaddr_t) APDP_BASE) 2258 panic("pmap_enter_86: trying to map over PDP/APDP!"); 2259 2260 /* sanity check: kernel PTPs should already have been pre-allocated */ 2261 if (va >= VM_MIN_KERNEL_ADDRESS && 2262 !pmap_valid_entry(PDE(pmap, pdei(va)))) 2263 panic("pmap_enter: missing kernel PTP!"); 2264 #endif 2265 if (pmap_initialized) 2266 pve = pool_get(&pmap_pv_pool, PR_NOWAIT); 2267 else 2268 pve = NULL; 2269 wired_count = resident_count = ptp_count = 0; 2270 2271 /* 2272 * map in ptes and get a pointer to our PTP (unless we are the kernel) 2273 */ 2274 2275 ptes = pmap_map_ptes_86(pmap); /* locks pmap */ 2276 if (pmap == pmap_kernel()) { 2277 ptp = NULL; 2278 } else { 2279 ptp = pmap_get_ptp_86(pmap, pdei(va)); 2280 if (ptp == NULL) { 2281 if (flags & PMAP_CANFAIL) { 2282 pmap_unmap_ptes_86(pmap); 2283 error = ENOMEM; 2284 goto out; 2285 } 2286 panic("pmap_enter_86: get ptp failed"); 2287 } 2288 } 2289 /* 2290 * not allowed to sleep after here! 2291 */ 2292 opte = ptes[atop(va)]; /* old PTE */ 2293 2294 /* 2295 * is there currently a valid mapping at our VA? 2296 */ 2297 2298 if (pmap_valid_entry(opte)) { 2299 2300 /* 2301 * first, calculate pm_stats updates. resident count will not 2302 * change since we are replacing/changing a valid 2303 * mapping. wired count might change... 2304 */ 2305 2306 if (wired && (opte & PG_W) == 0) 2307 wired_count++; 2308 else if (!wired && (opte & PG_W) != 0) 2309 wired_count--; 2310 2311 /* 2312 * is the currently mapped PA the same as the one we 2313 * want to map? 2314 */ 2315 2316 if ((opte & PG_FRAME) == pa) { 2317 2318 /* if this is on the PVLIST, sync R/M bit */ 2319 if (opte & PG_PVLIST) { 2320 pg = PHYS_TO_VM_PAGE(pa); 2321 #ifdef DIAGNOSTIC 2322 if (pg == NULL) 2323 panic("pmap_enter_86: same pa " 2324 "PG_PVLIST mapping with " 2325 "unmanaged page " 2326 "pa = 0x%lx (0x%lx)", pa, 2327 atop(pa)); 2328 #endif 2329 pmap_sync_flags_pte_86(pg, opte); 2330 } 2331 goto enter_now; 2332 } 2333 2334 /* 2335 * changing PAs: we must remove the old one first 2336 */ 2337 2338 /* 2339 * if current mapping is on a pvlist, 2340 * remove it (sync R/M bits) 2341 */ 2342 2343 if (opte & PG_PVLIST) { 2344 pg = PHYS_TO_VM_PAGE(opte & PG_FRAME); 2345 #ifdef DIAGNOSTIC 2346 if (pg == NULL) 2347 panic("pmap_enter_86: PG_PVLIST mapping with " 2348 "unmanaged page " 2349 "pa = 0x%lx (0x%lx)", pa, atop(pa)); 2350 #endif 2351 pmap_sync_flags_pte_86(pg, opte); 2352 opve = pmap_remove_pv(pg, pmap, va); 2353 pg = NULL; /* This is not the page we are looking for */ 2354 } 2355 } else { /* opte not valid */ 2356 resident_count++; 2357 if (wired) 2358 wired_count++; 2359 if (ptp) 2360 ptp_count++; /* count # of valid entries */ 2361 } 2362 2363 /* 2364 * pve is either NULL or points to a now-free pv_entry structure 2365 * (the latter case is if we called pmap_remove_pv above). 2366 * 2367 * if this entry is to be on a pvlist, enter it now. 
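 *
 * in outline (a sketch of the fallback order used just below):
 *
 *	if (pve == NULL)
 *		pve = opve;	reuse the entry pmap_remove_pv() freed
 *	if (pve == NULL)
 *		PMAP_CANFAIL ? return ENOMEM : panic
 *	pmap_enter_pv(pg, pve, pmap, va, ptp);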
2368 */ 2369 2370 if (pmap_initialized && pg == NULL) 2371 pg = PHYS_TO_VM_PAGE(pa); 2372 2373 if (pg != NULL) { 2374 if (pve == NULL) { 2375 pve = opve; 2376 opve = NULL; 2377 } 2378 if (pve == NULL) { 2379 if (flags & PMAP_CANFAIL) { 2380 pmap_unmap_ptes_86(pmap); 2381 error = ENOMEM; 2382 goto out; 2383 } 2384 panic("pmap_enter_86: no pv entries available"); 2385 } 2386 /* lock pg when adding */ 2387 pmap_enter_pv(pg, pve, pmap, va, ptp); 2388 pve = NULL; 2389 } 2390 2391 enter_now: 2392 /* 2393 * at this point pg is !NULL if we want the PG_PVLIST bit set 2394 */ 2395 2396 npte = pa | protection_codes[prot] | PG_V; 2397 pmap_exec_account(pmap, va, opte, npte); 2398 if (wired) 2399 npte |= PG_W; 2400 if (nocache) 2401 npte |= PG_N; 2402 if (va < VM_MAXUSER_ADDRESS) 2403 npte |= PG_u; 2404 else if (va < VM_MAX_ADDRESS) 2405 npte |= PG_RW; /* XXXCDC: no longer needed? */ 2406 if (pmap == pmap_kernel()) 2407 npte |= pmap_pg_g; 2408 if (flags & PROT_READ) 2409 npte |= PG_U; 2410 if (flags & PROT_WRITE) 2411 npte |= PG_M; 2412 if (pg) { 2413 npte |= PG_PVLIST; 2414 if (pg->pg_flags & PG_PMAP_WC) { 2415 KASSERT(nocache == 0); 2416 wc = 1; 2417 } 2418 pmap_sync_flags_pte_86(pg, npte); 2419 } 2420 if (wc) 2421 npte |= pmap_pg_wc; 2422 2423 opte = i386_atomic_testset_ul(&ptes[atop(va)], npte); 2424 if (ptp) 2425 ptp->wire_count += ptp_count; 2426 pmap->pm_stats.resident_count += resident_count; 2427 pmap->pm_stats.wired_count += wired_count; 2428 2429 if (pmap_valid_entry(opte)) { 2430 if (nocache && (opte & PG_N) == 0) 2431 wbinvd_on_all_cpus(); /* XXX clflush before we enter? */ 2432 pmap_tlb_shootpage(pmap, va); 2433 } 2434 2435 pmap_unmap_ptes_86(pmap); 2436 pmap_tlb_shootwait(); 2437 2438 error = 0; 2439 2440 out: 2441 if (pve) 2442 pool_put(&pmap_pv_pool, pve); 2443 if (opve) 2444 pool_put(&pmap_pv_pool, opve); 2445 2446 return error; 2447 } 2448 2449 /* 2450 * Allocate an extra PD page and PT pages as needed to map kernel 2451 * pages used for the U-K mappings. These special mappings are set 2452 * up during bootstrap and get never removed and are part of 2453 * pmap_kernel. 2454 * 2455 * New pmaps inherit the kernel portion of pmap_kernel including 2456 * the special mappings (see pmap_pinit_pd_86()). 2457 * 2458 * To be able to release PT pages when migrating to PAE paging, use 2459 * wire_count for number of PTEs in the PT page. 
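 *
 * in outline (a sketch of pmap_enter_special_86() below, using the
 * names that function uses):
 *
 *	l2idx = pdei(va); l1idx = ptei(va);
 *	if (pm_pdir_intel's pd[l2idx] is not valid)
 *		allocate a zeroed PT page from pm_obj at offset
 *		ptp_i2o(l2idx + 1024) and install it in pd[l2idx];
 *	temporarily map the PT page, set its PTE for va,
 *	and bump the page's wire_count (one count per installed PTE).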
2460 */ 2461 void 2462 pmap_enter_special_86(vaddr_t va, paddr_t pa, vm_prot_t prot, u_int32_t flags) 2463 { 2464 struct pmap *pmap = pmap_kernel(); 2465 struct vm_page *ptppg = NULL; 2466 pd_entry_t *pd, *ptp; 2467 pt_entry_t *ptes; 2468 uint32_t l2idx, l1idx; 2469 paddr_t npa; 2470 2471 /* If CPU is secure, no need to do anything */ 2472 if (!cpu_meltdown) 2473 return; 2474 2475 /* Must be kernel VA */ 2476 if (va < VM_MIN_KERNEL_ADDRESS) 2477 panic("invalid special mapping va 0x%lx requested", va); 2478 2479 if (!pmap->pm_pdir_intel) 2480 pmap_alloc_pdir_intel_x86(pmap); 2481 2482 DPRINTF("%s: pm_pdir_intel 0x%x pm_pdirpa_intel 0x%x\n", __func__, 2483 (uint32_t)pmap->pm_pdir_intel, (uint32_t)pmap->pm_pdirpa_intel); 2484 2485 l2idx = pdei(va); 2486 l1idx = ptei(va); 2487 2488 DPRINTF("%s: va 0x%08lx pa 0x%08lx prot 0x%08lx flags 0x%08x " 2489 "l2idx %u l1idx %u\n", __func__, va, pa, (unsigned long)prot, 2490 flags, l2idx, l1idx); 2491 2492 if ((pd = (pd_entry_t *)pmap->pm_pdir_intel) == NULL) 2493 panic("%s: PD not initialized for pmap @ %p", __func__, pmap); 2494 2495 /* npa = physaddr of PT page */ 2496 npa = pd[l2idx] & PMAP_PA_MASK; 2497 2498 /* Valid PDE for the 4MB region containing va? */ 2499 if (!npa) { 2500 /* 2501 * No valid PDE - allocate PT page and set PDE. We 2502 * get it from pm_obj, which is used for PT pages. 2503 * We calculate the offset from l2idx+1024, so we are 2504 * beyond the regular PT pages. For their l2dix 2505 * 0 <= l2idx < 1024 holds. 2506 */ 2507 ptppg = uvm_pagealloc(&pmap->pm_obj, ptp_i2o(l2idx + 1024), 2508 NULL, UVM_PGA_USERESERVE|UVM_PGA_ZERO); 2509 if (ptppg == NULL) 2510 panic("%s: failed to allocate PT page", __func__); 2511 2512 atomic_clearbits_int(&ptppg->pg_flags, PG_BUSY); 2513 ptppg->wire_count = 1; /* no mappings yet */ 2514 2515 npa = VM_PAGE_TO_PHYS(ptppg); 2516 pd[l2idx] = (npa | PG_RW | PG_V | PG_M | PG_U); 2517 2518 DPRINTF("%s: allocated new PT page at phys 0x%x, " 2519 "setting PDE[%d] = 0x%x\n", __func__, (uint32_t)npa, 2520 l2idx, pd[l2idx]); 2521 } 2522 2523 /* temporarily map PT page and set PTE for U-K mapping */ 2524 if (ptppg == NULL && (ptppg = PHYS_TO_VM_PAGE(npa)) == NULL) 2525 panic("%s: no vm_page for PT page", __func__); 2526 mtx_enter(&ptppg->mdpage.pv_mtx); 2527 ptp = (pd_entry_t *)pmap_tmpmap_pa(npa); 2528 ptp[l1idx] = (pa | protection_codes[prot] | PG_V | PG_M | PG_U | flags); 2529 ptppg->wire_count++; 2530 DPRINTF("%s: setting PTE[%d] = 0x%x (wire_count %d)\n", __func__, 2531 l1idx, ptp[l1idx], ptppg->wire_count); 2532 pmap_tmpunmap_pa(); 2533 mtx_leave(&ptppg->mdpage.pv_mtx); 2534 2535 /* 2536 * if supported, set the PG_G flag on the corresponding U+K 2537 * entry. U+K mappings can use PG_G, as they are mapped 2538 * along with user land anyway. 2539 */ 2540 if (!(cpu_feature & CPUID_PGE)) 2541 return; 2542 ptes = pmap_map_ptes_86(pmap); /* pmap_kernel -> PTE_BASE */ 2543 if (pmap_valid_entry(ptes[atop(va)])) 2544 ptes[atop(va)] |= PG_G; 2545 else 2546 DPRINTF("%s: no U+K mapping for special mapping?\n", __func__); 2547 pmap_unmap_ptes_86(pmap); /* pmap_kernel -> nothing */ 2548 } 2549 2550 /* 2551 * pmap_growkernel: increase usage of KVM space 2552 * 2553 * => we allocate new PTPs for the kernel and install them in all 2554 * the pmaps on the system. 
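 *
 * in outline (a sketch; pmap_growkernel_86() below handles the
 * early-boot, pre-uvm case slightly differently):
 *
 *	needed = (maxkvaddr - VM_MIN_KERNEL_ADDRESS + NBPD - 1) / NBPD;
 *	while (nkpde < needed) {
 *		allocate and zero a PTP for slot PDSLOT_KERN + nkpde;
 *		set that PDE in pmap_kernel();
 *		copy the new PDE into every pmap on the pmaps list;
 *		nkpde++;
 *	}
 *	return VM_MIN_KERNEL_ADDRESS + nkpde * NBPD;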
2555 */ 2556 2557 vaddr_t 2558 pmap_growkernel_86(vaddr_t maxkvaddr) 2559 { 2560 struct pmap *kpm = pmap_kernel(), *pm; 2561 int needed_kpde; /* needed number of kernel PTPs */ 2562 int s; 2563 paddr_t ptaddr; 2564 2565 needed_kpde = (int)(maxkvaddr - VM_MIN_KERNEL_ADDRESS + (NBPD-1)) 2566 / NBPD; 2567 if (needed_kpde <= nkpde) 2568 goto out; /* we are OK */ 2569 2570 /* 2571 * whoops! we need to add kernel PTPs 2572 */ 2573 2574 s = splhigh(); /* to be safe */ 2575 2576 for (/*null*/ ; nkpde < needed_kpde ; nkpde++) { 2577 2578 if (uvm.page_init_done == 0) { 2579 2580 /* 2581 * we're growing the kernel pmap early (from 2582 * uvm_pageboot_alloc()). this case must be 2583 * handled a little differently. 2584 */ 2585 2586 if (uvm_page_physget(&ptaddr) == 0) 2587 panic("pmap_growkernel: out of memory"); 2588 pmap_zero_phys_86(ptaddr); 2589 2590 PDE(kpm, PDSLOT_KERN + nkpde) = 2591 ptaddr | PG_RW | PG_V | PG_U | PG_M; 2592 2593 /* count PTP as resident */ 2594 kpm->pm_stats.resident_count++; 2595 continue; 2596 } 2597 2598 /* 2599 * THIS *MUST* BE CODED SO AS TO WORK IN THE 2600 * pmap_initialized == 0 CASE! WE MAY BE 2601 * INVOKED WHILE pmap_init() IS RUNNING! 2602 */ 2603 2604 while (!pmap_alloc_ptp_86(kpm, PDSLOT_KERN + nkpde, 0)) 2605 uvm_wait("pmap_growkernel"); 2606 2607 /* distribute new kernel PTP to all active pmaps */ 2608 mtx_enter(&pmaps_lock); 2609 LIST_FOREACH(pm, &pmaps, pm_list) { 2610 PDE(pm, PDSLOT_KERN + nkpde) = 2611 PDE(kpm, PDSLOT_KERN + nkpde); 2612 } 2613 mtx_leave(&pmaps_lock); 2614 } 2615 2616 splx(s); 2617 2618 out: 2619 return (VM_MIN_KERNEL_ADDRESS + (nkpde * NBPD)); 2620 } 2621 2622 #ifdef MULTIPROCESSOR 2623 /* 2624 * Locking for tlb shootdown. 2625 * 2626 * We lock by setting tlb_shoot_wait to the number of cpus that will 2627 * receive our tlb shootdown. After sending the IPIs, we don't need to 2628 * worry about locking order or interrupts spinning for the lock because 2629 * the call that grabs the "lock" isn't the one that releases it. And 2630 * there is nothing that can block the IPI that releases the lock. 2631 * 2632 * The functions are organized so that we first count the number of 2633 * cpus we need to send the IPI to, then we grab the counter, then 2634 * we send the IPIs, then we finally do our own shootdown. 2635 * 2636 * Our shootdown is last to make it parallel with the other cpus 2637 * to shorten the spin time. 2638 * 2639 * Notice that we depend on failures to send IPIs only being able to 2640 * happen during boot. If they happen later, the above assumption 2641 * doesn't hold since we can end up in situations where noone will 2642 * release the lock if we get an interrupt in a bad moment. 
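 *
 * a sketch of the pattern used by the functions below (illustrative
 * only; the IPI handlers that drop tlb_shoot_wait are implemented
 * elsewhere, and are assumed here to flush and decrement the count):
 *
 *	wait = number of other running cpus that must flush;
 *	while (atomic_cas_uint(&tlb_shoot_wait, 0, wait) != 0)
 *		spin until the previous shootdown drains;
 *	publish tlb_shoot_addr1 (and tlb_shoot_addr2 for ranges);
 *	send the IPIs;
 *	flush locally;
 *	...each remote cpu flushes and decrements tlb_shoot_wait;
 *	pmap_tlb_shootwait() spins until the count reaches zero.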
2643 */ 2644 2645 volatile int tlb_shoot_wait __attribute__((section(".kudata"))); 2646 2647 volatile vaddr_t tlb_shoot_addr1 __attribute__((section(".kudata"))); 2648 volatile vaddr_t tlb_shoot_addr2 __attribute__((section(".kudata"))); 2649 2650 void 2651 pmap_tlb_shootpage(struct pmap *pm, vaddr_t va) 2652 { 2653 struct cpu_info *ci, *self = curcpu(); 2654 CPU_INFO_ITERATOR cii; 2655 int wait = 0; 2656 u_int64_t mask = 0; 2657 2658 CPU_INFO_FOREACH(cii, ci) { 2659 if (ci == self || !pmap_is_active(pm, ci) || 2660 !(ci->ci_flags & CPUF_RUNNING)) 2661 continue; 2662 mask |= (1ULL << ci->ci_cpuid); 2663 wait++; 2664 } 2665 2666 if (wait > 0) { 2667 int s = splvm(); 2668 2669 while (atomic_cas_uint(&tlb_shoot_wait, 0, wait) != 0) { 2670 while (tlb_shoot_wait != 0) 2671 CPU_BUSY_CYCLE(); 2672 } 2673 tlb_shoot_addr1 = va; 2674 CPU_INFO_FOREACH(cii, ci) { 2675 if ((mask & (1ULL << ci->ci_cpuid)) == 0) 2676 continue; 2677 if (i386_fast_ipi(ci, LAPIC_IPI_INVLPG) != 0) 2678 panic("pmap_tlb_shootpage: ipi failed"); 2679 } 2680 splx(s); 2681 } 2682 2683 if (pmap_is_curpmap(pm)) 2684 pmap_update_pg(va); 2685 } 2686 2687 void 2688 pmap_tlb_shootrange(struct pmap *pm, vaddr_t sva, vaddr_t eva) 2689 { 2690 struct cpu_info *ci, *self = curcpu(); 2691 CPU_INFO_ITERATOR cii; 2692 int wait = 0; 2693 u_int64_t mask = 0; 2694 vaddr_t va; 2695 2696 CPU_INFO_FOREACH(cii, ci) { 2697 if (ci == self || !pmap_is_active(pm, ci) || 2698 !(ci->ci_flags & CPUF_RUNNING)) 2699 continue; 2700 mask |= (1ULL << ci->ci_cpuid); 2701 wait++; 2702 } 2703 2704 if (wait > 0) { 2705 int s = splvm(); 2706 2707 while (atomic_cas_uint(&tlb_shoot_wait, 0, wait) != 0) { 2708 while (tlb_shoot_wait != 0) 2709 CPU_BUSY_CYCLE(); 2710 } 2711 tlb_shoot_addr1 = sva; 2712 tlb_shoot_addr2 = eva; 2713 CPU_INFO_FOREACH(cii, ci) { 2714 if ((mask & (1ULL << ci->ci_cpuid)) == 0) 2715 continue; 2716 if (i386_fast_ipi(ci, LAPIC_IPI_INVLRANGE) != 0) 2717 panic("pmap_tlb_shootrange: ipi failed"); 2718 } 2719 splx(s); 2720 } 2721 2722 if (pmap_is_curpmap(pm)) 2723 for (va = sva; va < eva; va += PAGE_SIZE) 2724 pmap_update_pg(va); 2725 } 2726 2727 void 2728 pmap_tlb_shoottlb(void) 2729 { 2730 struct cpu_info *ci, *self = curcpu(); 2731 CPU_INFO_ITERATOR cii; 2732 int wait = 0; 2733 u_int64_t mask = 0; 2734 2735 CPU_INFO_FOREACH(cii, ci) { 2736 if (ci == self || !(ci->ci_flags & CPUF_RUNNING)) 2737 continue; 2738 mask |= (1ULL << ci->ci_cpuid); 2739 wait++; 2740 } 2741 2742 if (wait) { 2743 int s = splvm(); 2744 2745 while (atomic_cas_uint(&tlb_shoot_wait, 0, wait) != 0) { 2746 while (tlb_shoot_wait != 0) 2747 CPU_BUSY_CYCLE(); 2748 } 2749 2750 CPU_INFO_FOREACH(cii, ci) { 2751 if ((mask & (1ULL << ci->ci_cpuid)) == 0) 2752 continue; 2753 if (i386_fast_ipi(ci, LAPIC_IPI_INVLTLB) != 0) 2754 panic("pmap_tlb_shoottlb: ipi failed"); 2755 } 2756 splx(s); 2757 } 2758 2759 tlbflush(); 2760 } 2761 2762 void 2763 pmap_tlb_droppmap(struct pmap *pm) 2764 { 2765 struct cpu_info *ci, *self = curcpu(); 2766 CPU_INFO_ITERATOR cii; 2767 int wait = 0; 2768 u_int64_t mask = 0; 2769 2770 CPU_INFO_FOREACH(cii, ci) { 2771 if (ci == self || !(ci->ci_flags & CPUF_RUNNING) || 2772 ci->ci_curpmap != pm) 2773 continue; 2774 mask |= (1ULL << ci->ci_cpuid); 2775 wait++; 2776 } 2777 2778 if (wait) { 2779 int s = splvm(); 2780 2781 while (atomic_cas_uint(&tlb_shoot_wait, 0, wait) != 0) { 2782 while (tlb_shoot_wait != 0) 2783 CPU_BUSY_CYCLE(); 2784 } 2785 2786 CPU_INFO_FOREACH(cii, ci) { 2787 if ((mask & (1ULL << ci->ci_cpuid)) == 0) 2788 continue; 2789 if (i386_fast_ipi(ci, 
LAPIC_IPI_RELOADCR3) != 0) 2790 panic("pmap_tlb_droppmap: ipi failed"); 2791 } 2792 splx(s); 2793 } 2794 2795 if (self->ci_curpmap == pm) 2796 pmap_activate(curproc); 2797 2798 pmap_tlb_shootwait(); 2799 } 2800 2801 void 2802 pmap_tlb_shootwait(void) 2803 { 2804 while (tlb_shoot_wait != 0) 2805 CPU_BUSY_CYCLE(); 2806 } 2807 2808 #else 2809 2810 void 2811 pmap_tlb_shootpage(struct pmap *pm, vaddr_t va) 2812 { 2813 if (pmap_is_curpmap(pm)) 2814 pmap_update_pg(va); 2815 2816 } 2817 2818 void 2819 pmap_tlb_shootrange(struct pmap *pm, vaddr_t sva, vaddr_t eva) 2820 { 2821 vaddr_t va; 2822 2823 for (va = sva; va < eva; va += PAGE_SIZE) 2824 pmap_update_pg(va); 2825 } 2826 2827 void 2828 pmap_tlb_shoottlb(void) 2829 { 2830 tlbflush(); 2831 } 2832 #endif /* MULTIPROCESSOR */ 2833 2834 u_int32_t (*pmap_pte_set_p)(vaddr_t, paddr_t, u_int32_t) = 2835 pmap_pte_set_86; 2836 u_int32_t (*pmap_pte_setbits_p)(vaddr_t, u_int32_t, u_int32_t) = 2837 pmap_pte_setbits_86; 2838 u_int32_t (*pmap_pte_bits_p)(vaddr_t) = pmap_pte_bits_86; 2839 paddr_t (*pmap_pte_paddr_p)(vaddr_t) = pmap_pte_paddr_86; 2840 int (*pmap_clear_attrs_p)(struct vm_page *, int) = 2841 pmap_clear_attrs_86; 2842 int (*pmap_enter_p)(pmap_t, vaddr_t, paddr_t, vm_prot_t, int) = 2843 pmap_enter_86; 2844 void (*pmap_enter_special_p)(vaddr_t, paddr_t, vm_prot_t, 2845 u_int32_t) = pmap_enter_special_86; 2846 int (*pmap_extract_p)(pmap_t, vaddr_t, paddr_t *) = 2847 pmap_extract_86; 2848 vaddr_t (*pmap_growkernel_p)(vaddr_t) = pmap_growkernel_86; 2849 void (*pmap_page_remove_p)(struct vm_page *) = pmap_page_remove_86; 2850 void (*pmap_do_remove_p)(struct pmap *, vaddr_t, vaddr_t, int) = 2851 pmap_do_remove_86; 2852 int (*pmap_test_attrs_p)(struct vm_page *, int) = 2853 pmap_test_attrs_86; 2854 void (*pmap_unwire_p)(struct pmap *, vaddr_t) = pmap_unwire_86; 2855 void (*pmap_write_protect_p)(struct pmap *, vaddr_t, vaddr_t, 2856 vm_prot_t) = pmap_write_protect_86; 2857 void (*pmap_pinit_pd_p)(pmap_t) = pmap_pinit_pd_86; 2858 void (*pmap_zero_phys_p)(paddr_t) = pmap_zero_phys_86; 2859 void (*pmap_copy_page_p)(struct vm_page *, struct vm_page *) = 2860 pmap_copy_page_86; 2861
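
/*
 * The function pointers above select the classic two-level ("_86")
 * implementation in this file; a PAE implementation can repoint them
 * at its own routines during bootstrap.  As an illustration only (the
 * real MI-facing wrappers live in the pmap headers; the exact form
 * shown here is a sketch, not a copy):
 *
 *	int
 *	pmap_enter(pmap_t pm, vaddr_t va, paddr_t pa, vm_prot_t prot,
 *	    int flags)
 *	{
 *		return ((*pmap_enter_p)(pm, va, pa, prot, flags));
 *	}
 *
 * so the rest of the kernel never needs to know which page table
 * format is in use.
 */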