1 /* 2 * Copyright (c) 1991 Regents of the University of California. 3 * Copyright (c) 1994 John S. Dyson 4 * Copyright (c) 1994 David Greenman 5 * Copyright (c) 2003 Peter Wemm 6 * Copyright (c) 2005-2008 Alan L. Cox <alc@cs.rice.edu> 7 * Copyright (c) 2008, 2009 The DragonFly Project. 8 * Copyright (c) 2008, 2009 Jordan Gordeev. 9 * Copyright (c) 2011-2019 Matthew Dillon 10 * All rights reserved. 11 * 12 * This code is derived from software contributed to Berkeley by 13 * the Systems Programming Group of the University of Utah Computer 14 * Science Department and William Jolitz of UUNET Technologies Inc. 15 * 16 * Redistribution and use in source and binary forms, with or without 17 * modification, are permitted provided that the following conditions 18 * are met: 19 * 1. Redistributions of source code must retain the above copyright 20 * notice, this list of conditions and the following disclaimer. 21 * 2. Redistributions in binary form must reproduce the above copyright 22 * notice, this list of conditions and the following disclaimer in the 23 * documentation and/or other materials provided with the distribution. 24 * 3. All advertising materials mentioning features or use of this software 25 * must display the following acknowledgement: 26 * This product includes software developed by the University of 27 * California, Berkeley and its contributors. 28 * 4. Neither the name of the University nor the names of its contributors 29 * may be used to endorse or promote products derived from this software 30 * without specific prior written permission. 31 * 32 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 33 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 34 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 35 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 36 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 37 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 38 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 39 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 40 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 41 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 42 * SUCH DAMAGE. 43 */ 44 /* 45 * Manage physical address maps for x86-64 systems. 46 * 47 * Some notes: 48 * - The 'M'odified bit is only applicable to terminal PTEs. 49 * 50 * - The 'U'ser access bit can be set for higher-level PTEs as 51 * long as it isn't set for terminal PTEs for pages we don't 52 * want user access to. 
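 *
 *   (On x86-64 a user-mode access is permitted only when the U bit is
 *   set at every level of the walk, so clearing it in the terminal PTE
 *   is sufficient to deny user access even when the PD/PDP/PML4
 *   entries above it are marked user-accessible.)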
 */

#if 0 /* JG */
#include "opt_pmap.h"
#endif
#include "opt_msgbuf.h"

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/msgbuf.h>
#include <sys/vmmeter.h>
#include <sys/mman.h>
#include <sys/systm.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <sys/sysctl.h>
#include <sys/lock.h>
#include <vm/vm_kern.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_extern.h>
#include <vm/vm_pageout.h>
#include <vm/vm_pager.h>
#include <vm/vm_zone.h>

#include <sys/thread2.h>
#include <sys/spinlock2.h>
#include <vm/vm_page2.h>

#include <machine/cputypes.h>
#include <machine/cpu.h>
#include <machine/md_var.h>
#include <machine/specialreg.h>
#include <machine/smp.h>
#include <machine_base/apic/apicreg.h>
#include <machine/globaldata.h>
#include <machine/pmap.h>
#include <machine/pmap_inval.h>

#include <ddb/ddb.h>

#define PMAP_KEEP_PDIRS

#if defined(DIAGNOSTIC)
#define PMAP_DIAGNOSTIC
#endif

#define MINPV 2048

/*
 * pmap debugging will report who owns a pv lock when blocking.
 */
#ifdef PMAP_DEBUG

#define PMAP_DEBUG_DECL		,const char *func, int lineno
#define PMAP_DEBUG_ARGS		, __func__, __LINE__
#define PMAP_DEBUG_COPY		, func, lineno

#define pv_get(pmap, pindex, pmarkp)	_pv_get(pmap, pindex, pmarkp	\
							PMAP_DEBUG_ARGS)
#define pv_lock(pv)			_pv_lock(pv			\
							PMAP_DEBUG_ARGS)
#define pv_hold_try(pv)			_pv_hold_try(pv			\
							PMAP_DEBUG_ARGS)
#define pv_alloc(pmap, pindex, isnewp)	_pv_alloc(pmap, pindex, isnewp	\
							PMAP_DEBUG_ARGS)

#define pv_free(pv, pvp)		_pv_free(pv, pvp PMAP_DEBUG_ARGS)

#else

#define PMAP_DEBUG_DECL
#define PMAP_DEBUG_ARGS
#define PMAP_DEBUG_COPY

#define pv_get(pmap, pindex, pmarkp)	_pv_get(pmap, pindex, pmarkp)
#define pv_lock(pv)			_pv_lock(pv)
#define pv_hold_try(pv)			_pv_hold_try(pv)
#define pv_alloc(pmap, pindex, isnewp)	_pv_alloc(pmap, pindex, isnewp)
#define pv_free(pv, pvp)		_pv_free(pv, pvp)

#endif

/*
 * Get PDEs and PTEs for user/kernel address space
 */
#define pdir_pde(m, v)		(m[(vm_offset_t)(v) >> PDRSHIFT])

#define pmap_pde_v(pmap, pte)	\
	((*(pd_entry_t *)pte & pmap->pmap_bits[PG_V_IDX]) != 0)
#define pmap_pte_w(pmap, pte)	\
	((*(pt_entry_t *)pte & pmap->pmap_bits[PG_W_IDX]) != 0)
#define pmap_pte_m(pmap, pte)	\
	((*(pt_entry_t *)pte & pmap->pmap_bits[PG_M_IDX]) != 0)
#define pmap_pte_u(pmap, pte)	\
	((*(pt_entry_t *)pte & pmap->pmap_bits[PG_U_IDX]) != 0)
#define pmap_pte_v(pmap, pte)	\
	((*(pt_entry_t *)pte & pmap->pmap_bits[PG_V_IDX]) != 0)

/*
 * Given a map and a machine independent protection code,
 * convert to a vax protection code.
 */
#define pte_prot(m, p)		\
	(m->protection_codes[p & (VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE)])
static uint64_t protection_codes[PROTECTION_CODES_SIZE];

/*
 * Backing scan macros.  Note that in the use case 'ipte' is only a
 * tentative value and must be validated by a pmap_inval_smp_cmpset*()
 * or equivalent function.
 *
 * NOTE: cpu_ccfence() is required to prevent excessive optimization of
 *	 the (ipte) variable.
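 *
 * For reference, a consumer of these macros typically looks roughly
 * like the following sketch (illustrative only; real callers differ in
 * what they do with 'ipte'):
 *
 *	PMAP_PAGE_BACKING_SCAN(m, NULL, ipmap, iptep, ipte, iva) {
 *		npte = ipte & ~ipmap->pmap_bits[PG_RW_IDX];
 *		if (!pmap_inval_smp_cmpset(ipmap, iva, iptep, ipte, npte))
 *			PMAP_PAGE_BACKING_RETRY;
 *	} PMAP_PAGE_BACKING_DONE;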
170 */ 171 #define PMAP_PAGE_BACKING_SCAN(m, match_pmap, ipmap, iptep, ipte, iva) \ 172 if (m->object) { \ 173 vm_object_t iobj = m->object; \ 174 vm_map_backing_t iba, next_ba; \ 175 struct pmap *ipmap; \ 176 pt_entry_t ipte; \ 177 pt_entry_t *iptep; \ 178 vm_offset_t iva; \ 179 vm_pindex_t ipindex_start; \ 180 vm_pindex_t ipindex_end; \ 181 \ 182 lockmgr(&iobj->backing_lk, LK_SHARED); \ 183 next_ba = TAILQ_FIRST(&iobj->backing_list); \ 184 while ((iba = next_ba) != NULL) { \ 185 next_ba = TAILQ_NEXT(iba, entry); \ 186 ipmap = iba->pmap; \ 187 if (match_pmap && ipmap != match_pmap) \ 188 continue; \ 189 ipindex_start = iba->offset >> PAGE_SHIFT; \ 190 ipindex_end = ipindex_start + \ 191 ((iba->end - iba->start) >> PAGE_SHIFT); \ 192 if (m->pindex < ipindex_start || \ 193 m->pindex >= ipindex_end) { \ 194 continue; \ 195 } \ 196 iva = iba->start + \ 197 ((m->pindex - ipindex_start) << PAGE_SHIFT); \ 198 iptep = pmap_pte(ipmap, iva); \ 199 if (iptep == NULL) \ 200 continue; \ 201 ipte = *iptep; \ 202 cpu_ccfence(); \ 203 if (m->phys_addr != (ipte & PG_FRAME)) \ 204 continue; \ 205 206 #define PMAP_PAGE_BACKING_RETRY \ 207 { \ 208 next_ba = iba; \ 209 continue; \ 210 } \ 211 212 #define PMAP_PAGE_BACKING_DONE \ 213 } \ 214 lockmgr(&iobj->backing_lk, LK_RELEASE); \ 215 } \ 216 217 struct pmap kernel_pmap; 218 struct pmap iso_pmap; 219 220 vm_paddr_t avail_start; /* PA of first available physical page */ 221 vm_paddr_t avail_end; /* PA of last available physical page */ 222 vm_offset_t virtual2_start; /* cutout free area prior to kernel start */ 223 vm_offset_t virtual2_end; 224 vm_offset_t virtual_start; /* VA of first avail page (after kernel bss) */ 225 vm_offset_t virtual_end; /* VA of last avail page (end of kernel AS) */ 226 vm_offset_t KvaStart; /* VA start of KVA space */ 227 vm_offset_t KvaEnd; /* VA end of KVA space (non-inclusive) */ 228 vm_offset_t KvaSize; /* max size of kernel virtual address space */ 229 vm_offset_t DMapMaxAddress; 230 static boolean_t pmap_initialized = FALSE; /* Has pmap_init completed? */ 231 //static int pgeflag; /* PG_G or-in */ 232 uint64_t PatMsr; 233 234 static int ndmpdp; 235 static vm_paddr_t dmaplimit; 236 vm_offset_t kernel_vm_end = VM_MIN_KERNEL_ADDRESS; 237 238 static pt_entry_t pat_pte_index[PAT_INDEX_SIZE]; /* PAT -> PG_ bits */ 239 static pt_entry_t pat_pde_index[PAT_INDEX_SIZE]; /* PAT -> PG_ bits */ 240 241 static uint64_t KPTbase; 242 static uint64_t KPTphys; 243 static uint64_t KPDphys; /* phys addr of kernel level 2 */ 244 static uint64_t KPDbase; /* phys addr of kernel level 2 @ KERNBASE */ 245 uint64_t KPDPphys; /* phys addr of kernel level 3 */ 246 uint64_t KPML4phys; /* phys addr of kernel level 4 */ 247 248 static uint64_t DMPDphys; /* phys addr of direct mapped level 2 */ 249 static uint64_t DMPDPphys; /* phys addr of direct mapped level 3 */ 250 251 /* 252 * Data for the pv entry allocation mechanism 253 */ 254 __read_mostly static vm_zone_t pvzone; 255 __read_mostly static int pmap_pagedaemon_waken = 0; 256 static struct vm_zone pvzone_store; 257 static struct pv_entry *pvinit; 258 259 /* 260 * All those kernel PT submaps that BSD is so fond of 261 */ 262 pt_entry_t *CMAP1 = NULL, *ptmmap; 263 caddr_t CADDR1 = NULL, ptvmmap = NULL; 264 static pt_entry_t *msgbufmap; 265 struct msgbuf *msgbufp=NULL; 266 267 /* 268 * PMAP default PG_* bits. 
Needed to be able to add 269 * EPT/NPT pagetable pmap_bits for the VMM module 270 */ 271 uint64_t pmap_bits_default[] = { 272 REGULAR_PMAP, /* TYPE_IDX 0 */ 273 X86_PG_V, /* PG_V_IDX 1 */ 274 X86_PG_RW, /* PG_RW_IDX 2 */ 275 X86_PG_U, /* PG_U_IDX 3 */ 276 X86_PG_A, /* PG_A_IDX 4 */ 277 X86_PG_M, /* PG_M_IDX 5 */ 278 X86_PG_PS, /* PG_PS_IDX3 6 */ 279 X86_PG_G, /* PG_G_IDX 7 */ 280 X86_PG_AVAIL1, /* PG_AVAIL1_IDX 8 */ 281 X86_PG_AVAIL2, /* PG_AVAIL2_IDX 9 */ 282 X86_PG_AVAIL3, /* PG_AVAIL3_IDX 10 */ 283 X86_PG_NC_PWT | X86_PG_NC_PCD, /* PG_N_IDX 11 */ 284 X86_PG_NX, /* PG_NX_IDX 12 */ 285 }; 286 /* 287 * Crashdump maps. 288 */ 289 static pt_entry_t *pt_crashdumpmap; 290 static caddr_t crashdumpmap; 291 292 static int pmap_debug = 0; 293 SYSCTL_INT(_machdep, OID_AUTO, pmap_debug, CTLFLAG_RW, 294 &pmap_debug, 0, "Debug pmap's"); 295 #ifdef PMAP_DEBUG2 296 static int pmap_enter_debug = 0; 297 SYSCTL_INT(_machdep, OID_AUTO, pmap_enter_debug, CTLFLAG_RW, 298 &pmap_enter_debug, 0, "Debug pmap_enter's"); 299 #endif 300 static int pmap_yield_count = 64; 301 SYSCTL_INT(_machdep, OID_AUTO, pmap_yield_count, CTLFLAG_RW, 302 &pmap_yield_count, 0, "Yield during init_pt/release"); 303 int pmap_fast_kernel_cpusync = 0; 304 SYSCTL_INT(_machdep, OID_AUTO, pmap_fast_kernel_cpusync, CTLFLAG_RW, 305 &pmap_fast_kernel_cpusync, 0, "Share page table pages when possible"); 306 int pmap_dynamic_delete = 0; 307 SYSCTL_INT(_machdep, OID_AUTO, pmap_dynamic_delete, CTLFLAG_RW, 308 &pmap_dynamic_delete, 0, "Dynamically delete PT/PD/PDPs"); 309 int pmap_lock_delay = 100; 310 SYSCTL_INT(_machdep, OID_AUTO, pmap_lock_delay, CTLFLAG_RW, 311 &pmap_lock_delay, 0, "Spin loops"); 312 static int meltdown_mitigation = -1; 313 TUNABLE_INT("machdep.meltdown_mitigation", &meltdown_mitigation); 314 SYSCTL_INT(_machdep, OID_AUTO, meltdown_mitigation, CTLFLAG_RW, 315 &meltdown_mitigation, 0, "Userland pmap isolation"); 316 317 static int pmap_nx_enable = -1; /* -1 = auto */ 318 /* needs manual TUNABLE in early probe, see below */ 319 SYSCTL_INT(_machdep, OID_AUTO, pmap_nx_enable, CTLFLAG_RD, 320 &pmap_nx_enable, 0, 321 "no-execute support (0=disabled, 1=w/READ, 2=w/READ & WRITE)"); 322 323 static int pmap_pv_debug = 50; 324 SYSCTL_INT(_machdep, OID_AUTO, pmap_pv_debug, CTLFLAG_RW, 325 &pmap_pv_debug, 0, ""); 326 327 static long vm_pmap_pv_entries; 328 SYSCTL_LONG(_vm, OID_AUTO, pmap_pv_entries, CTLFLAG_RD, 329 &vm_pmap_pv_entries, 0, ""); 330 331 /* Standard user access funtions */ 332 extern int std_copyinstr (const void *udaddr, void *kaddr, size_t len, 333 size_t *lencopied); 334 extern int std_copyin (const void *udaddr, void *kaddr, size_t len); 335 extern int std_copyout (const void *kaddr, void *udaddr, size_t len); 336 extern int std_fubyte (const uint8_t *base); 337 extern int std_subyte (uint8_t *base, uint8_t byte); 338 extern int32_t std_fuword32 (const uint32_t *base); 339 extern int64_t std_fuword64 (const uint64_t *base); 340 extern int std_suword64 (uint64_t *base, uint64_t word); 341 extern int std_suword32 (uint32_t *base, int word); 342 extern uint32_t std_swapu32 (volatile uint32_t *base, uint32_t v); 343 extern uint64_t std_swapu64 (volatile uint64_t *base, uint64_t v); 344 extern uint32_t std_fuwordadd32 (volatile uint32_t *base, uint32_t v); 345 extern uint64_t std_fuwordadd64 (volatile uint64_t *base, uint64_t v); 346 347 #if 0 348 static void pv_hold(pv_entry_t pv); 349 #endif 350 static int _pv_hold_try(pv_entry_t pv 351 PMAP_DEBUG_DECL); 352 static void pv_drop(pv_entry_t pv); 353 static void 
_pv_lock(pv_entry_t pv 354 PMAP_DEBUG_DECL); 355 static void pv_unlock(pv_entry_t pv); 356 static pv_entry_t _pv_alloc(pmap_t pmap, vm_pindex_t pindex, int *isnew 357 PMAP_DEBUG_DECL); 358 static pv_entry_t _pv_get(pmap_t pmap, vm_pindex_t pindex, vm_pindex_t **pmarkp 359 PMAP_DEBUG_DECL); 360 static void _pv_free(pv_entry_t pv, pv_entry_t pvp PMAP_DEBUG_DECL); 361 static pv_entry_t pv_get_try(pmap_t pmap, vm_pindex_t pindex, 362 vm_pindex_t **pmarkp, int *errorp); 363 static void pv_put(pv_entry_t pv); 364 static void *pv_pte_lookup(pv_entry_t pv, vm_pindex_t pindex); 365 static pv_entry_t pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, 366 pv_entry_t *pvpp); 367 static void pmap_remove_pv_pte(pv_entry_t pv, pv_entry_t pvp, 368 pmap_inval_bulk_t *bulk, int destroy); 369 static vm_page_t pmap_remove_pv_page(pv_entry_t pv); 370 static int pmap_release_pv(pv_entry_t pv, pv_entry_t pvp, 371 pmap_inval_bulk_t *bulk); 372 373 struct pmap_scan_info; 374 static void pmap_remove_callback(pmap_t pmap, struct pmap_scan_info *info, 375 vm_pindex_t *pte_placemark, pv_entry_t pt_pv, 376 vm_offset_t va, pt_entry_t *ptep, void *arg __unused); 377 static void pmap_protect_callback(pmap_t pmap, struct pmap_scan_info *info, 378 vm_pindex_t *pte_placemark, pv_entry_t pt_pv, 379 vm_offset_t va, pt_entry_t *ptep, void *arg __unused); 380 381 static void x86_64_protection_init (void); 382 static void create_pagetables(vm_paddr_t *firstaddr); 383 static void pmap_remove_all (vm_page_t m); 384 static boolean_t pmap_testbit (vm_page_t m, int bit); 385 386 static pt_entry_t *pmap_pte_quick (pmap_t pmap, vm_offset_t va); 387 static vm_offset_t pmap_kmem_choose(vm_offset_t addr); 388 389 static void pmap_pinit_defaults(struct pmap *pmap); 390 static void pv_placemarker_wait(pmap_t pmap, vm_pindex_t *pmark); 391 static void pv_placemarker_wakeup(pmap_t pmap, vm_pindex_t *pmark); 392 393 static int 394 pv_entry_compare(pv_entry_t pv1, pv_entry_t pv2) 395 { 396 if (pv1->pv_pindex < pv2->pv_pindex) 397 return(-1); 398 if (pv1->pv_pindex > pv2->pv_pindex) 399 return(1); 400 return(0); 401 } 402 403 RB_GENERATE2(pv_entry_rb_tree, pv_entry, pv_entry, 404 pv_entry_compare, vm_pindex_t, pv_pindex); 405 406 /* 407 * Keep track of pages in the pmap. The procedure is handed 408 * the vm_page->md.pmap_count value prior to an increment or 409 * decrement. 410 * 411 * t_arm - Active real memory 412 * t_avm - Active virtual memory 413 * t_armshr - Active real memory that is also shared 414 * t_avmshr - Active virtual memory that is also shared 415 * 416 * NOTE: At the moment t_avm is effectively just the same as t_arm. 417 */ 418 static __inline 419 void 420 pmap_page_stats_adding(long prev_count) 421 { 422 globaldata_t gd = mycpu; 423 424 if (prev_count == 0) { 425 ++gd->gd_vmtotal.t_arm; 426 ++gd->gd_vmtotal.t_avm; 427 } else if (prev_count == 1) { 428 ++gd->gd_vmtotal.t_armshr; 429 ++gd->gd_vmtotal.t_avmshr; 430 } else { 431 ++gd->gd_vmtotal.t_avmshr; 432 } 433 } 434 435 static __inline 436 void 437 pmap_page_stats_deleting(long prev_count) 438 { 439 globaldata_t gd = mycpu; 440 441 if (prev_count == 1) { 442 --gd->gd_vmtotal.t_arm; 443 --gd->gd_vmtotal.t_avm; 444 } else if (prev_count == 2) { 445 --gd->gd_vmtotal.t_armshr; 446 --gd->gd_vmtotal.t_avmshr; 447 } else { 448 --gd->gd_vmtotal.t_avmshr; 449 } 450 } 451 452 /* 453 * Move the kernel virtual free pointer to the next 454 * 2MB. 
This is used to help improve performance 455 * by using a large (2MB) page for much of the kernel 456 * (.text, .data, .bss) 457 */ 458 static 459 vm_offset_t 460 pmap_kmem_choose(vm_offset_t addr) 461 { 462 vm_offset_t newaddr = addr; 463 464 newaddr = roundup2(addr, NBPDR); 465 return newaddr; 466 } 467 468 /* 469 * Returns the pindex of a page table entry (representing a terminal page). 470 * There are NUPTE_TOTAL page table entries possible (a huge number) 471 * 472 * x86-64 has a 48-bit address space, where bit 47 is sign-extended out. 473 * We want to properly translate negative KVAs. 474 */ 475 static __inline 476 vm_pindex_t 477 pmap_pte_pindex(vm_offset_t va) 478 { 479 return ((va >> PAGE_SHIFT) & (NUPTE_TOTAL - 1)); 480 } 481 482 /* 483 * Returns the pindex of a page table. 484 */ 485 static __inline 486 vm_pindex_t 487 pmap_pt_pindex(vm_offset_t va) 488 { 489 return (NUPTE_TOTAL + ((va >> PDRSHIFT) & (NUPT_TOTAL - 1))); 490 } 491 492 /* 493 * Returns the pindex of a page directory. 494 */ 495 static __inline 496 vm_pindex_t 497 pmap_pd_pindex(vm_offset_t va) 498 { 499 return (NUPTE_TOTAL + NUPT_TOTAL + 500 ((va >> PDPSHIFT) & (NUPD_TOTAL - 1))); 501 } 502 503 static __inline 504 vm_pindex_t 505 pmap_pdp_pindex(vm_offset_t va) 506 { 507 return (NUPTE_TOTAL + NUPT_TOTAL + NUPD_TOTAL + 508 ((va >> PML4SHIFT) & (NUPDP_TOTAL - 1))); 509 } 510 511 static __inline 512 vm_pindex_t 513 pmap_pml4_pindex(void) 514 { 515 return (NUPTE_TOTAL + NUPT_TOTAL + NUPD_TOTAL + NUPDP_TOTAL); 516 } 517 518 /* 519 * Return various clipped indexes for a given VA 520 * 521 * Returns the index of a pt in a page directory, representing a page 522 * table. 523 */ 524 static __inline 525 vm_pindex_t 526 pmap_pt_index(vm_offset_t va) 527 { 528 return ((va >> PDRSHIFT) & ((1ul << NPDEPGSHIFT) - 1)); 529 } 530 531 /* 532 * Returns the index of a pd in a page directory page, representing a page 533 * directory. 534 */ 535 static __inline 536 vm_pindex_t 537 pmap_pd_index(vm_offset_t va) 538 { 539 return ((va >> PDPSHIFT) & ((1ul << NPDPEPGSHIFT) - 1)); 540 } 541 542 /* 543 * Returns the index of a pdp in the pml4 table, representing a page 544 * directory page. 545 */ 546 static __inline 547 vm_pindex_t 548 pmap_pdp_index(vm_offset_t va) 549 { 550 return ((va >> PML4SHIFT) & ((1ul << NPML4EPGSHIFT) - 1)); 551 } 552 553 /* 554 * Of all the layers (PTE, PT, PD, PDP, PML4) the best one to cache is 555 * the PT layer. This will speed up core pmap operations considerably. 556 * We also cache the PTE layer to (hopefully) improve relative lookup 557 * speeds. 558 * 559 * NOTE: The pmap spinlock does not need to be held but the passed-in pv 560 * must be in a known associated state (typically by being locked when 561 * the pmap spinlock isn't held). We allow the race for that case. 562 * 563 * NOTE: pm_pvhint* is only accessed (read) with the spin-lock held, using 564 * cpu_ccfence() to prevent compiler optimizations from reloading the 565 * field. 
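 *
 * For orientation, the pindex space used by the helpers above is laid
 * out linearly (summary, see the pmap_*_pindex() functions):
 *
 *	[0, NUPTE_TOTAL)		terminal PTEs
 *	[NUPTE_TOTAL, +NUPT_TOTAL)	page table (PT) pages
 *	[..., +NUPD_TOTAL)		page directory (PD) pages
 *	[..., +NUPDP_TOTAL)		PDP pages
 *	last index			the PML4 page itself
 *
 * pm_pvhint_pt only caches entries from the PT zone.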
566 */ 567 static __inline 568 void 569 pv_cache(pmap_t pmap, pv_entry_t pv, vm_pindex_t pindex) 570 { 571 if (pindex < pmap_pt_pindex(0)) { 572 ; 573 } else if (pindex < pmap_pd_pindex(0)) { 574 pmap->pm_pvhint_pt = pv; 575 } 576 } 577 578 /* 579 * Locate the requested pt_entry 580 */ 581 static __inline 582 pv_entry_t 583 pv_entry_lookup(pmap_t pmap, vm_pindex_t pindex) 584 { 585 pv_entry_t pv; 586 587 if (pindex < pmap_pt_pindex(0)) 588 return NULL; 589 #if 1 590 if (pindex < pmap_pd_pindex(0)) 591 pv = pmap->pm_pvhint_pt; 592 else 593 pv = NULL; 594 cpu_ccfence(); 595 if (pv == NULL || pv->pv_pmap != pmap) { 596 pv = pv_entry_rb_tree_RB_LOOKUP(&pmap->pm_pvroot, pindex); 597 if (pv) 598 pv_cache(pmap, pv, pindex); 599 } else if (pv->pv_pindex != pindex) { 600 pv = pv_entry_rb_tree_RB_LOOKUP_REL(&pmap->pm_pvroot, 601 pindex, pv); 602 if (pv) 603 pv_cache(pmap, pv, pindex); 604 } 605 #else 606 pv = pv_entry_rb_tree_RB_LOOKUP(&pmap->pm_pvroot, pindex); 607 #endif 608 return pv; 609 } 610 611 /* 612 * pmap_pte_quick: 613 * 614 * Super fast pmap_pte routine best used when scanning the pv lists. 615 * This eliminates many course-grained invltlb calls. Note that many of 616 * the pv list scans are across different pmaps and it is very wasteful 617 * to do an entire invltlb when checking a single mapping. 618 */ 619 static __inline pt_entry_t *pmap_pte(pmap_t pmap, vm_offset_t va); 620 621 static 622 pt_entry_t * 623 pmap_pte_quick(pmap_t pmap, vm_offset_t va) 624 { 625 return pmap_pte(pmap, va); 626 } 627 628 /* 629 * The placemarker hash must be broken up into four zones so lock 630 * ordering semantics continue to work (e.g. pte, pt, pd, then pdp). 631 * 632 * Placemarkers are used to 'lock' page table indices that do not have 633 * a pv_entry. This allows the pmap to support managed and unmanaged 634 * pages and shared page tables. 635 */ 636 #define PM_PLACE_BASE (PM_PLACEMARKS >> 2) 637 638 static __inline 639 vm_pindex_t * 640 pmap_placemarker_hash(pmap_t pmap, vm_pindex_t pindex) 641 { 642 int hi; 643 644 if (pindex < pmap_pt_pindex(0)) /* zone 0 - PTE */ 645 hi = 0; 646 else if (pindex < pmap_pd_pindex(0)) /* zone 1 - PT */ 647 hi = PM_PLACE_BASE; 648 else if (pindex < pmap_pdp_pindex(0)) /* zone 2 - PD */ 649 hi = PM_PLACE_BASE << 1; 650 else /* zone 3 - PDP (and PML4E) */ 651 hi = PM_PLACE_BASE | (PM_PLACE_BASE << 1); 652 hi += pindex & (PM_PLACE_BASE - 1); 653 654 return (&pmap->pm_placemarks[hi]); 655 } 656 657 658 /* 659 * Generic procedure to index a pte from a pt, pd, or pdp. 660 * 661 * NOTE: Normally passed pindex as pmap_xx_index(). pmap_xx_pindex() is NOT 662 * a page table page index but is instead of PV lookup index. 663 */ 664 static 665 void * 666 pv_pte_lookup(pv_entry_t pv, vm_pindex_t pindex) 667 { 668 pt_entry_t *pte; 669 670 pte = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pv->pv_m)); 671 return(&pte[pindex]); 672 } 673 674 /* 675 * Return pointer to PDP slot in the PML4 676 */ 677 static __inline 678 pml4_entry_t * 679 pmap_pdp(pmap_t pmap, vm_offset_t va) 680 { 681 return (&pmap->pm_pml4[pmap_pdp_index(va)]); 682 } 683 684 /* 685 * Return pointer to PD slot in the PDP given a pointer to the PDP 686 */ 687 static __inline 688 pdp_entry_t * 689 pmap_pdp_to_pd(pml4_entry_t pdp_pte, vm_offset_t va) 690 { 691 pdp_entry_t *pd; 692 693 pd = (pdp_entry_t *)PHYS_TO_DMAP(pdp_pte & PG_FRAME); 694 return (&pd[pmap_pd_index(va)]); 695 } 696 697 /* 698 * Return pointer to PD slot in the PDP. 
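 *
 * Together with pmap_pdp(), pmap_pdp_to_pd(), pmap_pd_to_pt() and
 * pmap_pt_to_pte(), this family implements a software walk of the
 * 4-level tree (pml4e -> pdpe -> pde -> pte), with each step mapping
 * the next level's page table page through PHYS_TO_DMAP().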
699 */ 700 static __inline 701 pdp_entry_t * 702 pmap_pd(pmap_t pmap, vm_offset_t va) 703 { 704 pml4_entry_t *pdp; 705 706 pdp = pmap_pdp(pmap, va); 707 if ((*pdp & pmap->pmap_bits[PG_V_IDX]) == 0) 708 return NULL; 709 return (pmap_pdp_to_pd(*pdp, va)); 710 } 711 712 /* 713 * Return pointer to PT slot in the PD given a pointer to the PD 714 */ 715 static __inline 716 pd_entry_t * 717 pmap_pd_to_pt(pdp_entry_t pd_pte, vm_offset_t va) 718 { 719 pd_entry_t *pt; 720 721 pt = (pd_entry_t *)PHYS_TO_DMAP(pd_pte & PG_FRAME); 722 return (&pt[pmap_pt_index(va)]); 723 } 724 725 /* 726 * Return pointer to PT slot in the PD 727 * 728 * SIMPLE PMAP NOTE: Simple pmaps (embedded in objects) do not have PDPs, 729 * so we cannot lookup the PD via the PDP. Instead we 730 * must look it up via the pmap. 731 */ 732 static __inline 733 pd_entry_t * 734 pmap_pt(pmap_t pmap, vm_offset_t va) 735 { 736 pdp_entry_t *pd; 737 pv_entry_t pv; 738 vm_pindex_t pd_pindex; 739 vm_paddr_t phys; 740 741 if (pmap->pm_flags & PMAP_FLAG_SIMPLE) { 742 pd_pindex = pmap_pd_pindex(va); 743 spin_lock_shared(&pmap->pm_spin); 744 pv = pv_entry_rb_tree_RB_LOOKUP(&pmap->pm_pvroot, pd_pindex); 745 if (pv == NULL || pv->pv_m == NULL) { 746 spin_unlock_shared(&pmap->pm_spin); 747 return NULL; 748 } 749 phys = VM_PAGE_TO_PHYS(pv->pv_m); 750 spin_unlock_shared(&pmap->pm_spin); 751 return (pmap_pd_to_pt(phys, va)); 752 } else { 753 pd = pmap_pd(pmap, va); 754 if (pd == NULL || (*pd & pmap->pmap_bits[PG_V_IDX]) == 0) 755 return NULL; 756 return (pmap_pd_to_pt(*pd, va)); 757 } 758 } 759 760 /* 761 * Return pointer to PTE slot in the PT given a pointer to the PT 762 */ 763 static __inline 764 pt_entry_t * 765 pmap_pt_to_pte(pd_entry_t pt_pte, vm_offset_t va) 766 { 767 pt_entry_t *pte; 768 769 pte = (pt_entry_t *)PHYS_TO_DMAP(pt_pte & PG_FRAME); 770 return (&pte[pmap_pte_index(va)]); 771 } 772 773 /* 774 * Return pointer to PTE slot in the PT 775 */ 776 static __inline 777 pt_entry_t * 778 pmap_pte(pmap_t pmap, vm_offset_t va) 779 { 780 pd_entry_t *pt; 781 782 pt = pmap_pt(pmap, va); 783 if (pt == NULL || (*pt & pmap->pmap_bits[PG_V_IDX]) == 0) 784 return NULL; 785 if ((*pt & pmap->pmap_bits[PG_PS_IDX]) != 0) 786 return ((pt_entry_t *)pt); 787 return (pmap_pt_to_pte(*pt, va)); 788 } 789 790 /* 791 * Return address of PT slot in PD (KVM only) 792 * 793 * Cannot be used for user page tables because it might interfere with 794 * the shared page-table-page optimization (pmap_mmu_optimize). 795 */ 796 static __inline 797 pd_entry_t * 798 vtopt(vm_offset_t va) 799 { 800 uint64_t mask = ((1ul << (NPDEPGSHIFT + NPDPEPGSHIFT + 801 NPML4EPGSHIFT)) - 1); 802 803 return (PDmap + ((va >> PDRSHIFT) & mask)); 804 } 805 806 /* 807 * KVM - return address of PTE slot in PT 808 */ 809 static __inline 810 pt_entry_t * 811 vtopte(vm_offset_t va) 812 { 813 uint64_t mask = ((1ul << (NPTEPGSHIFT + NPDEPGSHIFT + 814 NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1); 815 816 return (PTmap + ((va >> PAGE_SHIFT) & mask)); 817 } 818 819 /* 820 * Returns the physical address translation from va for a user address. 821 * (vm_paddr_t)-1 is returned on failure. 
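 *
 * Like vtopte() above, this indexes the self-referencing PTmap (see
 * the recursive PML4 entry installed in create_pagetables()):
 * PTmap + (va >> PAGE_SHIFT) is the virtual address of the PTE that
 * maps va in the currently installed page tables.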
822 */ 823 vm_paddr_t 824 uservtophys(vm_offset_t va) 825 { 826 uint64_t mask = ((1ul << (NPTEPGSHIFT + NPDEPGSHIFT + 827 NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1); 828 vm_paddr_t pa; 829 pt_entry_t pte; 830 pmap_t pmap; 831 832 pmap = vmspace_pmap(mycpu->gd_curthread->td_lwp->lwp_vmspace); 833 pa = (vm_paddr_t)-1; 834 if (va < VM_MAX_USER_ADDRESS) { 835 pte = kreadmem64(PTmap + ((va >> PAGE_SHIFT) & mask)); 836 if (pte & pmap->pmap_bits[PG_V_IDX]) 837 pa = (pte & PG_FRAME) | (va & PAGE_MASK); 838 } 839 return pa; 840 } 841 842 static uint64_t 843 allocpages(vm_paddr_t *firstaddr, long n) 844 { 845 uint64_t ret; 846 847 ret = *firstaddr; 848 bzero((void *)ret, n * PAGE_SIZE); 849 *firstaddr += n * PAGE_SIZE; 850 return (ret); 851 } 852 853 static 854 void 855 create_pagetables(vm_paddr_t *firstaddr) 856 { 857 long i; /* must be 64 bits */ 858 long nkpt_base; 859 long nkpt_phys; 860 long nkpd_phys; 861 int j; 862 863 /* 864 * We are running (mostly) V=P at this point 865 * 866 * Calculate how many 1GB PD entries in our PDP pages are needed 867 * for the DMAP. This is only allocated if the system does not 868 * support 1GB pages. Otherwise ndmpdp is simply a count of 869 * the number of 1G terminal entries in our PDP pages are needed. 870 * 871 * NOTE: Maxmem is in pages 872 */ 873 ndmpdp = (ptoa(Maxmem) + NBPDP - 1) >> PDPSHIFT; 874 if (ndmpdp < 4) /* Minimum 4GB of dirmap */ 875 ndmpdp = 4; 876 KKASSERT(ndmpdp <= NDMPML4E * NPML4EPG); 877 DMapMaxAddress = DMAP_MIN_ADDRESS + 878 ((ndmpdp * NPDEPG) << PDRSHIFT); 879 880 /* 881 * Starting at KERNBASE - map all 2G worth of page table pages. 882 * KERNBASE is offset -2G from the end of kvm. This will accomodate 883 * all KVM allocations above KERNBASE, including the SYSMAPs below. 884 * 885 * We do this by allocating 2*512 PT pages. Each PT page can map 886 * 2MB, for 2GB total. 887 */ 888 nkpt_base = (NPDPEPG - KPDPI) * NPTEPG; /* typically 2 x 512 */ 889 890 /* 891 * Starting at the beginning of kvm (VM_MIN_KERNEL_ADDRESS), 892 * Calculate how many page table pages we need to preallocate 893 * for early vm_map allocations. 894 * 895 * A few extra won't hurt, they will get used up in the running 896 * system. 897 * 898 * vm_page array 899 * initial pventry's 900 */ 901 nkpt_phys = (Maxmem * sizeof(struct vm_page) + NBPDR - 1) / NBPDR; 902 nkpt_phys += (Maxmem * sizeof(struct pv_entry) + NBPDR - 1) / NBPDR; 903 nkpt_phys += 128; /* a few extra */ 904 905 /* 906 * The highest value nkpd_phys can be set to is 907 * NKPDPE - (NPDPEPG - KPDPI) (i.e. NKPDPE - 2). 908 * 909 * Doing so would cause all PD pages to be pre-populated for 910 * a maximal KVM space (approximately 16*512 pages, or 32MB. 911 * We can save memory by not doing this. 912 */ 913 nkpd_phys = (nkpt_phys + NPDPEPG - 1) / NPDPEPG; 914 915 /* 916 * Allocate pages 917 * 918 * Normally NKPML4E=1-16 (1-16 kernel PDP page) 919 * Normally NKPDPE= NKPML4E*512-1 (511 min kernel PD pages) 920 * 921 * Only allocate enough PD pages 922 * NOTE: We allocate all kernel PD pages up-front, typically 923 * ~511G of KVM, requiring 511 PD pages. 924 */ 925 KPTbase = allocpages(firstaddr, nkpt_base); /* KERNBASE to end */ 926 KPTphys = allocpages(firstaddr, nkpt_phys); /* KVA start */ 927 KPML4phys = allocpages(firstaddr, 1); /* recursive PML4 map */ 928 KPDPphys = allocpages(firstaddr, NKPML4E); /* kernel PDP pages */ 929 KPDphys = allocpages(firstaddr, nkpd_phys); /* kernel PD pages */ 930 931 /* 932 * Alloc PD pages for the area starting at KERNBASE. 
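 *
 * (NPDPEPG - KPDPI is typically 2, so this is two PD pages; 2 x 512
 * PDEs x 2MB per PDE covers the 2GB KERNBASE region described above.)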
933 */ 934 KPDbase = allocpages(firstaddr, NPDPEPG - KPDPI); 935 936 /* 937 * Stuff for our DMAP. Use 2MB pages even when 1GB pages 938 * are available in order to allow APU code to adjust page 939 * attributes on a fixed grain (see pmap_change_attr()). 940 */ 941 DMPDPphys = allocpages(firstaddr, NDMPML4E); 942 #if 1 943 DMPDphys = allocpages(firstaddr, ndmpdp); 944 #else 945 if ((amd_feature & AMDID_PAGE1GB) == 0) 946 DMPDphys = allocpages(firstaddr, ndmpdp); 947 #endif 948 dmaplimit = (vm_paddr_t)ndmpdp << PDPSHIFT; 949 950 /* 951 * Fill in the underlying page table pages for the area around 952 * KERNBASE. This remaps low physical memory to KERNBASE. 953 * 954 * Read-only from zero to physfree 955 * XXX not fully used, underneath 2M pages 956 */ 957 for (i = 0; (i << PAGE_SHIFT) < *firstaddr; i++) { 958 ((pt_entry_t *)KPTbase)[i] = i << PAGE_SHIFT; 959 ((pt_entry_t *)KPTbase)[i] |= 960 pmap_bits_default[PG_RW_IDX] | 961 pmap_bits_default[PG_V_IDX] | 962 pmap_bits_default[PG_G_IDX]; 963 } 964 965 /* 966 * Now map the initial kernel page tables. One block of page 967 * tables is placed at the beginning of kernel virtual memory, 968 * and another block is placed at KERNBASE to map the kernel binary, 969 * data, bss, and initial pre-allocations. 970 */ 971 for (i = 0; i < nkpt_base; i++) { 972 ((pd_entry_t *)KPDbase)[i] = KPTbase + (i << PAGE_SHIFT); 973 ((pd_entry_t *)KPDbase)[i] |= 974 pmap_bits_default[PG_RW_IDX] | 975 pmap_bits_default[PG_V_IDX]; 976 } 977 for (i = 0; i < nkpt_phys; i++) { 978 ((pd_entry_t *)KPDphys)[i] = KPTphys + (i << PAGE_SHIFT); 979 ((pd_entry_t *)KPDphys)[i] |= 980 pmap_bits_default[PG_RW_IDX] | 981 pmap_bits_default[PG_V_IDX]; 982 } 983 984 /* 985 * Map from zero to end of allocations using 2M pages as an 986 * optimization. This will bypass some of the KPTBase pages 987 * above in the KERNBASE area. 988 */ 989 for (i = 0; (i << PDRSHIFT) < *firstaddr; i++) { 990 ((pd_entry_t *)KPDbase)[i] = i << PDRSHIFT; 991 ((pd_entry_t *)KPDbase)[i] |= 992 pmap_bits_default[PG_RW_IDX] | 993 pmap_bits_default[PG_V_IDX] | 994 pmap_bits_default[PG_PS_IDX] | 995 pmap_bits_default[PG_G_IDX]; 996 } 997 998 /* 999 * Load PD addresses into the PDP pages for primary KVA space to 1000 * cover existing page tables. PD's for KERNBASE are handled in 1001 * the next loop. 1002 * 1003 * expected to pre-populate all of its PDs. See NKPDPE in vmparam.h. 1004 */ 1005 for (i = 0; i < nkpd_phys; i++) { 1006 ((pdp_entry_t *)KPDPphys)[NKPML4E * NPDPEPG - NKPDPE + i] = 1007 KPDphys + (i << PAGE_SHIFT); 1008 ((pdp_entry_t *)KPDPphys)[NKPML4E * NPDPEPG - NKPDPE + i] |= 1009 pmap_bits_default[PG_RW_IDX] | 1010 pmap_bits_default[PG_V_IDX] | 1011 pmap_bits_default[PG_A_IDX]; 1012 } 1013 1014 /* 1015 * Load PDs for KERNBASE to the end 1016 */ 1017 i = (NKPML4E - 1) * NPDPEPG + KPDPI; 1018 for (j = 0; j < NPDPEPG - KPDPI; ++j) { 1019 ((pdp_entry_t *)KPDPphys)[i + j] = 1020 KPDbase + (j << PAGE_SHIFT); 1021 ((pdp_entry_t *)KPDPphys)[i + j] |= 1022 pmap_bits_default[PG_RW_IDX] | 1023 pmap_bits_default[PG_V_IDX] | 1024 pmap_bits_default[PG_A_IDX]; 1025 } 1026 1027 /* 1028 * Now set up the direct map space using either 2MB or 1GB pages 1029 * Preset PG_M and PG_A because demotion expects it. 1030 * 1031 * When filling in entries in the PD pages make sure any excess 1032 * entries are set to zero as we allocated enough PD pages 1033 * 1034 * Stuff for our DMAP. 
Use 2MB pages even when 1GB pages 1035 * are available in order to allow APU code to adjust page 1036 * attributes on a fixed grain (see pmap_change_attr()). 1037 */ 1038 #if 0 1039 if ((amd_feature & AMDID_PAGE1GB) == 0) 1040 #endif 1041 { 1042 /* 1043 * Use 2MB pages 1044 */ 1045 for (i = 0; i < NPDEPG * ndmpdp; i++) { 1046 ((pd_entry_t *)DMPDphys)[i] = i << PDRSHIFT; 1047 ((pd_entry_t *)DMPDphys)[i] |= 1048 pmap_bits_default[PG_RW_IDX] | 1049 pmap_bits_default[PG_V_IDX] | 1050 pmap_bits_default[PG_PS_IDX] | 1051 pmap_bits_default[PG_G_IDX] | 1052 pmap_bits_default[PG_M_IDX] | 1053 pmap_bits_default[PG_A_IDX]; 1054 } 1055 1056 /* 1057 * And the direct map space's PDP 1058 */ 1059 for (i = 0; i < ndmpdp; i++) { 1060 ((pdp_entry_t *)DMPDPphys)[i] = DMPDphys + 1061 (i << PAGE_SHIFT); 1062 ((pdp_entry_t *)DMPDPphys)[i] |= 1063 pmap_bits_default[PG_RW_IDX] | 1064 pmap_bits_default[PG_V_IDX] | 1065 pmap_bits_default[PG_A_IDX]; 1066 } 1067 } 1068 #if 0 1069 else { 1070 /* 1071 * 1GB pages 1072 */ 1073 for (i = 0; i < ndmpdp; i++) { 1074 ((pdp_entry_t *)DMPDPphys)[i] = 1075 (vm_paddr_t)i << PDPSHIFT; 1076 ((pdp_entry_t *)DMPDPphys)[i] |= 1077 pmap_bits_default[PG_RW_IDX] | 1078 pmap_bits_default[PG_V_IDX] | 1079 pmap_bits_default[PG_PS_IDX] | 1080 pmap_bits_default[PG_G_IDX] | 1081 pmap_bits_default[PG_M_IDX] | 1082 pmap_bits_default[PG_A_IDX]; 1083 } 1084 } 1085 #endif 1086 1087 /* And recursively map PML4 to itself in order to get PTmap */ 1088 ((pdp_entry_t *)KPML4phys)[PML4PML4I] = KPML4phys; 1089 ((pdp_entry_t *)KPML4phys)[PML4PML4I] |= 1090 pmap_bits_default[PG_RW_IDX] | 1091 pmap_bits_default[PG_V_IDX] | 1092 pmap_bits_default[PG_A_IDX]; 1093 1094 /* 1095 * Connect the Direct Map slots up to the PML4 1096 */ 1097 for (j = 0; j < NDMPML4E; ++j) { 1098 ((pdp_entry_t *)KPML4phys)[DMPML4I + j] = 1099 (DMPDPphys + ((vm_paddr_t)j << PAGE_SHIFT)) | 1100 pmap_bits_default[PG_RW_IDX] | 1101 pmap_bits_default[PG_V_IDX] | 1102 pmap_bits_default[PG_A_IDX]; 1103 } 1104 1105 /* 1106 * Connect the KVA slot up to the PML4 1107 */ 1108 for (j = 0; j < NKPML4E; ++j) { 1109 ((pdp_entry_t *)KPML4phys)[KPML4I + j] = 1110 KPDPphys + ((vm_paddr_t)j << PAGE_SHIFT); 1111 ((pdp_entry_t *)KPML4phys)[KPML4I + j] |= 1112 pmap_bits_default[PG_RW_IDX] | 1113 pmap_bits_default[PG_V_IDX] | 1114 pmap_bits_default[PG_A_IDX]; 1115 } 1116 cpu_mfence(); 1117 cpu_invltlb(); 1118 } 1119 1120 /* 1121 * Bootstrap the system enough to run with virtual memory. 1122 * 1123 * On x86_64 this is called after mapping has already been enabled 1124 * and just syncs the pmap module with what has already been done. 1125 * [We can't call it easily with mapping off since the kernel is not 1126 * mapped with PA == VA, hence we would have to relocate every address 1127 * from the linked base (virtual) address "KERNBASE" to the actual 1128 * (physical) address starting relative to 0] 1129 */ 1130 void 1131 pmap_bootstrap(vm_paddr_t *firstaddr) 1132 { 1133 vm_offset_t va; 1134 pt_entry_t *pte; 1135 int i; 1136 1137 KvaStart = VM_MIN_KERNEL_ADDRESS; 1138 KvaEnd = VM_MAX_KERNEL_ADDRESS; 1139 KvaSize = KvaEnd - KvaStart; 1140 1141 avail_start = *firstaddr; 1142 1143 /* 1144 * Create an initial set of page tables to run the kernel in. 
1145 */ 1146 create_pagetables(firstaddr); 1147 1148 virtual2_start = KvaStart; 1149 virtual2_end = PTOV_OFFSET; 1150 1151 virtual_start = (vm_offset_t) PTOV_OFFSET + *firstaddr; 1152 virtual_start = pmap_kmem_choose(virtual_start); 1153 1154 virtual_end = VM_MAX_KERNEL_ADDRESS; 1155 1156 /* XXX do %cr0 as well */ 1157 load_cr4(rcr4() | CR4_PGE | CR4_PSE); 1158 load_cr3(KPML4phys); 1159 1160 /* 1161 * Initialize protection array. 1162 */ 1163 x86_64_protection_init(); 1164 1165 /* 1166 * The kernel's pmap is statically allocated so we don't have to use 1167 * pmap_create, which is unlikely to work correctly at this part of 1168 * the boot sequence (XXX and which no longer exists). 1169 */ 1170 kernel_pmap.pm_pml4 = (pdp_entry_t *) (PTOV_OFFSET + KPML4phys); 1171 kernel_pmap.pm_count = 1; 1172 CPUMASK_ASSALLONES(kernel_pmap.pm_active); 1173 RB_INIT(&kernel_pmap.pm_pvroot); 1174 spin_init(&kernel_pmap.pm_spin, "pmapbootstrap"); 1175 for (i = 0; i < PM_PLACEMARKS; ++i) 1176 kernel_pmap.pm_placemarks[i] = PM_NOPLACEMARK; 1177 1178 /* 1179 * Reserve some special page table entries/VA space for temporary 1180 * mapping of pages. 1181 */ 1182 #define SYSMAP(c, p, v, n) \ 1183 v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n); 1184 1185 va = virtual_start; 1186 pte = vtopte(va); 1187 1188 /* 1189 * CMAP1/CMAP2 are used for zeroing and copying pages. 1190 */ 1191 SYSMAP(caddr_t, CMAP1, CADDR1, 1) 1192 1193 /* 1194 * Crashdump maps. 1195 */ 1196 SYSMAP(caddr_t, pt_crashdumpmap, crashdumpmap, MAXDUMPPGS); 1197 1198 /* 1199 * ptvmmap is used for reading arbitrary physical pages via 1200 * /dev/mem. 1201 */ 1202 SYSMAP(caddr_t, ptmmap, ptvmmap, 1) 1203 1204 /* 1205 * msgbufp is used to map the system message buffer. 1206 * XXX msgbufmap is not used. 1207 */ 1208 SYSMAP(struct msgbuf *, msgbufmap, msgbufp, 1209 atop(round_page(MSGBUF_SIZE))) 1210 1211 virtual_start = va; 1212 virtual_start = pmap_kmem_choose(virtual_start); 1213 1214 *CMAP1 = 0; 1215 1216 /* 1217 * PG_G is terribly broken on SMP because we IPI invltlb's in some 1218 * cases rather then invl1pg. Actually, I don't even know why it 1219 * works under UP because self-referential page table mappings 1220 */ 1221 // pgeflag = 0; 1222 1223 cpu_invltlb(); 1224 1225 /* Initialize the PAT MSR */ 1226 pmap_init_pat(); 1227 pmap_pinit_defaults(&kernel_pmap); 1228 1229 TUNABLE_INT_FETCH("machdep.pmap_fast_kernel_cpusync", 1230 &pmap_fast_kernel_cpusync); 1231 1232 } 1233 1234 /* 1235 * Setup the PAT MSR. 1236 */ 1237 void 1238 pmap_init_pat(void) 1239 { 1240 uint64_t pat_msr; 1241 u_long cr0, cr4; 1242 int i; 1243 1244 /* 1245 * Default values mapping PATi,PCD,PWT bits at system reset. 1246 * The default values effectively ignore the PATi bit by 1247 * repeating the encodings for 0-3 in 4-7, and map the PCD 1248 * and PWT bit combinations to the expected PAT types. 
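 *
 * Consumers do not use the MSR layout directly; the translation
 * computed below (pat_pte_index[], later copied into each pmap's
 * pmap_cache_bits_pte[] by pmap_pinit_defaults()) is simply OR'd into
 * a PTE, e.g. (sketch):
 *
 *	npte |= pmap->pmap_cache_bits_pte[m->pat_mode];
 *
 * as _pmap_qenter() does for the kernel_pmap.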
1249 */ 1250 pat_msr = PAT_VALUE(0, PAT_WRITE_BACK) | /* 000 */ 1251 PAT_VALUE(1, PAT_WRITE_THROUGH) | /* 001 */ 1252 PAT_VALUE(2, PAT_UNCACHED) | /* 010 */ 1253 PAT_VALUE(3, PAT_UNCACHEABLE) | /* 011 */ 1254 PAT_VALUE(4, PAT_WRITE_BACK) | /* 100 */ 1255 PAT_VALUE(5, PAT_WRITE_THROUGH) | /* 101 */ 1256 PAT_VALUE(6, PAT_UNCACHED) | /* 110 */ 1257 PAT_VALUE(7, PAT_UNCACHEABLE); /* 111 */ 1258 pat_pte_index[PAT_WRITE_BACK] = 0; 1259 pat_pte_index[PAT_WRITE_THROUGH]= 0 | X86_PG_NC_PWT; 1260 pat_pte_index[PAT_UNCACHED] = X86_PG_NC_PCD; 1261 pat_pte_index[PAT_UNCACHEABLE] = X86_PG_NC_PCD | X86_PG_NC_PWT; 1262 pat_pte_index[PAT_WRITE_PROTECTED] = pat_pte_index[PAT_UNCACHEABLE]; 1263 pat_pte_index[PAT_WRITE_COMBINING] = pat_pte_index[PAT_UNCACHEABLE]; 1264 1265 if (cpu_feature & CPUID_PAT) { 1266 /* 1267 * If we support the PAT then set-up entries for 1268 * WRITE_PROTECTED and WRITE_COMBINING using bit patterns 1269 * 5 and 6. 1270 */ 1271 pat_msr = (pat_msr & ~PAT_MASK(5)) | 1272 PAT_VALUE(5, PAT_WRITE_PROTECTED); 1273 pat_msr = (pat_msr & ~PAT_MASK(6)) | 1274 PAT_VALUE(6, PAT_WRITE_COMBINING); 1275 pat_pte_index[PAT_WRITE_PROTECTED] = X86_PG_PTE_PAT | X86_PG_NC_PWT; 1276 pat_pte_index[PAT_WRITE_COMBINING] = X86_PG_PTE_PAT | X86_PG_NC_PCD; 1277 1278 /* 1279 * Then enable the PAT 1280 */ 1281 1282 /* Disable PGE. */ 1283 cr4 = rcr4(); 1284 load_cr4(cr4 & ~CR4_PGE); 1285 1286 /* Disable caches (CD = 1, NW = 0). */ 1287 cr0 = rcr0(); 1288 load_cr0((cr0 & ~CR0_NW) | CR0_CD); 1289 1290 /* Flushes caches and TLBs. */ 1291 wbinvd(); 1292 cpu_invltlb(); 1293 1294 /* Update PAT and index table. */ 1295 wrmsr(MSR_PAT, pat_msr); 1296 1297 /* Flush caches and TLBs again. */ 1298 wbinvd(); 1299 cpu_invltlb(); 1300 1301 /* Restore caches and PGE. */ 1302 load_cr0(cr0); 1303 load_cr4(cr4); 1304 PatMsr = pat_msr; 1305 } 1306 1307 for (i = 0; i < 8; ++i) { 1308 pt_entry_t pte; 1309 1310 pte = pat_pte_index[i]; 1311 if (pte & X86_PG_PTE_PAT) { 1312 pte &= ~X86_PG_PTE_PAT; 1313 pte |= X86_PG_PDE_PAT; 1314 } 1315 pat_pde_index[i] = pte; 1316 } 1317 } 1318 1319 /* 1320 * Set 4mb pdir for mp startup 1321 */ 1322 void 1323 pmap_set_opt(void) 1324 { 1325 if (cpu_feature & CPUID_PSE) { 1326 load_cr4(rcr4() | CR4_PSE); 1327 if (mycpu->gd_cpuid == 0) /* only on BSP */ 1328 cpu_invltlb(); 1329 } 1330 1331 /* 1332 * Check for SMAP support and enable if available. Must be done 1333 * after cr3 is loaded, and on all cores. 1334 */ 1335 if (cpu_stdext_feature & CPUID_STDEXT_SMAP) { 1336 load_cr4(rcr4() | CR4_SMAP); 1337 } 1338 if (cpu_stdext_feature & CPUID_STDEXT_SMEP) { 1339 load_cr4(rcr4() | CR4_SMEP); 1340 } 1341 } 1342 1343 /* 1344 * Early initialization of the pmap module. 1345 * 1346 * Called by vm_init, to initialize any structures that the pmap 1347 * system needs to map virtual memory. pmap_init has been enhanced to 1348 * support in a fairly consistant way, discontiguous physical memory. 1349 */ 1350 void 1351 pmap_init(void) 1352 { 1353 vm_pindex_t initial_pvs; 1354 vm_pindex_t i; 1355 1356 /* 1357 * Allocate memory for random pmap data structures. Includes the 1358 * pv_head_table. 
 */
	for (i = 0; i < vm_page_array_size; i++) {
		vm_page_t m;

		m = &vm_page_array[i];
		m->md.pmap_count = 0;
		m->md.writeable_count = 0;
	}

	/*
	 * init the pv free list
	 */
	initial_pvs = vm_page_array_size;
	if (initial_pvs < MINPV)
		initial_pvs = MINPV;
	pvzone = &pvzone_store;
	pvinit = (void *)kmem_alloc(&kernel_map,
				    initial_pvs * sizeof (struct pv_entry),
				    VM_SUBSYS_PVENTRY);
	zbootinit(pvzone, "PV ENTRY", sizeof (struct pv_entry),
		  pvinit, initial_pvs);

	/*
	 * Now it is safe to enable pv_table recording.
	 */
	pmap_initialized = TRUE;
}

/*
 * Initialize the address space (zone) for the pv_entries.  Set a
 * high water mark so that the system can recover from excessive
 * numbers of pv entries.
 *
 * Also create the kernel page table template for isolated user
 * pmaps.
 */
static void pmap_init_iso_range(vm_offset_t base, size_t bytes);
static void pmap_init2_iso_pmap(void);
#if 0
static void dump_pmap(pmap_t pmap, pt_entry_t pte, int level, vm_offset_t base);
#endif

void
pmap_init2(void)
{
	vm_pindex_t entry_max;

	/*
	 * We can significantly reduce pv_entry_max from historical
	 * levels because pv_entry's are no longer used for PTEs at the
	 * leaves.  This prevents excessive pcpu caching on many-core
	 * boxes (even with the further '/ 16' done in zinitna()).
	 *
	 * Remember, however, that processes can share physical pages
	 * with each process still needing the pdp/pd/pt infrastructure
	 * (which still use pv_entry's).  And don't just assume that
	 * every PT will be completely filled up.  So don't make it
	 * too small.
	 */
	entry_max = maxproc * 32 + vm_page_array_size / 16;
	TUNABLE_LONG_FETCH("vm.pmap.pv_entries", &entry_max);
	vm_pmap_pv_entries = entry_max;

	/*
	 * Subtract out pages already installed in the zone (hack)
	 */
	if (entry_max <= MINPV)
		entry_max = MINPV;

	zinitna(pvzone, NULL, 0, entry_max, ZONE_INTERRUPT);

	/*
	 * Enable dynamic deletion of empty higher-level page table pages
	 * by default only if system memory is < 8GB (use 7GB for slop).
	 * This can save a little memory, but imposes significant
	 * performance overhead for things like bulk builds, and for programs
	 * which do a lot of memory mapping and memory unmapping.
	 */
#if 0
	if (pmap_dynamic_delete < 0) {
		if (vmstats.v_page_count < 7LL * 1024 * 1024 * 1024 / PAGE_SIZE)
			pmap_dynamic_delete = 1;
		else
			pmap_dynamic_delete = 0;
	}
#endif
	/*
	 * Disable so vm_map_backing iterations do not race
	 */
	pmap_dynamic_delete = 0;

	/*
	 * Automatic detection of Intel meltdown bug requiring user/kernel
	 * mmap isolation.
	 *
	 * Currently there are so many Intel cpu's impacted that it's better
	 * to whitelist future Intel CPUs.  Most? AMD cpus are not impacted
	 * so the default is off for AMD.
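 *
 * The detected default can be overridden from the loader, e.g. in
 * loader.conf (illustrative):
 *
 *	machdep.meltdown_mitigation="0"
 *
 * and the active setting is visible via the sysctl of the same name.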
1457 */ 1458 if (meltdown_mitigation < 0) { 1459 if (cpu_vendor_id == CPU_VENDOR_INTEL) 1460 meltdown_mitigation = 1; 1461 else 1462 meltdown_mitigation = 0; 1463 } 1464 if (meltdown_mitigation) { 1465 kprintf("machdep.meltdown_mitigation enabled to " 1466 "protect against (mostly Intel) meltdown bug\n"); 1467 kprintf("system call performance will be impacted\n"); 1468 } 1469 1470 pmap_init2_iso_pmap(); 1471 } 1472 1473 /* 1474 * Create the isolation pmap template. Once created, the template 1475 * is static and its PML4e entries are used to populate the 1476 * kernel portion of any isolated user pmaps. 1477 * 1478 * Our isolation pmap must contain: 1479 * (1) trampoline area for all cpus 1480 * (2) common_tss area for all cpus (its part of the trampoline area now) 1481 * (3) IDT for all cpus 1482 * (4) GDT for all cpus 1483 */ 1484 static void 1485 pmap_init2_iso_pmap(void) 1486 { 1487 int n; 1488 1489 if (bootverbose) 1490 kprintf("Initialize isolation pmap\n"); 1491 1492 /* 1493 * Try to use our normal API calls to make this easier. We have 1494 * to scrap the shadowed kernel PDPs pmap_pinit() creates for our 1495 * iso_pmap. 1496 */ 1497 pmap_pinit(&iso_pmap); 1498 bzero(iso_pmap.pm_pml4, PAGE_SIZE); 1499 1500 /* 1501 * Install areas needed by the cpu and trampoline. 1502 */ 1503 for (n = 0; n < ncpus; ++n) { 1504 struct privatespace *ps; 1505 1506 ps = CPU_prvspace[n]; 1507 pmap_init_iso_range((vm_offset_t)&ps->trampoline, 1508 sizeof(ps->trampoline)); 1509 pmap_init_iso_range((vm_offset_t)&ps->dblstack, 1510 sizeof(ps->dblstack)); 1511 pmap_init_iso_range((vm_offset_t)&ps->dbgstack, 1512 sizeof(ps->dbgstack)); 1513 pmap_init_iso_range((vm_offset_t)&ps->common_tss, 1514 sizeof(ps->common_tss)); 1515 pmap_init_iso_range(r_idt_arr[n].rd_base, 1516 r_idt_arr[n].rd_limit + 1); 1517 } 1518 pmap_init_iso_range((register_t)gdt, sizeof(gdt)); 1519 pmap_init_iso_range((vm_offset_t)(int *)btext, 1520 (vm_offset_t)(int *)etext - 1521 (vm_offset_t)(int *)btext); 1522 1523 #if 0 1524 kprintf("Dump iso_pmap:\n"); 1525 dump_pmap(&iso_pmap, vtophys(iso_pmap.pm_pml4), 0, 0); 1526 kprintf("\nDump kernel_pmap:\n"); 1527 dump_pmap(&kernel_pmap, vtophys(kernel_pmap.pm_pml4), 0, 0); 1528 #endif 1529 } 1530 1531 /* 1532 * This adds a kernel virtual address range to the isolation pmap. 
1533 */ 1534 static void 1535 pmap_init_iso_range(vm_offset_t base, size_t bytes) 1536 { 1537 pv_entry_t pv; 1538 pv_entry_t pvp; 1539 pt_entry_t *ptep; 1540 pt_entry_t pte; 1541 vm_offset_t va; 1542 1543 if (bootverbose) { 1544 kprintf("isolate %016jx-%016jx (%zd)\n", 1545 base, base + bytes, bytes); 1546 } 1547 va = base & ~(vm_offset_t)PAGE_MASK; 1548 while (va < base + bytes) { 1549 if ((va & PDRMASK) == 0 && va + NBPDR <= base + bytes && 1550 (ptep = pmap_pt(&kernel_pmap, va)) != NULL && 1551 (*ptep & kernel_pmap.pmap_bits[PG_V_IDX]) && 1552 (*ptep & kernel_pmap.pmap_bits[PG_PS_IDX])) { 1553 /* 1554 * Use 2MB pages if possible 1555 */ 1556 pte = *ptep; 1557 pv = pmap_allocpte(&iso_pmap, pmap_pd_pindex(va), &pvp); 1558 ptep = pv_pte_lookup(pv, (va >> PDRSHIFT) & 511); 1559 *ptep = pte; 1560 va += NBPDR; 1561 } else { 1562 /* 1563 * Otherwise use 4KB pages 1564 */ 1565 pv = pmap_allocpte(&iso_pmap, pmap_pt_pindex(va), &pvp); 1566 ptep = pv_pte_lookup(pv, (va >> PAGE_SHIFT) & 511); 1567 *ptep = vtophys(va) | kernel_pmap.pmap_bits[PG_RW_IDX] | 1568 kernel_pmap.pmap_bits[PG_V_IDX] | 1569 kernel_pmap.pmap_bits[PG_A_IDX] | 1570 kernel_pmap.pmap_bits[PG_M_IDX]; 1571 1572 va += PAGE_SIZE; 1573 } 1574 pv_put(pv); 1575 pv_put(pvp); 1576 } 1577 } 1578 1579 #if 0 1580 /* 1581 * Useful debugging pmap dumper, do not remove (#if 0 when not in use) 1582 */ 1583 static 1584 void 1585 dump_pmap(pmap_t pmap, pt_entry_t pte, int level, vm_offset_t base) 1586 { 1587 pt_entry_t *ptp; 1588 vm_offset_t incr; 1589 int i; 1590 1591 switch(level) { 1592 case 0: /* PML4e page, 512G entries */ 1593 incr = (1LL << 48) / 512; 1594 break; 1595 case 1: /* PDP page, 1G entries */ 1596 incr = (1LL << 39) / 512; 1597 break; 1598 case 2: /* PD page, 2MB entries */ 1599 incr = (1LL << 30) / 512; 1600 break; 1601 case 3: /* PT page, 4KB entries */ 1602 incr = (1LL << 21) / 512; 1603 break; 1604 default: 1605 incr = 0; 1606 break; 1607 } 1608 1609 if (level == 0) 1610 kprintf("cr3 %016jx @ va=%016jx\n", pte, base); 1611 ptp = (void *)PHYS_TO_DMAP(pte & ~(pt_entry_t)PAGE_MASK); 1612 for (i = 0; i < 512; ++i) { 1613 if (level == 0 && i == 128) 1614 base += 0xFFFF000000000000LLU; 1615 if (ptp[i]) { 1616 kprintf("%*.*s ", level * 4, level * 4, ""); 1617 if (level == 1 && (ptp[i] & 0x180) == 0x180) { 1618 kprintf("va=%016jx %3d term %016jx (1GB)\n", 1619 base, i, ptp[i]); 1620 } else if (level == 2 && (ptp[i] & 0x180) == 0x180) { 1621 kprintf("va=%016jx %3d term %016jx (2MB)\n", 1622 base, i, ptp[i]); 1623 } else if (level == 3) { 1624 kprintf("va=%016jx %3d term %016jx\n", 1625 base, i, ptp[i]); 1626 } else { 1627 kprintf("va=%016jx %3d deep %016jx\n", 1628 base, i, ptp[i]); 1629 dump_pmap(pmap, ptp[i], level + 1, base); 1630 } 1631 } 1632 base += incr; 1633 } 1634 } 1635 1636 #endif 1637 1638 /* 1639 * Typically used to initialize a fictitious page by vm/device_pager.c 1640 */ 1641 void 1642 pmap_page_init(struct vm_page *m) 1643 { 1644 vm_page_init(m); 1645 m->md.pmap_count = 0; 1646 m->md.writeable_count = 0; 1647 } 1648 1649 /*************************************************** 1650 * Low level helper routines..... 1651 ***************************************************/ 1652 1653 /* 1654 * Extract the physical page address associated with the map/VA pair. 1655 * The page must be wired for this to work reliably. 
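 *
 * When a handle pointer is supplied, the PT pv_entry backing the
 * translation is returned locked and must be released by the caller,
 * roughly (sketch):
 *
 *	void *handle;
 *	vm_paddr_t pa;
 *
 *	pa = pmap_extract(pmap, va, &handle);
 *	... use pa while the handle (pt_pv lock) is held ...
 *	pmap_extract_done(handle);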
 */
vm_paddr_t
pmap_extract(pmap_t pmap, vm_offset_t va, void **handlep)
{
	vm_paddr_t rtval;
	pv_entry_t pt_pv;
	pt_entry_t *ptep;

	rtval = 0;
	if (va >= VM_MAX_USER_ADDRESS) {
		/*
		 * Kernel page directories might be direct-mapped and
		 * there is typically no PV tracking of pte's
		 */
		pd_entry_t *pt;

		pt = pmap_pt(pmap, va);
		if (pt && (*pt & pmap->pmap_bits[PG_V_IDX])) {
			if (*pt & pmap->pmap_bits[PG_PS_IDX]) {
				rtval = *pt & PG_PS_FRAME;
				rtval |= va & PDRMASK;
			} else {
				ptep = pmap_pt_to_pte(*pt, va);
				if (*pt & pmap->pmap_bits[PG_V_IDX]) {
					rtval = *ptep & PG_FRAME;
					rtval |= va & PAGE_MASK;
				}
			}
		}
		if (handlep)
			*handlep = NULL;
	} else {
		/*
		 * User pages currently do not direct-map the page directory
		 * and some pages might not use managed PVs.  But all PT's
		 * will have a PV.
		 */
		pt_pv = pv_get(pmap, pmap_pt_pindex(va), NULL);
		if (pt_pv) {
			ptep = pv_pte_lookup(pt_pv, pmap_pte_index(va));
			if (*ptep & pmap->pmap_bits[PG_V_IDX]) {
				rtval = *ptep & PG_FRAME;
				rtval |= va & PAGE_MASK;
			}
			if (handlep)
				*handlep = pt_pv;	/* locked until done */
			else
				pv_put(pt_pv);
		} else if (handlep) {
			*handlep = NULL;
		}
	}
	return rtval;
}

void
pmap_extract_done(void *handle)
{
	if (handle)
		pv_put((pv_entry_t)handle);
}

/*
 * Similar to extract but checks protections, SMP-friendly short-cut for
 * vm_fault_page[_quick]().  Can return NULL to cause the caller to
 * fall-through to the real fault code.  Does not work with HVM page
 * tables.
 *
 * If busyp is NULL the returned page, if not NULL, is held (and not busied).
 *
 * If busyp is not NULL and this function sets *busyp non-zero, the returned
 * page is busied (and not held).
 *
 * If busyp is not NULL and this function sets *busyp to zero, the returned
 * page is held (and not busied).
 *
 * If VM_PROT_WRITE is set in prot, and the pte is already writable, the
 * returned page will be dirtied.  If the pte is not already writable NULL
 * is returned.  In other words, if the bit is set and a vm_page_t is
 * returned, any COW will already have happened and that page can be
 * written by the caller.
 *
 * WARNING! THE RETURNED PAGE IS ONLY HELD AND NOT SUITABLE FOR READING
 *	    OR WRITING AS-IS.
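 *
 * A caller that passes a busyp therefore has to distinguish the two
 * release paths, along these lines (sketch only):
 *
 *	int busy;
 *	vm_page_t m;
 *
 *	m = pmap_fault_page_quick(pmap, va, prot, &busy);
 *	if (m) {
 *		... use the page ...
 *		if (busy)
 *			vm_page_wakeup(m);	(returned busied)
 *		else
 *			vm_page_unhold(m);	(returned held)
 *	}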
1740 */ 1741 vm_page_t 1742 pmap_fault_page_quick(pmap_t pmap, vm_offset_t va, vm_prot_t prot, int *busyp) 1743 { 1744 if (pmap && 1745 va < VM_MAX_USER_ADDRESS && 1746 (pmap->pm_flags & PMAP_HVM) == 0) { 1747 pv_entry_t pt_pv; 1748 pv_entry_t pte_pv; 1749 pt_entry_t *ptep; 1750 pt_entry_t req; 1751 vm_page_t m; 1752 int error; 1753 1754 req = pmap->pmap_bits[PG_V_IDX] | 1755 pmap->pmap_bits[PG_U_IDX]; 1756 if (prot & VM_PROT_WRITE) 1757 req |= pmap->pmap_bits[PG_RW_IDX]; 1758 1759 pt_pv = pv_get(pmap, pmap_pt_pindex(va), NULL); 1760 if (pt_pv == NULL) 1761 return (NULL); 1762 ptep = pv_pte_lookup(pt_pv, pmap_pte_index(va)); 1763 if ((*ptep & req) != req) { 1764 pv_put(pt_pv); 1765 return (NULL); 1766 } 1767 pte_pv = pv_get_try(pmap, pmap_pte_pindex(va), NULL, &error); 1768 if (pte_pv && error == 0) { 1769 m = pte_pv->pv_m; 1770 if (prot & VM_PROT_WRITE) { 1771 /* interlocked by presence of pv_entry */ 1772 vm_page_dirty(m); 1773 } 1774 if (busyp) { 1775 if (prot & VM_PROT_WRITE) { 1776 if (vm_page_busy_try(m, TRUE)) 1777 m = NULL; 1778 *busyp = 1; 1779 } else { 1780 vm_page_hold(m); 1781 *busyp = 0; 1782 } 1783 } else { 1784 vm_page_hold(m); 1785 } 1786 pv_put(pte_pv); 1787 } else if (pte_pv) { 1788 pv_drop(pte_pv); 1789 m = NULL; 1790 } else { 1791 /* error, since we didn't request a placemarker */ 1792 m = NULL; 1793 } 1794 pv_put(pt_pv); 1795 return(m); 1796 } else { 1797 return(NULL); 1798 } 1799 } 1800 1801 /* 1802 * Extract the physical page address associated kernel virtual address. 1803 */ 1804 vm_paddr_t 1805 pmap_kextract(vm_offset_t va) 1806 { 1807 pd_entry_t pt; /* pt entry in pd */ 1808 vm_paddr_t pa; 1809 1810 if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) { 1811 pa = DMAP_TO_PHYS(va); 1812 } else { 1813 pt = *vtopt(va); 1814 if (pt & kernel_pmap.pmap_bits[PG_PS_IDX]) { 1815 pa = (pt & PG_PS_FRAME) | (va & PDRMASK); 1816 } else { 1817 /* 1818 * Beware of a concurrent promotion that changes the 1819 * PDE at this point! For example, vtopte() must not 1820 * be used to access the PTE because it would use the 1821 * new PDE. It is, however, safe to use the old PDE 1822 * because the page table page is preserved by the 1823 * promotion. 1824 */ 1825 pa = *pmap_pt_to_pte(pt, va); 1826 pa = (pa & PG_FRAME) | (va & PAGE_MASK); 1827 } 1828 } 1829 return pa; 1830 } 1831 1832 /*************************************************** 1833 * Low level mapping routines..... 1834 ***************************************************/ 1835 1836 /* 1837 * Routine: pmap_kenter 1838 * Function: 1839 * Add a wired page to the KVA 1840 * NOTE! note that in order for the mapping to take effect -- you 1841 * should do an invltlb after doing the pmap_kenter(). 1842 */ 1843 void 1844 pmap_kenter(vm_offset_t va, vm_paddr_t pa) 1845 { 1846 pt_entry_t *ptep; 1847 pt_entry_t npte; 1848 1849 npte = pa | 1850 kernel_pmap.pmap_bits[PG_RW_IDX] | 1851 kernel_pmap.pmap_bits[PG_V_IDX]; 1852 // pgeflag; 1853 ptep = vtopte(va); 1854 #if 1 1855 pmap_inval_smp(&kernel_pmap, va, 1, ptep, npte); 1856 #else 1857 /* FUTURE */ 1858 if (*ptep) 1859 pmap_inval_smp(&kernel_pmap, va, ptep, npte); 1860 else 1861 *ptep = npte; 1862 #endif 1863 } 1864 1865 /* 1866 * Similar to pmap_kenter(), except we only invalidate the mapping on the 1867 * current CPU. Returns 0 if the previous pte was 0, 1 if it wasn't 1868 * (caller can conditionalize calling smp_invltlb()). 
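 *
 * For example (sketch), a caller entering many pages might batch the
 * global invalidation:
 *
 *	int needs_inval = 0;
 *
 *	for (i = 0; i < n; ++i)
 *		needs_inval |= pmap_kenter_quick(va + i * PAGE_SIZE, pa[i]);
 *	if (needs_inval)
 *		smp_invltlb();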
1869 */ 1870 int 1871 pmap_kenter_quick(vm_offset_t va, vm_paddr_t pa) 1872 { 1873 pt_entry_t *ptep; 1874 pt_entry_t npte; 1875 int res; 1876 1877 npte = pa | kernel_pmap.pmap_bits[PG_RW_IDX] | 1878 kernel_pmap.pmap_bits[PG_V_IDX]; 1879 // npte |= pgeflag; 1880 ptep = vtopte(va); 1881 #if 1 1882 res = 1; 1883 #else 1884 /* FUTURE */ 1885 res = (*ptep != 0); 1886 #endif 1887 atomic_swap_long(ptep, npte); 1888 cpu_invlpg((void *)va); 1889 1890 return res; 1891 } 1892 1893 /* 1894 * Enter addresses into the kernel pmap but don't bother 1895 * doing any tlb invalidations. Caller will do a rollup 1896 * invalidation via pmap_rollup_inval(). 1897 */ 1898 int 1899 pmap_kenter_noinval(vm_offset_t va, vm_paddr_t pa) 1900 { 1901 pt_entry_t *ptep; 1902 pt_entry_t npte; 1903 int res; 1904 1905 npte = pa | 1906 kernel_pmap.pmap_bits[PG_RW_IDX] | 1907 kernel_pmap.pmap_bits[PG_V_IDX]; 1908 // pgeflag; 1909 ptep = vtopte(va); 1910 #if 1 1911 res = 1; 1912 #else 1913 /* FUTURE */ 1914 res = (*ptep != 0); 1915 #endif 1916 atomic_swap_long(ptep, npte); 1917 cpu_invlpg((void *)va); 1918 1919 return res; 1920 } 1921 1922 /* 1923 * remove a page from the kernel pagetables 1924 */ 1925 void 1926 pmap_kremove(vm_offset_t va) 1927 { 1928 pt_entry_t *ptep; 1929 1930 ptep = vtopte(va); 1931 pmap_inval_smp(&kernel_pmap, va, 1, ptep, 0); 1932 } 1933 1934 void 1935 pmap_kremove_quick(vm_offset_t va) 1936 { 1937 pt_entry_t *ptep; 1938 1939 ptep = vtopte(va); 1940 (void)pte_load_clear(ptep); 1941 cpu_invlpg((void *)va); 1942 } 1943 1944 /* 1945 * Remove addresses from the kernel pmap but don't bother 1946 * doing any tlb invalidations. Caller will do a rollup 1947 * invalidation via pmap_rollup_inval(). 1948 */ 1949 void 1950 pmap_kremove_noinval(vm_offset_t va) 1951 { 1952 pt_entry_t *ptep; 1953 1954 ptep = vtopte(va); 1955 (void)pte_load_clear(ptep); 1956 } 1957 1958 /* 1959 * XXX these need to be recoded. They are not used in any critical path. 1960 */ 1961 void 1962 pmap_kmodify_rw(vm_offset_t va) 1963 { 1964 atomic_set_long(vtopte(va), kernel_pmap.pmap_bits[PG_RW_IDX]); 1965 cpu_invlpg((void *)va); 1966 } 1967 1968 /* NOT USED 1969 void 1970 pmap_kmodify_nc(vm_offset_t va) 1971 { 1972 atomic_set_long(vtopte(va), PG_N); 1973 cpu_invlpg((void *)va); 1974 } 1975 */ 1976 1977 /* 1978 * Used to map a range of physical addresses into kernel virtual 1979 * address space during the low level boot, typically to map the 1980 * dump bitmap, message buffer, and vm_page_array. 1981 * 1982 * These mappings are typically made at some pointer after the end of the 1983 * kernel text+data. 1984 * 1985 * We could return PHYS_TO_DMAP(start) here and not allocate any 1986 * via (*virtp), but then kmem from userland and kernel dumps won't 1987 * have access to the related pointers. 1988 */ 1989 vm_offset_t 1990 pmap_map(vm_offset_t *virtp, vm_paddr_t start, vm_paddr_t end, int prot) 1991 { 1992 vm_offset_t va; 1993 vm_offset_t va_start; 1994 1995 /*return PHYS_TO_DMAP(start);*/ 1996 1997 va_start = *virtp; 1998 va = va_start; 1999 2000 while (start < end) { 2001 pmap_kenter_quick(va, start); 2002 va += PAGE_SIZE; 2003 start += PAGE_SIZE; 2004 } 2005 *virtp = va; 2006 return va_start; 2007 } 2008 2009 #define PMAP_CLFLUSH_THRESHOLD (2 * 1024 * 1024) 2010 2011 /* 2012 * Remove the specified set of pages from the data and instruction caches. 
2013 * 2014 * In contrast to pmap_invalidate_cache_range(), this function does not 2015 * rely on the CPU's self-snoop feature, because it is intended for use 2016 * when moving pages into a different cache domain. 2017 */ 2018 void 2019 pmap_invalidate_cache_pages(vm_page_t *pages, int count) 2020 { 2021 vm_offset_t daddr, eva; 2022 int i; 2023 2024 if (count >= PMAP_CLFLUSH_THRESHOLD / PAGE_SIZE || 2025 (cpu_feature & CPUID_CLFSH) == 0) 2026 wbinvd(); 2027 else { 2028 cpu_mfence(); 2029 for (i = 0; i < count; i++) { 2030 daddr = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pages[i])); 2031 eva = daddr + PAGE_SIZE; 2032 for (; daddr < eva; daddr += cpu_clflush_line_size) 2033 clflush(daddr); 2034 } 2035 cpu_mfence(); 2036 } 2037 } 2038 2039 void 2040 pmap_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva) 2041 { 2042 KASSERT((sva & PAGE_MASK) == 0, 2043 ("pmap_invalidate_cache_range: sva not page-aligned")); 2044 KASSERT((eva & PAGE_MASK) == 0, 2045 ("pmap_invalidate_cache_range: eva not page-aligned")); 2046 2047 if (cpu_feature & CPUID_SS) { 2048 ; /* If "Self Snoop" is supported, do nothing. */ 2049 } else { 2050 /* Globally invalidate caches */ 2051 cpu_wbinvd_on_all_cpus(); 2052 } 2053 } 2054 2055 /* 2056 * Invalidate the specified range of virtual memory on all cpus associated 2057 * with the pmap. 2058 */ 2059 void 2060 pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 2061 { 2062 pmap_inval_smp(pmap, sva, (eva - sva) >> PAGE_SHIFT, NULL, 0); 2063 } 2064 2065 /* 2066 * Add a list of wired pages to the kva. This routine is used for temporary 2067 * kernel mappings such as those found in buffer cache buffers. Page 2068 * modifications and accesses are not tracked or recorded. 2069 * 2070 * NOTE! Old mappings are simply overwritten, and we cannot assume relaxed 2071 * semantics as previous mappings may have been zeroed without any 2072 * invalidation. 2073 * 2074 * The pages *must* be wired. 2075 */ 2076 static __inline void 2077 _pmap_qenter(vm_offset_t beg_va, vm_page_t *m, int count, int doinval) 2078 { 2079 vm_offset_t end_va; 2080 vm_offset_t va; 2081 2082 end_va = beg_va + count * PAGE_SIZE; 2083 2084 for (va = beg_va; va < end_va; va += PAGE_SIZE) { 2085 pt_entry_t pte; 2086 pt_entry_t *ptep; 2087 2088 ptep = vtopte(va); 2089 pte = VM_PAGE_TO_PHYS(*m) | 2090 kernel_pmap.pmap_bits[PG_RW_IDX] | 2091 kernel_pmap.pmap_bits[PG_V_IDX] | 2092 kernel_pmap.pmap_cache_bits_pte[(*m)->pat_mode]; 2093 // pgeflag; 2094 atomic_swap_long(ptep, pte); 2095 m++; 2096 } 2097 if (doinval) 2098 pmap_invalidate_range(&kernel_pmap, beg_va, end_va); 2099 } 2100 2101 void 2102 pmap_qenter(vm_offset_t beg_va, vm_page_t *m, int count) 2103 { 2104 _pmap_qenter(beg_va, m, count, 1); 2105 } 2106 2107 void 2108 pmap_qenter_noinval(vm_offset_t beg_va, vm_page_t *m, int count) 2109 { 2110 _pmap_qenter(beg_va, m, count, 0); 2111 } 2112 2113 /* 2114 * This routine jerks page mappings from the kernel -- it is meant only 2115 * for temporary mappings such as those found in buffer cache buffers. 2116 * Modified and accessed status is not recorded.
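 *
 * Hypothetical pairing sketch ('kva', 'mpages', and 'count' are
 * placeholder names, not identifiers from this file):
 *
 *	pmap_qenter(kva, mpages, count);
 *	... access the pages through kva ...
 *	pmap_qremove(kva, count);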
2117 * 2118 * MPSAFE, INTERRUPT SAFE (cluster callback) 2119 */ 2120 void 2121 pmap_qremove(vm_offset_t beg_va, int count) 2122 { 2123 vm_offset_t end_va; 2124 vm_offset_t va; 2125 2126 end_va = beg_va + count * PAGE_SIZE; 2127 2128 for (va = beg_va; va < end_va; va += PAGE_SIZE) { 2129 pt_entry_t *pte; 2130 2131 pte = vtopte(va); 2132 (void)pte_load_clear(pte); 2133 cpu_invlpg((void *)va); 2134 } 2135 pmap_invalidate_range(&kernel_pmap, beg_va, end_va); 2136 } 2137 2138 /* 2139 * This routine removes temporary kernel mappings, only invalidating them 2140 * on the current cpu. It should only be used under carefully controlled 2141 * conditions. 2142 */ 2143 void 2144 pmap_qremove_quick(vm_offset_t beg_va, int count) 2145 { 2146 vm_offset_t end_va; 2147 vm_offset_t va; 2148 2149 end_va = beg_va + count * PAGE_SIZE; 2150 2151 for (va = beg_va; va < end_va; va += PAGE_SIZE) { 2152 pt_entry_t *pte; 2153 2154 pte = vtopte(va); 2155 (void)pte_load_clear(pte); 2156 cpu_invlpg((void *)va); 2157 } 2158 } 2159 2160 /* 2161 * This routine removes temporary kernel mappings *without* invalidating 2162 * the TLB. It can only be used on permanent kva reservations such as those 2163 * found in buffer cache buffers, under carefully controlled circumstances. 2164 * 2165 * NOTE: Repopulating these KVAs requires unconditional invalidation. 2166 * (pmap_qenter() does unconditional invalidation). 2167 */ 2168 void 2169 pmap_qremove_noinval(vm_offset_t beg_va, int count) 2170 { 2171 vm_offset_t end_va; 2172 vm_offset_t va; 2173 2174 end_va = beg_va + count * PAGE_SIZE; 2175 2176 for (va = beg_va; va < end_va; va += PAGE_SIZE) { 2177 pt_entry_t *pte; 2178 2179 pte = vtopte(va); 2180 (void)pte_load_clear(pte); 2181 } 2182 } 2183 2184 /* 2185 * Create a new thread and optionally associate it with a (new) process. 2186 * NOTE! the new thread's cpu may not equal the current cpu. 2187 */ 2188 void 2189 pmap_init_thread(thread_t td) 2190 { 2191 /* enforce pcb placement & alignment */ 2192 td->td_pcb = (struct pcb *)(td->td_kstack + td->td_kstack_size) - 1; 2193 td->td_pcb = (struct pcb *)((intptr_t)td->td_pcb & ~(intptr_t)0xF); 2194 td->td_savefpu = &td->td_pcb->pcb_save; 2195 td->td_sp = (char *)td->td_pcb; /* no -16 */ 2196 } 2197 2198 /* 2199 * This routine directly affects the fork perf for a process. 2200 */ 2201 void 2202 pmap_init_proc(struct proc *p) 2203 { 2204 } 2205 2206 static void 2207 pmap_pinit_defaults(struct pmap *pmap) 2208 { 2209 bcopy(pmap_bits_default, pmap->pmap_bits, 2210 sizeof(pmap_bits_default)); 2211 bcopy(protection_codes, pmap->protection_codes, 2212 sizeof(protection_codes)); 2213 bcopy(pat_pte_index, pmap->pmap_cache_bits_pte, 2214 sizeof(pat_pte_index)); 2215 bcopy(pat_pde_index, pmap->pmap_cache_bits_pde, 2216 sizeof(pat_pte_index)); 2217 pmap->pmap_cache_mask_pte = X86_PG_NC_PWT | X86_PG_NC_PCD | X86_PG_PTE_PAT; 2218 pmap->pmap_cache_mask_pde = X86_PG_NC_PWT | X86_PG_NC_PCD | X86_PG_PDE_PAT; 2219 pmap->copyinstr = std_copyinstr; 2220 pmap->copyin = std_copyin; 2221 pmap->copyout = std_copyout; 2222 pmap->fubyte = std_fubyte; 2223 pmap->subyte = std_subyte; 2224 pmap->fuword32 = std_fuword32; 2225 pmap->fuword64 = std_fuword64; 2226 pmap->suword32 = std_suword32; 2227 pmap->suword64 = std_suword64; 2228 pmap->swapu32 = std_swapu32; 2229 pmap->swapu64 = std_swapu64; 2230 pmap->fuwordadd32 = std_fuwordadd32; 2231 pmap->fuwordadd64 = std_fuwordadd64; 2232 } 2233 /* 2234 * Initialize pmap0/vmspace0. 
2235 * 2236 * On architectures where the kernel pmap is not integrated into the user 2237 * process pmap, this pmap represents the process pmap, not the kernel pmap. 2238 * kernel_pmap should be used to directly access the kernel_pmap. 2239 */ 2240 void 2241 pmap_pinit0(struct pmap *pmap) 2242 { 2243 int i; 2244 2245 pmap->pm_pml4 = (pml4_entry_t *)(PTOV_OFFSET + KPML4phys); 2246 pmap->pm_count = 1; 2247 CPUMASK_ASSZERO(pmap->pm_active); 2248 pmap->pm_pvhint_pt = NULL; 2249 pmap->pm_pvhint_unused = NULL; 2250 RB_INIT(&pmap->pm_pvroot); 2251 spin_init(&pmap->pm_spin, "pmapinit0"); 2252 for (i = 0; i < PM_PLACEMARKS; ++i) 2253 pmap->pm_placemarks[i] = PM_NOPLACEMARK; 2254 bzero(&pmap->pm_stats, sizeof pmap->pm_stats); 2255 pmap_pinit_defaults(pmap); 2256 } 2257 2258 /* 2259 * Initialize a preallocated and zeroed pmap structure, 2260 * such as one in a vmspace structure. 2261 */ 2262 static void 2263 pmap_pinit_simple(struct pmap *pmap) 2264 { 2265 int i; 2266 2267 /* 2268 * Misc initialization 2269 */ 2270 pmap->pm_count = 1; 2271 CPUMASK_ASSZERO(pmap->pm_active); 2272 pmap->pm_pvhint_pt = NULL; 2273 pmap->pm_pvhint_unused = NULL; 2274 pmap->pm_flags = PMAP_FLAG_SIMPLE; 2275 2276 pmap_pinit_defaults(pmap); 2277 2278 /* 2279 * Don't blow up locks/tokens on re-use (XXX fix/use drop code 2280 * for this). 2281 */ 2282 if (pmap->pm_pmlpv == NULL) { 2283 RB_INIT(&pmap->pm_pvroot); 2284 bzero(&pmap->pm_stats, sizeof pmap->pm_stats); 2285 spin_init(&pmap->pm_spin, "pmapinitsimple"); 2286 for (i = 0; i < PM_PLACEMARKS; ++i) 2287 pmap->pm_placemarks[i] = PM_NOPLACEMARK; 2288 } 2289 } 2290 2291 void 2292 pmap_pinit(struct pmap *pmap) 2293 { 2294 pv_entry_t pv; 2295 int j; 2296 2297 if (pmap->pm_pmlpv) { 2298 if (pmap->pmap_bits[TYPE_IDX] != REGULAR_PMAP) { 2299 pmap_puninit(pmap); 2300 } 2301 } 2302 2303 pmap_pinit_simple(pmap); 2304 pmap->pm_flags &= ~PMAP_FLAG_SIMPLE; 2305 2306 /* 2307 * No need to allocate page table space yet but we do need a valid 2308 * page directory table. 2309 */ 2310 if (pmap->pm_pml4 == NULL) { 2311 pmap->pm_pml4 = 2312 (pml4_entry_t *)kmem_alloc_pageable(&kernel_map, 2313 PAGE_SIZE * 2, 2314 VM_SUBSYS_PML4); 2315 pmap->pm_pml4_iso = (void *)((char *)pmap->pm_pml4 + PAGE_SIZE); 2316 } 2317 2318 /* 2319 * Allocate the PML4e table, which wires it even though it isn't 2320 * being entered into some higher level page table (it being the 2321 * highest level). If one is already cached we don't have to do 2322 * anything. 2323 */ 2324 if ((pv = pmap->pm_pmlpv) == NULL) { 2325 pv = pmap_allocpte(pmap, pmap_pml4_pindex(), NULL); 2326 pmap->pm_pmlpv = pv; 2327 pmap_kenter((vm_offset_t)pmap->pm_pml4, 2328 VM_PAGE_TO_PHYS(pv->pv_m)); 2329 pv_put(pv); 2330 2331 /* 2332 * Install DMAP and KMAP. 
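 *
 * (The NDMPML4E entries below point at the pre-built DMPDPphys
 * direct-map PDP pages and the NKPML4E entries at the KPDPphys kernel
 * PDP pages, so every pmap shares the same DMAP and kernel mappings.)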
2333 */ 2334 for (j = 0; j < NDMPML4E; ++j) { 2335 pmap->pm_pml4[DMPML4I + j] = 2336 (DMPDPphys + ((vm_paddr_t)j << PAGE_SHIFT)) | 2337 pmap->pmap_bits[PG_RW_IDX] | 2338 pmap->pmap_bits[PG_V_IDX] | 2339 pmap->pmap_bits[PG_A_IDX]; 2340 } 2341 for (j = 0; j < NKPML4E; ++j) { 2342 pmap->pm_pml4[KPML4I + j] = 2343 (KPDPphys + ((vm_paddr_t)j << PAGE_SHIFT)) | 2344 pmap->pmap_bits[PG_RW_IDX] | 2345 pmap->pmap_bits[PG_V_IDX] | 2346 pmap->pmap_bits[PG_A_IDX]; 2347 } 2348 2349 /* 2350 * install self-referential address mapping entry 2351 */ 2352 pmap->pm_pml4[PML4PML4I] = VM_PAGE_TO_PHYS(pv->pv_m) | 2353 pmap->pmap_bits[PG_V_IDX] | 2354 pmap->pmap_bits[PG_RW_IDX] | 2355 pmap->pmap_bits[PG_A_IDX]; 2356 } else { 2357 KKASSERT(pv->pv_m->flags & PG_MAPPED); 2358 KKASSERT(pv->pv_m->flags & PG_WRITEABLE); 2359 } 2360 KKASSERT(pmap->pm_pml4[255] == 0); 2361 2362 /* 2363 * When implementing an isolated userland pmap, a second PML4e table 2364 * is needed. We use pmap_pml4_pindex() + 1 for convenience, but 2365 * note that we do not operate on this table using our API functions 2366 * so handling of the + 1 case is mostly just to prevent implosions. 2367 * 2368 * We install an isolated version of the kernel PDPs into this 2369 * second PML4e table. The pmap code will mirror all user PDPs 2370 * between the primary and secondary PML4e table. 2371 */ 2372 if ((pv = pmap->pm_pmlpv_iso) == NULL && meltdown_mitigation && 2373 pmap != &iso_pmap) { 2374 pv = pmap_allocpte(pmap, pmap_pml4_pindex() + 1, NULL); 2375 pmap->pm_pmlpv_iso = pv; 2376 pmap_kenter((vm_offset_t)pmap->pm_pml4_iso, 2377 VM_PAGE_TO_PHYS(pv->pv_m)); 2378 pv_put(pv); 2379 2380 /* 2381 * Install an isolated version of the kernel pmap for 2382 * user consumption, using PDPs constructed in iso_pmap. 2383 */ 2384 for (j = 0; j < NKPML4E; ++j) { 2385 pmap->pm_pml4_iso[KPML4I + j] = 2386 iso_pmap.pm_pml4[KPML4I + j]; 2387 } 2388 } else if (pv) { 2389 KKASSERT(pv->pv_m->flags & PG_MAPPED); 2390 KKASSERT(pv->pv_m->flags & PG_WRITEABLE); 2391 } 2392 } 2393 2394 /* 2395 * Clean up a pmap structure so it can be physically freed. This routine 2396 * is called by the vmspace dtor function. A great deal of pmap data is 2397 * left passively mapped to improve vmspace management so we have a bit 2398 * of cleanup work to do here. 
2399 */ 2400 void 2401 pmap_puninit(pmap_t pmap) 2402 { 2403 pv_entry_t pv; 2404 vm_page_t p; 2405 2406 KKASSERT(CPUMASK_TESTZERO(pmap->pm_active)); 2407 if ((pv = pmap->pm_pmlpv) != NULL) { 2408 if (pv_hold_try(pv) == 0) 2409 pv_lock(pv); 2410 KKASSERT(pv == pmap->pm_pmlpv); 2411 p = pmap_remove_pv_page(pv); 2412 pv_free(pv, NULL); 2413 pv = NULL; /* safety */ 2414 pmap_kremove((vm_offset_t)pmap->pm_pml4); 2415 vm_page_busy_wait(p, FALSE, "pgpun"); 2416 KKASSERT(p->flags & PG_UNQUEUED); 2417 vm_page_unwire(p, 0); 2418 vm_page_flag_clear(p, PG_MAPPED | PG_WRITEABLE); 2419 vm_page_free(p); 2420 pmap->pm_pmlpv = NULL; 2421 } 2422 if ((pv = pmap->pm_pmlpv_iso) != NULL) { 2423 if (pv_hold_try(pv) == 0) 2424 pv_lock(pv); 2425 KKASSERT(pv == pmap->pm_pmlpv_iso); 2426 p = pmap_remove_pv_page(pv); 2427 pv_free(pv, NULL); 2428 pv = NULL; /* safety */ 2429 pmap_kremove((vm_offset_t)pmap->pm_pml4_iso); 2430 vm_page_busy_wait(p, FALSE, "pgpun"); 2431 KKASSERT(p->flags & PG_UNQUEUED); 2432 vm_page_unwire(p, 0); 2433 vm_page_flag_clear(p, PG_MAPPED | PG_WRITEABLE); 2434 vm_page_free(p); 2435 pmap->pm_pmlpv_iso = NULL; 2436 } 2437 if (pmap->pm_pml4) { 2438 KKASSERT(pmap->pm_pml4 != (void *)(PTOV_OFFSET + KPML4phys)); 2439 kmem_free(&kernel_map, 2440 (vm_offset_t)pmap->pm_pml4, PAGE_SIZE * 2); 2441 pmap->pm_pml4 = NULL; 2442 pmap->pm_pml4_iso = NULL; 2443 } 2444 KKASSERT(pmap->pm_stats.resident_count == 0); 2445 KKASSERT(pmap->pm_stats.wired_count == 0); 2446 } 2447 2448 /* 2449 * This function is now unused (used to add the pmap to the pmap_list) 2450 */ 2451 void 2452 pmap_pinit2(struct pmap *pmap) 2453 { 2454 } 2455 2456 /* 2457 * This routine is called when various levels in the page table need to 2458 * be populated. This routine cannot fail. 2459 * 2460 * This function returns two locked pv_entry's, one representing the 2461 * requested pv and one representing the requested pv's parent pv. If 2462 * an intermediate page table does not exist it will be created, mapped, 2463 * wired, and the parent page table will be given an additional hold 2464 * count representing the presence of the child pv_entry. 2465 */ 2466 static 2467 pv_entry_t 2468 pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, pv_entry_t *pvpp) 2469 { 2470 pt_entry_t *ptep; 2471 pt_entry_t *ptep_iso; 2472 pv_entry_t pv; 2473 pv_entry_t pvp; 2474 pt_entry_t v; 2475 vm_page_t m; 2476 int isnew; 2477 int ispt; 2478 2479 /* 2480 * If the pv already exists and we aren't being asked for the 2481 * parent page table page we can just return it. A locked+held pv 2482 * is returned. The pv will also have a second hold related to the 2483 * pmap association that we don't have to worry about. 2484 */ 2485 ispt = 0; 2486 pv = pv_alloc(pmap, ptepindex, &isnew); 2487 if (isnew == 0 && pvpp == NULL) 2488 return(pv); 2489 2490 /* 2491 * DragonFly doesn't use PV's to represent terminal PTEs any more. 2492 * The index range is still used for placemarkers, but not for 2493 * actual pv_entry's. 2494 */ 2495 KKASSERT(ptepindex >= pmap_pt_pindex(0)); 2496 2497 /* 2498 * Note that pt_pv's are only returned for user VAs. We assert that 2499 * a pt_pv is not being requested for kernel VAs. The kernel 2500 * pre-wires all higher-level page tables so don't overload managed 2501 * higher-level page tables on top of it! 2502 * 2503 * However, its convenient for us to allow the case when creating 2504 * iso_pmap. This is a bit of a hack but it simplifies iso_pmap 2505 * a lot. 2506 */ 2507 2508 /* 2509 * The kernel never uses managed PT/PD/PDP pages. 
2510 */ 2511 KKASSERT(pmap != &kernel_pmap); 2512 2513 /* 2514 * Non-terminal PVs allocate a VM page to represent the page table, 2515 * so we have to resolve pvp and calculate ptepindex for the pvp 2516 * and then for the page table entry index in the pvp for 2517 * fall-through. 2518 */ 2519 if (ptepindex < pmap_pd_pindex(0)) { 2520 /* 2521 * pv is PT, pvp is PD 2522 */ 2523 ptepindex = (ptepindex - pmap_pt_pindex(0)) >> NPDEPGSHIFT; 2524 ptepindex += NUPTE_TOTAL + NUPT_TOTAL; 2525 pvp = pmap_allocpte(pmap, ptepindex, NULL); 2526 2527 /* 2528 * PT index in PD 2529 */ 2530 ptepindex = pv->pv_pindex - pmap_pt_pindex(0); 2531 ptepindex &= ((1ul << NPDEPGSHIFT) - 1); 2532 ispt = 1; 2533 } else if (ptepindex < pmap_pdp_pindex(0)) { 2534 /* 2535 * pv is PD, pvp is PDP 2536 * 2537 * SIMPLE PMAP NOTE: Simple pmaps do not allocate above 2538 * the PD. 2539 */ 2540 ptepindex = (ptepindex - pmap_pd_pindex(0)) >> NPDPEPGSHIFT; 2541 ptepindex += NUPTE_TOTAL + NUPT_TOTAL + NUPD_TOTAL; 2542 2543 if (pmap->pm_flags & PMAP_FLAG_SIMPLE) { 2544 KKASSERT(pvpp == NULL); 2545 pvp = NULL; 2546 } else { 2547 pvp = pmap_allocpte(pmap, ptepindex, NULL); 2548 } 2549 2550 /* 2551 * PD index in PDP 2552 */ 2553 ptepindex = pv->pv_pindex - pmap_pd_pindex(0); 2554 ptepindex &= ((1ul << NPDPEPGSHIFT) - 1); 2555 } else if (ptepindex < pmap_pml4_pindex()) { 2556 /* 2557 * pv is PDP, pvp is the root pml4 table 2558 */ 2559 pvp = pmap_allocpte(pmap, pmap_pml4_pindex(), NULL); 2560 2561 /* 2562 * PDP index in PML4 2563 */ 2564 ptepindex = pv->pv_pindex - pmap_pdp_pindex(0); 2565 ptepindex &= ((1ul << NPML4EPGSHIFT) - 1); 2566 } else { 2567 /* 2568 * pv represents the top-level PML4, there is no parent. 2569 */ 2570 pvp = NULL; 2571 } 2572 2573 if (isnew == 0) 2574 goto notnew; 2575 2576 /* 2577 * (isnew) is TRUE, pv is not terminal. 2578 * 2579 * (1) Add a wire count to the parent page table (pvp). 2580 * (2) Allocate a VM page for the page table. 2581 * (3) Enter the VM page into the parent page table. 2582 * 2583 * page table pages are marked PG_WRITEABLE and PG_MAPPED. 2584 */ 2585 if (pvp) 2586 vm_page_wire_quick(pvp->pv_m); 2587 2588 for (;;) { 2589 m = vm_page_alloc(NULL, pv->pv_pindex, 2590 VM_ALLOC_NORMAL | VM_ALLOC_SYSTEM | 2591 VM_ALLOC_INTERRUPT); 2592 if (m) 2593 break; 2594 vm_wait(0); 2595 } 2596 vm_page_wire(m); /* wire for mapping in parent */ 2597 pmap_zero_page(VM_PAGE_TO_PHYS(m)); 2598 m->valid = VM_PAGE_BITS_ALL; 2599 vm_page_flag_set(m, PG_MAPPED | PG_WRITEABLE | PG_UNQUEUED); 2600 KKASSERT(m->queue == PQ_NONE); 2601 2602 pv->pv_m = m; 2603 2604 /* 2605 * (isnew) is TRUE, pv is not terminal. 2606 * 2607 * Wire the page into pvp. Bump the resident_count for the pmap. 2608 * There is no pvp for the top level, address the pm_pml4[] array 2609 * directly. 2610 * 2611 * If the caller wants the parent we return it, otherwise 2612 * we just put it away. 2613 * 2614 * No interlock is needed for pte 0 -> non-zero. 2615 * 2616 * In the situation where *ptep is valid we might have an unmanaged 2617 * page table page shared from another page table which we need to 2618 * unshare before installing our private page table page. 
2619 */ 2620 if (pvp) { 2621 v = VM_PAGE_TO_PHYS(m) | 2622 (pmap->pmap_bits[PG_RW_IDX] | 2623 pmap->pmap_bits[PG_V_IDX] | 2624 pmap->pmap_bits[PG_A_IDX]); 2625 if (ptepindex < NUPTE_USER) 2626 v |= pmap->pmap_bits[PG_U_IDX]; 2627 if (ptepindex < pmap_pt_pindex(0)) 2628 v |= pmap->pmap_bits[PG_M_IDX]; 2629 2630 ptep = pv_pte_lookup(pvp, ptepindex); 2631 if (pvp == pmap->pm_pmlpv && pmap->pm_pmlpv_iso) 2632 ptep_iso = pv_pte_lookup(pmap->pm_pmlpv_iso, ptepindex); 2633 else 2634 ptep_iso = NULL; 2635 if (*ptep & pmap->pmap_bits[PG_V_IDX]) { 2636 panic("pmap_allocpte: ptpte present without pv_entry!"); 2637 } else { 2638 pt_entry_t pte; 2639 2640 pte = atomic_swap_long(ptep, v); 2641 if (ptep_iso) 2642 atomic_swap_long(ptep_iso, v); 2643 if (pte != 0) { 2644 kprintf("install pgtbl mixup 0x%016jx " 2645 "old/new 0x%016jx/0x%016jx\n", 2646 (intmax_t)ptepindex, pte, v); 2647 } 2648 } 2649 } 2650 vm_page_wakeup(m); 2651 2652 /* 2653 * (isnew) may be TRUE or FALSE, pv may or may not be terminal. 2654 */ 2655 notnew: 2656 if (pvp) { 2657 KKASSERT(pvp->pv_m != NULL); 2658 ptep = pv_pte_lookup(pvp, ptepindex); 2659 v = VM_PAGE_TO_PHYS(pv->pv_m) | 2660 (pmap->pmap_bits[PG_RW_IDX] | 2661 pmap->pmap_bits[PG_V_IDX] | 2662 pmap->pmap_bits[PG_A_IDX]); 2663 if (ptepindex < NUPTE_USER) 2664 v |= pmap->pmap_bits[PG_U_IDX]; 2665 if (ptepindex < pmap_pt_pindex(0)) 2666 v |= pmap->pmap_bits[PG_M_IDX]; 2667 if (*ptep != v) { 2668 kprintf("mismatched upper level pt %016jx/%016jx\n", 2669 *ptep, v); 2670 } 2671 } 2672 if (pvpp) 2673 *pvpp = pvp; 2674 else if (pvp) 2675 pv_put(pvp); 2676 return (pv); 2677 } 2678 2679 /* 2680 * Release any resources held by the given physical map. 2681 * 2682 * Called when a pmap initialized by pmap_pinit is being released. Should 2683 * only be called if the map contains no valid mappings. 2684 */ 2685 struct pmap_release_info { 2686 pmap_t pmap; 2687 int retry; 2688 pv_entry_t pvp; 2689 }; 2690 2691 static int pmap_release_callback(pv_entry_t pv, void *data); 2692 2693 void 2694 pmap_release(struct pmap *pmap) 2695 { 2696 struct pmap_release_info info; 2697 2698 KASSERT(CPUMASK_TESTZERO(pmap->pm_active), 2699 ("pmap still active! %016jx", 2700 (uintmax_t)CPUMASK_LOWMASK(pmap->pm_active))); 2701 2702 /* 2703 * There is no longer a pmap_list, if there were we would remove the 2704 * pmap from it here. 2705 */ 2706 2707 /* 2708 * Pull pv's off the RB tree in order from low to high and release 2709 * each page. 2710 */ 2711 info.pmap = pmap; 2712 do { 2713 info.retry = 0; 2714 info.pvp = NULL; 2715 2716 spin_lock(&pmap->pm_spin); 2717 RB_SCAN(pv_entry_rb_tree, &pmap->pm_pvroot, NULL, 2718 pmap_release_callback, &info); 2719 spin_unlock(&pmap->pm_spin); 2720 2721 if (info.pvp) 2722 pv_put(info.pvp); 2723 } while (info.retry); 2724 2725 2726 /* 2727 * One resident page (the pml4 page) should remain. Two if 2728 * the pmap has implemented an isolated userland PML4E table. 2729 * No wired pages should remain. 
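 *
 * (Concretely, expected_res computed below ends up 0 for a
 * PMAP_FLAG_SIMPLE pmap, 1 for a normal pmap, and 2 when an isolated
 * userland PML4E table is also present.)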
2730 */ 2731 int expected_res = 0; 2732 2733 if ((pmap->pm_flags & PMAP_FLAG_SIMPLE) == 0) 2734 ++expected_res; 2735 if (pmap->pm_pmlpv_iso) 2736 ++expected_res; 2737 2738 #if 1 2739 if (pmap->pm_stats.resident_count != expected_res || 2740 pmap->pm_stats.wired_count != 0) { 2741 kprintf("fatal pmap problem - pmap %p flags %08x " 2742 "rescnt=%jd wirecnt=%jd\n", 2743 pmap, 2744 pmap->pm_flags, 2745 pmap->pm_stats.resident_count, 2746 pmap->pm_stats.wired_count); 2747 tsleep(pmap, 0, "DEAD", 0); 2748 } 2749 #else 2750 KKASSERT(pmap->pm_stats.resident_count == expected_res); 2751 KKASSERT(pmap->pm_stats.wired_count == 0); 2752 #endif 2753 } 2754 2755 /* 2756 * Called from low to high. We must cache the proper parent pv so we 2757 * can adjust its wired count. 2758 */ 2759 static int 2760 pmap_release_callback(pv_entry_t pv, void *data) 2761 { 2762 struct pmap_release_info *info = data; 2763 pmap_t pmap = info->pmap; 2764 vm_pindex_t pindex; 2765 int r; 2766 2767 /* 2768 * Acquire a held and locked pv, check for release race 2769 */ 2770 pindex = pv->pv_pindex; 2771 if (info->pvp == pv) { 2772 spin_unlock(&pmap->pm_spin); 2773 info->pvp = NULL; 2774 } else if (pv_hold_try(pv)) { 2775 spin_unlock(&pmap->pm_spin); 2776 } else { 2777 spin_unlock(&pmap->pm_spin); 2778 pv_lock(pv); 2779 pv_put(pv); 2780 info->retry = 1; 2781 spin_lock(&pmap->pm_spin); 2782 2783 return -1; 2784 } 2785 KKASSERT(pv->pv_pmap == pmap && pindex == pv->pv_pindex); 2786 2787 if (pv->pv_pindex < pmap_pt_pindex(0)) { 2788 /* 2789 * I am PTE, parent is PT 2790 */ 2791 pindex = pv->pv_pindex >> NPTEPGSHIFT; 2792 pindex += NUPTE_TOTAL; 2793 } else if (pv->pv_pindex < pmap_pd_pindex(0)) { 2794 /* 2795 * I am PT, parent is PD 2796 */ 2797 pindex = (pv->pv_pindex - NUPTE_TOTAL) >> NPDEPGSHIFT; 2798 pindex += NUPTE_TOTAL + NUPT_TOTAL; 2799 } else if (pv->pv_pindex < pmap_pdp_pindex(0)) { 2800 /* 2801 * I am PD, parent is PDP 2802 */ 2803 pindex = (pv->pv_pindex - NUPTE_TOTAL - NUPT_TOTAL) >> 2804 NPDPEPGSHIFT; 2805 pindex += NUPTE_TOTAL + NUPT_TOTAL + NUPD_TOTAL; 2806 } else if (pv->pv_pindex < pmap_pml4_pindex()) { 2807 /* 2808 * I am PDP, parent is PML4. We always calculate the 2809 * normal PML4 here, not the isolated PML4. 2810 */ 2811 pindex = pmap_pml4_pindex(); 2812 } else { 2813 /* 2814 * parent is NULL 2815 */ 2816 if (info->pvp) { 2817 pv_put(info->pvp); 2818 info->pvp = NULL; 2819 } 2820 pindex = 0; 2821 } 2822 if (pindex) { 2823 if (info->pvp && info->pvp->pv_pindex != pindex) { 2824 pv_put(info->pvp); 2825 info->pvp = NULL; 2826 } 2827 if (info->pvp == NULL) 2828 info->pvp = pv_get(pmap, pindex, NULL); 2829 } else { 2830 if (info->pvp) { 2831 pv_put(info->pvp); 2832 info->pvp = NULL; 2833 } 2834 } 2835 r = pmap_release_pv(pv, info->pvp, NULL); 2836 spin_lock(&pmap->pm_spin); 2837 2838 return(r); 2839 } 2840 2841 /* 2842 * Called with held (i.e. also locked) pv. This function will dispose of 2843 * the lock along with the pv. 2844 * 2845 * If the caller already holds the locked parent page table for pv it 2846 * must pass it as pvp, allowing us to avoid a deadlock, else it can 2847 * pass NULL for pvp. 2848 */ 2849 static int 2850 pmap_release_pv(pv_entry_t pv, pv_entry_t pvp, pmap_inval_bulk_t *bulk) 2851 { 2852 vm_page_t p; 2853 2854 /* 2855 * The pmap is currently not spinlocked, pv is held+locked. 2856 * Remove the pv's page from its parent's page table. The 2857 * parent's page table page's wire_count will be decremented. 2858 * 2859 * This will clean out the pte at any level of the page table. 
2860 * If smp != 0 all cpus are affected. 2861 * 2862 * Do not tear-down recursively, it's faster to just let the 2863 * release run its course. 2864 */ 2865 pmap_remove_pv_pte(pv, pvp, bulk, 0); 2866 2867 /* 2868 * Terminal pvs are unhooked from their vm_pages. Because 2869 * terminal pages aren't page table pages they aren't wired 2870 * by us, so we have to be sure not to unwire them either. 2871 */ 2872 if (pv->pv_pindex < pmap_pt_pindex(0)) { 2873 pmap_remove_pv_page(pv); 2874 goto skip; 2875 } 2876 2877 /* 2878 * We leave the top-level page table page cached, wired, and 2879 * mapped in the pmap until the dtor function (pmap_puninit()) 2880 * gets called. 2881 * 2882 * Since we are leaving the top-level pv intact we need 2883 * to break out of what would otherwise be an infinite loop. 2884 * 2885 * This covers both the normal and the isolated PML4 page. 2886 */ 2887 if (pv->pv_pindex >= pmap_pml4_pindex()) { 2888 pv_put(pv); 2889 return(-1); 2890 } 2891 2892 /* 2893 * For page table pages (other than the top-level page), 2894 * remove and free the vm_page. The representative mapping 2895 * removed above by pmap_remove_pv_pte() did not undo the 2896 * last wire_count so we have to do that as well. 2897 */ 2898 p = pmap_remove_pv_page(pv); 2899 vm_page_busy_wait(p, FALSE, "pmaprl"); 2900 if (p->wire_count != 1) { 2901 const char *tstr; 2902 2903 if (pv->pv_pindex >= pmap_pdp_pindex(0)) 2904 tstr = "PDP"; 2905 else if (pv->pv_pindex >= pmap_pd_pindex(0)) 2906 tstr = "PD"; 2907 else if (pv->pv_pindex >= pmap_pt_pindex(0)) 2908 tstr = "PT"; 2909 else 2910 tstr = "PTE"; 2911 2912 kprintf("p(%s) p->wire_count was %016lx %d\n", 2913 tstr, pv->pv_pindex, p->wire_count); 2914 } 2915 KKASSERT(p->wire_count == 1); 2916 KKASSERT(p->flags & PG_UNQUEUED); 2917 2918 vm_page_unwire(p, 0); 2919 KKASSERT(p->wire_count == 0); 2920 2921 vm_page_free(p); 2922 skip: 2923 pv_free(pv, pvp); 2924 2925 return 0; 2926 } 2927 2928 /* 2929 * This function will remove the pte associated with a pv from its parent. 2930 * Terminal pv's are supported. All cpus specified by (bulk) are properly 2931 * invalidated. 2932 * 2933 * The wire count will be dropped on the parent page table. The wire 2934 * count on the page being removed (pv->pv_m) from the parent page table 2935 * is NOT touched. Note that terminal pages will not have any additional 2936 * wire counts while page table pages will have at least one representing 2937 * the mapping, plus others representing sub-mappings. 2938 * 2939 * NOTE: Cannot be called on kernel page table pages, only KVM terminal 2940 * pages and user page table and terminal pages. 2941 * 2942 * NOTE: The pte being removed might be unmanaged, and the pv supplied might 2943 * be freshly allocated and not imply that the pte is managed. In this 2944 * case pv->pv_m should be NULL. 2945 * 2946 * The pv must be locked. The pvp, if supplied, must be locked. All 2947 * supplied pv's will remain locked on return. 2948 * 2949 * XXX must lock parent pv's if they exist to remove pte XXX 2950 */ 2951 static 2952 void 2953 pmap_remove_pv_pte(pv_entry_t pv, pv_entry_t pvp, pmap_inval_bulk_t *bulk, 2954 int destroy) 2955 { 2956 vm_pindex_t ptepindex = pv->pv_pindex; 2957 pmap_t pmap = pv->pv_pmap; 2958 vm_page_t p; 2959 int gotpvp = 0; 2960 2961 KKASSERT(pmap); 2962 2963 if (ptepindex >= pmap_pml4_pindex()) { 2964 /* 2965 * We are the top level PML4E table, there is no parent. 2966 * 2967 * This is either the normal or isolated PML4E table.
2968 * Only the normal is used in regular operation, the isolated 2969 * is only passed in when breaking down the whole pmap. 2970 */ 2971 p = pmap->pm_pmlpv->pv_m; 2972 KKASSERT(pv->pv_m == p); /* debugging */ 2973 } else if (ptepindex >= pmap_pdp_pindex(0)) { 2974 /* 2975 * Remove a PDP page from the PML4E. This can only occur 2976 * with user page tables. We do not have to lock the 2977 * pml4 PV so just ignore pvp. 2978 */ 2979 vm_pindex_t pml4_pindex; 2980 vm_pindex_t pdp_index; 2981 pml4_entry_t *pdp; 2982 pml4_entry_t *pdp_iso; 2983 2984 pdp_index = ptepindex - pmap_pdp_pindex(0); 2985 if (pvp == NULL) { 2986 pml4_pindex = pmap_pml4_pindex(); 2987 pvp = pv_get(pv->pv_pmap, pml4_pindex, NULL); 2988 KKASSERT(pvp); 2989 gotpvp = 1; 2990 } 2991 2992 pdp = &pmap->pm_pml4[pdp_index & ((1ul << NPML4EPGSHIFT) - 1)]; 2993 KKASSERT((*pdp & pmap->pmap_bits[PG_V_IDX]) != 0); 2994 p = PHYS_TO_VM_PAGE(*pdp & PG_FRAME); 2995 pmap_inval_bulk(bulk, (vm_offset_t)-1, pdp, 0); 2996 2997 /* 2998 * Also remove the PDP from the isolated PML4E if the 2999 * process uses one. 3000 */ 3001 if (pvp == pmap->pm_pmlpv && pmap->pm_pmlpv_iso) { 3002 pdp_iso = &pmap->pm_pml4_iso[pdp_index & 3003 ((1ul << NPML4EPGSHIFT) - 1)]; 3004 pmap_inval_bulk(bulk, (vm_offset_t)-1, pdp_iso, 0); 3005 } 3006 KKASSERT(pv->pv_m == p); /* debugging */ 3007 } else if (ptepindex >= pmap_pd_pindex(0)) { 3008 /* 3009 * Remove a PD page from the PDP 3010 * 3011 * SIMPLE PMAP NOTE: Non-existant pvp's are ok in the case 3012 * of a simple pmap because it stops at 3013 * the PD page. 3014 */ 3015 vm_pindex_t pdp_pindex; 3016 vm_pindex_t pd_index; 3017 pdp_entry_t *pd; 3018 3019 pd_index = ptepindex - pmap_pd_pindex(0); 3020 3021 if (pvp == NULL) { 3022 pdp_pindex = NUPTE_TOTAL + NUPT_TOTAL + NUPD_TOTAL + 3023 (pd_index >> NPML4EPGSHIFT); 3024 pvp = pv_get(pv->pv_pmap, pdp_pindex, NULL); 3025 gotpvp = 1; 3026 } 3027 3028 if (pvp) { 3029 pd = pv_pte_lookup(pvp, pd_index & 3030 ((1ul << NPDPEPGSHIFT) - 1)); 3031 KKASSERT((*pd & pmap->pmap_bits[PG_V_IDX]) != 0); 3032 p = PHYS_TO_VM_PAGE(*pd & PG_FRAME); 3033 pmap_inval_bulk(bulk, (vm_offset_t)-1, pd, 0); 3034 } else { 3035 KKASSERT(pmap->pm_flags & PMAP_FLAG_SIMPLE); 3036 p = pv->pv_m; /* degenerate test later */ 3037 } 3038 KKASSERT(pv->pv_m == p); /* debugging */ 3039 } else if (ptepindex >= pmap_pt_pindex(0)) { 3040 /* 3041 * Remove a PT page from the PD 3042 */ 3043 vm_pindex_t pd_pindex; 3044 vm_pindex_t pt_index; 3045 pd_entry_t *pt; 3046 3047 pt_index = ptepindex - pmap_pt_pindex(0); 3048 3049 if (pvp == NULL) { 3050 pd_pindex = NUPTE_TOTAL + NUPT_TOTAL + 3051 (pt_index >> NPDPEPGSHIFT); 3052 pvp = pv_get(pv->pv_pmap, pd_pindex, NULL); 3053 KKASSERT(pvp); 3054 gotpvp = 1; 3055 } 3056 3057 pt = pv_pte_lookup(pvp, pt_index & ((1ul << NPDPEPGSHIFT) - 1)); 3058 #if 0 3059 KASSERT((*pt & pmap->pmap_bits[PG_V_IDX]) != 0, 3060 ("*pt unexpectedly invalid %016jx " 3061 "gotpvp=%d ptepindex=%ld ptindex=%ld pv=%p pvp=%p", 3062 *pt, gotpvp, ptepindex, pt_index, pv, pvp)); 3063 p = PHYS_TO_VM_PAGE(*pt & PG_FRAME); 3064 #else 3065 if ((*pt & pmap->pmap_bits[PG_V_IDX]) == 0) { 3066 kprintf("*pt unexpectedly invalid %016jx " 3067 "gotpvp=%d ptepindex=%ld ptindex=%ld " 3068 "pv=%p pvp=%p\n", 3069 *pt, gotpvp, ptepindex, pt_index, pv, pvp); 3070 tsleep(pt, 0, "DEAD", 0); 3071 p = pv->pv_m; 3072 } else { 3073 p = PHYS_TO_VM_PAGE(*pt & PG_FRAME); 3074 } 3075 #endif 3076 pmap_inval_bulk(bulk, (vm_offset_t)-1, pt, 0); 3077 KKASSERT(pv->pv_m == p); /* debugging */ 3078 } else { 3079 KKASSERT(0); 3080 } 3081 
3082 /* 3083 * If requested, scrap the underlying pv->pv_m and the underlying 3084 * pv. If this is a page-table-page we must also free the page. 3085 * 3086 * pvp must be returned locked. 3087 */ 3088 if (destroy == 1) { 3089 /* 3090 * page table page (PT, PD, PDP, PML4), caller was responsible 3091 * for testing wired_count. 3092 */ 3093 KKASSERT(pv->pv_m->wire_count == 1); 3094 p = pmap_remove_pv_page(pv); 3095 pv_free(pv, pvp); 3096 pv = NULL; 3097 3098 vm_page_busy_wait(p, FALSE, "pgpun"); 3099 vm_page_unwire(p, 0); 3100 vm_page_flag_clear(p, PG_MAPPED | PG_WRITEABLE); 3101 vm_page_free(p); 3102 } else if (destroy == 2) { 3103 /* 3104 * Normal page, remove from pmap and leave the underlying 3105 * page untouched. 3106 */ 3107 pmap_remove_pv_page(pv); 3108 pv_free(pv, pvp); 3109 pv = NULL; /* safety */ 3110 } 3111 3112 /* 3113 * If we acquired pvp ourselves then we are responsible for 3114 * recursively deleting it. 3115 */ 3116 if (pvp && gotpvp) { 3117 /* 3118 * Recursively destroy higher-level page tables. 3119 * 3120 * This is optional. If we do not, they will still 3121 * be destroyed when the process exits. 3122 * 3123 * NOTE: Do not destroy pv_entry's with extra hold refs, 3124 * a caller may have unlocked it and intends to 3125 * continue to use it. 3126 */ 3127 if (pmap_dynamic_delete && 3128 pvp->pv_m && 3129 pvp->pv_m->wire_count == 1 && 3130 (pvp->pv_hold & PV_HOLD_MASK) == 2 && 3131 pvp->pv_pindex < pmap_pml4_pindex()) { 3132 if (pmap != &kernel_pmap) { 3133 pmap_remove_pv_pte(pvp, NULL, bulk, 1); 3134 pvp = NULL; /* safety */ 3135 } else { 3136 kprintf("Attempt to remove kernel_pmap pindex " 3137 "%jd\n", pvp->pv_pindex); 3138 pv_put(pvp); 3139 } 3140 } else { 3141 pv_put(pvp); 3142 } 3143 } 3144 } 3145 3146 /* 3147 * Remove the vm_page association to a pv. The pv must be locked. 3148 */ 3149 static 3150 vm_page_t 3151 pmap_remove_pv_page(pv_entry_t pv) 3152 { 3153 vm_page_t m; 3154 3155 m = pv->pv_m; 3156 pv->pv_m = NULL; 3157 vm_page_flag_clear(m, PG_MAPPED | PG_WRITEABLE); 3158 3159 return(m); 3160 } 3161 3162 /* 3163 * Grow the number of kernel page table entries, if needed. 3164 * 3165 * This routine is always called to validate any address space 3166 * beyond KERNBASE (for kldloads). kernel_vm_end only governs the address 3167 * space below KERNBASE. 3168 * 3169 * kernel_map must be locked exclusively by the caller. 3170 */ 3171 void 3172 pmap_growkernel(vm_offset_t kstart, vm_offset_t kend) 3173 { 3174 vm_paddr_t paddr; 3175 vm_offset_t ptppaddr; 3176 vm_page_t nkpg; 3177 pd_entry_t *pt, newpt; 3178 pdp_entry_t *pd, newpd; 3179 int update_kernel_vm_end; 3180 3181 /* 3182 * bootstrap kernel_vm_end on first real VM use 3183 */ 3184 if (kernel_vm_end == 0) { 3185 kernel_vm_end = VM_MIN_KERNEL_ADDRESS; 3186 3187 for (;;) { 3188 pt = pmap_pt(&kernel_pmap, kernel_vm_end); 3189 if (pt == NULL) 3190 break; 3191 if ((*pt & kernel_pmap.pmap_bits[PG_V_IDX]) == 0) 3192 break; 3193 kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & 3194 ~(vm_offset_t)(PAGE_SIZE * NPTEPG - 1); 3195 if (kernel_vm_end - 1 >= vm_map_max(&kernel_map)) { 3196 kernel_vm_end = vm_map_max(&kernel_map); 3197 break; 3198 } 3199 } 3200 } 3201 3202 /* 3203 * Fill in the gaps. kernel_vm_end is only adjusted for ranges 3204 * below KERNBASE. Ranges above KERNBASE are kldloaded and we 3205 * do not want to force-fill 128G worth of page tables. 
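 *
 * (In other words, only growth below KERNBASE advances kernel_vm_end
 * via update_kernel_vm_end below; kldload-driven growth above KERNBASE
 * just populates whatever PD/PT pages the requested range needs.)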
3206 */ 3207 if (kstart < KERNBASE) { 3208 if (kstart > kernel_vm_end) 3209 kstart = kernel_vm_end; 3210 KKASSERT(kend <= KERNBASE); 3211 update_kernel_vm_end = 1; 3212 } else { 3213 update_kernel_vm_end = 0; 3214 } 3215 3216 kstart = rounddown2(kstart, (vm_offset_t)(PAGE_SIZE * NPTEPG)); 3217 kend = roundup2(kend, (vm_offset_t)(PAGE_SIZE * NPTEPG)); 3218 3219 if (kend - 1 >= vm_map_max(&kernel_map)) 3220 kend = vm_map_max(&kernel_map); 3221 3222 while (kstart < kend) { 3223 pt = pmap_pt(&kernel_pmap, kstart); 3224 if (pt == NULL) { 3225 /* 3226 * We need a new PD entry 3227 */ 3228 nkpg = vm_page_alloc(NULL, mycpu->gd_rand_incr++, 3229 VM_ALLOC_NORMAL | 3230 VM_ALLOC_SYSTEM | 3231 VM_ALLOC_INTERRUPT); 3232 if (nkpg == NULL) { 3233 panic("pmap_growkernel: no memory to grow " 3234 "kernel"); 3235 } 3236 paddr = VM_PAGE_TO_PHYS(nkpg); 3237 pmap_zero_page(paddr); 3238 pd = pmap_pd(&kernel_pmap, kstart); 3239 3240 newpd = (pdp_entry_t) 3241 (paddr | 3242 kernel_pmap.pmap_bits[PG_V_IDX] | 3243 kernel_pmap.pmap_bits[PG_RW_IDX] | 3244 kernel_pmap.pmap_bits[PG_A_IDX]); 3245 atomic_swap_long(pd, newpd); 3246 3247 #if 0 3248 kprintf("NEWPD pd=%p pde=%016jx phys=%016jx\n", 3249 pd, newpd, paddr); 3250 #endif 3251 3252 continue; /* try again */ 3253 } 3254 3255 if ((*pt & kernel_pmap.pmap_bits[PG_V_IDX]) != 0) { 3256 kstart = (kstart + PAGE_SIZE * NPTEPG) & 3257 ~(vm_offset_t)(PAGE_SIZE * NPTEPG - 1); 3258 if (kstart - 1 >= vm_map_max(&kernel_map)) { 3259 kstart = vm_map_max(&kernel_map); 3260 break; 3261 } 3262 continue; 3263 } 3264 3265 /* 3266 * We need a new PT 3267 * 3268 * This index is bogus, but out of the way 3269 */ 3270 nkpg = vm_page_alloc(NULL, mycpu->gd_rand_incr++, 3271 VM_ALLOC_NORMAL | 3272 VM_ALLOC_SYSTEM | 3273 VM_ALLOC_INTERRUPT); 3274 if (nkpg == NULL) 3275 panic("pmap_growkernel: no memory to grow kernel"); 3276 3277 vm_page_wire(nkpg); 3278 ptppaddr = VM_PAGE_TO_PHYS(nkpg); 3279 pmap_zero_page(ptppaddr); 3280 newpt = (pd_entry_t)(ptppaddr | 3281 kernel_pmap.pmap_bits[PG_V_IDX] | 3282 kernel_pmap.pmap_bits[PG_RW_IDX] | 3283 kernel_pmap.pmap_bits[PG_A_IDX]); 3284 atomic_swap_long(pt, newpt); 3285 3286 kstart = (kstart + PAGE_SIZE * NPTEPG) & 3287 ~(vm_offset_t)(PAGE_SIZE * NPTEPG - 1); 3288 3289 if (kstart - 1 >= vm_map_max(&kernel_map)) { 3290 kstart = vm_map_max(&kernel_map); 3291 break; 3292 } 3293 } 3294 3295 /* 3296 * Only update kernel_vm_end for areas below KERNBASE. 3297 */ 3298 if (update_kernel_vm_end && kernel_vm_end < kstart) 3299 kernel_vm_end = kstart; 3300 } 3301 3302 /* 3303 * Add a reference to the specified pmap. 3304 */ 3305 void 3306 pmap_reference(pmap_t pmap) 3307 { 3308 if (pmap != NULL) 3309 atomic_add_int(&pmap->pm_count, 1); 3310 } 3311 3312 void 3313 pmap_maybethreaded(pmap_t pmap) 3314 { 3315 atomic_set_int(&pmap->pm_flags, PMAP_MULTI); 3316 } 3317 3318 /* 3319 * Called while page is hard-busied to clear the PG_MAPPED and PG_WRITEABLE 3320 * flags if able. 3321 */ 3322 int 3323 pmap_mapped_sync(vm_page_t m) 3324 { 3325 if (m->md.pmap_count == 0) 3326 vm_page_flag_clear(m, PG_MAPPED | PG_WRITEABLE); 3327 return (m->flags); 3328 } 3329 3330 /*************************************************** 3331 * page management routines. 3332 ***************************************************/ 3333 3334 /* 3335 * Hold a pv without locking it 3336 */ 3337 #if 0 3338 static void 3339 pv_hold(pv_entry_t pv) 3340 { 3341 atomic_add_int(&pv->pv_hold, 1); 3342 } 3343 #endif 3344 3345 /* 3346 * Hold a pv_entry, preventing its destruction. 
TRUE is returned if the pv 3347 * was successfully locked, FALSE if it wasn't. The caller must dispose of 3348 * the pv properly. 3349 * 3350 * Either the pmap->pm_spin or the related vm_page_spin (if traversing a 3351 * pv list via its page) must be held by the caller in order to stabilize 3352 * the pv. 3353 */ 3354 static int 3355 _pv_hold_try(pv_entry_t pv PMAP_DEBUG_DECL) 3356 { 3357 u_int count; 3358 3359 /* 3360 * Critical path shortcut expects pv to already have one ref 3361 * (for the pv->pv_pmap). 3362 */ 3363 count = pv->pv_hold; 3364 cpu_ccfence(); 3365 for (;;) { 3366 if ((count & PV_HOLD_LOCKED) == 0) { 3367 if (atomic_fcmpset_int(&pv->pv_hold, &count, 3368 (count + 1) | PV_HOLD_LOCKED)) { 3369 #ifdef PMAP_DEBUG 3370 pv->pv_func = func; 3371 pv->pv_line = lineno; 3372 #endif 3373 return TRUE; 3374 } 3375 } else { 3376 if (atomic_fcmpset_int(&pv->pv_hold, &count, count + 1)) 3377 return FALSE; 3378 } 3379 /* retry */ 3380 } 3381 } 3382 3383 /* 3384 * Drop a previously held pv_entry which could not be locked, allowing its 3385 * destruction. 3386 * 3387 * Must not be called with a spinlock held as we might zfree() the pv if it 3388 * is no longer associated with a pmap and this was the last hold count. 3389 */ 3390 static void 3391 pv_drop(pv_entry_t pv) 3392 { 3393 u_int count; 3394 3395 for (;;) { 3396 count = pv->pv_hold; 3397 cpu_ccfence(); 3398 KKASSERT((count & PV_HOLD_MASK) > 0); 3399 KKASSERT((count & (PV_HOLD_LOCKED | PV_HOLD_MASK)) != 3400 (PV_HOLD_LOCKED | 1)); 3401 if (atomic_cmpset_int(&pv->pv_hold, count, count - 1)) { 3402 if ((count & PV_HOLD_MASK) == 1) { 3403 #ifdef PMAP_DEBUG2 3404 if (pmap_enter_debug > 0) { 3405 --pmap_enter_debug; 3406 kprintf("pv_drop: free pv %p\n", pv); 3407 } 3408 #endif 3409 KKASSERT(count == 1); 3410 KKASSERT(pv->pv_pmap == NULL); 3411 zfree(pvzone, pv); 3412 } 3413 return; 3414 } 3415 /* retry */ 3416 } 3417 } 3418 3419 /* 3420 * Find or allocate the requested PV entry, returning a locked, held pv. 3421 * 3422 * If (*isnew) is non-zero, the returned pv will have two hold counts, one 3423 * for the caller and one representing the pmap and vm_page association. 3424 * 3425 * If (*isnew) is zero, the returned pv will have only one hold count. 3426 * 3427 * Since both associations can only be adjusted while the pv is locked, 3428 * together they represent just one additional hold. 3429 */ 3430 static 3431 pv_entry_t 3432 _pv_alloc(pmap_t pmap, vm_pindex_t pindex, int *isnew PMAP_DEBUG_DECL) 3433 { 3434 struct mdglobaldata *md = mdcpu; 3435 pv_entry_t pv; 3436 pv_entry_t pnew; 3437 int pmap_excl = 0; 3438 3439 pnew = NULL; 3440 if (md->gd_newpv) { 3441 #if 1 3442 pnew = atomic_swap_ptr((void *)&md->gd_newpv, NULL); 3443 #else 3444 crit_enter(); 3445 pnew = md->gd_newpv; /* might race NULL */ 3446 md->gd_newpv = NULL; 3447 crit_exit(); 3448 #endif 3449 } 3450 if (pnew == NULL) 3451 pnew = zalloc(pvzone); 3452 3453 spin_lock_shared(&pmap->pm_spin); 3454 for (;;) { 3455 /* 3456 * Shortcut cache 3457 */ 3458 pv = pv_entry_lookup(pmap, pindex); 3459 if (pv == NULL) { 3460 vm_pindex_t *pmark; 3461 3462 /* 3463 * Requires exclusive pmap spinlock 3464 */ 3465 if (pmap_excl == 0) { 3466 pmap_excl = 1; 3467 if (!spin_lock_upgrade_try(&pmap->pm_spin)) { 3468 spin_unlock_shared(&pmap->pm_spin); 3469 spin_lock(&pmap->pm_spin); 3470 continue; 3471 } 3472 } 3473 3474 /* 3475 * We need to block if someone is holding our 3476 * placemarker. 
As long as we determine the 3477 * placemarker has not been acquired we do not 3478 * need to get it, as acquisition also requires 3479 * the pmap spin lock. 3480 * 3481 * However, we can race the wakeup. 3482 */ 3483 pmark = pmap_placemarker_hash(pmap, pindex); 3484 3485 if (((*pmark ^ pindex) & ~PM_PLACEMARK_WAKEUP) == 0) { 3486 tsleep_interlock(pmark, 0); 3487 atomic_set_long(pmark, PM_PLACEMARK_WAKEUP); 3488 if (((*pmark ^ pindex) & 3489 ~PM_PLACEMARK_WAKEUP) == 0) { 3490 spin_unlock(&pmap->pm_spin); 3491 tsleep(pmark, PINTERLOCKED, "pvplc", 0); 3492 spin_lock(&pmap->pm_spin); 3493 } 3494 continue; 3495 } 3496 3497 /* 3498 * Setup the new entry 3499 */ 3500 pnew->pv_pmap = pmap; 3501 pnew->pv_pindex = pindex; 3502 pnew->pv_hold = PV_HOLD_LOCKED | 2; 3503 pnew->pv_flags = 0; 3504 #ifdef PMAP_DEBUG 3505 pnew->pv_func = func; 3506 pnew->pv_line = lineno; 3507 if (pnew->pv_line_lastfree > 0) { 3508 pnew->pv_line_lastfree = 3509 -pnew->pv_line_lastfree; 3510 } 3511 #endif 3512 pv = pv_entry_rb_tree_RB_INSERT(&pmap->pm_pvroot, pnew); 3513 atomic_add_long(&pmap->pm_stats.resident_count, 1); 3514 spin_unlock(&pmap->pm_spin); 3515 *isnew = 1; 3516 3517 KASSERT(pv == NULL, ("pv insert failed %p->%p", pnew, pv)); 3518 return(pnew); 3519 } 3520 3521 /* 3522 * We already have an entry, clean up the staged pnew if 3523 * we can get the lock, otherwise block and retry. 3524 */ 3525 if (__predict_true(_pv_hold_try(pv PMAP_DEBUG_COPY))) { 3526 if (pmap_excl) 3527 spin_unlock(&pmap->pm_spin); 3528 else 3529 spin_unlock_shared(&pmap->pm_spin); 3530 #if 1 3531 pnew = atomic_swap_ptr((void *)&md->gd_newpv, pnew); 3532 if (pnew) 3533 zfree(pvzone, pnew); 3534 #else 3535 crit_enter(); 3536 if (md->gd_newpv == NULL) 3537 md->gd_newpv = pnew; 3538 else 3539 zfree(pvzone, pnew); 3540 crit_exit(); 3541 #endif 3542 KKASSERT(pv->pv_pmap == pmap && 3543 pv->pv_pindex == pindex); 3544 *isnew = 0; 3545 return(pv); 3546 } 3547 if (pmap_excl) { 3548 spin_unlock(&pmap->pm_spin); 3549 _pv_lock(pv PMAP_DEBUG_COPY); 3550 pv_put(pv); 3551 spin_lock(&pmap->pm_spin); 3552 } else { 3553 spin_unlock_shared(&pmap->pm_spin); 3554 _pv_lock(pv PMAP_DEBUG_COPY); 3555 pv_put(pv); 3556 spin_lock_shared(&pmap->pm_spin); 3557 } 3558 } 3559 /* NOT REACHED */ 3560 } 3561 3562 /* 3563 * Find the requested PV entry, returning a locked+held pv or NULL 3564 */ 3565 static 3566 pv_entry_t 3567 _pv_get(pmap_t pmap, vm_pindex_t pindex, vm_pindex_t **pmarkp PMAP_DEBUG_DECL) 3568 { 3569 pv_entry_t pv; 3570 int pmap_excl = 0; 3571 3572 spin_lock_shared(&pmap->pm_spin); 3573 for (;;) { 3574 /* 3575 * Shortcut cache 3576 */ 3577 pv = pv_entry_lookup(pmap, pindex); 3578 if (pv == NULL) { 3579 /* 3580 * Block if there is ANY placemarker. If we are to 3581 * return it, we must also acquire the spot, so we 3582 * have to block even if the placemarker is held on 3583 * a different address. 3584 * 3585 * OPTIMIZATION: If pmarkp is passed as NULL the 3586 * caller is just probing (or looking for a real 3587 * pv_entry), and in this case we only need to check 3588 * to see if the placemarker matches pindex.
3589 */ 3590 vm_pindex_t *pmark; 3591 3592 /* 3593 * Requires exclusive pmap spinlock 3594 */ 3595 if (pmap_excl == 0) { 3596 pmap_excl = 1; 3597 if (!spin_lock_upgrade_try(&pmap->pm_spin)) { 3598 spin_unlock_shared(&pmap->pm_spin); 3599 spin_lock(&pmap->pm_spin); 3600 continue; 3601 } 3602 } 3603 3604 pmark = pmap_placemarker_hash(pmap, pindex); 3605 3606 if ((pmarkp && *pmark != PM_NOPLACEMARK) || 3607 ((*pmark ^ pindex) & ~PM_PLACEMARK_WAKEUP) == 0) { 3608 tsleep_interlock(pmark, 0); 3609 atomic_set_long(pmark, PM_PLACEMARK_WAKEUP); 3610 if ((pmarkp && *pmark != PM_NOPLACEMARK) || 3611 ((*pmark ^ pindex) & 3612 ~PM_PLACEMARK_WAKEUP) == 0) { 3613 spin_unlock(&pmap->pm_spin); 3614 tsleep(pmark, PINTERLOCKED, "pvpld", 0); 3615 spin_lock(&pmap->pm_spin); 3616 } 3617 continue; 3618 } 3619 if (pmarkp) { 3620 if (atomic_swap_long(pmark, pindex) != 3621 PM_NOPLACEMARK) { 3622 panic("_pv_get: pmark race"); 3623 } 3624 *pmarkp = pmark; 3625 } 3626 spin_unlock(&pmap->pm_spin); 3627 return NULL; 3628 } 3629 if (_pv_hold_try(pv PMAP_DEBUG_COPY)) { 3630 if (pmap_excl) 3631 spin_unlock(&pmap->pm_spin); 3632 else 3633 spin_unlock_shared(&pmap->pm_spin); 3634 KKASSERT(pv->pv_pmap == pmap && 3635 pv->pv_pindex == pindex); 3636 return(pv); 3637 } 3638 if (pmap_excl) { 3639 spin_unlock(&pmap->pm_spin); 3640 _pv_lock(pv PMAP_DEBUG_COPY); 3641 pv_put(pv); 3642 spin_lock(&pmap->pm_spin); 3643 } else { 3644 spin_unlock_shared(&pmap->pm_spin); 3645 _pv_lock(pv PMAP_DEBUG_COPY); 3646 pv_put(pv); 3647 spin_lock_shared(&pmap->pm_spin); 3648 } 3649 } 3650 } 3651 3652 /* 3653 * Lookup, hold, and attempt to lock (pmap,pindex). 3654 * 3655 * If the entry does not exist NULL is returned and *errorp is set to 0 3656 * 3657 * If the entry exists and could be successfully locked it is returned and 3658 * errorp is set to 0. 3659 * 3660 * If the entry exists but could NOT be successfully locked it is returned 3661 * held and *errorp is set to 1. 3662 * 3663 * If the entry is placemarked by someone else NULL is returned and *errorp 3664 * is set to 1. 3665 */ 3666 static 3667 pv_entry_t 3668 pv_get_try(pmap_t pmap, vm_pindex_t pindex, vm_pindex_t **pmarkp, int *errorp) 3669 { 3670 pv_entry_t pv; 3671 3672 spin_lock_shared(&pmap->pm_spin); 3673 3674 pv = pv_entry_lookup(pmap, pindex); 3675 if (pv == NULL) { 3676 vm_pindex_t *pmark; 3677 3678 pmark = pmap_placemarker_hash(pmap, pindex); 3679 3680 if (((*pmark ^ pindex) & ~PM_PLACEMARK_WAKEUP) == 0) { 3681 *errorp = 1; 3682 } else if (pmarkp && 3683 atomic_cmpset_long(pmark, PM_NOPLACEMARK, pindex)) { 3684 *errorp = 0; 3685 } else { 3686 /* 3687 * Can't set a placemark with a NULL pmarkp, or if 3688 * pmarkp is non-NULL but we failed to set our 3689 * placemark. 3690 */ 3691 *errorp = 1; 3692 } 3693 if (pmarkp) 3694 *pmarkp = pmark; 3695 spin_unlock_shared(&pmap->pm_spin); 3696 3697 return NULL; 3698 } 3699 3700 /* 3701 * XXX This has problems if the lock is shared, why? 
3702 */ 3703 if (pv_hold_try(pv)) { 3704 spin_unlock_shared(&pmap->pm_spin); 3705 *errorp = 0; 3706 KKASSERT(pv->pv_pmap == pmap && pv->pv_pindex == pindex); 3707 return(pv); /* lock succeeded */ 3708 } 3709 spin_unlock_shared(&pmap->pm_spin); 3710 *errorp = 1; 3711 3712 return (pv); /* lock failed */ 3713 } 3714 3715 /* 3716 * Lock a held pv, keeping the hold count 3717 */ 3718 static 3719 void 3720 _pv_lock(pv_entry_t pv PMAP_DEBUG_DECL) 3721 { 3722 u_int count; 3723 3724 for (;;) { 3725 count = pv->pv_hold; 3726 cpu_ccfence(); 3727 if ((count & PV_HOLD_LOCKED) == 0) { 3728 if (atomic_cmpset_int(&pv->pv_hold, count, 3729 count | PV_HOLD_LOCKED)) { 3730 #ifdef PMAP_DEBUG 3731 pv->pv_func = func; 3732 pv->pv_line = lineno; 3733 #endif 3734 return; 3735 } 3736 continue; 3737 } 3738 tsleep_interlock(pv, 0); 3739 if (atomic_cmpset_int(&pv->pv_hold, count, 3740 count | PV_HOLD_WAITING)) { 3741 #ifdef PMAP_DEBUG2 3742 if (pmap_enter_debug > 0) { 3743 --pmap_enter_debug; 3744 kprintf("pv waiting on %s:%d\n", 3745 pv->pv_func, pv->pv_line); 3746 } 3747 #endif 3748 tsleep(pv, PINTERLOCKED, "pvwait", hz); 3749 } 3750 /* retry */ 3751 } 3752 } 3753 3754 /* 3755 * Unlock a held and locked pv, keeping the hold count. 3756 */ 3757 static 3758 void 3759 pv_unlock(pv_entry_t pv) 3760 { 3761 u_int count; 3762 3763 for (;;) { 3764 count = pv->pv_hold; 3765 cpu_ccfence(); 3766 KKASSERT((count & (PV_HOLD_LOCKED | PV_HOLD_MASK)) >= 3767 (PV_HOLD_LOCKED | 1)); 3768 if (atomic_cmpset_int(&pv->pv_hold, count, 3769 count & 3770 ~(PV_HOLD_LOCKED | PV_HOLD_WAITING))) { 3771 if (count & PV_HOLD_WAITING) 3772 wakeup(pv); 3773 break; 3774 } 3775 } 3776 } 3777 3778 /* 3779 * Unlock and drop a pv. If the pv is no longer associated with a pmap 3780 * and the hold count drops to zero we will free it. 3781 * 3782 * Caller should not hold any spin locks. We are protected from hold races 3783 * by virtue of holds only occurring with a pmap_spin or vm_page_spin 3784 * lock held. A pv cannot be located otherwise. 3785 */ 3786 static 3787 void 3788 pv_put(pv_entry_t pv) 3789 { 3790 #ifdef PMAP_DEBUG2 3791 if (pmap_enter_debug > 0) { 3792 --pmap_enter_debug; 3793 kprintf("pv_put pv=%p hold=%08x\n", pv, pv->pv_hold); 3794 } 3795 #endif 3796 3797 /* 3798 * Normal put-aways must have a pv_m associated with the pv, 3799 * but allow the case where the pv has been destructed due 3800 * to pmap_dynamic_delete. 3801 */ 3802 KKASSERT(pv->pv_pmap == NULL || pv->pv_m != NULL); 3803 3804 /* 3805 * Fast - shortcut most common condition 3806 */ 3807 if (atomic_cmpset_int(&pv->pv_hold, PV_HOLD_LOCKED | 2, 1)) 3808 return; 3809 3810 /* 3811 * Slow 3812 */ 3813 pv_unlock(pv); 3814 pv_drop(pv); 3815 } 3816 3817 /* 3818 * Remove the pmap association from a pv, require that pv_m already be removed, 3819 * then unlock and drop the pv. Any pte operations must have already been 3820 * completed. This call may result in a last-drop which will physically free 3821 * the pv. 3822 * 3823 * Removing the pmap association entails an additional drop. 3824 * 3825 * pv must be exclusively locked on call and will be disposed of on return.
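 *
 * Typical disposal sketch (illustrative; mirrors the destroy cases in
 * pmap_remove_pv_pte()):
 *
 *	pmap_remove_pv_page(pv);
 *	pv_free(pv, pvp);
 *	pv = NULL;			(pv may be physically freed here)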
3826 */ 3827 static 3828 void 3829 _pv_free(pv_entry_t pv, pv_entry_t pvp PMAP_DEBUG_DECL) 3830 { 3831 pmap_t pmap; 3832 3833 #ifdef PMAP_DEBUG 3834 pv->pv_func_lastfree = func; 3835 pv->pv_line_lastfree = lineno; 3836 #endif 3837 KKASSERT(pv->pv_m == NULL); 3838 KKASSERT((pv->pv_hold & (PV_HOLD_LOCKED|PV_HOLD_MASK)) >= 3839 (PV_HOLD_LOCKED|1)); 3840 if ((pmap = pv->pv_pmap) != NULL) { 3841 spin_lock(&pmap->pm_spin); 3842 KKASSERT(pv->pv_pmap == pmap); 3843 if (pmap->pm_pvhint_pt == pv) 3844 pmap->pm_pvhint_pt = NULL; 3845 if (pmap->pm_pvhint_unused == pv) 3846 pmap->pm_pvhint_unused = NULL; 3847 pv_entry_rb_tree_RB_REMOVE(&pmap->pm_pvroot, pv); 3848 atomic_add_long(&pmap->pm_stats.resident_count, -1); 3849 pv->pv_pmap = NULL; 3850 pv->pv_pindex = 0; 3851 spin_unlock(&pmap->pm_spin); 3852 3853 /* 3854 * Try to shortcut three atomic ops, otherwise fall through 3855 * and do it normally. Drop two refs and the lock all in 3856 * one go. 3857 */ 3858 if (pvp) { 3859 if (vm_page_unwire_quick(pvp->pv_m)) 3860 panic("_pv_free: bad wirecount on pvp"); 3861 } 3862 if (atomic_cmpset_int(&pv->pv_hold, PV_HOLD_LOCKED | 2, 0)) { 3863 #ifdef PMAP_DEBUG2 3864 if (pmap_enter_debug > 0) { 3865 --pmap_enter_debug; 3866 kprintf("pv_free: free pv %p\n", pv); 3867 } 3868 #endif 3869 zfree(pvzone, pv); 3870 return; 3871 } 3872 pv_drop(pv); /* ref for pv_pmap */ 3873 } 3874 pv_unlock(pv); 3875 pv_drop(pv); 3876 } 3877 3878 /* 3879 * This routine is very drastic, but can save the system 3880 * in a pinch. 3881 */ 3882 void 3883 pmap_collect(void) 3884 { 3885 int i; 3886 vm_page_t m; 3887 static int warningdone=0; 3888 3889 if (pmap_pagedaemon_waken == 0) 3890 return; 3891 pmap_pagedaemon_waken = 0; 3892 if (warningdone < 5) { 3893 kprintf("pmap_collect: pv_entries exhausted -- " 3894 "suggest increasing vm.pmap_pv_entries above %ld\n", 3895 vm_pmap_pv_entries); 3896 warningdone++; 3897 } 3898 3899 for (i = 0; i < vm_page_array_size; i++) { 3900 m = &vm_page_array[i]; 3901 if (m->wire_count || m->hold_count) 3902 continue; 3903 if (vm_page_busy_try(m, TRUE) == 0) { 3904 if (m->wire_count == 0 && m->hold_count == 0) { 3905 pmap_remove_all(m); 3906 } 3907 vm_page_wakeup(m); 3908 } 3909 } 3910 } 3911 3912 /* 3913 * Scan the pmap for active page table entries and issue a callback. 3914 * The callback must dispose of pte_pv, whose PTE entry is at *ptep in 3915 * its parent page table. 3916 * 3917 * pte_pv will be NULL if the page or page table is unmanaged. 3918 * pt_pv will point to the page table page containing the pte for the page. 3919 * 3920 * NOTE! If we come across an unmanaged page TABLE (versus an unmanaged page), 3921 * we pass a NULL pte_pv and we pass a pt_pv pointing to the passed 3922 * process pmap's PD and page to the callback function. This can be 3923 * confusing because the pt_pv is really a pd_pv, and the target page 3924 * table page is simply aliased by the pmap and not owned by it. 3925 * 3926 * It is assumed that the start and end are properly rounded to the page size. 3927 * 3928 * It is assumed that PD pages and above are managed and thus in the RB tree, 3929 * allowing us to use RB_SCAN from the PD pages down for ranged scans.
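 *
 * Rough usage sketch (hypothetical caller; 'my_callback' is not a
 * function in this file):
 *
 *	struct pmap_scan_info info;
 *
 *	info.pmap = pmap;
 *	info.sva = sva;
 *	info.eva = eva;
 *	info.func = my_callback;
 *	info.arg = NULL;
 *	pmap_scan(&info, 1);		(1 = issue smp invalidations)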
3930 */ 3931 struct pmap_scan_info { 3932 struct pmap *pmap; 3933 vm_offset_t sva; 3934 vm_offset_t eva; 3935 vm_pindex_t sva_pd_pindex; 3936 vm_pindex_t eva_pd_pindex; 3937 void (*func)(pmap_t, struct pmap_scan_info *, 3938 vm_pindex_t *, pv_entry_t, vm_offset_t, 3939 pt_entry_t *, void *); 3940 void *arg; 3941 pmap_inval_bulk_t bulk_core; 3942 pmap_inval_bulk_t *bulk; 3943 int count; 3944 int stop; 3945 }; 3946 3947 static int pmap_scan_cmp(pv_entry_t pv, void *data); 3948 static int pmap_scan_callback(pv_entry_t pv, void *data); 3949 3950 static void 3951 pmap_scan(struct pmap_scan_info *info, int smp_inval) 3952 { 3953 struct pmap *pmap = info->pmap; 3954 pv_entry_t pt_pv; /* A page table PV */ 3955 pv_entry_t pte_pv; /* A page table entry PV */ 3956 vm_pindex_t *pte_placemark; 3957 vm_pindex_t *pt_placemark; 3958 pt_entry_t *ptep; 3959 pt_entry_t oldpte; 3960 struct pv_entry dummy_pv; 3961 3962 info->stop = 0; 3963 if (pmap == NULL) 3964 return; 3965 if (info->sva == info->eva) 3966 return; 3967 if (smp_inval) { 3968 info->bulk = &info->bulk_core; 3969 pmap_inval_bulk_init(&info->bulk_core, pmap); 3970 } else { 3971 info->bulk = NULL; 3972 } 3973 3974 /* 3975 * Hold the token for stability; if the pmap is empty we have nothing 3976 * to do. 3977 */ 3978 #if 0 3979 if (pmap->pm_stats.resident_count == 0) { 3980 return; 3981 } 3982 #endif 3983 3984 info->count = 0; 3985 3986 /* 3987 * Special handling for scanning one page, which is a very common 3988 * operation (it is?). 3989 * 3990 * NOTE: Locks must be ordered bottom-up. pte,pt,pd,pdp,pml4 3991 */ 3992 if (info->sva + PAGE_SIZE == info->eva) { 3993 if (info->sva >= VM_MAX_USER_ADDRESS) { 3994 /* 3995 * Kernel mappings do not track wire counts on 3996 * page table pages and only maintain pd_pv and 3997 * pte_pv levels so pmap_scan() works. 3998 */ 3999 pt_pv = NULL; 4000 pte_pv = pv_get(pmap, pmap_pte_pindex(info->sva), 4001 &pte_placemark); 4002 KKASSERT(pte_pv == NULL); 4003 ptep = vtopte(info->sva); 4004 } else { 4005 /* 4006 * We hold pte_placemark across the operation for 4007 * unmanaged pages. 4008 * 4009 * WARNING! We must hold pt_placemark across the 4010 * *ptep test to prevent misintepreting 4011 * a non-zero *ptep as a shared page 4012 * table page. Hold it across the function 4013 * callback as well for SMP safety. 4014 */ 4015 pte_pv = pv_get(pmap, pmap_pte_pindex(info->sva), 4016 &pte_placemark); 4017 KKASSERT(pte_pv == NULL); 4018 pt_pv = pv_get(pmap, pmap_pt_pindex(info->sva), 4019 &pt_placemark); 4020 if (pt_pv == NULL) { 4021 #if 0 4022 KKASSERT(0); 4023 pd_pv = pv_get(pmap, 4024 pmap_pd_pindex(info->sva), 4025 NULL); 4026 if (pd_pv) { 4027 ptep = pv_pte_lookup(pd_pv, 4028 pmap_pt_index(info->sva)); 4029 if (*ptep) { 4030 info->func(pmap, info, 4031 pt_placemark, pd_pv, 4032 info->sva, ptep, 4033 info->arg); 4034 } else { 4035 pv_placemarker_wakeup(pmap, 4036 pt_placemark); 4037 } 4038 pv_put(pd_pv); 4039 } else { 4040 pv_placemarker_wakeup(pmap, 4041 pt_placemark); 4042 } 4043 #else 4044 pv_placemarker_wakeup(pmap, pt_placemark); 4045 #endif 4046 pv_placemarker_wakeup(pmap, pte_placemark); 4047 goto fast_skip; 4048 } 4049 ptep = pv_pte_lookup(pt_pv, pmap_pte_index(info->sva)); 4050 } 4051 4052 /* 4053 * NOTE: *ptep can't be ripped out from under us if we hold 4054 * pte_pv (or pte_placemark) locked, but bits can 4055 * change. 
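 *
 * (Hence the snapshot of *ptep into 'oldpte' just below, with
 * cpu_ccfence() keeping the compiler from re-reading *ptep afterwards.)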
4056 */ 4057 oldpte = *ptep; 4058 cpu_ccfence(); 4059 if (oldpte == 0) { 4060 KKASSERT(pte_pv == NULL); 4061 pv_placemarker_wakeup(pmap, pte_placemark); 4062 } else { 4063 KASSERT((oldpte & pmap->pmap_bits[PG_V_IDX]) == 4064 pmap->pmap_bits[PG_V_IDX], 4065 ("badB *ptep %016lx/%016lx sva %016lx pte_pv NULL", 4066 *ptep, oldpte, info->sva)); 4067 info->func(pmap, info, pte_placemark, pt_pv, 4068 info->sva, ptep, info->arg); 4069 } 4070 if (pt_pv) 4071 pv_put(pt_pv); 4072 fast_skip: 4073 pmap_inval_bulk_flush(info->bulk); 4074 return; 4075 } 4076 4077 /* 4078 * Nominal scan case, RB_SCAN() for PD pages and iterate from 4079 * there. 4080 * 4081 * WARNING! eva can overflow our standard ((N + mask) >> bits) 4082 * bounds, resulting in a pd_pindex of 0. To solve the 4083 * problem we use an inclusive range. 4084 */ 4085 info->sva_pd_pindex = pmap_pd_pindex(info->sva); 4086 info->eva_pd_pindex = pmap_pd_pindex(info->eva - PAGE_SIZE); 4087 4088 if (info->sva >= VM_MAX_USER_ADDRESS) { 4089 /* 4090 * The kernel does not currently maintain any pv_entry's for 4091 * higher-level page tables. 4092 */ 4093 bzero(&dummy_pv, sizeof(dummy_pv)); 4094 dummy_pv.pv_pindex = info->sva_pd_pindex; 4095 spin_lock(&pmap->pm_spin); 4096 while (dummy_pv.pv_pindex <= info->eva_pd_pindex) { 4097 pmap_scan_callback(&dummy_pv, info); 4098 ++dummy_pv.pv_pindex; 4099 if (dummy_pv.pv_pindex < info->sva_pd_pindex) /*wrap*/ 4100 break; 4101 } 4102 spin_unlock(&pmap->pm_spin); 4103 } else { 4104 /* 4105 * User page tables maintain local PML4, PDP, PD, and PT 4106 * pv_entry's. pv_entry's are not used for PTEs. 4107 */ 4108 spin_lock(&pmap->pm_spin); 4109 pv_entry_rb_tree_RB_SCAN(&pmap->pm_pvroot, pmap_scan_cmp, 4110 pmap_scan_callback, info); 4111 spin_unlock(&pmap->pm_spin); 4112 } 4113 pmap_inval_bulk_flush(info->bulk); 4114 } 4115 4116 /* 4117 * WARNING! pmap->pm_spin held 4118 * 4119 * WARNING! eva can overflow our standard ((N + mask) >> bits) 4120 * bounds, resulting in a pd_pindex of 0. To solve the 4121 * problem we use an inclusive range. 4122 */ 4123 static int 4124 pmap_scan_cmp(pv_entry_t pv, void *data) 4125 { 4126 struct pmap_scan_info *info = data; 4127 if (pv->pv_pindex < info->sva_pd_pindex) 4128 return(-1); 4129 if (pv->pv_pindex > info->eva_pd_pindex) 4130 return(1); 4131 return(0); 4132 } 4133 4134 /* 4135 * pmap_scan() by PDs 4136 * 4137 * WARNING! pmap->pm_spin held 4138 */ 4139 static int 4140 pmap_scan_callback(pv_entry_t pv, void *data) 4141 { 4142 struct pmap_scan_info *info = data; 4143 struct pmap *pmap = info->pmap; 4144 pv_entry_t pd_pv; /* A page directory PV */ 4145 pv_entry_t pt_pv; /* A page table PV */ 4146 vm_pindex_t *pt_placemark; 4147 pt_entry_t *ptep; 4148 pt_entry_t oldpte; 4149 vm_offset_t sva; 4150 vm_offset_t eva; 4151 vm_offset_t va_next; 4152 vm_pindex_t pd_pindex; 4153 int error; 4154 4155 /* 4156 * Stop if requested 4157 */ 4158 if (info->stop) 4159 return -1; 4160 4161 /* 4162 * Pull the PD pindex from the pv before releasing the spinlock. 4163 * 4164 * WARNING: pv is faked for kernel pmap scans. 4165 */ 4166 pd_pindex = pv->pv_pindex; 4167 spin_unlock(&pmap->pm_spin); 4168 pv = NULL; /* invalid after spinlock unlocked */ 4169 4170 /* 4171 * Calculate the page range within the PD. SIMPLE pmaps are 4172 * direct-mapped for the entire 2^64 address space. Normal pmaps 4173 * reflect the user and kernel address space which requires 4174 * cannonicalization w/regards to converting pd_pindex's back 4175 * into addresses. 
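 *
 * Illustrative example (constants assumed, 48-bit canonical addresses):
 * a kernel address such as 0xffff800000000000 reconstructs from its
 * pd_pindex as (pd_pindex - pmap_pd_pindex(0)) << PDPSHIFT, which only
 * yields 0x0000800000000000; because bit 47 is set, OR'ing in
 * PML4_SIGNMASK below restores the canonical 0xffff800000000000 form.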
4176 */ 4177 sva = (pd_pindex - pmap_pd_pindex(0)) << PDPSHIFT; 4178 if ((pmap->pm_flags & PMAP_FLAG_SIMPLE) == 0 && 4179 (sva & PML4_SIGNMASK)) { 4180 sva |= PML4_SIGNMASK; 4181 } 4182 eva = sva + NBPDP; /* can overflow */ 4183 if (sva < info->sva) 4184 sva = info->sva; 4185 if (eva < info->sva || eva > info->eva) 4186 eva = info->eva; 4187 4188 /* 4189 * NOTE: kernel mappings do not track page table pages, only 4190 * terminal pages. 4191 * 4192 * NOTE: Locks must be ordered bottom-up. pte,pt,pd,pdp,pml4. 4193 * However, for the scan to be efficient we try to 4194 * cache items top-down. 4195 */ 4196 pd_pv = NULL; 4197 pt_pv = NULL; 4198 4199 for (; sva < eva; sva = va_next) { 4200 if (info->stop) 4201 break; 4202 if (sva >= VM_MAX_USER_ADDRESS) { 4203 if (pt_pv) { 4204 pv_put(pt_pv); 4205 pt_pv = NULL; 4206 } 4207 goto kernel_skip; 4208 } 4209 4210 /* 4211 * PD cache, scan shortcut if it doesn't exist. 4212 */ 4213 if (pd_pv == NULL) { 4214 pd_pv = pv_get(pmap, pmap_pd_pindex(sva), NULL); 4215 } else if (pd_pv->pv_pmap != pmap || 4216 pd_pv->pv_pindex != pmap_pd_pindex(sva)) { 4217 pv_put(pd_pv); 4218 pd_pv = pv_get(pmap, pmap_pd_pindex(sva), NULL); 4219 } 4220 if (pd_pv == NULL) { 4221 va_next = (sva + NBPDP) & ~PDPMASK; 4222 if (va_next < sva) 4223 va_next = eva; 4224 continue; 4225 } 4226 4227 /* 4228 * PT cache 4229 * 4230 * NOTE: The cached pt_pv can be removed from the pmap when 4231 * pmap_dynamic_delete is enabled. 4232 */ 4233 if (pt_pv && (pt_pv->pv_pmap != pmap || 4234 pt_pv->pv_pindex != pmap_pt_pindex(sva))) { 4235 pv_put(pt_pv); 4236 pt_pv = NULL; 4237 } 4238 if (pt_pv == NULL) { 4239 pt_pv = pv_get_try(pmap, pmap_pt_pindex(sva), 4240 &pt_placemark, &error); 4241 if (error) { 4242 pv_put(pd_pv); /* lock order */ 4243 pd_pv = NULL; 4244 if (pt_pv) { 4245 pv_lock(pt_pv); 4246 pv_put(pt_pv); 4247 pt_pv = NULL; 4248 } else { 4249 pv_placemarker_wait(pmap, pt_placemark); 4250 } 4251 va_next = sva; 4252 continue; 4253 } 4254 /* may have to re-check later if pt_pv is NULL here */ 4255 } 4256 4257 /* 4258 * If pt_pv is NULL we either have a shared page table 4259 * page (NOT IMPLEMENTED XXX) and must issue a callback 4260 * specific to that case, or there is no page table page. 4261 * 4262 * Either way we can skip the page table page. 4263 * 4264 * WARNING! pt_pv can also be NULL due to a pv creation 4265 * race where we find it to be NULL and then 4266 * later see a pte_pv. But its possible the pt_pv 4267 * got created inbetween the two operations, so 4268 * we must check. 4269 * 4270 * XXX This should no longer be the case because 4271 * we have pt_placemark. 4272 */ 4273 if (pt_pv == NULL) { 4274 #if 0 4275 /* XXX REMOVED */ 4276 /* 4277 * Possible unmanaged (shared from another pmap) 4278 * page table page. 4279 * 4280 * WARNING! We must hold pt_placemark across the 4281 * *ptep test to prevent misintepreting 4282 * a non-zero *ptep as a shared page 4283 * table page. Hold it across the function 4284 * callback as well for SMP safety. 4285 */ 4286 KKASSERT(0); 4287 ptep = pv_pte_lookup(pd_pv, pmap_pt_index(sva)); 4288 if (*ptep & pmap->pmap_bits[PG_V_IDX]) { 4289 info->func(pmap, info, pt_placemark, pd_pv, 4290 sva, ptep, info->arg); 4291 } else { 4292 pv_placemarker_wakeup(pmap, pt_placemark); 4293 } 4294 #else 4295 pv_placemarker_wakeup(pmap, pt_placemark); 4296 #endif 4297 4298 /* 4299 * Done, move to next page table page. 
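 *
 * (Illustrative overflow case, values assumed: if sva lies in the
 * topmost 2MB segment of the 64-bit space, (sva + NBPDR) & ~PDRMASK
 * wraps to 0, and the va_next < sva test below clamps the scan to
 * eva rather than looping.)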
4300 */ 4301 va_next = (sva + NBPDR) & ~PDRMASK; 4302 if (va_next < sva) 4303 va_next = eva; 4304 continue; 4305 } 4306 4307 /* 4308 * From this point in the loop testing pt_pv for non-NULL 4309 * means we are in UVM, else if it is NULL we are in KVM. 4310 * 4311 * Limit our scan to either the end of the va represented 4312 * by the current page table page, or to the end of the 4313 * range being removed. 4314 */ 4315 kernel_skip: 4316 va_next = (sva + NBPDR) & ~PDRMASK; 4317 if (va_next < sva) 4318 va_next = eva; 4319 if (va_next > eva) 4320 va_next = eva; 4321 4322 /* 4323 * Scan the page table for pages. Some pages may not be 4324 * managed (might not have a pv_entry). 4325 * 4326 * There is no page table management for kernel pages so 4327 * pt_pv will be NULL in that case, but otherwise pt_pv 4328 * is non-NULL, locked, and referenced. 4329 */ 4330 4331 /* 4332 * At this point a non-NULL pt_pv means a UVA, and a NULL 4333 * pt_pv means a KVA. 4334 */ 4335 if (pt_pv) 4336 ptep = pv_pte_lookup(pt_pv, pmap_pte_index(sva)); 4337 else 4338 ptep = vtopte(sva); 4339 4340 while (sva < va_next) { 4341 vm_pindex_t *pte_placemark; 4342 pv_entry_t pte_pv; 4343 4344 /* 4345 * Yield every 64 pages, stop if requested. 4346 */ 4347 if ((++info->count & 63) == 0) 4348 lwkt_user_yield(); 4349 if (info->stop) 4350 break; 4351 4352 /* 4353 * We can shortcut our scan if *ptep == 0. This is 4354 * an unlocked check. 4355 */ 4356 if (*ptep == 0) { 4357 sva += PAGE_SIZE; 4358 ++ptep; 4359 continue; 4360 } 4361 cpu_ccfence(); 4362 4363 /* 4364 * Acquire the pte_placemark. pte_pv's won't exist 4365 * for leaf pages. 4366 * 4367 * A multitude of races are possible here so if we 4368 * cannot lock definite state we clean out our cache 4369 * and break the inner while() loop to force a loop 4370 * up to the top of the for(). 4371 * 4372 * XXX unlock/relock pd_pv, pt_pv, and re-test their 4373 * validity instead of looping up? 4374 */ 4375 pte_pv = pv_get_try(pmap, pmap_pte_pindex(sva), 4376 &pte_placemark, &error); 4377 KKASSERT(pte_pv == NULL); 4378 if (error) { 4379 if (pd_pv) { 4380 pv_put(pd_pv); /* lock order */ 4381 pd_pv = NULL; 4382 } 4383 if (pt_pv) { 4384 pv_put(pt_pv); /* lock order */ 4385 pt_pv = NULL; 4386 } 4387 pv_placemarker_wait(pmap, pte_placemark); 4388 va_next = sva; /* retry */ 4389 break; 4390 } 4391 4392 /* 4393 * Reload *ptep after successfully locking the 4394 * pindex. 4395 */ 4396 cpu_ccfence(); 4397 oldpte = *ptep; 4398 if (oldpte == 0) { 4399 pv_placemarker_wakeup(pmap, pte_placemark); 4400 sva += PAGE_SIZE; 4401 ++ptep; 4402 continue; 4403 } 4404 4405 /* 4406 * We can't hold pd_pv across the callback (because 4407 * we don't pass it to the callback and the callback 4408 * might deadlock) 4409 */ 4410 if (pd_pv) { 4411 vm_page_wire_quick(pd_pv->pv_m); 4412 pv_unlock(pd_pv); 4413 } 4414 4415 /* 4416 * Ready for the callback. The locked placemarker 4417 * is consumed by the callback. 4418 */ 4419 if (oldpte & pmap->pmap_bits[PG_MANAGED_IDX]) { 4420 /* 4421 * Managed pte 4422 */ 4423 KASSERT((oldpte & pmap->pmap_bits[PG_V_IDX]), 4424 ("badC *ptep %016lx/%016lx sva %016lx", 4425 *ptep, oldpte, sva)); 4426 /* 4427 * We must unlock pd_pv across the callback 4428 * to avoid deadlocks on any recursive 4429 * disposal. Re-check that it still exists 4430 * after re-locking. 4431 * 4432 * Call target disposes of pte_placemark 4433 * and may destroy but will not dispose 4434 * of pt_pv. 
4435 */ 4436 info->func(pmap, info, pte_placemark, pt_pv, 4437 sva, ptep, info->arg); 4438 } else { 4439 /* 4440 * Unmanaged pte 4441 * 4442 * We must unlock pd_pv across the callback 4443 * to avoid deadlocks on any recursive 4444 * disposal. Re-check that it still exists 4445 * after re-locking. 4446 * 4447 * Call target disposes of pte_placemark 4448 * and may destroy but will not dispose 4449 * of pt_pv. 4450 */ 4451 KASSERT((oldpte & pmap->pmap_bits[PG_V_IDX]), 4452 ("badD *ptep %016lx/%016lx sva %016lx ", 4453 *ptep, oldpte, sva)); 4454 info->func(pmap, info, pte_placemark, pt_pv, 4455 sva, ptep, info->arg); 4456 } 4457 if (pd_pv) { 4458 pv_lock(pd_pv); 4459 if (vm_page_unwire_quick(pd_pv->pv_m)) { 4460 panic("pmap_scan_callback: " 4461 "bad wirecount on pd_pv"); 4462 } 4463 if (pd_pv->pv_pmap == NULL) { 4464 va_next = sva; /* retry */ 4465 break; 4466 } 4467 } 4468 4469 /* 4470 * NOTE: The cached pt_pv can be removed from the 4471 * pmap when pmap_dynamic_delete is enabled, 4472 * which will cause ptep to become stale. 4473 * 4474 * This also means that no pages remain under 4475 * the PT, so we can just break out of the inner 4476 * loop and let the outer loop clean everything 4477 * up. 4478 */ 4479 if (pt_pv && pt_pv->pv_pmap != pmap) 4480 break; 4481 sva += PAGE_SIZE; 4482 ++ptep; 4483 } 4484 } 4485 if (pd_pv) { 4486 pv_put(pd_pv); 4487 pd_pv = NULL; 4488 } 4489 if (pt_pv) { 4490 pv_put(pt_pv); 4491 pt_pv = NULL; 4492 } 4493 if ((++info->count & 7) == 0) 4494 lwkt_user_yield(); 4495 4496 /* 4497 * Relock before returning. 4498 */ 4499 spin_lock(&pmap->pm_spin); 4500 return (0); 4501 } 4502 4503 void 4504 pmap_remove(struct pmap *pmap, vm_offset_t sva, vm_offset_t eva) 4505 { 4506 struct pmap_scan_info info; 4507 4508 info.pmap = pmap; 4509 info.sva = sva; 4510 info.eva = eva; 4511 info.func = pmap_remove_callback; 4512 info.arg = NULL; 4513 pmap_scan(&info, 1); 4514 #if 0 4515 cpu_invltlb(); 4516 if (eva - sva < 1024*1024) { 4517 while (sva < eva) { 4518 cpu_invlpg((void *)sva); 4519 sva += PAGE_SIZE; 4520 } 4521 } 4522 #endif 4523 } 4524 4525 static void 4526 pmap_remove_noinval(struct pmap *pmap, vm_offset_t sva, vm_offset_t eva) 4527 { 4528 struct pmap_scan_info info; 4529 4530 info.pmap = pmap; 4531 info.sva = sva; 4532 info.eva = eva; 4533 info.func = pmap_remove_callback; 4534 info.arg = NULL; 4535 pmap_scan(&info, 0); 4536 } 4537 4538 static void 4539 pmap_remove_callback(pmap_t pmap, struct pmap_scan_info *info, 4540 vm_pindex_t *pte_placemark, pv_entry_t pt_pv, 4541 vm_offset_t va, pt_entry_t *ptep, void *arg __unused) 4542 { 4543 pt_entry_t pte; 4544 4545 /* 4546 * Managed or unmanaged pte (pte_placemark is non-NULL) 4547 * 4548 * pt_pv's wire_count is still bumped by unmanaged pages 4549 * so we must decrement it manually. 4550 * 4551 * We have to unwire the target page table page. 4552 */ 4553 pte = pmap_inval_bulk(info->bulk, va, ptep, 0); 4554 if (pte & pmap->pmap_bits[PG_MANAGED_IDX]) { 4555 vm_page_t p; 4556 4557 p = PHYS_TO_VM_PAGE(pte & PG_FRAME); 4558 KKASSERT(pte & pmap->pmap_bits[PG_V_IDX]); 4559 if (pte & pmap->pmap_bits[PG_M_IDX]) 4560 vm_page_dirty(p); 4561 if (pte & pmap->pmap_bits[PG_A_IDX]) 4562 vm_page_flag_set(p, PG_REFERENCED); 4563 4564 /* 4565 * NOTE: p is not hard-busied so it is not safe to 4566 * clear PG_MAPPED and PG_WRITEABLE on the 1->0 4567 * transition against them being set in 4568 * pmap_enter(). 
4569 */ 4570 if (pte & pmap->pmap_bits[PG_RW_IDX]) 4571 atomic_add_long(&p->md.writeable_count, -1); 4572 pmap_page_stats_deleting( 4573 atomic_fetchadd_long(&p->md.pmap_count, -1)); 4574 } 4575 if (pte & pmap->pmap_bits[PG_V_IDX]) { 4576 atomic_add_long(&pmap->pm_stats.resident_count, -1); 4577 if (pt_pv && vm_page_unwire_quick(pt_pv->pv_m)) 4578 panic("pmap_remove: insufficient wirecount"); 4579 } 4580 if (pte & pmap->pmap_bits[PG_W_IDX]) 4581 atomic_add_long(&pmap->pm_stats.wired_count, -1); 4582 if (pte & pmap->pmap_bits[PG_G_IDX]) 4583 cpu_invlpg((void *)va); 4584 pv_placemarker_wakeup(pmap, pte_placemark); 4585 } 4586 4587 /* 4588 * Removes this physical page from all physical maps in which it resides. 4589 * Reflects back modify bits to the pager. 4590 * 4591 * This routine may not be called from an interrupt. 4592 * 4593 * The page must be busied by its caller, preventing new ptes from being 4594 * installed. This allows us to assert that pmap_count is zero and safely 4595 * clear the MAPPED and WRITEABLE bits upon completion. 4596 */ 4597 static 4598 void 4599 pmap_remove_all(vm_page_t m) 4600 { 4601 int retry; 4602 4603 if (!pmap_initialized) 4604 return; 4605 4606 /* 4607 * pmap_count doesn't cover fictitious pages, but PG_MAPPED does 4608 * (albeit without certain race protections). 4609 */ 4610 #if 0 4611 if (m->md.pmap_count == 0) 4612 return; 4613 #endif 4614 if ((m->flags & PG_MAPPED) == 0) 4615 return; 4616 4617 retry = ticks + hz * 60; 4618 again: 4619 PMAP_PAGE_BACKING_SCAN(m, NULL, ipmap, iptep, ipte, iva) { 4620 if (!pmap_inval_smp_cmpset(ipmap, iva, iptep, ipte, 0)) 4621 PMAP_PAGE_BACKING_RETRY; 4622 if (ipte & ipmap->pmap_bits[PG_MANAGED_IDX]) { 4623 if (ipte & ipmap->pmap_bits[PG_M_IDX]) 4624 vm_page_dirty(m); 4625 if (ipte & ipmap->pmap_bits[PG_A_IDX]) 4626 vm_page_flag_set(m, PG_REFERENCED); 4627 4628 /* 4629 * NOTE: m is not hard-busied so it is not safe to 4630 * clear PG_MAPPED and PG_WRITEABLE on the 1->0 4631 * transition against them being set in 4632 * pmap_enter(). 4633 */ 4634 if (ipte & ipmap->pmap_bits[PG_RW_IDX]) 4635 atomic_add_long(&m->md.writeable_count, -1); 4636 pmap_page_stats_deleting( 4637 atomic_fetchadd_long(&m->md.pmap_count, -1)); 4638 } 4639 4640 /* 4641 * Cleanup various tracking counters. pt_pv can't go away 4642 * due to our wired ref. 4643 */ 4644 if (ipmap != &kernel_pmap) { 4645 pv_entry_t pt_pv; 4646 4647 spin_lock_shared(&ipmap->pm_spin); 4648 pt_pv = pv_entry_lookup(ipmap, pmap_pt_pindex(iva)); 4649 spin_unlock_shared(&ipmap->pm_spin); 4650 4651 if (pt_pv) { 4652 if (vm_page_unwire_quick(pt_pv->pv_m)) { 4653 panic("pmap_remove_all: bad " 4654 "wire_count on pt_pv"); 4655 } 4656 atomic_add_long( 4657 &ipmap->pm_stats.resident_count, -1); 4658 } 4659 } 4660 if (ipte & ipmap->pmap_bits[PG_W_IDX]) 4661 atomic_add_long(&ipmap->pm_stats.wired_count, -1); 4662 if (ipte & ipmap->pmap_bits[PG_G_IDX]) 4663 cpu_invlpg((void *)iva); 4664 } PMAP_PAGE_BACKING_DONE; 4665 4666 /* 4667 * pmap_count should be zero but it is possible to race a pmap_enter() 4668 * replacement (see 'oldm'). Once it is zero it cannot become 4669 * non-zero because the page is hard-busied. 
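 *
 * (The deadline test below uses the usual wrap-safe tick idiom:
 * 'retry' was set to ticks + hz * 60 above, and the signed
 * difference 'retry - ticks > 0' stays valid even if the ticks
 * counter wraps, bounding the retries to roughly 60 seconds.)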
4670 */ 4671 if (m->md.pmap_count || m->md.writeable_count) { 4672 tsleep(&m->md.pmap_count, 0, "pgunm", 1); 4673 if (retry - ticks > 0) 4674 goto again; 4675 panic("pmap_remove_all: cannot return pmap_count " 4676 "to 0 (%p, %ld, %ld)", 4677 m, m->md.pmap_count, m->md.writeable_count); 4678 } 4679 vm_page_flag_clear(m, PG_MAPPED | PG_WRITEABLE); 4680 } 4681 4682 /* 4683 * Removes the page from a particular pmap. 4684 * 4685 * The page must be busied by the caller. 4686 */ 4687 void 4688 pmap_remove_specific(pmap_t pmap_match, vm_page_t m) 4689 { 4690 if (!pmap_initialized) 4691 return; 4692 4693 /* 4694 * PG_MAPPED test works for both non-fictitious and fictitious pages. 4695 */ 4696 if ((m->flags & PG_MAPPED) == 0) 4697 return; 4698 4699 PMAP_PAGE_BACKING_SCAN(m, pmap_match, ipmap, iptep, ipte, iva) { 4700 if (!pmap_inval_smp_cmpset(ipmap, iva, iptep, ipte, 0)) 4701 PMAP_PAGE_BACKING_RETRY; 4702 if (ipte & ipmap->pmap_bits[PG_MANAGED_IDX]) { 4703 if (ipte & ipmap->pmap_bits[PG_M_IDX]) 4704 vm_page_dirty(m); 4705 if (ipte & ipmap->pmap_bits[PG_A_IDX]) 4706 vm_page_flag_set(m, PG_REFERENCED); 4707 4708 /* 4709 * NOTE: m is not hard-busied so it is not safe to 4710 * clear PG_MAPPED and PG_WRITEABLE on the 1->0 4711 * transition against them being set in 4712 * pmap_enter(). 4713 */ 4714 if (ipte & ipmap->pmap_bits[PG_RW_IDX]) 4715 atomic_add_long(&m->md.writeable_count, -1); 4716 pmap_page_stats_deleting( 4717 atomic_fetchadd_long(&m->md.pmap_count, -1)); 4718 } 4719 4720 /* 4721 * Cleanup various tracking counters. pt_pv can't go away 4722 * due to our wired ref. 4723 */ 4724 if (ipmap != &kernel_pmap) { 4725 pv_entry_t pt_pv; 4726 4727 spin_lock_shared(&ipmap->pm_spin); 4728 pt_pv = pv_entry_lookup(ipmap, pmap_pt_pindex(iva)); 4729 spin_unlock_shared(&ipmap->pm_spin); 4730 4731 if (pt_pv) { 4732 atomic_add_long( 4733 &ipmap->pm_stats.resident_count, -1); 4734 if (vm_page_unwire_quick(pt_pv->pv_m)) { 4735 panic("pmap_remove_specific: bad " 4736 "wire_count on pt_pv"); 4737 } 4738 } 4739 } 4740 if (ipte & ipmap->pmap_bits[PG_W_IDX]) 4741 atomic_add_long(&ipmap->pm_stats.wired_count, -1); 4742 if (ipte & ipmap->pmap_bits[PG_G_IDX]) 4743 cpu_invlpg((void *)iva); 4744 } PMAP_PAGE_BACKING_DONE; 4745 } 4746 4747 /* 4748 * Set the physical protection on the specified range of this map 4749 * as requested. This function is typically only used for debug watchpoints 4750 * and COW pages. 4751 * 4752 * This function may not be called from an interrupt if the map is 4753 * not the kernel_pmap. 4754 * 4755 * NOTE! For shared page table pages we just unmap the page. 
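 *
 * Illustrative use only (the caller and range are hypothetical): the VM
 * system typically downgrades a range to read-only ahead of copy-on-write
 * handling with something like
 *
 *	pmap_protect(vmspace_pmap(vm), start, end, VM_PROT_READ);
 *
 * A prot which still includes VM_PROT_WRITE is a no-op here, and a prot
 * with neither read nor execute degenerates into pmap_remove().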
4756 */ 4757 void 4758 pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) 4759 { 4760 struct pmap_scan_info info; 4761 /* JG review for NX */ 4762 4763 if (pmap == NULL) 4764 return; 4765 if ((prot & (VM_PROT_READ | VM_PROT_EXECUTE)) == VM_PROT_NONE) { 4766 pmap_remove(pmap, sva, eva); 4767 return; 4768 } 4769 if (prot & VM_PROT_WRITE) 4770 return; 4771 info.pmap = pmap; 4772 info.sva = sva; 4773 info.eva = eva; 4774 info.func = pmap_protect_callback; 4775 info.arg = &prot; 4776 pmap_scan(&info, 1); 4777 } 4778 4779 static 4780 void 4781 pmap_protect_callback(pmap_t pmap, struct pmap_scan_info *info, 4782 vm_pindex_t *pte_placemark, 4783 pv_entry_t pt_pv, vm_offset_t va, 4784 pt_entry_t *ptep, void *arg __unused) 4785 { 4786 pt_entry_t pbits; 4787 pt_entry_t cbits; 4788 vm_page_t m; 4789 4790 again: 4791 pbits = *ptep; 4792 cpu_ccfence(); 4793 cbits = pbits; 4794 if (pbits & pmap->pmap_bits[PG_MANAGED_IDX]) { 4795 cbits &= ~pmap->pmap_bits[PG_A_IDX]; 4796 cbits &= ~pmap->pmap_bits[PG_M_IDX]; 4797 } 4798 /* else unmanaged page, adjust bits, no wire changes */ 4799 4800 if (ptep) { 4801 cbits &= ~pmap->pmap_bits[PG_RW_IDX]; 4802 #ifdef PMAP_DEBUG2 4803 if (pmap_enter_debug > 0) { 4804 --pmap_enter_debug; 4805 kprintf("pmap_protect va=%lx ptep=%p " 4806 "pt_pv=%p cbits=%08lx\n", 4807 va, ptep, pt_pv, cbits 4808 ); 4809 } 4810 #endif 4811 if (pbits != cbits) { 4812 if (!pmap_inval_smp_cmpset(pmap, va, 4813 ptep, pbits, cbits)) { 4814 goto again; 4815 } 4816 } 4817 if (pbits & pmap->pmap_bits[PG_MANAGED_IDX]) { 4818 m = PHYS_TO_VM_PAGE(pbits & PG_FRAME); 4819 if (pbits & pmap->pmap_bits[PG_A_IDX]) 4820 vm_page_flag_set(m, PG_REFERENCED); 4821 if (pbits & pmap->pmap_bits[PG_M_IDX]) 4822 vm_page_dirty(m); 4823 if (pbits & pmap->pmap_bits[PG_RW_IDX]) 4824 atomic_add_long(&m->md.writeable_count, -1); 4825 4826 } 4827 } 4828 pv_placemarker_wakeup(pmap, pte_placemark); 4829 } 4830 4831 /* 4832 * Insert the vm_page (m) at the virtual address (va), replacing any prior 4833 * mapping at that address. Set protection and wiring as requested. 4834 * 4835 * If entry is non-NULL we check to see if the SEG_SIZE optimization is 4836 * possible. If it is we enter the page into the appropriate shared pmap 4837 * hanging off the related VM object instead of the passed pmap, then we 4838 * share the page table page from the VM object's pmap into the current pmap. 4839 * 4840 * NOTE: This routine MUST insert the page into the pmap now, it cannot 4841 * lazy-evaluate. 
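 *
 * For reference, a typical call (taken from the currently compiled-out
 * prefault helper later in this file) supplies the object page, the
 * protection, and the governing map entry:
 *
 *	pmap_enter(info->pmap, info->addr + x86_64_ptob(rel_index), p,
 *		   VM_PROT_READ, FALSE, info->entry);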
4842 */ 4843 void 4844 pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, 4845 boolean_t wired, vm_map_entry_t entry) 4846 { 4847 pv_entry_t pt_pv; /* page table */ 4848 pv_entry_t pte_pv; /* page table entry */ 4849 vm_pindex_t *pte_placemark; 4850 pt_entry_t *ptep; 4851 pt_entry_t origpte; 4852 vm_paddr_t opa; 4853 vm_page_t oldm; 4854 pt_entry_t newpte; 4855 vm_paddr_t pa; 4856 4857 if (pmap == NULL) 4858 return; 4859 va = trunc_page(va); 4860 #ifdef PMAP_DIAGNOSTIC 4861 if (va >= KvaEnd) 4862 panic("pmap_enter: toobig"); 4863 if ((va >= UPT_MIN_ADDRESS) && (va < UPT_MAX_ADDRESS)) 4864 panic("pmap_enter: invalid to pmap_enter page table " 4865 "pages (va: 0x%lx)", va); 4866 #endif 4867 if (va < UPT_MAX_ADDRESS && pmap == &kernel_pmap) { 4868 kprintf("Warning: pmap_enter called on UVA with " 4869 "kernel_pmap\n"); 4870 #ifdef DDB 4871 db_print_backtrace(); 4872 #endif 4873 } 4874 if (va >= UPT_MAX_ADDRESS && pmap != &kernel_pmap) { 4875 kprintf("Warning: pmap_enter called on KVA without" 4876 "kernel_pmap\n"); 4877 #ifdef DDB 4878 db_print_backtrace(); 4879 #endif 4880 } 4881 4882 /* 4883 * Get the locked page table page (pt_pv) for our new page table 4884 * entry, allocating it if necessary. 4885 * 4886 * There is no pte_pv for a terminal pte so the terminal pte will 4887 * be locked via pte_placemark. 4888 * 4889 * Only MMU actions by the CPU itself can modify the ptep out from 4890 * under us. 4891 * 4892 * If the pmap is still being initialized we assume existing 4893 * page tables. 4894 * 4895 * NOTE: Kernel mapppings do not track page table pages 4896 * (i.e. there is no pt_pv pt_pv structure). 4897 * 4898 * NOTE: origpte here is 'tentative', used only to check for 4899 * the degenerate case where the entry already exists and 4900 * matches. 4901 */ 4902 if (pmap_initialized == FALSE) { 4903 pte_pv = NULL; 4904 pt_pv = NULL; 4905 pte_placemark = NULL; 4906 ptep = vtopte(va); 4907 origpte = *ptep; 4908 } else { 4909 pte_pv = pv_get(pmap, pmap_pte_pindex(va), &pte_placemark); 4910 KKASSERT(pte_pv == NULL); 4911 if (va >= VM_MAX_USER_ADDRESS) { 4912 pt_pv = NULL; 4913 ptep = vtopte(va); 4914 } else { 4915 pt_pv = pmap_allocpte(pmap, pmap_pt_pindex(va), NULL); 4916 ptep = pv_pte_lookup(pt_pv, pmap_pte_index(va)); 4917 } 4918 origpte = *ptep; 4919 cpu_ccfence(); 4920 } 4921 4922 pa = VM_PAGE_TO_PHYS(m); 4923 4924 /* 4925 * Calculate the new PTE. 4926 */ 4927 newpte = (pt_entry_t)(pa | pte_prot(pmap, prot) | 4928 pmap->pmap_bits[PG_V_IDX] | pmap->pmap_bits[PG_A_IDX]); 4929 if (wired) 4930 newpte |= pmap->pmap_bits[PG_W_IDX]; 4931 if (va < VM_MAX_USER_ADDRESS) 4932 newpte |= pmap->pmap_bits[PG_U_IDX]; 4933 if ((m->flags & PG_FICTITIOUS) == 0) 4934 newpte |= pmap->pmap_bits[PG_MANAGED_IDX]; 4935 // if (pmap == &kernel_pmap) 4936 // newpte |= pgeflag; 4937 newpte |= pmap->pmap_cache_bits_pte[m->pat_mode]; 4938 4939 /* 4940 * It is possible for multiple faults to occur in threaded 4941 * environments, the existing pte might be correct. 4942 */ 4943 if (((origpte ^ newpte) & 4944 ~(pt_entry_t)(pmap->pmap_bits[PG_M_IDX] | 4945 pmap->pmap_bits[PG_A_IDX])) == 0) { 4946 goto done; 4947 } 4948 4949 /* 4950 * Adjust page flags. The page is soft-busied or hard-busied, we 4951 * should be able to safely set PG_* flag bits even with the (shared) 4952 * soft-busy. 4953 * 4954 * The pmap_count and writeable_count is only tracked for 4955 * non-fictitious pages. As a bit of a safety, bump pmap_count 4956 * and set the PG_* bits before mapping the page. 
If another part 4957 * of the system does not properly hard-busy the page (against our 4958 * soft-busy or hard-busy) in order to remove mappings it might not 4959 * see the pte that we are about to add and thus will not be able to 4960 * drop pmap_count to 0. 4961 * 4962 * The PG_MAPPED and PG_WRITEABLE flags are set for any type of page. 4963 * 4964 * NOTE! PG_MAPPED and PG_WRITEABLE can only be cleared when 4965 * the page is hard-busied AND pmap_count is 0. This 4966 * interlocks our setting of the flags here. 4967 */ 4968 /*vm_page_spin_lock(m);*/ 4969 if ((m->flags & PG_FICTITIOUS) == 0) { 4970 pmap_page_stats_adding( 4971 atomic_fetchadd_long(&m->md.pmap_count, 1)); 4972 if (newpte & pmap->pmap_bits[PG_RW_IDX]) 4973 atomic_add_long(&m->md.writeable_count, 1); 4974 } 4975 if (newpte & pmap->pmap_bits[PG_RW_IDX]) { 4976 if ((m->flags & (PG_MAPPED | PG_WRITEABLE)) == 0) 4977 vm_page_flag_set(m, PG_MAPPED | PG_WRITEABLE); 4978 } else { 4979 if ((m->flags & PG_MAPPED) == 0) 4980 vm_page_flag_set(m, PG_MAPPED); 4981 } 4982 /*vm_page_spin_unlock(m);*/ 4983 4984 /* 4985 * A race can develop when replacing an existing mapping. The new 4986 * page has been busied and the pte is placemark-locked, but the 4987 * old page could be ripped out from under us at any time by 4988 * a backing scan. 4989 * 4990 * The race is handled by having the backing scans check pmap_count and 4991 * writeable_count when doing operations that should ensure one 4992 * becomes 0. 4993 */ 4994 opa = origpte & PG_FRAME; 4995 if (opa && (origpte & pmap->pmap_bits[PG_MANAGED_IDX])) { 4996 oldm = PHYS_TO_VM_PAGE(opa); 4997 KKASSERT(opa == oldm->phys_addr); 4998 KKASSERT(entry != NULL); 4999 } else { 5000 oldm = NULL; 5001 } 5002 5003 /* 5004 * Swap the new and old PTEs and perform any necessary SMP 5005 * synchronization. 5006 */ 5007 if ((prot & VM_PROT_NOSYNC) || (opa == 0 && pt_pv != NULL)) { 5008 /* 5009 * Explicitly permitted to avoid pmap cpu mask synchronization 5010 * or the prior content of a non-kernel-related pmap was 5011 * invalid. 5012 */ 5013 origpte = atomic_swap_long(ptep, newpte); 5014 if (opa) 5015 cpu_invlpg((void *)va); 5016 } else { 5017 /* 5018 * Not permitted to avoid pmap cpu mask synchronization 5019 * or the prior content is being replaced or this is a kernel 5020 * related pmap. 5021 * 5022 * Due to other kernel optimizations, we cannot assume a 5023 * 0->non_zero transition of *ptep can be done with a swap. 5024 */ 5025 origpte = pmap_inval_smp(pmap, va, 1, ptep, newpte); 5026 } 5027 opa = origpte & PG_FRAME; 5028 5029 #ifdef PMAP_DEBUG2 5030 if (pmap_enter_debug > 0) { 5031 --pmap_enter_debug; 5032 kprintf("pmap_enter: va=%lx m=%p origpte=%lx newpte=%lx ptep=%p" 5033 " pte_pv=%p pt_pv=%p opa=%lx prot=%02x\n", 5034 va, m, 5035 origpte, newpte, ptep, 5036 pte_pv, pt_pv, opa, prot); 5037 } 5038 #endif 5039 5040 /* 5041 * Account for the changes in the pt_pv and pmap. 5042 * 5043 * Retain the same wiring count due to replacing an existing page, 5044 * or bump the wiring count for a new page. 5045 */ 5046 if (pt_pv && opa == 0) { 5047 vm_page_wire_quick(pt_pv->pv_m); 5048 atomic_add_long(&pt_pv->pv_pmap->pm_stats.resident_count, 1); 5049 } 5050 if (wired && (origpte & pmap->pmap_bits[PG_W_IDX]) == 0) 5051 atomic_add_long(&pmap->pm_stats.wired_count, 1); 5052 5053 /* 5054 * Account for the removal of the old page. pmap and pt_pv stats 5055 * have already been fully adjusted for both. 5056 * 5057 * WARNING! oldm is not soft or hard-busied.
The pte at worst can 5058 * only be removed out from under us since we hold the 5059 * placemarker. So if it is still there, it must not have 5060 * changed. 5061 */ 5062 if (opa && (origpte & pmap->pmap_bits[PG_MANAGED_IDX])) { 5063 KKASSERT(oldm == PHYS_TO_VM_PAGE(opa)); 5064 if (origpte & pmap->pmap_bits[PG_M_IDX]) 5065 vm_page_dirty(oldm); 5066 if (origpte & pmap->pmap_bits[PG_A_IDX]) 5067 vm_page_flag_set(oldm, PG_REFERENCED); 5068 5069 /* 5070 * NOTE: oldm is not hard-busied so it is not safe to 5071 * clear PG_MAPPED and PG_WRITEABLE on the 1->0 5072 * transition against them being set in 5073 * pmap_enter(). 5074 */ 5075 if (origpte & pmap->pmap_bits[PG_RW_IDX]) 5076 atomic_add_long(&oldm->md.writeable_count, -1); 5077 pmap_page_stats_deleting( 5078 atomic_fetchadd_long(&oldm->md.pmap_count, -1)); 5079 } 5080 5081 done: 5082 KKASSERT((newpte & pmap->pmap_bits[PG_MANAGED_IDX]) == 0 || 5083 (m->flags & PG_MAPPED)); 5084 5085 /* 5086 * Cleanup the pv entry, allowing other accessors. If the new page 5087 * is not managed but we have a pte_pv (which was locking our 5088 * operation), we can free it now. pte_pv->pv_m should be NULL. 5089 */ 5090 if (pte_placemark) 5091 pv_placemarker_wakeup(pmap, pte_placemark); 5092 if (pt_pv) 5093 pv_put(pt_pv); 5094 } 5095 5096 /* 5097 * Make a temporary mapping for a physical address. This is only intended 5098 * to be used for panic dumps. 5099 * 5100 * The caller is responsible for calling smp_invltlb(). 5101 */ 5102 void * 5103 pmap_kenter_temporary(vm_paddr_t pa, long i) 5104 { 5105 pmap_kenter_quick((vm_offset_t)crashdumpmap + (i * PAGE_SIZE), pa); 5106 return ((void *)crashdumpmap); 5107 } 5108 5109 #if 0 5110 #define MAX_INIT_PT (96) 5111 5112 /* 5113 * This routine preloads the ptes for a given object into the specified pmap. 5114 * This eliminates the blast of soft faults on process startup and 5115 * immediately after an mmap. 5116 */ 5117 static int pmap_object_init_pt_callback(vm_page_t p, void *data); 5118 #endif 5119 5120 void 5121 pmap_object_init_pt(pmap_t pmap, vm_map_entry_t entry, 5122 vm_offset_t addr, vm_size_t size, int limit) 5123 { 5124 #if 0 5125 vm_prot_t prot = entry->protection; 5126 vm_object_t object = entry->ba.object; 5127 vm_pindex_t pindex = atop(entry->ba.offset + (addr - entry->ba.start)); 5128 struct rb_vm_page_scan_info info; 5129 struct lwp *lp; 5130 vm_size_t psize; 5131 5132 /* 5133 * We can't preinit if read access isn't set or there is no pmap 5134 * or object. 5135 */ 5136 if ((prot & VM_PROT_READ) == 0 || pmap == NULL || object == NULL) 5137 return; 5138 5139 /* 5140 * We can't preinit if the pmap is not the current pmap 5141 */ 5142 lp = curthread->td_lwp; 5143 if (lp == NULL || pmap != vmspace_pmap(lp->lwp_vmspace)) 5144 return; 5145 5146 /* 5147 * Misc additional checks 5148 */ 5149 psize = x86_64_btop(size); 5150 5151 if ((object->type != OBJT_VNODE) || 5152 ((limit & MAP_PREFAULT_PARTIAL) && (psize > MAX_INIT_PT) && 5153 (object->resident_page_count > MAX_INIT_PT))) { 5154 return; 5155 } 5156 5157 if (pindex + psize > object->size) { 5158 if (object->size < pindex) 5159 return; 5160 psize = object->size - pindex; 5161 } 5162 5163 if (psize == 0) 5164 return; 5165 5166 /* 5167 * If everything is segment-aligned do not pre-init here. Instead 5168 * allow the normal vm_fault path to pass a segment hint to 5169 * pmap_enter() which will then use an object-referenced shared 5170 * page table page. 
5171 */ 5172 if ((addr & SEG_MASK) == 0 && 5173 (ctob(psize) & SEG_MASK) == 0 && 5174 (ctob(pindex) & SEG_MASK) == 0) { 5175 return; 5176 } 5177 5178 /* 5179 * Use a red-black scan to traverse the requested range and load 5180 * any valid pages found into the pmap. 5181 * 5182 * We cannot safely scan the object's memq without holding the 5183 * object token. 5184 */ 5185 info.start_pindex = pindex; 5186 info.end_pindex = pindex + psize - 1; 5187 info.limit = limit; 5188 info.mpte = NULL; 5189 info.addr = addr; 5190 info.pmap = pmap; 5191 info.object = object; 5192 info.entry = entry; 5193 5194 /* 5195 * By using the NOLK scan, the callback function must be sure 5196 * to return -1 if the VM page falls out of the object. 5197 */ 5198 vm_object_hold_shared(object); 5199 vm_page_rb_tree_RB_SCAN_NOLK(&object->rb_memq, rb_vm_page_scancmp, 5200 pmap_object_init_pt_callback, &info); 5201 vm_object_drop(object); 5202 #endif 5203 } 5204 5205 #if 0 5206 5207 static 5208 int 5209 pmap_object_init_pt_callback(vm_page_t p, void *data) 5210 { 5211 struct rb_vm_page_scan_info *info = data; 5212 vm_pindex_t rel_index; 5213 int hard_busy; 5214 5215 /* 5216 * don't allow an madvise to blow away our really 5217 * free pages allocating pv entries. 5218 */ 5219 if ((info->limit & MAP_PREFAULT_MADVISE) && 5220 vmstats.v_free_count < vmstats.v_free_reserved) { 5221 return(-1); 5222 } 5223 5224 /* 5225 * Ignore list markers and ignore pages we cannot instantly 5226 * busy (while holding the object token). 5227 */ 5228 if (p->flags & PG_MARKER) 5229 return 0; 5230 hard_busy = 0; 5231 again: 5232 if (hard_busy) { 5233 if (vm_page_busy_try(p, TRUE)) 5234 return 0; 5235 } else { 5236 if (vm_page_sbusy_try(p)) 5237 return 0; 5238 } 5239 if (((p->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL) && 5240 (p->flags & PG_FICTITIOUS) == 0) { 5241 if ((p->queue - p->pc) == PQ_CACHE) { 5242 if (hard_busy == 0) { 5243 vm_page_sbusy_drop(p); 5244 hard_busy = 1; 5245 goto again; 5246 } 5247 vm_page_deactivate(p); 5248 } 5249 rel_index = p->pindex - info->start_pindex; 5250 pmap_enter(info->pmap, info->addr + x86_64_ptob(rel_index), p, 5251 VM_PROT_READ, FALSE, info->entry); 5252 } 5253 if (hard_busy) 5254 vm_page_wakeup(p); 5255 else 5256 vm_page_sbusy_drop(p); 5257 5258 /* 5259 * We are using an unlocked scan (that is, the scan expects its 5260 * current element to remain in the tree on return). So we have 5261 * to check here and abort the scan if it isn't. 5262 */ 5263 if (p->object != info->object) 5264 return -1; 5265 lwkt_yield(); 5266 return(0); 5267 } 5268 5269 #endif 5270 5271 /* 5272 * Return TRUE if the pmap is in shape to trivially pre-fault the specified 5273 * address. 5274 * 5275 * Returns FALSE if it would be non-trivial or if a pte is already loaded 5276 * into the slot. 5277 * 5278 * The address must reside within a vm_map mapped range to ensure that the 5279 * page table doesn't get ripped out from under us. 5280 * 5281 * XXX This is safe only because page table pages are not freed. 5282 */ 5283 int 5284 pmap_prefault_ok(pmap_t pmap, vm_offset_t addr) 5285 { 5286 pt_entry_t *pte; 5287 5288 /*spin_lock(&pmap->pm_spin);*/ 5289 if ((pte = pmap_pte(pmap, addr)) != NULL) { 5290 if (*pte & pmap->pmap_bits[PG_V_IDX]) { 5291 /*spin_unlock(&pmap->pm_spin);*/ 5292 return FALSE; 5293 } 5294 } 5295 /*spin_unlock(&pmap->pm_spin);*/ 5296 return TRUE; 5297 } 5298 5299 /* 5300 * Change the wiring attribute for a pmap/va pair. The mapping must already 5301 * exist in the pmap. The mapping may or may not be managed. 
The wiring in 5302 * the page is not changed, the page is returned so the caller can adjust 5303 * its wiring (the page is not locked in any way). 5304 * 5305 * Wiring is not a hardware characteristic so there is no need to invalidate 5306 * TLB. However, in an SMP environment we must use a locked bus cycle to 5307 * update the pte (if we are not using the pmap_inval_*() API that is)... 5308 * it's ok to do this for simple wiring changes. 5309 */ 5310 vm_page_t 5311 pmap_unwire(pmap_t pmap, vm_offset_t va) 5312 { 5313 pt_entry_t *ptep; 5314 pv_entry_t pt_pv; 5315 vm_paddr_t pa; 5316 vm_page_t m; 5317 5318 if (pmap == NULL) 5319 return NULL; 5320 5321 /* 5322 * Assume elements in the kernel pmap are stable 5323 */ 5324 if (pmap == &kernel_pmap) { 5325 if (pmap_pt(pmap, va) == 0) 5326 return NULL; 5327 ptep = pmap_pte_quick(pmap, va); 5328 if (pmap_pte_v(pmap, ptep)) { 5329 if (pmap_pte_w(pmap, ptep)) 5330 atomic_add_long(&pmap->pm_stats.wired_count,-1); 5331 atomic_clear_long(ptep, pmap->pmap_bits[PG_W_IDX]); 5332 pa = *ptep & PG_FRAME; 5333 m = PHYS_TO_VM_PAGE(pa); 5334 } else { 5335 m = NULL; 5336 } 5337 } else { 5338 /* 5339 * We can only [un]wire pmap-local pages (we cannot wire 5340 * shared pages) 5341 */ 5342 pt_pv = pv_get(pmap, pmap_pt_pindex(va), NULL); 5343 if (pt_pv == NULL) 5344 return NULL; 5345 5346 ptep = pv_pte_lookup(pt_pv, pmap_pte_index(va)); 5347 if ((*ptep & pmap->pmap_bits[PG_V_IDX]) == 0) { 5348 pv_put(pt_pv); 5349 return NULL; 5350 } 5351 5352 if (pmap_pte_w(pmap, ptep)) { 5353 atomic_add_long(&pt_pv->pv_pmap->pm_stats.wired_count, 5354 -1); 5355 } 5356 /* XXX else return NULL so caller doesn't unwire m ? */ 5357 5358 atomic_clear_long(ptep, pmap->pmap_bits[PG_W_IDX]); 5359 5360 pa = *ptep & PG_FRAME; 5361 m = PHYS_TO_VM_PAGE(pa); /* held by wired count */ 5362 pv_put(pt_pv); 5363 } 5364 return m; 5365 } 5366 5367 /* 5368 * Copy the range specified by src_addr/len from the source map to 5369 * the range dst_addr/len in the destination map. 5370 * 5371 * This routine is only advisory and need not do anything. 5372 */ 5373 void 5374 pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, 5375 vm_size_t len, vm_offset_t src_addr) 5376 { 5377 } 5378 5379 /* 5380 * pmap_zero_page: 5381 * 5382 * Zero the specified physical page. 5383 * 5384 * This function may be called from an interrupt and no locking is 5385 * required. 5386 */ 5387 void 5388 pmap_zero_page(vm_paddr_t phys) 5389 { 5390 vm_offset_t va = PHYS_TO_DMAP(phys); 5391 5392 pagezero((void *)va); 5393 } 5394 5395 /* 5396 * pmap_zero_page: 5397 * 5398 * Zero part of a physical page by mapping it into memory and clearing 5399 * its contents with bzero. 5400 * 5401 * off and size may not cover an area beyond a single hardware page. 5402 */ 5403 void 5404 pmap_zero_page_area(vm_paddr_t phys, int off, int size) 5405 { 5406 vm_offset_t virt = PHYS_TO_DMAP(phys); 5407 5408 bzero((char *)virt + off, size); 5409 } 5410 5411 /* 5412 * pmap_copy_page: 5413 * 5414 * Copy the physical page from the source PA to the target PA. 5415 * This function may be called from an interrupt. No locking 5416 * is required. 5417 */ 5418 void 5419 pmap_copy_page(vm_paddr_t src, vm_paddr_t dst) 5420 { 5421 vm_offset_t src_virt, dst_virt; 5422 5423 src_virt = PHYS_TO_DMAP(src); 5424 dst_virt = PHYS_TO_DMAP(dst); 5425 bcopy((void *)src_virt, (void *)dst_virt, PAGE_SIZE); 5426 } 5427 5428 /* 5429 * pmap_copy_page_frag: 5430 * 5431 * Copy the physical page from the source PA to the target PA. 
5432 * This function may be called from an interrupt. No locking 5433 * is required. 5434 */ 5435 void 5436 pmap_copy_page_frag(vm_paddr_t src, vm_paddr_t dst, size_t bytes) 5437 { 5438 vm_offset_t src_virt, dst_virt; 5439 5440 src_virt = PHYS_TO_DMAP(src); 5441 dst_virt = PHYS_TO_DMAP(dst); 5442 5443 bcopy((char *)src_virt + (src & PAGE_MASK), 5444 (char *)dst_virt + (dst & PAGE_MASK), 5445 bytes); 5446 } 5447 5448 /* 5449 * Remove all pages from specified address space this aids process exit 5450 * speeds. Also, this code may be special cased for the current process 5451 * only. 5452 */ 5453 void 5454 pmap_remove_pages(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 5455 { 5456 pmap_remove_noinval(pmap, sva, eva); 5457 cpu_invltlb(); 5458 } 5459 5460 /* 5461 * pmap_testbit tests bits in pte's note that the testbit/clearbit 5462 * routines are inline, and a lot of things compile-time evaluate. 5463 */ 5464 static 5465 boolean_t 5466 pmap_testbit(vm_page_t m, int bit) 5467 { 5468 int res = FALSE; 5469 5470 if (!pmap_initialized || (m->flags & PG_FICTITIOUS)) 5471 return FALSE; 5472 /* 5473 * Nothing to do if all the mappings are already read-only. 5474 * The page's [M]odify bits have already been synchronized 5475 * to the vm_page_t and cleaned out. 5476 */ 5477 if (bit == PG_M_IDX && m->md.writeable_count == 0) 5478 return FALSE; 5479 5480 /* 5481 * Iterate the mapping 5482 */ 5483 PMAP_PAGE_BACKING_SCAN(m, NULL, ipmap, iptep, ipte, iva) { 5484 if (ipte & ipmap->pmap_bits[bit]) { 5485 res = TRUE; 5486 break; 5487 } 5488 } PMAP_PAGE_BACKING_DONE; 5489 return res; 5490 } 5491 5492 /* 5493 * This routine is used to modify bits in ptes. Only one bit should be 5494 * specified. PG_RW requires special handling. This call works with 5495 * any sort of mapped page. PG_FICTITIOUS pages might not be optimal. 5496 * 5497 * Caller must NOT hold any spin locks 5498 * Caller must hold (m) hard-busied 5499 * 5500 * NOTE: When clearing PG_M we could also (not implemented) drop 5501 * through to the PG_RW code and clear PG_RW too, forcing 5502 * a fault on write to redetect PG_M for virtual kernels, but 5503 * it isn't necessary since virtual kernels invalidate the 5504 * pte when they clear the VPTE_M bit in their virtual page 5505 * tables. 5506 * 5507 * NOTE: Does not re-dirty the page when clearing only PG_M. 5508 * 5509 * NOTE: Because we do not lock the pv, *pte can be in a state of 5510 * flux. Despite this the value of *pte is still somewhat 5511 * related while we hold the vm_page spin lock. 5512 * 5513 * *pte can be zero due to this race. Since we are clearing 5514 * bits we basically do no harm when this race occurs. 5515 */ 5516 static __inline 5517 void 5518 pmap_clearbit(vm_page_t m, int bit_index) 5519 { 5520 pt_entry_t npte; 5521 int retry; 5522 5523 /* 5524 * Too early in the boot 5525 */ 5526 if (!pmap_initialized) { 5527 if (bit_index == PG_RW_IDX) 5528 vm_page_flag_clear(m, PG_WRITEABLE); 5529 return; 5530 } 5531 5532 /* 5533 * Being asked to clear other random bits, we don't track them 5534 * so we have to iterate. 5535 */ 5536 if (bit_index != PG_RW_IDX) { 5537 PMAP_PAGE_BACKING_SCAN(m, NULL, ipmap, iptep, ipte, iva) { 5538 if (ipte & ipmap->pmap_bits[bit_index]) { 5539 atomic_clear_long(iptep, 5540 ipmap->pmap_bits[bit_index]); 5541 } 5542 } PMAP_PAGE_BACKING_DONE; 5543 return; 5544 } 5545 5546 /* 5547 * Being asked to clear the RW bit. 
5548 * 5549 * Nothing to do if all the mappings are already read-only 5550 */ 5551 if (m->md.writeable_count == 0) 5552 return; 5553 5554 /* 5555 * Iterate the mappings and check. 5556 */ 5557 retry = ticks + hz * 60; 5558 again: 5559 /* 5560 * Clear PG_RW. This also clears PG_M and marks the page dirty if 5561 * PG_M was set. 5562 * 5563 * Since the caller holds the page hard-busied we can safely clear 5564 * PG_WRITEABLE, and callers expect us to for the PG_RW_IDX path. 5565 */ 5566 PMAP_PAGE_BACKING_SCAN(m, NULL, ipmap, iptep, ipte, iva) { 5567 #if 0 5568 if ((ipte & ipmap->pmap_bits[PG_MANAGED_IDX]) == 0) 5569 continue; 5570 #endif 5571 if ((ipte & ipmap->pmap_bits[PG_RW_IDX]) == 0) 5572 continue; 5573 npte = ipte & ~(ipmap->pmap_bits[PG_RW_IDX] | 5574 ipmap->pmap_bits[PG_M_IDX]); 5575 if (!pmap_inval_smp_cmpset(ipmap, iva, iptep, ipte, npte)) 5576 PMAP_PAGE_BACKING_RETRY; 5577 if (ipte & ipmap->pmap_bits[PG_M_IDX]) 5578 vm_page_dirty(m); 5579 5580 /* 5581 * NOTE: m is not hard-busied so it is not safe to 5582 * clear PG_WRITEABLE on the 1->0 transition 5583 * against it being set in pmap_enter(). 5584 * 5585 * pmap_count and writeable_count are only applicable 5586 * to non-fictitious pages (PG_MANAGED_IDX from pte) 5587 */ 5588 if (ipte & ipmap->pmap_bits[PG_MANAGED_IDX]) 5589 atomic_add_long(&m->md.writeable_count, -1); 5590 } PMAP_PAGE_BACKING_DONE; 5591 5592 /* 5593 * writeable_count should be zero but it is possible to race 5594 * a pmap_enter() replacement (see 'oldm'). Once it is zero 5595 * it cannot become non-zero because the page is hard-busied. 5596 */ 5597 if (m->md.writeable_count != 0) { 5598 tsleep(&m->md.writeable_count, 0, "pgwab", 1); 5599 if (retry - ticks > 0) 5600 goto again; 5601 panic("pmap_remove_all: cannot return writeable_count " 5602 "to 0 (%ld)", 5603 m->md.writeable_count); 5604 } 5605 vm_page_flag_clear(m, PG_WRITEABLE); 5606 } 5607 5608 /* 5609 * Lower the permission for all mappings to a given page. 5610 * 5611 * Page must be hard-busied by caller. Because the page is busied by the 5612 * caller, this should not be able to race a pmap_enter(). 5613 */ 5614 void 5615 pmap_page_protect(vm_page_t m, vm_prot_t prot) 5616 { 5617 /* JG NX support? */ 5618 if ((prot & VM_PROT_WRITE) == 0) { 5619 if (prot & (VM_PROT_READ | VM_PROT_EXECUTE)) { 5620 /* 5621 * NOTE: pmap_clearbit(.. PG_RW) also clears 5622 * the PG_WRITEABLE flag in (m). 5623 */ 5624 pmap_clearbit(m, PG_RW_IDX); 5625 } else { 5626 pmap_remove_all(m); 5627 } 5628 } 5629 } 5630 5631 vm_paddr_t 5632 pmap_phys_address(vm_pindex_t ppn) 5633 { 5634 return (x86_64_ptob(ppn)); 5635 } 5636 5637 /* 5638 * Return a count of reference bits for a page, clearing those bits. 5639 * It is not necessary for every reference bit to be cleared, but it 5640 * is necessary that 0 only be returned when there are truly no 5641 * reference bits set. 5642 * 5643 * XXX: The exact number of bits to check and clear is a matter that 5644 * should be tested and standardized at some point in the future for 5645 * optimal aging of shared pages. 5646 * 5647 * This routine may not block. 
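 *
 * Illustrative caller sketch (usage assumed): a page-aging scan might
 * use the returned count, which is capped at 5 here, roughly as
 *
 *	act = pmap_ts_referenced(m);	/* clears up to 5 PG_A bits */
 *	if (act)
 *		... treat the page as recently referenced ...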
5648 */ 5649 int 5650 pmap_ts_referenced(vm_page_t m) 5651 { 5652 int rval = 0; 5653 pt_entry_t npte; 5654 5655 if (!pmap_initialized || (m->flags & PG_FICTITIOUS)) 5656 return rval; 5657 PMAP_PAGE_BACKING_SCAN(m, NULL, ipmap, iptep, ipte, iva) { 5658 if (ipte & ipmap->pmap_bits[PG_A_IDX]) { 5659 npte = ipte & ~ipmap->pmap_bits[PG_A_IDX]; 5660 if (!atomic_cmpset_long(iptep, ipte, npte)) 5661 PMAP_PAGE_BACKING_RETRY; 5662 ++rval; 5663 if (rval > 4) 5664 break; 5665 } 5666 } PMAP_PAGE_BACKING_DONE; 5667 return rval; 5668 } 5669 5670 /* 5671 * pmap_is_modified: 5672 * 5673 * Return whether or not the specified physical page was modified 5674 * in any physical maps. 5675 */ 5676 boolean_t 5677 pmap_is_modified(vm_page_t m) 5678 { 5679 boolean_t res; 5680 5681 res = pmap_testbit(m, PG_M_IDX); 5682 return (res); 5683 } 5684 5685 /* 5686 * Clear the modify bit on the vm_page. 5687 * 5688 * The page must be hard-busied. 5689 */ 5690 void 5691 pmap_clear_modify(vm_page_t m) 5692 { 5693 pmap_clearbit(m, PG_M_IDX); 5694 } 5695 5696 /* 5697 * pmap_clear_reference: 5698 * 5699 * Clear the reference bit on the specified physical page. 5700 */ 5701 void 5702 pmap_clear_reference(vm_page_t m) 5703 { 5704 pmap_clearbit(m, PG_A_IDX); 5705 } 5706 5707 /* 5708 * Miscellaneous support routines follow 5709 */ 5710 5711 static 5712 void 5713 x86_64_protection_init(void) 5714 { 5715 uint64_t *kp; 5716 int prot; 5717 5718 /* 5719 * NX supported? (boot time loader.conf override only) 5720 * 5721 * -1 Automatic (sets mode 1) 5722 * 0 Disabled 5723 * 1 NX implemented, differentiates PROT_READ vs PROT_READ|PROT_EXEC 5724 * 2 NX implemented for all cases 5725 */ 5726 TUNABLE_INT_FETCH("machdep.pmap_nx_enable", &pmap_nx_enable); 5727 if ((amd_feature & AMDID_NX) == 0) { 5728 pmap_bits_default[PG_NX_IDX] = 0; 5729 pmap_nx_enable = 0; 5730 } else if (pmap_nx_enable < 0) { 5731 pmap_nx_enable = 1; /* default to mode 1 (READ) */ 5732 } 5733 5734 /* 5735 * 0 is basically read-only access, but also set the NX (no-execute) 5736 * bit when VM_PROT_EXECUTE is not specified. 5737 */ 5738 kp = protection_codes; 5739 for (prot = 0; prot < PROTECTION_CODES_SIZE; prot++) { 5740 switch (prot) { 5741 case VM_PROT_NONE | VM_PROT_NONE | VM_PROT_NONE: 5742 /* 5743 * This case handled elsewhere 5744 */ 5745 *kp = 0; 5746 break; 5747 case VM_PROT_READ | VM_PROT_NONE | VM_PROT_NONE: 5748 /* 5749 * Read-only is 0|NX (pmap_nx_enable mode >= 1) 5750 */ 5751 if (pmap_nx_enable >= 1) 5752 *kp = pmap_bits_default[PG_NX_IDX]; 5753 break; 5754 case VM_PROT_READ | VM_PROT_NONE | VM_PROT_EXECUTE: 5755 case VM_PROT_NONE | VM_PROT_NONE | VM_PROT_EXECUTE: 5756 /* 5757 * Execute requires read access 5758 */ 5759 *kp = 0; 5760 break; 5761 case VM_PROT_NONE | VM_PROT_WRITE | VM_PROT_NONE: 5762 case VM_PROT_READ | VM_PROT_WRITE | VM_PROT_NONE: 5763 /* 5764 * Write without execute is RW|NX 5765 * (pmap_nx_enable mode >= 2) 5766 */ 5767 *kp = pmap_bits_default[PG_RW_IDX]; 5768 if (pmap_nx_enable >= 2) 5769 *kp |= pmap_bits_default[PG_NX_IDX]; 5770 break; 5771 case VM_PROT_READ | VM_PROT_WRITE | VM_PROT_EXECUTE: 5772 case VM_PROT_NONE | VM_PROT_WRITE | VM_PROT_EXECUTE: 5773 /* 5774 * Write with execute is RW 5775 */ 5776 *kp = pmap_bits_default[PG_RW_IDX]; 5777 break; 5778 } 5779 ++kp; 5780 } 5781 } 5782 5783 /* 5784 * Map a set of physical memory pages into the kernel virtual 5785 * address space. Return a pointer to where it is mapped. This 5786 * routine is intended to be used for mapping device memory, 5787 * NOT real memory. 
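 *
 * Illustrative driver-style usage (bar_pa and bar_size are hypothetical):
 *
 *	regs = pmap_mapdev_attr(bar_pa, bar_size, PAT_UNCACHEABLE);
 *	... access the device registers through 'regs' ...
 *	pmap_unmapdev((vm_offset_t)regs, bar_size);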
5788 * 5789 * NOTE: We can't use pgeflag unless we invalidate the pages one at 5790 * a time. 5791 * 5792 * NOTE: The PAT attributes {WRITE_BACK, WRITE_THROUGH, UNCACHED, UNCACHEABLE} 5793 * work whether the cpu supports PAT or not. The remaining PAT 5794 * attributes {WRITE_PROTECTED, WRITE_COMBINING} only work if the cpu 5795 * supports PAT. 5796 */ 5797 void * 5798 pmap_mapdev(vm_paddr_t pa, vm_size_t size) 5799 { 5800 return(pmap_mapdev_attr(pa, size, PAT_WRITE_BACK)); 5801 } 5802 5803 void * 5804 pmap_mapdev_uncacheable(vm_paddr_t pa, vm_size_t size) 5805 { 5806 return(pmap_mapdev_attr(pa, size, PAT_UNCACHEABLE)); 5807 } 5808 5809 void * 5810 pmap_mapbios(vm_paddr_t pa, vm_size_t size) 5811 { 5812 return (pmap_mapdev_attr(pa, size, PAT_WRITE_BACK)); 5813 } 5814 5815 /* 5816 * Map a set of physical memory pages into the kernel virtual 5817 * address space. Return a pointer to where it is mapped. This 5818 * routine is intended to be used for mapping device memory, 5819 * NOT real memory. 5820 */ 5821 void * 5822 pmap_mapdev_attr(vm_paddr_t pa, vm_size_t size, int mode) 5823 { 5824 vm_offset_t va, tmpva, offset; 5825 pt_entry_t *pte; 5826 vm_size_t tmpsize; 5827 5828 offset = pa & PAGE_MASK; 5829 size = roundup(offset + size, PAGE_SIZE); 5830 5831 va = kmem_alloc_nofault(&kernel_map, size, VM_SUBSYS_MAPDEV, PAGE_SIZE); 5832 if (va == 0) 5833 panic("pmap_mapdev: Couldn't alloc kernel virtual memory"); 5834 5835 pa = pa & ~PAGE_MASK; 5836 for (tmpva = va, tmpsize = size; tmpsize > 0;) { 5837 pte = vtopte(tmpva); 5838 *pte = pa | 5839 kernel_pmap.pmap_bits[PG_RW_IDX] | 5840 kernel_pmap.pmap_bits[PG_V_IDX] | /* pgeflag | */ 5841 kernel_pmap.pmap_cache_bits_pte[mode]; 5842 tmpsize -= PAGE_SIZE; 5843 tmpva += PAGE_SIZE; 5844 pa += PAGE_SIZE; 5845 } 5846 pmap_invalidate_range(&kernel_pmap, va, va + size); 5847 pmap_invalidate_cache_range(va, va + size); 5848 5849 return ((void *)(va + offset)); 5850 } 5851 5852 void 5853 pmap_unmapdev(vm_offset_t va, vm_size_t size) 5854 { 5855 vm_offset_t base, offset; 5856 5857 base = va & ~PAGE_MASK; 5858 offset = va & PAGE_MASK; 5859 size = roundup(offset + size, PAGE_SIZE); 5860 pmap_qremove(va, size >> PAGE_SHIFT); 5861 kmem_free(&kernel_map, base, size); 5862 } 5863 5864 /* 5865 * Sets the memory attribute for the specified page. 5866 */ 5867 void 5868 pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma) 5869 { 5870 5871 m->pat_mode = ma; 5872 5873 /* 5874 * If "m" is a normal page, update its direct mapping. This update 5875 * can be relied upon to perform any cache operations that are 5876 * required for data coherence. 5877 */ 5878 if ((m->flags & PG_FICTITIOUS) == 0) 5879 pmap_change_attr(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)), 1, m->pat_mode); 5880 } 5881 5882 /* 5883 * Change the PAT attribute on an existing kernel memory map. Caller 5884 * must ensure that the virtual memory in question is not accessed 5885 * during the adjustment. 5886 * 5887 * If the va is within the DMAP we cannot use vtopte() because the DMAP 5888 * utilizes 2MB or 1GB pages. 2MB is forced atm so calculate the pd_entry 5889 * pointer based on that. 
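 *
 * Worked example (alignment assumed): changing the attribute of a
 * 2MB-aligned 4MB DMAP range rewrites two PDEs; each loop iteration
 * below consumes NBPDR / PAGE_SIZE (512) pages worth of 'count'.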
5890 */ 5891 void 5892 pmap_change_attr(vm_offset_t va, vm_size_t count, int mode) 5893 { 5894 pt_entry_t *pte; 5895 vm_offset_t base; 5896 int changed = 0; 5897 5898 if (va == 0) 5899 panic("pmap_change_attr: va is NULL"); 5900 base = trunc_page(va); 5901 5902 if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) { 5903 pd_entry_t *pd; 5904 5905 KKASSERT(va < DMapMaxAddress); 5906 pd = (pd_entry_t *)PHYS_TO_DMAP(DMPDphys); 5907 pd += (va - DMAP_MIN_ADDRESS) >> PDRSHIFT; 5908 5909 while ((long)count > 0) { 5910 *pd = 5911 (*pd & ~(pd_entry_t)(kernel_pmap.pmap_cache_mask_pde)) | 5912 kernel_pmap.pmap_cache_bits_pde[mode]; 5913 count -= NBPDR / PAGE_SIZE; 5914 va += NBPDR; 5915 ++pd; 5916 } 5917 } else { 5918 while (count) { 5919 pte = vtopte(va); 5920 *pte = 5921 (*pte & ~(pt_entry_t)(kernel_pmap.pmap_cache_mask_pte)) | 5922 kernel_pmap.pmap_cache_bits_pte[mode]; 5923 --count; 5924 va += PAGE_SIZE; 5925 } 5926 } 5927 5928 changed = 1; /* XXX: not optimal */ 5929 5930 /* 5931 * Flush CPU caches if required to make sure any data isn't cached that 5932 * shouldn't be, etc. 5933 */ 5934 if (changed) { 5935 pmap_invalidate_range(&kernel_pmap, base, va); 5936 pmap_invalidate_cache_range(base, va); 5937 } 5938 } 5939 5940 /* 5941 * perform the pmap work for mincore 5942 */ 5943 int 5944 pmap_mincore(pmap_t pmap, vm_offset_t addr) 5945 { 5946 pt_entry_t *ptep, pte; 5947 vm_page_t m; 5948 int val = 0; 5949 5950 ptep = pmap_pte(pmap, addr); 5951 5952 if (ptep && (pte = *ptep) != 0) { 5953 vm_offset_t pa; 5954 5955 val = MINCORE_INCORE; 5956 pa = pte & PG_FRAME; 5957 if (pte & pmap->pmap_bits[PG_MANAGED_IDX]) 5958 m = PHYS_TO_VM_PAGE(pa); 5959 else 5960 m = NULL; 5961 5962 /* 5963 * Modified by us 5964 */ 5965 if (pte & pmap->pmap_bits[PG_M_IDX]) 5966 val |= MINCORE_MODIFIED|MINCORE_MODIFIED_OTHER; 5967 5968 /* 5969 * Modified by someone 5970 */ 5971 else if (m && (m->dirty || pmap_is_modified(m))) 5972 val |= MINCORE_MODIFIED_OTHER; 5973 5974 /* 5975 * Referenced by us, or someone else. 5976 */ 5977 if (pte & pmap->pmap_bits[PG_A_IDX]) { 5978 val |= MINCORE_REFERENCED|MINCORE_REFERENCED_OTHER; 5979 } else if (m && ((m->flags & PG_REFERENCED) || 5980 pmap_ts_referenced(m))) { 5981 val |= MINCORE_REFERENCED_OTHER; 5982 vm_page_flag_set(m, PG_REFERENCED); 5983 } 5984 } 5985 return val; 5986 } 5987 5988 /* 5989 * Replace p->p_vmspace with a new one. If adjrefs is non-zero the new 5990 * vmspace will be ref'd and the old one will be deref'd. 5991 * 5992 * The vmspace for all lwps associated with the process will be adjusted 5993 * and cr3 will be reloaded if any lwp is the current lwp. 5994 * 5995 * The process must hold the vmspace->vm_map.token for oldvm and newvm 5996 */ 5997 void 5998 pmap_replacevm(struct proc *p, struct vmspace *newvm, int adjrefs) 5999 { 6000 struct vmspace *oldvm; 6001 struct lwp *lp; 6002 6003 oldvm = p->p_vmspace; 6004 if (oldvm != newvm) { 6005 if (adjrefs) 6006 vmspace_ref(newvm); 6007 p->p_vmspace = newvm; 6008 KKASSERT(p->p_nthreads == 1); 6009 lp = RB_ROOT(&p->p_lwp_tree); 6010 pmap_setlwpvm(lp, newvm); 6011 if (adjrefs) 6012 vmspace_rel(oldvm); 6013 } 6014 } 6015 6016 /* 6017 * Set the vmspace for a LWP. The vmspace is almost universally set the 6018 * same as the process vmspace, but virtual kernels need to swap out contexts 6019 * on a per-lwp basis. 6020 * 6021 * Caller does not necessarily hold any vmspace tokens. Caller must control 6022 * the lwp (typically be in the context of the lwp). 
We use a critical 6023 * section to protect against statclock and hardclock (statistics collection). 6024 */ 6025 void 6026 pmap_setlwpvm(struct lwp *lp, struct vmspace *newvm) 6027 { 6028 struct vmspace *oldvm; 6029 struct pmap *pmap; 6030 thread_t td; 6031 6032 oldvm = lp->lwp_vmspace; 6033 6034 if (oldvm != newvm) { 6035 crit_enter(); 6036 td = curthread; 6037 KKASSERT((newvm->vm_refcnt & VM_REF_DELETED) == 0); 6038 lp->lwp_vmspace = newvm; 6039 if (td->td_lwp == lp) { 6040 pmap = vmspace_pmap(newvm); 6041 ATOMIC_CPUMASK_ORBIT(pmap->pm_active, mycpu->gd_cpuid); 6042 if (pmap->pm_active_lock & CPULOCK_EXCL) 6043 pmap_interlock_wait(newvm); 6044 #if defined(SWTCH_OPTIM_STATS) 6045 tlb_flush_count++; 6046 #endif 6047 if (pmap->pmap_bits[TYPE_IDX] == REGULAR_PMAP) { 6048 td->td_pcb->pcb_cr3 = vtophys(pmap->pm_pml4); 6049 if (meltdown_mitigation && pmap->pm_pmlpv_iso) { 6050 td->td_pcb->pcb_cr3_iso = 6051 vtophys(pmap->pm_pml4_iso); 6052 td->td_pcb->pcb_flags |= PCB_ISOMMU; 6053 } else { 6054 td->td_pcb->pcb_cr3_iso = 0; 6055 td->td_pcb->pcb_flags &= ~PCB_ISOMMU; 6056 } 6057 } else if (pmap->pmap_bits[TYPE_IDX] == EPT_PMAP) { 6058 td->td_pcb->pcb_cr3 = KPML4phys; 6059 td->td_pcb->pcb_cr3_iso = 0; 6060 td->td_pcb->pcb_flags &= ~PCB_ISOMMU; 6061 } else { 6062 panic("pmap_setlwpvm: unknown pmap type\n"); 6063 } 6064 6065 /* 6066 * The MMU separation fields needs to be updated. 6067 * (it can't access the pcb directly from the 6068 * restricted user pmap). 6069 */ 6070 { 6071 struct trampframe *tramp; 6072 6073 tramp = &pscpu->trampoline; 6074 tramp->tr_pcb_cr3 = td->td_pcb->pcb_cr3; 6075 tramp->tr_pcb_cr3_iso = td->td_pcb->pcb_cr3_iso; 6076 tramp->tr_pcb_flags = td->td_pcb->pcb_flags; 6077 tramp->tr_pcb_rsp = (register_t)td->td_pcb; 6078 /* tr_pcb_rsp doesn't change */ 6079 } 6080 6081 /* 6082 * In kernel-land we always use the normal PML4E 6083 * so the kernel is fully mapped and can also access 6084 * user memory. 6085 */ 6086 load_cr3(td->td_pcb->pcb_cr3); 6087 pmap = vmspace_pmap(oldvm); 6088 ATOMIC_CPUMASK_NANDBIT(pmap->pm_active, 6089 mycpu->gd_cpuid); 6090 } 6091 crit_exit(); 6092 } 6093 } 6094 6095 /* 6096 * Called when switching to a locked pmap, used to interlock against pmaps 6097 * undergoing modifications to prevent us from activating the MMU for the 6098 * target pmap until all such modifications have completed. We have to do 6099 * this because the thread making the modifications has already set up its 6100 * SMP synchronization mask. 6101 * 6102 * This function cannot sleep! 6103 * 6104 * No requirements. 
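 *
 * The expected call pattern is the one used by pmap_setlwpvm() above:
 * mark the pmap active on this cpu first, then interlock against any
 * exclusive-locked modifications before loading %cr3:
 *
 *	ATOMIC_CPUMASK_ORBIT(pmap->pm_active, mycpu->gd_cpuid);
 *	if (pmap->pm_active_lock & CPULOCK_EXCL)
 *		pmap_interlock_wait(newvm);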
/*
 * Called when switching to a locked pmap, used to interlock against pmaps
 * undergoing modifications to prevent us from activating the MMU for the
 * target pmap until all such modifications have completed.  We have to do
 * this because the thread making the modifications has already set up its
 * SMP synchronization mask.
 *
 * This function cannot sleep!
 *
 * No requirements.
 */
void
pmap_interlock_wait(struct vmspace *vm)
{
	struct pmap *pmap = &vm->vm_pmap;

	if (pmap->pm_active_lock & CPULOCK_EXCL) {
		crit_enter();
		KKASSERT(curthread->td_critcount >= 2);
		DEBUG_PUSH_INFO("pmap_interlock_wait");
		while (pmap->pm_active_lock & CPULOCK_EXCL) {
			cpu_ccfence();
			lwkt_process_ipiq();
		}
		DEBUG_POP_INFO();
		crit_exit();
	}
}

vm_offset_t
pmap_addr_hint(vm_object_t obj, vm_offset_t addr, vm_size_t size)
{
	if ((obj == NULL) || (size < NBPDR) ||
	    ((obj->type != OBJT_DEVICE) && (obj->type != OBJT_MGTDEVICE))) {
		return addr;
	}

	addr = roundup2(addr, NBPDR);
	return addr;
}

/*
 * Used by kmalloc/kfree, page already exists at va
 */
vm_page_t
pmap_kvtom(vm_offset_t va)
{
	pt_entry_t *ptep = vtopte(va);

	return(PHYS_TO_VM_PAGE(*ptep & PG_FRAME));
}

/*
 * Initialize machine-specific shared page directory support.  This
 * is executed when a VM object is created.
 */
void
pmap_object_init(vm_object_t object)
{
}

/*
 * Clean up machine-specific shared page directory support.  This
 * is executed when a VM object is destroyed.
 */
void
pmap_object_free(vm_object_t object)
{
}

/*
 * pmap_pgscan_callback - Used by pmap_pgscan to acquire the related
 * VM page and issue a pginfo->callback.
 */
static
void
pmap_pgscan_callback(pmap_t pmap, struct pmap_scan_info *info,
		     vm_pindex_t *pte_placemark,
		     pv_entry_t pt_pv, vm_offset_t va,
		     pt_entry_t *ptep, void *arg)
{
	struct pmap_pgscan_info *pginfo = arg;
	vm_page_t m;
	pt_entry_t pte;

	pte = *ptep;
	cpu_ccfence();

	if (pte & pmap->pmap_bits[PG_MANAGED_IDX]) {
		/*
		 * Try to busy the page while we hold the pte_placemark locked.
		 */
		m = PHYS_TO_VM_PAGE(*ptep & PG_FRAME);
		if (vm_page_busy_try(m, TRUE) == 0) {
			if (m == PHYS_TO_VM_PAGE(*ptep & PG_FRAME)) {
				/*
				 * The callback is issued with the pt_pv
				 * unlocked.
				 */
				pv_placemarker_wakeup(pmap, pte_placemark);
				if (pt_pv) {
					vm_page_wire_quick(pt_pv->pv_m);
					pv_unlock(pt_pv);
				}
				if (pginfo->callback(pginfo, va, m) < 0)
					info->stop = 1;
				if (pt_pv) {
					pv_lock(pt_pv);
					if (vm_page_unwire_quick(pt_pv->pv_m)) {
						panic("pmap_pgscan: bad wire_"
						      "count on pt_pv");
					}
				}
			} else {
				vm_page_wakeup(m);
				pv_placemarker_wakeup(pmap, pte_placemark);
			}
		} else {
			++pginfo->busycount;
			pv_placemarker_wakeup(pmap, pte_placemark);
		}
	} else {
		/*
		 * Shared page table or unmanaged page (sharept or !sharept)
		 */
		pv_placemarker_wakeup(pmap, pte_placemark);
	}
}

void
pmap_pgscan(struct pmap_pgscan_info *pginfo)
{
	struct pmap_scan_info info;

	pginfo->offset = pginfo->beg_addr;
	info.pmap = pginfo->pmap;
	info.sva = pginfo->beg_addr;
	info.eva = pginfo->end_addr;
	info.func = pmap_pgscan_callback;
	info.arg = pginfo;
	pmap_scan(&info, 0);
	if (info.stop == 0)
		pginfo->offset = pginfo->end_addr;
}
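/*
 * Illustrative sketch only: a hypothetical caller of pmap_pgscan() above.
 * example_pgscan_cb() and example_pgscan() are invented for this sketch;
 * the struct pmap_pgscan_info fields used (pmap, beg_addr, end_addr,
 * offset, busycount, callback) are the ones referenced by pmap_pgscan()
 * and pmap_pgscan_callback(), and the callback signature is inferred from
 * the pginfo->callback(pginfo, va, m) call above.  Guarded by "#if 0" so
 * it is never compiled.
 */
#if 0
static int
example_pgscan_cb(struct pmap_pgscan_info *pginfo, vm_offset_t va, vm_page_t m)
{
	/*
	 * The page arrives busied by pmap_pgscan_callback() (its success
	 * path does not wake the page itself); inspect it, then wake it
	 * up.  Return a negative value to abort the scan.
	 */
	vm_page_wakeup(m);
	return 0;
}

static void
example_pgscan(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
{
	struct pmap_pgscan_info pginfo;

	bzero(&pginfo, sizeof(pginfo));
	pginfo.pmap = pmap;
	pginfo.beg_addr = sva;
	pginfo.end_addr = eva;
	pginfo.callback = example_pgscan_cb;

	pmap_pgscan(&pginfo);
	if (pginfo.offset != eva) {
		/* the callback aborted the scan before end_addr */
	}
	if (pginfo.busycount) {
		/* some managed pages could not be busied; caller may retry */
	}
}
#endif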
/*
 * Wait for a placemarker that we do not own to clear.  The placemarker
 * in question is not necessarily set to the pindex we want; we may have
 * to wait on the element because we want to reserve it ourselves.
 *
 * NOTE: PM_PLACEMARK_WAKEUP sets a bit which is already set in
 *	 PM_NOPLACEMARK, so it does not interfere with placemarks
 *	 which have already been woken up.
 *
 * NOTE: This routine is called without the pmap spin-lock and so can
 *	 race changes to *pmark.  Due to the sensitivity of the routine
 *	 to possible MULTIPLE interactions from other cpus, and the
 *	 overloading of the WAKEUP bit on PM_NOPLACEMARK, we have to
 *	 use a cmpset loop to avoid a race that might cause the WAKEUP
 *	 bit to be lost.
 *
 * Caller is expected to retry its operation upon return.
 */
static
void
pv_placemarker_wait(pmap_t pmap, vm_pindex_t *pmark)
{
	vm_pindex_t mark;

	mark = *pmark;
	cpu_ccfence();
	while (mark != PM_NOPLACEMARK) {
		tsleep_interlock(pmark, 0);
		if (atomic_fcmpset_long(pmark, &mark,
					mark | PM_PLACEMARK_WAKEUP)) {
			tsleep(pmark, PINTERLOCKED, "pvplw", 0);
			break;
		}
	}
}

/*
 * Wakeup a placemarker that we own.  Replace the entry with
 * PM_NOPLACEMARK and issue a wakeup() if necessary.
 */
static
void
pv_placemarker_wakeup(pmap_t pmap, vm_pindex_t *pmark)
{
	vm_pindex_t pindex;

	pindex = atomic_swap_long(pmark, PM_NOPLACEMARK);
	KKASSERT(pindex != PM_NOPLACEMARK);
	if (pindex & PM_PLACEMARK_WAKEUP)
		wakeup(pmark);
}
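/*
 * Descriptive note on the placemarker handoff implemented by the two
 * routines above (no new code): a waiter that finds *pmark occupied
 * latches PM_PLACEMARK_WAKEUP with a cmpset while interlocked on tsleep;
 * the owner later swaps in PM_NOPLACEMARK and, having observed the WAKEUP
 * bit in the old value, issues the wakeup().  Because PM_NOPLACEMARK
 * already includes the WAKEUP bit, a late waiter that loads PM_NOPLACEMARK
 * simply falls out of its loop without sleeping, and in all cases the
 * caller retries its reservation.
 */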