1 /* 2 * Copyright (c) 1991 Regents of the University of California. 3 * Copyright (c) 1994 John S. Dyson 4 * Copyright (c) 1994 David Greenman 5 * Copyright (c) 2003 Peter Wemm 6 * Copyright (c) 2005-2008 Alan L. Cox <alc@cs.rice.edu> 7 * Copyright (c) 2008, 2009 The DragonFly Project. 8 * Copyright (c) 2008, 2009 Jordan Gordeev. 9 * All rights reserved. 10 * 11 * This code is derived from software contributed to Berkeley by 12 * the Systems Programming Group of the University of Utah Computer 13 * Science Department and William Jolitz of UUNET Technologies Inc. 14 * 15 * Redistribution and use in source and binary forms, with or without 16 * modification, are permitted provided that the following conditions 17 * are met: 18 * 1. Redistributions of source code must retain the above copyright 19 * notice, this list of conditions and the following disclaimer. 20 * 2. Redistributions in binary form must reproduce the above copyright 21 * notice, this list of conditions and the following disclaimer in the 22 * documentation and/or other materials provided with the distribution. 23 * 3. All advertising materials mentioning features or use of this software 24 * must display the following acknowledgement: 25 * This product includes software developed by the University of 26 * California, Berkeley and its contributors. 27 * 4. Neither the name of the University nor the names of its contributors 28 * may be used to endorse or promote products derived from this software 29 * without specific prior written permission. 30 * 31 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 32 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 33 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 34 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 35 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 36 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 37 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 38 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 39 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 40 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 41 * SUCH DAMAGE. 42 * 43 * from: @(#)pmap.c 7.7 (Berkeley) 5/12/91 44 * $FreeBSD: src/sys/i386/i386/pmap.c,v 1.250.2.18 2002/03/06 22:48:53 silby Exp $ 45 */ 46 47 /* 48 * Manages physical address maps. 49 * 50 * In addition to hardware address maps, this 51 * module is called upon to provide software-use-only 52 * maps which may or may not be stored in the same 53 * form as hardware maps. These pseudo-maps are 54 * used to store intermediate results from copy 55 * operations to and from address spaces. 56 * 57 * Since the information managed by this module is 58 * also stored by the logical address mapping module, 59 * this module may throw away valid virtual-to-physical 60 * mappings at almost any time. However, invalidations 61 * of virtual-to-physical mappings must be done as 62 * requested. 63 * 64 * In order to cope with hardware architectures which 65 * make virtual-to-physical map invalidates expensive, 66 * this module may delay invalidate or reduced protection 67 * operations until such time as they are actually 68 * necessary. This module is given full information as 69 * to which processors are currently using which maps, 70 * and to when physical maps must be made correct. 
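 *
 * In this file that coordination is expressed through the
 * pmap_inval_*() helpers (pmap_inval_init(), pmap_inval_interlock(),
 * pmap_inval_deinterlock() and pmap_inval_done()) which bracket the
 * page table updates performed by the mapping and removal routines
 * below.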
71 */ 72 73 #if JG 74 #include "opt_disable_pse.h" 75 #include "opt_pmap.h" 76 #endif 77 #include "opt_msgbuf.h" 78 79 #include <sys/param.h> 80 #include <sys/systm.h> 81 #include <sys/kernel.h> 82 #include <sys/proc.h> 83 #include <sys/msgbuf.h> 84 #include <sys/vmmeter.h> 85 #include <sys/mman.h> 86 87 #include <vm/vm.h> 88 #include <vm/vm_param.h> 89 #include <sys/sysctl.h> 90 #include <sys/lock.h> 91 #include <vm/vm_kern.h> 92 #include <vm/vm_page.h> 93 #include <vm/vm_map.h> 94 #include <vm/vm_object.h> 95 #include <vm/vm_extern.h> 96 #include <vm/vm_pageout.h> 97 #include <vm/vm_pager.h> 98 #include <vm/vm_zone.h> 99 100 #include <sys/user.h> 101 #include <sys/thread2.h> 102 #include <sys/sysref2.h> 103 104 #include <machine/cputypes.h> 105 #include <machine/md_var.h> 106 #include <machine/specialreg.h> 107 #include <machine/smp.h> 108 #include <machine_base/apic/apicreg.h> 109 #include <machine/globaldata.h> 110 #include <machine/pmap.h> 111 #include <machine/pmap_inval.h> 112 113 #include <ddb/ddb.h> 114 115 #define PMAP_KEEP_PDIRS 116 #ifndef PMAP_SHPGPERPROC 117 #define PMAP_SHPGPERPROC 200 118 #endif 119 120 #if defined(DIAGNOSTIC) 121 #define PMAP_DIAGNOSTIC 122 #endif 123 124 #define MINPV 2048 125 126 /* 127 * Get PDEs and PTEs for user/kernel address space 128 */ 129 static pd_entry_t *pmap_pde(pmap_t pmap, vm_offset_t va); 130 #define pdir_pde(m, v) (m[(vm_offset_t)(v) >> PDRSHIFT]) 131 132 #define pmap_pde_v(pte) ((*(pd_entry_t *)pte & PG_V) != 0) 133 #define pmap_pte_w(pte) ((*(pt_entry_t *)pte & PG_W) != 0) 134 #define pmap_pte_m(pte) ((*(pt_entry_t *)pte & PG_M) != 0) 135 #define pmap_pte_u(pte) ((*(pt_entry_t *)pte & PG_A) != 0) 136 #define pmap_pte_v(pte) ((*(pt_entry_t *)pte & PG_V) != 0) 137 138 139 /* 140 * Given a map and a machine independent protection code, 141 * convert to a vax protection code. 142 */ 143 #define pte_prot(m, p) \ 144 (protection_codes[p & (VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE)]) 145 static int protection_codes[8]; 146 147 struct pmap kernel_pmap; 148 static TAILQ_HEAD(,pmap) pmap_list = TAILQ_HEAD_INITIALIZER(pmap_list); 149 150 vm_paddr_t avail_start; /* PA of first available physical page */ 151 vm_paddr_t avail_end; /* PA of last available physical page */ 152 vm_offset_t virtual2_start; /* cutout free area prior to kernel start */ 153 vm_offset_t virtual2_end; 154 vm_offset_t virtual_start; /* VA of first avail page (after kernel bss) */ 155 vm_offset_t virtual_end; /* VA of last avail page (end of kernel AS) */ 156 vm_offset_t KvaStart; /* VA start of KVA space */ 157 vm_offset_t KvaEnd; /* VA end of KVA space (non-inclusive) */ 158 vm_offset_t KvaSize; /* max size of kernel virtual address space */ 159 static boolean_t pmap_initialized = FALSE; /* Has pmap_init completed? 
 */
static int pgeflag;		/* PG_G or-in */
static int pseflag;		/* PG_PS or-in */

static vm_object_t kptobj;

static int ndmpdp;
static vm_paddr_t dmaplimit;
static int nkpt;
vm_offset_t kernel_vm_end = VM_MIN_KERNEL_ADDRESS;

static uint64_t KPTbase;
static uint64_t KPTphys;
static uint64_t KPDphys;	/* phys addr of kernel level 2 */
static uint64_t KPDbase;	/* phys addr of kernel level 2 @ KERNBASE */
uint64_t KPDPphys;		/* phys addr of kernel level 3 */
uint64_t KPML4phys;		/* phys addr of kernel level 4 */

static uint64_t DMPDphys;	/* phys addr of direct mapped level 2 */
static uint64_t DMPDPphys;	/* phys addr of direct mapped level 3 */

/*
 * Data for the pv entry allocation mechanism
 */
static vm_zone_t pvzone;
static struct vm_zone pvzone_store;
static struct vm_object pvzone_obj;
static int pv_entry_count=0, pv_entry_max=0, pv_entry_high_water=0;
static int pmap_pagedaemon_waken = 0;
static struct pv_entry *pvinit;

/*
 * All those kernel PT submaps that BSD is so fond of
 */
pt_entry_t *CMAP1 = 0, *ptmmap;
caddr_t CADDR1 = 0, ptvmmap = 0;
static pt_entry_t *msgbufmap;
struct msgbuf *msgbufp=0;

/*
 * Crashdump maps.
 */
static pt_entry_t *pt_crashdumpmap;
static caddr_t crashdumpmap;

extern pt_entry_t *SMPpt;
extern uint64_t SMPptpa;

#define DISABLE_PSE

static pv_entry_t get_pv_entry (void);
static void i386_protection_init (void);
static void create_pagetables(vm_paddr_t *firstaddr);
static void pmap_remove_all (vm_page_t m);
static int pmap_remove_pte (struct pmap *pmap, pt_entry_t *ptq,
				vm_offset_t sva, pmap_inval_info_t info);
static void pmap_remove_page (struct pmap *pmap,
				vm_offset_t va, pmap_inval_info_t info);
static int pmap_remove_entry (struct pmap *pmap, vm_page_t m,
				vm_offset_t va, pmap_inval_info_t info);
static boolean_t pmap_testbit (vm_page_t m, int bit);
static void pmap_insert_entry (pmap_t pmap, vm_offset_t va,
				vm_page_t mpte, vm_page_t m);

static vm_page_t pmap_allocpte (pmap_t pmap, vm_offset_t va);

static int pmap_release_free_page (pmap_t pmap, vm_page_t p);
static vm_page_t _pmap_allocpte (pmap_t pmap, vm_pindex_t ptepindex);
static pt_entry_t * pmap_pte_quick (pmap_t pmap, vm_offset_t va);
static vm_page_t pmap_page_lookup (vm_object_t object, vm_pindex_t pindex);
static int _pmap_unwire_pte_hold(pmap_t pmap, vm_offset_t va, vm_page_t m,
				pmap_inval_info_t info);
static int pmap_unuse_pt (pmap_t, vm_offset_t, vm_page_t, pmap_inval_info_t);
static vm_offset_t pmap_kmem_choose(vm_offset_t addr);

static unsigned pdir4mb;

/*
 * Move the kernel virtual free pointer to the next
 * 2MB.  This is used to help improve performance
 * by using a large (2MB) page for much of the kernel
 * (.text, .data, .bss)
 */
static
vm_offset_t
pmap_kmem_choose(vm_offset_t addr)
{
	vm_offset_t newaddr = addr;

	newaddr = (addr + (NBPDR - 1)) & ~(NBPDR - 1);
	return newaddr;
}

/*
 * pmap_pte_quick:
 *
 *	Super fast pmap_pte routine best used when scanning the pv lists.
 *	This eliminates many coarse-grained invltlb calls.  Note that many of
 *	the pv list scans are across different pmaps and it is very wasteful
 *	to do an entire invltlb when checking a single mapping.
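 *
 *	On x86_64 this is simply pmap_pte(): the walk goes
 *	PML4 -> PDP -> PD -> PT through the DMAP-mapped page table
 *	pages, each level indexed by nine bits of the VA (bits 12-20
 *	select the PTE, 21-29 the PDE, 30-38 the PDPE, 39-47 the
 *	PML4E).  For example, va 0x0000000012345000 decomposes into
 *	PML4 index 0, PDP index 0, PD index 0x91 and PT index 0x145.
 *	No temporary mappings or invltlb are needed here.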
259 * 260 * Should only be called while in a critical section. 261 */ 262 static __inline pt_entry_t *pmap_pte(pmap_t pmap, vm_offset_t va); 263 264 static 265 pt_entry_t * 266 pmap_pte_quick(pmap_t pmap, vm_offset_t va) 267 { 268 return pmap_pte(pmap, va); 269 } 270 271 /* Return a non-clipped PD index for a given VA */ 272 static __inline 273 vm_pindex_t 274 pmap_pde_pindex(vm_offset_t va) 275 { 276 return va >> PDRSHIFT; 277 } 278 279 /* Return various clipped indexes for a given VA */ 280 static __inline 281 vm_pindex_t 282 pmap_pte_index(vm_offset_t va) 283 { 284 285 return ((va >> PAGE_SHIFT) & ((1ul << NPTEPGSHIFT) - 1)); 286 } 287 288 static __inline 289 vm_pindex_t 290 pmap_pde_index(vm_offset_t va) 291 { 292 293 return ((va >> PDRSHIFT) & ((1ul << NPDEPGSHIFT) - 1)); 294 } 295 296 static __inline 297 vm_pindex_t 298 pmap_pdpe_index(vm_offset_t va) 299 { 300 301 return ((va >> PDPSHIFT) & ((1ul << NPDPEPGSHIFT) - 1)); 302 } 303 304 static __inline 305 vm_pindex_t 306 pmap_pml4e_index(vm_offset_t va) 307 { 308 309 return ((va >> PML4SHIFT) & ((1ul << NPML4EPGSHIFT) - 1)); 310 } 311 312 /* Return a pointer to the PML4 slot that corresponds to a VA */ 313 static __inline 314 pml4_entry_t * 315 pmap_pml4e(pmap_t pmap, vm_offset_t va) 316 { 317 318 return (&pmap->pm_pml4[pmap_pml4e_index(va)]); 319 } 320 321 /* Return a pointer to the PDP slot that corresponds to a VA */ 322 static __inline 323 pdp_entry_t * 324 pmap_pml4e_to_pdpe(pml4_entry_t *pml4e, vm_offset_t va) 325 { 326 pdp_entry_t *pdpe; 327 328 pdpe = (pdp_entry_t *)PHYS_TO_DMAP(*pml4e & PG_FRAME); 329 return (&pdpe[pmap_pdpe_index(va)]); 330 } 331 332 /* Return a pointer to the PDP slot that corresponds to a VA */ 333 static __inline 334 pdp_entry_t * 335 pmap_pdpe(pmap_t pmap, vm_offset_t va) 336 { 337 pml4_entry_t *pml4e; 338 339 pml4e = pmap_pml4e(pmap, va); 340 if ((*pml4e & PG_V) == 0) 341 return NULL; 342 return (pmap_pml4e_to_pdpe(pml4e, va)); 343 } 344 345 /* Return a pointer to the PD slot that corresponds to a VA */ 346 static __inline 347 pd_entry_t * 348 pmap_pdpe_to_pde(pdp_entry_t *pdpe, vm_offset_t va) 349 { 350 pd_entry_t *pde; 351 352 pde = (pd_entry_t *)PHYS_TO_DMAP(*pdpe & PG_FRAME); 353 return (&pde[pmap_pde_index(va)]); 354 } 355 356 /* Return a pointer to the PD slot that corresponds to a VA */ 357 static __inline 358 pd_entry_t * 359 pmap_pde(pmap_t pmap, vm_offset_t va) 360 { 361 pdp_entry_t *pdpe; 362 363 pdpe = pmap_pdpe(pmap, va); 364 if (pdpe == NULL || (*pdpe & PG_V) == 0) 365 return NULL; 366 return (pmap_pdpe_to_pde(pdpe, va)); 367 } 368 369 /* Return a pointer to the PT slot that corresponds to a VA */ 370 static __inline 371 pt_entry_t * 372 pmap_pde_to_pte(pd_entry_t *pde, vm_offset_t va) 373 { 374 pt_entry_t *pte; 375 376 pte = (pt_entry_t *)PHYS_TO_DMAP(*pde & PG_FRAME); 377 return (&pte[pmap_pte_index(va)]); 378 } 379 380 /* Return a pointer to the PT slot that corresponds to a VA */ 381 static __inline 382 pt_entry_t * 383 pmap_pte(pmap_t pmap, vm_offset_t va) 384 { 385 pd_entry_t *pde; 386 387 pde = pmap_pde(pmap, va); 388 if (pde == NULL || (*pde & PG_V) == 0) 389 return NULL; 390 if ((*pde & PG_PS) != 0) /* compat with i386 pmap_pte() */ 391 return ((pt_entry_t *)pde); 392 return (pmap_pde_to_pte(pde, va)); 393 } 394 395 static __inline 396 pt_entry_t * 397 vtopte(vm_offset_t va) 398 { 399 uint64_t mask = ((1ul << (NPTEPGSHIFT + NPDEPGSHIFT + NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1); 400 401 return (PTmap + ((va >> PAGE_SHIFT) & mask)); 402 } 403 404 static __inline 405 pd_entry_t * 406 
vtopde(vm_offset_t va) 407 { 408 uint64_t mask = ((1ul << (NPDEPGSHIFT + NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1); 409 410 return (PDmap + ((va >> PDRSHIFT) & mask)); 411 } 412 413 static uint64_t 414 allocpages(vm_paddr_t *firstaddr, int n) 415 { 416 uint64_t ret; 417 418 ret = *firstaddr; 419 bzero((void *)ret, n * PAGE_SIZE); 420 *firstaddr += n * PAGE_SIZE; 421 return (ret); 422 } 423 424 static 425 void 426 create_pagetables(vm_paddr_t *firstaddr) 427 { 428 int i; 429 430 /* we are running (mostly) V=P at this point */ 431 432 /* Allocate pages */ 433 KPTbase = allocpages(firstaddr, NKPT); 434 KPTphys = allocpages(firstaddr, NKPT); 435 KPML4phys = allocpages(firstaddr, 1); 436 KPDPphys = allocpages(firstaddr, NKPML4E); 437 438 /* 439 * Calculate the page directory base for KERNBASE, 440 * that is where we start populating the page table pages. 441 * Basically this is the end - 2. 442 */ 443 KPDphys = allocpages(firstaddr, NKPDPE); 444 KPDbase = KPDphys + ((NKPDPE - (NPDPEPG - KPDPI)) << PAGE_SHIFT); 445 446 ndmpdp = (ptoa(Maxmem) + NBPDP - 1) >> PDPSHIFT; 447 if (ndmpdp < 4) /* Minimum 4GB of dirmap */ 448 ndmpdp = 4; 449 DMPDPphys = allocpages(firstaddr, NDMPML4E); 450 if ((amd_feature & AMDID_PAGE1GB) == 0) 451 DMPDphys = allocpages(firstaddr, ndmpdp); 452 dmaplimit = (vm_paddr_t)ndmpdp << PDPSHIFT; 453 454 /* 455 * Fill in the underlying page table pages for the area around 456 * KERNBASE. This remaps low physical memory to KERNBASE. 457 * 458 * Read-only from zero to physfree 459 * XXX not fully used, underneath 2M pages 460 */ 461 for (i = 0; (i << PAGE_SHIFT) < *firstaddr; i++) { 462 ((pt_entry_t *)KPTbase)[i] = i << PAGE_SHIFT; 463 ((pt_entry_t *)KPTbase)[i] |= PG_RW | PG_V | PG_G; 464 } 465 466 /* 467 * Now map the initial kernel page tables. One block of page 468 * tables is placed at the beginning of kernel virtual memory, 469 * and another block is placed at KERNBASE to map the kernel binary, 470 * data, bss, and initial pre-allocations. 471 */ 472 for (i = 0; i < NKPT; i++) { 473 ((pd_entry_t *)KPDbase)[i] = KPTbase + (i << PAGE_SHIFT); 474 ((pd_entry_t *)KPDbase)[i] |= PG_RW | PG_V; 475 } 476 for (i = 0; i < NKPT; i++) { 477 ((pd_entry_t *)KPDphys)[i] = KPTphys + (i << PAGE_SHIFT); 478 ((pd_entry_t *)KPDphys)[i] |= PG_RW | PG_V; 479 } 480 481 /* 482 * Map from zero to end of allocations using 2M pages as an 483 * optimization. This will bypass some of the KPTBase pages 484 * above in the KERNBASE area. 485 */ 486 for (i = 0; (i << PDRSHIFT) < *firstaddr; i++) { 487 ((pd_entry_t *)KPDbase)[i] = i << PDRSHIFT; 488 ((pd_entry_t *)KPDbase)[i] |= PG_RW | PG_V | PG_PS | PG_G; 489 } 490 491 /* 492 * And connect up the PD to the PDP. The kernel pmap is expected 493 * to pre-populate all of its PDs. See NKPDPE in vmparam.h. 
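 *
 * Note that the loop below fills the *last* NKPDPE slots of the
 * PDP page (index NPDPEPG - NKPDPE + i); this matches the KPDbase
 * calculation above, which likewise counts back from the end of
 * the KPDphys block (the "end - 2" note).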
494 */ 495 for (i = 0; i < NKPDPE; i++) { 496 ((pdp_entry_t *)KPDPphys)[NPDPEPG - NKPDPE + i] = 497 KPDphys + (i << PAGE_SHIFT); 498 ((pdp_entry_t *)KPDPphys)[NPDPEPG - NKPDPE + i] |= 499 PG_RW | PG_V | PG_U; 500 } 501 502 /* Now set up the direct map space using either 2MB or 1GB pages */ 503 /* Preset PG_M and PG_A because demotion expects it */ 504 if ((amd_feature & AMDID_PAGE1GB) == 0) { 505 for (i = 0; i < NPDEPG * ndmpdp; i++) { 506 ((pd_entry_t *)DMPDphys)[i] = (vm_paddr_t)i << PDRSHIFT; 507 ((pd_entry_t *)DMPDphys)[i] |= PG_RW | PG_V | PG_PS | 508 PG_G | PG_M | PG_A; 509 } 510 /* And the direct map space's PDP */ 511 for (i = 0; i < ndmpdp; i++) { 512 ((pdp_entry_t *)DMPDPphys)[i] = DMPDphys + 513 (i << PAGE_SHIFT); 514 ((pdp_entry_t *)DMPDPphys)[i] |= PG_RW | PG_V | PG_U; 515 } 516 } else { 517 for (i = 0; i < ndmpdp; i++) { 518 ((pdp_entry_t *)DMPDPphys)[i] = 519 (vm_paddr_t)i << PDPSHIFT; 520 ((pdp_entry_t *)DMPDPphys)[i] |= PG_RW | PG_V | PG_PS | 521 PG_G | PG_M | PG_A; 522 } 523 } 524 525 /* And recursively map PML4 to itself in order to get PTmap */ 526 ((pdp_entry_t *)KPML4phys)[PML4PML4I] = KPML4phys; 527 ((pdp_entry_t *)KPML4phys)[PML4PML4I] |= PG_RW | PG_V | PG_U; 528 529 /* Connect the Direct Map slot up to the PML4 */ 530 ((pdp_entry_t *)KPML4phys)[DMPML4I] = DMPDPphys; 531 ((pdp_entry_t *)KPML4phys)[DMPML4I] |= PG_RW | PG_V | PG_U; 532 533 /* Connect the KVA slot up to the PML4 */ 534 ((pdp_entry_t *)KPML4phys)[KPML4I] = KPDPphys; 535 ((pdp_entry_t *)KPML4phys)[KPML4I] |= PG_RW | PG_V | PG_U; 536 } 537 538 void 539 init_paging(vm_paddr_t *firstaddr) 540 { 541 create_pagetables(firstaddr); 542 } 543 544 /* 545 * Bootstrap the system enough to run with virtual memory. 546 * 547 * On the i386 this is called after mapping has already been enabled 548 * and just syncs the pmap module with what has already been done. 549 * [We can't call it easily with mapping off since the kernel is not 550 * mapped with PA == VA, hence we would have to relocate every address 551 * from the linked base (virtual) address "KERNBASE" to the actual 552 * (physical) address starting relative to 0] 553 */ 554 void 555 pmap_bootstrap(vm_paddr_t *firstaddr) 556 { 557 vm_offset_t va; 558 pt_entry_t *pte; 559 struct mdglobaldata *gd; 560 int pg; 561 562 KvaStart = VM_MIN_KERNEL_ADDRESS; 563 KvaEnd = VM_MAX_KERNEL_ADDRESS; 564 KvaSize = KvaEnd - KvaStart; 565 566 avail_start = *firstaddr; 567 568 /* 569 * Create an initial set of page tables to run the kernel in. 570 */ 571 create_pagetables(firstaddr); 572 573 virtual2_start = KvaStart; 574 virtual2_end = PTOV_OFFSET; 575 576 virtual_start = (vm_offset_t) PTOV_OFFSET + *firstaddr; 577 virtual_start = pmap_kmem_choose(virtual_start); 578 579 virtual_end = VM_MAX_KERNEL_ADDRESS; 580 581 /* XXX do %cr0 as well */ 582 load_cr4(rcr4() | CR4_PGE | CR4_PSE); 583 load_cr3(KPML4phys); 584 585 /* 586 * Initialize protection array. 587 */ 588 i386_protection_init(); 589 590 /* 591 * The kernel's pmap is statically allocated so we don't have to use 592 * pmap_create, which is unlikely to work correctly at this part of 593 * the boot sequence (XXX and which no longer exists). 594 */ 595 kernel_pmap.pm_pml4 = (pdp_entry_t *) (PTOV_OFFSET + KPML4phys); 596 kernel_pmap.pm_count = 1; 597 kernel_pmap.pm_active = (cpumask_t)-1 & ~CPUMASK_LOCK; 598 TAILQ_INIT(&kernel_pmap.pm_pvlist); 599 nkpt = NKPT; 600 601 /* 602 * Reserve some special page table entries/VA space for temporary 603 * mapping of pages. 
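 *
 * The SYSMAP() macro below hands out such a reservation: it assigns
 * the current va to the caller's VA variable and the current pte
 * pointer to the caller's PTE variable, then advances both by n
 * pages.  E.g. SYSMAP(caddr_t, CMAP1, CADDR1, 1) leaves CADDR1
 * pointing at one reserved page of KVA and CMAP1 at the pte that
 * maps it.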
 */
#define	SYSMAP(c, p, v, n)	\
	v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n);

	va = virtual_start;
#ifdef JG
	pte = (pt_entry_t *) pmap_pte(&kernel_pmap, va);
#else
	pte = vtopte(va);
#endif

	/*
	 * CMAP1/CMAP2 are used for zeroing and copying pages.
	 */
	SYSMAP(caddr_t, CMAP1, CADDR1, 1)

	/*
	 * Crashdump maps.
	 */
	SYSMAP(caddr_t, pt_crashdumpmap, crashdumpmap, MAXDUMPPGS);

	/*
	 * ptvmmap is used for reading arbitrary physical pages via
	 * /dev/mem.
	 */
	SYSMAP(caddr_t, ptmmap, ptvmmap, 1)

	/*
	 * msgbufp is used to map the system message buffer.
	 * XXX msgbufmap is not used.
	 */
	SYSMAP(struct msgbuf *, msgbufmap, msgbufp,
	       atop(round_page(MSGBUF_SIZE)))

	virtual_start = va;

	*CMAP1 = 0;

	/*
	 * PG_G is terribly broken on SMP because we IPI invltlb's in some
	 * cases rather than invlpg.  Actually, I don't even know why it
	 * works under UP because self-referential page table mappings
	 */
#ifdef SMP
	pgeflag = 0;
#else
	if (cpu_feature & CPUID_PGE)
		pgeflag = PG_G;
#endif

	/*
	 * Initialize the 4MB page size flag
	 */
	pseflag = 0;
	/*
	 * The 4MB page version of the initial
	 * kernel page mapping.
	 */
	pdir4mb = 0;

#if !defined(DISABLE_PSE)
	if (cpu_feature & CPUID_PSE) {
		pt_entry_t ptditmp;
		/*
		 * Note that we have enabled PSE mode
		 */
		pseflag = PG_PS;
		ptditmp = *(PTmap + x86_64_btop(KERNBASE));
		ptditmp &= ~(NBPDR - 1);
		ptditmp |= PG_V | PG_RW | PG_PS | PG_U | pgeflag;
		pdir4mb = ptditmp;

#ifndef SMP
		/*
		 * Enable the PSE mode.  If we are SMP we can't do this
		 * now because the APs will not be able to use it when
		 * they boot up.
		 */
		load_cr4(rcr4() | CR4_PSE);

		/*
		 * We can do the mapping here for the single processor
		 * case.  We simply ignore the old page table page from
		 * now on.
		 */
		/*
		 * For SMP, we still need 4K pages to bootstrap APs,
		 * PSE will be enabled as soon as all APs are up.
		 */
		PTD[KPTDI] = (pd_entry_t)ptditmp;
		cpu_invltlb();
#endif
	}
#endif
#ifdef SMP
	if (cpu_apic_address == 0)
		panic("pmap_bootstrap: no local apic!");
#endif

	/*
	 * We need to finish setting up the globaldata page for the BSP.
	 * locore has already populated the page table for the mdglobaldata
	 * portion.
	 */
	pg = MDGLOBALDATA_BASEALLOC_PAGES;
	gd = &CPU_prvspace[0].mdglobaldata;
	gd->gd_CMAP1 = &SMPpt[pg + 0];
	gd->gd_CMAP2 = &SMPpt[pg + 1];
	gd->gd_CMAP3 = &SMPpt[pg + 2];
	gd->gd_PMAP1 = &SMPpt[pg + 3];
	gd->gd_CADDR1 = CPU_prvspace[0].CPAGE1;
	gd->gd_CADDR2 = CPU_prvspace[0].CPAGE2;
	gd->gd_CADDR3 = CPU_prvspace[0].CPAGE3;
	gd->gd_PADDR1 = (pt_entry_t *)CPU_prvspace[0].PPAGE1;

	cpu_invltlb();
}

#ifdef SMP
/*
 * Set 4mb pdir for mp startup
 */
void
pmap_set_opt(void)
{
	if (pseflag && (cpu_feature & CPUID_PSE)) {
		load_cr4(rcr4() | CR4_PSE);
		if (pdir4mb && mycpu->gd_cpuid == 0) {	/* only on BSP */
			cpu_invltlb();
		}
	}
}
#endif

/*
 * Initialize the pmap module.
 * Called by vm_init, to initialize any structures that the pmap
 * system needs to map virtual memory.
 * pmap_init has been enhanced to support, in a fairly consistent
 * way, discontiguous physical memory.
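 *
 * Concretely, the code below allocates kptobj (the object backing
 * kernel page table pages), initializes the per-page pv lists in
 * vm_page_array[], and bootstraps the pv_entry zone with at least
 * MINPV entries; pmap_init2() later sizes the zone for real and
 * sets the pv_entry high water mark.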
 */
void
pmap_init(void)
{
	int i;
	int initial_pvs;

	/*
	 * object for kernel page table pages
	 */
	/* JG I think the number can be arbitrary */
	kptobj = vm_object_allocate(OBJT_DEFAULT, 5);

	/*
	 * Allocate memory for random pmap data structures.  Includes the
	 * pv_head_table.
	 */

	for(i = 0; i < vm_page_array_size; i++) {
		vm_page_t m;

		m = &vm_page_array[i];
		TAILQ_INIT(&m->md.pv_list);
		m->md.pv_list_count = 0;
	}

	/*
	 * init the pv free list
	 */
	initial_pvs = vm_page_array_size;
	if (initial_pvs < MINPV)
		initial_pvs = MINPV;
	pvzone = &pvzone_store;
	pvinit = (struct pv_entry *) kmem_alloc(&kernel_map,
		initial_pvs * sizeof (struct pv_entry));
	zbootinit(pvzone, "PV ENTRY", sizeof (struct pv_entry), pvinit,
		initial_pvs);

	/*
	 * Now it is safe to enable pv_table recording.
	 */
	pmap_initialized = TRUE;
#ifdef SMP
	lapic = pmap_mapdev_uncacheable(cpu_apic_address, sizeof(struct LAPIC));
#endif
}

/*
 * Initialize the address space (zone) for the pv_entries.  Set a
 * high water mark so that the system can recover from excessive
 * numbers of pv entries.
 */
void
pmap_init2(void)
{
	int shpgperproc = PMAP_SHPGPERPROC;

	TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc);
	pv_entry_max = shpgperproc * maxproc + vm_page_array_size;
	TUNABLE_INT_FETCH("vm.pmap.pv_entries", &pv_entry_max);
	pv_entry_high_water = 9 * (pv_entry_max / 10);
	zinitna(pvzone, &pvzone_obj, NULL, 0, pv_entry_max, ZONE_INTERRUPT, 1);
}


/***************************************************
 * Low level helper routines.....
 ***************************************************/

#if defined(PMAP_DIAGNOSTIC)

/*
 * This code checks for non-writeable/modified pages.
 * This should be an invalid condition.
 */
static
int
pmap_nw_modified(pt_entry_t pte)
{
	if ((pte & (PG_M|PG_RW)) == PG_M)
		return 1;
	else
		return 0;
}
#endif


/*
 * this routine defines the region(s) of memory that should
 * not be tested for the modified bit.
 */
static __inline
int
pmap_track_modified(vm_offset_t va)
{
	if ((va < clean_sva) || (va >= clean_eva))
		return 1;
	else
		return 0;
}

/*
 * pmap_extract:
 *
 *	Extract the physical page address associated with the map/VA pair.
 *
 *	This function may not be called from an interrupt if the pmap is
 *	not kernel_pmap.
 */
vm_paddr_t
pmap_extract(pmap_t pmap, vm_offset_t va)
{
	vm_paddr_t rtval;
	pt_entry_t *pte;
	pd_entry_t pde, *pdep;

	rtval = 0;
	pdep = pmap_pde(pmap, va);
	if (pdep != NULL) {
		pde = *pdep;
		if (pde) {
			if ((pde & PG_PS) != 0) {
				rtval = (pde & PG_PS_FRAME) | (va & PDRMASK);
			} else {
				pte = pmap_pde_to_pte(pdep, va);
				rtval = (*pte & PG_FRAME) | (va & PAGE_MASK);
			}
		}
	}
	return rtval;
}

/*
 * Routine: pmap_kextract
 * Function:
 *	Extract the physical page address associated with the given
 *	kernel virtual address.
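 *
 *	Three cases are handled below: addresses inside the direct map
 *	translate with DMAP_TO_PHYS(), 2MB kernel mappings are
 *	recognized via PG_PS in the PDE, and everything else goes
 *	through the regular 4K PTE.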
881 */ 882 vm_paddr_t 883 pmap_kextract(vm_offset_t va) 884 { 885 pd_entry_t pde; 886 vm_paddr_t pa; 887 888 if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) { 889 pa = DMAP_TO_PHYS(va); 890 } else { 891 pde = *vtopde(va); 892 if (pde & PG_PS) { 893 pa = (pde & PG_PS_FRAME) | (va & PDRMASK); 894 } else { 895 /* 896 * Beware of a concurrent promotion that changes the 897 * PDE at this point! For example, vtopte() must not 898 * be used to access the PTE because it would use the 899 * new PDE. It is, however, safe to use the old PDE 900 * because the page table page is preserved by the 901 * promotion. 902 */ 903 pa = *pmap_pde_to_pte(&pde, va); 904 pa = (pa & PG_FRAME) | (va & PAGE_MASK); 905 } 906 } 907 return pa; 908 } 909 910 /*************************************************** 911 * Low level mapping routines..... 912 ***************************************************/ 913 914 /* 915 * Routine: pmap_kenter 916 * Function: 917 * Add a wired page to the KVA 918 * NOTE! note that in order for the mapping to take effect -- you 919 * should do an invltlb after doing the pmap_kenter(). 920 */ 921 void 922 pmap_kenter(vm_offset_t va, vm_paddr_t pa) 923 { 924 pt_entry_t *pte; 925 pt_entry_t npte; 926 pmap_inval_info info; 927 928 pmap_inval_init(&info); 929 npte = pa | PG_RW | PG_V | pgeflag; 930 pte = vtopte(va); 931 pmap_inval_interlock(&info, &kernel_pmap, va); 932 *pte = npte; 933 pmap_inval_deinterlock(&info, &kernel_pmap); 934 pmap_inval_done(&info); 935 } 936 937 /* 938 * Routine: pmap_kenter_quick 939 * Function: 940 * Similar to pmap_kenter(), except we only invalidate the 941 * mapping on the current CPU. 942 */ 943 void 944 pmap_kenter_quick(vm_offset_t va, vm_paddr_t pa) 945 { 946 pt_entry_t *pte; 947 pt_entry_t npte; 948 949 npte = pa | PG_RW | PG_V | pgeflag; 950 pte = vtopte(va); 951 *pte = npte; 952 cpu_invlpg((void *)va); 953 } 954 955 void 956 pmap_kenter_sync(vm_offset_t va) 957 { 958 pmap_inval_info info; 959 960 pmap_inval_init(&info); 961 pmap_inval_interlock(&info, &kernel_pmap, va); 962 pmap_inval_deinterlock(&info, &kernel_pmap); 963 pmap_inval_done(&info); 964 } 965 966 void 967 pmap_kenter_sync_quick(vm_offset_t va) 968 { 969 cpu_invlpg((void *)va); 970 } 971 972 /* 973 * remove a page from the kernel pagetables 974 */ 975 void 976 pmap_kremove(vm_offset_t va) 977 { 978 pt_entry_t *pte; 979 pmap_inval_info info; 980 981 pmap_inval_init(&info); 982 pte = vtopte(va); 983 pmap_inval_interlock(&info, &kernel_pmap, va); 984 *pte = 0; 985 pmap_inval_deinterlock(&info, &kernel_pmap); 986 pmap_inval_done(&info); 987 } 988 989 void 990 pmap_kremove_quick(vm_offset_t va) 991 { 992 pt_entry_t *pte; 993 pte = vtopte(va); 994 *pte = 0; 995 cpu_invlpg((void *)va); 996 } 997 998 /* 999 * XXX these need to be recoded. They are not used in any critical path. 1000 */ 1001 void 1002 pmap_kmodify_rw(vm_offset_t va) 1003 { 1004 *vtopte(va) |= PG_RW; 1005 cpu_invlpg((void *)va); 1006 } 1007 1008 void 1009 pmap_kmodify_nc(vm_offset_t va) 1010 { 1011 *vtopte(va) |= PG_N; 1012 cpu_invlpg((void *)va); 1013 } 1014 1015 /* 1016 * Used to map a range of physical addresses into kernel 1017 * virtual address space. 1018 * 1019 * For now, VM is already on, we only need to map the 1020 * specified memory. 
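 *
 * On x86_64 the body simply returns PHYS_TO_DMAP(start): the direct
 * map built in create_pagetables() already covers physical memory up
 * to dmaplimit, so no new mappings are created and *virtp is left
 * untouched.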
 */
vm_offset_t
pmap_map(vm_offset_t *virtp, vm_paddr_t start, vm_paddr_t end, int prot)
{
	return PHYS_TO_DMAP(start);
}


/*
 * Add a list of wired pages to the kva.
 * This routine is only used for temporary
 * kernel mappings that do not need to have
 * page modification or references recorded.
 * Note that old mappings are simply written
 * over.  The page *must* be wired.
 */
void
pmap_qenter(vm_offset_t va, vm_page_t *m, int count)
{
	vm_offset_t end_va;

	end_va = va + count * PAGE_SIZE;

	while (va < end_va) {
		pt_entry_t *pte;

		pte = vtopte(va);
		*pte = VM_PAGE_TO_PHYS(*m) | PG_RW | PG_V | pgeflag;
		cpu_invlpg((void *)va);
		va += PAGE_SIZE;
		m++;
	}
#ifdef SMP
	smp_invltlb();	/* XXX */
#endif
}

void
pmap_qenter2(vm_offset_t va, vm_page_t *m, int count, cpumask_t *mask)
{
	vm_offset_t end_va;
	cpumask_t cmask = mycpu->gd_cpumask;

	end_va = va + count * PAGE_SIZE;

	while (va < end_va) {
		pt_entry_t *pte;
		pt_entry_t pteval;

		/*
		 * Install the new PTE.  If the pte changed from the prior
		 * mapping we must reset the cpu mask and invalidate the page.
		 * If the pte is the same but we have not seen it on the
		 * current cpu, invlpg the existing mapping.  Otherwise the
		 * entry is optimal and no invalidation is required.
		 */
		pte = vtopte(va);
		pteval = VM_PAGE_TO_PHYS(*m) | PG_A | PG_RW | PG_V | pgeflag;
		if (*pte != pteval) {
			*mask = 0;
			*pte = pteval;
			cpu_invlpg((void *)va);
		} else if ((*mask & cmask) == 0) {
			cpu_invlpg((void *)va);
		}
		va += PAGE_SIZE;
		m++;
	}
	*mask |= cmask;
}

/*
 * This routine jerks page mappings from the
 * kernel -- it is meant only for temporary mappings.
 *
 * MPSAFE, INTERRUPT SAFE (cluster callback)
 */
void
pmap_qremove(vm_offset_t va, int count)
{
	vm_offset_t end_va;

	end_va = va + count * PAGE_SIZE;

	while (va < end_va) {
		pt_entry_t *pte;

		pte = vtopte(va);
		*pte = 0;
		cpu_invlpg((void *)va);
		va += PAGE_SIZE;
	}
#ifdef SMP
	smp_invltlb();
#endif
}

/*
 * This routine works like vm_page_lookup() but also blocks as long as the
 * page is busy.  This routine does not busy the page it returns.
 *
 * Unless the caller is managing objects whose pages are in a known state,
 * the call should be made with a critical section held so the page's object
 * association remains valid on return.
 */
static
vm_page_t
pmap_page_lookup(vm_object_t object, vm_pindex_t pindex)
{
	vm_page_t m;

	do {
		m = vm_page_lookup(object, pindex);
	} while (m && vm_page_sleep_busy(m, FALSE, "pplookp"));

	return(m);
}

/*
 * Create a new thread and optionally associate it with a (new) process.
 * NOTE! the new thread's cpu may not equal the current cpu.
 */
void
pmap_init_thread(thread_t td)
{
	/* enforce pcb placement */
	td->td_pcb = (struct pcb *)(td->td_kstack + td->td_kstack_size) - 1;
	td->td_savefpu = &td->td_pcb->pcb_save;
	td->td_sp = (char *)td->td_pcb - 16; /* JG is -16 needed on x86_64? */
}

/*
 * This routine directly affects the fork perf for a process.
1154 */ 1155 void 1156 pmap_init_proc(struct proc *p) 1157 { 1158 } 1159 1160 /* 1161 * Dispose the UPAGES for a process that has exited. 1162 * This routine directly impacts the exit perf of a process. 1163 */ 1164 void 1165 pmap_dispose_proc(struct proc *p) 1166 { 1167 KASSERT(p->p_lock == 0, ("attempt to dispose referenced proc! %p", p)); 1168 } 1169 1170 /*************************************************** 1171 * Page table page management routines..... 1172 ***************************************************/ 1173 1174 /* 1175 * This routine unholds page table pages, and if the hold count 1176 * drops to zero, then it decrements the wire count. 1177 */ 1178 static __inline 1179 int 1180 pmap_unwire_pte_hold(pmap_t pmap, vm_offset_t va, vm_page_t m, 1181 pmap_inval_info_t info) 1182 { 1183 KKASSERT(m->hold_count > 0); 1184 if (m->hold_count > 1) { 1185 vm_page_unhold(m); 1186 return 0; 1187 } else { 1188 return _pmap_unwire_pte_hold(pmap, va, m, info); 1189 } 1190 } 1191 1192 static 1193 int 1194 _pmap_unwire_pte_hold(pmap_t pmap, vm_offset_t va, vm_page_t m, 1195 pmap_inval_info_t info) 1196 { 1197 /* 1198 * Wait until we can busy the page ourselves. We cannot have 1199 * any active flushes if we block. We own one hold count on the 1200 * page so it cannot be freed out from under us. 1201 */ 1202 if (m->flags & PG_BUSY) { 1203 pmap_inval_flush(info); 1204 while (vm_page_sleep_busy(m, FALSE, "pmuwpt")) 1205 ; 1206 } 1207 KASSERT(m->queue == PQ_NONE, 1208 ("_pmap_unwire_pte_hold: %p->queue != PQ_NONE", m)); 1209 1210 /* 1211 * This case can occur if new references were acquired while 1212 * we were blocked. 1213 */ 1214 if (m->hold_count > 1) { 1215 KKASSERT(m->hold_count > 1); 1216 vm_page_unhold(m); 1217 return 0; 1218 } 1219 1220 /* 1221 * Unmap the page table page 1222 */ 1223 KKASSERT(m->hold_count == 1); 1224 vm_page_busy(m); 1225 pmap_inval_interlock(info, pmap, -1); 1226 1227 if (m->pindex >= (NUPDE + NUPDPE)) { 1228 /* PDP page */ 1229 pml4_entry_t *pml4; 1230 pml4 = pmap_pml4e(pmap, va); 1231 *pml4 = 0; 1232 } else if (m->pindex >= NUPDE) { 1233 /* PD page */ 1234 pdp_entry_t *pdp; 1235 pdp = pmap_pdpe(pmap, va); 1236 *pdp = 0; 1237 } else { 1238 /* PT page */ 1239 pd_entry_t *pd; 1240 pd = pmap_pde(pmap, va); 1241 *pd = 0; 1242 } 1243 1244 KKASSERT(pmap->pm_stats.resident_count > 0); 1245 --pmap->pm_stats.resident_count; 1246 1247 if (pmap->pm_ptphint == m) 1248 pmap->pm_ptphint = NULL; 1249 pmap_inval_deinterlock(info, pmap); 1250 1251 if (m->pindex < NUPDE) { 1252 /* We just released a PT, unhold the matching PD */ 1253 vm_page_t pdpg; 1254 1255 pdpg = PHYS_TO_VM_PAGE(*pmap_pdpe(pmap, va) & PG_FRAME); 1256 pmap_unwire_pte_hold(pmap, va, pdpg, info); 1257 } 1258 if (m->pindex >= NUPDE && m->pindex < (NUPDE + NUPDPE)) { 1259 /* We just released a PD, unhold the matching PDP */ 1260 vm_page_t pdppg; 1261 1262 pdppg = PHYS_TO_VM_PAGE(*pmap_pml4e(pmap, va) & PG_FRAME); 1263 pmap_unwire_pte_hold(pmap, va, pdppg, info); 1264 } 1265 1266 /* 1267 * This was our last hold, the page had better be unwired 1268 * after we decrement wire_count. 1269 * 1270 * FUTURE NOTE: shared page directory page could result in 1271 * multiple wire counts. 
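 *
 * Note the unwind above: releasing a PT page unholds its parent PD,
 * and releasing a PD page unholds its parent PDP, so empty
 * intermediate levels are reclaimed through this same routine.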
1272 */ 1273 vm_page_unhold(m); 1274 --m->wire_count; 1275 KKASSERT(m->wire_count == 0); 1276 --vmstats.v_wire_count; 1277 vm_page_flag_clear(m, PG_MAPPED | PG_WRITEABLE); 1278 vm_page_flash(m); 1279 vm_page_free_zero(m); 1280 1281 return 1; 1282 } 1283 1284 /* 1285 * After removing a page table entry, this routine is used to 1286 * conditionally free the page, and manage the hold/wire counts. 1287 */ 1288 static 1289 int 1290 pmap_unuse_pt(pmap_t pmap, vm_offset_t va, vm_page_t mpte, 1291 pmap_inval_info_t info) 1292 { 1293 vm_pindex_t ptepindex; 1294 1295 if (va >= VM_MAX_USER_ADDRESS) 1296 return 0; 1297 1298 if (mpte == NULL) { 1299 ptepindex = pmap_pde_pindex(va); 1300 #if JGHINT 1301 if (pmap->pm_ptphint && 1302 (pmap->pm_ptphint->pindex == ptepindex)) { 1303 mpte = pmap->pm_ptphint; 1304 } else { 1305 #endif 1306 pmap_inval_flush(info); 1307 mpte = pmap_page_lookup(pmap->pm_pteobj, ptepindex); 1308 pmap->pm_ptphint = mpte; 1309 #if JGHINT 1310 } 1311 #endif 1312 } 1313 return pmap_unwire_pte_hold(pmap, va, mpte, info); 1314 } 1315 1316 /* 1317 * Initialize pmap0/vmspace0. This pmap is not added to pmap_list because 1318 * it, and IdlePTD, represents the template used to update all other pmaps. 1319 * 1320 * On architectures where the kernel pmap is not integrated into the user 1321 * process pmap, this pmap represents the process pmap, not the kernel pmap. 1322 * kernel_pmap should be used to directly access the kernel_pmap. 1323 */ 1324 void 1325 pmap_pinit0(struct pmap *pmap) 1326 { 1327 pmap->pm_pml4 = (pml4_entry_t *)(PTOV_OFFSET + KPML4phys); 1328 pmap->pm_count = 1; 1329 pmap->pm_active = 0; 1330 pmap->pm_ptphint = NULL; 1331 TAILQ_INIT(&pmap->pm_pvlist); 1332 bzero(&pmap->pm_stats, sizeof pmap->pm_stats); 1333 } 1334 1335 /* 1336 * Initialize a preallocated and zeroed pmap structure, 1337 * such as one in a vmspace structure. 1338 */ 1339 void 1340 pmap_pinit(struct pmap *pmap) 1341 { 1342 vm_page_t ptdpg; 1343 1344 /* 1345 * No need to allocate page table space yet but we do need a valid 1346 * page directory table. 1347 */ 1348 if (pmap->pm_pml4 == NULL) { 1349 pmap->pm_pml4 = 1350 (pml4_entry_t *)kmem_alloc_pageable(&kernel_map, PAGE_SIZE); 1351 } 1352 1353 /* 1354 * Allocate an object for the ptes 1355 */ 1356 if (pmap->pm_pteobj == NULL) 1357 pmap->pm_pteobj = vm_object_allocate(OBJT_DEFAULT, NUPDE + NUPDPE + PML4PML4I + 1); 1358 1359 /* 1360 * Allocate the page directory page, unless we already have 1361 * one cached. If we used the cached page the wire_count will 1362 * already be set appropriately. 
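 *
 * The page directory (PML4) page lives at pindex
 * NUPDE + NUPDPE + PML4PML4I in pm_pteobj; the rest of the object
 * uses pindexes [0, NUPDE) for PT pages, [NUPDE, NUPDE + NUPDPE)
 * for PD pages, and NUPDE + NUPDPE upward for PDP pages, which is
 * the convention _pmap_allocpte() and pmap_release_free_page()
 * rely on.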
1363 */ 1364 if ((ptdpg = pmap->pm_pdirm) == NULL) { 1365 ptdpg = vm_page_grab(pmap->pm_pteobj, NUPDE + NUPDPE + PML4PML4I, 1366 VM_ALLOC_NORMAL | VM_ALLOC_RETRY); 1367 pmap->pm_pdirm = ptdpg; 1368 vm_page_flag_clear(ptdpg, PG_MAPPED | PG_BUSY); 1369 ptdpg->valid = VM_PAGE_BITS_ALL; 1370 if (ptdpg->wire_count == 0) 1371 ++vmstats.v_wire_count; 1372 ptdpg->wire_count = 1; 1373 pmap_kenter((vm_offset_t)pmap->pm_pml4, VM_PAGE_TO_PHYS(ptdpg)); 1374 } 1375 if ((ptdpg->flags & PG_ZERO) == 0) 1376 bzero(pmap->pm_pml4, PAGE_SIZE); 1377 1378 pmap->pm_pml4[KPML4I] = KPDPphys | PG_RW | PG_V | PG_U; 1379 pmap->pm_pml4[DMPML4I] = DMPDPphys | PG_RW | PG_V | PG_U; 1380 1381 /* install self-referential address mapping entry */ 1382 pmap->pm_pml4[PML4PML4I] = VM_PAGE_TO_PHYS(ptdpg) | PG_V | PG_RW | PG_A | PG_M; 1383 1384 pmap->pm_count = 1; 1385 pmap->pm_active = 0; 1386 pmap->pm_ptphint = NULL; 1387 TAILQ_INIT(&pmap->pm_pvlist); 1388 bzero(&pmap->pm_stats, sizeof pmap->pm_stats); 1389 pmap->pm_stats.resident_count = 1; 1390 } 1391 1392 /* 1393 * Clean up a pmap structure so it can be physically freed. This routine 1394 * is called by the vmspace dtor function. A great deal of pmap data is 1395 * left passively mapped to improve vmspace management so we have a bit 1396 * of cleanup work to do here. 1397 */ 1398 void 1399 pmap_puninit(pmap_t pmap) 1400 { 1401 vm_page_t p; 1402 1403 KKASSERT(pmap->pm_active == 0); 1404 if ((p = pmap->pm_pdirm) != NULL) { 1405 KKASSERT(pmap->pm_pml4 != NULL); 1406 KKASSERT(pmap->pm_pml4 != (void *)(PTOV_OFFSET + KPML4phys)); 1407 pmap_kremove((vm_offset_t)pmap->pm_pml4); 1408 p->wire_count--; 1409 vmstats.v_wire_count--; 1410 KKASSERT((p->flags & PG_BUSY) == 0); 1411 vm_page_busy(p); 1412 vm_page_free_zero(p); 1413 pmap->pm_pdirm = NULL; 1414 } 1415 if (pmap->pm_pml4) { 1416 KKASSERT(pmap->pm_pml4 != (void *)(PTOV_OFFSET + KPML4phys)); 1417 kmem_free(&kernel_map, (vm_offset_t)pmap->pm_pml4, PAGE_SIZE); 1418 pmap->pm_pml4 = NULL; 1419 } 1420 if (pmap->pm_pteobj) { 1421 vm_object_deallocate(pmap->pm_pteobj); 1422 pmap->pm_pteobj = NULL; 1423 } 1424 } 1425 1426 /* 1427 * Wire in kernel global address entries. To avoid a race condition 1428 * between pmap initialization and pmap_growkernel, this procedure 1429 * adds the pmap to the master list (which growkernel scans to update), 1430 * then copies the template. 1431 */ 1432 void 1433 pmap_pinit2(struct pmap *pmap) 1434 { 1435 crit_enter(); 1436 TAILQ_INSERT_TAIL(&pmap_list, pmap, pm_pmnode); 1437 /* XXX copies current process, does not fill in MPPTDI */ 1438 crit_exit(); 1439 } 1440 1441 /* 1442 * Attempt to release and free a vm_page in a pmap. Returns 1 on success, 1443 * 0 on failure (if the procedure had to sleep). 1444 * 1445 * When asked to remove the page directory page itself, we actually just 1446 * leave it cached so we do not have to incur the SMP inval overhead of 1447 * removing the kernel mapping. pmap_puninit() will take care of it. 1448 */ 1449 static 1450 int 1451 pmap_release_free_page(struct pmap *pmap, vm_page_t p) 1452 { 1453 /* 1454 * This code optimizes the case of freeing non-busy 1455 * page-table pages. Those pages are zero now, and 1456 * might as well be placed directly into the zero queue. 1457 */ 1458 if (vm_page_sleep_busy(p, FALSE, "pmaprl")) 1459 return 0; 1460 1461 vm_page_busy(p); 1462 1463 /* 1464 * Remove the page table page from the processes address space. 1465 */ 1466 if (p->pindex == NUPDE + NUPDPE + PML4PML4I) { 1467 /* 1468 * We are the pml4 table itself. 
1469 */ 1470 /* XXX anything to do here? */ 1471 } else if (p->pindex >= (NUPDE + NUPDPE)) { 1472 /* 1473 * Remove a PDP page from the PML4. We do not maintain 1474 * hold counts on the PML4 page. 1475 */ 1476 pml4_entry_t *pml4; 1477 vm_page_t m4; 1478 int idx; 1479 1480 m4 = vm_page_lookup(pmap->pm_pteobj, NUPDE + NUPDPE + PML4PML4I); 1481 KKASSERT(m4 != NULL); 1482 pml4 = (void *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m4)); 1483 idx = (p->pindex - (NUPDE + NUPDPE)) % NPML4EPG; 1484 KKASSERT(pml4[idx] != 0); 1485 pml4[idx] = 0; 1486 } else if (p->pindex >= NUPDE) { 1487 /* 1488 * Remove a PD page from the PDP and drop the hold count 1489 * on the PDP. The PDP is left cached in the pmap if 1490 * the hold count drops to 0 so the wire count remains 1491 * intact. 1492 */ 1493 vm_page_t m3; 1494 pdp_entry_t *pdp; 1495 int idx; 1496 1497 m3 = vm_page_lookup(pmap->pm_pteobj, 1498 NUPDE + NUPDPE + (p->pindex - NUPDE) / NPDPEPG); 1499 KKASSERT(m3 != NULL); 1500 pdp = (void *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m3)); 1501 idx = (p->pindex - NUPDE) % NPDPEPG; 1502 KKASSERT(pdp[idx] != 0); 1503 pdp[idx] = 0; 1504 m3->hold_count--; 1505 } else { 1506 /* 1507 * Remove a PT page from the PD and drop the hold count 1508 * on the PD. The PD is left cached in the pmap if 1509 * the hold count drops to 0 so the wire count remains 1510 * intact. 1511 */ 1512 vm_page_t m2; 1513 pd_entry_t *pd; 1514 int idx; 1515 1516 m2 = vm_page_lookup(pmap->pm_pteobj, 1517 NUPDE + p->pindex / NPDEPG); 1518 KKASSERT(m2 != NULL); 1519 pd = (void *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m2)); 1520 idx = p->pindex % NPDEPG; 1521 pd[idx] = 0; 1522 m2->hold_count--; 1523 } 1524 1525 /* 1526 * One fewer mappings in the pmap. p's hold count had better 1527 * be zero. 1528 */ 1529 KKASSERT(pmap->pm_stats.resident_count > 0); 1530 --pmap->pm_stats.resident_count; 1531 if (p->hold_count) 1532 panic("pmap_release: freeing held page table page"); 1533 if (pmap->pm_ptphint && (pmap->pm_ptphint->pindex == p->pindex)) 1534 pmap->pm_ptphint = NULL; 1535 1536 /* 1537 * We leave the top-level page table page cached, wired, and mapped in 1538 * the pmap until the dtor function (pmap_puninit()) gets called. 1539 * However, still clean it up so we can set PG_ZERO. 1540 */ 1541 if (p->pindex == NUPDE + NUPDPE + PML4PML4I) { 1542 bzero(pmap->pm_pml4, PAGE_SIZE); 1543 vm_page_flag_set(p, PG_ZERO); 1544 vm_page_wakeup(p); 1545 } else { 1546 p->wire_count--; 1547 KKASSERT(p->wire_count == 0); 1548 vmstats.v_wire_count--; 1549 /* JG eventually revert to using vm_page_free_zero() */ 1550 vm_page_free(p); 1551 } 1552 return 1; 1553 } 1554 1555 /* 1556 * This routine is called when various levels in the page table need to 1557 * be populated. This routine cannot fail. 1558 */ 1559 static 1560 vm_page_t 1561 _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex) 1562 { 1563 vm_page_t m; 1564 1565 /* 1566 * Find or fabricate a new pagetable page. This will busy the page. 1567 */ 1568 m = vm_page_grab(pmap->pm_pteobj, ptepindex, 1569 VM_ALLOC_NORMAL | VM_ALLOC_ZERO | VM_ALLOC_RETRY); 1570 if ((m->flags & PG_ZERO) == 0) { 1571 pmap_zero_page(VM_PAGE_TO_PHYS(m)); 1572 } 1573 1574 KASSERT(m->queue == PQ_NONE, 1575 ("_pmap_allocpte: %p->queue != PQ_NONE", m)); 1576 1577 /* 1578 * Increment the hold count for the page we will be returning to 1579 * the caller. 1580 */ 1581 m->hold_count++; 1582 if (m->wire_count++ == 0) 1583 vmstats.v_wire_count++; 1584 1585 /* 1586 * Map the pagetable page into the process address space, if 1587 * it isn't already there. 
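 *
 * (The three cases below are selected by ptepindex: indexes at or
 * above NUPDE + NUPDPE wire a PDP page into the PML4, indexes at or
 * above NUPDE wire a PD page into a PDP, and anything lower wires a
 * PT page into a PD, recursing to build missing upper levels first.)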
1588 * 1589 * It is possible that someone else got in and mapped the page 1590 * directory page while we were blocked, if so just unbusy and 1591 * return the held page. 1592 */ 1593 if (ptepindex >= (NUPDE + NUPDPE)) { 1594 /* 1595 * Wire up a new PDP page in the PML4 1596 */ 1597 vm_pindex_t pml4index; 1598 pml4_entry_t *pml4; 1599 1600 pml4index = ptepindex - (NUPDE + NUPDPE); 1601 pml4 = &pmap->pm_pml4[pml4index]; 1602 if (*pml4 & PG_V) { 1603 if (--m->wire_count == 0) 1604 --vmstats.v_wire_count; 1605 vm_page_wakeup(m); 1606 return(m); 1607 } 1608 *pml4 = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M; 1609 } else if (ptepindex >= NUPDE) { 1610 /* 1611 * Wire up a new PD page in the PDP 1612 */ 1613 vm_pindex_t pml4index; 1614 vm_pindex_t pdpindex; 1615 vm_page_t pdppg; 1616 pml4_entry_t *pml4; 1617 pdp_entry_t *pdp; 1618 1619 pdpindex = ptepindex - NUPDE; 1620 pml4index = pdpindex >> NPML4EPGSHIFT; 1621 1622 pml4 = &pmap->pm_pml4[pml4index]; 1623 if ((*pml4 & PG_V) == 0) { 1624 /* 1625 * Have to allocate a new PDP page, recurse. 1626 * This always succeeds. Returned page will 1627 * be held. 1628 */ 1629 pdppg = _pmap_allocpte(pmap, 1630 NUPDE + NUPDPE + pml4index); 1631 } else { 1632 /* 1633 * Add a held reference to the PDP page. 1634 */ 1635 pdppg = PHYS_TO_VM_PAGE(*pml4 & PG_FRAME); 1636 pdppg->hold_count++; 1637 } 1638 1639 /* 1640 * Now find the pdp_entry and map the PDP. If the PDP 1641 * has already been mapped unwind and return the 1642 * already-mapped PDP held. 1643 * 1644 * pdppg is left held (hold_count is incremented for 1645 * each PD in the PDP). 1646 */ 1647 pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME); 1648 pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)]; 1649 if (*pdp & PG_V) { 1650 vm_page_unhold(pdppg); 1651 if (--m->wire_count == 0) 1652 --vmstats.v_wire_count; 1653 vm_page_wakeup(m); 1654 return(m); 1655 } 1656 *pdp = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M; 1657 } else { 1658 /* 1659 * Wire up the new PT page in the PD 1660 */ 1661 vm_pindex_t pml4index; 1662 vm_pindex_t pdpindex; 1663 pml4_entry_t *pml4; 1664 pdp_entry_t *pdp; 1665 pd_entry_t *pd; 1666 vm_page_t pdpg; 1667 1668 pdpindex = ptepindex >> NPDPEPGSHIFT; 1669 pml4index = pdpindex >> NPML4EPGSHIFT; 1670 1671 /* 1672 * Locate the PDP page in the PML4, then the PD page in 1673 * the PDP. If either does not exist we simply recurse 1674 * to allocate them. 1675 * 1676 * We can just recurse on the PD page as it will recurse 1677 * on the PDP if necessary. 1678 */ 1679 pml4 = &pmap->pm_pml4[pml4index]; 1680 if ((*pml4 & PG_V) == 0) { 1681 pdpg = _pmap_allocpte(pmap, NUPDE + pdpindex); 1682 pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME); 1683 pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)]; 1684 } else { 1685 pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME); 1686 pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)]; 1687 if ((*pdp & PG_V) == 0) { 1688 pdpg = _pmap_allocpte(pmap, NUPDE + pdpindex); 1689 } else { 1690 pdpg = PHYS_TO_VM_PAGE(*pdp & PG_FRAME); 1691 pdpg->hold_count++; 1692 } 1693 } 1694 1695 /* 1696 * Now fill in the pte in the PD. If the pte already exists 1697 * (again, if we raced the grab), unhold pdpg and unwire 1698 * m, returning a held m. 1699 * 1700 * pdpg is left held (hold_count is incremented for 1701 * each PT in the PD). 
1702 */ 1703 pd = (pd_entry_t *)PHYS_TO_DMAP(*pdp & PG_FRAME); 1704 pd = &pd[ptepindex & ((1ul << NPDEPGSHIFT) - 1)]; 1705 if (*pd != 0) { 1706 vm_page_unhold(pdpg); 1707 if (--m->wire_count == 0) 1708 --vmstats.v_wire_count; 1709 vm_page_wakeup(m); 1710 return(m); 1711 } 1712 *pd = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M; 1713 } 1714 1715 /* 1716 * We successfully loaded a PDP, PD, or PTE. Set the page table hint, 1717 * valid bits, mapped flag, unbusy, and we're done. 1718 */ 1719 pmap->pm_ptphint = m; 1720 ++pmap->pm_stats.resident_count; 1721 1722 m->valid = VM_PAGE_BITS_ALL; 1723 vm_page_flag_clear(m, PG_ZERO); 1724 vm_page_flag_set(m, PG_MAPPED); 1725 vm_page_wakeup(m); 1726 1727 return (m); 1728 } 1729 1730 static 1731 vm_page_t 1732 pmap_allocpte(pmap_t pmap, vm_offset_t va) 1733 { 1734 vm_pindex_t ptepindex; 1735 pd_entry_t *pd; 1736 vm_page_t m; 1737 1738 /* 1739 * Calculate pagetable page index 1740 */ 1741 ptepindex = pmap_pde_pindex(va); 1742 1743 /* 1744 * Get the page directory entry 1745 */ 1746 pd = pmap_pde(pmap, va); 1747 1748 /* 1749 * This supports switching from a 2MB page to a 1750 * normal 4K page. 1751 */ 1752 if (pd != NULL && (*pd & (PG_PS | PG_V)) == (PG_PS | PG_V)) { 1753 panic("no promotion/demotion yet"); 1754 *pd = 0; 1755 pd = NULL; 1756 cpu_invltlb(); 1757 smp_invltlb(); 1758 } 1759 1760 /* 1761 * If the page table page is mapped, we just increment the 1762 * hold count, and activate it. 1763 */ 1764 if (pd != NULL && (*pd & PG_V) != 0) { 1765 /* YYY hint is used here on i386 */ 1766 m = pmap_page_lookup( pmap->pm_pteobj, ptepindex); 1767 pmap->pm_ptphint = m; 1768 m->hold_count++; 1769 return m; 1770 } 1771 /* 1772 * Here if the pte page isn't mapped, or if it has been deallocated. 1773 */ 1774 return _pmap_allocpte(pmap, ptepindex); 1775 } 1776 1777 1778 /*************************************************** 1779 * Pmap allocation/deallocation routines. 1780 ***************************************************/ 1781 1782 /* 1783 * Release any resources held by the given physical map. 1784 * Called when a pmap initialized by pmap_pinit is being released. 1785 * Should only be called if the map contains no valid mappings. 1786 */ 1787 static int pmap_release_callback(struct vm_page *p, void *data); 1788 1789 void 1790 pmap_release(struct pmap *pmap) 1791 { 1792 vm_object_t object = pmap->pm_pteobj; 1793 struct rb_vm_page_scan_info info; 1794 1795 KASSERT(pmap->pm_active == 0, ("pmap still active! 
%08x", pmap->pm_active)); 1796 #if defined(DIAGNOSTIC) 1797 if (object->ref_count != 1) 1798 panic("pmap_release: pteobj reference count != 1"); 1799 #endif 1800 1801 info.pmap = pmap; 1802 info.object = object; 1803 crit_enter(); 1804 TAILQ_REMOVE(&pmap_list, pmap, pm_pmnode); 1805 crit_exit(); 1806 1807 do { 1808 crit_enter(); 1809 info.error = 0; 1810 info.mpte = NULL; 1811 info.limit = object->generation; 1812 1813 vm_page_rb_tree_RB_SCAN(&object->rb_memq, NULL, 1814 pmap_release_callback, &info); 1815 if (info.error == 0 && info.mpte) { 1816 if (!pmap_release_free_page(pmap, info.mpte)) 1817 info.error = 1; 1818 } 1819 crit_exit(); 1820 } while (info.error); 1821 } 1822 1823 static 1824 int 1825 pmap_release_callback(struct vm_page *p, void *data) 1826 { 1827 struct rb_vm_page_scan_info *info = data; 1828 1829 if (p->pindex == NUPDE + NUPDPE + PML4PML4I) { 1830 info->mpte = p; 1831 return(0); 1832 } 1833 if (!pmap_release_free_page(info->pmap, p)) { 1834 info->error = 1; 1835 return(-1); 1836 } 1837 if (info->object->generation != info->limit) { 1838 info->error = 1; 1839 return(-1); 1840 } 1841 return(0); 1842 } 1843 1844 /* 1845 * Grow the number of kernel page table entries, if needed. 1846 */ 1847 void 1848 pmap_growkernel(vm_offset_t addr) 1849 { 1850 vm_paddr_t paddr; 1851 vm_offset_t ptppaddr; 1852 vm_page_t nkpg; 1853 pd_entry_t *pde, newpdir; 1854 pdp_entry_t newpdp; 1855 1856 crit_enter(); 1857 if (kernel_vm_end == 0) { 1858 kernel_vm_end = VM_MIN_KERNEL_ADDRESS; 1859 nkpt = 0; 1860 while ((*pmap_pde(&kernel_pmap, kernel_vm_end) & PG_V) != 0) { 1861 kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1); 1862 nkpt++; 1863 if (kernel_vm_end - 1 >= kernel_map.max_offset) { 1864 kernel_vm_end = kernel_map.max_offset; 1865 break; 1866 } 1867 } 1868 } 1869 addr = roundup2(addr, PAGE_SIZE * NPTEPG); 1870 if (addr - 1 >= kernel_map.max_offset) 1871 addr = kernel_map.max_offset; 1872 while (kernel_vm_end < addr) { 1873 pde = pmap_pde(&kernel_pmap, kernel_vm_end); 1874 if (pde == NULL) { 1875 /* We need a new PDP entry */ 1876 nkpg = vm_page_alloc(kptobj, nkpt, 1877 VM_ALLOC_NORMAL | VM_ALLOC_SYSTEM 1878 | VM_ALLOC_INTERRUPT); 1879 if (nkpg == NULL) 1880 panic("pmap_growkernel: no memory to grow kernel"); 1881 paddr = VM_PAGE_TO_PHYS(nkpg); 1882 if ((nkpg->flags & PG_ZERO) == 0) 1883 pmap_zero_page(paddr); 1884 vm_page_flag_clear(nkpg, PG_ZERO); 1885 newpdp = (pdp_entry_t) 1886 (paddr | PG_V | PG_RW | PG_A | PG_M); 1887 *pmap_pdpe(&kernel_pmap, kernel_vm_end) = newpdp; 1888 nkpt++; 1889 continue; /* try again */ 1890 } 1891 if ((*pde & PG_V) != 0) { 1892 kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1); 1893 if (kernel_vm_end - 1 >= kernel_map.max_offset) { 1894 kernel_vm_end = kernel_map.max_offset; 1895 break; 1896 } 1897 continue; 1898 } 1899 1900 /* 1901 * This index is bogus, but out of the way 1902 */ 1903 nkpg = vm_page_alloc(kptobj, nkpt, 1904 VM_ALLOC_NORMAL | VM_ALLOC_SYSTEM | VM_ALLOC_INTERRUPT); 1905 if (nkpg == NULL) 1906 panic("pmap_growkernel: no memory to grow kernel"); 1907 1908 vm_page_wire(nkpg); 1909 ptppaddr = VM_PAGE_TO_PHYS(nkpg); 1910 pmap_zero_page(ptppaddr); 1911 vm_page_flag_clear(nkpg, PG_ZERO); 1912 newpdir = (pd_entry_t) (ptppaddr | PG_V | PG_RW | PG_A | PG_M); 1913 *pmap_pde(&kernel_pmap, kernel_vm_end) = newpdir; 1914 nkpt++; 1915 1916 kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1); 1917 if (kernel_vm_end - 1 >= kernel_map.max_offset) { 1918 kernel_vm_end = 
kernel_map.max_offset; 1919 break; 1920 } 1921 } 1922 crit_exit(); 1923 } 1924 1925 /* 1926 * Retire the given physical map from service. 1927 * Should only be called if the map contains 1928 * no valid mappings. 1929 */ 1930 void 1931 pmap_destroy(pmap_t pmap) 1932 { 1933 int count; 1934 1935 if (pmap == NULL) 1936 return; 1937 1938 count = --pmap->pm_count; 1939 if (count == 0) { 1940 pmap_release(pmap); 1941 panic("destroying a pmap is not yet implemented"); 1942 } 1943 } 1944 1945 /* 1946 * Add a reference to the specified pmap. 1947 */ 1948 void 1949 pmap_reference(pmap_t pmap) 1950 { 1951 if (pmap != NULL) { 1952 pmap->pm_count++; 1953 } 1954 } 1955 1956 /*************************************************** 1957 * page management routines. 1958 ***************************************************/ 1959 1960 /* 1961 * free the pv_entry back to the free list. This function may be 1962 * called from an interrupt. 1963 */ 1964 static __inline 1965 void 1966 free_pv_entry(pv_entry_t pv) 1967 { 1968 pv_entry_count--; 1969 KKASSERT(pv_entry_count >= 0); 1970 zfree(pvzone, pv); 1971 } 1972 1973 /* 1974 * get a new pv_entry, allocating a block from the system 1975 * when needed. This function may be called from an interrupt. 1976 */ 1977 static 1978 pv_entry_t 1979 get_pv_entry(void) 1980 { 1981 pv_entry_count++; 1982 if (pv_entry_high_water && 1983 (pv_entry_count > pv_entry_high_water) && 1984 (pmap_pagedaemon_waken == 0)) { 1985 pmap_pagedaemon_waken = 1; 1986 wakeup(&vm_pages_needed); 1987 } 1988 return zalloc(pvzone); 1989 } 1990 1991 /* 1992 * This routine is very drastic, but can save the system 1993 * in a pinch. 1994 */ 1995 void 1996 pmap_collect(void) 1997 { 1998 int i; 1999 vm_page_t m; 2000 static int warningdone=0; 2001 2002 if (pmap_pagedaemon_waken == 0) 2003 return; 2004 2005 if (warningdone < 5) { 2006 kprintf("pmap_collect: collecting pv entries -- suggest increasing PMAP_SHPGPERPROC\n"); 2007 warningdone++; 2008 } 2009 2010 for(i = 0; i < vm_page_array_size; i++) { 2011 m = &vm_page_array[i]; 2012 if (m->wire_count || m->hold_count || m->busy || 2013 (m->flags & PG_BUSY)) 2014 continue; 2015 pmap_remove_all(m); 2016 } 2017 pmap_pagedaemon_waken = 0; 2018 } 2019 2020 2021 /* 2022 * If it is the first entry on the list, it is actually 2023 * in the header and we must copy the following entry up 2024 * to the header. Otherwise we must search the list for 2025 * the entry. In either case we free the now unused entry. 2026 */ 2027 static 2028 int 2029 pmap_remove_entry(struct pmap *pmap, vm_page_t m, 2030 vm_offset_t va, pmap_inval_info_t info) 2031 { 2032 pv_entry_t pv; 2033 int rtval; 2034 2035 crit_enter(); 2036 if (m->md.pv_list_count < pmap->pm_stats.resident_count) { 2037 TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { 2038 if (pmap == pv->pv_pmap && va == pv->pv_va) 2039 break; 2040 } 2041 } else { 2042 TAILQ_FOREACH(pv, &pmap->pm_pvlist, pv_plist) { 2043 if (va == pv->pv_va) 2044 break; 2045 } 2046 } 2047 2048 rtval = 0; 2049 KKASSERT(pv); 2050 2051 TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); 2052 m->md.pv_list_count--; 2053 KKASSERT(m->md.pv_list_count >= 0); 2054 if (TAILQ_EMPTY(&m->md.pv_list)) 2055 vm_page_flag_clear(m, PG_MAPPED | PG_WRITEABLE); 2056 TAILQ_REMOVE(&pmap->pm_pvlist, pv, pv_plist); 2057 ++pmap->pm_generation; 2058 rtval = pmap_unuse_pt(pmap, va, pv->pv_ptem, info); 2059 free_pv_entry(pv); 2060 2061 crit_exit(); 2062 return rtval; 2063 } 2064 2065 /* 2066 * Create a pv entry for page at pa for 2067 * (pmap, va). 
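 *
 * The new entry is linked onto both the pmap's pm_pvlist and the
 * page's md.pv_list; pmap_remove_entry() above searches whichever
 * of the two lists is expected to be shorter.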
2068 */ 2069 static 2070 void 2071 pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t mpte, vm_page_t m) 2072 { 2073 pv_entry_t pv; 2074 2075 crit_enter(); 2076 pv = get_pv_entry(); 2077 pv->pv_va = va; 2078 pv->pv_pmap = pmap; 2079 pv->pv_ptem = mpte; 2080 2081 TAILQ_INSERT_TAIL(&pmap->pm_pvlist, pv, pv_plist); 2082 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list); 2083 ++pmap->pm_generation; 2084 m->md.pv_list_count++; 2085 2086 crit_exit(); 2087 } 2088 2089 /* 2090 * pmap_remove_pte: do the things to unmap a page in a process 2091 */ 2092 static 2093 int 2094 pmap_remove_pte(struct pmap *pmap, pt_entry_t *ptq, vm_offset_t va, 2095 pmap_inval_info_t info) 2096 { 2097 pt_entry_t oldpte; 2098 vm_page_t m; 2099 2100 pmap_inval_interlock(info, pmap, va); 2101 oldpte = pte_load_clear(ptq); 2102 pmap_inval_deinterlock(info, pmap); 2103 if (oldpte & PG_W) 2104 pmap->pm_stats.wired_count -= 1; 2105 /* 2106 * Machines that don't support invlpg, also don't support 2107 * PG_G. XXX PG_G is disabled for SMP so don't worry about 2108 * the SMP case. 2109 */ 2110 if (oldpte & PG_G) 2111 cpu_invlpg((void *)va); 2112 KKASSERT(pmap->pm_stats.resident_count > 0); 2113 --pmap->pm_stats.resident_count; 2114 if (oldpte & PG_MANAGED) { 2115 m = PHYS_TO_VM_PAGE(oldpte); 2116 if (oldpte & PG_M) { 2117 #if defined(PMAP_DIAGNOSTIC) 2118 if (pmap_nw_modified((pt_entry_t) oldpte)) { 2119 kprintf( 2120 "pmap_remove: modified page not writable: va: 0x%lx, pte: 0x%lx\n", 2121 va, oldpte); 2122 } 2123 #endif 2124 if (pmap_track_modified(va)) 2125 vm_page_dirty(m); 2126 } 2127 if (oldpte & PG_A) 2128 vm_page_flag_set(m, PG_REFERENCED); 2129 return pmap_remove_entry(pmap, m, va, info); 2130 } else { 2131 return pmap_unuse_pt(pmap, va, NULL, info); 2132 } 2133 2134 return 0; 2135 } 2136 2137 /* 2138 * pmap_remove_page: 2139 * 2140 * Remove a single page from a process address space. 2141 * 2142 * This function may not be called from an interrupt if the pmap is 2143 * not kernel_pmap. 2144 */ 2145 static 2146 void 2147 pmap_remove_page(struct pmap *pmap, vm_offset_t va, pmap_inval_info_t info) 2148 { 2149 pt_entry_t *pte; 2150 2151 pte = pmap_pte(pmap, va); 2152 if (pte == NULL) 2153 return; 2154 if ((*pte & PG_V) == 0) 2155 return; 2156 pmap_remove_pte(pmap, pte, va, info); 2157 } 2158 2159 /* 2160 * pmap_remove: 2161 * 2162 * Remove the given range of addresses from the specified map. 2163 * 2164 * It is assumed that the start and end are properly 2165 * rounded to the page size. 2166 * 2167 * This function may not be called from an interrupt if the pmap is 2168 * not kernel_pmap. 2169 */ 2170 void 2171 pmap_remove(struct pmap *pmap, vm_offset_t sva, vm_offset_t eva) 2172 { 2173 vm_offset_t va_next; 2174 pml4_entry_t *pml4e; 2175 pdp_entry_t *pdpe; 2176 pd_entry_t ptpaddr, *pde; 2177 pt_entry_t *pte; 2178 struct pmap_inval_info info; 2179 2180 if (pmap == NULL) 2181 return; 2182 2183 if (pmap->pm_stats.resident_count == 0) 2184 return; 2185 2186 pmap_inval_init(&info); 2187 2188 /* 2189 * special handling of removing one page. a very 2190 * common operation and easy to short circuit some 2191 * code. 
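 *
 * The short cut only applies when the address is backed by a regular
 * page table page; 2MB (PG_PS) mappings fall through to the full loop
 * below.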
2192 */ 2193 if (sva + PAGE_SIZE == eva) { 2194 pde = pmap_pde(pmap, sva); 2195 if (pde && (*pde & PG_PS) == 0) { 2196 pmap_remove_page(pmap, sva, &info); 2197 pmap_inval_done(&info); 2198 return; 2199 } 2200 } 2201 2202 for (; sva < eva; sva = va_next) { 2203 pml4e = pmap_pml4e(pmap, sva); 2204 if ((*pml4e & PG_V) == 0) { 2205 va_next = (sva + NBPML4) & ~PML4MASK; 2206 if (va_next < sva) 2207 va_next = eva; 2208 continue; 2209 } 2210 2211 pdpe = pmap_pml4e_to_pdpe(pml4e, sva); 2212 if ((*pdpe & PG_V) == 0) { 2213 va_next = (sva + NBPDP) & ~PDPMASK; 2214 if (va_next < sva) 2215 va_next = eva; 2216 continue; 2217 } 2218 2219 /* 2220 * Calculate index for next page table. 2221 */ 2222 va_next = (sva + NBPDR) & ~PDRMASK; 2223 if (va_next < sva) 2224 va_next = eva; 2225 2226 pde = pmap_pdpe_to_pde(pdpe, sva); 2227 ptpaddr = *pde; 2228 2229 /* 2230 * Weed out invalid mappings. 2231 */ 2232 if (ptpaddr == 0) 2233 continue; 2234 2235 /* 2236 * Check for large page. 2237 */ 2238 if ((ptpaddr & PG_PS) != 0) { 2239 /* JG FreeBSD has more complex treatment here */ 2240 pmap_inval_interlock(&info, pmap, -1); 2241 *pde = 0; 2242 pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE; 2243 pmap_inval_deinterlock(&info, pmap); 2244 continue; 2245 } 2246 2247 /* 2248 * Limit our scan to either the end of the va represented 2249 * by the current page table page, or to the end of the 2250 * range being removed. 2251 */ 2252 if (va_next > eva) 2253 va_next = eva; 2254 2255 /* 2256 * NOTE: pmap_remove_pte() can block. 2257 */ 2258 for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++, 2259 sva += PAGE_SIZE) { 2260 if (*pte == 0) 2261 continue; 2262 if (pmap_remove_pte(pmap, pte, sva, &info)) 2263 break; 2264 } 2265 } 2266 pmap_inval_done(&info); 2267 } 2268 2269 /* 2270 * pmap_remove_all: 2271 * 2272 * Removes this physical page from all physical maps in which it resides. 2273 * Reflects back modify bits to the pager. 2274 * 2275 * This routine may not be called from an interrupt. 2276 */ 2277 2278 static 2279 void 2280 pmap_remove_all(vm_page_t m) 2281 { 2282 struct pmap_inval_info info; 2283 pt_entry_t *pte, tpte; 2284 pv_entry_t pv; 2285 2286 if (!pmap_initialized || (m->flags & PG_FICTITIOUS)) 2287 return; 2288 2289 pmap_inval_init(&info); 2290 crit_enter(); 2291 while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { 2292 KKASSERT(pv->pv_pmap->pm_stats.resident_count > 0); 2293 --pv->pv_pmap->pm_stats.resident_count; 2294 2295 pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va); 2296 pmap_inval_interlock(&info, pv->pv_pmap, pv->pv_va); 2297 tpte = pte_load_clear(pte); 2298 if (tpte & PG_W) 2299 pv->pv_pmap->pm_stats.wired_count--; 2300 pmap_inval_deinterlock(&info, pv->pv_pmap); 2301 if (tpte & PG_A) 2302 vm_page_flag_set(m, PG_REFERENCED); 2303 2304 /* 2305 * Update the vm_page_t clean and reference bits. 
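 *
 * The modified bit is only reflected back to the vm_page_t for
 * addresses accepted by pmap_track_modified().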
2306 */ 2307 if (tpte & PG_M) { 2308 #if defined(PMAP_DIAGNOSTIC) 2309 if (pmap_nw_modified(tpte)) { 2310 kprintf( 2311 "pmap_remove_all: modified page not writable: va: 0x%lx, pte: 0x%lx\n", 2312 pv->pv_va, tpte); 2313 } 2314 #endif 2315 if (pmap_track_modified(pv->pv_va)) 2316 vm_page_dirty(m); 2317 } 2318 TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); 2319 TAILQ_REMOVE(&pv->pv_pmap->pm_pvlist, pv, pv_plist); 2320 ++pv->pv_pmap->pm_generation; 2321 m->md.pv_list_count--; 2322 KKASSERT(m->md.pv_list_count >= 0); 2323 if (TAILQ_EMPTY(&m->md.pv_list)) 2324 vm_page_flag_clear(m, PG_MAPPED | PG_WRITEABLE); 2325 pmap_unuse_pt(pv->pv_pmap, pv->pv_va, pv->pv_ptem, &info); 2326 free_pv_entry(pv); 2327 } 2328 crit_exit(); 2329 KKASSERT((m->flags & (PG_MAPPED|PG_WRITEABLE)) == 0); 2330 pmap_inval_done(&info); 2331 } 2332 2333 /* 2334 * pmap_protect: 2335 * 2336 * Set the physical protection on the specified range of this map 2337 * as requested. 2338 * 2339 * This function may not be called from an interrupt if the map is 2340 * not the kernel_pmap. 2341 */ 2342 void 2343 pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) 2344 { 2345 vm_offset_t va_next; 2346 pml4_entry_t *pml4e; 2347 pdp_entry_t *pdpe; 2348 pd_entry_t ptpaddr, *pde; 2349 pt_entry_t *pte; 2350 pmap_inval_info info; 2351 2352 /* JG review for NX */ 2353 2354 if (pmap == NULL) 2355 return; 2356 2357 if ((prot & VM_PROT_READ) == VM_PROT_NONE) { 2358 pmap_remove(pmap, sva, eva); 2359 return; 2360 } 2361 2362 if (prot & VM_PROT_WRITE) 2363 return; 2364 2365 pmap_inval_init(&info); 2366 2367 for (; sva < eva; sva = va_next) { 2368 2369 pml4e = pmap_pml4e(pmap, sva); 2370 if ((*pml4e & PG_V) == 0) { 2371 va_next = (sva + NBPML4) & ~PML4MASK; 2372 if (va_next < sva) 2373 va_next = eva; 2374 continue; 2375 } 2376 2377 pdpe = pmap_pml4e_to_pdpe(pml4e, sva); 2378 if ((*pdpe & PG_V) == 0) { 2379 va_next = (sva + NBPDP) & ~PDPMASK; 2380 if (va_next < sva) 2381 va_next = eva; 2382 continue; 2383 } 2384 2385 va_next = (sva + NBPDR) & ~PDRMASK; 2386 if (va_next < sva) 2387 va_next = eva; 2388 2389 pde = pmap_pdpe_to_pde(pdpe, sva); 2390 ptpaddr = *pde; 2391 2392 /* 2393 * Check for large page. 2394 */ 2395 if ((ptpaddr & PG_PS) != 0) { 2396 pmap_inval_interlock(&info, pmap, -1); 2397 *pde &= ~(PG_M|PG_RW); 2398 pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE; 2399 pmap_inval_deinterlock(&info, pmap); 2400 continue; 2401 } 2402 2403 /* 2404 * Weed out invalid mappings. Note: we assume that the page 2405 * directory table is always allocated, and in kernel virtual. 2406 */ 2407 if (ptpaddr == 0) 2408 continue; 2409 2410 if (va_next > eva) 2411 va_next = eva; 2412 2413 for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++, 2414 sva += PAGE_SIZE) { 2415 pt_entry_t pbits; 2416 pt_entry_t cbits; 2417 vm_page_t m; 2418 2419 /* 2420 * XXX non-optimal. Note also that there can be 2421 * no pmap_inval_flush() calls until after we modify 2422 * ptbase[sindex] (or otherwise we have to do another 2423 * pmap_inval_add() call). 
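 *
 * The pte is downgraded with an atomic cmpset loop below so that a cpu
 * concurrently setting PG_M or PG_A does not lose its update.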
2424 */ 2425 pmap_inval_interlock(&info, pmap, sva); 2426 again: 2427 pbits = *pte; 2428 cbits = pbits; 2429 if ((pbits & PG_V) == 0) { 2430 pmap_inval_deinterlock(&info, pmap); 2431 continue; 2432 } 2433 if (pbits & PG_MANAGED) { 2434 m = NULL; 2435 if (pbits & PG_A) { 2436 m = PHYS_TO_VM_PAGE(pbits & PG_FRAME); 2437 vm_page_flag_set(m, PG_REFERENCED); 2438 cbits &= ~PG_A; 2439 } 2440 if (pbits & PG_M) { 2441 if (pmap_track_modified(sva)) { 2442 if (m == NULL) 2443 m = PHYS_TO_VM_PAGE(pbits & PG_FRAME); 2444 vm_page_dirty(m); 2445 cbits &= ~PG_M; 2446 } 2447 } 2448 } 2449 cbits &= ~PG_RW; 2450 if (pbits != cbits && 2451 !atomic_cmpset_long(pte, pbits, cbits)) { 2452 goto again; 2453 } 2454 pmap_inval_deinterlock(&info, pmap); 2455 } 2456 } 2457 pmap_inval_done(&info); 2458 } 2459 2460 /* 2461 * Insert the given physical page (p) at 2462 * the specified virtual address (v) in the 2463 * target physical map with the protection requested. 2464 * 2465 * If specified, the page will be wired down, meaning 2466 * that the related pte can not be reclaimed. 2467 * 2468 * NB: This is the only routine which MAY NOT lazy-evaluate 2469 * or lose information. That is, this routine must actually 2470 * insert this page into the given map NOW. 2471 */ 2472 void 2473 pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, 2474 boolean_t wired) 2475 { 2476 vm_paddr_t pa; 2477 pd_entry_t *pde; 2478 pt_entry_t *pte; 2479 vm_paddr_t opa; 2480 pt_entry_t origpte, newpte; 2481 vm_page_t mpte; 2482 pmap_inval_info info; 2483 2484 if (pmap == NULL) 2485 return; 2486 2487 va = trunc_page(va); 2488 #ifdef PMAP_DIAGNOSTIC 2489 if (va >= KvaEnd) 2490 panic("pmap_enter: toobig"); 2491 if ((va >= UPT_MIN_ADDRESS) && (va < UPT_MAX_ADDRESS)) 2492 panic("pmap_enter: invalid to pmap_enter page table pages (va: 0x%lx)", va); 2493 #endif 2494 if (va < UPT_MAX_ADDRESS && pmap == &kernel_pmap) { 2495 kprintf("Warning: pmap_enter called on UVA with kernel_pmap\n"); 2496 #ifdef DDB 2497 db_print_backtrace(); 2498 #endif 2499 } 2500 if (va >= UPT_MAX_ADDRESS && pmap != &kernel_pmap) { 2501 kprintf("Warning: pmap_enter called on KVA without kernel_pmap\n"); 2502 #ifdef DDB 2503 db_print_backtrace(); 2504 #endif 2505 } 2506 2507 /* 2508 * In the case that a page table page is not 2509 * resident, we are creating it here. 2510 */ 2511 if (va < VM_MAX_USER_ADDRESS) 2512 mpte = pmap_allocpte(pmap, va); 2513 else 2514 mpte = NULL; 2515 2516 pmap_inval_init(&info); 2517 pde = pmap_pde(pmap, va); 2518 if (pde != NULL && (*pde & PG_V) != 0) { 2519 if ((*pde & PG_PS) != 0) 2520 panic("pmap_enter: attempted pmap_enter on 2MB page"); 2521 pte = pmap_pde_to_pte(pde, va); 2522 } else 2523 panic("pmap_enter: invalid page directory va=%#lx", va); 2524 2525 KKASSERT(pte != NULL); 2526 pa = VM_PAGE_TO_PHYS(m); 2527 origpte = *pte; 2528 opa = origpte & PG_FRAME; 2529 2530 /* 2531 * Mapping has not changed, must be protection or wiring change. 2532 */ 2533 if (origpte && (opa == pa)) { 2534 /* 2535 * Wiring change, just update stats. We don't worry about 2536 * wiring PT pages as they remain resident as long as there 2537 * are valid mappings in them. Hence, if a user page is wired, 2538 * the PT page will be also. 
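 *
 * Only pm_stats.wired_count is adjusted here; the PG_W bit itself is
 * set or cleared when the new pte is installed in the validate path
 * below.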
2539 */ 2540 if (wired && ((origpte & PG_W) == 0)) 2541 pmap->pm_stats.wired_count++; 2542 else if (!wired && (origpte & PG_W)) 2543 pmap->pm_stats.wired_count--; 2544 2545 #if defined(PMAP_DIAGNOSTIC) 2546 if (pmap_nw_modified(origpte)) { 2547 kprintf( 2548 "pmap_enter: modified page not writable: va: 0x%lx, pte: 0x%lx\n", 2549 va, origpte); 2550 } 2551 #endif 2552 2553 /* 2554 * Remove the extra pte reference. Note that we cannot 2555 * optimize the RO->RW case because we have adjusted the 2556 * wiring count above and may need to adjust the wiring 2557 * bits below. 2558 */ 2559 if (mpte) 2560 mpte->hold_count--; 2561 2562 /* 2563 * We might be turning off write access to the page, 2564 * so we go ahead and sense modify status. 2565 */ 2566 if (origpte & PG_MANAGED) { 2567 if ((origpte & PG_M) && pmap_track_modified(va)) { 2568 vm_page_t om; 2569 om = PHYS_TO_VM_PAGE(opa); 2570 vm_page_dirty(om); 2571 } 2572 pa |= PG_MANAGED; 2573 KKASSERT(m->flags & PG_MAPPED); 2574 } 2575 goto validate; 2576 } 2577 /* 2578 * Mapping has changed, invalidate old range and fall through to 2579 * handle validating new mapping. 2580 */ 2581 while (opa) { 2582 int err; 2583 err = pmap_remove_pte(pmap, pte, va, &info); 2584 if (err) 2585 panic("pmap_enter: pte vanished, va: 0x%lx", va); 2586 origpte = *pte; 2587 opa = origpte & PG_FRAME; 2588 if (opa) { 2589 kprintf("pmap_enter: Warning, raced pmap %p va %p\n", 2590 pmap, (void *)va); 2591 } 2592 } 2593 2594 /* 2595 * Enter on the PV list if part of our managed memory. Note that we 2596 * raise IPL while manipulating pv_table since pmap_enter can be 2597 * called at interrupt time. 2598 */ 2599 if (pmap_initialized && 2600 (m->flags & (PG_FICTITIOUS|PG_UNMANAGED)) == 0) { 2601 pmap_insert_entry(pmap, va, mpte, m); 2602 pa |= PG_MANAGED; 2603 vm_page_flag_set(m, PG_MAPPED); 2604 } 2605 2606 /* 2607 * Increment counters 2608 */ 2609 ++pmap->pm_stats.resident_count; 2610 if (wired) 2611 pmap->pm_stats.wired_count++; 2612 2613 validate: 2614 /* 2615 * Now validate mapping with desired protection/wiring. 2616 */ 2617 newpte = (pt_entry_t) (pa | pte_prot(pmap, prot) | PG_V); 2618 2619 if (wired) 2620 newpte |= PG_W; 2621 if (va < VM_MAX_USER_ADDRESS) 2622 newpte |= PG_U; 2623 if (pmap == &kernel_pmap) 2624 newpte |= pgeflag; 2625 2626 /* 2627 * if the mapping or permission bits are different, we need 2628 * to update the pte. 2629 */ 2630 if ((origpte & ~(PG_M|PG_A)) != newpte) { 2631 pmap_inval_interlock(&info, pmap, va); 2632 *pte = newpte | PG_A; 2633 pmap_inval_deinterlock(&info, pmap); 2634 if (newpte & PG_RW) 2635 vm_page_flag_set(m, PG_WRITEABLE); 2636 } 2637 KKASSERT((newpte & PG_MANAGED) == 0 || (m->flags & PG_MAPPED)); 2638 pmap_inval_done(&info); 2639 } 2640 2641 /* 2642 * This code works like pmap_enter() but assumes VM_PROT_READ and not-wired. 2643 * This code also assumes that the pmap has no pre-existing entry for this 2644 * VA. 2645 * 2646 * This code currently may only be used on user pmaps, not kernel_pmap. 
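 *
 * Because the target pte must be invalid (unmapped) before this routine
 * installs it, no TLB invalidation is required for the transition.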
2647 */ 2648 void 2649 pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m) 2650 { 2651 pt_entry_t *pte; 2652 vm_paddr_t pa; 2653 vm_page_t mpte; 2654 vm_pindex_t ptepindex; 2655 pd_entry_t *ptepa; 2656 pmap_inval_info info; 2657 2658 pmap_inval_init(&info); 2659 2660 if (va < UPT_MAX_ADDRESS && pmap == &kernel_pmap) { 2661 kprintf("Warning: pmap_enter_quick called on UVA with kernel_pmap\n"); 2662 #ifdef DDB 2663 db_print_backtrace(); 2664 #endif 2665 } 2666 if (va >= UPT_MAX_ADDRESS && pmap != &kernel_pmap) { 2667 kprintf("Warning: pmap_enter_quick called on KVA without kernel_pmap\n"); 2668 #ifdef DDB 2669 db_print_backtrace(); 2670 #endif 2671 } 2672 2673 KKASSERT(va < UPT_MIN_ADDRESS); /* assert used on user pmaps only */ 2674 2675 /* 2676 * Calculate the page table page (mpte), allocating it if necessary. 2677 * 2678 * A held page table page (mpte), or NULL, is passed onto the 2679 * section following. 2680 */ 2681 if (va < VM_MAX_USER_ADDRESS) { 2682 /* 2683 * Calculate pagetable page index 2684 */ 2685 ptepindex = pmap_pde_pindex(va); 2686 2687 do { 2688 /* 2689 * Get the page directory entry 2690 */ 2691 ptepa = pmap_pde(pmap, va); 2692 2693 /* 2694 * If the page table page is mapped, we just increment 2695 * the hold count, and activate it. 2696 */ 2697 if (ptepa && (*ptepa & PG_V) != 0) { 2698 if (*ptepa & PG_PS) 2699 panic("pmap_enter_quick: unexpected mapping into 2MB page"); 2700 // if (pmap->pm_ptphint && 2701 // (pmap->pm_ptphint->pindex == ptepindex)) { 2702 // mpte = pmap->pm_ptphint; 2703 // } else { 2704 mpte = pmap_page_lookup( pmap->pm_pteobj, ptepindex); 2705 pmap->pm_ptphint = mpte; 2706 // } 2707 if (mpte) 2708 mpte->hold_count++; 2709 } else { 2710 mpte = _pmap_allocpte(pmap, ptepindex); 2711 } 2712 } while (mpte == NULL); 2713 } else { 2714 mpte = NULL; 2715 /* this code path is not yet used */ 2716 } 2717 2718 /* 2719 * With a valid (and held) page directory page, we can just use 2720 * vtopte() to get to the pte. If the pte is already present 2721 * we do not disturb it. 2722 */ 2723 pte = vtopte(va); 2724 if (*pte & PG_V) { 2725 if (mpte) 2726 pmap_unwire_pte_hold(pmap, va, mpte, &info); 2727 pa = VM_PAGE_TO_PHYS(m); 2728 KKASSERT(((*pte ^ pa) & PG_FRAME) == 0); 2729 pmap_inval_done(&info); 2730 return; 2731 } 2732 2733 /* 2734 * Enter on the PV list if part of our managed memory 2735 */ 2736 if ((m->flags & (PG_FICTITIOUS|PG_UNMANAGED)) == 0) { 2737 pmap_insert_entry(pmap, va, mpte, m); 2738 vm_page_flag_set(m, PG_MAPPED); 2739 } 2740 2741 /* 2742 * Increment counters 2743 */ 2744 ++pmap->pm_stats.resident_count; 2745 2746 pa = VM_PAGE_TO_PHYS(m); 2747 2748 /* 2749 * Now validate mapping with RO protection 2750 */ 2751 if (m->flags & (PG_FICTITIOUS|PG_UNMANAGED)) 2752 *pte = pa | PG_V | PG_U; 2753 else 2754 *pte = pa | PG_V | PG_U | PG_MANAGED; 2755 /* pmap_inval_add(&info, pmap, va); shouldn't be needed inval->valid */ 2756 pmap_inval_done(&info); 2757 } 2758 2759 /* 2760 * Make a temporary mapping for a physical address. This is only intended 2761 * to be used for panic dumps. 2762 */ 2763 /* JG Needed on x86_64? */ 2764 void * 2765 pmap_kenter_temporary(vm_paddr_t pa, int i) 2766 { 2767 pmap_kenter((vm_offset_t)crashdumpmap + (i * PAGE_SIZE), pa); 2768 return ((void *)crashdumpmap); 2769 } 2770 2771 #define MAX_INIT_PT (96) 2772 2773 /* 2774 * This routine preloads the ptes for a given object into the specified pmap. 2775 * This eliminates the blast of soft faults on process startup and 2776 * immediately after an mmap. 
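 *
 * Only resident, fully valid, unbusied pages are entered, and they are
 * mapped read-only via pmap_enter_quick().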
2777 */ 2778 static int pmap_object_init_pt_callback(vm_page_t p, void *data); 2779 2780 void 2781 pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_prot_t prot, 2782 vm_object_t object, vm_pindex_t pindex, 2783 vm_size_t size, int limit) 2784 { 2785 struct rb_vm_page_scan_info info; 2786 struct lwp *lp; 2787 vm_size_t psize; 2788 2789 /* 2790 * We can't preinit if read access isn't set or there is no pmap 2791 * or object. 2792 */ 2793 if ((prot & VM_PROT_READ) == 0 || pmap == NULL || object == NULL) 2794 return; 2795 2796 /* 2797 * We can't preinit if the pmap is not the current pmap 2798 */ 2799 lp = curthread->td_lwp; 2800 if (lp == NULL || pmap != vmspace_pmap(lp->lwp_vmspace)) 2801 return; 2802 2803 psize = x86_64_btop(size); 2804 2805 if ((object->type != OBJT_VNODE) || 2806 ((limit & MAP_PREFAULT_PARTIAL) && (psize > MAX_INIT_PT) && 2807 (object->resident_page_count > MAX_INIT_PT))) { 2808 return; 2809 } 2810 2811 if (psize + pindex > object->size) { 2812 if (object->size < pindex) 2813 return; 2814 psize = object->size - pindex; 2815 } 2816 2817 if (psize == 0) 2818 return; 2819 2820 /* 2821 * Use a red-black scan to traverse the requested range and load 2822 * any valid pages found into the pmap. 2823 * 2824 * We cannot safely scan the object's memq unless we are in a 2825 * critical section since interrupts can remove pages from objects. 2826 */ 2827 info.start_pindex = pindex; 2828 info.end_pindex = pindex + psize - 1; 2829 info.limit = limit; 2830 info.mpte = NULL; 2831 info.addr = addr; 2832 info.pmap = pmap; 2833 2834 crit_enter(); 2835 vm_page_rb_tree_RB_SCAN(&object->rb_memq, rb_vm_page_scancmp, 2836 pmap_object_init_pt_callback, &info); 2837 crit_exit(); 2838 } 2839 2840 static 2841 int 2842 pmap_object_init_pt_callback(vm_page_t p, void *data) 2843 { 2844 struct rb_vm_page_scan_info *info = data; 2845 vm_pindex_t rel_index; 2846 /* 2847 * don't allow an madvise to blow away our really 2848 * free pages allocating pv entries. 2849 */ 2850 if ((info->limit & MAP_PREFAULT_MADVISE) && 2851 vmstats.v_free_count < vmstats.v_free_reserved) { 2852 return(-1); 2853 } 2854 if (((p->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL) && 2855 (p->busy == 0) && (p->flags & (PG_BUSY | PG_FICTITIOUS)) == 0) { 2856 if ((p->queue - p->pc) == PQ_CACHE) 2857 vm_page_deactivate(p); 2858 vm_page_busy(p); 2859 rel_index = p->pindex - info->start_pindex; 2860 pmap_enter_quick(info->pmap, 2861 info->addr + x86_64_ptob(rel_index), p); 2862 vm_page_wakeup(p); 2863 } 2864 return(0); 2865 } 2866 2867 /* 2868 * Return TRUE if the pmap is in shape to trivially 2869 * pre-fault the specified address. 2870 * 2871 * Returns FALSE if it would be non-trivial or if a 2872 * pte is already loaded into the slot. 2873 */ 2874 int 2875 pmap_prefault_ok(pmap_t pmap, vm_offset_t addr) 2876 { 2877 pt_entry_t *pte; 2878 pd_entry_t *pde; 2879 2880 pde = pmap_pde(pmap, addr); 2881 if (pde == NULL || *pde == 0) 2882 return(0); 2883 2884 pte = vtopte(addr); 2885 if (*pte) 2886 return(0); 2887 2888 return(1); 2889 } 2890 2891 /* 2892 * Routine: pmap_change_wiring 2893 * Function: Change the wiring attribute for a map/virtual-address 2894 * pair. 2895 * In/out conditions: 2896 * The mapping must already exist in the pmap. 
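 *
 * Only the software PG_W bit and the pmap's wired_count are updated;
 * no TLB invalidation is required because wiring is not a hardware
 * characteristic of the mapping.  An illustrative call (names are
 * examples only) looks like:
 *
 *	pmap_change_wiring(vmspace_pmap(lp->lwp_vmspace), va, TRUE);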
2897 */ 2898 void 2899 pmap_change_wiring(pmap_t pmap, vm_offset_t va, boolean_t wired) 2900 { 2901 pt_entry_t *pte; 2902 2903 if (pmap == NULL) 2904 return; 2905 2906 pte = pmap_pte(pmap, va); 2907 2908 if (wired && !pmap_pte_w(pte)) 2909 pmap->pm_stats.wired_count++; 2910 else if (!wired && pmap_pte_w(pte)) 2911 pmap->pm_stats.wired_count--; 2912 2913 /* 2914 * Wiring is not a hardware characteristic so there is no need to 2915 * invalidate TLB. However, in an SMP environment we must use 2916 * a locked bus cycle to update the pte (if we are not using 2917 * the pmap_inval_*() API that is)... it's ok to do this for simple 2918 * wiring changes. 2919 */ 2920 #ifdef SMP 2921 if (wired) 2922 atomic_set_long(pte, PG_W); 2923 else 2924 atomic_clear_long(pte, PG_W); 2925 #else 2926 if (wired) 2927 atomic_set_long_nonlocked(pte, PG_W); 2928 else 2929 atomic_clear_long_nonlocked(pte, PG_W); 2930 #endif 2931 } 2932 2933 2934 2935 /* 2936 * Copy the range specified by src_addr/len 2937 * from the source map to the range dst_addr/len 2938 * in the destination map. 2939 * 2940 * This routine is only advisory and need not do anything. 2941 */ 2942 void 2943 pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, 2944 vm_size_t len, vm_offset_t src_addr) 2945 { 2946 return; 2947 #if 0 2948 pmap_inval_info info; 2949 vm_offset_t addr; 2950 vm_offset_t end_addr = src_addr + len; 2951 vm_offset_t pdnxt; 2952 pd_entry_t src_frame, dst_frame; 2953 vm_page_t m; 2954 2955 if (dst_addr != src_addr) 2956 return; 2957 #if JGPMAP32 2958 src_frame = src_pmap->pm_pdir[PTDPTDI] & PG_FRAME; 2959 if (src_frame != (PTDpde & PG_FRAME)) { 2960 return; 2961 } 2962 2963 dst_frame = dst_pmap->pm_pdir[PTDPTDI] & PG_FRAME; 2964 if (dst_frame != (APTDpde & PG_FRAME)) { 2965 APTDpde = (pd_entry_t) (dst_frame | PG_RW | PG_V); 2966 /* The page directory is not shared between CPUs */ 2967 cpu_invltlb(); 2968 } 2969 #endif 2970 pmap_inval_init(&info); 2971 pmap_inval_add(&info, dst_pmap, -1); 2972 pmap_inval_add(&info, src_pmap, -1); 2973 2974 /* 2975 * critical section protection is required to maintain the page/object 2976 * association, interrupts can free pages and remove them from 2977 * their objects. 2978 */ 2979 crit_enter(); 2980 for (addr = src_addr; addr < end_addr; addr = pdnxt) { 2981 pt_entry_t *src_pte, *dst_pte; 2982 vm_page_t dstmpte, srcmpte; 2983 vm_offset_t srcptepaddr; 2984 vm_pindex_t ptepindex; 2985 2986 if (addr >= UPT_MIN_ADDRESS) 2987 panic("pmap_copy: invalid to pmap_copy page tables\n"); 2988 2989 /* 2990 * Don't let optional prefaulting of pages make us go 2991 * way below the low water mark of free pages or way 2992 * above high water mark of used pv entries. 
2993 */ 2994 if (vmstats.v_free_count < vmstats.v_free_reserved || 2995 pv_entry_count > pv_entry_high_water) 2996 break; 2997 2998 pdnxt = ((addr + PAGE_SIZE*NPTEPG) & ~(PAGE_SIZE*NPTEPG - 1)); 2999 ptepindex = addr >> PDRSHIFT; 3000 3001 #if JGPMAP32 3002 srcptepaddr = (vm_offset_t) src_pmap->pm_pdir[ptepindex]; 3003 #endif 3004 if (srcptepaddr == 0) 3005 continue; 3006 3007 if (srcptepaddr & PG_PS) { 3008 #if JGPMAP32 3009 if (dst_pmap->pm_pdir[ptepindex] == 0) { 3010 dst_pmap->pm_pdir[ptepindex] = (pd_entry_t) srcptepaddr; 3011 dst_pmap->pm_stats.resident_count += NBPDR / PAGE_SIZE; 3012 } 3013 #endif 3014 continue; 3015 } 3016 3017 srcmpte = vm_page_lookup(src_pmap->pm_pteobj, ptepindex); 3018 if ((srcmpte == NULL) || (srcmpte->hold_count == 0) || 3019 (srcmpte->flags & PG_BUSY)) { 3020 continue; 3021 } 3022 3023 if (pdnxt > end_addr) 3024 pdnxt = end_addr; 3025 3026 src_pte = vtopte(addr); 3027 #if JGPMAP32 3028 dst_pte = avtopte(addr); 3029 #endif 3030 while (addr < pdnxt) { 3031 pt_entry_t ptetemp; 3032 3033 ptetemp = *src_pte; 3034 /* 3035 * we only virtual copy managed pages 3036 */ 3037 if ((ptetemp & PG_MANAGED) != 0) { 3038 /* 3039 * We have to check after allocpte for the 3040 * pte still being around... allocpte can 3041 * block. 3042 * 3043 * pmap_allocpte() can block. If we lose 3044 * our page directory mappings we stop. 3045 */ 3046 dstmpte = pmap_allocpte(dst_pmap, addr); 3047 3048 #if JGPMAP32 3049 if (src_frame != (PTDpde & PG_FRAME) || 3050 dst_frame != (APTDpde & PG_FRAME) 3051 ) { 3052 kprintf("WARNING: pmap_copy: detected and corrected race\n"); 3053 pmap_unwire_pte_hold(dst_pmap, dstmpte, &info); 3054 goto failed; 3055 } else if ((*dst_pte == 0) && 3056 (ptetemp = *src_pte) != 0 && 3057 (ptetemp & PG_MANAGED)) { 3058 /* 3059 * Clear the modified and 3060 * accessed (referenced) bits 3061 * during the copy. 3062 */ 3063 m = PHYS_TO_VM_PAGE(ptetemp); 3064 *dst_pte = ptetemp & ~(PG_M | PG_A); 3065 ++dst_pmap->pm_stats.resident_count; 3066 pmap_insert_entry(dst_pmap, addr, 3067 dstmpte, m); 3068 KKASSERT(m->flags & PG_MAPPED); 3069 } else { 3070 kprintf("WARNING: pmap_copy: dst_pte race detected and corrected\n"); 3071 pmap_unwire_pte_hold(dst_pmap, dstmpte, &info); 3072 goto failed; 3073 } 3074 #endif 3075 if (dstmpte->hold_count >= srcmpte->hold_count) 3076 break; 3077 } 3078 addr += PAGE_SIZE; 3079 src_pte++; 3080 dst_pte++; 3081 } 3082 } 3083 failed: 3084 crit_exit(); 3085 pmap_inval_done(&info); 3086 #endif 3087 } 3088 3089 /* 3090 * pmap_zero_page: 3091 * 3092 * Zero the specified physical page. 3093 * 3094 * This function may be called from an interrupt and no locking is 3095 * required. 3096 */ 3097 void 3098 pmap_zero_page(vm_paddr_t phys) 3099 { 3100 vm_offset_t va = PHYS_TO_DMAP(phys); 3101 3102 pagezero((void *)va); 3103 } 3104 3105 /* 3106 * pmap_page_assertzero: 3107 * 3108 * Assert that a page is empty, panic if it isn't. 3109 */ 3110 void 3111 pmap_page_assertzero(vm_paddr_t phys) 3112 { 3113 vm_offset_t virt = PHYS_TO_DMAP(phys); 3114 int i; 3115 3116 for (i = 0; i < PAGE_SIZE; i += sizeof(long)) { 3117 if (*(long *)((char *)virt + i) != 0) { 3118 panic("pmap_page_assertzero() @ %p not zero!\n", (void *)virt); 3119 } 3120 } 3121 } 3122 3123 /* 3124 * pmap_zero_page: 3125 * 3126 * Zero part of a physical page by mapping it into memory and clearing 3127 * its contents with bzero. 3128 * 3129 * off and size may not cover an area beyond a single hardware page. 
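 *
 * The page is accessed through the DMAP region, so no temporary kernel
 * mapping needs to be set up or torn down.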
 */
void
pmap_zero_page_area(vm_paddr_t phys, int off, int size)
{
	vm_offset_t virt = PHYS_TO_DMAP(phys);

	bzero((char *)virt + off, size);
}

/*
 * pmap_copy_page:
 *
 *	Copy the physical page from the source PA to the target PA.
 *	This function may be called from an interrupt.  No locking
 *	is required.
 */
void
pmap_copy_page(vm_paddr_t src, vm_paddr_t dst)
{
	vm_offset_t src_virt, dst_virt;

	src_virt = PHYS_TO_DMAP(src);
	dst_virt = PHYS_TO_DMAP(dst);
	bcopy((void *)src_virt, (void *)dst_virt, PAGE_SIZE);
}

/*
 * pmap_copy_page_frag:
 *
 *	Copy a fragment of a physical page, 'bytes' bytes starting at the
 *	page offsets of src and dst.  This function may be called from an
 *	interrupt.  No locking is required.
 */
void
pmap_copy_page_frag(vm_paddr_t src, vm_paddr_t dst, size_t bytes)
{
	vm_offset_t src_virt, dst_virt;

	src_virt = PHYS_TO_DMAP(src);
	dst_virt = PHYS_TO_DMAP(dst);

	bcopy((char *)src_virt + (src & PAGE_MASK),
	      (char *)dst_virt + (dst & PAGE_MASK),
	      bytes);
}

/*
 * Returns true if the pmap's pv is one of the first 16 pvs linked to
 * from this page.  This count may be changed upwards or downwards in
 * the future; it is only necessary that true be returned for a small
 * subset of pmaps for proper page aging.
 */
boolean_t
pmap_page_exists_quick(pmap_t pmap, vm_page_t m)
{
	pv_entry_t pv;
	int loops = 0;

	if (!pmap_initialized || (m->flags & PG_FICTITIOUS))
		return FALSE;

	crit_enter();

	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
		if (pv->pv_pmap == pmap) {
			crit_exit();
			return TRUE;
		}
		loops++;
		if (loops >= 16)
			break;
	}
	crit_exit();
	return (FALSE);
}

/*
 * Remove all pages from the specified address space; this aids process
 * exit speeds.  Also, this code is special cased for the current
 * process only, but can have the more generic (and slightly slower)
 * mode enabled.  This is much faster than pmap_remove in the case of
 * running down an entire address space.
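 *
 * Wired pages are skipped, and the pv list scan is restarted if other
 * removals change pm_generation while this routine blocks.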
3214 */ 3215 void 3216 pmap_remove_pages(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 3217 { 3218 struct lwp *lp; 3219 pt_entry_t *pte, tpte; 3220 pv_entry_t pv, npv; 3221 vm_page_t m; 3222 pmap_inval_info info; 3223 int iscurrentpmap; 3224 int save_generation; 3225 3226 lp = curthread->td_lwp; 3227 if (lp && pmap == vmspace_pmap(lp->lwp_vmspace)) 3228 iscurrentpmap = 1; 3229 else 3230 iscurrentpmap = 0; 3231 3232 pmap_inval_init(&info); 3233 for (pv = TAILQ_FIRST(&pmap->pm_pvlist); pv; pv = npv) { 3234 if (pv->pv_va >= eva || pv->pv_va < sva) { 3235 npv = TAILQ_NEXT(pv, pv_plist); 3236 continue; 3237 } 3238 3239 KKASSERT(pmap == pv->pv_pmap); 3240 3241 if (iscurrentpmap) 3242 pte = vtopte(pv->pv_va); 3243 else 3244 pte = pmap_pte_quick(pmap, pv->pv_va); 3245 pmap_inval_interlock(&info, pmap, pv->pv_va); 3246 3247 /* 3248 * We cannot remove wired pages from a process' mapping 3249 * at this time 3250 */ 3251 if (*pte & PG_W) { 3252 pmap_inval_deinterlock(&info, pmap); 3253 npv = TAILQ_NEXT(pv, pv_plist); 3254 continue; 3255 } 3256 tpte = pte_load_clear(pte); 3257 3258 m = PHYS_TO_VM_PAGE(tpte & PG_FRAME); 3259 3260 KASSERT(m < &vm_page_array[vm_page_array_size], 3261 ("pmap_remove_pages: bad tpte %lx", tpte)); 3262 3263 KKASSERT(pmap->pm_stats.resident_count > 0); 3264 --pmap->pm_stats.resident_count; 3265 pmap_inval_deinterlock(&info, pmap); 3266 3267 /* 3268 * Update the vm_page_t clean and reference bits. 3269 */ 3270 if (tpte & PG_M) { 3271 vm_page_dirty(m); 3272 } 3273 3274 npv = TAILQ_NEXT(pv, pv_plist); 3275 TAILQ_REMOVE(&pmap->pm_pvlist, pv, pv_plist); 3276 save_generation = ++pmap->pm_generation; 3277 3278 m->md.pv_list_count--; 3279 TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); 3280 if (TAILQ_EMPTY(&m->md.pv_list)) 3281 vm_page_flag_clear(m, PG_MAPPED | PG_WRITEABLE); 3282 3283 pmap_unuse_pt(pmap, pv->pv_va, pv->pv_ptem, &info); 3284 free_pv_entry(pv); 3285 3286 /* 3287 * Restart the scan if we blocked during the unuse or free 3288 * calls and other removals were made. 3289 */ 3290 if (save_generation != pmap->pm_generation) { 3291 kprintf("Warning: pmap_remove_pages race-A avoided\n"); 3292 npv = TAILQ_FIRST(&pmap->pm_pvlist); 3293 } 3294 } 3295 pmap_inval_done(&info); 3296 } 3297 3298 /* 3299 * pmap_testbit tests bits in pte's 3300 * note that the testbit/clearbit routines are inline, 3301 * and a lot of things compile-time evaluate. 3302 */ 3303 static 3304 boolean_t 3305 pmap_testbit(vm_page_t m, int bit) 3306 { 3307 pv_entry_t pv; 3308 pt_entry_t *pte; 3309 3310 if (!pmap_initialized || (m->flags & PG_FICTITIOUS)) 3311 return FALSE; 3312 3313 if (TAILQ_FIRST(&m->md.pv_list) == NULL) 3314 return FALSE; 3315 3316 crit_enter(); 3317 3318 TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { 3319 /* 3320 * if the bit being tested is the modified bit, then 3321 * mark clean_map and ptes as never 3322 * modified. 
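 *
 * For PG_A and PG_M, only addresses accepted by pmap_track_modified()
 * are tested.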
		 */
		if (bit & (PG_A|PG_M)) {
			if (!pmap_track_modified(pv->pv_va))
				continue;
		}

#if defined(PMAP_DIAGNOSTIC)
		if (pv->pv_pmap == NULL) {
			kprintf("Null pmap (tb) at va: 0x%lx\n", pv->pv_va);
			continue;
		}
#endif
		pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va);
		if (*pte & bit) {
			crit_exit();
			return TRUE;
		}
	}
	crit_exit();
	return (FALSE);
}

/*
 * Clear the specified bit in the ptes of all mappings of the given page.
 */
static __inline
void
pmap_clearbit(vm_page_t m, int bit)
{
	struct pmap_inval_info info;
	pv_entry_t pv;
	pt_entry_t *pte;
	pt_entry_t pbits;

	if (!pmap_initialized || (m->flags & PG_FICTITIOUS))
		return;

	pmap_inval_init(&info);

	/*
	 * Loop over all current mappings, clearing bits as appropriate.
	 * If we are setting a page read-only, do we need to clear the VAC?
	 */
	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
		/*
		 * Don't write-protect pager mappings.
		 */
		if (bit == PG_RW) {
			if (!pmap_track_modified(pv->pv_va))
				continue;
		}

#if defined(PMAP_DIAGNOSTIC)
		if (pv->pv_pmap == NULL) {
			kprintf("Null pmap (cb) at va: 0x%lx\n", pv->pv_va);
			continue;
		}
#endif

		/*
		 * Careful here.  We can use a locked bus instruction to
		 * clear PG_A or PG_M safely but we need to synchronize
		 * with the target cpus when we mess with PG_RW.
		 *
		 * We do not have to force synchronization when clearing
		 * PG_M even for PTEs generated via virtual memory maps,
		 * because the virtual kernel will invalidate the pmap
		 * entry when/if it needs to resynchronize the Modify bit.
		 */
		if (bit & PG_RW)
			pmap_inval_interlock(&info, pv->pv_pmap, pv->pv_va);
		pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va);
again:
		pbits = *pte;
		if (pbits & bit) {
			if (bit == PG_RW) {
				if (pbits & PG_M) {
					vm_page_dirty(m);
					atomic_clear_long(pte, PG_M|PG_RW);
				} else {
					/*
					 * The cpu may be trying to set PG_M
					 * simultaneously with our clearing
					 * of PG_RW.
					 */
					if (!atomic_cmpset_long(pte, pbits,
								pbits & ~PG_RW))
						goto again;
				}
			} else if (bit == PG_M) {
				/*
				 * We could also clear PG_RW here to force
				 * a fault on write to redetect PG_M for
				 * virtual kernels, but it isn't necessary
				 * since virtual kernels invalidate the pte
				 * when they clear the VPTE_M bit in their
				 * virtual page tables.
				 */
				atomic_clear_long(pte, PG_M);
			} else {
				atomic_clear_long(pte, bit);
			}
		}
		if (bit & PG_RW)
			pmap_inval_deinterlock(&info, pv->pv_pmap);
	}
	pmap_inval_done(&info);
}

/*
 * pmap_page_protect:
 *
 *	Lower the permission for all mappings to a given page.
 */
void
pmap_page_protect(vm_page_t m, vm_prot_t prot)
{
	/* JG NX support? */
	if ((prot & VM_PROT_WRITE) == 0) {
		if (prot & (VM_PROT_READ | VM_PROT_EXECUTE)) {
			pmap_clearbit(m, PG_RW);
			vm_page_flag_clear(m, PG_WRITEABLE);
		} else {
			pmap_remove_all(m);
		}
	}
}

vm_paddr_t
pmap_phys_address(vm_pindex_t ppn)
{
	return (x86_64_ptob(ppn));
}

/*
 * pmap_ts_referenced:
 *
 *	Return a count of reference bits for a page, clearing those bits.
3461 * It is not necessary for every reference bit to be cleared, but it 3462 * is necessary that 0 only be returned when there are truly no 3463 * reference bits set. 3464 * 3465 * XXX: The exact number of bits to check and clear is a matter that 3466 * should be tested and standardized at some point in the future for 3467 * optimal aging of shared pages. 3468 */ 3469 int 3470 pmap_ts_referenced(vm_page_t m) 3471 { 3472 pv_entry_t pv, pvf, pvn; 3473 pt_entry_t *pte; 3474 int rtval = 0; 3475 3476 if (!pmap_initialized || (m->flags & PG_FICTITIOUS)) 3477 return (rtval); 3478 3479 crit_enter(); 3480 3481 if ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { 3482 3483 pvf = pv; 3484 3485 do { 3486 pvn = TAILQ_NEXT(pv, pv_list); 3487 3488 crit_enter(); 3489 TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); 3490 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list); 3491 crit_exit(); 3492 3493 if (!pmap_track_modified(pv->pv_va)) 3494 continue; 3495 3496 pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va); 3497 3498 if (pte && (*pte & PG_A)) { 3499 #ifdef SMP 3500 atomic_clear_long(pte, PG_A); 3501 #else 3502 atomic_clear_long_nonlocked(pte, PG_A); 3503 #endif 3504 rtval++; 3505 if (rtval > 4) { 3506 break; 3507 } 3508 } 3509 } while ((pv = pvn) != NULL && pv != pvf); 3510 } 3511 crit_exit(); 3512 3513 return (rtval); 3514 } 3515 3516 /* 3517 * pmap_is_modified: 3518 * 3519 * Return whether or not the specified physical page was modified 3520 * in any physical maps. 3521 */ 3522 boolean_t 3523 pmap_is_modified(vm_page_t m) 3524 { 3525 return pmap_testbit(m, PG_M); 3526 } 3527 3528 /* 3529 * Clear the modify bits on the specified physical page. 3530 */ 3531 void 3532 pmap_clear_modify(vm_page_t m) 3533 { 3534 pmap_clearbit(m, PG_M); 3535 } 3536 3537 /* 3538 * pmap_clear_reference: 3539 * 3540 * Clear the reference bit on the specified physical page. 3541 */ 3542 void 3543 pmap_clear_reference(vm_page_t m) 3544 { 3545 pmap_clearbit(m, PG_A); 3546 } 3547 3548 /* 3549 * Miscellaneous support routines follow 3550 */ 3551 3552 static 3553 void 3554 i386_protection_init(void) 3555 { 3556 int *kp, prot; 3557 3558 /* JG NX support may go here; No VM_PROT_EXECUTE ==> set NX bit */ 3559 kp = protection_codes; 3560 for (prot = 0; prot < 8; prot++) { 3561 switch (prot) { 3562 case VM_PROT_NONE | VM_PROT_NONE | VM_PROT_NONE: 3563 /* 3564 * Read access is also 0. There isn't any execute bit, 3565 * so just make it readable. 3566 */ 3567 case VM_PROT_READ | VM_PROT_NONE | VM_PROT_NONE: 3568 case VM_PROT_READ | VM_PROT_NONE | VM_PROT_EXECUTE: 3569 case VM_PROT_NONE | VM_PROT_NONE | VM_PROT_EXECUTE: 3570 *kp++ = 0; 3571 break; 3572 case VM_PROT_NONE | VM_PROT_WRITE | VM_PROT_NONE: 3573 case VM_PROT_NONE | VM_PROT_WRITE | VM_PROT_EXECUTE: 3574 case VM_PROT_READ | VM_PROT_WRITE | VM_PROT_NONE: 3575 case VM_PROT_READ | VM_PROT_WRITE | VM_PROT_EXECUTE: 3576 *kp++ = PG_RW; 3577 break; 3578 } 3579 } 3580 } 3581 3582 /* 3583 * Map a set of physical memory pages into the kernel virtual 3584 * address space. Return a pointer to where it is mapped. This 3585 * routine is intended to be used for mapping device memory, 3586 * NOT real memory. 3587 * 3588 * NOTE: we can't use pgeflag unless we invalidate the pages one at 3589 * a time. 
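 *
 * The KVA is taken from kernel_map with kmem_alloc_nofault() and the
 * TLB is flushed on all cpus once the ptes have been written.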
3590 */ 3591 void * 3592 pmap_mapdev(vm_paddr_t pa, vm_size_t size) 3593 { 3594 vm_offset_t va, tmpva, offset; 3595 pt_entry_t *pte; 3596 3597 offset = pa & PAGE_MASK; 3598 size = roundup(offset + size, PAGE_SIZE); 3599 3600 va = kmem_alloc_nofault(&kernel_map, size, PAGE_SIZE); 3601 if (va == 0) 3602 panic("pmap_mapdev: Couldn't alloc kernel virtual memory"); 3603 3604 pa = pa & ~PAGE_MASK; 3605 for (tmpva = va; size > 0;) { 3606 pte = vtopte(tmpva); 3607 *pte = pa | PG_RW | PG_V; /* | pgeflag; */ 3608 size -= PAGE_SIZE; 3609 tmpva += PAGE_SIZE; 3610 pa += PAGE_SIZE; 3611 } 3612 cpu_invltlb(); 3613 smp_invltlb(); 3614 3615 return ((void *)(va + offset)); 3616 } 3617 3618 void * 3619 pmap_mapdev_uncacheable(vm_paddr_t pa, vm_size_t size) 3620 { 3621 vm_offset_t va, tmpva, offset; 3622 pt_entry_t *pte; 3623 3624 offset = pa & PAGE_MASK; 3625 size = roundup(offset + size, PAGE_SIZE); 3626 3627 va = kmem_alloc_nofault(&kernel_map, size, PAGE_SIZE); 3628 if (va == 0) 3629 panic("pmap_mapdev: Couldn't alloc kernel virtual memory"); 3630 3631 pa = pa & ~PAGE_MASK; 3632 for (tmpva = va; size > 0;) { 3633 pte = vtopte(tmpva); 3634 *pte = pa | PG_RW | PG_V | PG_N; /* | pgeflag; */ 3635 size -= PAGE_SIZE; 3636 tmpva += PAGE_SIZE; 3637 pa += PAGE_SIZE; 3638 } 3639 cpu_invltlb(); 3640 smp_invltlb(); 3641 3642 return ((void *)(va + offset)); 3643 } 3644 3645 void 3646 pmap_unmapdev(vm_offset_t va, vm_size_t size) 3647 { 3648 vm_offset_t base, offset; 3649 3650 base = va & ~PAGE_MASK; 3651 offset = va & PAGE_MASK; 3652 size = roundup(offset + size, PAGE_SIZE); 3653 pmap_qremove(va, size >> PAGE_SHIFT); 3654 kmem_free(&kernel_map, base, size); 3655 } 3656 3657 /* 3658 * perform the pmap work for mincore 3659 */ 3660 int 3661 pmap_mincore(pmap_t pmap, vm_offset_t addr) 3662 { 3663 pt_entry_t *ptep, pte; 3664 vm_page_t m; 3665 int val = 0; 3666 3667 ptep = pmap_pte(pmap, addr); 3668 if (ptep == 0) { 3669 return 0; 3670 } 3671 3672 if ((pte = *ptep) != 0) { 3673 vm_offset_t pa; 3674 3675 val = MINCORE_INCORE; 3676 if ((pte & PG_MANAGED) == 0) 3677 return val; 3678 3679 pa = pte & PG_FRAME; 3680 3681 m = PHYS_TO_VM_PAGE(pa); 3682 3683 /* 3684 * Modified by us 3685 */ 3686 if (pte & PG_M) 3687 val |= MINCORE_MODIFIED|MINCORE_MODIFIED_OTHER; 3688 /* 3689 * Modified by someone 3690 */ 3691 else if (m->dirty || pmap_is_modified(m)) 3692 val |= MINCORE_MODIFIED_OTHER; 3693 /* 3694 * Referenced by us 3695 */ 3696 if (pte & PG_A) 3697 val |= MINCORE_REFERENCED|MINCORE_REFERENCED_OTHER; 3698 3699 /* 3700 * Referenced by someone 3701 */ 3702 else if ((m->flags & PG_REFERENCED) || pmap_ts_referenced(m)) { 3703 val |= MINCORE_REFERENCED_OTHER; 3704 vm_page_flag_set(m, PG_REFERENCED); 3705 } 3706 } 3707 return val; 3708 } 3709 3710 /* 3711 * Replace p->p_vmspace with a new one. If adjrefs is non-zero the new 3712 * vmspace will be ref'd and the old one will be deref'd. 3713 * 3714 * The vmspace for all lwps associated with the process will be adjusted 3715 * and cr3 will be reloaded if any lwp is the current lwp. 
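 *
 * Only single-threaded processes are currently handled here; the
 * routine asserts p_nthreads == 1 before switching the lone lwp.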
3716 */ 3717 void 3718 pmap_replacevm(struct proc *p, struct vmspace *newvm, int adjrefs) 3719 { 3720 struct vmspace *oldvm; 3721 struct lwp *lp; 3722 3723 crit_enter(); 3724 oldvm = p->p_vmspace; 3725 if (oldvm != newvm) { 3726 p->p_vmspace = newvm; 3727 KKASSERT(p->p_nthreads == 1); 3728 lp = RB_ROOT(&p->p_lwp_tree); 3729 pmap_setlwpvm(lp, newvm); 3730 if (adjrefs) { 3731 sysref_get(&newvm->vm_sysref); 3732 sysref_put(&oldvm->vm_sysref); 3733 } 3734 } 3735 crit_exit(); 3736 } 3737 3738 /* 3739 * Set the vmspace for a LWP. The vmspace is almost universally set the 3740 * same as the process vmspace, but virtual kernels need to swap out contexts 3741 * on a per-lwp basis. 3742 */ 3743 void 3744 pmap_setlwpvm(struct lwp *lp, struct vmspace *newvm) 3745 { 3746 struct vmspace *oldvm; 3747 struct pmap *pmap; 3748 3749 crit_enter(); 3750 oldvm = lp->lwp_vmspace; 3751 3752 if (oldvm != newvm) { 3753 lp->lwp_vmspace = newvm; 3754 if (curthread->td_lwp == lp) { 3755 pmap = vmspace_pmap(newvm); 3756 #if defined(SMP) 3757 atomic_set_int(&pmap->pm_active, mycpu->gd_cpumask); 3758 if (pmap->pm_active & CPUMASK_LOCK) 3759 pmap_interlock_wait(newvm); 3760 #else 3761 pmap->pm_active |= 1; 3762 #endif 3763 #if defined(SWTCH_OPTIM_STATS) 3764 tlb_flush_count++; 3765 #endif 3766 curthread->td_pcb->pcb_cr3 = vtophys(pmap->pm_pml4); 3767 curthread->td_pcb->pcb_cr3 |= PG_RW | PG_U | PG_V; 3768 load_cr3(curthread->td_pcb->pcb_cr3); 3769 pmap = vmspace_pmap(oldvm); 3770 #if defined(SMP) 3771 atomic_clear_int(&pmap->pm_active, mycpu->gd_cpumask); 3772 #else 3773 pmap->pm_active &= ~1; 3774 #endif 3775 } 3776 } 3777 crit_exit(); 3778 } 3779 3780 #ifdef SMP 3781 3782 /* 3783 * Called when switching to a locked pmap 3784 */ 3785 void 3786 pmap_interlock_wait(struct vmspace *vm) 3787 { 3788 struct pmap *pmap = &vm->vm_pmap; 3789 3790 if (pmap->pm_active & CPUMASK_LOCK) { 3791 kprintf("Warning: pmap_interlock %p %08x\n", 3792 pmap, pmap->pm_active); 3793 while (pmap->pm_active & CPUMASK_LOCK) { 3794 cpu_pause(); 3795 cpu_ccfence(); 3796 lwkt_process_ipiq(); 3797 } 3798 } 3799 } 3800 3801 #endif 3802 3803 vm_offset_t 3804 pmap_addr_hint(vm_object_t obj, vm_offset_t addr, vm_size_t size) 3805 { 3806 3807 if ((obj == NULL) || (size < NBPDR) || (obj->type != OBJT_DEVICE)) { 3808 return addr; 3809 } 3810 3811 addr = (addr + (NBPDR - 1)) & ~(NBPDR - 1); 3812 return addr; 3813 } 3814 3815 3816 #if defined(DEBUG) 3817 3818 static void pads (pmap_t pm); 3819 void pmap_pvdump (vm_paddr_t pa); 3820 3821 /* print address space of pmap*/ 3822 static 3823 void 3824 pads(pmap_t pm) 3825 { 3826 vm_offset_t va; 3827 unsigned i, j; 3828 pt_entry_t *ptep; 3829 3830 if (pm == &kernel_pmap) 3831 return; 3832 crit_enter(); 3833 for (i = 0; i < NPDEPG; i++) { 3834 ; 3835 } 3836 crit_exit(); 3837 3838 } 3839 3840 void 3841 pmap_pvdump(vm_paddr_t pa) 3842 { 3843 pv_entry_t pv; 3844 vm_page_t m; 3845 3846 kprintf("pa %08llx", (long long)pa); 3847 m = PHYS_TO_VM_PAGE(pa); 3848 TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { 3849 #ifdef used_to_be 3850 kprintf(" -> pmap %p, va %x, flags %x", 3851 (void *)pv->pv_pmap, pv->pv_va, pv->pv_flags); 3852 #endif 3853 kprintf(" -> pmap %p, va %x", (void *)pv->pv_pmap, pv->pv_va); 3854 pads(pv->pv_pmap); 3855 } 3856 kprintf(" "); 3857 } 3858 #endif 3859