1 /* $NetBSD: pmap.c,v 1.5 2007/11/11 01:30:55 ad Exp $ */ 2 3 /* 4 * 5 * Copyright (c) 1997 Charles D. Cranor and Washington University. 6 * All rights reserved. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 3. All advertising materials mentioning features or use of this software 17 * must display the following acknowledgement: 18 * This product includes software developed by Charles D. Cranor and 19 * Washington University. 20 * 4. The name of the author may not be used to endorse or promote products 21 * derived from this software without specific prior written permission. 22 * 23 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 24 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 25 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 26 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 27 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 28 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 29 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 30 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 31 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 32 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 33 */ 34 35 /* 36 * Copyright 2001 (c) Wasabi Systems, Inc. 37 * All rights reserved. 38 * 39 * Written by Frank van der Linden for Wasabi Systems, Inc. 40 * 41 * Redistribution and use in source and binary forms, with or without 42 * modification, are permitted provided that the following conditions 43 * are met: 44 * 1. Redistributions of source code must retain the above copyright 45 * notice, this list of conditions and the following disclaimer. 46 * 2. Redistributions in binary form must reproduce the above copyright 47 * notice, this list of conditions and the following disclaimer in the 48 * documentation and/or other materials provided with the distribution. 49 * 3. All advertising materials mentioning features or use of this software 50 * must display the following acknowledgement: 51 * This product includes software developed for the NetBSD Project by 52 * Wasabi Systems, Inc. 53 * 4. The name of Wasabi Systems, Inc. may not be used to endorse 54 * or promote products derived from this software without specific prior 55 * written permission. 56 * 57 * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND 58 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 59 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 60 * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL WASABI SYSTEMS, INC 61 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 62 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 63 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 64 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 65 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 66 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 67 * POSSIBILITY OF SUCH DAMAGE. 68 */ 69 70 /* 71 * This is the i386 pmap modified and generalized to support x86-64 72 * as well. The idea is to hide the upper N levels of the page tables 73 * inside pmap_get_ptp, pmap_free_ptp and pmap_growkernel. The rest 74 * is mostly untouched, except that it uses some more generalized 75 * macros and interfaces. 76 * 77 * This pmap has been tested on the i386 as well, and it can be easily 78 * adapted to PAE. 79 * 80 * fvdl@wasabisystems.com 18-Jun-2001 81 */ 82 83 /* 84 * pmap.c: i386 pmap module rewrite 85 * Chuck Cranor <chuck@ccrc.wustl.edu> 86 * 11-Aug-97 87 * 88 * history of this pmap module: in addition to my own input, i used 89 * the following references for this rewrite of the i386 pmap: 90 * 91 * [1] the NetBSD i386 pmap. this pmap appears to be based on the 92 * BSD hp300 pmap done by Mike Hibler at University of Utah. 93 * it was then ported to the i386 by William Jolitz of UUNET 94 * Technologies, Inc. Then Charles M. Hannum of the NetBSD 95 * project fixed some bugs and provided some speed ups. 96 * 97 * [2] the FreeBSD i386 pmap. this pmap seems to be the 98 * Hibler/Jolitz pmap, as modified for FreeBSD by John S. Dyson 99 * and David Greenman. 100 * 101 * [3] the Mach pmap. this pmap, from CMU, seems to have migrated 102 * between several processors. the VAX version was done by 103 * Avadis Tevanian, Jr., and Michael Wayne Young. the i386 104 * version was done by Lance Berc, Mike Kupfer, Bob Baron, 105 * David Golub, and Richard Draves. the alpha version was 106 * done by Alessandro Forin (CMU/Mach) and Chris Demetriou 107 * (NetBSD/alpha). 108 */ 109 110 #include <sys/cdefs.h> 111 __KERNEL_RCSID(0, "$NetBSD: pmap.c,v 1.5 2007/11/11 01:30:55 ad Exp $"); 112 113 #ifndef __x86_64__ 114 #include "opt_cputype.h" 115 #endif 116 #include "opt_user_ldt.h" 117 #include "opt_lockdebug.h" 118 #include "opt_multiprocessor.h" 119 #if !defined(__x86_64__) 120 #include "opt_kstack_dr0.h" 121 #endif /* !defined(__x86_64__) */ 122 123 #include <sys/param.h> 124 #include <sys/systm.h> 125 #include <sys/proc.h> 126 #include <sys/malloc.h> 127 #include <sys/pool.h> 128 #include <sys/user.h> 129 #include <sys/kernel.h> 130 131 #include <uvm/uvm.h> 132 133 #include <dev/isa/isareg.h> 134 135 #include <machine/atomic.h> 136 #include <machine/cpu.h> 137 #include <machine/specialreg.h> 138 #include <machine/gdt.h> 139 #include <machine/intr.h> 140 #include <machine/isa_machdep.h> 141 #include <machine/cpuvar.h> 142 143 #include <x86/i82489reg.h> 144 #include <x86/i82489var.h> 145 146 /* XXX */ 147 void atomic_inc_uint(volatile unsigned int *); 148 unsigned int atomic_dec_uint_nv(volatile unsigned int *); 149 150 /* 151 * general info: 152 * 153 * - for an explanation of how the i386 MMU hardware works see 154 * the comments in <machine/pte.h>. 155 * 156 * - for an explanation of the general memory structure used by 157 * this pmap (including the recursive mapping), see the comments 158 * in <machine/pmap.h>. 
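 *
 * - as a rough illustration of the recursive mapping (the
 *   authoritative macros live in <machine/pmap.h>): because the PDP
 *   maps itself through a reserved slot, the PTE covering any
 *   virtual address can be reached through the linear PTE window,
 *   approximately
 *
 *	pte = &PTE_BASE[pl1_i(va)];
 *
 *   which is what uses of vtopte()/kvtopte() below boil down to.
 *   this is a sketch of the idea only, not additional code.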
159 * 160 * this file contains the code for the "pmap module." the module's 161 * job is to manage the hardware's virtual to physical address mappings. 162 * note that there are two levels of mapping in the VM system: 163 * 164 * [1] the upper layer of the VM system uses vm_map's and vm_map_entry's 165 * to map ranges of virtual address space to objects/files. for 166 * example, the vm_map may say: "map VA 0x1000 to 0x22000 read-only 167 * to the file /bin/ls starting at offset zero." note that 168 * the upper layer mapping is not concerned with how individual 169 * vm_pages are mapped. 170 * 171 * [2] the lower layer of the VM system (the pmap) maintains the mappings 172 * from virtual addresses. it is concerned with which vm_page is 173 * mapped where. for example, when you run /bin/ls and start 174 * at page 0x1000 the fault routine may lookup the correct page 175 * of the /bin/ls file and then ask the pmap layer to establish 176 * a mapping for it. 177 * 178 * note that information in the lower layer of the VM system can be 179 * thrown away since it can easily be reconstructed from the info 180 * in the upper layer. 181 * 182 * data structures we use include: 183 * 184 * - struct pmap: describes the address space of one thread 185 * - struct pv_entry: describes one <PMAP,VA> mapping of a PA 186 * - struct pv_head: there is one pv_head per managed page of 187 * physical memory. the pv_head points to a list of pv_entry 188 * structures which describe all the <PMAP,VA> pairs that this 189 * page is mapped in. this is critical for page based operations 190 * such as pmap_page_protect() [change protection on _all_ mappings 191 * of a page] 192 * - pv_page/pv_page_info: pv_entry's are allocated out of pv_page's. 193 * if we run out of pv_entry's we allocate a new pv_page and free 194 * its pv_entrys. 195 */ 196 197 /* 198 * memory allocation 199 * 200 * - there are three data structures that we must dynamically allocate: 201 * 202 * [A] new process' page directory page (PDP) 203 * - plan 1: done at pmap_create() we use 204 * uvm_km_alloc(kernel_map, PAGE_SIZE) [fka kmem_alloc] to do this 205 * allocation. 206 * 207 * if we are low in free physical memory then we sleep in 208 * uvm_km_alloc -- in this case this is ok since we are creating 209 * a new pmap and should not be holding any locks. 210 * 211 * if the kernel is totally out of virtual space 212 * (i.e. uvm_km_alloc returns NULL), then we panic. 213 * 214 * XXX: the fork code currently has no way to return an "out of 215 * memory, try again" error code since uvm_fork [fka vm_fork] 216 * is a void function. 217 * 218 * [B] new page tables pages (PTP) 219 * - call uvm_pagealloc() 220 * => success: zero page, add to pm_pdir 221 * => failure: we are out of free vm_pages, let pmap_enter() 222 * tell UVM about it. 223 * 224 * note: for kernel PTPs, we start with NKPTP of them. as we map 225 * kernel memory (at uvm_map time) we check to see if we've grown 226 * the kernel pmap. if so, we call the optional function 227 * pmap_growkernel() to grow the kernel PTPs in advance. 228 * 229 * [C] pv_entry structures 230 * - plan 1: try to allocate one off the free list 231 * => success: done! 232 * => failure: no more free pv_entrys on the list 233 * - plan 2: try to allocate a new pv_page to add a chunk of 234 * pv_entrys to the free list 235 * [a] obtain a free, unmapped, VA in kmem_map. 
either 236 * we have one saved from a previous call, or we allocate 237 * one now using a "vm_map_lock_try" in uvm_map 238 * => success: we have an unmapped VA, continue to [b] 239 * => failure: unable to lock kmem_map or out of VA in it. 240 * move on to plan 3. 241 * [b] allocate a page for the VA 242 * => success: map it in, free the pv_entry's, DONE! 243 * => failure: no free vm_pages, etc. 244 * save VA for later call to [a], go to plan 3. 245 * If we fail, we simply let pmap_enter() tell UVM about it. 246 */ 247 248 /* 249 * locking 250 * 251 * we have the following locks that we must contend with: 252 * 253 * RW locks: 254 * 255 * - pmap_main_lock 256 * this lock is used to prevent deadlock and/or provide mutex 257 * access to the pmap system. most operations lock the pmap 258 * structure first, then they lock the pv_lists (if needed). 259 * however, some operations such as pmap_page_protect lock 260 * the pv_lists and then lock pmaps. in order to prevent a 261 * cycle, we require a mutex lock when locking the pv_lists 262 * first. thus, the "pmap => pv_list" lockers must gain a 263 * read-lock on pmap_main_lock before locking the pmap. and 264 * the "pv_list => pmap" lockers must gain a write-lock on 265 * pmap_main_lock before locking. since only one thread 266 * can write-lock a lock at a time, this provides mutex. 267 * 268 * mutexes: 269 * 270 * - pmap lock (per pmap, part of uvm_object) 271 * this lock protects the fields in the pmap structure including 272 * the non-kernel PDEs in the PDP, and the PTEs. it also locks 273 * in the alternate PTE space (since that is determined by the 274 * entry in the PDP). 275 * 276 * - pvh_lock (per pv_head) 277 * this lock protects the pv_entry list which is chained off the 278 * pv_head structure for a specific managed PA. it is locked 279 * when traversing the list (e.g. adding/removing mappings, 280 * syncing R/M bits, etc.) 281 * 282 * - pmaps_lock 283 * this lock protects the list of active pmaps (headed by "pmaps"). 284 * we lock it when adding or removing pmaps from this list. 285 * 286 * tlb shootdown 287 * 288 * tlb shootdowns are hard interrupts that operate outside the spl 289 * framework: they don't need to be blocked provided that the pmap module 290 * gets the order of events correct. the calls are made by talking directly 291 * to the lapic. the stubs to handle the interrupts are quite short and do 292 * one of the following: invalidate a single page, a range of pages, all 293 * user tlb entries or the entire tlb. 294 * 295 * the cpus synchronize with each other using pmap_mbox structures which are 296 * aligned on 64-byte cache lines. tlb shootdowns against the kernel pmap 297 * use a global mailbox and are generated using a broadcast ipi (broadcast 298 * to all but the sending cpu). shootdowns against regular pmaps use 299 * per-cpu mailboxes and are multicast. kernel and user shootdowns can 300 * execute simultaneously, as can shootdowns within different multithreaded 301 * processes. TODO: 302 * 303 * 1. figure out which waitpoints can be deferred to pmap_update(). 304 * 2. see if there is a cheap way to batch some updates. 305 */ 306 307 vaddr_t ptp_masks[] = PTP_MASK_INITIALIZER; 308 int ptp_shifts[] = PTP_SHIFT_INITIALIZER; 309 long nkptp[] = NKPTP_INITIALIZER; 310 long nkptpmax[] = NKPTPMAX_INITIALIZER; 311 long nbpd[] = NBPD_INITIALIZER; 312 pd_entry_t *normal_pdes[] = PDES_INITIALIZER; 313 pd_entry_t *alternate_pdes[] = APDES_INITIALIZER; 314 315 /* 316 * locking data structures.
to enable the locks, changes from the 317 * 'vmlocking' cvs branch are required. for now, just stub them out. 318 */ 319 320 #define rw_enter(a, b) /* nothing */ 321 #define rw_exit(a) /* nothing */ 322 #define mutex_enter(a) simple_lock(a) 323 #define mutex_exit(a) simple_unlock(a) 324 #define mutex_init(a, b, c) simple_lock_init(a) 325 #define mutex_owned(a) (1) 326 #define mutex_destroy(a) /* nothing */ 327 #define kmutex_t struct simplelock 328 329 static kmutex_t pmaps_lock; 330 static krwlock_t pmap_main_lock; 331 332 static vaddr_t pmap_maxkvaddr; 333 334 #define COUNT(x) /* nothing */ 335 336 TAILQ_HEAD(pv_pagelist, pv_page); 337 typedef struct pv_pagelist pv_pagelist_t; 338 339 /* 340 * Global TLB shootdown mailbox. 341 */ 342 struct evcnt pmap_tlb_evcnt __aligned(64); 343 struct pmap_mbox pmap_mbox __aligned(64); 344 345 /* 346 * Per-CPU data. The pmap mailbox is cache intensive so gets its 347 * own line. Note that the mailbox must be the first item. 348 */ 349 struct pmap_cpu { 350 /* TLB shootdown */ 351 struct pmap_mbox pc_mbox; 352 }; 353 354 union { 355 struct pmap_cpu pc; 356 uint8_t padding[128]; 357 } pmap_cpu[X86_MAXPROCS] __aligned(64); 358 359 /* 360 * global data structures 361 */ 362 363 struct pmap kernel_pmap_store; /* the kernel's pmap (proc0) */ 364 365 /* 366 * pmap_pg_g: if our processor supports PG_G in the PTE then we 367 * set pmap_pg_g to PG_G (otherwise it is zero). 368 */ 369 370 int pmap_pg_g = 0; 371 372 /* 373 * pmap_largepages: if our processor supports PG_PS and we are 374 * using it, this is set to true. 375 */ 376 377 int pmap_largepages; 378 379 /* 380 * i386 physical memory comes in a big contig chunk with a small 381 * hole toward the front of it... the following two paddr_t's 382 * (shared with machdep.c) describe the physical address space 383 * of this machine. 384 */ 385 paddr_t avail_start; /* PA of first available physical page */ 386 paddr_t avail_end; /* PA of last available physical page */ 387 388 /* 389 * other data structures 390 */ 391 392 static pt_entry_t protection_codes[8]; /* maps MI prot to i386 prot code */ 393 static bool pmap_initialized = false; /* pmap_init done yet? */ 394 395 /* 396 * the following two vaddr_t's are used during system startup 397 * to keep track of how much of the kernel's VM space we have used. 398 * once the system is started, the management of the remaining kernel 399 * VM space is turned over to the kernel_map vm_map. 
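 *
 * during bootstrap these are simple bump allocators; the pattern used
 * later in pmap_bootstrap() is roughly (sketch only, not additional
 * code; "some_va" is a placeholder name):
 *
 *	some_va = virtual_avail;	(hand out a page of KVA)
 *	virtual_avail += PAGE_SIZE;	(and advance the matching PTE pointer)
 *
 * pmap_virtual_space() then reports [virtual_avail, virtual_end) to
 * UVM as the kernel VA range left for it to manage.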
400 */ 401 402 static vaddr_t virtual_avail; /* VA of first free KVA */ 403 static vaddr_t virtual_end; /* VA of last free KVA */ 404 405 /* 406 * pv_page management structures 407 */ 408 409 #define PVE_LOWAT (PVE_PER_PVPAGE / 2) /* free pv_entry low water mark */ 410 #define PVE_HIWAT (PVE_LOWAT + (PVE_PER_PVPAGE * 2)) 411 /* high water mark */ 412 413 static inline int 414 pv_compare(struct pv_entry *a, struct pv_entry *b) 415 { 416 417 if (a->pv_pmap < b->pv_pmap) 418 return (-1); 419 else if (a->pv_pmap > b->pv_pmap) 420 return (1); 421 else if (a->pv_va < b->pv_va) 422 return (-1); 423 else if (a->pv_va > b->pv_va) 424 return (1); 425 else 426 return (0); 427 } 428 429 SPLAY_PROTOTYPE(pvtree, pv_entry, pv_node, pv_compare); 430 SPLAY_GENERATE(pvtree, pv_entry, pv_node, pv_compare); 431 432 /* 433 * linked list of all non-kernel pmaps 434 */ 435 436 static struct pmap_head pmaps; 437 438 /* 439 * pool that pmap structures are allocated from 440 */ 441 442 static struct pool_cache pmap_cache; 443 444 /* 445 * pv_entry cache 446 */ 447 448 struct pool_cache pmap_pv_cache; 449 450 /* 451 * MULTIPROCESSOR: special VA's/ PTE's are actually allocated inside a 452 * X86_MAXPROCS*NPTECL array of PTE's, to avoid cache line thrashing 453 * due to false sharing. 454 */ 455 456 #ifdef MULTIPROCESSOR 457 #define PTESLEW(pte, id) ((pte)+(id)*NPTECL) 458 #define VASLEW(va,id) ((va)+(id)*NPTECL*PAGE_SIZE) 459 #else 460 #define PTESLEW(pte, id) (pte) 461 #define VASLEW(va,id) (va) 462 #endif 463 464 /* 465 * special VAs and the PTEs that map them 466 */ 467 static pt_entry_t *csrc_pte, *cdst_pte, *zero_pte, *ptp_pte, *early_zero_pte; 468 static char *csrcp, *cdstp, *zerop, *ptpp, *early_zerop; 469 470 /* 471 * pool and cache that PDPs are allocated from 472 */ 473 474 static struct pool_cache pmap_pdp_cache; 475 476 int pmap_pdp_ctor(void *, void *, int); 477 478 void *vmmap; /* XXX: used by mem.c... 
it should really uvm_map_reserve it */ 479 480 extern vaddr_t idt_vaddr; /* we allocate IDT early */ 481 extern paddr_t idt_paddr; 482 483 #ifdef _LP64 484 extern vaddr_t lo32_vaddr; 485 extern vaddr_t lo32_paddr; 486 #endif 487 488 extern int end; 489 490 #if defined(I586_CPU) 491 /* stuff to fix the pentium f00f bug */ 492 extern vaddr_t pentium_idt_vaddr; 493 #endif 494 495 496 /* 497 * local prototypes 498 */ 499 500 static struct vm_page *pmap_get_ptp(struct pmap *, vaddr_t, pd_entry_t **); 501 static struct vm_page *pmap_find_ptp(struct pmap *, vaddr_t, paddr_t, int); 502 static void pmap_freepage(struct pmap *, struct vm_page *, int, 503 struct vm_page **); 504 static void pmap_free_ptp(struct pmap *, struct vm_page *, 505 vaddr_t, pt_entry_t *, pd_entry_t **, 506 struct vm_page **); 507 static bool pmap_is_curpmap(struct pmap *); 508 static bool pmap_is_active(struct pmap *, struct cpu_info *, bool); 509 static void pmap_map_ptes(struct pmap *, struct pmap **, 510 pt_entry_t **, pd_entry_t ***); 511 static struct pv_entry *pmap_remove_pv(struct pv_head *, struct pmap *, 512 vaddr_t); 513 static void pmap_do_remove(struct pmap *, vaddr_t, vaddr_t, int); 514 static bool pmap_remove_pte(struct pmap *, struct vm_page *, 515 pt_entry_t *, vaddr_t, int, 516 struct pv_entry **); 517 static pt_entry_t pmap_remove_ptes(struct pmap *, struct vm_page *, 518 vaddr_t, vaddr_t, vaddr_t, int, 519 struct pv_entry **); 520 #define PMAP_REMOVE_ALL 0 /* remove all mappings */ 521 #define PMAP_REMOVE_SKIPWIRED 1 /* skip wired mappings */ 522 523 static void pmap_unmap_ptes(struct pmap *, struct pmap *); 524 static bool pmap_get_physpage(vaddr_t, int, paddr_t *); 525 static int pmap_pdes_invalid(vaddr_t, pd_entry_t **, 526 pd_entry_t *); 527 #define pmap_pdes_valid(va, pdes, lastpde) \ 528 (pmap_pdes_invalid((va), (pdes), (lastpde)) == 0) 529 static void pmap_alloc_level(pd_entry_t **, vaddr_t, int, long *); 530 531 static bool pmap_reactivate(struct pmap *); 532 533 /* 534 * p m a p h e l p e r f u n c t i o n s 535 */ 536 537 /* 538 * pmap_is_curpmap: is this pmap the one currently loaded [in %cr3]? 539 * of course the kernel is always loaded 540 */ 541 542 inline static bool 543 pmap_is_curpmap(struct pmap *pmap) 544 { 545 546 return((pmap == pmap_kernel()) || 547 (pmap == curcpu()->ci_pmap)); 548 } 549 550 /* 551 * pmap_is_active: is this pmap loaded into the specified processor's %cr3? 552 */ 553 554 inline static bool 555 pmap_is_active(struct pmap *pmap, struct cpu_info *ci, bool kernel) 556 { 557 558 return (pmap == pmap_kernel() || 559 (pmap->pm_cpus & ci->ci_cpumask) != 0 || 560 (kernel && (pmap->pm_kernel_cpus & ci->ci_cpumask) != 0)); 561 } 562 563 static void 564 pmap_apte_flush(struct pmap *pmap) 565 { 566 567 /* 568 * Flush the APTE mapping from all other CPUs that 569 * are using the pmap we are using (who's APTE space 570 * is the one we've just modified). 571 * 572 * XXXthorpej -- find a way to defer the IPI. 
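	 *
	 * roughly: the call below queues an invalidation covering the
	 * whole VA range ((vaddr_t)-1 is used to request more than a
	 * single page), and pmap_tlb_shootwait() then waits until the
	 * targeted CPUs have processed their mailboxes, so the APTE
	 * change is globally visible before we return.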
573 */ 574 pmap_tlb_shootdown(pmap, (vaddr_t)-1LL, 0, 0); 575 pmap_tlb_shootwait(); 576 } 577 578 /* 579 * pmap_map_ptes: map a pmap's PTEs into KVM and lock them in 580 * 581 * => we lock enough pmaps to keep things locked in 582 * => must be undone with pmap_unmap_ptes before returning 583 */ 584 585 static void 586 pmap_map_ptes(struct pmap *pmap, struct pmap **pmap2, 587 pd_entry_t **ptepp, pd_entry_t ***pdeppp) 588 { 589 pd_entry_t opde; 590 struct pmap *ourpmap; 591 struct cpu_info *ci; 592 struct lwp *l; 593 bool iscurrent; 594 uint64_t ncsw; 595 596 /* the kernel's pmap is always accessible */ 597 if (pmap == pmap_kernel()) { 598 *pmap2 = NULL; 599 *ptepp = PTE_BASE; 600 *pdeppp = normal_pdes; 601 return; 602 } 603 604 retry: 605 crit_enter(); 606 l = curlwp; 607 ncsw = l->l_ncsw; 608 ourpmap = NULL; 609 ci = curcpu(); 610 if (ci->ci_want_pmapload && 611 vm_map_pmap(&l->l_proc->p_vmspace->vm_map) == pmap) { 612 pmap_load(); 613 if (l->l_ncsw != ncsw) { 614 crit_exit(); 615 goto retry; 616 } 617 } 618 iscurrent = pmap_is_curpmap(pmap); 619 620 /* if curpmap then we are always mapped */ 621 if (iscurrent) { 622 mutex_enter(&pmap->pm_lock); 623 *pmap2 = NULL; 624 *ptepp = PTE_BASE; 625 *pdeppp = normal_pdes; 626 goto out; 627 } 628 629 ourpmap = ci->ci_pmap; 630 631 /* need to lock both curpmap and pmap: use ordered locking */ 632 if ((uintptr_t) pmap < (uintptr_t) ourpmap) { 633 mutex_enter(&pmap->pm_lock); 634 mutex_enter(&ourpmap->pm_lock); 635 } else { 636 mutex_enter(&ourpmap->pm_lock); 637 mutex_enter(&pmap->pm_lock); 638 } 639 640 if (l->l_ncsw != ncsw) 641 goto unlock_and_retry; 642 643 /* need to load a new alternate pt space into curpmap? */ 644 COUNT(apdp_pde_map); 645 opde = *APDP_PDE; 646 if (!pmap_valid_entry(opde) || (opde & PG_FRAME) != pmap->pm_pdirpa) { 647 *APDP_PDE = (pd_entry_t) (pmap->pm_pdirpa | PG_RW | PG_V); 648 if (pmap_valid_entry(opde)) 649 pmap_apte_flush(ourpmap); 650 } 651 652 *pmap2 = ourpmap; 653 *ptepp = APTE_BASE; 654 *pdeppp = alternate_pdes; 655 KASSERT(l->l_ncsw == ncsw); 656 out: 657 /* 658 * might have blocked, need to retry? 659 */ 660 if (l->l_ncsw != ncsw) { 661 unlock_and_retry: 662 crit_exit(); 663 if (ourpmap != NULL) 664 mutex_exit(&ourpmap->pm_lock); 665 mutex_exit(&pmap->pm_lock); 666 goto retry; 667 } 668 669 return; 670 } 671 672 /* 673 * pmap_unmap_ptes: unlock the PTE mapping of "pmap" 674 */ 675 676 static void 677 pmap_unmap_ptes(struct pmap *pmap, struct pmap *pmap2) 678 { 679 680 if (pmap == pmap_kernel()) { 681 return; 682 } 683 if (pmap2 == NULL) { 684 mutex_exit(&pmap->pm_lock); 685 } else { 686 KASSERT(curcpu()->ci_pmap == pmap2); 687 #if defined(MULTIPROCESSOR) 688 *APDP_PDE = 0; 689 pmap_apte_flush(pmap2); 690 #endif 691 COUNT(apdp_pde_unmap); 692 mutex_exit(&pmap->pm_lock); 693 mutex_exit(&pmap2->pm_lock); 694 } 695 696 /* re-enable preemption */ 697 crit_exit(); 698 } 699 700 inline static void 701 pmap_exec_account(struct pmap *pm, vaddr_t va, pt_entry_t opte, pt_entry_t npte) 702 { 703 704 #if !defined(__x86_64__) 705 if (curproc == NULL || curproc->p_vmspace == NULL || 706 pm != vm_map_pmap(&curproc->p_vmspace->vm_map)) 707 return; 708 709 if ((opte ^ npte) & PG_X) 710 pmap_update_pg(va); 711 712 /* 713 * Executability was removed on the last executable change. 714 * Reset the code segment to something conservative and 715 * let the trap handler deal with setting the right limit. 716 * We can't do that because of locking constraints on the vm map. 
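	 * (i.e. we cannot compute the new limit here, because that
	 * requires the vm map lock -- see pmap_exec_fixup() below,
	 * which does that work.  so we fall back to GUCODE_SEL and let
	 * the trap handler restore a wider limit later.)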
717 */ 718 719 if ((opte & PG_X) && (npte & PG_X) == 0 && va == pm->pm_hiexec) { 720 struct trapframe *tf = curlwp->l_md.md_regs; 721 struct pcb *pcb = &curlwp->l_addr->u_pcb; 722 723 pcb->pcb_cs = tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL); 724 pm->pm_hiexec = I386_MAX_EXE_ADDR; 725 } 726 #endif /* !defined(__x86_64__) */ 727 } 728 729 #if !defined(__x86_64__) 730 /* 731 * Fixup the code segment to cover all potential executable mappings. 732 * returns 0 if no changes to the code segment were made. 733 */ 734 735 int 736 pmap_exec_fixup(struct vm_map *map, struct trapframe *tf, struct pcb *pcb) 737 { 738 struct vm_map_entry *ent; 739 struct pmap *pm = vm_map_pmap(map); 740 vaddr_t va = 0; 741 742 vm_map_lock_read(map); 743 for (ent = (&map->header)->next; ent != &map->header; ent = ent->next) { 744 745 /* 746 * This entry has greater va than the entries before. 747 * We need to make it point to the last page, not past it. 748 */ 749 750 if (ent->protection & VM_PROT_EXECUTE) 751 va = trunc_page(ent->end) - PAGE_SIZE; 752 } 753 vm_map_unlock_read(map); 754 if (va == pm->pm_hiexec && tf->tf_cs == GSEL(GUCODEBIG_SEL, SEL_UPL)) 755 return (0); 756 757 pm->pm_hiexec = va; 758 if (pm->pm_hiexec > I386_MAX_EXE_ADDR) { 759 pcb->pcb_cs = tf->tf_cs = GSEL(GUCODEBIG_SEL, SEL_UPL); 760 } else { 761 pcb->pcb_cs = tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL); 762 return (0); 763 } 764 return (1); 765 } 766 #endif /* !defined(__x86_64__) */ 767 768 /* 769 * p m a p k e n t e r f u n c t i o n s 770 * 771 * functions to quickly enter/remove pages from the kernel address 772 * space. pmap_kremove is exported to MI kernel. we make use of 773 * the recursive PTE mappings. 774 */ 775 776 /* 777 * pmap_kenter_pa: enter a kernel mapping without R/M (pv_entry) tracking 778 * 779 * => no need to lock anything, assume va is already allocated 780 * => should be faster than normal pmap enter function 781 */ 782 783 void 784 pmap_kenter_pa(vaddr_t va, paddr_t pa, vm_prot_t prot) 785 { 786 pt_entry_t *pte, opte, npte; 787 788 if (va < VM_MIN_KERNEL_ADDRESS) 789 pte = vtopte(va); 790 else 791 pte = kvtopte(va); 792 793 npte = pa | protection_codes[prot] | PG_V | pmap_pg_g; 794 opte = pmap_pte_set(pte, npte); /* zap! */ 795 #if defined(DIAGNOSTIC) 796 /* XXX For now... */ 797 if (opte & PG_PS) 798 panic("pmap_kenter_pa: PG_PS"); 799 #endif 800 if ((opte & (PG_V | PG_U)) == (PG_V | PG_U)) { 801 /* This should not happen, so no need to batch updates. */ 802 crit_enter(); 803 pmap_tlb_shootdown(pmap_kernel(), va, 0, opte); 804 crit_exit(); 805 } 806 } 807 808 #if defined(__x86_64__) 809 /* 810 * Change protection for a virtual address. Local for a CPU only, don't 811 * care about TLB shootdowns. 
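 *
 * illustrative use (hypothetical caller, not taken from this file):
 *
 *	pmap_changeprot_local(va, VM_PROT_READ);		clear PG_RW
 *	pmap_changeprot_local(va, VM_PROT_READ|VM_PROT_WRITE);	set PG_RW
 *
 * only the local TLB entry is invalidated (via invlpg), so this is
 * only safe for mappings that no other CPU is currently using.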
812 */ 813 void 814 pmap_changeprot_local(vaddr_t va, vm_prot_t prot) 815 { 816 pt_entry_t *pte, opte; 817 818 if (va < VM_MIN_KERNEL_ADDRESS) 819 pte = vtopte(va); 820 else 821 pte = kvtopte(va); 822 823 opte = *pte; 824 825 if ((prot & VM_PROT_WRITE) != 0) 826 *pte |= PG_RW; 827 else 828 *pte &= ~PG_RW; 829 830 if (opte != *pte) 831 invlpg(va); 832 } 833 #endif /* defined(__x86_64__) */ 834 835 /* 836 * pmap_kremove: remove a kernel mapping(s) without R/M (pv_entry) tracking 837 * 838 * => no need to lock anything 839 * => caller must dispose of any vm_page mapped in the va range 840 * => note: not an inline function 841 * => we assume the va is page aligned and the len is a multiple of PAGE_SIZE 842 * => we assume kernel only unmaps valid addresses and thus don't bother 843 * checking the valid bit before doing TLB flushing 844 * => must be followed by call to pmap_update() before reuse of page 845 */ 846 847 void 848 pmap_kremove(vaddr_t sva, vsize_t len) 849 { 850 pt_entry_t *pte, xpte; 851 vaddr_t va, eva; 852 853 eva = sva + len; 854 xpte = 0; 855 856 for (va = sva; va < eva; va += PAGE_SIZE) { 857 if (va < VM_MIN_KERNEL_ADDRESS) 858 pte = vtopte(va); 859 else 860 pte = kvtopte(va); 861 xpte |= pmap_pte_set(pte, 0); /* zap! */ 862 #if defined(DIAGNOSTIC) 863 /* XXX For now... */ 864 if (xpte & PG_PS) 865 panic("pmap_kremove: PG_PS"); 866 if (xpte & PG_PVLIST) 867 panic("pmap_kremove: PG_PVLIST mapping for 0x%lx", 868 va); 869 #endif 870 } 871 if ((xpte & (PG_V | PG_U)) == (PG_V | PG_U)) { 872 crit_enter(); 873 pmap_tlb_shootdown(pmap_kernel(), sva, eva, xpte); 874 crit_exit(); 875 } 876 } 877 878 /* 879 * p m a p i n i t f u n c t i o n s 880 * 881 * pmap_bootstrap and pmap_init are called during system startup 882 * to init the pmap module. pmap_bootstrap() does a low level 883 * init just to get things rolling. pmap_init() finishes the job. 884 */ 885 886 /* 887 * pmap_bootstrap: get the system in a state where it can run with VM 888 * properly enabled (called before main()). the VM system is 889 * fully init'd later... 890 * 891 * => on i386, locore.s has already enabled the MMU by allocating 892 * a PDP for the kernel, and nkpde PTP's for the kernel. 893 * => kva_start is the first free virtual address in kernel space 894 */ 895 896 void 897 pmap_bootstrap(vaddr_t kva_start) 898 { 899 vaddr_t kva; 900 vaddr_t kva_end; 901 struct pmap *kpm; 902 pt_entry_t *pte; 903 int i; 904 unsigned long p1i; 905 pt_entry_t pg_nx = (cpu_feature & CPUID_NOX ? PG_NX : 0); 906 907 /* 908 * set up our local static global vars that keep track of the 909 * usage of KVM before kernel_map is set up 910 */ 911 912 virtual_avail = kva_start; /* first free KVA */ 913 virtual_end = VM_MAX_KERNEL_ADDRESS; /* last KVA */ 914 915 /* 916 * set up protection_codes: we need to be able to convert from 917 * a MI protection code (some combo of VM_PROT...) to something 918 * we can jam into a i386 PTE. 
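 *
 * the table is indexed directly by the VM_PROT_* bit combination, so
 * a mapping routine builds its PTE roughly as (this mirrors what
 * pmap_kenter_pa() above does; shown here only as a sketch):
 *
 *	npte = pa | protection_codes[prot] | PG_V;
 *
 * on CPUs with the NX feature the non-executable combinations also
 * carry PG_NX (pg_nx is zero otherwise).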
919 */ 920 921 protection_codes[VM_PROT_NONE] = pg_nx; /* --- */ 922 protection_codes[VM_PROT_EXECUTE] = PG_RO | PG_X; /* --x */ 923 protection_codes[VM_PROT_READ] = PG_RO | pg_nx; /* -r- */ 924 protection_codes[VM_PROT_READ|VM_PROT_EXECUTE] = PG_RO | PG_X;/* -rx */ 925 protection_codes[VM_PROT_WRITE] = PG_RW | pg_nx; /* w-- */ 926 protection_codes[VM_PROT_WRITE|VM_PROT_EXECUTE] = PG_RW | PG_X;/* w-x */ 927 protection_codes[VM_PROT_WRITE|VM_PROT_READ] = PG_RW | pg_nx; 928 /* wr- */ 929 protection_codes[VM_PROT_ALL] = PG_RW | PG_X; /* wrx */ 930 931 /* 932 * now we init the kernel's pmap 933 * 934 * the kernel pmap's pm_obj is not used for much. however, in 935 * user pmaps the pm_obj contains the list of active PTPs. 936 * the pm_obj currently does not have a pager. it might be possible 937 * to add a pager that would allow a process to read-only mmap its 938 * own page tables (fast user level vtophys?). this may or may not 939 * be useful. 940 */ 941 942 kpm = pmap_kernel(); 943 for (i = 0; i < PTP_LEVELS - 1; i++) { 944 UVM_OBJ_INIT(&kpm->pm_obj[i], NULL, 1); 945 kpm->pm_ptphint[i] = NULL; 946 } 947 memset(&kpm->pm_list, 0, sizeof(kpm->pm_list)); /* pm_list not used */ 948 kpm->pm_pdir = (pd_entry_t *)(lwp0.l_addr->u_pcb.pcb_cr3 + KERNBASE); 949 kpm->pm_pdirpa = (paddr_t) lwp0.l_addr->u_pcb.pcb_cr3; 950 kpm->pm_stats.wired_count = kpm->pm_stats.resident_count = 951 x86_btop(kva_start - VM_MIN_KERNEL_ADDRESS); 952 953 /* 954 * the above is just a rough estimate and not critical to the proper 955 * operation of the system. 956 */ 957 958 /* 959 * Begin to enable global TLB entries if they are supported. 960 * The G bit has no effect until the CR4_PGE bit is set in CR4, 961 * which happens in cpu_init(), which is run on each cpu 962 * (and happens later) 963 */ 964 965 if (cpu_feature & CPUID_PGE) { 966 pmap_pg_g = PG_G; /* enable software */ 967 968 /* add PG_G attribute to already mapped kernel pages */ 969 if (KERNBASE == VM_MIN_KERNEL_ADDRESS) { 970 kva_end = virtual_avail; 971 } else { 972 kva_end = roundup((vaddr_t)&end, PAGE_SIZE); 973 } 974 for (kva = KERNBASE; kva < kva_end; kva += PAGE_SIZE) { 975 p1i = pl1_i(kva); 976 if (pmap_valid_entry(PTE_BASE[p1i])) 977 PTE_BASE[p1i] |= PG_G; 978 } 979 } 980 981 /* 982 * enable large pages if they are supported. 983 */ 984 985 if (cpu_feature & CPUID_PSE) { 986 paddr_t pa; 987 pd_entry_t *pde; 988 extern char __data_start; 989 990 lcr4(rcr4() | CR4_PSE); /* enable hardware (via %cr4) */ 991 pmap_largepages = 1; /* enable software */ 992 993 /* 994 * the TLB must be flushed after enabling large pages 995 * on Pentium CPUs, according to section 3.6.2.2 of 996 * "Intel Architecture Software Developer's Manual, 997 * Volume 3: System Programming". 998 */ 999 tlbflush(); 1000 1001 /* 1002 * now, remap the kernel text using large pages. we 1003 * assume that the linker has properly aligned the 1004 * .data segment to a NBPD_L2 boundary. 1005 */ 1006 kva_end = rounddown((vaddr_t)&__data_start, NBPD_L1); 1007 for (pa = 0, kva = KERNBASE; kva + NBPD_L2 <= kva_end; 1008 kva += NBPD_L2, pa += NBPD_L2) { 1009 pde = &L2_BASE[pl2_i(kva)]; 1010 *pde = pa | pmap_pg_g | PG_PS | 1011 PG_KR | PG_V; /* zap! 
*/ 1012 tlbflush(); 1013 } 1014 #if defined(DEBUG) 1015 printf("kernel text is mapped with " 1016 "%lu large pages and %lu normal pages\n", 1017 (unsigned long)howmany(kva - KERNBASE, NBPD_L2), 1018 (unsigned long)howmany((vaddr_t)&__data_start - kva, 1019 NBPD_L1)); 1020 #endif /* defined(DEBUG) */ 1021 } 1022 1023 if (VM_MIN_KERNEL_ADDRESS != KERNBASE) { 1024 /* 1025 * zero_pte is stuck at the end of mapped space for the kernel 1026 * image (disjunct from kva space). This is done so that it 1027 * can safely be used in pmap_growkernel (pmap_get_physpage), 1028 * when it's called for the first time. 1029 * XXXfvdl fix this for MULTIPROCESSOR later. 1030 */ 1031 1032 early_zerop = (void *)(KERNBASE + NKL2_KIMG_ENTRIES * NBPD_L2); 1033 early_zero_pte = PTE_BASE + pl1_i((unsigned long)early_zerop); 1034 } 1035 1036 /* 1037 * now we allocate the "special" VAs which are used for tmp mappings 1038 * by the pmap (and other modules). we allocate the VAs by advancing 1039 * virtual_avail (note that there are no pages mapped at these VAs). 1040 * we find the PTE that maps the allocated VA via the linear PTE 1041 * mapping. 1042 */ 1043 1044 pte = PTE_BASE + pl1_i(virtual_avail); 1045 1046 #ifdef MULTIPROCESSOR 1047 /* 1048 * Waste some VA space to avoid false sharing of cache lines 1049 * for page table pages: Give each possible CPU a cache line 1050 * of PTE's (8) to play with, though we only need 4. We could 1051 * recycle some of this waste by putting the idle stacks here 1052 * as well; we could waste less space if we knew the largest 1053 * CPU ID beforehand. 1054 */ 1055 csrcp = (char *) virtual_avail; csrc_pte = pte; 1056 1057 cdstp = (char *) virtual_avail+PAGE_SIZE; cdst_pte = pte+1; 1058 1059 zerop = (char *) virtual_avail+PAGE_SIZE*2; zero_pte = pte+2; 1060 1061 ptpp = (char *) virtual_avail+PAGE_SIZE*3; ptp_pte = pte+3; 1062 1063 virtual_avail += PAGE_SIZE * X86_MAXPROCS * NPTECL; 1064 pte += X86_MAXPROCS * NPTECL; 1065 #else 1066 csrcp = (void *) virtual_avail; csrc_pte = pte; /* allocate */ 1067 virtual_avail += PAGE_SIZE; pte++; /* advance */ 1068 1069 cdstp = (void *) virtual_avail; cdst_pte = pte; 1070 virtual_avail += PAGE_SIZE; pte++; 1071 1072 zerop = (void *) virtual_avail; zero_pte = pte; 1073 virtual_avail += PAGE_SIZE; pte++; 1074 1075 ptpp = (void *) virtual_avail; ptp_pte = pte; 1076 virtual_avail += PAGE_SIZE; pte++; 1077 #endif 1078 1079 if (VM_MIN_KERNEL_ADDRESS == KERNBASE) { 1080 early_zerop = zerop; 1081 early_zero_pte = zero_pte; 1082 } 1083 1084 /* 1085 * Nothing after this point actually needs pte; 1086 */ 1087 pte = (void *)0xdeadbeef; 1088 1089 /* XXX: vmmap used by mem.c... should be uvm_map_reserve */ 1090 /* XXXfvdl PTEs not needed here */ 1091 vmmap = (char *)virtual_avail; /* don't need pte */ 1092 virtual_avail += PAGE_SIZE; pte++; 1093 1094 idt_vaddr = virtual_avail; /* don't need pte */ 1095 idt_paddr = avail_start; /* steal a page */ 1096 #if defined(__x86_64__) 1097 virtual_avail += 2 * PAGE_SIZE; pte += 2; 1098 avail_start += 2 * PAGE_SIZE; 1099 #else /* defined(__x86_64__) */ 1100 virtual_avail += PAGE_SIZE; pte++; 1101 avail_start += PAGE_SIZE; 1102 #endif /* defined(__x86_64__) */ 1103 1104 #if defined(I586_CPU) 1105 /* pentium f00f bug stuff */ 1106 pentium_idt_vaddr = virtual_avail; /* don't need pte */ 1107 virtual_avail += PAGE_SIZE; pte++; 1108 #endif 1109 1110 #ifdef _LP64 1111 /* 1112 * Grab a page below 4G for things that need it (i.e. 1113 * having an initial %cr3 for the MP trampoline). 
1114 */ 1115 lo32_vaddr = virtual_avail; 1116 virtual_avail += PAGE_SIZE; pte++; 1117 lo32_paddr = avail_start; 1118 avail_start += PAGE_SIZE; 1119 #endif 1120 1121 /* 1122 * now we reserve some VM for mapping pages when doing a crash dump 1123 */ 1124 1125 virtual_avail = reserve_dumppages(virtual_avail); 1126 1127 /* 1128 * init the static-global locks and global lists. 1129 * 1130 * => pventry::pvh_lock (initialized elsewhere) must also be 1131 * a spin lock, again at IPL_VM to prevent deadlock, and 1132 * again is never taken from interrupt context. 1133 */ 1134 1135 rw_init(&pmap_main_lock); 1136 mutex_init(&pmaps_lock, MUTEX_DEFAULT, IPL_NONE); 1137 LIST_INIT(&pmaps); 1138 pmap_cpu_init_early(curcpu()); 1139 1140 /* 1141 * initialize caches. 1142 */ 1143 1144 pool_cache_bootstrap(&pmap_cache, sizeof(struct pmap), 0, 0, 0, "pmappl", 1145 &pool_allocator_nointr, IPL_NONE, NULL, NULL, NULL); 1146 pool_cache_bootstrap(&pmap_pdp_cache, PAGE_SIZE, 0, 0, 0, "pdppl", 1147 &pool_allocator_nointr, IPL_NONE, pmap_pdp_ctor, NULL, NULL); 1148 pool_cache_bootstrap(&pmap_pv_cache, sizeof(struct pv_entry), 0, 0, 0, 1149 "pvpl", &pool_allocator_meta, IPL_NONE, NULL, NULL, NULL); 1150 1151 /* 1152 * ensure the TLB is sync'd with reality by flushing it... 1153 */ 1154 1155 tlbflush(); 1156 1157 /* 1158 * calculate pmap_maxkvaddr from nkptp[]. 1159 */ 1160 1161 kva = VM_MIN_KERNEL_ADDRESS; 1162 for (i = PTP_LEVELS - 1; i >= 1; i--) { 1163 kva += nkptp[i] * nbpd[i]; 1164 } 1165 pmap_maxkvaddr = kva; 1166 } 1167 1168 #if defined(__x86_64__) 1169 /* 1170 * Pre-allocate PTPs for low memory, so that 1:1 mappings for various 1171 * trampoline code can be entered. 1172 */ 1173 void 1174 pmap_prealloc_lowmem_ptps(void) 1175 { 1176 pd_entry_t *pdes; 1177 int level; 1178 paddr_t newp; 1179 1180 pdes = pmap_kernel()->pm_pdir; 1181 level = PTP_LEVELS; 1182 for (;;) { 1183 newp = avail_start; 1184 avail_start += PAGE_SIZE; 1185 *early_zero_pte = (newp & PG_FRAME) | PG_V | PG_RW; 1186 pmap_update_pg((vaddr_t)early_zerop); 1187 memset(early_zerop, 0, PAGE_SIZE); 1188 pdes[pl_i(0, level)] = (newp & PG_FRAME) | PG_V | PG_RW; 1189 level--; 1190 if (level <= 1) 1191 break; 1192 pdes = normal_pdes[level - 2]; 1193 } 1194 } 1195 #endif /* defined(__x86_64__) */ 1196 1197 /* 1198 * pmap_init: called from uvm_init, our job is to get the pmap 1199 * system ready to manage mappings... 1200 */ 1201 1202 void 1203 pmap_init(void) 1204 { 1205 1206 /* 1207 * done: pmap module is up (and ready for business) 1208 */ 1209 1210 pmap_initialized = true; 1211 } 1212 1213 /* 1214 * pmap_cpu_init_early: perform early per-CPU initialization. 1215 */ 1216 1217 void 1218 pmap_cpu_init_early(struct cpu_info *ci) 1219 { 1220 struct pmap_cpu *pc; 1221 static uint8_t pmap_cpu_alloc; 1222 1223 pc = &pmap_cpu[pmap_cpu_alloc++].pc; 1224 ci->ci_pmap_cpu = pc; 1225 } 1226 1227 /* 1228 * pmap_cpu_init_late: perform late per-CPU initialization. 
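 * (attaches the per-CPU "TLB IPI" event counter, plus the global one
 * when called on the primary CPU.)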
1229 */ 1230 1231 void 1232 pmap_cpu_init_late(struct cpu_info *ci) 1233 { 1234 1235 if (ci == &cpu_info_primary) 1236 evcnt_attach_dynamic(&pmap_tlb_evcnt, EVCNT_TYPE_INTR, 1237 NULL, "global", "TLB IPI"); 1238 evcnt_attach_dynamic(&ci->ci_tlb_evcnt, EVCNT_TYPE_INTR, 1239 NULL, ci->ci_dev->dv_xname, "TLB IPI"); 1240 } 1241 1242 /* 1243 * p v _ e n t r y f u n c t i o n s 1244 */ 1245 1246 /* 1247 * pmap_free_pvs: free a list of pv_entrys 1248 */ 1249 1250 static void 1251 pmap_free_pvs(struct pv_entry *pv) 1252 { 1253 struct pv_entry *next; 1254 1255 for ( /* null */ ; pv != NULL ; pv = next) { 1256 next = SPLAY_RIGHT(pv, pv_node); 1257 pool_cache_put(&pmap_pv_cache, pv); 1258 } 1259 } 1260 1261 /* 1262 * pmap_lock_pvhs: Lock pvh1 and optional pvh2 1263 * Observe locking order when locking both pvhs 1264 */ 1265 1266 static void 1267 pmap_lock_pvhs(struct pv_head *pvh1, struct pv_head *pvh2) 1268 { 1269 1270 if (pvh2 == NULL) { 1271 mutex_spin_enter(&pvh1->pvh_lock); 1272 return; 1273 } 1274 1275 if (pvh1 < pvh2) { 1276 mutex_spin_enter(&pvh1->pvh_lock); 1277 mutex_spin_enter(&pvh2->pvh_lock); 1278 } else { 1279 mutex_spin_enter(&pvh2->pvh_lock); 1280 mutex_spin_enter(&pvh1->pvh_lock); 1281 } 1282 } 1283 1284 1285 /* 1286 * main pv_entry manipulation functions: 1287 * pmap_enter_pv: enter a mapping onto a pv_head list 1288 * pmap_remove_pv: remove a mappiing from a pv_head list 1289 * 1290 * NOTE: Both pmap_enter_pv and pmap_remove_pv expect the caller to lock 1291 * the pvh before calling 1292 */ 1293 1294 /* 1295 * pmap_enter_pv: enter a mapping onto a pv_head lst 1296 * 1297 * => caller should hold the proper lock on pmap_main_lock 1298 * => caller should have pmap locked 1299 * => caller should have the pv_head locked 1300 * => caller should adjust ptp's wire_count before calling 1301 */ 1302 1303 static void 1304 pmap_enter_pv(struct pv_head *pvh, 1305 struct pv_entry *pve, /* preallocated pve for us to use */ 1306 struct pmap *pmap, 1307 vaddr_t va, 1308 struct vm_page *ptp) /* PTP in pmap that maps this VA */ 1309 { 1310 pve->pv_pmap = pmap; 1311 pve->pv_va = va; 1312 pve->pv_ptp = ptp; /* NULL for kernel pmap */ 1313 SPLAY_INSERT(pvtree, &pvh->pvh_root, pve); /* add to locked list */ 1314 } 1315 1316 /* 1317 * pmap_remove_pv: try to remove a mapping from a pv_list 1318 * 1319 * => caller should hold proper lock on pmap_main_lock 1320 * => pmap should be locked 1321 * => caller should hold lock on pv_head [so that attrs can be adjusted] 1322 * => caller should adjust ptp's wire_count and free PTP if needed 1323 * => we return the removed pve 1324 */ 1325 1326 static struct pv_entry * 1327 pmap_remove_pv(struct pv_head *pvh, struct pmap *pmap, vaddr_t va) 1328 { 1329 struct pv_entry tmp, *pve; 1330 1331 tmp.pv_pmap = pmap; 1332 tmp.pv_va = va; 1333 pve = SPLAY_FIND(pvtree, &pvh->pvh_root, &tmp); 1334 if (pve == NULL) 1335 return (NULL); 1336 SPLAY_REMOVE(pvtree, &pvh->pvh_root, pve); 1337 return(pve); /* return removed pve */ 1338 } 1339 1340 /* 1341 * p t p f u n c t i o n s 1342 */ 1343 1344 static inline struct vm_page * 1345 pmap_find_ptp(struct pmap *pmap, vaddr_t va, paddr_t pa, int level) 1346 { 1347 int lidx = level - 1; 1348 struct vm_page *pg; 1349 1350 if (pa != (paddr_t)-1 && pmap->pm_ptphint[lidx] && 1351 pa == VM_PAGE_TO_PHYS(pmap->pm_ptphint[lidx])) { 1352 return (pmap->pm_ptphint[lidx]); 1353 } 1354 if (lidx == 0) 1355 pg = uvm_pagelookup(&pmap->pm_obj[lidx], ptp_va2o(va, level)); 1356 else { 1357 mutex_enter(&pmap->pm_obj[lidx].vmobjlock); 1358 pg = 
uvm_pagelookup(&pmap->pm_obj[lidx], ptp_va2o(va, level)); 1359 mutex_exit(&pmap->pm_obj[lidx].vmobjlock); 1360 } 1361 return pg; 1362 } 1363 1364 static inline void 1365 pmap_freepage(struct pmap *pmap, struct vm_page *ptp, int level, 1366 struct vm_page **empty_ptps) 1367 { 1368 int lidx; 1369 struct uvm_object *obj; 1370 1371 lidx = level - 1; 1372 1373 obj = &pmap->pm_obj[lidx]; 1374 pmap->pm_stats.resident_count--; 1375 if (lidx != 0) 1376 mutex_enter(&obj->vmobjlock); 1377 if (pmap->pm_ptphint[lidx] == ptp) 1378 pmap->pm_ptphint[lidx] = TAILQ_FIRST(&obj->memq); 1379 ptp->wire_count = 0; 1380 uvm_pagerealloc(ptp, NULL, 0); 1381 ptp->flags |= PG_ZERO; 1382 ptp->mdpage.mp_link = *empty_ptps; 1383 *empty_ptps = ptp; 1384 if (lidx != 0) 1385 mutex_exit(&obj->vmobjlock); 1386 } 1387 1388 static void 1389 pmap_free_ptp(struct pmap *pmap, struct vm_page *ptp, vaddr_t va, 1390 pt_entry_t *ptes, pd_entry_t **pdes, struct vm_page **empty_ptps) 1391 { 1392 unsigned long index; 1393 int level; 1394 vaddr_t invaladdr; 1395 pd_entry_t opde; 1396 struct pmap *curpmap = vm_map_pmap(&curlwp->l_proc->p_vmspace->vm_map); 1397 1398 level = 1; 1399 do { 1400 pmap_freepage(pmap, ptp, level, empty_ptps); 1401 index = pl_i(va, level + 1); 1402 opde = pmap_pte_set(&pdes[level - 1][index], 0); 1403 invaladdr = level == 1 ? (vaddr_t)ptes : 1404 (vaddr_t)pdes[level - 2]; 1405 pmap_tlb_shootdown(curpmap, invaladdr + index * PAGE_SIZE, 1406 0, opde); 1407 #if defined(MULTIPROCESSOR) 1408 invaladdr = level == 1 ? (vaddr_t)PTE_BASE : 1409 (vaddr_t)normal_pdes[level - 2]; 1410 pmap_tlb_shootdown(pmap, invaladdr + index * PAGE_SIZE, 1411 0, opde); 1412 #endif 1413 if (level < PTP_LEVELS - 1) { 1414 ptp = pmap_find_ptp(pmap, va, (paddr_t)-1, level + 1); 1415 ptp->wire_count--; 1416 if (ptp->wire_count > 1) 1417 break; 1418 } 1419 } while (++level < PTP_LEVELS); 1420 } 1421 1422 /* 1423 * pmap_get_ptp: get a PTP (if there isn't one, allocate a new one) 1424 * 1425 * => pmap should NOT be pmap_kernel() 1426 * => pmap should be locked 1427 */ 1428 1429 static struct vm_page * 1430 pmap_get_ptp(struct pmap *pmap, vaddr_t va, pd_entry_t **pdes) 1431 { 1432 struct vm_page *ptp, *pptp; 1433 int i; 1434 unsigned long index; 1435 pd_entry_t *pva; 1436 paddr_t ppa, pa; 1437 struct uvm_object *obj; 1438 1439 ptp = NULL; 1440 pa = (paddr_t)-1; 1441 1442 /* 1443 * Loop through all page table levels seeing if we need to 1444 * add a new page to that level. 1445 */ 1446 for (i = PTP_LEVELS; i > 1; i--) { 1447 /* 1448 * Save values from previous round. 1449 */ 1450 pptp = ptp; 1451 ppa = pa; 1452 1453 index = pl_i(va, i); 1454 pva = pdes[i - 2]; 1455 1456 if (pmap_valid_entry(pva[index])) { 1457 ppa = pva[index] & PG_FRAME; 1458 ptp = NULL; 1459 continue; 1460 } 1461 1462 obj = &pmap->pm_obj[i-2]; 1463 /* 1464 * XXX pm_obj[0] is pm_lock, which is already locked. 1465 */ 1466 if (i != 2) 1467 mutex_enter(&obj->vmobjlock); 1468 ptp = uvm_pagealloc(obj, ptp_va2o(va, i - 1), NULL, 1469 UVM_PGA_USERESERVE|UVM_PGA_ZERO); 1470 if (i != 2) 1471 mutex_exit(&obj->vmobjlock); 1472 1473 if (ptp == NULL) 1474 return NULL; 1475 1476 ptp->flags &= ~PG_BUSY; /* never busy */ 1477 ptp->wire_count = 1; 1478 pmap->pm_ptphint[i - 2] = ptp; 1479 pa = VM_PAGE_TO_PHYS(ptp); 1480 pva[index] = (pd_entry_t) (pa | PG_u | PG_RW | PG_V); 1481 pmap->pm_stats.resident_count++; 1482 /* 1483 * If we're not in the top level, increase the 1484 * wire count of the parent page. 
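		 * (a PTP's wire_count is kept at 1 for the page itself
		 * plus one for every valid entry it currently holds,
		 * so pmap_free_ptp() above may free it once the count
		 * drops back to 1.)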
1485 */ 1486 if (i < PTP_LEVELS) { 1487 if (pptp == NULL) 1488 pptp = pmap_find_ptp(pmap, va, ppa, i); 1489 #ifdef DIAGNOSTIC 1490 if (pptp == NULL) 1491 panic("pde page disappeared"); 1492 #endif 1493 pptp->wire_count++; 1494 } 1495 } 1496 1497 /* 1498 * ptp is not NULL if we just allocated a new ptp. If it's 1499 * still NULL, we must look up the existing one. 1500 */ 1501 if (ptp == NULL) { 1502 ptp = pmap_find_ptp(pmap, va, ppa, 1); 1503 #ifdef DIAGNOSTIC 1504 if (ptp == NULL) { 1505 printf("va %lx ppa %lx\n", (unsigned long)va, 1506 (unsigned long)ppa); 1507 panic("pmap_get_ptp: unmanaged user PTP"); 1508 } 1509 #endif 1510 } 1511 1512 pmap->pm_ptphint[0] = ptp; 1513 return(ptp); 1514 } 1515 1516 /* 1517 * p m a p l i f e c y c l e f u n c t i o n s 1518 */ 1519 1520 /* 1521 * pmap_pdp_ctor: constructor for the PDP cache. 1522 */ 1523 1524 int 1525 pmap_pdp_ctor(void *arg, void *object, int flags) 1526 { 1527 pd_entry_t *pdir = object; 1528 paddr_t pdirpa = 0; /* XXX: GCC */ 1529 int npde; 1530 1531 /* 1532 * NOTE: The `pmap_lock' is held when the PDP is allocated. 1533 */ 1534 1535 /* fetch the physical address of the page directory. */ 1536 (void) pmap_extract(pmap_kernel(), (vaddr_t) pdir, &pdirpa); 1537 1538 /* zero init area */ 1539 memset(pdir, 0, PDIR_SLOT_PTE * sizeof(pd_entry_t)); 1540 1541 /* put in recursive PDE to map the PTEs */ 1542 pdir[PDIR_SLOT_PTE] = pdirpa | PG_V | PG_KW; 1543 1544 npde = nkptp[PTP_LEVELS - 1]; 1545 1546 /* put in kernel VM PDEs */ 1547 memcpy(&pdir[PDIR_SLOT_KERN], &PDP_BASE[PDIR_SLOT_KERN], 1548 npde * sizeof(pd_entry_t)); 1549 1550 /* zero the rest */ 1551 memset(&pdir[PDIR_SLOT_KERN + npde], 0, 1552 (NTOPLEVEL_PDES - (PDIR_SLOT_KERN + npde)) * sizeof(pd_entry_t)); 1553 1554 if (VM_MIN_KERNEL_ADDRESS != KERNBASE) { 1555 int idx = pl_i(KERNBASE, PTP_LEVELS); 1556 1557 pdir[idx] = PDP_BASE[idx]; 1558 } 1559 1560 return (0); 1561 } 1562 1563 /* 1564 * pmap_create: create a pmap 1565 * 1566 * => note: old pmap interface took a "size" args which allowed for 1567 * the creation of "software only" pmaps (not in bsd). 1568 */ 1569 1570 struct pmap * 1571 pmap_create(void) 1572 { 1573 struct pmap *pmap; 1574 int i; 1575 1576 pmap = pool_cache_get(&pmap_cache, PR_WAITOK); 1577 1578 /* init uvm_object */ 1579 for (i = 0; i < PTP_LEVELS - 1; i++) { 1580 UVM_OBJ_INIT(&pmap->pm_obj[i], NULL, 1); 1581 pmap->pm_ptphint[i] = NULL; 1582 } 1583 pmap->pm_stats.wired_count = 0; 1584 pmap->pm_stats.resident_count = 1; /* count the PDP allocd below */ 1585 #if !defined(__x86_64__) 1586 pmap->pm_hiexec = 0; 1587 #endif /* !defined(__x86_64__) */ 1588 pmap->pm_flags = 0; 1589 pmap->pm_cpus = 0; 1590 pmap->pm_kernel_cpus = 0; 1591 1592 /* init the LDT */ 1593 pmap->pm_ldt = NULL; 1594 pmap->pm_ldt_len = 0; 1595 pmap->pm_ldt_sel = GSYSSEL(GLDT_SEL, SEL_KPL); 1596 1597 /* allocate PDP */ 1598 try_again: 1599 pmap->pm_pdir = pool_cache_get(&pmap_pdp_cache, PR_WAITOK); 1600 1601 mutex_enter(&pmaps_lock); 1602 1603 if (pmap->pm_pdir[PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1] - 1] == 0) { 1604 mutex_exit(&pmaps_lock); 1605 pool_cache_destruct_object(&pmap_pdp_cache, pmap->pm_pdir); 1606 goto try_again; 1607 } 1608 1609 pmap->pm_pdirpa = pmap->pm_pdir[PDIR_SLOT_PTE] & PG_FRAME; 1610 1611 LIST_INSERT_HEAD(&pmaps, pmap, pm_list); 1612 1613 mutex_exit(&pmaps_lock); 1614 1615 return (pmap); 1616 } 1617 1618 /* 1619 * pmap_destroy: drop reference count on pmap. free pmap if 1620 * reference count goes to zero. 
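 *
 * reference lifecycle, as an illustrative sketch only:
 *
 *	pmap = pmap_create();		reference count is 1
 *	pmap_reference(pmap);		now 2 (e.g. taken by pmap_load())
 *	pmap_destroy(pmap);		back to 1, nothing is freed
 *	pmap_destroy(pmap);		0: the pmap's resources are freed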
1621 */ 1622 1623 void 1624 pmap_destroy(struct pmap *pmap) 1625 { 1626 int i; 1627 #ifdef DIAGNOSTIC 1628 struct cpu_info *ci; 1629 CPU_INFO_ITERATOR cii; 1630 #endif /* DIAGNOSTIC */ 1631 1632 /* 1633 * drop reference count 1634 */ 1635 1636 if (atomic_dec_uint_nv((unsigned *)&pmap->pm_obj[0].uo_refs) > 0) { 1637 return; 1638 } 1639 1640 #ifdef DIAGNOSTIC 1641 for (CPU_INFO_FOREACH(cii, ci)) 1642 if (ci->ci_pmap == pmap) 1643 panic("destroying pmap being used"); 1644 #endif /* DIAGNOSTIC */ 1645 1646 /* 1647 * reference count is zero, free pmap resources and then free pmap. 1648 */ 1649 1650 /* 1651 * remove it from global list of pmaps 1652 */ 1653 1654 KERNEL_LOCK(1, NULL); 1655 1656 mutex_enter(&pmaps_lock); 1657 LIST_REMOVE(pmap, pm_list); 1658 mutex_exit(&pmaps_lock); 1659 1660 /* 1661 * destroyed pmap shouldn't have remaining PTPs 1662 */ 1663 1664 for (i = 0; i < PTP_LEVELS - 1; i++) { 1665 KASSERT(pmap->pm_obj[i].uo_npages == 0); 1666 KASSERT(TAILQ_EMPTY(&pmap->pm_obj[i].memq)); 1667 } 1668 1669 /* 1670 * MULTIPROCESSOR -- no need to flush out of other processors' 1671 * APTE space because we do that in pmap_unmap_ptes(). 1672 */ 1673 pool_cache_put(&pmap_pdp_cache, pmap->pm_pdir); 1674 1675 #ifdef USER_LDT 1676 if (pmap->pm_flags & PMF_USER_LDT) { 1677 /* 1678 * no need to switch the LDT; this address space is gone, 1679 * nothing is using it. 1680 * 1681 * No need to lock the pmap for ldt_free (or anything else), 1682 * we're the last one to use it. 1683 */ 1684 ldt_free(pmap->pm_ldt_sel); 1685 uvm_km_free(kernel_map, (vaddr_t)pmap->pm_ldt, 1686 pmap->pm_ldt_len * sizeof(union descriptor), UVM_KMF_WIRED); 1687 } 1688 #endif 1689 1690 for (i = 0; i < PTP_LEVELS - 1; i++) 1691 mutex_destroy(&pmap->pm_obj[i].vmobjlock); 1692 pool_cache_put(&pmap_cache, pmap); 1693 1694 KERNEL_UNLOCK_ONE(NULL); 1695 } 1696 1697 /* 1698 * Add a reference to the specified pmap. 1699 */ 1700 1701 inline void 1702 pmap_reference(struct pmap *pmap) 1703 { 1704 1705 atomic_inc_uint((unsigned *)&pmap->pm_obj[0].uo_refs); 1706 } 1707 1708 #if defined(PMAP_FORK) 1709 /* 1710 * pmap_fork: perform any necessary data structure manipulation when 1711 * a VM space is forked. 1712 */ 1713 1714 void 1715 pmap_fork(struct pmap *pmap1, struct pmap *pmap2) 1716 { 1717 #ifdef USER_LDT 1718 union descriptor *new_ldt; 1719 size_t len; 1720 int sel; 1721 1722 retry: 1723 if (pmap1->pm_flags & PMF_USER_LDT) { 1724 len = pmap1->pm_ldt_len * sizeof(union descriptor); 1725 new_ldt = (union descriptor *)uvm_km_alloc(kernel_map, 1726 len, 0, UVM_KMF_WIRED); 1727 sel = ldt_alloc(new_ldt, len); 1728 } else { 1729 len = -1; 1730 new_ldt = NULL; 1731 sel = -1; 1732 } 1733 1734 if ((uintptr_t) pmap1 < (uintptr_t) pmap2) { 1735 mutex_enter(&pmap1->pm_obj.vmobjlock); 1736 mutex_enter(&pmap2->pm_obj.vmobjlock); 1737 } else { 1738 mutex_enter(&pmap2->pm_obj.vmobjlock); 1739 mutex_enter(&pmap1->pm_obj.vmobjlock); 1740 } 1741 1742 /* Copy the LDT, if necessary. 
*/ 1743 if (pmap1->pm_flags & PMF_USER_LDT) { 1744 if (len != pmap1->pm_ldt_len * sizeof(union descriptor)) { 1745 mutex_exit(&pmap2->pm_obj.vmobjlock); 1746 mutex_exit(&pmap1->pm_obj.vmobjlock); 1747 if (len != -1) { 1748 ldt_free(sel); 1749 uvm_km_free(kernel_map, (vaddr_t)new_ldt, 1750 len, UVM_KMF_WIRED); 1751 } 1752 goto retry; 1753 } 1754 1755 memcpy(new_ldt, pmap1->pm_ldt, len); 1756 pmap2->pm_ldt = new_ldt; 1757 pmap2->pm_ldt_len = pmap1->pm_ldt_len; 1758 pmap2->pm_flags |= PMF_USER_LDT; 1759 pmap2->pm_ldt_sel = sel; 1760 len = -1; 1761 } 1762 1763 mutex_exit(&pmap2->pm_obj.vmobjlock); 1764 mutex_exit(&pmap1->pm_obj.vmobjlock); 1765 1766 if (len != -1) { 1767 ldt_free(sel); 1768 uvm_km_free(kernel_map, (vaddr_t)new_ldt, len, 1769 UVM_KMF_WIRED); 1770 } 1771 #endif /* USER_LDT */ 1772 } 1773 #endif /* PMAP_FORK */ 1774 1775 #ifdef USER_LDT 1776 /* 1777 * pmap_ldt_cleanup: if the pmap has a local LDT, deallocate it, and 1778 * restore the default. 1779 */ 1780 1781 void 1782 pmap_ldt_cleanup(struct lwp *l) 1783 { 1784 struct pcb *pcb = &l->l_addr->u_pcb; 1785 pmap_t pmap = l->l_proc->p_vmspace->vm_map.pmap; 1786 union descriptor *old_ldt = NULL; 1787 size_t len = 0; 1788 int sel = -1; 1789 1790 mutex_enter(&pmap->pm_lock); 1791 1792 if (pmap->pm_flags & PMF_USER_LDT) { 1793 sel = pmap->pm_ldt_sel; 1794 pmap->pm_ldt_sel = GSYSSEL(GLDT_SEL, SEL_KPL); 1795 pcb->pcb_ldt_sel = pmap->pm_ldt_sel; 1796 if (l == curlwp) 1797 lldt(pcb->pcb_ldt_sel); 1798 old_ldt = pmap->pm_ldt; 1799 len = pmap->pm_ldt_len * sizeof(union descriptor); 1800 pmap->pm_ldt = NULL; 1801 pmap->pm_ldt_len = 0; 1802 pmap->pm_flags &= ~PMF_USER_LDT; 1803 } 1804 1805 mutex_exit(&pmap->pm_lock); 1806 1807 if (sel != -1) 1808 ldt_free(sel); 1809 if (old_ldt != NULL) 1810 uvm_km_free(kernel_map, (vaddr_t)old_ldt, len, UVM_KMF_WIRED); 1811 } 1812 #endif /* USER_LDT */ 1813 1814 /* 1815 * pmap_activate: activate a process' pmap 1816 * 1817 * => must be called with kernel preemption disabled 1818 * => if lwp is the curlwp, then set ci_want_pmapload so that 1819 * actual MMU context switch will be done by pmap_load() later 1820 */ 1821 1822 void 1823 pmap_activate(struct lwp *l) 1824 { 1825 struct cpu_info *ci; 1826 struct pmap *pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map); 1827 1828 ci = curcpu(); 1829 1830 if (l == ci->ci_curlwp) { 1831 struct pcb *pcb; 1832 1833 KASSERT(ci->ci_want_pmapload == 0); 1834 KASSERT(ci->ci_tlbstate != TLBSTATE_VALID); 1835 #ifdef KSTACK_CHECK_DR0 1836 /* 1837 * setup breakpoint on the top of stack 1838 */ 1839 if (l == &lwp0) 1840 dr0(0, 0, 0, 0); 1841 else 1842 dr0(KSTACK_LOWEST_ADDR(l), 1, 3, 1); 1843 #endif 1844 1845 /* 1846 * no need to switch to kernel vmspace because 1847 * it's a subset of any vmspace. 1848 */ 1849 1850 if (pmap == pmap_kernel()) { 1851 ci->ci_want_pmapload = 0; 1852 return; 1853 } 1854 1855 pcb = &l->l_addr->u_pcb; 1856 pcb->pcb_ldt_sel = pmap->pm_ldt_sel; 1857 1858 ci->ci_want_pmapload = 1; 1859 1860 #if defined(__x86_64__) 1861 if (pcb->pcb_flags & PCB_GS64) 1862 wrmsr(MSR_KERNELGSBASE, pcb->pcb_gs); 1863 if (pcb->pcb_flags & PCB_FS64) 1864 wrmsr(MSR_FSBASE, pcb->pcb_fs); 1865 #endif /* defined(__x86_64__) */ 1866 } 1867 } 1868 1869 /* 1870 * pmap_reactivate: try to regain reference to the pmap. 
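 *	(returns true if this CPU's lazy reference was still present and
 *	the cached TLB entries can be trusted; false if the caller must
 *	flush the TLB itself.)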
1871 * 1872 * => must be called with kernel preemption disabled 1873 */ 1874 1875 static bool 1876 pmap_reactivate(struct pmap *pmap) 1877 { 1878 struct cpu_info *ci; 1879 uint32_t cpumask; 1880 bool result; 1881 uint32_t oldcpus; 1882 1883 ci = curcpu(); 1884 cpumask = ci->ci_cpumask; 1885 1886 KASSERT(pmap->pm_pdirpa == rcr3()); 1887 1888 /* 1889 * if we still have a lazy reference to this pmap, 1890 * we can assume that there was no tlb shootdown 1891 * for this pmap in the meantime. 1892 * 1893 * the order of events here is important as we must 1894 * synchronize with TLB shootdown interrupts. declare 1895 * interest in invalidations (TLBSTATE_VALID) and then 1896 * check the cpumask, which the IPIs can change only 1897 * when the state is !TLBSTATE_VALID. 1898 */ 1899 1900 ci->ci_tlbstate = TLBSTATE_VALID; 1901 oldcpus = pmap->pm_cpus; 1902 x86_atomic_setbits_l(&pmap->pm_cpus, cpumask); 1903 KASSERT((pmap->pm_kernel_cpus & cpumask) != 0); 1904 if (oldcpus & cpumask) { 1905 /* got it */ 1906 result = true; 1907 } else { 1908 result = false; 1909 } 1910 1911 return result; 1912 } 1913 1914 /* 1915 * pmap_load: actually switch pmap. (fill in %cr3 and LDT info) 1916 */ 1917 1918 void 1919 pmap_load(void) 1920 { 1921 struct cpu_info *ci; 1922 uint32_t cpumask; 1923 struct pmap *pmap; 1924 struct pmap *oldpmap; 1925 struct lwp *l; 1926 struct pcb *pcb; 1927 uint64_t ncsw; 1928 1929 crit_enter(); 1930 KASSERT(curcpu()->ci_want_pmapload); 1931 retry: 1932 ci = curcpu(); 1933 cpumask = ci->ci_cpumask; 1934 1935 /* should be able to take ipis. */ 1936 KASSERT(ci->ci_ilevel < IPL_IPI); 1937 KASSERT((x86_read_psl() & PSL_I) != 0); 1938 1939 l = ci->ci_curlwp; 1940 KASSERT(l != NULL); 1941 pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map); 1942 KASSERT(pmap != pmap_kernel()); 1943 oldpmap = ci->ci_pmap; 1944 1945 pcb = &l->l_addr->u_pcb; 1946 /* loaded by pmap_activate */ 1947 KASSERT(pcb->pcb_ldt_sel == pmap->pm_ldt_sel); 1948 1949 if (pmap == oldpmap) { 1950 if (!pmap_reactivate(pmap)) { 1951 1952 /* 1953 * pmap has been changed during deactivated. 1954 * our tlb may be stale. 1955 */ 1956 1957 tlbflush(); 1958 } 1959 1960 ci->ci_want_pmapload = 0; 1961 crit_exit(); 1962 return; 1963 } 1964 1965 /* 1966 * grab a reference to the new pmap. 1967 */ 1968 1969 pmap_reference(pmap); 1970 1971 /* 1972 * actually switch pmap. 1973 */ 1974 1975 x86_atomic_clearbits_l(&oldpmap->pm_cpus, cpumask); 1976 x86_atomic_clearbits_l(&oldpmap->pm_kernel_cpus, cpumask); 1977 1978 KASSERT(oldpmap->pm_pdirpa == rcr3()); 1979 KASSERT((pmap->pm_cpus & cpumask) == 0); 1980 KASSERT((pmap->pm_kernel_cpus & cpumask) == 0); 1981 1982 /* 1983 * mark the pmap in use by this processor. again we must 1984 * synchronize with TLB shootdown interrupts, so set the 1985 * state VALID first, then register us for shootdown events 1986 * on this pmap. 1987 */ 1988 1989 ci->ci_tlbstate = TLBSTATE_VALID; 1990 x86_atomic_setbits_l(&pmap->pm_cpus, cpumask); 1991 x86_atomic_setbits_l(&pmap->pm_kernel_cpus, cpumask); 1992 ci->ci_pmap = pmap; 1993 1994 /* 1995 * update tss. now that we have registered for invalidations 1996 * from other CPUs, we're good to load the page tables. 1997 */ 1998 1999 lldt(pcb->pcb_ldt_sel); 2000 pcb->pcb_cr3 = pmap->pm_pdirpa; 2001 lcr3(pcb->pcb_cr3); 2002 2003 ci->ci_want_pmapload = 0; 2004 2005 /* 2006 * we're now running with the new pmap. drop the reference 2007 * to the old pmap. if we block, we need to go around again. 
2008 */ 2009 2010 ncsw = l->l_ncsw; 2011 pmap_destroy(oldpmap); 2012 if (l->l_ncsw != ncsw) { 2013 goto retry; 2014 } 2015 2016 crit_exit(); 2017 } 2018 2019 /* 2020 * pmap_deactivate: deactivate a process' pmap 2021 * 2022 * => must be called with kernel preemption disabled (high SPL is enough) 2023 */ 2024 2025 void 2026 pmap_deactivate(struct lwp *l) 2027 { 2028 struct pmap *pmap; 2029 struct cpu_info *ci; 2030 2031 if (l != curlwp) { 2032 return; 2033 } 2034 2035 /* 2036 * wait for pending TLB shootdowns to complete. necessary 2037 * because TLB shootdown state is per-CPU, and the LWP may 2038 * be coming off the CPU before it has a chance to call 2039 * pmap_update(). 2040 */ 2041 pmap_tlb_shootwait(); 2042 2043 ci = curcpu(); 2044 2045 if (ci->ci_want_pmapload) { 2046 KASSERT(vm_map_pmap(&l->l_proc->p_vmspace->vm_map) 2047 != pmap_kernel()); 2048 KASSERT(vm_map_pmap(&l->l_proc->p_vmspace->vm_map) 2049 != ci->ci_pmap || ci->ci_tlbstate != TLBSTATE_VALID); 2050 2051 /* 2052 * userspace has not been touched. 2053 * nothing to do here. 2054 */ 2055 2056 ci->ci_want_pmapload = 0; 2057 return; 2058 } 2059 2060 pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map); 2061 2062 if (pmap == pmap_kernel()) { 2063 return; 2064 } 2065 2066 KASSERT(pmap->pm_pdirpa == rcr3()); 2067 KASSERT(ci->ci_pmap == pmap); 2068 2069 /* 2070 * we aren't interested in TLB invalidations for this pmap, 2071 * at least for the time being. 2072 */ 2073 2074 KASSERT(ci->ci_tlbstate == TLBSTATE_VALID); 2075 ci->ci_tlbstate = TLBSTATE_LAZY; 2076 } 2077 2078 /* 2079 * end of lifecycle functions 2080 */ 2081 2082 /* 2083 * some misc. functions 2084 */ 2085 2086 static int 2087 pmap_pdes_invalid(vaddr_t va, pd_entry_t **pdes, pd_entry_t *lastpde) 2088 { 2089 int i; 2090 unsigned long index; 2091 pd_entry_t pde; 2092 2093 for (i = PTP_LEVELS; i > 1; i--) { 2094 index = pl_i(va, i); 2095 pde = pdes[i - 2][index]; 2096 if ((pde & PG_V) == 0) 2097 return i; 2098 } 2099 if (lastpde != NULL) 2100 *lastpde = pde; 2101 return 0; 2102 } 2103 2104 /* 2105 * pmap_extract: extract a PA for the given VA 2106 */ 2107 2108 bool 2109 pmap_extract(struct pmap *pmap, vaddr_t va, paddr_t *pap) 2110 { 2111 pt_entry_t *ptes, pte; 2112 pd_entry_t pde, **pdes; 2113 struct pmap *pmap2; 2114 2115 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); 2116 if (!pmap_pdes_valid(va, pdes, &pde)) { 2117 pmap_unmap_ptes(pmap, pmap2); 2118 return false; 2119 } 2120 pte = ptes[pl1_i(va)]; 2121 pmap_unmap_ptes(pmap, pmap2); 2122 2123 if (pde & PG_PS) { 2124 if (pap != NULL) 2125 *pap = (pde & PG_LGFRAME) | (va & (NBPD_L2 - 1)); 2126 return (true); 2127 } 2128 2129 if (__predict_true((pte & PG_V) != 0)) { 2130 if (pap != NULL) 2131 *pap = (pte & PG_FRAME) | (va & (NBPD_L1 - 1)); 2132 return (true); 2133 } 2134 2135 return false; 2136 } 2137 2138 2139 /* 2140 * vtophys: virtual address to physical address. For use by 2141 * machine-dependent code only. 2142 */ 2143 2144 paddr_t 2145 vtophys(vaddr_t va) 2146 { 2147 paddr_t pa; 2148 2149 if (pmap_extract(pmap_kernel(), va, &pa) == true) 2150 return (pa); 2151 return (0); 2152 } 2153 2154 2155 /* 2156 * pmap_virtual_space: used during bootup [pmap_steal_memory] to 2157 * determine the bounds of the kernel virtual addess space. 2158 */ 2159 2160 void 2161 pmap_virtual_space(vaddr_t *startp, vaddr_t *endp) 2162 { 2163 *startp = virtual_avail; 2164 *endp = virtual_end; 2165 } 2166 2167 /* 2168 * pmap_map: map a range of PAs into kvm. 2169 * 2170 * => used during crash dump 2171 * => XXX: pmap_map() should be phased out? 
2172 */ 2173 2174 vaddr_t 2175 pmap_map(vaddr_t va, paddr_t spa, paddr_t epa, vm_prot_t prot) 2176 { 2177 while (spa < epa) { 2178 pmap_enter(pmap_kernel(), va, spa, prot, 0); 2179 va += PAGE_SIZE; 2180 spa += PAGE_SIZE; 2181 } 2182 pmap_update(pmap_kernel()); 2183 return va; 2184 } 2185 2186 /* 2187 * pmap_zero_page: zero a page 2188 */ 2189 2190 void 2191 pmap_zero_page(paddr_t pa) 2192 { 2193 #ifdef MULTIPROCESSOR 2194 int id = cpu_number(); 2195 #endif 2196 pt_entry_t *zpte = PTESLEW(zero_pte, id); 2197 void *zerova = VASLEW(zerop, id); 2198 2199 #ifdef DIAGNOSTIC 2200 if (*zpte) 2201 panic("pmap_zero_page: lock botch"); 2202 #endif 2203 2204 *zpte = (pa & PG_FRAME) | PG_V | PG_RW | PG_M | PG_U; /* map in */ 2205 pmap_update_pg((vaddr_t)zerova); /* flush TLB */ 2206 2207 if (cpu_feature & CPUID_SSE2) 2208 sse2_zero_page(zerova); 2209 else 2210 memset(zerova, 0, PAGE_SIZE); 2211 2212 #ifdef DIAGNOSTIC 2213 *zpte = 0; /* zap! */ 2214 #endif 2215 } 2216 2217 /* 2218 * pmap_pagezeroidle: the same, for the idle loop page zero'er. 2219 * Returns true if the page was zero'd, false if we aborted for 2220 * some reason. 2221 */ 2222 2223 bool 2224 pmap_pageidlezero(paddr_t pa) 2225 { 2226 2227 pmap_zero_page(pa); 2228 return true; 2229 } 2230 2231 /* 2232 * pmap_copy_page: copy a page 2233 */ 2234 2235 void 2236 pmap_copy_page(paddr_t srcpa, paddr_t dstpa) 2237 { 2238 #ifdef MULTIPROCESSOR 2239 int id = cpu_number(); 2240 #endif 2241 pt_entry_t *spte = PTESLEW(csrc_pte,id); 2242 pt_entry_t *dpte = PTESLEW(cdst_pte,id); 2243 void *csrcva = VASLEW(csrcp, id); 2244 void *cdstva = VASLEW(cdstp, id); 2245 2246 #ifdef DIAGNOSTIC 2247 if (*spte || *dpte) 2248 panic("pmap_copy_page: lock botch"); 2249 #endif 2250 2251 *spte = (srcpa & PG_FRAME) | PG_V | PG_RW | PG_U; 2252 *dpte = (dstpa & PG_FRAME) | PG_V | PG_RW | PG_M | PG_U; 2253 pmap_update_2pg((vaddr_t)csrcva, (vaddr_t)cdstva); 2254 if (cpu_feature & CPUID_SSE2) 2255 sse2_copy_page(csrcva, cdstva); 2256 else 2257 memcpy(cdstva, csrcva, PAGE_SIZE); 2258 #ifdef DIAGNOSTIC 2259 *spte = *dpte = 0; /* zap! */ 2260 #endif 2261 } 2262 2263 /* 2264 * p m a p r e m o v e f u n c t i o n s 2265 * 2266 * functions that remove mappings 2267 */ 2268 2269 /* 2270 * pmap_remove_ptes: remove PTEs from a PTP 2271 * 2272 * => must have proper locking on pmap_master_lock 2273 * => caller must hold pmap's lock 2274 * => PTP must be mapped into KVA 2275 * => PTP should be null if pmap == pmap_kernel() 2276 * => must be called with kernel preemption disabled 2277 * => returns composite pte if at least one page should be shot down 2278 */ 2279 2280 static pt_entry_t 2281 pmap_remove_ptes(struct pmap *pmap, struct vm_page *ptp, vaddr_t ptpva, 2282 vaddr_t startva, vaddr_t endva, int flags, 2283 struct pv_entry **pv_tofree) 2284 { 2285 struct pv_entry *pve; 2286 pt_entry_t *pte = (pt_entry_t *) ptpva; 2287 pt_entry_t opte, xpte = 0; 2288 2289 /* 2290 * note that ptpva points to the PTE that maps startva. this may 2291 * or may not be the first PTE in the PTP. 2292 * 2293 * we loop through the PTP while there are still PTEs to look at 2294 * and the wire_count is greater than 1 (because we use the wire_count 2295 * to keep track of the number of real PTEs in the PTP). 
2296 */ 2297 2298 for (/*null*/; startva < endva && (ptp == NULL || ptp->wire_count > 1) 2299 ; pte++, startva += PAGE_SIZE) { 2300 struct vm_page *pg; 2301 struct vm_page_md *mdpg; 2302 2303 if (!pmap_valid_entry(*pte)) 2304 continue; /* VA not mapped */ 2305 if ((flags & PMAP_REMOVE_SKIPWIRED) && (*pte & PG_W)) { 2306 continue; 2307 } 2308 2309 /* atomically save the old PTE and zap! it */ 2310 opte = pmap_pte_set(pte, 0); 2311 pmap_exec_account(pmap, startva, opte, 0); 2312 KASSERT(pmap_valid_entry(opte)); 2313 2314 if (opte & PG_W) 2315 pmap->pm_stats.wired_count--; 2316 pmap->pm_stats.resident_count--; 2317 xpte |= opte; 2318 2319 if (ptp) { 2320 ptp->wire_count--; /* dropping a PTE */ 2321 /* Make sure that the PDE is flushed */ 2322 if (ptp->wire_count <= 1) 2323 xpte |= PG_U; 2324 } 2325 2326 /* 2327 * if we are not on a pv_head list we are done. 2328 */ 2329 2330 if ((opte & PG_PVLIST) == 0) { 2331 #ifdef DIAGNOSTIC 2332 if (PHYS_TO_VM_PAGE(opte & PG_FRAME) != NULL) 2333 panic("pmap_remove_ptes: managed page without " 2334 "PG_PVLIST for 0x%lx", startva); 2335 #endif 2336 continue; 2337 } 2338 2339 pg = PHYS_TO_VM_PAGE(opte & PG_FRAME); 2340 #ifdef DIAGNOSTIC 2341 if (pg == NULL) 2342 panic("pmap_remove_ptes: unmanaged page marked " 2343 "PG_PVLIST, va = 0x%lx, pa = 0x%lx", 2344 startva, (u_long)(opte & PG_FRAME)); 2345 #endif 2346 mdpg = &pg->mdpage; 2347 2348 /* sync R/M bits */ 2349 mutex_spin_enter(&mdpg->mp_pvhead.pvh_lock); 2350 mdpg->mp_attrs |= (opte & (PG_U|PG_M)); 2351 pve = pmap_remove_pv(&mdpg->mp_pvhead, pmap, startva); 2352 mutex_spin_exit(&mdpg->mp_pvhead.pvh_lock); 2353 2354 if (pve) { 2355 SPLAY_RIGHT(pve, pv_node) = *pv_tofree; 2356 *pv_tofree = pve; 2357 } 2358 2359 /* end of "for" loop: time for next pte */ 2360 } 2361 2362 return xpte; 2363 } 2364 2365 2366 /* 2367 * pmap_remove_pte: remove a single PTE from a PTP 2368 * 2369 * => must have proper locking on pmap_master_lock 2370 * => caller must hold pmap's lock 2371 * => PTP must be mapped into KVA 2372 * => PTP should be null if pmap == pmap_kernel() 2373 * => returns true if we removed a mapping 2374 * => must be called with kernel preemption disabled 2375 */ 2376 2377 static bool 2378 pmap_remove_pte(struct pmap *pmap, struct vm_page *ptp, pt_entry_t *pte, 2379 vaddr_t va, int flags, struct pv_entry **pv_tofree) 2380 { 2381 pt_entry_t opte; 2382 struct pv_entry *pve; 2383 struct vm_page *pg; 2384 struct vm_page_md *mdpg; 2385 2386 if (!pmap_valid_entry(*pte)) 2387 return(false); /* VA not mapped */ 2388 if ((flags & PMAP_REMOVE_SKIPWIRED) && (*pte & PG_W)) { 2389 return(false); 2390 } 2391 2392 /* atomically save the old PTE and zap! it */ 2393 opte = pmap_pte_set(pte, 0); 2394 pmap_exec_account(pmap, va, opte, 0); 2395 KASSERT(pmap_valid_entry(opte)); 2396 2397 if (opte & PG_W) 2398 pmap->pm_stats.wired_count--; 2399 pmap->pm_stats.resident_count--; 2400 2401 if (opte & PG_U) 2402 pmap_tlb_shootdown(pmap, va, 0, opte); 2403 2404 if (ptp) { 2405 ptp->wire_count--; /* dropping a PTE */ 2406 /* Make sure that the PDE is flushed */ 2407 if ((ptp->wire_count <= 1) && !(opte & PG_U)) 2408 pmap_tlb_shootdown(pmap, va, 0, opte); 2409 } 2410 2411 /* 2412 * if we are not on a pv_head list we are done. 
2413 */ 2414 2415 if ((opte & PG_PVLIST) == 0) { 2416 #ifdef DIAGNOSTIC 2417 if (PHYS_TO_VM_PAGE(opte & PG_FRAME) != NULL) 2418 panic("pmap_remove_pte: managed page without " 2419 "PG_PVLIST for 0x%lx", va); 2420 #endif 2421 return(true); 2422 } 2423 2424 pg = PHYS_TO_VM_PAGE(opte & PG_FRAME); 2425 #ifdef DIAGNOSTIC 2426 if (pg == NULL) 2427 panic("pmap_remove_pte: unmanaged page marked " 2428 "PG_PVLIST, va = 0x%lx, pa = 0x%lx", va, 2429 (u_long)(opte & PG_FRAME)); 2430 #endif 2431 mdpg = &pg->mdpage; 2432 2433 /* sync R/M bits */ 2434 mutex_spin_enter(&mdpg->mp_pvhead.pvh_lock); 2435 mdpg->mp_attrs |= (opte & (PG_U|PG_M)); 2436 pve = pmap_remove_pv(&mdpg->mp_pvhead, pmap, va); 2437 mutex_spin_exit(&mdpg->mp_pvhead.pvh_lock); 2438 2439 if (pve) { 2440 SPLAY_RIGHT(pve, pv_node) = *pv_tofree; 2441 *pv_tofree = pve; 2442 } 2443 2444 return(true); 2445 } 2446 2447 /* 2448 * pmap_remove: top level mapping removal function 2449 * 2450 * => caller should not be holding any pmap locks 2451 */ 2452 2453 void 2454 pmap_remove(struct pmap *pmap, vaddr_t sva, vaddr_t eva) 2455 { 2456 pmap_do_remove(pmap, sva, eva, PMAP_REMOVE_ALL); 2457 } 2458 2459 /* 2460 * pmap_do_remove: mapping removal guts 2461 * 2462 * => caller should not be holding any pmap locks 2463 */ 2464 2465 static void 2466 pmap_do_remove(struct pmap *pmap, vaddr_t sva, vaddr_t eva, int flags) 2467 { 2468 pt_entry_t *ptes, xpte = 0; 2469 pd_entry_t **pdes, pde; 2470 struct pv_entry *pv_tofree = NULL; 2471 bool result; 2472 paddr_t ptppa; 2473 vaddr_t blkendva, va = sva; 2474 struct vm_page *ptp, *empty_ptps = NULL; 2475 struct pmap *pmap2; 2476 2477 /* 2478 * we lock in the pmap => pv_head direction 2479 */ 2480 2481 rw_enter(&pmap_main_lock, RW_READER); 2482 2483 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); /* locks pmap */ 2484 2485 /* 2486 * removing one page? take shortcut function. 2487 */ 2488 2489 if (va + PAGE_SIZE == eva) { 2490 if (pmap_pdes_valid(va, pdes, &pde)) { 2491 2492 /* PA of the PTP */ 2493 ptppa = pde & PG_FRAME; 2494 2495 /* get PTP if non-kernel mapping */ 2496 if (pmap == pmap_kernel()) { 2497 /* we never free kernel PTPs */ 2498 ptp = NULL; 2499 } else { 2500 ptp = pmap_find_ptp(pmap, va, ptppa, 1); 2501 #ifdef DIAGNOSTIC 2502 if (ptp == NULL) 2503 panic("pmap_remove: unmanaged " 2504 "PTP detected"); 2505 #endif 2506 } 2507 2508 /* do it! */ 2509 result = pmap_remove_pte(pmap, ptp, 2510 &ptes[pl1_i(va)], va, flags, &pv_tofree); 2511 2512 /* 2513 * if mapping removed and the PTP is no longer 2514 * being used, free it! 2515 */ 2516 2517 if (result && ptp && ptp->wire_count <= 1) 2518 pmap_free_ptp(pmap, ptp, va, ptes, pdes, 2519 &empty_ptps); 2520 } 2521 } else for (/* null */ ; va < eva ; va = blkendva) { 2522 int lvl; 2523 2524 /* determine range of block */ 2525 blkendva = x86_round_pdr(va+1); 2526 if (blkendva > eva) 2527 blkendva = eva; 2528 2529 /* 2530 * XXXCDC: our PTE mappings should never be removed 2531 * with pmap_remove! if we allow this (and why would 2532 * we?) then we end up freeing the pmap's page 2533 * directory page (PDP) before we are finished using 2534 * it when we hit in in the recursive mapping. this 2535 * is BAD. 2536 * 2537 * long term solution is to move the PTEs out of user 2538 * address space. and into kernel address space (up 2539 * with APTE). then we can set VM_MAXUSER_ADDRESS to 2540 * be VM_MAX_ADDRESS. 
2541 */ 2542 2543 if (pl_i(va, PTP_LEVELS) == PDIR_SLOT_PTE) 2544 /* XXXCDC: ugly hack to avoid freeing PDP here */ 2545 continue; 2546 2547 lvl = pmap_pdes_invalid(va, pdes, &pde); 2548 if (lvl != 0) { 2549 /* 2550 * skip a range corresponding to an invalid pde. 2551 */ 2552 blkendva = (va & ptp_masks[lvl - 1]) + nbpd[lvl - 1]; 2553 continue; 2554 } 2555 2556 /* PA of the PTP */ 2557 ptppa = pde & PG_FRAME; 2558 2559 /* get PTP if non-kernel mapping */ 2560 if (pmap == pmap_kernel()) { 2561 /* we never free kernel PTPs */ 2562 ptp = NULL; 2563 } else { 2564 ptp = pmap_find_ptp(pmap, va, ptppa, 1); 2565 #ifdef DIAGNOSTIC 2566 if (ptp == NULL) 2567 panic("pmap_remove: unmanaged PTP " 2568 "detected"); 2569 #endif 2570 } 2571 xpte |= pmap_remove_ptes(pmap, ptp, 2572 (vaddr_t)&ptes[pl1_i(va)], va, blkendva, 2573 flags, &pv_tofree); 2574 2575 /* if PTP is no longer being used, free it! */ 2576 if (ptp && ptp->wire_count <= 1) { 2577 pmap_free_ptp(pmap, ptp, va, ptes, pdes, &empty_ptps); 2578 } 2579 if ((xpte & PG_U) != 0) 2580 pmap_tlb_shootdown(pmap, sva, eva, xpte); 2581 } 2582 pmap_tlb_shootwait(); 2583 pmap_unmap_ptes(pmap, pmap2); /* unlock pmap */ 2584 rw_exit(&pmap_main_lock); 2585 2586 /* Now we can free unused PVs and ptps */ 2587 if (pv_tofree) 2588 pmap_free_pvs(pv_tofree); 2589 for (ptp = empty_ptps; ptp != NULL; ptp = empty_ptps) { 2590 empty_ptps = ptp->mdpage.mp_link; 2591 uvm_pagefree(ptp); 2592 } 2593 } 2594 2595 /* 2596 * pmap_page_remove: remove a managed vm_page from all pmaps that map it 2597 * 2598 * => we set pv_head => pmap locking 2599 * => R/M bits are sync'd back to attrs 2600 */ 2601 2602 void 2603 pmap_page_remove(struct vm_page *pg) 2604 { 2605 struct pv_head *pvh; 2606 struct pv_entry *pve, *npve, *killlist = NULL; 2607 pt_entry_t *ptes, opte; 2608 pd_entry_t **pdes; 2609 #ifdef DIAGNOSTIC 2610 pd_entry_t pde; 2611 #endif 2612 struct vm_page *empty_ptps = NULL; 2613 struct vm_page *ptp; 2614 struct pmap *pmap2; 2615 2616 #ifdef DIAGNOSTIC 2617 int bank, off; 2618 2619 bank = vm_physseg_find(atop(VM_PAGE_TO_PHYS(pg)), &off); 2620 if (bank == -1) 2621 panic("pmap_page_remove: unmanaged page?"); 2622 #endif 2623 2624 pvh = &pg->mdpage.mp_pvhead; 2625 if (SPLAY_ROOT(&pvh->pvh_root) == NULL) { 2626 return; 2627 } 2628 2629 /* set pv_head => pmap locking */ 2630 rw_enter(&pmap_main_lock, RW_WRITER); 2631 2632 for (pve = SPLAY_MIN(pvtree, &pvh->pvh_root); pve != NULL; pve = npve) { 2633 npve = SPLAY_NEXT(pvtree, &pvh->pvh_root, pve); 2634 2635 /* locks pmap */ 2636 pmap_map_ptes(pve->pv_pmap, &pmap2, &ptes, &pdes); 2637 2638 #ifdef DIAGNOSTIC 2639 if (pve->pv_ptp && pmap_pdes_valid(pve->pv_va, pdes, &pde) && 2640 (pde & PG_FRAME) != VM_PAGE_TO_PHYS(pve->pv_ptp)) { 2641 printf("pmap_page_remove: pg=%p: va=%lx, pv_ptp=%p\n", 2642 pg, pve->pv_va, pve->pv_ptp); 2643 printf("pmap_page_remove: PTP's phys addr: " 2644 "actual=%lx, recorded=%lx\n", 2645 (unsigned long)(pde & PG_FRAME), 2646 (unsigned long)VM_PAGE_TO_PHYS(pve->pv_ptp)); 2647 panic("pmap_page_remove: mapped managed page has " 2648 "invalid pv_ptp field"); 2649 } 2650 #endif 2651 2652 /* atomically save the old PTE and zap! 
it */ 2653 opte = pmap_pte_set(&ptes[pl1_i(pve->pv_va)], 0); 2654 KASSERT(pmap_valid_entry(opte)); 2655 KDASSERT((opte & PG_FRAME) == VM_PAGE_TO_PHYS(pg)); 2656 2657 if (opte & PG_W) 2658 pve->pv_pmap->pm_stats.wired_count--; 2659 pve->pv_pmap->pm_stats.resident_count--; 2660 2661 /* Shootdown only if referenced */ 2662 if (opte & PG_U) 2663 pmap_tlb_shootdown(pve->pv_pmap, pve->pv_va, 0, opte); 2664 2665 /* sync R/M bits */ 2666 pg->mdpage.mp_attrs |= (opte & (PG_U|PG_M)); 2667 2668 /* update the PTP reference count. free if last reference. */ 2669 if (pve->pv_ptp) { 2670 pve->pv_ptp->wire_count--; 2671 if (pve->pv_ptp->wire_count <= 1) { 2672 pmap_free_ptp(pve->pv_pmap, pve->pv_ptp, 2673 pve->pv_va, ptes, pdes, &empty_ptps); 2674 } 2675 } 2676 2677 pmap_unmap_ptes(pve->pv_pmap, pmap2); /* unlocks pmap */ 2678 SPLAY_REMOVE(pvtree, &pvh->pvh_root, pve); /* remove it */ 2679 SPLAY_RIGHT(pve, pv_node) = killlist; /* mark it for death */ 2680 killlist = pve; 2681 } 2682 rw_exit(&pmap_main_lock); 2683 2684 crit_enter(); 2685 pmap_tlb_shootwait(); 2686 crit_exit(); 2687 2688 /* Now we can free unused pvs and ptps. */ 2689 pmap_free_pvs(killlist); 2690 for (ptp = empty_ptps; ptp != NULL; ptp = empty_ptps) { 2691 empty_ptps = ptp->mdpage.mp_link; 2692 uvm_pagefree(ptp); 2693 } 2694 } 2695 2696 /* 2697 * p m a p a t t r i b u t e f u n c t i o n s 2698 * functions that test/change managed page's attributes 2699 * since a page can be mapped multiple times we must check each PTE that 2700 * maps it by going down the pv lists. 2701 */ 2702 2703 /* 2704 * pmap_test_attrs: test a page's attributes 2705 * 2706 * => we set pv_head => pmap locking 2707 */ 2708 2709 bool 2710 pmap_test_attrs(struct vm_page *pg, unsigned testbits) 2711 { 2712 struct vm_page_md *mdpg; 2713 int *myattrs; 2714 struct pv_head *pvh; 2715 struct pv_entry *pve; 2716 struct pmap *pmap2; 2717 pt_entry_t pte; 2718 2719 #if DIAGNOSTIC 2720 int bank, off; 2721 2722 bank = vm_physseg_find(atop(VM_PAGE_TO_PHYS(pg)), &off); 2723 if (bank == -1) 2724 panic("pmap_test_attrs: unmanaged page?"); 2725 #endif 2726 mdpg = &pg->mdpage; 2727 2728 /* 2729 * before locking: see if attributes are already set and if so, 2730 * return! 2731 */ 2732 2733 myattrs = &mdpg->mp_attrs; 2734 if (*myattrs & testbits) 2735 return(true); 2736 2737 /* test to see if there is a list before bothering to lock */ 2738 pvh = &mdpg->mp_pvhead; 2739 if (SPLAY_ROOT(&pvh->pvh_root) == NULL) { 2740 return(false); 2741 } 2742 2743 /* nope, gonna have to do it the hard way */ 2744 rw_enter(&pmap_main_lock, RW_WRITER); 2745 2746 for (pve = SPLAY_MIN(pvtree, &pvh->pvh_root); 2747 pve != NULL && (*myattrs & testbits) == 0; 2748 pve = SPLAY_NEXT(pvtree, &pvh->pvh_root, pve)) { 2749 pt_entry_t *ptes; 2750 pd_entry_t **pdes; 2751 2752 pmap_map_ptes(pve->pv_pmap, &pmap2, &ptes, &pdes); 2753 pte = ptes[pl1_i(pve->pv_va)]; 2754 pmap_unmap_ptes(pve->pv_pmap, pmap2); 2755 *myattrs |= pte; 2756 } 2757 2758 /* 2759 * note that we will exit the for loop with a non-null pve if 2760 * we have found the bits we are testing for. 2761 */ 2762 2763 rw_exit(&pmap_main_lock); 2764 return((*myattrs & testbits) != 0); 2765 } 2766 2767 /* 2768 * pmap_clear_attrs: clear the specified attribute for a page. 
2769 * 2770 * => we set pv_head => pmap locking 2771 * => we return true if we cleared one of the bits we were asked to 2772 */ 2773 2774 bool 2775 pmap_clear_attrs(struct vm_page *pg, unsigned clearbits) 2776 { 2777 struct vm_page_md *mdpg; 2778 uint32_t result; 2779 struct pv_head *pvh; 2780 struct pv_entry *pve; 2781 pt_entry_t *ptes, opte; 2782 int *myattrs; 2783 struct pmap *pmap2; 2784 2785 #ifdef DIAGNOSTIC 2786 int bank, off; 2787 2788 bank = vm_physseg_find(atop(VM_PAGE_TO_PHYS(pg)), &off); 2789 if (bank == -1) 2790 panic("pmap_change_attrs: unmanaged page?"); 2791 #endif 2792 mdpg = &pg->mdpage; 2793 2794 rw_enter(&pmap_main_lock, RW_WRITER); 2795 pvh = &mdpg->mp_pvhead; 2796 2797 myattrs = &mdpg->mp_attrs; 2798 result = *myattrs & clearbits; 2799 *myattrs &= ~clearbits; 2800 2801 SPLAY_FOREACH(pve, pvtree, &pvh->pvh_root) { 2802 pt_entry_t *ptep; 2803 pd_entry_t **pdes; 2804 2805 /* locks pmap */ 2806 pmap_map_ptes(pve->pv_pmap, &pmap2, &ptes, &pdes); 2807 #ifdef DIAGNOSTIC 2808 if (!pmap_pdes_valid(pve->pv_va, pdes, NULL)) 2809 panic("pmap_change_attrs: mapping without PTP " 2810 "detected"); 2811 #endif 2812 ptep = &ptes[pl1_i(pve->pv_va)]; 2813 opte = *ptep; 2814 KASSERT(pmap_valid_entry(opte)); 2815 KDASSERT((opte & PG_FRAME) == VM_PAGE_TO_PHYS(pg)); 2816 if (opte & clearbits) { 2817 /* We need to do something */ 2818 if (clearbits == PG_RW) { 2819 result |= PG_RW; 2820 2821 /* 2822 * On write protect we might not need to flush 2823 * the TLB 2824 */ 2825 2826 /* First zap the RW bit! */ 2827 pmap_pte_clearbits(ptep, PG_RW); 2828 opte = *ptep; 2829 2830 /* 2831 * Then test if it is not cached as RW the TLB 2832 */ 2833 if (!(opte & PG_M)) 2834 goto no_tlb_shootdown; 2835 } 2836 2837 /* 2838 * Since we need a shootdown we might as well 2839 * always clear PG_U AND PG_M. 2840 */ 2841 2842 /* zap! */ 2843 opte = pmap_pte_set(ptep, (opte & ~(PG_U | PG_M))); 2844 2845 result |= (opte & clearbits); 2846 *myattrs |= (opte & ~(clearbits)); 2847 2848 pmap_tlb_shootdown(pve->pv_pmap, pve->pv_va, 0, opte); 2849 } 2850 no_tlb_shootdown: 2851 pmap_unmap_ptes(pve->pv_pmap, pmap2); /* unlocks pmap */ 2852 } 2853 2854 rw_exit(&pmap_main_lock); 2855 2856 crit_enter(); 2857 pmap_tlb_shootwait(); 2858 crit_exit(); 2859 2860 return(result != 0); 2861 } 2862 2863 2864 /* 2865 * p m a p p r o t e c t i o n f u n c t i o n s 2866 */ 2867 2868 /* 2869 * pmap_page_protect: change the protection of all recorded mappings 2870 * of a managed page 2871 * 2872 * => NOTE: this is an inline function in pmap.h 2873 */ 2874 2875 /* see pmap.h */ 2876 2877 /* 2878 * pmap_protect: set the protection in of the pages in a pmap 2879 * 2880 * => NOTE: this is an inline function in pmap.h 2881 */ 2882 2883 /* see pmap.h */ 2884 2885 /* 2886 * pmap_write_protect: write-protect pages in a pmap 2887 */ 2888 2889 void 2890 pmap_write_protect(struct pmap *pmap, vaddr_t sva, vaddr_t eva, vm_prot_t prot) 2891 { 2892 pt_entry_t *ptes, *epte, xpte; 2893 volatile pt_entry_t *spte; 2894 pd_entry_t **pdes; 2895 vaddr_t blockend, va, tva; 2896 pt_entry_t opte; 2897 struct pmap *pmap2; 2898 2899 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); /* locks pmap */ 2900 2901 /* should be ok, but just in case ... */ 2902 sva &= PG_FRAME; 2903 eva &= PG_FRAME; 2904 xpte = 0; 2905 2906 for (va = sva ; va < eva ; va = blockend) { 2907 2908 blockend = (va & L2_FRAME) + NBPD_L2; 2909 if (blockend > eva) 2910 blockend = eva; 2911 2912 /* 2913 * XXXCDC: our PTE mappings should never be write-protected! 
2914 * 2915 * long term solution is to move the PTEs out of user 2916 * address space. and into kernel address space (up 2917 * with APTE). then we can set VM_MAXUSER_ADDRESS to 2918 * be VM_MAX_ADDRESS. 2919 */ 2920 2921 /* XXXCDC: ugly hack to avoid freeing PDP here */ 2922 if (pl_i(va, PTP_LEVELS) == PDIR_SLOT_PTE) 2923 continue; 2924 2925 /* empty block? */ 2926 if (!pmap_pdes_valid(va, pdes, NULL)) 2927 continue; 2928 2929 #ifdef DIAGNOSTIC 2930 if (va >= VM_MAXUSER_ADDRESS && 2931 va < VM_MAX_ADDRESS) 2932 panic("pmap_write_protect: PTE space"); 2933 #endif 2934 2935 spte = &ptes[pl1_i(va)]; 2936 epte = &ptes[pl1_i(blockend)]; 2937 2938 for (/*null */; spte < epte ; spte++) { 2939 opte = *spte; 2940 xpte |= opte; 2941 if ((opte & (PG_RW|PG_V)) == (PG_RW|PG_V)) { 2942 pmap_pte_clearbits(spte, PG_RW); /* zap! */ 2943 if (*spte & PG_M) { 2944 tva = x86_ptob(spte - ptes); 2945 pmap_tlb_shootdown(pmap, tva, 0, opte); 2946 } 2947 } 2948 } 2949 } 2950 2951 /* 2952 * if we kept a removal record and removed some pages update the TLB 2953 */ 2954 pmap_tlb_shootdown(pmap, sva, eva, xpte); 2955 pmap_tlb_shootwait(); 2956 pmap_unmap_ptes(pmap, pmap2); /* unlocks pmap */ 2957 } 2958 2959 /* 2960 * end of protection functions 2961 */ 2962 2963 /* 2964 * pmap_unwire: clear the wired bit in the PTE 2965 * 2966 * => mapping should already be in map 2967 */ 2968 2969 void 2970 pmap_unwire(struct pmap *pmap, vaddr_t va) 2971 { 2972 pt_entry_t *ptes; 2973 pd_entry_t **pdes; 2974 struct pmap *pmap2; 2975 2976 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); /* locks pmap */ 2977 2978 if (pmap_pdes_valid(va, pdes, NULL)) { 2979 2980 #ifdef DIAGNOSTIC 2981 if (!pmap_valid_entry(ptes[pl1_i(va)])) 2982 panic("pmap_unwire: invalid (unmapped) va 0x%lx", va); 2983 #endif 2984 if ((ptes[pl1_i(va)] & PG_W) != 0) { 2985 pmap_pte_clearbits(&ptes[pl1_i(va)], PG_W); 2986 pmap->pm_stats.wired_count--; 2987 } 2988 #ifdef DIAGNOSTIC 2989 else { 2990 printf("pmap_unwire: wiring for pmap %p va 0x%lx " 2991 "didn't change!\n", pmap, va); 2992 } 2993 #endif 2994 pmap_unmap_ptes(pmap, pmap2); /* unlocks map */ 2995 } 2996 #ifdef DIAGNOSTIC 2997 else { 2998 panic("pmap_unwire: invalid PDE"); 2999 } 3000 #endif 3001 } 3002 3003 /* 3004 * pmap_collect: free resources held by a pmap 3005 * 3006 * => optional function. 3007 * => called when a process is swapped out to free memory. 3008 */ 3009 3010 void 3011 pmap_collect(struct pmap *pmap) 3012 { 3013 /* 3014 * free all of the pt pages by removing the physical mappings 3015 * for its entire address space. 3016 */ 3017 3018 pmap_do_remove(pmap, VM_MIN_ADDRESS, VM_MAXUSER_ADDRESS, 3019 PMAP_REMOVE_SKIPWIRED); 3020 } 3021 3022 /* 3023 * pmap_copy: copy mappings from one pmap to another 3024 * 3025 * => optional function 3026 * void pmap_copy(dst_pmap, src_pmap, dst_addr, len, src_addr) 3027 */ 3028 3029 /* 3030 * defined as macro in pmap.h 3031 */ 3032 3033 /* 3034 * pmap_enter: enter a mapping into a pmap 3035 * 3036 * => must be done "now" ... 
no lazy-evaluation 3037 * => we set pmap => pv_head locking 3038 */ 3039 3040 int 3041 pmap_enter(struct pmap *pmap, vaddr_t va, paddr_t pa, vm_prot_t prot, 3042 int flags) 3043 { 3044 pt_entry_t *ptes, opte, npte; 3045 pt_entry_t *ptep; 3046 pd_entry_t **pdes; 3047 struct vm_page *ptp, *pg; 3048 struct vm_page_md *mdpg; 3049 struct pv_head *old_pvh, *new_pvh; 3050 struct pv_entry *pve = NULL, *freepve, *freepve2 = NULL; 3051 int error; 3052 bool wired = (flags & PMAP_WIRED) != 0; 3053 struct pmap *pmap2; 3054 3055 KASSERT(pmap_initialized); 3056 3057 #ifdef DIAGNOSTIC 3058 /* sanity check: totally out of range? */ 3059 if (va >= VM_MAX_KERNEL_ADDRESS) 3060 panic("pmap_enter: too big"); 3061 3062 if (va == (vaddr_t) PDP_BASE || va == (vaddr_t) APDP_BASE) 3063 panic("pmap_enter: trying to map over PDP/APDP!"); 3064 3065 /* sanity check: kernel PTPs should already have been pre-allocated */ 3066 if (va >= VM_MIN_KERNEL_ADDRESS && 3067 !pmap_valid_entry(pmap->pm_pdir[pl_i(va, PTP_LEVELS)])) 3068 panic("pmap_enter: missing kernel PTP for va %lx!", va); 3069 #endif 3070 3071 npte = pa | protection_codes[prot] | PG_V; 3072 if (wired) 3073 npte |= PG_W; 3074 if (va < VM_MAXUSER_ADDRESS) 3075 npte |= PG_u; 3076 else if (va < VM_MAX_ADDRESS) 3077 npte |= (PG_u | PG_RW); /* XXXCDC: no longer needed? */ 3078 if (pmap == pmap_kernel()) 3079 npte |= pmap_pg_g; 3080 if (flags & VM_PROT_ALL) { 3081 npte |= PG_U; 3082 if (flags & VM_PROT_WRITE) 3083 npte |= PG_M; 3084 } 3085 3086 /* get a pve. */ 3087 freepve = pool_cache_get(&pmap_pv_cache, PR_NOWAIT); 3088 3089 /* get lock */ 3090 rw_enter(&pmap_main_lock, RW_READER); 3091 3092 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); /* locks pmap */ 3093 if (pmap == pmap_kernel()) { 3094 ptp = NULL; 3095 } else { 3096 ptp = pmap_get_ptp(pmap, va, pdes); 3097 if (ptp == NULL) { 3098 if (flags & PMAP_CANFAIL) { 3099 error = ENOMEM; 3100 goto out; 3101 } 3102 panic("pmap_enter: get ptp failed"); 3103 } 3104 } 3105 3106 /* 3107 * Get first view on old PTE 3108 * on SMP the PTE might gain PG_U and PG_M flags 3109 * before we zap it later 3110 */ 3111 ptep = &ptes[pl1_i(va)]; 3112 opte = *ptep; /* old PTE */ 3113 3114 /* 3115 * is there currently a valid mapping at our VA and does it 3116 * map to the same PA as the one we want to map ? 3117 */ 3118 3119 if (pmap_valid_entry(opte) && ((opte & PG_FRAME) == pa)) { 3120 3121 /* 3122 * first, calculate pm_stats updates. resident count will not 3123 * change since we are replacing/changing a valid mapping. 3124 * wired count might change... 3125 */ 3126 pmap->pm_stats.wired_count += 3127 ((npte & PG_W) ? 1 : 0 - (opte & PG_W) ? 1 : 0); 3128 3129 npte |= (opte & PG_PVLIST); 3130 3131 /* zap! 
*/ 3132 opte = pmap_pte_set(ptep, npte); 3133 3134 /* 3135 * if this is on the PVLIST, sync R/M bit 3136 */ 3137 if (opte & PG_PVLIST) { 3138 pg = PHYS_TO_VM_PAGE(pa); 3139 #ifdef DIAGNOSTIC 3140 if (pg == NULL) 3141 panic("pmap_enter: same pa PG_PVLIST " 3142 "mapping with unmanaged page " 3143 "pa = 0x%lx (0x%lx)", pa, 3144 atop(pa)); 3145 #endif 3146 mdpg = &pg->mdpage; 3147 old_pvh = &mdpg->mp_pvhead; 3148 mutex_spin_enter(&old_pvh->pvh_lock); 3149 mdpg->mp_attrs |= opte; 3150 mutex_spin_exit(&old_pvh->pvh_lock); 3151 } 3152 goto shootdown_now; 3153 } 3154 3155 pg = PHYS_TO_VM_PAGE(pa); 3156 if (pg != NULL) { 3157 /* This is a managed page */ 3158 npte |= PG_PVLIST; 3159 mdpg = &pg->mdpage; 3160 new_pvh = &mdpg->mp_pvhead; 3161 if ((opte & (PG_PVLIST | PG_V)) != (PG_PVLIST | PG_V)) { 3162 /* We can not steal a pve - allocate one */ 3163 pve = freepve; 3164 freepve = NULL; 3165 if (pve == NULL) { 3166 if (!(flags & PMAP_CANFAIL)) 3167 panic("pmap_enter: " 3168 "no pv entries available"); 3169 error = ENOMEM; 3170 goto out; 3171 } 3172 } 3173 } else { 3174 new_pvh = NULL; 3175 } 3176 3177 /* 3178 * is there currently a valid mapping at our VA? 3179 */ 3180 3181 if (pmap_valid_entry(opte)) { 3182 3183 /* 3184 * changing PAs: we must remove the old one first 3185 */ 3186 3187 /* 3188 * first, calculate pm_stats updates. resident count will not 3189 * change since we are replacing/changing a valid mapping. 3190 * wired count might change... 3191 */ 3192 pmap->pm_stats.wired_count += 3193 ((npte & PG_W) ? 1 : 0 - (opte & PG_W) ? 1 : 0); 3194 3195 if (opte & PG_PVLIST) { 3196 pg = PHYS_TO_VM_PAGE(opte & PG_FRAME); 3197 #ifdef DIAGNOSTIC 3198 if (pg == NULL) 3199 panic("pmap_enter: PG_PVLIST mapping with " 3200 "unmanaged page " 3201 "pa = 0x%lx (0x%lx)", pa, atop(pa)); 3202 #endif 3203 mdpg = &pg->mdpage; 3204 old_pvh = &mdpg->mp_pvhead; 3205 3206 /* new_pvh is NULL if page will not be managed */ 3207 pmap_lock_pvhs(old_pvh, new_pvh); 3208 3209 /* zap! */ 3210 opte = pmap_pte_set(ptep, npte); 3211 3212 pve = pmap_remove_pv(old_pvh, pmap, va); 3213 KASSERT(pve != 0); 3214 mdpg->mp_attrs |= opte; 3215 3216 if (new_pvh != NULL) { 3217 pmap_enter_pv(new_pvh, pve, pmap, va, ptp); 3218 mutex_spin_exit(&new_pvh->pvh_lock); 3219 } 3220 mutex_spin_exit(&old_pvh->pvh_lock); 3221 if (new_pvh == NULL) 3222 freepve2 = pve; 3223 goto shootdown_test; 3224 } 3225 } else { /* opte not valid */ 3226 pmap->pm_stats.resident_count++; 3227 if (wired) 3228 pmap->pm_stats.wired_count++; 3229 if (ptp) 3230 ptp->wire_count++; 3231 } 3232 3233 if (new_pvh) { 3234 mutex_spin_enter(&new_pvh->pvh_lock); 3235 pmap_enter_pv(new_pvh, pve, pmap, va, ptp); 3236 mutex_spin_exit(&new_pvh->pvh_lock); 3237 } 3238 3239 opte = pmap_pte_set(ptep, npte); /* zap! */ 3240 3241 shootdown_test: 3242 /* Update page attributes if needed */ 3243 if ((opte & (PG_V | PG_U)) == (PG_V | PG_U)) { 3244 shootdown_now: 3245 pmap_tlb_shootdown(pmap, va, 0, opte); 3246 pmap_tlb_shootwait(); 3247 } 3248 3249 error = 0; 3250 3251 out: 3252 pmap_unmap_ptes(pmap, pmap2); 3253 rw_exit(&pmap_main_lock); 3254 3255 if (freepve != NULL) { 3256 /* put back the pv, we don't need it. 
*/ 3257 pool_cache_put(&pmap_pv_cache, freepve); 3258 } 3259 if (freepve2 != NULL) 3260 pool_cache_put(&pmap_pv_cache, freepve2); 3261 3262 return error; 3263 } 3264 3265 static bool 3266 pmap_get_physpage(vaddr_t va, int level, paddr_t *paddrp) 3267 { 3268 struct vm_page *ptp; 3269 struct pmap *kpm = pmap_kernel(); 3270 3271 if (uvm.page_init_done == false) { 3272 /* 3273 * we're growing the kernel pmap early (from 3274 * uvm_pageboot_alloc()). this case must be 3275 * handled a little differently. 3276 */ 3277 3278 if (uvm_page_physget(paddrp) == false) 3279 panic("pmap_get_physpage: out of memory"); 3280 *early_zero_pte = (*paddrp & PG_FRAME) | PG_V | PG_RW; 3281 pmap_update_pg((vaddr_t)early_zerop); 3282 memset(early_zerop, 0, PAGE_SIZE); 3283 #if defined(DIAGNOSTIC) 3284 *early_zero_pte = 0; 3285 #endif /* defined(DIAGNOSTIC) */ 3286 } else { 3287 /* XXX */ 3288 if (level != 1) 3289 mutex_enter(&kpm->pm_obj[level - 1].vmobjlock); 3290 ptp = uvm_pagealloc(&kpm->pm_obj[level - 1], 3291 ptp_va2o(va, level), NULL, 3292 UVM_PGA_USERESERVE|UVM_PGA_ZERO); 3293 if (level != 1) 3294 mutex_exit(&kpm->pm_obj[level - 1].vmobjlock); 3295 if (ptp == NULL) 3296 panic("pmap_get_physpage: out of memory"); 3297 ptp->flags &= ~PG_BUSY; 3298 ptp->wire_count = 1; 3299 *paddrp = VM_PAGE_TO_PHYS(ptp); 3300 } 3301 kpm->pm_stats.resident_count++; 3302 return true; 3303 } 3304 3305 /* 3306 * Allocate the amount of specified ptps for a ptp level, and populate 3307 * all levels below accordingly, mapping virtual addresses starting at 3308 * kva. 3309 * 3310 * Used by pmap_growkernel. 3311 */ 3312 static void 3313 pmap_alloc_level(pd_entry_t **pdes, vaddr_t kva, int lvl, long *needed_ptps) 3314 { 3315 unsigned long i; 3316 vaddr_t va; 3317 paddr_t pa; 3318 unsigned long index, endindex; 3319 int level; 3320 pd_entry_t *pdep; 3321 3322 for (level = lvl; level > 1; level--) { 3323 if (level == PTP_LEVELS) 3324 pdep = pmap_kernel()->pm_pdir; 3325 else 3326 pdep = pdes[level - 2]; 3327 va = kva; 3328 index = pl_i_roundup(kva, level); 3329 endindex = index + needed_ptps[level - 1] - 1; 3330 3331 for (i = index; i <= endindex; i++) { 3332 KASSERT(!pmap_valid_entry(pdep[i])); 3333 pmap_get_physpage(va, level - 1, &pa); 3334 pdep[i] = pa | PG_RW | PG_V; 3335 KASSERT(level != PTP_LEVELS || nkptp[level - 1] + 3336 pl_i(VM_MIN_KERNEL_ADDRESS, level) == i); 3337 nkptp[level - 1]++; 3338 va += nbpd[level - 1]; 3339 } 3340 } 3341 3342 /* For nkptp vs pmap_pdp_cache. */ 3343 mb_write(); 3344 } 3345 3346 /* 3347 * pmap_growkernel: increase usage of KVM space 3348 * 3349 * => we allocate new PTPs for the kernel and install them in all 3350 * the pmaps on the system. 3351 */ 3352 3353 vaddr_t 3354 pmap_growkernel(vaddr_t maxkvaddr) 3355 { 3356 struct pmap *kpm = pmap_kernel(), *pm; 3357 int s, i; 3358 unsigned newpdes; 3359 long needed_kptp[PTP_LEVELS], target_nptp, old; 3360 bool invalidate = false; 3361 3362 s = splvm(); /* to be safe */ 3363 mutex_enter(&kpm->pm_lock); 3364 3365 if (maxkvaddr <= pmap_maxkvaddr) { 3366 mutex_exit(&kpm->pm_lock); 3367 splx(s); 3368 return pmap_maxkvaddr; 3369 } 3370 3371 maxkvaddr = x86_round_pdr(maxkvaddr); 3372 old = nkptp[PTP_LEVELS - 1]; 3373 /* 3374 * This loop could be optimized more, but pmap_growkernel() 3375 * is called infrequently. 3376 */ 3377 for (i = PTP_LEVELS - 1; i >= 1; i--) { 3378 target_nptp = pl_i_roundup(maxkvaddr, i + 1) - 3379 pl_i_roundup(VM_MIN_KERNEL_ADDRESS, i + 1); 3380 /* 3381 * XXX only need to check toplevel. 
3382 */ 3383 if (target_nptp > nkptpmax[i]) 3384 panic("out of KVA space"); 3385 KASSERT(target_nptp >= nkptp[i]); 3386 needed_kptp[i] = target_nptp - nkptp[i]; 3387 } 3388 3389 pmap_alloc_level(normal_pdes, pmap_maxkvaddr, PTP_LEVELS, needed_kptp); 3390 3391 /* 3392 * If the number of top level entries changed, update all 3393 * pmaps. 3394 */ 3395 if (needed_kptp[PTP_LEVELS - 1] != 0) { 3396 newpdes = nkptp[PTP_LEVELS - 1] - old; 3397 mutex_enter(&pmaps_lock); 3398 LIST_FOREACH(pm, &pmaps, pm_list) { 3399 memcpy(&pm->pm_pdir[PDIR_SLOT_KERN + old], 3400 &kpm->pm_pdir[PDIR_SLOT_KERN + old], 3401 newpdes * sizeof (pd_entry_t)); 3402 } 3403 mutex_exit(&pmaps_lock); 3404 invalidate = true; 3405 } 3406 pmap_maxkvaddr = maxkvaddr; 3407 mutex_exit(&kpm->pm_lock); 3408 splx(s); 3409 3410 if (invalidate) { 3411 /* Invalidate the PDP cache. */ 3412 pool_cache_invalidate(&pmap_pdp_cache); 3413 } 3414 3415 return maxkvaddr; 3416 } 3417 3418 #ifdef DEBUG 3419 void pmap_dump(struct pmap *, vaddr_t, vaddr_t); 3420 3421 /* 3422 * pmap_dump: dump all the mappings from a pmap 3423 * 3424 * => caller should not be holding any pmap locks 3425 */ 3426 3427 void 3428 pmap_dump(struct pmap *pmap, vaddr_t sva, vaddr_t eva) 3429 { 3430 pt_entry_t *ptes, *pte; 3431 pd_entry_t **pdes; 3432 struct pmap *pmap2; 3433 vaddr_t blkendva; 3434 3435 /* 3436 * if end is out of range truncate. 3437 * if (end == start) update to max. 3438 */ 3439 3440 if (eva > VM_MAXUSER_ADDRESS || eva <= sva) 3441 eva = VM_MAXUSER_ADDRESS; 3442 3443 /* 3444 * we lock in the pmap => pv_head direction 3445 */ 3446 3447 rw_enter(&pmap_main_lock, RW_READER); 3448 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); /* locks pmap */ 3449 3450 /* 3451 * dumping a range of pages: we dump in PTP sized blocks (4MB) 3452 */ 3453 3454 for (/* null */ ; sva < eva ; sva = blkendva) { 3455 3456 /* determine range of block */ 3457 blkendva = x86_round_pdr(sva+1); 3458 if (blkendva > eva) 3459 blkendva = eva; 3460 3461 /* valid block? 
*/ 3462 if (!pmap_pdes_valid(sva, pdes, NULL)) 3463 continue; 3464 3465 pte = &ptes[pl1_i(sva)]; 3466 for (/* null */; sva < blkendva ; sva += PAGE_SIZE, pte++) { 3467 if (!pmap_valid_entry(*pte)) 3468 continue; 3469 printf("va %#lx -> pa %#lx (pte=%#lx)\n", 3470 sva, (unsigned long)*pte, 3471 (unsigned long)*pte & PG_FRAME); 3472 } 3473 } 3474 pmap_unmap_ptes(pmap, pmap2); 3475 rw_exit(&pmap_main_lock); 3476 } 3477 #endif 3478 3479 /* 3480 * pmap_tlb_shootdown: invalidate pages on all CPUs using pmap 'pm' 3481 * 3482 * => always invalidates locally before returning 3483 * => returns before remote CPUs have invalidated 3484 * => must be called with preemption disabled 3485 */ 3486 3487 void 3488 pmap_tlb_shootdown(struct pmap *pm, vaddr_t sva, vaddr_t eva, pt_entry_t pte) 3489 { 3490 #ifdef MULTIPROCESSOR 3491 extern int _lock_cas(volatile uintptr_t *, uintptr_t, uintptr_t); 3492 extern bool x86_mp_online; 3493 struct cpu_info *ci; 3494 struct pmap_mbox *mb, *selfmb; 3495 CPU_INFO_ITERATOR cii; 3496 uintptr_t head; 3497 u_int count; 3498 int s; 3499 #endif /* MULTIPROCESSOR */ 3500 struct cpu_info *self; 3501 bool kernel; 3502 3503 KASSERT(eva == 0 || eva >= sva); 3504 3505 if (pte & PG_PS) 3506 sva &= PG_LGFRAME; 3507 pte &= PG_G; 3508 self = curcpu(); 3509 3510 if (sva == (vaddr_t)-1LL) { 3511 kernel = true; 3512 } else { 3513 if (eva == 0) 3514 eva = sva + PAGE_SIZE; 3515 kernel = sva >= VM_MAXUSER_ADDRESS; 3516 KASSERT(kernel == (eva > VM_MAXUSER_ADDRESS)); 3517 } 3518 3519 /* 3520 * If the range is larger than 32 pages, then invalidate 3521 * everything. 3522 */ 3523 if (sva != (vaddr_t)-1LL && eva - sva > (32 * PAGE_SIZE)) { 3524 sva = (vaddr_t)-1LL; 3525 eva = sva; 3526 } 3527 3528 #ifdef MULTIPROCESSOR 3529 if (ncpu > 1 && x86_mp_online) { 3530 selfmb = &self->ci_pmap_cpu->pc_mbox; 3531 3532 /* 3533 * If the CPUs have no notion of global pages then 3534 * reload of %cr3 is sufficient. 3535 */ 3536 if (pte != 0 && (cpu_feature & CPUID_PGE) == 0) 3537 pte = 0; 3538 3539 if (pm == pmap_kernel()) { 3540 /* 3541 * Mapped on all CPUs: use the broadcast mechanism. 3542 * Once we have the lock, increment the counter. 3543 */ 3544 s = splvm(); 3545 mb = &pmap_mbox; 3546 count = SPINLOCK_BACKOFF_MIN; 3547 do { 3548 if ((head = mb->mb_head) != mb->mb_tail) { 3549 splx(s); 3550 while ((head = mb->mb_head) != 3551 mb->mb_tail) 3552 SPINLOCK_BACKOFF(count); 3553 s = splvm(); 3554 } 3555 } while (!_lock_cas(&mb->mb_head, head, 3556 head + ncpu - 1)); 3557 3558 /* 3559 * Once underway we must stay at IPL_VM until the 3560 * IPI is dispatched. Otherwise interrupt handlers 3561 * on this CPU can deadlock against us. 3562 */ 3563 pmap_tlb_evcnt.ev_count++; 3564 mb->mb_pointer = self; 3565 mb->mb_addr1 = sva; 3566 mb->mb_addr2 = eva; 3567 mb->mb_global = pte; 3568 x86_ipi(LAPIC_TLB_BCAST_VECTOR, LAPIC_DEST_ALLEXCL, 3569 LAPIC_DLMODE_FIXED); 3570 self->ci_need_tlbwait = 1; 3571 splx(s); 3572 } else if ((pm->pm_cpus & ~self->ci_cpumask) != 0 || 3573 (kernel && (pm->pm_kernel_cpus & ~self->ci_cpumask) != 0)) { 3574 /* 3575 * We don't bother traversing the CPU list if only 3576 * used by this CPU. 3577 * 3578 * We can't do global flushes with the multicast 3579 * mechanism. 3580 */ 3581 KASSERT(pte == 0); 3582 3583 /* 3584 * Take ownership of the shootdown mailbox on each 3585 * CPU, fill the details and fire it off. 
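 * Ownership of a remote CPU's mailbox is claimed with a
 * compare-and-swap on its mb_pointer, spinning with exponential
 * backoff while another CPU still holds it.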
3586 */ 3587 s = splvm(); 3588 for (CPU_INFO_FOREACH(cii, ci)) { 3589 if (ci == self || 3590 !pmap_is_active(pm, ci, kernel) || 3591 !(ci->ci_flags & CPUF_RUNNING)) 3592 continue; 3593 selfmb->mb_head++; 3594 mb = &ci->ci_pmap_cpu->pc_mbox; 3595 count = SPINLOCK_BACKOFF_MIN; 3596 while (!_lock_cas((uintptr_t *)&mb->mb_pointer, 3597 0, (uintptr_t)&selfmb->mb_tail)) { 3598 splx(s); 3599 while (mb->mb_pointer != 0) 3600 SPINLOCK_BACKOFF(count); 3601 s = splvm(); 3602 } 3603 mb->mb_addr1 = sva; 3604 mb->mb_addr2 = eva; 3605 mb->mb_global = pte; 3606 if (x86_ipi(LAPIC_TLB_MCAST_VECTOR, 3607 ci->ci_apicid, 3608 LAPIC_DLMODE_FIXED)) 3609 panic("pmap_tlb_shootdown: ipi failed"); 3610 } 3611 self->ci_need_tlbwait = 1; 3612 splx(s); 3613 } 3614 } 3615 #endif /* MULTIPROCESSOR */ 3616 3617 /* Update the current CPU before waiting for others. */ 3618 if (!pmap_is_active(pm, self, kernel)) 3619 return; 3620 3621 if (sva == (vaddr_t)-1LL) { 3622 if (pte != 0) 3623 tlbflushg(); 3624 else 3625 tlbflush(); 3626 } else { 3627 do { 3628 pmap_update_pg(sva); 3629 sva += PAGE_SIZE; 3630 } while (sva < eva); 3631 } 3632 } 3633 3634 /* 3635 * pmap_tlb_shootwait: wait for pending TLB shootdowns to complete 3636 * 3637 * => only waits for operations generated by the current CPU 3638 * => must be called with preemption disabled 3639 */ 3640 3641 void 3642 pmap_tlb_shootwait(void) 3643 { 3644 struct cpu_info *self; 3645 struct pmap_mbox *mb; 3646 3647 /* 3648 * Anything to do? XXX Really we want to avoid touching the cache 3649 * lines of the two mailboxes, but the processor may read ahead. 3650 */ 3651 self = curcpu(); 3652 if (!self->ci_need_tlbwait) 3653 return; 3654 self->ci_need_tlbwait = 0; 3655 3656 /* If we own the global mailbox, wait for it to drain. */ 3657 mb = &pmap_mbox; 3658 while (mb->mb_pointer == self && mb->mb_head != mb->mb_tail) 3659 x86_pause(); 3660 3661 /* If we own other CPU's mailboxes, wait for them to drain. */ 3662 mb = &self->ci_pmap_cpu->pc_mbox; 3663 KASSERT(mb->mb_pointer != &mb->mb_tail); 3664 while (mb->mb_head != mb->mb_tail) 3665 x86_pause(); 3666 } 3667 3668 /* 3669 * pmap_update: process deferred invalidations 3670 */ 3671 3672 void 3673 pmap_update(struct pmap *pm) 3674 { 3675 3676 crit_enter(); 3677 pmap_tlb_shootwait(); 3678 crit_exit(); 3679 } 3680