1 /* $NetBSD: pmap.c,v 1.77 2008/12/18 12:18:20 cegger Exp $ */ 2 3 /* 4 * Copyright (c) 2007 Manuel Bouyer. 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions 8 * are met: 9 * 1. Redistributions of source code must retain the above copyright 10 * notice, this list of conditions and the following disclaimer. 11 * 2. Redistributions in binary form must reproduce the above copyright 12 * notice, this list of conditions and the following disclaimer in the 13 * documentation and/or other materials provided with the distribution. 14 * 3. All advertising materials mentioning features or use of this software 15 * must display the following acknowledgement: 16 * This product includes software developed by Manuel Bouyer. 17 * 4. The name of the author may not be used to endorse or promote products 18 * derived from this software without specific prior written permission. 19 * 20 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 21 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 22 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 23 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 24 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 25 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 26 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 27 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 28 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 29 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 * 31 */ 32 33 /* 34 * Copyright (c) 2006 Mathieu Ropert <mro@adviseo.fr> 35 * 36 * Permission to use, copy, modify, and distribute this software for any 37 * purpose with or without fee is hereby granted, provided that the above 38 * copyright notice and this permission notice appear in all copies. 39 * 40 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 41 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 42 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 43 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 44 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 45 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 46 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 47 */ 48 49 /* 50 * 51 * Copyright (c) 1997 Charles D. Cranor and Washington University. 52 * All rights reserved. 53 * 54 * Redistribution and use in source and binary forms, with or without 55 * modification, are permitted provided that the following conditions 56 * are met: 57 * 1. Redistributions of source code must retain the above copyright 58 * notice, this list of conditions and the following disclaimer. 59 * 2. Redistributions in binary form must reproduce the above copyright 60 * notice, this list of conditions and the following disclaimer in the 61 * documentation and/or other materials provided with the distribution. 62 * 3. All advertising materials mentioning features or use of this software 63 * must display the following acknowledgement: 64 * This product includes software developed by Charles D. Cranor and 65 * Washington University. 66 * 4. 
The name of the author may not be used to endorse or promote products 67 * derived from this software without specific prior written permission. 68 * 69 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 70 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 71 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 72 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 73 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 74 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 75 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 76 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 77 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 78 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 79 */ 80 81 /* 82 * Copyright 2001 (c) Wasabi Systems, Inc. 83 * All rights reserved. 84 * 85 * Written by Frank van der Linden for Wasabi Systems, Inc. 86 * 87 * Redistribution and use in source and binary forms, with or without 88 * modification, are permitted provided that the following conditions 89 * are met: 90 * 1. Redistributions of source code must retain the above copyright 91 * notice, this list of conditions and the following disclaimer. 92 * 2. Redistributions in binary form must reproduce the above copyright 93 * notice, this list of conditions and the following disclaimer in the 94 * documentation and/or other materials provided with the distribution. 95 * 3. All advertising materials mentioning features or use of this software 96 * must display the following acknowledgement: 97 * This product includes software developed for the NetBSD Project by 98 * Wasabi Systems, Inc. 99 * 4. The name of Wasabi Systems, Inc. may not be used to endorse 100 * or promote products derived from this software without specific prior 101 * written permission. 102 * 103 * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND 104 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 105 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 106 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL WASABI SYSTEMS, INC 107 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 108 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 109 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 110 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 111 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 112 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 113 * POSSIBILITY OF SUCH DAMAGE. 114 */ 115 116 /* 117 * This is the i386 pmap modified and generalized to support x86-64 118 * as well. The idea is to hide the upper N levels of the page tables 119 * inside pmap_get_ptp, pmap_free_ptp and pmap_growkernel. The rest 120 * is mostly untouched, except that it uses some more generalized 121 * macros and interfaces. 122 * 123 * This pmap has been tested on the i386 as well, and it can be easily 124 * adapted to PAE. 125 * 126 * fvdl@wasabisystems.com 18-Jun-2001 127 */ 128 129 /* 130 * pmap.c: i386 pmap module rewrite 131 * Chuck Cranor <chuck@ccrc.wustl.edu> 132 * 11-Aug-97 133 * 134 * history of this pmap module: in addition to my own input, i used 135 * the following references for this rewrite of the i386 pmap: 136 * 137 * [1] the NetBSD i386 pmap. 
this pmap appears to be based on the 138 * BSD hp300 pmap done by Mike Hibler at University of Utah. 139 * it was then ported to the i386 by William Jolitz of UUNET 140 * Technologies, Inc. Then Charles M. Hannum of the NetBSD 141 * project fixed some bugs and provided some speed ups. 142 * 143 * [2] the FreeBSD i386 pmap. this pmap seems to be the 144 * Hibler/Jolitz pmap, as modified for FreeBSD by John S. Dyson 145 * and David Greenman. 146 * 147 * [3] the Mach pmap. this pmap, from CMU, seems to have migrated 148 * between several processors. the VAX version was done by 149 * Avadis Tevanian, Jr., and Michael Wayne Young. the i386 150 * version was done by Lance Berc, Mike Kupfer, Bob Baron, 151 * David Golub, and Richard Draves. the alpha version was 152 * done by Alessandro Forin (CMU/Mach) and Chris Demetriou 153 * (NetBSD/alpha). 154 */ 155 156 #include <sys/cdefs.h> 157 __KERNEL_RCSID(0, "$NetBSD: pmap.c,v 1.77 2008/12/18 12:18:20 cegger Exp $"); 158 159 #include "opt_user_ldt.h" 160 #include "opt_lockdebug.h" 161 #include "opt_multiprocessor.h" 162 #include "opt_xen.h" 163 #if !defined(__x86_64__) 164 #include "opt_kstack_dr0.h" 165 #endif /* !defined(__x86_64__) */ 166 167 #include <sys/param.h> 168 #include <sys/systm.h> 169 #include <sys/proc.h> 170 #include <sys/pool.h> 171 #include <sys/user.h> 172 #include <sys/kernel.h> 173 #include <sys/atomic.h> 174 #include <sys/cpu.h> 175 #include <sys/intr.h> 176 177 #include <uvm/uvm.h> 178 179 #include <dev/isa/isareg.h> 180 181 #include <machine/specialreg.h> 182 #include <machine/gdt.h> 183 #include <machine/isa_machdep.h> 184 #include <machine/cpuvar.h> 185 186 #include <x86/pmap.h> 187 #include <x86/pmap_pv.h> 188 189 #include <x86/i82489reg.h> 190 #include <x86/i82489var.h> 191 192 #ifdef XEN 193 #include <xen/xen3-public/xen.h> 194 #include <xen/hypervisor.h> 195 #endif 196 197 /* flag to be used for kernel mappings: PG_u on Xen/amd64, 0 otherwise */ 198 #if defined(XEN) && defined(__x86_64__) 199 #define PG_k PG_u 200 #else 201 #define PG_k 0 202 #endif 203 204 /* 205 * general info: 206 * 207 * - for an explanation of how the i386 MMU hardware works see 208 * the comments in <machine/pte.h>. 209 * 210 * - for an explanation of the general memory structure used by 211 * this pmap (including the recursive mapping), see the comments 212 * in <machine/pmap.h>. 213 * 214 * this file contains the code for the "pmap module." the module's 215 * job is to manage the hardware's virtual to physical address mappings. 216 * note that there are two levels of mapping in the VM system: 217 * 218 * [1] the upper layer of the VM system uses vm_map's and vm_map_entry's 219 * to map ranges of virtual address space to objects/files. for 220 * example, the vm_map may say: "map VA 0x1000 to 0x22000 read-only 221 * to the file /bin/ls starting at offset zero." note that 222 * the upper layer mapping is not concerned with how individual 223 * vm_pages are mapped. 224 * 225 * [2] the lower layer of the VM system (the pmap) maintains the mappings 226 * from virtual addresses. it is concerned with which vm_page is 227 * mapped where. for example, when you run /bin/ls and start 228 * at page 0x1000 the fault routine may lookup the correct page 229 * of the /bin/ls file and then ask the pmap layer to establish 230 * a mapping for it. 231 * 232 * note that information in the lower layer of the VM system can be 233 * thrown away since it can easily be reconstructed from the info 234 * in the upper layer. 
 *
 * data structures we use include:
 *
 *  - struct pmap: describes the address space of one thread
 *  - struct pv_entry: describes one <PMAP,VA> mapping of a PA
 *  - struct pv_head: there is one pv_head per managed page of
 *	physical memory.  the pv_head points to a list of pv_entry
 *	structures which describe all the <PMAP,VA> pairs that this
 *	page is mapped in.  this is critical for page based operations
 *	such as pmap_page_protect() [change protection on _all_ mappings
 *	of a page]
 */

/*
 * memory allocation
 *
 *  - there are three data structures that we must dynamically allocate:
 *
 * [A] new process' page directory page (PDP)
 *	- plan 1: done at pmap_create() we use
 *	  uvm_km_alloc(kernel_map, PAGE_SIZE)  [fka kmem_alloc] to do this
 *	  allocation.
 *
 *	if we are low in free physical memory then we sleep in
 *	uvm_km_alloc -- in this case this is ok since we are creating
 *	a new pmap and should not be holding any locks.
 *
 *	if the kernel is totally out of virtual space
 *	(i.e. uvm_km_alloc returns NULL), then we panic.
 *
 *	XXX: the fork code currently has no way to return an "out of
 *	memory, try again" error code since uvm_fork [fka vm_fork]
 *	is a void function.
 *
 * [B] new page tables pages (PTP)
 *	- call uvm_pagealloc()
 *		=> success: zero page, add to pm_pdir
 *		=> failure: we are out of free vm_pages, let pmap_enter()
 *		   tell UVM about it.
 *
 *	note: for kernel PTPs, we start with NKPTP of them.  as we map
 *	kernel memory (at uvm_map time) we check to see if we've grown
 *	the kernel pmap.  if so, we call the optional function
 *	pmap_growkernel() to grow the kernel PTPs in advance.
 *
 * [C] pv_entry structures
 */

/*
 * locking
 *
 * we have the following locks that we must contend with:
 *
 * mutexes:
 *
 * - pmap lock (per pmap, part of uvm_object)
 *   this lock protects the fields in the pmap structure including
 *   the non-kernel PDEs in the PDP, and the PTEs.  it also locks
 *   in the alternate PTE space (since that is determined by the
 *   entry in the PDP).
 *
 * - pvh_lock (per pv_head)
 *   this lock protects the pv_entry list which is chained off the
 *   pv_head structure for a specific managed PA.  it is locked
 *   when traversing the list (e.g. adding/removing mappings,
 *   syncing R/M bits, etc.)
 *
 * - pmaps_lock
 *   this lock protects the list of active pmaps (headed by "pmaps").
 *   we lock it when adding or removing pmaps from this list.
 *
 * tlb shootdown
 *
 * tlb shootdowns are hard interrupts that operate outside the spl
 * framework: they don't need to be blocked provided that the pmap module
 * gets the order of events correct.  the calls are made by talking directly
 * to the lapic.  the stubs to handle the interrupts are quite short and do
 * one of the following: invalidate a single page, a range of pages, all
 * user tlb entries or the entire tlb.
 *
 * the cpus synchronize with each other using pmap_mbox structures which are
 * aligned on 64-byte cache lines.  tlb shootdowns against the kernel pmap
 * use a global mailbox and are generated using a broadcast ipi (broadcast
 * to all but the sending cpu).  shootdowns against regular pmaps use
 * per-cpu mailboxes and are multicast.
 * kernel and user shootdowns can execute simultaneously, as can shootdowns
 * within different multithreaded processes.  TODO:
 *
 * 1. figure out which waitpoints can be deferred to pmap_update().
 * 2. see if there is a cheap way to batch some updates.
 */

const vaddr_t ptp_masks[] = PTP_MASK_INITIALIZER;
const int ptp_shifts[] = PTP_SHIFT_INITIALIZER;
const long nkptpmax[] = NKPTPMAX_INITIALIZER;
const long nbpd[] = NBPD_INITIALIZER;
pd_entry_t * const normal_pdes[] = PDES_INITIALIZER;
pd_entry_t * const alternate_pdes[] = APDES_INITIALIZER;

long nkptp[] = NKPTP_INITIALIZER;

static kmutex_t pmaps_lock;

static vaddr_t pmap_maxkvaddr;

#define	COUNT(x)	/* nothing */

/*
 * XXX kludge: dummy locking to make KASSERTs in uvm_page.c comfortable.
 * actual locking is done by pm_lock.
 */
#if defined(DIAGNOSTIC)
#define	PMAP_SUBOBJ_LOCK(pm, idx) \
	KASSERT(mutex_owned(&(pm)->pm_lock)); \
	if ((idx) != 0) \
		mutex_enter(&(pm)->pm_obj[(idx)].vmobjlock)
#define	PMAP_SUBOBJ_UNLOCK(pm, idx) \
	KASSERT(mutex_owned(&(pm)->pm_lock)); \
	if ((idx) != 0) \
		mutex_exit(&(pm)->pm_obj[(idx)].vmobjlock)
#else /* defined(DIAGNOSTIC) */
#define	PMAP_SUBOBJ_LOCK(pm, idx)	/* nothing */
#define	PMAP_SUBOBJ_UNLOCK(pm, idx)	/* nothing */
#endif /* defined(DIAGNOSTIC) */

/*
 * Global TLB shootdown mailbox.
 */
struct evcnt pmap_tlb_evcnt __aligned(64);
struct pmap_mbox pmap_mbox __aligned(64);

/*
 * Per-CPU data.  The pmap mailbox is cache intensive so gets its
 * own line.  Note that the mailbox must be the first item.
 */
struct pmap_cpu {
	/* TLB shootdown */
	struct pmap_mbox pc_mbox;
};

union {
	struct pmap_cpu pc;
	uint8_t padding[64];
} pmap_cpu[MAXCPUS] __aligned(64);

/*
 * global data structures
 */

static struct pmap kernel_pmap_store;	/* the kernel's pmap (proc0) */
struct pmap *const kernel_pmap_ptr = &kernel_pmap_store;

/*
 * pmap_pg_g: if our processor supports PG_G in the PTE then we
 * set pmap_pg_g to PG_G (otherwise it is zero).
 */

int pmap_pg_g = 0;

/*
 * pmap_largepages: if our processor supports PG_PS and we are
 * using it, this is set to true.
 */

int pmap_largepages;

/*
 * i386 physical memory comes in a big contig chunk with a small
 * hole toward the front of it...  the following two paddr_t's
 * (shared with machdep.c) describe the physical address space
 * of this machine.
 */
paddr_t avail_start;	/* PA of first available physical page */
paddr_t avail_end;	/* PA of last available physical page */

#ifdef XEN
/* First avail vaddr in bootstrap space, needed by pmap_bootstrap() */
vaddr_t first_bt_vaddr;
#ifdef __x86_64__
/* Dummy PGD for user cr3, used between pmap_deactivate() and pmap_activate() */
static paddr_t xen_dummy_user_pgd;
/* Currently active user PGD (can't use rcr3()) */
static paddr_t xen_current_user_pgd = 0;
#endif /* __x86_64__ */
paddr_t pmap_pa_start;	/* PA of first physical page for this domain */
paddr_t pmap_pa_end;	/* PA of last physical page for this domain */
#endif /* XEN */

#define	VM_PAGE_TO_PP(pg)	(&(pg)->mdpage.mp_pp)

#define	pp_lock(pp)	mutex_spin_enter(&(pp)->pp_lock)
#define	pp_unlock(pp)	mutex_spin_exit(&(pp)->pp_lock)
#define	pp_locked(pp)	mutex_owned(&(pp)->pp_lock)

#define	PV_HASH_SIZE		32768
#define	PV_HASH_LOCK_CNT	32

struct pv_hash_lock {
	kmutex_t lock;
} __aligned(CACHE_LINE_SIZE) pv_hash_locks[PV_HASH_LOCK_CNT]
    __aligned(CACHE_LINE_SIZE);

struct pv_hash_head {
	SLIST_HEAD(, pv_entry) hh_list;
} pv_hash_heads[PV_HASH_SIZE];

static u_int
pvhash_hash(struct vm_page *ptp, vaddr_t va)
{

	return (uintptr_t)ptp / sizeof(*ptp) + (va >> PAGE_SHIFT);
}

static struct pv_hash_head *
pvhash_head(u_int hash)
{

	return &pv_hash_heads[hash % PV_HASH_SIZE];
}

static kmutex_t *
pvhash_lock(u_int hash)
{

	return &pv_hash_locks[hash % PV_HASH_LOCK_CNT].lock;
}

static struct pv_entry *
pvhash_remove(struct pv_hash_head *hh, struct vm_page *ptp, vaddr_t va)
{
	struct pv_entry *pve;
	struct pv_entry *prev;

	prev = NULL;
	SLIST_FOREACH(pve, &hh->hh_list, pve_hash) {
		if (pve->pve_pte.pte_ptp == ptp &&
		    pve->pve_pte.pte_va == va) {
			if (prev != NULL) {
				SLIST_REMOVE_AFTER(prev, pve_hash);
			} else {
				SLIST_REMOVE_HEAD(&hh->hh_list, pve_hash);
			}
			break;
		}
		prev = pve;
	}
	return pve;
}

/*
 * other data structures
 */

static pt_entry_t protection_codes[8];	/* maps MI prot to i386 prot code */
static bool pmap_initialized = false;	/* pmap_init done yet? */

/*
 * the following two vaddr_t's are used during system startup
 * to keep track of how much of the kernel's VM space we have used.
 * once the system is started, the management of the remaining kernel
 * VM space is turned over to the kernel_map vm_map.
 */

static vaddr_t virtual_avail;	/* VA of first free KVA */
static vaddr_t virtual_end;	/* VA of last free KVA */

/*
 * linked list of all non-kernel pmaps
 */

static struct pmap_head pmaps;

/*
 * pool that pmap structures are allocated from
 */

static struct pool_cache pmap_cache;

/*
 * pv_entry cache
 */

static struct pool_cache pmap_pv_cache;

/*
 * MULTIPROCESSOR: special VA's/ PTE's are actually allocated inside a
 * maxcpus*NPTECL array of PTE's, to avoid cache line thrashing
 * due to false sharing.
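 *
 * (for illustration: the PTESLEW()/VASLEW() macros below give CPU "id"
 * its own slots, e.g. csrc_pte + id*NPTECL and csrcp + id*NPTECL*PAGE_SIZE,
 * so each CPU's set of special PTEs lives on a cache line of its own.)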
523 */ 524 525 #ifdef MULTIPROCESSOR 526 #define PTESLEW(pte, id) ((pte)+(id)*NPTECL) 527 #define VASLEW(va,id) ((va)+(id)*NPTECL*PAGE_SIZE) 528 #else 529 #define PTESLEW(pte, id) (pte) 530 #define VASLEW(va,id) (va) 531 #endif 532 533 /* 534 * special VAs and the PTEs that map them 535 */ 536 static pt_entry_t *csrc_pte, *cdst_pte, *zero_pte, *ptp_pte, *early_zero_pte; 537 static char *csrcp, *cdstp, *zerop, *ptpp, *early_zerop; 538 539 /* 540 * pool and cache that PDPs are allocated from 541 */ 542 543 static struct pool_cache pmap_pdp_cache; 544 int pmap_pdp_ctor(void *, void *, int); 545 void pmap_pdp_dtor(void *, void *); 546 #ifdef PAE 547 /* need to allocate items of 4 pages */ 548 void *pmap_pdp_alloc(struct pool *, int); 549 void pmap_pdp_free(struct pool *, void *); 550 static struct pool_allocator pmap_pdp_allocator = { 551 .pa_alloc = pmap_pdp_alloc, 552 .pa_free = pmap_pdp_free, 553 .pa_pagesz = PAGE_SIZE * PDP_SIZE, 554 }; 555 #endif /* PAE */ 556 557 void *vmmap; /* XXX: used by mem.c... it should really uvm_map_reserve it */ 558 559 extern vaddr_t idt_vaddr; /* we allocate IDT early */ 560 extern paddr_t idt_paddr; 561 562 #ifdef _LP64 563 extern vaddr_t lo32_vaddr; 564 extern vaddr_t lo32_paddr; 565 #endif 566 567 extern int end; 568 569 #ifdef i386 570 /* stuff to fix the pentium f00f bug */ 571 extern vaddr_t pentium_idt_vaddr; 572 #endif 573 574 575 /* 576 * local prototypes 577 */ 578 579 static struct vm_page *pmap_get_ptp(struct pmap *, vaddr_t, 580 pd_entry_t * const *); 581 static struct vm_page *pmap_find_ptp(struct pmap *, vaddr_t, paddr_t, int); 582 static void pmap_freepage(struct pmap *, struct vm_page *, int); 583 static void pmap_free_ptp(struct pmap *, struct vm_page *, 584 vaddr_t, pt_entry_t *, 585 pd_entry_t * const *); 586 static bool pmap_is_curpmap(struct pmap *); 587 static bool pmap_is_active(struct pmap *, struct cpu_info *, bool); 588 static void pmap_map_ptes(struct pmap *, struct pmap **, 589 pt_entry_t **, pd_entry_t * const **); 590 static void pmap_do_remove(struct pmap *, vaddr_t, vaddr_t, int); 591 static bool pmap_remove_pte(struct pmap *, struct vm_page *, 592 pt_entry_t *, vaddr_t, int, 593 struct pv_entry **); 594 static pt_entry_t pmap_remove_ptes(struct pmap *, struct vm_page *, 595 vaddr_t, vaddr_t, vaddr_t, int, 596 struct pv_entry **); 597 #define PMAP_REMOVE_ALL 0 /* remove all mappings */ 598 #define PMAP_REMOVE_SKIPWIRED 1 /* skip wired mappings */ 599 600 static void pmap_unmap_ptes(struct pmap *, struct pmap *); 601 static bool pmap_get_physpage(vaddr_t, int, paddr_t *); 602 static int pmap_pdes_invalid(vaddr_t, pd_entry_t * const *, 603 pd_entry_t *); 604 #define pmap_pdes_valid(va, pdes, lastpde) \ 605 (pmap_pdes_invalid((va), (pdes), (lastpde)) == 0) 606 static void pmap_alloc_level(pd_entry_t * const *, vaddr_t, int, 607 long *); 608 609 static bool pmap_reactivate(struct pmap *); 610 611 /* 612 * p m a p h e l p e r f u n c t i o n s 613 */ 614 615 static inline void 616 pmap_stats_update(struct pmap *pmap, int resid_diff, int wired_diff) 617 { 618 619 if (pmap == pmap_kernel()) { 620 atomic_add_long(&pmap->pm_stats.resident_count, resid_diff); 621 atomic_add_long(&pmap->pm_stats.wired_count, wired_diff); 622 } else { 623 KASSERT(mutex_owned(&pmap->pm_lock)); 624 pmap->pm_stats.resident_count += resid_diff; 625 pmap->pm_stats.wired_count += wired_diff; 626 } 627 } 628 629 static inline void 630 pmap_stats_update_bypte(struct pmap *pmap, pt_entry_t npte, pt_entry_t opte) 631 { 632 int resid_diff = ((npte & PG_V) ? 
1 : 0) - ((opte & PG_V) ? 1 : 0); 633 int wired_diff = ((npte & PG_W) ? 1 : 0) - ((opte & PG_W) ? 1 : 0); 634 635 KASSERT((npte & (PG_V | PG_W)) != PG_W); 636 KASSERT((opte & (PG_V | PG_W)) != PG_W); 637 638 pmap_stats_update(pmap, resid_diff, wired_diff); 639 } 640 641 /* 642 * ptp_to_pmap: lookup pmap by ptp 643 */ 644 645 static struct pmap * 646 ptp_to_pmap(struct vm_page *ptp) 647 { 648 struct pmap *pmap; 649 650 if (ptp == NULL) { 651 return pmap_kernel(); 652 } 653 pmap = (struct pmap *)ptp->uobject; 654 KASSERT(pmap != NULL); 655 KASSERT(&pmap->pm_obj[0] == ptp->uobject); 656 return pmap; 657 } 658 659 static inline struct pv_pte * 660 pve_to_pvpte(struct pv_entry *pve) 661 { 662 663 KASSERT((void *)&pve->pve_pte == (void *)pve); 664 return &pve->pve_pte; 665 } 666 667 static inline struct pv_entry * 668 pvpte_to_pve(struct pv_pte *pvpte) 669 { 670 struct pv_entry *pve = (void *)pvpte; 671 672 KASSERT(pve_to_pvpte(pve) == pvpte); 673 return pve; 674 } 675 676 /* 677 * pv_pte_first, pv_pte_next: PV list iterator. 678 */ 679 680 static struct pv_pte * 681 pv_pte_first(struct pmap_page *pp) 682 { 683 684 KASSERT(pp_locked(pp)); 685 if ((pp->pp_flags & PP_EMBEDDED) != 0) { 686 return &pp->pp_pte; 687 } 688 return pve_to_pvpte(LIST_FIRST(&pp->pp_head.pvh_list)); 689 } 690 691 static struct pv_pte * 692 pv_pte_next(struct pmap_page *pp, struct pv_pte *pvpte) 693 { 694 695 KASSERT(pvpte != NULL); 696 KASSERT(pp_locked(pp)); 697 if (pvpte == &pp->pp_pte) { 698 KASSERT((pp->pp_flags & PP_EMBEDDED) != 0); 699 return NULL; 700 } 701 KASSERT((pp->pp_flags & PP_EMBEDDED) == 0); 702 return pve_to_pvpte(LIST_NEXT(pvpte_to_pve(pvpte), pve_list)); 703 } 704 705 /* 706 * pmap_is_curpmap: is this pmap the one currently loaded [in %cr3]? 707 * of course the kernel is always loaded 708 */ 709 710 inline static bool 711 pmap_is_curpmap(struct pmap *pmap) 712 { 713 #if defined(XEN) && defined(__x86_64__) 714 /* 715 * Only kernel pmap is physically loaded. 716 * User PGD may be active, but TLB will be flushed 717 * with HYPERVISOR_iret anyway, so let's say no 718 */ 719 return(pmap == pmap_kernel()); 720 #else /* XEN && __x86_64__*/ 721 return((pmap == pmap_kernel()) || 722 (pmap == curcpu()->ci_pmap)); 723 #endif 724 } 725 726 /* 727 * pmap_is_active: is this pmap loaded into the specified processor's %cr3? 728 */ 729 730 inline static bool 731 pmap_is_active(struct pmap *pmap, struct cpu_info *ci, bool kernel) 732 { 733 734 return (pmap == pmap_kernel() || 735 (pmap->pm_cpus & ci->ci_cpumask) != 0 || 736 (kernel && (pmap->pm_kernel_cpus & ci->ci_cpumask) != 0)); 737 } 738 739 static void 740 pmap_apte_flush(struct pmap *pmap) 741 { 742 743 KASSERT(kpreempt_disabled()); 744 745 /* 746 * Flush the APTE mapping from all other CPUs that 747 * are using the pmap we are using (who's APTE space 748 * is the one we've just modified). 749 * 750 * XXXthorpej -- find a way to defer the IPI. 751 */ 752 pmap_tlb_shootdown(pmap, (vaddr_t)-1LL, 0, 0); 753 pmap_tlb_shootwait(); 754 } 755 756 /* 757 * Add a reference to the specified pmap. 
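 * (the reference is dropped with pmap_destroy(); pmap_map_ptes() and
 * pmap_unmap_ptes() below rely on this pairing to keep a borrowed pmap
 * alive while its PTEs are mapped.)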
758 */ 759 760 inline void 761 pmap_reference(struct pmap *pmap) 762 { 763 764 atomic_inc_uint((unsigned *)&pmap->pm_obj[0].uo_refs); 765 } 766 767 /* 768 * pmap_map_ptes: map a pmap's PTEs into KVM and lock them in 769 * 770 * => we lock enough pmaps to keep things locked in 771 * => must be undone with pmap_unmap_ptes before returning 772 */ 773 774 static void 775 pmap_map_ptes(struct pmap *pmap, struct pmap **pmap2, 776 pd_entry_t **ptepp, pd_entry_t * const **pdeppp) 777 { 778 pd_entry_t opde, npde; 779 struct pmap *ourpmap; 780 struct cpu_info *ci; 781 struct lwp *l; 782 bool iscurrent; 783 uint64_t ncsw; 784 #ifdef XEN 785 int s; 786 #endif 787 788 /* the kernel's pmap is always accessible */ 789 if (pmap == pmap_kernel()) { 790 *pmap2 = NULL; 791 *ptepp = PTE_BASE; 792 *pdeppp = normal_pdes; 793 return; 794 } 795 KASSERT(kpreempt_disabled()); 796 797 retry: 798 l = curlwp; 799 ncsw = l->l_ncsw; 800 ourpmap = NULL; 801 ci = curcpu(); 802 #if defined(XEN) && defined(__x86_64__) 803 /* 804 * curmap can only be pmap_kernel so at this point 805 * pmap_is_curpmap is always false 806 */ 807 iscurrent = 0; 808 ourpmap = pmap_kernel(); 809 #else /* XEN && __x86_64__*/ 810 if (ci->ci_want_pmapload && 811 vm_map_pmap(&l->l_proc->p_vmspace->vm_map) == pmap) { 812 pmap_load(); 813 if (l->l_ncsw != ncsw) 814 goto retry; 815 } 816 iscurrent = pmap_is_curpmap(pmap); 817 /* if curpmap then we are always mapped */ 818 if (iscurrent) { 819 mutex_enter(&pmap->pm_lock); 820 *pmap2 = NULL; 821 *ptepp = PTE_BASE; 822 *pdeppp = normal_pdes; 823 goto out; 824 } 825 ourpmap = ci->ci_pmap; 826 #endif /* XEN && __x86_64__ */ 827 828 /* need to lock both curpmap and pmap: use ordered locking */ 829 pmap_reference(ourpmap); 830 if ((uintptr_t) pmap < (uintptr_t) ourpmap) { 831 mutex_enter(&pmap->pm_lock); 832 mutex_enter(&ourpmap->pm_lock); 833 } else { 834 mutex_enter(&ourpmap->pm_lock); 835 mutex_enter(&pmap->pm_lock); 836 } 837 838 if (l->l_ncsw != ncsw) 839 goto unlock_and_retry; 840 841 /* need to load a new alternate pt space into curpmap? */ 842 COUNT(apdp_pde_map); 843 opde = *APDP_PDE; 844 #ifdef XEN 845 if (!pmap_valid_entry(opde) || 846 pmap_pte2pa(opde) != pmap_pdirpa(pmap, 0)) { 847 int i; 848 s = splvm(); 849 /* Make recursive entry usable in user PGD */ 850 for (i = 0; i < PDP_SIZE; i++) { 851 npde = pmap_pa2pte( 852 pmap_pdirpa(pmap, i * NPDPG)) | PG_k | PG_V; 853 xpq_queue_pte_update( 854 xpmap_ptom(pmap_pdirpa(pmap, PDIR_SLOT_PTE + i)), 855 npde); 856 xpq_queue_pte_update(xpmap_ptetomach(&APDP_PDE[i]), 857 npde); 858 #ifdef PAE 859 /* update shadow entry too */ 860 xpq_queue_pte_update( 861 xpmap_ptetomach(&APDP_PDE_SHADOW[i]), npde); 862 #endif /* PAE */ 863 xpq_queue_invlpg( 864 (vaddr_t)&pmap->pm_pdir[PDIR_SLOT_PTE + i]); 865 } 866 xpq_flush_queue(); 867 if (pmap_valid_entry(opde)) 868 pmap_apte_flush(ourpmap); 869 splx(s); 870 } 871 #else /* XEN */ 872 npde = pmap_pa2pte(pmap_pdirpa(pmap, 0)) | PG_RW | PG_V; 873 if (!pmap_valid_entry(opde) || 874 pmap_pte2pa(opde) != pmap_pdirpa(pmap, 0)) { 875 pmap_pte_set(APDP_PDE, npde); 876 pmap_pte_flush(); 877 if (pmap_valid_entry(opde)) 878 pmap_apte_flush(ourpmap); 879 } 880 #endif /* XEN */ 881 *pmap2 = ourpmap; 882 *ptepp = APTE_BASE; 883 *pdeppp = alternate_pdes; 884 KASSERT(l->l_ncsw == ncsw); 885 #if !defined(XEN) || !defined(__x86_64__) 886 out: 887 #endif 888 /* 889 * might have blocked, need to retry? 
890 */ 891 if (l->l_ncsw != ncsw) { 892 unlock_and_retry: 893 if (ourpmap != NULL) { 894 mutex_exit(&ourpmap->pm_lock); 895 pmap_destroy(ourpmap); 896 } 897 mutex_exit(&pmap->pm_lock); 898 goto retry; 899 } 900 901 return; 902 } 903 904 /* 905 * pmap_unmap_ptes: unlock the PTE mapping of "pmap" 906 */ 907 908 static void 909 pmap_unmap_ptes(struct pmap *pmap, struct pmap *pmap2) 910 { 911 912 if (pmap == pmap_kernel()) { 913 return; 914 } 915 KASSERT(kpreempt_disabled()); 916 if (pmap2 == NULL) { 917 mutex_exit(&pmap->pm_lock); 918 } else { 919 #if defined(XEN) && defined(__x86_64__) 920 KASSERT(pmap2 == pmap_kernel()); 921 #else 922 KASSERT(curcpu()->ci_pmap == pmap2); 923 #endif 924 #if defined(MULTIPROCESSOR) 925 pmap_pte_set(APDP_PDE, 0); 926 pmap_pte_flush(); 927 pmap_apte_flush(pmap2); 928 #endif 929 COUNT(apdp_pde_unmap); 930 mutex_exit(&pmap->pm_lock); 931 mutex_exit(&pmap2->pm_lock); 932 pmap_destroy(pmap2); 933 } 934 } 935 936 inline static void 937 pmap_exec_account(struct pmap *pm, vaddr_t va, pt_entry_t opte, pt_entry_t npte) 938 { 939 940 #if !defined(__x86_64__) 941 if (curproc == NULL || curproc->p_vmspace == NULL || 942 pm != vm_map_pmap(&curproc->p_vmspace->vm_map)) 943 return; 944 945 if ((opte ^ npte) & PG_X) 946 pmap_update_pg(va); 947 948 /* 949 * Executability was removed on the last executable change. 950 * Reset the code segment to something conservative and 951 * let the trap handler deal with setting the right limit. 952 * We can't do that because of locking constraints on the vm map. 953 */ 954 955 if ((opte & PG_X) && (npte & PG_X) == 0 && va == pm->pm_hiexec) { 956 struct trapframe *tf = curlwp->l_md.md_regs; 957 958 tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL); 959 pm->pm_hiexec = I386_MAX_EXE_ADDR; 960 } 961 #endif /* !defined(__x86_64__) */ 962 } 963 964 #if !defined(__x86_64__) 965 /* 966 * Fixup the code segment to cover all potential executable mappings. 967 * returns 0 if no changes to the code segment were made. 968 */ 969 970 int 971 pmap_exec_fixup(struct vm_map *map, struct trapframe *tf, struct pcb *pcb) 972 { 973 struct vm_map_entry *ent; 974 struct pmap *pm = vm_map_pmap(map); 975 vaddr_t va = 0; 976 977 vm_map_lock_read(map); 978 for (ent = (&map->header)->next; ent != &map->header; ent = ent->next) { 979 980 /* 981 * This entry has greater va than the entries before. 982 * We need to make it point to the last page, not past it. 983 */ 984 985 if (ent->protection & VM_PROT_EXECUTE) 986 va = trunc_page(ent->end) - PAGE_SIZE; 987 } 988 vm_map_unlock_read(map); 989 if (va == pm->pm_hiexec && tf->tf_cs == GSEL(GUCODEBIG_SEL, SEL_UPL)) 990 return (0); 991 992 pm->pm_hiexec = va; 993 if (pm->pm_hiexec > I386_MAX_EXE_ADDR) { 994 tf->tf_cs = GSEL(GUCODEBIG_SEL, SEL_UPL); 995 } else { 996 tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL); 997 return (0); 998 } 999 return (1); 1000 } 1001 #endif /* !defined(__x86_64__) */ 1002 1003 /* 1004 * p m a p k e n t e r f u n c t i o n s 1005 * 1006 * functions to quickly enter/remove pages from the kernel address 1007 * space. pmap_kremove is exported to MI kernel. we make use of 1008 * the recursive PTE mappings. 
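 *
 * a rough usage sketch (illustrative only; it assumes the caller manages
 * the VA itself with the usual uvm_km_alloc()/uvm_km_free() interfaces):
 *
 *	va = uvm_km_alloc(kernel_map, PAGE_SIZE, 0, UVM_KMF_VAONLY);
 *	pmap_kenter_pa(va, pa, VM_PROT_READ | VM_PROT_WRITE);
 *	 ... use the mapping ...
 *	pmap_kremove(va, PAGE_SIZE);
 *	pmap_update(pmap_kernel());	 ... required before the VA is reused
 *	uvm_km_free(kernel_map, va, PAGE_SIZE, UVM_KMF_VAONLY);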
1009 */ 1010 1011 /* 1012 * pmap_kenter_pa: enter a kernel mapping without R/M (pv_entry) tracking 1013 * 1014 * => no need to lock anything, assume va is already allocated 1015 * => should be faster than normal pmap enter function 1016 */ 1017 1018 void 1019 pmap_kenter_pa(vaddr_t va, paddr_t pa, vm_prot_t prot) 1020 { 1021 pt_entry_t *pte, opte, npte; 1022 1023 KASSERT(!(prot & ~VM_PROT_ALL)); 1024 1025 if (va < VM_MIN_KERNEL_ADDRESS) 1026 pte = vtopte(va); 1027 else 1028 pte = kvtopte(va); 1029 #ifdef DOM0OPS 1030 if (pa < pmap_pa_start || pa >= pmap_pa_end) { 1031 #ifdef DEBUG 1032 printk("pmap_kenter_pa: pa 0x%" PRIx64 " for va 0x%" PRIx64 1033 " outside range\n", (int64_t)pa, (int64_t)va); 1034 #endif /* DEBUG */ 1035 npte = pa; 1036 } else 1037 #endif /* DOM0OPS */ 1038 npte = pmap_pa2pte(pa); 1039 npte |= protection_codes[prot] | PG_k | PG_V | pmap_pg_g; 1040 opte = pmap_pte_testset(pte, npte); /* zap! */ 1041 #if defined(DIAGNOSTIC) 1042 /* XXX For now... */ 1043 if (opte & PG_PS) 1044 panic("pmap_kenter_pa: PG_PS"); 1045 #endif 1046 if ((opte & (PG_V | PG_U)) == (PG_V | PG_U)) { 1047 /* This should not happen, so no need to batch updates. */ 1048 kpreempt_disable(); 1049 pmap_tlb_shootdown(pmap_kernel(), va, 0, opte); 1050 kpreempt_enable(); 1051 } 1052 } 1053 1054 #ifdef XEN 1055 /* 1056 * pmap_kenter_ma: enter a kernel mapping without R/M (pv_entry) tracking 1057 * 1058 * => no need to lock anything, assume va is already allocated 1059 * => should be faster than normal pmap enter function 1060 * => we expect a MACHINE address 1061 */ 1062 1063 void 1064 pmap_kenter_ma(vaddr_t va, paddr_t ma, vm_prot_t prot) 1065 { 1066 pt_entry_t *pte, opte, npte; 1067 1068 if (va < VM_MIN_KERNEL_ADDRESS) 1069 pte = vtopte(va); 1070 else 1071 pte = kvtopte(va); 1072 1073 npte = ma | ((prot & VM_PROT_WRITE) ? PG_RW : PG_RO) | 1074 PG_V | PG_k; 1075 #ifndef XEN 1076 if ((cpu_feature & CPUID_NOX) && !(prot & VM_PROT_EXECUTE)) 1077 npte |= PG_NX; 1078 #endif 1079 opte = pmap_pte_testset (pte, npte); /* zap! */ 1080 1081 if (pmap_valid_entry(opte)) { 1082 #if defined(MULTIPROCESSOR) 1083 kpreempt_disable(); 1084 pmap_tlb_shootdown(pmap_kernel(), va, 0, opte); 1085 kpreempt_enable(); 1086 #else 1087 /* Don't bother deferring in the single CPU case. */ 1088 pmap_update_pg(va); 1089 #endif 1090 } 1091 } 1092 #endif /* XEN */ 1093 1094 #if defined(__x86_64__) 1095 /* 1096 * Change protection for a virtual address. Local for a CPU only, don't 1097 * care about TLB shootdowns. 
1098 * 1099 * => must be called with preemption disabled 1100 */ 1101 void 1102 pmap_changeprot_local(vaddr_t va, vm_prot_t prot) 1103 { 1104 pt_entry_t *pte, opte, npte; 1105 1106 KASSERT(kpreempt_disabled()); 1107 1108 if (va < VM_MIN_KERNEL_ADDRESS) 1109 pte = vtopte(va); 1110 else 1111 pte = kvtopte(va); 1112 1113 npte = opte = *pte; 1114 1115 if ((prot & VM_PROT_WRITE) != 0) 1116 npte |= PG_RW; 1117 else 1118 npte &= ~PG_RW; 1119 1120 if (opte != npte) { 1121 pmap_pte_set(pte, npte); 1122 pmap_pte_flush(); 1123 invlpg(va); 1124 } 1125 } 1126 #endif /* defined(__x86_64__) */ 1127 1128 /* 1129 * pmap_kremove: remove a kernel mapping(s) without R/M (pv_entry) tracking 1130 * 1131 * => no need to lock anything 1132 * => caller must dispose of any vm_page mapped in the va range 1133 * => note: not an inline function 1134 * => we assume the va is page aligned and the len is a multiple of PAGE_SIZE 1135 * => we assume kernel only unmaps valid addresses and thus don't bother 1136 * checking the valid bit before doing TLB flushing 1137 * => must be followed by call to pmap_update() before reuse of page 1138 */ 1139 1140 void 1141 pmap_kremove(vaddr_t sva, vsize_t len) 1142 { 1143 pt_entry_t *pte, xpte; 1144 vaddr_t va, eva; 1145 1146 eva = sva + len; 1147 xpte = 0; 1148 1149 for (va = sva; va < eva; va += PAGE_SIZE) { 1150 if (va < VM_MIN_KERNEL_ADDRESS) 1151 pte = vtopte(va); 1152 else 1153 pte = kvtopte(va); 1154 xpte |= pmap_pte_testset(pte, 0); /* zap! */ 1155 #if defined(DIAGNOSTIC) 1156 /* XXX For now... */ 1157 if (xpte & PG_PS) 1158 panic("pmap_kremove: PG_PS"); 1159 if (xpte & PG_PVLIST) 1160 panic("pmap_kremove: PG_PVLIST mapping for 0x%lx", 1161 va); 1162 #endif 1163 } 1164 if ((xpte & (PG_V | PG_U)) == (PG_V | PG_U)) { 1165 kpreempt_disable(); 1166 pmap_tlb_shootdown(pmap_kernel(), sva, eva, xpte); 1167 kpreempt_enable(); 1168 } 1169 } 1170 1171 /* 1172 * p m a p i n i t f u n c t i o n s 1173 * 1174 * pmap_bootstrap and pmap_init are called during system startup 1175 * to init the pmap module. pmap_bootstrap() does a low level 1176 * init just to get things rolling. pmap_init() finishes the job. 1177 */ 1178 1179 /* 1180 * pmap_bootstrap: get the system in a state where it can run with VM 1181 * properly enabled (called before main()). the VM system is 1182 * fully init'd later... 1183 * 1184 * => on i386, locore.s has already enabled the MMU by allocating 1185 * a PDP for the kernel, and nkpde PTP's for the kernel. 1186 * => kva_start is the first free virtual address in kernel space 1187 */ 1188 1189 void 1190 pmap_bootstrap(vaddr_t kva_start) 1191 { 1192 struct pmap *kpm; 1193 pt_entry_t *pte; 1194 int i; 1195 vaddr_t kva; 1196 #ifdef XEN 1197 pt_entry_t pg_nx = 0; 1198 #else 1199 unsigned long p1i; 1200 vaddr_t kva_end; 1201 pt_entry_t pg_nx = (cpu_feature & CPUID_NOX ? PG_NX : 0); 1202 #endif 1203 1204 /* 1205 * set up our local static global vars that keep track of the 1206 * usage of KVM before kernel_map is set up 1207 */ 1208 1209 virtual_avail = kva_start; /* first free KVA */ 1210 virtual_end = VM_MAX_KERNEL_ADDRESS; /* last KVA */ 1211 1212 /* 1213 * set up protection_codes: we need to be able to convert from 1214 * a MI protection code (some combo of VM_PROT...) to something 1215 * we can jam into a i386 PTE. 
1216 */ 1217 1218 protection_codes[VM_PROT_NONE] = pg_nx; /* --- */ 1219 protection_codes[VM_PROT_EXECUTE] = PG_RO | PG_X; /* --x */ 1220 protection_codes[VM_PROT_READ] = PG_RO | pg_nx; /* -r- */ 1221 protection_codes[VM_PROT_READ|VM_PROT_EXECUTE] = PG_RO | PG_X;/* -rx */ 1222 protection_codes[VM_PROT_WRITE] = PG_RW | pg_nx; /* w-- */ 1223 protection_codes[VM_PROT_WRITE|VM_PROT_EXECUTE] = PG_RW | PG_X;/* w-x */ 1224 protection_codes[VM_PROT_WRITE|VM_PROT_READ] = PG_RW | pg_nx; 1225 /* wr- */ 1226 protection_codes[VM_PROT_ALL] = PG_RW | PG_X; /* wrx */ 1227 1228 /* 1229 * now we init the kernel's pmap 1230 * 1231 * the kernel pmap's pm_obj is not used for much. however, in 1232 * user pmaps the pm_obj contains the list of active PTPs. 1233 * the pm_obj currently does not have a pager. it might be possible 1234 * to add a pager that would allow a process to read-only mmap its 1235 * own page tables (fast user level vtophys?). this may or may not 1236 * be useful. 1237 */ 1238 1239 kpm = pmap_kernel(); 1240 for (i = 0; i < PTP_LEVELS - 1; i++) { 1241 UVM_OBJ_INIT(&kpm->pm_obj[i], NULL, 1); 1242 kpm->pm_ptphint[i] = NULL; 1243 } 1244 memset(&kpm->pm_list, 0, sizeof(kpm->pm_list)); /* pm_list not used */ 1245 kpm->pm_pdir = (pd_entry_t *)(lwp0.l_addr->u_pcb.pcb_cr3 + KERNBASE); 1246 #ifdef PAE 1247 for (i = 0; i < PDP_SIZE; i++) 1248 kpm->pm_pdirpa[i] = 1249 (paddr_t)lwp0.l_addr->u_pcb.pcb_cr3 + PAGE_SIZE * i; 1250 #else 1251 kpm->pm_pdirpa = (paddr_t) lwp0.l_addr->u_pcb.pcb_cr3; 1252 #endif 1253 kpm->pm_stats.wired_count = kpm->pm_stats.resident_count = 1254 x86_btop(kva_start - VM_MIN_KERNEL_ADDRESS); 1255 1256 /* 1257 * the above is just a rough estimate and not critical to the proper 1258 * operation of the system. 1259 */ 1260 1261 #ifndef XEN 1262 /* 1263 * Begin to enable global TLB entries if they are supported. 1264 * The G bit has no effect until the CR4_PGE bit is set in CR4, 1265 * which happens in cpu_init(), which is run on each cpu 1266 * (and happens later) 1267 */ 1268 1269 if (cpu_feature & CPUID_PGE) { 1270 pmap_pg_g = PG_G; /* enable software */ 1271 1272 /* add PG_G attribute to already mapped kernel pages */ 1273 if (KERNBASE == VM_MIN_KERNEL_ADDRESS) { 1274 kva_end = virtual_avail; 1275 } else { 1276 extern vaddr_t eblob, esym; 1277 kva_end = (vaddr_t)&end; 1278 if (esym > kva_end) 1279 kva_end = esym; 1280 if (eblob > kva_end) 1281 kva_end = eblob; 1282 kva_end = roundup(kva_end, PAGE_SIZE); 1283 } 1284 for (kva = KERNBASE; kva < kva_end; kva += PAGE_SIZE) { 1285 p1i = pl1_i(kva); 1286 if (pmap_valid_entry(PTE_BASE[p1i])) 1287 PTE_BASE[p1i] |= PG_G; 1288 } 1289 } 1290 1291 /* 1292 * enable large pages if they are supported. 1293 */ 1294 1295 if (cpu_feature & CPUID_PSE) { 1296 paddr_t pa; 1297 pd_entry_t *pde; 1298 extern char __data_start; 1299 1300 lcr4(rcr4() | CR4_PSE); /* enable hardware (via %cr4) */ 1301 pmap_largepages = 1; /* enable software */ 1302 1303 /* 1304 * the TLB must be flushed after enabling large pages 1305 * on Pentium CPUs, according to section 3.6.2.2 of 1306 * "Intel Architecture Software Developer's Manual, 1307 * Volume 3: System Programming". 1308 */ 1309 tlbflush(); 1310 1311 /* 1312 * now, remap the kernel text using large pages. we 1313 * assume that the linker has properly aligned the 1314 * .data segment to a NBPD_L2 boundary. 
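 *
 * (for illustration: NBPD_L2 is 4MB on i386 without PAE and 2MB with
 * PAE or on amd64, so each iteration of the loop below replaces 1024,
 * respectively 512, 4KB text mappings with a single PG_PS superpage.)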
1315 */ 1316 kva_end = rounddown((vaddr_t)&__data_start, NBPD_L1); 1317 for (pa = 0, kva = KERNBASE; kva + NBPD_L2 <= kva_end; 1318 kva += NBPD_L2, pa += NBPD_L2) { 1319 pde = &L2_BASE[pl2_i(kva)]; 1320 *pde = pa | pmap_pg_g | PG_PS | 1321 PG_KR | PG_V; /* zap! */ 1322 tlbflush(); 1323 } 1324 #if defined(DEBUG) 1325 printf("kernel text is mapped with " 1326 "%lu large pages and %lu normal pages\n", 1327 (unsigned long)howmany(kva - KERNBASE, NBPD_L2), 1328 (unsigned long)howmany((vaddr_t)&__data_start - kva, 1329 NBPD_L1)); 1330 #endif /* defined(DEBUG) */ 1331 } 1332 #endif /* !XEN */ 1333 1334 if (VM_MIN_KERNEL_ADDRESS != KERNBASE) { 1335 /* 1336 * zero_pte is stuck at the end of mapped space for the kernel 1337 * image (disjunct from kva space). This is done so that it 1338 * can safely be used in pmap_growkernel (pmap_get_physpage), 1339 * when it's called for the first time. 1340 * XXXfvdl fix this for MULTIPROCESSOR later. 1341 */ 1342 1343 early_zerop = (void *)(KERNBASE + NKL2_KIMG_ENTRIES * NBPD_L2); 1344 early_zero_pte = PTE_BASE + pl1_i((unsigned long)early_zerop); 1345 } 1346 1347 /* 1348 * now we allocate the "special" VAs which are used for tmp mappings 1349 * by the pmap (and other modules). we allocate the VAs by advancing 1350 * virtual_avail (note that there are no pages mapped at these VAs). 1351 * we find the PTE that maps the allocated VA via the linear PTE 1352 * mapping. 1353 */ 1354 1355 pte = PTE_BASE + pl1_i(virtual_avail); 1356 1357 #ifdef MULTIPROCESSOR 1358 /* 1359 * Waste some VA space to avoid false sharing of cache lines 1360 * for page table pages: Give each possible CPU a cache line 1361 * of PTE's (8) to play with, though we only need 4. We could 1362 * recycle some of this waste by putting the idle stacks here 1363 * as well; we could waste less space if we knew the largest 1364 * CPU ID beforehand. 1365 */ 1366 csrcp = (char *) virtual_avail; csrc_pte = pte; 1367 1368 cdstp = (char *) virtual_avail+PAGE_SIZE; cdst_pte = pte+1; 1369 1370 zerop = (char *) virtual_avail+PAGE_SIZE*2; zero_pte = pte+2; 1371 1372 ptpp = (char *) virtual_avail+PAGE_SIZE*3; ptp_pte = pte+3; 1373 1374 virtual_avail += PAGE_SIZE * maxcpus * NPTECL; 1375 pte += maxcpus * NPTECL; 1376 #else 1377 csrcp = (void *) virtual_avail; csrc_pte = pte; /* allocate */ 1378 virtual_avail += PAGE_SIZE; pte++; /* advance */ 1379 1380 cdstp = (void *) virtual_avail; cdst_pte = pte; 1381 virtual_avail += PAGE_SIZE; pte++; 1382 1383 zerop = (void *) virtual_avail; zero_pte = pte; 1384 virtual_avail += PAGE_SIZE; pte++; 1385 1386 ptpp = (void *) virtual_avail; ptp_pte = pte; 1387 virtual_avail += PAGE_SIZE; pte++; 1388 #endif 1389 1390 if (VM_MIN_KERNEL_ADDRESS == KERNBASE) { 1391 early_zerop = zerop; 1392 early_zero_pte = zero_pte; 1393 } 1394 1395 /* 1396 * Nothing after this point actually needs pte; 1397 */ 1398 pte = (void *)0xdeadbeef; 1399 1400 /* XXX: vmmap used by mem.c... should be uvm_map_reserve */ 1401 /* XXXfvdl PTEs not needed here */ 1402 vmmap = (char *)virtual_avail; /* don't need pte */ 1403 virtual_avail += PAGE_SIZE; pte++; 1404 1405 #ifdef XEN 1406 #ifdef __x86_64__ 1407 /* 1408 * We want a dummy page directory for Xen: 1409 * when deactivate a pmap, Xen will still consider it active. 1410 * So we set user PGD to this one to lift all protection on 1411 * the now inactive page tables set. 
1412 */ 1413 xen_dummy_user_pgd = avail_start; 1414 avail_start += PAGE_SIZE; 1415 1416 /* Zero fill it, the less checks in Xen it requires the better */ 1417 memset((void *) (xen_dummy_user_pgd + KERNBASE), 0, PAGE_SIZE); 1418 /* Mark read-only */ 1419 HYPERVISOR_update_va_mapping(xen_dummy_user_pgd + KERNBASE, 1420 pmap_pa2pte(xen_dummy_user_pgd) | PG_u | PG_V, UVMF_INVLPG); 1421 /* Pin as L4 */ 1422 xpq_queue_pin_table(xpmap_ptom_masked(xen_dummy_user_pgd)); 1423 #endif /* __x86_64__ */ 1424 idt_vaddr = virtual_avail; /* don't need pte */ 1425 idt_paddr = avail_start; /* steal a page */ 1426 /* 1427 * Xen require one more page as we can't store 1428 * GDT and LDT on the same page 1429 */ 1430 virtual_avail += 3 * PAGE_SIZE; 1431 avail_start += 3 * PAGE_SIZE; 1432 #else /* XEN */ 1433 idt_vaddr = virtual_avail; /* don't need pte */ 1434 idt_paddr = avail_start; /* steal a page */ 1435 #if defined(__x86_64__) 1436 virtual_avail += 2 * PAGE_SIZE; pte += 2; 1437 avail_start += 2 * PAGE_SIZE; 1438 #else /* defined(__x86_64__) */ 1439 virtual_avail += PAGE_SIZE; pte++; 1440 avail_start += PAGE_SIZE; 1441 /* pentium f00f bug stuff */ 1442 pentium_idt_vaddr = virtual_avail; /* don't need pte */ 1443 virtual_avail += PAGE_SIZE; pte++; 1444 #endif /* defined(__x86_64__) */ 1445 #endif /* XEN */ 1446 1447 #ifdef _LP64 1448 /* 1449 * Grab a page below 4G for things that need it (i.e. 1450 * having an initial %cr3 for the MP trampoline). 1451 */ 1452 lo32_vaddr = virtual_avail; 1453 virtual_avail += PAGE_SIZE; pte++; 1454 lo32_paddr = avail_start; 1455 avail_start += PAGE_SIZE; 1456 #endif 1457 1458 /* 1459 * now we reserve some VM for mapping pages when doing a crash dump 1460 */ 1461 1462 virtual_avail = reserve_dumppages(virtual_avail); 1463 1464 /* 1465 * init the static-global locks and global lists. 1466 * 1467 * => pventry::pvh_lock (initialized elsewhere) must also be 1468 * a spin lock, again at IPL_VM to prevent deadlock, and 1469 * again is never taken from interrupt context. 1470 */ 1471 1472 mutex_init(&pmaps_lock, MUTEX_DEFAULT, IPL_NONE); 1473 LIST_INIT(&pmaps); 1474 pmap_cpu_init_early(curcpu()); 1475 1476 /* 1477 * initialize caches. 1478 */ 1479 1480 pool_cache_bootstrap(&pmap_cache, sizeof(struct pmap), 0, 0, 0, 1481 "pmappl", NULL, IPL_NONE, NULL, NULL, NULL); 1482 #ifdef PAE 1483 pool_cache_bootstrap(&pmap_pdp_cache, PAGE_SIZE * PDP_SIZE, 0, 0, 0, 1484 "pdppl", &pmap_pdp_allocator, IPL_NONE, 1485 pmap_pdp_ctor, pmap_pdp_dtor, NULL); 1486 #else /* PAE */ 1487 pool_cache_bootstrap(&pmap_pdp_cache, PAGE_SIZE, 0, 0, 0, 1488 "pdppl", NULL, IPL_NONE, pmap_pdp_ctor, pmap_pdp_dtor, NULL); 1489 #endif /* PAE */ 1490 pool_cache_bootstrap(&pmap_pv_cache, sizeof(struct pv_entry), 0, 0, 1491 PR_LARGECACHE, "pvpl", &pool_allocator_meta, IPL_NONE, NULL, 1492 NULL, NULL); 1493 1494 /* 1495 * ensure the TLB is sync'd with reality by flushing it... 1496 */ 1497 1498 tlbflush(); 1499 1500 /* 1501 * calculate pmap_maxkvaddr from nkptp[]. 1502 */ 1503 1504 kva = VM_MIN_KERNEL_ADDRESS; 1505 for (i = PTP_LEVELS - 1; i >= 1; i--) { 1506 kva += nkptp[i] * nbpd[i]; 1507 } 1508 pmap_maxkvaddr = kva; 1509 } 1510 1511 #if defined(__x86_64__) 1512 /* 1513 * Pre-allocate PTPs for low memory, so that 1:1 mappings for various 1514 * trampoline code can be entered. 
1515 */ 1516 void 1517 pmap_prealloc_lowmem_ptps(void) 1518 { 1519 #ifdef XEN 1520 int level; 1521 paddr_t newp; 1522 paddr_t pdes_pa; 1523 1524 pdes_pa = pmap_kernel()->pm_pdirpa; 1525 level = PTP_LEVELS; 1526 for (;;) { 1527 newp = avail_start; 1528 avail_start += PAGE_SIZE; 1529 HYPERVISOR_update_va_mapping ((vaddr_t)early_zerop, 1530 xpmap_ptom_masked(newp) | PG_u | PG_V | PG_RW, UVMF_INVLPG); 1531 memset((void *)early_zerop, 0, PAGE_SIZE); 1532 /* Mark R/O before installing */ 1533 HYPERVISOR_update_va_mapping ((vaddr_t)early_zerop, 1534 xpmap_ptom_masked(newp) | PG_u | PG_V, UVMF_INVLPG); 1535 if (newp < (NKL2_KIMG_ENTRIES * NBPD_L2)) 1536 HYPERVISOR_update_va_mapping (newp + KERNBASE, 1537 xpmap_ptom_masked(newp) | PG_u | PG_V, UVMF_INVLPG); 1538 xpq_queue_pte_update ( 1539 xpmap_ptom_masked(pdes_pa) 1540 + (pl_i(0, level) * sizeof (pd_entry_t)), 1541 xpmap_ptom_masked(newp) | PG_RW | PG_u | PG_V); 1542 level--; 1543 if (level <= 1) 1544 break; 1545 pdes_pa = newp; 1546 } 1547 #else /* XEN */ 1548 pd_entry_t *pdes; 1549 int level; 1550 paddr_t newp; 1551 1552 pdes = pmap_kernel()->pm_pdir; 1553 level = PTP_LEVELS; 1554 for (;;) { 1555 newp = avail_start; 1556 avail_start += PAGE_SIZE; 1557 *early_zero_pte = (newp & PG_FRAME) | PG_V | PG_RW; 1558 pmap_update_pg((vaddr_t)early_zerop); 1559 memset(early_zerop, 0, PAGE_SIZE); 1560 pdes[pl_i(0, level)] = (newp & PG_FRAME) | PG_V | PG_RW; 1561 level--; 1562 if (level <= 1) 1563 break; 1564 pdes = normal_pdes[level - 2]; 1565 } 1566 #endif /* XEN */ 1567 } 1568 #endif /* defined(__x86_64__) */ 1569 1570 /* 1571 * pmap_init: called from uvm_init, our job is to get the pmap 1572 * system ready to manage mappings... 1573 */ 1574 1575 void 1576 pmap_init(void) 1577 { 1578 int i; 1579 1580 for (i = 0; i < PV_HASH_SIZE; i++) { 1581 SLIST_INIT(&pv_hash_heads[i].hh_list); 1582 } 1583 for (i = 0; i < PV_HASH_LOCK_CNT; i++) { 1584 mutex_init(&pv_hash_locks[i].lock, MUTEX_NODEBUG, IPL_VM); 1585 } 1586 1587 /* 1588 * done: pmap module is up (and ready for business) 1589 */ 1590 1591 pmap_initialized = true; 1592 } 1593 1594 /* 1595 * pmap_cpu_init_early: perform early per-CPU initialization. 1596 */ 1597 1598 void 1599 pmap_cpu_init_early(struct cpu_info *ci) 1600 { 1601 struct pmap_cpu *pc; 1602 static uint8_t pmap_cpu_alloc; 1603 1604 pc = &pmap_cpu[pmap_cpu_alloc++].pc; 1605 ci->ci_pmap_cpu = pc; 1606 } 1607 1608 /* 1609 * pmap_cpu_init_late: perform late per-CPU initialization. 
1610 */ 1611 1612 void 1613 pmap_cpu_init_late(struct cpu_info *ci) 1614 { 1615 1616 if (ci == &cpu_info_primary) 1617 evcnt_attach_dynamic(&pmap_tlb_evcnt, EVCNT_TYPE_INTR, 1618 NULL, "global", "TLB IPI"); 1619 evcnt_attach_dynamic(&ci->ci_tlb_evcnt, EVCNT_TYPE_MISC, 1620 NULL, device_xname(ci->ci_dev), "TLB IPI"); 1621 } 1622 1623 /* 1624 * p v _ e n t r y f u n c t i o n s 1625 */ 1626 1627 /* 1628 * pmap_free_pvs: free a list of pv_entrys 1629 */ 1630 1631 static void 1632 pmap_free_pvs(struct pv_entry *pve) 1633 { 1634 struct pv_entry *next; 1635 1636 for ( /* null */ ; pve != NULL ; pve = next) { 1637 next = pve->pve_next; 1638 pool_cache_put(&pmap_pv_cache, pve); 1639 } 1640 } 1641 1642 /* 1643 * main pv_entry manipulation functions: 1644 * pmap_enter_pv: enter a mapping onto a pv_head list 1645 * pmap_remove_pv: remove a mappiing from a pv_head list 1646 * 1647 * NOTE: Both pmap_enter_pv and pmap_remove_pv expect the caller to lock 1648 * the pvh before calling 1649 */ 1650 1651 /* 1652 * insert_pv: a helper of pmap_enter_pv 1653 */ 1654 1655 static void 1656 insert_pv(struct pmap_page *pp, struct pv_entry *pve) 1657 { 1658 struct pv_hash_head *hh; 1659 kmutex_t *lock; 1660 u_int hash; 1661 1662 KASSERT(pp_locked(pp)); 1663 1664 hash = pvhash_hash(pve->pve_pte.pte_ptp, pve->pve_pte.pte_va); 1665 lock = pvhash_lock(hash); 1666 hh = pvhash_head(hash); 1667 mutex_spin_enter(lock); 1668 SLIST_INSERT_HEAD(&hh->hh_list, pve, pve_hash); 1669 mutex_spin_exit(lock); 1670 1671 LIST_INSERT_HEAD(&pp->pp_head.pvh_list, pve, pve_list); 1672 } 1673 1674 /* 1675 * pmap_enter_pv: enter a mapping onto a pv_head lst 1676 * 1677 * => caller should have the pp_lock locked 1678 * => caller should adjust ptp's wire_count before calling 1679 */ 1680 1681 static struct pv_entry * 1682 pmap_enter_pv(struct pmap_page *pp, 1683 struct pv_entry *pve, /* preallocated pve for us to use */ 1684 struct pv_entry **sparepve, 1685 struct vm_page *ptp, 1686 vaddr_t va) 1687 { 1688 1689 KASSERT(ptp == NULL || ptp->wire_count >= 2); 1690 KASSERT(ptp == NULL || ptp->uobject != NULL); 1691 KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset); 1692 KASSERT(pp_locked(pp)); 1693 1694 if ((pp->pp_flags & PP_EMBEDDED) == 0) { 1695 if (LIST_EMPTY(&pp->pp_head.pvh_list)) { 1696 pp->pp_flags |= PP_EMBEDDED; 1697 pp->pp_pte.pte_ptp = ptp; 1698 pp->pp_pte.pte_va = va; 1699 1700 return pve; 1701 } 1702 } else { 1703 struct pv_entry *pve2; 1704 1705 pve2 = *sparepve; 1706 *sparepve = NULL; 1707 1708 pve2->pve_pte = pp->pp_pte; 1709 pp->pp_flags &= ~PP_EMBEDDED; 1710 LIST_INIT(&pp->pp_head.pvh_list); 1711 insert_pv(pp, pve2); 1712 } 1713 1714 pve->pve_pte.pte_ptp = ptp; 1715 pve->pve_pte.pte_va = va; 1716 insert_pv(pp, pve); 1717 1718 return NULL; 1719 } 1720 1721 /* 1722 * pmap_remove_pv: try to remove a mapping from a pv_list 1723 * 1724 * => caller should hold pp_lock [so that attrs can be adjusted] 1725 * => caller should adjust ptp's wire_count and free PTP if needed 1726 * => we return the removed pve 1727 */ 1728 1729 static struct pv_entry * 1730 pmap_remove_pv(struct pmap_page *pp, struct vm_page *ptp, vaddr_t va) 1731 { 1732 struct pv_hash_head *hh; 1733 struct pv_entry *pve; 1734 kmutex_t *lock; 1735 u_int hash; 1736 1737 KASSERT(ptp == NULL || ptp->uobject != NULL); 1738 KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset); 1739 KASSERT(pp_locked(pp)); 1740 1741 if ((pp->pp_flags & PP_EMBEDDED) != 0) { 1742 KASSERT(pp->pp_pte.pte_ptp == ptp); 1743 KASSERT(pp->pp_pte.pte_va == va); 1744 1745 pp->pp_flags &= 
~PP_EMBEDDED; 1746 LIST_INIT(&pp->pp_head.pvh_list); 1747 1748 return NULL; 1749 } 1750 1751 hash = pvhash_hash(ptp, va); 1752 lock = pvhash_lock(hash); 1753 hh = pvhash_head(hash); 1754 mutex_spin_enter(lock); 1755 pve = pvhash_remove(hh, ptp, va); 1756 mutex_spin_exit(lock); 1757 1758 LIST_REMOVE(pve, pve_list); 1759 1760 return pve; 1761 } 1762 1763 /* 1764 * p t p f u n c t i o n s 1765 */ 1766 1767 static inline struct vm_page * 1768 pmap_find_ptp(struct pmap *pmap, vaddr_t va, paddr_t pa, int level) 1769 { 1770 int lidx = level - 1; 1771 struct vm_page *pg; 1772 1773 KASSERT(mutex_owned(&pmap->pm_lock)); 1774 1775 if (pa != (paddr_t)-1 && pmap->pm_ptphint[lidx] && 1776 pa == VM_PAGE_TO_PHYS(pmap->pm_ptphint[lidx])) { 1777 return (pmap->pm_ptphint[lidx]); 1778 } 1779 PMAP_SUBOBJ_LOCK(pmap, lidx); 1780 pg = uvm_pagelookup(&pmap->pm_obj[lidx], ptp_va2o(va, level)); 1781 PMAP_SUBOBJ_UNLOCK(pmap, lidx); 1782 1783 KASSERT(pg == NULL || pg->wire_count >= 1); 1784 return pg; 1785 } 1786 1787 static inline void 1788 pmap_freepage(struct pmap *pmap, struct vm_page *ptp, int level) 1789 { 1790 int lidx; 1791 struct uvm_object *obj; 1792 1793 KASSERT(ptp->wire_count == 1); 1794 1795 lidx = level - 1; 1796 1797 obj = &pmap->pm_obj[lidx]; 1798 pmap_stats_update(pmap, -1, 0); 1799 if (lidx != 0) 1800 mutex_enter(&obj->vmobjlock); 1801 if (pmap->pm_ptphint[lidx] == ptp) 1802 pmap->pm_ptphint[lidx] = TAILQ_FIRST(&obj->memq); 1803 ptp->wire_count = 0; 1804 uvm_pagerealloc(ptp, NULL, 0); 1805 VM_PAGE_TO_PP(ptp)->pp_link = curlwp->l_md.md_gc_ptp; 1806 curlwp->l_md.md_gc_ptp = ptp; 1807 if (lidx != 0) 1808 mutex_exit(&obj->vmobjlock); 1809 } 1810 1811 static void 1812 pmap_free_ptp(struct pmap *pmap, struct vm_page *ptp, vaddr_t va, 1813 pt_entry_t *ptes, pd_entry_t * const *pdes) 1814 { 1815 unsigned long index; 1816 int level; 1817 vaddr_t invaladdr; 1818 #ifdef MULTIPROCESSOR 1819 vaddr_t invaladdr2; 1820 #endif 1821 pd_entry_t opde; 1822 struct pmap *curpmap = vm_map_pmap(&curlwp->l_proc->p_vmspace->vm_map); 1823 1824 KASSERT(pmap != pmap_kernel()); 1825 KASSERT(mutex_owned(&pmap->pm_lock)); 1826 KASSERT(kpreempt_disabled()); 1827 1828 level = 1; 1829 do { 1830 index = pl_i(va, level + 1); 1831 opde = pmap_pte_testset(&pdes[level - 1][index], 0); 1832 #if defined(XEN) && defined(__x86_64__) 1833 /* 1834 * If ptp is a L3 currently mapped in kernel space, 1835 * clear it before freeing 1836 */ 1837 if (pmap->pm_pdirpa == xen_current_user_pgd 1838 && level == PTP_LEVELS - 1) 1839 pmap_pte_set(&pmap_kernel()->pm_pdir[index], 0); 1840 #endif /* XEN && __x86_64__ */ 1841 pmap_freepage(pmap, ptp, level); 1842 invaladdr = level == 1 ? (vaddr_t)ptes : 1843 (vaddr_t)pdes[level - 2]; 1844 pmap_tlb_shootdown(curpmap, invaladdr + index * PAGE_SIZE, 1845 0, opde); 1846 #if defined(MULTIPROCESSOR) 1847 invaladdr2 = level == 1 ? 
(vaddr_t)PTE_BASE : 1848 (vaddr_t)normal_pdes[level - 2]; 1849 if (pmap != curpmap || invaladdr != invaladdr2) { 1850 pmap_tlb_shootdown(pmap, invaladdr2 + index * PAGE_SIZE, 1851 0, opde); 1852 } 1853 #endif 1854 if (level < PTP_LEVELS - 1) { 1855 ptp = pmap_find_ptp(pmap, va, (paddr_t)-1, level + 1); 1856 ptp->wire_count--; 1857 if (ptp->wire_count > 1) 1858 break; 1859 } 1860 } while (++level < PTP_LEVELS); 1861 pmap_pte_flush(); 1862 } 1863 1864 /* 1865 * pmap_get_ptp: get a PTP (if there isn't one, allocate a new one) 1866 * 1867 * => pmap should NOT be pmap_kernel() 1868 * => pmap should be locked 1869 * => preemption should be disabled 1870 */ 1871 1872 static struct vm_page * 1873 pmap_get_ptp(struct pmap *pmap, vaddr_t va, pd_entry_t * const *pdes) 1874 { 1875 struct vm_page *ptp, *pptp; 1876 int i; 1877 unsigned long index; 1878 pd_entry_t *pva; 1879 paddr_t ppa, pa; 1880 struct uvm_object *obj; 1881 1882 KASSERT(pmap != pmap_kernel()); 1883 KASSERT(mutex_owned(&pmap->pm_lock)); 1884 KASSERT(kpreempt_disabled()); 1885 1886 ptp = NULL; 1887 pa = (paddr_t)-1; 1888 1889 /* 1890 * Loop through all page table levels seeing if we need to 1891 * add a new page to that level. 1892 */ 1893 for (i = PTP_LEVELS; i > 1; i--) { 1894 /* 1895 * Save values from previous round. 1896 */ 1897 pptp = ptp; 1898 ppa = pa; 1899 1900 index = pl_i(va, i); 1901 pva = pdes[i - 2]; 1902 1903 if (pmap_valid_entry(pva[index])) { 1904 ppa = pmap_pte2pa(pva[index]); 1905 ptp = NULL; 1906 continue; 1907 } 1908 1909 obj = &pmap->pm_obj[i-2]; 1910 PMAP_SUBOBJ_LOCK(pmap, i - 2); 1911 ptp = uvm_pagealloc(obj, ptp_va2o(va, i - 1), NULL, 1912 UVM_PGA_USERESERVE|UVM_PGA_ZERO); 1913 PMAP_SUBOBJ_UNLOCK(pmap, i - 2); 1914 1915 if (ptp == NULL) 1916 return NULL; 1917 1918 ptp->flags &= ~PG_BUSY; /* never busy */ 1919 ptp->wire_count = 1; 1920 pmap->pm_ptphint[i - 2] = ptp; 1921 pa = VM_PAGE_TO_PHYS(ptp); 1922 pmap_pte_set(&pva[index], (pd_entry_t) 1923 (pmap_pa2pte(pa) | PG_u | PG_RW | PG_V)); 1924 #if defined(XEN) && defined(__x86_64__) 1925 /* 1926 * In Xen we must enter the mapping in kernel map too 1927 * if pmap is curmap and modifying top level (PGD) 1928 */ 1929 if(i == PTP_LEVELS && pmap != pmap_kernel()) { 1930 pmap_pte_set(&pmap_kernel()->pm_pdir[index], 1931 (pd_entry_t) (pmap_pa2pte(pa) 1932 | PG_u | PG_RW | PG_V)); 1933 } 1934 #endif /* XEN && __x86_64__ */ 1935 pmap_pte_flush(); 1936 pmap_stats_update(pmap, 1, 0); 1937 /* 1938 * If we're not in the top level, increase the 1939 * wire count of the parent page. 1940 */ 1941 if (i < PTP_LEVELS) { 1942 if (pptp == NULL) 1943 pptp = pmap_find_ptp(pmap, va, ppa, i); 1944 #ifdef DIAGNOSTIC 1945 if (pptp == NULL) 1946 panic("pde page disappeared"); 1947 #endif 1948 pptp->wire_count++; 1949 } 1950 } 1951 1952 /* 1953 * ptp is not NULL if we just allocated a new ptp. If it's 1954 * still NULL, we must look up the existing one. 1955 */ 1956 if (ptp == NULL) { 1957 ptp = pmap_find_ptp(pmap, va, ppa, 1); 1958 #ifdef DIAGNOSTIC 1959 if (ptp == NULL) { 1960 printf("va %lx ppa %lx\n", (unsigned long)va, 1961 (unsigned long)ppa); 1962 panic("pmap_get_ptp: unmanaged user PTP"); 1963 } 1964 #endif 1965 } 1966 1967 pmap->pm_ptphint[0] = ptp; 1968 return(ptp); 1969 } 1970 1971 /* 1972 * p m a p l i f e c y c l e f u n c t i o n s 1973 */ 1974 1975 /* 1976 * pmap_pdp_ctor: constructor for the PDP cache. 
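 *
 * On native (non-Xen) hardware the construction below boils down to
 * three steps: clear the user part of the new directory, install the
 * recursive slot so the directory maps itself (which is what makes the
 * page tables visible through the PDIR_SLOT_PTE window), and copy the
 * kernel's top-level entries so kernel VA is shared by every pmap.
 * Roughly, with the PAE, Xen and locking details stripped out
 * (pdp_ctor_outline() is only an illustrative name, not a real helper):
 *
 *	static void
 *	pdp_ctor_outline(pd_entry_t *pdir, paddr_t pdirpa)
 *	{
 *		memset(pdir, 0, PDIR_SLOT_PTE * sizeof(pd_entry_t));
 *		pdir[PDIR_SLOT_PTE] = pmap_pa2pte(pdirpa) | PG_V | PG_KW;
 *		memcpy(&pdir[PDIR_SLOT_KERN], &PDP_BASE[PDIR_SLOT_KERN],
 *		    nkptp[PTP_LEVELS - 1] * sizeof(pd_entry_t));
 *	}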
1977 */ 1978 1979 int 1980 pmap_pdp_ctor(void *arg, void *v, int flags) 1981 { 1982 pd_entry_t *pdir = v; 1983 paddr_t pdirpa = 0; /* XXX: GCC */ 1984 vaddr_t object; 1985 int i; 1986 1987 #if !defined(XEN) || !defined(__x86_64__) 1988 int npde; 1989 #endif 1990 #ifdef XEN 1991 int s; 1992 #endif 1993 1994 /* 1995 * NOTE: The `pmap_lock' is held when the PDP is allocated. 1996 */ 1997 1998 #if defined(XEN) && defined(__x86_64__) 1999 /* fetch the physical address of the page directory. */ 2000 (void) pmap_extract(pmap_kernel(), (vaddr_t) pdir, &pdirpa); 2001 2002 /* zero init area */ 2003 memset (pdir, 0, PAGE_SIZE); /* Xen wants a clean page */ 2004 /* 2005 * this pdir will NEVER be active in kernel mode 2006 * so mark recursive entry invalid 2007 */ 2008 pdir[PDIR_SLOT_PTE] = pmap_pa2pte(pdirpa) | PG_u; 2009 /* 2010 * PDP constructed this way won't be for kernel, 2011 * hence we don't put kernel mappings on Xen. 2012 * But we need to make pmap_create() happy, so put a dummy (without 2013 * PG_V) value at the right place. 2014 */ 2015 pdir[PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1] - 1] = 2016 (unsigned long)-1 & PG_FRAME; 2017 #else /* XEN && __x86_64__*/ 2018 /* zero init area */ 2019 memset(pdir, 0, PDIR_SLOT_PTE * sizeof(pd_entry_t)); 2020 2021 object = (vaddr_t)v; 2022 for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) { 2023 /* fetch the physical address of the page directory. */ 2024 (void) pmap_extract(pmap_kernel(), object, &pdirpa); 2025 /* put in recursive PDE to map the PTEs */ 2026 pdir[PDIR_SLOT_PTE + i] = pmap_pa2pte(pdirpa) | PG_V; 2027 #ifndef XEN 2028 pdir[PDIR_SLOT_PTE + i] |= PG_KW; 2029 #endif 2030 } 2031 2032 /* copy kernel's PDE */ 2033 npde = nkptp[PTP_LEVELS - 1]; 2034 2035 memcpy(&pdir[PDIR_SLOT_KERN], &PDP_BASE[PDIR_SLOT_KERN], 2036 npde * sizeof(pd_entry_t)); 2037 2038 /* zero the rest */ 2039 memset(&pdir[PDIR_SLOT_KERN + npde], 0, 2040 (NTOPLEVEL_PDES - (PDIR_SLOT_KERN + npde)) * sizeof(pd_entry_t)); 2041 2042 if (VM_MIN_KERNEL_ADDRESS != KERNBASE) { 2043 int idx = pl_i(KERNBASE, PTP_LEVELS); 2044 2045 pdir[idx] = PDP_BASE[idx]; 2046 } 2047 #endif /* XEN && __x86_64__*/ 2048 #ifdef XEN 2049 s = splvm(); 2050 object = (vaddr_t)v; 2051 for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) { 2052 (void) pmap_extract(pmap_kernel(), object, &pdirpa); 2053 /* remap this page RO */ 2054 pmap_kenter_pa(object, pdirpa, VM_PROT_READ); 2055 pmap_update(pmap_kernel()); 2056 /* 2057 * pin as L2/L4 page, we have to do the page with the 2058 * PDIR_SLOT_PTE entries last 2059 */ 2060 #ifdef PAE 2061 if (i == l2tol3(PDIR_SLOT_PTE)) 2062 continue; 2063 #endif 2064 xpq_queue_pin_table(xpmap_ptom_masked(pdirpa)); 2065 } 2066 #ifdef PAE 2067 object = ((vaddr_t)pdir) + PAGE_SIZE * l2tol3(PDIR_SLOT_PTE); 2068 (void)pmap_extract(pmap_kernel(), object, &pdirpa); 2069 xpq_queue_pin_table(xpmap_ptom_masked(pdirpa)); 2070 #endif 2071 xpq_flush_queue(); 2072 splx(s); 2073 #endif /* XEN */ 2074 2075 return (0); 2076 } 2077 2078 /* 2079 * pmap_pdp_dtor: destructor for the PDP cache. 2080 */ 2081 2082 void 2083 pmap_pdp_dtor(void *arg, void *v) 2084 { 2085 #ifdef XEN 2086 paddr_t pdirpa = 0; /* XXX: GCC */ 2087 vaddr_t object = (vaddr_t)v; 2088 int i; 2089 int s = splvm(); 2090 pt_entry_t *pte; 2091 2092 for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) { 2093 /* fetch the physical address of the page directory. 
*/ 2094 (void) pmap_extract(pmap_kernel(), object, &pdirpa); 2095 /* unpin page table */ 2096 xpq_queue_unpin_table(xpmap_ptom_masked(pdirpa)); 2097 } 2098 object = (vaddr_t)v; 2099 for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) { 2100 /* Set page RW again */ 2101 pte = kvtopte(object); 2102 xpq_queue_pte_update(xpmap_ptetomach(pte), *pte | PG_RW); 2103 xpq_queue_invlpg((vaddr_t)object); 2104 } 2105 xpq_flush_queue(); 2106 splx(s); 2107 #endif /* XEN */ 2108 } 2109 2110 #ifdef PAE 2111 2112 /* pmap_pdp_alloc: Allocate a page for the pdp memory pool. */ 2113 2114 void * 2115 pmap_pdp_alloc(struct pool *pp, int flags) 2116 { 2117 return (void *)uvm_km_alloc(kernel_map, 2118 PAGE_SIZE * PDP_SIZE, PAGE_SIZE * PDP_SIZE, 2119 ((flags & PR_WAITOK) ? 0 : UVM_KMF_NOWAIT | UVM_KMF_TRYLOCK) 2120 | UVM_KMF_WIRED); 2121 } 2122 2123 /* 2124 * pmap_pdp_free: free a PDP 2125 */ 2126 2127 void 2128 pmap_pdp_free(struct pool *pp, void *v) 2129 { 2130 uvm_km_free(kernel_map, (vaddr_t)v, PAGE_SIZE * PDP_SIZE, 2131 UVM_KMF_WIRED); 2132 } 2133 #endif /* PAE */ 2134 2135 /* 2136 * pmap_create: create a pmap 2137 * 2138 * => note: old pmap interface took a "size" args which allowed for 2139 * the creation of "software only" pmaps (not in bsd). 2140 */ 2141 2142 struct pmap * 2143 pmap_create(void) 2144 { 2145 struct pmap *pmap; 2146 int i; 2147 2148 pmap = pool_cache_get(&pmap_cache, PR_WAITOK); 2149 2150 /* init uvm_object */ 2151 for (i = 0; i < PTP_LEVELS - 1; i++) { 2152 UVM_OBJ_INIT(&pmap->pm_obj[i], NULL, 1); 2153 pmap->pm_ptphint[i] = NULL; 2154 } 2155 pmap->pm_stats.wired_count = 0; 2156 pmap->pm_stats.resident_count = 1; /* count the PDP allocd below */ 2157 #if !defined(__x86_64__) 2158 pmap->pm_hiexec = 0; 2159 #endif /* !defined(__x86_64__) */ 2160 pmap->pm_flags = 0; 2161 pmap->pm_cpus = 0; 2162 pmap->pm_kernel_cpus = 0; 2163 2164 /* init the LDT */ 2165 pmap->pm_ldt = NULL; 2166 pmap->pm_ldt_len = 0; 2167 pmap->pm_ldt_sel = GSYSSEL(GLDT_SEL, SEL_KPL); 2168 2169 /* allocate PDP */ 2170 try_again: 2171 pmap->pm_pdir = pool_cache_get(&pmap_pdp_cache, PR_WAITOK); 2172 2173 mutex_enter(&pmaps_lock); 2174 2175 if (pmap->pm_pdir[PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1] - 1] == 0) { 2176 mutex_exit(&pmaps_lock); 2177 pool_cache_destruct_object(&pmap_pdp_cache, pmap->pm_pdir); 2178 goto try_again; 2179 } 2180 2181 #ifdef PAE 2182 for (i = 0; i < PDP_SIZE; i++) 2183 pmap->pm_pdirpa[i] = 2184 pmap_pte2pa(pmap->pm_pdir[PDIR_SLOT_PTE + i]); 2185 #else 2186 pmap->pm_pdirpa = pmap_pte2pa(pmap->pm_pdir[PDIR_SLOT_PTE]); 2187 #endif 2188 2189 LIST_INSERT_HEAD(&pmaps, pmap, pm_list); 2190 2191 mutex_exit(&pmaps_lock); 2192 2193 return (pmap); 2194 } 2195 2196 /* 2197 * pmap_destroy: drop reference count on pmap. free pmap if 2198 * reference count goes to zero. 2199 */ 2200 2201 void 2202 pmap_destroy(struct pmap *pmap) 2203 { 2204 int i; 2205 #ifdef DIAGNOSTIC 2206 struct cpu_info *ci; 2207 CPU_INFO_ITERATOR cii; 2208 #endif /* DIAGNOSTIC */ 2209 2210 /* 2211 * if we have torn down this pmap, process deferred frees and 2212 * invalidations now. 
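 *
 * once any deferred work has been flushed, the destruction itself is
 * driven by the reference count drop just below: atomic_dec_uint_nv()
 * returns the new count, so exactly one caller sees it reach zero and
 * goes on to release the pmap's resources.  The same lock-free
 * "decrement, free on zero" idiom in stand-alone C11 (obj_put() and
 * struct obj are made-up illustrative names, not kernel interfaces):
 *
 *	#include <stdatomic.h>
 *
 *	struct obj {
 *		atomic_uint refs;	// starts at 1 for the creator
 *	};
 *
 *	static void
 *	obj_put(struct obj *o, void (*freefn)(struct obj *))
 *	{
 *		// atomic_fetch_sub() returns the value before the
 *		// decrement, so old == 1 means this caller dropped
 *		// the last reference and may free the object.
 *		unsigned old = atomic_fetch_sub(&o->refs, 1);
 *		if (old == 1)
 *			freefn(o);
 *	}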
2213 */ 2214 if (__predict_false(curlwp->l_md.md_gc_pmap == pmap)) { 2215 pmap_update(pmap); 2216 } 2217 2218 /* 2219 * drop reference count 2220 */ 2221 2222 if (atomic_dec_uint_nv((unsigned *)&pmap->pm_obj[0].uo_refs) > 0) { 2223 return; 2224 } 2225 2226 #ifdef DIAGNOSTIC 2227 for (CPU_INFO_FOREACH(cii, ci)) 2228 if (ci->ci_pmap == pmap) 2229 panic("destroying pmap being used"); 2230 #endif /* DIAGNOSTIC */ 2231 2232 /* 2233 * reference count is zero, free pmap resources and then free pmap. 2234 */ 2235 #ifdef XEN 2236 /* 2237 * Xen lazy APDP handling: 2238 * clear APDP_PDE if pmap is the currently mapped 2239 */ 2240 if (xpmap_ptom_masked(pmap_pdirpa(pmap, 0)) == (*APDP_PDE & PG_FRAME)) { 2241 kpreempt_disable(); 2242 for (i = 0; i < PDP_SIZE; i++) { 2243 pmap_pte_set(&APDP_PDE[i], 0); 2244 #ifdef PAE 2245 /* clear shadow entry too */ 2246 pmap_pte_set(&APDP_PDE_SHADOW[i], 0); 2247 #endif 2248 } 2249 pmap_pte_flush(); 2250 pmap_apte_flush(pmap_kernel()); 2251 kpreempt_enable(); 2252 } 2253 #endif 2254 2255 /* 2256 * remove it from global list of pmaps 2257 */ 2258 2259 mutex_enter(&pmaps_lock); 2260 LIST_REMOVE(pmap, pm_list); 2261 mutex_exit(&pmaps_lock); 2262 2263 /* 2264 * destroyed pmap shouldn't have remaining PTPs 2265 */ 2266 2267 for (i = 0; i < PTP_LEVELS - 1; i++) { 2268 KASSERT(pmap->pm_obj[i].uo_npages == 0); 2269 KASSERT(TAILQ_EMPTY(&pmap->pm_obj[i].memq)); 2270 } 2271 2272 /* 2273 * MULTIPROCESSOR -- no need to flush out of other processors' 2274 * APTE space because we do that in pmap_unmap_ptes(). 2275 */ 2276 pool_cache_put(&pmap_pdp_cache, pmap->pm_pdir); 2277 2278 #ifdef USER_LDT 2279 if (pmap->pm_flags & PMF_USER_LDT) { 2280 /* 2281 * no need to switch the LDT; this address space is gone, 2282 * nothing is using it. 2283 * 2284 * No need to lock the pmap for ldt_free (or anything else), 2285 * we're the last one to use it. 2286 */ 2287 ldt_free(pmap->pm_ldt_sel); 2288 uvm_km_free(kernel_map, (vaddr_t)pmap->pm_ldt, 2289 pmap->pm_ldt_len * sizeof(union descriptor), UVM_KMF_WIRED); 2290 } 2291 #endif 2292 2293 for (i = 0; i < PTP_LEVELS - 1; i++) 2294 mutex_destroy(&pmap->pm_obj[i].vmobjlock); 2295 pool_cache_put(&pmap_cache, pmap); 2296 } 2297 2298 /* 2299 * pmap_remove_all: pmap is being torn down by the current thread. 2300 * avoid unnecessary invalidations. 2301 */ 2302 2303 void 2304 pmap_remove_all(struct pmap *pmap) 2305 { 2306 lwp_t *l = curlwp; 2307 2308 KASSERT(l->l_md.md_gc_pmap == NULL); 2309 2310 l->l_md.md_gc_pmap = pmap; 2311 } 2312 2313 #if defined(PMAP_FORK) 2314 /* 2315 * pmap_fork: perform any necessary data structure manipulation when 2316 * a VM space is forked. 2317 */ 2318 2319 void 2320 pmap_fork(struct pmap *pmap1, struct pmap *pmap2) 2321 { 2322 #ifdef USER_LDT 2323 union descriptor *new_ldt; 2324 size_t len; 2325 int sel; 2326 2327 retry: 2328 if (pmap1->pm_flags & PMF_USER_LDT) { 2329 len = pmap1->pm_ldt_len * sizeof(union descriptor); 2330 new_ldt = (union descriptor *)uvm_km_alloc(kernel_map, 2331 len, 0, UVM_KMF_WIRED); 2332 sel = ldt_alloc(new_ldt, len); 2333 } else { 2334 len = -1; 2335 new_ldt = NULL; 2336 sel = -1; 2337 } 2338 2339 if ((uintptr_t) pmap1 < (uintptr_t) pmap2) { 2340 mutex_enter(&pmap1->pm_lock); 2341 mutex_enter(&pmap2->pm_lock); 2342 } else { 2343 mutex_enter(&pmap2->pm_lock); 2344 mutex_enter(&pmap1->pm_lock); 2345 } 2346 2347 /* Copy the LDT, if necessary. 
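 *
 * The copy works in three stages: the new LDT is allocated with no
 * locks held, both pmaps are then locked in address order (lowest
 * pointer first) so two concurrent forks can never deadlock on each
 * other, and finally the source length is re-checked under the locks,
 * retrying from the top if it changed while we were allocating.  The
 * address-ordering part of that pattern as a generic, stand-alone
 * sketch (lock_pair() is a hypothetical helper, not kernel API):
 *
 *	#include <pthread.h>
 *	#include <stdint.h>
 *
 *	static void
 *	lock_pair(pthread_mutex_t *a, pthread_mutex_t *b)
 *	{
 *		// always take the lower-addressed lock first (a != b)
 *		if ((uintptr_t)a < (uintptr_t)b) {
 *			pthread_mutex_lock(a);
 *			pthread_mutex_lock(b);
 *		} else {
 *			pthread_mutex_lock(b);
 *			pthread_mutex_lock(a);
 *		}
 *	}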
*/ 2348 if (pmap1->pm_flags & PMF_USER_LDT) { 2349 if (len != pmap1->pm_ldt_len * sizeof(union descriptor)) { 2350 mutex_exit(&pmap2->pm_lock); 2351 mutex_exit(&pmap1->pm_lock); 2352 if (len != -1) { 2353 ldt_free(sel); 2354 uvm_km_free(kernel_map, (vaddr_t)new_ldt, 2355 len, UVM_KMF_WIRED); 2356 } 2357 goto retry; 2358 } 2359 2360 memcpy(new_ldt, pmap1->pm_ldt, len); 2361 pmap2->pm_ldt = new_ldt; 2362 pmap2->pm_ldt_len = pmap1->pm_ldt_len; 2363 pmap2->pm_flags |= PMF_USER_LDT; 2364 pmap2->pm_ldt_sel = sel; 2365 len = -1; 2366 } 2367 2368 mutex_exit(&pmap2->pm_lock); 2369 mutex_exit(&pmap1->pm_lock); 2370 2371 if (len != -1) { 2372 ldt_free(sel); 2373 uvm_km_free(kernel_map, (vaddr_t)new_ldt, len, 2374 UVM_KMF_WIRED); 2375 } 2376 #endif /* USER_LDT */ 2377 } 2378 #endif /* PMAP_FORK */ 2379 2380 #ifdef USER_LDT 2381 /* 2382 * pmap_ldt_cleanup: if the pmap has a local LDT, deallocate it, and 2383 * restore the default. 2384 */ 2385 2386 void 2387 pmap_ldt_cleanup(struct lwp *l) 2388 { 2389 struct pcb *pcb = &l->l_addr->u_pcb; 2390 pmap_t pmap = l->l_proc->p_vmspace->vm_map.pmap; 2391 union descriptor *old_ldt = NULL; 2392 size_t len = 0; 2393 int sel = -1; 2394 2395 mutex_enter(&pmap->pm_lock); 2396 kpreempt_disable(); 2397 2398 if (pmap->pm_flags & PMF_USER_LDT) { 2399 sel = pmap->pm_ldt_sel; 2400 pmap->pm_ldt_sel = GSYSSEL(GLDT_SEL, SEL_KPL); 2401 pcb->pcb_ldt_sel = pmap->pm_ldt_sel; 2402 if (l == curlwp) 2403 lldt(pcb->pcb_ldt_sel); 2404 old_ldt = pmap->pm_ldt; 2405 len = pmap->pm_ldt_len * sizeof(union descriptor); 2406 pmap->pm_ldt = NULL; 2407 pmap->pm_ldt_len = 0; 2408 pmap->pm_flags &= ~PMF_USER_LDT; 2409 } 2410 2411 kpreempt_enable(); 2412 mutex_exit(&pmap->pm_lock); 2413 2414 if (sel != -1) 2415 ldt_free(sel); 2416 if (old_ldt != NULL) 2417 uvm_km_free(kernel_map, (vaddr_t)old_ldt, len, UVM_KMF_WIRED); 2418 } 2419 #endif /* USER_LDT */ 2420 2421 /* 2422 * pmap_activate: activate a process' pmap 2423 * 2424 * => must be called with kernel preemption disabled 2425 * => if lwp is the curlwp, then set ci_want_pmapload so that 2426 * actual MMU context switch will be done by pmap_load() later 2427 */ 2428 2429 void 2430 pmap_activate(struct lwp *l) 2431 { 2432 struct cpu_info *ci; 2433 struct pmap *pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map); 2434 2435 KASSERT(kpreempt_disabled()); 2436 2437 ci = curcpu(); 2438 2439 if (l == ci->ci_curlwp) { 2440 struct pcb *pcb; 2441 2442 KASSERT(ci->ci_want_pmapload == 0); 2443 KASSERT(ci->ci_tlbstate != TLBSTATE_VALID); 2444 #ifdef KSTACK_CHECK_DR0 2445 /* 2446 * setup breakpoint on the top of stack 2447 */ 2448 if (l == &lwp0) 2449 dr0(0, 0, 0, 0); 2450 else 2451 dr0(KSTACK_LOWEST_ADDR(l), 1, 3, 1); 2452 #endif 2453 2454 /* 2455 * no need to switch to kernel vmspace because 2456 * it's a subset of any vmspace. 2457 */ 2458 2459 if (pmap == pmap_kernel()) { 2460 ci->ci_want_pmapload = 0; 2461 return; 2462 } 2463 2464 pcb = &l->l_addr->u_pcb; 2465 pcb->pcb_ldt_sel = pmap->pm_ldt_sel; 2466 2467 ci->ci_want_pmapload = 1; 2468 2469 #if defined(__x86_64__) 2470 if (pcb->pcb_flags & PCB_GS64) 2471 wrmsr(MSR_KERNELGSBASE, pcb->pcb_gs); 2472 if (pcb->pcb_flags & PCB_FS64) 2473 wrmsr(MSR_FSBASE, pcb->pcb_fs); 2474 #endif /* defined(__x86_64__) */ 2475 } 2476 } 2477 2478 /* 2479 * pmap_reactivate: try to regain reference to the pmap. 
2480 * 2481 * => must be called with kernel preemption disabled 2482 */ 2483 2484 static bool 2485 pmap_reactivate(struct pmap *pmap) 2486 { 2487 struct cpu_info *ci; 2488 uint32_t cpumask; 2489 bool result; 2490 uint32_t oldcpus; 2491 2492 ci = curcpu(); 2493 cpumask = ci->ci_cpumask; 2494 2495 KASSERT(kpreempt_disabled()); 2496 #if defined(XEN) && defined(__x86_64__) 2497 KASSERT(pmap->pm_pdirpa == xen_current_user_pgd); 2498 #elif defined(PAE) 2499 KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(pmap_l3pd[0])); 2500 #elif !defined(XEN) || (defined(XEN) && defined(XEN3)) 2501 KASSERT(pmap->pm_pdirpa == pmap_pte2pa(rcr3())); 2502 #endif 2503 2504 /* 2505 * if we still have a lazy reference to this pmap, 2506 * we can assume that there was no tlb shootdown 2507 * for this pmap in the meantime. 2508 * 2509 * the order of events here is important as we must 2510 * synchronize with TLB shootdown interrupts. declare 2511 * interest in invalidations (TLBSTATE_VALID) and then 2512 * check the cpumask, which the IPIs can change only 2513 * when the state is TLBSTATE_LAZY. 2514 */ 2515 2516 ci->ci_tlbstate = TLBSTATE_VALID; 2517 oldcpus = pmap->pm_cpus; 2518 KASSERT((pmap->pm_kernel_cpus & cpumask) != 0); 2519 if (oldcpus & cpumask) { 2520 /* got it */ 2521 result = true; 2522 } else { 2523 /* must reload */ 2524 atomic_or_32(&pmap->pm_cpus, cpumask); 2525 result = false; 2526 } 2527 2528 return result; 2529 } 2530 2531 /* 2532 * pmap_load: actually switch pmap. (fill in %cr3 and LDT info) 2533 */ 2534 2535 void 2536 pmap_load(void) 2537 { 2538 struct cpu_info *ci; 2539 uint32_t cpumask; 2540 struct pmap *pmap; 2541 struct pmap *oldpmap; 2542 struct lwp *l; 2543 struct pcb *pcb; 2544 uint64_t ncsw; 2545 2546 kpreempt_disable(); 2547 retry: 2548 ci = curcpu(); 2549 if (!ci->ci_want_pmapload) { 2550 kpreempt_enable(); 2551 return; 2552 } 2553 cpumask = ci->ci_cpumask; 2554 l = ci->ci_curlwp; 2555 ncsw = l->l_ncsw; 2556 2557 /* should be able to take ipis. */ 2558 KASSERT(ci->ci_ilevel < IPL_IPI); 2559 #ifdef XEN 2560 /* XXX not yet KASSERT(x86_read_psl() != 0); */ 2561 #else 2562 KASSERT((x86_read_psl() & PSL_I) != 0); 2563 #endif 2564 2565 KASSERT(l != NULL); 2566 pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map); 2567 KASSERT(pmap != pmap_kernel()); 2568 oldpmap = ci->ci_pmap; 2569 2570 pcb = &l->l_addr->u_pcb; 2571 /* loaded by pmap_activate */ 2572 KASSERT(pcb->pcb_ldt_sel == pmap->pm_ldt_sel); 2573 2574 if (pmap == oldpmap) { 2575 if (!pmap_reactivate(pmap)) { 2576 2577 /* 2578 * pmap has been changed during deactivated. 2579 * our tlb may be stale. 2580 */ 2581 2582 tlbflush(); 2583 } 2584 2585 ci->ci_want_pmapload = 0; 2586 kpreempt_enable(); 2587 return; 2588 } 2589 2590 /* 2591 * grab a reference to the new pmap. 2592 */ 2593 2594 pmap_reference(pmap); 2595 2596 /* 2597 * actually switch pmap. 2598 */ 2599 2600 atomic_and_32(&oldpmap->pm_cpus, ~cpumask); 2601 atomic_and_32(&oldpmap->pm_kernel_cpus, ~cpumask); 2602 2603 #if defined(XEN) && defined(__x86_64__) 2604 KASSERT(oldpmap->pm_pdirpa == xen_current_user_pgd || 2605 oldpmap == pmap_kernel()); 2606 #elif defined(PAE) 2607 KASSERT(pmap_pdirpa(oldpmap, 0) == pmap_pte2pa(pmap_l3pd[0])); 2608 #elif !defined(XEN) || (defined(XEN) && defined(XEN3)) 2609 KASSERT(oldpmap->pm_pdirpa == pmap_pte2pa(rcr3())); 2610 #endif 2611 KASSERT((pmap->pm_cpus & cpumask) == 0); 2612 KASSERT((pmap->pm_kernel_cpus & cpumask) == 0); 2613 2614 /* 2615 * mark the pmap in use by this processor. 
again we must 2616 * synchronize with TLB shootdown interrupts, so set the 2617 * state VALID first, then register us for shootdown events 2618 * on this pmap. 2619 */ 2620 2621 ci->ci_tlbstate = TLBSTATE_VALID; 2622 atomic_or_32(&pmap->pm_cpus, cpumask); 2623 atomic_or_32(&pmap->pm_kernel_cpus, cpumask); 2624 ci->ci_pmap = pmap; 2625 2626 /* 2627 * update tss. now that we have registered for invalidations 2628 * from other CPUs, we're good to load the page tables. 2629 */ 2630 #ifdef PAE 2631 pcb->pcb_cr3 = pmap_l3paddr; 2632 #else 2633 pcb->pcb_cr3 = pmap->pm_pdirpa; 2634 #endif 2635 #if defined(XEN) && defined(__x86_64__) 2636 /* kernel pmap always in cr3 and should never go in user cr3 */ 2637 if (pmap_pdirpa(pmap, 0) != pmap_pdirpa(pmap_kernel(), 0)) { 2638 /* 2639 * Map user space address in kernel space and load 2640 * user cr3 2641 */ 2642 int i, s; 2643 pd_entry_t *old_pgd, *new_pgd; 2644 paddr_t addr; 2645 s = splvm(); 2646 new_pgd = pmap->pm_pdir; 2647 old_pgd = pmap_kernel()->pm_pdir; 2648 addr = xpmap_ptom(pmap_pdirpa(pmap_kernel(), 0)); 2649 for (i = 0; i < PDIR_SLOT_PTE; 2650 i++, addr += sizeof(pd_entry_t)) { 2651 if ((new_pgd[i] & PG_V) || (old_pgd[i] & PG_V)) 2652 xpq_queue_pte_update(addr, new_pgd[i]); 2653 } 2654 xpq_flush_queue(); /* XXXtlb */ 2655 tlbflush(); 2656 xen_set_user_pgd(pmap_pdirpa(pmap, 0)); 2657 xen_current_user_pgd = pmap_pdirpa(pmap, 0); 2658 splx(s); 2659 } 2660 #else /* XEN && x86_64 */ 2661 #if defined(XEN) 2662 /* 2663 * clear APDP slot, in case it points to a page table that has 2664 * been freed 2665 */ 2666 if (*APDP_PDE) { 2667 int i; 2668 for (i = 0; i < PDP_SIZE; i++) { 2669 pmap_pte_set(&APDP_PDE[i], 0); 2670 #ifdef PAE 2671 /* clear shadow entry too */ 2672 pmap_pte_set(&APDP_PDE_SHADOW[i], 0); 2673 #endif 2674 } 2675 } 2676 /* lldt() does pmap_pte_flush() */ 2677 #else /* XEN */ 2678 #if defined(i386) 2679 ci->ci_tss.tss_ldt = pcb->pcb_ldt_sel; 2680 ci->ci_tss.tss_cr3 = pcb->pcb_cr3; 2681 #endif 2682 #endif /* XEN */ 2683 lldt(pcb->pcb_ldt_sel); 2684 #ifdef PAE 2685 { 2686 paddr_t l3_pd = xpmap_ptom_masked(pmap_l3paddr); 2687 int i; 2688 int s = splvm(); 2689 /* don't update the kernel L3 slot */ 2690 for (i = 0 ; i < PDP_SIZE - 1 ; i++, l3_pd += sizeof(pd_entry_t)) { 2691 xpq_queue_pte_update(l3_pd, 2692 xpmap_ptom(pmap->pm_pdirpa[i]) | PG_V); 2693 } 2694 tlbflush(); 2695 xpq_flush_queue(); 2696 splx(s); 2697 } 2698 #else /* PAE */ 2699 lcr3(pcb->pcb_cr3); 2700 #endif /* PAE */ 2701 #endif /* XEN && x86_64 */ 2702 2703 ci->ci_want_pmapload = 0; 2704 2705 /* 2706 * we're now running with the new pmap. drop the reference 2707 * to the old pmap. if we block, we need to go around again. 2708 */ 2709 2710 pmap_destroy(oldpmap); 2711 if (l->l_ncsw != ncsw) { 2712 goto retry; 2713 } 2714 2715 kpreempt_enable(); 2716 } 2717 2718 /* 2719 * pmap_deactivate: deactivate a process' pmap 2720 * 2721 * => must be called with kernel preemption disabled (high SPL is enough) 2722 */ 2723 2724 void 2725 pmap_deactivate(struct lwp *l) 2726 { 2727 struct pmap *pmap; 2728 struct cpu_info *ci; 2729 2730 KASSERT(kpreempt_disabled()); 2731 2732 if (l != curlwp) { 2733 return; 2734 } 2735 2736 /* 2737 * wait for pending TLB shootdowns to complete. necessary 2738 * because TLB shootdown state is per-CPU, and the LWP may 2739 * be coming off the CPU before it has a chance to call 2740 * pmap_update(). 
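 *
 * after that wait, essentially all this function does is move the CPU
 * from TLBSTATE_VALID to TLBSTATE_LAZY; the pmap stays loaded and the
 * CPU stays registered for kernel shootdowns.  The other half of the
 * handshake is pmap_reactivate() above, which must declare interest in
 * invalidations before checking whether a shootdown dropped us from
 * pm_cpus while we were lazy -- condensed (asserts omitted, see the
 * real code above):
 *
 *	ci->ci_tlbstate = TLBSTATE_VALID;
 *	if ((pmap->pm_cpus & ci->ci_cpumask) == 0) {
 *		// we were dropped while lazy: rejoin and treat our
 *		// TLB as stale (in the real code the flush is done
 *		// by pmap_load() when reactivation fails)
 *		atomic_or_32(&pmap->pm_cpus, ci->ci_cpumask);
 *		tlbflush();
 *	}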
2741 */ 2742 pmap_tlb_shootwait(); 2743 2744 ci = curcpu(); 2745 2746 if (ci->ci_want_pmapload) { 2747 KASSERT(vm_map_pmap(&l->l_proc->p_vmspace->vm_map) 2748 != pmap_kernel()); 2749 KASSERT(vm_map_pmap(&l->l_proc->p_vmspace->vm_map) 2750 != ci->ci_pmap || ci->ci_tlbstate != TLBSTATE_VALID); 2751 2752 /* 2753 * userspace has not been touched. 2754 * nothing to do here. 2755 */ 2756 2757 ci->ci_want_pmapload = 0; 2758 return; 2759 } 2760 2761 pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map); 2762 2763 if (pmap == pmap_kernel()) { 2764 return; 2765 } 2766 2767 #if defined(XEN) && defined(__x86_64__) 2768 KASSERT(pmap->pm_pdirpa == xen_current_user_pgd); 2769 #elif defined(PAE) 2770 KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(pmap_l3pd[0])); 2771 #elif !defined(XEN) || (defined(XEN) && defined(XEN3)) 2772 KASSERT(pmap->pm_pdirpa == pmap_pte2pa(rcr3())); 2773 #endif 2774 KASSERT(ci->ci_pmap == pmap); 2775 2776 /* 2777 * we aren't interested in TLB invalidations for this pmap, 2778 * at least for the time being. 2779 */ 2780 2781 KASSERT(ci->ci_tlbstate == TLBSTATE_VALID); 2782 ci->ci_tlbstate = TLBSTATE_LAZY; 2783 } 2784 2785 /* 2786 * end of lifecycle functions 2787 */ 2788 2789 /* 2790 * some misc. functions 2791 */ 2792 2793 static int 2794 pmap_pdes_invalid(vaddr_t va, pd_entry_t * const *pdes, pd_entry_t *lastpde) 2795 { 2796 int i; 2797 unsigned long index; 2798 pd_entry_t pde; 2799 2800 for (i = PTP_LEVELS; i > 1; i--) { 2801 index = pl_i(va, i); 2802 pde = pdes[i - 2][index]; 2803 if ((pde & PG_V) == 0) 2804 return i; 2805 } 2806 if (lastpde != NULL) 2807 *lastpde = pde; 2808 return 0; 2809 } 2810 2811 /* 2812 * pmap_extract: extract a PA for the given VA 2813 */ 2814 2815 bool 2816 pmap_extract(struct pmap *pmap, vaddr_t va, paddr_t *pap) 2817 { 2818 pt_entry_t *ptes, pte; 2819 pd_entry_t pde; 2820 pd_entry_t * const *pdes; 2821 struct pmap *pmap2; 2822 struct cpu_info *ci; 2823 vaddr_t pa; 2824 lwp_t *l; 2825 bool hard, rv; 2826 2827 rv = false; 2828 pa = 0; 2829 l = curlwp; 2830 2831 KPREEMPT_DISABLE(l); 2832 ci = l->l_cpu; 2833 if (__predict_true(!ci->ci_want_pmapload && ci->ci_pmap == pmap) || 2834 pmap == pmap_kernel()) { 2835 /* 2836 * no need to lock, because it's pmap_kernel() or our 2837 * own pmap and is active. if a user pmap, the caller 2838 * will hold the vm_map write/read locked and so prevent 2839 * entries from disappearing while we are here. ptps 2840 * can disappear via pmap_remove(), pmap_protect() and 2841 * pmap_collect(), but they are called with the vm_map 2842 * write locked. 2843 */ 2844 hard = false; 2845 ptes = PTE_BASE; 2846 pdes = normal_pdes; 2847 } else { 2848 /* we lose, do it the hard way. */ 2849 hard = true; 2850 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); 2851 } 2852 if (pmap_pdes_valid(va, pdes, &pde)) { 2853 pte = ptes[pl1_i(va)]; 2854 if (pde & PG_PS) { 2855 pa = (pde & PG_LGFRAME) | (va & (NBPD_L2 - 1)); 2856 rv = true; 2857 } else if (__predict_true((pte & PG_V) != 0)) { 2858 pa = pmap_pte2pa(pte) | (va & (NBPD_L1 - 1)); 2859 rv = true; 2860 } 2861 } 2862 if (__predict_false(hard)) { 2863 pmap_unmap_ptes(pmap, pmap2); 2864 } 2865 KPREEMPT_ENABLE(l); 2866 if (pap != NULL) { 2867 *pap = pa; 2868 } 2869 return rv; 2870 } 2871 2872 2873 /* 2874 * vtophys: virtual address to physical address. For use by 2875 * machine-dependent code only. 
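 *
 * both vtophys() below and pmap_extract() above recover the physical
 * address by combining the frame stored in the PTE (or, for a PG_PS
 * superpage, in the PDE itself) with the low-order bits of the VA:
 *
 *	pa = pmap_pte2pa(pte) | (va & (NBPD_L1 - 1));	// normal page
 *	pa = (pde & PG_LGFRAME) | (va & (NBPD_L2 - 1));	// PG_PS superpage
 *
 * vtophys() is just the convenience wrapper for the kernel pmap; it
 * returns 0 for an unmapped VA, so callers that could legitimately see
 * physical page 0 should use pmap_extract() directly.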
2876 */ 2877 2878 paddr_t 2879 vtophys(vaddr_t va) 2880 { 2881 paddr_t pa; 2882 2883 if (pmap_extract(pmap_kernel(), va, &pa) == true) 2884 return (pa); 2885 return (0); 2886 } 2887 2888 #ifdef XEN 2889 /* 2890 * pmap_extract_ma: extract a MA for the given VA 2891 */ 2892 2893 bool 2894 pmap_extract_ma(pmap, va, pap) 2895 struct pmap *pmap; 2896 vaddr_t va; 2897 paddr_t *pap; 2898 { 2899 pt_entry_t *ptes, pte; 2900 pd_entry_t pde; 2901 pd_entry_t * const *pdes; 2902 struct pmap *pmap2; 2903 2904 kpreempt_disable(); 2905 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); 2906 if (!pmap_pdes_valid(va, pdes, &pde)) { 2907 pmap_unmap_ptes(pmap, pmap2); 2908 kpreempt_enable(); 2909 return false; 2910 } 2911 2912 pte = ptes[pl1_i(va)]; 2913 pmap_unmap_ptes(pmap, pmap2); 2914 kpreempt_enable(); 2915 2916 if (__predict_true((pte & PG_V) != 0)) { 2917 if (pap != NULL) 2918 *pap = (pte & PG_FRAME) | (va & (NBPD_L1 - 1)); 2919 return true; 2920 } 2921 2922 return false; 2923 } 2924 2925 /* 2926 * vtomach: virtual address to machine address. For use by 2927 * machine-dependent code only. 2928 */ 2929 2930 paddr_t 2931 vtomach(vaddr_t va) 2932 { 2933 paddr_t pa; 2934 2935 if (pmap_extract_ma(pmap_kernel(), va, &pa) == true) 2936 return (pa); 2937 return (0); 2938 } 2939 2940 #endif /* XEN */ 2941 2942 2943 2944 /* 2945 * pmap_virtual_space: used during bootup [pmap_steal_memory] to 2946 * determine the bounds of the kernel virtual address space. 2947 */ 2948 2949 void 2950 pmap_virtual_space(vaddr_t *startp, vaddr_t *endp) 2951 { 2952 *startp = virtual_avail; 2953 *endp = virtual_end; 2954 } 2955 2956 /* 2957 * pmap_map: map a range of PAs into kvm. 2958 * 2959 * => used during crash dump 2960 * => XXX: pmap_map() should be phased out? 2961 */ 2962 2963 vaddr_t 2964 pmap_map(vaddr_t va, paddr_t spa, paddr_t epa, vm_prot_t prot) 2965 { 2966 while (spa < epa) { 2967 pmap_kenter_pa(va, spa, prot); 2968 va += PAGE_SIZE; 2969 spa += PAGE_SIZE; 2970 } 2971 pmap_update(pmap_kernel()); 2972 return va; 2973 } 2974 2975 /* 2976 * pmap_zero_page: zero a page 2977 */ 2978 2979 void 2980 pmap_zero_page(paddr_t pa) 2981 { 2982 pt_entry_t *zpte; 2983 void *zerova; 2984 int id; 2985 2986 kpreempt_disable(); 2987 id = cpu_number(); 2988 zpte = PTESLEW(zero_pte, id); 2989 zerova = VASLEW(zerop, id); 2990 2991 #ifdef DIAGNOSTIC 2992 if (*zpte) 2993 panic("pmap_zero_page: lock botch"); 2994 #endif 2995 2996 pmap_pte_set(zpte, pmap_pa2pte(pa) | PG_V | PG_RW | PG_M | PG_U | PG_k); 2997 pmap_pte_flush(); 2998 pmap_update_pg((vaddr_t)zerova); /* flush TLB */ 2999 3000 memset(zerova, 0, PAGE_SIZE); 3001 3002 #if defined(DIAGNOSTIC) || defined(XEN) 3003 pmap_pte_set(zpte, 0); /* zap ! */ 3004 pmap_pte_flush(); 3005 #endif 3006 kpreempt_enable(); 3007 } 3008 3009 /* 3010 * pmap_pageidlezero: the same, for the idle loop page zero'er. 3011 * Returns true if the page was zero'd, false if we aborted for 3012 * some reason. 3013 */ 3014 3015 bool 3016 pmap_pageidlezero(paddr_t pa) 3017 { 3018 pt_entry_t *zpte; 3019 void *zerova; 3020 bool rv; 3021 int id; 3022 3023 id = cpu_number(); 3024 zpte = PTESLEW(zero_pte, id); 3025 zerova = VASLEW(zerop, id); 3026 3027 KASSERT(cpu_feature & CPUID_SSE2); 3028 KASSERT(*zpte == 0); 3029 3030 pmap_pte_set(zpte, pmap_pa2pte(pa) | PG_V | PG_RW | PG_M | PG_U | PG_k); 3031 pmap_pte_flush(); 3032 pmap_update_pg((vaddr_t)zerova); /* flush TLB */ 3033 3034 rv = sse2_idlezero_page(zerova); 3035 3036 #if defined(DIAGNOSTIC) || defined(XEN) 3037 pmap_pte_set(zpte, 0); /* zap !
*/ 3038 pmap_pte_flush(); 3039 #endif 3040 3041 return rv; 3042 } 3043 3044 /* 3045 * pmap_copy_page: copy a page 3046 */ 3047 3048 void 3049 pmap_copy_page(paddr_t srcpa, paddr_t dstpa) 3050 { 3051 pt_entry_t *spte; 3052 pt_entry_t *dpte; 3053 void *csrcva; 3054 void *cdstva; 3055 int id; 3056 3057 kpreempt_disable(); 3058 id = cpu_number(); 3059 spte = PTESLEW(csrc_pte,id); 3060 dpte = PTESLEW(cdst_pte,id); 3061 csrcva = VASLEW(csrcp, id); 3062 cdstva = VASLEW(cdstp, id); 3063 3064 KASSERT(*spte == 0 && *dpte == 0); 3065 3066 pmap_pte_set(spte, pmap_pa2pte(srcpa) | PG_V | PG_RW | PG_U | PG_k); 3067 pmap_pte_set(dpte, 3068 pmap_pa2pte(dstpa) | PG_V | PG_RW | PG_M | PG_U | PG_k); 3069 pmap_pte_flush(); 3070 pmap_update_2pg((vaddr_t)csrcva, (vaddr_t)cdstva); 3071 3072 memcpy(cdstva, csrcva, PAGE_SIZE); 3073 3074 #if defined(DIAGNOSTIC) || defined(XEN) 3075 pmap_pte_set(spte, 0); 3076 pmap_pte_set(dpte, 0); 3077 pmap_pte_flush(); 3078 #endif 3079 kpreempt_enable(); 3080 } 3081 3082 static pt_entry_t * 3083 pmap_map_ptp(struct vm_page *ptp) 3084 { 3085 pt_entry_t *ptppte; 3086 void *ptpva; 3087 int id; 3088 3089 KASSERT(kpreempt_disabled()); 3090 3091 id = cpu_number(); 3092 ptppte = PTESLEW(ptp_pte, id); 3093 ptpva = VASLEW(ptpp, id); 3094 #if !defined(XEN) 3095 pmap_pte_set(ptppte, pmap_pa2pte(VM_PAGE_TO_PHYS(ptp)) | PG_V | PG_M | 3096 PG_RW | PG_U | PG_k); 3097 #else 3098 pmap_pte_set(ptppte, pmap_pa2pte(VM_PAGE_TO_PHYS(ptp)) | PG_V | PG_M | 3099 PG_U | PG_k); 3100 #endif 3101 pmap_pte_flush(); 3102 pmap_update_pg((vaddr_t)ptpva); 3103 3104 return (pt_entry_t *)ptpva; 3105 } 3106 3107 static void 3108 pmap_unmap_ptp(void) 3109 { 3110 #if defined(DIAGNOSTIC) || defined(XEN) 3111 pt_entry_t *pte; 3112 3113 KASSERT(kpreempt_disabled()); 3114 3115 pte = PTESLEW(ptp_pte, cpu_number()); 3116 if (*pte != 0) { 3117 pmap_pte_set(pte, 0); 3118 pmap_pte_flush(); 3119 } 3120 #endif 3121 } 3122 3123 static pt_entry_t * 3124 pmap_map_pte(struct pmap *pmap, struct vm_page *ptp, vaddr_t va) 3125 { 3126 3127 KASSERT(kpreempt_disabled()); 3128 if (pmap_is_curpmap(pmap)) { 3129 return &PTE_BASE[pl1_i(va)]; /* (k)vtopte */ 3130 } 3131 KASSERT(ptp != NULL); 3132 return pmap_map_ptp(ptp) + pl1_pi(va); 3133 } 3134 3135 static void 3136 pmap_unmap_pte(void) 3137 { 3138 3139 KASSERT(kpreempt_disabled()); 3140 3141 pmap_unmap_ptp(); 3142 } 3143 3144 /* 3145 * p m a p r e m o v e f u n c t i o n s 3146 * 3147 * functions that remove mappings 3148 */ 3149 3150 /* 3151 * pmap_remove_ptes: remove PTEs from a PTP 3152 * 3153 * => must have proper locking on pmap_master_lock 3154 * => caller must hold pmap's lock 3155 * => PTP must be mapped into KVA 3156 * => PTP should be null if pmap == pmap_kernel() 3157 * => must be called with kernel preemption disabled 3158 * => returns composite pte if at least one page should be shot down 3159 */ 3160 3161 static pt_entry_t 3162 pmap_remove_ptes(struct pmap *pmap, struct vm_page *ptp, vaddr_t ptpva, 3163 vaddr_t startva, vaddr_t endva, int flags, 3164 struct pv_entry **pv_tofree) 3165 { 3166 struct pv_entry *pve; 3167 pt_entry_t *pte = (pt_entry_t *) ptpva; 3168 pt_entry_t opte, xpte = 0; 3169 3170 KASSERT(pmap == pmap_kernel() || mutex_owned(&pmap->pm_lock)); 3171 KASSERT(kpreempt_disabled()); 3172 3173 /* 3174 * note that ptpva points to the PTE that maps startva. this may 3175 * or may not be the first PTE in the PTP. 
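 *
 * callers derive ptpva as &ptes[pl1_i(startva)], so advancing the pte
 * pointer by one each time startva moves forward by PAGE_SIZE keeps
 * the two in lock-step for the whole run:
 *
 *	pte = (pt_entry_t *)ptpva;
 *	for (; startva < endva; pte++, startva += PAGE_SIZE) {
 *		// pte now maps exactly the page at startva
 *	}
 *
 * the caller clips endva to the end of the current VA block first, so
 * the run never walks off the end of this PTP.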
3176 * 3177 * we loop through the PTP while there are still PTEs to look at 3178 * and the wire_count is greater than 1 (because we use the wire_count 3179 * to keep track of the number of real PTEs in the PTP). 3180 */ 3181 3182 for (/*null*/; startva < endva && (ptp == NULL || ptp->wire_count > 1) 3183 ; pte++, startva += PAGE_SIZE) { 3184 struct vm_page *pg; 3185 struct pmap_page *pp; 3186 3187 if (!pmap_valid_entry(*pte)) 3188 continue; /* VA not mapped */ 3189 if ((flags & PMAP_REMOVE_SKIPWIRED) && (*pte & PG_W)) { 3190 continue; 3191 } 3192 3193 /* atomically save the old PTE and zap! it */ 3194 opte = pmap_pte_testset(pte, 0); 3195 if (!pmap_valid_entry(opte)) { 3196 continue; 3197 } 3198 3199 pmap_exec_account(pmap, startva, opte, 0); 3200 pmap_stats_update_bypte(pmap, 0, opte); 3201 xpte |= opte; 3202 3203 if (ptp) { 3204 ptp->wire_count--; /* dropping a PTE */ 3205 /* Make sure that the PDE is flushed */ 3206 if (ptp->wire_count <= 1) 3207 xpte |= PG_U; 3208 } 3209 3210 /* 3211 * if we are not on a pv_head list we are done. 3212 */ 3213 3214 if ((opte & PG_PVLIST) == 0) { 3215 #if defined(DIAGNOSTIC) && !defined(DOM0OPS) 3216 if (PHYS_TO_VM_PAGE(pmap_pte2pa(opte)) != NULL) 3217 panic("pmap_remove_ptes: managed page without " 3218 "PG_PVLIST for 0x%lx", startva); 3219 #endif 3220 continue; 3221 } 3222 3223 pg = PHYS_TO_VM_PAGE(pmap_pte2pa(opte)); 3224 #ifdef DIAGNOSTIC 3225 if (pg == NULL) 3226 panic("pmap_remove_ptes: unmanaged page marked " 3227 "PG_PVLIST, va = 0x%lx, pa = 0x%lx", 3228 startva, (u_long)pmap_pte2pa(opte)); 3229 #endif 3230 3231 /* sync R/M bits */ 3232 pp = VM_PAGE_TO_PP(pg); 3233 pp_lock(pp); 3234 pp->pp_attrs |= opte; 3235 pve = pmap_remove_pv(pp, ptp, startva); 3236 pp_unlock(pp); 3237 3238 if (pve != NULL) { 3239 pve->pve_next = *pv_tofree; 3240 *pv_tofree = pve; 3241 } 3242 3243 /* end of "for" loop: time for next pte */ 3244 } 3245 3246 return xpte; 3247 } 3248 3249 3250 /* 3251 * pmap_remove_pte: remove a single PTE from a PTP 3252 * 3253 * => must have proper locking on pmap_master_lock 3254 * => caller must hold pmap's lock 3255 * => PTP must be mapped into KVA 3256 * => PTP should be null if pmap == pmap_kernel() 3257 * => returns true if we removed a mapping 3258 * => must be called with kernel preemption disabled 3259 */ 3260 3261 static bool 3262 pmap_remove_pte(struct pmap *pmap, struct vm_page *ptp, pt_entry_t *pte, 3263 vaddr_t va, int flags, struct pv_entry **pv_tofree) 3264 { 3265 pt_entry_t opte; 3266 struct pv_entry *pve; 3267 struct vm_page *pg; 3268 struct pmap_page *pp; 3269 3270 KASSERT(pmap == pmap_kernel() || mutex_owned(&pmap->pm_lock)); 3271 KASSERT(pmap == pmap_kernel() || kpreempt_disabled()); 3272 3273 if (!pmap_valid_entry(*pte)) 3274 return(false); /* VA not mapped */ 3275 if ((flags & PMAP_REMOVE_SKIPWIRED) && (*pte & PG_W)) { 3276 return(false); 3277 } 3278 3279 /* atomically save the old PTE and zap! it */ 3280 opte = pmap_pte_testset(pte, 0); 3281 if (!pmap_valid_entry(opte)) { 3282 return false; 3283 } 3284 3285 pmap_exec_account(pmap, va, opte, 0); 3286 pmap_stats_update_bypte(pmap, 0, opte); 3287 3288 if (opte & PG_U) 3289 pmap_tlb_shootdown(pmap, va, 0, opte); 3290 3291 if (ptp) { 3292 ptp->wire_count--; /* dropping a PTE */ 3293 /* Make sure that the PDE is flushed */ 3294 if ((ptp->wire_count <= 1) && !(opte & PG_U)) 3295 pmap_tlb_shootdown(pmap, va, 0, opte); 3296 } 3297 3298 /* 3299 * if we are not on a pv_head list we are done. 
3300 */ 3301 3302 if ((opte & PG_PVLIST) == 0) { 3303 #if defined(DIAGNOSTIC) && !defined(DOM0OPS) 3304 if (PHYS_TO_VM_PAGE(pmap_pte2pa(opte)) != NULL) 3305 panic("pmap_remove_pte: managed page without " 3306 "PG_PVLIST for 0x%lx", va); 3307 #endif 3308 return(true); 3309 } 3310 3311 pg = PHYS_TO_VM_PAGE(pmap_pte2pa(opte)); 3312 #ifdef DIAGNOSTIC 3313 if (pg == NULL) 3314 panic("pmap_remove_pte: unmanaged page marked " 3315 "PG_PVLIST, va = 0x%lx, pa = 0x%lx", va, 3316 (u_long)(pmap_pte2pa(opte))); 3317 #endif 3318 3319 /* sync R/M bits */ 3320 pp = VM_PAGE_TO_PP(pg); 3321 pp_lock(pp); 3322 pp->pp_attrs |= opte; 3323 pve = pmap_remove_pv(pp, ptp, va); 3324 pp_unlock(pp); 3325 3326 if (pve) { 3327 pve->pve_next = *pv_tofree; 3328 *pv_tofree = pve; 3329 } 3330 3331 return(true); 3332 } 3333 3334 /* 3335 * pmap_remove: top level mapping removal function 3336 * 3337 * => caller should not be holding any pmap locks 3338 */ 3339 3340 void 3341 pmap_remove(struct pmap *pmap, vaddr_t sva, vaddr_t eva) 3342 { 3343 pmap_do_remove(pmap, sva, eva, PMAP_REMOVE_ALL); 3344 } 3345 3346 /* 3347 * pmap_do_remove: mapping removal guts 3348 * 3349 * => caller should not be holding any pmap locks 3350 */ 3351 3352 static void 3353 pmap_do_remove(struct pmap *pmap, vaddr_t sva, vaddr_t eva, int flags) 3354 { 3355 pt_entry_t *ptes, xpte = 0; 3356 pd_entry_t pde; 3357 pd_entry_t * const *pdes; 3358 struct pv_entry *pv_tofree = NULL; 3359 bool result; 3360 paddr_t ptppa; 3361 vaddr_t blkendva, va = sva; 3362 struct vm_page *ptp; 3363 struct pmap *pmap2; 3364 3365 kpreempt_disable(); 3366 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); /* locks pmap */ 3367 3368 /* 3369 * removing one page? take shortcut function. 3370 */ 3371 3372 if (va + PAGE_SIZE == eva) { 3373 if (pmap_pdes_valid(va, pdes, &pde)) { 3374 3375 /* PA of the PTP */ 3376 ptppa = pmap_pte2pa(pde); 3377 3378 /* get PTP if non-kernel mapping */ 3379 if (pmap == pmap_kernel()) { 3380 /* we never free kernel PTPs */ 3381 ptp = NULL; 3382 } else { 3383 ptp = pmap_find_ptp(pmap, va, ptppa, 1); 3384 #ifdef DIAGNOSTIC 3385 if (ptp == NULL) 3386 panic("pmap_remove: unmanaged " 3387 "PTP detected"); 3388 #endif 3389 } 3390 3391 /* do it! */ 3392 result = pmap_remove_pte(pmap, ptp, 3393 &ptes[pl1_i(va)], va, flags, &pv_tofree); 3394 3395 /* 3396 * if mapping removed and the PTP is no longer 3397 * being used, free it! 3398 */ 3399 3400 if (result && ptp && ptp->wire_count <= 1) 3401 pmap_free_ptp(pmap, ptp, va, ptes, pdes); 3402 } 3403 } else for (/* null */ ; va < eva ; va = blkendva) { 3404 int lvl; 3405 3406 /* determine range of block */ 3407 blkendva = x86_round_pdr(va+1); 3408 if (blkendva > eva) 3409 blkendva = eva; 3410 3411 /* 3412 * XXXCDC: our PTE mappings should never be removed 3413 * with pmap_remove! if we allow this (and why would 3414 * we?) then we end up freeing the pmap's page 3415 * directory page (PDP) before we are finished using 3416 * it when we hit in in the recursive mapping. this 3417 * is BAD. 3418 * 3419 * long term solution is to move the PTEs out of user 3420 * address space. and into kernel address space (up 3421 * with APTE). then we can set VM_MAXUSER_ADDRESS to 3422 * be VM_MAX_ADDRESS. 3423 */ 3424 3425 if (pl_i(va, PTP_LEVELS) == PDIR_SLOT_PTE) 3426 /* XXXCDC: ugly hack to avoid freeing PDP here */ 3427 continue; 3428 3429 lvl = pmap_pdes_invalid(va, pdes, &pde); 3430 if (lvl != 0) { 3431 /* 3432 * skip a range corresponding to an invalid pde. 
3433 */ 3434 blkendva = (va & ptp_masks[lvl - 1]) + nbpd[lvl - 1]; 3435 continue; 3436 } 3437 3438 /* PA of the PTP */ 3439 ptppa = pmap_pte2pa(pde); 3440 3441 /* get PTP if non-kernel mapping */ 3442 if (pmap == pmap_kernel()) { 3443 /* we never free kernel PTPs */ 3444 ptp = NULL; 3445 } else { 3446 ptp = pmap_find_ptp(pmap, va, ptppa, 1); 3447 #ifdef DIAGNOSTIC 3448 if (ptp == NULL) 3449 panic("pmap_remove: unmanaged PTP " 3450 "detected"); 3451 #endif 3452 } 3453 xpte |= pmap_remove_ptes(pmap, ptp, 3454 (vaddr_t)&ptes[pl1_i(va)], va, blkendva, 3455 flags, &pv_tofree); 3456 3457 /* if PTP is no longer being used, free it! */ 3458 if (ptp && ptp->wire_count <= 1) { 3459 pmap_free_ptp(pmap, ptp, va, ptes, pdes); 3460 } 3461 if ((xpte & PG_U) != 0) 3462 pmap_tlb_shootdown(pmap, sva, eva, xpte); 3463 } 3464 pmap_unmap_ptes(pmap, pmap2); /* unlock pmap */ 3465 kpreempt_enable(); 3466 3467 /* Now we free unused PVs */ 3468 if (pv_tofree) 3469 pmap_free_pvs(pv_tofree); 3470 } 3471 3472 /* 3473 * pmap_sync_pv: clear pte bits and return the old value of the pte. 3474 * 3475 * => called with pp_lock held. (thus preemption disabled) 3476 * => issues tlb shootdowns if necessary. 3477 */ 3478 3479 static int 3480 pmap_sync_pv(struct pv_pte *pvpte, pt_entry_t expect, int clearbits, 3481 pt_entry_t *optep) 3482 { 3483 struct pmap *pmap; 3484 struct vm_page *ptp; 3485 vaddr_t va; 3486 pt_entry_t *ptep; 3487 pt_entry_t opte; 3488 pt_entry_t npte; 3489 bool need_shootdown; 3490 3491 ptp = pvpte->pte_ptp; 3492 va = pvpte->pte_va; 3493 KASSERT(ptp == NULL || ptp->uobject != NULL); 3494 KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset); 3495 pmap = ptp_to_pmap(ptp); 3496 3497 KASSERT((expect & ~(PG_FRAME | PG_V)) == 0); 3498 KASSERT((expect & PG_V) != 0); 3499 KASSERT(clearbits == ~0 || (clearbits & ~(PG_M | PG_U | PG_RW)) == 0); 3500 KASSERT(kpreempt_disabled()); 3501 3502 ptep = pmap_map_pte(pmap, ptp, va); 3503 do { 3504 opte = *ptep; 3505 KASSERT((opte & (PG_M | PG_U)) != PG_M); 3506 KASSERT((opte & (PG_U | PG_V)) != PG_U); 3507 KASSERT(opte == 0 || (opte & PG_V) != 0); 3508 if ((opte & (PG_FRAME | PG_V)) != expect) { 3509 3510 /* 3511 * we lost a race with a V->P operation like 3512 * pmap_remove(). wait for the competitor 3513 * reflecting pte bits into mp_attrs. 3514 * 3515 * issue a redundant TLB shootdown so that 3516 * we can wait for its completion. 3517 */ 3518 3519 pmap_unmap_pte(); 3520 if (clearbits != 0) { 3521 pmap_tlb_shootdown(pmap, va, 0, 3522 (pmap == pmap_kernel() ? PG_G : 0)); 3523 } 3524 return EAGAIN; 3525 } 3526 3527 /* 3528 * check if there's anything to do on this pte. 3529 */ 3530 3531 if ((opte & clearbits) == 0) { 3532 need_shootdown = false; 3533 break; 3534 } 3535 3536 /* 3537 * we need a shootdown if the pte is cached. (PG_U) 3538 * 3539 * ...unless we are clearing only the PG_RW bit and 3540 * it isn't cached as RW. (PG_M) 3541 */ 3542 3543 need_shootdown = (opte & PG_U) != 0 && 3544 !(clearbits == PG_RW && (opte & PG_M) == 0); 3545 3546 npte = opte & ~clearbits; 3547 3548 /* 3549 * if we need a shootdown anyway, clear PG_U and PG_M. 
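 *
 * the enclosing do/while is a standard compare-and-swap retry loop:
 * npte is recomputed from a fresh opte on every pass and only becomes
 * visible if the PTE still equals opte when pmap_pte_cas() runs.  The
 * bare idiom in stand-alone C11, for reference (clear_pte_bits() is an
 * illustrative name only):
 *
 *	#include <stdatomic.h>
 *	#include <stdint.h>
 *
 *	static uint64_t
 *	clear_pte_bits(_Atomic uint64_t *ptep, uint64_t bits)
 *	{
 *		uint64_t old = atomic_load(ptep);
 *
 *		// on failure the current value is reloaded into old
 *		// and the new value is recomputed from it
 *		while (!atomic_compare_exchange_weak(ptep, &old,
 *		    old & ~bits))
 *			continue;
 *		return old;	// value seen before the bits were cleared
 *	}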
3550 */ 3551 3552 if (need_shootdown) { 3553 npte &= ~(PG_U | PG_M); 3554 } 3555 KASSERT((npte & (PG_M | PG_U)) != PG_M); 3556 KASSERT((npte & (PG_U | PG_V)) != PG_U); 3557 KASSERT(npte == 0 || (opte & PG_V) != 0); 3558 } while (pmap_pte_cas(ptep, opte, npte) != opte); 3559 3560 if (need_shootdown) { 3561 pmap_tlb_shootdown(pmap, va, 0, opte); 3562 } 3563 pmap_unmap_pte(); 3564 3565 *optep = opte; 3566 return 0; 3567 } 3568 3569 /* 3570 * pmap_page_remove: remove a managed vm_page from all pmaps that map it 3571 * 3572 * => R/M bits are sync'd back to attrs 3573 */ 3574 3575 void 3576 pmap_page_remove(struct vm_page *pg) 3577 { 3578 struct pmap_page *pp; 3579 struct pv_pte *pvpte; 3580 struct pv_entry *killlist = NULL; 3581 struct vm_page *ptp; 3582 pt_entry_t expect; 3583 lwp_t *l; 3584 int count; 3585 3586 #ifdef DIAGNOSTIC 3587 int bank, off; 3588 3589 bank = vm_physseg_find(atop(VM_PAGE_TO_PHYS(pg)), &off); 3590 if (bank == -1) 3591 panic("pmap_page_remove: unmanaged page?"); 3592 #endif 3593 3594 l = curlwp; 3595 pp = VM_PAGE_TO_PP(pg); 3596 expect = pmap_pa2pte(VM_PAGE_TO_PHYS(pg)) | PG_V; 3597 count = SPINLOCK_BACKOFF_MIN; 3598 kpreempt_disable(); 3599 startover: 3600 pp_lock(pp); 3601 while ((pvpte = pv_pte_first(pp)) != NULL) { 3602 struct pmap *pmap; 3603 struct pv_entry *pve; 3604 pt_entry_t opte; 3605 vaddr_t va; 3606 int error; 3607 3608 /* 3609 * add a reference to the pmap before clearing the pte. 3610 * otherwise the pmap can disappear behind us. 3611 */ 3612 3613 ptp = pvpte->pte_ptp; 3614 pmap = ptp_to_pmap(ptp); 3615 if (ptp != NULL) { 3616 pmap_reference(pmap); 3617 } 3618 3619 error = pmap_sync_pv(pvpte, expect, ~0, &opte); 3620 if (error == EAGAIN) { 3621 int hold_count; 3622 pp_unlock(pp); 3623 KERNEL_UNLOCK_ALL(curlwp, &hold_count); 3624 if (ptp != NULL) { 3625 pmap_destroy(pmap); 3626 } 3627 SPINLOCK_BACKOFF(count); 3628 KERNEL_LOCK(hold_count, curlwp); 3629 goto startover; 3630 } 3631 3632 pp->pp_attrs |= opte; 3633 va = pvpte->pte_va; 3634 pve = pmap_remove_pv(pp, ptp, va); 3635 pp_unlock(pp); 3636 3637 /* update the PTP reference count. free if last reference. */ 3638 if (ptp != NULL) { 3639 struct pmap *pmap2; 3640 pt_entry_t *ptes; 3641 pd_entry_t * const *pdes; 3642 3643 KASSERT(pmap != pmap_kernel()); 3644 3645 pmap_tlb_shootwait(); 3646 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); 3647 pmap_stats_update_bypte(pmap, 0, opte); 3648 ptp->wire_count--; 3649 if (ptp->wire_count <= 1) { 3650 pmap_free_ptp(pmap, ptp, va, ptes, pdes); 3651 } 3652 pmap_unmap_ptes(pmap, pmap2); 3653 pmap_destroy(pmap); 3654 } else { 3655 KASSERT(pmap == pmap_kernel()); 3656 pmap_stats_update_bypte(pmap, 0, opte); 3657 } 3658 3659 if (pve != NULL) { 3660 pve->pve_next = killlist; /* mark it for death */ 3661 killlist = pve; 3662 } 3663 pp_lock(pp); 3664 } 3665 pp_unlock(pp); 3666 kpreempt_enable(); 3667 3668 /* Now free unused pvs. */ 3669 pmap_free_pvs(killlist); 3670 } 3671 3672 /* 3673 * p m a p a t t r i b u t e f u n c t i o n s 3674 * functions that test/change managed page's attributes 3675 * since a page can be mapped multiple times we must check each PTE that 3676 * maps it by going down the pv lists. 
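 *
 * concretely, the authoritative answer is the OR of the cached
 * pp_attrs bits and whatever referenced/modified state is still
 * sitting in each mapping's PTE.  Stripped of the locking and the
 * early-exit details, pmap_test_attrs() below is essentially:
 *
 *	pp = VM_PAGE_TO_PP(pg);
 *	if ((pp->pp_attrs & testbits) != 0)
 *		return true;			// cached bits already say yes
 *	for (pvpte = pv_pte_first(pp); pvpte != NULL;
 *	    pvpte = pv_pte_next(pp, pvpte)) {
 *		if (pmap_sync_pv(pvpte, expect, 0, &opte) == 0)
 *			pp->pp_attrs |= opte;	// fold in this PTE's bits
 *	}
 *	return (pp->pp_attrs & testbits) != 0;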
3677 */ 3678 3679 /* 3680 * pmap_test_attrs: test a page's attributes 3681 */ 3682 3683 bool 3684 pmap_test_attrs(struct vm_page *pg, unsigned testbits) 3685 { 3686 struct pmap_page *pp; 3687 struct pv_pte *pvpte; 3688 pt_entry_t expect; 3689 u_int result; 3690 3691 #if DIAGNOSTIC 3692 int bank, off; 3693 3694 bank = vm_physseg_find(atop(VM_PAGE_TO_PHYS(pg)), &off); 3695 if (bank == -1) 3696 panic("pmap_test_attrs: unmanaged page?"); 3697 #endif 3698 3699 pp = VM_PAGE_TO_PP(pg); 3700 if ((pp->pp_attrs & testbits) != 0) { 3701 return true; 3702 } 3703 expect = pmap_pa2pte(VM_PAGE_TO_PHYS(pg)) | PG_V; 3704 pp_lock(pp); 3705 for (pvpte = pv_pte_first(pp); pvpte; pvpte = pv_pte_next(pp, pvpte)) { 3706 pt_entry_t opte; 3707 int error; 3708 3709 if ((pp->pp_attrs & testbits) != 0) { 3710 break; 3711 } 3712 error = pmap_sync_pv(pvpte, expect, 0, &opte); 3713 if (error == 0) { 3714 pp->pp_attrs |= opte; 3715 } 3716 } 3717 result = pp->pp_attrs & testbits; 3718 pp_unlock(pp); 3719 3720 /* 3721 * note that we will exit the for loop with a non-null pvpte if 3722 * we have found the bits we are testing for. 3723 */ 3724 3725 return result != 0; 3726 } 3727 3728 /* 3729 * pmap_clear_attrs: clear the specified attribute for a page. 3730 * 3731 * => we return true if we cleared one of the bits we were asked to 3732 */ 3733 3734 bool 3735 pmap_clear_attrs(struct vm_page *pg, unsigned clearbits) 3736 { 3737 struct pmap_page *pp; 3738 struct pv_pte *pvpte; 3739 u_int result; 3740 pt_entry_t expect; 3741 int count; 3742 #ifdef DIAGNOSTIC 3743 int bank, off; 3744 3745 bank = vm_physseg_find(atop(VM_PAGE_TO_PHYS(pg)), &off); 3746 if (bank == -1) 3747 panic("pmap_clear_attrs: unmanaged page?"); 3748 #endif 3749 3750 pp = VM_PAGE_TO_PP(pg); 3751 expect = pmap_pa2pte(VM_PAGE_TO_PHYS(pg)) | PG_V; 3752 count = SPINLOCK_BACKOFF_MIN; 3753 kpreempt_disable(); 3754 startover: 3755 pp_lock(pp); 3756 for (pvpte = pv_pte_first(pp); pvpte; pvpte = pv_pte_next(pp, pvpte)) { 3757 pt_entry_t opte; 3758 int error; 3759 3760 error = pmap_sync_pv(pvpte, expect, clearbits, &opte); 3761 if (error == EAGAIN) { 3762 int hold_count; 3763 pp_unlock(pp); 3764 KERNEL_UNLOCK_ALL(curlwp, &hold_count); 3765 SPINLOCK_BACKOFF(count); 3766 KERNEL_LOCK(hold_count, curlwp); 3767 goto startover; 3768 } 3769 pp->pp_attrs |= opte; 3770 } 3771 result = pp->pp_attrs & clearbits; 3772 pp->pp_attrs &= ~clearbits; 3773 pp_unlock(pp); 3774 kpreempt_enable(); 3775 3776 return result != 0; 3777 } 3778 3779 3780 /* 3781 * p m a p p r o t e c t i o n f u n c t i o n s 3782 */ 3783 3784 /* 3785 * pmap_page_protect: change the protection of all recorded mappings 3786 * of a managed page 3787 * 3788 * => NOTE: this is an inline function in pmap.h 3789 */ 3790 3791 /* see pmap.h */ 3792 3793 /* 3794 * pmap_protect: set the protection of the pages in a pmap 3795 * 3796 * => NOTE: this is an inline function in pmap.h 3797 */ 3798 3799 /* see pmap.h */ 3800 3801 /* 3802 * pmap_write_protect: write-protect pages in a pmap 3803 */ 3804 3805 void 3806 pmap_write_protect(struct pmap *pmap, vaddr_t sva, vaddr_t eva, vm_prot_t prot) 3807 { 3808 pt_entry_t *ptes, *epte; 3809 pt_entry_t *spte; 3810 pd_entry_t * const *pdes; 3811 vaddr_t blockend, va; 3812 pt_entry_t opte; 3813 struct pmap *pmap2; 3814 3815 KASSERT(curlwp->l_md.md_gc_pmap != pmap); 3816 3817 kpreempt_disable(); 3818 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); /* locks pmap */ 3819 3820 /* should be ok, but just in case ...
*/ 3821 sva &= PG_FRAME; 3822 eva &= PG_FRAME; 3823 3824 for (va = sva ; va < eva ; va = blockend) { 3825 3826 blockend = (va & L2_FRAME) + NBPD_L2; 3827 if (blockend > eva) 3828 blockend = eva; 3829 3830 /* 3831 * XXXCDC: our PTE mappings should never be write-protected! 3832 * 3833 * long term solution is to move the PTEs out of user 3834 * address space. and into kernel address space (up 3835 * with APTE). then we can set VM_MAXUSER_ADDRESS to 3836 * be VM_MAX_ADDRESS. 3837 */ 3838 3839 /* XXXCDC: ugly hack to avoid freeing PDP here */ 3840 if (pl_i(va, PTP_LEVELS) == PDIR_SLOT_PTE) 3841 continue; 3842 3843 /* empty block? */ 3844 if (!pmap_pdes_valid(va, pdes, NULL)) 3845 continue; 3846 3847 #ifdef DIAGNOSTIC 3848 if (va >= VM_MAXUSER_ADDRESS && 3849 va < VM_MAX_ADDRESS) 3850 panic("pmap_write_protect: PTE space"); 3851 #endif 3852 3853 spte = &ptes[pl1_i(va)]; 3854 epte = &ptes[pl1_i(blockend)]; 3855 3856 for (/*null */; spte < epte ; spte++) { 3857 pt_entry_t npte; 3858 3859 do { 3860 opte = *spte; 3861 if ((~opte & (PG_RW | PG_V)) != 0) { 3862 goto next; 3863 } 3864 npte = opte & ~PG_RW; 3865 } while (pmap_pte_cas(spte, opte, npte) != opte); 3866 if ((opte & PG_M) != 0) { 3867 vaddr_t tva; 3868 3869 tva = x86_ptob(spte - ptes); 3870 pmap_tlb_shootdown(pmap, tva, 0, opte); 3871 } 3872 next:; 3873 } 3874 } 3875 3876 pmap_unmap_ptes(pmap, pmap2); /* unlocks pmap */ 3877 kpreempt_enable(); 3878 } 3879 3880 /* 3881 * end of protection functions 3882 */ 3883 3884 /* 3885 * pmap_unwire: clear the wired bit in the PTE 3886 * 3887 * => mapping should already be in map 3888 */ 3889 3890 void 3891 pmap_unwire(struct pmap *pmap, vaddr_t va) 3892 { 3893 pt_entry_t *ptes; 3894 pd_entry_t * const *pdes; 3895 struct pmap *pmap2; 3896 3897 kpreempt_disable(); 3898 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); /* locks pmap */ 3899 3900 if (pmap_pdes_valid(va, pdes, NULL)) { 3901 pt_entry_t *ptep = &ptes[pl1_i(va)]; 3902 pt_entry_t opte = *ptep; 3903 3904 #ifdef DIAGNOSTIC 3905 if (!pmap_valid_entry(opte)) 3906 panic("pmap_unwire: invalid (unmapped) va 0x%lx", va); 3907 #endif 3908 if ((opte & PG_W) != 0) { 3909 pt_entry_t npte = opte & ~PG_W; 3910 3911 opte = pmap_pte_testset(ptep, npte); 3912 pmap_stats_update_bypte(pmap, npte, opte); 3913 } 3914 #ifdef DIAGNOSTIC 3915 else { 3916 printf("pmap_unwire: wiring for pmap %p va 0x%lx " 3917 "didn't change!\n", pmap, va); 3918 } 3919 #endif 3920 pmap_unmap_ptes(pmap, pmap2); /* unlocks map */ 3921 } 3922 #ifdef DIAGNOSTIC 3923 else { 3924 panic("pmap_unwire: invalid PDE"); 3925 } 3926 #endif 3927 kpreempt_enable(); 3928 } 3929 3930 /* 3931 * pmap_collect: free resources held by a pmap 3932 * 3933 * => optional function. 3934 * => called when a process is swapped out to free memory. 3935 */ 3936 3937 void 3938 pmap_collect(struct pmap *pmap) 3939 { 3940 /* 3941 * free all of the pt pages by removing the physical mappings 3942 * for its entire address space. 3943 */ 3944 3945 pmap_do_remove(pmap, VM_MIN_ADDRESS, VM_MAXUSER_ADDRESS, 3946 PMAP_REMOVE_SKIPWIRED); 3947 } 3948 3949 /* 3950 * pmap_copy: copy mappings from one pmap to another 3951 * 3952 * => optional function 3953 * void pmap_copy(dst_pmap, src_pmap, dst_addr, len, src_addr) 3954 */ 3955 3956 /* 3957 * defined as macro in pmap.h 3958 */ 3959 3960 /* 3961 * pmap_enter: enter a mapping into a pmap 3962 * 3963 * => must be done "now" ... 
no lazy-evaluation 3964 * => we set pmap => pv_head locking 3965 */ 3966 #ifdef XEN 3967 int 3968 pmap_enter_ma(struct pmap *pmap, vaddr_t va, paddr_t ma, paddr_t pa, 3969 vm_prot_t prot, int flags, int domid) 3970 { 3971 #else /* XEN */ 3972 int 3973 pmap_enter(struct pmap *pmap, vaddr_t va, paddr_t pa, vm_prot_t prot, 3974 int flags) 3975 { 3976 paddr_t ma = pa; 3977 #endif /* XEN */ 3978 pt_entry_t *ptes, opte, npte; 3979 pt_entry_t *ptep; 3980 pd_entry_t * const *pdes; 3981 struct vm_page *ptp, *pg; 3982 struct pmap_page *new_pp; 3983 struct pmap_page *old_pp; 3984 struct pv_entry *old_pve = NULL; 3985 struct pv_entry *new_pve; 3986 struct pv_entry *new_pve2; 3987 int error; 3988 bool wired = (flags & PMAP_WIRED) != 0; 3989 struct pmap *pmap2; 3990 3991 KASSERT(pmap_initialized); 3992 KASSERT(curlwp->l_md.md_gc_pmap != pmap); 3993 3994 #ifdef DIAGNOSTIC 3995 /* sanity check: totally out of range? */ 3996 if (va >= VM_MAX_KERNEL_ADDRESS) 3997 panic("pmap_enter: too big"); 3998 3999 if (va == (vaddr_t) PDP_BASE || va == (vaddr_t) APDP_BASE) 4000 panic("pmap_enter: trying to map over PDP/APDP!"); 4001 4002 /* sanity check: kernel PTPs should already have been pre-allocated */ 4003 if (va >= VM_MIN_KERNEL_ADDRESS && 4004 !pmap_valid_entry(pmap->pm_pdir[pl_i(va, PTP_LEVELS)])) 4005 panic("pmap_enter: missing kernel PTP for va %lx!", va); 4006 #endif /* DIAGNOSTIC */ 4007 #ifdef XEN 4008 KASSERT(domid == DOMID_SELF || pa == 0); 4009 #endif /* XEN */ 4010 4011 npte = ma | protection_codes[prot] | PG_V; 4012 if (wired) 4013 npte |= PG_W; 4014 if (va < VM_MAXUSER_ADDRESS) 4015 npte |= PG_u; 4016 else if (va < VM_MAX_ADDRESS) 4017 npte |= (PG_u | PG_RW); /* XXXCDC: no longer needed? */ 4018 else 4019 npte |= PG_k; 4020 if (pmap == pmap_kernel()) 4021 npte |= pmap_pg_g; 4022 if (flags & VM_PROT_ALL) { 4023 npte |= PG_U; 4024 if (flags & VM_PROT_WRITE) { 4025 KASSERT((npte & PG_RW) != 0); 4026 npte |= PG_M; 4027 } 4028 } 4029 4030 #ifdef XEN 4031 if (domid != DOMID_SELF) 4032 pg = NULL; 4033 else 4034 #endif 4035 pg = PHYS_TO_VM_PAGE(pa); 4036 if (pg != NULL) { 4037 /* This is a managed page */ 4038 npte |= PG_PVLIST; 4039 new_pp = VM_PAGE_TO_PP(pg); 4040 } else { 4041 new_pp = NULL; 4042 } 4043 4044 /* get pves. */ 4045 new_pve = pool_cache_get(&pmap_pv_cache, PR_NOWAIT); 4046 new_pve2 = pool_cache_get(&pmap_pv_cache, PR_NOWAIT); 4047 if (new_pve == NULL || new_pve2 == NULL) { 4048 if (flags & PMAP_CANFAIL) { 4049 error = ENOMEM; 4050 goto out2; 4051 } 4052 panic("pmap_enter: pve allocation failed"); 4053 } 4054 4055 kpreempt_disable(); 4056 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); /* locks pmap */ 4057 if (pmap == pmap_kernel()) { 4058 ptp = NULL; 4059 } else { 4060 ptp = pmap_get_ptp(pmap, va, pdes); 4061 if (ptp == NULL) { 4062 pmap_unmap_ptes(pmap, pmap2); 4063 if (flags & PMAP_CANFAIL) { 4064 error = ENOMEM; 4065 goto out; 4066 } 4067 panic("pmap_enter: get ptp failed"); 4068 } 4069 } 4070 4071 /* 4072 * update the pte. 4073 */ 4074 4075 ptep = &ptes[pl1_i(va)]; 4076 do { 4077 opte = *ptep; 4078 4079 /* 4080 * if the same page, inherit PG_U and PG_M. 
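 *
 * ((opte ^ npte) & (PG_FRAME | PG_V)) == 0 is the "same page" test:
 * xor leaves a bit set only where the two entries differ, and since
 * npte always carries PG_V here, the test passes exactly when the old
 * PTE was valid and mapped the same frame.  With illustrative values
 * for a frame at physical 0x1000:
 *
 *	opte = 0x1000 | PG_V | PG_U;	// old: valid, referenced
 *	npte = 0x1000 | PG_V | PG_RW;	// new: valid, writable
 *	// opte ^ npte == PG_U | PG_RW, which the (PG_FRAME | PG_V)
 *	// mask discards -> same page, so PG_U/PG_M are carried over
 *	// below rather than being lost across the re-enter.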
		 */
		if (((opte ^ npte) & (PG_FRAME | PG_V)) == 0) {
			npte |= opte & (PG_U | PG_M);
		}
#if defined(XEN)
		if (domid != DOMID_SELF) {
			/* pmap_pte_cas with error handling */
			int s = splvm();
			if (opte != *ptep) {
				splx(s);
				continue;
			}
			error = xpq_update_foreign(
			    vtomach((vaddr_t)ptep), npte, domid);
			splx(s);
			if (error) {
				if (ptp != NULL && ptp->wire_count <= 1) {
					pmap_free_ptp(pmap, ptp, va, ptes, pdes);
				}
				pmap_unmap_ptes(pmap, pmap2);
				goto out;
			}
			break;
		}
#endif /* defined(XEN) */
	} while (pmap_pte_cas(ptep, opte, npte) != opte);

	/*
	 * update statistics and PTP's reference count.
	 */

	pmap_stats_update_bypte(pmap, npte, opte);
	if (ptp != NULL && !pmap_valid_entry(opte)) {
		ptp->wire_count++;
	}
	KASSERT(ptp == NULL || ptp->wire_count > 1);

	/*
	 * if the same page, we can skip pv_entry handling.
	 */

	if (((opte ^ npte) & (PG_FRAME | PG_V)) == 0) {
		KASSERT(((opte ^ npte) & PG_PVLIST) == 0);
		goto same_pa;
	}

	/*
	 * if old page is managed, remove pv_entry from its list.
	 */

	if ((~opte & (PG_V | PG_PVLIST)) == 0) {
		pg = PHYS_TO_VM_PAGE(pmap_pte2pa(opte));
#ifdef DIAGNOSTIC
		if (pg == NULL)
			panic("pmap_enter: PG_PVLIST mapping with "
			    "unmanaged page "
			    "pa = 0x%" PRIx64 " (0x%" PRIx64 ")",
			    (int64_t)pa, (int64_t)atop(pa));
#endif
		old_pp = VM_PAGE_TO_PP(pg);

		pp_lock(old_pp);
		old_pve = pmap_remove_pv(old_pp, ptp, va);
		old_pp->pp_attrs |= opte;
		pp_unlock(old_pp);
	}

	/*
	 * if new page is managed, insert pv_entry into its list.
	 */

	if (new_pp) {
		pp_lock(new_pp);
		new_pve = pmap_enter_pv(new_pp, new_pve, &new_pve2, ptp, va);
		pp_unlock(new_pp);
	}

same_pa:
	pmap_unmap_ptes(pmap, pmap2);

	/*
	 * shootdown tlb if necessary.
	 */

	if ((~opte & (PG_V | PG_U)) == 0 &&
	    ((opte ^ npte) & (PG_FRAME | PG_RW)) != 0) {
		pmap_tlb_shootdown(pmap, va, 0, opte);
	}

	error = 0;
out:
	kpreempt_enable();
out2:
	if (old_pve != NULL) {
		pool_cache_put(&pmap_pv_cache, old_pve);
	}
	if (new_pve != NULL) {
		pool_cache_put(&pmap_pv_cache, new_pve);
	}
	if (new_pve2 != NULL) {
		pool_cache_put(&pmap_pv_cache, new_pve2);
	}

	return error;
}

#ifdef XEN
int
pmap_enter(struct pmap *pmap, vaddr_t va, paddr_t pa, vm_prot_t prot, int flags)
{
	paddr_t ma;

	if (__predict_false(pa < pmap_pa_start || pmap_pa_end <= pa)) {
		ma = pa;	/* XXX hack */
	} else {
		ma = xpmap_ptom(pa);
	}

	return pmap_enter_ma(pmap, va, ma, pa, prot, flags, DOMID_SELF);
}
#endif /* XEN */

static bool
pmap_get_physpage(vaddr_t va, int level, paddr_t *paddrp)
{
	struct vm_page *ptp;
	struct pmap *kpm = pmap_kernel();

	if (uvm.page_init_done == false) {
		/*
		 * we're growing the kernel pmap early (from
		 * uvm_pageboot_alloc()). this case must be
		 * handled a little differently.
		 */

		if (uvm_page_physget(paddrp) == false)
			panic("pmap_get_physpage: out of memory");
		kpreempt_disable();
		pmap_pte_set(early_zero_pte,
		    pmap_pa2pte(*paddrp) | PG_V | PG_RW | PG_k);
		pmap_pte_flush();
		pmap_update_pg((vaddr_t)early_zerop);
		memset(early_zerop, 0, PAGE_SIZE);
#if defined(DIAGNOSTIC) || defined(XEN)
		pmap_pte_set(early_zero_pte, 0);
		pmap_pte_flush();
#endif /* defined(DIAGNOSTIC) || defined(XEN) */
		kpreempt_enable();
	} else {
		/* XXX */
		PMAP_SUBOBJ_LOCK(kpm, level - 1);
		ptp = uvm_pagealloc(&kpm->pm_obj[level - 1],
		    ptp_va2o(va, level), NULL,
		    UVM_PGA_USERESERVE|UVM_PGA_ZERO);
		PMAP_SUBOBJ_UNLOCK(kpm, level - 1);
		if (ptp == NULL)
			panic("pmap_get_physpage: out of memory");
		ptp->flags &= ~PG_BUSY;
		ptp->wire_count = 1;
		*paddrp = VM_PAGE_TO_PHYS(ptp);
	}
	pmap_stats_update(kpm, 1, 0);
	return true;
}

/*
 * Allocate the amount of specified ptps for a ptp level, and populate
 * all levels below accordingly, mapping virtual addresses starting at
 * kva.
 *
 * Used by pmap_growkernel.
 */
static void
pmap_alloc_level(pd_entry_t * const *pdes, vaddr_t kva, int lvl,
    long *needed_ptps)
{
	unsigned long i;
	vaddr_t va;
	paddr_t pa;
	unsigned long index, endindex;
	int level;
	pd_entry_t *pdep;
#ifdef XEN
	int s = splvm();	/* protect xpq_* */
#endif

	for (level = lvl; level > 1; level--) {
		if (level == PTP_LEVELS)
			pdep = pmap_kernel()->pm_pdir;
		else
			pdep = pdes[level - 2];
		va = kva;
		index = pl_i_roundup(kva, level);
		endindex = index + needed_ptps[level - 1] - 1;

		for (i = index; i <= endindex; i++) {
			KASSERT(!pmap_valid_entry(pdep[i]));
			pmap_get_physpage(va, level - 1, &pa);
#ifdef XEN
			xpq_queue_pte_update((level == PTP_LEVELS) ?
			    xpmap_ptom(pmap_pdirpa(pmap_kernel(), i)) :
			    xpmap_ptetomach(&pdep[i]),
			    pmap_pa2pte(pa) | PG_k | PG_V | PG_RW);
#ifdef PAE
			if (level == PTP_LEVELS && i > L2_SLOT_KERN) {
				/* update real kernel PD too */
				xpq_queue_pte_update(
				    xpmap_ptetomach(&pmap_kl2pd[l2tol2(i)]),
				    pmap_pa2pte(pa) | PG_k | PG_V | PG_RW);
			}
#endif
#else /* XEN */
			pdep[i] = pa | PG_RW | PG_V;
#endif /* XEN */
			KASSERT(level != PTP_LEVELS || nkptp[level - 1] +
			    pl_i(VM_MIN_KERNEL_ADDRESS, level) == i);
			nkptp[level - 1]++;
			va += nbpd[level - 1];
		}
		pmap_pte_flush();
	}
#ifdef XEN
	splx(s);
#endif
}

/*
 * pmap_growkernel: increase usage of KVM space
 *
 * => we allocate new PTPs for the kernel and install them in all
 *	the pmaps on the system.
 */

vaddr_t
pmap_growkernel(vaddr_t maxkvaddr)
{
	struct pmap *kpm = pmap_kernel();
#if !defined(XEN) || !defined(__x86_64__)
	struct pmap *pm;
#endif
	int s, i;
	long needed_kptp[PTP_LEVELS], target_nptp, old;
	bool invalidate = false;

	s = splvm();	/* to be safe */
	mutex_enter(&kpm->pm_lock);

	if (maxkvaddr <= pmap_maxkvaddr) {
		mutex_exit(&kpm->pm_lock);
		splx(s);
		return pmap_maxkvaddr;
	}

	maxkvaddr = x86_round_pdr(maxkvaddr);
	old = nkptp[PTP_LEVELS - 1];
	/*
	 * This loop could be optimized more, but pmap_growkernel()
	 * is called infrequently.
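	 *
	 * (Descriptive note added by the editor; this simply restates
	 * the computation below.)  For each level i, counting PTEs as
	 * level 1, we work out how many PTPs the level above must
	 * provide to cover the new end of kernel VA and record only
	 * the shortfall:
	 *
	 *	needed_kptp[i] = pl_i_roundup(maxkvaddr, i + 1) -
	 *	    pl_i_roundup(VM_MIN_KERNEL_ADDRESS, i + 1) - nkptp[i];
	 *
	 * Only that difference is handed to pmap_alloc_level().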
	 */
	for (i = PTP_LEVELS - 1; i >= 1; i--) {
		target_nptp = pl_i_roundup(maxkvaddr, i + 1) -
		    pl_i_roundup(VM_MIN_KERNEL_ADDRESS, i + 1);
		/*
		 * XXX only need to check toplevel.
		 */
		if (target_nptp > nkptpmax[i])
			panic("out of KVA space");
		KASSERT(target_nptp >= nkptp[i]);
		needed_kptp[i] = target_nptp - nkptp[i];
	}

	pmap_alloc_level(normal_pdes, pmap_maxkvaddr, PTP_LEVELS, needed_kptp);

	/*
	 * If the number of top level entries changed, update all
	 * pmaps.
	 */
	if (needed_kptp[PTP_LEVELS - 1] != 0) {
#ifdef XEN
#ifdef __x86_64__
		/* nothing, kernel entries are never entered in user pmap */
#else /* __x86_64__ */
		mutex_enter(&pmaps_lock);
		LIST_FOREACH(pm, &pmaps, pm_list) {
			int pdkidx;
			for (pdkidx = PDIR_SLOT_KERN + old;
			    pdkidx < PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1];
			    pdkidx++) {
				xpq_queue_pte_update(
				    xpmap_ptom(pmap_pdirpa(pm, pdkidx)),
				    kpm->pm_pdir[pdkidx]);
			}
			xpq_flush_queue();
		}
		mutex_exit(&pmaps_lock);
#endif /* __x86_64__ */
#else /* XEN */
		unsigned newpdes;
		newpdes = nkptp[PTP_LEVELS - 1] - old;
		mutex_enter(&pmaps_lock);
		LIST_FOREACH(pm, &pmaps, pm_list) {
			memcpy(&pm->pm_pdir[PDIR_SLOT_KERN + old],
			    &kpm->pm_pdir[PDIR_SLOT_KERN + old],
			    newpdes * sizeof (pd_entry_t));
		}
		mutex_exit(&pmaps_lock);
#endif
		invalidate = true;
	}
	pmap_maxkvaddr = maxkvaddr;
	mutex_exit(&kpm->pm_lock);
	splx(s);

	if (invalidate) {
		/* Invalidate the PDP cache. */
		pool_cache_invalidate(&pmap_pdp_cache);
	}

	return maxkvaddr;
}

#ifdef DEBUG
void pmap_dump(struct pmap *, vaddr_t, vaddr_t);

/*
 * pmap_dump: dump all the mappings from a pmap
 *
 * => caller should not be holding any pmap locks
 */

void
pmap_dump(struct pmap *pmap, vaddr_t sva, vaddr_t eva)
{
	pt_entry_t *ptes, *pte;
	pd_entry_t * const *pdes;
	struct pmap *pmap2;
	vaddr_t blkendva;

	/*
	 * if end is out of range truncate.
	 * if (end == start) update to max.
	 */

	if (eva > VM_MAXUSER_ADDRESS || eva <= sva)
		eva = VM_MAXUSER_ADDRESS;

	/*
	 * we lock in the pmap => pv_head direction
	 */

	kpreempt_disable();
	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);	/* locks pmap */

	/*
	 * dumping a range of pages: we dump in PTP sized blocks (4MB)
	 */

	for (/* null */ ; sva < eva ; sva = blkendva) {

		/* determine range of block */
		blkendva = x86_round_pdr(sva+1);
		if (blkendva > eva)
			blkendva = eva;

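		/*
		 * (Descriptive note added by the editor.)
		 * pmap_pdes_valid() checks every directory level above
		 * the PTEs, so an unmapped block is skipped here without
		 * ever touching the absent PTE page behind it.
		 */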
		/* valid block? */
		if (!pmap_pdes_valid(sva, pdes, NULL))
			continue;

		pte = &ptes[pl1_i(sva)];
		for (/* null */; sva < blkendva ; sva += PAGE_SIZE, pte++) {
			if (!pmap_valid_entry(*pte))
				continue;
			printf("va %#lx -> pa %#lx (pte=%#lx)\n",
			    sva, (unsigned long)pmap_pte2pa(*pte),
			    (unsigned long)*pte);
		}
	}
	pmap_unmap_ptes(pmap, pmap2);
	kpreempt_enable();
}
#endif

/*
 * pmap_tlb_shootdown: invalidate pages on all CPUs using pmap 'pm'
 *
 * => always invalidates locally before returning
 * => returns before remote CPUs have invalidated
 * => must be called with preemption disabled
 */

void
pmap_tlb_shootdown(struct pmap *pm, vaddr_t sva, vaddr_t eva, pt_entry_t pte)
{
#ifdef MULTIPROCESSOR
	extern bool x86_mp_online;
	struct cpu_info *ci;
	struct pmap_mbox *mb, *selfmb;
	CPU_INFO_ITERATOR cii;
	uintptr_t head;
	u_int count;
	int s;
#endif /* MULTIPROCESSOR */
	struct cpu_info *self;
	bool kernel;

	KASSERT(eva == 0 || eva >= sva);
	KASSERT(kpreempt_disabled());

	if (pte & PG_PS)
		sva &= PG_LGFRAME;
	pte &= PG_G;
	self = curcpu();

	if (sva == (vaddr_t)-1LL) {
		kernel = true;
	} else {
		if (eva == 0)
			eva = sva + PAGE_SIZE;
		kernel = sva >= VM_MAXUSER_ADDRESS;
		KASSERT(kernel == (eva > VM_MAXUSER_ADDRESS));
	}

	/*
	 * if tearing down the pmap, do nothing. we'll flush later
	 * when we're ready to recycle/destroy it.
	 */
	if (__predict_false(curlwp->l_md.md_gc_pmap == pm)) {
		return;
	}

	/*
	 * If the range is larger than 32 pages, then invalidate
	 * everything.
	 */
	if (sva != (vaddr_t)-1LL && eva - sva > (32 * PAGE_SIZE)) {
		sva = (vaddr_t)-1LL;
		eva = sva;
	}

#ifdef MULTIPROCESSOR
	if (ncpu > 1 && x86_mp_online) {
		selfmb = &self->ci_pmap_cpu->pc_mbox;

		/*
		 * If the CPUs have no notion of global pages then
		 * reload of %cr3 is sufficient.
		 */
		if (pte != 0 && (cpu_feature & CPUID_PGE) == 0)
			pte = 0;

		if (pm == pmap_kernel()) {
			/*
			 * Mapped on all CPUs: use the broadcast mechanism.
			 * Once we have the lock, increment the counter.
			 */
			s = splvm();
			mb = &pmap_mbox;
			count = SPINLOCK_BACKOFF_MIN;
			do {
				if ((head = mb->mb_head) != mb->mb_tail) {
					splx(s);
					while ((head = mb->mb_head) !=
					    mb->mb_tail)
						SPINLOCK_BACKOFF(count);
					s = splvm();
				}
			} while (atomic_cas_ulong(
			    (volatile u_long *)&mb->mb_head,
			    head, head + ncpu - 1) != head);

			/*
			 * Once underway we must stay at IPL_VM until the
			 * IPI is dispatched. Otherwise interrupt handlers
			 * on this CPU can deadlock against us.
			 */
			pmap_tlb_evcnt.ev_count++;
			mb->mb_pointer = self;
			mb->mb_addr1 = sva;
			mb->mb_addr2 = eva;
			mb->mb_global = pte;
			x86_ipi(LAPIC_TLB_BCAST_VECTOR, LAPIC_DEST_ALLEXCL,
			    LAPIC_DLMODE_FIXED);
			self->ci_need_tlbwait = 1;
			splx(s);
		} else if ((pm->pm_cpus & ~self->ci_cpumask) != 0 ||
		    (kernel && (pm->pm_kernel_cpus & ~self->ci_cpumask) != 0)) {
			/*
			 * We don't bother traversing the CPU list if only
			 * used by this CPU.
			 *
			 * We can't do global flushes with the multicast
			 * mechanism.
			 */
			KASSERT(pte == 0);

			/*
			 * Take ownership of the shootdown mailbox on each
			 * CPU, fill the details and fire it off.
			 */
			s = splvm();
			for (CPU_INFO_FOREACH(cii, ci)) {
				if (ci == self ||
				    !pmap_is_active(pm, ci, kernel) ||
				    !(ci->ci_flags & CPUF_RUNNING))
					continue;
				selfmb->mb_head++;
				mb = &ci->ci_pmap_cpu->pc_mbox;
				count = SPINLOCK_BACKOFF_MIN;
				while (atomic_cas_ulong(
				    (u_long *)&mb->mb_pointer,
				    0, (u_long)&selfmb->mb_tail) != 0) {
					splx(s);
					while (mb->mb_pointer != 0)
						SPINLOCK_BACKOFF(count);
					s = splvm();
				}
				mb->mb_addr1 = sva;
				mb->mb_addr2 = eva;
				mb->mb_global = pte;
				if (x86_ipi(LAPIC_TLB_MCAST_VECTOR,
				    ci->ci_cpuid, LAPIC_DLMODE_FIXED))
					panic("pmap_tlb_shootdown: ipi failed");
			}
			self->ci_need_tlbwait = 1;
			splx(s);
		}
	}
#endif /* MULTIPROCESSOR */

	/* Update the current CPU before waiting for others. */
	if (!pmap_is_active(pm, self, kernel))
		return;

	if (sva == (vaddr_t)-1LL) {
		if (pte != 0)
			tlbflushg();
		else
			tlbflush();
	} else {
		do {
			pmap_update_pg(sva);
			sva += PAGE_SIZE;
		} while (sva < eva);
	}
}

/*
 * pmap_tlb_shootwait: wait for pending TLB shootdowns to complete
 *
 * => only waits for operations generated by the current CPU
 * => must be called with preemption disabled
 */

void
pmap_tlb_shootwait(void)
{
	struct cpu_info *self;
	struct pmap_mbox *mb;

	KASSERT(kpreempt_disabled());

	/*
	 * Anything to do? XXX Really we want to avoid touching the cache
	 * lines of the two mailboxes, but the processor may read ahead.
	 */
	self = curcpu();
	if (!self->ci_need_tlbwait)
		return;
	self->ci_need_tlbwait = 0;

	/* If we own the global mailbox, wait for it to drain. */
	mb = &pmap_mbox;
	while (mb->mb_pointer == self && mb->mb_head != mb->mb_tail)
		x86_pause();

	/* If we own other CPU's mailboxes, wait for them to drain. */
	mb = &self->ci_pmap_cpu->pc_mbox;
	KASSERT(mb->mb_pointer != &mb->mb_tail);
	while (mb->mb_head != mb->mb_tail)
		x86_pause();
}

/*
 * pmap_update: process deferred invalidations
 */

void
pmap_update(struct pmap *pmap)
{
	struct vm_page *ptp, *empty_ptps;
	struct pmap_page *pp;
	lwp_t *l;

	/*
	 * if we have torn down this pmap, invalidate non-global TLB
	 * entries on any processors using it.
	 */
	l = curlwp;
	if (__predict_false(l->l_md.md_gc_pmap == pmap)) {
		l->l_md.md_gc_pmap = NULL;
		KPREEMPT_DISABLE(l);
		pmap_tlb_shootdown(pmap, -1, -1, 0);
		KPREEMPT_ENABLE(l);
	}

	/*
	 * wait for tlb shootdowns to complete before returning control
	 * to the caller.
	 */
	kpreempt_disable();
	pmap_tlb_shootwait();
	kpreempt_enable();

	/*
	 * now that shootdowns are complete, process deferred frees,
	 * but not from interrupt context.
	 */
	if (l->l_md.md_gc_ptp != NULL) {
		if (cpu_intr_p() || (l->l_pflag & LP_INTR) != 0) {
			return;
		}

		empty_ptps = l->l_md.md_gc_ptp;
		l->l_md.md_gc_ptp = NULL;

		while ((ptp = empty_ptps) != NULL) {
			ptp->flags |= PG_ZERO;
			pp = VM_PAGE_TO_PP(ptp);
			empty_ptps = pp->pp_link;
			LIST_INIT(&pp->pp_head.pvh_list);
			uvm_pagefree(ptp);
		}
	}
}

#if PTP_LEVELS > 4
#error "Unsupported number of page table mappings"
#endif

paddr_t
pmap_init_tmp_pgtbl(paddr_t pg)
{
	static bool maps_loaded;
	static const paddr_t x86_tmp_pml_paddr[] = {
		4 * PAGE_SIZE,
		5 * PAGE_SIZE,
		6 * PAGE_SIZE,
		7 * PAGE_SIZE
	};
	static vaddr_t x86_tmp_pml_vaddr[] = { 0, 0, 0, 0 };

	pd_entry_t *tmp_pml, *kernel_pml;

	int level;

	if (!maps_loaded) {
		for (level = 0; level < PTP_LEVELS; ++level) {
			x86_tmp_pml_vaddr[level] =
			    uvm_km_alloc(kernel_map, PAGE_SIZE, 0,
			    UVM_KMF_VAONLY);

			if (x86_tmp_pml_vaddr[level] == 0)
				panic("mapping of real mode PML failed\n");
			pmap_kenter_pa(x86_tmp_pml_vaddr[level],
			    x86_tmp_pml_paddr[level],
			    VM_PROT_READ | VM_PROT_WRITE);
			pmap_update(pmap_kernel());
		}
		maps_loaded = true;
	}

	/* Zero levels 1-3 */
	for (level = 0; level < PTP_LEVELS - 1; ++level) {
		tmp_pml = (void *)x86_tmp_pml_vaddr[level];
		memset(tmp_pml, 0, PAGE_SIZE);
	}

	/* Copy PML4 */
	kernel_pml = pmap_kernel()->pm_pdir;
	tmp_pml = (void *)x86_tmp_pml_vaddr[PTP_LEVELS - 1];
	memcpy(tmp_pml, kernel_pml, PAGE_SIZE);

	/* Hook our own level 3 in */
	tmp_pml[pl_i(pg, PTP_LEVELS)] =
	    (x86_tmp_pml_paddr[PTP_LEVELS - 2] & PG_FRAME) | PG_RW | PG_V;

	for (level = PTP_LEVELS - 1; level > 0; --level) {
		tmp_pml = (void *)x86_tmp_pml_vaddr[level];

		tmp_pml[pl_i(pg, level + 1)] =
		    (x86_tmp_pml_paddr[level - 1] & PG_FRAME) | PG_RW | PG_V;
	}

	tmp_pml = (void *)x86_tmp_pml_vaddr[0];
	tmp_pml[pl_i(pg, 1)] = (pg & PG_FRAME) | PG_RW | PG_V;

	return x86_tmp_pml_paddr[PTP_LEVELS - 1];
}
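
/*
 * Usage sketch (editor's addition, not part of the original file):
 * a low-level consumer such as the ACPI wakeup or secondary-CPU
 * bootstrap code would use the temporary page tables built above
 * roughly like this, where "tmp_pg" stands for the physical page that
 * holds the real-mode trampoline:
 *
 *	paddr_t tmp_pdirpa;
 *
 *	tmp_pdirpa = pmap_init_tmp_pgtbl(tmp_pg);
 *	(stash tmp_pdirpa where the trampoline can load it into %cr3)
 *
 * The resulting tables identity-map only the single page at "tmp_pg"
 * and otherwise mirror pmap_kernel(), which is enough to turn paging
 * on and jump into the kernel's normal mappings.
 */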