1 /* $NetBSD: pmap.c,v 1.113 2010/07/24 00:45:56 jym Exp $ */ 2 3 /* 4 * Copyright (c) 2007 Manuel Bouyer. 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions 8 * are met: 9 * 1. Redistributions of source code must retain the above copyright 10 * notice, this list of conditions and the following disclaimer. 11 * 2. Redistributions in binary form must reproduce the above copyright 12 * notice, this list of conditions and the following disclaimer in the 13 * documentation and/or other materials provided with the distribution. 14 * 15 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 16 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 17 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 18 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 19 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 20 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 21 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 22 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 24 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 * 26 */ 27 28 /* 29 * Copyright (c) 2006 Mathieu Ropert <mro@adviseo.fr> 30 * 31 * Permission to use, copy, modify, and distribute this software for any 32 * purpose with or without fee is hereby granted, provided that the above 33 * copyright notice and this permission notice appear in all copies. 34 * 35 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 36 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 37 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 38 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 39 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 40 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 41 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 42 */ 43 44 /* 45 * 46 * Copyright (c) 1997 Charles D. Cranor and Washington University. 47 * All rights reserved. 48 * 49 * Redistribution and use in source and binary forms, with or without 50 * modification, are permitted provided that the following conditions 51 * are met: 52 * 1. Redistributions of source code must retain the above copyright 53 * notice, this list of conditions and the following disclaimer. 54 * 2. Redistributions in binary form must reproduce the above copyright 55 * notice, this list of conditions and the following disclaimer in the 56 * documentation and/or other materials provided with the distribution. 57 * 3. All advertising materials mentioning features or use of this software 58 * must display the following acknowledgement: 59 * This product includes software developed by Charles D. Cranor and 60 * Washington University. 61 * 4. The name of the author may not be used to endorse or promote products 62 * derived from this software without specific prior written permission. 63 * 64 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 65 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 66 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
67 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 68 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 69 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 70 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 71 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 72 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 73 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 74 */ 75 76 /* 77 * Copyright 2001 (c) Wasabi Systems, Inc. 78 * All rights reserved. 79 * 80 * Written by Frank van der Linden for Wasabi Systems, Inc. 81 * 82 * Redistribution and use in source and binary forms, with or without 83 * modification, are permitted provided that the following conditions 84 * are met: 85 * 1. Redistributions of source code must retain the above copyright 86 * notice, this list of conditions and the following disclaimer. 87 * 2. Redistributions in binary form must reproduce the above copyright 88 * notice, this list of conditions and the following disclaimer in the 89 * documentation and/or other materials provided with the distribution. 90 * 3. All advertising materials mentioning features or use of this software 91 * must display the following acknowledgement: 92 * This product includes software developed for the NetBSD Project by 93 * Wasabi Systems, Inc. 94 * 4. The name of Wasabi Systems, Inc. may not be used to endorse 95 * or promote products derived from this software without specific prior 96 * written permission. 97 * 98 * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND 99 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 100 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 101 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL WASABI SYSTEMS, INC 102 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 103 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 104 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 105 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 106 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 107 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 108 * POSSIBILITY OF SUCH DAMAGE. 109 */ 110 111 /* 112 * This is the i386 pmap modified and generalized to support x86-64 113 * as well. The idea is to hide the upper N levels of the page tables 114 * inside pmap_get_ptp, pmap_free_ptp and pmap_growkernel. The rest 115 * is mostly untouched, except that it uses some more generalized 116 * macros and interfaces. 117 * 118 * This pmap has been tested on the i386 as well, and it can be easily 119 * adapted to PAE. 120 * 121 * fvdl@wasabisystems.com 18-Jun-2001 122 */ 123 124 /* 125 * pmap.c: i386 pmap module rewrite 126 * Chuck Cranor <chuck@ccrc.wustl.edu> 127 * 11-Aug-97 128 * 129 * history of this pmap module: in addition to my own input, i used 130 * the following references for this rewrite of the i386 pmap: 131 * 132 * [1] the NetBSD i386 pmap. this pmap appears to be based on the 133 * BSD hp300 pmap done by Mike Hibler at University of Utah. 134 * it was then ported to the i386 by William Jolitz of UUNET 135 * Technologies, Inc. Then Charles M. Hannum of the NetBSD 136 * project fixed some bugs and provided some speed ups. 137 * 138 * [2] the FreeBSD i386 pmap. 
this pmap seems to be the 139 * Hibler/Jolitz pmap, as modified for FreeBSD by John S. Dyson 140 * and David Greenman. 141 * 142 * [3] the Mach pmap. this pmap, from CMU, seems to have migrated 143 * between several processors. the VAX version was done by 144 * Avadis Tevanian, Jr., and Michael Wayne Young. the i386 145 * version was done by Lance Berc, Mike Kupfer, Bob Baron, 146 * David Golub, and Richard Draves. the alpha version was 147 * done by Alessandro Forin (CMU/Mach) and Chris Demetriou 148 * (NetBSD/alpha). 149 */ 150 151 #include <sys/cdefs.h> 152 __KERNEL_RCSID(0, "$NetBSD: pmap.c,v 1.113 2010/07/24 00:45:56 jym Exp $"); 153 154 #include "opt_user_ldt.h" 155 #include "opt_lockdebug.h" 156 #include "opt_multiprocessor.h" 157 #include "opt_xen.h" 158 #if !defined(__x86_64__) 159 #include "opt_kstack_dr0.h" 160 #endif /* !defined(__x86_64__) */ 161 162 #include <sys/param.h> 163 #include <sys/systm.h> 164 #include <sys/proc.h> 165 #include <sys/pool.h> 166 #include <sys/kernel.h> 167 #include <sys/atomic.h> 168 #include <sys/cpu.h> 169 #include <sys/intr.h> 170 #include <sys/xcall.h> 171 172 #include <uvm/uvm.h> 173 174 #include <dev/isa/isareg.h> 175 176 #include <machine/specialreg.h> 177 #include <machine/gdt.h> 178 #include <machine/isa_machdep.h> 179 #include <machine/cpuvar.h> 180 181 #include <x86/pmap.h> 182 #include <x86/pmap_pv.h> 183 184 #include <x86/i82489reg.h> 185 #include <x86/i82489var.h> 186 187 #ifdef XEN 188 #include <xen/xen3-public/xen.h> 189 #include <xen/hypervisor.h> 190 #endif 191 192 /* flag to be used for kernel mappings: PG_u on Xen/amd64, 0 otherwise */ 193 #if defined(XEN) && defined(__x86_64__) 194 #define PG_k PG_u 195 #else 196 #define PG_k 0 197 #endif 198 199 /* 200 * general info: 201 * 202 * - for an explanation of how the i386 MMU hardware works see 203 * the comments in <machine/pte.h>. 204 * 205 * - for an explanation of the general memory structure used by 206 * this pmap (including the recursive mapping), see the comments 207 * in <machine/pmap.h>. 208 * 209 * this file contains the code for the "pmap module." the module's 210 * job is to manage the hardware's virtual to physical address mappings. 211 * note that there are two levels of mapping in the VM system: 212 * 213 * [1] the upper layer of the VM system uses vm_map's and vm_map_entry's 214 * to map ranges of virtual address space to objects/files. for 215 * example, the vm_map may say: "map VA 0x1000 to 0x22000 read-only 216 * to the file /bin/ls starting at offset zero." note that 217 * the upper layer mapping is not concerned with how individual 218 * vm_pages are mapped. 219 * 220 * [2] the lower layer of the VM system (the pmap) maintains the mappings 221 * from virtual addresses. it is concerned with which vm_page is 222 * mapped where. for example, when you run /bin/ls and start 223 * at page 0x1000 the fault routine may lookup the correct page 224 * of the /bin/ls file and then ask the pmap layer to establish 225 * a mapping for it. 226 * 227 * note that information in the lower layer of the VM system can be 228 * thrown away since it can easily be reconstructed from the info 229 * in the upper layer. 230 * 231 * data structures we use include: 232 * 233 * - struct pmap: describes the address space of one thread 234 * - struct pv_entry: describes one <PMAP,VA> mapping of a PA 235 * - struct pv_head: there is one pv_head per managed page of 236 * physical memory. 
the pv_head points to a list of pv_entry 237 * structures which describe all the <PMAP,VA> pairs that this 238 * page is mapped in. this is critical for page based operations 239 * such as pmap_page_protect() [change protection on _all_ mappings 240 * of a page] 241 */ 242 243 /* 244 * memory allocation 245 * 246 * - there are three data structures that we must dynamically allocate: 247 * 248 * [A] new process' page directory page (PDP) 249 * - plan 1: done at pmap_create() we use 250 * uvm_km_alloc(kernel_map, PAGE_SIZE) [fka kmem_alloc] to do this 251 * allocation. 252 * 253 * if we are low in free physical memory then we sleep in 254 * uvm_km_alloc -- in this case this is ok since we are creating 255 * a new pmap and should not be holding any locks. 256 * 257 * if the kernel is totally out of virtual space 258 * (i.e. uvm_km_alloc returns NULL), then we panic. 259 * 260 * [B] new page tables pages (PTP) 261 * - call uvm_pagealloc() 262 * => success: zero page, add to pm_pdir 263 * => failure: we are out of free vm_pages, let pmap_enter() 264 * tell UVM about it. 265 * 266 * note: for kernel PTPs, we start with NKPTP of them. as we map 267 * kernel memory (at uvm_map time) we check to see if we've grown 268 * the kernel pmap. if so, we call the optional function 269 * pmap_growkernel() to grow the kernel PTPs in advance. 270 * 271 * [C] pv_entry structures 272 */ 273 274 /* 275 * locking 276 * 277 * we have the following locks that we must contend with: 278 * 279 * mutexes: 280 * 281 * - pmap lock (per pmap, part of uvm_object) 282 * this lock protects the fields in the pmap structure including 283 * the non-kernel PDEs in the PDP, and the PTEs. it also locks 284 * in the alternate PTE space (since that is determined by the 285 * entry in the PDP). 286 * 287 * - pvh_lock (per pv_head) 288 * this lock protects the pv_entry list which is chained off the 289 * pv_head structure for a specific managed PA. it is locked 290 * when traversing the list (e.g. adding/removing mappings, 291 * syncing R/M bits, etc.) 292 * 293 * - pmaps_lock 294 * this lock protects the list of active pmaps (headed by "pmaps"). 295 * we lock it when adding or removing pmaps from this list. 296 * 297 * tlb shootdown 298 * 299 * tlb shootdowns are hard interrupts that operate outside the spl 300 * framework: they don't need to be blocked provided that the pmap module 301 * gets the order of events correct. the calls are made by talking directly 302 * to the lapic. the stubs to handle the interrupts are quite short and do 303 * one of the following: invalidate a single page, a range of pages, all 304 * user tlb entries or the entire tlb. 305 * 306 * the cpus synchronize with each other using pmap_mbox structures which are 307 * aligned on 64-byte cache lines. tlb shootdowns against the kernel pmap 308 * use a global mailbox and are generated using a broadcast ipi (broadcast 309 * to all but the sending cpu). shootdowns against regular pmaps use 310 * per-cpu mailboxes and are multicast. kernel and user shootdowns can 311 * execute simultaneously, as can shootdowns within different multithreaded 312 * processes. TODO: 313 * 314 * 1. figure out which waitpoints can be deferered to pmap_update(). 315 * 2. see if there is a cheap way to batch some updates. 
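 *
 * as an illustration only (a sketch pieced together from calls made
 * elsewhere in this file, not an additional interface): a caller that
 * has just changed a kernel PTE and must wait for the invalidation to
 * complete does roughly the following, where "va" and "opte" are the
 * affected virtual address and the old PTE value:
 *
 *	kpreempt_disable();
 *	pmap_tlb_shootdown(pmap_kernel(), va, 0, opte);
 *	pmap_tlb_shootwait();
 *	kpreempt_enable();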
316 */ 317 318 const vaddr_t ptp_masks[] = PTP_MASK_INITIALIZER; 319 const int ptp_shifts[] = PTP_SHIFT_INITIALIZER; 320 const long nkptpmax[] = NKPTPMAX_INITIALIZER; 321 const long nbpd[] = NBPD_INITIALIZER; 322 pd_entry_t * const normal_pdes[] = PDES_INITIALIZER; 323 pd_entry_t * const alternate_pdes[] = APDES_INITIALIZER; 324 325 long nkptp[] = NKPTP_INITIALIZER; 326 327 static kmutex_t pmaps_lock; 328 329 static vaddr_t pmap_maxkvaddr; 330 331 #define COUNT(x) /* nothing */ 332 333 /* 334 * XXX kludge: dummy locking to make KASSERTs in uvm_page.c comfortable. 335 * actual locking is done by pm_lock. 336 */ 337 #if defined(DIAGNOSTIC) 338 #define PMAP_SUBOBJ_LOCK(pm, idx) \ 339 KASSERT(mutex_owned(&(pm)->pm_lock)); \ 340 if ((idx) != 0) \ 341 mutex_enter(&(pm)->pm_obj[(idx)].vmobjlock) 342 #define PMAP_SUBOBJ_UNLOCK(pm, idx) \ 343 KASSERT(mutex_owned(&(pm)->pm_lock)); \ 344 if ((idx) != 0) \ 345 mutex_exit(&(pm)->pm_obj[(idx)].vmobjlock) 346 #else /* defined(DIAGNOSTIC) */ 347 #define PMAP_SUBOBJ_LOCK(pm, idx) /* nothing */ 348 #define PMAP_SUBOBJ_UNLOCK(pm, idx) /* nothing */ 349 #endif /* defined(DIAGNOSTIC) */ 350 351 /* 352 * Misc. event counters. 353 */ 354 struct evcnt pmap_iobmp_evcnt; 355 struct evcnt pmap_ldt_evcnt; 356 357 /* 358 * Global TLB shootdown mailbox. 359 */ 360 struct evcnt pmap_tlb_evcnt __aligned(64); 361 struct pmap_mbox pmap_mbox __aligned(64); 362 363 /* 364 * PAT 365 */ 366 #define PATENTRY(n, type) (type << ((n) * 8)) 367 #define PAT_UC 0x0ULL 368 #define PAT_WC 0x1ULL 369 #define PAT_WT 0x4ULL 370 #define PAT_WP 0x5ULL 371 #define PAT_WB 0x6ULL 372 #define PAT_UCMINUS 0x7ULL 373 374 static bool cpu_pat_enabled = false; 375 376 377 /* 378 * Per-CPU data. The pmap mailbox is cache intensive so gets its 379 * own line. Note that the mailbox must be the first item. 380 */ 381 struct pmap_cpu { 382 /* TLB shootdown */ 383 struct pmap_mbox pc_mbox; 384 }; 385 386 union { 387 struct pmap_cpu pc; 388 uint8_t padding[64]; 389 } pmap_cpu[MAXCPUS] __aligned(64); 390 391 /* 392 * global data structures 393 */ 394 395 static struct pmap kernel_pmap_store; /* the kernel's pmap (proc0) */ 396 struct pmap *const kernel_pmap_ptr = &kernel_pmap_store; 397 398 /* 399 * pmap_pg_g: if our processor supports PG_G in the PTE then we 400 * set pmap_pg_g to PG_G (otherwise it is zero). 401 */ 402 403 int pmap_pg_g = 0; 404 405 /* 406 * pmap_largepages: if our processor supports PG_PS and we are 407 * using it, this is set to true. 408 */ 409 410 int pmap_largepages; 411 412 /* 413 * i386 physical memory comes in a big contig chunk with a small 414 * hole toward the front of it... the following two paddr_t's 415 * (shared with machdep.c) describe the physical address space 416 * of this machine. 
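 *
 * note: before uvm(9) takes over, early boot code "steals" physical
 * pages simply by advancing avail_start; a sketch of the pattern used
 * later in pmap_bootstrap() ("pa" is just an illustrative local):
 *
 *	pa = avail_start;
 *	avail_start += PAGE_SIZE;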
417 */ 418 paddr_t avail_start; /* PA of first available physical page */ 419 paddr_t avail_end; /* PA of last available physical page */ 420 421 #ifdef XEN 422 #ifdef __x86_64__ 423 /* Dummy PGD for user cr3, used between pmap_deactivate() and pmap_activate() */ 424 static paddr_t xen_dummy_user_pgd; 425 #endif /* __x86_64__ */ 426 paddr_t pmap_pa_start; /* PA of first physical page for this domain */ 427 paddr_t pmap_pa_end; /* PA of last physical page for this domain */ 428 #endif /* XEN */ 429 430 #define VM_PAGE_TO_PP(pg) (&(pg)->mdpage.mp_pp) 431 432 #define pp_lock(pp) mutex_spin_enter(&(pp)->pp_lock) 433 #define pp_unlock(pp) mutex_spin_exit(&(pp)->pp_lock) 434 #define pp_locked(pp) mutex_owned(&(pp)->pp_lock) 435 436 #define PV_HASH_SIZE 32768 437 #define PV_HASH_LOCK_CNT 32 438 439 struct pv_hash_lock { 440 kmutex_t lock; 441 } __aligned(CACHE_LINE_SIZE) pv_hash_locks[PV_HASH_LOCK_CNT] 442 __aligned(CACHE_LINE_SIZE); 443 444 struct pv_hash_head { 445 SLIST_HEAD(, pv_entry) hh_list; 446 } pv_hash_heads[PV_HASH_SIZE]; 447 448 static u_int 449 pvhash_hash(struct vm_page *ptp, vaddr_t va) 450 { 451 452 return (uintptr_t)ptp / sizeof(*ptp) + (va >> PAGE_SHIFT); 453 } 454 455 static struct pv_hash_head * 456 pvhash_head(u_int hash) 457 { 458 459 return &pv_hash_heads[hash % PV_HASH_SIZE]; 460 } 461 462 static kmutex_t * 463 pvhash_lock(u_int hash) 464 { 465 466 return &pv_hash_locks[hash % PV_HASH_LOCK_CNT].lock; 467 } 468 469 static struct pv_entry * 470 pvhash_remove(struct pv_hash_head *hh, struct vm_page *ptp, vaddr_t va) 471 { 472 struct pv_entry *pve; 473 struct pv_entry *prev; 474 475 prev = NULL; 476 SLIST_FOREACH(pve, &hh->hh_list, pve_hash) { 477 if (pve->pve_pte.pte_ptp == ptp && 478 pve->pve_pte.pte_va == va) { 479 if (prev != NULL) { 480 SLIST_REMOVE_AFTER(prev, pve_hash); 481 } else { 482 SLIST_REMOVE_HEAD(&hh->hh_list, pve_hash); 483 } 484 break; 485 } 486 prev = pve; 487 } 488 return pve; 489 } 490 491 /* 492 * other data structures 493 */ 494 495 static pt_entry_t protection_codes[8]; /* maps MI prot to i386 prot code */ 496 static bool pmap_initialized = false; /* pmap_init done yet? */ 497 498 /* 499 * the following two vaddr_t's are used during system startup 500 * to keep track of how much of the kernel's VM space we have used. 501 * once the system is started, the management of the remaining kernel 502 * VM space is turned over to the kernel_map vm_map. 503 */ 504 505 static vaddr_t virtual_avail; /* VA of first free KVA */ 506 static vaddr_t virtual_end; /* VA of last free KVA */ 507 508 /* 509 * linked list of all non-kernel pmaps 510 */ 511 512 static struct pmap_head pmaps; 513 514 /* 515 * pool that pmap structures are allocated from 516 */ 517 518 static struct pool_cache pmap_cache; 519 520 /* 521 * pv_entry cache 522 */ 523 524 static struct pool_cache pmap_pv_cache; 525 526 /* 527 * MULTIPROCESSOR: special VA's/ PTE's are actually allocated inside a 528 * maxcpus*NPTECL array of PTE's, to avoid cache line thrashing 529 * due to false sharing. 
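 *
 * a sketch of how a caller selects its per-CPU slot ("id", "spte" and
 * "srcva" are illustrative locals; PTESLEW/VASLEW are defined just
 * below):
 *
 *	id = cpu_number();
 *	spte = PTESLEW(csrc_pte, id);
 *	srcva = VASLEW(csrcp, id);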
530 */ 531 532 #ifdef MULTIPROCESSOR 533 #define PTESLEW(pte, id) ((pte)+(id)*NPTECL) 534 #define VASLEW(va,id) ((va)+(id)*NPTECL*PAGE_SIZE) 535 #else 536 #define PTESLEW(pte, id) (pte) 537 #define VASLEW(va,id) (va) 538 #endif 539 540 /* 541 * special VAs and the PTEs that map them 542 */ 543 static pt_entry_t *csrc_pte, *cdst_pte, *zero_pte, *ptp_pte, *early_zero_pte; 544 static char *csrcp, *cdstp, *zerop, *ptpp, *early_zerop; 545 546 int pmap_enter_default(pmap_t, vaddr_t, paddr_t, vm_prot_t, u_int); 547 548 /* 549 * pool and cache that PDPs are allocated from 550 */ 551 552 static struct pool_cache pmap_pdp_cache; 553 int pmap_pdp_ctor(void *, void *, int); 554 void pmap_pdp_dtor(void *, void *); 555 #ifdef PAE 556 /* need to allocate items of 4 pages */ 557 void *pmap_pdp_alloc(struct pool *, int); 558 void pmap_pdp_free(struct pool *, void *); 559 static struct pool_allocator pmap_pdp_allocator = { 560 .pa_alloc = pmap_pdp_alloc, 561 .pa_free = pmap_pdp_free, 562 .pa_pagesz = PAGE_SIZE * PDP_SIZE, 563 }; 564 #endif /* PAE */ 565 566 void *vmmap; /* XXX: used by mem.c... it should really uvm_map_reserve it */ 567 568 extern vaddr_t idt_vaddr; /* we allocate IDT early */ 569 extern paddr_t idt_paddr; 570 571 #ifdef _LP64 572 extern vaddr_t lo32_vaddr; 573 extern vaddr_t lo32_paddr; 574 #endif 575 576 extern int end; 577 578 #ifdef i386 579 /* stuff to fix the pentium f00f bug */ 580 extern vaddr_t pentium_idt_vaddr; 581 #endif 582 583 584 /* 585 * local prototypes 586 */ 587 588 static struct vm_page *pmap_get_ptp(struct pmap *, vaddr_t, 589 pd_entry_t * const *); 590 static struct vm_page *pmap_find_ptp(struct pmap *, vaddr_t, paddr_t, int); 591 static void pmap_freepage(struct pmap *, struct vm_page *, int); 592 static void pmap_free_ptp(struct pmap *, struct vm_page *, 593 vaddr_t, pt_entry_t *, 594 pd_entry_t * const *); 595 static bool pmap_is_curpmap(struct pmap *); 596 static bool pmap_is_active(struct pmap *, struct cpu_info *, bool); 597 static bool pmap_remove_pte(struct pmap *, struct vm_page *, 598 pt_entry_t *, vaddr_t, 599 struct pv_entry **); 600 static pt_entry_t pmap_remove_ptes(struct pmap *, struct vm_page *, 601 vaddr_t, vaddr_t, vaddr_t, 602 struct pv_entry **); 603 604 static void pmap_unmap_apdp(void); 605 static bool pmap_get_physpage(vaddr_t, int, paddr_t *); 606 static void pmap_alloc_level(pd_entry_t * const *, vaddr_t, int, 607 long *); 608 609 static bool pmap_reactivate(struct pmap *); 610 611 /* 612 * p m a p h e l p e r f u n c t i o n s 613 */ 614 615 static inline void 616 pmap_stats_update(struct pmap *pmap, int resid_diff, int wired_diff) 617 { 618 619 if (pmap == pmap_kernel()) { 620 atomic_add_long(&pmap->pm_stats.resident_count, resid_diff); 621 atomic_add_long(&pmap->pm_stats.wired_count, wired_diff); 622 } else { 623 KASSERT(mutex_owned(&pmap->pm_lock)); 624 pmap->pm_stats.resident_count += resid_diff; 625 pmap->pm_stats.wired_count += wired_diff; 626 } 627 } 628 629 static inline void 630 pmap_stats_update_bypte(struct pmap *pmap, pt_entry_t npte, pt_entry_t opte) 631 { 632 int resid_diff = ((npte & PG_V) ? 1 : 0) - ((opte & PG_V) ? 1 : 0); 633 int wired_diff = ((npte & PG_W) ? 1 : 0) - ((opte & PG_W) ? 
1 : 0); 634 635 KASSERT((npte & (PG_V | PG_W)) != PG_W); 636 KASSERT((opte & (PG_V | PG_W)) != PG_W); 637 638 pmap_stats_update(pmap, resid_diff, wired_diff); 639 } 640 641 /* 642 * ptp_to_pmap: lookup pmap by ptp 643 */ 644 645 static struct pmap * 646 ptp_to_pmap(struct vm_page *ptp) 647 { 648 struct pmap *pmap; 649 650 if (ptp == NULL) { 651 return pmap_kernel(); 652 } 653 pmap = (struct pmap *)ptp->uobject; 654 KASSERT(pmap != NULL); 655 KASSERT(&pmap->pm_obj[0] == ptp->uobject); 656 return pmap; 657 } 658 659 static inline struct pv_pte * 660 pve_to_pvpte(struct pv_entry *pve) 661 { 662 663 KASSERT((void *)&pve->pve_pte == (void *)pve); 664 return &pve->pve_pte; 665 } 666 667 static inline struct pv_entry * 668 pvpte_to_pve(struct pv_pte *pvpte) 669 { 670 struct pv_entry *pve = (void *)pvpte; 671 672 KASSERT(pve_to_pvpte(pve) == pvpte); 673 return pve; 674 } 675 676 /* 677 * pv_pte_first, pv_pte_next: PV list iterator. 678 */ 679 680 static struct pv_pte * 681 pv_pte_first(struct pmap_page *pp) 682 { 683 684 KASSERT(pp_locked(pp)); 685 if ((pp->pp_flags & PP_EMBEDDED) != 0) { 686 return &pp->pp_pte; 687 } 688 return pve_to_pvpte(LIST_FIRST(&pp->pp_head.pvh_list)); 689 } 690 691 static struct pv_pte * 692 pv_pte_next(struct pmap_page *pp, struct pv_pte *pvpte) 693 { 694 695 KASSERT(pvpte != NULL); 696 KASSERT(pp_locked(pp)); 697 if (pvpte == &pp->pp_pte) { 698 KASSERT((pp->pp_flags & PP_EMBEDDED) != 0); 699 return NULL; 700 } 701 KASSERT((pp->pp_flags & PP_EMBEDDED) == 0); 702 return pve_to_pvpte(LIST_NEXT(pvpte_to_pve(pvpte), pve_list)); 703 } 704 705 /* 706 * pmap_is_curpmap: is this pmap the one currently loaded [in %cr3]? 707 * of course the kernel is always loaded 708 */ 709 710 inline static bool 711 pmap_is_curpmap(struct pmap *pmap) 712 { 713 #if defined(XEN) && defined(__x86_64__) 714 /* 715 * Only kernel pmap is physically loaded. 716 * User PGD may be active, but TLB will be flushed 717 * with HYPERVISOR_iret anyway, so let's say no 718 */ 719 return(pmap == pmap_kernel()); 720 #else /* XEN && __x86_64__*/ 721 return((pmap == pmap_kernel()) || 722 (pmap == curcpu()->ci_pmap)); 723 #endif 724 } 725 726 /* 727 * pmap_is_active: is this pmap loaded into the specified processor's %cr3? 728 */ 729 730 inline static bool 731 pmap_is_active(struct pmap *pmap, struct cpu_info *ci, bool kernel) 732 { 733 734 return (pmap == pmap_kernel() || 735 (pmap->pm_cpus & ci->ci_cpumask) != 0 || 736 (kernel && (pmap->pm_kernel_cpus & ci->ci_cpumask) != 0)); 737 } 738 739 static void 740 pmap_apte_flush(struct pmap *pmap) 741 { 742 743 KASSERT(kpreempt_disabled()); 744 745 /* 746 * Flush the APTE mapping from all other CPUs that 747 * are using the pmap we are using (who's APTE space 748 * is the one we've just modified). 749 * 750 * XXXthorpej -- find a way to defer the IPI. 751 */ 752 pmap_tlb_shootdown(pmap, (vaddr_t)-1LL, 0, 0); 753 pmap_tlb_shootwait(); 754 } 755 756 /* 757 * Unmap the content of APDP PDEs 758 */ 759 static void 760 pmap_unmap_apdp(void) 761 { 762 int i; 763 764 for (i = 0; i < PDP_SIZE; i++) { 765 pmap_pte_set(APDP_PDE+i, 0); 766 #if defined (XEN) && defined (PAE) 767 /* clear shadow entries too */ 768 pmap_pte_set(APDP_PDE_SHADOW+i, 0); 769 #endif 770 } 771 } 772 773 /* 774 * Add a reference to the specified pmap. 
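 *
 * note that each reference taken here must eventually be dropped with
 * pmap_destroy(); pmap_unmap_ptes() does exactly that for the pmap
 * referenced in pmap_map_ptes().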
775 */ 776 777 inline void 778 pmap_reference(struct pmap *pmap) 779 { 780 781 atomic_inc_uint(&pmap->pm_obj[0].uo_refs); 782 } 783 784 /* 785 * pmap_map_ptes: map a pmap's PTEs into KVM and lock them in 786 * 787 * => we lock enough pmaps to keep things locked in 788 * => must be undone with pmap_unmap_ptes before returning 789 */ 790 791 void 792 pmap_map_ptes(struct pmap *pmap, struct pmap **pmap2, 793 pd_entry_t **ptepp, pd_entry_t * const **pdeppp) 794 { 795 pd_entry_t opde, npde; 796 struct pmap *ourpmap; 797 struct cpu_info *ci; 798 struct lwp *l; 799 bool iscurrent; 800 uint64_t ncsw; 801 #ifdef XEN 802 int s, i; 803 #endif 804 805 /* the kernel's pmap is always accessible */ 806 if (pmap == pmap_kernel()) { 807 *pmap2 = NULL; 808 *ptepp = PTE_BASE; 809 *pdeppp = normal_pdes; 810 return; 811 } 812 KASSERT(kpreempt_disabled()); 813 814 retry: 815 l = curlwp; 816 ncsw = l->l_ncsw; 817 ourpmap = NULL; 818 ci = curcpu(); 819 #if defined(XEN) && defined(__x86_64__) 820 /* 821 * curmap can only be pmap_kernel so at this point 822 * pmap_is_curpmap is always false 823 */ 824 iscurrent = 0; 825 ourpmap = pmap_kernel(); 826 #else /* XEN && __x86_64__*/ 827 if (ci->ci_want_pmapload && 828 vm_map_pmap(&l->l_proc->p_vmspace->vm_map) == pmap) { 829 pmap_load(); 830 if (l->l_ncsw != ncsw) 831 goto retry; 832 } 833 iscurrent = pmap_is_curpmap(pmap); 834 /* if curpmap then we are always mapped */ 835 if (iscurrent) { 836 mutex_enter(&pmap->pm_lock); 837 *pmap2 = NULL; 838 *ptepp = PTE_BASE; 839 *pdeppp = normal_pdes; 840 goto out; 841 } 842 ourpmap = ci->ci_pmap; 843 #endif /* XEN && __x86_64__ */ 844 845 /* need to lock both curpmap and pmap: use ordered locking */ 846 pmap_reference(ourpmap); 847 if ((uintptr_t) pmap < (uintptr_t) ourpmap) { 848 mutex_enter(&pmap->pm_lock); 849 mutex_enter(&ourpmap->pm_lock); 850 } else { 851 mutex_enter(&ourpmap->pm_lock); 852 mutex_enter(&pmap->pm_lock); 853 } 854 855 if (l->l_ncsw != ncsw) 856 goto unlock_and_retry; 857 858 /* need to load a new alternate pt space into curpmap? */ 859 COUNT(apdp_pde_map); 860 opde = *APDP_PDE; 861 if (!pmap_valid_entry(opde) || 862 pmap_pte2pa(opde) != pmap_pdirpa(pmap, 0)) { 863 #ifdef XEN 864 s = splvm(); 865 /* Make recursive entry usable in user PGD */ 866 for (i = 0; i < PDP_SIZE; i++) { 867 npde = pmap_pa2pte( 868 pmap_pdirpa(pmap, i * NPDPG)) | PG_k | PG_V; 869 xpq_queue_pte_update( 870 xpmap_ptom(pmap_pdirpa(pmap, PDIR_SLOT_PTE + i)), 871 npde); 872 xpq_queue_pte_update(xpmap_ptetomach(&APDP_PDE[i]), 873 npde); 874 #ifdef PAE 875 /* update shadow entry too */ 876 xpq_queue_pte_update( 877 xpmap_ptetomach(&APDP_PDE_SHADOW[i]), npde); 878 #endif /* PAE */ 879 xpq_queue_invlpg( 880 (vaddr_t)&pmap->pm_pdir[PDIR_SLOT_PTE + i]); 881 } 882 if (pmap_valid_entry(opde)) 883 pmap_apte_flush(ourpmap); 884 splx(s); 885 #else /* XEN */ 886 int i; 887 for (i = 0; i < PDP_SIZE; i++) { 888 npde = pmap_pa2pte( 889 pmap_pdirpa(pmap, i * NPDPG)) | PG_RW | PG_V; 890 pmap_pte_set(APDP_PDE+i, npde); 891 } 892 pmap_pte_flush(); 893 if (pmap_valid_entry(opde)) 894 pmap_apte_flush(ourpmap); 895 #endif /* XEN */ 896 } 897 *pmap2 = ourpmap; 898 *ptepp = APTE_BASE; 899 *pdeppp = alternate_pdes; 900 KASSERT(l->l_ncsw == ncsw); 901 #if !defined(XEN) || !defined(__x86_64__) 902 out: 903 #endif 904 /* 905 * might have blocked, need to retry? 
906 */ 907 if (l->l_ncsw != ncsw) { 908 unlock_and_retry: 909 if (ourpmap != NULL) { 910 mutex_exit(&ourpmap->pm_lock); 911 pmap_destroy(ourpmap); 912 } 913 mutex_exit(&pmap->pm_lock); 914 goto retry; 915 } 916 917 return; 918 } 919 920 /* 921 * pmap_unmap_ptes: unlock the PTE mapping of "pmap" 922 */ 923 924 void 925 pmap_unmap_ptes(struct pmap *pmap, struct pmap *pmap2) 926 { 927 928 if (pmap == pmap_kernel()) { 929 return; 930 } 931 KASSERT(kpreempt_disabled()); 932 if (pmap2 == NULL) { 933 mutex_exit(&pmap->pm_lock); 934 } else { 935 #if defined(XEN) && defined(__x86_64__) 936 KASSERT(pmap2 == pmap_kernel()); 937 #else 938 KASSERT(curcpu()->ci_pmap == pmap2); 939 #endif 940 #if defined(MULTIPROCESSOR) 941 pmap_unmap_apdp(); 942 pmap_pte_flush(); 943 pmap_apte_flush(pmap2); 944 #endif 945 COUNT(apdp_pde_unmap); 946 mutex_exit(&pmap->pm_lock); 947 mutex_exit(&pmap2->pm_lock); 948 pmap_destroy(pmap2); 949 } 950 } 951 952 inline static void 953 pmap_exec_account(struct pmap *pm, vaddr_t va, pt_entry_t opte, pt_entry_t npte) 954 { 955 956 #if !defined(__x86_64__) 957 if (curproc == NULL || curproc->p_vmspace == NULL || 958 pm != vm_map_pmap(&curproc->p_vmspace->vm_map)) 959 return; 960 961 if ((opte ^ npte) & PG_X) 962 pmap_update_pg(va); 963 964 /* 965 * Executability was removed on the last executable change. 966 * Reset the code segment to something conservative and 967 * let the trap handler deal with setting the right limit. 968 * We can't do that because of locking constraints on the vm map. 969 */ 970 971 if ((opte & PG_X) && (npte & PG_X) == 0 && va == pm->pm_hiexec) { 972 struct trapframe *tf = curlwp->l_md.md_regs; 973 974 tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL); 975 pm->pm_hiexec = I386_MAX_EXE_ADDR; 976 } 977 #endif /* !defined(__x86_64__) */ 978 } 979 980 #if !defined(__x86_64__) 981 /* 982 * Fixup the code segment to cover all potential executable mappings. 983 * returns 0 if no changes to the code segment were made. 984 */ 985 986 int 987 pmap_exec_fixup(struct vm_map *map, struct trapframe *tf, struct pcb *pcb) 988 { 989 struct vm_map_entry *ent; 990 struct pmap *pm = vm_map_pmap(map); 991 vaddr_t va = 0; 992 993 vm_map_lock_read(map); 994 for (ent = (&map->header)->next; ent != &map->header; ent = ent->next) { 995 996 /* 997 * This entry has greater va than the entries before. 998 * We need to make it point to the last page, not past it. 999 */ 1000 1001 if (ent->protection & VM_PROT_EXECUTE) 1002 va = trunc_page(ent->end) - PAGE_SIZE; 1003 } 1004 vm_map_unlock_read(map); 1005 if (va == pm->pm_hiexec && tf->tf_cs == GSEL(GUCODEBIG_SEL, SEL_UPL)) 1006 return (0); 1007 1008 pm->pm_hiexec = va; 1009 if (pm->pm_hiexec > I386_MAX_EXE_ADDR) { 1010 tf->tf_cs = GSEL(GUCODEBIG_SEL, SEL_UPL); 1011 } else { 1012 tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL); 1013 return (0); 1014 } 1015 return (1); 1016 } 1017 #endif /* !defined(__x86_64__) */ 1018 1019 void 1020 pat_init(struct cpu_info *ci) 1021 { 1022 uint64_t pat; 1023 1024 if (!(ci->ci_feat_val[0] & CPUID_PAT)) 1025 return; 1026 1027 /* We change WT to WC. Leave all other entries the default values. 
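 * (PATENTRY(n, type) just shifts the memory type into byte n of the
 * PAT MSR image, e.g. PATENTRY(1, PAT_WC) == 0x1ULL << 8 == 0x100,
 * which is the write-through slot being turned into write combining.)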
*/ 1028 pat = PATENTRY(0, PAT_WB) | PATENTRY(1, PAT_WC) | 1029 PATENTRY(2, PAT_UCMINUS) | PATENTRY(3, PAT_UC) | 1030 PATENTRY(4, PAT_WB) | PATENTRY(5, PAT_WC) | 1031 PATENTRY(6, PAT_UCMINUS) | PATENTRY(7, PAT_UC); 1032 1033 wrmsr(MSR_CR_PAT, pat); 1034 cpu_pat_enabled = true; 1035 aprint_debug_dev(ci->ci_dev, "PAT enabled\n"); 1036 } 1037 1038 static pt_entry_t 1039 pmap_pat_flags(u_int flags) 1040 { 1041 u_int cacheflags = (flags & PMAP_CACHE_MASK); 1042 1043 if (!cpu_pat_enabled) { 1044 switch (cacheflags) { 1045 case PMAP_NOCACHE: 1046 case PMAP_NOCACHE_OVR: 1047 /* results in PGC_UCMINUS on cpus which have 1048 * the cpuid PAT but PAT "disabled" 1049 */ 1050 return PG_N; 1051 default: 1052 return 0; 1053 } 1054 } 1055 1056 switch (cacheflags) { 1057 case PMAP_NOCACHE: 1058 return PGC_UC; 1059 case PMAP_WRITE_COMBINE: 1060 return PGC_WC; 1061 case PMAP_WRITE_BACK: 1062 return PGC_WB; 1063 case PMAP_NOCACHE_OVR: 1064 return PGC_UCMINUS; 1065 } 1066 1067 return 0; 1068 } 1069 1070 /* 1071 * p m a p k e n t e r f u n c t i o n s 1072 * 1073 * functions to quickly enter/remove pages from the kernel address 1074 * space. pmap_kremove is exported to MI kernel. we make use of 1075 * the recursive PTE mappings. 1076 */ 1077 1078 /* 1079 * pmap_kenter_pa: enter a kernel mapping without R/M (pv_entry) tracking 1080 * 1081 * => no need to lock anything, assume va is already allocated 1082 * => should be faster than normal pmap enter function 1083 */ 1084 1085 void 1086 pmap_kenter_pa(vaddr_t va, paddr_t pa, vm_prot_t prot, u_int flags) 1087 { 1088 pt_entry_t *pte, opte, npte; 1089 1090 KASSERT(!(prot & ~VM_PROT_ALL)); 1091 1092 if (va < VM_MIN_KERNEL_ADDRESS) 1093 pte = vtopte(va); 1094 else 1095 pte = kvtopte(va); 1096 #ifdef DOM0OPS 1097 if (pa < pmap_pa_start || pa >= pmap_pa_end) { 1098 #ifdef DEBUG 1099 printk("pmap_kenter_pa: pa 0x%" PRIx64 " for va 0x%" PRIx64 1100 " outside range\n", (int64_t)pa, (int64_t)va); 1101 #endif /* DEBUG */ 1102 npte = pa; 1103 } else 1104 #endif /* DOM0OPS */ 1105 npte = pmap_pa2pte(pa); 1106 npte |= protection_codes[prot] | PG_k | PG_V | pmap_pg_g; 1107 npte |= pmap_pat_flags(flags); 1108 opte = pmap_pte_testset(pte, npte); /* zap! */ 1109 #if defined(DIAGNOSTIC) 1110 /* XXX For now... */ 1111 if (opte & PG_PS) 1112 panic("pmap_kenter_pa: PG_PS"); 1113 #endif 1114 if ((opte & (PG_V | PG_U)) == (PG_V | PG_U)) { 1115 /* This should not happen, so no need to batch updates. */ 1116 kpreempt_disable(); 1117 pmap_tlb_shootdown(pmap_kernel(), va, 0, opte); 1118 kpreempt_enable(); 1119 } 1120 } 1121 1122 void 1123 pmap_emap_enter(vaddr_t va, paddr_t pa, vm_prot_t prot) 1124 { 1125 pt_entry_t *pte, opte, npte; 1126 1127 KASSERT((prot & ~VM_PROT_ALL) == 0); 1128 pte = (va < VM_MIN_KERNEL_ADDRESS) ? vtopte(va) : kvtopte(va); 1129 1130 #ifdef DOM0OPS 1131 if (pa < pmap_pa_start || pa >= pmap_pa_end) { 1132 npte = pa; 1133 } else 1134 #endif 1135 npte = pmap_pa2pte(pa); 1136 1137 npte = pmap_pa2pte(pa); 1138 npte |= protection_codes[prot] | PG_k | PG_V; 1139 opte = pmap_pte_testset(pte, npte); 1140 } 1141 1142 /* 1143 * pmap_emap_sync: perform TLB flush or pmap load, if it was deferred. 1144 */ 1145 void 1146 pmap_emap_sync(bool canload) 1147 { 1148 struct cpu_info *ci = curcpu(); 1149 struct pmap *pmap; 1150 1151 KASSERT(kpreempt_disabled()); 1152 if (__predict_true(ci->ci_want_pmapload && canload)) { 1153 /* 1154 * XXX: Hint for pmap_reactivate(), which might suggest to 1155 * not perform TLB flush, if state has not changed. 
1156 */ 1157 pmap = vm_map_pmap(&curlwp->l_proc->p_vmspace->vm_map); 1158 if (__predict_false(pmap == ci->ci_pmap)) { 1159 const uint32_t cpumask = ci->ci_cpumask; 1160 atomic_and_32(&pmap->pm_cpus, ~cpumask); 1161 } 1162 pmap_load(); 1163 KASSERT(ci->ci_want_pmapload == 0); 1164 } else { 1165 tlbflush(); 1166 } 1167 1168 } 1169 1170 void 1171 pmap_emap_remove(vaddr_t sva, vsize_t len) 1172 { 1173 pt_entry_t *pte, xpte; 1174 vaddr_t va, eva = sva + len; 1175 1176 for (va = sva; va < eva; va += PAGE_SIZE) { 1177 pte = (va < VM_MIN_KERNEL_ADDRESS) ? vtopte(va) : kvtopte(va); 1178 xpte |= pmap_pte_testset(pte, 0); 1179 } 1180 } 1181 1182 __weak_alias(pmap_kenter_ma, pmap_kenter_pa); 1183 1184 #if defined(__x86_64__) 1185 /* 1186 * Change protection for a virtual address. Local for a CPU only, don't 1187 * care about TLB shootdowns. 1188 * 1189 * => must be called with preemption disabled 1190 */ 1191 void 1192 pmap_changeprot_local(vaddr_t va, vm_prot_t prot) 1193 { 1194 pt_entry_t *pte, opte, npte; 1195 1196 KASSERT(kpreempt_disabled()); 1197 1198 if (va < VM_MIN_KERNEL_ADDRESS) 1199 pte = vtopte(va); 1200 else 1201 pte = kvtopte(va); 1202 1203 npte = opte = *pte; 1204 1205 if ((prot & VM_PROT_WRITE) != 0) 1206 npte |= PG_RW; 1207 else 1208 npte &= ~PG_RW; 1209 1210 if (opte != npte) { 1211 pmap_pte_set(pte, npte); 1212 pmap_pte_flush(); 1213 invlpg(va); 1214 } 1215 } 1216 #endif /* defined(__x86_64__) */ 1217 1218 /* 1219 * pmap_kremove: remove a kernel mapping(s) without R/M (pv_entry) tracking 1220 * 1221 * => no need to lock anything 1222 * => caller must dispose of any vm_page mapped in the va range 1223 * => note: not an inline function 1224 * => we assume the va is page aligned and the len is a multiple of PAGE_SIZE 1225 * => we assume kernel only unmaps valid addresses and thus don't bother 1226 * checking the valid bit before doing TLB flushing 1227 * => must be followed by call to pmap_update() before reuse of page 1228 */ 1229 1230 void 1231 pmap_kremove(vaddr_t sva, vsize_t len) 1232 { 1233 pt_entry_t *pte, xpte; 1234 vaddr_t va, eva; 1235 1236 eva = sva + len; 1237 xpte = 0; 1238 1239 for (va = sva; va < eva; va += PAGE_SIZE) { 1240 if (va < VM_MIN_KERNEL_ADDRESS) 1241 pte = vtopte(va); 1242 else 1243 pte = kvtopte(va); 1244 xpte |= pmap_pte_testset(pte, 0); /* zap! */ 1245 #if defined(DIAGNOSTIC) 1246 /* XXX For now... */ 1247 if (xpte & PG_PS) 1248 panic("pmap_kremove: PG_PS"); 1249 if (xpte & PG_PVLIST) 1250 panic("pmap_kremove: PG_PVLIST mapping for 0x%lx", 1251 va); 1252 #endif 1253 } 1254 if ((xpte & (PG_V | PG_U)) == (PG_V | PG_U)) { 1255 kpreempt_disable(); 1256 pmap_tlb_shootdown(pmap_kernel(), sva, eva, xpte); 1257 kpreempt_enable(); 1258 } 1259 } 1260 1261 /* 1262 * p m a p i n i t f u n c t i o n s 1263 * 1264 * pmap_bootstrap and pmap_init are called during system startup 1265 * to init the pmap module. pmap_bootstrap() does a low level 1266 * init just to get things rolling. pmap_init() finishes the job. 1267 */ 1268 1269 /* 1270 * pmap_bootstrap: get the system in a state where it can run with VM 1271 * properly enabled (called before main()). the VM system is 1272 * fully init'd later... 1273 * 1274 * => on i386, locore.s has already enabled the MMU by allocating 1275 * a PDP for the kernel, and nkpde PTP's for the kernel. 
1276 * => kva_start is the first free virtual address in kernel space 1277 */ 1278 1279 void 1280 pmap_bootstrap(vaddr_t kva_start) 1281 { 1282 struct pmap *kpm; 1283 pt_entry_t *pte; 1284 int i; 1285 vaddr_t kva; 1286 #ifndef XEN 1287 unsigned long p1i; 1288 vaddr_t kva_end; 1289 #endif 1290 1291 pt_entry_t pg_nx = (cpu_feature[2] & CPUID_NOX ? PG_NX : 0); 1292 1293 /* 1294 * set up our local static global vars that keep track of the 1295 * usage of KVM before kernel_map is set up 1296 */ 1297 1298 virtual_avail = kva_start; /* first free KVA */ 1299 virtual_end = VM_MAX_KERNEL_ADDRESS; /* last KVA */ 1300 1301 /* 1302 * set up protection_codes: we need to be able to convert from 1303 * a MI protection code (some combo of VM_PROT...) to something 1304 * we can jam into a i386 PTE. 1305 */ 1306 1307 protection_codes[VM_PROT_NONE] = pg_nx; /* --- */ 1308 protection_codes[VM_PROT_EXECUTE] = PG_RO | PG_X; /* --x */ 1309 protection_codes[VM_PROT_READ] = PG_RO | pg_nx; /* -r- */ 1310 protection_codes[VM_PROT_READ|VM_PROT_EXECUTE] = PG_RO | PG_X;/* -rx */ 1311 protection_codes[VM_PROT_WRITE] = PG_RW | pg_nx; /* w-- */ 1312 protection_codes[VM_PROT_WRITE|VM_PROT_EXECUTE] = PG_RW | PG_X;/* w-x */ 1313 protection_codes[VM_PROT_WRITE|VM_PROT_READ] = PG_RW | pg_nx; 1314 /* wr- */ 1315 protection_codes[VM_PROT_ALL] = PG_RW | PG_X; /* wrx */ 1316 1317 /* 1318 * now we init the kernel's pmap 1319 * 1320 * the kernel pmap's pm_obj is not used for much. however, in 1321 * user pmaps the pm_obj contains the list of active PTPs. 1322 * the pm_obj currently does not have a pager. it might be possible 1323 * to add a pager that would allow a process to read-only mmap its 1324 * own page tables (fast user level vtophys?). this may or may not 1325 * be useful. 1326 */ 1327 1328 kpm = pmap_kernel(); 1329 for (i = 0; i < PTP_LEVELS - 1; i++) { 1330 UVM_OBJ_INIT(&kpm->pm_obj[i], NULL, 1); 1331 kpm->pm_ptphint[i] = NULL; 1332 } 1333 memset(&kpm->pm_list, 0, sizeof(kpm->pm_list)); /* pm_list not used */ 1334 1335 kpm->pm_pdir = (pd_entry_t *)(PDPpaddr + KERNBASE); 1336 for (i = 0; i < PDP_SIZE; i++) 1337 kpm->pm_pdirpa[i] = PDPpaddr + PAGE_SIZE * i; 1338 1339 kpm->pm_stats.wired_count = kpm->pm_stats.resident_count = 1340 x86_btop(kva_start - VM_MIN_KERNEL_ADDRESS); 1341 1342 /* 1343 * the above is just a rough estimate and not critical to the proper 1344 * operation of the system. 1345 */ 1346 1347 #ifndef XEN 1348 /* 1349 * Begin to enable global TLB entries if they are supported. 1350 * The G bit has no effect until the CR4_PGE bit is set in CR4, 1351 * which happens in cpu_init(), which is run on each cpu 1352 * (and happens later) 1353 */ 1354 1355 if (cpu_feature[0] & CPUID_PGE) { 1356 pmap_pg_g = PG_G; /* enable software */ 1357 1358 /* add PG_G attribute to already mapped kernel pages */ 1359 if (KERNBASE == VM_MIN_KERNEL_ADDRESS) { 1360 kva_end = virtual_avail; 1361 } else { 1362 extern vaddr_t eblob, esym; 1363 kva_end = (vaddr_t)&end; 1364 if (esym > kva_end) 1365 kva_end = esym; 1366 if (eblob > kva_end) 1367 kva_end = eblob; 1368 kva_end = roundup(kva_end, PAGE_SIZE); 1369 } 1370 for (kva = KERNBASE; kva < kva_end; kva += PAGE_SIZE) { 1371 p1i = pl1_i(kva); 1372 if (pmap_valid_entry(PTE_BASE[p1i])) 1373 PTE_BASE[p1i] |= PG_G; 1374 } 1375 } 1376 1377 /* 1378 * enable large pages if they are supported. 
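 * (here a "large page" is an NBPD_L2-sized mapping, i.e. 2MB with
 * 64-bit or PAE page tables and 4MB with the classic 2-level i386
 * page tables; the kernel text remapping below steps in those units.)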
1379 */ 1380 1381 if (cpu_feature[0] & CPUID_PSE) { 1382 paddr_t pa; 1383 pd_entry_t *pde; 1384 extern char __data_start; 1385 1386 lcr4(rcr4() | CR4_PSE); /* enable hardware (via %cr4) */ 1387 pmap_largepages = 1; /* enable software */ 1388 1389 /* 1390 * the TLB must be flushed after enabling large pages 1391 * on Pentium CPUs, according to section 3.6.2.2 of 1392 * "Intel Architecture Software Developer's Manual, 1393 * Volume 3: System Programming". 1394 */ 1395 tlbflush(); 1396 1397 /* 1398 * now, remap the kernel text using large pages. we 1399 * assume that the linker has properly aligned the 1400 * .data segment to a NBPD_L2 boundary. 1401 */ 1402 kva_end = rounddown((vaddr_t)&__data_start, NBPD_L1); 1403 for (pa = 0, kva = KERNBASE; kva + NBPD_L2 <= kva_end; 1404 kva += NBPD_L2, pa += NBPD_L2) { 1405 pde = &L2_BASE[pl2_i(kva)]; 1406 *pde = pa | pmap_pg_g | PG_PS | 1407 PG_KR | PG_V; /* zap! */ 1408 tlbflush(); 1409 } 1410 #if defined(DEBUG) 1411 aprint_normal("kernel text is mapped with %" PRIuPSIZE " large " 1412 "pages and %" PRIuPSIZE " normal pages\n", 1413 howmany(kva - KERNBASE, NBPD_L2), 1414 howmany((vaddr_t)&__data_start - kva, NBPD_L1)); 1415 #endif /* defined(DEBUG) */ 1416 } 1417 #endif /* !XEN */ 1418 1419 if (VM_MIN_KERNEL_ADDRESS != KERNBASE) { 1420 /* 1421 * zero_pte is stuck at the end of mapped space for the kernel 1422 * image (disjunct from kva space). This is done so that it 1423 * can safely be used in pmap_growkernel (pmap_get_physpage), 1424 * when it's called for the first time. 1425 * XXXfvdl fix this for MULTIPROCESSOR later. 1426 */ 1427 1428 early_zerop = (void *)(KERNBASE + NKL2_KIMG_ENTRIES * NBPD_L2); 1429 early_zero_pte = PTE_BASE + pl1_i((vaddr_t)early_zerop); 1430 } 1431 1432 /* 1433 * now we allocate the "special" VAs which are used for tmp mappings 1434 * by the pmap (and other modules). we allocate the VAs by advancing 1435 * virtual_avail (note that there are no pages mapped at these VAs). 1436 * we find the PTE that maps the allocated VA via the linear PTE 1437 * mapping. 1438 */ 1439 1440 pte = PTE_BASE + pl1_i(virtual_avail); 1441 1442 #ifdef MULTIPROCESSOR 1443 /* 1444 * Waste some VA space to avoid false sharing of cache lines 1445 * for page table pages: Give each possible CPU a cache line 1446 * of PTE's (8) to play with, though we only need 4. We could 1447 * recycle some of this waste by putting the idle stacks here 1448 * as well; we could waste less space if we knew the largest 1449 * CPU ID beforehand. 
1450 */ 1451 csrcp = (char *) virtual_avail; csrc_pte = pte; 1452 1453 cdstp = (char *) virtual_avail+PAGE_SIZE; cdst_pte = pte+1; 1454 1455 zerop = (char *) virtual_avail+PAGE_SIZE*2; zero_pte = pte+2; 1456 1457 ptpp = (char *) virtual_avail+PAGE_SIZE*3; ptp_pte = pte+3; 1458 1459 virtual_avail += PAGE_SIZE * maxcpus * NPTECL; 1460 pte += maxcpus * NPTECL; 1461 #else 1462 csrcp = (void *) virtual_avail; csrc_pte = pte; /* allocate */ 1463 virtual_avail += PAGE_SIZE; pte++; /* advance */ 1464 1465 cdstp = (void *) virtual_avail; cdst_pte = pte; 1466 virtual_avail += PAGE_SIZE; pte++; 1467 1468 zerop = (void *) virtual_avail; zero_pte = pte; 1469 virtual_avail += PAGE_SIZE; pte++; 1470 1471 ptpp = (void *) virtual_avail; ptp_pte = pte; 1472 virtual_avail += PAGE_SIZE; pte++; 1473 #endif 1474 1475 if (VM_MIN_KERNEL_ADDRESS == KERNBASE) { 1476 early_zerop = zerop; 1477 early_zero_pte = zero_pte; 1478 } 1479 1480 /* 1481 * Nothing after this point actually needs pte; 1482 */ 1483 pte = (void *)0xdeadbeef; 1484 1485 /* XXX: vmmap used by mem.c... should be uvm_map_reserve */ 1486 /* XXXfvdl PTEs not needed here */ 1487 vmmap = (char *)virtual_avail; /* don't need pte */ 1488 virtual_avail += PAGE_SIZE; pte++; 1489 1490 #ifdef XEN 1491 #ifdef __x86_64__ 1492 /* 1493 * We want a dummy page directory for Xen: 1494 * when deactivate a pmap, Xen will still consider it active. 1495 * So we set user PGD to this one to lift all protection on 1496 * the now inactive page tables set. 1497 */ 1498 xen_dummy_user_pgd = avail_start; 1499 avail_start += PAGE_SIZE; 1500 1501 /* Zero fill it, the less checks in Xen it requires the better */ 1502 memset((void *) (xen_dummy_user_pgd + KERNBASE), 0, PAGE_SIZE); 1503 /* Mark read-only */ 1504 HYPERVISOR_update_va_mapping(xen_dummy_user_pgd + KERNBASE, 1505 pmap_pa2pte(xen_dummy_user_pgd) | PG_u | PG_V, UVMF_INVLPG); 1506 /* Pin as L4 */ 1507 xpq_queue_pin_table(xpmap_ptom_masked(xen_dummy_user_pgd)); 1508 #endif /* __x86_64__ */ 1509 idt_vaddr = virtual_avail; /* don't need pte */ 1510 idt_paddr = avail_start; /* steal a page */ 1511 /* 1512 * Xen require one more page as we can't store 1513 * GDT and LDT on the same page 1514 */ 1515 virtual_avail += 3 * PAGE_SIZE; 1516 avail_start += 3 * PAGE_SIZE; 1517 #else /* XEN */ 1518 idt_vaddr = virtual_avail; /* don't need pte */ 1519 idt_paddr = avail_start; /* steal a page */ 1520 #if defined(__x86_64__) 1521 virtual_avail += 2 * PAGE_SIZE; pte += 2; 1522 avail_start += 2 * PAGE_SIZE; 1523 #else /* defined(__x86_64__) */ 1524 virtual_avail += PAGE_SIZE; pte++; 1525 avail_start += PAGE_SIZE; 1526 /* pentium f00f bug stuff */ 1527 pentium_idt_vaddr = virtual_avail; /* don't need pte */ 1528 virtual_avail += PAGE_SIZE; pte++; 1529 #endif /* defined(__x86_64__) */ 1530 #endif /* XEN */ 1531 1532 #ifdef _LP64 1533 /* 1534 * Grab a page below 4G for things that need it (i.e. 1535 * having an initial %cr3 for the MP trampoline). 1536 */ 1537 lo32_vaddr = virtual_avail; 1538 virtual_avail += PAGE_SIZE; pte++; 1539 lo32_paddr = avail_start; 1540 avail_start += PAGE_SIZE; 1541 #endif 1542 1543 /* 1544 * now we reserve some VM for mapping pages when doing a crash dump 1545 */ 1546 1547 virtual_avail = reserve_dumppages(virtual_avail); 1548 1549 /* 1550 * init the static-global locks and global lists. 1551 * 1552 * => pventry::pvh_lock (initialized elsewhere) must also be 1553 * a spin lock, again at IPL_VM to prevent deadlock, and 1554 * again is never taken from interrupt context. 
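 *
 * the pv hash locks set up later in pmap_init() are taken with the
 * usual spin-lock pattern; a sketch mirroring insert_pv() and
 * pmap_remove_pv() below:
 *
 *	hash = pvhash_hash(ptp, va);
 *	lock = pvhash_lock(hash);
 *	mutex_spin_enter(lock);
 *	... add/remove the pv_entry on pvhash_head(hash)'s hh_list ...
 *	mutex_spin_exit(lock);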
1555 */ 1556 1557 mutex_init(&pmaps_lock, MUTEX_DEFAULT, IPL_NONE); 1558 LIST_INIT(&pmaps); 1559 pmap_cpu_init_early(curcpu()); 1560 1561 /* 1562 * initialize caches. 1563 */ 1564 1565 pool_cache_bootstrap(&pmap_cache, sizeof(struct pmap), 0, 0, 0, 1566 "pmappl", NULL, IPL_NONE, NULL, NULL, NULL); 1567 #ifdef PAE 1568 pool_cache_bootstrap(&pmap_pdp_cache, PAGE_SIZE * PDP_SIZE, 0, 0, 0, 1569 "pdppl", &pmap_pdp_allocator, IPL_NONE, 1570 pmap_pdp_ctor, pmap_pdp_dtor, NULL); 1571 #else /* PAE */ 1572 pool_cache_bootstrap(&pmap_pdp_cache, PAGE_SIZE, 0, 0, 0, 1573 "pdppl", NULL, IPL_NONE, pmap_pdp_ctor, pmap_pdp_dtor, NULL); 1574 #endif /* PAE */ 1575 pool_cache_bootstrap(&pmap_pv_cache, sizeof(struct pv_entry), 0, 0, 1576 PR_LARGECACHE, "pvpl", &pool_allocator_meta, IPL_NONE, NULL, 1577 NULL, NULL); 1578 1579 /* 1580 * ensure the TLB is sync'd with reality by flushing it... 1581 */ 1582 1583 tlbflush(); 1584 1585 /* 1586 * calculate pmap_maxkvaddr from nkptp[]. 1587 */ 1588 1589 kva = VM_MIN_KERNEL_ADDRESS; 1590 for (i = PTP_LEVELS - 1; i >= 1; i--) { 1591 kva += nkptp[i] * nbpd[i]; 1592 } 1593 pmap_maxkvaddr = kva; 1594 } 1595 1596 #if defined(__x86_64__) 1597 /* 1598 * Pre-allocate PTPs for low memory, so that 1:1 mappings for various 1599 * trampoline code can be entered. 1600 */ 1601 void 1602 pmap_prealloc_lowmem_ptps(void) 1603 { 1604 #ifdef XEN 1605 int level; 1606 paddr_t newp; 1607 paddr_t pdes_pa; 1608 1609 pdes_pa = pmap_pdirpa(pmap_kernel(), 0); 1610 level = PTP_LEVELS; 1611 for (;;) { 1612 newp = avail_start; 1613 avail_start += PAGE_SIZE; 1614 HYPERVISOR_update_va_mapping ((vaddr_t)early_zerop, 1615 xpmap_ptom_masked(newp) | PG_u | PG_V | PG_RW, UVMF_INVLPG); 1616 memset((void *)early_zerop, 0, PAGE_SIZE); 1617 /* Mark R/O before installing */ 1618 HYPERVISOR_update_va_mapping ((vaddr_t)early_zerop, 1619 xpmap_ptom_masked(newp) | PG_u | PG_V, UVMF_INVLPG); 1620 if (newp < (NKL2_KIMG_ENTRIES * NBPD_L2)) 1621 HYPERVISOR_update_va_mapping (newp + KERNBASE, 1622 xpmap_ptom_masked(newp) | PG_u | PG_V, UVMF_INVLPG); 1623 xpq_queue_pte_update ( 1624 xpmap_ptom_masked(pdes_pa) 1625 + (pl_i(0, level) * sizeof (pd_entry_t)), 1626 xpmap_ptom_masked(newp) | PG_RW | PG_u | PG_V); 1627 level--; 1628 if (level <= 1) 1629 break; 1630 pdes_pa = newp; 1631 } 1632 #else /* XEN */ 1633 pd_entry_t *pdes; 1634 int level; 1635 paddr_t newp; 1636 1637 pdes = pmap_kernel()->pm_pdir; 1638 level = PTP_LEVELS; 1639 for (;;) { 1640 newp = avail_start; 1641 avail_start += PAGE_SIZE; 1642 *early_zero_pte = (newp & PG_FRAME) | PG_V | PG_RW; 1643 pmap_update_pg((vaddr_t)early_zerop); 1644 memset(early_zerop, 0, PAGE_SIZE); 1645 pdes[pl_i(0, level)] = (newp & PG_FRAME) | PG_V | PG_RW; 1646 level--; 1647 if (level <= 1) 1648 break; 1649 pdes = normal_pdes[level - 2]; 1650 } 1651 #endif /* XEN */ 1652 } 1653 #endif /* defined(__x86_64__) */ 1654 1655 /* 1656 * pmap_init: called from uvm_init, our job is to get the pmap 1657 * system ready to manage mappings... 1658 */ 1659 1660 void 1661 pmap_init(void) 1662 { 1663 int i; 1664 1665 for (i = 0; i < PV_HASH_SIZE; i++) { 1666 SLIST_INIT(&pv_hash_heads[i].hh_list); 1667 } 1668 for (i = 0; i < PV_HASH_LOCK_CNT; i++) { 1669 mutex_init(&pv_hash_locks[i].lock, MUTEX_NODEBUG, IPL_VM); 1670 } 1671 1672 /* 1673 * done: pmap module is up (and ready for business) 1674 */ 1675 1676 pmap_initialized = true; 1677 } 1678 1679 /* 1680 * pmap_cpu_init_early: perform early per-CPU initialization. 
1681 */ 1682 1683 void 1684 pmap_cpu_init_early(struct cpu_info *ci) 1685 { 1686 struct pmap_cpu *pc; 1687 static uint8_t pmap_cpu_alloc; 1688 1689 pc = &pmap_cpu[pmap_cpu_alloc++].pc; 1690 ci->ci_pmap_cpu = pc; 1691 } 1692 1693 /* 1694 * pmap_cpu_init_late: perform late per-CPU initialization. 1695 */ 1696 1697 void 1698 pmap_cpu_init_late(struct cpu_info *ci) 1699 { 1700 1701 if (ci == &cpu_info_primary) { 1702 evcnt_attach_dynamic(&pmap_tlb_evcnt, EVCNT_TYPE_INTR, 1703 NULL, "global", "TLB IPI"); 1704 evcnt_attach_dynamic(&pmap_iobmp_evcnt, EVCNT_TYPE_MISC, 1705 NULL, "x86", "io bitmap copy"); 1706 evcnt_attach_dynamic(&pmap_ldt_evcnt, EVCNT_TYPE_MISC, 1707 NULL, "x86", "ldt sync"); 1708 } 1709 1710 evcnt_attach_dynamic(&ci->ci_tlb_evcnt, EVCNT_TYPE_MISC, 1711 NULL, device_xname(ci->ci_dev), "TLB IPI"); 1712 1713 #ifdef PAE 1714 int ret; 1715 struct pglist pg; 1716 struct vm_page *vmap; 1717 1718 /* The BP has already its own L3 page allocated in locore.S. */ 1719 if (ci == &cpu_info_primary) 1720 return; 1721 1722 /* 1723 * Allocate a page for the per-CPU L3 PD. cr3 being 32 bits, PA musts 1724 * resides below the 4GB boundary. 1725 */ 1726 ret = uvm_pglistalloc(PAGE_SIZE, 0, 0x100000000ULL, 32, 0, &pg, 1, 0); 1727 vmap = TAILQ_FIRST(&pg); 1728 1729 if (ret != 0 || vmap == NULL) 1730 panic("%s: failed to allocate L3 pglist for CPU %d (ret %d)\n", 1731 __func__, cpu_index(ci), ret); 1732 1733 ci->ci_pae_l3_pdirpa = vmap->phys_addr; 1734 1735 ci->ci_pae_l3_pdir = (paddr_t *)uvm_km_alloc(kernel_map, PAGE_SIZE, 0, 1736 UVM_KMF_VAONLY | UVM_KMF_NOWAIT); 1737 if (ci->ci_pae_l3_pdir == NULL) 1738 panic("%s: failed to allocate L3 PD for CPU %d\n", 1739 __func__, cpu_index(ci)); 1740 1741 pmap_kenter_pa((vaddr_t)ci->ci_pae_l3_pdir, ci->ci_pae_l3_pdirpa, 1742 VM_PROT_READ | VM_PROT_WRITE, 0); 1743 1744 pmap_update(pmap_kernel()); 1745 #endif 1746 } 1747 1748 /* 1749 * p v _ e n t r y f u n c t i o n s 1750 */ 1751 1752 /* 1753 * pmap_free_pvs: free a list of pv_entrys 1754 */ 1755 1756 static void 1757 pmap_free_pvs(struct pv_entry *pve) 1758 { 1759 struct pv_entry *next; 1760 1761 for ( /* null */ ; pve != NULL ; pve = next) { 1762 next = pve->pve_next; 1763 pool_cache_put(&pmap_pv_cache, pve); 1764 } 1765 } 1766 1767 /* 1768 * main pv_entry manipulation functions: 1769 * pmap_enter_pv: enter a mapping onto a pv_head list 1770 * pmap_remove_pv: remove a mapping from a pv_head list 1771 * 1772 * NOTE: Both pmap_enter_pv and pmap_remove_pv expect the caller to lock 1773 * the pvh before calling 1774 */ 1775 1776 /* 1777 * insert_pv: a helper of pmap_enter_pv 1778 */ 1779 1780 static void 1781 insert_pv(struct pmap_page *pp, struct pv_entry *pve) 1782 { 1783 struct pv_hash_head *hh; 1784 kmutex_t *lock; 1785 u_int hash; 1786 1787 KASSERT(pp_locked(pp)); 1788 1789 hash = pvhash_hash(pve->pve_pte.pte_ptp, pve->pve_pte.pte_va); 1790 lock = pvhash_lock(hash); 1791 hh = pvhash_head(hash); 1792 mutex_spin_enter(lock); 1793 SLIST_INSERT_HEAD(&hh->hh_list, pve, pve_hash); 1794 mutex_spin_exit(lock); 1795 1796 LIST_INSERT_HEAD(&pp->pp_head.pvh_list, pve, pve_list); 1797 } 1798 1799 /* 1800 * pmap_enter_pv: enter a mapping onto a pv_head lst 1801 * 1802 * => caller should have the pp_lock locked 1803 * => caller should adjust ptp's wire_count before calling 1804 */ 1805 1806 static struct pv_entry * 1807 pmap_enter_pv(struct pmap_page *pp, 1808 struct pv_entry *pve, /* preallocated pve for us to use */ 1809 struct pv_entry **sparepve, 1810 struct vm_page *ptp, 1811 vaddr_t va) 1812 { 1813 1814 KASSERT(ptp 
== NULL || ptp->wire_count >= 2); 1815 KASSERT(ptp == NULL || ptp->uobject != NULL); 1816 KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset); 1817 KASSERT(pp_locked(pp)); 1818 1819 if ((pp->pp_flags & PP_EMBEDDED) == 0) { 1820 if (LIST_EMPTY(&pp->pp_head.pvh_list)) { 1821 pp->pp_flags |= PP_EMBEDDED; 1822 pp->pp_pte.pte_ptp = ptp; 1823 pp->pp_pte.pte_va = va; 1824 1825 return pve; 1826 } 1827 } else { 1828 struct pv_entry *pve2; 1829 1830 pve2 = *sparepve; 1831 *sparepve = NULL; 1832 1833 pve2->pve_pte = pp->pp_pte; 1834 pp->pp_flags &= ~PP_EMBEDDED; 1835 LIST_INIT(&pp->pp_head.pvh_list); 1836 insert_pv(pp, pve2); 1837 } 1838 1839 pve->pve_pte.pte_ptp = ptp; 1840 pve->pve_pte.pte_va = va; 1841 insert_pv(pp, pve); 1842 1843 return NULL; 1844 } 1845 1846 /* 1847 * pmap_remove_pv: try to remove a mapping from a pv_list 1848 * 1849 * => caller should hold pp_lock [so that attrs can be adjusted] 1850 * => caller should adjust ptp's wire_count and free PTP if needed 1851 * => we return the removed pve 1852 */ 1853 1854 static struct pv_entry * 1855 pmap_remove_pv(struct pmap_page *pp, struct vm_page *ptp, vaddr_t va) 1856 { 1857 struct pv_hash_head *hh; 1858 struct pv_entry *pve; 1859 kmutex_t *lock; 1860 u_int hash; 1861 1862 KASSERT(ptp == NULL || ptp->uobject != NULL); 1863 KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset); 1864 KASSERT(pp_locked(pp)); 1865 1866 if ((pp->pp_flags & PP_EMBEDDED) != 0) { 1867 KASSERT(pp->pp_pte.pte_ptp == ptp); 1868 KASSERT(pp->pp_pte.pte_va == va); 1869 1870 pp->pp_flags &= ~PP_EMBEDDED; 1871 LIST_INIT(&pp->pp_head.pvh_list); 1872 1873 return NULL; 1874 } 1875 1876 hash = pvhash_hash(ptp, va); 1877 lock = pvhash_lock(hash); 1878 hh = pvhash_head(hash); 1879 mutex_spin_enter(lock); 1880 pve = pvhash_remove(hh, ptp, va); 1881 mutex_spin_exit(lock); 1882 1883 LIST_REMOVE(pve, pve_list); 1884 1885 return pve; 1886 } 1887 1888 /* 1889 * p t p f u n c t i o n s 1890 */ 1891 1892 static inline struct vm_page * 1893 pmap_find_ptp(struct pmap *pmap, vaddr_t va, paddr_t pa, int level) 1894 { 1895 int lidx = level - 1; 1896 struct vm_page *pg; 1897 1898 KASSERT(mutex_owned(&pmap->pm_lock)); 1899 1900 if (pa != (paddr_t)-1 && pmap->pm_ptphint[lidx] && 1901 pa == VM_PAGE_TO_PHYS(pmap->pm_ptphint[lidx])) { 1902 return (pmap->pm_ptphint[lidx]); 1903 } 1904 PMAP_SUBOBJ_LOCK(pmap, lidx); 1905 pg = uvm_pagelookup(&pmap->pm_obj[lidx], ptp_va2o(va, level)); 1906 PMAP_SUBOBJ_UNLOCK(pmap, lidx); 1907 1908 KASSERT(pg == NULL || pg->wire_count >= 1); 1909 return pg; 1910 } 1911 1912 static inline void 1913 pmap_freepage(struct pmap *pmap, struct vm_page *ptp, int level) 1914 { 1915 int lidx; 1916 struct uvm_object *obj; 1917 1918 KASSERT(ptp->wire_count == 1); 1919 1920 lidx = level - 1; 1921 1922 obj = &pmap->pm_obj[lidx]; 1923 pmap_stats_update(pmap, -1, 0); 1924 if (lidx != 0) 1925 mutex_enter(&obj->vmobjlock); 1926 if (pmap->pm_ptphint[lidx] == ptp) 1927 pmap->pm_ptphint[lidx] = TAILQ_FIRST(&obj->memq); 1928 ptp->wire_count = 0; 1929 uvm_pagerealloc(ptp, NULL, 0); 1930 VM_PAGE_TO_PP(ptp)->pp_link = curlwp->l_md.md_gc_ptp; 1931 curlwp->l_md.md_gc_ptp = ptp; 1932 if (lidx != 0) 1933 mutex_exit(&obj->vmobjlock); 1934 } 1935 1936 static void 1937 pmap_free_ptp(struct pmap *pmap, struct vm_page *ptp, vaddr_t va, 1938 pt_entry_t *ptes, pd_entry_t * const *pdes) 1939 { 1940 unsigned long index; 1941 int level; 1942 vaddr_t invaladdr; 1943 #ifdef MULTIPROCESSOR 1944 vaddr_t invaladdr2; 1945 #endif 1946 pd_entry_t opde; 1947 struct pmap *curpmap = 
vm_map_pmap(&curlwp->l_proc->p_vmspace->vm_map); 1948 1949 KASSERT(pmap != pmap_kernel()); 1950 KASSERT(mutex_owned(&pmap->pm_lock)); 1951 KASSERT(kpreempt_disabled()); 1952 1953 level = 1; 1954 do { 1955 index = pl_i(va, level + 1); 1956 opde = pmap_pte_testset(&pdes[level - 1][index], 0); 1957 #if defined(XEN) && defined(__x86_64__) 1958 /* 1959 * If ptp is a L3 currently mapped in kernel space, 1960 * clear it before freeing 1961 */ 1962 if (pmap_pdirpa(pmap, 0) == curcpu()->ci_xen_current_user_pgd 1963 && level == PTP_LEVELS - 1) 1964 pmap_pte_set(&pmap_kernel()->pm_pdir[index], 0); 1965 #endif /* XEN && __x86_64__ */ 1966 pmap_freepage(pmap, ptp, level); 1967 invaladdr = level == 1 ? (vaddr_t)ptes : 1968 (vaddr_t)pdes[level - 2]; 1969 pmap_tlb_shootdown(curpmap, invaladdr + index * PAGE_SIZE, 1970 0, opde); 1971 #if defined(MULTIPROCESSOR) 1972 invaladdr2 = level == 1 ? (vaddr_t)PTE_BASE : 1973 (vaddr_t)normal_pdes[level - 2]; 1974 if (pmap != curpmap || invaladdr != invaladdr2) { 1975 pmap_tlb_shootdown(pmap, invaladdr2 + index * PAGE_SIZE, 1976 0, opde); 1977 } 1978 #endif 1979 if (level < PTP_LEVELS - 1) { 1980 ptp = pmap_find_ptp(pmap, va, (paddr_t)-1, level + 1); 1981 ptp->wire_count--; 1982 if (ptp->wire_count > 1) 1983 break; 1984 } 1985 } while (++level < PTP_LEVELS); 1986 pmap_pte_flush(); 1987 } 1988 1989 /* 1990 * pmap_get_ptp: get a PTP (if there isn't one, allocate a new one) 1991 * 1992 * => pmap should NOT be pmap_kernel() 1993 * => pmap should be locked 1994 * => preemption should be disabled 1995 */ 1996 1997 static struct vm_page * 1998 pmap_get_ptp(struct pmap *pmap, vaddr_t va, pd_entry_t * const *pdes) 1999 { 2000 struct vm_page *ptp, *pptp; 2001 int i; 2002 unsigned long index; 2003 pd_entry_t *pva; 2004 paddr_t ppa, pa; 2005 struct uvm_object *obj; 2006 2007 KASSERT(pmap != pmap_kernel()); 2008 KASSERT(mutex_owned(&pmap->pm_lock)); 2009 KASSERT(kpreempt_disabled()); 2010 2011 ptp = NULL; 2012 pa = (paddr_t)-1; 2013 2014 /* 2015 * Loop through all page table levels seeing if we need to 2016 * add a new page to that level. 2017 */ 2018 for (i = PTP_LEVELS; i > 1; i--) { 2019 /* 2020 * Save values from previous round. 2021 */ 2022 pptp = ptp; 2023 ppa = pa; 2024 2025 index = pl_i(va, i); 2026 pva = pdes[i - 2]; 2027 2028 if (pmap_valid_entry(pva[index])) { 2029 ppa = pmap_pte2pa(pva[index]); 2030 ptp = NULL; 2031 continue; 2032 } 2033 2034 obj = &pmap->pm_obj[i-2]; 2035 PMAP_SUBOBJ_LOCK(pmap, i - 2); 2036 ptp = uvm_pagealloc(obj, ptp_va2o(va, i - 1), NULL, 2037 UVM_PGA_USERESERVE|UVM_PGA_ZERO); 2038 PMAP_SUBOBJ_UNLOCK(pmap, i - 2); 2039 2040 if (ptp == NULL) 2041 return NULL; 2042 2043 ptp->flags &= ~PG_BUSY; /* never busy */ 2044 ptp->wire_count = 1; 2045 pmap->pm_ptphint[i - 2] = ptp; 2046 pa = VM_PAGE_TO_PHYS(ptp); 2047 pmap_pte_set(&pva[index], (pd_entry_t) 2048 (pmap_pa2pte(pa) | PG_u | PG_RW | PG_V)); 2049 #if defined(XEN) && defined(__x86_64__) 2050 /* 2051 * In Xen we must enter the mapping in kernel map too 2052 * if pmap is curmap and modifying top level (PGD) 2053 */ 2054 if(i == PTP_LEVELS && pmap != pmap_kernel()) { 2055 pmap_pte_set(&pmap_kernel()->pm_pdir[index], 2056 (pd_entry_t) (pmap_pa2pte(pa) 2057 | PG_u | PG_RW | PG_V)); 2058 } 2059 #endif /* XEN && __x86_64__ */ 2060 pmap_pte_flush(); 2061 pmap_stats_update(pmap, 1, 0); 2062 /* 2063 * If we're not in the top level, increase the 2064 * wire count of the parent page. 
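 * (a PTP's wire_count effectively counts its valid entries plus one
 * for the PTP itself, so the parent can only be freed again once the
 * count drops back to 1.)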
2065 */ 2066 if (i < PTP_LEVELS) { 2067 if (pptp == NULL) 2068 pptp = pmap_find_ptp(pmap, va, ppa, i); 2069 #ifdef DIAGNOSTIC 2070 if (pptp == NULL) 2071 panic("pde page disappeared"); 2072 #endif 2073 pptp->wire_count++; 2074 } 2075 } 2076 2077 /* 2078 * ptp is not NULL if we just allocated a new ptp. If it's 2079 * still NULL, we must look up the existing one. 2080 */ 2081 if (ptp == NULL) { 2082 ptp = pmap_find_ptp(pmap, va, ppa, 1); 2083 #ifdef DIAGNOSTIC 2084 if (ptp == NULL) { 2085 printf("va %" PRIxVADDR " ppa %" PRIxPADDR "\n", 2086 va, ppa); 2087 panic("pmap_get_ptp: unmanaged user PTP"); 2088 } 2089 #endif 2090 } 2091 2092 pmap->pm_ptphint[0] = ptp; 2093 return(ptp); 2094 } 2095 2096 /* 2097 * p m a p l i f e c y c l e f u n c t i o n s 2098 */ 2099 2100 /* 2101 * pmap_pdp_ctor: constructor for the PDP cache. 2102 */ 2103 2104 int 2105 pmap_pdp_ctor(void *arg, void *v, int flags) 2106 { 2107 pd_entry_t *pdir = v; 2108 paddr_t pdirpa = 0; /* XXX: GCC */ 2109 vaddr_t object; 2110 int i; 2111 2112 #if !defined(XEN) || !defined(__x86_64__) 2113 int npde; 2114 #endif 2115 #ifdef XEN 2116 int s; 2117 #endif 2118 2119 /* 2120 * NOTE: The `pmap_lock' is held when the PDP is allocated. 2121 */ 2122 2123 #if defined(XEN) && defined(__x86_64__) 2124 /* fetch the physical address of the page directory. */ 2125 (void) pmap_extract(pmap_kernel(), (vaddr_t) pdir, &pdirpa); 2126 2127 /* zero init area */ 2128 memset (pdir, 0, PAGE_SIZE); /* Xen wants a clean page */ 2129 /* 2130 * this pdir will NEVER be active in kernel mode 2131 * so mark recursive entry invalid 2132 */ 2133 pdir[PDIR_SLOT_PTE] = pmap_pa2pte(pdirpa) | PG_u; 2134 /* 2135 * PDP constructed this way won't be for kernel, 2136 * hence we don't put kernel mappings on Xen. 2137 * But we need to make pmap_create() happy, so put a dummy (without 2138 * PG_V) value at the right place. 2139 */ 2140 pdir[PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1] - 1] = 2141 (pd_entry_t)-1 & PG_FRAME; 2142 #else /* XEN && __x86_64__*/ 2143 /* zero init area */ 2144 memset(pdir, 0, PDIR_SLOT_PTE * sizeof(pd_entry_t)); 2145 2146 object = (vaddr_t)v; 2147 for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) { 2148 /* fetch the physical address of the page directory. 
*/ 2149 (void) pmap_extract(pmap_kernel(), object, &pdirpa); 2150 /* put in recursive PDE to map the PTEs */ 2151 pdir[PDIR_SLOT_PTE + i] = pmap_pa2pte(pdirpa) | PG_V; 2152 #ifndef XEN 2153 pdir[PDIR_SLOT_PTE + i] |= PG_KW; 2154 #endif 2155 } 2156 2157 /* copy kernel's PDE */ 2158 npde = nkptp[PTP_LEVELS - 1]; 2159 2160 memcpy(&pdir[PDIR_SLOT_KERN], &PDP_BASE[PDIR_SLOT_KERN], 2161 npde * sizeof(pd_entry_t)); 2162 2163 /* zero the rest */ 2164 memset(&pdir[PDIR_SLOT_KERN + npde], 0, 2165 (NTOPLEVEL_PDES - (PDIR_SLOT_KERN + npde)) * sizeof(pd_entry_t)); 2166 2167 if (VM_MIN_KERNEL_ADDRESS != KERNBASE) { 2168 int idx = pl_i(KERNBASE, PTP_LEVELS); 2169 2170 pdir[idx] = PDP_BASE[idx]; 2171 } 2172 #endif /* XEN && __x86_64__*/ 2173 #ifdef XEN 2174 s = splvm(); 2175 object = (vaddr_t)v; 2176 for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) { 2177 (void) pmap_extract(pmap_kernel(), object, &pdirpa); 2178 /* remap this page RO */ 2179 pmap_kenter_pa(object, pdirpa, VM_PROT_READ, 0); 2180 pmap_update(pmap_kernel()); 2181 /* 2182 * pin as L2/L4 page, we have to do the page with the 2183 * PDIR_SLOT_PTE entries last 2184 */ 2185 #ifdef PAE 2186 if (i == l2tol3(PDIR_SLOT_PTE)) 2187 continue; 2188 #endif 2189 xpq_queue_pin_table(xpmap_ptom_masked(pdirpa)); 2190 } 2191 #ifdef PAE 2192 object = ((vaddr_t)pdir) + PAGE_SIZE * l2tol3(PDIR_SLOT_PTE); 2193 (void)pmap_extract(pmap_kernel(), object, &pdirpa); 2194 xpq_queue_pin_table(xpmap_ptom_masked(pdirpa)); 2195 #endif 2196 splx(s); 2197 #endif /* XEN */ 2198 2199 return (0); 2200 } 2201 2202 /* 2203 * pmap_pdp_dtor: destructor for the PDP cache. 2204 */ 2205 2206 void 2207 pmap_pdp_dtor(void *arg, void *v) 2208 { 2209 #ifdef XEN 2210 paddr_t pdirpa = 0; /* XXX: GCC */ 2211 vaddr_t object = (vaddr_t)v; 2212 int i; 2213 int s = splvm(); 2214 pt_entry_t *pte; 2215 2216 for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) { 2217 /* fetch the physical address of the page directory. */ 2218 (void) pmap_extract(pmap_kernel(), object, &pdirpa); 2219 /* unpin page table */ 2220 xpq_queue_unpin_table(xpmap_ptom_masked(pdirpa)); 2221 } 2222 object = (vaddr_t)v; 2223 for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) { 2224 /* Set page RW again */ 2225 pte = kvtopte(object); 2226 xpq_queue_pte_update(xpmap_ptetomach(pte), *pte | PG_RW); 2227 xpq_queue_invlpg((vaddr_t)object); 2228 } 2229 splx(s); 2230 #endif /* XEN */ 2231 } 2232 2233 #ifdef PAE 2234 2235 /* pmap_pdp_alloc: Allocate a page for the pdp memory pool. */ 2236 2237 void * 2238 pmap_pdp_alloc(struct pool *pp, int flags) 2239 { 2240 return (void *)uvm_km_alloc(kernel_map, 2241 PAGE_SIZE * PDP_SIZE, PAGE_SIZE * PDP_SIZE, 2242 ((flags & PR_WAITOK) ? 0 : UVM_KMF_NOWAIT | UVM_KMF_TRYLOCK) 2243 | UVM_KMF_WIRED); 2244 } 2245 2246 /* 2247 * pmap_pdp_free: free a PDP 2248 */ 2249 2250 void 2251 pmap_pdp_free(struct pool *pp, void *v) 2252 { 2253 uvm_km_free(kernel_map, (vaddr_t)v, PAGE_SIZE * PDP_SIZE, 2254 UVM_KMF_WIRED); 2255 } 2256 #endif /* PAE */ 2257 2258 /* 2259 * pmap_create: create a pmap 2260 * 2261 * => note: old pmap interface took a "size" args which allowed for 2262 * the creation of "software only" pmaps (not in bsd). 
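 *
 * => the PDP is taken from pmap_pdp_cache below; if a cached PDP
 *    predates a pmap_growkernel() (its last kernel PDE slot is still
 *    zero), it is destructed and another one is fetched (try_again).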
2263 */ 2264 2265 struct pmap * 2266 pmap_create(void) 2267 { 2268 struct pmap *pmap; 2269 int i; 2270 2271 pmap = pool_cache_get(&pmap_cache, PR_WAITOK); 2272 2273 /* init uvm_object */ 2274 for (i = 0; i < PTP_LEVELS - 1; i++) { 2275 UVM_OBJ_INIT(&pmap->pm_obj[i], NULL, 1); 2276 pmap->pm_ptphint[i] = NULL; 2277 } 2278 pmap->pm_stats.wired_count = 0; 2279 /* count the PDP allocd below */ 2280 pmap->pm_stats.resident_count = PDP_SIZE; 2281 #if !defined(__x86_64__) 2282 pmap->pm_hiexec = 0; 2283 #endif /* !defined(__x86_64__) */ 2284 pmap->pm_flags = 0; 2285 pmap->pm_cpus = 0; 2286 pmap->pm_kernel_cpus = 0; 2287 2288 /* init the LDT */ 2289 pmap->pm_ldt = NULL; 2290 pmap->pm_ldt_len = 0; 2291 pmap->pm_ldt_sel = GSYSSEL(GLDT_SEL, SEL_KPL); 2292 2293 /* allocate PDP */ 2294 try_again: 2295 pmap->pm_pdir = pool_cache_get(&pmap_pdp_cache, PR_WAITOK); 2296 2297 mutex_enter(&pmaps_lock); 2298 2299 if (pmap->pm_pdir[PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1] - 1] == 0) { 2300 mutex_exit(&pmaps_lock); 2301 pool_cache_destruct_object(&pmap_pdp_cache, pmap->pm_pdir); 2302 goto try_again; 2303 } 2304 2305 for (i = 0; i < PDP_SIZE; i++) 2306 pmap->pm_pdirpa[i] = 2307 pmap_pte2pa(pmap->pm_pdir[PDIR_SLOT_PTE + i]); 2308 2309 LIST_INSERT_HEAD(&pmaps, pmap, pm_list); 2310 2311 mutex_exit(&pmaps_lock); 2312 2313 return (pmap); 2314 } 2315 2316 /* 2317 * pmap_destroy: drop reference count on pmap. free pmap if 2318 * reference count goes to zero. 2319 */ 2320 2321 void 2322 pmap_destroy(struct pmap *pmap) 2323 { 2324 int i; 2325 #ifdef DIAGNOSTIC 2326 struct cpu_info *ci; 2327 CPU_INFO_ITERATOR cii; 2328 #endif /* DIAGNOSTIC */ 2329 2330 /* 2331 * if we have torn down this pmap, process deferred frees and 2332 * invalidations now. 2333 */ 2334 if (__predict_false(curlwp->l_md.md_gc_pmap == pmap)) { 2335 pmap_update(pmap); 2336 } 2337 2338 /* 2339 * drop reference count 2340 */ 2341 2342 if (atomic_dec_uint_nv(&pmap->pm_obj[0].uo_refs) > 0) { 2343 return; 2344 } 2345 2346 #ifdef DIAGNOSTIC 2347 for (CPU_INFO_FOREACH(cii, ci)) 2348 if (ci->ci_pmap == pmap) 2349 panic("destroying pmap being used"); 2350 #endif /* DIAGNOSTIC */ 2351 2352 /* 2353 * reference count is zero, free pmap resources and then free pmap. 2354 */ 2355 #ifdef XEN 2356 /* 2357 * Xen lazy APDP handling: 2358 * clear APDP_PDE if pmap is the currently mapped 2359 */ 2360 if (xpmap_ptom_masked(pmap_pdirpa(pmap, 0)) == (*APDP_PDE & PG_FRAME)) { 2361 kpreempt_disable(); 2362 pmap_unmap_apdp(); 2363 pmap_pte_flush(); 2364 pmap_apte_flush(pmap_kernel()); 2365 kpreempt_enable(); 2366 } 2367 #endif 2368 2369 /* 2370 * remove it from global list of pmaps 2371 */ 2372 2373 mutex_enter(&pmaps_lock); 2374 LIST_REMOVE(pmap, pm_list); 2375 mutex_exit(&pmaps_lock); 2376 2377 /* 2378 * destroyed pmap shouldn't have remaining PTPs 2379 */ 2380 2381 for (i = 0; i < PTP_LEVELS - 1; i++) { 2382 KASSERT(pmap->pm_obj[i].uo_npages == 0); 2383 KASSERT(TAILQ_EMPTY(&pmap->pm_obj[i].memq)); 2384 } 2385 2386 /* 2387 * MULTIPROCESSOR -- no need to flush out of other processors' 2388 * APTE space because we do that in pmap_unmap_ptes(). 2389 */ 2390 pool_cache_put(&pmap_pdp_cache, pmap->pm_pdir); 2391 2392 #ifdef USER_LDT 2393 if (pmap->pm_ldt != NULL) { 2394 /* 2395 * no need to switch the LDT; this address space is gone, 2396 * nothing is using it. 2397 * 2398 * No need to lock the pmap for ldt_free (or anything else), 2399 * we're the last one to use it. 
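 *
 * ldt_free() itself is still called with cpu_lock held, below.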
2400 */ 2401 mutex_enter(&cpu_lock); 2402 ldt_free(pmap->pm_ldt_sel); 2403 mutex_exit(&cpu_lock); 2404 uvm_km_free(kernel_map, (vaddr_t)pmap->pm_ldt, 2405 pmap->pm_ldt_len, UVM_KMF_WIRED); 2406 } 2407 #endif 2408 2409 for (i = 0; i < PTP_LEVELS - 1; i++) 2410 mutex_destroy(&pmap->pm_obj[i].vmobjlock); 2411 pool_cache_put(&pmap_cache, pmap); 2412 } 2413 2414 /* 2415 * pmap_remove_all: pmap is being torn down by the current thread. 2416 * avoid unnecessary invalidations. 2417 */ 2418 2419 void 2420 pmap_remove_all(struct pmap *pmap) 2421 { 2422 lwp_t *l = curlwp; 2423 2424 KASSERT(l->l_md.md_gc_pmap == NULL); 2425 2426 l->l_md.md_gc_pmap = pmap; 2427 } 2428 2429 #if defined(PMAP_FORK) 2430 /* 2431 * pmap_fork: perform any necessary data structure manipulation when 2432 * a VM space is forked. 2433 */ 2434 2435 void 2436 pmap_fork(struct pmap *pmap1, struct pmap *pmap2) 2437 { 2438 #ifdef USER_LDT 2439 union descriptor *new_ldt; 2440 size_t len; 2441 int sel; 2442 2443 if (__predict_true(pmap1->pm_ldt == NULL)) { 2444 return; 2445 } 2446 2447 retry: 2448 if (pmap1->pm_ldt != NULL) { 2449 len = pmap1->pm_ldt_len; 2450 new_ldt = (union descriptor *)uvm_km_alloc(kernel_map, len, 0, 2451 UVM_KMF_WIRED); 2452 mutex_enter(&cpu_lock); 2453 sel = ldt_alloc(new_ldt, len); 2454 if (sel == -1) { 2455 mutex_exit(&cpu_lock); 2456 uvm_km_free(kernel_map, (vaddr_t)new_ldt, len, 2457 UVM_KMF_WIRED); 2458 printf("WARNING: pmap_fork: unable to allocate LDT\n"); 2459 return; 2460 } 2461 } else { 2462 len = -1; 2463 new_ldt = NULL; 2464 sel = -1; 2465 mutex_enter(&cpu_lock); 2466 } 2467 2468 /* Copy the LDT, if necessary. */ 2469 if (pmap1->pm_ldt != NULL) { 2470 if (len != pmap1->pm_ldt_len) { 2471 if (len != -1) { 2472 ldt_free(sel); 2473 uvm_km_free(kernel_map, (vaddr_t)new_ldt, 2474 len, UVM_KMF_WIRED); 2475 } 2476 mutex_exit(&cpu_lock); 2477 goto retry; 2478 } 2479 2480 memcpy(new_ldt, pmap1->pm_ldt, len); 2481 pmap2->pm_ldt = new_ldt; 2482 pmap2->pm_ldt_len = pmap1->pm_ldt_len; 2483 pmap2->pm_ldt_sel = sel; 2484 len = -1; 2485 } 2486 2487 if (len != -1) { 2488 ldt_free(sel); 2489 uvm_km_free(kernel_map, (vaddr_t)new_ldt, len, 2490 UVM_KMF_WIRED); 2491 } 2492 mutex_exit(&cpu_lock); 2493 #endif /* USER_LDT */ 2494 } 2495 #endif /* PMAP_FORK */ 2496 2497 #ifdef USER_LDT 2498 2499 /* 2500 * pmap_ldt_xcall: cross call used by pmap_ldt_sync. if the named pmap 2501 * is active, reload LDTR. 2502 */ 2503 static void 2504 pmap_ldt_xcall(void *arg1, void *arg2) 2505 { 2506 struct pmap *pm; 2507 2508 kpreempt_disable(); 2509 pm = arg1; 2510 if (curcpu()->ci_pmap == pm) { 2511 lldt(pm->pm_ldt_sel); 2512 } 2513 kpreempt_enable(); 2514 } 2515 2516 /* 2517 * pmap_ldt_sync: LDT selector for the named pmap is changing. swap 2518 * in the new selector on all CPUs. 2519 */ 2520 void 2521 pmap_ldt_sync(struct pmap *pm) 2522 { 2523 uint64_t where; 2524 2525 KASSERT(mutex_owned(&cpu_lock)); 2526 2527 pmap_ldt_evcnt.ev_count++; 2528 where = xc_broadcast(0, pmap_ldt_xcall, pm, NULL); 2529 xc_wait(where); 2530 } 2531 2532 /* 2533 * pmap_ldt_cleanup: if the pmap has a local LDT, deallocate it, and 2534 * restore the default. 
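 * The default is the global LDT selector, GSYSSEL(GLDT_SEL, SEL_KPL);
 * other CPUs pick up the change via pmap_ldt_sync().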
2535 */ 2536 2537 void 2538 pmap_ldt_cleanup(struct lwp *l) 2539 { 2540 pmap_t pmap = l->l_proc->p_vmspace->vm_map.pmap; 2541 union descriptor *dp = NULL; 2542 size_t len = 0; 2543 int sel = -1; 2544 2545 if (__predict_true(pmap->pm_ldt == NULL)) { 2546 return; 2547 } 2548 2549 mutex_enter(&cpu_lock); 2550 if (pmap->pm_ldt != NULL) { 2551 sel = pmap->pm_ldt_sel; 2552 dp = pmap->pm_ldt; 2553 len = pmap->pm_ldt_len; 2554 pmap->pm_ldt_sel = GSYSSEL(GLDT_SEL, SEL_KPL); 2555 pmap->pm_ldt = NULL; 2556 pmap->pm_ldt_len = 0; 2557 pmap_ldt_sync(pmap); 2558 ldt_free(sel); 2559 uvm_km_free(kernel_map, (vaddr_t)dp, len, UVM_KMF_WIRED); 2560 } 2561 mutex_exit(&cpu_lock); 2562 } 2563 #endif /* USER_LDT */ 2564 2565 /* 2566 * pmap_activate: activate a process' pmap 2567 * 2568 * => must be called with kernel preemption disabled 2569 * => if lwp is the curlwp, then set ci_want_pmapload so that 2570 * actual MMU context switch will be done by pmap_load() later 2571 */ 2572 2573 void 2574 pmap_activate(struct lwp *l) 2575 { 2576 struct cpu_info *ci; 2577 struct pmap *pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map); 2578 2579 KASSERT(kpreempt_disabled()); 2580 2581 ci = curcpu(); 2582 2583 if (l == ci->ci_curlwp) { 2584 KASSERT(ci->ci_want_pmapload == 0); 2585 KASSERT(ci->ci_tlbstate != TLBSTATE_VALID); 2586 #ifdef KSTACK_CHECK_DR0 2587 /* 2588 * setup breakpoint on the top of stack 2589 */ 2590 if (l == &lwp0) 2591 dr0(0, 0, 0, 0); 2592 else 2593 dr0(KSTACK_LOWEST_ADDR(l), 1, 3, 1); 2594 #endif 2595 2596 /* 2597 * no need to switch to kernel vmspace because 2598 * it's a subset of any vmspace. 2599 */ 2600 2601 if (pmap == pmap_kernel()) { 2602 ci->ci_want_pmapload = 0; 2603 return; 2604 } 2605 2606 ci->ci_want_pmapload = 1; 2607 } 2608 } 2609 2610 /* 2611 * pmap_reactivate: try to regain reference to the pmap. 2612 * 2613 * => must be called with kernel preemption disabled 2614 */ 2615 2616 static bool 2617 pmap_reactivate(struct pmap *pmap) 2618 { 2619 struct cpu_info *ci; 2620 uint32_t cpumask; 2621 bool result; 2622 uint32_t oldcpus; 2623 2624 ci = curcpu(); 2625 cpumask = ci->ci_cpumask; 2626 2627 KASSERT(kpreempt_disabled()); 2628 #if defined(XEN) && defined(__x86_64__) 2629 KASSERT(pmap_pdirpa(pmap, 0) == ci->ci_xen_current_user_pgd); 2630 #elif defined(PAE) 2631 KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(ci->ci_pae_l3_pdir[0])); 2632 #elif !defined(XEN) 2633 KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(rcr3())); 2634 #endif 2635 2636 /* 2637 * if we still have a lazy reference to this pmap, 2638 * we can assume that there was no tlb shootdown 2639 * for this pmap in the meantime. 2640 * 2641 * the order of events here is important as we must 2642 * synchronize with TLB shootdown interrupts. declare 2643 * interest in invalidations (TLBSTATE_VALID) and then 2644 * check the cpumask, which the IPIs can change only 2645 * when the state is TLBSTATE_LAZY. 2646 */ 2647 2648 ci->ci_tlbstate = TLBSTATE_VALID; 2649 oldcpus = pmap->pm_cpus; 2650 KASSERT((pmap->pm_kernel_cpus & cpumask) != 0); 2651 if (oldcpus & cpumask) { 2652 /* got it */ 2653 result = true; 2654 } else { 2655 /* must reload */ 2656 atomic_or_32(&pmap->pm_cpus, cpumask); 2657 result = false; 2658 } 2659 2660 return result; 2661 } 2662 2663 /* 2664 * pmap_load: actually switch pmap. 
(fill in %cr3 and LDT info) 2665 */ 2666 2667 void 2668 pmap_load(void) 2669 { 2670 struct cpu_info *ci; 2671 uint32_t cpumask; 2672 struct pmap *pmap; 2673 struct pmap *oldpmap; 2674 struct lwp *l; 2675 struct pcb *pcb; 2676 uint64_t ncsw; 2677 2678 kpreempt_disable(); 2679 retry: 2680 ci = curcpu(); 2681 if (!ci->ci_want_pmapload) { 2682 kpreempt_enable(); 2683 return; 2684 } 2685 cpumask = ci->ci_cpumask; 2686 l = ci->ci_curlwp; 2687 ncsw = l->l_ncsw; 2688 2689 /* should be able to take ipis. */ 2690 KASSERT(ci->ci_ilevel < IPL_HIGH); 2691 #ifdef XEN 2692 /* XXX not yet KASSERT(x86_read_psl() != 0); */ 2693 #else 2694 KASSERT((x86_read_psl() & PSL_I) != 0); 2695 #endif 2696 2697 KASSERT(l != NULL); 2698 pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map); 2699 KASSERT(pmap != pmap_kernel()); 2700 oldpmap = ci->ci_pmap; 2701 pcb = lwp_getpcb(l); 2702 2703 if (pmap == oldpmap) { 2704 if (!pmap_reactivate(pmap)) { 2705 u_int gen = uvm_emap_gen_return(); 2706 2707 /* 2708 * pmap has been changed during deactivated. 2709 * our tlb may be stale. 2710 */ 2711 2712 tlbflush(); 2713 uvm_emap_update(gen); 2714 } 2715 2716 ci->ci_want_pmapload = 0; 2717 kpreempt_enable(); 2718 return; 2719 } 2720 2721 /* 2722 * grab a reference to the new pmap. 2723 */ 2724 2725 pmap_reference(pmap); 2726 2727 /* 2728 * actually switch pmap. 2729 */ 2730 2731 atomic_and_32(&oldpmap->pm_cpus, ~cpumask); 2732 atomic_and_32(&oldpmap->pm_kernel_cpus, ~cpumask); 2733 2734 #if defined(XEN) && defined(__x86_64__) 2735 KASSERT(pmap_pdirpa(oldpmap, 0) == ci->ci_xen_current_user_pgd || 2736 oldpmap == pmap_kernel()); 2737 #elif defined(PAE) 2738 KASSERT(pmap_pdirpa(oldpmap, 0) == pmap_pte2pa(ci->ci_pae_l3_pdir[0])); 2739 #elif !defined(XEN) 2740 KASSERT(pmap_pdirpa(oldpmap, 0) == pmap_pte2pa(rcr3())); 2741 #endif 2742 KASSERT((pmap->pm_cpus & cpumask) == 0); 2743 KASSERT((pmap->pm_kernel_cpus & cpumask) == 0); 2744 2745 /* 2746 * mark the pmap in use by this processor. again we must 2747 * synchronize with TLB shootdown interrupts, so set the 2748 * state VALID first, then register us for shootdown events 2749 * on this pmap. 2750 */ 2751 2752 ci->ci_tlbstate = TLBSTATE_VALID; 2753 atomic_or_32(&pmap->pm_cpus, cpumask); 2754 atomic_or_32(&pmap->pm_kernel_cpus, cpumask); 2755 ci->ci_pmap = pmap; 2756 2757 /* 2758 * update tss. now that we have registered for invalidations 2759 * from other CPUs, we're good to load the page tables. 2760 */ 2761 #ifdef PAE 2762 pcb->pcb_cr3 = ci->ci_pae_l3_pdirpa; 2763 #else 2764 pcb->pcb_cr3 = pmap_pdirpa(pmap, 0); 2765 #endif 2766 2767 #ifdef i386 2768 #ifdef XEN 2769 /* 2770 * clear APDP slot, in case it points to a page table that has 2771 * been freed 2772 */ 2773 if (*APDP_PDE) { 2774 pmap_unmap_apdp(); 2775 } 2776 /* lldt() does pmap_pte_flush() */ 2777 #endif /* XEN */ 2778 2779 #ifndef XEN 2780 ci->ci_tss.tss_ldt = pmap->pm_ldt_sel; 2781 ci->ci_tss.tss_cr3 = pcb->pcb_cr3; 2782 #endif /* !XEN */ 2783 #endif /* i386 */ 2784 2785 lldt(pmap->pm_ldt_sel); 2786 2787 u_int gen = uvm_emap_gen_return(); 2788 cpu_load_pmap(pmap); 2789 uvm_emap_update(gen); 2790 2791 ci->ci_want_pmapload = 0; 2792 2793 /* 2794 * we're now running with the new pmap. drop the reference 2795 * to the old pmap. if we block, we need to go around again. 
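 * (pmap_destroy() may block; a change in l->l_ncsw versus the value
 * sampled after the retry label tells us that we were preempted, in
 * which case the whole switch is redone.)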
2796 */ 2797 2798 pmap_destroy(oldpmap); 2799 if (l->l_ncsw != ncsw) { 2800 goto retry; 2801 } 2802 2803 kpreempt_enable(); 2804 } 2805 2806 /* 2807 * pmap_deactivate: deactivate a process' pmap 2808 * 2809 * => must be called with kernel preemption disabled (high SPL is enough) 2810 */ 2811 2812 void 2813 pmap_deactivate(struct lwp *l) 2814 { 2815 struct pmap *pmap; 2816 struct cpu_info *ci; 2817 2818 KASSERT(kpreempt_disabled()); 2819 2820 if (l != curlwp) { 2821 return; 2822 } 2823 2824 /* 2825 * wait for pending TLB shootdowns to complete. necessary 2826 * because TLB shootdown state is per-CPU, and the LWP may 2827 * be coming off the CPU before it has a chance to call 2828 * pmap_update(). 2829 */ 2830 pmap_tlb_shootwait(); 2831 2832 ci = curcpu(); 2833 2834 if (ci->ci_want_pmapload) { 2835 KASSERT(vm_map_pmap(&l->l_proc->p_vmspace->vm_map) 2836 != pmap_kernel()); 2837 KASSERT(vm_map_pmap(&l->l_proc->p_vmspace->vm_map) 2838 != ci->ci_pmap || ci->ci_tlbstate != TLBSTATE_VALID); 2839 2840 /* 2841 * userspace has not been touched. 2842 * nothing to do here. 2843 */ 2844 2845 ci->ci_want_pmapload = 0; 2846 return; 2847 } 2848 2849 pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map); 2850 2851 if (pmap == pmap_kernel()) { 2852 return; 2853 } 2854 2855 #if defined(XEN) && defined(__x86_64__) 2856 KASSERT(pmap_pdirpa(pmap, 0) == ci->ci_xen_current_user_pgd); 2857 #elif defined(PAE) 2858 KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(ci->ci_pae_l3_pdir[0])); 2859 #elif !defined(XEN) 2860 KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(rcr3())); 2861 #endif 2862 KASSERT(ci->ci_pmap == pmap); 2863 2864 /* 2865 * we aren't interested in TLB invalidations for this pmap, 2866 * at least for the time being. 2867 */ 2868 2869 KASSERT(ci->ci_tlbstate == TLBSTATE_VALID); 2870 ci->ci_tlbstate = TLBSTATE_LAZY; 2871 } 2872 2873 /* 2874 * end of lifecycle functions 2875 */ 2876 2877 /* 2878 * some misc. functions 2879 */ 2880 2881 int 2882 pmap_pdes_invalid(vaddr_t va, pd_entry_t * const *pdes, pd_entry_t *lastpde) 2883 { 2884 int i; 2885 unsigned long index; 2886 pd_entry_t pde; 2887 2888 for (i = PTP_LEVELS; i > 1; i--) { 2889 index = pl_i(va, i); 2890 pde = pdes[i - 2][index]; 2891 if ((pde & PG_V) == 0) 2892 return i; 2893 } 2894 if (lastpde != NULL) 2895 *lastpde = pde; 2896 return 0; 2897 } 2898 2899 /* 2900 * pmap_extract: extract a PA for the given VA 2901 */ 2902 2903 bool 2904 pmap_extract(struct pmap *pmap, vaddr_t va, paddr_t *pap) 2905 { 2906 pt_entry_t *ptes, pte; 2907 pd_entry_t pde; 2908 pd_entry_t * const *pdes; 2909 struct pmap *pmap2; 2910 struct cpu_info *ci; 2911 paddr_t pa; 2912 lwp_t *l; 2913 bool hard, rv; 2914 2915 rv = false; 2916 pa = 0; 2917 l = curlwp; 2918 2919 KPREEMPT_DISABLE(l); 2920 ci = l->l_cpu; 2921 if (__predict_true(!ci->ci_want_pmapload && ci->ci_pmap == pmap) || 2922 pmap == pmap_kernel()) { 2923 /* 2924 * no need to lock, because it's pmap_kernel() or our 2925 * own pmap and is active. if a user pmap, the caller 2926 * will hold the vm_map write/read locked and so prevent 2927 * entries from disappearing while we are here. ptps 2928 * can disappear via pmap_remove() and pmap_protect(), 2929 * but they are called with the vm_map write locked. 2930 */ 2931 hard = false; 2932 ptes = PTE_BASE; 2933 pdes = normal_pdes; 2934 } else { 2935 /* we lose, do it the hard way. 
*/ 2936 hard = true; 2937 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); 2938 } 2939 if (pmap_pdes_valid(va, pdes, &pde)) { 2940 pte = ptes[pl1_i(va)]; 2941 if (pde & PG_PS) { 2942 pa = (pde & PG_LGFRAME) | (va & (NBPD_L2 - 1)); 2943 rv = true; 2944 } else if (__predict_true((pte & PG_V) != 0)) { 2945 pa = pmap_pte2pa(pte) | (va & (NBPD_L1 - 1)); 2946 rv = true; 2947 } 2948 } 2949 if (__predict_false(hard)) { 2950 pmap_unmap_ptes(pmap, pmap2); 2951 } 2952 KPREEMPT_ENABLE(l); 2953 if (pap != NULL) { 2954 *pap = pa; 2955 } 2956 return rv; 2957 } 2958 2959 2960 /* 2961 * vtophys: virtual address to physical address. For use by 2962 * machine-dependent code only. 2963 */ 2964 2965 paddr_t 2966 vtophys(vaddr_t va) 2967 { 2968 paddr_t pa; 2969 2970 if (pmap_extract(pmap_kernel(), va, &pa) == true) 2971 return (pa); 2972 return (0); 2973 } 2974 2975 __weak_alias(pmap_extract_ma, pmap_extract); 2976 2977 #ifdef XEN 2978 2979 /* 2980 * vtomach: virtual address to machine address. For use by 2981 * machine-dependent code only. 2982 */ 2983 2984 paddr_t 2985 vtomach(vaddr_t va) 2986 { 2987 paddr_t pa; 2988 2989 if (pmap_extract_ma(pmap_kernel(), va, &pa) == true) 2990 return (pa); 2991 return (0); 2992 } 2993 2994 #endif /* XEN */ 2995 2996 /* 2997 * pmap_virtual_space: used during bootup [pmap_steal_memory] to 2998 * determine the bounds of the kernel virtual addess space. 2999 */ 3000 3001 void 3002 pmap_virtual_space(vaddr_t *startp, vaddr_t *endp) 3003 { 3004 *startp = virtual_avail; 3005 *endp = virtual_end; 3006 } 3007 3008 /* 3009 * pmap_map: map a range of PAs into kvm. 3010 * 3011 * => used during crash dump 3012 * => XXX: pmap_map() should be phased out? 3013 */ 3014 3015 vaddr_t 3016 pmap_map(vaddr_t va, paddr_t spa, paddr_t epa, vm_prot_t prot) 3017 { 3018 while (spa < epa) { 3019 pmap_kenter_pa(va, spa, prot, 0); 3020 va += PAGE_SIZE; 3021 spa += PAGE_SIZE; 3022 } 3023 pmap_update(pmap_kernel()); 3024 return va; 3025 } 3026 3027 /* 3028 * pmap_zero_page: zero a page 3029 */ 3030 3031 void 3032 pmap_zero_page(paddr_t pa) 3033 { 3034 pt_entry_t *zpte; 3035 void *zerova; 3036 int id; 3037 3038 kpreempt_disable(); 3039 id = cpu_number(); 3040 zpte = PTESLEW(zero_pte, id); 3041 zerova = VASLEW(zerop, id); 3042 3043 #ifdef DIAGNOSTIC 3044 if (*zpte) 3045 panic("pmap_zero_page: lock botch"); 3046 #endif 3047 3048 pmap_pte_set(zpte, pmap_pa2pte(pa) | PG_V | PG_RW | PG_M | PG_U | PG_k); 3049 pmap_pte_flush(); 3050 pmap_update_pg((vaddr_t)zerova); /* flush TLB */ 3051 3052 memset(zerova, 0, PAGE_SIZE); 3053 3054 #if defined(DIAGNOSTIC) || defined(XEN) 3055 pmap_pte_set(zpte, 0); /* zap ! */ 3056 pmap_pte_flush(); 3057 #endif 3058 kpreempt_enable(); 3059 } 3060 3061 /* 3062 * pmap_pagezeroidle: the same, for the idle loop page zero'er. 3063 * Returns true if the page was zero'd, false if we aborted for 3064 * some reason. 3065 */ 3066 3067 bool 3068 pmap_pageidlezero(paddr_t pa) 3069 { 3070 pt_entry_t *zpte; 3071 void *zerova; 3072 bool rv; 3073 int id; 3074 3075 id = cpu_number(); 3076 zpte = PTESLEW(zero_pte, id); 3077 zerova = VASLEW(zerop, id); 3078 3079 KASSERT(cpu_feature[0] & CPUID_SSE2); 3080 KASSERT(*zpte == 0); 3081 3082 pmap_pte_set(zpte, pmap_pa2pte(pa) | PG_V | PG_RW | PG_M | PG_U | PG_k); 3083 pmap_pte_flush(); 3084 pmap_update_pg((vaddr_t)zerova); /* flush TLB */ 3085 3086 rv = sse2_idlezero_page(zerova); 3087 3088 #if defined(DIAGNOSTIC) || defined(XEN) 3089 pmap_pte_set(zpte, 0); /* zap ! 
*/ 3090 pmap_pte_flush(); 3091 #endif 3092 3093 return rv; 3094 } 3095 3096 /* 3097 * pmap_copy_page: copy a page 3098 */ 3099 3100 void 3101 pmap_copy_page(paddr_t srcpa, paddr_t dstpa) 3102 { 3103 pt_entry_t *spte; 3104 pt_entry_t *dpte; 3105 void *csrcva; 3106 void *cdstva; 3107 int id; 3108 3109 kpreempt_disable(); 3110 id = cpu_number(); 3111 spte = PTESLEW(csrc_pte,id); 3112 dpte = PTESLEW(cdst_pte,id); 3113 csrcva = VASLEW(csrcp, id); 3114 cdstva = VASLEW(cdstp, id); 3115 3116 KASSERT(*spte == 0 && *dpte == 0); 3117 3118 pmap_pte_set(spte, pmap_pa2pte(srcpa) | PG_V | PG_RW | PG_U | PG_k); 3119 pmap_pte_set(dpte, 3120 pmap_pa2pte(dstpa) | PG_V | PG_RW | PG_M | PG_U | PG_k); 3121 pmap_pte_flush(); 3122 pmap_update_2pg((vaddr_t)csrcva, (vaddr_t)cdstva); 3123 3124 memcpy(cdstva, csrcva, PAGE_SIZE); 3125 3126 #if defined(DIAGNOSTIC) || defined(XEN) 3127 pmap_pte_set(spte, 0); 3128 pmap_pte_set(dpte, 0); 3129 pmap_pte_flush(); 3130 #endif 3131 kpreempt_enable(); 3132 } 3133 3134 static pt_entry_t * 3135 pmap_map_ptp(struct vm_page *ptp) 3136 { 3137 pt_entry_t *ptppte; 3138 void *ptpva; 3139 int id; 3140 3141 KASSERT(kpreempt_disabled()); 3142 3143 id = cpu_number(); 3144 ptppte = PTESLEW(ptp_pte, id); 3145 ptpva = VASLEW(ptpp, id); 3146 #if !defined(XEN) 3147 pmap_pte_set(ptppte, pmap_pa2pte(VM_PAGE_TO_PHYS(ptp)) | PG_V | PG_M | 3148 PG_RW | PG_U | PG_k); 3149 #else 3150 pmap_pte_set(ptppte, pmap_pa2pte(VM_PAGE_TO_PHYS(ptp)) | PG_V | PG_M | 3151 PG_U | PG_k); 3152 #endif 3153 pmap_pte_flush(); 3154 pmap_update_pg((vaddr_t)ptpva); 3155 3156 return (pt_entry_t *)ptpva; 3157 } 3158 3159 static void 3160 pmap_unmap_ptp(void) 3161 { 3162 #if defined(DIAGNOSTIC) || defined(XEN) 3163 pt_entry_t *pte; 3164 3165 KASSERT(kpreempt_disabled()); 3166 3167 pte = PTESLEW(ptp_pte, cpu_number()); 3168 if (*pte != 0) { 3169 pmap_pte_set(pte, 0); 3170 pmap_pte_flush(); 3171 } 3172 #endif 3173 } 3174 3175 static pt_entry_t * 3176 pmap_map_pte(struct pmap *pmap, struct vm_page *ptp, vaddr_t va) 3177 { 3178 3179 KASSERT(kpreempt_disabled()); 3180 if (pmap_is_curpmap(pmap)) { 3181 return &PTE_BASE[pl1_i(va)]; /* (k)vtopte */ 3182 } 3183 KASSERT(ptp != NULL); 3184 return pmap_map_ptp(ptp) + pl1_pi(va); 3185 } 3186 3187 static void 3188 pmap_unmap_pte(void) 3189 { 3190 3191 KASSERT(kpreempt_disabled()); 3192 3193 pmap_unmap_ptp(); 3194 } 3195 3196 /* 3197 * p m a p r e m o v e f u n c t i o n s 3198 * 3199 * functions that remove mappings 3200 */ 3201 3202 /* 3203 * pmap_remove_ptes: remove PTEs from a PTP 3204 * 3205 * => must have proper locking on pmap_master_lock 3206 * => caller must hold pmap's lock 3207 * => PTP must be mapped into KVA 3208 * => PTP should be null if pmap == pmap_kernel() 3209 * => must be called with kernel preemption disabled 3210 * => returns composite pte if at least one page should be shot down 3211 */ 3212 3213 static pt_entry_t 3214 pmap_remove_ptes(struct pmap *pmap, struct vm_page *ptp, vaddr_t ptpva, 3215 vaddr_t startva, vaddr_t endva, struct pv_entry **pv_tofree) 3216 { 3217 struct pv_entry *pve; 3218 pt_entry_t *pte = (pt_entry_t *) ptpva; 3219 pt_entry_t opte, xpte = 0; 3220 3221 KASSERT(pmap == pmap_kernel() || mutex_owned(&pmap->pm_lock)); 3222 KASSERT(kpreempt_disabled()); 3223 3224 /* 3225 * note that ptpva points to the PTE that maps startva. this may 3226 * or may not be the first PTE in the PTP. 
3227 * 3228 * we loop through the PTP while there are still PTEs to look at 3229 * and the wire_count is greater than 1 (because we use the wire_count 3230 * to keep track of the number of real PTEs in the PTP). 3231 */ 3232 3233 for (/*null*/; startva < endva && (ptp == NULL || ptp->wire_count > 1) 3234 ; pte++, startva += PAGE_SIZE) { 3235 struct vm_page *pg; 3236 struct pmap_page *pp; 3237 3238 if (!pmap_valid_entry(*pte)) 3239 continue; /* VA not mapped */ 3240 3241 /* atomically save the old PTE and zap! it */ 3242 opte = pmap_pte_testset(pte, 0); 3243 if (!pmap_valid_entry(opte)) { 3244 continue; 3245 } 3246 3247 pmap_exec_account(pmap, startva, opte, 0); 3248 pmap_stats_update_bypte(pmap, 0, opte); 3249 xpte |= opte; 3250 3251 if (ptp) { 3252 ptp->wire_count--; /* dropping a PTE */ 3253 /* Make sure that the PDE is flushed */ 3254 if (ptp->wire_count <= 1) 3255 xpte |= PG_U; 3256 } 3257 3258 /* 3259 * if we are not on a pv_head list we are done. 3260 */ 3261 3262 if ((opte & PG_PVLIST) == 0) { 3263 #if defined(DIAGNOSTIC) && !defined(DOM0OPS) 3264 if (PHYS_TO_VM_PAGE(pmap_pte2pa(opte)) != NULL) 3265 panic("pmap_remove_ptes: managed page without " 3266 "PG_PVLIST for %#" PRIxVADDR, startva); 3267 #endif 3268 continue; 3269 } 3270 3271 pg = PHYS_TO_VM_PAGE(pmap_pte2pa(opte)); 3272 #ifdef DIAGNOSTIC 3273 if (pg == NULL) 3274 panic("pmap_remove_ptes: unmanaged page marked " 3275 "PG_PVLIST, va = %#" PRIxVADDR ", " 3276 "pa = %#" PRIxPADDR, 3277 startva, (paddr_t)pmap_pte2pa(opte)); 3278 #endif 3279 3280 /* sync R/M bits */ 3281 pp = VM_PAGE_TO_PP(pg); 3282 pp_lock(pp); 3283 pp->pp_attrs |= opte; 3284 pve = pmap_remove_pv(pp, ptp, startva); 3285 pp_unlock(pp); 3286 3287 if (pve != NULL) { 3288 pve->pve_next = *pv_tofree; 3289 *pv_tofree = pve; 3290 } 3291 3292 /* end of "for" loop: time for next pte */ 3293 } 3294 3295 return xpte; 3296 } 3297 3298 3299 /* 3300 * pmap_remove_pte: remove a single PTE from a PTP 3301 * 3302 * => must have proper locking on pmap_master_lock 3303 * => caller must hold pmap's lock 3304 * => PTP must be mapped into KVA 3305 * => PTP should be null if pmap == pmap_kernel() 3306 * => returns true if we removed a mapping 3307 * => must be called with kernel preemption disabled 3308 */ 3309 3310 static bool 3311 pmap_remove_pte(struct pmap *pmap, struct vm_page *ptp, pt_entry_t *pte, 3312 vaddr_t va, struct pv_entry **pv_tofree) 3313 { 3314 pt_entry_t opte; 3315 struct pv_entry *pve; 3316 struct vm_page *pg; 3317 struct pmap_page *pp; 3318 3319 KASSERT(pmap == pmap_kernel() || mutex_owned(&pmap->pm_lock)); 3320 KASSERT(pmap == pmap_kernel() || kpreempt_disabled()); 3321 3322 if (!pmap_valid_entry(*pte)) 3323 return(false); /* VA not mapped */ 3324 3325 /* atomically save the old PTE and zap! it */ 3326 opte = pmap_pte_testset(pte, 0); 3327 if (!pmap_valid_entry(opte)) { 3328 return false; 3329 } 3330 3331 pmap_exec_account(pmap, va, opte, 0); 3332 pmap_stats_update_bypte(pmap, 0, opte); 3333 3334 if (opte & PG_U) 3335 pmap_tlb_shootdown(pmap, va, 0, opte); 3336 3337 if (ptp) { 3338 ptp->wire_count--; /* dropping a PTE */ 3339 /* Make sure that the PDE is flushed */ 3340 if ((ptp->wire_count <= 1) && !(opte & PG_U)) 3341 pmap_tlb_shootdown(pmap, va, 0, opte); 3342 } 3343 3344 /* 3345 * if we are not on a pv_head list we are done. 
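 * (only mappings of managed pages carry PG_PVLIST; those have a
 * pv_entry to unlink and R/M bits to sync below.)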
3346 */ 3347 3348 if ((opte & PG_PVLIST) == 0) { 3349 #if defined(DIAGNOSTIC) && !defined(DOM0OPS) 3350 if (PHYS_TO_VM_PAGE(pmap_pte2pa(opte)) != NULL) 3351 panic("pmap_remove_pte: managed page without " 3352 "PG_PVLIST for %#" PRIxVADDR, va); 3353 #endif 3354 return(true); 3355 } 3356 3357 pg = PHYS_TO_VM_PAGE(pmap_pte2pa(opte)); 3358 #ifdef DIAGNOSTIC 3359 if (pg == NULL) 3360 panic("pmap_remove_pte: unmanaged page marked " 3361 "PG_PVLIST, va = %#" PRIxVADDR ", pa = %#" PRIxPADDR, 3362 va, (paddr_t)pmap_pte2pa(opte)); 3363 #endif 3364 3365 /* sync R/M bits */ 3366 pp = VM_PAGE_TO_PP(pg); 3367 pp_lock(pp); 3368 pp->pp_attrs |= opte; 3369 pve = pmap_remove_pv(pp, ptp, va); 3370 pp_unlock(pp); 3371 3372 if (pve) { 3373 pve->pve_next = *pv_tofree; 3374 *pv_tofree = pve; 3375 } 3376 3377 return(true); 3378 } 3379 3380 /* 3381 * pmap_remove: mapping removal function. 3382 * 3383 * => caller should not be holding any pmap locks 3384 */ 3385 3386 void 3387 pmap_remove(struct pmap *pmap, vaddr_t sva, vaddr_t eva) 3388 { 3389 pt_entry_t *ptes, xpte = 0; 3390 pd_entry_t pde; 3391 pd_entry_t * const *pdes; 3392 struct pv_entry *pv_tofree = NULL; 3393 bool result; 3394 int i; 3395 paddr_t ptppa; 3396 vaddr_t blkendva, va = sva; 3397 struct vm_page *ptp; 3398 struct pmap *pmap2; 3399 3400 kpreempt_disable(); 3401 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); /* locks pmap */ 3402 3403 /* 3404 * removing one page? take shortcut function. 3405 */ 3406 3407 if (va + PAGE_SIZE == eva) { 3408 if (pmap_pdes_valid(va, pdes, &pde)) { 3409 3410 /* PA of the PTP */ 3411 ptppa = pmap_pte2pa(pde); 3412 3413 /* get PTP if non-kernel mapping */ 3414 if (pmap == pmap_kernel()) { 3415 /* we never free kernel PTPs */ 3416 ptp = NULL; 3417 } else { 3418 ptp = pmap_find_ptp(pmap, va, ptppa, 1); 3419 #ifdef DIAGNOSTIC 3420 if (ptp == NULL) 3421 panic("pmap_remove: unmanaged " 3422 "PTP detected"); 3423 #endif 3424 } 3425 3426 /* do it! */ 3427 result = pmap_remove_pte(pmap, ptp, 3428 &ptes[pl1_i(va)], va, &pv_tofree); 3429 3430 /* 3431 * if mapping removed and the PTP is no longer 3432 * being used, free it! 3433 */ 3434 3435 if (result && ptp && ptp->wire_count <= 1) 3436 pmap_free_ptp(pmap, ptp, va, ptes, pdes); 3437 } 3438 } else for (/* null */ ; va < eva ; va = blkendva) { 3439 int lvl; 3440 3441 /* determine range of block */ 3442 blkendva = x86_round_pdr(va+1); 3443 if (blkendva > eva) 3444 blkendva = eva; 3445 3446 /* 3447 * XXXCDC: our PTE mappings should never be removed 3448 * with pmap_remove! if we allow this (and why would 3449 * we?) then we end up freeing the pmap's page 3450 * directory page (PDP) before we are finished using 3451 * it when we hit in in the recursive mapping. this 3452 * is BAD. 3453 * 3454 * long term solution is to move the PTEs out of user 3455 * address space. and into kernel address space (up 3456 * with APTE). then we can set VM_MAXUSER_ADDRESS to 3457 * be VM_MAX_ADDRESS. 3458 */ 3459 3460 /* XXXCDC: ugly hack to avoid freeing PDP here */ 3461 for (i = 0; i < PDP_SIZE; i++) { 3462 if (pl_i(va, PTP_LEVELS) == PDIR_SLOT_PTE+i) 3463 continue; 3464 } 3465 3466 lvl = pmap_pdes_invalid(va, pdes, &pde); 3467 if (lvl != 0) { 3468 /* 3469 * skip a range corresponding to an invalid pde. 
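 * pmap_pdes_invalid() reported the level of the first invalid PDE,
 * so advance blkendva to the end of the range that PDE covers.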
3470 */ 3471 blkendva = (va & ptp_masks[lvl - 1]) + nbpd[lvl - 1]; 3472 continue; 3473 } 3474 3475 /* PA of the PTP */ 3476 ptppa = pmap_pte2pa(pde); 3477 3478 /* get PTP if non-kernel mapping */ 3479 if (pmap == pmap_kernel()) { 3480 /* we never free kernel PTPs */ 3481 ptp = NULL; 3482 } else { 3483 ptp = pmap_find_ptp(pmap, va, ptppa, 1); 3484 #ifdef DIAGNOSTIC 3485 if (ptp == NULL) 3486 panic("pmap_remove: unmanaged PTP " 3487 "detected"); 3488 #endif 3489 } 3490 xpte |= pmap_remove_ptes(pmap, ptp, 3491 (vaddr_t)&ptes[pl1_i(va)], va, blkendva, &pv_tofree); 3492 3493 /* if PTP is no longer being used, free it! */ 3494 if (ptp && ptp->wire_count <= 1) { 3495 pmap_free_ptp(pmap, ptp, va, ptes, pdes); 3496 } 3497 if ((xpte & PG_U) != 0) 3498 pmap_tlb_shootdown(pmap, sva, eva, xpte); 3499 } 3500 pmap_unmap_ptes(pmap, pmap2); /* unlock pmap */ 3501 kpreempt_enable(); 3502 3503 /* Now we free unused PVs */ 3504 if (pv_tofree) 3505 pmap_free_pvs(pv_tofree); 3506 } 3507 3508 /* 3509 * pmap_sync_pv: clear pte bits and return the old value of the pte. 3510 * 3511 * => called with pp_lock held. (thus preemption disabled) 3512 * => issues tlb shootdowns if necessary. 3513 */ 3514 3515 static int 3516 pmap_sync_pv(struct pv_pte *pvpte, pt_entry_t expect, int clearbits, 3517 pt_entry_t *optep) 3518 { 3519 struct pmap *pmap; 3520 struct vm_page *ptp; 3521 vaddr_t va; 3522 pt_entry_t *ptep; 3523 pt_entry_t opte; 3524 pt_entry_t npte; 3525 bool need_shootdown; 3526 3527 ptp = pvpte->pte_ptp; 3528 va = pvpte->pte_va; 3529 KASSERT(ptp == NULL || ptp->uobject != NULL); 3530 KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset); 3531 pmap = ptp_to_pmap(ptp); 3532 3533 KASSERT((expect & ~(PG_FRAME | PG_V)) == 0); 3534 KASSERT((expect & PG_V) != 0); 3535 KASSERT(clearbits == ~0 || (clearbits & ~(PG_M | PG_U | PG_RW)) == 0); 3536 KASSERT(kpreempt_disabled()); 3537 3538 ptep = pmap_map_pte(pmap, ptp, va); 3539 do { 3540 opte = *ptep; 3541 KASSERT((opte & (PG_M | PG_U)) != PG_M); 3542 KASSERT((opte & (PG_U | PG_V)) != PG_U); 3543 KASSERT(opte == 0 || (opte & PG_V) != 0); 3544 if ((opte & (PG_FRAME | PG_V)) != expect) { 3545 3546 /* 3547 * we lost a race with a V->P operation like 3548 * pmap_remove(). wait for the competitor 3549 * reflecting pte bits into mp_attrs. 3550 * 3551 * issue a redundant TLB shootdown so that 3552 * we can wait for its completion. 3553 */ 3554 3555 pmap_unmap_pte(); 3556 if (clearbits != 0) { 3557 pmap_tlb_shootdown(pmap, va, 0, 3558 (pmap == pmap_kernel() ? PG_G : 0)); 3559 } 3560 return EAGAIN; 3561 } 3562 3563 /* 3564 * check if there's anything to do on this pte. 3565 */ 3566 3567 if ((opte & clearbits) == 0) { 3568 need_shootdown = false; 3569 break; 3570 } 3571 3572 /* 3573 * we need a shootdown if the pte is cached. (PG_U) 3574 * 3575 * ...unless we are clearing only the PG_RW bit and 3576 * it isn't cached as RW. (PG_M) 3577 */ 3578 3579 need_shootdown = (opte & PG_U) != 0 && 3580 !(clearbits == PG_RW && (opte & PG_M) == 0); 3581 3582 npte = opte & ~clearbits; 3583 3584 /* 3585 * if we need a shootdown anyway, clear PG_U and PG_M. 
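 * (the cleared bits were already captured in opte, which the caller
 * folds into pp_attrs, so no referenced/modified state is lost.)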
3586 */ 3587 3588 if (need_shootdown) { 3589 npte &= ~(PG_U | PG_M); 3590 } 3591 KASSERT((npte & (PG_M | PG_U)) != PG_M); 3592 KASSERT((npte & (PG_U | PG_V)) != PG_U); 3593 KASSERT(npte == 0 || (opte & PG_V) != 0); 3594 } while (pmap_pte_cas(ptep, opte, npte) != opte); 3595 3596 if (need_shootdown) { 3597 pmap_tlb_shootdown(pmap, va, 0, opte); 3598 } 3599 pmap_unmap_pte(); 3600 3601 *optep = opte; 3602 return 0; 3603 } 3604 3605 /* 3606 * pmap_page_remove: remove a managed vm_page from all pmaps that map it 3607 * 3608 * => R/M bits are sync'd back to attrs 3609 */ 3610 3611 void 3612 pmap_page_remove(struct vm_page *pg) 3613 { 3614 struct pmap_page *pp; 3615 struct pv_pte *pvpte; 3616 struct pv_entry *killlist = NULL; 3617 struct vm_page *ptp; 3618 pt_entry_t expect; 3619 lwp_t *l; 3620 int count; 3621 3622 l = curlwp; 3623 pp = VM_PAGE_TO_PP(pg); 3624 expect = pmap_pa2pte(VM_PAGE_TO_PHYS(pg)) | PG_V; 3625 count = SPINLOCK_BACKOFF_MIN; 3626 kpreempt_disable(); 3627 startover: 3628 pp_lock(pp); 3629 while ((pvpte = pv_pte_first(pp)) != NULL) { 3630 struct pmap *pmap; 3631 struct pv_entry *pve; 3632 pt_entry_t opte; 3633 vaddr_t va; 3634 int error; 3635 3636 /* 3637 * add a reference to the pmap before clearing the pte. 3638 * otherwise the pmap can disappear behind us. 3639 */ 3640 3641 ptp = pvpte->pte_ptp; 3642 pmap = ptp_to_pmap(ptp); 3643 if (ptp != NULL) { 3644 pmap_reference(pmap); 3645 } 3646 3647 error = pmap_sync_pv(pvpte, expect, ~0, &opte); 3648 if (error == EAGAIN) { 3649 int hold_count; 3650 pp_unlock(pp); 3651 KERNEL_UNLOCK_ALL(curlwp, &hold_count); 3652 if (ptp != NULL) { 3653 pmap_destroy(pmap); 3654 } 3655 SPINLOCK_BACKOFF(count); 3656 KERNEL_LOCK(hold_count, curlwp); 3657 goto startover; 3658 } 3659 3660 pp->pp_attrs |= opte; 3661 va = pvpte->pte_va; 3662 pve = pmap_remove_pv(pp, ptp, va); 3663 pp_unlock(pp); 3664 3665 /* update the PTP reference count. free if last reference. */ 3666 if (ptp != NULL) { 3667 struct pmap *pmap2; 3668 pt_entry_t *ptes; 3669 pd_entry_t * const *pdes; 3670 3671 KASSERT(pmap != pmap_kernel()); 3672 3673 pmap_tlb_shootwait(); 3674 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); 3675 pmap_stats_update_bypte(pmap, 0, opte); 3676 ptp->wire_count--; 3677 if (ptp->wire_count <= 1) { 3678 pmap_free_ptp(pmap, ptp, va, ptes, pdes); 3679 } 3680 pmap_unmap_ptes(pmap, pmap2); 3681 pmap_destroy(pmap); 3682 } else { 3683 KASSERT(pmap == pmap_kernel()); 3684 pmap_stats_update_bypte(pmap, 0, opte); 3685 } 3686 3687 if (pve != NULL) { 3688 pve->pve_next = killlist; /* mark it for death */ 3689 killlist = pve; 3690 } 3691 pp_lock(pp); 3692 } 3693 pp_unlock(pp); 3694 kpreempt_enable(); 3695 3696 /* Now free unused pvs. */ 3697 pmap_free_pvs(killlist); 3698 } 3699 3700 /* 3701 * p m a p a t t r i b u t e f u n c t i o n s 3702 * functions that test/change managed page's attributes 3703 * since a page can be mapped multiple times we must check each PTE that 3704 * maps it by going down the pv lists. 
3705 */ 3706 3707 /* 3708 * pmap_test_attrs: test a page's attributes 3709 */ 3710 3711 bool 3712 pmap_test_attrs(struct vm_page *pg, unsigned testbits) 3713 { 3714 struct pmap_page *pp; 3715 struct pv_pte *pvpte; 3716 pt_entry_t expect; 3717 u_int result; 3718 3719 pp = VM_PAGE_TO_PP(pg); 3720 if ((pp->pp_attrs & testbits) != 0) { 3721 return true; 3722 } 3723 expect = pmap_pa2pte(VM_PAGE_TO_PHYS(pg)) | PG_V; 3724 pp_lock(pp); 3725 for (pvpte = pv_pte_first(pp); pvpte; pvpte = pv_pte_next(pp, pvpte)) { 3726 pt_entry_t opte; 3727 int error; 3728 3729 if ((pp->pp_attrs & testbits) != 0) { 3730 break; 3731 } 3732 error = pmap_sync_pv(pvpte, expect, 0, &opte); 3733 if (error == 0) { 3734 pp->pp_attrs |= opte; 3735 } 3736 } 3737 result = pp->pp_attrs & testbits; 3738 pp_unlock(pp); 3739 3740 /* 3741 * note that we will exit the for loop with a non-null pve if 3742 * we have found the bits we are testing for. 3743 */ 3744 3745 return result != 0; 3746 } 3747 3748 /* 3749 * pmap_clear_attrs: clear the specified attribute for a page. 3750 * 3751 * => we return true if we cleared one of the bits we were asked to 3752 */ 3753 3754 bool 3755 pmap_clear_attrs(struct vm_page *pg, unsigned clearbits) 3756 { 3757 struct pmap_page *pp; 3758 struct pv_pte *pvpte; 3759 u_int result; 3760 pt_entry_t expect; 3761 int count; 3762 3763 pp = VM_PAGE_TO_PP(pg); 3764 expect = pmap_pa2pte(VM_PAGE_TO_PHYS(pg)) | PG_V; 3765 count = SPINLOCK_BACKOFF_MIN; 3766 kpreempt_disable(); 3767 startover: 3768 pp_lock(pp); 3769 for (pvpte = pv_pte_first(pp); pvpte; pvpte = pv_pte_next(pp, pvpte)) { 3770 pt_entry_t opte; 3771 int error; 3772 3773 error = pmap_sync_pv(pvpte, expect, clearbits, &opte); 3774 if (error == EAGAIN) { 3775 int hold_count; 3776 pp_unlock(pp); 3777 KERNEL_UNLOCK_ALL(curlwp, &hold_count); 3778 SPINLOCK_BACKOFF(count); 3779 KERNEL_LOCK(hold_count, curlwp); 3780 goto startover; 3781 } 3782 pp->pp_attrs |= opte; 3783 } 3784 result = pp->pp_attrs & clearbits; 3785 pp->pp_attrs &= ~clearbits; 3786 pp_unlock(pp); 3787 kpreempt_enable(); 3788 3789 return result != 0; 3790 } 3791 3792 3793 /* 3794 * p m a p p r o t e c t i o n f u n c t i o n s 3795 */ 3796 3797 /* 3798 * pmap_page_protect: change the protection of all recorded mappings 3799 * of a managed page 3800 * 3801 * => NOTE: this is an inline function in pmap.h 3802 */ 3803 3804 /* see pmap.h */ 3805 3806 /* 3807 * pmap_protect: set the protection in of the pages in a pmap 3808 * 3809 * => NOTE: this is an inline function in pmap.h 3810 */ 3811 3812 /* see pmap.h */ 3813 3814 /* 3815 * pmap_write_protect: write-protect pages in a pmap 3816 */ 3817 3818 void 3819 pmap_write_protect(struct pmap *pmap, vaddr_t sva, vaddr_t eva, vm_prot_t prot) 3820 { 3821 int i; 3822 pt_entry_t *ptes, *epte; 3823 pt_entry_t *spte; 3824 pd_entry_t * const *pdes; 3825 vaddr_t blockend, va; 3826 pt_entry_t opte; 3827 struct pmap *pmap2; 3828 3829 KASSERT(curlwp->l_md.md_gc_pmap != pmap); 3830 3831 kpreempt_disable(); 3832 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); /* locks pmap */ 3833 3834 /* should be ok, but just in case ... */ 3835 sva &= PG_FRAME; 3836 eva &= PG_FRAME; 3837 3838 for (va = sva ; va < eva ; va = blockend) { 3839 3840 blockend = (va & L2_FRAME) + NBPD_L2; 3841 if (blockend > eva) 3842 blockend = eva; 3843 3844 /* 3845 * XXXCDC: our PTE mappings should never be write-protected! 3846 * 3847 * long term solution is to move the PTEs out of user 3848 * address space. and into kernel address space (up 3849 * with APTE). 
then we can set VM_MAXUSER_ADDRESS to 3850 * be VM_MAX_ADDRESS. 3851 */ 3852 3853 /* XXXCDC: ugly hack to avoid freeing PDP here */ 3854 for (i = 0; i < PDP_SIZE; i++) { 3855 if (pl_i(va, PTP_LEVELS) == PDIR_SLOT_PTE+i) 3856 continue; 3857 } 3858 3859 /* empty block? */ 3860 if (!pmap_pdes_valid(va, pdes, NULL)) 3861 continue; 3862 3863 #ifdef DIAGNOSTIC 3864 if (va >= VM_MAXUSER_ADDRESS && 3865 va < VM_MAX_ADDRESS) 3866 panic("pmap_write_protect: PTE space"); 3867 #endif 3868 3869 spte = &ptes[pl1_i(va)]; 3870 epte = &ptes[pl1_i(blockend)]; 3871 3872 for (/*null */; spte < epte ; spte++) { 3873 pt_entry_t npte; 3874 3875 do { 3876 opte = *spte; 3877 if ((~opte & (PG_RW | PG_V)) != 0) { 3878 goto next; 3879 } 3880 npte = opte & ~PG_RW; 3881 } while (pmap_pte_cas(spte, opte, npte) != opte); 3882 if ((opte & PG_M) != 0) { 3883 vaddr_t tva; 3884 3885 tva = x86_ptob(spte - ptes); 3886 pmap_tlb_shootdown(pmap, tva, 0, opte); 3887 } 3888 next:; 3889 } 3890 } 3891 3892 pmap_unmap_ptes(pmap, pmap2); /* unlocks pmap */ 3893 kpreempt_enable(); 3894 } 3895 3896 /* 3897 * end of protection functions 3898 */ 3899 3900 /* 3901 * pmap_unwire: clear the wired bit in the PTE 3902 * 3903 * => mapping should already be in map 3904 */ 3905 3906 void 3907 pmap_unwire(struct pmap *pmap, vaddr_t va) 3908 { 3909 pt_entry_t *ptes; 3910 pd_entry_t * const *pdes; 3911 struct pmap *pmap2; 3912 3913 kpreempt_disable(); 3914 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); /* locks pmap */ 3915 3916 if (pmap_pdes_valid(va, pdes, NULL)) { 3917 pt_entry_t *ptep = &ptes[pl1_i(va)]; 3918 pt_entry_t opte = *ptep; 3919 3920 #ifdef DIAGNOSTIC 3921 if (!pmap_valid_entry(opte)) 3922 panic("pmap_unwire: invalid (unmapped) va 0x%lx", va); 3923 #endif 3924 if ((opte & PG_W) != 0) { 3925 pt_entry_t npte = opte & ~PG_W; 3926 3927 opte = pmap_pte_testset(ptep, npte); 3928 pmap_stats_update_bypte(pmap, npte, opte); 3929 } 3930 #ifdef DIAGNOSTIC 3931 else { 3932 printf("pmap_unwire: wiring for pmap %p va 0x%lx " 3933 "didn't change!\n", pmap, va); 3934 } 3935 #endif 3936 pmap_unmap_ptes(pmap, pmap2); /* unlocks map */ 3937 } 3938 #ifdef DIAGNOSTIC 3939 else { 3940 panic("pmap_unwire: invalid PDE"); 3941 } 3942 #endif 3943 kpreempt_enable(); 3944 } 3945 3946 /* 3947 * pmap_copy: copy mappings from one pmap to another 3948 * 3949 * => optional function 3950 * void pmap_copy(dst_pmap, src_pmap, dst_addr, len, src_addr) 3951 */ 3952 3953 /* 3954 * defined as macro in pmap.h 3955 */ 3956 3957 __weak_alias(pmap_enter, pmap_enter_default); 3958 3959 int 3960 pmap_enter_default(pmap_t pmap, vaddr_t va, paddr_t pa, vm_prot_t prot, 3961 u_int flags) 3962 { 3963 return pmap_enter_ma(pmap, va, pa, pa, prot, flags, 0); 3964 } 3965 3966 /* 3967 * pmap_enter: enter a mapping into a pmap 3968 * 3969 * => must be done "now" ... no lazy-evaluation 3970 * => we set pmap => pv_head locking 3971 */ 3972 int 3973 pmap_enter_ma(struct pmap *pmap, vaddr_t va, paddr_t ma, paddr_t pa, 3974 vm_prot_t prot, u_int flags, int domid) 3975 { 3976 pt_entry_t *ptes, opte, npte; 3977 pt_entry_t *ptep; 3978 pd_entry_t * const *pdes; 3979 struct vm_page *ptp, *pg; 3980 struct pmap_page *new_pp; 3981 struct pmap_page *old_pp; 3982 struct pv_entry *old_pve = NULL; 3983 struct pv_entry *new_pve; 3984 struct pv_entry *new_pve2; 3985 int error; 3986 bool wired = (flags & PMAP_WIRED) != 0; 3987 struct pmap *pmap2; 3988 3989 KASSERT(pmap_initialized); 3990 KASSERT(curlwp->l_md.md_gc_pmap != pmap); 3991 3992 #ifdef DIAGNOSTIC 3993 /* sanity check: totally out of range? 
*/ 3994 if (va >= VM_MAX_KERNEL_ADDRESS) 3995 panic("pmap_enter: too big"); 3996 3997 if (va == (vaddr_t) PDP_BASE || va == (vaddr_t) APDP_BASE) 3998 panic("pmap_enter: trying to map over PDP/APDP!"); 3999 4000 /* sanity check: kernel PTPs should already have been pre-allocated */ 4001 if (va >= VM_MIN_KERNEL_ADDRESS && 4002 !pmap_valid_entry(pmap->pm_pdir[pl_i(va, PTP_LEVELS)])) 4003 panic("pmap_enter: missing kernel PTP for va %lx!", va); 4004 #endif /* DIAGNOSTIC */ 4005 #ifdef XEN 4006 KASSERT(domid == DOMID_SELF || pa == 0); 4007 #endif /* XEN */ 4008 4009 npte = ma | protection_codes[prot] | PG_V; 4010 npte |= pmap_pat_flags(flags); 4011 if (wired) 4012 npte |= PG_W; 4013 if (va < VM_MAXUSER_ADDRESS) 4014 npte |= PG_u; 4015 else if (va < VM_MAX_ADDRESS) 4016 npte |= (PG_u | PG_RW); /* XXXCDC: no longer needed? */ 4017 else 4018 npte |= PG_k; 4019 if (pmap == pmap_kernel()) 4020 npte |= pmap_pg_g; 4021 if (flags & VM_PROT_ALL) { 4022 npte |= PG_U; 4023 if (flags & VM_PROT_WRITE) { 4024 KASSERT((npte & PG_RW) != 0); 4025 npte |= PG_M; 4026 } 4027 } 4028 4029 #ifdef XEN 4030 if (domid != DOMID_SELF) 4031 pg = NULL; 4032 else 4033 #endif 4034 pg = PHYS_TO_VM_PAGE(pa); 4035 if (pg != NULL) { 4036 /* This is a managed page */ 4037 npte |= PG_PVLIST; 4038 new_pp = VM_PAGE_TO_PP(pg); 4039 } else { 4040 new_pp = NULL; 4041 } 4042 4043 /* get pves. */ 4044 new_pve = pool_cache_get(&pmap_pv_cache, PR_NOWAIT); 4045 new_pve2 = pool_cache_get(&pmap_pv_cache, PR_NOWAIT); 4046 if (new_pve == NULL || new_pve2 == NULL) { 4047 if (flags & PMAP_CANFAIL) { 4048 error = ENOMEM; 4049 goto out2; 4050 } 4051 panic("pmap_enter: pve allocation failed"); 4052 } 4053 4054 kpreempt_disable(); 4055 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); /* locks pmap */ 4056 if (pmap == pmap_kernel()) { 4057 ptp = NULL; 4058 } else { 4059 ptp = pmap_get_ptp(pmap, va, pdes); 4060 if (ptp == NULL) { 4061 pmap_unmap_ptes(pmap, pmap2); 4062 if (flags & PMAP_CANFAIL) { 4063 error = ENOMEM; 4064 goto out; 4065 } 4066 panic("pmap_enter: get ptp failed"); 4067 } 4068 } 4069 4070 /* 4071 * update the pte. 4072 */ 4073 4074 ptep = &ptes[pl1_i(va)]; 4075 do { 4076 opte = *ptep; 4077 4078 /* 4079 * if the same page, inherit PG_U and PG_M. 4080 */ 4081 if (((opte ^ npte) & (PG_FRAME | PG_V)) == 0) { 4082 npte |= opte & (PG_U | PG_M); 4083 } 4084 #if defined(XEN) 4085 if (domid != DOMID_SELF) { 4086 /* pmap_pte_cas with error handling */ 4087 int s = splvm(); 4088 if (opte != *ptep) { 4089 splx(s); 4090 continue; 4091 } 4092 error = xpq_update_foreign( 4093 vtomach((vaddr_t)ptep), npte, domid); 4094 splx(s); 4095 if (error) { 4096 if (ptp != NULL && ptp->wire_count <= 1) { 4097 pmap_free_ptp(pmap, ptp, va, ptes, pdes); 4098 } 4099 pmap_unmap_ptes(pmap, pmap2); 4100 goto out; 4101 } 4102 break; 4103 } 4104 #endif /* defined(XEN) */ 4105 } while (pmap_pte_cas(ptep, opte, npte) != opte); 4106 4107 /* 4108 * update statistics and PTP's reference count. 4109 */ 4110 4111 pmap_stats_update_bypte(pmap, npte, opte); 4112 if (ptp != NULL && !pmap_valid_entry(opte)) { 4113 ptp->wire_count++; 4114 } 4115 KASSERT(ptp == NULL || ptp->wire_count > 1); 4116 4117 /* 4118 * if the same page, we can skip pv_entry handling. 4119 */ 4120 4121 if (((opte ^ npte) & (PG_FRAME | PG_V)) == 0) { 4122 KASSERT(((opte ^ npte) & PG_PVLIST) == 0); 4123 goto same_pa; 4124 } 4125 4126 /* 4127 * if old page is managed, remove pv_entry from its list. 
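 * (the pv_entry removed here, if any, is returned to the pool at the
 * end of this function, after the pmap has been unlocked.)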
4128 */ 4129 4130 if ((~opte & (PG_V | PG_PVLIST)) == 0) { 4131 pg = PHYS_TO_VM_PAGE(pmap_pte2pa(opte)); 4132 #ifdef DIAGNOSTIC 4133 if (pg == NULL) 4134 panic("pmap_enter: PG_PVLIST mapping with " 4135 "unmanaged page " 4136 "pa = 0x%" PRIx64 " (0x%" PRIx64 ")", 4137 (int64_t)pa, (int64_t)atop(pa)); 4138 #endif 4139 old_pp = VM_PAGE_TO_PP(pg); 4140 4141 pp_lock(old_pp); 4142 old_pve = pmap_remove_pv(old_pp, ptp, va); 4143 old_pp->pp_attrs |= opte; 4144 pp_unlock(old_pp); 4145 } 4146 4147 /* 4148 * if new page is managed, insert pv_entry into its list. 4149 */ 4150 4151 if (new_pp) { 4152 pp_lock(new_pp); 4153 new_pve = pmap_enter_pv(new_pp, new_pve, &new_pve2, ptp, va); 4154 pp_unlock(new_pp); 4155 } 4156 4157 same_pa: 4158 pmap_unmap_ptes(pmap, pmap2); 4159 4160 /* 4161 * shootdown tlb if necessary. 4162 */ 4163 4164 if ((~opte & (PG_V | PG_U)) == 0 && 4165 ((opte ^ npte) & (PG_FRAME | PG_RW)) != 0) { 4166 pmap_tlb_shootdown(pmap, va, 0, opte); 4167 } 4168 4169 error = 0; 4170 out: 4171 kpreempt_enable(); 4172 out2: 4173 if (old_pve != NULL) { 4174 pool_cache_put(&pmap_pv_cache, old_pve); 4175 } 4176 if (new_pve != NULL) { 4177 pool_cache_put(&pmap_pv_cache, new_pve); 4178 } 4179 if (new_pve2 != NULL) { 4180 pool_cache_put(&pmap_pv_cache, new_pve2); 4181 } 4182 4183 return error; 4184 } 4185 4186 static bool 4187 pmap_get_physpage(vaddr_t va, int level, paddr_t *paddrp) 4188 { 4189 struct vm_page *ptp; 4190 struct pmap *kpm = pmap_kernel(); 4191 4192 if (uvm.page_init_done == false) { 4193 /* 4194 * we're growing the kernel pmap early (from 4195 * uvm_pageboot_alloc()). this case must be 4196 * handled a little differently. 4197 */ 4198 4199 if (uvm_page_physget(paddrp) == false) 4200 panic("pmap_get_physpage: out of memory"); 4201 kpreempt_disable(); 4202 pmap_pte_set(early_zero_pte, 4203 pmap_pa2pte(*paddrp) | PG_V | PG_RW | PG_k); 4204 pmap_pte_flush(); 4205 pmap_update_pg((vaddr_t)early_zerop); 4206 memset(early_zerop, 0, PAGE_SIZE); 4207 #if defined(DIAGNOSTIC) || defined (XEN) 4208 pmap_pte_set(early_zero_pte, 0); 4209 pmap_pte_flush(); 4210 #endif /* defined(DIAGNOSTIC) */ 4211 kpreempt_enable(); 4212 } else { 4213 /* XXX */ 4214 PMAP_SUBOBJ_LOCK(kpm, level - 1); 4215 ptp = uvm_pagealloc(&kpm->pm_obj[level - 1], 4216 ptp_va2o(va, level), NULL, 4217 UVM_PGA_USERESERVE|UVM_PGA_ZERO); 4218 PMAP_SUBOBJ_UNLOCK(kpm, level - 1); 4219 if (ptp == NULL) 4220 panic("pmap_get_physpage: out of memory"); 4221 ptp->flags &= ~PG_BUSY; 4222 ptp->wire_count = 1; 4223 *paddrp = VM_PAGE_TO_PHYS(ptp); 4224 } 4225 pmap_stats_update(kpm, 1, 0); 4226 return true; 4227 } 4228 4229 /* 4230 * Allocate the amount of specified ptps for a ptp level, and populate 4231 * all levels below accordingly, mapping virtual addresses starting at 4232 * kva. 4233 * 4234 * Used by pmap_growkernel. 
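 * needed_ptps[] is indexed by level - 1 and gives the number of new
 * PTPs wanted at each level; nkptp[] is bumped as they are installed.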
4235 */ 4236 static void 4237 pmap_alloc_level(pd_entry_t * const *pdes, vaddr_t kva, int lvl, 4238 long *needed_ptps) 4239 { 4240 unsigned long i; 4241 vaddr_t va; 4242 paddr_t pa; 4243 unsigned long index, endindex; 4244 int level; 4245 pd_entry_t *pdep; 4246 #ifdef XEN 4247 int s = splvm(); /* protect xpq_* */ 4248 #endif 4249 4250 for (level = lvl; level > 1; level--) { 4251 if (level == PTP_LEVELS) 4252 pdep = pmap_kernel()->pm_pdir; 4253 else 4254 pdep = pdes[level - 2]; 4255 va = kva; 4256 index = pl_i_roundup(kva, level); 4257 endindex = index + needed_ptps[level - 1] - 1; 4258 4259 4260 for (i = index; i <= endindex; i++) { 4261 KASSERT(!pmap_valid_entry(pdep[i])); 4262 pmap_get_physpage(va, level - 1, &pa); 4263 #ifdef XEN 4264 xpq_queue_pte_update((level == PTP_LEVELS) ? 4265 xpmap_ptom(pmap_pdirpa(pmap_kernel(), i)) : 4266 xpmap_ptetomach(&pdep[i]), 4267 pmap_pa2pte(pa) | PG_k | PG_V | PG_RW); 4268 #ifdef PAE 4269 if (level == PTP_LEVELS && i > L2_SLOT_KERN) { 4270 /* update real kernel PD too */ 4271 xpq_queue_pte_update( 4272 xpmap_ptetomach(&pmap_kl2pd[l2tol2(i)]), 4273 pmap_pa2pte(pa) | PG_k | PG_V | PG_RW); 4274 } 4275 #endif 4276 #else /* XEN */ 4277 pdep[i] = pmap_pa2pte(pa) | PG_k | PG_V | PG_RW; 4278 #endif /* XEN */ 4279 KASSERT(level != PTP_LEVELS || nkptp[level - 1] + 4280 pl_i(VM_MIN_KERNEL_ADDRESS, level) == i); 4281 nkptp[level - 1]++; 4282 va += nbpd[level - 1]; 4283 } 4284 pmap_pte_flush(); 4285 } 4286 #ifdef XEN 4287 splx(s); 4288 #endif 4289 } 4290 4291 /* 4292 * pmap_growkernel: increase usage of KVM space 4293 * 4294 * => we allocate new PTPs for the kernel and install them in all 4295 * the pmaps on the system. 4296 */ 4297 4298 vaddr_t 4299 pmap_growkernel(vaddr_t maxkvaddr) 4300 { 4301 struct pmap *kpm = pmap_kernel(); 4302 #if !defined(XEN) || !defined(__x86_64__) 4303 struct pmap *pm; 4304 #endif 4305 int s, i; 4306 long needed_kptp[PTP_LEVELS], target_nptp, old; 4307 bool invalidate = false; 4308 4309 s = splvm(); /* to be safe */ 4310 mutex_enter(&kpm->pm_lock); 4311 4312 if (maxkvaddr <= pmap_maxkvaddr) { 4313 mutex_exit(&kpm->pm_lock); 4314 splx(s); 4315 return pmap_maxkvaddr; 4316 } 4317 4318 maxkvaddr = x86_round_pdr(maxkvaddr); 4319 old = nkptp[PTP_LEVELS - 1]; 4320 /* 4321 * This loop could be optimized more, but pmap_growkernel() 4322 * is called infrequently. 4323 */ 4324 for (i = PTP_LEVELS - 1; i >= 1; i--) { 4325 target_nptp = pl_i_roundup(maxkvaddr, i + 1) - 4326 pl_i_roundup(VM_MIN_KERNEL_ADDRESS, i + 1); 4327 /* 4328 * XXX only need to check toplevel. 4329 */ 4330 if (target_nptp > nkptpmax[i]) 4331 panic("out of KVA space"); 4332 KASSERT(target_nptp >= nkptp[i]); 4333 needed_kptp[i] = target_nptp - nkptp[i]; 4334 } 4335 4336 pmap_alloc_level(normal_pdes, pmap_maxkvaddr, PTP_LEVELS, needed_kptp); 4337 4338 /* 4339 * If the number of top level entries changed, update all 4340 * pmaps. 
4341 */ 4342 if (needed_kptp[PTP_LEVELS - 1] != 0) { 4343 #ifdef XEN 4344 #ifdef __x86_64__ 4345 /* nothing, kernel entries are never entered in user pmap */ 4346 #else /* __x86_64__ */ 4347 mutex_enter(&pmaps_lock); 4348 LIST_FOREACH(pm, &pmaps, pm_list) { 4349 int pdkidx; 4350 for (pdkidx = PDIR_SLOT_KERN + old; 4351 pdkidx < PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1]; 4352 pdkidx++) { 4353 xpq_queue_pte_update( 4354 xpmap_ptom(pmap_pdirpa(pm, pdkidx)), 4355 kpm->pm_pdir[pdkidx]); 4356 } 4357 xpq_flush_queue(); 4358 } 4359 mutex_exit(&pmaps_lock); 4360 #endif /* __x86_64__ */ 4361 #else /* XEN */ 4362 unsigned newpdes; 4363 newpdes = nkptp[PTP_LEVELS - 1] - old; 4364 mutex_enter(&pmaps_lock); 4365 LIST_FOREACH(pm, &pmaps, pm_list) { 4366 memcpy(&pm->pm_pdir[PDIR_SLOT_KERN + old], 4367 &kpm->pm_pdir[PDIR_SLOT_KERN + old], 4368 newpdes * sizeof (pd_entry_t)); 4369 } 4370 mutex_exit(&pmaps_lock); 4371 #endif 4372 invalidate = true; 4373 } 4374 pmap_maxkvaddr = maxkvaddr; 4375 mutex_exit(&kpm->pm_lock); 4376 splx(s); 4377 4378 if (invalidate) { 4379 /* Invalidate the PDP cache. */ 4380 pool_cache_invalidate(&pmap_pdp_cache); 4381 } 4382 4383 return maxkvaddr; 4384 } 4385 4386 #ifdef DEBUG 4387 void pmap_dump(struct pmap *, vaddr_t, vaddr_t); 4388 4389 /* 4390 * pmap_dump: dump all the mappings from a pmap 4391 * 4392 * => caller should not be holding any pmap locks 4393 */ 4394 4395 void 4396 pmap_dump(struct pmap *pmap, vaddr_t sva, vaddr_t eva) 4397 { 4398 pt_entry_t *ptes, *pte; 4399 pd_entry_t * const *pdes; 4400 struct pmap *pmap2; 4401 vaddr_t blkendva; 4402 4403 /* 4404 * if end is out of range truncate. 4405 * if (end == start) update to max. 4406 */ 4407 4408 if (eva > VM_MAXUSER_ADDRESS || eva <= sva) 4409 eva = VM_MAXUSER_ADDRESS; 4410 4411 /* 4412 * we lock in the pmap => pv_head direction 4413 */ 4414 4415 kpreempt_disable(); 4416 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); /* locks pmap */ 4417 4418 /* 4419 * dumping a range of pages: we dump in PTP sized blocks (4MB) 4420 */ 4421 4422 for (/* null */ ; sva < eva ; sva = blkendva) { 4423 4424 /* determine range of block */ 4425 blkendva = x86_round_pdr(sva+1); 4426 if (blkendva > eva) 4427 blkendva = eva; 4428 4429 /* valid block? 

void
pmap_dump(struct pmap *pmap, vaddr_t sva, vaddr_t eva)
{
	pt_entry_t *ptes, *pte;
	pd_entry_t * const *pdes;
	struct pmap *pmap2;
	vaddr_t blkendva;

	/*
	 * if end is out of range truncate.
	 * if (end == start) update to max.
	 */

	if (eva > VM_MAXUSER_ADDRESS || eva <= sva)
		eva = VM_MAXUSER_ADDRESS;

	/*
	 * we lock in the pmap => pv_head direction
	 */

	kpreempt_disable();
	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);	/* locks pmap */

	/*
	 * dumping a range of pages: we dump in PTP sized blocks (4MB)
	 */

	for (/* null */ ; sva < eva ; sva = blkendva) {

		/* determine range of block */
		blkendva = x86_round_pdr(sva+1);
		if (blkendva > eva)
			blkendva = eva;

		/* valid block? */
		if (!pmap_pdes_valid(sva, pdes, NULL))
			continue;

		pte = &ptes[pl1_i(sva)];
		for (/* null */; sva < blkendva ; sva += PAGE_SIZE, pte++) {
			if (!pmap_valid_entry(*pte))
				continue;
			printf("va %#" PRIxVADDR " -> pa %#" PRIxPADDR
			    " (pte=%#" PRIxPADDR ")\n",
			    sva, (paddr_t)pmap_pte2pa(*pte), (paddr_t)*pte);
		}
	}
	pmap_unmap_ptes(pmap, pmap2);
	kpreempt_enable();
}
#endif

/*
 * pmap_tlb_shootdown: invalidate pages on all CPUs using pmap 'pm'
 *
 * => always invalidates locally before returning
 * => returns before remote CPUs have invalidated
 * => must be called with preemption disabled
 */
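
/*
 * Sketch of the usual pattern in this file (see pmap_enter() and
 * pmap_update(); 'ptep'/'npte' are placeholders): the PTE is changed
 * first, the shootdown is queued, and a later pmap_update() waits:
 *
 *	kpreempt_disable();
 *	opte = pmap_pte_testset(ptep, npte);	(or a pmap_pte_cas() loop)
 *	pmap_tlb_shootdown(pmap, va, 0, opte);
 *	kpreempt_enable();
 *	...
 *	pmap_update(pmap);	(eventually calls pmap_tlb_shootwait())
 */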
4557 */ 4558 KASSERT(pte == 0); 4559 4560 /* 4561 * Take ownership of the shootdown mailbox on each 4562 * CPU, fill the details and fire it off. 4563 */ 4564 s = splvm(); 4565 for (CPU_INFO_FOREACH(cii, ci)) { 4566 if (ci == self || 4567 !pmap_is_active(pm, ci, kernel) || 4568 !(ci->ci_flags & CPUF_RUNNING)) 4569 continue; 4570 selfmb->mb_head++; 4571 mb = &ci->ci_pmap_cpu->pc_mbox; 4572 count = SPINLOCK_BACKOFF_MIN; 4573 while (atomic_cas_ulong( 4574 (u_long *)&mb->mb_pointer, 4575 0, (u_long)&selfmb->mb_tail) != 0) { 4576 splx(s); 4577 while (mb->mb_pointer != 0) 4578 SPINLOCK_BACKOFF(count); 4579 s = splvm(); 4580 } 4581 mb->mb_addr1 = sva; 4582 mb->mb_addr2 = eva; 4583 mb->mb_global = pte; 4584 if (x86_ipi(LAPIC_TLB_MCAST_VECTOR, 4585 ci->ci_cpuid, LAPIC_DLMODE_FIXED)) 4586 panic("pmap_tlb_shootdown: ipi failed"); 4587 } 4588 self->ci_need_tlbwait = 1; 4589 splx(s); 4590 } 4591 } 4592 #endif /* MULTIPROCESSOR */ 4593 4594 /* Update the current CPU before waiting for others. */ 4595 if (!pmap_is_active(pm, self, kernel)) 4596 return; 4597 4598 if (sva == (vaddr_t)-1LL) { 4599 u_int gen = uvm_emap_gen_return(); 4600 if (pte != 0) { 4601 tlbflushg(); 4602 } else { 4603 tlbflush(); 4604 } 4605 uvm_emap_update(gen); 4606 } else { 4607 do { 4608 pmap_update_pg(sva); 4609 sva += PAGE_SIZE; 4610 } while (sva < eva); 4611 } 4612 } 4613 4614 /* 4615 * pmap_tlb_shootwait: wait for pending TLB shootdowns to complete 4616 * 4617 * => only waits for operations generated by the current CPU 4618 * => must be called with preemption disabled 4619 */ 4620 4621 void 4622 pmap_tlb_shootwait(void) 4623 { 4624 struct cpu_info *self; 4625 struct pmap_mbox *mb; 4626 4627 KASSERT(kpreempt_disabled()); 4628 4629 /* 4630 * Anything to do? XXX Really we want to avoid touching the cache 4631 * lines of the two mailboxes, but the processor may read ahead. 4632 */ 4633 self = curcpu(); 4634 if (!self->ci_need_tlbwait) 4635 return; 4636 self->ci_need_tlbwait = 0; 4637 4638 /* If we own the global mailbox, wait for it to drain. */ 4639 mb = &pmap_mbox; 4640 while (mb->mb_pointer == self && mb->mb_head != mb->mb_tail) 4641 x86_pause(); 4642 4643 /* If we own other CPU's mailboxes, wait for them to drain. */ 4644 mb = &self->ci_pmap_cpu->pc_mbox; 4645 KASSERT(mb->mb_pointer != &mb->mb_tail); 4646 while (mb->mb_head != mb->mb_tail) 4647 x86_pause(); 4648 } 4649 4650 /* 4651 * pmap_update: process deferred invalidations 4652 */ 4653 4654 void 4655 pmap_update(struct pmap *pmap) 4656 { 4657 struct vm_page *ptp, *empty_ptps; 4658 struct pmap_page *pp; 4659 lwp_t *l; 4660 4661 /* 4662 * if we have torn down this pmap, invalidate non-global TLB 4663 * entries on any processors using it. 4664 */ 4665 l = curlwp; 4666 if (__predict_false(l->l_md.md_gc_pmap == pmap)) { 4667 l->l_md.md_gc_pmap = NULL; 4668 KPREEMPT_DISABLE(l); 4669 pmap_tlb_shootdown(pmap, -1, -1, 0); 4670 KPREEMPT_ENABLE(l); 4671 } 4672 4673 /* 4674 * wait for tlb shootdowns to complete before returning control 4675 * to the caller. 4676 */ 4677 kpreempt_disable(); 4678 pmap_tlb_shootwait(); 4679 kpreempt_enable(); 4680 4681 /* 4682 * now that shootdowns are complete, process deferred frees, 4683 * but not from interrupt context. 
4684 */ 4685 if (l->l_md.md_gc_ptp != NULL) { 4686 if (cpu_intr_p() || (l->l_pflag & LP_INTR) != 0) { 4687 return; 4688 } 4689 4690 empty_ptps = l->l_md.md_gc_ptp; 4691 l->l_md.md_gc_ptp = NULL; 4692 4693 while ((ptp = empty_ptps) != NULL) { 4694 ptp->flags |= PG_ZERO; 4695 pp = VM_PAGE_TO_PP(ptp); 4696 empty_ptps = pp->pp_link; 4697 LIST_INIT(&pp->pp_head.pvh_list); 4698 uvm_pagefree(ptp); 4699 } 4700 } 4701 } 4702 4703 #if PTP_LEVELS > 4 4704 #error "Unsupported number of page table mappings" 4705 #endif 4706 4707 paddr_t 4708 pmap_init_tmp_pgtbl(paddr_t pg) 4709 { 4710 static bool maps_loaded; 4711 static const paddr_t x86_tmp_pml_paddr[] = { 4712 4 * PAGE_SIZE, 4713 5 * PAGE_SIZE, 4714 6 * PAGE_SIZE, 4715 7 * PAGE_SIZE 4716 }; 4717 static vaddr_t x86_tmp_pml_vaddr[] = { 0, 0, 0, 0 }; 4718 4719 pd_entry_t *tmp_pml, *kernel_pml; 4720 4721 int level; 4722 4723 if (!maps_loaded) { 4724 for (level = 0; level < PTP_LEVELS; ++level) { 4725 x86_tmp_pml_vaddr[level] = 4726 uvm_km_alloc(kernel_map, PAGE_SIZE, 0, 4727 UVM_KMF_VAONLY); 4728 4729 if (x86_tmp_pml_vaddr[level] == 0) 4730 panic("mapping of real mode PML failed\n"); 4731 pmap_kenter_pa(x86_tmp_pml_vaddr[level], 4732 x86_tmp_pml_paddr[level], 4733 VM_PROT_READ | VM_PROT_WRITE, 0); 4734 pmap_update(pmap_kernel()); 4735 } 4736 maps_loaded = true; 4737 } 4738 4739 /* Zero levels 1-3 */ 4740 for (level = 0; level < PTP_LEVELS - 1; ++level) { 4741 tmp_pml = (void *)x86_tmp_pml_vaddr[level]; 4742 memset(tmp_pml, 0, PAGE_SIZE); 4743 } 4744 4745 /* Copy PML4 */ 4746 kernel_pml = pmap_kernel()->pm_pdir; 4747 tmp_pml = (void *)x86_tmp_pml_vaddr[PTP_LEVELS - 1]; 4748 memcpy(tmp_pml, kernel_pml, PAGE_SIZE); 4749 4750 #ifdef PAE 4751 /* 4752 * Use the last 4 entries of the L2 page as L3 PD entries. These 4753 * last entries are unlikely to be used for temporary mappings. 4754 * 508: maps 0->1GB (userland) 4755 * 509: unused 4756 * 510: unused 4757 * 511: maps 3->4GB (kernel) 4758 */ 4759 tmp_pml[508] = x86_tmp_pml_paddr[PTP_LEVELS - 1] | PG_V; 4760 tmp_pml[509] = 0; 4761 tmp_pml[510] = 0; 4762 tmp_pml[511] = pmap_pdirpa(pmap_kernel(),PDIR_SLOT_KERN) | PG_V; 4763 #endif 4764 4765 for (level = PTP_LEVELS - 1; level > 0; --level) { 4766 tmp_pml = (void *)x86_tmp_pml_vaddr[level]; 4767 4768 tmp_pml[pl_i(pg, level + 1)] = 4769 (x86_tmp_pml_paddr[level - 1] & PG_FRAME) | PG_RW | PG_V; 4770 } 4771 4772 tmp_pml = (void *)x86_tmp_pml_vaddr[0]; 4773 tmp_pml[pl_i(pg, 1)] = (pg & PG_FRAME) | PG_RW | PG_V; 4774 4775 #ifdef PAE 4776 /* Return the PA of the L3 page (entry 508 of the L2 page) */ 4777 return x86_tmp_pml_paddr[PTP_LEVELS - 1] + 508 * sizeof(pd_entry_t); 4778 #endif 4779 4780 return x86_tmp_pml_paddr[PTP_LEVELS - 1]; 4781 } 4782