1 /* $NetBSD: pmap.c,v 1.119 2011/04/14 16:00:21 yamt Exp $ */ 2 3 /* 4 * Copyright (c) 2007 Manuel Bouyer. 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions 8 * are met: 9 * 1. Redistributions of source code must retain the above copyright 10 * notice, this list of conditions and the following disclaimer. 11 * 2. Redistributions in binary form must reproduce the above copyright 12 * notice, this list of conditions and the following disclaimer in the 13 * documentation and/or other materials provided with the distribution. 14 * 15 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 16 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 17 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 18 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 19 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 20 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 21 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 22 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 24 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 * 26 */ 27 28 /* 29 * Copyright (c) 2006 Mathieu Ropert <mro@adviseo.fr> 30 * 31 * Permission to use, copy, modify, and distribute this software for any 32 * purpose with or without fee is hereby granted, provided that the above 33 * copyright notice and this permission notice appear in all copies. 34 * 35 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 36 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 37 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 38 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 39 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 40 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 41 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 42 */ 43 44 /* 45 * Copyright (c) 1997 Charles D. Cranor and Washington University. 46 * All rights reserved. 47 * 48 * Redistribution and use in source and binary forms, with or without 49 * modification, are permitted provided that the following conditions 50 * are met: 51 * 1. Redistributions of source code must retain the above copyright 52 * notice, this list of conditions and the following disclaimer. 53 * 2. Redistributions in binary form must reproduce the above copyright 54 * notice, this list of conditions and the following disclaimer in the 55 * documentation and/or other materials provided with the distribution. 56 * 57 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 58 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 59 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
60 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 61 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 62 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 63 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 64 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 65 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 66 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 67 */ 68 69 /* 70 * Copyright 2001 (c) Wasabi Systems, Inc. 71 * All rights reserved. 72 * 73 * Written by Frank van der Linden for Wasabi Systems, Inc. 74 * 75 * Redistribution and use in source and binary forms, with or without 76 * modification, are permitted provided that the following conditions 77 * are met: 78 * 1. Redistributions of source code must retain the above copyright 79 * notice, this list of conditions and the following disclaimer. 80 * 2. Redistributions in binary form must reproduce the above copyright 81 * notice, this list of conditions and the following disclaimer in the 82 * documentation and/or other materials provided with the distribution. 83 * 3. All advertising materials mentioning features or use of this software 84 * must display the following acknowledgement: 85 * This product includes software developed for the NetBSD Project by 86 * Wasabi Systems, Inc. 87 * 4. The name of Wasabi Systems, Inc. may not be used to endorse 88 * or promote products derived from this software without specific prior 89 * written permission. 90 * 91 * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND 92 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 93 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 94 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL WASABI SYSTEMS, INC 95 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 96 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 97 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 98 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 99 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 100 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 101 * POSSIBILITY OF SUCH DAMAGE. 102 */ 103 104 /* 105 * This is the i386 pmap modified and generalized to support x86-64 106 * as well. The idea is to hide the upper N levels of the page tables 107 * inside pmap_get_ptp, pmap_free_ptp and pmap_growkernel. The rest 108 * is mostly untouched, except that it uses some more generalized 109 * macros and interfaces. 110 * 111 * This pmap has been tested on the i386 as well, and it can be easily 112 * adapted to PAE. 113 * 114 * fvdl@wasabisystems.com 18-Jun-2001 115 */ 116 117 /* 118 * pmap.c: i386 pmap module rewrite 119 * Chuck Cranor <chuck@netbsd> 120 * 11-Aug-97 121 * 122 * history of this pmap module: in addition to my own input, i used 123 * the following references for this rewrite of the i386 pmap: 124 * 125 * [1] the NetBSD i386 pmap. this pmap appears to be based on the 126 * BSD hp300 pmap done by Mike Hibler at University of Utah. 127 * it was then ported to the i386 by William Jolitz of UUNET 128 * Technologies, Inc. Then Charles M. Hannum of the NetBSD 129 * project fixed some bugs and provided some speed ups. 130 * 131 * [2] the FreeBSD i386 pmap. 
this pmap seems to be the 132 * Hibler/Jolitz pmap, as modified for FreeBSD by John S. Dyson 133 * and David Greenman. 134 * 135 * [3] the Mach pmap. this pmap, from CMU, seems to have migrated 136 * between several processors. the VAX version was done by 137 * Avadis Tevanian, Jr., and Michael Wayne Young. the i386 138 * version was done by Lance Berc, Mike Kupfer, Bob Baron, 139 * David Golub, and Richard Draves. the alpha version was 140 * done by Alessandro Forin (CMU/Mach) and Chris Demetriou 141 * (NetBSD/alpha). 142 */ 143 144 #include <sys/cdefs.h> 145 __KERNEL_RCSID(0, "$NetBSD: pmap.c,v 1.119 2011/04/14 16:00:21 yamt Exp $"); 146 147 #include "opt_user_ldt.h" 148 #include "opt_lockdebug.h" 149 #include "opt_multiprocessor.h" 150 #include "opt_xen.h" 151 #if !defined(__x86_64__) 152 #include "opt_kstack_dr0.h" 153 #endif /* !defined(__x86_64__) */ 154 155 #include <sys/param.h> 156 #include <sys/systm.h> 157 #include <sys/proc.h> 158 #include <sys/pool.h> 159 #include <sys/kernel.h> 160 #include <sys/atomic.h> 161 #include <sys/cpu.h> 162 #include <sys/intr.h> 163 #include <sys/xcall.h> 164 165 #include <uvm/uvm.h> 166 167 #include <dev/isa/isareg.h> 168 169 #include <machine/specialreg.h> 170 #include <machine/gdt.h> 171 #include <machine/isa_machdep.h> 172 #include <machine/cpuvar.h> 173 174 #include <x86/pmap.h> 175 #include <x86/pmap_pv.h> 176 177 #include <x86/i82489reg.h> 178 #include <x86/i82489var.h> 179 180 #ifdef XEN 181 #include <xen/xen3-public/xen.h> 182 #include <xen/hypervisor.h> 183 #endif 184 185 /* flag to be used for kernel mappings: PG_u on Xen/amd64, 0 otherwise */ 186 #if defined(XEN) && defined(__x86_64__) 187 #define PG_k PG_u 188 #else 189 #define PG_k 0 190 #endif 191 192 /* 193 * general info: 194 * 195 * - for an explanation of how the i386 MMU hardware works see 196 * the comments in <machine/pte.h>. 197 * 198 * - for an explanation of the general memory structure used by 199 * this pmap (including the recursive mapping), see the comments 200 * in <machine/pmap.h>. 201 * 202 * this file contains the code for the "pmap module." the module's 203 * job is to manage the hardware's virtual to physical address mappings. 204 * note that there are two levels of mapping in the VM system: 205 * 206 * [1] the upper layer of the VM system uses vm_map's and vm_map_entry's 207 * to map ranges of virtual address space to objects/files. for 208 * example, the vm_map may say: "map VA 0x1000 to 0x22000 read-only 209 * to the file /bin/ls starting at offset zero." note that 210 * the upper layer mapping is not concerned with how individual 211 * vm_pages are mapped. 212 * 213 * [2] the lower layer of the VM system (the pmap) maintains the mappings 214 * from virtual addresses. it is concerned with which vm_page is 215 * mapped where. for example, when you run /bin/ls and start 216 * at page 0x1000 the fault routine may lookup the correct page 217 * of the /bin/ls file and then ask the pmap layer to establish 218 * a mapping for it. 219 * 220 * note that information in the lower layer of the VM system can be 221 * thrown away since it can easily be reconstructed from the info 222 * in the upper layer. 223 * 224 * data structures we use include: 225 * 226 * - struct pmap: describes the address space of one thread 227 * - struct pv_entry: describes one <PMAP,VA> mapping of a PA 228 * - struct pv_head: there is one pv_head per managed page of 229 * physical memory. 
the pv_head points to a list of pv_entry 230 * structures which describe all the <PMAP,VA> pairs that this 231 * page is mapped in. this is critical for page based operations 232 * such as pmap_page_protect() [change protection on _all_ mappings 233 * of a page] 234 */ 235 236 /* 237 * memory allocation 238 * 239 * - there are three data structures that we must dynamically allocate: 240 * 241 * [A] new process' page directory page (PDP) 242 * - plan 1: done at pmap_create() we use 243 * uvm_km_alloc(kernel_map, PAGE_SIZE) [fka kmem_alloc] to do this 244 * allocation. 245 * 246 * if we are low in free physical memory then we sleep in 247 * uvm_km_alloc -- in this case this is ok since we are creating 248 * a new pmap and should not be holding any locks. 249 * 250 * if the kernel is totally out of virtual space 251 * (i.e. uvm_km_alloc returns NULL), then we panic. 252 * 253 * [B] new page tables pages (PTP) 254 * - call uvm_pagealloc() 255 * => success: zero page, add to pm_pdir 256 * => failure: we are out of free vm_pages, let pmap_enter() 257 * tell UVM about it. 258 * 259 * note: for kernel PTPs, we start with NKPTP of them. as we map 260 * kernel memory (at uvm_map time) we check to see if we've grown 261 * the kernel pmap. if so, we call the optional function 262 * pmap_growkernel() to grow the kernel PTPs in advance. 263 * 264 * [C] pv_entry structures 265 */ 266 267 /* 268 * locking 269 * 270 * we have the following locks that we must contend with: 271 * 272 * mutexes: 273 * 274 * - pmap lock (per pmap, part of uvm_object) 275 * this lock protects the fields in the pmap structure including 276 * the non-kernel PDEs in the PDP, and the PTEs. it also locks 277 * in the alternate PTE space (since that is determined by the 278 * entry in the PDP). 279 * 280 * - pvh_lock (per pv_head) 281 * this lock protects the pv_entry list which is chained off the 282 * pv_head structure for a specific managed PA. it is locked 283 * when traversing the list (e.g. adding/removing mappings, 284 * syncing R/M bits, etc.) 285 * 286 * - pmaps_lock 287 * this lock protects the list of active pmaps (headed by "pmaps"). 288 * we lock it when adding or removing pmaps from this list. 289 * 290 * tlb shootdown 291 * 292 * tlb shootdowns are hard interrupts that operate outside the spl 293 * framework: they don't need to be blocked provided that the pmap module 294 * gets the order of events correct. the calls are made by talking directly 295 * to the lapic. the stubs to handle the interrupts are quite short and do 296 * one of the following: invalidate a single page, a range of pages, all 297 * user tlb entries or the entire tlb. 298 * 299 * the cpus synchronize with each other using pmap_mbox structures which are 300 * aligned on 64-byte cache lines. tlb shootdowns against the kernel pmap 301 * use a global mailbox and are generated using a broadcast ipi (broadcast 302 * to all but the sending cpu). shootdowns against regular pmaps use 303 * per-cpu mailboxes and are multicast. kernel and user shootdowns can 304 * execute simultaneously, as can shootdowns within different multithreaded 305 * processes. TODO: 306 * 307 * 1. figure out which waitpoints can be deferered to pmap_update(). 308 * 2. see if there is a cheap way to batch some updates. 
309 */ 310 311 const vaddr_t ptp_masks[] = PTP_MASK_INITIALIZER; 312 const int ptp_shifts[] = PTP_SHIFT_INITIALIZER; 313 const long nkptpmax[] = NKPTPMAX_INITIALIZER; 314 const long nbpd[] = NBPD_INITIALIZER; 315 pd_entry_t * const normal_pdes[] = PDES_INITIALIZER; 316 pd_entry_t * const alternate_pdes[] = APDES_INITIALIZER; 317 318 long nkptp[] = NKPTP_INITIALIZER; 319 320 static kmutex_t pmaps_lock; 321 322 static vaddr_t pmap_maxkvaddr; 323 324 #define COUNT(x) /* nothing */ 325 326 /* 327 * XXX kludge: dummy locking to make KASSERTs in uvm_page.c comfortable. 328 * actual locking is done by pm_lock. 329 */ 330 #if defined(DIAGNOSTIC) 331 #define PMAP_SUBOBJ_LOCK(pm, idx) \ 332 KASSERT(mutex_owned(&(pm)->pm_lock)); \ 333 if ((idx) != 0) \ 334 mutex_enter(&(pm)->pm_obj[(idx)].vmobjlock) 335 #define PMAP_SUBOBJ_UNLOCK(pm, idx) \ 336 KASSERT(mutex_owned(&(pm)->pm_lock)); \ 337 if ((idx) != 0) \ 338 mutex_exit(&(pm)->pm_obj[(idx)].vmobjlock) 339 #else /* defined(DIAGNOSTIC) */ 340 #define PMAP_SUBOBJ_LOCK(pm, idx) /* nothing */ 341 #define PMAP_SUBOBJ_UNLOCK(pm, idx) /* nothing */ 342 #endif /* defined(DIAGNOSTIC) */ 343 344 /* 345 * Misc. event counters. 346 */ 347 struct evcnt pmap_iobmp_evcnt; 348 struct evcnt pmap_ldt_evcnt; 349 350 /* 351 * Global TLB shootdown mailbox. 352 */ 353 struct evcnt pmap_tlb_evcnt __aligned(64); 354 struct pmap_mbox pmap_mbox __aligned(64); 355 356 /* 357 * PAT 358 */ 359 #define PATENTRY(n, type) (type << ((n) * 8)) 360 #define PAT_UC 0x0ULL 361 #define PAT_WC 0x1ULL 362 #define PAT_WT 0x4ULL 363 #define PAT_WP 0x5ULL 364 #define PAT_WB 0x6ULL 365 #define PAT_UCMINUS 0x7ULL 366 367 static bool cpu_pat_enabled = false; 368 369 370 /* 371 * Per-CPU data. The pmap mailbox is cache intensive so gets its 372 * own line. Note that the mailbox must be the first item. 373 */ 374 struct pmap_cpu { 375 /* TLB shootdown */ 376 struct pmap_mbox pc_mbox; 377 }; 378 379 union { 380 struct pmap_cpu pc; 381 uint8_t padding[64]; 382 } pmap_cpu[MAXCPUS] __aligned(64); 383 384 /* 385 * global data structures 386 */ 387 388 static struct pmap kernel_pmap_store; /* the kernel's pmap (proc0) */ 389 struct pmap *const kernel_pmap_ptr = &kernel_pmap_store; 390 391 /* 392 * pmap_pg_g: if our processor supports PG_G in the PTE then we 393 * set pmap_pg_g to PG_G (otherwise it is zero). 394 */ 395 396 int pmap_pg_g = 0; 397 398 /* 399 * pmap_largepages: if our processor supports PG_PS and we are 400 * using it, this is set to true. 401 */ 402 403 int pmap_largepages; 404 405 /* 406 * i386 physical memory comes in a big contig chunk with a small 407 * hole toward the front of it... the following two paddr_t's 408 * (shared with machdep.c) describe the physical address space 409 * of this machine. 
410 */ 411 paddr_t avail_start; /* PA of first available physical page */ 412 paddr_t avail_end; /* PA of last available physical page */ 413 414 #ifdef XEN 415 #ifdef __x86_64__ 416 /* Dummy PGD for user cr3, used between pmap_deactivate() and pmap_activate() */ 417 static paddr_t xen_dummy_user_pgd; 418 #endif /* __x86_64__ */ 419 paddr_t pmap_pa_start; /* PA of first physical page for this domain */ 420 paddr_t pmap_pa_end; /* PA of last physical page for this domain */ 421 #endif /* XEN */ 422 423 #define VM_PAGE_TO_PP(pg) (&(pg)->mdpage.mp_pp) 424 425 #define pp_lock(pp) mutex_spin_enter(&(pp)->pp_lock) 426 #define pp_unlock(pp) mutex_spin_exit(&(pp)->pp_lock) 427 #define pp_locked(pp) mutex_owned(&(pp)->pp_lock) 428 429 #define PV_HASH_SIZE 32768 430 #define PV_HASH_LOCK_CNT 32 431 432 struct pv_hash_lock { 433 kmutex_t lock; 434 } __aligned(CACHE_LINE_SIZE) pv_hash_locks[PV_HASH_LOCK_CNT] 435 __aligned(CACHE_LINE_SIZE); 436 437 struct pv_hash_head { 438 SLIST_HEAD(, pv_entry) hh_list; 439 } pv_hash_heads[PV_HASH_SIZE]; 440 441 static u_int 442 pvhash_hash(struct vm_page *ptp, vaddr_t va) 443 { 444 445 return (uintptr_t)ptp / sizeof(*ptp) + (va >> PAGE_SHIFT); 446 } 447 448 static struct pv_hash_head * 449 pvhash_head(u_int hash) 450 { 451 452 return &pv_hash_heads[hash % PV_HASH_SIZE]; 453 } 454 455 static kmutex_t * 456 pvhash_lock(u_int hash) 457 { 458 459 return &pv_hash_locks[hash % PV_HASH_LOCK_CNT].lock; 460 } 461 462 static struct pv_entry * 463 pvhash_remove(struct pv_hash_head *hh, struct vm_page *ptp, vaddr_t va) 464 { 465 struct pv_entry *pve; 466 struct pv_entry *prev; 467 468 prev = NULL; 469 SLIST_FOREACH(pve, &hh->hh_list, pve_hash) { 470 if (pve->pve_pte.pte_ptp == ptp && 471 pve->pve_pte.pte_va == va) { 472 if (prev != NULL) { 473 SLIST_REMOVE_AFTER(prev, pve_hash); 474 } else { 475 SLIST_REMOVE_HEAD(&hh->hh_list, pve_hash); 476 } 477 break; 478 } 479 prev = pve; 480 } 481 return pve; 482 } 483 484 /* 485 * other data structures 486 */ 487 488 static pt_entry_t protection_codes[8]; /* maps MI prot to i386 prot code */ 489 static bool pmap_initialized = false; /* pmap_init done yet? */ 490 491 /* 492 * the following two vaddr_t's are used during system startup 493 * to keep track of how much of the kernel's VM space we have used. 494 * once the system is started, the management of the remaining kernel 495 * VM space is turned over to the kernel_map vm_map. 496 */ 497 498 static vaddr_t virtual_avail; /* VA of first free KVA */ 499 static vaddr_t virtual_end; /* VA of last free KVA */ 500 501 /* 502 * linked list of all non-kernel pmaps 503 */ 504 505 static struct pmap_head pmaps; 506 507 /* 508 * pool that pmap structures are allocated from 509 */ 510 511 static struct pool_cache pmap_cache; 512 513 /* 514 * pv_entry cache 515 */ 516 517 static struct pool_cache pmap_pv_cache; 518 519 /* 520 * MULTIPROCESSOR: special VA's/ PTE's are actually allocated inside a 521 * maxcpus*NPTECL array of PTE's, to avoid cache line thrashing 522 * due to false sharing. 
523 */ 524 525 #ifdef MULTIPROCESSOR 526 #define PTESLEW(pte, id) ((pte)+(id)*NPTECL) 527 #define VASLEW(va,id) ((va)+(id)*NPTECL*PAGE_SIZE) 528 #else 529 #define PTESLEW(pte, id) (pte) 530 #define VASLEW(va,id) (va) 531 #endif 532 533 /* 534 * special VAs and the PTEs that map them 535 */ 536 static pt_entry_t *csrc_pte, *cdst_pte, *zero_pte, *ptp_pte, *early_zero_pte; 537 static char *csrcp, *cdstp, *zerop, *ptpp, *early_zerop; 538 539 int pmap_enter_default(pmap_t, vaddr_t, paddr_t, vm_prot_t, u_int); 540 541 /* 542 * pool and cache that PDPs are allocated from 543 */ 544 545 static struct pool_cache pmap_pdp_cache; 546 int pmap_pdp_ctor(void *, void *, int); 547 void pmap_pdp_dtor(void *, void *); 548 #ifdef PAE 549 /* need to allocate items of 4 pages */ 550 void *pmap_pdp_alloc(struct pool *, int); 551 void pmap_pdp_free(struct pool *, void *); 552 static struct pool_allocator pmap_pdp_allocator = { 553 .pa_alloc = pmap_pdp_alloc, 554 .pa_free = pmap_pdp_free, 555 .pa_pagesz = PAGE_SIZE * PDP_SIZE, 556 }; 557 #endif /* PAE */ 558 559 void *vmmap; /* XXX: used by mem.c... it should really uvm_map_reserve it */ 560 561 extern vaddr_t idt_vaddr; /* we allocate IDT early */ 562 extern paddr_t idt_paddr; 563 564 #ifdef _LP64 565 extern vaddr_t lo32_vaddr; 566 extern vaddr_t lo32_paddr; 567 #endif 568 569 extern int end; 570 571 #ifdef i386 572 /* stuff to fix the pentium f00f bug */ 573 extern vaddr_t pentium_idt_vaddr; 574 #endif 575 576 577 /* 578 * local prototypes 579 */ 580 581 static struct vm_page *pmap_get_ptp(struct pmap *, vaddr_t, 582 pd_entry_t * const *); 583 static struct vm_page *pmap_find_ptp(struct pmap *, vaddr_t, paddr_t, int); 584 static void pmap_freepage(struct pmap *, struct vm_page *, int); 585 static void pmap_free_ptp(struct pmap *, struct vm_page *, 586 vaddr_t, pt_entry_t *, 587 pd_entry_t * const *); 588 static bool pmap_is_curpmap(struct pmap *); 589 static bool pmap_is_active(struct pmap *, struct cpu_info *, bool); 590 static bool pmap_remove_pte(struct pmap *, struct vm_page *, 591 pt_entry_t *, vaddr_t, 592 struct pv_entry **); 593 static pt_entry_t pmap_remove_ptes(struct pmap *, struct vm_page *, 594 vaddr_t, vaddr_t, vaddr_t, 595 struct pv_entry **); 596 597 static void pmap_unmap_apdp(void); 598 static bool pmap_get_physpage(vaddr_t, int, paddr_t *); 599 static void pmap_alloc_level(pd_entry_t * const *, vaddr_t, int, 600 long *); 601 602 static bool pmap_reactivate(struct pmap *); 603 604 /* 605 * p m a p h e l p e r f u n c t i o n s 606 */ 607 608 static inline void 609 pmap_stats_update(struct pmap *pmap, int resid_diff, int wired_diff) 610 { 611 612 if (pmap == pmap_kernel()) { 613 atomic_add_long(&pmap->pm_stats.resident_count, resid_diff); 614 atomic_add_long(&pmap->pm_stats.wired_count, wired_diff); 615 } else { 616 KASSERT(mutex_owned(&pmap->pm_lock)); 617 pmap->pm_stats.resident_count += resid_diff; 618 pmap->pm_stats.wired_count += wired_diff; 619 } 620 } 621 622 static inline void 623 pmap_stats_update_bypte(struct pmap *pmap, pt_entry_t npte, pt_entry_t opte) 624 { 625 int resid_diff = ((npte & PG_V) ? 1 : 0) - ((opte & PG_V) ? 1 : 0); 626 int wired_diff = ((npte & PG_W) ? 1 : 0) - ((opte & PG_W) ? 
1 : 0); 627 628 KASSERT((npte & (PG_V | PG_W)) != PG_W); 629 KASSERT((opte & (PG_V | PG_W)) != PG_W); 630 631 pmap_stats_update(pmap, resid_diff, wired_diff); 632 } 633 634 /* 635 * ptp_to_pmap: lookup pmap by ptp 636 */ 637 638 static struct pmap * 639 ptp_to_pmap(struct vm_page *ptp) 640 { 641 struct pmap *pmap; 642 643 if (ptp == NULL) { 644 return pmap_kernel(); 645 } 646 pmap = (struct pmap *)ptp->uobject; 647 KASSERT(pmap != NULL); 648 KASSERT(&pmap->pm_obj[0] == ptp->uobject); 649 return pmap; 650 } 651 652 static inline struct pv_pte * 653 pve_to_pvpte(struct pv_entry *pve) 654 { 655 656 KASSERT((void *)&pve->pve_pte == (void *)pve); 657 return &pve->pve_pte; 658 } 659 660 static inline struct pv_entry * 661 pvpte_to_pve(struct pv_pte *pvpte) 662 { 663 struct pv_entry *pve = (void *)pvpte; 664 665 KASSERT(pve_to_pvpte(pve) == pvpte); 666 return pve; 667 } 668 669 /* 670 * pv_pte_first, pv_pte_next: PV list iterator. 671 */ 672 673 static struct pv_pte * 674 pv_pte_first(struct pmap_page *pp) 675 { 676 677 KASSERT(pp_locked(pp)); 678 if ((pp->pp_flags & PP_EMBEDDED) != 0) { 679 return &pp->pp_pte; 680 } 681 return pve_to_pvpte(LIST_FIRST(&pp->pp_head.pvh_list)); 682 } 683 684 static struct pv_pte * 685 pv_pte_next(struct pmap_page *pp, struct pv_pte *pvpte) 686 { 687 688 KASSERT(pvpte != NULL); 689 KASSERT(pp_locked(pp)); 690 if (pvpte == &pp->pp_pte) { 691 KASSERT((pp->pp_flags & PP_EMBEDDED) != 0); 692 return NULL; 693 } 694 KASSERT((pp->pp_flags & PP_EMBEDDED) == 0); 695 return pve_to_pvpte(LIST_NEXT(pvpte_to_pve(pvpte), pve_list)); 696 } 697 698 /* 699 * pmap_is_curpmap: is this pmap the one currently loaded [in %cr3]? 700 * of course the kernel is always loaded 701 */ 702 703 inline static bool 704 pmap_is_curpmap(struct pmap *pmap) 705 { 706 #if defined(XEN) && defined(__x86_64__) 707 /* 708 * Only kernel pmap is physically loaded. 709 * User PGD may be active, but TLB will be flushed 710 * with HYPERVISOR_iret anyway, so let's say no 711 */ 712 return(pmap == pmap_kernel()); 713 #else /* XEN && __x86_64__*/ 714 return((pmap == pmap_kernel()) || 715 (pmap == curcpu()->ci_pmap)); 716 #endif 717 } 718 719 /* 720 * pmap_is_active: is this pmap loaded into the specified processor's %cr3? 721 */ 722 723 inline static bool 724 pmap_is_active(struct pmap *pmap, struct cpu_info *ci, bool kernel) 725 { 726 727 return (pmap == pmap_kernel() || 728 (pmap->pm_cpus & ci->ci_cpumask) != 0 || 729 (kernel && (pmap->pm_kernel_cpus & ci->ci_cpumask) != 0)); 730 } 731 732 static void 733 pmap_apte_flush(struct pmap *pmap) 734 { 735 736 KASSERT(kpreempt_disabled()); 737 738 /* 739 * Flush the APTE mapping from all other CPUs that 740 * are using the pmap we are using (who's APTE space 741 * is the one we've just modified). 742 * 743 * XXXthorpej -- find a way to defer the IPI. 744 */ 745 pmap_tlb_shootdown(pmap, (vaddr_t)-1LL, 0, 0); 746 pmap_tlb_shootwait(); 747 } 748 749 /* 750 * Unmap the content of APDP PDEs 751 */ 752 static void 753 pmap_unmap_apdp(void) 754 { 755 int i; 756 757 for (i = 0; i < PDP_SIZE; i++) { 758 pmap_pte_set(APDP_PDE+i, 0); 759 #if defined (XEN) && defined (PAE) 760 /* clear shadow entries too */ 761 pmap_pte_set(APDP_PDE_SHADOW+i, 0); 762 #endif 763 } 764 } 765 766 /* 767 * Add a reference to the specified pmap. 
768 */ 769 770 inline void 771 pmap_reference(struct pmap *pmap) 772 { 773 774 atomic_inc_uint(&pmap->pm_obj[0].uo_refs); 775 } 776 777 /* 778 * pmap_map_ptes: map a pmap's PTEs into KVM and lock them in 779 * 780 * => we lock enough pmaps to keep things locked in 781 * => must be undone with pmap_unmap_ptes before returning 782 */ 783 784 void 785 pmap_map_ptes(struct pmap *pmap, struct pmap **pmap2, 786 pd_entry_t **ptepp, pd_entry_t * const **pdeppp) 787 { 788 pd_entry_t opde, npde; 789 struct pmap *ourpmap; 790 struct cpu_info *ci; 791 struct lwp *l; 792 bool iscurrent; 793 uint64_t ncsw; 794 #ifdef XEN 795 int s, i; 796 #endif 797 798 /* the kernel's pmap is always accessible */ 799 if (pmap == pmap_kernel()) { 800 *pmap2 = NULL; 801 *ptepp = PTE_BASE; 802 *pdeppp = normal_pdes; 803 return; 804 } 805 KASSERT(kpreempt_disabled()); 806 807 retry: 808 l = curlwp; 809 ncsw = l->l_ncsw; 810 ourpmap = NULL; 811 ci = curcpu(); 812 #if defined(XEN) && defined(__x86_64__) 813 /* 814 * curmap can only be pmap_kernel so at this point 815 * pmap_is_curpmap is always false 816 */ 817 iscurrent = 0; 818 ourpmap = pmap_kernel(); 819 #else /* XEN && __x86_64__*/ 820 if (ci->ci_want_pmapload && 821 vm_map_pmap(&l->l_proc->p_vmspace->vm_map) == pmap) { 822 pmap_load(); 823 if (l->l_ncsw != ncsw) 824 goto retry; 825 } 826 iscurrent = pmap_is_curpmap(pmap); 827 /* if curpmap then we are always mapped */ 828 if (iscurrent) { 829 mutex_enter(&pmap->pm_lock); 830 *pmap2 = NULL; 831 *ptepp = PTE_BASE; 832 *pdeppp = normal_pdes; 833 goto out; 834 } 835 ourpmap = ci->ci_pmap; 836 #endif /* XEN && __x86_64__ */ 837 838 /* need to lock both curpmap and pmap: use ordered locking */ 839 pmap_reference(ourpmap); 840 if ((uintptr_t) pmap < (uintptr_t) ourpmap) { 841 mutex_enter(&pmap->pm_lock); 842 mutex_enter(&ourpmap->pm_lock); 843 } else { 844 mutex_enter(&ourpmap->pm_lock); 845 mutex_enter(&pmap->pm_lock); 846 } 847 848 if (l->l_ncsw != ncsw) 849 goto unlock_and_retry; 850 851 /* need to load a new alternate pt space into curpmap? */ 852 COUNT(apdp_pde_map); 853 opde = *APDP_PDE; 854 if (!pmap_valid_entry(opde) || 855 pmap_pte2pa(opde) != pmap_pdirpa(pmap, 0)) { 856 #ifdef XEN 857 s = splvm(); 858 /* Make recursive entry usable in user PGD */ 859 for (i = 0; i < PDP_SIZE; i++) { 860 npde = pmap_pa2pte( 861 pmap_pdirpa(pmap, i * NPDPG)) | PG_k | PG_V; 862 xpq_queue_pte_update( 863 xpmap_ptom(pmap_pdirpa(pmap, PDIR_SLOT_PTE + i)), 864 npde); 865 xpq_queue_pte_update(xpmap_ptetomach(&APDP_PDE[i]), 866 npde); 867 #ifdef PAE 868 /* update shadow entry too */ 869 xpq_queue_pte_update( 870 xpmap_ptetomach(&APDP_PDE_SHADOW[i]), npde); 871 #endif /* PAE */ 872 xpq_queue_invlpg( 873 (vaddr_t)&pmap->pm_pdir[PDIR_SLOT_PTE + i]); 874 } 875 if (pmap_valid_entry(opde)) 876 pmap_apte_flush(ourpmap); 877 splx(s); 878 #else /* XEN */ 879 int i; 880 for (i = 0; i < PDP_SIZE; i++) { 881 npde = pmap_pa2pte( 882 pmap_pdirpa(pmap, i * NPDPG)) | PG_RW | PG_V; 883 pmap_pte_set(APDP_PDE+i, npde); 884 } 885 pmap_pte_flush(); 886 if (pmap_valid_entry(opde)) 887 pmap_apte_flush(ourpmap); 888 #endif /* XEN */ 889 } 890 *pmap2 = ourpmap; 891 *ptepp = APTE_BASE; 892 *pdeppp = alternate_pdes; 893 KASSERT(l->l_ncsw == ncsw); 894 #if !defined(XEN) || !defined(__x86_64__) 895 out: 896 #endif 897 /* 898 * might have blocked, need to retry? 
899 */ 900 if (l->l_ncsw != ncsw) { 901 unlock_and_retry: 902 if (ourpmap != NULL) { 903 mutex_exit(&ourpmap->pm_lock); 904 pmap_destroy(ourpmap); 905 } 906 mutex_exit(&pmap->pm_lock); 907 goto retry; 908 } 909 910 return; 911 } 912 913 /* 914 * pmap_unmap_ptes: unlock the PTE mapping of "pmap" 915 */ 916 917 void 918 pmap_unmap_ptes(struct pmap *pmap, struct pmap *pmap2) 919 { 920 921 if (pmap == pmap_kernel()) { 922 return; 923 } 924 KASSERT(kpreempt_disabled()); 925 if (pmap2 == NULL) { 926 mutex_exit(&pmap->pm_lock); 927 } else { 928 #if defined(XEN) && defined(__x86_64__) 929 KASSERT(pmap2 == pmap_kernel()); 930 #else 931 KASSERT(curcpu()->ci_pmap == pmap2); 932 #endif 933 #if defined(MULTIPROCESSOR) 934 pmap_unmap_apdp(); 935 pmap_pte_flush(); 936 pmap_apte_flush(pmap2); 937 #endif 938 COUNT(apdp_pde_unmap); 939 mutex_exit(&pmap->pm_lock); 940 mutex_exit(&pmap2->pm_lock); 941 pmap_destroy(pmap2); 942 } 943 } 944 945 inline static void 946 pmap_exec_account(struct pmap *pm, vaddr_t va, pt_entry_t opte, pt_entry_t npte) 947 { 948 949 #if !defined(__x86_64__) 950 if (curproc == NULL || curproc->p_vmspace == NULL || 951 pm != vm_map_pmap(&curproc->p_vmspace->vm_map)) 952 return; 953 954 if ((opte ^ npte) & PG_X) 955 pmap_update_pg(va); 956 957 /* 958 * Executability was removed on the last executable change. 959 * Reset the code segment to something conservative and 960 * let the trap handler deal with setting the right limit. 961 * We can't do that because of locking constraints on the vm map. 962 */ 963 964 if ((opte & PG_X) && (npte & PG_X) == 0 && va == pm->pm_hiexec) { 965 struct trapframe *tf = curlwp->l_md.md_regs; 966 967 tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL); 968 pm->pm_hiexec = I386_MAX_EXE_ADDR; 969 } 970 #endif /* !defined(__x86_64__) */ 971 } 972 973 #if !defined(__x86_64__) 974 /* 975 * Fixup the code segment to cover all potential executable mappings. 976 * returns 0 if no changes to the code segment were made. 977 */ 978 979 int 980 pmap_exec_fixup(struct vm_map *map, struct trapframe *tf, struct pcb *pcb) 981 { 982 struct vm_map_entry *ent; 983 struct pmap *pm = vm_map_pmap(map); 984 vaddr_t va = 0; 985 986 vm_map_lock_read(map); 987 for (ent = (&map->header)->next; ent != &map->header; ent = ent->next) { 988 989 /* 990 * This entry has greater va than the entries before. 991 * We need to make it point to the last page, not past it. 992 */ 993 994 if (ent->protection & VM_PROT_EXECUTE) 995 va = trunc_page(ent->end) - PAGE_SIZE; 996 } 997 vm_map_unlock_read(map); 998 if (va == pm->pm_hiexec && tf->tf_cs == GSEL(GUCODEBIG_SEL, SEL_UPL)) 999 return (0); 1000 1001 pm->pm_hiexec = va; 1002 if (pm->pm_hiexec > I386_MAX_EXE_ADDR) { 1003 tf->tf_cs = GSEL(GUCODEBIG_SEL, SEL_UPL); 1004 } else { 1005 tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL); 1006 return (0); 1007 } 1008 return (1); 1009 } 1010 #endif /* !defined(__x86_64__) */ 1011 1012 void 1013 pat_init(struct cpu_info *ci) 1014 { 1015 uint64_t pat; 1016 1017 if (!(ci->ci_feat_val[0] & CPUID_PAT)) 1018 return; 1019 1020 /* We change WT to WC. Leave all other entries the default values. 
*/ 1021 pat = PATENTRY(0, PAT_WB) | PATENTRY(1, PAT_WC) | 1022 PATENTRY(2, PAT_UCMINUS) | PATENTRY(3, PAT_UC) | 1023 PATENTRY(4, PAT_WB) | PATENTRY(5, PAT_WC) | 1024 PATENTRY(6, PAT_UCMINUS) | PATENTRY(7, PAT_UC); 1025 1026 wrmsr(MSR_CR_PAT, pat); 1027 cpu_pat_enabled = true; 1028 aprint_debug_dev(ci->ci_dev, "PAT enabled\n"); 1029 } 1030 1031 static pt_entry_t 1032 pmap_pat_flags(u_int flags) 1033 { 1034 u_int cacheflags = (flags & PMAP_CACHE_MASK); 1035 1036 if (!cpu_pat_enabled) { 1037 switch (cacheflags) { 1038 case PMAP_NOCACHE: 1039 case PMAP_NOCACHE_OVR: 1040 /* results in PGC_UCMINUS on cpus which have 1041 * the cpuid PAT but PAT "disabled" 1042 */ 1043 return PG_N; 1044 default: 1045 return 0; 1046 } 1047 } 1048 1049 switch (cacheflags) { 1050 case PMAP_NOCACHE: 1051 return PGC_UC; 1052 case PMAP_WRITE_COMBINE: 1053 return PGC_WC; 1054 case PMAP_WRITE_BACK: 1055 return PGC_WB; 1056 case PMAP_NOCACHE_OVR: 1057 return PGC_UCMINUS; 1058 } 1059 1060 return 0; 1061 } 1062 1063 /* 1064 * p m a p k e n t e r f u n c t i o n s 1065 * 1066 * functions to quickly enter/remove pages from the kernel address 1067 * space. pmap_kremove is exported to MI kernel. we make use of 1068 * the recursive PTE mappings. 1069 */ 1070 1071 /* 1072 * pmap_kenter_pa: enter a kernel mapping without R/M (pv_entry) tracking 1073 * 1074 * => no need to lock anything, assume va is already allocated 1075 * => should be faster than normal pmap enter function 1076 */ 1077 1078 void 1079 pmap_kenter_pa(vaddr_t va, paddr_t pa, vm_prot_t prot, u_int flags) 1080 { 1081 pt_entry_t *pte, opte, npte; 1082 1083 KASSERT(!(prot & ~VM_PROT_ALL)); 1084 1085 if (va < VM_MIN_KERNEL_ADDRESS) 1086 pte = vtopte(va); 1087 else 1088 pte = kvtopte(va); 1089 #ifdef DOM0OPS 1090 if (pa < pmap_pa_start || pa >= pmap_pa_end) { 1091 #ifdef DEBUG 1092 printk("pmap_kenter_pa: pa 0x%" PRIx64 " for va 0x%" PRIx64 1093 " outside range\n", (int64_t)pa, (int64_t)va); 1094 #endif /* DEBUG */ 1095 npte = pa; 1096 } else 1097 #endif /* DOM0OPS */ 1098 npte = pmap_pa2pte(pa); 1099 npte |= protection_codes[prot] | PG_k | PG_V | pmap_pg_g; 1100 npte |= pmap_pat_flags(flags); 1101 opte = pmap_pte_testset(pte, npte); /* zap! */ 1102 #if defined(DIAGNOSTIC) 1103 /* XXX For now... */ 1104 if (opte & PG_PS) 1105 panic("pmap_kenter_pa: PG_PS"); 1106 #endif 1107 if ((opte & (PG_V | PG_U)) == (PG_V | PG_U)) { 1108 /* This should not happen, so no need to batch updates. */ 1109 kpreempt_disable(); 1110 pmap_tlb_shootdown(pmap_kernel(), va, 0, opte); 1111 kpreempt_enable(); 1112 } 1113 } 1114 1115 void 1116 pmap_emap_enter(vaddr_t va, paddr_t pa, vm_prot_t prot) 1117 { 1118 pt_entry_t *pte, opte, npte; 1119 1120 KASSERT((prot & ~VM_PROT_ALL) == 0); 1121 pte = (va < VM_MIN_KERNEL_ADDRESS) ? vtopte(va) : kvtopte(va); 1122 1123 #ifdef DOM0OPS 1124 if (pa < pmap_pa_start || pa >= pmap_pa_end) { 1125 npte = pa; 1126 } else 1127 #endif 1128 npte = pmap_pa2pte(pa); 1129 1130 npte = pmap_pa2pte(pa); 1131 npte |= protection_codes[prot] | PG_k | PG_V; 1132 opte = pmap_pte_testset(pte, npte); 1133 } 1134 1135 /* 1136 * pmap_emap_sync: perform TLB flush or pmap load, if it was deferred. 1137 */ 1138 void 1139 pmap_emap_sync(bool canload) 1140 { 1141 struct cpu_info *ci = curcpu(); 1142 struct pmap *pmap; 1143 1144 KASSERT(kpreempt_disabled()); 1145 if (__predict_true(ci->ci_want_pmapload && canload)) { 1146 /* 1147 * XXX: Hint for pmap_reactivate(), which might suggest to 1148 * not perform TLB flush, if state has not changed. 
1149 */ 1150 pmap = vm_map_pmap(&curlwp->l_proc->p_vmspace->vm_map); 1151 if (__predict_false(pmap == ci->ci_pmap)) { 1152 const uint32_t cpumask = ci->ci_cpumask; 1153 atomic_and_32(&pmap->pm_cpus, ~cpumask); 1154 } 1155 pmap_load(); 1156 KASSERT(ci->ci_want_pmapload == 0); 1157 } else { 1158 tlbflush(); 1159 } 1160 1161 } 1162 1163 void 1164 pmap_emap_remove(vaddr_t sva, vsize_t len) 1165 { 1166 pt_entry_t *pte, xpte; 1167 vaddr_t va, eva = sva + len; 1168 1169 for (va = sva; va < eva; va += PAGE_SIZE) { 1170 pte = (va < VM_MIN_KERNEL_ADDRESS) ? vtopte(va) : kvtopte(va); 1171 xpte |= pmap_pte_testset(pte, 0); 1172 } 1173 } 1174 1175 __weak_alias(pmap_kenter_ma, pmap_kenter_pa); 1176 1177 #if defined(__x86_64__) 1178 /* 1179 * Change protection for a virtual address. Local for a CPU only, don't 1180 * care about TLB shootdowns. 1181 * 1182 * => must be called with preemption disabled 1183 */ 1184 void 1185 pmap_changeprot_local(vaddr_t va, vm_prot_t prot) 1186 { 1187 pt_entry_t *pte, opte, npte; 1188 1189 KASSERT(kpreempt_disabled()); 1190 1191 if (va < VM_MIN_KERNEL_ADDRESS) 1192 pte = vtopte(va); 1193 else 1194 pte = kvtopte(va); 1195 1196 npte = opte = *pte; 1197 1198 if ((prot & VM_PROT_WRITE) != 0) 1199 npte |= PG_RW; 1200 else 1201 npte &= ~PG_RW; 1202 1203 if (opte != npte) { 1204 pmap_pte_set(pte, npte); 1205 pmap_pte_flush(); 1206 invlpg(va); 1207 } 1208 } 1209 #endif /* defined(__x86_64__) */ 1210 1211 /* 1212 * pmap_kremove: remove a kernel mapping(s) without R/M (pv_entry) tracking 1213 * 1214 * => no need to lock anything 1215 * => caller must dispose of any vm_page mapped in the va range 1216 * => note: not an inline function 1217 * => we assume the va is page aligned and the len is a multiple of PAGE_SIZE 1218 * => we assume kernel only unmaps valid addresses and thus don't bother 1219 * checking the valid bit before doing TLB flushing 1220 * => must be followed by call to pmap_update() before reuse of page 1221 */ 1222 1223 void 1224 pmap_kremove(vaddr_t sva, vsize_t len) 1225 { 1226 pt_entry_t *pte, xpte; 1227 vaddr_t va, eva; 1228 1229 eva = sva + len; 1230 xpte = 0; 1231 1232 for (va = sva; va < eva; va += PAGE_SIZE) { 1233 if (va < VM_MIN_KERNEL_ADDRESS) 1234 pte = vtopte(va); 1235 else 1236 pte = kvtopte(va); 1237 xpte |= pmap_pte_testset(pte, 0); /* zap! */ 1238 #if defined(DIAGNOSTIC) 1239 /* XXX For now... */ 1240 if (xpte & PG_PS) 1241 panic("pmap_kremove: PG_PS"); 1242 if (xpte & PG_PVLIST) 1243 panic("pmap_kremove: PG_PVLIST mapping for 0x%lx", 1244 va); 1245 #endif 1246 } 1247 if ((xpte & (PG_V | PG_U)) == (PG_V | PG_U)) { 1248 kpreempt_disable(); 1249 pmap_tlb_shootdown(pmap_kernel(), sva, eva, xpte); 1250 kpreempt_enable(); 1251 } 1252 } 1253 1254 /* 1255 * p m a p i n i t f u n c t i o n s 1256 * 1257 * pmap_bootstrap and pmap_init are called during system startup 1258 * to init the pmap module. pmap_bootstrap() does a low level 1259 * init just to get things rolling. pmap_init() finishes the job. 1260 */ 1261 1262 /* 1263 * pmap_bootstrap: get the system in a state where it can run with VM 1264 * properly enabled (called before main()). the VM system is 1265 * fully init'd later... 1266 * 1267 * => on i386, locore.s has already enabled the MMU by allocating 1268 * a PDP for the kernel, and nkpde PTP's for the kernel. 
1269 * => kva_start is the first free virtual address in kernel space 1270 */ 1271 1272 void 1273 pmap_bootstrap(vaddr_t kva_start) 1274 { 1275 struct pmap *kpm; 1276 pt_entry_t *pte; 1277 int i; 1278 vaddr_t kva; 1279 #ifndef XEN 1280 unsigned long p1i; 1281 vaddr_t kva_end; 1282 #endif 1283 1284 pt_entry_t pg_nx = (cpu_feature[2] & CPUID_NOX ? PG_NX : 0); 1285 1286 /* 1287 * set up our local static global vars that keep track of the 1288 * usage of KVM before kernel_map is set up 1289 */ 1290 1291 virtual_avail = kva_start; /* first free KVA */ 1292 virtual_end = VM_MAX_KERNEL_ADDRESS; /* last KVA */ 1293 1294 /* 1295 * set up protection_codes: we need to be able to convert from 1296 * a MI protection code (some combo of VM_PROT...) to something 1297 * we can jam into a i386 PTE. 1298 */ 1299 1300 protection_codes[VM_PROT_NONE] = pg_nx; /* --- */ 1301 protection_codes[VM_PROT_EXECUTE] = PG_RO | PG_X; /* --x */ 1302 protection_codes[VM_PROT_READ] = PG_RO | pg_nx; /* -r- */ 1303 protection_codes[VM_PROT_READ|VM_PROT_EXECUTE] = PG_RO | PG_X;/* -rx */ 1304 protection_codes[VM_PROT_WRITE] = PG_RW | pg_nx; /* w-- */ 1305 protection_codes[VM_PROT_WRITE|VM_PROT_EXECUTE] = PG_RW | PG_X;/* w-x */ 1306 protection_codes[VM_PROT_WRITE|VM_PROT_READ] = PG_RW | pg_nx; 1307 /* wr- */ 1308 protection_codes[VM_PROT_ALL] = PG_RW | PG_X; /* wrx */ 1309 1310 /* 1311 * now we init the kernel's pmap 1312 * 1313 * the kernel pmap's pm_obj is not used for much. however, in 1314 * user pmaps the pm_obj contains the list of active PTPs. 1315 * the pm_obj currently does not have a pager. it might be possible 1316 * to add a pager that would allow a process to read-only mmap its 1317 * own page tables (fast user level vtophys?). this may or may not 1318 * be useful. 1319 */ 1320 1321 kpm = pmap_kernel(); 1322 for (i = 0; i < PTP_LEVELS - 1; i++) { 1323 UVM_OBJ_INIT(&kpm->pm_obj[i], NULL, 1); 1324 kpm->pm_ptphint[i] = NULL; 1325 } 1326 memset(&kpm->pm_list, 0, sizeof(kpm->pm_list)); /* pm_list not used */ 1327 1328 kpm->pm_pdir = (pd_entry_t *)(PDPpaddr + KERNBASE); 1329 for (i = 0; i < PDP_SIZE; i++) 1330 kpm->pm_pdirpa[i] = PDPpaddr + PAGE_SIZE * i; 1331 1332 kpm->pm_stats.wired_count = kpm->pm_stats.resident_count = 1333 x86_btop(kva_start - VM_MIN_KERNEL_ADDRESS); 1334 1335 /* 1336 * the above is just a rough estimate and not critical to the proper 1337 * operation of the system. 1338 */ 1339 1340 #ifndef XEN 1341 /* 1342 * Begin to enable global TLB entries if they are supported. 1343 * The G bit has no effect until the CR4_PGE bit is set in CR4, 1344 * which happens in cpu_init(), which is run on each cpu 1345 * (and happens later) 1346 */ 1347 1348 if (cpu_feature[0] & CPUID_PGE) { 1349 pmap_pg_g = PG_G; /* enable software */ 1350 1351 /* add PG_G attribute to already mapped kernel pages */ 1352 if (KERNBASE == VM_MIN_KERNEL_ADDRESS) { 1353 kva_end = virtual_avail; 1354 } else { 1355 extern vaddr_t eblob, esym; 1356 kva_end = (vaddr_t)&end; 1357 if (esym > kva_end) 1358 kva_end = esym; 1359 if (eblob > kva_end) 1360 kva_end = eblob; 1361 kva_end = roundup(kva_end, PAGE_SIZE); 1362 } 1363 for (kva = KERNBASE; kva < kva_end; kva += PAGE_SIZE) { 1364 p1i = pl1_i(kva); 1365 if (pmap_valid_entry(PTE_BASE[p1i])) 1366 PTE_BASE[p1i] |= PG_G; 1367 } 1368 } 1369 1370 /* 1371 * enable large pages if they are supported. 
1372 */ 1373 1374 if (cpu_feature[0] & CPUID_PSE) { 1375 paddr_t pa; 1376 pd_entry_t *pde; 1377 extern char __data_start; 1378 1379 lcr4(rcr4() | CR4_PSE); /* enable hardware (via %cr4) */ 1380 pmap_largepages = 1; /* enable software */ 1381 1382 /* 1383 * the TLB must be flushed after enabling large pages 1384 * on Pentium CPUs, according to section 3.6.2.2 of 1385 * "Intel Architecture Software Developer's Manual, 1386 * Volume 3: System Programming". 1387 */ 1388 tlbflush(); 1389 1390 /* 1391 * now, remap the kernel text using large pages. we 1392 * assume that the linker has properly aligned the 1393 * .data segment to a NBPD_L2 boundary. 1394 */ 1395 kva_end = rounddown((vaddr_t)&__data_start, NBPD_L1); 1396 for (pa = 0, kva = KERNBASE; kva + NBPD_L2 <= kva_end; 1397 kva += NBPD_L2, pa += NBPD_L2) { 1398 pde = &L2_BASE[pl2_i(kva)]; 1399 *pde = pa | pmap_pg_g | PG_PS | 1400 PG_KR | PG_V; /* zap! */ 1401 tlbflush(); 1402 } 1403 #if defined(DEBUG) 1404 aprint_normal("kernel text is mapped with %" PRIuPSIZE " large " 1405 "pages and %" PRIuPSIZE " normal pages\n", 1406 howmany(kva - KERNBASE, NBPD_L2), 1407 howmany((vaddr_t)&__data_start - kva, NBPD_L1)); 1408 #endif /* defined(DEBUG) */ 1409 } 1410 #endif /* !XEN */ 1411 1412 if (VM_MIN_KERNEL_ADDRESS != KERNBASE) { 1413 /* 1414 * zero_pte is stuck at the end of mapped space for the kernel 1415 * image (disjunct from kva space). This is done so that it 1416 * can safely be used in pmap_growkernel (pmap_get_physpage), 1417 * when it's called for the first time. 1418 * XXXfvdl fix this for MULTIPROCESSOR later. 1419 */ 1420 1421 early_zerop = (void *)(KERNBASE + NKL2_KIMG_ENTRIES * NBPD_L2); 1422 early_zero_pte = PTE_BASE + pl1_i((vaddr_t)early_zerop); 1423 } 1424 1425 /* 1426 * now we allocate the "special" VAs which are used for tmp mappings 1427 * by the pmap (and other modules). we allocate the VAs by advancing 1428 * virtual_avail (note that there are no pages mapped at these VAs). 1429 * we find the PTE that maps the allocated VA via the linear PTE 1430 * mapping. 1431 */ 1432 1433 pte = PTE_BASE + pl1_i(virtual_avail); 1434 1435 #ifdef MULTIPROCESSOR 1436 /* 1437 * Waste some VA space to avoid false sharing of cache lines 1438 * for page table pages: Give each possible CPU a cache line 1439 * of PTE's (8) to play with, though we only need 4. We could 1440 * recycle some of this waste by putting the idle stacks here 1441 * as well; we could waste less space if we knew the largest 1442 * CPU ID beforehand. 
1443 */ 1444 csrcp = (char *) virtual_avail; csrc_pte = pte; 1445 1446 cdstp = (char *) virtual_avail+PAGE_SIZE; cdst_pte = pte+1; 1447 1448 zerop = (char *) virtual_avail+PAGE_SIZE*2; zero_pte = pte+2; 1449 1450 ptpp = (char *) virtual_avail+PAGE_SIZE*3; ptp_pte = pte+3; 1451 1452 virtual_avail += PAGE_SIZE * maxcpus * NPTECL; 1453 pte += maxcpus * NPTECL; 1454 #else 1455 csrcp = (void *) virtual_avail; csrc_pte = pte; /* allocate */ 1456 virtual_avail += PAGE_SIZE; pte++; /* advance */ 1457 1458 cdstp = (void *) virtual_avail; cdst_pte = pte; 1459 virtual_avail += PAGE_SIZE; pte++; 1460 1461 zerop = (void *) virtual_avail; zero_pte = pte; 1462 virtual_avail += PAGE_SIZE; pte++; 1463 1464 ptpp = (void *) virtual_avail; ptp_pte = pte; 1465 virtual_avail += PAGE_SIZE; pte++; 1466 #endif 1467 1468 if (VM_MIN_KERNEL_ADDRESS == KERNBASE) { 1469 early_zerop = zerop; 1470 early_zero_pte = zero_pte; 1471 } 1472 1473 /* 1474 * Nothing after this point actually needs pte; 1475 */ 1476 pte = (void *)0xdeadbeef; 1477 1478 /* XXX: vmmap used by mem.c... should be uvm_map_reserve */ 1479 /* XXXfvdl PTEs not needed here */ 1480 vmmap = (char *)virtual_avail; /* don't need pte */ 1481 virtual_avail += PAGE_SIZE; pte++; 1482 1483 #ifdef XEN 1484 #ifdef __x86_64__ 1485 /* 1486 * We want a dummy page directory for Xen: 1487 * when deactivate a pmap, Xen will still consider it active. 1488 * So we set user PGD to this one to lift all protection on 1489 * the now inactive page tables set. 1490 */ 1491 xen_dummy_user_pgd = avail_start; 1492 avail_start += PAGE_SIZE; 1493 1494 /* Zero fill it, the less checks in Xen it requires the better */ 1495 memset((void *) (xen_dummy_user_pgd + KERNBASE), 0, PAGE_SIZE); 1496 /* Mark read-only */ 1497 HYPERVISOR_update_va_mapping(xen_dummy_user_pgd + KERNBASE, 1498 pmap_pa2pte(xen_dummy_user_pgd) | PG_u | PG_V, UVMF_INVLPG); 1499 /* Pin as L4 */ 1500 xpq_queue_pin_l4_table(xpmap_ptom_masked(xen_dummy_user_pgd)); 1501 #endif /* __x86_64__ */ 1502 idt_vaddr = virtual_avail; /* don't need pte */ 1503 idt_paddr = avail_start; /* steal a page */ 1504 /* 1505 * Xen require one more page as we can't store 1506 * GDT and LDT on the same page 1507 */ 1508 virtual_avail += 3 * PAGE_SIZE; 1509 avail_start += 3 * PAGE_SIZE; 1510 #else /* XEN */ 1511 idt_vaddr = virtual_avail; /* don't need pte */ 1512 idt_paddr = avail_start; /* steal a page */ 1513 #if defined(__x86_64__) 1514 virtual_avail += 2 * PAGE_SIZE; pte += 2; 1515 avail_start += 2 * PAGE_SIZE; 1516 #else /* defined(__x86_64__) */ 1517 virtual_avail += PAGE_SIZE; pte++; 1518 avail_start += PAGE_SIZE; 1519 /* pentium f00f bug stuff */ 1520 pentium_idt_vaddr = virtual_avail; /* don't need pte */ 1521 virtual_avail += PAGE_SIZE; pte++; 1522 #endif /* defined(__x86_64__) */ 1523 #endif /* XEN */ 1524 1525 #ifdef _LP64 1526 /* 1527 * Grab a page below 4G for things that need it (i.e. 1528 * having an initial %cr3 for the MP trampoline). 1529 */ 1530 lo32_vaddr = virtual_avail; 1531 virtual_avail += PAGE_SIZE; pte++; 1532 lo32_paddr = avail_start; 1533 avail_start += PAGE_SIZE; 1534 #endif 1535 1536 /* 1537 * now we reserve some VM for mapping pages when doing a crash dump 1538 */ 1539 1540 virtual_avail = reserve_dumppages(virtual_avail); 1541 1542 /* 1543 * init the static-global locks and global lists. 1544 * 1545 * => pventry::pvh_lock (initialized elsewhere) must also be 1546 * a spin lock, again at IPL_VM to prevent deadlock, and 1547 * again is never taken from interrupt context. 
1548 */ 1549 1550 mutex_init(&pmaps_lock, MUTEX_DEFAULT, IPL_NONE); 1551 LIST_INIT(&pmaps); 1552 pmap_cpu_init_early(curcpu()); 1553 1554 /* 1555 * initialize caches. 1556 */ 1557 1558 pool_cache_bootstrap(&pmap_cache, sizeof(struct pmap), 0, 0, 0, 1559 "pmappl", NULL, IPL_NONE, NULL, NULL, NULL); 1560 #ifdef PAE 1561 pool_cache_bootstrap(&pmap_pdp_cache, PAGE_SIZE * PDP_SIZE, 0, 0, 0, 1562 "pdppl", &pmap_pdp_allocator, IPL_NONE, 1563 pmap_pdp_ctor, pmap_pdp_dtor, NULL); 1564 #else /* PAE */ 1565 pool_cache_bootstrap(&pmap_pdp_cache, PAGE_SIZE, 0, 0, 0, 1566 "pdppl", NULL, IPL_NONE, pmap_pdp_ctor, pmap_pdp_dtor, NULL); 1567 #endif /* PAE */ 1568 pool_cache_bootstrap(&pmap_pv_cache, sizeof(struct pv_entry), 0, 0, 1569 PR_LARGECACHE, "pvpl", &pool_allocator_meta, IPL_NONE, NULL, 1570 NULL, NULL); 1571 1572 /* 1573 * ensure the TLB is sync'd with reality by flushing it... 1574 */ 1575 1576 tlbflush(); 1577 1578 /* 1579 * calculate pmap_maxkvaddr from nkptp[]. 1580 */ 1581 1582 kva = VM_MIN_KERNEL_ADDRESS; 1583 for (i = PTP_LEVELS - 1; i >= 1; i--) { 1584 kva += nkptp[i] * nbpd[i]; 1585 } 1586 pmap_maxkvaddr = kva; 1587 } 1588 1589 #if defined(__x86_64__) 1590 /* 1591 * Pre-allocate PTPs for low memory, so that 1:1 mappings for various 1592 * trampoline code can be entered. 1593 */ 1594 void 1595 pmap_prealloc_lowmem_ptps(void) 1596 { 1597 #ifdef XEN 1598 int level; 1599 paddr_t newp; 1600 paddr_t pdes_pa; 1601 1602 pdes_pa = pmap_pdirpa(pmap_kernel(), 0); 1603 level = PTP_LEVELS; 1604 for (;;) { 1605 newp = avail_start; 1606 avail_start += PAGE_SIZE; 1607 HYPERVISOR_update_va_mapping ((vaddr_t)early_zerop, 1608 xpmap_ptom_masked(newp) | PG_u | PG_V | PG_RW, UVMF_INVLPG); 1609 memset((void *)early_zerop, 0, PAGE_SIZE); 1610 /* Mark R/O before installing */ 1611 HYPERVISOR_update_va_mapping ((vaddr_t)early_zerop, 1612 xpmap_ptom_masked(newp) | PG_u | PG_V, UVMF_INVLPG); 1613 if (newp < (NKL2_KIMG_ENTRIES * NBPD_L2)) 1614 HYPERVISOR_update_va_mapping (newp + KERNBASE, 1615 xpmap_ptom_masked(newp) | PG_u | PG_V, UVMF_INVLPG); 1616 xpq_queue_pte_update ( 1617 xpmap_ptom_masked(pdes_pa) 1618 + (pl_i(0, level) * sizeof (pd_entry_t)), 1619 xpmap_ptom_masked(newp) | PG_RW | PG_u | PG_V); 1620 level--; 1621 if (level <= 1) 1622 break; 1623 pdes_pa = newp; 1624 } 1625 #else /* XEN */ 1626 pd_entry_t *pdes; 1627 int level; 1628 paddr_t newp; 1629 1630 pdes = pmap_kernel()->pm_pdir; 1631 level = PTP_LEVELS; 1632 for (;;) { 1633 newp = avail_start; 1634 avail_start += PAGE_SIZE; 1635 *early_zero_pte = (newp & PG_FRAME) | PG_V | PG_RW; 1636 pmap_update_pg((vaddr_t)early_zerop); 1637 memset(early_zerop, 0, PAGE_SIZE); 1638 pdes[pl_i(0, level)] = (newp & PG_FRAME) | PG_V | PG_RW; 1639 level--; 1640 if (level <= 1) 1641 break; 1642 pdes = normal_pdes[level - 2]; 1643 } 1644 #endif /* XEN */ 1645 } 1646 #endif /* defined(__x86_64__) */ 1647 1648 /* 1649 * pmap_init: called from uvm_init, our job is to get the pmap 1650 * system ready to manage mappings... 1651 */ 1652 1653 void 1654 pmap_init(void) 1655 { 1656 int i; 1657 1658 for (i = 0; i < PV_HASH_SIZE; i++) { 1659 SLIST_INIT(&pv_hash_heads[i].hh_list); 1660 } 1661 for (i = 0; i < PV_HASH_LOCK_CNT; i++) { 1662 mutex_init(&pv_hash_locks[i].lock, MUTEX_NODEBUG, IPL_VM); 1663 } 1664 1665 /* 1666 * done: pmap module is up (and ready for business) 1667 */ 1668 1669 pmap_initialized = true; 1670 } 1671 1672 /* 1673 * pmap_cpu_init_early: perform early per-CPU initialization. 
1674 */ 1675 1676 void 1677 pmap_cpu_init_early(struct cpu_info *ci) 1678 { 1679 struct pmap_cpu *pc; 1680 static uint8_t pmap_cpu_alloc; 1681 1682 pc = &pmap_cpu[pmap_cpu_alloc++].pc; 1683 ci->ci_pmap_cpu = pc; 1684 } 1685 1686 /* 1687 * pmap_cpu_init_late: perform late per-CPU initialization. 1688 */ 1689 1690 void 1691 pmap_cpu_init_late(struct cpu_info *ci) 1692 { 1693 1694 if (ci == &cpu_info_primary) { 1695 evcnt_attach_dynamic(&pmap_tlb_evcnt, EVCNT_TYPE_INTR, 1696 NULL, "global", "TLB IPI"); 1697 evcnt_attach_dynamic(&pmap_iobmp_evcnt, EVCNT_TYPE_MISC, 1698 NULL, "x86", "io bitmap copy"); 1699 evcnt_attach_dynamic(&pmap_ldt_evcnt, EVCNT_TYPE_MISC, 1700 NULL, "x86", "ldt sync"); 1701 } 1702 1703 evcnt_attach_dynamic(&ci->ci_tlb_evcnt, EVCNT_TYPE_MISC, 1704 NULL, device_xname(ci->ci_dev), "TLB IPI"); 1705 1706 #ifdef PAE 1707 int ret; 1708 struct pglist pg; 1709 struct vm_page *vmap; 1710 1711 /* The BP has already its own L3 page allocated in locore.S. */ 1712 if (ci == &cpu_info_primary) 1713 return; 1714 1715 /* 1716 * Allocate a page for the per-CPU L3 PD. cr3 being 32 bits, PA musts 1717 * resides below the 4GB boundary. 1718 */ 1719 ret = uvm_pglistalloc(PAGE_SIZE, 0, 0x100000000ULL, 32, 0, &pg, 1, 0); 1720 vmap = TAILQ_FIRST(&pg); 1721 1722 if (ret != 0 || vmap == NULL) 1723 panic("%s: failed to allocate L3 pglist for CPU %d (ret %d)\n", 1724 __func__, cpu_index(ci), ret); 1725 1726 ci->ci_pae_l3_pdirpa = vmap->phys_addr; 1727 1728 ci->ci_pae_l3_pdir = (paddr_t *)uvm_km_alloc(kernel_map, PAGE_SIZE, 0, 1729 UVM_KMF_VAONLY | UVM_KMF_NOWAIT); 1730 if (ci->ci_pae_l3_pdir == NULL) 1731 panic("%s: failed to allocate L3 PD for CPU %d\n", 1732 __func__, cpu_index(ci)); 1733 1734 pmap_kenter_pa((vaddr_t)ci->ci_pae_l3_pdir, ci->ci_pae_l3_pdirpa, 1735 VM_PROT_READ | VM_PROT_WRITE, 0); 1736 1737 pmap_update(pmap_kernel()); 1738 #endif 1739 } 1740 1741 /* 1742 * p v _ e n t r y f u n c t i o n s 1743 */ 1744 1745 /* 1746 * pmap_free_pvs: free a list of pv_entrys 1747 */ 1748 1749 static void 1750 pmap_free_pvs(struct pv_entry *pve) 1751 { 1752 struct pv_entry *next; 1753 1754 for ( /* null */ ; pve != NULL ; pve = next) { 1755 next = pve->pve_next; 1756 pool_cache_put(&pmap_pv_cache, pve); 1757 } 1758 } 1759 1760 /* 1761 * main pv_entry manipulation functions: 1762 * pmap_enter_pv: enter a mapping onto a pv_head list 1763 * pmap_remove_pv: remove a mapping from a pv_head list 1764 * 1765 * NOTE: Both pmap_enter_pv and pmap_remove_pv expect the caller to lock 1766 * the pvh before calling 1767 */ 1768 1769 /* 1770 * insert_pv: a helper of pmap_enter_pv 1771 */ 1772 1773 static void 1774 insert_pv(struct pmap_page *pp, struct pv_entry *pve) 1775 { 1776 struct pv_hash_head *hh; 1777 kmutex_t *lock; 1778 u_int hash; 1779 1780 KASSERT(pp_locked(pp)); 1781 1782 hash = pvhash_hash(pve->pve_pte.pte_ptp, pve->pve_pte.pte_va); 1783 lock = pvhash_lock(hash); 1784 hh = pvhash_head(hash); 1785 mutex_spin_enter(lock); 1786 SLIST_INSERT_HEAD(&hh->hh_list, pve, pve_hash); 1787 mutex_spin_exit(lock); 1788 1789 LIST_INSERT_HEAD(&pp->pp_head.pvh_list, pve, pve_list); 1790 } 1791 1792 /* 1793 * pmap_enter_pv: enter a mapping onto a pv_head lst 1794 * 1795 * => caller should have the pp_lock locked 1796 * => caller should adjust ptp's wire_count before calling 1797 */ 1798 1799 static struct pv_entry * 1800 pmap_enter_pv(struct pmap_page *pp, 1801 struct pv_entry *pve, /* preallocated pve for us to use */ 1802 struct pv_entry **sparepve, 1803 struct vm_page *ptp, 1804 vaddr_t va) 1805 { 1806 1807 KASSERT(ptp 
== NULL || ptp->wire_count >= 2); 1808 KASSERT(ptp == NULL || ptp->uobject != NULL); 1809 KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset); 1810 KASSERT(pp_locked(pp)); 1811 1812 if ((pp->pp_flags & PP_EMBEDDED) == 0) { 1813 if (LIST_EMPTY(&pp->pp_head.pvh_list)) { 1814 pp->pp_flags |= PP_EMBEDDED; 1815 pp->pp_pte.pte_ptp = ptp; 1816 pp->pp_pte.pte_va = va; 1817 1818 return pve; 1819 } 1820 } else { 1821 struct pv_entry *pve2; 1822 1823 pve2 = *sparepve; 1824 *sparepve = NULL; 1825 1826 pve2->pve_pte = pp->pp_pte; 1827 pp->pp_flags &= ~PP_EMBEDDED; 1828 LIST_INIT(&pp->pp_head.pvh_list); 1829 insert_pv(pp, pve2); 1830 } 1831 1832 pve->pve_pte.pte_ptp = ptp; 1833 pve->pve_pte.pte_va = va; 1834 insert_pv(pp, pve); 1835 1836 return NULL; 1837 } 1838 1839 /* 1840 * pmap_remove_pv: try to remove a mapping from a pv_list 1841 * 1842 * => caller should hold pp_lock [so that attrs can be adjusted] 1843 * => caller should adjust ptp's wire_count and free PTP if needed 1844 * => we return the removed pve 1845 */ 1846 1847 static struct pv_entry * 1848 pmap_remove_pv(struct pmap_page *pp, struct vm_page *ptp, vaddr_t va) 1849 { 1850 struct pv_hash_head *hh; 1851 struct pv_entry *pve; 1852 kmutex_t *lock; 1853 u_int hash; 1854 1855 KASSERT(ptp == NULL || ptp->uobject != NULL); 1856 KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset); 1857 KASSERT(pp_locked(pp)); 1858 1859 if ((pp->pp_flags & PP_EMBEDDED) != 0) { 1860 KASSERT(pp->pp_pte.pte_ptp == ptp); 1861 KASSERT(pp->pp_pte.pte_va == va); 1862 1863 pp->pp_flags &= ~PP_EMBEDDED; 1864 LIST_INIT(&pp->pp_head.pvh_list); 1865 1866 return NULL; 1867 } 1868 1869 hash = pvhash_hash(ptp, va); 1870 lock = pvhash_lock(hash); 1871 hh = pvhash_head(hash); 1872 mutex_spin_enter(lock); 1873 pve = pvhash_remove(hh, ptp, va); 1874 mutex_spin_exit(lock); 1875 1876 LIST_REMOVE(pve, pve_list); 1877 1878 return pve; 1879 } 1880 1881 /* 1882 * p t p f u n c t i o n s 1883 */ 1884 1885 static inline struct vm_page * 1886 pmap_find_ptp(struct pmap *pmap, vaddr_t va, paddr_t pa, int level) 1887 { 1888 int lidx = level - 1; 1889 struct vm_page *pg; 1890 1891 KASSERT(mutex_owned(&pmap->pm_lock)); 1892 1893 if (pa != (paddr_t)-1 && pmap->pm_ptphint[lidx] && 1894 pa == VM_PAGE_TO_PHYS(pmap->pm_ptphint[lidx])) { 1895 return (pmap->pm_ptphint[lidx]); 1896 } 1897 PMAP_SUBOBJ_LOCK(pmap, lidx); 1898 pg = uvm_pagelookup(&pmap->pm_obj[lidx], ptp_va2o(va, level)); 1899 PMAP_SUBOBJ_UNLOCK(pmap, lidx); 1900 1901 KASSERT(pg == NULL || pg->wire_count >= 1); 1902 return pg; 1903 } 1904 1905 static inline void 1906 pmap_freepage(struct pmap *pmap, struct vm_page *ptp, int level) 1907 { 1908 lwp_t *l; 1909 int lidx; 1910 struct uvm_object *obj; 1911 1912 KASSERT(ptp->wire_count == 1); 1913 1914 lidx = level - 1; 1915 1916 obj = &pmap->pm_obj[lidx]; 1917 pmap_stats_update(pmap, -1, 0); 1918 if (lidx != 0) 1919 mutex_enter(&obj->vmobjlock); 1920 if (pmap->pm_ptphint[lidx] == ptp) 1921 pmap->pm_ptphint[lidx] = TAILQ_FIRST(&obj->memq); 1922 ptp->wire_count = 0; 1923 uvm_pagerealloc(ptp, NULL, 0); 1924 l = curlwp; 1925 KASSERT((l->l_pflag & LP_INTR) == 0); 1926 VM_PAGE_TO_PP(ptp)->pp_link = l->l_md.md_gc_ptp; 1927 l->l_md.md_gc_ptp = ptp; 1928 if (lidx != 0) 1929 mutex_exit(&obj->vmobjlock); 1930 } 1931 1932 static void 1933 pmap_free_ptp(struct pmap *pmap, struct vm_page *ptp, vaddr_t va, 1934 pt_entry_t *ptes, pd_entry_t * const *pdes) 1935 { 1936 unsigned long index; 1937 int level; 1938 vaddr_t invaladdr; 1939 #ifdef MULTIPROCESSOR 1940 vaddr_t invaladdr2; 1941 #endif 1942 pd_entry_t 
opde; 1943 struct pmap *curpmap = vm_map_pmap(&curlwp->l_proc->p_vmspace->vm_map); 1944 1945 KASSERT(pmap != pmap_kernel()); 1946 KASSERT(mutex_owned(&pmap->pm_lock)); 1947 KASSERT(kpreempt_disabled()); 1948 1949 level = 1; 1950 do { 1951 index = pl_i(va, level + 1); 1952 opde = pmap_pte_testset(&pdes[level - 1][index], 0); 1953 #if defined(XEN) && defined(__x86_64__) 1954 /* 1955 * If ptp is a L3 currently mapped in kernel space, 1956 * clear it before freeing 1957 */ 1958 if (pmap_pdirpa(pmap, 0) == curcpu()->ci_xen_current_user_pgd 1959 && level == PTP_LEVELS - 1) 1960 pmap_pte_set(&pmap_kernel()->pm_pdir[index], 0); 1961 #endif /* XEN && __x86_64__ */ 1962 pmap_freepage(pmap, ptp, level); 1963 invaladdr = level == 1 ? (vaddr_t)ptes : 1964 (vaddr_t)pdes[level - 2]; 1965 pmap_tlb_shootdown(curpmap, invaladdr + index * PAGE_SIZE, 1966 0, opde); 1967 #if defined(MULTIPROCESSOR) 1968 invaladdr2 = level == 1 ? (vaddr_t)PTE_BASE : 1969 (vaddr_t)normal_pdes[level - 2]; 1970 if (pmap != curpmap || invaladdr != invaladdr2) { 1971 pmap_tlb_shootdown(pmap, invaladdr2 + index * PAGE_SIZE, 1972 0, opde); 1973 } 1974 #endif 1975 if (level < PTP_LEVELS - 1) { 1976 ptp = pmap_find_ptp(pmap, va, (paddr_t)-1, level + 1); 1977 ptp->wire_count--; 1978 if (ptp->wire_count > 1) 1979 break; 1980 } 1981 } while (++level < PTP_LEVELS); 1982 pmap_pte_flush(); 1983 } 1984 1985 /* 1986 * pmap_get_ptp: get a PTP (if there isn't one, allocate a new one) 1987 * 1988 * => pmap should NOT be pmap_kernel() 1989 * => pmap should be locked 1990 * => preemption should be disabled 1991 */ 1992 1993 static struct vm_page * 1994 pmap_get_ptp(struct pmap *pmap, vaddr_t va, pd_entry_t * const *pdes) 1995 { 1996 struct vm_page *ptp, *pptp; 1997 int i; 1998 unsigned long index; 1999 pd_entry_t *pva; 2000 paddr_t ppa, pa; 2001 struct uvm_object *obj; 2002 2003 KASSERT(pmap != pmap_kernel()); 2004 KASSERT(mutex_owned(&pmap->pm_lock)); 2005 KASSERT(kpreempt_disabled()); 2006 2007 ptp = NULL; 2008 pa = (paddr_t)-1; 2009 2010 /* 2011 * Loop through all page table levels seeing if we need to 2012 * add a new page to that level. 2013 */ 2014 for (i = PTP_LEVELS; i > 1; i--) { 2015 /* 2016 * Save values from previous round. 2017 */ 2018 pptp = ptp; 2019 ppa = pa; 2020 2021 index = pl_i(va, i); 2022 pva = pdes[i - 2]; 2023 2024 if (pmap_valid_entry(pva[index])) { 2025 ppa = pmap_pte2pa(pva[index]); 2026 ptp = NULL; 2027 continue; 2028 } 2029 2030 obj = &pmap->pm_obj[i-2]; 2031 PMAP_SUBOBJ_LOCK(pmap, i - 2); 2032 ptp = uvm_pagealloc(obj, ptp_va2o(va, i - 1), NULL, 2033 UVM_PGA_USERESERVE|UVM_PGA_ZERO); 2034 PMAP_SUBOBJ_UNLOCK(pmap, i - 2); 2035 2036 if (ptp == NULL) 2037 return NULL; 2038 2039 ptp->flags &= ~PG_BUSY; /* never busy */ 2040 ptp->wire_count = 1; 2041 pmap->pm_ptphint[i - 2] = ptp; 2042 pa = VM_PAGE_TO_PHYS(ptp); 2043 pmap_pte_set(&pva[index], (pd_entry_t) 2044 (pmap_pa2pte(pa) | PG_u | PG_RW | PG_V)); 2045 #if defined(XEN) && defined(__x86_64__) 2046 /* 2047 * In Xen we must enter the mapping in kernel map too 2048 * if pmap is curmap and modifying top level (PGD) 2049 */ 2050 if(i == PTP_LEVELS && pmap != pmap_kernel()) { 2051 pmap_pte_set(&pmap_kernel()->pm_pdir[index], 2052 (pd_entry_t) (pmap_pa2pte(pa) 2053 | PG_u | PG_RW | PG_V)); 2054 } 2055 #endif /* XEN && __x86_64__ */ 2056 pmap_pte_flush(); 2057 pmap_stats_update(pmap, 1, 0); 2058 /* 2059 * If we're not in the top level, increase the 2060 * wire count of the parent page. 
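 * (a PTP's wire_count holds one reference for the page itself plus
 * one for each valid entry installed below it, so hooking this new
 * child PTP into its parent adds a reference there; pmap_free_ptp
 * drops it again when the child is torn down.)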
2061 */ 2062 if (i < PTP_LEVELS) { 2063 if (pptp == NULL) 2064 pptp = pmap_find_ptp(pmap, va, ppa, i); 2065 #ifdef DIAGNOSTIC 2066 if (pptp == NULL) 2067 panic("pde page disappeared"); 2068 #endif 2069 pptp->wire_count++; 2070 } 2071 } 2072 2073 /* 2074 * ptp is not NULL if we just allocated a new ptp. If it's 2075 * still NULL, we must look up the existing one. 2076 */ 2077 if (ptp == NULL) { 2078 ptp = pmap_find_ptp(pmap, va, ppa, 1); 2079 #ifdef DIAGNOSTIC 2080 if (ptp == NULL) { 2081 printf("va %" PRIxVADDR " ppa %" PRIxPADDR "\n", 2082 va, ppa); 2083 panic("pmap_get_ptp: unmanaged user PTP"); 2084 } 2085 #endif 2086 } 2087 2088 pmap->pm_ptphint[0] = ptp; 2089 return(ptp); 2090 } 2091 2092 /* 2093 * p m a p l i f e c y c l e f u n c t i o n s 2094 */ 2095 2096 /* 2097 * pmap_pdp_ctor: constructor for the PDP cache. 2098 */ 2099 2100 int 2101 pmap_pdp_ctor(void *arg, void *v, int flags) 2102 { 2103 pd_entry_t *pdir = v; 2104 paddr_t pdirpa = 0; /* XXX: GCC */ 2105 vaddr_t object; 2106 int i; 2107 2108 #if !defined(XEN) || !defined(__x86_64__) 2109 int npde; 2110 #endif 2111 #ifdef XEN 2112 int s; 2113 #endif 2114 2115 /* 2116 * NOTE: The `pmap_lock' is held when the PDP is allocated. 2117 */ 2118 2119 #if defined(XEN) && defined(__x86_64__) 2120 /* fetch the physical address of the page directory. */ 2121 (void) pmap_extract(pmap_kernel(), (vaddr_t) pdir, &pdirpa); 2122 2123 /* zero init area */ 2124 memset (pdir, 0, PAGE_SIZE); /* Xen wants a clean page */ 2125 /* 2126 * this pdir will NEVER be active in kernel mode 2127 * so mark recursive entry invalid 2128 */ 2129 pdir[PDIR_SLOT_PTE] = pmap_pa2pte(pdirpa) | PG_u; 2130 /* 2131 * PDP constructed this way won't be for kernel, 2132 * hence we don't put kernel mappings on Xen. 2133 * But we need to make pmap_create() happy, so put a dummy (without 2134 * PG_V) value at the right place. 2135 */ 2136 pdir[PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1] - 1] = 2137 (pd_entry_t)-1 & PG_FRAME; 2138 #else /* XEN && __x86_64__*/ 2139 /* zero init area */ 2140 memset(pdir, 0, PDIR_SLOT_PTE * sizeof(pd_entry_t)); 2141 2142 object = (vaddr_t)v; 2143 for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) { 2144 /* fetch the physical address of the page directory. 
*/ 2145 (void) pmap_extract(pmap_kernel(), object, &pdirpa); 2146 /* put in recursive PDE to map the PTEs */ 2147 pdir[PDIR_SLOT_PTE + i] = pmap_pa2pte(pdirpa) | PG_V; 2148 #ifndef XEN 2149 pdir[PDIR_SLOT_PTE + i] |= PG_KW; 2150 #endif 2151 } 2152 2153 /* copy kernel's PDE */ 2154 npde = nkptp[PTP_LEVELS - 1]; 2155 2156 memcpy(&pdir[PDIR_SLOT_KERN], &PDP_BASE[PDIR_SLOT_KERN], 2157 npde * sizeof(pd_entry_t)); 2158 2159 /* zero the rest */ 2160 memset(&pdir[PDIR_SLOT_KERN + npde], 0, 2161 (NTOPLEVEL_PDES - (PDIR_SLOT_KERN + npde)) * sizeof(pd_entry_t)); 2162 2163 if (VM_MIN_KERNEL_ADDRESS != KERNBASE) { 2164 int idx = pl_i(KERNBASE, PTP_LEVELS); 2165 2166 pdir[idx] = PDP_BASE[idx]; 2167 } 2168 #endif /* XEN && __x86_64__*/ 2169 #ifdef XEN 2170 s = splvm(); 2171 object = (vaddr_t)v; 2172 for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) { 2173 (void) pmap_extract(pmap_kernel(), object, &pdirpa); 2174 /* remap this page RO */ 2175 pmap_kenter_pa(object, pdirpa, VM_PROT_READ, 0); 2176 pmap_update(pmap_kernel()); 2177 /* 2178 * pin as L2/L4 page, we have to do the page with the 2179 * PDIR_SLOT_PTE entries last 2180 */ 2181 #ifdef PAE 2182 if (i == l2tol3(PDIR_SLOT_PTE)) 2183 continue; 2184 #endif 2185 2186 #ifdef __x86_64__ 2187 xpq_queue_pin_l4_table(xpmap_ptom_masked(pdirpa)); 2188 #else 2189 xpq_queue_pin_l2_table(xpmap_ptom_masked(pdirpa)); 2190 #endif 2191 } 2192 #ifdef PAE 2193 object = ((vaddr_t)pdir) + PAGE_SIZE * l2tol3(PDIR_SLOT_PTE); 2194 (void)pmap_extract(pmap_kernel(), object, &pdirpa); 2195 xpq_queue_pin_l2_table(xpmap_ptom_masked(pdirpa)); 2196 #endif 2197 splx(s); 2198 #endif /* XEN */ 2199 2200 return (0); 2201 } 2202 2203 /* 2204 * pmap_pdp_dtor: destructor for the PDP cache. 2205 */ 2206 2207 void 2208 pmap_pdp_dtor(void *arg, void *v) 2209 { 2210 #ifdef XEN 2211 paddr_t pdirpa = 0; /* XXX: GCC */ 2212 vaddr_t object = (vaddr_t)v; 2213 int i; 2214 int s = splvm(); 2215 pt_entry_t *pte; 2216 2217 for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) { 2218 /* fetch the physical address of the page directory. */ 2219 (void) pmap_extract(pmap_kernel(), object, &pdirpa); 2220 /* unpin page table */ 2221 xpq_queue_unpin_table(xpmap_ptom_masked(pdirpa)); 2222 } 2223 object = (vaddr_t)v; 2224 for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) { 2225 /* Set page RW again */ 2226 pte = kvtopte(object); 2227 xpq_queue_pte_update(xpmap_ptetomach(pte), *pte | PG_RW); 2228 xpq_queue_invlpg((vaddr_t)object); 2229 } 2230 splx(s); 2231 #endif /* XEN */ 2232 } 2233 2234 #ifdef PAE 2235 2236 /* pmap_pdp_alloc: Allocate a page for the pdp memory pool. */ 2237 2238 void * 2239 pmap_pdp_alloc(struct pool *pp, int flags) 2240 { 2241 return (void *)uvm_km_alloc(kernel_map, 2242 PAGE_SIZE * PDP_SIZE, PAGE_SIZE * PDP_SIZE, 2243 ((flags & PR_WAITOK) ? 0 : UVM_KMF_NOWAIT | UVM_KMF_TRYLOCK) 2244 | UVM_KMF_WIRED); 2245 } 2246 2247 /* 2248 * pmap_pdp_free: free a PDP 2249 */ 2250 2251 void 2252 pmap_pdp_free(struct pool *pp, void *v) 2253 { 2254 uvm_km_free(kernel_map, (vaddr_t)v, PAGE_SIZE * PDP_SIZE, 2255 UVM_KMF_WIRED); 2256 } 2257 #endif /* PAE */ 2258 2259 /* 2260 * pmap_create: create a pmap 2261 * 2262 * => note: old pmap interface took a "size" args which allowed for 2263 * the creation of "software only" pmaps (not in bsd). 
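 *
 * => minimal usage sketch (illustrative only; normally the MI VM
 *    code drives this): the pmap returned here carries a single
 *    reference, which a matching pmap_destroy() eventually drops:
 *
 *	struct pmap *pm;
 *
 *	pm = pmap_create();
 *	... enter mappings, activate, etc ...
 *	pmap_destroy(pm);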
2264 */ 2265 2266 struct pmap * 2267 pmap_create(void) 2268 { 2269 struct pmap *pmap; 2270 int i; 2271 2272 pmap = pool_cache_get(&pmap_cache, PR_WAITOK); 2273 2274 /* init uvm_object */ 2275 for (i = 0; i < PTP_LEVELS - 1; i++) { 2276 UVM_OBJ_INIT(&pmap->pm_obj[i], NULL, 1); 2277 pmap->pm_ptphint[i] = NULL; 2278 } 2279 pmap->pm_stats.wired_count = 0; 2280 /* count the PDP allocd below */ 2281 pmap->pm_stats.resident_count = PDP_SIZE; 2282 #if !defined(__x86_64__) 2283 pmap->pm_hiexec = 0; 2284 #endif /* !defined(__x86_64__) */ 2285 pmap->pm_flags = 0; 2286 pmap->pm_cpus = 0; 2287 pmap->pm_kernel_cpus = 0; 2288 2289 /* init the LDT */ 2290 pmap->pm_ldt = NULL; 2291 pmap->pm_ldt_len = 0; 2292 pmap->pm_ldt_sel = GSYSSEL(GLDT_SEL, SEL_KPL); 2293 2294 /* allocate PDP */ 2295 try_again: 2296 pmap->pm_pdir = pool_cache_get(&pmap_pdp_cache, PR_WAITOK); 2297 2298 mutex_enter(&pmaps_lock); 2299 2300 if (pmap->pm_pdir[PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1] - 1] == 0) { 2301 mutex_exit(&pmaps_lock); 2302 pool_cache_destruct_object(&pmap_pdp_cache, pmap->pm_pdir); 2303 goto try_again; 2304 } 2305 2306 for (i = 0; i < PDP_SIZE; i++) 2307 pmap->pm_pdirpa[i] = 2308 pmap_pte2pa(pmap->pm_pdir[PDIR_SLOT_PTE + i]); 2309 2310 LIST_INSERT_HEAD(&pmaps, pmap, pm_list); 2311 2312 mutex_exit(&pmaps_lock); 2313 2314 return (pmap); 2315 } 2316 2317 /* 2318 * pmap_destroy: drop reference count on pmap. free pmap if 2319 * reference count goes to zero. 2320 */ 2321 2322 void 2323 pmap_destroy(struct pmap *pmap) 2324 { 2325 int i; 2326 #ifdef DIAGNOSTIC 2327 struct cpu_info *ci; 2328 CPU_INFO_ITERATOR cii; 2329 #endif /* DIAGNOSTIC */ 2330 2331 /* 2332 * if we have torn down this pmap, process deferred frees and 2333 * invalidations now. 2334 */ 2335 if (__predict_false(curlwp->l_md.md_gc_pmap == pmap)) { 2336 pmap_update(pmap); 2337 } 2338 2339 /* 2340 * drop reference count 2341 */ 2342 2343 if (atomic_dec_uint_nv(&pmap->pm_obj[0].uo_refs) > 0) { 2344 return; 2345 } 2346 2347 #ifdef DIAGNOSTIC 2348 for (CPU_INFO_FOREACH(cii, ci)) 2349 if (ci->ci_pmap == pmap) 2350 panic("destroying pmap being used"); 2351 #endif /* DIAGNOSTIC */ 2352 2353 /* 2354 * reference count is zero, free pmap resources and then free pmap. 2355 */ 2356 #ifdef XEN 2357 /* 2358 * Xen lazy APDP handling: 2359 * clear APDP_PDE if pmap is the currently mapped 2360 */ 2361 if (xpmap_ptom_masked(pmap_pdirpa(pmap, 0)) == (*APDP_PDE & PG_FRAME)) { 2362 kpreempt_disable(); 2363 pmap_unmap_apdp(); 2364 pmap_pte_flush(); 2365 pmap_apte_flush(pmap_kernel()); 2366 kpreempt_enable(); 2367 } 2368 #endif 2369 2370 /* 2371 * remove it from global list of pmaps 2372 */ 2373 2374 mutex_enter(&pmaps_lock); 2375 LIST_REMOVE(pmap, pm_list); 2376 mutex_exit(&pmaps_lock); 2377 2378 /* 2379 * destroyed pmap shouldn't have remaining PTPs 2380 */ 2381 2382 for (i = 0; i < PTP_LEVELS - 1; i++) { 2383 KASSERT(pmap->pm_obj[i].uo_npages == 0); 2384 KASSERT(TAILQ_EMPTY(&pmap->pm_obj[i].memq)); 2385 } 2386 2387 /* 2388 * MULTIPROCESSOR -- no need to flush out of other processors' 2389 * APTE space because we do that in pmap_unmap_ptes(). 2390 */ 2391 pool_cache_put(&pmap_pdp_cache, pmap->pm_pdir); 2392 2393 #ifdef USER_LDT 2394 if (pmap->pm_ldt != NULL) { 2395 /* 2396 * no need to switch the LDT; this address space is gone, 2397 * nothing is using it. 2398 * 2399 * No need to lock the pmap for ldt_free (or anything else), 2400 * we're the last one to use it. 
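 * (cpu_lock is still taken below because ldt_free() manipulates
 * global LDT state and expects to run with that lock held.)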
2401 */ 2402 mutex_enter(&cpu_lock); 2403 ldt_free(pmap->pm_ldt_sel); 2404 mutex_exit(&cpu_lock); 2405 uvm_km_free(kernel_map, (vaddr_t)pmap->pm_ldt, 2406 pmap->pm_ldt_len, UVM_KMF_WIRED); 2407 } 2408 #endif 2409 2410 for (i = 0; i < PTP_LEVELS - 1; i++) 2411 mutex_destroy(&pmap->pm_obj[i].vmobjlock); 2412 pool_cache_put(&pmap_cache, pmap); 2413 } 2414 2415 /* 2416 * pmap_remove_all: pmap is being torn down by the current thread. 2417 * avoid unnecessary invalidations. 2418 */ 2419 2420 void 2421 pmap_remove_all(struct pmap *pmap) 2422 { 2423 lwp_t *l = curlwp; 2424 2425 KASSERT(l->l_md.md_gc_pmap == NULL); 2426 2427 l->l_md.md_gc_pmap = pmap; 2428 } 2429 2430 #if defined(PMAP_FORK) 2431 /* 2432 * pmap_fork: perform any necessary data structure manipulation when 2433 * a VM space is forked. 2434 */ 2435 2436 void 2437 pmap_fork(struct pmap *pmap1, struct pmap *pmap2) 2438 { 2439 #ifdef USER_LDT 2440 union descriptor *new_ldt; 2441 size_t len; 2442 int sel; 2443 2444 if (__predict_true(pmap1->pm_ldt == NULL)) { 2445 return; 2446 } 2447 2448 retry: 2449 if (pmap1->pm_ldt != NULL) { 2450 len = pmap1->pm_ldt_len; 2451 new_ldt = (union descriptor *)uvm_km_alloc(kernel_map, len, 0, 2452 UVM_KMF_WIRED); 2453 mutex_enter(&cpu_lock); 2454 sel = ldt_alloc(new_ldt, len); 2455 if (sel == -1) { 2456 mutex_exit(&cpu_lock); 2457 uvm_km_free(kernel_map, (vaddr_t)new_ldt, len, 2458 UVM_KMF_WIRED); 2459 printf("WARNING: pmap_fork: unable to allocate LDT\n"); 2460 return; 2461 } 2462 } else { 2463 len = -1; 2464 new_ldt = NULL; 2465 sel = -1; 2466 mutex_enter(&cpu_lock); 2467 } 2468 2469 /* Copy the LDT, if necessary. */ 2470 if (pmap1->pm_ldt != NULL) { 2471 if (len != pmap1->pm_ldt_len) { 2472 if (len != -1) { 2473 ldt_free(sel); 2474 uvm_km_free(kernel_map, (vaddr_t)new_ldt, 2475 len, UVM_KMF_WIRED); 2476 } 2477 mutex_exit(&cpu_lock); 2478 goto retry; 2479 } 2480 2481 memcpy(new_ldt, pmap1->pm_ldt, len); 2482 pmap2->pm_ldt = new_ldt; 2483 pmap2->pm_ldt_len = pmap1->pm_ldt_len; 2484 pmap2->pm_ldt_sel = sel; 2485 len = -1; 2486 } 2487 2488 if (len != -1) { 2489 ldt_free(sel); 2490 uvm_km_free(kernel_map, (vaddr_t)new_ldt, len, 2491 UVM_KMF_WIRED); 2492 } 2493 mutex_exit(&cpu_lock); 2494 #endif /* USER_LDT */ 2495 } 2496 #endif /* PMAP_FORK */ 2497 2498 #ifdef USER_LDT 2499 2500 /* 2501 * pmap_ldt_xcall: cross call used by pmap_ldt_sync. if the named pmap 2502 * is active, reload LDTR. 2503 */ 2504 static void 2505 pmap_ldt_xcall(void *arg1, void *arg2) 2506 { 2507 struct pmap *pm; 2508 2509 kpreempt_disable(); 2510 pm = arg1; 2511 if (curcpu()->ci_pmap == pm) { 2512 lldt(pm->pm_ldt_sel); 2513 } 2514 kpreempt_enable(); 2515 } 2516 2517 /* 2518 * pmap_ldt_sync: LDT selector for the named pmap is changing. swap 2519 * in the new selector on all CPUs. 2520 */ 2521 void 2522 pmap_ldt_sync(struct pmap *pm) 2523 { 2524 uint64_t where; 2525 2526 KASSERT(mutex_owned(&cpu_lock)); 2527 2528 pmap_ldt_evcnt.ev_count++; 2529 where = xc_broadcast(0, pmap_ldt_xcall, pm, NULL); 2530 xc_wait(where); 2531 } 2532 2533 /* 2534 * pmap_ldt_cleanup: if the pmap has a local LDT, deallocate it, and 2535 * restore the default. 
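 * ("the default" is the global LDT selector GSYSSEL(GLDT_SEL, SEL_KPL);
 * the new selector is pushed to every CPU running this pmap via
 * pmap_ldt_sync() before the old LDT memory is freed.)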
2536 */ 2537 2538 void 2539 pmap_ldt_cleanup(struct lwp *l) 2540 { 2541 pmap_t pmap = l->l_proc->p_vmspace->vm_map.pmap; 2542 union descriptor *dp = NULL; 2543 size_t len = 0; 2544 int sel = -1; 2545 2546 if (__predict_true(pmap->pm_ldt == NULL)) { 2547 return; 2548 } 2549 2550 mutex_enter(&cpu_lock); 2551 if (pmap->pm_ldt != NULL) { 2552 sel = pmap->pm_ldt_sel; 2553 dp = pmap->pm_ldt; 2554 len = pmap->pm_ldt_len; 2555 pmap->pm_ldt_sel = GSYSSEL(GLDT_SEL, SEL_KPL); 2556 pmap->pm_ldt = NULL; 2557 pmap->pm_ldt_len = 0; 2558 pmap_ldt_sync(pmap); 2559 ldt_free(sel); 2560 uvm_km_free(kernel_map, (vaddr_t)dp, len, UVM_KMF_WIRED); 2561 } 2562 mutex_exit(&cpu_lock); 2563 } 2564 #endif /* USER_LDT */ 2565 2566 /* 2567 * pmap_activate: activate a process' pmap 2568 * 2569 * => must be called with kernel preemption disabled 2570 * => if lwp is the curlwp, then set ci_want_pmapload so that 2571 * actual MMU context switch will be done by pmap_load() later 2572 */ 2573 2574 void 2575 pmap_activate(struct lwp *l) 2576 { 2577 struct cpu_info *ci; 2578 struct pmap *pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map); 2579 2580 KASSERT(kpreempt_disabled()); 2581 2582 ci = curcpu(); 2583 2584 if (l == ci->ci_curlwp) { 2585 KASSERT(ci->ci_want_pmapload == 0); 2586 KASSERT(ci->ci_tlbstate != TLBSTATE_VALID); 2587 #ifdef KSTACK_CHECK_DR0 2588 /* 2589 * setup breakpoint on the top of stack 2590 */ 2591 if (l == &lwp0) 2592 dr0(0, 0, 0, 0); 2593 else 2594 dr0(KSTACK_LOWEST_ADDR(l), 1, 3, 1); 2595 #endif 2596 2597 /* 2598 * no need to switch to kernel vmspace because 2599 * it's a subset of any vmspace. 2600 */ 2601 2602 if (pmap == pmap_kernel()) { 2603 ci->ci_want_pmapload = 0; 2604 return; 2605 } 2606 2607 ci->ci_want_pmapload = 1; 2608 } 2609 } 2610 2611 /* 2612 * pmap_reactivate: try to regain reference to the pmap. 2613 * 2614 * => must be called with kernel preemption disabled 2615 */ 2616 2617 static bool 2618 pmap_reactivate(struct pmap *pmap) 2619 { 2620 struct cpu_info *ci; 2621 uint32_t cpumask; 2622 bool result; 2623 uint32_t oldcpus; 2624 2625 ci = curcpu(); 2626 cpumask = ci->ci_cpumask; 2627 2628 KASSERT(kpreempt_disabled()); 2629 #if defined(XEN) && defined(__x86_64__) 2630 KASSERT(pmap_pdirpa(pmap, 0) == ci->ci_xen_current_user_pgd); 2631 #elif defined(PAE) 2632 KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(ci->ci_pae_l3_pdir[0])); 2633 #elif !defined(XEN) 2634 KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(rcr3())); 2635 #endif 2636 2637 /* 2638 * if we still have a lazy reference to this pmap, 2639 * we can assume that there was no tlb shootdown 2640 * for this pmap in the meantime. 2641 * 2642 * the order of events here is important as we must 2643 * synchronize with TLB shootdown interrupts. declare 2644 * interest in invalidations (TLBSTATE_VALID) and then 2645 * check the cpumask, which the IPIs can change only 2646 * when the state is TLBSTATE_LAZY. 2647 */ 2648 2649 ci->ci_tlbstate = TLBSTATE_VALID; 2650 oldcpus = pmap->pm_cpus; 2651 KASSERT((pmap->pm_kernel_cpus & cpumask) != 0); 2652 if (oldcpus & cpumask) { 2653 /* got it */ 2654 result = true; 2655 } else { 2656 /* must reload */ 2657 atomic_or_32(&pmap->pm_cpus, cpumask); 2658 result = false; 2659 } 2660 2661 return result; 2662 } 2663 2664 /* 2665 * pmap_load: actually switch pmap. 
(fill in %cr3 and LDT info) 2666 */ 2667 2668 void 2669 pmap_load(void) 2670 { 2671 struct cpu_info *ci; 2672 uint32_t cpumask; 2673 struct pmap *pmap; 2674 struct pmap *oldpmap; 2675 struct lwp *l; 2676 struct pcb *pcb; 2677 uint64_t ncsw; 2678 2679 kpreempt_disable(); 2680 retry: 2681 ci = curcpu(); 2682 if (!ci->ci_want_pmapload) { 2683 kpreempt_enable(); 2684 return; 2685 } 2686 cpumask = ci->ci_cpumask; 2687 l = ci->ci_curlwp; 2688 ncsw = l->l_ncsw; 2689 2690 /* should be able to take ipis. */ 2691 KASSERT(ci->ci_ilevel < IPL_HIGH); 2692 #ifdef XEN 2693 /* XXX not yet KASSERT(x86_read_psl() != 0); */ 2694 #else 2695 KASSERT((x86_read_psl() & PSL_I) != 0); 2696 #endif 2697 2698 KASSERT(l != NULL); 2699 pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map); 2700 KASSERT(pmap != pmap_kernel()); 2701 oldpmap = ci->ci_pmap; 2702 pcb = lwp_getpcb(l); 2703 2704 if (pmap == oldpmap) { 2705 if (!pmap_reactivate(pmap)) { 2706 u_int gen = uvm_emap_gen_return(); 2707 2708 /* 2709 * pmap has been changed during deactivated. 2710 * our tlb may be stale. 2711 */ 2712 2713 tlbflush(); 2714 uvm_emap_update(gen); 2715 } 2716 2717 ci->ci_want_pmapload = 0; 2718 kpreempt_enable(); 2719 return; 2720 } 2721 2722 /* 2723 * grab a reference to the new pmap. 2724 */ 2725 2726 pmap_reference(pmap); 2727 2728 /* 2729 * actually switch pmap. 2730 */ 2731 2732 atomic_and_32(&oldpmap->pm_cpus, ~cpumask); 2733 atomic_and_32(&oldpmap->pm_kernel_cpus, ~cpumask); 2734 2735 #if defined(XEN) && defined(__x86_64__) 2736 KASSERT(pmap_pdirpa(oldpmap, 0) == ci->ci_xen_current_user_pgd || 2737 oldpmap == pmap_kernel()); 2738 #elif defined(PAE) 2739 KASSERT(pmap_pdirpa(oldpmap, 0) == pmap_pte2pa(ci->ci_pae_l3_pdir[0])); 2740 #elif !defined(XEN) 2741 KASSERT(pmap_pdirpa(oldpmap, 0) == pmap_pte2pa(rcr3())); 2742 #endif 2743 KASSERT((pmap->pm_cpus & cpumask) == 0); 2744 KASSERT((pmap->pm_kernel_cpus & cpumask) == 0); 2745 2746 /* 2747 * mark the pmap in use by this processor. again we must 2748 * synchronize with TLB shootdown interrupts, so set the 2749 * state VALID first, then register us for shootdown events 2750 * on this pmap. 2751 */ 2752 2753 ci->ci_tlbstate = TLBSTATE_VALID; 2754 atomic_or_32(&pmap->pm_cpus, cpumask); 2755 atomic_or_32(&pmap->pm_kernel_cpus, cpumask); 2756 ci->ci_pmap = pmap; 2757 2758 /* 2759 * update tss. now that we have registered for invalidations 2760 * from other CPUs, we're good to load the page tables. 2761 */ 2762 #ifdef PAE 2763 pcb->pcb_cr3 = ci->ci_pae_l3_pdirpa; 2764 #else 2765 pcb->pcb_cr3 = pmap_pdirpa(pmap, 0); 2766 #endif 2767 2768 #ifdef i386 2769 #ifdef XEN 2770 /* 2771 * clear APDP slot, in case it points to a page table that has 2772 * been freed 2773 */ 2774 if (*APDP_PDE) { 2775 pmap_unmap_apdp(); 2776 } 2777 /* lldt() does pmap_pte_flush() */ 2778 #endif /* XEN */ 2779 2780 #ifndef XEN 2781 ci->ci_tss.tss_ldt = pmap->pm_ldt_sel; 2782 ci->ci_tss.tss_cr3 = pcb->pcb_cr3; 2783 #endif /* !XEN */ 2784 #endif /* i386 */ 2785 2786 lldt(pmap->pm_ldt_sel); 2787 2788 u_int gen = uvm_emap_gen_return(); 2789 cpu_load_pmap(pmap); 2790 uvm_emap_update(gen); 2791 2792 ci->ci_want_pmapload = 0; 2793 2794 /* 2795 * we're now running with the new pmap. drop the reference 2796 * to the old pmap. if we block, we need to go around again. 
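 * (blocking is detected by comparing l->l_ncsw with the value
 * sampled on entry; if it changed we were preempted, so curcpu()
 * and ci_want_pmapload must be re-evaluated from the top.)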
2797 */ 2798 2799 pmap_destroy(oldpmap); 2800 if (l->l_ncsw != ncsw) { 2801 goto retry; 2802 } 2803 2804 kpreempt_enable(); 2805 } 2806 2807 /* 2808 * pmap_deactivate: deactivate a process' pmap 2809 * 2810 * => must be called with kernel preemption disabled (high SPL is enough) 2811 */ 2812 2813 void 2814 pmap_deactivate(struct lwp *l) 2815 { 2816 struct pmap *pmap; 2817 struct cpu_info *ci; 2818 2819 KASSERT(kpreempt_disabled()); 2820 2821 if (l != curlwp) { 2822 return; 2823 } 2824 2825 /* 2826 * wait for pending TLB shootdowns to complete. necessary 2827 * because TLB shootdown state is per-CPU, and the LWP may 2828 * be coming off the CPU before it has a chance to call 2829 * pmap_update(). 2830 */ 2831 pmap_tlb_shootwait(); 2832 2833 ci = curcpu(); 2834 2835 if (ci->ci_want_pmapload) { 2836 KASSERT(vm_map_pmap(&l->l_proc->p_vmspace->vm_map) 2837 != pmap_kernel()); 2838 KASSERT(vm_map_pmap(&l->l_proc->p_vmspace->vm_map) 2839 != ci->ci_pmap || ci->ci_tlbstate != TLBSTATE_VALID); 2840 2841 /* 2842 * userspace has not been touched. 2843 * nothing to do here. 2844 */ 2845 2846 ci->ci_want_pmapload = 0; 2847 return; 2848 } 2849 2850 pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map); 2851 2852 if (pmap == pmap_kernel()) { 2853 return; 2854 } 2855 2856 #if defined(XEN) && defined(__x86_64__) 2857 KASSERT(pmap_pdirpa(pmap, 0) == ci->ci_xen_current_user_pgd); 2858 #elif defined(PAE) 2859 KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(ci->ci_pae_l3_pdir[0])); 2860 #elif !defined(XEN) 2861 KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(rcr3())); 2862 #endif 2863 KASSERT(ci->ci_pmap == pmap); 2864 2865 /* 2866 * we aren't interested in TLB invalidations for this pmap, 2867 * at least for the time being. 2868 */ 2869 2870 KASSERT(ci->ci_tlbstate == TLBSTATE_VALID); 2871 ci->ci_tlbstate = TLBSTATE_LAZY; 2872 } 2873 2874 /* 2875 * end of lifecycle functions 2876 */ 2877 2878 /* 2879 * some misc. functions 2880 */ 2881 2882 int 2883 pmap_pdes_invalid(vaddr_t va, pd_entry_t * const *pdes, pd_entry_t *lastpde) 2884 { 2885 int i; 2886 unsigned long index; 2887 pd_entry_t pde; 2888 2889 for (i = PTP_LEVELS; i > 1; i--) { 2890 index = pl_i(va, i); 2891 pde = pdes[i - 2][index]; 2892 if ((pde & PG_V) == 0) 2893 return i; 2894 } 2895 if (lastpde != NULL) 2896 *lastpde = pde; 2897 return 0; 2898 } 2899 2900 /* 2901 * pmap_extract: extract a PA for the given VA 2902 */ 2903 2904 bool 2905 pmap_extract(struct pmap *pmap, vaddr_t va, paddr_t *pap) 2906 { 2907 pt_entry_t *ptes, pte; 2908 pd_entry_t pde; 2909 pd_entry_t * const *pdes; 2910 struct pmap *pmap2; 2911 struct cpu_info *ci; 2912 paddr_t pa; 2913 lwp_t *l; 2914 bool hard, rv; 2915 2916 rv = false; 2917 pa = 0; 2918 l = curlwp; 2919 2920 KPREEMPT_DISABLE(l); 2921 ci = l->l_cpu; 2922 if (__predict_true(!ci->ci_want_pmapload && ci->ci_pmap == pmap) || 2923 pmap == pmap_kernel()) { 2924 /* 2925 * no need to lock, because it's pmap_kernel() or our 2926 * own pmap and is active. if a user pmap, the caller 2927 * will hold the vm_map write/read locked and so prevent 2928 * entries from disappearing while we are here. ptps 2929 * can disappear via pmap_remove() and pmap_protect(), 2930 * but they are called with the vm_map write locked. 2931 */ 2932 hard = false; 2933 ptes = PTE_BASE; 2934 pdes = normal_pdes; 2935 } else { 2936 /* we lose, do it the hard way. 
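 * (the hard way temporarily maps the target pmap's page tables
 * with pmap_map_ptes(), which also locks the pmap; the matching
 * pmap_unmap_ptes() below undoes both.)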
*/ 2937 hard = true; 2938 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); 2939 } 2940 if (pmap_pdes_valid(va, pdes, &pde)) { 2941 pte = ptes[pl1_i(va)]; 2942 if (pde & PG_PS) { 2943 pa = (pde & PG_LGFRAME) | (va & (NBPD_L2 - 1)); 2944 rv = true; 2945 } else if (__predict_true((pte & PG_V) != 0)) { 2946 pa = pmap_pte2pa(pte) | (va & (NBPD_L1 - 1)); 2947 rv = true; 2948 } 2949 } 2950 if (__predict_false(hard)) { 2951 pmap_unmap_ptes(pmap, pmap2); 2952 } 2953 KPREEMPT_ENABLE(l); 2954 if (pap != NULL) { 2955 *pap = pa; 2956 } 2957 return rv; 2958 } 2959 2960 2961 /* 2962 * vtophys: virtual address to physical address. For use by 2963 * machine-dependent code only. 2964 */ 2965 2966 paddr_t 2967 vtophys(vaddr_t va) 2968 { 2969 paddr_t pa; 2970 2971 if (pmap_extract(pmap_kernel(), va, &pa) == true) 2972 return (pa); 2973 return (0); 2974 } 2975 2976 __weak_alias(pmap_extract_ma, pmap_extract); 2977 2978 #ifdef XEN 2979 2980 /* 2981 * vtomach: virtual address to machine address. For use by 2982 * machine-dependent code only. 2983 */ 2984 2985 paddr_t 2986 vtomach(vaddr_t va) 2987 { 2988 paddr_t pa; 2989 2990 if (pmap_extract_ma(pmap_kernel(), va, &pa) == true) 2991 return (pa); 2992 return (0); 2993 } 2994 2995 #endif /* XEN */ 2996 2997 /* 2998 * pmap_virtual_space: used during bootup [pmap_steal_memory] to 2999 * determine the bounds of the kernel virtual addess space. 3000 */ 3001 3002 void 3003 pmap_virtual_space(vaddr_t *startp, vaddr_t *endp) 3004 { 3005 *startp = virtual_avail; 3006 *endp = virtual_end; 3007 } 3008 3009 /* 3010 * pmap_map: map a range of PAs into kvm. 3011 * 3012 * => used during crash dump 3013 * => XXX: pmap_map() should be phased out? 3014 */ 3015 3016 vaddr_t 3017 pmap_map(vaddr_t va, paddr_t spa, paddr_t epa, vm_prot_t prot) 3018 { 3019 while (spa < epa) { 3020 pmap_kenter_pa(va, spa, prot, 0); 3021 va += PAGE_SIZE; 3022 spa += PAGE_SIZE; 3023 } 3024 pmap_update(pmap_kernel()); 3025 return va; 3026 } 3027 3028 /* 3029 * pmap_zero_page: zero a page 3030 */ 3031 3032 void 3033 pmap_zero_page(paddr_t pa) 3034 { 3035 pt_entry_t *zpte; 3036 void *zerova; 3037 int id; 3038 3039 kpreempt_disable(); 3040 id = cpu_number(); 3041 zpte = PTESLEW(zero_pte, id); 3042 zerova = VASLEW(zerop, id); 3043 3044 #ifdef DIAGNOSTIC 3045 if (*zpte) 3046 panic("pmap_zero_page: lock botch"); 3047 #endif 3048 3049 pmap_pte_set(zpte, pmap_pa2pte(pa) | PG_V | PG_RW | PG_M | PG_U | PG_k); 3050 pmap_pte_flush(); 3051 pmap_update_pg((vaddr_t)zerova); /* flush TLB */ 3052 3053 memset(zerova, 0, PAGE_SIZE); 3054 3055 #if defined(DIAGNOSTIC) || defined(XEN) 3056 pmap_pte_set(zpte, 0); /* zap ! */ 3057 pmap_pte_flush(); 3058 #endif 3059 kpreempt_enable(); 3060 } 3061 3062 /* 3063 * pmap_pagezeroidle: the same, for the idle loop page zero'er. 3064 * Returns true if the page was zero'd, false if we aborted for 3065 * some reason. 3066 */ 3067 3068 bool 3069 pmap_pageidlezero(paddr_t pa) 3070 { 3071 pt_entry_t *zpte; 3072 void *zerova; 3073 bool rv; 3074 int id; 3075 3076 id = cpu_number(); 3077 zpte = PTESLEW(zero_pte, id); 3078 zerova = VASLEW(zerop, id); 3079 3080 KASSERT(cpu_feature[0] & CPUID_SSE2); 3081 KASSERT(*zpte == 0); 3082 3083 pmap_pte_set(zpte, pmap_pa2pte(pa) | PG_V | PG_RW | PG_M | PG_U | PG_k); 3084 pmap_pte_flush(); 3085 pmap_update_pg((vaddr_t)zerova); /* flush TLB */ 3086 3087 rv = sse2_idlezero_page(zerova); 3088 3089 #if defined(DIAGNOSTIC) || defined(XEN) 3090 pmap_pte_set(zpte, 0); /* zap ! 
*/ 3091 pmap_pte_flush(); 3092 #endif 3093 3094 return rv; 3095 } 3096 3097 /* 3098 * pmap_copy_page: copy a page 3099 */ 3100 3101 void 3102 pmap_copy_page(paddr_t srcpa, paddr_t dstpa) 3103 { 3104 pt_entry_t *spte; 3105 pt_entry_t *dpte; 3106 void *csrcva; 3107 void *cdstva; 3108 int id; 3109 3110 kpreempt_disable(); 3111 id = cpu_number(); 3112 spte = PTESLEW(csrc_pte,id); 3113 dpte = PTESLEW(cdst_pte,id); 3114 csrcva = VASLEW(csrcp, id); 3115 cdstva = VASLEW(cdstp, id); 3116 3117 KASSERT(*spte == 0 && *dpte == 0); 3118 3119 pmap_pte_set(spte, pmap_pa2pte(srcpa) | PG_V | PG_RW | PG_U | PG_k); 3120 pmap_pte_set(dpte, 3121 pmap_pa2pte(dstpa) | PG_V | PG_RW | PG_M | PG_U | PG_k); 3122 pmap_pte_flush(); 3123 pmap_update_2pg((vaddr_t)csrcva, (vaddr_t)cdstva); 3124 3125 memcpy(cdstva, csrcva, PAGE_SIZE); 3126 3127 #if defined(DIAGNOSTIC) || defined(XEN) 3128 pmap_pte_set(spte, 0); 3129 pmap_pte_set(dpte, 0); 3130 pmap_pte_flush(); 3131 #endif 3132 kpreempt_enable(); 3133 } 3134 3135 static pt_entry_t * 3136 pmap_map_ptp(struct vm_page *ptp) 3137 { 3138 pt_entry_t *ptppte; 3139 void *ptpva; 3140 int id; 3141 3142 KASSERT(kpreempt_disabled()); 3143 3144 id = cpu_number(); 3145 ptppte = PTESLEW(ptp_pte, id); 3146 ptpva = VASLEW(ptpp, id); 3147 #if !defined(XEN) 3148 pmap_pte_set(ptppte, pmap_pa2pte(VM_PAGE_TO_PHYS(ptp)) | PG_V | PG_M | 3149 PG_RW | PG_U | PG_k); 3150 #else 3151 pmap_pte_set(ptppte, pmap_pa2pte(VM_PAGE_TO_PHYS(ptp)) | PG_V | PG_M | 3152 PG_U | PG_k); 3153 #endif 3154 pmap_pte_flush(); 3155 pmap_update_pg((vaddr_t)ptpva); 3156 3157 return (pt_entry_t *)ptpva; 3158 } 3159 3160 static void 3161 pmap_unmap_ptp(void) 3162 { 3163 #if defined(DIAGNOSTIC) || defined(XEN) 3164 pt_entry_t *pte; 3165 3166 KASSERT(kpreempt_disabled()); 3167 3168 pte = PTESLEW(ptp_pte, cpu_number()); 3169 if (*pte != 0) { 3170 pmap_pte_set(pte, 0); 3171 pmap_pte_flush(); 3172 } 3173 #endif 3174 } 3175 3176 static pt_entry_t * 3177 pmap_map_pte(struct pmap *pmap, struct vm_page *ptp, vaddr_t va) 3178 { 3179 3180 KASSERT(kpreempt_disabled()); 3181 if (pmap_is_curpmap(pmap)) { 3182 return &PTE_BASE[pl1_i(va)]; /* (k)vtopte */ 3183 } 3184 KASSERT(ptp != NULL); 3185 return pmap_map_ptp(ptp) + pl1_pi(va); 3186 } 3187 3188 static void 3189 pmap_unmap_pte(void) 3190 { 3191 3192 KASSERT(kpreempt_disabled()); 3193 3194 pmap_unmap_ptp(); 3195 } 3196 3197 /* 3198 * p m a p r e m o v e f u n c t i o n s 3199 * 3200 * functions that remove mappings 3201 */ 3202 3203 /* 3204 * pmap_remove_ptes: remove PTEs from a PTP 3205 * 3206 * => must have proper locking on pmap_master_lock 3207 * => caller must hold pmap's lock 3208 * => PTP must be mapped into KVA 3209 * => PTP should be null if pmap == pmap_kernel() 3210 * => must be called with kernel preemption disabled 3211 * => returns composite pte if at least one page should be shot down 3212 */ 3213 3214 static pt_entry_t 3215 pmap_remove_ptes(struct pmap *pmap, struct vm_page *ptp, vaddr_t ptpva, 3216 vaddr_t startva, vaddr_t endva, struct pv_entry **pv_tofree) 3217 { 3218 struct pv_entry *pve; 3219 pt_entry_t *pte = (pt_entry_t *) ptpva; 3220 pt_entry_t opte, xpte = 0; 3221 3222 KASSERT(pmap == pmap_kernel() || mutex_owned(&pmap->pm_lock)); 3223 KASSERT(kpreempt_disabled()); 3224 3225 /* 3226 * note that ptpva points to the PTE that maps startva. this may 3227 * or may not be the first PTE in the PTP. 
3228 * 3229 * we loop through the PTP while there are still PTEs to look at 3230 * and the wire_count is greater than 1 (because we use the wire_count 3231 * to keep track of the number of real PTEs in the PTP). 3232 */ 3233 3234 for (/*null*/; startva < endva && (ptp == NULL || ptp->wire_count > 1) 3235 ; pte++, startva += PAGE_SIZE) { 3236 struct vm_page *pg; 3237 struct pmap_page *pp; 3238 3239 if (!pmap_valid_entry(*pte)) 3240 continue; /* VA not mapped */ 3241 3242 /* atomically save the old PTE and zap! it */ 3243 opte = pmap_pte_testset(pte, 0); 3244 if (!pmap_valid_entry(opte)) { 3245 continue; 3246 } 3247 3248 pmap_exec_account(pmap, startva, opte, 0); 3249 pmap_stats_update_bypte(pmap, 0, opte); 3250 xpte |= opte; 3251 3252 if (ptp) { 3253 ptp->wire_count--; /* dropping a PTE */ 3254 /* Make sure that the PDE is flushed */ 3255 if (ptp->wire_count <= 1) 3256 xpte |= PG_U; 3257 } 3258 3259 /* 3260 * if we are not on a pv_head list we are done. 3261 */ 3262 3263 if ((opte & PG_PVLIST) == 0) { 3264 #if defined(DIAGNOSTIC) && !defined(DOM0OPS) 3265 if (PHYS_TO_VM_PAGE(pmap_pte2pa(opte)) != NULL) 3266 panic("pmap_remove_ptes: managed page without " 3267 "PG_PVLIST for %#" PRIxVADDR, startva); 3268 #endif 3269 continue; 3270 } 3271 3272 pg = PHYS_TO_VM_PAGE(pmap_pte2pa(opte)); 3273 #ifdef DIAGNOSTIC 3274 if (pg == NULL) 3275 panic("pmap_remove_ptes: unmanaged page marked " 3276 "PG_PVLIST, va = %#" PRIxVADDR ", " 3277 "pa = %#" PRIxPADDR, 3278 startva, (paddr_t)pmap_pte2pa(opte)); 3279 #endif 3280 3281 /* sync R/M bits */ 3282 pp = VM_PAGE_TO_PP(pg); 3283 pp_lock(pp); 3284 pp->pp_attrs |= opte; 3285 pve = pmap_remove_pv(pp, ptp, startva); 3286 pp_unlock(pp); 3287 3288 if (pve != NULL) { 3289 pve->pve_next = *pv_tofree; 3290 *pv_tofree = pve; 3291 } 3292 3293 /* end of "for" loop: time for next pte */ 3294 } 3295 3296 return xpte; 3297 } 3298 3299 3300 /* 3301 * pmap_remove_pte: remove a single PTE from a PTP 3302 * 3303 * => must have proper locking on pmap_master_lock 3304 * => caller must hold pmap's lock 3305 * => PTP must be mapped into KVA 3306 * => PTP should be null if pmap == pmap_kernel() 3307 * => returns true if we removed a mapping 3308 * => must be called with kernel preemption disabled 3309 */ 3310 3311 static bool 3312 pmap_remove_pte(struct pmap *pmap, struct vm_page *ptp, pt_entry_t *pte, 3313 vaddr_t va, struct pv_entry **pv_tofree) 3314 { 3315 pt_entry_t opte; 3316 struct pv_entry *pve; 3317 struct vm_page *pg; 3318 struct pmap_page *pp; 3319 3320 KASSERT(pmap == pmap_kernel() || mutex_owned(&pmap->pm_lock)); 3321 KASSERT(pmap == pmap_kernel() || kpreempt_disabled()); 3322 3323 if (!pmap_valid_entry(*pte)) 3324 return(false); /* VA not mapped */ 3325 3326 /* atomically save the old PTE and zap! it */ 3327 opte = pmap_pte_testset(pte, 0); 3328 if (!pmap_valid_entry(opte)) { 3329 return false; 3330 } 3331 3332 pmap_exec_account(pmap, va, opte, 0); 3333 pmap_stats_update_bypte(pmap, 0, opte); 3334 3335 if (opte & PG_U) 3336 pmap_tlb_shootdown(pmap, va, 0, opte); 3337 3338 if (ptp) { 3339 ptp->wire_count--; /* dropping a PTE */ 3340 /* Make sure that the PDE is flushed */ 3341 if ((ptp->wire_count <= 1) && !(opte & PG_U)) 3342 pmap_tlb_shootdown(pmap, va, 0, opte); 3343 } 3344 3345 /* 3346 * if we are not on a pv_head list we are done. 
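 * (PG_PVLIST marks mappings of managed pages entered via
 * pmap_enter_ma(); when it is clear there is no pv_entry to
 * unlink and no attribute bits to sync back.)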
3347 */ 3348 3349 if ((opte & PG_PVLIST) == 0) { 3350 #if defined(DIAGNOSTIC) && !defined(DOM0OPS) 3351 if (PHYS_TO_VM_PAGE(pmap_pte2pa(opte)) != NULL) 3352 panic("pmap_remove_pte: managed page without " 3353 "PG_PVLIST for %#" PRIxVADDR, va); 3354 #endif 3355 return(true); 3356 } 3357 3358 pg = PHYS_TO_VM_PAGE(pmap_pte2pa(opte)); 3359 #ifdef DIAGNOSTIC 3360 if (pg == NULL) 3361 panic("pmap_remove_pte: unmanaged page marked " 3362 "PG_PVLIST, va = %#" PRIxVADDR ", pa = %#" PRIxPADDR, 3363 va, (paddr_t)pmap_pte2pa(opte)); 3364 #endif 3365 3366 /* sync R/M bits */ 3367 pp = VM_PAGE_TO_PP(pg); 3368 pp_lock(pp); 3369 pp->pp_attrs |= opte; 3370 pve = pmap_remove_pv(pp, ptp, va); 3371 pp_unlock(pp); 3372 3373 if (pve) { 3374 pve->pve_next = *pv_tofree; 3375 *pv_tofree = pve; 3376 } 3377 3378 return(true); 3379 } 3380 3381 /* 3382 * pmap_remove: mapping removal function. 3383 * 3384 * => caller should not be holding any pmap locks 3385 */ 3386 3387 void 3388 pmap_remove(struct pmap *pmap, vaddr_t sva, vaddr_t eva) 3389 { 3390 pt_entry_t *ptes, xpte = 0; 3391 pd_entry_t pde; 3392 pd_entry_t * const *pdes; 3393 struct pv_entry *pv_tofree = NULL; 3394 bool result; 3395 int i; 3396 paddr_t ptppa; 3397 vaddr_t blkendva, va = sva; 3398 struct vm_page *ptp; 3399 struct pmap *pmap2; 3400 3401 kpreempt_disable(); 3402 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); /* locks pmap */ 3403 3404 /* 3405 * removing one page? take shortcut function. 3406 */ 3407 3408 if (va + PAGE_SIZE == eva) { 3409 if (pmap_pdes_valid(va, pdes, &pde)) { 3410 3411 /* PA of the PTP */ 3412 ptppa = pmap_pte2pa(pde); 3413 3414 /* get PTP if non-kernel mapping */ 3415 if (pmap == pmap_kernel()) { 3416 /* we never free kernel PTPs */ 3417 ptp = NULL; 3418 } else { 3419 ptp = pmap_find_ptp(pmap, va, ptppa, 1); 3420 #ifdef DIAGNOSTIC 3421 if (ptp == NULL) 3422 panic("pmap_remove: unmanaged " 3423 "PTP detected"); 3424 #endif 3425 } 3426 3427 /* do it! */ 3428 result = pmap_remove_pte(pmap, ptp, 3429 &ptes[pl1_i(va)], va, &pv_tofree); 3430 3431 /* 3432 * if mapping removed and the PTP is no longer 3433 * being used, free it! 3434 */ 3435 3436 if (result && ptp && ptp->wire_count <= 1) 3437 pmap_free_ptp(pmap, ptp, va, ptes, pdes); 3438 } 3439 } else for (/* null */ ; va < eva ; va = blkendva) { 3440 int lvl; 3441 3442 /* determine range of block */ 3443 blkendva = x86_round_pdr(va+1); 3444 if (blkendva > eva) 3445 blkendva = eva; 3446 3447 /* 3448 * XXXCDC: our PTE mappings should never be removed 3449 * with pmap_remove! if we allow this (and why would 3450 * we?) then we end up freeing the pmap's page 3451 * directory page (PDP) before we are finished using 3452 * it when we hit in in the recursive mapping. this 3453 * is BAD. 3454 * 3455 * long term solution is to move the PTEs out of user 3456 * address space. and into kernel address space (up 3457 * with APTE). then we can set VM_MAXUSER_ADDRESS to 3458 * be VM_MAX_ADDRESS. 3459 */ 3460 3461 /* XXXCDC: ugly hack to avoid freeing PDP here */ 3462 for (i = 0; i < PDP_SIZE; i++) { 3463 if (pl_i(va, PTP_LEVELS) == PDIR_SLOT_PTE+i) 3464 continue; 3465 } 3466 3467 lvl = pmap_pdes_invalid(va, pdes, &pde); 3468 if (lvl != 0) { 3469 /* 3470 * skip a range corresponding to an invalid pde. 
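 * (an invalid PDE at level lvl covers nbpd[lvl - 1] bytes of VA,
 * so round va down to that block boundary and step one whole
 * block past it, as computed below.)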
3471 */ 3472 blkendva = (va & ptp_masks[lvl - 1]) + nbpd[lvl - 1]; 3473 continue; 3474 } 3475 3476 /* PA of the PTP */ 3477 ptppa = pmap_pte2pa(pde); 3478 3479 /* get PTP if non-kernel mapping */ 3480 if (pmap == pmap_kernel()) { 3481 /* we never free kernel PTPs */ 3482 ptp = NULL; 3483 } else { 3484 ptp = pmap_find_ptp(pmap, va, ptppa, 1); 3485 #ifdef DIAGNOSTIC 3486 if (ptp == NULL) 3487 panic("pmap_remove: unmanaged PTP " 3488 "detected"); 3489 #endif 3490 } 3491 xpte |= pmap_remove_ptes(pmap, ptp, 3492 (vaddr_t)&ptes[pl1_i(va)], va, blkendva, &pv_tofree); 3493 3494 /* if PTP is no longer being used, free it! */ 3495 if (ptp && ptp->wire_count <= 1) { 3496 pmap_free_ptp(pmap, ptp, va, ptes, pdes); 3497 } 3498 if ((xpte & PG_U) != 0) 3499 pmap_tlb_shootdown(pmap, sva, eva, xpte); 3500 } 3501 pmap_unmap_ptes(pmap, pmap2); /* unlock pmap */ 3502 kpreempt_enable(); 3503 3504 /* Now we free unused PVs */ 3505 if (pv_tofree) 3506 pmap_free_pvs(pv_tofree); 3507 } 3508 3509 /* 3510 * pmap_sync_pv: clear pte bits and return the old value of the pte. 3511 * 3512 * => called with pp_lock held. (thus preemption disabled) 3513 * => issues tlb shootdowns if necessary. 3514 */ 3515 3516 static int 3517 pmap_sync_pv(struct pv_pte *pvpte, pt_entry_t expect, int clearbits, 3518 pt_entry_t *optep) 3519 { 3520 struct pmap *pmap; 3521 struct vm_page *ptp; 3522 vaddr_t va; 3523 pt_entry_t *ptep; 3524 pt_entry_t opte; 3525 pt_entry_t npte; 3526 bool need_shootdown; 3527 3528 ptp = pvpte->pte_ptp; 3529 va = pvpte->pte_va; 3530 KASSERT(ptp == NULL || ptp->uobject != NULL); 3531 KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset); 3532 pmap = ptp_to_pmap(ptp); 3533 3534 KASSERT((expect & ~(PG_FRAME | PG_V)) == 0); 3535 KASSERT((expect & PG_V) != 0); 3536 KASSERT(clearbits == ~0 || (clearbits & ~(PG_M | PG_U | PG_RW)) == 0); 3537 KASSERT(kpreempt_disabled()); 3538 3539 ptep = pmap_map_pte(pmap, ptp, va); 3540 do { 3541 opte = *ptep; 3542 KASSERT((opte & (PG_M | PG_U)) != PG_M); 3543 KASSERT((opte & (PG_U | PG_V)) != PG_U); 3544 KASSERT(opte == 0 || (opte & PG_V) != 0); 3545 if ((opte & (PG_FRAME | PG_V)) != expect) { 3546 3547 /* 3548 * we lost a race with a V->P operation like 3549 * pmap_remove(). wait for the competitor 3550 * reflecting pte bits into mp_attrs. 3551 * 3552 * issue a redundant TLB shootdown so that 3553 * we can wait for its completion. 3554 */ 3555 3556 pmap_unmap_pte(); 3557 if (clearbits != 0) { 3558 pmap_tlb_shootdown(pmap, va, 0, 3559 (pmap == pmap_kernel() ? PG_G : 0)); 3560 } 3561 return EAGAIN; 3562 } 3563 3564 /* 3565 * check if there's anything to do on this pte. 3566 */ 3567 3568 if ((opte & clearbits) == 0) { 3569 need_shootdown = false; 3570 break; 3571 } 3572 3573 /* 3574 * we need a shootdown if the pte is cached. (PG_U) 3575 * 3576 * ...unless we are clearing only the PG_RW bit and 3577 * it isn't cached as RW. (PG_M) 3578 */ 3579 3580 need_shootdown = (opte & PG_U) != 0 && 3581 !(clearbits == PG_RW && (opte & PG_M) == 0); 3582 3583 npte = opte & ~clearbits; 3584 3585 /* 3586 * if we need a shootdown anyway, clear PG_U and PG_M. 
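 * (the stale TLB entry is about to be shot down, so the MMU will
 * set PG_U/PG_M afresh on the next access; the old values are
 * still returned in opte for the caller to merge into pp_attrs.)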
3587 */ 3588 3589 if (need_shootdown) { 3590 npte &= ~(PG_U | PG_M); 3591 } 3592 KASSERT((npte & (PG_M | PG_U)) != PG_M); 3593 KASSERT((npte & (PG_U | PG_V)) != PG_U); 3594 KASSERT(npte == 0 || (opte & PG_V) != 0); 3595 } while (pmap_pte_cas(ptep, opte, npte) != opte); 3596 3597 if (need_shootdown) { 3598 pmap_tlb_shootdown(pmap, va, 0, opte); 3599 } 3600 pmap_unmap_pte(); 3601 3602 *optep = opte; 3603 return 0; 3604 } 3605 3606 /* 3607 * pmap_page_remove: remove a managed vm_page from all pmaps that map it 3608 * 3609 * => R/M bits are sync'd back to attrs 3610 */ 3611 3612 void 3613 pmap_page_remove(struct vm_page *pg) 3614 { 3615 struct pmap_page *pp; 3616 struct pv_pte *pvpte; 3617 struct pv_entry *killlist = NULL; 3618 struct vm_page *ptp; 3619 pt_entry_t expect; 3620 lwp_t *l; 3621 int count; 3622 3623 l = curlwp; 3624 pp = VM_PAGE_TO_PP(pg); 3625 expect = pmap_pa2pte(VM_PAGE_TO_PHYS(pg)) | PG_V; 3626 count = SPINLOCK_BACKOFF_MIN; 3627 kpreempt_disable(); 3628 startover: 3629 pp_lock(pp); 3630 while ((pvpte = pv_pte_first(pp)) != NULL) { 3631 struct pmap *pmap; 3632 struct pv_entry *pve; 3633 pt_entry_t opte; 3634 vaddr_t va; 3635 int error; 3636 3637 /* 3638 * add a reference to the pmap before clearing the pte. 3639 * otherwise the pmap can disappear behind us. 3640 */ 3641 3642 ptp = pvpte->pte_ptp; 3643 pmap = ptp_to_pmap(ptp); 3644 if (ptp != NULL) { 3645 pmap_reference(pmap); 3646 } 3647 3648 error = pmap_sync_pv(pvpte, expect, ~0, &opte); 3649 if (error == EAGAIN) { 3650 int hold_count; 3651 pp_unlock(pp); 3652 KERNEL_UNLOCK_ALL(curlwp, &hold_count); 3653 if (ptp != NULL) { 3654 pmap_destroy(pmap); 3655 } 3656 SPINLOCK_BACKOFF(count); 3657 KERNEL_LOCK(hold_count, curlwp); 3658 goto startover; 3659 } 3660 3661 pp->pp_attrs |= opte; 3662 va = pvpte->pte_va; 3663 pve = pmap_remove_pv(pp, ptp, va); 3664 pp_unlock(pp); 3665 3666 /* update the PTP reference count. free if last reference. */ 3667 if (ptp != NULL) { 3668 struct pmap *pmap2; 3669 pt_entry_t *ptes; 3670 pd_entry_t * const *pdes; 3671 3672 KASSERT(pmap != pmap_kernel()); 3673 3674 pmap_tlb_shootwait(); 3675 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); 3676 pmap_stats_update_bypte(pmap, 0, opte); 3677 ptp->wire_count--; 3678 if (ptp->wire_count <= 1) { 3679 pmap_free_ptp(pmap, ptp, va, ptes, pdes); 3680 } 3681 pmap_unmap_ptes(pmap, pmap2); 3682 pmap_destroy(pmap); 3683 } else { 3684 KASSERT(pmap == pmap_kernel()); 3685 pmap_stats_update_bypte(pmap, 0, opte); 3686 } 3687 3688 if (pve != NULL) { 3689 pve->pve_next = killlist; /* mark it for death */ 3690 killlist = pve; 3691 } 3692 pp_lock(pp); 3693 } 3694 pp_unlock(pp); 3695 kpreempt_enable(); 3696 3697 /* Now free unused pvs. */ 3698 pmap_free_pvs(killlist); 3699 } 3700 3701 /* 3702 * p m a p a t t r i b u t e f u n c t i o n s 3703 * functions that test/change managed page's attributes 3704 * since a page can be mapped multiple times we must check each PTE that 3705 * maps it by going down the pv lists. 
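 *
 * minimal usage sketch (illustrative only; the MI modified/referenced
 * helpers that normally wrap these live in pmap.h, not here):
 *
 *	if (pmap_test_attrs(pg, PG_M))
 *		(void)pmap_clear_attrs(pg, PG_M);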
3706 */ 3707 3708 /* 3709 * pmap_test_attrs: test a page's attributes 3710 */ 3711 3712 bool 3713 pmap_test_attrs(struct vm_page *pg, unsigned testbits) 3714 { 3715 struct pmap_page *pp; 3716 struct pv_pte *pvpte; 3717 pt_entry_t expect; 3718 u_int result; 3719 3720 pp = VM_PAGE_TO_PP(pg); 3721 if ((pp->pp_attrs & testbits) != 0) { 3722 return true; 3723 } 3724 expect = pmap_pa2pte(VM_PAGE_TO_PHYS(pg)) | PG_V; 3725 pp_lock(pp); 3726 for (pvpte = pv_pte_first(pp); pvpte; pvpte = pv_pte_next(pp, pvpte)) { 3727 pt_entry_t opte; 3728 int error; 3729 3730 if ((pp->pp_attrs & testbits) != 0) { 3731 break; 3732 } 3733 error = pmap_sync_pv(pvpte, expect, 0, &opte); 3734 if (error == 0) { 3735 pp->pp_attrs |= opte; 3736 } 3737 } 3738 result = pp->pp_attrs & testbits; 3739 pp_unlock(pp); 3740 3741 /* 3742 * note that we will exit the for loop with a non-null pve if 3743 * we have found the bits we are testing for. 3744 */ 3745 3746 return result != 0; 3747 } 3748 3749 /* 3750 * pmap_clear_attrs: clear the specified attribute for a page. 3751 * 3752 * => we return true if we cleared one of the bits we were asked to 3753 */ 3754 3755 bool 3756 pmap_clear_attrs(struct vm_page *pg, unsigned clearbits) 3757 { 3758 struct pmap_page *pp; 3759 struct pv_pte *pvpte; 3760 u_int result; 3761 pt_entry_t expect; 3762 int count; 3763 3764 pp = VM_PAGE_TO_PP(pg); 3765 expect = pmap_pa2pte(VM_PAGE_TO_PHYS(pg)) | PG_V; 3766 count = SPINLOCK_BACKOFF_MIN; 3767 kpreempt_disable(); 3768 startover: 3769 pp_lock(pp); 3770 for (pvpte = pv_pte_first(pp); pvpte; pvpte = pv_pte_next(pp, pvpte)) { 3771 pt_entry_t opte; 3772 int error; 3773 3774 error = pmap_sync_pv(pvpte, expect, clearbits, &opte); 3775 if (error == EAGAIN) { 3776 int hold_count; 3777 pp_unlock(pp); 3778 KERNEL_UNLOCK_ALL(curlwp, &hold_count); 3779 SPINLOCK_BACKOFF(count); 3780 KERNEL_LOCK(hold_count, curlwp); 3781 goto startover; 3782 } 3783 pp->pp_attrs |= opte; 3784 } 3785 result = pp->pp_attrs & clearbits; 3786 pp->pp_attrs &= ~clearbits; 3787 pp_unlock(pp); 3788 kpreempt_enable(); 3789 3790 return result != 0; 3791 } 3792 3793 3794 /* 3795 * p m a p p r o t e c t i o n f u n c t i o n s 3796 */ 3797 3798 /* 3799 * pmap_page_protect: change the protection of all recorded mappings 3800 * of a managed page 3801 * 3802 * => NOTE: this is an inline function in pmap.h 3803 */ 3804 3805 /* see pmap.h */ 3806 3807 /* 3808 * pmap_protect: set the protection in of the pages in a pmap 3809 * 3810 * => NOTE: this is an inline function in pmap.h 3811 */ 3812 3813 /* see pmap.h */ 3814 3815 /* 3816 * pmap_write_protect: write-protect pages in a pmap 3817 */ 3818 3819 void 3820 pmap_write_protect(struct pmap *pmap, vaddr_t sva, vaddr_t eva, vm_prot_t prot) 3821 { 3822 int i; 3823 pt_entry_t *ptes, *epte; 3824 pt_entry_t *spte; 3825 pd_entry_t * const *pdes; 3826 vaddr_t blockend, va; 3827 pt_entry_t opte; 3828 struct pmap *pmap2; 3829 3830 KASSERT(curlwp->l_md.md_gc_pmap != pmap); 3831 3832 kpreempt_disable(); 3833 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); /* locks pmap */ 3834 3835 /* should be ok, but just in case ... */ 3836 sva &= PG_FRAME; 3837 eva &= PG_FRAME; 3838 3839 for (va = sva ; va < eva ; va = blockend) { 3840 3841 blockend = (va & L2_FRAME) + NBPD_L2; 3842 if (blockend > eva) 3843 blockend = eva; 3844 3845 /* 3846 * XXXCDC: our PTE mappings should never be write-protected! 3847 * 3848 * long term solution is to move the PTEs out of user 3849 * address space. and into kernel address space (up 3850 * with APTE). 
then we can set VM_MAXUSER_ADDRESS to 3851 * be VM_MAX_ADDRESS. 3852 */ 3853 3854 /* XXXCDC: ugly hack to avoid freeing PDP here */ 3855 for (i = 0; i < PDP_SIZE; i++) { 3856 if (pl_i(va, PTP_LEVELS) == PDIR_SLOT_PTE+i) 3857 continue; 3858 } 3859 3860 /* empty block? */ 3861 if (!pmap_pdes_valid(va, pdes, NULL)) 3862 continue; 3863 3864 #ifdef DIAGNOSTIC 3865 if (va >= VM_MAXUSER_ADDRESS && 3866 va < VM_MAX_ADDRESS) 3867 panic("pmap_write_protect: PTE space"); 3868 #endif 3869 3870 spte = &ptes[pl1_i(va)]; 3871 epte = &ptes[pl1_i(blockend)]; 3872 3873 for (/*null */; spte < epte ; spte++) { 3874 pt_entry_t npte; 3875 3876 do { 3877 opte = *spte; 3878 if ((~opte & (PG_RW | PG_V)) != 0) { 3879 goto next; 3880 } 3881 npte = opte & ~PG_RW; 3882 } while (pmap_pte_cas(spte, opte, npte) != opte); 3883 if ((opte & PG_M) != 0) { 3884 vaddr_t tva; 3885 3886 tva = x86_ptob(spte - ptes); 3887 pmap_tlb_shootdown(pmap, tva, 0, opte); 3888 } 3889 next:; 3890 } 3891 } 3892 3893 pmap_unmap_ptes(pmap, pmap2); /* unlocks pmap */ 3894 kpreempt_enable(); 3895 } 3896 3897 /* 3898 * end of protection functions 3899 */ 3900 3901 /* 3902 * pmap_unwire: clear the wired bit in the PTE 3903 * 3904 * => mapping should already be in map 3905 */ 3906 3907 void 3908 pmap_unwire(struct pmap *pmap, vaddr_t va) 3909 { 3910 pt_entry_t *ptes; 3911 pd_entry_t * const *pdes; 3912 struct pmap *pmap2; 3913 3914 kpreempt_disable(); 3915 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); /* locks pmap */ 3916 3917 if (pmap_pdes_valid(va, pdes, NULL)) { 3918 pt_entry_t *ptep = &ptes[pl1_i(va)]; 3919 pt_entry_t opte = *ptep; 3920 3921 #ifdef DIAGNOSTIC 3922 if (!pmap_valid_entry(opte)) 3923 panic("pmap_unwire: invalid (unmapped) va 0x%lx", va); 3924 #endif 3925 if ((opte & PG_W) != 0) { 3926 pt_entry_t npte = opte & ~PG_W; 3927 3928 opte = pmap_pte_testset(ptep, npte); 3929 pmap_stats_update_bypte(pmap, npte, opte); 3930 } 3931 #ifdef DIAGNOSTIC 3932 else { 3933 printf("pmap_unwire: wiring for pmap %p va 0x%lx " 3934 "didn't change!\n", pmap, va); 3935 } 3936 #endif 3937 pmap_unmap_ptes(pmap, pmap2); /* unlocks map */ 3938 } 3939 #ifdef DIAGNOSTIC 3940 else { 3941 panic("pmap_unwire: invalid PDE"); 3942 } 3943 #endif 3944 kpreempt_enable(); 3945 } 3946 3947 /* 3948 * pmap_copy: copy mappings from one pmap to another 3949 * 3950 * => optional function 3951 * void pmap_copy(dst_pmap, src_pmap, dst_addr, len, src_addr) 3952 */ 3953 3954 /* 3955 * defined as macro in pmap.h 3956 */ 3957 3958 __weak_alias(pmap_enter, pmap_enter_default); 3959 3960 int 3961 pmap_enter_default(pmap_t pmap, vaddr_t va, paddr_t pa, vm_prot_t prot, 3962 u_int flags) 3963 { 3964 return pmap_enter_ma(pmap, va, pa, pa, prot, flags, 0); 3965 } 3966 3967 /* 3968 * pmap_enter: enter a mapping into a pmap 3969 * 3970 * => must be done "now" ... no lazy-evaluation 3971 * => we set pmap => pv_head locking 3972 */ 3973 int 3974 pmap_enter_ma(struct pmap *pmap, vaddr_t va, paddr_t ma, paddr_t pa, 3975 vm_prot_t prot, u_int flags, int domid) 3976 { 3977 pt_entry_t *ptes, opte, npte; 3978 pt_entry_t *ptep; 3979 pd_entry_t * const *pdes; 3980 struct vm_page *ptp, *pg; 3981 struct pmap_page *new_pp; 3982 struct pmap_page *old_pp; 3983 struct pv_entry *old_pve = NULL; 3984 struct pv_entry *new_pve; 3985 struct pv_entry *new_pve2; 3986 int error; 3987 bool wired = (flags & PMAP_WIRED) != 0; 3988 struct pmap *pmap2; 3989 3990 KASSERT(pmap_initialized); 3991 KASSERT(curlwp->l_md.md_gc_pmap != pmap); 3992 3993 #ifdef DIAGNOSTIC 3994 /* sanity check: totally out of range? 
*/ 3995 if (va >= VM_MAX_KERNEL_ADDRESS) 3996 panic("pmap_enter: too big"); 3997 3998 if (va == (vaddr_t) PDP_BASE || va == (vaddr_t) APDP_BASE) 3999 panic("pmap_enter: trying to map over PDP/APDP!"); 4000 4001 /* sanity check: kernel PTPs should already have been pre-allocated */ 4002 if (va >= VM_MIN_KERNEL_ADDRESS && 4003 !pmap_valid_entry(pmap->pm_pdir[pl_i(va, PTP_LEVELS)])) 4004 panic("pmap_enter: missing kernel PTP for va %lx!", va); 4005 #endif /* DIAGNOSTIC */ 4006 #ifdef XEN 4007 KASSERT(domid == DOMID_SELF || pa == 0); 4008 #endif /* XEN */ 4009 4010 npte = ma | protection_codes[prot] | PG_V; 4011 npte |= pmap_pat_flags(flags); 4012 if (wired) 4013 npte |= PG_W; 4014 if (va < VM_MAXUSER_ADDRESS) 4015 npte |= PG_u; 4016 else if (va < VM_MAX_ADDRESS) 4017 npte |= (PG_u | PG_RW); /* XXXCDC: no longer needed? */ 4018 else 4019 npte |= PG_k; 4020 if (pmap == pmap_kernel()) 4021 npte |= pmap_pg_g; 4022 if (flags & VM_PROT_ALL) { 4023 npte |= PG_U; 4024 if (flags & VM_PROT_WRITE) { 4025 KASSERT((npte & PG_RW) != 0); 4026 npte |= PG_M; 4027 } 4028 } 4029 4030 #ifdef XEN 4031 if (domid != DOMID_SELF) 4032 pg = NULL; 4033 else 4034 #endif 4035 pg = PHYS_TO_VM_PAGE(pa); 4036 if (pg != NULL) { 4037 /* This is a managed page */ 4038 npte |= PG_PVLIST; 4039 new_pp = VM_PAGE_TO_PP(pg); 4040 } else { 4041 new_pp = NULL; 4042 } 4043 4044 /* get pves. */ 4045 new_pve = pool_cache_get(&pmap_pv_cache, PR_NOWAIT); 4046 new_pve2 = pool_cache_get(&pmap_pv_cache, PR_NOWAIT); 4047 if (new_pve == NULL || new_pve2 == NULL) { 4048 if (flags & PMAP_CANFAIL) { 4049 error = ENOMEM; 4050 goto out2; 4051 } 4052 panic("pmap_enter: pve allocation failed"); 4053 } 4054 4055 kpreempt_disable(); 4056 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); /* locks pmap */ 4057 if (pmap == pmap_kernel()) { 4058 ptp = NULL; 4059 } else { 4060 ptp = pmap_get_ptp(pmap, va, pdes); 4061 if (ptp == NULL) { 4062 pmap_unmap_ptes(pmap, pmap2); 4063 if (flags & PMAP_CANFAIL) { 4064 error = ENOMEM; 4065 goto out; 4066 } 4067 panic("pmap_enter: get ptp failed"); 4068 } 4069 } 4070 4071 /* 4072 * update the pte. 4073 */ 4074 4075 ptep = &ptes[pl1_i(va)]; 4076 do { 4077 opte = *ptep; 4078 4079 /* 4080 * if the same page, inherit PG_U and PG_M. 4081 */ 4082 if (((opte ^ npte) & (PG_FRAME | PG_V)) == 0) { 4083 npte |= opte & (PG_U | PG_M); 4084 } 4085 #if defined(XEN) 4086 if (domid != DOMID_SELF) { 4087 /* pmap_pte_cas with error handling */ 4088 int s = splvm(); 4089 if (opte != *ptep) { 4090 splx(s); 4091 continue; 4092 } 4093 error = xpq_update_foreign( 4094 vtomach((vaddr_t)ptep), npte, domid); 4095 splx(s); 4096 if (error) { 4097 if (ptp != NULL && ptp->wire_count <= 1) { 4098 pmap_free_ptp(pmap, ptp, va, ptes, pdes); 4099 } 4100 pmap_unmap_ptes(pmap, pmap2); 4101 goto out; 4102 } 4103 break; 4104 } 4105 #endif /* defined(XEN) */ 4106 } while (pmap_pte_cas(ptep, opte, npte) != opte); 4107 4108 /* 4109 * update statistics and PTP's reference count. 4110 */ 4111 4112 pmap_stats_update_bypte(pmap, npte, opte); 4113 if (ptp != NULL && !pmap_valid_entry(opte)) { 4114 ptp->wire_count++; 4115 } 4116 KASSERT(ptp == NULL || ptp->wire_count > 1); 4117 4118 /* 4119 * if the same page, we can skip pv_entry handling. 4120 */ 4121 4122 if (((opte ^ npte) & (PG_FRAME | PG_V)) == 0) { 4123 KASSERT(((opte ^ npte) & PG_PVLIST) == 0); 4124 goto same_pa; 4125 } 4126 4127 /* 4128 * if old page is managed, remove pv_entry from its list. 
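 * (the pv_entry removed here is not freed immediately: it is kept
 * in old_pve and returned to pmap_pv_cache at the end of this
 * function, after the page tables have been unmapped.)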
4129 */ 4130 4131 if ((~opte & (PG_V | PG_PVLIST)) == 0) { 4132 pg = PHYS_TO_VM_PAGE(pmap_pte2pa(opte)); 4133 #ifdef DIAGNOSTIC 4134 if (pg == NULL) 4135 panic("pmap_enter: PG_PVLIST mapping with " 4136 "unmanaged page " 4137 "pa = 0x%" PRIx64 " (0x%" PRIx64 ")", 4138 (int64_t)pa, (int64_t)atop(pa)); 4139 #endif 4140 old_pp = VM_PAGE_TO_PP(pg); 4141 4142 pp_lock(old_pp); 4143 old_pve = pmap_remove_pv(old_pp, ptp, va); 4144 old_pp->pp_attrs |= opte; 4145 pp_unlock(old_pp); 4146 } 4147 4148 /* 4149 * if new page is managed, insert pv_entry into its list. 4150 */ 4151 4152 if (new_pp) { 4153 pp_lock(new_pp); 4154 new_pve = pmap_enter_pv(new_pp, new_pve, &new_pve2, ptp, va); 4155 pp_unlock(new_pp); 4156 } 4157 4158 same_pa: 4159 pmap_unmap_ptes(pmap, pmap2); 4160 4161 /* 4162 * shootdown tlb if necessary. 4163 */ 4164 4165 if ((~opte & (PG_V | PG_U)) == 0 && 4166 ((opte ^ npte) & (PG_FRAME | PG_RW)) != 0) { 4167 pmap_tlb_shootdown(pmap, va, 0, opte); 4168 } 4169 4170 error = 0; 4171 out: 4172 kpreempt_enable(); 4173 out2: 4174 if (old_pve != NULL) { 4175 pool_cache_put(&pmap_pv_cache, old_pve); 4176 } 4177 if (new_pve != NULL) { 4178 pool_cache_put(&pmap_pv_cache, new_pve); 4179 } 4180 if (new_pve2 != NULL) { 4181 pool_cache_put(&pmap_pv_cache, new_pve2); 4182 } 4183 4184 return error; 4185 } 4186 4187 static bool 4188 pmap_get_physpage(vaddr_t va, int level, paddr_t *paddrp) 4189 { 4190 struct vm_page *ptp; 4191 struct pmap *kpm = pmap_kernel(); 4192 4193 if (uvm.page_init_done == false) { 4194 /* 4195 * we're growing the kernel pmap early (from 4196 * uvm_pageboot_alloc()). this case must be 4197 * handled a little differently. 4198 */ 4199 4200 if (uvm_page_physget(paddrp) == false) 4201 panic("pmap_get_physpage: out of memory"); 4202 kpreempt_disable(); 4203 pmap_pte_set(early_zero_pte, 4204 pmap_pa2pte(*paddrp) | PG_V | PG_RW | PG_k); 4205 pmap_pte_flush(); 4206 pmap_update_pg((vaddr_t)early_zerop); 4207 memset(early_zerop, 0, PAGE_SIZE); 4208 #if defined(DIAGNOSTIC) || defined (XEN) 4209 pmap_pte_set(early_zero_pte, 0); 4210 pmap_pte_flush(); 4211 #endif /* defined(DIAGNOSTIC) */ 4212 kpreempt_enable(); 4213 } else { 4214 /* XXX */ 4215 ptp = uvm_pagealloc(NULL, 0, NULL, 4216 UVM_PGA_USERESERVE|UVM_PGA_ZERO); 4217 if (ptp == NULL) 4218 panic("pmap_get_physpage: out of memory"); 4219 ptp->flags &= ~PG_BUSY; 4220 ptp->wire_count = 1; 4221 *paddrp = VM_PAGE_TO_PHYS(ptp); 4222 } 4223 pmap_stats_update(kpm, 1, 0); 4224 return true; 4225 } 4226 4227 /* 4228 * Allocate the amount of specified ptps for a ptp level, and populate 4229 * all levels below accordingly, mapping virtual addresses starting at 4230 * kva. 4231 * 4232 * Used by pmap_growkernel. 4233 */ 4234 static void 4235 pmap_alloc_level(pd_entry_t * const *pdes, vaddr_t kva, int lvl, 4236 long *needed_ptps) 4237 { 4238 unsigned long i; 4239 vaddr_t va; 4240 paddr_t pa; 4241 unsigned long index, endindex; 4242 int level; 4243 pd_entry_t *pdep; 4244 #ifdef XEN 4245 int s = splvm(); /* protect xpq_* */ 4246 #endif 4247 4248 for (level = lvl; level > 1; level--) { 4249 if (level == PTP_LEVELS) 4250 pdep = pmap_kernel()->pm_pdir; 4251 else 4252 pdep = pdes[level - 2]; 4253 va = kva; 4254 index = pl_i_roundup(kva, level); 4255 endindex = index + needed_ptps[level - 1] - 1; 4256 4257 4258 for (i = index; i <= endindex; i++) { 4259 KASSERT(!pmap_valid_entry(pdep[i])); 4260 pmap_get_physpage(va, level - 1, &pa); 4261 #ifdef XEN 4262 xpq_queue_pte_update((level == PTP_LEVELS) ? 

static bool
pmap_get_physpage(vaddr_t va, int level, paddr_t *paddrp)
{
	struct vm_page *ptp;
	struct pmap *kpm = pmap_kernel();

	if (uvm.page_init_done == false) {
		/*
		 * we're growing the kernel pmap early (from
		 * uvm_pageboot_alloc()).  this case must be
		 * handled a little differently.
		 */

		if (uvm_page_physget(paddrp) == false)
			panic("pmap_get_physpage: out of memory");
		kpreempt_disable();
		pmap_pte_set(early_zero_pte,
		    pmap_pa2pte(*paddrp) | PG_V | PG_RW | PG_k);
		pmap_pte_flush();
		pmap_update_pg((vaddr_t)early_zerop);
		memset(early_zerop, 0, PAGE_SIZE);
#if defined(DIAGNOSTIC) || defined(XEN)
		pmap_pte_set(early_zero_pte, 0);
		pmap_pte_flush();
#endif /* defined(DIAGNOSTIC) || defined(XEN) */
		kpreempt_enable();
	} else {
		/* XXX */
		ptp = uvm_pagealloc(NULL, 0, NULL,
		    UVM_PGA_USERESERVE|UVM_PGA_ZERO);
		if (ptp == NULL)
			panic("pmap_get_physpage: out of memory");
		ptp->flags &= ~PG_BUSY;
		ptp->wire_count = 1;
		*paddrp = VM_PAGE_TO_PHYS(ptp);
	}
	pmap_stats_update(kpm, 1, 0);
	return true;
}

/*
 * Allocate the specified number of PTPs for a PTP level, and populate
 * all levels below accordingly, mapping virtual addresses starting at
 * kva.
 *
 * Used by pmap_growkernel.
 */
static void
pmap_alloc_level(pd_entry_t * const *pdes, vaddr_t kva, int lvl,
    long *needed_ptps)
{
	unsigned long i;
	vaddr_t va;
	paddr_t pa;
	unsigned long index, endindex;
	int level;
	pd_entry_t *pdep;
#ifdef XEN
	int s = splvm();	/* protect xpq_* */
#endif

	for (level = lvl; level > 1; level--) {
		if (level == PTP_LEVELS)
			pdep = pmap_kernel()->pm_pdir;
		else
			pdep = pdes[level - 2];
		va = kva;
		index = pl_i_roundup(kva, level);
		endindex = index + needed_ptps[level - 1] - 1;

		for (i = index; i <= endindex; i++) {
			KASSERT(!pmap_valid_entry(pdep[i]));
			pmap_get_physpage(va, level - 1, &pa);
#ifdef XEN
			xpq_queue_pte_update((level == PTP_LEVELS) ?
			    xpmap_ptom(pmap_pdirpa(pmap_kernel(), i)) :
			    xpmap_ptetomach(&pdep[i]),
			    pmap_pa2pte(pa) | PG_k | PG_V | PG_RW);
#ifdef PAE
			if (level == PTP_LEVELS && i > L2_SLOT_KERN) {
				/* update real kernel PD too */
				xpq_queue_pte_update(
				    xpmap_ptetomach(&pmap_kl2pd[l2tol2(i)]),
				    pmap_pa2pte(pa) | PG_k | PG_V | PG_RW);
			}
#endif
#else /* XEN */
			pdep[i] = pmap_pa2pte(pa) | PG_k | PG_V | PG_RW;
#endif /* XEN */
			KASSERT(level != PTP_LEVELS || nkptp[level - 1] +
			    pl_i(VM_MIN_KERNEL_ADDRESS, level) == i);
			nkptp[level - 1]++;
			va += nbpd[level - 1];
		}
		pmap_pte_flush();
	}
#ifdef XEN
	splx(s);
#endif
}

/*
 * pmap_growkernel: increase usage of KVM space
 *
 * => we allocate new PTPs for the kernel and install them in all
 *    the pmaps on the system.
 */

vaddr_t
pmap_growkernel(vaddr_t maxkvaddr)
{
	struct pmap *kpm = pmap_kernel();
#if !defined(XEN) || !defined(__x86_64__)
	struct pmap *pm;
#endif
	int s, i;
	long needed_kptp[PTP_LEVELS], target_nptp, old;
	bool invalidate = false;

	s = splvm();	/* to be safe */
	mutex_enter(&kpm->pm_lock);

	if (maxkvaddr <= pmap_maxkvaddr) {
		mutex_exit(&kpm->pm_lock);
		splx(s);
		return pmap_maxkvaddr;
	}

	maxkvaddr = x86_round_pdr(maxkvaddr);
	old = nkptp[PTP_LEVELS - 1];
	/*
	 * This loop could be optimized more, but pmap_growkernel()
	 * is called infrequently.
	 */
	for (i = PTP_LEVELS - 1; i >= 1; i--) {
		target_nptp = pl_i_roundup(maxkvaddr, i + 1) -
		    pl_i_roundup(VM_MIN_KERNEL_ADDRESS, i + 1);
		/*
		 * XXX only need to check toplevel.
		 */
		if (target_nptp > nkptpmax[i])
			panic("out of KVA space");
		KASSERT(target_nptp >= nkptp[i]);
		needed_kptp[i] = target_nptp - nkptp[i];
	}

	pmap_alloc_level(normal_pdes, pmap_maxkvaddr, PTP_LEVELS, needed_kptp);

	/*
	 * If the number of top level entries changed, update all
	 * pmaps.
	 */
	if (needed_kptp[PTP_LEVELS - 1] != 0) {
#ifdef XEN
#ifdef __x86_64__
		/* nothing, kernel entries are never entered in user pmap */
#else /* __x86_64__ */
		mutex_enter(&pmaps_lock);
		LIST_FOREACH(pm, &pmaps, pm_list) {
			int pdkidx;
			for (pdkidx = PDIR_SLOT_KERN + old;
			    pdkidx < PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1];
			    pdkidx++) {
				xpq_queue_pte_update(
				    xpmap_ptom(pmap_pdirpa(pm, pdkidx)),
				    kpm->pm_pdir[pdkidx]);
			}
			xpq_flush_queue();
		}
		mutex_exit(&pmaps_lock);
#endif /* __x86_64__ */
#else /* XEN */
		unsigned newpdes;
		newpdes = nkptp[PTP_LEVELS - 1] - old;
		mutex_enter(&pmaps_lock);
		LIST_FOREACH(pm, &pmaps, pm_list) {
			memcpy(&pm->pm_pdir[PDIR_SLOT_KERN + old],
			    &kpm->pm_pdir[PDIR_SLOT_KERN + old],
			    newpdes * sizeof (pd_entry_t));
		}
		mutex_exit(&pmaps_lock);
#endif
		invalidate = true;
	}
	pmap_maxkvaddr = maxkvaddr;
	mutex_exit(&kpm->pm_lock);
	splx(s);

	if (invalidate) {
		/* Invalidate the PDP cache. */
		pool_cache_invalidate(&pmap_pdp_cache);
	}

	return maxkvaddr;
}
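
/*
 * Worked example (illustrative; assumes the usual amd64 page-table
 * geometry where an L2 entry maps 2 MB, an L3 entry 1 GB and an L4
 * entry 512 GB): growing the kernel map by 1 GB from an aligned
 * pmap_maxkvaddr needs about 1 GB / 2 MB = 512 new L2 entries, one
 * new L3 entry and normally no new L4 entry.  Those per-level counts
 * are what needed_kptp[] holds above; pmap_alloc_level() then walks
 * the levels top-down, allocating a zeroed page for each missing
 * entry and installing it in the level above.
 */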

#ifdef DEBUG
void pmap_dump(struct pmap *, vaddr_t, vaddr_t);

/*
 * pmap_dump: dump all the mappings from a pmap
 *
 * => caller should not be holding any pmap locks
 */

void
pmap_dump(struct pmap *pmap, vaddr_t sva, vaddr_t eva)
{
	pt_entry_t *ptes, *pte;
	pd_entry_t * const *pdes;
	struct pmap *pmap2;
	vaddr_t blkendva;

	/*
	 * if end is out of range truncate.
	 * if (end <= start) update to max.
	 */

	if (eva > VM_MAXUSER_ADDRESS || eva <= sva)
		eva = VM_MAXUSER_ADDRESS;

	/*
	 * we lock in the pmap => pv_head direction
	 */

	kpreempt_disable();
	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);	/* locks pmap */

	/*
	 * dumping a range of pages: we dump in PTP sized blocks
	 * (2 MB or 4 MB, depending on the MMU mode)
	 */

	for (/* null */ ; sva < eva ; sva = blkendva) {

		/* determine range of block */
		blkendva = x86_round_pdr(sva+1);
		if (blkendva > eva)
			blkendva = eva;

		/* valid block? */
		if (!pmap_pdes_valid(sva, pdes, NULL))
			continue;

		pte = &ptes[pl1_i(sva)];
		for (/* null */; sva < blkendva ; sva += PAGE_SIZE, pte++) {
			if (!pmap_valid_entry(*pte))
				continue;
			printf("va %#" PRIxVADDR " -> pa %#" PRIxPADDR
			    " (pte=%#" PRIxPADDR ")\n",
			    sva, (paddr_t)pmap_pte2pa(*pte), (paddr_t)*pte);
		}
	}
	pmap_unmap_ptes(pmap, pmap2);
	kpreempt_enable();
}
#endif
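
/*
 * Illustrative sketch (not compiled): pmap_dump() is a DEBUG-only aid,
 * typically invoked by hand from a debugger hook or ad-hoc test code.
 * "p" below stands for a hypothetical struct proc of interest.
 */
#if 0
	/* print every valid mapping in the first 16 MB of p's address space */
	pmap_dump(vm_map_pmap(&p->p_vmspace->vm_map), 0, 16 * 1024 * 1024);
#endif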
4517 */ 4518 s = splvm(); 4519 mb = &pmap_mbox; 4520 count = SPINLOCK_BACKOFF_MIN; 4521 do { 4522 if ((head = mb->mb_head) != mb->mb_tail) { 4523 splx(s); 4524 while ((head = mb->mb_head) != 4525 mb->mb_tail) 4526 SPINLOCK_BACKOFF(count); 4527 s = splvm(); 4528 } 4529 } while (atomic_cas_ulong( 4530 (volatile u_long *)&mb->mb_head, 4531 head, head + ncpu - 1) != head); 4532 4533 /* 4534 * Once underway we must stay at IPL_VM until the 4535 * IPI is dispatched. Otherwise interrupt handlers 4536 * on this CPU can deadlock against us. 4537 */ 4538 pmap_tlb_evcnt.ev_count++; 4539 mb->mb_pointer = self; 4540 mb->mb_addr1 = sva; 4541 mb->mb_addr2 = eva; 4542 mb->mb_global = pte; 4543 x86_ipi(LAPIC_TLB_BCAST_VECTOR, LAPIC_DEST_ALLEXCL, 4544 LAPIC_DLMODE_FIXED); 4545 self->ci_need_tlbwait = 1; 4546 splx(s); 4547 } else if ((pm->pm_cpus & ~self->ci_cpumask) != 0 || 4548 (kernel && (pm->pm_kernel_cpus & ~self->ci_cpumask) != 0)) { 4549 /* 4550 * We don't bother traversing the CPU list if only 4551 * used by this CPU. 4552 * 4553 * We can't do global flushes with the multicast 4554 * mechanism. 4555 */ 4556 KASSERT(pte == 0); 4557 4558 /* 4559 * Take ownership of the shootdown mailbox on each 4560 * CPU, fill the details and fire it off. 4561 */ 4562 s = splvm(); 4563 for (CPU_INFO_FOREACH(cii, ci)) { 4564 if (ci == self || 4565 !pmap_is_active(pm, ci, kernel) || 4566 !(ci->ci_flags & CPUF_RUNNING)) 4567 continue; 4568 selfmb->mb_head++; 4569 mb = &ci->ci_pmap_cpu->pc_mbox; 4570 count = SPINLOCK_BACKOFF_MIN; 4571 while (atomic_cas_ulong( 4572 (u_long *)&mb->mb_pointer, 4573 0, (u_long)&selfmb->mb_tail) != 0) { 4574 splx(s); 4575 while (mb->mb_pointer != 0) 4576 SPINLOCK_BACKOFF(count); 4577 s = splvm(); 4578 } 4579 mb->mb_addr1 = sva; 4580 mb->mb_addr2 = eva; 4581 mb->mb_global = pte; 4582 if (x86_ipi(LAPIC_TLB_MCAST_VECTOR, 4583 ci->ci_cpuid, LAPIC_DLMODE_FIXED)) 4584 panic("pmap_tlb_shootdown: ipi failed"); 4585 } 4586 self->ci_need_tlbwait = 1; 4587 splx(s); 4588 } 4589 } 4590 #endif /* MULTIPROCESSOR */ 4591 4592 /* Update the current CPU before waiting for others. */ 4593 if (!pmap_is_active(pm, self, kernel)) 4594 return; 4595 4596 if (sva == (vaddr_t)-1LL) { 4597 u_int gen = uvm_emap_gen_return(); 4598 if (pte != 0) { 4599 tlbflushg(); 4600 } else { 4601 tlbflush(); 4602 } 4603 uvm_emap_update(gen); 4604 } else { 4605 do { 4606 pmap_update_pg(sva); 4607 sva += PAGE_SIZE; 4608 } while (sva < eva); 4609 } 4610 } 4611 4612 /* 4613 * pmap_tlb_shootwait: wait for pending TLB shootdowns to complete 4614 * 4615 * => only waits for operations generated by the current CPU 4616 * => must be called with preemption disabled 4617 */ 4618 4619 void 4620 pmap_tlb_shootwait(void) 4621 { 4622 struct cpu_info *self; 4623 struct pmap_mbox *mb; 4624 4625 KASSERT(kpreempt_disabled()); 4626 4627 /* 4628 * Anything to do? XXX Really we want to avoid touching the cache 4629 * lines of the two mailboxes, but the processor may read ahead. 4630 */ 4631 self = curcpu(); 4632 if (!self->ci_need_tlbwait) 4633 return; 4634 self->ci_need_tlbwait = 0; 4635 4636 /* If we own the global mailbox, wait for it to drain. */ 4637 mb = &pmap_mbox; 4638 while (mb->mb_pointer == self && mb->mb_head != mb->mb_tail) 4639 x86_pause(); 4640 4641 /* If we own other CPU's mailboxes, wait for them to drain. 

/*
 * pmap_tlb_shootwait: wait for pending TLB shootdowns to complete
 *
 * => only waits for operations generated by the current CPU
 * => must be called with preemption disabled
 */

void
pmap_tlb_shootwait(void)
{
	struct cpu_info *self;
	struct pmap_mbox *mb;

	KASSERT(kpreempt_disabled());

	/*
	 * Anything to do?  XXX Really we want to avoid touching the cache
	 * lines of the two mailboxes, but the processor may read ahead.
	 */
	self = curcpu();
	if (!self->ci_need_tlbwait)
		return;
	self->ci_need_tlbwait = 0;

	/* If we own the global mailbox, wait for it to drain. */
	mb = &pmap_mbox;
	while (mb->mb_pointer == self && mb->mb_head != mb->mb_tail)
		x86_pause();

	/* If we own other CPUs' mailboxes, wait for them to drain. */
	mb = &self->ci_pmap_cpu->pc_mbox;
	KASSERT(mb->mb_pointer != &mb->mb_tail);
	while (mb->mb_head != mb->mb_tail)
		x86_pause();
}

/*
 * pmap_update: process deferred invalidations
 */

void
pmap_update(struct pmap *pmap)
{
	struct vm_page *ptp, *empty_ptps;
	struct pmap_page *pp;
	lwp_t *l;

	/*
	 * if we have torn down this pmap, invalidate non-global TLB
	 * entries on any processors using it.
	 */
	l = curlwp;
	if (__predict_false(l->l_md.md_gc_pmap == pmap)) {
		l->l_md.md_gc_pmap = NULL;
		KPREEMPT_DISABLE(l);
		pmap_tlb_shootdown(pmap, -1, -1, 0);
		KPREEMPT_ENABLE(l);
	}

	/*
	 * wait for tlb shootdowns to complete before returning control
	 * to the caller.
	 */
	kpreempt_disable();
	pmap_tlb_shootwait();
	kpreempt_enable();

	/*
	 * now that shootdowns are complete, process deferred frees,
	 * but not from interrupt context.
	 */
	if (l->l_md.md_gc_ptp != NULL) {
		KASSERT((l->l_pflag & LP_INTR) == 0);
		if (cpu_intr_p()) {
			return;
		}

		empty_ptps = l->l_md.md_gc_ptp;
		l->l_md.md_gc_ptp = NULL;

		while ((ptp = empty_ptps) != NULL) {
			ptp->flags |= PG_ZERO;
			pp = VM_PAGE_TO_PP(ptp);
			empty_ptps = pp->pp_link;
			LIST_INIT(&pp->pp_head.pvh_list);
			uvm_pagefree(ptp);
		}
	}
}
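
/*
 * Illustrative sketch (not compiled): MI callers batch a series of pmap
 * operations and call pmap_update() when they need the changes visible,
 * which is when the deferred shootdowns and PTP frees above are
 * processed.  pmap_init_tmp_pgtbl() below pairs pmap_kenter_pa() with
 * pmap_update(pmap_kernel()) in the same way.  The loop bounds below
 * are hypothetical.
 */
#if 0
	for (va = sva; va < eva; va += PAGE_SIZE, pa += PAGE_SIZE)
		pmap_kenter_pa(va, pa, VM_PROT_READ | VM_PROT_WRITE, 0);
	pmap_update(pmap_kernel());	/* process deferred invalidations */
#endif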
4753 * 508: maps 0->1GB (userland) 4754 * 509: unused 4755 * 510: unused 4756 * 511: maps 3->4GB (kernel) 4757 */ 4758 tmp_pml[508] = x86_tmp_pml_paddr[PTP_LEVELS - 1] | PG_V; 4759 tmp_pml[509] = 0; 4760 tmp_pml[510] = 0; 4761 tmp_pml[511] = pmap_pdirpa(pmap_kernel(),PDIR_SLOT_KERN) | PG_V; 4762 #endif 4763 4764 for (level = PTP_LEVELS - 1; level > 0; --level) { 4765 tmp_pml = (void *)x86_tmp_pml_vaddr[level]; 4766 4767 tmp_pml[pl_i(pg, level + 1)] = 4768 (x86_tmp_pml_paddr[level - 1] & PG_FRAME) | PG_RW | PG_V; 4769 } 4770 4771 tmp_pml = (void *)x86_tmp_pml_vaddr[0]; 4772 tmp_pml[pl_i(pg, 1)] = (pg & PG_FRAME) | PG_RW | PG_V; 4773 4774 #ifdef PAE 4775 /* Return the PA of the L3 page (entry 508 of the L2 page) */ 4776 return x86_tmp_pml_paddr[PTP_LEVELS - 1] + 508 * sizeof(pd_entry_t); 4777 #endif 4778 4779 return x86_tmp_pml_paddr[PTP_LEVELS - 1]; 4780 } 4781 4782 u_int 4783 x86_mmap_flags(paddr_t mdpgno) 4784 { 4785 u_int nflag = (mdpgno >> X86_MMAP_FLAG_SHIFT) & X86_MMAP_FLAG_MASK; 4786 u_int pflag = 0; 4787 4788 if (nflag & X86_MMAP_FLAG_PREFETCH) 4789 pflag |= PMAP_WRITE_COMBINE; 4790 4791 return pflag; 4792 } 4793