/*	$NetBSD: pmap.c,v 1.105 2010/02/26 19:25:07 jym Exp $	*/

/*
 * Copyright (c) 2007 Manuel Bouyer.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */

/*
 * Copyright (c) 2006 Mathieu Ropert <mro@adviseo.fr>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */

/*
 *
 * Copyright (c) 1997 Charles D. Cranor and Washington University.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *      This product includes software developed by Charles D. Cranor and
 *      Washington University.
 * 4. The name of the author may not be used to endorse or promote products
 *    derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
/*
 * Copyright 2001 (c) Wasabi Systems, Inc.
 * All rights reserved.
 *
 * Written by Frank van der Linden for Wasabi Systems, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *      This product includes software developed for the NetBSD Project by
 *      Wasabi Systems, Inc.
 * 4. The name of Wasabi Systems, Inc. may not be used to endorse
 *    or promote products derived from this software without specific prior
 *    written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL WASABI SYSTEMS, INC
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * This is the i386 pmap modified and generalized to support x86-64
 * as well. The idea is to hide the upper N levels of the page tables
 * inside pmap_get_ptp, pmap_free_ptp and pmap_growkernel. The rest
 * is mostly untouched, except that it uses some more generalized
 * macros and interfaces.
 *
 * This pmap has been tested on the i386 as well, and it can be easily
 * adapted to PAE.
 *
 * fvdl@wasabisystems.com 18-Jun-2001
 */

/*
 * pmap.c: i386 pmap module rewrite
 *	Chuck Cranor <chuck@ccrc.wustl.edu>
 *	11-Aug-97
 *
 * history of this pmap module: in addition to my own input, i used
 *    the following references for this rewrite of the i386 pmap:
 *
 * [1]	the NetBSD i386 pmap.   this pmap appears to be based on the
 *	BSD hp300 pmap done by Mike Hibler at University of Utah.
 *	it was then ported to the i386 by William Jolitz of UUNET
 *	Technologies, Inc.   Then Charles M. Hannum of the NetBSD
 *	project fixed some bugs and provided some speed ups.
 *
 * [2]	the FreeBSD i386 pmap.   this pmap seems to be the
 *	Hibler/Jolitz pmap, as modified for FreeBSD by John S. Dyson
 *	and David Greenman.
 *
 * [3]	the Mach pmap.   this pmap, from CMU, seems to have migrated
 *	between several processors.   the VAX version was done by
 *	Avadis Tevanian, Jr., and Michael Wayne Young.    the i386
 *	version was done by Lance Berc, Mike Kupfer, Bob Baron,
 *	David Golub, and Richard Draves.    the alpha version was
 *	done by Alessandro Forin (CMU/Mach) and Chris Demetriou
 *	(NetBSD/alpha).
 */
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: pmap.c,v 1.105 2010/02/26 19:25:07 jym Exp $");

#include "opt_user_ldt.h"
#include "opt_lockdebug.h"
#include "opt_multiprocessor.h"
#include "opt_xen.h"
#if !defined(__x86_64__)
#include "opt_kstack_dr0.h"
#endif /* !defined(__x86_64__) */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/pool.h>
#include <sys/kernel.h>
#include <sys/atomic.h>
#include <sys/cpu.h>
#include <sys/intr.h>
#include <sys/xcall.h>

#include <uvm/uvm.h>

#include <dev/isa/isareg.h>

#include <machine/specialreg.h>
#include <machine/gdt.h>
#include <machine/isa_machdep.h>
#include <machine/cpuvar.h>

#include <x86/pmap.h>
#include <x86/pmap_pv.h>

#include <x86/i82489reg.h>
#include <x86/i82489var.h>

#ifdef XEN
#include <xen/xen3-public/xen.h>
#include <xen/hypervisor.h>
#endif

/* flag to be used for kernel mappings: PG_u on Xen/amd64, 0 otherwise */
#if defined(XEN) && defined(__x86_64__)
#define PG_k PG_u
#else
#define PG_k 0
#endif
/*
 * general info:
 *
 *  - for an explanation of how the i386 MMU hardware works see
 *    the comments in <machine/pte.h>.
 *
 *  - for an explanation of the general memory structure used by
 *    this pmap (including the recursive mapping), see the comments
 *    in <machine/pmap.h>.
 *
 * this file contains the code for the "pmap module."   the module's
 * job is to manage the hardware's virtual to physical address mappings.
 * note that there are two levels of mapping in the VM system:
 *
 *  [1] the upper layer of the VM system uses vm_map's and vm_map_entry's
 *      to map ranges of virtual address space to objects/files.  for
 *      example, the vm_map may say: "map VA 0x1000 to 0x22000 read-only
 *      to the file /bin/ls starting at offset zero."   note that
 *      the upper layer mapping is not concerned with how individual
 *      vm_pages are mapped.
 *
 *  [2] the lower layer of the VM system (the pmap) maintains the mappings
 *      from virtual addresses.   it is concerned with which vm_page is
 *      mapped where.   for example, when you run /bin/ls and start
 *      at page 0x1000 the fault routine may lookup the correct page
 *      of the /bin/ls file and then ask the pmap layer to establish
 *      a mapping for it.
 *
 * note that information in the lower layer of the VM system can be
 * thrown away since it can easily be reconstructed from the info
 * in the upper layer.
 *
 * data structures we use include:
 *
 *  - struct pmap: describes the address space of one thread
 *  - struct pv_entry: describes one <PMAP,VA> mapping of a PA
 *  - struct pv_head: there is one pv_head per managed page of
 *      physical memory.   the pv_head points to a list of pv_entry
 *      structures which describe all the <PMAP,VA> pairs that this
 *      page is mapped in.    this is critical for page based operations
 *      such as pmap_page_protect() [change protection on _all_ mappings
 *      of a page]
 */

/*
 * memory allocation
 *
 *  - there are three data structures that we must dynamically allocate:
 *
 * [A] new process' page directory page (PDP)
 *	- plan 1: done at pmap_create() we use
 *	  uvm_km_alloc(kernel_map, PAGE_SIZE)  [fka kmem_alloc] to do this
 *	  allocation.
 *
 *	if we are low in free physical memory then we sleep in
 *	uvm_km_alloc -- in this case this is ok since we are creating
 *	a new pmap and should not be holding any locks.
 *
 *	if the kernel is totally out of virtual space
 *	(i.e. uvm_km_alloc returns NULL), then we panic.
 *
 * [B] new page tables pages (PTP)
 *	- call uvm_pagealloc()
 *		=> success: zero page, add to pm_pdir
 *		=> failure: we are out of free vm_pages, let pmap_enter()
 *		   tell UVM about it.
 *
 *	note: for kernel PTPs, we start with NKPTP of them.   as we map
 *	kernel memory (at uvm_map time) we check to see if we've grown
 *	the kernel pmap.   if so, we call the optional function
 *	pmap_growkernel() to grow the kernel PTPs in advance.
 *
 * [C] pv_entry structures
 */

/*
 * locking
 *
 * we have the following locks that we must contend with:
 *
 * mutexes:
 *
 * - pmap lock (per pmap, part of uvm_object)
 *   this lock protects the fields in the pmap structure including
 *   the non-kernel PDEs in the PDP, and the PTEs.  it also locks
 *   in the alternate PTE space (since that is determined by the
 *   entry in the PDP).
 *
 * - pvh_lock (per pv_head)
 *   this lock protects the pv_entry list which is chained off the
 *   pv_head structure for a specific managed PA.  it is locked
 *   when traversing the list (e.g. adding/removing mappings,
 *   syncing R/M bits, etc.)
 *
 * - pmaps_lock
 *   this lock protects the list of active pmaps (headed by "pmaps").
 *   we lock it when adding or removing pmaps from this list.
 *
 * tlb shootdown
 *
 * tlb shootdowns are hard interrupts that operate outside the spl
 * framework: they don't need to be blocked provided that the pmap module
 * gets the order of events correct.  the calls are made by talking directly
 * to the lapic.  the stubs to handle the interrupts are quite short and do
 * one of the following: invalidate a single page, a range of pages, all
 * user tlb entries or the entire tlb.
 *
 * the cpus synchronize with each other using pmap_mbox structures which are
 * aligned on 64-byte cache lines.  tlb shootdowns against the kernel pmap
 * use a global mailbox and are generated using a broadcast ipi (broadcast
 * to all but the sending cpu).  shootdowns against regular pmaps use
 * per-cpu mailboxes and are multicast.  kernel and user shootdowns can
 * execute simultaneously, as can shootdowns within different multithreaded
 * processes.
 *
 * TODO:
 *
 * 1. figure out which waitpoints can be deferred to pmap_update().
 * 2. see if there is a cheap way to batch some updates.
 */
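/*
 * Illustration (not part of the original code): when two pmaps must be
 * held at once, the convention used later in pmap_map_ptes() is to take
 * the locks in address order, roughly:
 *
 *	if ((uintptr_t) pmap < (uintptr_t) ourpmap) {
 *		mutex_enter(&pmap->pm_lock);
 *		mutex_enter(&ourpmap->pm_lock);
 *	} else {
 *		mutex_enter(&ourpmap->pm_lock);
 *		mutex_enter(&pmap->pm_lock);
 *	}
 *
 * which gives a single global ordering and so avoids deadlock between
 * two threads mapping each other's PTEs.
 */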
const vaddr_t ptp_masks[] = PTP_MASK_INITIALIZER;
const int ptp_shifts[] = PTP_SHIFT_INITIALIZER;
const long nkptpmax[] = NKPTPMAX_INITIALIZER;
const long nbpd[] = NBPD_INITIALIZER;
pd_entry_t * const normal_pdes[] = PDES_INITIALIZER;
pd_entry_t * const alternate_pdes[] = APDES_INITIALIZER;

long nkptp[] = NKPTP_INITIALIZER;

static kmutex_t pmaps_lock;

static vaddr_t pmap_maxkvaddr;

#define COUNT(x)	/* nothing */

/*
 * XXX kludge: dummy locking to make KASSERTs in uvm_page.c comfortable.
 * actual locking is done by pm_lock.
 */
#if defined(DIAGNOSTIC)
#define	PMAP_SUBOBJ_LOCK(pm, idx) \
	KASSERT(mutex_owned(&(pm)->pm_lock)); \
	if ((idx) != 0) \
		mutex_enter(&(pm)->pm_obj[(idx)].vmobjlock)
#define	PMAP_SUBOBJ_UNLOCK(pm, idx) \
	KASSERT(mutex_owned(&(pm)->pm_lock)); \
	if ((idx) != 0) \
		mutex_exit(&(pm)->pm_obj[(idx)].vmobjlock)
#else /* defined(DIAGNOSTIC) */
#define	PMAP_SUBOBJ_LOCK(pm, idx)	/* nothing */
#define	PMAP_SUBOBJ_UNLOCK(pm, idx)	/* nothing */
#endif /* defined(DIAGNOSTIC) */

/*
 * Misc. event counters.
 */
struct evcnt pmap_iobmp_evcnt;
struct evcnt pmap_ldt_evcnt;

/*
 * Global TLB shootdown mailbox.
 */
struct evcnt pmap_tlb_evcnt __aligned(64);
struct pmap_mbox pmap_mbox __aligned(64);

/*
 * Per-CPU data.  The pmap mailbox is cache intensive so gets its
 * own line.  Note that the mailbox must be the first item.
 */
struct pmap_cpu {
	/* TLB shootdown */
	struct pmap_mbox pc_mbox;
};

union {
	struct pmap_cpu pc;
	uint8_t padding[64];
} pmap_cpu[MAXCPUS] __aligned(64);

/*
 * global data structures
 */

static struct pmap kernel_pmap_store;	/* the kernel's pmap (proc0) */
struct pmap *const kernel_pmap_ptr = &kernel_pmap_store;

/*
 * pmap_pg_g: if our processor supports PG_G in the PTE then we
 * set pmap_pg_g to PG_G (otherwise it is zero).
 */

int pmap_pg_g = 0;

/*
 * pmap_largepages: if our processor supports PG_PS and we are
 * using it, this is set to true.
 */

int pmap_largepages;

/*
 * i386 physical memory comes in a big contig chunk with a small
 * hole toward the front of it...  the following two paddr_t's
 * (shared with machdep.c) describe the physical address space
 * of this machine.
 */
paddr_t avail_start;	/* PA of first available physical page */
paddr_t avail_end;	/* PA of last available physical page */
#ifdef XEN
#ifdef __x86_64__
/* Dummy PGD for user cr3, used between pmap_deactivate() and pmap_activate() */
static paddr_t xen_dummy_user_pgd;
/* Currently active user PGD (can't use rcr3()) */
static paddr_t xen_current_user_pgd = 0;
#endif /* __x86_64__ */
paddr_t pmap_pa_start;	/* PA of first physical page for this domain */
paddr_t pmap_pa_end;	/* PA of last physical page for this domain */
#endif /* XEN */

#define	VM_PAGE_TO_PP(pg)	(&(pg)->mdpage.mp_pp)

#define	pp_lock(pp)	mutex_spin_enter(&(pp)->pp_lock)
#define	pp_unlock(pp)	mutex_spin_exit(&(pp)->pp_lock)
#define	pp_locked(pp)	mutex_owned(&(pp)->pp_lock)

#define	PV_HASH_SIZE		32768
#define	PV_HASH_LOCK_CNT	32

struct pv_hash_lock {
	kmutex_t lock;
} __aligned(CACHE_LINE_SIZE) pv_hash_locks[PV_HASH_LOCK_CNT]
    __aligned(CACHE_LINE_SIZE);

struct pv_hash_head {
	SLIST_HEAD(, pv_entry) hh_list;
} pv_hash_heads[PV_HASH_SIZE];

static u_int
pvhash_hash(struct vm_page *ptp, vaddr_t va)
{

	return (uintptr_t)ptp / sizeof(*ptp) + (va >> PAGE_SHIFT);
}

static struct pv_hash_head *
pvhash_head(u_int hash)
{

	return &pv_hash_heads[hash % PV_HASH_SIZE];
}

static kmutex_t *
pvhash_lock(u_int hash)
{

	return &pv_hash_locks[hash % PV_HASH_LOCK_CNT].lock;
}

static struct pv_entry *
pvhash_remove(struct pv_hash_head *hh, struct vm_page *ptp, vaddr_t va)
{
	struct pv_entry *pve;
	struct pv_entry *prev;

	prev = NULL;
	SLIST_FOREACH(pve, &hh->hh_list, pve_hash) {
		if (pve->pve_pte.pte_ptp == ptp &&
		    pve->pve_pte.pte_va == va) {
			if (prev != NULL) {
				SLIST_REMOVE_AFTER(prev, pve_hash);
			} else {
				SLIST_REMOVE_HEAD(&hh->hh_list, pve_hash);
			}
			break;
		}
		prev = pve;
	}
	return pve;
}
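/*
 * Sketch (for illustration only): the helpers above are always used
 * together, hashing a (ptp, va) pair to pick both the bucket and its
 * lock, e.g. as pmap_remove_pv() does later in this file:
 *
 *	hash = pvhash_hash(ptp, va);
 *	lock = pvhash_lock(hash);
 *	hh = pvhash_head(hash);
 *	mutex_spin_enter(lock);
 *	pve = pvhash_remove(hh, ptp, va);
 *	mutex_spin_exit(lock);
 */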
/*
 * other data structures
 */

static pt_entry_t protection_codes[8];	/* maps MI prot to i386 prot code */
static bool pmap_initialized = false;	/* pmap_init done yet? */

/*
 * the following two vaddr_t's are used during system startup
 * to keep track of how much of the kernel's VM space we have used.
 * once the system is started, the management of the remaining kernel
 * VM space is turned over to the kernel_map vm_map.
 */

static vaddr_t virtual_avail;	/* VA of first free KVA */
static vaddr_t virtual_end;	/* VA of last free KVA */

/*
 * linked list of all non-kernel pmaps
 */

static struct pmap_head pmaps;

/*
 * pool that pmap structures are allocated from
 */

static struct pool_cache pmap_cache;

/*
 * pv_entry cache
 */

static struct pool_cache pmap_pv_cache;

/*
 * MULTIPROCESSOR: special VA's/ PTE's are actually allocated inside a
 * maxcpus*NPTECL array of PTE's, to avoid cache line thrashing
 * due to false sharing.
 */

#ifdef MULTIPROCESSOR
#define PTESLEW(pte, id) ((pte)+(id)*NPTECL)
#define VASLEW(va,id) ((va)+(id)*NPTECL*PAGE_SIZE)
#else
#define PTESLEW(pte, id) (pte)
#define VASLEW(va,id) (va)
#endif

/*
 * special VAs and the PTEs that map them
 */
static pt_entry_t *csrc_pte, *cdst_pte, *zero_pte, *ptp_pte, *early_zero_pte;
static char *csrcp, *cdstp, *zerop, *ptpp, *early_zerop;
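/*
 * Example (a sketch, not code from this file): a caller wanting its
 * per-CPU copy of one of the special VAs above would typically do
 * something like
 *
 *	id = cpu_number();
 *	zpte = PTESLEW(zero_pte, id);
 *	zerova = VASLEW(zerop, id);
 *
 * so that each CPU touches a different PTE/VA pair and no cache line
 * is shared between CPUs.
 */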
/*
 * pool and cache that PDPs are allocated from
 */

static struct pool_cache pmap_pdp_cache;
int	pmap_pdp_ctor(void *, void *, int);
void	pmap_pdp_dtor(void *, void *);
#ifdef PAE
/* need to allocate items of 4 pages */
void	*pmap_pdp_alloc(struct pool *, int);
void	pmap_pdp_free(struct pool *, void *);
static struct pool_allocator pmap_pdp_allocator = {
	.pa_alloc = pmap_pdp_alloc,
	.pa_free = pmap_pdp_free,
	.pa_pagesz = PAGE_SIZE * PDP_SIZE,
};
#endif /* PAE */

void *vmmap; /* XXX: used by mem.c... it should really uvm_map_reserve it */

extern vaddr_t idt_vaddr;			/* we allocate IDT early */
extern paddr_t idt_paddr;

#ifdef _LP64
extern vaddr_t lo32_vaddr;
extern vaddr_t lo32_paddr;
#endif

extern int end;

#ifdef i386
/* stuff to fix the pentium f00f bug */
extern vaddr_t pentium_idt_vaddr;
#endif


/*
 * local prototypes
 */

static struct vm_page	*pmap_get_ptp(struct pmap *, vaddr_t,
				      pd_entry_t * const *);
static struct vm_page	*pmap_find_ptp(struct pmap *, vaddr_t, paddr_t, int);
static void		 pmap_freepage(struct pmap *, struct vm_page *, int);
static void		 pmap_free_ptp(struct pmap *, struct vm_page *,
				       vaddr_t, pt_entry_t *,
				       pd_entry_t * const *);
static bool		 pmap_is_curpmap(struct pmap *);
static bool		 pmap_is_active(struct pmap *, struct cpu_info *, bool);
static void		 pmap_map_ptes(struct pmap *, struct pmap **,
				       pt_entry_t **, pd_entry_t * const **);
static bool		 pmap_remove_pte(struct pmap *, struct vm_page *,
					 pt_entry_t *, vaddr_t,
					 struct pv_entry **);
static pt_entry_t	 pmap_remove_ptes(struct pmap *, struct vm_page *,
					  vaddr_t, vaddr_t, vaddr_t,
					  struct pv_entry **);

static void		 pmap_unmap_ptes(struct pmap *, struct pmap *);
static void		 pmap_unmap_apdp(void);
static bool		 pmap_get_physpage(vaddr_t, int, paddr_t *);
static int		 pmap_pdes_invalid(vaddr_t, pd_entry_t * const *,
					   pd_entry_t *);
#define	pmap_pdes_valid(va, pdes, lastpde) \
	(pmap_pdes_invalid((va), (pdes), (lastpde)) == 0)
static void		 pmap_alloc_level(pd_entry_t * const *, vaddr_t, int,
					  long *);

static bool		 pmap_reactivate(struct pmap *);

/*
 * p m a p   h e l p e r   f u n c t i o n s
 */

static inline void
pmap_stats_update(struct pmap *pmap, int resid_diff, int wired_diff)
{

	if (pmap == pmap_kernel()) {
		atomic_add_long(&pmap->pm_stats.resident_count, resid_diff);
		atomic_add_long(&pmap->pm_stats.wired_count, wired_diff);
	} else {
		KASSERT(mutex_owned(&pmap->pm_lock));
		pmap->pm_stats.resident_count += resid_diff;
		pmap->pm_stats.wired_count += wired_diff;
	}
}

static inline void
pmap_stats_update_bypte(struct pmap *pmap, pt_entry_t npte, pt_entry_t opte)
{
	int resid_diff = ((npte & PG_V) ? 1 : 0) - ((opte & PG_V) ? 1 : 0);
	int wired_diff = ((npte & PG_W) ? 1 : 0) - ((opte & PG_W) ? 1 : 0);

	KASSERT((npte & (PG_V | PG_W)) != PG_W);
	KASSERT((opte & (PG_V | PG_W)) != PG_W);

	pmap_stats_update(pmap, resid_diff, wired_diff);
}

/*
 * ptp_to_pmap: lookup pmap by ptp
 */

static struct pmap *
ptp_to_pmap(struct vm_page *ptp)
{
	struct pmap *pmap;

	if (ptp == NULL) {
		return pmap_kernel();
	}
	pmap = (struct pmap *)ptp->uobject;
	KASSERT(pmap != NULL);
	KASSERT(&pmap->pm_obj[0] == ptp->uobject);
	return pmap;
}

static inline struct pv_pte *
pve_to_pvpte(struct pv_entry *pve)
{

	KASSERT((void *)&pve->pve_pte == (void *)pve);
	return &pve->pve_pte;
}

static inline struct pv_entry *
pvpte_to_pve(struct pv_pte *pvpte)
{
	struct pv_entry *pve = (void *)pvpte;

	KASSERT(pve_to_pvpte(pve) == pvpte);
	return pve;
}

/*
 * pv_pte_first, pv_pte_next: PV list iterator.
 */

static struct pv_pte *
pv_pte_first(struct pmap_page *pp)
{

	KASSERT(pp_locked(pp));
	if ((pp->pp_flags & PP_EMBEDDED) != 0) {
		return &pp->pp_pte;
	}
	return pve_to_pvpte(LIST_FIRST(&pp->pp_head.pvh_list));
}

static struct pv_pte *
pv_pte_next(struct pmap_page *pp, struct pv_pte *pvpte)
{

	KASSERT(pvpte != NULL);
	KASSERT(pp_locked(pp));
	if (pvpte == &pp->pp_pte) {
		KASSERT((pp->pp_flags & PP_EMBEDDED) != 0);
		return NULL;
	}
	KASSERT((pp->pp_flags & PP_EMBEDDED) == 0);
	return pve_to_pvpte(LIST_NEXT(pvpte_to_pve(pvpte), pve_list));
}
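/*
 * Usage sketch (illustrative): callers walk all mappings of a page
 * with the iterator pair above while holding the pp lock, roughly:
 *
 *	pp_lock(pp);
 *	for (pvpte = pv_pte_first(pp); pvpte != NULL;
 *	    pvpte = pv_pte_next(pp, pvpte)) {
 *		... look at pvpte->pte_ptp / pvpte->pte_va ...
 *	}
 *	pp_unlock(pp);
 */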
/*
 * pmap_is_curpmap: is this pmap the one currently loaded [in %cr3]?
 *		of course the kernel is always loaded
 */

inline static bool
pmap_is_curpmap(struct pmap *pmap)
{
#if defined(XEN) && defined(__x86_64__)
	/*
	 * Only kernel pmap is physically loaded.
	 * User PGD may be active, but TLB will be flushed
	 * with HYPERVISOR_iret anyway, so let's say no
	 */
	return(pmap == pmap_kernel());
#else /* XEN && __x86_64__*/
	return((pmap == pmap_kernel()) ||
	       (pmap == curcpu()->ci_pmap));
#endif
}

/*
 * pmap_is_active: is this pmap loaded into the specified processor's %cr3?
 */

inline static bool
pmap_is_active(struct pmap *pmap, struct cpu_info *ci, bool kernel)
{

	return (pmap == pmap_kernel() ||
	    (pmap->pm_cpus & ci->ci_cpumask) != 0 ||
	    (kernel && (pmap->pm_kernel_cpus & ci->ci_cpumask) != 0));
}

static void
pmap_apte_flush(struct pmap *pmap)
{

	KASSERT(kpreempt_disabled());

	/*
	 * Flush the APTE mapping from all other CPUs that
	 * are using the pmap we are using (whose APTE space
	 * is the one we've just modified).
	 *
	 * XXXthorpej -- find a way to defer the IPI.
	 */
	pmap_tlb_shootdown(pmap, (vaddr_t)-1LL, 0, 0);
	pmap_tlb_shootwait();
}

/*
 * Unmap the content of APDP PDEs
 */
static void
pmap_unmap_apdp(void)
{
	int i;

	for (i = 0; i < PDP_SIZE; i++) {
		pmap_pte_set(APDP_PDE+i, 0);
#if defined (XEN) && defined (PAE)
		/* clear shadow entries too */
		pmap_pte_set(APDP_PDE_SHADOW+i, 0);
#endif
	}
}

/*
 * Add a reference to the specified pmap.
 */

inline void
pmap_reference(struct pmap *pmap)
{

	atomic_inc_uint(&pmap->pm_obj[0].uo_refs);
}

/*
 * pmap_map_ptes: map a pmap's PTEs into KVM and lock them in
 *
 * => we lock enough pmaps to keep things locked in
 * => must be undone with pmap_unmap_ptes before returning
 */

static void
pmap_map_ptes(struct pmap *pmap, struct pmap **pmap2,
	      pd_entry_t **ptepp, pd_entry_t * const **pdeppp)
{
	pd_entry_t opde, npde;
	struct pmap *ourpmap;
	struct cpu_info *ci;
	struct lwp *l;
	bool iscurrent;
	uint64_t ncsw;
#ifdef XEN
	int s;
#endif

	/* the kernel's pmap is always accessible */
	if (pmap == pmap_kernel()) {
		*pmap2 = NULL;
		*ptepp = PTE_BASE;
		*pdeppp = normal_pdes;
		return;
	}
	KASSERT(kpreempt_disabled());

 retry:
	l = curlwp;
	ncsw = l->l_ncsw;
	ourpmap = NULL;
	ci = curcpu();
#if defined(XEN) && defined(__x86_64__)
	/*
	 * curmap can only be pmap_kernel so at this point
	 * pmap_is_curpmap is always false
	 */
	iscurrent = 0;
	ourpmap = pmap_kernel();
#else /* XEN && __x86_64__*/
	if (ci->ci_want_pmapload &&
	    vm_map_pmap(&l->l_proc->p_vmspace->vm_map) == pmap) {
		pmap_load();
		if (l->l_ncsw != ncsw)
			goto retry;
	}
	iscurrent = pmap_is_curpmap(pmap);
	/* if curpmap then we are always mapped */
	if (iscurrent) {
		mutex_enter(&pmap->pm_lock);
		*pmap2 = NULL;
		*ptepp = PTE_BASE;
		*pdeppp = normal_pdes;
		goto out;
	}
	ourpmap = ci->ci_pmap;
#endif /* XEN && __x86_64__ */

	/* need to lock both curpmap and pmap: use ordered locking */
	pmap_reference(ourpmap);
	if ((uintptr_t) pmap < (uintptr_t) ourpmap) {
		mutex_enter(&pmap->pm_lock);
		mutex_enter(&ourpmap->pm_lock);
	} else {
		mutex_enter(&ourpmap->pm_lock);
		mutex_enter(&pmap->pm_lock);
	}

	if (l->l_ncsw != ncsw)
		goto unlock_and_retry;

	/* need to load a new alternate pt space into curpmap? */
	COUNT(apdp_pde_map);
	opde = *APDP_PDE;
	if (!pmap_valid_entry(opde) ||
	    pmap_pte2pa(opde) != pmap_pdirpa(pmap, 0)) {
#ifdef XEN
		int i;
		s = splvm();
		/* Make recursive entry usable in user PGD */
		for (i = 0; i < PDP_SIZE; i++) {
			npde = pmap_pa2pte(
			    pmap_pdirpa(pmap, i * NPDPG)) | PG_k | PG_V;
			xpq_queue_pte_update(
			    xpmap_ptom(pmap_pdirpa(pmap, PDIR_SLOT_PTE + i)),
			    npde);
			xpq_queue_pte_update(xpmap_ptetomach(&APDP_PDE[i]),
			    npde);
#ifdef PAE
			/* update shadow entry too */
			xpq_queue_pte_update(
			    xpmap_ptetomach(&APDP_PDE_SHADOW[i]), npde);
#endif /* PAE */
			xpq_queue_invlpg(
			    (vaddr_t)&pmap->pm_pdir[PDIR_SLOT_PTE + i]);
		}
		if (pmap_valid_entry(opde))
			pmap_apte_flush(ourpmap);
		splx(s);
#else /* XEN */
		int i;
		for (i = 0; i < PDP_SIZE; i++) {
			npde = pmap_pa2pte(
			    pmap_pdirpa(pmap, i * NPDPG)) | PG_RW | PG_V;
			pmap_pte_set(APDP_PDE+i, npde);
		}
		pmap_pte_flush();
		if (pmap_valid_entry(opde))
			pmap_apte_flush(ourpmap);
#endif /* XEN */
	}
	*pmap2 = ourpmap;
	*ptepp = APTE_BASE;
	*pdeppp = alternate_pdes;
	KASSERT(l->l_ncsw == ncsw);
#if !defined(XEN) || !defined(__x86_64__)
 out:
#endif
	/*
	 * might have blocked, need to retry?
	 */
	if (l->l_ncsw != ncsw) {
 unlock_and_retry:
		if (ourpmap != NULL) {
			mutex_exit(&ourpmap->pm_lock);
			pmap_destroy(ourpmap);
		}
		mutex_exit(&pmap->pm_lock);
		goto retry;
	}

	return;
}
/*
 * pmap_unmap_ptes: unlock the PTE mapping of "pmap"
 */

static void
pmap_unmap_ptes(struct pmap *pmap, struct pmap *pmap2)
{

	if (pmap == pmap_kernel()) {
		return;
	}
	KASSERT(kpreempt_disabled());
	if (pmap2 == NULL) {
		mutex_exit(&pmap->pm_lock);
	} else {
#if defined(XEN) && defined(__x86_64__)
		KASSERT(pmap2 == pmap_kernel());
#else
		KASSERT(curcpu()->ci_pmap == pmap2);
#endif
#if defined(MULTIPROCESSOR)
		pmap_unmap_apdp();
		pmap_pte_flush();
		pmap_apte_flush(pmap2);
#endif
		COUNT(apdp_pde_unmap);
		mutex_exit(&pmap->pm_lock);
		mutex_exit(&pmap2->pm_lock);
		pmap_destroy(pmap2);
	}
}
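/*
 * Typical calling pattern (sketch): pmap_map_ptes() and
 * pmap_unmap_ptes() always bracket a critical section, with preemption
 * disabled, e.g.
 *
 *	kpreempt_disable();
 *	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
 *	... examine or modify ptes[pl1_i(va)] ...
 *	pmap_unmap_ptes(pmap, pmap2);
 *	kpreempt_enable();
 */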
inline static void
pmap_exec_account(struct pmap *pm, vaddr_t va, pt_entry_t opte, pt_entry_t npte)
{

#if !defined(__x86_64__)
	if (curproc == NULL || curproc->p_vmspace == NULL ||
	    pm != vm_map_pmap(&curproc->p_vmspace->vm_map))
		return;

	if ((opte ^ npte) & PG_X)
		pmap_update_pg(va);

	/*
	 * Executability was removed on the last executable change.
	 * Reset the code segment to something conservative and
	 * let the trap handler deal with setting the right limit.
	 * We can't do that because of locking constraints on the vm map.
	 */

	if ((opte & PG_X) && (npte & PG_X) == 0 && va == pm->pm_hiexec) {
		struct trapframe *tf = curlwp->l_md.md_regs;

		tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL);
		pm->pm_hiexec = I386_MAX_EXE_ADDR;
	}
#endif /* !defined(__x86_64__) */
}

#if !defined(__x86_64__)
/*
 * Fixup the code segment to cover all potential executable mappings.
 * returns 0 if no changes to the code segment were made.
 */

int
pmap_exec_fixup(struct vm_map *map, struct trapframe *tf, struct pcb *pcb)
{
	struct vm_map_entry *ent;
	struct pmap *pm = vm_map_pmap(map);
	vaddr_t va = 0;

	vm_map_lock_read(map);
	for (ent = (&map->header)->next; ent != &map->header; ent = ent->next) {

		/*
		 * This entry has greater va than the entries before.
		 * We need to make it point to the last page, not past it.
		 */

		if (ent->protection & VM_PROT_EXECUTE)
			va = trunc_page(ent->end) - PAGE_SIZE;
	}
	vm_map_unlock_read(map);
	if (va == pm->pm_hiexec && tf->tf_cs == GSEL(GUCODEBIG_SEL, SEL_UPL))
		return (0);

	pm->pm_hiexec = va;
	if (pm->pm_hiexec > I386_MAX_EXE_ADDR) {
		tf->tf_cs = GSEL(GUCODEBIG_SEL, SEL_UPL);
	} else {
		tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL);
		return (0);
	}
	return (1);
}
#endif /* !defined(__x86_64__) */

/*
 * p m a p   k e n t e r   f u n c t i o n s
 *
 * functions to quickly enter/remove pages from the kernel address
 * space.   pmap_kremove is exported to MI kernel.  we make use of
 * the recursive PTE mappings.
 */

/*
 * pmap_kenter_pa: enter a kernel mapping without R/M (pv_entry) tracking
 *
 * => no need to lock anything, assume va is already allocated
 * => should be faster than normal pmap enter function
 */

void
pmap_kenter_pa(vaddr_t va, paddr_t pa, vm_prot_t prot, u_int flags)
{
	pt_entry_t *pte, opte, npte;

	KASSERT(!(prot & ~VM_PROT_ALL));

	if (va < VM_MIN_KERNEL_ADDRESS)
		pte = vtopte(va);
	else
		pte = kvtopte(va);
#ifdef DOM0OPS
	if (pa < pmap_pa_start || pa >= pmap_pa_end) {
#ifdef DEBUG
		printk("pmap_kenter_pa: pa 0x%" PRIx64 " for va 0x%" PRIx64
		    " outside range\n", (int64_t)pa, (int64_t)va);
#endif /* DEBUG */
		npte = pa;
	} else
#endif /* DOM0OPS */
		npte = pmap_pa2pte(pa);
	npte |= protection_codes[prot] | PG_k | PG_V | pmap_pg_g;
	if (flags & PMAP_NOCACHE)
		npte |= PG_N;
	opte = pmap_pte_testset(pte, npte); /* zap! */
#if defined(DIAGNOSTIC)
	/* XXX For now... */
	if (opte & PG_PS)
		panic("pmap_kenter_pa: PG_PS");
#endif
	if ((opte & (PG_V | PG_U)) == (PG_V | PG_U)) {
		/* This should not happen, so no need to batch updates. */
		kpreempt_disable();
		pmap_tlb_shootdown(pmap_kernel(), va, 0, opte);
		kpreempt_enable();
	}
}
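/*
 * Usage sketch (illustrative, following the usual pmap(9) convention
 * rather than code from this file): wiring a known physical page into
 * an already-allocated kernel VA is simply
 *
 *	pmap_kenter_pa(va, pa, VM_PROT_READ | VM_PROT_WRITE, 0);
 *
 * no pv tracking is done, so the caller must remember (va, pa) itself
 * and eventually undo the mapping with pmap_kremove().
 */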
void
pmap_emap_enter(vaddr_t va, paddr_t pa, vm_prot_t prot)
{
	pt_entry_t *pte, opte, npte;

	KASSERT((prot & ~VM_PROT_ALL) == 0);
	pte = (va < VM_MIN_KERNEL_ADDRESS) ? vtopte(va) : kvtopte(va);

#ifdef DOM0OPS
	if (pa < pmap_pa_start || pa >= pmap_pa_end) {
		npte = pa;
	} else
#endif
		npte = pmap_pa2pte(pa);

	npte |= protection_codes[prot] | PG_k | PG_V;
	opte = pmap_pte_testset(pte, npte);
}

/*
 * pmap_emap_sync: perform TLB flush or pmap load, if it was deferred.
 */
void
pmap_emap_sync(bool canload)
{
	struct cpu_info *ci = curcpu();
	struct pmap *pmap;

	KASSERT(kpreempt_disabled());
	if (__predict_true(ci->ci_want_pmapload && canload)) {
		/*
		 * XXX: Hint for pmap_reactivate(), which might suggest to
		 * not perform TLB flush, if state has not changed.
		 */
		pmap = vm_map_pmap(&curlwp->l_proc->p_vmspace->vm_map);
		if (__predict_false(pmap == ci->ci_pmap)) {
			const uint32_t cpumask = ci->ci_cpumask;
			atomic_and_32(&pmap->pm_cpus, ~cpumask);
		}
		pmap_load();
		KASSERT(ci->ci_want_pmapload == 0);
	} else {
		tlbflush();
	}
}

void
pmap_emap_remove(vaddr_t sva, vsize_t len)
{
	pt_entry_t *pte, xpte;
	vaddr_t va, eva = sva + len;

	xpte = 0;
	for (va = sva; va < eva; va += PAGE_SIZE) {
		pte = (va < VM_MIN_KERNEL_ADDRESS) ? vtopte(va) : kvtopte(va);
		xpte |= pmap_pte_testset(pte, 0);
	}
}

#ifdef XEN
/*
 * pmap_kenter_ma: enter a kernel mapping without R/M (pv_entry) tracking
 *
 * => no need to lock anything, assume va is already allocated
 * => should be faster than normal pmap enter function
 * => we expect a MACHINE address
 */

void
pmap_kenter_ma(vaddr_t va, paddr_t ma, vm_prot_t prot, u_int flags)
{
	pt_entry_t *pte, opte, npte;

	if (va < VM_MIN_KERNEL_ADDRESS)
		pte = vtopte(va);
	else
		pte = kvtopte(va);

	npte = ma | ((prot & VM_PROT_WRITE) ? PG_RW : PG_RO) |
	     PG_V | PG_k;
	if (flags & PMAP_NOCACHE)
		npte |= PG_N;

#ifndef XEN
	if ((cpu_feature & CPUID_NOX) && !(prot & VM_PROT_EXECUTE))
		npte |= PG_NX;
#endif
	opte = pmap_pte_testset(pte, npte); /* zap! */

	if (pmap_valid_entry(opte)) {
#if defined(MULTIPROCESSOR)
		kpreempt_disable();
		pmap_tlb_shootdown(pmap_kernel(), va, 0, opte);
		kpreempt_enable();
#else
		/* Don't bother deferring in the single CPU case. */
		pmap_update_pg(va);
#endif
	}
}
#endif	/* XEN */

#if defined(__x86_64__)
/*
 * Change protection for a virtual address. Local for a CPU only, don't
 * care about TLB shootdowns.
 *
 * => must be called with preemption disabled
 */
void
pmap_changeprot_local(vaddr_t va, vm_prot_t prot)
{
	pt_entry_t *pte, opte, npte;

	KASSERT(kpreempt_disabled());

	if (va < VM_MIN_KERNEL_ADDRESS)
		pte = vtopte(va);
	else
		pte = kvtopte(va);

	npte = opte = *pte;

	if ((prot & VM_PROT_WRITE) != 0)
		npte |= PG_RW;
	else
		npte &= ~PG_RW;

	if (opte != npte) {
		pmap_pte_set(pte, npte);
		pmap_pte_flush();
		invlpg(va);
	}
}
#endif /* defined(__x86_64__) */

/*
 * pmap_kremove: remove a kernel mapping(s) without R/M (pv_entry) tracking
 *
 * => no need to lock anything
 * => caller must dispose of any vm_page mapped in the va range
 * => note: not an inline function
 * => we assume the va is page aligned and the len is a multiple of PAGE_SIZE
 * => we assume kernel only unmaps valid addresses and thus don't bother
 *    checking the valid bit before doing TLB flushing
 * => must be followed by call to pmap_update() before reuse of page
 */

void
pmap_kremove(vaddr_t sva, vsize_t len)
{
	pt_entry_t *pte, xpte;
	vaddr_t va, eva;

	eva = sva + len;
	xpte = 0;

	for (va = sva; va < eva; va += PAGE_SIZE) {
		if (va < VM_MIN_KERNEL_ADDRESS)
			pte = vtopte(va);
		else
			pte = kvtopte(va);
		xpte |= pmap_pte_testset(pte, 0); /* zap! */
#if defined(DIAGNOSTIC)
		/* XXX For now... */
		if (xpte & PG_PS)
			panic("pmap_kremove: PG_PS");
		if (xpte & PG_PVLIST)
			panic("pmap_kremove: PG_PVLIST mapping for 0x%lx",
			    va);
#endif
	}
	if ((xpte & (PG_V | PG_U)) == (PG_V | PG_U)) {
		kpreempt_disable();
		pmap_tlb_shootdown(pmap_kernel(), sva, eva, xpte);
		kpreempt_enable();
	}
}
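/*
 * Illustrative counterpart to pmap_kenter_pa() above: per the rules in
 * the comment before pmap_kremove(), tearing a mapping down again is
 *
 *	pmap_kremove(va, PAGE_SIZE);
 *	pmap_update(pmap_kernel());
 *
 * where the pmap_update() call is what makes it safe to reuse or free
 * the underlying page.
 */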
/*
 * p m a p   i n i t   f u n c t i o n s
 *
 * pmap_bootstrap and pmap_init are called during system startup
 * to init the pmap module.   pmap_bootstrap() does a low level
 * init just to get things rolling.   pmap_init() finishes the job.
 */

/*
 * pmap_bootstrap: get the system in a state where it can run with VM
 *	properly enabled (called before main()).   the VM system is
 *	fully init'd later...
 *
 * => on i386, locore.s has already enabled the MMU by allocating
 *	a PDP for the kernel, and nkpde PTP's for the kernel.
 * => kva_start is the first free virtual address in kernel space
 */

void
pmap_bootstrap(vaddr_t kva_start)
{
	struct pmap *kpm;
	pt_entry_t *pte;
	struct pcb *pcb;
	int i;
	vaddr_t kva;
#ifdef XEN
	pt_entry_t pg_nx = 0;
#else
	unsigned long p1i;
	vaddr_t kva_end;
	pt_entry_t pg_nx = (cpu_feature & CPUID_NOX ? PG_NX : 0);
#endif

	/*
	 * set up our local static global vars that keep track of the
	 * usage of KVM before kernel_map is set up
	 */

	virtual_avail = kva_start;		/* first free KVA */
	virtual_end = VM_MAX_KERNEL_ADDRESS;	/* last KVA */

	/*
	 * set up protection_codes: we need to be able to convert from
	 * a MI protection code (some combo of VM_PROT...) to something
	 * we can jam into a i386 PTE.
	 */

	protection_codes[VM_PROT_NONE] = pg_nx;			/* --- */
	protection_codes[VM_PROT_EXECUTE] = PG_RO | PG_X;	/* --x */
	protection_codes[VM_PROT_READ] = PG_RO | pg_nx;		/* -r- */
	protection_codes[VM_PROT_READ|VM_PROT_EXECUTE] = PG_RO | PG_X;/* -rx */
	protection_codes[VM_PROT_WRITE] = PG_RW | pg_nx;	/* w-- */
	protection_codes[VM_PROT_WRITE|VM_PROT_EXECUTE] = PG_RW | PG_X;/* w-x */
	protection_codes[VM_PROT_WRITE|VM_PROT_READ] = PG_RW | pg_nx;
								/* wr- */
	protection_codes[VM_PROT_ALL] = PG_RW | PG_X;		/* wrx */

	/*
	 * now we init the kernel's pmap
	 *
	 * the kernel pmap's pm_obj is not used for much.   however, in
	 * user pmaps the pm_obj contains the list of active PTPs.
	 * the pm_obj currently does not have a pager.   it might be possible
	 * to add a pager that would allow a process to read-only mmap its
	 * own page tables (fast user level vtophys?).   this may or may not
	 * be useful.
	 */

	kpm = pmap_kernel();
	for (i = 0; i < PTP_LEVELS - 1; i++) {
		UVM_OBJ_INIT(&kpm->pm_obj[i], NULL, 1);
		kpm->pm_ptphint[i] = NULL;
	}
	memset(&kpm->pm_list, 0, sizeof(kpm->pm_list));  /* pm_list not used */
	pcb = lwp_getpcb(&lwp0);
	kpm->pm_pdir = (pd_entry_t *)(pcb->pcb_cr3 + KERNBASE);
#ifdef PAE
	for (i = 0; i < PDP_SIZE; i++)
		kpm->pm_pdirpa[i] = (paddr_t)pcb->pcb_cr3 + PAGE_SIZE * i;
#else
	kpm->pm_pdirpa = (paddr_t)pcb->pcb_cr3;
#endif
	kpm->pm_stats.wired_count = kpm->pm_stats.resident_count =
		x86_btop(kva_start - VM_MIN_KERNEL_ADDRESS);

	/*
	 * the above is just a rough estimate and not critical to the proper
	 * operation of the system.
	 */

#ifndef XEN
	/*
	 * Begin to enable global TLB entries if they are supported.
	 * The G bit has no effect until the CR4_PGE bit is set in CR4,
	 * which happens in cpu_init(), which is run on each cpu
	 * (and happens later)
	 */

	if (cpu_feature & CPUID_PGE) {
		pmap_pg_g = PG_G;		/* enable software */

		/* add PG_G attribute to already mapped kernel pages */
		if (KERNBASE == VM_MIN_KERNEL_ADDRESS) {
			kva_end = virtual_avail;
		} else {
			extern vaddr_t eblob, esym;
			kva_end = (vaddr_t)&end;
			if (esym > kva_end)
				kva_end = esym;
			if (eblob > kva_end)
				kva_end = eblob;
			kva_end = roundup(kva_end, PAGE_SIZE);
		}
		for (kva = KERNBASE; kva < kva_end; kva += PAGE_SIZE) {
			p1i = pl1_i(kva);
			if (pmap_valid_entry(PTE_BASE[p1i]))
				PTE_BASE[p1i] |= PG_G;
		}
	}

	/*
	 * enable large pages if they are supported.
	 */
	if (cpu_feature & CPUID_PSE) {
		paddr_t pa;
		pd_entry_t *pde;
		extern char __data_start;

		lcr4(rcr4() | CR4_PSE);	/* enable hardware (via %cr4) */
		pmap_largepages = 1;	/* enable software */

		/*
		 * the TLB must be flushed after enabling large pages
		 * on Pentium CPUs, according to section 3.6.2.2 of
		 * "Intel Architecture Software Developer's Manual,
		 * Volume 3: System Programming".
		 */
		tlbflush();

		/*
		 * now, remap the kernel text using large pages.  we
		 * assume that the linker has properly aligned the
		 * .data segment to a NBPD_L2 boundary.
		 */
		kva_end = rounddown((vaddr_t)&__data_start, NBPD_L1);
		for (pa = 0, kva = KERNBASE; kva + NBPD_L2 <= kva_end;
		     kva += NBPD_L2, pa += NBPD_L2) {
			pde = &L2_BASE[pl2_i(kva)];
			*pde = pa | pmap_pg_g | PG_PS |
			    PG_KR | PG_V;	/* zap! */
			tlbflush();
		}
#if defined(DEBUG)
		aprint_normal("kernel text is mapped with %" PRIuPSIZE " large "
		    "pages and %" PRIuPSIZE " normal pages\n",
		    howmany(kva - KERNBASE, NBPD_L2),
		    howmany((vaddr_t)&__data_start - kva, NBPD_L1));
#endif /* defined(DEBUG) */
	}
#endif /* !XEN */

	if (VM_MIN_KERNEL_ADDRESS != KERNBASE) {
		/*
		 * zero_pte is stuck at the end of mapped space for the kernel
		 * image (disjunct from kva space). This is done so that it
		 * can safely be used in pmap_growkernel (pmap_get_physpage),
		 * when it's called for the first time.
		 * XXXfvdl fix this for MULTIPROCESSOR later.
		 */

		early_zerop = (void *)(KERNBASE + NKL2_KIMG_ENTRIES * NBPD_L2);
		early_zero_pte = PTE_BASE + pl1_i((vaddr_t)early_zerop);
	}

	/*
	 * now we allocate the "special" VAs which are used for tmp mappings
	 * by the pmap (and other modules).    we allocate the VAs by advancing
	 * virtual_avail (note that there are no pages mapped at these VAs).
	 * we find the PTE that maps the allocated VA via the linear PTE
	 * mapping.
	 */

	pte = PTE_BASE + pl1_i(virtual_avail);

#ifdef MULTIPROCESSOR
	/*
	 * Waste some VA space to avoid false sharing of cache lines
	 * for page table pages: Give each possible CPU a cache line
	 * of PTE's (8) to play with, though we only need 4.  We could
	 * recycle some of this waste by putting the idle stacks here
	 * as well; we could waste less space if we knew the largest
	 * CPU ID beforehand.
	 */
	csrcp = (char *) virtual_avail;  csrc_pte = pte;

	cdstp = (char *) virtual_avail+PAGE_SIZE;  cdst_pte = pte+1;

	zerop = (char *) virtual_avail+PAGE_SIZE*2;  zero_pte = pte+2;

	ptpp = (char *) virtual_avail+PAGE_SIZE*3;  ptp_pte = pte+3;

	virtual_avail += PAGE_SIZE * maxcpus * NPTECL;
	pte += maxcpus * NPTECL;
#else
	csrcp = (void *) virtual_avail;  csrc_pte = pte;	/* allocate */
	virtual_avail += PAGE_SIZE; pte++;			/* advance */

	cdstp = (void *) virtual_avail;  cdst_pte = pte;
	virtual_avail += PAGE_SIZE; pte++;

	zerop = (void *) virtual_avail;  zero_pte = pte;
	virtual_avail += PAGE_SIZE; pte++;

	ptpp = (void *) virtual_avail;  ptp_pte = pte;
	virtual_avail += PAGE_SIZE; pte++;
#endif

	if (VM_MIN_KERNEL_ADDRESS == KERNBASE) {
		early_zerop = zerop;
		early_zero_pte = zero_pte;
	}

	/*
	 * Nothing after this point actually needs pte;
	 */
	pte = (void *)0xdeadbeef;

	/* XXX: vmmap used by mem.c... should be uvm_map_reserve */
	/* XXXfvdl PTEs not needed here */
	vmmap = (char *)virtual_avail;			/* don't need pte */
	virtual_avail += PAGE_SIZE; pte++;

#ifdef XEN
#ifdef __x86_64__
	/*
	 * We want a dummy page directory for Xen:
	 * when deactivating a pmap, Xen will still consider it active.
	 * So we set user PGD to this one to lift all protection on
	 * the now inactive page tables set.
	 */
	xen_dummy_user_pgd = avail_start;
	avail_start += PAGE_SIZE;

	/* Zero fill it, the fewer checks Xen has to make, the better */
	memset((void *) (xen_dummy_user_pgd + KERNBASE), 0, PAGE_SIZE);
	/* Mark read-only */
	HYPERVISOR_update_va_mapping(xen_dummy_user_pgd + KERNBASE,
	    pmap_pa2pte(xen_dummy_user_pgd) | PG_u | PG_V, UVMF_INVLPG);
	/* Pin as L4 */
	xpq_queue_pin_table(xpmap_ptom_masked(xen_dummy_user_pgd));
#endif /* __x86_64__ */
	idt_vaddr = virtual_avail;			/* don't need pte */
	idt_paddr = avail_start;			/* steal a page */
	/*
	 * Xen requires one more page as we can't store
	 * GDT and LDT on the same page
	 */
	virtual_avail += 3 * PAGE_SIZE;
	avail_start += 3 * PAGE_SIZE;
#else /* XEN */
	idt_vaddr = virtual_avail;			/* don't need pte */
	idt_paddr = avail_start;			/* steal a page */
#if defined(__x86_64__)
	virtual_avail += 2 * PAGE_SIZE; pte += 2;
	avail_start += 2 * PAGE_SIZE;
#else /* defined(__x86_64__) */
	virtual_avail += PAGE_SIZE; pte++;
	avail_start += PAGE_SIZE;
	/* pentium f00f bug stuff */
	pentium_idt_vaddr = virtual_avail;		/* don't need pte */
	virtual_avail += PAGE_SIZE; pte++;
#endif /* defined(__x86_64__) */
#endif /* XEN */

#ifdef _LP64
	/*
	 * Grab a page below 4G for things that need it (i.e.
	 * having an initial %cr3 for the MP trampoline).
	 */
	lo32_vaddr = virtual_avail;
	virtual_avail += PAGE_SIZE; pte++;
	lo32_paddr = avail_start;
	avail_start += PAGE_SIZE;
#endif

	/*
	 * now we reserve some VM for mapping pages when doing a crash dump
	 */

	virtual_avail = reserve_dumppages(virtual_avail);

	/*
	 * init the static-global locks and global lists.
	 *
	 * => pventry::pvh_lock (initialized elsewhere) must also be
	 *    a spin lock, again at IPL_VM to prevent deadlock, and
	 *    again is never taken from interrupt context.
	 */
	mutex_init(&pmaps_lock, MUTEX_DEFAULT, IPL_NONE);
	LIST_INIT(&pmaps);
	pmap_cpu_init_early(curcpu());

	/*
	 * initialize caches.
	 */

	pool_cache_bootstrap(&pmap_cache, sizeof(struct pmap), 0, 0, 0,
	    "pmappl", NULL, IPL_NONE, NULL, NULL, NULL);
#ifdef PAE
	pool_cache_bootstrap(&pmap_pdp_cache, PAGE_SIZE * PDP_SIZE, 0, 0, 0,
	    "pdppl", &pmap_pdp_allocator, IPL_NONE,
	    pmap_pdp_ctor, pmap_pdp_dtor, NULL);
#else /* PAE */
	pool_cache_bootstrap(&pmap_pdp_cache, PAGE_SIZE, 0, 0, 0,
	    "pdppl", NULL, IPL_NONE, pmap_pdp_ctor, pmap_pdp_dtor, NULL);
#endif /* PAE */
	pool_cache_bootstrap(&pmap_pv_cache, sizeof(struct pv_entry), 0, 0,
	    PR_LARGECACHE, "pvpl", &pool_allocator_meta, IPL_NONE, NULL,
	    NULL, NULL);

	/*
	 * ensure the TLB is sync'd with reality by flushing it...
	 */

	tlbflush();

	/*
	 * calculate pmap_maxkvaddr from nkptp[].
	 */

	kva = VM_MIN_KERNEL_ADDRESS;
	for (i = PTP_LEVELS - 1; i >= 1; i--) {
		kva += nkptp[i] * nbpd[i];
	}
	pmap_maxkvaddr = kva;
}
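/*
 * For illustration: with PTP_LEVELS == 4 (amd64) the loop at the end of
 * pmap_bootstrap() works out to
 *
 *	pmap_maxkvaddr = VM_MIN_KERNEL_ADDRESS +
 *	    nkptp[3] * nbpd[3] + nkptp[2] * nbpd[2] + nkptp[1] * nbpd[1];
 *
 * i.e. the amount of KVA currently backed by kernel PTPs at each level.
 */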
#if defined(__x86_64__)
/*
 * Pre-allocate PTPs for low memory, so that 1:1 mappings for various
 * trampoline code can be entered.
 */
void
pmap_prealloc_lowmem_ptps(void)
{
#ifdef XEN
	int level;
	paddr_t newp;
	paddr_t pdes_pa;

	pdes_pa = pmap_kernel()->pm_pdirpa;
	level = PTP_LEVELS;
	for (;;) {
		newp = avail_start;
		avail_start += PAGE_SIZE;
		HYPERVISOR_update_va_mapping ((vaddr_t)early_zerop,
		    xpmap_ptom_masked(newp) | PG_u | PG_V | PG_RW, UVMF_INVLPG);
		memset((void *)early_zerop, 0, PAGE_SIZE);
		/* Mark R/O before installing */
		HYPERVISOR_update_va_mapping ((vaddr_t)early_zerop,
		    xpmap_ptom_masked(newp) | PG_u | PG_V, UVMF_INVLPG);
		if (newp < (NKL2_KIMG_ENTRIES * NBPD_L2))
			HYPERVISOR_update_va_mapping (newp + KERNBASE,
			    xpmap_ptom_masked(newp) | PG_u | PG_V, UVMF_INVLPG);
		xpq_queue_pte_update (
		    xpmap_ptom_masked(pdes_pa)
		    + (pl_i(0, level) * sizeof (pd_entry_t)),
		    xpmap_ptom_masked(newp) | PG_RW | PG_u | PG_V);
		level--;
		if (level <= 1)
			break;
		pdes_pa = newp;
	}
#else /* XEN */
	pd_entry_t *pdes;
	int level;
	paddr_t newp;

	pdes = pmap_kernel()->pm_pdir;
	level = PTP_LEVELS;
	for (;;) {
		newp = avail_start;
		avail_start += PAGE_SIZE;
		*early_zero_pte = (newp & PG_FRAME) | PG_V | PG_RW;
		pmap_update_pg((vaddr_t)early_zerop);
		memset(early_zerop, 0, PAGE_SIZE);
		pdes[pl_i(0, level)] = (newp & PG_FRAME) | PG_V | PG_RW;
		level--;
		if (level <= 1)
			break;
		pdes = normal_pdes[level - 2];
	}
#endif /* XEN */
}
#endif /* defined(__x86_64__) */

/*
 * pmap_init: called from uvm_init, our job is to get the pmap
 * system ready to manage mappings...
 */

void
pmap_init(void)
{
	int i;

	for (i = 0; i < PV_HASH_SIZE; i++) {
		SLIST_INIT(&pv_hash_heads[i].hh_list);
	}
	for (i = 0; i < PV_HASH_LOCK_CNT; i++) {
		mutex_init(&pv_hash_locks[i].lock, MUTEX_NODEBUG, IPL_VM);
	}

	/*
	 * done: pmap module is up (and ready for business)
	 */

	pmap_initialized = true;
}

/*
 * pmap_cpu_init_early: perform early per-CPU initialization.
 */

void
pmap_cpu_init_early(struct cpu_info *ci)
{
	struct pmap_cpu *pc;
	static uint8_t pmap_cpu_alloc;

	pc = &pmap_cpu[pmap_cpu_alloc++].pc;
	ci->ci_pmap_cpu = pc;
}

/*
 * pmap_cpu_init_late: perform late per-CPU initialization.
 */

void
pmap_cpu_init_late(struct cpu_info *ci)
{

	if (ci == &cpu_info_primary) {
		evcnt_attach_dynamic(&pmap_tlb_evcnt, EVCNT_TYPE_INTR,
		    NULL, "global", "TLB IPI");
		evcnt_attach_dynamic(&pmap_iobmp_evcnt, EVCNT_TYPE_MISC,
		    NULL, "x86", "io bitmap copy");
		evcnt_attach_dynamic(&pmap_ldt_evcnt, EVCNT_TYPE_MISC,
		    NULL, "x86", "ldt sync");
	}

	evcnt_attach_dynamic(&ci->ci_tlb_evcnt, EVCNT_TYPE_MISC,
	    NULL, device_xname(ci->ci_dev), "TLB IPI");
}

/*
 * p v _ e n t r y   f u n c t i o n s
 */

/*
 * pmap_free_pvs: free a list of pv_entrys
 */

static void
pmap_free_pvs(struct pv_entry *pve)
{
	struct pv_entry *next;

	for ( /* null */ ; pve != NULL ; pve = next) {
		next = pve->pve_next;
		pool_cache_put(&pmap_pv_cache, pve);
	}
}

/*
 * main pv_entry manipulation functions:
 *   pmap_enter_pv: enter a mapping onto a pv_head list
 *   pmap_remove_pv: remove a mapping from a pv_head list
 *
 * NOTE: Both pmap_enter_pv and pmap_remove_pv expect the caller to lock
 *       the pvh before calling
 */

/*
 * insert_pv: a helper of pmap_enter_pv
 */

static void
insert_pv(struct pmap_page *pp, struct pv_entry *pve)
{
	struct pv_hash_head *hh;
	kmutex_t *lock;
	u_int hash;

	KASSERT(pp_locked(pp));

	hash = pvhash_hash(pve->pve_pte.pte_ptp, pve->pve_pte.pte_va);
	lock = pvhash_lock(hash);
	hh = pvhash_head(hash);
	mutex_spin_enter(lock);
	SLIST_INSERT_HEAD(&hh->hh_list, pve, pve_hash);
	mutex_spin_exit(lock);

	LIST_INSERT_HEAD(&pp->pp_head.pvh_list, pve, pve_list);
}

/*
 * pmap_enter_pv: enter a mapping onto a pv_head list
 *
 * => caller should have the pp_lock locked
 * => caller should adjust ptp's wire_count before calling
 */

static struct pv_entry *
pmap_enter_pv(struct pmap_page *pp,
	      struct pv_entry *pve,	/* preallocated pve for us to use */
	      struct pv_entry **sparepve,
	      struct vm_page *ptp,
	      vaddr_t va)
{

	KASSERT(ptp == NULL || ptp->wire_count >= 2);
	KASSERT(ptp == NULL || ptp->uobject != NULL);
	KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset);
	KASSERT(pp_locked(pp));

	if ((pp->pp_flags & PP_EMBEDDED) == 0) {
		if (LIST_EMPTY(&pp->pp_head.pvh_list)) {
			pp->pp_flags |= PP_EMBEDDED;
			pp->pp_pte.pte_ptp = ptp;
			pp->pp_pte.pte_va = va;

			return pve;
		}
	} else {
		struct pv_entry *pve2;

		pve2 = *sparepve;
		*sparepve = NULL;

		pve2->pve_pte = pp->pp_pte;
		pp->pp_flags &= ~PP_EMBEDDED;
		LIST_INIT(&pp->pp_head.pvh_list);
		insert_pv(pp, pve2);
	}

	pve->pve_pte.pte_ptp = ptp;
	pve->pve_pte.pte_va = va;
	insert_pv(pp, pve);

	return NULL;
}
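/*
 * Caller pattern (sketch; names are illustrative): the first mapping of
 * a page is stored directly in the pmap_page (PP_EMBEDDED), so callers
 * preallocate both a pve and a spare pve and give back whatever
 * pmap_enter_pv() returns unused, roughly:
 *
 *	pp_lock(pp);
 *	pve = pmap_enter_pv(pp, pve, &sparepve, ptp, va);
 *	pp_unlock(pp);
 *	if (pve != NULL)
 *		pool_cache_put(&pmap_pv_cache, pve);
 */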
/*
 * pmap_remove_pv: try to remove a mapping from a pv_list
 *
 * => caller should hold pp_lock [so that attrs can be adjusted]
 * => caller should adjust ptp's wire_count and free PTP if needed
 * => we return the removed pve
 */

static struct pv_entry *
pmap_remove_pv(struct pmap_page *pp, struct vm_page *ptp, vaddr_t va)
{
	struct pv_hash_head *hh;
	struct pv_entry *pve;
	kmutex_t *lock;
	u_int hash;

	KASSERT(ptp == NULL || ptp->uobject != NULL);
	KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset);
	KASSERT(pp_locked(pp));

	if ((pp->pp_flags & PP_EMBEDDED) != 0) {
		KASSERT(pp->pp_pte.pte_ptp == ptp);
		KASSERT(pp->pp_pte.pte_va == va);

		pp->pp_flags &= ~PP_EMBEDDED;
		LIST_INIT(&pp->pp_head.pvh_list);

		return NULL;
	}

	hash = pvhash_hash(ptp, va);
	lock = pvhash_lock(hash);
	hh = pvhash_head(hash);
	mutex_spin_enter(lock);
	pve = pvhash_remove(hh, ptp, va);
	mutex_spin_exit(lock);

	LIST_REMOVE(pve, pve_list);

	return pve;
}

/*
 * p t p   f u n c t i o n s
 */

static inline struct vm_page *
pmap_find_ptp(struct pmap *pmap, vaddr_t va, paddr_t pa, int level)
{
	int lidx = level - 1;
	struct vm_page *pg;

	KASSERT(mutex_owned(&pmap->pm_lock));

	if (pa != (paddr_t)-1 && pmap->pm_ptphint[lidx] &&
	    pa == VM_PAGE_TO_PHYS(pmap->pm_ptphint[lidx])) {
		return (pmap->pm_ptphint[lidx]);
	}
	PMAP_SUBOBJ_LOCK(pmap, lidx);
	pg = uvm_pagelookup(&pmap->pm_obj[lidx], ptp_va2o(va, level));
	PMAP_SUBOBJ_UNLOCK(pmap, lidx);

	KASSERT(pg == NULL || pg->wire_count >= 1);
	return pg;
}

static inline void
pmap_freepage(struct pmap *pmap, struct vm_page *ptp, int level)
{
	int lidx;
	struct uvm_object *obj;

	KASSERT(ptp->wire_count == 1);

	lidx = level - 1;

	obj = &pmap->pm_obj[lidx];
	pmap_stats_update(pmap, -1, 0);
	if (lidx != 0)
		mutex_enter(&obj->vmobjlock);
	if (pmap->pm_ptphint[lidx] == ptp)
		pmap->pm_ptphint[lidx] = TAILQ_FIRST(&obj->memq);
	ptp->wire_count = 0;
	uvm_pagerealloc(ptp, NULL, 0);
	VM_PAGE_TO_PP(ptp)->pp_link = curlwp->l_md.md_gc_ptp;
	curlwp->l_md.md_gc_ptp = ptp;
	if (lidx != 0)
		mutex_exit(&obj->vmobjlock);
}

static void
pmap_free_ptp(struct pmap *pmap, struct vm_page *ptp, vaddr_t va,
	      pt_entry_t *ptes, pd_entry_t * const *pdes)
{
	unsigned long index;
	int level;
	vaddr_t invaladdr;
#ifdef MULTIPROCESSOR
	vaddr_t invaladdr2;
#endif
	pd_entry_t opde;
	struct pmap *curpmap = vm_map_pmap(&curlwp->l_proc->p_vmspace->vm_map);

	KASSERT(pmap != pmap_kernel());
	KASSERT(mutex_owned(&pmap->pm_lock));
	KASSERT(kpreempt_disabled());

	level = 1;
	do {
		index = pl_i(va, level + 1);
		opde = pmap_pte_testset(&pdes[level - 1][index], 0);
#if defined(XEN) && defined(__x86_64__)
		/*
		 * If ptp is a L3 currently mapped in kernel space,
		 * clear it before freeing
		 */
		if (pmap->pm_pdirpa == xen_current_user_pgd
		    && level == PTP_LEVELS - 1)
			pmap_pte_set(&pmap_kernel()->pm_pdir[index], 0);
#endif /* XEN && __x86_64__ */
		pmap_freepage(pmap, ptp, level);
		invaladdr = level == 1 ? (vaddr_t)ptes :
		    (vaddr_t)pdes[level - 2];
		pmap_tlb_shootdown(curpmap, invaladdr + index * PAGE_SIZE,
		    0, opde);
#if defined(MULTIPROCESSOR)
		invaladdr2 = level == 1 ? (vaddr_t)PTE_BASE :
		    (vaddr_t)normal_pdes[level - 2];
		if (pmap != curpmap || invaladdr != invaladdr2) {
			pmap_tlb_shootdown(pmap, invaladdr2 + index * PAGE_SIZE,
			    0, opde);
		}
#endif
		if (level < PTP_LEVELS - 1) {
			ptp = pmap_find_ptp(pmap, va, (paddr_t)-1, level + 1);
			ptp->wire_count--;
			if (ptp->wire_count > 1)
				break;
		}
	} while (++level < PTP_LEVELS);
	pmap_pte_flush();
}
(vaddr_t)PTE_BASE : 1928 (vaddr_t)normal_pdes[level - 2]; 1929 if (pmap != curpmap || invaladdr != invaladdr2) { 1930 pmap_tlb_shootdown(pmap, invaladdr2 + index * PAGE_SIZE, 1931 0, opde); 1932 } 1933 #endif 1934 if (level < PTP_LEVELS - 1) { 1935 ptp = pmap_find_ptp(pmap, va, (paddr_t)-1, level + 1); 1936 ptp->wire_count--; 1937 if (ptp->wire_count > 1) 1938 break; 1939 } 1940 } while (++level < PTP_LEVELS); 1941 pmap_pte_flush(); 1942 } 1943 1944 /* 1945 * pmap_get_ptp: get a PTP (if there isn't one, allocate a new one) 1946 * 1947 * => pmap should NOT be pmap_kernel() 1948 * => pmap should be locked 1949 * => preemption should be disabled 1950 */ 1951 1952 static struct vm_page * 1953 pmap_get_ptp(struct pmap *pmap, vaddr_t va, pd_entry_t * const *pdes) 1954 { 1955 struct vm_page *ptp, *pptp; 1956 int i; 1957 unsigned long index; 1958 pd_entry_t *pva; 1959 paddr_t ppa, pa; 1960 struct uvm_object *obj; 1961 1962 KASSERT(pmap != pmap_kernel()); 1963 KASSERT(mutex_owned(&pmap->pm_lock)); 1964 KASSERT(kpreempt_disabled()); 1965 1966 ptp = NULL; 1967 pa = (paddr_t)-1; 1968 1969 /* 1970 * Loop through all page table levels seeing if we need to 1971 * add a new page to that level. 1972 */ 1973 for (i = PTP_LEVELS; i > 1; i--) { 1974 /* 1975 * Save values from previous round. 1976 */ 1977 pptp = ptp; 1978 ppa = pa; 1979 1980 index = pl_i(va, i); 1981 pva = pdes[i - 2]; 1982 1983 if (pmap_valid_entry(pva[index])) { 1984 ppa = pmap_pte2pa(pva[index]); 1985 ptp = NULL; 1986 continue; 1987 } 1988 1989 obj = &pmap->pm_obj[i-2]; 1990 PMAP_SUBOBJ_LOCK(pmap, i - 2); 1991 ptp = uvm_pagealloc(obj, ptp_va2o(va, i - 1), NULL, 1992 UVM_PGA_USERESERVE|UVM_PGA_ZERO); 1993 PMAP_SUBOBJ_UNLOCK(pmap, i - 2); 1994 1995 if (ptp == NULL) 1996 return NULL; 1997 1998 ptp->flags &= ~PG_BUSY; /* never busy */ 1999 ptp->wire_count = 1; 2000 pmap->pm_ptphint[i - 2] = ptp; 2001 pa = VM_PAGE_TO_PHYS(ptp); 2002 pmap_pte_set(&pva[index], (pd_entry_t) 2003 (pmap_pa2pte(pa) | PG_u | PG_RW | PG_V)); 2004 #if defined(XEN) && defined(__x86_64__) 2005 /* 2006 * In Xen we must enter the mapping in kernel map too 2007 * if pmap is curmap and modifying top level (PGD) 2008 */ 2009 if(i == PTP_LEVELS && pmap != pmap_kernel()) { 2010 pmap_pte_set(&pmap_kernel()->pm_pdir[index], 2011 (pd_entry_t) (pmap_pa2pte(pa) 2012 | PG_u | PG_RW | PG_V)); 2013 } 2014 #endif /* XEN && __x86_64__ */ 2015 pmap_pte_flush(); 2016 pmap_stats_update(pmap, 1, 0); 2017 /* 2018 * If we're not in the top level, increase the 2019 * wire count of the parent page. 2020 */ 2021 if (i < PTP_LEVELS) { 2022 if (pptp == NULL) 2023 pptp = pmap_find_ptp(pmap, va, ppa, i); 2024 #ifdef DIAGNOSTIC 2025 if (pptp == NULL) 2026 panic("pde page disappeared"); 2027 #endif 2028 pptp->wire_count++; 2029 } 2030 } 2031 2032 /* 2033 * ptp is not NULL if we just allocated a new ptp. If it's 2034 * still NULL, we must look up the existing one. 2035 */ 2036 if (ptp == NULL) { 2037 ptp = pmap_find_ptp(pmap, va, ppa, 1); 2038 #ifdef DIAGNOSTIC 2039 if (ptp == NULL) { 2040 printf("va %" PRIxVADDR " ppa %" PRIxPADDR "\n", 2041 va, ppa); 2042 panic("pmap_get_ptp: unmanaged user PTP"); 2043 } 2044 #endif 2045 } 2046 2047 pmap->pm_ptphint[0] = ptp; 2048 return(ptp); 2049 } 2050 2051 /* 2052 * p m a p l i f e c y c l e f u n c t i o n s 2053 */ 2054 2055 /* 2056 * pmap_pdp_ctor: constructor for the PDP cache. 
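 *
 * The cache hands out PDPs that are ready to install: the user area is
 * zeroed, and on native and Xen/i386 the recursive PDIR_SLOT_PTE
 * entries are set up and the kernel PDEs are copied in from PDP_BASE.
 * On Xen the pages are additionally remapped read-only and pinned, and
 * on Xen/amd64 neither the recursive entry nor the kernel slots are
 * made valid, since such a PDP is never active in kernel mode.
 * pmap_create() can therefore take one straight from the cache:
 *
 *	pmap->pm_pdir = pool_cache_get(&pmap_pdp_cache, PR_WAITOK);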
2057 */ 2058 2059 int 2060 pmap_pdp_ctor(void *arg, void *v, int flags) 2061 { 2062 pd_entry_t *pdir = v; 2063 paddr_t pdirpa = 0; /* XXX: GCC */ 2064 vaddr_t object; 2065 int i; 2066 2067 #if !defined(XEN) || !defined(__x86_64__) 2068 int npde; 2069 #endif 2070 #ifdef XEN 2071 int s; 2072 #endif 2073 2074 /* 2075 * NOTE: The `pmap_lock' is held when the PDP is allocated. 2076 */ 2077 2078 #if defined(XEN) && defined(__x86_64__) 2079 /* fetch the physical address of the page directory. */ 2080 (void) pmap_extract(pmap_kernel(), (vaddr_t) pdir, &pdirpa); 2081 2082 /* zero init area */ 2083 memset (pdir, 0, PAGE_SIZE); /* Xen wants a clean page */ 2084 /* 2085 * this pdir will NEVER be active in kernel mode 2086 * so mark recursive entry invalid 2087 */ 2088 pdir[PDIR_SLOT_PTE] = pmap_pa2pte(pdirpa) | PG_u; 2089 /* 2090 * PDP constructed this way won't be for kernel, 2091 * hence we don't put kernel mappings on Xen. 2092 * But we need to make pmap_create() happy, so put a dummy (without 2093 * PG_V) value at the right place. 2094 */ 2095 pdir[PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1] - 1] = 2096 (pd_entry_t)-1 & PG_FRAME; 2097 #else /* XEN && __x86_64__*/ 2098 /* zero init area */ 2099 memset(pdir, 0, PDIR_SLOT_PTE * sizeof(pd_entry_t)); 2100 2101 object = (vaddr_t)v; 2102 for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) { 2103 /* fetch the physical address of the page directory. */ 2104 (void) pmap_extract(pmap_kernel(), object, &pdirpa); 2105 /* put in recursive PDE to map the PTEs */ 2106 pdir[PDIR_SLOT_PTE + i] = pmap_pa2pte(pdirpa) | PG_V; 2107 #ifndef XEN 2108 pdir[PDIR_SLOT_PTE + i] |= PG_KW; 2109 #endif 2110 } 2111 2112 /* copy kernel's PDE */ 2113 npde = nkptp[PTP_LEVELS - 1]; 2114 2115 memcpy(&pdir[PDIR_SLOT_KERN], &PDP_BASE[PDIR_SLOT_KERN], 2116 npde * sizeof(pd_entry_t)); 2117 2118 /* zero the rest */ 2119 memset(&pdir[PDIR_SLOT_KERN + npde], 0, 2120 (NTOPLEVEL_PDES - (PDIR_SLOT_KERN + npde)) * sizeof(pd_entry_t)); 2121 2122 if (VM_MIN_KERNEL_ADDRESS != KERNBASE) { 2123 int idx = pl_i(KERNBASE, PTP_LEVELS); 2124 2125 pdir[idx] = PDP_BASE[idx]; 2126 } 2127 #endif /* XEN && __x86_64__*/ 2128 #ifdef XEN 2129 s = splvm(); 2130 object = (vaddr_t)v; 2131 for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) { 2132 (void) pmap_extract(pmap_kernel(), object, &pdirpa); 2133 /* remap this page RO */ 2134 pmap_kenter_pa(object, pdirpa, VM_PROT_READ, 0); 2135 pmap_update(pmap_kernel()); 2136 /* 2137 * pin as L2/L4 page, we have to do the page with the 2138 * PDIR_SLOT_PTE entries last 2139 */ 2140 #ifdef PAE 2141 if (i == l2tol3(PDIR_SLOT_PTE)) 2142 continue; 2143 #endif 2144 xpq_queue_pin_table(xpmap_ptom_masked(pdirpa)); 2145 } 2146 #ifdef PAE 2147 object = ((vaddr_t)pdir) + PAGE_SIZE * l2tol3(PDIR_SLOT_PTE); 2148 (void)pmap_extract(pmap_kernel(), object, &pdirpa); 2149 xpq_queue_pin_table(xpmap_ptom_masked(pdirpa)); 2150 #endif 2151 splx(s); 2152 #endif /* XEN */ 2153 2154 return (0); 2155 } 2156 2157 /* 2158 * pmap_pdp_dtor: destructor for the PDP cache. 2159 */ 2160 2161 void 2162 pmap_pdp_dtor(void *arg, void *v) 2163 { 2164 #ifdef XEN 2165 paddr_t pdirpa = 0; /* XXX: GCC */ 2166 vaddr_t object = (vaddr_t)v; 2167 int i; 2168 int s = splvm(); 2169 pt_entry_t *pte; 2170 2171 for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) { 2172 /* fetch the physical address of the page directory. 
*/ 2173 (void) pmap_extract(pmap_kernel(), object, &pdirpa); 2174 /* unpin page table */ 2175 xpq_queue_unpin_table(xpmap_ptom_masked(pdirpa)); 2176 } 2177 object = (vaddr_t)v; 2178 for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) { 2179 /* Set page RW again */ 2180 pte = kvtopte(object); 2181 xpq_queue_pte_update(xpmap_ptetomach(pte), *pte | PG_RW); 2182 xpq_queue_invlpg((vaddr_t)object); 2183 } 2184 splx(s); 2185 #endif /* XEN */ 2186 } 2187 2188 #ifdef PAE 2189 2190 /* pmap_pdp_alloc: Allocate a page for the pdp memory pool. */ 2191 2192 void * 2193 pmap_pdp_alloc(struct pool *pp, int flags) 2194 { 2195 return (void *)uvm_km_alloc(kernel_map, 2196 PAGE_SIZE * PDP_SIZE, PAGE_SIZE * PDP_SIZE, 2197 ((flags & PR_WAITOK) ? 0 : UVM_KMF_NOWAIT | UVM_KMF_TRYLOCK) 2198 | UVM_KMF_WIRED); 2199 } 2200 2201 /* 2202 * pmap_pdp_free: free a PDP 2203 */ 2204 2205 void 2206 pmap_pdp_free(struct pool *pp, void *v) 2207 { 2208 uvm_km_free(kernel_map, (vaddr_t)v, PAGE_SIZE * PDP_SIZE, 2209 UVM_KMF_WIRED); 2210 } 2211 #endif /* PAE */ 2212 2213 /* 2214 * pmap_create: create a pmap 2215 * 2216 * => note: old pmap interface took a "size" args which allowed for 2217 * the creation of "software only" pmaps (not in bsd). 2218 */ 2219 2220 struct pmap * 2221 pmap_create(void) 2222 { 2223 struct pmap *pmap; 2224 int i; 2225 2226 pmap = pool_cache_get(&pmap_cache, PR_WAITOK); 2227 2228 /* init uvm_object */ 2229 for (i = 0; i < PTP_LEVELS - 1; i++) { 2230 UVM_OBJ_INIT(&pmap->pm_obj[i], NULL, 1); 2231 pmap->pm_ptphint[i] = NULL; 2232 } 2233 pmap->pm_stats.wired_count = 0; 2234 /* count the PDP allocd below */ 2235 pmap->pm_stats.resident_count = PDP_SIZE; 2236 #if !defined(__x86_64__) 2237 pmap->pm_hiexec = 0; 2238 #endif /* !defined(__x86_64__) */ 2239 pmap->pm_flags = 0; 2240 pmap->pm_cpus = 0; 2241 pmap->pm_kernel_cpus = 0; 2242 2243 /* init the LDT */ 2244 pmap->pm_ldt = NULL; 2245 pmap->pm_ldt_len = 0; 2246 pmap->pm_ldt_sel = GSYSSEL(GLDT_SEL, SEL_KPL); 2247 2248 /* allocate PDP */ 2249 try_again: 2250 pmap->pm_pdir = pool_cache_get(&pmap_pdp_cache, PR_WAITOK); 2251 2252 mutex_enter(&pmaps_lock); 2253 2254 if (pmap->pm_pdir[PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1] - 1] == 0) { 2255 mutex_exit(&pmaps_lock); 2256 pool_cache_destruct_object(&pmap_pdp_cache, pmap->pm_pdir); 2257 goto try_again; 2258 } 2259 2260 #ifdef PAE 2261 for (i = 0; i < PDP_SIZE; i++) 2262 pmap->pm_pdirpa[i] = 2263 pmap_pte2pa(pmap->pm_pdir[PDIR_SLOT_PTE + i]); 2264 #else 2265 pmap->pm_pdirpa = pmap_pte2pa(pmap->pm_pdir[PDIR_SLOT_PTE]); 2266 #endif 2267 2268 LIST_INSERT_HEAD(&pmaps, pmap, pm_list); 2269 2270 mutex_exit(&pmaps_lock); 2271 2272 return (pmap); 2273 } 2274 2275 /* 2276 * pmap_destroy: drop reference count on pmap. free pmap if 2277 * reference count goes to zero. 2278 */ 2279 2280 void 2281 pmap_destroy(struct pmap *pmap) 2282 { 2283 int i; 2284 #ifdef DIAGNOSTIC 2285 struct cpu_info *ci; 2286 CPU_INFO_ITERATOR cii; 2287 #endif /* DIAGNOSTIC */ 2288 2289 /* 2290 * if we have torn down this pmap, process deferred frees and 2291 * invalidations now. 2292 */ 2293 if (__predict_false(curlwp->l_md.md_gc_pmap == pmap)) { 2294 pmap_update(pmap); 2295 } 2296 2297 /* 2298 * drop reference count 2299 */ 2300 2301 if (atomic_dec_uint_nv(&pmap->pm_obj[0].uo_refs) > 0) { 2302 return; 2303 } 2304 2305 #ifdef DIAGNOSTIC 2306 for (CPU_INFO_FOREACH(cii, ci)) 2307 if (ci->ci_pmap == pmap) 2308 panic("destroying pmap being used"); 2309 #endif /* DIAGNOSTIC */ 2310 2311 /* 2312 * reference count is zero, free pmap resources and then free pmap. 
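 * in order: tear down the Xen lazy APDP mapping if it still points at
 * this PDP, unlink the pmap from the global pmaps list, check that no
 * PTPs remain at any level, return the PDP to the PDP cache, release
 * any private LDT, and finally hand the structure back to pmap_cache.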
2313 */ 2314 #ifdef XEN 2315 /* 2316 * Xen lazy APDP handling: 2317 * clear APDP_PDE if pmap is the currently mapped 2318 */ 2319 if (xpmap_ptom_masked(pmap_pdirpa(pmap, 0)) == (*APDP_PDE & PG_FRAME)) { 2320 kpreempt_disable(); 2321 pmap_unmap_apdp(); 2322 pmap_pte_flush(); 2323 pmap_apte_flush(pmap_kernel()); 2324 kpreempt_enable(); 2325 } 2326 #endif 2327 2328 /* 2329 * remove it from global list of pmaps 2330 */ 2331 2332 mutex_enter(&pmaps_lock); 2333 LIST_REMOVE(pmap, pm_list); 2334 mutex_exit(&pmaps_lock); 2335 2336 /* 2337 * destroyed pmap shouldn't have remaining PTPs 2338 */ 2339 2340 for (i = 0; i < PTP_LEVELS - 1; i++) { 2341 KASSERT(pmap->pm_obj[i].uo_npages == 0); 2342 KASSERT(TAILQ_EMPTY(&pmap->pm_obj[i].memq)); 2343 } 2344 2345 /* 2346 * MULTIPROCESSOR -- no need to flush out of other processors' 2347 * APTE space because we do that in pmap_unmap_ptes(). 2348 */ 2349 pool_cache_put(&pmap_pdp_cache, pmap->pm_pdir); 2350 2351 #ifdef USER_LDT 2352 if (pmap->pm_ldt != NULL) { 2353 /* 2354 * no need to switch the LDT; this address space is gone, 2355 * nothing is using it. 2356 * 2357 * No need to lock the pmap for ldt_free (or anything else), 2358 * we're the last one to use it. 2359 */ 2360 mutex_enter(&cpu_lock); 2361 ldt_free(pmap->pm_ldt_sel); 2362 mutex_exit(&cpu_lock); 2363 uvm_km_free(kernel_map, (vaddr_t)pmap->pm_ldt, 2364 pmap->pm_ldt_len, UVM_KMF_WIRED); 2365 } 2366 #endif 2367 2368 for (i = 0; i < PTP_LEVELS - 1; i++) 2369 mutex_destroy(&pmap->pm_obj[i].vmobjlock); 2370 pool_cache_put(&pmap_cache, pmap); 2371 } 2372 2373 /* 2374 * pmap_remove_all: pmap is being torn down by the current thread. 2375 * avoid unnecessary invalidations. 2376 */ 2377 2378 void 2379 pmap_remove_all(struct pmap *pmap) 2380 { 2381 lwp_t *l = curlwp; 2382 2383 KASSERT(l->l_md.md_gc_pmap == NULL); 2384 2385 l->l_md.md_gc_pmap = pmap; 2386 } 2387 2388 #if defined(PMAP_FORK) 2389 /* 2390 * pmap_fork: perform any necessary data structure manipulation when 2391 * a VM space is forked. 2392 */ 2393 2394 void 2395 pmap_fork(struct pmap *pmap1, struct pmap *pmap2) 2396 { 2397 #ifdef USER_LDT 2398 union descriptor *new_ldt; 2399 size_t len; 2400 int sel; 2401 2402 if (__predict_true(pmap1->pm_ldt == NULL)) { 2403 return; 2404 } 2405 2406 retry: 2407 if (pmap1->pm_ldt != NULL) { 2408 len = pmap1->pm_ldt_len; 2409 new_ldt = (union descriptor *)uvm_km_alloc(kernel_map, len, 0, 2410 UVM_KMF_WIRED); 2411 mutex_enter(&cpu_lock); 2412 sel = ldt_alloc(new_ldt, len); 2413 if (sel == -1) { 2414 mutex_exit(&cpu_lock); 2415 uvm_km_free(kernel_map, (vaddr_t)new_ldt, len, 2416 UVM_KMF_WIRED); 2417 printf("WARNING: pmap_fork: unable to allocate LDT\n"); 2418 return; 2419 } 2420 } else { 2421 len = -1; 2422 new_ldt = NULL; 2423 sel = -1; 2424 mutex_enter(&cpu_lock); 2425 } 2426 2427 /* Copy the LDT, if necessary. 
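   The backing memory for the new LDT was allocated before cpu_lock
   was taken, so the parent's LDT may have changed size in the
   meantime; if the length recorded then no longer matches, the new
   selector and memory are released and the whole sequence is retried.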
*/ 2428 if (pmap1->pm_ldt != NULL) { 2429 if (len != pmap1->pm_ldt_len) { 2430 if (len != -1) { 2431 ldt_free(sel); 2432 uvm_km_free(kernel_map, (vaddr_t)new_ldt, 2433 len, UVM_KMF_WIRED); 2434 } 2435 mutex_exit(&cpu_lock); 2436 goto retry; 2437 } 2438 2439 memcpy(new_ldt, pmap1->pm_ldt, len); 2440 pmap2->pm_ldt = new_ldt; 2441 pmap2->pm_ldt_len = pmap1->pm_ldt_len; 2442 pmap2->pm_ldt_sel = sel; 2443 len = -1; 2444 } 2445 2446 if (len != -1) { 2447 ldt_free(sel); 2448 uvm_km_free(kernel_map, (vaddr_t)new_ldt, len, 2449 UVM_KMF_WIRED); 2450 } 2451 mutex_exit(&cpu_lock); 2452 #endif /* USER_LDT */ 2453 } 2454 #endif /* PMAP_FORK */ 2455 2456 #ifdef USER_LDT 2457 2458 /* 2459 * pmap_ldt_xcall: cross call used by pmap_ldt_sync. if the named pmap 2460 * is active, reload LDTR. 2461 */ 2462 static void 2463 pmap_ldt_xcall(void *arg1, void *arg2) 2464 { 2465 struct pmap *pm; 2466 2467 kpreempt_disable(); 2468 pm = arg1; 2469 if (curcpu()->ci_pmap == pm) { 2470 lldt(pm->pm_ldt_sel); 2471 } 2472 kpreempt_enable(); 2473 } 2474 2475 /* 2476 * pmap_ldt_sync: LDT selector for the named pmap is changing. swap 2477 * in the new selector on all CPUs. 2478 */ 2479 void 2480 pmap_ldt_sync(struct pmap *pm) 2481 { 2482 uint64_t where; 2483 2484 KASSERT(mutex_owned(&cpu_lock)); 2485 2486 pmap_ldt_evcnt.ev_count++; 2487 where = xc_broadcast(0, pmap_ldt_xcall, pm, NULL); 2488 xc_wait(where); 2489 } 2490 2491 /* 2492 * pmap_ldt_cleanup: if the pmap has a local LDT, deallocate it, and 2493 * restore the default. 2494 */ 2495 2496 void 2497 pmap_ldt_cleanup(struct lwp *l) 2498 { 2499 pmap_t pmap = l->l_proc->p_vmspace->vm_map.pmap; 2500 union descriptor *dp = NULL; 2501 size_t len = 0; 2502 int sel = -1; 2503 2504 if (__predict_true(pmap->pm_ldt == NULL)) { 2505 return; 2506 } 2507 2508 mutex_enter(&cpu_lock); 2509 if (pmap->pm_ldt != NULL) { 2510 sel = pmap->pm_ldt_sel; 2511 dp = pmap->pm_ldt; 2512 len = pmap->pm_ldt_len; 2513 pmap->pm_ldt_sel = GSYSSEL(GLDT_SEL, SEL_KPL); 2514 pmap->pm_ldt = NULL; 2515 pmap->pm_ldt_len = 0; 2516 pmap_ldt_sync(pmap); 2517 ldt_free(sel); 2518 uvm_km_free(kernel_map, (vaddr_t)dp, len, UVM_KMF_WIRED); 2519 } 2520 mutex_exit(&cpu_lock); 2521 } 2522 #endif /* USER_LDT */ 2523 2524 /* 2525 * pmap_activate: activate a process' pmap 2526 * 2527 * => must be called with kernel preemption disabled 2528 * => if lwp is the curlwp, then set ci_want_pmapload so that 2529 * actual MMU context switch will be done by pmap_load() later 2530 */ 2531 2532 void 2533 pmap_activate(struct lwp *l) 2534 { 2535 struct cpu_info *ci; 2536 struct pmap *pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map); 2537 2538 KASSERT(kpreempt_disabled()); 2539 2540 ci = curcpu(); 2541 2542 if (l == ci->ci_curlwp) { 2543 struct pcb *pcb; 2544 2545 KASSERT(ci->ci_want_pmapload == 0); 2546 KASSERT(ci->ci_tlbstate != TLBSTATE_VALID); 2547 #ifdef KSTACK_CHECK_DR0 2548 /* 2549 * setup breakpoint on the top of stack 2550 */ 2551 if (l == &lwp0) 2552 dr0(0, 0, 0, 0); 2553 else 2554 dr0(KSTACK_LOWEST_ADDR(l), 1, 3, 1); 2555 #endif 2556 2557 /* 2558 * no need to switch to kernel vmspace because 2559 * it's a subset of any vmspace. 
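 * so when the new pmap is pmap_kernel() we only clear
 * ci_want_pmapload and return; the real %cr3/LDT switch for a user
 * pmap is deferred until pmap_load() runs.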
2560 */ 2561 2562 if (pmap == pmap_kernel()) { 2563 ci->ci_want_pmapload = 0; 2564 return; 2565 } 2566 2567 pcb = lwp_getpcb(l); 2568 ci->ci_want_pmapload = 1; 2569 2570 #if defined(__x86_64__) 2571 if (pcb->pcb_flags & PCB_GS64) 2572 wrmsr(MSR_KERNELGSBASE, pcb->pcb_gs); 2573 if (pcb->pcb_flags & PCB_FS64) 2574 wrmsr(MSR_FSBASE, pcb->pcb_fs); 2575 #endif /* defined(__x86_64__) */ 2576 } 2577 } 2578 2579 /* 2580 * pmap_reactivate: try to regain reference to the pmap. 2581 * 2582 * => must be called with kernel preemption disabled 2583 */ 2584 2585 static bool 2586 pmap_reactivate(struct pmap *pmap) 2587 { 2588 struct cpu_info *ci; 2589 uint32_t cpumask; 2590 bool result; 2591 uint32_t oldcpus; 2592 2593 ci = curcpu(); 2594 cpumask = ci->ci_cpumask; 2595 2596 KASSERT(kpreempt_disabled()); 2597 #if defined(XEN) && defined(__x86_64__) 2598 KASSERT(pmap->pm_pdirpa == xen_current_user_pgd); 2599 #elif defined(PAE) 2600 KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(pmap_l3pd[0])); 2601 #elif !defined(XEN) 2602 KASSERT(pmap->pm_pdirpa == pmap_pte2pa(rcr3())); 2603 #endif 2604 2605 /* 2606 * if we still have a lazy reference to this pmap, 2607 * we can assume that there was no tlb shootdown 2608 * for this pmap in the meantime. 2609 * 2610 * the order of events here is important as we must 2611 * synchronize with TLB shootdown interrupts. declare 2612 * interest in invalidations (TLBSTATE_VALID) and then 2613 * check the cpumask, which the IPIs can change only 2614 * when the state is TLBSTATE_LAZY. 2615 */ 2616 2617 ci->ci_tlbstate = TLBSTATE_VALID; 2618 oldcpus = pmap->pm_cpus; 2619 KASSERT((pmap->pm_kernel_cpus & cpumask) != 0); 2620 if (oldcpus & cpumask) { 2621 /* got it */ 2622 result = true; 2623 } else { 2624 /* must reload */ 2625 atomic_or_32(&pmap->pm_cpus, cpumask); 2626 result = false; 2627 } 2628 2629 return result; 2630 } 2631 2632 /* 2633 * pmap_load: actually switch pmap. (fill in %cr3 and LDT info) 2634 */ 2635 2636 void 2637 pmap_load(void) 2638 { 2639 struct cpu_info *ci; 2640 uint32_t cpumask; 2641 struct pmap *pmap; 2642 struct pmap *oldpmap; 2643 struct lwp *l; 2644 struct pcb *pcb; 2645 uint64_t ncsw; 2646 2647 kpreempt_disable(); 2648 retry: 2649 ci = curcpu(); 2650 if (!ci->ci_want_pmapload) { 2651 kpreempt_enable(); 2652 return; 2653 } 2654 cpumask = ci->ci_cpumask; 2655 l = ci->ci_curlwp; 2656 ncsw = l->l_ncsw; 2657 2658 /* should be able to take ipis. */ 2659 KASSERT(ci->ci_ilevel < IPL_HIGH); 2660 #ifdef XEN 2661 /* XXX not yet KASSERT(x86_read_psl() != 0); */ 2662 #else 2663 KASSERT((x86_read_psl() & PSL_I) != 0); 2664 #endif 2665 2666 KASSERT(l != NULL); 2667 pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map); 2668 KASSERT(pmap != pmap_kernel()); 2669 oldpmap = ci->ci_pmap; 2670 pcb = lwp_getpcb(l); 2671 2672 if (pmap == oldpmap) { 2673 if (!pmap_reactivate(pmap)) { 2674 u_int gen = uvm_emap_gen_return(); 2675 2676 /* 2677 * pmap has been changed during deactivated. 2678 * our tlb may be stale. 2679 */ 2680 2681 tlbflush(); 2682 uvm_emap_update(gen); 2683 } 2684 2685 ci->ci_want_pmapload = 0; 2686 kpreempt_enable(); 2687 return; 2688 } 2689 2690 /* 2691 * grab a reference to the new pmap. 2692 */ 2693 2694 pmap_reference(pmap); 2695 2696 /* 2697 * actually switch pmap. 
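 * the ordering matters: deregister this CPU from the old pmap first,
 * then set TLBSTATE_VALID and register on the new pmap before loading
 * %cr3 and the LDT, so that no shootdown aimed at the new page tables
 * can be missed.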
2698 */ 2699 2700 atomic_and_32(&oldpmap->pm_cpus, ~cpumask); 2701 atomic_and_32(&oldpmap->pm_kernel_cpus, ~cpumask); 2702 2703 #if defined(XEN) && defined(__x86_64__) 2704 KASSERT(oldpmap->pm_pdirpa == xen_current_user_pgd || 2705 oldpmap == pmap_kernel()); 2706 #elif defined(PAE) 2707 KASSERT(pmap_pdirpa(oldpmap, 0) == pmap_pte2pa(pmap_l3pd[0])); 2708 #elif !defined(XEN) 2709 KASSERT(oldpmap->pm_pdirpa == pmap_pte2pa(rcr3())); 2710 #endif 2711 KASSERT((pmap->pm_cpus & cpumask) == 0); 2712 KASSERT((pmap->pm_kernel_cpus & cpumask) == 0); 2713 2714 /* 2715 * mark the pmap in use by this processor. again we must 2716 * synchronize with TLB shootdown interrupts, so set the 2717 * state VALID first, then register us for shootdown events 2718 * on this pmap. 2719 */ 2720 2721 ci->ci_tlbstate = TLBSTATE_VALID; 2722 atomic_or_32(&pmap->pm_cpus, cpumask); 2723 atomic_or_32(&pmap->pm_kernel_cpus, cpumask); 2724 ci->ci_pmap = pmap; 2725 2726 /* 2727 * update tss. now that we have registered for invalidations 2728 * from other CPUs, we're good to load the page tables. 2729 */ 2730 #ifdef PAE 2731 pcb->pcb_cr3 = pmap_l3paddr; 2732 #else 2733 pcb->pcb_cr3 = pmap->pm_pdirpa; 2734 #endif 2735 #if defined(XEN) && defined(__x86_64__) 2736 /* kernel pmap always in cr3 and should never go in user cr3 */ 2737 if (pmap_pdirpa(pmap, 0) != pmap_pdirpa(pmap_kernel(), 0)) { 2738 /* 2739 * Map user space address in kernel space and load 2740 * user cr3 2741 */ 2742 int i, s; 2743 pd_entry_t *old_pgd, *new_pgd; 2744 paddr_t addr; 2745 s = splvm(); 2746 new_pgd = pmap->pm_pdir; 2747 old_pgd = pmap_kernel()->pm_pdir; 2748 addr = xpmap_ptom(pmap_pdirpa(pmap_kernel(), 0)); 2749 for (i = 0; i < PDIR_SLOT_PTE; 2750 i++, addr += sizeof(pd_entry_t)) { 2751 if ((new_pgd[i] & PG_V) || (old_pgd[i] & PG_V)) 2752 xpq_queue_pte_update(addr, new_pgd[i]); 2753 } 2754 tlbflush(); 2755 xen_set_user_pgd(pmap_pdirpa(pmap, 0)); 2756 xen_current_user_pgd = pmap_pdirpa(pmap, 0); 2757 splx(s); 2758 } 2759 #else /* XEN && x86_64 */ 2760 #if defined(XEN) 2761 /* 2762 * clear APDP slot, in case it points to a page table that has 2763 * been freed 2764 */ 2765 if (*APDP_PDE) { 2766 pmap_unmap_apdp(); 2767 } 2768 /* lldt() does pmap_pte_flush() */ 2769 #else /* XEN */ 2770 #if defined(i386) 2771 ci->ci_tss.tss_ldt = pmap->pm_ldt_sel; 2772 ci->ci_tss.tss_cr3 = pcb->pcb_cr3; 2773 #endif 2774 #endif /* XEN */ 2775 lldt(pmap->pm_ldt_sel); 2776 #ifdef PAE 2777 { 2778 paddr_t l3_pd = xpmap_ptom_masked(pmap_l3paddr); 2779 int i; 2780 int s = splvm(); 2781 /* don't update the kernel L3 slot */ 2782 for (i = 0 ; i < PDP_SIZE - 1; i++, l3_pd += sizeof(pd_entry_t)) { 2783 xpq_queue_pte_update(l3_pd, 2784 xpmap_ptom(pmap->pm_pdirpa[i]) | PG_V); 2785 } 2786 tlbflush(); 2787 splx(s); 2788 } 2789 #else /* PAE */ 2790 { 2791 u_int gen = uvm_emap_gen_return(); 2792 lcr3(pcb->pcb_cr3); 2793 uvm_emap_update(gen); 2794 } 2795 #endif /* PAE */ 2796 #endif /* XEN && x86_64 */ 2797 2798 ci->ci_want_pmapload = 0; 2799 2800 /* 2801 * we're now running with the new pmap. drop the reference 2802 * to the old pmap. if we block, we need to go around again. 
2803 */ 2804 2805 pmap_destroy(oldpmap); 2806 if (l->l_ncsw != ncsw) { 2807 goto retry; 2808 } 2809 2810 kpreempt_enable(); 2811 } 2812 2813 /* 2814 * pmap_deactivate: deactivate a process' pmap 2815 * 2816 * => must be called with kernel preemption disabled (high SPL is enough) 2817 */ 2818 2819 void 2820 pmap_deactivate(struct lwp *l) 2821 { 2822 struct pmap *pmap; 2823 struct cpu_info *ci; 2824 2825 KASSERT(kpreempt_disabled()); 2826 2827 if (l != curlwp) { 2828 return; 2829 } 2830 2831 /* 2832 * wait for pending TLB shootdowns to complete. necessary 2833 * because TLB shootdown state is per-CPU, and the LWP may 2834 * be coming off the CPU before it has a chance to call 2835 * pmap_update(). 2836 */ 2837 pmap_tlb_shootwait(); 2838 2839 ci = curcpu(); 2840 2841 if (ci->ci_want_pmapload) { 2842 KASSERT(vm_map_pmap(&l->l_proc->p_vmspace->vm_map) 2843 != pmap_kernel()); 2844 KASSERT(vm_map_pmap(&l->l_proc->p_vmspace->vm_map) 2845 != ci->ci_pmap || ci->ci_tlbstate != TLBSTATE_VALID); 2846 2847 /* 2848 * userspace has not been touched. 2849 * nothing to do here. 2850 */ 2851 2852 ci->ci_want_pmapload = 0; 2853 return; 2854 } 2855 2856 pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map); 2857 2858 if (pmap == pmap_kernel()) { 2859 return; 2860 } 2861 2862 #if defined(XEN) && defined(__x86_64__) 2863 KASSERT(pmap->pm_pdirpa == xen_current_user_pgd); 2864 #elif defined(PAE) 2865 KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(pmap_l3pd[0])); 2866 #elif !defined(XEN) 2867 KASSERT(pmap->pm_pdirpa == pmap_pte2pa(rcr3())); 2868 #endif 2869 KASSERT(ci->ci_pmap == pmap); 2870 2871 /* 2872 * we aren't interested in TLB invalidations for this pmap, 2873 * at least for the time being. 2874 */ 2875 2876 KASSERT(ci->ci_tlbstate == TLBSTATE_VALID); 2877 ci->ci_tlbstate = TLBSTATE_LAZY; 2878 } 2879 2880 /* 2881 * end of lifecycle functions 2882 */ 2883 2884 /* 2885 * some misc. functions 2886 */ 2887 2888 static int 2889 pmap_pdes_invalid(vaddr_t va, pd_entry_t * const *pdes, pd_entry_t *lastpde) 2890 { 2891 int i; 2892 unsigned long index; 2893 pd_entry_t pde; 2894 2895 for (i = PTP_LEVELS; i > 1; i--) { 2896 index = pl_i(va, i); 2897 pde = pdes[i - 2][index]; 2898 if ((pde & PG_V) == 0) 2899 return i; 2900 } 2901 if (lastpde != NULL) 2902 *lastpde = pde; 2903 return 0; 2904 } 2905 2906 /* 2907 * pmap_extract: extract a PA for the given VA 2908 */ 2909 2910 bool 2911 pmap_extract(struct pmap *pmap, vaddr_t va, paddr_t *pap) 2912 { 2913 pt_entry_t *ptes, pte; 2914 pd_entry_t pde; 2915 pd_entry_t * const *pdes; 2916 struct pmap *pmap2; 2917 struct cpu_info *ci; 2918 paddr_t pa; 2919 lwp_t *l; 2920 bool hard, rv; 2921 2922 rv = false; 2923 pa = 0; 2924 l = curlwp; 2925 2926 KPREEMPT_DISABLE(l); 2927 ci = l->l_cpu; 2928 if (__predict_true(!ci->ci_want_pmapload && ci->ci_pmap == pmap) || 2929 pmap == pmap_kernel()) { 2930 /* 2931 * no need to lock, because it's pmap_kernel() or our 2932 * own pmap and is active. if a user pmap, the caller 2933 * will hold the vm_map write/read locked and so prevent 2934 * entries from disappearing while we are here. ptps 2935 * can disappear via pmap_remove() and pmap_protect(), 2936 * but they are called with the vm_map write locked. 2937 */ 2938 hard = false; 2939 ptes = PTE_BASE; 2940 pdes = normal_pdes; 2941 } else { 2942 /* we lose, do it the hard way. 
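   The pmap is not the one currently loaded on this CPU (or a switch
   is pending), so temporarily map its page tables with
   pmap_map_ptes(), which also locks the pmap, and unmap them again
   once the PTE has been read.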
*/ 2943 hard = true; 2944 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); 2945 } 2946 if (pmap_pdes_valid(va, pdes, &pde)) { 2947 pte = ptes[pl1_i(va)]; 2948 if (pde & PG_PS) { 2949 pa = (pde & PG_LGFRAME) | (va & (NBPD_L2 - 1)); 2950 rv = true; 2951 } else if (__predict_true((pte & PG_V) != 0)) { 2952 pa = pmap_pte2pa(pte) | (va & (NBPD_L1 - 1)); 2953 rv = true; 2954 } 2955 } 2956 if (__predict_false(hard)) { 2957 pmap_unmap_ptes(pmap, pmap2); 2958 } 2959 KPREEMPT_ENABLE(l); 2960 if (pap != NULL) { 2961 *pap = pa; 2962 } 2963 return rv; 2964 } 2965 2966 2967 /* 2968 * vtophys: virtual address to physical address. For use by 2969 * machine-dependent code only. 2970 */ 2971 2972 paddr_t 2973 vtophys(vaddr_t va) 2974 { 2975 paddr_t pa; 2976 2977 if (pmap_extract(pmap_kernel(), va, &pa) == true) 2978 return (pa); 2979 return (0); 2980 } 2981 2982 #ifdef XEN 2983 /* 2984 * pmap_extract_ma: extract a MA for the given VA 2985 */ 2986 2987 bool 2988 pmap_extract_ma(struct pmap *pmap, vaddr_t va, paddr_t *pap) 2989 { 2990 pt_entry_t *ptes, pte; 2991 pd_entry_t pde; 2992 pd_entry_t * const *pdes; 2993 struct pmap *pmap2; 2994 2995 kpreempt_disable(); 2996 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); 2997 if (!pmap_pdes_valid(va, pdes, &pde)) { 2998 pmap_unmap_ptes(pmap, pmap2); 2999 kpreempt_enable(); 3000 return false; 3001 } 3002 3003 pte = ptes[pl1_i(va)]; 3004 pmap_unmap_ptes(pmap, pmap2); 3005 kpreempt_enable(); 3006 3007 if (__predict_true((pte & PG_V) != 0)) { 3008 if (pap != NULL) 3009 *pap = (pte & PG_FRAME) | (va & (NBPD_L1 - 1)); 3010 return true; 3011 } 3012 3013 return false; 3014 } 3015 3016 /* 3017 * vtomach: virtual address to machine address. For use by 3018 * machine-dependent code only. 3019 */ 3020 3021 paddr_t 3022 vtomach(vaddr_t va) 3023 { 3024 paddr_t pa; 3025 3026 if (pmap_extract_ma(pmap_kernel(), va, &pa) == true) 3027 return (pa); 3028 return (0); 3029 } 3030 3031 #endif /* XEN */ 3032 3033 3034 3035 /* 3036 * pmap_virtual_space: used during bootup [pmap_steal_memory] to 3037 * determine the bounds of the kernel virtual addess space. 3038 */ 3039 3040 void 3041 pmap_virtual_space(vaddr_t *startp, vaddr_t *endp) 3042 { 3043 *startp = virtual_avail; 3044 *endp = virtual_end; 3045 } 3046 3047 /* 3048 * pmap_map: map a range of PAs into kvm. 3049 * 3050 * => used during crash dump 3051 * => XXX: pmap_map() should be phased out? 3052 */ 3053 3054 vaddr_t 3055 pmap_map(vaddr_t va, paddr_t spa, paddr_t epa, vm_prot_t prot) 3056 { 3057 while (spa < epa) { 3058 pmap_kenter_pa(va, spa, prot, 0); 3059 va += PAGE_SIZE; 3060 spa += PAGE_SIZE; 3061 } 3062 pmap_update(pmap_kernel()); 3063 return va; 3064 } 3065 3066 /* 3067 * pmap_zero_page: zero a page 3068 */ 3069 3070 void 3071 pmap_zero_page(paddr_t pa) 3072 { 3073 pt_entry_t *zpte; 3074 void *zerova; 3075 int id; 3076 3077 kpreempt_disable(); 3078 id = cpu_number(); 3079 zpte = PTESLEW(zero_pte, id); 3080 zerova = VASLEW(zerop, id); 3081 3082 #ifdef DIAGNOSTIC 3083 if (*zpte) 3084 panic("pmap_zero_page: lock botch"); 3085 #endif 3086 3087 pmap_pte_set(zpte, pmap_pa2pte(pa) | PG_V | PG_RW | PG_M | PG_U | PG_k); 3088 pmap_pte_flush(); 3089 pmap_update_pg((vaddr_t)zerova); /* flush TLB */ 3090 3091 memset(zerova, 0, PAGE_SIZE); 3092 3093 #if defined(DIAGNOSTIC) || defined(XEN) 3094 pmap_pte_set(zpte, 0); /* zap ! */ 3095 pmap_pte_flush(); 3096 #endif 3097 kpreempt_enable(); 3098 } 3099 3100 /* 3101 * pmap_pagezeroidle: the same, for the idle loop page zero'er. 
3102 * Returns true if the page was zero'd, false if we aborted for 3103 * some reason. 3104 */ 3105 3106 bool 3107 pmap_pageidlezero(paddr_t pa) 3108 { 3109 pt_entry_t *zpte; 3110 void *zerova; 3111 bool rv; 3112 int id; 3113 3114 id = cpu_number(); 3115 zpte = PTESLEW(zero_pte, id); 3116 zerova = VASLEW(zerop, id); 3117 3118 KASSERT(cpu_feature & CPUID_SSE2); 3119 KASSERT(*zpte == 0); 3120 3121 pmap_pte_set(zpte, pmap_pa2pte(pa) | PG_V | PG_RW | PG_M | PG_U | PG_k); 3122 pmap_pte_flush(); 3123 pmap_update_pg((vaddr_t)zerova); /* flush TLB */ 3124 3125 rv = sse2_idlezero_page(zerova); 3126 3127 #if defined(DIAGNOSTIC) || defined(XEN) 3128 pmap_pte_set(zpte, 0); /* zap ! */ 3129 pmap_pte_flush(); 3130 #endif 3131 3132 return rv; 3133 } 3134 3135 /* 3136 * pmap_copy_page: copy a page 3137 */ 3138 3139 void 3140 pmap_copy_page(paddr_t srcpa, paddr_t dstpa) 3141 { 3142 pt_entry_t *spte; 3143 pt_entry_t *dpte; 3144 void *csrcva; 3145 void *cdstva; 3146 int id; 3147 3148 kpreempt_disable(); 3149 id = cpu_number(); 3150 spte = PTESLEW(csrc_pte,id); 3151 dpte = PTESLEW(cdst_pte,id); 3152 csrcva = VASLEW(csrcp, id); 3153 cdstva = VASLEW(cdstp, id); 3154 3155 KASSERT(*spte == 0 && *dpte == 0); 3156 3157 pmap_pte_set(spte, pmap_pa2pte(srcpa) | PG_V | PG_RW | PG_U | PG_k); 3158 pmap_pte_set(dpte, 3159 pmap_pa2pte(dstpa) | PG_V | PG_RW | PG_M | PG_U | PG_k); 3160 pmap_pte_flush(); 3161 pmap_update_2pg((vaddr_t)csrcva, (vaddr_t)cdstva); 3162 3163 memcpy(cdstva, csrcva, PAGE_SIZE); 3164 3165 #if defined(DIAGNOSTIC) || defined(XEN) 3166 pmap_pte_set(spte, 0); 3167 pmap_pte_set(dpte, 0); 3168 pmap_pte_flush(); 3169 #endif 3170 kpreempt_enable(); 3171 } 3172 3173 static pt_entry_t * 3174 pmap_map_ptp(struct vm_page *ptp) 3175 { 3176 pt_entry_t *ptppte; 3177 void *ptpva; 3178 int id; 3179 3180 KASSERT(kpreempt_disabled()); 3181 3182 id = cpu_number(); 3183 ptppte = PTESLEW(ptp_pte, id); 3184 ptpva = VASLEW(ptpp, id); 3185 #if !defined(XEN) 3186 pmap_pte_set(ptppte, pmap_pa2pte(VM_PAGE_TO_PHYS(ptp)) | PG_V | PG_M | 3187 PG_RW | PG_U | PG_k); 3188 #else 3189 pmap_pte_set(ptppte, pmap_pa2pte(VM_PAGE_TO_PHYS(ptp)) | PG_V | PG_M | 3190 PG_U | PG_k); 3191 #endif 3192 pmap_pte_flush(); 3193 pmap_update_pg((vaddr_t)ptpva); 3194 3195 return (pt_entry_t *)ptpva; 3196 } 3197 3198 static void 3199 pmap_unmap_ptp(void) 3200 { 3201 #if defined(DIAGNOSTIC) || defined(XEN) 3202 pt_entry_t *pte; 3203 3204 KASSERT(kpreempt_disabled()); 3205 3206 pte = PTESLEW(ptp_pte, cpu_number()); 3207 if (*pte != 0) { 3208 pmap_pte_set(pte, 0); 3209 pmap_pte_flush(); 3210 } 3211 #endif 3212 } 3213 3214 static pt_entry_t * 3215 pmap_map_pte(struct pmap *pmap, struct vm_page *ptp, vaddr_t va) 3216 { 3217 3218 KASSERT(kpreempt_disabled()); 3219 if (pmap_is_curpmap(pmap)) { 3220 return &PTE_BASE[pl1_i(va)]; /* (k)vtopte */ 3221 } 3222 KASSERT(ptp != NULL); 3223 return pmap_map_ptp(ptp) + pl1_pi(va); 3224 } 3225 3226 static void 3227 pmap_unmap_pte(void) 3228 { 3229 3230 KASSERT(kpreempt_disabled()); 3231 3232 pmap_unmap_ptp(); 3233 } 3234 3235 /* 3236 * p m a p r e m o v e f u n c t i o n s 3237 * 3238 * functions that remove mappings 3239 */ 3240 3241 /* 3242 * pmap_remove_ptes: remove PTEs from a PTP 3243 * 3244 * => must have proper locking on pmap_master_lock 3245 * => caller must hold pmap's lock 3246 * => PTP must be mapped into KVA 3247 * => PTP should be null if pmap == pmap_kernel() 3248 * => must be called with kernel preemption disabled 3249 * => returns composite pte if at least one page should be shot down 3250 */ 3251 3252 
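/*
 * The composite pte is how the caller coalesces TLB work: pmap_remove()
 * below ORs together the values returned for each block and, if any
 * removed entry had been used (PG_U), issues a ranged shootdown
 * covering the request rather than one IPI per page.  A sketch of that
 * use (the real loop is in pmap_remove() below):
 *
 *	xpte |= pmap_remove_ptes(pmap, ptp,
 *	    (vaddr_t)&ptes[pl1_i(va)], va, blkendva, &pv_tofree);
 *	if ((xpte & PG_U) != 0)
 *		pmap_tlb_shootdown(pmap, sva, eva, xpte);
 */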
static pt_entry_t 3253 pmap_remove_ptes(struct pmap *pmap, struct vm_page *ptp, vaddr_t ptpva, 3254 vaddr_t startva, vaddr_t endva, struct pv_entry **pv_tofree) 3255 { 3256 struct pv_entry *pve; 3257 pt_entry_t *pte = (pt_entry_t *) ptpva; 3258 pt_entry_t opte, xpte = 0; 3259 3260 KASSERT(pmap == pmap_kernel() || mutex_owned(&pmap->pm_lock)); 3261 KASSERT(kpreempt_disabled()); 3262 3263 /* 3264 * note that ptpva points to the PTE that maps startva. this may 3265 * or may not be the first PTE in the PTP. 3266 * 3267 * we loop through the PTP while there are still PTEs to look at 3268 * and the wire_count is greater than 1 (because we use the wire_count 3269 * to keep track of the number of real PTEs in the PTP). 3270 */ 3271 3272 for (/*null*/; startva < endva && (ptp == NULL || ptp->wire_count > 1) 3273 ; pte++, startva += PAGE_SIZE) { 3274 struct vm_page *pg; 3275 struct pmap_page *pp; 3276 3277 if (!pmap_valid_entry(*pte)) 3278 continue; /* VA not mapped */ 3279 3280 /* atomically save the old PTE and zap! it */ 3281 opte = pmap_pte_testset(pte, 0); 3282 if (!pmap_valid_entry(opte)) { 3283 continue; 3284 } 3285 3286 pmap_exec_account(pmap, startva, opte, 0); 3287 pmap_stats_update_bypte(pmap, 0, opte); 3288 xpte |= opte; 3289 3290 if (ptp) { 3291 ptp->wire_count--; /* dropping a PTE */ 3292 /* Make sure that the PDE is flushed */ 3293 if (ptp->wire_count <= 1) 3294 xpte |= PG_U; 3295 } 3296 3297 /* 3298 * if we are not on a pv_head list we are done. 3299 */ 3300 3301 if ((opte & PG_PVLIST) == 0) { 3302 #if defined(DIAGNOSTIC) && !defined(DOM0OPS) 3303 if (PHYS_TO_VM_PAGE(pmap_pte2pa(opte)) != NULL) 3304 panic("pmap_remove_ptes: managed page without " 3305 "PG_PVLIST for %#" PRIxVADDR, startva); 3306 #endif 3307 continue; 3308 } 3309 3310 pg = PHYS_TO_VM_PAGE(pmap_pte2pa(opte)); 3311 #ifdef DIAGNOSTIC 3312 if (pg == NULL) 3313 panic("pmap_remove_ptes: unmanaged page marked " 3314 "PG_PVLIST, va = %#" PRIxVADDR ", " 3315 "pa = %#" PRIxPADDR, 3316 startva, (paddr_t)pmap_pte2pa(opte)); 3317 #endif 3318 3319 /* sync R/M bits */ 3320 pp = VM_PAGE_TO_PP(pg); 3321 pp_lock(pp); 3322 pp->pp_attrs |= opte; 3323 pve = pmap_remove_pv(pp, ptp, startva); 3324 pp_unlock(pp); 3325 3326 if (pve != NULL) { 3327 pve->pve_next = *pv_tofree; 3328 *pv_tofree = pve; 3329 } 3330 3331 /* end of "for" loop: time for next pte */ 3332 } 3333 3334 return xpte; 3335 } 3336 3337 3338 /* 3339 * pmap_remove_pte: remove a single PTE from a PTP 3340 * 3341 * => must have proper locking on pmap_master_lock 3342 * => caller must hold pmap's lock 3343 * => PTP must be mapped into KVA 3344 * => PTP should be null if pmap == pmap_kernel() 3345 * => returns true if we removed a mapping 3346 * => must be called with kernel preemption disabled 3347 */ 3348 3349 static bool 3350 pmap_remove_pte(struct pmap *pmap, struct vm_page *ptp, pt_entry_t *pte, 3351 vaddr_t va, struct pv_entry **pv_tofree) 3352 { 3353 pt_entry_t opte; 3354 struct pv_entry *pve; 3355 struct vm_page *pg; 3356 struct pmap_page *pp; 3357 3358 KASSERT(pmap == pmap_kernel() || mutex_owned(&pmap->pm_lock)); 3359 KASSERT(pmap == pmap_kernel() || kpreempt_disabled()); 3360 3361 if (!pmap_valid_entry(*pte)) 3362 return(false); /* VA not mapped */ 3363 3364 /* atomically save the old PTE and zap! 
it */ 3365 opte = pmap_pte_testset(pte, 0); 3366 if (!pmap_valid_entry(opte)) { 3367 return false; 3368 } 3369 3370 pmap_exec_account(pmap, va, opte, 0); 3371 pmap_stats_update_bypte(pmap, 0, opte); 3372 3373 if (opte & PG_U) 3374 pmap_tlb_shootdown(pmap, va, 0, opte); 3375 3376 if (ptp) { 3377 ptp->wire_count--; /* dropping a PTE */ 3378 /* Make sure that the PDE is flushed */ 3379 if ((ptp->wire_count <= 1) && !(opte & PG_U)) 3380 pmap_tlb_shootdown(pmap, va, 0, opte); 3381 } 3382 3383 /* 3384 * if we are not on a pv_head list we are done. 3385 */ 3386 3387 if ((opte & PG_PVLIST) == 0) { 3388 #if defined(DIAGNOSTIC) && !defined(DOM0OPS) 3389 if (PHYS_TO_VM_PAGE(pmap_pte2pa(opte)) != NULL) 3390 panic("pmap_remove_pte: managed page without " 3391 "PG_PVLIST for %#" PRIxVADDR, va); 3392 #endif 3393 return(true); 3394 } 3395 3396 pg = PHYS_TO_VM_PAGE(pmap_pte2pa(opte)); 3397 #ifdef DIAGNOSTIC 3398 if (pg == NULL) 3399 panic("pmap_remove_pte: unmanaged page marked " 3400 "PG_PVLIST, va = %#" PRIxVADDR ", pa = %#" PRIxPADDR, 3401 va, (paddr_t)pmap_pte2pa(opte)); 3402 #endif 3403 3404 /* sync R/M bits */ 3405 pp = VM_PAGE_TO_PP(pg); 3406 pp_lock(pp); 3407 pp->pp_attrs |= opte; 3408 pve = pmap_remove_pv(pp, ptp, va); 3409 pp_unlock(pp); 3410 3411 if (pve) { 3412 pve->pve_next = *pv_tofree; 3413 *pv_tofree = pve; 3414 } 3415 3416 return(true); 3417 } 3418 3419 /* 3420 * pmap_remove: mapping removal function. 3421 * 3422 * => caller should not be holding any pmap locks 3423 */ 3424 3425 void 3426 pmap_remove(struct pmap *pmap, vaddr_t sva, vaddr_t eva) 3427 { 3428 pt_entry_t *ptes, xpte = 0; 3429 pd_entry_t pde; 3430 pd_entry_t * const *pdes; 3431 struct pv_entry *pv_tofree = NULL; 3432 bool result; 3433 paddr_t ptppa; 3434 vaddr_t blkendva, va = sva; 3435 struct vm_page *ptp; 3436 struct pmap *pmap2; 3437 3438 kpreempt_disable(); 3439 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); /* locks pmap */ 3440 3441 /* 3442 * removing one page? take shortcut function. 3443 */ 3444 3445 if (va + PAGE_SIZE == eva) { 3446 if (pmap_pdes_valid(va, pdes, &pde)) { 3447 3448 /* PA of the PTP */ 3449 ptppa = pmap_pte2pa(pde); 3450 3451 /* get PTP if non-kernel mapping */ 3452 if (pmap == pmap_kernel()) { 3453 /* we never free kernel PTPs */ 3454 ptp = NULL; 3455 } else { 3456 ptp = pmap_find_ptp(pmap, va, ptppa, 1); 3457 #ifdef DIAGNOSTIC 3458 if (ptp == NULL) 3459 panic("pmap_remove: unmanaged " 3460 "PTP detected"); 3461 #endif 3462 } 3463 3464 /* do it! */ 3465 result = pmap_remove_pte(pmap, ptp, 3466 &ptes[pl1_i(va)], va, &pv_tofree); 3467 3468 /* 3469 * if mapping removed and the PTP is no longer 3470 * being used, free it! 3471 */ 3472 3473 if (result && ptp && ptp->wire_count <= 1) 3474 pmap_free_ptp(pmap, ptp, va, ptes, pdes); 3475 } 3476 } else for (/* null */ ; va < eva ; va = blkendva) { 3477 int lvl; 3478 3479 /* determine range of block */ 3480 blkendva = x86_round_pdr(va+1); 3481 if (blkendva > eva) 3482 blkendva = eva; 3483 3484 /* 3485 * XXXCDC: our PTE mappings should never be removed 3486 * with pmap_remove! if we allow this (and why would 3487 * we?) then we end up freeing the pmap's page 3488 * directory page (PDP) before we are finished using 3489 * it when we hit in in the recursive mapping. this 3490 * is BAD. 3491 * 3492 * long term solution is to move the PTEs out of user 3493 * address space. and into kernel address space (up 3494 * with APTE). then we can set VM_MAXUSER_ADDRESS to 3495 * be VM_MAX_ADDRESS. 
3496 */ 3497 3498 if (pl_i(va, PTP_LEVELS) == PDIR_SLOT_PTE) 3499 /* XXXCDC: ugly hack to avoid freeing PDP here */ 3500 continue; 3501 3502 lvl = pmap_pdes_invalid(va, pdes, &pde); 3503 if (lvl != 0) { 3504 /* 3505 * skip a range corresponding to an invalid pde. 3506 */ 3507 blkendva = (va & ptp_masks[lvl - 1]) + nbpd[lvl - 1]; 3508 continue; 3509 } 3510 3511 /* PA of the PTP */ 3512 ptppa = pmap_pte2pa(pde); 3513 3514 /* get PTP if non-kernel mapping */ 3515 if (pmap == pmap_kernel()) { 3516 /* we never free kernel PTPs */ 3517 ptp = NULL; 3518 } else { 3519 ptp = pmap_find_ptp(pmap, va, ptppa, 1); 3520 #ifdef DIAGNOSTIC 3521 if (ptp == NULL) 3522 panic("pmap_remove: unmanaged PTP " 3523 "detected"); 3524 #endif 3525 } 3526 xpte |= pmap_remove_ptes(pmap, ptp, 3527 (vaddr_t)&ptes[pl1_i(va)], va, blkendva, &pv_tofree); 3528 3529 /* if PTP is no longer being used, free it! */ 3530 if (ptp && ptp->wire_count <= 1) { 3531 pmap_free_ptp(pmap, ptp, va, ptes, pdes); 3532 } 3533 if ((xpte & PG_U) != 0) 3534 pmap_tlb_shootdown(pmap, sva, eva, xpte); 3535 } 3536 pmap_unmap_ptes(pmap, pmap2); /* unlock pmap */ 3537 kpreempt_enable(); 3538 3539 /* Now we free unused PVs */ 3540 if (pv_tofree) 3541 pmap_free_pvs(pv_tofree); 3542 } 3543 3544 /* 3545 * pmap_sync_pv: clear pte bits and return the old value of the pte. 3546 * 3547 * => called with pp_lock held. (thus preemption disabled) 3548 * => issues tlb shootdowns if necessary. 3549 */ 3550 3551 static int 3552 pmap_sync_pv(struct pv_pte *pvpte, pt_entry_t expect, int clearbits, 3553 pt_entry_t *optep) 3554 { 3555 struct pmap *pmap; 3556 struct vm_page *ptp; 3557 vaddr_t va; 3558 pt_entry_t *ptep; 3559 pt_entry_t opte; 3560 pt_entry_t npte; 3561 bool need_shootdown; 3562 3563 ptp = pvpte->pte_ptp; 3564 va = pvpte->pte_va; 3565 KASSERT(ptp == NULL || ptp->uobject != NULL); 3566 KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset); 3567 pmap = ptp_to_pmap(ptp); 3568 3569 KASSERT((expect & ~(PG_FRAME | PG_V)) == 0); 3570 KASSERT((expect & PG_V) != 0); 3571 KASSERT(clearbits == ~0 || (clearbits & ~(PG_M | PG_U | PG_RW)) == 0); 3572 KASSERT(kpreempt_disabled()); 3573 3574 ptep = pmap_map_pte(pmap, ptp, va); 3575 do { 3576 opte = *ptep; 3577 KASSERT((opte & (PG_M | PG_U)) != PG_M); 3578 KASSERT((opte & (PG_U | PG_V)) != PG_U); 3579 KASSERT(opte == 0 || (opte & PG_V) != 0); 3580 if ((opte & (PG_FRAME | PG_V)) != expect) { 3581 3582 /* 3583 * we lost a race with a V->P operation like 3584 * pmap_remove(). wait for the competitor 3585 * reflecting pte bits into mp_attrs. 3586 * 3587 * issue a redundant TLB shootdown so that 3588 * we can wait for its completion. 3589 */ 3590 3591 pmap_unmap_pte(); 3592 if (clearbits != 0) { 3593 pmap_tlb_shootdown(pmap, va, 0, 3594 (pmap == pmap_kernel() ? PG_G : 0)); 3595 } 3596 return EAGAIN; 3597 } 3598 3599 /* 3600 * check if there's anything to do on this pte. 3601 */ 3602 3603 if ((opte & clearbits) == 0) { 3604 need_shootdown = false; 3605 break; 3606 } 3607 3608 /* 3609 * we need a shootdown if the pte is cached. (PG_U) 3610 * 3611 * ...unless we are clearing only the PG_RW bit and 3612 * it isn't cached as RW. (PG_M) 3613 */ 3614 3615 need_shootdown = (opte & PG_U) != 0 && 3616 !(clearbits == PG_RW && (opte & PG_M) == 0); 3617 3618 npte = opte & ~clearbits; 3619 3620 /* 3621 * if we need a shootdown anyway, clear PG_U and PG_M. 
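 * their old values have already been captured in opte (which the
 * caller merges into pp_attrs), and with them cleared a later
 * pmap_sync_pv() may find the pte uncached and be able to skip its
 * own shootdown.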
3622 */ 3623 3624 if (need_shootdown) { 3625 npte &= ~(PG_U | PG_M); 3626 } 3627 KASSERT((npte & (PG_M | PG_U)) != PG_M); 3628 KASSERT((npte & (PG_U | PG_V)) != PG_U); 3629 KASSERT(npte == 0 || (opte & PG_V) != 0); 3630 } while (pmap_pte_cas(ptep, opte, npte) != opte); 3631 3632 if (need_shootdown) { 3633 pmap_tlb_shootdown(pmap, va, 0, opte); 3634 } 3635 pmap_unmap_pte(); 3636 3637 *optep = opte; 3638 return 0; 3639 } 3640 3641 /* 3642 * pmap_page_remove: remove a managed vm_page from all pmaps that map it 3643 * 3644 * => R/M bits are sync'd back to attrs 3645 */ 3646 3647 void 3648 pmap_page_remove(struct vm_page *pg) 3649 { 3650 struct pmap_page *pp; 3651 struct pv_pte *pvpte; 3652 struct pv_entry *killlist = NULL; 3653 struct vm_page *ptp; 3654 pt_entry_t expect; 3655 lwp_t *l; 3656 int count; 3657 3658 l = curlwp; 3659 pp = VM_PAGE_TO_PP(pg); 3660 expect = pmap_pa2pte(VM_PAGE_TO_PHYS(pg)) | PG_V; 3661 count = SPINLOCK_BACKOFF_MIN; 3662 kpreempt_disable(); 3663 startover: 3664 pp_lock(pp); 3665 while ((pvpte = pv_pte_first(pp)) != NULL) { 3666 struct pmap *pmap; 3667 struct pv_entry *pve; 3668 pt_entry_t opte; 3669 vaddr_t va; 3670 int error; 3671 3672 /* 3673 * add a reference to the pmap before clearing the pte. 3674 * otherwise the pmap can disappear behind us. 3675 */ 3676 3677 ptp = pvpte->pte_ptp; 3678 pmap = ptp_to_pmap(ptp); 3679 if (ptp != NULL) { 3680 pmap_reference(pmap); 3681 } 3682 3683 error = pmap_sync_pv(pvpte, expect, ~0, &opte); 3684 if (error == EAGAIN) { 3685 int hold_count; 3686 pp_unlock(pp); 3687 KERNEL_UNLOCK_ALL(curlwp, &hold_count); 3688 if (ptp != NULL) { 3689 pmap_destroy(pmap); 3690 } 3691 SPINLOCK_BACKOFF(count); 3692 KERNEL_LOCK(hold_count, curlwp); 3693 goto startover; 3694 } 3695 3696 pp->pp_attrs |= opte; 3697 va = pvpte->pte_va; 3698 pve = pmap_remove_pv(pp, ptp, va); 3699 pp_unlock(pp); 3700 3701 /* update the PTP reference count. free if last reference. */ 3702 if (ptp != NULL) { 3703 struct pmap *pmap2; 3704 pt_entry_t *ptes; 3705 pd_entry_t * const *pdes; 3706 3707 KASSERT(pmap != pmap_kernel()); 3708 3709 pmap_tlb_shootwait(); 3710 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); 3711 pmap_stats_update_bypte(pmap, 0, opte); 3712 ptp->wire_count--; 3713 if (ptp->wire_count <= 1) { 3714 pmap_free_ptp(pmap, ptp, va, ptes, pdes); 3715 } 3716 pmap_unmap_ptes(pmap, pmap2); 3717 pmap_destroy(pmap); 3718 } else { 3719 KASSERT(pmap == pmap_kernel()); 3720 pmap_stats_update_bypte(pmap, 0, opte); 3721 } 3722 3723 if (pve != NULL) { 3724 pve->pve_next = killlist; /* mark it for death */ 3725 killlist = pve; 3726 } 3727 pp_lock(pp); 3728 } 3729 pp_unlock(pp); 3730 kpreempt_enable(); 3731 3732 /* Now free unused pvs. */ 3733 pmap_free_pvs(killlist); 3734 } 3735 3736 /* 3737 * p m a p a t t r i b u t e f u n c t i o n s 3738 * functions that test/change managed page's attributes 3739 * since a page can be mapped multiple times we must check each PTE that 3740 * maps it by going down the pv lists. 
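 *
 * pmap_test_attrs() first consults the cached pp_attrs and only walks
 * the pv list when the bits are not already known; pmap_clear_attrs()
 * always walks it so every mapping loses the bits.  The MI code is
 * expected to reach these through the wrappers in pmap.h (for
 * instance, pmap_clear_modify() reducing to
 * pmap_clear_attrs(pg, PG_M)); that mapping is assumed here and not
 * shown in this file, see pmap.h for the authoritative definitions.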
3741 */ 3742 3743 /* 3744 * pmap_test_attrs: test a page's attributes 3745 */ 3746 3747 bool 3748 pmap_test_attrs(struct vm_page *pg, unsigned testbits) 3749 { 3750 struct pmap_page *pp; 3751 struct pv_pte *pvpte; 3752 pt_entry_t expect; 3753 u_int result; 3754 3755 pp = VM_PAGE_TO_PP(pg); 3756 if ((pp->pp_attrs & testbits) != 0) { 3757 return true; 3758 } 3759 expect = pmap_pa2pte(VM_PAGE_TO_PHYS(pg)) | PG_V; 3760 pp_lock(pp); 3761 for (pvpte = pv_pte_first(pp); pvpte; pvpte = pv_pte_next(pp, pvpte)) { 3762 pt_entry_t opte; 3763 int error; 3764 3765 if ((pp->pp_attrs & testbits) != 0) { 3766 break; 3767 } 3768 error = pmap_sync_pv(pvpte, expect, 0, &opte); 3769 if (error == 0) { 3770 pp->pp_attrs |= opte; 3771 } 3772 } 3773 result = pp->pp_attrs & testbits; 3774 pp_unlock(pp); 3775 3776 /* 3777 * note that we will exit the for loop with a non-null pve if 3778 * we have found the bits we are testing for. 3779 */ 3780 3781 return result != 0; 3782 } 3783 3784 /* 3785 * pmap_clear_attrs: clear the specified attribute for a page. 3786 * 3787 * => we return true if we cleared one of the bits we were asked to 3788 */ 3789 3790 bool 3791 pmap_clear_attrs(struct vm_page *pg, unsigned clearbits) 3792 { 3793 struct pmap_page *pp; 3794 struct pv_pte *pvpte; 3795 u_int result; 3796 pt_entry_t expect; 3797 int count; 3798 3799 pp = VM_PAGE_TO_PP(pg); 3800 expect = pmap_pa2pte(VM_PAGE_TO_PHYS(pg)) | PG_V; 3801 count = SPINLOCK_BACKOFF_MIN; 3802 kpreempt_disable(); 3803 startover: 3804 pp_lock(pp); 3805 for (pvpte = pv_pte_first(pp); pvpte; pvpte = pv_pte_next(pp, pvpte)) { 3806 pt_entry_t opte; 3807 int error; 3808 3809 error = pmap_sync_pv(pvpte, expect, clearbits, &opte); 3810 if (error == EAGAIN) { 3811 int hold_count; 3812 pp_unlock(pp); 3813 KERNEL_UNLOCK_ALL(curlwp, &hold_count); 3814 SPINLOCK_BACKOFF(count); 3815 KERNEL_LOCK(hold_count, curlwp); 3816 goto startover; 3817 } 3818 pp->pp_attrs |= opte; 3819 } 3820 result = pp->pp_attrs & clearbits; 3821 pp->pp_attrs &= ~clearbits; 3822 pp_unlock(pp); 3823 kpreempt_enable(); 3824 3825 return result != 0; 3826 } 3827 3828 3829 /* 3830 * p m a p p r o t e c t i o n f u n c t i o n s 3831 */ 3832 3833 /* 3834 * pmap_page_protect: change the protection of all recorded mappings 3835 * of a managed page 3836 * 3837 * => NOTE: this is an inline function in pmap.h 3838 */ 3839 3840 /* see pmap.h */ 3841 3842 /* 3843 * pmap_protect: set the protection in of the pages in a pmap 3844 * 3845 * => NOTE: this is an inline function in pmap.h 3846 */ 3847 3848 /* see pmap.h */ 3849 3850 /* 3851 * pmap_write_protect: write-protect pages in a pmap 3852 */ 3853 3854 void 3855 pmap_write_protect(struct pmap *pmap, vaddr_t sva, vaddr_t eva, vm_prot_t prot) 3856 { 3857 pt_entry_t *ptes, *epte; 3858 pt_entry_t *spte; 3859 pd_entry_t * const *pdes; 3860 vaddr_t blockend, va; 3861 pt_entry_t opte; 3862 struct pmap *pmap2; 3863 3864 KASSERT(curlwp->l_md.md_gc_pmap != pmap); 3865 3866 kpreempt_disable(); 3867 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); /* locks pmap */ 3868 3869 /* should be ok, but just in case ... */ 3870 sva &= PG_FRAME; 3871 eva &= PG_FRAME; 3872 3873 for (va = sva ; va < eva ; va = blockend) { 3874 3875 blockend = (va & L2_FRAME) + NBPD_L2; 3876 if (blockend > eva) 3877 blockend = eva; 3878 3879 /* 3880 * XXXCDC: our PTE mappings should never be write-protected! 3881 * 3882 * long term solution is to move the PTEs out of user 3883 * address space. and into kernel address space (up 3884 * with APTE). 
then we can set VM_MAXUSER_ADDRESS to 3885 * be VM_MAX_ADDRESS. 3886 */ 3887 3888 /* XXXCDC: ugly hack to avoid freeing PDP here */ 3889 if (pl_i(va, PTP_LEVELS) == PDIR_SLOT_PTE) 3890 continue; 3891 3892 /* empty block? */ 3893 if (!pmap_pdes_valid(va, pdes, NULL)) 3894 continue; 3895 3896 #ifdef DIAGNOSTIC 3897 if (va >= VM_MAXUSER_ADDRESS && 3898 va < VM_MAX_ADDRESS) 3899 panic("pmap_write_protect: PTE space"); 3900 #endif 3901 3902 spte = &ptes[pl1_i(va)]; 3903 epte = &ptes[pl1_i(blockend)]; 3904 3905 for (/*null */; spte < epte ; spte++) { 3906 pt_entry_t npte; 3907 3908 do { 3909 opte = *spte; 3910 if ((~opte & (PG_RW | PG_V)) != 0) { 3911 goto next; 3912 } 3913 npte = opte & ~PG_RW; 3914 } while (pmap_pte_cas(spte, opte, npte) != opte); 3915 if ((opte & PG_M) != 0) { 3916 vaddr_t tva; 3917 3918 tva = x86_ptob(spte - ptes); 3919 pmap_tlb_shootdown(pmap, tva, 0, opte); 3920 } 3921 next:; 3922 } 3923 } 3924 3925 pmap_unmap_ptes(pmap, pmap2); /* unlocks pmap */ 3926 kpreempt_enable(); 3927 } 3928 3929 /* 3930 * end of protection functions 3931 */ 3932 3933 /* 3934 * pmap_unwire: clear the wired bit in the PTE 3935 * 3936 * => mapping should already be in map 3937 */ 3938 3939 void 3940 pmap_unwire(struct pmap *pmap, vaddr_t va) 3941 { 3942 pt_entry_t *ptes; 3943 pd_entry_t * const *pdes; 3944 struct pmap *pmap2; 3945 3946 kpreempt_disable(); 3947 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); /* locks pmap */ 3948 3949 if (pmap_pdes_valid(va, pdes, NULL)) { 3950 pt_entry_t *ptep = &ptes[pl1_i(va)]; 3951 pt_entry_t opte = *ptep; 3952 3953 #ifdef DIAGNOSTIC 3954 if (!pmap_valid_entry(opte)) 3955 panic("pmap_unwire: invalid (unmapped) va 0x%lx", va); 3956 #endif 3957 if ((opte & PG_W) != 0) { 3958 pt_entry_t npte = opte & ~PG_W; 3959 3960 opte = pmap_pte_testset(ptep, npte); 3961 pmap_stats_update_bypte(pmap, npte, opte); 3962 } 3963 #ifdef DIAGNOSTIC 3964 else { 3965 printf("pmap_unwire: wiring for pmap %p va 0x%lx " 3966 "didn't change!\n", pmap, va); 3967 } 3968 #endif 3969 pmap_unmap_ptes(pmap, pmap2); /* unlocks map */ 3970 } 3971 #ifdef DIAGNOSTIC 3972 else { 3973 panic("pmap_unwire: invalid PDE"); 3974 } 3975 #endif 3976 kpreempt_enable(); 3977 } 3978 3979 /* 3980 * pmap_copy: copy mappings from one pmap to another 3981 * 3982 * => optional function 3983 * void pmap_copy(dst_pmap, src_pmap, dst_addr, len, src_addr) 3984 */ 3985 3986 /* 3987 * defined as macro in pmap.h 3988 */ 3989 3990 /* 3991 * pmap_enter: enter a mapping into a pmap 3992 * 3993 * => must be done "now" ... no lazy-evaluation 3994 * => we set pmap => pv_head locking 3995 */ 3996 #ifdef XEN 3997 int 3998 pmap_enter_ma(struct pmap *pmap, vaddr_t va, paddr_t ma, paddr_t pa, 3999 vm_prot_t prot, u_int flags, int domid) 4000 { 4001 #else /* XEN */ 4002 int 4003 pmap_enter(struct pmap *pmap, vaddr_t va, paddr_t pa, vm_prot_t prot, 4004 u_int flags) 4005 { 4006 paddr_t ma = pa; 4007 #endif /* XEN */ 4008 pt_entry_t *ptes, opte, npte; 4009 pt_entry_t *ptep; 4010 pd_entry_t * const *pdes; 4011 struct vm_page *ptp, *pg; 4012 struct pmap_page *new_pp; 4013 struct pmap_page *old_pp; 4014 struct pv_entry *old_pve = NULL; 4015 struct pv_entry *new_pve; 4016 struct pv_entry *new_pve2; 4017 int error; 4018 bool wired = (flags & PMAP_WIRED) != 0; 4019 struct pmap *pmap2; 4020 4021 KASSERT(pmap_initialized); 4022 KASSERT(curlwp->l_md.md_gc_pmap != pmap); 4023 4024 #ifdef DIAGNOSTIC 4025 /* sanity check: totally out of range? 
*/ 4026 if (va >= VM_MAX_KERNEL_ADDRESS) 4027 panic("pmap_enter: too big"); 4028 4029 if (va == (vaddr_t) PDP_BASE || va == (vaddr_t) APDP_BASE) 4030 panic("pmap_enter: trying to map over PDP/APDP!"); 4031 4032 /* sanity check: kernel PTPs should already have been pre-allocated */ 4033 if (va >= VM_MIN_KERNEL_ADDRESS && 4034 !pmap_valid_entry(pmap->pm_pdir[pl_i(va, PTP_LEVELS)])) 4035 panic("pmap_enter: missing kernel PTP for va %lx!", va); 4036 #endif /* DIAGNOSTIC */ 4037 #ifdef XEN 4038 KASSERT(domid == DOMID_SELF || pa == 0); 4039 #endif /* XEN */ 4040 4041 npte = ma | protection_codes[prot] | PG_V; 4042 if (wired) 4043 npte |= PG_W; 4044 if (flags & PMAP_NOCACHE) 4045 npte |= PG_N; 4046 if (va < VM_MAXUSER_ADDRESS) 4047 npte |= PG_u; 4048 else if (va < VM_MAX_ADDRESS) 4049 npte |= (PG_u | PG_RW); /* XXXCDC: no longer needed? */ 4050 else 4051 npte |= PG_k; 4052 if (pmap == pmap_kernel()) 4053 npte |= pmap_pg_g; 4054 if (flags & VM_PROT_ALL) { 4055 npte |= PG_U; 4056 if (flags & VM_PROT_WRITE) { 4057 KASSERT((npte & PG_RW) != 0); 4058 npte |= PG_M; 4059 } 4060 } 4061 4062 #ifdef XEN 4063 if (domid != DOMID_SELF) 4064 pg = NULL; 4065 else 4066 #endif 4067 pg = PHYS_TO_VM_PAGE(pa); 4068 if (pg != NULL) { 4069 /* This is a managed page */ 4070 npte |= PG_PVLIST; 4071 new_pp = VM_PAGE_TO_PP(pg); 4072 } else { 4073 new_pp = NULL; 4074 } 4075 4076 /* get pves. */ 4077 new_pve = pool_cache_get(&pmap_pv_cache, PR_NOWAIT); 4078 new_pve2 = pool_cache_get(&pmap_pv_cache, PR_NOWAIT); 4079 if (new_pve == NULL || new_pve2 == NULL) { 4080 if (flags & PMAP_CANFAIL) { 4081 error = ENOMEM; 4082 goto out2; 4083 } 4084 panic("pmap_enter: pve allocation failed"); 4085 } 4086 4087 kpreempt_disable(); 4088 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); /* locks pmap */ 4089 if (pmap == pmap_kernel()) { 4090 ptp = NULL; 4091 } else { 4092 ptp = pmap_get_ptp(pmap, va, pdes); 4093 if (ptp == NULL) { 4094 pmap_unmap_ptes(pmap, pmap2); 4095 if (flags & PMAP_CANFAIL) { 4096 error = ENOMEM; 4097 goto out; 4098 } 4099 panic("pmap_enter: get ptp failed"); 4100 } 4101 } 4102 4103 /* 4104 * update the pte. 4105 */ 4106 4107 ptep = &ptes[pl1_i(va)]; 4108 do { 4109 opte = *ptep; 4110 4111 /* 4112 * if the same page, inherit PG_U and PG_M. 4113 */ 4114 if (((opte ^ npte) & (PG_FRAME | PG_V)) == 0) { 4115 npte |= opte & (PG_U | PG_M); 4116 } 4117 #if defined(XEN) 4118 if (domid != DOMID_SELF) { 4119 /* pmap_pte_cas with error handling */ 4120 int s = splvm(); 4121 if (opte != *ptep) { 4122 splx(s); 4123 continue; 4124 } 4125 error = xpq_update_foreign( 4126 vtomach((vaddr_t)ptep), npte, domid); 4127 splx(s); 4128 if (error) { 4129 if (ptp != NULL && ptp->wire_count <= 1) { 4130 pmap_free_ptp(pmap, ptp, va, ptes, pdes); 4131 } 4132 pmap_unmap_ptes(pmap, pmap2); 4133 goto out; 4134 } 4135 break; 4136 } 4137 #endif /* defined(XEN) */ 4138 } while (pmap_pte_cas(ptep, opte, npte) != opte); 4139 4140 /* 4141 * update statistics and PTP's reference count. 4142 */ 4143 4144 pmap_stats_update_bypte(pmap, npte, opte); 4145 if (ptp != NULL && !pmap_valid_entry(opte)) { 4146 ptp->wire_count++; 4147 } 4148 KASSERT(ptp == NULL || ptp->wire_count > 1); 4149 4150 /* 4151 * if the same page, we can skip pv_entry handling. 4152 */ 4153 4154 if (((opte ^ npte) & (PG_FRAME | PG_V)) == 0) { 4155 KASSERT(((opte ^ npte) & PG_PVLIST) == 0); 4156 goto same_pa; 4157 } 4158 4159 /* 4160 * if old page is managed, remove pv_entry from its list. 
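 * the R/M bits accumulated in opte are folded into the old page's
 * pp_attrs, and the pv_entry taken off its list is recycled through
 * pmap_pv_cache on the way out.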
4161 */ 4162 4163 if ((~opte & (PG_V | PG_PVLIST)) == 0) { 4164 pg = PHYS_TO_VM_PAGE(pmap_pte2pa(opte)); 4165 #ifdef DIAGNOSTIC 4166 if (pg == NULL) 4167 panic("pmap_enter: PG_PVLIST mapping with " 4168 "unmanaged page " 4169 "pa = 0x%" PRIx64 " (0x%" PRIx64 ")", 4170 (int64_t)pa, (int64_t)atop(pa)); 4171 #endif 4172 old_pp = VM_PAGE_TO_PP(pg); 4173 4174 pp_lock(old_pp); 4175 old_pve = pmap_remove_pv(old_pp, ptp, va); 4176 old_pp->pp_attrs |= opte; 4177 pp_unlock(old_pp); 4178 } 4179 4180 /* 4181 * if new page is managed, insert pv_entry into its list. 4182 */ 4183 4184 if (new_pp) { 4185 pp_lock(new_pp); 4186 new_pve = pmap_enter_pv(new_pp, new_pve, &new_pve2, ptp, va); 4187 pp_unlock(new_pp); 4188 } 4189 4190 same_pa: 4191 pmap_unmap_ptes(pmap, pmap2); 4192 4193 /* 4194 * shootdown tlb if necessary. 4195 */ 4196 4197 if ((~opte & (PG_V | PG_U)) == 0 && 4198 ((opte ^ npte) & (PG_FRAME | PG_RW)) != 0) { 4199 pmap_tlb_shootdown(pmap, va, 0, opte); 4200 } 4201 4202 error = 0; 4203 out: 4204 kpreempt_enable(); 4205 out2: 4206 if (old_pve != NULL) { 4207 pool_cache_put(&pmap_pv_cache, old_pve); 4208 } 4209 if (new_pve != NULL) { 4210 pool_cache_put(&pmap_pv_cache, new_pve); 4211 } 4212 if (new_pve2 != NULL) { 4213 pool_cache_put(&pmap_pv_cache, new_pve2); 4214 } 4215 4216 return error; 4217 } 4218 4219 #ifdef XEN 4220 int 4221 pmap_enter(struct pmap *pmap, vaddr_t va, paddr_t pa, vm_prot_t prot, u_int flags) 4222 { 4223 paddr_t ma; 4224 4225 if (__predict_false(pa < pmap_pa_start || pmap_pa_end <= pa)) { 4226 ma = pa; /* XXX hack */ 4227 } else { 4228 ma = xpmap_ptom(pa); 4229 } 4230 4231 return pmap_enter_ma(pmap, va, ma, pa, prot, flags, DOMID_SELF); 4232 } 4233 #endif /* XEN */ 4234 4235 static bool 4236 pmap_get_physpage(vaddr_t va, int level, paddr_t *paddrp) 4237 { 4238 struct vm_page *ptp; 4239 struct pmap *kpm = pmap_kernel(); 4240 4241 if (uvm.page_init_done == false) { 4242 /* 4243 * we're growing the kernel pmap early (from 4244 * uvm_pageboot_alloc()). this case must be 4245 * handled a little differently. 4246 */ 4247 4248 if (uvm_page_physget(paddrp) == false) 4249 panic("pmap_get_physpage: out of memory"); 4250 kpreempt_disable(); 4251 pmap_pte_set(early_zero_pte, 4252 pmap_pa2pte(*paddrp) | PG_V | PG_RW | PG_k); 4253 pmap_pte_flush(); 4254 pmap_update_pg((vaddr_t)early_zerop); 4255 memset(early_zerop, 0, PAGE_SIZE); 4256 #if defined(DIAGNOSTIC) || defined (XEN) 4257 pmap_pte_set(early_zero_pte, 0); 4258 pmap_pte_flush(); 4259 #endif /* defined(DIAGNOSTIC) */ 4260 kpreempt_enable(); 4261 } else { 4262 /* XXX */ 4263 PMAP_SUBOBJ_LOCK(kpm, level - 1); 4264 ptp = uvm_pagealloc(&kpm->pm_obj[level - 1], 4265 ptp_va2o(va, level), NULL, 4266 UVM_PGA_USERESERVE|UVM_PGA_ZERO); 4267 PMAP_SUBOBJ_UNLOCK(kpm, level - 1); 4268 if (ptp == NULL) 4269 panic("pmap_get_physpage: out of memory"); 4270 ptp->flags &= ~PG_BUSY; 4271 ptp->wire_count = 1; 4272 *paddrp = VM_PAGE_TO_PHYS(ptp); 4273 } 4274 pmap_stats_update(kpm, 1, 0); 4275 return true; 4276 } 4277 4278 /* 4279 * Allocate the amount of specified ptps for a ptp level, and populate 4280 * all levels below accordingly, mapping virtual addresses starting at 4281 * kva. 4282 * 4283 * Used by pmap_growkernel. 
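 *
 * The loop walks from the top level down to level 2; at each level it
 * fills in needed_ptps[level - 1] new entries, each pointing at a
 * fresh page from pmap_get_physpage() that becomes a PTP of the level
 * below (entered through the xpq queue on Xen, by a direct store
 * otherwise), bumping nkptp[level - 1] for every entry added.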
4284 */ 4285 static void 4286 pmap_alloc_level(pd_entry_t * const *pdes, vaddr_t kva, int lvl, 4287 long *needed_ptps) 4288 { 4289 unsigned long i; 4290 vaddr_t va; 4291 paddr_t pa; 4292 unsigned long index, endindex; 4293 int level; 4294 pd_entry_t *pdep; 4295 #ifdef XEN 4296 int s = splvm(); /* protect xpq_* */ 4297 #endif 4298 4299 for (level = lvl; level > 1; level--) { 4300 if (level == PTP_LEVELS) 4301 pdep = pmap_kernel()->pm_pdir; 4302 else 4303 pdep = pdes[level - 2]; 4304 va = kva; 4305 index = pl_i_roundup(kva, level); 4306 endindex = index + needed_ptps[level - 1] - 1; 4307 4308 4309 for (i = index; i <= endindex; i++) { 4310 KASSERT(!pmap_valid_entry(pdep[i])); 4311 pmap_get_physpage(va, level - 1, &pa); 4312 #ifdef XEN 4313 xpq_queue_pte_update((level == PTP_LEVELS) ? 4314 xpmap_ptom(pmap_pdirpa(pmap_kernel(), i)) : 4315 xpmap_ptetomach(&pdep[i]), 4316 pmap_pa2pte(pa) | PG_k | PG_V | PG_RW); 4317 #ifdef PAE 4318 if (level == PTP_LEVELS && i > L2_SLOT_KERN) { 4319 /* update real kernel PD too */ 4320 xpq_queue_pte_update( 4321 xpmap_ptetomach(&pmap_kl2pd[l2tol2(i)]), 4322 pmap_pa2pte(pa) | PG_k | PG_V | PG_RW); 4323 } 4324 #endif 4325 #else /* XEN */ 4326 pdep[i] = pa | PG_RW | PG_V; 4327 #endif /* XEN */ 4328 KASSERT(level != PTP_LEVELS || nkptp[level - 1] + 4329 pl_i(VM_MIN_KERNEL_ADDRESS, level) == i); 4330 nkptp[level - 1]++; 4331 va += nbpd[level - 1]; 4332 } 4333 pmap_pte_flush(); 4334 } 4335 #ifdef XEN 4336 splx(s); 4337 #endif 4338 } 4339 4340 /* 4341 * pmap_growkernel: increase usage of KVM space 4342 * 4343 * => we allocate new PTPs for the kernel and install them in all 4344 * the pmaps on the system. 4345 */ 4346 4347 vaddr_t 4348 pmap_growkernel(vaddr_t maxkvaddr) 4349 { 4350 struct pmap *kpm = pmap_kernel(); 4351 #if !defined(XEN) || !defined(__x86_64__) 4352 struct pmap *pm; 4353 #endif 4354 int s, i; 4355 long needed_kptp[PTP_LEVELS], target_nptp, old; 4356 bool invalidate = false; 4357 4358 s = splvm(); /* to be safe */ 4359 mutex_enter(&kpm->pm_lock); 4360 4361 if (maxkvaddr <= pmap_maxkvaddr) { 4362 mutex_exit(&kpm->pm_lock); 4363 splx(s); 4364 return pmap_maxkvaddr; 4365 } 4366 4367 maxkvaddr = x86_round_pdr(maxkvaddr); 4368 old = nkptp[PTP_LEVELS - 1]; 4369 /* 4370 * This loop could be optimized more, but pmap_growkernel() 4371 * is called infrequently. 4372 */ 4373 for (i = PTP_LEVELS - 1; i >= 1; i--) { 4374 target_nptp = pl_i_roundup(maxkvaddr, i + 1) - 4375 pl_i_roundup(VM_MIN_KERNEL_ADDRESS, i + 1); 4376 /* 4377 * XXX only need to check toplevel. 4378 */ 4379 if (target_nptp > nkptpmax[i]) 4380 panic("out of KVA space"); 4381 KASSERT(target_nptp >= nkptp[i]); 4382 needed_kptp[i] = target_nptp - nkptp[i]; 4383 } 4384 4385 pmap_alloc_level(normal_pdes, pmap_maxkvaddr, PTP_LEVELS, needed_kptp); 4386 4387 /* 4388 * If the number of top level entries changed, update all 4389 * pmaps. 
4390 */ 4391 if (needed_kptp[PTP_LEVELS - 1] != 0) { 4392 #ifdef XEN 4393 #ifdef __x86_64__ 4394 /* nothing, kernel entries are never entered in user pmap */ 4395 #else /* __x86_64__ */ 4396 mutex_enter(&pmaps_lock); 4397 LIST_FOREACH(pm, &pmaps, pm_list) { 4398 int pdkidx; 4399 for (pdkidx = PDIR_SLOT_KERN + old; 4400 pdkidx < PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1]; 4401 pdkidx++) { 4402 xpq_queue_pte_update( 4403 xpmap_ptom(pmap_pdirpa(pm, pdkidx)), 4404 kpm->pm_pdir[pdkidx]); 4405 } 4406 xpq_flush_queue(); 4407 } 4408 mutex_exit(&pmaps_lock); 4409 #endif /* __x86_64__ */ 4410 #else /* XEN */ 4411 unsigned newpdes; 4412 newpdes = nkptp[PTP_LEVELS - 1] - old; 4413 mutex_enter(&pmaps_lock); 4414 LIST_FOREACH(pm, &pmaps, pm_list) { 4415 memcpy(&pm->pm_pdir[PDIR_SLOT_KERN + old], 4416 &kpm->pm_pdir[PDIR_SLOT_KERN + old], 4417 newpdes * sizeof (pd_entry_t)); 4418 } 4419 mutex_exit(&pmaps_lock); 4420 #endif 4421 invalidate = true; 4422 } 4423 pmap_maxkvaddr = maxkvaddr; 4424 mutex_exit(&kpm->pm_lock); 4425 splx(s); 4426 4427 if (invalidate) { 4428 /* Invalidate the PDP cache. */ 4429 pool_cache_invalidate(&pmap_pdp_cache); 4430 } 4431 4432 return maxkvaddr; 4433 } 4434 4435 #ifdef DEBUG 4436 void pmap_dump(struct pmap *, vaddr_t, vaddr_t); 4437 4438 /* 4439 * pmap_dump: dump all the mappings from a pmap 4440 * 4441 * => caller should not be holding any pmap locks 4442 */ 4443 4444 void 4445 pmap_dump(struct pmap *pmap, vaddr_t sva, vaddr_t eva) 4446 { 4447 pt_entry_t *ptes, *pte; 4448 pd_entry_t * const *pdes; 4449 struct pmap *pmap2; 4450 vaddr_t blkendva; 4451 4452 /* 4453 * if end is out of range truncate. 4454 * if (end == start) update to max. 4455 */ 4456 4457 if (eva > VM_MAXUSER_ADDRESS || eva <= sva) 4458 eva = VM_MAXUSER_ADDRESS; 4459 4460 /* 4461 * we lock in the pmap => pv_head direction 4462 */ 4463 4464 kpreempt_disable(); 4465 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); /* locks pmap */ 4466 4467 /* 4468 * dumping a range of pages: we dump in PTP sized blocks (4MB) 4469 */ 4470 4471 for (/* null */ ; sva < eva ; sva = blkendva) { 4472 4473 /* determine range of block */ 4474 blkendva = x86_round_pdr(sva+1); 4475 if (blkendva > eva) 4476 blkendva = eva; 4477 4478 /* valid block? 
*/ 4479 if (!pmap_pdes_valid(sva, pdes, NULL)) 4480 continue; 4481 4482 pte = &ptes[pl1_i(sva)]; 4483 for (/* null */; sva < blkendva ; sva += PAGE_SIZE, pte++) { 4484 if (!pmap_valid_entry(*pte)) 4485 continue; 4486 printf("va %#" PRIxVADDR " -> pa %#" PRIxPADDR 4487 " (pte=%#" PRIxPADDR ")\n", 4488 sva, (paddr_t)pmap_pte2pa(*pte), (paddr_t)*pte); 4489 } 4490 } 4491 pmap_unmap_ptes(pmap, pmap2); 4492 kpreempt_enable(); 4493 } 4494 #endif 4495 4496 /* 4497 * pmap_tlb_shootdown: invalidate pages on all CPUs using pmap 'pm' 4498 * 4499 * => always invalidates locally before returning 4500 * => returns before remote CPUs have invalidated 4501 * => must be called with preemption disabled 4502 */ 4503 4504 void 4505 pmap_tlb_shootdown(struct pmap *pm, vaddr_t sva, vaddr_t eva, pt_entry_t pte) 4506 { 4507 #ifdef MULTIPROCESSOR 4508 extern bool x86_mp_online; 4509 struct cpu_info *ci; 4510 struct pmap_mbox *mb, *selfmb; 4511 CPU_INFO_ITERATOR cii; 4512 uintptr_t head; 4513 u_int count; 4514 int s; 4515 #endif /* MULTIPROCESSOR */ 4516 struct cpu_info *self; 4517 bool kernel; 4518 4519 KASSERT(eva == 0 || eva >= sva); 4520 KASSERT(kpreempt_disabled()); 4521 4522 if (pte & PG_PS) 4523 sva &= PG_LGFRAME; 4524 pte &= PG_G; 4525 self = curcpu(); 4526 4527 if (sva == (vaddr_t)-1LL) { 4528 kernel = true; 4529 } else { 4530 if (eva == 0) 4531 eva = sva + PAGE_SIZE; 4532 kernel = sva >= VM_MAXUSER_ADDRESS; 4533 KASSERT(kernel == (eva > VM_MAXUSER_ADDRESS)); 4534 } 4535 4536 /* 4537 * if tearing down the pmap, do nothing. we'll flush later 4538 * when we're ready to recycle/destroy it. 4539 */ 4540 if (__predict_false(curlwp->l_md.md_gc_pmap == pm)) { 4541 return; 4542 } 4543 4544 /* 4545 * If the range is larger than 32 pages, then invalidate 4546 * everything. 4547 */ 4548 if (sva != (vaddr_t)-1LL && eva - sva > (32 * PAGE_SIZE)) { 4549 sva = (vaddr_t)-1LL; 4550 eva = sva; 4551 } 4552 4553 #ifdef MULTIPROCESSOR 4554 if (ncpu > 1 && x86_mp_online) { 4555 selfmb = &self->ci_pmap_cpu->pc_mbox; 4556 4557 /* 4558 * If the CPUs have no notion of global pages then 4559 * reload of %cr3 is sufficient. 4560 */ 4561 if (pte != 0 && (cpu_feature & CPUID_PGE) == 0) 4562 pte = 0; 4563 4564 if (pm == pmap_kernel()) { 4565 /* 4566 * Mapped on all CPUs: use the broadcast mechanism. 4567 * Once we have the lock, increment the counter. 4568 */ 4569 s = splvm(); 4570 mb = &pmap_mbox; 4571 count = SPINLOCK_BACKOFF_MIN; 4572 do { 4573 if ((head = mb->mb_head) != mb->mb_tail) { 4574 splx(s); 4575 while ((head = mb->mb_head) != 4576 mb->mb_tail) 4577 SPINLOCK_BACKOFF(count); 4578 s = splvm(); 4579 } 4580 } while (atomic_cas_ulong( 4581 (volatile u_long *)&mb->mb_head, 4582 head, head + ncpu - 1) != head); 4583 4584 /* 4585 * Once underway we must stay at IPL_VM until the 4586 * IPI is dispatched. Otherwise interrupt handlers 4587 * on this CPU can deadlock against us. 4588 */ 4589 pmap_tlb_evcnt.ev_count++; 4590 mb->mb_pointer = self; 4591 mb->mb_addr1 = sva; 4592 mb->mb_addr2 = eva; 4593 mb->mb_global = pte; 4594 x86_ipi(LAPIC_TLB_BCAST_VECTOR, LAPIC_DEST_ALLEXCL, 4595 LAPIC_DLMODE_FIXED); 4596 self->ci_need_tlbwait = 1; 4597 splx(s); 4598 } else if ((pm->pm_cpus & ~self->ci_cpumask) != 0 || 4599 (kernel && (pm->pm_kernel_cpus & ~self->ci_cpumask) != 0)) { 4600 /* 4601 * We don't bother traversing the CPU list if only 4602 * used by this CPU. 4603 * 4604 * We can't do global flushes with the multicast 4605 * mechanism. 
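			 * hence the KASSERT(pte == 0) below.  each target
			 * CPU's mailbox is claimed with a CAS on mb_pointer,
			 * and selfmb->mb_head is advanced once per IPI sent
			 * so that pmap_tlb_shootwait() can later wait for
			 * mb_tail to catch up.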
4606 */ 4607 KASSERT(pte == 0); 4608 4609 /* 4610 * Take ownership of the shootdown mailbox on each 4611 * CPU, fill the details and fire it off. 4612 */ 4613 s = splvm(); 4614 for (CPU_INFO_FOREACH(cii, ci)) { 4615 if (ci == self || 4616 !pmap_is_active(pm, ci, kernel) || 4617 !(ci->ci_flags & CPUF_RUNNING)) 4618 continue; 4619 selfmb->mb_head++; 4620 mb = &ci->ci_pmap_cpu->pc_mbox; 4621 count = SPINLOCK_BACKOFF_MIN; 4622 while (atomic_cas_ulong( 4623 (u_long *)&mb->mb_pointer, 4624 0, (u_long)&selfmb->mb_tail) != 0) { 4625 splx(s); 4626 while (mb->mb_pointer != 0) 4627 SPINLOCK_BACKOFF(count); 4628 s = splvm(); 4629 } 4630 mb->mb_addr1 = sva; 4631 mb->mb_addr2 = eva; 4632 mb->mb_global = pte; 4633 if (x86_ipi(LAPIC_TLB_MCAST_VECTOR, 4634 ci->ci_cpuid, LAPIC_DLMODE_FIXED)) 4635 panic("pmap_tlb_shootdown: ipi failed"); 4636 } 4637 self->ci_need_tlbwait = 1; 4638 splx(s); 4639 } 4640 } 4641 #endif /* MULTIPROCESSOR */ 4642 4643 /* Update the current CPU before waiting for others. */ 4644 if (!pmap_is_active(pm, self, kernel)) 4645 return; 4646 4647 if (sva == (vaddr_t)-1LL) { 4648 u_int gen = uvm_emap_gen_return(); 4649 if (pte != 0) { 4650 tlbflushg(); 4651 } else { 4652 tlbflush(); 4653 } 4654 uvm_emap_update(gen); 4655 } else { 4656 do { 4657 pmap_update_pg(sva); 4658 sva += PAGE_SIZE; 4659 } while (sva < eva); 4660 } 4661 } 4662 4663 /* 4664 * pmap_tlb_shootwait: wait for pending TLB shootdowns to complete 4665 * 4666 * => only waits for operations generated by the current CPU 4667 * => must be called with preemption disabled 4668 */ 4669 4670 void 4671 pmap_tlb_shootwait(void) 4672 { 4673 struct cpu_info *self; 4674 struct pmap_mbox *mb; 4675 4676 KASSERT(kpreempt_disabled()); 4677 4678 /* 4679 * Anything to do? XXX Really we want to avoid touching the cache 4680 * lines of the two mailboxes, but the processor may read ahead. 4681 */ 4682 self = curcpu(); 4683 if (!self->ci_need_tlbwait) 4684 return; 4685 self->ci_need_tlbwait = 0; 4686 4687 /* If we own the global mailbox, wait for it to drain. */ 4688 mb = &pmap_mbox; 4689 while (mb->mb_pointer == self && mb->mb_head != mb->mb_tail) 4690 x86_pause(); 4691 4692 /* If we own other CPU's mailboxes, wait for them to drain. */ 4693 mb = &self->ci_pmap_cpu->pc_mbox; 4694 KASSERT(mb->mb_pointer != &mb->mb_tail); 4695 while (mb->mb_head != mb->mb_tail) 4696 x86_pause(); 4697 } 4698 4699 /* 4700 * pmap_update: process deferred invalidations 4701 */ 4702 4703 void 4704 pmap_update(struct pmap *pmap) 4705 { 4706 struct vm_page *ptp, *empty_ptps; 4707 struct pmap_page *pp; 4708 lwp_t *l; 4709 4710 /* 4711 * if we have torn down this pmap, invalidate non-global TLB 4712 * entries on any processors using it. 4713 */ 4714 l = curlwp; 4715 if (__predict_false(l->l_md.md_gc_pmap == pmap)) { 4716 l->l_md.md_gc_pmap = NULL; 4717 KPREEMPT_DISABLE(l); 4718 pmap_tlb_shootdown(pmap, -1, -1, 0); 4719 KPREEMPT_ENABLE(l); 4720 } 4721 4722 /* 4723 * wait for tlb shootdowns to complete before returning control 4724 * to the caller. 4725 */ 4726 kpreempt_disable(); 4727 pmap_tlb_shootwait(); 4728 kpreempt_enable(); 4729 4730 /* 4731 * now that shootdowns are complete, process deferred frees, 4732 * but not from interrupt context. 
4733 */ 4734 if (l->l_md.md_gc_ptp != NULL) { 4735 if (cpu_intr_p() || (l->l_pflag & LP_INTR) != 0) { 4736 return; 4737 } 4738 4739 empty_ptps = l->l_md.md_gc_ptp; 4740 l->l_md.md_gc_ptp = NULL; 4741 4742 while ((ptp = empty_ptps) != NULL) { 4743 ptp->flags |= PG_ZERO; 4744 pp = VM_PAGE_TO_PP(ptp); 4745 empty_ptps = pp->pp_link; 4746 LIST_INIT(&pp->pp_head.pvh_list); 4747 uvm_pagefree(ptp); 4748 } 4749 } 4750 } 4751 4752 #if PTP_LEVELS > 4 4753 #error "Unsupported number of page table mappings" 4754 #endif 4755 4756 paddr_t 4757 pmap_init_tmp_pgtbl(paddr_t pg) 4758 { 4759 static bool maps_loaded; 4760 static const paddr_t x86_tmp_pml_paddr[] = { 4761 4 * PAGE_SIZE, 4762 5 * PAGE_SIZE, 4763 6 * PAGE_SIZE, 4764 7 * PAGE_SIZE 4765 }; 4766 static vaddr_t x86_tmp_pml_vaddr[] = { 0, 0, 0, 0 }; 4767 4768 pd_entry_t *tmp_pml, *kernel_pml; 4769 4770 int level; 4771 4772 if (!maps_loaded) { 4773 for (level = 0; level < PTP_LEVELS; ++level) { 4774 x86_tmp_pml_vaddr[level] = 4775 uvm_km_alloc(kernel_map, PAGE_SIZE, 0, 4776 UVM_KMF_VAONLY); 4777 4778 if (x86_tmp_pml_vaddr[level] == 0) 4779 panic("mapping of real mode PML failed\n"); 4780 pmap_kenter_pa(x86_tmp_pml_vaddr[level], 4781 x86_tmp_pml_paddr[level], 4782 VM_PROT_READ | VM_PROT_WRITE, 0); 4783 pmap_update(pmap_kernel()); 4784 } 4785 maps_loaded = true; 4786 } 4787 4788 /* Zero levels 1-3 */ 4789 for (level = 0; level < PTP_LEVELS - 1; ++level) { 4790 tmp_pml = (void *)x86_tmp_pml_vaddr[level]; 4791 memset(tmp_pml, 0, PAGE_SIZE); 4792 } 4793 4794 /* Copy PML4 */ 4795 kernel_pml = pmap_kernel()->pm_pdir; 4796 tmp_pml = (void *)x86_tmp_pml_vaddr[PTP_LEVELS - 1]; 4797 memcpy(tmp_pml, kernel_pml, PAGE_SIZE); 4798 4799 for (level = PTP_LEVELS - 1; level > 0; --level) { 4800 tmp_pml = (void *)x86_tmp_pml_vaddr[level]; 4801 4802 tmp_pml[pl_i(pg, level + 1)] = 4803 (x86_tmp_pml_paddr[level - 1] & PG_FRAME) | PG_RW | PG_V; 4804 } 4805 4806 tmp_pml = (void *)x86_tmp_pml_vaddr[0]; 4807 tmp_pml[pl_i(pg, 1)] = (pg & PG_FRAME) | PG_RW | PG_V; 4808 4809 return x86_tmp_pml_paddr[PTP_LEVELS - 1]; 4810 } 4811