1 /* $NetBSD: pmap.c,v 1.102 2010/02/10 00:39:30 jym Exp $ */ 2 3 /* 4 * Copyright (c) 2007 Manuel Bouyer. 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions 8 * are met: 9 * 1. Redistributions of source code must retain the above copyright 10 * notice, this list of conditions and the following disclaimer. 11 * 2. Redistributions in binary form must reproduce the above copyright 12 * notice, this list of conditions and the following disclaimer in the 13 * documentation and/or other materials provided with the distribution. 14 * 15 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 16 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 17 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 18 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 19 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 20 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 21 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 22 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 24 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 * 26 */ 27 28 /* 29 * Copyright (c) 2006 Mathieu Ropert <mro@adviseo.fr> 30 * 31 * Permission to use, copy, modify, and distribute this software for any 32 * purpose with or without fee is hereby granted, provided that the above 33 * copyright notice and this permission notice appear in all copies. 34 * 35 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 36 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 37 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 38 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 39 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 40 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 41 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 42 */ 43 44 /* 45 * 46 * Copyright (c) 1997 Charles D. Cranor and Washington University. 47 * All rights reserved. 48 * 49 * Redistribution and use in source and binary forms, with or without 50 * modification, are permitted provided that the following conditions 51 * are met: 52 * 1. Redistributions of source code must retain the above copyright 53 * notice, this list of conditions and the following disclaimer. 54 * 2. Redistributions in binary form must reproduce the above copyright 55 * notice, this list of conditions and the following disclaimer in the 56 * documentation and/or other materials provided with the distribution. 57 * 3. All advertising materials mentioning features or use of this software 58 * must display the following acknowledgement: 59 * This product includes software developed by Charles D. Cranor and 60 * Washington University. 61 * 4. The name of the author may not be used to endorse or promote products 62 * derived from this software without specific prior written permission. 63 * 64 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 65 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 66 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
67 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 68 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 69 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 70 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 71 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 72 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 73 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 74 */ 75 76 /* 77 * Copyright 2001 (c) Wasabi Systems, Inc. 78 * All rights reserved. 79 * 80 * Written by Frank van der Linden for Wasabi Systems, Inc. 81 * 82 * Redistribution and use in source and binary forms, with or without 83 * modification, are permitted provided that the following conditions 84 * are met: 85 * 1. Redistributions of source code must retain the above copyright 86 * notice, this list of conditions and the following disclaimer. 87 * 2. Redistributions in binary form must reproduce the above copyright 88 * notice, this list of conditions and the following disclaimer in the 89 * documentation and/or other materials provided with the distribution. 90 * 3. All advertising materials mentioning features or use of this software 91 * must display the following acknowledgement: 92 * This product includes software developed for the NetBSD Project by 93 * Wasabi Systems, Inc. 94 * 4. The name of Wasabi Systems, Inc. may not be used to endorse 95 * or promote products derived from this software without specific prior 96 * written permission. 97 * 98 * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND 99 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 100 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 101 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL WASABI SYSTEMS, INC 102 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 103 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 104 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 105 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 106 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 107 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 108 * POSSIBILITY OF SUCH DAMAGE. 109 */ 110 111 /* 112 * This is the i386 pmap modified and generalized to support x86-64 113 * as well. The idea is to hide the upper N levels of the page tables 114 * inside pmap_get_ptp, pmap_free_ptp and pmap_growkernel. The rest 115 * is mostly untouched, except that it uses some more generalized 116 * macros and interfaces. 117 * 118 * This pmap has been tested on the i386 as well, and it can be easily 119 * adapted to PAE. 120 * 121 * fvdl@wasabisystems.com 18-Jun-2001 122 */ 123 124 /* 125 * pmap.c: i386 pmap module rewrite 126 * Chuck Cranor <chuck@ccrc.wustl.edu> 127 * 11-Aug-97 128 * 129 * history of this pmap module: in addition to my own input, i used 130 * the following references for this rewrite of the i386 pmap: 131 * 132 * [1] the NetBSD i386 pmap. this pmap appears to be based on the 133 * BSD hp300 pmap done by Mike Hibler at University of Utah. 134 * it was then ported to the i386 by William Jolitz of UUNET 135 * Technologies, Inc. Then Charles M. Hannum of the NetBSD 136 * project fixed some bugs and provided some speed ups. 137 * 138 * [2] the FreeBSD i386 pmap. 
this pmap seems to be the 139 * Hibler/Jolitz pmap, as modified for FreeBSD by John S. Dyson 140 * and David Greenman. 141 * 142 * [3] the Mach pmap. this pmap, from CMU, seems to have migrated 143 * between several processors. the VAX version was done by 144 * Avadis Tevanian, Jr., and Michael Wayne Young. the i386 145 * version was done by Lance Berc, Mike Kupfer, Bob Baron, 146 * David Golub, and Richard Draves. the alpha version was 147 * done by Alessandro Forin (CMU/Mach) and Chris Demetriou 148 * (NetBSD/alpha). 149 */ 150 151 #include <sys/cdefs.h> 152 __KERNEL_RCSID(0, "$NetBSD: pmap.c,v 1.102 2010/02/10 00:39:30 jym Exp $"); 153 154 #include "opt_user_ldt.h" 155 #include "opt_lockdebug.h" 156 #include "opt_multiprocessor.h" 157 #include "opt_xen.h" 158 #if !defined(__x86_64__) 159 #include "opt_kstack_dr0.h" 160 #endif /* !defined(__x86_64__) */ 161 162 #include <sys/param.h> 163 #include <sys/systm.h> 164 #include <sys/proc.h> 165 #include <sys/pool.h> 166 #include <sys/kernel.h> 167 #include <sys/atomic.h> 168 #include <sys/cpu.h> 169 #include <sys/intr.h> 170 #include <sys/xcall.h> 171 172 #include <uvm/uvm.h> 173 174 #include <dev/isa/isareg.h> 175 176 #include <machine/specialreg.h> 177 #include <machine/gdt.h> 178 #include <machine/isa_machdep.h> 179 #include <machine/cpuvar.h> 180 181 #include <x86/pmap.h> 182 #include <x86/pmap_pv.h> 183 184 #include <x86/i82489reg.h> 185 #include <x86/i82489var.h> 186 187 #ifdef XEN 188 #include <xen/xen3-public/xen.h> 189 #include <xen/hypervisor.h> 190 #endif 191 192 /* flag to be used for kernel mappings: PG_u on Xen/amd64, 0 otherwise */ 193 #if defined(XEN) && defined(__x86_64__) 194 #define PG_k PG_u 195 #else 196 #define PG_k 0 197 #endif 198 199 /* 200 * general info: 201 * 202 * - for an explanation of how the i386 MMU hardware works see 203 * the comments in <machine/pte.h>. 204 * 205 * - for an explanation of the general memory structure used by 206 * this pmap (including the recursive mapping), see the comments 207 * in <machine/pmap.h>. 208 * 209 * this file contains the code for the "pmap module." the module's 210 * job is to manage the hardware's virtual to physical address mappings. 211 * note that there are two levels of mapping in the VM system: 212 * 213 * [1] the upper layer of the VM system uses vm_map's and vm_map_entry's 214 * to map ranges of virtual address space to objects/files. for 215 * example, the vm_map may say: "map VA 0x1000 to 0x22000 read-only 216 * to the file /bin/ls starting at offset zero." note that 217 * the upper layer mapping is not concerned with how individual 218 * vm_pages are mapped. 219 * 220 * [2] the lower layer of the VM system (the pmap) maintains the mappings 221 * from virtual addresses. it is concerned with which vm_page is 222 * mapped where. for example, when you run /bin/ls and start 223 * at page 0x1000 the fault routine may lookup the correct page 224 * of the /bin/ls file and then ask the pmap layer to establish 225 * a mapping for it. 226 * 227 * note that information in the lower layer of the VM system can be 228 * thrown away since it can easily be reconstructed from the info 229 * in the upper layer. 230 * 231 * data structures we use include: 232 * 233 * - struct pmap: describes the address space of one thread 234 * - struct pv_entry: describes one <PMAP,VA> mapping of a PA 235 * - struct pv_head: there is one pv_head per managed page of 236 * physical memory. 
the pv_head points to a list of pv_entry 237 * structures which describe all the <PMAP,VA> pairs that this 238 * page is mapped in. this is critical for page based operations 239 * such as pmap_page_protect() [change protection on _all_ mappings 240 * of a page] 241 */ 242 243 /* 244 * memory allocation 245 * 246 * - there are three data structures that we must dynamically allocate: 247 * 248 * [A] new process' page directory page (PDP) 249 * - plan 1: done at pmap_create() we use 250 * uvm_km_alloc(kernel_map, PAGE_SIZE) [fka kmem_alloc] to do this 251 * allocation. 252 * 253 * if we are low in free physical memory then we sleep in 254 * uvm_km_alloc -- in this case this is ok since we are creating 255 * a new pmap and should not be holding any locks. 256 * 257 * if the kernel is totally out of virtual space 258 * (i.e. uvm_km_alloc returns NULL), then we panic. 259 * 260 * [B] new page tables pages (PTP) 261 * - call uvm_pagealloc() 262 * => success: zero page, add to pm_pdir 263 * => failure: we are out of free vm_pages, let pmap_enter() 264 * tell UVM about it. 265 * 266 * note: for kernel PTPs, we start with NKPTP of them. as we map 267 * kernel memory (at uvm_map time) we check to see if we've grown 268 * the kernel pmap. if so, we call the optional function 269 * pmap_growkernel() to grow the kernel PTPs in advance. 270 * 271 * [C] pv_entry structures 272 */ 273 274 /* 275 * locking 276 * 277 * we have the following locks that we must contend with: 278 * 279 * mutexes: 280 * 281 * - pmap lock (per pmap, part of uvm_object) 282 * this lock protects the fields in the pmap structure including 283 * the non-kernel PDEs in the PDP, and the PTEs. it also locks 284 * in the alternate PTE space (since that is determined by the 285 * entry in the PDP). 286 * 287 * - pvh_lock (per pv_head) 288 * this lock protects the pv_entry list which is chained off the 289 * pv_head structure for a specific managed PA. it is locked 290 * when traversing the list (e.g. adding/removing mappings, 291 * syncing R/M bits, etc.) 292 * 293 * - pmaps_lock 294 * this lock protects the list of active pmaps (headed by "pmaps"). 295 * we lock it when adding or removing pmaps from this list. 296 * 297 * tlb shootdown 298 * 299 * tlb shootdowns are hard interrupts that operate outside the spl 300 * framework: they don't need to be blocked provided that the pmap module 301 * gets the order of events correct. the calls are made by talking directly 302 * to the lapic. the stubs to handle the interrupts are quite short and do 303 * one of the following: invalidate a single page, a range of pages, all 304 * user tlb entries or the entire tlb. 305 * 306 * the cpus synchronize with each other using pmap_mbox structures which are 307 * aligned on 64-byte cache lines. tlb shootdowns against the kernel pmap 308 * use a global mailbox and are generated using a broadcast ipi (broadcast 309 * to all but the sending cpu). shootdowns against regular pmaps use 310 * per-cpu mailboxes and are multicast. kernel and user shootdowns can 311 * execute simultaneously, as can shootdowns within different multithreaded 312 * processes. TODO: 313 * 314 * 1. figure out which waitpoints can be deferered to pmap_update(). 315 * 2. see if there is a cheap way to batch some updates. 
316 */ 317 318 const vaddr_t ptp_masks[] = PTP_MASK_INITIALIZER; 319 const int ptp_shifts[] = PTP_SHIFT_INITIALIZER; 320 const long nkptpmax[] = NKPTPMAX_INITIALIZER; 321 const long nbpd[] = NBPD_INITIALIZER; 322 pd_entry_t * const normal_pdes[] = PDES_INITIALIZER; 323 pd_entry_t * const alternate_pdes[] = APDES_INITIALIZER; 324 325 long nkptp[] = NKPTP_INITIALIZER; 326 327 static kmutex_t pmaps_lock; 328 329 static vaddr_t pmap_maxkvaddr; 330 331 #define COUNT(x) /* nothing */ 332 333 /* 334 * XXX kludge: dummy locking to make KASSERTs in uvm_page.c comfortable. 335 * actual locking is done by pm_lock. 336 */ 337 #if defined(DIAGNOSTIC) 338 #define PMAP_SUBOBJ_LOCK(pm, idx) \ 339 KASSERT(mutex_owned(&(pm)->pm_lock)); \ 340 if ((idx) != 0) \ 341 mutex_enter(&(pm)->pm_obj[(idx)].vmobjlock) 342 #define PMAP_SUBOBJ_UNLOCK(pm, idx) \ 343 KASSERT(mutex_owned(&(pm)->pm_lock)); \ 344 if ((idx) != 0) \ 345 mutex_exit(&(pm)->pm_obj[(idx)].vmobjlock) 346 #else /* defined(DIAGNOSTIC) */ 347 #define PMAP_SUBOBJ_LOCK(pm, idx) /* nothing */ 348 #define PMAP_SUBOBJ_UNLOCK(pm, idx) /* nothing */ 349 #endif /* defined(DIAGNOSTIC) */ 350 351 /* 352 * Misc. event counters. 353 */ 354 struct evcnt pmap_iobmp_evcnt; 355 struct evcnt pmap_ldt_evcnt; 356 357 /* 358 * Global TLB shootdown mailbox. 359 */ 360 struct evcnt pmap_tlb_evcnt __aligned(64); 361 struct pmap_mbox pmap_mbox __aligned(64); 362 363 /* 364 * Per-CPU data. The pmap mailbox is cache intensive so gets its 365 * own line. Note that the mailbox must be the first item. 366 */ 367 struct pmap_cpu { 368 /* TLB shootdown */ 369 struct pmap_mbox pc_mbox; 370 }; 371 372 union { 373 struct pmap_cpu pc; 374 uint8_t padding[64]; 375 } pmap_cpu[MAXCPUS] __aligned(64); 376 377 /* 378 * global data structures 379 */ 380 381 static struct pmap kernel_pmap_store; /* the kernel's pmap (proc0) */ 382 struct pmap *const kernel_pmap_ptr = &kernel_pmap_store; 383 384 /* 385 * pmap_pg_g: if our processor supports PG_G in the PTE then we 386 * set pmap_pg_g to PG_G (otherwise it is zero). 387 */ 388 389 int pmap_pg_g = 0; 390 391 /* 392 * pmap_largepages: if our processor supports PG_PS and we are 393 * using it, this is set to true. 394 */ 395 396 int pmap_largepages; 397 398 /* 399 * i386 physical memory comes in a big contig chunk with a small 400 * hole toward the front of it... the following two paddr_t's 401 * (shared with machdep.c) describe the physical address space 402 * of this machine. 
403 */ 404 paddr_t avail_start; /* PA of first available physical page */ 405 paddr_t avail_end; /* PA of last available physical page */ 406 407 #ifdef XEN 408 #ifdef __x86_64__ 409 /* Dummy PGD for user cr3, used between pmap_deactivate() and pmap_activate() */ 410 static paddr_t xen_dummy_user_pgd; 411 /* Currently active user PGD (can't use rcr3()) */ 412 static paddr_t xen_current_user_pgd = 0; 413 #endif /* __x86_64__ */ 414 paddr_t pmap_pa_start; /* PA of first physical page for this domain */ 415 paddr_t pmap_pa_end; /* PA of last physical page for this domain */ 416 #endif /* XEN */ 417 418 #define VM_PAGE_TO_PP(pg) (&(pg)->mdpage.mp_pp) 419 420 #define pp_lock(pp) mutex_spin_enter(&(pp)->pp_lock) 421 #define pp_unlock(pp) mutex_spin_exit(&(pp)->pp_lock) 422 #define pp_locked(pp) mutex_owned(&(pp)->pp_lock) 423 424 #define PV_HASH_SIZE 32768 425 #define PV_HASH_LOCK_CNT 32 426 427 struct pv_hash_lock { 428 kmutex_t lock; 429 } __aligned(CACHE_LINE_SIZE) pv_hash_locks[PV_HASH_LOCK_CNT] 430 __aligned(CACHE_LINE_SIZE); 431 432 struct pv_hash_head { 433 SLIST_HEAD(, pv_entry) hh_list; 434 } pv_hash_heads[PV_HASH_SIZE]; 435 436 static u_int 437 pvhash_hash(struct vm_page *ptp, vaddr_t va) 438 { 439 440 return (uintptr_t)ptp / sizeof(*ptp) + (va >> PAGE_SHIFT); 441 } 442 443 static struct pv_hash_head * 444 pvhash_head(u_int hash) 445 { 446 447 return &pv_hash_heads[hash % PV_HASH_SIZE]; 448 } 449 450 static kmutex_t * 451 pvhash_lock(u_int hash) 452 { 453 454 return &pv_hash_locks[hash % PV_HASH_LOCK_CNT].lock; 455 } 456 457 static struct pv_entry * 458 pvhash_remove(struct pv_hash_head *hh, struct vm_page *ptp, vaddr_t va) 459 { 460 struct pv_entry *pve; 461 struct pv_entry *prev; 462 463 prev = NULL; 464 SLIST_FOREACH(pve, &hh->hh_list, pve_hash) { 465 if (pve->pve_pte.pte_ptp == ptp && 466 pve->pve_pte.pte_va == va) { 467 if (prev != NULL) { 468 SLIST_REMOVE_AFTER(prev, pve_hash); 469 } else { 470 SLIST_REMOVE_HEAD(&hh->hh_list, pve_hash); 471 } 472 break; 473 } 474 prev = pve; 475 } 476 return pve; 477 } 478 479 /* 480 * other data structures 481 */ 482 483 static pt_entry_t protection_codes[8]; /* maps MI prot to i386 prot code */ 484 static bool pmap_initialized = false; /* pmap_init done yet? */ 485 486 /* 487 * the following two vaddr_t's are used during system startup 488 * to keep track of how much of the kernel's VM space we have used. 489 * once the system is started, the management of the remaining kernel 490 * VM space is turned over to the kernel_map vm_map. 491 */ 492 493 static vaddr_t virtual_avail; /* VA of first free KVA */ 494 static vaddr_t virtual_end; /* VA of last free KVA */ 495 496 /* 497 * linked list of all non-kernel pmaps 498 */ 499 500 static struct pmap_head pmaps; 501 502 /* 503 * pool that pmap structures are allocated from 504 */ 505 506 static struct pool_cache pmap_cache; 507 508 /* 509 * pv_entry cache 510 */ 511 512 static struct pool_cache pmap_pv_cache; 513 514 /* 515 * MULTIPROCESSOR: special VA's/ PTE's are actually allocated inside a 516 * maxcpus*NPTECL array of PTE's, to avoid cache line thrashing 517 * due to false sharing. 
518 */ 519 520 #ifdef MULTIPROCESSOR 521 #define PTESLEW(pte, id) ((pte)+(id)*NPTECL) 522 #define VASLEW(va,id) ((va)+(id)*NPTECL*PAGE_SIZE) 523 #else 524 #define PTESLEW(pte, id) (pte) 525 #define VASLEW(va,id) (va) 526 #endif 527 528 /* 529 * special VAs and the PTEs that map them 530 */ 531 static pt_entry_t *csrc_pte, *cdst_pte, *zero_pte, *ptp_pte, *early_zero_pte; 532 static char *csrcp, *cdstp, *zerop, *ptpp, *early_zerop; 533 534 /* 535 * pool and cache that PDPs are allocated from 536 */ 537 538 static struct pool_cache pmap_pdp_cache; 539 int pmap_pdp_ctor(void *, void *, int); 540 void pmap_pdp_dtor(void *, void *); 541 #ifdef PAE 542 /* need to allocate items of 4 pages */ 543 void *pmap_pdp_alloc(struct pool *, int); 544 void pmap_pdp_free(struct pool *, void *); 545 static struct pool_allocator pmap_pdp_allocator = { 546 .pa_alloc = pmap_pdp_alloc, 547 .pa_free = pmap_pdp_free, 548 .pa_pagesz = PAGE_SIZE * PDP_SIZE, 549 }; 550 #endif /* PAE */ 551 552 void *vmmap; /* XXX: used by mem.c... it should really uvm_map_reserve it */ 553 554 extern vaddr_t idt_vaddr; /* we allocate IDT early */ 555 extern paddr_t idt_paddr; 556 557 #ifdef _LP64 558 extern vaddr_t lo32_vaddr; 559 extern vaddr_t lo32_paddr; 560 #endif 561 562 extern int end; 563 564 #ifdef i386 565 /* stuff to fix the pentium f00f bug */ 566 extern vaddr_t pentium_idt_vaddr; 567 #endif 568 569 570 /* 571 * local prototypes 572 */ 573 574 static struct vm_page *pmap_get_ptp(struct pmap *, vaddr_t, 575 pd_entry_t * const *); 576 static struct vm_page *pmap_find_ptp(struct pmap *, vaddr_t, paddr_t, int); 577 static void pmap_freepage(struct pmap *, struct vm_page *, int); 578 static void pmap_free_ptp(struct pmap *, struct vm_page *, 579 vaddr_t, pt_entry_t *, 580 pd_entry_t * const *); 581 static bool pmap_is_curpmap(struct pmap *); 582 static bool pmap_is_active(struct pmap *, struct cpu_info *, bool); 583 static void pmap_map_ptes(struct pmap *, struct pmap **, 584 pt_entry_t **, pd_entry_t * const **); 585 static bool pmap_remove_pte(struct pmap *, struct vm_page *, 586 pt_entry_t *, vaddr_t, 587 struct pv_entry **); 588 static pt_entry_t pmap_remove_ptes(struct pmap *, struct vm_page *, 589 vaddr_t, vaddr_t, vaddr_t, 590 struct pv_entry **); 591 592 static void pmap_unmap_ptes(struct pmap *, struct pmap *); 593 static bool pmap_get_physpage(vaddr_t, int, paddr_t *); 594 static int pmap_pdes_invalid(vaddr_t, pd_entry_t * const *, 595 pd_entry_t *); 596 #define pmap_pdes_valid(va, pdes, lastpde) \ 597 (pmap_pdes_invalid((va), (pdes), (lastpde)) == 0) 598 static void pmap_alloc_level(pd_entry_t * const *, vaddr_t, int, 599 long *); 600 601 static bool pmap_reactivate(struct pmap *); 602 603 /* 604 * p m a p h e l p e r f u n c t i o n s 605 */ 606 607 static inline void 608 pmap_stats_update(struct pmap *pmap, int resid_diff, int wired_diff) 609 { 610 611 if (pmap == pmap_kernel()) { 612 atomic_add_long(&pmap->pm_stats.resident_count, resid_diff); 613 atomic_add_long(&pmap->pm_stats.wired_count, wired_diff); 614 } else { 615 KASSERT(mutex_owned(&pmap->pm_lock)); 616 pmap->pm_stats.resident_count += resid_diff; 617 pmap->pm_stats.wired_count += wired_diff; 618 } 619 } 620 621 static inline void 622 pmap_stats_update_bypte(struct pmap *pmap, pt_entry_t npte, pt_entry_t opte) 623 { 624 int resid_diff = ((npte & PG_V) ? 1 : 0) - ((opte & PG_V) ? 1 : 0); 625 int wired_diff = ((npte & PG_W) ? 1 : 0) - ((opte & PG_W) ? 
1 : 0); 626 627 KASSERT((npte & (PG_V | PG_W)) != PG_W); 628 KASSERT((opte & (PG_V | PG_W)) != PG_W); 629 630 pmap_stats_update(pmap, resid_diff, wired_diff); 631 } 632 633 /* 634 * ptp_to_pmap: lookup pmap by ptp 635 */ 636 637 static struct pmap * 638 ptp_to_pmap(struct vm_page *ptp) 639 { 640 struct pmap *pmap; 641 642 if (ptp == NULL) { 643 return pmap_kernel(); 644 } 645 pmap = (struct pmap *)ptp->uobject; 646 KASSERT(pmap != NULL); 647 KASSERT(&pmap->pm_obj[0] == ptp->uobject); 648 return pmap; 649 } 650 651 static inline struct pv_pte * 652 pve_to_pvpte(struct pv_entry *pve) 653 { 654 655 KASSERT((void *)&pve->pve_pte == (void *)pve); 656 return &pve->pve_pte; 657 } 658 659 static inline struct pv_entry * 660 pvpte_to_pve(struct pv_pte *pvpte) 661 { 662 struct pv_entry *pve = (void *)pvpte; 663 664 KASSERT(pve_to_pvpte(pve) == pvpte); 665 return pve; 666 } 667 668 /* 669 * pv_pte_first, pv_pte_next: PV list iterator. 670 */ 671 672 static struct pv_pte * 673 pv_pte_first(struct pmap_page *pp) 674 { 675 676 KASSERT(pp_locked(pp)); 677 if ((pp->pp_flags & PP_EMBEDDED) != 0) { 678 return &pp->pp_pte; 679 } 680 return pve_to_pvpte(LIST_FIRST(&pp->pp_head.pvh_list)); 681 } 682 683 static struct pv_pte * 684 pv_pte_next(struct pmap_page *pp, struct pv_pte *pvpte) 685 { 686 687 KASSERT(pvpte != NULL); 688 KASSERT(pp_locked(pp)); 689 if (pvpte == &pp->pp_pte) { 690 KASSERT((pp->pp_flags & PP_EMBEDDED) != 0); 691 return NULL; 692 } 693 KASSERT((pp->pp_flags & PP_EMBEDDED) == 0); 694 return pve_to_pvpte(LIST_NEXT(pvpte_to_pve(pvpte), pve_list)); 695 } 696 697 /* 698 * pmap_is_curpmap: is this pmap the one currently loaded [in %cr3]? 699 * of course the kernel is always loaded 700 */ 701 702 inline static bool 703 pmap_is_curpmap(struct pmap *pmap) 704 { 705 #if defined(XEN) && defined(__x86_64__) 706 /* 707 * Only kernel pmap is physically loaded. 708 * User PGD may be active, but TLB will be flushed 709 * with HYPERVISOR_iret anyway, so let's say no 710 */ 711 return(pmap == pmap_kernel()); 712 #else /* XEN && __x86_64__*/ 713 return((pmap == pmap_kernel()) || 714 (pmap == curcpu()->ci_pmap)); 715 #endif 716 } 717 718 /* 719 * pmap_is_active: is this pmap loaded into the specified processor's %cr3? 720 */ 721 722 inline static bool 723 pmap_is_active(struct pmap *pmap, struct cpu_info *ci, bool kernel) 724 { 725 726 return (pmap == pmap_kernel() || 727 (pmap->pm_cpus & ci->ci_cpumask) != 0 || 728 (kernel && (pmap->pm_kernel_cpus & ci->ci_cpumask) != 0)); 729 } 730 731 static void 732 pmap_apte_flush(struct pmap *pmap) 733 { 734 735 KASSERT(kpreempt_disabled()); 736 737 /* 738 * Flush the APTE mapping from all other CPUs that 739 * are using the pmap we are using (who's APTE space 740 * is the one we've just modified). 741 * 742 * XXXthorpej -- find a way to defer the IPI. 743 */ 744 pmap_tlb_shootdown(pmap, (vaddr_t)-1LL, 0, 0); 745 pmap_tlb_shootwait(); 746 } 747 748 /* 749 * Add a reference to the specified pmap. 
750 */ 751 752 inline void 753 pmap_reference(struct pmap *pmap) 754 { 755 756 atomic_inc_uint(&pmap->pm_obj[0].uo_refs); 757 } 758 759 /* 760 * pmap_map_ptes: map a pmap's PTEs into KVM and lock them in 761 * 762 * => we lock enough pmaps to keep things locked in 763 * => must be undone with pmap_unmap_ptes before returning 764 */ 765 766 static void 767 pmap_map_ptes(struct pmap *pmap, struct pmap **pmap2, 768 pd_entry_t **ptepp, pd_entry_t * const **pdeppp) 769 { 770 pd_entry_t opde, npde; 771 struct pmap *ourpmap; 772 struct cpu_info *ci; 773 struct lwp *l; 774 bool iscurrent; 775 uint64_t ncsw; 776 #ifdef XEN 777 int s; 778 #endif 779 780 /* the kernel's pmap is always accessible */ 781 if (pmap == pmap_kernel()) { 782 *pmap2 = NULL; 783 *ptepp = PTE_BASE; 784 *pdeppp = normal_pdes; 785 return; 786 } 787 KASSERT(kpreempt_disabled()); 788 789 retry: 790 l = curlwp; 791 ncsw = l->l_ncsw; 792 ourpmap = NULL; 793 ci = curcpu(); 794 #if defined(XEN) && defined(__x86_64__) 795 /* 796 * curmap can only be pmap_kernel so at this point 797 * pmap_is_curpmap is always false 798 */ 799 iscurrent = 0; 800 ourpmap = pmap_kernel(); 801 #else /* XEN && __x86_64__*/ 802 if (ci->ci_want_pmapload && 803 vm_map_pmap(&l->l_proc->p_vmspace->vm_map) == pmap) { 804 pmap_load(); 805 if (l->l_ncsw != ncsw) 806 goto retry; 807 } 808 iscurrent = pmap_is_curpmap(pmap); 809 /* if curpmap then we are always mapped */ 810 if (iscurrent) { 811 mutex_enter(&pmap->pm_lock); 812 *pmap2 = NULL; 813 *ptepp = PTE_BASE; 814 *pdeppp = normal_pdes; 815 goto out; 816 } 817 ourpmap = ci->ci_pmap; 818 #endif /* XEN && __x86_64__ */ 819 820 /* need to lock both curpmap and pmap: use ordered locking */ 821 pmap_reference(ourpmap); 822 if ((uintptr_t) pmap < (uintptr_t) ourpmap) { 823 mutex_enter(&pmap->pm_lock); 824 mutex_enter(&ourpmap->pm_lock); 825 } else { 826 mutex_enter(&ourpmap->pm_lock); 827 mutex_enter(&pmap->pm_lock); 828 } 829 830 if (l->l_ncsw != ncsw) 831 goto unlock_and_retry; 832 833 /* need to load a new alternate pt space into curpmap? */ 834 COUNT(apdp_pde_map); 835 opde = *APDP_PDE; 836 #ifdef XEN 837 if (!pmap_valid_entry(opde) || 838 pmap_pte2pa(opde) != pmap_pdirpa(pmap, 0)) { 839 int i; 840 s = splvm(); 841 /* Make recursive entry usable in user PGD */ 842 for (i = 0; i < PDP_SIZE; i++) { 843 npde = pmap_pa2pte( 844 pmap_pdirpa(pmap, i * NPDPG)) | PG_k | PG_V; 845 xpq_queue_pte_update( 846 xpmap_ptom(pmap_pdirpa(pmap, PDIR_SLOT_PTE + i)), 847 npde); 848 xpq_queue_pte_update(xpmap_ptetomach(&APDP_PDE[i]), 849 npde); 850 #ifdef PAE 851 /* update shadow entry too */ 852 xpq_queue_pte_update( 853 xpmap_ptetomach(&APDP_PDE_SHADOW[i]), npde); 854 #endif /* PAE */ 855 xpq_queue_invlpg( 856 (vaddr_t)&pmap->pm_pdir[PDIR_SLOT_PTE + i]); 857 } 858 xpq_flush_queue(); 859 if (pmap_valid_entry(opde)) 860 pmap_apte_flush(ourpmap); 861 splx(s); 862 } 863 #else /* XEN */ 864 npde = pmap_pa2pte(pmap_pdirpa(pmap, 0)) | PG_RW | PG_V; 865 if (!pmap_valid_entry(opde) || 866 pmap_pte2pa(opde) != pmap_pdirpa(pmap, 0)) { 867 pmap_pte_set(APDP_PDE, npde); 868 pmap_pte_flush(); 869 if (pmap_valid_entry(opde)) 870 pmap_apte_flush(ourpmap); 871 } 872 #endif /* XEN */ 873 *pmap2 = ourpmap; 874 *ptepp = APTE_BASE; 875 *pdeppp = alternate_pdes; 876 KASSERT(l->l_ncsw == ncsw); 877 #if !defined(XEN) || !defined(__x86_64__) 878 out: 879 #endif 880 /* 881 * might have blocked, need to retry? 
882 */ 883 if (l->l_ncsw != ncsw) { 884 unlock_and_retry: 885 if (ourpmap != NULL) { 886 mutex_exit(&ourpmap->pm_lock); 887 pmap_destroy(ourpmap); 888 } 889 mutex_exit(&pmap->pm_lock); 890 goto retry; 891 } 892 893 return; 894 } 895 896 /* 897 * pmap_unmap_ptes: unlock the PTE mapping of "pmap" 898 */ 899 900 static void 901 pmap_unmap_ptes(struct pmap *pmap, struct pmap *pmap2) 902 { 903 904 if (pmap == pmap_kernel()) { 905 return; 906 } 907 KASSERT(kpreempt_disabled()); 908 if (pmap2 == NULL) { 909 mutex_exit(&pmap->pm_lock); 910 } else { 911 #if defined(XEN) && defined(__x86_64__) 912 KASSERT(pmap2 == pmap_kernel()); 913 #else 914 KASSERT(curcpu()->ci_pmap == pmap2); 915 #endif 916 #if defined(MULTIPROCESSOR) 917 pmap_pte_set(APDP_PDE, 0); 918 pmap_pte_flush(); 919 pmap_apte_flush(pmap2); 920 #endif 921 COUNT(apdp_pde_unmap); 922 mutex_exit(&pmap->pm_lock); 923 mutex_exit(&pmap2->pm_lock); 924 pmap_destroy(pmap2); 925 } 926 } 927 928 inline static void 929 pmap_exec_account(struct pmap *pm, vaddr_t va, pt_entry_t opte, pt_entry_t npte) 930 { 931 932 #if !defined(__x86_64__) 933 if (curproc == NULL || curproc->p_vmspace == NULL || 934 pm != vm_map_pmap(&curproc->p_vmspace->vm_map)) 935 return; 936 937 if ((opte ^ npte) & PG_X) 938 pmap_update_pg(va); 939 940 /* 941 * Executability was removed on the last executable change. 942 * Reset the code segment to something conservative and 943 * let the trap handler deal with setting the right limit. 944 * We can't do that because of locking constraints on the vm map. 945 */ 946 947 if ((opte & PG_X) && (npte & PG_X) == 0 && va == pm->pm_hiexec) { 948 struct trapframe *tf = curlwp->l_md.md_regs; 949 950 tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL); 951 pm->pm_hiexec = I386_MAX_EXE_ADDR; 952 } 953 #endif /* !defined(__x86_64__) */ 954 } 955 956 #if !defined(__x86_64__) 957 /* 958 * Fixup the code segment to cover all potential executable mappings. 959 * returns 0 if no changes to the code segment were made. 960 */ 961 962 int 963 pmap_exec_fixup(struct vm_map *map, struct trapframe *tf, struct pcb *pcb) 964 { 965 struct vm_map_entry *ent; 966 struct pmap *pm = vm_map_pmap(map); 967 vaddr_t va = 0; 968 969 vm_map_lock_read(map); 970 for (ent = (&map->header)->next; ent != &map->header; ent = ent->next) { 971 972 /* 973 * This entry has greater va than the entries before. 974 * We need to make it point to the last page, not past it. 975 */ 976 977 if (ent->protection & VM_PROT_EXECUTE) 978 va = trunc_page(ent->end) - PAGE_SIZE; 979 } 980 vm_map_unlock_read(map); 981 if (va == pm->pm_hiexec && tf->tf_cs == GSEL(GUCODEBIG_SEL, SEL_UPL)) 982 return (0); 983 984 pm->pm_hiexec = va; 985 if (pm->pm_hiexec > I386_MAX_EXE_ADDR) { 986 tf->tf_cs = GSEL(GUCODEBIG_SEL, SEL_UPL); 987 } else { 988 tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL); 989 return (0); 990 } 991 return (1); 992 } 993 #endif /* !defined(__x86_64__) */ 994 995 /* 996 * p m a p k e n t e r f u n c t i o n s 997 * 998 * functions to quickly enter/remove pages from the kernel address 999 * space. pmap_kremove is exported to MI kernel. we make use of 1000 * the recursive PTE mappings. 
1001 */ 1002 1003 /* 1004 * pmap_kenter_pa: enter a kernel mapping without R/M (pv_entry) tracking 1005 * 1006 * => no need to lock anything, assume va is already allocated 1007 * => should be faster than normal pmap enter function 1008 */ 1009 1010 void 1011 pmap_kenter_pa(vaddr_t va, paddr_t pa, vm_prot_t prot, u_int flags) 1012 { 1013 pt_entry_t *pte, opte, npte; 1014 1015 KASSERT(!(prot & ~VM_PROT_ALL)); 1016 1017 if (va < VM_MIN_KERNEL_ADDRESS) 1018 pte = vtopte(va); 1019 else 1020 pte = kvtopte(va); 1021 #ifdef DOM0OPS 1022 if (pa < pmap_pa_start || pa >= pmap_pa_end) { 1023 #ifdef DEBUG 1024 printk("pmap_kenter_pa: pa 0x%" PRIx64 " for va 0x%" PRIx64 1025 " outside range\n", (int64_t)pa, (int64_t)va); 1026 #endif /* DEBUG */ 1027 npte = pa; 1028 } else 1029 #endif /* DOM0OPS */ 1030 npte = pmap_pa2pte(pa); 1031 npte |= protection_codes[prot] | PG_k | PG_V | pmap_pg_g; 1032 if (flags & PMAP_NOCACHE) 1033 npte |= PG_N; 1034 opte = pmap_pte_testset(pte, npte); /* zap! */ 1035 #if defined(DIAGNOSTIC) 1036 /* XXX For now... */ 1037 if (opte & PG_PS) 1038 panic("pmap_kenter_pa: PG_PS"); 1039 #endif 1040 if ((opte & (PG_V | PG_U)) == (PG_V | PG_U)) { 1041 /* This should not happen, so no need to batch updates. */ 1042 kpreempt_disable(); 1043 pmap_tlb_shootdown(pmap_kernel(), va, 0, opte); 1044 kpreempt_enable(); 1045 } 1046 } 1047 1048 void 1049 pmap_emap_enter(vaddr_t va, paddr_t pa, vm_prot_t prot) 1050 { 1051 pt_entry_t *pte, opte, npte; 1052 1053 KASSERT((prot & ~VM_PROT_ALL) == 0); 1054 pte = (va < VM_MIN_KERNEL_ADDRESS) ? vtopte(va) : kvtopte(va); 1055 1056 #ifdef DOM0OPS 1057 if (pa < pmap_pa_start || pa >= pmap_pa_end) { 1058 npte = pa; 1059 } else 1060 #endif 1061 npte = pmap_pa2pte(pa); 1062 1063 npte = pmap_pa2pte(pa); 1064 npte |= protection_codes[prot] | PG_k | PG_V; 1065 opte = pmap_pte_testset(pte, npte); 1066 } 1067 1068 /* 1069 * pmap_emap_sync: perform TLB flush or pmap load, if it was deferred. 1070 */ 1071 void 1072 pmap_emap_sync(bool canload) 1073 { 1074 struct cpu_info *ci = curcpu(); 1075 struct pmap *pmap; 1076 1077 KASSERT(kpreempt_disabled()); 1078 if (__predict_true(ci->ci_want_pmapload && canload)) { 1079 /* 1080 * XXX: Hint for pmap_reactivate(), which might suggest to 1081 * not perform TLB flush, if state has not changed. 1082 */ 1083 pmap = vm_map_pmap(&curlwp->l_proc->p_vmspace->vm_map); 1084 if (__predict_false(pmap == ci->ci_pmap)) { 1085 const uint32_t cpumask = ci->ci_cpumask; 1086 atomic_and_32(&pmap->pm_cpus, ~cpumask); 1087 } 1088 pmap_load(); 1089 KASSERT(ci->ci_want_pmapload == 0); 1090 } else { 1091 tlbflush(); 1092 } 1093 1094 } 1095 1096 void 1097 pmap_emap_remove(vaddr_t sva, vsize_t len) 1098 { 1099 pt_entry_t *pte, xpte; 1100 vaddr_t va, eva = sva + len; 1101 1102 for (va = sva; va < eva; va += PAGE_SIZE) { 1103 pte = (va < VM_MIN_KERNEL_ADDRESS) ? vtopte(va) : kvtopte(va); 1104 xpte |= pmap_pte_testset(pte, 0); 1105 } 1106 } 1107 1108 #ifdef XEN 1109 /* 1110 * pmap_kenter_ma: enter a kernel mapping without R/M (pv_entry) tracking 1111 * 1112 * => no need to lock anything, assume va is already allocated 1113 * => should be faster than normal pmap enter function 1114 * => we expect a MACHINE address 1115 */ 1116 1117 void 1118 pmap_kenter_ma(vaddr_t va, paddr_t ma, vm_prot_t prot, u_int flags) 1119 { 1120 pt_entry_t *pte, opte, npte; 1121 1122 if (va < VM_MIN_KERNEL_ADDRESS) 1123 pte = vtopte(va); 1124 else 1125 pte = kvtopte(va); 1126 1127 npte = ma | ((prot & VM_PROT_WRITE) ? 
PG_RW : PG_RO) | 1128 PG_V | PG_k; 1129 if (flags & PMAP_NOCACHE) 1130 npte |= PG_N; 1131 1132 #ifndef XEN 1133 if ((cpu_feature & CPUID_NOX) && !(prot & VM_PROT_EXECUTE)) 1134 npte |= PG_NX; 1135 #endif 1136 opte = pmap_pte_testset (pte, npte); /* zap! */ 1137 1138 if (pmap_valid_entry(opte)) { 1139 #if defined(MULTIPROCESSOR) 1140 kpreempt_disable(); 1141 pmap_tlb_shootdown(pmap_kernel(), va, 0, opte); 1142 kpreempt_enable(); 1143 #else 1144 /* Don't bother deferring in the single CPU case. */ 1145 pmap_update_pg(va); 1146 #endif 1147 } 1148 } 1149 #endif /* XEN */ 1150 1151 #if defined(__x86_64__) 1152 /* 1153 * Change protection for a virtual address. Local for a CPU only, don't 1154 * care about TLB shootdowns. 1155 * 1156 * => must be called with preemption disabled 1157 */ 1158 void 1159 pmap_changeprot_local(vaddr_t va, vm_prot_t prot) 1160 { 1161 pt_entry_t *pte, opte, npte; 1162 1163 KASSERT(kpreempt_disabled()); 1164 1165 if (va < VM_MIN_KERNEL_ADDRESS) 1166 pte = vtopte(va); 1167 else 1168 pte = kvtopte(va); 1169 1170 npte = opte = *pte; 1171 1172 if ((prot & VM_PROT_WRITE) != 0) 1173 npte |= PG_RW; 1174 else 1175 npte &= ~PG_RW; 1176 1177 if (opte != npte) { 1178 pmap_pte_set(pte, npte); 1179 pmap_pte_flush(); 1180 invlpg(va); 1181 } 1182 } 1183 #endif /* defined(__x86_64__) */ 1184 1185 /* 1186 * pmap_kremove: remove a kernel mapping(s) without R/M (pv_entry) tracking 1187 * 1188 * => no need to lock anything 1189 * => caller must dispose of any vm_page mapped in the va range 1190 * => note: not an inline function 1191 * => we assume the va is page aligned and the len is a multiple of PAGE_SIZE 1192 * => we assume kernel only unmaps valid addresses and thus don't bother 1193 * checking the valid bit before doing TLB flushing 1194 * => must be followed by call to pmap_update() before reuse of page 1195 */ 1196 1197 void 1198 pmap_kremove(vaddr_t sva, vsize_t len) 1199 { 1200 pt_entry_t *pte, xpte; 1201 vaddr_t va, eva; 1202 1203 eva = sva + len; 1204 xpte = 0; 1205 1206 for (va = sva; va < eva; va += PAGE_SIZE) { 1207 if (va < VM_MIN_KERNEL_ADDRESS) 1208 pte = vtopte(va); 1209 else 1210 pte = kvtopte(va); 1211 xpte |= pmap_pte_testset(pte, 0); /* zap! */ 1212 #if defined(DIAGNOSTIC) 1213 /* XXX For now... */ 1214 if (xpte & PG_PS) 1215 panic("pmap_kremove: PG_PS"); 1216 if (xpte & PG_PVLIST) 1217 panic("pmap_kremove: PG_PVLIST mapping for 0x%lx", 1218 va); 1219 #endif 1220 } 1221 if ((xpte & (PG_V | PG_U)) == (PG_V | PG_U)) { 1222 kpreempt_disable(); 1223 pmap_tlb_shootdown(pmap_kernel(), sva, eva, xpte); 1224 kpreempt_enable(); 1225 } 1226 } 1227 1228 /* 1229 * p m a p i n i t f u n c t i o n s 1230 * 1231 * pmap_bootstrap and pmap_init are called during system startup 1232 * to init the pmap module. pmap_bootstrap() does a low level 1233 * init just to get things rolling. pmap_init() finishes the job. 1234 */ 1235 1236 /* 1237 * pmap_bootstrap: get the system in a state where it can run with VM 1238 * properly enabled (called before main()). the VM system is 1239 * fully init'd later... 1240 * 1241 * => on i386, locore.s has already enabled the MMU by allocating 1242 * a PDP for the kernel, and nkpde PTP's for the kernel. 
1243 * => kva_start is the first free virtual address in kernel space 1244 */ 1245 1246 void 1247 pmap_bootstrap(vaddr_t kva_start) 1248 { 1249 struct pmap *kpm; 1250 pt_entry_t *pte; 1251 struct pcb *pcb; 1252 int i; 1253 vaddr_t kva; 1254 #ifdef XEN 1255 pt_entry_t pg_nx = 0; 1256 #else 1257 unsigned long p1i; 1258 vaddr_t kva_end; 1259 pt_entry_t pg_nx = (cpu_feature & CPUID_NOX ? PG_NX : 0); 1260 #endif 1261 1262 /* 1263 * set up our local static global vars that keep track of the 1264 * usage of KVM before kernel_map is set up 1265 */ 1266 1267 virtual_avail = kva_start; /* first free KVA */ 1268 virtual_end = VM_MAX_KERNEL_ADDRESS; /* last KVA */ 1269 1270 /* 1271 * set up protection_codes: we need to be able to convert from 1272 * a MI protection code (some combo of VM_PROT...) to something 1273 * we can jam into a i386 PTE. 1274 */ 1275 1276 protection_codes[VM_PROT_NONE] = pg_nx; /* --- */ 1277 protection_codes[VM_PROT_EXECUTE] = PG_RO | PG_X; /* --x */ 1278 protection_codes[VM_PROT_READ] = PG_RO | pg_nx; /* -r- */ 1279 protection_codes[VM_PROT_READ|VM_PROT_EXECUTE] = PG_RO | PG_X;/* -rx */ 1280 protection_codes[VM_PROT_WRITE] = PG_RW | pg_nx; /* w-- */ 1281 protection_codes[VM_PROT_WRITE|VM_PROT_EXECUTE] = PG_RW | PG_X;/* w-x */ 1282 protection_codes[VM_PROT_WRITE|VM_PROT_READ] = PG_RW | pg_nx; 1283 /* wr- */ 1284 protection_codes[VM_PROT_ALL] = PG_RW | PG_X; /* wrx */ 1285 1286 /* 1287 * now we init the kernel's pmap 1288 * 1289 * the kernel pmap's pm_obj is not used for much. however, in 1290 * user pmaps the pm_obj contains the list of active PTPs. 1291 * the pm_obj currently does not have a pager. it might be possible 1292 * to add a pager that would allow a process to read-only mmap its 1293 * own page tables (fast user level vtophys?). this may or may not 1294 * be useful. 1295 */ 1296 1297 kpm = pmap_kernel(); 1298 for (i = 0; i < PTP_LEVELS - 1; i++) { 1299 UVM_OBJ_INIT(&kpm->pm_obj[i], NULL, 1); 1300 kpm->pm_ptphint[i] = NULL; 1301 } 1302 memset(&kpm->pm_list, 0, sizeof(kpm->pm_list)); /* pm_list not used */ 1303 pcb = lwp_getpcb(&lwp0); 1304 kpm->pm_pdir = (pd_entry_t *)(pcb->pcb_cr3 + KERNBASE); 1305 #ifdef PAE 1306 for (i = 0; i < PDP_SIZE; i++) 1307 kpm->pm_pdirpa[i] = (paddr_t)pcb->pcb_cr3 + PAGE_SIZE * i; 1308 #else 1309 kpm->pm_pdirpa = (paddr_t)pcb->pcb_cr3; 1310 #endif 1311 kpm->pm_stats.wired_count = kpm->pm_stats.resident_count = 1312 x86_btop(kva_start - VM_MIN_KERNEL_ADDRESS); 1313 1314 /* 1315 * the above is just a rough estimate and not critical to the proper 1316 * operation of the system. 1317 */ 1318 1319 #ifndef XEN 1320 /* 1321 * Begin to enable global TLB entries if they are supported. 1322 * The G bit has no effect until the CR4_PGE bit is set in CR4, 1323 * which happens in cpu_init(), which is run on each cpu 1324 * (and happens later) 1325 */ 1326 1327 if (cpu_feature & CPUID_PGE) { 1328 pmap_pg_g = PG_G; /* enable software */ 1329 1330 /* add PG_G attribute to already mapped kernel pages */ 1331 if (KERNBASE == VM_MIN_KERNEL_ADDRESS) { 1332 kva_end = virtual_avail; 1333 } else { 1334 extern vaddr_t eblob, esym; 1335 kva_end = (vaddr_t)&end; 1336 if (esym > kva_end) 1337 kva_end = esym; 1338 if (eblob > kva_end) 1339 kva_end = eblob; 1340 kva_end = roundup(kva_end, PAGE_SIZE); 1341 } 1342 for (kva = KERNBASE; kva < kva_end; kva += PAGE_SIZE) { 1343 p1i = pl1_i(kva); 1344 if (pmap_valid_entry(PTE_BASE[p1i])) 1345 PTE_BASE[p1i] |= PG_G; 1346 } 1347 } 1348 1349 /* 1350 * enable large pages if they are supported. 
1351 */ 1352 1353 if (cpu_feature & CPUID_PSE) { 1354 paddr_t pa; 1355 pd_entry_t *pde; 1356 extern char __data_start; 1357 1358 lcr4(rcr4() | CR4_PSE); /* enable hardware (via %cr4) */ 1359 pmap_largepages = 1; /* enable software */ 1360 1361 /* 1362 * the TLB must be flushed after enabling large pages 1363 * on Pentium CPUs, according to section 3.6.2.2 of 1364 * "Intel Architecture Software Developer's Manual, 1365 * Volume 3: System Programming". 1366 */ 1367 tlbflush(); 1368 1369 /* 1370 * now, remap the kernel text using large pages. we 1371 * assume that the linker has properly aligned the 1372 * .data segment to a NBPD_L2 boundary. 1373 */ 1374 kva_end = rounddown((vaddr_t)&__data_start, NBPD_L1); 1375 for (pa = 0, kva = KERNBASE; kva + NBPD_L2 <= kva_end; 1376 kva += NBPD_L2, pa += NBPD_L2) { 1377 pde = &L2_BASE[pl2_i(kva)]; 1378 *pde = pa | pmap_pg_g | PG_PS | 1379 PG_KR | PG_V; /* zap! */ 1380 tlbflush(); 1381 } 1382 #if defined(DEBUG) 1383 aprint_normal("kernel text is mapped with " 1384 "%lu large pages and %lu normal pages\n", 1385 (unsigned long)howmany(kva - KERNBASE, NBPD_L2), 1386 (unsigned long)howmany((vaddr_t)&__data_start - kva, 1387 NBPD_L1)); 1388 #endif /* defined(DEBUG) */ 1389 } 1390 #endif /* !XEN */ 1391 1392 if (VM_MIN_KERNEL_ADDRESS != KERNBASE) { 1393 /* 1394 * zero_pte is stuck at the end of mapped space for the kernel 1395 * image (disjunct from kva space). This is done so that it 1396 * can safely be used in pmap_growkernel (pmap_get_physpage), 1397 * when it's called for the first time. 1398 * XXXfvdl fix this for MULTIPROCESSOR later. 1399 */ 1400 1401 early_zerop = (void *)(KERNBASE + NKL2_KIMG_ENTRIES * NBPD_L2); 1402 early_zero_pte = PTE_BASE + pl1_i((unsigned long)early_zerop); 1403 } 1404 1405 /* 1406 * now we allocate the "special" VAs which are used for tmp mappings 1407 * by the pmap (and other modules). we allocate the VAs by advancing 1408 * virtual_avail (note that there are no pages mapped at these VAs). 1409 * we find the PTE that maps the allocated VA via the linear PTE 1410 * mapping. 1411 */ 1412 1413 pte = PTE_BASE + pl1_i(virtual_avail); 1414 1415 #ifdef MULTIPROCESSOR 1416 /* 1417 * Waste some VA space to avoid false sharing of cache lines 1418 * for page table pages: Give each possible CPU a cache line 1419 * of PTE's (8) to play with, though we only need 4. We could 1420 * recycle some of this waste by putting the idle stacks here 1421 * as well; we could waste less space if we knew the largest 1422 * CPU ID beforehand. 
1423 */ 1424 csrcp = (char *) virtual_avail; csrc_pte = pte; 1425 1426 cdstp = (char *) virtual_avail+PAGE_SIZE; cdst_pte = pte+1; 1427 1428 zerop = (char *) virtual_avail+PAGE_SIZE*2; zero_pte = pte+2; 1429 1430 ptpp = (char *) virtual_avail+PAGE_SIZE*3; ptp_pte = pte+3; 1431 1432 virtual_avail += PAGE_SIZE * maxcpus * NPTECL; 1433 pte += maxcpus * NPTECL; 1434 #else 1435 csrcp = (void *) virtual_avail; csrc_pte = pte; /* allocate */ 1436 virtual_avail += PAGE_SIZE; pte++; /* advance */ 1437 1438 cdstp = (void *) virtual_avail; cdst_pte = pte; 1439 virtual_avail += PAGE_SIZE; pte++; 1440 1441 zerop = (void *) virtual_avail; zero_pte = pte; 1442 virtual_avail += PAGE_SIZE; pte++; 1443 1444 ptpp = (void *) virtual_avail; ptp_pte = pte; 1445 virtual_avail += PAGE_SIZE; pte++; 1446 #endif 1447 1448 if (VM_MIN_KERNEL_ADDRESS == KERNBASE) { 1449 early_zerop = zerop; 1450 early_zero_pte = zero_pte; 1451 } 1452 1453 /* 1454 * Nothing after this point actually needs pte; 1455 */ 1456 pte = (void *)0xdeadbeef; 1457 1458 /* XXX: vmmap used by mem.c... should be uvm_map_reserve */ 1459 /* XXXfvdl PTEs not needed here */ 1460 vmmap = (char *)virtual_avail; /* don't need pte */ 1461 virtual_avail += PAGE_SIZE; pte++; 1462 1463 #ifdef XEN 1464 #ifdef __x86_64__ 1465 /* 1466 * We want a dummy page directory for Xen: 1467 * when deactivate a pmap, Xen will still consider it active. 1468 * So we set user PGD to this one to lift all protection on 1469 * the now inactive page tables set. 1470 */ 1471 xen_dummy_user_pgd = avail_start; 1472 avail_start += PAGE_SIZE; 1473 1474 /* Zero fill it, the less checks in Xen it requires the better */ 1475 memset((void *) (xen_dummy_user_pgd + KERNBASE), 0, PAGE_SIZE); 1476 /* Mark read-only */ 1477 HYPERVISOR_update_va_mapping(xen_dummy_user_pgd + KERNBASE, 1478 pmap_pa2pte(xen_dummy_user_pgd) | PG_u | PG_V, UVMF_INVLPG); 1479 /* Pin as L4 */ 1480 xpq_queue_pin_table(xpmap_ptom_masked(xen_dummy_user_pgd)); 1481 #endif /* __x86_64__ */ 1482 idt_vaddr = virtual_avail; /* don't need pte */ 1483 idt_paddr = avail_start; /* steal a page */ 1484 /* 1485 * Xen require one more page as we can't store 1486 * GDT and LDT on the same page 1487 */ 1488 virtual_avail += 3 * PAGE_SIZE; 1489 avail_start += 3 * PAGE_SIZE; 1490 #else /* XEN */ 1491 idt_vaddr = virtual_avail; /* don't need pte */ 1492 idt_paddr = avail_start; /* steal a page */ 1493 #if defined(__x86_64__) 1494 virtual_avail += 2 * PAGE_SIZE; pte += 2; 1495 avail_start += 2 * PAGE_SIZE; 1496 #else /* defined(__x86_64__) */ 1497 virtual_avail += PAGE_SIZE; pte++; 1498 avail_start += PAGE_SIZE; 1499 /* pentium f00f bug stuff */ 1500 pentium_idt_vaddr = virtual_avail; /* don't need pte */ 1501 virtual_avail += PAGE_SIZE; pte++; 1502 #endif /* defined(__x86_64__) */ 1503 #endif /* XEN */ 1504 1505 #ifdef _LP64 1506 /* 1507 * Grab a page below 4G for things that need it (i.e. 1508 * having an initial %cr3 for the MP trampoline). 1509 */ 1510 lo32_vaddr = virtual_avail; 1511 virtual_avail += PAGE_SIZE; pte++; 1512 lo32_paddr = avail_start; 1513 avail_start += PAGE_SIZE; 1514 #endif 1515 1516 /* 1517 * now we reserve some VM for mapping pages when doing a crash dump 1518 */ 1519 1520 virtual_avail = reserve_dumppages(virtual_avail); 1521 1522 /* 1523 * init the static-global locks and global lists. 1524 * 1525 * => pventry::pvh_lock (initialized elsewhere) must also be 1526 * a spin lock, again at IPL_VM to prevent deadlock, and 1527 * again is never taken from interrupt context. 
1528 */ 1529 1530 mutex_init(&pmaps_lock, MUTEX_DEFAULT, IPL_NONE); 1531 LIST_INIT(&pmaps); 1532 pmap_cpu_init_early(curcpu()); 1533 1534 /* 1535 * initialize caches. 1536 */ 1537 1538 pool_cache_bootstrap(&pmap_cache, sizeof(struct pmap), 0, 0, 0, 1539 "pmappl", NULL, IPL_NONE, NULL, NULL, NULL); 1540 #ifdef PAE 1541 pool_cache_bootstrap(&pmap_pdp_cache, PAGE_SIZE * PDP_SIZE, 0, 0, 0, 1542 "pdppl", &pmap_pdp_allocator, IPL_NONE, 1543 pmap_pdp_ctor, pmap_pdp_dtor, NULL); 1544 #else /* PAE */ 1545 pool_cache_bootstrap(&pmap_pdp_cache, PAGE_SIZE, 0, 0, 0, 1546 "pdppl", NULL, IPL_NONE, pmap_pdp_ctor, pmap_pdp_dtor, NULL); 1547 #endif /* PAE */ 1548 pool_cache_bootstrap(&pmap_pv_cache, sizeof(struct pv_entry), 0, 0, 1549 PR_LARGECACHE, "pvpl", &pool_allocator_meta, IPL_NONE, NULL, 1550 NULL, NULL); 1551 1552 /* 1553 * ensure the TLB is sync'd with reality by flushing it... 1554 */ 1555 1556 tlbflush(); 1557 1558 /* 1559 * calculate pmap_maxkvaddr from nkptp[]. 1560 */ 1561 1562 kva = VM_MIN_KERNEL_ADDRESS; 1563 for (i = PTP_LEVELS - 1; i >= 1; i--) { 1564 kva += nkptp[i] * nbpd[i]; 1565 } 1566 pmap_maxkvaddr = kva; 1567 } 1568 1569 #if defined(__x86_64__) 1570 /* 1571 * Pre-allocate PTPs for low memory, so that 1:1 mappings for various 1572 * trampoline code can be entered. 1573 */ 1574 void 1575 pmap_prealloc_lowmem_ptps(void) 1576 { 1577 #ifdef XEN 1578 int level; 1579 paddr_t newp; 1580 paddr_t pdes_pa; 1581 1582 pdes_pa = pmap_kernel()->pm_pdirpa; 1583 level = PTP_LEVELS; 1584 for (;;) { 1585 newp = avail_start; 1586 avail_start += PAGE_SIZE; 1587 HYPERVISOR_update_va_mapping ((vaddr_t)early_zerop, 1588 xpmap_ptom_masked(newp) | PG_u | PG_V | PG_RW, UVMF_INVLPG); 1589 memset((void *)early_zerop, 0, PAGE_SIZE); 1590 /* Mark R/O before installing */ 1591 HYPERVISOR_update_va_mapping ((vaddr_t)early_zerop, 1592 xpmap_ptom_masked(newp) | PG_u | PG_V, UVMF_INVLPG); 1593 if (newp < (NKL2_KIMG_ENTRIES * NBPD_L2)) 1594 HYPERVISOR_update_va_mapping (newp + KERNBASE, 1595 xpmap_ptom_masked(newp) | PG_u | PG_V, UVMF_INVLPG); 1596 xpq_queue_pte_update ( 1597 xpmap_ptom_masked(pdes_pa) 1598 + (pl_i(0, level) * sizeof (pd_entry_t)), 1599 xpmap_ptom_masked(newp) | PG_RW | PG_u | PG_V); 1600 level--; 1601 if (level <= 1) 1602 break; 1603 pdes_pa = newp; 1604 } 1605 #else /* XEN */ 1606 pd_entry_t *pdes; 1607 int level; 1608 paddr_t newp; 1609 1610 pdes = pmap_kernel()->pm_pdir; 1611 level = PTP_LEVELS; 1612 for (;;) { 1613 newp = avail_start; 1614 avail_start += PAGE_SIZE; 1615 *early_zero_pte = (newp & PG_FRAME) | PG_V | PG_RW; 1616 pmap_update_pg((vaddr_t)early_zerop); 1617 memset(early_zerop, 0, PAGE_SIZE); 1618 pdes[pl_i(0, level)] = (newp & PG_FRAME) | PG_V | PG_RW; 1619 level--; 1620 if (level <= 1) 1621 break; 1622 pdes = normal_pdes[level - 2]; 1623 } 1624 #endif /* XEN */ 1625 } 1626 #endif /* defined(__x86_64__) */ 1627 1628 /* 1629 * pmap_init: called from uvm_init, our job is to get the pmap 1630 * system ready to manage mappings... 1631 */ 1632 1633 void 1634 pmap_init(void) 1635 { 1636 int i; 1637 1638 for (i = 0; i < PV_HASH_SIZE; i++) { 1639 SLIST_INIT(&pv_hash_heads[i].hh_list); 1640 } 1641 for (i = 0; i < PV_HASH_LOCK_CNT; i++) { 1642 mutex_init(&pv_hash_locks[i].lock, MUTEX_NODEBUG, IPL_VM); 1643 } 1644 1645 /* 1646 * done: pmap module is up (and ready for business) 1647 */ 1648 1649 pmap_initialized = true; 1650 } 1651 1652 /* 1653 * pmap_cpu_init_early: perform early per-CPU initialization. 
1654 */ 1655 1656 void 1657 pmap_cpu_init_early(struct cpu_info *ci) 1658 { 1659 struct pmap_cpu *pc; 1660 static uint8_t pmap_cpu_alloc; 1661 1662 pc = &pmap_cpu[pmap_cpu_alloc++].pc; 1663 ci->ci_pmap_cpu = pc; 1664 } 1665 1666 /* 1667 * pmap_cpu_init_late: perform late per-CPU initialization. 1668 */ 1669 1670 void 1671 pmap_cpu_init_late(struct cpu_info *ci) 1672 { 1673 1674 if (ci == &cpu_info_primary) { 1675 evcnt_attach_dynamic(&pmap_tlb_evcnt, EVCNT_TYPE_INTR, 1676 NULL, "global", "TLB IPI"); 1677 evcnt_attach_dynamic(&pmap_iobmp_evcnt, EVCNT_TYPE_MISC, 1678 NULL, "x86", "io bitmap copy"); 1679 evcnt_attach_dynamic(&pmap_ldt_evcnt, EVCNT_TYPE_MISC, 1680 NULL, "x86", "ldt sync"); 1681 } 1682 1683 evcnt_attach_dynamic(&ci->ci_tlb_evcnt, EVCNT_TYPE_MISC, 1684 NULL, device_xname(ci->ci_dev), "TLB IPI"); 1685 } 1686 1687 /* 1688 * p v _ e n t r y f u n c t i o n s 1689 */ 1690 1691 /* 1692 * pmap_free_pvs: free a list of pv_entrys 1693 */ 1694 1695 static void 1696 pmap_free_pvs(struct pv_entry *pve) 1697 { 1698 struct pv_entry *next; 1699 1700 for ( /* null */ ; pve != NULL ; pve = next) { 1701 next = pve->pve_next; 1702 pool_cache_put(&pmap_pv_cache, pve); 1703 } 1704 } 1705 1706 /* 1707 * main pv_entry manipulation functions: 1708 * pmap_enter_pv: enter a mapping onto a pv_head list 1709 * pmap_remove_pv: remove a mapping from a pv_head list 1710 * 1711 * NOTE: Both pmap_enter_pv and pmap_remove_pv expect the caller to lock 1712 * the pvh before calling 1713 */ 1714 1715 /* 1716 * insert_pv: a helper of pmap_enter_pv 1717 */ 1718 1719 static void 1720 insert_pv(struct pmap_page *pp, struct pv_entry *pve) 1721 { 1722 struct pv_hash_head *hh; 1723 kmutex_t *lock; 1724 u_int hash; 1725 1726 KASSERT(pp_locked(pp)); 1727 1728 hash = pvhash_hash(pve->pve_pte.pte_ptp, pve->pve_pte.pte_va); 1729 lock = pvhash_lock(hash); 1730 hh = pvhash_head(hash); 1731 mutex_spin_enter(lock); 1732 SLIST_INSERT_HEAD(&hh->hh_list, pve, pve_hash); 1733 mutex_spin_exit(lock); 1734 1735 LIST_INSERT_HEAD(&pp->pp_head.pvh_list, pve, pve_list); 1736 } 1737 1738 /* 1739 * pmap_enter_pv: enter a mapping onto a pv_head lst 1740 * 1741 * => caller should have the pp_lock locked 1742 * => caller should adjust ptp's wire_count before calling 1743 */ 1744 1745 static struct pv_entry * 1746 pmap_enter_pv(struct pmap_page *pp, 1747 struct pv_entry *pve, /* preallocated pve for us to use */ 1748 struct pv_entry **sparepve, 1749 struct vm_page *ptp, 1750 vaddr_t va) 1751 { 1752 1753 KASSERT(ptp == NULL || ptp->wire_count >= 2); 1754 KASSERT(ptp == NULL || ptp->uobject != NULL); 1755 KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset); 1756 KASSERT(pp_locked(pp)); 1757 1758 if ((pp->pp_flags & PP_EMBEDDED) == 0) { 1759 if (LIST_EMPTY(&pp->pp_head.pvh_list)) { 1760 pp->pp_flags |= PP_EMBEDDED; 1761 pp->pp_pte.pte_ptp = ptp; 1762 pp->pp_pte.pte_va = va; 1763 1764 return pve; 1765 } 1766 } else { 1767 struct pv_entry *pve2; 1768 1769 pve2 = *sparepve; 1770 *sparepve = NULL; 1771 1772 pve2->pve_pte = pp->pp_pte; 1773 pp->pp_flags &= ~PP_EMBEDDED; 1774 LIST_INIT(&pp->pp_head.pvh_list); 1775 insert_pv(pp, pve2); 1776 } 1777 1778 pve->pve_pte.pte_ptp = ptp; 1779 pve->pve_pte.pte_va = va; 1780 insert_pv(pp, pve); 1781 1782 return NULL; 1783 } 1784 1785 /* 1786 * pmap_remove_pv: try to remove a mapping from a pv_list 1787 * 1788 * => caller should hold pp_lock [so that attrs can be adjusted] 1789 * => caller should adjust ptp's wire_count and free PTP if needed 1790 * => we return the removed pve 1791 */ 1792 1793 static struct 
pv_entry * 1794 pmap_remove_pv(struct pmap_page *pp, struct vm_page *ptp, vaddr_t va) 1795 { 1796 struct pv_hash_head *hh; 1797 struct pv_entry *pve; 1798 kmutex_t *lock; 1799 u_int hash; 1800 1801 KASSERT(ptp == NULL || ptp->uobject != NULL); 1802 KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset); 1803 KASSERT(pp_locked(pp)); 1804 1805 if ((pp->pp_flags & PP_EMBEDDED) != 0) { 1806 KASSERT(pp->pp_pte.pte_ptp == ptp); 1807 KASSERT(pp->pp_pte.pte_va == va); 1808 1809 pp->pp_flags &= ~PP_EMBEDDED; 1810 LIST_INIT(&pp->pp_head.pvh_list); 1811 1812 return NULL; 1813 } 1814 1815 hash = pvhash_hash(ptp, va); 1816 lock = pvhash_lock(hash); 1817 hh = pvhash_head(hash); 1818 mutex_spin_enter(lock); 1819 pve = pvhash_remove(hh, ptp, va); 1820 mutex_spin_exit(lock); 1821 1822 LIST_REMOVE(pve, pve_list); 1823 1824 return pve; 1825 } 1826 1827 /* 1828 * p t p f u n c t i o n s 1829 */ 1830 1831 static inline struct vm_page * 1832 pmap_find_ptp(struct pmap *pmap, vaddr_t va, paddr_t pa, int level) 1833 { 1834 int lidx = level - 1; 1835 struct vm_page *pg; 1836 1837 KASSERT(mutex_owned(&pmap->pm_lock)); 1838 1839 if (pa != (paddr_t)-1 && pmap->pm_ptphint[lidx] && 1840 pa == VM_PAGE_TO_PHYS(pmap->pm_ptphint[lidx])) { 1841 return (pmap->pm_ptphint[lidx]); 1842 } 1843 PMAP_SUBOBJ_LOCK(pmap, lidx); 1844 pg = uvm_pagelookup(&pmap->pm_obj[lidx], ptp_va2o(va, level)); 1845 PMAP_SUBOBJ_UNLOCK(pmap, lidx); 1846 1847 KASSERT(pg == NULL || pg->wire_count >= 1); 1848 return pg; 1849 } 1850 1851 static inline void 1852 pmap_freepage(struct pmap *pmap, struct vm_page *ptp, int level) 1853 { 1854 int lidx; 1855 struct uvm_object *obj; 1856 1857 KASSERT(ptp->wire_count == 1); 1858 1859 lidx = level - 1; 1860 1861 obj = &pmap->pm_obj[lidx]; 1862 pmap_stats_update(pmap, -1, 0); 1863 if (lidx != 0) 1864 mutex_enter(&obj->vmobjlock); 1865 if (pmap->pm_ptphint[lidx] == ptp) 1866 pmap->pm_ptphint[lidx] = TAILQ_FIRST(&obj->memq); 1867 ptp->wire_count = 0; 1868 uvm_pagerealloc(ptp, NULL, 0); 1869 VM_PAGE_TO_PP(ptp)->pp_link = curlwp->l_md.md_gc_ptp; 1870 curlwp->l_md.md_gc_ptp = ptp; 1871 if (lidx != 0) 1872 mutex_exit(&obj->vmobjlock); 1873 } 1874 1875 static void 1876 pmap_free_ptp(struct pmap *pmap, struct vm_page *ptp, vaddr_t va, 1877 pt_entry_t *ptes, pd_entry_t * const *pdes) 1878 { 1879 unsigned long index; 1880 int level; 1881 vaddr_t invaladdr; 1882 #ifdef MULTIPROCESSOR 1883 vaddr_t invaladdr2; 1884 #endif 1885 pd_entry_t opde; 1886 struct pmap *curpmap = vm_map_pmap(&curlwp->l_proc->p_vmspace->vm_map); 1887 1888 KASSERT(pmap != pmap_kernel()); 1889 KASSERT(mutex_owned(&pmap->pm_lock)); 1890 KASSERT(kpreempt_disabled()); 1891 1892 level = 1; 1893 do { 1894 index = pl_i(va, level + 1); 1895 opde = pmap_pte_testset(&pdes[level - 1][index], 0); 1896 #if defined(XEN) && defined(__x86_64__) 1897 /* 1898 * If ptp is a L3 currently mapped in kernel space, 1899 * clear it before freeing 1900 */ 1901 if (pmap->pm_pdirpa == xen_current_user_pgd 1902 && level == PTP_LEVELS - 1) 1903 pmap_pte_set(&pmap_kernel()->pm_pdir[index], 0); 1904 #endif /* XEN && __x86_64__ */ 1905 pmap_freepage(pmap, ptp, level); 1906 invaladdr = level == 1 ? (vaddr_t)ptes : 1907 (vaddr_t)pdes[level - 2]; 1908 pmap_tlb_shootdown(curpmap, invaladdr + index * PAGE_SIZE, 1909 0, opde); 1910 #if defined(MULTIPROCESSOR) 1911 invaladdr2 = level == 1 ? 
(vaddr_t)PTE_BASE : 1912 (vaddr_t)normal_pdes[level - 2]; 1913 if (pmap != curpmap || invaladdr != invaladdr2) { 1914 pmap_tlb_shootdown(pmap, invaladdr2 + index * PAGE_SIZE, 1915 0, opde); 1916 } 1917 #endif 1918 if (level < PTP_LEVELS - 1) { 1919 ptp = pmap_find_ptp(pmap, va, (paddr_t)-1, level + 1); 1920 ptp->wire_count--; 1921 if (ptp->wire_count > 1) 1922 break; 1923 } 1924 } while (++level < PTP_LEVELS); 1925 pmap_pte_flush(); 1926 } 1927 1928 /* 1929 * pmap_get_ptp: get a PTP (if there isn't one, allocate a new one) 1930 * 1931 * => pmap should NOT be pmap_kernel() 1932 * => pmap should be locked 1933 * => preemption should be disabled 1934 */ 1935 1936 static struct vm_page * 1937 pmap_get_ptp(struct pmap *pmap, vaddr_t va, pd_entry_t * const *pdes) 1938 { 1939 struct vm_page *ptp, *pptp; 1940 int i; 1941 unsigned long index; 1942 pd_entry_t *pva; 1943 paddr_t ppa, pa; 1944 struct uvm_object *obj; 1945 1946 KASSERT(pmap != pmap_kernel()); 1947 KASSERT(mutex_owned(&pmap->pm_lock)); 1948 KASSERT(kpreempt_disabled()); 1949 1950 ptp = NULL; 1951 pa = (paddr_t)-1; 1952 1953 /* 1954 * Loop through all page table levels seeing if we need to 1955 * add a new page to that level. 1956 */ 1957 for (i = PTP_LEVELS; i > 1; i--) { 1958 /* 1959 * Save values from previous round. 1960 */ 1961 pptp = ptp; 1962 ppa = pa; 1963 1964 index = pl_i(va, i); 1965 pva = pdes[i - 2]; 1966 1967 if (pmap_valid_entry(pva[index])) { 1968 ppa = pmap_pte2pa(pva[index]); 1969 ptp = NULL; 1970 continue; 1971 } 1972 1973 obj = &pmap->pm_obj[i-2]; 1974 PMAP_SUBOBJ_LOCK(pmap, i - 2); 1975 ptp = uvm_pagealloc(obj, ptp_va2o(va, i - 1), NULL, 1976 UVM_PGA_USERESERVE|UVM_PGA_ZERO); 1977 PMAP_SUBOBJ_UNLOCK(pmap, i - 2); 1978 1979 if (ptp == NULL) 1980 return NULL; 1981 1982 ptp->flags &= ~PG_BUSY; /* never busy */ 1983 ptp->wire_count = 1; 1984 pmap->pm_ptphint[i - 2] = ptp; 1985 pa = VM_PAGE_TO_PHYS(ptp); 1986 pmap_pte_set(&pva[index], (pd_entry_t) 1987 (pmap_pa2pte(pa) | PG_u | PG_RW | PG_V)); 1988 #if defined(XEN) && defined(__x86_64__) 1989 /* 1990 * In Xen we must enter the mapping in kernel map too 1991 * if pmap is curmap and modifying top level (PGD) 1992 */ 1993 if(i == PTP_LEVELS && pmap != pmap_kernel()) { 1994 pmap_pte_set(&pmap_kernel()->pm_pdir[index], 1995 (pd_entry_t) (pmap_pa2pte(pa) 1996 | PG_u | PG_RW | PG_V)); 1997 } 1998 #endif /* XEN && __x86_64__ */ 1999 pmap_pte_flush(); 2000 pmap_stats_update(pmap, 1, 0); 2001 /* 2002 * If we're not in the top level, increase the 2003 * wire count of the parent page. 2004 */ 2005 if (i < PTP_LEVELS) { 2006 if (pptp == NULL) 2007 pptp = pmap_find_ptp(pmap, va, ppa, i); 2008 #ifdef DIAGNOSTIC 2009 if (pptp == NULL) 2010 panic("pde page disappeared"); 2011 #endif 2012 pptp->wire_count++; 2013 } 2014 } 2015 2016 /* 2017 * ptp is not NULL if we just allocated a new ptp. If it's 2018 * still NULL, we must look up the existing one. 2019 */ 2020 if (ptp == NULL) { 2021 ptp = pmap_find_ptp(pmap, va, ppa, 1); 2022 #ifdef DIAGNOSTIC 2023 if (ptp == NULL) { 2024 printf("va %lx ppa %lx\n", (unsigned long)va, 2025 (unsigned long)ppa); 2026 panic("pmap_get_ptp: unmanaged user PTP"); 2027 } 2028 #endif 2029 } 2030 2031 pmap->pm_ptphint[0] = ptp; 2032 return(ptp); 2033 } 2034 2035 /* 2036 * p m a p l i f e c y c l e f u n c t i o n s 2037 */ 2038 2039 /* 2040 * pmap_pdp_ctor: constructor for the PDP cache. 
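 *
 * a hedged sketch (not from the original source) of the consumer side:
 * pmap_create() below pulls a constructed PDP out of pmap_pdp_cache and
 * re-checks the kernel slots under pmaps_lock, since the kernel may have
 * grown after this constructor ran.  "example_pdp_get" is a hypothetical
 * name used only for illustration.
 */

#if 0	/* illustrative sketch only, never compiled */
static pd_entry_t *
example_pdp_get(void)
{
	pd_entry_t *pdir;

	/* the ctor has already primed the recursive and kernel slots */
	pdir = pool_cache_get(&pmap_pdp_cache, PR_WAITOK);
	if (pdir[PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1] - 1] == 0) {
		/* constructed before the kernel grew: throw it back */
		pool_cache_destruct_object(&pmap_pdp_cache, pdir);
		return NULL;	/* the real code retries; see pmap_create() */
	}
	return pdir;
}
#endif

/*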
2041 */ 2042 2043 int 2044 pmap_pdp_ctor(void *arg, void *v, int flags) 2045 { 2046 pd_entry_t *pdir = v; 2047 paddr_t pdirpa = 0; /* XXX: GCC */ 2048 vaddr_t object; 2049 int i; 2050 2051 #if !defined(XEN) || !defined(__x86_64__) 2052 int npde; 2053 #endif 2054 #ifdef XEN 2055 int s; 2056 #endif 2057 2058 /* 2059 * NOTE: The `pmap_lock' is held when the PDP is allocated. 2060 */ 2061 2062 #if defined(XEN) && defined(__x86_64__) 2063 /* fetch the physical address of the page directory. */ 2064 (void) pmap_extract(pmap_kernel(), (vaddr_t) pdir, &pdirpa); 2065 2066 /* zero init area */ 2067 memset (pdir, 0, PAGE_SIZE); /* Xen wants a clean page */ 2068 /* 2069 * this pdir will NEVER be active in kernel mode 2070 * so mark recursive entry invalid 2071 */ 2072 pdir[PDIR_SLOT_PTE] = pmap_pa2pte(pdirpa) | PG_u; 2073 /* 2074 * PDP constructed this way won't be for kernel, 2075 * hence we don't put kernel mappings on Xen. 2076 * But we need to make pmap_create() happy, so put a dummy (without 2077 * PG_V) value at the right place. 2078 */ 2079 pdir[PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1] - 1] = 2080 (unsigned long)-1 & PG_FRAME; 2081 #else /* XEN && __x86_64__*/ 2082 /* zero init area */ 2083 memset(pdir, 0, PDIR_SLOT_PTE * sizeof(pd_entry_t)); 2084 2085 object = (vaddr_t)v; 2086 for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) { 2087 /* fetch the physical address of the page directory. */ 2088 (void) pmap_extract(pmap_kernel(), object, &pdirpa); 2089 /* put in recursive PDE to map the PTEs */ 2090 pdir[PDIR_SLOT_PTE + i] = pmap_pa2pte(pdirpa) | PG_V; 2091 #ifndef XEN 2092 pdir[PDIR_SLOT_PTE + i] |= PG_KW; 2093 #endif 2094 } 2095 2096 /* copy kernel's PDE */ 2097 npde = nkptp[PTP_LEVELS - 1]; 2098 2099 memcpy(&pdir[PDIR_SLOT_KERN], &PDP_BASE[PDIR_SLOT_KERN], 2100 npde * sizeof(pd_entry_t)); 2101 2102 /* zero the rest */ 2103 memset(&pdir[PDIR_SLOT_KERN + npde], 0, 2104 (NTOPLEVEL_PDES - (PDIR_SLOT_KERN + npde)) * sizeof(pd_entry_t)); 2105 2106 if (VM_MIN_KERNEL_ADDRESS != KERNBASE) { 2107 int idx = pl_i(KERNBASE, PTP_LEVELS); 2108 2109 pdir[idx] = PDP_BASE[idx]; 2110 } 2111 #endif /* XEN && __x86_64__*/ 2112 #ifdef XEN 2113 s = splvm(); 2114 object = (vaddr_t)v; 2115 for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) { 2116 (void) pmap_extract(pmap_kernel(), object, &pdirpa); 2117 /* remap this page RO */ 2118 pmap_kenter_pa(object, pdirpa, VM_PROT_READ, 0); 2119 pmap_update(pmap_kernel()); 2120 /* 2121 * pin as L2/L4 page, we have to do the page with the 2122 * PDIR_SLOT_PTE entries last 2123 */ 2124 #ifdef PAE 2125 if (i == l2tol3(PDIR_SLOT_PTE)) 2126 continue; 2127 #endif 2128 xpq_queue_pin_table(xpmap_ptom_masked(pdirpa)); 2129 } 2130 #ifdef PAE 2131 object = ((vaddr_t)pdir) + PAGE_SIZE * l2tol3(PDIR_SLOT_PTE); 2132 (void)pmap_extract(pmap_kernel(), object, &pdirpa); 2133 xpq_queue_pin_table(xpmap_ptom_masked(pdirpa)); 2134 #endif 2135 xpq_flush_queue(); 2136 splx(s); 2137 #endif /* XEN */ 2138 2139 return (0); 2140 } 2141 2142 /* 2143 * pmap_pdp_dtor: destructor for the PDP cache. 2144 */ 2145 2146 void 2147 pmap_pdp_dtor(void *arg, void *v) 2148 { 2149 #ifdef XEN 2150 paddr_t pdirpa = 0; /* XXX: GCC */ 2151 vaddr_t object = (vaddr_t)v; 2152 int i; 2153 int s = splvm(); 2154 pt_entry_t *pte; 2155 2156 for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) { 2157 /* fetch the physical address of the page directory. 
*/ 2158 (void) pmap_extract(pmap_kernel(), object, &pdirpa); 2159 /* unpin page table */ 2160 xpq_queue_unpin_table(xpmap_ptom_masked(pdirpa)); 2161 } 2162 object = (vaddr_t)v; 2163 for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) { 2164 /* Set page RW again */ 2165 pte = kvtopte(object); 2166 xpq_queue_pte_update(xpmap_ptetomach(pte), *pte | PG_RW); 2167 xpq_queue_invlpg((vaddr_t)object); 2168 } 2169 xpq_flush_queue(); 2170 splx(s); 2171 #endif /* XEN */ 2172 } 2173 2174 #ifdef PAE 2175 2176 /* pmap_pdp_alloc: Allocate a page for the pdp memory pool. */ 2177 2178 void * 2179 pmap_pdp_alloc(struct pool *pp, int flags) 2180 { 2181 return (void *)uvm_km_alloc(kernel_map, 2182 PAGE_SIZE * PDP_SIZE, PAGE_SIZE * PDP_SIZE, 2183 ((flags & PR_WAITOK) ? 0 : UVM_KMF_NOWAIT | UVM_KMF_TRYLOCK) 2184 | UVM_KMF_WIRED); 2185 } 2186 2187 /* 2188 * pmap_pdp_free: free a PDP 2189 */ 2190 2191 void 2192 pmap_pdp_free(struct pool *pp, void *v) 2193 { 2194 uvm_km_free(kernel_map, (vaddr_t)v, PAGE_SIZE * PDP_SIZE, 2195 UVM_KMF_WIRED); 2196 } 2197 #endif /* PAE */ 2198 2199 /* 2200 * pmap_create: create a pmap 2201 * 2202 * => note: old pmap interface took a "size" args which allowed for 2203 * the creation of "software only" pmaps (not in bsd). 2204 */ 2205 2206 struct pmap * 2207 pmap_create(void) 2208 { 2209 struct pmap *pmap; 2210 int i; 2211 2212 pmap = pool_cache_get(&pmap_cache, PR_WAITOK); 2213 2214 /* init uvm_object */ 2215 for (i = 0; i < PTP_LEVELS - 1; i++) { 2216 UVM_OBJ_INIT(&pmap->pm_obj[i], NULL, 1); 2217 pmap->pm_ptphint[i] = NULL; 2218 } 2219 pmap->pm_stats.wired_count = 0; 2220 /* count the PDP allocd below */ 2221 pmap->pm_stats.resident_count = PDP_SIZE; 2222 #if !defined(__x86_64__) 2223 pmap->pm_hiexec = 0; 2224 #endif /* !defined(__x86_64__) */ 2225 pmap->pm_flags = 0; 2226 pmap->pm_cpus = 0; 2227 pmap->pm_kernel_cpus = 0; 2228 2229 /* init the LDT */ 2230 pmap->pm_ldt = NULL; 2231 pmap->pm_ldt_len = 0; 2232 pmap->pm_ldt_sel = GSYSSEL(GLDT_SEL, SEL_KPL); 2233 2234 /* allocate PDP */ 2235 try_again: 2236 pmap->pm_pdir = pool_cache_get(&pmap_pdp_cache, PR_WAITOK); 2237 2238 mutex_enter(&pmaps_lock); 2239 2240 if (pmap->pm_pdir[PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1] - 1] == 0) { 2241 mutex_exit(&pmaps_lock); 2242 pool_cache_destruct_object(&pmap_pdp_cache, pmap->pm_pdir); 2243 goto try_again; 2244 } 2245 2246 #ifdef PAE 2247 for (i = 0; i < PDP_SIZE; i++) 2248 pmap->pm_pdirpa[i] = 2249 pmap_pte2pa(pmap->pm_pdir[PDIR_SLOT_PTE + i]); 2250 #else 2251 pmap->pm_pdirpa = pmap_pte2pa(pmap->pm_pdir[PDIR_SLOT_PTE]); 2252 #endif 2253 2254 LIST_INSERT_HEAD(&pmaps, pmap, pm_list); 2255 2256 mutex_exit(&pmaps_lock); 2257 2258 return (pmap); 2259 } 2260 2261 /* 2262 * pmap_destroy: drop reference count on pmap. free pmap if 2263 * reference count goes to zero. 2264 */ 2265 2266 void 2267 pmap_destroy(struct pmap *pmap) 2268 { 2269 int i; 2270 #ifdef DIAGNOSTIC 2271 struct cpu_info *ci; 2272 CPU_INFO_ITERATOR cii; 2273 #endif /* DIAGNOSTIC */ 2274 2275 /* 2276 * if we have torn down this pmap, process deferred frees and 2277 * invalidations now. 
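	 *
	 * the deferral is armed by pmap_remove_all(), which only records
	 * the dying pmap in curlwp->l_md.md_gc_pmap.  a hedged sketch of
	 * the overall teardown pattern, for illustration only:
	 *
	 *	pmap_remove_all(pmap);		arm deferred invalidations
	 *	pmap_remove(pmap, sva, eva);	tear the mappings down
	 *	pmap_destroy(pmap);		pmap_update() runs just below
	 *					if work is still deferred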
2278 */ 2279 if (__predict_false(curlwp->l_md.md_gc_pmap == pmap)) { 2280 pmap_update(pmap); 2281 } 2282 2283 /* 2284 * drop reference count 2285 */ 2286 2287 if (atomic_dec_uint_nv(&pmap->pm_obj[0].uo_refs) > 0) { 2288 return; 2289 } 2290 2291 #ifdef DIAGNOSTIC 2292 for (CPU_INFO_FOREACH(cii, ci)) 2293 if (ci->ci_pmap == pmap) 2294 panic("destroying pmap being used"); 2295 #endif /* DIAGNOSTIC */ 2296 2297 /* 2298 * reference count is zero, free pmap resources and then free pmap. 2299 */ 2300 #ifdef XEN 2301 /* 2302 * Xen lazy APDP handling: 2303 * clear APDP_PDE if pmap is the currently mapped 2304 */ 2305 if (xpmap_ptom_masked(pmap_pdirpa(pmap, 0)) == (*APDP_PDE & PG_FRAME)) { 2306 kpreempt_disable(); 2307 for (i = 0; i < PDP_SIZE; i++) { 2308 pmap_pte_set(&APDP_PDE[i], 0); 2309 #ifdef PAE 2310 /* clear shadow entry too */ 2311 pmap_pte_set(&APDP_PDE_SHADOW[i], 0); 2312 #endif 2313 } 2314 pmap_pte_flush(); 2315 pmap_apte_flush(pmap_kernel()); 2316 kpreempt_enable(); 2317 } 2318 #endif 2319 2320 /* 2321 * remove it from global list of pmaps 2322 */ 2323 2324 mutex_enter(&pmaps_lock); 2325 LIST_REMOVE(pmap, pm_list); 2326 mutex_exit(&pmaps_lock); 2327 2328 /* 2329 * destroyed pmap shouldn't have remaining PTPs 2330 */ 2331 2332 for (i = 0; i < PTP_LEVELS - 1; i++) { 2333 KASSERT(pmap->pm_obj[i].uo_npages == 0); 2334 KASSERT(TAILQ_EMPTY(&pmap->pm_obj[i].memq)); 2335 } 2336 2337 /* 2338 * MULTIPROCESSOR -- no need to flush out of other processors' 2339 * APTE space because we do that in pmap_unmap_ptes(). 2340 */ 2341 pool_cache_put(&pmap_pdp_cache, pmap->pm_pdir); 2342 2343 #ifdef USER_LDT 2344 if (pmap->pm_ldt != NULL) { 2345 /* 2346 * no need to switch the LDT; this address space is gone, 2347 * nothing is using it. 2348 * 2349 * No need to lock the pmap for ldt_free (or anything else), 2350 * we're the last one to use it. 2351 */ 2352 mutex_enter(&cpu_lock); 2353 ldt_free(pmap->pm_ldt_sel); 2354 mutex_exit(&cpu_lock); 2355 uvm_km_free(kernel_map, (vaddr_t)pmap->pm_ldt, 2356 pmap->pm_ldt_len, UVM_KMF_WIRED); 2357 } 2358 #endif 2359 2360 for (i = 0; i < PTP_LEVELS - 1; i++) 2361 mutex_destroy(&pmap->pm_obj[i].vmobjlock); 2362 pool_cache_put(&pmap_cache, pmap); 2363 } 2364 2365 /* 2366 * pmap_remove_all: pmap is being torn down by the current thread. 2367 * avoid unnecessary invalidations. 2368 */ 2369 2370 void 2371 pmap_remove_all(struct pmap *pmap) 2372 { 2373 lwp_t *l = curlwp; 2374 2375 KASSERT(l->l_md.md_gc_pmap == NULL); 2376 2377 l->l_md.md_gc_pmap = pmap; 2378 } 2379 2380 #if defined(PMAP_FORK) 2381 /* 2382 * pmap_fork: perform any necessary data structure manipulation when 2383 * a VM space is forked. 2384 */ 2385 2386 void 2387 pmap_fork(struct pmap *pmap1, struct pmap *pmap2) 2388 { 2389 #ifdef USER_LDT 2390 union descriptor *new_ldt; 2391 size_t len; 2392 int sel; 2393 2394 if (__predict_true(pmap1->pm_ldt == NULL)) { 2395 return; 2396 } 2397 2398 retry: 2399 if (pmap1->pm_ldt != NULL) { 2400 len = pmap1->pm_ldt_len; 2401 new_ldt = (union descriptor *)uvm_km_alloc(kernel_map, len, 0, 2402 UVM_KMF_WIRED); 2403 mutex_enter(&cpu_lock); 2404 sel = ldt_alloc(new_ldt, len); 2405 if (sel == -1) { 2406 mutex_exit(&cpu_lock); 2407 uvm_km_free(kernel_map, (vaddr_t)new_ldt, len, 2408 UVM_KMF_WIRED); 2409 printf("WARNING: pmap_fork: unable to allocate LDT\n"); 2410 return; 2411 } 2412 } else { 2413 len = -1; 2414 new_ldt = NULL; 2415 sel = -1; 2416 mutex_enter(&cpu_lock); 2417 } 2418 2419 /* Copy the LDT, if necessary. 
*/ 2420 if (pmap1->pm_ldt != NULL) { 2421 if (len != pmap1->pm_ldt_len) { 2422 if (len != -1) { 2423 ldt_free(sel); 2424 uvm_km_free(kernel_map, (vaddr_t)new_ldt, 2425 len, UVM_KMF_WIRED); 2426 } 2427 mutex_exit(&cpu_lock); 2428 goto retry; 2429 } 2430 2431 memcpy(new_ldt, pmap1->pm_ldt, len); 2432 pmap2->pm_ldt = new_ldt; 2433 pmap2->pm_ldt_len = pmap1->pm_ldt_len; 2434 pmap2->pm_ldt_sel = sel; 2435 len = -1; 2436 } 2437 2438 if (len != -1) { 2439 ldt_free(sel); 2440 uvm_km_free(kernel_map, (vaddr_t)new_ldt, len, 2441 UVM_KMF_WIRED); 2442 } 2443 mutex_exit(&cpu_lock); 2444 #endif /* USER_LDT */ 2445 } 2446 #endif /* PMAP_FORK */ 2447 2448 #ifdef USER_LDT 2449 2450 /* 2451 * pmap_ldt_xcall: cross call used by pmap_ldt_sync. if the named pmap 2452 * is active, reload LDTR. 2453 */ 2454 static void 2455 pmap_ldt_xcall(void *arg1, void *arg2) 2456 { 2457 struct pmap *pm; 2458 2459 kpreempt_disable(); 2460 pm = arg1; 2461 if (curcpu()->ci_pmap == pm) { 2462 lldt(pm->pm_ldt_sel); 2463 } 2464 kpreempt_enable(); 2465 } 2466 2467 /* 2468 * pmap_ldt_sync: LDT selector for the named pmap is changing. swap 2469 * in the new selector on all CPUs. 2470 */ 2471 void 2472 pmap_ldt_sync(struct pmap *pm) 2473 { 2474 uint64_t where; 2475 2476 KASSERT(mutex_owned(&cpu_lock)); 2477 2478 pmap_ldt_evcnt.ev_count++; 2479 where = xc_broadcast(0, pmap_ldt_xcall, pm, NULL); 2480 xc_wait(where); 2481 } 2482 2483 /* 2484 * pmap_ldt_cleanup: if the pmap has a local LDT, deallocate it, and 2485 * restore the default. 2486 */ 2487 2488 void 2489 pmap_ldt_cleanup(struct lwp *l) 2490 { 2491 pmap_t pmap = l->l_proc->p_vmspace->vm_map.pmap; 2492 union descriptor *dp = NULL; 2493 size_t len = 0; 2494 int sel = -1; 2495 2496 if (__predict_true(pmap->pm_ldt == NULL)) { 2497 return; 2498 } 2499 2500 mutex_enter(&cpu_lock); 2501 if (pmap->pm_ldt != NULL) { 2502 sel = pmap->pm_ldt_sel; 2503 dp = pmap->pm_ldt; 2504 len = pmap->pm_ldt_len; 2505 pmap->pm_ldt_sel = GSYSSEL(GLDT_SEL, SEL_KPL); 2506 pmap->pm_ldt = NULL; 2507 pmap->pm_ldt_len = 0; 2508 pmap_ldt_sync(pmap); 2509 ldt_free(sel); 2510 uvm_km_free(kernel_map, (vaddr_t)dp, len, UVM_KMF_WIRED); 2511 } 2512 mutex_exit(&cpu_lock); 2513 } 2514 #endif /* USER_LDT */ 2515 2516 /* 2517 * pmap_activate: activate a process' pmap 2518 * 2519 * => must be called with kernel preemption disabled 2520 * => if lwp is the curlwp, then set ci_want_pmapload so that 2521 * actual MMU context switch will be done by pmap_load() later 2522 */ 2523 2524 void 2525 pmap_activate(struct lwp *l) 2526 { 2527 struct cpu_info *ci; 2528 struct pmap *pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map); 2529 2530 KASSERT(kpreempt_disabled()); 2531 2532 ci = curcpu(); 2533 2534 if (l == ci->ci_curlwp) { 2535 struct pcb *pcb; 2536 2537 KASSERT(ci->ci_want_pmapload == 0); 2538 KASSERT(ci->ci_tlbstate != TLBSTATE_VALID); 2539 #ifdef KSTACK_CHECK_DR0 2540 /* 2541 * setup breakpoint on the top of stack 2542 */ 2543 if (l == &lwp0) 2544 dr0(0, 0, 0, 0); 2545 else 2546 dr0(KSTACK_LOWEST_ADDR(l), 1, 3, 1); 2547 #endif 2548 2549 /* 2550 * no need to switch to kernel vmspace because 2551 * it's a subset of any vmspace. 
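		 *
		 * the actual MMU switch is deferred: we only raise
		 * ci_want_pmapload here and rely on a later pmap_load()
		 * call (typically made on the way back to user mode) to
		 * fill in %cr3 and the LDT.  a hedged sketch of that
		 * caller-side pattern, for illustration only:
		 *
		 *	kpreempt_disable();
		 *	pmap_activate(l);
		 *	kpreempt_enable();
		 *	...
		 *	if (curcpu()->ci_want_pmapload)
		 *		pmap_load();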
2552 */ 2553 2554 if (pmap == pmap_kernel()) { 2555 ci->ci_want_pmapload = 0; 2556 return; 2557 } 2558 2559 pcb = lwp_getpcb(l); 2560 ci->ci_want_pmapload = 1; 2561 2562 #if defined(__x86_64__) 2563 if (pcb->pcb_flags & PCB_GS64) 2564 wrmsr(MSR_KERNELGSBASE, pcb->pcb_gs); 2565 if (pcb->pcb_flags & PCB_FS64) 2566 wrmsr(MSR_FSBASE, pcb->pcb_fs); 2567 #endif /* defined(__x86_64__) */ 2568 } 2569 } 2570 2571 /* 2572 * pmap_reactivate: try to regain reference to the pmap. 2573 * 2574 * => must be called with kernel preemption disabled 2575 */ 2576 2577 static bool 2578 pmap_reactivate(struct pmap *pmap) 2579 { 2580 struct cpu_info *ci; 2581 uint32_t cpumask; 2582 bool result; 2583 uint32_t oldcpus; 2584 2585 ci = curcpu(); 2586 cpumask = ci->ci_cpumask; 2587 2588 KASSERT(kpreempt_disabled()); 2589 #if defined(XEN) && defined(__x86_64__) 2590 KASSERT(pmap->pm_pdirpa == xen_current_user_pgd); 2591 #elif defined(PAE) 2592 KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(pmap_l3pd[0])); 2593 #elif !defined(XEN) 2594 KASSERT(pmap->pm_pdirpa == pmap_pte2pa(rcr3())); 2595 #endif 2596 2597 /* 2598 * if we still have a lazy reference to this pmap, 2599 * we can assume that there was no tlb shootdown 2600 * for this pmap in the meantime. 2601 * 2602 * the order of events here is important as we must 2603 * synchronize with TLB shootdown interrupts. declare 2604 * interest in invalidations (TLBSTATE_VALID) and then 2605 * check the cpumask, which the IPIs can change only 2606 * when the state is TLBSTATE_LAZY. 2607 */ 2608 2609 ci->ci_tlbstate = TLBSTATE_VALID; 2610 oldcpus = pmap->pm_cpus; 2611 KASSERT((pmap->pm_kernel_cpus & cpumask) != 0); 2612 if (oldcpus & cpumask) { 2613 /* got it */ 2614 result = true; 2615 } else { 2616 /* must reload */ 2617 atomic_or_32(&pmap->pm_cpus, cpumask); 2618 result = false; 2619 } 2620 2621 return result; 2622 } 2623 2624 /* 2625 * pmap_load: actually switch pmap. (fill in %cr3 and LDT info) 2626 */ 2627 2628 void 2629 pmap_load(void) 2630 { 2631 struct cpu_info *ci; 2632 uint32_t cpumask; 2633 struct pmap *pmap; 2634 struct pmap *oldpmap; 2635 struct lwp *l; 2636 struct pcb *pcb; 2637 uint64_t ncsw; 2638 2639 kpreempt_disable(); 2640 retry: 2641 ci = curcpu(); 2642 if (!ci->ci_want_pmapload) { 2643 kpreempt_enable(); 2644 return; 2645 } 2646 cpumask = ci->ci_cpumask; 2647 l = ci->ci_curlwp; 2648 ncsw = l->l_ncsw; 2649 2650 /* should be able to take ipis. */ 2651 KASSERT(ci->ci_ilevel < IPL_HIGH); 2652 #ifdef XEN 2653 /* XXX not yet KASSERT(x86_read_psl() != 0); */ 2654 #else 2655 KASSERT((x86_read_psl() & PSL_I) != 0); 2656 #endif 2657 2658 KASSERT(l != NULL); 2659 pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map); 2660 KASSERT(pmap != pmap_kernel()); 2661 oldpmap = ci->ci_pmap; 2662 pcb = lwp_getpcb(l); 2663 2664 if (pmap == oldpmap) { 2665 if (!pmap_reactivate(pmap)) { 2666 u_int gen = uvm_emap_gen_return(); 2667 2668 /* 2669 * pmap has been changed during deactivated. 2670 * our tlb may be stale. 2671 */ 2672 2673 tlbflush(); 2674 uvm_emap_update(gen); 2675 } 2676 2677 ci->ci_want_pmapload = 0; 2678 kpreempt_enable(); 2679 return; 2680 } 2681 2682 /* 2683 * grab a reference to the new pmap. 2684 */ 2685 2686 pmap_reference(pmap); 2687 2688 /* 2689 * actually switch pmap. 
2690 */ 2691 2692 atomic_and_32(&oldpmap->pm_cpus, ~cpumask); 2693 atomic_and_32(&oldpmap->pm_kernel_cpus, ~cpumask); 2694 2695 #if defined(XEN) && defined(__x86_64__) 2696 KASSERT(oldpmap->pm_pdirpa == xen_current_user_pgd || 2697 oldpmap == pmap_kernel()); 2698 #elif defined(PAE) 2699 KASSERT(pmap_pdirpa(oldpmap, 0) == pmap_pte2pa(pmap_l3pd[0])); 2700 #elif !defined(XEN) 2701 KASSERT(oldpmap->pm_pdirpa == pmap_pte2pa(rcr3())); 2702 #endif 2703 KASSERT((pmap->pm_cpus & cpumask) == 0); 2704 KASSERT((pmap->pm_kernel_cpus & cpumask) == 0); 2705 2706 /* 2707 * mark the pmap in use by this processor. again we must 2708 * synchronize with TLB shootdown interrupts, so set the 2709 * state VALID first, then register us for shootdown events 2710 * on this pmap. 2711 */ 2712 2713 ci->ci_tlbstate = TLBSTATE_VALID; 2714 atomic_or_32(&pmap->pm_cpus, cpumask); 2715 atomic_or_32(&pmap->pm_kernel_cpus, cpumask); 2716 ci->ci_pmap = pmap; 2717 2718 /* 2719 * update tss. now that we have registered for invalidations 2720 * from other CPUs, we're good to load the page tables. 2721 */ 2722 #ifdef PAE 2723 pcb->pcb_cr3 = pmap_l3paddr; 2724 #else 2725 pcb->pcb_cr3 = pmap->pm_pdirpa; 2726 #endif 2727 #if defined(XEN) && defined(__x86_64__) 2728 /* kernel pmap always in cr3 and should never go in user cr3 */ 2729 if (pmap_pdirpa(pmap, 0) != pmap_pdirpa(pmap_kernel(), 0)) { 2730 /* 2731 * Map user space address in kernel space and load 2732 * user cr3 2733 */ 2734 int i, s; 2735 pd_entry_t *old_pgd, *new_pgd; 2736 paddr_t addr; 2737 s = splvm(); 2738 new_pgd = pmap->pm_pdir; 2739 old_pgd = pmap_kernel()->pm_pdir; 2740 addr = xpmap_ptom(pmap_pdirpa(pmap_kernel(), 0)); 2741 for (i = 0; i < PDIR_SLOT_PTE; 2742 i++, addr += sizeof(pd_entry_t)) { 2743 if ((new_pgd[i] & PG_V) || (old_pgd[i] & PG_V)) 2744 xpq_queue_pte_update(addr, new_pgd[i]); 2745 } 2746 xpq_flush_queue(); /* XXXtlb */ 2747 tlbflush(); 2748 xen_set_user_pgd(pmap_pdirpa(pmap, 0)); 2749 xen_current_user_pgd = pmap_pdirpa(pmap, 0); 2750 splx(s); 2751 } 2752 #else /* XEN && x86_64 */ 2753 #if defined(XEN) 2754 /* 2755 * clear APDP slot, in case it points to a page table that has 2756 * been freed 2757 */ 2758 if (*APDP_PDE) { 2759 int i; 2760 for (i = 0; i < PDP_SIZE; i++) { 2761 pmap_pte_set(&APDP_PDE[i], 0); 2762 #ifdef PAE 2763 /* clear shadow entry too */ 2764 pmap_pte_set(&APDP_PDE_SHADOW[i], 0); 2765 #endif 2766 } 2767 } 2768 /* lldt() does pmap_pte_flush() */ 2769 #else /* XEN */ 2770 #if defined(i386) 2771 ci->ci_tss.tss_ldt = pmap->pm_ldt_sel; 2772 ci->ci_tss.tss_cr3 = pcb->pcb_cr3; 2773 #endif 2774 #endif /* XEN */ 2775 lldt(pmap->pm_ldt_sel); 2776 #ifdef PAE 2777 { 2778 paddr_t l3_pd = xpmap_ptom_masked(pmap_l3paddr); 2779 int i; 2780 int s = splvm(); 2781 /* don't update the kernel L3 slot */ 2782 for (i = 0 ; i < PDP_SIZE - 1 ; i++, l3_pd += sizeof(pd_entry_t)) { 2783 xpq_queue_pte_update(l3_pd, 2784 xpmap_ptom(pmap->pm_pdirpa[i]) | PG_V); 2785 } 2786 tlbflush(); 2787 xpq_flush_queue(); 2788 splx(s); 2789 } 2790 #else /* PAE */ 2791 { 2792 u_int gen = uvm_emap_gen_return(); 2793 lcr3(pcb->pcb_cr3); 2794 uvm_emap_update(gen); 2795 } 2796 #endif /* PAE */ 2797 #endif /* XEN && x86_64 */ 2798 2799 ci->ci_want_pmapload = 0; 2800 2801 /* 2802 * we're now running with the new pmap. drop the reference 2803 * to the old pmap. if we block, we need to go around again. 
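	 *
	 * (l->l_ncsw counts this lwp's context switches: if it changed
	 * across pmap_destroy() we may have slept and been rescheduled,
	 * so the per-CPU state established above can no longer be
	 * trusted and we start over.)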
2804 */ 2805 2806 pmap_destroy(oldpmap); 2807 if (l->l_ncsw != ncsw) { 2808 goto retry; 2809 } 2810 2811 kpreempt_enable(); 2812 } 2813 2814 /* 2815 * pmap_deactivate: deactivate a process' pmap 2816 * 2817 * => must be called with kernel preemption disabled (high SPL is enough) 2818 */ 2819 2820 void 2821 pmap_deactivate(struct lwp *l) 2822 { 2823 struct pmap *pmap; 2824 struct cpu_info *ci; 2825 2826 KASSERT(kpreempt_disabled()); 2827 2828 if (l != curlwp) { 2829 return; 2830 } 2831 2832 /* 2833 * wait for pending TLB shootdowns to complete. necessary 2834 * because TLB shootdown state is per-CPU, and the LWP may 2835 * be coming off the CPU before it has a chance to call 2836 * pmap_update(). 2837 */ 2838 pmap_tlb_shootwait(); 2839 2840 ci = curcpu(); 2841 2842 if (ci->ci_want_pmapload) { 2843 KASSERT(vm_map_pmap(&l->l_proc->p_vmspace->vm_map) 2844 != pmap_kernel()); 2845 KASSERT(vm_map_pmap(&l->l_proc->p_vmspace->vm_map) 2846 != ci->ci_pmap || ci->ci_tlbstate != TLBSTATE_VALID); 2847 2848 /* 2849 * userspace has not been touched. 2850 * nothing to do here. 2851 */ 2852 2853 ci->ci_want_pmapload = 0; 2854 return; 2855 } 2856 2857 pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map); 2858 2859 if (pmap == pmap_kernel()) { 2860 return; 2861 } 2862 2863 #if defined(XEN) && defined(__x86_64__) 2864 KASSERT(pmap->pm_pdirpa == xen_current_user_pgd); 2865 #elif defined(PAE) 2866 KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(pmap_l3pd[0])); 2867 #elif !defined(XEN) 2868 KASSERT(pmap->pm_pdirpa == pmap_pte2pa(rcr3())); 2869 #endif 2870 KASSERT(ci->ci_pmap == pmap); 2871 2872 /* 2873 * we aren't interested in TLB invalidations for this pmap, 2874 * at least for the time being. 2875 */ 2876 2877 KASSERT(ci->ci_tlbstate == TLBSTATE_VALID); 2878 ci->ci_tlbstate = TLBSTATE_LAZY; 2879 } 2880 2881 /* 2882 * end of lifecycle functions 2883 */ 2884 2885 /* 2886 * some misc. functions 2887 */ 2888 2889 static int 2890 pmap_pdes_invalid(vaddr_t va, pd_entry_t * const *pdes, pd_entry_t *lastpde) 2891 { 2892 int i; 2893 unsigned long index; 2894 pd_entry_t pde; 2895 2896 for (i = PTP_LEVELS; i > 1; i--) { 2897 index = pl_i(va, i); 2898 pde = pdes[i - 2][index]; 2899 if ((pde & PG_V) == 0) 2900 return i; 2901 } 2902 if (lastpde != NULL) 2903 *lastpde = pde; 2904 return 0; 2905 } 2906 2907 /* 2908 * pmap_extract: extract a PA for the given VA 2909 */ 2910 2911 bool 2912 pmap_extract(struct pmap *pmap, vaddr_t va, paddr_t *pap) 2913 { 2914 pt_entry_t *ptes, pte; 2915 pd_entry_t pde; 2916 pd_entry_t * const *pdes; 2917 struct pmap *pmap2; 2918 struct cpu_info *ci; 2919 vaddr_t pa; 2920 lwp_t *l; 2921 bool hard, rv; 2922 2923 rv = false; 2924 pa = 0; 2925 l = curlwp; 2926 2927 KPREEMPT_DISABLE(l); 2928 ci = l->l_cpu; 2929 if (__predict_true(!ci->ci_want_pmapload && ci->ci_pmap == pmap) || 2930 pmap == pmap_kernel()) { 2931 /* 2932 * no need to lock, because it's pmap_kernel() or our 2933 * own pmap and is active. if a user pmap, the caller 2934 * will hold the vm_map write/read locked and so prevent 2935 * entries from disappearing while we are here. ptps 2936 * can disappear via pmap_remove() and pmap_protect(), 2937 * but they are called with the vm_map write locked. 2938 */ 2939 hard = false; 2940 ptes = PTE_BASE; 2941 pdes = normal_pdes; 2942 } else { 2943 /* we lose, do it the hard way. 
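		 * ("hard" means temporarily mapping the other pmap's page
		 * tables with pmap_map_ptes(), which also locks that pmap;
		 * it is undone by pmap_unmap_ptes() below.)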
*/ 2944 hard = true; 2945 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); 2946 } 2947 if (pmap_pdes_valid(va, pdes, &pde)) { 2948 pte = ptes[pl1_i(va)]; 2949 if (pde & PG_PS) { 2950 pa = (pde & PG_LGFRAME) | (va & (NBPD_L2 - 1)); 2951 rv = true; 2952 } else if (__predict_true((pte & PG_V) != 0)) { 2953 pa = pmap_pte2pa(pte) | (va & (NBPD_L1 - 1)); 2954 rv = true; 2955 } 2956 } 2957 if (__predict_false(hard)) { 2958 pmap_unmap_ptes(pmap, pmap2); 2959 } 2960 KPREEMPT_ENABLE(l); 2961 if (pap != NULL) { 2962 *pap = pa; 2963 } 2964 return rv; 2965 } 2966 2967 2968 /* 2969 * vtophys: virtual address to physical address. For use by 2970 * machine-dependent code only. 2971 */ 2972 2973 paddr_t 2974 vtophys(vaddr_t va) 2975 { 2976 paddr_t pa; 2977 2978 if (pmap_extract(pmap_kernel(), va, &pa) == true) 2979 return (pa); 2980 return (0); 2981 } 2982 2983 #ifdef XEN 2984 /* 2985 * pmap_extract_ma: extract a MA for the given VA 2986 */ 2987 2988 bool 2989 pmap_extract_ma(struct pmap *pmap, vaddr_t va, paddr_t *pap) 2990 { 2991 pt_entry_t *ptes, pte; 2992 pd_entry_t pde; 2993 pd_entry_t * const *pdes; 2994 struct pmap *pmap2; 2995 2996 kpreempt_disable(); 2997 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); 2998 if (!pmap_pdes_valid(va, pdes, &pde)) { 2999 pmap_unmap_ptes(pmap, pmap2); 3000 kpreempt_enable(); 3001 return false; 3002 } 3003 3004 pte = ptes[pl1_i(va)]; 3005 pmap_unmap_ptes(pmap, pmap2); 3006 kpreempt_enable(); 3007 3008 if (__predict_true((pte & PG_V) != 0)) { 3009 if (pap != NULL) 3010 *pap = (pte & PG_FRAME) | (va & (NBPD_L1 - 1)); 3011 return true; 3012 } 3013 3014 return false; 3015 } 3016 3017 /* 3018 * vtomach: virtual address to machine address. For use by 3019 * machine-dependent code only. 3020 */ 3021 3022 paddr_t 3023 vtomach(vaddr_t va) 3024 { 3025 paddr_t pa; 3026 3027 if (pmap_extract_ma(pmap_kernel(), va, &pa) == true) 3028 return (pa); 3029 return (0); 3030 } 3031 3032 #endif /* XEN */ 3033 3034 3035 3036 /* 3037 * pmap_virtual_space: used during bootup [pmap_steal_memory] to 3038 * determine the bounds of the kernel virtual addess space. 3039 */ 3040 3041 void 3042 pmap_virtual_space(vaddr_t *startp, vaddr_t *endp) 3043 { 3044 *startp = virtual_avail; 3045 *endp = virtual_end; 3046 } 3047 3048 /* 3049 * pmap_map: map a range of PAs into kvm. 3050 * 3051 * => used during crash dump 3052 * => XXX: pmap_map() should be phased out? 3053 */ 3054 3055 vaddr_t 3056 pmap_map(vaddr_t va, paddr_t spa, paddr_t epa, vm_prot_t prot) 3057 { 3058 while (spa < epa) { 3059 pmap_kenter_pa(va, spa, prot, 0); 3060 va += PAGE_SIZE; 3061 spa += PAGE_SIZE; 3062 } 3063 pmap_update(pmap_kernel()); 3064 return va; 3065 } 3066 3067 /* 3068 * pmap_zero_page: zero a page 3069 */ 3070 3071 void 3072 pmap_zero_page(paddr_t pa) 3073 { 3074 pt_entry_t *zpte; 3075 void *zerova; 3076 int id; 3077 3078 kpreempt_disable(); 3079 id = cpu_number(); 3080 zpte = PTESLEW(zero_pte, id); 3081 zerova = VASLEW(zerop, id); 3082 3083 #ifdef DIAGNOSTIC 3084 if (*zpte) 3085 panic("pmap_zero_page: lock botch"); 3086 #endif 3087 3088 pmap_pte_set(zpte, pmap_pa2pte(pa) | PG_V | PG_RW | PG_M | PG_U | PG_k); 3089 pmap_pte_flush(); 3090 pmap_update_pg((vaddr_t)zerova); /* flush TLB */ 3091 3092 memset(zerova, 0, PAGE_SIZE); 3093 3094 #if defined(DIAGNOSTIC) || defined(XEN) 3095 pmap_pte_set(zpte, 0); /* zap ! */ 3096 pmap_pte_flush(); 3097 #endif 3098 kpreempt_enable(); 3099 } 3100 3101 /* 3102 * pmap_pagezeroidle: the same, for the idle loop page zero'er. 
3103 * Returns true if the page was zero'd, false if we aborted for 3104 * some reason. 3105 */ 3106 3107 bool 3108 pmap_pageidlezero(paddr_t pa) 3109 { 3110 pt_entry_t *zpte; 3111 void *zerova; 3112 bool rv; 3113 int id; 3114 3115 id = cpu_number(); 3116 zpte = PTESLEW(zero_pte, id); 3117 zerova = VASLEW(zerop, id); 3118 3119 KASSERT(cpu_feature & CPUID_SSE2); 3120 KASSERT(*zpte == 0); 3121 3122 pmap_pte_set(zpte, pmap_pa2pte(pa) | PG_V | PG_RW | PG_M | PG_U | PG_k); 3123 pmap_pte_flush(); 3124 pmap_update_pg((vaddr_t)zerova); /* flush TLB */ 3125 3126 rv = sse2_idlezero_page(zerova); 3127 3128 #if defined(DIAGNOSTIC) || defined(XEN) 3129 pmap_pte_set(zpte, 0); /* zap ! */ 3130 pmap_pte_flush(); 3131 #endif 3132 3133 return rv; 3134 } 3135 3136 /* 3137 * pmap_copy_page: copy a page 3138 */ 3139 3140 void 3141 pmap_copy_page(paddr_t srcpa, paddr_t dstpa) 3142 { 3143 pt_entry_t *spte; 3144 pt_entry_t *dpte; 3145 void *csrcva; 3146 void *cdstva; 3147 int id; 3148 3149 kpreempt_disable(); 3150 id = cpu_number(); 3151 spte = PTESLEW(csrc_pte,id); 3152 dpte = PTESLEW(cdst_pte,id); 3153 csrcva = VASLEW(csrcp, id); 3154 cdstva = VASLEW(cdstp, id); 3155 3156 KASSERT(*spte == 0 && *dpte == 0); 3157 3158 pmap_pte_set(spte, pmap_pa2pte(srcpa) | PG_V | PG_RW | PG_U | PG_k); 3159 pmap_pte_set(dpte, 3160 pmap_pa2pte(dstpa) | PG_V | PG_RW | PG_M | PG_U | PG_k); 3161 pmap_pte_flush(); 3162 pmap_update_2pg((vaddr_t)csrcva, (vaddr_t)cdstva); 3163 3164 memcpy(cdstva, csrcva, PAGE_SIZE); 3165 3166 #if defined(DIAGNOSTIC) || defined(XEN) 3167 pmap_pte_set(spte, 0); 3168 pmap_pte_set(dpte, 0); 3169 pmap_pte_flush(); 3170 #endif 3171 kpreempt_enable(); 3172 } 3173 3174 static pt_entry_t * 3175 pmap_map_ptp(struct vm_page *ptp) 3176 { 3177 pt_entry_t *ptppte; 3178 void *ptpva; 3179 int id; 3180 3181 KASSERT(kpreempt_disabled()); 3182 3183 id = cpu_number(); 3184 ptppte = PTESLEW(ptp_pte, id); 3185 ptpva = VASLEW(ptpp, id); 3186 #if !defined(XEN) 3187 pmap_pte_set(ptppte, pmap_pa2pte(VM_PAGE_TO_PHYS(ptp)) | PG_V | PG_M | 3188 PG_RW | PG_U | PG_k); 3189 #else 3190 pmap_pte_set(ptppte, pmap_pa2pte(VM_PAGE_TO_PHYS(ptp)) | PG_V | PG_M | 3191 PG_U | PG_k); 3192 #endif 3193 pmap_pte_flush(); 3194 pmap_update_pg((vaddr_t)ptpva); 3195 3196 return (pt_entry_t *)ptpva; 3197 } 3198 3199 static void 3200 pmap_unmap_ptp(void) 3201 { 3202 #if defined(DIAGNOSTIC) || defined(XEN) 3203 pt_entry_t *pte; 3204 3205 KASSERT(kpreempt_disabled()); 3206 3207 pte = PTESLEW(ptp_pte, cpu_number()); 3208 if (*pte != 0) { 3209 pmap_pte_set(pte, 0); 3210 pmap_pte_flush(); 3211 } 3212 #endif 3213 } 3214 3215 static pt_entry_t * 3216 pmap_map_pte(struct pmap *pmap, struct vm_page *ptp, vaddr_t va) 3217 { 3218 3219 KASSERT(kpreempt_disabled()); 3220 if (pmap_is_curpmap(pmap)) { 3221 return &PTE_BASE[pl1_i(va)]; /* (k)vtopte */ 3222 } 3223 KASSERT(ptp != NULL); 3224 return pmap_map_ptp(ptp) + pl1_pi(va); 3225 } 3226 3227 static void 3228 pmap_unmap_pte(void) 3229 { 3230 3231 KASSERT(kpreempt_disabled()); 3232 3233 pmap_unmap_ptp(); 3234 } 3235 3236 /* 3237 * p m a p r e m o v e f u n c t i o n s 3238 * 3239 * functions that remove mappings 3240 */ 3241 3242 /* 3243 * pmap_remove_ptes: remove PTEs from a PTP 3244 * 3245 * => must have proper locking on pmap_master_lock 3246 * => caller must hold pmap's lock 3247 * => PTP must be mapped into KVA 3248 * => PTP should be null if pmap == pmap_kernel() 3249 * => must be called with kernel preemption disabled 3250 * => returns composite pte if at least one page should be shot down 3251 */ 3252 3253 
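/*
 * a hedged sketch (not from the original source) of the locking and
 * preemption protocol the remove functions expect; the locals are as
 * declared in pmap_remove() below, which is the real, complete caller.
 */
#if 0	/* illustrative only, never compiled */
	kpreempt_disable();
	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);	/* locks pmap */
	xpte = pmap_remove_ptes(pmap, ptp,
	    (vaddr_t)&ptes[pl1_i(va)], va, blkendva, &pv_tofree);
	if (ptp != NULL && ptp->wire_count <= 1)
		pmap_free_ptp(pmap, ptp, va, ptes, pdes);
	pmap_unmap_ptes(pmap, pmap2);			/* unlocks pmap */
	kpreempt_enable();
	if (pv_tofree != NULL)
		pmap_free_pvs(pv_tofree);
#endif
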
static pt_entry_t 3254 pmap_remove_ptes(struct pmap *pmap, struct vm_page *ptp, vaddr_t ptpva, 3255 vaddr_t startva, vaddr_t endva, struct pv_entry **pv_tofree) 3256 { 3257 struct pv_entry *pve; 3258 pt_entry_t *pte = (pt_entry_t *) ptpva; 3259 pt_entry_t opte, xpte = 0; 3260 3261 KASSERT(pmap == pmap_kernel() || mutex_owned(&pmap->pm_lock)); 3262 KASSERT(kpreempt_disabled()); 3263 3264 /* 3265 * note that ptpva points to the PTE that maps startva. this may 3266 * or may not be the first PTE in the PTP. 3267 * 3268 * we loop through the PTP while there are still PTEs to look at 3269 * and the wire_count is greater than 1 (because we use the wire_count 3270 * to keep track of the number of real PTEs in the PTP). 3271 */ 3272 3273 for (/*null*/; startva < endva && (ptp == NULL || ptp->wire_count > 1) 3274 ; pte++, startva += PAGE_SIZE) { 3275 struct vm_page *pg; 3276 struct pmap_page *pp; 3277 3278 if (!pmap_valid_entry(*pte)) 3279 continue; /* VA not mapped */ 3280 3281 /* atomically save the old PTE and zap! it */ 3282 opte = pmap_pte_testset(pte, 0); 3283 if (!pmap_valid_entry(opte)) { 3284 continue; 3285 } 3286 3287 pmap_exec_account(pmap, startva, opte, 0); 3288 pmap_stats_update_bypte(pmap, 0, opte); 3289 xpte |= opte; 3290 3291 if (ptp) { 3292 ptp->wire_count--; /* dropping a PTE */ 3293 /* Make sure that the PDE is flushed */ 3294 if (ptp->wire_count <= 1) 3295 xpte |= PG_U; 3296 } 3297 3298 /* 3299 * if we are not on a pv_head list we are done. 3300 */ 3301 3302 if ((opte & PG_PVLIST) == 0) { 3303 #if defined(DIAGNOSTIC) && !defined(DOM0OPS) 3304 if (PHYS_TO_VM_PAGE(pmap_pte2pa(opte)) != NULL) 3305 panic("pmap_remove_ptes: managed page without " 3306 "PG_PVLIST for 0x%lx", startva); 3307 #endif 3308 continue; 3309 } 3310 3311 pg = PHYS_TO_VM_PAGE(pmap_pte2pa(opte)); 3312 #ifdef DIAGNOSTIC 3313 if (pg == NULL) 3314 panic("pmap_remove_ptes: unmanaged page marked " 3315 "PG_PVLIST, va = 0x%lx, pa = 0x%lx", 3316 startva, (u_long)pmap_pte2pa(opte)); 3317 #endif 3318 3319 /* sync R/M bits */ 3320 pp = VM_PAGE_TO_PP(pg); 3321 pp_lock(pp); 3322 pp->pp_attrs |= opte; 3323 pve = pmap_remove_pv(pp, ptp, startva); 3324 pp_unlock(pp); 3325 3326 if (pve != NULL) { 3327 pve->pve_next = *pv_tofree; 3328 *pv_tofree = pve; 3329 } 3330 3331 /* end of "for" loop: time for next pte */ 3332 } 3333 3334 return xpte; 3335 } 3336 3337 3338 /* 3339 * pmap_remove_pte: remove a single PTE from a PTP 3340 * 3341 * => must have proper locking on pmap_master_lock 3342 * => caller must hold pmap's lock 3343 * => PTP must be mapped into KVA 3344 * => PTP should be null if pmap == pmap_kernel() 3345 * => returns true if we removed a mapping 3346 * => must be called with kernel preemption disabled 3347 */ 3348 3349 static bool 3350 pmap_remove_pte(struct pmap *pmap, struct vm_page *ptp, pt_entry_t *pte, 3351 vaddr_t va, struct pv_entry **pv_tofree) 3352 { 3353 pt_entry_t opte; 3354 struct pv_entry *pve; 3355 struct vm_page *pg; 3356 struct pmap_page *pp; 3357 3358 KASSERT(pmap == pmap_kernel() || mutex_owned(&pmap->pm_lock)); 3359 KASSERT(pmap == pmap_kernel() || kpreempt_disabled()); 3360 3361 if (!pmap_valid_entry(*pte)) 3362 return(false); /* VA not mapped */ 3363 3364 /* atomically save the old PTE and zap! 
it */
	opte = pmap_pte_testset(pte, 0);
	if (!pmap_valid_entry(opte)) {
		return false;
	}

	pmap_exec_account(pmap, va, opte, 0);
	pmap_stats_update_bypte(pmap, 0, opte);

	if (opte & PG_U)
		pmap_tlb_shootdown(pmap, va, 0, opte);

	if (ptp) {
		ptp->wire_count--;		/* dropping a PTE */
		/* Make sure that the PDE is flushed */
		if ((ptp->wire_count <= 1) && !(opte & PG_U))
			pmap_tlb_shootdown(pmap, va, 0, opte);
	}

	/*
	 * if we are not on a pv_head list we are done.
	 */

	if ((opte & PG_PVLIST) == 0) {
#if defined(DIAGNOSTIC) && !defined(DOM0OPS)
		if (PHYS_TO_VM_PAGE(pmap_pte2pa(opte)) != NULL)
			panic("pmap_remove_pte: managed page without "
			    "PG_PVLIST for 0x%lx", va);
#endif
		return(true);
	}

	pg = PHYS_TO_VM_PAGE(pmap_pte2pa(opte));
#ifdef DIAGNOSTIC
	if (pg == NULL)
		panic("pmap_remove_pte: unmanaged page marked "
		    "PG_PVLIST, va = 0x%lx, pa = 0x%lx", va,
		    (u_long)(pmap_pte2pa(opte)));
#endif

	/* sync R/M bits */
	pp = VM_PAGE_TO_PP(pg);
	pp_lock(pp);
	pp->pp_attrs |= opte;
	pve = pmap_remove_pv(pp, ptp, va);
	pp_unlock(pp);

	if (pve) {
		pve->pve_next = *pv_tofree;
		*pv_tofree = pve;
	}

	return(true);
}

/*
 * pmap_remove: mapping removal function.
 *
 * => caller should not be holding any pmap locks
 */

void
pmap_remove(struct pmap *pmap, vaddr_t sva, vaddr_t eva)
{
	pt_entry_t *ptes, xpte = 0;
	pd_entry_t pde;
	pd_entry_t * const *pdes;
	struct pv_entry *pv_tofree = NULL;
	bool result;
	paddr_t ptppa;
	vaddr_t blkendva, va = sva;
	struct vm_page *ptp;
	struct pmap *pmap2;

	kpreempt_disable();
	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);	/* locks pmap */

	/*
	 * removing one page? take shortcut function.
	 */

	if (va + PAGE_SIZE == eva) {
		if (pmap_pdes_valid(va, pdes, &pde)) {

			/* PA of the PTP */
			ptppa = pmap_pte2pa(pde);

			/* get PTP if non-kernel mapping */
			if (pmap == pmap_kernel()) {
				/* we never free kernel PTPs */
				ptp = NULL;
			} else {
				ptp = pmap_find_ptp(pmap, va, ptppa, 1);
#ifdef DIAGNOSTIC
				if (ptp == NULL)
					panic("pmap_remove: unmanaged "
					    "PTP detected");
#endif
			}

			/* do it! */
			result = pmap_remove_pte(pmap, ptp,
			    &ptes[pl1_i(va)], va, &pv_tofree);

			/*
			 * if mapping removed and the PTP is no longer
			 * being used, free it!
			 */

			if (result && ptp && ptp->wire_count <= 1)
				pmap_free_ptp(pmap, ptp, va, ptes, pdes);
		}
	} else for (/* null */ ; va < eva ; va = blkendva) {
		int lvl;

		/* determine range of block */
		blkendva = x86_round_pdr(va+1);
		if (blkendva > eva)
			blkendva = eva;

		/*
		 * XXXCDC: our PTE mappings should never be removed
		 * with pmap_remove! if we allow this (and why would
		 * we?) then we end up freeing the pmap's page
		 * directory page (PDP) before we are finished using
		 * it when we hit it in the recursive mapping. this
		 * is BAD.
		 *
		 * long term solution is to move the PTEs out of user
		 * address space and into kernel address space (up
		 * with APTE). then we can set VM_MAXUSER_ADDRESS to
		 * be VM_MAX_ADDRESS.
3496 */ 3497 3498 if (pl_i(va, PTP_LEVELS) == PDIR_SLOT_PTE) 3499 /* XXXCDC: ugly hack to avoid freeing PDP here */ 3500 continue; 3501 3502 lvl = pmap_pdes_invalid(va, pdes, &pde); 3503 if (lvl != 0) { 3504 /* 3505 * skip a range corresponding to an invalid pde. 3506 */ 3507 blkendva = (va & ptp_masks[lvl - 1]) + nbpd[lvl - 1]; 3508 continue; 3509 } 3510 3511 /* PA of the PTP */ 3512 ptppa = pmap_pte2pa(pde); 3513 3514 /* get PTP if non-kernel mapping */ 3515 if (pmap == pmap_kernel()) { 3516 /* we never free kernel PTPs */ 3517 ptp = NULL; 3518 } else { 3519 ptp = pmap_find_ptp(pmap, va, ptppa, 1); 3520 #ifdef DIAGNOSTIC 3521 if (ptp == NULL) 3522 panic("pmap_remove: unmanaged PTP " 3523 "detected"); 3524 #endif 3525 } 3526 xpte |= pmap_remove_ptes(pmap, ptp, 3527 (vaddr_t)&ptes[pl1_i(va)], va, blkendva, &pv_tofree); 3528 3529 /* if PTP is no longer being used, free it! */ 3530 if (ptp && ptp->wire_count <= 1) { 3531 pmap_free_ptp(pmap, ptp, va, ptes, pdes); 3532 } 3533 if ((xpte & PG_U) != 0) 3534 pmap_tlb_shootdown(pmap, sva, eva, xpte); 3535 } 3536 pmap_unmap_ptes(pmap, pmap2); /* unlock pmap */ 3537 kpreempt_enable(); 3538 3539 /* Now we free unused PVs */ 3540 if (pv_tofree) 3541 pmap_free_pvs(pv_tofree); 3542 } 3543 3544 /* 3545 * pmap_sync_pv: clear pte bits and return the old value of the pte. 3546 * 3547 * => called with pp_lock held. (thus preemption disabled) 3548 * => issues tlb shootdowns if necessary. 3549 */ 3550 3551 static int 3552 pmap_sync_pv(struct pv_pte *pvpte, pt_entry_t expect, int clearbits, 3553 pt_entry_t *optep) 3554 { 3555 struct pmap *pmap; 3556 struct vm_page *ptp; 3557 vaddr_t va; 3558 pt_entry_t *ptep; 3559 pt_entry_t opte; 3560 pt_entry_t npte; 3561 bool need_shootdown; 3562 3563 ptp = pvpte->pte_ptp; 3564 va = pvpte->pte_va; 3565 KASSERT(ptp == NULL || ptp->uobject != NULL); 3566 KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset); 3567 pmap = ptp_to_pmap(ptp); 3568 3569 KASSERT((expect & ~(PG_FRAME | PG_V)) == 0); 3570 KASSERT((expect & PG_V) != 0); 3571 KASSERT(clearbits == ~0 || (clearbits & ~(PG_M | PG_U | PG_RW)) == 0); 3572 KASSERT(kpreempt_disabled()); 3573 3574 ptep = pmap_map_pte(pmap, ptp, va); 3575 do { 3576 opte = *ptep; 3577 KASSERT((opte & (PG_M | PG_U)) != PG_M); 3578 KASSERT((opte & (PG_U | PG_V)) != PG_U); 3579 KASSERT(opte == 0 || (opte & PG_V) != 0); 3580 if ((opte & (PG_FRAME | PG_V)) != expect) { 3581 3582 /* 3583 * we lost a race with a V->P operation like 3584 * pmap_remove(). wait for the competitor 3585 * reflecting pte bits into mp_attrs. 3586 * 3587 * issue a redundant TLB shootdown so that 3588 * we can wait for its completion. 3589 */ 3590 3591 pmap_unmap_pte(); 3592 if (clearbits != 0) { 3593 pmap_tlb_shootdown(pmap, va, 0, 3594 (pmap == pmap_kernel() ? PG_G : 0)); 3595 } 3596 return EAGAIN; 3597 } 3598 3599 /* 3600 * check if there's anything to do on this pte. 3601 */ 3602 3603 if ((opte & clearbits) == 0) { 3604 need_shootdown = false; 3605 break; 3606 } 3607 3608 /* 3609 * we need a shootdown if the pte is cached. (PG_U) 3610 * 3611 * ...unless we are clearing only the PG_RW bit and 3612 * it isn't cached as RW. (PG_M) 3613 */ 3614 3615 need_shootdown = (opte & PG_U) != 0 && 3616 !(clearbits == PG_RW && (opte & PG_M) == 0); 3617 3618 npte = opte & ~clearbits; 3619 3620 /* 3621 * if we need a shootdown anyway, clear PG_U and PG_M. 
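		 * (clearing them together with the shootdown is essentially
		 * free: the stale TLB entry is being invalidated anyway,
		 * the old bit values are still handed back to the caller in
		 * *optep, and the hardware will set PG_U/PG_M again on the
		 * next access.)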
3622 */ 3623 3624 if (need_shootdown) { 3625 npte &= ~(PG_U | PG_M); 3626 } 3627 KASSERT((npte & (PG_M | PG_U)) != PG_M); 3628 KASSERT((npte & (PG_U | PG_V)) != PG_U); 3629 KASSERT(npte == 0 || (opte & PG_V) != 0); 3630 } while (pmap_pte_cas(ptep, opte, npte) != opte); 3631 3632 if (need_shootdown) { 3633 pmap_tlb_shootdown(pmap, va, 0, opte); 3634 } 3635 pmap_unmap_pte(); 3636 3637 *optep = opte; 3638 return 0; 3639 } 3640 3641 /* 3642 * pmap_page_remove: remove a managed vm_page from all pmaps that map it 3643 * 3644 * => R/M bits are sync'd back to attrs 3645 */ 3646 3647 void 3648 pmap_page_remove(struct vm_page *pg) 3649 { 3650 struct pmap_page *pp; 3651 struct pv_pte *pvpte; 3652 struct pv_entry *killlist = NULL; 3653 struct vm_page *ptp; 3654 pt_entry_t expect; 3655 lwp_t *l; 3656 int count; 3657 3658 l = curlwp; 3659 pp = VM_PAGE_TO_PP(pg); 3660 expect = pmap_pa2pte(VM_PAGE_TO_PHYS(pg)) | PG_V; 3661 count = SPINLOCK_BACKOFF_MIN; 3662 kpreempt_disable(); 3663 startover: 3664 pp_lock(pp); 3665 while ((pvpte = pv_pte_first(pp)) != NULL) { 3666 struct pmap *pmap; 3667 struct pv_entry *pve; 3668 pt_entry_t opte; 3669 vaddr_t va; 3670 int error; 3671 3672 /* 3673 * add a reference to the pmap before clearing the pte. 3674 * otherwise the pmap can disappear behind us. 3675 */ 3676 3677 ptp = pvpte->pte_ptp; 3678 pmap = ptp_to_pmap(ptp); 3679 if (ptp != NULL) { 3680 pmap_reference(pmap); 3681 } 3682 3683 error = pmap_sync_pv(pvpte, expect, ~0, &opte); 3684 if (error == EAGAIN) { 3685 int hold_count; 3686 pp_unlock(pp); 3687 KERNEL_UNLOCK_ALL(curlwp, &hold_count); 3688 if (ptp != NULL) { 3689 pmap_destroy(pmap); 3690 } 3691 SPINLOCK_BACKOFF(count); 3692 KERNEL_LOCK(hold_count, curlwp); 3693 goto startover; 3694 } 3695 3696 pp->pp_attrs |= opte; 3697 va = pvpte->pte_va; 3698 pve = pmap_remove_pv(pp, ptp, va); 3699 pp_unlock(pp); 3700 3701 /* update the PTP reference count. free if last reference. */ 3702 if (ptp != NULL) { 3703 struct pmap *pmap2; 3704 pt_entry_t *ptes; 3705 pd_entry_t * const *pdes; 3706 3707 KASSERT(pmap != pmap_kernel()); 3708 3709 pmap_tlb_shootwait(); 3710 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); 3711 pmap_stats_update_bypte(pmap, 0, opte); 3712 ptp->wire_count--; 3713 if (ptp->wire_count <= 1) { 3714 pmap_free_ptp(pmap, ptp, va, ptes, pdes); 3715 } 3716 pmap_unmap_ptes(pmap, pmap2); 3717 pmap_destroy(pmap); 3718 } else { 3719 KASSERT(pmap == pmap_kernel()); 3720 pmap_stats_update_bypte(pmap, 0, opte); 3721 } 3722 3723 if (pve != NULL) { 3724 pve->pve_next = killlist; /* mark it for death */ 3725 killlist = pve; 3726 } 3727 pp_lock(pp); 3728 } 3729 pp_unlock(pp); 3730 kpreempt_enable(); 3731 3732 /* Now free unused pvs. */ 3733 pmap_free_pvs(killlist); 3734 } 3735 3736 /* 3737 * p m a p a t t r i b u t e f u n c t i o n s 3738 * functions that test/change managed page's attributes 3739 * since a page can be mapped multiple times we must check each PTE that 3740 * maps it by going down the pv lists. 
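 *
 * a hedged usage sketch (not from the original source); "pa" and the
 * clean-up step are placeholders, the real callers of these functions
 * live in the MI VM code:
 */

#if 0	/* illustrative only, never compiled */
	struct vm_page *pg = PHYS_TO_VM_PAGE(pa);

	if (pmap_test_attrs(pg, PG_M)) {
		/* modified in at least one mapping: clean it */
		(void)pmap_clear_attrs(pg, PG_M);
	}
#endif

/*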
3741 */ 3742 3743 /* 3744 * pmap_test_attrs: test a page's attributes 3745 */ 3746 3747 bool 3748 pmap_test_attrs(struct vm_page *pg, unsigned testbits) 3749 { 3750 struct pmap_page *pp; 3751 struct pv_pte *pvpte; 3752 pt_entry_t expect; 3753 u_int result; 3754 3755 pp = VM_PAGE_TO_PP(pg); 3756 if ((pp->pp_attrs & testbits) != 0) { 3757 return true; 3758 } 3759 expect = pmap_pa2pte(VM_PAGE_TO_PHYS(pg)) | PG_V; 3760 pp_lock(pp); 3761 for (pvpte = pv_pte_first(pp); pvpte; pvpte = pv_pte_next(pp, pvpte)) { 3762 pt_entry_t opte; 3763 int error; 3764 3765 if ((pp->pp_attrs & testbits) != 0) { 3766 break; 3767 } 3768 error = pmap_sync_pv(pvpte, expect, 0, &opte); 3769 if (error == 0) { 3770 pp->pp_attrs |= opte; 3771 } 3772 } 3773 result = pp->pp_attrs & testbits; 3774 pp_unlock(pp); 3775 3776 /* 3777 * note that we will exit the for loop with a non-null pve if 3778 * we have found the bits we are testing for. 3779 */ 3780 3781 return result != 0; 3782 } 3783 3784 /* 3785 * pmap_clear_attrs: clear the specified attribute for a page. 3786 * 3787 * => we return true if we cleared one of the bits we were asked to 3788 */ 3789 3790 bool 3791 pmap_clear_attrs(struct vm_page *pg, unsigned clearbits) 3792 { 3793 struct pmap_page *pp; 3794 struct pv_pte *pvpte; 3795 u_int result; 3796 pt_entry_t expect; 3797 int count; 3798 3799 pp = VM_PAGE_TO_PP(pg); 3800 expect = pmap_pa2pte(VM_PAGE_TO_PHYS(pg)) | PG_V; 3801 count = SPINLOCK_BACKOFF_MIN; 3802 kpreempt_disable(); 3803 startover: 3804 pp_lock(pp); 3805 for (pvpte = pv_pte_first(pp); pvpte; pvpte = pv_pte_next(pp, pvpte)) { 3806 pt_entry_t opte; 3807 int error; 3808 3809 error = pmap_sync_pv(pvpte, expect, clearbits, &opte); 3810 if (error == EAGAIN) { 3811 int hold_count; 3812 pp_unlock(pp); 3813 KERNEL_UNLOCK_ALL(curlwp, &hold_count); 3814 SPINLOCK_BACKOFF(count); 3815 KERNEL_LOCK(hold_count, curlwp); 3816 goto startover; 3817 } 3818 pp->pp_attrs |= opte; 3819 } 3820 result = pp->pp_attrs & clearbits; 3821 pp->pp_attrs &= ~clearbits; 3822 pp_unlock(pp); 3823 kpreempt_enable(); 3824 3825 return result != 0; 3826 } 3827 3828 3829 /* 3830 * p m a p p r o t e c t i o n f u n c t i o n s 3831 */ 3832 3833 /* 3834 * pmap_page_protect: change the protection of all recorded mappings 3835 * of a managed page 3836 * 3837 * => NOTE: this is an inline function in pmap.h 3838 */ 3839 3840 /* see pmap.h */ 3841 3842 /* 3843 * pmap_protect: set the protection in of the pages in a pmap 3844 * 3845 * => NOTE: this is an inline function in pmap.h 3846 */ 3847 3848 /* see pmap.h */ 3849 3850 /* 3851 * pmap_write_protect: write-protect pages in a pmap 3852 */ 3853 3854 void 3855 pmap_write_protect(struct pmap *pmap, vaddr_t sva, vaddr_t eva, vm_prot_t prot) 3856 { 3857 pt_entry_t *ptes, *epte; 3858 pt_entry_t *spte; 3859 pd_entry_t * const *pdes; 3860 vaddr_t blockend, va; 3861 pt_entry_t opte; 3862 struct pmap *pmap2; 3863 3864 KASSERT(curlwp->l_md.md_gc_pmap != pmap); 3865 3866 kpreempt_disable(); 3867 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); /* locks pmap */ 3868 3869 /* should be ok, but just in case ... */ 3870 sva &= PG_FRAME; 3871 eva &= PG_FRAME; 3872 3873 for (va = sva ; va < eva ; va = blockend) { 3874 3875 blockend = (va & L2_FRAME) + NBPD_L2; 3876 if (blockend > eva) 3877 blockend = eva; 3878 3879 /* 3880 * XXXCDC: our PTE mappings should never be write-protected! 3881 * 3882 * long term solution is to move the PTEs out of user 3883 * address space. and into kernel address space (up 3884 * with APTE). 
then we can set VM_MAXUSER_ADDRESS to 3885 * be VM_MAX_ADDRESS. 3886 */ 3887 3888 /* XXXCDC: ugly hack to avoid freeing PDP here */ 3889 if (pl_i(va, PTP_LEVELS) == PDIR_SLOT_PTE) 3890 continue; 3891 3892 /* empty block? */ 3893 if (!pmap_pdes_valid(va, pdes, NULL)) 3894 continue; 3895 3896 #ifdef DIAGNOSTIC 3897 if (va >= VM_MAXUSER_ADDRESS && 3898 va < VM_MAX_ADDRESS) 3899 panic("pmap_write_protect: PTE space"); 3900 #endif 3901 3902 spte = &ptes[pl1_i(va)]; 3903 epte = &ptes[pl1_i(blockend)]; 3904 3905 for (/*null */; spte < epte ; spte++) { 3906 pt_entry_t npte; 3907 3908 do { 3909 opte = *spte; 3910 if ((~opte & (PG_RW | PG_V)) != 0) { 3911 goto next; 3912 } 3913 npte = opte & ~PG_RW; 3914 } while (pmap_pte_cas(spte, opte, npte) != opte); 3915 if ((opte & PG_M) != 0) { 3916 vaddr_t tva; 3917 3918 tva = x86_ptob(spte - ptes); 3919 pmap_tlb_shootdown(pmap, tva, 0, opte); 3920 } 3921 next:; 3922 } 3923 } 3924 3925 pmap_unmap_ptes(pmap, pmap2); /* unlocks pmap */ 3926 kpreempt_enable(); 3927 } 3928 3929 /* 3930 * end of protection functions 3931 */ 3932 3933 /* 3934 * pmap_unwire: clear the wired bit in the PTE 3935 * 3936 * => mapping should already be in map 3937 */ 3938 3939 void 3940 pmap_unwire(struct pmap *pmap, vaddr_t va) 3941 { 3942 pt_entry_t *ptes; 3943 pd_entry_t * const *pdes; 3944 struct pmap *pmap2; 3945 3946 kpreempt_disable(); 3947 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); /* locks pmap */ 3948 3949 if (pmap_pdes_valid(va, pdes, NULL)) { 3950 pt_entry_t *ptep = &ptes[pl1_i(va)]; 3951 pt_entry_t opte = *ptep; 3952 3953 #ifdef DIAGNOSTIC 3954 if (!pmap_valid_entry(opte)) 3955 panic("pmap_unwire: invalid (unmapped) va 0x%lx", va); 3956 #endif 3957 if ((opte & PG_W) != 0) { 3958 pt_entry_t npte = opte & ~PG_W; 3959 3960 opte = pmap_pte_testset(ptep, npte); 3961 pmap_stats_update_bypte(pmap, npte, opte); 3962 } 3963 #ifdef DIAGNOSTIC 3964 else { 3965 printf("pmap_unwire: wiring for pmap %p va 0x%lx " 3966 "didn't change!\n", pmap, va); 3967 } 3968 #endif 3969 pmap_unmap_ptes(pmap, pmap2); /* unlocks map */ 3970 } 3971 #ifdef DIAGNOSTIC 3972 else { 3973 panic("pmap_unwire: invalid PDE"); 3974 } 3975 #endif 3976 kpreempt_enable(); 3977 } 3978 3979 /* 3980 * pmap_copy: copy mappings from one pmap to another 3981 * 3982 * => optional function 3983 * void pmap_copy(dst_pmap, src_pmap, dst_addr, len, src_addr) 3984 */ 3985 3986 /* 3987 * defined as macro in pmap.h 3988 */ 3989 3990 /* 3991 * pmap_enter: enter a mapping into a pmap 3992 * 3993 * => must be done "now" ... no lazy-evaluation 3994 * => we set pmap => pv_head locking 3995 */ 3996 #ifdef XEN 3997 int 3998 pmap_enter_ma(struct pmap *pmap, vaddr_t va, paddr_t ma, paddr_t pa, 3999 vm_prot_t prot, u_int flags, int domid) 4000 { 4001 #else /* XEN */ 4002 int 4003 pmap_enter(struct pmap *pmap, vaddr_t va, paddr_t pa, vm_prot_t prot, 4004 u_int flags) 4005 { 4006 paddr_t ma = pa; 4007 #endif /* XEN */ 4008 pt_entry_t *ptes, opte, npte; 4009 pt_entry_t *ptep; 4010 pd_entry_t * const *pdes; 4011 struct vm_page *ptp, *pg; 4012 struct pmap_page *new_pp; 4013 struct pmap_page *old_pp; 4014 struct pv_entry *old_pve = NULL; 4015 struct pv_entry *new_pve; 4016 struct pv_entry *new_pve2; 4017 int error; 4018 bool wired = (flags & PMAP_WIRED) != 0; 4019 struct pmap *pmap2; 4020 4021 KASSERT(pmap_initialized); 4022 KASSERT(curlwp->l_md.md_gc_pmap != pmap); 4023 4024 #ifdef DIAGNOSTIC 4025 /* sanity check: totally out of range? 
*/ 4026 if (va >= VM_MAX_KERNEL_ADDRESS) 4027 panic("pmap_enter: too big"); 4028 4029 if (va == (vaddr_t) PDP_BASE || va == (vaddr_t) APDP_BASE) 4030 panic("pmap_enter: trying to map over PDP/APDP!"); 4031 4032 /* sanity check: kernel PTPs should already have been pre-allocated */ 4033 if (va >= VM_MIN_KERNEL_ADDRESS && 4034 !pmap_valid_entry(pmap->pm_pdir[pl_i(va, PTP_LEVELS)])) 4035 panic("pmap_enter: missing kernel PTP for va %lx!", va); 4036 #endif /* DIAGNOSTIC */ 4037 #ifdef XEN 4038 KASSERT(domid == DOMID_SELF || pa == 0); 4039 #endif /* XEN */ 4040 4041 npte = ma | protection_codes[prot] | PG_V; 4042 if (wired) 4043 npte |= PG_W; 4044 if (flags & PMAP_NOCACHE) 4045 npte |= PG_N; 4046 if (va < VM_MAXUSER_ADDRESS) 4047 npte |= PG_u; 4048 else if (va < VM_MAX_ADDRESS) 4049 npte |= (PG_u | PG_RW); /* XXXCDC: no longer needed? */ 4050 else 4051 npte |= PG_k; 4052 if (pmap == pmap_kernel()) 4053 npte |= pmap_pg_g; 4054 if (flags & VM_PROT_ALL) { 4055 npte |= PG_U; 4056 if (flags & VM_PROT_WRITE) { 4057 KASSERT((npte & PG_RW) != 0); 4058 npte |= PG_M; 4059 } 4060 } 4061 4062 #ifdef XEN 4063 if (domid != DOMID_SELF) 4064 pg = NULL; 4065 else 4066 #endif 4067 pg = PHYS_TO_VM_PAGE(pa); 4068 if (pg != NULL) { 4069 /* This is a managed page */ 4070 npte |= PG_PVLIST; 4071 new_pp = VM_PAGE_TO_PP(pg); 4072 } else { 4073 new_pp = NULL; 4074 } 4075 4076 /* get pves. */ 4077 new_pve = pool_cache_get(&pmap_pv_cache, PR_NOWAIT); 4078 new_pve2 = pool_cache_get(&pmap_pv_cache, PR_NOWAIT); 4079 if (new_pve == NULL || new_pve2 == NULL) { 4080 if (flags & PMAP_CANFAIL) { 4081 error = ENOMEM; 4082 goto out2; 4083 } 4084 panic("pmap_enter: pve allocation failed"); 4085 } 4086 4087 kpreempt_disable(); 4088 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); /* locks pmap */ 4089 if (pmap == pmap_kernel()) { 4090 ptp = NULL; 4091 } else { 4092 ptp = pmap_get_ptp(pmap, va, pdes); 4093 if (ptp == NULL) { 4094 pmap_unmap_ptes(pmap, pmap2); 4095 if (flags & PMAP_CANFAIL) { 4096 error = ENOMEM; 4097 goto out; 4098 } 4099 panic("pmap_enter: get ptp failed"); 4100 } 4101 } 4102 4103 /* 4104 * update the pte. 4105 */ 4106 4107 ptep = &ptes[pl1_i(va)]; 4108 do { 4109 opte = *ptep; 4110 4111 /* 4112 * if the same page, inherit PG_U and PG_M. 4113 */ 4114 if (((opte ^ npte) & (PG_FRAME | PG_V)) == 0) { 4115 npte |= opte & (PG_U | PG_M); 4116 } 4117 #if defined(XEN) 4118 if (domid != DOMID_SELF) { 4119 /* pmap_pte_cas with error handling */ 4120 int s = splvm(); 4121 if (opte != *ptep) { 4122 splx(s); 4123 continue; 4124 } 4125 error = xpq_update_foreign( 4126 vtomach((vaddr_t)ptep), npte, domid); 4127 splx(s); 4128 if (error) { 4129 if (ptp != NULL && ptp->wire_count <= 1) { 4130 pmap_free_ptp(pmap, ptp, va, ptes, pdes); 4131 } 4132 pmap_unmap_ptes(pmap, pmap2); 4133 goto out; 4134 } 4135 break; 4136 } 4137 #endif /* defined(XEN) */ 4138 } while (pmap_pte_cas(ptep, opte, npte) != opte); 4139 4140 /* 4141 * update statistics and PTP's reference count. 4142 */ 4143 4144 pmap_stats_update_bypte(pmap, npte, opte); 4145 if (ptp != NULL && !pmap_valid_entry(opte)) { 4146 ptp->wire_count++; 4147 } 4148 KASSERT(ptp == NULL || ptp->wire_count > 1); 4149 4150 /* 4151 * if the same page, we can skip pv_entry handling. 4152 */ 4153 4154 if (((opte ^ npte) & (PG_FRAME | PG_V)) == 0) { 4155 KASSERT(((opte ^ npte) & PG_PVLIST) == 0); 4156 goto same_pa; 4157 } 4158 4159 /* 4160 * if old page is managed, remove pv_entry from its list. 
4161 */ 4162 4163 if ((~opte & (PG_V | PG_PVLIST)) == 0) { 4164 pg = PHYS_TO_VM_PAGE(pmap_pte2pa(opte)); 4165 #ifdef DIAGNOSTIC 4166 if (pg == NULL) 4167 panic("pmap_enter: PG_PVLIST mapping with " 4168 "unmanaged page " 4169 "pa = 0x%" PRIx64 " (0x%" PRIx64 ")", 4170 (int64_t)pa, (int64_t)atop(pa)); 4171 #endif 4172 old_pp = VM_PAGE_TO_PP(pg); 4173 4174 pp_lock(old_pp); 4175 old_pve = pmap_remove_pv(old_pp, ptp, va); 4176 old_pp->pp_attrs |= opte; 4177 pp_unlock(old_pp); 4178 } 4179 4180 /* 4181 * if new page is managed, insert pv_entry into its list. 4182 */ 4183 4184 if (new_pp) { 4185 pp_lock(new_pp); 4186 new_pve = pmap_enter_pv(new_pp, new_pve, &new_pve2, ptp, va); 4187 pp_unlock(new_pp); 4188 } 4189 4190 same_pa: 4191 pmap_unmap_ptes(pmap, pmap2); 4192 4193 /* 4194 * shootdown tlb if necessary. 4195 */ 4196 4197 if ((~opte & (PG_V | PG_U)) == 0 && 4198 ((opte ^ npte) & (PG_FRAME | PG_RW)) != 0) { 4199 pmap_tlb_shootdown(pmap, va, 0, opte); 4200 } 4201 4202 error = 0; 4203 out: 4204 kpreempt_enable(); 4205 out2: 4206 if (old_pve != NULL) { 4207 pool_cache_put(&pmap_pv_cache, old_pve); 4208 } 4209 if (new_pve != NULL) { 4210 pool_cache_put(&pmap_pv_cache, new_pve); 4211 } 4212 if (new_pve2 != NULL) { 4213 pool_cache_put(&pmap_pv_cache, new_pve2); 4214 } 4215 4216 return error; 4217 } 4218 4219 #ifdef XEN 4220 int 4221 pmap_enter(struct pmap *pmap, vaddr_t va, paddr_t pa, vm_prot_t prot, u_int flags) 4222 { 4223 paddr_t ma; 4224 4225 if (__predict_false(pa < pmap_pa_start || pmap_pa_end <= pa)) { 4226 ma = pa; /* XXX hack */ 4227 } else { 4228 ma = xpmap_ptom(pa); 4229 } 4230 4231 return pmap_enter_ma(pmap, va, ma, pa, prot, flags, DOMID_SELF); 4232 } 4233 #endif /* XEN */ 4234 4235 static bool 4236 pmap_get_physpage(vaddr_t va, int level, paddr_t *paddrp) 4237 { 4238 struct vm_page *ptp; 4239 struct pmap *kpm = pmap_kernel(); 4240 4241 if (uvm.page_init_done == false) { 4242 /* 4243 * we're growing the kernel pmap early (from 4244 * uvm_pageboot_alloc()). this case must be 4245 * handled a little differently. 4246 */ 4247 4248 if (uvm_page_physget(paddrp) == false) 4249 panic("pmap_get_physpage: out of memory"); 4250 kpreempt_disable(); 4251 pmap_pte_set(early_zero_pte, 4252 pmap_pa2pte(*paddrp) | PG_V | PG_RW | PG_k); 4253 pmap_pte_flush(); 4254 pmap_update_pg((vaddr_t)early_zerop); 4255 memset(early_zerop, 0, PAGE_SIZE); 4256 #if defined(DIAGNOSTIC) || defined (XEN) 4257 pmap_pte_set(early_zero_pte, 0); 4258 pmap_pte_flush(); 4259 #endif /* defined(DIAGNOSTIC) */ 4260 kpreempt_enable(); 4261 } else { 4262 /* XXX */ 4263 PMAP_SUBOBJ_LOCK(kpm, level - 1); 4264 ptp = uvm_pagealloc(&kpm->pm_obj[level - 1], 4265 ptp_va2o(va, level), NULL, 4266 UVM_PGA_USERESERVE|UVM_PGA_ZERO); 4267 PMAP_SUBOBJ_UNLOCK(kpm, level - 1); 4268 if (ptp == NULL) 4269 panic("pmap_get_physpage: out of memory"); 4270 ptp->flags &= ~PG_BUSY; 4271 ptp->wire_count = 1; 4272 *paddrp = VM_PAGE_TO_PHYS(ptp); 4273 } 4274 pmap_stats_update(kpm, 1, 0); 4275 return true; 4276 } 4277 4278 /* 4279 * Allocate the amount of specified ptps for a ptp level, and populate 4280 * all levels below accordingly, mapping virtual addresses starting at 4281 * kva. 4282 * 4283 * Used by pmap_growkernel. 
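 *
 * a hedged illustration of the call (pmap_growkernel() below is the
 * real caller); needed_ptps[] is indexed by level - 1 and holds how
 * many new PTPs each level needs for the requested growth:
 *
 *	for (i = PTP_LEVELS - 1; i >= 1; i--)
 *		needed_kptp[i] = target_nptp - nkptp[i];
 *	pmap_alloc_level(normal_pdes, pmap_maxkvaddr, PTP_LEVELS,
 *	    needed_kptp);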
4284 */ 4285 static void 4286 pmap_alloc_level(pd_entry_t * const *pdes, vaddr_t kva, int lvl, 4287 long *needed_ptps) 4288 { 4289 unsigned long i; 4290 vaddr_t va; 4291 paddr_t pa; 4292 unsigned long index, endindex; 4293 int level; 4294 pd_entry_t *pdep; 4295 #ifdef XEN 4296 int s = splvm(); /* protect xpq_* */ 4297 #endif 4298 4299 for (level = lvl; level > 1; level--) { 4300 if (level == PTP_LEVELS) 4301 pdep = pmap_kernel()->pm_pdir; 4302 else 4303 pdep = pdes[level - 2]; 4304 va = kva; 4305 index = pl_i_roundup(kva, level); 4306 endindex = index + needed_ptps[level - 1] - 1; 4307 4308 4309 for (i = index; i <= endindex; i++) { 4310 KASSERT(!pmap_valid_entry(pdep[i])); 4311 pmap_get_physpage(va, level - 1, &pa); 4312 #ifdef XEN 4313 xpq_queue_pte_update((level == PTP_LEVELS) ? 4314 xpmap_ptom(pmap_pdirpa(pmap_kernel(), i)) : 4315 xpmap_ptetomach(&pdep[i]), 4316 pmap_pa2pte(pa) | PG_k | PG_V | PG_RW); 4317 #ifdef PAE 4318 if (level == PTP_LEVELS && i > L2_SLOT_KERN) { 4319 /* update real kernel PD too */ 4320 xpq_queue_pte_update( 4321 xpmap_ptetomach(&pmap_kl2pd[l2tol2(i)]), 4322 pmap_pa2pte(pa) | PG_k | PG_V | PG_RW); 4323 } 4324 #endif 4325 #else /* XEN */ 4326 pdep[i] = pa | PG_RW | PG_V; 4327 #endif /* XEN */ 4328 KASSERT(level != PTP_LEVELS || nkptp[level - 1] + 4329 pl_i(VM_MIN_KERNEL_ADDRESS, level) == i); 4330 nkptp[level - 1]++; 4331 va += nbpd[level - 1]; 4332 } 4333 pmap_pte_flush(); 4334 } 4335 #ifdef XEN 4336 splx(s); 4337 #endif 4338 } 4339 4340 /* 4341 * pmap_growkernel: increase usage of KVM space 4342 * 4343 * => we allocate new PTPs for the kernel and install them in all 4344 * the pmaps on the system. 4345 */ 4346 4347 vaddr_t 4348 pmap_growkernel(vaddr_t maxkvaddr) 4349 { 4350 struct pmap *kpm = pmap_kernel(); 4351 #if !defined(XEN) || !defined(__x86_64__) 4352 struct pmap *pm; 4353 #endif 4354 int s, i; 4355 long needed_kptp[PTP_LEVELS], target_nptp, old; 4356 bool invalidate = false; 4357 4358 s = splvm(); /* to be safe */ 4359 mutex_enter(&kpm->pm_lock); 4360 4361 if (maxkvaddr <= pmap_maxkvaddr) { 4362 mutex_exit(&kpm->pm_lock); 4363 splx(s); 4364 return pmap_maxkvaddr; 4365 } 4366 4367 maxkvaddr = x86_round_pdr(maxkvaddr); 4368 old = nkptp[PTP_LEVELS - 1]; 4369 /* 4370 * This loop could be optimized more, but pmap_growkernel() 4371 * is called infrequently. 4372 */ 4373 for (i = PTP_LEVELS - 1; i >= 1; i--) { 4374 target_nptp = pl_i_roundup(maxkvaddr, i + 1) - 4375 pl_i_roundup(VM_MIN_KERNEL_ADDRESS, i + 1); 4376 /* 4377 * XXX only need to check toplevel. 4378 */ 4379 if (target_nptp > nkptpmax[i]) 4380 panic("out of KVA space"); 4381 KASSERT(target_nptp >= nkptp[i]); 4382 needed_kptp[i] = target_nptp - nkptp[i]; 4383 } 4384 4385 pmap_alloc_level(normal_pdes, pmap_maxkvaddr, PTP_LEVELS, needed_kptp); 4386 4387 /* 4388 * If the number of top level entries changed, update all 4389 * pmaps. 
4390 */ 4391 if (needed_kptp[PTP_LEVELS - 1] != 0) { 4392 #ifdef XEN 4393 #ifdef __x86_64__ 4394 /* nothing, kernel entries are never entered in user pmap */ 4395 #else /* __x86_64__ */ 4396 mutex_enter(&pmaps_lock); 4397 LIST_FOREACH(pm, &pmaps, pm_list) { 4398 int pdkidx; 4399 for (pdkidx = PDIR_SLOT_KERN + old; 4400 pdkidx < PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1]; 4401 pdkidx++) { 4402 xpq_queue_pte_update( 4403 xpmap_ptom(pmap_pdirpa(pm, pdkidx)), 4404 kpm->pm_pdir[pdkidx]); 4405 } 4406 xpq_flush_queue(); 4407 } 4408 mutex_exit(&pmaps_lock); 4409 #endif /* __x86_64__ */ 4410 #else /* XEN */ 4411 unsigned newpdes; 4412 newpdes = nkptp[PTP_LEVELS - 1] - old; 4413 mutex_enter(&pmaps_lock); 4414 LIST_FOREACH(pm, &pmaps, pm_list) { 4415 memcpy(&pm->pm_pdir[PDIR_SLOT_KERN + old], 4416 &kpm->pm_pdir[PDIR_SLOT_KERN + old], 4417 newpdes * sizeof (pd_entry_t)); 4418 } 4419 mutex_exit(&pmaps_lock); 4420 #endif 4421 invalidate = true; 4422 } 4423 pmap_maxkvaddr = maxkvaddr; 4424 mutex_exit(&kpm->pm_lock); 4425 splx(s); 4426 4427 if (invalidate) { 4428 /* Invalidate the PDP cache. */ 4429 pool_cache_invalidate(&pmap_pdp_cache); 4430 } 4431 4432 return maxkvaddr; 4433 } 4434 4435 #ifdef DEBUG 4436 void pmap_dump(struct pmap *, vaddr_t, vaddr_t); 4437 4438 /* 4439 * pmap_dump: dump all the mappings from a pmap 4440 * 4441 * => caller should not be holding any pmap locks 4442 */ 4443 4444 void 4445 pmap_dump(struct pmap *pmap, vaddr_t sva, vaddr_t eva) 4446 { 4447 pt_entry_t *ptes, *pte; 4448 pd_entry_t * const *pdes; 4449 struct pmap *pmap2; 4450 vaddr_t blkendva; 4451 4452 /* 4453 * if end is out of range truncate. 4454 * if (end == start) update to max. 4455 */ 4456 4457 if (eva > VM_MAXUSER_ADDRESS || eva <= sva) 4458 eva = VM_MAXUSER_ADDRESS; 4459 4460 /* 4461 * we lock in the pmap => pv_head direction 4462 */ 4463 4464 kpreempt_disable(); 4465 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); /* locks pmap */ 4466 4467 /* 4468 * dumping a range of pages: we dump in PTP sized blocks (4MB) 4469 */ 4470 4471 for (/* null */ ; sva < eva ; sva = blkendva) { 4472 4473 /* determine range of block */ 4474 blkendva = x86_round_pdr(sva+1); 4475 if (blkendva > eva) 4476 blkendva = eva; 4477 4478 /* valid block? 
*/ 4479 if (!pmap_pdes_valid(sva, pdes, NULL)) 4480 continue; 4481 4482 pte = &ptes[pl1_i(sva)]; 4483 for (/* null */; sva < blkendva ; sva += PAGE_SIZE, pte++) { 4484 if (!pmap_valid_entry(*pte)) 4485 continue; 4486 printf("va %#lx -> pa %#lx (pte=%#lx)\n", 4487 sva, (unsigned long)*pte, 4488 (unsigned long)pmap_pte2pa(*pte)); 4489 } 4490 } 4491 pmap_unmap_ptes(pmap, pmap2); 4492 kpreempt_enable(); 4493 } 4494 #endif 4495 4496 /* 4497 * pmap_tlb_shootdown: invalidate pages on all CPUs using pmap 'pm' 4498 * 4499 * => always invalidates locally before returning 4500 * => returns before remote CPUs have invalidated 4501 * => must be called with preemption disabled 4502 */ 4503 4504 void 4505 pmap_tlb_shootdown(struct pmap *pm, vaddr_t sva, vaddr_t eva, pt_entry_t pte) 4506 { 4507 #ifdef MULTIPROCESSOR 4508 extern bool x86_mp_online; 4509 struct cpu_info *ci; 4510 struct pmap_mbox *mb, *selfmb; 4511 CPU_INFO_ITERATOR cii; 4512 uintptr_t head; 4513 u_int count; 4514 int s; 4515 #endif /* MULTIPROCESSOR */ 4516 struct cpu_info *self; 4517 bool kernel; 4518 4519 KASSERT(eva == 0 || eva >= sva); 4520 KASSERT(kpreempt_disabled()); 4521 4522 if (pte & PG_PS) 4523 sva &= PG_LGFRAME; 4524 pte &= PG_G; 4525 self = curcpu(); 4526 4527 if (sva == (vaddr_t)-1LL) { 4528 kernel = true; 4529 } else { 4530 if (eva == 0) 4531 eva = sva + PAGE_SIZE; 4532 kernel = sva >= VM_MAXUSER_ADDRESS; 4533 KASSERT(kernel == (eva > VM_MAXUSER_ADDRESS)); 4534 } 4535 4536 /* 4537 * if tearing down the pmap, do nothing. we'll flush later 4538 * when we're ready to recycle/destroy it. 4539 */ 4540 if (__predict_false(curlwp->l_md.md_gc_pmap == pm)) { 4541 return; 4542 } 4543 4544 /* 4545 * If the range is larger than 32 pages, then invalidate 4546 * everything. 4547 */ 4548 if (sva != (vaddr_t)-1LL && eva - sva > (32 * PAGE_SIZE)) { 4549 sva = (vaddr_t)-1LL; 4550 eva = sva; 4551 } 4552 4553 #ifdef MULTIPROCESSOR 4554 if (ncpu > 1 && x86_mp_online) { 4555 selfmb = &self->ci_pmap_cpu->pc_mbox; 4556 4557 /* 4558 * If the CPUs have no notion of global pages then 4559 * reload of %cr3 is sufficient. 4560 */ 4561 if (pte != 0 && (cpu_feature & CPUID_PGE) == 0) 4562 pte = 0; 4563 4564 if (pm == pmap_kernel()) { 4565 /* 4566 * Mapped on all CPUs: use the broadcast mechanism. 4567 * Once we have the lock, increment the counter. 4568 */ 4569 s = splvm(); 4570 mb = &pmap_mbox; 4571 count = SPINLOCK_BACKOFF_MIN; 4572 do { 4573 if ((head = mb->mb_head) != mb->mb_tail) { 4574 splx(s); 4575 while ((head = mb->mb_head) != 4576 mb->mb_tail) 4577 SPINLOCK_BACKOFF(count); 4578 s = splvm(); 4579 } 4580 } while (atomic_cas_ulong( 4581 (volatile u_long *)&mb->mb_head, 4582 head, head + ncpu - 1) != head); 4583 4584 /* 4585 * Once underway we must stay at IPL_VM until the 4586 * IPI is dispatched. Otherwise interrupt handlers 4587 * on this CPU can deadlock against us. 4588 */ 4589 pmap_tlb_evcnt.ev_count++; 4590 mb->mb_pointer = self; 4591 mb->mb_addr1 = sva; 4592 mb->mb_addr2 = eva; 4593 mb->mb_global = pte; 4594 x86_ipi(LAPIC_TLB_BCAST_VECTOR, LAPIC_DEST_ALLEXCL, 4595 LAPIC_DLMODE_FIXED); 4596 self->ci_need_tlbwait = 1; 4597 splx(s); 4598 } else if ((pm->pm_cpus & ~self->ci_cpumask) != 0 || 4599 (kernel && (pm->pm_kernel_cpus & ~self->ci_cpumask) != 0)) { 4600 /* 4601 * We don't bother traversing the CPU list if only 4602 * used by this CPU. 4603 * 4604 * We can't do global flushes with the multicast 4605 * mechanism. 
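			 *
			 * Global (PG_G) mappings are present in every
			 * address space and must be flushed via the
			 * broadcast path above; the KASSERT below
			 * checks that no global bit was passed down
			 * here.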
4606 */ 4607 KASSERT(pte == 0); 4608 4609 /* 4610 * Take ownership of the shootdown mailbox on each 4611 * CPU, fill the details and fire it off. 4612 */ 4613 s = splvm(); 4614 for (CPU_INFO_FOREACH(cii, ci)) { 4615 if (ci == self || 4616 !pmap_is_active(pm, ci, kernel) || 4617 !(ci->ci_flags & CPUF_RUNNING)) 4618 continue; 4619 selfmb->mb_head++; 4620 mb = &ci->ci_pmap_cpu->pc_mbox; 4621 count = SPINLOCK_BACKOFF_MIN; 4622 while (atomic_cas_ulong( 4623 (u_long *)&mb->mb_pointer, 4624 0, (u_long)&selfmb->mb_tail) != 0) { 4625 splx(s); 4626 while (mb->mb_pointer != 0) 4627 SPINLOCK_BACKOFF(count); 4628 s = splvm(); 4629 } 4630 mb->mb_addr1 = sva; 4631 mb->mb_addr2 = eva; 4632 mb->mb_global = pte; 4633 if (x86_ipi(LAPIC_TLB_MCAST_VECTOR, 4634 ci->ci_cpuid, LAPIC_DLMODE_FIXED)) 4635 panic("pmap_tlb_shootdown: ipi failed"); 4636 } 4637 self->ci_need_tlbwait = 1; 4638 splx(s); 4639 } 4640 } 4641 #endif /* MULTIPROCESSOR */ 4642 4643 /* Update the current CPU before waiting for others. */ 4644 if (!pmap_is_active(pm, self, kernel)) 4645 return; 4646 4647 if (sva == (vaddr_t)-1LL) { 4648 u_int gen = uvm_emap_gen_return(); 4649 if (pte != 0) { 4650 tlbflushg(); 4651 } else { 4652 tlbflush(); 4653 } 4654 uvm_emap_update(gen); 4655 } else { 4656 do { 4657 pmap_update_pg(sva); 4658 sva += PAGE_SIZE; 4659 } while (sva < eva); 4660 } 4661 } 4662 4663 /* 4664 * pmap_tlb_shootwait: wait for pending TLB shootdowns to complete 4665 * 4666 * => only waits for operations generated by the current CPU 4667 * => must be called with preemption disabled 4668 */ 4669 4670 void 4671 pmap_tlb_shootwait(void) 4672 { 4673 struct cpu_info *self; 4674 struct pmap_mbox *mb; 4675 4676 KASSERT(kpreempt_disabled()); 4677 4678 /* 4679 * Anything to do? XXX Really we want to avoid touching the cache 4680 * lines of the two mailboxes, but the processor may read ahead. 4681 */ 4682 self = curcpu(); 4683 if (!self->ci_need_tlbwait) 4684 return; 4685 self->ci_need_tlbwait = 0; 4686 4687 /* If we own the global mailbox, wait for it to drain. */ 4688 mb = &pmap_mbox; 4689 while (mb->mb_pointer == self && mb->mb_head != mb->mb_tail) 4690 x86_pause(); 4691 4692 /* If we own other CPU's mailboxes, wait for them to drain. */ 4693 mb = &self->ci_pmap_cpu->pc_mbox; 4694 KASSERT(mb->mb_pointer != &mb->mb_tail); 4695 while (mb->mb_head != mb->mb_tail) 4696 x86_pause(); 4697 } 4698 4699 /* 4700 * pmap_update: process deferred invalidations 4701 */ 4702 4703 void 4704 pmap_update(struct pmap *pmap) 4705 { 4706 struct vm_page *ptp, *empty_ptps; 4707 struct pmap_page *pp; 4708 lwp_t *l; 4709 4710 /* 4711 * if we have torn down this pmap, invalidate non-global TLB 4712 * entries on any processors using it. 4713 */ 4714 l = curlwp; 4715 if (__predict_false(l->l_md.md_gc_pmap == pmap)) { 4716 l->l_md.md_gc_pmap = NULL; 4717 KPREEMPT_DISABLE(l); 4718 pmap_tlb_shootdown(pmap, -1, -1, 0); 4719 KPREEMPT_ENABLE(l); 4720 } 4721 4722 /* 4723 * wait for tlb shootdowns to complete before returning control 4724 * to the caller. 4725 */ 4726 kpreempt_disable(); 4727 pmap_tlb_shootwait(); 4728 kpreempt_enable(); 4729 4730 /* 4731 * now that shootdowns are complete, process deferred frees, 4732 * but not from interrupt context. 
4733 */ 4734 if (l->l_md.md_gc_ptp != NULL) { 4735 if (cpu_intr_p() || (l->l_pflag & LP_INTR) != 0) { 4736 return; 4737 } 4738 4739 empty_ptps = l->l_md.md_gc_ptp; 4740 l->l_md.md_gc_ptp = NULL; 4741 4742 while ((ptp = empty_ptps) != NULL) { 4743 ptp->flags |= PG_ZERO; 4744 pp = VM_PAGE_TO_PP(ptp); 4745 empty_ptps = pp->pp_link; 4746 LIST_INIT(&pp->pp_head.pvh_list); 4747 uvm_pagefree(ptp); 4748 } 4749 } 4750 } 4751 4752 #if PTP_LEVELS > 4 4753 #error "Unsupported number of page table mappings" 4754 #endif 4755 4756 paddr_t 4757 pmap_init_tmp_pgtbl(paddr_t pg) 4758 { 4759 static bool maps_loaded; 4760 static const paddr_t x86_tmp_pml_paddr[] = { 4761 4 * PAGE_SIZE, 4762 5 * PAGE_SIZE, 4763 6 * PAGE_SIZE, 4764 7 * PAGE_SIZE 4765 }; 4766 static vaddr_t x86_tmp_pml_vaddr[] = { 0, 0, 0, 0 }; 4767 4768 pd_entry_t *tmp_pml, *kernel_pml; 4769 4770 int level; 4771 4772 if (!maps_loaded) { 4773 for (level = 0; level < PTP_LEVELS; ++level) { 4774 x86_tmp_pml_vaddr[level] = 4775 uvm_km_alloc(kernel_map, PAGE_SIZE, 0, 4776 UVM_KMF_VAONLY); 4777 4778 if (x86_tmp_pml_vaddr[level] == 0) 4779 panic("mapping of real mode PML failed\n"); 4780 pmap_kenter_pa(x86_tmp_pml_vaddr[level], 4781 x86_tmp_pml_paddr[level], 4782 VM_PROT_READ | VM_PROT_WRITE, 0); 4783 pmap_update(pmap_kernel()); 4784 } 4785 maps_loaded = true; 4786 } 4787 4788 /* Zero levels 1-3 */ 4789 for (level = 0; level < PTP_LEVELS - 1; ++level) { 4790 tmp_pml = (void *)x86_tmp_pml_vaddr[level]; 4791 memset(tmp_pml, 0, PAGE_SIZE); 4792 } 4793 4794 /* Copy PML4 */ 4795 kernel_pml = pmap_kernel()->pm_pdir; 4796 tmp_pml = (void *)x86_tmp_pml_vaddr[PTP_LEVELS - 1]; 4797 memcpy(tmp_pml, kernel_pml, PAGE_SIZE); 4798 4799 for (level = PTP_LEVELS - 1; level > 0; --level) { 4800 tmp_pml = (void *)x86_tmp_pml_vaddr[level]; 4801 4802 tmp_pml[pl_i(pg, level + 1)] = 4803 (x86_tmp_pml_paddr[level - 1] & PG_FRAME) | PG_RW | PG_V; 4804 } 4805 4806 tmp_pml = (void *)x86_tmp_pml_vaddr[0]; 4807 tmp_pml[pl_i(pg, 1)] = (pg & PG_FRAME) | PG_RW | PG_V; 4808 4809 return x86_tmp_pml_paddr[PTP_LEVELS - 1]; 4810 } 4811