1 /* $NetBSD: pmap.c,v 1.99 2010/01/10 12:10:23 jym Exp $ */ 2 3 /* 4 * Copyright (c) 2007 Manuel Bouyer. 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions 8 * are met: 9 * 1. Redistributions of source code must retain the above copyright 10 * notice, this list of conditions and the following disclaimer. 11 * 2. Redistributions in binary form must reproduce the above copyright 12 * notice, this list of conditions and the following disclaimer in the 13 * documentation and/or other materials provided with the distribution. 14 * 15 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 16 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 17 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 18 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 19 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 20 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 21 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 22 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 24 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 * 26 */ 27 28 /* 29 * Copyright (c) 2006 Mathieu Ropert <mro@adviseo.fr> 30 * 31 * Permission to use, copy, modify, and distribute this software for any 32 * purpose with or without fee is hereby granted, provided that the above 33 * copyright notice and this permission notice appear in all copies. 34 * 35 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 36 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 37 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 38 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 39 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 40 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 41 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 42 */ 43 44 /* 45 * 46 * Copyright (c) 1997 Charles D. Cranor and Washington University. 47 * All rights reserved. 48 * 49 * Redistribution and use in source and binary forms, with or without 50 * modification, are permitted provided that the following conditions 51 * are met: 52 * 1. Redistributions of source code must retain the above copyright 53 * notice, this list of conditions and the following disclaimer. 54 * 2. Redistributions in binary form must reproduce the above copyright 55 * notice, this list of conditions and the following disclaimer in the 56 * documentation and/or other materials provided with the distribution. 57 * 3. All advertising materials mentioning features or use of this software 58 * must display the following acknowledgement: 59 * This product includes software developed by Charles D. Cranor and 60 * Washington University. 61 * 4. The name of the author may not be used to endorse or promote products 62 * derived from this software without specific prior written permission. 63 * 64 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 65 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 66 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
67 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 68 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 69 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 70 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 71 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 72 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 73 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 74 */ 75 76 /* 77 * Copyright 2001 (c) Wasabi Systems, Inc. 78 * All rights reserved. 79 * 80 * Written by Frank van der Linden for Wasabi Systems, Inc. 81 * 82 * Redistribution and use in source and binary forms, with or without 83 * modification, are permitted provided that the following conditions 84 * are met: 85 * 1. Redistributions of source code must retain the above copyright 86 * notice, this list of conditions and the following disclaimer. 87 * 2. Redistributions in binary form must reproduce the above copyright 88 * notice, this list of conditions and the following disclaimer in the 89 * documentation and/or other materials provided with the distribution. 90 * 3. All advertising materials mentioning features or use of this software 91 * must display the following acknowledgement: 92 * This product includes software developed for the NetBSD Project by 93 * Wasabi Systems, Inc. 94 * 4. The name of Wasabi Systems, Inc. may not be used to endorse 95 * or promote products derived from this software without specific prior 96 * written permission. 97 * 98 * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND 99 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 100 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 101 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL WASABI SYSTEMS, INC 102 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 103 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 104 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 105 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 106 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 107 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 108 * POSSIBILITY OF SUCH DAMAGE. 109 */ 110 111 /* 112 * This is the i386 pmap modified and generalized to support x86-64 113 * as well. The idea is to hide the upper N levels of the page tables 114 * inside pmap_get_ptp, pmap_free_ptp and pmap_growkernel. The rest 115 * is mostly untouched, except that it uses some more generalized 116 * macros and interfaces. 117 * 118 * This pmap has been tested on the i386 as well, and it can be easily 119 * adapted to PAE. 120 * 121 * fvdl@wasabisystems.com 18-Jun-2001 122 */ 123 124 /* 125 * pmap.c: i386 pmap module rewrite 126 * Chuck Cranor <chuck@ccrc.wustl.edu> 127 * 11-Aug-97 128 * 129 * history of this pmap module: in addition to my own input, i used 130 * the following references for this rewrite of the i386 pmap: 131 * 132 * [1] the NetBSD i386 pmap. this pmap appears to be based on the 133 * BSD hp300 pmap done by Mike Hibler at University of Utah. 134 * it was then ported to the i386 by William Jolitz of UUNET 135 * Technologies, Inc. Then Charles M. Hannum of the NetBSD 136 * project fixed some bugs and provided some speed ups. 137 * 138 * [2] the FreeBSD i386 pmap. 
this pmap seems to be the 139 * Hibler/Jolitz pmap, as modified for FreeBSD by John S. Dyson 140 * and David Greenman. 141 * 142 * [3] the Mach pmap. this pmap, from CMU, seems to have migrated 143 * between several processors. the VAX version was done by 144 * Avadis Tevanian, Jr., and Michael Wayne Young. the i386 145 * version was done by Lance Berc, Mike Kupfer, Bob Baron, 146 * David Golub, and Richard Draves. the alpha version was 147 * done by Alessandro Forin (CMU/Mach) and Chris Demetriou 148 * (NetBSD/alpha). 149 */ 150 151 #include <sys/cdefs.h> 152 __KERNEL_RCSID(0, "$NetBSD: pmap.c,v 1.99 2010/01/10 12:10:23 jym Exp $"); 153 154 #include "opt_user_ldt.h" 155 #include "opt_lockdebug.h" 156 #include "opt_multiprocessor.h" 157 #include "opt_xen.h" 158 #if !defined(__x86_64__) 159 #include "opt_kstack_dr0.h" 160 #endif /* !defined(__x86_64__) */ 161 162 #include <sys/param.h> 163 #include <sys/systm.h> 164 #include <sys/proc.h> 165 #include <sys/pool.h> 166 #include <sys/kernel.h> 167 #include <sys/atomic.h> 168 #include <sys/cpu.h> 169 #include <sys/intr.h> 170 #include <sys/xcall.h> 171 172 #include <uvm/uvm.h> 173 174 #include <dev/isa/isareg.h> 175 176 #include <machine/specialreg.h> 177 #include <machine/gdt.h> 178 #include <machine/isa_machdep.h> 179 #include <machine/cpuvar.h> 180 181 #include <x86/pmap.h> 182 #include <x86/pmap_pv.h> 183 184 #include <x86/i82489reg.h> 185 #include <x86/i82489var.h> 186 187 #ifdef XEN 188 #include <xen/xen3-public/xen.h> 189 #include <xen/hypervisor.h> 190 #endif 191 192 /* flag to be used for kernel mappings: PG_u on Xen/amd64, 0 otherwise */ 193 #if defined(XEN) && defined(__x86_64__) 194 #define PG_k PG_u 195 #else 196 #define PG_k 0 197 #endif 198 199 /* 200 * general info: 201 * 202 * - for an explanation of how the i386 MMU hardware works see 203 * the comments in <machine/pte.h>. 204 * 205 * - for an explanation of the general memory structure used by 206 * this pmap (including the recursive mapping), see the comments 207 * in <machine/pmap.h>. 208 * 209 * this file contains the code for the "pmap module." the module's 210 * job is to manage the hardware's virtual to physical address mappings. 211 * note that there are two levels of mapping in the VM system: 212 * 213 * [1] the upper layer of the VM system uses vm_map's and vm_map_entry's 214 * to map ranges of virtual address space to objects/files. for 215 * example, the vm_map may say: "map VA 0x1000 to 0x22000 read-only 216 * to the file /bin/ls starting at offset zero." note that 217 * the upper layer mapping is not concerned with how individual 218 * vm_pages are mapped. 219 * 220 * [2] the lower layer of the VM system (the pmap) maintains the mappings 221 * from virtual addresses. it is concerned with which vm_page is 222 * mapped where. for example, when you run /bin/ls and start 223 * at page 0x1000 the fault routine may lookup the correct page 224 * of the /bin/ls file and then ask the pmap layer to establish 225 * a mapping for it. 226 * 227 * note that information in the lower layer of the VM system can be 228 * thrown away since it can easily be reconstructed from the info 229 * in the upper layer. 230 * 231 * data structures we use include: 232 * 233 * - struct pmap: describes the address space of one thread 234 * - struct pv_entry: describes one <PMAP,VA> mapping of a PA 235 * - struct pv_head: there is one pv_head per managed page of 236 * physical memory. 
the pv_head points to a list of pv_entry 237 * structures which describe all the <PMAP,VA> pairs that this 238 * page is mapped in. this is critical for page based operations 239 * such as pmap_page_protect() [change protection on _all_ mappings 240 * of a page] 241 */ 242 243 /* 244 * memory allocation 245 * 246 * - there are three data structures that we must dynamically allocate: 247 * 248 * [A] new process' page directory page (PDP) 249 * - plan 1: done at pmap_create() we use 250 * uvm_km_alloc(kernel_map, PAGE_SIZE) [fka kmem_alloc] to do this 251 * allocation. 252 * 253 * if we are low in free physical memory then we sleep in 254 * uvm_km_alloc -- in this case this is ok since we are creating 255 * a new pmap and should not be holding any locks. 256 * 257 * if the kernel is totally out of virtual space 258 * (i.e. uvm_km_alloc returns NULL), then we panic. 259 * 260 * [B] new page tables pages (PTP) 261 * - call uvm_pagealloc() 262 * => success: zero page, add to pm_pdir 263 * => failure: we are out of free vm_pages, let pmap_enter() 264 * tell UVM about it. 265 * 266 * note: for kernel PTPs, we start with NKPTP of them. as we map 267 * kernel memory (at uvm_map time) we check to see if we've grown 268 * the kernel pmap. if so, we call the optional function 269 * pmap_growkernel() to grow the kernel PTPs in advance. 270 * 271 * [C] pv_entry structures 272 */ 273 274 /* 275 * locking 276 * 277 * we have the following locks that we must contend with: 278 * 279 * mutexes: 280 * 281 * - pmap lock (per pmap, part of uvm_object) 282 * this lock protects the fields in the pmap structure including 283 * the non-kernel PDEs in the PDP, and the PTEs. it also locks 284 * in the alternate PTE space (since that is determined by the 285 * entry in the PDP). 286 * 287 * - pvh_lock (per pv_head) 288 * this lock protects the pv_entry list which is chained off the 289 * pv_head structure for a specific managed PA. it is locked 290 * when traversing the list (e.g. adding/removing mappings, 291 * syncing R/M bits, etc.) 292 * 293 * - pmaps_lock 294 * this lock protects the list of active pmaps (headed by "pmaps"). 295 * we lock it when adding or removing pmaps from this list. 296 * 297 * tlb shootdown 298 * 299 * tlb shootdowns are hard interrupts that operate outside the spl 300 * framework: they don't need to be blocked provided that the pmap module 301 * gets the order of events correct. the calls are made by talking directly 302 * to the lapic. the stubs to handle the interrupts are quite short and do 303 * one of the following: invalidate a single page, a range of pages, all 304 * user tlb entries or the entire tlb. 305 * 306 * the cpus synchronize with each other using pmap_mbox structures which are 307 * aligned on 64-byte cache lines. tlb shootdowns against the kernel pmap 308 * use a global mailbox and are generated using a broadcast ipi (broadcast 309 * to all but the sending cpu). shootdowns against regular pmaps use 310 * per-cpu mailboxes and are multicast. kernel and user shootdowns can 311 * execute simultaneously, as can shootdowns within different multithreaded 312 * processes. TODO: 313 * 314 * 1. figure out which waitpoints can be deferered to pmap_update(). 315 * 2. see if there is a cheap way to batch some updates. 
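 *
 * as a rough, illustrative sketch (not a literal copy of any one
 * function below), code that edits a non-kernel pmap's PTEs follows
 * this pattern:
 *
 *	kpreempt_disable();
 *	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);	(lock + map PTEs)
 *	... read/modify PTEs through ptes/pdes ...
 *	pmap_tlb_shootdown(pmap, va, 0, opte);		(queue invalidation)
 *	pmap_unmap_ptes(pmap, pmap2);			(unlock)
 *	kpreempt_enable();
 *
 * with pmap_tlb_shootwait() used where the caller must be sure the
 * other cpus have completed the invalidations.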
316 */ 317 318 const vaddr_t ptp_masks[] = PTP_MASK_INITIALIZER; 319 const int ptp_shifts[] = PTP_SHIFT_INITIALIZER; 320 const long nkptpmax[] = NKPTPMAX_INITIALIZER; 321 const long nbpd[] = NBPD_INITIALIZER; 322 pd_entry_t * const normal_pdes[] = PDES_INITIALIZER; 323 pd_entry_t * const alternate_pdes[] = APDES_INITIALIZER; 324 325 long nkptp[] = NKPTP_INITIALIZER; 326 327 static kmutex_t pmaps_lock; 328 329 static vaddr_t pmap_maxkvaddr; 330 331 #define COUNT(x) /* nothing */ 332 333 /* 334 * XXX kludge: dummy locking to make KASSERTs in uvm_page.c comfortable. 335 * actual locking is done by pm_lock. 336 */ 337 #if defined(DIAGNOSTIC) 338 #define PMAP_SUBOBJ_LOCK(pm, idx) \ 339 KASSERT(mutex_owned(&(pm)->pm_lock)); \ 340 if ((idx) != 0) \ 341 mutex_enter(&(pm)->pm_obj[(idx)].vmobjlock) 342 #define PMAP_SUBOBJ_UNLOCK(pm, idx) \ 343 KASSERT(mutex_owned(&(pm)->pm_lock)); \ 344 if ((idx) != 0) \ 345 mutex_exit(&(pm)->pm_obj[(idx)].vmobjlock) 346 #else /* defined(DIAGNOSTIC) */ 347 #define PMAP_SUBOBJ_LOCK(pm, idx) /* nothing */ 348 #define PMAP_SUBOBJ_UNLOCK(pm, idx) /* nothing */ 349 #endif /* defined(DIAGNOSTIC) */ 350 351 /* 352 * Misc. event counters. 353 */ 354 struct evcnt pmap_iobmp_evcnt; 355 struct evcnt pmap_ldt_evcnt; 356 357 /* 358 * Global TLB shootdown mailbox. 359 */ 360 struct evcnt pmap_tlb_evcnt __aligned(64); 361 struct pmap_mbox pmap_mbox __aligned(64); 362 363 /* 364 * Per-CPU data. The pmap mailbox is cache intensive so gets its 365 * own line. Note that the mailbox must be the first item. 366 */ 367 struct pmap_cpu { 368 /* TLB shootdown */ 369 struct pmap_mbox pc_mbox; 370 }; 371 372 union { 373 struct pmap_cpu pc; 374 uint8_t padding[64]; 375 } pmap_cpu[MAXCPUS] __aligned(64); 376 377 /* 378 * global data structures 379 */ 380 381 static struct pmap kernel_pmap_store; /* the kernel's pmap (proc0) */ 382 struct pmap *const kernel_pmap_ptr = &kernel_pmap_store; 383 384 /* 385 * pmap_pg_g: if our processor supports PG_G in the PTE then we 386 * set pmap_pg_g to PG_G (otherwise it is zero). 387 */ 388 389 int pmap_pg_g = 0; 390 391 /* 392 * pmap_largepages: if our processor supports PG_PS and we are 393 * using it, this is set to true. 394 */ 395 396 int pmap_largepages; 397 398 /* 399 * i386 physical memory comes in a big contig chunk with a small 400 * hole toward the front of it... the following two paddr_t's 401 * (shared with machdep.c) describe the physical address space 402 * of this machine. 
403 */ 404 paddr_t avail_start; /* PA of first available physical page */ 405 paddr_t avail_end; /* PA of last available physical page */ 406 407 #ifdef XEN 408 #ifdef __x86_64__ 409 /* Dummy PGD for user cr3, used between pmap_deacivate() and pmap_activate() */ 410 static paddr_t xen_dummy_user_pgd; 411 /* Currently active user PGD (can't use rcr3()) */ 412 static paddr_t xen_current_user_pgd = 0; 413 #endif /* __x86_64__ */ 414 paddr_t pmap_pa_start; /* PA of first physical page for this domain */ 415 paddr_t pmap_pa_end; /* PA of last physical page for this domain */ 416 #endif /* XEN */ 417 418 #define VM_PAGE_TO_PP(pg) (&(pg)->mdpage.mp_pp) 419 420 #define pp_lock(pp) mutex_spin_enter(&(pp)->pp_lock) 421 #define pp_unlock(pp) mutex_spin_exit(&(pp)->pp_lock) 422 #define pp_locked(pp) mutex_owned(&(pp)->pp_lock) 423 424 #define PV_HASH_SIZE 32768 425 #define PV_HASH_LOCK_CNT 32 426 427 struct pv_hash_lock { 428 kmutex_t lock; 429 } __aligned(CACHE_LINE_SIZE) pv_hash_locks[PV_HASH_LOCK_CNT] 430 __aligned(CACHE_LINE_SIZE); 431 432 struct pv_hash_head { 433 SLIST_HEAD(, pv_entry) hh_list; 434 } pv_hash_heads[PV_HASH_SIZE]; 435 436 static u_int 437 pvhash_hash(struct vm_page *ptp, vaddr_t va) 438 { 439 440 return (uintptr_t)ptp / sizeof(*ptp) + (va >> PAGE_SHIFT); 441 } 442 443 static struct pv_hash_head * 444 pvhash_head(u_int hash) 445 { 446 447 return &pv_hash_heads[hash % PV_HASH_SIZE]; 448 } 449 450 static kmutex_t * 451 pvhash_lock(u_int hash) 452 { 453 454 return &pv_hash_locks[hash % PV_HASH_LOCK_CNT].lock; 455 } 456 457 static struct pv_entry * 458 pvhash_remove(struct pv_hash_head *hh, struct vm_page *ptp, vaddr_t va) 459 { 460 struct pv_entry *pve; 461 struct pv_entry *prev; 462 463 prev = NULL; 464 SLIST_FOREACH(pve, &hh->hh_list, pve_hash) { 465 if (pve->pve_pte.pte_ptp == ptp && 466 pve->pve_pte.pte_va == va) { 467 if (prev != NULL) { 468 SLIST_REMOVE_AFTER(prev, pve_hash); 469 } else { 470 SLIST_REMOVE_HEAD(&hh->hh_list, pve_hash); 471 } 472 break; 473 } 474 prev = pve; 475 } 476 return pve; 477 } 478 479 /* 480 * other data structures 481 */ 482 483 static pt_entry_t protection_codes[8]; /* maps MI prot to i386 prot code */ 484 static bool pmap_initialized = false; /* pmap_init done yet? */ 485 486 /* 487 * the following two vaddr_t's are used during system startup 488 * to keep track of how much of the kernel's VM space we have used. 489 * once the system is started, the management of the remaining kernel 490 * VM space is turned over to the kernel_map vm_map. 491 */ 492 493 static vaddr_t virtual_avail; /* VA of first free KVA */ 494 static vaddr_t virtual_end; /* VA of last free KVA */ 495 496 /* 497 * linked list of all non-kernel pmaps 498 */ 499 500 static struct pmap_head pmaps; 501 502 /* 503 * pool that pmap structures are allocated from 504 */ 505 506 static struct pool_cache pmap_cache; 507 508 /* 509 * pv_entry cache 510 */ 511 512 static struct pool_cache pmap_pv_cache; 513 514 /* 515 * MULTIPROCESSOR: special VA's/ PTE's are actually allocated inside a 516 * maxcpus*NPTECL array of PTE's, to avoid cache line thrashing 517 * due to false sharing. 
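 * concretely (illustrative arithmetic only): cpu N gets the PTEs
 * PTESLEW(pte, N) == pte + N*NPTECL and the VAs VASLEW(va, N) ==
 * va + N*NPTECL*PAGE_SIZE, so each cpu's temporary mappings live in
 * their own cache line and no two cpus write the same line.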
518 */ 519 520 #ifdef MULTIPROCESSOR 521 #define PTESLEW(pte, id) ((pte)+(id)*NPTECL) 522 #define VASLEW(va,id) ((va)+(id)*NPTECL*PAGE_SIZE) 523 #else 524 #define PTESLEW(pte, id) (pte) 525 #define VASLEW(va,id) (va) 526 #endif 527 528 /* 529 * special VAs and the PTEs that map them 530 */ 531 static pt_entry_t *csrc_pte, *cdst_pte, *zero_pte, *ptp_pte, *early_zero_pte; 532 static char *csrcp, *cdstp, *zerop, *ptpp, *early_zerop; 533 534 /* 535 * pool and cache that PDPs are allocated from 536 */ 537 538 static struct pool_cache pmap_pdp_cache; 539 int pmap_pdp_ctor(void *, void *, int); 540 void pmap_pdp_dtor(void *, void *); 541 #ifdef PAE 542 /* need to allocate items of 4 pages */ 543 void *pmap_pdp_alloc(struct pool *, int); 544 void pmap_pdp_free(struct pool *, void *); 545 static struct pool_allocator pmap_pdp_allocator = { 546 .pa_alloc = pmap_pdp_alloc, 547 .pa_free = pmap_pdp_free, 548 .pa_pagesz = PAGE_SIZE * PDP_SIZE, 549 }; 550 #endif /* PAE */ 551 552 void *vmmap; /* XXX: used by mem.c... it should really uvm_map_reserve it */ 553 554 extern vaddr_t idt_vaddr; /* we allocate IDT early */ 555 extern paddr_t idt_paddr; 556 557 #ifdef _LP64 558 extern vaddr_t lo32_vaddr; 559 extern vaddr_t lo32_paddr; 560 #endif 561 562 extern int end; 563 564 #ifdef i386 565 /* stuff to fix the pentium f00f bug */ 566 extern vaddr_t pentium_idt_vaddr; 567 #endif 568 569 570 /* 571 * local prototypes 572 */ 573 574 static struct vm_page *pmap_get_ptp(struct pmap *, vaddr_t, 575 pd_entry_t * const *); 576 static struct vm_page *pmap_find_ptp(struct pmap *, vaddr_t, paddr_t, int); 577 static void pmap_freepage(struct pmap *, struct vm_page *, int); 578 static void pmap_free_ptp(struct pmap *, struct vm_page *, 579 vaddr_t, pt_entry_t *, 580 pd_entry_t * const *); 581 static bool pmap_is_curpmap(struct pmap *); 582 static bool pmap_is_active(struct pmap *, struct cpu_info *, bool); 583 static void pmap_map_ptes(struct pmap *, struct pmap **, 584 pt_entry_t **, pd_entry_t * const **); 585 static bool pmap_remove_pte(struct pmap *, struct vm_page *, 586 pt_entry_t *, vaddr_t, 587 struct pv_entry **); 588 static pt_entry_t pmap_remove_ptes(struct pmap *, struct vm_page *, 589 vaddr_t, vaddr_t, vaddr_t, 590 struct pv_entry **); 591 592 static void pmap_unmap_ptes(struct pmap *, struct pmap *); 593 static bool pmap_get_physpage(vaddr_t, int, paddr_t *); 594 static int pmap_pdes_invalid(vaddr_t, pd_entry_t * const *, 595 pd_entry_t *); 596 #define pmap_pdes_valid(va, pdes, lastpde) \ 597 (pmap_pdes_invalid((va), (pdes), (lastpde)) == 0) 598 static void pmap_alloc_level(pd_entry_t * const *, vaddr_t, int, 599 long *); 600 601 static bool pmap_reactivate(struct pmap *); 602 603 /* 604 * p m a p h e l p e r f u n c t i o n s 605 */ 606 607 static inline void 608 pmap_stats_update(struct pmap *pmap, int resid_diff, int wired_diff) 609 { 610 611 if (pmap == pmap_kernel()) { 612 atomic_add_long(&pmap->pm_stats.resident_count, resid_diff); 613 atomic_add_long(&pmap->pm_stats.wired_count, wired_diff); 614 } else { 615 KASSERT(mutex_owned(&pmap->pm_lock)); 616 pmap->pm_stats.resident_count += resid_diff; 617 pmap->pm_stats.wired_count += wired_diff; 618 } 619 } 620 621 static inline void 622 pmap_stats_update_bypte(struct pmap *pmap, pt_entry_t npte, pt_entry_t opte) 623 { 624 int resid_diff = ((npte & PG_V) ? 1 : 0) - ((opte & PG_V) ? 1 : 0); 625 int wired_diff = ((npte & PG_W) ? 1 : 0) - ((opte & PG_W) ? 
1 : 0); 626 627 KASSERT((npte & (PG_V | PG_W)) != PG_W); 628 KASSERT((opte & (PG_V | PG_W)) != PG_W); 629 630 pmap_stats_update(pmap, resid_diff, wired_diff); 631 } 632 633 /* 634 * ptp_to_pmap: lookup pmap by ptp 635 */ 636 637 static struct pmap * 638 ptp_to_pmap(struct vm_page *ptp) 639 { 640 struct pmap *pmap; 641 642 if (ptp == NULL) { 643 return pmap_kernel(); 644 } 645 pmap = (struct pmap *)ptp->uobject; 646 KASSERT(pmap != NULL); 647 KASSERT(&pmap->pm_obj[0] == ptp->uobject); 648 return pmap; 649 } 650 651 static inline struct pv_pte * 652 pve_to_pvpte(struct pv_entry *pve) 653 { 654 655 KASSERT((void *)&pve->pve_pte == (void *)pve); 656 return &pve->pve_pte; 657 } 658 659 static inline struct pv_entry * 660 pvpte_to_pve(struct pv_pte *pvpte) 661 { 662 struct pv_entry *pve = (void *)pvpte; 663 664 KASSERT(pve_to_pvpte(pve) == pvpte); 665 return pve; 666 } 667 668 /* 669 * pv_pte_first, pv_pte_next: PV list iterator. 670 */ 671 672 static struct pv_pte * 673 pv_pte_first(struct pmap_page *pp) 674 { 675 676 KASSERT(pp_locked(pp)); 677 if ((pp->pp_flags & PP_EMBEDDED) != 0) { 678 return &pp->pp_pte; 679 } 680 return pve_to_pvpte(LIST_FIRST(&pp->pp_head.pvh_list)); 681 } 682 683 static struct pv_pte * 684 pv_pte_next(struct pmap_page *pp, struct pv_pte *pvpte) 685 { 686 687 KASSERT(pvpte != NULL); 688 KASSERT(pp_locked(pp)); 689 if (pvpte == &pp->pp_pte) { 690 KASSERT((pp->pp_flags & PP_EMBEDDED) != 0); 691 return NULL; 692 } 693 KASSERT((pp->pp_flags & PP_EMBEDDED) == 0); 694 return pve_to_pvpte(LIST_NEXT(pvpte_to_pve(pvpte), pve_list)); 695 } 696 697 /* 698 * pmap_is_curpmap: is this pmap the one currently loaded [in %cr3]? 699 * of course the kernel is always loaded 700 */ 701 702 inline static bool 703 pmap_is_curpmap(struct pmap *pmap) 704 { 705 #if defined(XEN) && defined(__x86_64__) 706 /* 707 * Only kernel pmap is physically loaded. 708 * User PGD may be active, but TLB will be flushed 709 * with HYPERVISOR_iret anyway, so let's say no 710 */ 711 return(pmap == pmap_kernel()); 712 #else /* XEN && __x86_64__*/ 713 return((pmap == pmap_kernel()) || 714 (pmap == curcpu()->ci_pmap)); 715 #endif 716 } 717 718 /* 719 * pmap_is_active: is this pmap loaded into the specified processor's %cr3? 720 */ 721 722 inline static bool 723 pmap_is_active(struct pmap *pmap, struct cpu_info *ci, bool kernel) 724 { 725 726 return (pmap == pmap_kernel() || 727 (pmap->pm_cpus & ci->ci_cpumask) != 0 || 728 (kernel && (pmap->pm_kernel_cpus & ci->ci_cpumask) != 0)); 729 } 730 731 static void 732 pmap_apte_flush(struct pmap *pmap) 733 { 734 735 KASSERT(kpreempt_disabled()); 736 737 /* 738 * Flush the APTE mapping from all other CPUs that 739 * are using the pmap we are using (who's APTE space 740 * is the one we've just modified). 741 * 742 * XXXthorpej -- find a way to defer the IPI. 743 */ 744 pmap_tlb_shootdown(pmap, (vaddr_t)-1LL, 0, 0); 745 pmap_tlb_shootwait(); 746 } 747 748 /* 749 * Add a reference to the specified pmap. 
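 * the reference is dropped with pmap_destroy().  for example,
 * pmap_map_ptes() below takes a reference on the pmap whose alternate
 * PTE space it borrows and pmap_unmap_ptes() releases it again via
 * pmap_destroy().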
750 */ 751 752 inline void 753 pmap_reference(struct pmap *pmap) 754 { 755 756 atomic_inc_uint(&pmap->pm_obj[0].uo_refs); 757 } 758 759 /* 760 * pmap_map_ptes: map a pmap's PTEs into KVM and lock them in 761 * 762 * => we lock enough pmaps to keep things locked in 763 * => must be undone with pmap_unmap_ptes before returning 764 */ 765 766 static void 767 pmap_map_ptes(struct pmap *pmap, struct pmap **pmap2, 768 pd_entry_t **ptepp, pd_entry_t * const **pdeppp) 769 { 770 pd_entry_t opde, npde; 771 struct pmap *ourpmap; 772 struct cpu_info *ci; 773 struct lwp *l; 774 bool iscurrent; 775 uint64_t ncsw; 776 #ifdef XEN 777 int s; 778 #endif 779 780 /* the kernel's pmap is always accessible */ 781 if (pmap == pmap_kernel()) { 782 *pmap2 = NULL; 783 *ptepp = PTE_BASE; 784 *pdeppp = normal_pdes; 785 return; 786 } 787 KASSERT(kpreempt_disabled()); 788 789 retry: 790 l = curlwp; 791 ncsw = l->l_ncsw; 792 ourpmap = NULL; 793 ci = curcpu(); 794 #if defined(XEN) && defined(__x86_64__) 795 /* 796 * curmap can only be pmap_kernel so at this point 797 * pmap_is_curpmap is always false 798 */ 799 iscurrent = 0; 800 ourpmap = pmap_kernel(); 801 #else /* XEN && __x86_64__*/ 802 if (ci->ci_want_pmapload && 803 vm_map_pmap(&l->l_proc->p_vmspace->vm_map) == pmap) { 804 pmap_load(); 805 if (l->l_ncsw != ncsw) 806 goto retry; 807 } 808 iscurrent = pmap_is_curpmap(pmap); 809 /* if curpmap then we are always mapped */ 810 if (iscurrent) { 811 mutex_enter(&pmap->pm_lock); 812 *pmap2 = NULL; 813 *ptepp = PTE_BASE; 814 *pdeppp = normal_pdes; 815 goto out; 816 } 817 ourpmap = ci->ci_pmap; 818 #endif /* XEN && __x86_64__ */ 819 820 /* need to lock both curpmap and pmap: use ordered locking */ 821 pmap_reference(ourpmap); 822 if ((uintptr_t) pmap < (uintptr_t) ourpmap) { 823 mutex_enter(&pmap->pm_lock); 824 mutex_enter(&ourpmap->pm_lock); 825 } else { 826 mutex_enter(&ourpmap->pm_lock); 827 mutex_enter(&pmap->pm_lock); 828 } 829 830 if (l->l_ncsw != ncsw) 831 goto unlock_and_retry; 832 833 /* need to load a new alternate pt space into curpmap? */ 834 COUNT(apdp_pde_map); 835 opde = *APDP_PDE; 836 #ifdef XEN 837 if (!pmap_valid_entry(opde) || 838 pmap_pte2pa(opde) != pmap_pdirpa(pmap, 0)) { 839 int i; 840 s = splvm(); 841 /* Make recursive entry usable in user PGD */ 842 for (i = 0; i < PDP_SIZE; i++) { 843 npde = pmap_pa2pte( 844 pmap_pdirpa(pmap, i * NPDPG)) | PG_k | PG_V; 845 xpq_queue_pte_update( 846 xpmap_ptom(pmap_pdirpa(pmap, PDIR_SLOT_PTE + i)), 847 npde); 848 xpq_queue_pte_update(xpmap_ptetomach(&APDP_PDE[i]), 849 npde); 850 #ifdef PAE 851 /* update shadow entry too */ 852 xpq_queue_pte_update( 853 xpmap_ptetomach(&APDP_PDE_SHADOW[i]), npde); 854 #endif /* PAE */ 855 xpq_queue_invlpg( 856 (vaddr_t)&pmap->pm_pdir[PDIR_SLOT_PTE + i]); 857 } 858 xpq_flush_queue(); 859 if (pmap_valid_entry(opde)) 860 pmap_apte_flush(ourpmap); 861 splx(s); 862 } 863 #else /* XEN */ 864 npde = pmap_pa2pte(pmap_pdirpa(pmap, 0)) | PG_RW | PG_V; 865 if (!pmap_valid_entry(opde) || 866 pmap_pte2pa(opde) != pmap_pdirpa(pmap, 0)) { 867 pmap_pte_set(APDP_PDE, npde); 868 pmap_pte_flush(); 869 if (pmap_valid_entry(opde)) 870 pmap_apte_flush(ourpmap); 871 } 872 #endif /* XEN */ 873 *pmap2 = ourpmap; 874 *ptepp = APTE_BASE; 875 *pdeppp = alternate_pdes; 876 KASSERT(l->l_ncsw == ncsw); 877 #if !defined(XEN) || !defined(__x86_64__) 878 out: 879 #endif 880 /* 881 * might have blocked, need to retry? 
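 * (l_ncsw counts this lwp's context switches: if it changed while we
 * were acquiring the locks above, we may have slept and the curpmap we
 * mapped into may no longer be current, so unwind and start over.)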
882 */ 883 if (l->l_ncsw != ncsw) { 884 unlock_and_retry: 885 if (ourpmap != NULL) { 886 mutex_exit(&ourpmap->pm_lock); 887 pmap_destroy(ourpmap); 888 } 889 mutex_exit(&pmap->pm_lock); 890 goto retry; 891 } 892 893 return; 894 } 895 896 /* 897 * pmap_unmap_ptes: unlock the PTE mapping of "pmap" 898 */ 899 900 static void 901 pmap_unmap_ptes(struct pmap *pmap, struct pmap *pmap2) 902 { 903 904 if (pmap == pmap_kernel()) { 905 return; 906 } 907 KASSERT(kpreempt_disabled()); 908 if (pmap2 == NULL) { 909 mutex_exit(&pmap->pm_lock); 910 } else { 911 #if defined(XEN) && defined(__x86_64__) 912 KASSERT(pmap2 == pmap_kernel()); 913 #else 914 KASSERT(curcpu()->ci_pmap == pmap2); 915 #endif 916 #if defined(MULTIPROCESSOR) 917 pmap_pte_set(APDP_PDE, 0); 918 pmap_pte_flush(); 919 pmap_apte_flush(pmap2); 920 #endif 921 COUNT(apdp_pde_unmap); 922 mutex_exit(&pmap->pm_lock); 923 mutex_exit(&pmap2->pm_lock); 924 pmap_destroy(pmap2); 925 } 926 } 927 928 inline static void 929 pmap_exec_account(struct pmap *pm, vaddr_t va, pt_entry_t opte, pt_entry_t npte) 930 { 931 932 #if !defined(__x86_64__) 933 if (curproc == NULL || curproc->p_vmspace == NULL || 934 pm != vm_map_pmap(&curproc->p_vmspace->vm_map)) 935 return; 936 937 if ((opte ^ npte) & PG_X) 938 pmap_update_pg(va); 939 940 /* 941 * Executability was removed on the last executable change. 942 * Reset the code segment to something conservative and 943 * let the trap handler deal with setting the right limit. 944 * We can't do that because of locking constraints on the vm map. 945 */ 946 947 if ((opte & PG_X) && (npte & PG_X) == 0 && va == pm->pm_hiexec) { 948 struct trapframe *tf = curlwp->l_md.md_regs; 949 950 tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL); 951 pm->pm_hiexec = I386_MAX_EXE_ADDR; 952 } 953 #endif /* !defined(__x86_64__) */ 954 } 955 956 #if !defined(__x86_64__) 957 /* 958 * Fixup the code segment to cover all potential executable mappings. 959 * returns 0 if no changes to the code segment were made. 960 */ 961 962 int 963 pmap_exec_fixup(struct vm_map *map, struct trapframe *tf, struct pcb *pcb) 964 { 965 struct vm_map_entry *ent; 966 struct pmap *pm = vm_map_pmap(map); 967 vaddr_t va = 0; 968 969 vm_map_lock_read(map); 970 for (ent = (&map->header)->next; ent != &map->header; ent = ent->next) { 971 972 /* 973 * This entry has greater va than the entries before. 974 * We need to make it point to the last page, not past it. 975 */ 976 977 if (ent->protection & VM_PROT_EXECUTE) 978 va = trunc_page(ent->end) - PAGE_SIZE; 979 } 980 vm_map_unlock_read(map); 981 if (va == pm->pm_hiexec && tf->tf_cs == GSEL(GUCODEBIG_SEL, SEL_UPL)) 982 return (0); 983 984 pm->pm_hiexec = va; 985 if (pm->pm_hiexec > I386_MAX_EXE_ADDR) { 986 tf->tf_cs = GSEL(GUCODEBIG_SEL, SEL_UPL); 987 } else { 988 tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL); 989 return (0); 990 } 991 return (1); 992 } 993 #endif /* !defined(__x86_64__) */ 994 995 /* 996 * p m a p k e n t e r f u n c t i o n s 997 * 998 * functions to quickly enter/remove pages from the kernel address 999 * space. pmap_kremove is exported to MI kernel. we make use of 1000 * the recursive PTE mappings. 
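 *
 * a typical use (an illustrative sketch, not a quote from MI code) is
 * a short-lived kernel-only mapping of a known physical page:
 *
 *	pmap_kenter_pa(va, pa, VM_PROT_READ | VM_PROT_WRITE, 0);
 *	... access the page through va ...
 *	pmap_kremove(va, PAGE_SIZE);
 *	pmap_update(pmap_kernel());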
1001 */ 1002 1003 /* 1004 * pmap_kenter_pa: enter a kernel mapping without R/M (pv_entry) tracking 1005 * 1006 * => no need to lock anything, assume va is already allocated 1007 * => should be faster than normal pmap enter function 1008 */ 1009 1010 void 1011 pmap_kenter_pa(vaddr_t va, paddr_t pa, vm_prot_t prot, u_int flags) 1012 { 1013 pt_entry_t *pte, opte, npte; 1014 1015 KASSERT(!(prot & ~VM_PROT_ALL)); 1016 1017 if (va < VM_MIN_KERNEL_ADDRESS) 1018 pte = vtopte(va); 1019 else 1020 pte = kvtopte(va); 1021 #ifdef DOM0OPS 1022 if (pa < pmap_pa_start || pa >= pmap_pa_end) { 1023 #ifdef DEBUG 1024 printk("pmap_kenter_pa: pa 0x%" PRIx64 " for va 0x%" PRIx64 1025 " outside range\n", (int64_t)pa, (int64_t)va); 1026 #endif /* DEBUG */ 1027 npte = pa; 1028 } else 1029 #endif /* DOM0OPS */ 1030 npte = pmap_pa2pte(pa); 1031 npte |= protection_codes[prot] | PG_k | PG_V | pmap_pg_g; 1032 if (flags & PMAP_NOCACHE) 1033 npte |= PG_N; 1034 opte = pmap_pte_testset(pte, npte); /* zap! */ 1035 #if defined(DIAGNOSTIC) 1036 /* XXX For now... */ 1037 if (opte & PG_PS) 1038 panic("pmap_kenter_pa: PG_PS"); 1039 #endif 1040 if ((opte & (PG_V | PG_U)) == (PG_V | PG_U)) { 1041 /* This should not happen, so no need to batch updates. */ 1042 kpreempt_disable(); 1043 pmap_tlb_shootdown(pmap_kernel(), va, 0, opte); 1044 kpreempt_enable(); 1045 } 1046 } 1047 1048 void 1049 pmap_emap_enter(vaddr_t va, paddr_t pa, vm_prot_t prot) 1050 { 1051 pt_entry_t *pte, opte, npte; 1052 1053 KASSERT((prot & ~VM_PROT_ALL) == 0); 1054 pte = (va < VM_MIN_KERNEL_ADDRESS) ? vtopte(va) : kvtopte(va); 1055 1056 #ifdef DOM0OPS 1057 if (pa < pmap_pa_start || pa >= pmap_pa_end) { 1058 npte = pa; 1059 } else 1060 #endif 1061 npte = pmap_pa2pte(pa); 1062 1063 npte = pmap_pa2pte(pa); 1064 npte |= protection_codes[prot] | PG_k | PG_V; 1065 opte = pmap_pte_testset(pte, npte); 1066 } 1067 1068 /* 1069 * pmap_emap_sync: perform TLB flush or pmap load, if it was deferred. 1070 */ 1071 void 1072 pmap_emap_sync(bool canload) 1073 { 1074 struct cpu_info *ci = curcpu(); 1075 struct pmap *pmap; 1076 1077 KASSERT(kpreempt_disabled()); 1078 if (__predict_true(ci->ci_want_pmapload && canload)) { 1079 /* 1080 * XXX: Hint for pmap_reactivate(), which might suggest to 1081 * not perform TLB flush, if state has not changed. 1082 */ 1083 pmap = vm_map_pmap(&curlwp->l_proc->p_vmspace->vm_map); 1084 if (__predict_false(pmap == ci->ci_pmap)) { 1085 const uint32_t cpumask = ci->ci_cpumask; 1086 atomic_and_32(&pmap->pm_cpus, ~cpumask); 1087 } 1088 pmap_load(); 1089 KASSERT(ci->ci_want_pmapload == 0); 1090 } else { 1091 tlbflush(); 1092 } 1093 1094 } 1095 1096 void 1097 pmap_emap_remove(vaddr_t sva, vsize_t len) 1098 { 1099 pt_entry_t *pte, xpte; 1100 vaddr_t va, eva = sva + len; 1101 1102 for (va = sva; va < eva; va += PAGE_SIZE) { 1103 pte = (va < VM_MIN_KERNEL_ADDRESS) ? vtopte(va) : kvtopte(va); 1104 xpte |= pmap_pte_testset(pte, 0); 1105 } 1106 } 1107 1108 #ifdef XEN 1109 /* 1110 * pmap_kenter_ma: enter a kernel mapping without R/M (pv_entry) tracking 1111 * 1112 * => no need to lock anything, assume va is already allocated 1113 * => should be faster than normal pmap enter function 1114 * => we expect a MACHINE address 1115 */ 1116 1117 void 1118 pmap_kenter_ma(vaddr_t va, paddr_t ma, vm_prot_t prot, u_int flags) 1119 { 1120 pt_entry_t *pte, opte, npte; 1121 1122 if (va < VM_MIN_KERNEL_ADDRESS) 1123 pte = vtopte(va); 1124 else 1125 pte = kvtopte(va); 1126 1127 npte = ma | ((prot & VM_PROT_WRITE) ? 
PG_RW : PG_RO) | 1128 PG_V | PG_k; 1129 if (flags & PMAP_NOCACHE) 1130 npte |= PG_N; 1131 1132 #ifndef XEN 1133 if ((cpu_feature & CPUID_NOX) && !(prot & VM_PROT_EXECUTE)) 1134 npte |= PG_NX; 1135 #endif 1136 opte = pmap_pte_testset (pte, npte); /* zap! */ 1137 1138 if (pmap_valid_entry(opte)) { 1139 #if defined(MULTIPROCESSOR) 1140 kpreempt_disable(); 1141 pmap_tlb_shootdown(pmap_kernel(), va, 0, opte); 1142 kpreempt_enable(); 1143 #else 1144 /* Don't bother deferring in the single CPU case. */ 1145 pmap_update_pg(va); 1146 #endif 1147 } 1148 } 1149 #endif /* XEN */ 1150 1151 #if defined(__x86_64__) 1152 /* 1153 * Change protection for a virtual address. Local for a CPU only, don't 1154 * care about TLB shootdowns. 1155 * 1156 * => must be called with preemption disabled 1157 */ 1158 void 1159 pmap_changeprot_local(vaddr_t va, vm_prot_t prot) 1160 { 1161 pt_entry_t *pte, opte, npte; 1162 1163 KASSERT(kpreempt_disabled()); 1164 1165 if (va < VM_MIN_KERNEL_ADDRESS) 1166 pte = vtopte(va); 1167 else 1168 pte = kvtopte(va); 1169 1170 npte = opte = *pte; 1171 1172 if ((prot & VM_PROT_WRITE) != 0) 1173 npte |= PG_RW; 1174 else 1175 npte &= ~PG_RW; 1176 1177 if (opte != npte) { 1178 pmap_pte_set(pte, npte); 1179 pmap_pte_flush(); 1180 invlpg(va); 1181 } 1182 } 1183 #endif /* defined(__x86_64__) */ 1184 1185 /* 1186 * pmap_kremove: remove a kernel mapping(s) without R/M (pv_entry) tracking 1187 * 1188 * => no need to lock anything 1189 * => caller must dispose of any vm_page mapped in the va range 1190 * => note: not an inline function 1191 * => we assume the va is page aligned and the len is a multiple of PAGE_SIZE 1192 * => we assume kernel only unmaps valid addresses and thus don't bother 1193 * checking the valid bit before doing TLB flushing 1194 * => must be followed by call to pmap_update() before reuse of page 1195 */ 1196 1197 void 1198 pmap_kremove(vaddr_t sva, vsize_t len) 1199 { 1200 pt_entry_t *pte, xpte; 1201 vaddr_t va, eva; 1202 1203 eva = sva + len; 1204 xpte = 0; 1205 1206 for (va = sva; va < eva; va += PAGE_SIZE) { 1207 if (va < VM_MIN_KERNEL_ADDRESS) 1208 pte = vtopte(va); 1209 else 1210 pte = kvtopte(va); 1211 xpte |= pmap_pte_testset(pte, 0); /* zap! */ 1212 #if defined(DIAGNOSTIC) 1213 /* XXX For now... */ 1214 if (xpte & PG_PS) 1215 panic("pmap_kremove: PG_PS"); 1216 if (xpte & PG_PVLIST) 1217 panic("pmap_kremove: PG_PVLIST mapping for 0x%lx", 1218 va); 1219 #endif 1220 } 1221 if ((xpte & (PG_V | PG_U)) == (PG_V | PG_U)) { 1222 kpreempt_disable(); 1223 pmap_tlb_shootdown(pmap_kernel(), sva, eva, xpte); 1224 kpreempt_enable(); 1225 } 1226 } 1227 1228 /* 1229 * p m a p i n i t f u n c t i o n s 1230 * 1231 * pmap_bootstrap and pmap_init are called during system startup 1232 * to init the pmap module. pmap_bootstrap() does a low level 1233 * init just to get things rolling. pmap_init() finishes the job. 1234 */ 1235 1236 /* 1237 * pmap_bootstrap: get the system in a state where it can run with VM 1238 * properly enabled (called before main()). the VM system is 1239 * fully init'd later... 1240 * 1241 * => on i386, locore.s has already enabled the MMU by allocating 1242 * a PDP for the kernel, and nkpde PTP's for the kernel. 
1243 * => kva_start is the first free virtual address in kernel space 1244 */ 1245 1246 void 1247 pmap_bootstrap(vaddr_t kva_start) 1248 { 1249 struct pmap *kpm; 1250 pt_entry_t *pte; 1251 struct pcb *pcb; 1252 int i; 1253 vaddr_t kva; 1254 #ifdef XEN 1255 pt_entry_t pg_nx = 0; 1256 #else 1257 unsigned long p1i; 1258 vaddr_t kva_end; 1259 pt_entry_t pg_nx = (cpu_feature & CPUID_NOX ? PG_NX : 0); 1260 #endif 1261 1262 /* 1263 * set up our local static global vars that keep track of the 1264 * usage of KVM before kernel_map is set up 1265 */ 1266 1267 virtual_avail = kva_start; /* first free KVA */ 1268 virtual_end = VM_MAX_KERNEL_ADDRESS; /* last KVA */ 1269 1270 /* 1271 * set up protection_codes: we need to be able to convert from 1272 * a MI protection code (some combo of VM_PROT...) to something 1273 * we can jam into a i386 PTE. 1274 */ 1275 1276 protection_codes[VM_PROT_NONE] = pg_nx; /* --- */ 1277 protection_codes[VM_PROT_EXECUTE] = PG_RO | PG_X; /* --x */ 1278 protection_codes[VM_PROT_READ] = PG_RO | pg_nx; /* -r- */ 1279 protection_codes[VM_PROT_READ|VM_PROT_EXECUTE] = PG_RO | PG_X;/* -rx */ 1280 protection_codes[VM_PROT_WRITE] = PG_RW | pg_nx; /* w-- */ 1281 protection_codes[VM_PROT_WRITE|VM_PROT_EXECUTE] = PG_RW | PG_X;/* w-x */ 1282 protection_codes[VM_PROT_WRITE|VM_PROT_READ] = PG_RW | pg_nx; 1283 /* wr- */ 1284 protection_codes[VM_PROT_ALL] = PG_RW | PG_X; /* wrx */ 1285 1286 /* 1287 * now we init the kernel's pmap 1288 * 1289 * the kernel pmap's pm_obj is not used for much. however, in 1290 * user pmaps the pm_obj contains the list of active PTPs. 1291 * the pm_obj currently does not have a pager. it might be possible 1292 * to add a pager that would allow a process to read-only mmap its 1293 * own page tables (fast user level vtophys?). this may or may not 1294 * be useful. 1295 */ 1296 1297 kpm = pmap_kernel(); 1298 for (i = 0; i < PTP_LEVELS - 1; i++) { 1299 UVM_OBJ_INIT(&kpm->pm_obj[i], NULL, 1); 1300 kpm->pm_ptphint[i] = NULL; 1301 } 1302 memset(&kpm->pm_list, 0, sizeof(kpm->pm_list)); /* pm_list not used */ 1303 pcb = lwp_getpcb(&lwp0); 1304 kpm->pm_pdir = (pd_entry_t *)(pcb->pcb_cr3 + KERNBASE); 1305 #ifdef PAE 1306 for (i = 0; i < PDP_SIZE; i++) 1307 kpm->pm_pdirpa[i] = (paddr_t)pcb->pcb_cr3 + PAGE_SIZE * i; 1308 #else 1309 kpm->pm_pdirpa = (paddr_t)pcb->pcb_cr3; 1310 #endif 1311 kpm->pm_stats.wired_count = kpm->pm_stats.resident_count = 1312 x86_btop(kva_start - VM_MIN_KERNEL_ADDRESS); 1313 1314 /* 1315 * the above is just a rough estimate and not critical to the proper 1316 * operation of the system. 1317 */ 1318 1319 #ifndef XEN 1320 /* 1321 * Begin to enable global TLB entries if they are supported. 1322 * The G bit has no effect until the CR4_PGE bit is set in CR4, 1323 * which happens in cpu_init(), which is run on each cpu 1324 * (and happens later) 1325 */ 1326 1327 if (cpu_feature & CPUID_PGE) { 1328 pmap_pg_g = PG_G; /* enable software */ 1329 1330 /* add PG_G attribute to already mapped kernel pages */ 1331 if (KERNBASE == VM_MIN_KERNEL_ADDRESS) { 1332 kva_end = virtual_avail; 1333 } else { 1334 extern vaddr_t eblob, esym; 1335 kva_end = (vaddr_t)&end; 1336 if (esym > kva_end) 1337 kva_end = esym; 1338 if (eblob > kva_end) 1339 kva_end = eblob; 1340 kva_end = roundup(kva_end, PAGE_SIZE); 1341 } 1342 for (kva = KERNBASE; kva < kva_end; kva += PAGE_SIZE) { 1343 p1i = pl1_i(kva); 1344 if (pmap_valid_entry(PTE_BASE[p1i])) 1345 PTE_BASE[p1i] |= PG_G; 1346 } 1347 } 1348 1349 /* 1350 * enable large pages if they are supported. 
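 * (a PDE with PG_PS set maps a whole NBPD_L2 sized region with a
 * single TLB entry, so the kernel text remapped below needs one entry
 * per large page instead of one per normal page.)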
1351 */ 1352 1353 if (cpu_feature & CPUID_PSE) { 1354 paddr_t pa; 1355 pd_entry_t *pde; 1356 extern char __data_start; 1357 1358 lcr4(rcr4() | CR4_PSE); /* enable hardware (via %cr4) */ 1359 pmap_largepages = 1; /* enable software */ 1360 1361 /* 1362 * the TLB must be flushed after enabling large pages 1363 * on Pentium CPUs, according to section 3.6.2.2 of 1364 * "Intel Architecture Software Developer's Manual, 1365 * Volume 3: System Programming". 1366 */ 1367 tlbflush(); 1368 1369 /* 1370 * now, remap the kernel text using large pages. we 1371 * assume that the linker has properly aligned the 1372 * .data segment to a NBPD_L2 boundary. 1373 */ 1374 kva_end = rounddown((vaddr_t)&__data_start, NBPD_L1); 1375 for (pa = 0, kva = KERNBASE; kva + NBPD_L2 <= kva_end; 1376 kva += NBPD_L2, pa += NBPD_L2) { 1377 pde = &L2_BASE[pl2_i(kva)]; 1378 *pde = pa | pmap_pg_g | PG_PS | 1379 PG_KR | PG_V; /* zap! */ 1380 tlbflush(); 1381 } 1382 #if defined(DEBUG) 1383 printf("kernel text is mapped with " 1384 "%lu large pages and %lu normal pages\n", 1385 (unsigned long)howmany(kva - KERNBASE, NBPD_L2), 1386 (unsigned long)howmany((vaddr_t)&__data_start - kva, 1387 NBPD_L1)); 1388 #endif /* defined(DEBUG) */ 1389 } 1390 #endif /* !XEN */ 1391 1392 if (VM_MIN_KERNEL_ADDRESS != KERNBASE) { 1393 /* 1394 * zero_pte is stuck at the end of mapped space for the kernel 1395 * image (disjunct from kva space). This is done so that it 1396 * can safely be used in pmap_growkernel (pmap_get_physpage), 1397 * when it's called for the first time. 1398 * XXXfvdl fix this for MULTIPROCESSOR later. 1399 */ 1400 1401 early_zerop = (void *)(KERNBASE + NKL2_KIMG_ENTRIES * NBPD_L2); 1402 early_zero_pte = PTE_BASE + pl1_i((unsigned long)early_zerop); 1403 } 1404 1405 /* 1406 * now we allocate the "special" VAs which are used for tmp mappings 1407 * by the pmap (and other modules). we allocate the VAs by advancing 1408 * virtual_avail (note that there are no pages mapped at these VAs). 1409 * we find the PTE that maps the allocated VA via the linear PTE 1410 * mapping. 1411 */ 1412 1413 pte = PTE_BASE + pl1_i(virtual_avail); 1414 1415 #ifdef MULTIPROCESSOR 1416 /* 1417 * Waste some VA space to avoid false sharing of cache lines 1418 * for page table pages: Give each possible CPU a cache line 1419 * of PTE's (8) to play with, though we only need 4. We could 1420 * recycle some of this waste by putting the idle stacks here 1421 * as well; we could waste less space if we knew the largest 1422 * CPU ID beforehand. 
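 * as an illustration only: cpu N's private copies of csrcp, cdstp,
 * zerop and ptpp end up at VASLEW(csrcp, N) etc., N*NPTECL pages past
 * cpu0's, with the matching PTEs at PTESLEW(csrc_pte, N); only 4 of
 * the NPTECL slots per cpu are used.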
1423 */ 1424 csrcp = (char *) virtual_avail; csrc_pte = pte; 1425 1426 cdstp = (char *) virtual_avail+PAGE_SIZE; cdst_pte = pte+1; 1427 1428 zerop = (char *) virtual_avail+PAGE_SIZE*2; zero_pte = pte+2; 1429 1430 ptpp = (char *) virtual_avail+PAGE_SIZE*3; ptp_pte = pte+3; 1431 1432 virtual_avail += PAGE_SIZE * maxcpus * NPTECL; 1433 pte += maxcpus * NPTECL; 1434 #else 1435 csrcp = (void *) virtual_avail; csrc_pte = pte; /* allocate */ 1436 virtual_avail += PAGE_SIZE; pte++; /* advance */ 1437 1438 cdstp = (void *) virtual_avail; cdst_pte = pte; 1439 virtual_avail += PAGE_SIZE; pte++; 1440 1441 zerop = (void *) virtual_avail; zero_pte = pte; 1442 virtual_avail += PAGE_SIZE; pte++; 1443 1444 ptpp = (void *) virtual_avail; ptp_pte = pte; 1445 virtual_avail += PAGE_SIZE; pte++; 1446 #endif 1447 1448 if (VM_MIN_KERNEL_ADDRESS == KERNBASE) { 1449 early_zerop = zerop; 1450 early_zero_pte = zero_pte; 1451 } 1452 1453 /* 1454 * Nothing after this point actually needs pte; 1455 */ 1456 pte = (void *)0xdeadbeef; 1457 1458 /* XXX: vmmap used by mem.c... should be uvm_map_reserve */ 1459 /* XXXfvdl PTEs not needed here */ 1460 vmmap = (char *)virtual_avail; /* don't need pte */ 1461 virtual_avail += PAGE_SIZE; pte++; 1462 1463 #ifdef XEN 1464 #ifdef __x86_64__ 1465 /* 1466 * We want a dummy page directory for Xen: 1467 * when deactivate a pmap, Xen will still consider it active. 1468 * So we set user PGD to this one to lift all protection on 1469 * the now inactive page tables set. 1470 */ 1471 xen_dummy_user_pgd = avail_start; 1472 avail_start += PAGE_SIZE; 1473 1474 /* Zero fill it, the less checks in Xen it requires the better */ 1475 memset((void *) (xen_dummy_user_pgd + KERNBASE), 0, PAGE_SIZE); 1476 /* Mark read-only */ 1477 HYPERVISOR_update_va_mapping(xen_dummy_user_pgd + KERNBASE, 1478 pmap_pa2pte(xen_dummy_user_pgd) | PG_u | PG_V, UVMF_INVLPG); 1479 /* Pin as L4 */ 1480 xpq_queue_pin_table(xpmap_ptom_masked(xen_dummy_user_pgd)); 1481 #endif /* __x86_64__ */ 1482 idt_vaddr = virtual_avail; /* don't need pte */ 1483 idt_paddr = avail_start; /* steal a page */ 1484 /* 1485 * Xen require one more page as we can't store 1486 * GDT and LDT on the same page 1487 */ 1488 virtual_avail += 3 * PAGE_SIZE; 1489 avail_start += 3 * PAGE_SIZE; 1490 #else /* XEN */ 1491 idt_vaddr = virtual_avail; /* don't need pte */ 1492 idt_paddr = avail_start; /* steal a page */ 1493 #if defined(__x86_64__) 1494 virtual_avail += 2 * PAGE_SIZE; pte += 2; 1495 avail_start += 2 * PAGE_SIZE; 1496 #else /* defined(__x86_64__) */ 1497 virtual_avail += PAGE_SIZE; pte++; 1498 avail_start += PAGE_SIZE; 1499 /* pentium f00f bug stuff */ 1500 pentium_idt_vaddr = virtual_avail; /* don't need pte */ 1501 virtual_avail += PAGE_SIZE; pte++; 1502 #endif /* defined(__x86_64__) */ 1503 #endif /* XEN */ 1504 1505 #ifdef _LP64 1506 /* 1507 * Grab a page below 4G for things that need it (i.e. 1508 * having an initial %cr3 for the MP trampoline). 1509 */ 1510 lo32_vaddr = virtual_avail; 1511 virtual_avail += PAGE_SIZE; pte++; 1512 lo32_paddr = avail_start; 1513 avail_start += PAGE_SIZE; 1514 #endif 1515 1516 /* 1517 * now we reserve some VM for mapping pages when doing a crash dump 1518 */ 1519 1520 virtual_avail = reserve_dumppages(virtual_avail); 1521 1522 /* 1523 * init the static-global locks and global lists. 1524 * 1525 * => pventry::pvh_lock (initialized elsewhere) must also be 1526 * a spin lock, again at IPL_VM to prevent deadlock, and 1527 * again is never taken from interrupt context. 
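 * => the pv_hash_locks protecting the pv hash chains are likewise
 *    spin locks at IPL_VM; they are initialized later, in pmap_init().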
1528 */ 1529 1530 mutex_init(&pmaps_lock, MUTEX_DEFAULT, IPL_NONE); 1531 LIST_INIT(&pmaps); 1532 pmap_cpu_init_early(curcpu()); 1533 1534 /* 1535 * initialize caches. 1536 */ 1537 1538 pool_cache_bootstrap(&pmap_cache, sizeof(struct pmap), 0, 0, 0, 1539 "pmappl", NULL, IPL_NONE, NULL, NULL, NULL); 1540 #ifdef PAE 1541 pool_cache_bootstrap(&pmap_pdp_cache, PAGE_SIZE * PDP_SIZE, 0, 0, 0, 1542 "pdppl", &pmap_pdp_allocator, IPL_NONE, 1543 pmap_pdp_ctor, pmap_pdp_dtor, NULL); 1544 #else /* PAE */ 1545 pool_cache_bootstrap(&pmap_pdp_cache, PAGE_SIZE, 0, 0, 0, 1546 "pdppl", NULL, IPL_NONE, pmap_pdp_ctor, pmap_pdp_dtor, NULL); 1547 #endif /* PAE */ 1548 pool_cache_bootstrap(&pmap_pv_cache, sizeof(struct pv_entry), 0, 0, 1549 PR_LARGECACHE, "pvpl", &pool_allocator_meta, IPL_NONE, NULL, 1550 NULL, NULL); 1551 1552 /* 1553 * ensure the TLB is sync'd with reality by flushing it... 1554 */ 1555 1556 tlbflush(); 1557 1558 /* 1559 * calculate pmap_maxkvaddr from nkptp[]. 1560 */ 1561 1562 kva = VM_MIN_KERNEL_ADDRESS; 1563 for (i = PTP_LEVELS - 1; i >= 1; i--) { 1564 kva += nkptp[i] * nbpd[i]; 1565 } 1566 pmap_maxkvaddr = kva; 1567 } 1568 1569 #if defined(__x86_64__) 1570 /* 1571 * Pre-allocate PTPs for low memory, so that 1:1 mappings for various 1572 * trampoline code can be entered. 1573 */ 1574 void 1575 pmap_prealloc_lowmem_ptps(void) 1576 { 1577 #ifdef XEN 1578 int level; 1579 paddr_t newp; 1580 paddr_t pdes_pa; 1581 1582 pdes_pa = pmap_kernel()->pm_pdirpa; 1583 level = PTP_LEVELS; 1584 for (;;) { 1585 newp = avail_start; 1586 avail_start += PAGE_SIZE; 1587 HYPERVISOR_update_va_mapping ((vaddr_t)early_zerop, 1588 xpmap_ptom_masked(newp) | PG_u | PG_V | PG_RW, UVMF_INVLPG); 1589 memset((void *)early_zerop, 0, PAGE_SIZE); 1590 /* Mark R/O before installing */ 1591 HYPERVISOR_update_va_mapping ((vaddr_t)early_zerop, 1592 xpmap_ptom_masked(newp) | PG_u | PG_V, UVMF_INVLPG); 1593 if (newp < (NKL2_KIMG_ENTRIES * NBPD_L2)) 1594 HYPERVISOR_update_va_mapping (newp + KERNBASE, 1595 xpmap_ptom_masked(newp) | PG_u | PG_V, UVMF_INVLPG); 1596 xpq_queue_pte_update ( 1597 xpmap_ptom_masked(pdes_pa) 1598 + (pl_i(0, level) * sizeof (pd_entry_t)), 1599 xpmap_ptom_masked(newp) | PG_RW | PG_u | PG_V); 1600 level--; 1601 if (level <= 1) 1602 break; 1603 pdes_pa = newp; 1604 } 1605 #else /* XEN */ 1606 pd_entry_t *pdes; 1607 int level; 1608 paddr_t newp; 1609 1610 pdes = pmap_kernel()->pm_pdir; 1611 level = PTP_LEVELS; 1612 for (;;) { 1613 newp = avail_start; 1614 avail_start += PAGE_SIZE; 1615 *early_zero_pte = (newp & PG_FRAME) | PG_V | PG_RW; 1616 pmap_update_pg((vaddr_t)early_zerop); 1617 memset(early_zerop, 0, PAGE_SIZE); 1618 pdes[pl_i(0, level)] = (newp & PG_FRAME) | PG_V | PG_RW; 1619 level--; 1620 if (level <= 1) 1621 break; 1622 pdes = normal_pdes[level - 2]; 1623 } 1624 #endif /* XEN */ 1625 } 1626 #endif /* defined(__x86_64__) */ 1627 1628 /* 1629 * pmap_init: called from uvm_init, our job is to get the pmap 1630 * system ready to manage mappings... 1631 */ 1632 1633 void 1634 pmap_init(void) 1635 { 1636 int i; 1637 1638 for (i = 0; i < PV_HASH_SIZE; i++) { 1639 SLIST_INIT(&pv_hash_heads[i].hh_list); 1640 } 1641 for (i = 0; i < PV_HASH_LOCK_CNT; i++) { 1642 mutex_init(&pv_hash_locks[i].lock, MUTEX_NODEBUG, IPL_VM); 1643 } 1644 1645 /* 1646 * done: pmap module is up (and ready for business) 1647 */ 1648 1649 pmap_initialized = true; 1650 } 1651 1652 /* 1653 * pmap_cpu_init_early: perform early per-CPU initialization. 
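 * this simply hands the cpu its private slot (and thus its per-cpu TLB
 * shootdown mailbox) out of the static pmap_cpu[] array.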
1654 */ 1655 1656 void 1657 pmap_cpu_init_early(struct cpu_info *ci) 1658 { 1659 struct pmap_cpu *pc; 1660 static uint8_t pmap_cpu_alloc; 1661 1662 pc = &pmap_cpu[pmap_cpu_alloc++].pc; 1663 ci->ci_pmap_cpu = pc; 1664 } 1665 1666 /* 1667 * pmap_cpu_init_late: perform late per-CPU initialization. 1668 */ 1669 1670 void 1671 pmap_cpu_init_late(struct cpu_info *ci) 1672 { 1673 1674 if (ci == &cpu_info_primary) { 1675 evcnt_attach_dynamic(&pmap_tlb_evcnt, EVCNT_TYPE_INTR, 1676 NULL, "global", "TLB IPI"); 1677 evcnt_attach_dynamic(&pmap_iobmp_evcnt, EVCNT_TYPE_MISC, 1678 NULL, "x86", "io bitmap copy"); 1679 evcnt_attach_dynamic(&pmap_ldt_evcnt, EVCNT_TYPE_MISC, 1680 NULL, "x86", "ldt sync"); 1681 } 1682 1683 evcnt_attach_dynamic(&ci->ci_tlb_evcnt, EVCNT_TYPE_MISC, 1684 NULL, device_xname(ci->ci_dev), "TLB IPI"); 1685 } 1686 1687 /* 1688 * p v _ e n t r y f u n c t i o n s 1689 */ 1690 1691 /* 1692 * pmap_free_pvs: free a list of pv_entrys 1693 */ 1694 1695 static void 1696 pmap_free_pvs(struct pv_entry *pve) 1697 { 1698 struct pv_entry *next; 1699 1700 for ( /* null */ ; pve != NULL ; pve = next) { 1701 next = pve->pve_next; 1702 pool_cache_put(&pmap_pv_cache, pve); 1703 } 1704 } 1705 1706 /* 1707 * main pv_entry manipulation functions: 1708 * pmap_enter_pv: enter a mapping onto a pv_head list 1709 * pmap_remove_pv: remove a mapping from a pv_head list 1710 * 1711 * NOTE: Both pmap_enter_pv and pmap_remove_pv expect the caller to lock 1712 * the pvh before calling 1713 */ 1714 1715 /* 1716 * insert_pv: a helper of pmap_enter_pv 1717 */ 1718 1719 static void 1720 insert_pv(struct pmap_page *pp, struct pv_entry *pve) 1721 { 1722 struct pv_hash_head *hh; 1723 kmutex_t *lock; 1724 u_int hash; 1725 1726 KASSERT(pp_locked(pp)); 1727 1728 hash = pvhash_hash(pve->pve_pte.pte_ptp, pve->pve_pte.pte_va); 1729 lock = pvhash_lock(hash); 1730 hh = pvhash_head(hash); 1731 mutex_spin_enter(lock); 1732 SLIST_INSERT_HEAD(&hh->hh_list, pve, pve_hash); 1733 mutex_spin_exit(lock); 1734 1735 LIST_INSERT_HEAD(&pp->pp_head.pvh_list, pve, pve_list); 1736 } 1737 1738 /* 1739 * pmap_enter_pv: enter a mapping onto a pv_head lst 1740 * 1741 * => caller should have the pp_lock locked 1742 * => caller should adjust ptp's wire_count before calling 1743 */ 1744 1745 static struct pv_entry * 1746 pmap_enter_pv(struct pmap_page *pp, 1747 struct pv_entry *pve, /* preallocated pve for us to use */ 1748 struct pv_entry **sparepve, 1749 struct vm_page *ptp, 1750 vaddr_t va) 1751 { 1752 1753 KASSERT(ptp == NULL || ptp->wire_count >= 2); 1754 KASSERT(ptp == NULL || ptp->uobject != NULL); 1755 KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset); 1756 KASSERT(pp_locked(pp)); 1757 1758 if ((pp->pp_flags & PP_EMBEDDED) == 0) { 1759 if (LIST_EMPTY(&pp->pp_head.pvh_list)) { 1760 pp->pp_flags |= PP_EMBEDDED; 1761 pp->pp_pte.pte_ptp = ptp; 1762 pp->pp_pte.pte_va = va; 1763 1764 return pve; 1765 } 1766 } else { 1767 struct pv_entry *pve2; 1768 1769 pve2 = *sparepve; 1770 *sparepve = NULL; 1771 1772 pve2->pve_pte = pp->pp_pte; 1773 pp->pp_flags &= ~PP_EMBEDDED; 1774 LIST_INIT(&pp->pp_head.pvh_list); 1775 insert_pv(pp, pve2); 1776 } 1777 1778 pve->pve_pte.pte_ptp = ptp; 1779 pve->pve_pte.pte_va = va; 1780 insert_pv(pp, pve); 1781 1782 return NULL; 1783 } 1784 1785 /* 1786 * pmap_remove_pv: try to remove a mapping from a pv_list 1787 * 1788 * => caller should hold pp_lock [so that attrs can be adjusted] 1789 * => caller should adjust ptp's wire_count and free PTP if needed 1790 * => we return the removed pve 1791 */ 1792 1793 static struct 
pv_entry * 1794 pmap_remove_pv(struct pmap_page *pp, struct vm_page *ptp, vaddr_t va) 1795 { 1796 struct pv_hash_head *hh; 1797 struct pv_entry *pve; 1798 kmutex_t *lock; 1799 u_int hash; 1800 1801 KASSERT(ptp == NULL || ptp->uobject != NULL); 1802 KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset); 1803 KASSERT(pp_locked(pp)); 1804 1805 if ((pp->pp_flags & PP_EMBEDDED) != 0) { 1806 KASSERT(pp->pp_pte.pte_ptp == ptp); 1807 KASSERT(pp->pp_pte.pte_va == va); 1808 1809 pp->pp_flags &= ~PP_EMBEDDED; 1810 LIST_INIT(&pp->pp_head.pvh_list); 1811 1812 return NULL; 1813 } 1814 1815 hash = pvhash_hash(ptp, va); 1816 lock = pvhash_lock(hash); 1817 hh = pvhash_head(hash); 1818 mutex_spin_enter(lock); 1819 pve = pvhash_remove(hh, ptp, va); 1820 mutex_spin_exit(lock); 1821 1822 LIST_REMOVE(pve, pve_list); 1823 1824 return pve; 1825 } 1826 1827 /* 1828 * p t p f u n c t i o n s 1829 */ 1830 1831 static inline struct vm_page * 1832 pmap_find_ptp(struct pmap *pmap, vaddr_t va, paddr_t pa, int level) 1833 { 1834 int lidx = level - 1; 1835 struct vm_page *pg; 1836 1837 KASSERT(mutex_owned(&pmap->pm_lock)); 1838 1839 if (pa != (paddr_t)-1 && pmap->pm_ptphint[lidx] && 1840 pa == VM_PAGE_TO_PHYS(pmap->pm_ptphint[lidx])) { 1841 return (pmap->pm_ptphint[lidx]); 1842 } 1843 PMAP_SUBOBJ_LOCK(pmap, lidx); 1844 pg = uvm_pagelookup(&pmap->pm_obj[lidx], ptp_va2o(va, level)); 1845 PMAP_SUBOBJ_UNLOCK(pmap, lidx); 1846 1847 KASSERT(pg == NULL || pg->wire_count >= 1); 1848 return pg; 1849 } 1850 1851 static inline void 1852 pmap_freepage(struct pmap *pmap, struct vm_page *ptp, int level) 1853 { 1854 int lidx; 1855 struct uvm_object *obj; 1856 1857 KASSERT(ptp->wire_count == 1); 1858 1859 lidx = level - 1; 1860 1861 obj = &pmap->pm_obj[lidx]; 1862 pmap_stats_update(pmap, -1, 0); 1863 if (lidx != 0) 1864 mutex_enter(&obj->vmobjlock); 1865 if (pmap->pm_ptphint[lidx] == ptp) 1866 pmap->pm_ptphint[lidx] = TAILQ_FIRST(&obj->memq); 1867 ptp->wire_count = 0; 1868 uvm_pagerealloc(ptp, NULL, 0); 1869 VM_PAGE_TO_PP(ptp)->pp_link = curlwp->l_md.md_gc_ptp; 1870 curlwp->l_md.md_gc_ptp = ptp; 1871 if (lidx != 0) 1872 mutex_exit(&obj->vmobjlock); 1873 } 1874 1875 static void 1876 pmap_free_ptp(struct pmap *pmap, struct vm_page *ptp, vaddr_t va, 1877 pt_entry_t *ptes, pd_entry_t * const *pdes) 1878 { 1879 unsigned long index; 1880 int level; 1881 vaddr_t invaladdr; 1882 #ifdef MULTIPROCESSOR 1883 vaddr_t invaladdr2; 1884 #endif 1885 pd_entry_t opde; 1886 struct pmap *curpmap = vm_map_pmap(&curlwp->l_proc->p_vmspace->vm_map); 1887 1888 KASSERT(pmap != pmap_kernel()); 1889 KASSERT(mutex_owned(&pmap->pm_lock)); 1890 KASSERT(kpreempt_disabled()); 1891 1892 level = 1; 1893 do { 1894 index = pl_i(va, level + 1); 1895 opde = pmap_pte_testset(&pdes[level - 1][index], 0); 1896 #if defined(XEN) && defined(__x86_64__) 1897 /* 1898 * If ptp is a L3 currently mapped in kernel space, 1899 * clear it before freeing 1900 */ 1901 if (pmap->pm_pdirpa == xen_current_user_pgd 1902 && level == PTP_LEVELS - 1) 1903 pmap_pte_set(&pmap_kernel()->pm_pdir[index], 0); 1904 #endif /* XEN && __x86_64__ */ 1905 pmap_freepage(pmap, ptp, level); 1906 invaladdr = level == 1 ? (vaddr_t)ptes : 1907 (vaddr_t)pdes[level - 2]; 1908 pmap_tlb_shootdown(curpmap, invaladdr + index * PAGE_SIZE, 1909 0, opde); 1910 #if defined(MULTIPROCESSOR) 1911 invaladdr2 = level == 1 ? 
(vaddr_t)PTE_BASE : 1912 (vaddr_t)normal_pdes[level - 2]; 1913 if (pmap != curpmap || invaladdr != invaladdr2) { 1914 pmap_tlb_shootdown(pmap, invaladdr2 + index * PAGE_SIZE, 1915 0, opde); 1916 } 1917 #endif 1918 if (level < PTP_LEVELS - 1) { 1919 ptp = pmap_find_ptp(pmap, va, (paddr_t)-1, level + 1); 1920 ptp->wire_count--; 1921 if (ptp->wire_count > 1) 1922 break; 1923 } 1924 } while (++level < PTP_LEVELS); 1925 pmap_pte_flush(); 1926 } 1927 1928 /* 1929 * pmap_get_ptp: get a PTP (if there isn't one, allocate a new one) 1930 * 1931 * => pmap should NOT be pmap_kernel() 1932 * => pmap should be locked 1933 * => preemption should be disabled 1934 */ 1935 1936 static struct vm_page * 1937 pmap_get_ptp(struct pmap *pmap, vaddr_t va, pd_entry_t * const *pdes) 1938 { 1939 struct vm_page *ptp, *pptp; 1940 int i; 1941 unsigned long index; 1942 pd_entry_t *pva; 1943 paddr_t ppa, pa; 1944 struct uvm_object *obj; 1945 1946 KASSERT(pmap != pmap_kernel()); 1947 KASSERT(mutex_owned(&pmap->pm_lock)); 1948 KASSERT(kpreempt_disabled()); 1949 1950 ptp = NULL; 1951 pa = (paddr_t)-1; 1952 1953 /* 1954 * Loop through all page table levels seeing if we need to 1955 * add a new page to that level. 1956 */ 1957 for (i = PTP_LEVELS; i > 1; i--) { 1958 /* 1959 * Save values from previous round. 1960 */ 1961 pptp = ptp; 1962 ppa = pa; 1963 1964 index = pl_i(va, i); 1965 pva = pdes[i - 2]; 1966 1967 if (pmap_valid_entry(pva[index])) { 1968 ppa = pmap_pte2pa(pva[index]); 1969 ptp = NULL; 1970 continue; 1971 } 1972 1973 obj = &pmap->pm_obj[i-2]; 1974 PMAP_SUBOBJ_LOCK(pmap, i - 2); 1975 ptp = uvm_pagealloc(obj, ptp_va2o(va, i - 1), NULL, 1976 UVM_PGA_USERESERVE|UVM_PGA_ZERO); 1977 PMAP_SUBOBJ_UNLOCK(pmap, i - 2); 1978 1979 if (ptp == NULL) 1980 return NULL; 1981 1982 ptp->flags &= ~PG_BUSY; /* never busy */ 1983 ptp->wire_count = 1; 1984 pmap->pm_ptphint[i - 2] = ptp; 1985 pa = VM_PAGE_TO_PHYS(ptp); 1986 pmap_pte_set(&pva[index], (pd_entry_t) 1987 (pmap_pa2pte(pa) | PG_u | PG_RW | PG_V)); 1988 #if defined(XEN) && defined(__x86_64__) 1989 /* 1990 * In Xen we must enter the mapping in kernel map too 1991 * if pmap is curmap and modifying top level (PGD) 1992 */ 1993 if(i == PTP_LEVELS && pmap != pmap_kernel()) { 1994 pmap_pte_set(&pmap_kernel()->pm_pdir[index], 1995 (pd_entry_t) (pmap_pa2pte(pa) 1996 | PG_u | PG_RW | PG_V)); 1997 } 1998 #endif /* XEN && __x86_64__ */ 1999 pmap_pte_flush(); 2000 pmap_stats_update(pmap, 1, 0); 2001 /* 2002 * If we're not in the top level, increase the 2003 * wire count of the parent page. 2004 */ 2005 if (i < PTP_LEVELS) { 2006 if (pptp == NULL) 2007 pptp = pmap_find_ptp(pmap, va, ppa, i); 2008 #ifdef DIAGNOSTIC 2009 if (pptp == NULL) 2010 panic("pde page disappeared"); 2011 #endif 2012 pptp->wire_count++; 2013 } 2014 } 2015 2016 /* 2017 * ptp is not NULL if we just allocated a new ptp. If it's 2018 * still NULL, we must look up the existing one. 2019 */ 2020 if (ptp == NULL) { 2021 ptp = pmap_find_ptp(pmap, va, ppa, 1); 2022 #ifdef DIAGNOSTIC 2023 if (ptp == NULL) { 2024 printf("va %lx ppa %lx\n", (unsigned long)va, 2025 (unsigned long)ppa); 2026 panic("pmap_get_ptp: unmanaged user PTP"); 2027 } 2028 #endif 2029 } 2030 2031 pmap->pm_ptphint[0] = ptp; 2032 return(ptp); 2033 } 2034 2035 /* 2036 * p m a p l i f e c y c l e f u n c t i o n s 2037 */ 2038 2039 /* 2040 * pmap_pdp_ctor: constructor for the PDP cache. 
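 *
 * Illustrative note (not part of the original source): this constructor,
 * together with pmap_pdp_dtor below, is run by the pool_cache(9) layer
 * whenever a PDP object is built or torn down, so consumers only ever
 * see ready-to-use page directories.  The usage pattern, as seen in
 * pmap_create()/pmap_destroy() later in this file, is roughly:
 *
 *	pd_entry_t *pdir;
 *
 *	pdir = pool_cache_get(&pmap_pdp_cache, PR_WAITOK);
 *	...
 *	pool_cache_put(&pmap_pdp_cache, pdir);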
2041 */ 2042 2043 int 2044 pmap_pdp_ctor(void *arg, void *v, int flags) 2045 { 2046 pd_entry_t *pdir = v; 2047 paddr_t pdirpa = 0; /* XXX: GCC */ 2048 vaddr_t object; 2049 int i; 2050 2051 #if !defined(XEN) || !defined(__x86_64__) 2052 int npde; 2053 #endif 2054 #ifdef XEN 2055 int s; 2056 #endif 2057 2058 /* 2059 * NOTE: The `pmap_lock' is held when the PDP is allocated. 2060 */ 2061 2062 #if defined(XEN) && defined(__x86_64__) 2063 /* fetch the physical address of the page directory. */ 2064 (void) pmap_extract(pmap_kernel(), (vaddr_t) pdir, &pdirpa); 2065 2066 /* zero init area */ 2067 memset (pdir, 0, PAGE_SIZE); /* Xen wants a clean page */ 2068 /* 2069 * this pdir will NEVER be active in kernel mode 2070 * so mark recursive entry invalid 2071 */ 2072 pdir[PDIR_SLOT_PTE] = pmap_pa2pte(pdirpa) | PG_u; 2073 /* 2074 * PDP constructed this way won't be for kernel, 2075 * hence we don't put kernel mappings on Xen. 2076 * But we need to make pmap_create() happy, so put a dummy (without 2077 * PG_V) value at the right place. 2078 */ 2079 pdir[PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1] - 1] = 2080 (unsigned long)-1 & PG_FRAME; 2081 #else /* XEN && __x86_64__*/ 2082 /* zero init area */ 2083 memset(pdir, 0, PDIR_SLOT_PTE * sizeof(pd_entry_t)); 2084 2085 object = (vaddr_t)v; 2086 for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) { 2087 /* fetch the physical address of the page directory. */ 2088 (void) pmap_extract(pmap_kernel(), object, &pdirpa); 2089 /* put in recursive PDE to map the PTEs */ 2090 pdir[PDIR_SLOT_PTE + i] = pmap_pa2pte(pdirpa) | PG_V; 2091 #ifndef XEN 2092 pdir[PDIR_SLOT_PTE + i] |= PG_KW; 2093 #endif 2094 } 2095 2096 /* copy kernel's PDE */ 2097 npde = nkptp[PTP_LEVELS - 1]; 2098 2099 memcpy(&pdir[PDIR_SLOT_KERN], &PDP_BASE[PDIR_SLOT_KERN], 2100 npde * sizeof(pd_entry_t)); 2101 2102 /* zero the rest */ 2103 memset(&pdir[PDIR_SLOT_KERN + npde], 0, 2104 (NTOPLEVEL_PDES - (PDIR_SLOT_KERN + npde)) * sizeof(pd_entry_t)); 2105 2106 if (VM_MIN_KERNEL_ADDRESS != KERNBASE) { 2107 int idx = pl_i(KERNBASE, PTP_LEVELS); 2108 2109 pdir[idx] = PDP_BASE[idx]; 2110 } 2111 #endif /* XEN && __x86_64__*/ 2112 #ifdef XEN 2113 s = splvm(); 2114 object = (vaddr_t)v; 2115 for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) { 2116 (void) pmap_extract(pmap_kernel(), object, &pdirpa); 2117 /* remap this page RO */ 2118 pmap_kenter_pa(object, pdirpa, VM_PROT_READ, 0); 2119 pmap_update(pmap_kernel()); 2120 /* 2121 * pin as L2/L4 page, we have to do the page with the 2122 * PDIR_SLOT_PTE entries last 2123 */ 2124 #ifdef PAE 2125 if (i == l2tol3(PDIR_SLOT_PTE)) 2126 continue; 2127 #endif 2128 xpq_queue_pin_table(xpmap_ptom_masked(pdirpa)); 2129 } 2130 #ifdef PAE 2131 object = ((vaddr_t)pdir) + PAGE_SIZE * l2tol3(PDIR_SLOT_PTE); 2132 (void)pmap_extract(pmap_kernel(), object, &pdirpa); 2133 xpq_queue_pin_table(xpmap_ptom_masked(pdirpa)); 2134 #endif 2135 xpq_flush_queue(); 2136 splx(s); 2137 #endif /* XEN */ 2138 2139 return (0); 2140 } 2141 2142 /* 2143 * pmap_pdp_dtor: destructor for the PDP cache. 2144 */ 2145 2146 void 2147 pmap_pdp_dtor(void *arg, void *v) 2148 { 2149 #ifdef XEN 2150 paddr_t pdirpa = 0; /* XXX: GCC */ 2151 vaddr_t object = (vaddr_t)v; 2152 int i; 2153 int s = splvm(); 2154 pt_entry_t *pte; 2155 2156 for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) { 2157 /* fetch the physical address of the page directory. 
*/ 2158 (void) pmap_extract(pmap_kernel(), object, &pdirpa); 2159 /* unpin page table */ 2160 xpq_queue_unpin_table(xpmap_ptom_masked(pdirpa)); 2161 } 2162 object = (vaddr_t)v; 2163 for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) { 2164 /* Set page RW again */ 2165 pte = kvtopte(object); 2166 xpq_queue_pte_update(xpmap_ptetomach(pte), *pte | PG_RW); 2167 xpq_queue_invlpg((vaddr_t)object); 2168 } 2169 xpq_flush_queue(); 2170 splx(s); 2171 #endif /* XEN */ 2172 } 2173 2174 #ifdef PAE 2175 2176 /* pmap_pdp_alloc: Allocate a page for the pdp memory pool. */ 2177 2178 void * 2179 pmap_pdp_alloc(struct pool *pp, int flags) 2180 { 2181 return (void *)uvm_km_alloc(kernel_map, 2182 PAGE_SIZE * PDP_SIZE, PAGE_SIZE * PDP_SIZE, 2183 ((flags & PR_WAITOK) ? 0 : UVM_KMF_NOWAIT | UVM_KMF_TRYLOCK) 2184 | UVM_KMF_WIRED); 2185 } 2186 2187 /* 2188 * pmap_pdp_free: free a PDP 2189 */ 2190 2191 void 2192 pmap_pdp_free(struct pool *pp, void *v) 2193 { 2194 uvm_km_free(kernel_map, (vaddr_t)v, PAGE_SIZE * PDP_SIZE, 2195 UVM_KMF_WIRED); 2196 } 2197 #endif /* PAE */ 2198 2199 /* 2200 * pmap_create: create a pmap 2201 * 2202 * => note: old pmap interface took a "size" args which allowed for 2203 * the creation of "software only" pmaps (not in bsd). 2204 */ 2205 2206 struct pmap * 2207 pmap_create(void) 2208 { 2209 struct pmap *pmap; 2210 int i; 2211 2212 pmap = pool_cache_get(&pmap_cache, PR_WAITOK); 2213 2214 /* init uvm_object */ 2215 for (i = 0; i < PTP_LEVELS - 1; i++) { 2216 UVM_OBJ_INIT(&pmap->pm_obj[i], NULL, 1); 2217 pmap->pm_ptphint[i] = NULL; 2218 } 2219 pmap->pm_stats.wired_count = 0; 2220 pmap->pm_stats.resident_count = 1; /* count the PDP allocd below */ 2221 #if !defined(__x86_64__) 2222 pmap->pm_hiexec = 0; 2223 #endif /* !defined(__x86_64__) */ 2224 pmap->pm_flags = 0; 2225 pmap->pm_cpus = 0; 2226 pmap->pm_kernel_cpus = 0; 2227 2228 /* init the LDT */ 2229 pmap->pm_ldt = NULL; 2230 pmap->pm_ldt_len = 0; 2231 pmap->pm_ldt_sel = GSYSSEL(GLDT_SEL, SEL_KPL); 2232 2233 /* allocate PDP */ 2234 try_again: 2235 pmap->pm_pdir = pool_cache_get(&pmap_pdp_cache, PR_WAITOK); 2236 2237 mutex_enter(&pmaps_lock); 2238 2239 if (pmap->pm_pdir[PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1] - 1] == 0) { 2240 mutex_exit(&pmaps_lock); 2241 pool_cache_destruct_object(&pmap_pdp_cache, pmap->pm_pdir); 2242 goto try_again; 2243 } 2244 2245 #ifdef PAE 2246 for (i = 0; i < PDP_SIZE; i++) 2247 pmap->pm_pdirpa[i] = 2248 pmap_pte2pa(pmap->pm_pdir[PDIR_SLOT_PTE + i]); 2249 #else 2250 pmap->pm_pdirpa = pmap_pte2pa(pmap->pm_pdir[PDIR_SLOT_PTE]); 2251 #endif 2252 2253 LIST_INSERT_HEAD(&pmaps, pmap, pm_list); 2254 2255 mutex_exit(&pmaps_lock); 2256 2257 return (pmap); 2258 } 2259 2260 /* 2261 * pmap_destroy: drop reference count on pmap. free pmap if 2262 * reference count goes to zero. 2263 */ 2264 2265 void 2266 pmap_destroy(struct pmap *pmap) 2267 { 2268 int i; 2269 #ifdef DIAGNOSTIC 2270 struct cpu_info *ci; 2271 CPU_INFO_ITERATOR cii; 2272 #endif /* DIAGNOSTIC */ 2273 2274 /* 2275 * if we have torn down this pmap, process deferred frees and 2276 * invalidations now. 
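 *
 * (Added note: this pairs with pmap_remove_all() below, which merely
 * records the pmap in curlwp->l_md.md_gc_pmap.  Teardown by the owning
 * thread therefore looks roughly like:
 *
 *	pmap_remove_all(pmap);		defer freeing/invalidation
 *	pmap_remove(pmap, sva, eva);	repeated over the address space
 *	pmap_update(pmap);		process the deferred work
 *	pmap_destroy(pmap);		drop the final reference
 *
 * and the check here acts as a safety net for callers that reach
 * pmap_destroy() without an intervening pmap_update().)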
2277 */ 2278 if (__predict_false(curlwp->l_md.md_gc_pmap == pmap)) { 2279 pmap_update(pmap); 2280 } 2281 2282 /* 2283 * drop reference count 2284 */ 2285 2286 if (atomic_dec_uint_nv(&pmap->pm_obj[0].uo_refs) > 0) { 2287 return; 2288 } 2289 2290 #ifdef DIAGNOSTIC 2291 for (CPU_INFO_FOREACH(cii, ci)) 2292 if (ci->ci_pmap == pmap) 2293 panic("destroying pmap being used"); 2294 #endif /* DIAGNOSTIC */ 2295 2296 /* 2297 * reference count is zero, free pmap resources and then free pmap. 2298 */ 2299 #ifdef XEN 2300 /* 2301 * Xen lazy APDP handling: 2302 * clear APDP_PDE if pmap is the currently mapped 2303 */ 2304 if (xpmap_ptom_masked(pmap_pdirpa(pmap, 0)) == (*APDP_PDE & PG_FRAME)) { 2305 kpreempt_disable(); 2306 for (i = 0; i < PDP_SIZE; i++) { 2307 pmap_pte_set(&APDP_PDE[i], 0); 2308 #ifdef PAE 2309 /* clear shadow entry too */ 2310 pmap_pte_set(&APDP_PDE_SHADOW[i], 0); 2311 #endif 2312 } 2313 pmap_pte_flush(); 2314 pmap_apte_flush(pmap_kernel()); 2315 kpreempt_enable(); 2316 } 2317 #endif 2318 2319 /* 2320 * remove it from global list of pmaps 2321 */ 2322 2323 mutex_enter(&pmaps_lock); 2324 LIST_REMOVE(pmap, pm_list); 2325 mutex_exit(&pmaps_lock); 2326 2327 /* 2328 * destroyed pmap shouldn't have remaining PTPs 2329 */ 2330 2331 for (i = 0; i < PTP_LEVELS - 1; i++) { 2332 KASSERT(pmap->pm_obj[i].uo_npages == 0); 2333 KASSERT(TAILQ_EMPTY(&pmap->pm_obj[i].memq)); 2334 } 2335 2336 /* 2337 * MULTIPROCESSOR -- no need to flush out of other processors' 2338 * APTE space because we do that in pmap_unmap_ptes(). 2339 */ 2340 pool_cache_put(&pmap_pdp_cache, pmap->pm_pdir); 2341 2342 #ifdef USER_LDT 2343 if (pmap->pm_ldt != NULL) { 2344 /* 2345 * no need to switch the LDT; this address space is gone, 2346 * nothing is using it. 2347 * 2348 * No need to lock the pmap for ldt_free (or anything else), 2349 * we're the last one to use it. 2350 */ 2351 mutex_enter(&cpu_lock); 2352 ldt_free(pmap->pm_ldt_sel); 2353 mutex_exit(&cpu_lock); 2354 uvm_km_free(kernel_map, (vaddr_t)pmap->pm_ldt, 2355 pmap->pm_ldt_len, UVM_KMF_WIRED); 2356 } 2357 #endif 2358 2359 for (i = 0; i < PTP_LEVELS - 1; i++) 2360 mutex_destroy(&pmap->pm_obj[i].vmobjlock); 2361 pool_cache_put(&pmap_cache, pmap); 2362 } 2363 2364 /* 2365 * pmap_remove_all: pmap is being torn down by the current thread. 2366 * avoid unnecessary invalidations. 2367 */ 2368 2369 void 2370 pmap_remove_all(struct pmap *pmap) 2371 { 2372 lwp_t *l = curlwp; 2373 2374 KASSERT(l->l_md.md_gc_pmap == NULL); 2375 2376 l->l_md.md_gc_pmap = pmap; 2377 } 2378 2379 #if defined(PMAP_FORK) 2380 /* 2381 * pmap_fork: perform any necessary data structure manipulation when 2382 * a VM space is forked. 2383 */ 2384 2385 void 2386 pmap_fork(struct pmap *pmap1, struct pmap *pmap2) 2387 { 2388 #ifdef USER_LDT 2389 union descriptor *new_ldt; 2390 size_t len; 2391 int sel; 2392 2393 if (__predict_true(pmap1->pm_ldt == NULL)) { 2394 return; 2395 } 2396 2397 retry: 2398 if (pmap1->pm_ldt != NULL) { 2399 len = pmap1->pm_ldt_len; 2400 new_ldt = (union descriptor *)uvm_km_alloc(kernel_map, len, 0, 2401 UVM_KMF_WIRED); 2402 mutex_enter(&cpu_lock); 2403 sel = ldt_alloc(new_ldt, len); 2404 if (sel == -1) { 2405 mutex_exit(&cpu_lock); 2406 uvm_km_free(kernel_map, (vaddr_t)new_ldt, len, 2407 UVM_KMF_WIRED); 2408 printf("WARNING: pmap_fork: unable to allocate LDT\n"); 2409 return; 2410 } 2411 } else { 2412 len = -1; 2413 new_ldt = NULL; 2414 sel = -1; 2415 mutex_enter(&cpu_lock); 2416 } 2417 2418 /* Copy the LDT, if necessary. 
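 *
 * (Added note: the length is re-checked below because pmap1's LDT may
 * have been resized while the new buffer was allocated without
 * cpu_lock held; on a mismatch the allocation is undone and the whole
 * sequence is retried from scratch.)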
*/ 2419 if (pmap1->pm_ldt != NULL) { 2420 if (len != pmap1->pm_ldt_len) { 2421 if (len != -1) { 2422 ldt_free(sel); 2423 uvm_km_free(kernel_map, (vaddr_t)new_ldt, 2424 len, UVM_KMF_WIRED); 2425 } 2426 mutex_exit(&cpu_lock); 2427 goto retry; 2428 } 2429 2430 memcpy(new_ldt, pmap1->pm_ldt, len); 2431 pmap2->pm_ldt = new_ldt; 2432 pmap2->pm_ldt_len = pmap1->pm_ldt_len; 2433 pmap2->pm_ldt_sel = sel; 2434 len = -1; 2435 } 2436 2437 if (len != -1) { 2438 ldt_free(sel); 2439 uvm_km_free(kernel_map, (vaddr_t)new_ldt, len, 2440 UVM_KMF_WIRED); 2441 } 2442 mutex_exit(&cpu_lock); 2443 #endif /* USER_LDT */ 2444 } 2445 #endif /* PMAP_FORK */ 2446 2447 #ifdef USER_LDT 2448 2449 /* 2450 * pmap_ldt_xcall: cross call used by pmap_ldt_sync. if the named pmap 2451 * is active, reload LDTR. 2452 */ 2453 static void 2454 pmap_ldt_xcall(void *arg1, void *arg2) 2455 { 2456 struct pmap *pm; 2457 2458 kpreempt_disable(); 2459 pm = arg1; 2460 if (curcpu()->ci_pmap == pm) { 2461 lldt(pm->pm_ldt_sel); 2462 } 2463 kpreempt_enable(); 2464 } 2465 2466 /* 2467 * pmap_ldt_sync: LDT selector for the named pmap is changing. swap 2468 * in the new selector on all CPUs. 2469 */ 2470 void 2471 pmap_ldt_sync(struct pmap *pm) 2472 { 2473 uint64_t where; 2474 2475 KASSERT(mutex_owned(&cpu_lock)); 2476 2477 pmap_ldt_evcnt.ev_count++; 2478 where = xc_broadcast(0, pmap_ldt_xcall, pm, NULL); 2479 xc_wait(where); 2480 } 2481 2482 /* 2483 * pmap_ldt_cleanup: if the pmap has a local LDT, deallocate it, and 2484 * restore the default. 2485 */ 2486 2487 void 2488 pmap_ldt_cleanup(struct lwp *l) 2489 { 2490 pmap_t pmap = l->l_proc->p_vmspace->vm_map.pmap; 2491 union descriptor *dp = NULL; 2492 size_t len = 0; 2493 int sel = -1; 2494 2495 if (__predict_true(pmap->pm_ldt == NULL)) { 2496 return; 2497 } 2498 2499 mutex_enter(&cpu_lock); 2500 if (pmap->pm_ldt != NULL) { 2501 sel = pmap->pm_ldt_sel; 2502 dp = pmap->pm_ldt; 2503 len = pmap->pm_ldt_len; 2504 pmap->pm_ldt_sel = GSYSSEL(GLDT_SEL, SEL_KPL); 2505 pmap->pm_ldt = NULL; 2506 pmap->pm_ldt_len = 0; 2507 pmap_ldt_sync(pmap); 2508 ldt_free(sel); 2509 uvm_km_free(kernel_map, (vaddr_t)dp, len, UVM_KMF_WIRED); 2510 } 2511 mutex_exit(&cpu_lock); 2512 } 2513 #endif /* USER_LDT */ 2514 2515 /* 2516 * pmap_activate: activate a process' pmap 2517 * 2518 * => must be called with kernel preemption disabled 2519 * => if lwp is the curlwp, then set ci_want_pmapload so that 2520 * actual MMU context switch will be done by pmap_load() later 2521 */ 2522 2523 void 2524 pmap_activate(struct lwp *l) 2525 { 2526 struct cpu_info *ci; 2527 struct pmap *pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map); 2528 2529 KASSERT(kpreempt_disabled()); 2530 2531 ci = curcpu(); 2532 2533 if (l == ci->ci_curlwp) { 2534 struct pcb *pcb; 2535 2536 KASSERT(ci->ci_want_pmapload == 0); 2537 KASSERT(ci->ci_tlbstate != TLBSTATE_VALID); 2538 #ifdef KSTACK_CHECK_DR0 2539 /* 2540 * setup breakpoint on the top of stack 2541 */ 2542 if (l == &lwp0) 2543 dr0(0, 0, 0, 0); 2544 else 2545 dr0(KSTACK_LOWEST_ADDR(l), 1, 3, 1); 2546 #endif 2547 2548 /* 2549 * no need to switch to kernel vmspace because 2550 * it's a subset of any vmspace. 
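 *
 * (Illustrative note: when ci_want_pmapload is set below, the actual
 * %cr3/LDT switch is deferred to pmap_load(), which the MD
 * return-to-user path is expected to invoke, roughly:
 *
 *	if (curcpu()->ci_want_pmapload)
 *		pmap_load();
 *
 * so activations of the kernel pmap never pay for a reload.)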
2551 */ 2552 2553 if (pmap == pmap_kernel()) { 2554 ci->ci_want_pmapload = 0; 2555 return; 2556 } 2557 2558 pcb = lwp_getpcb(l); 2559 ci->ci_want_pmapload = 1; 2560 2561 #if defined(__x86_64__) 2562 if (pcb->pcb_flags & PCB_GS64) 2563 wrmsr(MSR_KERNELGSBASE, pcb->pcb_gs); 2564 if (pcb->pcb_flags & PCB_FS64) 2565 wrmsr(MSR_FSBASE, pcb->pcb_fs); 2566 #endif /* defined(__x86_64__) */ 2567 } 2568 } 2569 2570 /* 2571 * pmap_reactivate: try to regain reference to the pmap. 2572 * 2573 * => must be called with kernel preemption disabled 2574 */ 2575 2576 static bool 2577 pmap_reactivate(struct pmap *pmap) 2578 { 2579 struct cpu_info *ci; 2580 uint32_t cpumask; 2581 bool result; 2582 uint32_t oldcpus; 2583 2584 ci = curcpu(); 2585 cpumask = ci->ci_cpumask; 2586 2587 KASSERT(kpreempt_disabled()); 2588 #if defined(XEN) && defined(__x86_64__) 2589 KASSERT(pmap->pm_pdirpa == xen_current_user_pgd); 2590 #elif defined(PAE) 2591 KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(pmap_l3pd[0])); 2592 #elif !defined(XEN) 2593 KASSERT(pmap->pm_pdirpa == pmap_pte2pa(rcr3())); 2594 #endif 2595 2596 /* 2597 * if we still have a lazy reference to this pmap, 2598 * we can assume that there was no tlb shootdown 2599 * for this pmap in the meantime. 2600 * 2601 * the order of events here is important as we must 2602 * synchronize with TLB shootdown interrupts. declare 2603 * interest in invalidations (TLBSTATE_VALID) and then 2604 * check the cpumask, which the IPIs can change only 2605 * when the state is TLBSTATE_LAZY. 2606 */ 2607 2608 ci->ci_tlbstate = TLBSTATE_VALID; 2609 oldcpus = pmap->pm_cpus; 2610 KASSERT((pmap->pm_kernel_cpus & cpumask) != 0); 2611 if (oldcpus & cpumask) { 2612 /* got it */ 2613 result = true; 2614 } else { 2615 /* must reload */ 2616 atomic_or_32(&pmap->pm_cpus, cpumask); 2617 result = false; 2618 } 2619 2620 return result; 2621 } 2622 2623 /* 2624 * pmap_load: actually switch pmap. (fill in %cr3 and LDT info) 2625 */ 2626 2627 void 2628 pmap_load(void) 2629 { 2630 struct cpu_info *ci; 2631 uint32_t cpumask; 2632 struct pmap *pmap; 2633 struct pmap *oldpmap; 2634 struct lwp *l; 2635 struct pcb *pcb; 2636 uint64_t ncsw; 2637 2638 kpreempt_disable(); 2639 retry: 2640 ci = curcpu(); 2641 if (!ci->ci_want_pmapload) { 2642 kpreempt_enable(); 2643 return; 2644 } 2645 cpumask = ci->ci_cpumask; 2646 l = ci->ci_curlwp; 2647 ncsw = l->l_ncsw; 2648 2649 /* should be able to take ipis. */ 2650 KASSERT(ci->ci_ilevel < IPL_HIGH); 2651 #ifdef XEN 2652 /* XXX not yet KASSERT(x86_read_psl() != 0); */ 2653 #else 2654 KASSERT((x86_read_psl() & PSL_I) != 0); 2655 #endif 2656 2657 KASSERT(l != NULL); 2658 pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map); 2659 KASSERT(pmap != pmap_kernel()); 2660 oldpmap = ci->ci_pmap; 2661 pcb = lwp_getpcb(l); 2662 2663 if (pmap == oldpmap) { 2664 if (!pmap_reactivate(pmap)) { 2665 u_int gen = uvm_emap_gen_return(); 2666 2667 /* 2668 * pmap has been changed during deactivated. 2669 * our tlb may be stale. 2670 */ 2671 2672 tlbflush(); 2673 uvm_emap_update(gen); 2674 } 2675 2676 ci->ci_want_pmapload = 0; 2677 kpreempt_enable(); 2678 return; 2679 } 2680 2681 /* 2682 * grab a reference to the new pmap. 2683 */ 2684 2685 pmap_reference(pmap); 2686 2687 /* 2688 * actually switch pmap. 
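 *
 * (Added interpretation of the bookkeeping below: pm_cpus tracks CPUs
 * that may hold this pmap's user-space translations in their TLBs,
 * while pm_kernel_cpus tracks CPUs on which the pmap is loaded at all;
 * both masks are consulted when deciding where TLB shootdown IPIs must
 * be sent.)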
2689 */ 2690 2691 atomic_and_32(&oldpmap->pm_cpus, ~cpumask); 2692 atomic_and_32(&oldpmap->pm_kernel_cpus, ~cpumask); 2693 2694 #if defined(XEN) && defined(__x86_64__) 2695 KASSERT(oldpmap->pm_pdirpa == xen_current_user_pgd || 2696 oldpmap == pmap_kernel()); 2697 #elif defined(PAE) 2698 KASSERT(pmap_pdirpa(oldpmap, 0) == pmap_pte2pa(pmap_l3pd[0])); 2699 #elif !defined(XEN) 2700 KASSERT(oldpmap->pm_pdirpa == pmap_pte2pa(rcr3())); 2701 #endif 2702 KASSERT((pmap->pm_cpus & cpumask) == 0); 2703 KASSERT((pmap->pm_kernel_cpus & cpumask) == 0); 2704 2705 /* 2706 * mark the pmap in use by this processor. again we must 2707 * synchronize with TLB shootdown interrupts, so set the 2708 * state VALID first, then register us for shootdown events 2709 * on this pmap. 2710 */ 2711 2712 ci->ci_tlbstate = TLBSTATE_VALID; 2713 atomic_or_32(&pmap->pm_cpus, cpumask); 2714 atomic_or_32(&pmap->pm_kernel_cpus, cpumask); 2715 ci->ci_pmap = pmap; 2716 2717 /* 2718 * update tss. now that we have registered for invalidations 2719 * from other CPUs, we're good to load the page tables. 2720 */ 2721 #ifdef PAE 2722 pcb->pcb_cr3 = pmap_l3paddr; 2723 #else 2724 pcb->pcb_cr3 = pmap->pm_pdirpa; 2725 #endif 2726 #if defined(XEN) && defined(__x86_64__) 2727 /* kernel pmap always in cr3 and should never go in user cr3 */ 2728 if (pmap_pdirpa(pmap, 0) != pmap_pdirpa(pmap_kernel(), 0)) { 2729 /* 2730 * Map user space address in kernel space and load 2731 * user cr3 2732 */ 2733 int i, s; 2734 pd_entry_t *old_pgd, *new_pgd; 2735 paddr_t addr; 2736 s = splvm(); 2737 new_pgd = pmap->pm_pdir; 2738 old_pgd = pmap_kernel()->pm_pdir; 2739 addr = xpmap_ptom(pmap_pdirpa(pmap_kernel(), 0)); 2740 for (i = 0; i < PDIR_SLOT_PTE; 2741 i++, addr += sizeof(pd_entry_t)) { 2742 if ((new_pgd[i] & PG_V) || (old_pgd[i] & PG_V)) 2743 xpq_queue_pte_update(addr, new_pgd[i]); 2744 } 2745 xpq_flush_queue(); /* XXXtlb */ 2746 tlbflush(); 2747 xen_set_user_pgd(pmap_pdirpa(pmap, 0)); 2748 xen_current_user_pgd = pmap_pdirpa(pmap, 0); 2749 splx(s); 2750 } 2751 #else /* XEN && x86_64 */ 2752 #if defined(XEN) 2753 /* 2754 * clear APDP slot, in case it points to a page table that has 2755 * been freed 2756 */ 2757 if (*APDP_PDE) { 2758 int i; 2759 for (i = 0; i < PDP_SIZE; i++) { 2760 pmap_pte_set(&APDP_PDE[i], 0); 2761 #ifdef PAE 2762 /* clear shadow entry too */ 2763 pmap_pte_set(&APDP_PDE_SHADOW[i], 0); 2764 #endif 2765 } 2766 } 2767 /* lldt() does pmap_pte_flush() */ 2768 #else /* XEN */ 2769 #if defined(i386) 2770 ci->ci_tss.tss_ldt = pmap->pm_ldt_sel; 2771 ci->ci_tss.tss_cr3 = pcb->pcb_cr3; 2772 #endif 2773 #endif /* XEN */ 2774 lldt(pmap->pm_ldt_sel); 2775 #ifdef PAE 2776 { 2777 paddr_t l3_pd = xpmap_ptom_masked(pmap_l3paddr); 2778 int i; 2779 int s = splvm(); 2780 /* don't update the kernel L3 slot */ 2781 for (i = 0 ; i < PDP_SIZE - 1 ; i++, l3_pd += sizeof(pd_entry_t)) { 2782 xpq_queue_pte_update(l3_pd, 2783 xpmap_ptom(pmap->pm_pdirpa[i]) | PG_V); 2784 } 2785 tlbflush(); 2786 xpq_flush_queue(); 2787 splx(s); 2788 } 2789 #else /* PAE */ 2790 { 2791 u_int gen = uvm_emap_gen_return(); 2792 lcr3(pcb->pcb_cr3); 2793 uvm_emap_update(gen); 2794 } 2795 #endif /* PAE */ 2796 #endif /* XEN && x86_64 */ 2797 2798 ci->ci_want_pmapload = 0; 2799 2800 /* 2801 * we're now running with the new pmap. drop the reference 2802 * to the old pmap. if we block, we need to go around again. 
2803 */ 2804 2805 pmap_destroy(oldpmap); 2806 if (l->l_ncsw != ncsw) { 2807 goto retry; 2808 } 2809 2810 kpreempt_enable(); 2811 } 2812 2813 /* 2814 * pmap_deactivate: deactivate a process' pmap 2815 * 2816 * => must be called with kernel preemption disabled (high SPL is enough) 2817 */ 2818 2819 void 2820 pmap_deactivate(struct lwp *l) 2821 { 2822 struct pmap *pmap; 2823 struct cpu_info *ci; 2824 2825 KASSERT(kpreempt_disabled()); 2826 2827 if (l != curlwp) { 2828 return; 2829 } 2830 2831 /* 2832 * wait for pending TLB shootdowns to complete. necessary 2833 * because TLB shootdown state is per-CPU, and the LWP may 2834 * be coming off the CPU before it has a chance to call 2835 * pmap_update(). 2836 */ 2837 pmap_tlb_shootwait(); 2838 2839 ci = curcpu(); 2840 2841 if (ci->ci_want_pmapload) { 2842 KASSERT(vm_map_pmap(&l->l_proc->p_vmspace->vm_map) 2843 != pmap_kernel()); 2844 KASSERT(vm_map_pmap(&l->l_proc->p_vmspace->vm_map) 2845 != ci->ci_pmap || ci->ci_tlbstate != TLBSTATE_VALID); 2846 2847 /* 2848 * userspace has not been touched. 2849 * nothing to do here. 2850 */ 2851 2852 ci->ci_want_pmapload = 0; 2853 return; 2854 } 2855 2856 pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map); 2857 2858 if (pmap == pmap_kernel()) { 2859 return; 2860 } 2861 2862 #if defined(XEN) && defined(__x86_64__) 2863 KASSERT(pmap->pm_pdirpa == xen_current_user_pgd); 2864 #elif defined(PAE) 2865 KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(pmap_l3pd[0])); 2866 #elif !defined(XEN) 2867 KASSERT(pmap->pm_pdirpa == pmap_pte2pa(rcr3())); 2868 #endif 2869 KASSERT(ci->ci_pmap == pmap); 2870 2871 /* 2872 * we aren't interested in TLB invalidations for this pmap, 2873 * at least for the time being. 2874 */ 2875 2876 KASSERT(ci->ci_tlbstate == TLBSTATE_VALID); 2877 ci->ci_tlbstate = TLBSTATE_LAZY; 2878 } 2879 2880 /* 2881 * end of lifecycle functions 2882 */ 2883 2884 /* 2885 * some misc. functions 2886 */ 2887 2888 static int 2889 pmap_pdes_invalid(vaddr_t va, pd_entry_t * const *pdes, pd_entry_t *lastpde) 2890 { 2891 int i; 2892 unsigned long index; 2893 pd_entry_t pde; 2894 2895 for (i = PTP_LEVELS; i > 1; i--) { 2896 index = pl_i(va, i); 2897 pde = pdes[i - 2][index]; 2898 if ((pde & PG_V) == 0) 2899 return i; 2900 } 2901 if (lastpde != NULL) 2902 *lastpde = pde; 2903 return 0; 2904 } 2905 2906 /* 2907 * pmap_extract: extract a PA for the given VA 2908 */ 2909 2910 bool 2911 pmap_extract(struct pmap *pmap, vaddr_t va, paddr_t *pap) 2912 { 2913 pt_entry_t *ptes, pte; 2914 pd_entry_t pde; 2915 pd_entry_t * const *pdes; 2916 struct pmap *pmap2; 2917 struct cpu_info *ci; 2918 vaddr_t pa; 2919 lwp_t *l; 2920 bool hard, rv; 2921 2922 rv = false; 2923 pa = 0; 2924 l = curlwp; 2925 2926 KPREEMPT_DISABLE(l); 2927 ci = l->l_cpu; 2928 if (__predict_true(!ci->ci_want_pmapload && ci->ci_pmap == pmap) || 2929 pmap == pmap_kernel()) { 2930 /* 2931 * no need to lock, because it's pmap_kernel() or our 2932 * own pmap and is active. if a user pmap, the caller 2933 * will hold the vm_map write/read locked and so prevent 2934 * entries from disappearing while we are here. ptps 2935 * can disappear via pmap_remove() and pmap_protect(), 2936 * but they are called with the vm_map write locked. 2937 */ 2938 hard = false; 2939 ptes = PTE_BASE; 2940 pdes = normal_pdes; 2941 } else { 2942 /* we lose, do it the hard way. 
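 *
 * "the hard way" means temporarily mapping the target pmap's page
 * tables with pmap_map_ptes(), which also locks the pmap, and undoing
 * that with pmap_unmap_ptes() once the PTE has been read.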
*/ 2943 hard = true; 2944 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); 2945 } 2946 if (pmap_pdes_valid(va, pdes, &pde)) { 2947 pte = ptes[pl1_i(va)]; 2948 if (pde & PG_PS) { 2949 pa = (pde & PG_LGFRAME) | (va & (NBPD_L2 - 1)); 2950 rv = true; 2951 } else if (__predict_true((pte & PG_V) != 0)) { 2952 pa = pmap_pte2pa(pte) | (va & (NBPD_L1 - 1)); 2953 rv = true; 2954 } 2955 } 2956 if (__predict_false(hard)) { 2957 pmap_unmap_ptes(pmap, pmap2); 2958 } 2959 KPREEMPT_ENABLE(l); 2960 if (pap != NULL) { 2961 *pap = pa; 2962 } 2963 return rv; 2964 } 2965 2966 2967 /* 2968 * vtophys: virtual address to physical address. For use by 2969 * machine-dependent code only. 2970 */ 2971 2972 paddr_t 2973 vtophys(vaddr_t va) 2974 { 2975 paddr_t pa; 2976 2977 if (pmap_extract(pmap_kernel(), va, &pa) == true) 2978 return (pa); 2979 return (0); 2980 } 2981 2982 #ifdef XEN 2983 /* 2984 * pmap_extract_ma: extract a MA for the given VA 2985 */ 2986 2987 bool 2988 pmap_extract_ma(struct pmap *pmap, vaddr_t va, paddr_t *pap) 2989 { 2990 pt_entry_t *ptes, pte; 2991 pd_entry_t pde; 2992 pd_entry_t * const *pdes; 2993 struct pmap *pmap2; 2994 2995 kpreempt_disable(); 2996 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); 2997 if (!pmap_pdes_valid(va, pdes, &pde)) { 2998 pmap_unmap_ptes(pmap, pmap2); 2999 kpreempt_enable(); 3000 return false; 3001 } 3002 3003 pte = ptes[pl1_i(va)]; 3004 pmap_unmap_ptes(pmap, pmap2); 3005 kpreempt_enable(); 3006 3007 if (__predict_true((pte & PG_V) != 0)) { 3008 if (pap != NULL) 3009 *pap = (pte & PG_FRAME) | (va & (NBPD_L1 - 1)); 3010 return true; 3011 } 3012 3013 return false; 3014 } 3015 3016 /* 3017 * vtomach: virtual address to machine address. For use by 3018 * machine-dependent code only. 3019 */ 3020 3021 paddr_t 3022 vtomach(vaddr_t va) 3023 { 3024 paddr_t pa; 3025 3026 if (pmap_extract_ma(pmap_kernel(), va, &pa) == true) 3027 return (pa); 3028 return (0); 3029 } 3030 3031 #endif /* XEN */ 3032 3033 3034 3035 /* 3036 * pmap_virtual_space: used during bootup [pmap_steal_memory] to 3037 * determine the bounds of the kernel virtual addess space. 3038 */ 3039 3040 void 3041 pmap_virtual_space(vaddr_t *startp, vaddr_t *endp) 3042 { 3043 *startp = virtual_avail; 3044 *endp = virtual_end; 3045 } 3046 3047 /* 3048 * pmap_map: map a range of PAs into kvm. 3049 * 3050 * => used during crash dump 3051 * => XXX: pmap_map() should be phased out? 3052 */ 3053 3054 vaddr_t 3055 pmap_map(vaddr_t va, paddr_t spa, paddr_t epa, vm_prot_t prot) 3056 { 3057 while (spa < epa) { 3058 pmap_kenter_pa(va, spa, prot, 0); 3059 va += PAGE_SIZE; 3060 spa += PAGE_SIZE; 3061 } 3062 pmap_update(pmap_kernel()); 3063 return va; 3064 } 3065 3066 /* 3067 * pmap_zero_page: zero a page 3068 */ 3069 3070 void 3071 pmap_zero_page(paddr_t pa) 3072 { 3073 pt_entry_t *zpte; 3074 void *zerova; 3075 int id; 3076 3077 kpreempt_disable(); 3078 id = cpu_number(); 3079 zpte = PTESLEW(zero_pte, id); 3080 zerova = VASLEW(zerop, id); 3081 3082 #ifdef DIAGNOSTIC 3083 if (*zpte) 3084 panic("pmap_zero_page: lock botch"); 3085 #endif 3086 3087 pmap_pte_set(zpte, pmap_pa2pte(pa) | PG_V | PG_RW | PG_M | PG_U | PG_k); 3088 pmap_pte_flush(); 3089 pmap_update_pg((vaddr_t)zerova); /* flush TLB */ 3090 3091 memset(zerova, 0, PAGE_SIZE); 3092 3093 #if defined(DIAGNOSTIC) || defined(XEN) 3094 pmap_pte_set(zpte, 0); /* zap ! */ 3095 pmap_pte_flush(); 3096 #endif 3097 kpreempt_enable(); 3098 } 3099 3100 /* 3101 * pmap_pagezeroidle: the same, for the idle loop page zero'er. 
3102 * Returns true if the page was zero'd, false if we aborted for 3103 * some reason. 3104 */ 3105 3106 bool 3107 pmap_pageidlezero(paddr_t pa) 3108 { 3109 pt_entry_t *zpte; 3110 void *zerova; 3111 bool rv; 3112 int id; 3113 3114 id = cpu_number(); 3115 zpte = PTESLEW(zero_pte, id); 3116 zerova = VASLEW(zerop, id); 3117 3118 KASSERT(cpu_feature & CPUID_SSE2); 3119 KASSERT(*zpte == 0); 3120 3121 pmap_pte_set(zpte, pmap_pa2pte(pa) | PG_V | PG_RW | PG_M | PG_U | PG_k); 3122 pmap_pte_flush(); 3123 pmap_update_pg((vaddr_t)zerova); /* flush TLB */ 3124 3125 rv = sse2_idlezero_page(zerova); 3126 3127 #if defined(DIAGNOSTIC) || defined(XEN) 3128 pmap_pte_set(zpte, 0); /* zap ! */ 3129 pmap_pte_flush(); 3130 #endif 3131 3132 return rv; 3133 } 3134 3135 /* 3136 * pmap_copy_page: copy a page 3137 */ 3138 3139 void 3140 pmap_copy_page(paddr_t srcpa, paddr_t dstpa) 3141 { 3142 pt_entry_t *spte; 3143 pt_entry_t *dpte; 3144 void *csrcva; 3145 void *cdstva; 3146 int id; 3147 3148 kpreempt_disable(); 3149 id = cpu_number(); 3150 spte = PTESLEW(csrc_pte,id); 3151 dpte = PTESLEW(cdst_pte,id); 3152 csrcva = VASLEW(csrcp, id); 3153 cdstva = VASLEW(cdstp, id); 3154 3155 KASSERT(*spte == 0 && *dpte == 0); 3156 3157 pmap_pte_set(spte, pmap_pa2pte(srcpa) | PG_V | PG_RW | PG_U | PG_k); 3158 pmap_pte_set(dpte, 3159 pmap_pa2pte(dstpa) | PG_V | PG_RW | PG_M | PG_U | PG_k); 3160 pmap_pte_flush(); 3161 pmap_update_2pg((vaddr_t)csrcva, (vaddr_t)cdstva); 3162 3163 memcpy(cdstva, csrcva, PAGE_SIZE); 3164 3165 #if defined(DIAGNOSTIC) || defined(XEN) 3166 pmap_pte_set(spte, 0); 3167 pmap_pte_set(dpte, 0); 3168 pmap_pte_flush(); 3169 #endif 3170 kpreempt_enable(); 3171 } 3172 3173 static pt_entry_t * 3174 pmap_map_ptp(struct vm_page *ptp) 3175 { 3176 pt_entry_t *ptppte; 3177 void *ptpva; 3178 int id; 3179 3180 KASSERT(kpreempt_disabled()); 3181 3182 id = cpu_number(); 3183 ptppte = PTESLEW(ptp_pte, id); 3184 ptpva = VASLEW(ptpp, id); 3185 #if !defined(XEN) 3186 pmap_pte_set(ptppte, pmap_pa2pte(VM_PAGE_TO_PHYS(ptp)) | PG_V | PG_M | 3187 PG_RW | PG_U | PG_k); 3188 #else 3189 pmap_pte_set(ptppte, pmap_pa2pte(VM_PAGE_TO_PHYS(ptp)) | PG_V | PG_M | 3190 PG_U | PG_k); 3191 #endif 3192 pmap_pte_flush(); 3193 pmap_update_pg((vaddr_t)ptpva); 3194 3195 return (pt_entry_t *)ptpva; 3196 } 3197 3198 static void 3199 pmap_unmap_ptp(void) 3200 { 3201 #if defined(DIAGNOSTIC) || defined(XEN) 3202 pt_entry_t *pte; 3203 3204 KASSERT(kpreempt_disabled()); 3205 3206 pte = PTESLEW(ptp_pte, cpu_number()); 3207 if (*pte != 0) { 3208 pmap_pte_set(pte, 0); 3209 pmap_pte_flush(); 3210 } 3211 #endif 3212 } 3213 3214 static pt_entry_t * 3215 pmap_map_pte(struct pmap *pmap, struct vm_page *ptp, vaddr_t va) 3216 { 3217 3218 KASSERT(kpreempt_disabled()); 3219 if (pmap_is_curpmap(pmap)) { 3220 return &PTE_BASE[pl1_i(va)]; /* (k)vtopte */ 3221 } 3222 KASSERT(ptp != NULL); 3223 return pmap_map_ptp(ptp) + pl1_pi(va); 3224 } 3225 3226 static void 3227 pmap_unmap_pte(void) 3228 { 3229 3230 KASSERT(kpreempt_disabled()); 3231 3232 pmap_unmap_ptp(); 3233 } 3234 3235 /* 3236 * p m a p r e m o v e f u n c t i o n s 3237 * 3238 * functions that remove mappings 3239 */ 3240 3241 /* 3242 * pmap_remove_ptes: remove PTEs from a PTP 3243 * 3244 * => must have proper locking on pmap_master_lock 3245 * => caller must hold pmap's lock 3246 * => PTP must be mapped into KVA 3247 * => PTP should be null if pmap == pmap_kernel() 3248 * => must be called with kernel preemption disabled 3249 * => returns composite pte if at least one page should be shot down 3250 */ 3251 3252 
static pt_entry_t 3253 pmap_remove_ptes(struct pmap *pmap, struct vm_page *ptp, vaddr_t ptpva, 3254 vaddr_t startva, vaddr_t endva, struct pv_entry **pv_tofree) 3255 { 3256 struct pv_entry *pve; 3257 pt_entry_t *pte = (pt_entry_t *) ptpva; 3258 pt_entry_t opte, xpte = 0; 3259 3260 KASSERT(pmap == pmap_kernel() || mutex_owned(&pmap->pm_lock)); 3261 KASSERT(kpreempt_disabled()); 3262 3263 /* 3264 * note that ptpva points to the PTE that maps startva. this may 3265 * or may not be the first PTE in the PTP. 3266 * 3267 * we loop through the PTP while there are still PTEs to look at 3268 * and the wire_count is greater than 1 (because we use the wire_count 3269 * to keep track of the number of real PTEs in the PTP). 3270 */ 3271 3272 for (/*null*/; startva < endva && (ptp == NULL || ptp->wire_count > 1) 3273 ; pte++, startva += PAGE_SIZE) { 3274 struct vm_page *pg; 3275 struct pmap_page *pp; 3276 3277 if (!pmap_valid_entry(*pte)) 3278 continue; /* VA not mapped */ 3279 3280 /* atomically save the old PTE and zap! it */ 3281 opte = pmap_pte_testset(pte, 0); 3282 if (!pmap_valid_entry(opte)) { 3283 continue; 3284 } 3285 3286 pmap_exec_account(pmap, startva, opte, 0); 3287 pmap_stats_update_bypte(pmap, 0, opte); 3288 xpte |= opte; 3289 3290 if (ptp) { 3291 ptp->wire_count--; /* dropping a PTE */ 3292 /* Make sure that the PDE is flushed */ 3293 if (ptp->wire_count <= 1) 3294 xpte |= PG_U; 3295 } 3296 3297 /* 3298 * if we are not on a pv_head list we are done. 3299 */ 3300 3301 if ((opte & PG_PVLIST) == 0) { 3302 #if defined(DIAGNOSTIC) && !defined(DOM0OPS) 3303 if (PHYS_TO_VM_PAGE(pmap_pte2pa(opte)) != NULL) 3304 panic("pmap_remove_ptes: managed page without " 3305 "PG_PVLIST for 0x%lx", startva); 3306 #endif 3307 continue; 3308 } 3309 3310 pg = PHYS_TO_VM_PAGE(pmap_pte2pa(opte)); 3311 #ifdef DIAGNOSTIC 3312 if (pg == NULL) 3313 panic("pmap_remove_ptes: unmanaged page marked " 3314 "PG_PVLIST, va = 0x%lx, pa = 0x%lx", 3315 startva, (u_long)pmap_pte2pa(opte)); 3316 #endif 3317 3318 /* sync R/M bits */ 3319 pp = VM_PAGE_TO_PP(pg); 3320 pp_lock(pp); 3321 pp->pp_attrs |= opte; 3322 pve = pmap_remove_pv(pp, ptp, startva); 3323 pp_unlock(pp); 3324 3325 if (pve != NULL) { 3326 pve->pve_next = *pv_tofree; 3327 *pv_tofree = pve; 3328 } 3329 3330 /* end of "for" loop: time for next pte */ 3331 } 3332 3333 return xpte; 3334 } 3335 3336 3337 /* 3338 * pmap_remove_pte: remove a single PTE from a PTP 3339 * 3340 * => must have proper locking on pmap_master_lock 3341 * => caller must hold pmap's lock 3342 * => PTP must be mapped into KVA 3343 * => PTP should be null if pmap == pmap_kernel() 3344 * => returns true if we removed a mapping 3345 * => must be called with kernel preemption disabled 3346 */ 3347 3348 static bool 3349 pmap_remove_pte(struct pmap *pmap, struct vm_page *ptp, pt_entry_t *pte, 3350 vaddr_t va, struct pv_entry **pv_tofree) 3351 { 3352 pt_entry_t opte; 3353 struct pv_entry *pve; 3354 struct vm_page *pg; 3355 struct pmap_page *pp; 3356 3357 KASSERT(pmap == pmap_kernel() || mutex_owned(&pmap->pm_lock)); 3358 KASSERT(pmap == pmap_kernel() || kpreempt_disabled()); 3359 3360 if (!pmap_valid_entry(*pte)) 3361 return(false); /* VA not mapped */ 3362 3363 /* atomically save the old PTE and zap! 
it */ 3364 opte = pmap_pte_testset(pte, 0); 3365 if (!pmap_valid_entry(opte)) { 3366 return false; 3367 } 3368 3369 pmap_exec_account(pmap, va, opte, 0); 3370 pmap_stats_update_bypte(pmap, 0, opte); 3371 3372 if (opte & PG_U) 3373 pmap_tlb_shootdown(pmap, va, 0, opte); 3374 3375 if (ptp) { 3376 ptp->wire_count--; /* dropping a PTE */ 3377 /* Make sure that the PDE is flushed */ 3378 if ((ptp->wire_count <= 1) && !(opte & PG_U)) 3379 pmap_tlb_shootdown(pmap, va, 0, opte); 3380 } 3381 3382 /* 3383 * if we are not on a pv_head list we are done. 3384 */ 3385 3386 if ((opte & PG_PVLIST) == 0) { 3387 #if defined(DIAGNOSTIC) && !defined(DOM0OPS) 3388 if (PHYS_TO_VM_PAGE(pmap_pte2pa(opte)) != NULL) 3389 panic("pmap_remove_pte: managed page without " 3390 "PG_PVLIST for 0x%lx", va); 3391 #endif 3392 return(true); 3393 } 3394 3395 pg = PHYS_TO_VM_PAGE(pmap_pte2pa(opte)); 3396 #ifdef DIAGNOSTIC 3397 if (pg == NULL) 3398 panic("pmap_remove_pte: unmanaged page marked " 3399 "PG_PVLIST, va = 0x%lx, pa = 0x%lx", va, 3400 (u_long)(pmap_pte2pa(opte))); 3401 #endif 3402 3403 /* sync R/M bits */ 3404 pp = VM_PAGE_TO_PP(pg); 3405 pp_lock(pp); 3406 pp->pp_attrs |= opte; 3407 pve = pmap_remove_pv(pp, ptp, va); 3408 pp_unlock(pp); 3409 3410 if (pve) { 3411 pve->pve_next = *pv_tofree; 3412 *pv_tofree = pve; 3413 } 3414 3415 return(true); 3416 } 3417 3418 /* 3419 * pmap_remove: mapping removal function. 3420 * 3421 * => caller should not be holding any pmap locks 3422 */ 3423 3424 void 3425 pmap_remove(struct pmap *pmap, vaddr_t sva, vaddr_t eva) 3426 { 3427 pt_entry_t *ptes, xpte = 0; 3428 pd_entry_t pde; 3429 pd_entry_t * const *pdes; 3430 struct pv_entry *pv_tofree = NULL; 3431 bool result; 3432 paddr_t ptppa; 3433 vaddr_t blkendva, va = sva; 3434 struct vm_page *ptp; 3435 struct pmap *pmap2; 3436 3437 kpreempt_disable(); 3438 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); /* locks pmap */ 3439 3440 /* 3441 * removing one page? take shortcut function. 3442 */ 3443 3444 if (va + PAGE_SIZE == eva) { 3445 if (pmap_pdes_valid(va, pdes, &pde)) { 3446 3447 /* PA of the PTP */ 3448 ptppa = pmap_pte2pa(pde); 3449 3450 /* get PTP if non-kernel mapping */ 3451 if (pmap == pmap_kernel()) { 3452 /* we never free kernel PTPs */ 3453 ptp = NULL; 3454 } else { 3455 ptp = pmap_find_ptp(pmap, va, ptppa, 1); 3456 #ifdef DIAGNOSTIC 3457 if (ptp == NULL) 3458 panic("pmap_remove: unmanaged " 3459 "PTP detected"); 3460 #endif 3461 } 3462 3463 /* do it! */ 3464 result = pmap_remove_pte(pmap, ptp, 3465 &ptes[pl1_i(va)], va, &pv_tofree); 3466 3467 /* 3468 * if mapping removed and the PTP is no longer 3469 * being used, free it! 3470 */ 3471 3472 if (result && ptp && ptp->wire_count <= 1) 3473 pmap_free_ptp(pmap, ptp, va, ptes, pdes); 3474 } 3475 } else for (/* null */ ; va < eva ; va = blkendva) { 3476 int lvl; 3477 3478 /* determine range of block */ 3479 blkendva = x86_round_pdr(va+1); 3480 if (blkendva > eva) 3481 blkendva = eva; 3482 3483 /* 3484 * XXXCDC: our PTE mappings should never be removed 3485 * with pmap_remove! if we allow this (and why would 3486 * we?) then we end up freeing the pmap's page 3487 * directory page (PDP) before we are finished using 3488 * it when we hit in in the recursive mapping. this 3489 * is BAD. 3490 * 3491 * long term solution is to move the PTEs out of user 3492 * address space. and into kernel address space (up 3493 * with APTE). then we can set VM_MAXUSER_ADDRESS to 3494 * be VM_MAX_ADDRESS. 
3495 */ 3496 3497 if (pl_i(va, PTP_LEVELS) == PDIR_SLOT_PTE) 3498 /* XXXCDC: ugly hack to avoid freeing PDP here */ 3499 continue; 3500 3501 lvl = pmap_pdes_invalid(va, pdes, &pde); 3502 if (lvl != 0) { 3503 /* 3504 * skip a range corresponding to an invalid pde. 3505 */ 3506 blkendva = (va & ptp_masks[lvl - 1]) + nbpd[lvl - 1]; 3507 continue; 3508 } 3509 3510 /* PA of the PTP */ 3511 ptppa = pmap_pte2pa(pde); 3512 3513 /* get PTP if non-kernel mapping */ 3514 if (pmap == pmap_kernel()) { 3515 /* we never free kernel PTPs */ 3516 ptp = NULL; 3517 } else { 3518 ptp = pmap_find_ptp(pmap, va, ptppa, 1); 3519 #ifdef DIAGNOSTIC 3520 if (ptp == NULL) 3521 panic("pmap_remove: unmanaged PTP " 3522 "detected"); 3523 #endif 3524 } 3525 xpte |= pmap_remove_ptes(pmap, ptp, 3526 (vaddr_t)&ptes[pl1_i(va)], va, blkendva, &pv_tofree); 3527 3528 /* if PTP is no longer being used, free it! */ 3529 if (ptp && ptp->wire_count <= 1) { 3530 pmap_free_ptp(pmap, ptp, va, ptes, pdes); 3531 } 3532 if ((xpte & PG_U) != 0) 3533 pmap_tlb_shootdown(pmap, sva, eva, xpte); 3534 } 3535 pmap_unmap_ptes(pmap, pmap2); /* unlock pmap */ 3536 kpreempt_enable(); 3537 3538 /* Now we free unused PVs */ 3539 if (pv_tofree) 3540 pmap_free_pvs(pv_tofree); 3541 } 3542 3543 /* 3544 * pmap_sync_pv: clear pte bits and return the old value of the pte. 3545 * 3546 * => called with pp_lock held. (thus preemption disabled) 3547 * => issues tlb shootdowns if necessary. 3548 */ 3549 3550 static int 3551 pmap_sync_pv(struct pv_pte *pvpte, pt_entry_t expect, int clearbits, 3552 pt_entry_t *optep) 3553 { 3554 struct pmap *pmap; 3555 struct vm_page *ptp; 3556 vaddr_t va; 3557 pt_entry_t *ptep; 3558 pt_entry_t opte; 3559 pt_entry_t npte; 3560 bool need_shootdown; 3561 3562 ptp = pvpte->pte_ptp; 3563 va = pvpte->pte_va; 3564 KASSERT(ptp == NULL || ptp->uobject != NULL); 3565 KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset); 3566 pmap = ptp_to_pmap(ptp); 3567 3568 KASSERT((expect & ~(PG_FRAME | PG_V)) == 0); 3569 KASSERT((expect & PG_V) != 0); 3570 KASSERT(clearbits == ~0 || (clearbits & ~(PG_M | PG_U | PG_RW)) == 0); 3571 KASSERT(kpreempt_disabled()); 3572 3573 ptep = pmap_map_pte(pmap, ptp, va); 3574 do { 3575 opte = *ptep; 3576 KASSERT((opte & (PG_M | PG_U)) != PG_M); 3577 KASSERT((opte & (PG_U | PG_V)) != PG_U); 3578 KASSERT(opte == 0 || (opte & PG_V) != 0); 3579 if ((opte & (PG_FRAME | PG_V)) != expect) { 3580 3581 /* 3582 * we lost a race with a V->P operation like 3583 * pmap_remove(). wait for the competitor 3584 * reflecting pte bits into mp_attrs. 3585 * 3586 * issue a redundant TLB shootdown so that 3587 * we can wait for its completion. 3588 */ 3589 3590 pmap_unmap_pte(); 3591 if (clearbits != 0) { 3592 pmap_tlb_shootdown(pmap, va, 0, 3593 (pmap == pmap_kernel() ? PG_G : 0)); 3594 } 3595 return EAGAIN; 3596 } 3597 3598 /* 3599 * check if there's anything to do on this pte. 3600 */ 3601 3602 if ((opte & clearbits) == 0) { 3603 need_shootdown = false; 3604 break; 3605 } 3606 3607 /* 3608 * we need a shootdown if the pte is cached. (PG_U) 3609 * 3610 * ...unless we are clearing only the PG_RW bit and 3611 * it isn't cached as RW. (PG_M) 3612 */ 3613 3614 need_shootdown = (opte & PG_U) != 0 && 3615 !(clearbits == PG_RW && (opte & PG_M) == 0); 3616 3617 npte = opte & ~clearbits; 3618 3619 /* 3620 * if we need a shootdown anyway, clear PG_U and PG_M. 
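 *
 * (Added note: this is safe because the pre-clear value is handed back
 * to the caller via *optep and every caller folds it into pp_attrs, so
 * no referenced/modified information is lost; the shootdown below then
 * ensures no TLB keeps the stale bits cached.)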
3621 */ 3622 3623 if (need_shootdown) { 3624 npte &= ~(PG_U | PG_M); 3625 } 3626 KASSERT((npte & (PG_M | PG_U)) != PG_M); 3627 KASSERT((npte & (PG_U | PG_V)) != PG_U); 3628 KASSERT(npte == 0 || (opte & PG_V) != 0); 3629 } while (pmap_pte_cas(ptep, opte, npte) != opte); 3630 3631 if (need_shootdown) { 3632 pmap_tlb_shootdown(pmap, va, 0, opte); 3633 } 3634 pmap_unmap_pte(); 3635 3636 *optep = opte; 3637 return 0; 3638 } 3639 3640 /* 3641 * pmap_page_remove: remove a managed vm_page from all pmaps that map it 3642 * 3643 * => R/M bits are sync'd back to attrs 3644 */ 3645 3646 void 3647 pmap_page_remove(struct vm_page *pg) 3648 { 3649 struct pmap_page *pp; 3650 struct pv_pte *pvpte; 3651 struct pv_entry *killlist = NULL; 3652 struct vm_page *ptp; 3653 pt_entry_t expect; 3654 lwp_t *l; 3655 int count; 3656 3657 l = curlwp; 3658 pp = VM_PAGE_TO_PP(pg); 3659 expect = pmap_pa2pte(VM_PAGE_TO_PHYS(pg)) | PG_V; 3660 count = SPINLOCK_BACKOFF_MIN; 3661 kpreempt_disable(); 3662 startover: 3663 pp_lock(pp); 3664 while ((pvpte = pv_pte_first(pp)) != NULL) { 3665 struct pmap *pmap; 3666 struct pv_entry *pve; 3667 pt_entry_t opte; 3668 vaddr_t va; 3669 int error; 3670 3671 /* 3672 * add a reference to the pmap before clearing the pte. 3673 * otherwise the pmap can disappear behind us. 3674 */ 3675 3676 ptp = pvpte->pte_ptp; 3677 pmap = ptp_to_pmap(ptp); 3678 if (ptp != NULL) { 3679 pmap_reference(pmap); 3680 } 3681 3682 error = pmap_sync_pv(pvpte, expect, ~0, &opte); 3683 if (error == EAGAIN) { 3684 int hold_count; 3685 pp_unlock(pp); 3686 KERNEL_UNLOCK_ALL(curlwp, &hold_count); 3687 if (ptp != NULL) { 3688 pmap_destroy(pmap); 3689 } 3690 SPINLOCK_BACKOFF(count); 3691 KERNEL_LOCK(hold_count, curlwp); 3692 goto startover; 3693 } 3694 3695 pp->pp_attrs |= opte; 3696 va = pvpte->pte_va; 3697 pve = pmap_remove_pv(pp, ptp, va); 3698 pp_unlock(pp); 3699 3700 /* update the PTP reference count. free if last reference. */ 3701 if (ptp != NULL) { 3702 struct pmap *pmap2; 3703 pt_entry_t *ptes; 3704 pd_entry_t * const *pdes; 3705 3706 KASSERT(pmap != pmap_kernel()); 3707 3708 pmap_tlb_shootwait(); 3709 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); 3710 pmap_stats_update_bypte(pmap, 0, opte); 3711 ptp->wire_count--; 3712 if (ptp->wire_count <= 1) { 3713 pmap_free_ptp(pmap, ptp, va, ptes, pdes); 3714 } 3715 pmap_unmap_ptes(pmap, pmap2); 3716 pmap_destroy(pmap); 3717 } else { 3718 KASSERT(pmap == pmap_kernel()); 3719 pmap_stats_update_bypte(pmap, 0, opte); 3720 } 3721 3722 if (pve != NULL) { 3723 pve->pve_next = killlist; /* mark it for death */ 3724 killlist = pve; 3725 } 3726 pp_lock(pp); 3727 } 3728 pp_unlock(pp); 3729 kpreempt_enable(); 3730 3731 /* Now free unused pvs. */ 3732 pmap_free_pvs(killlist); 3733 } 3734 3735 /* 3736 * p m a p a t t r i b u t e f u n c t i o n s 3737 * functions that test/change managed page's attributes 3738 * since a page can be mapped multiple times we must check each PTE that 3739 * maps it by going down the pv lists. 
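 *
 * for example (illustrative; see pmap.h for the real definitions), the
 * MI reference/modify hooks are thin wrappers over these two routines:
 *
 *	pmap_is_modified(pg)	 == pmap_test_attrs(pg, PG_M)
 *	pmap_clear_modify(pg)	 == pmap_clear_attrs(pg, PG_M)
 *	pmap_is_referenced(pg)	 == pmap_test_attrs(pg, PG_U)
 *	pmap_clear_reference(pg) == pmap_clear_attrs(pg, PG_U)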
3740 */ 3741 3742 /* 3743 * pmap_test_attrs: test a page's attributes 3744 */ 3745 3746 bool 3747 pmap_test_attrs(struct vm_page *pg, unsigned testbits) 3748 { 3749 struct pmap_page *pp; 3750 struct pv_pte *pvpte; 3751 pt_entry_t expect; 3752 u_int result; 3753 3754 pp = VM_PAGE_TO_PP(pg); 3755 if ((pp->pp_attrs & testbits) != 0) { 3756 return true; 3757 } 3758 expect = pmap_pa2pte(VM_PAGE_TO_PHYS(pg)) | PG_V; 3759 pp_lock(pp); 3760 for (pvpte = pv_pte_first(pp); pvpte; pvpte = pv_pte_next(pp, pvpte)) { 3761 pt_entry_t opte; 3762 int error; 3763 3764 if ((pp->pp_attrs & testbits) != 0) { 3765 break; 3766 } 3767 error = pmap_sync_pv(pvpte, expect, 0, &opte); 3768 if (error == 0) { 3769 pp->pp_attrs |= opte; 3770 } 3771 } 3772 result = pp->pp_attrs & testbits; 3773 pp_unlock(pp); 3774 3775 /* 3776 * note that we will exit the for loop with a non-null pve if 3777 * we have found the bits we are testing for. 3778 */ 3779 3780 return result != 0; 3781 } 3782 3783 /* 3784 * pmap_clear_attrs: clear the specified attribute for a page. 3785 * 3786 * => we return true if we cleared one of the bits we were asked to 3787 */ 3788 3789 bool 3790 pmap_clear_attrs(struct vm_page *pg, unsigned clearbits) 3791 { 3792 struct pmap_page *pp; 3793 struct pv_pte *pvpte; 3794 u_int result; 3795 pt_entry_t expect; 3796 int count; 3797 3798 pp = VM_PAGE_TO_PP(pg); 3799 expect = pmap_pa2pte(VM_PAGE_TO_PHYS(pg)) | PG_V; 3800 count = SPINLOCK_BACKOFF_MIN; 3801 kpreempt_disable(); 3802 startover: 3803 pp_lock(pp); 3804 for (pvpte = pv_pte_first(pp); pvpte; pvpte = pv_pte_next(pp, pvpte)) { 3805 pt_entry_t opte; 3806 int error; 3807 3808 error = pmap_sync_pv(pvpte, expect, clearbits, &opte); 3809 if (error == EAGAIN) { 3810 int hold_count; 3811 pp_unlock(pp); 3812 KERNEL_UNLOCK_ALL(curlwp, &hold_count); 3813 SPINLOCK_BACKOFF(count); 3814 KERNEL_LOCK(hold_count, curlwp); 3815 goto startover; 3816 } 3817 pp->pp_attrs |= opte; 3818 } 3819 result = pp->pp_attrs & clearbits; 3820 pp->pp_attrs &= ~clearbits; 3821 pp_unlock(pp); 3822 kpreempt_enable(); 3823 3824 return result != 0; 3825 } 3826 3827 3828 /* 3829 * p m a p p r o t e c t i o n f u n c t i o n s 3830 */ 3831 3832 /* 3833 * pmap_page_protect: change the protection of all recorded mappings 3834 * of a managed page 3835 * 3836 * => NOTE: this is an inline function in pmap.h 3837 */ 3838 3839 /* see pmap.h */ 3840 3841 /* 3842 * pmap_protect: set the protection in of the pages in a pmap 3843 * 3844 * => NOTE: this is an inline function in pmap.h 3845 */ 3846 3847 /* see pmap.h */ 3848 3849 /* 3850 * pmap_write_protect: write-protect pages in a pmap 3851 */ 3852 3853 void 3854 pmap_write_protect(struct pmap *pmap, vaddr_t sva, vaddr_t eva, vm_prot_t prot) 3855 { 3856 pt_entry_t *ptes, *epte; 3857 pt_entry_t *spte; 3858 pd_entry_t * const *pdes; 3859 vaddr_t blockend, va; 3860 pt_entry_t opte; 3861 struct pmap *pmap2; 3862 3863 KASSERT(curlwp->l_md.md_gc_pmap != pmap); 3864 3865 kpreempt_disable(); 3866 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); /* locks pmap */ 3867 3868 /* should be ok, but just in case ... */ 3869 sva &= PG_FRAME; 3870 eva &= PG_FRAME; 3871 3872 for (va = sva ; va < eva ; va = blockend) { 3873 3874 blockend = (va & L2_FRAME) + NBPD_L2; 3875 if (blockend > eva) 3876 blockend = eva; 3877 3878 /* 3879 * XXXCDC: our PTE mappings should never be write-protected! 3880 * 3881 * long term solution is to move the PTEs out of user 3882 * address space. and into kernel address space (up 3883 * with APTE). 
then we can set VM_MAXUSER_ADDRESS to 3884 * be VM_MAX_ADDRESS. 3885 */ 3886 3887 /* XXXCDC: ugly hack to avoid freeing PDP here */ 3888 if (pl_i(va, PTP_LEVELS) == PDIR_SLOT_PTE) 3889 continue; 3890 3891 /* empty block? */ 3892 if (!pmap_pdes_valid(va, pdes, NULL)) 3893 continue; 3894 3895 #ifdef DIAGNOSTIC 3896 if (va >= VM_MAXUSER_ADDRESS && 3897 va < VM_MAX_ADDRESS) 3898 panic("pmap_write_protect: PTE space"); 3899 #endif 3900 3901 spte = &ptes[pl1_i(va)]; 3902 epte = &ptes[pl1_i(blockend)]; 3903 3904 for (/*null */; spte < epte ; spte++) { 3905 pt_entry_t npte; 3906 3907 do { 3908 opte = *spte; 3909 if ((~opte & (PG_RW | PG_V)) != 0) { 3910 goto next; 3911 } 3912 npte = opte & ~PG_RW; 3913 } while (pmap_pte_cas(spte, opte, npte) != opte); 3914 if ((opte & PG_M) != 0) { 3915 vaddr_t tva; 3916 3917 tva = x86_ptob(spte - ptes); 3918 pmap_tlb_shootdown(pmap, tva, 0, opte); 3919 } 3920 next:; 3921 } 3922 } 3923 3924 pmap_unmap_ptes(pmap, pmap2); /* unlocks pmap */ 3925 kpreempt_enable(); 3926 } 3927 3928 /* 3929 * end of protection functions 3930 */ 3931 3932 /* 3933 * pmap_unwire: clear the wired bit in the PTE 3934 * 3935 * => mapping should already be in map 3936 */ 3937 3938 void 3939 pmap_unwire(struct pmap *pmap, vaddr_t va) 3940 { 3941 pt_entry_t *ptes; 3942 pd_entry_t * const *pdes; 3943 struct pmap *pmap2; 3944 3945 kpreempt_disable(); 3946 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); /* locks pmap */ 3947 3948 if (pmap_pdes_valid(va, pdes, NULL)) { 3949 pt_entry_t *ptep = &ptes[pl1_i(va)]; 3950 pt_entry_t opte = *ptep; 3951 3952 #ifdef DIAGNOSTIC 3953 if (!pmap_valid_entry(opte)) 3954 panic("pmap_unwire: invalid (unmapped) va 0x%lx", va); 3955 #endif 3956 if ((opte & PG_W) != 0) { 3957 pt_entry_t npte = opte & ~PG_W; 3958 3959 opte = pmap_pte_testset(ptep, npte); 3960 pmap_stats_update_bypte(pmap, npte, opte); 3961 } 3962 #ifdef DIAGNOSTIC 3963 else { 3964 printf("pmap_unwire: wiring for pmap %p va 0x%lx " 3965 "didn't change!\n", pmap, va); 3966 } 3967 #endif 3968 pmap_unmap_ptes(pmap, pmap2); /* unlocks map */ 3969 } 3970 #ifdef DIAGNOSTIC 3971 else { 3972 panic("pmap_unwire: invalid PDE"); 3973 } 3974 #endif 3975 kpreempt_enable(); 3976 } 3977 3978 /* 3979 * pmap_copy: copy mappings from one pmap to another 3980 * 3981 * => optional function 3982 * void pmap_copy(dst_pmap, src_pmap, dst_addr, len, src_addr) 3983 */ 3984 3985 /* 3986 * defined as macro in pmap.h 3987 */ 3988 3989 /* 3990 * pmap_enter: enter a mapping into a pmap 3991 * 3992 * => must be done "now" ... no lazy-evaluation 3993 * => we set pmap => pv_head locking 3994 */ 3995 #ifdef XEN 3996 int 3997 pmap_enter_ma(struct pmap *pmap, vaddr_t va, paddr_t ma, paddr_t pa, 3998 vm_prot_t prot, u_int flags, int domid) 3999 { 4000 #else /* XEN */ 4001 int 4002 pmap_enter(struct pmap *pmap, vaddr_t va, paddr_t pa, vm_prot_t prot, 4003 u_int flags) 4004 { 4005 paddr_t ma = pa; 4006 #endif /* XEN */ 4007 pt_entry_t *ptes, opte, npte; 4008 pt_entry_t *ptep; 4009 pd_entry_t * const *pdes; 4010 struct vm_page *ptp, *pg; 4011 struct pmap_page *new_pp; 4012 struct pmap_page *old_pp; 4013 struct pv_entry *old_pve = NULL; 4014 struct pv_entry *new_pve; 4015 struct pv_entry *new_pve2; 4016 int error; 4017 bool wired = (flags & PMAP_WIRED) != 0; 4018 struct pmap *pmap2; 4019 4020 KASSERT(pmap_initialized); 4021 KASSERT(curlwp->l_md.md_gc_pmap != pmap); 4022 4023 #ifdef DIAGNOSTIC 4024 /* sanity check: totally out of range? 
*/ 4025 if (va >= VM_MAX_KERNEL_ADDRESS) 4026 panic("pmap_enter: too big"); 4027 4028 if (va == (vaddr_t) PDP_BASE || va == (vaddr_t) APDP_BASE) 4029 panic("pmap_enter: trying to map over PDP/APDP!"); 4030 4031 /* sanity check: kernel PTPs should already have been pre-allocated */ 4032 if (va >= VM_MIN_KERNEL_ADDRESS && 4033 !pmap_valid_entry(pmap->pm_pdir[pl_i(va, PTP_LEVELS)])) 4034 panic("pmap_enter: missing kernel PTP for va %lx!", va); 4035 #endif /* DIAGNOSTIC */ 4036 #ifdef XEN 4037 KASSERT(domid == DOMID_SELF || pa == 0); 4038 #endif /* XEN */ 4039 4040 npte = ma | protection_codes[prot] | PG_V; 4041 if (wired) 4042 npte |= PG_W; 4043 if (flags & PMAP_NOCACHE) 4044 npte |= PG_N; 4045 if (va < VM_MAXUSER_ADDRESS) 4046 npte |= PG_u; 4047 else if (va < VM_MAX_ADDRESS) 4048 npte |= (PG_u | PG_RW); /* XXXCDC: no longer needed? */ 4049 else 4050 npte |= PG_k; 4051 if (pmap == pmap_kernel()) 4052 npte |= pmap_pg_g; 4053 if (flags & VM_PROT_ALL) { 4054 npte |= PG_U; 4055 if (flags & VM_PROT_WRITE) { 4056 KASSERT((npte & PG_RW) != 0); 4057 npte |= PG_M; 4058 } 4059 } 4060 4061 #ifdef XEN 4062 if (domid != DOMID_SELF) 4063 pg = NULL; 4064 else 4065 #endif 4066 pg = PHYS_TO_VM_PAGE(pa); 4067 if (pg != NULL) { 4068 /* This is a managed page */ 4069 npte |= PG_PVLIST; 4070 new_pp = VM_PAGE_TO_PP(pg); 4071 } else { 4072 new_pp = NULL; 4073 } 4074 4075 /* get pves. */ 4076 new_pve = pool_cache_get(&pmap_pv_cache, PR_NOWAIT); 4077 new_pve2 = pool_cache_get(&pmap_pv_cache, PR_NOWAIT); 4078 if (new_pve == NULL || new_pve2 == NULL) { 4079 if (flags & PMAP_CANFAIL) { 4080 error = ENOMEM; 4081 goto out2; 4082 } 4083 panic("pmap_enter: pve allocation failed"); 4084 } 4085 4086 kpreempt_disable(); 4087 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); /* locks pmap */ 4088 if (pmap == pmap_kernel()) { 4089 ptp = NULL; 4090 } else { 4091 ptp = pmap_get_ptp(pmap, va, pdes); 4092 if (ptp == NULL) { 4093 pmap_unmap_ptes(pmap, pmap2); 4094 if (flags & PMAP_CANFAIL) { 4095 error = ENOMEM; 4096 goto out; 4097 } 4098 panic("pmap_enter: get ptp failed"); 4099 } 4100 } 4101 4102 /* 4103 * update the pte. 4104 */ 4105 4106 ptep = &ptes[pl1_i(va)]; 4107 do { 4108 opte = *ptep; 4109 4110 /* 4111 * if the same page, inherit PG_U and PG_M. 4112 */ 4113 if (((opte ^ npte) & (PG_FRAME | PG_V)) == 0) { 4114 npte |= opte & (PG_U | PG_M); 4115 } 4116 #if defined(XEN) 4117 if (domid != DOMID_SELF) { 4118 /* pmap_pte_cas with error handling */ 4119 int s = splvm(); 4120 if (opte != *ptep) { 4121 splx(s); 4122 continue; 4123 } 4124 error = xpq_update_foreign( 4125 vtomach((vaddr_t)ptep), npte, domid); 4126 splx(s); 4127 if (error) { 4128 if (ptp != NULL && ptp->wire_count <= 1) { 4129 pmap_free_ptp(pmap, ptp, va, ptes, pdes); 4130 } 4131 pmap_unmap_ptes(pmap, pmap2); 4132 goto out; 4133 } 4134 break; 4135 } 4136 #endif /* defined(XEN) */ 4137 } while (pmap_pte_cas(ptep, opte, npte) != opte); 4138 4139 /* 4140 * update statistics and PTP's reference count. 4141 */ 4142 4143 pmap_stats_update_bypte(pmap, npte, opte); 4144 if (ptp != NULL && !pmap_valid_entry(opte)) { 4145 ptp->wire_count++; 4146 } 4147 KASSERT(ptp == NULL || ptp->wire_count > 1); 4148 4149 /* 4150 * if the same page, we can skip pv_entry handling. 4151 */ 4152 4153 if (((opte ^ npte) & (PG_FRAME | PG_V)) == 0) { 4154 KASSERT(((opte ^ npte) & PG_PVLIST) == 0); 4155 goto same_pa; 4156 } 4157 4158 /* 4159 * if old page is managed, remove pv_entry from its list. 
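 *
 * (Added note: the pv_entry detached here, old_pve, is not freed
 * immediately; it is returned to pmap_pv_cache at the common exit path
 * below, after the pmap has been unlocked and preemption re-enabled.)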
4160 */ 4161 4162 if ((~opte & (PG_V | PG_PVLIST)) == 0) { 4163 pg = PHYS_TO_VM_PAGE(pmap_pte2pa(opte)); 4164 #ifdef DIAGNOSTIC 4165 if (pg == NULL) 4166 panic("pmap_enter: PG_PVLIST mapping with " 4167 "unmanaged page " 4168 "pa = 0x%" PRIx64 " (0x%" PRIx64 ")", 4169 (int64_t)pa, (int64_t)atop(pa)); 4170 #endif 4171 old_pp = VM_PAGE_TO_PP(pg); 4172 4173 pp_lock(old_pp); 4174 old_pve = pmap_remove_pv(old_pp, ptp, va); 4175 old_pp->pp_attrs |= opte; 4176 pp_unlock(old_pp); 4177 } 4178 4179 /* 4180 * if new page is managed, insert pv_entry into its list. 4181 */ 4182 4183 if (new_pp) { 4184 pp_lock(new_pp); 4185 new_pve = pmap_enter_pv(new_pp, new_pve, &new_pve2, ptp, va); 4186 pp_unlock(new_pp); 4187 } 4188 4189 same_pa: 4190 pmap_unmap_ptes(pmap, pmap2); 4191 4192 /* 4193 * shootdown tlb if necessary. 4194 */ 4195 4196 if ((~opte & (PG_V | PG_U)) == 0 && 4197 ((opte ^ npte) & (PG_FRAME | PG_RW)) != 0) { 4198 pmap_tlb_shootdown(pmap, va, 0, opte); 4199 } 4200 4201 error = 0; 4202 out: 4203 kpreempt_enable(); 4204 out2: 4205 if (old_pve != NULL) { 4206 pool_cache_put(&pmap_pv_cache, old_pve); 4207 } 4208 if (new_pve != NULL) { 4209 pool_cache_put(&pmap_pv_cache, new_pve); 4210 } 4211 if (new_pve2 != NULL) { 4212 pool_cache_put(&pmap_pv_cache, new_pve2); 4213 } 4214 4215 return error; 4216 } 4217 4218 #ifdef XEN 4219 int 4220 pmap_enter(struct pmap *pmap, vaddr_t va, paddr_t pa, vm_prot_t prot, u_int flags) 4221 { 4222 paddr_t ma; 4223 4224 if (__predict_false(pa < pmap_pa_start || pmap_pa_end <= pa)) { 4225 ma = pa; /* XXX hack */ 4226 } else { 4227 ma = xpmap_ptom(pa); 4228 } 4229 4230 return pmap_enter_ma(pmap, va, ma, pa, prot, flags, DOMID_SELF); 4231 } 4232 #endif /* XEN */ 4233 4234 static bool 4235 pmap_get_physpage(vaddr_t va, int level, paddr_t *paddrp) 4236 { 4237 struct vm_page *ptp; 4238 struct pmap *kpm = pmap_kernel(); 4239 4240 if (uvm.page_init_done == false) { 4241 /* 4242 * we're growing the kernel pmap early (from 4243 * uvm_pageboot_alloc()). this case must be 4244 * handled a little differently. 4245 */ 4246 4247 if (uvm_page_physget(paddrp) == false) 4248 panic("pmap_get_physpage: out of memory"); 4249 kpreempt_disable(); 4250 pmap_pte_set(early_zero_pte, 4251 pmap_pa2pte(*paddrp) | PG_V | PG_RW | PG_k); 4252 pmap_pte_flush(); 4253 pmap_update_pg((vaddr_t)early_zerop); 4254 memset(early_zerop, 0, PAGE_SIZE); 4255 #if defined(DIAGNOSTIC) || defined (XEN) 4256 pmap_pte_set(early_zero_pte, 0); 4257 pmap_pte_flush(); 4258 #endif /* defined(DIAGNOSTIC) */ 4259 kpreempt_enable(); 4260 } else { 4261 /* XXX */ 4262 PMAP_SUBOBJ_LOCK(kpm, level - 1); 4263 ptp = uvm_pagealloc(&kpm->pm_obj[level - 1], 4264 ptp_va2o(va, level), NULL, 4265 UVM_PGA_USERESERVE|UVM_PGA_ZERO); 4266 PMAP_SUBOBJ_UNLOCK(kpm, level - 1); 4267 if (ptp == NULL) 4268 panic("pmap_get_physpage: out of memory"); 4269 ptp->flags &= ~PG_BUSY; 4270 ptp->wire_count = 1; 4271 *paddrp = VM_PAGE_TO_PHYS(ptp); 4272 } 4273 pmap_stats_update(kpm, 1, 0); 4274 return true; 4275 } 4276 4277 /* 4278 * Allocate the amount of specified ptps for a ptp level, and populate 4279 * all levels below accordingly, mapping virtual addresses starting at 4280 * kva. 4281 * 4282 * Used by pmap_growkernel. 
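 *
 * (Worked example, illustrative: growing the kernel map by one L2
 * page's worth of VA (NBPD_L2 bytes) requires one new page of L1 PTEs,
 * plus one additional PTP at each higher level whose boundary the new
 * range happens to cross.)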
4283 */ 4284 static void 4285 pmap_alloc_level(pd_entry_t * const *pdes, vaddr_t kva, int lvl, 4286 long *needed_ptps) 4287 { 4288 unsigned long i; 4289 vaddr_t va; 4290 paddr_t pa; 4291 unsigned long index, endindex; 4292 int level; 4293 pd_entry_t *pdep; 4294 #ifdef XEN 4295 int s = splvm(); /* protect xpq_* */ 4296 #endif 4297 4298 for (level = lvl; level > 1; level--) { 4299 if (level == PTP_LEVELS) 4300 pdep = pmap_kernel()->pm_pdir; 4301 else 4302 pdep = pdes[level - 2]; 4303 va = kva; 4304 index = pl_i_roundup(kva, level); 4305 endindex = index + needed_ptps[level - 1] - 1; 4306 4307 4308 for (i = index; i <= endindex; i++) { 4309 KASSERT(!pmap_valid_entry(pdep[i])); 4310 pmap_get_physpage(va, level - 1, &pa); 4311 #ifdef XEN 4312 xpq_queue_pte_update((level == PTP_LEVELS) ? 4313 xpmap_ptom(pmap_pdirpa(pmap_kernel(), i)) : 4314 xpmap_ptetomach(&pdep[i]), 4315 pmap_pa2pte(pa) | PG_k | PG_V | PG_RW); 4316 #ifdef PAE 4317 if (level == PTP_LEVELS && i > L2_SLOT_KERN) { 4318 /* update real kernel PD too */ 4319 xpq_queue_pte_update( 4320 xpmap_ptetomach(&pmap_kl2pd[l2tol2(i)]), 4321 pmap_pa2pte(pa) | PG_k | PG_V | PG_RW); 4322 } 4323 #endif 4324 #else /* XEN */ 4325 pdep[i] = pa | PG_RW | PG_V; 4326 #endif /* XEN */ 4327 KASSERT(level != PTP_LEVELS || nkptp[level - 1] + 4328 pl_i(VM_MIN_KERNEL_ADDRESS, level) == i); 4329 nkptp[level - 1]++; 4330 va += nbpd[level - 1]; 4331 } 4332 pmap_pte_flush(); 4333 } 4334 #ifdef XEN 4335 splx(s); 4336 #endif 4337 } 4338 4339 /* 4340 * pmap_growkernel: increase usage of KVM space 4341 * 4342 * => we allocate new PTPs for the kernel and install them in all 4343 * the pmaps on the system. 4344 */ 4345 4346 vaddr_t 4347 pmap_growkernel(vaddr_t maxkvaddr) 4348 { 4349 struct pmap *kpm = pmap_kernel(); 4350 #if !defined(XEN) || !defined(__x86_64__) 4351 struct pmap *pm; 4352 #endif 4353 int s, i; 4354 long needed_kptp[PTP_LEVELS], target_nptp, old; 4355 bool invalidate = false; 4356 4357 s = splvm(); /* to be safe */ 4358 mutex_enter(&kpm->pm_lock); 4359 4360 if (maxkvaddr <= pmap_maxkvaddr) { 4361 mutex_exit(&kpm->pm_lock); 4362 splx(s); 4363 return pmap_maxkvaddr; 4364 } 4365 4366 maxkvaddr = x86_round_pdr(maxkvaddr); 4367 old = nkptp[PTP_LEVELS - 1]; 4368 /* 4369 * This loop could be optimized more, but pmap_growkernel() 4370 * is called infrequently. 4371 */ 4372 for (i = PTP_LEVELS - 1; i >= 1; i--) { 4373 target_nptp = pl_i_roundup(maxkvaddr, i + 1) - 4374 pl_i_roundup(VM_MIN_KERNEL_ADDRESS, i + 1); 4375 /* 4376 * XXX only need to check toplevel. 4377 */ 4378 if (target_nptp > nkptpmax[i]) 4379 panic("out of KVA space"); 4380 KASSERT(target_nptp >= nkptp[i]); 4381 needed_kptp[i] = target_nptp - nkptp[i]; 4382 } 4383 4384 pmap_alloc_level(normal_pdes, pmap_maxkvaddr, PTP_LEVELS, needed_kptp); 4385 4386 /* 4387 * If the number of top level entries changed, update all 4388 * pmaps. 
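	 * (each pmap carries a copy of the kernel's top level entries,
	 * so the new kernel PTPs must be propagated to every pmap on
	 * the pmaps list.)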
4389 */ 4390 if (needed_kptp[PTP_LEVELS - 1] != 0) { 4391 #ifdef XEN 4392 #ifdef __x86_64__ 4393 /* nothing, kernel entries are never entered in user pmap */ 4394 #else /* __x86_64__ */ 4395 mutex_enter(&pmaps_lock); 4396 LIST_FOREACH(pm, &pmaps, pm_list) { 4397 int pdkidx; 4398 for (pdkidx = PDIR_SLOT_KERN + old; 4399 pdkidx < PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1]; 4400 pdkidx++) { 4401 xpq_queue_pte_update( 4402 xpmap_ptom(pmap_pdirpa(pm, pdkidx)), 4403 kpm->pm_pdir[pdkidx]); 4404 } 4405 xpq_flush_queue(); 4406 } 4407 mutex_exit(&pmaps_lock); 4408 #endif /* __x86_64__ */ 4409 #else /* XEN */ 4410 unsigned newpdes; 4411 newpdes = nkptp[PTP_LEVELS - 1] - old; 4412 mutex_enter(&pmaps_lock); 4413 LIST_FOREACH(pm, &pmaps, pm_list) { 4414 memcpy(&pm->pm_pdir[PDIR_SLOT_KERN + old], 4415 &kpm->pm_pdir[PDIR_SLOT_KERN + old], 4416 newpdes * sizeof (pd_entry_t)); 4417 } 4418 mutex_exit(&pmaps_lock); 4419 #endif 4420 invalidate = true; 4421 } 4422 pmap_maxkvaddr = maxkvaddr; 4423 mutex_exit(&kpm->pm_lock); 4424 splx(s); 4425 4426 if (invalidate) { 4427 /* Invalidate the PDP cache. */ 4428 pool_cache_invalidate(&pmap_pdp_cache); 4429 } 4430 4431 return maxkvaddr; 4432 } 4433 4434 #ifdef DEBUG 4435 void pmap_dump(struct pmap *, vaddr_t, vaddr_t); 4436 4437 /* 4438 * pmap_dump: dump all the mappings from a pmap 4439 * 4440 * => caller should not be holding any pmap locks 4441 */ 4442 4443 void 4444 pmap_dump(struct pmap *pmap, vaddr_t sva, vaddr_t eva) 4445 { 4446 pt_entry_t *ptes, *pte; 4447 pd_entry_t * const *pdes; 4448 struct pmap *pmap2; 4449 vaddr_t blkendva; 4450 4451 /* 4452 * if end is out of range truncate. 4453 * if (end == start) update to max. 4454 */ 4455 4456 if (eva > VM_MAXUSER_ADDRESS || eva <= sva) 4457 eva = VM_MAXUSER_ADDRESS; 4458 4459 /* 4460 * we lock in the pmap => pv_head direction 4461 */ 4462 4463 kpreempt_disable(); 4464 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); /* locks pmap */ 4465 4466 /* 4467 * dumping a range of pages: we dump in PTP sized blocks (4MB) 4468 */ 4469 4470 for (/* null */ ; sva < eva ; sva = blkendva) { 4471 4472 /* determine range of block */ 4473 blkendva = x86_round_pdr(sva+1); 4474 if (blkendva > eva) 4475 blkendva = eva; 4476 4477 /* valid block? 
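		 * there is nothing to dump if the PDEs covering this
		 * block are not present.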
*/ 4478 if (!pmap_pdes_valid(sva, pdes, NULL)) 4479 continue; 4480 4481 pte = &ptes[pl1_i(sva)]; 4482 for (/* null */; sva < blkendva ; sva += PAGE_SIZE, pte++) { 4483 if (!pmap_valid_entry(*pte)) 4484 continue; 4485 printf("va %#lx -> pa %#lx (pte=%#lx)\n", 4486 sva, (unsigned long)*pte, 4487 (unsigned long)pmap_pte2pa(*pte)); 4488 } 4489 } 4490 pmap_unmap_ptes(pmap, pmap2); 4491 kpreempt_enable(); 4492 } 4493 #endif 4494 4495 /* 4496 * pmap_tlb_shootdown: invalidate pages on all CPUs using pmap 'pm' 4497 * 4498 * => always invalidates locally before returning 4499 * => returns before remote CPUs have invalidated 4500 * => must be called with preemption disabled 4501 */ 4502 4503 void 4504 pmap_tlb_shootdown(struct pmap *pm, vaddr_t sva, vaddr_t eva, pt_entry_t pte) 4505 { 4506 #ifdef MULTIPROCESSOR 4507 extern bool x86_mp_online; 4508 struct cpu_info *ci; 4509 struct pmap_mbox *mb, *selfmb; 4510 CPU_INFO_ITERATOR cii; 4511 uintptr_t head; 4512 u_int count; 4513 int s; 4514 #endif /* MULTIPROCESSOR */ 4515 struct cpu_info *self; 4516 bool kernel; 4517 4518 KASSERT(eva == 0 || eva >= sva); 4519 KASSERT(kpreempt_disabled()); 4520 4521 if (pte & PG_PS) 4522 sva &= PG_LGFRAME; 4523 pte &= PG_G; 4524 self = curcpu(); 4525 4526 if (sva == (vaddr_t)-1LL) { 4527 kernel = true; 4528 } else { 4529 if (eva == 0) 4530 eva = sva + PAGE_SIZE; 4531 kernel = sva >= VM_MAXUSER_ADDRESS; 4532 KASSERT(kernel == (eva > VM_MAXUSER_ADDRESS)); 4533 } 4534 4535 /* 4536 * if tearing down the pmap, do nothing. we'll flush later 4537 * when we're ready to recycle/destroy it. 4538 */ 4539 if (__predict_false(curlwp->l_md.md_gc_pmap == pm)) { 4540 return; 4541 } 4542 4543 /* 4544 * If the range is larger than 32 pages, then invalidate 4545 * everything. 4546 */ 4547 if (sva != (vaddr_t)-1LL && eva - sva > (32 * PAGE_SIZE)) { 4548 sva = (vaddr_t)-1LL; 4549 eva = sva; 4550 } 4551 4552 #ifdef MULTIPROCESSOR 4553 if (ncpu > 1 && x86_mp_online) { 4554 selfmb = &self->ci_pmap_cpu->pc_mbox; 4555 4556 /* 4557 * If the CPUs have no notion of global pages then 4558 * reload of %cr3 is sufficient. 4559 */ 4560 if (pte != 0 && (cpu_feature & CPUID_PGE) == 0) 4561 pte = 0; 4562 4563 if (pm == pmap_kernel()) { 4564 /* 4565 * Mapped on all CPUs: use the broadcast mechanism. 4566 * Once we have the lock, increment the counter. 4567 */ 4568 s = splvm(); 4569 mb = &pmap_mbox; 4570 count = SPINLOCK_BACKOFF_MIN; 4571 do { 4572 if ((head = mb->mb_head) != mb->mb_tail) { 4573 splx(s); 4574 while ((head = mb->mb_head) != 4575 mb->mb_tail) 4576 SPINLOCK_BACKOFF(count); 4577 s = splvm(); 4578 } 4579 } while (atomic_cas_ulong( 4580 (volatile u_long *)&mb->mb_head, 4581 head, head + ncpu - 1) != head); 4582 4583 /* 4584 * Once underway we must stay at IPL_VM until the 4585 * IPI is dispatched. Otherwise interrupt handlers 4586 * on this CPU can deadlock against us. 4587 */ 4588 pmap_tlb_evcnt.ev_count++; 4589 mb->mb_pointer = self; 4590 mb->mb_addr1 = sva; 4591 mb->mb_addr2 = eva; 4592 mb->mb_global = pte; 4593 x86_ipi(LAPIC_TLB_BCAST_VECTOR, LAPIC_DEST_ALLEXCL, 4594 LAPIC_DLMODE_FIXED); 4595 self->ci_need_tlbwait = 1; 4596 splx(s); 4597 } else if ((pm->pm_cpus & ~self->ci_cpumask) != 0 || 4598 (kernel && (pm->pm_kernel_cpus & ~self->ci_cpumask) != 0)) { 4599 /* 4600 * We don't bother traversing the CPU list if only 4601 * used by this CPU. 4602 * 4603 * We can't do global flushes with the multicast 4604 * mechanism. 
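			 * (global (PG_G) mappings only exist in the kernel
			 * pmap, which takes the broadcast path above; the
			 * KASSERT below relies on this.)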
4605 */ 4606 KASSERT(pte == 0); 4607 4608 /* 4609 * Take ownership of the shootdown mailbox on each 4610 * CPU, fill the details and fire it off. 4611 */ 4612 s = splvm(); 4613 for (CPU_INFO_FOREACH(cii, ci)) { 4614 if (ci == self || 4615 !pmap_is_active(pm, ci, kernel) || 4616 !(ci->ci_flags & CPUF_RUNNING)) 4617 continue; 4618 selfmb->mb_head++; 4619 mb = &ci->ci_pmap_cpu->pc_mbox; 4620 count = SPINLOCK_BACKOFF_MIN; 4621 while (atomic_cas_ulong( 4622 (u_long *)&mb->mb_pointer, 4623 0, (u_long)&selfmb->mb_tail) != 0) { 4624 splx(s); 4625 while (mb->mb_pointer != 0) 4626 SPINLOCK_BACKOFF(count); 4627 s = splvm(); 4628 } 4629 mb->mb_addr1 = sva; 4630 mb->mb_addr2 = eva; 4631 mb->mb_global = pte; 4632 if (x86_ipi(LAPIC_TLB_MCAST_VECTOR, 4633 ci->ci_cpuid, LAPIC_DLMODE_FIXED)) 4634 panic("pmap_tlb_shootdown: ipi failed"); 4635 } 4636 self->ci_need_tlbwait = 1; 4637 splx(s); 4638 } 4639 } 4640 #endif /* MULTIPROCESSOR */ 4641 4642 /* Update the current CPU before waiting for others. */ 4643 if (!pmap_is_active(pm, self, kernel)) 4644 return; 4645 4646 if (sva == (vaddr_t)-1LL) { 4647 u_int gen = uvm_emap_gen_return(); 4648 if (pte != 0) { 4649 tlbflushg(); 4650 } else { 4651 tlbflush(); 4652 } 4653 uvm_emap_update(gen); 4654 } else { 4655 do { 4656 pmap_update_pg(sva); 4657 sva += PAGE_SIZE; 4658 } while (sva < eva); 4659 } 4660 } 4661 4662 /* 4663 * pmap_tlb_shootwait: wait for pending TLB shootdowns to complete 4664 * 4665 * => only waits for operations generated by the current CPU 4666 * => must be called with preemption disabled 4667 */ 4668 4669 void 4670 pmap_tlb_shootwait(void) 4671 { 4672 struct cpu_info *self; 4673 struct pmap_mbox *mb; 4674 4675 KASSERT(kpreempt_disabled()); 4676 4677 /* 4678 * Anything to do? XXX Really we want to avoid touching the cache 4679 * lines of the two mailboxes, but the processor may read ahead. 4680 */ 4681 self = curcpu(); 4682 if (!self->ci_need_tlbwait) 4683 return; 4684 self->ci_need_tlbwait = 0; 4685 4686 /* If we own the global mailbox, wait for it to drain. */ 4687 mb = &pmap_mbox; 4688 while (mb->mb_pointer == self && mb->mb_head != mb->mb_tail) 4689 x86_pause(); 4690 4691 /* If we own other CPU's mailboxes, wait for them to drain. */ 4692 mb = &self->ci_pmap_cpu->pc_mbox; 4693 KASSERT(mb->mb_pointer != &mb->mb_tail); 4694 while (mb->mb_head != mb->mb_tail) 4695 x86_pause(); 4696 } 4697 4698 /* 4699 * pmap_update: process deferred invalidations 4700 */ 4701 4702 void 4703 pmap_update(struct pmap *pmap) 4704 { 4705 struct vm_page *ptp, *empty_ptps; 4706 struct pmap_page *pp; 4707 lwp_t *l; 4708 4709 /* 4710 * if we have torn down this pmap, invalidate non-global TLB 4711 * entries on any processors using it. 4712 */ 4713 l = curlwp; 4714 if (__predict_false(l->l_md.md_gc_pmap == pmap)) { 4715 l->l_md.md_gc_pmap = NULL; 4716 KPREEMPT_DISABLE(l); 4717 pmap_tlb_shootdown(pmap, -1, -1, 0); 4718 KPREEMPT_ENABLE(l); 4719 } 4720 4721 /* 4722 * wait for tlb shootdowns to complete before returning control 4723 * to the caller. 4724 */ 4725 kpreempt_disable(); 4726 pmap_tlb_shootwait(); 4727 kpreempt_enable(); 4728 4729 /* 4730 * now that shootdowns are complete, process deferred frees, 4731 * but not from interrupt context. 
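	 * (the deferred PTPs are chained through their pmap_page via
	 * pp_link; they are handed back to UVM below, now that no CPU
	 * can hold a stale TLB entry referring to them.)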
4732 */ 4733 if (l->l_md.md_gc_ptp != NULL) { 4734 if (cpu_intr_p() || (l->l_pflag & LP_INTR) != 0) { 4735 return; 4736 } 4737 4738 empty_ptps = l->l_md.md_gc_ptp; 4739 l->l_md.md_gc_ptp = NULL; 4740 4741 while ((ptp = empty_ptps) != NULL) { 4742 ptp->flags |= PG_ZERO; 4743 pp = VM_PAGE_TO_PP(ptp); 4744 empty_ptps = pp->pp_link; 4745 LIST_INIT(&pp->pp_head.pvh_list); 4746 uvm_pagefree(ptp); 4747 } 4748 } 4749 } 4750 4751 #if PTP_LEVELS > 4 4752 #error "Unsupported number of page table mappings" 4753 #endif 4754 4755 paddr_t 4756 pmap_init_tmp_pgtbl(paddr_t pg) 4757 { 4758 static bool maps_loaded; 4759 static const paddr_t x86_tmp_pml_paddr[] = { 4760 4 * PAGE_SIZE, 4761 5 * PAGE_SIZE, 4762 6 * PAGE_SIZE, 4763 7 * PAGE_SIZE 4764 }; 4765 static vaddr_t x86_tmp_pml_vaddr[] = { 0, 0, 0, 0 }; 4766 4767 pd_entry_t *tmp_pml, *kernel_pml; 4768 4769 int level; 4770 4771 if (!maps_loaded) { 4772 for (level = 0; level < PTP_LEVELS; ++level) { 4773 x86_tmp_pml_vaddr[level] = 4774 uvm_km_alloc(kernel_map, PAGE_SIZE, 0, 4775 UVM_KMF_VAONLY); 4776 4777 if (x86_tmp_pml_vaddr[level] == 0) 4778 panic("mapping of real mode PML failed\n"); 4779 pmap_kenter_pa(x86_tmp_pml_vaddr[level], 4780 x86_tmp_pml_paddr[level], 4781 VM_PROT_READ | VM_PROT_WRITE, 0); 4782 pmap_update(pmap_kernel()); 4783 } 4784 maps_loaded = true; 4785 } 4786 4787 /* Zero levels 1-3 */ 4788 for (level = 0; level < PTP_LEVELS - 1; ++level) { 4789 tmp_pml = (void *)x86_tmp_pml_vaddr[level]; 4790 memset(tmp_pml, 0, PAGE_SIZE); 4791 } 4792 4793 /* Copy PML4 */ 4794 kernel_pml = pmap_kernel()->pm_pdir; 4795 tmp_pml = (void *)x86_tmp_pml_vaddr[PTP_LEVELS - 1]; 4796 memcpy(tmp_pml, kernel_pml, PAGE_SIZE); 4797 4798 for (level = PTP_LEVELS - 1; level > 0; --level) { 4799 tmp_pml = (void *)x86_tmp_pml_vaddr[level]; 4800 4801 tmp_pml[pl_i(pg, level + 1)] = 4802 (x86_tmp_pml_paddr[level - 1] & PG_FRAME) | PG_RW | PG_V; 4803 } 4804 4805 tmp_pml = (void *)x86_tmp_pml_vaddr[0]; 4806 tmp_pml[pl_i(pg, 1)] = (pg & PG_FRAME) | PG_RW | PG_V; 4807 4808 return x86_tmp_pml_paddr[PTP_LEVELS - 1]; 4809 } 4810