1 /* $NetBSD: pmap.c,v 1.82 2009/03/21 22:55:08 ad Exp $ */ 2 3 /* 4 * Copyright (c) 2007 Manuel Bouyer. 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions 8 * are met: 9 * 1. Redistributions of source code must retain the above copyright 10 * notice, this list of conditions and the following disclaimer. 11 * 2. Redistributions in binary form must reproduce the above copyright 12 * notice, this list of conditions and the following disclaimer in the 13 * documentation and/or other materials provided with the distribution. 14 * 3. All advertising materials mentioning features or use of this software 15 * must display the following acknowledgement: 16 * This product includes software developed by Manuel Bouyer. 17 * 4. The name of the author may not be used to endorse or promote products 18 * derived from this software without specific prior written permission. 19 * 20 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 21 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 22 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 23 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 24 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 25 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 26 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 27 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 28 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 29 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 * 31 */ 32 33 /* 34 * Copyright (c) 2006 Mathieu Ropert <mro@adviseo.fr> 35 * 36 * Permission to use, copy, modify, and distribute this software for any 37 * purpose with or without fee is hereby granted, provided that the above 38 * copyright notice and this permission notice appear in all copies. 39 * 40 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 41 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 42 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 43 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 44 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 45 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 46 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 47 */ 48 49 /* 50 * 51 * Copyright (c) 1997 Charles D. Cranor and Washington University. 52 * All rights reserved. 53 * 54 * Redistribution and use in source and binary forms, with or without 55 * modification, are permitted provided that the following conditions 56 * are met: 57 * 1. Redistributions of source code must retain the above copyright 58 * notice, this list of conditions and the following disclaimer. 59 * 2. Redistributions in binary form must reproduce the above copyright 60 * notice, this list of conditions and the following disclaimer in the 61 * documentation and/or other materials provided with the distribution. 62 * 3. All advertising materials mentioning features or use of this software 63 * must display the following acknowledgement: 64 * This product includes software developed by Charles D. Cranor and 65 * Washington University. 66 * 4. 
The name of the author may not be used to endorse or promote products 67 * derived from this software without specific prior written permission. 68 * 69 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 70 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 71 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 72 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 73 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 74 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 75 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 76 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 77 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 78 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 79 */ 80 81 /* 82 * Copyright 2001 (c) Wasabi Systems, Inc. 83 * All rights reserved. 84 * 85 * Written by Frank van der Linden for Wasabi Systems, Inc. 86 * 87 * Redistribution and use in source and binary forms, with or without 88 * modification, are permitted provided that the following conditions 89 * are met: 90 * 1. Redistributions of source code must retain the above copyright 91 * notice, this list of conditions and the following disclaimer. 92 * 2. Redistributions in binary form must reproduce the above copyright 93 * notice, this list of conditions and the following disclaimer in the 94 * documentation and/or other materials provided with the distribution. 95 * 3. All advertising materials mentioning features or use of this software 96 * must display the following acknowledgement: 97 * This product includes software developed for the NetBSD Project by 98 * Wasabi Systems, Inc. 99 * 4. The name of Wasabi Systems, Inc. may not be used to endorse 100 * or promote products derived from this software without specific prior 101 * written permission. 102 * 103 * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND 104 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 105 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 106 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL WASABI SYSTEMS, INC 107 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 108 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 109 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 110 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 111 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 112 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 113 * POSSIBILITY OF SUCH DAMAGE. 114 */ 115 116 /* 117 * This is the i386 pmap modified and generalized to support x86-64 118 * as well. The idea is to hide the upper N levels of the page tables 119 * inside pmap_get_ptp, pmap_free_ptp and pmap_growkernel. The rest 120 * is mostly untouched, except that it uses some more generalized 121 * macros and interfaces. 122 * 123 * This pmap has been tested on the i386 as well, and it can be easily 124 * adapted to PAE. 125 * 126 * fvdl@wasabisystems.com 18-Jun-2001 127 */ 128 129 /* 130 * pmap.c: i386 pmap module rewrite 131 * Chuck Cranor <chuck@ccrc.wustl.edu> 132 * 11-Aug-97 133 * 134 * history of this pmap module: in addition to my own input, i used 135 * the following references for this rewrite of the i386 pmap: 136 * 137 * [1] the NetBSD i386 pmap. 
this pmap appears to be based on the 138 * BSD hp300 pmap done by Mike Hibler at University of Utah. 139 * it was then ported to the i386 by William Jolitz of UUNET 140 * Technologies, Inc. Then Charles M. Hannum of the NetBSD 141 * project fixed some bugs and provided some speed ups. 142 * 143 * [2] the FreeBSD i386 pmap. this pmap seems to be the 144 * Hibler/Jolitz pmap, as modified for FreeBSD by John S. Dyson 145 * and David Greenman. 146 * 147 * [3] the Mach pmap. this pmap, from CMU, seems to have migrated 148 * between several processors. the VAX version was done by 149 * Avadis Tevanian, Jr., and Michael Wayne Young. the i386 150 * version was done by Lance Berc, Mike Kupfer, Bob Baron, 151 * David Golub, and Richard Draves. the alpha version was 152 * done by Alessandro Forin (CMU/Mach) and Chris Demetriou 153 * (NetBSD/alpha). 154 */ 155 156 #include <sys/cdefs.h> 157 __KERNEL_RCSID(0, "$NetBSD: pmap.c,v 1.82 2009/03/21 22:55:08 ad Exp $"); 158 159 #include "opt_user_ldt.h" 160 #include "opt_lockdebug.h" 161 #include "opt_multiprocessor.h" 162 #include "opt_xen.h" 163 #if !defined(__x86_64__) 164 #include "opt_kstack_dr0.h" 165 #endif /* !defined(__x86_64__) */ 166 167 #include <sys/param.h> 168 #include <sys/systm.h> 169 #include <sys/proc.h> 170 #include <sys/pool.h> 171 #include <sys/user.h> 172 #include <sys/kernel.h> 173 #include <sys/atomic.h> 174 #include <sys/cpu.h> 175 #include <sys/intr.h> 176 #include <sys/xcall.h> 177 178 #include <uvm/uvm.h> 179 180 #include <dev/isa/isareg.h> 181 182 #include <machine/specialreg.h> 183 #include <machine/gdt.h> 184 #include <machine/isa_machdep.h> 185 #include <machine/cpuvar.h> 186 187 #include <x86/pmap.h> 188 #include <x86/pmap_pv.h> 189 190 #include <x86/i82489reg.h> 191 #include <x86/i82489var.h> 192 193 #ifdef XEN 194 #include <xen/xen3-public/xen.h> 195 #include <xen/hypervisor.h> 196 #endif 197 198 /* flag to be used for kernel mappings: PG_u on Xen/amd64, 0 otherwise */ 199 #if defined(XEN) && defined(__x86_64__) 200 #define PG_k PG_u 201 #else 202 #define PG_k 0 203 #endif 204 205 /* 206 * general info: 207 * 208 * - for an explanation of how the i386 MMU hardware works see 209 * the comments in <machine/pte.h>. 210 * 211 * - for an explanation of the general memory structure used by 212 * this pmap (including the recursive mapping), see the comments 213 * in <machine/pmap.h>. 214 * 215 * this file contains the code for the "pmap module." the module's 216 * job is to manage the hardware's virtual to physical address mappings. 217 * note that there are two levels of mapping in the VM system: 218 * 219 * [1] the upper layer of the VM system uses vm_map's and vm_map_entry's 220 * to map ranges of virtual address space to objects/files. for 221 * example, the vm_map may say: "map VA 0x1000 to 0x22000 read-only 222 * to the file /bin/ls starting at offset zero." note that 223 * the upper layer mapping is not concerned with how individual 224 * vm_pages are mapped. 225 * 226 * [2] the lower layer of the VM system (the pmap) maintains the mappings 227 * from virtual addresses. it is concerned with which vm_page is 228 * mapped where. for example, when you run /bin/ls and start 229 * at page 0x1000 the fault routine may lookup the correct page 230 * of the /bin/ls file and then ask the pmap layer to establish 231 * a mapping for it. 232 * 233 * note that information in the lower layer of the VM system can be 234 * thrown away since it can easily be reconstructed from the info 235 * in the upper layer. 
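 *
 * for example, when the upper layer resolves a fault it can simply
 * re-create the lower layer mapping, roughly:
 *
 *	pmap_enter(map->pmap, va, VM_PAGE_TO_PHYS(pg), prot, flags);
 *	pmap_update(map->pmap);
 *
 * which is why the pmap's state is considered disposable.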
236 * 237 * data structures we use include: 238 * 239 * - struct pmap: describes the address space of one thread 240 * - struct pv_entry: describes one <PMAP,VA> mapping of a PA 241 * - struct pv_head: there is one pv_head per managed page of 242 * physical memory. the pv_head points to a list of pv_entry 243 * structures which describe all the <PMAP,VA> pairs that this 244 * page is mapped in. this is critical for page based operations 245 * such as pmap_page_protect() [change protection on _all_ mappings 246 * of a page] 247 */ 248 249 /* 250 * memory allocation 251 * 252 * - there are three data structures that we must dynamically allocate: 253 * 254 * [A] new process' page directory page (PDP) 255 * - plan 1: done at pmap_create() we use 256 * uvm_km_alloc(kernel_map, PAGE_SIZE) [fka kmem_alloc] to do this 257 * allocation. 258 * 259 * if we are low in free physical memory then we sleep in 260 * uvm_km_alloc -- in this case this is ok since we are creating 261 * a new pmap and should not be holding any locks. 262 * 263 * if the kernel is totally out of virtual space 264 * (i.e. uvm_km_alloc returns NULL), then we panic. 265 * 266 * XXX: the fork code currently has no way to return an "out of 267 * memory, try again" error code since uvm_fork [fka vm_fork] 268 * is a void function. 269 * 270 * [B] new page tables pages (PTP) 271 * - call uvm_pagealloc() 272 * => success: zero page, add to pm_pdir 273 * => failure: we are out of free vm_pages, let pmap_enter() 274 * tell UVM about it. 275 * 276 * note: for kernel PTPs, we start with NKPTP of them. as we map 277 * kernel memory (at uvm_map time) we check to see if we've grown 278 * the kernel pmap. if so, we call the optional function 279 * pmap_growkernel() to grow the kernel PTPs in advance. 280 * 281 * [C] pv_entry structures 282 */ 283 284 /* 285 * locking 286 * 287 * we have the following locks that we must contend with: 288 * 289 * mutexes: 290 * 291 * - pmap lock (per pmap, part of uvm_object) 292 * this lock protects the fields in the pmap structure including 293 * the non-kernel PDEs in the PDP, and the PTEs. it also locks 294 * in the alternate PTE space (since that is determined by the 295 * entry in the PDP). 296 * 297 * - pvh_lock (per pv_head) 298 * this lock protects the pv_entry list which is chained off the 299 * pv_head structure for a specific managed PA. it is locked 300 * when traversing the list (e.g. adding/removing mappings, 301 * syncing R/M bits, etc.) 302 * 303 * - pmaps_lock 304 * this lock protects the list of active pmaps (headed by "pmaps"). 305 * we lock it when adding or removing pmaps from this list. 306 * 307 * tlb shootdown 308 * 309 * tlb shootdowns are hard interrupts that operate outside the spl 310 * framework: they don't need to be blocked provided that the pmap module 311 * gets the order of events correct. the calls are made by talking directly 312 * to the lapic. the stubs to handle the interrupts are quite short and do 313 * one of the following: invalidate a single page, a range of pages, all 314 * user tlb entries or the entire tlb. 315 * 316 * the cpus synchronize with each other using pmap_mbox structures which are 317 * aligned on 64-byte cache lines. tlb shootdowns against the kernel pmap 318 * use a global mailbox and are generated using a broadcast ipi (broadcast 319 * to all but the sending cpu). shootdowns against regular pmaps use 320 * per-cpu mailboxes and are multicast. 
kernel and user shootdowns can 321 * execute simultaneously, as can shootdowns within different multithreaded 322 * processes. TODO: 323 * 324 * 1. figure out which waitpoints can be deferered to pmap_update(). 325 * 2. see if there is a cheap way to batch some updates. 326 */ 327 328 const vaddr_t ptp_masks[] = PTP_MASK_INITIALIZER; 329 const int ptp_shifts[] = PTP_SHIFT_INITIALIZER; 330 const long nkptpmax[] = NKPTPMAX_INITIALIZER; 331 const long nbpd[] = NBPD_INITIALIZER; 332 pd_entry_t * const normal_pdes[] = PDES_INITIALIZER; 333 pd_entry_t * const alternate_pdes[] = APDES_INITIALIZER; 334 335 long nkptp[] = NKPTP_INITIALIZER; 336 337 static kmutex_t pmaps_lock; 338 339 static vaddr_t pmap_maxkvaddr; 340 341 #define COUNT(x) /* nothing */ 342 343 /* 344 * XXX kludge: dummy locking to make KASSERTs in uvm_page.c comfortable. 345 * actual locking is done by pm_lock. 346 */ 347 #if defined(DIAGNOSTIC) 348 #define PMAP_SUBOBJ_LOCK(pm, idx) \ 349 KASSERT(mutex_owned(&(pm)->pm_lock)); \ 350 if ((idx) != 0) \ 351 mutex_enter(&(pm)->pm_obj[(idx)].vmobjlock) 352 #define PMAP_SUBOBJ_UNLOCK(pm, idx) \ 353 KASSERT(mutex_owned(&(pm)->pm_lock)); \ 354 if ((idx) != 0) \ 355 mutex_exit(&(pm)->pm_obj[(idx)].vmobjlock) 356 #else /* defined(DIAGNOSTIC) */ 357 #define PMAP_SUBOBJ_LOCK(pm, idx) /* nothing */ 358 #define PMAP_SUBOBJ_UNLOCK(pm, idx) /* nothing */ 359 #endif /* defined(DIAGNOSTIC) */ 360 361 /* 362 * Misc. event counters. 363 */ 364 struct evcnt pmap_iobmp_evcnt; 365 struct evcnt pmap_ldt_evcnt; 366 367 /* 368 * Global TLB shootdown mailbox. 369 */ 370 struct evcnt pmap_tlb_evcnt __aligned(64); 371 struct pmap_mbox pmap_mbox __aligned(64); 372 373 /* 374 * Per-CPU data. The pmap mailbox is cache intensive so gets its 375 * own line. Note that the mailbox must be the first item. 376 */ 377 struct pmap_cpu { 378 /* TLB shootdown */ 379 struct pmap_mbox pc_mbox; 380 }; 381 382 union { 383 struct pmap_cpu pc; 384 uint8_t padding[64]; 385 } pmap_cpu[MAXCPUS] __aligned(64); 386 387 /* 388 * global data structures 389 */ 390 391 static struct pmap kernel_pmap_store; /* the kernel's pmap (proc0) */ 392 struct pmap *const kernel_pmap_ptr = &kernel_pmap_store; 393 394 /* 395 * pmap_pg_g: if our processor supports PG_G in the PTE then we 396 * set pmap_pg_g to PG_G (otherwise it is zero). 397 */ 398 399 int pmap_pg_g = 0; 400 401 /* 402 * pmap_largepages: if our processor supports PG_PS and we are 403 * using it, this is set to true. 404 */ 405 406 int pmap_largepages; 407 408 /* 409 * i386 physical memory comes in a big contig chunk with a small 410 * hole toward the front of it... the following two paddr_t's 411 * (shared with machdep.c) describe the physical address space 412 * of this machine. 
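 *
 * note: during bootstrap, code below "steals" early physical pages by
 * simply handing out avail_start and bumping it by PAGE_SIZE (see the
 * idt_paddr setup in pmap_bootstrap and the page table page allocation
 * in pmap_prealloc_lowmem_ptps).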
413 */ 414 paddr_t avail_start; /* PA of first available physical page */ 415 paddr_t avail_end; /* PA of last available physical page */ 416 417 #ifdef XEN 418 #ifdef __x86_64__ 419 /* Dummy PGD for user cr3, used between pmap_deacivate() and pmap_activate() */ 420 static paddr_t xen_dummy_user_pgd; 421 /* Currently active user PGD (can't use rcr3()) */ 422 static paddr_t xen_current_user_pgd = 0; 423 #endif /* __x86_64__ */ 424 paddr_t pmap_pa_start; /* PA of first physical page for this domain */ 425 paddr_t pmap_pa_end; /* PA of last physical page for this domain */ 426 #endif /* XEN */ 427 428 #define VM_PAGE_TO_PP(pg) (&(pg)->mdpage.mp_pp) 429 430 #define pp_lock(pp) mutex_spin_enter(&(pp)->pp_lock) 431 #define pp_unlock(pp) mutex_spin_exit(&(pp)->pp_lock) 432 #define pp_locked(pp) mutex_owned(&(pp)->pp_lock) 433 434 #define PV_HASH_SIZE 32768 435 #define PV_HASH_LOCK_CNT 32 436 437 struct pv_hash_lock { 438 kmutex_t lock; 439 } __aligned(CACHE_LINE_SIZE) pv_hash_locks[PV_HASH_LOCK_CNT] 440 __aligned(CACHE_LINE_SIZE); 441 442 struct pv_hash_head { 443 SLIST_HEAD(, pv_entry) hh_list; 444 } pv_hash_heads[PV_HASH_SIZE]; 445 446 static u_int 447 pvhash_hash(struct vm_page *ptp, vaddr_t va) 448 { 449 450 return (uintptr_t)ptp / sizeof(*ptp) + (va >> PAGE_SHIFT); 451 } 452 453 static struct pv_hash_head * 454 pvhash_head(u_int hash) 455 { 456 457 return &pv_hash_heads[hash % PV_HASH_SIZE]; 458 } 459 460 static kmutex_t * 461 pvhash_lock(u_int hash) 462 { 463 464 return &pv_hash_locks[hash % PV_HASH_LOCK_CNT].lock; 465 } 466 467 static struct pv_entry * 468 pvhash_remove(struct pv_hash_head *hh, struct vm_page *ptp, vaddr_t va) 469 { 470 struct pv_entry *pve; 471 struct pv_entry *prev; 472 473 prev = NULL; 474 SLIST_FOREACH(pve, &hh->hh_list, pve_hash) { 475 if (pve->pve_pte.pte_ptp == ptp && 476 pve->pve_pte.pte_va == va) { 477 if (prev != NULL) { 478 SLIST_REMOVE_AFTER(prev, pve_hash); 479 } else { 480 SLIST_REMOVE_HEAD(&hh->hh_list, pve_hash); 481 } 482 break; 483 } 484 prev = pve; 485 } 486 return pve; 487 } 488 489 /* 490 * other data structures 491 */ 492 493 static pt_entry_t protection_codes[8]; /* maps MI prot to i386 prot code */ 494 static bool pmap_initialized = false; /* pmap_init done yet? */ 495 496 /* 497 * the following two vaddr_t's are used during system startup 498 * to keep track of how much of the kernel's VM space we have used. 499 * once the system is started, the management of the remaining kernel 500 * VM space is turned over to the kernel_map vm_map. 501 */ 502 503 static vaddr_t virtual_avail; /* VA of first free KVA */ 504 static vaddr_t virtual_end; /* VA of last free KVA */ 505 506 /* 507 * linked list of all non-kernel pmaps 508 */ 509 510 static struct pmap_head pmaps; 511 512 /* 513 * pool that pmap structures are allocated from 514 */ 515 516 static struct pool_cache pmap_cache; 517 518 /* 519 * pv_entry cache 520 */ 521 522 static struct pool_cache pmap_pv_cache; 523 524 /* 525 * MULTIPROCESSOR: special VA's/ PTE's are actually allocated inside a 526 * maxcpus*NPTECL array of PTE's, to avoid cache line thrashing 527 * due to false sharing. 
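 *
 * a minimal sketch of how the slewed slots are used by the copy/zero
 * helpers further down, assuming cpu_number() as the slot index:
 *
 *	int id = cpu_number();
 *	pt_entry_t *zpte = PTESLEW(zero_pte, id);
 *	void *zva = VASLEW(zerop, id);
 *
 *	pmap_pte_set(zpte, pmap_pa2pte(pa) | PG_V | PG_RW);
 *	pmap_pte_flush();
 *	pmap_update_pg((vaddr_t)zva);
 *	memset(zva, 0, PAGE_SIZE);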
528 */ 529 530 #ifdef MULTIPROCESSOR 531 #define PTESLEW(pte, id) ((pte)+(id)*NPTECL) 532 #define VASLEW(va,id) ((va)+(id)*NPTECL*PAGE_SIZE) 533 #else 534 #define PTESLEW(pte, id) (pte) 535 #define VASLEW(va,id) (va) 536 #endif 537 538 /* 539 * special VAs and the PTEs that map them 540 */ 541 static pt_entry_t *csrc_pte, *cdst_pte, *zero_pte, *ptp_pte, *early_zero_pte; 542 static char *csrcp, *cdstp, *zerop, *ptpp, *early_zerop; 543 544 /* 545 * pool and cache that PDPs are allocated from 546 */ 547 548 static struct pool_cache pmap_pdp_cache; 549 int pmap_pdp_ctor(void *, void *, int); 550 void pmap_pdp_dtor(void *, void *); 551 #ifdef PAE 552 /* need to allocate items of 4 pages */ 553 void *pmap_pdp_alloc(struct pool *, int); 554 void pmap_pdp_free(struct pool *, void *); 555 static struct pool_allocator pmap_pdp_allocator = { 556 .pa_alloc = pmap_pdp_alloc, 557 .pa_free = pmap_pdp_free, 558 .pa_pagesz = PAGE_SIZE * PDP_SIZE, 559 }; 560 #endif /* PAE */ 561 562 void *vmmap; /* XXX: used by mem.c... it should really uvm_map_reserve it */ 563 564 extern vaddr_t idt_vaddr; /* we allocate IDT early */ 565 extern paddr_t idt_paddr; 566 567 #ifdef _LP64 568 extern vaddr_t lo32_vaddr; 569 extern vaddr_t lo32_paddr; 570 #endif 571 572 extern int end; 573 574 #ifdef i386 575 /* stuff to fix the pentium f00f bug */ 576 extern vaddr_t pentium_idt_vaddr; 577 #endif 578 579 580 /* 581 * local prototypes 582 */ 583 584 static struct vm_page *pmap_get_ptp(struct pmap *, vaddr_t, 585 pd_entry_t * const *); 586 static struct vm_page *pmap_find_ptp(struct pmap *, vaddr_t, paddr_t, int); 587 static void pmap_freepage(struct pmap *, struct vm_page *, int); 588 static void pmap_free_ptp(struct pmap *, struct vm_page *, 589 vaddr_t, pt_entry_t *, 590 pd_entry_t * const *); 591 static bool pmap_is_curpmap(struct pmap *); 592 static bool pmap_is_active(struct pmap *, struct cpu_info *, bool); 593 static void pmap_map_ptes(struct pmap *, struct pmap **, 594 pt_entry_t **, pd_entry_t * const **); 595 static void pmap_do_remove(struct pmap *, vaddr_t, vaddr_t, int); 596 static bool pmap_remove_pte(struct pmap *, struct vm_page *, 597 pt_entry_t *, vaddr_t, int, 598 struct pv_entry **); 599 static pt_entry_t pmap_remove_ptes(struct pmap *, struct vm_page *, 600 vaddr_t, vaddr_t, vaddr_t, int, 601 struct pv_entry **); 602 #define PMAP_REMOVE_ALL 0 /* remove all mappings */ 603 #define PMAP_REMOVE_SKIPWIRED 1 /* skip wired mappings */ 604 605 static void pmap_unmap_ptes(struct pmap *, struct pmap *); 606 static bool pmap_get_physpage(vaddr_t, int, paddr_t *); 607 static int pmap_pdes_invalid(vaddr_t, pd_entry_t * const *, 608 pd_entry_t *); 609 #define pmap_pdes_valid(va, pdes, lastpde) \ 610 (pmap_pdes_invalid((va), (pdes), (lastpde)) == 0) 611 static void pmap_alloc_level(pd_entry_t * const *, vaddr_t, int, 612 long *); 613 614 static bool pmap_reactivate(struct pmap *); 615 616 /* 617 * p m a p h e l p e r f u n c t i o n s 618 */ 619 620 static inline void 621 pmap_stats_update(struct pmap *pmap, int resid_diff, int wired_diff) 622 { 623 624 if (pmap == pmap_kernel()) { 625 atomic_add_long(&pmap->pm_stats.resident_count, resid_diff); 626 atomic_add_long(&pmap->pm_stats.wired_count, wired_diff); 627 } else { 628 KASSERT(mutex_owned(&pmap->pm_lock)); 629 pmap->pm_stats.resident_count += resid_diff; 630 pmap->pm_stats.wired_count += wired_diff; 631 } 632 } 633 634 static inline void 635 pmap_stats_update_bypte(struct pmap *pmap, pt_entry_t npte, pt_entry_t opte) 636 { 637 int resid_diff = ((npte & PG_V) ? 
1 : 0) - ((opte & PG_V) ? 1 : 0); 638 int wired_diff = ((npte & PG_W) ? 1 : 0) - ((opte & PG_W) ? 1 : 0); 639 640 KASSERT((npte & (PG_V | PG_W)) != PG_W); 641 KASSERT((opte & (PG_V | PG_W)) != PG_W); 642 643 pmap_stats_update(pmap, resid_diff, wired_diff); 644 } 645 646 /* 647 * ptp_to_pmap: lookup pmap by ptp 648 */ 649 650 static struct pmap * 651 ptp_to_pmap(struct vm_page *ptp) 652 { 653 struct pmap *pmap; 654 655 if (ptp == NULL) { 656 return pmap_kernel(); 657 } 658 pmap = (struct pmap *)ptp->uobject; 659 KASSERT(pmap != NULL); 660 KASSERT(&pmap->pm_obj[0] == ptp->uobject); 661 return pmap; 662 } 663 664 static inline struct pv_pte * 665 pve_to_pvpte(struct pv_entry *pve) 666 { 667 668 KASSERT((void *)&pve->pve_pte == (void *)pve); 669 return &pve->pve_pte; 670 } 671 672 static inline struct pv_entry * 673 pvpte_to_pve(struct pv_pte *pvpte) 674 { 675 struct pv_entry *pve = (void *)pvpte; 676 677 KASSERT(pve_to_pvpte(pve) == pvpte); 678 return pve; 679 } 680 681 /* 682 * pv_pte_first, pv_pte_next: PV list iterator. 683 */ 684 685 static struct pv_pte * 686 pv_pte_first(struct pmap_page *pp) 687 { 688 689 KASSERT(pp_locked(pp)); 690 if ((pp->pp_flags & PP_EMBEDDED) != 0) { 691 return &pp->pp_pte; 692 } 693 return pve_to_pvpte(LIST_FIRST(&pp->pp_head.pvh_list)); 694 } 695 696 static struct pv_pte * 697 pv_pte_next(struct pmap_page *pp, struct pv_pte *pvpte) 698 { 699 700 KASSERT(pvpte != NULL); 701 KASSERT(pp_locked(pp)); 702 if (pvpte == &pp->pp_pte) { 703 KASSERT((pp->pp_flags & PP_EMBEDDED) != 0); 704 return NULL; 705 } 706 KASSERT((pp->pp_flags & PP_EMBEDDED) == 0); 707 return pve_to_pvpte(LIST_NEXT(pvpte_to_pve(pvpte), pve_list)); 708 } 709 710 /* 711 * pmap_is_curpmap: is this pmap the one currently loaded [in %cr3]? 712 * of course the kernel is always loaded 713 */ 714 715 inline static bool 716 pmap_is_curpmap(struct pmap *pmap) 717 { 718 #if defined(XEN) && defined(__x86_64__) 719 /* 720 * Only kernel pmap is physically loaded. 721 * User PGD may be active, but TLB will be flushed 722 * with HYPERVISOR_iret anyway, so let's say no 723 */ 724 return(pmap == pmap_kernel()); 725 #else /* XEN && __x86_64__*/ 726 return((pmap == pmap_kernel()) || 727 (pmap == curcpu()->ci_pmap)); 728 #endif 729 } 730 731 /* 732 * pmap_is_active: is this pmap loaded into the specified processor's %cr3? 733 */ 734 735 inline static bool 736 pmap_is_active(struct pmap *pmap, struct cpu_info *ci, bool kernel) 737 { 738 739 return (pmap == pmap_kernel() || 740 (pmap->pm_cpus & ci->ci_cpumask) != 0 || 741 (kernel && (pmap->pm_kernel_cpus & ci->ci_cpumask) != 0)); 742 } 743 744 static void 745 pmap_apte_flush(struct pmap *pmap) 746 { 747 748 KASSERT(kpreempt_disabled()); 749 750 /* 751 * Flush the APTE mapping from all other CPUs that 752 * are using the pmap we are using (who's APTE space 753 * is the one we've just modified). 754 * 755 * XXXthorpej -- find a way to defer the IPI. 756 */ 757 pmap_tlb_shootdown(pmap, (vaddr_t)-1LL, 0, 0); 758 pmap_tlb_shootwait(); 759 } 760 761 /* 762 * Add a reference to the specified pmap. 
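 * The reference is dropped with pmap_destroy(). pmap_map_ptes() below,
 * for example, takes a reference on the currently loaded pmap and
 * pmap_unmap_ptes() releases it once the alternate PTE space has been
 * torn down.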
763 */ 764 765 inline void 766 pmap_reference(struct pmap *pmap) 767 { 768 769 atomic_inc_uint((unsigned *)&pmap->pm_obj[0].uo_refs); 770 } 771 772 /* 773 * pmap_map_ptes: map a pmap's PTEs into KVM and lock them in 774 * 775 * => we lock enough pmaps to keep things locked in 776 * => must be undone with pmap_unmap_ptes before returning 777 */ 778 779 static void 780 pmap_map_ptes(struct pmap *pmap, struct pmap **pmap2, 781 pd_entry_t **ptepp, pd_entry_t * const **pdeppp) 782 { 783 pd_entry_t opde, npde; 784 struct pmap *ourpmap; 785 struct cpu_info *ci; 786 struct lwp *l; 787 bool iscurrent; 788 uint64_t ncsw; 789 #ifdef XEN 790 int s; 791 #endif 792 793 /* the kernel's pmap is always accessible */ 794 if (pmap == pmap_kernel()) { 795 *pmap2 = NULL; 796 *ptepp = PTE_BASE; 797 *pdeppp = normal_pdes; 798 return; 799 } 800 KASSERT(kpreempt_disabled()); 801 802 retry: 803 l = curlwp; 804 ncsw = l->l_ncsw; 805 ourpmap = NULL; 806 ci = curcpu(); 807 #if defined(XEN) && defined(__x86_64__) 808 /* 809 * curmap can only be pmap_kernel so at this point 810 * pmap_is_curpmap is always false 811 */ 812 iscurrent = 0; 813 ourpmap = pmap_kernel(); 814 #else /* XEN && __x86_64__*/ 815 if (ci->ci_want_pmapload && 816 vm_map_pmap(&l->l_proc->p_vmspace->vm_map) == pmap) { 817 pmap_load(); 818 if (l->l_ncsw != ncsw) 819 goto retry; 820 } 821 iscurrent = pmap_is_curpmap(pmap); 822 /* if curpmap then we are always mapped */ 823 if (iscurrent) { 824 mutex_enter(&pmap->pm_lock); 825 *pmap2 = NULL; 826 *ptepp = PTE_BASE; 827 *pdeppp = normal_pdes; 828 goto out; 829 } 830 ourpmap = ci->ci_pmap; 831 #endif /* XEN && __x86_64__ */ 832 833 /* need to lock both curpmap and pmap: use ordered locking */ 834 pmap_reference(ourpmap); 835 if ((uintptr_t) pmap < (uintptr_t) ourpmap) { 836 mutex_enter(&pmap->pm_lock); 837 mutex_enter(&ourpmap->pm_lock); 838 } else { 839 mutex_enter(&ourpmap->pm_lock); 840 mutex_enter(&pmap->pm_lock); 841 } 842 843 if (l->l_ncsw != ncsw) 844 goto unlock_and_retry; 845 846 /* need to load a new alternate pt space into curpmap? */ 847 COUNT(apdp_pde_map); 848 opde = *APDP_PDE; 849 #ifdef XEN 850 if (!pmap_valid_entry(opde) || 851 pmap_pte2pa(opde) != pmap_pdirpa(pmap, 0)) { 852 int i; 853 s = splvm(); 854 /* Make recursive entry usable in user PGD */ 855 for (i = 0; i < PDP_SIZE; i++) { 856 npde = pmap_pa2pte( 857 pmap_pdirpa(pmap, i * NPDPG)) | PG_k | PG_V; 858 xpq_queue_pte_update( 859 xpmap_ptom(pmap_pdirpa(pmap, PDIR_SLOT_PTE + i)), 860 npde); 861 xpq_queue_pte_update(xpmap_ptetomach(&APDP_PDE[i]), 862 npde); 863 #ifdef PAE 864 /* update shadow entry too */ 865 xpq_queue_pte_update( 866 xpmap_ptetomach(&APDP_PDE_SHADOW[i]), npde); 867 #endif /* PAE */ 868 xpq_queue_invlpg( 869 (vaddr_t)&pmap->pm_pdir[PDIR_SLOT_PTE + i]); 870 } 871 xpq_flush_queue(); 872 if (pmap_valid_entry(opde)) 873 pmap_apte_flush(ourpmap); 874 splx(s); 875 } 876 #else /* XEN */ 877 npde = pmap_pa2pte(pmap_pdirpa(pmap, 0)) | PG_RW | PG_V; 878 if (!pmap_valid_entry(opde) || 879 pmap_pte2pa(opde) != pmap_pdirpa(pmap, 0)) { 880 pmap_pte_set(APDP_PDE, npde); 881 pmap_pte_flush(); 882 if (pmap_valid_entry(opde)) 883 pmap_apte_flush(ourpmap); 884 } 885 #endif /* XEN */ 886 *pmap2 = ourpmap; 887 *ptepp = APTE_BASE; 888 *pdeppp = alternate_pdes; 889 KASSERT(l->l_ncsw == ncsw); 890 #if !defined(XEN) || !defined(__x86_64__) 891 out: 892 #endif 893 /* 894 * might have blocked, need to retry? 
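	 * (l_ncsw is the LWP's context switch counter: if it changed, we
	 * blocked somewhere above, e.g. in mutex_enter(), and the cpu's
	 * notion of the currently loaded pmap may have changed, so
	 * unwind the locks and start over.)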
895 */ 896 if (l->l_ncsw != ncsw) { 897 unlock_and_retry: 898 if (ourpmap != NULL) { 899 mutex_exit(&ourpmap->pm_lock); 900 pmap_destroy(ourpmap); 901 } 902 mutex_exit(&pmap->pm_lock); 903 goto retry; 904 } 905 906 return; 907 } 908 909 /* 910 * pmap_unmap_ptes: unlock the PTE mapping of "pmap" 911 */ 912 913 static void 914 pmap_unmap_ptes(struct pmap *pmap, struct pmap *pmap2) 915 { 916 917 if (pmap == pmap_kernel()) { 918 return; 919 } 920 KASSERT(kpreempt_disabled()); 921 if (pmap2 == NULL) { 922 mutex_exit(&pmap->pm_lock); 923 } else { 924 #if defined(XEN) && defined(__x86_64__) 925 KASSERT(pmap2 == pmap_kernel()); 926 #else 927 KASSERT(curcpu()->ci_pmap == pmap2); 928 #endif 929 #if defined(MULTIPROCESSOR) 930 pmap_pte_set(APDP_PDE, 0); 931 pmap_pte_flush(); 932 pmap_apte_flush(pmap2); 933 #endif 934 COUNT(apdp_pde_unmap); 935 mutex_exit(&pmap->pm_lock); 936 mutex_exit(&pmap2->pm_lock); 937 pmap_destroy(pmap2); 938 } 939 } 940 941 inline static void 942 pmap_exec_account(struct pmap *pm, vaddr_t va, pt_entry_t opte, pt_entry_t npte) 943 { 944 945 #if !defined(__x86_64__) 946 if (curproc == NULL || curproc->p_vmspace == NULL || 947 pm != vm_map_pmap(&curproc->p_vmspace->vm_map)) 948 return; 949 950 if ((opte ^ npte) & PG_X) 951 pmap_update_pg(va); 952 953 /* 954 * Executability was removed on the last executable change. 955 * Reset the code segment to something conservative and 956 * let the trap handler deal with setting the right limit. 957 * We can't do that because of locking constraints on the vm map. 958 */ 959 960 if ((opte & PG_X) && (npte & PG_X) == 0 && va == pm->pm_hiexec) { 961 struct trapframe *tf = curlwp->l_md.md_regs; 962 963 tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL); 964 pm->pm_hiexec = I386_MAX_EXE_ADDR; 965 } 966 #endif /* !defined(__x86_64__) */ 967 } 968 969 #if !defined(__x86_64__) 970 /* 971 * Fixup the code segment to cover all potential executable mappings. 972 * returns 0 if no changes to the code segment were made. 973 */ 974 975 int 976 pmap_exec_fixup(struct vm_map *map, struct trapframe *tf, struct pcb *pcb) 977 { 978 struct vm_map_entry *ent; 979 struct pmap *pm = vm_map_pmap(map); 980 vaddr_t va = 0; 981 982 vm_map_lock_read(map); 983 for (ent = (&map->header)->next; ent != &map->header; ent = ent->next) { 984 985 /* 986 * This entry has greater va than the entries before. 987 * We need to make it point to the last page, not past it. 988 */ 989 990 if (ent->protection & VM_PROT_EXECUTE) 991 va = trunc_page(ent->end) - PAGE_SIZE; 992 } 993 vm_map_unlock_read(map); 994 if (va == pm->pm_hiexec && tf->tf_cs == GSEL(GUCODEBIG_SEL, SEL_UPL)) 995 return (0); 996 997 pm->pm_hiexec = va; 998 if (pm->pm_hiexec > I386_MAX_EXE_ADDR) { 999 tf->tf_cs = GSEL(GUCODEBIG_SEL, SEL_UPL); 1000 } else { 1001 tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL); 1002 return (0); 1003 } 1004 return (1); 1005 } 1006 #endif /* !defined(__x86_64__) */ 1007 1008 /* 1009 * p m a p k e n t e r f u n c t i o n s 1010 * 1011 * functions to quickly enter/remove pages from the kernel address 1012 * space. pmap_kremove is exported to MI kernel. we make use of 1013 * the recursive PTE mappings. 
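 *
 * a minimal usage sketch, assuming the caller already owns the KVA
 * (e.g. from uvm_km_alloc() with UVM_KMF_VAONLY):
 *
 *	pmap_kenter_pa(va, pa, VM_PROT_READ | VM_PROT_WRITE);
 *	pmap_update(pmap_kernel());
 *	...use the mapping...
 *	pmap_kremove(va, PAGE_SIZE);
 *	pmap_update(pmap_kernel());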
1014 */ 1015 1016 /* 1017 * pmap_kenter_pa: enter a kernel mapping without R/M (pv_entry) tracking 1018 * 1019 * => no need to lock anything, assume va is already allocated 1020 * => should be faster than normal pmap enter function 1021 */ 1022 1023 void 1024 pmap_kenter_pa(vaddr_t va, paddr_t pa, vm_prot_t prot) 1025 { 1026 pt_entry_t *pte, opte, npte; 1027 1028 KASSERT(!(prot & ~VM_PROT_ALL)); 1029 1030 if (va < VM_MIN_KERNEL_ADDRESS) 1031 pte = vtopte(va); 1032 else 1033 pte = kvtopte(va); 1034 #ifdef DOM0OPS 1035 if (pa < pmap_pa_start || pa >= pmap_pa_end) { 1036 #ifdef DEBUG 1037 printk("pmap_kenter_pa: pa 0x%" PRIx64 " for va 0x%" PRIx64 1038 " outside range\n", (int64_t)pa, (int64_t)va); 1039 #endif /* DEBUG */ 1040 npte = pa; 1041 } else 1042 #endif /* DOM0OPS */ 1043 npte = pmap_pa2pte(pa); 1044 npte |= protection_codes[prot] | PG_k | PG_V | pmap_pg_g; 1045 opte = pmap_pte_testset(pte, npte); /* zap! */ 1046 #if defined(DIAGNOSTIC) 1047 /* XXX For now... */ 1048 if (opte & PG_PS) 1049 panic("pmap_kenter_pa: PG_PS"); 1050 #endif 1051 if ((opte & (PG_V | PG_U)) == (PG_V | PG_U)) { 1052 /* This should not happen, so no need to batch updates. */ 1053 kpreempt_disable(); 1054 pmap_tlb_shootdown(pmap_kernel(), va, 0, opte); 1055 kpreempt_enable(); 1056 } 1057 } 1058 1059 #ifdef XEN 1060 /* 1061 * pmap_kenter_ma: enter a kernel mapping without R/M (pv_entry) tracking 1062 * 1063 * => no need to lock anything, assume va is already allocated 1064 * => should be faster than normal pmap enter function 1065 * => we expect a MACHINE address 1066 */ 1067 1068 void 1069 pmap_kenter_ma(vaddr_t va, paddr_t ma, vm_prot_t prot) 1070 { 1071 pt_entry_t *pte, opte, npte; 1072 1073 if (va < VM_MIN_KERNEL_ADDRESS) 1074 pte = vtopte(va); 1075 else 1076 pte = kvtopte(va); 1077 1078 npte = ma | ((prot & VM_PROT_WRITE) ? PG_RW : PG_RO) | 1079 PG_V | PG_k; 1080 #ifndef XEN 1081 if ((cpu_feature & CPUID_NOX) && !(prot & VM_PROT_EXECUTE)) 1082 npte |= PG_NX; 1083 #endif 1084 opte = pmap_pte_testset (pte, npte); /* zap! */ 1085 1086 if (pmap_valid_entry(opte)) { 1087 #if defined(MULTIPROCESSOR) 1088 kpreempt_disable(); 1089 pmap_tlb_shootdown(pmap_kernel(), va, 0, opte); 1090 kpreempt_enable(); 1091 #else 1092 /* Don't bother deferring in the single CPU case. */ 1093 pmap_update_pg(va); 1094 #endif 1095 } 1096 } 1097 #endif /* XEN */ 1098 1099 #if defined(__x86_64__) 1100 /* 1101 * Change protection for a virtual address. Local for a CPU only, don't 1102 * care about TLB shootdowns. 
1103 * 1104 * => must be called with preemption disabled 1105 */ 1106 void 1107 pmap_changeprot_local(vaddr_t va, vm_prot_t prot) 1108 { 1109 pt_entry_t *pte, opte, npte; 1110 1111 KASSERT(kpreempt_disabled()); 1112 1113 if (va < VM_MIN_KERNEL_ADDRESS) 1114 pte = vtopte(va); 1115 else 1116 pte = kvtopte(va); 1117 1118 npte = opte = *pte; 1119 1120 if ((prot & VM_PROT_WRITE) != 0) 1121 npte |= PG_RW; 1122 else 1123 npte &= ~PG_RW; 1124 1125 if (opte != npte) { 1126 pmap_pte_set(pte, npte); 1127 pmap_pte_flush(); 1128 invlpg(va); 1129 } 1130 } 1131 #endif /* defined(__x86_64__) */ 1132 1133 /* 1134 * pmap_kremove: remove a kernel mapping(s) without R/M (pv_entry) tracking 1135 * 1136 * => no need to lock anything 1137 * => caller must dispose of any vm_page mapped in the va range 1138 * => note: not an inline function 1139 * => we assume the va is page aligned and the len is a multiple of PAGE_SIZE 1140 * => we assume kernel only unmaps valid addresses and thus don't bother 1141 * checking the valid bit before doing TLB flushing 1142 * => must be followed by call to pmap_update() before reuse of page 1143 */ 1144 1145 void 1146 pmap_kremove(vaddr_t sva, vsize_t len) 1147 { 1148 pt_entry_t *pte, xpte; 1149 vaddr_t va, eva; 1150 1151 eva = sva + len; 1152 xpte = 0; 1153 1154 for (va = sva; va < eva; va += PAGE_SIZE) { 1155 if (va < VM_MIN_KERNEL_ADDRESS) 1156 pte = vtopte(va); 1157 else 1158 pte = kvtopte(va); 1159 xpte |= pmap_pte_testset(pte, 0); /* zap! */ 1160 #if defined(DIAGNOSTIC) 1161 /* XXX For now... */ 1162 if (xpte & PG_PS) 1163 panic("pmap_kremove: PG_PS"); 1164 if (xpte & PG_PVLIST) 1165 panic("pmap_kremove: PG_PVLIST mapping for 0x%lx", 1166 va); 1167 #endif 1168 } 1169 if ((xpte & (PG_V | PG_U)) == (PG_V | PG_U)) { 1170 kpreempt_disable(); 1171 pmap_tlb_shootdown(pmap_kernel(), sva, eva, xpte); 1172 kpreempt_enable(); 1173 } 1174 } 1175 1176 /* 1177 * p m a p i n i t f u n c t i o n s 1178 * 1179 * pmap_bootstrap and pmap_init are called during system startup 1180 * to init the pmap module. pmap_bootstrap() does a low level 1181 * init just to get things rolling. pmap_init() finishes the job. 1182 */ 1183 1184 /* 1185 * pmap_bootstrap: get the system in a state where it can run with VM 1186 * properly enabled (called before main()). the VM system is 1187 * fully init'd later... 1188 * 1189 * => on i386, locore.s has already enabled the MMU by allocating 1190 * a PDP for the kernel, and nkpde PTP's for the kernel. 1191 * => kva_start is the first free virtual address in kernel space 1192 */ 1193 1194 void 1195 pmap_bootstrap(vaddr_t kva_start) 1196 { 1197 struct pmap *kpm; 1198 pt_entry_t *pte; 1199 int i; 1200 vaddr_t kva; 1201 #ifdef XEN 1202 pt_entry_t pg_nx = 0; 1203 #else 1204 unsigned long p1i; 1205 vaddr_t kva_end; 1206 pt_entry_t pg_nx = (cpu_feature & CPUID_NOX ? PG_NX : 0); 1207 #endif 1208 1209 /* 1210 * set up our local static global vars that keep track of the 1211 * usage of KVM before kernel_map is set up 1212 */ 1213 1214 virtual_avail = kva_start; /* first free KVA */ 1215 virtual_end = VM_MAX_KERNEL_ADDRESS; /* last KVA */ 1216 1217 /* 1218 * set up protection_codes: we need to be able to convert from 1219 * a MI protection code (some combo of VM_PROT...) to something 1220 * we can jam into a i386 PTE. 
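 *
 * for example, pmap_kenter_pa() above builds its PTE roughly as
 *
 *	npte = pmap_pa2pte(pa) | protection_codes[prot] | PG_V;
 *
 * so a VM_PROT_READ|VM_PROT_WRITE request picks up PG_RW (plus PG_NX,
 * when the CPU supports it, for a non-executable mapping).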
1221 */ 1222 1223 protection_codes[VM_PROT_NONE] = pg_nx; /* --- */ 1224 protection_codes[VM_PROT_EXECUTE] = PG_RO | PG_X; /* --x */ 1225 protection_codes[VM_PROT_READ] = PG_RO | pg_nx; /* -r- */ 1226 protection_codes[VM_PROT_READ|VM_PROT_EXECUTE] = PG_RO | PG_X;/* -rx */ 1227 protection_codes[VM_PROT_WRITE] = PG_RW | pg_nx; /* w-- */ 1228 protection_codes[VM_PROT_WRITE|VM_PROT_EXECUTE] = PG_RW | PG_X;/* w-x */ 1229 protection_codes[VM_PROT_WRITE|VM_PROT_READ] = PG_RW | pg_nx; 1230 /* wr- */ 1231 protection_codes[VM_PROT_ALL] = PG_RW | PG_X; /* wrx */ 1232 1233 /* 1234 * now we init the kernel's pmap 1235 * 1236 * the kernel pmap's pm_obj is not used for much. however, in 1237 * user pmaps the pm_obj contains the list of active PTPs. 1238 * the pm_obj currently does not have a pager. it might be possible 1239 * to add a pager that would allow a process to read-only mmap its 1240 * own page tables (fast user level vtophys?). this may or may not 1241 * be useful. 1242 */ 1243 1244 kpm = pmap_kernel(); 1245 for (i = 0; i < PTP_LEVELS - 1; i++) { 1246 UVM_OBJ_INIT(&kpm->pm_obj[i], NULL, 1); 1247 kpm->pm_ptphint[i] = NULL; 1248 } 1249 memset(&kpm->pm_list, 0, sizeof(kpm->pm_list)); /* pm_list not used */ 1250 kpm->pm_pdir = (pd_entry_t *)(lwp0.l_addr->u_pcb.pcb_cr3 + KERNBASE); 1251 #ifdef PAE 1252 for (i = 0; i < PDP_SIZE; i++) 1253 kpm->pm_pdirpa[i] = 1254 (paddr_t)lwp0.l_addr->u_pcb.pcb_cr3 + PAGE_SIZE * i; 1255 #else 1256 kpm->pm_pdirpa = (paddr_t) lwp0.l_addr->u_pcb.pcb_cr3; 1257 #endif 1258 kpm->pm_stats.wired_count = kpm->pm_stats.resident_count = 1259 x86_btop(kva_start - VM_MIN_KERNEL_ADDRESS); 1260 1261 /* 1262 * the above is just a rough estimate and not critical to the proper 1263 * operation of the system. 1264 */ 1265 1266 #ifndef XEN 1267 /* 1268 * Begin to enable global TLB entries if they are supported. 1269 * The G bit has no effect until the CR4_PGE bit is set in CR4, 1270 * which happens in cpu_init(), which is run on each cpu 1271 * (and happens later) 1272 */ 1273 1274 if (cpu_feature & CPUID_PGE) { 1275 pmap_pg_g = PG_G; /* enable software */ 1276 1277 /* add PG_G attribute to already mapped kernel pages */ 1278 if (KERNBASE == VM_MIN_KERNEL_ADDRESS) { 1279 kva_end = virtual_avail; 1280 } else { 1281 extern vaddr_t eblob, esym; 1282 kva_end = (vaddr_t)&end; 1283 if (esym > kva_end) 1284 kva_end = esym; 1285 if (eblob > kva_end) 1286 kva_end = eblob; 1287 kva_end = roundup(kva_end, PAGE_SIZE); 1288 } 1289 for (kva = KERNBASE; kva < kva_end; kva += PAGE_SIZE) { 1290 p1i = pl1_i(kva); 1291 if (pmap_valid_entry(PTE_BASE[p1i])) 1292 PTE_BASE[p1i] |= PG_G; 1293 } 1294 } 1295 1296 /* 1297 * enable large pages if they are supported. 1298 */ 1299 1300 if (cpu_feature & CPUID_PSE) { 1301 paddr_t pa; 1302 pd_entry_t *pde; 1303 extern char __data_start; 1304 1305 lcr4(rcr4() | CR4_PSE); /* enable hardware (via %cr4) */ 1306 pmap_largepages = 1; /* enable software */ 1307 1308 /* 1309 * the TLB must be flushed after enabling large pages 1310 * on Pentium CPUs, according to section 3.6.2.2 of 1311 * "Intel Architecture Software Developer's Manual, 1312 * Volume 3: System Programming". 1313 */ 1314 tlbflush(); 1315 1316 /* 1317 * now, remap the kernel text using large pages. we 1318 * assume that the linker has properly aligned the 1319 * .data segment to a NBPD_L2 boundary. 
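 *
 * each iteration of the loop below replaces one L2 PDE with a PG_PS
 * mapping covering NBPD_L2 bytes (2MB with PAE/amd64, 4MB otherwise),
 * so e.g. a 5MB kernel text ends up as two 2MB large pages plus a
 * tail of ordinary 4KB PTEs (cf. the DEBUG printf below).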
1320 */ 1321 kva_end = rounddown((vaddr_t)&__data_start, NBPD_L1); 1322 for (pa = 0, kva = KERNBASE; kva + NBPD_L2 <= kva_end; 1323 kva += NBPD_L2, pa += NBPD_L2) { 1324 pde = &L2_BASE[pl2_i(kva)]; 1325 *pde = pa | pmap_pg_g | PG_PS | 1326 PG_KR | PG_V; /* zap! */ 1327 tlbflush(); 1328 } 1329 #if defined(DEBUG) 1330 printf("kernel text is mapped with " 1331 "%lu large pages and %lu normal pages\n", 1332 (unsigned long)howmany(kva - KERNBASE, NBPD_L2), 1333 (unsigned long)howmany((vaddr_t)&__data_start - kva, 1334 NBPD_L1)); 1335 #endif /* defined(DEBUG) */ 1336 } 1337 #endif /* !XEN */ 1338 1339 if (VM_MIN_KERNEL_ADDRESS != KERNBASE) { 1340 /* 1341 * zero_pte is stuck at the end of mapped space for the kernel 1342 * image (disjunct from kva space). This is done so that it 1343 * can safely be used in pmap_growkernel (pmap_get_physpage), 1344 * when it's called for the first time. 1345 * XXXfvdl fix this for MULTIPROCESSOR later. 1346 */ 1347 1348 early_zerop = (void *)(KERNBASE + NKL2_KIMG_ENTRIES * NBPD_L2); 1349 early_zero_pte = PTE_BASE + pl1_i((unsigned long)early_zerop); 1350 } 1351 1352 /* 1353 * now we allocate the "special" VAs which are used for tmp mappings 1354 * by the pmap (and other modules). we allocate the VAs by advancing 1355 * virtual_avail (note that there are no pages mapped at these VAs). 1356 * we find the PTE that maps the allocated VA via the linear PTE 1357 * mapping. 1358 */ 1359 1360 pte = PTE_BASE + pl1_i(virtual_avail); 1361 1362 #ifdef MULTIPROCESSOR 1363 /* 1364 * Waste some VA space to avoid false sharing of cache lines 1365 * for page table pages: Give each possible CPU a cache line 1366 * of PTE's (8) to play with, though we only need 4. We could 1367 * recycle some of this waste by putting the idle stacks here 1368 * as well; we could waste less space if we knew the largest 1369 * CPU ID beforehand. 1370 */ 1371 csrcp = (char *) virtual_avail; csrc_pte = pte; 1372 1373 cdstp = (char *) virtual_avail+PAGE_SIZE; cdst_pte = pte+1; 1374 1375 zerop = (char *) virtual_avail+PAGE_SIZE*2; zero_pte = pte+2; 1376 1377 ptpp = (char *) virtual_avail+PAGE_SIZE*3; ptp_pte = pte+3; 1378 1379 virtual_avail += PAGE_SIZE * maxcpus * NPTECL; 1380 pte += maxcpus * NPTECL; 1381 #else 1382 csrcp = (void *) virtual_avail; csrc_pte = pte; /* allocate */ 1383 virtual_avail += PAGE_SIZE; pte++; /* advance */ 1384 1385 cdstp = (void *) virtual_avail; cdst_pte = pte; 1386 virtual_avail += PAGE_SIZE; pte++; 1387 1388 zerop = (void *) virtual_avail; zero_pte = pte; 1389 virtual_avail += PAGE_SIZE; pte++; 1390 1391 ptpp = (void *) virtual_avail; ptp_pte = pte; 1392 virtual_avail += PAGE_SIZE; pte++; 1393 #endif 1394 1395 if (VM_MIN_KERNEL_ADDRESS == KERNBASE) { 1396 early_zerop = zerop; 1397 early_zero_pte = zero_pte; 1398 } 1399 1400 /* 1401 * Nothing after this point actually needs pte; 1402 */ 1403 pte = (void *)0xdeadbeef; 1404 1405 /* XXX: vmmap used by mem.c... should be uvm_map_reserve */ 1406 /* XXXfvdl PTEs not needed here */ 1407 vmmap = (char *)virtual_avail; /* don't need pte */ 1408 virtual_avail += PAGE_SIZE; pte++; 1409 1410 #ifdef XEN 1411 #ifdef __x86_64__ 1412 /* 1413 * We want a dummy page directory for Xen: 1414 * when deactivate a pmap, Xen will still consider it active. 1415 * So we set user PGD to this one to lift all protection on 1416 * the now inactive page tables set. 
1417 */ 1418 xen_dummy_user_pgd = avail_start; 1419 avail_start += PAGE_SIZE; 1420 1421 /* Zero fill it, the less checks in Xen it requires the better */ 1422 memset((void *) (xen_dummy_user_pgd + KERNBASE), 0, PAGE_SIZE); 1423 /* Mark read-only */ 1424 HYPERVISOR_update_va_mapping(xen_dummy_user_pgd + KERNBASE, 1425 pmap_pa2pte(xen_dummy_user_pgd) | PG_u | PG_V, UVMF_INVLPG); 1426 /* Pin as L4 */ 1427 xpq_queue_pin_table(xpmap_ptom_masked(xen_dummy_user_pgd)); 1428 #endif /* __x86_64__ */ 1429 idt_vaddr = virtual_avail; /* don't need pte */ 1430 idt_paddr = avail_start; /* steal a page */ 1431 /* 1432 * Xen require one more page as we can't store 1433 * GDT and LDT on the same page 1434 */ 1435 virtual_avail += 3 * PAGE_SIZE; 1436 avail_start += 3 * PAGE_SIZE; 1437 #else /* XEN */ 1438 idt_vaddr = virtual_avail; /* don't need pte */ 1439 idt_paddr = avail_start; /* steal a page */ 1440 #if defined(__x86_64__) 1441 virtual_avail += 2 * PAGE_SIZE; pte += 2; 1442 avail_start += 2 * PAGE_SIZE; 1443 #else /* defined(__x86_64__) */ 1444 virtual_avail += PAGE_SIZE; pte++; 1445 avail_start += PAGE_SIZE; 1446 /* pentium f00f bug stuff */ 1447 pentium_idt_vaddr = virtual_avail; /* don't need pte */ 1448 virtual_avail += PAGE_SIZE; pte++; 1449 #endif /* defined(__x86_64__) */ 1450 #endif /* XEN */ 1451 1452 #ifdef _LP64 1453 /* 1454 * Grab a page below 4G for things that need it (i.e. 1455 * having an initial %cr3 for the MP trampoline). 1456 */ 1457 lo32_vaddr = virtual_avail; 1458 virtual_avail += PAGE_SIZE; pte++; 1459 lo32_paddr = avail_start; 1460 avail_start += PAGE_SIZE; 1461 #endif 1462 1463 /* 1464 * now we reserve some VM for mapping pages when doing a crash dump 1465 */ 1466 1467 virtual_avail = reserve_dumppages(virtual_avail); 1468 1469 /* 1470 * init the static-global locks and global lists. 1471 * 1472 * => pventry::pvh_lock (initialized elsewhere) must also be 1473 * a spin lock, again at IPL_VM to prevent deadlock, and 1474 * again is never taken from interrupt context. 1475 */ 1476 1477 mutex_init(&pmaps_lock, MUTEX_DEFAULT, IPL_NONE); 1478 LIST_INIT(&pmaps); 1479 pmap_cpu_init_early(curcpu()); 1480 1481 /* 1482 * initialize caches. 1483 */ 1484 1485 pool_cache_bootstrap(&pmap_cache, sizeof(struct pmap), 0, 0, 0, 1486 "pmappl", NULL, IPL_NONE, NULL, NULL, NULL); 1487 #ifdef PAE 1488 pool_cache_bootstrap(&pmap_pdp_cache, PAGE_SIZE * PDP_SIZE, 0, 0, 0, 1489 "pdppl", &pmap_pdp_allocator, IPL_NONE, 1490 pmap_pdp_ctor, pmap_pdp_dtor, NULL); 1491 #else /* PAE */ 1492 pool_cache_bootstrap(&pmap_pdp_cache, PAGE_SIZE, 0, 0, 0, 1493 "pdppl", NULL, IPL_NONE, pmap_pdp_ctor, pmap_pdp_dtor, NULL); 1494 #endif /* PAE */ 1495 pool_cache_bootstrap(&pmap_pv_cache, sizeof(struct pv_entry), 0, 0, 1496 PR_LARGECACHE, "pvpl", &pool_allocator_meta, IPL_NONE, NULL, 1497 NULL, NULL); 1498 1499 /* 1500 * ensure the TLB is sync'd with reality by flushing it... 1501 */ 1502 1503 tlbflush(); 1504 1505 /* 1506 * calculate pmap_maxkvaddr from nkptp[]. 1507 */ 1508 1509 kva = VM_MIN_KERNEL_ADDRESS; 1510 for (i = PTP_LEVELS - 1; i >= 1; i--) { 1511 kva += nkptp[i] * nbpd[i]; 1512 } 1513 pmap_maxkvaddr = kva; 1514 } 1515 1516 #if defined(__x86_64__) 1517 /* 1518 * Pre-allocate PTPs for low memory, so that 1:1 mappings for various 1519 * trampoline code can be entered. 
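 *
 * in short (non-Xen case): for each level from the top down to L2 the
 * loop steals a page at avail_start, zeroes it through early_zerop and
 * wires it in at index pl_i(0, level), so that VA 0 has page table
 * pages at every level before the trampoline's 1:1 mappings are made.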
1520 */ 1521 void 1522 pmap_prealloc_lowmem_ptps(void) 1523 { 1524 #ifdef XEN 1525 int level; 1526 paddr_t newp; 1527 paddr_t pdes_pa; 1528 1529 pdes_pa = pmap_kernel()->pm_pdirpa; 1530 level = PTP_LEVELS; 1531 for (;;) { 1532 newp = avail_start; 1533 avail_start += PAGE_SIZE; 1534 HYPERVISOR_update_va_mapping ((vaddr_t)early_zerop, 1535 xpmap_ptom_masked(newp) | PG_u | PG_V | PG_RW, UVMF_INVLPG); 1536 memset((void *)early_zerop, 0, PAGE_SIZE); 1537 /* Mark R/O before installing */ 1538 HYPERVISOR_update_va_mapping ((vaddr_t)early_zerop, 1539 xpmap_ptom_masked(newp) | PG_u | PG_V, UVMF_INVLPG); 1540 if (newp < (NKL2_KIMG_ENTRIES * NBPD_L2)) 1541 HYPERVISOR_update_va_mapping (newp + KERNBASE, 1542 xpmap_ptom_masked(newp) | PG_u | PG_V, UVMF_INVLPG); 1543 xpq_queue_pte_update ( 1544 xpmap_ptom_masked(pdes_pa) 1545 + (pl_i(0, level) * sizeof (pd_entry_t)), 1546 xpmap_ptom_masked(newp) | PG_RW | PG_u | PG_V); 1547 level--; 1548 if (level <= 1) 1549 break; 1550 pdes_pa = newp; 1551 } 1552 #else /* XEN */ 1553 pd_entry_t *pdes; 1554 int level; 1555 paddr_t newp; 1556 1557 pdes = pmap_kernel()->pm_pdir; 1558 level = PTP_LEVELS; 1559 for (;;) { 1560 newp = avail_start; 1561 avail_start += PAGE_SIZE; 1562 *early_zero_pte = (newp & PG_FRAME) | PG_V | PG_RW; 1563 pmap_update_pg((vaddr_t)early_zerop); 1564 memset(early_zerop, 0, PAGE_SIZE); 1565 pdes[pl_i(0, level)] = (newp & PG_FRAME) | PG_V | PG_RW; 1566 level--; 1567 if (level <= 1) 1568 break; 1569 pdes = normal_pdes[level - 2]; 1570 } 1571 #endif /* XEN */ 1572 } 1573 #endif /* defined(__x86_64__) */ 1574 1575 /* 1576 * pmap_init: called from uvm_init, our job is to get the pmap 1577 * system ready to manage mappings... 1578 */ 1579 1580 void 1581 pmap_init(void) 1582 { 1583 int i; 1584 1585 for (i = 0; i < PV_HASH_SIZE; i++) { 1586 SLIST_INIT(&pv_hash_heads[i].hh_list); 1587 } 1588 for (i = 0; i < PV_HASH_LOCK_CNT; i++) { 1589 mutex_init(&pv_hash_locks[i].lock, MUTEX_NODEBUG, IPL_VM); 1590 } 1591 1592 /* 1593 * done: pmap module is up (and ready for business) 1594 */ 1595 1596 pmap_initialized = true; 1597 } 1598 1599 /* 1600 * pmap_cpu_init_early: perform early per-CPU initialization. 1601 */ 1602 1603 void 1604 pmap_cpu_init_early(struct cpu_info *ci) 1605 { 1606 struct pmap_cpu *pc; 1607 static uint8_t pmap_cpu_alloc; 1608 1609 pc = &pmap_cpu[pmap_cpu_alloc++].pc; 1610 ci->ci_pmap_cpu = pc; 1611 } 1612 1613 /* 1614 * pmap_cpu_init_late: perform late per-CPU initialization. 
1615 */ 1616 1617 void 1618 pmap_cpu_init_late(struct cpu_info *ci) 1619 { 1620 1621 if (ci == &cpu_info_primary) { 1622 evcnt_attach_dynamic(&pmap_tlb_evcnt, EVCNT_TYPE_INTR, 1623 NULL, "global", "TLB IPI"); 1624 evcnt_attach_dynamic(&pmap_iobmp_evcnt, EVCNT_TYPE_MISC, 1625 NULL, "x86", "io bitmap copy"); 1626 evcnt_attach_dynamic(&pmap_ldt_evcnt, EVCNT_TYPE_MISC, 1627 NULL, "x86", "ldt sync"); 1628 } 1629 1630 evcnt_attach_dynamic(&ci->ci_tlb_evcnt, EVCNT_TYPE_MISC, 1631 NULL, device_xname(ci->ci_dev), "TLB IPI"); 1632 } 1633 1634 /* 1635 * p v _ e n t r y f u n c t i o n s 1636 */ 1637 1638 /* 1639 * pmap_free_pvs: free a list of pv_entrys 1640 */ 1641 1642 static void 1643 pmap_free_pvs(struct pv_entry *pve) 1644 { 1645 struct pv_entry *next; 1646 1647 for ( /* null */ ; pve != NULL ; pve = next) { 1648 next = pve->pve_next; 1649 pool_cache_put(&pmap_pv_cache, pve); 1650 } 1651 } 1652 1653 /* 1654 * main pv_entry manipulation functions: 1655 * pmap_enter_pv: enter a mapping onto a pv_head list 1656 * pmap_remove_pv: remove a mappiing from a pv_head list 1657 * 1658 * NOTE: Both pmap_enter_pv and pmap_remove_pv expect the caller to lock 1659 * the pvh before calling 1660 */ 1661 1662 /* 1663 * insert_pv: a helper of pmap_enter_pv 1664 */ 1665 1666 static void 1667 insert_pv(struct pmap_page *pp, struct pv_entry *pve) 1668 { 1669 struct pv_hash_head *hh; 1670 kmutex_t *lock; 1671 u_int hash; 1672 1673 KASSERT(pp_locked(pp)); 1674 1675 hash = pvhash_hash(pve->pve_pte.pte_ptp, pve->pve_pte.pte_va); 1676 lock = pvhash_lock(hash); 1677 hh = pvhash_head(hash); 1678 mutex_spin_enter(lock); 1679 SLIST_INSERT_HEAD(&hh->hh_list, pve, pve_hash); 1680 mutex_spin_exit(lock); 1681 1682 LIST_INSERT_HEAD(&pp->pp_head.pvh_list, pve, pve_list); 1683 } 1684 1685 /* 1686 * pmap_enter_pv: enter a mapping onto a pv_head lst 1687 * 1688 * => caller should have the pp_lock locked 1689 * => caller should adjust ptp's wire_count before calling 1690 */ 1691 1692 static struct pv_entry * 1693 pmap_enter_pv(struct pmap_page *pp, 1694 struct pv_entry *pve, /* preallocated pve for us to use */ 1695 struct pv_entry **sparepve, 1696 struct vm_page *ptp, 1697 vaddr_t va) 1698 { 1699 1700 KASSERT(ptp == NULL || ptp->wire_count >= 2); 1701 KASSERT(ptp == NULL || ptp->uobject != NULL); 1702 KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset); 1703 KASSERT(pp_locked(pp)); 1704 1705 if ((pp->pp_flags & PP_EMBEDDED) == 0) { 1706 if (LIST_EMPTY(&pp->pp_head.pvh_list)) { 1707 pp->pp_flags |= PP_EMBEDDED; 1708 pp->pp_pte.pte_ptp = ptp; 1709 pp->pp_pte.pte_va = va; 1710 1711 return pve; 1712 } 1713 } else { 1714 struct pv_entry *pve2; 1715 1716 pve2 = *sparepve; 1717 *sparepve = NULL; 1718 1719 pve2->pve_pte = pp->pp_pte; 1720 pp->pp_flags &= ~PP_EMBEDDED; 1721 LIST_INIT(&pp->pp_head.pvh_list); 1722 insert_pv(pp, pve2); 1723 } 1724 1725 pve->pve_pte.pte_ptp = ptp; 1726 pve->pve_pte.pte_va = va; 1727 insert_pv(pp, pve); 1728 1729 return NULL; 1730 } 1731 1732 /* 1733 * pmap_remove_pv: try to remove a mapping from a pv_list 1734 * 1735 * => caller should hold pp_lock [so that attrs can be adjusted] 1736 * => caller should adjust ptp's wire_count and free PTP if needed 1737 * => we return the removed pve 1738 */ 1739 1740 static struct pv_entry * 1741 pmap_remove_pv(struct pmap_page *pp, struct vm_page *ptp, vaddr_t va) 1742 { 1743 struct pv_hash_head *hh; 1744 struct pv_entry *pve; 1745 kmutex_t *lock; 1746 u_int hash; 1747 1748 KASSERT(ptp == NULL || ptp->uobject != NULL); 1749 KASSERT(ptp == NULL || ptp_va2o(va, 1) == 
ptp->offset); 1750 KASSERT(pp_locked(pp)); 1751 1752 if ((pp->pp_flags & PP_EMBEDDED) != 0) { 1753 KASSERT(pp->pp_pte.pte_ptp == ptp); 1754 KASSERT(pp->pp_pte.pte_va == va); 1755 1756 pp->pp_flags &= ~PP_EMBEDDED; 1757 LIST_INIT(&pp->pp_head.pvh_list); 1758 1759 return NULL; 1760 } 1761 1762 hash = pvhash_hash(ptp, va); 1763 lock = pvhash_lock(hash); 1764 hh = pvhash_head(hash); 1765 mutex_spin_enter(lock); 1766 pve = pvhash_remove(hh, ptp, va); 1767 mutex_spin_exit(lock); 1768 1769 LIST_REMOVE(pve, pve_list); 1770 1771 return pve; 1772 } 1773 1774 /* 1775 * p t p f u n c t i o n s 1776 */ 1777 1778 static inline struct vm_page * 1779 pmap_find_ptp(struct pmap *pmap, vaddr_t va, paddr_t pa, int level) 1780 { 1781 int lidx = level - 1; 1782 struct vm_page *pg; 1783 1784 KASSERT(mutex_owned(&pmap->pm_lock)); 1785 1786 if (pa != (paddr_t)-1 && pmap->pm_ptphint[lidx] && 1787 pa == VM_PAGE_TO_PHYS(pmap->pm_ptphint[lidx])) { 1788 return (pmap->pm_ptphint[lidx]); 1789 } 1790 PMAP_SUBOBJ_LOCK(pmap, lidx); 1791 pg = uvm_pagelookup(&pmap->pm_obj[lidx], ptp_va2o(va, level)); 1792 PMAP_SUBOBJ_UNLOCK(pmap, lidx); 1793 1794 KASSERT(pg == NULL || pg->wire_count >= 1); 1795 return pg; 1796 } 1797 1798 static inline void 1799 pmap_freepage(struct pmap *pmap, struct vm_page *ptp, int level) 1800 { 1801 int lidx; 1802 struct uvm_object *obj; 1803 1804 KASSERT(ptp->wire_count == 1); 1805 1806 lidx = level - 1; 1807 1808 obj = &pmap->pm_obj[lidx]; 1809 pmap_stats_update(pmap, -1, 0); 1810 if (lidx != 0) 1811 mutex_enter(&obj->vmobjlock); 1812 if (pmap->pm_ptphint[lidx] == ptp) 1813 pmap->pm_ptphint[lidx] = TAILQ_FIRST(&obj->memq); 1814 ptp->wire_count = 0; 1815 uvm_pagerealloc(ptp, NULL, 0); 1816 VM_PAGE_TO_PP(ptp)->pp_link = curlwp->l_md.md_gc_ptp; 1817 curlwp->l_md.md_gc_ptp = ptp; 1818 if (lidx != 0) 1819 mutex_exit(&obj->vmobjlock); 1820 } 1821 1822 static void 1823 pmap_free_ptp(struct pmap *pmap, struct vm_page *ptp, vaddr_t va, 1824 pt_entry_t *ptes, pd_entry_t * const *pdes) 1825 { 1826 unsigned long index; 1827 int level; 1828 vaddr_t invaladdr; 1829 #ifdef MULTIPROCESSOR 1830 vaddr_t invaladdr2; 1831 #endif 1832 pd_entry_t opde; 1833 struct pmap *curpmap = vm_map_pmap(&curlwp->l_proc->p_vmspace->vm_map); 1834 1835 KASSERT(pmap != pmap_kernel()); 1836 KASSERT(mutex_owned(&pmap->pm_lock)); 1837 KASSERT(kpreempt_disabled()); 1838 1839 level = 1; 1840 do { 1841 index = pl_i(va, level + 1); 1842 opde = pmap_pte_testset(&pdes[level - 1][index], 0); 1843 #if defined(XEN) && defined(__x86_64__) 1844 /* 1845 * If ptp is a L3 currently mapped in kernel space, 1846 * clear it before freeing 1847 */ 1848 if (pmap->pm_pdirpa == xen_current_user_pgd 1849 && level == PTP_LEVELS - 1) 1850 pmap_pte_set(&pmap_kernel()->pm_pdir[index], 0); 1851 #endif /* XEN && __x86_64__ */ 1852 pmap_freepage(pmap, ptp, level); 1853 invaladdr = level == 1 ? (vaddr_t)ptes : 1854 (vaddr_t)pdes[level - 2]; 1855 pmap_tlb_shootdown(curpmap, invaladdr + index * PAGE_SIZE, 1856 0, opde); 1857 #if defined(MULTIPROCESSOR) 1858 invaladdr2 = level == 1 ? 
(vaddr_t)PTE_BASE : 1859 (vaddr_t)normal_pdes[level - 2]; 1860 if (pmap != curpmap || invaladdr != invaladdr2) { 1861 pmap_tlb_shootdown(pmap, invaladdr2 + index * PAGE_SIZE, 1862 0, opde); 1863 } 1864 #endif 1865 if (level < PTP_LEVELS - 1) { 1866 ptp = pmap_find_ptp(pmap, va, (paddr_t)-1, level + 1); 1867 ptp->wire_count--; 1868 if (ptp->wire_count > 1) 1869 break; 1870 } 1871 } while (++level < PTP_LEVELS); 1872 pmap_pte_flush(); 1873 } 1874 1875 /* 1876 * pmap_get_ptp: get a PTP (if there isn't one, allocate a new one) 1877 * 1878 * => pmap should NOT be pmap_kernel() 1879 * => pmap should be locked 1880 * => preemption should be disabled 1881 */ 1882 1883 static struct vm_page * 1884 pmap_get_ptp(struct pmap *pmap, vaddr_t va, pd_entry_t * const *pdes) 1885 { 1886 struct vm_page *ptp, *pptp; 1887 int i; 1888 unsigned long index; 1889 pd_entry_t *pva; 1890 paddr_t ppa, pa; 1891 struct uvm_object *obj; 1892 1893 KASSERT(pmap != pmap_kernel()); 1894 KASSERT(mutex_owned(&pmap->pm_lock)); 1895 KASSERT(kpreempt_disabled()); 1896 1897 ptp = NULL; 1898 pa = (paddr_t)-1; 1899 1900 /* 1901 * Loop through all page table levels seeing if we need to 1902 * add a new page to that level. 1903 */ 1904 for (i = PTP_LEVELS; i > 1; i--) { 1905 /* 1906 * Save values from previous round. 1907 */ 1908 pptp = ptp; 1909 ppa = pa; 1910 1911 index = pl_i(va, i); 1912 pva = pdes[i - 2]; 1913 1914 if (pmap_valid_entry(pva[index])) { 1915 ppa = pmap_pte2pa(pva[index]); 1916 ptp = NULL; 1917 continue; 1918 } 1919 1920 obj = &pmap->pm_obj[i-2]; 1921 PMAP_SUBOBJ_LOCK(pmap, i - 2); 1922 ptp = uvm_pagealloc(obj, ptp_va2o(va, i - 1), NULL, 1923 UVM_PGA_USERESERVE|UVM_PGA_ZERO); 1924 PMAP_SUBOBJ_UNLOCK(pmap, i - 2); 1925 1926 if (ptp == NULL) 1927 return NULL; 1928 1929 ptp->flags &= ~PG_BUSY; /* never busy */ 1930 ptp->wire_count = 1; 1931 pmap->pm_ptphint[i - 2] = ptp; 1932 pa = VM_PAGE_TO_PHYS(ptp); 1933 pmap_pte_set(&pva[index], (pd_entry_t) 1934 (pmap_pa2pte(pa) | PG_u | PG_RW | PG_V)); 1935 #if defined(XEN) && defined(__x86_64__) 1936 /* 1937 * In Xen we must enter the mapping in kernel map too 1938 * if pmap is curmap and modifying top level (PGD) 1939 */ 1940 if(i == PTP_LEVELS && pmap != pmap_kernel()) { 1941 pmap_pte_set(&pmap_kernel()->pm_pdir[index], 1942 (pd_entry_t) (pmap_pa2pte(pa) 1943 | PG_u | PG_RW | PG_V)); 1944 } 1945 #endif /* XEN && __x86_64__ */ 1946 pmap_pte_flush(); 1947 pmap_stats_update(pmap, 1, 0); 1948 /* 1949 * If we're not in the top level, increase the 1950 * wire count of the parent page. 1951 */ 1952 if (i < PTP_LEVELS) { 1953 if (pptp == NULL) 1954 pptp = pmap_find_ptp(pmap, va, ppa, i); 1955 #ifdef DIAGNOSTIC 1956 if (pptp == NULL) 1957 panic("pde page disappeared"); 1958 #endif 1959 pptp->wire_count++; 1960 } 1961 } 1962 1963 /* 1964 * ptp is not NULL if we just allocated a new ptp. If it's 1965 * still NULL, we must look up the existing one. 1966 */ 1967 if (ptp == NULL) { 1968 ptp = pmap_find_ptp(pmap, va, ppa, 1); 1969 #ifdef DIAGNOSTIC 1970 if (ptp == NULL) { 1971 printf("va %lx ppa %lx\n", (unsigned long)va, 1972 (unsigned long)ppa); 1973 panic("pmap_get_ptp: unmanaged user PTP"); 1974 } 1975 #endif 1976 } 1977 1978 pmap->pm_ptphint[0] = ptp; 1979 return(ptp); 1980 } 1981 1982 /* 1983 * p m a p l i f e c y c l e f u n c t i o n s 1984 */ 1985 1986 /* 1987 * pmap_pdp_ctor: constructor for the PDP cache. 
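 *
 * roughly, the non-Xen path below lays the new PDP out as follows
 * (slot ranges are illustrative, see the code for the exact macros):
 *
 *	pdir[0 .. PDIR_SLOT_PTE-1]		zeroed (user area)
 *	pdir[PDIR_SLOT_PTE .. +PDP_SIZE-1]	recursive PDEs pointing back
 *						at this PDP
 *	pdir[PDIR_SLOT_KERN .. +npde-1]		copied from the kernel's PDP
 *						so kernel VA is shared
 *	remaining top-level slots		zeroed
 *
 * the Xen/amd64 path instead zeroes the whole page and leaves the
 * recursive slot without PG_V, since such a PDP is never active in
 * kernel mode.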
1988 */ 1989 1990 int 1991 pmap_pdp_ctor(void *arg, void *v, int flags) 1992 { 1993 pd_entry_t *pdir = v; 1994 paddr_t pdirpa = 0; /* XXX: GCC */ 1995 vaddr_t object; 1996 int i; 1997 1998 #if !defined(XEN) || !defined(__x86_64__) 1999 int npde; 2000 #endif 2001 #ifdef XEN 2002 int s; 2003 #endif 2004 2005 /* 2006 * NOTE: The `pmap_lock' is held when the PDP is allocated. 2007 */ 2008 2009 #if defined(XEN) && defined(__x86_64__) 2010 /* fetch the physical address of the page directory. */ 2011 (void) pmap_extract(pmap_kernel(), (vaddr_t) pdir, &pdirpa); 2012 2013 /* zero init area */ 2014 memset (pdir, 0, PAGE_SIZE); /* Xen wants a clean page */ 2015 /* 2016 * this pdir will NEVER be active in kernel mode 2017 * so mark recursive entry invalid 2018 */ 2019 pdir[PDIR_SLOT_PTE] = pmap_pa2pte(pdirpa) | PG_u; 2020 /* 2021 * PDP constructed this way won't be for kernel, 2022 * hence we don't put kernel mappings on Xen. 2023 * But we need to make pmap_create() happy, so put a dummy (without 2024 * PG_V) value at the right place. 2025 */ 2026 pdir[PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1] - 1] = 2027 (unsigned long)-1 & PG_FRAME; 2028 #else /* XEN && __x86_64__*/ 2029 /* zero init area */ 2030 memset(pdir, 0, PDIR_SLOT_PTE * sizeof(pd_entry_t)); 2031 2032 object = (vaddr_t)v; 2033 for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) { 2034 /* fetch the physical address of the page directory. */ 2035 (void) pmap_extract(pmap_kernel(), object, &pdirpa); 2036 /* put in recursive PDE to map the PTEs */ 2037 pdir[PDIR_SLOT_PTE + i] = pmap_pa2pte(pdirpa) | PG_V; 2038 #ifndef XEN 2039 pdir[PDIR_SLOT_PTE + i] |= PG_KW; 2040 #endif 2041 } 2042 2043 /* copy kernel's PDE */ 2044 npde = nkptp[PTP_LEVELS - 1]; 2045 2046 memcpy(&pdir[PDIR_SLOT_KERN], &PDP_BASE[PDIR_SLOT_KERN], 2047 npde * sizeof(pd_entry_t)); 2048 2049 /* zero the rest */ 2050 memset(&pdir[PDIR_SLOT_KERN + npde], 0, 2051 (NTOPLEVEL_PDES - (PDIR_SLOT_KERN + npde)) * sizeof(pd_entry_t)); 2052 2053 if (VM_MIN_KERNEL_ADDRESS != KERNBASE) { 2054 int idx = pl_i(KERNBASE, PTP_LEVELS); 2055 2056 pdir[idx] = PDP_BASE[idx]; 2057 } 2058 #endif /* XEN && __x86_64__*/ 2059 #ifdef XEN 2060 s = splvm(); 2061 object = (vaddr_t)v; 2062 for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) { 2063 (void) pmap_extract(pmap_kernel(), object, &pdirpa); 2064 /* remap this page RO */ 2065 pmap_kenter_pa(object, pdirpa, VM_PROT_READ); 2066 pmap_update(pmap_kernel()); 2067 /* 2068 * pin as L2/L4 page, we have to do the page with the 2069 * PDIR_SLOT_PTE entries last 2070 */ 2071 #ifdef PAE 2072 if (i == l2tol3(PDIR_SLOT_PTE)) 2073 continue; 2074 #endif 2075 xpq_queue_pin_table(xpmap_ptom_masked(pdirpa)); 2076 } 2077 #ifdef PAE 2078 object = ((vaddr_t)pdir) + PAGE_SIZE * l2tol3(PDIR_SLOT_PTE); 2079 (void)pmap_extract(pmap_kernel(), object, &pdirpa); 2080 xpq_queue_pin_table(xpmap_ptom_masked(pdirpa)); 2081 #endif 2082 xpq_flush_queue(); 2083 splx(s); 2084 #endif /* XEN */ 2085 2086 return (0); 2087 } 2088 2089 /* 2090 * pmap_pdp_dtor: destructor for the PDP cache. 2091 */ 2092 2093 void 2094 pmap_pdp_dtor(void *arg, void *v) 2095 { 2096 #ifdef XEN 2097 paddr_t pdirpa = 0; /* XXX: GCC */ 2098 vaddr_t object = (vaddr_t)v; 2099 int i; 2100 int s = splvm(); 2101 pt_entry_t *pte; 2102 2103 for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) { 2104 /* fetch the physical address of the page directory. 
*/ 2105 (void) pmap_extract(pmap_kernel(), object, &pdirpa); 2106 /* unpin page table */ 2107 xpq_queue_unpin_table(xpmap_ptom_masked(pdirpa)); 2108 } 2109 object = (vaddr_t)v; 2110 for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) { 2111 /* Set page RW again */ 2112 pte = kvtopte(object); 2113 xpq_queue_pte_update(xpmap_ptetomach(pte), *pte | PG_RW); 2114 xpq_queue_invlpg((vaddr_t)object); 2115 } 2116 xpq_flush_queue(); 2117 splx(s); 2118 #endif /* XEN */ 2119 } 2120 2121 #ifdef PAE 2122 2123 /* pmap_pdp_alloc: Allocate a page for the pdp memory pool. */ 2124 2125 void * 2126 pmap_pdp_alloc(struct pool *pp, int flags) 2127 { 2128 return (void *)uvm_km_alloc(kernel_map, 2129 PAGE_SIZE * PDP_SIZE, PAGE_SIZE * PDP_SIZE, 2130 ((flags & PR_WAITOK) ? 0 : UVM_KMF_NOWAIT | UVM_KMF_TRYLOCK) 2131 | UVM_KMF_WIRED); 2132 } 2133 2134 /* 2135 * pmap_pdp_free: free a PDP 2136 */ 2137 2138 void 2139 pmap_pdp_free(struct pool *pp, void *v) 2140 { 2141 uvm_km_free(kernel_map, (vaddr_t)v, PAGE_SIZE * PDP_SIZE, 2142 UVM_KMF_WIRED); 2143 } 2144 #endif /* PAE */ 2145 2146 /* 2147 * pmap_create: create a pmap 2148 * 2149 * => note: old pmap interface took a "size" args which allowed for 2150 * the creation of "software only" pmaps (not in bsd). 2151 */ 2152 2153 struct pmap * 2154 pmap_create(void) 2155 { 2156 struct pmap *pmap; 2157 int i; 2158 2159 pmap = pool_cache_get(&pmap_cache, PR_WAITOK); 2160 2161 /* init uvm_object */ 2162 for (i = 0; i < PTP_LEVELS - 1; i++) { 2163 UVM_OBJ_INIT(&pmap->pm_obj[i], NULL, 1); 2164 pmap->pm_ptphint[i] = NULL; 2165 } 2166 pmap->pm_stats.wired_count = 0; 2167 pmap->pm_stats.resident_count = 1; /* count the PDP allocd below */ 2168 #if !defined(__x86_64__) 2169 pmap->pm_hiexec = 0; 2170 #endif /* !defined(__x86_64__) */ 2171 pmap->pm_flags = 0; 2172 pmap->pm_cpus = 0; 2173 pmap->pm_kernel_cpus = 0; 2174 2175 /* init the LDT */ 2176 pmap->pm_ldt = NULL; 2177 pmap->pm_ldt_len = 0; 2178 pmap->pm_ldt_sel = GSYSSEL(GLDT_SEL, SEL_KPL); 2179 2180 /* allocate PDP */ 2181 try_again: 2182 pmap->pm_pdir = pool_cache_get(&pmap_pdp_cache, PR_WAITOK); 2183 2184 mutex_enter(&pmaps_lock); 2185 2186 if (pmap->pm_pdir[PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1] - 1] == 0) { 2187 mutex_exit(&pmaps_lock); 2188 pool_cache_destruct_object(&pmap_pdp_cache, pmap->pm_pdir); 2189 goto try_again; 2190 } 2191 2192 #ifdef PAE 2193 for (i = 0; i < PDP_SIZE; i++) 2194 pmap->pm_pdirpa[i] = 2195 pmap_pte2pa(pmap->pm_pdir[PDIR_SLOT_PTE + i]); 2196 #else 2197 pmap->pm_pdirpa = pmap_pte2pa(pmap->pm_pdir[PDIR_SLOT_PTE]); 2198 #endif 2199 2200 LIST_INSERT_HEAD(&pmaps, pmap, pm_list); 2201 2202 mutex_exit(&pmaps_lock); 2203 2204 return (pmap); 2205 } 2206 2207 /* 2208 * pmap_destroy: drop reference count on pmap. free pmap if 2209 * reference count goes to zero. 2210 */ 2211 2212 void 2213 pmap_destroy(struct pmap *pmap) 2214 { 2215 int i; 2216 #ifdef DIAGNOSTIC 2217 struct cpu_info *ci; 2218 CPU_INFO_ITERATOR cii; 2219 #endif /* DIAGNOSTIC */ 2220 2221 /* 2222 * if we have torn down this pmap, process deferred frees and 2223 * invalidations now. 
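 *
 * (pmap_remove_all() only records this pmap in curlwp->l_md.md_gc_pmap;
 * the pmap_update() call here is relied on to drain the deferred PTP
 * frees queued on l_md.md_gc_ptp and any pending TLB shootdowns before
 * the pmap itself is freed.)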
2224 */ 2225 if (__predict_false(curlwp->l_md.md_gc_pmap == pmap)) { 2226 pmap_update(pmap); 2227 } 2228 2229 /* 2230 * drop reference count 2231 */ 2232 2233 if (atomic_dec_uint_nv((unsigned *)&pmap->pm_obj[0].uo_refs) > 0) { 2234 return; 2235 } 2236 2237 #ifdef DIAGNOSTIC 2238 for (CPU_INFO_FOREACH(cii, ci)) 2239 if (ci->ci_pmap == pmap) 2240 panic("destroying pmap being used"); 2241 #endif /* DIAGNOSTIC */ 2242 2243 /* 2244 * reference count is zero, free pmap resources and then free pmap. 2245 */ 2246 #ifdef XEN 2247 /* 2248 * Xen lazy APDP handling: 2249 * clear APDP_PDE if pmap is the currently mapped 2250 */ 2251 if (xpmap_ptom_masked(pmap_pdirpa(pmap, 0)) == (*APDP_PDE & PG_FRAME)) { 2252 kpreempt_disable(); 2253 for (i = 0; i < PDP_SIZE; i++) { 2254 pmap_pte_set(&APDP_PDE[i], 0); 2255 #ifdef PAE 2256 /* clear shadow entry too */ 2257 pmap_pte_set(&APDP_PDE_SHADOW[i], 0); 2258 #endif 2259 } 2260 pmap_pte_flush(); 2261 pmap_apte_flush(pmap_kernel()); 2262 kpreempt_enable(); 2263 } 2264 #endif 2265 2266 /* 2267 * remove it from global list of pmaps 2268 */ 2269 2270 mutex_enter(&pmaps_lock); 2271 LIST_REMOVE(pmap, pm_list); 2272 mutex_exit(&pmaps_lock); 2273 2274 /* 2275 * destroyed pmap shouldn't have remaining PTPs 2276 */ 2277 2278 for (i = 0; i < PTP_LEVELS - 1; i++) { 2279 KASSERT(pmap->pm_obj[i].uo_npages == 0); 2280 KASSERT(TAILQ_EMPTY(&pmap->pm_obj[i].memq)); 2281 } 2282 2283 /* 2284 * MULTIPROCESSOR -- no need to flush out of other processors' 2285 * APTE space because we do that in pmap_unmap_ptes(). 2286 */ 2287 pool_cache_put(&pmap_pdp_cache, pmap->pm_pdir); 2288 2289 #ifdef USER_LDT 2290 if (pmap->pm_ldt != NULL) { 2291 /* 2292 * no need to switch the LDT; this address space is gone, 2293 * nothing is using it. 2294 * 2295 * No need to lock the pmap for ldt_free (or anything else), 2296 * we're the last one to use it. 2297 */ 2298 mutex_enter(&cpu_lock); 2299 ldt_free(pmap->pm_ldt_sel); 2300 mutex_exit(&cpu_lock); 2301 uvm_km_free(kernel_map, (vaddr_t)pmap->pm_ldt, 2302 pmap->pm_ldt_len, UVM_KMF_WIRED); 2303 } 2304 #endif 2305 2306 for (i = 0; i < PTP_LEVELS - 1; i++) 2307 mutex_destroy(&pmap->pm_obj[i].vmobjlock); 2308 pool_cache_put(&pmap_cache, pmap); 2309 } 2310 2311 /* 2312 * pmap_remove_all: pmap is being torn down by the current thread. 2313 * avoid unnecessary invalidations. 2314 */ 2315 2316 void 2317 pmap_remove_all(struct pmap *pmap) 2318 { 2319 lwp_t *l = curlwp; 2320 2321 KASSERT(l->l_md.md_gc_pmap == NULL); 2322 2323 l->l_md.md_gc_pmap = pmap; 2324 } 2325 2326 #if defined(PMAP_FORK) 2327 /* 2328 * pmap_fork: perform any necessary data structure manipulation when 2329 * a VM space is forked. 2330 */ 2331 2332 void 2333 pmap_fork(struct pmap *pmap1, struct pmap *pmap2) 2334 { 2335 #ifdef USER_LDT 2336 union descriptor *new_ldt; 2337 size_t len; 2338 int sel; 2339 2340 if (__predict_true(pmap1->pm_ldt == NULL)) { 2341 return; 2342 } 2343 2344 retry: 2345 if (pmap1->pm_ldt != NULL) { 2346 len = pmap1->pm_ldt_len; 2347 new_ldt = (union descriptor *)uvm_km_alloc(kernel_map, len, 0, 2348 UVM_KMF_WIRED); 2349 mutex_enter(&cpu_lock); 2350 sel = ldt_alloc(new_ldt, len); 2351 if (sel == -1) { 2352 mutex_exit(&cpu_lock); 2353 uvm_km_free(kernel_map, (vaddr_t)new_ldt, len, 2354 UVM_KMF_WIRED); 2355 printf("WARNING: pmap_fork: unable to allocate LDT\n"); 2356 return; 2357 } 2358 } else { 2359 len = -1; 2360 new_ldt = NULL; 2361 sel = -1; 2362 mutex_enter(&cpu_lock); 2363 } 2364 2365 /* Copy the LDT, if necessary. 
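 * note that len was sampled and new_ldt allocated before cpu_lock
 * was taken, so pm_ldt_len may have changed while we slept; if it
 * did, the allocation is undone and we retry with the new length.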
*/ 2366 if (pmap1->pm_ldt != NULL) { 2367 if (len != pmap1->pm_ldt_len) { 2368 if (len != -1) { 2369 ldt_free(sel); 2370 uvm_km_free(kernel_map, (vaddr_t)new_ldt, 2371 len, UVM_KMF_WIRED); 2372 } 2373 mutex_exit(&cpu_lock); 2374 goto retry; 2375 } 2376 2377 memcpy(new_ldt, pmap1->pm_ldt, len); 2378 pmap2->pm_ldt = new_ldt; 2379 pmap2->pm_ldt_len = pmap1->pm_ldt_len; 2380 pmap2->pm_ldt_sel = sel; 2381 len = -1; 2382 } 2383 2384 if (len != -1) { 2385 ldt_free(sel); 2386 uvm_km_free(kernel_map, (vaddr_t)new_ldt, len, 2387 UVM_KMF_WIRED); 2388 } 2389 mutex_exit(&cpu_lock); 2390 #endif /* USER_LDT */ 2391 } 2392 #endif /* PMAP_FORK */ 2393 2394 #ifdef USER_LDT 2395 2396 /* 2397 * pmap_ldt_xcall: cross call used by pmap_ldt_sync. if the named pmap 2398 * is active, reload LDTR. 2399 */ 2400 static void 2401 pmap_ldt_xcall(void *arg1, void *arg2) 2402 { 2403 struct pmap *pm; 2404 2405 kpreempt_disable(); 2406 pm = arg1; 2407 if (curcpu()->ci_pmap == pm) { 2408 lldt(pm->pm_ldt_sel); 2409 } 2410 kpreempt_enable(); 2411 } 2412 2413 /* 2414 * pmap_ldt_sync: LDT selector for the named pmap is changing. swap 2415 * in the new selector on all CPUs. 2416 */ 2417 void 2418 pmap_ldt_sync(struct pmap *pm) 2419 { 2420 uint64_t where; 2421 2422 KASSERT(mutex_owned(&cpu_lock)); 2423 2424 pmap_ldt_evcnt.ev_count++; 2425 where = xc_broadcast(0, pmap_ldt_xcall, pm, NULL); 2426 xc_wait(where); 2427 } 2428 2429 /* 2430 * pmap_ldt_cleanup: if the pmap has a local LDT, deallocate it, and 2431 * restore the default. 2432 */ 2433 2434 void 2435 pmap_ldt_cleanup(struct lwp *l) 2436 { 2437 pmap_t pmap = l->l_proc->p_vmspace->vm_map.pmap; 2438 union descriptor *dp = NULL; 2439 size_t len = 0; 2440 int sel = -1; 2441 2442 if (__predict_true(pmap->pm_ldt == NULL)) { 2443 return; 2444 } 2445 2446 mutex_enter(&cpu_lock); 2447 if (pmap->pm_ldt != NULL) { 2448 sel = pmap->pm_ldt_sel; 2449 dp = pmap->pm_ldt; 2450 len = pmap->pm_ldt_len; 2451 pmap->pm_ldt_sel = GSYSSEL(GLDT_SEL, SEL_KPL); 2452 pmap->pm_ldt = NULL; 2453 pmap->pm_ldt_len = 0; 2454 pmap_ldt_sync(pmap); 2455 ldt_free(sel); 2456 uvm_km_free(kernel_map, (vaddr_t)dp, len, UVM_KMF_WIRED); 2457 } 2458 mutex_exit(&cpu_lock); 2459 } 2460 #endif /* USER_LDT */ 2461 2462 /* 2463 * pmap_activate: activate a process' pmap 2464 * 2465 * => must be called with kernel preemption disabled 2466 * => if lwp is the curlwp, then set ci_want_pmapload so that 2467 * actual MMU context switch will be done by pmap_load() later 2468 */ 2469 2470 void 2471 pmap_activate(struct lwp *l) 2472 { 2473 struct cpu_info *ci; 2474 struct pmap *pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map); 2475 2476 KASSERT(kpreempt_disabled()); 2477 2478 ci = curcpu(); 2479 2480 if (l == ci->ci_curlwp) { 2481 struct pcb *pcb; 2482 2483 KASSERT(ci->ci_want_pmapload == 0); 2484 KASSERT(ci->ci_tlbstate != TLBSTATE_VALID); 2485 #ifdef KSTACK_CHECK_DR0 2486 /* 2487 * setup breakpoint on the top of stack 2488 */ 2489 if (l == &lwp0) 2490 dr0(0, 0, 0, 0); 2491 else 2492 dr0(KSTACK_LOWEST_ADDR(l), 1, 3, 1); 2493 #endif 2494 2495 /* 2496 * no need to switch to kernel vmspace because 2497 * it's a subset of any vmspace. 
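 *
 * for a user pmap all we do here is set ci_want_pmapload; the
 * expensive %cr3/LDT switch is deferred to pmap_load(), which the
 * MD code is expected to call before the lwp returns to user space.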
2498 */ 2499 2500 if (pmap == pmap_kernel()) { 2501 ci->ci_want_pmapload = 0; 2502 return; 2503 } 2504 2505 pcb = &l->l_addr->u_pcb; 2506 ci->ci_want_pmapload = 1; 2507 2508 #if defined(__x86_64__) 2509 if (pcb->pcb_flags & PCB_GS64) 2510 wrmsr(MSR_KERNELGSBASE, pcb->pcb_gs); 2511 if (pcb->pcb_flags & PCB_FS64) 2512 wrmsr(MSR_FSBASE, pcb->pcb_fs); 2513 #endif /* defined(__x86_64__) */ 2514 } 2515 } 2516 2517 /* 2518 * pmap_reactivate: try to regain reference to the pmap. 2519 * 2520 * => must be called with kernel preemption disabled 2521 */ 2522 2523 static bool 2524 pmap_reactivate(struct pmap *pmap) 2525 { 2526 struct cpu_info *ci; 2527 uint32_t cpumask; 2528 bool result; 2529 uint32_t oldcpus; 2530 2531 ci = curcpu(); 2532 cpumask = ci->ci_cpumask; 2533 2534 KASSERT(kpreempt_disabled()); 2535 #if defined(XEN) && defined(__x86_64__) 2536 KASSERT(pmap->pm_pdirpa == xen_current_user_pgd); 2537 #elif defined(PAE) 2538 KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(pmap_l3pd[0])); 2539 #elif !defined(XEN) || (defined(XEN) && defined(XEN3)) 2540 KASSERT(pmap->pm_pdirpa == pmap_pte2pa(rcr3())); 2541 #endif 2542 2543 /* 2544 * if we still have a lazy reference to this pmap, 2545 * we can assume that there was no tlb shootdown 2546 * for this pmap in the meantime. 2547 * 2548 * the order of events here is important as we must 2549 * synchronize with TLB shootdown interrupts. declare 2550 * interest in invalidations (TLBSTATE_VALID) and then 2551 * check the cpumask, which the IPIs can change only 2552 * when the state is TLBSTATE_LAZY. 2553 */ 2554 2555 ci->ci_tlbstate = TLBSTATE_VALID; 2556 oldcpus = pmap->pm_cpus; 2557 KASSERT((pmap->pm_kernel_cpus & cpumask) != 0); 2558 if (oldcpus & cpumask) { 2559 /* got it */ 2560 result = true; 2561 } else { 2562 /* must reload */ 2563 atomic_or_32(&pmap->pm_cpus, cpumask); 2564 result = false; 2565 } 2566 2567 return result; 2568 } 2569 2570 /* 2571 * pmap_load: actually switch pmap. (fill in %cr3 and LDT info) 2572 */ 2573 2574 void 2575 pmap_load(void) 2576 { 2577 struct cpu_info *ci; 2578 uint32_t cpumask; 2579 struct pmap *pmap; 2580 struct pmap *oldpmap; 2581 struct lwp *l; 2582 struct pcb *pcb; 2583 uint64_t ncsw; 2584 2585 kpreempt_disable(); 2586 retry: 2587 ci = curcpu(); 2588 if (!ci->ci_want_pmapload) { 2589 kpreempt_enable(); 2590 return; 2591 } 2592 cpumask = ci->ci_cpumask; 2593 l = ci->ci_curlwp; 2594 ncsw = l->l_ncsw; 2595 2596 /* should be able to take ipis. */ 2597 KASSERT(ci->ci_ilevel < IPL_IPI); 2598 #ifdef XEN 2599 /* XXX not yet KASSERT(x86_read_psl() != 0); */ 2600 #else 2601 KASSERT((x86_read_psl() & PSL_I) != 0); 2602 #endif 2603 2604 KASSERT(l != NULL); 2605 pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map); 2606 KASSERT(pmap != pmap_kernel()); 2607 oldpmap = ci->ci_pmap; 2608 pcb = &l->l_addr->u_pcb; 2609 2610 if (pmap == oldpmap) { 2611 if (!pmap_reactivate(pmap)) { 2612 2613 /* 2614 * pmap has been changed while it was deactivated. 2615 * our tlb may be stale. 2616 */ 2617 2618 tlbflush(); 2619 } 2620 2621 ci->ci_want_pmapload = 0; 2622 kpreempt_enable(); 2623 return; 2624 } 2625 2626 /* 2627 * grab a reference to the new pmap. 2628 */ 2629 2630 pmap_reference(pmap); 2631 2632 /* 2633 * actually switch pmap.
2634 */ 2635 2636 atomic_and_32(&oldpmap->pm_cpus, ~cpumask); 2637 atomic_and_32(&oldpmap->pm_kernel_cpus, ~cpumask); 2638 2639 #if defined(XEN) && defined(__x86_64__) 2640 KASSERT(oldpmap->pm_pdirpa == xen_current_user_pgd || 2641 oldpmap == pmap_kernel()); 2642 #elif defined(PAE) 2643 KASSERT(pmap_pdirpa(oldpmap, 0) == pmap_pte2pa(pmap_l3pd[0])); 2644 #elif !defined(XEN) || (defined(XEN) && defined(XEN3)) 2645 KASSERT(oldpmap->pm_pdirpa == pmap_pte2pa(rcr3())); 2646 #endif 2647 KASSERT((pmap->pm_cpus & cpumask) == 0); 2648 KASSERT((pmap->pm_kernel_cpus & cpumask) == 0); 2649 2650 /* 2651 * mark the pmap in use by this processor. again we must 2652 * synchronize with TLB shootdown interrupts, so set the 2653 * state VALID first, then register us for shootdown events 2654 * on this pmap. 2655 */ 2656 2657 ci->ci_tlbstate = TLBSTATE_VALID; 2658 atomic_or_32(&pmap->pm_cpus, cpumask); 2659 atomic_or_32(&pmap->pm_kernel_cpus, cpumask); 2660 ci->ci_pmap = pmap; 2661 2662 /* 2663 * update tss. now that we have registered for invalidations 2664 * from other CPUs, we're good to load the page tables. 2665 */ 2666 #ifdef PAE 2667 pcb->pcb_cr3 = pmap_l3paddr; 2668 #else 2669 pcb->pcb_cr3 = pmap->pm_pdirpa; 2670 #endif 2671 #if defined(XEN) && defined(__x86_64__) 2672 /* kernel pmap always in cr3 and should never go in user cr3 */ 2673 if (pmap_pdirpa(pmap, 0) != pmap_pdirpa(pmap_kernel(), 0)) { 2674 /* 2675 * Map user space address in kernel space and load 2676 * user cr3 2677 */ 2678 int i, s; 2679 pd_entry_t *old_pgd, *new_pgd; 2680 paddr_t addr; 2681 s = splvm(); 2682 new_pgd = pmap->pm_pdir; 2683 old_pgd = pmap_kernel()->pm_pdir; 2684 addr = xpmap_ptom(pmap_pdirpa(pmap_kernel(), 0)); 2685 for (i = 0; i < PDIR_SLOT_PTE; 2686 i++, addr += sizeof(pd_entry_t)) { 2687 if ((new_pgd[i] & PG_V) || (old_pgd[i] & PG_V)) 2688 xpq_queue_pte_update(addr, new_pgd[i]); 2689 } 2690 xpq_flush_queue(); /* XXXtlb */ 2691 tlbflush(); 2692 xen_set_user_pgd(pmap_pdirpa(pmap, 0)); 2693 xen_current_user_pgd = pmap_pdirpa(pmap, 0); 2694 splx(s); 2695 } 2696 #else /* XEN && x86_64 */ 2697 #if defined(XEN) 2698 /* 2699 * clear APDP slot, in case it points to a page table that has 2700 * been freed 2701 */ 2702 if (*APDP_PDE) { 2703 int i; 2704 for (i = 0; i < PDP_SIZE; i++) { 2705 pmap_pte_set(&APDP_PDE[i], 0); 2706 #ifdef PAE 2707 /* clear shadow entry too */ 2708 pmap_pte_set(&APDP_PDE_SHADOW[i], 0); 2709 #endif 2710 } 2711 } 2712 /* lldt() does pmap_pte_flush() */ 2713 #else /* XEN */ 2714 #if defined(i386) 2715 ci->ci_tss.tss_ldt = pmap->pm_ldt_sel; 2716 ci->ci_tss.tss_cr3 = pcb->pcb_cr3; 2717 #endif 2718 #endif /* XEN */ 2719 lldt(pmap->pm_ldt_sel); 2720 #ifdef PAE 2721 { 2722 paddr_t l3_pd = xpmap_ptom_masked(pmap_l3paddr); 2723 int i; 2724 int s = splvm(); 2725 /* don't update the kernel L3 slot */ 2726 for (i = 0 ; i < PDP_SIZE - 1 ; i++, l3_pd += sizeof(pd_entry_t)) { 2727 xpq_queue_pte_update(l3_pd, 2728 xpmap_ptom(pmap->pm_pdirpa[i]) | PG_V); 2729 } 2730 tlbflush(); 2731 xpq_flush_queue(); 2732 splx(s); 2733 } 2734 #else /* PAE */ 2735 lcr3(pcb->pcb_cr3); 2736 #endif /* PAE */ 2737 #endif /* XEN && x86_64 */ 2738 2739 ci->ci_want_pmapload = 0; 2740 2741 /* 2742 * we're now running with the new pmap. drop the reference 2743 * to the old pmap. if we block, we need to go around again. 
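 *
 * l_ncsw counts this lwp's context switches; if pmap_destroy()
 * blocked, the count will have changed and the cpu_info we sampled
 * may be stale, so start over from the top in that case.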
2744 */ 2745 2746 pmap_destroy(oldpmap); 2747 if (l->l_ncsw != ncsw) { 2748 goto retry; 2749 } 2750 2751 kpreempt_enable(); 2752 } 2753 2754 /* 2755 * pmap_deactivate: deactivate a process' pmap 2756 * 2757 * => must be called with kernel preemption disabled (high SPL is enough) 2758 */ 2759 2760 void 2761 pmap_deactivate(struct lwp *l) 2762 { 2763 struct pmap *pmap; 2764 struct cpu_info *ci; 2765 2766 KASSERT(kpreempt_disabled()); 2767 2768 if (l != curlwp) { 2769 return; 2770 } 2771 2772 /* 2773 * wait for pending TLB shootdowns to complete. necessary 2774 * because TLB shootdown state is per-CPU, and the LWP may 2775 * be coming off the CPU before it has a chance to call 2776 * pmap_update(). 2777 */ 2778 pmap_tlb_shootwait(); 2779 2780 ci = curcpu(); 2781 2782 if (ci->ci_want_pmapload) { 2783 KASSERT(vm_map_pmap(&l->l_proc->p_vmspace->vm_map) 2784 != pmap_kernel()); 2785 KASSERT(vm_map_pmap(&l->l_proc->p_vmspace->vm_map) 2786 != ci->ci_pmap || ci->ci_tlbstate != TLBSTATE_VALID); 2787 2788 /* 2789 * userspace has not been touched. 2790 * nothing to do here. 2791 */ 2792 2793 ci->ci_want_pmapload = 0; 2794 return; 2795 } 2796 2797 pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map); 2798 2799 if (pmap == pmap_kernel()) { 2800 return; 2801 } 2802 2803 #if defined(XEN) && defined(__x86_64__) 2804 KASSERT(pmap->pm_pdirpa == xen_current_user_pgd); 2805 #elif defined(PAE) 2806 KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(pmap_l3pd[0])); 2807 #elif !defined(XEN) || (defined(XEN) && defined(XEN3)) 2808 KASSERT(pmap->pm_pdirpa == pmap_pte2pa(rcr3())); 2809 #endif 2810 KASSERT(ci->ci_pmap == pmap); 2811 2812 /* 2813 * we aren't interested in TLB invalidations for this pmap, 2814 * at least for the time being. 2815 */ 2816 2817 KASSERT(ci->ci_tlbstate == TLBSTATE_VALID); 2818 ci->ci_tlbstate = TLBSTATE_LAZY; 2819 } 2820 2821 /* 2822 * end of lifecycle functions 2823 */ 2824 2825 /* 2826 * some misc. functions 2827 */ 2828 2829 static int 2830 pmap_pdes_invalid(vaddr_t va, pd_entry_t * const *pdes, pd_entry_t *lastpde) 2831 { 2832 int i; 2833 unsigned long index; 2834 pd_entry_t pde; 2835 2836 for (i = PTP_LEVELS; i > 1; i--) { 2837 index = pl_i(va, i); 2838 pde = pdes[i - 2][index]; 2839 if ((pde & PG_V) == 0) 2840 return i; 2841 } 2842 if (lastpde != NULL) 2843 *lastpde = pde; 2844 return 0; 2845 } 2846 2847 /* 2848 * pmap_extract: extract a PA for the given VA 2849 */ 2850 2851 bool 2852 pmap_extract(struct pmap *pmap, vaddr_t va, paddr_t *pap) 2853 { 2854 pt_entry_t *ptes, pte; 2855 pd_entry_t pde; 2856 pd_entry_t * const *pdes; 2857 struct pmap *pmap2; 2858 struct cpu_info *ci; 2859 vaddr_t pa; 2860 lwp_t *l; 2861 bool hard, rv; 2862 2863 rv = false; 2864 pa = 0; 2865 l = curlwp; 2866 2867 KPREEMPT_DISABLE(l); 2868 ci = l->l_cpu; 2869 if (__predict_true(!ci->ci_want_pmapload && ci->ci_pmap == pmap) || 2870 pmap == pmap_kernel()) { 2871 /* 2872 * no need to lock, because it's pmap_kernel() or our 2873 * own pmap and is active. if a user pmap, the caller 2874 * will hold the vm_map write/read locked and so prevent 2875 * entries from disappearing while we are here. ptps 2876 * can disappear via pmap_remove(), pmap_protect() and 2877 * pmap_collect(), but they are called with the vm_map 2878 * write locked. 2879 */ 2880 hard = false; 2881 ptes = PTE_BASE; 2882 pdes = normal_pdes; 2883 } else { 2884 /* we lose, do it the hard way. 
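 * the pmap is neither pmap_kernel() nor the pmap currently loaded
 * on this cpu, so its PTEs are not reachable through the recursive
 * mapping; pmap_map_ptes() locks the pmap and makes them temporarily
 * accessible (via the alternate recursive slot where that is used).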
*/ 2885 hard = true; 2886 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); 2887 } 2888 if (pmap_pdes_valid(va, pdes, &pde)) { 2889 pte = ptes[pl1_i(va)]; 2890 if (pde & PG_PS) { 2891 pa = (pde & PG_LGFRAME) | (va & (NBPD_L2 - 1)); 2892 rv = true; 2893 } else if (__predict_true((pte & PG_V) != 0)) { 2894 pa = pmap_pte2pa(pte) | (va & (NBPD_L1 - 1)); 2895 rv = true; 2896 } 2897 } 2898 if (__predict_false(hard)) { 2899 pmap_unmap_ptes(pmap, pmap2); 2900 } 2901 KPREEMPT_ENABLE(l); 2902 if (pap != NULL) { 2903 *pap = pa; 2904 } 2905 return rv; 2906 } 2907 2908 2909 /* 2910 * vtophys: virtual address to physical address. For use by 2911 * machine-dependent code only. 2912 */ 2913 2914 paddr_t 2915 vtophys(vaddr_t va) 2916 { 2917 paddr_t pa; 2918 2919 if (pmap_extract(pmap_kernel(), va, &pa) == true) 2920 return (pa); 2921 return (0); 2922 } 2923 2924 #ifdef XEN 2925 /* 2926 * pmap_extract_ma: extract an MA for the given VA 2927 */ 2928 2929 bool 2930 pmap_extract_ma(struct pmap *pmap, vaddr_t va, paddr_t *pap) 2931 { 2932 pt_entry_t *ptes, pte; 2933 pd_entry_t pde; 2934 pd_entry_t * const *pdes; 2935 struct pmap *pmap2; 2936 2937 kpreempt_disable(); 2938 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); 2939 if (!pmap_pdes_valid(va, pdes, &pde)) { 2940 pmap_unmap_ptes(pmap, pmap2); 2941 kpreempt_enable(); 2942 return false; 2943 } 2944 2945 pte = ptes[pl1_i(va)]; 2946 pmap_unmap_ptes(pmap, pmap2); 2947 kpreempt_enable(); 2948 2949 if (__predict_true((pte & PG_V) != 0)) { 2950 if (pap != NULL) 2951 *pap = (pte & PG_FRAME) | (va & (NBPD_L1 - 1)); 2952 return true; 2953 } 2954 2955 return false; 2956 } 2957 2958 /* 2959 * vtomach: virtual address to machine address. For use by 2960 * machine-dependent code only. 2961 */ 2962 2963 paddr_t 2964 vtomach(vaddr_t va) 2965 { 2966 paddr_t pa; 2967 2968 if (pmap_extract_ma(pmap_kernel(), va, &pa) == true) 2969 return (pa); 2970 return (0); 2971 } 2972 2973 #endif /* XEN */ 2974 2975 2976 2977 /* 2978 * pmap_virtual_space: used during bootup [pmap_steal_memory] to 2979 * determine the bounds of the kernel virtual address space. 2980 */ 2981 2982 void 2983 pmap_virtual_space(vaddr_t *startp, vaddr_t *endp) 2984 { 2985 *startp = virtual_avail; 2986 *endp = virtual_end; 2987 } 2988 2989 /* 2990 * pmap_map: map a range of PAs into kvm. 2991 * 2992 * => used during crash dump 2993 * => XXX: pmap_map() should be phased out? 2994 */ 2995 2996 vaddr_t 2997 pmap_map(vaddr_t va, paddr_t spa, paddr_t epa, vm_prot_t prot) 2998 { 2999 while (spa < epa) { 3000 pmap_kenter_pa(va, spa, prot); 3001 va += PAGE_SIZE; 3002 spa += PAGE_SIZE; 3003 } 3004 pmap_update(pmap_kernel()); 3005 return va; 3006 } 3007 3008 /* 3009 * pmap_zero_page: zero a page 3010 */ 3011 3012 void 3013 pmap_zero_page(paddr_t pa) 3014 { 3015 pt_entry_t *zpte; 3016 void *zerova; 3017 int id; 3018 3019 kpreempt_disable(); 3020 id = cpu_number(); 3021 zpte = PTESLEW(zero_pte, id); 3022 zerova = VASLEW(zerop, id); 3023 3024 #ifdef DIAGNOSTIC 3025 if (*zpte) 3026 panic("pmap_zero_page: lock botch"); 3027 #endif 3028 3029 pmap_pte_set(zpte, pmap_pa2pte(pa) | PG_V | PG_RW | PG_M | PG_U | PG_k); 3030 pmap_pte_flush(); 3031 pmap_update_pg((vaddr_t)zerova); /* flush TLB */ 3032 3033 memset(zerova, 0, PAGE_SIZE); 3034 3035 #if defined(DIAGNOSTIC) || defined(XEN) 3036 pmap_pte_set(zpte, 0); /* zap ! */ 3037 pmap_pte_flush(); 3038 #endif 3039 kpreempt_enable(); 3040 } 3041 3042 /* 3043 * pmap_pageidlezero: the same, for the idle loop page zero'er.
3044 * Returns true if the page was zero'd, false if we aborted for 3045 * some reason. 3046 */ 3047 3048 bool 3049 pmap_pageidlezero(paddr_t pa) 3050 { 3051 pt_entry_t *zpte; 3052 void *zerova; 3053 bool rv; 3054 int id; 3055 3056 id = cpu_number(); 3057 zpte = PTESLEW(zero_pte, id); 3058 zerova = VASLEW(zerop, id); 3059 3060 KASSERT(cpu_feature & CPUID_SSE2); 3061 KASSERT(*zpte == 0); 3062 3063 pmap_pte_set(zpte, pmap_pa2pte(pa) | PG_V | PG_RW | PG_M | PG_U | PG_k); 3064 pmap_pte_flush(); 3065 pmap_update_pg((vaddr_t)zerova); /* flush TLB */ 3066 3067 rv = sse2_idlezero_page(zerova); 3068 3069 #if defined(DIAGNOSTIC) || defined(XEN) 3070 pmap_pte_set(zpte, 0); /* zap ! */ 3071 pmap_pte_flush(); 3072 #endif 3073 3074 return rv; 3075 } 3076 3077 /* 3078 * pmap_copy_page: copy a page 3079 */ 3080 3081 void 3082 pmap_copy_page(paddr_t srcpa, paddr_t dstpa) 3083 { 3084 pt_entry_t *spte; 3085 pt_entry_t *dpte; 3086 void *csrcva; 3087 void *cdstva; 3088 int id; 3089 3090 kpreempt_disable(); 3091 id = cpu_number(); 3092 spte = PTESLEW(csrc_pte,id); 3093 dpte = PTESLEW(cdst_pte,id); 3094 csrcva = VASLEW(csrcp, id); 3095 cdstva = VASLEW(cdstp, id); 3096 3097 KASSERT(*spte == 0 && *dpte == 0); 3098 3099 pmap_pte_set(spte, pmap_pa2pte(srcpa) | PG_V | PG_RW | PG_U | PG_k); 3100 pmap_pte_set(dpte, 3101 pmap_pa2pte(dstpa) | PG_V | PG_RW | PG_M | PG_U | PG_k); 3102 pmap_pte_flush(); 3103 pmap_update_2pg((vaddr_t)csrcva, (vaddr_t)cdstva); 3104 3105 memcpy(cdstva, csrcva, PAGE_SIZE); 3106 3107 #if defined(DIAGNOSTIC) || defined(XEN) 3108 pmap_pte_set(spte, 0); 3109 pmap_pte_set(dpte, 0); 3110 pmap_pte_flush(); 3111 #endif 3112 kpreempt_enable(); 3113 } 3114 3115 static pt_entry_t * 3116 pmap_map_ptp(struct vm_page *ptp) 3117 { 3118 pt_entry_t *ptppte; 3119 void *ptpva; 3120 int id; 3121 3122 KASSERT(kpreempt_disabled()); 3123 3124 id = cpu_number(); 3125 ptppte = PTESLEW(ptp_pte, id); 3126 ptpva = VASLEW(ptpp, id); 3127 #if !defined(XEN) 3128 pmap_pte_set(ptppte, pmap_pa2pte(VM_PAGE_TO_PHYS(ptp)) | PG_V | PG_M | 3129 PG_RW | PG_U | PG_k); 3130 #else 3131 pmap_pte_set(ptppte, pmap_pa2pte(VM_PAGE_TO_PHYS(ptp)) | PG_V | PG_M | 3132 PG_U | PG_k); 3133 #endif 3134 pmap_pte_flush(); 3135 pmap_update_pg((vaddr_t)ptpva); 3136 3137 return (pt_entry_t *)ptpva; 3138 } 3139 3140 static void 3141 pmap_unmap_ptp(void) 3142 { 3143 #if defined(DIAGNOSTIC) || defined(XEN) 3144 pt_entry_t *pte; 3145 3146 KASSERT(kpreempt_disabled()); 3147 3148 pte = PTESLEW(ptp_pte, cpu_number()); 3149 if (*pte != 0) { 3150 pmap_pte_set(pte, 0); 3151 pmap_pte_flush(); 3152 } 3153 #endif 3154 } 3155 3156 static pt_entry_t * 3157 pmap_map_pte(struct pmap *pmap, struct vm_page *ptp, vaddr_t va) 3158 { 3159 3160 KASSERT(kpreempt_disabled()); 3161 if (pmap_is_curpmap(pmap)) { 3162 return &PTE_BASE[pl1_i(va)]; /* (k)vtopte */ 3163 } 3164 KASSERT(ptp != NULL); 3165 return pmap_map_ptp(ptp) + pl1_pi(va); 3166 } 3167 3168 static void 3169 pmap_unmap_pte(void) 3170 { 3171 3172 KASSERT(kpreempt_disabled()); 3173 3174 pmap_unmap_ptp(); 3175 } 3176 3177 /* 3178 * p m a p r e m o v e f u n c t i o n s 3179 * 3180 * functions that remove mappings 3181 */ 3182 3183 /* 3184 * pmap_remove_ptes: remove PTEs from a PTP 3185 * 3186 * => must have proper locking on pmap_master_lock 3187 * => caller must hold pmap's lock 3188 * => PTP must be mapped into KVA 3189 * => PTP should be null if pmap == pmap_kernel() 3190 * => must be called with kernel preemption disabled 3191 * => returns composite pte if at least one page should be shot down 3192 */ 3193 3194 
static pt_entry_t 3195 pmap_remove_ptes(struct pmap *pmap, struct vm_page *ptp, vaddr_t ptpva, 3196 vaddr_t startva, vaddr_t endva, int flags, 3197 struct pv_entry **pv_tofree) 3198 { 3199 struct pv_entry *pve; 3200 pt_entry_t *pte = (pt_entry_t *) ptpva; 3201 pt_entry_t opte, xpte = 0; 3202 3203 KASSERT(pmap == pmap_kernel() || mutex_owned(&pmap->pm_lock)); 3204 KASSERT(kpreempt_disabled()); 3205 3206 /* 3207 * note that ptpva points to the PTE that maps startva. this may 3208 * or may not be the first PTE in the PTP. 3209 * 3210 * we loop through the PTP while there are still PTEs to look at 3211 * and the wire_count is greater than 1 (because we use the wire_count 3212 * to keep track of the number of real PTEs in the PTP). 3213 */ 3214 3215 for (/*null*/; startva < endva && (ptp == NULL || ptp->wire_count > 1) 3216 ; pte++, startva += PAGE_SIZE) { 3217 struct vm_page *pg; 3218 struct pmap_page *pp; 3219 3220 if (!pmap_valid_entry(*pte)) 3221 continue; /* VA not mapped */ 3222 if ((flags & PMAP_REMOVE_SKIPWIRED) && (*pte & PG_W)) { 3223 continue; 3224 } 3225 3226 /* atomically save the old PTE and zap! it */ 3227 opte = pmap_pte_testset(pte, 0); 3228 if (!pmap_valid_entry(opte)) { 3229 continue; 3230 } 3231 3232 pmap_exec_account(pmap, startva, opte, 0); 3233 pmap_stats_update_bypte(pmap, 0, opte); 3234 xpte |= opte; 3235 3236 if (ptp) { 3237 ptp->wire_count--; /* dropping a PTE */ 3238 /* Make sure that the PDE is flushed */ 3239 if (ptp->wire_count <= 1) 3240 xpte |= PG_U; 3241 } 3242 3243 /* 3244 * if we are not on a pv_head list we are done. 3245 */ 3246 3247 if ((opte & PG_PVLIST) == 0) { 3248 #if defined(DIAGNOSTIC) && !defined(DOM0OPS) 3249 if (PHYS_TO_VM_PAGE(pmap_pte2pa(opte)) != NULL) 3250 panic("pmap_remove_ptes: managed page without " 3251 "PG_PVLIST for 0x%lx", startva); 3252 #endif 3253 continue; 3254 } 3255 3256 pg = PHYS_TO_VM_PAGE(pmap_pte2pa(opte)); 3257 #ifdef DIAGNOSTIC 3258 if (pg == NULL) 3259 panic("pmap_remove_ptes: unmanaged page marked " 3260 "PG_PVLIST, va = 0x%lx, pa = 0x%lx", 3261 startva, (u_long)pmap_pte2pa(opte)); 3262 #endif 3263 3264 /* sync R/M bits */ 3265 pp = VM_PAGE_TO_PP(pg); 3266 pp_lock(pp); 3267 pp->pp_attrs |= opte; 3268 pve = pmap_remove_pv(pp, ptp, startva); 3269 pp_unlock(pp); 3270 3271 if (pve != NULL) { 3272 pve->pve_next = *pv_tofree; 3273 *pv_tofree = pve; 3274 } 3275 3276 /* end of "for" loop: time for next pte */ 3277 } 3278 3279 return xpte; 3280 } 3281 3282 3283 /* 3284 * pmap_remove_pte: remove a single PTE from a PTP 3285 * 3286 * => must have proper locking on pmap_master_lock 3287 * => caller must hold pmap's lock 3288 * => PTP must be mapped into KVA 3289 * => PTP should be null if pmap == pmap_kernel() 3290 * => returns true if we removed a mapping 3291 * => must be called with kernel preemption disabled 3292 */ 3293 3294 static bool 3295 pmap_remove_pte(struct pmap *pmap, struct vm_page *ptp, pt_entry_t *pte, 3296 vaddr_t va, int flags, struct pv_entry **pv_tofree) 3297 { 3298 pt_entry_t opte; 3299 struct pv_entry *pve; 3300 struct vm_page *pg; 3301 struct pmap_page *pp; 3302 3303 KASSERT(pmap == pmap_kernel() || mutex_owned(&pmap->pm_lock)); 3304 KASSERT(pmap == pmap_kernel() || kpreempt_disabled()); 3305 3306 if (!pmap_valid_entry(*pte)) 3307 return(false); /* VA not mapped */ 3308 if ((flags & PMAP_REMOVE_SKIPWIRED) && (*pte & PG_W)) { 3309 return(false); 3310 } 3311 3312 /* atomically save the old PTE and zap! 
it */ 3313 opte = pmap_pte_testset(pte, 0); 3314 if (!pmap_valid_entry(opte)) { 3315 return false; 3316 } 3317 3318 pmap_exec_account(pmap, va, opte, 0); 3319 pmap_stats_update_bypte(pmap, 0, opte); 3320 3321 if (opte & PG_U) 3322 pmap_tlb_shootdown(pmap, va, 0, opte); 3323 3324 if (ptp) { 3325 ptp->wire_count--; /* dropping a PTE */ 3326 /* Make sure that the PDE is flushed */ 3327 if ((ptp->wire_count <= 1) && !(opte & PG_U)) 3328 pmap_tlb_shootdown(pmap, va, 0, opte); 3329 } 3330 3331 /* 3332 * if we are not on a pv_head list we are done. 3333 */ 3334 3335 if ((opte & PG_PVLIST) == 0) { 3336 #if defined(DIAGNOSTIC) && !defined(DOM0OPS) 3337 if (PHYS_TO_VM_PAGE(pmap_pte2pa(opte)) != NULL) 3338 panic("pmap_remove_pte: managed page without " 3339 "PG_PVLIST for 0x%lx", va); 3340 #endif 3341 return(true); 3342 } 3343 3344 pg = PHYS_TO_VM_PAGE(pmap_pte2pa(opte)); 3345 #ifdef DIAGNOSTIC 3346 if (pg == NULL) 3347 panic("pmap_remove_pte: unmanaged page marked " 3348 "PG_PVLIST, va = 0x%lx, pa = 0x%lx", va, 3349 (u_long)(pmap_pte2pa(opte))); 3350 #endif 3351 3352 /* sync R/M bits */ 3353 pp = VM_PAGE_TO_PP(pg); 3354 pp_lock(pp); 3355 pp->pp_attrs |= opte; 3356 pve = pmap_remove_pv(pp, ptp, va); 3357 pp_unlock(pp); 3358 3359 if (pve) { 3360 pve->pve_next = *pv_tofree; 3361 *pv_tofree = pve; 3362 } 3363 3364 return(true); 3365 } 3366 3367 /* 3368 * pmap_remove: top level mapping removal function 3369 * 3370 * => caller should not be holding any pmap locks 3371 */ 3372 3373 void 3374 pmap_remove(struct pmap *pmap, vaddr_t sva, vaddr_t eva) 3375 { 3376 pmap_do_remove(pmap, sva, eva, PMAP_REMOVE_ALL); 3377 } 3378 3379 /* 3380 * pmap_do_remove: mapping removal guts 3381 * 3382 * => caller should not be holding any pmap locks 3383 */ 3384 3385 static void 3386 pmap_do_remove(struct pmap *pmap, vaddr_t sva, vaddr_t eva, int flags) 3387 { 3388 pt_entry_t *ptes, xpte = 0; 3389 pd_entry_t pde; 3390 pd_entry_t * const *pdes; 3391 struct pv_entry *pv_tofree = NULL; 3392 bool result; 3393 paddr_t ptppa; 3394 vaddr_t blkendva, va = sva; 3395 struct vm_page *ptp; 3396 struct pmap *pmap2; 3397 3398 kpreempt_disable(); 3399 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); /* locks pmap */ 3400 3401 /* 3402 * removing one page? take shortcut function. 3403 */ 3404 3405 if (va + PAGE_SIZE == eva) { 3406 if (pmap_pdes_valid(va, pdes, &pde)) { 3407 3408 /* PA of the PTP */ 3409 ptppa = pmap_pte2pa(pde); 3410 3411 /* get PTP if non-kernel mapping */ 3412 if (pmap == pmap_kernel()) { 3413 /* we never free kernel PTPs */ 3414 ptp = NULL; 3415 } else { 3416 ptp = pmap_find_ptp(pmap, va, ptppa, 1); 3417 #ifdef DIAGNOSTIC 3418 if (ptp == NULL) 3419 panic("pmap_remove: unmanaged " 3420 "PTP detected"); 3421 #endif 3422 } 3423 3424 /* do it! */ 3425 result = pmap_remove_pte(pmap, ptp, 3426 &ptes[pl1_i(va)], va, flags, &pv_tofree); 3427 3428 /* 3429 * if mapping removed and the PTP is no longer 3430 * being used, free it! 3431 */ 3432 3433 if (result && ptp && ptp->wire_count <= 1) 3434 pmap_free_ptp(pmap, ptp, va, ptes, pdes); 3435 } 3436 } else for (/* null */ ; va < eva ; va = blkendva) { 3437 int lvl; 3438 3439 /* determine range of block */ 3440 blkendva = x86_round_pdr(va+1); 3441 if (blkendva > eva) 3442 blkendva = eva; 3443 3444 /* 3445 * XXXCDC: our PTE mappings should never be removed 3446 * with pmap_remove! if we allow this (and why would 3447 * we?) then we end up freeing the pmap's page 3448 * directory page (PDP) before we are finished using 3449 * it when we hit it in the recursive mapping. this 3450 * is BAD.
3451 * 3452 * long term solution is to move the PTEs out of user 3453 * address space. and into kernel address space (up 3454 * with APTE). then we can set VM_MAXUSER_ADDRESS to 3455 * be VM_MAX_ADDRESS. 3456 */ 3457 3458 if (pl_i(va, PTP_LEVELS) == PDIR_SLOT_PTE) 3459 /* XXXCDC: ugly hack to avoid freeing PDP here */ 3460 continue; 3461 3462 lvl = pmap_pdes_invalid(va, pdes, &pde); 3463 if (lvl != 0) { 3464 /* 3465 * skip a range corresponding to an invalid pde. 3466 */ 3467 blkendva = (va & ptp_masks[lvl - 1]) + nbpd[lvl - 1]; 3468 continue; 3469 } 3470 3471 /* PA of the PTP */ 3472 ptppa = pmap_pte2pa(pde); 3473 3474 /* get PTP if non-kernel mapping */ 3475 if (pmap == pmap_kernel()) { 3476 /* we never free kernel PTPs */ 3477 ptp = NULL; 3478 } else { 3479 ptp = pmap_find_ptp(pmap, va, ptppa, 1); 3480 #ifdef DIAGNOSTIC 3481 if (ptp == NULL) 3482 panic("pmap_remove: unmanaged PTP " 3483 "detected"); 3484 #endif 3485 } 3486 xpte |= pmap_remove_ptes(pmap, ptp, 3487 (vaddr_t)&ptes[pl1_i(va)], va, blkendva, 3488 flags, &pv_tofree); 3489 3490 /* if PTP is no longer being used, free it! */ 3491 if (ptp && ptp->wire_count <= 1) { 3492 pmap_free_ptp(pmap, ptp, va, ptes, pdes); 3493 } 3494 if ((xpte & PG_U) != 0) 3495 pmap_tlb_shootdown(pmap, sva, eva, xpte); 3496 } 3497 pmap_unmap_ptes(pmap, pmap2); /* unlock pmap */ 3498 kpreempt_enable(); 3499 3500 /* Now we free unused PVs */ 3501 if (pv_tofree) 3502 pmap_free_pvs(pv_tofree); 3503 } 3504 3505 /* 3506 * pmap_sync_pv: clear pte bits and return the old value of the pte. 3507 * 3508 * => called with pp_lock held. (thus preemption disabled) 3509 * => issues tlb shootdowns if necessary. 3510 */ 3511 3512 static int 3513 pmap_sync_pv(struct pv_pte *pvpte, pt_entry_t expect, int clearbits, 3514 pt_entry_t *optep) 3515 { 3516 struct pmap *pmap; 3517 struct vm_page *ptp; 3518 vaddr_t va; 3519 pt_entry_t *ptep; 3520 pt_entry_t opte; 3521 pt_entry_t npte; 3522 bool need_shootdown; 3523 3524 ptp = pvpte->pte_ptp; 3525 va = pvpte->pte_va; 3526 KASSERT(ptp == NULL || ptp->uobject != NULL); 3527 KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset); 3528 pmap = ptp_to_pmap(ptp); 3529 3530 KASSERT((expect & ~(PG_FRAME | PG_V)) == 0); 3531 KASSERT((expect & PG_V) != 0); 3532 KASSERT(clearbits == ~0 || (clearbits & ~(PG_M | PG_U | PG_RW)) == 0); 3533 KASSERT(kpreempt_disabled()); 3534 3535 ptep = pmap_map_pte(pmap, ptp, va); 3536 do { 3537 opte = *ptep; 3538 KASSERT((opte & (PG_M | PG_U)) != PG_M); 3539 KASSERT((opte & (PG_U | PG_V)) != PG_U); 3540 KASSERT(opte == 0 || (opte & PG_V) != 0); 3541 if ((opte & (PG_FRAME | PG_V)) != expect) { 3542 3543 /* 3544 * we lost a race with a V->P operation like 3545 * pmap_remove(). wait for the competitor 3546 * reflecting pte bits into mp_attrs. 3547 * 3548 * issue a redundant TLB shootdown so that 3549 * we can wait for its completion. 3550 */ 3551 3552 pmap_unmap_pte(); 3553 if (clearbits != 0) { 3554 pmap_tlb_shootdown(pmap, va, 0, 3555 (pmap == pmap_kernel() ? PG_G : 0)); 3556 } 3557 return EAGAIN; 3558 } 3559 3560 /* 3561 * check if there's anything to do on this pte. 3562 */ 3563 3564 if ((opte & clearbits) == 0) { 3565 need_shootdown = false; 3566 break; 3567 } 3568 3569 /* 3570 * we need a shootdown if the pte is cached. (PG_U) 3571 * 3572 * ...unless we are clearing only the PG_RW bit and 3573 * it isn't cached as RW. 
(PG_M) 3574 */ 3575 3576 need_shootdown = (opte & PG_U) != 0 && 3577 !(clearbits == PG_RW && (opte & PG_M) == 0); 3578 3579 npte = opte & ~clearbits; 3580 3581 /* 3582 * if we need a shootdown anyway, clear PG_U and PG_M. 3583 */ 3584 3585 if (need_shootdown) { 3586 npte &= ~(PG_U | PG_M); 3587 } 3588 KASSERT((npte & (PG_M | PG_U)) != PG_M); 3589 KASSERT((npte & (PG_U | PG_V)) != PG_U); 3590 KASSERT(npte == 0 || (opte & PG_V) != 0); 3591 } while (pmap_pte_cas(ptep, opte, npte) != opte); 3592 3593 if (need_shootdown) { 3594 pmap_tlb_shootdown(pmap, va, 0, opte); 3595 } 3596 pmap_unmap_pte(); 3597 3598 *optep = opte; 3599 return 0; 3600 } 3601 3602 /* 3603 * pmap_page_remove: remove a managed vm_page from all pmaps that map it 3604 * 3605 * => R/M bits are sync'd back to attrs 3606 */ 3607 3608 void 3609 pmap_page_remove(struct vm_page *pg) 3610 { 3611 struct pmap_page *pp; 3612 struct pv_pte *pvpte; 3613 struct pv_entry *killlist = NULL; 3614 struct vm_page *ptp; 3615 pt_entry_t expect; 3616 lwp_t *l; 3617 int count; 3618 3619 #ifdef DIAGNOSTIC 3620 int bank, off; 3621 3622 bank = vm_physseg_find(atop(VM_PAGE_TO_PHYS(pg)), &off); 3623 if (bank == -1) 3624 panic("pmap_page_remove: unmanaged page?"); 3625 #endif 3626 3627 l = curlwp; 3628 pp = VM_PAGE_TO_PP(pg); 3629 expect = pmap_pa2pte(VM_PAGE_TO_PHYS(pg)) | PG_V; 3630 count = SPINLOCK_BACKOFF_MIN; 3631 kpreempt_disable(); 3632 startover: 3633 pp_lock(pp); 3634 while ((pvpte = pv_pte_first(pp)) != NULL) { 3635 struct pmap *pmap; 3636 struct pv_entry *pve; 3637 pt_entry_t opte; 3638 vaddr_t va; 3639 int error; 3640 3641 /* 3642 * add a reference to the pmap before clearing the pte. 3643 * otherwise the pmap can disappear behind us. 3644 */ 3645 3646 ptp = pvpte->pte_ptp; 3647 pmap = ptp_to_pmap(ptp); 3648 if (ptp != NULL) { 3649 pmap_reference(pmap); 3650 } 3651 3652 error = pmap_sync_pv(pvpte, expect, ~0, &opte); 3653 if (error == EAGAIN) { 3654 int hold_count; 3655 pp_unlock(pp); 3656 KERNEL_UNLOCK_ALL(curlwp, &hold_count); 3657 if (ptp != NULL) { 3658 pmap_destroy(pmap); 3659 } 3660 SPINLOCK_BACKOFF(count); 3661 KERNEL_LOCK(hold_count, curlwp); 3662 goto startover; 3663 } 3664 3665 pp->pp_attrs |= opte; 3666 va = pvpte->pte_va; 3667 pve = pmap_remove_pv(pp, ptp, va); 3668 pp_unlock(pp); 3669 3670 /* update the PTP reference count. free if last reference. */ 3671 if (ptp != NULL) { 3672 struct pmap *pmap2; 3673 pt_entry_t *ptes; 3674 pd_entry_t * const *pdes; 3675 3676 KASSERT(pmap != pmap_kernel()); 3677 3678 pmap_tlb_shootwait(); 3679 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); 3680 pmap_stats_update_bypte(pmap, 0, opte); 3681 ptp->wire_count--; 3682 if (ptp->wire_count <= 1) { 3683 pmap_free_ptp(pmap, ptp, va, ptes, pdes); 3684 } 3685 pmap_unmap_ptes(pmap, pmap2); 3686 pmap_destroy(pmap); 3687 } else { 3688 KASSERT(pmap == pmap_kernel()); 3689 pmap_stats_update_bypte(pmap, 0, opte); 3690 } 3691 3692 if (pve != NULL) { 3693 pve->pve_next = killlist; /* mark it for death */ 3694 killlist = pve; 3695 } 3696 pp_lock(pp); 3697 } 3698 pp_unlock(pp); 3699 kpreempt_enable(); 3700 3701 /* Now free unused pvs. */ 3702 pmap_free_pvs(killlist); 3703 } 3704 3705 /* 3706 * p m a p a t t r i b u t e f u n c t i o n s 3707 * functions that test/change managed page's attributes 3708 * since a page can be mapped multiple times we must check each PTE that 3709 * maps it by going down the pv lists. 
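 *
 * the MI interface is expected to reach these through thin wrappers
 * in pmap.h, roughly like (illustrative only):
 *
 *	pmap_is_modified(pg)		-> pmap_test_attrs(pg, PG_M)
 *	pmap_clear_modify(pg)		-> pmap_clear_attrs(pg, PG_M)
 *	pmap_is_referenced(pg)		-> pmap_test_attrs(pg, PG_U)
 *	pmap_clear_reference(pg)	-> pmap_clear_attrs(pg, PG_U)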
3710 */ 3711 3712 /* 3713 * pmap_test_attrs: test a page's attributes 3714 */ 3715 3716 bool 3717 pmap_test_attrs(struct vm_page *pg, unsigned testbits) 3718 { 3719 struct pmap_page *pp; 3720 struct pv_pte *pvpte; 3721 pt_entry_t expect; 3722 u_int result; 3723 3724 #ifdef DIAGNOSTIC 3725 int bank, off; 3726 3727 bank = vm_physseg_find(atop(VM_PAGE_TO_PHYS(pg)), &off); 3728 if (bank == -1) 3729 panic("pmap_test_attrs: unmanaged page?"); 3730 #endif 3731 3732 pp = VM_PAGE_TO_PP(pg); 3733 if ((pp->pp_attrs & testbits) != 0) { 3734 return true; 3735 } 3736 expect = pmap_pa2pte(VM_PAGE_TO_PHYS(pg)) | PG_V; 3737 pp_lock(pp); 3738 for (pvpte = pv_pte_first(pp); pvpte; pvpte = pv_pte_next(pp, pvpte)) { 3739 pt_entry_t opte; 3740 int error; 3741 3742 if ((pp->pp_attrs & testbits) != 0) { 3743 break; 3744 } 3745 error = pmap_sync_pv(pvpte, expect, 0, &opte); 3746 if (error == 0) { 3747 pp->pp_attrs |= opte; 3748 } 3749 } 3750 result = pp->pp_attrs & testbits; 3751 pp_unlock(pp); 3752 3753 /* 3754 * note that we will exit the for loop with a non-null pvpte if 3755 * we have found the bits we are testing for. 3756 */ 3757 3758 return result != 0; 3759 } 3760 3761 /* 3762 * pmap_clear_attrs: clear the specified attribute for a page. 3763 * 3764 * => we return true if we cleared one of the bits we were asked to 3765 */ 3766 3767 bool 3768 pmap_clear_attrs(struct vm_page *pg, unsigned clearbits) 3769 { 3770 struct pmap_page *pp; 3771 struct pv_pte *pvpte; 3772 u_int result; 3773 pt_entry_t expect; 3774 int count; 3775 #ifdef DIAGNOSTIC 3776 int bank, off; 3777 3778 bank = vm_physseg_find(atop(VM_PAGE_TO_PHYS(pg)), &off); 3779 if (bank == -1) 3780 panic("pmap_clear_attrs: unmanaged page?"); 3781 #endif 3782 3783 pp = VM_PAGE_TO_PP(pg); 3784 expect = pmap_pa2pte(VM_PAGE_TO_PHYS(pg)) | PG_V; 3785 count = SPINLOCK_BACKOFF_MIN; 3786 kpreempt_disable(); 3787 startover: 3788 pp_lock(pp); 3789 for (pvpte = pv_pte_first(pp); pvpte; pvpte = pv_pte_next(pp, pvpte)) { 3790 pt_entry_t opte; 3791 int error; 3792 3793 error = pmap_sync_pv(pvpte, expect, clearbits, &opte); 3794 if (error == EAGAIN) { 3795 int hold_count; 3796 pp_unlock(pp); 3797 KERNEL_UNLOCK_ALL(curlwp, &hold_count); 3798 SPINLOCK_BACKOFF(count); 3799 KERNEL_LOCK(hold_count, curlwp); 3800 goto startover; 3801 } 3802 pp->pp_attrs |= opte; 3803 } 3804 result = pp->pp_attrs & clearbits; 3805 pp->pp_attrs &= ~clearbits; 3806 pp_unlock(pp); 3807 kpreempt_enable(); 3808 3809 return result != 0; 3810 } 3811 3812 3813 /* 3814 * p m a p p r o t e c t i o n f u n c t i o n s 3815 */ 3816 3817 /* 3818 * pmap_page_protect: change the protection of all recorded mappings 3819 * of a managed page 3820 * 3821 * => NOTE: this is an inline function in pmap.h 3822 */ 3823 3824 /* see pmap.h */ 3825 3826 /* 3827 * pmap_protect: set the protection of the pages in a pmap 3828 * 3829 * => NOTE: this is an inline function in pmap.h 3830 */ 3831 3832 /* see pmap.h */ 3833 3834 /* 3835 * pmap_write_protect: write-protect pages in a pmap 3836 */ 3837 3838 void 3839 pmap_write_protect(struct pmap *pmap, vaddr_t sva, vaddr_t eva, vm_prot_t prot) 3840 { 3841 pt_entry_t *ptes, *epte; 3842 pt_entry_t *spte; 3843 pd_entry_t * const *pdes; 3844 vaddr_t blockend, va; 3845 pt_entry_t opte; 3846 struct pmap *pmap2; 3847 3848 KASSERT(curlwp->l_md.md_gc_pmap != pmap); 3849 3850 kpreempt_disable(); 3851 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); /* locks pmap */ 3852 3853 /* should be ok, but just in case ...
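 * (i.e. defensively round the range to page boundaries in case a
 * caller handed us unaligned addresses)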
*/ 3854 sva &= PG_FRAME; 3855 eva &= PG_FRAME; 3856 3857 for (va = sva ; va < eva ; va = blockend) { 3858 3859 blockend = (va & L2_FRAME) + NBPD_L2; 3860 if (blockend > eva) 3861 blockend = eva; 3862 3863 /* 3864 * XXXCDC: our PTE mappings should never be write-protected! 3865 * 3866 * long term solution is to move the PTEs out of user 3867 * address space. and into kernel address space (up 3868 * with APTE). then we can set VM_MAXUSER_ADDRESS to 3869 * be VM_MAX_ADDRESS. 3870 */ 3871 3872 /* XXXCDC: ugly hack to avoid freeing PDP here */ 3873 if (pl_i(va, PTP_LEVELS) == PDIR_SLOT_PTE) 3874 continue; 3875 3876 /* empty block? */ 3877 if (!pmap_pdes_valid(va, pdes, NULL)) 3878 continue; 3879 3880 #ifdef DIAGNOSTIC 3881 if (va >= VM_MAXUSER_ADDRESS && 3882 va < VM_MAX_ADDRESS) 3883 panic("pmap_write_protect: PTE space"); 3884 #endif 3885 3886 spte = &ptes[pl1_i(va)]; 3887 epte = &ptes[pl1_i(blockend)]; 3888 3889 for (/*null */; spte < epte ; spte++) { 3890 pt_entry_t npte; 3891 3892 do { 3893 opte = *spte; 3894 if ((~opte & (PG_RW | PG_V)) != 0) { 3895 goto next; 3896 } 3897 npte = opte & ~PG_RW; 3898 } while (pmap_pte_cas(spte, opte, npte) != opte); 3899 if ((opte & PG_M) != 0) { 3900 vaddr_t tva; 3901 3902 tva = x86_ptob(spte - ptes); 3903 pmap_tlb_shootdown(pmap, tva, 0, opte); 3904 } 3905 next:; 3906 } 3907 } 3908 3909 pmap_unmap_ptes(pmap, pmap2); /* unlocks pmap */ 3910 kpreempt_enable(); 3911 } 3912 3913 /* 3914 * end of protection functions 3915 */ 3916 3917 /* 3918 * pmap_unwire: clear the wired bit in the PTE 3919 * 3920 * => mapping should already be in map 3921 */ 3922 3923 void 3924 pmap_unwire(struct pmap *pmap, vaddr_t va) 3925 { 3926 pt_entry_t *ptes; 3927 pd_entry_t * const *pdes; 3928 struct pmap *pmap2; 3929 3930 kpreempt_disable(); 3931 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); /* locks pmap */ 3932 3933 if (pmap_pdes_valid(va, pdes, NULL)) { 3934 pt_entry_t *ptep = &ptes[pl1_i(va)]; 3935 pt_entry_t opte = *ptep; 3936 3937 #ifdef DIAGNOSTIC 3938 if (!pmap_valid_entry(opte)) 3939 panic("pmap_unwire: invalid (unmapped) va 0x%lx", va); 3940 #endif 3941 if ((opte & PG_W) != 0) { 3942 pt_entry_t npte = opte & ~PG_W; 3943 3944 opte = pmap_pte_testset(ptep, npte); 3945 pmap_stats_update_bypte(pmap, npte, opte); 3946 } 3947 #ifdef DIAGNOSTIC 3948 else { 3949 printf("pmap_unwire: wiring for pmap %p va 0x%lx " 3950 "didn't change!\n", pmap, va); 3951 } 3952 #endif 3953 pmap_unmap_ptes(pmap, pmap2); /* unlocks map */ 3954 } 3955 #ifdef DIAGNOSTIC 3956 else { 3957 panic("pmap_unwire: invalid PDE"); 3958 } 3959 #endif 3960 kpreempt_enable(); 3961 } 3962 3963 /* 3964 * pmap_collect: free resources held by a pmap 3965 * 3966 * => optional function. 3967 * => called when a process is swapped out to free memory. 3968 */ 3969 3970 void 3971 pmap_collect(struct pmap *pmap) 3972 { 3973 /* 3974 * free all of the pt pages by removing the physical mappings 3975 * for its entire address space. 3976 */ 3977 3978 pmap_do_remove(pmap, VM_MIN_ADDRESS, VM_MAXUSER_ADDRESS, 3979 PMAP_REMOVE_SKIPWIRED); 3980 } 3981 3982 /* 3983 * pmap_copy: copy mappings from one pmap to another 3984 * 3985 * => optional function 3986 * void pmap_copy(dst_pmap, src_pmap, dst_addr, len, src_addr) 3987 */ 3988 3989 /* 3990 * defined as macro in pmap.h 3991 */ 3992 3993 /* 3994 * pmap_enter: enter a mapping into a pmap 3995 * 3996 * => must be done "now" ... 
no lazy-evaluation 3997 * => we set pmap => pv_head locking 3998 */ 3999 #ifdef XEN 4000 int 4001 pmap_enter_ma(struct pmap *pmap, vaddr_t va, paddr_t ma, paddr_t pa, 4002 vm_prot_t prot, int flags, int domid) 4003 { 4004 #else /* XEN */ 4005 int 4006 pmap_enter(struct pmap *pmap, vaddr_t va, paddr_t pa, vm_prot_t prot, 4007 int flags) 4008 { 4009 paddr_t ma = pa; 4010 #endif /* XEN */ 4011 pt_entry_t *ptes, opte, npte; 4012 pt_entry_t *ptep; 4013 pd_entry_t * const *pdes; 4014 struct vm_page *ptp, *pg; 4015 struct pmap_page *new_pp; 4016 struct pmap_page *old_pp; 4017 struct pv_entry *old_pve = NULL; 4018 struct pv_entry *new_pve; 4019 struct pv_entry *new_pve2; 4020 int error; 4021 bool wired = (flags & PMAP_WIRED) != 0; 4022 struct pmap *pmap2; 4023 4024 KASSERT(pmap_initialized); 4025 KASSERT(curlwp->l_md.md_gc_pmap != pmap); 4026 4027 #ifdef DIAGNOSTIC 4028 /* sanity check: totally out of range? */ 4029 if (va >= VM_MAX_KERNEL_ADDRESS) 4030 panic("pmap_enter: too big"); 4031 4032 if (va == (vaddr_t) PDP_BASE || va == (vaddr_t) APDP_BASE) 4033 panic("pmap_enter: trying to map over PDP/APDP!"); 4034 4035 /* sanity check: kernel PTPs should already have been pre-allocated */ 4036 if (va >= VM_MIN_KERNEL_ADDRESS && 4037 !pmap_valid_entry(pmap->pm_pdir[pl_i(va, PTP_LEVELS)])) 4038 panic("pmap_enter: missing kernel PTP for va %lx!", va); 4039 #endif /* DIAGNOSTIC */ 4040 #ifdef XEN 4041 KASSERT(domid == DOMID_SELF || pa == 0); 4042 #endif /* XEN */ 4043 4044 npte = ma | protection_codes[prot] | PG_V; 4045 if (wired) 4046 npte |= PG_W; 4047 if (va < VM_MAXUSER_ADDRESS) 4048 npte |= PG_u; 4049 else if (va < VM_MAX_ADDRESS) 4050 npte |= (PG_u | PG_RW); /* XXXCDC: no longer needed? */ 4051 else 4052 npte |= PG_k; 4053 if (pmap == pmap_kernel()) 4054 npte |= pmap_pg_g; 4055 if (flags & VM_PROT_ALL) { 4056 npte |= PG_U; 4057 if (flags & VM_PROT_WRITE) { 4058 KASSERT((npte & PG_RW) != 0); 4059 npte |= PG_M; 4060 } 4061 } 4062 4063 #ifdef XEN 4064 if (domid != DOMID_SELF) 4065 pg = NULL; 4066 else 4067 #endif 4068 pg = PHYS_TO_VM_PAGE(pa); 4069 if (pg != NULL) { 4070 /* This is a managed page */ 4071 npte |= PG_PVLIST; 4072 new_pp = VM_PAGE_TO_PP(pg); 4073 } else { 4074 new_pp = NULL; 4075 } 4076 4077 /* get pves. */ 4078 new_pve = pool_cache_get(&pmap_pv_cache, PR_NOWAIT); 4079 new_pve2 = pool_cache_get(&pmap_pv_cache, PR_NOWAIT); 4080 if (new_pve == NULL || new_pve2 == NULL) { 4081 if (flags & PMAP_CANFAIL) { 4082 error = ENOMEM; 4083 goto out2; 4084 } 4085 panic("pmap_enter: pve allocation failed"); 4086 } 4087 4088 kpreempt_disable(); 4089 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); /* locks pmap */ 4090 if (pmap == pmap_kernel()) { 4091 ptp = NULL; 4092 } else { 4093 ptp = pmap_get_ptp(pmap, va, pdes); 4094 if (ptp == NULL) { 4095 pmap_unmap_ptes(pmap, pmap2); 4096 if (flags & PMAP_CANFAIL) { 4097 error = ENOMEM; 4098 goto out; 4099 } 4100 panic("pmap_enter: get ptp failed"); 4101 } 4102 } 4103 4104 /* 4105 * update the pte. 4106 */ 4107 4108 ptep = &ptes[pl1_i(va)]; 4109 do { 4110 opte = *ptep; 4111 4112 /* 4113 * if the same page, inherit PG_U and PG_M. 
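 *
 * (i.e. if a valid mapping of the same frame is already present,
 * carry over any referenced/modified bits the MMU has set in it,
 * since they have not yet been folded into pp_attrs and would
 * otherwise be lost when the PTE is overwritten.)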
4114 */ 4115 if (((opte ^ npte) & (PG_FRAME | PG_V)) == 0) { 4116 npte |= opte & (PG_U | PG_M); 4117 } 4118 #if defined(XEN) 4119 if (domid != DOMID_SELF) { 4120 /* pmap_pte_cas with error handling */ 4121 int s = splvm(); 4122 if (opte != *ptep) { 4123 splx(s); 4124 continue; 4125 } 4126 error = xpq_update_foreign( 4127 vtomach((vaddr_t)ptep), npte, domid); 4128 splx(s); 4129 if (error) { 4130 if (ptp != NULL && ptp->wire_count <= 1) { 4131 pmap_free_ptp(pmap, ptp, va, ptes, pdes); 4132 } 4133 pmap_unmap_ptes(pmap, pmap2); 4134 goto out; 4135 } 4136 break; 4137 } 4138 #endif /* defined(XEN) */ 4139 } while (pmap_pte_cas(ptep, opte, npte) != opte); 4140 4141 /* 4142 * update statistics and PTP's reference count. 4143 */ 4144 4145 pmap_stats_update_bypte(pmap, npte, opte); 4146 if (ptp != NULL && !pmap_valid_entry(opte)) { 4147 ptp->wire_count++; 4148 } 4149 KASSERT(ptp == NULL || ptp->wire_count > 1); 4150 4151 /* 4152 * if the same page, we can skip pv_entry handling. 4153 */ 4154 4155 if (((opte ^ npte) & (PG_FRAME | PG_V)) == 0) { 4156 KASSERT(((opte ^ npte) & PG_PVLIST) == 0); 4157 goto same_pa; 4158 } 4159 4160 /* 4161 * if old page is managed, remove pv_entry from its list. 4162 */ 4163 4164 if ((~opte & (PG_V | PG_PVLIST)) == 0) { 4165 pg = PHYS_TO_VM_PAGE(pmap_pte2pa(opte)); 4166 #ifdef DIAGNOSTIC 4167 if (pg == NULL) 4168 panic("pmap_enter: PG_PVLIST mapping with " 4169 "unmanaged page " 4170 "pa = 0x%" PRIx64 " (0x%" PRIx64 ")", 4171 (int64_t)pa, (int64_t)atop(pa)); 4172 #endif 4173 old_pp = VM_PAGE_TO_PP(pg); 4174 4175 pp_lock(old_pp); 4176 old_pve = pmap_remove_pv(old_pp, ptp, va); 4177 old_pp->pp_attrs |= opte; 4178 pp_unlock(old_pp); 4179 } 4180 4181 /* 4182 * if new page is managed, insert pv_entry into its list. 4183 */ 4184 4185 if (new_pp) { 4186 pp_lock(new_pp); 4187 new_pve = pmap_enter_pv(new_pp, new_pve, &new_pve2, ptp, va); 4188 pp_unlock(new_pp); 4189 } 4190 4191 same_pa: 4192 pmap_unmap_ptes(pmap, pmap2); 4193 4194 /* 4195 * shootdown tlb if necessary. 4196 */ 4197 4198 if ((~opte & (PG_V | PG_U)) == 0 && 4199 ((opte ^ npte) & (PG_FRAME | PG_RW)) != 0) { 4200 pmap_tlb_shootdown(pmap, va, 0, opte); 4201 } 4202 4203 error = 0; 4204 out: 4205 kpreempt_enable(); 4206 out2: 4207 if (old_pve != NULL) { 4208 pool_cache_put(&pmap_pv_cache, old_pve); 4209 } 4210 if (new_pve != NULL) { 4211 pool_cache_put(&pmap_pv_cache, new_pve); 4212 } 4213 if (new_pve2 != NULL) { 4214 pool_cache_put(&pmap_pv_cache, new_pve2); 4215 } 4216 4217 return error; 4218 } 4219 4220 #ifdef XEN 4221 int 4222 pmap_enter(struct pmap *pmap, vaddr_t va, paddr_t pa, vm_prot_t prot, int flags) 4223 { 4224 paddr_t ma; 4225 4226 if (__predict_false(pa < pmap_pa_start || pmap_pa_end <= pa)) { 4227 ma = pa; /* XXX hack */ 4228 } else { 4229 ma = xpmap_ptom(pa); 4230 } 4231 4232 return pmap_enter_ma(pmap, va, ma, pa, prot, flags, DOMID_SELF); 4233 } 4234 #endif /* XEN */ 4235 4236 static bool 4237 pmap_get_physpage(vaddr_t va, int level, paddr_t *paddrp) 4238 { 4239 struct vm_page *ptp; 4240 struct pmap *kpm = pmap_kernel(); 4241 4242 if (uvm.page_init_done == false) { 4243 /* 4244 * we're growing the kernel pmap early (from 4245 * uvm_pageboot_alloc()). this case must be 4246 * handled a little differently. 
4247 */ 4248 4249 if (uvm_page_physget(paddrp) == false) 4250 panic("pmap_get_physpage: out of memory"); 4251 kpreempt_disable(); 4252 pmap_pte_set(early_zero_pte, 4253 pmap_pa2pte(*paddrp) | PG_V | PG_RW | PG_k); 4254 pmap_pte_flush(); 4255 pmap_update_pg((vaddr_t)early_zerop); 4256 memset(early_zerop, 0, PAGE_SIZE); 4257 #if defined(DIAGNOSTIC) || defined (XEN) 4258 pmap_pte_set(early_zero_pte, 0); 4259 pmap_pte_flush(); 4260 #endif /* defined(DIAGNOSTIC) */ 4261 kpreempt_enable(); 4262 } else { 4263 /* XXX */ 4264 PMAP_SUBOBJ_LOCK(kpm, level - 1); 4265 ptp = uvm_pagealloc(&kpm->pm_obj[level - 1], 4266 ptp_va2o(va, level), NULL, 4267 UVM_PGA_USERESERVE|UVM_PGA_ZERO); 4268 PMAP_SUBOBJ_UNLOCK(kpm, level - 1); 4269 if (ptp == NULL) 4270 panic("pmap_get_physpage: out of memory"); 4271 ptp->flags &= ~PG_BUSY; 4272 ptp->wire_count = 1; 4273 *paddrp = VM_PAGE_TO_PHYS(ptp); 4274 } 4275 pmap_stats_update(kpm, 1, 0); 4276 return true; 4277 } 4278 4279 /* 4280 * Allocate the amount of specified ptps for a ptp level, and populate 4281 * all levels below accordingly, mapping virtual addresses starting at 4282 * kva. 4283 * 4284 * Used by pmap_growkernel. 4285 */ 4286 static void 4287 pmap_alloc_level(pd_entry_t * const *pdes, vaddr_t kva, int lvl, 4288 long *needed_ptps) 4289 { 4290 unsigned long i; 4291 vaddr_t va; 4292 paddr_t pa; 4293 unsigned long index, endindex; 4294 int level; 4295 pd_entry_t *pdep; 4296 #ifdef XEN 4297 int s = splvm(); /* protect xpq_* */ 4298 #endif 4299 4300 for (level = lvl; level > 1; level--) { 4301 if (level == PTP_LEVELS) 4302 pdep = pmap_kernel()->pm_pdir; 4303 else 4304 pdep = pdes[level - 2]; 4305 va = kva; 4306 index = pl_i_roundup(kva, level); 4307 endindex = index + needed_ptps[level - 1] - 1; 4308 4309 4310 for (i = index; i <= endindex; i++) { 4311 KASSERT(!pmap_valid_entry(pdep[i])); 4312 pmap_get_physpage(va, level - 1, &pa); 4313 #ifdef XEN 4314 xpq_queue_pte_update((level == PTP_LEVELS) ? 4315 xpmap_ptom(pmap_pdirpa(pmap_kernel(), i)) : 4316 xpmap_ptetomach(&pdep[i]), 4317 pmap_pa2pte(pa) | PG_k | PG_V | PG_RW); 4318 #ifdef PAE 4319 if (level == PTP_LEVELS && i > L2_SLOT_KERN) { 4320 /* update real kernel PD too */ 4321 xpq_queue_pte_update( 4322 xpmap_ptetomach(&pmap_kl2pd[l2tol2(i)]), 4323 pmap_pa2pte(pa) | PG_k | PG_V | PG_RW); 4324 } 4325 #endif 4326 #else /* XEN */ 4327 pdep[i] = pa | PG_RW | PG_V; 4328 #endif /* XEN */ 4329 KASSERT(level != PTP_LEVELS || nkptp[level - 1] + 4330 pl_i(VM_MIN_KERNEL_ADDRESS, level) == i); 4331 nkptp[level - 1]++; 4332 va += nbpd[level - 1]; 4333 } 4334 pmap_pte_flush(); 4335 } 4336 #ifdef XEN 4337 splx(s); 4338 #endif 4339 } 4340 4341 /* 4342 * pmap_growkernel: increase usage of KVM space 4343 * 4344 * => we allocate new PTPs for the kernel and install them in all 4345 * the pmaps on the system. 4346 */ 4347 4348 vaddr_t 4349 pmap_growkernel(vaddr_t maxkvaddr) 4350 { 4351 struct pmap *kpm = pmap_kernel(); 4352 #if !defined(XEN) || !defined(__x86_64__) 4353 struct pmap *pm; 4354 #endif 4355 int s, i; 4356 long needed_kptp[PTP_LEVELS], target_nptp, old; 4357 bool invalidate = false; 4358 4359 s = splvm(); /* to be safe */ 4360 mutex_enter(&kpm->pm_lock); 4361 4362 if (maxkvaddr <= pmap_maxkvaddr) { 4363 mutex_exit(&kpm->pm_lock); 4364 splx(s); 4365 return pmap_maxkvaddr; 4366 } 4367 4368 maxkvaddr = x86_round_pdr(maxkvaddr); 4369 old = nkptp[PTP_LEVELS - 1]; 4370 /* 4371 * This loop could be optimized more, but pmap_growkernel() 4372 * is called infrequently. 
4373 */ 4374 for (i = PTP_LEVELS - 1; i >= 1; i--) { 4375 target_nptp = pl_i_roundup(maxkvaddr, i + 1) - 4376 pl_i_roundup(VM_MIN_KERNEL_ADDRESS, i + 1); 4377 /* 4378 * XXX only need to check toplevel. 4379 */ 4380 if (target_nptp > nkptpmax[i]) 4381 panic("out of KVA space"); 4382 KASSERT(target_nptp >= nkptp[i]); 4383 needed_kptp[i] = target_nptp - nkptp[i]; 4384 } 4385 4386 pmap_alloc_level(normal_pdes, pmap_maxkvaddr, PTP_LEVELS, needed_kptp); 4387 4388 /* 4389 * If the number of top level entries changed, update all 4390 * pmaps. 4391 */ 4392 if (needed_kptp[PTP_LEVELS - 1] != 0) { 4393 #ifdef XEN 4394 #ifdef __x86_64__ 4395 /* nothing, kernel entries are never entered in user pmap */ 4396 #else /* __x86_64__ */ 4397 mutex_enter(&pmaps_lock); 4398 LIST_FOREACH(pm, &pmaps, pm_list) { 4399 int pdkidx; 4400 for (pdkidx = PDIR_SLOT_KERN + old; 4401 pdkidx < PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1]; 4402 pdkidx++) { 4403 xpq_queue_pte_update( 4404 xpmap_ptom(pmap_pdirpa(pm, pdkidx)), 4405 kpm->pm_pdir[pdkidx]); 4406 } 4407 xpq_flush_queue(); 4408 } 4409 mutex_exit(&pmaps_lock); 4410 #endif /* __x86_64__ */ 4411 #else /* XEN */ 4412 unsigned newpdes; 4413 newpdes = nkptp[PTP_LEVELS - 1] - old; 4414 mutex_enter(&pmaps_lock); 4415 LIST_FOREACH(pm, &pmaps, pm_list) { 4416 memcpy(&pm->pm_pdir[PDIR_SLOT_KERN + old], 4417 &kpm->pm_pdir[PDIR_SLOT_KERN + old], 4418 newpdes * sizeof (pd_entry_t)); 4419 } 4420 mutex_exit(&pmaps_lock); 4421 #endif 4422 invalidate = true; 4423 } 4424 pmap_maxkvaddr = maxkvaddr; 4425 mutex_exit(&kpm->pm_lock); 4426 splx(s); 4427 4428 if (invalidate) { 4429 /* Invalidate the PDP cache. */ 4430 pool_cache_invalidate(&pmap_pdp_cache); 4431 } 4432 4433 return maxkvaddr; 4434 } 4435 4436 #ifdef DEBUG 4437 void pmap_dump(struct pmap *, vaddr_t, vaddr_t); 4438 4439 /* 4440 * pmap_dump: dump all the mappings from a pmap 4441 * 4442 * => caller should not be holding any pmap locks 4443 */ 4444 4445 void 4446 pmap_dump(struct pmap *pmap, vaddr_t sva, vaddr_t eva) 4447 { 4448 pt_entry_t *ptes, *pte; 4449 pd_entry_t * const *pdes; 4450 struct pmap *pmap2; 4451 vaddr_t blkendva; 4452 4453 /* 4454 * if end is out of range truncate. 4455 * if (end == start) update to max. 4456 */ 4457 4458 if (eva > VM_MAXUSER_ADDRESS || eva <= sva) 4459 eva = VM_MAXUSER_ADDRESS; 4460 4461 /* 4462 * we lock in the pmap => pv_head direction 4463 */ 4464 4465 kpreempt_disable(); 4466 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); /* locks pmap */ 4467 4468 /* 4469 * dumping a range of pages: we dump in PTP sized blocks (4MB) 4470 */ 4471 4472 for (/* null */ ; sva < eva ; sva = blkendva) { 4473 4474 /* determine range of block */ 4475 blkendva = x86_round_pdr(sva+1); 4476 if (blkendva > eva) 4477 blkendva = eva; 4478 4479 /* valid block? 
*/ 4480 if (!pmap_pdes_valid(sva, pdes, NULL)) 4481 continue; 4482 4483 pte = &ptes[pl1_i(sva)]; 4484 for (/* null */; sva < blkendva ; sva += PAGE_SIZE, pte++) { 4485 if (!pmap_valid_entry(*pte)) 4486 continue; 4487 printf("va %#lx -> pa %#lx (pte=%#lx)\n", 4488 sva, (unsigned long)*pte, 4489 (unsigned long)pmap_pte2pa(*pte)); 4490 } 4491 } 4492 pmap_unmap_ptes(pmap, pmap2); 4493 kpreempt_enable(); 4494 } 4495 #endif 4496 4497 /* 4498 * pmap_tlb_shootdown: invalidate pages on all CPUs using pmap 'pm' 4499 * 4500 * => always invalidates locally before returning 4501 * => returns before remote CPUs have invalidated 4502 * => must be called with preemption disabled 4503 */ 4504 4505 void 4506 pmap_tlb_shootdown(struct pmap *pm, vaddr_t sva, vaddr_t eva, pt_entry_t pte) 4507 { 4508 #ifdef MULTIPROCESSOR 4509 extern bool x86_mp_online; 4510 struct cpu_info *ci; 4511 struct pmap_mbox *mb, *selfmb; 4512 CPU_INFO_ITERATOR cii; 4513 uintptr_t head; 4514 u_int count; 4515 int s; 4516 #endif /* MULTIPROCESSOR */ 4517 struct cpu_info *self; 4518 bool kernel; 4519 4520 KASSERT(eva == 0 || eva >= sva); 4521 KASSERT(kpreempt_disabled()); 4522 4523 if (pte & PG_PS) 4524 sva &= PG_LGFRAME; 4525 pte &= PG_G; 4526 self = curcpu(); 4527 4528 if (sva == (vaddr_t)-1LL) { 4529 kernel = true; 4530 } else { 4531 if (eva == 0) 4532 eva = sva + PAGE_SIZE; 4533 kernel = sva >= VM_MAXUSER_ADDRESS; 4534 KASSERT(kernel == (eva > VM_MAXUSER_ADDRESS)); 4535 } 4536 4537 /* 4538 * if tearing down the pmap, do nothing. we'll flush later 4539 * when we're ready to recycle/destroy it. 4540 */ 4541 if (__predict_false(curlwp->l_md.md_gc_pmap == pm)) { 4542 return; 4543 } 4544 4545 /* 4546 * If the range is larger than 32 pages, then invalidate 4547 * everything. 4548 */ 4549 if (sva != (vaddr_t)-1LL && eva - sva > (32 * PAGE_SIZE)) { 4550 sva = (vaddr_t)-1LL; 4551 eva = sva; 4552 } 4553 4554 #ifdef MULTIPROCESSOR 4555 if (ncpu > 1 && x86_mp_online) { 4556 selfmb = &self->ci_pmap_cpu->pc_mbox; 4557 4558 /* 4559 * If the CPUs have no notion of global pages then 4560 * reload of %cr3 is sufficient. 4561 */ 4562 if (pte != 0 && (cpu_feature & CPUID_PGE) == 0) 4563 pte = 0; 4564 4565 if (pm == pmap_kernel()) { 4566 /* 4567 * Mapped on all CPUs: use the broadcast mechanism. 4568 * Once we have the lock, increment the counter. 4569 */ 4570 s = splvm(); 4571 mb = &pmap_mbox; 4572 count = SPINLOCK_BACKOFF_MIN; 4573 do { 4574 if ((head = mb->mb_head) != mb->mb_tail) { 4575 splx(s); 4576 while ((head = mb->mb_head) != 4577 mb->mb_tail) 4578 SPINLOCK_BACKOFF(count); 4579 s = splvm(); 4580 } 4581 } while (atomic_cas_ulong( 4582 (volatile u_long *)&mb->mb_head, 4583 head, head + ncpu - 1) != head); 4584 4585 /* 4586 * Once underway we must stay at IPL_VM until the 4587 * IPI is dispatched. Otherwise interrupt handlers 4588 * on this CPU can deadlock against us. 4589 */ 4590 pmap_tlb_evcnt.ev_count++; 4591 mb->mb_pointer = self; 4592 mb->mb_addr1 = sva; 4593 mb->mb_addr2 = eva; 4594 mb->mb_global = pte; 4595 x86_ipi(LAPIC_TLB_BCAST_VECTOR, LAPIC_DEST_ALLEXCL, 4596 LAPIC_DLMODE_FIXED); 4597 self->ci_need_tlbwait = 1; 4598 splx(s); 4599 } else if ((pm->pm_cpus & ~self->ci_cpumask) != 0 || 4600 (kernel && (pm->pm_kernel_cpus & ~self->ci_cpumask) != 0)) { 4601 /* 4602 * We don't bother traversing the CPU list if only 4603 * used by this CPU. 4604 * 4605 * We can't do global flushes with the multicast 4606 * mechanism. 
4607 */ 4608 KASSERT(pte == 0); 4609 4610 /* 4611 * Take ownership of the shootdown mailbox on each 4612 * CPU, fill the details and fire it off. 4613 */ 4614 s = splvm(); 4615 for (CPU_INFO_FOREACH(cii, ci)) { 4616 if (ci == self || 4617 !pmap_is_active(pm, ci, kernel) || 4618 !(ci->ci_flags & CPUF_RUNNING)) 4619 continue; 4620 selfmb->mb_head++; 4621 mb = &ci->ci_pmap_cpu->pc_mbox; 4622 count = SPINLOCK_BACKOFF_MIN; 4623 while (atomic_cas_ulong( 4624 (u_long *)&mb->mb_pointer, 4625 0, (u_long)&selfmb->mb_tail) != 0) { 4626 splx(s); 4627 while (mb->mb_pointer != 0) 4628 SPINLOCK_BACKOFF(count); 4629 s = splvm(); 4630 } 4631 mb->mb_addr1 = sva; 4632 mb->mb_addr2 = eva; 4633 mb->mb_global = pte; 4634 if (x86_ipi(LAPIC_TLB_MCAST_VECTOR, 4635 ci->ci_cpuid, LAPIC_DLMODE_FIXED)) 4636 panic("pmap_tlb_shootdown: ipi failed"); 4637 } 4638 self->ci_need_tlbwait = 1; 4639 splx(s); 4640 } 4641 } 4642 #endif /* MULTIPROCESSOR */ 4643 4644 /* Update the current CPU before waiting for others. */ 4645 if (!pmap_is_active(pm, self, kernel)) 4646 return; 4647 4648 if (sva == (vaddr_t)-1LL) { 4649 if (pte != 0) 4650 tlbflushg(); 4651 else 4652 tlbflush(); 4653 } else { 4654 do { 4655 pmap_update_pg(sva); 4656 sva += PAGE_SIZE; 4657 } while (sva < eva); 4658 } 4659 } 4660 4661 /* 4662 * pmap_tlb_shootwait: wait for pending TLB shootdowns to complete 4663 * 4664 * => only waits for operations generated by the current CPU 4665 * => must be called with preemption disabled 4666 */ 4667 4668 void 4669 pmap_tlb_shootwait(void) 4670 { 4671 struct cpu_info *self; 4672 struct pmap_mbox *mb; 4673 4674 KASSERT(kpreempt_disabled()); 4675 4676 /* 4677 * Anything to do? XXX Really we want to avoid touching the cache 4678 * lines of the two mailboxes, but the processor may read ahead. 4679 */ 4680 self = curcpu(); 4681 if (!self->ci_need_tlbwait) 4682 return; 4683 self->ci_need_tlbwait = 0; 4684 4685 /* If we own the global mailbox, wait for it to drain. */ 4686 mb = &pmap_mbox; 4687 while (mb->mb_pointer == self && mb->mb_head != mb->mb_tail) 4688 x86_pause(); 4689 4690 /* If we own other CPU's mailboxes, wait for them to drain. */ 4691 mb = &self->ci_pmap_cpu->pc_mbox; 4692 KASSERT(mb->mb_pointer != &mb->mb_tail); 4693 while (mb->mb_head != mb->mb_tail) 4694 x86_pause(); 4695 } 4696 4697 /* 4698 * pmap_update: process deferred invalidations 4699 */ 4700 4701 void 4702 pmap_update(struct pmap *pmap) 4703 { 4704 struct vm_page *ptp, *empty_ptps; 4705 struct pmap_page *pp; 4706 lwp_t *l; 4707 4708 /* 4709 * if we have torn down this pmap, invalidate non-global TLB 4710 * entries on any processors using it. 4711 */ 4712 l = curlwp; 4713 if (__predict_false(l->l_md.md_gc_pmap == pmap)) { 4714 l->l_md.md_gc_pmap = NULL; 4715 KPREEMPT_DISABLE(l); 4716 pmap_tlb_shootdown(pmap, -1, -1, 0); 4717 KPREEMPT_ENABLE(l); 4718 } 4719 4720 /* 4721 * wait for tlb shootdowns to complete before returning control 4722 * to the caller. 4723 */ 4724 kpreempt_disable(); 4725 pmap_tlb_shootwait(); 4726 kpreempt_enable(); 4727 4728 /* 4729 * now that shootdowns are complete, process deferred frees, 4730 * but not from interrupt context. 
4731 */ 4732 if (l->l_md.md_gc_ptp != NULL) { 4733 if (cpu_intr_p() || (l->l_pflag & LP_INTR) != 0) { 4734 return; 4735 } 4736 4737 empty_ptps = l->l_md.md_gc_ptp; 4738 l->l_md.md_gc_ptp = NULL; 4739 4740 while ((ptp = empty_ptps) != NULL) { 4741 ptp->flags |= PG_ZERO; 4742 pp = VM_PAGE_TO_PP(ptp); 4743 empty_ptps = pp->pp_link; 4744 LIST_INIT(&pp->pp_head.pvh_list); 4745 uvm_pagefree(ptp); 4746 } 4747 } 4748 } 4749 4750 #if PTP_LEVELS > 4 4751 #error "Unsupported number of page table mappings" 4752 #endif 4753 4754 paddr_t 4755 pmap_init_tmp_pgtbl(paddr_t pg) 4756 { 4757 static bool maps_loaded; 4758 static const paddr_t x86_tmp_pml_paddr[] = { 4759 4 * PAGE_SIZE, 4760 5 * PAGE_SIZE, 4761 6 * PAGE_SIZE, 4762 7 * PAGE_SIZE 4763 }; 4764 static vaddr_t x86_tmp_pml_vaddr[] = { 0, 0, 0, 0 }; 4765 4766 pd_entry_t *tmp_pml, *kernel_pml; 4767 4768 int level; 4769 4770 if (!maps_loaded) { 4771 for (level = 0; level < PTP_LEVELS; ++level) { 4772 x86_tmp_pml_vaddr[level] = 4773 uvm_km_alloc(kernel_map, PAGE_SIZE, 0, 4774 UVM_KMF_VAONLY); 4775 4776 if (x86_tmp_pml_vaddr[level] == 0) 4777 panic("mapping of real mode PML failed\n"); 4778 pmap_kenter_pa(x86_tmp_pml_vaddr[level], 4779 x86_tmp_pml_paddr[level], 4780 VM_PROT_READ | VM_PROT_WRITE); 4781 pmap_update(pmap_kernel()); 4782 } 4783 maps_loaded = true; 4784 } 4785 4786 /* Zero levels 1-3 */ 4787 for (level = 0; level < PTP_LEVELS - 1; ++level) { 4788 tmp_pml = (void *)x86_tmp_pml_vaddr[level]; 4789 memset(tmp_pml, 0, PAGE_SIZE); 4790 } 4791 4792 /* Copy PML4 */ 4793 kernel_pml = pmap_kernel()->pm_pdir; 4794 tmp_pml = (void *)x86_tmp_pml_vaddr[PTP_LEVELS - 1]; 4795 memcpy(tmp_pml, kernel_pml, PAGE_SIZE); 4796 4797 /* Hook our own level 3 in */ 4798 tmp_pml[pl_i(pg, PTP_LEVELS)] = 4799 (x86_tmp_pml_paddr[PTP_LEVELS - 2] & PG_FRAME) | PG_RW | PG_V; 4800 4801 for (level = PTP_LEVELS - 1; level > 0; --level) { 4802 tmp_pml = (void *)x86_tmp_pml_vaddr[level]; 4803 4804 tmp_pml[pl_i(pg, level + 1)] = 4805 (x86_tmp_pml_paddr[level - 1] & PG_FRAME) | PG_RW | PG_V; 4806 } 4807 4808 tmp_pml = (void *)x86_tmp_pml_vaddr[0]; 4809 tmp_pml[pl_i(pg, 1)] = (pg & PG_FRAME) | PG_RW | PG_V; 4810 4811 return x86_tmp_pml_paddr[PTP_LEVELS - 1]; 4812 } 4813
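
/*
 * Illustrative sketch of the lockless PTE update pattern used by
 * pmap_write_protect() and pmap_enter() above: re-read the PTE and
 * retry the compare-and-swap until it succeeds, then shoot down the
 * TLB only if the old entry could still be cached.  The helper name
 * (pte_clear_bits) and its exact flush policy are illustrative, not
 * an interface provided by this file; the caller is assumed to have
 * preemption disabled and the pmap's PTEs mapped via pmap_map_ptes(),
 * exactly as the real functions do.
 *
 *	static void
 *	pte_clear_bits(struct pmap *pmap, pt_entry_t *ptep, vaddr_t va,
 *	    pt_entry_t clearbits)
 *	{
 *		pt_entry_t opte, npte;
 *
 *		do {
 *			opte = *ptep;
 *			if (!pmap_valid_entry(opte))
 *				return;
 *			npte = opte & ~clearbits;
 *		} while (pmap_pte_cas(ptep, opte, npte) != opte);
 *
 *		if ((opte & PG_M) != 0)
 *			pmap_tlb_shootdown(pmap, va, 0, opte);
 *	}
 */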
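
/*
 * Illustrative sketch of how a caller drives pmap_enter().  The flags
 * argument carries the faulting access type (VM_PROT_* bits, which
 * pre-set PG_U/PG_M above) together with PMAP_* modifiers; with
 * PMAP_CANFAIL the call returns ENOMEM instead of panicking when pv
 * entries or a PTP cannot be allocated.  The helper name
 * (enter_one_mapping) is illustrative only.
 *
 *	static int
 *	enter_one_mapping(struct pmap *pmap, vaddr_t va, paddr_t pa,
 *	    vm_prot_t prot)
 *	{
 *		int error;
 *
 *		error = pmap_enter(pmap, va, pa, prot,
 *		    prot | PMAP_CANFAIL);
 *		if (error != 0)
 *			return error;
 *		pmap_update(pmap);
 *		return 0;
 *	}
 *
 * On ENOMEM the caller is expected to wait for memory and retry rather
 * than panic.
 */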
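
/*
 * Worked example for the pmap_growkernel() sizing loop above.  For
 * each level the number of new PTPs is
 *
 *	needed_kptp[i] = pl_i_roundup(maxkvaddr, i + 1) -
 *	    pl_i_roundup(VM_MIN_KERNEL_ADDRESS, i + 1) - nkptp[i];
 *
 * Assuming the usual amd64 constants (4 levels, an L2 entry mapping
 * NBPD_L2 = 2MB and an L3 entry mapping 1GB), growing the kernel VA
 * range by 1GB asks pmap_alloc_level() for 512 new level-1 PTPs
 * (512 new L2 entries), one new level-2 PTP (one new L3 entry), and
 * normally no new top-level entry, in which case the "update all
 * pmaps" pass and the PDP cache invalidation are skipped.
 */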
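
/*
 * Illustrative sketch of the shootdown contract documented above:
 * pmap_tlb_shootdown() invalidates the local TLB immediately but only
 * posts work to remote CPUs, so a caller that must not let the old
 * mapping be reused before every CPU has flushed it follows up with
 * pmap_tlb_shootwait() (directly, or indirectly via pmap_update()).
 * The helper name (zap_pte_and_wait) is illustrative only and assumes
 * the PTE is already mapped for this caller.
 *
 *	static void
 *	zap_pte_and_wait(struct pmap *pmap, pt_entry_t *ptep, vaddr_t va)
 *	{
 *		pt_entry_t opte;
 *
 *		kpreempt_disable();
 *		opte = pmap_pte_testset(ptep, 0);
 *		if ((opte & (PG_U | PG_M)) != 0)
 *			pmap_tlb_shootdown(pmap, va, 0, opte);
 *		pmap_tlb_shootwait();
 *		kpreempt_enable();
 *	}
 */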
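
/*
 * Usage sketch for pmap_init_tmp_pgtbl() above: the returned physical
 * address is that of a temporary top-level page table which keeps the
 * normal kernel mappings (copied from the live PML4) and additionally
 * maps the page 'pg' at virtual address 'pg' through a private chain
 * of lower-level tables held in low physical memory.  The variable
 * names below (trampoline_pa, tmp_cr3) are illustrative only.
 *
 *	paddr_t trampoline_pa, tmp_cr3;
 *
 *	tmp_cr3 = pmap_init_tmp_pgtbl(trampoline_pa);
 *
 * Code that briefly runs with tmp_cr3 loaded (for example a low-memory
 * trampoline) can then execute out of the 1:1 mapped page while still
 * seeing the kernel's own mappings.
 */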