1 /* $NetBSD: pmap.c,v 1.124 2011/06/18 21:18:20 rmind Exp $ */ 2 3 /*- 4 * Copyright (c) 2008, 2010 The NetBSD Foundation, Inc. 5 * All rights reserved. 6 * 7 * This code is derived from software contributed to The NetBSD Foundation 8 * by Andrew Doran. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 29 * POSSIBILITY OF SUCH DAMAGE. 30 */ 31 32 /* 33 * Copyright (c) 2007 Manuel Bouyer. 34 * 35 * Redistribution and use in source and binary forms, with or without 36 * modification, are permitted provided that the following conditions 37 * are met: 38 * 1. Redistributions of source code must retain the above copyright 39 * notice, this list of conditions and the following disclaimer. 40 * 2. Redistributions in binary form must reproduce the above copyright 41 * notice, this list of conditions and the following disclaimer in the 42 * documentation and/or other materials provided with the distribution. 43 * 44 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 45 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 46 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 47 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 48 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 49 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 50 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 51 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 52 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 53 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 54 * 55 */ 56 57 /* 58 * Copyright (c) 2006 Mathieu Ropert <mro@adviseo.fr> 59 * 60 * Permission to use, copy, modify, and distribute this software for any 61 * purpose with or without fee is hereby granted, provided that the above 62 * copyright notice and this permission notice appear in all copies. 63 * 64 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 65 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 66 * MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 67 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 68 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 69 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 70 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 71 */ 72 73 /* 74 * Copyright (c) 1997 Charles D. Cranor and Washington University. 75 * All rights reserved. 76 * 77 * Redistribution and use in source and binary forms, with or without 78 * modification, are permitted provided that the following conditions 79 * are met: 80 * 1. Redistributions of source code must retain the above copyright 81 * notice, this list of conditions and the following disclaimer. 82 * 2. Redistributions in binary form must reproduce the above copyright 83 * notice, this list of conditions and the following disclaimer in the 84 * documentation and/or other materials provided with the distribution. 85 * 86 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 87 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 88 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 89 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 90 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 91 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 92 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 93 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 94 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 95 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 96 */ 97 98 /* 99 * Copyright 2001 (c) Wasabi Systems, Inc. 100 * All rights reserved. 101 * 102 * Written by Frank van der Linden for Wasabi Systems, Inc. 103 * 104 * Redistribution and use in source and binary forms, with or without 105 * modification, are permitted provided that the following conditions 106 * are met: 107 * 1. Redistributions of source code must retain the above copyright 108 * notice, this list of conditions and the following disclaimer. 109 * 2. Redistributions in binary form must reproduce the above copyright 110 * notice, this list of conditions and the following disclaimer in the 111 * documentation and/or other materials provided with the distribution. 112 * 3. All advertising materials mentioning features or use of this software 113 * must display the following acknowledgement: 114 * This product includes software developed for the NetBSD Project by 115 * Wasabi Systems, Inc. 116 * 4. The name of Wasabi Systems, Inc. may not be used to endorse 117 * or promote products derived from this software without specific prior 118 * written permission. 119 * 120 * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND 121 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 122 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 123 * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL WASABI SYSTEMS, INC 124 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 125 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 126 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 127 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 128 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 129 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 130 * POSSIBILITY OF SUCH DAMAGE. 131 */ 132 133 /* 134 * This is the i386 pmap modified and generalized to support x86-64 135 * as well. The idea is to hide the upper N levels of the page tables 136 * inside pmap_get_ptp, pmap_free_ptp and pmap_growkernel. The rest 137 * is mostly untouched, except that it uses some more generalized 138 * macros and interfaces. 139 * 140 * This pmap has been tested on the i386 as well, and it can be easily 141 * adapted to PAE. 142 * 143 * fvdl@wasabisystems.com 18-Jun-2001 144 */ 145 146 /* 147 * pmap.c: i386 pmap module rewrite 148 * Chuck Cranor <chuck@netbsd> 149 * 11-Aug-97 150 * 151 * history of this pmap module: in addition to my own input, i used 152 * the following references for this rewrite of the i386 pmap: 153 * 154 * [1] the NetBSD i386 pmap. this pmap appears to be based on the 155 * BSD hp300 pmap done by Mike Hibler at University of Utah. 156 * it was then ported to the i386 by William Jolitz of UUNET 157 * Technologies, Inc. Then Charles M. Hannum of the NetBSD 158 * project fixed some bugs and provided some speed ups. 159 * 160 * [2] the FreeBSD i386 pmap. this pmap seems to be the 161 * Hibler/Jolitz pmap, as modified for FreeBSD by John S. Dyson 162 * and David Greenman. 163 * 164 * [3] the Mach pmap. this pmap, from CMU, seems to have migrated 165 * between several processors. the VAX version was done by 166 * Avadis Tevanian, Jr., and Michael Wayne Young. the i386 167 * version was done by Lance Berc, Mike Kupfer, Bob Baron, 168 * David Golub, and Richard Draves. the alpha version was 169 * done by Alessandro Forin (CMU/Mach) and Chris Demetriou 170 * (NetBSD/alpha). 171 */ 172 173 #include <sys/cdefs.h> 174 __KERNEL_RCSID(0, "$NetBSD: pmap.c,v 1.124 2011/06/18 21:18:20 rmind Exp $"); 175 176 #include "opt_user_ldt.h" 177 #include "opt_lockdebug.h" 178 #include "opt_multiprocessor.h" 179 #include "opt_xen.h" 180 #if !defined(__x86_64__) 181 #include "opt_kstack_dr0.h" 182 #endif /* !defined(__x86_64__) */ 183 184 #include <sys/param.h> 185 #include <sys/systm.h> 186 #include <sys/proc.h> 187 #include <sys/pool.h> 188 #include <sys/kernel.h> 189 #include <sys/atomic.h> 190 #include <sys/cpu.h> 191 #include <sys/intr.h> 192 #include <sys/xcall.h> 193 194 #include <uvm/uvm.h> 195 196 #include <dev/isa/isareg.h> 197 198 #include <machine/specialreg.h> 199 #include <machine/gdt.h> 200 #include <machine/isa_machdep.h> 201 #include <machine/cpuvar.h> 202 203 #include <x86/pmap.h> 204 #include <x86/pmap_pv.h> 205 206 #include <x86/i82489reg.h> 207 #include <x86/i82489var.h> 208 209 #ifdef XEN 210 #include <xen/xen3-public/xen.h> 211 #include <xen/hypervisor.h> 212 #endif 213 214 /* flag to be used for kernel mappings: PG_u on Xen/amd64, 0 otherwise */ 215 #if defined(XEN) && defined(__x86_64__) 216 #define PG_k PG_u 217 #else 218 #define PG_k 0 219 #endif 220 221 /* 222 * general info: 223 * 224 * - for an explanation of how the i386 MMU hardware works see 225 * the comments in <machine/pte.h>. 
226 * 227 * - for an explanation of the general memory structure used by 228 * this pmap (including the recursive mapping), see the comments 229 * in <machine/pmap.h>. 230 * 231 * this file contains the code for the "pmap module." the module's 232 * job is to manage the hardware's virtual to physical address mappings. 233 * note that there are two levels of mapping in the VM system: 234 * 235 * [1] the upper layer of the VM system uses vm_map's and vm_map_entry's 236 * to map ranges of virtual address space to objects/files. for 237 * example, the vm_map may say: "map VA 0x1000 to 0x22000 read-only 238 * to the file /bin/ls starting at offset zero." note that 239 * the upper layer mapping is not concerned with how individual 240 * vm_pages are mapped. 241 * 242 * [2] the lower layer of the VM system (the pmap) maintains the mappings 243 * from virtual addresses. it is concerned with which vm_page is 244 * mapped where. for example, when you run /bin/ls and start 245 * at page 0x1000 the fault routine may lookup the correct page 246 * of the /bin/ls file and then ask the pmap layer to establish 247 * a mapping for it. 248 * 249 * note that information in the lower layer of the VM system can be 250 * thrown away since it can easily be reconstructed from the info 251 * in the upper layer. 252 * 253 * data structures we use include: 254 * 255 * - struct pmap: describes the address space of one thread 256 * - struct pv_entry: describes one <PMAP,VA> mapping of a PA 257 * - struct pv_head: there is one pv_head per managed page of 258 * physical memory. the pv_head points to a list of pv_entry 259 * structures which describe all the <PMAP,VA> pairs that this 260 * page is mapped in. this is critical for page based operations 261 * such as pmap_page_protect() [change protection on _all_ mappings 262 * of a page] 263 */ 264 265 /* 266 * memory allocation 267 * 268 * - there are three data structures that we must dynamically allocate: 269 * 270 * [A] new process' page directory page (PDP) 271 * - plan 1: done at pmap_create() we use 272 * uvm_km_alloc(kernel_map, PAGE_SIZE) [fka kmem_alloc] to do this 273 * allocation. 274 * 275 * if we are low in free physical memory then we sleep in 276 * uvm_km_alloc -- in this case this is ok since we are creating 277 * a new pmap and should not be holding any locks. 278 * 279 * if the kernel is totally out of virtual space 280 * (i.e. uvm_km_alloc returns NULL), then we panic. 281 * 282 * [B] new page tables pages (PTP) 283 * - call uvm_pagealloc() 284 * => success: zero page, add to pm_pdir 285 * => failure: we are out of free vm_pages, let pmap_enter() 286 * tell UVM about it. 287 * 288 * note: for kernel PTPs, we start with NKPTP of them. as we map 289 * kernel memory (at uvm_map time) we check to see if we've grown 290 * the kernel pmap. if so, we call the optional function 291 * pmap_growkernel() to grow the kernel PTPs in advance. 292 * 293 * [C] pv_entry structures 294 */ 295 296 /* 297 * locking 298 * 299 * we have the following locks that we must contend with: 300 * 301 * mutexes: 302 * 303 * - pmap lock (per pmap, part of uvm_object) 304 * this lock protects the fields in the pmap structure including 305 * the non-kernel PDEs in the PDP, and the PTEs. it also locks 306 * in the alternate PTE space (since that is determined by the 307 * entry in the PDP). 308 * 309 * - pvh_lock (per pv_head) 310 * this lock protects the pv_entry list which is chained off the 311 * pv_head structure for a specific managed PA. 
it is locked 312 * when traversing the list (e.g. adding/removing mappings, 313 * syncing R/M bits, etc.) 314 * 315 * - pmaps_lock 316 * this lock protects the list of active pmaps (headed by "pmaps"). 317 * we lock it when adding or removing pmaps from this list. 318 */ 319 320 const vaddr_t ptp_masks[] = PTP_MASK_INITIALIZER; 321 const int ptp_shifts[] = PTP_SHIFT_INITIALIZER; 322 const long nkptpmax[] = NKPTPMAX_INITIALIZER; 323 const long nbpd[] = NBPD_INITIALIZER; 324 pd_entry_t * const normal_pdes[] = PDES_INITIALIZER; 325 326 long nkptp[] = NKPTP_INITIALIZER; 327 328 static kmutex_t pmaps_lock; 329 330 static vaddr_t pmap_maxkvaddr; 331 332 /* 333 * XXX kludge: dummy locking to make KASSERTs in uvm_page.c comfortable. 334 * actual locking is done by pm_lock. 335 */ 336 #if defined(DIAGNOSTIC) 337 #define PMAP_SUBOBJ_LOCK(pm, idx) \ 338 KASSERT(mutex_owned((pm)->pm_lock)); \ 339 if ((idx) != 0) \ 340 mutex_enter((pm)->pm_obj[(idx)].vmobjlock) 341 #define PMAP_SUBOBJ_UNLOCK(pm, idx) \ 342 KASSERT(mutex_owned((pm)->pm_lock)); \ 343 if ((idx) != 0) \ 344 mutex_exit((pm)->pm_obj[(idx)].vmobjlock) 345 #else /* defined(DIAGNOSTIC) */ 346 #define PMAP_SUBOBJ_LOCK(pm, idx) /* nothing */ 347 #define PMAP_SUBOBJ_UNLOCK(pm, idx) /* nothing */ 348 #endif /* defined(DIAGNOSTIC) */ 349 350 /* 351 * Misc. event counters. 352 */ 353 struct evcnt pmap_iobmp_evcnt; 354 struct evcnt pmap_ldt_evcnt; 355 356 /* 357 * PAT 358 */ 359 #define PATENTRY(n, type) (type << ((n) * 8)) 360 #define PAT_UC 0x0ULL 361 #define PAT_WC 0x1ULL 362 #define PAT_WT 0x4ULL 363 #define PAT_WP 0x5ULL 364 #define PAT_WB 0x6ULL 365 #define PAT_UCMINUS 0x7ULL 366 367 static bool cpu_pat_enabled = false; 368 369 /* 370 * global data structures 371 */ 372 373 static struct pmap kernel_pmap_store; /* the kernel's pmap (proc0) */ 374 struct pmap *const kernel_pmap_ptr = &kernel_pmap_store; 375 376 /* 377 * pmap_pg_g: if our processor supports PG_G in the PTE then we 378 * set pmap_pg_g to PG_G (otherwise it is zero). 379 */ 380 381 int pmap_pg_g = 0; 382 383 /* 384 * pmap_largepages: if our processor supports PG_PS and we are 385 * using it, this is set to true. 386 */ 387 388 int pmap_largepages; 389 390 /* 391 * i386 physical memory comes in a big contig chunk with a small 392 * hole toward the front of it... the following two paddr_t's 393 * (shared with machdep.c) describe the physical address space 394 * of this machine. 
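 *
 * illustrative sketch (comment only, nothing here is compiled): before
 * the remaining range is handed to UVM, bootstrap code can "steal" a
 * physical page simply by bumping avail_start, which is what
 * pmap_bootstrap() below does for pages such as the IDT:
 *
 *	pa = avail_start;
 *	avail_start += PAGE_SIZE;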
395 */ 396 paddr_t avail_start; /* PA of first available physical page */ 397 paddr_t avail_end; /* PA of last available physical page */ 398 399 #ifdef XEN 400 #ifdef __x86_64__ 401 /* Dummy PGD for user cr3, used between pmap_deactivate() and pmap_activate() */ 402 static paddr_t xen_dummy_user_pgd; 403 #endif /* __x86_64__ */ 404 paddr_t pmap_pa_start; /* PA of first physical page for this domain */ 405 paddr_t pmap_pa_end; /* PA of last physical page for this domain */ 406 #endif /* XEN */ 407 408 #define VM_PAGE_TO_PP(pg) (&(pg)->mdpage.mp_pp) 409 410 #define PV_HASH_SIZE 32768 411 #define PV_HASH_LOCK_CNT 32 412 413 struct pv_hash_lock { 414 kmutex_t lock; 415 } __aligned(CACHE_LINE_SIZE) pv_hash_locks[PV_HASH_LOCK_CNT] 416 __aligned(CACHE_LINE_SIZE); 417 418 struct pv_hash_head { 419 SLIST_HEAD(, pv_entry) hh_list; 420 } pv_hash_heads[PV_HASH_SIZE]; 421 422 static u_int 423 pvhash_hash(struct vm_page *ptp, vaddr_t va) 424 { 425 426 return (uintptr_t)ptp / sizeof(*ptp) + (va >> PAGE_SHIFT); 427 } 428 429 static struct pv_hash_head * 430 pvhash_head(u_int hash) 431 { 432 433 return &pv_hash_heads[hash % PV_HASH_SIZE]; 434 } 435 436 static kmutex_t * 437 pvhash_lock(u_int hash) 438 { 439 440 return &pv_hash_locks[hash % PV_HASH_LOCK_CNT].lock; 441 } 442 443 static struct pv_entry * 444 pvhash_remove(struct pv_hash_head *hh, struct vm_page *ptp, vaddr_t va) 445 { 446 struct pv_entry *pve; 447 struct pv_entry *prev; 448 449 prev = NULL; 450 SLIST_FOREACH(pve, &hh->hh_list, pve_hash) { 451 if (pve->pve_pte.pte_ptp == ptp && 452 pve->pve_pte.pte_va == va) { 453 if (prev != NULL) { 454 SLIST_REMOVE_AFTER(prev, pve_hash); 455 } else { 456 SLIST_REMOVE_HEAD(&hh->hh_list, pve_hash); 457 } 458 break; 459 } 460 prev = pve; 461 } 462 return pve; 463 } 464 465 /* 466 * other data structures 467 */ 468 469 static pt_entry_t protection_codes[8]; /* maps MI prot to i386 prot code */ 470 static bool pmap_initialized = false; /* pmap_init done yet? */ 471 472 /* 473 * the following two vaddr_t's are used during system startup 474 * to keep track of how much of the kernel's VM space we have used. 475 * once the system is started, the management of the remaining kernel 476 * VM space is turned over to the kernel_map vm_map. 477 */ 478 479 static vaddr_t virtual_avail; /* VA of first free KVA */ 480 static vaddr_t virtual_end; /* VA of last free KVA */ 481 482 /* 483 * linked list of all non-kernel pmaps 484 */ 485 486 static struct pmap_head pmaps; 487 488 /* 489 * pool that pmap structures are allocated from 490 */ 491 492 static struct pool_cache pmap_cache; 493 494 /* 495 * pv_entry cache 496 */ 497 498 static struct pool_cache pmap_pv_cache; 499 500 /* 501 * MULTIPROCESSOR: special VA's/ PTE's are actually allocated inside a 502 * maxcpus*NPTECL array of PTE's, to avoid cache line thrashing 503 * due to false sharing. 
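 *
 * usage sketch (illustrative only): a CPU with index "id" reaches its
 * private windows by sliding the shared base pointers with the
 * PTESLEW/VASLEW macros defined below, e.g.:
 *
 *	u_int id = cpu_number();
 *	pt_entry_t *zpte = PTESLEW(zero_pte, id);
 *	char *zva = VASLEW(zerop, id);
 *
 * so that each CPU's special PTEs land in their own cache line.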
504 */ 505 506 #ifdef MULTIPROCESSOR 507 #define PTESLEW(pte, id) ((pte)+(id)*NPTECL) 508 #define VASLEW(va,id) ((va)+(id)*NPTECL*PAGE_SIZE) 509 #else 510 #define PTESLEW(pte, id) (pte) 511 #define VASLEW(va,id) (va) 512 #endif 513 514 /* 515 * special VAs and the PTEs that map them 516 */ 517 static pt_entry_t *csrc_pte, *cdst_pte, *zero_pte, *ptp_pte, *early_zero_pte; 518 static char *csrcp, *cdstp, *zerop, *ptpp, *early_zerop; 519 520 int pmap_enter_default(pmap_t, vaddr_t, paddr_t, vm_prot_t, u_int); 521 522 /* 523 * pool and cache that PDPs are allocated from 524 */ 525 526 static struct pool_cache pmap_pdp_cache; 527 int pmap_pdp_ctor(void *, void *, int); 528 void pmap_pdp_dtor(void *, void *); 529 #ifdef PAE 530 /* need to allocate items of 4 pages */ 531 void *pmap_pdp_alloc(struct pool *, int); 532 void pmap_pdp_free(struct pool *, void *); 533 static struct pool_allocator pmap_pdp_allocator = { 534 .pa_alloc = pmap_pdp_alloc, 535 .pa_free = pmap_pdp_free, 536 .pa_pagesz = PAGE_SIZE * PDP_SIZE, 537 }; 538 #endif /* PAE */ 539 540 extern vaddr_t idt_vaddr; /* we allocate IDT early */ 541 extern paddr_t idt_paddr; 542 543 #ifdef _LP64 544 extern vaddr_t lo32_vaddr; 545 extern vaddr_t lo32_paddr; 546 #endif 547 548 extern int end; 549 550 #ifdef i386 551 /* stuff to fix the pentium f00f bug */ 552 extern vaddr_t pentium_idt_vaddr; 553 #endif 554 555 556 /* 557 * local prototypes 558 */ 559 560 static struct vm_page *pmap_get_ptp(struct pmap *, vaddr_t, 561 pd_entry_t * const *); 562 static struct vm_page *pmap_find_ptp(struct pmap *, vaddr_t, paddr_t, int); 563 static void pmap_freepage(struct pmap *, struct vm_page *, int); 564 static void pmap_free_ptp(struct pmap *, struct vm_page *, 565 vaddr_t, pt_entry_t *, 566 pd_entry_t * const *); 567 static bool pmap_is_active(struct pmap *, struct cpu_info *, bool); 568 static bool pmap_remove_pte(struct pmap *, struct vm_page *, 569 pt_entry_t *, vaddr_t, 570 struct pv_entry **); 571 static void pmap_remove_ptes(struct pmap *, struct vm_page *, 572 vaddr_t, vaddr_t, vaddr_t, 573 struct pv_entry **); 574 575 static bool pmap_get_physpage(vaddr_t, int, paddr_t *); 576 static void pmap_alloc_level(pd_entry_t * const *, vaddr_t, int, 577 long *); 578 579 static bool pmap_reactivate(struct pmap *); 580 581 /* 582 * p m a p h e l p e r f u n c t i o n s 583 */ 584 585 static inline void 586 pmap_stats_update(struct pmap *pmap, int resid_diff, int wired_diff) 587 { 588 589 if (pmap == pmap_kernel()) { 590 atomic_add_long(&pmap->pm_stats.resident_count, resid_diff); 591 atomic_add_long(&pmap->pm_stats.wired_count, wired_diff); 592 } else { 593 KASSERT(mutex_owned(pmap->pm_lock)); 594 pmap->pm_stats.resident_count += resid_diff; 595 pmap->pm_stats.wired_count += wired_diff; 596 } 597 } 598 599 static inline void 600 pmap_stats_update_bypte(struct pmap *pmap, pt_entry_t npte, pt_entry_t opte) 601 { 602 int resid_diff = ((npte & PG_V) ? 1 : 0) - ((opte & PG_V) ? 1 : 0); 603 int wired_diff = ((npte & PG_W) ? 1 : 0) - ((opte & PG_W) ? 
1 : 0); 604 605 KASSERT((npte & (PG_V | PG_W)) != PG_W); 606 KASSERT((opte & (PG_V | PG_W)) != PG_W); 607 608 pmap_stats_update(pmap, resid_diff, wired_diff); 609 } 610 611 /* 612 * ptp_to_pmap: lookup pmap by ptp 613 */ 614 615 static struct pmap * 616 ptp_to_pmap(struct vm_page *ptp) 617 { 618 struct pmap *pmap; 619 620 if (ptp == NULL) { 621 return pmap_kernel(); 622 } 623 pmap = (struct pmap *)ptp->uobject; 624 KASSERT(pmap != NULL); 625 KASSERT(&pmap->pm_obj[0] == ptp->uobject); 626 return pmap; 627 } 628 629 static inline struct pv_pte * 630 pve_to_pvpte(struct pv_entry *pve) 631 { 632 633 KASSERT((void *)&pve->pve_pte == (void *)pve); 634 return &pve->pve_pte; 635 } 636 637 static inline struct pv_entry * 638 pvpte_to_pve(struct pv_pte *pvpte) 639 { 640 struct pv_entry *pve = (void *)pvpte; 641 642 KASSERT(pve_to_pvpte(pve) == pvpte); 643 return pve; 644 } 645 646 /* 647 * pv_pte_first, pv_pte_next: PV list iterator. 648 */ 649 650 static struct pv_pte * 651 pv_pte_first(struct pmap_page *pp) 652 { 653 654 if ((pp->pp_flags & PP_EMBEDDED) != 0) { 655 return &pp->pp_pte; 656 } 657 return pve_to_pvpte(LIST_FIRST(&pp->pp_head.pvh_list)); 658 } 659 660 static struct pv_pte * 661 pv_pte_next(struct pmap_page *pp, struct pv_pte *pvpte) 662 { 663 664 KASSERT(pvpte != NULL); 665 if (pvpte == &pp->pp_pte) { 666 KASSERT((pp->pp_flags & PP_EMBEDDED) != 0); 667 return NULL; 668 } 669 KASSERT((pp->pp_flags & PP_EMBEDDED) == 0); 670 return pve_to_pvpte(LIST_NEXT(pvpte_to_pve(pvpte), pve_list)); 671 } 672 673 /* 674 * pmap_is_curpmap: is this pmap the one currently loaded [in %cr3]? 675 * of course the kernel is always loaded 676 */ 677 678 bool 679 pmap_is_curpmap(struct pmap *pmap) 680 { 681 #if defined(XEN) && defined(__x86_64__) 682 /* 683 * Only kernel pmap is physically loaded. 684 * User PGD may be active, but TLB will be flushed 685 * with HYPERVISOR_iret anyway, so let's say no 686 */ 687 return(pmap == pmap_kernel()); 688 #else /* XEN && __x86_64__*/ 689 return((pmap == pmap_kernel()) || 690 (pmap == curcpu()->ci_pmap)); 691 #endif 692 } 693 694 /* 695 * pmap_is_active: is this pmap loaded into the specified processor's %cr3? 696 */ 697 698 inline static bool 699 pmap_is_active(struct pmap *pmap, struct cpu_info *ci, bool kernel) 700 { 701 702 return (pmap == pmap_kernel() || 703 (pmap->pm_cpus & ci->ci_cpumask) != 0 || 704 (kernel && (pmap->pm_kernel_cpus & ci->ci_cpumask) != 0)); 705 } 706 707 /* 708 * Add a reference to the specified pmap. 709 */ 710 711 void 712 pmap_reference(struct pmap *pmap) 713 { 714 715 atomic_inc_uint(&pmap->pm_obj[0].uo_refs); 716 } 717 718 #ifndef XEN 719 720 /* 721 * pmap_map_ptes: map a pmap's PTEs into KVM and lock them in 722 * 723 * => we lock enough pmaps to keep things locked in 724 * => must be undone with pmap_unmap_ptes before returning 725 */ 726 727 void 728 pmap_map_ptes(struct pmap *pmap, struct pmap **pmap2, 729 pd_entry_t **ptepp, pd_entry_t * const **pdeppp) 730 { 731 struct pmap *curpmap; 732 struct cpu_info *ci; 733 uint32_t cpumask; 734 lwp_t *l; 735 736 /* The kernel's pmap is always accessible. */ 737 if (pmap == pmap_kernel()) { 738 *pmap2 = NULL; 739 *ptepp = PTE_BASE; 740 *pdeppp = normal_pdes; 741 return; 742 } 743 KASSERT(kpreempt_disabled()); 744 745 l = curlwp; 746 retry: 747 ci = curcpu(); 748 mutex_enter(pmap->pm_lock); 749 curpmap = ci->ci_pmap; 750 if (vm_map_pmap(&l->l_proc->p_vmspace->vm_map) == pmap) { 751 /* Our own pmap so just load it: easy. 
*/ 752 if (__predict_false(ci->ci_want_pmapload)) { 753 mutex_exit(pmap->pm_lock); 754 pmap_load(); 755 goto retry; 756 } 757 KASSERT(pmap == curpmap); 758 } else if (pmap == curpmap) { 759 /* 760 * Already on the CPU: make it valid. This is very 761 * often the case during exit(), when we have switched 762 * to the kernel pmap in order to destroy a user pmap. 763 */ 764 if (!pmap_reactivate(pmap)) { 765 u_int gen = uvm_emap_gen_return(); 766 tlbflush(); 767 uvm_emap_update(gen); 768 } 769 } else { 770 /* 771 * Toss current pmap from CPU, but keep ref to it. 772 * Can happen if we block during exit(). 773 */ 774 cpumask = ci->ci_cpumask; 775 atomic_and_32(&curpmap->pm_cpus, ~cpumask); 776 atomic_and_32(&curpmap->pm_kernel_cpus, ~cpumask); 777 ci->ci_pmap = pmap; 778 ci->ci_tlbstate = TLBSTATE_VALID; 779 atomic_or_32(&pmap->pm_cpus, cpumask); 780 atomic_or_32(&pmap->pm_kernel_cpus, cpumask); 781 lcr3(pmap_pdirpa(pmap, 0)); 782 } 783 pmap->pm_ncsw = l->l_ncsw; 784 *pmap2 = curpmap; 785 *ptepp = PTE_BASE; 786 *pdeppp = normal_pdes; 787 } 788 789 /* 790 * pmap_unmap_ptes: unlock the PTE mapping of "pmap" 791 */ 792 793 void 794 pmap_unmap_ptes(struct pmap *pmap, struct pmap *pmap2) 795 { 796 struct cpu_info *ci; 797 struct pmap *mypmap; 798 799 KASSERT(kpreempt_disabled()); 800 801 /* The kernel's pmap is always accessible. */ 802 if (pmap == pmap_kernel()) { 803 return; 804 } 805 806 /* 807 * We cannot tolerate context switches while mapped in. 808 * If it is our own pmap all we have to do is unlock. 809 */ 810 KASSERT(pmap->pm_ncsw == curlwp->l_ncsw); 811 mypmap = vm_map_pmap(&curproc->p_vmspace->vm_map); 812 if (pmap == mypmap) { 813 mutex_exit(pmap->pm_lock); 814 return; 815 } 816 817 /* 818 * Mark whatever's on the CPU now as lazy and unlock. 819 * If the pmap was already installed, we are done. 820 */ 821 ci = curcpu(); 822 ci->ci_tlbstate = TLBSTATE_LAZY; 823 ci->ci_want_pmapload = (mypmap != pmap_kernel()); 824 mutex_exit(pmap->pm_lock); 825 if (pmap == pmap2) { 826 return; 827 } 828 829 /* 830 * We installed another pmap on the CPU. Grab a reference to 831 * it and leave in place. Toss the evicted pmap (can block). 832 */ 833 pmap_reference(pmap); 834 pmap_destroy(pmap2); 835 } 836 837 #endif 838 839 inline static void 840 pmap_exec_account(struct pmap *pm, vaddr_t va, pt_entry_t opte, pt_entry_t npte) 841 { 842 843 #if !defined(__x86_64__) 844 if (curproc == NULL || curproc->p_vmspace == NULL || 845 pm != vm_map_pmap(&curproc->p_vmspace->vm_map)) 846 return; 847 848 if ((opte ^ npte) & PG_X) 849 pmap_update_pg(va); 850 851 /* 852 * Executability was removed on the last executable change. 853 * Reset the code segment to something conservative and 854 * let the trap handler deal with setting the right limit. 855 * We can't do that because of locking constraints on the vm map. 856 */ 857 858 if ((opte & PG_X) && (npte & PG_X) == 0 && va == pm->pm_hiexec) { 859 struct trapframe *tf = curlwp->l_md.md_regs; 860 861 tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL); 862 pm->pm_hiexec = I386_MAX_EXE_ADDR; 863 } 864 #endif /* !defined(__x86_64__) */ 865 } 866 867 #if !defined(__x86_64__) 868 /* 869 * Fixup the code segment to cover all potential executable mappings. 870 * returns 0 if no changes to the code segment were made. 
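 *
 * worked example (illustrative, inferred from the code below): if the
 * highest executable mapping in the map ends above I386_MAX_EXE_ADDR,
 * pm_hiexec is raised and the user %cs is switched to GUCODEBIG_SEL;
 * otherwise %cs stays at (or falls back to) GUCODE_SEL and 0 is
 * returned, telling the caller that no code segment change could
 * explain the fault.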
871 */ 872 873 int 874 pmap_exec_fixup(struct vm_map *map, struct trapframe *tf, struct pcb *pcb) 875 { 876 struct vm_map_entry *ent; 877 struct pmap *pm = vm_map_pmap(map); 878 vaddr_t va = 0; 879 880 vm_map_lock_read(map); 881 for (ent = (&map->header)->next; ent != &map->header; ent = ent->next) { 882 883 /* 884 * This entry has greater va than the entries before. 885 * We need to make it point to the last page, not past it. 886 */ 887 888 if (ent->protection & VM_PROT_EXECUTE) 889 va = trunc_page(ent->end) - PAGE_SIZE; 890 } 891 vm_map_unlock_read(map); 892 if (va == pm->pm_hiexec && tf->tf_cs == GSEL(GUCODEBIG_SEL, SEL_UPL)) 893 return (0); 894 895 pm->pm_hiexec = va; 896 if (pm->pm_hiexec > I386_MAX_EXE_ADDR) { 897 tf->tf_cs = GSEL(GUCODEBIG_SEL, SEL_UPL); 898 } else { 899 tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL); 900 return (0); 901 } 902 return (1); 903 } 904 #endif /* !defined(__x86_64__) */ 905 906 void 907 pat_init(struct cpu_info *ci) 908 { 909 uint64_t pat; 910 911 if (!(ci->ci_feat_val[0] & CPUID_PAT)) 912 return; 913 914 /* We change WT to WC. Leave all other entries the default values. */ 915 pat = PATENTRY(0, PAT_WB) | PATENTRY(1, PAT_WC) | 916 PATENTRY(2, PAT_UCMINUS) | PATENTRY(3, PAT_UC) | 917 PATENTRY(4, PAT_WB) | PATENTRY(5, PAT_WC) | 918 PATENTRY(6, PAT_UCMINUS) | PATENTRY(7, PAT_UC); 919 920 wrmsr(MSR_CR_PAT, pat); 921 cpu_pat_enabled = true; 922 aprint_debug_dev(ci->ci_dev, "PAT enabled\n"); 923 } 924 925 static pt_entry_t 926 pmap_pat_flags(u_int flags) 927 { 928 u_int cacheflags = (flags & PMAP_CACHE_MASK); 929 930 if (!cpu_pat_enabled) { 931 switch (cacheflags) { 932 case PMAP_NOCACHE: 933 case PMAP_NOCACHE_OVR: 934 /* results in PGC_UCMINUS on cpus which have 935 * the cpuid PAT but PAT "disabled" 936 */ 937 return PG_N; 938 default: 939 return 0; 940 } 941 } 942 943 switch (cacheflags) { 944 case PMAP_NOCACHE: 945 return PGC_UC; 946 case PMAP_WRITE_COMBINE: 947 return PGC_WC; 948 case PMAP_WRITE_BACK: 949 return PGC_WB; 950 case PMAP_NOCACHE_OVR: 951 return PGC_UCMINUS; 952 } 953 954 return 0; 955 } 956 957 /* 958 * p m a p k e n t e r f u n c t i o n s 959 * 960 * functions to quickly enter/remove pages from the kernel address 961 * space. pmap_kremove is exported to MI kernel. we make use of 962 * the recursive PTE mappings. 963 */ 964 965 /* 966 * pmap_kenter_pa: enter a kernel mapping without R/M (pv_entry) tracking 967 * 968 * => no need to lock anything, assume va is already allocated 969 * => should be faster than normal pmap enter function 970 */ 971 972 void 973 pmap_kenter_pa(vaddr_t va, paddr_t pa, vm_prot_t prot, u_int flags) 974 { 975 pt_entry_t *pte, opte, npte; 976 977 KASSERT(!(prot & ~VM_PROT_ALL)); 978 979 if (va < VM_MIN_KERNEL_ADDRESS) 980 pte = vtopte(va); 981 else 982 pte = kvtopte(va); 983 #ifdef DOM0OPS 984 if (pa < pmap_pa_start || pa >= pmap_pa_end) { 985 #ifdef DEBUG 986 printk("pmap_kenter_pa: pa 0x%" PRIx64 " for va 0x%" PRIx64 987 " outside range\n", (int64_t)pa, (int64_t)va); 988 #endif /* DEBUG */ 989 npte = pa; 990 } else 991 #endif /* DOM0OPS */ 992 npte = pmap_pa2pte(pa); 993 npte |= protection_codes[prot] | PG_k | PG_V | pmap_pg_g; 994 npte |= pmap_pat_flags(flags); 995 opte = pmap_pte_testset(pte, npte); /* zap! */ 996 #if defined(DIAGNOSTIC) 997 /* XXX For now... */ 998 if (opte & PG_PS) 999 panic("pmap_kenter_pa: PG_PS"); 1000 #endif 1001 if ((opte & (PG_V | PG_U)) == (PG_V | PG_U)) { 1002 #if defined(DIAGNOSTIC) 1003 printf("pmap_kenter_pa: mapping already present\n"); 1004 #endif 1005 /* This should not happen. 
*/ 1006 kpreempt_disable(); 1007 pmap_tlb_shootdown(pmap_kernel(), va, opte, TLBSHOOT_KENTER); 1008 kpreempt_enable(); 1009 } 1010 } 1011 1012 void 1013 pmap_emap_enter(vaddr_t va, paddr_t pa, vm_prot_t prot) 1014 { 1015 pt_entry_t *pte, opte, npte; 1016 1017 KASSERT((prot & ~VM_PROT_ALL) == 0); 1018 pte = (va < VM_MIN_KERNEL_ADDRESS) ? vtopte(va) : kvtopte(va); 1019 1020 #ifdef DOM0OPS 1021 if (pa < pmap_pa_start || pa >= pmap_pa_end) { 1022 npte = pa; 1023 } else 1024 #endif 1025 npte = pmap_pa2pte(pa); 1026 1027 npte = pmap_pa2pte(pa); 1028 npte |= protection_codes[prot] | PG_k | PG_V; 1029 opte = pmap_pte_testset(pte, npte); 1030 } 1031 1032 /* 1033 * pmap_emap_sync: perform TLB flush or pmap load, if it was deferred. 1034 */ 1035 void 1036 pmap_emap_sync(bool canload) 1037 { 1038 struct cpu_info *ci = curcpu(); 1039 struct pmap *pmap; 1040 1041 KASSERT(kpreempt_disabled()); 1042 if (__predict_true(ci->ci_want_pmapload && canload)) { 1043 /* 1044 * XXX: Hint for pmap_reactivate(), which might suggest to 1045 * not perform TLB flush, if state has not changed. 1046 */ 1047 pmap = vm_map_pmap(&curlwp->l_proc->p_vmspace->vm_map); 1048 if (__predict_false(pmap == ci->ci_pmap)) { 1049 const uint32_t cpumask = ci->ci_cpumask; 1050 atomic_and_32(&pmap->pm_cpus, ~cpumask); 1051 } 1052 pmap_load(); 1053 KASSERT(ci->ci_want_pmapload == 0); 1054 } else { 1055 tlbflush(); 1056 } 1057 1058 } 1059 1060 void 1061 pmap_emap_remove(vaddr_t sva, vsize_t len) 1062 { 1063 pt_entry_t *pte, xpte; 1064 vaddr_t va, eva = sva + len; 1065 1066 for (va = sva; va < eva; va += PAGE_SIZE) { 1067 pte = (va < VM_MIN_KERNEL_ADDRESS) ? vtopte(va) : kvtopte(va); 1068 xpte |= pmap_pte_testset(pte, 0); 1069 } 1070 } 1071 1072 __weak_alias(pmap_kenter_ma, pmap_kenter_pa); 1073 1074 #if defined(__x86_64__) 1075 /* 1076 * Change protection for a virtual address. Local for a CPU only, don't 1077 * care about TLB shootdowns. 1078 * 1079 * => must be called with preemption disabled 1080 */ 1081 void 1082 pmap_changeprot_local(vaddr_t va, vm_prot_t prot) 1083 { 1084 pt_entry_t *pte, opte, npte; 1085 1086 KASSERT(kpreempt_disabled()); 1087 1088 if (va < VM_MIN_KERNEL_ADDRESS) 1089 pte = vtopte(va); 1090 else 1091 pte = kvtopte(va); 1092 1093 npte = opte = *pte; 1094 1095 if ((prot & VM_PROT_WRITE) != 0) 1096 npte |= PG_RW; 1097 else 1098 npte &= ~PG_RW; 1099 1100 if (opte != npte) { 1101 pmap_pte_set(pte, npte); 1102 pmap_pte_flush(); 1103 invlpg(va); 1104 } 1105 } 1106 #endif /* defined(__x86_64__) */ 1107 1108 /* 1109 * pmap_kremove: remove a kernel mapping(s) without R/M (pv_entry) tracking 1110 * 1111 * => no need to lock anything 1112 * => caller must dispose of any vm_page mapped in the va range 1113 * => note: not an inline function 1114 * => we assume the va is page aligned and the len is a multiple of PAGE_SIZE 1115 * => we assume kernel only unmaps valid addresses and thus don't bother 1116 * checking the valid bit before doing TLB flushing 1117 * => must be followed by call to pmap_update() before reuse of page 1118 */ 1119 1120 void 1121 pmap_kremove(vaddr_t sva, vsize_t len) 1122 { 1123 pt_entry_t *pte, opte; 1124 vaddr_t va, eva; 1125 1126 eva = sva + len; 1127 1128 kpreempt_disable(); 1129 for (va = sva; va < eva; va += PAGE_SIZE) { 1130 if (va < VM_MIN_KERNEL_ADDRESS) 1131 pte = vtopte(va); 1132 else 1133 pte = kvtopte(va); 1134 opte = pmap_pte_testset(pte, 0); /* zap! 
*/ 1135 if ((opte & (PG_V | PG_U)) == (PG_V | PG_U)) { 1136 pmap_tlb_shootdown(pmap_kernel(), va, opte, 1137 TLBSHOOT_KREMOVE); 1138 } 1139 KASSERT((opte & PG_PS) == 0); 1140 KASSERT((opte & PG_PVLIST) == 0); 1141 } 1142 kpreempt_enable(); 1143 } 1144 1145 /* 1146 * p m a p i n i t f u n c t i o n s 1147 * 1148 * pmap_bootstrap and pmap_init are called during system startup 1149 * to init the pmap module. pmap_bootstrap() does a low level 1150 * init just to get things rolling. pmap_init() finishes the job. 1151 */ 1152 1153 /* 1154 * pmap_bootstrap: get the system in a state where it can run with VM 1155 * properly enabled (called before main()). the VM system is 1156 * fully init'd later... 1157 * 1158 * => on i386, locore.s has already enabled the MMU by allocating 1159 * a PDP for the kernel, and nkpde PTP's for the kernel. 1160 * => kva_start is the first free virtual address in kernel space 1161 */ 1162 1163 void 1164 pmap_bootstrap(vaddr_t kva_start) 1165 { 1166 struct pmap *kpm; 1167 pt_entry_t *pte; 1168 int i; 1169 vaddr_t kva; 1170 #ifndef XEN 1171 unsigned long p1i; 1172 vaddr_t kva_end; 1173 #endif 1174 1175 pt_entry_t pg_nx = (cpu_feature[2] & CPUID_NOX ? PG_NX : 0); 1176 1177 /* 1178 * set up our local static global vars that keep track of the 1179 * usage of KVM before kernel_map is set up 1180 */ 1181 1182 virtual_avail = kva_start; /* first free KVA */ 1183 virtual_end = VM_MAX_KERNEL_ADDRESS; /* last KVA */ 1184 1185 /* 1186 * set up protection_codes: we need to be able to convert from 1187 * a MI protection code (some combo of VM_PROT...) to something 1188 * we can jam into a i386 PTE. 1189 */ 1190 1191 protection_codes[VM_PROT_NONE] = pg_nx; /* --- */ 1192 protection_codes[VM_PROT_EXECUTE] = PG_RO | PG_X; /* --x */ 1193 protection_codes[VM_PROT_READ] = PG_RO | pg_nx; /* -r- */ 1194 protection_codes[VM_PROT_READ|VM_PROT_EXECUTE] = PG_RO | PG_X;/* -rx */ 1195 protection_codes[VM_PROT_WRITE] = PG_RW | pg_nx; /* w-- */ 1196 protection_codes[VM_PROT_WRITE|VM_PROT_EXECUTE] = PG_RW | PG_X;/* w-x */ 1197 protection_codes[VM_PROT_WRITE|VM_PROT_READ] = PG_RW | pg_nx; 1198 /* wr- */ 1199 protection_codes[VM_PROT_ALL] = PG_RW | PG_X; /* wrx */ 1200 1201 /* 1202 * now we init the kernel's pmap 1203 * 1204 * the kernel pmap's pm_obj is not used for much. however, in 1205 * user pmaps the pm_obj contains the list of active PTPs. 1206 * the pm_obj currently does not have a pager. it might be possible 1207 * to add a pager that would allow a process to read-only mmap its 1208 * own page tables (fast user level vtophys?). this may or may not 1209 * be useful. 1210 */ 1211 1212 kpm = pmap_kernel(); 1213 for (i = 0; i < PTP_LEVELS - 1; i++) { 1214 mutex_init(&kpm->pm_obj_lock[i], MUTEX_DEFAULT, IPL_NONE); 1215 uvm_obj_init(&kpm->pm_obj[i], NULL, false, 1); 1216 uvm_obj_setlock(&kpm->pm_obj[i], &kpm->pm_obj_lock[i]); 1217 kpm->pm_ptphint[i] = NULL; 1218 } 1219 memset(&kpm->pm_list, 0, sizeof(kpm->pm_list)); /* pm_list not used */ 1220 1221 kpm->pm_pdir = (pd_entry_t *)(PDPpaddr + KERNBASE); 1222 for (i = 0; i < PDP_SIZE; i++) 1223 kpm->pm_pdirpa[i] = PDPpaddr + PAGE_SIZE * i; 1224 1225 kpm->pm_stats.wired_count = kpm->pm_stats.resident_count = 1226 x86_btop(kva_start - VM_MIN_KERNEL_ADDRESS); 1227 1228 /* 1229 * the above is just a rough estimate and not critical to the proper 1230 * operation of the system. 1231 */ 1232 1233 #ifndef XEN 1234 /* 1235 * Begin to enable global TLB entries if they are supported. 
1236 * The G bit has no effect until the CR4_PGE bit is set in CR4, 1237 * which happens in cpu_init(), which is run on each cpu 1238 * (and happens later) 1239 */ 1240 1241 if (cpu_feature[0] & CPUID_PGE) { 1242 pmap_pg_g = PG_G; /* enable software */ 1243 1244 /* add PG_G attribute to already mapped kernel pages */ 1245 if (KERNBASE == VM_MIN_KERNEL_ADDRESS) { 1246 kva_end = virtual_avail; 1247 } else { 1248 extern vaddr_t eblob, esym; 1249 kva_end = (vaddr_t)&end; 1250 if (esym > kva_end) 1251 kva_end = esym; 1252 if (eblob > kva_end) 1253 kva_end = eblob; 1254 kva_end = roundup(kva_end, PAGE_SIZE); 1255 } 1256 for (kva = KERNBASE; kva < kva_end; kva += PAGE_SIZE) { 1257 p1i = pl1_i(kva); 1258 if (pmap_valid_entry(PTE_BASE[p1i])) 1259 PTE_BASE[p1i] |= PG_G; 1260 } 1261 } 1262 1263 /* 1264 * enable large pages if they are supported. 1265 */ 1266 1267 if (cpu_feature[0] & CPUID_PSE) { 1268 paddr_t pa; 1269 pd_entry_t *pde; 1270 extern char __data_start; 1271 1272 lcr4(rcr4() | CR4_PSE); /* enable hardware (via %cr4) */ 1273 pmap_largepages = 1; /* enable software */ 1274 1275 /* 1276 * the TLB must be flushed after enabling large pages 1277 * on Pentium CPUs, according to section 3.6.2.2 of 1278 * "Intel Architecture Software Developer's Manual, 1279 * Volume 3: System Programming". 1280 */ 1281 tlbflushg(); 1282 1283 /* 1284 * now, remap the kernel text using large pages. we 1285 * assume that the linker has properly aligned the 1286 * .data segment to a NBPD_L2 boundary. 1287 */ 1288 kva_end = rounddown((vaddr_t)&__data_start, NBPD_L1); 1289 for (pa = 0, kva = KERNBASE; kva + NBPD_L2 <= kva_end; 1290 kva += NBPD_L2, pa += NBPD_L2) { 1291 pde = &L2_BASE[pl2_i(kva)]; 1292 *pde = pa | pmap_pg_g | PG_PS | 1293 PG_KR | PG_V; /* zap! */ 1294 tlbflushg(); 1295 } 1296 #if defined(DEBUG) 1297 aprint_normal("kernel text is mapped with %" PRIuPSIZE " large " 1298 "pages and %" PRIuPSIZE " normal pages\n", 1299 howmany(kva - KERNBASE, NBPD_L2), 1300 howmany((vaddr_t)&__data_start - kva, NBPD_L1)); 1301 #endif /* defined(DEBUG) */ 1302 } 1303 #endif /* !XEN */ 1304 1305 if (VM_MIN_KERNEL_ADDRESS != KERNBASE) { 1306 /* 1307 * zero_pte is stuck at the end of mapped space for the kernel 1308 * image (disjunct from kva space). This is done so that it 1309 * can safely be used in pmap_growkernel (pmap_get_physpage), 1310 * when it's called for the first time. 1311 * XXXfvdl fix this for MULTIPROCESSOR later. 1312 */ 1313 1314 early_zerop = (void *)(KERNBASE + NKL2_KIMG_ENTRIES * NBPD_L2); 1315 early_zero_pte = PTE_BASE + pl1_i((vaddr_t)early_zerop); 1316 } 1317 1318 /* 1319 * now we allocate the "special" VAs which are used for tmp mappings 1320 * by the pmap (and other modules). we allocate the VAs by advancing 1321 * virtual_avail (note that there are no pages mapped at these VAs). 1322 * we find the PTE that maps the allocated VA via the linear PTE 1323 * mapping. 1324 */ 1325 1326 pte = PTE_BASE + pl1_i(virtual_avail); 1327 1328 #ifdef MULTIPROCESSOR 1329 /* 1330 * Waste some VA space to avoid false sharing of cache lines 1331 * for page table pages: Give each possible CPU a cache line 1332 * of PTE's (8) to play with, though we only need 4. We could 1333 * recycle some of this waste by putting the idle stacks here 1334 * as well; we could waste less space if we knew the largest 1335 * CPU ID beforehand. 
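 *
 * layout sketch (illustrative): CPU "id" owns PTE slots
 * pte[id*NPTECL .. id*NPTECL+3] and the VAs
 * virtual_avail + (id*NPTECL + {0,1,2,3})*PAGE_SIZE, used for the
 * csrc/cdst/zero/ptp windows respectively; with NPTECL == 8 the other
 * four slots in each CPU's cache line are simply left unused.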
1336 */ 1337 csrcp = (char *) virtual_avail; csrc_pte = pte; 1338 1339 cdstp = (char *) virtual_avail+PAGE_SIZE; cdst_pte = pte+1; 1340 1341 zerop = (char *) virtual_avail+PAGE_SIZE*2; zero_pte = pte+2; 1342 1343 ptpp = (char *) virtual_avail+PAGE_SIZE*3; ptp_pte = pte+3; 1344 1345 virtual_avail += PAGE_SIZE * maxcpus * NPTECL; 1346 pte += maxcpus * NPTECL; 1347 #else 1348 csrcp = (void *) virtual_avail; csrc_pte = pte; /* allocate */ 1349 virtual_avail += PAGE_SIZE; pte++; /* advance */ 1350 1351 cdstp = (void *) virtual_avail; cdst_pte = pte; 1352 virtual_avail += PAGE_SIZE; pte++; 1353 1354 zerop = (void *) virtual_avail; zero_pte = pte; 1355 virtual_avail += PAGE_SIZE; pte++; 1356 1357 ptpp = (void *) virtual_avail; ptp_pte = pte; 1358 virtual_avail += PAGE_SIZE; pte++; 1359 #endif 1360 1361 if (VM_MIN_KERNEL_ADDRESS == KERNBASE) { 1362 early_zerop = zerop; 1363 early_zero_pte = zero_pte; 1364 } 1365 1366 /* 1367 * Nothing after this point actually needs pte; 1368 */ 1369 pte = (void *)0xdeadbeef; 1370 1371 #ifdef XEN 1372 #ifdef __x86_64__ 1373 /* 1374 * We want a dummy page directory for Xen: 1375 * when deactivate a pmap, Xen will still consider it active. 1376 * So we set user PGD to this one to lift all protection on 1377 * the now inactive page tables set. 1378 */ 1379 xen_dummy_user_pgd = avail_start; 1380 avail_start += PAGE_SIZE; 1381 1382 /* Zero fill it, the less checks in Xen it requires the better */ 1383 memset((void *) (xen_dummy_user_pgd + KERNBASE), 0, PAGE_SIZE); 1384 /* Mark read-only */ 1385 HYPERVISOR_update_va_mapping(xen_dummy_user_pgd + KERNBASE, 1386 pmap_pa2pte(xen_dummy_user_pgd) | PG_u | PG_V, UVMF_INVLPG); 1387 /* Pin as L4 */ 1388 xpq_queue_pin_l4_table(xpmap_ptom_masked(xen_dummy_user_pgd)); 1389 #endif /* __x86_64__ */ 1390 idt_vaddr = virtual_avail; /* don't need pte */ 1391 idt_paddr = avail_start; /* steal a page */ 1392 /* 1393 * Xen require one more page as we can't store 1394 * GDT and LDT on the same page 1395 */ 1396 virtual_avail += 3 * PAGE_SIZE; 1397 avail_start += 3 * PAGE_SIZE; 1398 #else /* XEN */ 1399 idt_vaddr = virtual_avail; /* don't need pte */ 1400 idt_paddr = avail_start; /* steal a page */ 1401 #if defined(__x86_64__) 1402 virtual_avail += 2 * PAGE_SIZE; pte += 2; 1403 avail_start += 2 * PAGE_SIZE; 1404 #else /* defined(__x86_64__) */ 1405 virtual_avail += PAGE_SIZE; pte++; 1406 avail_start += PAGE_SIZE; 1407 /* pentium f00f bug stuff */ 1408 pentium_idt_vaddr = virtual_avail; /* don't need pte */ 1409 virtual_avail += PAGE_SIZE; pte++; 1410 #endif /* defined(__x86_64__) */ 1411 #endif /* XEN */ 1412 1413 #ifdef _LP64 1414 /* 1415 * Grab a page below 4G for things that need it (i.e. 1416 * having an initial %cr3 for the MP trampoline). 1417 */ 1418 lo32_vaddr = virtual_avail; 1419 virtual_avail += PAGE_SIZE; pte++; 1420 lo32_paddr = avail_start; 1421 avail_start += PAGE_SIZE; 1422 #endif 1423 1424 /* 1425 * now we reserve some VM for mapping pages when doing a crash dump 1426 */ 1427 1428 virtual_avail = reserve_dumppages(virtual_avail); 1429 1430 /* 1431 * init the static-global locks and global lists. 1432 * 1433 * => pventry::pvh_lock (initialized elsewhere) must also be 1434 * a spin lock, again at IPL_VM to prevent deadlock, and 1435 * again is never taken from interrupt context. 1436 */ 1437 1438 mutex_init(&pmaps_lock, MUTEX_DEFAULT, IPL_NONE); 1439 LIST_INIT(&pmaps); 1440 1441 /* 1442 * initialize caches. 
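 *
 * three caches are set up below: pmap_cache for struct pmap itself,
 * pmap_pdp_cache for page directories (with a constructor/destructor,
 * and a special allocator on PAE since a PDP spans PDP_SIZE pages),
 * and pmap_pv_cache for pv_entry structures.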
1443 */ 1444 1445 pool_cache_bootstrap(&pmap_cache, sizeof(struct pmap), 0, 0, 0, 1446 "pmappl", NULL, IPL_NONE, NULL, NULL, NULL); 1447 #ifdef PAE 1448 pool_cache_bootstrap(&pmap_pdp_cache, PAGE_SIZE * PDP_SIZE, 0, 0, 0, 1449 "pdppl", &pmap_pdp_allocator, IPL_NONE, 1450 pmap_pdp_ctor, pmap_pdp_dtor, NULL); 1451 #else /* PAE */ 1452 pool_cache_bootstrap(&pmap_pdp_cache, PAGE_SIZE, 0, 0, 0, 1453 "pdppl", NULL, IPL_NONE, pmap_pdp_ctor, pmap_pdp_dtor, NULL); 1454 #endif /* PAE */ 1455 pool_cache_bootstrap(&pmap_pv_cache, sizeof(struct pv_entry), 0, 0, 1456 PR_LARGECACHE, "pvpl", &pool_allocator_meta, IPL_NONE, NULL, 1457 NULL, NULL); 1458 1459 /* 1460 * ensure the TLB is sync'd with reality by flushing it... 1461 */ 1462 1463 tlbflushg(); 1464 1465 /* 1466 * calculate pmap_maxkvaddr from nkptp[]. 1467 */ 1468 1469 kva = VM_MIN_KERNEL_ADDRESS; 1470 for (i = PTP_LEVELS - 1; i >= 1; i--) { 1471 kva += nkptp[i] * nbpd[i]; 1472 } 1473 pmap_maxkvaddr = kva; 1474 } 1475 1476 #if defined(__x86_64__) 1477 /* 1478 * Pre-allocate PTPs for low memory, so that 1:1 mappings for various 1479 * trampoline code can be entered. 1480 */ 1481 void 1482 pmap_prealloc_lowmem_ptps(void) 1483 { 1484 int level; 1485 paddr_t newp; 1486 #ifdef XEN 1487 paddr_t pdes_pa; 1488 1489 pdes_pa = pmap_pdirpa(pmap_kernel(), 0); 1490 level = PTP_LEVELS; 1491 for (;;) { 1492 newp = avail_start; 1493 avail_start += PAGE_SIZE; 1494 HYPERVISOR_update_va_mapping ((vaddr_t)early_zerop, 1495 xpmap_ptom_masked(newp) | PG_u | PG_V | PG_RW, UVMF_INVLPG); 1496 memset(early_zerop, 0, PAGE_SIZE); 1497 /* Mark R/O before installing */ 1498 HYPERVISOR_update_va_mapping ((vaddr_t)early_zerop, 1499 xpmap_ptom_masked(newp) | PG_u | PG_V, UVMF_INVLPG); 1500 if (newp < (NKL2_KIMG_ENTRIES * NBPD_L2)) 1501 HYPERVISOR_update_va_mapping (newp + KERNBASE, 1502 xpmap_ptom_masked(newp) | PG_u | PG_V, UVMF_INVLPG); 1503 xpq_queue_pte_update ( 1504 xpmap_ptom_masked(pdes_pa) 1505 + (pl_i(0, level) * sizeof (pd_entry_t)), 1506 xpmap_ptom_masked(newp) | PG_RW | PG_u | PG_V); 1507 pmap_pte_flush(); 1508 level--; 1509 if (level <= 1) 1510 break; 1511 pdes_pa = newp; 1512 } 1513 #else /* XEN */ 1514 pd_entry_t *pdes; 1515 1516 pdes = pmap_kernel()->pm_pdir; 1517 level = PTP_LEVELS; 1518 for (;;) { 1519 newp = avail_start; 1520 avail_start += PAGE_SIZE; 1521 pmap_pte_set(early_zero_pte, (newp & PG_FRAME) | PG_V | PG_RW); 1522 pmap_pte_flush(); 1523 pmap_update_pg((vaddr_t)early_zerop); 1524 memset(early_zerop, 0, PAGE_SIZE); 1525 pdes[pl_i(0, level)] = (newp & PG_FRAME) | PG_V | PG_RW; 1526 level--; 1527 if (level <= 1) 1528 break; 1529 pdes = normal_pdes[level - 2]; 1530 } 1531 #endif /* XEN */ 1532 } 1533 #endif /* defined(__x86_64__) */ 1534 1535 /* 1536 * pmap_init: called from uvm_init, our job is to get the pmap 1537 * system ready to manage mappings... 1538 */ 1539 1540 void 1541 pmap_init(void) 1542 { 1543 int i; 1544 1545 for (i = 0; i < PV_HASH_SIZE; i++) { 1546 SLIST_INIT(&pv_hash_heads[i].hh_list); 1547 } 1548 for (i = 0; i < PV_HASH_LOCK_CNT; i++) { 1549 mutex_init(&pv_hash_locks[i].lock, MUTEX_NODEBUG, IPL_VM); 1550 } 1551 1552 pmap_tlb_init(); 1553 1554 evcnt_attach_dynamic(&pmap_iobmp_evcnt, EVCNT_TYPE_MISC, 1555 NULL, "x86", "io bitmap copy"); 1556 evcnt_attach_dynamic(&pmap_ldt_evcnt, EVCNT_TYPE_MISC, 1557 NULL, "x86", "ldt sync"); 1558 1559 /* 1560 * done: pmap module is up (and ready for business) 1561 */ 1562 1563 pmap_initialized = true; 1564 } 1565 1566 /* 1567 * pmap_cpu_init_late: perform late per-CPU initialization. 
1568 */ 1569 1570 void 1571 pmap_cpu_init_late(struct cpu_info *ci) 1572 { 1573 #ifdef PAE 1574 int ret; 1575 struct pglist pg; 1576 struct vm_page *vmap; 1577 1578 /* The BP has already its own L3 page allocated in locore.S. */ 1579 if (ci == &cpu_info_primary) 1580 return; 1581 1582 /* 1583 * Allocate a page for the per-CPU L3 PD. cr3 being 32 bits, PA musts 1584 * resides below the 4GB boundary. 1585 */ 1586 ret = uvm_pglistalloc(PAGE_SIZE, 0, 0x100000000ULL, 32, 0, &pg, 1, 0); 1587 vmap = TAILQ_FIRST(&pg); 1588 1589 if (ret != 0 || vmap == NULL) 1590 panic("%s: failed to allocate L3 pglist for CPU %d (ret %d)\n", 1591 __func__, cpu_index(ci), ret); 1592 1593 ci->ci_pae_l3_pdirpa = vmap->phys_addr; 1594 1595 ci->ci_pae_l3_pdir = (paddr_t *)uvm_km_alloc(kernel_map, PAGE_SIZE, 0, 1596 UVM_KMF_VAONLY | UVM_KMF_NOWAIT); 1597 if (ci->ci_pae_l3_pdir == NULL) 1598 panic("%s: failed to allocate L3 PD for CPU %d\n", 1599 __func__, cpu_index(ci)); 1600 1601 pmap_kenter_pa((vaddr_t)ci->ci_pae_l3_pdir, ci->ci_pae_l3_pdirpa, 1602 VM_PROT_READ | VM_PROT_WRITE, 0); 1603 1604 pmap_update(pmap_kernel()); 1605 #endif 1606 } 1607 1608 /* 1609 * p v _ e n t r y f u n c t i o n s 1610 */ 1611 1612 /* 1613 * pmap_free_pvs: free a list of pv_entrys 1614 */ 1615 1616 static void 1617 pmap_free_pvs(struct pv_entry *pve) 1618 { 1619 struct pv_entry *next; 1620 1621 for ( /* null */ ; pve != NULL ; pve = next) { 1622 next = pve->pve_next; 1623 pool_cache_put(&pmap_pv_cache, pve); 1624 } 1625 } 1626 1627 /* 1628 * main pv_entry manipulation functions: 1629 * pmap_enter_pv: enter a mapping onto a pv_head list 1630 * pmap_remove_pv: remove a mapping from a pv_head list 1631 * 1632 * NOTE: Both pmap_enter_pv and pmap_remove_pv expect the caller to lock 1633 * the pvh before calling 1634 */ 1635 1636 /* 1637 * insert_pv: a helper of pmap_enter_pv 1638 */ 1639 1640 static void 1641 insert_pv(struct pmap_page *pp, struct pv_entry *pve) 1642 { 1643 struct pv_hash_head *hh; 1644 kmutex_t *lock; 1645 u_int hash; 1646 1647 hash = pvhash_hash(pve->pve_pte.pte_ptp, pve->pve_pte.pte_va); 1648 lock = pvhash_lock(hash); 1649 hh = pvhash_head(hash); 1650 mutex_spin_enter(lock); 1651 SLIST_INSERT_HEAD(&hh->hh_list, pve, pve_hash); 1652 mutex_spin_exit(lock); 1653 1654 LIST_INSERT_HEAD(&pp->pp_head.pvh_list, pve, pve_list); 1655 } 1656 1657 /* 1658 * pmap_enter_pv: enter a mapping onto a pv_head lst 1659 * 1660 * => caller should adjust ptp's wire_count before calling 1661 */ 1662 1663 static struct pv_entry * 1664 pmap_enter_pv(struct pmap_page *pp, 1665 struct pv_entry *pve, /* preallocated pve for us to use */ 1666 struct pv_entry **sparepve, 1667 struct vm_page *ptp, 1668 vaddr_t va) 1669 { 1670 1671 KASSERT(ptp == NULL || ptp->wire_count >= 2); 1672 KASSERT(ptp == NULL || ptp->uobject != NULL); 1673 KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset); 1674 1675 if ((pp->pp_flags & PP_EMBEDDED) == 0) { 1676 if (LIST_EMPTY(&pp->pp_head.pvh_list)) { 1677 pp->pp_flags |= PP_EMBEDDED; 1678 pp->pp_pte.pte_ptp = ptp; 1679 pp->pp_pte.pte_va = va; 1680 1681 return pve; 1682 } 1683 } else { 1684 struct pv_entry *pve2; 1685 1686 pve2 = *sparepve; 1687 *sparepve = NULL; 1688 1689 pve2->pve_pte = pp->pp_pte; 1690 pp->pp_flags &= ~PP_EMBEDDED; 1691 LIST_INIT(&pp->pp_head.pvh_list); 1692 insert_pv(pp, pve2); 1693 } 1694 1695 pve->pve_pte.pte_ptp = ptp; 1696 pve->pve_pte.pte_va = va; 1697 insert_pv(pp, pve); 1698 1699 return NULL; 1700 } 1701 1702 /* 1703 * pmap_remove_pv: try to remove a mapping from a pv_list 1704 * 1705 * => caller 
should adjust ptp's wire_count and free PTP if needed 1706 * => we return the removed pve 1707 */ 1708 1709 static struct pv_entry * 1710 pmap_remove_pv(struct pmap_page *pp, struct vm_page *ptp, vaddr_t va) 1711 { 1712 struct pv_hash_head *hh; 1713 struct pv_entry *pve; 1714 kmutex_t *lock; 1715 u_int hash; 1716 1717 KASSERT(ptp == NULL || ptp->uobject != NULL); 1718 KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset); 1719 1720 if ((pp->pp_flags & PP_EMBEDDED) != 0) { 1721 KASSERT(pp->pp_pte.pte_ptp == ptp); 1722 KASSERT(pp->pp_pte.pte_va == va); 1723 1724 pp->pp_flags &= ~PP_EMBEDDED; 1725 LIST_INIT(&pp->pp_head.pvh_list); 1726 1727 return NULL; 1728 } 1729 1730 hash = pvhash_hash(ptp, va); 1731 lock = pvhash_lock(hash); 1732 hh = pvhash_head(hash); 1733 mutex_spin_enter(lock); 1734 pve = pvhash_remove(hh, ptp, va); 1735 mutex_spin_exit(lock); 1736 1737 LIST_REMOVE(pve, pve_list); 1738 1739 return pve; 1740 } 1741 1742 /* 1743 * p t p f u n c t i o n s 1744 */ 1745 1746 static inline struct vm_page * 1747 pmap_find_ptp(struct pmap *pmap, vaddr_t va, paddr_t pa, int level) 1748 { 1749 int lidx = level - 1; 1750 struct vm_page *pg; 1751 1752 KASSERT(mutex_owned(pmap->pm_lock)); 1753 1754 if (pa != (paddr_t)-1 && pmap->pm_ptphint[lidx] && 1755 pa == VM_PAGE_TO_PHYS(pmap->pm_ptphint[lidx])) { 1756 return (pmap->pm_ptphint[lidx]); 1757 } 1758 PMAP_SUBOBJ_LOCK(pmap, lidx); 1759 pg = uvm_pagelookup(&pmap->pm_obj[lidx], ptp_va2o(va, level)); 1760 PMAP_SUBOBJ_UNLOCK(pmap, lidx); 1761 1762 KASSERT(pg == NULL || pg->wire_count >= 1); 1763 return pg; 1764 } 1765 1766 static inline void 1767 pmap_freepage(struct pmap *pmap, struct vm_page *ptp, int level) 1768 { 1769 lwp_t *l; 1770 int lidx; 1771 struct uvm_object *obj; 1772 1773 KASSERT(ptp->wire_count == 1); 1774 1775 lidx = level - 1; 1776 1777 obj = &pmap->pm_obj[lidx]; 1778 pmap_stats_update(pmap, -1, 0); 1779 if (lidx != 0) 1780 mutex_enter(obj->vmobjlock); 1781 if (pmap->pm_ptphint[lidx] == ptp) 1782 pmap->pm_ptphint[lidx] = TAILQ_FIRST(&obj->memq); 1783 ptp->wire_count = 0; 1784 uvm_pagerealloc(ptp, NULL, 0); 1785 l = curlwp; 1786 KASSERT((l->l_pflag & LP_INTR) == 0); 1787 VM_PAGE_TO_PP(ptp)->pp_link = l->l_md.md_gc_ptp; 1788 l->l_md.md_gc_ptp = ptp; 1789 if (lidx != 0) 1790 mutex_exit(obj->vmobjlock); 1791 } 1792 1793 static void 1794 pmap_free_ptp(struct pmap *pmap, struct vm_page *ptp, vaddr_t va, 1795 pt_entry_t *ptes, pd_entry_t * const *pdes) 1796 { 1797 unsigned long index; 1798 int level; 1799 vaddr_t invaladdr; 1800 pd_entry_t opde; 1801 #ifdef XEN 1802 struct pmap *curpmap = vm_map_pmap(&curlwp->l_proc->p_vmspace->vm_map); 1803 #ifdef MULTIPROCESSOR 1804 vaddr_t invaladdr2; 1805 #endif 1806 #endif 1807 1808 KASSERT(pmap != pmap_kernel()); 1809 KASSERT(mutex_owned(pmap->pm_lock)); 1810 KASSERT(kpreempt_disabled()); 1811 1812 level = 1; 1813 do { 1814 index = pl_i(va, level + 1); 1815 opde = pmap_pte_testset(&pdes[level - 1][index], 0); 1816 #if defined(XEN) 1817 # if defined(__x86_64__) 1818 /* 1819 * If ptp is a L3 currently mapped in kernel space, 1820 * clear it before freeing 1821 */ 1822 if (pmap_pdirpa(pmap, 0) == curcpu()->ci_xen_current_user_pgd 1823 && level == PTP_LEVELS - 1) 1824 pmap_pte_set(&pmap_kernel()->pm_pdir[index], 0); 1825 # endif /*__x86_64__ */ 1826 invaladdr = level == 1 ? (vaddr_t)ptes : 1827 (vaddr_t)pdes[level - 2]; 1828 pmap_tlb_shootdown(curpmap, invaladdr + index * PAGE_SIZE, 1829 opde, TLBSHOOT_FREE_PTP1); 1830 # if defined(MULTIPROCESSOR) 1831 invaladdr2 = level == 1 ? 
(vaddr_t)PTE_BASE : 1832 (vaddr_t)normal_pdes[level - 2]; 1833 if (pmap != curpmap || invaladdr != invaladdr2) { 1834 pmap_tlb_shootdown(pmap, invaladdr2 + index * PAGE_SIZE, 1835 opde, TLBSHOOT_FREE_PTP2); 1836 } 1837 # endif /* MULTIPROCESSOR */ 1838 #else /* XEN */ 1839 invaladdr = level == 1 ? (vaddr_t)ptes : 1840 (vaddr_t)pdes[level - 2]; 1841 pmap_tlb_shootdown(pmap, invaladdr + index * PAGE_SIZE, 1842 opde, TLBSHOOT_FREE_PTP1); 1843 #endif /* XEN */ 1844 pmap_freepage(pmap, ptp, level); 1845 if (level < PTP_LEVELS - 1) { 1846 ptp = pmap_find_ptp(pmap, va, (paddr_t)-1, level + 1); 1847 ptp->wire_count--; 1848 if (ptp->wire_count > 1) 1849 break; 1850 } 1851 } while (++level < PTP_LEVELS); 1852 pmap_pte_flush(); 1853 } 1854 1855 /* 1856 * pmap_get_ptp: get a PTP (if there isn't one, allocate a new one) 1857 * 1858 * => pmap should NOT be pmap_kernel() 1859 * => pmap should be locked 1860 * => preemption should be disabled 1861 */ 1862 1863 static struct vm_page * 1864 pmap_get_ptp(struct pmap *pmap, vaddr_t va, pd_entry_t * const *pdes) 1865 { 1866 struct vm_page *ptp, *pptp; 1867 int i; 1868 unsigned long index; 1869 pd_entry_t *pva; 1870 paddr_t ppa, pa; 1871 struct uvm_object *obj; 1872 1873 KASSERT(pmap != pmap_kernel()); 1874 KASSERT(mutex_owned(pmap->pm_lock)); 1875 KASSERT(kpreempt_disabled()); 1876 1877 ptp = NULL; 1878 pa = (paddr_t)-1; 1879 1880 /* 1881 * Loop through all page table levels seeing if we need to 1882 * add a new page to that level. 1883 */ 1884 for (i = PTP_LEVELS; i > 1; i--) { 1885 /* 1886 * Save values from previous round. 1887 */ 1888 pptp = ptp; 1889 ppa = pa; 1890 1891 index = pl_i(va, i); 1892 pva = pdes[i - 2]; 1893 1894 if (pmap_valid_entry(pva[index])) { 1895 ppa = pmap_pte2pa(pva[index]); 1896 ptp = NULL; 1897 continue; 1898 } 1899 1900 obj = &pmap->pm_obj[i-2]; 1901 PMAP_SUBOBJ_LOCK(pmap, i - 2); 1902 ptp = uvm_pagealloc(obj, ptp_va2o(va, i - 1), NULL, 1903 UVM_PGA_USERESERVE|UVM_PGA_ZERO); 1904 PMAP_SUBOBJ_UNLOCK(pmap, i - 2); 1905 1906 if (ptp == NULL) 1907 return NULL; 1908 1909 ptp->flags &= ~PG_BUSY; /* never busy */ 1910 ptp->wire_count = 1; 1911 pmap->pm_ptphint[i - 2] = ptp; 1912 pa = VM_PAGE_TO_PHYS(ptp); 1913 pmap_pte_set(&pva[index], (pd_entry_t) 1914 (pmap_pa2pte(pa) | PG_u | PG_RW | PG_V)); 1915 #if defined(XEN) && defined(__x86_64__) 1916 /* 1917 * In Xen we must enter the mapping in kernel map too 1918 * if pmap is curmap and modifying top level (PGD) 1919 */ 1920 if(i == PTP_LEVELS && pmap != pmap_kernel()) { 1921 pmap_pte_set(&pmap_kernel()->pm_pdir[index], 1922 (pd_entry_t) (pmap_pa2pte(pa) 1923 | PG_u | PG_RW | PG_V)); 1924 } 1925 #endif /* XEN && __x86_64__ */ 1926 pmap_pte_flush(); 1927 pmap_stats_update(pmap, 1, 0); 1928 /* 1929 * If we're not in the top level, increase the 1930 * wire count of the parent page. 1931 */ 1932 if (i < PTP_LEVELS) { 1933 if (pptp == NULL) 1934 pptp = pmap_find_ptp(pmap, va, ppa, i); 1935 #ifdef DIAGNOSTIC 1936 if (pptp == NULL) 1937 panic("pde page disappeared"); 1938 #endif 1939 pptp->wire_count++; 1940 } 1941 } 1942 1943 /* 1944 * ptp is not NULL if we just allocated a new ptp. If it's 1945 * still NULL, we must look up the existing one. 
1946 */ 1947 if (ptp == NULL) { 1948 ptp = pmap_find_ptp(pmap, va, ppa, 1); 1949 #ifdef DIAGNOSTIC 1950 if (ptp == NULL) { 1951 printf("va %" PRIxVADDR " ppa %" PRIxPADDR "\n", 1952 va, ppa); 1953 panic("pmap_get_ptp: unmanaged user PTP"); 1954 } 1955 #endif 1956 } 1957 1958 pmap->pm_ptphint[0] = ptp; 1959 return(ptp); 1960 } 1961 1962 /* 1963 * p m a p l i f e c y c l e f u n c t i o n s 1964 */ 1965 1966 /* 1967 * pmap_pdp_ctor: constructor for the PDP cache. 1968 */ 1969 1970 int 1971 pmap_pdp_ctor(void *arg, void *v, int flags) 1972 { 1973 pd_entry_t *pdir = v; 1974 paddr_t pdirpa = 0; /* XXX: GCC */ 1975 vaddr_t object; 1976 int i; 1977 1978 #if !defined(XEN) || !defined(__x86_64__) 1979 int npde; 1980 #endif 1981 #ifdef XEN 1982 int s; 1983 #endif 1984 1985 /* 1986 * NOTE: The `pmap_lock' is held when the PDP is allocated. 1987 */ 1988 1989 #if defined(XEN) && defined(__x86_64__) 1990 /* fetch the physical address of the page directory. */ 1991 (void) pmap_extract(pmap_kernel(), (vaddr_t) pdir, &pdirpa); 1992 1993 /* zero init area */ 1994 memset (pdir, 0, PAGE_SIZE); /* Xen wants a clean page */ 1995 /* 1996 * this pdir will NEVER be active in kernel mode 1997 * so mark recursive entry invalid 1998 */ 1999 pdir[PDIR_SLOT_PTE] = pmap_pa2pte(pdirpa) | PG_u; 2000 /* 2001 * PDP constructed this way won't be for kernel, 2002 * hence we don't put kernel mappings on Xen. 2003 * But we need to make pmap_create() happy, so put a dummy (without 2004 * PG_V) value at the right place. 2005 */ 2006 pdir[PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1] - 1] = 2007 (pd_entry_t)-1 & PG_FRAME; 2008 #else /* XEN && __x86_64__*/ 2009 /* zero init area */ 2010 memset(pdir, 0, PDIR_SLOT_PTE * sizeof(pd_entry_t)); 2011 2012 object = (vaddr_t)v; 2013 for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) { 2014 /* fetch the physical address of the page directory. 
*/ 2015 (void) pmap_extract(pmap_kernel(), object, &pdirpa); 2016 /* put in recursive PDE to map the PTEs */ 2017 pdir[PDIR_SLOT_PTE + i] = pmap_pa2pte(pdirpa) | PG_V; 2018 #ifndef XEN 2019 pdir[PDIR_SLOT_PTE + i] |= PG_KW; 2020 #endif 2021 } 2022 2023 /* copy kernel's PDE */ 2024 npde = nkptp[PTP_LEVELS - 1]; 2025 2026 memcpy(&pdir[PDIR_SLOT_KERN], &PDP_BASE[PDIR_SLOT_KERN], 2027 npde * sizeof(pd_entry_t)); 2028 2029 /* zero the rest */ 2030 memset(&pdir[PDIR_SLOT_KERN + npde], 0, 2031 (NTOPLEVEL_PDES - (PDIR_SLOT_KERN + npde)) * sizeof(pd_entry_t)); 2032 2033 if (VM_MIN_KERNEL_ADDRESS != KERNBASE) { 2034 int idx = pl_i(KERNBASE, PTP_LEVELS); 2035 2036 pdir[idx] = PDP_BASE[idx]; 2037 } 2038 #endif /* XEN && __x86_64__*/ 2039 #ifdef XEN 2040 s = splvm(); 2041 object = (vaddr_t)v; 2042 for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) { 2043 (void) pmap_extract(pmap_kernel(), object, &pdirpa); 2044 /* remap this page RO */ 2045 pmap_kenter_pa(object, pdirpa, VM_PROT_READ, 0); 2046 pmap_update(pmap_kernel()); 2047 /* 2048 * pin as L2/L4 page, we have to do the page with the 2049 * PDIR_SLOT_PTE entries last 2050 */ 2051 #ifdef PAE 2052 if (i == l2tol3(PDIR_SLOT_PTE)) 2053 continue; 2054 #endif 2055 2056 #ifdef __x86_64__ 2057 xpq_queue_pin_l4_table(xpmap_ptom_masked(pdirpa)); 2058 #else 2059 xpq_queue_pin_l2_table(xpmap_ptom_masked(pdirpa)); 2060 #endif 2061 } 2062 #ifdef PAE 2063 object = ((vaddr_t)pdir) + PAGE_SIZE * l2tol3(PDIR_SLOT_PTE); 2064 (void)pmap_extract(pmap_kernel(), object, &pdirpa); 2065 xpq_queue_pin_l2_table(xpmap_ptom_masked(pdirpa)); 2066 #endif 2067 splx(s); 2068 #endif /* XEN */ 2069 2070 return (0); 2071 } 2072 2073 /* 2074 * pmap_pdp_dtor: destructor for the PDP cache. 2075 */ 2076 2077 void 2078 pmap_pdp_dtor(void *arg, void *v) 2079 { 2080 #ifdef XEN 2081 paddr_t pdirpa = 0; /* XXX: GCC */ 2082 vaddr_t object = (vaddr_t)v; 2083 int i; 2084 int s = splvm(); 2085 pt_entry_t *pte; 2086 2087 for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) { 2088 /* fetch the physical address of the page directory. */ 2089 (void) pmap_extract(pmap_kernel(), object, &pdirpa); 2090 /* unpin page table */ 2091 xpq_queue_unpin_table(xpmap_ptom_masked(pdirpa)); 2092 } 2093 object = (vaddr_t)v; 2094 for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) { 2095 /* Set page RW again */ 2096 pte = kvtopte(object); 2097 xpq_queue_pte_update(xpmap_ptetomach(pte), *pte | PG_RW); 2098 xpq_queue_invlpg((vaddr_t)object); 2099 } 2100 splx(s); 2101 #endif /* XEN */ 2102 } 2103 2104 #ifdef PAE 2105 2106 /* pmap_pdp_alloc: Allocate a page for the pdp memory pool. */ 2107 2108 void * 2109 pmap_pdp_alloc(struct pool *pp, int flags) 2110 { 2111 return (void *)uvm_km_alloc(kernel_map, 2112 PAGE_SIZE * PDP_SIZE, PAGE_SIZE * PDP_SIZE, 2113 ((flags & PR_WAITOK) ? 0 : UVM_KMF_NOWAIT | UVM_KMF_TRYLOCK) 2114 | UVM_KMF_WIRED); 2115 } 2116 2117 /* 2118 * pmap_pdp_free: free a PDP 2119 */ 2120 2121 void 2122 pmap_pdp_free(struct pool *pp, void *v) 2123 { 2124 uvm_km_free(kernel_map, (vaddr_t)v, PAGE_SIZE * PDP_SIZE, 2125 UVM_KMF_WIRED); 2126 } 2127 #endif /* PAE */ 2128 2129 /* 2130 * pmap_create: create a pmap 2131 * 2132 * => note: old pmap interface took a "size" args which allowed for 2133 * the creation of "software only" pmaps (not in bsd). 
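 *
 * Rough lifecycle sketch (illustrative only; the real caller is UVM's
 * vmspace handling, not shown here):
 *
 *	struct pmap *pm = pmap_create();
 *	... pmap_enter(pm, va, pa, prot, flags) as pages are faulted ...
 *	pmap_remove(pm, sva, eva);
 *	pmap_update(pm);		(commit deferred shootdowns/frees)
 *	pmap_destroy(pm);		(drop the final reference)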
2134 */ 2135 2136 struct pmap * 2137 pmap_create(void) 2138 { 2139 struct pmap *pmap; 2140 int i; 2141 2142 pmap = pool_cache_get(&pmap_cache, PR_WAITOK); 2143 2144 /* init uvm_object */ 2145 for (i = 0; i < PTP_LEVELS - 1; i++) { 2146 mutex_init(&pmap->pm_obj_lock[i], MUTEX_DEFAULT, IPL_NONE); 2147 uvm_obj_init(&pmap->pm_obj[i], NULL, false, 1); 2148 uvm_obj_setlock(&pmap->pm_obj[i], &pmap->pm_obj_lock[i]); 2149 pmap->pm_ptphint[i] = NULL; 2150 } 2151 pmap->pm_stats.wired_count = 0; 2152 /* count the PDP allocd below */ 2153 pmap->pm_stats.resident_count = PDP_SIZE; 2154 #if !defined(__x86_64__) 2155 pmap->pm_hiexec = 0; 2156 #endif /* !defined(__x86_64__) */ 2157 pmap->pm_flags = 0; 2158 pmap->pm_cpus = 0; 2159 pmap->pm_kernel_cpus = 0; 2160 pmap->pm_gc_ptp = NULL; 2161 2162 /* init the LDT */ 2163 pmap->pm_ldt = NULL; 2164 pmap->pm_ldt_len = 0; 2165 pmap->pm_ldt_sel = GSYSSEL(GLDT_SEL, SEL_KPL); 2166 2167 /* allocate PDP */ 2168 try_again: 2169 pmap->pm_pdir = pool_cache_get(&pmap_pdp_cache, PR_WAITOK); 2170 2171 mutex_enter(&pmaps_lock); 2172 2173 if (pmap->pm_pdir[PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1] - 1] == 0) { 2174 mutex_exit(&pmaps_lock); 2175 pool_cache_destruct_object(&pmap_pdp_cache, pmap->pm_pdir); 2176 goto try_again; 2177 } 2178 2179 for (i = 0; i < PDP_SIZE; i++) 2180 pmap->pm_pdirpa[i] = 2181 pmap_pte2pa(pmap->pm_pdir[PDIR_SLOT_PTE + i]); 2182 2183 LIST_INSERT_HEAD(&pmaps, pmap, pm_list); 2184 2185 mutex_exit(&pmaps_lock); 2186 2187 return (pmap); 2188 } 2189 2190 /* 2191 * pmap_free_ptps: put a list of ptps back to the freelist. 2192 */ 2193 2194 static void 2195 pmap_free_ptps(struct vm_page *empty_ptps) 2196 { 2197 struct vm_page *ptp; 2198 struct pmap_page *pp; 2199 2200 while ((ptp = empty_ptps) != NULL) { 2201 pp = VM_PAGE_TO_PP(ptp); 2202 empty_ptps = pp->pp_link; 2203 LIST_INIT(&pp->pp_head.pvh_list); 2204 uvm_pagefree(ptp); 2205 } 2206 } 2207 2208 /* 2209 * pmap_destroy: drop reference count on pmap. free pmap if 2210 * reference count goes to zero. 2211 */ 2212 2213 void 2214 pmap_destroy(struct pmap *pmap) 2215 { 2216 int i; 2217 #ifdef DIAGNOSTIC 2218 struct cpu_info *ci; 2219 CPU_INFO_ITERATOR cii; 2220 #endif /* DIAGNOSTIC */ 2221 lwp_t *l; 2222 2223 /* 2224 * If we have torn down this pmap, process deferred frees and 2225 * invalidations. Free now if the system is low on memory. 2226 * Otherwise, free when the pmap is destroyed thus avoiding a 2227 * TLB shootdown. 2228 */ 2229 l = curlwp; 2230 if (__predict_false(l->l_md.md_gc_pmap == pmap)) { 2231 if (uvmexp.free < uvmexp.freetarg) { 2232 pmap_update(pmap); 2233 } else { 2234 KASSERT(pmap->pm_gc_ptp == NULL); 2235 pmap->pm_gc_ptp = l->l_md.md_gc_ptp; 2236 l->l_md.md_gc_ptp = NULL; 2237 l->l_md.md_gc_pmap = NULL; 2238 } 2239 } 2240 2241 /* 2242 * drop reference count 2243 */ 2244 2245 if (atomic_dec_uint_nv(&pmap->pm_obj[0].uo_refs) > 0) { 2246 return; 2247 } 2248 2249 #ifdef DIAGNOSTIC 2250 for (CPU_INFO_FOREACH(cii, ci)) 2251 if (ci->ci_pmap == pmap) 2252 panic("destroying pmap being used"); 2253 #endif /* DIAGNOSTIC */ 2254 2255 /* 2256 * reference count is zero, free pmap resources and then free pmap. 
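 *
 * Reference counting sketch (illustrative): the count lives in
 * pm_obj[0].uo_refs; pmap_reference() raises it (e.g. pmap_load()
 * below takes a reference on the incoming pmap before switching to
 * it) and every pmap_destroy() drops it, so only the call that brings
 * it to zero performs the teardown that follows.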
2257 */ 2258 #ifdef XEN 2259 /* 2260 * Xen lazy APDP handling: 2261 * clear APDP_PDE if pmap is the currently mapped 2262 */ 2263 if (xpmap_ptom_masked(pmap_pdirpa(pmap, 0)) == (*APDP_PDE & PG_FRAME)) { 2264 kpreempt_disable(); 2265 pmap_unmap_apdp(); 2266 pmap_pte_flush(); 2267 pmap_apte_flush(pmap_kernel()); 2268 kpreempt_enable(); 2269 } 2270 #endif 2271 2272 /* 2273 * remove it from global list of pmaps 2274 */ 2275 2276 mutex_enter(&pmaps_lock); 2277 LIST_REMOVE(pmap, pm_list); 2278 mutex_exit(&pmaps_lock); 2279 2280 /* 2281 * Process deferred PTP frees. No TLB shootdown required, as the 2282 * PTP pages are no longer visible to any CPU. 2283 */ 2284 2285 pmap_free_ptps(pmap->pm_gc_ptp); 2286 2287 /* 2288 * destroyed pmap shouldn't have remaining PTPs 2289 */ 2290 2291 for (i = 0; i < PTP_LEVELS - 1; i++) { 2292 KASSERT(pmap->pm_obj[i].uo_npages == 0); 2293 KASSERT(TAILQ_EMPTY(&pmap->pm_obj[i].memq)); 2294 } 2295 2296 /* 2297 * MULTIPROCESSOR -- no need to flush out of other processors' 2298 * APTE space because we do that in pmap_unmap_ptes(). 2299 */ 2300 pool_cache_put(&pmap_pdp_cache, pmap->pm_pdir); 2301 2302 #ifdef USER_LDT 2303 if (pmap->pm_ldt != NULL) { 2304 /* 2305 * no need to switch the LDT; this address space is gone, 2306 * nothing is using it. 2307 * 2308 * No need to lock the pmap for ldt_free (or anything else), 2309 * we're the last one to use it. 2310 */ 2311 mutex_enter(&cpu_lock); 2312 ldt_free(pmap->pm_ldt_sel); 2313 mutex_exit(&cpu_lock); 2314 uvm_km_free(kernel_map, (vaddr_t)pmap->pm_ldt, 2315 pmap->pm_ldt_len, UVM_KMF_WIRED); 2316 } 2317 #endif 2318 2319 for (i = 0; i < PTP_LEVELS - 1; i++) { 2320 uvm_obj_destroy(&pmap->pm_obj[i], false); 2321 mutex_destroy(&pmap->pm_obj_lock[i]); 2322 } 2323 pool_cache_put(&pmap_cache, pmap); 2324 } 2325 2326 /* 2327 * pmap_remove_all: pmap is being torn down by the current thread. 2328 * avoid unnecessary invalidations. 2329 */ 2330 2331 void 2332 pmap_remove_all(struct pmap *pmap) 2333 { 2334 lwp_t *l = curlwp; 2335 2336 KASSERT(l->l_md.md_gc_pmap == NULL); 2337 2338 l->l_md.md_gc_pmap = pmap; 2339 } 2340 2341 #if defined(PMAP_FORK) 2342 /* 2343 * pmap_fork: perform any necessary data structure manipulation when 2344 * a VM space is forked. 2345 */ 2346 2347 void 2348 pmap_fork(struct pmap *pmap1, struct pmap *pmap2) 2349 { 2350 #ifdef USER_LDT 2351 union descriptor *new_ldt; 2352 size_t len; 2353 int sel; 2354 2355 if (__predict_true(pmap1->pm_ldt == NULL)) { 2356 return; 2357 } 2358 2359 retry: 2360 if (pmap1->pm_ldt != NULL) { 2361 len = pmap1->pm_ldt_len; 2362 new_ldt = (union descriptor *)uvm_km_alloc(kernel_map, len, 0, 2363 UVM_KMF_WIRED); 2364 mutex_enter(&cpu_lock); 2365 sel = ldt_alloc(new_ldt, len); 2366 if (sel == -1) { 2367 mutex_exit(&cpu_lock); 2368 uvm_km_free(kernel_map, (vaddr_t)new_ldt, len, 2369 UVM_KMF_WIRED); 2370 printf("WARNING: pmap_fork: unable to allocate LDT\n"); 2371 return; 2372 } 2373 } else { 2374 len = -1; 2375 new_ldt = NULL; 2376 sel = -1; 2377 mutex_enter(&cpu_lock); 2378 } 2379 2380 /* Copy the LDT, if necessary. 
*/ 2381 if (pmap1->pm_ldt != NULL) { 2382 if (len != pmap1->pm_ldt_len) { 2383 if (len != -1) { 2384 ldt_free(sel); 2385 uvm_km_free(kernel_map, (vaddr_t)new_ldt, 2386 len, UVM_KMF_WIRED); 2387 } 2388 mutex_exit(&cpu_lock); 2389 goto retry; 2390 } 2391 2392 memcpy(new_ldt, pmap1->pm_ldt, len); 2393 pmap2->pm_ldt = new_ldt; 2394 pmap2->pm_ldt_len = pmap1->pm_ldt_len; 2395 pmap2->pm_ldt_sel = sel; 2396 len = -1; 2397 } 2398 2399 if (len != -1) { 2400 ldt_free(sel); 2401 uvm_km_free(kernel_map, (vaddr_t)new_ldt, len, 2402 UVM_KMF_WIRED); 2403 } 2404 mutex_exit(&cpu_lock); 2405 #endif /* USER_LDT */ 2406 } 2407 #endif /* PMAP_FORK */ 2408 2409 #ifdef USER_LDT 2410 2411 /* 2412 * pmap_ldt_xcall: cross call used by pmap_ldt_sync. if the named pmap 2413 * is active, reload LDTR. 2414 */ 2415 static void 2416 pmap_ldt_xcall(void *arg1, void *arg2) 2417 { 2418 struct pmap *pm; 2419 2420 kpreempt_disable(); 2421 pm = arg1; 2422 if (curcpu()->ci_pmap == pm) { 2423 lldt(pm->pm_ldt_sel); 2424 } 2425 kpreempt_enable(); 2426 } 2427 2428 /* 2429 * pmap_ldt_sync: LDT selector for the named pmap is changing. swap 2430 * in the new selector on all CPUs. 2431 */ 2432 void 2433 pmap_ldt_sync(struct pmap *pm) 2434 { 2435 uint64_t where; 2436 2437 KASSERT(mutex_owned(&cpu_lock)); 2438 2439 pmap_ldt_evcnt.ev_count++; 2440 where = xc_broadcast(0, pmap_ldt_xcall, pm, NULL); 2441 xc_wait(where); 2442 } 2443 2444 /* 2445 * pmap_ldt_cleanup: if the pmap has a local LDT, deallocate it, and 2446 * restore the default. 2447 */ 2448 2449 void 2450 pmap_ldt_cleanup(struct lwp *l) 2451 { 2452 pmap_t pmap = l->l_proc->p_vmspace->vm_map.pmap; 2453 union descriptor *dp = NULL; 2454 size_t len = 0; 2455 int sel = -1; 2456 2457 if (__predict_true(pmap->pm_ldt == NULL)) { 2458 return; 2459 } 2460 2461 mutex_enter(&cpu_lock); 2462 if (pmap->pm_ldt != NULL) { 2463 sel = pmap->pm_ldt_sel; 2464 dp = pmap->pm_ldt; 2465 len = pmap->pm_ldt_len; 2466 pmap->pm_ldt_sel = GSYSSEL(GLDT_SEL, SEL_KPL); 2467 pmap->pm_ldt = NULL; 2468 pmap->pm_ldt_len = 0; 2469 pmap_ldt_sync(pmap); 2470 ldt_free(sel); 2471 uvm_km_free(kernel_map, (vaddr_t)dp, len, UVM_KMF_WIRED); 2472 } 2473 mutex_exit(&cpu_lock); 2474 } 2475 #endif /* USER_LDT */ 2476 2477 /* 2478 * pmap_activate: activate a process' pmap 2479 * 2480 * => must be called with kernel preemption disabled 2481 * => if lwp is the curlwp, then set ci_want_pmapload so that 2482 * actual MMU context switch will be done by pmap_load() later 2483 */ 2484 2485 void 2486 pmap_activate(struct lwp *l) 2487 { 2488 struct cpu_info *ci; 2489 struct pmap *pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map); 2490 2491 KASSERT(kpreempt_disabled()); 2492 2493 ci = curcpu(); 2494 2495 if (l == ci->ci_curlwp) { 2496 KASSERT(ci->ci_want_pmapload == 0); 2497 KASSERT(ci->ci_tlbstate != TLBSTATE_VALID); 2498 #ifdef KSTACK_CHECK_DR0 2499 /* 2500 * setup breakpoint on the top of stack 2501 */ 2502 if (l == &lwp0) 2503 dr0(0, 0, 0, 0); 2504 else 2505 dr0(KSTACK_LOWEST_ADDR(l), 1, 3, 1); 2506 #endif 2507 2508 /* 2509 * no need to switch to kernel vmspace because 2510 * it's a subset of any vmspace. 2511 */ 2512 2513 if (pmap == pmap_kernel()) { 2514 ci->ci_want_pmapload = 0; 2515 return; 2516 } 2517 2518 ci->ci_want_pmapload = 1; 2519 } 2520 } 2521 2522 /* 2523 * pmap_reactivate: try to regain reference to the pmap. 
2524 * 2525 * => must be called with kernel preemption disabled 2526 */ 2527 2528 static bool 2529 pmap_reactivate(struct pmap *pmap) 2530 { 2531 struct cpu_info *ci; 2532 uint32_t cpumask; 2533 bool result; 2534 uint32_t oldcpus; 2535 2536 ci = curcpu(); 2537 cpumask = ci->ci_cpumask; 2538 2539 KASSERT(kpreempt_disabled()); 2540 #if defined(XEN) && defined(__x86_64__) 2541 KASSERT(pmap_pdirpa(pmap, 0) == ci->ci_xen_current_user_pgd); 2542 #elif defined(PAE) 2543 KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(ci->ci_pae_l3_pdir[0])); 2544 #elif !defined(XEN) 2545 KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(rcr3())); 2546 #endif 2547 2548 /* 2549 * if we still have a lazy reference to this pmap, 2550 * we can assume that there was no tlb shootdown 2551 * for this pmap in the meantime. 2552 * 2553 * the order of events here is important as we must 2554 * synchronize with TLB shootdown interrupts. declare 2555 * interest in invalidations (TLBSTATE_VALID) and then 2556 * check the cpumask, which the IPIs can change only 2557 * when the state is TLBSTATE_LAZY. 2558 */ 2559 2560 ci->ci_tlbstate = TLBSTATE_VALID; 2561 oldcpus = pmap->pm_cpus; 2562 KASSERT((pmap->pm_kernel_cpus & cpumask) != 0); 2563 if (oldcpus & cpumask) { 2564 /* got it */ 2565 result = true; 2566 } else { 2567 /* must reload */ 2568 atomic_or_32(&pmap->pm_cpus, cpumask); 2569 result = false; 2570 } 2571 2572 return result; 2573 } 2574 2575 /* 2576 * pmap_load: actually switch pmap. (fill in %cr3 and LDT info) 2577 */ 2578 2579 void 2580 pmap_load(void) 2581 { 2582 struct cpu_info *ci; 2583 uint32_t cpumask; 2584 struct pmap *pmap; 2585 struct pmap *oldpmap; 2586 struct lwp *l; 2587 struct pcb *pcb; 2588 uint64_t ncsw; 2589 2590 kpreempt_disable(); 2591 retry: 2592 ci = curcpu(); 2593 if (!ci->ci_want_pmapload) { 2594 kpreempt_enable(); 2595 return; 2596 } 2597 cpumask = ci->ci_cpumask; 2598 l = ci->ci_curlwp; 2599 ncsw = l->l_ncsw; 2600 2601 /* should be able to take ipis. */ 2602 KASSERT(ci->ci_ilevel < IPL_HIGH); 2603 #ifdef XEN 2604 /* XXX not yet KASSERT(x86_read_psl() != 0); */ 2605 #else 2606 KASSERT((x86_read_psl() & PSL_I) != 0); 2607 #endif 2608 2609 KASSERT(l != NULL); 2610 pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map); 2611 KASSERT(pmap != pmap_kernel()); 2612 oldpmap = ci->ci_pmap; 2613 pcb = lwp_getpcb(l); 2614 2615 if (pmap == oldpmap) { 2616 if (!pmap_reactivate(pmap)) { 2617 u_int gen = uvm_emap_gen_return(); 2618 2619 /* 2620 * pmap has been changed during deactivated. 2621 * our tlb may be stale. 2622 */ 2623 2624 tlbflush(); 2625 uvm_emap_update(gen); 2626 } 2627 2628 ci->ci_want_pmapload = 0; 2629 kpreempt_enable(); 2630 return; 2631 } 2632 2633 /* 2634 * grab a reference to the new pmap. 2635 */ 2636 2637 pmap_reference(pmap); 2638 2639 /* 2640 * actually switch pmap. 2641 */ 2642 2643 atomic_and_32(&oldpmap->pm_cpus, ~cpumask); 2644 atomic_and_32(&oldpmap->pm_kernel_cpus, ~cpumask); 2645 2646 #if defined(XEN) && defined(__x86_64__) 2647 KASSERT(pmap_pdirpa(oldpmap, 0) == ci->ci_xen_current_user_pgd || 2648 oldpmap == pmap_kernel()); 2649 #elif defined(PAE) 2650 KASSERT(pmap_pdirpa(oldpmap, 0) == pmap_pte2pa(ci->ci_pae_l3_pdir[0])); 2651 #elif !defined(XEN) 2652 KASSERT(pmap_pdirpa(oldpmap, 0) == pmap_pte2pa(rcr3())); 2653 #endif 2654 KASSERT((pmap->pm_cpus & cpumask) == 0); 2655 KASSERT((pmap->pm_kernel_cpus & cpumask) == 0); 2656 2657 /* 2658 * mark the pmap in use by this processor. 
again we must 2659 * synchronize with TLB shootdown interrupts, so set the 2660 * state VALID first, then register us for shootdown events 2661 * on this pmap. 2662 */ 2663 2664 ci->ci_tlbstate = TLBSTATE_VALID; 2665 atomic_or_32(&pmap->pm_cpus, cpumask); 2666 atomic_or_32(&pmap->pm_kernel_cpus, cpumask); 2667 ci->ci_pmap = pmap; 2668 2669 /* 2670 * update tss. now that we have registered for invalidations 2671 * from other CPUs, we're good to load the page tables. 2672 */ 2673 #ifdef PAE 2674 pcb->pcb_cr3 = ci->ci_pae_l3_pdirpa; 2675 #else 2676 pcb->pcb_cr3 = pmap_pdirpa(pmap, 0); 2677 #endif 2678 2679 #ifdef i386 2680 #ifdef XEN 2681 /* 2682 * clear APDP slot, in case it points to a page table that has 2683 * been freed 2684 */ 2685 if (*APDP_PDE) { 2686 pmap_unmap_apdp(); 2687 } 2688 /* lldt() does pmap_pte_flush() */ 2689 #endif /* XEN */ 2690 2691 #ifndef XEN 2692 ci->ci_tss.tss_ldt = pmap->pm_ldt_sel; 2693 ci->ci_tss.tss_cr3 = pcb->pcb_cr3; 2694 #endif /* !XEN */ 2695 #endif /* i386 */ 2696 2697 lldt(pmap->pm_ldt_sel); 2698 2699 u_int gen = uvm_emap_gen_return(); 2700 cpu_load_pmap(pmap); 2701 uvm_emap_update(gen); 2702 2703 ci->ci_want_pmapload = 0; 2704 2705 /* 2706 * we're now running with the new pmap. drop the reference 2707 * to the old pmap. if we block, we need to go around again. 2708 */ 2709 2710 pmap_destroy(oldpmap); 2711 if (l->l_ncsw != ncsw) { 2712 goto retry; 2713 } 2714 2715 kpreempt_enable(); 2716 } 2717 2718 /* 2719 * pmap_deactivate: deactivate a process' pmap. 2720 * 2721 * => Must be called with kernel preemption disabled (high IPL is enough). 2722 */ 2723 void 2724 pmap_deactivate(struct lwp *l) 2725 { 2726 struct pmap *pmap; 2727 struct cpu_info *ci; 2728 2729 KASSERT(kpreempt_disabled()); 2730 2731 if (l != curlwp) { 2732 return; 2733 } 2734 2735 /* 2736 * Wait for pending TLB shootdowns to complete. Necessary because 2737 * TLB shootdown state is per-CPU, and the LWP may be coming off 2738 * the CPU before it has a chance to call pmap_update(), e.g. due 2739 * to kernel preemption or blocking routine in between. 2740 */ 2741 pmap_tlb_shootnow(); 2742 2743 ci = curcpu(); 2744 2745 if (ci->ci_want_pmapload) { 2746 KASSERT(vm_map_pmap(&l->l_proc->p_vmspace->vm_map) 2747 != pmap_kernel()); 2748 KASSERT(vm_map_pmap(&l->l_proc->p_vmspace->vm_map) 2749 != ci->ci_pmap || ci->ci_tlbstate != TLBSTATE_VALID); 2750 2751 /* 2752 * userspace has not been touched. 2753 * nothing to do here. 2754 */ 2755 2756 ci->ci_want_pmapload = 0; 2757 return; 2758 } 2759 2760 pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map); 2761 2762 if (pmap == pmap_kernel()) { 2763 return; 2764 } 2765 2766 #if defined(XEN) && defined(__x86_64__) 2767 KASSERT(pmap_pdirpa(pmap, 0) == ci->ci_xen_current_user_pgd); 2768 #elif defined(PAE) 2769 KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(ci->ci_pae_l3_pdir[0])); 2770 #elif !defined(XEN) 2771 KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(rcr3())); 2772 #endif 2773 KASSERT(ci->ci_pmap == pmap); 2774 2775 /* 2776 * we aren't interested in TLB invalidations for this pmap, 2777 * at least for the time being. 2778 */ 2779 2780 KASSERT(ci->ci_tlbstate == TLBSTATE_VALID); 2781 ci->ci_tlbstate = TLBSTATE_LAZY; 2782 } 2783 2784 /* 2785 * end of lifecycle functions 2786 */ 2787 2788 /* 2789 * some misc. 
functions 2790 */ 2791 2792 int 2793 pmap_pdes_invalid(vaddr_t va, pd_entry_t * const *pdes, pd_entry_t *lastpde) 2794 { 2795 int i; 2796 unsigned long index; 2797 pd_entry_t pde; 2798 2799 for (i = PTP_LEVELS; i > 1; i--) { 2800 index = pl_i(va, i); 2801 pde = pdes[i - 2][index]; 2802 if ((pde & PG_V) == 0) 2803 return i; 2804 } 2805 if (lastpde != NULL) 2806 *lastpde = pde; 2807 return 0; 2808 } 2809 2810 /* 2811 * pmap_extract: extract a PA for the given VA 2812 */ 2813 2814 bool 2815 pmap_extract(struct pmap *pmap, vaddr_t va, paddr_t *pap) 2816 { 2817 pt_entry_t *ptes, pte; 2818 pd_entry_t pde; 2819 pd_entry_t * const *pdes; 2820 struct pmap *pmap2; 2821 struct cpu_info *ci; 2822 paddr_t pa; 2823 lwp_t *l; 2824 bool hard, rv; 2825 2826 rv = false; 2827 pa = 0; 2828 l = curlwp; 2829 2830 KPREEMPT_DISABLE(l); 2831 ci = l->l_cpu; 2832 if (__predict_true(!ci->ci_want_pmapload && ci->ci_pmap == pmap) || 2833 pmap == pmap_kernel()) { 2834 /* 2835 * no need to lock, because it's pmap_kernel() or our 2836 * own pmap and is active. if a user pmap, the caller 2837 * will hold the vm_map write/read locked and so prevent 2838 * entries from disappearing while we are here. ptps 2839 * can disappear via pmap_remove() and pmap_protect(), 2840 * but they are called with the vm_map write locked. 2841 */ 2842 hard = false; 2843 ptes = PTE_BASE; 2844 pdes = normal_pdes; 2845 } else { 2846 /* we lose, do it the hard way. */ 2847 hard = true; 2848 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); 2849 } 2850 if (pmap_pdes_valid(va, pdes, &pde)) { 2851 pte = ptes[pl1_i(va)]; 2852 if (pde & PG_PS) { 2853 pa = (pde & PG_LGFRAME) | (va & (NBPD_L2 - 1)); 2854 rv = true; 2855 } else if (__predict_true((pte & PG_V) != 0)) { 2856 pa = pmap_pte2pa(pte) | (va & (NBPD_L1 - 1)); 2857 rv = true; 2858 } 2859 } 2860 if (__predict_false(hard)) { 2861 pmap_unmap_ptes(pmap, pmap2); 2862 } 2863 KPREEMPT_ENABLE(l); 2864 if (pap != NULL) { 2865 *pap = pa; 2866 } 2867 return rv; 2868 } 2869 2870 2871 /* 2872 * vtophys: virtual address to physical address. For use by 2873 * machine-dependent code only. 2874 */ 2875 2876 paddr_t 2877 vtophys(vaddr_t va) 2878 { 2879 paddr_t pa; 2880 2881 if (pmap_extract(pmap_kernel(), va, &pa) == true) 2882 return (pa); 2883 return (0); 2884 } 2885 2886 __weak_alias(pmap_extract_ma, pmap_extract); 2887 2888 #ifdef XEN 2889 2890 /* 2891 * vtomach: virtual address to machine address. For use by 2892 * machine-dependent code only. 2893 */ 2894 2895 paddr_t 2896 vtomach(vaddr_t va) 2897 { 2898 paddr_t pa; 2899 2900 if (pmap_extract_ma(pmap_kernel(), va, &pa) == true) 2901 return (pa); 2902 return (0); 2903 } 2904 2905 #endif /* XEN */ 2906 2907 /* 2908 * pmap_virtual_space: used during bootup [pmap_steal_memory] to 2909 * determine the bounds of the kernel virtual addess space. 2910 */ 2911 2912 void 2913 pmap_virtual_space(vaddr_t *startp, vaddr_t *endp) 2914 { 2915 *startp = virtual_avail; 2916 *endp = virtual_end; 2917 } 2918 2919 /* 2920 * pmap_map: map a range of PAs into kvm. 2921 * 2922 * => used during crash dump 2923 * => XXX: pmap_map() should be phased out? 
2924 */ 2925 2926 vaddr_t 2927 pmap_map(vaddr_t va, paddr_t spa, paddr_t epa, vm_prot_t prot) 2928 { 2929 while (spa < epa) { 2930 pmap_kenter_pa(va, spa, prot, 0); 2931 va += PAGE_SIZE; 2932 spa += PAGE_SIZE; 2933 } 2934 pmap_update(pmap_kernel()); 2935 return va; 2936 } 2937 2938 /* 2939 * pmap_zero_page: zero a page 2940 */ 2941 2942 void 2943 pmap_zero_page(paddr_t pa) 2944 { 2945 pt_entry_t *zpte; 2946 void *zerova; 2947 int id; 2948 2949 kpreempt_disable(); 2950 id = cpu_number(); 2951 zpte = PTESLEW(zero_pte, id); 2952 zerova = VASLEW(zerop, id); 2953 2954 #ifdef DIAGNOSTIC 2955 if (*zpte) 2956 panic("pmap_zero_page: lock botch"); 2957 #endif 2958 2959 pmap_pte_set(zpte, pmap_pa2pte(pa) | PG_V | PG_RW | PG_M | PG_U | PG_k); 2960 pmap_pte_flush(); 2961 pmap_update_pg((vaddr_t)zerova); /* flush TLB */ 2962 2963 memset(zerova, 0, PAGE_SIZE); 2964 2965 #if defined(DIAGNOSTIC) || defined(XEN) 2966 pmap_pte_set(zpte, 0); /* zap ! */ 2967 pmap_pte_flush(); 2968 #endif 2969 kpreempt_enable(); 2970 } 2971 2972 /* 2973 * pmap_pagezeroidle: the same, for the idle loop page zero'er. 2974 * Returns true if the page was zero'd, false if we aborted for 2975 * some reason. 2976 */ 2977 2978 bool 2979 pmap_pageidlezero(paddr_t pa) 2980 { 2981 pt_entry_t *zpte; 2982 void *zerova; 2983 bool rv; 2984 int id; 2985 2986 id = cpu_number(); 2987 zpte = PTESLEW(zero_pte, id); 2988 zerova = VASLEW(zerop, id); 2989 2990 KASSERT(cpu_feature[0] & CPUID_SSE2); 2991 KASSERT(*zpte == 0); 2992 2993 pmap_pte_set(zpte, pmap_pa2pte(pa) | PG_V | PG_RW | PG_M | PG_U | PG_k); 2994 pmap_pte_flush(); 2995 pmap_update_pg((vaddr_t)zerova); /* flush TLB */ 2996 2997 rv = sse2_idlezero_page(zerova); 2998 2999 #if defined(DIAGNOSTIC) || defined(XEN) 3000 pmap_pte_set(zpte, 0); /* zap ! */ 3001 pmap_pte_flush(); 3002 #endif 3003 3004 return rv; 3005 } 3006 3007 /* 3008 * pmap_copy_page: copy a page 3009 */ 3010 3011 void 3012 pmap_copy_page(paddr_t srcpa, paddr_t dstpa) 3013 { 3014 pt_entry_t *spte; 3015 pt_entry_t *dpte; 3016 void *csrcva; 3017 void *cdstva; 3018 int id; 3019 3020 kpreempt_disable(); 3021 id = cpu_number(); 3022 spte = PTESLEW(csrc_pte,id); 3023 dpte = PTESLEW(cdst_pte,id); 3024 csrcva = VASLEW(csrcp, id); 3025 cdstva = VASLEW(cdstp, id); 3026 3027 KASSERT(*spte == 0 && *dpte == 0); 3028 3029 pmap_pte_set(spte, pmap_pa2pte(srcpa) | PG_V | PG_RW | PG_U | PG_k); 3030 pmap_pte_set(dpte, 3031 pmap_pa2pte(dstpa) | PG_V | PG_RW | PG_M | PG_U | PG_k); 3032 pmap_pte_flush(); 3033 pmap_update_2pg((vaddr_t)csrcva, (vaddr_t)cdstva); 3034 3035 memcpy(cdstva, csrcva, PAGE_SIZE); 3036 3037 #if defined(DIAGNOSTIC) || defined(XEN) 3038 pmap_pte_set(spte, 0); 3039 pmap_pte_set(dpte, 0); 3040 pmap_pte_flush(); 3041 #endif 3042 kpreempt_enable(); 3043 } 3044 3045 static pt_entry_t * 3046 pmap_map_ptp(struct vm_page *ptp) 3047 { 3048 pt_entry_t *ptppte; 3049 void *ptpva; 3050 int id; 3051 3052 KASSERT(kpreempt_disabled()); 3053 3054 id = cpu_number(); 3055 ptppte = PTESLEW(ptp_pte, id); 3056 ptpva = VASLEW(ptpp, id); 3057 #if !defined(XEN) 3058 pmap_pte_set(ptppte, pmap_pa2pte(VM_PAGE_TO_PHYS(ptp)) | PG_V | PG_M | 3059 PG_RW | PG_U | PG_k); 3060 #else 3061 pmap_pte_set(ptppte, pmap_pa2pte(VM_PAGE_TO_PHYS(ptp)) | PG_V | PG_M | 3062 PG_U | PG_k); 3063 #endif 3064 pmap_pte_flush(); 3065 pmap_update_pg((vaddr_t)ptpva); 3066 3067 return (pt_entry_t *)ptpva; 3068 } 3069 3070 static void 3071 pmap_unmap_ptp(void) 3072 { 3073 #if defined(DIAGNOSTIC) || defined(XEN) 3074 pt_entry_t *pte; 3075 3076 KASSERT(kpreempt_disabled()); 3077 3078 
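	/*
	 * Zap this CPU's PTP window mapping if it is still valid, so
	 * the slot cannot hand back a stale page table mapping later
	 * (this body is only compiled under DIAGNOSTIC or XEN, per the
	 * #if above).
	 */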
pte = PTESLEW(ptp_pte, cpu_number()); 3079 if (*pte != 0) { 3080 pmap_pte_set(pte, 0); 3081 pmap_pte_flush(); 3082 } 3083 #endif 3084 } 3085 3086 static pt_entry_t * 3087 pmap_map_pte(struct pmap *pmap, struct vm_page *ptp, vaddr_t va) 3088 { 3089 3090 KASSERT(kpreempt_disabled()); 3091 if (pmap_is_curpmap(pmap)) { 3092 return &PTE_BASE[pl1_i(va)]; /* (k)vtopte */ 3093 } 3094 KASSERT(ptp != NULL); 3095 return pmap_map_ptp(ptp) + pl1_pi(va); 3096 } 3097 3098 static void 3099 pmap_unmap_pte(void) 3100 { 3101 3102 KASSERT(kpreempt_disabled()); 3103 3104 pmap_unmap_ptp(); 3105 } 3106 3107 /* 3108 * p m a p r e m o v e f u n c t i o n s 3109 * 3110 * functions that remove mappings 3111 */ 3112 3113 /* 3114 * pmap_remove_ptes: remove PTEs from a PTP 3115 * 3116 * => caller must hold pmap's lock 3117 * => PTP must be mapped into KVA 3118 * => PTP should be null if pmap == pmap_kernel() 3119 * => must be called with kernel preemption disabled 3120 * => returns composite pte if at least one page should be shot down 3121 */ 3122 3123 static void 3124 pmap_remove_ptes(struct pmap *pmap, struct vm_page *ptp, vaddr_t ptpva, 3125 vaddr_t startva, vaddr_t endva, struct pv_entry **pv_tofree) 3126 { 3127 pt_entry_t *pte = (pt_entry_t *)ptpva; 3128 3129 KASSERT(pmap == pmap_kernel() || mutex_owned(pmap->pm_lock)); 3130 KASSERT(kpreempt_disabled()); 3131 3132 /* 3133 * note that ptpva points to the PTE that maps startva. this may 3134 * or may not be the first PTE in the PTP. 3135 * 3136 * we loop through the PTP while there are still PTEs to look at 3137 * and the wire_count is greater than 1 (because we use the wire_count 3138 * to keep track of the number of real PTEs in the PTP). 3139 */ 3140 while (startva < endva && (ptp == NULL || ptp->wire_count > 1)) { 3141 (void)pmap_remove_pte(pmap, ptp, pte, startva, pv_tofree); 3142 startva += PAGE_SIZE; 3143 pte++; 3144 } 3145 } 3146 3147 3148 /* 3149 * pmap_remove_pte: remove a single PTE from a PTP. 3150 * 3151 * => caller must hold pmap's lock 3152 * => PTP must be mapped into KVA 3153 * => PTP should be null if pmap == pmap_kernel() 3154 * => returns true if we removed a mapping 3155 * => must be called with kernel preemption disabled 3156 */ 3157 static bool 3158 pmap_remove_pte(struct pmap *pmap, struct vm_page *ptp, pt_entry_t *pte, 3159 vaddr_t va, struct pv_entry **pv_tofree) 3160 { 3161 struct pv_entry *pve; 3162 struct vm_page *pg; 3163 struct pmap_page *pp; 3164 pt_entry_t opte; 3165 3166 KASSERT(pmap == pmap_kernel() || mutex_owned(pmap->pm_lock)); 3167 KASSERT(kpreempt_disabled()); 3168 3169 if (!pmap_valid_entry(*pte)) { 3170 /* VA not mapped. */ 3171 return false; 3172 } 3173 3174 /* Atomically save the old PTE and zap it. */ 3175 opte = pmap_pte_testset(pte, 0); 3176 if (!pmap_valid_entry(opte)) { 3177 return false; 3178 } 3179 3180 pmap_exec_account(pmap, va, opte, 0); 3181 pmap_stats_update_bypte(pmap, 0, opte); 3182 3183 if (ptp) { 3184 /* 3185 * Dropping a PTE. Make sure that the PDE is flushed. 3186 */ 3187 ptp->wire_count--; 3188 if (ptp->wire_count <= 1) { 3189 opte |= PG_U; 3190 } 3191 } 3192 3193 if ((opte & PG_U) != 0) { 3194 pmap_tlb_shootdown(pmap, va, opte, TLBSHOOT_REMOVE_PTE); 3195 } 3196 3197 /* 3198 * If we are not on a pv_head list - we are done. 
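 *
 * (PG_PVLIST is only set for managed pages: pmap_enter_ma() below sets
 * it when PHYS_TO_VM_PAGE() recognizes the physical address.  Mappings
 * made with pmap_kenter_pa() or of unmanaged device memory never carry
 * it, so there is no pv entry or attribute state to update for them.)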
3199 */ 3200 if ((opte & PG_PVLIST) == 0) { 3201 #if defined(DIAGNOSTIC) && !defined(DOM0OPS) 3202 if (PHYS_TO_VM_PAGE(pmap_pte2pa(opte)) != NULL) 3203 panic("pmap_remove_pte: managed page without " 3204 "PG_PVLIST for %#" PRIxVADDR, va); 3205 #endif 3206 return true; 3207 } 3208 3209 pg = PHYS_TO_VM_PAGE(pmap_pte2pa(opte)); 3210 3211 KASSERTMSG(pg != NULL, ("pmap_remove_pte: unmanaged page marked " 3212 "PG_PVLIST, va = %#" PRIxVADDR ", pa = %#" PRIxPADDR, 3213 va, (paddr_t)pmap_pte2pa(opte))); 3214 3215 KASSERT(pmap == pmap_kernel() || uvm_page_locked_p(pg)); 3216 3217 /* Sync R/M bits. */ 3218 pp = VM_PAGE_TO_PP(pg); 3219 pp->pp_attrs |= opte; 3220 pve = pmap_remove_pv(pp, ptp, va); 3221 3222 if (pve) { 3223 pve->pve_next = *pv_tofree; 3224 *pv_tofree = pve; 3225 } 3226 return true; 3227 } 3228 3229 /* 3230 * pmap_remove: mapping removal function. 3231 * 3232 * => caller should not be holding any pmap locks 3233 */ 3234 3235 void 3236 pmap_remove(struct pmap *pmap, vaddr_t sva, vaddr_t eva) 3237 { 3238 pt_entry_t *ptes; 3239 pd_entry_t pde; 3240 pd_entry_t * const *pdes; 3241 struct pv_entry *pv_tofree = NULL; 3242 bool result; 3243 int i; 3244 paddr_t ptppa; 3245 vaddr_t blkendva, va = sva; 3246 struct vm_page *ptp; 3247 struct pmap *pmap2; 3248 3249 kpreempt_disable(); 3250 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); /* locks pmap */ 3251 3252 /* 3253 * removing one page? take shortcut function. 3254 */ 3255 3256 if (va + PAGE_SIZE == eva) { 3257 if (pmap_pdes_valid(va, pdes, &pde)) { 3258 3259 /* PA of the PTP */ 3260 ptppa = pmap_pte2pa(pde); 3261 3262 /* Get PTP if non-kernel mapping. */ 3263 if (pmap != pmap_kernel()) { 3264 ptp = pmap_find_ptp(pmap, va, ptppa, 1); 3265 KASSERTMSG(ptp != NULL, 3266 ("pmap_remove: unmanaged PTP detected") 3267 ); 3268 } else { 3269 /* Never free kernel PTPs. */ 3270 ptp = NULL; 3271 } 3272 3273 result = pmap_remove_pte(pmap, ptp, 3274 &ptes[pl1_i(va)], va, &pv_tofree); 3275 3276 /* 3277 * if mapping removed and the PTP is no longer 3278 * being used, free it! 3279 */ 3280 3281 if (result && ptp && ptp->wire_count <= 1) 3282 pmap_free_ptp(pmap, ptp, va, ptes, pdes); 3283 } 3284 } else for (/* null */ ; va < eva ; va = blkendva) { 3285 int lvl; 3286 3287 /* determine range of block */ 3288 blkendva = x86_round_pdr(va+1); 3289 if (blkendva > eva) 3290 blkendva = eva; 3291 3292 /* 3293 * XXXCDC: our PTE mappings should never be removed 3294 * with pmap_remove! if we allow this (and why would 3295 * we?) then we end up freeing the pmap's page 3296 * directory page (PDP) before we are finished using 3297 * it when we hit in in the recursive mapping. this 3298 * is BAD. 3299 * 3300 * long term solution is to move the PTEs out of user 3301 * address space. and into kernel address space (up 3302 * with APTE). then we can set VM_MAXUSER_ADDRESS to 3303 * be VM_MAX_ADDRESS. 3304 */ 3305 3306 /* XXXCDC: ugly hack to avoid freeing PDP here */ 3307 for (i = 0; i < PDP_SIZE; i++) { 3308 if (pl_i(va, PTP_LEVELS) == PDIR_SLOT_PTE+i) 3309 continue; 3310 } 3311 3312 lvl = pmap_pdes_invalid(va, pdes, &pde); 3313 if (lvl != 0) { 3314 /* 3315 * skip a range corresponding to an invalid pde. 3316 */ 3317 blkendva = (va & ptp_masks[lvl - 1]) + nbpd[lvl - 1]; 3318 continue; 3319 } 3320 3321 /* PA of the PTP */ 3322 ptppa = pmap_pte2pa(pde); 3323 3324 /* Get PTP if non-kernel mapping. 
*/ 3325 if (pmap != pmap_kernel()) { 3326 ptp = pmap_find_ptp(pmap, va, ptppa, 1); 3327 KASSERTMSG(ptp != NULL, 3328 ("pmap_remove: unmanaged PTP detected") 3329 ); 3330 } else { 3331 /* Never free kernel PTPs. */ 3332 ptp = NULL; 3333 } 3334 3335 pmap_remove_ptes(pmap, ptp, (vaddr_t)&ptes[pl1_i(va)], va, 3336 blkendva, &pv_tofree); 3337 3338 /* if PTP is no longer being used, free it! */ 3339 if (ptp && ptp->wire_count <= 1) { 3340 pmap_free_ptp(pmap, ptp, va, ptes, pdes); 3341 } 3342 } 3343 pmap_unmap_ptes(pmap, pmap2); /* unlock pmap */ 3344 kpreempt_enable(); 3345 3346 /* Now we free unused PVs */ 3347 if (pv_tofree) 3348 pmap_free_pvs(pv_tofree); 3349 } 3350 3351 /* 3352 * pmap_sync_pv: clear pte bits and return the old value of the pte. 3353 * 3354 * => Caller should disable kernel preemption. 3355 * => issues tlb shootdowns if necessary. 3356 */ 3357 3358 static int 3359 pmap_sync_pv(struct pv_pte *pvpte, pt_entry_t expect, int clearbits, 3360 pt_entry_t *optep) 3361 { 3362 struct pmap *pmap; 3363 struct vm_page *ptp; 3364 vaddr_t va; 3365 pt_entry_t *ptep; 3366 pt_entry_t opte; 3367 pt_entry_t npte; 3368 bool need_shootdown; 3369 3370 ptp = pvpte->pte_ptp; 3371 va = pvpte->pte_va; 3372 KASSERT(ptp == NULL || ptp->uobject != NULL); 3373 KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset); 3374 pmap = ptp_to_pmap(ptp); 3375 3376 KASSERT((expect & ~(PG_FRAME | PG_V)) == 0); 3377 KASSERT((expect & PG_V) != 0); 3378 KASSERT(clearbits == ~0 || (clearbits & ~(PG_M | PG_U | PG_RW)) == 0); 3379 KASSERT(kpreempt_disabled()); 3380 3381 ptep = pmap_map_pte(pmap, ptp, va); 3382 do { 3383 opte = *ptep; 3384 KASSERT((opte & (PG_M | PG_U)) != PG_M); 3385 KASSERT((opte & (PG_U | PG_V)) != PG_U); 3386 KASSERT(opte == 0 || (opte & PG_V) != 0); 3387 if ((opte & (PG_FRAME | PG_V)) != expect) { 3388 3389 /* 3390 * we lost a race with a V->P operation like 3391 * pmap_remove(). wait for the competitor 3392 * reflecting pte bits into mp_attrs. 3393 * 3394 * issue a redundant TLB shootdown so that 3395 * we can wait for its completion. 3396 */ 3397 3398 pmap_unmap_pte(); 3399 if (clearbits != 0) { 3400 pmap_tlb_shootdown(pmap, va, 3401 (pmap == pmap_kernel() ? PG_G : 0), 3402 TLBSHOOT_SYNC_PV1); 3403 } 3404 return EAGAIN; 3405 } 3406 3407 /* 3408 * check if there's anything to do on this pte. 3409 */ 3410 3411 if ((opte & clearbits) == 0) { 3412 need_shootdown = false; 3413 break; 3414 } 3415 3416 /* 3417 * we need a shootdown if the pte is cached. (PG_U) 3418 * 3419 * ...unless we are clearing only the PG_RW bit and 3420 * it isn't cached as RW. (PG_M) 3421 */ 3422 3423 need_shootdown = (opte & PG_U) != 0 && 3424 !(clearbits == PG_RW && (opte & PG_M) == 0); 3425 3426 npte = opte & ~clearbits; 3427 3428 /* 3429 * if we need a shootdown anyway, clear PG_U and PG_M. 
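 *
 * Worked example (illustrative): write-protecting a clean, referenced
 * mapping (clearbits == PG_RW, opte has PG_U but not PG_M) needs no
 * shootdown, since the hardware must re-walk the pte to set PG_M
 * before the first write and that walk will see PG_RW already gone;
 * clearing PG_U or PG_M on a referenced pte always forces one.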
3430 */ 3431 3432 if (need_shootdown) { 3433 npte &= ~(PG_U | PG_M); 3434 } 3435 KASSERT((npte & (PG_M | PG_U)) != PG_M); 3436 KASSERT((npte & (PG_U | PG_V)) != PG_U); 3437 KASSERT(npte == 0 || (opte & PG_V) != 0); 3438 } while (pmap_pte_cas(ptep, opte, npte) != opte); 3439 3440 if (need_shootdown) { 3441 pmap_tlb_shootdown(pmap, va, opte, TLBSHOOT_SYNC_PV2); 3442 } 3443 pmap_unmap_pte(); 3444 3445 *optep = opte; 3446 return 0; 3447 } 3448 3449 /* 3450 * pmap_page_remove: remove a managed vm_page from all pmaps that map it 3451 * 3452 * => R/M bits are sync'd back to attrs 3453 */ 3454 3455 void 3456 pmap_page_remove(struct vm_page *pg) 3457 { 3458 struct pmap_page *pp; 3459 struct pv_pte *pvpte; 3460 struct pv_entry *killlist = NULL; 3461 struct vm_page *ptp; 3462 pt_entry_t expect; 3463 lwp_t *l; 3464 int count; 3465 3466 KASSERT(uvm_page_locked_p(pg)); 3467 3468 l = curlwp; 3469 pp = VM_PAGE_TO_PP(pg); 3470 expect = pmap_pa2pte(VM_PAGE_TO_PHYS(pg)) | PG_V; 3471 count = SPINLOCK_BACKOFF_MIN; 3472 kpreempt_disable(); 3473 startover: 3474 while ((pvpte = pv_pte_first(pp)) != NULL) { 3475 struct pmap *pmap; 3476 struct pv_entry *pve; 3477 pt_entry_t opte; 3478 vaddr_t va; 3479 int error; 3480 3481 /* 3482 * add a reference to the pmap before clearing the pte. 3483 * otherwise the pmap can disappear behind us. 3484 */ 3485 3486 ptp = pvpte->pte_ptp; 3487 pmap = ptp_to_pmap(ptp); 3488 if (ptp != NULL) { 3489 pmap_reference(pmap); 3490 } 3491 3492 error = pmap_sync_pv(pvpte, expect, ~0, &opte); 3493 if (error == EAGAIN) { 3494 int hold_count; 3495 KERNEL_UNLOCK_ALL(curlwp, &hold_count); 3496 if (ptp != NULL) { 3497 pmap_destroy(pmap); 3498 } 3499 SPINLOCK_BACKOFF(count); 3500 KERNEL_LOCK(hold_count, curlwp); 3501 goto startover; 3502 } 3503 3504 pp->pp_attrs |= opte; 3505 va = pvpte->pte_va; 3506 pve = pmap_remove_pv(pp, ptp, va); 3507 3508 /* update the PTP reference count. free if last reference. */ 3509 if (ptp != NULL) { 3510 struct pmap *pmap2; 3511 pt_entry_t *ptes; 3512 pd_entry_t * const *pdes; 3513 3514 KASSERT(pmap != pmap_kernel()); 3515 3516 pmap_tlb_shootnow(); 3517 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); 3518 pmap_stats_update_bypte(pmap, 0, opte); 3519 ptp->wire_count--; 3520 if (ptp->wire_count <= 1) { 3521 pmap_free_ptp(pmap, ptp, va, ptes, pdes); 3522 } 3523 pmap_unmap_ptes(pmap, pmap2); 3524 pmap_destroy(pmap); 3525 } else { 3526 KASSERT(pmap == pmap_kernel()); 3527 pmap_stats_update_bypte(pmap, 0, opte); 3528 } 3529 3530 if (pve != NULL) { 3531 pve->pve_next = killlist; /* mark it for death */ 3532 killlist = pve; 3533 } 3534 } 3535 pmap_tlb_shootnow(); 3536 kpreempt_enable(); 3537 3538 /* Now free unused pvs. */ 3539 pmap_free_pvs(killlist); 3540 } 3541 3542 /* 3543 * p m a p a t t r i b u t e f u n c t i o n s 3544 * functions that test/change managed page's attributes 3545 * since a page can be mapped multiple times we must check each PTE that 3546 * maps it by going down the pv lists. 
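 *
 * Rough shape of pmap_test_attrs() and pmap_clear_attrs() below
 * (illustrative only):
 *
 *	for (pvpte = pv_pte_first(pp); pvpte != NULL;
 *	     pvpte = pv_pte_next(pp, pvpte))
 *		if (pmap_sync_pv(pvpte, expect, bits, &opte) == 0)
 *			pp->pp_attrs |= opte;
 *
 * pmap_clear_attrs() additionally backs off and restarts the scan when
 * pmap_sync_pv() loses a race and returns EAGAIN.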
3547 */ 3548 3549 /* 3550 * pmap_test_attrs: test a page's attributes 3551 */ 3552 3553 bool 3554 pmap_test_attrs(struct vm_page *pg, unsigned testbits) 3555 { 3556 struct pmap_page *pp; 3557 struct pv_pte *pvpte; 3558 pt_entry_t expect; 3559 u_int result; 3560 3561 KASSERT(uvm_page_locked_p(pg)); 3562 3563 pp = VM_PAGE_TO_PP(pg); 3564 if ((pp->pp_attrs & testbits) != 0) { 3565 return true; 3566 } 3567 expect = pmap_pa2pte(VM_PAGE_TO_PHYS(pg)) | PG_V; 3568 kpreempt_disable(); 3569 for (pvpte = pv_pte_first(pp); pvpte; pvpte = pv_pte_next(pp, pvpte)) { 3570 pt_entry_t opte; 3571 int error; 3572 3573 if ((pp->pp_attrs & testbits) != 0) { 3574 break; 3575 } 3576 error = pmap_sync_pv(pvpte, expect, 0, &opte); 3577 if (error == 0) { 3578 pp->pp_attrs |= opte; 3579 } 3580 } 3581 result = pp->pp_attrs & testbits; 3582 kpreempt_enable(); 3583 3584 /* 3585 * note that we will exit the for loop with a non-null pve if 3586 * we have found the bits we are testing for. 3587 */ 3588 3589 return result != 0; 3590 } 3591 3592 /* 3593 * pmap_clear_attrs: clear the specified attribute for a page. 3594 * 3595 * => we return true if we cleared one of the bits we were asked to 3596 */ 3597 3598 bool 3599 pmap_clear_attrs(struct vm_page *pg, unsigned clearbits) 3600 { 3601 struct pmap_page *pp; 3602 struct pv_pte *pvpte; 3603 u_int result; 3604 pt_entry_t expect; 3605 int count; 3606 3607 KASSERT(uvm_page_locked_p(pg)); 3608 3609 pp = VM_PAGE_TO_PP(pg); 3610 expect = pmap_pa2pte(VM_PAGE_TO_PHYS(pg)) | PG_V; 3611 count = SPINLOCK_BACKOFF_MIN; 3612 kpreempt_disable(); 3613 startover: 3614 for (pvpte = pv_pte_first(pp); pvpte; pvpte = pv_pte_next(pp, pvpte)) { 3615 pt_entry_t opte; 3616 int error; 3617 3618 error = pmap_sync_pv(pvpte, expect, clearbits, &opte); 3619 if (error == EAGAIN) { 3620 int hold_count; 3621 KERNEL_UNLOCK_ALL(curlwp, &hold_count); 3622 SPINLOCK_BACKOFF(count); 3623 KERNEL_LOCK(hold_count, curlwp); 3624 goto startover; 3625 } 3626 pp->pp_attrs |= opte; 3627 } 3628 result = pp->pp_attrs & clearbits; 3629 pp->pp_attrs &= ~clearbits; 3630 kpreempt_enable(); 3631 3632 return result != 0; 3633 } 3634 3635 3636 /* 3637 * p m a p p r o t e c t i o n f u n c t i o n s 3638 */ 3639 3640 /* 3641 * pmap_page_protect: change the protection of all recorded mappings 3642 * of a managed page 3643 * 3644 * => NOTE: this is an inline function in pmap.h 3645 */ 3646 3647 /* see pmap.h */ 3648 3649 /* 3650 * pmap_protect: set the protection in of the pages in a pmap 3651 * 3652 * => NOTE: this is an inline function in pmap.h 3653 */ 3654 3655 /* see pmap.h */ 3656 3657 /* 3658 * pmap_write_protect: write-protect pages in a pmap 3659 */ 3660 3661 void 3662 pmap_write_protect(struct pmap *pmap, vaddr_t sva, vaddr_t eva, vm_prot_t prot) 3663 { 3664 int i; 3665 pt_entry_t *ptes, *epte; 3666 pt_entry_t *spte; 3667 pd_entry_t * const *pdes; 3668 vaddr_t blockend, va; 3669 pt_entry_t opte; 3670 struct pmap *pmap2; 3671 3672 KASSERT(curlwp->l_md.md_gc_pmap != pmap); 3673 3674 kpreempt_disable(); 3675 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); /* locks pmap */ 3676 3677 /* should be ok, but just in case ... */ 3678 sva &= PG_FRAME; 3679 eva &= PG_FRAME; 3680 3681 for (va = sva ; va < eva ; va = blockend) { 3682 3683 blockend = (va & L2_FRAME) + NBPD_L2; 3684 if (blockend > eva) 3685 blockend = eva; 3686 3687 /* 3688 * XXXCDC: our PTE mappings should never be write-protected! 3689 * 3690 * long term solution is to move the PTEs out of user 3691 * address space. and into kernel address space (up 3692 * with APTE). 
then we can set VM_MAXUSER_ADDRESS to 3693 * be VM_MAX_ADDRESS. 3694 */ 3695 3696 /* XXXCDC: ugly hack to avoid freeing PDP here */ 3697 for (i = 0; i < PDP_SIZE; i++) { 3698 if (pl_i(va, PTP_LEVELS) == PDIR_SLOT_PTE+i) 3699 continue; 3700 } 3701 3702 /* empty block? */ 3703 if (!pmap_pdes_valid(va, pdes, NULL)) 3704 continue; 3705 3706 #ifdef DIAGNOSTIC 3707 if (va >= VM_MAXUSER_ADDRESS && 3708 va < VM_MAX_ADDRESS) 3709 panic("pmap_write_protect: PTE space"); 3710 #endif 3711 3712 spte = &ptes[pl1_i(va)]; 3713 epte = &ptes[pl1_i(blockend)]; 3714 3715 for (/*null */; spte < epte ; spte++) { 3716 pt_entry_t npte; 3717 3718 do { 3719 opte = *spte; 3720 if ((~opte & (PG_RW | PG_V)) != 0) { 3721 goto next; 3722 } 3723 npte = opte & ~PG_RW; 3724 } while (pmap_pte_cas(spte, opte, npte) != opte); 3725 if ((opte & PG_M) != 0) { 3726 vaddr_t tva; 3727 3728 tva = x86_ptob(spte - ptes); 3729 pmap_tlb_shootdown(pmap, tva, opte, 3730 TLBSHOOT_WRITE_PROTECT); 3731 } 3732 next:; 3733 } 3734 } 3735 3736 pmap_unmap_ptes(pmap, pmap2); /* unlocks pmap */ 3737 kpreempt_enable(); 3738 } 3739 3740 /* 3741 * end of protection functions 3742 */ 3743 3744 /* 3745 * pmap_unwire: clear the wired bit in the PTE 3746 * 3747 * => mapping should already be in map 3748 */ 3749 3750 void 3751 pmap_unwire(struct pmap *pmap, vaddr_t va) 3752 { 3753 pt_entry_t *ptes; 3754 pd_entry_t * const *pdes; 3755 struct pmap *pmap2; 3756 3757 kpreempt_disable(); 3758 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); /* locks pmap */ 3759 3760 if (pmap_pdes_valid(va, pdes, NULL)) { 3761 pt_entry_t *ptep = &ptes[pl1_i(va)]; 3762 pt_entry_t opte = *ptep; 3763 3764 #ifdef DIAGNOSTIC 3765 if (!pmap_valid_entry(opte)) 3766 panic("pmap_unwire: invalid (unmapped) va 0x%lx", va); 3767 #endif 3768 if ((opte & PG_W) != 0) { 3769 pt_entry_t npte = opte & ~PG_W; 3770 3771 opte = pmap_pte_testset(ptep, npte); 3772 pmap_stats_update_bypte(pmap, npte, opte); 3773 } 3774 #ifdef DIAGNOSTIC 3775 else { 3776 printf("pmap_unwire: wiring for pmap %p va 0x%lx " 3777 "didn't change!\n", pmap, va); 3778 } 3779 #endif 3780 pmap_unmap_ptes(pmap, pmap2); /* unlocks map */ 3781 } 3782 #ifdef DIAGNOSTIC 3783 else { 3784 panic("pmap_unwire: invalid PDE"); 3785 } 3786 #endif 3787 kpreempt_enable(); 3788 } 3789 3790 /* 3791 * pmap_copy: copy mappings from one pmap to another 3792 * 3793 * => optional function 3794 * void pmap_copy(dst_pmap, src_pmap, dst_addr, len, src_addr) 3795 */ 3796 3797 /* 3798 * defined as macro in pmap.h 3799 */ 3800 3801 __weak_alias(pmap_enter, pmap_enter_default); 3802 3803 int 3804 pmap_enter_default(pmap_t pmap, vaddr_t va, paddr_t pa, vm_prot_t prot, 3805 u_int flags) 3806 { 3807 return pmap_enter_ma(pmap, va, pa, pa, prot, flags, 0); 3808 } 3809 3810 /* 3811 * pmap_enter: enter a mapping into a pmap 3812 * 3813 * => must be done "now" ... 
no lazy-evaluation 3814 * => we set pmap => pv_head locking 3815 */ 3816 int 3817 pmap_enter_ma(struct pmap *pmap, vaddr_t va, paddr_t ma, paddr_t pa, 3818 vm_prot_t prot, u_int flags, int domid) 3819 { 3820 pt_entry_t *ptes, opte, npte; 3821 pt_entry_t *ptep; 3822 pd_entry_t * const *pdes; 3823 struct vm_page *ptp, *pg; 3824 struct pmap_page *new_pp; 3825 struct pmap_page *old_pp; 3826 struct pv_entry *old_pve = NULL; 3827 struct pv_entry *new_pve; 3828 struct pv_entry *new_pve2; 3829 int error; 3830 bool wired = (flags & PMAP_WIRED) != 0; 3831 struct pmap *pmap2; 3832 3833 KASSERT(pmap_initialized); 3834 KASSERT(curlwp->l_md.md_gc_pmap != pmap); 3835 3836 #ifdef DIAGNOSTIC 3837 /* sanity check: totally out of range? */ 3838 if (va >= VM_MAX_KERNEL_ADDRESS) 3839 panic("pmap_enter: too big"); 3840 3841 if (va == (vaddr_t) PDP_BASE || va == (vaddr_t) APDP_BASE) 3842 panic("pmap_enter: trying to map over PDP/APDP!"); 3843 3844 /* sanity check: kernel PTPs should already have been pre-allocated */ 3845 if (va >= VM_MIN_KERNEL_ADDRESS && 3846 !pmap_valid_entry(pmap->pm_pdir[pl_i(va, PTP_LEVELS)])) 3847 panic("pmap_enter: missing kernel PTP for va %lx!", va); 3848 #endif /* DIAGNOSTIC */ 3849 #ifdef XEN 3850 KASSERT(domid == DOMID_SELF || pa == 0); 3851 #endif /* XEN */ 3852 3853 npte = ma | protection_codes[prot] | PG_V; 3854 npte |= pmap_pat_flags(flags); 3855 if (wired) 3856 npte |= PG_W; 3857 if (va < VM_MAXUSER_ADDRESS) 3858 npte |= PG_u; 3859 else if (va < VM_MAX_ADDRESS) 3860 npte |= (PG_u | PG_RW); /* XXXCDC: no longer needed? */ 3861 else 3862 npte |= PG_k; 3863 if (pmap == pmap_kernel()) 3864 npte |= pmap_pg_g; 3865 if (flags & VM_PROT_ALL) { 3866 npte |= PG_U; 3867 if (flags & VM_PROT_WRITE) { 3868 KASSERT((npte & PG_RW) != 0); 3869 npte |= PG_M; 3870 } 3871 } 3872 3873 #ifdef XEN 3874 if (domid != DOMID_SELF) 3875 pg = NULL; 3876 else 3877 #endif 3878 pg = PHYS_TO_VM_PAGE(pa); 3879 if (pg != NULL) { 3880 /* This is a managed page */ 3881 npte |= PG_PVLIST; 3882 new_pp = VM_PAGE_TO_PP(pg); 3883 } else { 3884 new_pp = NULL; 3885 } 3886 3887 /* get pves. */ 3888 new_pve = pool_cache_get(&pmap_pv_cache, PR_NOWAIT); 3889 new_pve2 = pool_cache_get(&pmap_pv_cache, PR_NOWAIT); 3890 if (new_pve == NULL || new_pve2 == NULL) { 3891 if (flags & PMAP_CANFAIL) { 3892 error = ENOMEM; 3893 goto out2; 3894 } 3895 panic("pmap_enter: pve allocation failed"); 3896 } 3897 3898 kpreempt_disable(); 3899 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); /* locks pmap */ 3900 if (pmap == pmap_kernel()) { 3901 ptp = NULL; 3902 } else { 3903 ptp = pmap_get_ptp(pmap, va, pdes); 3904 if (ptp == NULL) { 3905 pmap_unmap_ptes(pmap, pmap2); 3906 if (flags & PMAP_CANFAIL) { 3907 error = ENOMEM; 3908 goto out; 3909 } 3910 panic("pmap_enter: get ptp failed"); 3911 } 3912 } 3913 3914 /* 3915 * update the pte. 3916 */ 3917 3918 ptep = &ptes[pl1_i(va)]; 3919 do { 3920 opte = *ptep; 3921 3922 /* 3923 * if the same page, inherit PG_U and PG_M. 
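 *
 * Example (illustrative): re-entering an existing mapping with new
 * protection keeps whatever referenced/modified state the hardware
 * has already recorded.  ((opte ^ npte) & (PG_FRAME | PG_V)) == 0 is
 * the "same physical page, both valid" test, and is used again below
 * to skip the pv_entry work entirely in that case.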
3924 */ 3925 if (((opte ^ npte) & (PG_FRAME | PG_V)) == 0) { 3926 npte |= opte & (PG_U | PG_M); 3927 } 3928 #if defined(XEN) 3929 if (domid != DOMID_SELF) { 3930 /* pmap_pte_cas with error handling */ 3931 int s = splvm(); 3932 if (opte != *ptep) { 3933 splx(s); 3934 continue; 3935 } 3936 error = xpq_update_foreign( 3937 vtomach((vaddr_t)ptep), npte, domid); 3938 splx(s); 3939 if (error) { 3940 if (ptp != NULL && ptp->wire_count <= 1) { 3941 pmap_free_ptp(pmap, ptp, va, ptes, pdes); 3942 } 3943 pmap_unmap_ptes(pmap, pmap2); 3944 goto out; 3945 } 3946 break; 3947 } 3948 #endif /* defined(XEN) */ 3949 } while (pmap_pte_cas(ptep, opte, npte) != opte); 3950 3951 /* 3952 * update statistics and PTP's reference count. 3953 */ 3954 3955 pmap_stats_update_bypte(pmap, npte, opte); 3956 if (ptp != NULL && !pmap_valid_entry(opte)) { 3957 ptp->wire_count++; 3958 } 3959 KASSERT(ptp == NULL || ptp->wire_count > 1); 3960 3961 /* 3962 * if the same page, we can skip pv_entry handling. 3963 */ 3964 3965 if (((opte ^ npte) & (PG_FRAME | PG_V)) == 0) { 3966 KASSERT(((opte ^ npte) & PG_PVLIST) == 0); 3967 goto same_pa; 3968 } 3969 3970 /* 3971 * if old page is managed, remove pv_entry from its list. 3972 */ 3973 3974 if ((~opte & (PG_V | PG_PVLIST)) == 0) { 3975 pg = PHYS_TO_VM_PAGE(pmap_pte2pa(opte)); 3976 3977 KASSERTMSG(pg != NULL, ("pmap_enter: PG_PVLIST mapping with " 3978 "unmanaged page pa = 0x%" PRIx64 " (0x%" PRIx64 ")", 3979 (int64_t)pa, (int64_t)atop(pa))); 3980 3981 KASSERT(uvm_page_locked_p(pg)); 3982 3983 old_pp = VM_PAGE_TO_PP(pg); 3984 old_pve = pmap_remove_pv(old_pp, ptp, va); 3985 old_pp->pp_attrs |= opte; 3986 } 3987 3988 /* 3989 * if new page is managed, insert pv_entry into its list. 3990 */ 3991 3992 if (new_pp) { 3993 new_pve = pmap_enter_pv(new_pp, new_pve, &new_pve2, ptp, va); 3994 } 3995 3996 same_pa: 3997 pmap_unmap_ptes(pmap, pmap2); 3998 3999 /* 4000 * shootdown tlb if necessary. 4001 */ 4002 4003 if ((~opte & (PG_V | PG_U)) == 0 && 4004 ((opte ^ npte) & (PG_FRAME | PG_RW)) != 0) { 4005 pmap_tlb_shootdown(pmap, va, opte, TLBSHOOT_ENTER); 4006 } 4007 4008 error = 0; 4009 out: 4010 kpreempt_enable(); 4011 out2: 4012 if (old_pve != NULL) { 4013 pool_cache_put(&pmap_pv_cache, old_pve); 4014 } 4015 if (new_pve != NULL) { 4016 pool_cache_put(&pmap_pv_cache, new_pve); 4017 } 4018 if (new_pve2 != NULL) { 4019 pool_cache_put(&pmap_pv_cache, new_pve2); 4020 } 4021 4022 return error; 4023 } 4024 4025 static bool 4026 pmap_get_physpage(vaddr_t va, int level, paddr_t *paddrp) 4027 { 4028 struct vm_page *ptp; 4029 struct pmap *kpm = pmap_kernel(); 4030 4031 if (uvm.page_init_done == false) { 4032 /* 4033 * we're growing the kernel pmap early (from 4034 * uvm_pageboot_alloc()). this case must be 4035 * handled a little differently. 
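 *
 * Before uvm_page_init() has run there are no vm_page structures to
 * allocate from, so take a raw physical page with uvm_page_physget()
 * and zero it by hand through the early_zero_pte/early_zerop window
 * instead of relying on uvm_pagealloc(..., UVM_PGA_ZERO).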
4036 */ 4037 4038 if (uvm_page_physget(paddrp) == false) 4039 panic("pmap_get_physpage: out of memory"); 4040 kpreempt_disable(); 4041 pmap_pte_set(early_zero_pte, 4042 pmap_pa2pte(*paddrp) | PG_V | PG_RW | PG_k); 4043 pmap_pte_flush(); 4044 pmap_update_pg((vaddr_t)early_zerop); 4045 memset(early_zerop, 0, PAGE_SIZE); 4046 #if defined(DIAGNOSTIC) || defined (XEN) 4047 pmap_pte_set(early_zero_pte, 0); 4048 pmap_pte_flush(); 4049 #endif /* defined(DIAGNOSTIC) */ 4050 kpreempt_enable(); 4051 } else { 4052 /* XXX */ 4053 ptp = uvm_pagealloc(NULL, 0, NULL, 4054 UVM_PGA_USERESERVE|UVM_PGA_ZERO); 4055 if (ptp == NULL) 4056 panic("pmap_get_physpage: out of memory"); 4057 ptp->flags &= ~PG_BUSY; 4058 ptp->wire_count = 1; 4059 *paddrp = VM_PAGE_TO_PHYS(ptp); 4060 } 4061 pmap_stats_update(kpm, 1, 0); 4062 return true; 4063 } 4064 4065 /* 4066 * Allocate the amount of specified ptps for a ptp level, and populate 4067 * all levels below accordingly, mapping virtual addresses starting at 4068 * kva. 4069 * 4070 * Used by pmap_growkernel. 4071 */ 4072 static void 4073 pmap_alloc_level(pd_entry_t * const *pdes, vaddr_t kva, int lvl, 4074 long *needed_ptps) 4075 { 4076 unsigned long i; 4077 vaddr_t va; 4078 paddr_t pa; 4079 unsigned long index, endindex; 4080 int level; 4081 pd_entry_t *pdep; 4082 #ifdef XEN 4083 int s = splvm(); /* protect xpq_* */ 4084 #endif 4085 4086 for (level = lvl; level > 1; level--) { 4087 if (level == PTP_LEVELS) 4088 pdep = pmap_kernel()->pm_pdir; 4089 else 4090 pdep = pdes[level - 2]; 4091 va = kva; 4092 index = pl_i_roundup(kva, level); 4093 endindex = index + needed_ptps[level - 1] - 1; 4094 4095 4096 for (i = index; i <= endindex; i++) { 4097 KASSERT(!pmap_valid_entry(pdep[i])); 4098 pmap_get_physpage(va, level - 1, &pa); 4099 #ifdef XEN 4100 xpq_queue_pte_update((level == PTP_LEVELS) ? 4101 xpmap_ptom(pmap_pdirpa(pmap_kernel(), i)) : 4102 xpmap_ptetomach(&pdep[i]), 4103 pmap_pa2pte(pa) | PG_k | PG_V | PG_RW); 4104 #ifdef PAE 4105 if (level == PTP_LEVELS && i > L2_SLOT_KERN) { 4106 /* update real kernel PD too */ 4107 xpq_queue_pte_update( 4108 xpmap_ptetomach(&pmap_kl2pd[l2tol2(i)]), 4109 pmap_pa2pte(pa) | PG_k | PG_V | PG_RW); 4110 } 4111 #endif 4112 #else /* XEN */ 4113 pdep[i] = pmap_pa2pte(pa) | PG_k | PG_V | PG_RW; 4114 #endif /* XEN */ 4115 KASSERT(level != PTP_LEVELS || nkptp[level - 1] + 4116 pl_i(VM_MIN_KERNEL_ADDRESS, level) == i); 4117 nkptp[level - 1]++; 4118 va += nbpd[level - 1]; 4119 } 4120 pmap_pte_flush(); 4121 } 4122 #ifdef XEN 4123 splx(s); 4124 #endif 4125 } 4126 4127 /* 4128 * pmap_growkernel: increase usage of KVM space 4129 * 4130 * => we allocate new PTPs for the kernel and install them in all 4131 * the pmaps on the system. 4132 */ 4133 4134 vaddr_t 4135 pmap_growkernel(vaddr_t maxkvaddr) 4136 { 4137 struct pmap *kpm = pmap_kernel(); 4138 #if !defined(XEN) || !defined(__x86_64__) 4139 struct pmap *pm; 4140 #endif 4141 int s, i; 4142 long needed_kptp[PTP_LEVELS], target_nptp, old; 4143 bool invalidate = false; 4144 4145 s = splvm(); /* to be safe */ 4146 mutex_enter(kpm->pm_lock); 4147 4148 if (maxkvaddr <= pmap_maxkvaddr) { 4149 mutex_exit(kpm->pm_lock); 4150 splx(s); 4151 return pmap_maxkvaddr; 4152 } 4153 4154 maxkvaddr = x86_round_pdr(maxkvaddr); 4155 old = nkptp[PTP_LEVELS - 1]; 4156 /* 4157 * This loop could be optimized more, but pmap_growkernel() 4158 * is called infrequently. 
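 *
 * What the loop computes (restated for clarity): for each level i,
 *
 *	needed_kptp[i] = pl_i_roundup(maxkvaddr, i + 1) -
 *	    pl_i_roundup(VM_MIN_KERNEL_ADDRESS, i + 1) - nkptp[i];
 *
 * i.e. how many additional kernel PTPs that level needs beyond the
 * nkptp[i] already in place.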
4159 */ 4160 for (i = PTP_LEVELS - 1; i >= 1; i--) { 4161 target_nptp = pl_i_roundup(maxkvaddr, i + 1) - 4162 pl_i_roundup(VM_MIN_KERNEL_ADDRESS, i + 1); 4163 /* 4164 * XXX only need to check toplevel. 4165 */ 4166 if (target_nptp > nkptpmax[i]) 4167 panic("out of KVA space"); 4168 KASSERT(target_nptp >= nkptp[i]); 4169 needed_kptp[i] = target_nptp - nkptp[i]; 4170 } 4171 4172 pmap_alloc_level(normal_pdes, pmap_maxkvaddr, PTP_LEVELS, needed_kptp); 4173 4174 /* 4175 * If the number of top level entries changed, update all 4176 * pmaps. 4177 */ 4178 if (needed_kptp[PTP_LEVELS - 1] != 0) { 4179 #ifdef XEN 4180 #ifdef __x86_64__ 4181 /* nothing, kernel entries are never entered in user pmap */ 4182 #else /* __x86_64__ */ 4183 mutex_enter(&pmaps_lock); 4184 LIST_FOREACH(pm, &pmaps, pm_list) { 4185 int pdkidx; 4186 for (pdkidx = PDIR_SLOT_KERN + old; 4187 pdkidx < PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1]; 4188 pdkidx++) { 4189 xpq_queue_pte_update( 4190 xpmap_ptom(pmap_pdirpa(pm, pdkidx)), 4191 kpm->pm_pdir[pdkidx]); 4192 } 4193 xpq_flush_queue(); 4194 } 4195 mutex_exit(&pmaps_lock); 4196 #endif /* __x86_64__ */ 4197 #else /* XEN */ 4198 unsigned newpdes; 4199 newpdes = nkptp[PTP_LEVELS - 1] - old; 4200 mutex_enter(&pmaps_lock); 4201 LIST_FOREACH(pm, &pmaps, pm_list) { 4202 memcpy(&pm->pm_pdir[PDIR_SLOT_KERN + old], 4203 &kpm->pm_pdir[PDIR_SLOT_KERN + old], 4204 newpdes * sizeof (pd_entry_t)); 4205 } 4206 mutex_exit(&pmaps_lock); 4207 #endif 4208 invalidate = true; 4209 } 4210 pmap_maxkvaddr = maxkvaddr; 4211 mutex_exit(kpm->pm_lock); 4212 splx(s); 4213 4214 if (invalidate) { 4215 /* Invalidate the PDP cache. */ 4216 pool_cache_invalidate(&pmap_pdp_cache); 4217 } 4218 4219 return maxkvaddr; 4220 } 4221 4222 #ifdef DEBUG 4223 void pmap_dump(struct pmap *, vaddr_t, vaddr_t); 4224 4225 /* 4226 * pmap_dump: dump all the mappings from a pmap 4227 * 4228 * => caller should not be holding any pmap locks 4229 */ 4230 4231 void 4232 pmap_dump(struct pmap *pmap, vaddr_t sva, vaddr_t eva) 4233 { 4234 pt_entry_t *ptes, *pte; 4235 pd_entry_t * const *pdes; 4236 struct pmap *pmap2; 4237 vaddr_t blkendva; 4238 4239 /* 4240 * if end is out of range truncate. 4241 * if (end == start) update to max. 4242 */ 4243 4244 if (eva > VM_MAXUSER_ADDRESS || eva <= sva) 4245 eva = VM_MAXUSER_ADDRESS; 4246 4247 /* 4248 * we lock in the pmap => pv_head direction 4249 */ 4250 4251 kpreempt_disable(); 4252 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); /* locks pmap */ 4253 4254 /* 4255 * dumping a range of pages: we dump in PTP sized blocks (4MB) 4256 */ 4257 4258 for (/* null */ ; sva < eva ; sva = blkendva) { 4259 4260 /* determine range of block */ 4261 blkendva = x86_round_pdr(sva+1); 4262 if (blkendva > eva) 4263 blkendva = eva; 4264 4265 /* valid block? */ 4266 if (!pmap_pdes_valid(sva, pdes, NULL)) 4267 continue; 4268 4269 pte = &ptes[pl1_i(sva)]; 4270 for (/* null */; sva < blkendva ; sva += PAGE_SIZE, pte++) { 4271 if (!pmap_valid_entry(*pte)) 4272 continue; 4273 printf("va %#" PRIxVADDR " -> pa %#" PRIxPADDR 4274 " (pte=%#" PRIxPADDR ")\n", 4275 sva, (paddr_t)pmap_pte2pa(*pte), (paddr_t)*pte); 4276 } 4277 } 4278 pmap_unmap_ptes(pmap, pmap2); 4279 kpreempt_enable(); 4280 } 4281 #endif 4282 4283 /* 4284 * pmap_update: process deferred invalidations and frees. 4285 */ 4286 4287 void 4288 pmap_update(struct pmap *pmap) 4289 { 4290 struct vm_page *empty_ptps; 4291 lwp_t *l = curlwp; 4292 4293 /* 4294 * If we have torn down this pmap, invalidate non-global TLB 4295 * entries on any processors using it. 
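 *
 * Typical caller pattern (illustrative): batch the mapping changes and
 * publish them once, e.g.
 *
 *	pmap_remove(pmap, sva, eva);
 *	pmap_update(pmap);
 *
 * so that the pending shootdowns and deferred PTP frees handled below
 * are processed once per batch rather than once per page.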
4296 */ 4297 KPREEMPT_DISABLE(l); 4298 if (__predict_false(l->l_md.md_gc_pmap == pmap)) { 4299 l->l_md.md_gc_pmap = NULL; 4300 pmap_tlb_shootdown(pmap, (vaddr_t)-1LL, 0, TLBSHOOT_UPDATE); 4301 } 4302 /* 4303 * Initiate any pending TLB shootdowns. Wait for them to 4304 * complete before returning control to the caller. 4305 */ 4306 pmap_tlb_shootnow(); 4307 KPREEMPT_ENABLE(l); 4308 4309 /* 4310 * Now that shootdowns are complete, process deferred frees, 4311 * but not from interrupt context. 4312 */ 4313 if (l->l_md.md_gc_ptp != NULL) { 4314 KASSERT((l->l_pflag & LP_INTR) == 0); 4315 if (cpu_intr_p()) { 4316 return; 4317 } 4318 empty_ptps = l->l_md.md_gc_ptp; 4319 l->l_md.md_gc_ptp = NULL; 4320 pmap_free_ptps(empty_ptps); 4321 } 4322 } 4323 4324 #if PTP_LEVELS > 4 4325 #error "Unsupported number of page table mappings" 4326 #endif 4327 4328 paddr_t 4329 pmap_init_tmp_pgtbl(paddr_t pg) 4330 { 4331 static bool maps_loaded; 4332 static const paddr_t x86_tmp_pml_paddr[] = { 4333 4 * PAGE_SIZE, 4334 5 * PAGE_SIZE, 4335 6 * PAGE_SIZE, 4336 7 * PAGE_SIZE 4337 }; 4338 static vaddr_t x86_tmp_pml_vaddr[] = { 0, 0, 0, 0 }; 4339 4340 pd_entry_t *tmp_pml, *kernel_pml; 4341 4342 int level; 4343 4344 if (!maps_loaded) { 4345 for (level = 0; level < PTP_LEVELS; ++level) { 4346 x86_tmp_pml_vaddr[level] = 4347 uvm_km_alloc(kernel_map, PAGE_SIZE, 0, 4348 UVM_KMF_VAONLY); 4349 4350 if (x86_tmp_pml_vaddr[level] == 0) 4351 panic("mapping of real mode PML failed\n"); 4352 pmap_kenter_pa(x86_tmp_pml_vaddr[level], 4353 x86_tmp_pml_paddr[level], 4354 VM_PROT_READ | VM_PROT_WRITE, 0); 4355 pmap_update(pmap_kernel()); 4356 } 4357 maps_loaded = true; 4358 } 4359 4360 /* Zero levels 1-3 */ 4361 for (level = 0; level < PTP_LEVELS - 1; ++level) { 4362 tmp_pml = (void *)x86_tmp_pml_vaddr[level]; 4363 memset(tmp_pml, 0, PAGE_SIZE); 4364 } 4365 4366 /* Copy PML4 */ 4367 kernel_pml = pmap_kernel()->pm_pdir; 4368 tmp_pml = (void *)x86_tmp_pml_vaddr[PTP_LEVELS - 1]; 4369 memcpy(tmp_pml, kernel_pml, PAGE_SIZE); 4370 4371 #ifdef PAE 4372 /* 4373 * Use the last 4 entries of the L2 page as L3 PD entries. These 4374 * last entries are unlikely to be used for temporary mappings. 4375 * 508: maps 0->1GB (userland) 4376 * 509: unused 4377 * 510: unused 4378 * 511: maps 3->4GB (kernel) 4379 */ 4380 tmp_pml[508] = x86_tmp_pml_paddr[PTP_LEVELS - 1] | PG_V; 4381 tmp_pml[509] = 0; 4382 tmp_pml[510] = 0; 4383 tmp_pml[511] = pmap_pdirpa(pmap_kernel(),PDIR_SLOT_KERN) | PG_V; 4384 #endif 4385 4386 for (level = PTP_LEVELS - 1; level > 0; --level) { 4387 tmp_pml = (void *)x86_tmp_pml_vaddr[level]; 4388 4389 tmp_pml[pl_i(pg, level + 1)] = 4390 (x86_tmp_pml_paddr[level - 1] & PG_FRAME) | PG_RW | PG_V; 4391 } 4392 4393 tmp_pml = (void *)x86_tmp_pml_vaddr[0]; 4394 tmp_pml[pl_i(pg, 1)] = (pg & PG_FRAME) | PG_RW | PG_V; 4395 4396 #ifdef PAE 4397 /* Return the PA of the L3 page (entry 508 of the L2 page) */ 4398 return x86_tmp_pml_paddr[PTP_LEVELS - 1] + 508 * sizeof(pd_entry_t); 4399 #endif 4400 4401 return x86_tmp_pml_paddr[PTP_LEVELS - 1]; 4402 } 4403 4404 u_int 4405 x86_mmap_flags(paddr_t mdpgno) 4406 { 4407 u_int nflag = (mdpgno >> X86_MMAP_FLAG_SHIFT) & X86_MMAP_FLAG_MASK; 4408 u_int pflag = 0; 4409 4410 if (nflag & X86_MMAP_FLAG_PREFETCH) 4411 pflag |= PMAP_WRITE_COMBINE; 4412 4413 return pflag; 4414 } 4415
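/*
 * Illustrative example (an assumption about the driver side, not code
 * from the tree): a driver that wants its mmap()ed pages mapped
 * write-combining would encode X86_MMAP_FLAG_PREFETCH into the
 * machine-dependent page number it returns, roughly
 *
 *	mdpgno = atop(pa) |
 *	    ((paddr_t)X86_MMAP_FLAG_PREFETCH << X86_MMAP_FLAG_SHIFT);
 *
 * which x86_mmap_flags() above then translates into PMAP_WRITE_COMBINE
 * for the eventual pmap_enter() of that page.
 */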