1 /* $NetBSD: pmap.c,v 1.141 2011/11/08 17:16:52 cherry Exp $ */ 2 3 /*- 4 * Copyright (c) 2008, 2010 The NetBSD Foundation, Inc. 5 * All rights reserved. 6 * 7 * This code is derived from software contributed to The NetBSD Foundation 8 * by Andrew Doran. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 29 * POSSIBILITY OF SUCH DAMAGE. 30 */ 31 32 /* 33 * Copyright (c) 2007 Manuel Bouyer. 34 * 35 * Redistribution and use in source and binary forms, with or without 36 * modification, are permitted provided that the following conditions 37 * are met: 38 * 1. Redistributions of source code must retain the above copyright 39 * notice, this list of conditions and the following disclaimer. 40 * 2. Redistributions in binary form must reproduce the above copyright 41 * notice, this list of conditions and the following disclaimer in the 42 * documentation and/or other materials provided with the distribution. 43 * 44 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 45 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 46 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 47 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 48 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 49 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 50 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 51 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 52 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 53 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 54 * 55 */ 56 57 /* 58 * Copyright (c) 2006 Mathieu Ropert <mro@adviseo.fr> 59 * 60 * Permission to use, copy, modify, and distribute this software for any 61 * purpose with or without fee is hereby granted, provided that the above 62 * copyright notice and this permission notice appear in all copies. 63 * 64 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 65 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 66 * MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 67 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 68 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 69 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 70 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 71 */ 72 73 /* 74 * Copyright (c) 1997 Charles D. Cranor and Washington University. 75 * All rights reserved. 76 * 77 * Redistribution and use in source and binary forms, with or without 78 * modification, are permitted provided that the following conditions 79 * are met: 80 * 1. Redistributions of source code must retain the above copyright 81 * notice, this list of conditions and the following disclaimer. 82 * 2. Redistributions in binary form must reproduce the above copyright 83 * notice, this list of conditions and the following disclaimer in the 84 * documentation and/or other materials provided with the distribution. 85 * 86 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 87 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 88 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 89 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 90 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 91 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 92 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 93 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 94 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 95 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 96 */ 97 98 /* 99 * Copyright 2001 (c) Wasabi Systems, Inc. 100 * All rights reserved. 101 * 102 * Written by Frank van der Linden for Wasabi Systems, Inc. 103 * 104 * Redistribution and use in source and binary forms, with or without 105 * modification, are permitted provided that the following conditions 106 * are met: 107 * 1. Redistributions of source code must retain the above copyright 108 * notice, this list of conditions and the following disclaimer. 109 * 2. Redistributions in binary form must reproduce the above copyright 110 * notice, this list of conditions and the following disclaimer in the 111 * documentation and/or other materials provided with the distribution. 112 * 3. All advertising materials mentioning features or use of this software 113 * must display the following acknowledgement: 114 * This product includes software developed for the NetBSD Project by 115 * Wasabi Systems, Inc. 116 * 4. The name of Wasabi Systems, Inc. may not be used to endorse 117 * or promote products derived from this software without specific prior 118 * written permission. 119 * 120 * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND 121 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 122 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 123 * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL WASABI SYSTEMS, INC 124 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 125 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 126 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 127 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 128 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 129 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 130 * POSSIBILITY OF SUCH DAMAGE. 131 */ 132 133 /* 134 * This is the i386 pmap modified and generalized to support x86-64 135 * as well. The idea is to hide the upper N levels of the page tables 136 * inside pmap_get_ptp, pmap_free_ptp and pmap_growkernel. The rest 137 * is mostly untouched, except that it uses some more generalized 138 * macros and interfaces. 139 * 140 * This pmap has been tested on the i386 as well, and it can be easily 141 * adapted to PAE. 142 * 143 * fvdl@wasabisystems.com 18-Jun-2001 144 */ 145 146 /* 147 * pmap.c: i386 pmap module rewrite 148 * Chuck Cranor <chuck@netbsd> 149 * 11-Aug-97 150 * 151 * history of this pmap module: in addition to my own input, i used 152 * the following references for this rewrite of the i386 pmap: 153 * 154 * [1] the NetBSD i386 pmap. this pmap appears to be based on the 155 * BSD hp300 pmap done by Mike Hibler at University of Utah. 156 * it was then ported to the i386 by William Jolitz of UUNET 157 * Technologies, Inc. Then Charles M. Hannum of the NetBSD 158 * project fixed some bugs and provided some speed ups. 159 * 160 * [2] the FreeBSD i386 pmap. this pmap seems to be the 161 * Hibler/Jolitz pmap, as modified for FreeBSD by John S. Dyson 162 * and David Greenman. 163 * 164 * [3] the Mach pmap. this pmap, from CMU, seems to have migrated 165 * between several processors. the VAX version was done by 166 * Avadis Tevanian, Jr., and Michael Wayne Young. the i386 167 * version was done by Lance Berc, Mike Kupfer, Bob Baron, 168 * David Golub, and Richard Draves. the alpha version was 169 * done by Alessandro Forin (CMU/Mach) and Chris Demetriou 170 * (NetBSD/alpha). 171 */ 172 173 #include <sys/cdefs.h> 174 __KERNEL_RCSID(0, "$NetBSD: pmap.c,v 1.141 2011/11/08 17:16:52 cherry Exp $"); 175 176 #include "opt_user_ldt.h" 177 #include "opt_lockdebug.h" 178 #include "opt_multiprocessor.h" 179 #include "opt_xen.h" 180 #if !defined(__x86_64__) 181 #include "opt_kstack_dr0.h" 182 #endif /* !defined(__x86_64__) */ 183 184 #include <sys/param.h> 185 #include <sys/systm.h> 186 #include <sys/proc.h> 187 #include <sys/pool.h> 188 #include <sys/kernel.h> 189 #include <sys/atomic.h> 190 #include <sys/cpu.h> 191 #include <sys/intr.h> 192 #include <sys/xcall.h> 193 194 #include <uvm/uvm.h> 195 196 #include <dev/isa/isareg.h> 197 198 #include <machine/specialreg.h> 199 #include <machine/gdt.h> 200 #include <machine/isa_machdep.h> 201 #include <machine/cpuvar.h> 202 203 #include <x86/pmap.h> 204 #include <x86/pmap_pv.h> 205 206 #include <x86/i82489reg.h> 207 #include <x86/i82489var.h> 208 209 #ifdef XEN 210 #include <xen/xen3-public/xen.h> 211 #include <xen/hypervisor.h> 212 #endif 213 214 /* 215 * general info: 216 * 217 * - for an explanation of how the i386 MMU hardware works see 218 * the comments in <machine/pte.h>. 219 * 220 * - for an explanation of the general memory structure used by 221 * this pmap (including the recursive mapping), see the comments 222 * in <machine/pmap.h>. 223 * 224 * this file contains the code for the "pmap module." 
the module's 225 * job is to manage the hardware's virtual to physical address mappings. 226 * note that there are two levels of mapping in the VM system: 227 * 228 * [1] the upper layer of the VM system uses vm_map's and vm_map_entry's 229 * to map ranges of virtual address space to objects/files. for 230 * example, the vm_map may say: "map VA 0x1000 to 0x22000 read-only 231 * to the file /bin/ls starting at offset zero." note that 232 * the upper layer mapping is not concerned with how individual 233 * vm_pages are mapped. 234 * 235 * [2] the lower layer of the VM system (the pmap) maintains the mappings 236 * from virtual addresses. it is concerned with which vm_page is 237 * mapped where. for example, when you run /bin/ls and start 238 * at page 0x1000 the fault routine may lookup the correct page 239 * of the /bin/ls file and then ask the pmap layer to establish 240 * a mapping for it. 241 * 242 * note that information in the lower layer of the VM system can be 243 * thrown away since it can easily be reconstructed from the info 244 * in the upper layer. 245 * 246 * data structures we use include: 247 * 248 * - struct pmap: describes the address space of one thread 249 * - struct pv_entry: describes one <PMAP,VA> mapping of a PA 250 * - struct pv_head: there is one pv_head per managed page of 251 * physical memory. the pv_head points to a list of pv_entry 252 * structures which describe all the <PMAP,VA> pairs that this 253 * page is mapped in. this is critical for page based operations 254 * such as pmap_page_protect() [change protection on _all_ mappings 255 * of a page] 256 */ 257 258 /* 259 * memory allocation 260 * 261 * - there are three data structures that we must dynamically allocate: 262 * 263 * [A] new process' page directory page (PDP) 264 * - plan 1: done at pmap_create() we use 265 * uvm_km_alloc(kernel_map, PAGE_SIZE) [fka kmem_alloc] to do this 266 * allocation. 267 * 268 * if we are low in free physical memory then we sleep in 269 * uvm_km_alloc -- in this case this is ok since we are creating 270 * a new pmap and should not be holding any locks. 271 * 272 * if the kernel is totally out of virtual space 273 * (i.e. uvm_km_alloc returns NULL), then we panic. 274 * 275 * [B] new page tables pages (PTP) 276 * - call uvm_pagealloc() 277 * => success: zero page, add to pm_pdir 278 * => failure: we are out of free vm_pages, let pmap_enter() 279 * tell UVM about it. 280 * 281 * note: for kernel PTPs, we start with NKPTP of them. as we map 282 * kernel memory (at uvm_map time) we check to see if we've grown 283 * the kernel pmap. if so, we call the optional function 284 * pmap_growkernel() to grow the kernel PTPs in advance. 285 * 286 * [C] pv_entry structures 287 */ 288 289 /* 290 * locking 291 * 292 * we have the following locks that we must contend with: 293 * 294 * mutexes: 295 * 296 * - pmap lock (per pmap, part of uvm_object) 297 * this lock protects the fields in the pmap structure including 298 * the non-kernel PDEs in the PDP, and the PTEs. it also locks 299 * in the alternate PTE space (since that is determined by the 300 * entry in the PDP). 301 * 302 * - pvh_lock (per pv_head) 303 * this lock protects the pv_entry list which is chained off the 304 * pv_head structure for a specific managed PA. it is locked 305 * when traversing the list (e.g. adding/removing mappings, 306 * syncing R/M bits, etc.) 307 * 308 * - pmaps_lock 309 * this lock protects the list of active pmaps (headed by "pmaps"). 310 * we lock it when adding or removing pmaps from this list. 
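 *
 * a minimal sketch (illustrative only) of how the first two locks nest
 * when entering a mapping on a managed page; the hashed pv bucket locks
 * defined further below play the role of the per-pv_head lock:
 *
 *      mutex_enter(pmap->pm_lock);              <- per-pmap lock
 *      mutex_spin_enter(pvhash_lock(hash));     <- pv list lock
 *      ... link or unlink the pv_entry ...
 *      mutex_spin_exit(pvhash_lock(hash));
 *      mutex_exit(pmap->pm_lock);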
311 */ 312 313 const vaddr_t ptp_masks[] = PTP_MASK_INITIALIZER; 314 const int ptp_shifts[] = PTP_SHIFT_INITIALIZER; 315 const long nkptpmax[] = NKPTPMAX_INITIALIZER; 316 const long nbpd[] = NBPD_INITIALIZER; 317 pd_entry_t * const normal_pdes[] = PDES_INITIALIZER; 318 319 long nkptp[] = NKPTP_INITIALIZER; 320 321 struct pmap_head pmaps; 322 kmutex_t pmaps_lock; 323 324 static vaddr_t pmap_maxkvaddr; 325 326 /* 327 * XXX kludge: dummy locking to make KASSERTs in uvm_page.c comfortable. 328 * actual locking is done by pm_lock. 329 */ 330 #if defined(DIAGNOSTIC) 331 #define PMAP_SUBOBJ_LOCK(pm, idx) \ 332 KASSERT(mutex_owned((pm)->pm_lock)); \ 333 if ((idx) != 0) \ 334 mutex_enter((pm)->pm_obj[(idx)].vmobjlock) 335 #define PMAP_SUBOBJ_UNLOCK(pm, idx) \ 336 KASSERT(mutex_owned((pm)->pm_lock)); \ 337 if ((idx) != 0) \ 338 mutex_exit((pm)->pm_obj[(idx)].vmobjlock) 339 #else /* defined(DIAGNOSTIC) */ 340 #define PMAP_SUBOBJ_LOCK(pm, idx) /* nothing */ 341 #define PMAP_SUBOBJ_UNLOCK(pm, idx) /* nothing */ 342 #endif /* defined(DIAGNOSTIC) */ 343 344 /* 345 * Misc. event counters. 346 */ 347 struct evcnt pmap_iobmp_evcnt; 348 struct evcnt pmap_ldt_evcnt; 349 350 /* 351 * PAT 352 */ 353 #define PATENTRY(n, type) (type << ((n) * 8)) 354 #define PAT_UC 0x0ULL 355 #define PAT_WC 0x1ULL 356 #define PAT_WT 0x4ULL 357 #define PAT_WP 0x5ULL 358 #define PAT_WB 0x6ULL 359 #define PAT_UCMINUS 0x7ULL 360 361 static bool cpu_pat_enabled __read_mostly = false; 362 363 /* 364 * global data structures 365 */ 366 367 static struct pmap kernel_pmap_store; /* the kernel's pmap (proc0) */ 368 struct pmap *const kernel_pmap_ptr = &kernel_pmap_store; 369 370 /* 371 * pmap_pg_g: if our processor supports PG_G in the PTE then we 372 * set pmap_pg_g to PG_G (otherwise it is zero). 373 */ 374 375 int pmap_pg_g __read_mostly = 0; 376 377 /* 378 * pmap_largepages: if our processor supports PG_PS and we are 379 * using it, this is set to true. 380 */ 381 382 int pmap_largepages __read_mostly; 383 384 /* 385 * i386 physical memory comes in a big contig chunk with a small 386 * hole toward the front of it... the following two paddr_t's 387 * (shared with machdep.c) describe the physical address space 388 * of this machine. 
389 */ 390 paddr_t avail_start __read_mostly; /* PA of first available physical page */ 391 paddr_t avail_end __read_mostly; /* PA of last available physical page */ 392 393 #ifdef XEN 394 #ifdef __x86_64__ 395 /* Dummy PGD for user cr3, used between pmap_deactivate() and pmap_activate() */ 396 static paddr_t xen_dummy_user_pgd; 397 #endif /* __x86_64__ */ 398 paddr_t pmap_pa_start; /* PA of first physical page for this domain */ 399 paddr_t pmap_pa_end; /* PA of last physical page for this domain */ 400 #endif /* XEN */ 401 402 #define VM_PAGE_TO_PP(pg) (&(pg)->mdpage.mp_pp) 403 404 #define PV_HASH_SIZE 32768 405 #define PV_HASH_LOCK_CNT 32 406 407 struct pv_hash_lock { 408 kmutex_t lock; 409 } __aligned(CACHE_LINE_SIZE) pv_hash_locks[PV_HASH_LOCK_CNT] 410 __aligned(CACHE_LINE_SIZE); 411 412 struct pv_hash_head { 413 SLIST_HEAD(, pv_entry) hh_list; 414 } pv_hash_heads[PV_HASH_SIZE]; 415 416 static u_int 417 pvhash_hash(struct vm_page *ptp, vaddr_t va) 418 { 419 420 return (uintptr_t)ptp / sizeof(*ptp) + (va >> PAGE_SHIFT); 421 } 422 423 static struct pv_hash_head * 424 pvhash_head(u_int hash) 425 { 426 427 return &pv_hash_heads[hash % PV_HASH_SIZE]; 428 } 429 430 static kmutex_t * 431 pvhash_lock(u_int hash) 432 { 433 434 return &pv_hash_locks[hash % PV_HASH_LOCK_CNT].lock; 435 } 436 437 static struct pv_entry * 438 pvhash_remove(struct pv_hash_head *hh, struct vm_page *ptp, vaddr_t va) 439 { 440 struct pv_entry *pve; 441 struct pv_entry *prev; 442 443 prev = NULL; 444 SLIST_FOREACH(pve, &hh->hh_list, pve_hash) { 445 if (pve->pve_pte.pte_ptp == ptp && 446 pve->pve_pte.pte_va == va) { 447 if (prev != NULL) { 448 SLIST_REMOVE_AFTER(prev, pve_hash); 449 } else { 450 SLIST_REMOVE_HEAD(&hh->hh_list, pve_hash); 451 } 452 break; 453 } 454 prev = pve; 455 } 456 return pve; 457 } 458 459 /* 460 * other data structures 461 */ 462 463 static pt_entry_t protection_codes[8] __read_mostly; /* maps MI prot to i386 464 prot code */ 465 static bool pmap_initialized __read_mostly = false; /* pmap_init done yet? */ 466 467 /* 468 * the following two vaddr_t's are used during system startup 469 * to keep track of how much of the kernel's VM space we have used. 470 * once the system is started, the management of the remaining kernel 471 * VM space is turned over to the kernel_map vm_map. 472 */ 473 474 static vaddr_t virtual_avail __read_mostly; /* VA of first free KVA */ 475 static vaddr_t virtual_end __read_mostly; /* VA of last free KVA */ 476 477 /* 478 * pool that pmap structures are allocated from 479 */ 480 481 static struct pool_cache pmap_cache; 482 483 /* 484 * pv_entry cache 485 */ 486 487 static struct pool_cache pmap_pv_cache; 488 489 /* 490 * MULTIPROCESSOR: special VA's/ PTE's are actually allocated inside a 491 * maxcpus*NPTECL array of PTE's, to avoid cache line thrashing 492 * due to false sharing. 
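 *
 * for example (a minimal sketch, assuming NPTECL is 8): the CPU with
 * id 2 uses PTESLEW(csrc_pte, 2) == csrc_pte + 16 and VASLEW(csrcp, 2)
 * == csrcp + 16 * PAGE_SIZE, so each CPU's temporary PTEs land in a
 * cache line of their own.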
493 */ 494 495 #ifdef MULTIPROCESSOR 496 #define PTESLEW(pte, id) ((pte)+(id)*NPTECL) 497 #define VASLEW(va,id) ((va)+(id)*NPTECL*PAGE_SIZE) 498 #else 499 #define PTESLEW(pte, id) (pte) 500 #define VASLEW(va,id) (va) 501 #endif 502 503 /* 504 * special VAs and the PTEs that map them 505 */ 506 static pt_entry_t *csrc_pte, *cdst_pte, *zero_pte, *ptp_pte, *early_zero_pte; 507 static char *csrcp, *cdstp, *zerop, *ptpp, *early_zerop; 508 509 int pmap_enter_default(pmap_t, vaddr_t, paddr_t, vm_prot_t, u_int); 510 511 /* 512 * pool and cache that PDPs are allocated from 513 */ 514 515 static struct pool_cache pmap_pdp_cache; 516 int pmap_pdp_ctor(void *, void *, int); 517 void pmap_pdp_dtor(void *, void *); 518 #ifdef PAE 519 /* need to allocate items of 4 pages */ 520 void *pmap_pdp_alloc(struct pool *, int); 521 void pmap_pdp_free(struct pool *, void *); 522 static struct pool_allocator pmap_pdp_allocator = { 523 .pa_alloc = pmap_pdp_alloc, 524 .pa_free = pmap_pdp_free, 525 .pa_pagesz = PAGE_SIZE * PDP_SIZE, 526 }; 527 #endif /* PAE */ 528 529 extern vaddr_t idt_vaddr; /* we allocate IDT early */ 530 extern paddr_t idt_paddr; 531 532 #ifdef _LP64 533 extern vaddr_t lo32_vaddr; 534 extern vaddr_t lo32_paddr; 535 #endif 536 537 extern int end; 538 539 #ifdef i386 540 /* stuff to fix the pentium f00f bug */ 541 extern vaddr_t pentium_idt_vaddr; 542 #endif 543 544 545 /* 546 * local prototypes 547 */ 548 549 static struct vm_page *pmap_get_ptp(struct pmap *, vaddr_t, 550 pd_entry_t * const *); 551 static struct vm_page *pmap_find_ptp(struct pmap *, vaddr_t, paddr_t, int); 552 static void pmap_freepage(struct pmap *, struct vm_page *, int); 553 static void pmap_free_ptp(struct pmap *, struct vm_page *, 554 vaddr_t, pt_entry_t *, 555 pd_entry_t * const *); 556 static bool pmap_is_active(struct pmap *, struct cpu_info *, bool); 557 static bool pmap_remove_pte(struct pmap *, struct vm_page *, 558 pt_entry_t *, vaddr_t, 559 struct pv_entry **); 560 static void pmap_remove_ptes(struct pmap *, struct vm_page *, 561 vaddr_t, vaddr_t, vaddr_t, 562 struct pv_entry **); 563 564 static bool pmap_get_physpage(vaddr_t, int, paddr_t *); 565 static void pmap_alloc_level(pd_entry_t * const *, vaddr_t, int, 566 long *); 567 568 static bool pmap_reactivate(struct pmap *); 569 570 /* 571 * p m a p h e l p e r f u n c t i o n s 572 */ 573 574 static inline void 575 pmap_stats_update(struct pmap *pmap, int resid_diff, int wired_diff) 576 { 577 578 if (pmap == pmap_kernel()) { 579 atomic_add_long(&pmap->pm_stats.resident_count, resid_diff); 580 atomic_add_long(&pmap->pm_stats.wired_count, wired_diff); 581 } else { 582 KASSERT(mutex_owned(pmap->pm_lock)); 583 pmap->pm_stats.resident_count += resid_diff; 584 pmap->pm_stats.wired_count += wired_diff; 585 } 586 } 587 588 static inline void 589 pmap_stats_update_bypte(struct pmap *pmap, pt_entry_t npte, pt_entry_t opte) 590 { 591 int resid_diff = ((npte & PG_V) ? 1 : 0) - ((opte & PG_V) ? 1 : 0); 592 int wired_diff = ((npte & PG_W) ? 1 : 0) - ((opte & PG_W) ? 
1 : 0); 593 594 KASSERT((npte & (PG_V | PG_W)) != PG_W); 595 KASSERT((opte & (PG_V | PG_W)) != PG_W); 596 597 pmap_stats_update(pmap, resid_diff, wired_diff); 598 } 599 600 /* 601 * ptp_to_pmap: lookup pmap by ptp 602 */ 603 604 static struct pmap * 605 ptp_to_pmap(struct vm_page *ptp) 606 { 607 struct pmap *pmap; 608 609 if (ptp == NULL) { 610 return pmap_kernel(); 611 } 612 pmap = (struct pmap *)ptp->uobject; 613 KASSERT(pmap != NULL); 614 KASSERT(&pmap->pm_obj[0] == ptp->uobject); 615 return pmap; 616 } 617 618 static inline struct pv_pte * 619 pve_to_pvpte(struct pv_entry *pve) 620 { 621 622 KASSERT((void *)&pve->pve_pte == (void *)pve); 623 return &pve->pve_pte; 624 } 625 626 static inline struct pv_entry * 627 pvpte_to_pve(struct pv_pte *pvpte) 628 { 629 struct pv_entry *pve = (void *)pvpte; 630 631 KASSERT(pve_to_pvpte(pve) == pvpte); 632 return pve; 633 } 634 635 /* 636 * pv_pte_first, pv_pte_next: PV list iterator. 637 */ 638 639 static struct pv_pte * 640 pv_pte_first(struct pmap_page *pp) 641 { 642 643 if ((pp->pp_flags & PP_EMBEDDED) != 0) { 644 return &pp->pp_pte; 645 } 646 return pve_to_pvpte(LIST_FIRST(&pp->pp_head.pvh_list)); 647 } 648 649 static struct pv_pte * 650 pv_pte_next(struct pmap_page *pp, struct pv_pte *pvpte) 651 { 652 653 KASSERT(pvpte != NULL); 654 if (pvpte == &pp->pp_pte) { 655 KASSERT((pp->pp_flags & PP_EMBEDDED) != 0); 656 return NULL; 657 } 658 KASSERT((pp->pp_flags & PP_EMBEDDED) == 0); 659 return pve_to_pvpte(LIST_NEXT(pvpte_to_pve(pvpte), pve_list)); 660 } 661 662 /* 663 * pmap_is_curpmap: is this pmap the one currently loaded [in %cr3]? 664 * of course the kernel is always loaded 665 */ 666 667 bool 668 pmap_is_curpmap(struct pmap *pmap) 669 { 670 #if defined(XEN) && defined(__x86_64__) 671 /* 672 * Only kernel pmap is physically loaded. 673 * User PGD may be active, but TLB will be flushed 674 * with HYPERVISOR_iret anyway, so let's say no 675 */ 676 return(pmap == pmap_kernel()); 677 #else /* XEN && __x86_64__*/ 678 return((pmap == pmap_kernel()) || 679 (pmap == curcpu()->ci_pmap)); 680 #endif 681 } 682 683 /* 684 * pmap_is_active: is this pmap loaded into the specified processor's %cr3? 685 */ 686 687 inline static bool 688 pmap_is_active(struct pmap *pmap, struct cpu_info *ci, bool kernel) 689 { 690 691 return (pmap == pmap_kernel() || 692 (pmap->pm_cpus & ci->ci_cpumask) != 0 || 693 (kernel && (pmap->pm_kernel_cpus & ci->ci_cpumask) != 0)); 694 } 695 696 /* 697 * Add a reference to the specified pmap. 698 */ 699 700 void 701 pmap_reference(struct pmap *pmap) 702 { 703 704 atomic_inc_uint(&pmap->pm_obj[0].uo_refs); 705 } 706 707 #ifndef XEN 708 709 /* 710 * pmap_map_ptes: map a pmap's PTEs into KVM and lock them in 711 * 712 * there are several pmaps involved. some or all of them might be same. 713 * 714 * - the pmap given by the first argument 715 * our caller wants to access this pmap's PTEs. 716 * 717 * - pmap_kernel() 718 * the kernel pmap. note that it only contains the kernel part 719 * of the address space which is shared by any pmap. ie. any 720 * pmap can be used instead of pmap_kernel() for our purpose. 721 * 722 * - ci->ci_pmap 723 * pmap currently loaded on the cpu. 724 * 725 * - vm_map_pmap(&curproc->p_vmspace->vm_map) 726 * current process' pmap. 
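 *
 * a minimal usage sketch (illustrative only; "pmap" is the pmap whose
 * PTEs the caller wants to look at):
 *
 *      struct pmap *pmap2;
 *      pt_entry_t *ptes;
 *      pd_entry_t * const *pdes;
 *
 *      kpreempt_disable();
 *      pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
 *      ... read or modify ptes[pl1_i(va)] ...
 *      pmap_unmap_ptes(pmap, pmap2);
 *      kpreempt_enable();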
727 * 728 * => we lock enough pmaps to keep things locked in 729 * => must be undone with pmap_unmap_ptes before returning 730 */ 731 732 void 733 pmap_map_ptes(struct pmap *pmap, struct pmap **pmap2, 734 pd_entry_t **ptepp, pd_entry_t * const **pdeppp) 735 { 736 struct pmap *curpmap; 737 struct cpu_info *ci; 738 uint32_t cpumask; 739 lwp_t *l; 740 741 /* The kernel's pmap is always accessible. */ 742 if (pmap == pmap_kernel()) { 743 *pmap2 = NULL; 744 *ptepp = PTE_BASE; 745 *pdeppp = normal_pdes; 746 return; 747 } 748 KASSERT(kpreempt_disabled()); 749 750 l = curlwp; 751 retry: 752 mutex_enter(pmap->pm_lock); 753 ci = curcpu(); 754 curpmap = ci->ci_pmap; 755 if (vm_map_pmap(&l->l_proc->p_vmspace->vm_map) == pmap) { 756 /* Our own pmap so just load it: easy. */ 757 if (__predict_false(ci->ci_want_pmapload)) { 758 mutex_exit(pmap->pm_lock); 759 pmap_load(); 760 goto retry; 761 } 762 KASSERT(pmap == curpmap); 763 } else if (pmap == curpmap) { 764 /* 765 * Already on the CPU: make it valid. This is very 766 * often the case during exit(), when we have switched 767 * to the kernel pmap in order to destroy a user pmap. 768 */ 769 if (!pmap_reactivate(pmap)) { 770 u_int gen = uvm_emap_gen_return(); 771 tlbflush(); 772 uvm_emap_update(gen); 773 } 774 } else { 775 /* 776 * Toss current pmap from CPU, but keep a reference to it. 777 * The reference will be dropped by pmap_unmap_ptes(). 778 * Can happen if we block during exit(). 779 */ 780 cpumask = ci->ci_cpumask; 781 atomic_and_32(&curpmap->pm_cpus, ~cpumask); 782 atomic_and_32(&curpmap->pm_kernel_cpus, ~cpumask); 783 ci->ci_pmap = pmap; 784 ci->ci_tlbstate = TLBSTATE_VALID; 785 atomic_or_32(&pmap->pm_cpus, cpumask); 786 atomic_or_32(&pmap->pm_kernel_cpus, cpumask); 787 cpu_load_pmap(pmap); 788 } 789 pmap->pm_ncsw = l->l_ncsw; 790 *pmap2 = curpmap; 791 *ptepp = PTE_BASE; 792 *pdeppp = normal_pdes; 793 } 794 795 /* 796 * pmap_unmap_ptes: unlock the PTE mapping of "pmap" 797 */ 798 799 void 800 pmap_unmap_ptes(struct pmap *pmap, struct pmap *pmap2) 801 { 802 struct cpu_info *ci; 803 struct pmap *mypmap; 804 805 KASSERT(kpreempt_disabled()); 806 807 /* The kernel's pmap is always accessible. */ 808 if (pmap == pmap_kernel()) { 809 return; 810 } 811 812 /* 813 * We cannot tolerate context switches while mapped in. 814 * If it is our own pmap all we have to do is unlock. 815 */ 816 KASSERT(pmap->pm_ncsw == curlwp->l_ncsw); 817 mypmap = vm_map_pmap(&curproc->p_vmspace->vm_map); 818 if (pmap == mypmap) { 819 mutex_exit(pmap->pm_lock); 820 return; 821 } 822 823 /* 824 * Mark whatever's on the CPU now as lazy and unlock. 825 * If the pmap was already installed, we are done. 826 */ 827 ci = curcpu(); 828 ci->ci_tlbstate = TLBSTATE_LAZY; 829 ci->ci_want_pmapload = (mypmap != pmap_kernel()); 830 mutex_exit(pmap->pm_lock); 831 if (pmap == pmap2) { 832 return; 833 } 834 835 /* 836 * We installed another pmap on the CPU. Grab a reference to 837 * it and leave in place. Toss the evicted pmap (can block). 838 */ 839 pmap_reference(pmap); 840 pmap_destroy(pmap2); 841 } 842 843 #endif 844 845 inline static void 846 pmap_exec_account(struct pmap *pm, vaddr_t va, pt_entry_t opte, pt_entry_t npte) 847 { 848 849 #if !defined(__x86_64__) 850 if (curproc == NULL || curproc->p_vmspace == NULL || 851 pm != vm_map_pmap(&curproc->p_vmspace->vm_map)) 852 return; 853 854 if ((opte ^ npte) & PG_X) 855 pmap_update_pg(va); 856 857 /* 858 * Executability was removed on the last executable change. 
859 * Reset the code segment to something conservative and 860 * let the trap handler deal with setting the right limit. 861 * We can't do that because of locking constraints on the vm map. 862 */ 863 864 if ((opte & PG_X) && (npte & PG_X) == 0 && va == pm->pm_hiexec) { 865 struct trapframe *tf = curlwp->l_md.md_regs; 866 867 tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL); 868 pm->pm_hiexec = I386_MAX_EXE_ADDR; 869 } 870 #endif /* !defined(__x86_64__) */ 871 } 872 873 #if !defined(__x86_64__) 874 /* 875 * Fixup the code segment to cover all potential executable mappings. 876 * returns 0 if no changes to the code segment were made. 877 */ 878 879 int 880 pmap_exec_fixup(struct vm_map *map, struct trapframe *tf, struct pcb *pcb) 881 { 882 struct vm_map_entry *ent; 883 struct pmap *pm = vm_map_pmap(map); 884 vaddr_t va = 0; 885 886 vm_map_lock_read(map); 887 for (ent = (&map->header)->next; ent != &map->header; ent = ent->next) { 888 889 /* 890 * This entry has greater va than the entries before. 891 * We need to make it point to the last page, not past it. 892 */ 893 894 if (ent->protection & VM_PROT_EXECUTE) 895 va = trunc_page(ent->end) - PAGE_SIZE; 896 } 897 vm_map_unlock_read(map); 898 if (va == pm->pm_hiexec && tf->tf_cs == GSEL(GUCODEBIG_SEL, SEL_UPL)) 899 return (0); 900 901 pm->pm_hiexec = va; 902 if (pm->pm_hiexec > I386_MAX_EXE_ADDR) { 903 tf->tf_cs = GSEL(GUCODEBIG_SEL, SEL_UPL); 904 } else { 905 tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL); 906 return (0); 907 } 908 return (1); 909 } 910 #endif /* !defined(__x86_64__) */ 911 912 void 913 pat_init(struct cpu_info *ci) 914 { 915 uint64_t pat; 916 917 if (!(ci->ci_feat_val[0] & CPUID_PAT)) 918 return; 919 920 /* We change WT to WC. Leave all other entries the default values. */ 921 pat = PATENTRY(0, PAT_WB) | PATENTRY(1, PAT_WC) | 922 PATENTRY(2, PAT_UCMINUS) | PATENTRY(3, PAT_UC) | 923 PATENTRY(4, PAT_WB) | PATENTRY(5, PAT_WC) | 924 PATENTRY(6, PAT_UCMINUS) | PATENTRY(7, PAT_UC); 925 926 wrmsr(MSR_CR_PAT, pat); 927 cpu_pat_enabled = true; 928 aprint_debug_dev(ci->ci_dev, "PAT enabled\n"); 929 } 930 931 static pt_entry_t 932 pmap_pat_flags(u_int flags) 933 { 934 u_int cacheflags = (flags & PMAP_CACHE_MASK); 935 936 if (!cpu_pat_enabled) { 937 switch (cacheflags) { 938 case PMAP_NOCACHE: 939 case PMAP_NOCACHE_OVR: 940 /* results in PGC_UCMINUS on cpus which have 941 * the cpuid PAT but PAT "disabled" 942 */ 943 return PG_N; 944 default: 945 return 0; 946 } 947 } 948 949 switch (cacheflags) { 950 case PMAP_NOCACHE: 951 return PGC_UC; 952 case PMAP_WRITE_COMBINE: 953 return PGC_WC; 954 case PMAP_WRITE_BACK: 955 return PGC_WB; 956 case PMAP_NOCACHE_OVR: 957 return PGC_UCMINUS; 958 } 959 960 return 0; 961 } 962 963 /* 964 * p m a p k e n t e r f u n c t i o n s 965 * 966 * functions to quickly enter/remove pages from the kernel address 967 * space. pmap_kremove is exported to MI kernel. we make use of 968 * the recursive PTE mappings. 
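 *
 * a minimal usage sketch (illustrative only; "va" is kernel VA the
 * caller already owns and "pa" the physical page to map there):
 *
 *      pmap_kenter_pa(va, pa, VM_PROT_READ | VM_PROT_WRITE, 0);
 *      ... use the mapping ...
 *      pmap_kremove(va, PAGE_SIZE);
 *      pmap_update(pmap_kernel());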
 */

void
pmap_kenter_pa(vaddr_t va, paddr_t pa, vm_prot_t prot, u_int flags)
{
        pt_entry_t *pte, opte, npte;

        KASSERT(!(prot & ~VM_PROT_ALL));

        if (va < VM_MIN_KERNEL_ADDRESS)
                pte = vtopte(va);
        else
                pte = kvtopte(va);
#ifdef DOM0OPS
        if (pa < pmap_pa_start || pa >= pmap_pa_end) {
#ifdef DEBUG
                printf_nolog("%s: pa 0x%" PRIx64 " for va 0x%" PRIx64
                    " outside range\n", __func__, (int64_t)pa, (int64_t)va);
#endif /* DEBUG */
                npte = pa;
        } else
#endif /* DOM0OPS */
                npte = pmap_pa2pte(pa);
        npte |= protection_codes[prot] | PG_k | PG_V | pmap_pg_g;
        npte |= pmap_pat_flags(flags);
        opte = pmap_pte_testset(pte, npte);             /* zap! */
#if defined(DIAGNOSTIC)
        /* XXX For now... */
        if (opte & PG_PS)
                panic("%s: PG_PS", __func__);
#endif
        if ((opte & (PG_V | PG_U)) == (PG_V | PG_U)) {
#if defined(DIAGNOSTIC)
                printf_nolog("%s: mapping already present\n", __func__);
#endif
                /* This should not happen. */
                kpreempt_disable();
                pmap_tlb_shootdown(pmap_kernel(), va, opte, TLBSHOOT_KENTER);
                kpreempt_enable();
        }
}

void
pmap_emap_enter(vaddr_t va, paddr_t pa, vm_prot_t prot)
{
        pt_entry_t *pte, opte, npte;

        KASSERT((prot & ~VM_PROT_ALL) == 0);
        pte = (va < VM_MIN_KERNEL_ADDRESS) ? vtopte(va) : kvtopte(va);

#ifdef DOM0OPS
        if (pa < pmap_pa_start || pa >= pmap_pa_end) {
                npte = pa;
        } else
#endif
                npte = pmap_pa2pte(pa);

        npte |= protection_codes[prot] | PG_k | PG_V;
        opte = pmap_pte_testset(pte, npte);
}

/*
 * pmap_emap_sync: perform TLB flush or pmap load, if it was deferred.
 */
void
pmap_emap_sync(bool canload)
{
        struct cpu_info *ci = curcpu();
        struct pmap *pmap;

        KASSERT(kpreempt_disabled());
        if (__predict_true(ci->ci_want_pmapload && canload)) {
                /*
                 * XXX: Hint for pmap_reactivate(), which might suggest to
                 * not perform TLB flush, if state has not changed.
                 */
                pmap = vm_map_pmap(&curlwp->l_proc->p_vmspace->vm_map);
                if (__predict_false(pmap == ci->ci_pmap)) {
                        const uint32_t cpumask = ci->ci_cpumask;
                        atomic_and_32(&pmap->pm_cpus, ~cpumask);
                }
                pmap_load();
                KASSERT(ci->ci_want_pmapload == 0);
        } else {
                tlbflush();
        }
}

void
pmap_emap_remove(vaddr_t sva, vsize_t len)
{
        pt_entry_t *pte, xpte = 0;
        vaddr_t va, eva = sva + len;

        for (va = sva; va < eva; va += PAGE_SIZE) {
                pte = (va < VM_MIN_KERNEL_ADDRESS) ? vtopte(va) : kvtopte(va);
                xpte |= pmap_pte_testset(pte, 0);
        }
}

__strict_weak_alias(pmap_kenter_ma, pmap_kenter_pa);

#if defined(__x86_64__)
/*
 * Change protection for a virtual address. Local for a CPU only, don't
 * care about TLB shootdowns.
1084 * 1085 * => must be called with preemption disabled 1086 */ 1087 void 1088 pmap_changeprot_local(vaddr_t va, vm_prot_t prot) 1089 { 1090 pt_entry_t *pte, opte, npte; 1091 1092 KASSERT(kpreempt_disabled()); 1093 1094 if (va < VM_MIN_KERNEL_ADDRESS) 1095 pte = vtopte(va); 1096 else 1097 pte = kvtopte(va); 1098 1099 npte = opte = *pte; 1100 1101 if ((prot & VM_PROT_WRITE) != 0) 1102 npte |= PG_RW; 1103 else 1104 npte &= ~PG_RW; 1105 1106 if (opte != npte) { 1107 pmap_pte_set(pte, npte); 1108 pmap_pte_flush(); 1109 invlpg(va); 1110 } 1111 } 1112 #endif /* defined(__x86_64__) */ 1113 1114 /* 1115 * pmap_kremove: remove a kernel mapping(s) without R/M (pv_entry) tracking 1116 * 1117 * => no need to lock anything 1118 * => caller must dispose of any vm_page mapped in the va range 1119 * => note: not an inline function 1120 * => we assume the va is page aligned and the len is a multiple of PAGE_SIZE 1121 * => we assume kernel only unmaps valid addresses and thus don't bother 1122 * checking the valid bit before doing TLB flushing 1123 * => must be followed by call to pmap_update() before reuse of page 1124 */ 1125 1126 void 1127 pmap_kremove(vaddr_t sva, vsize_t len) 1128 { 1129 pt_entry_t *pte, opte; 1130 vaddr_t va, eva; 1131 1132 eva = sva + len; 1133 1134 kpreempt_disable(); 1135 for (va = sva; va < eva; va += PAGE_SIZE) { 1136 if (va < VM_MIN_KERNEL_ADDRESS) 1137 pte = vtopte(va); 1138 else 1139 pte = kvtopte(va); 1140 opte = pmap_pte_testset(pte, 0); /* zap! */ 1141 if ((opte & (PG_V | PG_U)) == (PG_V | PG_U)) { 1142 pmap_tlb_shootdown(pmap_kernel(), va, opte, 1143 TLBSHOOT_KREMOVE); 1144 } 1145 KASSERT((opte & PG_PS) == 0); 1146 KASSERT((opte & PG_PVLIST) == 0); 1147 } 1148 kpreempt_enable(); 1149 } 1150 1151 /* 1152 * p m a p i n i t f u n c t i o n s 1153 * 1154 * pmap_bootstrap and pmap_init are called during system startup 1155 * to init the pmap module. pmap_bootstrap() does a low level 1156 * init just to get things rolling. pmap_init() finishes the job. 1157 */ 1158 1159 /* 1160 * pmap_bootstrap: get the system in a state where it can run with VM 1161 * properly enabled (called before main()). the VM system is 1162 * fully init'd later... 1163 * 1164 * => on i386, locore.s has already enabled the MMU by allocating 1165 * a PDP for the kernel, and nkpde PTP's for the kernel. 1166 * => kva_start is the first free virtual address in kernel space 1167 */ 1168 1169 void 1170 pmap_bootstrap(vaddr_t kva_start) 1171 { 1172 struct pmap *kpm; 1173 pt_entry_t *pte; 1174 int i; 1175 vaddr_t kva; 1176 #ifndef XEN 1177 unsigned long p1i; 1178 vaddr_t kva_end; 1179 #endif 1180 1181 pt_entry_t pg_nx = (cpu_feature[2] & CPUID_NOX ? PG_NX : 0); 1182 1183 /* 1184 * set up our local static global vars that keep track of the 1185 * usage of KVM before kernel_map is set up 1186 */ 1187 1188 virtual_avail = kva_start; /* first free KVA */ 1189 virtual_end = VM_MAX_KERNEL_ADDRESS; /* last KVA */ 1190 1191 /* 1192 * set up protection_codes: we need to be able to convert from 1193 * a MI protection code (some combo of VM_PROT...) to something 1194 * we can jam into a i386 PTE. 
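         *
         * the table is indexed directly by the MI VM_PROT_* bit combination;
         * later code just ORs the looked-up bits into the PTE it is building,
         * roughly (a minimal sketch):
         *
         *      npte = pmap_pa2pte(pa) | protection_codes[prot] | PG_V;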
1195 */ 1196 1197 protection_codes[VM_PROT_NONE] = pg_nx; /* --- */ 1198 protection_codes[VM_PROT_EXECUTE] = PG_RO | PG_X; /* --x */ 1199 protection_codes[VM_PROT_READ] = PG_RO | pg_nx; /* -r- */ 1200 protection_codes[VM_PROT_READ|VM_PROT_EXECUTE] = PG_RO | PG_X;/* -rx */ 1201 protection_codes[VM_PROT_WRITE] = PG_RW | pg_nx; /* w-- */ 1202 protection_codes[VM_PROT_WRITE|VM_PROT_EXECUTE] = PG_RW | PG_X;/* w-x */ 1203 protection_codes[VM_PROT_WRITE|VM_PROT_READ] = PG_RW | pg_nx; 1204 /* wr- */ 1205 protection_codes[VM_PROT_ALL] = PG_RW | PG_X; /* wrx */ 1206 1207 /* 1208 * now we init the kernel's pmap 1209 * 1210 * the kernel pmap's pm_obj is not used for much. however, in 1211 * user pmaps the pm_obj contains the list of active PTPs. 1212 * the pm_obj currently does not have a pager. it might be possible 1213 * to add a pager that would allow a process to read-only mmap its 1214 * own page tables (fast user level vtophys?). this may or may not 1215 * be useful. 1216 */ 1217 1218 kpm = pmap_kernel(); 1219 for (i = 0; i < PTP_LEVELS - 1; i++) { 1220 mutex_init(&kpm->pm_obj_lock[i], MUTEX_DEFAULT, IPL_NONE); 1221 uvm_obj_init(&kpm->pm_obj[i], NULL, false, 1); 1222 uvm_obj_setlock(&kpm->pm_obj[i], &kpm->pm_obj_lock[i]); 1223 kpm->pm_ptphint[i] = NULL; 1224 } 1225 memset(&kpm->pm_list, 0, sizeof(kpm->pm_list)); /* pm_list not used */ 1226 1227 kpm->pm_pdir = (pd_entry_t *)(PDPpaddr + KERNBASE); 1228 for (i = 0; i < PDP_SIZE; i++) 1229 kpm->pm_pdirpa[i] = PDPpaddr + PAGE_SIZE * i; 1230 1231 kpm->pm_stats.wired_count = kpm->pm_stats.resident_count = 1232 x86_btop(kva_start - VM_MIN_KERNEL_ADDRESS); 1233 1234 /* 1235 * the above is just a rough estimate and not critical to the proper 1236 * operation of the system. 1237 */ 1238 1239 #ifndef XEN 1240 /* 1241 * Begin to enable global TLB entries if they are supported. 1242 * The G bit has no effect until the CR4_PGE bit is set in CR4, 1243 * which happens in cpu_init(), which is run on each cpu 1244 * (and happens later) 1245 */ 1246 1247 if (cpu_feature[0] & CPUID_PGE) { 1248 pmap_pg_g = PG_G; /* enable software */ 1249 1250 /* add PG_G attribute to already mapped kernel pages */ 1251 if (KERNBASE == VM_MIN_KERNEL_ADDRESS) { 1252 kva_end = virtual_avail; 1253 } else { 1254 extern vaddr_t eblob, esym; 1255 kva_end = (vaddr_t)&end; 1256 if (esym > kva_end) 1257 kva_end = esym; 1258 if (eblob > kva_end) 1259 kva_end = eblob; 1260 kva_end = roundup(kva_end, PAGE_SIZE); 1261 } 1262 for (kva = KERNBASE; kva < kva_end; kva += PAGE_SIZE) { 1263 p1i = pl1_i(kva); 1264 if (pmap_valid_entry(PTE_BASE[p1i])) 1265 PTE_BASE[p1i] |= PG_G; 1266 } 1267 } 1268 1269 /* 1270 * enable large pages if they are supported. 1271 */ 1272 1273 if (cpu_feature[0] & CPUID_PSE) { 1274 paddr_t pa; 1275 pd_entry_t *pde; 1276 extern char __data_start; 1277 1278 lcr4(rcr4() | CR4_PSE); /* enable hardware (via %cr4) */ 1279 pmap_largepages = 1; /* enable software */ 1280 1281 /* 1282 * the TLB must be flushed after enabling large pages 1283 * on Pentium CPUs, according to section 3.6.2.2 of 1284 * "Intel Architecture Software Developer's Manual, 1285 * Volume 3: System Programming". 1286 */ 1287 tlbflushg(); 1288 1289 /* 1290 * now, remap the kernel text using large pages. we 1291 * assume that the linker has properly aligned the 1292 * .data segment to a NBPD_L2 boundary. 
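                 *
                 * for example (a minimal sketch, assuming a 2MB NBPD_L2 and
                 * roughly 7MB of kernel text): the loop below installs three
                 * 2MB mappings covering the first 6MB, and the tail keeps its
                 * existing 4KB mappings.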
                 */
                kva_end = rounddown((vaddr_t)&__data_start, NBPD_L1);
                for (pa = 0, kva = KERNBASE; kva + NBPD_L2 <= kva_end;
                    kva += NBPD_L2, pa += NBPD_L2) {
                        pde = &L2_BASE[pl2_i(kva)];
                        *pde = pa | pmap_pg_g | PG_PS |
                            PG_KR | PG_V;               /* zap! */
                        tlbflushg();
                }
#if defined(DEBUG)
                aprint_normal("kernel text is mapped with %" PRIuPSIZE " large "
                    "pages and %" PRIuPSIZE " normal pages\n",
                    howmany(kva - KERNBASE, NBPD_L2),
                    howmany((vaddr_t)&__data_start - kva, NBPD_L1));
#endif /* defined(DEBUG) */
        }
#endif /* !XEN */

        if (VM_MIN_KERNEL_ADDRESS != KERNBASE) {
                /*
                 * zero_pte is stuck at the end of mapped space for the kernel
                 * image (disjunct from kva space). This is done so that it
                 * can safely be used in pmap_growkernel (pmap_get_physpage),
                 * when it's called for the first time.
                 * XXXfvdl fix this for MULTIPROCESSOR later.
                 */

                early_zerop = (void *)(KERNBASE + NKL2_KIMG_ENTRIES * NBPD_L2);
                early_zero_pte = PTE_BASE + pl1_i((vaddr_t)early_zerop);
        }

        /*
         * now we allocate the "special" VAs which are used for tmp mappings
         * by the pmap (and other modules). we allocate the VAs by advancing
         * virtual_avail (note that there are no pages mapped at these VAs).
         * we find the PTE that maps the allocated VA via the linear PTE
         * mapping.
         */

        pte = PTE_BASE + pl1_i(virtual_avail);

#ifdef MULTIPROCESSOR
        /*
         * Waste some VA space to avoid false sharing of cache lines
         * for page table pages: Give each possible CPU a cache line
         * of PTE's (8) to play with, though we only need 4.  We could
         * recycle some of this waste by putting the idle stacks here
         * as well; we could waste less space if we knew the largest
         * CPU ID beforehand.
         */
        csrcp = (char *)virtual_avail;  csrc_pte = pte;

        cdstp = (char *)virtual_avail + PAGE_SIZE;  cdst_pte = pte + 1;

        zerop = (char *)virtual_avail + PAGE_SIZE * 2;  zero_pte = pte + 2;

        ptpp = (char *)virtual_avail + PAGE_SIZE * 3;  ptp_pte = pte + 3;

        virtual_avail += PAGE_SIZE * maxcpus * NPTECL;
        pte += maxcpus * NPTECL;
#else
        csrcp = (void *)virtual_avail;  csrc_pte = pte;         /* allocate */
        virtual_avail += PAGE_SIZE; pte++;                      /* advance */

        cdstp = (void *)virtual_avail;  cdst_pte = pte;
        virtual_avail += PAGE_SIZE; pte++;

        zerop = (void *)virtual_avail;  zero_pte = pte;
        virtual_avail += PAGE_SIZE; pte++;

        ptpp = (void *)virtual_avail;  ptp_pte = pte;
        virtual_avail += PAGE_SIZE; pte++;
#endif

        if (VM_MIN_KERNEL_ADDRESS == KERNBASE) {
                early_zerop = zerop;
                early_zero_pte = zero_pte;
        }

        /*
         * Nothing after this point actually needs pte.
         */
        pte = (void *)0xdeadbeef;

#ifdef XEN
#ifdef __x86_64__
        /*
         * We want a dummy page directory for Xen: when we deactivate a
         * pmap, Xen will still consider it active.  So we set the user
         * PGD to this one to lift all protection on the now-inactive
         * set of page tables.
1384 */ 1385 xen_dummy_user_pgd = avail_start; 1386 avail_start += PAGE_SIZE; 1387 1388 /* Zero fill it, the less checks in Xen it requires the better */ 1389 memset((void *) (xen_dummy_user_pgd + KERNBASE), 0, PAGE_SIZE); 1390 /* Mark read-only */ 1391 HYPERVISOR_update_va_mapping(xen_dummy_user_pgd + KERNBASE, 1392 pmap_pa2pte(xen_dummy_user_pgd) | PG_u | PG_V, UVMF_INVLPG); 1393 /* Pin as L4 */ 1394 xpq_queue_pin_l4_table(xpmap_ptom_masked(xen_dummy_user_pgd)); 1395 #endif /* __x86_64__ */ 1396 idt_vaddr = virtual_avail; /* don't need pte */ 1397 idt_paddr = avail_start; /* steal a page */ 1398 /* 1399 * Xen require one more page as we can't store 1400 * GDT and LDT on the same page 1401 */ 1402 virtual_avail += 3 * PAGE_SIZE; 1403 avail_start += 3 * PAGE_SIZE; 1404 #else /* XEN */ 1405 idt_vaddr = virtual_avail; /* don't need pte */ 1406 idt_paddr = avail_start; /* steal a page */ 1407 #if defined(__x86_64__) 1408 virtual_avail += 2 * PAGE_SIZE; pte += 2; 1409 avail_start += 2 * PAGE_SIZE; 1410 #else /* defined(__x86_64__) */ 1411 virtual_avail += PAGE_SIZE; pte++; 1412 avail_start += PAGE_SIZE; 1413 /* pentium f00f bug stuff */ 1414 pentium_idt_vaddr = virtual_avail; /* don't need pte */ 1415 virtual_avail += PAGE_SIZE; pte++; 1416 #endif /* defined(__x86_64__) */ 1417 #endif /* XEN */ 1418 1419 #ifdef _LP64 1420 /* 1421 * Grab a page below 4G for things that need it (i.e. 1422 * having an initial %cr3 for the MP trampoline). 1423 */ 1424 lo32_vaddr = virtual_avail; 1425 virtual_avail += PAGE_SIZE; pte++; 1426 lo32_paddr = avail_start; 1427 avail_start += PAGE_SIZE; 1428 #endif 1429 1430 /* 1431 * now we reserve some VM for mapping pages when doing a crash dump 1432 */ 1433 1434 virtual_avail = reserve_dumppages(virtual_avail); 1435 1436 /* 1437 * init the static-global locks and global lists. 1438 * 1439 * => pventry::pvh_lock (initialized elsewhere) must also be 1440 * a spin lock, again at IPL_VM to prevent deadlock, and 1441 * again is never taken from interrupt context. 1442 */ 1443 1444 mutex_init(&pmaps_lock, MUTEX_DEFAULT, IPL_NONE); 1445 LIST_INIT(&pmaps); 1446 1447 /* 1448 * initialize caches. 1449 */ 1450 1451 pool_cache_bootstrap(&pmap_cache, sizeof(struct pmap), 0, 0, 0, 1452 "pmappl", NULL, IPL_NONE, NULL, NULL, NULL); 1453 #ifdef PAE 1454 pool_cache_bootstrap(&pmap_pdp_cache, PAGE_SIZE * PDP_SIZE, 0, 0, 0, 1455 "pdppl", &pmap_pdp_allocator, IPL_NONE, 1456 pmap_pdp_ctor, pmap_pdp_dtor, NULL); 1457 #else /* PAE */ 1458 pool_cache_bootstrap(&pmap_pdp_cache, PAGE_SIZE, 0, 0, 0, 1459 "pdppl", NULL, IPL_NONE, pmap_pdp_ctor, pmap_pdp_dtor, NULL); 1460 #endif /* PAE */ 1461 pool_cache_bootstrap(&pmap_pv_cache, sizeof(struct pv_entry), 0, 0, 1462 PR_LARGECACHE, "pvpl", &pool_allocator_meta, IPL_NONE, NULL, 1463 NULL, NULL); 1464 1465 /* 1466 * ensure the TLB is sync'd with reality by flushing it... 1467 */ 1468 1469 tlbflushg(); 1470 1471 /* 1472 * calculate pmap_maxkvaddr from nkptp[]. 1473 */ 1474 1475 kva = VM_MIN_KERNEL_ADDRESS; 1476 for (i = PTP_LEVELS - 1; i >= 1; i--) { 1477 kva += nkptp[i] * nbpd[i]; 1478 } 1479 pmap_maxkvaddr = kva; 1480 } 1481 1482 #if defined(__x86_64__) 1483 /* 1484 * Pre-allocate PTPs for low memory, so that 1:1 mappings for various 1485 * trampoline code can be entered. 
1486 */ 1487 void 1488 pmap_prealloc_lowmem_ptps(void) 1489 { 1490 int level; 1491 paddr_t newp; 1492 #ifdef XEN 1493 paddr_t pdes_pa; 1494 1495 pdes_pa = pmap_pdirpa(pmap_kernel(), 0); 1496 level = PTP_LEVELS; 1497 for (;;) { 1498 newp = avail_start; 1499 avail_start += PAGE_SIZE; 1500 HYPERVISOR_update_va_mapping ((vaddr_t)early_zerop, 1501 xpmap_ptom_masked(newp) | PG_u | PG_V | PG_RW, UVMF_INVLPG); 1502 memset(early_zerop, 0, PAGE_SIZE); 1503 /* Mark R/O before installing */ 1504 HYPERVISOR_update_va_mapping ((vaddr_t)early_zerop, 1505 xpmap_ptom_masked(newp) | PG_u | PG_V, UVMF_INVLPG); 1506 if (newp < (NKL2_KIMG_ENTRIES * NBPD_L2)) 1507 HYPERVISOR_update_va_mapping (newp + KERNBASE, 1508 xpmap_ptom_masked(newp) | PG_u | PG_V, UVMF_INVLPG); 1509 /* Update the pmap_kernel() L4 shadow */ 1510 xpq_queue_pte_update ( 1511 xpmap_ptom_masked(pdes_pa) 1512 + (pl_i(0, level) * sizeof (pd_entry_t)), 1513 xpmap_ptom_masked(newp) | PG_RW | PG_u | PG_V); 1514 /* sync to per-cpu PD */ 1515 xpq_queue_pte_update( 1516 xpmap_ptom_masked(cpu_info_primary.ci_kpm_pdirpa + 1517 pl_i(0, PTP_LEVELS) * 1518 sizeof(pd_entry_t)), 1519 pmap_kernel()->pm_pdir[pl_i(0, PTP_LEVELS)]); 1520 pmap_pte_flush(); 1521 level--; 1522 if (level <= 1) 1523 break; 1524 pdes_pa = newp; 1525 } 1526 #else /* XEN */ 1527 pd_entry_t *pdes; 1528 1529 pdes = pmap_kernel()->pm_pdir; 1530 level = PTP_LEVELS; 1531 for (;;) { 1532 newp = avail_start; 1533 avail_start += PAGE_SIZE; 1534 pmap_pte_set(early_zero_pte, (newp & PG_FRAME) | PG_V | PG_RW); 1535 pmap_pte_flush(); 1536 pmap_update_pg((vaddr_t)early_zerop); 1537 memset(early_zerop, 0, PAGE_SIZE); 1538 pdes[pl_i(0, level)] = (newp & PG_FRAME) | PG_V | PG_RW; 1539 level--; 1540 if (level <= 1) 1541 break; 1542 pdes = normal_pdes[level - 2]; 1543 } 1544 #endif /* XEN */ 1545 } 1546 #endif /* defined(__x86_64__) */ 1547 1548 /* 1549 * pmap_init: called from uvm_init, our job is to get the pmap 1550 * system ready to manage mappings... 1551 */ 1552 1553 void 1554 pmap_init(void) 1555 { 1556 int i; 1557 1558 for (i = 0; i < PV_HASH_SIZE; i++) { 1559 SLIST_INIT(&pv_hash_heads[i].hh_list); 1560 } 1561 for (i = 0; i < PV_HASH_LOCK_CNT; i++) { 1562 mutex_init(&pv_hash_locks[i].lock, MUTEX_NODEBUG, IPL_VM); 1563 } 1564 1565 pmap_tlb_init(); 1566 1567 evcnt_attach_dynamic(&pmap_iobmp_evcnt, EVCNT_TYPE_MISC, 1568 NULL, "x86", "io bitmap copy"); 1569 evcnt_attach_dynamic(&pmap_ldt_evcnt, EVCNT_TYPE_MISC, 1570 NULL, "x86", "ldt sync"); 1571 1572 /* 1573 * done: pmap module is up (and ready for business) 1574 */ 1575 1576 pmap_initialized = true; 1577 } 1578 1579 /* 1580 * pmap_cpu_init_late: perform late per-CPU initialization. 1581 */ 1582 1583 #ifndef XEN 1584 void 1585 pmap_cpu_init_late(struct cpu_info *ci) 1586 { 1587 /* 1588 * The BP has already its own PD page allocated during early 1589 * MD startup. 1590 */ 1591 if (ci == &cpu_info_primary) 1592 return; 1593 1594 #ifdef PAE 1595 int ret; 1596 struct pglist pg; 1597 struct vm_page *vmap; 1598 1599 /* 1600 * Allocate a page for the per-CPU L3 PD. cr3 being 32 bits, PA musts 1601 * resides below the 4GB boundary. 
1602 */ 1603 ret = uvm_pglistalloc(PAGE_SIZE, 0, 0x100000000ULL, 32, 0, &pg, 1, 0); 1604 vmap = TAILQ_FIRST(&pg); 1605 1606 if (ret != 0 || vmap == NULL) 1607 panic("%s: failed to allocate L3 pglist for CPU %d (ret %d)\n", 1608 __func__, cpu_index(ci), ret); 1609 1610 ci->ci_pae_l3_pdirpa = vmap->phys_addr; 1611 1612 ci->ci_pae_l3_pdir = (paddr_t *)uvm_km_alloc(kernel_map, PAGE_SIZE, 0, 1613 UVM_KMF_VAONLY | UVM_KMF_NOWAIT); 1614 if (ci->ci_pae_l3_pdir == NULL) 1615 panic("%s: failed to allocate L3 PD for CPU %d\n", 1616 __func__, cpu_index(ci)); 1617 1618 pmap_kenter_pa((vaddr_t)ci->ci_pae_l3_pdir, ci->ci_pae_l3_pdirpa, 1619 VM_PROT_READ | VM_PROT_WRITE, 0); 1620 1621 pmap_update(pmap_kernel()); 1622 #endif 1623 } 1624 #endif 1625 1626 /* 1627 * p v _ e n t r y f u n c t i o n s 1628 */ 1629 1630 /* 1631 * pmap_free_pvs: free a list of pv_entrys 1632 */ 1633 1634 static void 1635 pmap_free_pvs(struct pv_entry *pve) 1636 { 1637 struct pv_entry *next; 1638 1639 for ( /* null */ ; pve != NULL ; pve = next) { 1640 next = pve->pve_next; 1641 pool_cache_put(&pmap_pv_cache, pve); 1642 } 1643 } 1644 1645 /* 1646 * main pv_entry manipulation functions: 1647 * pmap_enter_pv: enter a mapping onto a pv_head list 1648 * pmap_remove_pv: remove a mapping from a pv_head list 1649 * 1650 * NOTE: Both pmap_enter_pv and pmap_remove_pv expect the caller to lock 1651 * the pvh before calling 1652 */ 1653 1654 /* 1655 * insert_pv: a helper of pmap_enter_pv 1656 */ 1657 1658 static void 1659 insert_pv(struct pmap_page *pp, struct pv_entry *pve) 1660 { 1661 struct pv_hash_head *hh; 1662 kmutex_t *lock; 1663 u_int hash; 1664 1665 hash = pvhash_hash(pve->pve_pte.pte_ptp, pve->pve_pte.pte_va); 1666 lock = pvhash_lock(hash); 1667 hh = pvhash_head(hash); 1668 mutex_spin_enter(lock); 1669 SLIST_INSERT_HEAD(&hh->hh_list, pve, pve_hash); 1670 mutex_spin_exit(lock); 1671 1672 LIST_INSERT_HEAD(&pp->pp_head.pvh_list, pve, pve_list); 1673 } 1674 1675 /* 1676 * pmap_enter_pv: enter a mapping onto a pv_head lst 1677 * 1678 * => caller should adjust ptp's wire_count before calling 1679 */ 1680 1681 static struct pv_entry * 1682 pmap_enter_pv(struct pmap_page *pp, 1683 struct pv_entry *pve, /* preallocated pve for us to use */ 1684 struct pv_entry **sparepve, 1685 struct vm_page *ptp, 1686 vaddr_t va) 1687 { 1688 1689 KASSERT(ptp == NULL || ptp->wire_count >= 2); 1690 KASSERT(ptp == NULL || ptp->uobject != NULL); 1691 KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset); 1692 1693 if ((pp->pp_flags & PP_EMBEDDED) == 0) { 1694 if (LIST_EMPTY(&pp->pp_head.pvh_list)) { 1695 pp->pp_flags |= PP_EMBEDDED; 1696 pp->pp_pte.pte_ptp = ptp; 1697 pp->pp_pte.pte_va = va; 1698 1699 return pve; 1700 } 1701 } else { 1702 struct pv_entry *pve2; 1703 1704 pve2 = *sparepve; 1705 *sparepve = NULL; 1706 1707 pve2->pve_pte = pp->pp_pte; 1708 pp->pp_flags &= ~PP_EMBEDDED; 1709 LIST_INIT(&pp->pp_head.pvh_list); 1710 insert_pv(pp, pve2); 1711 } 1712 1713 pve->pve_pte.pte_ptp = ptp; 1714 pve->pve_pte.pte_va = va; 1715 insert_pv(pp, pve); 1716 1717 return NULL; 1718 } 1719 1720 /* 1721 * pmap_remove_pv: try to remove a mapping from a pv_list 1722 * 1723 * => caller should adjust ptp's wire_count and free PTP if needed 1724 * => we return the removed pve 1725 */ 1726 1727 static struct pv_entry * 1728 pmap_remove_pv(struct pmap_page *pp, struct vm_page *ptp, vaddr_t va) 1729 { 1730 struct pv_hash_head *hh; 1731 struct pv_entry *pve; 1732 kmutex_t *lock; 1733 u_int hash; 1734 1735 KASSERT(ptp == NULL || ptp->uobject != NULL); 1736 KASSERT(ptp == NULL || 
ptp_va2o(va, 1) == ptp->offset); 1737 1738 if ((pp->pp_flags & PP_EMBEDDED) != 0) { 1739 KASSERT(pp->pp_pte.pte_ptp == ptp); 1740 KASSERT(pp->pp_pte.pte_va == va); 1741 1742 pp->pp_flags &= ~PP_EMBEDDED; 1743 LIST_INIT(&pp->pp_head.pvh_list); 1744 1745 return NULL; 1746 } 1747 1748 hash = pvhash_hash(ptp, va); 1749 lock = pvhash_lock(hash); 1750 hh = pvhash_head(hash); 1751 mutex_spin_enter(lock); 1752 pve = pvhash_remove(hh, ptp, va); 1753 mutex_spin_exit(lock); 1754 1755 LIST_REMOVE(pve, pve_list); 1756 1757 return pve; 1758 } 1759 1760 /* 1761 * p t p f u n c t i o n s 1762 */ 1763 1764 static inline struct vm_page * 1765 pmap_find_ptp(struct pmap *pmap, vaddr_t va, paddr_t pa, int level) 1766 { 1767 int lidx = level - 1; 1768 struct vm_page *pg; 1769 1770 KASSERT(mutex_owned(pmap->pm_lock)); 1771 1772 if (pa != (paddr_t)-1 && pmap->pm_ptphint[lidx] && 1773 pa == VM_PAGE_TO_PHYS(pmap->pm_ptphint[lidx])) { 1774 return (pmap->pm_ptphint[lidx]); 1775 } 1776 PMAP_SUBOBJ_LOCK(pmap, lidx); 1777 pg = uvm_pagelookup(&pmap->pm_obj[lidx], ptp_va2o(va, level)); 1778 PMAP_SUBOBJ_UNLOCK(pmap, lidx); 1779 1780 KASSERT(pg == NULL || pg->wire_count >= 1); 1781 return pg; 1782 } 1783 1784 static inline void 1785 pmap_freepage(struct pmap *pmap, struct vm_page *ptp, int level) 1786 { 1787 lwp_t *l; 1788 int lidx; 1789 struct uvm_object *obj; 1790 1791 KASSERT(ptp->wire_count == 1); 1792 1793 lidx = level - 1; 1794 1795 obj = &pmap->pm_obj[lidx]; 1796 pmap_stats_update(pmap, -1, 0); 1797 if (lidx != 0) 1798 mutex_enter(obj->vmobjlock); 1799 if (pmap->pm_ptphint[lidx] == ptp) 1800 pmap->pm_ptphint[lidx] = TAILQ_FIRST(&obj->memq); 1801 ptp->wire_count = 0; 1802 uvm_pagerealloc(ptp, NULL, 0); 1803 l = curlwp; 1804 KASSERT((l->l_pflag & LP_INTR) == 0); 1805 VM_PAGE_TO_PP(ptp)->pp_link = l->l_md.md_gc_ptp; 1806 l->l_md.md_gc_ptp = ptp; 1807 if (lidx != 0) 1808 mutex_exit(obj->vmobjlock); 1809 } 1810 1811 static void 1812 pmap_free_ptp(struct pmap *pmap, struct vm_page *ptp, vaddr_t va, 1813 pt_entry_t *ptes, pd_entry_t * const *pdes) 1814 { 1815 unsigned long index; 1816 int level; 1817 vaddr_t invaladdr; 1818 pd_entry_t opde; 1819 #ifdef XEN 1820 struct pmap *curpmap = vm_map_pmap(&curlwp->l_proc->p_vmspace->vm_map); 1821 #ifdef MULTIPROCESSOR 1822 vaddr_t invaladdr2; 1823 #endif 1824 #endif 1825 1826 KASSERT(pmap != pmap_kernel()); 1827 KASSERT(mutex_owned(pmap->pm_lock)); 1828 KASSERT(kpreempt_disabled()); 1829 1830 level = 1; 1831 do { 1832 index = pl_i(va, level + 1); 1833 opde = pmap_pte_testset(&pdes[level - 1][index], 0); 1834 #if defined(XEN) 1835 # if defined(__x86_64__) 1836 /* 1837 * If ptp is a L3 currently mapped in kernel space, 1838 * clear it before freeing 1839 */ 1840 if (pmap_pdirpa(pmap, 0) == curcpu()->ci_xen_current_user_pgd 1841 && level == PTP_LEVELS - 1) { 1842 pmap_pte_set(&pmap_kernel()->pm_pdir[index], 0); 1843 /* 1844 * Update the per-cpu PD on all cpus the current 1845 * pmap is active on 1846 */ 1847 CPU_INFO_ITERATOR cii; 1848 struct cpu_info *ci; 1849 for (CPU_INFO_FOREACH(cii, ci)) { 1850 if (ci == NULL) { 1851 continue; 1852 } 1853 if (ci->ci_cpumask & pmap->pm_cpus) { 1854 pmap_pte_set(&ci->ci_kpm_pdir[index], 0); 1855 } 1856 } 1857 } 1858 # endif /*__x86_64__ */ 1859 invaladdr = level == 1 ? (vaddr_t)ptes : 1860 (vaddr_t)pdes[level - 2]; 1861 pmap_tlb_shootdown(curpmap, invaladdr + index * PAGE_SIZE, 1862 opde, TLBSHOOT_FREE_PTP1); 1863 # if defined(MULTIPROCESSOR) 1864 invaladdr2 = level == 1 ? 
(vaddr_t)PTE_BASE : 1865 (vaddr_t)normal_pdes[level - 2]; 1866 if (pmap != curpmap || invaladdr != invaladdr2) { 1867 pmap_tlb_shootdown(pmap, invaladdr2 + index * PAGE_SIZE, 1868 opde, TLBSHOOT_FREE_PTP2); 1869 } 1870 # endif /* MULTIPROCESSOR */ 1871 #else /* XEN */ 1872 invaladdr = level == 1 ? (vaddr_t)ptes : 1873 (vaddr_t)pdes[level - 2]; 1874 pmap_tlb_shootdown(pmap, invaladdr + index * PAGE_SIZE, 1875 opde, TLBSHOOT_FREE_PTP1); 1876 #endif /* XEN */ 1877 pmap_freepage(pmap, ptp, level); 1878 if (level < PTP_LEVELS - 1) { 1879 ptp = pmap_find_ptp(pmap, va, (paddr_t)-1, level + 1); 1880 ptp->wire_count--; 1881 if (ptp->wire_count > 1) 1882 break; 1883 } 1884 } while (++level < PTP_LEVELS); 1885 pmap_pte_flush(); 1886 } 1887 1888 /* 1889 * pmap_get_ptp: get a PTP (if there isn't one, allocate a new one) 1890 * 1891 * => pmap should NOT be pmap_kernel() 1892 * => pmap should be locked 1893 * => preemption should be disabled 1894 */ 1895 1896 static struct vm_page * 1897 pmap_get_ptp(struct pmap *pmap, vaddr_t va, pd_entry_t * const *pdes) 1898 { 1899 struct vm_page *ptp, *pptp; 1900 int i; 1901 unsigned long index; 1902 pd_entry_t *pva; 1903 paddr_t ppa, pa; 1904 struct uvm_object *obj; 1905 1906 KASSERT(pmap != pmap_kernel()); 1907 KASSERT(mutex_owned(pmap->pm_lock)); 1908 KASSERT(kpreempt_disabled()); 1909 1910 ptp = NULL; 1911 pa = (paddr_t)-1; 1912 1913 /* 1914 * Loop through all page table levels seeing if we need to 1915 * add a new page to that level. 1916 */ 1917 for (i = PTP_LEVELS; i > 1; i--) { 1918 /* 1919 * Save values from previous round. 1920 */ 1921 pptp = ptp; 1922 ppa = pa; 1923 1924 index = pl_i(va, i); 1925 pva = pdes[i - 2]; 1926 1927 if (pmap_valid_entry(pva[index])) { 1928 ppa = pmap_pte2pa(pva[index]); 1929 ptp = NULL; 1930 continue; 1931 } 1932 1933 obj = &pmap->pm_obj[i-2]; 1934 PMAP_SUBOBJ_LOCK(pmap, i - 2); 1935 ptp = uvm_pagealloc(obj, ptp_va2o(va, i - 1), NULL, 1936 UVM_PGA_USERESERVE|UVM_PGA_ZERO); 1937 PMAP_SUBOBJ_UNLOCK(pmap, i - 2); 1938 1939 if (ptp == NULL) 1940 return NULL; 1941 1942 ptp->flags &= ~PG_BUSY; /* never busy */ 1943 ptp->wire_count = 1; 1944 pmap->pm_ptphint[i - 2] = ptp; 1945 pa = VM_PAGE_TO_PHYS(ptp); 1946 pmap_pte_set(&pva[index], (pd_entry_t) 1947 (pmap_pa2pte(pa) | PG_u | PG_RW | PG_V)); 1948 #if defined(XEN) && defined(__x86_64__) 1949 /* 1950 * In Xen we must enter the mapping in kernel map too 1951 * if pmap is curmap and modifying top level (PGD) 1952 */ 1953 if(i == PTP_LEVELS && pmap != pmap_kernel()) { 1954 pmap_pte_set(&pmap_kernel()->pm_pdir[index], 1955 (pd_entry_t) (pmap_pa2pte(pa) 1956 | PG_u | PG_RW | PG_V)); 1957 /* 1958 * Update the per-cpu PD on all cpus the current 1959 * pmap is active on 1960 */ 1961 CPU_INFO_ITERATOR cii; 1962 struct cpu_info *ci; 1963 for (CPU_INFO_FOREACH(cii, ci)) { 1964 if (ci == NULL) { 1965 continue; 1966 } 1967 if (ci->ci_cpumask & pmap->pm_cpus) { 1968 pmap_pte_set(&ci->ci_kpm_pdir[index], 1969 (pd_entry_t) (pmap_pa2pte(pa) | PG_u | PG_RW | PG_V)); 1970 } 1971 } 1972 } 1973 #endif /* XEN && __x86_64__ */ 1974 pmap_pte_flush(); 1975 pmap_stats_update(pmap, 1, 0); 1976 /* 1977 * If we're not in the top level, increase the 1978 * wire count of the parent page. 1979 */ 1980 if (i < PTP_LEVELS) { 1981 if (pptp == NULL) 1982 pptp = pmap_find_ptp(pmap, va, ppa, i); 1983 #ifdef DIAGNOSTIC 1984 if (pptp == NULL) 1985 panic("pde page disappeared"); 1986 #endif 1987 pptp->wire_count++; 1988 } 1989 } 1990 1991 /* 1992 * ptp is not NULL if we just allocated a new ptp. 
If it's 1993 * still NULL, we must look up the existing one. 1994 */ 1995 if (ptp == NULL) { 1996 ptp = pmap_find_ptp(pmap, va, ppa, 1); 1997 #ifdef DIAGNOSTIC 1998 if (ptp == NULL) { 1999 printf("va %" PRIxVADDR " ppa %" PRIxPADDR "\n", 2000 va, ppa); 2001 panic("pmap_get_ptp: unmanaged user PTP"); 2002 } 2003 #endif 2004 } 2005 2006 pmap->pm_ptphint[0] = ptp; 2007 return(ptp); 2008 } 2009 2010 /* 2011 * p m a p l i f e c y c l e f u n c t i o n s 2012 */ 2013 2014 /* 2015 * pmap_pdp_ctor: constructor for the PDP cache. 2016 */ 2017 int 2018 pmap_pdp_ctor(void *arg, void *v, int flags) 2019 { 2020 pd_entry_t *pdir = v; 2021 paddr_t pdirpa = 0; /* XXX: GCC */ 2022 vaddr_t object; 2023 int i; 2024 2025 #if !defined(XEN) || !defined(__x86_64__) 2026 int npde; 2027 #endif 2028 #ifdef XEN 2029 int s; 2030 #endif 2031 2032 /* 2033 * NOTE: The `pmaps_lock' is held when the PDP is allocated. 2034 */ 2035 2036 #if defined(XEN) && defined(__x86_64__) 2037 /* fetch the physical address of the page directory. */ 2038 (void) pmap_extract(pmap_kernel(), (vaddr_t) pdir, &pdirpa); 2039 2040 /* zero init area */ 2041 memset (pdir, 0, PAGE_SIZE); /* Xen wants a clean page */ 2042 /* 2043 * this pdir will NEVER be active in kernel mode 2044 * so mark recursive entry invalid 2045 */ 2046 pdir[PDIR_SLOT_PTE] = pmap_pa2pte(pdirpa) | PG_u; 2047 /* 2048 * PDP constructed this way won't be for kernel, 2049 * hence we don't put kernel mappings on Xen. 2050 * But we need to make pmap_create() happy, so put a dummy (without 2051 * PG_V) value at the right place. 2052 */ 2053 pdir[PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1] - 1] = 2054 (pd_entry_t)-1 & PG_FRAME; 2055 #else /* XEN && __x86_64__*/ 2056 /* zero init area */ 2057 memset(pdir, 0, PDIR_SLOT_PTE * sizeof(pd_entry_t)); 2058 2059 object = (vaddr_t)v; 2060 for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) { 2061 /* fetch the physical address of the page directory. */ 2062 (void) pmap_extract(pmap_kernel(), object, &pdirpa); 2063 /* put in recursive PDE to map the PTEs */ 2064 pdir[PDIR_SLOT_PTE + i] = pmap_pa2pte(pdirpa) | PG_V; 2065 #ifndef XEN 2066 pdir[PDIR_SLOT_PTE + i] |= PG_KW; 2067 #endif 2068 } 2069 2070 /* copy kernel's PDE */ 2071 npde = nkptp[PTP_LEVELS - 1]; 2072 2073 memcpy(&pdir[PDIR_SLOT_KERN], &PDP_BASE[PDIR_SLOT_KERN], 2074 npde * sizeof(pd_entry_t)); 2075 2076 /* zero the rest */ 2077 memset(&pdir[PDIR_SLOT_KERN + npde], 0, 2078 (NTOPLEVEL_PDES - (PDIR_SLOT_KERN + npde)) * sizeof(pd_entry_t)); 2079 2080 if (VM_MIN_KERNEL_ADDRESS != KERNBASE) { 2081 int idx = pl_i(KERNBASE, PTP_LEVELS); 2082 2083 pdir[idx] = PDP_BASE[idx]; 2084 } 2085 #endif /* XEN && __x86_64__*/ 2086 #ifdef XEN 2087 s = splvm(); 2088 object = (vaddr_t)v; 2089 for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) { 2090 (void) pmap_extract(pmap_kernel(), object, &pdirpa); 2091 /* FIXME: This should use pmap_protect() .. 
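 * (until then the page is simply re-entered read-only with
 * pmap_kenter_pa() below, which Xen requires before the directory
 * can be pinned)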
*/ 2092 pmap_kenter_pa(object, pdirpa, VM_PROT_READ, 0); 2093 pmap_update(pmap_kernel()); 2094 /* 2095 * pin as L2/L4 page, we have to do the page with the 2096 * PDIR_SLOT_PTE entries last 2097 */ 2098 #ifdef PAE 2099 if (i == l2tol3(PDIR_SLOT_PTE)) 2100 continue; 2101 #endif 2102 2103 #ifdef __x86_64__ 2104 xpq_queue_pin_l4_table(xpmap_ptom_masked(pdirpa)); 2105 #else 2106 xpq_queue_pin_l2_table(xpmap_ptom_masked(pdirpa)); 2107 #endif 2108 } 2109 #ifdef PAE 2110 object = ((vaddr_t)pdir) + PAGE_SIZE * l2tol3(PDIR_SLOT_PTE); 2111 (void)pmap_extract(pmap_kernel(), object, &pdirpa); 2112 xpq_queue_pin_l2_table(xpmap_ptom_masked(pdirpa)); 2113 #endif 2114 splx(s); 2115 #endif /* XEN */ 2116 2117 return (0); 2118 } 2119 2120 /* 2121 * pmap_pdp_dtor: destructor for the PDP cache. 2122 */ 2123 2124 void 2125 pmap_pdp_dtor(void *arg, void *v) 2126 { 2127 #ifdef XEN 2128 paddr_t pdirpa = 0; /* XXX: GCC */ 2129 vaddr_t object = (vaddr_t)v; 2130 int i; 2131 int s = splvm(); 2132 pt_entry_t *pte; 2133 2134 for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) { 2135 /* fetch the physical address of the page directory. */ 2136 (void) pmap_extract(pmap_kernel(), object, &pdirpa); 2137 /* unpin page table */ 2138 xpq_queue_unpin_table(xpmap_ptom_masked(pdirpa)); 2139 } 2140 object = (vaddr_t)v; 2141 for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) { 2142 /* Set page RW again */ 2143 pte = kvtopte(object); 2144 xpq_queue_pte_update(xpmap_ptetomach(pte), *pte | PG_RW); 2145 xpq_queue_invlpg((vaddr_t)object); 2146 } 2147 splx(s); 2148 #endif /* XEN */ 2149 } 2150 2151 #ifdef PAE 2152 2153 /* pmap_pdp_alloc: Allocate a page for the pdp memory pool. */ 2154 2155 void * 2156 pmap_pdp_alloc(struct pool *pp, int flags) 2157 { 2158 return (void *)uvm_km_alloc(kernel_map, 2159 PAGE_SIZE * PDP_SIZE, PAGE_SIZE * PDP_SIZE, 2160 ((flags & PR_WAITOK) ? 0 : UVM_KMF_NOWAIT | UVM_KMF_TRYLOCK) 2161 | UVM_KMF_WIRED); 2162 } 2163 2164 /* 2165 * pmap_pdp_free: free a PDP 2166 */ 2167 2168 void 2169 pmap_pdp_free(struct pool *pp, void *v) 2170 { 2171 uvm_km_free(kernel_map, (vaddr_t)v, PAGE_SIZE * PDP_SIZE, 2172 UVM_KMF_WIRED); 2173 } 2174 #endif /* PAE */ 2175 2176 /* 2177 * pmap_create: create a pmap 2178 * 2179 * => note: old pmap interface took a "size" args which allowed for 2180 * the creation of "software only" pmaps (not in bsd). 
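 *
 * Illustrative use (sketch only, not a call site in this file):
 * a new user address space obtains its pmap with
 *
 *	struct pmap *pm = pmap_create();
 *
 * and gives it up again with pmap_destroy(pm) once the last
 * reference is dropped.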
2181 */ 2182 2183 struct pmap * 2184 pmap_create(void) 2185 { 2186 struct pmap *pmap; 2187 int i; 2188 2189 pmap = pool_cache_get(&pmap_cache, PR_WAITOK); 2190 2191 /* init uvm_object */ 2192 for (i = 0; i < PTP_LEVELS - 1; i++) { 2193 mutex_init(&pmap->pm_obj_lock[i], MUTEX_DEFAULT, IPL_NONE); 2194 uvm_obj_init(&pmap->pm_obj[i], NULL, false, 1); 2195 uvm_obj_setlock(&pmap->pm_obj[i], &pmap->pm_obj_lock[i]); 2196 pmap->pm_ptphint[i] = NULL; 2197 } 2198 pmap->pm_stats.wired_count = 0; 2199 /* count the PDP allocd below */ 2200 pmap->pm_stats.resident_count = PDP_SIZE; 2201 #if !defined(__x86_64__) 2202 pmap->pm_hiexec = 0; 2203 #endif /* !defined(__x86_64__) */ 2204 pmap->pm_flags = 0; 2205 pmap->pm_cpus = 0; 2206 pmap->pm_kernel_cpus = 0; 2207 pmap->pm_gc_ptp = NULL; 2208 2209 /* init the LDT */ 2210 pmap->pm_ldt = NULL; 2211 pmap->pm_ldt_len = 0; 2212 pmap->pm_ldt_sel = GSYSSEL(GLDT_SEL, SEL_KPL); 2213 2214 /* allocate PDP */ 2215 try_again: 2216 pmap->pm_pdir = pool_cache_get(&pmap_pdp_cache, PR_WAITOK); 2217 2218 mutex_enter(&pmaps_lock); 2219 2220 if (pmap->pm_pdir[PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1] - 1] == 0) { 2221 mutex_exit(&pmaps_lock); 2222 pool_cache_destruct_object(&pmap_pdp_cache, pmap->pm_pdir); 2223 goto try_again; 2224 } 2225 2226 for (i = 0; i < PDP_SIZE; i++) 2227 pmap->pm_pdirpa[i] = 2228 pmap_pte2pa(pmap->pm_pdir[PDIR_SLOT_PTE + i]); 2229 2230 LIST_INSERT_HEAD(&pmaps, pmap, pm_list); 2231 2232 mutex_exit(&pmaps_lock); 2233 2234 return (pmap); 2235 } 2236 2237 /* 2238 * pmap_free_ptps: put a list of ptps back to the freelist. 2239 */ 2240 2241 static void 2242 pmap_free_ptps(struct vm_page *empty_ptps) 2243 { 2244 struct vm_page *ptp; 2245 struct pmap_page *pp; 2246 2247 while ((ptp = empty_ptps) != NULL) { 2248 pp = VM_PAGE_TO_PP(ptp); 2249 empty_ptps = pp->pp_link; 2250 LIST_INIT(&pp->pp_head.pvh_list); 2251 uvm_pagefree(ptp); 2252 } 2253 } 2254 2255 /* 2256 * pmap_destroy: drop reference count on pmap. free pmap if 2257 * reference count goes to zero. 2258 */ 2259 2260 void 2261 pmap_destroy(struct pmap *pmap) 2262 { 2263 int i; 2264 #ifdef DIAGNOSTIC 2265 struct cpu_info *ci; 2266 CPU_INFO_ITERATOR cii; 2267 #endif /* DIAGNOSTIC */ 2268 lwp_t *l; 2269 2270 /* 2271 * If we have torn down this pmap, process deferred frees and 2272 * invalidations. Free now if the system is low on memory. 2273 * Otherwise, free when the pmap is destroyed thus avoiding a 2274 * TLB shootdown. 2275 */ 2276 l = curlwp; 2277 if (__predict_false(l->l_md.md_gc_pmap == pmap)) { 2278 if (uvmexp.free < uvmexp.freetarg) { 2279 pmap_update(pmap); 2280 } else { 2281 KASSERT(pmap->pm_gc_ptp == NULL); 2282 pmap->pm_gc_ptp = l->l_md.md_gc_ptp; 2283 l->l_md.md_gc_ptp = NULL; 2284 l->l_md.md_gc_pmap = NULL; 2285 } 2286 } 2287 2288 /* 2289 * drop reference count 2290 */ 2291 2292 if (atomic_dec_uint_nv(&pmap->pm_obj[0].uo_refs) > 0) { 2293 return; 2294 } 2295 2296 #ifdef DIAGNOSTIC 2297 for (CPU_INFO_FOREACH(cii, ci)) 2298 if (ci->ci_pmap == pmap) 2299 panic("destroying pmap being used"); 2300 #endif /* DIAGNOSTIC */ 2301 2302 /* 2303 * reference count is zero, free pmap resources and then free pmap. 
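 * (in order: clear any Xen APDP reference, unlink from the global
 * pmaps list, free deferred PTPs, return the PDP to its cache,
 * release any private LDT and finally the pmap itself)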
2304 */ 2305 #ifdef XEN 2306 /* 2307 * Xen lazy APDP handling: 2308 * clear APDP_PDE if pmap is the currently mapped 2309 */ 2310 if (xpmap_ptom_masked(pmap_pdirpa(pmap, 0)) == (*APDP_PDE & PG_FRAME)) { 2311 kpreempt_disable(); 2312 pmap_unmap_apdp(); 2313 pmap_pte_flush(); 2314 pmap_apte_flush(pmap_kernel()); 2315 kpreempt_enable(); 2316 } 2317 #endif 2318 2319 /* 2320 * remove it from global list of pmaps 2321 */ 2322 2323 mutex_enter(&pmaps_lock); 2324 LIST_REMOVE(pmap, pm_list); 2325 mutex_exit(&pmaps_lock); 2326 2327 /* 2328 * Process deferred PTP frees. No TLB shootdown required, as the 2329 * PTP pages are no longer visible to any CPU. 2330 */ 2331 2332 pmap_free_ptps(pmap->pm_gc_ptp); 2333 2334 /* 2335 * destroyed pmap shouldn't have remaining PTPs 2336 */ 2337 2338 for (i = 0; i < PTP_LEVELS - 1; i++) { 2339 KASSERT(pmap->pm_obj[i].uo_npages == 0); 2340 KASSERT(TAILQ_EMPTY(&pmap->pm_obj[i].memq)); 2341 } 2342 2343 /* 2344 * MULTIPROCESSOR -- no need to flush out of other processors' 2345 * APTE space because we do that in pmap_unmap_ptes(). 2346 */ 2347 pool_cache_put(&pmap_pdp_cache, pmap->pm_pdir); 2348 2349 #ifdef USER_LDT 2350 if (pmap->pm_ldt != NULL) { 2351 /* 2352 * no need to switch the LDT; this address space is gone, 2353 * nothing is using it. 2354 * 2355 * No need to lock the pmap for ldt_free (or anything else), 2356 * we're the last one to use it. 2357 */ 2358 mutex_enter(&cpu_lock); 2359 ldt_free(pmap->pm_ldt_sel); 2360 mutex_exit(&cpu_lock); 2361 uvm_km_free(kernel_map, (vaddr_t)pmap->pm_ldt, 2362 pmap->pm_ldt_len, UVM_KMF_WIRED); 2363 } 2364 #endif 2365 2366 for (i = 0; i < PTP_LEVELS - 1; i++) { 2367 uvm_obj_destroy(&pmap->pm_obj[i], false); 2368 mutex_destroy(&pmap->pm_obj_lock[i]); 2369 } 2370 pool_cache_put(&pmap_cache, pmap); 2371 } 2372 2373 /* 2374 * pmap_remove_all: pmap is being torn down by the current thread. 2375 * avoid unnecessary invalidations. 2376 */ 2377 2378 void 2379 pmap_remove_all(struct pmap *pmap) 2380 { 2381 lwp_t *l = curlwp; 2382 2383 KASSERT(l->l_md.md_gc_pmap == NULL); 2384 2385 l->l_md.md_gc_pmap = pmap; 2386 } 2387 2388 #if defined(PMAP_FORK) 2389 /* 2390 * pmap_fork: perform any necessary data structure manipulation when 2391 * a VM space is forked. 2392 */ 2393 2394 void 2395 pmap_fork(struct pmap *pmap1, struct pmap *pmap2) 2396 { 2397 #ifdef USER_LDT 2398 union descriptor *new_ldt; 2399 size_t len; 2400 int sel; 2401 2402 if (__predict_true(pmap1->pm_ldt == NULL)) { 2403 return; 2404 } 2405 2406 retry: 2407 if (pmap1->pm_ldt != NULL) { 2408 len = pmap1->pm_ldt_len; 2409 new_ldt = (union descriptor *)uvm_km_alloc(kernel_map, len, 0, 2410 UVM_KMF_WIRED); 2411 mutex_enter(&cpu_lock); 2412 sel = ldt_alloc(new_ldt, len); 2413 if (sel == -1) { 2414 mutex_exit(&cpu_lock); 2415 uvm_km_free(kernel_map, (vaddr_t)new_ldt, len, 2416 UVM_KMF_WIRED); 2417 printf("WARNING: pmap_fork: unable to allocate LDT\n"); 2418 return; 2419 } 2420 } else { 2421 len = -1; 2422 new_ldt = NULL; 2423 sel = -1; 2424 mutex_enter(&cpu_lock); 2425 } 2426 2427 /* Copy the LDT, if necessary. 
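 * (if pmap1's LDT was resized while we slept in the allocation
 * above, the length check below fails and we retry)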
*/ 2428 if (pmap1->pm_ldt != NULL) { 2429 if (len != pmap1->pm_ldt_len) { 2430 if (len != -1) { 2431 ldt_free(sel); 2432 uvm_km_free(kernel_map, (vaddr_t)new_ldt, 2433 len, UVM_KMF_WIRED); 2434 } 2435 mutex_exit(&cpu_lock); 2436 goto retry; 2437 } 2438 2439 memcpy(new_ldt, pmap1->pm_ldt, len); 2440 pmap2->pm_ldt = new_ldt; 2441 pmap2->pm_ldt_len = pmap1->pm_ldt_len; 2442 pmap2->pm_ldt_sel = sel; 2443 len = -1; 2444 } 2445 2446 if (len != -1) { 2447 ldt_free(sel); 2448 uvm_km_free(kernel_map, (vaddr_t)new_ldt, len, 2449 UVM_KMF_WIRED); 2450 } 2451 mutex_exit(&cpu_lock); 2452 #endif /* USER_LDT */ 2453 } 2454 #endif /* PMAP_FORK */ 2455 2456 #ifdef USER_LDT 2457 2458 /* 2459 * pmap_ldt_xcall: cross call used by pmap_ldt_sync. if the named pmap 2460 * is active, reload LDTR. 2461 */ 2462 static void 2463 pmap_ldt_xcall(void *arg1, void *arg2) 2464 { 2465 struct pmap *pm; 2466 2467 kpreempt_disable(); 2468 pm = arg1; 2469 if (curcpu()->ci_pmap == pm) { 2470 lldt(pm->pm_ldt_sel); 2471 } 2472 kpreempt_enable(); 2473 } 2474 2475 /* 2476 * pmap_ldt_sync: LDT selector for the named pmap is changing. swap 2477 * in the new selector on all CPUs. 2478 */ 2479 void 2480 pmap_ldt_sync(struct pmap *pm) 2481 { 2482 uint64_t where; 2483 2484 KASSERT(mutex_owned(&cpu_lock)); 2485 2486 pmap_ldt_evcnt.ev_count++; 2487 where = xc_broadcast(0, pmap_ldt_xcall, pm, NULL); 2488 xc_wait(where); 2489 } 2490 2491 /* 2492 * pmap_ldt_cleanup: if the pmap has a local LDT, deallocate it, and 2493 * restore the default. 2494 */ 2495 2496 void 2497 pmap_ldt_cleanup(struct lwp *l) 2498 { 2499 pmap_t pmap = l->l_proc->p_vmspace->vm_map.pmap; 2500 union descriptor *dp = NULL; 2501 size_t len = 0; 2502 int sel = -1; 2503 2504 if (__predict_true(pmap->pm_ldt == NULL)) { 2505 return; 2506 } 2507 2508 mutex_enter(&cpu_lock); 2509 if (pmap->pm_ldt != NULL) { 2510 sel = pmap->pm_ldt_sel; 2511 dp = pmap->pm_ldt; 2512 len = pmap->pm_ldt_len; 2513 pmap->pm_ldt_sel = GSYSSEL(GLDT_SEL, SEL_KPL); 2514 pmap->pm_ldt = NULL; 2515 pmap->pm_ldt_len = 0; 2516 pmap_ldt_sync(pmap); 2517 ldt_free(sel); 2518 uvm_km_free(kernel_map, (vaddr_t)dp, len, UVM_KMF_WIRED); 2519 } 2520 mutex_exit(&cpu_lock); 2521 } 2522 #endif /* USER_LDT */ 2523 2524 /* 2525 * pmap_activate: activate a process' pmap 2526 * 2527 * => must be called with kernel preemption disabled 2528 * => if lwp is the curlwp, then set ci_want_pmapload so that 2529 * actual MMU context switch will be done by pmap_load() later 2530 */ 2531 2532 void 2533 pmap_activate(struct lwp *l) 2534 { 2535 struct cpu_info *ci; 2536 struct pmap *pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map); 2537 2538 KASSERT(kpreempt_disabled()); 2539 2540 ci = curcpu(); 2541 2542 if (l == ci->ci_curlwp) { 2543 KASSERT(ci->ci_want_pmapload == 0); 2544 KASSERT(ci->ci_tlbstate != TLBSTATE_VALID); 2545 #ifdef KSTACK_CHECK_DR0 2546 /* 2547 * setup breakpoint on the top of stack 2548 */ 2549 if (l == &lwp0) 2550 dr0(0, 0, 0, 0); 2551 else 2552 dr0(KSTACK_LOWEST_ADDR(l), 1, 3, 1); 2553 #endif 2554 2555 /* 2556 * no need to switch to kernel vmspace because 2557 * it's a subset of any vmspace. 2558 */ 2559 2560 if (pmap == pmap_kernel()) { 2561 ci->ci_want_pmapload = 0; 2562 return; 2563 } 2564 2565 ci->ci_want_pmapload = 1; 2566 } 2567 } 2568 2569 /* 2570 * pmap_reactivate: try to regain reference to the pmap. 
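 * (returns true if our lazy reference was still valid and no TLB
 * flush is needed, false if the caller must reload/flush)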
2571 * 2572 * => must be called with kernel preemption disabled 2573 */ 2574 2575 static bool 2576 pmap_reactivate(struct pmap *pmap) 2577 { 2578 struct cpu_info *ci; 2579 uint32_t cpumask; 2580 bool result; 2581 uint32_t oldcpus; 2582 2583 ci = curcpu(); 2584 cpumask = ci->ci_cpumask; 2585 2586 KASSERT(kpreempt_disabled()); 2587 #if defined(XEN) && defined(__x86_64__) 2588 KASSERT(pmap_pdirpa(pmap, 0) == ci->ci_xen_current_user_pgd); 2589 #elif defined(PAE) 2590 KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(ci->ci_pae_l3_pdir[0])); 2591 #elif !defined(XEN) 2592 KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(rcr3())); 2593 #endif 2594 2595 /* 2596 * if we still have a lazy reference to this pmap, 2597 * we can assume that there was no tlb shootdown 2598 * for this pmap in the meantime. 2599 * 2600 * the order of events here is important as we must 2601 * synchronize with TLB shootdown interrupts. declare 2602 * interest in invalidations (TLBSTATE_VALID) and then 2603 * check the cpumask, which the IPIs can change only 2604 * when the state is TLBSTATE_LAZY. 2605 */ 2606 2607 ci->ci_tlbstate = TLBSTATE_VALID; 2608 oldcpus = pmap->pm_cpus; 2609 KASSERT((pmap->pm_kernel_cpus & cpumask) != 0); 2610 if (oldcpus & cpumask) { 2611 /* got it */ 2612 result = true; 2613 } else { 2614 /* must reload */ 2615 atomic_or_32(&pmap->pm_cpus, cpumask); 2616 result = false; 2617 } 2618 2619 return result; 2620 } 2621 2622 /* 2623 * pmap_load: actually switch pmap. (fill in %cr3 and LDT info) 2624 * 2625 * ensures that the current process' pmap is loaded on the current cpu's MMU 2626 * and there's no stale TLB entries. 2627 * 2628 * the caller should disable preemption or do check-and-retry to prevent 2629 * a preemption from undoing our efforts. 2630 * 2631 * this function can block. 2632 */ 2633 2634 void 2635 pmap_load(void) 2636 { 2637 struct cpu_info *ci; 2638 uint32_t cpumask; 2639 struct pmap *pmap; 2640 struct pmap *oldpmap; 2641 struct lwp *l; 2642 struct pcb *pcb; 2643 uint64_t ncsw; 2644 2645 kpreempt_disable(); 2646 retry: 2647 ci = curcpu(); 2648 if (!ci->ci_want_pmapload) { 2649 kpreempt_enable(); 2650 return; 2651 } 2652 cpumask = ci->ci_cpumask; 2653 l = ci->ci_curlwp; 2654 ncsw = l->l_ncsw; 2655 2656 /* should be able to take ipis. */ 2657 KASSERT(ci->ci_ilevel < IPL_HIGH); 2658 #ifdef XEN 2659 /* Check to see if interrupts are enabled (ie; no events are masked) */ 2660 KASSERT(x86_read_psl() == 0); 2661 #else 2662 KASSERT((x86_read_psl() & PSL_I) != 0); 2663 #endif 2664 2665 KASSERT(l != NULL); 2666 pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map); 2667 KASSERT(pmap != pmap_kernel()); 2668 oldpmap = ci->ci_pmap; 2669 pcb = lwp_getpcb(l); 2670 2671 if (pmap == oldpmap) { 2672 if (!pmap_reactivate(pmap)) { 2673 u_int gen = uvm_emap_gen_return(); 2674 2675 /* 2676 * pmap has been changed during deactivated. 2677 * our tlb may be stale. 2678 */ 2679 2680 tlbflush(); 2681 uvm_emap_update(gen); 2682 } 2683 2684 ci->ci_want_pmapload = 0; 2685 kpreempt_enable(); 2686 return; 2687 } 2688 2689 /* 2690 * grab a reference to the new pmap. 2691 */ 2692 2693 pmap_reference(pmap); 2694 2695 /* 2696 * actually switch pmap. 
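 * (drop the old pmap from this CPU's masks, declare TLBSTATE_VALID,
 * register on the new pmap's masks, then load %cr3 and the LDT via
 * cpu_load_pmap() and lldt())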
2697 */ 2698 2699 atomic_and_32(&oldpmap->pm_cpus, ~cpumask); 2700 atomic_and_32(&oldpmap->pm_kernel_cpus, ~cpumask); 2701 2702 #if defined(XEN) && defined(__x86_64__) 2703 KASSERT(pmap_pdirpa(oldpmap, 0) == ci->ci_xen_current_user_pgd || 2704 oldpmap == pmap_kernel()); 2705 #elif defined(PAE) 2706 KASSERT(pmap_pdirpa(oldpmap, 0) == pmap_pte2pa(ci->ci_pae_l3_pdir[0])); 2707 #elif !defined(XEN) 2708 KASSERT(pmap_pdirpa(oldpmap, 0) == pmap_pte2pa(rcr3())); 2709 #endif 2710 KASSERT((pmap->pm_cpus & cpumask) == 0); 2711 KASSERT((pmap->pm_kernel_cpus & cpumask) == 0); 2712 2713 /* 2714 * mark the pmap in use by this processor. again we must 2715 * synchronize with TLB shootdown interrupts, so set the 2716 * state VALID first, then register us for shootdown events 2717 * on this pmap. 2718 */ 2719 2720 ci->ci_tlbstate = TLBSTATE_VALID; 2721 atomic_or_32(&pmap->pm_cpus, cpumask); 2722 atomic_or_32(&pmap->pm_kernel_cpus, cpumask); 2723 ci->ci_pmap = pmap; 2724 2725 /* 2726 * update tss. now that we have registered for invalidations 2727 * from other CPUs, we're good to load the page tables. 2728 */ 2729 #ifdef PAE 2730 pcb->pcb_cr3 = ci->ci_pae_l3_pdirpa; 2731 #else 2732 pcb->pcb_cr3 = pmap_pdirpa(pmap, 0); 2733 #endif 2734 2735 #ifdef i386 2736 #ifdef XEN 2737 /* 2738 * clear APDP slot, in case it points to a page table that has 2739 * been freed 2740 */ 2741 if (*APDP_PDE) { 2742 pmap_unmap_apdp(); 2743 } 2744 /* lldt() does pmap_pte_flush() */ 2745 #endif /* XEN */ 2746 2747 #ifndef XEN 2748 ci->ci_tss.tss_ldt = pmap->pm_ldt_sel; 2749 ci->ci_tss.tss_cr3 = pcb->pcb_cr3; 2750 #endif /* !XEN */ 2751 #endif /* i386 */ 2752 2753 lldt(pmap->pm_ldt_sel); 2754 2755 u_int gen = uvm_emap_gen_return(); 2756 cpu_load_pmap(pmap); 2757 uvm_emap_update(gen); 2758 2759 ci->ci_want_pmapload = 0; 2760 2761 /* 2762 * we're now running with the new pmap. drop the reference 2763 * to the old pmap. if we block, we need to go around again. 2764 */ 2765 2766 pmap_destroy(oldpmap); 2767 if (l->l_ncsw != ncsw) { 2768 goto retry; 2769 } 2770 2771 kpreempt_enable(); 2772 } 2773 2774 /* 2775 * pmap_deactivate: deactivate a process' pmap. 2776 * 2777 * => Must be called with kernel preemption disabled (high IPL is enough). 2778 */ 2779 void 2780 pmap_deactivate(struct lwp *l) 2781 { 2782 struct pmap *pmap; 2783 struct cpu_info *ci; 2784 2785 KASSERT(kpreempt_disabled()); 2786 2787 if (l != curlwp) { 2788 return; 2789 } 2790 2791 /* 2792 * Wait for pending TLB shootdowns to complete. Necessary because 2793 * TLB shootdown state is per-CPU, and the LWP may be coming off 2794 * the CPU before it has a chance to call pmap_update(), e.g. due 2795 * to kernel preemption or blocking routine in between. 2796 */ 2797 pmap_tlb_shootnow(); 2798 2799 ci = curcpu(); 2800 2801 if (ci->ci_want_pmapload) { 2802 /* 2803 * ci_want_pmapload means that our pmap is not loaded on 2804 * the CPU or TLB might be stale. note that pmap_kernel() 2805 * is always considered loaded. 2806 */ 2807 KASSERT(vm_map_pmap(&l->l_proc->p_vmspace->vm_map) 2808 != pmap_kernel()); 2809 KASSERT(vm_map_pmap(&l->l_proc->p_vmspace->vm_map) 2810 != ci->ci_pmap || ci->ci_tlbstate != TLBSTATE_VALID); 2811 2812 /* 2813 * userspace has not been touched. 2814 * nothing to do here. 
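 * (clearing ci_want_pmapload is sufficient; this pmap was never
 * actually loaded on this CPU's MMU)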
2815 */ 2816 2817 ci->ci_want_pmapload = 0; 2818 return; 2819 } 2820 2821 pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map); 2822 2823 if (pmap == pmap_kernel()) { 2824 return; 2825 } 2826 2827 #if defined(XEN) && defined(__x86_64__) 2828 KASSERT(pmap_pdirpa(pmap, 0) == ci->ci_xen_current_user_pgd); 2829 #elif defined(PAE) 2830 KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(ci->ci_pae_l3_pdir[0])); 2831 #elif !defined(XEN) 2832 KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(rcr3())); 2833 #endif 2834 KASSERT(ci->ci_pmap == pmap); 2835 2836 /* 2837 * we aren't interested in TLB invalidations for this pmap, 2838 * at least for the time being. 2839 */ 2840 2841 KASSERT(ci->ci_tlbstate == TLBSTATE_VALID); 2842 ci->ci_tlbstate = TLBSTATE_LAZY; 2843 } 2844 2845 /* 2846 * end of lifecycle functions 2847 */ 2848 2849 /* 2850 * some misc. functions 2851 */ 2852 2853 int 2854 pmap_pdes_invalid(vaddr_t va, pd_entry_t * const *pdes, pd_entry_t *lastpde) 2855 { 2856 int i; 2857 unsigned long index; 2858 pd_entry_t pde; 2859 2860 for (i = PTP_LEVELS; i > 1; i--) { 2861 index = pl_i(va, i); 2862 pde = pdes[i - 2][index]; 2863 if ((pde & PG_V) == 0) 2864 return i; 2865 } 2866 if (lastpde != NULL) 2867 *lastpde = pde; 2868 return 0; 2869 } 2870 2871 /* 2872 * pmap_extract: extract a PA for the given VA 2873 */ 2874 2875 bool 2876 pmap_extract(struct pmap *pmap, vaddr_t va, paddr_t *pap) 2877 { 2878 pt_entry_t *ptes, pte; 2879 pd_entry_t pde; 2880 pd_entry_t * const *pdes; 2881 struct pmap *pmap2; 2882 struct cpu_info *ci; 2883 paddr_t pa; 2884 lwp_t *l; 2885 bool hard, rv; 2886 2887 rv = false; 2888 pa = 0; 2889 l = curlwp; 2890 2891 KPREEMPT_DISABLE(l); 2892 ci = l->l_cpu; 2893 if (__predict_true(!ci->ci_want_pmapload && ci->ci_pmap == pmap) || 2894 pmap == pmap_kernel()) { 2895 /* 2896 * no need to lock, because it's pmap_kernel() or our 2897 * own pmap and is active. if a user pmap, the caller 2898 * will hold the vm_map write/read locked and so prevent 2899 * entries from disappearing while we are here. ptps 2900 * can disappear via pmap_remove() and pmap_protect(), 2901 * but they are called with the vm_map write locked. 2902 */ 2903 hard = false; 2904 ptes = PTE_BASE; 2905 pdes = normal_pdes; 2906 } else { 2907 /* we lose, do it the hard way. */ 2908 hard = true; 2909 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); 2910 } 2911 if (pmap_pdes_valid(va, pdes, &pde)) { 2912 pte = ptes[pl1_i(va)]; 2913 if (pde & PG_PS) { 2914 pa = (pde & PG_LGFRAME) | (va & (NBPD_L2 - 1)); 2915 rv = true; 2916 } else if (__predict_true((pte & PG_V) != 0)) { 2917 pa = pmap_pte2pa(pte) | (va & (NBPD_L1 - 1)); 2918 rv = true; 2919 } 2920 } 2921 if (__predict_false(hard)) { 2922 pmap_unmap_ptes(pmap, pmap2); 2923 } 2924 KPREEMPT_ENABLE(l); 2925 if (pap != NULL) { 2926 *pap = pa; 2927 } 2928 return rv; 2929 } 2930 2931 2932 /* 2933 * vtophys: virtual address to physical address. For use by 2934 * machine-dependent code only. 2935 */ 2936 2937 paddr_t 2938 vtophys(vaddr_t va) 2939 { 2940 paddr_t pa; 2941 2942 if (pmap_extract(pmap_kernel(), va, &pa) == true) 2943 return (pa); 2944 return (0); 2945 } 2946 2947 __strict_weak_alias(pmap_extract_ma, pmap_extract); 2948 2949 #ifdef XEN 2950 2951 /* 2952 * vtomach: virtual address to machine address. For use by 2953 * machine-dependent code only. 
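 *
 * Typical use (illustrative sketch only), mirroring vtophys() above;
 * a return value of 0 means the kernel VA was not mapped:
 *
 *	paddr_t ma = vtomach(va);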
2954 */ 2955 2956 paddr_t 2957 vtomach(vaddr_t va) 2958 { 2959 paddr_t pa; 2960 2961 if (pmap_extract_ma(pmap_kernel(), va, &pa) == true) 2962 return (pa); 2963 return (0); 2964 } 2965 2966 #endif /* XEN */ 2967 2968 /* 2969 * pmap_virtual_space: used during bootup [pmap_steal_memory] to 2970 * determine the bounds of the kernel virtual addess space. 2971 */ 2972 2973 void 2974 pmap_virtual_space(vaddr_t *startp, vaddr_t *endp) 2975 { 2976 *startp = virtual_avail; 2977 *endp = virtual_end; 2978 } 2979 2980 /* 2981 * pmap_map: map a range of PAs into kvm. 2982 * 2983 * => used during crash dump 2984 * => XXX: pmap_map() should be phased out? 2985 */ 2986 2987 vaddr_t 2988 pmap_map(vaddr_t va, paddr_t spa, paddr_t epa, vm_prot_t prot) 2989 { 2990 while (spa < epa) { 2991 pmap_kenter_pa(va, spa, prot, 0); 2992 va += PAGE_SIZE; 2993 spa += PAGE_SIZE; 2994 } 2995 pmap_update(pmap_kernel()); 2996 return va; 2997 } 2998 2999 /* 3000 * pmap_zero_page: zero a page 3001 */ 3002 3003 void 3004 pmap_zero_page(paddr_t pa) 3005 { 3006 pt_entry_t *zpte; 3007 void *zerova; 3008 int id; 3009 3010 kpreempt_disable(); 3011 id = cpu_number(); 3012 zpte = PTESLEW(zero_pte, id); 3013 zerova = VASLEW(zerop, id); 3014 3015 #ifdef DIAGNOSTIC 3016 if (*zpte) 3017 panic("pmap_zero_page: lock botch"); 3018 #endif 3019 3020 pmap_pte_set(zpte, pmap_pa2pte(pa) | PG_V | PG_RW | PG_M | PG_U | PG_k); 3021 pmap_pte_flush(); 3022 pmap_update_pg((vaddr_t)zerova); /* flush TLB */ 3023 3024 memset(zerova, 0, PAGE_SIZE); 3025 3026 #if defined(DIAGNOSTIC) || defined(XEN) 3027 pmap_pte_set(zpte, 0); /* zap ! */ 3028 pmap_pte_flush(); 3029 #endif 3030 kpreempt_enable(); 3031 } 3032 3033 /* 3034 * pmap_pagezeroidle: the same, for the idle loop page zero'er. 3035 * Returns true if the page was zero'd, false if we aborted for 3036 * some reason. 3037 */ 3038 3039 bool 3040 pmap_pageidlezero(paddr_t pa) 3041 { 3042 pt_entry_t *zpte; 3043 void *zerova; 3044 bool rv; 3045 int id; 3046 3047 id = cpu_number(); 3048 zpte = PTESLEW(zero_pte, id); 3049 zerova = VASLEW(zerop, id); 3050 3051 KASSERT(cpu_feature[0] & CPUID_SSE2); 3052 KASSERT(*zpte == 0); 3053 3054 pmap_pte_set(zpte, pmap_pa2pte(pa) | PG_V | PG_RW | PG_M | PG_U | PG_k); 3055 pmap_pte_flush(); 3056 pmap_update_pg((vaddr_t)zerova); /* flush TLB */ 3057 3058 rv = sse2_idlezero_page(zerova); 3059 3060 #if defined(DIAGNOSTIC) || defined(XEN) 3061 pmap_pte_set(zpte, 0); /* zap ! 
*/ 3062 pmap_pte_flush(); 3063 #endif 3064 3065 return rv; 3066 } 3067 3068 /* 3069 * pmap_copy_page: copy a page 3070 */ 3071 3072 void 3073 pmap_copy_page(paddr_t srcpa, paddr_t dstpa) 3074 { 3075 pt_entry_t *spte; 3076 pt_entry_t *dpte; 3077 void *csrcva; 3078 void *cdstva; 3079 int id; 3080 3081 kpreempt_disable(); 3082 id = cpu_number(); 3083 spte = PTESLEW(csrc_pte,id); 3084 dpte = PTESLEW(cdst_pte,id); 3085 csrcva = VASLEW(csrcp, id); 3086 cdstva = VASLEW(cdstp, id); 3087 3088 KASSERT(*spte == 0 && *dpte == 0); 3089 3090 pmap_pte_set(spte, pmap_pa2pte(srcpa) | PG_V | PG_RW | PG_U | PG_k); 3091 pmap_pte_set(dpte, 3092 pmap_pa2pte(dstpa) | PG_V | PG_RW | PG_M | PG_U | PG_k); 3093 pmap_pte_flush(); 3094 pmap_update_2pg((vaddr_t)csrcva, (vaddr_t)cdstva); 3095 3096 memcpy(cdstva, csrcva, PAGE_SIZE); 3097 3098 #if defined(DIAGNOSTIC) || defined(XEN) 3099 pmap_pte_set(spte, 0); 3100 pmap_pte_set(dpte, 0); 3101 pmap_pte_flush(); 3102 #endif 3103 kpreempt_enable(); 3104 } 3105 3106 static pt_entry_t * 3107 pmap_map_ptp(struct vm_page *ptp) 3108 { 3109 pt_entry_t *ptppte; 3110 void *ptpva; 3111 int id; 3112 3113 KASSERT(kpreempt_disabled()); 3114 3115 id = cpu_number(); 3116 ptppte = PTESLEW(ptp_pte, id); 3117 ptpva = VASLEW(ptpp, id); 3118 #if !defined(XEN) 3119 pmap_pte_set(ptppte, pmap_pa2pte(VM_PAGE_TO_PHYS(ptp)) | PG_V | PG_M | 3120 PG_RW | PG_U | PG_k); 3121 #else 3122 pmap_pte_set(ptppte, pmap_pa2pte(VM_PAGE_TO_PHYS(ptp)) | PG_V | PG_M | 3123 PG_U | PG_k); 3124 #endif 3125 pmap_pte_flush(); 3126 pmap_update_pg((vaddr_t)ptpva); 3127 3128 return (pt_entry_t *)ptpva; 3129 } 3130 3131 static void 3132 pmap_unmap_ptp(void) 3133 { 3134 #if defined(DIAGNOSTIC) || defined(XEN) 3135 pt_entry_t *pte; 3136 3137 KASSERT(kpreempt_disabled()); 3138 3139 pte = PTESLEW(ptp_pte, cpu_number()); 3140 if (*pte != 0) { 3141 pmap_pte_set(pte, 0); 3142 pmap_pte_flush(); 3143 } 3144 #endif 3145 } 3146 3147 static pt_entry_t * 3148 pmap_map_pte(struct pmap *pmap, struct vm_page *ptp, vaddr_t va) 3149 { 3150 3151 KASSERT(kpreempt_disabled()); 3152 if (pmap_is_curpmap(pmap)) { 3153 return &PTE_BASE[pl1_i(va)]; /* (k)vtopte */ 3154 } 3155 KASSERT(ptp != NULL); 3156 return pmap_map_ptp(ptp) + pl1_pi(va); 3157 } 3158 3159 static void 3160 pmap_unmap_pte(void) 3161 { 3162 3163 KASSERT(kpreempt_disabled()); 3164 3165 pmap_unmap_ptp(); 3166 } 3167 3168 /* 3169 * p m a p r e m o v e f u n c t i o n s 3170 * 3171 * functions that remove mappings 3172 */ 3173 3174 /* 3175 * pmap_remove_ptes: remove PTEs from a PTP 3176 * 3177 * => caller must hold pmap's lock 3178 * => PTP must be mapped into KVA 3179 * => PTP should be null if pmap == pmap_kernel() 3180 * => must be called with kernel preemption disabled 3181 * => returns composite pte if at least one page should be shot down 3182 */ 3183 3184 static void 3185 pmap_remove_ptes(struct pmap *pmap, struct vm_page *ptp, vaddr_t ptpva, 3186 vaddr_t startva, vaddr_t endva, struct pv_entry **pv_tofree) 3187 { 3188 pt_entry_t *pte = (pt_entry_t *)ptpva; 3189 3190 KASSERT(pmap == pmap_kernel() || mutex_owned(pmap->pm_lock)); 3191 KASSERT(kpreempt_disabled()); 3192 3193 /* 3194 * note that ptpva points to the PTE that maps startva. this may 3195 * or may not be the first PTE in the PTP. 3196 * 3197 * we loop through the PTP while there are still PTEs to look at 3198 * and the wire_count is greater than 1 (because we use the wire_count 3199 * to keep track of the number of real PTEs in the PTP). 
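 * (the count is biased by one: a PTP whose wire_count has dropped
 * to 1 maps nothing and may be freed by the caller)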
3200 */ 3201 while (startva < endva && (ptp == NULL || ptp->wire_count > 1)) { 3202 (void)pmap_remove_pte(pmap, ptp, pte, startva, pv_tofree); 3203 startva += PAGE_SIZE; 3204 pte++; 3205 } 3206 } 3207 3208 3209 /* 3210 * pmap_remove_pte: remove a single PTE from a PTP. 3211 * 3212 * => caller must hold pmap's lock 3213 * => PTP must be mapped into KVA 3214 * => PTP should be null if pmap == pmap_kernel() 3215 * => returns true if we removed a mapping 3216 * => must be called with kernel preemption disabled 3217 */ 3218 static bool 3219 pmap_remove_pte(struct pmap *pmap, struct vm_page *ptp, pt_entry_t *pte, 3220 vaddr_t va, struct pv_entry **pv_tofree) 3221 { 3222 struct pv_entry *pve; 3223 struct vm_page *pg; 3224 struct pmap_page *pp; 3225 pt_entry_t opte; 3226 3227 KASSERT(pmap == pmap_kernel() || mutex_owned(pmap->pm_lock)); 3228 KASSERT(kpreempt_disabled()); 3229 3230 if (!pmap_valid_entry(*pte)) { 3231 /* VA not mapped. */ 3232 return false; 3233 } 3234 3235 /* Atomically save the old PTE and zap it. */ 3236 opte = pmap_pte_testset(pte, 0); 3237 if (!pmap_valid_entry(opte)) { 3238 return false; 3239 } 3240 3241 pmap_exec_account(pmap, va, opte, 0); 3242 pmap_stats_update_bypte(pmap, 0, opte); 3243 3244 if (ptp) { 3245 /* 3246 * Dropping a PTE. Make sure that the PDE is flushed. 3247 */ 3248 ptp->wire_count--; 3249 if (ptp->wire_count <= 1) { 3250 opte |= PG_U; 3251 } 3252 } 3253 3254 if ((opte & PG_U) != 0) { 3255 pmap_tlb_shootdown(pmap, va, opte, TLBSHOOT_REMOVE_PTE); 3256 } 3257 3258 /* 3259 * If we are not on a pv_head list - we are done. 3260 */ 3261 if ((opte & PG_PVLIST) == 0) { 3262 #if defined(DIAGNOSTIC) && !defined(DOM0OPS) 3263 if (PHYS_TO_VM_PAGE(pmap_pte2pa(opte)) != NULL) 3264 panic("pmap_remove_pte: managed page without " 3265 "PG_PVLIST for %#" PRIxVADDR, va); 3266 #endif 3267 return true; 3268 } 3269 3270 pg = PHYS_TO_VM_PAGE(pmap_pte2pa(opte)); 3271 3272 KASSERTMSG(pg != NULL, "pmap_remove_pte: unmanaged page marked " 3273 "PG_PVLIST, va = %#" PRIxVADDR ", pa = %#" PRIxPADDR, 3274 va, (paddr_t)pmap_pte2pa(opte)); 3275 3276 KASSERT(uvm_page_locked_p(pg)); 3277 3278 /* Sync R/M bits. */ 3279 pp = VM_PAGE_TO_PP(pg); 3280 pp->pp_attrs |= opte; 3281 pve = pmap_remove_pv(pp, ptp, va); 3282 3283 if (pve) { 3284 pve->pve_next = *pv_tofree; 3285 *pv_tofree = pve; 3286 } 3287 return true; 3288 } 3289 3290 /* 3291 * pmap_remove: mapping removal function. 3292 * 3293 * => caller should not be holding any pmap locks 3294 */ 3295 3296 void 3297 pmap_remove(struct pmap *pmap, vaddr_t sva, vaddr_t eva) 3298 { 3299 pt_entry_t *ptes; 3300 pd_entry_t pde; 3301 pd_entry_t * const *pdes; 3302 struct pv_entry *pv_tofree = NULL; 3303 bool result; 3304 int i; 3305 paddr_t ptppa; 3306 vaddr_t blkendva, va = sva; 3307 struct vm_page *ptp; 3308 struct pmap *pmap2; 3309 3310 kpreempt_disable(); 3311 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); /* locks pmap */ 3312 3313 /* 3314 * removing one page? take shortcut function. 3315 */ 3316 3317 if (va + PAGE_SIZE == eva) { 3318 if (pmap_pdes_valid(va, pdes, &pde)) { 3319 3320 /* PA of the PTP */ 3321 ptppa = pmap_pte2pa(pde); 3322 3323 /* Get PTP if non-kernel mapping. */ 3324 if (pmap != pmap_kernel()) { 3325 ptp = pmap_find_ptp(pmap, va, ptppa, 1); 3326 KASSERTMSG(ptp != NULL, 3327 "pmap_remove: unmanaged PTP detected"); 3328 } else { 3329 /* Never free kernel PTPs. 
*/ 3330 ptp = NULL; 3331 } 3332 3333 result = pmap_remove_pte(pmap, ptp, 3334 &ptes[pl1_i(va)], va, &pv_tofree); 3335 3336 /* 3337 * if mapping removed and the PTP is no longer 3338 * being used, free it! 3339 */ 3340 3341 if (result && ptp && ptp->wire_count <= 1) 3342 pmap_free_ptp(pmap, ptp, va, ptes, pdes); 3343 } 3344 } else for (/* null */ ; va < eva ; va = blkendva) { 3345 int lvl; 3346 3347 /* determine range of block */ 3348 blkendva = x86_round_pdr(va+1); 3349 if (blkendva > eva) 3350 blkendva = eva; 3351 3352 /* 3353 * XXXCDC: our PTE mappings should never be removed 3354 * with pmap_remove! if we allow this (and why would 3355 * we?) then we end up freeing the pmap's page 3356 * directory page (PDP) before we are finished using 3357 * it when we hit in in the recursive mapping. this 3358 * is BAD. 3359 * 3360 * long term solution is to move the PTEs out of user 3361 * address space. and into kernel address space (up 3362 * with APTE). then we can set VM_MAXUSER_ADDRESS to 3363 * be VM_MAX_ADDRESS. 3364 */ 3365 3366 /* XXXCDC: ugly hack to avoid freeing PDP here */ 3367 for (i = 0; i < PDP_SIZE; i++) { 3368 if (pl_i(va, PTP_LEVELS) == PDIR_SLOT_PTE+i) 3369 continue; 3370 } 3371 3372 lvl = pmap_pdes_invalid(va, pdes, &pde); 3373 if (lvl != 0) { 3374 /* 3375 * skip a range corresponding to an invalid pde. 3376 */ 3377 blkendva = (va & ptp_masks[lvl - 1]) + nbpd[lvl - 1]; 3378 continue; 3379 } 3380 3381 /* PA of the PTP */ 3382 ptppa = pmap_pte2pa(pde); 3383 3384 /* Get PTP if non-kernel mapping. */ 3385 if (pmap != pmap_kernel()) { 3386 ptp = pmap_find_ptp(pmap, va, ptppa, 1); 3387 KASSERTMSG(ptp != NULL, 3388 "pmap_remove: unmanaged PTP detected"); 3389 } else { 3390 /* Never free kernel PTPs. */ 3391 ptp = NULL; 3392 } 3393 3394 pmap_remove_ptes(pmap, ptp, (vaddr_t)&ptes[pl1_i(va)], va, 3395 blkendva, &pv_tofree); 3396 3397 /* if PTP is no longer being used, free it! */ 3398 if (ptp && ptp->wire_count <= 1) { 3399 pmap_free_ptp(pmap, ptp, va, ptes, pdes); 3400 } 3401 } 3402 pmap_unmap_ptes(pmap, pmap2); /* unlock pmap */ 3403 kpreempt_enable(); 3404 3405 /* Now we free unused PVs */ 3406 if (pv_tofree) 3407 pmap_free_pvs(pv_tofree); 3408 } 3409 3410 /* 3411 * pmap_sync_pv: clear pte bits and return the old value of the pte. 3412 * 3413 * => Caller should disable kernel preemption. 3414 * => issues tlb shootdowns if necessary. 3415 */ 3416 3417 static int 3418 pmap_sync_pv(struct pv_pte *pvpte, pt_entry_t expect, int clearbits, 3419 pt_entry_t *optep) 3420 { 3421 struct pmap *pmap; 3422 struct vm_page *ptp; 3423 vaddr_t va; 3424 pt_entry_t *ptep; 3425 pt_entry_t opte; 3426 pt_entry_t npte; 3427 bool need_shootdown; 3428 3429 ptp = pvpte->pte_ptp; 3430 va = pvpte->pte_va; 3431 KASSERT(ptp == NULL || ptp->uobject != NULL); 3432 KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset); 3433 pmap = ptp_to_pmap(ptp); 3434 3435 KASSERT((expect & ~(PG_FRAME | PG_V)) == 0); 3436 KASSERT((expect & PG_V) != 0); 3437 KASSERT(clearbits == ~0 || (clearbits & ~(PG_M | PG_U | PG_RW)) == 0); 3438 KASSERT(kpreempt_disabled()); 3439 3440 ptep = pmap_map_pte(pmap, ptp, va); 3441 do { 3442 opte = *ptep; 3443 KASSERT((opte & (PG_M | PG_U)) != PG_M); 3444 KASSERT((opte & (PG_U | PG_V)) != PG_U); 3445 KASSERT(opte == 0 || (opte & PG_V) != 0); 3446 if ((opte & (PG_FRAME | PG_V)) != expect) { 3447 3448 /* 3449 * we lost a race with a V->P operation like 3450 * pmap_remove(). wait for the competitor 3451 * reflecting pte bits into mp_attrs. 
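 * (our caller sees EAGAIN, backs off and retries; see
 * pmap_page_remove() and pmap_clear_attrs())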
3452 * 3453 * issue a redundant TLB shootdown so that 3454 * we can wait for its completion. 3455 */ 3456 3457 pmap_unmap_pte(); 3458 if (clearbits != 0) { 3459 pmap_tlb_shootdown(pmap, va, 3460 (pmap == pmap_kernel() ? PG_G : 0), 3461 TLBSHOOT_SYNC_PV1); 3462 } 3463 return EAGAIN; 3464 } 3465 3466 /* 3467 * check if there's anything to do on this pte. 3468 */ 3469 3470 if ((opte & clearbits) == 0) { 3471 need_shootdown = false; 3472 break; 3473 } 3474 3475 /* 3476 * we need a shootdown if the pte is cached. (PG_U) 3477 * 3478 * ...unless we are clearing only the PG_RW bit and 3479 * it isn't cached as RW. (PG_M) 3480 */ 3481 3482 need_shootdown = (opte & PG_U) != 0 && 3483 !(clearbits == PG_RW && (opte & PG_M) == 0); 3484 3485 npte = opte & ~clearbits; 3486 3487 /* 3488 * if we need a shootdown anyway, clear PG_U and PG_M. 3489 */ 3490 3491 if (need_shootdown) { 3492 npte &= ~(PG_U | PG_M); 3493 } 3494 KASSERT((npte & (PG_M | PG_U)) != PG_M); 3495 KASSERT((npte & (PG_U | PG_V)) != PG_U); 3496 KASSERT(npte == 0 || (opte & PG_V) != 0); 3497 } while (pmap_pte_cas(ptep, opte, npte) != opte); 3498 3499 if (need_shootdown) { 3500 pmap_tlb_shootdown(pmap, va, opte, TLBSHOOT_SYNC_PV2); 3501 } 3502 pmap_unmap_pte(); 3503 3504 *optep = opte; 3505 return 0; 3506 } 3507 3508 /* 3509 * pmap_page_remove: remove a managed vm_page from all pmaps that map it 3510 * 3511 * => R/M bits are sync'd back to attrs 3512 */ 3513 3514 void 3515 pmap_page_remove(struct vm_page *pg) 3516 { 3517 struct pmap_page *pp; 3518 struct pv_pte *pvpte; 3519 struct pv_entry *killlist = NULL; 3520 struct vm_page *ptp; 3521 pt_entry_t expect; 3522 lwp_t *l; 3523 int count; 3524 3525 KASSERT(uvm_page_locked_p(pg)); 3526 3527 l = curlwp; 3528 pp = VM_PAGE_TO_PP(pg); 3529 expect = pmap_pa2pte(VM_PAGE_TO_PHYS(pg)) | PG_V; 3530 count = SPINLOCK_BACKOFF_MIN; 3531 kpreempt_disable(); 3532 startover: 3533 while ((pvpte = pv_pte_first(pp)) != NULL) { 3534 struct pmap *pmap; 3535 struct pv_entry *pve; 3536 pt_entry_t opte; 3537 vaddr_t va; 3538 int error; 3539 3540 /* 3541 * add a reference to the pmap before clearing the pte. 3542 * otherwise the pmap can disappear behind us. 3543 */ 3544 3545 ptp = pvpte->pte_ptp; 3546 pmap = ptp_to_pmap(ptp); 3547 if (ptp != NULL) { 3548 pmap_reference(pmap); 3549 } 3550 3551 error = pmap_sync_pv(pvpte, expect, ~0, &opte); 3552 if (error == EAGAIN) { 3553 int hold_count; 3554 KERNEL_UNLOCK_ALL(curlwp, &hold_count); 3555 if (ptp != NULL) { 3556 pmap_destroy(pmap); 3557 } 3558 SPINLOCK_BACKOFF(count); 3559 KERNEL_LOCK(hold_count, curlwp); 3560 goto startover; 3561 } 3562 3563 pp->pp_attrs |= opte; 3564 va = pvpte->pte_va; 3565 pve = pmap_remove_pv(pp, ptp, va); 3566 3567 /* update the PTP reference count. free if last reference. */ 3568 if (ptp != NULL) { 3569 struct pmap *pmap2; 3570 pt_entry_t *ptes; 3571 pd_entry_t * const *pdes; 3572 3573 KASSERT(pmap != pmap_kernel()); 3574 3575 pmap_tlb_shootnow(); 3576 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); 3577 pmap_stats_update_bypte(pmap, 0, opte); 3578 ptp->wire_count--; 3579 if (ptp->wire_count <= 1) { 3580 pmap_free_ptp(pmap, ptp, va, ptes, pdes); 3581 } 3582 pmap_unmap_ptes(pmap, pmap2); 3583 pmap_destroy(pmap); 3584 } else { 3585 KASSERT(pmap == pmap_kernel()); 3586 pmap_stats_update_bypte(pmap, 0, opte); 3587 } 3588 3589 if (pve != NULL) { 3590 pve->pve_next = killlist; /* mark it for death */ 3591 killlist = pve; 3592 } 3593 } 3594 pmap_tlb_shootnow(); 3595 kpreempt_enable(); 3596 3597 /* Now free unused pvs. 
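 * (they were collected on killlist while the loop above ran)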
*/ 3598 pmap_free_pvs(killlist); 3599 } 3600 3601 /* 3602 * p m a p a t t r i b u t e f u n c t i o n s 3603 * functions that test/change managed page's attributes 3604 * since a page can be mapped multiple times we must check each PTE that 3605 * maps it by going down the pv lists. 3606 */ 3607 3608 /* 3609 * pmap_test_attrs: test a page's attributes 3610 */ 3611 3612 bool 3613 pmap_test_attrs(struct vm_page *pg, unsigned testbits) 3614 { 3615 struct pmap_page *pp; 3616 struct pv_pte *pvpte; 3617 pt_entry_t expect; 3618 u_int result; 3619 3620 KASSERT(uvm_page_locked_p(pg)); 3621 3622 pp = VM_PAGE_TO_PP(pg); 3623 if ((pp->pp_attrs & testbits) != 0) { 3624 return true; 3625 } 3626 expect = pmap_pa2pte(VM_PAGE_TO_PHYS(pg)) | PG_V; 3627 kpreempt_disable(); 3628 for (pvpte = pv_pte_first(pp); pvpte; pvpte = pv_pte_next(pp, pvpte)) { 3629 pt_entry_t opte; 3630 int error; 3631 3632 if ((pp->pp_attrs & testbits) != 0) { 3633 break; 3634 } 3635 error = pmap_sync_pv(pvpte, expect, 0, &opte); 3636 if (error == 0) { 3637 pp->pp_attrs |= opte; 3638 } 3639 } 3640 result = pp->pp_attrs & testbits; 3641 kpreempt_enable(); 3642 3643 /* 3644 * note that we will exit the for loop with a non-null pve if 3645 * we have found the bits we are testing for. 3646 */ 3647 3648 return result != 0; 3649 } 3650 3651 /* 3652 * pmap_clear_attrs: clear the specified attribute for a page. 3653 * 3654 * => we return true if we cleared one of the bits we were asked to 3655 */ 3656 3657 bool 3658 pmap_clear_attrs(struct vm_page *pg, unsigned clearbits) 3659 { 3660 struct pmap_page *pp; 3661 struct pv_pte *pvpte; 3662 u_int result; 3663 pt_entry_t expect; 3664 int count; 3665 3666 KASSERT(uvm_page_locked_p(pg)); 3667 3668 pp = VM_PAGE_TO_PP(pg); 3669 expect = pmap_pa2pte(VM_PAGE_TO_PHYS(pg)) | PG_V; 3670 count = SPINLOCK_BACKOFF_MIN; 3671 kpreempt_disable(); 3672 startover: 3673 for (pvpte = pv_pte_first(pp); pvpte; pvpte = pv_pte_next(pp, pvpte)) { 3674 pt_entry_t opte; 3675 int error; 3676 3677 error = pmap_sync_pv(pvpte, expect, clearbits, &opte); 3678 if (error == EAGAIN) { 3679 int hold_count; 3680 KERNEL_UNLOCK_ALL(curlwp, &hold_count); 3681 SPINLOCK_BACKOFF(count); 3682 KERNEL_LOCK(hold_count, curlwp); 3683 goto startover; 3684 } 3685 pp->pp_attrs |= opte; 3686 } 3687 result = pp->pp_attrs & clearbits; 3688 pp->pp_attrs &= ~clearbits; 3689 kpreempt_enable(); 3690 3691 return result != 0; 3692 } 3693 3694 3695 /* 3696 * p m a p p r o t e c t i o n f u n c t i o n s 3697 */ 3698 3699 /* 3700 * pmap_page_protect: change the protection of all recorded mappings 3701 * of a managed page 3702 * 3703 * => NOTE: this is an inline function in pmap.h 3704 */ 3705 3706 /* see pmap.h */ 3707 3708 /* 3709 * pmap_protect: set the protection in of the pages in a pmap 3710 * 3711 * => NOTE: this is an inline function in pmap.h 3712 */ 3713 3714 /* see pmap.h */ 3715 3716 /* 3717 * pmap_write_protect: write-protect pages in a pmap. 3718 */ 3719 void 3720 pmap_write_protect(struct pmap *pmap, vaddr_t sva, vaddr_t eva, vm_prot_t prot) 3721 { 3722 pt_entry_t *ptes; 3723 pt_entry_t * const *pdes; 3724 struct pmap *pmap2; 3725 vaddr_t blockend, va; 3726 3727 KASSERT(curlwp->l_md.md_gc_pmap != pmap); 3728 3729 sva &= PG_FRAME; 3730 eva &= PG_FRAME; 3731 3732 /* Acquire pmap. 
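 * (pmap_map_ptes() locks the pmap and, if it is not the current
 * one, temporarily maps its page tables)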
*/ 3733 kpreempt_disable(); 3734 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); 3735 3736 for (va = sva ; va < eva ; va = blockend) { 3737 pt_entry_t *spte, *epte; 3738 int i; 3739 3740 blockend = (va & L2_FRAME) + NBPD_L2; 3741 if (blockend > eva) 3742 blockend = eva; 3743 3744 /* 3745 * XXXCDC: our PTE mappings should never be write-protected! 3746 * 3747 * long term solution is to move the PTEs out of user 3748 * address space. and into kernel address space (up 3749 * with APTE). then we can set VM_MAXUSER_ADDRESS to 3750 * be VM_MAX_ADDRESS. 3751 */ 3752 3753 /* XXXCDC: ugly hack to avoid freeing PDP here */ 3754 for (i = 0; i < PDP_SIZE; i++) { 3755 if (pl_i(va, PTP_LEVELS) == PDIR_SLOT_PTE+i) 3756 continue; 3757 } 3758 3759 /* Is it a valid block? */ 3760 if (!pmap_pdes_valid(va, pdes, NULL)) { 3761 continue; 3762 } 3763 KASSERT(va < VM_MAXUSER_ADDRESS || va >= VM_MAX_ADDRESS); 3764 3765 spte = &ptes[pl1_i(va)]; 3766 epte = &ptes[pl1_i(blockend)]; 3767 3768 for (/*null */; spte < epte ; spte++) { 3769 pt_entry_t opte, npte; 3770 3771 do { 3772 opte = *spte; 3773 if ((~opte & (PG_RW | PG_V)) != 0) { 3774 goto next; 3775 } 3776 npte = opte & ~PG_RW; 3777 } while (pmap_pte_cas(spte, opte, npte) != opte); 3778 3779 if ((opte & PG_M) != 0) { 3780 vaddr_t tva = x86_ptob(spte - ptes); 3781 pmap_tlb_shootdown(pmap, tva, opte, 3782 TLBSHOOT_WRITE_PROTECT); 3783 } 3784 next:; 3785 } 3786 } 3787 3788 /* Release pmap. */ 3789 pmap_unmap_ptes(pmap, pmap2); 3790 kpreempt_enable(); 3791 } 3792 3793 /* 3794 * pmap_unwire: clear the wired bit in the PTE. 3795 * 3796 * => Mapping should already be present. 3797 */ 3798 void 3799 pmap_unwire(struct pmap *pmap, vaddr_t va) 3800 { 3801 pt_entry_t *ptes, *ptep, opte; 3802 pd_entry_t * const *pdes; 3803 struct pmap *pmap2; 3804 3805 /* Acquire pmap. */ 3806 kpreempt_disable(); 3807 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); 3808 3809 if (!pmap_pdes_valid(va, pdes, NULL)) { 3810 panic("pmap_unwire: invalid PDE"); 3811 } 3812 3813 ptep = &ptes[pl1_i(va)]; 3814 opte = *ptep; 3815 KASSERT(pmap_valid_entry(opte)); 3816 3817 if (opte & PG_W) { 3818 pt_entry_t npte = opte & ~PG_W; 3819 3820 opte = pmap_pte_testset(ptep, npte); 3821 pmap_stats_update_bypte(pmap, npte, opte); 3822 } else { 3823 printf("pmap_unwire: wiring for pmap %p va 0x%lx " 3824 "did not change!\n", pmap, va); 3825 } 3826 3827 /* Release pmap. */ 3828 pmap_unmap_ptes(pmap, pmap2); 3829 kpreempt_enable(); 3830 } 3831 3832 /* 3833 * pmap_copy: copy mappings from one pmap to another 3834 * 3835 * => optional function 3836 * void pmap_copy(dst_pmap, src_pmap, dst_addr, len, src_addr) 3837 */ 3838 3839 /* 3840 * defined as macro in pmap.h 3841 */ 3842 3843 __strict_weak_alias(pmap_enter, pmap_enter_default); 3844 3845 int 3846 pmap_enter_default(pmap_t pmap, vaddr_t va, paddr_t pa, vm_prot_t prot, 3847 u_int flags) 3848 { 3849 return pmap_enter_ma(pmap, va, pa, pa, prot, flags, 0); 3850 } 3851 3852 /* 3853 * pmap_enter: enter a mapping into a pmap 3854 * 3855 * => must be done "now" ... 
no lazy-evaluation 3856 * => we set pmap => pv_head locking 3857 */ 3858 int 3859 pmap_enter_ma(struct pmap *pmap, vaddr_t va, paddr_t ma, paddr_t pa, 3860 vm_prot_t prot, u_int flags, int domid) 3861 { 3862 pt_entry_t *ptes, opte, npte; 3863 pt_entry_t *ptep; 3864 pd_entry_t * const *pdes; 3865 struct vm_page *ptp, *pg; 3866 struct pmap_page *new_pp; 3867 struct pmap_page *old_pp; 3868 struct pv_entry *old_pve = NULL; 3869 struct pv_entry *new_pve; 3870 struct pv_entry *new_pve2; 3871 int error; 3872 bool wired = (flags & PMAP_WIRED) != 0; 3873 struct pmap *pmap2; 3874 3875 KASSERT(pmap_initialized); 3876 KASSERT(curlwp->l_md.md_gc_pmap != pmap); 3877 KASSERT(va < VM_MAX_KERNEL_ADDRESS); 3878 KASSERTMSG(va != (vaddr_t)PDP_BASE && va != (vaddr_t)APDP_BASE, 3879 "pmap_enter: trying to map over PDP/APDP!"); 3880 KASSERTMSG(va < VM_MIN_KERNEL_ADDRESS || 3881 pmap_valid_entry(pmap->pm_pdir[pl_i(va, PTP_LEVELS)]), 3882 "pmap_enter: missing kernel PTP for VA %lx!", va); 3883 3884 #ifdef XEN 3885 KASSERT(domid == DOMID_SELF || pa == 0); 3886 #endif /* XEN */ 3887 3888 npte = ma | protection_codes[prot] | PG_V; 3889 npte |= pmap_pat_flags(flags); 3890 if (wired) 3891 npte |= PG_W; 3892 if (va < VM_MAXUSER_ADDRESS) 3893 npte |= PG_u; 3894 else if (va < VM_MAX_ADDRESS) 3895 npte |= (PG_u | PG_RW); /* XXXCDC: no longer needed? */ 3896 else 3897 npte |= PG_k; 3898 if (pmap == pmap_kernel()) 3899 npte |= pmap_pg_g; 3900 if (flags & VM_PROT_ALL) { 3901 npte |= PG_U; 3902 if (flags & VM_PROT_WRITE) { 3903 KASSERT((npte & PG_RW) != 0); 3904 npte |= PG_M; 3905 } 3906 } 3907 3908 #ifdef XEN 3909 if (domid != DOMID_SELF) 3910 pg = NULL; 3911 else 3912 #endif 3913 pg = PHYS_TO_VM_PAGE(pa); 3914 if (pg != NULL) { 3915 /* This is a managed page */ 3916 npte |= PG_PVLIST; 3917 new_pp = VM_PAGE_TO_PP(pg); 3918 } else { 3919 new_pp = NULL; 3920 } 3921 3922 /* get pves. */ 3923 new_pve = pool_cache_get(&pmap_pv_cache, PR_NOWAIT); 3924 new_pve2 = pool_cache_get(&pmap_pv_cache, PR_NOWAIT); 3925 if (new_pve == NULL || new_pve2 == NULL) { 3926 if (flags & PMAP_CANFAIL) { 3927 error = ENOMEM; 3928 goto out2; 3929 } 3930 panic("pmap_enter: pve allocation failed"); 3931 } 3932 3933 kpreempt_disable(); 3934 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); /* locks pmap */ 3935 if (pmap == pmap_kernel()) { 3936 ptp = NULL; 3937 } else { 3938 ptp = pmap_get_ptp(pmap, va, pdes); 3939 if (ptp == NULL) { 3940 pmap_unmap_ptes(pmap, pmap2); 3941 if (flags & PMAP_CANFAIL) { 3942 error = ENOMEM; 3943 goto out; 3944 } 3945 panic("pmap_enter: get ptp failed"); 3946 } 3947 } 3948 3949 /* 3950 * update the pte. 3951 */ 3952 3953 ptep = &ptes[pl1_i(va)]; 3954 do { 3955 opte = *ptep; 3956 3957 /* 3958 * if the same page, inherit PG_U and PG_M. 3959 */ 3960 if (((opte ^ npte) & (PG_FRAME | PG_V)) == 0) { 3961 npte |= opte & (PG_U | PG_M); 3962 } 3963 #if defined(XEN) 3964 if (domid != DOMID_SELF) { 3965 /* pmap_pte_cas with error handling */ 3966 int s = splvm(); 3967 if (opte != *ptep) { 3968 splx(s); 3969 continue; 3970 } 3971 error = xpq_update_foreign( 3972 vtomach((vaddr_t)ptep), npte, domid); 3973 splx(s); 3974 if (error) { 3975 if (ptp != NULL && ptp->wire_count <= 1) { 3976 pmap_free_ptp(pmap, ptp, va, ptes, pdes); 3977 } 3978 pmap_unmap_ptes(pmap, pmap2); 3979 goto out; 3980 } 3981 break; 3982 } 3983 #endif /* defined(XEN) */ 3984 } while (pmap_pte_cas(ptep, opte, npte) != opte); 3985 3986 /* 3987 * update statistics and PTP's reference count. 
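 * (the PTP's wire count is only bumped when a previously invalid
 * PTE became valid; replacing an existing mapping leaves it alone)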
3988 */ 3989 3990 pmap_stats_update_bypte(pmap, npte, opte); 3991 if (ptp != NULL && !pmap_valid_entry(opte)) { 3992 ptp->wire_count++; 3993 } 3994 KASSERT(ptp == NULL || ptp->wire_count > 1); 3995 3996 /* 3997 * if the same page, we can skip pv_entry handling. 3998 */ 3999 4000 if (((opte ^ npte) & (PG_FRAME | PG_V)) == 0) { 4001 KASSERT(((opte ^ npte) & PG_PVLIST) == 0); 4002 goto same_pa; 4003 } 4004 4005 /* 4006 * if old page is managed, remove pv_entry from its list. 4007 */ 4008 4009 if ((~opte & (PG_V | PG_PVLIST)) == 0) { 4010 pg = PHYS_TO_VM_PAGE(pmap_pte2pa(opte)); 4011 4012 KASSERTMSG(pg != NULL, "pmap_enter: PG_PVLIST mapping with " 4013 "unmanaged page pa = 0x%" PRIx64 " (0x%" PRIx64 ")", 4014 (int64_t)pa, (int64_t)atop(pa)); 4015 4016 KASSERT(uvm_page_locked_p(pg)); 4017 4018 old_pp = VM_PAGE_TO_PP(pg); 4019 old_pve = pmap_remove_pv(old_pp, ptp, va); 4020 old_pp->pp_attrs |= opte; 4021 } 4022 4023 /* 4024 * if new page is managed, insert pv_entry into its list. 4025 */ 4026 4027 if (new_pp) { 4028 new_pve = pmap_enter_pv(new_pp, new_pve, &new_pve2, ptp, va); 4029 } 4030 4031 same_pa: 4032 pmap_unmap_ptes(pmap, pmap2); 4033 4034 /* 4035 * shootdown tlb if necessary. 4036 */ 4037 4038 if ((~opte & (PG_V | PG_U)) == 0 && 4039 ((opte ^ npte) & (PG_FRAME | PG_RW)) != 0) { 4040 pmap_tlb_shootdown(pmap, va, opte, TLBSHOOT_ENTER); 4041 } 4042 4043 error = 0; 4044 out: 4045 kpreempt_enable(); 4046 out2: 4047 if (old_pve != NULL) { 4048 pool_cache_put(&pmap_pv_cache, old_pve); 4049 } 4050 if (new_pve != NULL) { 4051 pool_cache_put(&pmap_pv_cache, new_pve); 4052 } 4053 if (new_pve2 != NULL) { 4054 pool_cache_put(&pmap_pv_cache, new_pve2); 4055 } 4056 4057 return error; 4058 } 4059 4060 static bool 4061 pmap_get_physpage(vaddr_t va, int level, paddr_t *paddrp) 4062 { 4063 struct vm_page *ptp; 4064 struct pmap *kpm = pmap_kernel(); 4065 4066 if (uvm.page_init_done == false) { 4067 /* 4068 * we're growing the kernel pmap early (from 4069 * uvm_pageboot_alloc()). this case must be 4070 * handled a little differently. 4071 */ 4072 4073 if (uvm_page_physget(paddrp) == false) 4074 panic("pmap_get_physpage: out of memory"); 4075 kpreempt_disable(); 4076 pmap_pte_set(early_zero_pte, 4077 pmap_pa2pte(*paddrp) | PG_V | PG_RW | PG_k); 4078 pmap_pte_flush(); 4079 pmap_update_pg((vaddr_t)early_zerop); 4080 memset(early_zerop, 0, PAGE_SIZE); 4081 #if defined(DIAGNOSTIC) || defined (XEN) 4082 pmap_pte_set(early_zero_pte, 0); 4083 pmap_pte_flush(); 4084 #endif /* defined(DIAGNOSTIC) */ 4085 kpreempt_enable(); 4086 } else { 4087 /* XXX */ 4088 ptp = uvm_pagealloc(NULL, 0, NULL, 4089 UVM_PGA_USERESERVE|UVM_PGA_ZERO); 4090 if (ptp == NULL) 4091 panic("pmap_get_physpage: out of memory"); 4092 ptp->flags &= ~PG_BUSY; 4093 ptp->wire_count = 1; 4094 *paddrp = VM_PAGE_TO_PHYS(ptp); 4095 } 4096 pmap_stats_update(kpm, 1, 0); 4097 return true; 4098 } 4099 4100 /* 4101 * Allocate the amount of specified ptps for a ptp level, and populate 4102 * all levels below accordingly, mapping virtual addresses starting at 4103 * kva. 4104 * 4105 * Used by pmap_growkernel. 
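 *
 * The call in pmap_growkernel() below looks like:
 *
 *	pmap_alloc_level(normal_pdes, pmap_maxkvaddr, PTP_LEVELS,
 *	    needed_kptp);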
/*
 * Allocate the specified number of PTPs for a page table level, and
 * populate all levels below accordingly, mapping virtual addresses
 * starting at kva.
 *
 * Used by pmap_growkernel.
 */
static void
pmap_alloc_level(pd_entry_t * const *pdes, vaddr_t kva, int lvl,
    long *needed_ptps)
{
	unsigned long i;
	vaddr_t va;
	paddr_t pa;
	unsigned long index, endindex;
	int level;
	pd_entry_t *pdep;
#ifdef XEN
	int s = splvm();	/* protect xpq_* */
#endif

	for (level = lvl; level > 1; level--) {
		if (level == PTP_LEVELS)
			pdep = pmap_kernel()->pm_pdir;
		else
			pdep = pdes[level - 2];
		va = kva;
		index = pl_i_roundup(kva, level);
		endindex = index + needed_ptps[level - 1] - 1;

		for (i = index; i <= endindex; i++) {
			pt_entry_t pte;

			KASSERT(!pmap_valid_entry(pdep[i]));
			pmap_get_physpage(va, level - 1, &pa);
			pte = pmap_pa2pte(pa) | PG_k | PG_V | PG_RW;
#ifdef XEN
			switch (level) {
			case PTP_LEVELS:
#if defined(PAE) || defined(__x86_64__)
				if (i >= PDIR_SLOT_KERN) {
					/* update per-cpu PMDs on all cpus */
					CPU_INFO_ITERATOR cii;
					struct cpu_info *ci;
					for (CPU_INFO_FOREACH(cii, ci)) {
						if (ci == NULL) {
							continue;
						}
#ifdef PAE
						xpq_queue_pte_update(
						    xpmap_ptetomach(&ci->ci_kpm_pdir[l2tol2(i)]), pte);
#elif defined(__x86_64__)
						xpq_queue_pte_update(
						    xpmap_ptetomach(&ci->ci_kpm_pdir[i]), pte);
#endif /* PAE */
					}
				}
#endif /* PAE || __x86_64__ */
				/* FALLTHROUGH */

			default:	/* All other levels */
				xpq_queue_pte_update(
				    xpmap_ptetomach(&pdep[i]), pte);
			}
#else /* XEN */
			pdep[i] = pte;
#endif /* XEN */
			KASSERT(level != PTP_LEVELS || nkptp[level - 1] +
			    pl_i(VM_MIN_KERNEL_ADDRESS, level) == i);
			nkptp[level - 1]++;
			va += nbpd[level - 1];
		}
		pmap_pte_flush();
	}
#ifdef XEN
	splx(s);
#endif
}

/*
 * pmap_growkernel: increase usage of KVM space
 *
 * => we allocate new PTPs for the kernel and install them in all
 *    the pmaps on the system.
 */

vaddr_t
pmap_growkernel(vaddr_t maxkvaddr)
{
	struct pmap *kpm = pmap_kernel();
#if !defined(XEN) || !defined(__x86_64__)
	struct pmap *pm;
#endif
	int s, i;
	long needed_kptp[PTP_LEVELS], target_nptp, old;
	bool invalidate = false;

	s = splvm();	/* to be safe */
	mutex_enter(kpm->pm_lock);

	if (maxkvaddr <= pmap_maxkvaddr) {
		mutex_exit(kpm->pm_lock);
		splx(s);
		return pmap_maxkvaddr;
	}

	maxkvaddr = x86_round_pdr(maxkvaddr);
	old = nkptp[PTP_LEVELS - 1];
	/*
	 * This loop could be optimized more, but pmap_growkernel()
	 * is called infrequently.
	 */
	for (i = PTP_LEVELS - 1; i >= 1; i--) {
		target_nptp = pl_i_roundup(maxkvaddr, i + 1) -
		    pl_i_roundup(VM_MIN_KERNEL_ADDRESS, i + 1);
		/*
		 * XXX only need to check toplevel.
		 */
		if (target_nptp > nkptpmax[i])
			panic("out of KVA space");
		KASSERT(target_nptp >= nkptp[i]);
		needed_kptp[i] = target_nptp - nkptp[i];
	}

	pmap_alloc_level(normal_pdes, pmap_maxkvaddr, PTP_LEVELS, needed_kptp);
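	/*
	 * pmap_alloc_level() has now wired the new PTPs into the kernel
	 * pmap and bumped nkptp[] for every level it grew.
	 */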
	/*
	 * If the number of top level entries changed, update all
	 * pmaps.
	 */
	if (needed_kptp[PTP_LEVELS - 1] != 0) {
#ifdef XEN
#ifdef __x86_64__
		/* nothing, kernel entries are never entered in user pmap */
#else /* __x86_64__ */
		mutex_enter(&pmaps_lock);
		LIST_FOREACH(pm, &pmaps, pm_list) {
			int pdkidx;
			for (pdkidx = PDIR_SLOT_KERN + old;
			    pdkidx < PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1];
			    pdkidx++) {
				xpq_queue_pte_update(
				    xpmap_ptom(pmap_pdirpa(pm, pdkidx)),
				    kpm->pm_pdir[pdkidx]);
			}
			xpq_flush_queue();
		}
		mutex_exit(&pmaps_lock);
#endif /* __x86_64__ */
#else /* XEN */
		unsigned newpdes;
		newpdes = nkptp[PTP_LEVELS - 1] - old;
		mutex_enter(&pmaps_lock);
		LIST_FOREACH(pm, &pmaps, pm_list) {
			memcpy(&pm->pm_pdir[PDIR_SLOT_KERN + old],
			    &kpm->pm_pdir[PDIR_SLOT_KERN + old],
			    newpdes * sizeof (pd_entry_t));
		}
		mutex_exit(&pmaps_lock);
#endif
		invalidate = true;
	}
	pmap_maxkvaddr = maxkvaddr;
	mutex_exit(kpm->pm_lock);
	splx(s);

	if (invalidate) {
		/* Invalidate the PDP cache. */
		pool_cache_invalidate(&pmap_pdp_cache);
	}

	return maxkvaddr;
}

#ifdef DEBUG
void pmap_dump(struct pmap *, vaddr_t, vaddr_t);

/*
 * pmap_dump: dump all the mappings from a pmap
 *
 * => caller should not be holding any pmap locks
 */

void
pmap_dump(struct pmap *pmap, vaddr_t sva, vaddr_t eva)
{
	pt_entry_t *ptes, *pte;
	pd_entry_t * const *pdes;
	struct pmap *pmap2;
	vaddr_t blkendva;

	/*
	 * if end is out of range truncate.
	 * if (end == start) update to max.
	 */

	if (eva > VM_MAXUSER_ADDRESS || eva <= sva)
		eva = VM_MAXUSER_ADDRESS;

	/*
	 * we lock in the pmap => pv_head direction
	 */

	kpreempt_disable();
	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);	/* locks pmap */

	/*
	 * dumping a range of pages: we dump in PTP sized blocks (4MB)
	 */

	for (/* null */ ; sva < eva ; sva = blkendva) {

		/* determine range of block */
		blkendva = x86_round_pdr(sva+1);
		if (blkendva > eva)
			blkendva = eva;

		/* valid block? */
		if (!pmap_pdes_valid(sva, pdes, NULL))
			continue;

		pte = &ptes[pl1_i(sva)];
		for (/* null */; sva < blkendva ; sva += PAGE_SIZE, pte++) {
			if (!pmap_valid_entry(*pte))
				continue;
			printf("va %#" PRIxVADDR " -> pa %#" PRIxPADDR
			    " (pte=%#" PRIxPADDR ")\n",
			    sva, (paddr_t)pmap_pte2pa(*pte), (paddr_t)*pte);
		}
	}
	pmap_unmap_ptes(pmap, pmap2);
	kpreempt_enable();
}
#endif

/*
 * pmap_update: process deferred invalidations and frees.
 */

void
pmap_update(struct pmap *pmap)
{
	struct vm_page *empty_ptps;
	lwp_t *l = curlwp;

	/*
	 * If we have torn down this pmap, invalidate non-global TLB
	 * entries on any processors using it.
	 */
	KPREEMPT_DISABLE(l);
	if (__predict_false(l->l_md.md_gc_pmap == pmap)) {
		l->l_md.md_gc_pmap = NULL;
		pmap_tlb_shootdown(pmap, (vaddr_t)-1LL, 0, TLBSHOOT_UPDATE);
	}
	/*
	 * Initiate any pending TLB shootdowns.  Wait for them to
	 * complete before returning control to the caller.
	 */
	pmap_tlb_shootnow();
	KPREEMPT_ENABLE(l);
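	/*
	 * The deferred page table pages are handed back to UVM, which
	 * must not be done from interrupt context; hence the
	 * cpu_intr_p() check below.
	 */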
	/*
	 * Now that shootdowns are complete, process deferred frees,
	 * but not from interrupt context.
	 */
	if (l->l_md.md_gc_ptp != NULL) {
		KASSERT((l->l_pflag & LP_INTR) == 0);
		if (cpu_intr_p()) {
			return;
		}
		empty_ptps = l->l_md.md_gc_ptp;
		l->l_md.md_gc_ptp = NULL;
		pmap_free_ptps(empty_ptps);
	}
}

#if PTP_LEVELS > 4
#error "Unsupported number of page table mappings"
#endif

paddr_t
pmap_init_tmp_pgtbl(paddr_t pg)
{
	static bool maps_loaded;
	static const paddr_t x86_tmp_pml_paddr[] = {
		4 * PAGE_SIZE,
		5 * PAGE_SIZE,
		6 * PAGE_SIZE,
		7 * PAGE_SIZE
	};
	static vaddr_t x86_tmp_pml_vaddr[] = { 0, 0, 0, 0 };

	pd_entry_t *tmp_pml, *kernel_pml;

	int level;

	if (!maps_loaded) {
		for (level = 0; level < PTP_LEVELS; ++level) {
			x86_tmp_pml_vaddr[level] =
			    uvm_km_alloc(kernel_map, PAGE_SIZE, 0,
				UVM_KMF_VAONLY);

			if (x86_tmp_pml_vaddr[level] == 0)
				panic("mapping of real mode PML failed\n");
			pmap_kenter_pa(x86_tmp_pml_vaddr[level],
			    x86_tmp_pml_paddr[level],
			    VM_PROT_READ | VM_PROT_WRITE, 0);
			pmap_update(pmap_kernel());
		}
		maps_loaded = true;
	}

	/* Zero levels 1-3 */
	for (level = 0; level < PTP_LEVELS - 1; ++level) {
		tmp_pml = (void *)x86_tmp_pml_vaddr[level];
		memset(tmp_pml, 0, PAGE_SIZE);
	}

	/* Copy PML4 */
	kernel_pml = pmap_kernel()->pm_pdir;
	tmp_pml = (void *)x86_tmp_pml_vaddr[PTP_LEVELS - 1];
	memcpy(tmp_pml, kernel_pml, PAGE_SIZE);

#ifdef PAE
	/*
	 * Use the last 4 entries of the L2 page as L3 PD entries.  These
	 * last entries are unlikely to be used for temporary mappings.
	 * 508: maps 0->1GB (userland)
	 * 509: unused
	 * 510: unused
	 * 511: maps 3->4GB (kernel)
	 */
	tmp_pml[508] = x86_tmp_pml_paddr[PTP_LEVELS - 1] | PG_V;
	tmp_pml[509] = 0;
	tmp_pml[510] = 0;
	tmp_pml[511] = pmap_pdirpa(pmap_kernel(), PDIR_SLOT_KERN) | PG_V;
#endif

	for (level = PTP_LEVELS - 1; level > 0; --level) {
		tmp_pml = (void *)x86_tmp_pml_vaddr[level];

		tmp_pml[pl_i(pg, level + 1)] =
		    (x86_tmp_pml_paddr[level - 1] & PG_FRAME) | PG_RW | PG_V;
	}

	tmp_pml = (void *)x86_tmp_pml_vaddr[0];
	tmp_pml[pl_i(pg, 1)] = (pg & PG_FRAME) | PG_RW | PG_V;

#ifdef PAE
	/* Return the PA of the L3 page (entry 508 of the L2 page) */
	return x86_tmp_pml_paddr[PTP_LEVELS - 1] + 508 * sizeof(pd_entry_t);
#endif

	return x86_tmp_pml_paddr[PTP_LEVELS - 1];
}

u_int
x86_mmap_flags(paddr_t mdpgno)
{
	u_int nflag = (mdpgno >> X86_MMAP_FLAG_SHIFT) & X86_MMAP_FLAG_MASK;
	u_int pflag = 0;

	if (nflag & X86_MMAP_FLAG_PREFETCH)
		pflag |= PMAP_WRITE_COMBINE;

	return pflag;
}

/*
 * Invalidates pool_cache(9) used by pmap(9).
 */
void
pmap_invalidate_pool_caches(void)
{
#ifdef XEN
	/*
	 * We must invalidate all shadow pages found inside the pmap_pdp_cache.
	 * They are technically considered by Xen as L2 pages, although they
	 * are not currently found inside the pmaps list.
	 */
	pool_cache_invalidate(&pmap_pdp_cache);
#endif
}
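
/*
 * Illustrative sketch only (kept under "#if 0", never compiled): one way a
 * hypothetical machine-dependent d_mmap routine could encode the prefetch
 * hint that x86_mmap_flags() above decodes.  The example_mmap() name and
 * the BAR address are made up for illustration; only the flag encoding in
 * the upper bits of the returned page number matters.
 */
#if 0
static paddr_t
example_mmap(off_t off)
{
	/* Hypothetical prefetchable BAR at 0xf0000000. */
	paddr_t pgno = x86_btop(0xf0000000 + off);

	/*
	 * Fold the mapping hint into the page number; x86_mmap_flags()
	 * will translate X86_MMAP_FLAG_PREFETCH into PMAP_WRITE_COMBINE.
	 */
	return pgno | ((paddr_t)X86_MMAP_FLAG_PREFETCH << X86_MMAP_FLAG_SHIFT);
}
#endif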