1 /* $NetBSD: pmap.c,v 1.231 2016/12/13 10:54:27 kamil Exp $ */ 2 3 /*- 4 * Copyright (c) 2008, 2010, 2016 The NetBSD Foundation, Inc. 5 * All rights reserved. 6 * 7 * This code is derived from software contributed to The NetBSD Foundation 8 * by Andrew Doran, and by Maxime Villard. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 29 * POSSIBILITY OF SUCH DAMAGE. 30 */ 31 32 /* 33 * Copyright (c) 2007 Manuel Bouyer. 34 * 35 * Redistribution and use in source and binary forms, with or without 36 * modification, are permitted provided that the following conditions 37 * are met: 38 * 1. Redistributions of source code must retain the above copyright 39 * notice, this list of conditions and the following disclaimer. 40 * 2. Redistributions in binary form must reproduce the above copyright 41 * notice, this list of conditions and the following disclaimer in the 42 * documentation and/or other materials provided with the distribution. 43 * 44 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 45 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 46 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 47 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 48 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 49 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 50 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 51 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 52 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 53 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 54 * 55 */ 56 57 /* 58 * Copyright (c) 2006 Mathieu Ropert <mro@adviseo.fr> 59 * 60 * Permission to use, copy, modify, and distribute this software for any 61 * purpose with or without fee is hereby granted, provided that the above 62 * copyright notice and this permission notice appear in all copies. 63 * 64 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 65 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 66 * MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 67 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 68 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 69 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 70 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 71 */ 72 73 /* 74 * Copyright (c) 1997 Charles D. Cranor and Washington University. 75 * All rights reserved. 76 * 77 * Redistribution and use in source and binary forms, with or without 78 * modification, are permitted provided that the following conditions 79 * are met: 80 * 1. Redistributions of source code must retain the above copyright 81 * notice, this list of conditions and the following disclaimer. 82 * 2. Redistributions in binary form must reproduce the above copyright 83 * notice, this list of conditions and the following disclaimer in the 84 * documentation and/or other materials provided with the distribution. 85 * 86 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 87 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 88 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 89 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 90 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 91 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 92 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 93 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 94 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 95 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 96 */ 97 98 /* 99 * Copyright 2001 (c) Wasabi Systems, Inc. 100 * All rights reserved. 101 * 102 * Written by Frank van der Linden for Wasabi Systems, Inc. 103 * 104 * Redistribution and use in source and binary forms, with or without 105 * modification, are permitted provided that the following conditions 106 * are met: 107 * 1. Redistributions of source code must retain the above copyright 108 * notice, this list of conditions and the following disclaimer. 109 * 2. Redistributions in binary form must reproduce the above copyright 110 * notice, this list of conditions and the following disclaimer in the 111 * documentation and/or other materials provided with the distribution. 112 * 3. All advertising materials mentioning features or use of this software 113 * must display the following acknowledgement: 114 * This product includes software developed for the NetBSD Project by 115 * Wasabi Systems, Inc. 116 * 4. The name of Wasabi Systems, Inc. may not be used to endorse 117 * or promote products derived from this software without specific prior 118 * written permission. 119 * 120 * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND 121 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 122 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 123 * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL WASABI SYSTEMS, INC 124 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 125 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 126 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 127 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 128 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 129 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 130 * POSSIBILITY OF SUCH DAMAGE. 131 */ 132 133 /* 134 * This is the i386 pmap modified and generalized to support x86-64 135 * as well. The idea is to hide the upper N levels of the page tables 136 * inside pmap_get_ptp, pmap_free_ptp and pmap_growkernel. The rest 137 * is mostly untouched, except that it uses some more generalized 138 * macros and interfaces. 139 * 140 * This pmap has been tested on the i386 as well, and it can be easily 141 * adapted to PAE. 142 * 143 * fvdl@wasabisystems.com 18-Jun-2001 144 */ 145 146 /* 147 * pmap.c: i386 pmap module rewrite 148 * Chuck Cranor <chuck@netbsd> 149 * 11-Aug-97 150 * 151 * history of this pmap module: in addition to my own input, i used 152 * the following references for this rewrite of the i386 pmap: 153 * 154 * [1] the NetBSD i386 pmap. this pmap appears to be based on the 155 * BSD hp300 pmap done by Mike Hibler at University of Utah. 156 * it was then ported to the i386 by William Jolitz of UUNET 157 * Technologies, Inc. Then Charles M. Hannum of the NetBSD 158 * project fixed some bugs and provided some speed ups. 159 * 160 * [2] the FreeBSD i386 pmap. this pmap seems to be the 161 * Hibler/Jolitz pmap, as modified for FreeBSD by John S. Dyson 162 * and David Greenman. 163 * 164 * [3] the Mach pmap. this pmap, from CMU, seems to have migrated 165 * between several processors. the VAX version was done by 166 * Avadis Tevanian, Jr., and Michael Wayne Young. the i386 167 * version was done by Lance Berc, Mike Kupfer, Bob Baron, 168 * David Golub, and Richard Draves. the alpha version was 169 * done by Alessandro Forin (CMU/Mach) and Chris Demetriou 170 * (NetBSD/alpha). 171 */ 172 173 #include <sys/cdefs.h> 174 __KERNEL_RCSID(0, "$NetBSD: pmap.c,v 1.231 2016/12/13 10:54:27 kamil Exp $"); 175 176 #include "opt_user_ldt.h" 177 #include "opt_lockdebug.h" 178 #include "opt_multiprocessor.h" 179 #include "opt_xen.h" 180 181 #include <sys/param.h> 182 #include <sys/systm.h> 183 #include <sys/proc.h> 184 #include <sys/pool.h> 185 #include <sys/kernel.h> 186 #include <sys/atomic.h> 187 #include <sys/cpu.h> 188 #include <sys/intr.h> 189 #include <sys/xcall.h> 190 #include <sys/kcore.h> 191 192 #include <uvm/uvm.h> 193 #include <uvm/pmap/pmap_pvt.h> 194 195 #include <dev/isa/isareg.h> 196 197 #include <machine/specialreg.h> 198 #include <machine/gdt.h> 199 #include <machine/isa_machdep.h> 200 #include <machine/cpuvar.h> 201 #include <machine/cputypes.h> 202 203 #include <x86/pmap.h> 204 #include <x86/pmap_pv.h> 205 206 #include <x86/i82489reg.h> 207 #include <x86/i82489var.h> 208 209 #ifdef XEN 210 #include <xen/xen-public/xen.h> 211 #include <xen/hypervisor.h> 212 #endif 213 214 /* 215 * general info: 216 * 217 * - for an explanation of how the i386 MMU hardware works see 218 * the comments in <machine/pte.h>. 219 * 220 * - for an explanation of the general memory structure used by 221 * this pmap (including the recursive mapping), see the comments 222 * in <machine/pmap.h>. 223 * 224 * this file contains the code for the "pmap module." 
the module's 225 * job is to manage the hardware's virtual to physical address mappings. 226 * note that there are two levels of mapping in the VM system: 227 * 228 * [1] the upper layer of the VM system uses vm_map's and vm_map_entry's 229 * to map ranges of virtual address space to objects/files. for 230 * example, the vm_map may say: "map VA 0x1000 to 0x22000 read-only 231 * to the file /bin/ls starting at offset zero." note that 232 * the upper layer mapping is not concerned with how individual 233 * vm_pages are mapped. 234 * 235 * [2] the lower layer of the VM system (the pmap) maintains the mappings 236 * from virtual addresses. it is concerned with which vm_page is 237 * mapped where. for example, when you run /bin/ls and start 238 * at page 0x1000 the fault routine may lookup the correct page 239 * of the /bin/ls file and then ask the pmap layer to establish 240 * a mapping for it. 241 * 242 * note that information in the lower layer of the VM system can be 243 * thrown away since it can easily be reconstructed from the info 244 * in the upper layer. 245 * 246 * data structures we use include: 247 * 248 * - struct pmap: describes the address space of one thread 249 * - struct pmap_page: describes one pv-tracked page, without 250 * necessarily a corresponding vm_page 251 * - struct pv_entry: describes one <PMAP,VA> mapping of a PA 252 * - struct pv_head: there is one pv_head per pv-tracked page of 253 * physical memory. the pv_head points to a list of pv_entry 254 * structures which describe all the <PMAP,VA> pairs that this 255 * page is mapped in. this is critical for page based operations 256 * such as pmap_page_protect() [change protection on _all_ mappings 257 * of a page] 258 */ 259 260 /* 261 * memory allocation 262 * 263 * - there are three data structures that we must dynamically allocate: 264 * 265 * [A] new process' page directory page (PDP) 266 * - plan 1: done at pmap_create() we use 267 * uvm_km_alloc(kernel_map, PAGE_SIZE) [fka kmem_alloc] to do this 268 * allocation. 269 * 270 * if we are low in free physical memory then we sleep in 271 * uvm_km_alloc -- in this case this is ok since we are creating 272 * a new pmap and should not be holding any locks. 273 * 274 * if the kernel is totally out of virtual space 275 * (i.e. uvm_km_alloc returns NULL), then we panic. 276 * 277 * [B] new page tables pages (PTP) 278 * - call uvm_pagealloc() 279 * => success: zero page, add to pm_pdir 280 * => failure: we are out of free vm_pages, let pmap_enter() 281 * tell UVM about it. 282 * 283 * note: for kernel PTPs, we start with NKPTP of them. as we map 284 * kernel memory (at uvm_map time) we check to see if we've grown 285 * the kernel pmap. if so, we call the optional function 286 * pmap_growkernel() to grow the kernel PTPs in advance. 287 * 288 * [C] pv_entry structures 289 */ 290 291 /* 292 * locking 293 * 294 * we have the following locks that we must contend with: 295 * 296 * mutexes: 297 * 298 * - pmap lock (per pmap, part of uvm_object) 299 * this lock protects the fields in the pmap structure including 300 * the non-kernel PDEs in the PDP, and the PTEs. it also locks 301 * in the alternate PTE space (since that is determined by the 302 * entry in the PDP). 303 * 304 * - pvh_lock (per pv_head) 305 * this lock protects the pv_entry list which is chained off the 306 * pv_head structure for a specific pv-tracked PA. it is locked 307 * when traversing the list (e.g. adding/removing mappings, 308 * syncing R/M bits, etc.) 
309 * 310 * - pmaps_lock 311 * this lock protects the list of active pmaps (headed by "pmaps"). 312 * we lock it when adding or removing pmaps from this list. 313 */ 314 315 const vaddr_t ptp_masks[] = PTP_MASK_INITIALIZER; 316 const int ptp_shifts[] = PTP_SHIFT_INITIALIZER; 317 const long nkptpmax[] = NKPTPMAX_INITIALIZER; 318 const long nbpd[] = NBPD_INITIALIZER; 319 pd_entry_t * const normal_pdes[] = PDES_INITIALIZER; 320 321 long nkptp[] = NKPTP_INITIALIZER; 322 323 struct pmap_head pmaps; 324 kmutex_t pmaps_lock; 325 326 static vaddr_t pmap_maxkvaddr; 327 328 /* 329 * XXX kludge: dummy locking to make KASSERTs in uvm_page.c comfortable. 330 * actual locking is done by pm_lock. 331 */ 332 #if defined(DIAGNOSTIC) 333 #define PMAP_SUBOBJ_LOCK(pm, idx) \ 334 KASSERT(mutex_owned((pm)->pm_lock)); \ 335 if ((idx) != 0) \ 336 mutex_enter((pm)->pm_obj[(idx)].vmobjlock) 337 #define PMAP_SUBOBJ_UNLOCK(pm, idx) \ 338 KASSERT(mutex_owned((pm)->pm_lock)); \ 339 if ((idx) != 0) \ 340 mutex_exit((pm)->pm_obj[(idx)].vmobjlock) 341 #else /* defined(DIAGNOSTIC) */ 342 #define PMAP_SUBOBJ_LOCK(pm, idx) /* nothing */ 343 #define PMAP_SUBOBJ_UNLOCK(pm, idx) /* nothing */ 344 #endif /* defined(DIAGNOSTIC) */ 345 346 /* 347 * Misc. event counters. 348 */ 349 struct evcnt pmap_iobmp_evcnt; 350 struct evcnt pmap_ldt_evcnt; 351 352 /* 353 * PAT 354 */ 355 #define PATENTRY(n, type) (type << ((n) * 8)) 356 #define PAT_UC 0x0ULL 357 #define PAT_WC 0x1ULL 358 #define PAT_WT 0x4ULL 359 #define PAT_WP 0x5ULL 360 #define PAT_WB 0x6ULL 361 #define PAT_UCMINUS 0x7ULL 362 363 static bool cpu_pat_enabled __read_mostly = false; 364 365 /* 366 * Global data structures 367 */ 368 369 static struct pmap kernel_pmap_store; /* the kernel's pmap (proc0) */ 370 struct pmap *const kernel_pmap_ptr = &kernel_pmap_store; 371 372 /* 373 * pmap_pg_nx: if our processor supports PG_NX in the PTE then we 374 * set pmap_pg_nx to PG_NX (otherwise it is zero). 375 */ 376 pd_entry_t pmap_pg_nx __read_mostly = 0; 377 378 /* 379 * pmap_pg_g: if our processor supports PG_G in the PTE then we 380 * set pmap_pg_g to PG_G (otherwise it is zero). 381 */ 382 pd_entry_t pmap_pg_g __read_mostly = 0; 383 384 /* 385 * pmap_largepages: if our processor supports PG_PS and we are 386 * using it, this is set to true. 387 */ 388 int pmap_largepages __read_mostly = 0; 389 390 /* 391 * i386 physical memory comes in a big contig chunk with a small 392 * hole toward the front of it... the following two paddr_t's 393 * (shared with machdep.c) describe the physical address space 394 * of this machine. 
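 * during early boot, pmap_bootstrap_palloc() (below) carves bootstrap
 * allocations out of this range by simply advancing avail_start.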
395 */ 396 paddr_t avail_start __read_mostly; /* PA of first available physical page */ 397 paddr_t avail_end __read_mostly; /* PA of last available physical page */ 398 399 #ifdef XEN 400 #ifdef __x86_64__ 401 /* Dummy PGD for user cr3, used between pmap_deactivate() and pmap_activate() */ 402 static paddr_t xen_dummy_user_pgd; 403 #endif /* __x86_64__ */ 404 paddr_t pmap_pa_start; /* PA of first physical page for this domain */ 405 paddr_t pmap_pa_end; /* PA of last physical page for this domain */ 406 #endif /* XEN */ 407 408 #define VM_PAGE_TO_PP(pg) (&(pg)->mdpage.mp_pp) 409 410 #define PV_HASH_SIZE 32768 411 #define PV_HASH_LOCK_CNT 32 412 413 struct pv_hash_lock { 414 kmutex_t lock; 415 } __aligned(CACHE_LINE_SIZE) pv_hash_locks[PV_HASH_LOCK_CNT] 416 __aligned(CACHE_LINE_SIZE); 417 418 struct pv_hash_head { 419 SLIST_HEAD(, pv_entry) hh_list; 420 } pv_hash_heads[PV_HASH_SIZE]; 421 422 static u_int 423 pvhash_hash(struct vm_page *ptp, vaddr_t va) 424 { 425 426 return (uintptr_t)ptp / sizeof(*ptp) + (va >> PAGE_SHIFT); 427 } 428 429 static struct pv_hash_head * 430 pvhash_head(u_int hash) 431 { 432 433 return &pv_hash_heads[hash % PV_HASH_SIZE]; 434 } 435 436 static kmutex_t * 437 pvhash_lock(u_int hash) 438 { 439 440 return &pv_hash_locks[hash % PV_HASH_LOCK_CNT].lock; 441 } 442 443 static struct pv_entry * 444 pvhash_remove(struct pv_hash_head *hh, struct vm_page *ptp, vaddr_t va) 445 { 446 struct pv_entry *pve; 447 struct pv_entry *prev; 448 449 prev = NULL; 450 SLIST_FOREACH(pve, &hh->hh_list, pve_hash) { 451 if (pve->pve_pte.pte_ptp == ptp && 452 pve->pve_pte.pte_va == va) { 453 if (prev != NULL) { 454 SLIST_REMOVE_AFTER(prev, pve_hash); 455 } else { 456 SLIST_REMOVE_HEAD(&hh->hh_list, pve_hash); 457 } 458 break; 459 } 460 prev = pve; 461 } 462 return pve; 463 } 464 465 /* 466 * Other data structures 467 */ 468 469 static pt_entry_t protection_codes[8] __read_mostly; 470 471 static bool pmap_initialized __read_mostly = false; /* pmap_init done yet? */ 472 473 /* 474 * The following two vaddr_t's are used during system startup to keep track of 475 * how much of the kernel's VM space we have used. Once the system is started, 476 * the management of the remaining kernel VM space is turned over to the 477 * kernel_map vm_map. 478 */ 479 static vaddr_t virtual_avail __read_mostly; /* VA of first free KVA */ 480 static vaddr_t virtual_end __read_mostly; /* VA of last free KVA */ 481 482 /* 483 * LAPIC virtual address, and fake physical address. 484 */ 485 volatile vaddr_t local_apic_va; 486 paddr_t local_apic_pa; 487 488 /* 489 * pool that pmap structures are allocated from 490 */ 491 static struct pool_cache pmap_cache; 492 493 /* 494 * pv_entry cache 495 */ 496 static struct pool_cache pmap_pv_cache; 497 498 #ifndef __HAVE_DIRECT_MAP 499 /* 500 * MULTIPROCESSOR: special VAs and PTEs are actually allocated inside a 501 * (maxcpus * NPTECL) array of PTE, to avoid cache line thrashing due to 502 * false sharing. 
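 *
 * as a worked example (a sketch, assuming NPTECL is 8, i.e. one 64-byte
 * cache line of 8-byte PTEs): PTESLEW(csrc_pte, 2) yields csrc_pte + 16
 * and VASLEW(csrcp, 2) yields csrcp + 16 * PAGE_SIZE, so CPU 2's four
 * special PTEs occupy a cache line of their own.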
503 */ 504 #ifdef MULTIPROCESSOR 505 #define PTESLEW(pte, id) ((pte)+(id)*NPTECL) 506 #define VASLEW(va,id) ((va)+(id)*NPTECL*PAGE_SIZE) 507 #else 508 #define PTESLEW(pte, id) ((void)id, pte) 509 #define VASLEW(va,id) ((void)id, va) 510 #endif 511 512 /* 513 * Special VAs and the PTEs that map them 514 */ 515 static pt_entry_t *csrc_pte, *cdst_pte, *zero_pte, *ptp_pte, *early_zero_pte; 516 static char *csrcp, *cdstp, *zerop, *ptpp; 517 #ifdef XEN 518 char *early_zerop; /* also referenced from xen_locore() */ 519 #else 520 static char *early_zerop; 521 #endif 522 523 #endif 524 525 int pmap_enter_default(pmap_t, vaddr_t, paddr_t, vm_prot_t, u_int); 526 527 /* PDP pool_cache(9) and its callbacks */ 528 struct pool_cache pmap_pdp_cache; 529 static int pmap_pdp_ctor(void *, void *, int); 530 static void pmap_pdp_dtor(void *, void *); 531 #ifdef PAE 532 /* need to allocate items of 4 pages */ 533 static void *pmap_pdp_alloc(struct pool *, int); 534 static void pmap_pdp_free(struct pool *, void *); 535 static struct pool_allocator pmap_pdp_allocator = { 536 .pa_alloc = pmap_pdp_alloc, 537 .pa_free = pmap_pdp_free, 538 .pa_pagesz = PAGE_SIZE * PDP_SIZE, 539 }; 540 #endif /* PAE */ 541 542 extern vaddr_t idt_vaddr; 543 extern paddr_t idt_paddr; 544 extern vaddr_t gdt_vaddr; 545 extern paddr_t gdt_paddr; 546 extern vaddr_t ldt_vaddr; 547 extern paddr_t ldt_paddr; 548 549 extern int end; 550 551 #ifdef i386 552 /* stuff to fix the pentium f00f bug */ 553 extern vaddr_t pentium_idt_vaddr; 554 #endif 555 556 /* 557 * Local prototypes 558 */ 559 560 static void pmap_init_lapic(void); 561 #ifdef __HAVE_DIRECT_MAP 562 static void pmap_init_directmap(struct pmap *); 563 #endif 564 #ifndef XEN 565 static void pmap_remap_largepages(void); 566 #endif 567 568 static struct vm_page *pmap_get_ptp(struct pmap *, vaddr_t, 569 pd_entry_t * const *); 570 static struct vm_page *pmap_find_ptp(struct pmap *, vaddr_t, paddr_t, int); 571 static void pmap_freepage(struct pmap *, struct vm_page *, int); 572 static void pmap_free_ptp(struct pmap *, struct vm_page *, vaddr_t, 573 pt_entry_t *, pd_entry_t * const *); 574 static bool pmap_remove_pte(struct pmap *, struct vm_page *, pt_entry_t *, 575 vaddr_t, struct pv_entry **); 576 static void pmap_remove_ptes(struct pmap *, struct vm_page *, vaddr_t, vaddr_t, 577 vaddr_t, struct pv_entry **); 578 579 static paddr_t pmap_get_physpage(void); 580 static void pmap_alloc_level(vaddr_t, long *); 581 582 static bool pmap_reactivate(struct pmap *); 583 584 /* 585 * p m a p h e l p e r f u n c t i o n s 586 */ 587 588 static inline void 589 pmap_stats_update(struct pmap *pmap, int resid_diff, int wired_diff) 590 { 591 592 if (pmap == pmap_kernel()) { 593 atomic_add_long(&pmap->pm_stats.resident_count, resid_diff); 594 atomic_add_long(&pmap->pm_stats.wired_count, wired_diff); 595 } else { 596 KASSERT(mutex_owned(pmap->pm_lock)); 597 pmap->pm_stats.resident_count += resid_diff; 598 pmap->pm_stats.wired_count += wired_diff; 599 } 600 } 601 602 static inline void 603 pmap_stats_update_bypte(struct pmap *pmap, pt_entry_t npte, pt_entry_t opte) 604 { 605 int resid_diff = ((npte & PG_V) ? 1 : 0) - ((opte & PG_V) ? 1 : 0); 606 int wired_diff = ((npte & PG_W) ? 1 : 0) - ((opte & PG_W) ? 
1 : 0); 607 608 KASSERT((npte & (PG_V | PG_W)) != PG_W); 609 KASSERT((opte & (PG_V | PG_W)) != PG_W); 610 611 pmap_stats_update(pmap, resid_diff, wired_diff); 612 } 613 614 /* 615 * ptp_to_pmap: lookup pmap by ptp 616 */ 617 618 static struct pmap * 619 ptp_to_pmap(struct vm_page *ptp) 620 { 621 struct pmap *pmap; 622 623 if (ptp == NULL) { 624 return pmap_kernel(); 625 } 626 pmap = (struct pmap *)ptp->uobject; 627 KASSERT(pmap != NULL); 628 KASSERT(&pmap->pm_obj[0] == ptp->uobject); 629 return pmap; 630 } 631 632 static inline struct pv_pte * 633 pve_to_pvpte(struct pv_entry *pve) 634 { 635 636 KASSERT((void *)&pve->pve_pte == (void *)pve); 637 return &pve->pve_pte; 638 } 639 640 static inline struct pv_entry * 641 pvpte_to_pve(struct pv_pte *pvpte) 642 { 643 struct pv_entry *pve = (void *)pvpte; 644 645 KASSERT(pve_to_pvpte(pve) == pvpte); 646 return pve; 647 } 648 649 /* 650 * pv_pte_first, pv_pte_next: PV list iterator. 651 */ 652 653 static struct pv_pte * 654 pv_pte_first(struct pmap_page *pp) 655 { 656 657 if ((pp->pp_flags & PP_EMBEDDED) != 0) { 658 return &pp->pp_pte; 659 } 660 return pve_to_pvpte(LIST_FIRST(&pp->pp_head.pvh_list)); 661 } 662 663 static struct pv_pte * 664 pv_pte_next(struct pmap_page *pp, struct pv_pte *pvpte) 665 { 666 667 KASSERT(pvpte != NULL); 668 if (pvpte == &pp->pp_pte) { 669 KASSERT((pp->pp_flags & PP_EMBEDDED) != 0); 670 return NULL; 671 } 672 KASSERT((pp->pp_flags & PP_EMBEDDED) == 0); 673 return pve_to_pvpte(LIST_NEXT(pvpte_to_pve(pvpte), pve_list)); 674 } 675 676 /* 677 * pmap_is_curpmap: is this pmap the one currently loaded [in %cr3]? 678 * of course the kernel is always loaded 679 */ 680 681 bool 682 pmap_is_curpmap(struct pmap *pmap) 683 { 684 return((pmap == pmap_kernel()) || 685 (pmap == curcpu()->ci_pmap)); 686 } 687 688 /* 689 * Add a reference to the specified pmap. 690 */ 691 692 void 693 pmap_reference(struct pmap *pmap) 694 { 695 696 atomic_inc_uint(&pmap->pm_obj[0].uo_refs); 697 } 698 699 /* 700 * pmap_map_ptes: map a pmap's PTEs into KVM and lock them in 701 * 702 * there are several pmaps involved. some or all of them might be same. 703 * 704 * - the pmap given by the first argument 705 * our caller wants to access this pmap's PTEs. 706 * 707 * - pmap_kernel() 708 * the kernel pmap. note that it only contains the kernel part 709 * of the address space which is shared by any pmap. ie. any 710 * pmap can be used instead of pmap_kernel() for our purpose. 711 * 712 * - ci->ci_pmap 713 * pmap currently loaded on the cpu. 714 * 715 * - vm_map_pmap(&curproc->p_vmspace->vm_map) 716 * current process' pmap. 717 * 718 * => we lock enough pmaps to keep things locked in 719 * => must be undone with pmap_unmap_ptes before returning 720 */ 721 722 void 723 pmap_map_ptes(struct pmap *pmap, struct pmap **pmap2, 724 pd_entry_t **ptepp, pd_entry_t * const **pdeppp) 725 { 726 struct pmap *curpmap; 727 struct cpu_info *ci; 728 lwp_t *l; 729 730 /* The kernel's pmap is always accessible. */ 731 if (pmap == pmap_kernel()) { 732 *pmap2 = NULL; 733 *ptepp = PTE_BASE; 734 *pdeppp = normal_pdes; 735 return; 736 } 737 KASSERT(kpreempt_disabled()); 738 739 l = curlwp; 740 retry: 741 mutex_enter(pmap->pm_lock); 742 ci = curcpu(); 743 curpmap = ci->ci_pmap; 744 if (vm_map_pmap(&l->l_proc->p_vmspace->vm_map) == pmap) { 745 /* Our own pmap so just load it: easy. 
*/ 746 if (__predict_false(ci->ci_want_pmapload)) { 747 mutex_exit(pmap->pm_lock); 748 pmap_load(); 749 goto retry; 750 } 751 KASSERT(pmap == curpmap); 752 } else if (pmap == curpmap) { 753 /* 754 * Already on the CPU: make it valid. This is very 755 * often the case during exit(), when we have switched 756 * to the kernel pmap in order to destroy a user pmap. 757 */ 758 if (!pmap_reactivate(pmap)) { 759 u_int gen = uvm_emap_gen_return(); 760 tlbflush(); 761 uvm_emap_update(gen); 762 } 763 } else { 764 /* 765 * Toss current pmap from CPU, but keep a reference to it. 766 * The reference will be dropped by pmap_unmap_ptes(). 767 * Can happen if we block during exit(). 768 */ 769 const cpuid_t cid = cpu_index(ci); 770 771 kcpuset_atomic_clear(curpmap->pm_cpus, cid); 772 kcpuset_atomic_clear(curpmap->pm_kernel_cpus, cid); 773 ci->ci_pmap = pmap; 774 ci->ci_tlbstate = TLBSTATE_VALID; 775 kcpuset_atomic_set(pmap->pm_cpus, cid); 776 kcpuset_atomic_set(pmap->pm_kernel_cpus, cid); 777 cpu_load_pmap(pmap, curpmap); 778 } 779 pmap->pm_ncsw = l->l_ncsw; 780 *pmap2 = curpmap; 781 *ptepp = PTE_BASE; 782 #if defined(XEN) && defined(__x86_64__) 783 KASSERT(ci->ci_normal_pdes[PTP_LEVELS - 2] == L4_BASE); 784 ci->ci_normal_pdes[PTP_LEVELS - 2] = pmap->pm_pdir; 785 *pdeppp = ci->ci_normal_pdes; 786 #else /* XEN && __x86_64__ */ 787 *pdeppp = normal_pdes; 788 #endif /* XEN && __x86_64__ */ 789 } 790 791 /* 792 * pmap_unmap_ptes: unlock the PTE mapping of "pmap" 793 */ 794 795 void 796 pmap_unmap_ptes(struct pmap *pmap, struct pmap *pmap2) 797 { 798 struct cpu_info *ci; 799 struct pmap *mypmap; 800 801 KASSERT(kpreempt_disabled()); 802 803 /* The kernel's pmap is always accessible. */ 804 if (pmap == pmap_kernel()) { 805 return; 806 } 807 808 ci = curcpu(); 809 #if defined(XEN) && defined(__x86_64__) 810 /* Reset per-cpu normal_pdes */ 811 KASSERT(ci->ci_normal_pdes[PTP_LEVELS - 2] != L4_BASE); 812 ci->ci_normal_pdes[PTP_LEVELS - 2] = L4_BASE; 813 #endif /* XEN && __x86_64__ */ 814 /* 815 * We cannot tolerate context switches while mapped in. 816 * If it is our own pmap all we have to do is unlock. 817 */ 818 KASSERT(pmap->pm_ncsw == curlwp->l_ncsw); 819 mypmap = vm_map_pmap(&curproc->p_vmspace->vm_map); 820 if (pmap == mypmap) { 821 mutex_exit(pmap->pm_lock); 822 return; 823 } 824 825 /* 826 * Mark whatever's on the CPU now as lazy and unlock. 827 * If the pmap was already installed, we are done. 828 */ 829 ci->ci_tlbstate = TLBSTATE_LAZY; 830 ci->ci_want_pmapload = (mypmap != pmap_kernel()); 831 mutex_exit(pmap->pm_lock); 832 if (pmap == pmap2) { 833 return; 834 } 835 836 /* 837 * We installed another pmap on the CPU. Grab a reference to 838 * it and leave in place. Toss the evicted pmap (can block). 839 */ 840 pmap_reference(pmap); 841 pmap_destroy(pmap2); 842 } 843 844 845 inline static void 846 pmap_exec_account(struct pmap *pm, vaddr_t va, pt_entry_t opte, pt_entry_t npte) 847 { 848 849 #if !defined(__x86_64__) 850 if (curproc == NULL || curproc->p_vmspace == NULL || 851 pm != vm_map_pmap(&curproc->p_vmspace->vm_map)) 852 return; 853 854 if ((opte ^ npte) & PG_X) 855 pmap_update_pg(va); 856 857 /* 858 * Executability was removed on the last executable change. 859 * Reset the code segment to something conservative and 860 * let the trap handler deal with setting the right limit. 861 * We can't do that because of locking constraints on the vm map. 
862 */ 863 864 if ((opte & PG_X) && (npte & PG_X) == 0 && va == pm->pm_hiexec) { 865 struct trapframe *tf = curlwp->l_md.md_regs; 866 867 tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL); 868 pm->pm_hiexec = I386_MAX_EXE_ADDR; 869 } 870 #endif /* !defined(__x86_64__) */ 871 } 872 873 #if !defined(__x86_64__) 874 /* 875 * Fixup the code segment to cover all potential executable mappings. 876 * returns 0 if no changes to the code segment were made. 877 */ 878 879 int 880 pmap_exec_fixup(struct vm_map *map, struct trapframe *tf, struct pcb *pcb) 881 { 882 struct vm_map_entry *ent; 883 struct pmap *pm = vm_map_pmap(map); 884 vaddr_t va = 0; 885 886 vm_map_lock_read(map); 887 for (ent = (&map->header)->next; ent != &map->header; ent = ent->next) { 888 889 /* 890 * This entry has greater va than the entries before. 891 * We need to make it point to the last page, not past it. 892 */ 893 894 if (ent->protection & VM_PROT_EXECUTE) 895 va = trunc_page(ent->end) - PAGE_SIZE; 896 } 897 vm_map_unlock_read(map); 898 if (va == pm->pm_hiexec && tf->tf_cs == GSEL(GUCODEBIG_SEL, SEL_UPL)) 899 return (0); 900 901 pm->pm_hiexec = va; 902 if (pm->pm_hiexec > I386_MAX_EXE_ADDR) { 903 tf->tf_cs = GSEL(GUCODEBIG_SEL, SEL_UPL); 904 } else { 905 tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL); 906 return (0); 907 } 908 return (1); 909 } 910 #endif /* !defined(__x86_64__) */ 911 912 void 913 pat_init(struct cpu_info *ci) 914 { 915 uint64_t pat; 916 917 if (!(ci->ci_feat_val[0] & CPUID_PAT)) 918 return; 919 920 /* We change WT to WC. Leave all other entries the default values. */ 921 pat = PATENTRY(0, PAT_WB) | PATENTRY(1, PAT_WC) | 922 PATENTRY(2, PAT_UCMINUS) | PATENTRY(3, PAT_UC) | 923 PATENTRY(4, PAT_WB) | PATENTRY(5, PAT_WC) | 924 PATENTRY(6, PAT_UCMINUS) | PATENTRY(7, PAT_UC); 925 926 wrmsr(MSR_CR_PAT, pat); 927 cpu_pat_enabled = true; 928 aprint_debug_dev(ci->ci_dev, "PAT enabled\n"); 929 } 930 931 static pt_entry_t 932 pmap_pat_flags(u_int flags) 933 { 934 u_int cacheflags = (flags & PMAP_CACHE_MASK); 935 936 if (!cpu_pat_enabled) { 937 switch (cacheflags) { 938 case PMAP_NOCACHE: 939 case PMAP_NOCACHE_OVR: 940 /* results in PGC_UCMINUS on cpus which have 941 * the cpuid PAT but PAT "disabled" 942 */ 943 return PG_N; 944 default: 945 return 0; 946 } 947 } 948 949 switch (cacheflags) { 950 case PMAP_NOCACHE: 951 return PGC_UC; 952 case PMAP_WRITE_COMBINE: 953 return PGC_WC; 954 case PMAP_WRITE_BACK: 955 return PGC_WB; 956 case PMAP_NOCACHE_OVR: 957 return PGC_UCMINUS; 958 } 959 960 return 0; 961 } 962 963 /* 964 * p m a p k e n t e r f u n c t i o n s 965 * 966 * functions to quickly enter/remove pages from the kernel address 967 * space. pmap_kremove is exported to MI kernel. we make use of 968 * the recursive PTE mappings. 
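 *
 * a minimal usage sketch (hypothetical caller, not code from this file):
 *
 *      va = uvm_km_alloc(kernel_map, PAGE_SIZE, 0, UVM_KMF_VAONLY);
 *      pmap_kenter_pa(va, pa, VM_PROT_READ | VM_PROT_WRITE, 0);
 *      pmap_update(pmap_kernel());
 *      ... use the mapping ...
 *      pmap_kremove(va, PAGE_SIZE);
 *      pmap_update(pmap_kernel());    (required before the page is reused)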
 */

/*
 * pmap_kenter_pa: enter a kernel mapping without R/M (pv_entry) tracking
 *
 * => no need to lock anything, assume va is already allocated
 * => should be faster than normal pmap enter function
 */

void
pmap_kenter_pa(vaddr_t va, paddr_t pa, vm_prot_t prot, u_int flags)
{
	pt_entry_t *pte, opte, npte;

	KASSERT(!(prot & ~VM_PROT_ALL));

	if (va < VM_MIN_KERNEL_ADDRESS)
		pte = vtopte(va);
	else
		pte = kvtopte(va);
#ifdef DOM0OPS
	if (pa < pmap_pa_start || pa >= pmap_pa_end) {
#ifdef DEBUG
		printf_nolog("%s: pa 0x%" PRIx64 " for va 0x%" PRIx64
		    " outside range\n", __func__, (int64_t)pa, (int64_t)va);
#endif /* DEBUG */
		npte = pa;
	} else
#endif /* DOM0OPS */
		npte = pmap_pa2pte(pa);
	npte |= protection_codes[prot] | PG_k | PG_V | pmap_pg_g;
	npte |= pmap_pat_flags(flags);
	opte = pmap_pte_testset(pte, npte); /* zap! */
#if defined(DIAGNOSTIC)
	/*
	 * XXX: make sure we are not dealing with a large page, since the only
	 * large pages created are for the kernel image, and they should never
	 * be kentered.
	 */
	if (opte & PG_PS)
		panic("%s: PG_PS", __func__);
#endif
	if ((opte & (PG_V | PG_U)) == (PG_V | PG_U)) {
		/* This should not happen. */
		printf_nolog("%s: mapping already present\n", __func__);
		kpreempt_disable();
		pmap_tlb_shootdown(pmap_kernel(), va, opte, TLBSHOOT_KENTER);
		kpreempt_enable();
	}
}

void
pmap_emap_enter(vaddr_t va, paddr_t pa, vm_prot_t prot)
{
	pt_entry_t *pte, npte;

	KASSERT((prot & ~VM_PROT_ALL) == 0);
	pte = (va < VM_MIN_KERNEL_ADDRESS) ? vtopte(va) : kvtopte(va);

#ifdef DOM0OPS
	if (pa < pmap_pa_start || pa >= pmap_pa_end) {
		npte = pa;
	} else
#endif
		npte = pmap_pa2pte(pa);

	npte |= protection_codes[prot] | PG_k | PG_V;
	pmap_pte_set(pte, npte);
}

/*
 * pmap_emap_sync: perform TLB flush or pmap load, if it was deferred.
 */
void
pmap_emap_sync(bool canload)
{
	struct cpu_info *ci = curcpu();
	struct pmap *pmap;

	KASSERT(kpreempt_disabled());
	if (__predict_true(ci->ci_want_pmapload && canload)) {
		/*
		 * XXX: Hint for pmap_reactivate(), which might suggest to
		 * not perform TLB flush, if state has not changed.
		 */
		pmap = vm_map_pmap(&curlwp->l_proc->p_vmspace->vm_map);
		if (__predict_false(pmap == ci->ci_pmap)) {
			kcpuset_atomic_clear(pmap->pm_cpus, cpu_index(ci));
		}
		pmap_load();
		KASSERT(ci->ci_want_pmapload == 0);
	} else {
		tlbflush();
	}
}

void
pmap_emap_remove(vaddr_t sva, vsize_t len)
{
	pt_entry_t *pte;
	vaddr_t va, eva = sva + len;

	for (va = sva; va < eva; va += PAGE_SIZE) {
		pte = (va < VM_MIN_KERNEL_ADDRESS) ? vtopte(va) : kvtopte(va);
		pmap_pte_set(pte, 0);
	}
}

__strict_weak_alias(pmap_kenter_ma, pmap_kenter_pa);

#if defined(__x86_64__)
/*
 * Change protection for a virtual address. Local for a CPU only, don't
 * care about TLB shootdowns.
1084 * 1085 * => must be called with preemption disabled 1086 */ 1087 void 1088 pmap_changeprot_local(vaddr_t va, vm_prot_t prot) 1089 { 1090 pt_entry_t *pte, opte, npte; 1091 1092 KASSERT(kpreempt_disabled()); 1093 1094 if (va < VM_MIN_KERNEL_ADDRESS) 1095 pte = vtopte(va); 1096 else 1097 pte = kvtopte(va); 1098 1099 npte = opte = *pte; 1100 1101 if ((prot & VM_PROT_WRITE) != 0) 1102 npte |= PG_RW; 1103 else 1104 npte &= ~PG_RW; 1105 1106 if (opte != npte) { 1107 pmap_pte_set(pte, npte); 1108 pmap_pte_flush(); 1109 invlpg(va); 1110 } 1111 } 1112 #endif /* defined(__x86_64__) */ 1113 1114 /* 1115 * pmap_kremove: remove a kernel mapping(s) without R/M (pv_entry) tracking 1116 * 1117 * => no need to lock anything 1118 * => caller must dispose of any vm_page mapped in the va range 1119 * => note: not an inline function 1120 * => we assume the va is page aligned and the len is a multiple of PAGE_SIZE 1121 * => we assume kernel only unmaps valid addresses and thus don't bother 1122 * checking the valid bit before doing TLB flushing 1123 * => must be followed by call to pmap_update() before reuse of page 1124 */ 1125 1126 static inline void 1127 pmap_kremove1(vaddr_t sva, vsize_t len, bool localonly) 1128 { 1129 pt_entry_t *pte, opte; 1130 vaddr_t va, eva; 1131 1132 eva = sva + len; 1133 1134 kpreempt_disable(); 1135 for (va = sva; va < eva; va += PAGE_SIZE) { 1136 pte = kvtopte(va); 1137 opte = pmap_pte_testset(pte, 0); /* zap! */ 1138 if ((opte & (PG_V | PG_U)) == (PG_V | PG_U) && !localonly) { 1139 pmap_tlb_shootdown(pmap_kernel(), va, opte, 1140 TLBSHOOT_KREMOVE); 1141 } 1142 KASSERT((opte & PG_PS) == 0); 1143 KASSERT((opte & PG_PVLIST) == 0); 1144 } 1145 if (localonly) { 1146 tlbflushg(); 1147 } 1148 kpreempt_enable(); 1149 } 1150 1151 void 1152 pmap_kremove(vaddr_t sva, vsize_t len) 1153 { 1154 1155 pmap_kremove1(sva, len, false); 1156 } 1157 1158 /* 1159 * pmap_kremove_local: like pmap_kremove(), but only worry about 1160 * TLB invalidations on the current CPU. this is only intended 1161 * for use while writing kernel crash dumps. 1162 */ 1163 1164 void 1165 pmap_kremove_local(vaddr_t sva, vsize_t len) 1166 { 1167 1168 KASSERT(panicstr != NULL); 1169 pmap_kremove1(sva, len, true); 1170 } 1171 1172 /* 1173 * p m a p i n i t f u n c t i o n s 1174 * 1175 * pmap_bootstrap and pmap_init are called during system startup 1176 * to init the pmap module. pmap_bootstrap() does a low level 1177 * init just to get things rolling. pmap_init() finishes the job. 1178 */ 1179 1180 /* 1181 * pmap_bootstrap_valloc: allocate a virtual address in the bootstrap area. 1182 * This function is to be used before any VM system has been set up. 1183 * 1184 * The va is taken from virtual_avail. 1185 */ 1186 static vaddr_t 1187 pmap_bootstrap_valloc(size_t npages) 1188 { 1189 vaddr_t va = virtual_avail; 1190 virtual_avail += npages * PAGE_SIZE; 1191 return va; 1192 } 1193 1194 /* 1195 * pmap_bootstrap_palloc: allocate a physical address in the bootstrap area. 1196 * This function is to be used before any VM system has been set up. 1197 * 1198 * The pa is taken from avail_start. 1199 */ 1200 static paddr_t 1201 pmap_bootstrap_palloc(size_t npages) 1202 { 1203 paddr_t pa = avail_start; 1204 avail_start += npages * PAGE_SIZE; 1205 return pa; 1206 } 1207 1208 /* 1209 * pmap_bootstrap: get the system in a state where it can run with VM properly 1210 * enabled (called before main()). The VM system is fully init'd later. 
1211 * 1212 * => on i386, locore.S has already enabled the MMU by allocating a PDP for the 1213 * kernel, and nkpde PTP's for the kernel. 1214 * => kva_start is the first free virtual address in kernel space. 1215 */ 1216 void 1217 pmap_bootstrap(vaddr_t kva_start) 1218 { 1219 struct pmap *kpm; 1220 int i; 1221 vaddr_t kva; 1222 #ifndef XEN 1223 unsigned long p1i; 1224 vaddr_t kva_end; 1225 #endif 1226 1227 pmap_pg_nx = (cpu_feature[2] & CPUID_NOX ? PG_NX : 0); 1228 1229 /* 1230 * Set up our local static global vars that keep track of the usage of 1231 * KVM before kernel_map is set up. 1232 */ 1233 virtual_avail = kva_start; /* first free KVA */ 1234 virtual_end = VM_MAX_KERNEL_ADDRESS; /* last KVA */ 1235 1236 /* 1237 * Set up protection_codes: we need to be able to convert from a MI 1238 * protection code (some combo of VM_PROT...) to something we can jam 1239 * into a x86 PTE. 1240 */ 1241 protection_codes[VM_PROT_NONE] = pmap_pg_nx; 1242 protection_codes[VM_PROT_EXECUTE] = PG_RO | PG_X; 1243 protection_codes[VM_PROT_READ] = PG_RO | pmap_pg_nx; 1244 protection_codes[VM_PROT_READ|VM_PROT_EXECUTE] = PG_RO | PG_X; 1245 protection_codes[VM_PROT_WRITE] = PG_RW | pmap_pg_nx; 1246 protection_codes[VM_PROT_WRITE|VM_PROT_EXECUTE] = PG_RW | PG_X; 1247 protection_codes[VM_PROT_WRITE|VM_PROT_READ] = PG_RW | pmap_pg_nx; 1248 protection_codes[VM_PROT_ALL] = PG_RW | PG_X; 1249 1250 /* 1251 * Now we init the kernel's pmap. 1252 * 1253 * The kernel pmap's pm_obj is not used for much. However, in user pmaps 1254 * the pm_obj contains the list of active PTPs. 1255 * 1256 * The pm_obj currently does not have a pager. It might be possible to 1257 * add a pager that would allow a process to read-only mmap its own page 1258 * tables (fast user-level vtophys?). This may or may not be useful. 1259 */ 1260 kpm = pmap_kernel(); 1261 for (i = 0; i < PTP_LEVELS - 1; i++) { 1262 mutex_init(&kpm->pm_obj_lock[i], MUTEX_DEFAULT, IPL_NONE); 1263 uvm_obj_init(&kpm->pm_obj[i], NULL, false, 1); 1264 uvm_obj_setlock(&kpm->pm_obj[i], &kpm->pm_obj_lock[i]); 1265 kpm->pm_ptphint[i] = NULL; 1266 } 1267 memset(&kpm->pm_list, 0, sizeof(kpm->pm_list)); /* pm_list not used */ 1268 1269 kpm->pm_pdir = (pd_entry_t *)(PDPpaddr + KERNBASE); 1270 for (i = 0; i < PDP_SIZE; i++) 1271 kpm->pm_pdirpa[i] = PDPpaddr + PAGE_SIZE * i; 1272 1273 kpm->pm_stats.wired_count = kpm->pm_stats.resident_count = 1274 x86_btop(kva_start - VM_MIN_KERNEL_ADDRESS); 1275 1276 kcpuset_create(&kpm->pm_cpus, true); 1277 kcpuset_create(&kpm->pm_kernel_cpus, true); 1278 1279 /* 1280 * the above is just a rough estimate and not critical to the proper 1281 * operation of the system. 1282 */ 1283 1284 #ifndef XEN 1285 /* 1286 * Begin to enable global TLB entries if they are supported. 1287 * The G bit has no effect until the CR4_PGE bit is set in CR4, 1288 * which happens in cpu_init(), which is run on each cpu 1289 * (and happens later) 1290 */ 1291 if (cpu_feature[0] & CPUID_PGE) { 1292 pmap_pg_g = PG_G; /* enable software */ 1293 1294 /* add PG_G attribute to already mapped kernel pages */ 1295 1296 if (KERNBASE == VM_MIN_KERNEL_ADDRESS) { 1297 /* i386 only */ 1298 kva_end = virtual_avail; 1299 } else { 1300 /* amd64 only */ 1301 extern vaddr_t kern_end; 1302 kva_end = kern_end; 1303 } 1304 1305 for (kva = KERNBASE; kva < kva_end; kva += PAGE_SIZE) { 1306 p1i = pl1_i(kva); 1307 if (pmap_valid_entry(PTE_BASE[p1i])) 1308 PTE_BASE[p1i] |= PG_G; 1309 } 1310 } 1311 1312 /* 1313 * Enable large pages if they are supported. 
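	 * (a PG_PS entry at level 2 maps an entire NBPD_L2-sized block,
	 * 2MB on amd64/PAE or 4MB on non-PAE i386, without any L1 page
	 * table.)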
1314 */ 1315 if (cpu_feature[0] & CPUID_PSE) { 1316 lcr4(rcr4() | CR4_PSE); /* enable hardware (via %cr4) */ 1317 pmap_largepages = 1; /* enable software */ 1318 1319 /* 1320 * The TLB must be flushed after enabling large pages on Pentium 1321 * CPUs, according to section 3.6.2.2 of "Intel Architecture 1322 * Software Developer's Manual, Volume 3: System Programming". 1323 */ 1324 tlbflushg(); 1325 1326 /* Remap the kernel. */ 1327 pmap_remap_largepages(); 1328 } 1329 #endif /* !XEN */ 1330 1331 pmap_init_lapic(); 1332 1333 #ifdef __HAVE_DIRECT_MAP 1334 pmap_init_directmap(kpm); 1335 #else 1336 if (VM_MIN_KERNEL_ADDRESS != KERNBASE) { 1337 /* 1338 * zero_pte is stuck at the end of mapped space for the kernel 1339 * image (disjunct from kva space). This is done so that it 1340 * can safely be used in pmap_growkernel (pmap_get_physpage), 1341 * when it's called for the first time. 1342 * XXXfvdl fix this for MULTIPROCESSOR later. 1343 */ 1344 #ifdef XEN 1345 /* early_zerop initialized in xen_locore() */ 1346 #else 1347 early_zerop = (void *)(KERNBASE + NKL2_KIMG_ENTRIES * NBPD_L2); 1348 #endif 1349 early_zero_pte = PTE_BASE + pl1_i((vaddr_t)early_zerop); 1350 } 1351 1352 /* 1353 * Now we allocate the "special" VAs which are used for tmp mappings 1354 * by the pmap (and other modules). We allocate the VAs by advancing 1355 * virtual_avail (note that there are no pages mapped at these VAs). 1356 * we find the PTE that maps the allocated VA via the linear PTE 1357 * mapping. 1358 */ 1359 1360 pt_entry_t *pte = PTE_BASE + pl1_i(virtual_avail); 1361 1362 #ifdef MULTIPROCESSOR 1363 /* 1364 * Waste some VA space to avoid false sharing of cache lines 1365 * for page table pages: Give each possible CPU a cache line 1366 * of PTE's (8) to play with, though we only need 4. We could 1367 * recycle some of this waste by putting the idle stacks here 1368 * as well; we could waste less space if we knew the largest 1369 * CPU ID beforehand. 1370 */ 1371 csrcp = (char *) virtual_avail; csrc_pte = pte; 1372 1373 cdstp = (char *) virtual_avail+PAGE_SIZE; cdst_pte = pte+1; 1374 1375 zerop = (char *) virtual_avail+PAGE_SIZE*2; zero_pte = pte+2; 1376 1377 ptpp = (char *) virtual_avail+PAGE_SIZE*3; ptp_pte = pte+3; 1378 1379 virtual_avail += PAGE_SIZE * maxcpus * NPTECL; 1380 pte += maxcpus * NPTECL; 1381 #else 1382 csrcp = (void *) virtual_avail; csrc_pte = pte; /* allocate */ 1383 virtual_avail += PAGE_SIZE; pte++; /* advance */ 1384 1385 cdstp = (void *) virtual_avail; cdst_pte = pte; 1386 virtual_avail += PAGE_SIZE; pte++; 1387 1388 zerop = (void *) virtual_avail; zero_pte = pte; 1389 virtual_avail += PAGE_SIZE; pte++; 1390 1391 ptpp = (void *) virtual_avail; ptp_pte = pte; 1392 virtual_avail += PAGE_SIZE; pte++; 1393 #endif 1394 1395 if (VM_MIN_KERNEL_ADDRESS == KERNBASE) { 1396 early_zerop = zerop; 1397 early_zero_pte = zero_pte; 1398 } 1399 #endif 1400 1401 #if defined(XEN) && defined(__x86_64__) 1402 /* 1403 * We want a dummy page directory for Xen: when deactivating a pmap, Xen 1404 * will still consider it active. So we set user PGD to this one to lift 1405 * all protection on the now inactive page tables set. 
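	 * (the page is zero-filled, mapped read-only and pinned as an L4
	 * table below, so Xen accepts it as a valid, empty top level.)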
1406 */ 1407 xen_dummy_user_pgd = pmap_bootstrap_palloc(1); 1408 1409 /* Zero fill it, the less checks in Xen it requires the better */ 1410 memset((void *) (xen_dummy_user_pgd + KERNBASE), 0, PAGE_SIZE); 1411 /* Mark read-only */ 1412 HYPERVISOR_update_va_mapping(xen_dummy_user_pgd + KERNBASE, 1413 pmap_pa2pte(xen_dummy_user_pgd) | PG_u | PG_V, UVMF_INVLPG); 1414 /* Pin as L4 */ 1415 xpq_queue_pin_l4_table(xpmap_ptom_masked(xen_dummy_user_pgd)); 1416 #endif 1417 1418 /* 1419 * Allocate space for the IDT, GDT and LDT. 1420 */ 1421 idt_vaddr = pmap_bootstrap_valloc(1); 1422 idt_paddr = pmap_bootstrap_palloc(1); 1423 1424 gdt_vaddr = pmap_bootstrap_valloc(1); 1425 gdt_paddr = pmap_bootstrap_palloc(1); 1426 1427 ldt_vaddr = pmap_bootstrap_valloc(1); 1428 ldt_paddr = pmap_bootstrap_palloc(1); 1429 1430 #if !defined(__x86_64__) && !defined(XEN) 1431 /* pentium f00f bug stuff */ 1432 pentium_idt_vaddr = pmap_bootstrap_valloc(1); 1433 #endif 1434 1435 /* 1436 * Now we reserve some VM for mapping pages when doing a crash dump. 1437 */ 1438 virtual_avail = reserve_dumppages(virtual_avail); 1439 1440 /* 1441 * Init the static-global locks and global lists. 1442 * 1443 * => pventry::pvh_lock (initialized elsewhere) must also be 1444 * a spin lock, again at IPL_VM to prevent deadlock, and 1445 * again is never taken from interrupt context. 1446 */ 1447 mutex_init(&pmaps_lock, MUTEX_DEFAULT, IPL_NONE); 1448 LIST_INIT(&pmaps); 1449 1450 /* 1451 * Ensure the TLB is sync'd with reality by flushing it... 1452 */ 1453 tlbflushg(); 1454 1455 /* 1456 * Calculate pmap_maxkvaddr from nkptp[]. 1457 */ 1458 kva = VM_MIN_KERNEL_ADDRESS; 1459 for (i = PTP_LEVELS - 1; i >= 1; i--) { 1460 kva += nkptp[i] * nbpd[i]; 1461 } 1462 pmap_maxkvaddr = kva; 1463 } 1464 1465 static void 1466 pmap_init_lapic(void) 1467 { 1468 /* 1469 * On CPUs that have no LAPIC, local_apic_va is never kentered. But our 1470 * x86 implementation relies a lot on this address to be valid; so just 1471 * allocate a fake physical page that will be kentered into 1472 * local_apic_va by machdep. 1473 * 1474 * If the LAPIC is present, the va will be remapped somewhere else 1475 * later in lapic_map. 1476 */ 1477 local_apic_va = pmap_bootstrap_valloc(1); 1478 local_apic_pa = pmap_bootstrap_palloc(1); 1479 } 1480 1481 #ifdef __HAVE_DIRECT_MAP 1482 /* 1483 * Create the amd64 direct map. Called only once at boot time. 1484 */ 1485 static void 1486 pmap_init_directmap(struct pmap *kpm) 1487 { 1488 extern phys_ram_seg_t mem_clusters[]; 1489 extern int mem_cluster_cnt; 1490 1491 paddr_t lastpa, dm_pd, dm_pdp, pdp; 1492 vaddr_t tmpva; 1493 pt_entry_t *pte; 1494 pd_entry_t *pde; 1495 phys_ram_seg_t *mc; 1496 long n_dm_pdp; 1497 int i; 1498 1499 const pd_entry_t pteflags = PG_V | PG_KW | pmap_pg_nx; 1500 1501 /* Get the last physical address available */ 1502 lastpa = 0; 1503 for (i = 0; i < mem_cluster_cnt; i++) { 1504 mc = &mem_clusters[i]; 1505 lastpa = MAX(lastpa, mc->start + mc->size); 1506 } 1507 1508 /* 1509 * We allocate only one L4 entry for the direct map (PDIR_SLOT_DIRECT), 1510 * so we cannot map more than 512GB. 1511 */ 1512 if (lastpa > NBPD_L4) { 1513 panic("RAM limit reached: > 512GB not supported"); 1514 } 1515 1516 /* Allocate L3. */ 1517 dm_pdp = pmap_bootstrap_palloc(1); 1518 1519 /* Number of L3 entries. */ 1520 n_dm_pdp = (lastpa + NBPD_L3 - 1) >> L3_SHIFT; 1521 1522 /* In locore.S, we allocated a tmp va. Use it now. 
*/ 1523 tmpva = (KERNBASE + NKL2_KIMG_ENTRIES * NBPD_L2); 1524 pte = PTE_BASE + pl1_i(tmpva); 1525 *pte = dm_pdp | pteflags; 1526 pmap_update_pg(tmpva); 1527 memset((void *)tmpva, 0, PAGE_SIZE); 1528 1529 /* 1530 * Map the direct map RW. Use super pages (1GB) or large pages (2MB) if 1531 * they are supported. Note: PG_G is not allowed on non-leaf PTPs. 1532 */ 1533 if (cpu_feature[2] & CPUID_P1GB) { 1534 /* Super pages are supported. Just create L3. */ 1535 for (i = 0; i < n_dm_pdp; i++) { 1536 pdp = (paddr_t)&(((pd_entry_t *)dm_pdp)[i]); 1537 *pte = (pdp & PG_FRAME) | pteflags; 1538 pmap_update_pg(tmpva); 1539 1540 pde = (pd_entry_t *)(tmpva + (pdp & ~PG_FRAME)); 1541 *pde = ((paddr_t)i << L3_SHIFT) | pteflags | PG_U | 1542 PG_PS | PG_G; 1543 } 1544 } else { 1545 /* Allocate L2. */ 1546 dm_pd = pmap_bootstrap_palloc(n_dm_pdp); 1547 1548 /* Zero out the L2 pages. */ 1549 for (i = 0; i < n_dm_pdp; i++) { 1550 pdp = dm_pd + i * PAGE_SIZE; 1551 *pte = (pdp & PG_FRAME) | pteflags; 1552 pmap_update_pg(tmpva); 1553 1554 memset((void *)tmpva, 0, PAGE_SIZE); 1555 } 1556 1557 KASSERT(pmap_largepages != 0); 1558 1559 /* Large pages are supported. Just create L2. */ 1560 for (i = 0; i < NPDPG * n_dm_pdp; i++) { 1561 pdp = (paddr_t)&(((pd_entry_t *)dm_pd)[i]); 1562 *pte = (pdp & PG_FRAME) | pteflags; 1563 pmap_update_pg(tmpva); 1564 1565 pde = (pd_entry_t *)(tmpva + (pdp & ~PG_FRAME)); 1566 *pde = ((paddr_t)i << L2_SHIFT) | pteflags | 1567 PG_U | PG_PS | PG_G; 1568 } 1569 1570 /* Fill in the L3 entries, linked to L2. */ 1571 for (i = 0; i < n_dm_pdp; i++) { 1572 pdp = (paddr_t)&(((pd_entry_t *)dm_pdp)[i]); 1573 *pte = (pdp & PG_FRAME) | pteflags; 1574 pmap_update_pg(tmpva); 1575 1576 pde = (pd_entry_t *)(tmpva + (pdp & ~PG_FRAME)); 1577 *pde = (dm_pd + (i << PAGE_SHIFT)) | pteflags | PG_U; 1578 } 1579 } 1580 1581 kpm->pm_pdir[PDIR_SLOT_DIRECT] = dm_pdp | pteflags | PG_U; 1582 1583 *pte = 0; 1584 pmap_update_pg(tmpva); 1585 1586 tlbflush(); 1587 } 1588 #endif /* __HAVE_DIRECT_MAP */ 1589 1590 #ifndef XEN 1591 /* 1592 * Remap several kernel segments with large pages. We cover as many pages as we 1593 * can. Called only once at boot time, if the CPU supports large pages. 1594 */ 1595 static void 1596 pmap_remap_largepages(void) 1597 { 1598 extern char __rodata_start; 1599 extern char __data_start; 1600 extern char __kernel_end; 1601 pd_entry_t *pde; 1602 vaddr_t kva, kva_end; 1603 paddr_t pa; 1604 1605 /* Remap the kernel text using large pages. */ 1606 kva = KERNBASE; 1607 kva_end = rounddown((vaddr_t)&__rodata_start, NBPD_L1); 1608 pa = kva - KERNBASE; 1609 for (/* */; kva + NBPD_L2 <= kva_end; kva += NBPD_L2, pa += NBPD_L2) { 1610 pde = &L2_BASE[pl2_i(kva)]; 1611 *pde = pa | pmap_pg_g | PG_PS | PG_KR | PG_V; 1612 tlbflushg(); 1613 } 1614 #if defined(DEBUG) 1615 aprint_normal("kernel text is mapped with %" PRIuPSIZE " large " 1616 "pages and %" PRIuPSIZE " normal pages\n", 1617 howmany(kva - KERNBASE, NBPD_L2), 1618 howmany((vaddr_t)&__rodata_start - kva, NBPD_L1)); 1619 #endif /* defined(DEBUG) */ 1620 1621 /* Remap the kernel rodata using large pages. */ 1622 kva = roundup((vaddr_t)&__rodata_start, NBPD_L2); 1623 kva_end = rounddown((vaddr_t)&__data_start, NBPD_L1); 1624 pa = kva - KERNBASE; 1625 for (/* */; kva + NBPD_L2 <= kva_end; kva += NBPD_L2, pa += NBPD_L2) { 1626 pde = &L2_BASE[pl2_i(kva)]; 1627 *pde = pa | pmap_pg_g | PG_PS | pmap_pg_nx | PG_KR | PG_V; 1628 tlbflushg(); 1629 } 1630 1631 /* Remap the kernel data+bss using large pages. 
*/ 1632 kva = roundup((vaddr_t)&__data_start, NBPD_L2); 1633 kva_end = rounddown((vaddr_t)&__kernel_end, NBPD_L1); 1634 pa = kva - KERNBASE; 1635 for (/* */; kva + NBPD_L2 <= kva_end; kva += NBPD_L2, pa += NBPD_L2) { 1636 pde = &L2_BASE[pl2_i(kva)]; 1637 *pde = pa | pmap_pg_g | PG_PS | pmap_pg_nx | PG_KW | PG_V; 1638 tlbflushg(); 1639 } 1640 } 1641 #endif /* !XEN */ 1642 1643 /* 1644 * pmap_init: called from uvm_init, our job is to get the pmap 1645 * system ready to manage mappings... 1646 */ 1647 1648 void 1649 pmap_init(void) 1650 { 1651 int i, flags; 1652 1653 for (i = 0; i < PV_HASH_SIZE; i++) { 1654 SLIST_INIT(&pv_hash_heads[i].hh_list); 1655 } 1656 for (i = 0; i < PV_HASH_LOCK_CNT; i++) { 1657 mutex_init(&pv_hash_locks[i].lock, MUTEX_NODEBUG, IPL_VM); 1658 } 1659 1660 /* 1661 * initialize caches. 1662 */ 1663 1664 pool_cache_bootstrap(&pmap_cache, sizeof(struct pmap), 0, 0, 0, 1665 "pmappl", NULL, IPL_NONE, NULL, NULL, NULL); 1666 1667 #ifdef XEN 1668 /* 1669 * pool_cache(9) should not touch cached objects, since they 1670 * are pinned on xen and R/O for the domU 1671 */ 1672 flags = PR_NOTOUCH; 1673 #else /* XEN */ 1674 flags = 0; 1675 #endif /* XEN */ 1676 #ifdef PAE 1677 pool_cache_bootstrap(&pmap_pdp_cache, PAGE_SIZE * PDP_SIZE, 0, 0, flags, 1678 "pdppl", &pmap_pdp_allocator, IPL_NONE, 1679 pmap_pdp_ctor, pmap_pdp_dtor, NULL); 1680 #else /* PAE */ 1681 pool_cache_bootstrap(&pmap_pdp_cache, PAGE_SIZE, 0, 0, flags, 1682 "pdppl", NULL, IPL_NONE, pmap_pdp_ctor, pmap_pdp_dtor, NULL); 1683 #endif /* PAE */ 1684 pool_cache_bootstrap(&pmap_pv_cache, sizeof(struct pv_entry), 0, 0, 1685 PR_LARGECACHE, "pvpl", &pool_allocator_kmem, IPL_NONE, NULL, 1686 NULL, NULL); 1687 1688 pmap_tlb_init(); 1689 1690 /* XXX: Since cpu_hatch() is only for secondary CPUs. */ 1691 pmap_tlb_cpu_init(curcpu()); 1692 1693 evcnt_attach_dynamic(&pmap_iobmp_evcnt, EVCNT_TYPE_MISC, 1694 NULL, "x86", "io bitmap copy"); 1695 evcnt_attach_dynamic(&pmap_ldt_evcnt, EVCNT_TYPE_MISC, 1696 NULL, "x86", "ldt sync"); 1697 1698 /* 1699 * done: pmap module is up (and ready for business) 1700 */ 1701 1702 pmap_initialized = true; 1703 } 1704 1705 /* 1706 * pmap_cpu_init_late: perform late per-CPU initialization. 1707 */ 1708 1709 #ifndef XEN 1710 void 1711 pmap_cpu_init_late(struct cpu_info *ci) 1712 { 1713 /* 1714 * The BP has already its own PD page allocated during early 1715 * MD startup. 
 */
	if (ci == &cpu_info_primary)
		return;

#ifdef PAE
	cpu_alloc_l3_page(ci);
#endif
}
#endif

/*
 * p v _ e n t r y   f u n c t i o n s
 */

/*
 * pmap_free_pvs: free a list of pv_entrys
 */

static void
pmap_free_pvs(struct pv_entry *pve)
{
	struct pv_entry *next;

	for ( /* null */ ; pve != NULL ; pve = next) {
		next = pve->pve_next;
		pool_cache_put(&pmap_pv_cache, pve);
	}
}

/*
 * main pv_entry manipulation functions:
 *   pmap_enter_pv: enter a mapping onto a pv_head list
 *   pmap_remove_pv: remove a mapping from a pv_head list
 *
 * NOTE: Both pmap_enter_pv and pmap_remove_pv expect the caller to lock
 *       the pvh before calling
 */

/*
 * insert_pv: a helper of pmap_enter_pv
 */

static void
insert_pv(struct pmap_page *pp, struct pv_entry *pve)
{
	struct pv_hash_head *hh;
	kmutex_t *lock;
	u_int hash;

	hash = pvhash_hash(pve->pve_pte.pte_ptp, pve->pve_pte.pte_va);
	lock = pvhash_lock(hash);
	hh = pvhash_head(hash);
	mutex_spin_enter(lock);
	SLIST_INSERT_HEAD(&hh->hh_list, pve, pve_hash);
	mutex_spin_exit(lock);

	LIST_INSERT_HEAD(&pp->pp_head.pvh_list, pve, pve_list);
}

/*
 * pmap_enter_pv: enter a mapping onto a pv_head list
 *
 * => caller should adjust ptp's wire_count before calling
 * => caller has preallocated pve and *sparepve for us
 */

static struct pv_entry *
pmap_enter_pv(struct pmap_page *pp, struct pv_entry *pve,
    struct pv_entry **sparepve, struct vm_page *ptp, vaddr_t va)
{

	KASSERT(ptp == NULL || ptp->wire_count >= 2);
	KASSERT(ptp == NULL || ptp->uobject != NULL);
	KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset);

	if ((pp->pp_flags & PP_EMBEDDED) == 0) {
		if (LIST_EMPTY(&pp->pp_head.pvh_list)) {
			pp->pp_flags |= PP_EMBEDDED;
			pp->pp_pte.pte_ptp = ptp;
			pp->pp_pte.pte_va = va;

			return pve;
		}
	} else {
		struct pv_entry *pve2;

		pve2 = *sparepve;
		*sparepve = NULL;

		pve2->pve_pte = pp->pp_pte;
		pp->pp_flags &= ~PP_EMBEDDED;
		LIST_INIT(&pp->pp_head.pvh_list);
		insert_pv(pp, pve2);
	}

	pve->pve_pte.pte_ptp = ptp;
	pve->pve_pte.pte_va = va;
	insert_pv(pp, pve);

	return NULL;
}

/*
 * pmap_remove_pv: try to remove a mapping from a pv_list
 *
 * => caller should adjust ptp's wire_count and free PTP if needed
 * => we return the removed pve
 */

static struct pv_entry *
pmap_remove_pv(struct pmap_page *pp, struct vm_page *ptp, vaddr_t va)
{
	struct pv_hash_head *hh;
	struct pv_entry *pve;
	kmutex_t *lock;
	u_int hash;

	KASSERT(ptp == NULL || ptp->uobject != NULL);
	KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset);

	if ((pp->pp_flags & PP_EMBEDDED) != 0) {
		KASSERT(pp->pp_pte.pte_ptp == ptp);
		KASSERT(pp->pp_pte.pte_va == va);

		pp->pp_flags &= ~PP_EMBEDDED;
		LIST_INIT(&pp->pp_head.pvh_list);

		return NULL;
	}

	hash = pvhash_hash(ptp, va);
	lock = pvhash_lock(hash);
	hh = pvhash_head(hash);
	mutex_spin_enter(lock);
	pve = pvhash_remove(hh, ptp, va);
	mutex_spin_exit(lock);

	LIST_REMOVE(pve, pve_list);

	return pve;
}

/*
 * p t p   f u n c t i o n s
 */

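/*
 * A note on PTP reference counting (summarizing the code below): a PTP's
 * wire_count is 1 for the page itself plus 1 for every valid entry it
 * holds, so a leaf PTP carrying a single mapping has a wire_count of 2
 * (hence the wire_count >= 2 assertion in pmap_enter_pv above).
 * pmap_get_ptp() walks down the levels, allocating any missing PTPs and
 * bumping the parent's wire_count; pmap_free_ptp() walks back up,
 * dropping the parent's count and freeing parents that become empty.
 */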
1862 static inline struct vm_page * 1863 pmap_find_ptp(struct pmap *pmap, vaddr_t va, paddr_t pa, int level) 1864 { 1865 int lidx = level - 1; 1866 struct vm_page *pg; 1867 1868 KASSERT(mutex_owned(pmap->pm_lock)); 1869 1870 if (pa != (paddr_t)-1 && pmap->pm_ptphint[lidx] && 1871 pa == VM_PAGE_TO_PHYS(pmap->pm_ptphint[lidx])) { 1872 return (pmap->pm_ptphint[lidx]); 1873 } 1874 PMAP_SUBOBJ_LOCK(pmap, lidx); 1875 pg = uvm_pagelookup(&pmap->pm_obj[lidx], ptp_va2o(va, level)); 1876 PMAP_SUBOBJ_UNLOCK(pmap, lidx); 1877 1878 KASSERT(pg == NULL || pg->wire_count >= 1); 1879 return pg; 1880 } 1881 1882 static inline void 1883 pmap_freepage(struct pmap *pmap, struct vm_page *ptp, int level) 1884 { 1885 lwp_t *l; 1886 int lidx; 1887 struct uvm_object *obj; 1888 1889 KASSERT(ptp->wire_count == 1); 1890 1891 lidx = level - 1; 1892 1893 obj = &pmap->pm_obj[lidx]; 1894 pmap_stats_update(pmap, -1, 0); 1895 if (lidx != 0) 1896 mutex_enter(obj->vmobjlock); 1897 if (pmap->pm_ptphint[lidx] == ptp) 1898 pmap->pm_ptphint[lidx] = TAILQ_FIRST(&obj->memq); 1899 ptp->wire_count = 0; 1900 uvm_pagerealloc(ptp, NULL, 0); 1901 l = curlwp; 1902 KASSERT((l->l_pflag & LP_INTR) == 0); 1903 VM_PAGE_TO_PP(ptp)->pp_link = l->l_md.md_gc_ptp; 1904 l->l_md.md_gc_ptp = ptp; 1905 if (lidx != 0) 1906 mutex_exit(obj->vmobjlock); 1907 } 1908 1909 static void 1910 pmap_free_ptp(struct pmap *pmap, struct vm_page *ptp, vaddr_t va, 1911 pt_entry_t *ptes, pd_entry_t * const *pdes) 1912 { 1913 unsigned long index; 1914 int level; 1915 vaddr_t invaladdr; 1916 pd_entry_t opde; 1917 1918 KASSERT(pmap != pmap_kernel()); 1919 KASSERT(mutex_owned(pmap->pm_lock)); 1920 KASSERT(kpreempt_disabled()); 1921 1922 level = 1; 1923 do { 1924 index = pl_i(va, level + 1); 1925 opde = pmap_pte_testset(&pdes[level - 1][index], 0); 1926 #if defined(XEN) 1927 # if defined(__x86_64__) 1928 /* 1929 * If ptp is a L3 currently mapped in kernel space, 1930 * on any cpu, clear it before freeing 1931 */ 1932 if (level == PTP_LEVELS - 1) { 1933 /* 1934 * Update the per-cpu PD on all cpus the current 1935 * pmap is active on 1936 */ 1937 xen_kpm_sync(pmap, index); 1938 } 1939 # endif /*__x86_64__ */ 1940 invaladdr = level == 1 ? (vaddr_t)ptes : 1941 (vaddr_t)pdes[level - 2]; 1942 pmap_tlb_shootdown(pmap, invaladdr + index * PAGE_SIZE, 1943 opde, TLBSHOOT_FREE_PTP1); 1944 pmap_tlb_shootnow(); 1945 #else /* XEN */ 1946 invaladdr = level == 1 ? 
(vaddr_t)ptes : 1947 (vaddr_t)pdes[level - 2]; 1948 pmap_tlb_shootdown(pmap, invaladdr + index * PAGE_SIZE, 1949 opde, TLBSHOOT_FREE_PTP1); 1950 #endif /* XEN */ 1951 pmap_freepage(pmap, ptp, level); 1952 if (level < PTP_LEVELS - 1) { 1953 ptp = pmap_find_ptp(pmap, va, (paddr_t)-1, level + 1); 1954 ptp->wire_count--; 1955 if (ptp->wire_count > 1) 1956 break; 1957 } 1958 } while (++level < PTP_LEVELS); 1959 pmap_pte_flush(); 1960 } 1961 1962 /* 1963 * pmap_get_ptp: get a PTP (if there isn't one, allocate a new one) 1964 * 1965 * => pmap should NOT be pmap_kernel() 1966 * => pmap should be locked 1967 * => preemption should be disabled 1968 */ 1969 1970 static struct vm_page * 1971 pmap_get_ptp(struct pmap *pmap, vaddr_t va, pd_entry_t * const *pdes) 1972 { 1973 struct vm_page *ptp, *pptp; 1974 int i; 1975 unsigned long index; 1976 pd_entry_t *pva; 1977 paddr_t ppa, pa; 1978 struct uvm_object *obj; 1979 1980 KASSERT(pmap != pmap_kernel()); 1981 KASSERT(mutex_owned(pmap->pm_lock)); 1982 KASSERT(kpreempt_disabled()); 1983 1984 ptp = NULL; 1985 pa = (paddr_t)-1; 1986 1987 /* 1988 * Loop through all page table levels seeing if we need to 1989 * add a new page to that level. 1990 */ 1991 for (i = PTP_LEVELS; i > 1; i--) { 1992 /* 1993 * Save values from previous round. 1994 */ 1995 pptp = ptp; 1996 ppa = pa; 1997 1998 index = pl_i(va, i); 1999 pva = pdes[i - 2]; 2000 2001 if (pmap_valid_entry(pva[index])) { 2002 ppa = pmap_pte2pa(pva[index]); 2003 ptp = NULL; 2004 continue; 2005 } 2006 2007 obj = &pmap->pm_obj[i-2]; 2008 PMAP_SUBOBJ_LOCK(pmap, i - 2); 2009 ptp = uvm_pagealloc(obj, ptp_va2o(va, i - 1), NULL, 2010 UVM_PGA_USERESERVE|UVM_PGA_ZERO); 2011 PMAP_SUBOBJ_UNLOCK(pmap, i - 2); 2012 2013 if (ptp == NULL) 2014 return NULL; 2015 2016 ptp->flags &= ~PG_BUSY; /* never busy */ 2017 ptp->wire_count = 1; 2018 pmap->pm_ptphint[i - 2] = ptp; 2019 pa = VM_PAGE_TO_PHYS(ptp); 2020 pmap_pte_set(&pva[index], (pd_entry_t) 2021 (pmap_pa2pte(pa) | PG_u | PG_RW | PG_V)); 2022 #if defined(XEN) && defined(__x86_64__) 2023 if(i == PTP_LEVELS) { 2024 /* 2025 * Update the per-cpu PD on all cpus the current 2026 * pmap is active on 2027 */ 2028 xen_kpm_sync(pmap, index); 2029 } 2030 #endif 2031 pmap_pte_flush(); 2032 pmap_stats_update(pmap, 1, 0); 2033 /* 2034 * If we're not in the top level, increase the 2035 * wire count of the parent page. 2036 */ 2037 if (i < PTP_LEVELS) { 2038 if (pptp == NULL) { 2039 pptp = pmap_find_ptp(pmap, va, ppa, i); 2040 KASSERT(pptp != NULL); 2041 } 2042 pptp->wire_count++; 2043 } 2044 } 2045 2046 /* 2047 * PTP is not NULL if we just allocated a new PTP. If it is 2048 * still NULL, we must look up the existing one. 2049 */ 2050 if (ptp == NULL) { 2051 ptp = pmap_find_ptp(pmap, va, ppa, 1); 2052 KASSERTMSG(ptp != NULL, "pmap_get_ptp: va %" PRIxVADDR 2053 "ppa %" PRIxPADDR "\n", va, ppa); 2054 } 2055 2056 pmap->pm_ptphint[0] = ptp; 2057 return ptp; 2058 } 2059 2060 /* 2061 * p m a p l i f e c y c l e f u n c t i o n s 2062 */ 2063 2064 /* 2065 * pmap_pdp_ctor: constructor for the PDP cache. 2066 */ 2067 static int 2068 pmap_pdp_ctor(void *arg, void *v, int flags) 2069 { 2070 pd_entry_t *pdir = v; 2071 paddr_t pdirpa = 0; 2072 vaddr_t object; 2073 int i; 2074 2075 #if !defined(XEN) || !defined(__x86_64__) 2076 int npde; 2077 #endif 2078 #ifdef XEN 2079 int s; 2080 #endif 2081 2082 /* 2083 * NOTE: The `pmaps_lock' is held when the PDP is allocated. 
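 *
 * What is built here (non-Xen/amd64 case): a recursive PDE is
 * installed at PDIR_SLOT_PTE + i for each of the PDP_SIZE directory
 * pages, so that once this PDP is loaded the PTE mapping any virtual
 * address va is itself addressable as PTE_BASE[pl1_i(va)] (see
 * pmap_map_pte() below), and the kernel's top-level PDEs are copied
 * in so that kernel VA is present in every pmap.  The Xen/amd64
 * variant differs because such a PDP is never used in kernel mode
 * there.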
2084 */ 2085 2086 #if defined(XEN) && defined(__x86_64__) 2087 /* Fetch the physical address of the page directory */ 2088 (void)pmap_extract(pmap_kernel(), (vaddr_t)pdir, &pdirpa); 2089 2090 /* Zero the area */ 2091 memset(pdir, 0, PAGE_SIZE); /* Xen wants a clean page */ 2092 2093 /* 2094 * This pdir will NEVER be active in kernel mode, so mark 2095 * recursive entry invalid. 2096 */ 2097 pdir[PDIR_SLOT_PTE] = pmap_pa2pte(pdirpa) | PG_u; 2098 2099 /* 2100 * PDP constructed this way won't be for the kernel, hence we 2101 * don't put kernel mappings on Xen. 2102 * 2103 * But we need to make pmap_create() happy, so put a dummy 2104 * (without PG_V) value at the right place. 2105 */ 2106 pdir[PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1] - 1] = 2107 (pd_entry_t)-1 & PG_FRAME; 2108 #else /* XEN && __x86_64__*/ 2109 /* Zero the area */ 2110 memset(pdir, 0, PDIR_SLOT_PTE * sizeof(pd_entry_t)); 2111 2112 object = (vaddr_t)v; 2113 for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) { 2114 /* Fetch the physical address of the page directory */ 2115 (void)pmap_extract(pmap_kernel(), object, &pdirpa); 2116 2117 /* Put in recursive PDE to map the PTEs */ 2118 pdir[PDIR_SLOT_PTE + i] = pmap_pa2pte(pdirpa) | PG_V | 2119 pmap_pg_nx; 2120 #ifndef XEN 2121 pdir[PDIR_SLOT_PTE + i] |= PG_KW; 2122 #endif 2123 } 2124 2125 /* Copy the kernel's top level PDE */ 2126 npde = nkptp[PTP_LEVELS - 1]; 2127 2128 memcpy(&pdir[PDIR_SLOT_KERN], &PDP_BASE[PDIR_SLOT_KERN], 2129 npde * sizeof(pd_entry_t)); 2130 2131 /* Zero the rest */ 2132 memset(&pdir[PDIR_SLOT_KERN + npde], 0, (PAGE_SIZE * PDP_SIZE) - 2133 (PDIR_SLOT_KERN + npde) * sizeof(pd_entry_t)); 2134 2135 if (VM_MIN_KERNEL_ADDRESS != KERNBASE) { 2136 int idx = pl_i(KERNBASE, PTP_LEVELS); 2137 pdir[idx] = PDP_BASE[idx]; 2138 } 2139 2140 #ifdef __HAVE_DIRECT_MAP 2141 pdir[PDIR_SLOT_DIRECT] = PDP_BASE[PDIR_SLOT_DIRECT]; 2142 #endif 2143 #endif /* XEN && __x86_64__*/ 2144 2145 #ifdef XEN 2146 s = splvm(); 2147 object = (vaddr_t)v; 2148 pmap_protect(pmap_kernel(), object, object + (PAGE_SIZE * PDP_SIZE), 2149 VM_PROT_READ); 2150 pmap_update(pmap_kernel()); 2151 for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) { 2152 /* 2153 * pin as L2/L4 page, we have to do the page with the 2154 * PDIR_SLOT_PTE entries last 2155 */ 2156 #ifdef PAE 2157 if (i == l2tol3(PDIR_SLOT_PTE)) 2158 continue; 2159 #endif 2160 2161 (void) pmap_extract(pmap_kernel(), object, &pdirpa); 2162 #ifdef __x86_64__ 2163 xpq_queue_pin_l4_table(xpmap_ptom_masked(pdirpa)); 2164 #else 2165 xpq_queue_pin_l2_table(xpmap_ptom_masked(pdirpa)); 2166 #endif 2167 } 2168 #ifdef PAE 2169 object = ((vaddr_t)pdir) + PAGE_SIZE * l2tol3(PDIR_SLOT_PTE); 2170 (void)pmap_extract(pmap_kernel(), object, &pdirpa); 2171 xpq_queue_pin_l2_table(xpmap_ptom_masked(pdirpa)); 2172 #endif 2173 splx(s); 2174 #endif /* XEN */ 2175 2176 return (0); 2177 } 2178 2179 /* 2180 * pmap_pdp_dtor: destructor for the PDP cache. 2181 */ 2182 2183 static void 2184 pmap_pdp_dtor(void *arg, void *v) 2185 { 2186 #ifdef XEN 2187 paddr_t pdirpa = 0; /* XXX: GCC */ 2188 vaddr_t object = (vaddr_t)v; 2189 int i; 2190 int s = splvm(); 2191 pt_entry_t *pte; 2192 2193 for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) { 2194 /* fetch the physical address of the page directory. 
*/ 2195 (void) pmap_extract(pmap_kernel(), object, &pdirpa); 2196 /* unpin page table */ 2197 xpq_queue_unpin_table(xpmap_ptom_masked(pdirpa)); 2198 } 2199 object = (vaddr_t)v; 2200 for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) { 2201 /* Set page RW again */ 2202 pte = kvtopte(object); 2203 pmap_pte_set(pte, *pte | PG_RW); 2204 xen_bcast_invlpg((vaddr_t)object); 2205 } 2206 splx(s); 2207 #endif /* XEN */ 2208 } 2209 2210 #ifdef PAE 2211 2212 /* pmap_pdp_alloc: Allocate a page for the pdp memory pool. */ 2213 2214 static void * 2215 pmap_pdp_alloc(struct pool *pp, int flags) 2216 { 2217 return (void *)uvm_km_alloc(kernel_map, 2218 PAGE_SIZE * PDP_SIZE, PAGE_SIZE * PDP_SIZE, 2219 ((flags & PR_WAITOK) ? 0 : UVM_KMF_NOWAIT | UVM_KMF_TRYLOCK) 2220 | UVM_KMF_WIRED); 2221 } 2222 2223 /* 2224 * pmap_pdp_free: free a PDP 2225 */ 2226 2227 static void 2228 pmap_pdp_free(struct pool *pp, void *v) 2229 { 2230 uvm_km_free(kernel_map, (vaddr_t)v, PAGE_SIZE * PDP_SIZE, 2231 UVM_KMF_WIRED); 2232 } 2233 #endif /* PAE */ 2234 2235 /* 2236 * pmap_create: create a pmap object. 2237 */ 2238 struct pmap * 2239 pmap_create(void) 2240 { 2241 struct pmap *pmap; 2242 int i; 2243 2244 pmap = pool_cache_get(&pmap_cache, PR_WAITOK); 2245 2246 /* init uvm_object */ 2247 for (i = 0; i < PTP_LEVELS - 1; i++) { 2248 mutex_init(&pmap->pm_obj_lock[i], MUTEX_DEFAULT, IPL_NONE); 2249 uvm_obj_init(&pmap->pm_obj[i], NULL, false, 1); 2250 uvm_obj_setlock(&pmap->pm_obj[i], &pmap->pm_obj_lock[i]); 2251 pmap->pm_ptphint[i] = NULL; 2252 } 2253 pmap->pm_stats.wired_count = 0; 2254 /* count the PDP allocd below */ 2255 pmap->pm_stats.resident_count = PDP_SIZE; 2256 #if !defined(__x86_64__) 2257 pmap->pm_hiexec = 0; 2258 #endif /* !defined(__x86_64__) */ 2259 pmap->pm_flags = 0; 2260 pmap->pm_gc_ptp = NULL; 2261 2262 kcpuset_create(&pmap->pm_cpus, true); 2263 kcpuset_create(&pmap->pm_kernel_cpus, true); 2264 #ifdef XEN 2265 kcpuset_create(&pmap->pm_xen_ptp_cpus, true); 2266 #endif 2267 /* init the LDT */ 2268 pmap->pm_ldt = NULL; 2269 pmap->pm_ldt_len = 0; 2270 pmap->pm_ldt_sel = GSYSSEL(GLDT_SEL, SEL_KPL); 2271 2272 /* allocate PDP */ 2273 try_again: 2274 pmap->pm_pdir = pool_cache_get(&pmap_pdp_cache, PR_WAITOK); 2275 2276 mutex_enter(&pmaps_lock); 2277 2278 if (pmap->pm_pdir[PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1] - 1] == 0) { 2279 mutex_exit(&pmaps_lock); 2280 pool_cache_destruct_object(&pmap_pdp_cache, pmap->pm_pdir); 2281 goto try_again; 2282 } 2283 2284 for (i = 0; i < PDP_SIZE; i++) 2285 pmap->pm_pdirpa[i] = 2286 pmap_pte2pa(pmap->pm_pdir[PDIR_SLOT_PTE + i]); 2287 2288 LIST_INSERT_HEAD(&pmaps, pmap, pm_list); 2289 2290 mutex_exit(&pmaps_lock); 2291 2292 return (pmap); 2293 } 2294 2295 /* 2296 * pmap_free_ptps: put a list of ptps back to the freelist. 2297 */ 2298 2299 void 2300 pmap_free_ptps(struct vm_page *empty_ptps) 2301 { 2302 struct vm_page *ptp; 2303 struct pmap_page *pp; 2304 2305 while ((ptp = empty_ptps) != NULL) { 2306 pp = VM_PAGE_TO_PP(ptp); 2307 empty_ptps = pp->pp_link; 2308 LIST_INIT(&pp->pp_head.pvh_list); 2309 uvm_pagefree(ptp); 2310 } 2311 } 2312 2313 /* 2314 * pmap_destroy: drop reference count on pmap. free pmap if 2315 * reference count goes to zero. 2316 */ 2317 2318 void 2319 pmap_destroy(struct pmap *pmap) 2320 { 2321 lwp_t *l; 2322 int i; 2323 2324 /* 2325 * If we have torn down this pmap, process deferred frees and 2326 * invalidations. Free now if the system is low on memory. 2327 * Otherwise, free when the pmap is destroyed thus avoiding a 2328 * TLB shootdown. 
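 *
 * ("Torn down" means the current thread already called
 * pmap_remove_all() on this pmap, which recorded it in
 * l_md.md_gc_pmap.  Deferring the frees to the final destroy is safe
 * because by then no CPU can still have these page tables loaded, so
 * no TLB shootdown is needed.)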
2329 */ 2330 l = curlwp; 2331 if (__predict_false(l->l_md.md_gc_pmap == pmap)) { 2332 if (uvmexp.free < uvmexp.freetarg) { 2333 pmap_update(pmap); 2334 } else { 2335 KASSERT(pmap->pm_gc_ptp == NULL); 2336 pmap->pm_gc_ptp = l->l_md.md_gc_ptp; 2337 l->l_md.md_gc_ptp = NULL; 2338 l->l_md.md_gc_pmap = NULL; 2339 } 2340 } 2341 2342 /* 2343 * drop reference count 2344 */ 2345 2346 if (atomic_dec_uint_nv(&pmap->pm_obj[0].uo_refs) > 0) { 2347 return; 2348 } 2349 2350 #ifdef DIAGNOSTIC 2351 CPU_INFO_ITERATOR cii; 2352 struct cpu_info *ci; 2353 2354 for (CPU_INFO_FOREACH(cii, ci)) { 2355 if (ci->ci_pmap == pmap) 2356 panic("destroying pmap being used"); 2357 #if defined(XEN) && defined(__x86_64__) 2358 for (i = 0; i < PDIR_SLOT_PTE; i++) { 2359 if (pmap->pm_pdir[i] != 0 && 2360 ci->ci_kpm_pdir[i] == pmap->pm_pdir[i]) { 2361 printf("pmap_destroy(%p) pmap_kernel %p " 2362 "curcpu %d cpu %d ci_pmap %p " 2363 "ci->ci_kpm_pdir[%d]=%" PRIx64 2364 " pmap->pm_pdir[%d]=%" PRIx64 "\n", 2365 pmap, pmap_kernel(), curcpu()->ci_index, 2366 ci->ci_index, ci->ci_pmap, 2367 i, ci->ci_kpm_pdir[i], 2368 i, pmap->pm_pdir[i]); 2369 panic("pmap_destroy: used pmap"); 2370 } 2371 } 2372 #endif 2373 } 2374 #endif /* DIAGNOSTIC */ 2375 2376 /* 2377 * Reference count is zero, free pmap resources and then free pmap. 2378 * First, remove it from global list of pmaps. 2379 */ 2380 2381 mutex_enter(&pmaps_lock); 2382 LIST_REMOVE(pmap, pm_list); 2383 mutex_exit(&pmaps_lock); 2384 2385 /* 2386 * Process deferred PTP frees. No TLB shootdown required, as the 2387 * PTP pages are no longer visible to any CPU. 2388 */ 2389 2390 pmap_free_ptps(pmap->pm_gc_ptp); 2391 2392 /* 2393 * destroyed pmap shouldn't have remaining PTPs 2394 */ 2395 2396 for (i = 0; i < PTP_LEVELS - 1; i++) { 2397 KASSERT(pmap->pm_obj[i].uo_npages == 0); 2398 KASSERT(TAILQ_EMPTY(&pmap->pm_obj[i].memq)); 2399 } 2400 2401 pool_cache_put(&pmap_pdp_cache, pmap->pm_pdir); 2402 2403 #ifdef USER_LDT 2404 if (pmap->pm_ldt != NULL) { 2405 /* 2406 * no need to switch the LDT; this address space is gone, 2407 * nothing is using it. 2408 * 2409 * No need to lock the pmap for ldt_free (or anything else), 2410 * we're the last one to use it. 2411 */ 2412 mutex_enter(&cpu_lock); 2413 ldt_free(pmap->pm_ldt_sel); 2414 mutex_exit(&cpu_lock); 2415 uvm_km_free(kernel_map, (vaddr_t)pmap->pm_ldt, 2416 pmap->pm_ldt_len, UVM_KMF_WIRED); 2417 } 2418 #endif 2419 2420 for (i = 0; i < PTP_LEVELS - 1; i++) { 2421 uvm_obj_destroy(&pmap->pm_obj[i], false); 2422 mutex_destroy(&pmap->pm_obj_lock[i]); 2423 } 2424 kcpuset_destroy(pmap->pm_cpus); 2425 kcpuset_destroy(pmap->pm_kernel_cpus); 2426 #ifdef XEN 2427 kcpuset_destroy(pmap->pm_xen_ptp_cpus); 2428 #endif 2429 pool_cache_put(&pmap_cache, pmap); 2430 } 2431 2432 /* 2433 * pmap_remove_all: pmap is being torn down by the current thread. 2434 * avoid unnecessary invalidations. 2435 */ 2436 2437 void 2438 pmap_remove_all(struct pmap *pmap) 2439 { 2440 lwp_t *l = curlwp; 2441 2442 KASSERT(l->l_md.md_gc_pmap == NULL); 2443 2444 l->l_md.md_gc_pmap = pmap; 2445 } 2446 2447 #if defined(PMAP_FORK) 2448 /* 2449 * pmap_fork: perform any necessary data structure manipulation when 2450 * a VM space is forked. 2451 */ 2452 2453 void 2454 pmap_fork(struct pmap *pmap1, struct pmap *pmap2) 2455 { 2456 #ifdef USER_LDT 2457 union descriptor *new_ldt; 2458 size_t len; 2459 int sel; 2460 2461 if (__predict_true(pmap1->pm_ldt == NULL)) { 2462 return; 2463 } 2464 2465 /* 2466 * Copy the LDT into the new process. 
2467 * 2468 * Read pmap1's ldt pointer and length unlocked; if it changes 2469 * behind our back we'll retry. This will starve if there's a 2470 * stream of LDT changes in another thread but that should not 2471 * happen. 2472 */ 2473 2474 retry: 2475 if (pmap1->pm_ldt != NULL) { 2476 len = pmap1->pm_ldt_len; 2477 /* Allocate space for the new process's LDT */ 2478 new_ldt = (union descriptor *)uvm_km_alloc(kernel_map, len, 0, 2479 UVM_KMF_WIRED); 2480 if (new_ldt == NULL) { 2481 printf("WARNING: pmap_fork: " 2482 "unable to allocate LDT space\n"); 2483 return; 2484 } 2485 mutex_enter(&cpu_lock); 2486 /* Get a GDT slot for it */ 2487 sel = ldt_alloc(new_ldt, len); 2488 if (sel == -1) { 2489 mutex_exit(&cpu_lock); 2490 uvm_km_free(kernel_map, (vaddr_t)new_ldt, len, 2491 UVM_KMF_WIRED); 2492 printf("WARNING: pmap_fork: " 2493 "unable to allocate LDT selector\n"); 2494 return; 2495 } 2496 } else { 2497 /* Wasn't anything there after all. */ 2498 len = -1; 2499 new_ldt = NULL; 2500 sel = -1; 2501 mutex_enter(&cpu_lock); 2502 } 2503 2504 /* If there's still something there now that we have cpu_lock... */ 2505 if (pmap1->pm_ldt != NULL) { 2506 if (len != pmap1->pm_ldt_len) { 2507 /* Oops, it changed. Drop what we did and try again */ 2508 if (len != -1) { 2509 ldt_free(sel); 2510 uvm_km_free(kernel_map, (vaddr_t)new_ldt, 2511 len, UVM_KMF_WIRED); 2512 } 2513 mutex_exit(&cpu_lock); 2514 goto retry; 2515 } 2516 2517 /* Copy the LDT data and install it in pmap2 */ 2518 memcpy(new_ldt, pmap1->pm_ldt, len); 2519 pmap2->pm_ldt = new_ldt; 2520 pmap2->pm_ldt_len = pmap1->pm_ldt_len; 2521 pmap2->pm_ldt_sel = sel; 2522 len = -1; 2523 } 2524 2525 if (len != -1) { 2526 /* There wasn't still something there, so mop up */ 2527 ldt_free(sel); 2528 mutex_exit(&cpu_lock); 2529 uvm_km_free(kernel_map, (vaddr_t)new_ldt, len, 2530 UVM_KMF_WIRED); 2531 } else { 2532 mutex_exit(&cpu_lock); 2533 } 2534 #endif /* USER_LDT */ 2535 } 2536 #endif /* PMAP_FORK */ 2537 2538 #ifdef USER_LDT 2539 2540 /* 2541 * pmap_ldt_xcall: cross call used by pmap_ldt_sync. if the named pmap 2542 * is active, reload LDTR. 2543 */ 2544 static void 2545 pmap_ldt_xcall(void *arg1, void *arg2) 2546 { 2547 struct pmap *pm; 2548 2549 kpreempt_disable(); 2550 pm = arg1; 2551 if (curcpu()->ci_pmap == pm) { 2552 lldt(pm->pm_ldt_sel); 2553 } 2554 kpreempt_enable(); 2555 } 2556 2557 /* 2558 * pmap_ldt_sync: LDT selector for the named pmap is changing. swap 2559 * in the new selector on all CPUs. 2560 */ 2561 void 2562 pmap_ldt_sync(struct pmap *pm) 2563 { 2564 uint64_t where; 2565 2566 KASSERT(mutex_owned(&cpu_lock)); 2567 2568 pmap_ldt_evcnt.ev_count++; 2569 where = xc_broadcast(0, pmap_ldt_xcall, pm, NULL); 2570 xc_wait(where); 2571 } 2572 2573 /* 2574 * pmap_ldt_cleanup: if the pmap has a local LDT, deallocate it, and 2575 * restore the default. 
2576 */ 2577 2578 void 2579 pmap_ldt_cleanup(struct lwp *l) 2580 { 2581 pmap_t pmap = l->l_proc->p_vmspace->vm_map.pmap; 2582 union descriptor *dp = NULL; 2583 size_t len = 0; 2584 int sel = -1; 2585 2586 if (__predict_true(pmap->pm_ldt == NULL)) { 2587 return; 2588 } 2589 2590 mutex_enter(&cpu_lock); 2591 if (pmap->pm_ldt != NULL) { 2592 sel = pmap->pm_ldt_sel; 2593 dp = pmap->pm_ldt; 2594 len = pmap->pm_ldt_len; 2595 pmap->pm_ldt_sel = GSYSSEL(GLDT_SEL, SEL_KPL); 2596 pmap->pm_ldt = NULL; 2597 pmap->pm_ldt_len = 0; 2598 pmap_ldt_sync(pmap); 2599 ldt_free(sel); 2600 uvm_km_free(kernel_map, (vaddr_t)dp, len, UVM_KMF_WIRED); 2601 } 2602 mutex_exit(&cpu_lock); 2603 } 2604 #endif /* USER_LDT */ 2605 2606 /* 2607 * pmap_activate: activate a process' pmap 2608 * 2609 * => must be called with kernel preemption disabled 2610 * => if lwp is the curlwp, then set ci_want_pmapload so that 2611 * actual MMU context switch will be done by pmap_load() later 2612 */ 2613 2614 void 2615 pmap_activate(struct lwp *l) 2616 { 2617 struct cpu_info *ci; 2618 struct pmap *pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map); 2619 2620 KASSERT(kpreempt_disabled()); 2621 2622 ci = curcpu(); 2623 2624 if (l == ci->ci_curlwp) { 2625 KASSERT(ci->ci_want_pmapload == 0); 2626 KASSERT(ci->ci_tlbstate != TLBSTATE_VALID); 2627 2628 /* 2629 * no need to switch to kernel vmspace because 2630 * it's a subset of any vmspace. 2631 */ 2632 2633 if (pmap == pmap_kernel()) { 2634 ci->ci_want_pmapload = 0; 2635 return; 2636 } 2637 2638 ci->ci_want_pmapload = 1; 2639 } 2640 } 2641 2642 /* 2643 * pmap_reactivate: try to regain reference to the pmap. 2644 * 2645 * => Must be called with kernel preemption disabled. 2646 */ 2647 2648 static bool 2649 pmap_reactivate(struct pmap *pmap) 2650 { 2651 struct cpu_info * const ci = curcpu(); 2652 const cpuid_t cid = cpu_index(ci); 2653 bool result; 2654 2655 KASSERT(kpreempt_disabled()); 2656 #if defined(XEN) && defined(__x86_64__) 2657 KASSERT(pmap_pdirpa(pmap, 0) == ci->ci_xen_current_user_pgd); 2658 #elif defined(PAE) 2659 KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(ci->ci_pae_l3_pdir[0])); 2660 #elif !defined(XEN) 2661 KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(rcr3())); 2662 #endif 2663 2664 /* 2665 * If we still have a lazy reference to this pmap, we can assume 2666 * that there was no TLB shootdown for this pmap in the meantime. 2667 * 2668 * The order of events here is important as we must synchronize 2669 * with TLB shootdown interrupts. Declare interest in invalidations 2670 * (TLBSTATE_VALID) and then check the CPU set, which the IPIs can 2671 * change only when the state is TLBSTATE_LAZY. 2672 */ 2673 2674 ci->ci_tlbstate = TLBSTATE_VALID; 2675 KASSERT(kcpuset_isset(pmap->pm_kernel_cpus, cid)); 2676 2677 if (kcpuset_isset(pmap->pm_cpus, cid)) { 2678 /* We have the reference, state is valid. */ 2679 result = true; 2680 } else { 2681 /* Must reload the TLB. */ 2682 kcpuset_atomic_set(pmap->pm_cpus, cid); 2683 result = false; 2684 } 2685 return result; 2686 } 2687 2688 /* 2689 * pmap_load: perform the actual pmap switch, i.e. fill in %cr3 register 2690 * and relevant LDT info. 2691 * 2692 * Ensures that the current process' pmap is loaded on the current CPU's 2693 * MMU and that there are no stale TLB entries. 2694 * 2695 * => The caller should disable kernel preemption or do check-and-retry 2696 * to prevent a preemption from undoing our efforts. 2697 * => This function may block. 
2698 */ 2699 void 2700 pmap_load(void) 2701 { 2702 struct cpu_info *ci; 2703 struct pmap *pmap, *oldpmap; 2704 struct lwp *l; 2705 struct pcb *pcb; 2706 cpuid_t cid; 2707 uint64_t ncsw; 2708 2709 kpreempt_disable(); 2710 retry: 2711 ci = curcpu(); 2712 if (!ci->ci_want_pmapload) { 2713 kpreempt_enable(); 2714 return; 2715 } 2716 l = ci->ci_curlwp; 2717 ncsw = l->l_ncsw; 2718 2719 /* should be able to take ipis. */ 2720 KASSERT(ci->ci_ilevel < IPL_HIGH); 2721 #ifdef XEN 2722 /* Check to see if interrupts are enabled (ie; no events are masked) */ 2723 KASSERT(x86_read_psl() == 0); 2724 #else 2725 KASSERT((x86_read_psl() & PSL_I) != 0); 2726 #endif 2727 2728 KASSERT(l != NULL); 2729 pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map); 2730 KASSERT(pmap != pmap_kernel()); 2731 oldpmap = ci->ci_pmap; 2732 pcb = lwp_getpcb(l); 2733 2734 if (pmap == oldpmap) { 2735 if (!pmap_reactivate(pmap)) { 2736 u_int gen = uvm_emap_gen_return(); 2737 2738 /* 2739 * pmap has been changed during deactivated. 2740 * our tlb may be stale. 2741 */ 2742 2743 tlbflush(); 2744 uvm_emap_update(gen); 2745 } 2746 2747 ci->ci_want_pmapload = 0; 2748 kpreempt_enable(); 2749 return; 2750 } 2751 2752 /* 2753 * Acquire a reference to the new pmap and perform the switch. 2754 */ 2755 2756 pmap_reference(pmap); 2757 2758 cid = cpu_index(ci); 2759 kcpuset_atomic_clear(oldpmap->pm_cpus, cid); 2760 kcpuset_atomic_clear(oldpmap->pm_kernel_cpus, cid); 2761 2762 #if defined(XEN) && defined(__x86_64__) 2763 KASSERT(pmap_pdirpa(oldpmap, 0) == ci->ci_xen_current_user_pgd || 2764 oldpmap == pmap_kernel()); 2765 #elif defined(PAE) 2766 KASSERT(pmap_pdirpa(oldpmap, 0) == pmap_pte2pa(ci->ci_pae_l3_pdir[0])); 2767 #elif !defined(XEN) 2768 KASSERT(pmap_pdirpa(oldpmap, 0) == pmap_pte2pa(rcr3())); 2769 #endif 2770 KASSERT(!kcpuset_isset(pmap->pm_cpus, cid)); 2771 KASSERT(!kcpuset_isset(pmap->pm_kernel_cpus, cid)); 2772 2773 /* 2774 * Mark the pmap in use by this CPU. Again, we must synchronize 2775 * with TLB shootdown interrupts, so set the state VALID first, 2776 * then register us for shootdown events on this pmap. 2777 */ 2778 ci->ci_tlbstate = TLBSTATE_VALID; 2779 kcpuset_atomic_set(pmap->pm_cpus, cid); 2780 kcpuset_atomic_set(pmap->pm_kernel_cpus, cid); 2781 ci->ci_pmap = pmap; 2782 2783 /* 2784 * update tss. now that we have registered for invalidations 2785 * from other CPUs, we're good to load the page tables. 2786 */ 2787 #ifdef PAE 2788 pcb->pcb_cr3 = ci->ci_pae_l3_pdirpa; 2789 #else 2790 pcb->pcb_cr3 = pmap_pdirpa(pmap, 0); 2791 #endif 2792 2793 #ifdef i386 2794 #ifndef XEN 2795 ci->ci_tss.tss_ldt = pmap->pm_ldt_sel; 2796 ci->ci_tss.tss_cr3 = pcb->pcb_cr3; 2797 #endif /* !XEN */ 2798 #endif /* i386 */ 2799 2800 lldt(pmap->pm_ldt_sel); 2801 2802 u_int gen = uvm_emap_gen_return(); 2803 cpu_load_pmap(pmap, oldpmap); 2804 uvm_emap_update(gen); 2805 2806 ci->ci_want_pmapload = 0; 2807 2808 /* 2809 * we're now running with the new pmap. drop the reference 2810 * to the old pmap. if we block, we need to go around again. 2811 */ 2812 2813 pmap_destroy(oldpmap); 2814 if (l->l_ncsw != ncsw) { 2815 goto retry; 2816 } 2817 2818 kpreempt_enable(); 2819 } 2820 2821 /* 2822 * pmap_deactivate: deactivate a process' pmap. 2823 * 2824 * => Must be called with kernel preemption disabled (high IPL is enough). 
2825 */ 2826 void 2827 pmap_deactivate(struct lwp *l) 2828 { 2829 struct pmap *pmap; 2830 struct cpu_info *ci; 2831 2832 KASSERT(kpreempt_disabled()); 2833 2834 if (l != curlwp) { 2835 return; 2836 } 2837 2838 /* 2839 * Wait for pending TLB shootdowns to complete. Necessary because 2840 * TLB shootdown state is per-CPU, and the LWP may be coming off 2841 * the CPU before it has a chance to call pmap_update(), e.g. due 2842 * to kernel preemption or blocking routine in between. 2843 */ 2844 pmap_tlb_shootnow(); 2845 2846 ci = curcpu(); 2847 2848 if (ci->ci_want_pmapload) { 2849 /* 2850 * ci_want_pmapload means that our pmap is not loaded on 2851 * the CPU or TLB might be stale. note that pmap_kernel() 2852 * is always considered loaded. 2853 */ 2854 KASSERT(vm_map_pmap(&l->l_proc->p_vmspace->vm_map) 2855 != pmap_kernel()); 2856 KASSERT(vm_map_pmap(&l->l_proc->p_vmspace->vm_map) 2857 != ci->ci_pmap || ci->ci_tlbstate != TLBSTATE_VALID); 2858 2859 /* 2860 * userspace has not been touched. 2861 * nothing to do here. 2862 */ 2863 2864 ci->ci_want_pmapload = 0; 2865 return; 2866 } 2867 2868 pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map); 2869 2870 if (pmap == pmap_kernel()) { 2871 return; 2872 } 2873 2874 #if defined(XEN) && defined(__x86_64__) 2875 KASSERT(pmap_pdirpa(pmap, 0) == ci->ci_xen_current_user_pgd); 2876 #elif defined(PAE) 2877 KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(ci->ci_pae_l3_pdir[0])); 2878 #elif !defined(XEN) 2879 KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(rcr3())); 2880 #endif 2881 KASSERT(ci->ci_pmap == pmap); 2882 2883 /* 2884 * we aren't interested in TLB invalidations for this pmap, 2885 * at least for the time being. 2886 */ 2887 2888 KASSERT(ci->ci_tlbstate == TLBSTATE_VALID); 2889 ci->ci_tlbstate = TLBSTATE_LAZY; 2890 } 2891 2892 /* 2893 * end of lifecycle functions 2894 */ 2895 2896 /* 2897 * some misc. functions 2898 */ 2899 2900 int 2901 pmap_pdes_invalid(vaddr_t va, pd_entry_t * const *pdes, pd_entry_t *lastpde) 2902 { 2903 int i; 2904 unsigned long index; 2905 pd_entry_t pde; 2906 2907 for (i = PTP_LEVELS; i > 1; i--) { 2908 index = pl_i(va, i); 2909 pde = pdes[i - 2][index]; 2910 if ((pde & PG_V) == 0) 2911 return i; 2912 } 2913 if (lastpde != NULL) 2914 *lastpde = pde; 2915 return 0; 2916 } 2917 2918 /* 2919 * pmap_extract: extract a PA for the given VA 2920 */ 2921 2922 bool 2923 pmap_extract(struct pmap *pmap, vaddr_t va, paddr_t *pap) 2924 { 2925 pt_entry_t *ptes, pte; 2926 pd_entry_t pde; 2927 pd_entry_t * const *pdes; 2928 struct pmap *pmap2; 2929 struct cpu_info *ci; 2930 paddr_t pa; 2931 lwp_t *l; 2932 bool hard, rv; 2933 2934 #ifdef __HAVE_DIRECT_MAP 2935 if (va >= PMAP_DIRECT_BASE && va < PMAP_DIRECT_END) { 2936 if (pap != NULL) { 2937 *pap = va - PMAP_DIRECT_BASE; 2938 } 2939 return true; 2940 } 2941 #endif 2942 2943 rv = false; 2944 pa = 0; 2945 l = curlwp; 2946 2947 kpreempt_disable(); 2948 ci = l->l_cpu; 2949 if (__predict_true(!ci->ci_want_pmapload && ci->ci_pmap == pmap) || 2950 pmap == pmap_kernel()) { 2951 /* 2952 * no need to lock, because it's pmap_kernel() or our 2953 * own pmap and is active. if a user pmap, the caller 2954 * will hold the vm_map write/read locked and so prevent 2955 * entries from disappearing while we are here. ptps 2956 * can disappear via pmap_remove() and pmap_protect(), 2957 * but they are called with the vm_map write locked. 2958 */ 2959 hard = false; 2960 ptes = PTE_BASE; 2961 pdes = normal_pdes; 2962 } else { 2963 /* we lose, do it the hard way. 
*/ 2964 hard = true; 2965 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); 2966 } 2967 if (pmap_pdes_valid(va, pdes, &pde)) { 2968 pte = ptes[pl1_i(va)]; 2969 if (pde & PG_PS) { 2970 pa = (pde & PG_LGFRAME) | (va & (NBPD_L2 - 1)); 2971 rv = true; 2972 } else if (__predict_true((pte & PG_V) != 0)) { 2973 pa = pmap_pte2pa(pte) | (va & (NBPD_L1 - 1)); 2974 rv = true; 2975 } 2976 } 2977 if (__predict_false(hard)) { 2978 pmap_unmap_ptes(pmap, pmap2); 2979 } 2980 kpreempt_enable(); 2981 if (pap != NULL) { 2982 *pap = pa; 2983 } 2984 return rv; 2985 } 2986 2987 2988 /* 2989 * vtophys: virtual address to physical address. For use by 2990 * machine-dependent code only. 2991 */ 2992 2993 paddr_t 2994 vtophys(vaddr_t va) 2995 { 2996 paddr_t pa; 2997 2998 if (pmap_extract(pmap_kernel(), va, &pa) == true) 2999 return (pa); 3000 return (0); 3001 } 3002 3003 __strict_weak_alias(pmap_extract_ma, pmap_extract); 3004 3005 #ifdef XEN 3006 3007 /* 3008 * vtomach: virtual address to machine address. For use by 3009 * machine-dependent code only. 3010 */ 3011 3012 paddr_t 3013 vtomach(vaddr_t va) 3014 { 3015 paddr_t pa; 3016 3017 if (pmap_extract_ma(pmap_kernel(), va, &pa) == true) 3018 return (pa); 3019 return (0); 3020 } 3021 3022 #endif /* XEN */ 3023 3024 /* 3025 * pmap_virtual_space: used during bootup [pmap_steal_memory] to 3026 * determine the bounds of the kernel virtual addess space. 3027 */ 3028 3029 void 3030 pmap_virtual_space(vaddr_t *startp, vaddr_t *endp) 3031 { 3032 *startp = virtual_avail; 3033 *endp = virtual_end; 3034 } 3035 3036 /* 3037 * pmap_zero_page: zero a page 3038 */ 3039 3040 void 3041 pmap_zero_page(paddr_t pa) 3042 { 3043 #if defined(__HAVE_DIRECT_MAP) 3044 pagezero(PMAP_DIRECT_MAP(pa)); 3045 #else 3046 #if defined(XEN) 3047 if (XEN_VERSION_SUPPORTED(3, 4)) 3048 xen_pagezero(pa); 3049 #endif 3050 pt_entry_t *zpte; 3051 void *zerova; 3052 int id; 3053 3054 const pd_entry_t pteflags = PG_V | PG_RW | pmap_pg_nx | PG_M | PG_U | 3055 PG_k; 3056 3057 kpreempt_disable(); 3058 id = cpu_number(); 3059 zpte = PTESLEW(zero_pte, id); 3060 zerova = VASLEW(zerop, id); 3061 3062 #ifdef DIAGNOSTIC 3063 if (*zpte) 3064 panic("pmap_zero_page: lock botch"); 3065 #endif 3066 3067 pmap_pte_set(zpte, pmap_pa2pte(pa) | pteflags); 3068 pmap_pte_flush(); 3069 pmap_update_pg((vaddr_t)zerova); /* flush TLB */ 3070 3071 memset(zerova, 0, PAGE_SIZE); 3072 3073 #if defined(DIAGNOSTIC) || defined(XEN) 3074 pmap_pte_set(zpte, 0); /* zap ! */ 3075 pmap_pte_flush(); 3076 #endif 3077 3078 kpreempt_enable(); 3079 #endif /* defined(__HAVE_DIRECT_MAP) */ 3080 } 3081 3082 /* 3083 * pmap_pagezeroidle: the same, for the idle loop page zero'er. 3084 * Returns true if the page was zero'd, false if we aborted for 3085 * some reason. 3086 */ 3087 3088 bool 3089 pmap_pageidlezero(paddr_t pa) 3090 { 3091 #ifdef __HAVE_DIRECT_MAP 3092 KASSERT(cpu_feature[0] & CPUID_SSE2); 3093 return sse2_idlezero_page((void *)PMAP_DIRECT_MAP(pa)); 3094 #else 3095 pt_entry_t *zpte; 3096 void *zerova; 3097 bool rv; 3098 int id; 3099 3100 const pd_entry_t pteflags = PG_V | PG_RW | pmap_pg_nx | PG_M | PG_U | 3101 PG_k; 3102 3103 id = cpu_number(); 3104 zpte = PTESLEW(zero_pte, id); 3105 zerova = VASLEW(zerop, id); 3106 3107 KASSERT(cpu_feature[0] & CPUID_SSE2); 3108 KASSERT(*zpte == 0); 3109 3110 pmap_pte_set(zpte, pmap_pa2pte(pa) | pteflags); 3111 pmap_pte_flush(); 3112 pmap_update_pg((vaddr_t)zerova); /* flush TLB */ 3113 3114 rv = sse2_idlezero_page(zerova); 3115 3116 #if defined(DIAGNOSTIC) || defined(XEN) 3117 pmap_pte_set(zpte, 0); /* zap ! 
*/ 3118 pmap_pte_flush(); 3119 #endif 3120 3121 return rv; 3122 #endif 3123 } 3124 3125 /* 3126 * pmap_copy_page: copy a page 3127 */ 3128 3129 void 3130 pmap_copy_page(paddr_t srcpa, paddr_t dstpa) 3131 { 3132 #if defined(__HAVE_DIRECT_MAP) 3133 vaddr_t srcva = PMAP_DIRECT_MAP(srcpa); 3134 vaddr_t dstva = PMAP_DIRECT_MAP(dstpa); 3135 3136 memcpy((void *)dstva, (void *)srcva, PAGE_SIZE); 3137 #else 3138 #if defined(XEN) 3139 if (XEN_VERSION_SUPPORTED(3, 4)) { 3140 xen_copy_page(srcpa, dstpa); 3141 return; 3142 } 3143 #endif 3144 pt_entry_t *spte; 3145 pt_entry_t *dpte; 3146 void *csrcva; 3147 void *cdstva; 3148 int id; 3149 3150 const pd_entry_t pteflags = PG_V | PG_RW | pmap_pg_nx | PG_U | PG_k; 3151 3152 kpreempt_disable(); 3153 id = cpu_number(); 3154 spte = PTESLEW(csrc_pte,id); 3155 dpte = PTESLEW(cdst_pte,id); 3156 csrcva = VASLEW(csrcp, id); 3157 cdstva = VASLEW(cdstp, id); 3158 3159 KASSERT(*spte == 0 && *dpte == 0); 3160 3161 pmap_pte_set(spte, pmap_pa2pte(srcpa) | pteflags); 3162 pmap_pte_set(dpte, pmap_pa2pte(dstpa) | pteflags | PG_M); 3163 pmap_pte_flush(); 3164 pmap_update_2pg((vaddr_t)csrcva, (vaddr_t)cdstva); 3165 3166 memcpy(cdstva, csrcva, PAGE_SIZE); 3167 3168 #if defined(DIAGNOSTIC) || defined(XEN) 3169 pmap_pte_set(spte, 0); 3170 pmap_pte_set(dpte, 0); 3171 pmap_pte_flush(); 3172 #endif 3173 3174 kpreempt_enable(); 3175 #endif /* defined(__HAVE_DIRECT_MAP) */ 3176 } 3177 3178 static pt_entry_t * 3179 pmap_map_ptp(struct vm_page *ptp) 3180 { 3181 #ifdef __HAVE_DIRECT_MAP 3182 return (void *)PMAP_DIRECT_MAP(VM_PAGE_TO_PHYS(ptp)); 3183 #else 3184 pt_entry_t *ptppte; 3185 void *ptpva; 3186 int id; 3187 3188 KASSERT(kpreempt_disabled()); 3189 3190 #ifndef XEN 3191 const pd_entry_t pteflags = PG_V | PG_RW | pmap_pg_nx | PG_U | PG_M | 3192 PG_k; 3193 #else 3194 const pd_entry_t pteflags = PG_V | pmap_pg_nx | PG_U | PG_M | PG_k; 3195 #endif 3196 3197 id = cpu_number(); 3198 ptppte = PTESLEW(ptp_pte, id); 3199 ptpva = VASLEW(ptpp, id); 3200 pmap_pte_set(ptppte, pmap_pa2pte(VM_PAGE_TO_PHYS(ptp)) | pteflags); 3201 3202 pmap_pte_flush(); 3203 pmap_update_pg((vaddr_t)ptpva); 3204 3205 return (pt_entry_t *)ptpva; 3206 #endif 3207 } 3208 3209 static void 3210 pmap_unmap_ptp(void) 3211 { 3212 #ifndef __HAVE_DIRECT_MAP 3213 #if defined(DIAGNOSTIC) || defined(XEN) 3214 pt_entry_t *pte; 3215 3216 KASSERT(kpreempt_disabled()); 3217 3218 pte = PTESLEW(ptp_pte, cpu_number()); 3219 if (*pte != 0) { 3220 pmap_pte_set(pte, 0); 3221 pmap_pte_flush(); 3222 } 3223 #endif 3224 #endif 3225 } 3226 3227 static pt_entry_t * 3228 pmap_map_pte(struct pmap *pmap, struct vm_page *ptp, vaddr_t va) 3229 { 3230 3231 KASSERT(kpreempt_disabled()); 3232 if (pmap_is_curpmap(pmap)) { 3233 return &PTE_BASE[pl1_i(va)]; /* (k)vtopte */ 3234 } 3235 KASSERT(ptp != NULL); 3236 return pmap_map_ptp(ptp) + pl1_pi(va); 3237 } 3238 3239 static void 3240 pmap_unmap_pte(void) 3241 { 3242 3243 KASSERT(kpreempt_disabled()); 3244 3245 pmap_unmap_ptp(); 3246 } 3247 3248 /* 3249 * p m a p r e m o v e f u n c t i o n s 3250 * 3251 * functions that remove mappings 3252 */ 3253 3254 /* 3255 * pmap_remove_ptes: remove PTEs from a PTP 3256 * 3257 * => caller must hold pmap's lock 3258 * => PTP must be mapped into KVA 3259 * => PTP should be null if pmap == pmap_kernel() 3260 * => must be called with kernel preemption disabled 3261 * => returns composite pte if at least one page should be shot down 3262 */ 3263 3264 static void 3265 pmap_remove_ptes(struct pmap *pmap, struct vm_page *ptp, vaddr_t ptpva, 3266 vaddr_t startva, vaddr_t 
endva, struct pv_entry **pv_tofree) 3267 { 3268 pt_entry_t *pte = (pt_entry_t *)ptpva; 3269 3270 KASSERT(pmap == pmap_kernel() || mutex_owned(pmap->pm_lock)); 3271 KASSERT(kpreempt_disabled()); 3272 3273 /* 3274 * note that ptpva points to the PTE that maps startva. this may 3275 * or may not be the first PTE in the PTP. 3276 * 3277 * we loop through the PTP while there are still PTEs to look at 3278 * and the wire_count is greater than 1 (because we use the wire_count 3279 * to keep track of the number of real PTEs in the PTP). 3280 */ 3281 while (startva < endva && (ptp == NULL || ptp->wire_count > 1)) { 3282 (void)pmap_remove_pte(pmap, ptp, pte, startva, pv_tofree); 3283 startva += PAGE_SIZE; 3284 pte++; 3285 } 3286 } 3287 3288 3289 /* 3290 * pmap_remove_pte: remove a single PTE from a PTP. 3291 * 3292 * => caller must hold pmap's lock 3293 * => PTP must be mapped into KVA 3294 * => PTP should be null if pmap == pmap_kernel() 3295 * => returns true if we removed a mapping 3296 * => must be called with kernel preemption disabled 3297 */ 3298 static bool 3299 pmap_remove_pte(struct pmap *pmap, struct vm_page *ptp, pt_entry_t *pte, 3300 vaddr_t va, struct pv_entry **pv_tofree) 3301 { 3302 struct pv_entry *pve; 3303 struct vm_page *pg; 3304 struct pmap_page *pp; 3305 pt_entry_t opte; 3306 3307 KASSERT(pmap == pmap_kernel() || mutex_owned(pmap->pm_lock)); 3308 KASSERT(kpreempt_disabled()); 3309 3310 if (!pmap_valid_entry(*pte)) { 3311 /* VA not mapped. */ 3312 return false; 3313 } 3314 3315 /* Atomically save the old PTE and zap it. */ 3316 opte = pmap_pte_testset(pte, 0); 3317 if (!pmap_valid_entry(opte)) { 3318 return false; 3319 } 3320 3321 pmap_exec_account(pmap, va, opte, 0); 3322 pmap_stats_update_bypte(pmap, 0, opte); 3323 3324 if (ptp) { 3325 /* 3326 * Dropping a PTE. Make sure that the PDE is flushed. 3327 */ 3328 ptp->wire_count--; 3329 if (ptp->wire_count <= 1) { 3330 opte |= PG_U; 3331 } 3332 } 3333 3334 if ((opte & PG_U) != 0) { 3335 pmap_tlb_shootdown(pmap, va, opte, TLBSHOOT_REMOVE_PTE); 3336 } 3337 3338 /* 3339 * If we are not on a pv_head list - we are done. 3340 */ 3341 if ((opte & PG_PVLIST) == 0) { 3342 #if defined(DIAGNOSTIC) && !defined(DOM0OPS) 3343 if (PHYS_TO_VM_PAGE(pmap_pte2pa(opte)) != NULL || 3344 pmap_pv_tracked(pmap_pte2pa(opte)) != NULL) 3345 panic("pmap_remove_pte: managed or pv-tracked page" 3346 " without PG_PVLIST for %#"PRIxVADDR, va); 3347 #endif 3348 return true; 3349 } 3350 3351 if ((pg = PHYS_TO_VM_PAGE(pmap_pte2pa(opte))) != NULL) { 3352 KASSERT(uvm_page_locked_p(pg)); 3353 pp = VM_PAGE_TO_PP(pg); 3354 } else if ((pp = pmap_pv_tracked(pmap_pte2pa(opte))) == NULL) { 3355 paddr_t pa = pmap_pte2pa(opte); 3356 panic("pmap_remove_pte: PG_PVLIST with pv-untracked page" 3357 " va = 0x%"PRIxVADDR 3358 " pa = 0x%"PRIxPADDR" (0x%"PRIxPADDR")", 3359 va, pa, atop(pa)); 3360 } 3361 3362 /* Sync R/M bits. */ 3363 pp->pp_attrs |= opte; 3364 pve = pmap_remove_pv(pp, ptp, va); 3365 3366 if (pve) { 3367 pve->pve_next = *pv_tofree; 3368 *pv_tofree = pve; 3369 } 3370 return true; 3371 } 3372 3373 /* 3374 * pmap_remove: mapping removal function. 
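 * (removes all mappings in the range [sva, eva) from the given pmap)
 *
 * An illustrative call pattern -- as with the other removal
 * operations, the caller is expected to follow up with pmap_update()
 * before relying on the mappings being gone on all CPUs:
 *
 *	pmap_remove(pmap, sva, eva);
 *	pmap_update(pmap);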
3375 * 3376 * => caller should not be holding any pmap locks 3377 */ 3378 3379 void 3380 pmap_remove(struct pmap *pmap, vaddr_t sva, vaddr_t eva) 3381 { 3382 pt_entry_t *ptes; 3383 pd_entry_t pde; 3384 pd_entry_t * const *pdes; 3385 struct pv_entry *pv_tofree = NULL; 3386 bool result; 3387 int i; 3388 paddr_t ptppa; 3389 vaddr_t blkendva, va = sva; 3390 struct vm_page *ptp; 3391 struct pmap *pmap2; 3392 3393 kpreempt_disable(); 3394 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); /* locks pmap */ 3395 3396 /* 3397 * removing one page? take shortcut function. 3398 */ 3399 3400 if (va + PAGE_SIZE == eva) { 3401 if (pmap_pdes_valid(va, pdes, &pde)) { 3402 3403 /* PA of the PTP */ 3404 ptppa = pmap_pte2pa(pde); 3405 3406 /* Get PTP if non-kernel mapping. */ 3407 if (pmap != pmap_kernel()) { 3408 ptp = pmap_find_ptp(pmap, va, ptppa, 1); 3409 KASSERTMSG(ptp != NULL, 3410 "pmap_remove: unmanaged PTP detected"); 3411 } else { 3412 /* Never free kernel PTPs. */ 3413 ptp = NULL; 3414 } 3415 3416 result = pmap_remove_pte(pmap, ptp, 3417 &ptes[pl1_i(va)], va, &pv_tofree); 3418 3419 /* 3420 * if mapping removed and the PTP is no longer 3421 * being used, free it! 3422 */ 3423 3424 if (result && ptp && ptp->wire_count <= 1) 3425 pmap_free_ptp(pmap, ptp, va, ptes, pdes); 3426 } 3427 } else for (/* null */ ; va < eva ; va = blkendva) { 3428 int lvl; 3429 3430 /* determine range of block */ 3431 blkendva = x86_round_pdr(va+1); 3432 if (blkendva > eva) 3433 blkendva = eva; 3434 3435 /* 3436 * Our PTE mappings should never be removed with pmap_remove. 3437 * 3438 * XXXmaxv: still needed? 3439 * 3440 * A long term solution is to move the PTEs out of user address 3441 * space, and into kernel address space. Then we can set 3442 * VM_MAXUSER_ADDRESS to be VM_MAX_ADDRESS. 3443 */ 3444 for (i = 0; i < PDP_SIZE; i++) { 3445 if (pl_i(va, PTP_LEVELS) == PDIR_SLOT_PTE+i) 3446 panic("PTE space accessed"); 3447 } 3448 3449 lvl = pmap_pdes_invalid(va, pdes, &pde); 3450 if (lvl != 0) { 3451 /* 3452 * skip a range corresponding to an invalid pde. 3453 */ 3454 blkendva = (va & ptp_masks[lvl - 1]) + nbpd[lvl - 1]; 3455 continue; 3456 } 3457 3458 /* PA of the PTP */ 3459 ptppa = pmap_pte2pa(pde); 3460 3461 /* Get PTP if non-kernel mapping. */ 3462 if (pmap != pmap_kernel()) { 3463 ptp = pmap_find_ptp(pmap, va, ptppa, 1); 3464 KASSERTMSG(ptp != NULL, 3465 "pmap_remove: unmanaged PTP detected"); 3466 } else { 3467 /* Never free kernel PTPs. */ 3468 ptp = NULL; 3469 } 3470 3471 pmap_remove_ptes(pmap, ptp, (vaddr_t)&ptes[pl1_i(va)], va, 3472 blkendva, &pv_tofree); 3473 3474 /* if PTP is no longer being used, free it! */ 3475 if (ptp && ptp->wire_count <= 1) { 3476 pmap_free_ptp(pmap, ptp, va, ptes, pdes); 3477 } 3478 } 3479 pmap_unmap_ptes(pmap, pmap2); /* unlock pmap */ 3480 kpreempt_enable(); 3481 3482 /* Now we free unused PVs */ 3483 if (pv_tofree) 3484 pmap_free_pvs(pv_tofree); 3485 } 3486 3487 /* 3488 * pmap_sync_pv: clear pte bits and return the old value of the pte. 3489 * 3490 * => Caller should disable kernel preemption. 3491 * => issues tlb shootdowns if necessary. 
3492 */ 3493 3494 static int 3495 pmap_sync_pv(struct pv_pte *pvpte, pt_entry_t expect, int clearbits, 3496 pt_entry_t *optep) 3497 { 3498 struct pmap *pmap; 3499 struct vm_page *ptp; 3500 vaddr_t va; 3501 pt_entry_t *ptep; 3502 pt_entry_t opte; 3503 pt_entry_t npte; 3504 bool need_shootdown; 3505 3506 ptp = pvpte->pte_ptp; 3507 va = pvpte->pte_va; 3508 KASSERT(ptp == NULL || ptp->uobject != NULL); 3509 KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset); 3510 pmap = ptp_to_pmap(ptp); 3511 3512 KASSERT((expect & ~(PG_FRAME | PG_V)) == 0); 3513 KASSERT((expect & PG_V) != 0); 3514 KASSERT(clearbits == ~0 || (clearbits & ~(PG_M | PG_U | PG_RW)) == 0); 3515 KASSERT(kpreempt_disabled()); 3516 3517 ptep = pmap_map_pte(pmap, ptp, va); 3518 do { 3519 opte = *ptep; 3520 KASSERT((opte & (PG_M | PG_U)) != PG_M); 3521 KASSERT((opte & (PG_U | PG_V)) != PG_U); 3522 KASSERT(opte == 0 || (opte & PG_V) != 0); 3523 if ((opte & (PG_FRAME | PG_V)) != expect) { 3524 3525 /* 3526 * we lost a race with a V->P operation like 3527 * pmap_remove(). wait for the competitor 3528 * reflecting pte bits into mp_attrs. 3529 * 3530 * issue a redundant TLB shootdown so that 3531 * we can wait for its completion. 3532 */ 3533 3534 pmap_unmap_pte(); 3535 if (clearbits != 0) { 3536 pmap_tlb_shootdown(pmap, va, 3537 (pmap == pmap_kernel() ? PG_G : 0), 3538 TLBSHOOT_SYNC_PV1); 3539 } 3540 return EAGAIN; 3541 } 3542 3543 /* 3544 * check if there's anything to do on this pte. 3545 */ 3546 3547 if ((opte & clearbits) == 0) { 3548 need_shootdown = false; 3549 break; 3550 } 3551 3552 /* 3553 * we need a shootdown if the pte is cached. (PG_U) 3554 * 3555 * ...unless we are clearing only the PG_RW bit and 3556 * it isn't cached as RW. (PG_M) 3557 */ 3558 3559 need_shootdown = (opte & PG_U) != 0 && 3560 !(clearbits == PG_RW && (opte & PG_M) == 0); 3561 3562 npte = opte & ~clearbits; 3563 3564 /* 3565 * if we need a shootdown anyway, clear PG_U and PG_M. 3566 */ 3567 3568 if (need_shootdown) { 3569 npte &= ~(PG_U | PG_M); 3570 } 3571 KASSERT((npte & (PG_M | PG_U)) != PG_M); 3572 KASSERT((npte & (PG_U | PG_V)) != PG_U); 3573 KASSERT(npte == 0 || (opte & PG_V) != 0); 3574 } while (pmap_pte_cas(ptep, opte, npte) != opte); 3575 3576 if (need_shootdown) { 3577 pmap_tlb_shootdown(pmap, va, opte, TLBSHOOT_SYNC_PV2); 3578 } 3579 pmap_unmap_pte(); 3580 3581 *optep = opte; 3582 return 0; 3583 } 3584 3585 static void 3586 pmap_pp_remove(struct pmap_page *pp, paddr_t pa) 3587 { 3588 struct pv_pte *pvpte; 3589 struct pv_entry *killlist = NULL; 3590 struct vm_page *ptp; 3591 pt_entry_t expect; 3592 int count; 3593 3594 expect = pmap_pa2pte(pa) | PG_V; 3595 count = SPINLOCK_BACKOFF_MIN; 3596 kpreempt_disable(); 3597 startover: 3598 while ((pvpte = pv_pte_first(pp)) != NULL) { 3599 struct pmap *pmap; 3600 struct pv_entry *pve; 3601 pt_entry_t opte; 3602 vaddr_t va; 3603 int error; 3604 3605 /* 3606 * add a reference to the pmap before clearing the pte. 3607 * otherwise the pmap can disappear behind us. 
3608 */ 3609 3610 ptp = pvpte->pte_ptp; 3611 pmap = ptp_to_pmap(ptp); 3612 if (ptp != NULL) { 3613 pmap_reference(pmap); 3614 } 3615 3616 error = pmap_sync_pv(pvpte, expect, ~0, &opte); 3617 if (error == EAGAIN) { 3618 int hold_count; 3619 KERNEL_UNLOCK_ALL(curlwp, &hold_count); 3620 if (ptp != NULL) { 3621 pmap_destroy(pmap); 3622 } 3623 SPINLOCK_BACKOFF(count); 3624 KERNEL_LOCK(hold_count, curlwp); 3625 goto startover; 3626 } 3627 3628 pp->pp_attrs |= opte; 3629 va = pvpte->pte_va; 3630 pve = pmap_remove_pv(pp, ptp, va); 3631 3632 /* update the PTP reference count. free if last reference. */ 3633 if (ptp != NULL) { 3634 struct pmap *pmap2; 3635 pt_entry_t *ptes; 3636 pd_entry_t * const *pdes; 3637 3638 KASSERT(pmap != pmap_kernel()); 3639 3640 pmap_tlb_shootnow(); 3641 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); 3642 pmap_stats_update_bypte(pmap, 0, opte); 3643 ptp->wire_count--; 3644 if (ptp->wire_count <= 1) { 3645 pmap_free_ptp(pmap, ptp, va, ptes, pdes); 3646 } 3647 pmap_unmap_ptes(pmap, pmap2); 3648 pmap_destroy(pmap); 3649 } else { 3650 KASSERT(pmap == pmap_kernel()); 3651 pmap_stats_update_bypte(pmap, 0, opte); 3652 } 3653 3654 if (pve != NULL) { 3655 pve->pve_next = killlist; /* mark it for death */ 3656 killlist = pve; 3657 } 3658 } 3659 pmap_tlb_shootnow(); 3660 kpreempt_enable(); 3661 3662 /* Now free unused pvs. */ 3663 pmap_free_pvs(killlist); 3664 } 3665 3666 /* 3667 * pmap_page_remove: remove a managed vm_page from all pmaps that map it 3668 * 3669 * => R/M bits are sync'd back to attrs 3670 */ 3671 3672 void 3673 pmap_page_remove(struct vm_page *pg) 3674 { 3675 struct pmap_page *pp; 3676 paddr_t pa; 3677 3678 KASSERT(uvm_page_locked_p(pg)); 3679 3680 pp = VM_PAGE_TO_PP(pg); 3681 pa = VM_PAGE_TO_PHYS(pg); 3682 pmap_pp_remove(pp, pa); 3683 } 3684 3685 /* 3686 * pmap_pv_remove: remove an unmanaged pv-tracked page from all pmaps 3687 * that map it 3688 */ 3689 3690 void 3691 pmap_pv_remove(paddr_t pa) 3692 { 3693 struct pmap_page *pp; 3694 3695 pp = pmap_pv_tracked(pa); 3696 if (pp == NULL) 3697 panic("pmap_pv_protect: page not pv-tracked: 0x%"PRIxPADDR, 3698 pa); 3699 pmap_pp_remove(pp, pa); 3700 } 3701 3702 /* 3703 * p m a p a t t r i b u t e f u n c t i o n s 3704 * functions that test/change managed page's attributes 3705 * since a page can be mapped multiple times we must check each PTE that 3706 * maps it by going down the pv lists. 3707 */ 3708 3709 /* 3710 * pmap_test_attrs: test a page's attributes 3711 */ 3712 3713 bool 3714 pmap_test_attrs(struct vm_page *pg, unsigned testbits) 3715 { 3716 struct pmap_page *pp; 3717 struct pv_pte *pvpte; 3718 pt_entry_t expect; 3719 u_int result; 3720 3721 KASSERT(uvm_page_locked_p(pg)); 3722 3723 pp = VM_PAGE_TO_PP(pg); 3724 if ((pp->pp_attrs & testbits) != 0) { 3725 return true; 3726 } 3727 expect = pmap_pa2pte(VM_PAGE_TO_PHYS(pg)) | PG_V; 3728 kpreempt_disable(); 3729 for (pvpte = pv_pte_first(pp); pvpte; pvpte = pv_pte_next(pp, pvpte)) { 3730 pt_entry_t opte; 3731 int error; 3732 3733 if ((pp->pp_attrs & testbits) != 0) { 3734 break; 3735 } 3736 error = pmap_sync_pv(pvpte, expect, 0, &opte); 3737 if (error == 0) { 3738 pp->pp_attrs |= opte; 3739 } 3740 } 3741 result = pp->pp_attrs & testbits; 3742 kpreempt_enable(); 3743 3744 /* 3745 * note that we will exit the for loop with a non-null pve if 3746 * we have found the bits we are testing for. 
3747 */ 3748 3749 return result != 0; 3750 } 3751 3752 static bool 3753 pmap_pp_clear_attrs(struct pmap_page *pp, paddr_t pa, unsigned clearbits) 3754 { 3755 struct pv_pte *pvpte; 3756 u_int result; 3757 pt_entry_t expect; 3758 int count; 3759 3760 expect = pmap_pa2pte(pa) | PG_V; 3761 count = SPINLOCK_BACKOFF_MIN; 3762 kpreempt_disable(); 3763 startover: 3764 for (pvpte = pv_pte_first(pp); pvpte; pvpte = pv_pte_next(pp, pvpte)) { 3765 pt_entry_t opte; 3766 int error; 3767 3768 error = pmap_sync_pv(pvpte, expect, clearbits, &opte); 3769 if (error == EAGAIN) { 3770 int hold_count; 3771 KERNEL_UNLOCK_ALL(curlwp, &hold_count); 3772 SPINLOCK_BACKOFF(count); 3773 KERNEL_LOCK(hold_count, curlwp); 3774 goto startover; 3775 } 3776 pp->pp_attrs |= opte; 3777 } 3778 result = pp->pp_attrs & clearbits; 3779 pp->pp_attrs &= ~clearbits; 3780 pmap_tlb_shootnow(); 3781 kpreempt_enable(); 3782 3783 return result != 0; 3784 } 3785 3786 /* 3787 * pmap_clear_attrs: clear the specified attribute for a page. 3788 * 3789 * => we return true if we cleared one of the bits we were asked to 3790 */ 3791 3792 bool 3793 pmap_clear_attrs(struct vm_page *pg, unsigned clearbits) 3794 { 3795 struct pmap_page *pp; 3796 paddr_t pa; 3797 3798 KASSERT(uvm_page_locked_p(pg)); 3799 3800 pp = VM_PAGE_TO_PP(pg); 3801 pa = VM_PAGE_TO_PHYS(pg); 3802 3803 return pmap_pp_clear_attrs(pp, pa, clearbits); 3804 } 3805 3806 /* 3807 * pmap_pv_clear_attrs: clear the specified attributes for an unmanaged 3808 * pv-tracked page. 3809 */ 3810 3811 bool 3812 pmap_pv_clear_attrs(paddr_t pa, unsigned clearbits) 3813 { 3814 struct pmap_page *pp; 3815 3816 pp = pmap_pv_tracked(pa); 3817 if (pp == NULL) 3818 panic("pmap_pv_protect: page not pv-tracked: 0x%"PRIxPADDR, 3819 pa); 3820 3821 return pmap_pp_clear_attrs(pp, pa, clearbits); 3822 } 3823 3824 /* 3825 * p m a p p r o t e c t i o n f u n c t i o n s 3826 */ 3827 3828 /* 3829 * pmap_page_protect: change the protection of all recorded mappings 3830 * of a managed page 3831 * 3832 * => NOTE: this is an inline function in pmap.h 3833 */ 3834 3835 /* see pmap.h */ 3836 3837 /* 3838 * pmap_pv_protect: change the protection of all recorded mappings 3839 * of an unmanaged pv-tracked page 3840 * 3841 * => NOTE: this is an inline function in pmap.h 3842 */ 3843 3844 /* see pmap.h */ 3845 3846 /* 3847 * pmap_protect: set the protection in of the pages in a pmap 3848 * 3849 * => NOTE: this is an inline function in pmap.h 3850 */ 3851 3852 /* see pmap.h */ 3853 3854 /* 3855 * pmap_write_protect: write-protect pages in a pmap. 3856 */ 3857 void 3858 pmap_write_protect(struct pmap *pmap, vaddr_t sva, vaddr_t eva, vm_prot_t prot) 3859 { 3860 pt_entry_t bit_rem, bit_put; 3861 pt_entry_t *ptes; 3862 pt_entry_t * const *pdes; 3863 struct pmap *pmap2; 3864 vaddr_t blockend, va; 3865 3866 KASSERT(curlwp->l_md.md_gc_pmap != pmap); 3867 3868 bit_rem = 0; 3869 if (!(prot & VM_PROT_WRITE)) 3870 bit_rem = PG_RW; 3871 3872 bit_put = 0; 3873 if (!(prot & VM_PROT_EXECUTE)) 3874 bit_put = pmap_pg_nx; 3875 3876 sva &= PG_FRAME; 3877 eva &= PG_FRAME; 3878 3879 /* Acquire pmap. */ 3880 kpreempt_disable(); 3881 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); 3882 3883 for (va = sva ; va < eva; va = blockend) { 3884 pt_entry_t *spte, *epte; 3885 int i; 3886 3887 blockend = x86_round_pdr(va + 1); 3888 if (blockend > eva) 3889 blockend = eva; 3890 3891 /* 3892 * Our PTE mappings should never be write-protected. 3893 * 3894 * XXXmaxv: still needed? 
3895 * 3896 * A long term solution is to move the PTEs out of user address 3897 * space, and into kernel address space. Then we can set 3898 * VM_MAXUSER_ADDRESS to be VM_MAX_ADDRESS. 3899 */ 3900 for (i = 0; i < PDP_SIZE; i++) { 3901 if (pl_i(va, PTP_LEVELS) == PDIR_SLOT_PTE+i) 3902 panic("PTE space accessed"); 3903 } 3904 3905 /* Is it a valid block? */ 3906 if (!pmap_pdes_valid(va, pdes, NULL)) { 3907 continue; 3908 } 3909 KASSERT(va < VM_MAXUSER_ADDRESS || va >= VM_MAX_ADDRESS); 3910 3911 spte = &ptes[pl1_i(va)]; 3912 epte = &ptes[pl1_i(blockend)]; 3913 3914 for (/* */; spte < epte; spte++) { 3915 pt_entry_t opte, npte; 3916 3917 do { 3918 opte = *spte; 3919 if (!pmap_valid_entry(opte)) { 3920 goto next; 3921 } 3922 npte = (opte & ~bit_rem) | bit_put; 3923 } while (pmap_pte_cas(spte, opte, npte) != opte); 3924 3925 if ((opte & PG_M) != 0) { 3926 vaddr_t tva = x86_ptob(spte - ptes); 3927 pmap_tlb_shootdown(pmap, tva, opte, 3928 TLBSHOOT_WRITE_PROTECT); 3929 } 3930 next:; 3931 } 3932 } 3933 3934 /* Release pmap. */ 3935 pmap_unmap_ptes(pmap, pmap2); 3936 kpreempt_enable(); 3937 } 3938 3939 /* 3940 * pmap_unwire: clear the wired bit in the PTE. 3941 * 3942 * => Mapping should already be present. 3943 */ 3944 void 3945 pmap_unwire(struct pmap *pmap, vaddr_t va) 3946 { 3947 pt_entry_t *ptes, *ptep, opte; 3948 pd_entry_t * const *pdes; 3949 struct pmap *pmap2; 3950 3951 /* Acquire pmap. */ 3952 kpreempt_disable(); 3953 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); 3954 3955 if (!pmap_pdes_valid(va, pdes, NULL)) { 3956 panic("pmap_unwire: invalid PDE"); 3957 } 3958 3959 ptep = &ptes[pl1_i(va)]; 3960 opte = *ptep; 3961 KASSERT(pmap_valid_entry(opte)); 3962 3963 if (opte & PG_W) { 3964 pt_entry_t npte = opte & ~PG_W; 3965 3966 opte = pmap_pte_testset(ptep, npte); 3967 pmap_stats_update_bypte(pmap, npte, opte); 3968 } else { 3969 printf("pmap_unwire: wiring for pmap %p va 0x%lx " 3970 "did not change!\n", pmap, va); 3971 } 3972 3973 /* Release pmap. */ 3974 pmap_unmap_ptes(pmap, pmap2); 3975 kpreempt_enable(); 3976 } 3977 3978 /* 3979 * pmap_copy: copy mappings from one pmap to another 3980 * 3981 * => optional function 3982 * void pmap_copy(dst_pmap, src_pmap, dst_addr, len, src_addr) 3983 */ 3984 3985 /* 3986 * defined as macro in pmap.h 3987 */ 3988 3989 __strict_weak_alias(pmap_enter, pmap_enter_default); 3990 3991 int 3992 pmap_enter_default(pmap_t pmap, vaddr_t va, paddr_t pa, vm_prot_t prot, 3993 u_int flags) 3994 { 3995 return pmap_enter_ma(pmap, va, pa, pa, prot, flags, 0); 3996 } 3997 3998 /* 3999 * pmap_enter: enter a mapping into a pmap 4000 * 4001 * => must be done "now" ... 
no lazy-evaluation 4002 * => we set pmap => pv_head locking 4003 */ 4004 int 4005 pmap_enter_ma(struct pmap *pmap, vaddr_t va, paddr_t ma, paddr_t pa, 4006 vm_prot_t prot, u_int flags, int domid) 4007 { 4008 pt_entry_t *ptes, opte, npte; 4009 pt_entry_t *ptep; 4010 pd_entry_t * const *pdes; 4011 struct vm_page *ptp; 4012 struct vm_page *new_pg, *old_pg; 4013 struct pmap_page *new_pp, *old_pp; 4014 struct pv_entry *old_pve = NULL; 4015 struct pv_entry *new_pve; 4016 struct pv_entry *new_sparepve; 4017 int error; 4018 bool wired = (flags & PMAP_WIRED) != 0; 4019 struct pmap *pmap2; 4020 4021 KASSERT(pmap_initialized); 4022 KASSERT(curlwp->l_md.md_gc_pmap != pmap); 4023 KASSERT(va < VM_MAX_KERNEL_ADDRESS); 4024 KASSERTMSG(va != (vaddr_t)PDP_BASE, 4025 "pmap_enter: trying to map over PDP!"); 4026 KASSERTMSG(va < VM_MIN_KERNEL_ADDRESS || 4027 pmap_valid_entry(pmap->pm_pdir[pl_i(va, PTP_LEVELS)]), 4028 "pmap_enter: missing kernel PTP for VA %lx!", va); 4029 4030 #ifdef XEN 4031 KASSERT(domid == DOMID_SELF || pa == 0); 4032 #endif /* XEN */ 4033 4034 npte = ma | protection_codes[prot] | PG_V; 4035 npte |= pmap_pat_flags(flags); 4036 if (wired) 4037 npte |= PG_W; 4038 if (va < VM_MAXUSER_ADDRESS) 4039 npte |= PG_u; 4040 else if (va < VM_MAX_ADDRESS) 4041 panic("PTE space accessed"); /* XXXmaxv: no longer needed? */ 4042 else 4043 npte |= PG_k; 4044 if (pmap == pmap_kernel()) 4045 npte |= pmap_pg_g; 4046 if (flags & VM_PROT_ALL) { 4047 npte |= PG_U; 4048 if (flags & VM_PROT_WRITE) { 4049 KASSERT((npte & PG_RW) != 0); 4050 npte |= PG_M; 4051 } 4052 } 4053 4054 #ifdef XEN 4055 if (domid != DOMID_SELF) 4056 new_pg = NULL; 4057 else 4058 #endif 4059 new_pg = PHYS_TO_VM_PAGE(pa); 4060 if (new_pg != NULL) { 4061 /* This is a managed page */ 4062 npte |= PG_PVLIST; 4063 new_pp = VM_PAGE_TO_PP(new_pg); 4064 } else if ((new_pp = pmap_pv_tracked(pa)) != NULL) { 4065 /* This is an unmanaged pv-tracked page */ 4066 npte |= PG_PVLIST; 4067 } else { 4068 new_pp = NULL; 4069 } 4070 4071 /* get pves. */ 4072 new_pve = pool_cache_get(&pmap_pv_cache, PR_NOWAIT); 4073 new_sparepve = pool_cache_get(&pmap_pv_cache, PR_NOWAIT); 4074 if (new_pve == NULL || new_sparepve == NULL) { 4075 if (flags & PMAP_CANFAIL) { 4076 error = ENOMEM; 4077 goto out2; 4078 } 4079 panic("pmap_enter: pve allocation failed"); 4080 } 4081 4082 kpreempt_disable(); 4083 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); /* locks pmap */ 4084 if (pmap == pmap_kernel()) { 4085 ptp = NULL; 4086 } else { 4087 ptp = pmap_get_ptp(pmap, va, pdes); 4088 if (ptp == NULL) { 4089 pmap_unmap_ptes(pmap, pmap2); 4090 if (flags & PMAP_CANFAIL) { 4091 error = ENOMEM; 4092 goto out; 4093 } 4094 panic("pmap_enter: get ptp failed"); 4095 } 4096 } 4097 4098 /* 4099 * update the pte. 4100 */ 4101 4102 ptep = &ptes[pl1_i(va)]; 4103 do { 4104 opte = *ptep; 4105 4106 /* 4107 * if the same page, inherit PG_U and PG_M. 
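 * this keeps the referenced/modified accounting intact when an
 * existing mapping is merely being re-entered, e.g. to change its
 * protection or wiring.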
4108 */ 4109 if (((opte ^ npte) & (PG_FRAME | PG_V)) == 0) { 4110 npte |= opte & (PG_U | PG_M); 4111 } 4112 #if defined(XEN) 4113 if (domid != DOMID_SELF) { 4114 /* pmap_pte_cas with error handling */ 4115 int s = splvm(); 4116 if (opte != *ptep) { 4117 splx(s); 4118 continue; 4119 } 4120 error = xpq_update_foreign( 4121 vtomach((vaddr_t)ptep), npte, domid); 4122 splx(s); 4123 if (error) { 4124 if (ptp != NULL && ptp->wire_count <= 1) { 4125 pmap_free_ptp(pmap, ptp, va, ptes, pdes); 4126 } 4127 pmap_unmap_ptes(pmap, pmap2); 4128 goto out; 4129 } 4130 break; 4131 } 4132 #endif /* defined(XEN) */ 4133 } while (pmap_pte_cas(ptep, opte, npte) != opte); 4134 4135 /* 4136 * update statistics and PTP's reference count. 4137 */ 4138 4139 pmap_stats_update_bypte(pmap, npte, opte); 4140 if (ptp != NULL && !pmap_valid_entry(opte)) { 4141 ptp->wire_count++; 4142 } 4143 KASSERT(ptp == NULL || ptp->wire_count > 1); 4144 4145 /* 4146 * if the same page, we can skip pv_entry handling. 4147 */ 4148 4149 if (((opte ^ npte) & (PG_FRAME | PG_V)) == 0) { 4150 KASSERT(((opte ^ npte) & PG_PVLIST) == 0); 4151 goto same_pa; 4152 } 4153 4154 /* 4155 * if old page is pv-tracked, remove pv_entry from its list. 4156 */ 4157 4158 if ((~opte & (PG_V | PG_PVLIST)) == 0) { 4159 if ((old_pg = PHYS_TO_VM_PAGE(pmap_pte2pa(opte))) != NULL) { 4160 KASSERT(uvm_page_locked_p(old_pg)); 4161 old_pp = VM_PAGE_TO_PP(old_pg); 4162 } else if ((old_pp = pmap_pv_tracked(pmap_pte2pa(opte))) 4163 == NULL) { 4164 pa = pmap_pte2pa(opte); 4165 panic("pmap_enter: PG_PVLIST with pv-untracked page" 4166 " va = 0x%"PRIxVADDR 4167 " pa = 0x%" PRIxPADDR " (0x%" PRIxPADDR ")", 4168 va, pa, atop(pa)); 4169 } 4170 4171 old_pve = pmap_remove_pv(old_pp, ptp, va); 4172 old_pp->pp_attrs |= opte; 4173 } 4174 4175 /* 4176 * if new page is pv-tracked, insert pv_entry into its list. 4177 */ 4178 4179 if (new_pp) { 4180 new_pve = pmap_enter_pv(new_pp, new_pve, &new_sparepve, ptp, va); 4181 } 4182 4183 same_pa: 4184 pmap_unmap_ptes(pmap, pmap2); 4185 4186 /* 4187 * shootdown tlb if necessary. 4188 */ 4189 4190 if ((~opte & (PG_V | PG_U)) == 0 && 4191 ((opte ^ npte) & (PG_FRAME | PG_RW)) != 0) { 4192 pmap_tlb_shootdown(pmap, va, opte, TLBSHOOT_ENTER); 4193 } 4194 4195 error = 0; 4196 out: 4197 kpreempt_enable(); 4198 out2: 4199 if (old_pve != NULL) { 4200 pool_cache_put(&pmap_pv_cache, old_pve); 4201 } 4202 if (new_pve != NULL) { 4203 pool_cache_put(&pmap_pv_cache, new_pve); 4204 } 4205 if (new_sparepve != NULL) { 4206 pool_cache_put(&pmap_pv_cache, new_sparepve); 4207 } 4208 4209 return error; 4210 } 4211 4212 static paddr_t 4213 pmap_get_physpage(void) 4214 { 4215 struct vm_page *ptp; 4216 struct pmap *kpm = pmap_kernel(); 4217 paddr_t pa; 4218 4219 if (!uvm.page_init_done) { 4220 /* 4221 * We're growing the kernel pmap early (from 4222 * uvm_pageboot_alloc()). This case must be 4223 * handled a little differently. 
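 * "Differently" here means the page is grabbed directly with
 * uvm_page_physget() and zeroed through the direct map, a Xen
 * hypercall, or the reserved early_zero_pte mapping, because
 * uvm_pagealloc() is not available yet.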
4224 */ 4225 4226 if (!uvm_page_physget(&pa)) 4227 panic("pmap_get_physpage: out of memory"); 4228 #if defined(__HAVE_DIRECT_MAP) 4229 pagezero(PMAP_DIRECT_MAP(pa)); 4230 #else 4231 #if defined(XEN) 4232 if (XEN_VERSION_SUPPORTED(3, 4)) { 4233 xen_pagezero(pa); 4234 return pa; 4235 } 4236 #endif 4237 kpreempt_disable(); 4238 pmap_pte_set(early_zero_pte, pmap_pa2pte(pa) | PG_V | 4239 PG_RW | pmap_pg_nx | PG_k); 4240 pmap_pte_flush(); 4241 pmap_update_pg((vaddr_t)early_zerop); 4242 memset(early_zerop, 0, PAGE_SIZE); 4243 #if defined(DIAGNOSTIC) || defined(XEN) 4244 pmap_pte_set(early_zero_pte, 0); 4245 pmap_pte_flush(); 4246 #endif /* defined(DIAGNOSTIC) */ 4247 kpreempt_enable(); 4248 #endif /* defined(__HAVE_DIRECT_MAP) */ 4249 } else { 4250 /* XXX */ 4251 ptp = uvm_pagealloc(NULL, 0, NULL, 4252 UVM_PGA_USERESERVE|UVM_PGA_ZERO); 4253 if (ptp == NULL) 4254 panic("pmap_get_physpage: out of memory"); 4255 ptp->flags &= ~PG_BUSY; 4256 ptp->wire_count = 1; 4257 pa = VM_PAGE_TO_PHYS(ptp); 4258 } 4259 pmap_stats_update(kpm, 1, 0); 4260 4261 return pa; 4262 } 4263 4264 /* 4265 * Expand the page tree with the specified amount of PTPs, mapping virtual 4266 * addresses starting at kva. We populate all the levels but the last one 4267 * (L1). The nodes of the tree are created as RWX, but the pages covered 4268 * will be kentered in L1, with proper permissions. 4269 * 4270 * Used only by pmap_growkernel. 4271 */ 4272 static void 4273 pmap_alloc_level(vaddr_t kva, long *needed_ptps) 4274 { 4275 unsigned long i; 4276 paddr_t pa; 4277 unsigned long index, endindex; 4278 int level; 4279 pd_entry_t *pdep; 4280 #ifdef XEN 4281 int s = splvm(); /* protect xpq_* */ 4282 #endif 4283 4284 for (level = PTP_LEVELS; level > 1; level--) { 4285 if (level == PTP_LEVELS) 4286 pdep = pmap_kernel()->pm_pdir; 4287 else 4288 pdep = normal_pdes[level - 2]; 4289 index = pl_i_roundup(kva, level); 4290 endindex = index + needed_ptps[level - 1] - 1; 4291 4292 for (i = index; i <= endindex; i++) { 4293 pt_entry_t pte; 4294 4295 KASSERT(!pmap_valid_entry(pdep[i])); 4296 pa = pmap_get_physpage(); 4297 pte = pmap_pa2pte(pa) | PG_k | PG_V | PG_RW; 4298 pmap_pte_set(&pdep[i], pte); 4299 4300 #if defined(XEN) && (defined(PAE) || defined(__x86_64__)) 4301 if (level == PTP_LEVELS && i >= PDIR_SLOT_KERN) { 4302 if (__predict_true( 4303 cpu_info_primary.ci_flags & CPUF_PRESENT)) { 4304 /* update per-cpu PMDs on all cpus */ 4305 xen_kpm_sync(pmap_kernel(), i); 4306 } else { 4307 /* 4308 * too early; update primary CPU 4309 * PMD only (without locks) 4310 */ 4311 #ifdef PAE 4312 pd_entry_t *cpu_pdep = 4313 &cpu_info_primary.ci_kpm_pdir[l2tol2(i)]; 4314 #endif 4315 #ifdef __x86_64__ 4316 pd_entry_t *cpu_pdep = 4317 &cpu_info_primary.ci_kpm_pdir[i]; 4318 #endif 4319 pmap_pte_set(cpu_pdep, pte); 4320 } 4321 } 4322 #endif /* XEN && (PAE || __x86_64__) */ 4323 4324 KASSERT(level != PTP_LEVELS || nkptp[level - 1] + 4325 pl_i(VM_MIN_KERNEL_ADDRESS, level) == i); 4326 nkptp[level - 1]++; 4327 } 4328 pmap_pte_flush(); 4329 } 4330 #ifdef XEN 4331 splx(s); 4332 #endif 4333 } 4334 4335 /* 4336 * pmap_growkernel: increase usage of KVM space. 4337 * 4338 * => we allocate new PTPs for the kernel and install them in all 4339 * the pmaps on the system. 
4340 */ 4341 4342 vaddr_t 4343 pmap_growkernel(vaddr_t maxkvaddr) 4344 { 4345 struct pmap *kpm = pmap_kernel(); 4346 #if !defined(XEN) || !defined(__x86_64__) 4347 struct pmap *pm; 4348 long old; 4349 #endif 4350 int s, i; 4351 long needed_kptp[PTP_LEVELS], target_nptp; 4352 bool invalidate = false; 4353 4354 s = splvm(); /* to be safe */ 4355 mutex_enter(kpm->pm_lock); 4356 4357 if (maxkvaddr <= pmap_maxkvaddr) { 4358 mutex_exit(kpm->pm_lock); 4359 splx(s); 4360 return pmap_maxkvaddr; 4361 } 4362 4363 maxkvaddr = x86_round_pdr(maxkvaddr); 4364 #if !defined(XEN) || !defined(__x86_64__) 4365 old = nkptp[PTP_LEVELS - 1]; 4366 #endif 4367 4368 /* Initialize needed_kptp. */ 4369 for (i = PTP_LEVELS - 1; i >= 1; i--) { 4370 target_nptp = pl_i_roundup(maxkvaddr, i + 1) - 4371 pl_i_roundup(VM_MIN_KERNEL_ADDRESS, i + 1); 4372 4373 if (target_nptp > nkptpmax[i]) 4374 panic("out of KVA space"); 4375 KASSERT(target_nptp >= nkptp[i]); 4376 needed_kptp[i] = target_nptp - nkptp[i]; 4377 } 4378 4379 pmap_alloc_level(pmap_maxkvaddr, needed_kptp); 4380 4381 /* 4382 * If the number of top level entries changed, update all pmaps. 4383 */ 4384 if (needed_kptp[PTP_LEVELS - 1] != 0) { 4385 #ifdef XEN 4386 #ifdef __x86_64__ 4387 /* nothing, kernel entries are never entered in user pmap */ 4388 #else /* __x86_64__ */ 4389 mutex_enter(&pmaps_lock); 4390 LIST_FOREACH(pm, &pmaps, pm_list) { 4391 int pdkidx; 4392 for (pdkidx = PDIR_SLOT_KERN + old; 4393 pdkidx < PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1]; 4394 pdkidx++) { 4395 pmap_pte_set(&pm->pm_pdir[pdkidx], 4396 kpm->pm_pdir[pdkidx]); 4397 } 4398 pmap_pte_flush(); 4399 } 4400 mutex_exit(&pmaps_lock); 4401 #endif /* __x86_64__ */ 4402 #else /* XEN */ 4403 unsigned newpdes; 4404 newpdes = nkptp[PTP_LEVELS - 1] - old; 4405 mutex_enter(&pmaps_lock); 4406 LIST_FOREACH(pm, &pmaps, pm_list) { 4407 memcpy(&pm->pm_pdir[PDIR_SLOT_KERN + old], 4408 &kpm->pm_pdir[PDIR_SLOT_KERN + old], 4409 newpdes * sizeof (pd_entry_t)); 4410 } 4411 mutex_exit(&pmaps_lock); 4412 #endif 4413 invalidate = true; 4414 } 4415 pmap_maxkvaddr = maxkvaddr; 4416 mutex_exit(kpm->pm_lock); 4417 splx(s); 4418 4419 if (invalidate && pmap_initialized) { 4420 /* Invalidate the PDP cache. */ 4421 pool_cache_invalidate(&pmap_pdp_cache); 4422 } 4423 4424 return maxkvaddr; 4425 } 4426 4427 #ifdef DEBUG 4428 void pmap_dump(struct pmap *, vaddr_t, vaddr_t); 4429 4430 /* 4431 * pmap_dump: dump all the mappings from a pmap 4432 * 4433 * => caller should not be holding any pmap locks 4434 */ 4435 4436 void 4437 pmap_dump(struct pmap *pmap, vaddr_t sva, vaddr_t eva) 4438 { 4439 pt_entry_t *ptes, *pte; 4440 pd_entry_t * const *pdes; 4441 struct pmap *pmap2; 4442 vaddr_t blkendva; 4443 4444 /* 4445 * if end is out of range truncate. 4446 * if (end == start) update to max. 4447 */ 4448 4449 if (eva > VM_MAXUSER_ADDRESS || eva <= sva) 4450 eva = VM_MAXUSER_ADDRESS; 4451 4452 /* 4453 * we lock in the pmap => pv_head direction 4454 */ 4455 4456 kpreempt_disable(); 4457 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); /* locks pmap */ 4458 4459 /* 4460 * dumping a range of pages: we dump in PTP sized blocks (4MB) 4461 */ 4462 4463 for (/* null */ ; sva < eva ; sva = blkendva) { 4464 4465 /* determine range of block */ 4466 blkendva = x86_round_pdr(sva+1); 4467 if (blkendva > eva) 4468 blkendva = eva; 4469 4470 /* valid block? 
*/ 4471 if (!pmap_pdes_valid(sva, pdes, NULL)) 4472 continue; 4473 4474 pte = &ptes[pl1_i(sva)]; 4475 for (/* null */; sva < blkendva ; sva += PAGE_SIZE, pte++) { 4476 if (!pmap_valid_entry(*pte)) 4477 continue; 4478 printf("va %#" PRIxVADDR " -> pa %#" PRIxPADDR 4479 " (pte=%#" PRIxPADDR ")\n", 4480 sva, (paddr_t)pmap_pte2pa(*pte), (paddr_t)*pte); 4481 } 4482 } 4483 pmap_unmap_ptes(pmap, pmap2); 4484 kpreempt_enable(); 4485 } 4486 #endif 4487 4488 /* 4489 * pmap_update: process deferred invalidations and frees. 4490 */ 4491 4492 void 4493 pmap_update(struct pmap *pmap) 4494 { 4495 struct vm_page *empty_ptps; 4496 lwp_t *l = curlwp; 4497 4498 /* 4499 * If we have torn down this pmap, invalidate non-global TLB 4500 * entries on any processors using it. 4501 */ 4502 kpreempt_disable(); 4503 if (__predict_false(l->l_md.md_gc_pmap == pmap)) { 4504 l->l_md.md_gc_pmap = NULL; 4505 pmap_tlb_shootdown(pmap, (vaddr_t)-1LL, 0, TLBSHOOT_UPDATE); 4506 } 4507 /* 4508 * Initiate any pending TLB shootdowns. Wait for them to 4509 * complete before returning control to the caller. 4510 */ 4511 pmap_tlb_shootnow(); 4512 kpreempt_enable(); 4513 4514 /* 4515 * Now that shootdowns are complete, process deferred frees, 4516 * but not from interrupt context. 4517 */ 4518 if (l->l_md.md_gc_ptp != NULL) { 4519 KASSERT((l->l_pflag & LP_INTR) == 0); 4520 if (cpu_intr_p()) { 4521 return; 4522 } 4523 empty_ptps = l->l_md.md_gc_ptp; 4524 l->l_md.md_gc_ptp = NULL; 4525 pmap_free_ptps(empty_ptps); 4526 } 4527 } 4528 4529 #if PTP_LEVELS > 4 4530 #error "Unsupported number of page table mappings" 4531 #endif 4532 4533 paddr_t 4534 pmap_init_tmp_pgtbl(paddr_t pg) 4535 { 4536 static bool maps_loaded; 4537 static const paddr_t x86_tmp_pml_paddr[] = { 4538 4 * PAGE_SIZE, /* L1 */ 4539 5 * PAGE_SIZE, /* L2 */ 4540 6 * PAGE_SIZE, /* L3 */ 4541 7 * PAGE_SIZE /* L4 */ 4542 }; 4543 static vaddr_t x86_tmp_pml_vaddr[] = { 0, 0, 0, 0 }; 4544 4545 pd_entry_t *tmp_pml, *kernel_pml; 4546 4547 int level; 4548 4549 if (!maps_loaded) { 4550 for (level = 0; level < PTP_LEVELS; ++level) { 4551 x86_tmp_pml_vaddr[level] = 4552 uvm_km_alloc(kernel_map, PAGE_SIZE, 0, 4553 UVM_KMF_VAONLY); 4554 4555 if (x86_tmp_pml_vaddr[level] == 0) 4556 panic("mapping of real mode PML failed\n"); 4557 pmap_kenter_pa(x86_tmp_pml_vaddr[level], 4558 x86_tmp_pml_paddr[level], 4559 VM_PROT_READ | VM_PROT_WRITE, 0); 4560 } 4561 pmap_update(pmap_kernel()); 4562 maps_loaded = true; 4563 } 4564 4565 /* Zero levels 1-3 */ 4566 for (level = 0; level < PTP_LEVELS - 1; ++level) { 4567 tmp_pml = (void *)x86_tmp_pml_vaddr[level]; 4568 memset(tmp_pml, 0, PAGE_SIZE); 4569 } 4570 4571 /* Copy PML4 */ 4572 kernel_pml = pmap_kernel()->pm_pdir; 4573 tmp_pml = (void *)x86_tmp_pml_vaddr[PTP_LEVELS - 1]; 4574 memcpy(tmp_pml, kernel_pml, PAGE_SIZE); 4575 4576 #ifdef PAE 4577 /* 4578 * Use the last 4 entries of the L2 page as L3 PD entries. These 4579 * last entries are unlikely to be used for temporary mappings. 
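 * (With PAE the real L3 holds only 4 entries, so the tail of this L2
 * page is borrowed to serve as the temporary L3.)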
4580 * 508: maps 0->1GB (userland) 4581 * 509: unused 4582 * 510: unused 4583 * 511: maps 3->4GB (kernel) 4584 */ 4585 tmp_pml[508] = x86_tmp_pml_paddr[PTP_LEVELS - 1] | PG_V; 4586 tmp_pml[509] = 0; 4587 tmp_pml[510] = 0; 4588 tmp_pml[511] = pmap_pdirpa(pmap_kernel(), PDIR_SLOT_KERN) | PG_V; 4589 #endif 4590 4591 for (level = PTP_LEVELS - 1; level > 0; --level) { 4592 tmp_pml = (void *)x86_tmp_pml_vaddr[level]; 4593 4594 tmp_pml[pl_i(pg, level + 1)] = 4595 (x86_tmp_pml_paddr[level - 1] & PG_FRAME) | PG_RW | PG_V; 4596 } 4597 4598 tmp_pml = (void *)x86_tmp_pml_vaddr[0]; 4599 tmp_pml[pl_i(pg, 1)] = (pg & PG_FRAME) | PG_RW | PG_V; 4600 4601 #ifdef PAE 4602 /* Return the PA of the L3 page (entry 508 of the L2 page) */ 4603 return x86_tmp_pml_paddr[PTP_LEVELS - 1] + 508 * sizeof(pd_entry_t); 4604 #endif 4605 4606 return x86_tmp_pml_paddr[PTP_LEVELS - 1]; 4607 } 4608 4609 u_int 4610 x86_mmap_flags(paddr_t mdpgno) 4611 { 4612 u_int nflag = (mdpgno >> X86_MMAP_FLAG_SHIFT) & X86_MMAP_FLAG_MASK; 4613 u_int pflag = 0; 4614 4615 if (nflag & X86_MMAP_FLAG_PREFETCH) 4616 pflag |= PMAP_WRITE_COMBINE; 4617 4618 return pflag; 4619 } 4620