1 /* $NetBSD: pmap.c,v 1.423 2022/09/24 11:05:47 riastradh Exp $ */ 2 3 /* 4 * Copyright (c) 2008, 2010, 2016, 2017, 2019, 2020 The NetBSD Foundation, Inc. 5 * All rights reserved. 6 * 7 * This code is derived from software contributed to The NetBSD Foundation 8 * by Andrew Doran, and by Maxime Villard. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 29 * POSSIBILITY OF SUCH DAMAGE. 30 */ 31 32 /* 33 * Copyright (c) 2007 Manuel Bouyer. 34 * 35 * Redistribution and use in source and binary forms, with or without 36 * modification, are permitted provided that the following conditions 37 * are met: 38 * 1. Redistributions of source code must retain the above copyright 39 * notice, this list of conditions and the following disclaimer. 40 * 2. Redistributions in binary form must reproduce the above copyright 41 * notice, this list of conditions and the following disclaimer in the 42 * documentation and/or other materials provided with the distribution. 43 * 44 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 45 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 46 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 47 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 48 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 49 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 50 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 51 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 52 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 53 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 54 */ 55 56 /* 57 * Copyright (c) 2006 Mathieu Ropert <mro@adviseo.fr> 58 * 59 * Permission to use, copy, modify, and distribute this software for any 60 * purpose with or without fee is hereby granted, provided that the above 61 * copyright notice and this permission notice appear in all copies. 62 * 63 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 64 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 65 * MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 66 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 67 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 68 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 69 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 70 */ 71 72 /* 73 * Copyright 2001 (c) Wasabi Systems, Inc. 74 * All rights reserved. 75 * 76 * Written by Frank van der Linden for Wasabi Systems, Inc. 77 * 78 * Redistribution and use in source and binary forms, with or without 79 * modification, are permitted provided that the following conditions 80 * are met: 81 * 1. Redistributions of source code must retain the above copyright 82 * notice, this list of conditions and the following disclaimer. 83 * 2. Redistributions in binary form must reproduce the above copyright 84 * notice, this list of conditions and the following disclaimer in the 85 * documentation and/or other materials provided with the distribution. 86 * 3. All advertising materials mentioning features or use of this software 87 * must display the following acknowledgement: 88 * This product includes software developed for the NetBSD Project by 89 * Wasabi Systems, Inc. 90 * 4. The name of Wasabi Systems, Inc. may not be used to endorse 91 * or promote products derived from this software without specific prior 92 * written permission. 93 * 94 * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND 95 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 96 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 97 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL WASABI SYSTEMS, INC 98 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 99 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 100 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 101 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 102 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 103 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 104 * POSSIBILITY OF SUCH DAMAGE. 105 */ 106 107 /* 108 * Copyright (c) 1997 Charles D. Cranor and Washington University. 109 * All rights reserved. 110 * 111 * Redistribution and use in source and binary forms, with or without 112 * modification, are permitted provided that the following conditions 113 * are met: 114 * 1. Redistributions of source code must retain the above copyright 115 * notice, this list of conditions and the following disclaimer. 116 * 2. Redistributions in binary form must reproduce the above copyright 117 * notice, this list of conditions and the following disclaimer in the 118 * documentation and/or other materials provided with the distribution. 119 * 120 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 121 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 122 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
123 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 124 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 125 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 126 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 127 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 128 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 129 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 130 */ 131 132 #include <sys/cdefs.h> 133 __KERNEL_RCSID(0, "$NetBSD: pmap.c,v 1.423 2022/09/24 11:05:47 riastradh Exp $"); 134 135 #include "opt_user_ldt.h" 136 #include "opt_lockdebug.h" 137 #include "opt_multiprocessor.h" 138 #include "opt_xen.h" 139 #include "opt_svs.h" 140 #include "opt_kaslr.h" 141 #include "opt_efi.h" 142 143 #define __MUTEX_PRIVATE /* for assertions */ 144 145 #include <sys/param.h> 146 #include <sys/systm.h> 147 #include <sys/proc.h> 148 #include <sys/pool.h> 149 #include <sys/kernel.h> 150 #include <sys/atomic.h> 151 #include <sys/cpu.h> 152 #include <sys/intr.h> 153 #include <sys/xcall.h> 154 #include <sys/kcore.h> 155 #include <sys/kmem.h> 156 #include <sys/asan.h> 157 #include <sys/msan.h> 158 #include <sys/entropy.h> 159 160 #include <uvm/uvm.h> 161 #include <uvm/pmap/pmap_pvt.h> 162 163 #include <dev/isa/isareg.h> 164 165 #include <machine/specialreg.h> 166 #include <machine/gdt.h> 167 #include <machine/isa_machdep.h> 168 #include <machine/cpuvar.h> 169 #include <machine/cputypes.h> 170 #include <machine/pmap_private.h> 171 172 #include <x86/bootspace.h> 173 #include <x86/pat.h> 174 #include <x86/pmap_pv.h> 175 176 #include <x86/i82489reg.h> 177 #include <x86/i82489var.h> 178 179 #ifdef XEN 180 #include <xen/include/public/xen.h> 181 #include <xen/hypervisor.h> 182 #include <xen/xenpmap.h> 183 #endif 184 185 #ifdef __HAVE_DIRECT_MAP 186 #include <crypto/nist_hash_drbg/nist_hash_drbg.h> 187 #endif 188 189 /* 190 * general info: 191 * 192 * - for an explanation of how the x86 MMU hardware works see 193 * the comments in <machine/pte.h>. 194 * 195 * - for an explanation of the general memory structure used by 196 * this pmap (including the recursive mapping), see the comments 197 * in <machine/pmap.h>. 198 * 199 * this file contains the code for the "pmap module." the module's 200 * job is to manage the hardware's virtual to physical address mappings. 201 * note that there are two levels of mapping in the VM system: 202 * 203 * [1] the upper layer of the VM system uses vm_map's and vm_map_entry's 204 * to map ranges of virtual address space to objects/files. for 205 * example, the vm_map may say: "map VA 0x1000 to 0x22000 read-only 206 * to the file /bin/ls starting at offset zero." note that 207 * the upper layer mapping is not concerned with how individual 208 * vm_pages are mapped. 209 * 210 * [2] the lower layer of the VM system (the pmap) maintains the mappings 211 * from virtual addresses. it is concerned with which vm_page is 212 * mapped where. for example, when you run /bin/ls and start 213 * at page 0x1000 the fault routine may lookup the correct page 214 * of the /bin/ls file and then ask the pmap layer to establish 215 * a mapping for it. 216 * 217 * note that information in the lower layer of the VM system can be 218 * thrown away since it can easily be reconstructed from the info 219 * in the upper layer. 
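 *
 * (illustrative example, not from the original text: after a page fault on
 *  a page of /bin/ls, the fault handler looks up the correct page in the
 *  upper layer and then re-creates the lower layer mapping with something
 *  like
 *
 *	error = pmap_enter(vm_map_pmap(map), va, VM_PAGE_TO_PHYS(pg),
 *	    VM_PROT_READ, VM_PROT_READ | PMAP_CANFAIL);
 *
 *  which is why pmap state can be thrown away and rebuilt at any time.)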
 *
 * data structures we use include:
 *
 *  - struct pmap: describes the address space of one thread
 *  - struct pmap_page: describes one pv-tracked page, without
 *    necessarily a corresponding vm_page
 *  - struct pv_entry: describes one <PMAP,VA> mapping of a PA
 *  - pmap_page::pp_pvlist: there is one list per pv-tracked page of
 *    physical memory.  the pp_pvlist points to a list of pv_entry
 *    structures which describe all the <PMAP,VA> pairs that this
 *    page is mapped in.  this is critical for page based operations
 *    such as pmap_page_protect() [change protection on _all_ mappings
 *    of a page]
 */

/*
 * Locking
 *
 * We have the following locks that we must deal with, listed in the
 * order that they are acquired:
 *
 * pg->uobject->vmobjlock, pg->uanon->an_lock
 *
 *	For managed pages, these per-object locks are taken by the VM system
 *	before calling into the pmap module - either a read or write hold.
 *	The lock hold prevents pages from changing identity while the pmap
 *	is operating on them.  For example, the same lock is held across a
 *	call to pmap_remove() and the following call to pmap_update(), so
 *	that a page does not gain a new identity while its TLB visibility
 *	is stale.
 *
 * pmap->pm_lock
 *
 *	This lock protects the fields in the pmap structure including the
 *	non-kernel PDEs in the PDP, the PTEs, and the PTPs and connected
 *	data structures.  For modifying unmanaged kernel PTEs it is not
 *	needed as kernel PDEs are never freed, and the kernel is expected
 *	to be self consistent (and the lock can't be taken for unmanaged
 *	kernel PTEs, because they can be modified from interrupt context).
 *
 * pmaps_lock
 *
 *	This lock protects the list of active pmaps (headed by "pmaps").
 *	It's acquired when adding or removing pmaps or adjusting kernel PDEs.
 *
 * pp_lock
 *
 *	This per-page lock protects PV entry lists and the embedded PV entry
 *	in each vm_page, allowing for concurrent operation on pages by
 *	different pmaps.  This is a spin mutex at IPL_VM, because at the
 *	points it is taken context switching is usually not tolerable, and
 *	spin mutexes must block out interrupts that could take kernel_lock.
 */

/* uvm_object is abused here to index pmap_pages; make assertions happy.
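 * (Illustrative note, not in the original: a user pmap's pm_obj[] heads its
 *  list of PTPs, so finding the PTP that covers a VA is just an
 *  object/offset lookup; pmap_find_ptp() below can be thought of as roughly
 *	pg = uvm_pagelookup(&pmap->pm_obj[lvl - 2], ptp_va2o(va, lvl));
 *  and PMAP_DUMMY_LOCK()/PMAP_DUMMY_UNLOCK() exist only to keep the uvm
 *  object-lock assertions satisfied while doing so.)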
*/ 274 #ifdef DIAGNOSTIC 275 #define PMAP_DUMMY_LOCK(pm) rw_enter(&(pm)->pm_dummy_lock, RW_WRITER) 276 #define PMAP_DUMMY_UNLOCK(pm) rw_exit(&(pm)->pm_dummy_lock) 277 #else 278 #define PMAP_DUMMY_LOCK(pm) 279 #define PMAP_DUMMY_UNLOCK(pm) 280 #endif 281 282 static const struct uvm_pagerops pmap_pager = { 283 /* nothing */ 284 }; 285 286 /* 287 * pl_i(va, X) == plX_i(va) <= pl_i_roundup(va, X) 288 */ 289 #define pl_i(va, lvl) \ 290 (((VA_SIGN_POS(va)) & ptp_frames[(lvl)-1]) >> ptp_shifts[(lvl)-1]) 291 292 #define pl_i_roundup(va, lvl) pl_i((va)+ ~ptp_frames[(lvl)-1], (lvl)) 293 294 /* 295 * PTP macros: 296 * a PTP's index is the PD index of the PDE that points to it 297 * a PTP's offset is the byte-offset in the PTE space that this PTP is at 298 * a PTP's VA is the first VA mapped by that PTP 299 */ 300 301 #define ptp_va2o(va, lvl) (pl_i(va, (lvl)+1) * PAGE_SIZE) 302 303 const vaddr_t ptp_masks[] = PTP_MASK_INITIALIZER; 304 const vaddr_t ptp_frames[] = PTP_FRAME_INITIALIZER; 305 const int ptp_shifts[] = PTP_SHIFT_INITIALIZER; 306 const long nkptpmax[] = NKPTPMAX_INITIALIZER; 307 const long nbpd[] = NBPD_INITIALIZER; 308 #ifdef i386 309 pd_entry_t * const normal_pdes[] = PDES_INITIALIZER; 310 #else 311 pd_entry_t *normal_pdes[3]; 312 #endif 313 314 long nkptp[] = NKPTP_INITIALIZER; 315 316 struct pmap_head pmaps; 317 kmutex_t pmaps_lock __cacheline_aligned; 318 319 struct pcpu_area *pcpuarea __read_mostly; 320 321 static vaddr_t pmap_maxkvaddr; 322 323 /* 324 * Misc. event counters. 325 */ 326 struct evcnt pmap_iobmp_evcnt; 327 struct evcnt pmap_ldt_evcnt; 328 329 /* 330 * PAT 331 */ 332 static bool cpu_pat_enabled __read_mostly = false; 333 334 /* 335 * Global data structures 336 */ 337 338 static struct pmap kernel_pmap_store __cacheline_aligned; /* kernel's pmap */ 339 struct pmap *const kernel_pmap_ptr = &kernel_pmap_store; 340 static rb_tree_t pmap_kernel_rb __cacheline_aligned; 341 342 struct bootspace bootspace __read_mostly; 343 struct slotspace slotspace __read_mostly; 344 345 /* Set to PTE_NX if supported. */ 346 pd_entry_t pmap_pg_nx __read_mostly = 0; 347 348 /* Set to PTE_G if supported. */ 349 pd_entry_t pmap_pg_g __read_mostly = 0; 350 351 /* Set to true if large pages are supported. */ 352 int pmap_largepages __read_mostly = 0; 353 354 paddr_t lowmem_rsvd __read_mostly; 355 paddr_t avail_start __read_mostly; /* PA of first available physical page */ 356 paddr_t avail_end __read_mostly; /* PA of last available physical page */ 357 358 #ifdef XENPV 359 paddr_t pmap_pa_start; /* PA of first physical page for this domain */ 360 paddr_t pmap_pa_end; /* PA of last physical page for this domain */ 361 #endif 362 363 #define VM_PAGE_TO_PP(pg) (&(pg)->mdpage.mp_pp) 364 #define PMAP_CHECK_PP(pp) \ 365 KASSERTMSG((pp)->pp_lock.mtx_ipl._ipl == IPL_VM, "bad pmap_page %p", pp) 366 367 #define PAGE_ALIGNED(pp) \ 368 __builtin_assume_aligned((void *)(pp), PAGE_SIZE) 369 370 /* 371 * Other data structures 372 */ 373 374 static pt_entry_t protection_codes[8] __read_mostly; 375 376 static bool pmap_initialized __read_mostly = false; /* pmap_init done yet? */ 377 378 /* 379 * The following two vaddr_t's are used during system startup to keep track of 380 * how much of the kernel's VM space we have used. Once the system is started, 381 * the management of the remaining kernel VM space is turned over to the 382 * kernel_map vm_map. 
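 *
 * (Illustrative, not in the original comment: before uvm is up, allocations
 *  are simple bumps of these variables, e.g. pmap_bootstrap_valloc(1) below
 *  is effectively
 *	va = virtual_avail; virtual_avail += PAGE_SIZE; return va;
 *  and pmap_bootstrap_palloc() does the same with avail_start.)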
383 */ 384 static vaddr_t virtual_avail __read_mostly; /* VA of first free KVA */ 385 static vaddr_t virtual_end __read_mostly; /* VA of last free KVA */ 386 387 #ifndef XENPV 388 /* 389 * LAPIC virtual address, and fake physical address. 390 */ 391 volatile vaddr_t local_apic_va __read_mostly; 392 paddr_t local_apic_pa __read_mostly; 393 #endif 394 395 /* 396 * pool that pmap structures are allocated from 397 */ 398 struct pool_cache pmap_cache; 399 static int pmap_ctor(void *, void *, int); 400 static void pmap_dtor(void *, void *); 401 402 /* 403 * pv_page cache 404 */ 405 static struct pool_cache pmap_pvp_cache; 406 407 #ifdef __HAVE_DIRECT_MAP 408 vaddr_t pmap_direct_base __read_mostly; 409 vaddr_t pmap_direct_end __read_mostly; 410 #endif 411 412 #ifndef __HAVE_DIRECT_MAP 413 /* 414 * Special VAs and the PTEs that map them 415 */ 416 static pt_entry_t *early_zero_pte; 417 static void pmap_vpage_cpualloc(struct cpu_info *); 418 #ifdef XENPV 419 char *early_zerop; /* also referenced from xen_locore() */ 420 #else 421 static char *early_zerop; 422 #endif 423 #endif 424 425 int pmap_enter_default(pmap_t, vaddr_t, paddr_t, vm_prot_t, u_int); 426 427 /* PDP pool and its callbacks */ 428 static struct pool pmap_pdp_pool; 429 static void pmap_pdp_init(pd_entry_t *); 430 static void pmap_pdp_fini(pd_entry_t *); 431 432 #ifdef PAE 433 /* need to allocate items of 4 pages */ 434 static void *pmap_pdp_alloc(struct pool *, int); 435 static void pmap_pdp_free(struct pool *, void *); 436 static struct pool_allocator pmap_pdp_allocator = { 437 .pa_alloc = pmap_pdp_alloc, 438 .pa_free = pmap_pdp_free, 439 .pa_pagesz = PAGE_SIZE * PDP_SIZE, 440 }; 441 #endif 442 443 extern vaddr_t idt_vaddr; 444 extern paddr_t idt_paddr; 445 extern vaddr_t gdt_vaddr; 446 extern paddr_t gdt_paddr; 447 extern vaddr_t ldt_vaddr; 448 extern paddr_t ldt_paddr; 449 450 #ifdef i386 451 /* stuff to fix the pentium f00f bug */ 452 extern vaddr_t pentium_idt_vaddr; 453 #endif 454 455 /* Array of freshly allocated PTPs, for pmap_get_ptp(). */ 456 struct pmap_ptparray { 457 struct vm_page *pg[PTP_LEVELS + 1]; 458 bool alloced[PTP_LEVELS + 1]; 459 }; 460 461 /* 462 * PV entries are allocated in page-sized chunks and cached per-pmap to 463 * avoid intense pressure on memory allocators. 
464 */ 465 466 struct pv_page { 467 LIST_HEAD(, pv_entry) pvp_pves; 468 LIST_ENTRY(pv_page) pvp_list; 469 long pvp_nfree; 470 struct pmap *pvp_pmap; 471 }; 472 473 #define PVE_PER_PVP ((PAGE_SIZE / sizeof(struct pv_entry)) - 1) 474 475 /* 476 * PV tree prototypes 477 */ 478 479 static int pmap_compare_key(void *, const void *, const void *); 480 static int pmap_compare_nodes(void *, const void *, const void *); 481 482 /* Read-black tree */ 483 static const rb_tree_ops_t pmap_rbtree_ops = { 484 .rbto_compare_nodes = pmap_compare_nodes, 485 .rbto_compare_key = pmap_compare_key, 486 .rbto_node_offset = offsetof(struct pv_entry, pve_rb), 487 .rbto_context = NULL 488 }; 489 490 /* 491 * Local prototypes 492 */ 493 494 #ifdef __HAVE_PCPU_AREA 495 static void pmap_init_pcpu(void); 496 #endif 497 #ifdef __HAVE_DIRECT_MAP 498 static void pmap_init_directmap(struct pmap *); 499 #endif 500 #if !defined(XENPV) 501 static void pmap_remap_global(void); 502 #endif 503 #ifndef XENPV 504 static void pmap_init_lapic(void); 505 static void pmap_remap_largepages(void); 506 #endif 507 508 static int pmap_get_ptp(struct pmap *, struct pmap_ptparray *, vaddr_t, int, 509 struct vm_page **); 510 static void pmap_unget_ptp(struct pmap *, struct pmap_ptparray *); 511 static void pmap_install_ptp(struct pmap *, struct pmap_ptparray *, vaddr_t, 512 pd_entry_t * const *); 513 static struct vm_page *pmap_find_ptp(struct pmap *, vaddr_t, int); 514 static void pmap_freepage(struct pmap *, struct vm_page *, int); 515 static void pmap_free_ptp(struct pmap *, struct vm_page *, vaddr_t, 516 pt_entry_t *, pd_entry_t * const *); 517 static bool pmap_remove_pte(struct pmap *, struct vm_page *, pt_entry_t *, 518 vaddr_t); 519 static void pmap_remove_ptes(struct pmap *, struct vm_page *, vaddr_t, vaddr_t, 520 vaddr_t); 521 static int pmap_pvp_ctor(void *, void *, int); 522 static void pmap_pvp_dtor(void *, void *); 523 static struct pv_entry *pmap_alloc_pv(struct pmap *); 524 static void pmap_free_pv(struct pmap *, struct pv_entry *); 525 static void pmap_drain_pv(struct pmap *); 526 527 static void pmap_alloc_level(struct pmap *, vaddr_t, long *); 528 529 static void pmap_load1(struct lwp *, struct pmap *, struct pmap *); 530 static void pmap_reactivate(struct pmap *); 531 532 long 533 pmap_resident_count(struct pmap *pmap) 534 { 535 536 return pmap->pm_stats.resident_count; 537 } 538 539 long 540 pmap_wired_count(struct pmap *pmap) 541 { 542 543 return pmap->pm_stats.wired_count; 544 } 545 546 /* 547 * p m a p h e l p e r f u n c t i o n s 548 */ 549 550 static inline void 551 pmap_stats_update(struct pmap *pmap, int resid_diff, int wired_diff) 552 { 553 554 KASSERT(cold || mutex_owned(&pmap->pm_lock)); 555 pmap->pm_stats.resident_count += resid_diff; 556 pmap->pm_stats.wired_count += wired_diff; 557 } 558 559 static inline void 560 pmap_stats_update_bypte(struct pmap *pmap, pt_entry_t npte, pt_entry_t opte) 561 { 562 int resid_diff = ((npte & PTE_P) ? 1 : 0) - ((opte & PTE_P) ? 1 : 0); 563 int wired_diff = ((npte & PTE_WIRED) ? 1 : 0) - ((opte & PTE_WIRED) ? 
1 : 0); 564 565 KASSERT((npte & (PTE_P | PTE_WIRED)) != PTE_WIRED); 566 KASSERT((opte & (PTE_P | PTE_WIRED)) != PTE_WIRED); 567 568 pmap_stats_update(pmap, resid_diff, wired_diff); 569 } 570 571 /* 572 * ptp_to_pmap: lookup pmap by ptp 573 */ 574 static inline struct pmap * 575 ptp_to_pmap(struct vm_page *ptp) 576 { 577 struct pmap *pmap; 578 579 if (ptp == NULL) { 580 return pmap_kernel(); 581 } 582 pmap = (struct pmap *)ptp->uobject; 583 KASSERT(pmap != NULL); 584 KASSERT(&pmap->pm_obj[0] == ptp->uobject); 585 return pmap; 586 } 587 588 static inline struct pv_pte * 589 pve_to_pvpte(struct pv_entry *pve) 590 { 591 592 if (pve == NULL) 593 return NULL; 594 KASSERT((void *)&pve->pve_pte == (void *)pve); 595 return &pve->pve_pte; 596 } 597 598 static inline struct pv_entry * 599 pvpte_to_pve(struct pv_pte *pvpte) 600 { 601 struct pv_entry *pve = (void *)pvpte; 602 603 KASSERT(pve_to_pvpte(pve) == pvpte); 604 return pve; 605 } 606 607 /* 608 * Return true if the pmap page has an embedded PV entry. 609 */ 610 static inline bool 611 pv_pte_embedded(struct pmap_page *pp) 612 { 613 614 KASSERT(mutex_owned(&pp->pp_lock)); 615 return (bool)((vaddr_t)pp->pp_pte.pte_ptp | pp->pp_pte.pte_va); 616 } 617 618 /* 619 * pv_pte_first, pv_pte_next: PV list iterator. 620 */ 621 static inline struct pv_pte * 622 pv_pte_first(struct pmap_page *pp) 623 { 624 625 KASSERT(mutex_owned(&pp->pp_lock)); 626 if (pv_pte_embedded(pp)) { 627 return &pp->pp_pte; 628 } 629 return pve_to_pvpte(LIST_FIRST(&pp->pp_pvlist)); 630 } 631 632 static inline struct pv_pte * 633 pv_pte_next(struct pmap_page *pp, struct pv_pte *pvpte) 634 { 635 636 KASSERT(mutex_owned(&pp->pp_lock)); 637 KASSERT(pvpte != NULL); 638 if (pvpte == &pp->pp_pte) { 639 return pve_to_pvpte(LIST_FIRST(&pp->pp_pvlist)); 640 } 641 return pve_to_pvpte(LIST_NEXT(pvpte_to_pve(pvpte), pve_list)); 642 } 643 644 static inline uint8_t 645 pmap_pte_to_pp_attrs(pt_entry_t pte) 646 { 647 uint8_t ret = 0; 648 if (pte & PTE_D) 649 ret |= PP_ATTRS_D; 650 if (pte & PTE_A) 651 ret |= PP_ATTRS_A; 652 if (pte & PTE_W) 653 ret |= PP_ATTRS_W; 654 return ret; 655 } 656 657 static inline pt_entry_t 658 pmap_pp_attrs_to_pte(uint8_t attrs) 659 { 660 pt_entry_t pte = 0; 661 if (attrs & PP_ATTRS_D) 662 pte |= PTE_D; 663 if (attrs & PP_ATTRS_A) 664 pte |= PTE_A; 665 if (attrs & PP_ATTRS_W) 666 pte |= PTE_W; 667 return pte; 668 } 669 670 /* 671 * pmap_is_curpmap: is this pmap the one currently loaded [in %cr3]? 672 * of course the kernel is always loaded 673 */ 674 bool 675 pmap_is_curpmap(struct pmap *pmap) 676 { 677 return ((pmap == pmap_kernel()) || (pmap == curcpu()->ci_pmap)); 678 } 679 680 inline void 681 pmap_reference(struct pmap *pmap) 682 { 683 684 atomic_inc_uint(&pmap->pm_obj[0].uo_refs); 685 } 686 687 /* 688 * rbtree: compare two nodes. 689 */ 690 static int 691 pmap_compare_nodes(void *context, const void *n1, const void *n2) 692 { 693 const struct pv_entry *pve1 = n1; 694 const struct pv_entry *pve2 = n2; 695 696 KASSERT(pve1->pve_pte.pte_ptp == pve2->pve_pte.pte_ptp); 697 698 if (pve1->pve_pte.pte_va < pve2->pve_pte.pte_va) { 699 return -1; 700 } 701 if (pve1->pve_pte.pte_va > pve2->pve_pte.pte_va) { 702 return 1; 703 } 704 return 0; 705 } 706 707 /* 708 * rbtree: compare a node and a key. 
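 * (Illustrative, not in the original: the lookup key is a plain VA cast to
 *  a pointer, so callers search with something like
 *	pve = rb_tree_find_node(tree, (void *)va);
 *  which lands here with k == (void *)va.)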
709 */ 710 static int 711 pmap_compare_key(void *context, const void *n, const void *k) 712 { 713 const struct pv_entry *pve = n; 714 const vaddr_t key = (vaddr_t)k; 715 716 if (pve->pve_pte.pte_va < key) { 717 return -1; 718 } 719 if (pve->pve_pte.pte_va > key) { 720 return 1; 721 } 722 return 0; 723 } 724 725 /* 726 * pmap_ptp_range_set: abuse ptp->uanon to record minimum VA of PTE 727 */ 728 static inline void 729 pmap_ptp_range_set(struct vm_page *ptp, vaddr_t va) 730 { 731 vaddr_t *min = (vaddr_t *)&ptp->uanon; 732 733 if (va < *min) { 734 *min = va; 735 } 736 } 737 738 /* 739 * pmap_ptp_range_clip: abuse ptp->uanon to clip range of PTEs to remove 740 */ 741 static inline void 742 pmap_ptp_range_clip(struct vm_page *ptp, vaddr_t *startva, pt_entry_t **pte) 743 { 744 vaddr_t sclip; 745 746 if (ptp == NULL) { 747 return; 748 } 749 750 sclip = (vaddr_t)ptp->uanon; 751 sclip = (*startva < sclip ? sclip : *startva); 752 *pte += (sclip - *startva) / PAGE_SIZE; 753 *startva = sclip; 754 } 755 756 /* 757 * pmap_map_ptes: map a pmap's PTEs into KVM and lock them in 758 * 759 * there are several pmaps involved. some or all of them might be same. 760 * 761 * - the pmap given by the first argument 762 * our caller wants to access this pmap's PTEs. 763 * 764 * - pmap_kernel() 765 * the kernel pmap. note that it only contains the kernel part 766 * of the address space which is shared by any pmap. ie. any 767 * pmap can be used instead of pmap_kernel() for our purpose. 768 * 769 * - ci->ci_pmap 770 * pmap currently loaded on the cpu. 771 * 772 * - vm_map_pmap(&curproc->p_vmspace->vm_map) 773 * current process' pmap. 774 * 775 * => caller must lock pmap first (if not the kernel pmap) 776 * => must be undone with pmap_unmap_ptes before returning 777 * => disables kernel preemption 778 */ 779 void 780 pmap_map_ptes(struct pmap *pmap, struct pmap **pmap2, pd_entry_t **ptepp, 781 pd_entry_t * const **pdeppp) 782 { 783 struct pmap *curpmap; 784 struct cpu_info *ci; 785 lwp_t *l; 786 787 kpreempt_disable(); 788 789 /* The kernel's pmap is always accessible. */ 790 if (pmap == pmap_kernel()) { 791 *pmap2 = NULL; 792 *ptepp = PTE_BASE; 793 *pdeppp = normal_pdes; 794 return; 795 } 796 797 KASSERT(mutex_owned(&pmap->pm_lock)); 798 799 l = curlwp; 800 ci = l->l_cpu; 801 curpmap = ci->ci_pmap; 802 if (pmap == curpmap) { 803 /* 804 * Already on the CPU: make it valid. This is very 805 * often the case during exit(), when we have switched 806 * to the kernel pmap in order to destroy a user pmap. 807 */ 808 if (__predict_false(ci->ci_tlbstate != TLBSTATE_VALID)) { 809 pmap_reactivate(pmap); 810 } 811 *pmap2 = NULL; 812 } else { 813 /* 814 * Toss current pmap from CPU and install new pmap, but keep 815 * a reference to the old one. Dropping the reference can 816 * can block as it needs to take locks, so defer that to 817 * pmap_unmap_ptes(). 818 */ 819 pmap_reference(pmap); 820 pmap_load1(l, pmap, curpmap); 821 *pmap2 = curpmap; 822 } 823 KASSERT(ci->ci_tlbstate == TLBSTATE_VALID); 824 #ifdef DIAGNOSTIC 825 pmap->pm_ncsw = lwp_pctr(); 826 #endif 827 *ptepp = PTE_BASE; 828 829 #if defined(XENPV) && defined(__x86_64__) 830 KASSERT(ci->ci_normal_pdes[PTP_LEVELS - 2] == L4_BASE); 831 ci->ci_normal_pdes[PTP_LEVELS - 2] = pmap->pm_pdir; 832 *pdeppp = ci->ci_normal_pdes; 833 #else 834 *pdeppp = normal_pdes; 835 #endif 836 } 837 838 /* 839 * pmap_unmap_ptes: unlock the PTE mapping of "pmap" 840 * 841 * => we cannot tolerate context switches while mapped in: assert this. 842 * => reenables kernel preemption. 
843 * => does not unlock pmap. 844 */ 845 void 846 pmap_unmap_ptes(struct pmap *pmap, struct pmap * pmap2) 847 { 848 struct cpu_info *ci; 849 struct pmap *mypmap; 850 struct lwp *l; 851 852 KASSERT(kpreempt_disabled()); 853 854 /* The kernel's pmap is always accessible. */ 855 if (pmap == pmap_kernel()) { 856 kpreempt_enable(); 857 return; 858 } 859 860 l = curlwp; 861 ci = l->l_cpu; 862 863 KASSERT(mutex_owned(&pmap->pm_lock)); 864 KASSERT(pmap->pm_ncsw == lwp_pctr()); 865 866 #if defined(XENPV) && defined(__x86_64__) 867 KASSERT(ci->ci_normal_pdes[PTP_LEVELS - 2] != L4_BASE); 868 ci->ci_normal_pdes[PTP_LEVELS - 2] = L4_BASE; 869 #endif 870 871 /* If not our own pmap, mark whatever's on the CPU now as lazy. */ 872 KASSERT(ci->ci_tlbstate == TLBSTATE_VALID); 873 mypmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map); 874 if (ci->ci_pmap == vm_map_pmap(&l->l_proc->p_vmspace->vm_map)) { 875 ci->ci_want_pmapload = 0; 876 } else { 877 ci->ci_want_pmapload = (mypmap != pmap_kernel()); 878 ci->ci_tlbstate = TLBSTATE_LAZY; 879 } 880 881 /* Now safe to re-enable preemption. */ 882 kpreempt_enable(); 883 884 /* Toss reference to other pmap taken earlier. */ 885 if (pmap2 != NULL) { 886 pmap_destroy(pmap2); 887 } 888 } 889 890 inline static void 891 pmap_exec_account(struct pmap *pm, vaddr_t va, pt_entry_t opte, pt_entry_t npte) 892 { 893 894 #if !defined(__x86_64__) 895 if (curproc == NULL || curproc->p_vmspace == NULL || 896 pm != vm_map_pmap(&curproc->p_vmspace->vm_map)) 897 return; 898 899 if ((opte ^ npte) & PTE_X) 900 pmap_update_pg(va); 901 902 /* 903 * Executability was removed on the last executable change. 904 * Reset the code segment to something conservative and 905 * let the trap handler deal with setting the right limit. 906 * We can't do that because of locking constraints on the vm map. 907 */ 908 909 if ((opte & PTE_X) && (npte & PTE_X) == 0 && va == pm->pm_hiexec) { 910 struct trapframe *tf = curlwp->l_md.md_regs; 911 912 tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL); 913 pm->pm_hiexec = I386_MAX_EXE_ADDR; 914 } 915 #endif /* !defined(__x86_64__) */ 916 } 917 918 #if !defined(__x86_64__) 919 /* 920 * Fixup the code segment to cover all potential executable mappings. 921 * returns 0 if no changes to the code segment were made. 922 */ 923 int 924 pmap_exec_fixup(struct vm_map *map, struct trapframe *tf, struct pcb *pcb) 925 { 926 struct vm_map_entry *ent; 927 struct pmap *pm = vm_map_pmap(map); 928 vaddr_t va = 0; 929 930 vm_map_lock_read(map); 931 for (ent = (&map->header)->next; ent != &map->header; ent = ent->next) { 932 /* 933 * This entry has greater va than the entries before. 934 * We need to make it point to the last page, not past it. 935 */ 936 if (ent->protection & VM_PROT_EXECUTE) 937 va = trunc_page(ent->end) - PAGE_SIZE; 938 } 939 vm_map_unlock_read(map); 940 if (va == pm->pm_hiexec && tf->tf_cs == GSEL(GUCODEBIG_SEL, SEL_UPL)) 941 return 0; 942 943 pm->pm_hiexec = va; 944 if (pm->pm_hiexec > I386_MAX_EXE_ADDR) { 945 tf->tf_cs = GSEL(GUCODEBIG_SEL, SEL_UPL); 946 } else { 947 tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL); 948 return 0; 949 } 950 return 1; 951 } 952 #endif /* !defined(__x86_64__) */ 953 954 void 955 pat_init(struct cpu_info *ci) 956 { 957 #ifndef XENPV 958 uint64_t pat; 959 960 if (!(ci->ci_feat_val[0] & CPUID_PAT)) 961 return; 962 963 /* We change WT to WC. Leave all other entries the default values. 
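 * (Illustrative, not in the original comment: the CPU indexes this table
 *  with (PAT << 2) | (PCD << 1) | PWT taken from the PTE, so with the
 *  layout below a PTE that has only the PWT bit set selects entry 1 and
 *  gets write-combining instead of the power-on default write-through.)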
*/ 964 pat = PATENTRY(0, PAT_WB) | PATENTRY(1, PAT_WC) | 965 PATENTRY(2, PAT_UCMINUS) | PATENTRY(3, PAT_UC) | 966 PATENTRY(4, PAT_WB) | PATENTRY(5, PAT_WC) | 967 PATENTRY(6, PAT_UCMINUS) | PATENTRY(7, PAT_UC); 968 969 wrmsr(MSR_CR_PAT, pat); 970 cpu_pat_enabled = true; 971 #endif 972 } 973 974 static pt_entry_t 975 pmap_pat_flags(u_int flags) 976 { 977 u_int cacheflags = (flags & PMAP_CACHE_MASK); 978 979 if (!cpu_pat_enabled) { 980 switch (cacheflags) { 981 case PMAP_NOCACHE: 982 case PMAP_NOCACHE_OVR: 983 /* results in PGC_UCMINUS on cpus which have 984 * the cpuid PAT but PAT "disabled" 985 */ 986 return PTE_PCD; 987 default: 988 return 0; 989 } 990 } 991 992 switch (cacheflags) { 993 case PMAP_NOCACHE: 994 return PGC_UC; 995 case PMAP_WRITE_COMBINE: 996 return PGC_WC; 997 case PMAP_WRITE_BACK: 998 return PGC_WB; 999 case PMAP_NOCACHE_OVR: 1000 return PGC_UCMINUS; 1001 } 1002 1003 return 0; 1004 } 1005 1006 /* 1007 * p m a p k e n t e r f u n c t i o n s 1008 * 1009 * functions to quickly enter/remove pages from the kernel address 1010 * space. pmap_kremove is exported to MI kernel. we make use of 1011 * the recursive PTE mappings. 1012 */ 1013 1014 /* 1015 * pmap_kenter_pa: enter a kernel mapping without R/M (pv_entry) tracking 1016 * 1017 * => no need to lock anything, assume va is already allocated 1018 * => should be faster than normal pmap enter function 1019 */ 1020 void 1021 pmap_kenter_pa(vaddr_t va, paddr_t pa, vm_prot_t prot, u_int flags) 1022 { 1023 pt_entry_t *pte, opte, npte; 1024 1025 KASSERT(!(prot & ~VM_PROT_ALL)); 1026 1027 if (va < VM_MIN_KERNEL_ADDRESS) 1028 pte = vtopte(va); 1029 else 1030 pte = kvtopte(va); 1031 #if defined(XENPV) && defined(DOM0OPS) 1032 if (pa < pmap_pa_start || pa >= pmap_pa_end) { 1033 #ifdef DEBUG 1034 printf_nolog("%s: pa %#" PRIxPADDR " for va %#" PRIxVADDR 1035 " outside range\n", __func__, pa, va); 1036 #endif /* DEBUG */ 1037 npte = pa; 1038 } else 1039 #endif /* XENPV && DOM0OPS */ 1040 npte = pmap_pa2pte(pa); 1041 npte |= protection_codes[prot] | PTE_P | pmap_pg_g; 1042 npte |= pmap_pat_flags(flags); 1043 opte = pmap_pte_testset(pte, npte); /* zap! */ 1044 1045 /* 1046 * XXX: make sure we are not dealing with a large page, since the only 1047 * large pages created are for the kernel image, and they should never 1048 * be kentered. 1049 */ 1050 KASSERTMSG(!(opte & PTE_PS), "PTE_PS va=%#"PRIxVADDR, va); 1051 1052 if ((opte & (PTE_P | PTE_A)) == (PTE_P | PTE_A)) { 1053 /* This should not happen. */ 1054 printf_nolog("%s: mapping already present\n", __func__); 1055 kpreempt_disable(); 1056 pmap_tlb_shootdown(pmap_kernel(), va, opte, TLBSHOOT_KENTER); 1057 kpreempt_enable(); 1058 } 1059 } 1060 1061 __strict_weak_alias(pmap_kenter_ma, pmap_kenter_pa); 1062 1063 #if defined(__x86_64__) 1064 /* 1065 * Change protection for a virtual address. Local for a CPU only, don't 1066 * care about TLB shootdowns. 
1067 * 1068 * => must be called with preemption disabled 1069 */ 1070 void 1071 pmap_changeprot_local(vaddr_t va, vm_prot_t prot) 1072 { 1073 pt_entry_t *pte, opte, npte; 1074 1075 KASSERT(kpreempt_disabled()); 1076 1077 if (va < VM_MIN_KERNEL_ADDRESS) 1078 pte = vtopte(va); 1079 else 1080 pte = kvtopte(va); 1081 1082 npte = opte = *pte; 1083 1084 if ((prot & VM_PROT_WRITE) != 0) 1085 npte |= PTE_W; 1086 else 1087 npte &= ~(PTE_W|PTE_D); 1088 1089 if (opte != npte) { 1090 pmap_pte_set(pte, npte); 1091 pmap_pte_flush(); 1092 invlpg(va); 1093 } 1094 } 1095 #endif /* defined(__x86_64__) */ 1096 1097 /* 1098 * pmap_kremove: remove a kernel mapping(s) without R/M (pv_entry) tracking 1099 * 1100 * => no need to lock anything 1101 * => caller must dispose of any vm_page mapped in the va range 1102 * => note: not an inline function 1103 * => we assume the va is page aligned and the len is a multiple of PAGE_SIZE 1104 * => we assume kernel only unmaps valid addresses and thus don't bother 1105 * checking the valid bit before doing TLB flushing 1106 * => must be followed by call to pmap_update() before reuse of page 1107 */ 1108 static void 1109 pmap_kremove1(vaddr_t sva, vsize_t len, bool localonly) 1110 { 1111 pt_entry_t *pte, opte; 1112 vaddr_t va, eva; 1113 1114 eva = sva + len; 1115 1116 kpreempt_disable(); 1117 for (va = sva; va < eva; va += PAGE_SIZE) { 1118 pte = kvtopte(va); 1119 opte = pmap_pte_testset(pte, 0); /* zap! */ 1120 if ((opte & (PTE_P | PTE_A)) == (PTE_P | PTE_A) && !localonly) { 1121 pmap_tlb_shootdown(pmap_kernel(), va, opte, 1122 TLBSHOOT_KREMOVE); 1123 } 1124 KASSERTMSG((opte & PTE_PS) == 0, 1125 "va %#" PRIxVADDR " is a large page", va); 1126 KASSERTMSG((opte & PTE_PVLIST) == 0, 1127 "va %#" PRIxVADDR " is a pv tracked page", va); 1128 } 1129 if (localonly) { 1130 tlbflushg(); 1131 } 1132 kpreempt_enable(); 1133 } 1134 1135 void 1136 pmap_kremove(vaddr_t sva, vsize_t len) 1137 { 1138 1139 pmap_kremove1(sva, len, false); 1140 } 1141 1142 /* 1143 * pmap_kremove_local: like pmap_kremove(), but only worry about 1144 * TLB invalidations on the current CPU. this is only intended 1145 * for use while writing kernel crash dumps, either after panic 1146 * or via reboot -d. 1147 */ 1148 void 1149 pmap_kremove_local(vaddr_t sva, vsize_t len) 1150 { 1151 1152 pmap_kremove1(sva, len, true); 1153 } 1154 1155 /* 1156 * p m a p i n i t f u n c t i o n s 1157 * 1158 * pmap_bootstrap and pmap_init are called during system startup 1159 * to init the pmap module. pmap_bootstrap() does a low level 1160 * init just to get things rolling. pmap_init() finishes the job. 1161 */ 1162 1163 /* 1164 * pmap_bootstrap_valloc: allocate a virtual address in the bootstrap area. 1165 * This function is to be used before any VM system has been set up. 1166 * 1167 * The va is taken from virtual_avail. 1168 */ 1169 static vaddr_t 1170 pmap_bootstrap_valloc(size_t npages) 1171 { 1172 vaddr_t va = virtual_avail; 1173 virtual_avail += npages * PAGE_SIZE; 1174 return va; 1175 } 1176 1177 /* 1178 * pmap_bootstrap_palloc: allocate a physical address in the bootstrap area. 1179 * This function is to be used before any VM system has been set up. 1180 * 1181 * The pa is taken from avail_start. 1182 */ 1183 static paddr_t 1184 pmap_bootstrap_palloc(size_t npages) 1185 { 1186 paddr_t pa = avail_start; 1187 avail_start += npages * PAGE_SIZE; 1188 return pa; 1189 } 1190 1191 /* 1192 * pmap_bootstrap: get the system in a state where it can run with VM properly 1193 * enabled (called before main()). 
The VM system is fully init'd later. 1194 * 1195 * => on i386, locore.S has already enabled the MMU by allocating a PDP for the 1196 * kernel, and nkpde PTP's for the kernel. 1197 * => kva_start is the first free virtual address in kernel space. 1198 */ 1199 void 1200 pmap_bootstrap(vaddr_t kva_start) 1201 { 1202 struct pmap *kpm; 1203 int i; 1204 vaddr_t kva; 1205 1206 pmap_pg_nx = (cpu_feature[2] & CPUID_NOX ? PTE_NX : 0); 1207 1208 /* 1209 * Set up our local static global vars that keep track of the usage of 1210 * KVM before kernel_map is set up. 1211 */ 1212 virtual_avail = kva_start; /* first free KVA */ 1213 virtual_end = VM_MAX_KERNEL_ADDRESS; /* last KVA */ 1214 1215 /* 1216 * Set up protection_codes: we need to be able to convert from a MI 1217 * protection code (some combo of VM_PROT...) to something we can jam 1218 * into a x86 PTE. 1219 */ 1220 protection_codes[VM_PROT_NONE] = pmap_pg_nx; 1221 protection_codes[VM_PROT_EXECUTE] = PTE_X; 1222 protection_codes[VM_PROT_READ] = pmap_pg_nx; 1223 protection_codes[VM_PROT_READ|VM_PROT_EXECUTE] = PTE_X; 1224 protection_codes[VM_PROT_WRITE] = PTE_W | pmap_pg_nx; 1225 protection_codes[VM_PROT_WRITE|VM_PROT_EXECUTE] = PTE_W | PTE_X; 1226 protection_codes[VM_PROT_WRITE|VM_PROT_READ] = PTE_W | pmap_pg_nx; 1227 protection_codes[VM_PROT_ALL] = PTE_W | PTE_X; 1228 1229 /* 1230 * Now we init the kernel's pmap. 1231 * 1232 * The kernel pmap's pm_obj is not used for much. However, in user pmaps 1233 * the pm_obj contains the list of active PTPs. 1234 */ 1235 kpm = pmap_kernel(); 1236 mutex_init(&kpm->pm_lock, MUTEX_DEFAULT, IPL_NONE); 1237 rw_init(&kpm->pm_dummy_lock); 1238 for (i = 0; i < PTP_LEVELS - 1; i++) { 1239 uvm_obj_init(&kpm->pm_obj[i], &pmap_pager, false, 1); 1240 uvm_obj_setlock(&kpm->pm_obj[i], &kpm->pm_dummy_lock); 1241 kpm->pm_ptphint[i] = NULL; 1242 } 1243 memset(&kpm->pm_list, 0, sizeof(kpm->pm_list)); /* pm_list not used */ 1244 1245 kpm->pm_pdir = (pd_entry_t *)bootspace.pdir; 1246 for (i = 0; i < PDP_SIZE; i++) 1247 kpm->pm_pdirpa[i] = PDPpaddr + PAGE_SIZE * i; 1248 1249 kpm->pm_stats.wired_count = kpm->pm_stats.resident_count = 1250 x86_btop(kva_start - VM_MIN_KERNEL_ADDRESS); 1251 1252 kcpuset_create(&kpm->pm_cpus, true); 1253 kcpuset_create(&kpm->pm_kernel_cpus, true); 1254 1255 kpm->pm_ldt = NULL; 1256 kpm->pm_ldt_sel = GSYSSEL(GLDT_SEL, SEL_KPL); 1257 1258 /* 1259 * the above is just a rough estimate and not critical to the proper 1260 * operation of the system. 1261 */ 1262 1263 #if !defined(XENPV) 1264 /* 1265 * Begin to enable global TLB entries if they are supported: add PTE_G 1266 * attribute to already mapped kernel pages. Do that only if SVS is 1267 * disabled. 1268 * 1269 * The G bit has no effect until the CR4_PGE bit is set in CR4, which 1270 * happens later in cpu_init(). 1271 */ 1272 #ifdef SVS 1273 if (!svs_enabled && (cpu_feature[0] & CPUID_PGE)) { 1274 #else 1275 if (cpu_feature[0] & CPUID_PGE) { 1276 #endif 1277 pmap_pg_g = PTE_G; 1278 pmap_remap_global(); 1279 } 1280 #endif 1281 1282 #ifndef XENPV 1283 /* 1284 * Enable large pages if they are supported. 1285 */ 1286 if (cpu_feature[0] & CPUID_PSE) { 1287 lcr4(rcr4() | CR4_PSE); /* enable hardware (via %cr4) */ 1288 pmap_largepages = 1; /* enable software */ 1289 1290 /* 1291 * The TLB must be flushed after enabling large pages on Pentium 1292 * CPUs, according to section 3.6.2.2 of "Intel Architecture 1293 * Software Developer's Manual, Volume 3: System Programming". 1294 */ 1295 tlbflushg(); 1296 1297 /* Remap the kernel. 
*/ 1298 pmap_remap_largepages(); 1299 } 1300 pmap_init_lapic(); 1301 #endif /* !XENPV */ 1302 1303 #ifdef __HAVE_PCPU_AREA 1304 pmap_init_pcpu(); 1305 #endif 1306 1307 #ifdef __HAVE_DIRECT_MAP 1308 pmap_init_directmap(kpm); 1309 #else 1310 pmap_vpage_cpualloc(&cpu_info_primary); 1311 1312 if (VM_MIN_KERNEL_ADDRESS == KERNBASE) { /* i386 */ 1313 early_zerop = (void *)cpu_info_primary.vpage[VPAGE_ZER]; 1314 early_zero_pte = cpu_info_primary.vpage_pte[VPAGE_ZER]; 1315 } else { /* amd64 */ 1316 /* 1317 * zero_pte is stuck at the end of mapped space for the kernel 1318 * image (disjunct from kva space). This is done so that it 1319 * can safely be used in pmap_growkernel (pmap_get_physpage), 1320 * when it's called for the first time. 1321 * XXXfvdl fix this for MULTIPROCESSOR later. 1322 */ 1323 #ifdef XENPV 1324 /* early_zerop initialized in xen_locore() */ 1325 #else 1326 early_zerop = (void *)bootspace.spareva; 1327 #endif 1328 early_zero_pte = PTE_BASE + pl1_i((vaddr_t)early_zerop); 1329 } 1330 #endif 1331 1332 #if defined(XENPV) && defined(__x86_64__) 1333 extern vaddr_t xen_dummy_page; 1334 paddr_t xen_dummy_user_pgd; 1335 1336 /* 1337 * We want a dummy page directory for Xen: when deactivating a pmap, 1338 * Xen will still consider it active. So we set user PGD to this one 1339 * to lift all protection on the now inactive page tables set. 1340 */ 1341 xen_dummy_user_pgd = xen_dummy_page - KERNBASE; 1342 1343 /* Zero fill it, the less checks in Xen it requires the better */ 1344 memset(PAGE_ALIGNED(xen_dummy_user_pgd + KERNBASE), 0, PAGE_SIZE); 1345 /* Mark read-only */ 1346 HYPERVISOR_update_va_mapping(xen_dummy_user_pgd + KERNBASE, 1347 pmap_pa2pte(xen_dummy_user_pgd) | PTE_P | pmap_pg_nx, 1348 UVMF_INVLPG); 1349 /* Pin as L4 */ 1350 xpq_queue_pin_l4_table(xpmap_ptom_masked(xen_dummy_user_pgd)); 1351 #endif 1352 1353 /* 1354 * Allocate space for the IDT, GDT and LDT. 1355 */ 1356 idt_vaddr = pmap_bootstrap_valloc(1); 1357 idt_paddr = pmap_bootstrap_palloc(1); 1358 1359 gdt_vaddr = pmap_bootstrap_valloc(1); 1360 gdt_paddr = pmap_bootstrap_palloc(1); 1361 1362 #ifdef __HAVE_PCPU_AREA 1363 ldt_vaddr = (vaddr_t)&pcpuarea->ldt; 1364 #else 1365 ldt_vaddr = pmap_bootstrap_valloc(1); 1366 #endif 1367 ldt_paddr = pmap_bootstrap_palloc(1); 1368 1369 #if !defined(__x86_64__) 1370 /* pentium f00f bug stuff */ 1371 pentium_idt_vaddr = pmap_bootstrap_valloc(1); 1372 #endif 1373 1374 #if defined(XENPVHVM) 1375 /* XXX: move to hypervisor.c with appropriate API adjustments */ 1376 extern paddr_t HYPERVISOR_shared_info_pa; 1377 extern volatile struct xencons_interface *xencons_interface; /* XXX */ 1378 extern struct xenstore_domain_interface *xenstore_interface; /* XXX */ 1379 1380 if (vm_guest != VM_GUEST_XENPVH) { 1381 HYPERVISOR_shared_info = (void *) pmap_bootstrap_valloc(1); 1382 HYPERVISOR_shared_info_pa = pmap_bootstrap_palloc(1); 1383 } 1384 xencons_interface = (void *) pmap_bootstrap_valloc(1); 1385 xenstore_interface = (void *) pmap_bootstrap_valloc(1); 1386 #endif 1387 /* 1388 * Now we reserve some VM for mapping pages when doing a crash dump. 1389 */ 1390 virtual_avail = reserve_dumppages(virtual_avail); 1391 1392 /* 1393 * Init the global lock and global list. 1394 */ 1395 mutex_init(&pmaps_lock, MUTEX_DEFAULT, IPL_NONE); 1396 LIST_INIT(&pmaps); 1397 1398 /* 1399 * Ensure the TLB is sync'd with reality by flushing it... 1400 */ 1401 tlbflushg(); 1402 1403 /* 1404 * Calculate pmap_maxkvaddr from nkptp[]. 
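 * (i.e. pmap_maxkvaddr = VM_MIN_KERNEL_ADDRESS +
 *	sum over i in [1 .. PTP_LEVELS-1] of nkptp[i] * nbpd[i],
 *  the highest KVA currently covered by the bootstrap page tables.)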
1405 */ 1406 kva = VM_MIN_KERNEL_ADDRESS; 1407 for (i = PTP_LEVELS - 1; i >= 1; i--) { 1408 kva += nkptp[i] * nbpd[i]; 1409 } 1410 pmap_maxkvaddr = kva; 1411 } 1412 1413 #ifndef XENPV 1414 static void 1415 pmap_init_lapic(void) 1416 { 1417 /* 1418 * On CPUs that have no LAPIC, local_apic_va is never kentered. But our 1419 * x86 implementation relies a lot on this address to be valid; so just 1420 * allocate a fake physical page that will be kentered into 1421 * local_apic_va by machdep. 1422 * 1423 * If the LAPIC is present, the va will be remapped somewhere else 1424 * later in lapic_map. 1425 */ 1426 local_apic_va = pmap_bootstrap_valloc(1); 1427 local_apic_pa = pmap_bootstrap_palloc(1); 1428 } 1429 #endif 1430 1431 #ifdef __x86_64__ 1432 static size_t 1433 pmap_pagetree_nentries_range(vaddr_t startva, vaddr_t endva, size_t pgsz) 1434 { 1435 size_t npages; 1436 npages = (roundup(endva, pgsz) / pgsz) - 1437 (rounddown(startva, pgsz) / pgsz); 1438 return npages; 1439 } 1440 #endif 1441 1442 #if defined(__HAVE_DIRECT_MAP) || defined(KASAN) || defined(KMSAN) 1443 static inline void 1444 slotspace_copy(int type, pd_entry_t *dst, pd_entry_t *src) 1445 { 1446 size_t sslot = slotspace.area[type].sslot; 1447 size_t nslot = slotspace.area[type].nslot; 1448 1449 memcpy(&dst[sslot], &src[sslot], nslot * sizeof(pd_entry_t)); 1450 } 1451 #endif 1452 1453 #ifdef __x86_64__ 1454 /* 1455 * Randomize the location of an area. We count the holes in the VM space. We 1456 * randomly select one hole, and then randomly select an area within that hole. 1457 * Finally we update the associated entry in the slotspace structure. 1458 */ 1459 vaddr_t 1460 slotspace_rand(int type, size_t sz, size_t align, size_t randhole, 1461 vaddr_t randva) 1462 { 1463 struct { 1464 int start; 1465 int end; 1466 } holes[SLSPACE_NAREAS+1]; 1467 size_t i, nholes, hole; 1468 size_t startsl, endsl, nslots, winsize; 1469 vaddr_t startva, va; 1470 1471 sz = roundup(sz, align); 1472 1473 /* 1474 * Take one more slot with +NBPD_L4, because we may end up choosing 1475 * an area that crosses slots: 1476 * +------+------+------+ 1477 * | Slot | Slot | Slot | 1478 * +------+------+------+ 1479 * [Chosen Area] 1480 * And in that case we must take into account the additional slot 1481 * consumed. 1482 */ 1483 nslots = roundup(sz+NBPD_L4, NBPD_L4) / NBPD_L4; 1484 1485 /* Get the holes. */ 1486 nholes = 0; 1487 size_t curslot = 0 + 256; /* end of SLAREA_USER */ 1488 while (1) { 1489 /* 1490 * Find the first occupied slot after the current one. 1491 * The area between the two is a hole. 1492 */ 1493 size_t minsslot = 512; 1494 size_t minnslot = 0; 1495 for (i = 0; i < SLSPACE_NAREAS; i++) { 1496 if (!slotspace.area[i].active) 1497 continue; 1498 if (slotspace.area[i].sslot >= curslot && 1499 slotspace.area[i].sslot < minsslot) { 1500 minsslot = slotspace.area[i].sslot; 1501 minnslot = slotspace.area[i].nslot; 1502 } 1503 } 1504 1505 /* No hole anymore, stop here. */ 1506 if (minsslot == 512) { 1507 break; 1508 } 1509 1510 /* Register the hole. */ 1511 if (minsslot - curslot >= nslots) { 1512 holes[nholes].start = curslot; 1513 holes[nholes].end = minsslot; 1514 nholes++; 1515 } 1516 1517 /* Skip that hole, and iterate again. */ 1518 curslot = minsslot + minnslot; 1519 } 1520 1521 if (nholes == 0) { 1522 panic("%s: impossible", __func__); 1523 } 1524 1525 /* Select a hole. 
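 * (Illustrative, not in the original: randhole and randva are reduced
 *  modulo the number of holes and the window size below, e.g. with
 *  nholes == 2 a randhole of 7 picks holes[1]; with NO_X86_ASLR both
 *  collapse to 0 and the layout becomes deterministic.)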
*/ 1526 hole = randhole; 1527 #ifdef NO_X86_ASLR 1528 hole = 0; 1529 #endif 1530 hole %= nholes; 1531 startsl = holes[hole].start; 1532 endsl = holes[hole].end; 1533 startva = VA_SIGN_NEG(startsl * NBPD_L4); 1534 1535 /* Select an area within the hole. */ 1536 va = randva; 1537 #ifdef NO_X86_ASLR 1538 va = 0; 1539 #endif 1540 winsize = ((endsl - startsl) * NBPD_L4) - sz; 1541 va %= winsize; 1542 va = rounddown(va, align); 1543 va += startva; 1544 1545 /* Update the entry. */ 1546 slotspace.area[type].sslot = pl4_i(va); 1547 slotspace.area[type].nslot = 1548 pmap_pagetree_nentries_range(va, va+sz, NBPD_L4); 1549 slotspace.area[type].active = true; 1550 1551 return va; 1552 } 1553 #endif 1554 1555 #ifdef __HAVE_PCPU_AREA 1556 static void 1557 pmap_init_pcpu(void) 1558 { 1559 const vaddr_t startva = PMAP_PCPU_BASE; 1560 size_t nL4e, nL3e, nL2e, nL1e; 1561 size_t L4e_idx, L3e_idx, L2e_idx, L1e_idx __diagused; 1562 paddr_t pa; 1563 vaddr_t endva; 1564 vaddr_t tmpva; 1565 pt_entry_t *pte; 1566 size_t size; 1567 int i; 1568 1569 const pd_entry_t pteflags = PTE_P | PTE_W | pmap_pg_nx; 1570 1571 size = sizeof(struct pcpu_area); 1572 1573 endva = startva + size; 1574 1575 /* We will use this temporary va. */ 1576 tmpva = bootspace.spareva; 1577 pte = PTE_BASE + pl1_i(tmpva); 1578 1579 /* Build L4 */ 1580 L4e_idx = pl4_i(startva); 1581 nL4e = pmap_pagetree_nentries_range(startva, endva, NBPD_L4); 1582 KASSERT(nL4e == 1); 1583 for (i = 0; i < nL4e; i++) { 1584 KASSERT(L4_BASE[L4e_idx+i] == 0); 1585 1586 pa = pmap_bootstrap_palloc(1); 1587 *pte = (pa & PTE_FRAME) | pteflags; 1588 pmap_update_pg(tmpva); 1589 memset(PAGE_ALIGNED(tmpva), 0, PAGE_SIZE); 1590 1591 L4_BASE[L4e_idx+i] = pa | pteflags | PTE_A; 1592 } 1593 1594 /* Build L3 */ 1595 L3e_idx = pl3_i(startva); 1596 nL3e = pmap_pagetree_nentries_range(startva, endva, NBPD_L3); 1597 for (i = 0; i < nL3e; i++) { 1598 KASSERT(L3_BASE[L3e_idx+i] == 0); 1599 1600 pa = pmap_bootstrap_palloc(1); 1601 *pte = (pa & PTE_FRAME) | pteflags; 1602 pmap_update_pg(tmpva); 1603 memset(PAGE_ALIGNED(tmpva), 0, PAGE_SIZE); 1604 1605 L3_BASE[L3e_idx+i] = pa | pteflags | PTE_A; 1606 } 1607 1608 /* Build L2 */ 1609 L2e_idx = pl2_i(startva); 1610 nL2e = pmap_pagetree_nentries_range(startva, endva, NBPD_L2); 1611 for (i = 0; i < nL2e; i++) { 1612 1613 KASSERT(L2_BASE[L2e_idx+i] == 0); 1614 1615 pa = pmap_bootstrap_palloc(1); 1616 *pte = (pa & PTE_FRAME) | pteflags; 1617 pmap_update_pg(tmpva); 1618 memset(PAGE_ALIGNED(tmpva), 0, PAGE_SIZE); 1619 1620 L2_BASE[L2e_idx+i] = pa | pteflags | PTE_A; 1621 } 1622 1623 /* Build L1 */ 1624 L1e_idx = pl1_i(startva); 1625 nL1e = pmap_pagetree_nentries_range(startva, endva, NBPD_L1); 1626 for (i = 0; i < nL1e; i++) { 1627 /* 1628 * Nothing to do, the PTEs will be entered via 1629 * pmap_kenter_pa. 
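 * (Illustratively, a later caller fills each slot one page at a time
 *  with something like
 *	pmap_kenter_pa(va, pa, VM_PROT_READ | VM_PROT_WRITE, 0);
 *  using the function defined earlier in this file.)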
1630 */ 1631 KASSERT(L1_BASE[L1e_idx+i] == 0); 1632 } 1633 1634 *pte = 0; 1635 pmap_update_pg(tmpva); 1636 1637 pcpuarea = (struct pcpu_area *)startva; 1638 1639 tlbflush(); 1640 } 1641 #endif 1642 1643 #ifdef __HAVE_DIRECT_MAP 1644 static void 1645 randomize_hole(size_t *randholep, vaddr_t *randvap) 1646 { 1647 struct nist_hash_drbg drbg; 1648 uint8_t seed[NIST_HASH_DRBG_SEEDLEN_BYTES]; 1649 const char p[] = "x86/directmap"; 1650 int error; 1651 1652 entropy_extract(seed, sizeof(seed), 0); 1653 1654 error = nist_hash_drbg_instantiate(&drbg, seed, sizeof(seed), 1655 /*nonce*/NULL, 0, 1656 /*personalization*/p, strlen(p)); 1657 KASSERTMSG(error == 0, "error=%d", error); 1658 1659 error = nist_hash_drbg_generate(&drbg, randholep, sizeof(*randholep), 1660 /*additional*/NULL, 0); 1661 KASSERTMSG(error == 0, "error=%d", error); 1662 1663 error = nist_hash_drbg_generate(&drbg, randvap, sizeof(*randvap), 1664 /*additional*/NULL, 0); 1665 KASSERTMSG(error == 0, "error=%d", error); 1666 1667 explicit_memset(seed, 0, sizeof(seed)); 1668 explicit_memset(&drbg, 0, sizeof(drbg)); 1669 } 1670 1671 /* 1672 * Create the amd64 direct map. Called only once at boot time. We map all of 1673 * the physical memory contiguously using 2MB large pages, with RW permissions. 1674 * However there is a hole: the kernel is mapped with RO permissions. 1675 */ 1676 static void 1677 pmap_init_directmap(struct pmap *kpm) 1678 { 1679 extern phys_ram_seg_t mem_clusters[]; 1680 extern int mem_cluster_cnt; 1681 1682 vaddr_t startva; 1683 size_t nL4e, nL3e, nL2e; 1684 size_t L4e_idx, L3e_idx, L2e_idx; 1685 size_t spahole, epahole; 1686 paddr_t lastpa, pa; 1687 vaddr_t endva; 1688 vaddr_t tmpva; 1689 pt_entry_t *pte; 1690 phys_ram_seg_t *mc; 1691 int i; 1692 size_t randhole; 1693 vaddr_t randva; 1694 1695 const pd_entry_t pteflags = PTE_P | PTE_W | pmap_pg_nx; 1696 const pd_entry_t holepteflags = PTE_P | pmap_pg_nx; 1697 1698 CTASSERT(NL4_SLOT_DIRECT * NBPD_L4 == MAXPHYSMEM); 1699 1700 spahole = roundup(bootspace.head.pa, NBPD_L2); 1701 epahole = rounddown(bootspace.boot.pa, NBPD_L2); 1702 1703 /* Get the last physical address available */ 1704 lastpa = 0; 1705 for (i = 0; i < mem_cluster_cnt; i++) { 1706 mc = &mem_clusters[i]; 1707 lastpa = MAX(lastpa, mc->start + mc->size); 1708 } 1709 1710 /* 1711 * x86_add_cluster should have truncated the memory to MAXPHYSMEM. 1712 */ 1713 if (lastpa > MAXPHYSMEM) { 1714 panic("pmap_init_directmap: lastpa incorrect"); 1715 } 1716 1717 randomize_hole(&randhole, &randva); 1718 startva = slotspace_rand(SLAREA_DMAP, lastpa, NBPD_L2, 1719 randhole, randva); 1720 endva = startva + lastpa; 1721 1722 /* We will use this temporary va. 
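 * (Illustrative, not in the original: bootspace.spareva is a spare kernel
 *  VA reserved by the bootstrap code; each freshly allocated page-table
 *  page is temporarily mapped at tmpva through *pte just long enough to be
 *  zeroed with memset() before it is linked into the direct-map page tree.)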
*/ 1723 tmpva = bootspace.spareva; 1724 pte = PTE_BASE + pl1_i(tmpva); 1725 1726 /* Build L4 */ 1727 L4e_idx = pl4_i(startva); 1728 nL4e = pmap_pagetree_nentries_range(startva, endva, NBPD_L4); 1729 KASSERT(nL4e <= NL4_SLOT_DIRECT); 1730 for (i = 0; i < nL4e; i++) { 1731 KASSERT(L4_BASE[L4e_idx+i] == 0); 1732 1733 pa = pmap_bootstrap_palloc(1); 1734 *pte = (pa & PTE_FRAME) | pteflags; 1735 pmap_update_pg(tmpva); 1736 memset(PAGE_ALIGNED(tmpva), 0, PAGE_SIZE); 1737 1738 L4_BASE[L4e_idx+i] = pa | pteflags | PTE_A; 1739 } 1740 1741 /* Build L3 */ 1742 L3e_idx = pl3_i(startva); 1743 nL3e = pmap_pagetree_nentries_range(startva, endva, NBPD_L3); 1744 for (i = 0; i < nL3e; i++) { 1745 KASSERT(L3_BASE[L3e_idx+i] == 0); 1746 1747 pa = pmap_bootstrap_palloc(1); 1748 *pte = (pa & PTE_FRAME) | pteflags; 1749 pmap_update_pg(tmpva); 1750 memset(PAGE_ALIGNED(tmpva), 0, PAGE_SIZE); 1751 1752 L3_BASE[L3e_idx+i] = pa | pteflags | PTE_A; 1753 } 1754 1755 /* Build L2 */ 1756 L2e_idx = pl2_i(startva); 1757 nL2e = pmap_pagetree_nentries_range(startva, endva, NBPD_L2); 1758 for (i = 0; i < nL2e; i++) { 1759 KASSERT(L2_BASE[L2e_idx+i] == 0); 1760 1761 pa = (paddr_t)(i * NBPD_L2); 1762 1763 if (spahole <= pa && pa < epahole) { 1764 L2_BASE[L2e_idx+i] = pa | holepteflags | PTE_A | 1765 PTE_PS | pmap_pg_g; 1766 } else { 1767 L2_BASE[L2e_idx+i] = pa | pteflags | PTE_A | 1768 PTE_PS | pmap_pg_g; 1769 } 1770 } 1771 1772 *pte = 0; 1773 pmap_update_pg(tmpva); 1774 1775 pmap_direct_base = startva; 1776 pmap_direct_end = endva; 1777 1778 tlbflush(); 1779 } 1780 #endif /* __HAVE_DIRECT_MAP */ 1781 1782 #if !defined(XENPV) 1783 /* 1784 * Remap all of the virtual pages created so far with the PTE_G bit. 1785 */ 1786 static void 1787 pmap_remap_global(void) 1788 { 1789 vaddr_t kva, kva_end; 1790 unsigned long p1i; 1791 size_t i; 1792 1793 /* head */ 1794 kva = bootspace.head.va; 1795 kva_end = kva + bootspace.head.sz; 1796 for ( ; kva < kva_end; kva += PAGE_SIZE) { 1797 p1i = pl1_i(kva); 1798 if (pmap_valid_entry(PTE_BASE[p1i])) 1799 PTE_BASE[p1i] |= pmap_pg_g; 1800 } 1801 1802 /* kernel segments */ 1803 for (i = 0; i < BTSPACE_NSEGS; i++) { 1804 if (bootspace.segs[i].type == BTSEG_NONE) { 1805 continue; 1806 } 1807 kva = bootspace.segs[i].va; 1808 kva_end = kva + bootspace.segs[i].sz; 1809 for ( ; kva < kva_end; kva += PAGE_SIZE) { 1810 p1i = pl1_i(kva); 1811 if (pmap_valid_entry(PTE_BASE[p1i])) 1812 PTE_BASE[p1i] |= pmap_pg_g; 1813 } 1814 } 1815 1816 /* boot space */ 1817 kva = bootspace.boot.va; 1818 kva_end = kva + bootspace.boot.sz; 1819 for ( ; kva < kva_end; kva += PAGE_SIZE) { 1820 p1i = pl1_i(kva); 1821 if (pmap_valid_entry(PTE_BASE[p1i])) 1822 PTE_BASE[p1i] |= pmap_pg_g; 1823 } 1824 } 1825 #endif 1826 1827 #ifndef XENPV 1828 /* 1829 * Remap several kernel segments with large pages. We cover as many pages as we 1830 * can. Called only once at boot time, if the CPU supports large pages. 1831 */ 1832 static void 1833 pmap_remap_largepages(void) 1834 { 1835 pd_entry_t *pde; 1836 vaddr_t kva, kva_end; 1837 paddr_t pa; 1838 size_t i; 1839 1840 /* Remap the kernel text using large pages. 
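 * (Illustrative, not in the original comment: each 2MB-aligned chunk of a
 *  text segment gets a single L2 entry of the form
 *	pa | pmap_pg_g | PTE_PS | PTE_P
 *  i.e. global, large-page, present, executable and read-only, since
 *  neither PTE_W nor pmap_pg_nx is set; the rodata and data loops below
 *  add pmap_pg_nx and PTE_W as appropriate.)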
*/ 1841 for (i = 0; i < BTSPACE_NSEGS; i++) { 1842 if (bootspace.segs[i].type != BTSEG_TEXT) { 1843 continue; 1844 } 1845 kva = roundup(bootspace.segs[i].va, NBPD_L2); 1846 if (kva < bootspace.segs[i].va) { 1847 continue; 1848 } 1849 kva_end = rounddown(bootspace.segs[i].va + 1850 bootspace.segs[i].sz, NBPD_L2); 1851 pa = roundup(bootspace.segs[i].pa, NBPD_L2); 1852 for (/* */; kva < kva_end; kva += NBPD_L2, pa += NBPD_L2) { 1853 pde = &L2_BASE[pl2_i(kva)]; 1854 *pde = pa | pmap_pg_g | PTE_PS | PTE_P; 1855 tlbflushg(); 1856 } 1857 } 1858 1859 /* Remap the kernel rodata using large pages. */ 1860 for (i = 0; i < BTSPACE_NSEGS; i++) { 1861 if (bootspace.segs[i].type != BTSEG_RODATA) { 1862 continue; 1863 } 1864 kva = roundup(bootspace.segs[i].va, NBPD_L2); 1865 if (kva < bootspace.segs[i].va) { 1866 continue; 1867 } 1868 kva_end = rounddown(bootspace.segs[i].va + 1869 bootspace.segs[i].sz, NBPD_L2); 1870 pa = roundup(bootspace.segs[i].pa, NBPD_L2); 1871 for (/* */; kva < kva_end; kva += NBPD_L2, pa += NBPD_L2) { 1872 pde = &L2_BASE[pl2_i(kva)]; 1873 *pde = pa | pmap_pg_g | PTE_PS | pmap_pg_nx | PTE_P; 1874 tlbflushg(); 1875 } 1876 } 1877 1878 /* Remap the kernel data+bss using large pages. */ 1879 for (i = 0; i < BTSPACE_NSEGS; i++) { 1880 if (bootspace.segs[i].type != BTSEG_DATA) { 1881 continue; 1882 } 1883 kva = roundup(bootspace.segs[i].va, NBPD_L2); 1884 if (kva < bootspace.segs[i].va) { 1885 continue; 1886 } 1887 kva_end = rounddown(bootspace.segs[i].va + 1888 bootspace.segs[i].sz, NBPD_L2); 1889 pa = roundup(bootspace.segs[i].pa, NBPD_L2); 1890 for (/* */; kva < kva_end; kva += NBPD_L2, pa += NBPD_L2) { 1891 pde = &L2_BASE[pl2_i(kva)]; 1892 *pde = pa | pmap_pg_g | PTE_PS | pmap_pg_nx | PTE_W | PTE_P; 1893 tlbflushg(); 1894 } 1895 } 1896 } 1897 #endif /* !XENPV */ 1898 1899 /* 1900 * pmap_init: called from uvm_init, our job is to get the pmap system ready 1901 * to manage mappings. 1902 */ 1903 void 1904 pmap_init(void) 1905 { 1906 int flags; 1907 1908 /* 1909 * initialize caches. 1910 */ 1911 1912 pool_cache_bootstrap(&pmap_cache, sizeof(struct pmap), COHERENCY_UNIT, 1913 0, 0, "pmappl", NULL, IPL_NONE, pmap_ctor, pmap_dtor, NULL); 1914 1915 #ifdef XENPV 1916 /* 1917 * pool_cache(9) should not touch cached objects, since they 1918 * are pinned on xen and R/O for the domU 1919 */ 1920 flags = PR_NOTOUCH; 1921 #else 1922 flags = 0; 1923 #endif 1924 1925 #ifdef PAE 1926 pool_init(&pmap_pdp_pool, PAGE_SIZE * PDP_SIZE, 0, 0, flags, 1927 "pdppl", &pmap_pdp_allocator, IPL_NONE); 1928 #else 1929 pool_init(&pmap_pdp_pool, PAGE_SIZE, 0, 0, flags, 1930 "pdppl", NULL, IPL_NONE); 1931 #endif 1932 pool_cache_bootstrap(&pmap_pvp_cache, PAGE_SIZE, PAGE_SIZE, 1933 0, 0, "pvpage", &pool_allocator_kmem, 1934 IPL_NONE, pmap_pvp_ctor, pmap_pvp_dtor, NULL); 1935 1936 pmap_tlb_init(); 1937 1938 /* XXX: Since cpu_hatch() is only for secondary CPUs. */ 1939 pmap_tlb_cpu_init(curcpu()); 1940 1941 evcnt_attach_dynamic(&pmap_iobmp_evcnt, EVCNT_TYPE_MISC, 1942 NULL, "x86", "io bitmap copy"); 1943 evcnt_attach_dynamic(&pmap_ldt_evcnt, EVCNT_TYPE_MISC, 1944 NULL, "x86", "ldt sync"); 1945 1946 /* 1947 * The kernel doesn't keep track of PTPs, so there's nowhere handy 1948 * to hang a tree of pv_entry records. Dynamically allocated 1949 * pv_entry lists are not heavily used in the kernel's pmap (the 1950 * usual case is embedded), so cop out and use a single RB tree 1951 * to cover them. 
1952 */ 1953 rb_tree_init(&pmap_kernel_rb, &pmap_rbtree_ops); 1954 1955 /* 1956 * done: pmap module is up (and ready for business) 1957 */ 1958 1959 pmap_initialized = true; 1960 } 1961 1962 #ifndef XENPV 1963 /* 1964 * pmap_cpu_init_late: perform late per-CPU initialization. 1965 */ 1966 void 1967 pmap_cpu_init_late(struct cpu_info *ci) 1968 { 1969 /* 1970 * The BP has already its own PD page allocated during early 1971 * MD startup. 1972 */ 1973 if (ci == &cpu_info_primary) 1974 return; 1975 #ifdef PAE 1976 cpu_alloc_l3_page(ci); 1977 #endif 1978 } 1979 #endif 1980 1981 #ifndef __HAVE_DIRECT_MAP 1982 CTASSERT(CACHE_LINE_SIZE > sizeof(pt_entry_t)); 1983 CTASSERT(CACHE_LINE_SIZE % sizeof(pt_entry_t) == 0); 1984 1985 static void 1986 pmap_vpage_cpualloc(struct cpu_info *ci) 1987 { 1988 bool primary = (ci == &cpu_info_primary); 1989 size_t i, npages; 1990 vaddr_t vabase; 1991 vsize_t vrange; 1992 1993 npages = (CACHE_LINE_SIZE / sizeof(pt_entry_t)); 1994 KASSERT(npages >= VPAGE_MAX); 1995 vrange = npages * PAGE_SIZE; 1996 1997 if (primary) { 1998 while ((vabase = pmap_bootstrap_valloc(1)) % vrange != 0) { 1999 /* Waste some pages to align properly */ 2000 } 2001 /* The base is aligned, allocate the rest (contiguous) */ 2002 pmap_bootstrap_valloc(npages - 1); 2003 } else { 2004 vabase = uvm_km_alloc(kernel_map, vrange, vrange, 2005 UVM_KMF_VAONLY); 2006 if (vabase == 0) { 2007 panic("%s: failed to allocate tmp VA for CPU %d\n", 2008 __func__, cpu_index(ci)); 2009 } 2010 } 2011 2012 KASSERT((vaddr_t)&PTE_BASE[pl1_i(vabase)] % CACHE_LINE_SIZE == 0); 2013 2014 for (i = 0; i < VPAGE_MAX; i++) { 2015 ci->vpage[i] = vabase + i * PAGE_SIZE; 2016 ci->vpage_pte[i] = PTE_BASE + pl1_i(ci->vpage[i]); 2017 } 2018 } 2019 2020 void 2021 pmap_vpage_cpu_init(struct cpu_info *ci) 2022 { 2023 if (ci == &cpu_info_primary) { 2024 /* cpu0 already taken care of in pmap_bootstrap */ 2025 return; 2026 } 2027 2028 pmap_vpage_cpualloc(ci); 2029 } 2030 #endif 2031 2032 /* 2033 * p v _ e n t r y f u n c t i o n s 2034 */ 2035 2036 /* 2037 * pmap_pvp_dtor: pool_cache constructor for PV pages. 2038 */ 2039 static int 2040 pmap_pvp_ctor(void *arg, void *obj, int flags) 2041 { 2042 struct pv_page *pvp = (struct pv_page *)obj; 2043 struct pv_entry *pve = (struct pv_entry *)obj + 1; 2044 struct pv_entry *maxpve = pve + PVE_PER_PVP; 2045 2046 KASSERT(sizeof(struct pv_page) <= sizeof(struct pv_entry)); 2047 KASSERT(trunc_page((vaddr_t)obj) == (vaddr_t)obj); 2048 2049 LIST_INIT(&pvp->pvp_pves); 2050 pvp->pvp_nfree = PVE_PER_PVP; 2051 pvp->pvp_pmap = NULL; 2052 2053 for (; pve < maxpve; pve++) { 2054 LIST_INSERT_HEAD(&pvp->pvp_pves, pve, pve_list); 2055 } 2056 2057 return 0; 2058 } 2059 2060 /* 2061 * pmap_pvp_dtor: pool_cache destructor for PV pages. 2062 */ 2063 static void 2064 pmap_pvp_dtor(void *arg, void *obj) 2065 { 2066 struct pv_page *pvp __diagused = obj; 2067 2068 KASSERT(pvp->pvp_pmap == NULL); 2069 KASSERT(pvp->pvp_nfree == PVE_PER_PVP); 2070 } 2071 2072 /* 2073 * pmap_alloc_pv: allocate a PV entry (likely cached with pmap). 
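 *
 * PV entries are carved out of page-sized pv_pages; each pmap keeps
 * its pv_pages on one of three lists, named after how many FREE
 * entries the page still holds:
 *
 *	list           pvp_nfree               meaning
 *	pm_pvp_full    == PVE_PER_PVP          every entry free
 *	pm_pvp_part    1 .. PVE_PER_PVP - 1    partially used
 *	pm_pvp_empty   == 0                    no free entries left
 *
 * Allocation always serves from a "part" page, refilling from "full"
 * or from the global pool cache when "part" is empty, and a page
 * migrates part -> empty when its last free entry is handed out;
 * pmap_free_pv() moves pages back the other way.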
2074 */ 2075 static struct pv_entry * 2076 pmap_alloc_pv(struct pmap *pmap) 2077 { 2078 struct pv_entry *pve; 2079 struct pv_page *pvp; 2080 2081 KASSERT(mutex_owned(&pmap->pm_lock)); 2082 2083 if (__predict_false((pvp = LIST_FIRST(&pmap->pm_pvp_part)) == NULL)) { 2084 if ((pvp = LIST_FIRST(&pmap->pm_pvp_full)) != NULL) { 2085 LIST_REMOVE(pvp, pvp_list); 2086 } else { 2087 pvp = pool_cache_get(&pmap_pvp_cache, PR_NOWAIT); 2088 } 2089 if (__predict_false(pvp == NULL)) { 2090 return NULL; 2091 } 2092 /* full -> part */ 2093 LIST_INSERT_HEAD(&pmap->pm_pvp_part, pvp, pvp_list); 2094 pvp->pvp_pmap = pmap; 2095 } 2096 2097 KASSERT(pvp->pvp_pmap == pmap); 2098 KASSERT(pvp->pvp_nfree > 0); 2099 2100 pve = LIST_FIRST(&pvp->pvp_pves); 2101 LIST_REMOVE(pve, pve_list); 2102 pvp->pvp_nfree--; 2103 2104 if (__predict_false(pvp->pvp_nfree == 0)) { 2105 /* part -> empty */ 2106 KASSERT(LIST_EMPTY(&pvp->pvp_pves)); 2107 LIST_REMOVE(pvp, pvp_list); 2108 LIST_INSERT_HEAD(&pmap->pm_pvp_empty, pvp, pvp_list); 2109 } else { 2110 KASSERT(!LIST_EMPTY(&pvp->pvp_pves)); 2111 } 2112 2113 return pve; 2114 } 2115 2116 /* 2117 * pmap_free_pv: delayed free of a PV entry. 2118 */ 2119 static void 2120 pmap_free_pv(struct pmap *pmap, struct pv_entry *pve) 2121 { 2122 struct pv_page *pvp = (struct pv_page *)trunc_page((vaddr_t)pve); 2123 2124 KASSERT(mutex_owned(&pmap->pm_lock)); 2125 KASSERT(pvp->pvp_pmap == pmap); 2126 KASSERT(pvp->pvp_nfree >= 0); 2127 2128 LIST_INSERT_HEAD(&pvp->pvp_pves, pve, pve_list); 2129 pvp->pvp_nfree++; 2130 2131 if (__predict_false(pvp->pvp_nfree == 1)) { 2132 /* empty -> part */ 2133 LIST_REMOVE(pvp, pvp_list); 2134 LIST_INSERT_HEAD(&pmap->pm_pvp_part, pvp, pvp_list); 2135 } else if (__predict_false(pvp->pvp_nfree == PVE_PER_PVP)) { 2136 /* part -> full */ 2137 LIST_REMOVE(pvp, pvp_list); 2138 LIST_INSERT_HEAD(&pmap->pm_pvp_full, pvp, pvp_list); 2139 } 2140 } 2141 2142 /* 2143 * pmap_drain_pv: free full PV pages. 
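 *
 * "Full" means full of free entries (pvp_nfree == PVE_PER_PVP), i.e.
 * no pv_entry on the page is in use any more, so the whole page can
 * go back to the pool cache.  Individual entries never need a back
 * pointer to their pv_page: the header is the first object in the
 * page, so it is recovered by page-aligned truncation, as
 * pmap_free_pv() above does:
 *
 *	pvp = (struct pv_page *)trunc_page((vaddr_t)pve);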
2144 */ 2145 static void 2146 pmap_drain_pv(struct pmap *pmap) 2147 { 2148 struct pv_page *pvp; 2149 2150 KASSERT(mutex_owned(&pmap->pm_lock)); 2151 2152 while ((pvp = LIST_FIRST(&pmap->pm_pvp_full)) != NULL) { 2153 LIST_REMOVE(pvp, pvp_list); 2154 KASSERT(pvp->pvp_pmap == pmap); 2155 KASSERT(pvp->pvp_nfree == PVE_PER_PVP); 2156 pvp->pvp_pmap = NULL; 2157 pool_cache_put(&pmap_pvp_cache, pvp); 2158 } 2159 } 2160 2161 /* 2162 * pmap_check_pv: verify {VA, PTP} pair is either tracked/untracked by page 2163 */ 2164 static void 2165 pmap_check_pv(struct pmap *pmap, struct vm_page *ptp, struct pmap_page *pp, 2166 vaddr_t va, bool tracked) 2167 { 2168 #ifdef DEBUG 2169 struct pv_pte *pvpte; 2170 2171 PMAP_CHECK_PP(pp); 2172 2173 mutex_spin_enter(&pp->pp_lock); 2174 for (pvpte = pv_pte_first(pp); pvpte; pvpte = pv_pte_next(pp, pvpte)) { 2175 if (pvpte->pte_ptp == ptp && pvpte->pte_va == va) { 2176 break; 2177 } 2178 } 2179 mutex_spin_exit(&pp->pp_lock); 2180 2181 if (pvpte && !tracked) { 2182 panic("pmap_check_pv: %p/%lx found on pp %p", ptp, va, pp); 2183 } else if (!pvpte && tracked) { 2184 panic("pmap_check_pv: %p/%lx missing on pp %p", ptp, va, pp); 2185 } 2186 #endif 2187 } 2188 2189 /* 2190 * pmap_treelookup_pv: search the PV tree for a dynamic entry 2191 * 2192 * => pmap must be locked 2193 */ 2194 static struct pv_entry * 2195 pmap_treelookup_pv(const struct pmap *pmap, const struct vm_page *ptp, 2196 const rb_tree_t *tree, const vaddr_t va) 2197 { 2198 struct pv_entry *pve; 2199 rb_node_t *node; 2200 2201 /* 2202 * Inlined lookup tailored for exactly what's needed here that is 2203 * quite a bit faster than using rb_tree_find_node(). 2204 */ 2205 for (node = tree->rbt_root;;) { 2206 if (__predict_false(RB_SENTINEL_P(node))) { 2207 return NULL; 2208 } 2209 pve = (struct pv_entry *) 2210 ((uintptr_t)node - offsetof(struct pv_entry, pve_rb)); 2211 if (pve->pve_pte.pte_va == va) { 2212 KASSERT(pve->pve_pte.pte_ptp == ptp); 2213 return pve; 2214 } 2215 node = node->rb_nodes[pve->pve_pte.pte_va < va]; 2216 } 2217 } 2218 2219 /* 2220 * pmap_lookup_pv: look up a non-embedded pv entry for the given pmap 2221 * 2222 * => a PV entry must be known present (doesn't check for existence) 2223 * => pmap must be locked 2224 */ 2225 static struct pv_entry * 2226 pmap_lookup_pv(const struct pmap *pmap, const struct vm_page *ptp, 2227 const struct pmap_page * const old_pp, const vaddr_t va) 2228 { 2229 struct pv_entry *pve; 2230 const rb_tree_t *tree; 2231 2232 KASSERT(mutex_owned(&pmap->pm_lock)); 2233 KASSERT(ptp != NULL || pmap == pmap_kernel()); 2234 2235 /* 2236 * [This mostly deals with the case of process-private pages, i.e. 2237 * anonymous memory allocations or COW.] 2238 * 2239 * If the page is tracked with an embedded entry then the tree 2240 * lookup can be avoided. It's safe to check for this specific 2241 * set of values without pp_lock because both will only ever be 2242 * set together for this pmap. 2243 * 2244 */ 2245 if (atomic_load_relaxed(&old_pp->pp_pte.pte_ptp) == ptp && 2246 atomic_load_relaxed(&old_pp->pp_pte.pte_va) == va) { 2247 return NULL; 2248 } 2249 2250 /* 2251 * [This mostly deals with shared mappings, for example shared libs 2252 * and executables.] 2253 * 2254 * Optimise for pmap_remove_ptes() which works by ascending scan: 2255 * look at the lowest numbered node in the tree first. The tree is 2256 * known non-empty because of the check above. For short lived 2257 * processes where pmap_remove() isn't used much this gets close to 2258 * a 100% hit rate. 
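 *
 * The lowest node is available in O(1) from the tree header
 * (rbt_minmax[RB_DIR_LEFT]), so the fast path amounts to:
 *
 *	pve = lowest node in tree;
 *	if (pve->pve_pte.pte_va == va)
 *		return pve;                              -- common case
 *	return pmap_treelookup_pv(pmap, ptp, tree, va);  -- full descent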
2259 */ 2260 tree = (ptp != NULL ? &VM_PAGE_TO_PP(ptp)->pp_rb : &pmap_kernel_rb); 2261 KASSERT(!RB_SENTINEL_P(tree->rbt_root)); 2262 pve = (struct pv_entry *) 2263 ((uintptr_t)tree->rbt_minmax[RB_DIR_LEFT] - 2264 offsetof(struct pv_entry, pve_rb)); 2265 if (__predict_true(pve->pve_pte.pte_va == va)) { 2266 KASSERT(pve->pve_pte.pte_ptp == ptp); 2267 return pve; 2268 } 2269 2270 /* Search the RB tree for the key (uncommon). */ 2271 return pmap_treelookup_pv(pmap, ptp, tree, va); 2272 } 2273 2274 /* 2275 * pmap_enter_pv: enter a mapping onto a pmap_page lst 2276 * 2277 * => pmap must be locked 2278 * => does NOT insert dynamic entries to tree (pmap_enter() does later) 2279 */ 2280 static int 2281 pmap_enter_pv(struct pmap *pmap, struct pmap_page *pp, struct vm_page *ptp, 2282 vaddr_t va, struct pv_entry **new_pve, struct pv_entry **old_pve, 2283 bool *samepage, bool *new_embedded, rb_tree_t *tree) 2284 { 2285 struct pv_entry *pve; 2286 int error; 2287 2288 KASSERT(mutex_owned(&pmap->pm_lock)); 2289 KASSERT(ptp_to_pmap(ptp) == pmap); 2290 KASSERT(ptp == NULL || ptp->uobject != NULL); 2291 KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset); 2292 PMAP_CHECK_PP(pp); 2293 2294 /* 2295 * If entering the same page and it's already tracked with an 2296 * embedded entry, we can avoid the expense below. It's safe 2297 * to check for this very specific set of values without a lock 2298 * because both will only ever be set together for this pmap. 2299 */ 2300 if (atomic_load_relaxed(&pp->pp_pte.pte_ptp) == ptp && 2301 atomic_load_relaxed(&pp->pp_pte.pte_va) == va) { 2302 *samepage = true; 2303 pmap_check_pv(pmap, ptp, pp, va, true); 2304 return 0; 2305 } 2306 2307 /* 2308 * Check for an existing dynamic mapping at this address. If it's 2309 * for the same page, then it will be reused and nothing needs to be 2310 * changed. 2311 */ 2312 *old_pve = pmap_treelookup_pv(pmap, ptp, tree, va); 2313 if (*old_pve != NULL && (*old_pve)->pve_pp == pp) { 2314 *samepage = true; 2315 pmap_check_pv(pmap, ptp, pp, va, true); 2316 return 0; 2317 } 2318 2319 /* 2320 * Need to put a new mapping in place. Grab a spare pv_entry in 2321 * case it's needed; won't know for sure until the lock is taken. 2322 */ 2323 if (pmap->pm_pve == NULL) { 2324 pmap->pm_pve = pmap_alloc_pv(pmap); 2325 } 2326 2327 error = 0; 2328 pmap_check_pv(pmap, ptp, pp, va, false); 2329 mutex_spin_enter(&pp->pp_lock); 2330 if (!pv_pte_embedded(pp)) { 2331 /* 2332 * Embedded PV tracking available - easy. 2333 */ 2334 pp->pp_pte.pte_ptp = ptp; 2335 pp->pp_pte.pte_va = va; 2336 *new_embedded = true; 2337 } else if (__predict_false(pmap->pm_pve == NULL)) { 2338 /* 2339 * No memory. 2340 */ 2341 error = ENOMEM; 2342 } else { 2343 /* 2344 * Install new pv_entry on the page. 
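 *
 * This is the dynamic path: the page's embedded slot (pp_pte) is
 * already taken by another mapping of the same physical page, so the
 * pv_entry pre-allocated above into pmap->pm_pve is consumed.  The
 * tracking for one physical page therefore grows roughly as:
 *
 *	1st mapping            -> embedded, recorded in pp->pp_pte
 *	2nd and later mappings -> one struct pv_entry each, linked on
 *	                          pp->pp_pvlist here and inserted into
 *	                          the VA tree later by the caller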
2345 */ 2346 pve = pmap->pm_pve; 2347 pmap->pm_pve = NULL; 2348 *new_pve = pve; 2349 pve->pve_pte.pte_ptp = ptp; 2350 pve->pve_pte.pte_va = va; 2351 pve->pve_pp = pp; 2352 LIST_INSERT_HEAD(&pp->pp_pvlist, pve, pve_list); 2353 } 2354 mutex_spin_exit(&pp->pp_lock); 2355 if (error == 0) { 2356 pmap_check_pv(pmap, ptp, pp, va, true); 2357 } 2358 2359 return error; 2360 } 2361 2362 /* 2363 * pmap_remove_pv: try to remove a mapping from a pv_list 2364 * 2365 * => pmap must be locked 2366 * => removes dynamic entries from tree and frees them 2367 * => caller should adjust ptp's wire_count and free PTP if needed 2368 */ 2369 static void 2370 pmap_remove_pv(struct pmap *pmap, struct pmap_page *pp, struct vm_page *ptp, 2371 vaddr_t va, struct pv_entry *pve, uint8_t oattrs) 2372 { 2373 rb_tree_t *tree = (ptp != NULL ? 2374 &VM_PAGE_TO_PP(ptp)->pp_rb : &pmap_kernel_rb); 2375 2376 KASSERT(mutex_owned(&pmap->pm_lock)); 2377 KASSERT(ptp_to_pmap(ptp) == pmap); 2378 KASSERT(ptp == NULL || ptp->uobject != NULL); 2379 KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset); 2380 KASSERT(ptp != NULL || pmap == pmap_kernel()); 2381 2382 pmap_check_pv(pmap, ptp, pp, va, true); 2383 2384 if (pve == NULL) { 2385 mutex_spin_enter(&pp->pp_lock); 2386 KASSERT(pp->pp_pte.pte_ptp == ptp); 2387 KASSERT(pp->pp_pte.pte_va == va); 2388 pp->pp_attrs |= oattrs; 2389 pp->pp_pte.pte_ptp = NULL; 2390 pp->pp_pte.pte_va = 0; 2391 mutex_spin_exit(&pp->pp_lock); 2392 } else { 2393 mutex_spin_enter(&pp->pp_lock); 2394 KASSERT(pp->pp_pte.pte_ptp != ptp || 2395 pp->pp_pte.pte_va != va); 2396 KASSERT(pve->pve_pte.pte_ptp == ptp); 2397 KASSERT(pve->pve_pte.pte_va == va); 2398 KASSERT(pve->pve_pp == pp); 2399 pp->pp_attrs |= oattrs; 2400 LIST_REMOVE(pve, pve_list); 2401 mutex_spin_exit(&pp->pp_lock); 2402 2403 KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == pve); 2404 rb_tree_remove_node(tree, pve); 2405 #ifdef DIAGNOSTIC 2406 memset(pve, 0, sizeof(*pve)); 2407 #endif 2408 pmap_free_pv(pmap, pve); 2409 } 2410 2411 KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == NULL); 2412 pmap_check_pv(pmap, ptp, pp, va, false); 2413 } 2414 2415 /* 2416 * p t p f u n c t i o n s 2417 */ 2418 2419 static struct vm_page * 2420 pmap_find_ptp(struct pmap *pmap, vaddr_t va, int level) 2421 { 2422 int lidx = level - 1; 2423 off_t off = ptp_va2o(va, level); 2424 struct vm_page *pg; 2425 2426 KASSERT(mutex_owned(&pmap->pm_lock)); 2427 2428 if (pmap->pm_ptphint[lidx] && off == pmap->pm_ptphint[lidx]->offset) { 2429 KASSERT(pmap->pm_ptphint[lidx]->wire_count > 0); 2430 pg = pmap->pm_ptphint[lidx]; 2431 PMAP_CHECK_PP(VM_PAGE_TO_PP(pg)); 2432 return pg; 2433 } 2434 PMAP_DUMMY_LOCK(pmap); 2435 pg = uvm_pagelookup(&pmap->pm_obj[lidx], off); 2436 PMAP_DUMMY_UNLOCK(pmap); 2437 if (pg != NULL && __predict_false(pg->wire_count == 0)) { 2438 /* This page is queued to be freed - ignore. */ 2439 pg = NULL; 2440 } 2441 if (pg != NULL) { 2442 PMAP_CHECK_PP(VM_PAGE_TO_PP(pg)); 2443 } 2444 pmap->pm_ptphint[lidx] = pg; 2445 return pg; 2446 } 2447 2448 static inline void 2449 pmap_freepage(struct pmap *pmap, struct vm_page *ptp, int level) 2450 { 2451 int lidx; 2452 2453 KASSERT(ptp->wire_count <= 1); 2454 PMAP_CHECK_PP(VM_PAGE_TO_PP(ptp)); 2455 2456 lidx = level - 1; 2457 pmap_stats_update(pmap, -ptp->wire_count, 0); 2458 if (pmap->pm_ptphint[lidx] == ptp) 2459 pmap->pm_ptphint[lidx] = NULL; 2460 ptp->wire_count = 0; 2461 ptp->uanon = NULL; 2462 KASSERT(RB_TREE_MIN(&VM_PAGE_TO_PP(ptp)->pp_rb) == NULL); 2463 2464 /* 2465 * Enqueue the PTP to be freed by pmap_update(). 
We can't remove 2466 * the page from the uvm_object, as that can take further locks 2467 * (intolerable right now because the PTEs are likely mapped in). 2468 * Instead mark the PTP as free and if we bump into it again, we'll 2469 * either ignore or reuse (depending on what's useful at the time). 2470 */ 2471 LIST_INSERT_HEAD(&pmap->pm_gc_ptp, ptp, mdpage.mp_pp.pp_link); 2472 } 2473 2474 static void 2475 pmap_free_ptp(struct pmap *pmap, struct vm_page *ptp, vaddr_t va, 2476 pt_entry_t *ptes, pd_entry_t * const *pdes) 2477 { 2478 unsigned long index; 2479 int level; 2480 vaddr_t invaladdr; 2481 pd_entry_t opde; 2482 2483 KASSERT(pmap != pmap_kernel()); 2484 KASSERT(mutex_owned(&pmap->pm_lock)); 2485 KASSERT(kpreempt_disabled()); 2486 2487 level = 1; 2488 do { 2489 index = pl_i(va, level + 1); 2490 opde = pmap_pte_testset(&pdes[level - 1][index], 0); 2491 2492 /* 2493 * On Xen-amd64 or SVS, we need to sync the top level page 2494 * directory on each CPU. 2495 */ 2496 #if defined(XENPV) && defined(__x86_64__) 2497 if (level == PTP_LEVELS - 1) { 2498 xen_kpm_sync(pmap, index); 2499 } 2500 #elif defined(SVS) 2501 if (svs_enabled && level == PTP_LEVELS - 1 && 2502 pmap_is_user(pmap)) { 2503 svs_pmap_sync(pmap, index); 2504 } 2505 #endif 2506 2507 invaladdr = level == 1 ? (vaddr_t)ptes : 2508 (vaddr_t)pdes[level - 2]; 2509 pmap_tlb_shootdown(pmap, invaladdr + index * PAGE_SIZE, 2510 opde, TLBSHOOT_FREE_PTP); 2511 2512 #if defined(XENPV) 2513 pmap_tlb_shootnow(); 2514 #endif 2515 2516 pmap_freepage(pmap, ptp, level); 2517 if (level < PTP_LEVELS - 1) { 2518 ptp = pmap_find_ptp(pmap, va, level + 1); 2519 ptp->wire_count--; 2520 if (ptp->wire_count > 1) 2521 break; 2522 } 2523 } while (++level < PTP_LEVELS); 2524 pmap_pte_flush(); 2525 } 2526 2527 /* 2528 * pmap_get_ptp: get a PTP (if there isn't one, allocate a new one) 2529 * 2530 * => pmap should NOT be pmap_kernel() 2531 * => pmap should be locked 2532 * => we are not touching any PTEs yet, so they need not be mapped in 2533 */ 2534 static int 2535 pmap_get_ptp(struct pmap *pmap, struct pmap_ptparray *pt, vaddr_t va, 2536 int flags, struct vm_page **resultp) 2537 { 2538 struct vm_page *ptp; 2539 int i, aflags; 2540 struct uvm_object *obj; 2541 voff_t off; 2542 2543 KASSERT(pmap != pmap_kernel()); 2544 KASSERT(mutex_owned(&pmap->pm_lock)); 2545 2546 /* 2547 * Loop through all page table levels allocating a page 2548 * for any level where we don't already have one. 2549 */ 2550 memset(pt, 0, sizeof(*pt)); 2551 aflags = ((flags & PMAP_CANFAIL) ? 0 : UVM_PGA_USERESERVE) | 2552 UVM_PGA_ZERO; 2553 for (i = PTP_LEVELS; i > 1; i--) { 2554 obj = &pmap->pm_obj[i - 2]; 2555 off = ptp_va2o(va, i - 1); 2556 2557 PMAP_DUMMY_LOCK(pmap); 2558 pt->pg[i] = uvm_pagelookup(obj, off); 2559 2560 if (pt->pg[i] == NULL) { 2561 pt->pg[i] = uvm_pagealloc(obj, off, NULL, aflags); 2562 pt->alloced[i] = (pt->pg[i] != NULL); 2563 } else if (pt->pg[i]->wire_count == 0) { 2564 /* This page was queued to be freed; dequeue it. 
*/ 2565 LIST_REMOVE(pt->pg[i], mdpage.mp_pp.pp_link); 2566 pt->alloced[i] = true; 2567 } 2568 PMAP_DUMMY_UNLOCK(pmap); 2569 if (pt->pg[i] == NULL) { 2570 pmap_unget_ptp(pmap, pt); 2571 return ENOMEM; 2572 } else if (pt->alloced[i]) { 2573 pt->pg[i]->uanon = (struct vm_anon *)(vaddr_t)~0L; 2574 rb_tree_init(&VM_PAGE_TO_PP(pt->pg[i])->pp_rb, 2575 &pmap_rbtree_ops); 2576 PMAP_CHECK_PP(VM_PAGE_TO_PP(pt->pg[i])); 2577 } 2578 } 2579 ptp = pt->pg[2]; 2580 KASSERT(ptp != NULL); 2581 *resultp = ptp; 2582 pmap->pm_ptphint[0] = ptp; 2583 return 0; 2584 } 2585 2586 /* 2587 * pmap_install_ptp: install any freshly allocated PTPs 2588 * 2589 * => pmap should NOT be pmap_kernel() 2590 * => pmap should be locked 2591 * => PTEs must be mapped 2592 * => preemption must be disabled 2593 */ 2594 static void 2595 pmap_install_ptp(struct pmap *pmap, struct pmap_ptparray *pt, vaddr_t va, 2596 pd_entry_t * const *pdes) 2597 { 2598 struct vm_page *ptp; 2599 unsigned long index; 2600 pd_entry_t *pva; 2601 paddr_t pa; 2602 int i; 2603 2604 KASSERT(pmap != pmap_kernel()); 2605 KASSERT(mutex_owned(&pmap->pm_lock)); 2606 KASSERT(kpreempt_disabled()); 2607 2608 /* 2609 * Now that we have all the pages looked up or allocated, 2610 * loop through again installing any new ones into the tree. 2611 */ 2612 for (i = PTP_LEVELS; i > 1; i--) { 2613 index = pl_i(va, i); 2614 pva = pdes[i - 2]; 2615 2616 if (pmap_valid_entry(pva[index])) { 2617 KASSERT(!pt->alloced[i]); 2618 continue; 2619 } 2620 2621 ptp = pt->pg[i]; 2622 ptp->flags &= ~PG_BUSY; /* never busy */ 2623 ptp->wire_count = 1; 2624 pmap->pm_ptphint[i - 2] = ptp; 2625 pa = VM_PAGE_TO_PHYS(ptp); 2626 pmap_pte_set(&pva[index], (pd_entry_t) 2627 (pmap_pa2pte(pa) | PTE_U | PTE_W | PTE_P)); 2628 2629 /* 2630 * On Xen-amd64 or SVS, we need to sync the top level page 2631 * directory on each CPU. 2632 */ 2633 #if defined(XENPV) && defined(__x86_64__) 2634 if (i == PTP_LEVELS) { 2635 xen_kpm_sync(pmap, index); 2636 } 2637 #elif defined(SVS) 2638 if (svs_enabled && i == PTP_LEVELS && 2639 pmap_is_user(pmap)) { 2640 svs_pmap_sync(pmap, index); 2641 } 2642 #endif 2643 2644 pmap_pte_flush(); 2645 pmap_stats_update(pmap, 1, 0); 2646 2647 /* 2648 * If we're not in the top level, increase the 2649 * wire count of the parent page. 2650 */ 2651 if (i < PTP_LEVELS) { 2652 pt->pg[i + 1]->wire_count++; 2653 } 2654 } 2655 } 2656 2657 /* 2658 * pmap_unget_ptp: free unusued PTPs 2659 * 2660 * => pmap should NOT be pmap_kernel() 2661 * => pmap should be locked 2662 */ 2663 static void 2664 pmap_unget_ptp(struct pmap *pmap, struct pmap_ptparray *pt) 2665 { 2666 int i; 2667 2668 KASSERT(pmap != pmap_kernel()); 2669 KASSERT(mutex_owned(&pmap->pm_lock)); 2670 2671 for (i = PTP_LEVELS; i > 1; i--) { 2672 if (!pt->alloced[i]) { 2673 continue; 2674 } 2675 KASSERT(pt->pg[i]->wire_count == 0); 2676 PMAP_CHECK_PP(VM_PAGE_TO_PP(pt->pg[i])); 2677 pmap_freepage(pmap, pt->pg[i], i - 1); 2678 } 2679 } 2680 2681 /* 2682 * p m a p l i f e c y c l e f u n c t i o n s 2683 */ 2684 2685 /* 2686 * pmap_pdp_init: constructor a new PDP. 2687 */ 2688 static void 2689 pmap_pdp_init(pd_entry_t *pdir) 2690 { 2691 paddr_t pdirpa = 0; 2692 vaddr_t object; 2693 int i; 2694 2695 #if !defined(XENPV) || !defined(__x86_64__) 2696 int npde; 2697 #endif 2698 #ifdef XENPV 2699 int s; 2700 #endif 2701 2702 memset(PAGE_ALIGNED(pdir), 0, PDP_SIZE * PAGE_SIZE); 2703 2704 /* 2705 * NOTE: This is all done unlocked, but we will check afterwards 2706 * if we have raced with pmap_growkernel(). 
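 *
 * The check is done by the caller, pmap_ctor() below: it re-reads the
 * last kernel PDE slot under pmaps_lock and redoes the whole init if
 * that slot is still zero, meaning pmap_growkernel() added kernel PDEs
 * after our copy was taken.  Roughly:
 *
 *	for (;;) {
 *		pmap_pdp_init(pdir);
 *		mutex_enter(&pmaps_lock);
 *		if (pdir[PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1] - 1] != 0)
 *			break;            -- copy is complete and current
 *		mutex_exit(&pmaps_lock);  -- raced; redo the copy
 *	}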
2707 */ 2708 2709 #if defined(XENPV) && defined(__x86_64__) 2710 /* Fetch the physical address of the page directory */ 2711 (void)pmap_extract(pmap_kernel(), (vaddr_t)pdir, &pdirpa); 2712 2713 /* 2714 * This pdir will NEVER be active in kernel mode, so mark 2715 * recursive entry invalid. 2716 */ 2717 pdir[PDIR_SLOT_PTE] = pmap_pa2pte(pdirpa); 2718 2719 /* 2720 * PDP constructed this way won't be for the kernel, hence we 2721 * don't put kernel mappings on Xen. 2722 * 2723 * But we need to make pmap_create() happy, so put a dummy 2724 * (without PTE_P) value at the right place. 2725 */ 2726 pdir[PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1] - 1] = 2727 (pd_entry_t)-1 & PTE_FRAME; 2728 #else /* XENPV && __x86_64__*/ 2729 object = (vaddr_t)pdir; 2730 for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) { 2731 /* Fetch the physical address of the page directory */ 2732 (void)pmap_extract(pmap_kernel(), object, &pdirpa); 2733 2734 /* Put in recursive PDE to map the PTEs */ 2735 pdir[PDIR_SLOT_PTE + i] = pmap_pa2pte(pdirpa) | PTE_P | 2736 pmap_pg_nx; 2737 #ifndef XENPV 2738 pdir[PDIR_SLOT_PTE + i] |= PTE_W; 2739 #endif 2740 } 2741 2742 /* Copy the kernel's top level PDE */ 2743 npde = nkptp[PTP_LEVELS - 1]; 2744 2745 memcpy(&pdir[PDIR_SLOT_KERN], &PDP_BASE[PDIR_SLOT_KERN], 2746 npde * sizeof(pd_entry_t)); 2747 2748 if (VM_MIN_KERNEL_ADDRESS != KERNBASE) { 2749 int idx = pl_i(KERNBASE, PTP_LEVELS); 2750 pdir[idx] = PDP_BASE[idx]; 2751 } 2752 2753 #ifdef __HAVE_PCPU_AREA 2754 pdir[PDIR_SLOT_PCPU] = PDP_BASE[PDIR_SLOT_PCPU]; 2755 #endif 2756 #ifdef __HAVE_DIRECT_MAP 2757 slotspace_copy(SLAREA_DMAP, pdir, PDP_BASE); 2758 #endif 2759 #ifdef KASAN 2760 slotspace_copy(SLAREA_ASAN, pdir, PDP_BASE); 2761 #endif 2762 #ifdef KMSAN 2763 slotspace_copy(SLAREA_MSAN, pdir, PDP_BASE); 2764 #endif 2765 #endif /* XENPV && __x86_64__*/ 2766 2767 #ifdef XENPV 2768 s = splvm(); 2769 object = (vaddr_t)pdir; 2770 pmap_protect(pmap_kernel(), object, object + (PAGE_SIZE * PDP_SIZE), 2771 VM_PROT_READ); 2772 pmap_update(pmap_kernel()); 2773 for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) { 2774 /* 2775 * pin as L2/L4 page, we have to do the page with the 2776 * PDIR_SLOT_PTE entries last 2777 */ 2778 #ifdef PAE 2779 if (i == l2tol3(PDIR_SLOT_PTE)) 2780 continue; 2781 #endif 2782 2783 (void) pmap_extract(pmap_kernel(), object, &pdirpa); 2784 #ifdef __x86_64__ 2785 xpq_queue_pin_l4_table(xpmap_ptom_masked(pdirpa)); 2786 #else 2787 xpq_queue_pin_l2_table(xpmap_ptom_masked(pdirpa)); 2788 #endif 2789 } 2790 #ifdef PAE 2791 object = ((vaddr_t)pdir) + PAGE_SIZE * l2tol3(PDIR_SLOT_PTE); 2792 (void)pmap_extract(pmap_kernel(), object, &pdirpa); 2793 xpq_queue_pin_l2_table(xpmap_ptom_masked(pdirpa)); 2794 #endif 2795 splx(s); 2796 #endif /* XENPV */ 2797 } 2798 2799 /* 2800 * pmap_pdp_fini: destructor for the PDPs. 2801 */ 2802 static void 2803 pmap_pdp_fini(pd_entry_t *pdir) 2804 { 2805 #ifdef XENPV 2806 paddr_t pdirpa = 0; /* XXX: GCC */ 2807 vaddr_t object = (vaddr_t)pdir; 2808 int i; 2809 int s = splvm(); 2810 pt_entry_t *pte; 2811 2812 for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) { 2813 /* fetch the physical address of the page directory. 
*/ 2814 (void) pmap_extract(pmap_kernel(), object, &pdirpa); 2815 /* unpin page table */ 2816 xpq_queue_unpin_table(xpmap_ptom_masked(pdirpa)); 2817 } 2818 object = (vaddr_t)pdir; 2819 for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) { 2820 /* Set page RW again */ 2821 pte = kvtopte(object); 2822 pmap_pte_set(pte, *pte | PTE_W); 2823 xen_bcast_invlpg((vaddr_t)object); 2824 } 2825 splx(s); 2826 #endif /* XENPV */ 2827 } 2828 2829 #ifdef PAE 2830 static void * 2831 pmap_pdp_alloc(struct pool *pp, int flags) 2832 { 2833 return (void *)uvm_km_alloc(kernel_map, 2834 PAGE_SIZE * PDP_SIZE, PAGE_SIZE * PDP_SIZE, 2835 ((flags & PR_WAITOK) ? 0 : UVM_KMF_NOWAIT | UVM_KMF_TRYLOCK) | 2836 UVM_KMF_WIRED); 2837 } 2838 2839 static void 2840 pmap_pdp_free(struct pool *pp, void *v) 2841 { 2842 uvm_km_free(kernel_map, (vaddr_t)v, PAGE_SIZE * PDP_SIZE, 2843 UVM_KMF_WIRED); 2844 } 2845 #endif /* PAE */ 2846 2847 /* 2848 * pmap_ctor: constructor for the pmap cache. 2849 */ 2850 static int 2851 pmap_ctor(void *arg, void *obj, int flags) 2852 { 2853 struct pmap *pmap = obj; 2854 pt_entry_t p; 2855 int i; 2856 2857 KASSERT((flags & PR_WAITOK) != 0); 2858 2859 mutex_init(&pmap->pm_lock, MUTEX_DEFAULT, IPL_NONE); 2860 rw_init(&pmap->pm_dummy_lock); 2861 kcpuset_create(&pmap->pm_cpus, true); 2862 kcpuset_create(&pmap->pm_kernel_cpus, true); 2863 #ifdef XENPV 2864 kcpuset_create(&pmap->pm_xen_ptp_cpus, true); 2865 #endif 2866 LIST_INIT(&pmap->pm_gc_ptp); 2867 pmap->pm_pve = NULL; 2868 LIST_INIT(&pmap->pm_pvp_full); 2869 LIST_INIT(&pmap->pm_pvp_part); 2870 LIST_INIT(&pmap->pm_pvp_empty); 2871 2872 /* allocate and init PDP */ 2873 pmap->pm_pdir = pool_get(&pmap_pdp_pool, PR_WAITOK); 2874 2875 for (;;) { 2876 pmap_pdp_init(pmap->pm_pdir); 2877 mutex_enter(&pmaps_lock); 2878 p = pmap->pm_pdir[PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1] - 1]; 2879 if (__predict_true(p != 0)) { 2880 break; 2881 } 2882 mutex_exit(&pmaps_lock); 2883 } 2884 2885 for (i = 0; i < PDP_SIZE; i++) 2886 pmap->pm_pdirpa[i] = 2887 pmap_pte2pa(pmap->pm_pdir[PDIR_SLOT_PTE + i]); 2888 2889 LIST_INSERT_HEAD(&pmaps, pmap, pm_list); 2890 mutex_exit(&pmaps_lock); 2891 2892 return 0; 2893 } 2894 2895 /* 2896 * pmap_ctor: destructor for the pmap cache. 2897 */ 2898 static void 2899 pmap_dtor(void *arg, void *obj) 2900 { 2901 struct pmap *pmap = obj; 2902 2903 mutex_enter(&pmaps_lock); 2904 LIST_REMOVE(pmap, pm_list); 2905 mutex_exit(&pmaps_lock); 2906 2907 pmap_pdp_fini(pmap->pm_pdir); 2908 pool_put(&pmap_pdp_pool, pmap->pm_pdir); 2909 mutex_destroy(&pmap->pm_lock); 2910 rw_destroy(&pmap->pm_dummy_lock); 2911 kcpuset_destroy(pmap->pm_cpus); 2912 kcpuset_destroy(pmap->pm_kernel_cpus); 2913 #ifdef XENPV 2914 kcpuset_destroy(pmap->pm_xen_ptp_cpus); 2915 #endif 2916 } 2917 2918 /* 2919 * pmap_create: create a pmap object. 
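 *
 * Most of the expensive state (locks, kcpusets, the PDP page and the
 * entry on the global pmaps list) was already set up by pmap_ctor()
 * and survives in the pool cache across pmap_destroy()/pmap_create()
 * pairs, so this function only resets the cheap per-use fields.  The
 * reference count lives in pm_obj[0].uo_refs, initialized to 1 by
 * uvm_obj_init() below and dropped in pmap_destroy().  Life cycle:
 *
 *	pmap_create()  -> pool_cache_get()   (ctor ran once, maybe long ago)
 *	pmap_destroy() -> pool_cache_put()   (dtor runs only if the cache
 *	                                      later reclaims the object)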
2920 */ 2921 struct pmap * 2922 pmap_create(void) 2923 { 2924 struct pmap *pmap; 2925 int i; 2926 2927 pmap = pool_cache_get(&pmap_cache, PR_WAITOK); 2928 2929 /* init uvm_object */ 2930 for (i = 0; i < PTP_LEVELS - 1; i++) { 2931 uvm_obj_init(&pmap->pm_obj[i], &pmap_pager, false, 1); 2932 uvm_obj_setlock(&pmap->pm_obj[i], &pmap->pm_dummy_lock); 2933 pmap->pm_ptphint[i] = NULL; 2934 } 2935 pmap->pm_stats.wired_count = 0; 2936 /* count the PDP allocd below */ 2937 pmap->pm_stats.resident_count = PDP_SIZE; 2938 #if !defined(__x86_64__) 2939 pmap->pm_hiexec = 0; 2940 #endif 2941 2942 /* Used by NVMM and Xen */ 2943 pmap->pm_enter = NULL; 2944 pmap->pm_extract = NULL; 2945 pmap->pm_remove = NULL; 2946 pmap->pm_sync_pv = NULL; 2947 pmap->pm_pp_remove_ent = NULL; 2948 pmap->pm_write_protect = NULL; 2949 pmap->pm_unwire = NULL; 2950 pmap->pm_tlb_flush = NULL; 2951 pmap->pm_data = NULL; 2952 2953 /* init the LDT */ 2954 pmap->pm_ldt = NULL; 2955 pmap->pm_ldt_sel = GSYSSEL(GLDT_SEL, SEL_KPL); 2956 2957 return pmap; 2958 } 2959 2960 /* 2961 * pmap_check_ptps: verify that none of the pmap's page table objects 2962 * have any pages allocated to them. 2963 */ 2964 static void 2965 pmap_check_ptps(struct pmap *pmap) 2966 { 2967 int i; 2968 2969 for (i = 0; i < PTP_LEVELS - 1; i++) { 2970 KASSERTMSG(pmap->pm_obj[i].uo_npages == 0, 2971 "pmap %p level %d still has %d pages", 2972 pmap, i, (int)pmap->pm_obj[i].uo_npages); 2973 } 2974 } 2975 2976 static void 2977 pmap_check_inuse(struct pmap *pmap) 2978 { 2979 #ifdef DEBUG 2980 CPU_INFO_ITERATOR cii; 2981 struct cpu_info *ci; 2982 2983 for (CPU_INFO_FOREACH(cii, ci)) { 2984 if (ci->ci_pmap == pmap) 2985 panic("destroying pmap being used"); 2986 #if defined(XENPV) && defined(__x86_64__) 2987 for (int i = 0; i < PDIR_SLOT_USERLIM; i++) { 2988 if (pmap->pm_pdir[i] != 0 && 2989 ci->ci_kpm_pdir[i] == pmap->pm_pdir[i]) { 2990 printf("pmap_destroy(%p) pmap_kernel %p " 2991 "curcpu %d cpu %d ci_pmap %p " 2992 "ci->ci_kpm_pdir[%d]=%" PRIx64 2993 " pmap->pm_pdir[%d]=%" PRIx64 "\n", 2994 pmap, pmap_kernel(), curcpu()->ci_index, 2995 ci->ci_index, ci->ci_pmap, 2996 i, ci->ci_kpm_pdir[i], 2997 i, pmap->pm_pdir[i]); 2998 panic("%s: used pmap", __func__); 2999 } 3000 } 3001 #endif 3002 } 3003 #endif /* DEBUG */ 3004 } 3005 3006 /* 3007 * pmap_destroy: drop reference count on pmap. free pmap if reference 3008 * count goes to zero. 3009 * 3010 * => we can be called from pmap_unmap_ptes() with a different, unrelated 3011 * pmap's lock held. be careful! 3012 */ 3013 void 3014 pmap_destroy(struct pmap *pmap) 3015 { 3016 int i; 3017 3018 /* 3019 * drop reference count and verify not in use. 3020 */ 3021 3022 if (atomic_dec_uint_nv(&pmap->pm_obj[0].uo_refs) > 0) { 3023 return; 3024 } 3025 pmap_check_inuse(pmap); 3026 3027 /* 3028 * handle any deferred frees. 3029 */ 3030 3031 mutex_enter(&pmap->pm_lock); 3032 if (pmap->pm_pve != NULL) { 3033 pmap_free_pv(pmap, pmap->pm_pve); 3034 pmap->pm_pve = NULL; 3035 } 3036 pmap_drain_pv(pmap); 3037 mutex_exit(&pmap->pm_lock); 3038 pmap_update(pmap); 3039 3040 /* 3041 * Reference count is zero, free pmap resources and then free pmap. 3042 */ 3043 3044 pmap_check_ptps(pmap); 3045 KASSERT(LIST_EMPTY(&pmap->pm_gc_ptp)); 3046 3047 #ifdef USER_LDT 3048 if (pmap->pm_ldt != NULL) { 3049 /* 3050 * No need to switch the LDT; this address space is gone, 3051 * nothing is using it. 3052 * 3053 * No need to lock the pmap for ldt_free (or anything else), 3054 * we're the last one to use it. 3055 */ 3056 /* XXXAD can't take cpu_lock here - fix soon. 
*/ 3057 mutex_enter(&cpu_lock); 3058 ldt_free(pmap->pm_ldt_sel); 3059 mutex_exit(&cpu_lock); 3060 uvm_km_free(kernel_map, (vaddr_t)pmap->pm_ldt, 3061 MAX_USERLDT_SIZE, UVM_KMF_WIRED); 3062 } 3063 #endif 3064 3065 for (i = 0; i < PTP_LEVELS - 1; i++) { 3066 uvm_obj_destroy(&pmap->pm_obj[i], false); 3067 } 3068 kcpuset_zero(pmap->pm_cpus); 3069 kcpuset_zero(pmap->pm_kernel_cpus); 3070 #ifdef XENPV 3071 kcpuset_zero(pmap->pm_xen_ptp_cpus); 3072 #endif 3073 3074 KASSERT(LIST_EMPTY(&pmap->pm_pvp_full)); 3075 KASSERT(LIST_EMPTY(&pmap->pm_pvp_part)); 3076 KASSERT(LIST_EMPTY(&pmap->pm_pvp_empty)); 3077 3078 pmap_check_ptps(pmap); 3079 if (__predict_false(pmap->pm_enter != NULL)) { 3080 /* XXX make this a different cache */ 3081 pool_cache_destruct_object(&pmap_cache, pmap); 3082 } else { 3083 pool_cache_put(&pmap_cache, pmap); 3084 } 3085 } 3086 3087 /* 3088 * pmap_zap_ptp: clear out an entire PTP without modifying PTEs 3089 * 3090 * => caller must hold pmap's lock 3091 * => PTP must be mapped into KVA 3092 * => must be called with kernel preemption disabled 3093 * => does as little work as possible 3094 */ 3095 static void 3096 pmap_zap_ptp(struct pmap *pmap, struct vm_page *ptp, pt_entry_t *pte, 3097 vaddr_t startva, vaddr_t blkendva) 3098 { 3099 #ifndef XENPV 3100 struct pv_entry *pve; 3101 struct vm_page *pg; 3102 struct pmap_page *pp; 3103 pt_entry_t opte; 3104 rb_tree_t *tree; 3105 vaddr_t va; 3106 int wired; 3107 uint8_t oattrs; 3108 u_int cnt; 3109 3110 KASSERT(mutex_owned(&pmap->pm_lock)); 3111 KASSERT(kpreempt_disabled()); 3112 KASSERT(pmap != pmap_kernel()); 3113 KASSERT(ptp->wire_count > 1); 3114 KASSERT(ptp->wire_count - 1 <= PAGE_SIZE / sizeof(pt_entry_t)); 3115 3116 /* 3117 * Start at the lowest entered VA, and scan until there are no more 3118 * PTEs in the PTPs. 3119 */ 3120 tree = &VM_PAGE_TO_PP(ptp)->pp_rb; 3121 pve = RB_TREE_MIN(tree); 3122 wired = 0; 3123 va = (vaddr_t)ptp->uanon; 3124 pte += ((va - startva) >> PAGE_SHIFT); 3125 3126 for (cnt = ptp->wire_count; cnt > 1; pte++, va += PAGE_SIZE) { 3127 /* 3128 * No need for an atomic to clear the PTE. Nothing else can 3129 * see the address space any more and speculative access (if 3130 * possible) won't modify. Therefore there's no need to 3131 * track the accessed/dirty bits. 3132 */ 3133 opte = *pte; 3134 if (!pmap_valid_entry(opte)) { 3135 continue; 3136 } 3137 3138 /* 3139 * Count the PTE. If it's not for a managed mapping 3140 * there's noting more to do. 3141 */ 3142 cnt--; 3143 wired -= (opte & PTE_WIRED); 3144 if ((opte & PTE_PVLIST) == 0) { 3145 #ifndef DOM0OPS 3146 KASSERTMSG((PHYS_TO_VM_PAGE(pmap_pte2pa(opte)) == NULL), 3147 "managed page without PTE_PVLIST for %#" 3148 PRIxVADDR, va); 3149 KASSERTMSG((pmap_pv_tracked(pmap_pte2pa(opte)) == NULL), 3150 "pv-tracked page without PTE_PVLIST for %#" 3151 PRIxVADDR, va); 3152 #endif 3153 KASSERT(pmap_treelookup_pv(pmap, ptp, (ptp != NULL ? 3154 &VM_PAGE_TO_PP(ptp)->pp_rb : &pmap_kernel_rb), 3155 va) == NULL); 3156 continue; 3157 } 3158 3159 /* 3160 * "pve" now points to the lowest (by VA) dynamic PV entry 3161 * in the PTP. If it's for this VA, take advantage of it to 3162 * avoid calling PHYS_TO_VM_PAGE(). Avoid modifying the RB 3163 * tree by skipping to the next VA in the tree whenever 3164 * there is a match here. The tree will be cleared out in 3165 * one pass before return to pmap_remove_all(). 
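 *
 * In effect this is a merge of two sequences that are both sorted by
 * VA: the PTE slots of the PTP (dense array scan) and the dynamic
 * pv_entries (in-order walk of the rb tree):
 *
 *	pve = RB_TREE_MIN(tree);
 *	for each valid PTE, in ascending VA order:
 *		if (pve != NULL && pve->pve_pte.pte_va == va)
 *			-- dynamic entry: unlink from pp_pvlist, free it,
 *			   pve = RB_TREE_NEXT(tree, pve)
 *		else
 *			-- embedded entry: clear pp->pp_pte in place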
3166 */ 3167 oattrs = pmap_pte_to_pp_attrs(opte); 3168 if (pve != NULL && pve->pve_pte.pte_va == va) { 3169 pp = pve->pve_pp; 3170 KASSERT(pve->pve_pte.pte_ptp == ptp); 3171 KASSERT(pp->pp_pte.pte_ptp != ptp || 3172 pp->pp_pte.pte_va != va); 3173 mutex_spin_enter(&pp->pp_lock); 3174 pp->pp_attrs |= oattrs; 3175 LIST_REMOVE(pve, pve_list); 3176 mutex_spin_exit(&pp->pp_lock); 3177 3178 /* 3179 * pve won't be touched again until pmap_drain_pv(), 3180 * so it's still safe to traverse the tree. 3181 */ 3182 pmap_free_pv(pmap, pve); 3183 pve = RB_TREE_NEXT(tree, pve); 3184 continue; 3185 } 3186 3187 /* 3188 * No entry in the tree so it must be embedded. Look up the 3189 * page and cancel the embedded entry. 3190 */ 3191 if ((pg = PHYS_TO_VM_PAGE(pmap_pte2pa(opte))) != NULL) { 3192 pp = VM_PAGE_TO_PP(pg); 3193 } else if ((pp = pmap_pv_tracked(pmap_pte2pa(opte))) == NULL) { 3194 paddr_t pa = pmap_pte2pa(opte); 3195 panic("%s: PTE_PVLIST with pv-untracked page" 3196 " va = %#"PRIxVADDR"pa = %#"PRIxPADDR 3197 "(%#"PRIxPADDR")", __func__, va, pa, atop(pa)); 3198 } 3199 mutex_spin_enter(&pp->pp_lock); 3200 KASSERT(pp->pp_pte.pte_ptp == ptp); 3201 KASSERT(pp->pp_pte.pte_va == va); 3202 pp->pp_attrs |= oattrs; 3203 pp->pp_pte.pte_ptp = NULL; 3204 pp->pp_pte.pte_va = 0; 3205 mutex_spin_exit(&pp->pp_lock); 3206 } 3207 3208 /* PTP now empty - adjust the tree & stats to match. */ 3209 pmap_stats_update(pmap, -(ptp->wire_count - 1), wired / PTE_WIRED); 3210 ptp->wire_count = 1; 3211 #ifdef DIAGNOSTIC 3212 rb_tree_init(tree, &pmap_rbtree_ops); 3213 #endif 3214 #else /* !XENPV */ 3215 /* 3216 * XXXAD For XEN, it's not clear to me that we can do this, because 3217 * I guess the hypervisor keeps track of PTEs too. 3218 */ 3219 pmap_remove_ptes(pmap, ptp, (vaddr_t)pte, startva, blkendva); 3220 #endif /* !XENPV */ 3221 } 3222 3223 /* 3224 * pmap_remove_all: remove all mappings from pmap in bulk. 3225 * 3226 * Ordinarily when removing mappings it's important to hold the UVM object's 3227 * lock, so that pages do not gain a new identity while retaining stale TLB 3228 * entries (the same lock hold covers both pmap_remove() and pmap_update()). 3229 * Here it's known that the address space is no longer visible to any user 3230 * process, so we don't need to worry about that. 3231 */ 3232 bool 3233 pmap_remove_all(struct pmap *pmap) 3234 { 3235 struct vm_page *ptps[32]; 3236 vaddr_t va, blkendva; 3237 struct pmap *pmap2; 3238 pt_entry_t *ptes; 3239 pd_entry_t pde __diagused; 3240 pd_entry_t * const *pdes; 3241 int lvl __diagused, i, n; 3242 3243 /* XXX Can't handle EPT just yet. */ 3244 if (pmap->pm_remove != NULL) { 3245 return false; 3246 } 3247 3248 for (;;) { 3249 /* Fetch a block of PTPs from tree. */ 3250 mutex_enter(&pmap->pm_lock); 3251 n = radix_tree_gang_lookup_node(&pmap->pm_obj[0].uo_pages, 0, 3252 (void **)ptps, __arraycount(ptps), false); 3253 if (n == 0) { 3254 mutex_exit(&pmap->pm_lock); 3255 break; 3256 } 3257 3258 /* Remove all mappings in the set of PTPs. */ 3259 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); 3260 for (i = 0; i < n; i++) { 3261 if (ptps[i]->wire_count == 0) { 3262 /* It's dead: pmap_update() will expunge. */ 3263 continue; 3264 } 3265 3266 /* Determine range of block. */ 3267 va = ptps[i]->offset * PAGE_SIZE / sizeof(pt_entry_t); 3268 blkendva = x86_round_pdr(va + 1); 3269 3270 /* Make sure everything squares up... */ 3271 KASSERT(pmap_pdes_valid(va, pdes, &pde, &lvl)); 3272 KASSERT(lvl == 1); 3273 KASSERT(pmap_find_ptp(pmap, va, 1) == ptps[i]); 3274 3275 /* Zap! 
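 * A PTP holds PAGE_SIZE / sizeof(pt_entry_t) PTEs (512 on amd64), each
 * mapping one 4KB page, so one PTP covers NBPD_L2 (2MB) and the va
 * computed above from the object offset is the base of that block:
 *
 *	va = offset * PAGE_SIZE / sizeof(pt_entry_t)
 *	   = offset * 512                              (amd64)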
*/ 3276 pmap_zap_ptp(pmap, ptps[i], &ptes[pl1_i(va)], va, 3277 blkendva); 3278 3279 /* PTP should now be unused - free it. */ 3280 KASSERT(ptps[i]->wire_count == 1); 3281 pmap_free_ptp(pmap, ptps[i], va, ptes, pdes); 3282 } 3283 pmap_unmap_ptes(pmap, pmap2); 3284 pmap_drain_pv(pmap); 3285 pmap_tlb_shootdown(pmap, -1L, 0, TLBSHOOT_REMOVE_ALL); 3286 mutex_exit(&pmap->pm_lock); 3287 3288 /* Process deferred frees. */ 3289 pmap_update(pmap); 3290 3291 /* A breathing point. */ 3292 preempt_point(); 3293 } 3294 3295 /* Verify that the pmap is now completely empty. */ 3296 pmap_check_ptps(pmap); 3297 KASSERTMSG(pmap->pm_stats.resident_count == PDP_SIZE, 3298 "pmap %p not empty", pmap); 3299 3300 return true; 3301 } 3302 3303 #if defined(PMAP_FORK) 3304 /* 3305 * pmap_fork: perform any necessary data structure manipulation when 3306 * a VM space is forked. 3307 */ 3308 void 3309 pmap_fork(struct pmap *pmap1, struct pmap *pmap2) 3310 { 3311 #ifdef USER_LDT 3312 union descriptor *new_ldt; 3313 int sel; 3314 3315 if (__predict_true(pmap1->pm_ldt == NULL)) { 3316 return; 3317 } 3318 3319 /* 3320 * Copy the LDT into the new process. 3321 * 3322 * Read pmap1's ldt pointer unlocked; if it changes behind our back 3323 * we'll retry. This will starve if there's a stream of LDT changes 3324 * in another thread but that should not happen. 3325 */ 3326 3327 retry: 3328 if (pmap1->pm_ldt != NULL) { 3329 /* Allocate space for the new process's LDT */ 3330 new_ldt = (union descriptor *)uvm_km_alloc(kernel_map, 3331 MAX_USERLDT_SIZE, 0, UVM_KMF_WIRED); 3332 if (new_ldt == NULL) { 3333 printf("WARNING: %s: unable to allocate LDT space\n", 3334 __func__); 3335 return; 3336 } 3337 mutex_enter(&cpu_lock); 3338 /* Get a GDT slot for it */ 3339 sel = ldt_alloc(new_ldt, MAX_USERLDT_SIZE); 3340 if (sel == -1) { 3341 mutex_exit(&cpu_lock); 3342 uvm_km_free(kernel_map, (vaddr_t)new_ldt, 3343 MAX_USERLDT_SIZE, UVM_KMF_WIRED); 3344 printf("WARNING: %s: unable to allocate LDT selector\n", 3345 __func__); 3346 return; 3347 } 3348 } else { 3349 /* Wasn't anything there after all. */ 3350 new_ldt = NULL; 3351 sel = -1; 3352 mutex_enter(&cpu_lock); 3353 } 3354 3355 /* 3356 * Now that we have cpu_lock, ensure the LDT status is the same. 3357 */ 3358 if (pmap1->pm_ldt != NULL) { 3359 if (new_ldt == NULL) { 3360 /* A wild LDT just appeared. */ 3361 mutex_exit(&cpu_lock); 3362 goto retry; 3363 } 3364 3365 /* Copy the LDT data and install it in pmap2 */ 3366 memcpy(new_ldt, pmap1->pm_ldt, MAX_USERLDT_SIZE); 3367 pmap2->pm_ldt = new_ldt; 3368 pmap2->pm_ldt_sel = sel; 3369 mutex_exit(&cpu_lock); 3370 } else { 3371 if (new_ldt != NULL) { 3372 /* The LDT disappeared, drop what we did. */ 3373 ldt_free(sel); 3374 mutex_exit(&cpu_lock); 3375 uvm_km_free(kernel_map, (vaddr_t)new_ldt, 3376 MAX_USERLDT_SIZE, UVM_KMF_WIRED); 3377 return; 3378 } 3379 3380 /* We're good, just leave. */ 3381 mutex_exit(&cpu_lock); 3382 } 3383 #endif /* USER_LDT */ 3384 } 3385 #endif /* PMAP_FORK */ 3386 3387 #ifdef USER_LDT 3388 3389 /* 3390 * pmap_ldt_xcall: cross call used by pmap_ldt_sync. if the named pmap 3391 * is active, reload LDTR. 3392 */ 3393 static void 3394 pmap_ldt_xcall(void *arg1, void *arg2) 3395 { 3396 struct pmap *pm; 3397 3398 kpreempt_disable(); 3399 pm = arg1; 3400 if (curcpu()->ci_pmap == pm) { 3401 #if defined(SVS) 3402 if (svs_enabled) { 3403 svs_ldt_sync(pm); 3404 } else 3405 #endif 3406 lldt(pm->pm_ldt_sel); 3407 } 3408 kpreempt_enable(); 3409 } 3410 3411 /* 3412 * pmap_ldt_sync: LDT selector for the named pmap is changing. 
swap 3413 * in the new selector on all CPUs. 3414 */ 3415 void 3416 pmap_ldt_sync(struct pmap *pm) 3417 { 3418 uint64_t where; 3419 3420 KASSERT(mutex_owned(&cpu_lock)); 3421 3422 pmap_ldt_evcnt.ev_count++; 3423 where = xc_broadcast(0, pmap_ldt_xcall, pm, NULL); 3424 xc_wait(where); 3425 } 3426 3427 /* 3428 * pmap_ldt_cleanup: if the pmap has a local LDT, deallocate it, and 3429 * restore the default. 3430 */ 3431 void 3432 pmap_ldt_cleanup(struct lwp *l) 3433 { 3434 pmap_t pmap = l->l_proc->p_vmspace->vm_map.pmap; 3435 union descriptor *ldt; 3436 int sel; 3437 3438 if (__predict_true(pmap->pm_ldt == NULL)) { 3439 return; 3440 } 3441 3442 mutex_enter(&cpu_lock); 3443 if (pmap->pm_ldt != NULL) { 3444 sel = pmap->pm_ldt_sel; 3445 ldt = pmap->pm_ldt; 3446 pmap->pm_ldt_sel = GSYSSEL(GLDT_SEL, SEL_KPL); 3447 pmap->pm_ldt = NULL; 3448 pmap_ldt_sync(pmap); 3449 ldt_free(sel); 3450 uvm_km_free(kernel_map, (vaddr_t)ldt, MAX_USERLDT_SIZE, 3451 UVM_KMF_WIRED); 3452 } 3453 mutex_exit(&cpu_lock); 3454 } 3455 #endif /* USER_LDT */ 3456 3457 /* 3458 * pmap_activate: activate a process' pmap 3459 * 3460 * => must be called with kernel preemption disabled 3461 * => if lwp is the curlwp, then set ci_want_pmapload so that 3462 * actual MMU context switch will be done by pmap_load() later 3463 */ 3464 void 3465 pmap_activate(struct lwp *l) 3466 { 3467 struct cpu_info *ci; 3468 struct pmap *pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map); 3469 3470 KASSERT(kpreempt_disabled()); 3471 3472 ci = curcpu(); 3473 3474 if (l != ci->ci_curlwp) 3475 return; 3476 3477 KASSERT(ci->ci_want_pmapload == 0); 3478 KASSERT(ci->ci_tlbstate != TLBSTATE_VALID); 3479 3480 /* 3481 * no need to switch to kernel vmspace because 3482 * it's a subset of any vmspace. 3483 */ 3484 3485 if (pmap == pmap_kernel()) { 3486 ci->ci_want_pmapload = 0; 3487 return; 3488 } 3489 3490 ci->ci_want_pmapload = 1; 3491 } 3492 3493 #if defined(XENPV) && defined(__x86_64__) 3494 #define KASSERT_PDIRPA(pmap) \ 3495 KASSERT(pmap_pdirpa(pmap, 0) == ci->ci_xen_current_user_pgd || \ 3496 pmap == pmap_kernel()) 3497 #elif defined(PAE) 3498 #define KASSERT_PDIRPA(pmap) \ 3499 KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(ci->ci_pae_l3_pdir[0])) 3500 #elif !defined(XENPV) 3501 #define KASSERT_PDIRPA(pmap) \ 3502 KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(rcr3())) 3503 #else 3504 #define KASSERT_PDIRPA(pmap) KASSERT(true) /* nothing to do */ 3505 #endif 3506 3507 /* 3508 * pmap_reactivate: try to regain reference to the pmap. 3509 * 3510 * => Must be called with kernel preemption disabled. 3511 */ 3512 static void 3513 pmap_reactivate(struct pmap *pmap) 3514 { 3515 struct cpu_info * const ci = curcpu(); 3516 const cpuid_t cid = cpu_index(ci); 3517 3518 KASSERT(kpreempt_disabled()); 3519 KASSERT_PDIRPA(pmap); 3520 3521 /* 3522 * If we still have a lazy reference to this pmap, we can assume 3523 * that there was no TLB shootdown for this pmap in the meantime. 3524 * 3525 * The order of events here is important as we must synchronize 3526 * with TLB shootdown interrupts. Declare interest in invalidations 3527 * (TLBSTATE_VALID) and then check the CPU set, which the IPIs can 3528 * change only when the state is TLBSTATE_LAZY. 3529 */ 3530 3531 ci->ci_tlbstate = TLBSTATE_VALID; 3532 KASSERT(kcpuset_isset(pmap->pm_kernel_cpus, cid)); 3533 3534 if (__predict_true(kcpuset_isset(pmap->pm_cpus, cid))) { 3535 /* We have the reference, state is valid. */ 3536 } else { 3537 /* 3538 * Must reload the TLB, pmap has been changed during 3539 * deactivated. 
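 *
 * i.e. while this CPU sat in TLBSTATE_LAZY, a shootdown for this
 * pmap on another CPU dropped us from pm_cpus rather than requiring
 * an immediate flush here, leaving our TLB possibly stale.  Roughly
 * (the remote side lives in the TLB shootdown code):
 *
 *	this CPU (lazy)                remote CPU (shootdown)
 *	ci_tlbstate = TLBSTATE_LAZY
 *	                               drops us from pm_cpus (we're LAZY)
 *	ci_tlbstate = TLBSTATE_VALID
 *	pm_cpus bit is clear  ->  re-set it and tlbflush()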
3540 */ 3541 kcpuset_atomic_set(pmap->pm_cpus, cid); 3542 3543 tlbflush(); 3544 } 3545 } 3546 3547 /* 3548 * pmap_load: perform the actual pmap switch, i.e. fill in %cr3 register 3549 * and relevant LDT info. 3550 * 3551 * Ensures that the current process' pmap is loaded on the current CPU's 3552 * MMU and that there are no stale TLB entries. 3553 * 3554 * => The caller should disable kernel preemption or do check-and-retry 3555 * to prevent a preemption from undoing our efforts. 3556 * => This function may block. 3557 */ 3558 void 3559 pmap_load(void) 3560 { 3561 struct cpu_info *ci; 3562 struct pmap *pmap, *oldpmap; 3563 struct lwp *l; 3564 uint64_t ncsw; 3565 3566 kpreempt_disable(); 3567 retry: 3568 ci = curcpu(); 3569 if (!ci->ci_want_pmapload) { 3570 kpreempt_enable(); 3571 return; 3572 } 3573 l = ci->ci_curlwp; 3574 ncsw = l->l_ncsw; 3575 __insn_barrier(); 3576 3577 /* should be able to take ipis. */ 3578 KASSERT(ci->ci_ilevel < IPL_HIGH); 3579 #ifdef XENPV 3580 /* Check to see if interrupts are enabled (ie; no events are masked) */ 3581 KASSERT(x86_read_psl() == 0); 3582 #else 3583 KASSERT((x86_read_psl() & PSL_I) != 0); 3584 #endif 3585 3586 KASSERT(l != NULL); 3587 pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map); 3588 KASSERT(pmap != pmap_kernel()); 3589 oldpmap = ci->ci_pmap; 3590 3591 if (pmap == oldpmap) { 3592 pmap_reactivate(pmap); 3593 ci->ci_want_pmapload = 0; 3594 kpreempt_enable(); 3595 return; 3596 } 3597 3598 /* 3599 * Acquire a reference to the new pmap and perform the switch. 3600 */ 3601 3602 pmap_reference(pmap); 3603 pmap_load1(l, pmap, oldpmap); 3604 ci->ci_want_pmapload = 0; 3605 3606 /* 3607 * we're now running with the new pmap. drop the reference 3608 * to the old pmap. if we block, we need to go around again. 3609 */ 3610 3611 pmap_destroy(oldpmap); 3612 __insn_barrier(); 3613 if (l->l_ncsw != ncsw) { 3614 goto retry; 3615 } 3616 3617 kpreempt_enable(); 3618 } 3619 3620 /* 3621 * pmap_load1: the guts of pmap load, shared by pmap_map_ptes() and 3622 * pmap_load(). It's critically important that this function does not 3623 * block. 3624 */ 3625 static void 3626 pmap_load1(struct lwp *l, struct pmap *pmap, struct pmap *oldpmap) 3627 { 3628 struct cpu_info *ci; 3629 struct pcb *pcb; 3630 cpuid_t cid; 3631 3632 KASSERT(kpreempt_disabled()); 3633 3634 pcb = lwp_getpcb(l); 3635 ci = l->l_cpu; 3636 cid = cpu_index(ci); 3637 3638 kcpuset_atomic_clear(oldpmap->pm_cpus, cid); 3639 kcpuset_atomic_clear(oldpmap->pm_kernel_cpus, cid); 3640 3641 KASSERT_PDIRPA(oldpmap); 3642 KASSERT(!kcpuset_isset(pmap->pm_cpus, cid)); 3643 KASSERT(!kcpuset_isset(pmap->pm_kernel_cpus, cid)); 3644 3645 /* 3646 * Mark the pmap in use by this CPU. Again, we must synchronize 3647 * with TLB shootdown interrupts, so set the state VALID first, 3648 * then register us for shootdown events on this pmap. 3649 */ 3650 ci->ci_tlbstate = TLBSTATE_VALID; 3651 kcpuset_atomic_set(pmap->pm_cpus, cid); 3652 kcpuset_atomic_set(pmap->pm_kernel_cpus, cid); 3653 ci->ci_pmap = pmap; 3654 3655 /* 3656 * update tss. now that we have registered for invalidations 3657 * from other CPUs, we're good to load the page tables. 
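 *
 * The ordering matters: if %cr3 were loaded before this CPU's bit
 * appeared in pm_cpus, a remote pmap_tlb_shootdown() issued in that
 * window would skip this CPU and stale translations could survive.
 * Conceptually:
 *
 *	1. ci_tlbstate = TLBSTATE_VALID, set our bits in pm_cpus et al.
 *	   (shootdowns for this pmap now reach us)
 *	2. load %cr3 via cpu_load_pmap()
 *	   (only now do we start caching this pmap's translations)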
3658 */ 3659 #ifdef PAE 3660 pcb->pcb_cr3 = ci->ci_pae_l3_pdirpa; 3661 #else 3662 pcb->pcb_cr3 = pmap_pdirpa(pmap, 0); 3663 #endif 3664 3665 #ifdef i386 3666 #ifndef XENPV 3667 ci->ci_tss->tss.tss_ldt = pmap->pm_ldt_sel; 3668 ci->ci_tss->tss.tss_cr3 = pcb->pcb_cr3; 3669 #endif 3670 #endif 3671 3672 #if defined(SVS) && defined(USER_LDT) 3673 if (svs_enabled) { 3674 svs_ldt_sync(pmap); 3675 } else 3676 #endif 3677 lldt(pmap->pm_ldt_sel); 3678 3679 cpu_load_pmap(pmap, oldpmap); 3680 } 3681 3682 /* 3683 * pmap_deactivate: deactivate a process' pmap. 3684 * 3685 * => Must be called with kernel preemption disabled (high IPL is enough). 3686 */ 3687 void 3688 pmap_deactivate(struct lwp *l) 3689 { 3690 struct pmap *pmap; 3691 struct cpu_info *ci; 3692 3693 KASSERT(kpreempt_disabled()); 3694 3695 if (l != curlwp) { 3696 return; 3697 } 3698 3699 /* 3700 * Wait for pending TLB shootdowns to complete. Necessary because 3701 * TLB shootdown state is per-CPU, and the LWP may be coming off 3702 * the CPU before it has a chance to call pmap_update(), e.g. due 3703 * to kernel preemption or blocking routine in between. 3704 */ 3705 pmap_tlb_shootnow(); 3706 3707 ci = curcpu(); 3708 3709 if (ci->ci_want_pmapload) { 3710 /* 3711 * ci_want_pmapload means that our pmap is not loaded on 3712 * the CPU or TLB might be stale. note that pmap_kernel() 3713 * is always considered loaded. 3714 */ 3715 KASSERT(vm_map_pmap(&l->l_proc->p_vmspace->vm_map) 3716 != pmap_kernel()); 3717 KASSERT(vm_map_pmap(&l->l_proc->p_vmspace->vm_map) 3718 != ci->ci_pmap || ci->ci_tlbstate != TLBSTATE_VALID); 3719 3720 /* 3721 * userspace has not been touched. 3722 * nothing to do here. 3723 */ 3724 3725 ci->ci_want_pmapload = 0; 3726 return; 3727 } 3728 3729 pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map); 3730 3731 if (pmap == pmap_kernel()) { 3732 return; 3733 } 3734 3735 KASSERT_PDIRPA(pmap); 3736 KASSERT(ci->ci_pmap == pmap); 3737 3738 /* 3739 * we aren't interested in TLB invalidations for this pmap, 3740 * at least for the time being. 3741 */ 3742 3743 KASSERT(ci->ci_tlbstate == TLBSTATE_VALID); 3744 ci->ci_tlbstate = TLBSTATE_LAZY; 3745 } 3746 3747 #ifdef EFI_RUNTIME 3748 3749 extern struct pmap *efi_runtime_pmap; 3750 3751 /* 3752 * pmap_is_user: true if pmap, which must not be the kernel pmap, is 3753 * for an unprivileged user process 3754 */ 3755 bool 3756 pmap_is_user(struct pmap *pmap) 3757 { 3758 3759 KASSERT(pmap != pmap_kernel()); 3760 return (pmap != efi_runtime_pmap); 3761 } 3762 3763 /* 3764 * pmap_activate_sync: synchronously activate specified pmap. 3765 * 3766 * => Must be called with kernel preemption disabled (high IPL is enough). 3767 * => Must not sleep before pmap_deactivate_sync. 
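 *
 * Typical use, e.g. around an EFI runtime service call (illustrative
 * sketch only):
 *
 *	void *cookie;
 *
 *	kpreempt_disable();
 *	cookie = pmap_activate_sync(efi_runtime_pmap);
 *	... perform the EFI runtime call ...
 *	pmap_deactivate_sync(efi_runtime_pmap, cookie);
 *	kpreempt_enable();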
3768 */ 3769 void * 3770 pmap_activate_sync(struct pmap *pmap) 3771 { 3772 struct cpu_info *ci = curcpu(); 3773 struct pmap *oldpmap = ci->ci_pmap; 3774 unsigned cid = cpu_index(ci); 3775 3776 KASSERT(kpreempt_disabled()); 3777 KASSERT(pmap != pmap_kernel()); 3778 3779 KASSERT(!kcpuset_isset(pmap->pm_cpus, cid)); 3780 KASSERT(!kcpuset_isset(pmap->pm_kernel_cpus, cid)); 3781 3782 if (oldpmap) { 3783 KASSERT_PDIRPA(oldpmap); 3784 kcpuset_atomic_clear(oldpmap->pm_cpus, cid); 3785 kcpuset_atomic_clear(oldpmap->pm_kernel_cpus, cid); 3786 } 3787 3788 ci->ci_tlbstate = TLBSTATE_VALID; 3789 kcpuset_atomic_set(pmap->pm_cpus, cid); 3790 kcpuset_atomic_set(pmap->pm_kernel_cpus, cid); 3791 ci->ci_pmap = pmap; 3792 3793 #if defined(SVS) && defined(USER_LDT) 3794 if (svs_enabled) { 3795 svs_ldt_sync(pmap); 3796 } else 3797 #endif 3798 lldt(pmap->pm_ldt_sel); 3799 3800 cpu_load_pmap(pmap, oldpmap); 3801 3802 return oldpmap; 3803 } 3804 3805 /* 3806 * pmap_deactivate_sync: synchronously deactivate specified pmap and 3807 * restore whatever was active before pmap_activate_sync. 3808 * 3809 * => Must be called with kernel preemption disabled (high IPL is enough). 3810 * => Must not have slept since pmap_activate_sync. 3811 */ 3812 void 3813 pmap_deactivate_sync(struct pmap *pmap, void *cookie) 3814 { 3815 struct cpu_info *ci = curcpu(); 3816 struct pmap *oldpmap = cookie; 3817 unsigned cid = cpu_index(ci); 3818 3819 KASSERT(kpreempt_disabled()); 3820 KASSERT(pmap != pmap_kernel()); 3821 KASSERT(ci->ci_pmap == pmap); 3822 3823 KASSERT_PDIRPA(pmap); 3824 3825 KASSERT(kcpuset_isset(pmap->pm_cpus, cid)); 3826 KASSERT(kcpuset_isset(pmap->pm_kernel_cpus, cid)); 3827 3828 pmap_tlb_shootnow(); 3829 3830 kcpuset_atomic_clear(pmap->pm_cpus, cid); 3831 kcpuset_atomic_clear(pmap->pm_kernel_cpus, cid); 3832 3833 ci->ci_tlbstate = TLBSTATE_VALID; 3834 ci->ci_pmap = oldpmap; 3835 if (oldpmap) { 3836 kcpuset_atomic_set(oldpmap->pm_cpus, cid); 3837 kcpuset_atomic_set(oldpmap->pm_kernel_cpus, cid); 3838 #if defined(SVS) && defined(USER_LDT) 3839 if (svs_enabled) { 3840 svs_ldt_sync(oldpmap); 3841 } else 3842 #endif 3843 lldt(oldpmap->pm_ldt_sel); 3844 cpu_load_pmap(oldpmap, pmap); 3845 } else { 3846 lcr3(pmap_pdirpa(pmap_kernel(), 0)); 3847 } 3848 } 3849 3850 #endif /* EFI_RUNTIME */ 3851 3852 /* 3853 * some misc. 
functions 3854 */ 3855 3856 bool 3857 pmap_pdes_valid(vaddr_t va, pd_entry_t * const *pdes, pd_entry_t *lastpde, 3858 int *lastlvl) 3859 { 3860 unsigned long index; 3861 pd_entry_t pde; 3862 int i; 3863 3864 for (i = PTP_LEVELS; i > 1; i--) { 3865 index = pl_i(va, i); 3866 pde = pdes[i - 2][index]; 3867 if ((pde & PTE_P) == 0) { 3868 *lastlvl = i; 3869 return false; 3870 } 3871 if (pde & PTE_PS) 3872 break; 3873 } 3874 if (lastpde != NULL) 3875 *lastpde = pde; 3876 *lastlvl = i; 3877 return true; 3878 } 3879 3880 /* 3881 * pmap_extract: extract a PA for the given VA 3882 */ 3883 bool 3884 pmap_extract(struct pmap *pmap, vaddr_t va, paddr_t *pap) 3885 { 3886 pt_entry_t *ptes, pte; 3887 pd_entry_t pde; 3888 pd_entry_t * const *pdes; 3889 struct pmap *pmap2; 3890 paddr_t pa; 3891 bool rv; 3892 int lvl; 3893 3894 if (__predict_false(pmap->pm_extract != NULL)) { 3895 return (*pmap->pm_extract)(pmap, va, pap); 3896 } 3897 3898 #ifdef __HAVE_DIRECT_MAP 3899 if (va >= PMAP_DIRECT_BASE && va < PMAP_DIRECT_END) { 3900 if (pap != NULL) { 3901 *pap = PMAP_DIRECT_UNMAP(va); 3902 } 3903 return true; 3904 } 3905 #endif 3906 3907 rv = false; 3908 pa = 0; 3909 3910 if (pmap != pmap_kernel()) { 3911 mutex_enter(&pmap->pm_lock); 3912 } 3913 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); 3914 if (pmap_pdes_valid(va, pdes, &pde, &lvl)) { 3915 if (lvl == 2) { 3916 pa = (pde & PTE_LGFRAME) | (va & (NBPD_L2 - 1)); 3917 rv = true; 3918 } else { 3919 KASSERT(lvl == 1); 3920 pte = ptes[pl1_i(va)]; 3921 if (__predict_true((pte & PTE_P) != 0)) { 3922 pa = pmap_pte2pa(pte) | (va & (NBPD_L1 - 1)); 3923 rv = true; 3924 } 3925 } 3926 } 3927 pmap_unmap_ptes(pmap, pmap2); 3928 if (pmap != pmap_kernel()) { 3929 mutex_exit(&pmap->pm_lock); 3930 } 3931 if (pap != NULL) { 3932 *pap = pa; 3933 } 3934 3935 return rv; 3936 } 3937 3938 /* 3939 * vtophys: virtual address to physical address. For use by 3940 * machine-dependent code only. 3941 */ 3942 paddr_t 3943 vtophys(vaddr_t va) 3944 { 3945 paddr_t pa; 3946 3947 if (pmap_extract(pmap_kernel(), va, &pa) == true) 3948 return pa; 3949 return 0; 3950 } 3951 3952 __strict_weak_alias(pmap_extract_ma, pmap_extract); 3953 3954 #ifdef XENPV 3955 /* 3956 * vtomach: virtual address to machine address. For use by 3957 * machine-dependent code only. 3958 */ 3959 paddr_t 3960 vtomach(vaddr_t va) 3961 { 3962 paddr_t pa; 3963 3964 if (pmap_extract_ma(pmap_kernel(), va, &pa) == true) 3965 return pa; 3966 return 0; 3967 } 3968 #endif 3969 3970 /* 3971 * pmap_virtual_space: used during bootup [pmap_steal_memory] to 3972 * determine the bounds of the kernel virtual address space. 
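 *
 * Callers simply do (illustrative):
 *
 *	vaddr_t kva_start, kva_end;
 *
 *	pmap_virtual_space(&kva_start, &kva_end);
 *
 * and get back [virtual_avail, virtual_end), the range of kernel VA
 * this pmap manages at that point in bootstrap.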
3973 */ 3974 void 3975 pmap_virtual_space(vaddr_t *startp, vaddr_t *endp) 3976 { 3977 *startp = virtual_avail; 3978 *endp = virtual_end; 3979 } 3980 3981 void 3982 pmap_zero_page(paddr_t pa) 3983 { 3984 #if defined(__HAVE_DIRECT_MAP) 3985 memset(PAGE_ALIGNED(PMAP_DIRECT_MAP(pa)), 0, PAGE_SIZE); 3986 #else 3987 #if defined(XENPV) 3988 if (XEN_VERSION_SUPPORTED(3, 4)) { 3989 xen_pagezero(pa); 3990 return; 3991 } 3992 #endif 3993 struct cpu_info *ci; 3994 pt_entry_t *zpte; 3995 vaddr_t zerova; 3996 3997 const pd_entry_t pteflags = PTE_P | PTE_W | pmap_pg_nx | PTE_D | PTE_A; 3998 3999 kpreempt_disable(); 4000 4001 ci = curcpu(); 4002 zerova = ci->vpage[VPAGE_ZER]; 4003 zpte = ci->vpage_pte[VPAGE_ZER]; 4004 4005 KASSERTMSG(!*zpte, "pmap_zero_page: lock botch"); 4006 4007 pmap_pte_set(zpte, pmap_pa2pte(pa) | pteflags); 4008 pmap_pte_flush(); 4009 pmap_update_pg(zerova); /* flush TLB */ 4010 4011 memset(PAGE_ALIGNED(zerova), 0, PAGE_SIZE); 4012 4013 #if defined(DIAGNOSTIC) || defined(XENPV) 4014 pmap_pte_set(zpte, 0); /* zap ! */ 4015 pmap_pte_flush(); 4016 #endif 4017 4018 kpreempt_enable(); 4019 #endif /* defined(__HAVE_DIRECT_MAP) */ 4020 } 4021 4022 void 4023 pmap_copy_page(paddr_t srcpa, paddr_t dstpa) 4024 { 4025 #if defined(__HAVE_DIRECT_MAP) 4026 vaddr_t srcva = PMAP_DIRECT_MAP(srcpa); 4027 vaddr_t dstva = PMAP_DIRECT_MAP(dstpa); 4028 4029 memcpy(PAGE_ALIGNED(dstva), PAGE_ALIGNED(srcva), PAGE_SIZE); 4030 #else 4031 #if defined(XENPV) 4032 if (XEN_VERSION_SUPPORTED(3, 4)) { 4033 xen_copy_page(srcpa, dstpa); 4034 return; 4035 } 4036 #endif 4037 struct cpu_info *ci; 4038 pt_entry_t *srcpte, *dstpte; 4039 vaddr_t srcva, dstva; 4040 4041 const pd_entry_t pteflags = PTE_P | PTE_W | pmap_pg_nx | PTE_A; 4042 4043 kpreempt_disable(); 4044 4045 ci = curcpu(); 4046 srcva = ci->vpage[VPAGE_SRC]; 4047 dstva = ci->vpage[VPAGE_DST]; 4048 srcpte = ci->vpage_pte[VPAGE_SRC]; 4049 dstpte = ci->vpage_pte[VPAGE_DST]; 4050 4051 KASSERT(*srcpte == 0 && *dstpte == 0); 4052 4053 pmap_pte_set(srcpte, pmap_pa2pte(srcpa) | pteflags); 4054 pmap_pte_set(dstpte, pmap_pa2pte(dstpa) | pteflags | PTE_D); 4055 pmap_pte_flush(); 4056 pmap_update_pg(srcva); 4057 pmap_update_pg(dstva); 4058 4059 memcpy(PAGE_ALIGNED(dstva), PAGE_ALIGNED(srcva), PAGE_SIZE); 4060 4061 #if defined(DIAGNOSTIC) || defined(XENPV) 4062 pmap_pte_set(srcpte, 0); 4063 pmap_pte_set(dstpte, 0); 4064 pmap_pte_flush(); 4065 #endif 4066 4067 kpreempt_enable(); 4068 #endif /* defined(__HAVE_DIRECT_MAP) */ 4069 } 4070 4071 static pt_entry_t * 4072 pmap_map_ptp(struct vm_page *ptp) 4073 { 4074 #ifdef __HAVE_DIRECT_MAP 4075 return (void *)PMAP_DIRECT_MAP(VM_PAGE_TO_PHYS(ptp)); 4076 #else 4077 struct cpu_info *ci; 4078 pt_entry_t *ptppte; 4079 vaddr_t ptpva; 4080 4081 KASSERT(kpreempt_disabled()); 4082 4083 #ifndef XENPV 4084 const pd_entry_t pteflags = PTE_P | PTE_W | pmap_pg_nx | PTE_A | PTE_D; 4085 #else 4086 const pd_entry_t pteflags = PTE_P | pmap_pg_nx | PTE_A | PTE_D; 4087 #endif 4088 4089 ci = curcpu(); 4090 ptpva = ci->vpage[VPAGE_PTP]; 4091 ptppte = ci->vpage_pte[VPAGE_PTP]; 4092 4093 pmap_pte_set(ptppte, pmap_pa2pte(VM_PAGE_TO_PHYS(ptp)) | pteflags); 4094 4095 pmap_pte_flush(); 4096 pmap_update_pg(ptpva); 4097 4098 return (pt_entry_t *)ptpva; 4099 #endif 4100 } 4101 4102 static void 4103 pmap_unmap_ptp(void) 4104 { 4105 #ifndef __HAVE_DIRECT_MAP 4106 #if defined(DIAGNOSTIC) || defined(XENPV) 4107 struct cpu_info *ci; 4108 pt_entry_t *pte; 4109 4110 KASSERT(kpreempt_disabled()); 4111 4112 ci = curcpu(); 4113 pte = ci->vpage_pte[VPAGE_PTP]; 4114 4115 if 
(*pte != 0) { 4116 pmap_pte_set(pte, 0); 4117 pmap_pte_flush(); 4118 } 4119 #endif 4120 #endif 4121 } 4122 4123 static pt_entry_t * 4124 pmap_map_pte(struct pmap *pmap, struct vm_page *ptp, vaddr_t va) 4125 { 4126 4127 KASSERT(kpreempt_disabled()); 4128 if (pmap_is_curpmap(pmap)) { 4129 return &PTE_BASE[pl1_i(va)]; /* (k)vtopte */ 4130 } 4131 KASSERT(ptp != NULL); 4132 return pmap_map_ptp(ptp) + pl1_pi(va); 4133 } 4134 4135 static void 4136 pmap_unmap_pte(void) 4137 { 4138 4139 KASSERT(kpreempt_disabled()); 4140 4141 pmap_unmap_ptp(); 4142 } 4143 4144 /* 4145 * p m a p r e m o v e f u n c t i o n s 4146 * 4147 * functions that remove mappings 4148 */ 4149 4150 /* 4151 * pmap_remove_ptes: remove PTEs from a PTP 4152 * 4153 * => caller must hold pmap's lock 4154 * => PTP must be mapped into KVA 4155 * => PTP should be null if pmap == pmap_kernel() 4156 * => must be called with kernel preemption disabled 4157 * => returns composite pte if at least one page should be shot down 4158 */ 4159 static void 4160 pmap_remove_ptes(struct pmap *pmap, struct vm_page *ptp, vaddr_t ptpva, 4161 vaddr_t startva, vaddr_t endva) 4162 { 4163 pt_entry_t *pte = (pt_entry_t *)ptpva; 4164 4165 KASSERT(mutex_owned(&pmap->pm_lock)); 4166 KASSERT(kpreempt_disabled()); 4167 4168 /* 4169 * mappings are very often sparse, so clip the given range to the 4170 * range of PTEs that are known present in the PTP. 4171 */ 4172 pmap_ptp_range_clip(ptp, &startva, &pte); 4173 4174 /* 4175 * note that ptpva points to the PTE that maps startva. this may 4176 * or may not be the first PTE in the PTP. 4177 * 4178 * we loop through the PTP while there are still PTEs to look at 4179 * and the wire_count is greater than 1 (because we use the wire_count 4180 * to keep track of the number of real PTEs in the PTP). 4181 */ 4182 while (startva < endva && (ptp == NULL || ptp->wire_count > 1)) { 4183 (void)pmap_remove_pte(pmap, ptp, pte, startva); 4184 startva += PAGE_SIZE; 4185 pte++; 4186 } 4187 } 4188 4189 /* 4190 * pmap_remove_pte: remove a single PTE from a PTP. 4191 * 4192 * => caller must hold pmap's lock 4193 * => PTP must be mapped into KVA 4194 * => PTP should be null if pmap == pmap_kernel() 4195 * => returns true if we removed a mapping 4196 * => must be called with kernel preemption disabled 4197 */ 4198 static bool 4199 pmap_remove_pte(struct pmap *pmap, struct vm_page *ptp, pt_entry_t *pte, 4200 vaddr_t va) 4201 { 4202 struct pv_entry *pve; 4203 struct vm_page *pg; 4204 struct pmap_page *pp; 4205 pt_entry_t opte; 4206 4207 KASSERT(mutex_owned(&pmap->pm_lock)); 4208 KASSERT(kpreempt_disabled()); 4209 4210 if (!pmap_valid_entry(*pte)) { 4211 /* VA not mapped. */ 4212 return false; 4213 } 4214 4215 /* Atomically save the old PTE and zap it. */ 4216 opte = pmap_pte_testset(pte, 0); 4217 if (!pmap_valid_entry(opte)) { 4218 return false; 4219 } 4220 4221 pmap_exec_account(pmap, va, opte, 0); 4222 pmap_stats_update_bypte(pmap, 0, opte); 4223 4224 if (ptp) { 4225 /* 4226 * Dropping a PTE. Make sure that the PDE is flushed. 4227 */ 4228 ptp->wire_count--; 4229 if (ptp->wire_count <= 1) { 4230 opte |= PTE_A; 4231 } 4232 } 4233 4234 if ((opte & PTE_A) != 0) { 4235 pmap_tlb_shootdown(pmap, va, opte, TLBSHOOT_REMOVE_PTE); 4236 } 4237 4238 /* 4239 * If we are not on a pv list - we are done. 
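 * (PTE_PVLIST is the software bit that pmap_enter_ma() sets when the
 * page being mapped is managed or pv-tracked, so its absence means
 * there is no pv_entry to look up or unlink for this mapping.)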
4240 */ 4241 if ((opte & PTE_PVLIST) == 0) { 4242 #ifndef DOM0OPS 4243 KASSERTMSG((PHYS_TO_VM_PAGE(pmap_pte2pa(opte)) == NULL), 4244 "managed page without PTE_PVLIST for %#"PRIxVADDR, va); 4245 KASSERTMSG((pmap_pv_tracked(pmap_pte2pa(opte)) == NULL), 4246 "pv-tracked page without PTE_PVLIST for %#"PRIxVADDR, va); 4247 #endif 4248 KASSERT(pmap_treelookup_pv(pmap, ptp, (ptp != NULL ? 4249 &VM_PAGE_TO_PP(ptp)->pp_rb : &pmap_kernel_rb), va) == NULL); 4250 return true; 4251 } 4252 4253 if ((pg = PHYS_TO_VM_PAGE(pmap_pte2pa(opte))) != NULL) { 4254 pp = VM_PAGE_TO_PP(pg); 4255 } else if ((pp = pmap_pv_tracked(pmap_pte2pa(opte))) == NULL) { 4256 paddr_t pa = pmap_pte2pa(opte); 4257 panic("%s: PTE_PVLIST with pv-untracked page" 4258 " va = %#"PRIxVADDR"pa = %#"PRIxPADDR" (%#"PRIxPADDR")", 4259 __func__, va, pa, atop(pa)); 4260 } 4261 4262 /* Sync R/M bits. */ 4263 pve = pmap_lookup_pv(pmap, ptp, pp, va); 4264 pmap_remove_pv(pmap, pp, ptp, va, pve, pmap_pte_to_pp_attrs(opte)); 4265 return true; 4266 } 4267 4268 static void 4269 pmap_remove_locked(struct pmap *pmap, vaddr_t sva, vaddr_t eva) 4270 { 4271 pt_entry_t *ptes; 4272 pd_entry_t pde; 4273 pd_entry_t * const *pdes; 4274 bool result; 4275 vaddr_t blkendva, va = sva; 4276 struct vm_page *ptp; 4277 struct pmap *pmap2; 4278 int lvl; 4279 4280 KASSERT(mutex_owned(&pmap->pm_lock)); 4281 4282 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); 4283 4284 /* 4285 * removing one page? take shortcut function. 4286 */ 4287 4288 if (va + PAGE_SIZE == eva) { 4289 if (pmap_pdes_valid(va, pdes, &pde, &lvl)) { 4290 KASSERT(lvl == 1); 4291 4292 /* Get PTP if non-kernel mapping. */ 4293 if (pmap != pmap_kernel()) { 4294 ptp = pmap_find_ptp(pmap, va, 1); 4295 KASSERTMSG(ptp != NULL, 4296 "%s: unmanaged PTP detected", __func__); 4297 } else { 4298 /* Never free kernel PTPs. */ 4299 ptp = NULL; 4300 } 4301 4302 result = pmap_remove_pte(pmap, ptp, 4303 &ptes[pl1_i(va)], va); 4304 4305 /* 4306 * if mapping removed and the PTP is no longer 4307 * being used, free it! 4308 */ 4309 4310 if (result && ptp && ptp->wire_count <= 1) 4311 pmap_free_ptp(pmap, ptp, va, ptes, pdes); 4312 } 4313 } else for (/* null */ ; va < eva ; va = blkendva) { 4314 /* determine range of block */ 4315 blkendva = x86_round_pdr(va+1); 4316 if (blkendva > eva) 4317 blkendva = eva; 4318 4319 if (!pmap_pdes_valid(va, pdes, &pde, &lvl)) { 4320 /* Skip a range corresponding to an invalid pde. */ 4321 blkendva = (va & ptp_frames[lvl - 1]) + nbpd[lvl - 1]; 4322 continue; 4323 } 4324 KASSERT(lvl == 1); 4325 4326 /* Get PTP if non-kernel mapping. */ 4327 if (pmap != pmap_kernel()) { 4328 ptp = pmap_find_ptp(pmap, va, 1); 4329 KASSERTMSG(ptp != NULL, "%s: unmanaged PTP detected", 4330 __func__); 4331 } else { 4332 /* Never free kernel PTPs. */ 4333 ptp = NULL; 4334 } 4335 4336 pmap_remove_ptes(pmap, ptp, (vaddr_t)&ptes[pl1_i(va)], va, 4337 blkendva); 4338 4339 /* If PTP is no longer being used, free it. */ 4340 if (ptp && ptp->wire_count <= 1) { 4341 pmap_free_ptp(pmap, ptp, va, ptes, pdes); 4342 } 4343 } 4344 pmap_unmap_ptes(pmap, pmap2); 4345 pmap_drain_pv(pmap); 4346 } 4347 4348 /* 4349 * pmap_remove: mapping removal function. 
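 *
 * Illustrative call pattern (a sketch, not taken verbatim from this file):
 * callers tear down a page-aligned range and then flush the deferred
 * shootdowns and frees with pmap_update(), e.g.
 *
 *	pmap_remove(pm, sva, eva);
 *	pmap_update(pm);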
4350 * 4351 * => caller should not be holding any pmap locks 4352 */ 4353 void 4354 pmap_remove(struct pmap *pmap, vaddr_t sva, vaddr_t eva) 4355 { 4356 if (__predict_false(pmap->pm_remove != NULL)) { 4357 (*pmap->pm_remove)(pmap, sva, eva); 4358 return; 4359 } 4360 4361 mutex_enter(&pmap->pm_lock); 4362 pmap_remove_locked(pmap, sva, eva); 4363 mutex_exit(&pmap->pm_lock); 4364 } 4365 4366 /* 4367 * pmap_sync_pv: clear pte bits and return the old value of the pp_attrs. 4368 * 4369 * => The 'clearbits' parameter is either ~0 or PP_ATTRS_... 4370 * => Caller should disable kernel preemption. 4371 * => issues tlb shootdowns if necessary. 4372 */ 4373 static int 4374 pmap_sync_pv(struct pv_pte *pvpte, paddr_t pa, int clearbits, uint8_t *oattrs, 4375 pt_entry_t *optep) 4376 { 4377 struct pmap *pmap; 4378 struct vm_page *ptp; 4379 vaddr_t va; 4380 pt_entry_t *ptep; 4381 pt_entry_t opte; 4382 pt_entry_t npte; 4383 pt_entry_t expect; 4384 bool need_shootdown; 4385 4386 ptp = pvpte->pte_ptp; 4387 va = pvpte->pte_va; 4388 KASSERT(ptp == NULL || ptp->uobject != NULL); 4389 KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset); 4390 pmap = ptp_to_pmap(ptp); 4391 KASSERT(kpreempt_disabled()); 4392 4393 if (__predict_false(pmap->pm_sync_pv != NULL)) { 4394 return (*pmap->pm_sync_pv)(ptp, va, pa, clearbits, oattrs, 4395 optep); 4396 } 4397 4398 expect = pmap_pa2pte(pa) | PTE_P; 4399 4400 if (clearbits != ~0) { 4401 KASSERT((clearbits & ~(PP_ATTRS_D|PP_ATTRS_A|PP_ATTRS_W)) == 0); 4402 clearbits = pmap_pp_attrs_to_pte(clearbits); 4403 } 4404 4405 ptep = pmap_map_pte(pmap, ptp, va); 4406 do { 4407 opte = *ptep; 4408 KASSERT((opte & (PTE_D | PTE_A)) != PTE_D); 4409 KASSERT((opte & (PTE_A | PTE_P)) != PTE_A); 4410 KASSERT(opte == 0 || (opte & PTE_P) != 0); 4411 if ((opte & (PTE_FRAME | PTE_P)) != expect) { 4412 /* 4413 * We lost a race with a V->P operation like 4414 * pmap_remove(). Wait for the competitor 4415 * reflecting pte bits into mp_attrs. 4416 */ 4417 pmap_unmap_pte(); 4418 return EAGAIN; 4419 } 4420 4421 /* 4422 * Check if there's anything to do on this PTE. 4423 */ 4424 if ((opte & clearbits) == 0) { 4425 need_shootdown = false; 4426 break; 4427 } 4428 4429 /* 4430 * We need a shootdown if the PTE is cached (PTE_A) ... 4431 * ... Unless we are clearing only the PTE_W bit and 4432 * it isn't cached as RW (PTE_D). 4433 */ 4434 need_shootdown = (opte & PTE_A) != 0 && 4435 !(clearbits == PTE_W && (opte & PTE_D) == 0); 4436 4437 npte = opte & ~clearbits; 4438 4439 /* 4440 * If we need a shootdown anyway, clear PTE_A and PTE_D. 
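 * The TLB entry is being shot down anyway, so clearing them is safe: the
 * hardware will set them again on the next access, and the old values are
 * still reported to the caller through oattrs (taken from opte below).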
4441 */ 4442 if (need_shootdown) { 4443 npte &= ~(PTE_A | PTE_D); 4444 } 4445 KASSERT((npte & (PTE_D | PTE_A)) != PTE_D); 4446 KASSERT((npte & (PTE_A | PTE_P)) != PTE_A); 4447 KASSERT(npte == 0 || (opte & PTE_P) != 0); 4448 } while (pmap_pte_cas(ptep, opte, npte) != opte); 4449 4450 if (need_shootdown) { 4451 pmap_tlb_shootdown(pmap, va, opte, TLBSHOOT_SYNC_PV); 4452 } 4453 pmap_unmap_pte(); 4454 4455 *oattrs = pmap_pte_to_pp_attrs(opte); 4456 if (optep != NULL) 4457 *optep = opte; 4458 return 0; 4459 } 4460 4461 static void 4462 pmap_pp_remove_ent(struct pmap *pmap, struct vm_page *ptp, pt_entry_t opte, 4463 vaddr_t va) 4464 { 4465 struct pmap *pmap2; 4466 pt_entry_t *ptes; 4467 pd_entry_t * const *pdes; 4468 4469 KASSERT(mutex_owned(&pmap->pm_lock)); 4470 4471 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); 4472 pmap_stats_update_bypte(pmap, 0, opte); 4473 ptp->wire_count--; 4474 if (ptp->wire_count <= 1) { 4475 pmap_free_ptp(pmap, ptp, va, ptes, pdes); 4476 } 4477 pmap_unmap_ptes(pmap, pmap2); 4478 } 4479 4480 static void 4481 pmap_pp_remove(struct pmap_page *pp, paddr_t pa) 4482 { 4483 struct pv_pte *pvpte; 4484 struct vm_page *ptp; 4485 uintptr_t sum; 4486 uint8_t oattrs; 4487 bool locked; 4488 4489 /* 4490 * Do an unlocked check to see if the page has no mappings, eg when 4491 * pmap_remove_all() was called before amap_wipeout() for a process 4492 * private amap - common. The page being removed must be on the way 4493 * out, so we don't have to worry about concurrent attempts to enter 4494 * it (otherwise the caller either doesn't care or has screwed up). 4495 */ 4496 sum = (uintptr_t)atomic_load_relaxed(&pp->pp_pte.pte_va); 4497 sum |= (uintptr_t)atomic_load_relaxed(&pp->pp_pte.pte_ptp); 4498 sum |= (uintptr_t)atomic_load_relaxed(&pp->pp_pvlist.lh_first); 4499 if (sum == 0) { 4500 return; 4501 } 4502 4503 kpreempt_disable(); 4504 for (;;) { 4505 struct pmap *pmap; 4506 struct pv_entry *pve; 4507 pt_entry_t opte; 4508 vaddr_t va; 4509 4510 mutex_spin_enter(&pp->pp_lock); 4511 if ((pvpte = pv_pte_first(pp)) == NULL) { 4512 mutex_spin_exit(&pp->pp_lock); 4513 break; 4514 } 4515 4516 /* 4517 * Add a reference to the pmap before clearing the pte. 4518 * Otherwise the pmap can disappear behind us. 4519 */ 4520 ptp = pvpte->pte_ptp; 4521 pmap = ptp_to_pmap(ptp); 4522 KASSERT(pmap->pm_obj[0].uo_refs > 0); 4523 if (ptp != NULL) { 4524 pmap_reference(pmap); 4525 } 4526 4527 /* 4528 * Now try to lock it. We need a direct handoff between 4529 * pp_lock and pm_lock to know the pv_entry is kept intact 4530 * and kept associated with this pmap. If that can't be 4531 * had, wait for the pmap's lock to become free and then 4532 * retry. 4533 */ 4534 locked = mutex_tryenter(&pmap->pm_lock); 4535 mutex_spin_exit(&pp->pp_lock); 4536 if (!locked) { 4537 mutex_enter(&pmap->pm_lock); 4538 /* nothing, just wait for it */ 4539 mutex_exit(&pmap->pm_lock); 4540 if (ptp != NULL) { 4541 pmap_destroy(pmap); 4542 } 4543 continue; 4544 } 4545 va = pvpte->pte_va; 4546 4547 KASSERTMSG(pmap->pm_stats.resident_count > PDP_SIZE, 4548 "va %lx pmap %p ptp %p is empty", va, pmap, ptp); 4549 KASSERTMSG(ptp == NULL || (ptp->flags & PG_FREE) == 0, 4550 "va %lx pmap %p ptp %p is free", va, pmap, ptp); 4551 KASSERTMSG(ptp == NULL || ptp->wire_count > 1, 4552 "va %lx pmap %p ptp %p is empty", va, pmap, ptp); 4553 4554 #ifdef DEBUG 4555 pmap_check_pv(pmap, ptp, pp, pvpte->pte_va, true); 4556 rb_tree_t *tree = (ptp != NULL ? 
4557 &VM_PAGE_TO_PP(ptp)->pp_rb : &pmap_kernel_rb); 4558 pve = pmap_treelookup_pv(pmap, ptp, tree, va); 4559 if (pve == NULL) { 4560 KASSERTMSG(&pp->pp_pte == pvpte, 4561 "va %lx pmap %p ptp %p pvpte %p pve %p oops 1", 4562 va, pmap, ptp, pvpte, pve); 4563 } else { 4564 KASSERTMSG(&pve->pve_pte == pvpte, 4565 "va %lx pmap %p ptp %p pvpte %p pve %p oops 2", 4566 va, pmap, ptp, pvpte, pve); 4567 } 4568 #endif 4569 4570 if (pmap_sync_pv(pvpte, pa, ~0, &oattrs, &opte)) { 4571 panic("pmap_pp_remove: mapping not present"); 4572 } 4573 4574 pve = pmap_lookup_pv(pmap, ptp, pp, va); 4575 pmap_remove_pv(pmap, pp, ptp, va, pve, oattrs); 4576 4577 /* Update the PTP reference count. Free if last reference. */ 4578 if (ptp != NULL) { 4579 KASSERT(pmap != pmap_kernel()); 4580 pmap_tlb_shootnow(); 4581 if (__predict_false(pmap->pm_pp_remove_ent != NULL)) { 4582 (*pmap->pm_pp_remove_ent)(pmap, ptp, opte, va); 4583 } else { 4584 pmap_pp_remove_ent(pmap, ptp, opte, va); 4585 } 4586 } else { 4587 KASSERT(pmap == pmap_kernel()); 4588 pmap_stats_update_bypte(pmap, 0, opte); 4589 } 4590 pmap_tlb_shootnow(); 4591 pmap_drain_pv(pmap); 4592 mutex_exit(&pmap->pm_lock); 4593 if (ptp != NULL) { 4594 pmap_destroy(pmap); 4595 } 4596 } 4597 kpreempt_enable(); 4598 } 4599 4600 /* 4601 * pmap_page_remove: remove a managed vm_page from all pmaps that map it 4602 * 4603 * => R/M bits are sync'd back to attrs 4604 */ 4605 void 4606 pmap_page_remove(struct vm_page *pg) 4607 { 4608 struct pmap_page *pp; 4609 paddr_t pa; 4610 4611 pp = VM_PAGE_TO_PP(pg); 4612 pa = VM_PAGE_TO_PHYS(pg); 4613 pmap_pp_remove(pp, pa); 4614 } 4615 4616 /* 4617 * pmap_pv_remove: remove an unmanaged pv-tracked page from all pmaps 4618 * that map it 4619 */ 4620 void 4621 pmap_pv_remove(paddr_t pa) 4622 { 4623 struct pmap_page *pp; 4624 4625 pp = pmap_pv_tracked(pa); 4626 if (pp == NULL) 4627 panic("%s: page not pv-tracked: %#"PRIxPADDR, __func__, pa); 4628 pmap_pp_remove(pp, pa); 4629 } 4630 4631 /* 4632 * p m a p a t t r i b u t e f u n c t i o n s 4633 * functions that test/change managed page's attributes 4634 * since a page can be mapped multiple times we must check each PTE that 4635 * maps it by going down the pv lists. 4636 */ 4637 4638 /* 4639 * pmap_test_attrs: test a page's attributes 4640 */ 4641 bool 4642 pmap_test_attrs(struct vm_page *pg, unsigned testbits) 4643 { 4644 struct pmap_page *pp; 4645 struct pv_pte *pvpte; 4646 struct pmap *pmap; 4647 uint8_t oattrs; 4648 u_int result; 4649 paddr_t pa; 4650 4651 pp = VM_PAGE_TO_PP(pg); 4652 if ((pp->pp_attrs & testbits) != 0) { 4653 return true; 4654 } 4655 pa = VM_PAGE_TO_PHYS(pg); 4656 startover: 4657 mutex_spin_enter(&pp->pp_lock); 4658 for (pvpte = pv_pte_first(pp); pvpte; pvpte = pv_pte_next(pp, pvpte)) { 4659 if ((pp->pp_attrs & testbits) != 0) { 4660 break; 4661 } 4662 if (pmap_sync_pv(pvpte, pa, 0, &oattrs, NULL)) { 4663 /* 4664 * raced with a V->P operation. wait for the other 4665 * side to finish by acquiring pmap's lock. if no 4666 * wait, updates to pp_attrs by the other side may 4667 * go unseen. 4668 */ 4669 pmap = ptp_to_pmap(pvpte->pte_ptp); 4670 pmap_reference(pmap); 4671 mutex_spin_exit(&pp->pp_lock); 4672 mutex_enter(&pmap->pm_lock); 4673 /* nothing. */ 4674 mutex_exit(&pmap->pm_lock); 4675 pmap_destroy(pmap); 4676 goto startover; 4677 } 4678 pp->pp_attrs |= oattrs; 4679 } 4680 result = pp->pp_attrs & testbits; 4681 mutex_spin_exit(&pp->pp_lock); 4682 4683 /* 4684 * note that we will exit the for loop with a non-null pve if 4685 * we have found the bits we are testing for. 
4686 */ 4687 4688 return result != 0; 4689 } 4690 4691 static bool 4692 pmap_pp_clear_attrs(struct pmap_page *pp, paddr_t pa, unsigned clearbits) 4693 { 4694 struct pv_pte *pvpte; 4695 struct pmap *pmap; 4696 uint8_t oattrs; 4697 u_int result; 4698 4699 startover: 4700 mutex_spin_enter(&pp->pp_lock); 4701 for (pvpte = pv_pte_first(pp); pvpte; pvpte = pv_pte_next(pp, pvpte)) { 4702 if (pmap_sync_pv(pvpte, pa, clearbits, &oattrs, NULL)) { 4703 /* 4704 * raced with a V->P operation. wait for the other 4705 * side to finish by acquiring pmap's lock. it is 4706 * probably unmapping the page, and it will be gone 4707 * when the loop is restarted. 4708 */ 4709 pmap = ptp_to_pmap(pvpte->pte_ptp); 4710 pmap_reference(pmap); 4711 mutex_spin_exit(&pp->pp_lock); 4712 mutex_enter(&pmap->pm_lock); 4713 /* nothing. */ 4714 mutex_exit(&pmap->pm_lock); 4715 pmap_destroy(pmap); 4716 goto startover; 4717 } 4718 pp->pp_attrs |= oattrs; 4719 } 4720 result = pp->pp_attrs & clearbits; 4721 pp->pp_attrs &= ~clearbits; 4722 pmap_tlb_shootnow(); 4723 mutex_spin_exit(&pp->pp_lock); 4724 4725 return result != 0; 4726 } 4727 4728 /* 4729 * pmap_clear_attrs: clear the specified attribute for a page. 4730 * 4731 * => we return true if we cleared one of the bits we were asked to 4732 */ 4733 bool 4734 pmap_clear_attrs(struct vm_page *pg, unsigned clearbits) 4735 { 4736 struct pmap_page *pp; 4737 paddr_t pa; 4738 4739 pp = VM_PAGE_TO_PP(pg); 4740 pa = VM_PAGE_TO_PHYS(pg); 4741 4742 /* 4743 * If this is a new page, assert it has no mappings and simply zap 4744 * the stored attributes without taking any locks. 4745 */ 4746 if ((pg->flags & PG_FAKE) != 0) { 4747 KASSERT(atomic_load_relaxed(&pp->pp_pte.pte_va) == 0); 4748 KASSERT(atomic_load_relaxed(&pp->pp_pte.pte_ptp) == NULL); 4749 KASSERT(atomic_load_relaxed(&pp->pp_pvlist.lh_first) == NULL); 4750 atomic_store_relaxed(&pp->pp_attrs, 0); 4751 return false; 4752 } else { 4753 return pmap_pp_clear_attrs(pp, pa, clearbits); 4754 } 4755 } 4756 4757 /* 4758 * pmap_pv_clear_attrs: clear the specified attributes for an unmanaged 4759 * pv-tracked page. 4760 */ 4761 bool 4762 pmap_pv_clear_attrs(paddr_t pa, unsigned clearbits) 4763 { 4764 struct pmap_page *pp; 4765 4766 pp = pmap_pv_tracked(pa); 4767 if (pp == NULL) 4768 panic("%s: page not pv-tracked: %#"PRIxPADDR, __func__, pa); 4769 4770 return pmap_pp_clear_attrs(pp, pa, clearbits); 4771 } 4772 4773 /* 4774 * p m a p p r o t e c t i o n f u n c t i o n s 4775 */ 4776 4777 /* 4778 * pmap_page_protect: change the protection of all recorded mappings 4779 * of a managed page 4780 * 4781 * => NOTE: this is an inline function in pmap.h 4782 */ 4783 4784 /* see pmap.h */ 4785 4786 /* 4787 * pmap_pv_protect: change the protection of all recorded mappings 4788 * of an unmanaged pv-tracked page 4789 * 4790 * => NOTE: this is an inline function in pmap.h 4791 */ 4792 4793 /* see pmap.h */ 4794 4795 /* 4796 * pmap_protect: set the protection in of the pages in a pmap 4797 * 4798 * => NOTE: this is an inline function in pmap.h 4799 */ 4800 4801 /* see pmap.h */ 4802 4803 /* 4804 * pmap_write_protect: write-protect pages in a pmap. 4805 * 4806 * Note for Xen-amd64. Xen automatically adds PTE_U to the kernel pages, but we 4807 * don't need to remove this bit when re-entering the PTEs here: Xen tracks the 4808 * kernel pages with a reserved bit (_PAGE_GUEST_KERNEL), so even if PTE_U is 4809 * present the page will still be considered as a kernel page, and the privilege 4810 * separation will be enforced correctly. 
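 *
 * Illustrative use (a sketch, not taken from this file): revoking write
 * access on a page-aligned kernel range while keeping it executable would
 * look like
 *
 *	pmap_write_protect(pmap_kernel(), sva, eva,
 *	    VM_PROT_READ | VM_PROT_EXECUTE);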
4811 */ 4812 void 4813 pmap_write_protect(struct pmap *pmap, vaddr_t sva, vaddr_t eva, vm_prot_t prot) 4814 { 4815 pt_entry_t bit_rem, bit_put; 4816 pt_entry_t *ptes; 4817 pd_entry_t * const *pdes; 4818 struct pmap *pmap2; 4819 vaddr_t blockend, va; 4820 int lvl, i; 4821 4822 if (__predict_false(pmap->pm_write_protect != NULL)) { 4823 (*pmap->pm_write_protect)(pmap, sva, eva, prot); 4824 return; 4825 } 4826 4827 bit_rem = 0; 4828 if (!(prot & VM_PROT_WRITE)) 4829 bit_rem = PTE_W; 4830 4831 bit_put = 0; 4832 if (!(prot & VM_PROT_EXECUTE)) 4833 bit_put = pmap_pg_nx; 4834 4835 sva &= ~PAGE_MASK; 4836 eva &= ~PAGE_MASK; 4837 4838 /* 4839 * Acquire pmap. No need to lock the kernel pmap as we won't 4840 * be touching PV entries nor stats and kernel PDEs aren't 4841 * freed. 4842 */ 4843 if (pmap != pmap_kernel()) { 4844 mutex_enter(&pmap->pm_lock); 4845 } 4846 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); 4847 4848 for (va = sva ; va < eva; va = blockend) { 4849 pt_entry_t *spte, *epte; 4850 4851 blockend = x86_round_pdr(va + 1); 4852 if (blockend > eva) 4853 blockend = eva; 4854 4855 /* Is it a valid block? */ 4856 if (!pmap_pdes_valid(va, pdes, NULL, &lvl)) { 4857 continue; 4858 } 4859 KASSERT(va < VM_MAXUSER_ADDRESS || va >= VM_MAX_ADDRESS); 4860 KASSERT(lvl == 1); 4861 4862 spte = &ptes[pl1_i(va)]; 4863 epte = &ptes[pl1_i(blockend)]; 4864 4865 for (i = 0; spte < epte; spte++, i++) { 4866 pt_entry_t opte, npte; 4867 4868 do { 4869 opte = *spte; 4870 if (!pmap_valid_entry(opte)) { 4871 goto next; 4872 } 4873 npte = (opte & ~bit_rem) | bit_put; 4874 } while (pmap_pte_cas(spte, opte, npte) != opte); 4875 4876 if ((opte & PTE_D) != 0) { 4877 vaddr_t tva = va + x86_ptob(i); 4878 pmap_tlb_shootdown(pmap, tva, opte, 4879 TLBSHOOT_WRITE_PROTECT); 4880 } 4881 next:; 4882 } 4883 } 4884 4885 /* Release pmap. */ 4886 pmap_unmap_ptes(pmap, pmap2); 4887 if (pmap != pmap_kernel()) { 4888 mutex_exit(&pmap->pm_lock); 4889 } 4890 } 4891 4892 /* 4893 * pmap_unwire: clear the wired bit in the PTE. 4894 * 4895 * => Mapping should already be present. 4896 */ 4897 void 4898 pmap_unwire(struct pmap *pmap, vaddr_t va) 4899 { 4900 pt_entry_t *ptes, *ptep, opte; 4901 pd_entry_t * const *pdes; 4902 struct pmap *pmap2; 4903 int lvl; 4904 4905 if (__predict_false(pmap->pm_unwire != NULL)) { 4906 (*pmap->pm_unwire)(pmap, va); 4907 return; 4908 } 4909 4910 /* 4911 * Acquire pmap. Need to lock the kernel pmap only to protect the 4912 * statistics. 4913 */ 4914 mutex_enter(&pmap->pm_lock); 4915 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); 4916 4917 if (!pmap_pdes_valid(va, pdes, NULL, &lvl)) { 4918 panic("%s: invalid PDE va=%#" PRIxVADDR, __func__, va); 4919 } 4920 KASSERT(lvl == 1); 4921 4922 ptep = &ptes[pl1_i(va)]; 4923 opte = *ptep; 4924 KASSERT(pmap_valid_entry(opte)); 4925 4926 if (opte & PTE_WIRED) { 4927 pt_entry_t npte = opte & ~PTE_WIRED; 4928 4929 opte = pmap_pte_testset(ptep, npte); 4930 pmap_stats_update_bypte(pmap, npte, opte); 4931 } else { 4932 printf("%s: wiring for pmap %p va %#" PRIxVADDR 4933 " did not change!\n", __func__, pmap, va); 4934 } 4935 4936 /* Release pmap.
*/ 4937 pmap_unmap_ptes(pmap, pmap2); 4938 mutex_exit(&pmap->pm_lock); 4939 } 4940 4941 /* 4942 * pmap_copy: copy mappings from one pmap to another 4943 * 4944 * => optional function 4945 * void pmap_copy(dst_pmap, src_pmap, dst_addr, len, src_addr) 4946 */ 4947 4948 /* 4949 * defined as macro in pmap.h 4950 */ 4951 4952 __strict_weak_alias(pmap_enter, pmap_enter_default); 4953 4954 int 4955 pmap_enter_default(pmap_t pmap, vaddr_t va, paddr_t pa, vm_prot_t prot, 4956 u_int flags) 4957 { 4958 if (__predict_false(pmap->pm_enter != NULL)) { 4959 return (*pmap->pm_enter)(pmap, va, pa, prot, flags); 4960 } 4961 4962 return pmap_enter_ma(pmap, va, pa, pa, prot, flags, 0); 4963 } 4964 4965 /* 4966 * pmap_enter: enter a mapping into a pmap 4967 * 4968 * => must be done "now" ... no lazy-evaluation 4969 */ 4970 int 4971 pmap_enter_ma(struct pmap *pmap, vaddr_t va, paddr_t ma, paddr_t pa, 4972 vm_prot_t prot, u_int flags, int domid) 4973 { 4974 pt_entry_t *ptes, opte, npte; 4975 pt_entry_t *ptep; 4976 pd_entry_t * const *pdes; 4977 struct vm_page *ptp; 4978 struct vm_page *new_pg, *old_pg; 4979 struct pmap_page *new_pp, *old_pp; 4980 struct pv_entry *old_pve, *new_pve; 4981 bool wired = (flags & PMAP_WIRED) != 0; 4982 struct pmap *pmap2; 4983 struct pmap_ptparray pt; 4984 int error; 4985 bool getptp, samepage, new_embedded; 4986 rb_tree_t *tree; 4987 4988 KASSERT(pmap_initialized); 4989 KASSERT(va < VM_MAX_KERNEL_ADDRESS); 4990 KASSERTMSG(va != (vaddr_t)PDP_BASE, "%s: trying to map va=%#" 4991 PRIxVADDR " over PDP!", __func__, va); 4992 KASSERTMSG(va < VM_MIN_KERNEL_ADDRESS || 4993 pmap_valid_entry(pmap->pm_pdir[pl_i(va, PTP_LEVELS)]), 4994 "%s: missing kernel PTP for va=%#" PRIxVADDR, __func__, va); 4995 4996 #ifdef XENPV 4997 KASSERT(domid == DOMID_SELF || pa == 0); 4998 #endif 4999 5000 npte = ma | protection_codes[prot] | PTE_P; 5001 npte |= pmap_pat_flags(flags); 5002 if (wired) 5003 npte |= PTE_WIRED; 5004 if (va < VM_MAXUSER_ADDRESS) { 5005 KASSERTMSG(pmap != pmap_kernel(), 5006 "entering user va %#"PRIxVADDR" into kernel pmap", 5007 va); 5008 if (pmap_is_user(pmap)) 5009 npte |= PTE_U; 5010 } 5011 5012 if (pmap == pmap_kernel()) 5013 npte |= pmap_pg_g; 5014 if (flags & VM_PROT_ALL) { 5015 npte |= PTE_A; 5016 if (flags & VM_PROT_WRITE) { 5017 KASSERT((npte & PTE_W) != 0); 5018 npte |= PTE_D; 5019 } 5020 } 5021 5022 #ifdef XENPV 5023 if (domid != DOMID_SELF) 5024 new_pg = NULL; 5025 else 5026 #endif 5027 new_pg = PHYS_TO_VM_PAGE(pa); 5028 5029 if (new_pg != NULL) { 5030 /* This is a managed page */ 5031 npte |= PTE_PVLIST; 5032 new_pp = VM_PAGE_TO_PP(new_pg); 5033 PMAP_CHECK_PP(new_pp); 5034 } else if ((new_pp = pmap_pv_tracked(pa)) != NULL) { 5035 /* This is an unmanaged pv-tracked page */ 5036 npte |= PTE_PVLIST; 5037 PMAP_CHECK_PP(new_pp); 5038 } else { 5039 new_pp = NULL; 5040 } 5041 5042 /* Begin by locking the pmap. */ 5043 mutex_enter(&pmap->pm_lock); 5044 5045 /* Look up the PTP. Allocate if none present. */ 5046 ptp = NULL; 5047 getptp = false; 5048 if (pmap != pmap_kernel()) { 5049 ptp = pmap_find_ptp(pmap, va, 1); 5050 if (ptp == NULL) { 5051 getptp = true; 5052 error = pmap_get_ptp(pmap, &pt, va, flags, &ptp); 5053 if (error != 0) { 5054 if (flags & PMAP_CANFAIL) { 5055 mutex_exit(&pmap->pm_lock); 5056 return error; 5057 } 5058 panic("%s: get ptp failed, error=%d", __func__, 5059 error); 5060 } 5061 } 5062 tree = &VM_PAGE_TO_PP(ptp)->pp_rb; 5063 } else { 5064 /* Embedded PV entries rely on this. 
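 * A pp_pte with pte_ptp == NULL and pte_va == 0 is treated as "unused"
 * elsewhere in this file, and kernel mappings have no PTP, so VA 0 itself
 * must never be entered through this path.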
*/ 5065 KASSERT(va != 0); 5066 tree = &pmap_kernel_rb; 5067 } 5068 5069 /* 5070 * Look up the old PV entry at this VA (if any), and insert a new PV 5071 * entry if required for the new mapping. Temporarily track the old 5072 * and new mappings concurrently. Only after the old mapping is 5073 * evicted from the pmap will we remove its PV entry. Otherwise, 5074 * our picture of modified/accessed state for either page could get 5075 * out of sync (we need any P->V operation for either page to stall 5076 * on pmap->pm_lock until done here). 5077 */ 5078 new_pve = NULL; 5079 old_pve = NULL; 5080 samepage = false; 5081 new_embedded = false; 5082 5083 if (new_pp != NULL) { 5084 error = pmap_enter_pv(pmap, new_pp, ptp, va, &new_pve, 5085 &old_pve, &samepage, &new_embedded, tree); 5086 5087 /* 5088 * If a new pv_entry was needed and none was available, we 5089 * can go no further. 5090 */ 5091 if (error != 0) { 5092 if (flags & PMAP_CANFAIL) { 5093 if (getptp) { 5094 pmap_unget_ptp(pmap, &pt); 5095 } 5096 mutex_exit(&pmap->pm_lock); 5097 return error; 5098 } 5099 panic("%s: alloc pve failed", __func__); 5100 } 5101 } else { 5102 old_pve = pmap_treelookup_pv(pmap, ptp, tree, va); 5103 } 5104 5105 /* Map PTEs into address space. */ 5106 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); 5107 5108 /* Install any newly allocated PTPs. */ 5109 if (getptp) { 5110 pmap_install_ptp(pmap, &pt, va, pdes); 5111 } 5112 5113 /* Check if there is an existing mapping. */ 5114 ptep = &ptes[pl1_i(va)]; 5115 opte = *ptep; 5116 bool have_oldpa = pmap_valid_entry(opte); 5117 paddr_t oldpa = pmap_pte2pa(opte); 5118 5119 /* 5120 * Update the pte. 5121 */ 5122 do { 5123 opte = *ptep; 5124 5125 /* 5126 * if the same page, inherit PTE_A and PTE_D. 5127 */ 5128 if (((opte ^ npte) & (PTE_FRAME | PTE_P)) == 0) { 5129 npte |= opte & (PTE_A | PTE_D); 5130 } 5131 #if defined(XENPV) 5132 if (domid != DOMID_SELF) { 5133 /* pmap_pte_cas with error handling */ 5134 int s = splvm(); 5135 if (opte != *ptep) { 5136 splx(s); 5137 continue; 5138 } 5139 error = xpq_update_foreign( 5140 vtomach((vaddr_t)ptep), npte, domid, flags); 5141 splx(s); 5142 if (error) { 5143 /* Undo pv_entry tracking - oof. */ 5144 if (new_pp != NULL) { 5145 mutex_spin_enter(&new_pp->pp_lock); 5146 if (new_pve != NULL) { 5147 LIST_REMOVE(new_pve, pve_list); 5148 KASSERT(pmap->pm_pve == NULL); 5149 pmap->pm_pve = new_pve; 5150 } else if (new_embedded) { 5151 new_pp->pp_pte.pte_ptp = NULL; 5152 new_pp->pp_pte.pte_va = 0; 5153 } 5154 mutex_spin_exit(&new_pp->pp_lock); 5155 } 5156 pmap_unmap_ptes(pmap, pmap2); 5157 /* Free new PTP. */ 5158 if (ptp != NULL && ptp->wire_count <= 1) { 5159 pmap_free_ptp(pmap, ptp, va, ptes, 5160 pdes); 5161 } 5162 mutex_exit(&pmap->pm_lock); 5163 return error; 5164 } 5165 break; 5166 } 5167 #endif /* defined(XENPV) */ 5168 } while (pmap_pte_cas(ptep, opte, npte) != opte); 5169 5170 /* 5171 * Done with the PTEs: they can now be unmapped. 5172 */ 5173 pmap_unmap_ptes(pmap, pmap2); 5174 5175 /* 5176 * Update statistics and PTP's reference count. 5177 */ 5178 pmap_stats_update_bypte(pmap, npte, opte); 5179 if (ptp != NULL) { 5180 if (!have_oldpa) { 5181 ptp->wire_count++; 5182 } 5183 /* Remember minimum VA in PTP. */ 5184 pmap_ptp_range_set(ptp, va); 5185 } 5186 KASSERT(ptp == NULL || ptp->wire_count > 1); 5187 5188 /* 5189 * If the same page, we can skip pv_entry handling. 
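 * (Same physical frame and still present: any existing pv_entry continues
 * to describe this mapping; only protection/attribute bits changed.)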
5190 */ 5191 if (((opte ^ npte) & (PTE_FRAME | PTE_P)) == 0) { 5192 KASSERT(((opte ^ npte) & PTE_PVLIST) == 0); 5193 if ((npte & PTE_PVLIST) != 0) { 5194 KASSERT(samepage); 5195 pmap_check_pv(pmap, ptp, new_pp, va, true); 5196 } 5197 goto same_pa; 5198 } else if ((npte & PTE_PVLIST) != 0) { 5199 KASSERT(!samepage); 5200 } 5201 5202 /* 5203 * If old page is pv-tracked, remove pv_entry from its list. 5204 */ 5205 if ((~opte & (PTE_P | PTE_PVLIST)) == 0) { 5206 if ((old_pg = PHYS_TO_VM_PAGE(oldpa)) != NULL) { 5207 old_pp = VM_PAGE_TO_PP(old_pg); 5208 } else if ((old_pp = pmap_pv_tracked(oldpa)) == NULL) { 5209 panic("%s: PTE_PVLIST with pv-untracked page" 5210 " va = %#"PRIxVADDR 5211 " pa = %#" PRIxPADDR " (%#" PRIxPADDR ")", 5212 __func__, va, oldpa, atop(pa)); 5213 } 5214 5215 pmap_remove_pv(pmap, old_pp, ptp, va, old_pve, 5216 pmap_pte_to_pp_attrs(opte)); 5217 } else { 5218 KASSERT(old_pve == NULL); 5219 KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == NULL); 5220 } 5221 5222 /* 5223 * If new page is dynamically PV tracked, insert to tree. 5224 */ 5225 if (new_pve != NULL) { 5226 KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == NULL); 5227 old_pve = rb_tree_insert_node(tree, new_pve); 5228 KASSERT(old_pve == new_pve); 5229 pmap_check_pv(pmap, ptp, new_pp, va, true); 5230 } 5231 5232 same_pa: 5233 /* 5234 * shootdown tlb if necessary. 5235 */ 5236 5237 if ((~opte & (PTE_P | PTE_A)) == 0 && 5238 ((opte ^ npte) & (PTE_FRAME | PTE_W)) != 0) { 5239 pmap_tlb_shootdown(pmap, va, opte, TLBSHOOT_ENTER); 5240 } 5241 pmap_drain_pv(pmap); 5242 mutex_exit(&pmap->pm_lock); 5243 return 0; 5244 } 5245 5246 #if defined(XEN) && defined(DOM0OPS) 5247 5248 struct pmap_data_gnt { 5249 SLIST_ENTRY(pmap_data_gnt) pd_gnt_list; 5250 vaddr_t pd_gnt_sva; 5251 vaddr_t pd_gnt_eva; /* range covered by this gnt */ 5252 int pd_gnt_refs; /* ref counter */ 5253 struct gnttab_map_grant_ref pd_gnt_ops[1]; /* variable length */ 5254 }; 5255 SLIST_HEAD(pmap_data_gnt_head, pmap_data_gnt); 5256 5257 static void pmap_remove_gnt(struct pmap *, vaddr_t, vaddr_t); 5258 5259 static struct pmap_data_gnt * 5260 pmap_find_gnt(struct pmap *pmap, vaddr_t sva, vaddr_t eva) 5261 { 5262 struct pmap_data_gnt_head *headp; 5263 struct pmap_data_gnt *pgnt; 5264 5265 KASSERT(mutex_owned(&pmap->pm_lock)); 5266 headp = pmap->pm_data; 5267 KASSERT(headp != NULL); 5268 SLIST_FOREACH(pgnt, headp, pd_gnt_list) { 5269 if (pgnt->pd_gnt_sva <= sva && eva <= pgnt->pd_gnt_eva) 5270 return pgnt; 5271 /* check that we're not overlapping part of a region */ 5272 KASSERT(pgnt->pd_gnt_sva >= eva || pgnt->pd_gnt_eva <= sva); 5273 } 5274 return NULL; 5275 } 5276 5277 static void 5278 pmap_alloc_gnt(struct pmap *pmap, vaddr_t sva, int nentries, 5279 const struct gnttab_map_grant_ref *ops) 5280 { 5281 struct pmap_data_gnt_head *headp; 5282 struct pmap_data_gnt *pgnt; 5283 vaddr_t eva = sva + nentries * PAGE_SIZE; 5284 KASSERT(mutex_owned(&pmap->pm_lock)); 5285 KASSERT(nentries >= 1); 5286 if (pmap->pm_remove == NULL) { 5287 pmap->pm_remove = pmap_remove_gnt; 5288 KASSERT(pmap->pm_data == NULL); 5289 headp = kmem_alloc(sizeof(*headp), KM_SLEEP); 5290 SLIST_INIT(headp); 5291 pmap->pm_data = headp; 5292 } else { 5293 KASSERT(pmap->pm_remove == pmap_remove_gnt); 5294 KASSERT(pmap->pm_data != NULL); 5295 headp = pmap->pm_data; 5296 } 5297 5298 pgnt = pmap_find_gnt(pmap, sva, eva); 5299 if (pgnt != NULL) { 5300 KASSERT(pgnt->pd_gnt_sva == sva); 5301 KASSERT(pgnt->pd_gnt_eva == eva); 5302 return; 5303 } 5304 5305 /* new entry */ 5306 pgnt = kmem_alloc(sizeof(*pgnt) + 
5307 (nentries - 1) * sizeof(struct gnttab_map_grant_ref), KM_SLEEP); 5308 pgnt->pd_gnt_sva = sva; 5309 pgnt->pd_gnt_eva = eva; 5310 pgnt->pd_gnt_refs = 0; 5311 memcpy(pgnt->pd_gnt_ops, ops, 5312 sizeof(struct gnttab_map_grant_ref) * nentries); 5313 SLIST_INSERT_HEAD(headp, pgnt, pd_gnt_list); 5314 } 5315 5316 static void 5317 pmap_free_gnt(struct pmap *pmap, struct pmap_data_gnt *pgnt) 5318 { 5319 struct pmap_data_gnt_head *headp = pmap->pm_data; 5320 int nentries = (pgnt->pd_gnt_eva - pgnt->pd_gnt_sva) / PAGE_SIZE; 5321 KASSERT(nentries >= 1); 5322 KASSERT(mutex_owned(&pmap->pm_lock)); 5323 KASSERT(pgnt->pd_gnt_refs == 0); 5324 SLIST_REMOVE(headp, pgnt, pmap_data_gnt, pd_gnt_list); 5325 kmem_free(pgnt, sizeof(*pgnt) + 5326 (nentries - 1) * sizeof(struct gnttab_map_grant_ref)); 5327 if (SLIST_EMPTY(headp)) { 5328 kmem_free(headp, sizeof(*headp)); 5329 pmap->pm_data = NULL; 5330 pmap->pm_remove = NULL; 5331 } 5332 } 5333 5334 /* 5335 * pmap_enter_gnt: enter a grant entry into a pmap 5336 * 5337 * => must be done "now" ... no lazy-evaluation 5338 */ 5339 int 5340 pmap_enter_gnt(struct pmap *pmap, vaddr_t va, vaddr_t sva, int nentries, 5341 const struct gnttab_map_grant_ref *oops) 5342 { 5343 struct pmap_data_gnt *pgnt; 5344 pt_entry_t *ptes, opte; 5345 #ifndef XENPV 5346 pt_entry_t npte; 5347 #endif 5348 pt_entry_t *ptep; 5349 pd_entry_t * const *pdes; 5350 struct vm_page *ptp; 5351 struct vm_page *old_pg; 5352 struct pmap_page *old_pp; 5353 struct pv_entry *old_pve; 5354 struct pmap *pmap2; 5355 struct pmap_ptparray pt; 5356 int error; 5357 bool getptp; 5358 rb_tree_t *tree; 5359 struct gnttab_map_grant_ref *op; 5360 int ret; 5361 int idx; 5362 5363 KASSERT(pmap_initialized); 5364 KASSERT(va < VM_MAX_KERNEL_ADDRESS); 5365 KASSERTMSG(va != (vaddr_t)PDP_BASE, "%s: trying to map va=%#" 5366 PRIxVADDR " over PDP!", __func__, va); 5367 KASSERT(pmap != pmap_kernel()); 5368 5369 /* Begin by locking the pmap. */ 5370 mutex_enter(&pmap->pm_lock); 5371 pmap_alloc_gnt(pmap, sva, nentries, oops); 5372 5373 pgnt = pmap_find_gnt(pmap, va, va + PAGE_SIZE); 5374 KASSERT(pgnt != NULL); 5375 5376 /* Look up the PTP. Allocate if none present. */ 5377 ptp = NULL; 5378 getptp = false; 5379 ptp = pmap_find_ptp(pmap, va, 1); 5380 if (ptp == NULL) { 5381 getptp = true; 5382 error = pmap_get_ptp(pmap, &pt, va, PMAP_CANFAIL, &ptp); 5383 if (error != 0) { 5384 mutex_exit(&pmap->pm_lock); 5385 return error; 5386 } 5387 } 5388 tree = &VM_PAGE_TO_PP(ptp)->pp_rb; 5389 5390 /* 5391 * Look up the old PV entry at this VA (if any), and insert a new PV 5392 * entry if required for the new mapping. Temporarily track the old 5393 * and new mappings concurrently. Only after the old mapping is 5394 * evicted from the pmap will we remove its PV entry. Otherwise, 5395 * our picture of modified/accessed state for either page could get 5396 * out of sync (we need any P->V operation for either page to stall 5397 * on pmap->pm_lock until done here). 5398 */ 5399 old_pve = NULL; 5400 5401 old_pve = pmap_treelookup_pv(pmap, ptp, tree, va); 5402 5403 /* Map PTEs into address space. */ 5404 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); 5405 5406 /* Install any newly allocated PTPs. */ 5407 if (getptp) { 5408 pmap_install_ptp(pmap, &pt, va, pdes); 5409 } 5410 5411 /* Check if there is an existing mapping. */ 5412 ptep = &ptes[pl1_i(va)]; 5413 opte = *ptep; 5414 bool have_oldpa = pmap_valid_entry(opte); 5415 paddr_t oldpa = pmap_pte2pa(opte); 5416 5417 /* 5418 * Update the pte. 
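 * For XENPV the hypervisor writes the PTE itself (GNTMAP_contains_pte,
 * with the machine address of the PTE slot); otherwise we build the PTE
 * from op->host_addr and install it once the grant map op has succeeded.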
5419 */ 5420 5421 idx = (va - pgnt->pd_gnt_sva) / PAGE_SIZE; 5422 op = &pgnt->pd_gnt_ops[idx]; 5423 5424 #ifdef XENPV 5425 KASSERT(op->flags & GNTMAP_contains_pte); 5426 op->host_addr = xpmap_ptetomach(ptep); 5427 #else 5428 KASSERT((op->flags & GNTMAP_contains_pte) == 0); 5429 KASSERT(op->flags != 0); 5430 KASSERT(op->host_addr != 0); 5431 #endif 5432 op->dev_bus_addr = 0; 5433 op->status = GNTST_general_error; 5434 ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, op, 1); 5435 if (__predict_false(ret)) { 5436 printf("%s: GNTTABOP_map_grant_ref failed: %d\n", 5437 __func__, ret); 5438 op->status = GNTST_general_error; 5439 } 5440 for (int d = 0; d < 256 && op->status == GNTST_eagain; d++) { 5441 kpause("gntmap", false, mstohz(1), NULL); 5442 ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, op, 1); 5443 if (__predict_false(ret)) { 5444 printf("%s: GNTTABOP_map_grant_ref failed: %d\n", 5445 __func__, ret); 5446 op->status = GNTST_general_error; 5447 } 5448 } 5449 if (__predict_false(op->status != GNTST_okay)) { 5450 printf("%s: GNTTABOP_map_grant_ref status: %d\n", 5451 __func__, op->status); 5452 if (have_oldpa) { /* XXX did the pte really change if XENPV ?*/ 5453 ptp->wire_count--; 5454 } 5455 } else { 5456 #ifndef XENPV 5457 npte = op->host_addr | pmap_pg_nx | PTE_U | PTE_P; 5458 if ((op->flags & GNTMAP_readonly) == 0) 5459 npte |= PTE_W; 5460 do { 5461 opte = *ptep; 5462 } while (pmap_pte_cas(ptep, opte, npte) != opte); 5463 #endif 5464 pgnt->pd_gnt_refs++; 5465 if (!have_oldpa) { 5466 ptp->wire_count++; 5467 } 5468 KASSERT(ptp->wire_count > 1); 5469 /* Remember minimum VA in PTP. */ 5470 pmap_ptp_range_set(ptp, va); 5471 } 5472 if (ptp->wire_count <= 1) 5473 pmap_free_ptp(pmap, ptp, va, ptes, pdes); 5474 5475 /* 5476 * Done with the PTEs: they can now be unmapped. 5477 */ 5478 pmap_unmap_ptes(pmap, pmap2); 5479 5480 /* 5481 * Update statistics and PTP's reference count. 5482 */ 5483 pmap_stats_update_bypte(pmap, 0, opte); 5484 5485 /* 5486 * If old page is pv-tracked, remove pv_entry from its list. 5487 */ 5488 if ((~opte & (PTE_P | PTE_PVLIST)) == 0) { 5489 if ((old_pg = PHYS_TO_VM_PAGE(oldpa)) != NULL) { 5490 old_pp = VM_PAGE_TO_PP(old_pg); 5491 } else if ((old_pp = pmap_pv_tracked(oldpa)) == NULL) { 5492 panic("%s: PTE_PVLIST with pv-untracked page" 5493 " va = %#"PRIxVADDR " pa = %#" PRIxPADDR, 5494 __func__, va, oldpa); 5495 } 5496 5497 pmap_remove_pv(pmap, old_pp, ptp, va, old_pve, 5498 pmap_pte_to_pp_attrs(opte)); 5499 } else { 5500 KASSERT(old_pve == NULL); 5501 KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == NULL); 5502 } 5503 5504 pmap_drain_pv(pmap); 5505 mutex_exit(&pmap->pm_lock); 5506 return op->status; 5507 } 5508 5509 /* 5510 * pmap_remove_gnt: grant mapping removal function. 
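 * => installed as pmap->pm_remove by pmap_alloc_gnt(), so pmap_remove()
 *    dispatches here once a pmap has grant mappings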
5511 * 5512 * => caller should not be holding any pmap locks 5513 */ 5514 static void 5515 pmap_remove_gnt(struct pmap *pmap, vaddr_t sva, vaddr_t eva) 5516 { 5517 struct pmap_data_gnt *pgnt; 5518 pt_entry_t *ptes; 5519 pd_entry_t pde; 5520 pd_entry_t * const *pdes; 5521 struct vm_page *ptp; 5522 struct pmap *pmap2; 5523 vaddr_t va; 5524 int lvl; 5525 int idx; 5526 struct gnttab_map_grant_ref *op; 5527 struct gnttab_unmap_grant_ref unmap_op; 5528 int ret; 5529 5530 KASSERT(pmap != pmap_kernel()); 5531 KASSERT(pmap->pm_remove == pmap_remove_gnt); 5532 5533 mutex_enter(&pmap->pm_lock); 5534 for (va = sva; va < eva; va += PAGE_SIZE) { 5535 pgnt = pmap_find_gnt(pmap, va, va + PAGE_SIZE); 5536 if (pgnt == NULL) { 5537 pmap_remove_locked(pmap, sva, eva); 5538 continue; 5539 } 5540 5541 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); 5542 if (!pmap_pdes_valid(va, pdes, &pde, &lvl)) { 5543 panic("pmap_remove_gnt pdes not valid"); 5544 } 5545 5546 idx = (va - pgnt->pd_gnt_sva) / PAGE_SIZE; 5547 op = &pgnt->pd_gnt_ops[idx]; 5548 KASSERT(lvl == 1); 5549 5550 /* Get PTP if non-kernel mapping. */ 5551 ptp = pmap_find_ptp(pmap, va, 1); 5552 KASSERTMSG(ptp != NULL, 5553 "%s: unmanaged PTP detected", __func__); 5554 5555 if (op->status == GNTST_okay) { 5556 KASSERT(pmap_valid_entry(ptes[pl1_i(va)])); 5557 #ifdef XENPV 5558 unmap_op.host_addr = xpmap_ptetomach(&ptes[pl1_i(va)]); 5559 #else 5560 unmap_op.host_addr = op->host_addr; 5561 pmap_pte_testset(&ptes[pl1_i(va)], 0); 5562 #endif 5563 unmap_op.handle = op->handle; 5564 unmap_op.dev_bus_addr = 0; 5565 ret = HYPERVISOR_grant_table_op( 5566 GNTTABOP_unmap_grant_ref, &unmap_op, 1); 5567 if (ret) { 5568 printf("%s: GNTTABOP_unmap_grant_ref " 5569 "failed: %d\n", __func__, ret); 5570 } 5571 5572 ptp->wire_count--; 5573 pgnt->pd_gnt_refs--; 5574 } 5575 if (pgnt->pd_gnt_refs == 0) { 5576 pmap_free_gnt(pmap, pgnt); 5577 } 5578 /* 5579 * if mapping removed and the PTP is no longer 5580 * being used, free it! 5581 */ 5582 5583 if (ptp->wire_count <= 1) 5584 pmap_free_ptp(pmap, ptp, va, ptes, pdes); 5585 pmap_unmap_ptes(pmap, pmap2); 5586 } 5587 mutex_exit(&pmap->pm_lock); 5588 } 5589 #endif /* XEN && DOM0OPS */ 5590 5591 paddr_t 5592 pmap_get_physpage(void) 5593 { 5594 struct vm_page *ptp; 5595 struct pmap *kpm = pmap_kernel(); 5596 paddr_t pa; 5597 5598 if (!uvm.page_init_done) { 5599 /* 5600 * We're growing the kernel pmap early (from 5601 * uvm_pageboot_alloc()). This case must be 5602 * handled a little differently. 
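 * uvm_pagealloc() is not usable yet, so steal a physical page with
 * uvm_page_physget() and zero it by hand (via the direct map, a Xen
 * hypercall, or the early_zerop window, depending on configuration).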
5603 */ 5604 5605 if (!uvm_page_physget(&pa)) 5606 panic("%s: out of memory", __func__); 5607 #if defined(__HAVE_DIRECT_MAP) 5608 memset(PAGE_ALIGNED(PMAP_DIRECT_MAP(pa)), 0, PAGE_SIZE); 5609 #else 5610 #if defined(XENPV) 5611 if (XEN_VERSION_SUPPORTED(3, 4)) { 5612 xen_pagezero(pa); 5613 return pa; 5614 } 5615 #endif 5616 kpreempt_disable(); 5617 pmap_pte_set(early_zero_pte, pmap_pa2pte(pa) | PTE_P | 5618 PTE_W | pmap_pg_nx); 5619 pmap_pte_flush(); 5620 pmap_update_pg((vaddr_t)early_zerop); 5621 memset(PAGE_ALIGNED(early_zerop), 0, PAGE_SIZE); 5622 #if defined(DIAGNOSTIC) || defined(XENPV) 5623 pmap_pte_set(early_zero_pte, 0); 5624 pmap_pte_flush(); 5625 #endif /* defined(DIAGNOSTIC) */ 5626 kpreempt_enable(); 5627 #endif /* defined(__HAVE_DIRECT_MAP) */ 5628 } else { 5629 /* XXX */ 5630 ptp = uvm_pagealloc(NULL, 0, NULL, 5631 UVM_PGA_USERESERVE|UVM_PGA_ZERO); 5632 if (ptp == NULL) 5633 panic("%s: out of memory", __func__); 5634 ptp->flags &= ~PG_BUSY; 5635 ptp->wire_count = 1; 5636 pa = VM_PAGE_TO_PHYS(ptp); 5637 } 5638 pmap_stats_update(kpm, 1, 0); 5639 5640 return pa; 5641 } 5642 5643 /* 5644 * Expand the page tree with the specified amount of PTPs, mapping virtual 5645 * addresses starting at kva. We populate all the levels but the last one 5646 * (L1). The nodes of the tree are created as RW, but the pages covered 5647 * will be kentered in L1, with proper permissions. 5648 * 5649 * Used only by pmap_growkernel. 5650 */ 5651 static void 5652 pmap_alloc_level(struct pmap *cpm, vaddr_t kva, long *needed_ptps) 5653 { 5654 unsigned long i; 5655 paddr_t pa; 5656 unsigned long index, endindex; 5657 int level; 5658 pd_entry_t *pdep; 5659 #ifdef XENPV 5660 int s = splvm(); /* protect xpq_* */ 5661 #endif 5662 5663 for (level = PTP_LEVELS; level > 1; level--) { 5664 if (level == PTP_LEVELS) 5665 pdep = cpm->pm_pdir; 5666 else 5667 pdep = normal_pdes[level - 2]; 5668 index = pl_i_roundup(kva, level); 5669 endindex = index + needed_ptps[level - 1] - 1; 5670 5671 for (i = index; i <= endindex; i++) { 5672 pt_entry_t pte; 5673 5674 KASSERT(!pmap_valid_entry(pdep[i])); 5675 pa = pmap_get_physpage(); 5676 pte = pmap_pa2pte(pa) | PTE_P | PTE_W; 5677 #ifdef __x86_64__ 5678 pte |= pmap_pg_nx; 5679 #endif 5680 pmap_pte_set(&pdep[i], pte); 5681 5682 #ifdef XENPV 5683 if (level == PTP_LEVELS && i >= PDIR_SLOT_KERN) { 5684 if (__predict_true( 5685 cpu_info_primary.ci_flags & CPUF_PRESENT)) { 5686 /* update per-cpu PMDs on all cpus */ 5687 xen_kpm_sync(pmap_kernel(), i); 5688 } else { 5689 /* 5690 * too early; update primary CPU 5691 * PMD only (without locks) 5692 */ 5693 #ifdef __x86_64__ 5694 pd_entry_t *cpu_pdep = 5695 &cpu_info_primary.ci_kpm_pdir[i]; 5696 #else 5697 pd_entry_t *cpu_pdep = 5698 &cpu_info_primary.ci_kpm_pdir[l2tol2(i)]; 5699 #endif 5700 pmap_pte_set(cpu_pdep, pte); 5701 } 5702 } 5703 #endif 5704 5705 KASSERT(level != PTP_LEVELS || nkptp[level - 1] + 5706 pl_i(VM_MIN_KERNEL_ADDRESS, level) == i); 5707 nkptp[level - 1]++; 5708 } 5709 pmap_pte_flush(); 5710 } 5711 #ifdef XENPV 5712 splx(s); 5713 #endif 5714 } 5715 5716 /* 5717 * pmap_growkernel: increase usage of KVM space. 5718 * 5719 * => we allocate new PTPs for the kernel and install them in all 5720 * the pmaps on the system. 
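 *
 * Illustrative use (a sketch; 'new_end' is hypothetical, not from this
 * file): a caller that needs KVA mapped up to new_end would do
 *
 *	new_end = pmap_growkernel(new_end);
 *
 * and may rely on the result being rounded up to a PDE boundary.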
5721 */ 5722 vaddr_t 5723 pmap_growkernel(vaddr_t maxkvaddr) 5724 { 5725 struct pmap *kpm = pmap_kernel(); 5726 struct pmap *cpm; 5727 #if !defined(XENPV) || !defined(__x86_64__) 5728 struct pmap *pm; 5729 long old; 5730 #endif 5731 int s, i; 5732 long needed_kptp[PTP_LEVELS], target_nptp; 5733 bool invalidate = false; 5734 5735 s = splvm(); /* to be safe */ 5736 mutex_enter(&kpm->pm_lock); 5737 5738 if (maxkvaddr <= pmap_maxkvaddr) { 5739 mutex_exit(&kpm->pm_lock); 5740 splx(s); 5741 return pmap_maxkvaddr; 5742 } 5743 5744 maxkvaddr = x86_round_pdr(maxkvaddr); 5745 #if !defined(XENPV) || !defined(__x86_64__) 5746 old = nkptp[PTP_LEVELS - 1]; 5747 #endif 5748 5749 /* Initialize needed_kptp. */ 5750 for (i = PTP_LEVELS - 1; i >= 1; i--) { 5751 target_nptp = pl_i_roundup(maxkvaddr, i + 1) - 5752 pl_i_roundup(VM_MIN_KERNEL_ADDRESS, i + 1); 5753 5754 if (target_nptp > nkptpmax[i]) 5755 panic("out of KVA space"); 5756 KASSERT(target_nptp >= nkptp[i]); 5757 needed_kptp[i] = target_nptp - nkptp[i]; 5758 } 5759 5760 #ifdef XENPV 5761 /* only pmap_kernel(), or the per-cpu map, has kernel entries */ 5762 cpm = kpm; 5763 #else 5764 /* Get the current pmap */ 5765 if (__predict_true(cpu_info_primary.ci_flags & CPUF_PRESENT)) { 5766 cpm = curcpu()->ci_pmap; 5767 } else { 5768 cpm = kpm; 5769 } 5770 #endif 5771 5772 kasan_shadow_map((void *)pmap_maxkvaddr, 5773 (size_t)(maxkvaddr - pmap_maxkvaddr)); 5774 kmsan_shadow_map((void *)pmap_maxkvaddr, 5775 (size_t)(maxkvaddr - pmap_maxkvaddr)); 5776 5777 pmap_alloc_level(cpm, pmap_maxkvaddr, needed_kptp); 5778 5779 /* 5780 * If the number of top level entries changed, update all pmaps. 5781 */ 5782 if (needed_kptp[PTP_LEVELS - 1] != 0) { 5783 #ifdef XENPV 5784 #ifdef __x86_64__ 5785 /* nothing, kernel entries are never entered in user pmap */ 5786 #else 5787 int pdkidx; 5788 5789 mutex_enter(&pmaps_lock); 5790 LIST_FOREACH(pm, &pmaps, pm_list) { 5791 for (pdkidx = PDIR_SLOT_KERN + old; 5792 pdkidx < PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1]; 5793 pdkidx++) { 5794 pmap_pte_set(&pm->pm_pdir[pdkidx], 5795 kpm->pm_pdir[pdkidx]); 5796 } 5797 pmap_pte_flush(); 5798 } 5799 mutex_exit(&pmaps_lock); 5800 #endif /* __x86_64__ */ 5801 #else /* XENPV */ 5802 size_t newpdes; 5803 newpdes = nkptp[PTP_LEVELS - 1] - old; 5804 if (cpm != kpm) { 5805 memcpy(&kpm->pm_pdir[PDIR_SLOT_KERN + old], 5806 &cpm->pm_pdir[PDIR_SLOT_KERN + old], 5807 newpdes * sizeof(pd_entry_t)); 5808 } 5809 5810 mutex_enter(&pmaps_lock); 5811 LIST_FOREACH(pm, &pmaps, pm_list) { 5812 if (__predict_false(pm->pm_enter != NULL)) { 5813 /* 5814 * Not a native pmap, the kernel is not mapped, 5815 * so nothing to synchronize. 5816 */ 5817 continue; 5818 } 5819 memcpy(&pm->pm_pdir[PDIR_SLOT_KERN + old], 5820 &kpm->pm_pdir[PDIR_SLOT_KERN + old], 5821 newpdes * sizeof(pd_entry_t)); 5822 } 5823 mutex_exit(&pmaps_lock); 5824 #endif 5825 invalidate = true; 5826 } 5827 pmap_maxkvaddr = maxkvaddr; 5828 mutex_exit(&kpm->pm_lock); 5829 splx(s); 5830 5831 if (invalidate && pmap_initialized) { 5832 /* Invalidate the pmap cache. 
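 * Cached, already-constructed pmaps carry a copy of the old kernel PDEs;
 * invalidating the cache makes newly allocated pmaps pick up the grown
 * set of top-level kernel entries.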
*/ 5833 pool_cache_invalidate(&pmap_cache); 5834 } 5835 5836 return maxkvaddr; 5837 } 5838 5839 #ifdef DEBUG 5840 void pmap_dump(struct pmap *, vaddr_t, vaddr_t); 5841 5842 /* 5843 * pmap_dump: dump all the mappings from a pmap 5844 * 5845 * => caller should not be holding any pmap locks 5846 */ 5847 void 5848 pmap_dump(struct pmap *pmap, vaddr_t sva, vaddr_t eva) 5849 { 5850 pt_entry_t *ptes, *pte; 5851 pd_entry_t * const *pdes; 5852 struct pmap *pmap2; 5853 vaddr_t blkendva; 5854 int lvl; 5855 5856 /* 5857 * if end is out of range truncate. 5858 * if (end == start) update to max. 5859 */ 5860 5861 if (eva > VM_MAXUSER_ADDRESS || eva <= sva) 5862 eva = VM_MAXUSER_ADDRESS; 5863 5864 mutex_enter(&pmap->pm_lock); 5865 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); 5866 5867 /* 5868 * dumping a range of pages: we dump in PTP sized blocks (4MB) 5869 */ 5870 5871 for (/* null */ ; sva < eva ; sva = blkendva) { 5872 5873 /* determine range of block */ 5874 blkendva = x86_round_pdr(sva+1); 5875 if (blkendva > eva) 5876 blkendva = eva; 5877 5878 /* valid block? */ 5879 if (!pmap_pdes_valid(sva, pdes, NULL, &lvl)) 5880 continue; 5881 KASSERT(lvl == 1); 5882 5883 pte = &ptes[pl1_i(sva)]; 5884 for (/* null */; sva < blkendva ; sva += PAGE_SIZE, pte++) { 5885 if (!pmap_valid_entry(*pte)) 5886 continue; 5887 printf("va %#" PRIxVADDR " -> pa %#" PRIxPADDR 5888 " (pte=%#" PRIxPADDR ")\n", 5889 sva, (paddr_t)pmap_pte2pa(*pte), (paddr_t)*pte); 5890 } 5891 } 5892 pmap_unmap_ptes(pmap, pmap2); 5893 mutex_exit(&pmap->pm_lock); 5894 } 5895 #endif 5896 5897 /* 5898 * pmap_update: process deferred invalidations and frees. 5899 */ 5900 void 5901 pmap_update(struct pmap *pmap) 5902 { 5903 struct pmap_page *pp; 5904 struct vm_page *ptp; 5905 5906 /* 5907 * Initiate any pending TLB shootdowns. Wait for them to 5908 * complete before returning control to the caller. 5909 */ 5910 kpreempt_disable(); 5911 pmap_tlb_shootnow(); 5912 kpreempt_enable(); 5913 5914 /* 5915 * Now that shootdowns are complete, process deferred frees. This 5916 * is an unlocked check, but is safe as we're only interested in 5917 * work done in this LWP - we won't get a false negative. 5918 */ 5919 if (atomic_load_relaxed(&pmap->pm_gc_ptp.lh_first) == NULL) { 5920 return; 5921 } 5922 5923 mutex_enter(&pmap->pm_lock); 5924 while ((ptp = LIST_FIRST(&pmap->pm_gc_ptp)) != NULL) { 5925 KASSERT(ptp->wire_count == 0); 5926 KASSERT(ptp->uanon == NULL); 5927 LIST_REMOVE(ptp, mdpage.mp_pp.pp_link); 5928 pp = VM_PAGE_TO_PP(ptp); 5929 LIST_INIT(&pp->pp_pvlist); 5930 pp->pp_attrs = 0; 5931 pp->pp_pte.pte_ptp = NULL; 5932 pp->pp_pte.pte_va = 0; 5933 PMAP_CHECK_PP(VM_PAGE_TO_PP(ptp)); 5934 5935 /* 5936 * XXX Hack to avoid extra locking, and lock 5937 * assertions in uvm_pagefree(). Despite uobject 5938 * being set, this isn't a managed page. 
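 * uvm_pagerealloc() below detaches the PTP from the pmap's uvm_object so
 * that uvm_pagefree() sees an unowned page.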
5939 */ 5940 PMAP_DUMMY_LOCK(pmap); 5941 uvm_pagerealloc(ptp, NULL, 0); 5942 PMAP_DUMMY_UNLOCK(pmap); 5943 uvm_pagefree(ptp); 5944 } 5945 mutex_exit(&pmap->pm_lock); 5946 } 5947 5948 #if PTP_LEVELS > 4 5949 #error "Unsupported number of page table mappings" 5950 #endif 5951 5952 paddr_t 5953 pmap_init_tmp_pgtbl(paddr_t pg) 5954 { 5955 static bool maps_loaded; 5956 static const paddr_t x86_tmp_pml_paddr[] = { 5957 4 * PAGE_SIZE, /* L1 */ 5958 5 * PAGE_SIZE, /* L2 */ 5959 6 * PAGE_SIZE, /* L3 */ 5960 7 * PAGE_SIZE /* L4 */ 5961 }; 5962 static vaddr_t x86_tmp_pml_vaddr[] = { 0, 0, 0, 0 }; 5963 5964 pd_entry_t *tmp_pml, *kernel_pml; 5965 5966 int level; 5967 5968 if (!maps_loaded) { 5969 for (level = 0; level < PTP_LEVELS; ++level) { 5970 x86_tmp_pml_vaddr[level] = 5971 uvm_km_alloc(kernel_map, PAGE_SIZE, 0, 5972 UVM_KMF_VAONLY); 5973 5974 if (x86_tmp_pml_vaddr[level] == 0) 5975 panic("mapping of real mode PML failed\n"); 5976 pmap_kenter_pa(x86_tmp_pml_vaddr[level], 5977 x86_tmp_pml_paddr[level], 5978 VM_PROT_READ | VM_PROT_WRITE, 0); 5979 } 5980 pmap_update(pmap_kernel()); 5981 maps_loaded = true; 5982 } 5983 5984 /* Zero levels 1-3 */ 5985 for (level = 0; level < PTP_LEVELS - 1; ++level) { 5986 tmp_pml = (void *)x86_tmp_pml_vaddr[level]; 5987 memset(PAGE_ALIGNED(tmp_pml), 0, PAGE_SIZE); 5988 } 5989 5990 /* Copy PML4 */ 5991 kernel_pml = pmap_kernel()->pm_pdir; 5992 tmp_pml = (void *)x86_tmp_pml_vaddr[PTP_LEVELS - 1]; 5993 memcpy(PAGE_ALIGNED(tmp_pml), PAGE_ALIGNED(kernel_pml), PAGE_SIZE); 5994 5995 #ifdef PAE 5996 /* 5997 * Use the last 4 entries of the L2 page as L3 PD entries. These 5998 * last entries are unlikely to be used for temporary mappings. 5999 * 508: maps 0->1GB (userland) 6000 * 509: unused 6001 * 510: unused 6002 * 511: maps 3->4GB (kernel) 6003 */ 6004 tmp_pml[508] = x86_tmp_pml_paddr[PTP_LEVELS - 1] | PTE_P; 6005 tmp_pml[509] = 0; 6006 tmp_pml[510] = 0; 6007 tmp_pml[511] = pmap_pdirpa(pmap_kernel(), PDIR_SLOT_KERN) | PTE_P; 6008 #endif 6009 6010 for (level = PTP_LEVELS - 1; level > 0; --level) { 6011 tmp_pml = (void *)x86_tmp_pml_vaddr[level]; 6012 6013 tmp_pml[pl_i(pg, level + 1)] = 6014 (x86_tmp_pml_paddr[level - 1] & PTE_FRAME) | PTE_W | PTE_P; 6015 } 6016 6017 tmp_pml = (void *)x86_tmp_pml_vaddr[0]; 6018 tmp_pml[pl_i(pg, 1)] = (pg & PTE_FRAME) | PTE_W | PTE_P; 6019 6020 #ifdef PAE 6021 /* Return the PA of the L3 page (entry 508 of the L2 page) */ 6022 return x86_tmp_pml_paddr[PTP_LEVELS - 1] + 508 * sizeof(pd_entry_t); 6023 #endif 6024 6025 return x86_tmp_pml_paddr[PTP_LEVELS - 1]; 6026 } 6027 6028 u_int 6029 x86_mmap_flags(paddr_t mdpgno) 6030 { 6031 u_int nflag = (mdpgno >> X86_MMAP_FLAG_SHIFT) & X86_MMAP_FLAG_MASK; 6032 u_int pflag = 0; 6033 6034 if (nflag & X86_MMAP_FLAG_PREFETCH) 6035 pflag |= PMAP_WRITE_COMBINE; 6036 6037 return pflag; 6038 } 6039 6040 #if defined(__HAVE_DIRECT_MAP) && defined(__x86_64__) && !defined(XENPV) 6041 6042 /* 6043 * ----------------------------------------------------------------------------- 6044 * ***************************************************************************** 6045 * ***************************************************************************** 6046 * ***************************************************************************** 6047 * ***************************************************************************** 6048 * **************** HERE BEGINS THE EPT CODE, USED BY INTEL-VMX **************** 6049 * ***************************************************************************** 6050 * 
***************************************************************************** 6051 * ***************************************************************************** 6052 * ***************************************************************************** 6053 * ----------------------------------------------------------------------------- 6054 * 6055 * These functions are invoked as callbacks from the code above. Contrary to 6056 * native, EPT does not have a recursive slot; therefore, it is not possible 6057 * to call pmap_map_ptes(). Instead, we use the direct map and walk down the 6058 * tree manually. 6059 * 6060 * Apart from that, the logic is mostly the same as native. Once a pmap has 6061 * been created, NVMM calls pmap_ept_transform() to make it an EPT pmap. 6062 * After that we're good, and the callbacks will handle the translations 6063 * for us. 6064 * 6065 * ----------------------------------------------------------------------------- 6066 */ 6067 6068 /* Hardware bits. */ 6069 #define EPT_R __BIT(0) /* read */ 6070 #define EPT_W __BIT(1) /* write */ 6071 #define EPT_X __BIT(2) /* execute */ 6072 #define EPT_T __BITS(5,3) /* type */ 6073 #define TYPE_UC 0 6074 #define TYPE_WC 1 6075 #define TYPE_WT 4 6076 #define TYPE_WP 5 6077 #define TYPE_WB 6 6078 #define EPT_NOPAT __BIT(6) 6079 #define EPT_L __BIT(7) /* large */ 6080 #define EPT_A __BIT(8) /* accessed */ 6081 #define EPT_D __BIT(9) /* dirty */ 6082 /* Software bits. */ 6083 #define EPT_PVLIST __BIT(60) 6084 #define EPT_WIRED __BIT(61) 6085 6086 #define pmap_ept_valid_entry(pte) (pte & EPT_R) 6087 6088 bool pmap_ept_has_ad __read_mostly; 6089 6090 static inline void 6091 pmap_ept_stats_update_bypte(struct pmap *pmap, pt_entry_t npte, pt_entry_t opte) 6092 { 6093 int resid_diff = ((npte & EPT_R) ? 1 : 0) - ((opte & EPT_R) ? 1 : 0); 6094 int wired_diff = ((npte & EPT_WIRED) ? 1 : 0) - ((opte & EPT_WIRED) ? 1 : 0); 6095 6096 KASSERT((npte & (EPT_R | EPT_WIRED)) != EPT_WIRED); 6097 KASSERT((opte & (EPT_R | EPT_WIRED)) != EPT_WIRED); 6098 6099 pmap_stats_update(pmap, resid_diff, wired_diff); 6100 } 6101 6102 static pt_entry_t 6103 pmap_ept_type(u_int flags) 6104 { 6105 u_int cacheflags = (flags & PMAP_CACHE_MASK); 6106 pt_entry_t ret; 6107 6108 switch (cacheflags) { 6109 case PMAP_NOCACHE: 6110 case PMAP_NOCACHE_OVR: 6111 ret = __SHIFTIN(TYPE_UC, EPT_T); 6112 break; 6113 case PMAP_WRITE_COMBINE: 6114 ret = __SHIFTIN(TYPE_WC, EPT_T); 6115 break; 6116 case PMAP_WRITE_BACK: 6117 default: 6118 ret = __SHIFTIN(TYPE_WB, EPT_T); 6119 break; 6120 } 6121 6122 ret |= EPT_NOPAT; 6123 return ret; 6124 } 6125 6126 static inline pt_entry_t 6127 pmap_ept_prot(vm_prot_t prot) 6128 { 6129 pt_entry_t res = 0; 6130 6131 if (prot & VM_PROT_READ) 6132 res |= EPT_R; 6133 if (prot & VM_PROT_WRITE) 6134 res |= EPT_W; 6135 if (prot & VM_PROT_EXECUTE) 6136 res |= EPT_X; 6137 6138 return res; 6139 } 6140 6141 static inline uint8_t 6142 pmap_ept_to_pp_attrs(pt_entry_t ept) 6143 { 6144 uint8_t ret = 0; 6145 if (pmap_ept_has_ad) { 6146 if (ept & EPT_D) 6147 ret |= PP_ATTRS_D; 6148 if (ept & EPT_A) 6149 ret |= PP_ATTRS_A; 6150 } else { 6151 ret |= (PP_ATTRS_D|PP_ATTRS_A); 6152 } 6153 if (ept & EPT_W) 6154 ret |= PP_ATTRS_W; 6155 return ret; 6156 } 6157 6158 static inline pt_entry_t 6159 pmap_pp_attrs_to_ept(uint8_t attrs) 6160 { 6161 pt_entry_t ept = 0; 6162 if (attrs & PP_ATTRS_D) 6163 ept |= EPT_D; 6164 if (attrs & PP_ATTRS_A) 6165 ept |= EPT_A; 6166 if (attrs & PP_ATTRS_W) 6167 ept |= EPT_W; 6168 return ept; 6169 } 6170 6171 /* 6172 * Helper for pmap_ept_free_ptp. 
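 * Walks the EPT hierarchy through the direct map and records, at each
 * level, the address of the entry on the path to 'va':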
6173 * tree[0] = &L2[L2idx] 6174 * tree[1] = &L3[L3idx] 6175 * tree[2] = &L4[L4idx] 6176 */ 6177 static void 6178 pmap_ept_get_tree(struct pmap *pmap, vaddr_t va, pd_entry_t **tree) 6179 { 6180 pt_entry_t *pteva; 6181 paddr_t ptepa; 6182 int i, index; 6183 6184 ptepa = pmap->pm_pdirpa[0]; 6185 for (i = PTP_LEVELS; i > 1; i--) { 6186 index = pl_pi(va, i); 6187 pteva = (pt_entry_t *)PMAP_DIRECT_MAP(ptepa); 6188 KASSERT(pmap_ept_valid_entry(pteva[index])); 6189 tree[i - 2] = &pteva[index]; 6190 ptepa = pmap_pte2pa(pteva[index]); 6191 } 6192 } 6193 6194 static void 6195 pmap_ept_free_ptp(struct pmap *pmap, struct vm_page *ptp, vaddr_t va) 6196 { 6197 pd_entry_t *tree[3]; 6198 int level; 6199 6200 KASSERT(pmap != pmap_kernel()); 6201 KASSERT(mutex_owned(&pmap->pm_lock)); 6202 KASSERT(kpreempt_disabled()); 6203 6204 pmap_ept_get_tree(pmap, va, tree); 6205 6206 level = 1; 6207 do { 6208 (void)pmap_pte_testset(tree[level - 1], 0); 6209 6210 pmap_freepage(pmap, ptp, level); 6211 if (level < PTP_LEVELS - 1) { 6212 ptp = pmap_find_ptp(pmap, va, level + 1); 6213 ptp->wire_count--; 6214 if (ptp->wire_count > 1) 6215 break; 6216 } 6217 } while (++level < PTP_LEVELS); 6218 pmap_pte_flush(); 6219 } 6220 6221 /* Allocate L4->L3->L2. Return L2. */ 6222 static void 6223 pmap_ept_install_ptp(struct pmap *pmap, struct pmap_ptparray *pt, vaddr_t va) 6224 { 6225 struct vm_page *ptp; 6226 unsigned long index; 6227 pd_entry_t *pteva; 6228 paddr_t ptepa; 6229 int i; 6230 6231 KASSERT(pmap != pmap_kernel()); 6232 KASSERT(mutex_owned(&pmap->pm_lock)); 6233 KASSERT(kpreempt_disabled()); 6234 6235 /* 6236 * Now that we have all the pages looked up or allocated, 6237 * loop through again installing any new ones into the tree. 6238 */ 6239 ptepa = pmap->pm_pdirpa[0]; 6240 for (i = PTP_LEVELS; i > 1; i--) { 6241 index = pl_pi(va, i); 6242 pteva = (pt_entry_t *)PMAP_DIRECT_MAP(ptepa); 6243 6244 if (pmap_ept_valid_entry(pteva[index])) { 6245 KASSERT(!pt->alloced[i]); 6246 ptepa = pmap_pte2pa(pteva[index]); 6247 continue; 6248 } 6249 6250 ptp = pt->pg[i]; 6251 ptp->flags &= ~PG_BUSY; /* never busy */ 6252 ptp->wire_count = 1; 6253 pmap->pm_ptphint[i - 2] = ptp; 6254 ptepa = VM_PAGE_TO_PHYS(ptp); 6255 pmap_pte_set(&pteva[index], ptepa | EPT_R | EPT_W | EPT_X); 6256 6257 pmap_pte_flush(); 6258 pmap_stats_update(pmap, 1, 0); 6259 6260 /* 6261 * If we're not in the top level, increase the 6262 * wire count of the parent page. 
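 * A PTP's wire count tracks the valid entries it holds (plus one for the
 * page itself), and the PDE just installed at level 'i' is a new entry in
 * the level 'i + 1' PTP.  The top-level directory is not a managed PTP
 * and needs no such accounting, hence the check.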
6263 */ 6264 if (i < PTP_LEVELS) { 6265 pt->pg[i + 1]->wire_count++; 6266 } 6267 } 6268 } 6269 6270 static int 6271 pmap_ept_enter(struct pmap *pmap, vaddr_t va, paddr_t pa, vm_prot_t prot, 6272 u_int flags) 6273 { 6274 pt_entry_t *ptes, opte, npte; 6275 pt_entry_t *ptep; 6276 struct vm_page *ptp; 6277 struct vm_page *new_pg, *old_pg; 6278 struct pmap_page *new_pp, *old_pp; 6279 struct pv_entry *old_pve, *new_pve; 6280 bool wired = (flags & PMAP_WIRED) != 0; 6281 bool accessed; 6282 struct pmap_ptparray pt; 6283 int error; 6284 bool getptp, samepage, new_embedded; 6285 rb_tree_t *tree; 6286 6287 KASSERT(pmap_initialized); 6288 KASSERT(va < VM_MAXUSER_ADDRESS); 6289 6290 npte = pa | pmap_ept_prot(prot) | pmap_ept_type(flags); 6291 6292 if (wired) 6293 npte |= EPT_WIRED; 6294 if (flags & VM_PROT_ALL) { 6295 npte |= EPT_A; 6296 if (flags & VM_PROT_WRITE) { 6297 KASSERT((npte & EPT_W) != 0); 6298 npte |= EPT_D; 6299 } 6300 } 6301 6302 new_pg = PHYS_TO_VM_PAGE(pa); 6303 if (new_pg != NULL) { 6304 /* This is a managed page */ 6305 npte |= EPT_PVLIST; 6306 new_pp = VM_PAGE_TO_PP(new_pg); 6307 } else if ((new_pp = pmap_pv_tracked(pa)) != NULL) { 6308 /* This is an unmanaged pv-tracked page */ 6309 npte |= EPT_PVLIST; 6310 } else { 6311 new_pp = NULL; 6312 } 6313 6314 /* Begin by locking the pmap. */ 6315 mutex_enter(&pmap->pm_lock); 6316 6317 /* Look up the PTP. Allocate if none present. */ 6318 ptp = NULL; 6319 getptp = false; 6320 if (pmap != pmap_kernel()) { 6321 ptp = pmap_find_ptp(pmap, va, 1); 6322 if (ptp == NULL) { 6323 getptp = true; 6324 error = pmap_get_ptp(pmap, &pt, va, flags, &ptp); 6325 if (error != 0) { 6326 if (flags & PMAP_CANFAIL) { 6327 mutex_exit(&pmap->pm_lock); 6328 return error; 6329 } 6330 panic("%s: get ptp failed, error=%d", __func__, 6331 error); 6332 } 6333 } 6334 tree = &VM_PAGE_TO_PP(ptp)->pp_rb; 6335 } else { 6336 /* Embedded PV entries rely on this. */ 6337 KASSERT(va != 0); 6338 tree = &pmap_kernel_rb; 6339 } 6340 6341 /* 6342 * Look up the old PV entry at this VA (if any), and insert a new PV 6343 * entry if required for the new mapping. Temporarily track the old 6344 * and new mappings concurrently. Only after the old mapping is 6345 * evicted from the pmap will we remove its PV entry. Otherwise, 6346 * our picture of modified/accessed state for either page could get 6347 * out of sync (we need any P->V operation for either page to stall 6348 * on pmap->pm_lock until done here). 6349 */ 6350 new_pve = NULL; 6351 old_pve = NULL; 6352 samepage = false; 6353 new_embedded = false; 6354 6355 if (new_pp != NULL) { 6356 error = pmap_enter_pv(pmap, new_pp, ptp, va, &new_pve, 6357 &old_pve, &samepage, &new_embedded, tree); 6358 6359 /* 6360 * If a new pv_entry was needed and none was available, we 6361 * can go no further. 6362 */ 6363 if (error != 0) { 6364 if (flags & PMAP_CANFAIL) { 6365 if (getptp) { 6366 pmap_unget_ptp(pmap, &pt); 6367 } 6368 mutex_exit(&pmap->pm_lock); 6369 return error; 6370 } 6371 panic("%s: alloc pve failed", __func__); 6372 } 6373 } else { 6374 old_pve = pmap_treelookup_pv(pmap, ptp, tree, va); 6375 } 6376 6377 /* Map PTEs into address space. */ 6378 kpreempt_disable(); 6379 6380 /* Install any newly allocated PTPs. */ 6381 if (getptp) { 6382 pmap_ept_install_ptp(pmap, &pt, va); 6383 } 6384 6385 /* Check if there is an existing mapping. 
*/ 6386 ptes = (pt_entry_t *)PMAP_DIRECT_MAP(VM_PAGE_TO_PHYS(ptp)); 6387 ptep = &ptes[pl1_pi(va)]; 6388 opte = *ptep; 6389 bool have_oldpa = pmap_ept_valid_entry(opte); 6390 paddr_t oldpa = pmap_pte2pa(opte); 6391 6392 /* 6393 * Update the pte. 6394 */ 6395 do { 6396 opte = *ptep; 6397 6398 /* 6399 * if the same page, inherit PTE_A and PTE_D. 6400 */ 6401 if (((opte ^ npte) & (PTE_FRAME | EPT_R)) == 0) { 6402 npte |= opte & (EPT_A | EPT_D); 6403 } 6404 } while (pmap_pte_cas(ptep, opte, npte) != opte); 6405 6406 /* 6407 * Done with the PTEs: they can now be unmapped. 6408 */ 6409 kpreempt_enable(); 6410 6411 /* 6412 * Update statistics and PTP's reference count. 6413 */ 6414 pmap_ept_stats_update_bypte(pmap, npte, opte); 6415 if (ptp != NULL) { 6416 if (!have_oldpa) { 6417 ptp->wire_count++; 6418 } 6419 /* Remember minimum VA in PTP. */ 6420 pmap_ptp_range_set(ptp, va); 6421 } 6422 KASSERT(ptp == NULL || ptp->wire_count > 1); 6423 6424 /* 6425 * If the same page, we can skip pv_entry handling. 6426 */ 6427 if (((opte ^ npte) & (PTE_FRAME | EPT_R)) == 0) { 6428 KASSERT(((opte ^ npte) & EPT_PVLIST) == 0); 6429 if ((npte & EPT_PVLIST) != 0) { 6430 KASSERT(samepage); 6431 pmap_check_pv(pmap, ptp, new_pp, va, true); 6432 } 6433 goto same_pa; 6434 } else if ((npte & EPT_PVLIST) != 0) { 6435 KASSERT(!samepage); 6436 } 6437 6438 /* 6439 * If old page is pv-tracked, remove pv_entry from its list. 6440 */ 6441 if ((~opte & (EPT_R | EPT_PVLIST)) == 0) { 6442 if ((old_pg = PHYS_TO_VM_PAGE(oldpa)) != NULL) { 6443 old_pp = VM_PAGE_TO_PP(old_pg); 6444 } else if ((old_pp = pmap_pv_tracked(oldpa)) == NULL) { 6445 panic("%s: EPT_PVLIST with pv-untracked page" 6446 " va = %#"PRIxVADDR 6447 " pa = %#" PRIxPADDR " (%#" PRIxPADDR ")", 6448 __func__, va, oldpa, atop(pa)); 6449 } 6450 6451 pmap_remove_pv(pmap, old_pp, ptp, va, old_pve, 6452 pmap_ept_to_pp_attrs(opte)); 6453 } else { 6454 KASSERT(old_pve == NULL); 6455 KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == NULL); 6456 } 6457 6458 /* 6459 * If new page is dynamically PV tracked, insert to tree. 6460 */ 6461 if (new_pve != NULL) { 6462 KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == NULL); 6463 old_pve = rb_tree_insert_node(tree, new_pve); 6464 KASSERT(old_pve == new_pve); 6465 pmap_check_pv(pmap, ptp, new_pp, va, true); 6466 } 6467 6468 same_pa: 6469 /* 6470 * shootdown tlb if necessary. 6471 */ 6472 6473 if (pmap_ept_has_ad) { 6474 accessed = (~opte & (EPT_R | EPT_A)) == 0; 6475 } else { 6476 accessed = (opte & EPT_R) != 0; 6477 } 6478 if (accessed && ((opte ^ npte) & (PTE_FRAME | EPT_W)) != 0) { 6479 pmap_tlb_shootdown(pmap, va, 0, TLBSHOOT_ENTER); 6480 } 6481 pmap_drain_pv(pmap); 6482 mutex_exit(&pmap->pm_lock); 6483 return 0; 6484 } 6485 6486 /* Pay close attention, this returns L2. 
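 * On success it returns 0 and *lastpde receives the L2 entry that maps
 * the L1 PTP of 'va'; otherwise it returns the level at which the walk
 * found an invalid entry.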
*/ 6487 static int 6488 pmap_ept_pdes_invalid(struct pmap *pmap, vaddr_t va, pd_entry_t *lastpde) 6489 { 6490 pt_entry_t *pteva; 6491 paddr_t ptepa; 6492 int i, index; 6493 6494 KASSERT(mutex_owned(&pmap->pm_lock)); 6495 6496 ptepa = pmap->pm_pdirpa[0]; 6497 for (i = PTP_LEVELS; i > 1; i--) { 6498 pteva = (pt_entry_t *)PMAP_DIRECT_MAP(ptepa); 6499 index = pl_pi(va, i); 6500 if (!pmap_ept_valid_entry(pteva[index])) 6501 return i; 6502 ptepa = pmap_pte2pa(pteva[index]); 6503 } 6504 if (lastpde != NULL) { 6505 *lastpde = pteva[index]; 6506 } 6507 6508 return 0; 6509 } 6510 6511 static bool 6512 pmap_ept_extract(struct pmap *pmap, vaddr_t va, paddr_t *pap) 6513 { 6514 pt_entry_t *ptes, pte; 6515 pd_entry_t pde; 6516 paddr_t ptppa, pa; 6517 bool rv; 6518 6519 #ifdef __HAVE_DIRECT_MAP 6520 if (va >= PMAP_DIRECT_BASE && va < PMAP_DIRECT_END) { 6521 if (pap != NULL) { 6522 *pap = PMAP_DIRECT_UNMAP(va); 6523 } 6524 return true; 6525 } 6526 #endif 6527 6528 rv = false; 6529 pa = 0; 6530 6531 mutex_enter(&pmap->pm_lock); 6532 kpreempt_disable(); 6533 6534 if (!pmap_ept_pdes_invalid(pmap, va, &pde)) { 6535 ptppa = pmap_pte2pa(pde); 6536 ptes = (pt_entry_t *)PMAP_DIRECT_MAP(ptppa); 6537 pte = ptes[pl1_pi(va)]; 6538 if (__predict_true((pte & EPT_R) != 0)) { 6539 pa = pmap_pte2pa(pte) | (va & (NBPD_L1 - 1)); 6540 rv = true; 6541 } 6542 } 6543 6544 kpreempt_enable(); 6545 mutex_exit(&pmap->pm_lock); 6546 6547 if (pap != NULL) { 6548 *pap = pa; 6549 } 6550 return rv; 6551 } 6552 6553 static bool 6554 pmap_ept_remove_pte(struct pmap *pmap, struct vm_page *ptp, pt_entry_t *pte, 6555 vaddr_t va) 6556 { 6557 struct pv_entry *pve; 6558 struct vm_page *pg; 6559 struct pmap_page *pp; 6560 pt_entry_t opte; 6561 bool accessed; 6562 6563 KASSERT(pmap != pmap_kernel()); 6564 KASSERT(mutex_owned(&pmap->pm_lock)); 6565 KASSERT(kpreempt_disabled()); 6566 6567 if (!pmap_ept_valid_entry(*pte)) { 6568 /* VA not mapped. */ 6569 return false; 6570 } 6571 6572 /* Atomically save the old PTE and zap it. */ 6573 opte = pmap_pte_testset(pte, 0); 6574 if (!pmap_ept_valid_entry(opte)) { 6575 return false; 6576 } 6577 6578 pmap_ept_stats_update_bypte(pmap, 0, opte); 6579 6580 if (ptp) { 6581 /* 6582 * Dropping a PTE. Make sure that the PDE is flushed. 6583 */ 6584 ptp->wire_count--; 6585 if (ptp->wire_count <= 1) { 6586 opte |= EPT_A; 6587 } 6588 } 6589 6590 if (pmap_ept_has_ad) { 6591 accessed = (opte & EPT_A) != 0; 6592 } else { 6593 accessed = true; 6594 } 6595 if (accessed) { 6596 pmap_tlb_shootdown(pmap, va, 0, TLBSHOOT_REMOVE_PTE); 6597 } 6598 6599 /* 6600 * If we are not on a pv list - we are done. 6601 */ 6602 if ((opte & EPT_PVLIST) == 0) { 6603 KASSERTMSG((PHYS_TO_VM_PAGE(pmap_pte2pa(opte)) == NULL), 6604 "managed page without EPT_PVLIST for %#"PRIxVADDR, va); 6605 KASSERTMSG((pmap_pv_tracked(pmap_pte2pa(opte)) == NULL), 6606 "pv-tracked page without EPT_PVLIST for %#"PRIxVADDR, va); 6607 KASSERT(pmap_treelookup_pv(pmap, ptp, (ptp != NULL ? 6608 &VM_PAGE_TO_PP(ptp)->pp_rb : &pmap_kernel_rb), va) == NULL); 6609 return true; 6610 } 6611 6612 if ((pg = PHYS_TO_VM_PAGE(pmap_pte2pa(opte))) != NULL) { 6613 pp = VM_PAGE_TO_PP(pg); 6614 } else if ((pp = pmap_pv_tracked(pmap_pte2pa(opte))) == NULL) { 6615 paddr_t pa = pmap_pte2pa(opte); 6616 panic("%s: EPT_PVLIST with pv-untracked page" 6617 " va = %#"PRIxVADDR"pa = %#"PRIxPADDR" (%#"PRIxPADDR")", 6618 __func__, va, pa, atop(pa)); 6619 } 6620 6621 /* Sync R/M bits. 
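 * The accessed/dirty state carried by the old PTE is folded into the
 * page's attributes as its pv entry is removed.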
*/ 6622 pve = pmap_lookup_pv(pmap, ptp, pp, va); 6623 pmap_remove_pv(pmap, pp, ptp, va, pve, pmap_ept_to_pp_attrs(opte)); 6624 return true; 6625 } 6626 6627 static void 6628 pmap_ept_remove_ptes(struct pmap *pmap, struct vm_page *ptp, vaddr_t ptpva, 6629 vaddr_t startva, vaddr_t endva) 6630 { 6631 pt_entry_t *pte = (pt_entry_t *)ptpva; 6632 6633 KASSERT(pmap != pmap_kernel()); 6634 KASSERT(mutex_owned(&pmap->pm_lock)); 6635 KASSERT(kpreempt_disabled()); 6636 6637 /* 6638 * mappings are very often sparse, so clip the given range to the 6639 * range of PTEs that are known present in the PTP. 6640 */ 6641 pmap_ptp_range_clip(ptp, &startva, &pte); 6642 6643 /* 6644 * note that ptpva points to the PTE that maps startva. this may 6645 * or may not be the first PTE in the PTP. 6646 * 6647 * we loop through the PTP while there are still PTEs to look at 6648 * and the wire_count is greater than 1 (because we use the wire_count 6649 * to keep track of the number of real PTEs in the PTP). 6650 */ 6651 while (startva < endva && (ptp == NULL || ptp->wire_count > 1)) { 6652 (void)pmap_ept_remove_pte(pmap, ptp, pte, startva); 6653 startva += PAGE_SIZE; 6654 pte++; 6655 } 6656 } 6657 6658 static void 6659 pmap_ept_remove(struct pmap *pmap, vaddr_t sva, vaddr_t eva) 6660 { 6661 pt_entry_t *ptes; 6662 pd_entry_t pde; 6663 paddr_t ptppa; 6664 vaddr_t blkendva, va = sva; 6665 struct vm_page *ptp; 6666 6667 mutex_enter(&pmap->pm_lock); 6668 kpreempt_disable(); 6669 6670 for (/* null */ ; va < eva ; va = blkendva) { 6671 int lvl; 6672 6673 /* determine range of block */ 6674 blkendva = x86_round_pdr(va+1); 6675 if (blkendva > eva) 6676 blkendva = eva; 6677 6678 lvl = pmap_ept_pdes_invalid(pmap, va, &pde); 6679 if (lvl != 0) { 6680 /* Skip a range corresponding to an invalid pde. */ 6681 blkendva = (va & ptp_frames[lvl - 1]) + nbpd[lvl - 1]; 6682 continue; 6683 } 6684 6685 /* PA of the PTP */ 6686 ptppa = pmap_pte2pa(pde); 6687 6688 ptp = pmap_find_ptp(pmap, va, 1); 6689 KASSERTMSG(ptp != NULL, "%s: unmanaged PTP detected", 6690 __func__); 6691 6692 ptes = (pt_entry_t *)PMAP_DIRECT_MAP(ptppa); 6693 6694 pmap_ept_remove_ptes(pmap, ptp, (vaddr_t)&ptes[pl1_pi(va)], va, 6695 blkendva); 6696 6697 /* If PTP is no longer being used, free it. */ 6698 if (ptp && ptp->wire_count <= 1) { 6699 pmap_ept_free_ptp(pmap, ptp, va); 6700 } 6701 } 6702 6703 kpreempt_enable(); 6704 pmap_drain_pv(pmap); 6705 mutex_exit(&pmap->pm_lock); 6706 } 6707 6708 static int 6709 pmap_ept_sync_pv(struct vm_page *ptp, vaddr_t va, paddr_t pa, int clearbits, 6710 uint8_t *oattrs, pt_entry_t *optep) 6711 { 6712 struct pmap *pmap; 6713 pt_entry_t *ptep; 6714 pt_entry_t opte; 6715 pt_entry_t npte; 6716 pt_entry_t expect; 6717 bool need_shootdown; 6718 6719 expect = pmap_pa2pte(pa) | EPT_R; 6720 pmap = ptp_to_pmap(ptp); 6721 6722 if (clearbits != ~0) { 6723 KASSERT((clearbits & ~(PP_ATTRS_D|PP_ATTRS_A|PP_ATTRS_W)) == 0); 6724 clearbits = pmap_pp_attrs_to_ept(clearbits); 6725 } 6726 6727 ptep = pmap_map_pte(pmap, ptp, va); 6728 do { 6729 opte = *ptep; 6730 KASSERT((opte & (EPT_D | EPT_A)) != EPT_D); 6731 KASSERT((opte & (EPT_A | EPT_R)) != EPT_A); 6732 KASSERT(opte == 0 || (opte & EPT_R) != 0); 6733 if ((opte & (PTE_FRAME | EPT_R)) != expect) { 6734 /* 6735 * We lost a race with a V->P operation like 6736 * pmap_remove(). Wait for the competitor 6737 * reflecting pte bits into mp_attrs. 6738 */ 6739 pmap_unmap_pte(); 6740 return EAGAIN; 6741 } 6742 6743 /* 6744 * Check if there's anything to do on this PTE. 
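 * If none of the bits we were asked to clear are set, the mapping is
 * already in the desired state and no shootdown is required.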
6745 */ 6746 if ((opte & clearbits) == 0) { 6747 need_shootdown = false; 6748 break; 6749 } 6750 6751 /* 6752 * We need a shootdown if the PTE is cached (EPT_A) ... 6753 * ... Unless we are clearing only the EPT_W bit and 6754 * it isn't cached as RW (EPT_D). 6755 */ 6756 if (pmap_ept_has_ad) { 6757 need_shootdown = (opte & EPT_A) != 0 && 6758 !(clearbits == EPT_W && (opte & EPT_D) == 0); 6759 } else { 6760 need_shootdown = true; 6761 } 6762 6763 npte = opte & ~clearbits; 6764 6765 /* 6766 * If we need a shootdown anyway, clear EPT_A and EPT_D. 6767 */ 6768 if (need_shootdown) { 6769 npte &= ~(EPT_A | EPT_D); 6770 } 6771 KASSERT((npte & (EPT_D | EPT_A)) != EPT_D); 6772 KASSERT((npte & (EPT_A | EPT_R)) != EPT_A); 6773 KASSERT(npte == 0 || (opte & EPT_R) != 0); 6774 } while (pmap_pte_cas(ptep, opte, npte) != opte); 6775 6776 if (need_shootdown) { 6777 pmap_tlb_shootdown(pmap, va, 0, TLBSHOOT_SYNC_PV); 6778 } 6779 pmap_unmap_pte(); 6780 6781 *oattrs = pmap_ept_to_pp_attrs(opte); 6782 if (optep != NULL) 6783 *optep = opte; 6784 return 0; 6785 } 6786 6787 static void 6788 pmap_ept_pp_remove_ent(struct pmap *pmap, struct vm_page *ptp, pt_entry_t opte, 6789 vaddr_t va) 6790 { 6791 6792 KASSERT(mutex_owned(&pmap->pm_lock)); 6793 6794 pmap_ept_stats_update_bypte(pmap, 0, opte); 6795 ptp->wire_count--; 6796 if (ptp->wire_count <= 1) { 6797 pmap_ept_free_ptp(pmap, ptp, va); 6798 } 6799 } 6800 6801 static void 6802 pmap_ept_write_protect(struct pmap *pmap, vaddr_t sva, vaddr_t eva, vm_prot_t prot) 6803 { 6804 pt_entry_t bit_rem; 6805 pt_entry_t *ptes, *spte; 6806 pt_entry_t opte, npte; 6807 pd_entry_t pde; 6808 paddr_t ptppa; 6809 vaddr_t va; 6810 bool modified; 6811 6812 bit_rem = 0; 6813 if (!(prot & VM_PROT_WRITE)) 6814 bit_rem = EPT_W; 6815 6816 sva &= PTE_FRAME; 6817 eva &= PTE_FRAME; 6818 6819 /* Acquire pmap. */ 6820 mutex_enter(&pmap->pm_lock); 6821 kpreempt_disable(); 6822 6823 for (va = sva; va < eva; va += PAGE_SIZE) { 6824 if (pmap_ept_pdes_invalid(pmap, va, &pde)) { 6825 continue; 6826 } 6827 6828 ptppa = pmap_pte2pa(pde); 6829 ptes = (pt_entry_t *)PMAP_DIRECT_MAP(ptppa); 6830 spte = &ptes[pl1_pi(va)]; 6831 6832 do { 6833 opte = *spte; 6834 if (!pmap_ept_valid_entry(opte)) { 6835 goto next; 6836 } 6837 npte = (opte & ~bit_rem); 6838 } while (pmap_pte_cas(spte, opte, npte) != opte); 6839 6840 if (pmap_ept_has_ad) { 6841 modified = (opte & EPT_D) != 0; 6842 } else { 6843 modified = true; 6844 } 6845 if (modified) { 6846 vaddr_t tva = x86_ptob(spte - ptes); 6847 pmap_tlb_shootdown(pmap, tva, 0, 6848 TLBSHOOT_WRITE_PROTECT); 6849 } 6850 next:; 6851 } 6852 6853 kpreempt_enable(); 6854 mutex_exit(&pmap->pm_lock); 6855 } 6856 6857 static void 6858 pmap_ept_unwire(struct pmap *pmap, vaddr_t va) 6859 { 6860 pt_entry_t *ptes, *ptep, opte; 6861 pd_entry_t pde; 6862 paddr_t ptppa; 6863 6864 /* Acquire pmap. */ 6865 mutex_enter(&pmap->pm_lock); 6866 kpreempt_disable(); 6867 6868 if (pmap_ept_pdes_invalid(pmap, va, &pde)) { 6869 panic("%s: invalid PDE va=%#" PRIxVADDR, __func__, va); 6870 } 6871 6872 ptppa = pmap_pte2pa(pde); 6873 ptes = (pt_entry_t *)PMAP_DIRECT_MAP(ptppa); 6874 ptep = &ptes[pl1_pi(va)]; 6875 opte = *ptep; 6876 KASSERT(pmap_ept_valid_entry(opte)); 6877 6878 if (opte & EPT_WIRED) { 6879 pt_entry_t npte = opte & ~EPT_WIRED; 6880 6881 opte = pmap_pte_testset(ptep, npte); 6882 pmap_ept_stats_update_bypte(pmap, npte, opte); 6883 } else { 6884 printf("%s: wiring for pmap %p va %#" PRIxVADDR 6885 "did not change!\n", __func__, pmap, va); 6886 } 6887 6888 /* Release pmap. 
*/ 6889 kpreempt_enable(); 6890 mutex_exit(&pmap->pm_lock); 6891 } 6892 6893 /* -------------------------------------------------------------------------- */ 6894 6895 void 6896 pmap_ept_transform(struct pmap *pmap) 6897 { 6898 pmap->pm_enter = pmap_ept_enter; 6899 pmap->pm_extract = pmap_ept_extract; 6900 pmap->pm_remove = pmap_ept_remove; 6901 pmap->pm_sync_pv = pmap_ept_sync_pv; 6902 pmap->pm_pp_remove_ent = pmap_ept_pp_remove_ent; 6903 pmap->pm_write_protect = pmap_ept_write_protect; 6904 pmap->pm_unwire = pmap_ept_unwire; 6905 6906 memset(PAGE_ALIGNED(pmap->pm_pdir), 0, PAGE_SIZE); 6907 } 6908 6909 #endif /* __HAVE_DIRECT_MAP && __x86_64__ && !XENPV */ 6910
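/*
 * Usage sketch (illustrative only; the real call site lives in the NVMM
 * VMX backend, and the local names below are made up): the host creates
 * the guest physical address space as an ordinary vmspace, then converts
 * its pmap so that all later operations go through the callbacks
 * installed above.
 *
 *	struct vmspace *vms = ...;	// guest GPA space (uvmspace_alloc)
 *	struct pmap *gpmap = vms->vm_map.pmap;
 *
 *	pmap_ept_has_ad = ...;		// whether the CPU has EPT A/D bits
 *	pmap_ept_transform(gpmap);
 *
 * The memset() above wipes the top-level page, since the native kernel
 * PDEs copied in at pmap creation time have no meaning in EPT format;
 * from then on, faults on the guest vmspace build EPT-format page tables
 * rooted at pm_pdirpa[0].
 */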