1 /* $NetBSD: pmap.c,v 1.426 2023/10/04 20:28:06 ad Exp $ */ 2 3 /* 4 * Copyright (c) 2008, 2010, 2016, 2017, 2019, 2020 The NetBSD Foundation, Inc. 5 * All rights reserved. 6 * 7 * This code is derived from software contributed to The NetBSD Foundation 8 * by Andrew Doran, and by Maxime Villard. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 29 * POSSIBILITY OF SUCH DAMAGE. 30 */ 31 32 /* 33 * Copyright (c) 2007 Manuel Bouyer. 34 * 35 * Redistribution and use in source and binary forms, with or without 36 * modification, are permitted provided that the following conditions 37 * are met: 38 * 1. Redistributions of source code must retain the above copyright 39 * notice, this list of conditions and the following disclaimer. 40 * 2. Redistributions in binary form must reproduce the above copyright 41 * notice, this list of conditions and the following disclaimer in the 42 * documentation and/or other materials provided with the distribution. 43 * 44 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 45 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 46 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 47 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 48 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 49 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 50 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 51 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 52 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 53 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 54 */ 55 56 /* 57 * Copyright (c) 2006 Mathieu Ropert <mro@adviseo.fr> 58 * 59 * Permission to use, copy, modify, and distribute this software for any 60 * purpose with or without fee is hereby granted, provided that the above 61 * copyright notice and this permission notice appear in all copies. 62 * 63 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 64 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 65 * MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 66 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 67 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 68 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 69 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 70 */ 71 72 /* 73 * Copyright 2001 (c) Wasabi Systems, Inc. 74 * All rights reserved. 75 * 76 * Written by Frank van der Linden for Wasabi Systems, Inc. 77 * 78 * Redistribution and use in source and binary forms, with or without 79 * modification, are permitted provided that the following conditions 80 * are met: 81 * 1. Redistributions of source code must retain the above copyright 82 * notice, this list of conditions and the following disclaimer. 83 * 2. Redistributions in binary form must reproduce the above copyright 84 * notice, this list of conditions and the following disclaimer in the 85 * documentation and/or other materials provided with the distribution. 86 * 3. All advertising materials mentioning features or use of this software 87 * must display the following acknowledgement: 88 * This product includes software developed for the NetBSD Project by 89 * Wasabi Systems, Inc. 90 * 4. The name of Wasabi Systems, Inc. may not be used to endorse 91 * or promote products derived from this software without specific prior 92 * written permission. 93 * 94 * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND 95 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 96 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 97 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL WASABI SYSTEMS, INC 98 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 99 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 100 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 101 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 102 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 103 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 104 * POSSIBILITY OF SUCH DAMAGE. 105 */ 106 107 /* 108 * Copyright (c) 1997 Charles D. Cranor and Washington University. 109 * All rights reserved. 110 * 111 * Redistribution and use in source and binary forms, with or without 112 * modification, are permitted provided that the following conditions 113 * are met: 114 * 1. Redistributions of source code must retain the above copyright 115 * notice, this list of conditions and the following disclaimer. 116 * 2. Redistributions in binary form must reproduce the above copyright 117 * notice, this list of conditions and the following disclaimer in the 118 * documentation and/or other materials provided with the distribution. 119 * 120 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 121 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 122 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
123 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 124 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 125 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 126 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 127 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 128 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 129 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 130 */ 131 132 #include <sys/cdefs.h> 133 __KERNEL_RCSID(0, "$NetBSD: pmap.c,v 1.426 2023/10/04 20:28:06 ad Exp $"); 134 135 #include "opt_user_ldt.h" 136 #include "opt_lockdebug.h" 137 #include "opt_multiprocessor.h" 138 #include "opt_xen.h" 139 #include "opt_svs.h" 140 #include "opt_kaslr.h" 141 #include "opt_efi.h" 142 143 #define __MUTEX_PRIVATE /* for assertions */ 144 145 #include <sys/param.h> 146 #include <sys/systm.h> 147 #include <sys/proc.h> 148 #include <sys/pool.h> 149 #include <sys/kernel.h> 150 #include <sys/atomic.h> 151 #include <sys/cpu.h> 152 #include <sys/intr.h> 153 #include <sys/xcall.h> 154 #include <sys/kcore.h> 155 #include <sys/kmem.h> 156 #include <sys/asan.h> 157 #include <sys/msan.h> 158 #include <sys/entropy.h> 159 160 #include <uvm/uvm.h> 161 #include <uvm/pmap/pmap_pvt.h> 162 163 #include <dev/isa/isareg.h> 164 165 #include <machine/specialreg.h> 166 #include <machine/gdt.h> 167 #include <machine/isa_machdep.h> 168 #include <machine/cpuvar.h> 169 #include <machine/cputypes.h> 170 #include <machine/pmap_private.h> 171 172 #include <x86/bootspace.h> 173 #include <x86/pat.h> 174 #include <x86/pmap_pv.h> 175 176 #include <x86/i82489reg.h> 177 #include <x86/i82489var.h> 178 179 #ifdef XEN 180 #include <xen/include/public/xen.h> 181 #include <xen/hypervisor.h> 182 #include <xen/xenpmap.h> 183 #endif 184 185 #ifdef __HAVE_DIRECT_MAP 186 #include <crypto/nist_hash_drbg/nist_hash_drbg.h> 187 #endif 188 189 /* 190 * general info: 191 * 192 * - for an explanation of how the x86 MMU hardware works see 193 * the comments in <machine/pte.h>. 194 * 195 * - for an explanation of the general memory structure used by 196 * this pmap (including the recursive mapping), see the comments 197 * in <machine/pmap.h>. 198 * 199 * this file contains the code for the "pmap module." the module's 200 * job is to manage the hardware's virtual to physical address mappings. 201 * note that there are two levels of mapping in the VM system: 202 * 203 * [1] the upper layer of the VM system uses vm_map's and vm_map_entry's 204 * to map ranges of virtual address space to objects/files. for 205 * example, the vm_map may say: "map VA 0x1000 to 0x22000 read-only 206 * to the file /bin/ls starting at offset zero." note that 207 * the upper layer mapping is not concerned with how individual 208 * vm_pages are mapped. 209 * 210 * [2] the lower layer of the VM system (the pmap) maintains the mappings 211 * from virtual addresses. it is concerned with which vm_page is 212 * mapped where. for example, when you run /bin/ls and start 213 * at page 0x1000 the fault routine may lookup the correct page 214 * of the /bin/ls file and then ask the pmap layer to establish 215 * a mapping for it. 216 * 217 * note that information in the lower layer of the VM system can be 218 * thrown away since it can easily be reconstructed from the info 219 * in the upper layer. 
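 *
 * for example (a hedged sketch, not the literal call chain): once the
 * upper layer has located the vm_page backing a faulting address, it asks
 * the pmap layer to install the mapping and later flushes the update:
 *
 *	pmap_enter(pmap, va, VM_PAGE_TO_PHYS(pg), VM_PROT_READ, PMAP_CANFAIL);
 *	...
 *	pmap_update(pmap);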
 *
 * data structures we use include:
 *
 *  - struct pmap: describes the address space of one thread
 *  - struct pmap_page: describes one pv-tracked page, without
 *    necessarily a corresponding vm_page
 *  - struct pv_entry: describes one <PMAP,VA> mapping of a PA
 *  - pmap_page::pp_pvlist: there is one list per pv-tracked page of
 *    physical memory.  the pp_pvlist points to a list of pv_entry
 *    structures which describe all the <PMAP,VA> pairs that this
 *    page is mapped in.  this is critical for page based operations
 *    such as pmap_page_protect() [change protection on _all_ mappings
 *    of a page]
 */

/*
 * Locking
 *
 * We have the following locks that we must deal with, listed in the order
 * that they are acquired:
 *
 * pg->uobject->vmobjlock, pg->uanon->an_lock
 *
 *	For managed pages, these per-object locks are taken by the VM system
 *	before calling into the pmap module - either a read or write hold.
 *	The lock hold prevents pages from changing identity while the pmap is
 *	operating on them.  For example, the same lock is held across a call
 *	to pmap_remove() and the following call to pmap_update(), so that a
 *	page does not gain a new identity while its TLB visibility is stale.
 *
 * pmap->pm_lock
 *
 *	This lock protects the fields in the pmap structure including the
 *	non-kernel PDEs in the PDP, the PTEs, and PTPs and connected data
 *	structures.  For modifying unmanaged kernel PTEs it is not needed as
 *	kernel PDEs are never freed, and the kernel is expected to be self
 *	consistent (and the lock can't be taken for unmanaged kernel PTEs,
 *	because they can be modified from interrupt context).
 *
 * pmaps_lock
 *
 *	This lock protects the list of active pmaps (headed by "pmaps").
 *	It's acquired when adding or removing pmaps or adjusting kernel PDEs.
 *
 * pp_lock
 *
 *	This per-page lock protects PV entry lists and the embedded PV entry
 *	in each vm_page, allowing for concurrent operation on pages by
 *	different pmaps.  This is a spin mutex at IPL_VM, because at the
 *	points it is taken context switching is usually not tolerable, and
 *	spin mutexes must block out interrupts that could take kernel_lock.
 */

/* uvm_object is abused here to index pmap_pages; make assertions happy.
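 * Under DIAGNOSTIC the dummy rwlock really is taken (see PMAP_DUMMY_LOCK
 * below) so that the uvm_obj lock assertions hold; otherwise the macros
 * compile away to nothing.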
*/ 274 #ifdef DIAGNOSTIC 275 #define PMAP_DUMMY_LOCK(pm) rw_enter(&(pm)->pm_dummy_lock, RW_WRITER) 276 #define PMAP_DUMMY_UNLOCK(pm) rw_exit(&(pm)->pm_dummy_lock) 277 #else 278 #define PMAP_DUMMY_LOCK(pm) 279 #define PMAP_DUMMY_UNLOCK(pm) 280 #endif 281 282 static const struct uvm_pagerops pmap_pager = { 283 /* nothing */ 284 }; 285 286 /* 287 * pl_i(va, X) == plX_i(va) <= pl_i_roundup(va, X) 288 */ 289 #define pl_i(va, lvl) \ 290 (((VA_SIGN_POS(va)) & ptp_frames[(lvl)-1]) >> ptp_shifts[(lvl)-1]) 291 292 #define pl_i_roundup(va, lvl) pl_i((va)+ ~ptp_frames[(lvl)-1], (lvl)) 293 294 /* 295 * PTP macros: 296 * a PTP's index is the PD index of the PDE that points to it 297 * a PTP's offset is the byte-offset in the PTE space that this PTP is at 298 * a PTP's VA is the first VA mapped by that PTP 299 */ 300 301 #define ptp_va2o(va, lvl) (pl_i(va, (lvl)+1) * PAGE_SIZE) 302 303 const vaddr_t ptp_masks[] = PTP_MASK_INITIALIZER; 304 const vaddr_t ptp_frames[] = PTP_FRAME_INITIALIZER; 305 const int ptp_shifts[] = PTP_SHIFT_INITIALIZER; 306 const long nkptpmax[] = NKPTPMAX_INITIALIZER; 307 const long nbpd[] = NBPD_INITIALIZER; 308 #ifdef i386 309 pd_entry_t * const normal_pdes[] = PDES_INITIALIZER; 310 #else 311 pd_entry_t *normal_pdes[3]; 312 #endif 313 314 long nkptp[] = NKPTP_INITIALIZER; 315 316 struct pmap_head pmaps; 317 kmutex_t pmaps_lock __cacheline_aligned; 318 319 struct pcpu_area *pcpuarea __read_mostly; 320 321 static vaddr_t pmap_maxkvaddr; 322 323 /* 324 * Misc. event counters. 325 */ 326 struct evcnt pmap_iobmp_evcnt; 327 struct evcnt pmap_ldt_evcnt; 328 329 /* 330 * PAT 331 */ 332 static bool cpu_pat_enabled __read_mostly = false; 333 334 /* 335 * Global data structures 336 */ 337 338 static struct pmap kernel_pmap_store __cacheline_aligned; /* kernel's pmap */ 339 struct pmap *const kernel_pmap_ptr = &kernel_pmap_store; 340 static rb_tree_t pmap_kernel_rb __cacheline_aligned; 341 342 struct bootspace bootspace __read_mostly; 343 struct slotspace slotspace __read_mostly; 344 345 /* Set to PTE_NX if supported. */ 346 pd_entry_t pmap_pg_nx __read_mostly = 0; 347 348 /* Set to PTE_G if supported. */ 349 pd_entry_t pmap_pg_g __read_mostly = 0; 350 351 /* Set to true if large pages are supported. */ 352 int pmap_largepages __read_mostly = 0; 353 354 paddr_t lowmem_rsvd __read_mostly; 355 paddr_t avail_start __read_mostly; /* PA of first available physical page */ 356 paddr_t avail_end __read_mostly; /* PA of last available physical page */ 357 358 #ifdef XENPV 359 paddr_t pmap_pa_start; /* PA of first physical page for this domain */ 360 paddr_t pmap_pa_end; /* PA of last physical page for this domain */ 361 #endif 362 363 #define VM_PAGE_TO_PP(pg) (&(pg)->mdpage.mp_pp) 364 #define PMAP_CHECK_PP(pp) \ 365 KASSERTMSG((pp)->pp_lock.mtx_ipl._ipl == IPL_VM, "bad pmap_page %p", pp) 366 367 #define PAGE_ALIGNED(pp) \ 368 __builtin_assume_aligned((void *)(pp), PAGE_SIZE) 369 370 /* 371 * Other data structures 372 */ 373 374 static pt_entry_t protection_codes[8] __read_mostly; 375 376 static bool pmap_initialized __read_mostly = false; /* pmap_init done yet? */ 377 378 /* 379 * The following two vaddr_t's are used during system startup to keep track of 380 * how much of the kernel's VM space we have used. Once the system is started, 381 * the management of the remaining kernel VM space is turned over to the 382 * kernel_map vm_map. 
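 *
 * During bootstrap, KVA is handed out by the trivial bump allocator
 * pmap_bootstrap_valloc() further below, which is essentially:
 *
 *	va = virtual_avail;
 *	virtual_avail += npages * PAGE_SIZE;
 *	return va;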
383 */ 384 static vaddr_t virtual_avail __read_mostly; /* VA of first free KVA */ 385 static vaddr_t virtual_end __read_mostly; /* VA of last free KVA */ 386 387 #ifndef XENPV 388 /* 389 * LAPIC virtual address, and fake physical address. 390 */ 391 volatile vaddr_t local_apic_va __read_mostly; 392 paddr_t local_apic_pa __read_mostly; 393 #endif 394 395 /* 396 * pool that pmap structures are allocated from 397 */ 398 struct pool_cache pmap_cache; 399 static int pmap_ctor(void *, void *, int); 400 static void pmap_dtor(void *, void *); 401 402 /* 403 * pv_page cache 404 */ 405 static struct pool_cache pmap_pvp_cache; 406 407 #ifdef __HAVE_DIRECT_MAP 408 vaddr_t pmap_direct_base __read_mostly; 409 vaddr_t pmap_direct_end __read_mostly; 410 #endif 411 412 #ifndef __HAVE_DIRECT_MAP 413 /* 414 * Special VAs and the PTEs that map them 415 */ 416 static pt_entry_t *early_zero_pte; 417 static void pmap_vpage_cpualloc(struct cpu_info *); 418 #ifdef XENPV 419 char *early_zerop; /* also referenced from xen_locore() */ 420 #else 421 static char *early_zerop; 422 #endif 423 #endif 424 425 int pmap_enter_default(pmap_t, vaddr_t, paddr_t, vm_prot_t, u_int); 426 427 /* PDP pool and its callbacks */ 428 static struct pool pmap_pdp_pool; 429 static void pmap_pdp_init(pd_entry_t *); 430 static void pmap_pdp_fini(pd_entry_t *); 431 432 #ifdef PAE 433 /* need to allocate items of 4 pages */ 434 static void *pmap_pdp_alloc(struct pool *, int); 435 static void pmap_pdp_free(struct pool *, void *); 436 static struct pool_allocator pmap_pdp_allocator = { 437 .pa_alloc = pmap_pdp_alloc, 438 .pa_free = pmap_pdp_free, 439 .pa_pagesz = PAGE_SIZE * PDP_SIZE, 440 }; 441 #endif 442 443 extern vaddr_t idt_vaddr; 444 extern paddr_t idt_paddr; 445 extern vaddr_t gdt_vaddr; 446 extern paddr_t gdt_paddr; 447 extern vaddr_t ldt_vaddr; 448 extern paddr_t ldt_paddr; 449 450 #ifdef i386 451 /* stuff to fix the pentium f00f bug */ 452 extern vaddr_t pentium_idt_vaddr; 453 #endif 454 455 /* Array of freshly allocated PTPs, for pmap_get_ptp(). */ 456 struct pmap_ptparray { 457 struct vm_page *pg[PTP_LEVELS + 1]; 458 bool alloced[PTP_LEVELS + 1]; 459 }; 460 461 /* 462 * PV entries are allocated in page-sized chunks and cached per-pmap to 463 * avoid intense pressure on memory allocators. 
464 */ 465 466 struct pv_page { 467 LIST_HEAD(, pv_entry) pvp_pves; 468 LIST_ENTRY(pv_page) pvp_list; 469 long pvp_nfree; 470 struct pmap *pvp_pmap; 471 }; 472 473 #define PVE_PER_PVP ((PAGE_SIZE / sizeof(struct pv_entry)) - 1) 474 475 /* 476 * PV tree prototypes 477 */ 478 479 static int pmap_compare_key(void *, const void *, const void *); 480 static int pmap_compare_nodes(void *, const void *, const void *); 481 482 /* Read-black tree */ 483 static const rb_tree_ops_t pmap_rbtree_ops = { 484 .rbto_compare_nodes = pmap_compare_nodes, 485 .rbto_compare_key = pmap_compare_key, 486 .rbto_node_offset = offsetof(struct pv_entry, pve_rb), 487 .rbto_context = NULL 488 }; 489 490 /* 491 * Local prototypes 492 */ 493 494 #ifdef __HAVE_PCPU_AREA 495 static void pmap_init_pcpu(void); 496 #endif 497 #ifdef __HAVE_DIRECT_MAP 498 static void pmap_init_directmap(struct pmap *); 499 #endif 500 #if !defined(XENPV) 501 static void pmap_remap_global(void); 502 #endif 503 #ifndef XENPV 504 static void pmap_init_lapic(void); 505 static void pmap_remap_largepages(void); 506 #endif 507 508 static int pmap_get_ptp(struct pmap *, struct pmap_ptparray *, vaddr_t, int, 509 struct vm_page **); 510 static void pmap_unget_ptp(struct pmap *, struct pmap_ptparray *); 511 static void pmap_install_ptp(struct pmap *, struct pmap_ptparray *, vaddr_t, 512 pd_entry_t * const *); 513 static struct vm_page *pmap_find_ptp(struct pmap *, vaddr_t, int); 514 static void pmap_freepage(struct pmap *, struct vm_page *, int); 515 static void pmap_free_ptp(struct pmap *, struct vm_page *, vaddr_t, 516 pt_entry_t *, pd_entry_t * const *); 517 static bool pmap_remove_pte(struct pmap *, struct vm_page *, pt_entry_t *, 518 vaddr_t); 519 static void pmap_remove_ptes(struct pmap *, struct vm_page *, vaddr_t, vaddr_t, 520 vaddr_t); 521 static int pmap_pvp_ctor(void *, void *, int); 522 static void pmap_pvp_dtor(void *, void *); 523 static struct pv_entry *pmap_alloc_pv(struct pmap *); 524 static void pmap_free_pv(struct pmap *, struct pv_entry *); 525 static void pmap_drain_pv(struct pmap *); 526 527 static void pmap_alloc_level(struct pmap *, vaddr_t, long *); 528 529 static void pmap_load1(struct lwp *, struct pmap *, struct pmap *); 530 static void pmap_reactivate(struct pmap *); 531 532 long 533 pmap_resident_count(struct pmap *pmap) 534 { 535 536 return pmap->pm_stats.resident_count; 537 } 538 539 long 540 pmap_wired_count(struct pmap *pmap) 541 { 542 543 return pmap->pm_stats.wired_count; 544 } 545 546 /* 547 * p m a p h e l p e r f u n c t i o n s 548 */ 549 550 static inline void 551 pmap_stats_update(struct pmap *pmap, int resid_diff, int wired_diff) 552 { 553 554 KASSERT(cold || mutex_owned(&pmap->pm_lock)); 555 pmap->pm_stats.resident_count += resid_diff; 556 pmap->pm_stats.wired_count += wired_diff; 557 } 558 559 static inline void 560 pmap_stats_update_bypte(struct pmap *pmap, pt_entry_t npte, pt_entry_t opte) 561 { 562 int resid_diff = ((npte & PTE_P) ? 1 : 0) - ((opte & PTE_P) ? 1 : 0); 563 int wired_diff = ((npte & PTE_WIRED) ? 1 : 0) - ((opte & PTE_WIRED) ? 
1 : 0); 564 565 KASSERT((npte & (PTE_P | PTE_WIRED)) != PTE_WIRED); 566 KASSERT((opte & (PTE_P | PTE_WIRED)) != PTE_WIRED); 567 568 pmap_stats_update(pmap, resid_diff, wired_diff); 569 } 570 571 /* 572 * ptp_to_pmap: lookup pmap by ptp 573 */ 574 static inline struct pmap * 575 ptp_to_pmap(struct vm_page *ptp) 576 { 577 struct pmap *pmap; 578 579 if (ptp == NULL) { 580 return pmap_kernel(); 581 } 582 pmap = (struct pmap *)ptp->uobject; 583 KASSERT(pmap != NULL); 584 KASSERT(&pmap->pm_obj[0] == ptp->uobject); 585 return pmap; 586 } 587 588 static inline struct pv_pte * 589 pve_to_pvpte(struct pv_entry *pve) 590 { 591 592 if (pve == NULL) 593 return NULL; 594 KASSERT((void *)&pve->pve_pte == (void *)pve); 595 return &pve->pve_pte; 596 } 597 598 static inline struct pv_entry * 599 pvpte_to_pve(struct pv_pte *pvpte) 600 { 601 struct pv_entry *pve = (void *)pvpte; 602 603 KASSERT(pve_to_pvpte(pve) == pvpte); 604 return pve; 605 } 606 607 /* 608 * Return true if the pmap page has an embedded PV entry. 609 */ 610 static inline bool 611 pv_pte_embedded(struct pmap_page *pp) 612 { 613 614 KASSERT(mutex_owned(&pp->pp_lock)); 615 return (bool)((vaddr_t)pp->pp_pte.pte_ptp | pp->pp_pte.pte_va); 616 } 617 618 /* 619 * pv_pte_first, pv_pte_next: PV list iterator. 620 */ 621 static inline struct pv_pte * 622 pv_pte_first(struct pmap_page *pp) 623 { 624 625 KASSERT(mutex_owned(&pp->pp_lock)); 626 if (pv_pte_embedded(pp)) { 627 return &pp->pp_pte; 628 } 629 return pve_to_pvpte(LIST_FIRST(&pp->pp_pvlist)); 630 } 631 632 static inline struct pv_pte * 633 pv_pte_next(struct pmap_page *pp, struct pv_pte *pvpte) 634 { 635 636 KASSERT(mutex_owned(&pp->pp_lock)); 637 KASSERT(pvpte != NULL); 638 if (pvpte == &pp->pp_pte) { 639 return pve_to_pvpte(LIST_FIRST(&pp->pp_pvlist)); 640 } 641 return pve_to_pvpte(LIST_NEXT(pvpte_to_pve(pvpte), pve_list)); 642 } 643 644 static inline uint8_t 645 pmap_pte_to_pp_attrs(pt_entry_t pte) 646 { 647 uint8_t ret = 0; 648 if (pte & PTE_D) 649 ret |= PP_ATTRS_D; 650 if (pte & PTE_A) 651 ret |= PP_ATTRS_A; 652 if (pte & PTE_W) 653 ret |= PP_ATTRS_W; 654 return ret; 655 } 656 657 static inline pt_entry_t 658 pmap_pp_attrs_to_pte(uint8_t attrs) 659 { 660 pt_entry_t pte = 0; 661 if (attrs & PP_ATTRS_D) 662 pte |= PTE_D; 663 if (attrs & PP_ATTRS_A) 664 pte |= PTE_A; 665 if (attrs & PP_ATTRS_W) 666 pte |= PTE_W; 667 return pte; 668 } 669 670 /* 671 * pmap_is_curpmap: is this pmap the one currently loaded [in %cr3]? 672 * of course the kernel is always loaded 673 */ 674 bool 675 pmap_is_curpmap(struct pmap *pmap) 676 { 677 return ((pmap == pmap_kernel()) || (pmap == curcpu()->ci_pmap)); 678 } 679 680 inline void 681 pmap_reference(struct pmap *pmap) 682 { 683 684 atomic_inc_uint(&pmap->pm_obj[0].uo_refs); 685 } 686 687 /* 688 * rbtree: compare two nodes. 689 */ 690 static int 691 pmap_compare_nodes(void *context, const void *n1, const void *n2) 692 { 693 const struct pv_entry *pve1 = n1; 694 const struct pv_entry *pve2 = n2; 695 696 KASSERT(pve1->pve_pte.pte_ptp == pve2->pve_pte.pte_ptp); 697 698 if (pve1->pve_pte.pte_va < pve2->pve_pte.pte_va) { 699 return -1; 700 } 701 if (pve1->pve_pte.pte_va > pve2->pve_pte.pte_va) { 702 return 1; 703 } 704 return 0; 705 } 706 707 /* 708 * rbtree: compare a node and a key. 
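 * (the key is the mapping's virtual address, cast to an opaque pointer,
 * as used by rb_tree_find_node() lookups on the per-PTP tree)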
709 */ 710 static int 711 pmap_compare_key(void *context, const void *n, const void *k) 712 { 713 const struct pv_entry *pve = n; 714 const vaddr_t key = (vaddr_t)k; 715 716 if (pve->pve_pte.pte_va < key) { 717 return -1; 718 } 719 if (pve->pve_pte.pte_va > key) { 720 return 1; 721 } 722 return 0; 723 } 724 725 /* 726 * pmap_ptp_range_set: abuse ptp->uanon to record minimum VA of PTE 727 */ 728 static inline void 729 pmap_ptp_range_set(struct vm_page *ptp, vaddr_t va) 730 { 731 vaddr_t *min = (vaddr_t *)&ptp->uanon; 732 733 if (va < *min) { 734 *min = va; 735 } 736 } 737 738 /* 739 * pmap_ptp_range_clip: abuse ptp->uanon to clip range of PTEs to remove 740 */ 741 static inline void 742 pmap_ptp_range_clip(struct vm_page *ptp, vaddr_t *startva, pt_entry_t **pte) 743 { 744 vaddr_t sclip; 745 746 if (ptp == NULL) { 747 return; 748 } 749 750 sclip = (vaddr_t)ptp->uanon; 751 sclip = (*startva < sclip ? sclip : *startva); 752 *pte += (sclip - *startva) / PAGE_SIZE; 753 *startva = sclip; 754 } 755 756 /* 757 * pmap_map_ptes: map a pmap's PTEs into KVM and lock them in 758 * 759 * there are several pmaps involved. some or all of them might be same. 760 * 761 * - the pmap given by the first argument 762 * our caller wants to access this pmap's PTEs. 763 * 764 * - pmap_kernel() 765 * the kernel pmap. note that it only contains the kernel part 766 * of the address space which is shared by any pmap. ie. any 767 * pmap can be used instead of pmap_kernel() for our purpose. 768 * 769 * - ci->ci_pmap 770 * pmap currently loaded on the cpu. 771 * 772 * - vm_map_pmap(&curproc->p_vmspace->vm_map) 773 * current process' pmap. 774 * 775 * => caller must lock pmap first (if not the kernel pmap) 776 * => must be undone with pmap_unmap_ptes before returning 777 * => disables kernel preemption 778 */ 779 void 780 pmap_map_ptes(struct pmap *pmap, struct pmap **pmap2, pd_entry_t **ptepp, 781 pd_entry_t * const **pdeppp) 782 { 783 struct pmap *curpmap; 784 struct cpu_info *ci; 785 lwp_t *l; 786 787 kpreempt_disable(); 788 789 /* The kernel's pmap is always accessible. */ 790 if (pmap == pmap_kernel()) { 791 *pmap2 = NULL; 792 *ptepp = PTE_BASE; 793 *pdeppp = normal_pdes; 794 return; 795 } 796 797 KASSERT(mutex_owned(&pmap->pm_lock)); 798 799 l = curlwp; 800 ci = l->l_cpu; 801 curpmap = ci->ci_pmap; 802 if (pmap == curpmap) { 803 /* 804 * Already on the CPU: make it valid. This is very 805 * often the case during exit(), when we have switched 806 * to the kernel pmap in order to destroy a user pmap. 807 */ 808 if (__predict_false(ci->ci_tlbstate != TLBSTATE_VALID)) { 809 pmap_reactivate(pmap); 810 } 811 *pmap2 = NULL; 812 } else { 813 /* 814 * Toss current pmap from CPU and install new pmap, but keep 815 * a reference to the old one. Dropping the reference can 816 * can block as it needs to take locks, so defer that to 817 * pmap_unmap_ptes(). 818 */ 819 pmap_reference(pmap); 820 pmap_load1(l, pmap, curpmap); 821 *pmap2 = curpmap; 822 } 823 KASSERT(ci->ci_tlbstate == TLBSTATE_VALID); 824 #ifdef DIAGNOSTIC 825 pmap->pm_pctr = lwp_pctr(); 826 #endif 827 *ptepp = PTE_BASE; 828 829 #if defined(XENPV) && defined(__x86_64__) 830 KASSERT(ci->ci_normal_pdes[PTP_LEVELS - 2] == L4_BASE); 831 ci->ci_normal_pdes[PTP_LEVELS - 2] = pmap->pm_pdir; 832 *pdeppp = ci->ci_normal_pdes; 833 #else 834 *pdeppp = normal_pdes; 835 #endif 836 } 837 838 /* 839 * pmap_unmap_ptes: unlock the PTE mapping of "pmap" 840 * 841 * => we cannot tolerate context switches while mapped in: assert this. 842 * => reenables kernel preemption. 
843 * => does not unlock pmap. 844 */ 845 void 846 pmap_unmap_ptes(struct pmap *pmap, struct pmap * pmap2) 847 { 848 struct cpu_info *ci; 849 struct pmap *mypmap; 850 struct lwp *l; 851 852 KASSERT(kpreempt_disabled()); 853 854 /* The kernel's pmap is always accessible. */ 855 if (pmap == pmap_kernel()) { 856 kpreempt_enable(); 857 return; 858 } 859 860 l = curlwp; 861 ci = l->l_cpu; 862 863 KASSERT(mutex_owned(&pmap->pm_lock)); 864 KASSERT(pmap->pm_pctr == lwp_pctr()); 865 866 #if defined(XENPV) && defined(__x86_64__) 867 KASSERT(ci->ci_normal_pdes[PTP_LEVELS - 2] != L4_BASE); 868 ci->ci_normal_pdes[PTP_LEVELS - 2] = L4_BASE; 869 #endif 870 871 /* If not our own pmap, mark whatever's on the CPU now as lazy. */ 872 KASSERT(ci->ci_tlbstate == TLBSTATE_VALID); 873 mypmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map); 874 if (ci->ci_pmap == vm_map_pmap(&l->l_proc->p_vmspace->vm_map)) { 875 ci->ci_want_pmapload = 0; 876 } else { 877 ci->ci_want_pmapload = (mypmap != pmap_kernel()); 878 ci->ci_tlbstate = TLBSTATE_LAZY; 879 } 880 881 /* Now safe to re-enable preemption. */ 882 kpreempt_enable(); 883 884 /* Toss reference to other pmap taken earlier. */ 885 if (pmap2 != NULL) { 886 pmap_destroy(pmap2); 887 } 888 } 889 890 inline static void 891 pmap_exec_account(struct pmap *pm, vaddr_t va, pt_entry_t opte, pt_entry_t npte) 892 { 893 894 #if !defined(__x86_64__) 895 if (curproc == NULL || curproc->p_vmspace == NULL || 896 pm != vm_map_pmap(&curproc->p_vmspace->vm_map)) 897 return; 898 899 if ((opte ^ npte) & PTE_X) 900 pmap_update_pg(va); 901 902 /* 903 * Executability was removed on the last executable change. 904 * Reset the code segment to something conservative and 905 * let the trap handler deal with setting the right limit. 906 * We can't do that because of locking constraints on the vm map. 907 */ 908 909 if ((opte & PTE_X) && (npte & PTE_X) == 0 && va == pm->pm_hiexec) { 910 struct trapframe *tf = curlwp->l_md.md_regs; 911 912 tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL); 913 pm->pm_hiexec = I386_MAX_EXE_ADDR; 914 } 915 #endif /* !defined(__x86_64__) */ 916 } 917 918 #if !defined(__x86_64__) 919 /* 920 * Fixup the code segment to cover all potential executable mappings. 921 * returns 0 if no changes to the code segment were made. 922 */ 923 int 924 pmap_exec_fixup(struct vm_map *map, struct trapframe *tf, struct pcb *pcb) 925 { 926 struct vm_map_entry *ent; 927 struct pmap *pm = vm_map_pmap(map); 928 vaddr_t va = 0; 929 930 vm_map_lock_read(map); 931 for (ent = (&map->header)->next; ent != &map->header; ent = ent->next) { 932 /* 933 * This entry has greater va than the entries before. 934 * We need to make it point to the last page, not past it. 935 */ 936 if (ent->protection & VM_PROT_EXECUTE) 937 va = trunc_page(ent->end) - PAGE_SIZE; 938 } 939 vm_map_unlock_read(map); 940 if (va == pm->pm_hiexec && tf->tf_cs == GSEL(GUCODEBIG_SEL, SEL_UPL)) 941 return 0; 942 943 pm->pm_hiexec = va; 944 if (pm->pm_hiexec > I386_MAX_EXE_ADDR) { 945 tf->tf_cs = GSEL(GUCODEBIG_SEL, SEL_UPL); 946 } else { 947 tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL); 948 return 0; 949 } 950 return 1; 951 } 952 #endif /* !defined(__x86_64__) */ 953 954 void 955 pat_init(struct cpu_info *ci) 956 { 957 #ifndef XENPV 958 uint64_t pat; 959 960 if (!(ci->ci_feat_val[0] & CPUID_PAT)) 961 return; 962 963 /* We change WT to WC. Leave all other entries the default values. 
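 *
 * The resulting layout (matching the PATENTRY() initializer below) is
 *
 *	PAT0 WB   PAT1 WC   PAT2 UC-  PAT3 UC
 *	PAT4 WB   PAT5 WC   PAT6 UC-  PAT7 UC
 *
 * i.e. only entries 1 and 5 change from their power-on default of WT.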
*/ 964 pat = PATENTRY(0, PAT_WB) | PATENTRY(1, PAT_WC) | 965 PATENTRY(2, PAT_UCMINUS) | PATENTRY(3, PAT_UC) | 966 PATENTRY(4, PAT_WB) | PATENTRY(5, PAT_WC) | 967 PATENTRY(6, PAT_UCMINUS) | PATENTRY(7, PAT_UC); 968 969 wrmsr(MSR_CR_PAT, pat); 970 cpu_pat_enabled = true; 971 #endif 972 } 973 974 static pt_entry_t 975 pmap_pat_flags(u_int flags) 976 { 977 u_int cacheflags = (flags & PMAP_CACHE_MASK); 978 979 if (!cpu_pat_enabled) { 980 switch (cacheflags) { 981 case PMAP_NOCACHE: 982 case PMAP_NOCACHE_OVR: 983 /* results in PGC_UCMINUS on cpus which have 984 * the cpuid PAT but PAT "disabled" 985 */ 986 return PTE_PCD; 987 default: 988 return 0; 989 } 990 } 991 992 switch (cacheflags) { 993 case PMAP_NOCACHE: 994 return PGC_UC; 995 case PMAP_WRITE_COMBINE: 996 return PGC_WC; 997 case PMAP_WRITE_BACK: 998 return PGC_WB; 999 case PMAP_NOCACHE_OVR: 1000 return PGC_UCMINUS; 1001 } 1002 1003 return 0; 1004 } 1005 1006 /* 1007 * p m a p k e n t e r f u n c t i o n s 1008 * 1009 * functions to quickly enter/remove pages from the kernel address 1010 * space. pmap_kremove is exported to MI kernel. we make use of 1011 * the recursive PTE mappings. 1012 */ 1013 1014 /* 1015 * pmap_kenter_pa: enter a kernel mapping without R/M (pv_entry) tracking 1016 * 1017 * => no need to lock anything, assume va is already allocated 1018 * => should be faster than normal pmap enter function 1019 */ 1020 void 1021 pmap_kenter_pa(vaddr_t va, paddr_t pa, vm_prot_t prot, u_int flags) 1022 { 1023 pt_entry_t *pte, opte, npte; 1024 1025 KASSERT(!(prot & ~VM_PROT_ALL)); 1026 1027 if (va < VM_MIN_KERNEL_ADDRESS) 1028 pte = vtopte(va); 1029 else 1030 pte = kvtopte(va); 1031 #if defined(XENPV) && defined(DOM0OPS) 1032 if (pa < pmap_pa_start || pa >= pmap_pa_end) { 1033 #ifdef DEBUG 1034 printf_nolog("%s: pa %#" PRIxPADDR " for va %#" PRIxVADDR 1035 " outside range\n", __func__, pa, va); 1036 #endif /* DEBUG */ 1037 npte = pa; 1038 } else 1039 #endif /* XENPV && DOM0OPS */ 1040 npte = pmap_pa2pte(pa); 1041 npte |= protection_codes[prot] | PTE_P | pmap_pg_g; 1042 npte |= pmap_pat_flags(flags); 1043 opte = pmap_pte_testset(pte, npte); /* zap! */ 1044 1045 /* 1046 * XXX: make sure we are not dealing with a large page, since the only 1047 * large pages created are for the kernel image, and they should never 1048 * be kentered. 1049 */ 1050 KASSERTMSG(!(opte & PTE_PS), "PTE_PS va=%#"PRIxVADDR, va); 1051 1052 if ((opte & (PTE_P | PTE_A)) == (PTE_P | PTE_A)) { 1053 /* This should not happen. */ 1054 printf_nolog("%s: mapping already present\n", __func__); 1055 kpreempt_disable(); 1056 pmap_tlb_shootdown(pmap_kernel(), va, opte, TLBSHOOT_KENTER); 1057 kpreempt_enable(); 1058 } 1059 } 1060 1061 __strict_weak_alias(pmap_kenter_ma, pmap_kenter_pa); 1062 1063 #if defined(__x86_64__) 1064 /* 1065 * Change protection for a virtual address. Local for a CPU only, don't 1066 * care about TLB shootdowns. 
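 *
 * usage sketch (illustrative only; the variable names are not from any
 * particular caller):
 *
 *	kpreempt_disable();
 *	pmap_changeprot_local(va, VM_PROT_READ);	(clears PTE_W locally)
 *	... touch the page read-only ...
 *	pmap_changeprot_local(va, VM_PROT_READ | VM_PROT_WRITE);
 *	kpreempt_enable();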
1067 * 1068 * => must be called with preemption disabled 1069 */ 1070 void 1071 pmap_changeprot_local(vaddr_t va, vm_prot_t prot) 1072 { 1073 pt_entry_t *pte, opte, npte; 1074 1075 KASSERT(kpreempt_disabled()); 1076 1077 if (va < VM_MIN_KERNEL_ADDRESS) 1078 pte = vtopte(va); 1079 else 1080 pte = kvtopte(va); 1081 1082 npte = opte = *pte; 1083 1084 if ((prot & VM_PROT_WRITE) != 0) 1085 npte |= PTE_W; 1086 else 1087 npte &= ~(PTE_W|PTE_D); 1088 1089 if (opte != npte) { 1090 pmap_pte_set(pte, npte); 1091 pmap_pte_flush(); 1092 invlpg(va); 1093 } 1094 } 1095 #endif /* defined(__x86_64__) */ 1096 1097 /* 1098 * pmap_kremove: remove a kernel mapping(s) without R/M (pv_entry) tracking 1099 * 1100 * => no need to lock anything 1101 * => caller must dispose of any vm_page mapped in the va range 1102 * => note: not an inline function 1103 * => we assume the va is page aligned and the len is a multiple of PAGE_SIZE 1104 * => we assume kernel only unmaps valid addresses and thus don't bother 1105 * checking the valid bit before doing TLB flushing 1106 * => must be followed by call to pmap_update() before reuse of page 1107 */ 1108 static void 1109 pmap_kremove1(vaddr_t sva, vsize_t len, bool localonly) 1110 { 1111 pt_entry_t *pte, opte; 1112 vaddr_t va, eva; 1113 1114 eva = sva + len; 1115 1116 kpreempt_disable(); 1117 for (va = sva; va < eva; va += PAGE_SIZE) { 1118 pte = kvtopte(va); 1119 opte = pmap_pte_testset(pte, 0); /* zap! */ 1120 if ((opte & (PTE_P | PTE_A)) == (PTE_P | PTE_A) && !localonly) { 1121 pmap_tlb_shootdown(pmap_kernel(), va, opte, 1122 TLBSHOOT_KREMOVE); 1123 } 1124 KASSERTMSG((opte & PTE_PS) == 0, 1125 "va %#" PRIxVADDR " is a large page", va); 1126 KASSERTMSG((opte & PTE_PVLIST) == 0, 1127 "va %#" PRIxVADDR " is a pv tracked page", va); 1128 } 1129 if (localonly) { 1130 tlbflushg(); 1131 } 1132 kpreempt_enable(); 1133 } 1134 1135 void 1136 pmap_kremove(vaddr_t sva, vsize_t len) 1137 { 1138 1139 pmap_kremove1(sva, len, false); 1140 } 1141 1142 /* 1143 * pmap_kremove_local: like pmap_kremove(), but only worry about 1144 * TLB invalidations on the current CPU. this is only intended 1145 * for use while writing kernel crash dumps, either after panic 1146 * or via reboot -d. 1147 */ 1148 void 1149 pmap_kremove_local(vaddr_t sva, vsize_t len) 1150 { 1151 1152 pmap_kremove1(sva, len, true); 1153 } 1154 1155 /* 1156 * p m a p i n i t f u n c t i o n s 1157 * 1158 * pmap_bootstrap and pmap_init are called during system startup 1159 * to init the pmap module. pmap_bootstrap() does a low level 1160 * init just to get things rolling. pmap_init() finishes the job. 1161 */ 1162 1163 /* 1164 * pmap_bootstrap_valloc: allocate a virtual address in the bootstrap area. 1165 * This function is to be used before any VM system has been set up. 1166 * 1167 * The va is taken from virtual_avail. 1168 */ 1169 static vaddr_t 1170 pmap_bootstrap_valloc(size_t npages) 1171 { 1172 vaddr_t va = virtual_avail; 1173 virtual_avail += npages * PAGE_SIZE; 1174 return va; 1175 } 1176 1177 /* 1178 * pmap_bootstrap_palloc: allocate a physical address in the bootstrap area. 1179 * This function is to be used before any VM system has been set up. 1180 * 1181 * The pa is taken from avail_start. 1182 */ 1183 static paddr_t 1184 pmap_bootstrap_palloc(size_t npages) 1185 { 1186 paddr_t pa = avail_start; 1187 avail_start += npages * PAGE_SIZE; 1188 return pa; 1189 } 1190 1191 /* 1192 * pmap_bootstrap: get the system in a state where it can run with VM properly 1193 * enabled (called before main()). 
The VM system is fully init'd later. 1194 * 1195 * => on i386, locore.S has already enabled the MMU by allocating a PDP for the 1196 * kernel, and nkpde PTP's for the kernel. 1197 * => kva_start is the first free virtual address in kernel space. 1198 */ 1199 void 1200 pmap_bootstrap(vaddr_t kva_start) 1201 { 1202 struct pmap *kpm; 1203 int i; 1204 vaddr_t kva; 1205 1206 pmap_pg_nx = (cpu_feature[2] & CPUID_NOX ? PTE_NX : 0); 1207 1208 /* 1209 * Set up our local static global vars that keep track of the usage of 1210 * KVM before kernel_map is set up. 1211 */ 1212 virtual_avail = kva_start; /* first free KVA */ 1213 virtual_end = VM_MAX_KERNEL_ADDRESS; /* last KVA */ 1214 1215 /* 1216 * Set up protection_codes: we need to be able to convert from a MI 1217 * protection code (some combo of VM_PROT...) to something we can jam 1218 * into a x86 PTE. 1219 */ 1220 protection_codes[VM_PROT_NONE] = pmap_pg_nx; 1221 protection_codes[VM_PROT_EXECUTE] = PTE_X; 1222 protection_codes[VM_PROT_READ] = pmap_pg_nx; 1223 protection_codes[VM_PROT_READ|VM_PROT_EXECUTE] = PTE_X; 1224 protection_codes[VM_PROT_WRITE] = PTE_W | pmap_pg_nx; 1225 protection_codes[VM_PROT_WRITE|VM_PROT_EXECUTE] = PTE_W | PTE_X; 1226 protection_codes[VM_PROT_WRITE|VM_PROT_READ] = PTE_W | pmap_pg_nx; 1227 protection_codes[VM_PROT_ALL] = PTE_W | PTE_X; 1228 1229 /* 1230 * Now we init the kernel's pmap. 1231 * 1232 * The kernel pmap's pm_obj is not used for much. However, in user pmaps 1233 * the pm_obj contains the list of active PTPs. 1234 */ 1235 kpm = pmap_kernel(); 1236 mutex_init(&kpm->pm_lock, MUTEX_DEFAULT, IPL_NONE); 1237 rw_init(&kpm->pm_dummy_lock); 1238 for (i = 0; i < PTP_LEVELS - 1; i++) { 1239 uvm_obj_init(&kpm->pm_obj[i], &pmap_pager, false, 1); 1240 uvm_obj_setlock(&kpm->pm_obj[i], &kpm->pm_dummy_lock); 1241 kpm->pm_ptphint[i] = NULL; 1242 } 1243 memset(&kpm->pm_list, 0, sizeof(kpm->pm_list)); /* pm_list not used */ 1244 1245 kpm->pm_pdir = (pd_entry_t *)bootspace.pdir; 1246 for (i = 0; i < PDP_SIZE; i++) 1247 kpm->pm_pdirpa[i] = PDPpaddr + PAGE_SIZE * i; 1248 1249 kpm->pm_stats.wired_count = kpm->pm_stats.resident_count = 1250 x86_btop(kva_start - VM_MIN_KERNEL_ADDRESS); 1251 1252 kcpuset_create(&kpm->pm_cpus, true); 1253 kcpuset_create(&kpm->pm_kernel_cpus, true); 1254 1255 kpm->pm_ldt = NULL; 1256 kpm->pm_ldt_sel = GSYSSEL(GLDT_SEL, SEL_KPL); 1257 1258 /* 1259 * the above is just a rough estimate and not critical to the proper 1260 * operation of the system. 1261 */ 1262 1263 #if !defined(XENPV) 1264 /* 1265 * Begin to enable global TLB entries if they are supported: add PTE_G 1266 * attribute to already mapped kernel pages. Do that only if SVS is 1267 * disabled. 1268 * 1269 * The G bit has no effect until the CR4_PGE bit is set in CR4, which 1270 * happens later in cpu_init(). 1271 */ 1272 #ifdef SVS 1273 if (!svs_enabled && (cpu_feature[0] & CPUID_PGE)) { 1274 #else 1275 if (cpu_feature[0] & CPUID_PGE) { 1276 #endif 1277 pmap_pg_g = PTE_G; 1278 pmap_remap_global(); 1279 } 1280 #endif 1281 1282 #ifndef XENPV 1283 /* 1284 * Enable large pages if they are supported. 1285 */ 1286 if (cpu_feature[0] & CPUID_PSE) { 1287 lcr4(rcr4() | CR4_PSE); /* enable hardware (via %cr4) */ 1288 pmap_largepages = 1; /* enable software */ 1289 1290 /* 1291 * The TLB must be flushed after enabling large pages on Pentium 1292 * CPUs, according to section 3.6.2.2 of "Intel Architecture 1293 * Software Developer's Manual, Volume 3: System Programming". 1294 */ 1295 tlbflushg(); 1296 1297 /* Remap the kernel. 
*/ 1298 pmap_remap_largepages(); 1299 } 1300 pmap_init_lapic(); 1301 #endif /* !XENPV */ 1302 1303 #ifdef __HAVE_PCPU_AREA 1304 pmap_init_pcpu(); 1305 #endif 1306 1307 #ifdef __HAVE_DIRECT_MAP 1308 pmap_init_directmap(kpm); 1309 #else 1310 pmap_vpage_cpualloc(&cpu_info_primary); 1311 1312 if (VM_MIN_KERNEL_ADDRESS == KERNBASE) { /* i386 */ 1313 early_zerop = (void *)cpu_info_primary.vpage[VPAGE_ZER]; 1314 early_zero_pte = cpu_info_primary.vpage_pte[VPAGE_ZER]; 1315 } else { /* amd64 */ 1316 /* 1317 * zero_pte is stuck at the end of mapped space for the kernel 1318 * image (disjunct from kva space). This is done so that it 1319 * can safely be used in pmap_growkernel (pmap_get_physpage), 1320 * when it's called for the first time. 1321 * XXXfvdl fix this for MULTIPROCESSOR later. 1322 */ 1323 #ifdef XENPV 1324 /* early_zerop initialized in xen_locore() */ 1325 #else 1326 early_zerop = (void *)bootspace.spareva; 1327 #endif 1328 early_zero_pte = PTE_BASE + pl1_i((vaddr_t)early_zerop); 1329 } 1330 #endif 1331 1332 #if defined(XENPV) && defined(__x86_64__) 1333 extern vaddr_t xen_dummy_page; 1334 paddr_t xen_dummy_user_pgd; 1335 1336 /* 1337 * We want a dummy page directory for Xen: when deactivating a pmap, 1338 * Xen will still consider it active. So we set user PGD to this one 1339 * to lift all protection on the now inactive page tables set. 1340 */ 1341 xen_dummy_user_pgd = xen_dummy_page - KERNBASE; 1342 1343 /* Zero fill it, the less checks in Xen it requires the better */ 1344 memset(PAGE_ALIGNED(xen_dummy_user_pgd + KERNBASE), 0, PAGE_SIZE); 1345 /* Mark read-only */ 1346 HYPERVISOR_update_va_mapping(xen_dummy_user_pgd + KERNBASE, 1347 pmap_pa2pte(xen_dummy_user_pgd) | PTE_P | pmap_pg_nx, 1348 UVMF_INVLPG); 1349 /* Pin as L4 */ 1350 xpq_queue_pin_l4_table(xpmap_ptom_masked(xen_dummy_user_pgd)); 1351 #endif 1352 1353 /* 1354 * Allocate space for the Interrupt Descriptor Table (IDT), 1355 * Global Descriptor Table (GDT), and Local Descriptor Table 1356 * (LDT). 1357 * 1358 * Currently there is an initial temporary GDT allocated on the 1359 * stack by the caller of init386/init_x86_64, which is (among 1360 * other things) needed on i386 for %fs-relative addressing for 1361 * CPU-local data (CPUVAR(...), curcpu(), curlwp). This 1362 * initial temporary GDT will be popped off the stack before we 1363 * can enter main, so we need to make sure there is space for a 1364 * second temporary GDT to continue existing when we enter main 1365 * before we allocate space for the permanent GDT with 1366 * uvm_km(9) in gdt_init via cpu_startup and switch to that. 
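 *
 * Note that the VA/PA pairs reserved below come from the matching
 * pmap_bootstrap_valloc()/pmap_bootstrap_palloc() bump allocators and are
 * expected to be wired together later by the MD descriptor table setup.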
1367 */ 1368 idt_vaddr = pmap_bootstrap_valloc(1); 1369 idt_paddr = pmap_bootstrap_palloc(1); 1370 1371 gdt_vaddr = pmap_bootstrap_valloc(1); 1372 gdt_paddr = pmap_bootstrap_palloc(1); 1373 1374 #ifdef __HAVE_PCPU_AREA 1375 ldt_vaddr = (vaddr_t)&pcpuarea->ldt; 1376 #else 1377 ldt_vaddr = pmap_bootstrap_valloc(1); 1378 #endif 1379 ldt_paddr = pmap_bootstrap_palloc(1); 1380 1381 #if !defined(__x86_64__) 1382 /* pentium f00f bug stuff */ 1383 pentium_idt_vaddr = pmap_bootstrap_valloc(1); 1384 #endif 1385 1386 #if defined(XENPVHVM) 1387 /* XXX: move to hypervisor.c with appropriate API adjustments */ 1388 extern paddr_t HYPERVISOR_shared_info_pa; 1389 extern volatile struct xencons_interface *xencons_interface; /* XXX */ 1390 extern struct xenstore_domain_interface *xenstore_interface; /* XXX */ 1391 1392 if (vm_guest != VM_GUEST_XENPVH) { 1393 HYPERVISOR_shared_info = (void *) pmap_bootstrap_valloc(1); 1394 HYPERVISOR_shared_info_pa = pmap_bootstrap_palloc(1); 1395 } 1396 xencons_interface = (void *) pmap_bootstrap_valloc(1); 1397 xenstore_interface = (void *) pmap_bootstrap_valloc(1); 1398 #endif 1399 /* 1400 * Now we reserve some VM for mapping pages when doing a crash dump. 1401 */ 1402 virtual_avail = reserve_dumppages(virtual_avail); 1403 1404 /* 1405 * Init the global lock and global list. 1406 */ 1407 mutex_init(&pmaps_lock, MUTEX_DEFAULT, IPL_NONE); 1408 LIST_INIT(&pmaps); 1409 1410 /* 1411 * Ensure the TLB is sync'd with reality by flushing it... 1412 */ 1413 tlbflushg(); 1414 1415 /* 1416 * Calculate pmap_maxkvaddr from nkptp[]. 1417 */ 1418 kva = VM_MIN_KERNEL_ADDRESS; 1419 for (i = PTP_LEVELS - 1; i >= 1; i--) { 1420 kva += nkptp[i] * nbpd[i]; 1421 } 1422 pmap_maxkvaddr = kva; 1423 } 1424 1425 #ifndef XENPV 1426 static void 1427 pmap_init_lapic(void) 1428 { 1429 /* 1430 * On CPUs that have no LAPIC, local_apic_va is never kentered. But our 1431 * x86 implementation relies a lot on this address to be valid; so just 1432 * allocate a fake physical page that will be kentered into 1433 * local_apic_va by machdep. 1434 * 1435 * If the LAPIC is present, the va will be remapped somewhere else 1436 * later in lapic_map. 1437 */ 1438 local_apic_va = pmap_bootstrap_valloc(1); 1439 local_apic_pa = pmap_bootstrap_palloc(1); 1440 } 1441 #endif 1442 1443 #ifdef __x86_64__ 1444 static size_t 1445 pmap_pagetree_nentries_range(vaddr_t startva, vaddr_t endva, size_t pgsz) 1446 { 1447 size_t npages; 1448 npages = (roundup(endva, pgsz) / pgsz) - 1449 (rounddown(startva, pgsz) / pgsz); 1450 return npages; 1451 } 1452 #endif 1453 1454 #if defined(__HAVE_DIRECT_MAP) || defined(KASAN) || defined(KMSAN) 1455 static inline void 1456 slotspace_copy(int type, pd_entry_t *dst, pd_entry_t *src) 1457 { 1458 size_t sslot = slotspace.area[type].sslot; 1459 size_t nslot = slotspace.area[type].nslot; 1460 1461 memcpy(&dst[sslot], &src[sslot], nslot * sizeof(pd_entry_t)); 1462 } 1463 #endif 1464 1465 #ifdef __x86_64__ 1466 /* 1467 * Randomize the location of an area. We count the holes in the VM space. We 1468 * randomly select one hole, and then randomly select an area within that hole. 1469 * Finally we update the associated entry in the slotspace structure. 
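 *
 * Example (taken from the direct-map setup further below):
 *
 *	randomize_hole(&randhole, &randva);
 *	startva = slotspace_rand(SLAREA_DMAP, lastpa, NBPD_L2,
 *	    randhole, randva);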
1470 */ 1471 vaddr_t 1472 slotspace_rand(int type, size_t sz, size_t align, size_t randhole, 1473 vaddr_t randva) 1474 { 1475 struct { 1476 int start; 1477 int end; 1478 } holes[SLSPACE_NAREAS+1]; 1479 size_t i, nholes, hole; 1480 size_t startsl, endsl, nslots, winsize; 1481 vaddr_t startva, va; 1482 1483 sz = roundup(sz, align); 1484 1485 /* 1486 * Take one more slot with +NBPD_L4, because we may end up choosing 1487 * an area that crosses slots: 1488 * +------+------+------+ 1489 * | Slot | Slot | Slot | 1490 * +------+------+------+ 1491 * [Chosen Area] 1492 * And in that case we must take into account the additional slot 1493 * consumed. 1494 */ 1495 nslots = roundup(sz+NBPD_L4, NBPD_L4) / NBPD_L4; 1496 1497 /* Get the holes. */ 1498 nholes = 0; 1499 size_t curslot = 0 + 256; /* end of SLAREA_USER */ 1500 while (1) { 1501 /* 1502 * Find the first occupied slot after the current one. 1503 * The area between the two is a hole. 1504 */ 1505 size_t minsslot = 512; 1506 size_t minnslot = 0; 1507 for (i = 0; i < SLSPACE_NAREAS; i++) { 1508 if (!slotspace.area[i].active) 1509 continue; 1510 if (slotspace.area[i].sslot >= curslot && 1511 slotspace.area[i].sslot < minsslot) { 1512 minsslot = slotspace.area[i].sslot; 1513 minnslot = slotspace.area[i].nslot; 1514 } 1515 } 1516 1517 /* No hole anymore, stop here. */ 1518 if (minsslot == 512) { 1519 break; 1520 } 1521 1522 /* Register the hole. */ 1523 if (minsslot - curslot >= nslots) { 1524 holes[nholes].start = curslot; 1525 holes[nholes].end = minsslot; 1526 nholes++; 1527 } 1528 1529 /* Skip that hole, and iterate again. */ 1530 curslot = minsslot + minnslot; 1531 } 1532 1533 if (nholes == 0) { 1534 panic("%s: impossible", __func__); 1535 } 1536 1537 /* Select a hole. */ 1538 hole = randhole; 1539 #ifdef NO_X86_ASLR 1540 hole = 0; 1541 #endif 1542 hole %= nholes; 1543 startsl = holes[hole].start; 1544 endsl = holes[hole].end; 1545 startva = VA_SIGN_NEG(startsl * NBPD_L4); 1546 1547 /* Select an area within the hole. */ 1548 va = randva; 1549 #ifdef NO_X86_ASLR 1550 va = 0; 1551 #endif 1552 winsize = ((endsl - startsl) * NBPD_L4) - sz; 1553 va %= winsize; 1554 va = rounddown(va, align); 1555 va += startva; 1556 1557 /* Update the entry. */ 1558 slotspace.area[type].sslot = pl4_i(va); 1559 slotspace.area[type].nslot = 1560 pmap_pagetree_nentries_range(va, va+sz, NBPD_L4); 1561 slotspace.area[type].active = true; 1562 1563 return va; 1564 } 1565 #endif 1566 1567 #ifdef __HAVE_PCPU_AREA 1568 static void 1569 pmap_init_pcpu(void) 1570 { 1571 const vaddr_t startva = PMAP_PCPU_BASE; 1572 size_t nL4e, nL3e, nL2e, nL1e; 1573 size_t L4e_idx, L3e_idx, L2e_idx, L1e_idx __diagused; 1574 paddr_t pa; 1575 vaddr_t endva; 1576 vaddr_t tmpva; 1577 pt_entry_t *pte; 1578 size_t size; 1579 int i; 1580 1581 const pd_entry_t pteflags = PTE_P | PTE_W | pmap_pg_nx; 1582 1583 size = sizeof(struct pcpu_area); 1584 1585 endva = startva + size; 1586 1587 /* We will use this temporary va. 
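 * Each page-table page allocated below is mapped at tmpva just long
 * enough to be zeroed; the temporary PTE is cleared again once the
 * tree has been built.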
*/ 1588 tmpva = bootspace.spareva; 1589 pte = PTE_BASE + pl1_i(tmpva); 1590 1591 /* Build L4 */ 1592 L4e_idx = pl4_i(startva); 1593 nL4e = pmap_pagetree_nentries_range(startva, endva, NBPD_L4); 1594 KASSERT(nL4e == 1); 1595 for (i = 0; i < nL4e; i++) { 1596 KASSERT(L4_BASE[L4e_idx+i] == 0); 1597 1598 pa = pmap_bootstrap_palloc(1); 1599 *pte = (pa & PTE_FRAME) | pteflags; 1600 pmap_update_pg(tmpva); 1601 memset(PAGE_ALIGNED(tmpva), 0, PAGE_SIZE); 1602 1603 L4_BASE[L4e_idx+i] = pa | pteflags | PTE_A; 1604 } 1605 1606 /* Build L3 */ 1607 L3e_idx = pl3_i(startva); 1608 nL3e = pmap_pagetree_nentries_range(startva, endva, NBPD_L3); 1609 for (i = 0; i < nL3e; i++) { 1610 KASSERT(L3_BASE[L3e_idx+i] == 0); 1611 1612 pa = pmap_bootstrap_palloc(1); 1613 *pte = (pa & PTE_FRAME) | pteflags; 1614 pmap_update_pg(tmpva); 1615 memset(PAGE_ALIGNED(tmpva), 0, PAGE_SIZE); 1616 1617 L3_BASE[L3e_idx+i] = pa | pteflags | PTE_A; 1618 } 1619 1620 /* Build L2 */ 1621 L2e_idx = pl2_i(startva); 1622 nL2e = pmap_pagetree_nentries_range(startva, endva, NBPD_L2); 1623 for (i = 0; i < nL2e; i++) { 1624 1625 KASSERT(L2_BASE[L2e_idx+i] == 0); 1626 1627 pa = pmap_bootstrap_palloc(1); 1628 *pte = (pa & PTE_FRAME) | pteflags; 1629 pmap_update_pg(tmpva); 1630 memset(PAGE_ALIGNED(tmpva), 0, PAGE_SIZE); 1631 1632 L2_BASE[L2e_idx+i] = pa | pteflags | PTE_A; 1633 } 1634 1635 /* Build L1 */ 1636 L1e_idx = pl1_i(startva); 1637 nL1e = pmap_pagetree_nentries_range(startva, endva, NBPD_L1); 1638 for (i = 0; i < nL1e; i++) { 1639 /* 1640 * Nothing to do, the PTEs will be entered via 1641 * pmap_kenter_pa. 1642 */ 1643 KASSERT(L1_BASE[L1e_idx+i] == 0); 1644 } 1645 1646 *pte = 0; 1647 pmap_update_pg(tmpva); 1648 1649 pcpuarea = (struct pcpu_area *)startva; 1650 1651 tlbflush(); 1652 } 1653 #endif 1654 1655 #ifdef __HAVE_DIRECT_MAP 1656 static void 1657 randomize_hole(size_t *randholep, vaddr_t *randvap) 1658 { 1659 struct nist_hash_drbg drbg; 1660 uint8_t seed[NIST_HASH_DRBG_SEEDLEN_BYTES]; 1661 const char p[] = "x86/directmap"; 1662 int error; 1663 1664 entropy_extract(seed, sizeof(seed), 0); 1665 1666 error = nist_hash_drbg_instantiate(&drbg, seed, sizeof(seed), 1667 /*nonce*/NULL, 0, 1668 /*personalization*/p, strlen(p)); 1669 KASSERTMSG(error == 0, "error=%d", error); 1670 1671 error = nist_hash_drbg_generate(&drbg, randholep, sizeof(*randholep), 1672 /*additional*/NULL, 0); 1673 KASSERTMSG(error == 0, "error=%d", error); 1674 1675 error = nist_hash_drbg_generate(&drbg, randvap, sizeof(*randvap), 1676 /*additional*/NULL, 0); 1677 KASSERTMSG(error == 0, "error=%d", error); 1678 1679 explicit_memset(seed, 0, sizeof(seed)); 1680 explicit_memset(&drbg, 0, sizeof(drbg)); 1681 } 1682 1683 /* 1684 * Create the amd64 direct map. Called only once at boot time. We map all of 1685 * the physical memory contiguously using 2MB large pages, with RW permissions. 1686 * However there is a hole: the kernel is mapped with RO permissions. 
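 *
 * A hedged note: once the direct map is established, a physical address
 * pa inside the mapped range is reachable at pmap_direct_base + pa; the
 * MD headers wrap that computation for callers.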
1687 */ 1688 static void 1689 pmap_init_directmap(struct pmap *kpm) 1690 { 1691 extern phys_ram_seg_t mem_clusters[]; 1692 extern int mem_cluster_cnt; 1693 1694 vaddr_t startva; 1695 size_t nL4e, nL3e, nL2e; 1696 size_t L4e_idx, L3e_idx, L2e_idx; 1697 size_t spahole, epahole; 1698 paddr_t lastpa, pa; 1699 vaddr_t endva; 1700 vaddr_t tmpva; 1701 pt_entry_t *pte; 1702 phys_ram_seg_t *mc; 1703 int i; 1704 size_t randhole; 1705 vaddr_t randva; 1706 1707 const pd_entry_t pteflags = PTE_P | PTE_W | pmap_pg_nx; 1708 const pd_entry_t holepteflags = PTE_P | pmap_pg_nx; 1709 1710 CTASSERT(NL4_SLOT_DIRECT * NBPD_L4 == MAXPHYSMEM); 1711 1712 spahole = roundup(bootspace.head.pa, NBPD_L2); 1713 epahole = rounddown(bootspace.boot.pa, NBPD_L2); 1714 1715 /* Get the last physical address available */ 1716 lastpa = 0; 1717 for (i = 0; i < mem_cluster_cnt; i++) { 1718 mc = &mem_clusters[i]; 1719 lastpa = MAX(lastpa, mc->start + mc->size); 1720 } 1721 1722 /* 1723 * x86_add_cluster should have truncated the memory to MAXPHYSMEM. 1724 */ 1725 if (lastpa > MAXPHYSMEM) { 1726 panic("pmap_init_directmap: lastpa incorrect"); 1727 } 1728 1729 randomize_hole(&randhole, &randva); 1730 startva = slotspace_rand(SLAREA_DMAP, lastpa, NBPD_L2, 1731 randhole, randva); 1732 endva = startva + lastpa; 1733 1734 /* We will use this temporary va. */ 1735 tmpva = bootspace.spareva; 1736 pte = PTE_BASE + pl1_i(tmpva); 1737 1738 /* Build L4 */ 1739 L4e_idx = pl4_i(startva); 1740 nL4e = pmap_pagetree_nentries_range(startva, endva, NBPD_L4); 1741 KASSERT(nL4e <= NL4_SLOT_DIRECT); 1742 for (i = 0; i < nL4e; i++) { 1743 KASSERT(L4_BASE[L4e_idx+i] == 0); 1744 1745 pa = pmap_bootstrap_palloc(1); 1746 *pte = (pa & PTE_FRAME) | pteflags; 1747 pmap_update_pg(tmpva); 1748 memset(PAGE_ALIGNED(tmpva), 0, PAGE_SIZE); 1749 1750 L4_BASE[L4e_idx+i] = pa | pteflags | PTE_A; 1751 } 1752 1753 /* Build L3 */ 1754 L3e_idx = pl3_i(startva); 1755 nL3e = pmap_pagetree_nentries_range(startva, endva, NBPD_L3); 1756 for (i = 0; i < nL3e; i++) { 1757 KASSERT(L3_BASE[L3e_idx+i] == 0); 1758 1759 pa = pmap_bootstrap_palloc(1); 1760 *pte = (pa & PTE_FRAME) | pteflags; 1761 pmap_update_pg(tmpva); 1762 memset(PAGE_ALIGNED(tmpva), 0, PAGE_SIZE); 1763 1764 L3_BASE[L3e_idx+i] = pa | pteflags | PTE_A; 1765 } 1766 1767 /* Build L2 */ 1768 L2e_idx = pl2_i(startva); 1769 nL2e = pmap_pagetree_nentries_range(startva, endva, NBPD_L2); 1770 for (i = 0; i < nL2e; i++) { 1771 KASSERT(L2_BASE[L2e_idx+i] == 0); 1772 1773 pa = (paddr_t)(i * NBPD_L2); 1774 1775 if (spahole <= pa && pa < epahole) { 1776 L2_BASE[L2e_idx+i] = pa | holepteflags | PTE_A | 1777 PTE_PS | pmap_pg_g; 1778 } else { 1779 L2_BASE[L2e_idx+i] = pa | pteflags | PTE_A | 1780 PTE_PS | pmap_pg_g; 1781 } 1782 } 1783 1784 *pte = 0; 1785 pmap_update_pg(tmpva); 1786 1787 pmap_direct_base = startva; 1788 pmap_direct_end = endva; 1789 1790 tlbflush(); 1791 } 1792 #endif /* __HAVE_DIRECT_MAP */ 1793 1794 #if !defined(XENPV) 1795 /* 1796 * Remap all of the virtual pages created so far with the PTE_G bit. 
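 * Global mappings survive an ordinary %cr3 reload, which is why
 * pmap_bootstrap() only enables PTE_G when SVS is disabled.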
1797 */ 1798 static void 1799 pmap_remap_global(void) 1800 { 1801 vaddr_t kva, kva_end; 1802 unsigned long p1i; 1803 size_t i; 1804 1805 /* head */ 1806 kva = bootspace.head.va; 1807 kva_end = kva + bootspace.head.sz; 1808 for ( ; kva < kva_end; kva += PAGE_SIZE) { 1809 p1i = pl1_i(kva); 1810 if (pmap_valid_entry(PTE_BASE[p1i])) 1811 PTE_BASE[p1i] |= pmap_pg_g; 1812 } 1813 1814 /* kernel segments */ 1815 for (i = 0; i < BTSPACE_NSEGS; i++) { 1816 if (bootspace.segs[i].type == BTSEG_NONE) { 1817 continue; 1818 } 1819 kva = bootspace.segs[i].va; 1820 kva_end = kva + bootspace.segs[i].sz; 1821 for ( ; kva < kva_end; kva += PAGE_SIZE) { 1822 p1i = pl1_i(kva); 1823 if (pmap_valid_entry(PTE_BASE[p1i])) 1824 PTE_BASE[p1i] |= pmap_pg_g; 1825 } 1826 } 1827 1828 /* boot space */ 1829 kva = bootspace.boot.va; 1830 kva_end = kva + bootspace.boot.sz; 1831 for ( ; kva < kva_end; kva += PAGE_SIZE) { 1832 p1i = pl1_i(kva); 1833 if (pmap_valid_entry(PTE_BASE[p1i])) 1834 PTE_BASE[p1i] |= pmap_pg_g; 1835 } 1836 } 1837 #endif 1838 1839 #ifndef XENPV 1840 /* 1841 * Remap several kernel segments with large pages. We cover as many pages as we 1842 * can. Called only once at boot time, if the CPU supports large pages. 1843 */ 1844 static void 1845 pmap_remap_largepages(void) 1846 { 1847 pd_entry_t *pde; 1848 vaddr_t kva, kva_end; 1849 paddr_t pa; 1850 size_t i; 1851 1852 /* Remap the kernel text using large pages. */ 1853 for (i = 0; i < BTSPACE_NSEGS; i++) { 1854 if (bootspace.segs[i].type != BTSEG_TEXT) { 1855 continue; 1856 } 1857 kva = roundup(bootspace.segs[i].va, NBPD_L2); 1858 if (kva < bootspace.segs[i].va) { 1859 continue; 1860 } 1861 kva_end = rounddown(bootspace.segs[i].va + 1862 bootspace.segs[i].sz, NBPD_L2); 1863 pa = roundup(bootspace.segs[i].pa, NBPD_L2); 1864 for (/* */; kva < kva_end; kva += NBPD_L2, pa += NBPD_L2) { 1865 pde = &L2_BASE[pl2_i(kva)]; 1866 *pde = pa | pmap_pg_g | PTE_PS | PTE_P; 1867 tlbflushg(); 1868 } 1869 } 1870 1871 /* Remap the kernel rodata using large pages. */ 1872 for (i = 0; i < BTSPACE_NSEGS; i++) { 1873 if (bootspace.segs[i].type != BTSEG_RODATA) { 1874 continue; 1875 } 1876 kva = roundup(bootspace.segs[i].va, NBPD_L2); 1877 if (kva < bootspace.segs[i].va) { 1878 continue; 1879 } 1880 kva_end = rounddown(bootspace.segs[i].va + 1881 bootspace.segs[i].sz, NBPD_L2); 1882 pa = roundup(bootspace.segs[i].pa, NBPD_L2); 1883 for (/* */; kva < kva_end; kva += NBPD_L2, pa += NBPD_L2) { 1884 pde = &L2_BASE[pl2_i(kva)]; 1885 *pde = pa | pmap_pg_g | PTE_PS | pmap_pg_nx | PTE_P; 1886 tlbflushg(); 1887 } 1888 } 1889 1890 /* Remap the kernel data+bss using large pages. */ 1891 for (i = 0; i < BTSPACE_NSEGS; i++) { 1892 if (bootspace.segs[i].type != BTSEG_DATA) { 1893 continue; 1894 } 1895 kva = roundup(bootspace.segs[i].va, NBPD_L2); 1896 if (kva < bootspace.segs[i].va) { 1897 continue; 1898 } 1899 kva_end = rounddown(bootspace.segs[i].va + 1900 bootspace.segs[i].sz, NBPD_L2); 1901 pa = roundup(bootspace.segs[i].pa, NBPD_L2); 1902 for (/* */; kva < kva_end; kva += NBPD_L2, pa += NBPD_L2) { 1903 pde = &L2_BASE[pl2_i(kva)]; 1904 *pde = pa | pmap_pg_g | PTE_PS | pmap_pg_nx | PTE_W | PTE_P; 1905 tlbflushg(); 1906 } 1907 } 1908 } 1909 #endif /* !XENPV */ 1910 1911 /* 1912 * pmap_init: called from uvm_init, our job is to get the pmap system ready 1913 * to manage mappings. 1914 */ 1915 void 1916 pmap_init(void) 1917 { 1918 int flags; 1919 1920 /* 1921 * initialize caches. 
1922 */ 1923 1924 pool_cache_bootstrap(&pmap_cache, sizeof(struct pmap), COHERENCY_UNIT, 1925 0, 0, "pmappl", NULL, IPL_NONE, pmap_ctor, pmap_dtor, NULL); 1926 1927 #ifdef XENPV 1928 /* 1929 * pool_cache(9) should not touch cached objects, since they 1930 * are pinned on xen and R/O for the domU 1931 */ 1932 flags = PR_NOTOUCH; 1933 #else 1934 flags = 0; 1935 #endif 1936 1937 #ifdef PAE 1938 pool_init(&pmap_pdp_pool, PAGE_SIZE * PDP_SIZE, 0, 0, flags, 1939 "pdppl", &pmap_pdp_allocator, IPL_NONE); 1940 #else 1941 pool_init(&pmap_pdp_pool, PAGE_SIZE, 0, 0, flags, 1942 "pdppl", NULL, IPL_NONE); 1943 #endif 1944 pool_cache_bootstrap(&pmap_pvp_cache, PAGE_SIZE, PAGE_SIZE, 1945 0, 0, "pvpage", &pool_allocator_kmem, 1946 IPL_NONE, pmap_pvp_ctor, pmap_pvp_dtor, NULL); 1947 1948 pmap_tlb_init(); 1949 1950 /* XXX: Since cpu_hatch() is only for secondary CPUs. */ 1951 pmap_tlb_cpu_init(curcpu()); 1952 1953 evcnt_attach_dynamic(&pmap_iobmp_evcnt, EVCNT_TYPE_MISC, 1954 NULL, "x86", "io bitmap copy"); 1955 evcnt_attach_dynamic(&pmap_ldt_evcnt, EVCNT_TYPE_MISC, 1956 NULL, "x86", "ldt sync"); 1957 1958 /* 1959 * The kernel doesn't keep track of PTPs, so there's nowhere handy 1960 * to hang a tree of pv_entry records. Dynamically allocated 1961 * pv_entry lists are not heavily used in the kernel's pmap (the 1962 * usual case is embedded), so cop out and use a single RB tree 1963 * to cover them. 1964 */ 1965 rb_tree_init(&pmap_kernel_rb, &pmap_rbtree_ops); 1966 1967 /* 1968 * done: pmap module is up (and ready for business) 1969 */ 1970 1971 pmap_initialized = true; 1972 } 1973 1974 #ifndef XENPV 1975 /* 1976 * pmap_cpu_init_late: perform late per-CPU initialization. 1977 */ 1978 void 1979 pmap_cpu_init_late(struct cpu_info *ci) 1980 { 1981 /* 1982 * The BP has already its own PD page allocated during early 1983 * MD startup. 1984 */ 1985 if (ci == &cpu_info_primary) 1986 return; 1987 #ifdef PAE 1988 cpu_alloc_l3_page(ci); 1989 #endif 1990 } 1991 #endif 1992 1993 #ifndef __HAVE_DIRECT_MAP 1994 CTASSERT(CACHE_LINE_SIZE > sizeof(pt_entry_t)); 1995 CTASSERT(CACHE_LINE_SIZE % sizeof(pt_entry_t) == 0); 1996 1997 static void 1998 pmap_vpage_cpualloc(struct cpu_info *ci) 1999 { 2000 bool primary = (ci == &cpu_info_primary); 2001 size_t i, npages; 2002 vaddr_t vabase; 2003 vsize_t vrange; 2004 2005 npages = (CACHE_LINE_SIZE / sizeof(pt_entry_t)); 2006 KASSERT(npages >= VPAGE_MAX); 2007 vrange = npages * PAGE_SIZE; 2008 2009 if (primary) { 2010 while ((vabase = pmap_bootstrap_valloc(1)) % vrange != 0) { 2011 /* Waste some pages to align properly */ 2012 } 2013 /* The base is aligned, allocate the rest (contiguous) */ 2014 pmap_bootstrap_valloc(npages - 1); 2015 } else { 2016 vabase = uvm_km_alloc(kernel_map, vrange, vrange, 2017 UVM_KMF_VAONLY); 2018 if (vabase == 0) { 2019 panic("%s: failed to allocate tmp VA for CPU %d\n", 2020 __func__, cpu_index(ci)); 2021 } 2022 } 2023 2024 KASSERT((vaddr_t)&PTE_BASE[pl1_i(vabase)] % CACHE_LINE_SIZE == 0); 2025 2026 for (i = 0; i < VPAGE_MAX; i++) { 2027 ci->vpage[i] = vabase + i * PAGE_SIZE; 2028 ci->vpage_pte[i] = PTE_BASE + pl1_i(ci->vpage[i]); 2029 } 2030 } 2031 2032 void 2033 pmap_vpage_cpu_init(struct cpu_info *ci) 2034 { 2035 if (ci == &cpu_info_primary) { 2036 /* cpu0 already taken care of in pmap_bootstrap */ 2037 return; 2038 } 2039 2040 pmap_vpage_cpualloc(ci); 2041 } 2042 #endif 2043 2044 /* 2045 * p v _ e n t r y f u n c t i o n s 2046 */ 2047 2048 /* 2049 * pmap_pvp_dtor: pool_cache constructor for PV pages. 
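 * (This is the constructor, pmap_pvp_ctor() below.)  Each PV page is
 * laid out with the pv_page header occupying the first pv_entry-sized
 * slot, followed by PVE_PER_PVP usable entries, roughly:
 *
 *	+-----------+------------+------------+----
 *	|  pv_page  |  pv_entry  |  pv_entry  | ...   (PVE_PER_PVP total)
 *	+-----------+------------+------------+----
 *
 * which is why the code relies on
 * sizeof(struct pv_page) <= sizeof(struct pv_entry).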
2050 */ 2051 static int 2052 pmap_pvp_ctor(void *arg, void *obj, int flags) 2053 { 2054 struct pv_page *pvp = (struct pv_page *)obj; 2055 struct pv_entry *pve = (struct pv_entry *)obj + 1; 2056 struct pv_entry *maxpve = pve + PVE_PER_PVP; 2057 2058 KASSERT(sizeof(struct pv_page) <= sizeof(struct pv_entry)); 2059 KASSERT(trunc_page((vaddr_t)obj) == (vaddr_t)obj); 2060 2061 LIST_INIT(&pvp->pvp_pves); 2062 pvp->pvp_nfree = PVE_PER_PVP; 2063 pvp->pvp_pmap = NULL; 2064 2065 for (; pve < maxpve; pve++) { 2066 LIST_INSERT_HEAD(&pvp->pvp_pves, pve, pve_list); 2067 } 2068 2069 return 0; 2070 } 2071 2072 /* 2073 * pmap_pvp_dtor: pool_cache destructor for PV pages. 2074 */ 2075 static void 2076 pmap_pvp_dtor(void *arg, void *obj) 2077 { 2078 struct pv_page *pvp __diagused = obj; 2079 2080 KASSERT(pvp->pvp_pmap == NULL); 2081 KASSERT(pvp->pvp_nfree == PVE_PER_PVP); 2082 } 2083 2084 /* 2085 * pmap_alloc_pv: allocate a PV entry (likely cached with pmap). 2086 */ 2087 static struct pv_entry * 2088 pmap_alloc_pv(struct pmap *pmap) 2089 { 2090 struct pv_entry *pve; 2091 struct pv_page *pvp; 2092 2093 KASSERT(mutex_owned(&pmap->pm_lock)); 2094 2095 if (__predict_false((pvp = LIST_FIRST(&pmap->pm_pvp_part)) == NULL)) { 2096 if ((pvp = LIST_FIRST(&pmap->pm_pvp_full)) != NULL) { 2097 LIST_REMOVE(pvp, pvp_list); 2098 } else { 2099 pvp = pool_cache_get(&pmap_pvp_cache, PR_NOWAIT); 2100 } 2101 if (__predict_false(pvp == NULL)) { 2102 return NULL; 2103 } 2104 /* full -> part */ 2105 LIST_INSERT_HEAD(&pmap->pm_pvp_part, pvp, pvp_list); 2106 pvp->pvp_pmap = pmap; 2107 } 2108 2109 KASSERT(pvp->pvp_pmap == pmap); 2110 KASSERT(pvp->pvp_nfree > 0); 2111 2112 pve = LIST_FIRST(&pvp->pvp_pves); 2113 LIST_REMOVE(pve, pve_list); 2114 pvp->pvp_nfree--; 2115 2116 if (__predict_false(pvp->pvp_nfree == 0)) { 2117 /* part -> empty */ 2118 KASSERT(LIST_EMPTY(&pvp->pvp_pves)); 2119 LIST_REMOVE(pvp, pvp_list); 2120 LIST_INSERT_HEAD(&pmap->pm_pvp_empty, pvp, pvp_list); 2121 } else { 2122 KASSERT(!LIST_EMPTY(&pvp->pvp_pves)); 2123 } 2124 2125 return pve; 2126 } 2127 2128 /* 2129 * pmap_free_pv: delayed free of a PV entry. 2130 */ 2131 static void 2132 pmap_free_pv(struct pmap *pmap, struct pv_entry *pve) 2133 { 2134 struct pv_page *pvp = (struct pv_page *)trunc_page((vaddr_t)pve); 2135 2136 KASSERT(mutex_owned(&pmap->pm_lock)); 2137 KASSERT(pvp->pvp_pmap == pmap); 2138 KASSERT(pvp->pvp_nfree >= 0); 2139 2140 LIST_INSERT_HEAD(&pvp->pvp_pves, pve, pve_list); 2141 pvp->pvp_nfree++; 2142 2143 if (__predict_false(pvp->pvp_nfree == 1)) { 2144 /* empty -> part */ 2145 LIST_REMOVE(pvp, pvp_list); 2146 LIST_INSERT_HEAD(&pmap->pm_pvp_part, pvp, pvp_list); 2147 } else if (__predict_false(pvp->pvp_nfree == PVE_PER_PVP)) { 2148 /* part -> full */ 2149 LIST_REMOVE(pvp, pvp_list); 2150 LIST_INSERT_HEAD(&pmap->pm_pvp_full, pvp, pvp_list); 2151 } 2152 } 2153 2154 /* 2155 * pmap_drain_pv: free full PV pages. 
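 * Note that the list names refer to the number of free entries:
 * pm_pvp_full holds pages whose entries are all free (and which can
 * therefore be handed back to pmap_pvp_cache here), pm_pvp_part holds
 * partially used pages, and pm_pvp_empty holds pages with no free
 * entries left.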
2156 */ 2157 static void 2158 pmap_drain_pv(struct pmap *pmap) 2159 { 2160 struct pv_page *pvp; 2161 2162 KASSERT(mutex_owned(&pmap->pm_lock)); 2163 2164 while ((pvp = LIST_FIRST(&pmap->pm_pvp_full)) != NULL) { 2165 LIST_REMOVE(pvp, pvp_list); 2166 KASSERT(pvp->pvp_pmap == pmap); 2167 KASSERT(pvp->pvp_nfree == PVE_PER_PVP); 2168 pvp->pvp_pmap = NULL; 2169 pool_cache_put(&pmap_pvp_cache, pvp); 2170 } 2171 } 2172 2173 /* 2174 * pmap_check_pv: verify {VA, PTP} pair is either tracked/untracked by page 2175 */ 2176 static void 2177 pmap_check_pv(struct pmap *pmap, struct vm_page *ptp, struct pmap_page *pp, 2178 vaddr_t va, bool tracked) 2179 { 2180 #ifdef DEBUG 2181 struct pv_pte *pvpte; 2182 2183 PMAP_CHECK_PP(pp); 2184 2185 mutex_spin_enter(&pp->pp_lock); 2186 for (pvpte = pv_pte_first(pp); pvpte; pvpte = pv_pte_next(pp, pvpte)) { 2187 if (pvpte->pte_ptp == ptp && pvpte->pte_va == va) { 2188 break; 2189 } 2190 } 2191 mutex_spin_exit(&pp->pp_lock); 2192 2193 if (pvpte && !tracked) { 2194 panic("pmap_check_pv: %p/%lx found on pp %p", ptp, va, pp); 2195 } else if (!pvpte && tracked) { 2196 panic("pmap_check_pv: %p/%lx missing on pp %p", ptp, va, pp); 2197 } 2198 #endif 2199 } 2200 2201 /* 2202 * pmap_treelookup_pv: search the PV tree for a dynamic entry 2203 * 2204 * => pmap must be locked 2205 */ 2206 static struct pv_entry * 2207 pmap_treelookup_pv(const struct pmap *pmap, const struct vm_page *ptp, 2208 const rb_tree_t *tree, const vaddr_t va) 2209 { 2210 struct pv_entry *pve; 2211 rb_node_t *node; 2212 2213 /* 2214 * Inlined lookup tailored for exactly what's needed here that is 2215 * quite a bit faster than using rb_tree_find_node(). 2216 */ 2217 for (node = tree->rbt_root;;) { 2218 if (__predict_false(RB_SENTINEL_P(node))) { 2219 return NULL; 2220 } 2221 pve = (struct pv_entry *) 2222 ((uintptr_t)node - offsetof(struct pv_entry, pve_rb)); 2223 if (pve->pve_pte.pte_va == va) { 2224 KASSERT(pve->pve_pte.pte_ptp == ptp); 2225 return pve; 2226 } 2227 node = node->rb_nodes[pve->pve_pte.pte_va < va]; 2228 } 2229 } 2230 2231 /* 2232 * pmap_lookup_pv: look up a non-embedded pv entry for the given pmap 2233 * 2234 * => a PV entry must be known present (doesn't check for existence) 2235 * => pmap must be locked 2236 */ 2237 static struct pv_entry * 2238 pmap_lookup_pv(const struct pmap *pmap, const struct vm_page *ptp, 2239 const struct pmap_page * const old_pp, const vaddr_t va) 2240 { 2241 struct pv_entry *pve; 2242 const rb_tree_t *tree; 2243 2244 KASSERT(mutex_owned(&pmap->pm_lock)); 2245 KASSERT(ptp != NULL || pmap == pmap_kernel()); 2246 2247 /* 2248 * [This mostly deals with the case of process-private pages, i.e. 2249 * anonymous memory allocations or COW.] 2250 * 2251 * If the page is tracked with an embedded entry then the tree 2252 * lookup can be avoided. It's safe to check for this specific 2253 * set of values without pp_lock because both will only ever be 2254 * set together for this pmap. 2255 * 2256 */ 2257 if (atomic_load_relaxed(&old_pp->pp_pte.pte_ptp) == ptp && 2258 atomic_load_relaxed(&old_pp->pp_pte.pte_va) == va) { 2259 return NULL; 2260 } 2261 2262 /* 2263 * [This mostly deals with shared mappings, for example shared libs 2264 * and executables.] 2265 * 2266 * Optimise for pmap_remove_ptes() which works by ascending scan: 2267 * look at the lowest numbered node in the tree first. The tree is 2268 * known non-empty because of the check above. For short lived 2269 * processes where pmap_remove() isn't used much this gets close to 2270 * a 100% hit rate. 
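 * To summarise the PV tracking scheme: each pmap_page carries one
 * embedded pv_pte for the common single-mapping case, plus a list and
 * a per-PTP red-black tree of dynamically allocated pv_entry
 * structures for any additional mappings.  The tree's minimum node is
 * available in O(1) via its cached leftmost pointer, which is what the
 * fast path below exploits.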
2271 */ 2272 tree = (ptp != NULL ? &VM_PAGE_TO_PP(ptp)->pp_rb : &pmap_kernel_rb); 2273 KASSERT(!RB_SENTINEL_P(tree->rbt_root)); 2274 pve = (struct pv_entry *) 2275 ((uintptr_t)tree->rbt_minmax[RB_DIR_LEFT] - 2276 offsetof(struct pv_entry, pve_rb)); 2277 if (__predict_true(pve->pve_pte.pte_va == va)) { 2278 KASSERT(pve->pve_pte.pte_ptp == ptp); 2279 return pve; 2280 } 2281 2282 /* Search the RB tree for the key (uncommon). */ 2283 return pmap_treelookup_pv(pmap, ptp, tree, va); 2284 } 2285 2286 /* 2287 * pmap_enter_pv: enter a mapping onto a pmap_page lst 2288 * 2289 * => pmap must be locked 2290 * => does NOT insert dynamic entries to tree (pmap_enter() does later) 2291 */ 2292 static int 2293 pmap_enter_pv(struct pmap *pmap, struct pmap_page *pp, struct vm_page *ptp, 2294 vaddr_t va, struct pv_entry **new_pve, struct pv_entry **old_pve, 2295 bool *samepage, bool *new_embedded, rb_tree_t *tree) 2296 { 2297 struct pv_entry *pve; 2298 int error; 2299 2300 KASSERT(mutex_owned(&pmap->pm_lock)); 2301 KASSERT(ptp_to_pmap(ptp) == pmap); 2302 KASSERT(ptp == NULL || ptp->uobject != NULL); 2303 KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset); 2304 PMAP_CHECK_PP(pp); 2305 2306 /* 2307 * If entering the same page and it's already tracked with an 2308 * embedded entry, we can avoid the expense below. It's safe 2309 * to check for this very specific set of values without a lock 2310 * because both will only ever be set together for this pmap. 2311 */ 2312 if (atomic_load_relaxed(&pp->pp_pte.pte_ptp) == ptp && 2313 atomic_load_relaxed(&pp->pp_pte.pte_va) == va) { 2314 *samepage = true; 2315 pmap_check_pv(pmap, ptp, pp, va, true); 2316 return 0; 2317 } 2318 2319 /* 2320 * Check for an existing dynamic mapping at this address. If it's 2321 * for the same page, then it will be reused and nothing needs to be 2322 * changed. 2323 */ 2324 *old_pve = pmap_treelookup_pv(pmap, ptp, tree, va); 2325 if (*old_pve != NULL && (*old_pve)->pve_pp == pp) { 2326 *samepage = true; 2327 pmap_check_pv(pmap, ptp, pp, va, true); 2328 return 0; 2329 } 2330 2331 /* 2332 * Need to put a new mapping in place. Grab a spare pv_entry in 2333 * case it's needed; won't know for sure until the lock is taken. 2334 */ 2335 if (pmap->pm_pve == NULL) { 2336 pmap->pm_pve = pmap_alloc_pv(pmap); 2337 } 2338 2339 error = 0; 2340 pmap_check_pv(pmap, ptp, pp, va, false); 2341 mutex_spin_enter(&pp->pp_lock); 2342 if (!pv_pte_embedded(pp)) { 2343 /* 2344 * Embedded PV tracking available - easy. 2345 */ 2346 pp->pp_pte.pte_ptp = ptp; 2347 pp->pp_pte.pte_va = va; 2348 *new_embedded = true; 2349 } else if (__predict_false(pmap->pm_pve == NULL)) { 2350 /* 2351 * No memory. 2352 */ 2353 error = ENOMEM; 2354 } else { 2355 /* 2356 * Install new pv_entry on the page. 
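 * The spare entry preallocated above (pmap->pm_pve) is consumed and
 * linked onto the page's pp_pvlist under pp_lock; insertion into the
 * per-PTP tree is left to the caller, as noted in the function header.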
2357 */ 2358 pve = pmap->pm_pve; 2359 pmap->pm_pve = NULL; 2360 *new_pve = pve; 2361 pve->pve_pte.pte_ptp = ptp; 2362 pve->pve_pte.pte_va = va; 2363 pve->pve_pp = pp; 2364 LIST_INSERT_HEAD(&pp->pp_pvlist, pve, pve_list); 2365 } 2366 mutex_spin_exit(&pp->pp_lock); 2367 if (error == 0) { 2368 pmap_check_pv(pmap, ptp, pp, va, true); 2369 } 2370 2371 return error; 2372 } 2373 2374 /* 2375 * pmap_remove_pv: try to remove a mapping from a pv_list 2376 * 2377 * => pmap must be locked 2378 * => removes dynamic entries from tree and frees them 2379 * => caller should adjust ptp's wire_count and free PTP if needed 2380 */ 2381 static void 2382 pmap_remove_pv(struct pmap *pmap, struct pmap_page *pp, struct vm_page *ptp, 2383 vaddr_t va, struct pv_entry *pve, uint8_t oattrs) 2384 { 2385 rb_tree_t *tree = (ptp != NULL ? 2386 &VM_PAGE_TO_PP(ptp)->pp_rb : &pmap_kernel_rb); 2387 2388 KASSERT(mutex_owned(&pmap->pm_lock)); 2389 KASSERT(ptp_to_pmap(ptp) == pmap); 2390 KASSERT(ptp == NULL || ptp->uobject != NULL); 2391 KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset); 2392 KASSERT(ptp != NULL || pmap == pmap_kernel()); 2393 2394 pmap_check_pv(pmap, ptp, pp, va, true); 2395 2396 if (pve == NULL) { 2397 mutex_spin_enter(&pp->pp_lock); 2398 KASSERT(pp->pp_pte.pte_ptp == ptp); 2399 KASSERT(pp->pp_pte.pte_va == va); 2400 pp->pp_attrs |= oattrs; 2401 pp->pp_pte.pte_ptp = NULL; 2402 pp->pp_pte.pte_va = 0; 2403 mutex_spin_exit(&pp->pp_lock); 2404 } else { 2405 mutex_spin_enter(&pp->pp_lock); 2406 KASSERT(pp->pp_pte.pte_ptp != ptp || 2407 pp->pp_pte.pte_va != va); 2408 KASSERT(pve->pve_pte.pte_ptp == ptp); 2409 KASSERT(pve->pve_pte.pte_va == va); 2410 KASSERT(pve->pve_pp == pp); 2411 pp->pp_attrs |= oattrs; 2412 LIST_REMOVE(pve, pve_list); 2413 mutex_spin_exit(&pp->pp_lock); 2414 2415 KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == pve); 2416 rb_tree_remove_node(tree, pve); 2417 #ifdef DIAGNOSTIC 2418 memset(pve, 0, sizeof(*pve)); 2419 #endif 2420 pmap_free_pv(pmap, pve); 2421 } 2422 2423 KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == NULL); 2424 pmap_check_pv(pmap, ptp, pp, va, false); 2425 } 2426 2427 /* 2428 * p t p f u n c t i o n s 2429 */ 2430 2431 static struct vm_page * 2432 pmap_find_ptp(struct pmap *pmap, vaddr_t va, int level) 2433 { 2434 int lidx = level - 1; 2435 off_t off = ptp_va2o(va, level); 2436 struct vm_page *pg; 2437 2438 KASSERT(mutex_owned(&pmap->pm_lock)); 2439 2440 if (pmap->pm_ptphint[lidx] && off == pmap->pm_ptphint[lidx]->offset) { 2441 KASSERT(pmap->pm_ptphint[lidx]->wire_count > 0); 2442 pg = pmap->pm_ptphint[lidx]; 2443 PMAP_CHECK_PP(VM_PAGE_TO_PP(pg)); 2444 return pg; 2445 } 2446 PMAP_DUMMY_LOCK(pmap); 2447 pg = uvm_pagelookup(&pmap->pm_obj[lidx], off); 2448 PMAP_DUMMY_UNLOCK(pmap); 2449 if (pg != NULL && __predict_false(pg->wire_count == 0)) { 2450 /* This page is queued to be freed - ignore. */ 2451 pg = NULL; 2452 } 2453 if (pg != NULL) { 2454 PMAP_CHECK_PP(VM_PAGE_TO_PP(pg)); 2455 } 2456 pmap->pm_ptphint[lidx] = pg; 2457 return pg; 2458 } 2459 2460 static inline void 2461 pmap_freepage(struct pmap *pmap, struct vm_page *ptp, int level) 2462 { 2463 int lidx; 2464 2465 KASSERT(ptp->wire_count <= 1); 2466 PMAP_CHECK_PP(VM_PAGE_TO_PP(ptp)); 2467 2468 lidx = level - 1; 2469 pmap_stats_update(pmap, -ptp->wire_count, 0); 2470 if (pmap->pm_ptphint[lidx] == ptp) 2471 pmap->pm_ptphint[lidx] = NULL; 2472 ptp->wire_count = 0; 2473 ptp->uanon = NULL; 2474 KASSERT(RB_TREE_MIN(&VM_PAGE_TO_PP(ptp)->pp_rb) == NULL); 2475 2476 /* 2477 * Enqueue the PTP to be freed by pmap_update(). 
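 * (At this point the PTP is only linked onto the pmap's pm_gc_ptp
 * list.)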
We can't remove 2478 * the page from the uvm_object, as that can take further locks 2479 * (intolerable right now because the PTEs are likely mapped in). 2480 * Instead mark the PTP as free and if we bump into it again, we'll 2481 * either ignore or reuse (depending on what's useful at the time). 2482 */ 2483 LIST_INSERT_HEAD(&pmap->pm_gc_ptp, ptp, mdpage.mp_pp.pp_link); 2484 } 2485 2486 static void 2487 pmap_free_ptp(struct pmap *pmap, struct vm_page *ptp, vaddr_t va, 2488 pt_entry_t *ptes, pd_entry_t * const *pdes) 2489 { 2490 unsigned long index; 2491 int level; 2492 vaddr_t invaladdr; 2493 pd_entry_t opde; 2494 2495 KASSERT(pmap != pmap_kernel()); 2496 KASSERT(mutex_owned(&pmap->pm_lock)); 2497 KASSERT(kpreempt_disabled()); 2498 2499 level = 1; 2500 do { 2501 index = pl_i(va, level + 1); 2502 opde = pmap_pte_testset(&pdes[level - 1][index], 0); 2503 2504 /* 2505 * On Xen-amd64 or SVS, we need to sync the top level page 2506 * directory on each CPU. 2507 */ 2508 #if defined(XENPV) && defined(__x86_64__) 2509 if (level == PTP_LEVELS - 1) { 2510 xen_kpm_sync(pmap, index); 2511 } 2512 #elif defined(SVS) 2513 if (svs_enabled && level == PTP_LEVELS - 1 && 2514 pmap_is_user(pmap)) { 2515 svs_pmap_sync(pmap, index); 2516 } 2517 #endif 2518 2519 invaladdr = level == 1 ? (vaddr_t)ptes : 2520 (vaddr_t)pdes[level - 2]; 2521 pmap_tlb_shootdown(pmap, invaladdr + index * PAGE_SIZE, 2522 opde, TLBSHOOT_FREE_PTP); 2523 2524 #if defined(XENPV) 2525 pmap_tlb_shootnow(); 2526 #endif 2527 2528 pmap_freepage(pmap, ptp, level); 2529 if (level < PTP_LEVELS - 1) { 2530 ptp = pmap_find_ptp(pmap, va, level + 1); 2531 ptp->wire_count--; 2532 if (ptp->wire_count > 1) 2533 break; 2534 } 2535 } while (++level < PTP_LEVELS); 2536 pmap_pte_flush(); 2537 } 2538 2539 /* 2540 * pmap_get_ptp: get a PTP (if there isn't one, allocate a new one) 2541 * 2542 * => pmap should NOT be pmap_kernel() 2543 * => pmap should be locked 2544 * => we are not touching any PTEs yet, so they need not be mapped in 2545 */ 2546 static int 2547 pmap_get_ptp(struct pmap *pmap, struct pmap_ptparray *pt, vaddr_t va, 2548 int flags, struct vm_page **resultp) 2549 { 2550 struct vm_page *ptp; 2551 int i, aflags; 2552 struct uvm_object *obj; 2553 voff_t off; 2554 2555 KASSERT(pmap != pmap_kernel()); 2556 KASSERT(mutex_owned(&pmap->pm_lock)); 2557 2558 /* 2559 * Loop through all page table levels allocating a page 2560 * for any level where we don't already have one. 2561 */ 2562 memset(pt, 0, sizeof(*pt)); 2563 aflags = ((flags & PMAP_CANFAIL) ? 0 : UVM_PGA_USERESERVE) | 2564 UVM_PGA_ZERO; 2565 for (i = PTP_LEVELS; i > 1; i--) { 2566 obj = &pmap->pm_obj[i - 2]; 2567 off = ptp_va2o(va, i - 1); 2568 2569 PMAP_DUMMY_LOCK(pmap); 2570 pt->pg[i] = uvm_pagelookup(obj, off); 2571 2572 if (pt->pg[i] == NULL) { 2573 pt->pg[i] = uvm_pagealloc(obj, off, NULL, aflags); 2574 pt->alloced[i] = (pt->pg[i] != NULL); 2575 } else if (pt->pg[i]->wire_count == 0) { 2576 /* This page was queued to be freed; dequeue it. 
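 * The page is then treated as freshly allocated, so its PV tree is
 * re-initialised below.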
*/ 2577 LIST_REMOVE(pt->pg[i], mdpage.mp_pp.pp_link); 2578 pt->alloced[i] = true; 2579 } 2580 PMAP_DUMMY_UNLOCK(pmap); 2581 if (pt->pg[i] == NULL) { 2582 pmap_unget_ptp(pmap, pt); 2583 return ENOMEM; 2584 } else if (pt->alloced[i]) { 2585 pt->pg[i]->uanon = (struct vm_anon *)(vaddr_t)~0L; 2586 rb_tree_init(&VM_PAGE_TO_PP(pt->pg[i])->pp_rb, 2587 &pmap_rbtree_ops); 2588 PMAP_CHECK_PP(VM_PAGE_TO_PP(pt->pg[i])); 2589 } 2590 } 2591 ptp = pt->pg[2]; 2592 KASSERT(ptp != NULL); 2593 *resultp = ptp; 2594 pmap->pm_ptphint[0] = ptp; 2595 return 0; 2596 } 2597 2598 /* 2599 * pmap_install_ptp: install any freshly allocated PTPs 2600 * 2601 * => pmap should NOT be pmap_kernel() 2602 * => pmap should be locked 2603 * => PTEs must be mapped 2604 * => preemption must be disabled 2605 */ 2606 static void 2607 pmap_install_ptp(struct pmap *pmap, struct pmap_ptparray *pt, vaddr_t va, 2608 pd_entry_t * const *pdes) 2609 { 2610 struct vm_page *ptp; 2611 unsigned long index; 2612 pd_entry_t *pva; 2613 paddr_t pa; 2614 int i; 2615 2616 KASSERT(pmap != pmap_kernel()); 2617 KASSERT(mutex_owned(&pmap->pm_lock)); 2618 KASSERT(kpreempt_disabled()); 2619 2620 /* 2621 * Now that we have all the pages looked up or allocated, 2622 * loop through again installing any new ones into the tree. 2623 */ 2624 for (i = PTP_LEVELS; i > 1; i--) { 2625 index = pl_i(va, i); 2626 pva = pdes[i - 2]; 2627 2628 if (pmap_valid_entry(pva[index])) { 2629 KASSERT(!pt->alloced[i]); 2630 continue; 2631 } 2632 2633 ptp = pt->pg[i]; 2634 ptp->flags &= ~PG_BUSY; /* never busy */ 2635 ptp->wire_count = 1; 2636 pmap->pm_ptphint[i - 2] = ptp; 2637 pa = VM_PAGE_TO_PHYS(ptp); 2638 pmap_pte_set(&pva[index], (pd_entry_t) 2639 (pmap_pa2pte(pa) | PTE_U | PTE_W | PTE_P)); 2640 2641 /* 2642 * On Xen-amd64 or SVS, we need to sync the top level page 2643 * directory on each CPU. 2644 */ 2645 #if defined(XENPV) && defined(__x86_64__) 2646 if (i == PTP_LEVELS) { 2647 xen_kpm_sync(pmap, index); 2648 } 2649 #elif defined(SVS) 2650 if (svs_enabled && i == PTP_LEVELS && 2651 pmap_is_user(pmap)) { 2652 svs_pmap_sync(pmap, index); 2653 } 2654 #endif 2655 2656 pmap_pte_flush(); 2657 pmap_stats_update(pmap, 1, 0); 2658 2659 /* 2660 * If we're not in the top level, increase the 2661 * wire count of the parent page. 2662 */ 2663 if (i < PTP_LEVELS) { 2664 pt->pg[i + 1]->wire_count++; 2665 } 2666 } 2667 } 2668 2669 /* 2670 * pmap_unget_ptp: free unusued PTPs 2671 * 2672 * => pmap should NOT be pmap_kernel() 2673 * => pmap should be locked 2674 */ 2675 static void 2676 pmap_unget_ptp(struct pmap *pmap, struct pmap_ptparray *pt) 2677 { 2678 int i; 2679 2680 KASSERT(pmap != pmap_kernel()); 2681 KASSERT(mutex_owned(&pmap->pm_lock)); 2682 2683 for (i = PTP_LEVELS; i > 1; i--) { 2684 if (!pt->alloced[i]) { 2685 continue; 2686 } 2687 KASSERT(pt->pg[i]->wire_count == 0); 2688 PMAP_CHECK_PP(VM_PAGE_TO_PP(pt->pg[i])); 2689 pmap_freepage(pmap, pt->pg[i], i - 1); 2690 } 2691 } 2692 2693 /* 2694 * p m a p l i f e c y c l e f u n c t i o n s 2695 */ 2696 2697 /* 2698 * pmap_pdp_init: constructor a new PDP. 2699 */ 2700 static void 2701 pmap_pdp_init(pd_entry_t *pdir) 2702 { 2703 paddr_t pdirpa = 0; 2704 vaddr_t object; 2705 int i; 2706 2707 #if !defined(XENPV) || !defined(__x86_64__) 2708 int npde; 2709 #endif 2710 #ifdef XENPV 2711 int s; 2712 #endif 2713 2714 memset(PAGE_ALIGNED(pdir), 0, PDP_SIZE * PAGE_SIZE); 2715 2716 /* 2717 * NOTE: This is all done unlocked, but we will check afterwards 2718 * if we have raced with pmap_growkernel(). 
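 *
 * That check is done by the caller, which re-runs this constructor
 * until its snapshot of the kernel PDEs is current.  A condensed
 * sketch of the retry loop (see pmap_ctor() later in this file):
 *
 *	for (;;) {
 *		pmap_pdp_init(pmap->pm_pdir);
 *		mutex_enter(&pmaps_lock);
 *		if (pmap->pm_pdir[PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1] - 1] != 0)
 *			break;		... kernel PDEs are up to date
 *		mutex_exit(&pmaps_lock);	... raced with growkernel, retry
 *	}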
2719 */ 2720 2721 #if defined(XENPV) && defined(__x86_64__) 2722 /* Fetch the physical address of the page directory */ 2723 (void)pmap_extract(pmap_kernel(), (vaddr_t)pdir, &pdirpa); 2724 2725 /* 2726 * This pdir will NEVER be active in kernel mode, so mark 2727 * recursive entry invalid. 2728 */ 2729 pdir[PDIR_SLOT_PTE] = pmap_pa2pte(pdirpa); 2730 2731 /* 2732 * PDP constructed this way won't be for the kernel, hence we 2733 * don't put kernel mappings on Xen. 2734 * 2735 * But we need to make pmap_create() happy, so put a dummy 2736 * (without PTE_P) value at the right place. 2737 */ 2738 pdir[PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1] - 1] = 2739 (pd_entry_t)-1 & PTE_FRAME; 2740 #else /* XENPV && __x86_64__*/ 2741 object = (vaddr_t)pdir; 2742 for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) { 2743 /* Fetch the physical address of the page directory */ 2744 (void)pmap_extract(pmap_kernel(), object, &pdirpa); 2745 2746 /* Put in recursive PDE to map the PTEs */ 2747 pdir[PDIR_SLOT_PTE + i] = pmap_pa2pte(pdirpa) | PTE_P | 2748 pmap_pg_nx; 2749 #ifndef XENPV 2750 pdir[PDIR_SLOT_PTE + i] |= PTE_W; 2751 #endif 2752 } 2753 2754 /* Copy the kernel's top level PDE */ 2755 npde = nkptp[PTP_LEVELS - 1]; 2756 2757 memcpy(&pdir[PDIR_SLOT_KERN], &PDP_BASE[PDIR_SLOT_KERN], 2758 npde * sizeof(pd_entry_t)); 2759 2760 if (VM_MIN_KERNEL_ADDRESS != KERNBASE) { 2761 int idx = pl_i(KERNBASE, PTP_LEVELS); 2762 pdir[idx] = PDP_BASE[idx]; 2763 } 2764 2765 #ifdef __HAVE_PCPU_AREA 2766 pdir[PDIR_SLOT_PCPU] = PDP_BASE[PDIR_SLOT_PCPU]; 2767 #endif 2768 #ifdef __HAVE_DIRECT_MAP 2769 slotspace_copy(SLAREA_DMAP, pdir, PDP_BASE); 2770 #endif 2771 #ifdef KASAN 2772 slotspace_copy(SLAREA_ASAN, pdir, PDP_BASE); 2773 #endif 2774 #ifdef KMSAN 2775 slotspace_copy(SLAREA_MSAN, pdir, PDP_BASE); 2776 #endif 2777 #endif /* XENPV && __x86_64__*/ 2778 2779 #ifdef XENPV 2780 s = splvm(); 2781 object = (vaddr_t)pdir; 2782 pmap_protect(pmap_kernel(), object, object + (PAGE_SIZE * PDP_SIZE), 2783 VM_PROT_READ); 2784 pmap_update(pmap_kernel()); 2785 for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) { 2786 /* 2787 * pin as L2/L4 page, we have to do the page with the 2788 * PDIR_SLOT_PTE entries last 2789 */ 2790 #ifdef PAE 2791 if (i == l2tol3(PDIR_SLOT_PTE)) 2792 continue; 2793 #endif 2794 2795 (void) pmap_extract(pmap_kernel(), object, &pdirpa); 2796 #ifdef __x86_64__ 2797 xpq_queue_pin_l4_table(xpmap_ptom_masked(pdirpa)); 2798 #else 2799 xpq_queue_pin_l2_table(xpmap_ptom_masked(pdirpa)); 2800 #endif 2801 } 2802 #ifdef PAE 2803 object = ((vaddr_t)pdir) + PAGE_SIZE * l2tol3(PDIR_SLOT_PTE); 2804 (void)pmap_extract(pmap_kernel(), object, &pdirpa); 2805 xpq_queue_pin_l2_table(xpmap_ptom_masked(pdirpa)); 2806 #endif 2807 splx(s); 2808 #endif /* XENPV */ 2809 } 2810 2811 /* 2812 * pmap_pdp_fini: destructor for the PDPs. 2813 */ 2814 static void 2815 pmap_pdp_fini(pd_entry_t *pdir) 2816 { 2817 #ifdef XENPV 2818 paddr_t pdirpa = 0; /* XXX: GCC */ 2819 vaddr_t object = (vaddr_t)pdir; 2820 int i; 2821 int s = splvm(); 2822 pt_entry_t *pte; 2823 2824 for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) { 2825 /* fetch the physical address of the page directory. 
*/ 2826 (void) pmap_extract(pmap_kernel(), object, &pdirpa); 2827 /* unpin page table */ 2828 xpq_queue_unpin_table(xpmap_ptom_masked(pdirpa)); 2829 } 2830 object = (vaddr_t)pdir; 2831 for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) { 2832 /* Set page RW again */ 2833 pte = kvtopte(object); 2834 pmap_pte_set(pte, *pte | PTE_W); 2835 xen_bcast_invlpg((vaddr_t)object); 2836 } 2837 splx(s); 2838 #endif /* XENPV */ 2839 } 2840 2841 #ifdef PAE 2842 static void * 2843 pmap_pdp_alloc(struct pool *pp, int flags) 2844 { 2845 return (void *)uvm_km_alloc(kernel_map, 2846 PAGE_SIZE * PDP_SIZE, PAGE_SIZE * PDP_SIZE, 2847 ((flags & PR_WAITOK) ? 0 : UVM_KMF_NOWAIT | UVM_KMF_TRYLOCK) | 2848 UVM_KMF_WIRED); 2849 } 2850 2851 static void 2852 pmap_pdp_free(struct pool *pp, void *v) 2853 { 2854 uvm_km_free(kernel_map, (vaddr_t)v, PAGE_SIZE * PDP_SIZE, 2855 UVM_KMF_WIRED); 2856 } 2857 #endif /* PAE */ 2858 2859 /* 2860 * pmap_ctor: constructor for the pmap cache. 2861 */ 2862 static int 2863 pmap_ctor(void *arg, void *obj, int flags) 2864 { 2865 struct pmap *pmap = obj; 2866 pt_entry_t p; 2867 int i; 2868 2869 KASSERT((flags & PR_WAITOK) != 0); 2870 2871 mutex_init(&pmap->pm_lock, MUTEX_DEFAULT, IPL_NONE); 2872 rw_init(&pmap->pm_dummy_lock); 2873 kcpuset_create(&pmap->pm_cpus, true); 2874 kcpuset_create(&pmap->pm_kernel_cpus, true); 2875 #ifdef XENPV 2876 kcpuset_create(&pmap->pm_xen_ptp_cpus, true); 2877 #endif 2878 LIST_INIT(&pmap->pm_gc_ptp); 2879 pmap->pm_pve = NULL; 2880 LIST_INIT(&pmap->pm_pvp_full); 2881 LIST_INIT(&pmap->pm_pvp_part); 2882 LIST_INIT(&pmap->pm_pvp_empty); 2883 2884 /* allocate and init PDP */ 2885 pmap->pm_pdir = pool_get(&pmap_pdp_pool, PR_WAITOK); 2886 2887 for (;;) { 2888 pmap_pdp_init(pmap->pm_pdir); 2889 mutex_enter(&pmaps_lock); 2890 p = pmap->pm_pdir[PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1] - 1]; 2891 if (__predict_true(p != 0)) { 2892 break; 2893 } 2894 mutex_exit(&pmaps_lock); 2895 } 2896 2897 for (i = 0; i < PDP_SIZE; i++) 2898 pmap->pm_pdirpa[i] = 2899 pmap_pte2pa(pmap->pm_pdir[PDIR_SLOT_PTE + i]); 2900 2901 LIST_INSERT_HEAD(&pmaps, pmap, pm_list); 2902 mutex_exit(&pmaps_lock); 2903 2904 return 0; 2905 } 2906 2907 /* 2908 * pmap_ctor: destructor for the pmap cache. 2909 */ 2910 static void 2911 pmap_dtor(void *arg, void *obj) 2912 { 2913 struct pmap *pmap = obj; 2914 2915 mutex_enter(&pmaps_lock); 2916 LIST_REMOVE(pmap, pm_list); 2917 mutex_exit(&pmaps_lock); 2918 2919 pmap_pdp_fini(pmap->pm_pdir); 2920 pool_put(&pmap_pdp_pool, pmap->pm_pdir); 2921 mutex_destroy(&pmap->pm_lock); 2922 rw_destroy(&pmap->pm_dummy_lock); 2923 kcpuset_destroy(pmap->pm_cpus); 2924 kcpuset_destroy(pmap->pm_kernel_cpus); 2925 #ifdef XENPV 2926 kcpuset_destroy(pmap->pm_xen_ptp_cpus); 2927 #endif 2928 } 2929 2930 /* 2931 * pmap_create: create a pmap object. 
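 * Most of the expensive work (allocating and initialising the PDP and
 * linking the pmap onto the global pmaps list) is done by pmap_ctor()
 * when the pool cache constructs the object; this function only resets
 * the per-use fields such as the uvm_objects, statistics, optional
 * callbacks and LDT state.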
2932 */ 2933 struct pmap * 2934 pmap_create(void) 2935 { 2936 struct pmap *pmap; 2937 int i; 2938 2939 pmap = pool_cache_get(&pmap_cache, PR_WAITOK); 2940 2941 /* init uvm_object */ 2942 for (i = 0; i < PTP_LEVELS - 1; i++) { 2943 uvm_obj_init(&pmap->pm_obj[i], &pmap_pager, false, 1); 2944 uvm_obj_setlock(&pmap->pm_obj[i], &pmap->pm_dummy_lock); 2945 pmap->pm_ptphint[i] = NULL; 2946 } 2947 pmap->pm_stats.wired_count = 0; 2948 /* count the PDP allocd below */ 2949 pmap->pm_stats.resident_count = PDP_SIZE; 2950 #if !defined(__x86_64__) 2951 pmap->pm_hiexec = 0; 2952 #endif 2953 2954 /* Used by NVMM and Xen */ 2955 pmap->pm_enter = NULL; 2956 pmap->pm_extract = NULL; 2957 pmap->pm_remove = NULL; 2958 pmap->pm_sync_pv = NULL; 2959 pmap->pm_pp_remove_ent = NULL; 2960 pmap->pm_write_protect = NULL; 2961 pmap->pm_unwire = NULL; 2962 pmap->pm_tlb_flush = NULL; 2963 pmap->pm_data = NULL; 2964 2965 /* init the LDT */ 2966 pmap->pm_ldt = NULL; 2967 pmap->pm_ldt_sel = GSYSSEL(GLDT_SEL, SEL_KPL); 2968 2969 return pmap; 2970 } 2971 2972 /* 2973 * pmap_check_ptps: verify that none of the pmap's page table objects 2974 * have any pages allocated to them. 2975 */ 2976 static void 2977 pmap_check_ptps(struct pmap *pmap) 2978 { 2979 int i; 2980 2981 for (i = 0; i < PTP_LEVELS - 1; i++) { 2982 KASSERTMSG(pmap->pm_obj[i].uo_npages == 0, 2983 "pmap %p level %d still has %d pages", 2984 pmap, i, (int)pmap->pm_obj[i].uo_npages); 2985 } 2986 } 2987 2988 static void 2989 pmap_check_inuse(struct pmap *pmap) 2990 { 2991 #ifdef DEBUG 2992 CPU_INFO_ITERATOR cii; 2993 struct cpu_info *ci; 2994 2995 for (CPU_INFO_FOREACH(cii, ci)) { 2996 if (ci->ci_pmap == pmap) 2997 panic("destroying pmap being used"); 2998 #if defined(XENPV) && defined(__x86_64__) 2999 for (int i = 0; i < PDIR_SLOT_USERLIM; i++) { 3000 if (pmap->pm_pdir[i] != 0 && 3001 ci->ci_kpm_pdir[i] == pmap->pm_pdir[i]) { 3002 printf("pmap_destroy(%p) pmap_kernel %p " 3003 "curcpu %d cpu %d ci_pmap %p " 3004 "ci->ci_kpm_pdir[%d]=%" PRIx64 3005 " pmap->pm_pdir[%d]=%" PRIx64 "\n", 3006 pmap, pmap_kernel(), curcpu()->ci_index, 3007 ci->ci_index, ci->ci_pmap, 3008 i, ci->ci_kpm_pdir[i], 3009 i, pmap->pm_pdir[i]); 3010 panic("%s: used pmap", __func__); 3011 } 3012 } 3013 #endif 3014 } 3015 #endif /* DEBUG */ 3016 } 3017 3018 /* 3019 * pmap_destroy: drop reference count on pmap. free pmap if reference 3020 * count goes to zero. 3021 * 3022 * => we can be called from pmap_unmap_ptes() with a different, unrelated 3023 * pmap's lock held. be careful! 3024 */ 3025 void 3026 pmap_destroy(struct pmap *pmap) 3027 { 3028 int i; 3029 3030 /* 3031 * drop reference count and verify not in use. 3032 */ 3033 3034 if (atomic_dec_uint_nv(&pmap->pm_obj[0].uo_refs) > 0) { 3035 return; 3036 } 3037 pmap_check_inuse(pmap); 3038 3039 /* 3040 * handle any deferred frees. 3041 */ 3042 3043 mutex_enter(&pmap->pm_lock); 3044 if (pmap->pm_pve != NULL) { 3045 pmap_free_pv(pmap, pmap->pm_pve); 3046 pmap->pm_pve = NULL; 3047 } 3048 pmap_drain_pv(pmap); 3049 mutex_exit(&pmap->pm_lock); 3050 pmap_update(pmap); 3051 3052 /* 3053 * Reference count is zero, free pmap resources and then free pmap. 3054 */ 3055 3056 pmap_check_ptps(pmap); 3057 KASSERT(LIST_EMPTY(&pmap->pm_gc_ptp)); 3058 3059 #ifdef USER_LDT 3060 if (pmap->pm_ldt != NULL) { 3061 /* 3062 * No need to switch the LDT; this address space is gone, 3063 * nothing is using it. 3064 * 3065 * No need to lock the pmap for ldt_free (or anything else), 3066 * we're the last one to use it. 3067 */ 3068 /* XXXAD can't take cpu_lock here - fix soon. 
*/ 3069 mutex_enter(&cpu_lock); 3070 ldt_free(pmap->pm_ldt_sel); 3071 mutex_exit(&cpu_lock); 3072 uvm_km_free(kernel_map, (vaddr_t)pmap->pm_ldt, 3073 MAX_USERLDT_SIZE, UVM_KMF_WIRED); 3074 } 3075 #endif 3076 3077 for (i = 0; i < PTP_LEVELS - 1; i++) { 3078 uvm_obj_destroy(&pmap->pm_obj[i], false); 3079 } 3080 kcpuset_zero(pmap->pm_cpus); 3081 kcpuset_zero(pmap->pm_kernel_cpus); 3082 #ifdef XENPV 3083 kcpuset_zero(pmap->pm_xen_ptp_cpus); 3084 #endif 3085 3086 KASSERT(LIST_EMPTY(&pmap->pm_pvp_full)); 3087 KASSERT(LIST_EMPTY(&pmap->pm_pvp_part)); 3088 KASSERT(LIST_EMPTY(&pmap->pm_pvp_empty)); 3089 3090 pmap_check_ptps(pmap); 3091 if (__predict_false(pmap->pm_enter != NULL)) { 3092 /* XXX make this a different cache */ 3093 pool_cache_destruct_object(&pmap_cache, pmap); 3094 } else { 3095 pool_cache_put(&pmap_cache, pmap); 3096 } 3097 } 3098 3099 /* 3100 * pmap_zap_ptp: clear out an entire PTP without modifying PTEs 3101 * 3102 * => caller must hold pmap's lock 3103 * => PTP must be mapped into KVA 3104 * => must be called with kernel preemption disabled 3105 * => does as little work as possible 3106 */ 3107 static void 3108 pmap_zap_ptp(struct pmap *pmap, struct vm_page *ptp, pt_entry_t *pte, 3109 vaddr_t startva, vaddr_t blkendva) 3110 { 3111 #ifndef XENPV 3112 struct pv_entry *pve; 3113 struct vm_page *pg; 3114 struct pmap_page *pp; 3115 pt_entry_t opte; 3116 rb_tree_t *tree; 3117 vaddr_t va; 3118 int wired; 3119 uint8_t oattrs; 3120 u_int cnt; 3121 3122 KASSERT(mutex_owned(&pmap->pm_lock)); 3123 KASSERT(kpreempt_disabled()); 3124 KASSERT(pmap != pmap_kernel()); 3125 KASSERT(ptp->wire_count > 1); 3126 KASSERT(ptp->wire_count - 1 <= PAGE_SIZE / sizeof(pt_entry_t)); 3127 3128 /* 3129 * Start at the lowest entered VA, and scan until there are no more 3130 * PTEs in the PTPs. 3131 */ 3132 tree = &VM_PAGE_TO_PP(ptp)->pp_rb; 3133 pve = RB_TREE_MIN(tree); 3134 wired = 0; 3135 va = (vaddr_t)ptp->uanon; 3136 pte += ((va - startva) >> PAGE_SHIFT); 3137 3138 for (cnt = ptp->wire_count; cnt > 1; pte++, va += PAGE_SIZE) { 3139 /* 3140 * No need for an atomic to clear the PTE. Nothing else can 3141 * see the address space any more and speculative access (if 3142 * possible) won't modify. Therefore there's no need to 3143 * track the accessed/dirty bits. 3144 */ 3145 opte = *pte; 3146 if (!pmap_valid_entry(opte)) { 3147 continue; 3148 } 3149 3150 /* 3151 * Count the PTE. If it's not for a managed mapping 3152 * there's noting more to do. 3153 */ 3154 cnt--; 3155 wired -= (opte & PTE_WIRED); 3156 if ((opte & PTE_PVLIST) == 0) { 3157 #ifndef DOM0OPS 3158 KASSERTMSG((PHYS_TO_VM_PAGE(pmap_pte2pa(opte)) == NULL), 3159 "managed page without PTE_PVLIST for %#" 3160 PRIxVADDR, va); 3161 KASSERTMSG((pmap_pv_tracked(pmap_pte2pa(opte)) == NULL), 3162 "pv-tracked page without PTE_PVLIST for %#" 3163 PRIxVADDR, va); 3164 #endif 3165 KASSERT(pmap_treelookup_pv(pmap, ptp, (ptp != NULL ? 3166 &VM_PAGE_TO_PP(ptp)->pp_rb : &pmap_kernel_rb), 3167 va) == NULL); 3168 continue; 3169 } 3170 3171 /* 3172 * "pve" now points to the lowest (by VA) dynamic PV entry 3173 * in the PTP. If it's for this VA, take advantage of it to 3174 * avoid calling PHYS_TO_VM_PAGE(). Avoid modifying the RB 3175 * tree by skipping to the next VA in the tree whenever 3176 * there is a match here. The tree will be cleared out in 3177 * one pass before return to pmap_remove_all(). 
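 * Freed entries are not handed back to the pool until pmap_drain_pv()
 * runs, so continuing the traversal with RB_TREE_NEXT() on an entry
 * that has just been passed to pmap_free_pv() is still safe.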
3178 */ 3179 oattrs = pmap_pte_to_pp_attrs(opte); 3180 if (pve != NULL && pve->pve_pte.pte_va == va) { 3181 pp = pve->pve_pp; 3182 KASSERT(pve->pve_pte.pte_ptp == ptp); 3183 KASSERT(pp->pp_pte.pte_ptp != ptp || 3184 pp->pp_pte.pte_va != va); 3185 mutex_spin_enter(&pp->pp_lock); 3186 pp->pp_attrs |= oattrs; 3187 LIST_REMOVE(pve, pve_list); 3188 mutex_spin_exit(&pp->pp_lock); 3189 3190 /* 3191 * pve won't be touched again until pmap_drain_pv(), 3192 * so it's still safe to traverse the tree. 3193 */ 3194 pmap_free_pv(pmap, pve); 3195 pve = RB_TREE_NEXT(tree, pve); 3196 continue; 3197 } 3198 3199 /* 3200 * No entry in the tree so it must be embedded. Look up the 3201 * page and cancel the embedded entry. 3202 */ 3203 if ((pg = PHYS_TO_VM_PAGE(pmap_pte2pa(opte))) != NULL) { 3204 pp = VM_PAGE_TO_PP(pg); 3205 } else if ((pp = pmap_pv_tracked(pmap_pte2pa(opte))) == NULL) { 3206 paddr_t pa = pmap_pte2pa(opte); 3207 panic("%s: PTE_PVLIST with pv-untracked page" 3208 " va = %#"PRIxVADDR"pa = %#"PRIxPADDR 3209 "(%#"PRIxPADDR")", __func__, va, pa, atop(pa)); 3210 } 3211 mutex_spin_enter(&pp->pp_lock); 3212 KASSERT(pp->pp_pte.pte_ptp == ptp); 3213 KASSERT(pp->pp_pte.pte_va == va); 3214 pp->pp_attrs |= oattrs; 3215 pp->pp_pte.pte_ptp = NULL; 3216 pp->pp_pte.pte_va = 0; 3217 mutex_spin_exit(&pp->pp_lock); 3218 } 3219 3220 /* PTP now empty - adjust the tree & stats to match. */ 3221 pmap_stats_update(pmap, -(ptp->wire_count - 1), wired / PTE_WIRED); 3222 ptp->wire_count = 1; 3223 #ifdef DIAGNOSTIC 3224 rb_tree_init(tree, &pmap_rbtree_ops); 3225 #endif 3226 #else /* !XENPV */ 3227 /* 3228 * XXXAD For XEN, it's not clear to me that we can do this, because 3229 * I guess the hypervisor keeps track of PTEs too. 3230 */ 3231 pmap_remove_ptes(pmap, ptp, (vaddr_t)pte, startva, blkendva); 3232 #endif /* !XENPV */ 3233 } 3234 3235 /* 3236 * pmap_remove_all: remove all mappings from pmap in bulk. 3237 * 3238 * Ordinarily when removing mappings it's important to hold the UVM object's 3239 * lock, so that pages do not gain a new identity while retaining stale TLB 3240 * entries (the same lock hold covers both pmap_remove() and pmap_update()). 3241 * Here it's known that the address space is no longer visible to any user 3242 * process, so we don't need to worry about that. 3243 */ 3244 bool 3245 pmap_remove_all(struct pmap *pmap) 3246 { 3247 struct vm_page *ptps[32]; 3248 vaddr_t va, blkendva; 3249 struct pmap *pmap2; 3250 pt_entry_t *ptes; 3251 pd_entry_t pde __diagused; 3252 pd_entry_t * const *pdes; 3253 int lvl __diagused, i, n; 3254 3255 /* XXX Can't handle EPT just yet. */ 3256 if (pmap->pm_remove != NULL) { 3257 return false; 3258 } 3259 3260 for (;;) { 3261 /* Fetch a block of PTPs from tree. */ 3262 mutex_enter(&pmap->pm_lock); 3263 n = radix_tree_gang_lookup_node(&pmap->pm_obj[0].uo_pages, 0, 3264 (void **)ptps, __arraycount(ptps), false); 3265 if (n == 0) { 3266 mutex_exit(&pmap->pm_lock); 3267 break; 3268 } 3269 3270 /* Remove all mappings in the set of PTPs. */ 3271 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); 3272 for (i = 0; i < n; i++) { 3273 if (ptps[i]->wire_count == 0) { 3274 /* It's dead: pmap_update() will expunge. */ 3275 continue; 3276 } 3277 3278 /* Determine range of block. */ 3279 va = ptps[i]->offset * PAGE_SIZE / sizeof(pt_entry_t); 3280 blkendva = x86_round_pdr(va + 1); 3281 3282 /* Make sure everything squares up... */ 3283 KASSERT(pmap_pdes_valid(va, pdes, &pde, &lvl)); 3284 KASSERT(lvl == 1); 3285 KASSERT(pmap_find_ptp(pmap, va, 1) == ptps[i]); 3286 3287 /* Zap! 
*/ 3288 pmap_zap_ptp(pmap, ptps[i], &ptes[pl1_i(va)], va, 3289 blkendva); 3290 3291 /* PTP should now be unused - free it. */ 3292 KASSERT(ptps[i]->wire_count == 1); 3293 pmap_free_ptp(pmap, ptps[i], va, ptes, pdes); 3294 } 3295 pmap_unmap_ptes(pmap, pmap2); 3296 pmap_drain_pv(pmap); 3297 pmap_tlb_shootdown(pmap, -1L, 0, TLBSHOOT_REMOVE_ALL); 3298 mutex_exit(&pmap->pm_lock); 3299 3300 /* Process deferred frees. */ 3301 pmap_update(pmap); 3302 3303 /* A breathing point. */ 3304 preempt_point(); 3305 } 3306 3307 /* Verify that the pmap is now completely empty. */ 3308 pmap_check_ptps(pmap); 3309 KASSERTMSG(pmap->pm_stats.resident_count == PDP_SIZE, 3310 "pmap %p not empty", pmap); 3311 3312 return true; 3313 } 3314 3315 #if defined(PMAP_FORK) 3316 /* 3317 * pmap_fork: perform any necessary data structure manipulation when 3318 * a VM space is forked. 3319 */ 3320 void 3321 pmap_fork(struct pmap *pmap1, struct pmap *pmap2) 3322 { 3323 #ifdef USER_LDT 3324 union descriptor *new_ldt; 3325 int sel; 3326 3327 if (__predict_true(pmap1->pm_ldt == NULL)) { 3328 return; 3329 } 3330 3331 /* 3332 * Copy the LDT into the new process. 3333 * 3334 * Read pmap1's ldt pointer unlocked; if it changes behind our back 3335 * we'll retry. This will starve if there's a stream of LDT changes 3336 * in another thread but that should not happen. 3337 */ 3338 3339 retry: 3340 if (pmap1->pm_ldt != NULL) { 3341 /* Allocate space for the new process's LDT */ 3342 new_ldt = (union descriptor *)uvm_km_alloc(kernel_map, 3343 MAX_USERLDT_SIZE, 0, UVM_KMF_WIRED); 3344 if (new_ldt == NULL) { 3345 printf("WARNING: %s: unable to allocate LDT space\n", 3346 __func__); 3347 return; 3348 } 3349 mutex_enter(&cpu_lock); 3350 /* Get a GDT slot for it */ 3351 sel = ldt_alloc(new_ldt, MAX_USERLDT_SIZE); 3352 if (sel == -1) { 3353 mutex_exit(&cpu_lock); 3354 uvm_km_free(kernel_map, (vaddr_t)new_ldt, 3355 MAX_USERLDT_SIZE, UVM_KMF_WIRED); 3356 printf("WARNING: %s: unable to allocate LDT selector\n", 3357 __func__); 3358 return; 3359 } 3360 } else { 3361 /* Wasn't anything there after all. */ 3362 new_ldt = NULL; 3363 sel = -1; 3364 mutex_enter(&cpu_lock); 3365 } 3366 3367 /* 3368 * Now that we have cpu_lock, ensure the LDT status is the same. 3369 */ 3370 if (pmap1->pm_ldt != NULL) { 3371 if (new_ldt == NULL) { 3372 /* A wild LDT just appeared. */ 3373 mutex_exit(&cpu_lock); 3374 goto retry; 3375 } 3376 3377 /* Copy the LDT data and install it in pmap2 */ 3378 memcpy(new_ldt, pmap1->pm_ldt, MAX_USERLDT_SIZE); 3379 pmap2->pm_ldt = new_ldt; 3380 pmap2->pm_ldt_sel = sel; 3381 mutex_exit(&cpu_lock); 3382 } else { 3383 if (new_ldt != NULL) { 3384 /* The LDT disappeared, drop what we did. */ 3385 ldt_free(sel); 3386 mutex_exit(&cpu_lock); 3387 uvm_km_free(kernel_map, (vaddr_t)new_ldt, 3388 MAX_USERLDT_SIZE, UVM_KMF_WIRED); 3389 return; 3390 } 3391 3392 /* We're good, just leave. */ 3393 mutex_exit(&cpu_lock); 3394 } 3395 #endif /* USER_LDT */ 3396 } 3397 #endif /* PMAP_FORK */ 3398 3399 #ifdef USER_LDT 3400 3401 /* 3402 * pmap_ldt_xcall: cross call used by pmap_ldt_sync. if the named pmap 3403 * is active, reload LDTR. 3404 */ 3405 static void 3406 pmap_ldt_xcall(void *arg1, void *arg2) 3407 { 3408 struct pmap *pm; 3409 3410 kpreempt_disable(); 3411 pm = arg1; 3412 if (curcpu()->ci_pmap == pm) { 3413 #if defined(SVS) 3414 if (svs_enabled) { 3415 svs_ldt_sync(pm); 3416 } else 3417 #endif 3418 lldt(pm->pm_ldt_sel); 3419 } 3420 kpreempt_enable(); 3421 } 3422 3423 /* 3424 * pmap_ldt_sync: LDT selector for the named pmap is changing. 
swap 3425 * in the new selector on all CPUs. 3426 */ 3427 void 3428 pmap_ldt_sync(struct pmap *pm) 3429 { 3430 uint64_t where; 3431 3432 KASSERT(mutex_owned(&cpu_lock)); 3433 3434 pmap_ldt_evcnt.ev_count++; 3435 where = xc_broadcast(0, pmap_ldt_xcall, pm, NULL); 3436 xc_wait(where); 3437 } 3438 3439 /* 3440 * pmap_ldt_cleanup: if the pmap has a local LDT, deallocate it, and 3441 * restore the default. 3442 */ 3443 void 3444 pmap_ldt_cleanup(struct lwp *l) 3445 { 3446 pmap_t pmap = l->l_proc->p_vmspace->vm_map.pmap; 3447 union descriptor *ldt; 3448 int sel; 3449 3450 if (__predict_true(pmap->pm_ldt == NULL)) { 3451 return; 3452 } 3453 3454 mutex_enter(&cpu_lock); 3455 if (pmap->pm_ldt != NULL) { 3456 sel = pmap->pm_ldt_sel; 3457 ldt = pmap->pm_ldt; 3458 pmap->pm_ldt_sel = GSYSSEL(GLDT_SEL, SEL_KPL); 3459 pmap->pm_ldt = NULL; 3460 pmap_ldt_sync(pmap); 3461 ldt_free(sel); 3462 uvm_km_free(kernel_map, (vaddr_t)ldt, MAX_USERLDT_SIZE, 3463 UVM_KMF_WIRED); 3464 } 3465 mutex_exit(&cpu_lock); 3466 } 3467 #endif /* USER_LDT */ 3468 3469 /* 3470 * pmap_activate: activate a process' pmap 3471 * 3472 * => must be called with kernel preemption disabled 3473 * => if lwp is the curlwp, then set ci_want_pmapload so that 3474 * actual MMU context switch will be done by pmap_load() later 3475 */ 3476 void 3477 pmap_activate(struct lwp *l) 3478 { 3479 struct cpu_info *ci; 3480 struct pmap *pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map); 3481 3482 KASSERT(kpreempt_disabled()); 3483 3484 ci = curcpu(); 3485 3486 if (l != ci->ci_curlwp) 3487 return; 3488 3489 KASSERT(ci->ci_want_pmapload == 0); 3490 KASSERT(ci->ci_tlbstate != TLBSTATE_VALID); 3491 3492 /* 3493 * no need to switch to kernel vmspace because 3494 * it's a subset of any vmspace. 3495 */ 3496 3497 if (pmap == pmap_kernel()) { 3498 ci->ci_want_pmapload = 0; 3499 return; 3500 } 3501 3502 ci->ci_want_pmapload = 1; 3503 } 3504 3505 #if defined(XENPV) && defined(__x86_64__) 3506 #define KASSERT_PDIRPA(pmap) \ 3507 KASSERT(pmap_pdirpa(pmap, 0) == ci->ci_xen_current_user_pgd || \ 3508 pmap == pmap_kernel()) 3509 #elif defined(PAE) 3510 #define KASSERT_PDIRPA(pmap) \ 3511 KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(ci->ci_pae_l3_pdir[0])) 3512 #elif !defined(XENPV) 3513 #define KASSERT_PDIRPA(pmap) \ 3514 KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(rcr3())) 3515 #else 3516 #define KASSERT_PDIRPA(pmap) KASSERT(true) /* nothing to do */ 3517 #endif 3518 3519 /* 3520 * pmap_reactivate: try to regain reference to the pmap. 3521 * 3522 * => Must be called with kernel preemption disabled. 3523 */ 3524 static void 3525 pmap_reactivate(struct pmap *pmap) 3526 { 3527 struct cpu_info * const ci = curcpu(); 3528 const cpuid_t cid = cpu_index(ci); 3529 3530 KASSERT(kpreempt_disabled()); 3531 KASSERT_PDIRPA(pmap); 3532 3533 /* 3534 * If we still have a lazy reference to this pmap, we can assume 3535 * that there was no TLB shootdown for this pmap in the meantime. 3536 * 3537 * The order of events here is important as we must synchronize 3538 * with TLB shootdown interrupts. Declare interest in invalidations 3539 * (TLBSTATE_VALID) and then check the CPU set, which the IPIs can 3540 * change only when the state is TLBSTATE_LAZY. 3541 */ 3542 3543 ci->ci_tlbstate = TLBSTATE_VALID; 3544 KASSERT(kcpuset_isset(pmap->pm_kernel_cpus, cid)); 3545 3546 if (__predict_true(kcpuset_isset(pmap->pm_cpus, cid))) { 3547 /* We have the reference, state is valid. */ 3548 } else { 3549 /* 3550 * Must reload the TLB, pmap has been changed during 3551 * deactivated. 
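 * (i.e. a TLB shootdown removed this CPU from pm_cpus while the pmap
 * was in TLBSTATE_LAZY, so the local TLB may hold stale entries and
 * must be flushed).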
3552 */ 3553 kcpuset_atomic_set(pmap->pm_cpus, cid); 3554 3555 tlbflush(); 3556 } 3557 } 3558 3559 /* 3560 * pmap_load: perform the actual pmap switch, i.e. fill in %cr3 register 3561 * and relevant LDT info. 3562 * 3563 * Ensures that the current process' pmap is loaded on the current CPU's 3564 * MMU and that there are no stale TLB entries. 3565 * 3566 * => The caller should disable kernel preemption or do check-and-retry 3567 * to prevent a preemption from undoing our efforts. 3568 * => This function may block. 3569 */ 3570 void 3571 pmap_load(void) 3572 { 3573 struct cpu_info *ci; 3574 struct pmap *pmap, *oldpmap; 3575 struct lwp *l; 3576 uint64_t pctr; 3577 int ilevel __diagused; 3578 u_long psl __diagused; 3579 3580 kpreempt_disable(); 3581 retry: 3582 ci = curcpu(); 3583 if (!ci->ci_want_pmapload) { 3584 kpreempt_enable(); 3585 return; 3586 } 3587 l = ci->ci_curlwp; 3588 pctr = lwp_pctr(); 3589 __insn_barrier(); 3590 3591 /* should be able to take ipis. */ 3592 KASSERTMSG((ilevel = ci->ci_ilevel) < IPL_HIGH, "ilevel=%d", ilevel); 3593 #ifdef XENPV 3594 /* Check to see if interrupts are enabled (ie; no events are masked) */ 3595 KASSERTMSG((psl = x86_read_psl()) == 0, "psl=0x%lx", psl); 3596 #else 3597 KASSERTMSG(((psl = x86_read_psl()) & PSL_I) != 0, "psl=0x%lx", psl); 3598 #endif 3599 3600 KASSERT(l != NULL); 3601 pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map); 3602 KASSERT(pmap != pmap_kernel()); 3603 oldpmap = ci->ci_pmap; 3604 3605 if (pmap == oldpmap) { 3606 pmap_reactivate(pmap); 3607 ci->ci_want_pmapload = 0; 3608 kpreempt_enable(); 3609 return; 3610 } 3611 3612 /* 3613 * Acquire a reference to the new pmap and perform the switch. 3614 */ 3615 3616 pmap_reference(pmap); 3617 pmap_load1(l, pmap, oldpmap); 3618 ci->ci_want_pmapload = 0; 3619 3620 /* 3621 * we're now running with the new pmap. drop the reference 3622 * to the old pmap. if we block, we need to go around again. 3623 */ 3624 3625 pmap_destroy(oldpmap); 3626 __insn_barrier(); 3627 if (lwp_pctr() != pctr) { 3628 goto retry; 3629 } 3630 3631 kpreempt_enable(); 3632 } 3633 3634 /* 3635 * pmap_load1: the guts of pmap load, shared by pmap_map_ptes() and 3636 * pmap_load(). It's critically important that this function does not 3637 * block. 3638 */ 3639 static void 3640 pmap_load1(struct lwp *l, struct pmap *pmap, struct pmap *oldpmap) 3641 { 3642 struct cpu_info *ci; 3643 struct pcb *pcb; 3644 cpuid_t cid; 3645 3646 KASSERT(kpreempt_disabled()); 3647 3648 pcb = lwp_getpcb(l); 3649 ci = l->l_cpu; 3650 cid = cpu_index(ci); 3651 3652 kcpuset_atomic_clear(oldpmap->pm_cpus, cid); 3653 kcpuset_atomic_clear(oldpmap->pm_kernel_cpus, cid); 3654 3655 KASSERT_PDIRPA(oldpmap); 3656 KASSERT(!kcpuset_isset(pmap->pm_cpus, cid)); 3657 KASSERT(!kcpuset_isset(pmap->pm_kernel_cpus, cid)); 3658 3659 /* 3660 * Mark the pmap in use by this CPU. Again, we must synchronize 3661 * with TLB shootdown interrupts, so set the state VALID first, 3662 * then register us for shootdown events on this pmap. 3663 */ 3664 ci->ci_tlbstate = TLBSTATE_VALID; 3665 kcpuset_atomic_set(pmap->pm_cpus, cid); 3666 kcpuset_atomic_set(pmap->pm_kernel_cpus, cid); 3667 ci->ci_pmap = pmap; 3668 3669 /* 3670 * update tss. now that we have registered for invalidations 3671 * from other CPUs, we're good to load the page tables. 
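 * The ordering above matters: ci_tlbstate is set to TLBSTATE_VALID and
 * the CPU is added to pm_cpus/pm_kernel_cpus before %cr3 is loaded, so
 * a shootdown raised in that window is not missed.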
3672 */ 3673 #ifdef PAE 3674 pcb->pcb_cr3 = ci->ci_pae_l3_pdirpa; 3675 #else 3676 pcb->pcb_cr3 = pmap_pdirpa(pmap, 0); 3677 #endif 3678 3679 #ifdef i386 3680 #ifndef XENPV 3681 ci->ci_tss->tss.tss_ldt = pmap->pm_ldt_sel; 3682 ci->ci_tss->tss.tss_cr3 = pcb->pcb_cr3; 3683 #endif 3684 #endif 3685 3686 #if defined(SVS) && defined(USER_LDT) 3687 if (svs_enabled) { 3688 svs_ldt_sync(pmap); 3689 } else 3690 #endif 3691 lldt(pmap->pm_ldt_sel); 3692 3693 cpu_load_pmap(pmap, oldpmap); 3694 } 3695 3696 /* 3697 * pmap_deactivate: deactivate a process' pmap. 3698 * 3699 * => Must be called with kernel preemption disabled (high IPL is enough). 3700 */ 3701 void 3702 pmap_deactivate(struct lwp *l) 3703 { 3704 struct pmap *pmap; 3705 struct cpu_info *ci; 3706 3707 KASSERT(kpreempt_disabled()); 3708 3709 if (l != curlwp) { 3710 return; 3711 } 3712 3713 /* 3714 * Wait for pending TLB shootdowns to complete. Necessary because 3715 * TLB shootdown state is per-CPU, and the LWP may be coming off 3716 * the CPU before it has a chance to call pmap_update(), e.g. due 3717 * to kernel preemption or blocking routine in between. 3718 */ 3719 pmap_tlb_shootnow(); 3720 3721 ci = curcpu(); 3722 3723 if (ci->ci_want_pmapload) { 3724 /* 3725 * ci_want_pmapload means that our pmap is not loaded on 3726 * the CPU or TLB might be stale. note that pmap_kernel() 3727 * is always considered loaded. 3728 */ 3729 KASSERT(vm_map_pmap(&l->l_proc->p_vmspace->vm_map) 3730 != pmap_kernel()); 3731 KASSERT(vm_map_pmap(&l->l_proc->p_vmspace->vm_map) 3732 != ci->ci_pmap || ci->ci_tlbstate != TLBSTATE_VALID); 3733 3734 /* 3735 * userspace has not been touched. 3736 * nothing to do here. 3737 */ 3738 3739 ci->ci_want_pmapload = 0; 3740 return; 3741 } 3742 3743 pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map); 3744 3745 if (pmap == pmap_kernel()) { 3746 return; 3747 } 3748 3749 KASSERT_PDIRPA(pmap); 3750 KASSERT(ci->ci_pmap == pmap); 3751 3752 /* 3753 * we aren't interested in TLB invalidations for this pmap, 3754 * at least for the time being. 3755 */ 3756 3757 KASSERT(ci->ci_tlbstate == TLBSTATE_VALID); 3758 ci->ci_tlbstate = TLBSTATE_LAZY; 3759 } 3760 3761 #ifdef EFI_RUNTIME 3762 3763 extern struct pmap *efi_runtime_pmap; 3764 3765 /* 3766 * pmap_is_user: true if pmap, which must not be the kernel pmap, is 3767 * for an unprivileged user process 3768 */ 3769 bool 3770 pmap_is_user(struct pmap *pmap) 3771 { 3772 3773 KASSERT(pmap != pmap_kernel()); 3774 return (pmap != efi_runtime_pmap); 3775 } 3776 3777 /* 3778 * pmap_activate_sync: synchronously activate specified pmap. 3779 * 3780 * => Must be called with kernel preemption disabled (high IPL is enough). 3781 * => Must not sleep before pmap_deactivate_sync. 
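 *
 * An illustrative call sequence, assuming the EFI runtime glue has set
 * up efi_runtime_pmap (declared above); this is a sketch, not a quote
 * of the actual wrapper code:
 *
 *	void *cookie;
 *
 *	kpreempt_disable();
 *	cookie = pmap_activate_sync(efi_runtime_pmap);
 *	... call the firmware's runtime service ...
 *	pmap_deactivate_sync(efi_runtime_pmap, cookie);
 *	kpreempt_enable();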
3782 */ 3783 void * 3784 pmap_activate_sync(struct pmap *pmap) 3785 { 3786 struct cpu_info *ci = curcpu(); 3787 struct pmap *oldpmap = ci->ci_pmap; 3788 unsigned cid = cpu_index(ci); 3789 3790 KASSERT(kpreempt_disabled()); 3791 KASSERT(pmap != pmap_kernel()); 3792 3793 KASSERT(!kcpuset_isset(pmap->pm_cpus, cid)); 3794 KASSERT(!kcpuset_isset(pmap->pm_kernel_cpus, cid)); 3795 3796 if (oldpmap) { 3797 KASSERT_PDIRPA(oldpmap); 3798 kcpuset_atomic_clear(oldpmap->pm_cpus, cid); 3799 kcpuset_atomic_clear(oldpmap->pm_kernel_cpus, cid); 3800 } 3801 3802 ci->ci_tlbstate = TLBSTATE_VALID; 3803 kcpuset_atomic_set(pmap->pm_cpus, cid); 3804 kcpuset_atomic_set(pmap->pm_kernel_cpus, cid); 3805 ci->ci_pmap = pmap; 3806 3807 #if defined(SVS) && defined(USER_LDT) 3808 if (svs_enabled) { 3809 svs_ldt_sync(pmap); 3810 } else 3811 #endif 3812 lldt(pmap->pm_ldt_sel); 3813 3814 cpu_load_pmap(pmap, oldpmap); 3815 3816 return oldpmap; 3817 } 3818 3819 /* 3820 * pmap_deactivate_sync: synchronously deactivate specified pmap and 3821 * restore whatever was active before pmap_activate_sync. 3822 * 3823 * => Must be called with kernel preemption disabled (high IPL is enough). 3824 * => Must not have slept since pmap_activate_sync. 3825 */ 3826 void 3827 pmap_deactivate_sync(struct pmap *pmap, void *cookie) 3828 { 3829 struct cpu_info *ci = curcpu(); 3830 struct pmap *oldpmap = cookie; 3831 unsigned cid = cpu_index(ci); 3832 3833 KASSERT(kpreempt_disabled()); 3834 KASSERT(pmap != pmap_kernel()); 3835 KASSERT(ci->ci_pmap == pmap); 3836 3837 KASSERT_PDIRPA(pmap); 3838 3839 KASSERT(kcpuset_isset(pmap->pm_cpus, cid)); 3840 KASSERT(kcpuset_isset(pmap->pm_kernel_cpus, cid)); 3841 3842 pmap_tlb_shootnow(); 3843 3844 kcpuset_atomic_clear(pmap->pm_cpus, cid); 3845 kcpuset_atomic_clear(pmap->pm_kernel_cpus, cid); 3846 3847 ci->ci_tlbstate = TLBSTATE_VALID; 3848 ci->ci_pmap = oldpmap; 3849 if (oldpmap) { 3850 kcpuset_atomic_set(oldpmap->pm_cpus, cid); 3851 kcpuset_atomic_set(oldpmap->pm_kernel_cpus, cid); 3852 #if defined(SVS) && defined(USER_LDT) 3853 if (svs_enabled) { 3854 svs_ldt_sync(oldpmap); 3855 } else 3856 #endif 3857 lldt(oldpmap->pm_ldt_sel); 3858 cpu_load_pmap(oldpmap, pmap); 3859 } else { 3860 lcr3(pmap_pdirpa(pmap_kernel(), 0)); 3861 } 3862 } 3863 3864 #endif /* EFI_RUNTIME */ 3865 3866 /* 3867 * some misc. 
functions 3868 */ 3869 3870 bool 3871 pmap_pdes_valid(vaddr_t va, pd_entry_t * const *pdes, pd_entry_t *lastpde, 3872 int *lastlvl) 3873 { 3874 unsigned long index; 3875 pd_entry_t pde; 3876 int i; 3877 3878 for (i = PTP_LEVELS; i > 1; i--) { 3879 index = pl_i(va, i); 3880 pde = pdes[i - 2][index]; 3881 if ((pde & PTE_P) == 0) { 3882 *lastlvl = i; 3883 return false; 3884 } 3885 if (pde & PTE_PS) 3886 break; 3887 } 3888 if (lastpde != NULL) 3889 *lastpde = pde; 3890 *lastlvl = i; 3891 return true; 3892 } 3893 3894 /* 3895 * pmap_extract: extract a PA for the given VA 3896 */ 3897 bool 3898 pmap_extract(struct pmap *pmap, vaddr_t va, paddr_t *pap) 3899 { 3900 pt_entry_t *ptes, pte; 3901 pd_entry_t pde; 3902 pd_entry_t * const *pdes; 3903 struct pmap *pmap2; 3904 paddr_t pa; 3905 bool rv; 3906 int lvl; 3907 3908 if (__predict_false(pmap->pm_extract != NULL)) { 3909 return (*pmap->pm_extract)(pmap, va, pap); 3910 } 3911 3912 #ifdef __HAVE_DIRECT_MAP 3913 if (va >= PMAP_DIRECT_BASE && va < PMAP_DIRECT_END) { 3914 if (pap != NULL) { 3915 *pap = PMAP_DIRECT_UNMAP(va); 3916 } 3917 return true; 3918 } 3919 #endif 3920 3921 rv = false; 3922 pa = 0; 3923 3924 if (pmap != pmap_kernel()) { 3925 mutex_enter(&pmap->pm_lock); 3926 } 3927 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); 3928 if (pmap_pdes_valid(va, pdes, &pde, &lvl)) { 3929 if (lvl == 2) { 3930 pa = (pde & PTE_LGFRAME) | (va & (NBPD_L2 - 1)); 3931 rv = true; 3932 } else { 3933 KASSERT(lvl == 1); 3934 pte = ptes[pl1_i(va)]; 3935 if (__predict_true((pte & PTE_P) != 0)) { 3936 pa = pmap_pte2pa(pte) | (va & (NBPD_L1 - 1)); 3937 rv = true; 3938 } 3939 } 3940 } 3941 pmap_unmap_ptes(pmap, pmap2); 3942 if (pmap != pmap_kernel()) { 3943 mutex_exit(&pmap->pm_lock); 3944 } 3945 if (pap != NULL) { 3946 *pap = pa; 3947 } 3948 3949 return rv; 3950 } 3951 3952 /* 3953 * vtophys: virtual address to physical address. For use by 3954 * machine-dependent code only. 3955 */ 3956 paddr_t 3957 vtophys(vaddr_t va) 3958 { 3959 paddr_t pa; 3960 3961 if (pmap_extract(pmap_kernel(), va, &pa) == true) 3962 return pa; 3963 return 0; 3964 } 3965 3966 __strict_weak_alias(pmap_extract_ma, pmap_extract); 3967 3968 #ifdef XENPV 3969 /* 3970 * vtomach: virtual address to machine address. For use by 3971 * machine-dependent code only. 3972 */ 3973 paddr_t 3974 vtomach(vaddr_t va) 3975 { 3976 paddr_t pa; 3977 3978 if (pmap_extract_ma(pmap_kernel(), va, &pa) == true) 3979 return pa; 3980 return 0; 3981 } 3982 #endif 3983 3984 /* 3985 * pmap_virtual_space: used during bootup [pmap_steal_memory] to 3986 * determine the bounds of the kernel virtual address space. 
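 * It simply reports the [virtual_avail, virtual_end) range that was
 * established earlier by the bootstrap code.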
3987 */ 3988 void 3989 pmap_virtual_space(vaddr_t *startp, vaddr_t *endp) 3990 { 3991 *startp = virtual_avail; 3992 *endp = virtual_end; 3993 } 3994 3995 void 3996 pmap_zero_page(paddr_t pa) 3997 { 3998 #if defined(__HAVE_DIRECT_MAP) 3999 memset(PAGE_ALIGNED(PMAP_DIRECT_MAP(pa)), 0, PAGE_SIZE); 4000 #else 4001 #if defined(XENPV) 4002 if (XEN_VERSION_SUPPORTED(3, 4)) { 4003 xen_pagezero(pa); 4004 return; 4005 } 4006 #endif 4007 struct cpu_info *ci; 4008 pt_entry_t *zpte; 4009 vaddr_t zerova; 4010 4011 const pd_entry_t pteflags = PTE_P | PTE_W | pmap_pg_nx | PTE_D | PTE_A; 4012 4013 kpreempt_disable(); 4014 4015 ci = curcpu(); 4016 zerova = ci->vpage[VPAGE_ZER]; 4017 zpte = ci->vpage_pte[VPAGE_ZER]; 4018 4019 KASSERTMSG(!*zpte, "pmap_zero_page: lock botch"); 4020 4021 pmap_pte_set(zpte, pmap_pa2pte(pa) | pteflags); 4022 pmap_pte_flush(); 4023 pmap_update_pg(zerova); /* flush TLB */ 4024 4025 memset(PAGE_ALIGNED(zerova), 0, PAGE_SIZE); 4026 4027 #if defined(DIAGNOSTIC) || defined(XENPV) 4028 pmap_pte_set(zpte, 0); /* zap ! */ 4029 pmap_pte_flush(); 4030 #endif 4031 4032 kpreempt_enable(); 4033 #endif /* defined(__HAVE_DIRECT_MAP) */ 4034 } 4035 4036 void 4037 pmap_copy_page(paddr_t srcpa, paddr_t dstpa) 4038 { 4039 #if defined(__HAVE_DIRECT_MAP) 4040 vaddr_t srcva = PMAP_DIRECT_MAP(srcpa); 4041 vaddr_t dstva = PMAP_DIRECT_MAP(dstpa); 4042 4043 memcpy(PAGE_ALIGNED(dstva), PAGE_ALIGNED(srcva), PAGE_SIZE); 4044 #else 4045 #if defined(XENPV) 4046 if (XEN_VERSION_SUPPORTED(3, 4)) { 4047 xen_copy_page(srcpa, dstpa); 4048 return; 4049 } 4050 #endif 4051 struct cpu_info *ci; 4052 pt_entry_t *srcpte, *dstpte; 4053 vaddr_t srcva, dstva; 4054 4055 const pd_entry_t pteflags = PTE_P | PTE_W | pmap_pg_nx | PTE_A; 4056 4057 kpreempt_disable(); 4058 4059 ci = curcpu(); 4060 srcva = ci->vpage[VPAGE_SRC]; 4061 dstva = ci->vpage[VPAGE_DST]; 4062 srcpte = ci->vpage_pte[VPAGE_SRC]; 4063 dstpte = ci->vpage_pte[VPAGE_DST]; 4064 4065 KASSERT(*srcpte == 0 && *dstpte == 0); 4066 4067 pmap_pte_set(srcpte, pmap_pa2pte(srcpa) | pteflags); 4068 pmap_pte_set(dstpte, pmap_pa2pte(dstpa) | pteflags | PTE_D); 4069 pmap_pte_flush(); 4070 pmap_update_pg(srcva); 4071 pmap_update_pg(dstva); 4072 4073 memcpy(PAGE_ALIGNED(dstva), PAGE_ALIGNED(srcva), PAGE_SIZE); 4074 4075 #if defined(DIAGNOSTIC) || defined(XENPV) 4076 pmap_pte_set(srcpte, 0); 4077 pmap_pte_set(dstpte, 0); 4078 pmap_pte_flush(); 4079 #endif 4080 4081 kpreempt_enable(); 4082 #endif /* defined(__HAVE_DIRECT_MAP) */ 4083 } 4084 4085 static pt_entry_t * 4086 pmap_map_ptp(struct vm_page *ptp) 4087 { 4088 #ifdef __HAVE_DIRECT_MAP 4089 return (void *)PMAP_DIRECT_MAP(VM_PAGE_TO_PHYS(ptp)); 4090 #else 4091 struct cpu_info *ci; 4092 pt_entry_t *ptppte; 4093 vaddr_t ptpva; 4094 4095 KASSERT(kpreempt_disabled()); 4096 4097 #ifndef XENPV 4098 const pd_entry_t pteflags = PTE_P | PTE_W | pmap_pg_nx | PTE_A | PTE_D; 4099 #else 4100 const pd_entry_t pteflags = PTE_P | pmap_pg_nx | PTE_A | PTE_D; 4101 #endif 4102 4103 ci = curcpu(); 4104 ptpva = ci->vpage[VPAGE_PTP]; 4105 ptppte = ci->vpage_pte[VPAGE_PTP]; 4106 4107 pmap_pte_set(ptppte, pmap_pa2pte(VM_PAGE_TO_PHYS(ptp)) | pteflags); 4108 4109 pmap_pte_flush(); 4110 pmap_update_pg(ptpva); 4111 4112 return (pt_entry_t *)ptpva; 4113 #endif 4114 } 4115 4116 static void 4117 pmap_unmap_ptp(void) 4118 { 4119 #ifndef __HAVE_DIRECT_MAP 4120 #if defined(DIAGNOSTIC) || defined(XENPV) 4121 struct cpu_info *ci; 4122 pt_entry_t *pte; 4123 4124 KASSERT(kpreempt_disabled()); 4125 4126 ci = curcpu(); 4127 pte = ci->vpage_pte[VPAGE_PTP]; 4128 4129 if 
(*pte != 0) { 4130 pmap_pte_set(pte, 0); 4131 pmap_pte_flush(); 4132 } 4133 #endif 4134 #endif 4135 } 4136 4137 static pt_entry_t * 4138 pmap_map_pte(struct pmap *pmap, struct vm_page *ptp, vaddr_t va) 4139 { 4140 4141 KASSERT(kpreempt_disabled()); 4142 if (pmap_is_curpmap(pmap)) { 4143 return &PTE_BASE[pl1_i(va)]; /* (k)vtopte */ 4144 } 4145 KASSERT(ptp != NULL); 4146 return pmap_map_ptp(ptp) + pl1_pi(va); 4147 } 4148 4149 static void 4150 pmap_unmap_pte(void) 4151 { 4152 4153 KASSERT(kpreempt_disabled()); 4154 4155 pmap_unmap_ptp(); 4156 } 4157 4158 /* 4159 * p m a p r e m o v e f u n c t i o n s 4160 * 4161 * functions that remove mappings 4162 */ 4163 4164 /* 4165 * pmap_remove_ptes: remove PTEs from a PTP 4166 * 4167 * => caller must hold pmap's lock 4168 * => PTP must be mapped into KVA 4169 * => PTP should be null if pmap == pmap_kernel() 4170 * => must be called with kernel preemption disabled 4171 * => returns nothing; TLB shootdowns are issued per removed PTE as needed 4172 */ 4173 static void 4174 pmap_remove_ptes(struct pmap *pmap, struct vm_page *ptp, vaddr_t ptpva, 4175 vaddr_t startva, vaddr_t endva) 4176 { 4177 pt_entry_t *pte = (pt_entry_t *)ptpva; 4178 4179 KASSERT(mutex_owned(&pmap->pm_lock)); 4180 KASSERT(kpreempt_disabled()); 4181 4182 /* 4183 * mappings are very often sparse, so clip the given range to the 4184 * range of PTEs that are known present in the PTP. 4185 */ 4186 pmap_ptp_range_clip(ptp, &startva, &pte); 4187 4188 /* 4189 * note that ptpva points to the PTE that maps startva. this may 4190 * or may not be the first PTE in the PTP. 4191 * 4192 * we loop through the PTP while there are still PTEs to look at 4193 * and the wire_count is greater than 1 (because we use the wire_count 4194 * to keep track of the number of real PTEs in the PTP). 4195 */ 4196 while (startva < endva && (ptp == NULL || ptp->wire_count > 1)) { 4197 (void)pmap_remove_pte(pmap, ptp, pte, startva); 4198 startva += PAGE_SIZE; 4199 pte++; 4200 } 4201 } 4202 4203 /* 4204 * pmap_remove_pte: remove a single PTE from a PTP. 4205 * 4206 * => caller must hold pmap's lock 4207 * => PTP must be mapped into KVA 4208 * => PTP should be null if pmap == pmap_kernel() 4209 * => returns true if we removed a mapping 4210 * => must be called with kernel preemption disabled 4211 */ 4212 static bool 4213 pmap_remove_pte(struct pmap *pmap, struct vm_page *ptp, pt_entry_t *pte, 4214 vaddr_t va) 4215 { 4216 struct pv_entry *pve; 4217 struct vm_page *pg; 4218 struct pmap_page *pp; 4219 pt_entry_t opte; 4220 4221 KASSERT(mutex_owned(&pmap->pm_lock)); 4222 KASSERT(kpreempt_disabled()); 4223 4224 if (!pmap_valid_entry(*pte)) { 4225 /* VA not mapped. */ 4226 return false; 4227 } 4228 4229 /* Atomically save the old PTE and zap it. */ 4230 opte = pmap_pte_testset(pte, 0); 4231 if (!pmap_valid_entry(opte)) { 4232 return false; 4233 } 4234 4235 pmap_exec_account(pmap, va, opte, 0); 4236 pmap_stats_update_bypte(pmap, 0, opte); 4237 4238 if (ptp) { 4239 /* 4240 * Dropping a PTE. Make sure that the PDE is flushed. 4241 */ 4242 ptp->wire_count--; 4243 if (ptp->wire_count <= 1) { 4244 opte |= PTE_A; 4245 } 4246 } 4247 4248 if ((opte & PTE_A) != 0) { 4249 pmap_tlb_shootdown(pmap, va, opte, TLBSHOOT_REMOVE_PTE); 4250 } 4251 4252 /* 4253 * If we are not on a pv list - we are done.
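 *
 * (PTE_PVLIST is set at pmap_enter time only for mappings of managed or
 * pv-tracked pages, so its absence means there is no pv_entry, embedded
 * PV record or rbtree node to clean up; the assertions below verify that.)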
4254 */ 4255 if ((opte & PTE_PVLIST) == 0) { 4256 #ifndef DOM0OPS 4257 KASSERTMSG((PHYS_TO_VM_PAGE(pmap_pte2pa(opte)) == NULL), 4258 "managed page without PTE_PVLIST for %#"PRIxVADDR, va); 4259 KASSERTMSG((pmap_pv_tracked(pmap_pte2pa(opte)) == NULL), 4260 "pv-tracked page without PTE_PVLIST for %#"PRIxVADDR, va); 4261 #endif 4262 KASSERT(pmap_treelookup_pv(pmap, ptp, (ptp != NULL ? 4263 &VM_PAGE_TO_PP(ptp)->pp_rb : &pmap_kernel_rb), va) == NULL); 4264 return true; 4265 } 4266 4267 if ((pg = PHYS_TO_VM_PAGE(pmap_pte2pa(opte))) != NULL) { 4268 pp = VM_PAGE_TO_PP(pg); 4269 } else if ((pp = pmap_pv_tracked(pmap_pte2pa(opte))) == NULL) { 4270 paddr_t pa = pmap_pte2pa(opte); 4271 panic("%s: PTE_PVLIST with pv-untracked page" 4272 " va = %#"PRIxVADDR"pa = %#"PRIxPADDR" (%#"PRIxPADDR")", 4273 __func__, va, pa, atop(pa)); 4274 } 4275 4276 /* Sync R/M bits. */ 4277 pve = pmap_lookup_pv(pmap, ptp, pp, va); 4278 pmap_remove_pv(pmap, pp, ptp, va, pve, pmap_pte_to_pp_attrs(opte)); 4279 return true; 4280 } 4281 4282 static void 4283 pmap_remove_locked(struct pmap *pmap, vaddr_t sva, vaddr_t eva) 4284 { 4285 pt_entry_t *ptes; 4286 pd_entry_t pde; 4287 pd_entry_t * const *pdes; 4288 bool result; 4289 vaddr_t blkendva, va = sva; 4290 struct vm_page *ptp; 4291 struct pmap *pmap2; 4292 int lvl; 4293 4294 KASSERT(mutex_owned(&pmap->pm_lock)); 4295 4296 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); 4297 4298 /* 4299 * removing one page? take shortcut function. 4300 */ 4301 4302 if (va + PAGE_SIZE == eva) { 4303 if (pmap_pdes_valid(va, pdes, &pde, &lvl)) { 4304 KASSERT(lvl == 1); 4305 4306 /* Get PTP if non-kernel mapping. */ 4307 if (pmap != pmap_kernel()) { 4308 ptp = pmap_find_ptp(pmap, va, 1); 4309 KASSERTMSG(ptp != NULL, 4310 "%s: unmanaged PTP detected", __func__); 4311 } else { 4312 /* Never free kernel PTPs. */ 4313 ptp = NULL; 4314 } 4315 4316 result = pmap_remove_pte(pmap, ptp, 4317 &ptes[pl1_i(va)], va); 4318 4319 /* 4320 * if mapping removed and the PTP is no longer 4321 * being used, free it! 4322 */ 4323 4324 if (result && ptp && ptp->wire_count <= 1) 4325 pmap_free_ptp(pmap, ptp, va, ptes, pdes); 4326 } 4327 } else for (/* null */ ; va < eva ; va = blkendva) { 4328 /* determine range of block */ 4329 blkendva = x86_round_pdr(va+1); 4330 if (blkendva > eva) 4331 blkendva = eva; 4332 4333 if (!pmap_pdes_valid(va, pdes, &pde, &lvl)) { 4334 /* Skip a range corresponding to an invalid pde. */ 4335 blkendva = (va & ptp_frames[lvl - 1]) + nbpd[lvl - 1]; 4336 continue; 4337 } 4338 KASSERT(lvl == 1); 4339 4340 /* Get PTP if non-kernel mapping. */ 4341 if (pmap != pmap_kernel()) { 4342 ptp = pmap_find_ptp(pmap, va, 1); 4343 KASSERTMSG(ptp != NULL, "%s: unmanaged PTP detected", 4344 __func__); 4345 } else { 4346 /* Never free kernel PTPs. */ 4347 ptp = NULL; 4348 } 4349 4350 pmap_remove_ptes(pmap, ptp, (vaddr_t)&ptes[pl1_i(va)], va, 4351 blkendva); 4352 4353 /* If PTP is no longer being used, free it. */ 4354 if (ptp && ptp->wire_count <= 1) { 4355 pmap_free_ptp(pmap, ptp, va, ptes, pdes); 4356 } 4357 } 4358 pmap_unmap_ptes(pmap, pmap2); 4359 pmap_drain_pv(pmap); 4360 } 4361 4362 /* 4363 * pmap_remove: mapping removal function. 
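 *
 * Illustrative caller sketch (assumed, not taken from this file): the
 * usual pattern is to follow removal with pmap_update(), so that deferred
 * TLB shootdowns and PTP frees are processed before the range is reused:
 *
 *	pmap_remove(pmap, sva, eva);
 *	pmap_update(pmap);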
4364 * 4365 * => caller should not be holding any pmap locks 4366 */ 4367 void 4368 pmap_remove(struct pmap *pmap, vaddr_t sva, vaddr_t eva) 4369 { 4370 if (__predict_false(pmap->pm_remove != NULL)) { 4371 (*pmap->pm_remove)(pmap, sva, eva); 4372 return; 4373 } 4374 4375 mutex_enter(&pmap->pm_lock); 4376 pmap_remove_locked(pmap, sva, eva); 4377 mutex_exit(&pmap->pm_lock); 4378 } 4379 4380 /* 4381 * pmap_sync_pv: clear pte bits and return the old value of the pp_attrs. 4382 * 4383 * => The 'clearbits' parameter is either ~0 or PP_ATTRS_... 4384 * => Caller should disable kernel preemption. 4385 * => issues tlb shootdowns if necessary. 4386 */ 4387 static int 4388 pmap_sync_pv(struct pv_pte *pvpte, paddr_t pa, int clearbits, uint8_t *oattrs, 4389 pt_entry_t *optep) 4390 { 4391 struct pmap *pmap; 4392 struct vm_page *ptp; 4393 vaddr_t va; 4394 pt_entry_t *ptep; 4395 pt_entry_t opte; 4396 pt_entry_t npte; 4397 pt_entry_t expect; 4398 bool need_shootdown; 4399 4400 ptp = pvpte->pte_ptp; 4401 va = pvpte->pte_va; 4402 KASSERT(ptp == NULL || ptp->uobject != NULL); 4403 KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset); 4404 pmap = ptp_to_pmap(ptp); 4405 KASSERT(kpreempt_disabled()); 4406 4407 if (__predict_false(pmap->pm_sync_pv != NULL)) { 4408 return (*pmap->pm_sync_pv)(ptp, va, pa, clearbits, oattrs, 4409 optep); 4410 } 4411 4412 expect = pmap_pa2pte(pa) | PTE_P; 4413 4414 if (clearbits != ~0) { 4415 KASSERT((clearbits & ~(PP_ATTRS_D|PP_ATTRS_A|PP_ATTRS_W)) == 0); 4416 clearbits = pmap_pp_attrs_to_pte(clearbits); 4417 } 4418 4419 ptep = pmap_map_pte(pmap, ptp, va); 4420 do { 4421 opte = *ptep; 4422 KASSERT((opte & (PTE_D | PTE_A)) != PTE_D); 4423 KASSERT((opte & (PTE_A | PTE_P)) != PTE_A); 4424 KASSERT(opte == 0 || (opte & PTE_P) != 0); 4425 if ((opte & (PTE_FRAME | PTE_P)) != expect) { 4426 /* 4427 * We lost a race with a V->P operation like 4428 * pmap_remove(). Wait for the competitor 4429 * reflecting pte bits into mp_attrs. 4430 */ 4431 pmap_unmap_pte(); 4432 return EAGAIN; 4433 } 4434 4435 /* 4436 * Check if there's anything to do on this PTE. 4437 */ 4438 if ((opte & clearbits) == 0) { 4439 need_shootdown = false; 4440 break; 4441 } 4442 4443 /* 4444 * We need a shootdown if the PTE is cached (PTE_A) ... 4445 * ... Unless we are clearing only the PTE_W bit and 4446 * it isn't cached as RW (PTE_D). 4447 */ 4448 need_shootdown = (opte & PTE_A) != 0 && 4449 !(clearbits == PTE_W && (opte & PTE_D) == 0); 4450 4451 npte = opte & ~clearbits; 4452 4453 /* 4454 * If we need a shootdown anyway, clear PTE_A and PTE_D. 
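 *
 * (Rationale: the TLB entry is about to be invalidated anyway, so clearing
 * PTE_A/PTE_D costs nothing extra; the cleared values were captured in opte
 * and are folded into *oattrs below, and the hardware will set the bits
 * again on the next access through a fresh TLB fill.)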
4455 */ 4456 if (need_shootdown) { 4457 npte &= ~(PTE_A | PTE_D); 4458 } 4459 KASSERT((npte & (PTE_D | PTE_A)) != PTE_D); 4460 KASSERT((npte & (PTE_A | PTE_P)) != PTE_A); 4461 KASSERT(npte == 0 || (opte & PTE_P) != 0); 4462 } while (pmap_pte_cas(ptep, opte, npte) != opte); 4463 4464 if (need_shootdown) { 4465 pmap_tlb_shootdown(pmap, va, opte, TLBSHOOT_SYNC_PV); 4466 } 4467 pmap_unmap_pte(); 4468 4469 *oattrs = pmap_pte_to_pp_attrs(opte); 4470 if (optep != NULL) 4471 *optep = opte; 4472 return 0; 4473 } 4474 4475 static void 4476 pmap_pp_remove_ent(struct pmap *pmap, struct vm_page *ptp, pt_entry_t opte, 4477 vaddr_t va) 4478 { 4479 struct pmap *pmap2; 4480 pt_entry_t *ptes; 4481 pd_entry_t * const *pdes; 4482 4483 KASSERT(mutex_owned(&pmap->pm_lock)); 4484 4485 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); 4486 pmap_stats_update_bypte(pmap, 0, opte); 4487 ptp->wire_count--; 4488 if (ptp->wire_count <= 1) { 4489 pmap_free_ptp(pmap, ptp, va, ptes, pdes); 4490 } 4491 pmap_unmap_ptes(pmap, pmap2); 4492 } 4493 4494 static void 4495 pmap_pp_remove(struct pmap_page *pp, paddr_t pa) 4496 { 4497 struct pv_pte *pvpte; 4498 struct vm_page *ptp; 4499 uintptr_t sum; 4500 uint8_t oattrs; 4501 bool locked; 4502 4503 /* 4504 * Do an unlocked check to see if the page has no mappings, eg when 4505 * pmap_remove_all() was called before amap_wipeout() for a process 4506 * private amap - common. The page being removed must be on the way 4507 * out, so we don't have to worry about concurrent attempts to enter 4508 * it (otherwise the caller either doesn't care or has screwed up). 4509 */ 4510 sum = (uintptr_t)atomic_load_relaxed(&pp->pp_pte.pte_va); 4511 sum |= (uintptr_t)atomic_load_relaxed(&pp->pp_pte.pte_ptp); 4512 sum |= (uintptr_t)atomic_load_relaxed(&pp->pp_pvlist.lh_first); 4513 if (sum == 0) { 4514 return; 4515 } 4516 4517 kpreempt_disable(); 4518 for (;;) { 4519 struct pmap *pmap; 4520 struct pv_entry *pve; 4521 pt_entry_t opte; 4522 vaddr_t va; 4523 4524 mutex_spin_enter(&pp->pp_lock); 4525 if ((pvpte = pv_pte_first(pp)) == NULL) { 4526 mutex_spin_exit(&pp->pp_lock); 4527 break; 4528 } 4529 4530 /* 4531 * Add a reference to the pmap before clearing the pte. 4532 * Otherwise the pmap can disappear behind us. 4533 */ 4534 ptp = pvpte->pte_ptp; 4535 pmap = ptp_to_pmap(ptp); 4536 KASSERT(pmap->pm_obj[0].uo_refs > 0); 4537 if (ptp != NULL) { 4538 pmap_reference(pmap); 4539 } 4540 4541 /* 4542 * Now try to lock it. We need a direct handoff between 4543 * pp_lock and pm_lock to know the pv_entry is kept intact 4544 * and kept associated with this pmap. If that can't be 4545 * had, wait for the pmap's lock to become free and then 4546 * retry. 4547 */ 4548 locked = mutex_tryenter(&pmap->pm_lock); 4549 mutex_spin_exit(&pp->pp_lock); 4550 if (!locked) { 4551 mutex_enter(&pmap->pm_lock); 4552 /* nothing, just wait for it */ 4553 mutex_exit(&pmap->pm_lock); 4554 if (ptp != NULL) { 4555 pmap_destroy(pmap); 4556 } 4557 continue; 4558 } 4559 va = pvpte->pte_va; 4560 4561 KASSERTMSG(pmap->pm_stats.resident_count > PDP_SIZE, 4562 "va %lx pmap %p ptp %p is empty", va, pmap, ptp); 4563 KASSERTMSG(ptp == NULL || (ptp->flags & PG_FREE) == 0, 4564 "va %lx pmap %p ptp %p is free", va, pmap, ptp); 4565 KASSERTMSG(ptp == NULL || ptp->wire_count > 1, 4566 "va %lx pmap %p ptp %p is empty", va, pmap, ptp); 4567 4568 #ifdef DEBUG 4569 pmap_check_pv(pmap, ptp, pp, pvpte->pte_va, true); 4570 rb_tree_t *tree = (ptp != NULL ? 
4571 &VM_PAGE_TO_PP(ptp)->pp_rb : &pmap_kernel_rb); 4572 pve = pmap_treelookup_pv(pmap, ptp, tree, va); 4573 if (pve == NULL) { 4574 KASSERTMSG(&pp->pp_pte == pvpte, 4575 "va %lx pmap %p ptp %p pvpte %p pve %p oops 1", 4576 va, pmap, ptp, pvpte, pve); 4577 } else { 4578 KASSERTMSG(&pve->pve_pte == pvpte, 4579 "va %lx pmap %p ptp %p pvpte %p pve %p oops 2", 4580 va, pmap, ptp, pvpte, pve); 4581 } 4582 #endif 4583 4584 if (pmap_sync_pv(pvpte, pa, ~0, &oattrs, &opte)) { 4585 panic("pmap_pp_remove: mapping not present"); 4586 } 4587 4588 pve = pmap_lookup_pv(pmap, ptp, pp, va); 4589 pmap_remove_pv(pmap, pp, ptp, va, pve, oattrs); 4590 4591 /* Update the PTP reference count. Free if last reference. */ 4592 if (ptp != NULL) { 4593 KASSERT(pmap != pmap_kernel()); 4594 pmap_tlb_shootnow(); 4595 if (__predict_false(pmap->pm_pp_remove_ent != NULL)) { 4596 (*pmap->pm_pp_remove_ent)(pmap, ptp, opte, va); 4597 } else { 4598 pmap_pp_remove_ent(pmap, ptp, opte, va); 4599 } 4600 } else { 4601 KASSERT(pmap == pmap_kernel()); 4602 pmap_stats_update_bypte(pmap, 0, opte); 4603 } 4604 pmap_tlb_shootnow(); 4605 pmap_drain_pv(pmap); 4606 mutex_exit(&pmap->pm_lock); 4607 if (ptp != NULL) { 4608 pmap_destroy(pmap); 4609 } 4610 } 4611 kpreempt_enable(); 4612 } 4613 4614 /* 4615 * pmap_page_remove: remove a managed vm_page from all pmaps that map it 4616 * 4617 * => R/M bits are sync'd back to attrs 4618 */ 4619 void 4620 pmap_page_remove(struct vm_page *pg) 4621 { 4622 struct pmap_page *pp; 4623 paddr_t pa; 4624 4625 pp = VM_PAGE_TO_PP(pg); 4626 pa = VM_PAGE_TO_PHYS(pg); 4627 pmap_pp_remove(pp, pa); 4628 } 4629 4630 /* 4631 * pmap_pv_remove: remove an unmanaged pv-tracked page from all pmaps 4632 * that map it 4633 */ 4634 void 4635 pmap_pv_remove(paddr_t pa) 4636 { 4637 struct pmap_page *pp; 4638 4639 pp = pmap_pv_tracked(pa); 4640 if (pp == NULL) 4641 panic("%s: page not pv-tracked: %#"PRIxPADDR, __func__, pa); 4642 pmap_pp_remove(pp, pa); 4643 } 4644 4645 /* 4646 * p m a p a t t r i b u t e f u n c t i o n s 4647 * functions that test/change managed page's attributes 4648 * since a page can be mapped multiple times we must check each PTE that 4649 * maps it by going down the pv lists. 4650 */ 4651 4652 /* 4653 * pmap_test_attrs: test a page's attributes 4654 */ 4655 bool 4656 pmap_test_attrs(struct vm_page *pg, unsigned testbits) 4657 { 4658 struct pmap_page *pp; 4659 struct pv_pte *pvpte; 4660 struct pmap *pmap; 4661 uint8_t oattrs; 4662 u_int result; 4663 paddr_t pa; 4664 4665 pp = VM_PAGE_TO_PP(pg); 4666 if ((pp->pp_attrs & testbits) != 0) { 4667 return true; 4668 } 4669 pa = VM_PAGE_TO_PHYS(pg); 4670 startover: 4671 mutex_spin_enter(&pp->pp_lock); 4672 for (pvpte = pv_pte_first(pp); pvpte; pvpte = pv_pte_next(pp, pvpte)) { 4673 if ((pp->pp_attrs & testbits) != 0) { 4674 break; 4675 } 4676 if (pmap_sync_pv(pvpte, pa, 0, &oattrs, NULL)) { 4677 /* 4678 * raced with a V->P operation. wait for the other 4679 * side to finish by acquiring pmap's lock. if no 4680 * wait, updates to pp_attrs by the other side may 4681 * go unseen. 4682 */ 4683 pmap = ptp_to_pmap(pvpte->pte_ptp); 4684 pmap_reference(pmap); 4685 mutex_spin_exit(&pp->pp_lock); 4686 mutex_enter(&pmap->pm_lock); 4687 /* nothing. */ 4688 mutex_exit(&pmap->pm_lock); 4689 pmap_destroy(pmap); 4690 goto startover; 4691 } 4692 pp->pp_attrs |= oattrs; 4693 } 4694 result = pp->pp_attrs & testbits; 4695 mutex_spin_exit(&pp->pp_lock); 4696 4697 /* 4698 * note that we will exit the for loop with a non-null pve if 4699 * we have found the bits we are testing for. 
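 *
 * Consumers of this interface are typically thin wrappers; for example it
 * is assumed here (see this port's pmap.h) that pmap_is_modified(pg) tests
 * PP_ATTRS_D and pmap_is_referenced(pg) tests PP_ATTRS_A via this function.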
4700 */ 4701 4702 return result != 0; 4703 } 4704 4705 static bool 4706 pmap_pp_clear_attrs(struct pmap_page *pp, paddr_t pa, unsigned clearbits) 4707 { 4708 struct pv_pte *pvpte; 4709 struct pmap *pmap; 4710 uint8_t oattrs; 4711 u_int result; 4712 4713 startover: 4714 mutex_spin_enter(&pp->pp_lock); 4715 for (pvpte = pv_pte_first(pp); pvpte; pvpte = pv_pte_next(pp, pvpte)) { 4716 if (pmap_sync_pv(pvpte, pa, clearbits, &oattrs, NULL)) { 4717 /* 4718 * raced with a V->P operation. wait for the other 4719 * side to finish by acquiring pmap's lock. it is 4720 * probably unmapping the page, and it will be gone 4721 * when the loop is restarted. 4722 */ 4723 pmap = ptp_to_pmap(pvpte->pte_ptp); 4724 pmap_reference(pmap); 4725 mutex_spin_exit(&pp->pp_lock); 4726 mutex_enter(&pmap->pm_lock); 4727 /* nothing. */ 4728 mutex_exit(&pmap->pm_lock); 4729 pmap_destroy(pmap); 4730 goto startover; 4731 } 4732 pp->pp_attrs |= oattrs; 4733 } 4734 result = pp->pp_attrs & clearbits; 4735 pp->pp_attrs &= ~clearbits; 4736 pmap_tlb_shootnow(); 4737 mutex_spin_exit(&pp->pp_lock); 4738 4739 return result != 0; 4740 } 4741 4742 /* 4743 * pmap_clear_attrs: clear the specified attribute for a page. 4744 * 4745 * => we return true if we cleared one of the bits we were asked to 4746 */ 4747 bool 4748 pmap_clear_attrs(struct vm_page *pg, unsigned clearbits) 4749 { 4750 struct pmap_page *pp; 4751 paddr_t pa; 4752 4753 pp = VM_PAGE_TO_PP(pg); 4754 pa = VM_PAGE_TO_PHYS(pg); 4755 4756 /* 4757 * If this is a new page, assert it has no mappings and simply zap 4758 * the stored attributes without taking any locks. 4759 */ 4760 if ((pg->flags & PG_FAKE) != 0) { 4761 KASSERT(atomic_load_relaxed(&pp->pp_pte.pte_va) == 0); 4762 KASSERT(atomic_load_relaxed(&pp->pp_pte.pte_ptp) == NULL); 4763 KASSERT(atomic_load_relaxed(&pp->pp_pvlist.lh_first) == NULL); 4764 atomic_store_relaxed(&pp->pp_attrs, 0); 4765 return false; 4766 } else { 4767 return pmap_pp_clear_attrs(pp, pa, clearbits); 4768 } 4769 } 4770 4771 /* 4772 * pmap_pv_clear_attrs: clear the specified attributes for an unmanaged 4773 * pv-tracked page. 4774 */ 4775 bool 4776 pmap_pv_clear_attrs(paddr_t pa, unsigned clearbits) 4777 { 4778 struct pmap_page *pp; 4779 4780 pp = pmap_pv_tracked(pa); 4781 if (pp == NULL) 4782 panic("%s: page not pv-tracked: %#"PRIxPADDR, __func__, pa); 4783 4784 return pmap_pp_clear_attrs(pp, pa, clearbits); 4785 } 4786 4787 /* 4788 * p m a p p r o t e c t i o n f u n c t i o n s 4789 */ 4790 4791 /* 4792 * pmap_page_protect: change the protection of all recorded mappings 4793 * of a managed page 4794 * 4795 * => NOTE: this is an inline function in pmap.h 4796 */ 4797 4798 /* see pmap.h */ 4799 4800 /* 4801 * pmap_pv_protect: change the protection of all recorded mappings 4802 * of an unmanaged pv-tracked page 4803 * 4804 * => NOTE: this is an inline function in pmap.h 4805 */ 4806 4807 /* see pmap.h */ 4808 4809 /* 4810 * pmap_protect: set the protection of the pages in a pmap 4811 * 4812 * => NOTE: this is an inline function in pmap.h 4813 */ 4814 4815 /* see pmap.h */ 4816 4817 /* 4818 * pmap_write_protect: write-protect pages in a pmap. 4819 * 4820 * Note for Xen-amd64. Xen automatically adds PTE_U to the kernel pages, but we 4821 * don't need to remove this bit when re-entering the PTEs here: Xen tracks the 4822 * kernel pages with a reserved bit (_PAGE_GUEST_KERNEL), so even if PTE_U is 4823 * present the page will still be considered as a kernel page, and the privilege 4824 * separation will be enforced correctly.
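 *
 * Illustrative caller sketch (assumed, not taken from this file): revoking
 * write access, e.g. for copy-on-write, follows the usual update pattern:
 *
 *	pmap_write_protect(pmap, sva, eva, VM_PROT_READ | VM_PROT_EXECUTE);
 *	pmap_update(pmap);
 *
 * Note that this routine only takes permissions away (clearing PTE_W and/or
 * adding NX); it never grants additional access.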
4825 */ 4826 void 4827 pmap_write_protect(struct pmap *pmap, vaddr_t sva, vaddr_t eva, vm_prot_t prot) 4828 { 4829 pt_entry_t bit_rem, bit_put; 4830 pt_entry_t *ptes; 4831 pt_entry_t * const *pdes; 4832 struct pmap *pmap2; 4833 vaddr_t blockend, va; 4834 int lvl, i; 4835 4836 if (__predict_false(pmap->pm_write_protect != NULL)) { 4837 (*pmap->pm_write_protect)(pmap, sva, eva, prot); 4838 return; 4839 } 4840 4841 bit_rem = 0; 4842 if (!(prot & VM_PROT_WRITE)) 4843 bit_rem = PTE_W; 4844 4845 bit_put = 0; 4846 if (!(prot & VM_PROT_EXECUTE)) 4847 bit_put = pmap_pg_nx; 4848 4849 sva &= ~PAGE_MASK; 4850 eva &= ~PAGE_MASK; 4851 4852 /* 4853 * Acquire pmap. No need to lock the kernel pmap as we won't 4854 * be touching PV entries nor stats and kernel PDEs aren't 4855 * freed. 4856 */ 4857 if (pmap != pmap_kernel()) { 4858 mutex_enter(&pmap->pm_lock); 4859 } 4860 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); 4861 4862 for (va = sva ; va < eva; va = blockend) { 4863 pt_entry_t *spte, *epte; 4864 4865 blockend = x86_round_pdr(va + 1); 4866 if (blockend > eva) 4867 blockend = eva; 4868 4869 /* Is it a valid block? */ 4870 if (!pmap_pdes_valid(va, pdes, NULL, &lvl)) { 4871 continue; 4872 } 4873 KASSERT(va < VM_MAXUSER_ADDRESS || va >= VM_MAX_ADDRESS); 4874 KASSERT(lvl == 1); 4875 4876 spte = &ptes[pl1_i(va)]; 4877 epte = &ptes[pl1_i(blockend)]; 4878 4879 for (i = 0; spte < epte; spte++, i++) { 4880 pt_entry_t opte, npte; 4881 4882 do { 4883 opte = *spte; 4884 if (!pmap_valid_entry(opte)) { 4885 goto next; 4886 } 4887 npte = (opte & ~bit_rem) | bit_put; 4888 } while (pmap_pte_cas(spte, opte, npte) != opte); 4889 4890 if ((opte & PTE_D) != 0) { 4891 vaddr_t tva = va + x86_ptob(i); 4892 pmap_tlb_shootdown(pmap, tva, opte, 4893 TLBSHOOT_WRITE_PROTECT); 4894 } 4895 next:; 4896 } 4897 } 4898 4899 /* Release pmap. */ 4900 pmap_unmap_ptes(pmap, pmap2); 4901 if (pmap != pmap_kernel()) { 4902 mutex_exit(&pmap->pm_lock); 4903 } 4904 } 4905 4906 /* 4907 * pmap_unwire: clear the wired bit in the PTE. 4908 * 4909 * => Mapping should already be present. 4910 */ 4911 void 4912 pmap_unwire(struct pmap *pmap, vaddr_t va) 4913 { 4914 pt_entry_t *ptes, *ptep, opte; 4915 pd_entry_t * const *pdes; 4916 struct pmap *pmap2; 4917 int lvl; 4918 4919 if (__predict_false(pmap->pm_unwire != NULL)) { 4920 (*pmap->pm_unwire)(pmap, va); 4921 return; 4922 } 4923 4924 /* 4925 * Acquire pmap. Need to lock the kernel pmap only to protect the 4926 * statistics. 4927 */ 4928 mutex_enter(&pmap->pm_lock); 4929 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); 4930 4931 if (!pmap_pdes_valid(va, pdes, NULL, &lvl)) { 4932 panic("%s: invalid PDE va=%#" PRIxVADDR, __func__, va); 4933 } 4934 KASSERT(lvl == 1); 4935 4936 ptep = &ptes[pl1_i(va)]; 4937 opte = *ptep; 4938 KASSERT(pmap_valid_entry(opte)); 4939 4940 if (opte & PTE_WIRED) { 4941 pt_entry_t npte = opte & ~PTE_WIRED; 4942 4943 opte = pmap_pte_testset(ptep, npte); 4944 pmap_stats_update_bypte(pmap, npte, opte); 4945 } else { 4946 printf("%s: wiring for pmap %p va %#" PRIxVADDR 4947 " did not change!\n", __func__, pmap, va); 4948 } 4949 4950 /* Release pmap. 
*/ 4951 pmap_unmap_ptes(pmap, pmap2); 4952 mutex_exit(&pmap->pm_lock); 4953 } 4954 4955 /* 4956 * pmap_copy: copy mappings from one pmap to another 4957 * 4958 * => optional function 4959 * void pmap_copy(dst_pmap, src_pmap, dst_addr, len, src_addr) 4960 */ 4961 4962 /* 4963 * defined as macro in pmap.h 4964 */ 4965 4966 __strict_weak_alias(pmap_enter, pmap_enter_default); 4967 4968 int 4969 pmap_enter_default(pmap_t pmap, vaddr_t va, paddr_t pa, vm_prot_t prot, 4970 u_int flags) 4971 { 4972 if (__predict_false(pmap->pm_enter != NULL)) { 4973 return (*pmap->pm_enter)(pmap, va, pa, prot, flags); 4974 } 4975 4976 return pmap_enter_ma(pmap, va, pa, pa, prot, flags, 0); 4977 } 4978 4979 /* 4980 * pmap_enter: enter a mapping into a pmap 4981 * 4982 * => must be done "now" ... no lazy-evaluation 4983 */ 4984 int 4985 pmap_enter_ma(struct pmap *pmap, vaddr_t va, paddr_t ma, paddr_t pa, 4986 vm_prot_t prot, u_int flags, int domid) 4987 { 4988 pt_entry_t *ptes, opte, npte; 4989 pt_entry_t *ptep; 4990 pd_entry_t * const *pdes; 4991 struct vm_page *ptp; 4992 struct vm_page *new_pg, *old_pg; 4993 struct pmap_page *new_pp, *old_pp; 4994 struct pv_entry *old_pve, *new_pve; 4995 bool wired = (flags & PMAP_WIRED) != 0; 4996 struct pmap *pmap2; 4997 struct pmap_ptparray pt; 4998 int error; 4999 bool getptp, samepage, new_embedded; 5000 rb_tree_t *tree; 5001 5002 KASSERT(pmap_initialized); 5003 KASSERT(va < VM_MAX_KERNEL_ADDRESS); 5004 KASSERTMSG(va != (vaddr_t)PDP_BASE, "%s: trying to map va=%#" 5005 PRIxVADDR " over PDP!", __func__, va); 5006 KASSERTMSG(va < VM_MIN_KERNEL_ADDRESS || 5007 pmap_valid_entry(pmap->pm_pdir[pl_i(va, PTP_LEVELS)]), 5008 "%s: missing kernel PTP for va=%#" PRIxVADDR, __func__, va); 5009 5010 #ifdef XENPV 5011 KASSERT(domid == DOMID_SELF || pa == 0); 5012 #endif 5013 5014 npte = ma | protection_codes[prot] | PTE_P; 5015 npte |= pmap_pat_flags(flags); 5016 if (wired) 5017 npte |= PTE_WIRED; 5018 if (va < VM_MAXUSER_ADDRESS) { 5019 KASSERTMSG(pmap != pmap_kernel(), 5020 "entering user va %#"PRIxVADDR" into kernel pmap", 5021 va); 5022 if (pmap_is_user(pmap)) 5023 npte |= PTE_U; 5024 } 5025 5026 if (pmap == pmap_kernel()) 5027 npte |= pmap_pg_g; 5028 if (flags & VM_PROT_ALL) { 5029 npte |= PTE_A; 5030 if (flags & VM_PROT_WRITE) { 5031 KASSERT((npte & PTE_W) != 0); 5032 npte |= PTE_D; 5033 } 5034 } 5035 5036 #ifdef XENPV 5037 if (domid != DOMID_SELF) 5038 new_pg = NULL; 5039 else 5040 #endif 5041 new_pg = PHYS_TO_VM_PAGE(pa); 5042 5043 if (new_pg != NULL) { 5044 /* This is a managed page */ 5045 npte |= PTE_PVLIST; 5046 new_pp = VM_PAGE_TO_PP(new_pg); 5047 PMAP_CHECK_PP(new_pp); 5048 } else if ((new_pp = pmap_pv_tracked(pa)) != NULL) { 5049 /* This is an unmanaged pv-tracked page */ 5050 npte |= PTE_PVLIST; 5051 PMAP_CHECK_PP(new_pp); 5052 } else { 5053 new_pp = NULL; 5054 } 5055 5056 /* Begin by locking the pmap. */ 5057 mutex_enter(&pmap->pm_lock); 5058 5059 /* Look up the PTP. Allocate if none present. */ 5060 ptp = NULL; 5061 getptp = false; 5062 if (pmap != pmap_kernel()) { 5063 ptp = pmap_find_ptp(pmap, va, 1); 5064 if (ptp == NULL) { 5065 getptp = true; 5066 error = pmap_get_ptp(pmap, &pt, va, flags, &ptp); 5067 if (error != 0) { 5068 if (flags & PMAP_CANFAIL) { 5069 mutex_exit(&pmap->pm_lock); 5070 return error; 5071 } 5072 panic("%s: get ptp failed, error=%d", __func__, 5073 error); 5074 } 5075 } 5076 tree = &VM_PAGE_TO_PP(ptp)->pp_rb; 5077 } else { 5078 /* Embedded PV entries rely on this. 
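 * The embedded PV record in struct pmap_page stores only (pte_ptp, pte_va),
 * and a zero pte_va together with a NULL pte_ptp marks the record as unused.
 * Kernel mappings have no PTP, so entering VA 0 into the kernel pmap would
 * be indistinguishable from "no embedded entry"; hence the assertion below.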
*/ 5079 KASSERT(va != 0); 5080 tree = &pmap_kernel_rb; 5081 } 5082 5083 /* 5084 * Look up the old PV entry at this VA (if any), and insert a new PV 5085 * entry if required for the new mapping. Temporarily track the old 5086 * and new mappings concurrently. Only after the old mapping is 5087 * evicted from the pmap will we remove its PV entry. Otherwise, 5088 * our picture of modified/accessed state for either page could get 5089 * out of sync (we need any P->V operation for either page to stall 5090 * on pmap->pm_lock until done here). 5091 */ 5092 new_pve = NULL; 5093 old_pve = NULL; 5094 samepage = false; 5095 new_embedded = false; 5096 5097 if (new_pp != NULL) { 5098 error = pmap_enter_pv(pmap, new_pp, ptp, va, &new_pve, 5099 &old_pve, &samepage, &new_embedded, tree); 5100 5101 /* 5102 * If a new pv_entry was needed and none was available, we 5103 * can go no further. 5104 */ 5105 if (error != 0) { 5106 if (flags & PMAP_CANFAIL) { 5107 if (getptp) { 5108 pmap_unget_ptp(pmap, &pt); 5109 } 5110 mutex_exit(&pmap->pm_lock); 5111 return error; 5112 } 5113 panic("%s: alloc pve failed", __func__); 5114 } 5115 } else { 5116 old_pve = pmap_treelookup_pv(pmap, ptp, tree, va); 5117 } 5118 5119 /* Map PTEs into address space. */ 5120 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); 5121 5122 /* Install any newly allocated PTPs. */ 5123 if (getptp) { 5124 pmap_install_ptp(pmap, &pt, va, pdes); 5125 } 5126 5127 /* Check if there is an existing mapping. */ 5128 ptep = &ptes[pl1_i(va)]; 5129 opte = *ptep; 5130 bool have_oldpa = pmap_valid_entry(opte); 5131 paddr_t oldpa = pmap_pte2pa(opte); 5132 5133 /* 5134 * Update the pte. 5135 */ 5136 do { 5137 opte = *ptep; 5138 5139 /* 5140 * if the same page, inherit PTE_A and PTE_D. 5141 */ 5142 if (((opte ^ npte) & (PTE_FRAME | PTE_P)) == 0) { 5143 npte |= opte & (PTE_A | PTE_D); 5144 } 5145 #if defined(XENPV) 5146 if (domid != DOMID_SELF) { 5147 /* pmap_pte_cas with error handling */ 5148 int s = splvm(); 5149 if (opte != *ptep) { 5150 splx(s); 5151 continue; 5152 } 5153 error = xpq_update_foreign( 5154 vtomach((vaddr_t)ptep), npte, domid, flags); 5155 splx(s); 5156 if (error) { 5157 /* Undo pv_entry tracking - oof. */ 5158 if (new_pp != NULL) { 5159 mutex_spin_enter(&new_pp->pp_lock); 5160 if (new_pve != NULL) { 5161 LIST_REMOVE(new_pve, pve_list); 5162 KASSERT(pmap->pm_pve == NULL); 5163 pmap->pm_pve = new_pve; 5164 } else if (new_embedded) { 5165 new_pp->pp_pte.pte_ptp = NULL; 5166 new_pp->pp_pte.pte_va = 0; 5167 } 5168 mutex_spin_exit(&new_pp->pp_lock); 5169 } 5170 pmap_unmap_ptes(pmap, pmap2); 5171 /* Free new PTP. */ 5172 if (ptp != NULL && ptp->wire_count <= 1) { 5173 pmap_free_ptp(pmap, ptp, va, ptes, 5174 pdes); 5175 } 5176 mutex_exit(&pmap->pm_lock); 5177 return error; 5178 } 5179 break; 5180 } 5181 #endif /* defined(XENPV) */ 5182 } while (pmap_pte_cas(ptep, opte, npte) != opte); 5183 5184 /* 5185 * Done with the PTEs: they can now be unmapped. 5186 */ 5187 pmap_unmap_ptes(pmap, pmap2); 5188 5189 /* 5190 * Update statistics and PTP's reference count. 5191 */ 5192 pmap_stats_update_bypte(pmap, npte, opte); 5193 if (ptp != NULL) { 5194 if (!have_oldpa) { 5195 ptp->wire_count++; 5196 } 5197 /* Remember minimum VA in PTP. */ 5198 pmap_ptp_range_set(ptp, va); 5199 } 5200 KASSERT(ptp == NULL || ptp->wire_count > 1); 5201 5202 /* 5203 * If the same page, we can skip pv_entry handling. 
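 * "Same page" here means the XOR of the old and new PTEs shows no change in
 * either the frame bits or the present bit, i.e. a still-present mapping of
 * the same physical frame (typically a permission or wiring change), so the
 * existing PV state remains valid.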
5204 */ 5205 if (((opte ^ npte) & (PTE_FRAME | PTE_P)) == 0) { 5206 KASSERT(((opte ^ npte) & PTE_PVLIST) == 0); 5207 if ((npte & PTE_PVLIST) != 0) { 5208 KASSERT(samepage); 5209 pmap_check_pv(pmap, ptp, new_pp, va, true); 5210 } 5211 goto same_pa; 5212 } else if ((npte & PTE_PVLIST) != 0) { 5213 KASSERT(!samepage); 5214 } 5215 5216 /* 5217 * If old page is pv-tracked, remove pv_entry from its list. 5218 */ 5219 if ((~opte & (PTE_P | PTE_PVLIST)) == 0) { 5220 if ((old_pg = PHYS_TO_VM_PAGE(oldpa)) != NULL) { 5221 old_pp = VM_PAGE_TO_PP(old_pg); 5222 } else if ((old_pp = pmap_pv_tracked(oldpa)) == NULL) { 5223 panic("%s: PTE_PVLIST with pv-untracked page" 5224 " va = %#"PRIxVADDR 5225 " pa = %#" PRIxPADDR " (%#" PRIxPADDR ")", 5226 __func__, va, oldpa, atop(pa)); 5227 } 5228 5229 pmap_remove_pv(pmap, old_pp, ptp, va, old_pve, 5230 pmap_pte_to_pp_attrs(opte)); 5231 } else { 5232 KASSERT(old_pve == NULL); 5233 KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == NULL); 5234 } 5235 5236 /* 5237 * If new page is dynamically PV tracked, insert to tree. 5238 */ 5239 if (new_pve != NULL) { 5240 KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == NULL); 5241 old_pve = rb_tree_insert_node(tree, new_pve); 5242 KASSERT(old_pve == new_pve); 5243 pmap_check_pv(pmap, ptp, new_pp, va, true); 5244 } 5245 5246 same_pa: 5247 /* 5248 * shootdown tlb if necessary. 5249 */ 5250 5251 if ((~opte & (PTE_P | PTE_A)) == 0 && 5252 ((opte ^ npte) & (PTE_FRAME | PTE_W)) != 0) { 5253 pmap_tlb_shootdown(pmap, va, opte, TLBSHOOT_ENTER); 5254 } 5255 pmap_drain_pv(pmap); 5256 mutex_exit(&pmap->pm_lock); 5257 return 0; 5258 } 5259 5260 #if defined(XEN) && defined(DOM0OPS) 5261 5262 struct pmap_data_gnt { 5263 SLIST_ENTRY(pmap_data_gnt) pd_gnt_list; 5264 vaddr_t pd_gnt_sva; 5265 vaddr_t pd_gnt_eva; /* range covered by this gnt */ 5266 int pd_gnt_refs; /* ref counter */ 5267 struct gnttab_map_grant_ref pd_gnt_ops[1]; /* variable length */ 5268 }; 5269 SLIST_HEAD(pmap_data_gnt_head, pmap_data_gnt); 5270 5271 static void pmap_remove_gnt(struct pmap *, vaddr_t, vaddr_t); 5272 5273 static struct pmap_data_gnt * 5274 pmap_find_gnt(struct pmap *pmap, vaddr_t sva, vaddr_t eva) 5275 { 5276 struct pmap_data_gnt_head *headp; 5277 struct pmap_data_gnt *pgnt; 5278 5279 KASSERT(mutex_owned(&pmap->pm_lock)); 5280 headp = pmap->pm_data; 5281 KASSERT(headp != NULL); 5282 SLIST_FOREACH(pgnt, headp, pd_gnt_list) { 5283 if (pgnt->pd_gnt_sva <= sva && eva <= pgnt->pd_gnt_eva) 5284 return pgnt; 5285 /* check that we're not overlapping part of a region */ 5286 KASSERT(pgnt->pd_gnt_sva >= eva || pgnt->pd_gnt_eva <= sva); 5287 } 5288 return NULL; 5289 } 5290 5291 static void 5292 pmap_alloc_gnt(struct pmap *pmap, vaddr_t sva, int nentries, 5293 const struct gnttab_map_grant_ref *ops) 5294 { 5295 struct pmap_data_gnt_head *headp; 5296 struct pmap_data_gnt *pgnt; 5297 vaddr_t eva = sva + nentries * PAGE_SIZE; 5298 KASSERT(mutex_owned(&pmap->pm_lock)); 5299 KASSERT(nentries >= 1); 5300 if (pmap->pm_remove == NULL) { 5301 pmap->pm_remove = pmap_remove_gnt; 5302 KASSERT(pmap->pm_data == NULL); 5303 headp = kmem_alloc(sizeof(*headp), KM_SLEEP); 5304 SLIST_INIT(headp); 5305 pmap->pm_data = headp; 5306 } else { 5307 KASSERT(pmap->pm_remove == pmap_remove_gnt); 5308 KASSERT(pmap->pm_data != NULL); 5309 headp = pmap->pm_data; 5310 } 5311 5312 pgnt = pmap_find_gnt(pmap, sva, eva); 5313 if (pgnt != NULL) { 5314 KASSERT(pgnt->pd_gnt_sva == sva); 5315 KASSERT(pgnt->pd_gnt_eva == eva); 5316 return; 5317 } 5318 5319 /* new entry */ 5320 pgnt = kmem_alloc(sizeof(*pgnt) + 
5321 (nentries - 1) * sizeof(struct gnttab_map_grant_ref), KM_SLEEP); 5322 pgnt->pd_gnt_sva = sva; 5323 pgnt->pd_gnt_eva = eva; 5324 pgnt->pd_gnt_refs = 0; 5325 memcpy(pgnt->pd_gnt_ops, ops, 5326 sizeof(struct gnttab_map_grant_ref) * nentries); 5327 SLIST_INSERT_HEAD(headp, pgnt, pd_gnt_list); 5328 } 5329 5330 static void 5331 pmap_free_gnt(struct pmap *pmap, struct pmap_data_gnt *pgnt) 5332 { 5333 struct pmap_data_gnt_head *headp = pmap->pm_data; 5334 int nentries = (pgnt->pd_gnt_eva - pgnt->pd_gnt_sva) / PAGE_SIZE; 5335 KASSERT(nentries >= 1); 5336 KASSERT(mutex_owned(&pmap->pm_lock)); 5337 KASSERT(pgnt->pd_gnt_refs == 0); 5338 SLIST_REMOVE(headp, pgnt, pmap_data_gnt, pd_gnt_list); 5339 kmem_free(pgnt, sizeof(*pgnt) + 5340 (nentries - 1) * sizeof(struct gnttab_map_grant_ref)); 5341 if (SLIST_EMPTY(headp)) { 5342 kmem_free(headp, sizeof(*headp)); 5343 pmap->pm_data = NULL; 5344 pmap->pm_remove = NULL; 5345 } 5346 } 5347 5348 /* 5349 * pmap_enter_gnt: enter a grant entry into a pmap 5350 * 5351 * => must be done "now" ... no lazy-evaluation 5352 */ 5353 int 5354 pmap_enter_gnt(struct pmap *pmap, vaddr_t va, vaddr_t sva, int nentries, 5355 const struct gnttab_map_grant_ref *oops) 5356 { 5357 struct pmap_data_gnt *pgnt; 5358 pt_entry_t *ptes, opte; 5359 #ifndef XENPV 5360 pt_entry_t npte; 5361 #endif 5362 pt_entry_t *ptep; 5363 pd_entry_t * const *pdes; 5364 struct vm_page *ptp; 5365 struct vm_page *old_pg; 5366 struct pmap_page *old_pp; 5367 struct pv_entry *old_pve; 5368 struct pmap *pmap2; 5369 struct pmap_ptparray pt; 5370 int error; 5371 bool getptp; 5372 rb_tree_t *tree; 5373 struct gnttab_map_grant_ref *op; 5374 int ret; 5375 int idx; 5376 5377 KASSERT(pmap_initialized); 5378 KASSERT(va < VM_MAX_KERNEL_ADDRESS); 5379 KASSERTMSG(va != (vaddr_t)PDP_BASE, "%s: trying to map va=%#" 5380 PRIxVADDR " over PDP!", __func__, va); 5381 KASSERT(pmap != pmap_kernel()); 5382 5383 /* Begin by locking the pmap. */ 5384 mutex_enter(&pmap->pm_lock); 5385 pmap_alloc_gnt(pmap, sva, nentries, oops); 5386 5387 pgnt = pmap_find_gnt(pmap, va, va + PAGE_SIZE); 5388 KASSERT(pgnt != NULL); 5389 5390 /* Look up the PTP. Allocate if none present. */ 5391 ptp = NULL; 5392 getptp = false; 5393 ptp = pmap_find_ptp(pmap, va, 1); 5394 if (ptp == NULL) { 5395 getptp = true; 5396 error = pmap_get_ptp(pmap, &pt, va, PMAP_CANFAIL, &ptp); 5397 if (error != 0) { 5398 mutex_exit(&pmap->pm_lock); 5399 return error; 5400 } 5401 } 5402 tree = &VM_PAGE_TO_PP(ptp)->pp_rb; 5403 5404 /* 5405 * Look up the old PV entry at this VA (if any), and insert a new PV 5406 * entry if required for the new mapping. Temporarily track the old 5407 * and new mappings concurrently. Only after the old mapping is 5408 * evicted from the pmap will we remove its PV entry. Otherwise, 5409 * our picture of modified/accessed state for either page could get 5410 * out of sync (we need any P->V operation for either page to stall 5411 * on pmap->pm_lock until done here). 5412 */ 5413 old_pve = NULL; 5414 5415 old_pve = pmap_treelookup_pv(pmap, ptp, tree, va); 5416 5417 /* Map PTEs into address space. */ 5418 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); 5419 5420 /* Install any newly allocated PTPs. */ 5421 if (getptp) { 5422 pmap_install_ptp(pmap, &pt, va, pdes); 5423 } 5424 5425 /* Check if there is an existing mapping. */ 5426 ptep = &ptes[pl1_i(va)]; 5427 opte = *ptep; 5428 bool have_oldpa = pmap_valid_entry(opte); 5429 paddr_t oldpa = pmap_pte2pa(opte); 5430 5431 /* 5432 * Update the pte. 
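 * For grant mappings the PTE is not simply written: the request slot for
 * this VA is located by its page index within [pd_gnt_sva, pd_gnt_eva) and
 * the installation is performed by the GNTTABOP_map_grant_ref hypercall
 * (on XENPV the hypervisor writes the PTE itself, via GNTMAP_contains_pte).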
5433 */ 5434 5435 idx = (va - pgnt->pd_gnt_sva) / PAGE_SIZE; 5436 op = &pgnt->pd_gnt_ops[idx]; 5437 5438 #ifdef XENPV 5439 KASSERT(op->flags & GNTMAP_contains_pte); 5440 op->host_addr = xpmap_ptetomach(ptep); 5441 #else 5442 KASSERT((op->flags & GNTMAP_contains_pte) == 0); 5443 KASSERT(op->flags != 0); 5444 KASSERT(op->host_addr != 0); 5445 #endif 5446 op->dev_bus_addr = 0; 5447 op->status = GNTST_general_error; 5448 ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, op, 1); 5449 if (__predict_false(ret)) { 5450 printf("%s: GNTTABOP_map_grant_ref failed: %d\n", 5451 __func__, ret); 5452 op->status = GNTST_general_error; 5453 } 5454 for (int d = 0; d < 256 && op->status == GNTST_eagain; d++) { 5455 kpause("gntmap", false, mstohz(1), NULL); 5456 ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, op, 1); 5457 if (__predict_false(ret)) { 5458 printf("%s: GNTTABOP_map_grant_ref failed: %d\n", 5459 __func__, ret); 5460 op->status = GNTST_general_error; 5461 } 5462 } 5463 if (__predict_false(op->status != GNTST_okay)) { 5464 printf("%s: GNTTABOP_map_grant_ref status: %d\n", 5465 __func__, op->status); 5466 if (have_oldpa) { /* XXX did the pte really change if XENPV ?*/ 5467 ptp->wire_count--; 5468 } 5469 } else { 5470 #ifndef XENPV 5471 npte = op->host_addr | pmap_pg_nx | PTE_U | PTE_P; 5472 if ((op->flags & GNTMAP_readonly) == 0) 5473 npte |= PTE_W; 5474 do { 5475 opte = *ptep; 5476 } while (pmap_pte_cas(ptep, opte, npte) != opte); 5477 #endif 5478 pgnt->pd_gnt_refs++; 5479 if (!have_oldpa) { 5480 ptp->wire_count++; 5481 } 5482 KASSERT(ptp->wire_count > 1); 5483 /* Remember minimum VA in PTP. */ 5484 pmap_ptp_range_set(ptp, va); 5485 } 5486 if (ptp->wire_count <= 1) 5487 pmap_free_ptp(pmap, ptp, va, ptes, pdes); 5488 5489 /* 5490 * Done with the PTEs: they can now be unmapped. 5491 */ 5492 pmap_unmap_ptes(pmap, pmap2); 5493 5494 /* 5495 * Update statistics and PTP's reference count. 5496 */ 5497 pmap_stats_update_bypte(pmap, 0, opte); 5498 5499 /* 5500 * If old page is pv-tracked, remove pv_entry from its list. 5501 */ 5502 if ((~opte & (PTE_P | PTE_PVLIST)) == 0) { 5503 if ((old_pg = PHYS_TO_VM_PAGE(oldpa)) != NULL) { 5504 old_pp = VM_PAGE_TO_PP(old_pg); 5505 } else if ((old_pp = pmap_pv_tracked(oldpa)) == NULL) { 5506 panic("%s: PTE_PVLIST with pv-untracked page" 5507 " va = %#"PRIxVADDR " pa = %#" PRIxPADDR, 5508 __func__, va, oldpa); 5509 } 5510 5511 pmap_remove_pv(pmap, old_pp, ptp, va, old_pve, 5512 pmap_pte_to_pp_attrs(opte)); 5513 } else { 5514 KASSERT(old_pve == NULL); 5515 KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == NULL); 5516 } 5517 5518 pmap_drain_pv(pmap); 5519 mutex_exit(&pmap->pm_lock); 5520 return op->status; 5521 } 5522 5523 /* 5524 * pmap_remove_gnt: grant mapping removal function. 
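 *
 * Installed as pmap->pm_remove by pmap_alloc_gnt(), so a plain pmap_remove()
 * on a pmap holding grant mappings is redirected here; sub-ranges with no
 * grant record fall back to pmap_remove_locked().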
5525 * 5526 * => caller should not be holding any pmap locks 5527 */ 5528 static void 5529 pmap_remove_gnt(struct pmap *pmap, vaddr_t sva, vaddr_t eva) 5530 { 5531 struct pmap_data_gnt *pgnt; 5532 pt_entry_t *ptes; 5533 pd_entry_t pde; 5534 pd_entry_t * const *pdes; 5535 struct vm_page *ptp; 5536 struct pmap *pmap2; 5537 vaddr_t va; 5538 int lvl; 5539 int idx; 5540 struct gnttab_map_grant_ref *op; 5541 struct gnttab_unmap_grant_ref unmap_op; 5542 int ret; 5543 5544 KASSERT(pmap != pmap_kernel()); 5545 KASSERT(pmap->pm_remove == pmap_remove_gnt); 5546 5547 mutex_enter(&pmap->pm_lock); 5548 for (va = sva; va < eva; va += PAGE_SIZE) { 5549 pgnt = pmap_find_gnt(pmap, va, va + PAGE_SIZE); 5550 if (pgnt == NULL) { 5551 pmap_remove_locked(pmap, sva, eva); 5552 continue; 5553 } 5554 5555 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); 5556 if (!pmap_pdes_valid(va, pdes, &pde, &lvl)) { 5557 panic("pmap_remove_gnt pdes not valid"); 5558 } 5559 5560 idx = (va - pgnt->pd_gnt_sva) / PAGE_SIZE; 5561 op = &pgnt->pd_gnt_ops[idx]; 5562 KASSERT(lvl == 1); 5563 5564 /* Get PTP if non-kernel mapping. */ 5565 ptp = pmap_find_ptp(pmap, va, 1); 5566 KASSERTMSG(ptp != NULL, 5567 "%s: unmanaged PTP detected", __func__); 5568 5569 if (op->status == GNTST_okay) { 5570 KASSERT(pmap_valid_entry(ptes[pl1_i(va)])); 5571 #ifdef XENPV 5572 unmap_op.host_addr = xpmap_ptetomach(&ptes[pl1_i(va)]); 5573 #else 5574 unmap_op.host_addr = op->host_addr; 5575 pmap_pte_testset(&ptes[pl1_i(va)], 0); 5576 #endif 5577 unmap_op.handle = op->handle; 5578 unmap_op.dev_bus_addr = 0; 5579 ret = HYPERVISOR_grant_table_op( 5580 GNTTABOP_unmap_grant_ref, &unmap_op, 1); 5581 if (ret) { 5582 printf("%s: GNTTABOP_unmap_grant_ref " 5583 "failed: %d\n", __func__, ret); 5584 } 5585 5586 ptp->wire_count--; 5587 pgnt->pd_gnt_refs--; 5588 } 5589 if (pgnt->pd_gnt_refs == 0) { 5590 pmap_free_gnt(pmap, pgnt); 5591 } 5592 /* 5593 * if mapping removed and the PTP is no longer 5594 * being used, free it! 5595 */ 5596 5597 if (ptp->wire_count <= 1) 5598 pmap_free_ptp(pmap, ptp, va, ptes, pdes); 5599 pmap_unmap_ptes(pmap, pmap2); 5600 } 5601 mutex_exit(&pmap->pm_lock); 5602 } 5603 #endif /* XEN && DOM0OPS */ 5604 5605 paddr_t 5606 pmap_get_physpage(void) 5607 { 5608 struct vm_page *ptp; 5609 struct pmap *kpm = pmap_kernel(); 5610 paddr_t pa; 5611 5612 if (!uvm.page_init_done) { 5613 /* 5614 * We're growing the kernel pmap early (from 5615 * uvm_pageboot_alloc()). This case must be 5616 * handled a little differently. 
5617 */ 5618 5619 if (!uvm_page_physget(&pa)) 5620 panic("%s: out of memory", __func__); 5621 #if defined(__HAVE_DIRECT_MAP) 5622 memset(PAGE_ALIGNED(PMAP_DIRECT_MAP(pa)), 0, PAGE_SIZE); 5623 #else 5624 #if defined(XENPV) 5625 if (XEN_VERSION_SUPPORTED(3, 4)) { 5626 xen_pagezero(pa); 5627 return pa; 5628 } 5629 #endif 5630 kpreempt_disable(); 5631 pmap_pte_set(early_zero_pte, pmap_pa2pte(pa) | PTE_P | 5632 PTE_W | pmap_pg_nx); 5633 pmap_pte_flush(); 5634 pmap_update_pg((vaddr_t)early_zerop); 5635 memset(PAGE_ALIGNED(early_zerop), 0, PAGE_SIZE); 5636 #if defined(DIAGNOSTIC) || defined(XENPV) 5637 pmap_pte_set(early_zero_pte, 0); 5638 pmap_pte_flush(); 5639 #endif /* defined(DIAGNOSTIC) */ 5640 kpreempt_enable(); 5641 #endif /* defined(__HAVE_DIRECT_MAP) */ 5642 } else { 5643 /* XXX */ 5644 ptp = uvm_pagealloc(NULL, 0, NULL, 5645 UVM_PGA_USERESERVE|UVM_PGA_ZERO); 5646 if (ptp == NULL) 5647 panic("%s: out of memory", __func__); 5648 ptp->flags &= ~PG_BUSY; 5649 ptp->wire_count = 1; 5650 pa = VM_PAGE_TO_PHYS(ptp); 5651 } 5652 pmap_stats_update(kpm, 1, 0); 5653 5654 return pa; 5655 } 5656 5657 /* 5658 * Expand the page tree with the specified amount of PTPs, mapping virtual 5659 * addresses starting at kva. We populate all the levels but the last one 5660 * (L1). The nodes of the tree are created as RW, but the pages covered 5661 * will be kentered in L1, with proper permissions. 5662 * 5663 * Used only by pmap_growkernel. 5664 */ 5665 static void 5666 pmap_alloc_level(struct pmap *cpm, vaddr_t kva, long *needed_ptps) 5667 { 5668 unsigned long i; 5669 paddr_t pa; 5670 unsigned long index, endindex; 5671 int level; 5672 pd_entry_t *pdep; 5673 #ifdef XENPV 5674 int s = splvm(); /* protect xpq_* */ 5675 #endif 5676 5677 for (level = PTP_LEVELS; level > 1; level--) { 5678 if (level == PTP_LEVELS) 5679 pdep = cpm->pm_pdir; 5680 else 5681 pdep = normal_pdes[level - 2]; 5682 index = pl_i_roundup(kva, level); 5683 endindex = index + needed_ptps[level - 1] - 1; 5684 5685 for (i = index; i <= endindex; i++) { 5686 pt_entry_t pte; 5687 5688 KASSERT(!pmap_valid_entry(pdep[i])); 5689 pa = pmap_get_physpage(); 5690 pte = pmap_pa2pte(pa) | PTE_P | PTE_W; 5691 #ifdef __x86_64__ 5692 pte |= pmap_pg_nx; 5693 #endif 5694 pmap_pte_set(&pdep[i], pte); 5695 5696 #ifdef XENPV 5697 if (level == PTP_LEVELS && i >= PDIR_SLOT_KERN) { 5698 if (__predict_true( 5699 cpu_info_primary.ci_flags & CPUF_PRESENT)) { 5700 /* update per-cpu PMDs on all cpus */ 5701 xen_kpm_sync(pmap_kernel(), i); 5702 } else { 5703 /* 5704 * too early; update primary CPU 5705 * PMD only (without locks) 5706 */ 5707 #ifdef __x86_64__ 5708 pd_entry_t *cpu_pdep = 5709 &cpu_info_primary.ci_kpm_pdir[i]; 5710 #else 5711 pd_entry_t *cpu_pdep = 5712 &cpu_info_primary.ci_kpm_pdir[l2tol2(i)]; 5713 #endif 5714 pmap_pte_set(cpu_pdep, pte); 5715 } 5716 } 5717 #endif 5718 5719 KASSERT(level != PTP_LEVELS || nkptp[level - 1] + 5720 pl_i(VM_MIN_KERNEL_ADDRESS, level) == i); 5721 nkptp[level - 1]++; 5722 } 5723 pmap_pte_flush(); 5724 } 5725 #ifdef XENPV 5726 splx(s); 5727 #endif 5728 } 5729 5730 /* 5731 * pmap_growkernel: increase usage of KVM space. 5732 * 5733 * => we allocate new PTPs for the kernel and install them in all 5734 * the pmaps on the system. 
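 *
 * Illustrative caller sketch (assumed, simplified): the VM system grows
 * kernel VA lazily when a mapping would extend past the current limit:
 *
 *	new_limit = pmap_growkernel(new_end);
 *
 * If new_end is not beyond the current limit the call returns immediately;
 * otherwise the return value is the new usable limit, rounded up to a PDE
 * boundary.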
5735 */ 5736 vaddr_t 5737 pmap_growkernel(vaddr_t maxkvaddr) 5738 { 5739 struct pmap *kpm = pmap_kernel(); 5740 struct pmap *cpm; 5741 #if !defined(XENPV) || !defined(__x86_64__) 5742 struct pmap *pm; 5743 long old; 5744 #endif 5745 int s, i; 5746 long needed_kptp[PTP_LEVELS], target_nptp; 5747 bool invalidate = false; 5748 5749 s = splvm(); /* to be safe */ 5750 mutex_enter(&kpm->pm_lock); 5751 5752 if (maxkvaddr <= pmap_maxkvaddr) { 5753 mutex_exit(&kpm->pm_lock); 5754 splx(s); 5755 return pmap_maxkvaddr; 5756 } 5757 5758 maxkvaddr = x86_round_pdr(maxkvaddr); 5759 #if !defined(XENPV) || !defined(__x86_64__) 5760 old = nkptp[PTP_LEVELS - 1]; 5761 #endif 5762 5763 /* Initialize needed_kptp. */ 5764 for (i = PTP_LEVELS - 1; i >= 1; i--) { 5765 target_nptp = pl_i_roundup(maxkvaddr, i + 1) - 5766 pl_i_roundup(VM_MIN_KERNEL_ADDRESS, i + 1); 5767 5768 if (target_nptp > nkptpmax[i]) 5769 panic("out of KVA space"); 5770 KASSERT(target_nptp >= nkptp[i]); 5771 needed_kptp[i] = target_nptp - nkptp[i]; 5772 } 5773 5774 #ifdef XENPV 5775 /* only pmap_kernel(), or the per-cpu map, has kernel entries */ 5776 cpm = kpm; 5777 #else 5778 /* Get the current pmap */ 5779 if (__predict_true(cpu_info_primary.ci_flags & CPUF_PRESENT)) { 5780 cpm = curcpu()->ci_pmap; 5781 } else { 5782 cpm = kpm; 5783 } 5784 #endif 5785 5786 kasan_shadow_map((void *)pmap_maxkvaddr, 5787 (size_t)(maxkvaddr - pmap_maxkvaddr)); 5788 kmsan_shadow_map((void *)pmap_maxkvaddr, 5789 (size_t)(maxkvaddr - pmap_maxkvaddr)); 5790 5791 pmap_alloc_level(cpm, pmap_maxkvaddr, needed_kptp); 5792 5793 /* 5794 * If the number of top level entries changed, update all pmaps. 5795 */ 5796 if (needed_kptp[PTP_LEVELS - 1] != 0) { 5797 #ifdef XENPV 5798 #ifdef __x86_64__ 5799 /* nothing, kernel entries are never entered in user pmap */ 5800 #else 5801 int pdkidx; 5802 5803 mutex_enter(&pmaps_lock); 5804 LIST_FOREACH(pm, &pmaps, pm_list) { 5805 for (pdkidx = PDIR_SLOT_KERN + old; 5806 pdkidx < PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1]; 5807 pdkidx++) { 5808 pmap_pte_set(&pm->pm_pdir[pdkidx], 5809 kpm->pm_pdir[pdkidx]); 5810 } 5811 pmap_pte_flush(); 5812 } 5813 mutex_exit(&pmaps_lock); 5814 #endif /* __x86_64__ */ 5815 #else /* XENPV */ 5816 size_t newpdes; 5817 newpdes = nkptp[PTP_LEVELS - 1] - old; 5818 if (cpm != kpm) { 5819 memcpy(&kpm->pm_pdir[PDIR_SLOT_KERN + old], 5820 &cpm->pm_pdir[PDIR_SLOT_KERN + old], 5821 newpdes * sizeof(pd_entry_t)); 5822 } 5823 5824 mutex_enter(&pmaps_lock); 5825 LIST_FOREACH(pm, &pmaps, pm_list) { 5826 if (__predict_false(pm->pm_enter != NULL)) { 5827 /* 5828 * Not a native pmap, the kernel is not mapped, 5829 * so nothing to synchronize. 5830 */ 5831 continue; 5832 } 5833 memcpy(&pm->pm_pdir[PDIR_SLOT_KERN + old], 5834 &kpm->pm_pdir[PDIR_SLOT_KERN + old], 5835 newpdes * sizeof(pd_entry_t)); 5836 } 5837 mutex_exit(&pmaps_lock); 5838 #endif 5839 invalidate = true; 5840 } 5841 pmap_maxkvaddr = maxkvaddr; 5842 mutex_exit(&kpm->pm_lock); 5843 splx(s); 5844 5845 if (invalidate && pmap_initialized) { 5846 /* Invalidate the pmap cache. 
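 * Pre-constructed pmaps sitting in the pool cache hold a copy of the kernel
 * PDEs taken when they were built; after adding new top-level entries those
 * copies are stale, so the cached objects are dropped and future allocations
 * pick up the grown kernel page directory.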
*/ 5847 pool_cache_invalidate(&pmap_cache); 5848 } 5849 5850 return maxkvaddr; 5851 } 5852 5853 #ifdef DEBUG 5854 void pmap_dump(struct pmap *, vaddr_t, vaddr_t); 5855 5856 /* 5857 * pmap_dump: dump all the mappings from a pmap 5858 * 5859 * => caller should not be holding any pmap locks 5860 */ 5861 void 5862 pmap_dump(struct pmap *pmap, vaddr_t sva, vaddr_t eva) 5863 { 5864 pt_entry_t *ptes, *pte; 5865 pd_entry_t * const *pdes; 5866 struct pmap *pmap2; 5867 vaddr_t blkendva; 5868 int lvl; 5869 5870 /* 5871 * if end is out of range truncate. 5872 * if (end == start) update to max. 5873 */ 5874 5875 if (eva > VM_MAXUSER_ADDRESS || eva <= sva) 5876 eva = VM_MAXUSER_ADDRESS; 5877 5878 mutex_enter(&pmap->pm_lock); 5879 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); 5880 5881 /* 5882 * dumping a range of pages: we dump in PTP sized blocks (4MB) 5883 */ 5884 5885 for (/* null */ ; sva < eva ; sva = blkendva) { 5886 5887 /* determine range of block */ 5888 blkendva = x86_round_pdr(sva+1); 5889 if (blkendva > eva) 5890 blkendva = eva; 5891 5892 /* valid block? */ 5893 if (!pmap_pdes_valid(sva, pdes, NULL, &lvl)) 5894 continue; 5895 KASSERT(lvl == 1); 5896 5897 pte = &ptes[pl1_i(sva)]; 5898 for (/* null */; sva < blkendva ; sva += PAGE_SIZE, pte++) { 5899 if (!pmap_valid_entry(*pte)) 5900 continue; 5901 printf("va %#" PRIxVADDR " -> pa %#" PRIxPADDR 5902 " (pte=%#" PRIxPADDR ")\n", 5903 sva, (paddr_t)pmap_pte2pa(*pte), (paddr_t)*pte); 5904 } 5905 } 5906 pmap_unmap_ptes(pmap, pmap2); 5907 mutex_exit(&pmap->pm_lock); 5908 } 5909 #endif 5910 5911 /* 5912 * pmap_update: process deferred invalidations and frees. 5913 */ 5914 void 5915 pmap_update(struct pmap *pmap) 5916 { 5917 struct pmap_page *pp; 5918 struct vm_page *ptp; 5919 5920 /* 5921 * Initiate any pending TLB shootdowns. Wait for them to 5922 * complete before returning control to the caller. 5923 */ 5924 kpreempt_disable(); 5925 pmap_tlb_shootnow(); 5926 kpreempt_enable(); 5927 5928 /* 5929 * Now that shootdowns are complete, process deferred frees. This 5930 * is an unlocked check, but is safe as we're only interested in 5931 * work done in this LWP - we won't get a false negative. 5932 */ 5933 if (atomic_load_relaxed(&pmap->pm_gc_ptp.lh_first) == NULL) { 5934 return; 5935 } 5936 5937 mutex_enter(&pmap->pm_lock); 5938 while ((ptp = LIST_FIRST(&pmap->pm_gc_ptp)) != NULL) { 5939 KASSERT(ptp->wire_count == 0); 5940 KASSERT(ptp->uanon == NULL); 5941 LIST_REMOVE(ptp, mdpage.mp_pp.pp_link); 5942 pp = VM_PAGE_TO_PP(ptp); 5943 LIST_INIT(&pp->pp_pvlist); 5944 pp->pp_attrs = 0; 5945 pp->pp_pte.pte_ptp = NULL; 5946 pp->pp_pte.pte_va = 0; 5947 PMAP_CHECK_PP(VM_PAGE_TO_PP(ptp)); 5948 5949 /* 5950 * XXX Hack to avoid extra locking, and lock 5951 * assertions in uvm_pagefree(). Despite uobject 5952 * being set, this isn't a managed page. 
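 * uvm_pagerealloc(ptp, NULL, 0) detaches the PTP from the pmap's dummy
 * uvm_object so that uvm_pagefree() treats it as an unowned page; the
 * PMAP_DUMMY_LOCK/UNLOCK pair only satisfies the object-lock assertions
 * made on that path.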
5953 */ 5954 PMAP_DUMMY_LOCK(pmap); 5955 uvm_pagerealloc(ptp, NULL, 0); 5956 PMAP_DUMMY_UNLOCK(pmap); 5957 uvm_pagefree(ptp); 5958 } 5959 mutex_exit(&pmap->pm_lock); 5960 } 5961 5962 #if PTP_LEVELS > 4 5963 #error "Unsupported number of page table mappings" 5964 #endif 5965 5966 paddr_t 5967 pmap_init_tmp_pgtbl(paddr_t pg) 5968 { 5969 static bool maps_loaded; 5970 static const paddr_t x86_tmp_pml_paddr[] = { 5971 4 * PAGE_SIZE, /* L1 */ 5972 5 * PAGE_SIZE, /* L2 */ 5973 6 * PAGE_SIZE, /* L3 */ 5974 7 * PAGE_SIZE /* L4 */ 5975 }; 5976 static vaddr_t x86_tmp_pml_vaddr[] = { 0, 0, 0, 0 }; 5977 5978 pd_entry_t *tmp_pml, *kernel_pml; 5979 5980 int level; 5981 5982 if (!maps_loaded) { 5983 for (level = 0; level < PTP_LEVELS; ++level) { 5984 x86_tmp_pml_vaddr[level] = 5985 uvm_km_alloc(kernel_map, PAGE_SIZE, 0, 5986 UVM_KMF_VAONLY); 5987 5988 if (x86_tmp_pml_vaddr[level] == 0) 5989 panic("mapping of real mode PML failed\n"); 5990 pmap_kenter_pa(x86_tmp_pml_vaddr[level], 5991 x86_tmp_pml_paddr[level], 5992 VM_PROT_READ | VM_PROT_WRITE, 0); 5993 } 5994 pmap_update(pmap_kernel()); 5995 maps_loaded = true; 5996 } 5997 5998 /* Zero levels 1-3 */ 5999 for (level = 0; level < PTP_LEVELS - 1; ++level) { 6000 tmp_pml = (void *)x86_tmp_pml_vaddr[level]; 6001 memset(PAGE_ALIGNED(tmp_pml), 0, PAGE_SIZE); 6002 } 6003 6004 /* Copy PML4 */ 6005 kernel_pml = pmap_kernel()->pm_pdir; 6006 tmp_pml = (void *)x86_tmp_pml_vaddr[PTP_LEVELS - 1]; 6007 memcpy(PAGE_ALIGNED(tmp_pml), PAGE_ALIGNED(kernel_pml), PAGE_SIZE); 6008 6009 #ifdef PAE 6010 /* 6011 * Use the last 4 entries of the L2 page as L3 PD entries. These 6012 * last entries are unlikely to be used for temporary mappings. 6013 * 508: maps 0->1GB (userland) 6014 * 509: unused 6015 * 510: unused 6016 * 511: maps 3->4GB (kernel) 6017 */ 6018 tmp_pml[508] = x86_tmp_pml_paddr[PTP_LEVELS - 1] | PTE_P; 6019 tmp_pml[509] = 0; 6020 tmp_pml[510] = 0; 6021 tmp_pml[511] = pmap_pdirpa(pmap_kernel(), PDIR_SLOT_KERN) | PTE_P; 6022 #endif 6023 6024 for (level = PTP_LEVELS - 1; level > 0; --level) { 6025 tmp_pml = (void *)x86_tmp_pml_vaddr[level]; 6026 6027 tmp_pml[pl_i(pg, level + 1)] = 6028 (x86_tmp_pml_paddr[level - 1] & PTE_FRAME) | PTE_W | PTE_P; 6029 } 6030 6031 tmp_pml = (void *)x86_tmp_pml_vaddr[0]; 6032 tmp_pml[pl_i(pg, 1)] = (pg & PTE_FRAME) | PTE_W | PTE_P; 6033 6034 #ifdef PAE 6035 /* Return the PA of the L3 page (entry 508 of the L2 page) */ 6036 return x86_tmp_pml_paddr[PTP_LEVELS - 1] + 508 * sizeof(pd_entry_t); 6037 #endif 6038 6039 return x86_tmp_pml_paddr[PTP_LEVELS - 1]; 6040 } 6041 6042 u_int 6043 x86_mmap_flags(paddr_t mdpgno) 6044 { 6045 u_int nflag = (mdpgno >> X86_MMAP_FLAG_SHIFT) & X86_MMAP_FLAG_MASK; 6046 u_int pflag = 0; 6047 6048 if (nflag & X86_MMAP_FLAG_PREFETCH) 6049 pflag |= PMAP_WRITE_COMBINE; 6050 6051 return pflag; 6052 } 6053 6054 #if defined(__HAVE_DIRECT_MAP) && defined(__x86_64__) && !defined(XENPV) 6055 6056 /* 6057 * ----------------------------------------------------------------------------- 6058 * ***************************************************************************** 6059 * ***************************************************************************** 6060 * ***************************************************************************** 6061 * ***************************************************************************** 6062 * **************** HERE BEGINS THE EPT CODE, USED BY INTEL-VMX **************** 6063 * ***************************************************************************** 6064 * 
***************************************************************************** 6065 * ***************************************************************************** 6066 * ***************************************************************************** 6067 * ----------------------------------------------------------------------------- 6068 * 6069 * These functions are invoked as callbacks from the code above. Contrary to 6070 * native, EPT does not have a recursive slot; therefore, it is not possible 6071 * to call pmap_map_ptes(). Instead, we use the direct map and walk down the 6072 * tree manually. 6073 * 6074 * Apart from that, the logic is mostly the same as native. Once a pmap has 6075 * been created, NVMM calls pmap_ept_transform() to make it an EPT pmap. 6076 * After that we're good, and the callbacks will handle the translations 6077 * for us. 6078 * 6079 * ----------------------------------------------------------------------------- 6080 */ 6081 6082 /* Hardware bits. */ 6083 #define EPT_R __BIT(0) /* read */ 6084 #define EPT_W __BIT(1) /* write */ 6085 #define EPT_X __BIT(2) /* execute */ 6086 #define EPT_T __BITS(5,3) /* type */ 6087 #define TYPE_UC 0 6088 #define TYPE_WC 1 6089 #define TYPE_WT 4 6090 #define TYPE_WP 5 6091 #define TYPE_WB 6 6092 #define EPT_NOPAT __BIT(6) 6093 #define EPT_L __BIT(7) /* large */ 6094 #define EPT_A __BIT(8) /* accessed */ 6095 #define EPT_D __BIT(9) /* dirty */ 6096 /* Software bits. */ 6097 #define EPT_PVLIST __BIT(60) 6098 #define EPT_WIRED __BIT(61) 6099 6100 #define pmap_ept_valid_entry(pte) (pte & EPT_R) 6101 6102 bool pmap_ept_has_ad __read_mostly; 6103 6104 static inline void 6105 pmap_ept_stats_update_bypte(struct pmap *pmap, pt_entry_t npte, pt_entry_t opte) 6106 { 6107 int resid_diff = ((npte & EPT_R) ? 1 : 0) - ((opte & EPT_R) ? 1 : 0); 6108 int wired_diff = ((npte & EPT_WIRED) ? 1 : 0) - ((opte & EPT_WIRED) ? 1 : 0); 6109 6110 KASSERT((npte & (EPT_R | EPT_WIRED)) != EPT_WIRED); 6111 KASSERT((opte & (EPT_R | EPT_WIRED)) != EPT_WIRED); 6112 6113 pmap_stats_update(pmap, resid_diff, wired_diff); 6114 } 6115 6116 static pt_entry_t 6117 pmap_ept_type(u_int flags) 6118 { 6119 u_int cacheflags = (flags & PMAP_CACHE_MASK); 6120 pt_entry_t ret; 6121 6122 switch (cacheflags) { 6123 case PMAP_NOCACHE: 6124 case PMAP_NOCACHE_OVR: 6125 ret = __SHIFTIN(TYPE_UC, EPT_T); 6126 break; 6127 case PMAP_WRITE_COMBINE: 6128 ret = __SHIFTIN(TYPE_WC, EPT_T); 6129 break; 6130 case PMAP_WRITE_BACK: 6131 default: 6132 ret = __SHIFTIN(TYPE_WB, EPT_T); 6133 break; 6134 } 6135 6136 ret |= EPT_NOPAT; 6137 return ret; 6138 } 6139 6140 static inline pt_entry_t 6141 pmap_ept_prot(vm_prot_t prot) 6142 { 6143 pt_entry_t res = 0; 6144 6145 if (prot & VM_PROT_READ) 6146 res |= EPT_R; 6147 if (prot & VM_PROT_WRITE) 6148 res |= EPT_W; 6149 if (prot & VM_PROT_EXECUTE) 6150 res |= EPT_X; 6151 6152 return res; 6153 } 6154 6155 static inline uint8_t 6156 pmap_ept_to_pp_attrs(pt_entry_t ept) 6157 { 6158 uint8_t ret = 0; 6159 if (pmap_ept_has_ad) { 6160 if (ept & EPT_D) 6161 ret |= PP_ATTRS_D; 6162 if (ept & EPT_A) 6163 ret |= PP_ATTRS_A; 6164 } else { 6165 ret |= (PP_ATTRS_D|PP_ATTRS_A); 6166 } 6167 if (ept & EPT_W) 6168 ret |= PP_ATTRS_W; 6169 return ret; 6170 } 6171 6172 static inline pt_entry_t 6173 pmap_pp_attrs_to_ept(uint8_t attrs) 6174 { 6175 pt_entry_t ept = 0; 6176 if (attrs & PP_ATTRS_D) 6177 ept |= EPT_D; 6178 if (attrs & PP_ATTRS_A) 6179 ept |= EPT_A; 6180 if (attrs & PP_ATTRS_W) 6181 ept |= EPT_W; 6182 return ept; 6183 } 6184 6185 /* 6186 * Helper for pmap_ept_free_ptp. 
6187 * tree[0] = &L2[L2idx] 6188 * tree[1] = &L3[L3idx] 6189 * tree[2] = &L4[L4idx] 6190 */ 6191 static void 6192 pmap_ept_get_tree(struct pmap *pmap, vaddr_t va, pd_entry_t **tree) 6193 { 6194 pt_entry_t *pteva; 6195 paddr_t ptepa; 6196 int i, index; 6197 6198 ptepa = pmap->pm_pdirpa[0]; 6199 for (i = PTP_LEVELS; i > 1; i--) { 6200 index = pl_pi(va, i); 6201 pteva = (pt_entry_t *)PMAP_DIRECT_MAP(ptepa); 6202 KASSERT(pmap_ept_valid_entry(pteva[index])); 6203 tree[i - 2] = &pteva[index]; 6204 ptepa = pmap_pte2pa(pteva[index]); 6205 } 6206 } 6207 6208 static void 6209 pmap_ept_free_ptp(struct pmap *pmap, struct vm_page *ptp, vaddr_t va) 6210 { 6211 pd_entry_t *tree[3]; 6212 int level; 6213 6214 KASSERT(pmap != pmap_kernel()); 6215 KASSERT(mutex_owned(&pmap->pm_lock)); 6216 KASSERT(kpreempt_disabled()); 6217 6218 pmap_ept_get_tree(pmap, va, tree); 6219 6220 level = 1; 6221 do { 6222 (void)pmap_pte_testset(tree[level - 1], 0); 6223 6224 pmap_freepage(pmap, ptp, level); 6225 if (level < PTP_LEVELS - 1) { 6226 ptp = pmap_find_ptp(pmap, va, level + 1); 6227 ptp->wire_count--; 6228 if (ptp->wire_count > 1) 6229 break; 6230 } 6231 } while (++level < PTP_LEVELS); 6232 pmap_pte_flush(); 6233 } 6234 6235 /* Allocate L4->L3->L2. Return L2. */ 6236 static void 6237 pmap_ept_install_ptp(struct pmap *pmap, struct pmap_ptparray *pt, vaddr_t va) 6238 { 6239 struct vm_page *ptp; 6240 unsigned long index; 6241 pd_entry_t *pteva; 6242 paddr_t ptepa; 6243 int i; 6244 6245 KASSERT(pmap != pmap_kernel()); 6246 KASSERT(mutex_owned(&pmap->pm_lock)); 6247 KASSERT(kpreempt_disabled()); 6248 6249 /* 6250 * Now that we have all the pages looked up or allocated, 6251 * loop through again installing any new ones into the tree. 6252 */ 6253 ptepa = pmap->pm_pdirpa[0]; 6254 for (i = PTP_LEVELS; i > 1; i--) { 6255 index = pl_pi(va, i); 6256 pteva = (pt_entry_t *)PMAP_DIRECT_MAP(ptepa); 6257 6258 if (pmap_ept_valid_entry(pteva[index])) { 6259 KASSERT(!pt->alloced[i]); 6260 ptepa = pmap_pte2pa(pteva[index]); 6261 continue; 6262 } 6263 6264 ptp = pt->pg[i]; 6265 ptp->flags &= ~PG_BUSY; /* never busy */ 6266 ptp->wire_count = 1; 6267 pmap->pm_ptphint[i - 2] = ptp; 6268 ptepa = VM_PAGE_TO_PHYS(ptp); 6269 pmap_pte_set(&pteva[index], ptepa | EPT_R | EPT_W | EPT_X); 6270 6271 pmap_pte_flush(); 6272 pmap_stats_update(pmap, 1, 0); 6273 6274 /* 6275 * If we're not in the top level, increase the 6276 * wire count of the parent page. 
6277 */ 6278 if (i < PTP_LEVELS) { 6279 pt->pg[i + 1]->wire_count++; 6280 } 6281 } 6282 } 6283 6284 static int 6285 pmap_ept_enter(struct pmap *pmap, vaddr_t va, paddr_t pa, vm_prot_t prot, 6286 u_int flags) 6287 { 6288 pt_entry_t *ptes, opte, npte; 6289 pt_entry_t *ptep; 6290 struct vm_page *ptp; 6291 struct vm_page *new_pg, *old_pg; 6292 struct pmap_page *new_pp, *old_pp; 6293 struct pv_entry *old_pve, *new_pve; 6294 bool wired = (flags & PMAP_WIRED) != 0; 6295 bool accessed; 6296 struct pmap_ptparray pt; 6297 int error; 6298 bool getptp, samepage, new_embedded; 6299 rb_tree_t *tree; 6300 6301 KASSERT(pmap_initialized); 6302 KASSERT(va < VM_MAXUSER_ADDRESS); 6303 6304 npte = pa | pmap_ept_prot(prot) | pmap_ept_type(flags); 6305 6306 if (wired) 6307 npte |= EPT_WIRED; 6308 if (flags & VM_PROT_ALL) { 6309 npte |= EPT_A; 6310 if (flags & VM_PROT_WRITE) { 6311 KASSERT((npte & EPT_W) != 0); 6312 npte |= EPT_D; 6313 } 6314 } 6315 6316 new_pg = PHYS_TO_VM_PAGE(pa); 6317 if (new_pg != NULL) { 6318 /* This is a managed page */ 6319 npte |= EPT_PVLIST; 6320 new_pp = VM_PAGE_TO_PP(new_pg); 6321 } else if ((new_pp = pmap_pv_tracked(pa)) != NULL) { 6322 /* This is an unmanaged pv-tracked page */ 6323 npte |= EPT_PVLIST; 6324 } else { 6325 new_pp = NULL; 6326 } 6327 6328 /* Begin by locking the pmap. */ 6329 mutex_enter(&pmap->pm_lock); 6330 6331 /* Look up the PTP. Allocate if none present. */ 6332 ptp = NULL; 6333 getptp = false; 6334 if (pmap != pmap_kernel()) { 6335 ptp = pmap_find_ptp(pmap, va, 1); 6336 if (ptp == NULL) { 6337 getptp = true; 6338 error = pmap_get_ptp(pmap, &pt, va, flags, &ptp); 6339 if (error != 0) { 6340 if (flags & PMAP_CANFAIL) { 6341 mutex_exit(&pmap->pm_lock); 6342 return error; 6343 } 6344 panic("%s: get ptp failed, error=%d", __func__, 6345 error); 6346 } 6347 } 6348 tree = &VM_PAGE_TO_PP(ptp)->pp_rb; 6349 } else { 6350 /* Embedded PV entries rely on this. */ 6351 KASSERT(va != 0); 6352 tree = &pmap_kernel_rb; 6353 } 6354 6355 /* 6356 * Look up the old PV entry at this VA (if any), and insert a new PV 6357 * entry if required for the new mapping. Temporarily track the old 6358 * and new mappings concurrently. Only after the old mapping is 6359 * evicted from the pmap will we remove its PV entry. Otherwise, 6360 * our picture of modified/accessed state for either page could get 6361 * out of sync (we need any P->V operation for either page to stall 6362 * on pmap->pm_lock until done here). 6363 */ 6364 new_pve = NULL; 6365 old_pve = NULL; 6366 samepage = false; 6367 new_embedded = false; 6368 6369 if (new_pp != NULL) { 6370 error = pmap_enter_pv(pmap, new_pp, ptp, va, &new_pve, 6371 &old_pve, &samepage, &new_embedded, tree); 6372 6373 /* 6374 * If a new pv_entry was needed and none was available, we 6375 * can go no further. 6376 */ 6377 if (error != 0) { 6378 if (flags & PMAP_CANFAIL) { 6379 if (getptp) { 6380 pmap_unget_ptp(pmap, &pt); 6381 } 6382 mutex_exit(&pmap->pm_lock); 6383 return error; 6384 } 6385 panic("%s: alloc pve failed", __func__); 6386 } 6387 } else { 6388 old_pve = pmap_treelookup_pv(pmap, ptp, tree, va); 6389 } 6390 6391 /* Map PTEs into address space. */ 6392 kpreempt_disable(); 6393 6394 /* Install any newly allocated PTPs. */ 6395 if (getptp) { 6396 pmap_ept_install_ptp(pmap, &pt, va); 6397 } 6398 6399 /* Check if there is an existing mapping. 
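 * The PTP's PTEs are reached through the direct map; as noted at the
 * top of the EPT section, there is no recursive slot, so
 * pmap_map_ptes() cannot be used here.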
*/ 6400 ptes = (pt_entry_t *)PMAP_DIRECT_MAP(VM_PAGE_TO_PHYS(ptp)); 6401 ptep = &ptes[pl1_pi(va)]; 6402 opte = *ptep; 6403 bool have_oldpa = pmap_ept_valid_entry(opte); 6404 paddr_t oldpa = pmap_pte2pa(opte); 6405 6406 /* 6407 * Update the pte. 6408 */ 6409 do { 6410 opte = *ptep; 6411 6412 /* 6413 * if the same page, inherit PTE_A and PTE_D. 6414 */ 6415 if (((opte ^ npte) & (PTE_FRAME | EPT_R)) == 0) { 6416 npte |= opte & (EPT_A | EPT_D); 6417 } 6418 } while (pmap_pte_cas(ptep, opte, npte) != opte); 6419 6420 /* 6421 * Done with the PTEs: they can now be unmapped. 6422 */ 6423 kpreempt_enable(); 6424 6425 /* 6426 * Update statistics and PTP's reference count. 6427 */ 6428 pmap_ept_stats_update_bypte(pmap, npte, opte); 6429 if (ptp != NULL) { 6430 if (!have_oldpa) { 6431 ptp->wire_count++; 6432 } 6433 /* Remember minimum VA in PTP. */ 6434 pmap_ptp_range_set(ptp, va); 6435 } 6436 KASSERT(ptp == NULL || ptp->wire_count > 1); 6437 6438 /* 6439 * If the same page, we can skip pv_entry handling. 6440 */ 6441 if (((opte ^ npte) & (PTE_FRAME | EPT_R)) == 0) { 6442 KASSERT(((opte ^ npte) & EPT_PVLIST) == 0); 6443 if ((npte & EPT_PVLIST) != 0) { 6444 KASSERT(samepage); 6445 pmap_check_pv(pmap, ptp, new_pp, va, true); 6446 } 6447 goto same_pa; 6448 } else if ((npte & EPT_PVLIST) != 0) { 6449 KASSERT(!samepage); 6450 } 6451 6452 /* 6453 * If old page is pv-tracked, remove pv_entry from its list. 6454 */ 6455 if ((~opte & (EPT_R | EPT_PVLIST)) == 0) { 6456 if ((old_pg = PHYS_TO_VM_PAGE(oldpa)) != NULL) { 6457 old_pp = VM_PAGE_TO_PP(old_pg); 6458 } else if ((old_pp = pmap_pv_tracked(oldpa)) == NULL) { 6459 panic("%s: EPT_PVLIST with pv-untracked page" 6460 " va = %#"PRIxVADDR 6461 " pa = %#" PRIxPADDR " (%#" PRIxPADDR ")", 6462 __func__, va, oldpa, atop(pa)); 6463 } 6464 6465 pmap_remove_pv(pmap, old_pp, ptp, va, old_pve, 6466 pmap_ept_to_pp_attrs(opte)); 6467 } else { 6468 KASSERT(old_pve == NULL); 6469 KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == NULL); 6470 } 6471 6472 /* 6473 * If new page is dynamically PV tracked, insert to tree. 6474 */ 6475 if (new_pve != NULL) { 6476 KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == NULL); 6477 old_pve = rb_tree_insert_node(tree, new_pve); 6478 KASSERT(old_pve == new_pve); 6479 pmap_check_pv(pmap, ptp, new_pp, va, true); 6480 } 6481 6482 same_pa: 6483 /* 6484 * shootdown tlb if necessary. 6485 */ 6486 6487 if (pmap_ept_has_ad) { 6488 accessed = (~opte & (EPT_R | EPT_A)) == 0; 6489 } else { 6490 accessed = (opte & EPT_R) != 0; 6491 } 6492 if (accessed && ((opte ^ npte) & (PTE_FRAME | EPT_W)) != 0) { 6493 pmap_tlb_shootdown(pmap, va, 0, TLBSHOOT_ENTER); 6494 } 6495 pmap_drain_pv(pmap); 6496 mutex_exit(&pmap->pm_lock); 6497 return 0; 6498 } 6499 6500 /* Pay close attention, this returns L2. 
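 * Returns 0 when every level down to the L2 entry is present, storing
 * that L2 entry in *lastpde; otherwise returns the level at which an
 * invalid entry was found.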
*/ 6501 static int 6502 pmap_ept_pdes_invalid(struct pmap *pmap, vaddr_t va, pd_entry_t *lastpde) 6503 { 6504 pt_entry_t *pteva; 6505 paddr_t ptepa; 6506 int i, index; 6507 6508 KASSERT(mutex_owned(&pmap->pm_lock)); 6509 6510 ptepa = pmap->pm_pdirpa[0]; 6511 for (i = PTP_LEVELS; i > 1; i--) { 6512 pteva = (pt_entry_t *)PMAP_DIRECT_MAP(ptepa); 6513 index = pl_pi(va, i); 6514 if (!pmap_ept_valid_entry(pteva[index])) 6515 return i; 6516 ptepa = pmap_pte2pa(pteva[index]); 6517 } 6518 if (lastpde != NULL) { 6519 *lastpde = pteva[index]; 6520 } 6521 6522 return 0; 6523 } 6524 6525 static bool 6526 pmap_ept_extract(struct pmap *pmap, vaddr_t va, paddr_t *pap) 6527 { 6528 pt_entry_t *ptes, pte; 6529 pd_entry_t pde; 6530 paddr_t ptppa, pa; 6531 bool rv; 6532 6533 #ifdef __HAVE_DIRECT_MAP 6534 if (va >= PMAP_DIRECT_BASE && va < PMAP_DIRECT_END) { 6535 if (pap != NULL) { 6536 *pap = PMAP_DIRECT_UNMAP(va); 6537 } 6538 return true; 6539 } 6540 #endif 6541 6542 rv = false; 6543 pa = 0; 6544 6545 mutex_enter(&pmap->pm_lock); 6546 kpreempt_disable(); 6547 6548 if (!pmap_ept_pdes_invalid(pmap, va, &pde)) { 6549 ptppa = pmap_pte2pa(pde); 6550 ptes = (pt_entry_t *)PMAP_DIRECT_MAP(ptppa); 6551 pte = ptes[pl1_pi(va)]; 6552 if (__predict_true((pte & EPT_R) != 0)) { 6553 pa = pmap_pte2pa(pte) | (va & (NBPD_L1 - 1)); 6554 rv = true; 6555 } 6556 } 6557 6558 kpreempt_enable(); 6559 mutex_exit(&pmap->pm_lock); 6560 6561 if (pap != NULL) { 6562 *pap = pa; 6563 } 6564 return rv; 6565 } 6566 6567 static bool 6568 pmap_ept_remove_pte(struct pmap *pmap, struct vm_page *ptp, pt_entry_t *pte, 6569 vaddr_t va) 6570 { 6571 struct pv_entry *pve; 6572 struct vm_page *pg; 6573 struct pmap_page *pp; 6574 pt_entry_t opte; 6575 bool accessed; 6576 6577 KASSERT(pmap != pmap_kernel()); 6578 KASSERT(mutex_owned(&pmap->pm_lock)); 6579 KASSERT(kpreempt_disabled()); 6580 6581 if (!pmap_ept_valid_entry(*pte)) { 6582 /* VA not mapped. */ 6583 return false; 6584 } 6585 6586 /* Atomically save the old PTE and zap it. */ 6587 opte = pmap_pte_testset(pte, 0); 6588 if (!pmap_ept_valid_entry(opte)) { 6589 return false; 6590 } 6591 6592 pmap_ept_stats_update_bypte(pmap, 0, opte); 6593 6594 if (ptp) { 6595 /* 6596 * Dropping a PTE. Make sure that the PDE is flushed. 6597 */ 6598 ptp->wire_count--; 6599 if (ptp->wire_count <= 1) { 6600 opte |= EPT_A; 6601 } 6602 } 6603 6604 if (pmap_ept_has_ad) { 6605 accessed = (opte & EPT_A) != 0; 6606 } else { 6607 accessed = true; 6608 } 6609 if (accessed) { 6610 pmap_tlb_shootdown(pmap, va, 0, TLBSHOOT_REMOVE_PTE); 6611 } 6612 6613 /* 6614 * If we are not on a pv list - we are done. 6615 */ 6616 if ((opte & EPT_PVLIST) == 0) { 6617 KASSERTMSG((PHYS_TO_VM_PAGE(pmap_pte2pa(opte)) == NULL), 6618 "managed page without EPT_PVLIST for %#"PRIxVADDR, va); 6619 KASSERTMSG((pmap_pv_tracked(pmap_pte2pa(opte)) == NULL), 6620 "pv-tracked page without EPT_PVLIST for %#"PRIxVADDR, va); 6621 KASSERT(pmap_treelookup_pv(pmap, ptp, (ptp != NULL ? 6622 &VM_PAGE_TO_PP(ptp)->pp_rb : &pmap_kernel_rb), va) == NULL); 6623 return true; 6624 } 6625 6626 if ((pg = PHYS_TO_VM_PAGE(pmap_pte2pa(opte))) != NULL) { 6627 pp = VM_PAGE_TO_PP(pg); 6628 } else if ((pp = pmap_pv_tracked(pmap_pte2pa(opte))) == NULL) { 6629 paddr_t pa = pmap_pte2pa(opte); 6630 panic("%s: EPT_PVLIST with pv-untracked page" 6631 " va = %#"PRIxVADDR"pa = %#"PRIxPADDR" (%#"PRIxPADDR")", 6632 __func__, va, pa, atop(pa)); 6633 } 6634 6635 /* Sync R/M bits. 
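 * The referenced/modified state of the old PTE is folded into the
 * page's attributes via pmap_ept_to_pp_attrs() as the pv_entry is
 * removed.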
*/ 6636 pve = pmap_lookup_pv(pmap, ptp, pp, va); 6637 pmap_remove_pv(pmap, pp, ptp, va, pve, pmap_ept_to_pp_attrs(opte)); 6638 return true; 6639 } 6640 6641 static void 6642 pmap_ept_remove_ptes(struct pmap *pmap, struct vm_page *ptp, vaddr_t ptpva, 6643 vaddr_t startva, vaddr_t endva) 6644 { 6645 pt_entry_t *pte = (pt_entry_t *)ptpva; 6646 6647 KASSERT(pmap != pmap_kernel()); 6648 KASSERT(mutex_owned(&pmap->pm_lock)); 6649 KASSERT(kpreempt_disabled()); 6650 6651 /* 6652 * mappings are very often sparse, so clip the given range to the 6653 * range of PTEs that are known present in the PTP. 6654 */ 6655 pmap_ptp_range_clip(ptp, &startva, &pte); 6656 6657 /* 6658 * note that ptpva points to the PTE that maps startva. this may 6659 * or may not be the first PTE in the PTP. 6660 * 6661 * we loop through the PTP while there are still PTEs to look at 6662 * and the wire_count is greater than 1 (because we use the wire_count 6663 * to keep track of the number of real PTEs in the PTP). 6664 */ 6665 while (startva < endva && (ptp == NULL || ptp->wire_count > 1)) { 6666 (void)pmap_ept_remove_pte(pmap, ptp, pte, startva); 6667 startva += PAGE_SIZE; 6668 pte++; 6669 } 6670 } 6671 6672 static void 6673 pmap_ept_remove(struct pmap *pmap, vaddr_t sva, vaddr_t eva) 6674 { 6675 pt_entry_t *ptes; 6676 pd_entry_t pde; 6677 paddr_t ptppa; 6678 vaddr_t blkendva, va = sva; 6679 struct vm_page *ptp; 6680 6681 mutex_enter(&pmap->pm_lock); 6682 kpreempt_disable(); 6683 6684 for (/* null */ ; va < eva ; va = blkendva) { 6685 int lvl; 6686 6687 /* determine range of block */ 6688 blkendva = x86_round_pdr(va+1); 6689 if (blkendva > eva) 6690 blkendva = eva; 6691 6692 lvl = pmap_ept_pdes_invalid(pmap, va, &pde); 6693 if (lvl != 0) { 6694 /* Skip a range corresponding to an invalid pde. */ 6695 blkendva = (va & ptp_frames[lvl - 1]) + nbpd[lvl - 1]; 6696 continue; 6697 } 6698 6699 /* PA of the PTP */ 6700 ptppa = pmap_pte2pa(pde); 6701 6702 ptp = pmap_find_ptp(pmap, va, 1); 6703 KASSERTMSG(ptp != NULL, "%s: unmanaged PTP detected", 6704 __func__); 6705 6706 ptes = (pt_entry_t *)PMAP_DIRECT_MAP(ptppa); 6707 6708 pmap_ept_remove_ptes(pmap, ptp, (vaddr_t)&ptes[pl1_pi(va)], va, 6709 blkendva); 6710 6711 /* If PTP is no longer being used, free it. */ 6712 if (ptp && ptp->wire_count <= 1) { 6713 pmap_ept_free_ptp(pmap, ptp, va); 6714 } 6715 } 6716 6717 kpreempt_enable(); 6718 pmap_drain_pv(pmap); 6719 mutex_exit(&pmap->pm_lock); 6720 } 6721 6722 static int 6723 pmap_ept_sync_pv(struct vm_page *ptp, vaddr_t va, paddr_t pa, int clearbits, 6724 uint8_t *oattrs, pt_entry_t *optep) 6725 { 6726 struct pmap *pmap; 6727 pt_entry_t *ptep; 6728 pt_entry_t opte; 6729 pt_entry_t npte; 6730 pt_entry_t expect; 6731 bool need_shootdown; 6732 6733 expect = pmap_pa2pte(pa) | EPT_R; 6734 pmap = ptp_to_pmap(ptp); 6735 6736 if (clearbits != ~0) { 6737 KASSERT((clearbits & ~(PP_ATTRS_D|PP_ATTRS_A|PP_ATTRS_W)) == 0); 6738 clearbits = pmap_pp_attrs_to_ept(clearbits); 6739 } 6740 6741 ptep = pmap_map_pte(pmap, ptp, va); 6742 do { 6743 opte = *ptep; 6744 KASSERT((opte & (EPT_D | EPT_A)) != EPT_D); 6745 KASSERT((opte & (EPT_A | EPT_R)) != EPT_A); 6746 KASSERT(opte == 0 || (opte & EPT_R) != 0); 6747 if ((opte & (PTE_FRAME | EPT_R)) != expect) { 6748 /* 6749 * We lost a race with a V->P operation like 6750 * pmap_remove(). Wait for the competitor 6751 * reflecting pte bits into mp_attrs. 6752 */ 6753 pmap_unmap_pte(); 6754 return EAGAIN; 6755 } 6756 6757 /* 6758 * Check if there's anything to do on this PTE. 
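 * (A clearbits value of ~0 zaps the whole PTE and thus removes the
 * mapping; otherwise only the requested attribute bits are cleared.)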
6759 */ 6760 if ((opte & clearbits) == 0) { 6761 need_shootdown = false; 6762 break; 6763 } 6764 6765 /* 6766 * We need a shootdown if the PTE is cached (EPT_A) ... 6767 * ... Unless we are clearing only the EPT_W bit and 6768 * it isn't cached as RW (EPT_D). 6769 */ 6770 if (pmap_ept_has_ad) { 6771 need_shootdown = (opte & EPT_A) != 0 && 6772 !(clearbits == EPT_W && (opte & EPT_D) == 0); 6773 } else { 6774 need_shootdown = true; 6775 } 6776 6777 npte = opte & ~clearbits; 6778 6779 /* 6780 * If we need a shootdown anyway, clear EPT_A and EPT_D. 6781 */ 6782 if (need_shootdown) { 6783 npte &= ~(EPT_A | EPT_D); 6784 } 6785 KASSERT((npte & (EPT_D | EPT_A)) != EPT_D); 6786 KASSERT((npte & (EPT_A | EPT_R)) != EPT_A); 6787 KASSERT(npte == 0 || (opte & EPT_R) != 0); 6788 } while (pmap_pte_cas(ptep, opte, npte) != opte); 6789 6790 if (need_shootdown) { 6791 pmap_tlb_shootdown(pmap, va, 0, TLBSHOOT_SYNC_PV); 6792 } 6793 pmap_unmap_pte(); 6794 6795 *oattrs = pmap_ept_to_pp_attrs(opte); 6796 if (optep != NULL) 6797 *optep = opte; 6798 return 0; 6799 } 6800 6801 static void 6802 pmap_ept_pp_remove_ent(struct pmap *pmap, struct vm_page *ptp, pt_entry_t opte, 6803 vaddr_t va) 6804 { 6805 6806 KASSERT(mutex_owned(&pmap->pm_lock)); 6807 6808 pmap_ept_stats_update_bypte(pmap, 0, opte); 6809 ptp->wire_count--; 6810 if (ptp->wire_count <= 1) { 6811 pmap_ept_free_ptp(pmap, ptp, va); 6812 } 6813 } 6814 6815 static void 6816 pmap_ept_write_protect(struct pmap *pmap, vaddr_t sva, vaddr_t eva, vm_prot_t prot) 6817 { 6818 pt_entry_t bit_rem; 6819 pt_entry_t *ptes, *spte; 6820 pt_entry_t opte, npte; 6821 pd_entry_t pde; 6822 paddr_t ptppa; 6823 vaddr_t va; 6824 bool modified; 6825 6826 bit_rem = 0; 6827 if (!(prot & VM_PROT_WRITE)) 6828 bit_rem = EPT_W; 6829 6830 sva &= PTE_FRAME; 6831 eva &= PTE_FRAME; 6832 6833 /* Acquire pmap. */ 6834 mutex_enter(&pmap->pm_lock); 6835 kpreempt_disable(); 6836 6837 for (va = sva; va < eva; va += PAGE_SIZE) { 6838 if (pmap_ept_pdes_invalid(pmap, va, &pde)) { 6839 continue; 6840 } 6841 6842 ptppa = pmap_pte2pa(pde); 6843 ptes = (pt_entry_t *)PMAP_DIRECT_MAP(ptppa); 6844 spte = &ptes[pl1_pi(va)]; 6845 6846 do { 6847 opte = *spte; 6848 if (!pmap_ept_valid_entry(opte)) { 6849 goto next; 6850 } 6851 npte = (opte & ~bit_rem); 6852 } while (pmap_pte_cas(spte, opte, npte) != opte); 6853 6854 if (pmap_ept_has_ad) { 6855 modified = (opte & EPT_D) != 0; 6856 } else { 6857 modified = true; 6858 } 6859 if (modified) { 6860 vaddr_t tva = x86_ptob(spte - ptes); 6861 pmap_tlb_shootdown(pmap, tva, 0, 6862 TLBSHOOT_WRITE_PROTECT); 6863 } 6864 next:; 6865 } 6866 6867 kpreempt_enable(); 6868 mutex_exit(&pmap->pm_lock); 6869 } 6870 6871 static void 6872 pmap_ept_unwire(struct pmap *pmap, vaddr_t va) 6873 { 6874 pt_entry_t *ptes, *ptep, opte; 6875 pd_entry_t pde; 6876 paddr_t ptppa; 6877 6878 /* Acquire pmap. */ 6879 mutex_enter(&pmap->pm_lock); 6880 kpreempt_disable(); 6881 6882 if (pmap_ept_pdes_invalid(pmap, va, &pde)) { 6883 panic("%s: invalid PDE va=%#" PRIxVADDR, __func__, va); 6884 } 6885 6886 ptppa = pmap_pte2pa(pde); 6887 ptes = (pt_entry_t *)PMAP_DIRECT_MAP(ptppa); 6888 ptep = &ptes[pl1_pi(va)]; 6889 opte = *ptep; 6890 KASSERT(pmap_ept_valid_entry(opte)); 6891 6892 if (opte & EPT_WIRED) { 6893 pt_entry_t npte = opte & ~EPT_WIRED; 6894 6895 opte = pmap_pte_testset(ptep, npte); 6896 pmap_ept_stats_update_bypte(pmap, npte, opte); 6897 } else { 6898 printf("%s: wiring for pmap %p va %#" PRIxVADDR 6899 "did not change!\n", __func__, pmap, va); 6900 } 6901 6902 /* Release pmap. 
 */
6903 	kpreempt_enable();
6904 	mutex_exit(&pmap->pm_lock);
6905 }
6906 
6907 /* -------------------------------------------------------------------------- */
6908 
6909 void
6910 pmap_ept_transform(struct pmap *pmap)
6911 {
6912 	pmap->pm_enter = pmap_ept_enter;
6913 	pmap->pm_extract = pmap_ept_extract;
6914 	pmap->pm_remove = pmap_ept_remove;
6915 	pmap->pm_sync_pv = pmap_ept_sync_pv;
6916 	pmap->pm_pp_remove_ent = pmap_ept_pp_remove_ent;
6917 	pmap->pm_write_protect = pmap_ept_write_protect;
6918 	pmap->pm_unwire = pmap_ept_unwire;
6919 
6920 	memset(PAGE_ALIGNED(pmap->pm_pdir), 0, PAGE_SIZE);
6921 }
6922 
6923 #endif /* __HAVE_DIRECT_MAP && __x86_64__ && !XENPV */
6924 
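/*
 * Illustrative sketch only (not part of the file): roughly how a
 * hypervisor such as NVMM is expected to obtain an EPT pmap, going by
 * the description at the top of the EPT section.  pmap_create(),
 * pmap_enter() and pmap_update() are the standard pmap(9) entry
 * points and pmap_ept_transform() is defined above; "gpa" and "hpa"
 * are placeholder guest-physical and host-physical addresses, and the
 * surrounding setup (guest memory objects, EPTP construction, wiring
 * policy) is omitted, so the exact sequence a hypervisor uses may
 * differ.
 *
 *	struct pmap *gpmap;
 *	int error;
 *
 *	gpmap = pmap_create();		// starts life as a native pmap
 *	pmap_ept_transform(gpmap);	// install the pmap_ept_* callbacks
 *
 *	// From here on the usual MI calls dispatch to the EPT code
 *	// above, e.g. entering a guest-physical -> host-physical
 *	// translation:
 *	error = pmap_enter(gpmap, (vaddr_t)gpa, (paddr_t)hpa,
 *	    VM_PROT_READ | VM_PROT_WRITE, PMAP_WIRED | PMAP_CANFAIL);
 *	pmap_update(gpmap);
 */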