1 /* $NetBSD: pmap.c,v 1.427 2024/10/08 21:09:08 riastradh Exp $ */ 2 3 /* 4 * Copyright (c) 2008, 2010, 2016, 2017, 2019, 2020 The NetBSD Foundation, Inc. 5 * All rights reserved. 6 * 7 * This code is derived from software contributed to The NetBSD Foundation 8 * by Andrew Doran, and by Maxime Villard. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 29 * POSSIBILITY OF SUCH DAMAGE. 30 */ 31 32 /* 33 * Copyright (c) 2007 Manuel Bouyer. 34 * 35 * Redistribution and use in source and binary forms, with or without 36 * modification, are permitted provided that the following conditions 37 * are met: 38 * 1. Redistributions of source code must retain the above copyright 39 * notice, this list of conditions and the following disclaimer. 40 * 2. Redistributions in binary form must reproduce the above copyright 41 * notice, this list of conditions and the following disclaimer in the 42 * documentation and/or other materials provided with the distribution. 43 * 44 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 45 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 46 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 47 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 48 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 49 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 50 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 51 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 52 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 53 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 54 */ 55 56 /* 57 * Copyright (c) 2006 Mathieu Ropert <mro@adviseo.fr> 58 * 59 * Permission to use, copy, modify, and distribute this software for any 60 * purpose with or without fee is hereby granted, provided that the above 61 * copyright notice and this permission notice appear in all copies. 62 * 63 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 64 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 65 * MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 66 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 67 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 68 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 69 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 70 */ 71 72 /* 73 * Copyright 2001 (c) Wasabi Systems, Inc. 74 * All rights reserved. 75 * 76 * Written by Frank van der Linden for Wasabi Systems, Inc. 77 * 78 * Redistribution and use in source and binary forms, with or without 79 * modification, are permitted provided that the following conditions 80 * are met: 81 * 1. Redistributions of source code must retain the above copyright 82 * notice, this list of conditions and the following disclaimer. 83 * 2. Redistributions in binary form must reproduce the above copyright 84 * notice, this list of conditions and the following disclaimer in the 85 * documentation and/or other materials provided with the distribution. 86 * 3. All advertising materials mentioning features or use of this software 87 * must display the following acknowledgement: 88 * This product includes software developed for the NetBSD Project by 89 * Wasabi Systems, Inc. 90 * 4. The name of Wasabi Systems, Inc. may not be used to endorse 91 * or promote products derived from this software without specific prior 92 * written permission. 93 * 94 * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND 95 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 96 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 97 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL WASABI SYSTEMS, INC 98 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 99 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 100 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 101 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 102 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 103 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 104 * POSSIBILITY OF SUCH DAMAGE. 105 */ 106 107 /* 108 * Copyright (c) 1997 Charles D. Cranor and Washington University. 109 * All rights reserved. 110 * 111 * Redistribution and use in source and binary forms, with or without 112 * modification, are permitted provided that the following conditions 113 * are met: 114 * 1. Redistributions of source code must retain the above copyright 115 * notice, this list of conditions and the following disclaimer. 116 * 2. Redistributions in binary form must reproduce the above copyright 117 * notice, this list of conditions and the following disclaimer in the 118 * documentation and/or other materials provided with the distribution. 119 * 120 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 121 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 122 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
123 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 124 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 125 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 126 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 127 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 128 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 129 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 130 */ 131 132 #include <sys/cdefs.h> 133 __KERNEL_RCSID(0, "$NetBSD: pmap.c,v 1.427 2024/10/08 21:09:08 riastradh Exp $"); 134 135 #include "opt_user_ldt.h" 136 #include "opt_lockdebug.h" 137 #include "opt_multiprocessor.h" 138 #include "opt_xen.h" 139 #include "opt_svs.h" 140 #include "opt_kaslr.h" 141 #include "opt_efi.h" 142 143 #define __MUTEX_PRIVATE /* for assertions */ 144 145 #include <sys/param.h> 146 #include <sys/systm.h> 147 #include <sys/proc.h> 148 #include <sys/pool.h> 149 #include <sys/kernel.h> 150 #include <sys/atomic.h> 151 #include <sys/cpu.h> 152 #include <sys/intr.h> 153 #include <sys/xcall.h> 154 #include <sys/kcore.h> 155 #include <sys/kmem.h> 156 #include <sys/asan.h> 157 #include <sys/msan.h> 158 #include <sys/entropy.h> 159 160 #include <uvm/uvm.h> 161 #include <uvm/pmap/pmap_pvt.h> 162 163 #include <dev/isa/isareg.h> 164 165 #include <machine/specialreg.h> 166 #include <machine/gdt.h> 167 #include <machine/isa_machdep.h> 168 #include <machine/cpuvar.h> 169 #include <machine/cputypes.h> 170 #include <machine/pmap_private.h> 171 172 #include <x86/bootspace.h> 173 #include <x86/pat.h> 174 #include <x86/pmap_pv.h> 175 176 #include <x86/i82489reg.h> 177 #include <x86/i82489var.h> 178 179 #ifdef XEN 180 #include <xen/include/public/xen.h> 181 #include <xen/hypervisor.h> 182 #include <xen/xenpmap.h> 183 #endif 184 185 #ifdef __HAVE_DIRECT_MAP 186 #include <crypto/nist_hash_drbg/nist_hash_drbg.h> 187 #endif 188 189 /* 190 * general info: 191 * 192 * - for an explanation of how the x86 MMU hardware works see 193 * the comments in <machine/pte.h>. 194 * 195 * - for an explanation of the general memory structure used by 196 * this pmap (including the recursive mapping), see the comments 197 * in <machine/pmap.h>. 198 * 199 * this file contains the code for the "pmap module." the module's 200 * job is to manage the hardware's virtual to physical address mappings. 201 * note that there are two levels of mapping in the VM system: 202 * 203 * [1] the upper layer of the VM system uses vm_map's and vm_map_entry's 204 * to map ranges of virtual address space to objects/files. for 205 * example, the vm_map may say: "map VA 0x1000 to 0x22000 read-only 206 * to the file /bin/ls starting at offset zero." note that 207 * the upper layer mapping is not concerned with how individual 208 * vm_pages are mapped. 209 * 210 * [2] the lower layer of the VM system (the pmap) maintains the mappings 211 * from virtual addresses. it is concerned with which vm_page is 212 * mapped where. for example, when you run /bin/ls and start 213 * at page 0x1000 the fault routine may lookup the correct page 214 * of the /bin/ls file and then ask the pmap layer to establish 215 * a mapping for it. 216 * 217 * note that information in the lower layer of the VM system can be 218 * thrown away since it can easily be reconstructed from the info 219 * in the upper layer. 
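 * for example, if a PTP is reclaimed, a later fault on one of the VAs it
 * mapped simply walks the vm_map again and re-establishes the mapping
 * with pmap_enter().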
220 *
221 * data structures we use include:
222 *
223 * - struct pmap: describes the address space of one thread
224 * - struct pmap_page: describes one pv-tracked page, without
225 * necessarily a corresponding vm_page
226 * - struct pv_entry: describes one <PMAP,VA> mapping of a PA
227 * - pmap_page::pp_pvlist: there is one list per pv-tracked page of
228 * physical memory. the pp_pvlist points to a list of pv_entry
229 * structures which describe all the <PMAP,VA> pairs that this
230 * page is mapped in. this is critical for page-based operations
231 * such as pmap_page_protect() [change protection on _all_ mappings
232 * of a page]
233 */
234
235 /*
236 * Locking
237 *
238 * We have the following locks that we must deal with, listed in the order
239 * that they are acquired:
240 *
241 * pg->uobject->vmobjlock, pg->uanon->an_lock
242 *
243 * For managed pages, these per-object locks are taken by the VM system
244 * before calling into the pmap module - either a read or write hold.
245 * The lock hold prevents pages from changing identity while the pmap is
246 * operating on them. For example, the same lock is held across a call
247 * to pmap_remove() and the following call to pmap_update(), so that a
248 * page does not gain a new identity while its TLB visibility is stale.
249 *
250 * pmap->pm_lock
251 *
252 * This lock protects the fields in the pmap structure including the
253 * non-kernel PDEs in the PDP, the PTEs, and PTPs and connected data
254 * structures. For modifying unmanaged kernel PTEs it is not needed as
255 * kernel PDEs are never freed, and the kernel is expected to be self
256 * consistent (and the lock can't be taken for unmanaged kernel PTEs,
257 * because they can be modified from interrupt context).
258 *
259 * pmaps_lock
260 *
261 * This lock protects the list of active pmaps (headed by "pmaps").
262 * It's acquired when adding or removing pmaps or adjusting kernel PDEs.
263 *
264 * pp_lock
265 *
266 * This per-page lock protects PV entry lists and the embedded PV entry
267 * in each vm_page, allowing for concurrent operation on pages by
268 * different pmaps. This is a spin mutex at IPL_VM, because at the
269 * points it is taken, context switching is usually not tolerable, and
270 * spin mutexes must block out interrupts that could take kernel_lock.
271 */
272
273 /* uvm_object is abused here to index pmap_pages; make assertions happy.
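 * The PMAP_DUMMY_LOCK()/pm_dummy_lock rwlock below exists only to keep
 * those assertions happy; the real exclusion follows the lock order
 * described above, roughly (illustrative sketch only):
 *
 *	rw_enter(pg->uobject->vmobjlock, ...);	taken by UVM, read or write
 *	mutex_enter(&pmap->pm_lock);
 *	mutex_enter(&pp->pp_lock);		spin mutex at IPL_VM
 *	 ... update the PTE and the PV list ...
 *	mutex_exit(&pp->pp_lock);
 *	mutex_exit(&pmap->pm_lock);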
*/ 274 #ifdef DIAGNOSTIC 275 #define PMAP_DUMMY_LOCK(pm) rw_enter(&(pm)->pm_dummy_lock, RW_WRITER) 276 #define PMAP_DUMMY_UNLOCK(pm) rw_exit(&(pm)->pm_dummy_lock) 277 #else 278 #define PMAP_DUMMY_LOCK(pm) 279 #define PMAP_DUMMY_UNLOCK(pm) 280 #endif 281 282 static const struct uvm_pagerops pmap_pager = { 283 /* nothing */ 284 }; 285 286 /* 287 * pl_i(va, X) == plX_i(va) <= pl_i_roundup(va, X) 288 */ 289 #define pl_i(va, lvl) \ 290 (((VA_SIGN_POS(va)) & ptp_frames[(lvl)-1]) >> ptp_shifts[(lvl)-1]) 291 292 #define pl_i_roundup(va, lvl) pl_i((va)+ ~ptp_frames[(lvl)-1], (lvl)) 293 294 /* 295 * PTP macros: 296 * a PTP's index is the PD index of the PDE that points to it 297 * a PTP's offset is the byte-offset in the PTE space that this PTP is at 298 * a PTP's VA is the first VA mapped by that PTP 299 */ 300 301 #define ptp_va2o(va, lvl) (pl_i(va, (lvl)+1) * PAGE_SIZE) 302 303 const vaddr_t ptp_masks[] = PTP_MASK_INITIALIZER; 304 const vaddr_t ptp_frames[] = PTP_FRAME_INITIALIZER; 305 const int ptp_shifts[] = PTP_SHIFT_INITIALIZER; 306 const long nkptpmax[] = NKPTPMAX_INITIALIZER; 307 const long nbpd[] = NBPD_INITIALIZER; 308 #ifdef i386 309 pd_entry_t * const normal_pdes[] = PDES_INITIALIZER; 310 #else 311 pd_entry_t *normal_pdes[3]; 312 #endif 313 314 long nkptp[] = NKPTP_INITIALIZER; 315 316 struct pmap_head pmaps; 317 kmutex_t pmaps_lock __cacheline_aligned; 318 319 struct pcpu_area *pcpuarea __read_mostly; 320 321 static vaddr_t pmap_maxkvaddr; 322 323 /* 324 * Misc. event counters. 325 */ 326 struct evcnt pmap_iobmp_evcnt; 327 struct evcnt pmap_ldt_evcnt; 328 329 /* 330 * PAT 331 */ 332 static bool cpu_pat_enabled __read_mostly = false; 333 334 /* 335 * Global data structures 336 */ 337 338 static struct pmap kernel_pmap_store __cacheline_aligned; /* kernel's pmap */ 339 struct pmap *const kernel_pmap_ptr = &kernel_pmap_store; 340 static rb_tree_t pmap_kernel_rb __cacheline_aligned; 341 342 struct bootspace bootspace __read_mostly; 343 struct slotspace slotspace __read_mostly; 344 345 /* Set to PTE_NX if supported. */ 346 pd_entry_t pmap_pg_nx __read_mostly = 0; 347 348 /* Set to PTE_G if supported. */ 349 pd_entry_t pmap_pg_g __read_mostly = 0; 350 351 /* Set to true if large pages are supported. */ 352 int pmap_largepages __read_mostly = 0; 353 354 paddr_t lowmem_rsvd __read_mostly; 355 paddr_t avail_start __read_mostly; /* PA of first available physical page */ 356 paddr_t avail_end __read_mostly; /* PA of last available physical page */ 357 358 #ifdef XENPV 359 paddr_t pmap_pa_start; /* PA of first physical page for this domain */ 360 paddr_t pmap_pa_end; /* PA of last physical page for this domain */ 361 #endif 362 363 #define VM_PAGE_TO_PP(pg) (&(pg)->mdpage.mp_pp) 364 #define PMAP_CHECK_PP(pp) \ 365 KASSERTMSG((pp)->pp_lock.mtx_ipl._ipl == IPL_VM, "bad pmap_page %p", pp) 366 367 #define PAGE_ALIGNED(pp) \ 368 __builtin_assume_aligned((void *)(pp), PAGE_SIZE) 369 370 /* 371 * Other data structures 372 */ 373 374 static pt_entry_t protection_codes[8] __read_mostly; 375 376 static bool pmap_initialized __read_mostly = false; /* pmap_init done yet? */ 377 378 /* 379 * The following two vaddr_t's are used during system startup to keep track of 380 * how much of the kernel's VM space we have used. Once the system is started, 381 * the management of the remaining kernel VM space is turned over to the 382 * kernel_map vm_map. 
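 * Until then, allocation is a simple bump of virtual_avail, as in
 * pmap_bootstrap_valloc() further down:
 *
 *	va = virtual_avail;
 *	virtual_avail += npages * PAGE_SIZE;
 *	return va;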
383 */ 384 static vaddr_t virtual_avail __read_mostly; /* VA of first free KVA */ 385 static vaddr_t virtual_end __read_mostly; /* VA of last free KVA */ 386 387 #ifndef XENPV 388 /* 389 * LAPIC virtual address, and fake physical address. 390 */ 391 volatile vaddr_t local_apic_va __read_mostly; 392 paddr_t local_apic_pa __read_mostly; 393 #endif 394 395 /* 396 * pool that pmap structures are allocated from 397 */ 398 struct pool_cache pmap_cache; 399 static int pmap_ctor(void *, void *, int); 400 static void pmap_dtor(void *, void *); 401 402 /* 403 * pv_page cache 404 */ 405 static struct pool_cache pmap_pvp_cache; 406 407 #ifdef __HAVE_DIRECT_MAP 408 vaddr_t pmap_direct_base __read_mostly; 409 vaddr_t pmap_direct_end __read_mostly; 410 #endif 411 412 #ifndef __HAVE_DIRECT_MAP 413 /* 414 * Special VAs and the PTEs that map them 415 */ 416 static pt_entry_t *early_zero_pte; 417 static void pmap_vpage_cpualloc(struct cpu_info *); 418 #ifdef XENPV 419 char *early_zerop; /* also referenced from xen_locore() */ 420 #else 421 static char *early_zerop; 422 #endif 423 #endif 424 425 int pmap_enter_default(pmap_t, vaddr_t, paddr_t, vm_prot_t, u_int); 426 427 /* PDP pool and its callbacks */ 428 static struct pool pmap_pdp_pool; 429 static void pmap_pdp_init(pd_entry_t *); 430 static void pmap_pdp_fini(pd_entry_t *); 431 432 #ifdef PAE 433 /* need to allocate items of 4 pages */ 434 static void *pmap_pdp_alloc(struct pool *, int); 435 static void pmap_pdp_free(struct pool *, void *); 436 static struct pool_allocator pmap_pdp_allocator = { 437 .pa_alloc = pmap_pdp_alloc, 438 .pa_free = pmap_pdp_free, 439 .pa_pagesz = PAGE_SIZE * PDP_SIZE, 440 }; 441 #endif 442 443 extern vaddr_t idt_vaddr; 444 extern paddr_t idt_paddr; 445 extern vaddr_t gdt_vaddr; 446 extern paddr_t gdt_paddr; 447 extern vaddr_t ldt_vaddr; 448 extern paddr_t ldt_paddr; 449 450 #ifdef i386 451 /* stuff to fix the pentium f00f bug */ 452 extern vaddr_t pentium_idt_vaddr; 453 #endif 454 455 /* Array of freshly allocated PTPs, for pmap_get_ptp(). */ 456 struct pmap_ptparray { 457 struct vm_page *pg[PTP_LEVELS + 1]; 458 bool alloced[PTP_LEVELS + 1]; 459 }; 460 461 /* 462 * PV entries are allocated in page-sized chunks and cached per-pmap to 463 * avoid intense pressure on memory allocators. 
464 */ 465 466 struct pv_page { 467 LIST_HEAD(, pv_entry) pvp_pves; 468 LIST_ENTRY(pv_page) pvp_list; 469 long pvp_nfree; 470 struct pmap *pvp_pmap; 471 }; 472 473 #define PVE_PER_PVP ((PAGE_SIZE / sizeof(struct pv_entry)) - 1) 474 475 /* 476 * PV tree prototypes 477 */ 478 479 static int pmap_compare_key(void *, const void *, const void *); 480 static int pmap_compare_nodes(void *, const void *, const void *); 481 482 /* Read-black tree */ 483 static const rb_tree_ops_t pmap_rbtree_ops = { 484 .rbto_compare_nodes = pmap_compare_nodes, 485 .rbto_compare_key = pmap_compare_key, 486 .rbto_node_offset = offsetof(struct pv_entry, pve_rb), 487 .rbto_context = NULL 488 }; 489 490 /* 491 * Local prototypes 492 */ 493 494 #ifdef __HAVE_PCPU_AREA 495 static void pmap_init_pcpu(void); 496 #endif 497 #ifdef __HAVE_DIRECT_MAP 498 static void pmap_init_directmap(struct pmap *); 499 #endif 500 #if !defined(XENPV) 501 static void pmap_remap_global(void); 502 #endif 503 #ifndef XENPV 504 static void pmap_init_lapic(void); 505 static void pmap_remap_largepages(void); 506 #endif 507 508 static int pmap_get_ptp(struct pmap *, struct pmap_ptparray *, vaddr_t, int, 509 struct vm_page **); 510 static void pmap_unget_ptp(struct pmap *, struct pmap_ptparray *); 511 static void pmap_install_ptp(struct pmap *, struct pmap_ptparray *, vaddr_t, 512 pd_entry_t * const *); 513 static struct vm_page *pmap_find_ptp(struct pmap *, vaddr_t, int); 514 static void pmap_freepage(struct pmap *, struct vm_page *, int); 515 static void pmap_free_ptp(struct pmap *, struct vm_page *, vaddr_t, 516 pt_entry_t *, pd_entry_t * const *); 517 static bool pmap_remove_pte(struct pmap *, struct vm_page *, pt_entry_t *, 518 vaddr_t); 519 static void pmap_remove_ptes(struct pmap *, struct vm_page *, vaddr_t, vaddr_t, 520 vaddr_t); 521 static int pmap_pvp_ctor(void *, void *, int); 522 static void pmap_pvp_dtor(void *, void *); 523 static struct pv_entry *pmap_alloc_pv(struct pmap *); 524 static void pmap_free_pv(struct pmap *, struct pv_entry *); 525 static void pmap_drain_pv(struct pmap *); 526 527 static void pmap_alloc_level(struct pmap *, vaddr_t, long *); 528 529 static void pmap_load1(struct lwp *, struct pmap *, struct pmap *); 530 static void pmap_reactivate(struct pmap *); 531 532 long 533 pmap_resident_count(struct pmap *pmap) 534 { 535 536 return pmap->pm_stats.resident_count; 537 } 538 539 long 540 pmap_wired_count(struct pmap *pmap) 541 { 542 543 return pmap->pm_stats.wired_count; 544 } 545 546 /* 547 * p m a p h e l p e r f u n c t i o n s 548 */ 549 550 static inline void 551 pmap_stats_update(struct pmap *pmap, int resid_diff, int wired_diff) 552 { 553 554 KASSERT(cold || mutex_owned(&pmap->pm_lock)); 555 pmap->pm_stats.resident_count += resid_diff; 556 pmap->pm_stats.wired_count += wired_diff; 557 } 558 559 static inline void 560 pmap_stats_update_bypte(struct pmap *pmap, pt_entry_t npte, pt_entry_t opte) 561 { 562 int resid_diff = ((npte & PTE_P) ? 1 : 0) - ((opte & PTE_P) ? 1 : 0); 563 int wired_diff = ((npte & PTE_WIRED) ? 1 : 0) - ((opte & PTE_WIRED) ? 
1 : 0); 564 565 KASSERT((npte & (PTE_P | PTE_WIRED)) != PTE_WIRED); 566 KASSERT((opte & (PTE_P | PTE_WIRED)) != PTE_WIRED); 567 568 pmap_stats_update(pmap, resid_diff, wired_diff); 569 } 570 571 /* 572 * ptp_to_pmap: lookup pmap by ptp 573 */ 574 static inline struct pmap * 575 ptp_to_pmap(struct vm_page *ptp) 576 { 577 struct pmap *pmap; 578 579 if (ptp == NULL) { 580 return pmap_kernel(); 581 } 582 pmap = (struct pmap *)ptp->uobject; 583 KASSERT(pmap != NULL); 584 KASSERT(&pmap->pm_obj[0] == ptp->uobject); 585 return pmap; 586 } 587 588 static inline struct pv_pte * 589 pve_to_pvpte(struct pv_entry *pve) 590 { 591 592 if (pve == NULL) 593 return NULL; 594 KASSERT((void *)&pve->pve_pte == (void *)pve); 595 return &pve->pve_pte; 596 } 597 598 static inline struct pv_entry * 599 pvpte_to_pve(struct pv_pte *pvpte) 600 { 601 struct pv_entry *pve = (void *)pvpte; 602 603 KASSERT(pve_to_pvpte(pve) == pvpte); 604 return pve; 605 } 606 607 /* 608 * Return true if the pmap page has an embedded PV entry. 609 */ 610 static inline bool 611 pv_pte_embedded(struct pmap_page *pp) 612 { 613 614 KASSERT(mutex_owned(&pp->pp_lock)); 615 return (bool)((vaddr_t)pp->pp_pte.pte_ptp | pp->pp_pte.pte_va); 616 } 617 618 /* 619 * pv_pte_first, pv_pte_next: PV list iterator. 620 */ 621 static inline struct pv_pte * 622 pv_pte_first(struct pmap_page *pp) 623 { 624 625 KASSERT(mutex_owned(&pp->pp_lock)); 626 if (pv_pte_embedded(pp)) { 627 return &pp->pp_pte; 628 } 629 return pve_to_pvpte(LIST_FIRST(&pp->pp_pvlist)); 630 } 631 632 static inline struct pv_pte * 633 pv_pte_next(struct pmap_page *pp, struct pv_pte *pvpte) 634 { 635 636 KASSERT(mutex_owned(&pp->pp_lock)); 637 KASSERT(pvpte != NULL); 638 if (pvpte == &pp->pp_pte) { 639 return pve_to_pvpte(LIST_FIRST(&pp->pp_pvlist)); 640 } 641 return pve_to_pvpte(LIST_NEXT(pvpte_to_pve(pvpte), pve_list)); 642 } 643 644 static inline uint8_t 645 pmap_pte_to_pp_attrs(pt_entry_t pte) 646 { 647 uint8_t ret = 0; 648 if (pte & PTE_D) 649 ret |= PP_ATTRS_D; 650 if (pte & PTE_A) 651 ret |= PP_ATTRS_A; 652 if (pte & PTE_W) 653 ret |= PP_ATTRS_W; 654 return ret; 655 } 656 657 static inline pt_entry_t 658 pmap_pp_attrs_to_pte(uint8_t attrs) 659 { 660 pt_entry_t pte = 0; 661 if (attrs & PP_ATTRS_D) 662 pte |= PTE_D; 663 if (attrs & PP_ATTRS_A) 664 pte |= PTE_A; 665 if (attrs & PP_ATTRS_W) 666 pte |= PTE_W; 667 return pte; 668 } 669 670 /* 671 * pmap_is_curpmap: is this pmap the one currently loaded [in %cr3]? 672 * of course the kernel is always loaded 673 */ 674 bool 675 pmap_is_curpmap(struct pmap *pmap) 676 { 677 return ((pmap == pmap_kernel()) || (pmap == curcpu()->ci_pmap)); 678 } 679 680 inline void 681 pmap_reference(struct pmap *pmap) 682 { 683 684 atomic_inc_uint(&pmap->pm_obj[0].uo_refs); 685 } 686 687 /* 688 * rbtree: compare two nodes. 689 */ 690 static int 691 pmap_compare_nodes(void *context, const void *n1, const void *n2) 692 { 693 const struct pv_entry *pve1 = n1; 694 const struct pv_entry *pve2 = n2; 695 696 KASSERT(pve1->pve_pte.pte_ptp == pve2->pve_pte.pte_ptp); 697 698 if (pve1->pve_pte.pte_va < pve2->pve_pte.pte_va) { 699 return -1; 700 } 701 if (pve1->pve_pte.pte_va > pve2->pve_pte.pte_va) { 702 return 1; 703 } 704 return 0; 705 } 706 707 /* 708 * rbtree: compare a node and a key. 
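 * The key is simply the mapping's VA cast to a pointer, so a lookup is
 * e.g. pve = rb_tree_find_node(tree, (void *)va).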
709 */ 710 static int 711 pmap_compare_key(void *context, const void *n, const void *k) 712 { 713 const struct pv_entry *pve = n; 714 const vaddr_t key = (vaddr_t)k; 715 716 if (pve->pve_pte.pte_va < key) { 717 return -1; 718 } 719 if (pve->pve_pte.pte_va > key) { 720 return 1; 721 } 722 return 0; 723 } 724 725 /* 726 * pmap_ptp_range_set: abuse ptp->uanon to record minimum VA of PTE 727 */ 728 static inline void 729 pmap_ptp_range_set(struct vm_page *ptp, vaddr_t va) 730 { 731 vaddr_t *min = (vaddr_t *)&ptp->uanon; 732 733 if (va < *min) { 734 *min = va; 735 } 736 } 737 738 /* 739 * pmap_ptp_range_clip: abuse ptp->uanon to clip range of PTEs to remove 740 */ 741 static inline void 742 pmap_ptp_range_clip(struct vm_page *ptp, vaddr_t *startva, pt_entry_t **pte) 743 { 744 vaddr_t sclip; 745 746 if (ptp == NULL) { 747 return; 748 } 749 750 sclip = (vaddr_t)ptp->uanon; 751 sclip = (*startva < sclip ? sclip : *startva); 752 *pte += (sclip - *startva) / PAGE_SIZE; 753 *startva = sclip; 754 } 755 756 /* 757 * pmap_map_ptes: map a pmap's PTEs into KVM and lock them in 758 * 759 * there are several pmaps involved. some or all of them might be same. 760 * 761 * - the pmap given by the first argument 762 * our caller wants to access this pmap's PTEs. 763 * 764 * - pmap_kernel() 765 * the kernel pmap. note that it only contains the kernel part 766 * of the address space which is shared by any pmap. ie. any 767 * pmap can be used instead of pmap_kernel() for our purpose. 768 * 769 * - ci->ci_pmap 770 * pmap currently loaded on the cpu. 771 * 772 * - vm_map_pmap(&curproc->p_vmspace->vm_map) 773 * current process' pmap. 774 * 775 * => caller must lock pmap first (if not the kernel pmap) 776 * => must be undone with pmap_unmap_ptes before returning 777 * => disables kernel preemption 778 */ 779 void 780 pmap_map_ptes(struct pmap *pmap, struct pmap **pmap2, pd_entry_t **ptepp, 781 pd_entry_t * const **pdeppp) 782 { 783 struct pmap *curpmap; 784 struct cpu_info *ci; 785 lwp_t *l; 786 787 kpreempt_disable(); 788 789 /* The kernel's pmap is always accessible. */ 790 if (pmap == pmap_kernel()) { 791 *pmap2 = NULL; 792 *ptepp = PTE_BASE; 793 *pdeppp = normal_pdes; 794 return; 795 } 796 797 KASSERT(mutex_owned(&pmap->pm_lock)); 798 799 l = curlwp; 800 ci = l->l_cpu; 801 curpmap = ci->ci_pmap; 802 if (pmap == curpmap) { 803 /* 804 * Already on the CPU: make it valid. This is very 805 * often the case during exit(), when we have switched 806 * to the kernel pmap in order to destroy a user pmap. 807 */ 808 if (__predict_false(ci->ci_tlbstate != TLBSTATE_VALID)) { 809 pmap_reactivate(pmap); 810 } 811 *pmap2 = NULL; 812 } else { 813 /* 814 * Toss current pmap from CPU and install new pmap, but keep 815 * a reference to the old one. Dropping the reference can 816 * can block as it needs to take locks, so defer that to 817 * pmap_unmap_ptes(). 818 */ 819 pmap_reference(pmap); 820 pmap_load1(l, pmap, curpmap); 821 *pmap2 = curpmap; 822 } 823 KASSERT(ci->ci_tlbstate == TLBSTATE_VALID); 824 #ifdef DIAGNOSTIC 825 pmap->pm_pctr = lwp_pctr(); 826 #endif 827 *ptepp = PTE_BASE; 828 829 #if defined(XENPV) && defined(__x86_64__) 830 KASSERT(ci->ci_normal_pdes[PTP_LEVELS - 2] == L4_BASE); 831 ci->ci_normal_pdes[PTP_LEVELS - 2] = pmap->pm_pdir; 832 *pdeppp = ci->ci_normal_pdes; 833 #else 834 *pdeppp = normal_pdes; 835 #endif 836 } 837 838 /* 839 * pmap_unmap_ptes: unlock the PTE mapping of "pmap" 840 * 841 * => we cannot tolerate context switches while mapped in: assert this. 842 * => reenables kernel preemption. 
843 * => does not unlock pmap. 844 */ 845 void 846 pmap_unmap_ptes(struct pmap *pmap, struct pmap * pmap2) 847 { 848 struct cpu_info *ci; 849 struct pmap *mypmap; 850 struct lwp *l; 851 852 KASSERT(kpreempt_disabled()); 853 854 /* The kernel's pmap is always accessible. */ 855 if (pmap == pmap_kernel()) { 856 kpreempt_enable(); 857 return; 858 } 859 860 l = curlwp; 861 ci = l->l_cpu; 862 863 KASSERT(mutex_owned(&pmap->pm_lock)); 864 KASSERT(pmap->pm_pctr == lwp_pctr()); 865 866 #if defined(XENPV) && defined(__x86_64__) 867 KASSERT(ci->ci_normal_pdes[PTP_LEVELS - 2] != L4_BASE); 868 ci->ci_normal_pdes[PTP_LEVELS - 2] = L4_BASE; 869 #endif 870 871 /* If not our own pmap, mark whatever's on the CPU now as lazy. */ 872 KASSERT(ci->ci_tlbstate == TLBSTATE_VALID); 873 mypmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map); 874 if (ci->ci_pmap == vm_map_pmap(&l->l_proc->p_vmspace->vm_map)) { 875 ci->ci_want_pmapload = 0; 876 } else { 877 ci->ci_want_pmapload = (mypmap != pmap_kernel()); 878 ci->ci_tlbstate = TLBSTATE_LAZY; 879 } 880 881 /* Now safe to re-enable preemption. */ 882 kpreempt_enable(); 883 884 /* Toss reference to other pmap taken earlier. */ 885 if (pmap2 != NULL) { 886 pmap_destroy(pmap2); 887 } 888 } 889 890 inline static void 891 pmap_exec_account(struct pmap *pm, vaddr_t va, pt_entry_t opte, pt_entry_t npte) 892 { 893 894 #if !defined(__x86_64__) 895 if (curproc == NULL || curproc->p_vmspace == NULL || 896 pm != vm_map_pmap(&curproc->p_vmspace->vm_map)) 897 return; 898 899 if ((opte ^ npte) & PTE_X) 900 pmap_update_pg(va); 901 902 /* 903 * Executability was removed on the last executable change. 904 * Reset the code segment to something conservative and 905 * let the trap handler deal with setting the right limit. 906 * We can't do that because of locking constraints on the vm map. 907 */ 908 909 if ((opte & PTE_X) && (npte & PTE_X) == 0 && va == pm->pm_hiexec) { 910 struct trapframe *tf = curlwp->l_md.md_regs; 911 912 tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL); 913 pm->pm_hiexec = I386_MAX_EXE_ADDR; 914 } 915 #endif /* !defined(__x86_64__) */ 916 } 917 918 #if !defined(__x86_64__) 919 /* 920 * Fixup the code segment to cover all potential executable mappings. 921 * returns 0 if no changes to the code segment were made. 922 */ 923 int 924 pmap_exec_fixup(struct vm_map *map, struct trapframe *tf, struct pcb *pcb) 925 { 926 struct vm_map_entry *ent; 927 struct pmap *pm = vm_map_pmap(map); 928 vaddr_t va = 0; 929 930 vm_map_lock_read(map); 931 for (ent = (&map->header)->next; ent != &map->header; ent = ent->next) { 932 /* 933 * This entry has greater va than the entries before. 934 * We need to make it point to the last page, not past it. 935 */ 936 if (ent->protection & VM_PROT_EXECUTE) 937 va = trunc_page(ent->end) - PAGE_SIZE; 938 } 939 vm_map_unlock_read(map); 940 if (va == pm->pm_hiexec && tf->tf_cs == GSEL(GUCODEBIG_SEL, SEL_UPL)) 941 return 0; 942 943 pm->pm_hiexec = va; 944 if (pm->pm_hiexec > I386_MAX_EXE_ADDR) { 945 tf->tf_cs = GSEL(GUCODEBIG_SEL, SEL_UPL); 946 } else { 947 tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL); 948 return 0; 949 } 950 return 1; 951 } 952 #endif /* !defined(__x86_64__) */ 953 954 void 955 pat_init(struct cpu_info *ci) 956 { 957 #ifndef XENPV 958 uint64_t pat; 959 960 if (!(ci->ci_feat_val[0] & CPUID_PAT)) 961 return; 962 963 /* We change WT to WC. Leave all other entries the default values. 
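 * The resulting PAT, as programmed just below, is therefore:
 *
 *	PAT0 WB    PAT1 WC (default WT)    PAT2 UC-    PAT3 UC
 *	PAT4 WB    PAT5 WC (default WT)    PAT6 UC-    PAT7 UC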
*/ 964 pat = PATENTRY(0, PAT_WB) | PATENTRY(1, PAT_WC) | 965 PATENTRY(2, PAT_UCMINUS) | PATENTRY(3, PAT_UC) | 966 PATENTRY(4, PAT_WB) | PATENTRY(5, PAT_WC) | 967 PATENTRY(6, PAT_UCMINUS) | PATENTRY(7, PAT_UC); 968 969 wrmsr(MSR_CR_PAT, pat); 970 cpu_pat_enabled = true; 971 #endif 972 } 973 974 static pt_entry_t 975 pmap_pat_flags(u_int flags) 976 { 977 u_int cacheflags = (flags & PMAP_CACHE_MASK); 978 979 if (!cpu_pat_enabled) { 980 switch (cacheflags) { 981 case PMAP_NOCACHE: 982 case PMAP_NOCACHE_OVR: 983 /* results in PGC_UCMINUS on cpus which have 984 * the cpuid PAT but PAT "disabled" 985 */ 986 return PTE_PCD; 987 default: 988 return 0; 989 } 990 } 991 992 switch (cacheflags) { 993 case PMAP_NOCACHE: 994 return PGC_UC; 995 case PMAP_WRITE_COMBINE: 996 return PGC_WC; 997 case PMAP_WRITE_BACK: 998 return PGC_WB; 999 case PMAP_NOCACHE_OVR: 1000 return PGC_UCMINUS; 1001 } 1002 1003 return 0; 1004 } 1005 1006 /* 1007 * p m a p k e n t e r f u n c t i o n s 1008 * 1009 * functions to quickly enter/remove pages from the kernel address 1010 * space. pmap_kremove is exported to MI kernel. we make use of 1011 * the recursive PTE mappings. 1012 */ 1013 1014 /* 1015 * pmap_kenter_pa: enter a kernel mapping without R/M (pv_entry) tracking 1016 * 1017 * => no need to lock anything, assume va is already allocated 1018 * => should be faster than normal pmap enter function 1019 */ 1020 void 1021 pmap_kenter_pa(vaddr_t va, paddr_t pa, vm_prot_t prot, u_int flags) 1022 { 1023 pt_entry_t *pte, opte, npte; 1024 1025 KASSERT(!(prot & ~VM_PROT_ALL)); 1026 1027 if (va < VM_MIN_KERNEL_ADDRESS) 1028 pte = vtopte(va); 1029 else 1030 pte = kvtopte(va); 1031 #if defined(XENPV) && defined(DOM0OPS) 1032 if (pa < pmap_pa_start || pa >= pmap_pa_end) { 1033 #ifdef DEBUG 1034 printf_nolog("%s: pa %#" PRIxPADDR " for va %#" PRIxVADDR 1035 " outside range\n", __func__, pa, va); 1036 #endif /* DEBUG */ 1037 npte = pa; 1038 } else 1039 #endif /* XENPV && DOM0OPS */ 1040 npte = pmap_pa2pte(pa); 1041 npte |= protection_codes[prot] | PTE_P | pmap_pg_g; 1042 npte |= pmap_pat_flags(flags); 1043 opte = pmap_pte_testset(pte, npte); /* zap! */ 1044 1045 /* 1046 * XXX: make sure we are not dealing with a large page, since the only 1047 * large pages created are for the kernel image, and they should never 1048 * be kentered. 1049 */ 1050 KASSERTMSG(!(opte & PTE_PS), "PTE_PS va=%#"PRIxVADDR, va); 1051 1052 if ((opte & (PTE_P | PTE_A)) == (PTE_P | PTE_A)) { 1053 /* This should not happen. */ 1054 printf_nolog("%s: mapping already present\n", __func__); 1055 kpreempt_disable(); 1056 pmap_tlb_shootdown(pmap_kernel(), va, opte, TLBSHOOT_KENTER); 1057 kpreempt_enable(); 1058 } 1059 } 1060 1061 __strict_weak_alias(pmap_kenter_ma, pmap_kenter_pa); 1062 1063 #if defined(__x86_64__) 1064 /* 1065 * Change protection for a virtual address. Local for a CPU only, don't 1066 * care about TLB shootdowns. 
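 * A typical caller (illustrative) brackets it with kpreempt_disable():
 *
 *	kpreempt_disable();
 *	pmap_changeprot_local(va, VM_PROT_READ);
 *	kpreempt_enable();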
1067 * 1068 * => must be called with preemption disabled 1069 */ 1070 void 1071 pmap_changeprot_local(vaddr_t va, vm_prot_t prot) 1072 { 1073 pt_entry_t *pte, opte, npte; 1074 1075 KASSERT(kpreempt_disabled()); 1076 1077 if (va < VM_MIN_KERNEL_ADDRESS) 1078 pte = vtopte(va); 1079 else 1080 pte = kvtopte(va); 1081 1082 npte = opte = *pte; 1083 1084 if ((prot & VM_PROT_WRITE) != 0) 1085 npte |= PTE_W; 1086 else 1087 npte &= ~(PTE_W|PTE_D); 1088 1089 if (opte != npte) { 1090 pmap_pte_set(pte, npte); 1091 pmap_pte_flush(); 1092 invlpg(va); 1093 } 1094 } 1095 #endif /* defined(__x86_64__) */ 1096 1097 /* 1098 * pmap_kremove: remove a kernel mapping(s) without R/M (pv_entry) tracking 1099 * 1100 * => no need to lock anything 1101 * => caller must dispose of any vm_page mapped in the va range 1102 * => note: not an inline function 1103 * => we assume the va is page aligned and the len is a multiple of PAGE_SIZE 1104 * => we assume kernel only unmaps valid addresses and thus don't bother 1105 * checking the valid bit before doing TLB flushing 1106 * => must be followed by call to pmap_update() before reuse of page 1107 */ 1108 static void 1109 pmap_kremove1(vaddr_t sva, vsize_t len, bool localonly) 1110 { 1111 pt_entry_t *pte, opte; 1112 vaddr_t va, eva; 1113 1114 eva = sva + len; 1115 1116 kpreempt_disable(); 1117 for (va = sva; va < eva; va += PAGE_SIZE) { 1118 pte = kvtopte(va); 1119 opte = pmap_pte_testset(pte, 0); /* zap! */ 1120 if ((opte & (PTE_P | PTE_A)) == (PTE_P | PTE_A) && !localonly) { 1121 pmap_tlb_shootdown(pmap_kernel(), va, opte, 1122 TLBSHOOT_KREMOVE); 1123 } 1124 KASSERTMSG((opte & PTE_PS) == 0, 1125 "va %#" PRIxVADDR " is a large page", va); 1126 KASSERTMSG((opte & PTE_PVLIST) == 0, 1127 "va %#" PRIxVADDR " is a pv tracked page", va); 1128 } 1129 if (localonly) { 1130 tlbflushg(); 1131 } 1132 kpreempt_enable(); 1133 } 1134 1135 void 1136 pmap_kremove(vaddr_t sva, vsize_t len) 1137 { 1138 1139 pmap_kremove1(sva, len, false); 1140 } 1141 1142 /* 1143 * pmap_kremove_local: like pmap_kremove(), but only worry about 1144 * TLB invalidations on the current CPU. this is only intended 1145 * for use while writing kernel crash dumps, either after panic 1146 * or via reboot -d. 1147 */ 1148 void 1149 pmap_kremove_local(vaddr_t sva, vsize_t len) 1150 { 1151 1152 pmap_kremove1(sva, len, true); 1153 } 1154 1155 /* 1156 * p m a p i n i t f u n c t i o n s 1157 * 1158 * pmap_bootstrap and pmap_init are called during system startup 1159 * to init the pmap module. pmap_bootstrap() does a low level 1160 * init just to get things rolling. pmap_init() finishes the job. 1161 */ 1162 1163 /* 1164 * pmap_bootstrap_valloc: allocate a virtual address in the bootstrap area. 1165 * This function is to be used before any VM system has been set up. 1166 * 1167 * The va is taken from virtual_avail. 1168 */ 1169 static vaddr_t 1170 pmap_bootstrap_valloc(size_t npages) 1171 { 1172 vaddr_t va = virtual_avail; 1173 virtual_avail += npages * PAGE_SIZE; 1174 return va; 1175 } 1176 1177 /* 1178 * pmap_bootstrap_palloc: allocate a physical address in the bootstrap area. 1179 * This function is to be used before any VM system has been set up. 1180 * 1181 * The pa is taken from avail_start. 1182 */ 1183 static paddr_t 1184 pmap_bootstrap_palloc(size_t npages) 1185 { 1186 paddr_t pa = avail_start; 1187 avail_start += npages * PAGE_SIZE; 1188 return pa; 1189 } 1190 1191 /* 1192 * pmap_bootstrap: get the system in a state where it can run with VM properly 1193 * enabled (called before main()). 
The VM system is fully init'd later. 1194 * 1195 * => on i386, locore.S has already enabled the MMU by allocating a PDP for the 1196 * kernel, and nkpde PTP's for the kernel. 1197 * => kva_start is the first free virtual address in kernel space. 1198 */ 1199 void 1200 pmap_bootstrap(vaddr_t kva_start) 1201 { 1202 struct pmap *kpm; 1203 int i; 1204 vaddr_t kva; 1205 1206 pmap_pg_nx = (cpu_feature[2] & CPUID_NOX ? PTE_NX : 0); 1207 1208 /* 1209 * Set up our local static global vars that keep track of the usage of 1210 * KVM before kernel_map is set up. 1211 */ 1212 virtual_avail = kva_start; /* first free KVA */ 1213 virtual_end = VM_MAX_KERNEL_ADDRESS; /* last KVA */ 1214 1215 /* 1216 * Set up protection_codes: we need to be able to convert from a MI 1217 * protection code (some combo of VM_PROT...) to something we can jam 1218 * into a x86 PTE. 1219 */ 1220 protection_codes[VM_PROT_NONE] = pmap_pg_nx; 1221 protection_codes[VM_PROT_EXECUTE] = PTE_X; 1222 protection_codes[VM_PROT_READ] = pmap_pg_nx; 1223 protection_codes[VM_PROT_READ|VM_PROT_EXECUTE] = PTE_X; 1224 protection_codes[VM_PROT_WRITE] = PTE_W | pmap_pg_nx; 1225 protection_codes[VM_PROT_WRITE|VM_PROT_EXECUTE] = PTE_W | PTE_X; 1226 protection_codes[VM_PROT_WRITE|VM_PROT_READ] = PTE_W | pmap_pg_nx; 1227 protection_codes[VM_PROT_ALL] = PTE_W | PTE_X; 1228 1229 /* 1230 * Now we init the kernel's pmap. 1231 * 1232 * The kernel pmap's pm_obj is not used for much. However, in user pmaps 1233 * the pm_obj contains the list of active PTPs. 1234 */ 1235 kpm = pmap_kernel(); 1236 mutex_init(&kpm->pm_lock, MUTEX_DEFAULT, IPL_NONE); 1237 rw_init(&kpm->pm_dummy_lock); 1238 for (i = 0; i < PTP_LEVELS - 1; i++) { 1239 uvm_obj_init(&kpm->pm_obj[i], &pmap_pager, false, 1); 1240 uvm_obj_setlock(&kpm->pm_obj[i], &kpm->pm_dummy_lock); 1241 kpm->pm_ptphint[i] = NULL; 1242 } 1243 memset(&kpm->pm_list, 0, sizeof(kpm->pm_list)); /* pm_list not used */ 1244 1245 kpm->pm_pdir = (pd_entry_t *)bootspace.pdir; 1246 for (i = 0; i < PDP_SIZE; i++) 1247 kpm->pm_pdirpa[i] = PDPpaddr + PAGE_SIZE * i; 1248 1249 kpm->pm_stats.wired_count = kpm->pm_stats.resident_count = 1250 x86_btop(kva_start - VM_MIN_KERNEL_ADDRESS); 1251 1252 kcpuset_create(&kpm->pm_cpus, true); 1253 kcpuset_create(&kpm->pm_kernel_cpus, true); 1254 1255 kpm->pm_ldt = NULL; 1256 kpm->pm_ldt_sel = GSYSSEL(GLDT_SEL, SEL_KPL); 1257 1258 /* 1259 * the above is just a rough estimate and not critical to the proper 1260 * operation of the system. 1261 */ 1262 1263 #if !defined(XENPV) 1264 /* 1265 * Begin to enable global TLB entries if they are supported: add PTE_G 1266 * attribute to already mapped kernel pages. Do that only if SVS is 1267 * disabled. 1268 * 1269 * The G bit has no effect until the CR4_PGE bit is set in CR4, which 1270 * happens later in cpu_init(). 1271 */ 1272 #ifdef SVS 1273 if (!svs_enabled && (cpu_feature[0] & CPUID_PGE)) { 1274 #else 1275 if (cpu_feature[0] & CPUID_PGE) { 1276 #endif 1277 pmap_pg_g = PTE_G; 1278 pmap_remap_global(); 1279 } 1280 #endif 1281 1282 #ifndef XENPV 1283 /* 1284 * Enable large pages if they are supported. 1285 */ 1286 if (cpu_feature[0] & CPUID_PSE) { 1287 lcr4(rcr4() | CR4_PSE); /* enable hardware (via %cr4) */ 1288 pmap_largepages = 1; /* enable software */ 1289 1290 /* 1291 * The TLB must be flushed after enabling large pages on Pentium 1292 * CPUs, according to section 3.6.2.2 of "Intel Architecture 1293 * Software Developer's Manual, Volume 3: System Programming". 1294 */ 1295 tlbflushg(); 1296 1297 /* Remap the kernel. 
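 * pmap_remap_largepages() below rewrites the L2 entries covering the
 * suitably aligned parts of the kernel text, rodata and data as
 * NBPD_L2-sized PTE_PS mappings.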
*/ 1298 pmap_remap_largepages(); 1299 } 1300 pmap_init_lapic(); 1301 #endif /* !XENPV */ 1302 1303 #ifdef __HAVE_PCPU_AREA 1304 pmap_init_pcpu(); 1305 #endif 1306 1307 #ifdef __HAVE_DIRECT_MAP 1308 pmap_init_directmap(kpm); 1309 #else 1310 pmap_vpage_cpualloc(&cpu_info_primary); 1311 1312 if (VM_MIN_KERNEL_ADDRESS == KERNBASE) { /* i386 */ 1313 early_zerop = (void *)cpu_info_primary.vpage[VPAGE_ZER]; 1314 early_zero_pte = cpu_info_primary.vpage_pte[VPAGE_ZER]; 1315 } else { /* amd64 */ 1316 /* 1317 * zero_pte is stuck at the end of mapped space for the kernel 1318 * image (disjunct from kva space). This is done so that it 1319 * can safely be used in pmap_growkernel (pmap_get_physpage), 1320 * when it's called for the first time. 1321 * XXXfvdl fix this for MULTIPROCESSOR later. 1322 */ 1323 #ifdef XENPV 1324 /* early_zerop initialized in xen_locore() */ 1325 #else 1326 early_zerop = (void *)bootspace.spareva; 1327 #endif 1328 early_zero_pte = PTE_BASE + pl1_i((vaddr_t)early_zerop); 1329 } 1330 #endif 1331 1332 #if defined(XENPV) && defined(__x86_64__) 1333 extern vaddr_t xen_dummy_page; 1334 paddr_t xen_dummy_user_pgd; 1335 1336 /* 1337 * We want a dummy page directory for Xen: when deactivating a pmap, 1338 * Xen will still consider it active. So we set user PGD to this one 1339 * to lift all protection on the now inactive page tables set. 1340 */ 1341 xen_dummy_user_pgd = xen_dummy_page - KERNBASE; 1342 1343 /* Zero fill it, the less checks in Xen it requires the better */ 1344 memset(PAGE_ALIGNED(xen_dummy_user_pgd + KERNBASE), 0, PAGE_SIZE); 1345 /* Mark read-only */ 1346 HYPERVISOR_update_va_mapping(xen_dummy_user_pgd + KERNBASE, 1347 pmap_pa2pte(xen_dummy_user_pgd) | PTE_P | pmap_pg_nx, 1348 UVMF_INVLPG); 1349 /* Pin as L4 */ 1350 xpq_queue_pin_l4_table(xpmap_ptom_masked(xen_dummy_user_pgd)); 1351 #endif 1352 1353 /* 1354 * Allocate space for the Interrupt Descriptor Table (IDT), 1355 * Global Descriptor Table (GDT), and Local Descriptor Table 1356 * (LDT). 1357 * 1358 * Currently there is an initial temporary GDT allocated on the 1359 * stack by the caller of init386/init_x86_64, which is (among 1360 * other things) needed on i386 for %fs-relative addressing for 1361 * CPU-local data (CPUVAR(...), curcpu(), curlwp). This 1362 * initial temporary GDT will be popped off the stack before we 1363 * can enter main, so we need to make sure there is space for a 1364 * second temporary GDT to continue existing when we enter main 1365 * before we allocate space for the permanent GDT with 1366 * uvm_km(9) in gdt_init via cpu_startup and switch to that. 
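 * All of the space reserved below comes from the boot-time bump
 * allocators (pmap_bootstrap_valloc/pmap_bootstrap_palloc), one page of
 * VA and one page of PA at a time.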
1367 */ 1368 idt_vaddr = pmap_bootstrap_valloc(1); 1369 idt_paddr = pmap_bootstrap_palloc(1); 1370 1371 gdt_vaddr = pmap_bootstrap_valloc(1); 1372 gdt_paddr = pmap_bootstrap_palloc(1); 1373 1374 #ifdef __HAVE_PCPU_AREA 1375 ldt_vaddr = (vaddr_t)&pcpuarea->ldt; 1376 #else 1377 ldt_vaddr = pmap_bootstrap_valloc(1); 1378 #endif 1379 ldt_paddr = pmap_bootstrap_palloc(1); 1380 1381 #if !defined(__x86_64__) 1382 /* pentium f00f bug stuff */ 1383 pentium_idt_vaddr = pmap_bootstrap_valloc(1); 1384 #endif 1385 1386 #if defined(XENPVHVM) 1387 /* XXX: move to hypervisor.c with appropriate API adjustments */ 1388 extern paddr_t HYPERVISOR_shared_info_pa; 1389 extern volatile struct xencons_interface *xencons_interface; /* XXX */ 1390 extern struct xenstore_domain_interface *xenstore_interface; /* XXX */ 1391 1392 if (vm_guest != VM_GUEST_XENPVH) { 1393 HYPERVISOR_shared_info = (void *) pmap_bootstrap_valloc(1); 1394 HYPERVISOR_shared_info_pa = pmap_bootstrap_palloc(1); 1395 } 1396 xencons_interface = (void *) pmap_bootstrap_valloc(1); 1397 xenstore_interface = (void *) pmap_bootstrap_valloc(1); 1398 #endif 1399 /* 1400 * Now we reserve some VM for mapping pages when doing a crash dump. 1401 */ 1402 virtual_avail = reserve_dumppages(virtual_avail); 1403 1404 /* 1405 * Init the global lock and global list. 1406 */ 1407 mutex_init(&pmaps_lock, MUTEX_DEFAULT, IPL_NONE); 1408 LIST_INIT(&pmaps); 1409 1410 /* 1411 * Ensure the TLB is sync'd with reality by flushing it... 1412 */ 1413 tlbflushg(); 1414 1415 /* 1416 * Calculate pmap_maxkvaddr from nkptp[]. 1417 */ 1418 kva = VM_MIN_KERNEL_ADDRESS; 1419 for (i = PTP_LEVELS - 1; i >= 1; i--) { 1420 kva += nkptp[i] * nbpd[i]; 1421 } 1422 pmap_maxkvaddr = kva; 1423 } 1424 1425 #ifndef XENPV 1426 static void 1427 pmap_init_lapic(void) 1428 { 1429 /* 1430 * On CPUs that have no LAPIC, local_apic_va is never kentered. But our 1431 * x86 implementation relies a lot on this address to be valid; so just 1432 * allocate a fake physical page that will be kentered into 1433 * local_apic_va by machdep. 1434 * 1435 * If the LAPIC is present, the va will be remapped somewhere else 1436 * later in lapic_map. 1437 */ 1438 local_apic_va = pmap_bootstrap_valloc(1); 1439 local_apic_pa = pmap_bootstrap_palloc(1); 1440 } 1441 #endif 1442 1443 #ifdef __x86_64__ 1444 static size_t 1445 pmap_pagetree_nentries_range(vaddr_t startva, vaddr_t endva, size_t pgsz) 1446 { 1447 size_t npages; 1448 npages = (roundup(endva, pgsz) / pgsz) - 1449 (rounddown(startva, pgsz) / pgsz); 1450 return npages; 1451 } 1452 #endif 1453 1454 #if defined(__HAVE_DIRECT_MAP) || defined(KASAN) || defined(KMSAN) 1455 static inline void 1456 slotspace_copy(int type, pd_entry_t *dst, pd_entry_t *src) 1457 { 1458 size_t sslot = slotspace.area[type].sslot; 1459 size_t nslot = slotspace.area[type].nslot; 1460 1461 memcpy(&dst[sslot], &src[sslot], nslot * sizeof(pd_entry_t)); 1462 } 1463 #endif 1464 1465 #ifdef __x86_64__ 1466 /* 1467 * Randomize the location of an area. We count the holes in the VM space. We 1468 * randomly select one hole, and then randomly select an area within that hole. 1469 * Finally we update the associated entry in the slotspace structure. 
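 * Illustrative arithmetic: if the chosen hole spans L4 slots
 * [startsl, endsl), the usable window is
 *
 *	winsize = (endsl - startsl) * NBPD_L4 - sz;
 *
 * and the returned address is startva + rounddown(randva % winsize, align).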
1470 */ 1471 vaddr_t 1472 slotspace_rand(int type, size_t sz, size_t align, size_t randhole, 1473 vaddr_t randva) 1474 { 1475 struct { 1476 int start; 1477 int end; 1478 } holes[SLSPACE_NAREAS+1]; 1479 size_t i, nholes, hole; 1480 size_t startsl, endsl, nslots, winsize; 1481 vaddr_t startva, va; 1482 1483 sz = roundup(sz, align); 1484 1485 /* 1486 * Take one more slot with +NBPD_L4, because we may end up choosing 1487 * an area that crosses slots: 1488 * +------+------+------+ 1489 * | Slot | Slot | Slot | 1490 * +------+------+------+ 1491 * [Chosen Area] 1492 * And in that case we must take into account the additional slot 1493 * consumed. 1494 */ 1495 nslots = roundup(sz+NBPD_L4, NBPD_L4) / NBPD_L4; 1496 1497 /* Get the holes. */ 1498 nholes = 0; 1499 size_t curslot = 0 + 256; /* end of SLAREA_USER */ 1500 while (1) { 1501 /* 1502 * Find the first occupied slot after the current one. 1503 * The area between the two is a hole. 1504 */ 1505 size_t minsslot = 512; 1506 size_t minnslot = 0; 1507 for (i = 0; i < SLSPACE_NAREAS; i++) { 1508 if (!slotspace.area[i].active) 1509 continue; 1510 if (slotspace.area[i].sslot >= curslot && 1511 slotspace.area[i].sslot < minsslot) { 1512 minsslot = slotspace.area[i].sslot; 1513 minnslot = slotspace.area[i].nslot; 1514 } 1515 } 1516 1517 /* No hole anymore, stop here. */ 1518 if (minsslot == 512) { 1519 break; 1520 } 1521 1522 /* Register the hole. */ 1523 if (minsslot - curslot >= nslots) { 1524 holes[nholes].start = curslot; 1525 holes[nholes].end = minsslot; 1526 nholes++; 1527 } 1528 1529 /* Skip that hole, and iterate again. */ 1530 curslot = minsslot + minnslot; 1531 } 1532 1533 if (nholes == 0) { 1534 panic("%s: impossible", __func__); 1535 } 1536 1537 /* Select a hole. */ 1538 hole = randhole; 1539 #ifdef NO_X86_ASLR 1540 hole = 0; 1541 #endif 1542 hole %= nholes; 1543 startsl = holes[hole].start; 1544 endsl = holes[hole].end; 1545 startva = VA_SIGN_NEG(startsl * NBPD_L4); 1546 1547 /* Select an area within the hole. */ 1548 va = randva; 1549 #ifdef NO_X86_ASLR 1550 va = 0; 1551 #endif 1552 winsize = ((endsl - startsl) * NBPD_L4) - sz; 1553 va %= winsize; 1554 va = rounddown(va, align); 1555 va += startva; 1556 1557 /* Update the entry. */ 1558 slotspace.area[type].sslot = pl4_i(va); 1559 slotspace.area[type].nslot = 1560 pmap_pagetree_nentries_range(va, va+sz, NBPD_L4); 1561 slotspace.area[type].active = true; 1562 1563 return va; 1564 } 1565 #endif 1566 1567 #ifdef __HAVE_PCPU_AREA 1568 static void 1569 pmap_init_pcpu(void) 1570 { 1571 const vaddr_t startva = PMAP_PCPU_BASE; 1572 size_t nL4e, nL3e, nL2e, nL1e; 1573 size_t L4e_idx, L3e_idx, L2e_idx, L1e_idx __diagused; 1574 paddr_t pa; 1575 vaddr_t endva; 1576 vaddr_t tmpva; 1577 pt_entry_t *pte; 1578 size_t size; 1579 int i; 1580 1581 const pd_entry_t pteflags = PTE_P | PTE_W | pmap_pg_nx; 1582 1583 size = sizeof(struct pcpu_area); 1584 1585 endva = startva + size; 1586 1587 /* We will use this temporary va. 
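 * Each new page-table page allocated below is mapped at this VA through
 * a single spare PTE just long enough to be zeroed, then the relevant
 * L4/L3/L2 entry is pointed at it; the spare PTE is cleared again once
 * the tree has been built.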
*/ 1588 tmpva = bootspace.spareva; 1589 pte = PTE_BASE + pl1_i(tmpva); 1590 1591 /* Build L4 */ 1592 L4e_idx = pl4_i(startva); 1593 nL4e = pmap_pagetree_nentries_range(startva, endva, NBPD_L4); 1594 KASSERT(nL4e == 1); 1595 for (i = 0; i < nL4e; i++) { 1596 KASSERT(L4_BASE[L4e_idx+i] == 0); 1597 1598 pa = pmap_bootstrap_palloc(1); 1599 *pte = (pa & PTE_FRAME) | pteflags; 1600 pmap_update_pg(tmpva); 1601 memset(PAGE_ALIGNED(tmpva), 0, PAGE_SIZE); 1602 1603 L4_BASE[L4e_idx+i] = pa | pteflags | PTE_A; 1604 } 1605 1606 /* Build L3 */ 1607 L3e_idx = pl3_i(startva); 1608 nL3e = pmap_pagetree_nentries_range(startva, endva, NBPD_L3); 1609 for (i = 0; i < nL3e; i++) { 1610 KASSERT(L3_BASE[L3e_idx+i] == 0); 1611 1612 pa = pmap_bootstrap_palloc(1); 1613 *pte = (pa & PTE_FRAME) | pteflags; 1614 pmap_update_pg(tmpva); 1615 memset(PAGE_ALIGNED(tmpva), 0, PAGE_SIZE); 1616 1617 L3_BASE[L3e_idx+i] = pa | pteflags | PTE_A; 1618 } 1619 1620 /* Build L2 */ 1621 L2e_idx = pl2_i(startva); 1622 nL2e = pmap_pagetree_nentries_range(startva, endva, NBPD_L2); 1623 for (i = 0; i < nL2e; i++) { 1624 1625 KASSERT(L2_BASE[L2e_idx+i] == 0); 1626 1627 pa = pmap_bootstrap_palloc(1); 1628 *pte = (pa & PTE_FRAME) | pteflags; 1629 pmap_update_pg(tmpva); 1630 memset(PAGE_ALIGNED(tmpva), 0, PAGE_SIZE); 1631 1632 L2_BASE[L2e_idx+i] = pa | pteflags | PTE_A; 1633 } 1634 1635 /* Build L1 */ 1636 L1e_idx = pl1_i(startva); 1637 nL1e = pmap_pagetree_nentries_range(startva, endva, NBPD_L1); 1638 for (i = 0; i < nL1e; i++) { 1639 /* 1640 * Nothing to do, the PTEs will be entered via 1641 * pmap_kenter_pa. 1642 */ 1643 KASSERT(L1_BASE[L1e_idx+i] == 0); 1644 } 1645 1646 *pte = 0; 1647 pmap_update_pg(tmpva); 1648 1649 pcpuarea = (struct pcpu_area *)startva; 1650 1651 tlbflush(); 1652 } 1653 #endif 1654 1655 #ifdef __HAVE_DIRECT_MAP 1656 static void 1657 randomize_hole(size_t *randholep, vaddr_t *randvap) 1658 { 1659 struct nist_hash_drbg drbg; 1660 uint8_t seed[NIST_HASH_DRBG_SEEDLEN_BYTES]; 1661 const char p[] = "x86/directmap"; 1662 int error; 1663 1664 entropy_extract(seed, sizeof(seed), 0); 1665 1666 error = nist_hash_drbg_instantiate(&drbg, seed, sizeof(seed), 1667 /*nonce*/NULL, 0, 1668 /*personalization*/p, strlen(p)); 1669 KASSERTMSG(error == 0, "error=%d", error); 1670 1671 error = nist_hash_drbg_generate(&drbg, randholep, sizeof(*randholep), 1672 /*additional*/NULL, 0); 1673 KASSERTMSG(error == 0, "error=%d", error); 1674 1675 error = nist_hash_drbg_generate(&drbg, randvap, sizeof(*randvap), 1676 /*additional*/NULL, 0); 1677 KASSERTMSG(error == 0, "error=%d", error); 1678 1679 explicit_memset(seed, 0, sizeof(seed)); 1680 explicit_memset(&drbg, 0, sizeof(drbg)); 1681 } 1682 1683 /* 1684 * Create the amd64 direct map. Called only once at boot time. We map all of 1685 * the physical memory contiguously using 2MB large pages, with RW permissions. 1686 * However there is a hole: the kernel is mapped with RO permissions. 
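 * Afterwards any physical address below the end of RAM can be accessed
 * at pmap_direct_base + pa, with no temporary mapping required.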
1687 */ 1688 static void 1689 pmap_init_directmap(struct pmap *kpm) 1690 { 1691 extern phys_ram_seg_t mem_clusters[]; 1692 extern int mem_cluster_cnt; 1693 1694 vaddr_t startva; 1695 size_t nL4e, nL3e, nL2e; 1696 size_t L4e_idx, L3e_idx, L2e_idx; 1697 size_t spahole, epahole; 1698 paddr_t lastpa, pa; 1699 vaddr_t endva; 1700 vaddr_t tmpva; 1701 pt_entry_t *pte; 1702 phys_ram_seg_t *mc; 1703 int i; 1704 size_t randhole; 1705 vaddr_t randva; 1706 1707 const pd_entry_t pteflags = PTE_P | PTE_W | pmap_pg_nx; 1708 const pd_entry_t holepteflags = PTE_P | pmap_pg_nx; 1709 1710 CTASSERT(NL4_SLOT_DIRECT * NBPD_L4 == MAXPHYSMEM); 1711 1712 spahole = roundup(bootspace.head.pa, NBPD_L2); 1713 epahole = rounddown(bootspace.boot.pa, NBPD_L2); 1714 1715 /* Get the last physical address available */ 1716 lastpa = 0; 1717 for (i = 0; i < mem_cluster_cnt; i++) { 1718 mc = &mem_clusters[i]; 1719 lastpa = MAX(lastpa, mc->start + mc->size); 1720 } 1721 1722 /* 1723 * x86_add_cluster should have truncated the memory to MAXPHYSMEM. 1724 */ 1725 if (lastpa > MAXPHYSMEM) { 1726 panic("pmap_init_directmap: lastpa incorrect"); 1727 } 1728 1729 randomize_hole(&randhole, &randva); 1730 startva = slotspace_rand(SLAREA_DMAP, lastpa, NBPD_L2, 1731 randhole, randva); 1732 endva = startva + lastpa; 1733 1734 /* We will use this temporary va. */ 1735 tmpva = bootspace.spareva; 1736 pte = PTE_BASE + pl1_i(tmpva); 1737 1738 /* Build L4 */ 1739 L4e_idx = pl4_i(startva); 1740 nL4e = pmap_pagetree_nentries_range(startva, endva, NBPD_L4); 1741 KASSERT(nL4e <= NL4_SLOT_DIRECT); 1742 for (i = 0; i < nL4e; i++) { 1743 KASSERT(L4_BASE[L4e_idx+i] == 0); 1744 1745 pa = pmap_bootstrap_palloc(1); 1746 *pte = (pa & PTE_FRAME) | pteflags; 1747 pmap_update_pg(tmpva); 1748 memset(PAGE_ALIGNED(tmpva), 0, PAGE_SIZE); 1749 1750 L4_BASE[L4e_idx+i] = pa | pteflags | PTE_A; 1751 } 1752 1753 /* Build L3 */ 1754 L3e_idx = pl3_i(startva); 1755 nL3e = pmap_pagetree_nentries_range(startva, endva, NBPD_L3); 1756 for (i = 0; i < nL3e; i++) { 1757 KASSERT(L3_BASE[L3e_idx+i] == 0); 1758 1759 pa = pmap_bootstrap_palloc(1); 1760 *pte = (pa & PTE_FRAME) | pteflags; 1761 pmap_update_pg(tmpva); 1762 memset(PAGE_ALIGNED(tmpva), 0, PAGE_SIZE); 1763 1764 L3_BASE[L3e_idx+i] = pa | pteflags | PTE_A; 1765 } 1766 1767 /* Build L2 */ 1768 L2e_idx = pl2_i(startva); 1769 nL2e = pmap_pagetree_nentries_range(startva, endva, NBPD_L2); 1770 for (i = 0; i < nL2e; i++) { 1771 KASSERT(L2_BASE[L2e_idx+i] == 0); 1772 1773 pa = (paddr_t)(i * NBPD_L2); 1774 1775 if (spahole <= pa && pa < epahole) { 1776 L2_BASE[L2e_idx+i] = pa | holepteflags | PTE_A | 1777 PTE_PS | pmap_pg_g; 1778 } else { 1779 L2_BASE[L2e_idx+i] = pa | pteflags | PTE_A | 1780 PTE_PS | pmap_pg_g; 1781 } 1782 } 1783 1784 *pte = 0; 1785 pmap_update_pg(tmpva); 1786 1787 pmap_direct_base = startva; 1788 pmap_direct_end = endva; 1789 1790 tlbflush(); 1791 } 1792 #endif /* __HAVE_DIRECT_MAP */ 1793 1794 #if !defined(XENPV) 1795 /* 1796 * Remap all of the virtual pages created so far with the PTE_G bit. 
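 * Global mappings survive ordinary %cr3 reloads; the bit only takes
 * effect once CR4_PGE is set in cpu_init().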
1797 */ 1798 static void 1799 pmap_remap_global(void) 1800 { 1801 vaddr_t kva, kva_end; 1802 unsigned long p1i; 1803 size_t i; 1804 1805 /* head */ 1806 kva = bootspace.head.va; 1807 kva_end = kva + bootspace.head.sz; 1808 for ( ; kva < kva_end; kva += PAGE_SIZE) { 1809 p1i = pl1_i(kva); 1810 if (pmap_valid_entry(PTE_BASE[p1i])) 1811 PTE_BASE[p1i] |= pmap_pg_g; 1812 } 1813 1814 /* kernel segments */ 1815 for (i = 0; i < BTSPACE_NSEGS; i++) { 1816 if (bootspace.segs[i].type == BTSEG_NONE) { 1817 continue; 1818 } 1819 kva = bootspace.segs[i].va; 1820 kva_end = kva + bootspace.segs[i].sz; 1821 for ( ; kva < kva_end; kva += PAGE_SIZE) { 1822 p1i = pl1_i(kva); 1823 if (pmap_valid_entry(PTE_BASE[p1i])) 1824 PTE_BASE[p1i] |= pmap_pg_g; 1825 } 1826 } 1827 1828 /* boot space */ 1829 kva = bootspace.boot.va; 1830 kva_end = kva + bootspace.boot.sz; 1831 for ( ; kva < kva_end; kva += PAGE_SIZE) { 1832 p1i = pl1_i(kva); 1833 if (pmap_valid_entry(PTE_BASE[p1i])) 1834 PTE_BASE[p1i] |= pmap_pg_g; 1835 } 1836 } 1837 #endif 1838 1839 #ifndef XENPV 1840 /* 1841 * Remap several kernel segments with large pages. We cover as many pages as we 1842 * can. Called only once at boot time, if the CPU supports large pages. 1843 */ 1844 static void 1845 pmap_remap_largepages(void) 1846 { 1847 pd_entry_t *pde; 1848 vaddr_t kva, kva_end; 1849 paddr_t pa; 1850 size_t i; 1851 1852 /* Remap the kernel text using large pages. */ 1853 for (i = 0; i < BTSPACE_NSEGS; i++) { 1854 if (bootspace.segs[i].type != BTSEG_TEXT) { 1855 continue; 1856 } 1857 kva = roundup(bootspace.segs[i].va, NBPD_L2); 1858 if (kva < bootspace.segs[i].va) { 1859 continue; 1860 } 1861 kva_end = rounddown(bootspace.segs[i].va + 1862 bootspace.segs[i].sz, NBPD_L2); 1863 pa = roundup(bootspace.segs[i].pa, NBPD_L2); 1864 for (/* */; kva < kva_end; kva += NBPD_L2, pa += NBPD_L2) { 1865 pde = &L2_BASE[pl2_i(kva)]; 1866 *pde = pa | pmap_pg_g | PTE_PS | PTE_P; 1867 tlbflushg(); 1868 } 1869 } 1870 1871 /* Remap the kernel rodata using large pages. */ 1872 for (i = 0; i < BTSPACE_NSEGS; i++) { 1873 if (bootspace.segs[i].type != BTSEG_RODATA) { 1874 continue; 1875 } 1876 kva = roundup(bootspace.segs[i].va, NBPD_L2); 1877 if (kva < bootspace.segs[i].va) { 1878 continue; 1879 } 1880 kva_end = rounddown(bootspace.segs[i].va + 1881 bootspace.segs[i].sz, NBPD_L2); 1882 pa = roundup(bootspace.segs[i].pa, NBPD_L2); 1883 for (/* */; kva < kva_end; kva += NBPD_L2, pa += NBPD_L2) { 1884 pde = &L2_BASE[pl2_i(kva)]; 1885 *pde = pa | pmap_pg_g | PTE_PS | pmap_pg_nx | PTE_P; 1886 tlbflushg(); 1887 } 1888 } 1889 1890 /* Remap the kernel data+bss using large pages. */ 1891 for (i = 0; i < BTSPACE_NSEGS; i++) { 1892 if (bootspace.segs[i].type != BTSEG_DATA) { 1893 continue; 1894 } 1895 kva = roundup(bootspace.segs[i].va, NBPD_L2); 1896 if (kva < bootspace.segs[i].va) { 1897 continue; 1898 } 1899 kva_end = rounddown(bootspace.segs[i].va + 1900 bootspace.segs[i].sz, NBPD_L2); 1901 pa = roundup(bootspace.segs[i].pa, NBPD_L2); 1902 for (/* */; kva < kva_end; kva += NBPD_L2, pa += NBPD_L2) { 1903 pde = &L2_BASE[pl2_i(kva)]; 1904 *pde = pa | pmap_pg_g | PTE_PS | pmap_pg_nx | PTE_W | PTE_P; 1905 tlbflushg(); 1906 } 1907 } 1908 } 1909 #endif /* !XENPV */ 1910 1911 /* 1912 * pmap_init: called from uvm_init, our job is to get the pmap system ready 1913 * to manage mappings. 1914 */ 1915 void 1916 pmap_init(void) 1917 { 1918 int flags; 1919 1920 /* 1921 * initialize caches. 
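 * Three allocators are set up here: pmap_cache for struct pmap itself,
 * pmap_pdp_pool for PDP pages, and pmap_pvp_cache for PV pages.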
1922 	 */
1923
1924 	pool_cache_bootstrap(&pmap_cache, sizeof(struct pmap), COHERENCY_UNIT,
1925 	    0, 0, "pmappl", NULL, IPL_NONE, pmap_ctor, pmap_dtor, NULL);
1926
1927 #ifdef XENPV
1928 	/*
1929 	 * pool_cache(9) should not touch cached objects, since they
1930 	 * are pinned on xen and R/O for the domU
1931 	 */
1932 	flags = PR_NOTOUCH;
1933 #else
1934 	flags = 0;
1935 #endif
1936
1937 #ifdef PAE
1938 	pool_init(&pmap_pdp_pool, PAGE_SIZE * PDP_SIZE, 0, 0, flags,
1939 	    "pdppl", &pmap_pdp_allocator, IPL_NONE);
1940 #else
1941 	pool_init(&pmap_pdp_pool, PAGE_SIZE, 0, 0, flags,
1942 	    "pdppl", NULL, IPL_NONE);
1943 #endif
1944 	pool_cache_bootstrap(&pmap_pvp_cache, PAGE_SIZE, PAGE_SIZE,
1945 	    0, 0, "pvpage", &pool_allocator_kmem,
1946 	    IPL_NONE, pmap_pvp_ctor, pmap_pvp_dtor, NULL);
1947
1948 	pmap_tlb_init();
1949
1950 	/* XXX: Since cpu_hatch() is only for secondary CPUs. */
1951 	pmap_tlb_cpu_init(curcpu());
1952
1953 	evcnt_attach_dynamic(&pmap_iobmp_evcnt, EVCNT_TYPE_MISC,
1954 	    NULL, "x86", "io bitmap copy");
1955 	evcnt_attach_dynamic(&pmap_ldt_evcnt, EVCNT_TYPE_MISC,
1956 	    NULL, "x86", "ldt sync");
1957
1958 	/*
1959 	 * The kernel doesn't keep track of PTPs, so there's nowhere handy
1960 	 * to hang a tree of pv_entry records.  Dynamically allocated
1961 	 * pv_entry lists are not heavily used in the kernel's pmap (the
1962 	 * usual case is embedded), so cop out and use a single RB tree
1963 	 * to cover them.
1964 	 */
1965 	rb_tree_init(&pmap_kernel_rb, &pmap_rbtree_ops);
1966
1967 	/*
1968 	 * done: pmap module is up (and ready for business)
1969 	 */
1970
1971 	pmap_initialized = true;
1972 }
1973
1974 #ifndef XENPV
1975 /*
1976  * pmap_cpu_init_late: perform late per-CPU initialization.
1977  */
1978 void
1979 pmap_cpu_init_late(struct cpu_info *ci)
1980 {
1981 	/*
1982 	 * The BP has already its own PD page allocated during early
1983 	 * MD startup.
1984 	 */
1985 	if (ci == &cpu_info_primary)
1986 		return;
1987 #ifdef PAE
1988 	cpu_alloc_l3_page(ci);
1989 #endif
1990 }
1991 #endif
1992
1993 #ifndef __HAVE_DIRECT_MAP
1994 CTASSERT(CACHE_LINE_SIZE > sizeof(pt_entry_t));
1995 CTASSERT(CACHE_LINE_SIZE % sizeof(pt_entry_t) == 0);
1996
1997 static void
1998 pmap_vpage_cpualloc(struct cpu_info *ci)
1999 {
2000 	bool primary = (ci == &cpu_info_primary);
2001 	size_t i, npages;
2002 	vaddr_t vabase;
2003 	vsize_t vrange;
2004
2005 	npages = (CACHE_LINE_SIZE / sizeof(pt_entry_t));
2006 	KASSERT(npages >= VPAGE_MAX);
2007 	vrange = npages * PAGE_SIZE;
2008
2009 	if (primary) {
2010 		while ((vabase = pmap_bootstrap_valloc(1)) % vrange != 0) {
2011 			/* Waste some pages to align properly */
2012 		}
2013 		/* The base is aligned, allocate the rest (contiguous) */
2014 		pmap_bootstrap_valloc(npages - 1);
2015 	} else {
2016 		vabase = uvm_km_alloc(kernel_map, vrange, vrange,
2017 		    UVM_KMF_VAONLY);
2018 		if (vabase == 0) {
2019 			panic("%s: failed to allocate tmp VA for CPU %d\n",
2020 			    __func__, cpu_index(ci));
2021 		}
2022 	}
2023
2024 	KASSERT((vaddr_t)&PTE_BASE[pl1_i(vabase)] % CACHE_LINE_SIZE == 0);
2025
2026 	for (i = 0; i < VPAGE_MAX; i++) {
2027 		ci->vpage[i] = vabase + i * PAGE_SIZE;
2028 		ci->vpage_pte[i] = PTE_BASE + pl1_i(ci->vpage[i]);
2029 	}
2030 }
2031
2032 void
2033 pmap_vpage_cpu_init(struct cpu_info *ci)
2034 {
2035 	if (ci == &cpu_info_primary) {
2036 		/* cpu0 already taken care of in pmap_bootstrap */
2037 		return;
2038 	}
2039
2040 	pmap_vpage_cpualloc(ci);
2041 }
2042 #endif
2043
2044 /*
2045  * p v _ e n t r y f u n c t i o n s
2046  */
2047
2048 /*
2049  * pmap_pvp_ctor: pool_cache constructor for PV pages.
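 *
 * Resulting page layout (illustrative sketch only; the KASSERTs in the
 * body check that the header really fits in one pv_entry slot):
 *
 *	+---------------------------+  <- page boundary (obj)
 *	| struct pv_page (header)   |
 *	+---------------------------+
 *	| pv_entry #1               |  these PVE_PER_PVP entries are all
 *	| ...                       |  linked onto pvp_pves, and
 *	| pv_entry #PVE_PER_PVP     |  pvp_nfree starts at PVE_PER_PVP
 *	+---------------------------+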
2050 */ 2051 static int 2052 pmap_pvp_ctor(void *arg, void *obj, int flags) 2053 { 2054 struct pv_page *pvp = (struct pv_page *)obj; 2055 struct pv_entry *pve = (struct pv_entry *)obj + 1; 2056 struct pv_entry *maxpve = pve + PVE_PER_PVP; 2057 2058 KASSERT(sizeof(struct pv_page) <= sizeof(struct pv_entry)); 2059 KASSERT(trunc_page((vaddr_t)obj) == (vaddr_t)obj); 2060 2061 LIST_INIT(&pvp->pvp_pves); 2062 pvp->pvp_nfree = PVE_PER_PVP; 2063 pvp->pvp_pmap = NULL; 2064 2065 for (; pve < maxpve; pve++) { 2066 LIST_INSERT_HEAD(&pvp->pvp_pves, pve, pve_list); 2067 } 2068 2069 return 0; 2070 } 2071 2072 /* 2073 * pmap_pvp_dtor: pool_cache destructor for PV pages. 2074 */ 2075 static void 2076 pmap_pvp_dtor(void *arg, void *obj) 2077 { 2078 struct pv_page *pvp __diagused = obj; 2079 2080 KASSERT(pvp->pvp_pmap == NULL); 2081 KASSERT(pvp->pvp_nfree == PVE_PER_PVP); 2082 } 2083 2084 /* 2085 * pmap_alloc_pv: allocate a PV entry (likely cached with pmap). 2086 */ 2087 static struct pv_entry * 2088 pmap_alloc_pv(struct pmap *pmap) 2089 { 2090 struct pv_entry *pve; 2091 struct pv_page *pvp; 2092 2093 KASSERT(mutex_owned(&pmap->pm_lock)); 2094 2095 if (__predict_false((pvp = LIST_FIRST(&pmap->pm_pvp_part)) == NULL)) { 2096 if ((pvp = LIST_FIRST(&pmap->pm_pvp_full)) != NULL) { 2097 LIST_REMOVE(pvp, pvp_list); 2098 } else { 2099 pvp = pool_cache_get(&pmap_pvp_cache, PR_NOWAIT); 2100 } 2101 if (__predict_false(pvp == NULL)) { 2102 return NULL; 2103 } 2104 /* full -> part */ 2105 LIST_INSERT_HEAD(&pmap->pm_pvp_part, pvp, pvp_list); 2106 pvp->pvp_pmap = pmap; 2107 } 2108 2109 KASSERT(pvp->pvp_pmap == pmap); 2110 KASSERT(pvp->pvp_nfree > 0); 2111 2112 pve = LIST_FIRST(&pvp->pvp_pves); 2113 LIST_REMOVE(pve, pve_list); 2114 pvp->pvp_nfree--; 2115 2116 if (__predict_false(pvp->pvp_nfree == 0)) { 2117 /* part -> empty */ 2118 KASSERT(LIST_EMPTY(&pvp->pvp_pves)); 2119 LIST_REMOVE(pvp, pvp_list); 2120 LIST_INSERT_HEAD(&pmap->pm_pvp_empty, pvp, pvp_list); 2121 } else { 2122 KASSERT(!LIST_EMPTY(&pvp->pvp_pves)); 2123 } 2124 2125 return pve; 2126 } 2127 2128 /* 2129 * pmap_free_pv: delayed free of a PV entry. 2130 */ 2131 static void 2132 pmap_free_pv(struct pmap *pmap, struct pv_entry *pve) 2133 { 2134 struct pv_page *pvp = (struct pv_page *)trunc_page((vaddr_t)pve); 2135 2136 KASSERT(mutex_owned(&pmap->pm_lock)); 2137 KASSERT(pvp->pvp_pmap == pmap); 2138 KASSERT(pvp->pvp_nfree >= 0); 2139 2140 LIST_INSERT_HEAD(&pvp->pvp_pves, pve, pve_list); 2141 pvp->pvp_nfree++; 2142 2143 if (__predict_false(pvp->pvp_nfree == 1)) { 2144 /* empty -> part */ 2145 LIST_REMOVE(pvp, pvp_list); 2146 LIST_INSERT_HEAD(&pmap->pm_pvp_part, pvp, pvp_list); 2147 } else if (__predict_false(pvp->pvp_nfree == PVE_PER_PVP)) { 2148 /* part -> full */ 2149 LIST_REMOVE(pvp, pvp_list); 2150 LIST_INSERT_HEAD(&pmap->pm_pvp_full, pvp, pvp_list); 2151 } 2152 } 2153 2154 /* 2155 * pmap_drain_pv: free full PV pages. 
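 *
 * (For reference, the per-pmap PV page lists move as follows:
 * pmap_alloc_pv() refills pm_pvp_part from pm_pvp_full or the cache and
 * moves a page part -> empty once its last entry is handed out;
 * pmap_free_pv() moves it back empty -> part on the first free and
 * part -> full on the last.  Only pages on pm_pvp_full are returned to
 * pmap_pvp_cache here.)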
2156 */ 2157 static void 2158 pmap_drain_pv(struct pmap *pmap) 2159 { 2160 struct pv_page *pvp; 2161 2162 KASSERT(mutex_owned(&pmap->pm_lock)); 2163 2164 while ((pvp = LIST_FIRST(&pmap->pm_pvp_full)) != NULL) { 2165 LIST_REMOVE(pvp, pvp_list); 2166 KASSERT(pvp->pvp_pmap == pmap); 2167 KASSERT(pvp->pvp_nfree == PVE_PER_PVP); 2168 pvp->pvp_pmap = NULL; 2169 pool_cache_put(&pmap_pvp_cache, pvp); 2170 } 2171 } 2172 2173 /* 2174 * pmap_check_pv: verify {VA, PTP} pair is either tracked/untracked by page 2175 */ 2176 static void 2177 pmap_check_pv(struct pmap *pmap, struct vm_page *ptp, struct pmap_page *pp, 2178 vaddr_t va, bool tracked) 2179 { 2180 #ifdef DEBUG 2181 struct pv_pte *pvpte; 2182 2183 PMAP_CHECK_PP(pp); 2184 2185 mutex_spin_enter(&pp->pp_lock); 2186 for (pvpte = pv_pte_first(pp); pvpte; pvpte = pv_pte_next(pp, pvpte)) { 2187 if (pvpte->pte_ptp == ptp && pvpte->pte_va == va) { 2188 break; 2189 } 2190 } 2191 mutex_spin_exit(&pp->pp_lock); 2192 2193 if (pvpte && !tracked) { 2194 panic("pmap_check_pv: %p/%lx found on pp %p", ptp, va, pp); 2195 } else if (!pvpte && tracked) { 2196 panic("pmap_check_pv: %p/%lx missing on pp %p", ptp, va, pp); 2197 } 2198 #endif 2199 } 2200 2201 /* 2202 * pmap_treelookup_pv: search the PV tree for a dynamic entry 2203 * 2204 * => pmap must be locked 2205 */ 2206 static struct pv_entry * 2207 pmap_treelookup_pv(const struct pmap *pmap, const struct vm_page *ptp, 2208 const rb_tree_t *tree, const vaddr_t va) 2209 { 2210 struct pv_entry *pve; 2211 rb_node_t *node; 2212 2213 /* 2214 * Inlined lookup tailored for exactly what's needed here that is 2215 * quite a bit faster than using rb_tree_find_node(). 2216 */ 2217 for (node = tree->rbt_root;;) { 2218 if (__predict_false(RB_SENTINEL_P(node))) { 2219 return NULL; 2220 } 2221 pve = (struct pv_entry *) 2222 ((uintptr_t)node - offsetof(struct pv_entry, pve_rb)); 2223 if (pve->pve_pte.pte_va == va) { 2224 KASSERT(pve->pve_pte.pte_ptp == ptp); 2225 return pve; 2226 } 2227 node = node->rb_nodes[pve->pve_pte.pte_va < va]; 2228 } 2229 } 2230 2231 /* 2232 * pmap_lookup_pv: look up a non-embedded pv entry for the given pmap 2233 * 2234 * => a PV entry must be known present (doesn't check for existence) 2235 * => pmap must be locked 2236 */ 2237 static struct pv_entry * 2238 pmap_lookup_pv(const struct pmap *pmap, const struct vm_page *ptp, 2239 const struct pmap_page * const old_pp, const vaddr_t va) 2240 { 2241 struct pv_entry *pve; 2242 const rb_tree_t *tree; 2243 2244 KASSERT(mutex_owned(&pmap->pm_lock)); 2245 KASSERT(ptp != NULL || pmap == pmap_kernel()); 2246 2247 /* 2248 * [This mostly deals with the case of process-private pages, i.e. 2249 * anonymous memory allocations or COW.] 2250 * 2251 * If the page is tracked with an embedded entry then the tree 2252 * lookup can be avoided. It's safe to check for this specific 2253 * set of values without pp_lock because both will only ever be 2254 * set together for this pmap. 2255 * 2256 */ 2257 if (atomic_load_relaxed(&old_pp->pp_pte.pte_ptp) == ptp && 2258 atomic_load_relaxed(&old_pp->pp_pte.pte_va) == va) { 2259 return NULL; 2260 } 2261 2262 /* 2263 * [This mostly deals with shared mappings, for example shared libs 2264 * and executables.] 2265 * 2266 * Optimise for pmap_remove_ptes() which works by ascending scan: 2267 * look at the lowest numbered node in the tree first. The tree is 2268 * known non-empty because of the check above. For short lived 2269 * processes where pmap_remove() isn't used much this gets close to 2270 * a 100% hit rate. 
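 *
 * The leftmost node is taken straight from rbt_minmax[RB_DIR_LEFT] and
 * converted back to its pv_entry by hand, i.e. the equivalent of
 * container_of(node, struct pv_entry, pve_rb).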
2271 	 */
2272 	tree = (ptp != NULL ? &VM_PAGE_TO_PP(ptp)->pp_rb : &pmap_kernel_rb);
2273 	KASSERT(!RB_SENTINEL_P(tree->rbt_root));
2274 	pve = (struct pv_entry *)
2275 	    ((uintptr_t)tree->rbt_minmax[RB_DIR_LEFT] -
2276 	    offsetof(struct pv_entry, pve_rb));
2277 	if (__predict_true(pve->pve_pte.pte_va == va)) {
2278 		KASSERT(pve->pve_pte.pte_ptp == ptp);
2279 		return pve;
2280 	}
2281
2282 	/* Search the RB tree for the key (uncommon). */
2283 	return pmap_treelookup_pv(pmap, ptp, tree, va);
2284 }
2285
2286 /*
2287  * pmap_enter_pv: enter a mapping onto a pmap_page list
2288  *
2289  * => pmap must be locked
2290  * => does NOT insert dynamic entries into the tree (pmap_enter() does that later)
2291  */
2292 static int
2293 pmap_enter_pv(struct pmap *pmap, struct pmap_page *pp, struct vm_page *ptp,
2294     vaddr_t va, struct pv_entry **new_pve, struct pv_entry **old_pve,
2295     bool *samepage, bool *new_embedded, rb_tree_t *tree)
2296 {
2297 	struct pv_entry *pve;
2298 	int error;
2299
2300 	KASSERT(mutex_owned(&pmap->pm_lock));
2301 	KASSERT(ptp_to_pmap(ptp) == pmap);
2302 	KASSERT(ptp == NULL || ptp->uobject != NULL);
2303 	KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset);
2304 	PMAP_CHECK_PP(pp);
2305
2306 	/*
2307 	 * If entering the same page and it's already tracked with an
2308 	 * embedded entry, we can avoid the expense below.  It's safe
2309 	 * to check for this very specific set of values without a lock
2310 	 * because both will only ever be set together for this pmap.
2311 	 */
2312 	if (atomic_load_relaxed(&pp->pp_pte.pte_ptp) == ptp &&
2313 	    atomic_load_relaxed(&pp->pp_pte.pte_va) == va) {
2314 		*samepage = true;
2315 		pmap_check_pv(pmap, ptp, pp, va, true);
2316 		return 0;
2317 	}
2318
2319 	/*
2320 	 * Check for an existing dynamic mapping at this address.  If it's
2321 	 * for the same page, then it will be reused and nothing needs to be
2322 	 * changed.
2323 	 */
2324 	*old_pve = pmap_treelookup_pv(pmap, ptp, tree, va);
2325 	if (*old_pve != NULL && (*old_pve)->pve_pp == pp) {
2326 		*samepage = true;
2327 		pmap_check_pv(pmap, ptp, pp, va, true);
2328 		return 0;
2329 	}
2330
2331 	/*
2332 	 * Need to put a new mapping in place.  Grab a spare pv_entry in
2333 	 * case it's needed; won't know for sure until the lock is taken.
2334 	 */
2335 	if (pmap->pm_pve == NULL) {
2336 		pmap->pm_pve = pmap_alloc_pv(pmap);
2337 	}
2338
2339 	error = 0;
2340 	pmap_check_pv(pmap, ptp, pp, va, false);
2341 	mutex_spin_enter(&pp->pp_lock);
2342 	if (!pv_pte_embedded(pp)) {
2343 		/*
2344 		 * Embedded PV tracking available - easy.
2345 		 */
2346 		pp->pp_pte.pte_ptp = ptp;
2347 		pp->pp_pte.pte_va = va;
2348 		*new_embedded = true;
2349 	} else if (__predict_false(pmap->pm_pve == NULL)) {
2350 		/*
2351 		 * No memory.
2352 		 */
2353 		error = ENOMEM;
2354 	} else {
2355 		/*
2356 		 * Install new pv_entry on the page.
2357 */ 2358 pve = pmap->pm_pve; 2359 pmap->pm_pve = NULL; 2360 *new_pve = pve; 2361 pve->pve_pte.pte_ptp = ptp; 2362 pve->pve_pte.pte_va = va; 2363 pve->pve_pp = pp; 2364 LIST_INSERT_HEAD(&pp->pp_pvlist, pve, pve_list); 2365 } 2366 mutex_spin_exit(&pp->pp_lock); 2367 if (error == 0) { 2368 pmap_check_pv(pmap, ptp, pp, va, true); 2369 } 2370 2371 return error; 2372 } 2373 2374 /* 2375 * pmap_remove_pv: try to remove a mapping from a pv_list 2376 * 2377 * => pmap must be locked 2378 * => removes dynamic entries from tree and frees them 2379 * => caller should adjust ptp's wire_count and free PTP if needed 2380 */ 2381 static void 2382 pmap_remove_pv(struct pmap *pmap, struct pmap_page *pp, struct vm_page *ptp, 2383 vaddr_t va, struct pv_entry *pve, uint8_t oattrs) 2384 { 2385 rb_tree_t *tree = (ptp != NULL ? 2386 &VM_PAGE_TO_PP(ptp)->pp_rb : &pmap_kernel_rb); 2387 2388 KASSERT(mutex_owned(&pmap->pm_lock)); 2389 KASSERT(ptp_to_pmap(ptp) == pmap); 2390 KASSERT(ptp == NULL || ptp->uobject != NULL); 2391 KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset); 2392 KASSERT(ptp != NULL || pmap == pmap_kernel()); 2393 2394 pmap_check_pv(pmap, ptp, pp, va, true); 2395 2396 if (pve == NULL) { 2397 mutex_spin_enter(&pp->pp_lock); 2398 KASSERT(pp->pp_pte.pte_ptp == ptp); 2399 KASSERT(pp->pp_pte.pte_va == va); 2400 pp->pp_attrs |= oattrs; 2401 pp->pp_pte.pte_ptp = NULL; 2402 pp->pp_pte.pte_va = 0; 2403 mutex_spin_exit(&pp->pp_lock); 2404 } else { 2405 mutex_spin_enter(&pp->pp_lock); 2406 KASSERT(pp->pp_pte.pte_ptp != ptp || 2407 pp->pp_pte.pte_va != va); 2408 KASSERT(pve->pve_pte.pte_ptp == ptp); 2409 KASSERT(pve->pve_pte.pte_va == va); 2410 KASSERT(pve->pve_pp == pp); 2411 pp->pp_attrs |= oattrs; 2412 LIST_REMOVE(pve, pve_list); 2413 mutex_spin_exit(&pp->pp_lock); 2414 2415 KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == pve); 2416 rb_tree_remove_node(tree, pve); 2417 #ifdef DIAGNOSTIC 2418 memset(pve, 0, sizeof(*pve)); 2419 #endif 2420 pmap_free_pv(pmap, pve); 2421 } 2422 2423 KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == NULL); 2424 pmap_check_pv(pmap, ptp, pp, va, false); 2425 } 2426 2427 /* 2428 * p t p f u n c t i o n s 2429 */ 2430 2431 static struct vm_page * 2432 pmap_find_ptp(struct pmap *pmap, vaddr_t va, int level) 2433 { 2434 int lidx = level - 1; 2435 off_t off = ptp_va2o(va, level); 2436 struct vm_page *pg; 2437 2438 KASSERT(mutex_owned(&pmap->pm_lock)); 2439 2440 if (pmap->pm_ptphint[lidx] && off == pmap->pm_ptphint[lidx]->offset) { 2441 KASSERT(pmap->pm_ptphint[lidx]->wire_count > 0); 2442 pg = pmap->pm_ptphint[lidx]; 2443 PMAP_CHECK_PP(VM_PAGE_TO_PP(pg)); 2444 return pg; 2445 } 2446 PMAP_DUMMY_LOCK(pmap); 2447 pg = uvm_pagelookup(&pmap->pm_obj[lidx], off); 2448 PMAP_DUMMY_UNLOCK(pmap); 2449 if (pg != NULL && __predict_false(pg->wire_count == 0)) { 2450 /* This page is queued to be freed - ignore. */ 2451 pg = NULL; 2452 } 2453 if (pg != NULL) { 2454 PMAP_CHECK_PP(VM_PAGE_TO_PP(pg)); 2455 } 2456 pmap->pm_ptphint[lidx] = pg; 2457 return pg; 2458 } 2459 2460 static inline void 2461 pmap_freepage(struct pmap *pmap, struct vm_page *ptp, int level) 2462 { 2463 int lidx; 2464 2465 KASSERT(ptp->wire_count <= 1); 2466 PMAP_CHECK_PP(VM_PAGE_TO_PP(ptp)); 2467 2468 lidx = level - 1; 2469 pmap_stats_update(pmap, -ptp->wire_count, 0); 2470 if (pmap->pm_ptphint[lidx] == ptp) 2471 pmap->pm_ptphint[lidx] = NULL; 2472 ptp->wire_count = 0; 2473 ptp->uanon = NULL; 2474 KASSERT(RB_TREE_MIN(&VM_PAGE_TO_PP(ptp)->pp_rb) == NULL); 2475 2476 /* 2477 * Enqueue the PTP to be freed by pmap_update(). 
We can't remove 2478 * the page from the uvm_object, as that can take further locks 2479 * (intolerable right now because the PTEs are likely mapped in). 2480 * Instead mark the PTP as free and if we bump into it again, we'll 2481 * either ignore or reuse (depending on what's useful at the time). 2482 */ 2483 LIST_INSERT_HEAD(&pmap->pm_gc_ptp, ptp, mdpage.mp_pp.pp_link); 2484 } 2485 2486 static void 2487 pmap_free_ptp(struct pmap *pmap, struct vm_page *ptp, vaddr_t va, 2488 pt_entry_t *ptes, pd_entry_t * const *pdes) 2489 { 2490 unsigned long index; 2491 int level; 2492 vaddr_t invaladdr; 2493 pd_entry_t opde; 2494 2495 KASSERT(pmap != pmap_kernel()); 2496 KASSERT(mutex_owned(&pmap->pm_lock)); 2497 KASSERT(kpreempt_disabled()); 2498 2499 level = 1; 2500 do { 2501 index = pl_i(va, level + 1); 2502 opde = pmap_pte_testset(&pdes[level - 1][index], 0); 2503 2504 /* 2505 * On Xen-amd64 or SVS, we need to sync the top level page 2506 * directory on each CPU. 2507 */ 2508 #if defined(XENPV) && defined(__x86_64__) 2509 if (level == PTP_LEVELS - 1) { 2510 xen_kpm_sync(pmap, index); 2511 } 2512 #elif defined(SVS) 2513 if (svs_enabled && level == PTP_LEVELS - 1 && 2514 pmap_is_user(pmap)) { 2515 svs_pmap_sync(pmap, index); 2516 } 2517 #endif 2518 2519 invaladdr = level == 1 ? (vaddr_t)ptes : 2520 (vaddr_t)pdes[level - 2]; 2521 pmap_tlb_shootdown(pmap, invaladdr + index * PAGE_SIZE, 2522 opde, TLBSHOOT_FREE_PTP); 2523 2524 #if defined(XENPV) 2525 pmap_tlb_shootnow(); 2526 #endif 2527 2528 pmap_freepage(pmap, ptp, level); 2529 if (level < PTP_LEVELS - 1) { 2530 ptp = pmap_find_ptp(pmap, va, level + 1); 2531 ptp->wire_count--; 2532 if (ptp->wire_count > 1) 2533 break; 2534 } 2535 } while (++level < PTP_LEVELS); 2536 pmap_pte_flush(); 2537 } 2538 2539 /* 2540 * pmap_get_ptp: get a PTP (if there isn't one, allocate a new one) 2541 * 2542 * => pmap should NOT be pmap_kernel() 2543 * => pmap should be locked 2544 * => we are not touching any PTEs yet, so they need not be mapped in 2545 */ 2546 static int 2547 pmap_get_ptp(struct pmap *pmap, struct pmap_ptparray *pt, vaddr_t va, 2548 int flags, struct vm_page **resultp) 2549 { 2550 struct vm_page *ptp; 2551 int i, aflags; 2552 struct uvm_object *obj; 2553 voff_t off; 2554 2555 KASSERT(pmap != pmap_kernel()); 2556 KASSERT(mutex_owned(&pmap->pm_lock)); 2557 2558 /* 2559 * Loop through all page table levels allocating a page 2560 * for any level where we don't already have one. 2561 */ 2562 memset(pt, 0, sizeof(*pt)); 2563 aflags = ((flags & PMAP_CANFAIL) ? 0 : UVM_PGA_USERESERVE) | 2564 UVM_PGA_ZERO; 2565 for (i = PTP_LEVELS; i > 1; i--) { 2566 obj = &pmap->pm_obj[i - 2]; 2567 off = ptp_va2o(va, i - 1); 2568 2569 PMAP_DUMMY_LOCK(pmap); 2570 pt->pg[i] = uvm_pagelookup(obj, off); 2571 2572 if (pt->pg[i] == NULL) { 2573 pt->pg[i] = uvm_pagealloc(obj, off, NULL, aflags); 2574 pt->alloced[i] = (pt->pg[i] != NULL); 2575 } else if (pt->pg[i]->wire_count == 0) { 2576 /* This page was queued to be freed; dequeue it. 
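			 * (The page is still on pm_gc_ptp from an earlier
			 * pmap_freepage(); reusing it here saves waiting for
			 * pmap_update() to reclaim it.)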
*/ 2577 LIST_REMOVE(pt->pg[i], mdpage.mp_pp.pp_link); 2578 pt->alloced[i] = true; 2579 } 2580 PMAP_DUMMY_UNLOCK(pmap); 2581 if (pt->pg[i] == NULL) { 2582 pmap_unget_ptp(pmap, pt); 2583 return ENOMEM; 2584 } else if (pt->alloced[i]) { 2585 pt->pg[i]->uanon = (struct vm_anon *)(vaddr_t)~0L; 2586 rb_tree_init(&VM_PAGE_TO_PP(pt->pg[i])->pp_rb, 2587 &pmap_rbtree_ops); 2588 PMAP_CHECK_PP(VM_PAGE_TO_PP(pt->pg[i])); 2589 } 2590 } 2591 ptp = pt->pg[2]; 2592 KASSERT(ptp != NULL); 2593 *resultp = ptp; 2594 pmap->pm_ptphint[0] = ptp; 2595 return 0; 2596 } 2597 2598 /* 2599 * pmap_install_ptp: install any freshly allocated PTPs 2600 * 2601 * => pmap should NOT be pmap_kernel() 2602 * => pmap should be locked 2603 * => PTEs must be mapped 2604 * => preemption must be disabled 2605 */ 2606 static void 2607 pmap_install_ptp(struct pmap *pmap, struct pmap_ptparray *pt, vaddr_t va, 2608 pd_entry_t * const *pdes) 2609 { 2610 struct vm_page *ptp; 2611 unsigned long index; 2612 pd_entry_t *pva; 2613 paddr_t pa; 2614 int i; 2615 2616 KASSERT(pmap != pmap_kernel()); 2617 KASSERT(mutex_owned(&pmap->pm_lock)); 2618 KASSERT(kpreempt_disabled()); 2619 2620 /* 2621 * Now that we have all the pages looked up or allocated, 2622 * loop through again installing any new ones into the tree. 2623 */ 2624 for (i = PTP_LEVELS; i > 1; i--) { 2625 index = pl_i(va, i); 2626 pva = pdes[i - 2]; 2627 2628 if (pmap_valid_entry(pva[index])) { 2629 KASSERT(!pt->alloced[i]); 2630 continue; 2631 } 2632 2633 ptp = pt->pg[i]; 2634 ptp->flags &= ~PG_BUSY; /* never busy */ 2635 ptp->wire_count = 1; 2636 pmap->pm_ptphint[i - 2] = ptp; 2637 pa = VM_PAGE_TO_PHYS(ptp); 2638 pmap_pte_set(&pva[index], (pd_entry_t) 2639 (pmap_pa2pte(pa) | PTE_U | PTE_W | PTE_P)); 2640 2641 /* 2642 * On Xen-amd64 or SVS, we need to sync the top level page 2643 * directory on each CPU. 2644 */ 2645 #if defined(XENPV) && defined(__x86_64__) 2646 if (i == PTP_LEVELS) { 2647 xen_kpm_sync(pmap, index); 2648 } 2649 #elif defined(SVS) 2650 if (svs_enabled && i == PTP_LEVELS && 2651 pmap_is_user(pmap)) { 2652 svs_pmap_sync(pmap, index); 2653 } 2654 #endif 2655 2656 pmap_pte_flush(); 2657 pmap_stats_update(pmap, 1, 0); 2658 2659 /* 2660 * If we're not in the top level, increase the 2661 * wire count of the parent page. 2662 */ 2663 if (i < PTP_LEVELS) { 2664 pt->pg[i + 1]->wire_count++; 2665 } 2666 } 2667 } 2668 2669 /* 2670 * pmap_unget_ptp: free unusued PTPs 2671 * 2672 * => pmap should NOT be pmap_kernel() 2673 * => pmap should be locked 2674 */ 2675 static void 2676 pmap_unget_ptp(struct pmap *pmap, struct pmap_ptparray *pt) 2677 { 2678 int i; 2679 2680 KASSERT(pmap != pmap_kernel()); 2681 KASSERT(mutex_owned(&pmap->pm_lock)); 2682 2683 for (i = PTP_LEVELS; i > 1; i--) { 2684 if (!pt->alloced[i]) { 2685 continue; 2686 } 2687 KASSERT(pt->pg[i]->wire_count == 0); 2688 PMAP_CHECK_PP(VM_PAGE_TO_PP(pt->pg[i])); 2689 pmap_freepage(pmap, pt->pg[i], i - 1); 2690 } 2691 } 2692 2693 /* 2694 * p m a p l i f e c y c l e f u n c t i o n s 2695 */ 2696 2697 /* 2698 * pmap_pdp_init: constructor a new PDP. 2699 */ 2700 static void 2701 pmap_pdp_init(pd_entry_t *pdir) 2702 { 2703 paddr_t pdirpa = 0; 2704 vaddr_t object; 2705 int i; 2706 2707 #if !defined(XENPV) || !defined(__x86_64__) 2708 int npde; 2709 #endif 2710 #ifdef XENPV 2711 int s; 2712 #endif 2713 2714 memset(PAGE_ALIGNED(pdir), 0, PDP_SIZE * PAGE_SIZE); 2715 2716 /* 2717 * NOTE: This is all done unlocked, but we will check afterwards 2718 * if we have raced with pmap_growkernel(). 
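	 * (The caller, pmap_ctor(), re-runs this constructor under
	 * pmaps_lock until the last kernel PDE copied below is seen to be
	 * non-zero, i.e. nkptp did not grow while we were copying.)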
2719 */ 2720 2721 #if defined(XENPV) && defined(__x86_64__) 2722 /* Fetch the physical address of the page directory */ 2723 (void)pmap_extract(pmap_kernel(), (vaddr_t)pdir, &pdirpa); 2724 2725 /* 2726 * This pdir will NEVER be active in kernel mode, so mark 2727 * recursive entry invalid. 2728 */ 2729 pdir[PDIR_SLOT_PTE] = pmap_pa2pte(pdirpa); 2730 2731 /* 2732 * PDP constructed this way won't be for the kernel, hence we 2733 * don't put kernel mappings on Xen. 2734 * 2735 * But we need to make pmap_create() happy, so put a dummy 2736 * (without PTE_P) value at the right place. 2737 */ 2738 pdir[PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1] - 1] = 2739 (pd_entry_t)-1 & PTE_FRAME; 2740 #else /* XENPV && __x86_64__*/ 2741 object = (vaddr_t)pdir; 2742 for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) { 2743 /* Fetch the physical address of the page directory */ 2744 (void)pmap_extract(pmap_kernel(), object, &pdirpa); 2745 2746 /* Put in recursive PDE to map the PTEs */ 2747 pdir[PDIR_SLOT_PTE + i] = pmap_pa2pte(pdirpa) | PTE_P | 2748 pmap_pg_nx; 2749 #ifndef XENPV 2750 pdir[PDIR_SLOT_PTE + i] |= PTE_W; 2751 #endif 2752 } 2753 2754 /* Copy the kernel's top level PDE */ 2755 npde = nkptp[PTP_LEVELS - 1]; 2756 2757 memcpy(&pdir[PDIR_SLOT_KERN], &PDP_BASE[PDIR_SLOT_KERN], 2758 npde * sizeof(pd_entry_t)); 2759 2760 if (VM_MIN_KERNEL_ADDRESS != KERNBASE) { 2761 int idx = pl_i(KERNBASE, PTP_LEVELS); 2762 pdir[idx] = PDP_BASE[idx]; 2763 } 2764 2765 #ifdef __HAVE_PCPU_AREA 2766 pdir[PDIR_SLOT_PCPU] = PDP_BASE[PDIR_SLOT_PCPU]; 2767 #endif 2768 #ifdef __HAVE_DIRECT_MAP 2769 slotspace_copy(SLAREA_DMAP, pdir, PDP_BASE); 2770 #endif 2771 #ifdef KASAN 2772 slotspace_copy(SLAREA_ASAN, pdir, PDP_BASE); 2773 #endif 2774 #ifdef KMSAN 2775 slotspace_copy(SLAREA_MSAN, pdir, PDP_BASE); 2776 #endif 2777 #endif /* XENPV && __x86_64__*/ 2778 2779 #ifdef XENPV 2780 s = splvm(); 2781 object = (vaddr_t)pdir; 2782 pmap_protect(pmap_kernel(), object, object + (PAGE_SIZE * PDP_SIZE), 2783 VM_PROT_READ); 2784 pmap_update(pmap_kernel()); 2785 for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) { 2786 /* 2787 * pin as L2/L4 page, we have to do the page with the 2788 * PDIR_SLOT_PTE entries last 2789 */ 2790 #ifdef PAE 2791 if (i == l2tol3(PDIR_SLOT_PTE)) 2792 continue; 2793 #endif 2794 2795 (void) pmap_extract(pmap_kernel(), object, &pdirpa); 2796 #ifdef __x86_64__ 2797 xpq_queue_pin_l4_table(xpmap_ptom_masked(pdirpa)); 2798 #else 2799 xpq_queue_pin_l2_table(xpmap_ptom_masked(pdirpa)); 2800 #endif 2801 } 2802 #ifdef PAE 2803 object = ((vaddr_t)pdir) + PAGE_SIZE * l2tol3(PDIR_SLOT_PTE); 2804 (void)pmap_extract(pmap_kernel(), object, &pdirpa); 2805 xpq_queue_pin_l2_table(xpmap_ptom_masked(pdirpa)); 2806 #endif 2807 splx(s); 2808 #endif /* XENPV */ 2809 } 2810 2811 /* 2812 * pmap_pdp_fini: destructor for the PDPs. 2813 */ 2814 static void 2815 pmap_pdp_fini(pd_entry_t *pdir) 2816 { 2817 #ifdef XENPV 2818 paddr_t pdirpa = 0; /* XXX: GCC */ 2819 vaddr_t object = (vaddr_t)pdir; 2820 int i; 2821 int s = splvm(); 2822 pt_entry_t *pte; 2823 2824 for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) { 2825 /* fetch the physical address of the page directory. 
 */
2826 		(void) pmap_extract(pmap_kernel(), object, &pdirpa);
2827 		/* unpin page table */
2828 		xpq_queue_unpin_table(xpmap_ptom_masked(pdirpa));
2829 	}
2830 	object = (vaddr_t)pdir;
2831 	for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) {
2832 		/* Set page RW again */
2833 		pte = kvtopte(object);
2834 		pmap_pte_set(pte, *pte | PTE_W);
2835 		xen_bcast_invlpg((vaddr_t)object);
2836 	}
2837 	splx(s);
2838 #endif /* XENPV */
2839 }
2840
2841 #ifdef PAE
2842 static void *
2843 pmap_pdp_alloc(struct pool *pp, int flags)
2844 {
2845 	return (void *)uvm_km_alloc(kernel_map,
2846 	    PAGE_SIZE * PDP_SIZE, PAGE_SIZE * PDP_SIZE,
2847 	    ((flags & PR_WAITOK) ? UVM_KMF_WAITVA
2848 	    : UVM_KMF_NOWAIT | UVM_KMF_TRYLOCK) |
2849 	    UVM_KMF_WIRED);
2850 }
2851
2852 static void
2853 pmap_pdp_free(struct pool *pp, void *v)
2854 {
2855 	uvm_km_free(kernel_map, (vaddr_t)v, PAGE_SIZE * PDP_SIZE,
2856 	    UVM_KMF_WIRED);
2857 }
2858 #endif /* PAE */
2859
2860 /*
2861  * pmap_ctor: constructor for the pmap cache.
2862  */
2863 static int
2864 pmap_ctor(void *arg, void *obj, int flags)
2865 {
2866 	struct pmap *pmap = obj;
2867 	pt_entry_t p;
2868 	int i;
2869
2870 	KASSERT((flags & PR_WAITOK) != 0);
2871
2872 	mutex_init(&pmap->pm_lock, MUTEX_DEFAULT, IPL_NONE);
2873 	rw_init(&pmap->pm_dummy_lock);
2874 	kcpuset_create(&pmap->pm_cpus, true);
2875 	kcpuset_create(&pmap->pm_kernel_cpus, true);
2876 #ifdef XENPV
2877 	kcpuset_create(&pmap->pm_xen_ptp_cpus, true);
2878 #endif
2879 	LIST_INIT(&pmap->pm_gc_ptp);
2880 	pmap->pm_pve = NULL;
2881 	LIST_INIT(&pmap->pm_pvp_full);
2882 	LIST_INIT(&pmap->pm_pvp_part);
2883 	LIST_INIT(&pmap->pm_pvp_empty);
2884
2885 	/* allocate and init PDP */
2886 	pmap->pm_pdir = pool_get(&pmap_pdp_pool, PR_WAITOK);
2887
2888 	for (;;) {
2889 		pmap_pdp_init(pmap->pm_pdir);
2890 		mutex_enter(&pmaps_lock);
2891 		p = pmap->pm_pdir[PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1] - 1];
2892 		if (__predict_true(p != 0)) {
2893 			break;
2894 		}
2895 		mutex_exit(&pmaps_lock);
2896 	}
2897
2898 	for (i = 0; i < PDP_SIZE; i++)
2899 		pmap->pm_pdirpa[i] =
2900 		    pmap_pte2pa(pmap->pm_pdir[PDIR_SLOT_PTE + i]);
2901
2902 	LIST_INSERT_HEAD(&pmaps, pmap, pm_list);
2903 	mutex_exit(&pmaps_lock);
2904
2905 	return 0;
2906 }
2907
2908 /*
2909  * pmap_dtor: destructor for the pmap cache.
2910  */
2911 static void
2912 pmap_dtor(void *arg, void *obj)
2913 {
2914 	struct pmap *pmap = obj;
2915
2916 	mutex_enter(&pmaps_lock);
2917 	LIST_REMOVE(pmap, pm_list);
2918 	mutex_exit(&pmaps_lock);
2919
2920 	pmap_pdp_fini(pmap->pm_pdir);
2921 	pool_put(&pmap_pdp_pool, pmap->pm_pdir);
2922 	mutex_destroy(&pmap->pm_lock);
2923 	rw_destroy(&pmap->pm_dummy_lock);
2924 	kcpuset_destroy(pmap->pm_cpus);
2925 	kcpuset_destroy(pmap->pm_kernel_cpus);
2926 #ifdef XENPV
2927 	kcpuset_destroy(pmap->pm_xen_ptp_cpus);
2928 #endif
2929 }
2930
2931 /*
2932  * pmap_create: create a pmap object.
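 *
 * Typical use (illustrative sketch only):
 *
 *	struct pmap *pm = pmap_create();
 *	... enter mappings, activate it, etc ...
 *	pmap_destroy(pm);		drops the reference taken here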
2933 */ 2934 struct pmap * 2935 pmap_create(void) 2936 { 2937 struct pmap *pmap; 2938 int i; 2939 2940 pmap = pool_cache_get(&pmap_cache, PR_WAITOK); 2941 2942 /* init uvm_object */ 2943 for (i = 0; i < PTP_LEVELS - 1; i++) { 2944 uvm_obj_init(&pmap->pm_obj[i], &pmap_pager, false, 1); 2945 uvm_obj_setlock(&pmap->pm_obj[i], &pmap->pm_dummy_lock); 2946 pmap->pm_ptphint[i] = NULL; 2947 } 2948 pmap->pm_stats.wired_count = 0; 2949 /* count the PDP allocd below */ 2950 pmap->pm_stats.resident_count = PDP_SIZE; 2951 #if !defined(__x86_64__) 2952 pmap->pm_hiexec = 0; 2953 #endif 2954 2955 /* Used by NVMM and Xen */ 2956 pmap->pm_enter = NULL; 2957 pmap->pm_extract = NULL; 2958 pmap->pm_remove = NULL; 2959 pmap->pm_sync_pv = NULL; 2960 pmap->pm_pp_remove_ent = NULL; 2961 pmap->pm_write_protect = NULL; 2962 pmap->pm_unwire = NULL; 2963 pmap->pm_tlb_flush = NULL; 2964 pmap->pm_data = NULL; 2965 2966 /* init the LDT */ 2967 pmap->pm_ldt = NULL; 2968 pmap->pm_ldt_sel = GSYSSEL(GLDT_SEL, SEL_KPL); 2969 2970 return pmap; 2971 } 2972 2973 /* 2974 * pmap_check_ptps: verify that none of the pmap's page table objects 2975 * have any pages allocated to them. 2976 */ 2977 static void 2978 pmap_check_ptps(struct pmap *pmap) 2979 { 2980 int i; 2981 2982 for (i = 0; i < PTP_LEVELS - 1; i++) { 2983 KASSERTMSG(pmap->pm_obj[i].uo_npages == 0, 2984 "pmap %p level %d still has %d pages", 2985 pmap, i, (int)pmap->pm_obj[i].uo_npages); 2986 } 2987 } 2988 2989 static void 2990 pmap_check_inuse(struct pmap *pmap) 2991 { 2992 #ifdef DEBUG 2993 CPU_INFO_ITERATOR cii; 2994 struct cpu_info *ci; 2995 2996 for (CPU_INFO_FOREACH(cii, ci)) { 2997 if (ci->ci_pmap == pmap) 2998 panic("destroying pmap being used"); 2999 #if defined(XENPV) && defined(__x86_64__) 3000 for (int i = 0; i < PDIR_SLOT_USERLIM; i++) { 3001 if (pmap->pm_pdir[i] != 0 && 3002 ci->ci_kpm_pdir[i] == pmap->pm_pdir[i]) { 3003 printf("pmap_destroy(%p) pmap_kernel %p " 3004 "curcpu %d cpu %d ci_pmap %p " 3005 "ci->ci_kpm_pdir[%d]=%" PRIx64 3006 " pmap->pm_pdir[%d]=%" PRIx64 "\n", 3007 pmap, pmap_kernel(), curcpu()->ci_index, 3008 ci->ci_index, ci->ci_pmap, 3009 i, ci->ci_kpm_pdir[i], 3010 i, pmap->pm_pdir[i]); 3011 panic("%s: used pmap", __func__); 3012 } 3013 } 3014 #endif 3015 } 3016 #endif /* DEBUG */ 3017 } 3018 3019 /* 3020 * pmap_destroy: drop reference count on pmap. free pmap if reference 3021 * count goes to zero. 3022 * 3023 * => we can be called from pmap_unmap_ptes() with a different, unrelated 3024 * pmap's lock held. be careful! 3025 */ 3026 void 3027 pmap_destroy(struct pmap *pmap) 3028 { 3029 int i; 3030 3031 /* 3032 * drop reference count and verify not in use. 3033 */ 3034 3035 if (atomic_dec_uint_nv(&pmap->pm_obj[0].uo_refs) > 0) { 3036 return; 3037 } 3038 pmap_check_inuse(pmap); 3039 3040 /* 3041 * handle any deferred frees. 3042 */ 3043 3044 mutex_enter(&pmap->pm_lock); 3045 if (pmap->pm_pve != NULL) { 3046 pmap_free_pv(pmap, pmap->pm_pve); 3047 pmap->pm_pve = NULL; 3048 } 3049 pmap_drain_pv(pmap); 3050 mutex_exit(&pmap->pm_lock); 3051 pmap_update(pmap); 3052 3053 /* 3054 * Reference count is zero, free pmap resources and then free pmap. 3055 */ 3056 3057 pmap_check_ptps(pmap); 3058 KASSERT(LIST_EMPTY(&pmap->pm_gc_ptp)); 3059 3060 #ifdef USER_LDT 3061 if (pmap->pm_ldt != NULL) { 3062 /* 3063 * No need to switch the LDT; this address space is gone, 3064 * nothing is using it. 3065 * 3066 * No need to lock the pmap for ldt_free (or anything else), 3067 * we're the last one to use it. 3068 */ 3069 /* XXXAD can't take cpu_lock here - fix soon. 
*/ 3070 mutex_enter(&cpu_lock); 3071 ldt_free(pmap->pm_ldt_sel); 3072 mutex_exit(&cpu_lock); 3073 uvm_km_free(kernel_map, (vaddr_t)pmap->pm_ldt, 3074 MAX_USERLDT_SIZE, UVM_KMF_WIRED); 3075 } 3076 #endif 3077 3078 for (i = 0; i < PTP_LEVELS - 1; i++) { 3079 uvm_obj_destroy(&pmap->pm_obj[i], false); 3080 } 3081 kcpuset_zero(pmap->pm_cpus); 3082 kcpuset_zero(pmap->pm_kernel_cpus); 3083 #ifdef XENPV 3084 kcpuset_zero(pmap->pm_xen_ptp_cpus); 3085 #endif 3086 3087 KASSERT(LIST_EMPTY(&pmap->pm_pvp_full)); 3088 KASSERT(LIST_EMPTY(&pmap->pm_pvp_part)); 3089 KASSERT(LIST_EMPTY(&pmap->pm_pvp_empty)); 3090 3091 pmap_check_ptps(pmap); 3092 if (__predict_false(pmap->pm_enter != NULL)) { 3093 /* XXX make this a different cache */ 3094 pool_cache_destruct_object(&pmap_cache, pmap); 3095 } else { 3096 pool_cache_put(&pmap_cache, pmap); 3097 } 3098 } 3099 3100 /* 3101 * pmap_zap_ptp: clear out an entire PTP without modifying PTEs 3102 * 3103 * => caller must hold pmap's lock 3104 * => PTP must be mapped into KVA 3105 * => must be called with kernel preemption disabled 3106 * => does as little work as possible 3107 */ 3108 static void 3109 pmap_zap_ptp(struct pmap *pmap, struct vm_page *ptp, pt_entry_t *pte, 3110 vaddr_t startva, vaddr_t blkendva) 3111 { 3112 #ifndef XENPV 3113 struct pv_entry *pve; 3114 struct vm_page *pg; 3115 struct pmap_page *pp; 3116 pt_entry_t opte; 3117 rb_tree_t *tree; 3118 vaddr_t va; 3119 int wired; 3120 uint8_t oattrs; 3121 u_int cnt; 3122 3123 KASSERT(mutex_owned(&pmap->pm_lock)); 3124 KASSERT(kpreempt_disabled()); 3125 KASSERT(pmap != pmap_kernel()); 3126 KASSERT(ptp->wire_count > 1); 3127 KASSERT(ptp->wire_count - 1 <= PAGE_SIZE / sizeof(pt_entry_t)); 3128 3129 /* 3130 * Start at the lowest entered VA, and scan until there are no more 3131 * PTEs in the PTPs. 3132 */ 3133 tree = &VM_PAGE_TO_PP(ptp)->pp_rb; 3134 pve = RB_TREE_MIN(tree); 3135 wired = 0; 3136 va = (vaddr_t)ptp->uanon; 3137 pte += ((va - startva) >> PAGE_SHIFT); 3138 3139 for (cnt = ptp->wire_count; cnt > 1; pte++, va += PAGE_SIZE) { 3140 /* 3141 * No need for an atomic to clear the PTE. Nothing else can 3142 * see the address space any more and speculative access (if 3143 * possible) won't modify. Therefore there's no need to 3144 * track the accessed/dirty bits. 3145 */ 3146 opte = *pte; 3147 if (!pmap_valid_entry(opte)) { 3148 continue; 3149 } 3150 3151 /* 3152 * Count the PTE. If it's not for a managed mapping 3153 * there's noting more to do. 3154 */ 3155 cnt--; 3156 wired -= (opte & PTE_WIRED); 3157 if ((opte & PTE_PVLIST) == 0) { 3158 #ifndef DOM0OPS 3159 KASSERTMSG((PHYS_TO_VM_PAGE(pmap_pte2pa(opte)) == NULL), 3160 "managed page without PTE_PVLIST for %#" 3161 PRIxVADDR, va); 3162 KASSERTMSG((pmap_pv_tracked(pmap_pte2pa(opte)) == NULL), 3163 "pv-tracked page without PTE_PVLIST for %#" 3164 PRIxVADDR, va); 3165 #endif 3166 KASSERT(pmap_treelookup_pv(pmap, ptp, (ptp != NULL ? 3167 &VM_PAGE_TO_PP(ptp)->pp_rb : &pmap_kernel_rb), 3168 va) == NULL); 3169 continue; 3170 } 3171 3172 /* 3173 * "pve" now points to the lowest (by VA) dynamic PV entry 3174 * in the PTP. If it's for this VA, take advantage of it to 3175 * avoid calling PHYS_TO_VM_PAGE(). Avoid modifying the RB 3176 * tree by skipping to the next VA in the tree whenever 3177 * there is a match here. The tree will be cleared out in 3178 * one pass before return to pmap_remove_all(). 
3179 */ 3180 oattrs = pmap_pte_to_pp_attrs(opte); 3181 if (pve != NULL && pve->pve_pte.pte_va == va) { 3182 pp = pve->pve_pp; 3183 KASSERT(pve->pve_pte.pte_ptp == ptp); 3184 KASSERT(pp->pp_pte.pte_ptp != ptp || 3185 pp->pp_pte.pte_va != va); 3186 mutex_spin_enter(&pp->pp_lock); 3187 pp->pp_attrs |= oattrs; 3188 LIST_REMOVE(pve, pve_list); 3189 mutex_spin_exit(&pp->pp_lock); 3190 3191 /* 3192 * pve won't be touched again until pmap_drain_pv(), 3193 * so it's still safe to traverse the tree. 3194 */ 3195 pmap_free_pv(pmap, pve); 3196 pve = RB_TREE_NEXT(tree, pve); 3197 continue; 3198 } 3199 3200 /* 3201 * No entry in the tree so it must be embedded. Look up the 3202 * page and cancel the embedded entry. 3203 */ 3204 if ((pg = PHYS_TO_VM_PAGE(pmap_pte2pa(opte))) != NULL) { 3205 pp = VM_PAGE_TO_PP(pg); 3206 } else if ((pp = pmap_pv_tracked(pmap_pte2pa(opte))) == NULL) { 3207 paddr_t pa = pmap_pte2pa(opte); 3208 panic("%s: PTE_PVLIST with pv-untracked page" 3209 " va = %#"PRIxVADDR"pa = %#"PRIxPADDR 3210 "(%#"PRIxPADDR")", __func__, va, pa, atop(pa)); 3211 } 3212 mutex_spin_enter(&pp->pp_lock); 3213 KASSERT(pp->pp_pte.pte_ptp == ptp); 3214 KASSERT(pp->pp_pte.pte_va == va); 3215 pp->pp_attrs |= oattrs; 3216 pp->pp_pte.pte_ptp = NULL; 3217 pp->pp_pte.pte_va = 0; 3218 mutex_spin_exit(&pp->pp_lock); 3219 } 3220 3221 /* PTP now empty - adjust the tree & stats to match. */ 3222 pmap_stats_update(pmap, -(ptp->wire_count - 1), wired / PTE_WIRED); 3223 ptp->wire_count = 1; 3224 #ifdef DIAGNOSTIC 3225 rb_tree_init(tree, &pmap_rbtree_ops); 3226 #endif 3227 #else /* !XENPV */ 3228 /* 3229 * XXXAD For XEN, it's not clear to me that we can do this, because 3230 * I guess the hypervisor keeps track of PTEs too. 3231 */ 3232 pmap_remove_ptes(pmap, ptp, (vaddr_t)pte, startva, blkendva); 3233 #endif /* !XENPV */ 3234 } 3235 3236 /* 3237 * pmap_remove_all: remove all mappings from pmap in bulk. 3238 * 3239 * Ordinarily when removing mappings it's important to hold the UVM object's 3240 * lock, so that pages do not gain a new identity while retaining stale TLB 3241 * entries (the same lock hold covers both pmap_remove() and pmap_update()). 3242 * Here it's known that the address space is no longer visible to any user 3243 * process, so we don't need to worry about that. 3244 */ 3245 bool 3246 pmap_remove_all(struct pmap *pmap) 3247 { 3248 struct vm_page *ptps[32]; 3249 vaddr_t va, blkendva; 3250 struct pmap *pmap2; 3251 pt_entry_t *ptes; 3252 pd_entry_t pde __diagused; 3253 pd_entry_t * const *pdes; 3254 int lvl __diagused, i, n; 3255 3256 /* XXX Can't handle EPT just yet. */ 3257 if (pmap->pm_remove != NULL) { 3258 return false; 3259 } 3260 3261 for (;;) { 3262 /* Fetch a block of PTPs from tree. */ 3263 mutex_enter(&pmap->pm_lock); 3264 n = radix_tree_gang_lookup_node(&pmap->pm_obj[0].uo_pages, 0, 3265 (void **)ptps, __arraycount(ptps), false); 3266 if (n == 0) { 3267 mutex_exit(&pmap->pm_lock); 3268 break; 3269 } 3270 3271 /* Remove all mappings in the set of PTPs. */ 3272 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); 3273 for (i = 0; i < n; i++) { 3274 if (ptps[i]->wire_count == 0) { 3275 /* It's dead: pmap_update() will expunge. */ 3276 continue; 3277 } 3278 3279 /* Determine range of block. */ 3280 va = ptps[i]->offset * PAGE_SIZE / sizeof(pt_entry_t); 3281 blkendva = x86_round_pdr(va + 1); 3282 3283 /* Make sure everything squares up... */ 3284 KASSERT(pmap_pdes_valid(va, pdes, &pde, &lvl)); 3285 KASSERT(lvl == 1); 3286 KASSERT(pmap_find_ptp(pmap, va, 1) == ptps[i]); 3287 3288 /* Zap! 
*/ 3289 pmap_zap_ptp(pmap, ptps[i], &ptes[pl1_i(va)], va, 3290 blkendva); 3291 3292 /* PTP should now be unused - free it. */ 3293 KASSERT(ptps[i]->wire_count == 1); 3294 pmap_free_ptp(pmap, ptps[i], va, ptes, pdes); 3295 } 3296 pmap_unmap_ptes(pmap, pmap2); 3297 pmap_drain_pv(pmap); 3298 pmap_tlb_shootdown(pmap, -1L, 0, TLBSHOOT_REMOVE_ALL); 3299 mutex_exit(&pmap->pm_lock); 3300 3301 /* Process deferred frees. */ 3302 pmap_update(pmap); 3303 3304 /* A breathing point. */ 3305 preempt_point(); 3306 } 3307 3308 /* Verify that the pmap is now completely empty. */ 3309 pmap_check_ptps(pmap); 3310 KASSERTMSG(pmap->pm_stats.resident_count == PDP_SIZE, 3311 "pmap %p not empty", pmap); 3312 3313 return true; 3314 } 3315 3316 #if defined(PMAP_FORK) 3317 /* 3318 * pmap_fork: perform any necessary data structure manipulation when 3319 * a VM space is forked. 3320 */ 3321 void 3322 pmap_fork(struct pmap *pmap1, struct pmap *pmap2) 3323 { 3324 #ifdef USER_LDT 3325 union descriptor *new_ldt; 3326 int sel; 3327 3328 if (__predict_true(pmap1->pm_ldt == NULL)) { 3329 return; 3330 } 3331 3332 /* 3333 * Copy the LDT into the new process. 3334 * 3335 * Read pmap1's ldt pointer unlocked; if it changes behind our back 3336 * we'll retry. This will starve if there's a stream of LDT changes 3337 * in another thread but that should not happen. 3338 */ 3339 3340 retry: 3341 if (pmap1->pm_ldt != NULL) { 3342 /* Allocate space for the new process's LDT */ 3343 new_ldt = (union descriptor *)uvm_km_alloc(kernel_map, 3344 MAX_USERLDT_SIZE, 0, UVM_KMF_WIRED); 3345 if (new_ldt == NULL) { 3346 printf("WARNING: %s: unable to allocate LDT space\n", 3347 __func__); 3348 return; 3349 } 3350 mutex_enter(&cpu_lock); 3351 /* Get a GDT slot for it */ 3352 sel = ldt_alloc(new_ldt, MAX_USERLDT_SIZE); 3353 if (sel == -1) { 3354 mutex_exit(&cpu_lock); 3355 uvm_km_free(kernel_map, (vaddr_t)new_ldt, 3356 MAX_USERLDT_SIZE, UVM_KMF_WIRED); 3357 printf("WARNING: %s: unable to allocate LDT selector\n", 3358 __func__); 3359 return; 3360 } 3361 } else { 3362 /* Wasn't anything there after all. */ 3363 new_ldt = NULL; 3364 sel = -1; 3365 mutex_enter(&cpu_lock); 3366 } 3367 3368 /* 3369 * Now that we have cpu_lock, ensure the LDT status is the same. 3370 */ 3371 if (pmap1->pm_ldt != NULL) { 3372 if (new_ldt == NULL) { 3373 /* A wild LDT just appeared. */ 3374 mutex_exit(&cpu_lock); 3375 goto retry; 3376 } 3377 3378 /* Copy the LDT data and install it in pmap2 */ 3379 memcpy(new_ldt, pmap1->pm_ldt, MAX_USERLDT_SIZE); 3380 pmap2->pm_ldt = new_ldt; 3381 pmap2->pm_ldt_sel = sel; 3382 mutex_exit(&cpu_lock); 3383 } else { 3384 if (new_ldt != NULL) { 3385 /* The LDT disappeared, drop what we did. */ 3386 ldt_free(sel); 3387 mutex_exit(&cpu_lock); 3388 uvm_km_free(kernel_map, (vaddr_t)new_ldt, 3389 MAX_USERLDT_SIZE, UVM_KMF_WIRED); 3390 return; 3391 } 3392 3393 /* We're good, just leave. */ 3394 mutex_exit(&cpu_lock); 3395 } 3396 #endif /* USER_LDT */ 3397 } 3398 #endif /* PMAP_FORK */ 3399 3400 #ifdef USER_LDT 3401 3402 /* 3403 * pmap_ldt_xcall: cross call used by pmap_ldt_sync. if the named pmap 3404 * is active, reload LDTR. 3405 */ 3406 static void 3407 pmap_ldt_xcall(void *arg1, void *arg2) 3408 { 3409 struct pmap *pm; 3410 3411 kpreempt_disable(); 3412 pm = arg1; 3413 if (curcpu()->ci_pmap == pm) { 3414 #if defined(SVS) 3415 if (svs_enabled) { 3416 svs_ldt_sync(pm); 3417 } else 3418 #endif 3419 lldt(pm->pm_ldt_sel); 3420 } 3421 kpreempt_enable(); 3422 } 3423 3424 /* 3425 * pmap_ldt_sync: LDT selector for the named pmap is changing. 
swap 3426 * in the new selector on all CPUs. 3427 */ 3428 void 3429 pmap_ldt_sync(struct pmap *pm) 3430 { 3431 uint64_t where; 3432 3433 KASSERT(mutex_owned(&cpu_lock)); 3434 3435 pmap_ldt_evcnt.ev_count++; 3436 where = xc_broadcast(0, pmap_ldt_xcall, pm, NULL); 3437 xc_wait(where); 3438 } 3439 3440 /* 3441 * pmap_ldt_cleanup: if the pmap has a local LDT, deallocate it, and 3442 * restore the default. 3443 */ 3444 void 3445 pmap_ldt_cleanup(struct lwp *l) 3446 { 3447 pmap_t pmap = l->l_proc->p_vmspace->vm_map.pmap; 3448 union descriptor *ldt; 3449 int sel; 3450 3451 if (__predict_true(pmap->pm_ldt == NULL)) { 3452 return; 3453 } 3454 3455 mutex_enter(&cpu_lock); 3456 if (pmap->pm_ldt != NULL) { 3457 sel = pmap->pm_ldt_sel; 3458 ldt = pmap->pm_ldt; 3459 pmap->pm_ldt_sel = GSYSSEL(GLDT_SEL, SEL_KPL); 3460 pmap->pm_ldt = NULL; 3461 pmap_ldt_sync(pmap); 3462 ldt_free(sel); 3463 uvm_km_free(kernel_map, (vaddr_t)ldt, MAX_USERLDT_SIZE, 3464 UVM_KMF_WIRED); 3465 } 3466 mutex_exit(&cpu_lock); 3467 } 3468 #endif /* USER_LDT */ 3469 3470 /* 3471 * pmap_activate: activate a process' pmap 3472 * 3473 * => must be called with kernel preemption disabled 3474 * => if lwp is the curlwp, then set ci_want_pmapload so that 3475 * actual MMU context switch will be done by pmap_load() later 3476 */ 3477 void 3478 pmap_activate(struct lwp *l) 3479 { 3480 struct cpu_info *ci; 3481 struct pmap *pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map); 3482 3483 KASSERT(kpreempt_disabled()); 3484 3485 ci = curcpu(); 3486 3487 if (l != ci->ci_curlwp) 3488 return; 3489 3490 KASSERT(ci->ci_want_pmapload == 0); 3491 KASSERT(ci->ci_tlbstate != TLBSTATE_VALID); 3492 3493 /* 3494 * no need to switch to kernel vmspace because 3495 * it's a subset of any vmspace. 3496 */ 3497 3498 if (pmap == pmap_kernel()) { 3499 ci->ci_want_pmapload = 0; 3500 return; 3501 } 3502 3503 ci->ci_want_pmapload = 1; 3504 } 3505 3506 #if defined(XENPV) && defined(__x86_64__) 3507 #define KASSERT_PDIRPA(pmap) \ 3508 KASSERT(pmap_pdirpa(pmap, 0) == ci->ci_xen_current_user_pgd || \ 3509 pmap == pmap_kernel()) 3510 #elif defined(PAE) 3511 #define KASSERT_PDIRPA(pmap) \ 3512 KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(ci->ci_pae_l3_pdir[0])) 3513 #elif !defined(XENPV) 3514 #define KASSERT_PDIRPA(pmap) \ 3515 KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(rcr3())) 3516 #else 3517 #define KASSERT_PDIRPA(pmap) KASSERT(true) /* nothing to do */ 3518 #endif 3519 3520 /* 3521 * pmap_reactivate: try to regain reference to the pmap. 3522 * 3523 * => Must be called with kernel preemption disabled. 3524 */ 3525 static void 3526 pmap_reactivate(struct pmap *pmap) 3527 { 3528 struct cpu_info * const ci = curcpu(); 3529 const cpuid_t cid = cpu_index(ci); 3530 3531 KASSERT(kpreempt_disabled()); 3532 KASSERT_PDIRPA(pmap); 3533 3534 /* 3535 * If we still have a lazy reference to this pmap, we can assume 3536 * that there was no TLB shootdown for this pmap in the meantime. 3537 * 3538 * The order of events here is important as we must synchronize 3539 * with TLB shootdown interrupts. Declare interest in invalidations 3540 * (TLBSTATE_VALID) and then check the CPU set, which the IPIs can 3541 * change only when the state is TLBSTATE_LAZY. 3542 */ 3543 3544 ci->ci_tlbstate = TLBSTATE_VALID; 3545 KASSERT(kcpuset_isset(pmap->pm_kernel_cpus, cid)); 3546 3547 if (__predict_true(kcpuset_isset(pmap->pm_cpus, cid))) { 3548 /* We have the reference, state is valid. */ 3549 } else { 3550 /* 3551 * Must reload the TLB, pmap has been changed during 3552 * deactivated. 
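		 * (While this CPU sat in TLBSTATE_LAZY, a shootdown for
		 * this pmap cleared us from pm_cpus instead of interrupting
		 * us, so the only safe response is a full tlbflush() once we
		 * have rejoined pm_cpus below.)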
3553 */ 3554 kcpuset_atomic_set(pmap->pm_cpus, cid); 3555 3556 tlbflush(); 3557 } 3558 } 3559 3560 /* 3561 * pmap_load: perform the actual pmap switch, i.e. fill in %cr3 register 3562 * and relevant LDT info. 3563 * 3564 * Ensures that the current process' pmap is loaded on the current CPU's 3565 * MMU and that there are no stale TLB entries. 3566 * 3567 * => The caller should disable kernel preemption or do check-and-retry 3568 * to prevent a preemption from undoing our efforts. 3569 * => This function may block. 3570 */ 3571 void 3572 pmap_load(void) 3573 { 3574 struct cpu_info *ci; 3575 struct pmap *pmap, *oldpmap; 3576 struct lwp *l; 3577 uint64_t pctr; 3578 int ilevel __diagused; 3579 u_long psl __diagused; 3580 3581 kpreempt_disable(); 3582 retry: 3583 ci = curcpu(); 3584 if (!ci->ci_want_pmapload) { 3585 kpreempt_enable(); 3586 return; 3587 } 3588 l = ci->ci_curlwp; 3589 pctr = lwp_pctr(); 3590 __insn_barrier(); 3591 3592 /* should be able to take ipis. */ 3593 KASSERTMSG((ilevel = ci->ci_ilevel) < IPL_HIGH, "ilevel=%d", ilevel); 3594 #ifdef XENPV 3595 /* Check to see if interrupts are enabled (ie; no events are masked) */ 3596 KASSERTMSG((psl = x86_read_psl()) == 0, "psl=0x%lx", psl); 3597 #else 3598 KASSERTMSG(((psl = x86_read_psl()) & PSL_I) != 0, "psl=0x%lx", psl); 3599 #endif 3600 3601 KASSERT(l != NULL); 3602 pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map); 3603 KASSERT(pmap != pmap_kernel()); 3604 oldpmap = ci->ci_pmap; 3605 3606 if (pmap == oldpmap) { 3607 pmap_reactivate(pmap); 3608 ci->ci_want_pmapload = 0; 3609 kpreempt_enable(); 3610 return; 3611 } 3612 3613 /* 3614 * Acquire a reference to the new pmap and perform the switch. 3615 */ 3616 3617 pmap_reference(pmap); 3618 pmap_load1(l, pmap, oldpmap); 3619 ci->ci_want_pmapload = 0; 3620 3621 /* 3622 * we're now running with the new pmap. drop the reference 3623 * to the old pmap. if we block, we need to go around again. 3624 */ 3625 3626 pmap_destroy(oldpmap); 3627 __insn_barrier(); 3628 if (lwp_pctr() != pctr) { 3629 goto retry; 3630 } 3631 3632 kpreempt_enable(); 3633 } 3634 3635 /* 3636 * pmap_load1: the guts of pmap load, shared by pmap_map_ptes() and 3637 * pmap_load(). It's critically important that this function does not 3638 * block. 3639 */ 3640 static void 3641 pmap_load1(struct lwp *l, struct pmap *pmap, struct pmap *oldpmap) 3642 { 3643 struct cpu_info *ci; 3644 struct pcb *pcb; 3645 cpuid_t cid; 3646 3647 KASSERT(kpreempt_disabled()); 3648 3649 pcb = lwp_getpcb(l); 3650 ci = l->l_cpu; 3651 cid = cpu_index(ci); 3652 3653 kcpuset_atomic_clear(oldpmap->pm_cpus, cid); 3654 kcpuset_atomic_clear(oldpmap->pm_kernel_cpus, cid); 3655 3656 KASSERT_PDIRPA(oldpmap); 3657 KASSERT(!kcpuset_isset(pmap->pm_cpus, cid)); 3658 KASSERT(!kcpuset_isset(pmap->pm_kernel_cpus, cid)); 3659 3660 /* 3661 * Mark the pmap in use by this CPU. Again, we must synchronize 3662 * with TLB shootdown interrupts, so set the state VALID first, 3663 * then register us for shootdown events on this pmap. 3664 */ 3665 ci->ci_tlbstate = TLBSTATE_VALID; 3666 kcpuset_atomic_set(pmap->pm_cpus, cid); 3667 kcpuset_atomic_set(pmap->pm_kernel_cpus, cid); 3668 ci->ci_pmap = pmap; 3669 3670 /* 3671 * update tss. now that we have registered for invalidations 3672 * from other CPUs, we're good to load the page tables. 
3673 */ 3674 #ifdef PAE 3675 pcb->pcb_cr3 = ci->ci_pae_l3_pdirpa; 3676 #else 3677 pcb->pcb_cr3 = pmap_pdirpa(pmap, 0); 3678 #endif 3679 3680 #ifdef i386 3681 #ifndef XENPV 3682 ci->ci_tss->tss.tss_ldt = pmap->pm_ldt_sel; 3683 ci->ci_tss->tss.tss_cr3 = pcb->pcb_cr3; 3684 #endif 3685 #endif 3686 3687 #if defined(SVS) && defined(USER_LDT) 3688 if (svs_enabled) { 3689 svs_ldt_sync(pmap); 3690 } else 3691 #endif 3692 lldt(pmap->pm_ldt_sel); 3693 3694 cpu_load_pmap(pmap, oldpmap); 3695 } 3696 3697 /* 3698 * pmap_deactivate: deactivate a process' pmap. 3699 * 3700 * => Must be called with kernel preemption disabled (high IPL is enough). 3701 */ 3702 void 3703 pmap_deactivate(struct lwp *l) 3704 { 3705 struct pmap *pmap; 3706 struct cpu_info *ci; 3707 3708 KASSERT(kpreempt_disabled()); 3709 3710 if (l != curlwp) { 3711 return; 3712 } 3713 3714 /* 3715 * Wait for pending TLB shootdowns to complete. Necessary because 3716 * TLB shootdown state is per-CPU, and the LWP may be coming off 3717 * the CPU before it has a chance to call pmap_update(), e.g. due 3718 * to kernel preemption or blocking routine in between. 3719 */ 3720 pmap_tlb_shootnow(); 3721 3722 ci = curcpu(); 3723 3724 if (ci->ci_want_pmapload) { 3725 /* 3726 * ci_want_pmapload means that our pmap is not loaded on 3727 * the CPU or TLB might be stale. note that pmap_kernel() 3728 * is always considered loaded. 3729 */ 3730 KASSERT(vm_map_pmap(&l->l_proc->p_vmspace->vm_map) 3731 != pmap_kernel()); 3732 KASSERT(vm_map_pmap(&l->l_proc->p_vmspace->vm_map) 3733 != ci->ci_pmap || ci->ci_tlbstate != TLBSTATE_VALID); 3734 3735 /* 3736 * userspace has not been touched. 3737 * nothing to do here. 3738 */ 3739 3740 ci->ci_want_pmapload = 0; 3741 return; 3742 } 3743 3744 pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map); 3745 3746 if (pmap == pmap_kernel()) { 3747 return; 3748 } 3749 3750 KASSERT_PDIRPA(pmap); 3751 KASSERT(ci->ci_pmap == pmap); 3752 3753 /* 3754 * we aren't interested in TLB invalidations for this pmap, 3755 * at least for the time being. 3756 */ 3757 3758 KASSERT(ci->ci_tlbstate == TLBSTATE_VALID); 3759 ci->ci_tlbstate = TLBSTATE_LAZY; 3760 } 3761 3762 #ifdef EFI_RUNTIME 3763 3764 extern struct pmap *efi_runtime_pmap; 3765 3766 /* 3767 * pmap_is_user: true if pmap, which must not be the kernel pmap, is 3768 * for an unprivileged user process 3769 */ 3770 bool 3771 pmap_is_user(struct pmap *pmap) 3772 { 3773 3774 KASSERT(pmap != pmap_kernel()); 3775 return (pmap != efi_runtime_pmap); 3776 } 3777 3778 /* 3779 * pmap_activate_sync: synchronously activate specified pmap. 3780 * 3781 * => Must be called with kernel preemption disabled (high IPL is enough). 3782 * => Must not sleep before pmap_deactivate_sync. 
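 *
 * Illustrative sketch of a caller (the actual EFI wrapper code may
 * differ):
 *
 *	kpreempt_disable();
 *	cookie = pmap_activate_sync(efi_runtime_pmap);
 *	... call the EFI runtime service ...
 *	pmap_deactivate_sync(efi_runtime_pmap, cookie);
 *	kpreempt_enable();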
3783 */ 3784 void * 3785 pmap_activate_sync(struct pmap *pmap) 3786 { 3787 struct cpu_info *ci = curcpu(); 3788 struct pmap *oldpmap = ci->ci_pmap; 3789 unsigned cid = cpu_index(ci); 3790 3791 KASSERT(kpreempt_disabled()); 3792 KASSERT(pmap != pmap_kernel()); 3793 3794 KASSERT(!kcpuset_isset(pmap->pm_cpus, cid)); 3795 KASSERT(!kcpuset_isset(pmap->pm_kernel_cpus, cid)); 3796 3797 if (oldpmap) { 3798 KASSERT_PDIRPA(oldpmap); 3799 kcpuset_atomic_clear(oldpmap->pm_cpus, cid); 3800 kcpuset_atomic_clear(oldpmap->pm_kernel_cpus, cid); 3801 } 3802 3803 ci->ci_tlbstate = TLBSTATE_VALID; 3804 kcpuset_atomic_set(pmap->pm_cpus, cid); 3805 kcpuset_atomic_set(pmap->pm_kernel_cpus, cid); 3806 ci->ci_pmap = pmap; 3807 3808 #if defined(SVS) && defined(USER_LDT) 3809 if (svs_enabled) { 3810 svs_ldt_sync(pmap); 3811 } else 3812 #endif 3813 lldt(pmap->pm_ldt_sel); 3814 3815 cpu_load_pmap(pmap, oldpmap); 3816 3817 return oldpmap; 3818 } 3819 3820 /* 3821 * pmap_deactivate_sync: synchronously deactivate specified pmap and 3822 * restore whatever was active before pmap_activate_sync. 3823 * 3824 * => Must be called with kernel preemption disabled (high IPL is enough). 3825 * => Must not have slept since pmap_activate_sync. 3826 */ 3827 void 3828 pmap_deactivate_sync(struct pmap *pmap, void *cookie) 3829 { 3830 struct cpu_info *ci = curcpu(); 3831 struct pmap *oldpmap = cookie; 3832 unsigned cid = cpu_index(ci); 3833 3834 KASSERT(kpreempt_disabled()); 3835 KASSERT(pmap != pmap_kernel()); 3836 KASSERT(ci->ci_pmap == pmap); 3837 3838 KASSERT_PDIRPA(pmap); 3839 3840 KASSERT(kcpuset_isset(pmap->pm_cpus, cid)); 3841 KASSERT(kcpuset_isset(pmap->pm_kernel_cpus, cid)); 3842 3843 pmap_tlb_shootnow(); 3844 3845 kcpuset_atomic_clear(pmap->pm_cpus, cid); 3846 kcpuset_atomic_clear(pmap->pm_kernel_cpus, cid); 3847 3848 ci->ci_tlbstate = TLBSTATE_VALID; 3849 ci->ci_pmap = oldpmap; 3850 if (oldpmap) { 3851 kcpuset_atomic_set(oldpmap->pm_cpus, cid); 3852 kcpuset_atomic_set(oldpmap->pm_kernel_cpus, cid); 3853 #if defined(SVS) && defined(USER_LDT) 3854 if (svs_enabled) { 3855 svs_ldt_sync(oldpmap); 3856 } else 3857 #endif 3858 lldt(oldpmap->pm_ldt_sel); 3859 cpu_load_pmap(oldpmap, pmap); 3860 } else { 3861 lcr3(pmap_pdirpa(pmap_kernel(), 0)); 3862 } 3863 } 3864 3865 #endif /* EFI_RUNTIME */ 3866 3867 /* 3868 * some misc. 
functions 3869 */ 3870 3871 bool 3872 pmap_pdes_valid(vaddr_t va, pd_entry_t * const *pdes, pd_entry_t *lastpde, 3873 int *lastlvl) 3874 { 3875 unsigned long index; 3876 pd_entry_t pde; 3877 int i; 3878 3879 for (i = PTP_LEVELS; i > 1; i--) { 3880 index = pl_i(va, i); 3881 pde = pdes[i - 2][index]; 3882 if ((pde & PTE_P) == 0) { 3883 *lastlvl = i; 3884 return false; 3885 } 3886 if (pde & PTE_PS) 3887 break; 3888 } 3889 if (lastpde != NULL) 3890 *lastpde = pde; 3891 *lastlvl = i; 3892 return true; 3893 } 3894 3895 /* 3896 * pmap_extract: extract a PA for the given VA 3897 */ 3898 bool 3899 pmap_extract(struct pmap *pmap, vaddr_t va, paddr_t *pap) 3900 { 3901 pt_entry_t *ptes, pte; 3902 pd_entry_t pde; 3903 pd_entry_t * const *pdes; 3904 struct pmap *pmap2; 3905 paddr_t pa; 3906 bool rv; 3907 int lvl; 3908 3909 if (__predict_false(pmap->pm_extract != NULL)) { 3910 return (*pmap->pm_extract)(pmap, va, pap); 3911 } 3912 3913 #ifdef __HAVE_DIRECT_MAP 3914 if (va >= PMAP_DIRECT_BASE && va < PMAP_DIRECT_END) { 3915 if (pap != NULL) { 3916 *pap = PMAP_DIRECT_UNMAP(va); 3917 } 3918 return true; 3919 } 3920 #endif 3921 3922 rv = false; 3923 pa = 0; 3924 3925 if (pmap != pmap_kernel()) { 3926 mutex_enter(&pmap->pm_lock); 3927 } 3928 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); 3929 if (pmap_pdes_valid(va, pdes, &pde, &lvl)) { 3930 if (lvl == 2) { 3931 pa = (pde & PTE_LGFRAME) | (va & (NBPD_L2 - 1)); 3932 rv = true; 3933 } else { 3934 KASSERT(lvl == 1); 3935 pte = ptes[pl1_i(va)]; 3936 if (__predict_true((pte & PTE_P) != 0)) { 3937 pa = pmap_pte2pa(pte) | (va & (NBPD_L1 - 1)); 3938 rv = true; 3939 } 3940 } 3941 } 3942 pmap_unmap_ptes(pmap, pmap2); 3943 if (pmap != pmap_kernel()) { 3944 mutex_exit(&pmap->pm_lock); 3945 } 3946 if (pap != NULL) { 3947 *pap = pa; 3948 } 3949 3950 return rv; 3951 } 3952 3953 /* 3954 * vtophys: virtual address to physical address. For use by 3955 * machine-dependent code only. 3956 */ 3957 paddr_t 3958 vtophys(vaddr_t va) 3959 { 3960 paddr_t pa; 3961 3962 if (pmap_extract(pmap_kernel(), va, &pa) == true) 3963 return pa; 3964 return 0; 3965 } 3966 3967 __strict_weak_alias(pmap_extract_ma, pmap_extract); 3968 3969 #ifdef XENPV 3970 /* 3971 * vtomach: virtual address to machine address. For use by 3972 * machine-dependent code only. 3973 */ 3974 paddr_t 3975 vtomach(vaddr_t va) 3976 { 3977 paddr_t pa; 3978 3979 if (pmap_extract_ma(pmap_kernel(), va, &pa) == true) 3980 return pa; 3981 return 0; 3982 } 3983 #endif 3984 3985 /* 3986 * pmap_virtual_space: used during bootup [pmap_steal_memory] to 3987 * determine the bounds of the kernel virtual address space. 
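 *
 * Caller's view (sketch):
 *
 *	vaddr_t kva_start, kva_end;
 *	pmap_virtual_space(&kva_start, &kva_end);
 *
 * after which [kva_start, kva_end) is the kernel VA range left for UVM
 * to manage.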
3988 */ 3989 void 3990 pmap_virtual_space(vaddr_t *startp, vaddr_t *endp) 3991 { 3992 *startp = virtual_avail; 3993 *endp = virtual_end; 3994 } 3995 3996 void 3997 pmap_zero_page(paddr_t pa) 3998 { 3999 #if defined(__HAVE_DIRECT_MAP) 4000 memset(PAGE_ALIGNED(PMAP_DIRECT_MAP(pa)), 0, PAGE_SIZE); 4001 #else 4002 #if defined(XENPV) 4003 if (XEN_VERSION_SUPPORTED(3, 4)) { 4004 xen_pagezero(pa); 4005 return; 4006 } 4007 #endif 4008 struct cpu_info *ci; 4009 pt_entry_t *zpte; 4010 vaddr_t zerova; 4011 4012 const pd_entry_t pteflags = PTE_P | PTE_W | pmap_pg_nx | PTE_D | PTE_A; 4013 4014 kpreempt_disable(); 4015 4016 ci = curcpu(); 4017 zerova = ci->vpage[VPAGE_ZER]; 4018 zpte = ci->vpage_pte[VPAGE_ZER]; 4019 4020 KASSERTMSG(!*zpte, "pmap_zero_page: lock botch"); 4021 4022 pmap_pte_set(zpte, pmap_pa2pte(pa) | pteflags); 4023 pmap_pte_flush(); 4024 pmap_update_pg(zerova); /* flush TLB */ 4025 4026 memset(PAGE_ALIGNED(zerova), 0, PAGE_SIZE); 4027 4028 #if defined(DIAGNOSTIC) || defined(XENPV) 4029 pmap_pte_set(zpte, 0); /* zap ! */ 4030 pmap_pte_flush(); 4031 #endif 4032 4033 kpreempt_enable(); 4034 #endif /* defined(__HAVE_DIRECT_MAP) */ 4035 } 4036 4037 void 4038 pmap_copy_page(paddr_t srcpa, paddr_t dstpa) 4039 { 4040 #if defined(__HAVE_DIRECT_MAP) 4041 vaddr_t srcva = PMAP_DIRECT_MAP(srcpa); 4042 vaddr_t dstva = PMAP_DIRECT_MAP(dstpa); 4043 4044 memcpy(PAGE_ALIGNED(dstva), PAGE_ALIGNED(srcva), PAGE_SIZE); 4045 #else 4046 #if defined(XENPV) 4047 if (XEN_VERSION_SUPPORTED(3, 4)) { 4048 xen_copy_page(srcpa, dstpa); 4049 return; 4050 } 4051 #endif 4052 struct cpu_info *ci; 4053 pt_entry_t *srcpte, *dstpte; 4054 vaddr_t srcva, dstva; 4055 4056 const pd_entry_t pteflags = PTE_P | PTE_W | pmap_pg_nx | PTE_A; 4057 4058 kpreempt_disable(); 4059 4060 ci = curcpu(); 4061 srcva = ci->vpage[VPAGE_SRC]; 4062 dstva = ci->vpage[VPAGE_DST]; 4063 srcpte = ci->vpage_pte[VPAGE_SRC]; 4064 dstpte = ci->vpage_pte[VPAGE_DST]; 4065 4066 KASSERT(*srcpte == 0 && *dstpte == 0); 4067 4068 pmap_pte_set(srcpte, pmap_pa2pte(srcpa) | pteflags); 4069 pmap_pte_set(dstpte, pmap_pa2pte(dstpa) | pteflags | PTE_D); 4070 pmap_pte_flush(); 4071 pmap_update_pg(srcva); 4072 pmap_update_pg(dstva); 4073 4074 memcpy(PAGE_ALIGNED(dstva), PAGE_ALIGNED(srcva), PAGE_SIZE); 4075 4076 #if defined(DIAGNOSTIC) || defined(XENPV) 4077 pmap_pte_set(srcpte, 0); 4078 pmap_pte_set(dstpte, 0); 4079 pmap_pte_flush(); 4080 #endif 4081 4082 kpreempt_enable(); 4083 #endif /* defined(__HAVE_DIRECT_MAP) */ 4084 } 4085 4086 static pt_entry_t * 4087 pmap_map_ptp(struct vm_page *ptp) 4088 { 4089 #ifdef __HAVE_DIRECT_MAP 4090 return (void *)PMAP_DIRECT_MAP(VM_PAGE_TO_PHYS(ptp)); 4091 #else 4092 struct cpu_info *ci; 4093 pt_entry_t *ptppte; 4094 vaddr_t ptpva; 4095 4096 KASSERT(kpreempt_disabled()); 4097 4098 #ifndef XENPV 4099 const pd_entry_t pteflags = PTE_P | PTE_W | pmap_pg_nx | PTE_A | PTE_D; 4100 #else 4101 const pd_entry_t pteflags = PTE_P | pmap_pg_nx | PTE_A | PTE_D; 4102 #endif 4103 4104 ci = curcpu(); 4105 ptpva = ci->vpage[VPAGE_PTP]; 4106 ptppte = ci->vpage_pte[VPAGE_PTP]; 4107 4108 pmap_pte_set(ptppte, pmap_pa2pte(VM_PAGE_TO_PHYS(ptp)) | pteflags); 4109 4110 pmap_pte_flush(); 4111 pmap_update_pg(ptpva); 4112 4113 return (pt_entry_t *)ptpva; 4114 #endif 4115 } 4116 4117 static void 4118 pmap_unmap_ptp(void) 4119 { 4120 #ifndef __HAVE_DIRECT_MAP 4121 #if defined(DIAGNOSTIC) || defined(XENPV) 4122 struct cpu_info *ci; 4123 pt_entry_t *pte; 4124 4125 KASSERT(kpreempt_disabled()); 4126 4127 ci = curcpu(); 4128 pte = ci->vpage_pte[VPAGE_PTP]; 4129 4130 if 
(*pte != 0) { 4131 pmap_pte_set(pte, 0); 4132 pmap_pte_flush(); 4133 } 4134 #endif 4135 #endif 4136 } 4137 4138 static pt_entry_t * 4139 pmap_map_pte(struct pmap *pmap, struct vm_page *ptp, vaddr_t va) 4140 { 4141 4142 KASSERT(kpreempt_disabled()); 4143 if (pmap_is_curpmap(pmap)) { 4144 return &PTE_BASE[pl1_i(va)]; /* (k)vtopte */ 4145 } 4146 KASSERT(ptp != NULL); 4147 return pmap_map_ptp(ptp) + pl1_pi(va); 4148 } 4149 4150 static void 4151 pmap_unmap_pte(void) 4152 { 4153 4154 KASSERT(kpreempt_disabled()); 4155 4156 pmap_unmap_ptp(); 4157 } 4158 4159 /* 4160 * p m a p r e m o v e f u n c t i o n s 4161 * 4162 * functions that remove mappings 4163 */ 4164 4165 /* 4166 * pmap_remove_ptes: remove PTEs from a PTP 4167 * 4168 * => caller must hold pmap's lock 4169 * => PTP must be mapped into KVA 4170 * => PTP should be null if pmap == pmap_kernel() 4171 * => must be called with kernel preemption disabled 4172 * => returns composite pte if at least one page should be shot down 4173 */ 4174 static void 4175 pmap_remove_ptes(struct pmap *pmap, struct vm_page *ptp, vaddr_t ptpva, 4176 vaddr_t startva, vaddr_t endva) 4177 { 4178 pt_entry_t *pte = (pt_entry_t *)ptpva; 4179 4180 KASSERT(mutex_owned(&pmap->pm_lock)); 4181 KASSERT(kpreempt_disabled()); 4182 4183 /* 4184 * mappings are very often sparse, so clip the given range to the 4185 * range of PTEs that are known present in the PTP. 4186 */ 4187 pmap_ptp_range_clip(ptp, &startva, &pte); 4188 4189 /* 4190 * note that ptpva points to the PTE that maps startva. this may 4191 * or may not be the first PTE in the PTP. 4192 * 4193 * we loop through the PTP while there are still PTEs to look at 4194 * and the wire_count is greater than 1 (because we use the wire_count 4195 * to keep track of the number of real PTEs in the PTP). 4196 */ 4197 while (startva < endva && (ptp == NULL || ptp->wire_count > 1)) { 4198 (void)pmap_remove_pte(pmap, ptp, pte, startva); 4199 startva += PAGE_SIZE; 4200 pte++; 4201 } 4202 } 4203 4204 /* 4205 * pmap_remove_pte: remove a single PTE from a PTP. 4206 * 4207 * => caller must hold pmap's lock 4208 * => PTP must be mapped into KVA 4209 * => PTP should be null if pmap == pmap_kernel() 4210 * => returns true if we removed a mapping 4211 * => must be called with kernel preemption disabled 4212 */ 4213 static bool 4214 pmap_remove_pte(struct pmap *pmap, struct vm_page *ptp, pt_entry_t *pte, 4215 vaddr_t va) 4216 { 4217 struct pv_entry *pve; 4218 struct vm_page *pg; 4219 struct pmap_page *pp; 4220 pt_entry_t opte; 4221 4222 KASSERT(mutex_owned(&pmap->pm_lock)); 4223 KASSERT(kpreempt_disabled()); 4224 4225 if (!pmap_valid_entry(*pte)) { 4226 /* VA not mapped. */ 4227 return false; 4228 } 4229 4230 /* Atomically save the old PTE and zap it. */ 4231 opte = pmap_pte_testset(pte, 0); 4232 if (!pmap_valid_entry(opte)) { 4233 return false; 4234 } 4235 4236 pmap_exec_account(pmap, va, opte, 0); 4237 pmap_stats_update_bypte(pmap, 0, opte); 4238 4239 if (ptp) { 4240 /* 4241 * Dropping a PTE. Make sure that the PDE is flushed. 4242 */ 4243 ptp->wire_count--; 4244 if (ptp->wire_count <= 1) { 4245 opte |= PTE_A; 4246 } 4247 } 4248 4249 if ((opte & PTE_A) != 0) { 4250 pmap_tlb_shootdown(pmap, va, opte, TLBSHOOT_REMOVE_PTE); 4251 } 4252 4253 /* 4254 * If we are not on a pv list - we are done. 
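	 * (PTE_PVLIST is set by pmap_enter_ma() for managed and pv-tracked
	 * pages, so a clear bit means there is no pv_entry to clean up.)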
4255 */ 4256 if ((opte & PTE_PVLIST) == 0) { 4257 #ifndef DOM0OPS 4258 KASSERTMSG((PHYS_TO_VM_PAGE(pmap_pte2pa(opte)) == NULL), 4259 "managed page without PTE_PVLIST for %#"PRIxVADDR, va); 4260 KASSERTMSG((pmap_pv_tracked(pmap_pte2pa(opte)) == NULL), 4261 "pv-tracked page without PTE_PVLIST for %#"PRIxVADDR, va); 4262 #endif 4263 KASSERT(pmap_treelookup_pv(pmap, ptp, (ptp != NULL ? 4264 &VM_PAGE_TO_PP(ptp)->pp_rb : &pmap_kernel_rb), va) == NULL); 4265 return true; 4266 } 4267 4268 if ((pg = PHYS_TO_VM_PAGE(pmap_pte2pa(opte))) != NULL) { 4269 pp = VM_PAGE_TO_PP(pg); 4270 } else if ((pp = pmap_pv_tracked(pmap_pte2pa(opte))) == NULL) { 4271 paddr_t pa = pmap_pte2pa(opte); 4272 panic("%s: PTE_PVLIST with pv-untracked page" 4273 " va = %#"PRIxVADDR"pa = %#"PRIxPADDR" (%#"PRIxPADDR")", 4274 __func__, va, pa, atop(pa)); 4275 } 4276 4277 /* Sync R/M bits. */ 4278 pve = pmap_lookup_pv(pmap, ptp, pp, va); 4279 pmap_remove_pv(pmap, pp, ptp, va, pve, pmap_pte_to_pp_attrs(opte)); 4280 return true; 4281 } 4282 4283 static void 4284 pmap_remove_locked(struct pmap *pmap, vaddr_t sva, vaddr_t eva) 4285 { 4286 pt_entry_t *ptes; 4287 pd_entry_t pde; 4288 pd_entry_t * const *pdes; 4289 bool result; 4290 vaddr_t blkendva, va = sva; 4291 struct vm_page *ptp; 4292 struct pmap *pmap2; 4293 int lvl; 4294 4295 KASSERT(mutex_owned(&pmap->pm_lock)); 4296 4297 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); 4298 4299 /* 4300 * removing one page? take shortcut function. 4301 */ 4302 4303 if (va + PAGE_SIZE == eva) { 4304 if (pmap_pdes_valid(va, pdes, &pde, &lvl)) { 4305 KASSERT(lvl == 1); 4306 4307 /* Get PTP if non-kernel mapping. */ 4308 if (pmap != pmap_kernel()) { 4309 ptp = pmap_find_ptp(pmap, va, 1); 4310 KASSERTMSG(ptp != NULL, 4311 "%s: unmanaged PTP detected", __func__); 4312 } else { 4313 /* Never free kernel PTPs. */ 4314 ptp = NULL; 4315 } 4316 4317 result = pmap_remove_pte(pmap, ptp, 4318 &ptes[pl1_i(va)], va); 4319 4320 /* 4321 * if mapping removed and the PTP is no longer 4322 * being used, free it! 4323 */ 4324 4325 if (result && ptp && ptp->wire_count <= 1) 4326 pmap_free_ptp(pmap, ptp, va, ptes, pdes); 4327 } 4328 } else for (/* null */ ; va < eva ; va = blkendva) { 4329 /* determine range of block */ 4330 blkendva = x86_round_pdr(va+1); 4331 if (blkendva > eva) 4332 blkendva = eva; 4333 4334 if (!pmap_pdes_valid(va, pdes, &pde, &lvl)) { 4335 /* Skip a range corresponding to an invalid pde. */ 4336 blkendva = (va & ptp_frames[lvl - 1]) + nbpd[lvl - 1]; 4337 continue; 4338 } 4339 KASSERT(lvl == 1); 4340 4341 /* Get PTP if non-kernel mapping. */ 4342 if (pmap != pmap_kernel()) { 4343 ptp = pmap_find_ptp(pmap, va, 1); 4344 KASSERTMSG(ptp != NULL, "%s: unmanaged PTP detected", 4345 __func__); 4346 } else { 4347 /* Never free kernel PTPs. */ 4348 ptp = NULL; 4349 } 4350 4351 pmap_remove_ptes(pmap, ptp, (vaddr_t)&ptes[pl1_i(va)], va, 4352 blkendva); 4353 4354 /* If PTP is no longer being used, free it. */ 4355 if (ptp && ptp->wire_count <= 1) { 4356 pmap_free_ptp(pmap, ptp, va, ptes, pdes); 4357 } 4358 } 4359 pmap_unmap_ptes(pmap, pmap2); 4360 pmap_drain_pv(pmap); 4361 } 4362 4363 /* 4364 * pmap_remove: mapping removal function. 
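 *
 * A typical caller sequence is sketched below; the pmap_update() call
 * flushes the TLB shootdowns and deferred PTP frees queued here:
 *
 *	pmap_remove(pmap, sva, eva);
 *	pmap_update(pmap);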
4365 * 4366 * => caller should not be holding any pmap locks 4367 */ 4368 void 4369 pmap_remove(struct pmap *pmap, vaddr_t sva, vaddr_t eva) 4370 { 4371 if (__predict_false(pmap->pm_remove != NULL)) { 4372 (*pmap->pm_remove)(pmap, sva, eva); 4373 return; 4374 } 4375 4376 mutex_enter(&pmap->pm_lock); 4377 pmap_remove_locked(pmap, sva, eva); 4378 mutex_exit(&pmap->pm_lock); 4379 } 4380 4381 /* 4382 * pmap_sync_pv: clear pte bits and return the old value of the pp_attrs. 4383 * 4384 * => The 'clearbits' parameter is either ~0 or PP_ATTRS_... 4385 * => Caller should disable kernel preemption. 4386 * => issues tlb shootdowns if necessary. 4387 */ 4388 static int 4389 pmap_sync_pv(struct pv_pte *pvpte, paddr_t pa, int clearbits, uint8_t *oattrs, 4390 pt_entry_t *optep) 4391 { 4392 struct pmap *pmap; 4393 struct vm_page *ptp; 4394 vaddr_t va; 4395 pt_entry_t *ptep; 4396 pt_entry_t opte; 4397 pt_entry_t npte; 4398 pt_entry_t expect; 4399 bool need_shootdown; 4400 4401 ptp = pvpte->pte_ptp; 4402 va = pvpte->pte_va; 4403 KASSERT(ptp == NULL || ptp->uobject != NULL); 4404 KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset); 4405 pmap = ptp_to_pmap(ptp); 4406 KASSERT(kpreempt_disabled()); 4407 4408 if (__predict_false(pmap->pm_sync_pv != NULL)) { 4409 return (*pmap->pm_sync_pv)(ptp, va, pa, clearbits, oattrs, 4410 optep); 4411 } 4412 4413 expect = pmap_pa2pte(pa) | PTE_P; 4414 4415 if (clearbits != ~0) { 4416 KASSERT((clearbits & ~(PP_ATTRS_D|PP_ATTRS_A|PP_ATTRS_W)) == 0); 4417 clearbits = pmap_pp_attrs_to_pte(clearbits); 4418 } 4419 4420 ptep = pmap_map_pte(pmap, ptp, va); 4421 do { 4422 opte = *ptep; 4423 KASSERT((opte & (PTE_D | PTE_A)) != PTE_D); 4424 KASSERT((opte & (PTE_A | PTE_P)) != PTE_A); 4425 KASSERT(opte == 0 || (opte & PTE_P) != 0); 4426 if ((opte & (PTE_FRAME | PTE_P)) != expect) { 4427 /* 4428 * We lost a race with a V->P operation like 4429 * pmap_remove(). Wait for the competitor 4430 * reflecting pte bits into mp_attrs. 4431 */ 4432 pmap_unmap_pte(); 4433 return EAGAIN; 4434 } 4435 4436 /* 4437 * Check if there's anything to do on this PTE. 4438 */ 4439 if ((opte & clearbits) == 0) { 4440 need_shootdown = false; 4441 break; 4442 } 4443 4444 /* 4445 * We need a shootdown if the PTE is cached (PTE_A) ... 4446 * ... Unless we are clearing only the PTE_W bit and 4447 * it isn't cached as RW (PTE_D). 4448 */ 4449 need_shootdown = (opte & PTE_A) != 0 && 4450 !(clearbits == PTE_W && (opte & PTE_D) == 0); 4451 4452 npte = opte & ~clearbits; 4453 4454 /* 4455 * If we need a shootdown anyway, clear PTE_A and PTE_D. 
4456 */ 4457 if (need_shootdown) { 4458 npte &= ~(PTE_A | PTE_D); 4459 } 4460 KASSERT((npte & (PTE_D | PTE_A)) != PTE_D); 4461 KASSERT((npte & (PTE_A | PTE_P)) != PTE_A); 4462 KASSERT(npte == 0 || (opte & PTE_P) != 0); 4463 } while (pmap_pte_cas(ptep, opte, npte) != opte); 4464 4465 if (need_shootdown) { 4466 pmap_tlb_shootdown(pmap, va, opte, TLBSHOOT_SYNC_PV); 4467 } 4468 pmap_unmap_pte(); 4469 4470 *oattrs = pmap_pte_to_pp_attrs(opte); 4471 if (optep != NULL) 4472 *optep = opte; 4473 return 0; 4474 } 4475 4476 static void 4477 pmap_pp_remove_ent(struct pmap *pmap, struct vm_page *ptp, pt_entry_t opte, 4478 vaddr_t va) 4479 { 4480 struct pmap *pmap2; 4481 pt_entry_t *ptes; 4482 pd_entry_t * const *pdes; 4483 4484 KASSERT(mutex_owned(&pmap->pm_lock)); 4485 4486 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); 4487 pmap_stats_update_bypte(pmap, 0, opte); 4488 ptp->wire_count--; 4489 if (ptp->wire_count <= 1) { 4490 pmap_free_ptp(pmap, ptp, va, ptes, pdes); 4491 } 4492 pmap_unmap_ptes(pmap, pmap2); 4493 } 4494 4495 static void 4496 pmap_pp_remove(struct pmap_page *pp, paddr_t pa) 4497 { 4498 struct pv_pte *pvpte; 4499 struct vm_page *ptp; 4500 uintptr_t sum; 4501 uint8_t oattrs; 4502 bool locked; 4503 4504 /* 4505 * Do an unlocked check to see if the page has no mappings, eg when 4506 * pmap_remove_all() was called before amap_wipeout() for a process 4507 * private amap - common. The page being removed must be on the way 4508 * out, so we don't have to worry about concurrent attempts to enter 4509 * it (otherwise the caller either doesn't care or has screwed up). 4510 */ 4511 sum = (uintptr_t)atomic_load_relaxed(&pp->pp_pte.pte_va); 4512 sum |= (uintptr_t)atomic_load_relaxed(&pp->pp_pte.pte_ptp); 4513 sum |= (uintptr_t)atomic_load_relaxed(&pp->pp_pvlist.lh_first); 4514 if (sum == 0) { 4515 return; 4516 } 4517 4518 kpreempt_disable(); 4519 for (;;) { 4520 struct pmap *pmap; 4521 struct pv_entry *pve; 4522 pt_entry_t opte; 4523 vaddr_t va; 4524 4525 mutex_spin_enter(&pp->pp_lock); 4526 if ((pvpte = pv_pte_first(pp)) == NULL) { 4527 mutex_spin_exit(&pp->pp_lock); 4528 break; 4529 } 4530 4531 /* 4532 * Add a reference to the pmap before clearing the pte. 4533 * Otherwise the pmap can disappear behind us. 4534 */ 4535 ptp = pvpte->pte_ptp; 4536 pmap = ptp_to_pmap(ptp); 4537 KASSERT(pmap->pm_obj[0].uo_refs > 0); 4538 if (ptp != NULL) { 4539 pmap_reference(pmap); 4540 } 4541 4542 /* 4543 * Now try to lock it. We need a direct handoff between 4544 * pp_lock and pm_lock to know the pv_entry is kept intact 4545 * and kept associated with this pmap. If that can't be 4546 * had, wait for the pmap's lock to become free and then 4547 * retry. 4548 */ 4549 locked = mutex_tryenter(&pmap->pm_lock); 4550 mutex_spin_exit(&pp->pp_lock); 4551 if (!locked) { 4552 mutex_enter(&pmap->pm_lock); 4553 /* nothing, just wait for it */ 4554 mutex_exit(&pmap->pm_lock); 4555 if (ptp != NULL) { 4556 pmap_destroy(pmap); 4557 } 4558 continue; 4559 } 4560 va = pvpte->pte_va; 4561 4562 KASSERTMSG(pmap->pm_stats.resident_count > PDP_SIZE, 4563 "va %lx pmap %p ptp %p is empty", va, pmap, ptp); 4564 KASSERTMSG(ptp == NULL || (ptp->flags & PG_FREE) == 0, 4565 "va %lx pmap %p ptp %p is free", va, pmap, ptp); 4566 KASSERTMSG(ptp == NULL || ptp->wire_count > 1, 4567 "va %lx pmap %p ptp %p is empty", va, pmap, ptp); 4568 4569 #ifdef DEBUG 4570 pmap_check_pv(pmap, ptp, pp, pvpte->pte_va, true); 4571 rb_tree_t *tree = (ptp != NULL ? 
4572 &VM_PAGE_TO_PP(ptp)->pp_rb : &pmap_kernel_rb); 4573 pve = pmap_treelookup_pv(pmap, ptp, tree, va); 4574 if (pve == NULL) { 4575 KASSERTMSG(&pp->pp_pte == pvpte, 4576 "va %lx pmap %p ptp %p pvpte %p pve %p oops 1", 4577 va, pmap, ptp, pvpte, pve); 4578 } else { 4579 KASSERTMSG(&pve->pve_pte == pvpte, 4580 "va %lx pmap %p ptp %p pvpte %p pve %p oops 2", 4581 va, pmap, ptp, pvpte, pve); 4582 } 4583 #endif 4584 4585 if (pmap_sync_pv(pvpte, pa, ~0, &oattrs, &opte)) { 4586 panic("pmap_pp_remove: mapping not present"); 4587 } 4588 4589 pve = pmap_lookup_pv(pmap, ptp, pp, va); 4590 pmap_remove_pv(pmap, pp, ptp, va, pve, oattrs); 4591 4592 /* Update the PTP reference count. Free if last reference. */ 4593 if (ptp != NULL) { 4594 KASSERT(pmap != pmap_kernel()); 4595 pmap_tlb_shootnow(); 4596 if (__predict_false(pmap->pm_pp_remove_ent != NULL)) { 4597 (*pmap->pm_pp_remove_ent)(pmap, ptp, opte, va); 4598 } else { 4599 pmap_pp_remove_ent(pmap, ptp, opte, va); 4600 } 4601 } else { 4602 KASSERT(pmap == pmap_kernel()); 4603 pmap_stats_update_bypte(pmap, 0, opte); 4604 } 4605 pmap_tlb_shootnow(); 4606 pmap_drain_pv(pmap); 4607 mutex_exit(&pmap->pm_lock); 4608 if (ptp != NULL) { 4609 pmap_destroy(pmap); 4610 } 4611 } 4612 kpreempt_enable(); 4613 } 4614 4615 /* 4616 * pmap_page_remove: remove a managed vm_page from all pmaps that map it 4617 * 4618 * => R/M bits are sync'd back to attrs 4619 */ 4620 void 4621 pmap_page_remove(struct vm_page *pg) 4622 { 4623 struct pmap_page *pp; 4624 paddr_t pa; 4625 4626 pp = VM_PAGE_TO_PP(pg); 4627 pa = VM_PAGE_TO_PHYS(pg); 4628 pmap_pp_remove(pp, pa); 4629 } 4630 4631 /* 4632 * pmap_pv_remove: remove an unmanaged pv-tracked page from all pmaps 4633 * that map it 4634 */ 4635 void 4636 pmap_pv_remove(paddr_t pa) 4637 { 4638 struct pmap_page *pp; 4639 4640 pp = pmap_pv_tracked(pa); 4641 if (pp == NULL) 4642 panic("%s: page not pv-tracked: %#"PRIxPADDR, __func__, pa); 4643 pmap_pp_remove(pp, pa); 4644 } 4645 4646 /* 4647 * p m a p a t t r i b u t e f u n c t i o n s 4648 * functions that test/change managed page's attributes 4649 * since a page can be mapped multiple times we must check each PTE that 4650 * maps it by going down the pv lists. 4651 */ 4652 4653 /* 4654 * pmap_test_attrs: test a page's attributes 4655 */ 4656 bool 4657 pmap_test_attrs(struct vm_page *pg, unsigned testbits) 4658 { 4659 struct pmap_page *pp; 4660 struct pv_pte *pvpte; 4661 struct pmap *pmap; 4662 uint8_t oattrs; 4663 u_int result; 4664 paddr_t pa; 4665 4666 pp = VM_PAGE_TO_PP(pg); 4667 if ((pp->pp_attrs & testbits) != 0) { 4668 return true; 4669 } 4670 pa = VM_PAGE_TO_PHYS(pg); 4671 startover: 4672 mutex_spin_enter(&pp->pp_lock); 4673 for (pvpte = pv_pte_first(pp); pvpte; pvpte = pv_pte_next(pp, pvpte)) { 4674 if ((pp->pp_attrs & testbits) != 0) { 4675 break; 4676 } 4677 if (pmap_sync_pv(pvpte, pa, 0, &oattrs, NULL)) { 4678 /* 4679 * raced with a V->P operation. wait for the other 4680 * side to finish by acquiring pmap's lock. if no 4681 * wait, updates to pp_attrs by the other side may 4682 * go unseen. 4683 */ 4684 pmap = ptp_to_pmap(pvpte->pte_ptp); 4685 pmap_reference(pmap); 4686 mutex_spin_exit(&pp->pp_lock); 4687 mutex_enter(&pmap->pm_lock); 4688 /* nothing. */ 4689 mutex_exit(&pmap->pm_lock); 4690 pmap_destroy(pmap); 4691 goto startover; 4692 } 4693 pp->pp_attrs |= oattrs; 4694 } 4695 result = pp->pp_attrs & testbits; 4696 mutex_spin_exit(&pp->pp_lock); 4697 4698 /* 4699 * note that we will exit the for loop with a non-null pve if 4700 * we have found the bits we are testing for. 
4701 */ 4702 4703 return result != 0; 4704 } 4705 4706 static bool 4707 pmap_pp_clear_attrs(struct pmap_page *pp, paddr_t pa, unsigned clearbits) 4708 { 4709 struct pv_pte *pvpte; 4710 struct pmap *pmap; 4711 uint8_t oattrs; 4712 u_int result; 4713 4714 startover: 4715 mutex_spin_enter(&pp->pp_lock); 4716 for (pvpte = pv_pte_first(pp); pvpte; pvpte = pv_pte_next(pp, pvpte)) { 4717 if (pmap_sync_pv(pvpte, pa, clearbits, &oattrs, NULL)) { 4718 /* 4719 * raced with a V->P operation. wait for the other 4720 * side to finish by acquiring pmap's lock. it is 4721 * probably unmapping the page, and it will be gone 4722 * when the loop is restarted. 4723 */ 4724 pmap = ptp_to_pmap(pvpte->pte_ptp); 4725 pmap_reference(pmap); 4726 mutex_spin_exit(&pp->pp_lock); 4727 mutex_enter(&pmap->pm_lock); 4728 /* nothing. */ 4729 mutex_exit(&pmap->pm_lock); 4730 pmap_destroy(pmap); 4731 goto startover; 4732 } 4733 pp->pp_attrs |= oattrs; 4734 } 4735 result = pp->pp_attrs & clearbits; 4736 pp->pp_attrs &= ~clearbits; 4737 pmap_tlb_shootnow(); 4738 mutex_spin_exit(&pp->pp_lock); 4739 4740 return result != 0; 4741 } 4742 4743 /* 4744 * pmap_clear_attrs: clear the specified attribute for a page. 4745 * 4746 * => we return true if we cleared one of the bits we were asked to 4747 */ 4748 bool 4749 pmap_clear_attrs(struct vm_page *pg, unsigned clearbits) 4750 { 4751 struct pmap_page *pp; 4752 paddr_t pa; 4753 4754 pp = VM_PAGE_TO_PP(pg); 4755 pa = VM_PAGE_TO_PHYS(pg); 4756 4757 /* 4758 * If this is a new page, assert it has no mappings and simply zap 4759 * the stored attributes without taking any locks. 4760 */ 4761 if ((pg->flags & PG_FAKE) != 0) { 4762 KASSERT(atomic_load_relaxed(&pp->pp_pte.pte_va) == 0); 4763 KASSERT(atomic_load_relaxed(&pp->pp_pte.pte_ptp) == NULL); 4764 KASSERT(atomic_load_relaxed(&pp->pp_pvlist.lh_first) == NULL); 4765 atomic_store_relaxed(&pp->pp_attrs, 0); 4766 return false; 4767 } else { 4768 return pmap_pp_clear_attrs(pp, pa, clearbits); 4769 } 4770 } 4771 4772 /* 4773 * pmap_pv_clear_attrs: clear the specified attributes for an unmanaged 4774 * pv-tracked page. 4775 */ 4776 bool 4777 pmap_pv_clear_attrs(paddr_t pa, unsigned clearbits) 4778 { 4779 struct pmap_page *pp; 4780 4781 pp = pmap_pv_tracked(pa); 4782 if (pp == NULL) 4783 panic("%s: page not pv-tracked: %#"PRIxPADDR, __func__, pa); 4784 4785 return pmap_pp_clear_attrs(pp, pa, clearbits); 4786 } 4787 4788 /* 4789 * p m a p p r o t e c t i o n f u n c t i o n s 4790 */ 4791 4792 /* 4793 * pmap_page_protect: change the protection of all recorded mappings 4794 * of a managed page 4795 * 4796 * => NOTE: this is an inline function in pmap.h 4797 */ 4798 4799 /* see pmap.h */ 4800 4801 /* 4802 * pmap_pv_protect: change the protection of all recorded mappings 4803 * of an unmanaged pv-tracked page 4804 * 4805 * => NOTE: this is an inline function in pmap.h 4806 */ 4807 4808 /* see pmap.h */ 4809 4810 /* 4811 * pmap_protect: set the protection in of the pages in a pmap 4812 * 4813 * => NOTE: this is an inline function in pmap.h 4814 */ 4815 4816 /* see pmap.h */ 4817 4818 /* 4819 * pmap_write_protect: write-protect pages in a pmap. 4820 * 4821 * Note for Xen-amd64. Xen automatically adds PTE_U to the kernel pages, but we 4822 * don't need to remove this bit when re-entering the PTEs here: Xen tracks the 4823 * kernel pages with a reserved bit (_PAGE_GUEST_KERNEL), so even if PTE_U is 4824 * present the page will still be considered as a kernel page, and the privilege 4825 * separation will be enforced correctly. 
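 *
 * Only two bits are ever touched here: PTE_W is removed when VM_PROT_WRITE
 * is absent from 'prot', and the NX bit is added when VM_PROT_EXECUTE is
 * absent. As an illustrative sketch, revoking write access while keeping
 * read/execute would be:
 *
 *	pmap_write_protect(pmap, sva, eva, VM_PROT_READ | VM_PROT_EXECUTE);
 *	pmap_update(pmap);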
 */
void
pmap_write_protect(struct pmap *pmap, vaddr_t sva, vaddr_t eva, vm_prot_t prot)
{
	pt_entry_t bit_rem, bit_put;
	pt_entry_t *ptes;
	pd_entry_t * const *pdes;
	struct pmap *pmap2;
	vaddr_t blockend, va;
	int lvl, i;

	if (__predict_false(pmap->pm_write_protect != NULL)) {
		(*pmap->pm_write_protect)(pmap, sva, eva, prot);
		return;
	}

	bit_rem = 0;
	if (!(prot & VM_PROT_WRITE))
		bit_rem = PTE_W;

	bit_put = 0;
	if (!(prot & VM_PROT_EXECUTE))
		bit_put = pmap_pg_nx;

	sva &= ~PAGE_MASK;
	eva &= ~PAGE_MASK;

	/*
	 * Acquire pmap. No need to lock the kernel pmap as we won't
	 * be touching PV entries nor stats and kernel PDEs aren't
	 * freed.
	 */
	if (pmap != pmap_kernel()) {
		mutex_enter(&pmap->pm_lock);
	}
	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);

	for (va = sva ; va < eva; va = blockend) {
		pt_entry_t *spte, *epte;

		blockend = x86_round_pdr(va + 1);
		if (blockend > eva)
			blockend = eva;

		/* Is it a valid block? */
		if (!pmap_pdes_valid(va, pdes, NULL, &lvl)) {
			continue;
		}
		KASSERT(va < VM_MAXUSER_ADDRESS || va >= VM_MAX_ADDRESS);
		KASSERT(lvl == 1);

		spte = &ptes[pl1_i(va)];
		epte = &ptes[pl1_i(blockend)];

		for (i = 0; spte < epte; spte++, i++) {
			pt_entry_t opte, npte;

			do {
				opte = *spte;
				if (!pmap_valid_entry(opte)) {
					goto next;
				}
				npte = (opte & ~bit_rem) | bit_put;
			} while (pmap_pte_cas(spte, opte, npte) != opte);

			if ((opte & PTE_D) != 0) {
				vaddr_t tva = va + x86_ptob(i);
				pmap_tlb_shootdown(pmap, tva, opte,
				    TLBSHOOT_WRITE_PROTECT);
			}
next:;
		}
	}

	/* Release pmap. */
	pmap_unmap_ptes(pmap, pmap2);
	if (pmap != pmap_kernel()) {
		mutex_exit(&pmap->pm_lock);
	}
}

/*
 * pmap_unwire: clear the wired bit in the PTE.
 *
 * => Mapping should already be present.
 */
void
pmap_unwire(struct pmap *pmap, vaddr_t va)
{
	pt_entry_t *ptes, *ptep, opte;
	pd_entry_t * const *pdes;
	struct pmap *pmap2;
	int lvl;

	if (__predict_false(pmap->pm_unwire != NULL)) {
		(*pmap->pm_unwire)(pmap, va);
		return;
	}

	/*
	 * Acquire pmap. Need to lock the kernel pmap only to protect the
	 * statistics.
	 */
	mutex_enter(&pmap->pm_lock);
	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);

	if (!pmap_pdes_valid(va, pdes, NULL, &lvl)) {
		panic("%s: invalid PDE va=%#" PRIxVADDR, __func__, va);
	}
	KASSERT(lvl == 1);

	ptep = &ptes[pl1_i(va)];
	opte = *ptep;
	KASSERT(pmap_valid_entry(opte));

	if (opte & PTE_WIRED) {
		pt_entry_t npte = opte & ~PTE_WIRED;

		opte = pmap_pte_testset(ptep, npte);
		pmap_stats_update_bypte(pmap, npte, opte);
	} else {
		printf("%s: wiring for pmap %p va %#" PRIxVADDR
		    " did not change!\n", __func__, pmap, va);
	}

	/* Release pmap.
*/ 4952 pmap_unmap_ptes(pmap, pmap2); 4953 mutex_exit(&pmap->pm_lock); 4954 } 4955 4956 /* 4957 * pmap_copy: copy mappings from one pmap to another 4958 * 4959 * => optional function 4960 * void pmap_copy(dst_pmap, src_pmap, dst_addr, len, src_addr) 4961 */ 4962 4963 /* 4964 * defined as macro in pmap.h 4965 */ 4966 4967 __strict_weak_alias(pmap_enter, pmap_enter_default); 4968 4969 int 4970 pmap_enter_default(pmap_t pmap, vaddr_t va, paddr_t pa, vm_prot_t prot, 4971 u_int flags) 4972 { 4973 if (__predict_false(pmap->pm_enter != NULL)) { 4974 return (*pmap->pm_enter)(pmap, va, pa, prot, flags); 4975 } 4976 4977 return pmap_enter_ma(pmap, va, pa, pa, prot, flags, 0); 4978 } 4979 4980 /* 4981 * pmap_enter: enter a mapping into a pmap 4982 * 4983 * => must be done "now" ... no lazy-evaluation 4984 */ 4985 int 4986 pmap_enter_ma(struct pmap *pmap, vaddr_t va, paddr_t ma, paddr_t pa, 4987 vm_prot_t prot, u_int flags, int domid) 4988 { 4989 pt_entry_t *ptes, opte, npte; 4990 pt_entry_t *ptep; 4991 pd_entry_t * const *pdes; 4992 struct vm_page *ptp; 4993 struct vm_page *new_pg, *old_pg; 4994 struct pmap_page *new_pp, *old_pp; 4995 struct pv_entry *old_pve, *new_pve; 4996 bool wired = (flags & PMAP_WIRED) != 0; 4997 struct pmap *pmap2; 4998 struct pmap_ptparray pt; 4999 int error; 5000 bool getptp, samepage, new_embedded; 5001 rb_tree_t *tree; 5002 5003 KASSERT(pmap_initialized); 5004 KASSERT(va < VM_MAX_KERNEL_ADDRESS); 5005 KASSERTMSG(va != (vaddr_t)PDP_BASE, "%s: trying to map va=%#" 5006 PRIxVADDR " over PDP!", __func__, va); 5007 KASSERTMSG(va < VM_MIN_KERNEL_ADDRESS || 5008 pmap_valid_entry(pmap->pm_pdir[pl_i(va, PTP_LEVELS)]), 5009 "%s: missing kernel PTP for va=%#" PRIxVADDR, __func__, va); 5010 5011 #ifdef XENPV 5012 KASSERT(domid == DOMID_SELF || pa == 0); 5013 #endif 5014 5015 npte = ma | protection_codes[prot] | PTE_P; 5016 npte |= pmap_pat_flags(flags); 5017 if (wired) 5018 npte |= PTE_WIRED; 5019 if (va < VM_MAXUSER_ADDRESS) { 5020 KASSERTMSG(pmap != pmap_kernel(), 5021 "entering user va %#"PRIxVADDR" into kernel pmap", 5022 va); 5023 if (pmap_is_user(pmap)) 5024 npte |= PTE_U; 5025 } 5026 5027 if (pmap == pmap_kernel()) 5028 npte |= pmap_pg_g; 5029 if (flags & VM_PROT_ALL) { 5030 npte |= PTE_A; 5031 if (flags & VM_PROT_WRITE) { 5032 KASSERT((npte & PTE_W) != 0); 5033 npte |= PTE_D; 5034 } 5035 } 5036 5037 #ifdef XENPV 5038 if (domid != DOMID_SELF) 5039 new_pg = NULL; 5040 else 5041 #endif 5042 new_pg = PHYS_TO_VM_PAGE(pa); 5043 5044 if (new_pg != NULL) { 5045 /* This is a managed page */ 5046 npte |= PTE_PVLIST; 5047 new_pp = VM_PAGE_TO_PP(new_pg); 5048 PMAP_CHECK_PP(new_pp); 5049 } else if ((new_pp = pmap_pv_tracked(pa)) != NULL) { 5050 /* This is an unmanaged pv-tracked page */ 5051 npte |= PTE_PVLIST; 5052 PMAP_CHECK_PP(new_pp); 5053 } else { 5054 new_pp = NULL; 5055 } 5056 5057 /* Begin by locking the pmap. */ 5058 mutex_enter(&pmap->pm_lock); 5059 5060 /* Look up the PTP. Allocate if none present. */ 5061 ptp = NULL; 5062 getptp = false; 5063 if (pmap != pmap_kernel()) { 5064 ptp = pmap_find_ptp(pmap, va, 1); 5065 if (ptp == NULL) { 5066 getptp = true; 5067 error = pmap_get_ptp(pmap, &pt, va, flags, &ptp); 5068 if (error != 0) { 5069 if (flags & PMAP_CANFAIL) { 5070 mutex_exit(&pmap->pm_lock); 5071 return error; 5072 } 5073 panic("%s: get ptp failed, error=%d", __func__, 5074 error); 5075 } 5076 } 5077 tree = &VM_PAGE_TO_PP(ptp)->pp_rb; 5078 } else { 5079 /* Embedded PV entries rely on this. 
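		 * (An embedded PV entry with pte_ptp == NULL uses pte_va == 0
		 * to mean "unused", so a real kernel mapping at VA 0 would be
		 * indistinguishable from an empty slot.)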
*/ 5080 KASSERT(va != 0); 5081 tree = &pmap_kernel_rb; 5082 } 5083 5084 /* 5085 * Look up the old PV entry at this VA (if any), and insert a new PV 5086 * entry if required for the new mapping. Temporarily track the old 5087 * and new mappings concurrently. Only after the old mapping is 5088 * evicted from the pmap will we remove its PV entry. Otherwise, 5089 * our picture of modified/accessed state for either page could get 5090 * out of sync (we need any P->V operation for either page to stall 5091 * on pmap->pm_lock until done here). 5092 */ 5093 new_pve = NULL; 5094 old_pve = NULL; 5095 samepage = false; 5096 new_embedded = false; 5097 5098 if (new_pp != NULL) { 5099 error = pmap_enter_pv(pmap, new_pp, ptp, va, &new_pve, 5100 &old_pve, &samepage, &new_embedded, tree); 5101 5102 /* 5103 * If a new pv_entry was needed and none was available, we 5104 * can go no further. 5105 */ 5106 if (error != 0) { 5107 if (flags & PMAP_CANFAIL) { 5108 if (getptp) { 5109 pmap_unget_ptp(pmap, &pt); 5110 } 5111 mutex_exit(&pmap->pm_lock); 5112 return error; 5113 } 5114 panic("%s: alloc pve failed", __func__); 5115 } 5116 } else { 5117 old_pve = pmap_treelookup_pv(pmap, ptp, tree, va); 5118 } 5119 5120 /* Map PTEs into address space. */ 5121 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); 5122 5123 /* Install any newly allocated PTPs. */ 5124 if (getptp) { 5125 pmap_install_ptp(pmap, &pt, va, pdes); 5126 } 5127 5128 /* Check if there is an existing mapping. */ 5129 ptep = &ptes[pl1_i(va)]; 5130 opte = *ptep; 5131 bool have_oldpa = pmap_valid_entry(opte); 5132 paddr_t oldpa = pmap_pte2pa(opte); 5133 5134 /* 5135 * Update the pte. 5136 */ 5137 do { 5138 opte = *ptep; 5139 5140 /* 5141 * if the same page, inherit PTE_A and PTE_D. 5142 */ 5143 if (((opte ^ npte) & (PTE_FRAME | PTE_P)) == 0) { 5144 npte |= opte & (PTE_A | PTE_D); 5145 } 5146 #if defined(XENPV) 5147 if (domid != DOMID_SELF) { 5148 /* pmap_pte_cas with error handling */ 5149 int s = splvm(); 5150 if (opte != *ptep) { 5151 splx(s); 5152 continue; 5153 } 5154 error = xpq_update_foreign( 5155 vtomach((vaddr_t)ptep), npte, domid, flags); 5156 splx(s); 5157 if (error) { 5158 /* Undo pv_entry tracking - oof. */ 5159 if (new_pp != NULL) { 5160 mutex_spin_enter(&new_pp->pp_lock); 5161 if (new_pve != NULL) { 5162 LIST_REMOVE(new_pve, pve_list); 5163 KASSERT(pmap->pm_pve == NULL); 5164 pmap->pm_pve = new_pve; 5165 } else if (new_embedded) { 5166 new_pp->pp_pte.pte_ptp = NULL; 5167 new_pp->pp_pte.pte_va = 0; 5168 } 5169 mutex_spin_exit(&new_pp->pp_lock); 5170 } 5171 pmap_unmap_ptes(pmap, pmap2); 5172 /* Free new PTP. */ 5173 if (ptp != NULL && ptp->wire_count <= 1) { 5174 pmap_free_ptp(pmap, ptp, va, ptes, 5175 pdes); 5176 } 5177 mutex_exit(&pmap->pm_lock); 5178 return error; 5179 } 5180 break; 5181 } 5182 #endif /* defined(XENPV) */ 5183 } while (pmap_pte_cas(ptep, opte, npte) != opte); 5184 5185 /* 5186 * Done with the PTEs: they can now be unmapped. 5187 */ 5188 pmap_unmap_ptes(pmap, pmap2); 5189 5190 /* 5191 * Update statistics and PTP's reference count. 5192 */ 5193 pmap_stats_update_bypte(pmap, npte, opte); 5194 if (ptp != NULL) { 5195 if (!have_oldpa) { 5196 ptp->wire_count++; 5197 } 5198 /* Remember minimum VA in PTP. */ 5199 pmap_ptp_range_set(ptp, va); 5200 } 5201 KASSERT(ptp == NULL || ptp->wire_count > 1); 5202 5203 /* 5204 * If the same page, we can skip pv_entry handling. 
5205 */ 5206 if (((opte ^ npte) & (PTE_FRAME | PTE_P)) == 0) { 5207 KASSERT(((opte ^ npte) & PTE_PVLIST) == 0); 5208 if ((npte & PTE_PVLIST) != 0) { 5209 KASSERT(samepage); 5210 pmap_check_pv(pmap, ptp, new_pp, va, true); 5211 } 5212 goto same_pa; 5213 } else if ((npte & PTE_PVLIST) != 0) { 5214 KASSERT(!samepage); 5215 } 5216 5217 /* 5218 * If old page is pv-tracked, remove pv_entry from its list. 5219 */ 5220 if ((~opte & (PTE_P | PTE_PVLIST)) == 0) { 5221 if ((old_pg = PHYS_TO_VM_PAGE(oldpa)) != NULL) { 5222 old_pp = VM_PAGE_TO_PP(old_pg); 5223 } else if ((old_pp = pmap_pv_tracked(oldpa)) == NULL) { 5224 panic("%s: PTE_PVLIST with pv-untracked page" 5225 " va = %#"PRIxVADDR 5226 " pa = %#" PRIxPADDR " (%#" PRIxPADDR ")", 5227 __func__, va, oldpa, atop(pa)); 5228 } 5229 5230 pmap_remove_pv(pmap, old_pp, ptp, va, old_pve, 5231 pmap_pte_to_pp_attrs(opte)); 5232 } else { 5233 KASSERT(old_pve == NULL); 5234 KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == NULL); 5235 } 5236 5237 /* 5238 * If new page is dynamically PV tracked, insert to tree. 5239 */ 5240 if (new_pve != NULL) { 5241 KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == NULL); 5242 old_pve = rb_tree_insert_node(tree, new_pve); 5243 KASSERT(old_pve == new_pve); 5244 pmap_check_pv(pmap, ptp, new_pp, va, true); 5245 } 5246 5247 same_pa: 5248 /* 5249 * shootdown tlb if necessary. 5250 */ 5251 5252 if ((~opte & (PTE_P | PTE_A)) == 0 && 5253 ((opte ^ npte) & (PTE_FRAME | PTE_W)) != 0) { 5254 pmap_tlb_shootdown(pmap, va, opte, TLBSHOOT_ENTER); 5255 } 5256 pmap_drain_pv(pmap); 5257 mutex_exit(&pmap->pm_lock); 5258 return 0; 5259 } 5260 5261 #if defined(XEN) && defined(DOM0OPS) 5262 5263 struct pmap_data_gnt { 5264 SLIST_ENTRY(pmap_data_gnt) pd_gnt_list; 5265 vaddr_t pd_gnt_sva; 5266 vaddr_t pd_gnt_eva; /* range covered by this gnt */ 5267 int pd_gnt_refs; /* ref counter */ 5268 struct gnttab_map_grant_ref pd_gnt_ops[1]; /* variable length */ 5269 }; 5270 SLIST_HEAD(pmap_data_gnt_head, pmap_data_gnt); 5271 5272 static void pmap_remove_gnt(struct pmap *, vaddr_t, vaddr_t); 5273 5274 static struct pmap_data_gnt * 5275 pmap_find_gnt(struct pmap *pmap, vaddr_t sva, vaddr_t eva) 5276 { 5277 struct pmap_data_gnt_head *headp; 5278 struct pmap_data_gnt *pgnt; 5279 5280 KASSERT(mutex_owned(&pmap->pm_lock)); 5281 headp = pmap->pm_data; 5282 KASSERT(headp != NULL); 5283 SLIST_FOREACH(pgnt, headp, pd_gnt_list) { 5284 if (pgnt->pd_gnt_sva <= sva && eva <= pgnt->pd_gnt_eva) 5285 return pgnt; 5286 /* check that we're not overlapping part of a region */ 5287 KASSERT(pgnt->pd_gnt_sva >= eva || pgnt->pd_gnt_eva <= sva); 5288 } 5289 return NULL; 5290 } 5291 5292 static void 5293 pmap_alloc_gnt(struct pmap *pmap, vaddr_t sva, int nentries, 5294 const struct gnttab_map_grant_ref *ops) 5295 { 5296 struct pmap_data_gnt_head *headp; 5297 struct pmap_data_gnt *pgnt; 5298 vaddr_t eva = sva + nentries * PAGE_SIZE; 5299 KASSERT(mutex_owned(&pmap->pm_lock)); 5300 KASSERT(nentries >= 1); 5301 if (pmap->pm_remove == NULL) { 5302 pmap->pm_remove = pmap_remove_gnt; 5303 KASSERT(pmap->pm_data == NULL); 5304 headp = kmem_alloc(sizeof(*headp), KM_SLEEP); 5305 SLIST_INIT(headp); 5306 pmap->pm_data = headp; 5307 } else { 5308 KASSERT(pmap->pm_remove == pmap_remove_gnt); 5309 KASSERT(pmap->pm_data != NULL); 5310 headp = pmap->pm_data; 5311 } 5312 5313 pgnt = pmap_find_gnt(pmap, sva, eva); 5314 if (pgnt != NULL) { 5315 KASSERT(pgnt->pd_gnt_sva == sva); 5316 KASSERT(pgnt->pd_gnt_eva == eva); 5317 return; 5318 } 5319 5320 /* new entry */ 5321 pgnt = kmem_alloc(sizeof(*pgnt) + 
5322 (nentries - 1) * sizeof(struct gnttab_map_grant_ref), KM_SLEEP); 5323 pgnt->pd_gnt_sva = sva; 5324 pgnt->pd_gnt_eva = eva; 5325 pgnt->pd_gnt_refs = 0; 5326 memcpy(pgnt->pd_gnt_ops, ops, 5327 sizeof(struct gnttab_map_grant_ref) * nentries); 5328 SLIST_INSERT_HEAD(headp, pgnt, pd_gnt_list); 5329 } 5330 5331 static void 5332 pmap_free_gnt(struct pmap *pmap, struct pmap_data_gnt *pgnt) 5333 { 5334 struct pmap_data_gnt_head *headp = pmap->pm_data; 5335 int nentries = (pgnt->pd_gnt_eva - pgnt->pd_gnt_sva) / PAGE_SIZE; 5336 KASSERT(nentries >= 1); 5337 KASSERT(mutex_owned(&pmap->pm_lock)); 5338 KASSERT(pgnt->pd_gnt_refs == 0); 5339 SLIST_REMOVE(headp, pgnt, pmap_data_gnt, pd_gnt_list); 5340 kmem_free(pgnt, sizeof(*pgnt) + 5341 (nentries - 1) * sizeof(struct gnttab_map_grant_ref)); 5342 if (SLIST_EMPTY(headp)) { 5343 kmem_free(headp, sizeof(*headp)); 5344 pmap->pm_data = NULL; 5345 pmap->pm_remove = NULL; 5346 } 5347 } 5348 5349 /* 5350 * pmap_enter_gnt: enter a grant entry into a pmap 5351 * 5352 * => must be done "now" ... no lazy-evaluation 5353 */ 5354 int 5355 pmap_enter_gnt(struct pmap *pmap, vaddr_t va, vaddr_t sva, int nentries, 5356 const struct gnttab_map_grant_ref *oops) 5357 { 5358 struct pmap_data_gnt *pgnt; 5359 pt_entry_t *ptes, opte; 5360 #ifndef XENPV 5361 pt_entry_t npte; 5362 #endif 5363 pt_entry_t *ptep; 5364 pd_entry_t * const *pdes; 5365 struct vm_page *ptp; 5366 struct vm_page *old_pg; 5367 struct pmap_page *old_pp; 5368 struct pv_entry *old_pve; 5369 struct pmap *pmap2; 5370 struct pmap_ptparray pt; 5371 int error; 5372 bool getptp; 5373 rb_tree_t *tree; 5374 struct gnttab_map_grant_ref *op; 5375 int ret; 5376 int idx; 5377 5378 KASSERT(pmap_initialized); 5379 KASSERT(va < VM_MAX_KERNEL_ADDRESS); 5380 KASSERTMSG(va != (vaddr_t)PDP_BASE, "%s: trying to map va=%#" 5381 PRIxVADDR " over PDP!", __func__, va); 5382 KASSERT(pmap != pmap_kernel()); 5383 5384 /* Begin by locking the pmap. */ 5385 mutex_enter(&pmap->pm_lock); 5386 pmap_alloc_gnt(pmap, sva, nentries, oops); 5387 5388 pgnt = pmap_find_gnt(pmap, va, va + PAGE_SIZE); 5389 KASSERT(pgnt != NULL); 5390 5391 /* Look up the PTP. Allocate if none present. */ 5392 ptp = NULL; 5393 getptp = false; 5394 ptp = pmap_find_ptp(pmap, va, 1); 5395 if (ptp == NULL) { 5396 getptp = true; 5397 error = pmap_get_ptp(pmap, &pt, va, PMAP_CANFAIL, &ptp); 5398 if (error != 0) { 5399 mutex_exit(&pmap->pm_lock); 5400 return error; 5401 } 5402 } 5403 tree = &VM_PAGE_TO_PP(ptp)->pp_rb; 5404 5405 /* 5406 * Look up the old PV entry at this VA (if any), and insert a new PV 5407 * entry if required for the new mapping. Temporarily track the old 5408 * and new mappings concurrently. Only after the old mapping is 5409 * evicted from the pmap will we remove its PV entry. Otherwise, 5410 * our picture of modified/accessed state for either page could get 5411 * out of sync (we need any P->V operation for either page to stall 5412 * on pmap->pm_lock until done here). 5413 */ 5414 old_pve = NULL; 5415 5416 old_pve = pmap_treelookup_pv(pmap, ptp, tree, va); 5417 5418 /* Map PTEs into address space. */ 5419 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); 5420 5421 /* Install any newly allocated PTPs. */ 5422 if (getptp) { 5423 pmap_install_ptp(pmap, &pt, va, pdes); 5424 } 5425 5426 /* Check if there is an existing mapping. */ 5427 ptep = &ptes[pl1_i(va)]; 5428 opte = *ptep; 5429 bool have_oldpa = pmap_valid_entry(opte); 5430 paddr_t oldpa = pmap_pte2pa(opte); 5431 5432 /* 5433 * Update the pte. 
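	 * On XENPV the PTE is written by the hypervisor as part of the
	 * GNTTABOP_map_grant_ref operation (GNTMAP_contains_pte); otherwise
	 * we construct npte from op->host_addr and install it ourselves
	 * with pmap_pte_cas() below.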
5434 */ 5435 5436 idx = (va - pgnt->pd_gnt_sva) / PAGE_SIZE; 5437 op = &pgnt->pd_gnt_ops[idx]; 5438 5439 #ifdef XENPV 5440 KASSERT(op->flags & GNTMAP_contains_pte); 5441 op->host_addr = xpmap_ptetomach(ptep); 5442 #else 5443 KASSERT((op->flags & GNTMAP_contains_pte) == 0); 5444 KASSERT(op->flags != 0); 5445 KASSERT(op->host_addr != 0); 5446 #endif 5447 op->dev_bus_addr = 0; 5448 op->status = GNTST_general_error; 5449 ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, op, 1); 5450 if (__predict_false(ret)) { 5451 printf("%s: GNTTABOP_map_grant_ref failed: %d\n", 5452 __func__, ret); 5453 op->status = GNTST_general_error; 5454 } 5455 for (int d = 0; d < 256 && op->status == GNTST_eagain; d++) { 5456 kpause("gntmap", false, mstohz(1), NULL); 5457 ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, op, 1); 5458 if (__predict_false(ret)) { 5459 printf("%s: GNTTABOP_map_grant_ref failed: %d\n", 5460 __func__, ret); 5461 op->status = GNTST_general_error; 5462 } 5463 } 5464 if (__predict_false(op->status != GNTST_okay)) { 5465 printf("%s: GNTTABOP_map_grant_ref status: %d\n", 5466 __func__, op->status); 5467 if (have_oldpa) { /* XXX did the pte really change if XENPV ?*/ 5468 ptp->wire_count--; 5469 } 5470 } else { 5471 #ifndef XENPV 5472 npte = op->host_addr | pmap_pg_nx | PTE_U | PTE_P; 5473 if ((op->flags & GNTMAP_readonly) == 0) 5474 npte |= PTE_W; 5475 do { 5476 opte = *ptep; 5477 } while (pmap_pte_cas(ptep, opte, npte) != opte); 5478 #endif 5479 pgnt->pd_gnt_refs++; 5480 if (!have_oldpa) { 5481 ptp->wire_count++; 5482 } 5483 KASSERT(ptp->wire_count > 1); 5484 /* Remember minimum VA in PTP. */ 5485 pmap_ptp_range_set(ptp, va); 5486 } 5487 if (ptp->wire_count <= 1) 5488 pmap_free_ptp(pmap, ptp, va, ptes, pdes); 5489 5490 /* 5491 * Done with the PTEs: they can now be unmapped. 5492 */ 5493 pmap_unmap_ptes(pmap, pmap2); 5494 5495 /* 5496 * Update statistics and PTP's reference count. 5497 */ 5498 pmap_stats_update_bypte(pmap, 0, opte); 5499 5500 /* 5501 * If old page is pv-tracked, remove pv_entry from its list. 5502 */ 5503 if ((~opte & (PTE_P | PTE_PVLIST)) == 0) { 5504 if ((old_pg = PHYS_TO_VM_PAGE(oldpa)) != NULL) { 5505 old_pp = VM_PAGE_TO_PP(old_pg); 5506 } else if ((old_pp = pmap_pv_tracked(oldpa)) == NULL) { 5507 panic("%s: PTE_PVLIST with pv-untracked page" 5508 " va = %#"PRIxVADDR " pa = %#" PRIxPADDR, 5509 __func__, va, oldpa); 5510 } 5511 5512 pmap_remove_pv(pmap, old_pp, ptp, va, old_pve, 5513 pmap_pte_to_pp_attrs(opte)); 5514 } else { 5515 KASSERT(old_pve == NULL); 5516 KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == NULL); 5517 } 5518 5519 pmap_drain_pv(pmap); 5520 mutex_exit(&pmap->pm_lock); 5521 return op->status; 5522 } 5523 5524 /* 5525 * pmap_remove_gnt: grant mapping removal function. 
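 *
 * Installed as pmap->pm_remove by pmap_alloc_gnt(), so pmap_remove()
 * dispatches here for pmaps that contain grant mappings.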
5526 * 5527 * => caller should not be holding any pmap locks 5528 */ 5529 static void 5530 pmap_remove_gnt(struct pmap *pmap, vaddr_t sva, vaddr_t eva) 5531 { 5532 struct pmap_data_gnt *pgnt; 5533 pt_entry_t *ptes; 5534 pd_entry_t pde; 5535 pd_entry_t * const *pdes; 5536 struct vm_page *ptp; 5537 struct pmap *pmap2; 5538 vaddr_t va; 5539 int lvl; 5540 int idx; 5541 struct gnttab_map_grant_ref *op; 5542 struct gnttab_unmap_grant_ref unmap_op; 5543 int ret; 5544 5545 KASSERT(pmap != pmap_kernel()); 5546 KASSERT(pmap->pm_remove == pmap_remove_gnt); 5547 5548 mutex_enter(&pmap->pm_lock); 5549 for (va = sva; va < eva; va += PAGE_SIZE) { 5550 pgnt = pmap_find_gnt(pmap, va, va + PAGE_SIZE); 5551 if (pgnt == NULL) { 5552 pmap_remove_locked(pmap, sva, eva); 5553 continue; 5554 } 5555 5556 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); 5557 if (!pmap_pdes_valid(va, pdes, &pde, &lvl)) { 5558 panic("pmap_remove_gnt pdes not valid"); 5559 } 5560 5561 idx = (va - pgnt->pd_gnt_sva) / PAGE_SIZE; 5562 op = &pgnt->pd_gnt_ops[idx]; 5563 KASSERT(lvl == 1); 5564 5565 /* Get PTP if non-kernel mapping. */ 5566 ptp = pmap_find_ptp(pmap, va, 1); 5567 KASSERTMSG(ptp != NULL, 5568 "%s: unmanaged PTP detected", __func__); 5569 5570 if (op->status == GNTST_okay) { 5571 KASSERT(pmap_valid_entry(ptes[pl1_i(va)])); 5572 #ifdef XENPV 5573 unmap_op.host_addr = xpmap_ptetomach(&ptes[pl1_i(va)]); 5574 #else 5575 unmap_op.host_addr = op->host_addr; 5576 pmap_pte_testset(&ptes[pl1_i(va)], 0); 5577 #endif 5578 unmap_op.handle = op->handle; 5579 unmap_op.dev_bus_addr = 0; 5580 ret = HYPERVISOR_grant_table_op( 5581 GNTTABOP_unmap_grant_ref, &unmap_op, 1); 5582 if (ret) { 5583 printf("%s: GNTTABOP_unmap_grant_ref " 5584 "failed: %d\n", __func__, ret); 5585 } 5586 5587 ptp->wire_count--; 5588 pgnt->pd_gnt_refs--; 5589 } 5590 if (pgnt->pd_gnt_refs == 0) { 5591 pmap_free_gnt(pmap, pgnt); 5592 } 5593 /* 5594 * if mapping removed and the PTP is no longer 5595 * being used, free it! 5596 */ 5597 5598 if (ptp->wire_count <= 1) 5599 pmap_free_ptp(pmap, ptp, va, ptes, pdes); 5600 pmap_unmap_ptes(pmap, pmap2); 5601 } 5602 mutex_exit(&pmap->pm_lock); 5603 } 5604 #endif /* XEN && DOM0OPS */ 5605 5606 paddr_t 5607 pmap_get_physpage(void) 5608 { 5609 struct vm_page *ptp; 5610 struct pmap *kpm = pmap_kernel(); 5611 paddr_t pa; 5612 5613 if (!uvm.page_init_done) { 5614 /* 5615 * We're growing the kernel pmap early (from 5616 * uvm_pageboot_alloc()). This case must be 5617 * handled a little differently. 
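		 * (The VM system is not fully initialized yet, so we take a
		 * raw physical page with uvm_page_physget() and zero it by
		 * hand rather than using uvm_pagealloc().)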
5618 */ 5619 5620 if (!uvm_page_physget(&pa)) 5621 panic("%s: out of memory", __func__); 5622 #if defined(__HAVE_DIRECT_MAP) 5623 memset(PAGE_ALIGNED(PMAP_DIRECT_MAP(pa)), 0, PAGE_SIZE); 5624 #else 5625 #if defined(XENPV) 5626 if (XEN_VERSION_SUPPORTED(3, 4)) { 5627 xen_pagezero(pa); 5628 return pa; 5629 } 5630 #endif 5631 kpreempt_disable(); 5632 pmap_pte_set(early_zero_pte, pmap_pa2pte(pa) | PTE_P | 5633 PTE_W | pmap_pg_nx); 5634 pmap_pte_flush(); 5635 pmap_update_pg((vaddr_t)early_zerop); 5636 memset(PAGE_ALIGNED(early_zerop), 0, PAGE_SIZE); 5637 #if defined(DIAGNOSTIC) || defined(XENPV) 5638 pmap_pte_set(early_zero_pte, 0); 5639 pmap_pte_flush(); 5640 #endif /* defined(DIAGNOSTIC) */ 5641 kpreempt_enable(); 5642 #endif /* defined(__HAVE_DIRECT_MAP) */ 5643 } else { 5644 /* XXX */ 5645 ptp = uvm_pagealloc(NULL, 0, NULL, 5646 UVM_PGA_USERESERVE|UVM_PGA_ZERO); 5647 if (ptp == NULL) 5648 panic("%s: out of memory", __func__); 5649 ptp->flags &= ~PG_BUSY; 5650 ptp->wire_count = 1; 5651 pa = VM_PAGE_TO_PHYS(ptp); 5652 } 5653 pmap_stats_update(kpm, 1, 0); 5654 5655 return pa; 5656 } 5657 5658 /* 5659 * Expand the page tree with the specified amount of PTPs, mapping virtual 5660 * addresses starting at kva. We populate all the levels but the last one 5661 * (L1). The nodes of the tree are created as RW, but the pages covered 5662 * will be kentered in L1, with proper permissions. 5663 * 5664 * Used only by pmap_growkernel. 5665 */ 5666 static void 5667 pmap_alloc_level(struct pmap *cpm, vaddr_t kva, long *needed_ptps) 5668 { 5669 unsigned long i; 5670 paddr_t pa; 5671 unsigned long index, endindex; 5672 int level; 5673 pd_entry_t *pdep; 5674 #ifdef XENPV 5675 int s = splvm(); /* protect xpq_* */ 5676 #endif 5677 5678 for (level = PTP_LEVELS; level > 1; level--) { 5679 if (level == PTP_LEVELS) 5680 pdep = cpm->pm_pdir; 5681 else 5682 pdep = normal_pdes[level - 2]; 5683 index = pl_i_roundup(kva, level); 5684 endindex = index + needed_ptps[level - 1] - 1; 5685 5686 for (i = index; i <= endindex; i++) { 5687 pt_entry_t pte; 5688 5689 KASSERT(!pmap_valid_entry(pdep[i])); 5690 pa = pmap_get_physpage(); 5691 pte = pmap_pa2pte(pa) | PTE_P | PTE_W; 5692 #ifdef __x86_64__ 5693 pte |= pmap_pg_nx; 5694 #endif 5695 pmap_pte_set(&pdep[i], pte); 5696 5697 #ifdef XENPV 5698 if (level == PTP_LEVELS && i >= PDIR_SLOT_KERN) { 5699 if (__predict_true( 5700 cpu_info_primary.ci_flags & CPUF_PRESENT)) { 5701 /* update per-cpu PMDs on all cpus */ 5702 xen_kpm_sync(pmap_kernel(), i); 5703 } else { 5704 /* 5705 * too early; update primary CPU 5706 * PMD only (without locks) 5707 */ 5708 #ifdef __x86_64__ 5709 pd_entry_t *cpu_pdep = 5710 &cpu_info_primary.ci_kpm_pdir[i]; 5711 #else 5712 pd_entry_t *cpu_pdep = 5713 &cpu_info_primary.ci_kpm_pdir[l2tol2(i)]; 5714 #endif 5715 pmap_pte_set(cpu_pdep, pte); 5716 } 5717 } 5718 #endif 5719 5720 KASSERT(level != PTP_LEVELS || nkptp[level - 1] + 5721 pl_i(VM_MIN_KERNEL_ADDRESS, level) == i); 5722 nkptp[level - 1]++; 5723 } 5724 pmap_pte_flush(); 5725 } 5726 #ifdef XENPV 5727 splx(s); 5728 #endif 5729 } 5730 5731 /* 5732 * pmap_growkernel: increase usage of KVM space. 5733 * 5734 * => we allocate new PTPs for the kernel and install them in all 5735 * the pmaps on the system. 
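 *
 * A minimal usage sketch (the real call sites live in UVM, not here;
 * 'kva_needed' and 'kva_limit' are hypothetical names):
 *
 *	if (kva_needed > pmap_maxkvaddr)
 *		kva_limit = pmap_growkernel(kva_needed);
 *
 * The return value is the new pmap_maxkvaddr, rounded up to a PDE
 * boundary.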
5736 */ 5737 vaddr_t 5738 pmap_growkernel(vaddr_t maxkvaddr) 5739 { 5740 struct pmap *kpm = pmap_kernel(); 5741 struct pmap *cpm; 5742 #if !defined(XENPV) || !defined(__x86_64__) 5743 struct pmap *pm; 5744 long old; 5745 #endif 5746 int s, i; 5747 long needed_kptp[PTP_LEVELS], target_nptp; 5748 bool invalidate = false; 5749 5750 s = splvm(); /* to be safe */ 5751 mutex_enter(&kpm->pm_lock); 5752 5753 if (maxkvaddr <= pmap_maxkvaddr) { 5754 mutex_exit(&kpm->pm_lock); 5755 splx(s); 5756 return pmap_maxkvaddr; 5757 } 5758 5759 maxkvaddr = x86_round_pdr(maxkvaddr); 5760 #if !defined(XENPV) || !defined(__x86_64__) 5761 old = nkptp[PTP_LEVELS - 1]; 5762 #endif 5763 5764 /* Initialize needed_kptp. */ 5765 for (i = PTP_LEVELS - 1; i >= 1; i--) { 5766 target_nptp = pl_i_roundup(maxkvaddr, i + 1) - 5767 pl_i_roundup(VM_MIN_KERNEL_ADDRESS, i + 1); 5768 5769 if (target_nptp > nkptpmax[i]) 5770 panic("out of KVA space"); 5771 KASSERT(target_nptp >= nkptp[i]); 5772 needed_kptp[i] = target_nptp - nkptp[i]; 5773 } 5774 5775 #ifdef XENPV 5776 /* only pmap_kernel(), or the per-cpu map, has kernel entries */ 5777 cpm = kpm; 5778 #else 5779 /* Get the current pmap */ 5780 if (__predict_true(cpu_info_primary.ci_flags & CPUF_PRESENT)) { 5781 cpm = curcpu()->ci_pmap; 5782 } else { 5783 cpm = kpm; 5784 } 5785 #endif 5786 5787 kasan_shadow_map((void *)pmap_maxkvaddr, 5788 (size_t)(maxkvaddr - pmap_maxkvaddr)); 5789 kmsan_shadow_map((void *)pmap_maxkvaddr, 5790 (size_t)(maxkvaddr - pmap_maxkvaddr)); 5791 5792 pmap_alloc_level(cpm, pmap_maxkvaddr, needed_kptp); 5793 5794 /* 5795 * If the number of top level entries changed, update all pmaps. 5796 */ 5797 if (needed_kptp[PTP_LEVELS - 1] != 0) { 5798 #ifdef XENPV 5799 #ifdef __x86_64__ 5800 /* nothing, kernel entries are never entered in user pmap */ 5801 #else 5802 int pdkidx; 5803 5804 mutex_enter(&pmaps_lock); 5805 LIST_FOREACH(pm, &pmaps, pm_list) { 5806 for (pdkidx = PDIR_SLOT_KERN + old; 5807 pdkidx < PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1]; 5808 pdkidx++) { 5809 pmap_pte_set(&pm->pm_pdir[pdkidx], 5810 kpm->pm_pdir[pdkidx]); 5811 } 5812 pmap_pte_flush(); 5813 } 5814 mutex_exit(&pmaps_lock); 5815 #endif /* __x86_64__ */ 5816 #else /* XENPV */ 5817 size_t newpdes; 5818 newpdes = nkptp[PTP_LEVELS - 1] - old; 5819 if (cpm != kpm) { 5820 memcpy(&kpm->pm_pdir[PDIR_SLOT_KERN + old], 5821 &cpm->pm_pdir[PDIR_SLOT_KERN + old], 5822 newpdes * sizeof(pd_entry_t)); 5823 } 5824 5825 mutex_enter(&pmaps_lock); 5826 LIST_FOREACH(pm, &pmaps, pm_list) { 5827 if (__predict_false(pm->pm_enter != NULL)) { 5828 /* 5829 * Not a native pmap, the kernel is not mapped, 5830 * so nothing to synchronize. 5831 */ 5832 continue; 5833 } 5834 memcpy(&pm->pm_pdir[PDIR_SLOT_KERN + old], 5835 &kpm->pm_pdir[PDIR_SLOT_KERN + old], 5836 newpdes * sizeof(pd_entry_t)); 5837 } 5838 mutex_exit(&pmaps_lock); 5839 #endif 5840 invalidate = true; 5841 } 5842 pmap_maxkvaddr = maxkvaddr; 5843 mutex_exit(&kpm->pm_lock); 5844 splx(s); 5845 5846 if (invalidate && pmap_initialized) { 5847 /* Invalidate the pmap cache. 
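		 * Cached, already-constructed pmaps carry a copy of the
		 * kernel's top-level PDEs taken at construction time, so
		 * they must be thrown away and rebuilt now that more
		 * kernel entries exist.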
*/ 5848 pool_cache_invalidate(&pmap_cache); 5849 } 5850 5851 return maxkvaddr; 5852 } 5853 5854 #ifdef DEBUG 5855 void pmap_dump(struct pmap *, vaddr_t, vaddr_t); 5856 5857 /* 5858 * pmap_dump: dump all the mappings from a pmap 5859 * 5860 * => caller should not be holding any pmap locks 5861 */ 5862 void 5863 pmap_dump(struct pmap *pmap, vaddr_t sva, vaddr_t eva) 5864 { 5865 pt_entry_t *ptes, *pte; 5866 pd_entry_t * const *pdes; 5867 struct pmap *pmap2; 5868 vaddr_t blkendva; 5869 int lvl; 5870 5871 /* 5872 * if end is out of range truncate. 5873 * if (end == start) update to max. 5874 */ 5875 5876 if (eva > VM_MAXUSER_ADDRESS || eva <= sva) 5877 eva = VM_MAXUSER_ADDRESS; 5878 5879 mutex_enter(&pmap->pm_lock); 5880 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); 5881 5882 /* 5883 * dumping a range of pages: we dump in PTP sized blocks (4MB) 5884 */ 5885 5886 for (/* null */ ; sva < eva ; sva = blkendva) { 5887 5888 /* determine range of block */ 5889 blkendva = x86_round_pdr(sva+1); 5890 if (blkendva > eva) 5891 blkendva = eva; 5892 5893 /* valid block? */ 5894 if (!pmap_pdes_valid(sva, pdes, NULL, &lvl)) 5895 continue; 5896 KASSERT(lvl == 1); 5897 5898 pte = &ptes[pl1_i(sva)]; 5899 for (/* null */; sva < blkendva ; sva += PAGE_SIZE, pte++) { 5900 if (!pmap_valid_entry(*pte)) 5901 continue; 5902 printf("va %#" PRIxVADDR " -> pa %#" PRIxPADDR 5903 " (pte=%#" PRIxPADDR ")\n", 5904 sva, (paddr_t)pmap_pte2pa(*pte), (paddr_t)*pte); 5905 } 5906 } 5907 pmap_unmap_ptes(pmap, pmap2); 5908 mutex_exit(&pmap->pm_lock); 5909 } 5910 #endif 5911 5912 /* 5913 * pmap_update: process deferred invalidations and frees. 5914 */ 5915 void 5916 pmap_update(struct pmap *pmap) 5917 { 5918 struct pmap_page *pp; 5919 struct vm_page *ptp; 5920 5921 /* 5922 * Initiate any pending TLB shootdowns. Wait for them to 5923 * complete before returning control to the caller. 5924 */ 5925 kpreempt_disable(); 5926 pmap_tlb_shootnow(); 5927 kpreempt_enable(); 5928 5929 /* 5930 * Now that shootdowns are complete, process deferred frees. This 5931 * is an unlocked check, but is safe as we're only interested in 5932 * work done in this LWP - we won't get a false negative. 5933 */ 5934 if (atomic_load_relaxed(&pmap->pm_gc_ptp.lh_first) == NULL) { 5935 return; 5936 } 5937 5938 mutex_enter(&pmap->pm_lock); 5939 while ((ptp = LIST_FIRST(&pmap->pm_gc_ptp)) != NULL) { 5940 KASSERT(ptp->wire_count == 0); 5941 KASSERT(ptp->uanon == NULL); 5942 LIST_REMOVE(ptp, mdpage.mp_pp.pp_link); 5943 pp = VM_PAGE_TO_PP(ptp); 5944 LIST_INIT(&pp->pp_pvlist); 5945 pp->pp_attrs = 0; 5946 pp->pp_pte.pte_ptp = NULL; 5947 pp->pp_pte.pte_va = 0; 5948 PMAP_CHECK_PP(VM_PAGE_TO_PP(ptp)); 5949 5950 /* 5951 * XXX Hack to avoid extra locking, and lock 5952 * assertions in uvm_pagefree(). Despite uobject 5953 * being set, this isn't a managed page. 
5954 */ 5955 PMAP_DUMMY_LOCK(pmap); 5956 uvm_pagerealloc(ptp, NULL, 0); 5957 PMAP_DUMMY_UNLOCK(pmap); 5958 uvm_pagefree(ptp); 5959 } 5960 mutex_exit(&pmap->pm_lock); 5961 } 5962 5963 #if PTP_LEVELS > 4 5964 #error "Unsupported number of page table mappings" 5965 #endif 5966 5967 paddr_t 5968 pmap_init_tmp_pgtbl(paddr_t pg) 5969 { 5970 static bool maps_loaded; 5971 static const paddr_t x86_tmp_pml_paddr[] = { 5972 4 * PAGE_SIZE, /* L1 */ 5973 5 * PAGE_SIZE, /* L2 */ 5974 6 * PAGE_SIZE, /* L3 */ 5975 7 * PAGE_SIZE /* L4 */ 5976 }; 5977 static vaddr_t x86_tmp_pml_vaddr[] = { 0, 0, 0, 0 }; 5978 5979 pd_entry_t *tmp_pml, *kernel_pml; 5980 5981 int level; 5982 5983 if (!maps_loaded) { 5984 for (level = 0; level < PTP_LEVELS; ++level) { 5985 x86_tmp_pml_vaddr[level] = 5986 uvm_km_alloc(kernel_map, PAGE_SIZE, 0, 5987 UVM_KMF_VAONLY); 5988 5989 if (x86_tmp_pml_vaddr[level] == 0) 5990 panic("mapping of real mode PML failed\n"); 5991 pmap_kenter_pa(x86_tmp_pml_vaddr[level], 5992 x86_tmp_pml_paddr[level], 5993 VM_PROT_READ | VM_PROT_WRITE, 0); 5994 } 5995 pmap_update(pmap_kernel()); 5996 maps_loaded = true; 5997 } 5998 5999 /* Zero levels 1-3 */ 6000 for (level = 0; level < PTP_LEVELS - 1; ++level) { 6001 tmp_pml = (void *)x86_tmp_pml_vaddr[level]; 6002 memset(PAGE_ALIGNED(tmp_pml), 0, PAGE_SIZE); 6003 } 6004 6005 /* Copy PML4 */ 6006 kernel_pml = pmap_kernel()->pm_pdir; 6007 tmp_pml = (void *)x86_tmp_pml_vaddr[PTP_LEVELS - 1]; 6008 memcpy(PAGE_ALIGNED(tmp_pml), PAGE_ALIGNED(kernel_pml), PAGE_SIZE); 6009 6010 #ifdef PAE 6011 /* 6012 * Use the last 4 entries of the L2 page as L3 PD entries. These 6013 * last entries are unlikely to be used for temporary mappings. 6014 * 508: maps 0->1GB (userland) 6015 * 509: unused 6016 * 510: unused 6017 * 511: maps 3->4GB (kernel) 6018 */ 6019 tmp_pml[508] = x86_tmp_pml_paddr[PTP_LEVELS - 1] | PTE_P; 6020 tmp_pml[509] = 0; 6021 tmp_pml[510] = 0; 6022 tmp_pml[511] = pmap_pdirpa(pmap_kernel(), PDIR_SLOT_KERN) | PTE_P; 6023 #endif 6024 6025 for (level = PTP_LEVELS - 1; level > 0; --level) { 6026 tmp_pml = (void *)x86_tmp_pml_vaddr[level]; 6027 6028 tmp_pml[pl_i(pg, level + 1)] = 6029 (x86_tmp_pml_paddr[level - 1] & PTE_FRAME) | PTE_W | PTE_P; 6030 } 6031 6032 tmp_pml = (void *)x86_tmp_pml_vaddr[0]; 6033 tmp_pml[pl_i(pg, 1)] = (pg & PTE_FRAME) | PTE_W | PTE_P; 6034 6035 #ifdef PAE 6036 /* Return the PA of the L3 page (entry 508 of the L2 page) */ 6037 return x86_tmp_pml_paddr[PTP_LEVELS - 1] + 508 * sizeof(pd_entry_t); 6038 #endif 6039 6040 return x86_tmp_pml_paddr[PTP_LEVELS - 1]; 6041 } 6042 6043 u_int 6044 x86_mmap_flags(paddr_t mdpgno) 6045 { 6046 u_int nflag = (mdpgno >> X86_MMAP_FLAG_SHIFT) & X86_MMAP_FLAG_MASK; 6047 u_int pflag = 0; 6048 6049 if (nflag & X86_MMAP_FLAG_PREFETCH) 6050 pflag |= PMAP_WRITE_COMBINE; 6051 6052 return pflag; 6053 } 6054 6055 #if defined(__HAVE_DIRECT_MAP) && defined(__x86_64__) && !defined(XENPV) 6056 6057 /* 6058 * ----------------------------------------------------------------------------- 6059 * ***************************************************************************** 6060 * ***************************************************************************** 6061 * ***************************************************************************** 6062 * ***************************************************************************** 6063 * **************** HERE BEGINS THE EPT CODE, USED BY INTEL-VMX **************** 6064 * ***************************************************************************** 6065 * 
***************************************************************************** 6066 * ***************************************************************************** 6067 * ***************************************************************************** 6068 * ----------------------------------------------------------------------------- 6069 * 6070 * These functions are invoked as callbacks from the code above. Contrary to 6071 * native, EPT does not have a recursive slot; therefore, it is not possible 6072 * to call pmap_map_ptes(). Instead, we use the direct map and walk down the 6073 * tree manually. 6074 * 6075 * Apart from that, the logic is mostly the same as native. Once a pmap has 6076 * been created, NVMM calls pmap_ept_transform() to make it an EPT pmap. 6077 * After that we're good, and the callbacks will handle the translations 6078 * for us. 6079 * 6080 * ----------------------------------------------------------------------------- 6081 */ 6082 6083 /* Hardware bits. */ 6084 #define EPT_R __BIT(0) /* read */ 6085 #define EPT_W __BIT(1) /* write */ 6086 #define EPT_X __BIT(2) /* execute */ 6087 #define EPT_T __BITS(5,3) /* type */ 6088 #define TYPE_UC 0 6089 #define TYPE_WC 1 6090 #define TYPE_WT 4 6091 #define TYPE_WP 5 6092 #define TYPE_WB 6 6093 #define EPT_NOPAT __BIT(6) 6094 #define EPT_L __BIT(7) /* large */ 6095 #define EPT_A __BIT(8) /* accessed */ 6096 #define EPT_D __BIT(9) /* dirty */ 6097 /* Software bits. */ 6098 #define EPT_PVLIST __BIT(60) 6099 #define EPT_WIRED __BIT(61) 6100 6101 #define pmap_ept_valid_entry(pte) (pte & EPT_R) 6102 6103 bool pmap_ept_has_ad __read_mostly; 6104 6105 static inline void 6106 pmap_ept_stats_update_bypte(struct pmap *pmap, pt_entry_t npte, pt_entry_t opte) 6107 { 6108 int resid_diff = ((npte & EPT_R) ? 1 : 0) - ((opte & EPT_R) ? 1 : 0); 6109 int wired_diff = ((npte & EPT_WIRED) ? 1 : 0) - ((opte & EPT_WIRED) ? 1 : 0); 6110 6111 KASSERT((npte & (EPT_R | EPT_WIRED)) != EPT_WIRED); 6112 KASSERT((opte & (EPT_R | EPT_WIRED)) != EPT_WIRED); 6113 6114 pmap_stats_update(pmap, resid_diff, wired_diff); 6115 } 6116 6117 static pt_entry_t 6118 pmap_ept_type(u_int flags) 6119 { 6120 u_int cacheflags = (flags & PMAP_CACHE_MASK); 6121 pt_entry_t ret; 6122 6123 switch (cacheflags) { 6124 case PMAP_NOCACHE: 6125 case PMAP_NOCACHE_OVR: 6126 ret = __SHIFTIN(TYPE_UC, EPT_T); 6127 break; 6128 case PMAP_WRITE_COMBINE: 6129 ret = __SHIFTIN(TYPE_WC, EPT_T); 6130 break; 6131 case PMAP_WRITE_BACK: 6132 default: 6133 ret = __SHIFTIN(TYPE_WB, EPT_T); 6134 break; 6135 } 6136 6137 ret |= EPT_NOPAT; 6138 return ret; 6139 } 6140 6141 static inline pt_entry_t 6142 pmap_ept_prot(vm_prot_t prot) 6143 { 6144 pt_entry_t res = 0; 6145 6146 if (prot & VM_PROT_READ) 6147 res |= EPT_R; 6148 if (prot & VM_PROT_WRITE) 6149 res |= EPT_W; 6150 if (prot & VM_PROT_EXECUTE) 6151 res |= EPT_X; 6152 6153 return res; 6154 } 6155 6156 static inline uint8_t 6157 pmap_ept_to_pp_attrs(pt_entry_t ept) 6158 { 6159 uint8_t ret = 0; 6160 if (pmap_ept_has_ad) { 6161 if (ept & EPT_D) 6162 ret |= PP_ATTRS_D; 6163 if (ept & EPT_A) 6164 ret |= PP_ATTRS_A; 6165 } else { 6166 ret |= (PP_ATTRS_D|PP_ATTRS_A); 6167 } 6168 if (ept & EPT_W) 6169 ret |= PP_ATTRS_W; 6170 return ret; 6171 } 6172 6173 static inline pt_entry_t 6174 pmap_pp_attrs_to_ept(uint8_t attrs) 6175 { 6176 pt_entry_t ept = 0; 6177 if (attrs & PP_ATTRS_D) 6178 ept |= EPT_D; 6179 if (attrs & PP_ATTRS_A) 6180 ept |= EPT_A; 6181 if (attrs & PP_ATTRS_W) 6182 ept |= EPT_W; 6183 return ept; 6184 } 6185 6186 /* 6187 * Helper for pmap_ept_free_ptp. 
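 * Walks down from the top level through the direct map (EPT has no
 * recursive slot) and records, for each level above the L1, the address
 * of the PDE that maps va: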
6188 * tree[0] = &L2[L2idx] 6189 * tree[1] = &L3[L3idx] 6190 * tree[2] = &L4[L4idx] 6191 */ 6192 static void 6193 pmap_ept_get_tree(struct pmap *pmap, vaddr_t va, pd_entry_t **tree) 6194 { 6195 pt_entry_t *pteva; 6196 paddr_t ptepa; 6197 int i, index; 6198 6199 ptepa = pmap->pm_pdirpa[0]; 6200 for (i = PTP_LEVELS; i > 1; i--) { 6201 index = pl_pi(va, i); 6202 pteva = (pt_entry_t *)PMAP_DIRECT_MAP(ptepa); 6203 KASSERT(pmap_ept_valid_entry(pteva[index])); 6204 tree[i - 2] = &pteva[index]; 6205 ptepa = pmap_pte2pa(pteva[index]); 6206 } 6207 } 6208 6209 static void 6210 pmap_ept_free_ptp(struct pmap *pmap, struct vm_page *ptp, vaddr_t va) 6211 { 6212 pd_entry_t *tree[3]; 6213 int level; 6214 6215 KASSERT(pmap != pmap_kernel()); 6216 KASSERT(mutex_owned(&pmap->pm_lock)); 6217 KASSERT(kpreempt_disabled()); 6218 6219 pmap_ept_get_tree(pmap, va, tree); 6220 6221 level = 1; 6222 do { 6223 (void)pmap_pte_testset(tree[level - 1], 0); 6224 6225 pmap_freepage(pmap, ptp, level); 6226 if (level < PTP_LEVELS - 1) { 6227 ptp = pmap_find_ptp(pmap, va, level + 1); 6228 ptp->wire_count--; 6229 if (ptp->wire_count > 1) 6230 break; 6231 } 6232 } while (++level < PTP_LEVELS); 6233 pmap_pte_flush(); 6234 } 6235 6236 /* Allocate L4->L3->L2. Return L2. */ 6237 static void 6238 pmap_ept_install_ptp(struct pmap *pmap, struct pmap_ptparray *pt, vaddr_t va) 6239 { 6240 struct vm_page *ptp; 6241 unsigned long index; 6242 pd_entry_t *pteva; 6243 paddr_t ptepa; 6244 int i; 6245 6246 KASSERT(pmap != pmap_kernel()); 6247 KASSERT(mutex_owned(&pmap->pm_lock)); 6248 KASSERT(kpreempt_disabled()); 6249 6250 /* 6251 * Now that we have all the pages looked up or allocated, 6252 * loop through again installing any new ones into the tree. 6253 */ 6254 ptepa = pmap->pm_pdirpa[0]; 6255 for (i = PTP_LEVELS; i > 1; i--) { 6256 index = pl_pi(va, i); 6257 pteva = (pt_entry_t *)PMAP_DIRECT_MAP(ptepa); 6258 6259 if (pmap_ept_valid_entry(pteva[index])) { 6260 KASSERT(!pt->alloced[i]); 6261 ptepa = pmap_pte2pa(pteva[index]); 6262 continue; 6263 } 6264 6265 ptp = pt->pg[i]; 6266 ptp->flags &= ~PG_BUSY; /* never busy */ 6267 ptp->wire_count = 1; 6268 pmap->pm_ptphint[i - 2] = ptp; 6269 ptepa = VM_PAGE_TO_PHYS(ptp); 6270 pmap_pte_set(&pteva[index], ptepa | EPT_R | EPT_W | EPT_X); 6271 6272 pmap_pte_flush(); 6273 pmap_stats_update(pmap, 1, 0); 6274 6275 /* 6276 * If we're not in the top level, increase the 6277 * wire count of the parent page. 
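 * The top level needs no such bump: its entries live in the pmap's
 * own root page (pm_pdirpa[0]), which is not tracked as a PTP.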
6278 */ 6279 if (i < PTP_LEVELS) { 6280 pt->pg[i + 1]->wire_count++; 6281 } 6282 } 6283 } 6284 6285 static int 6286 pmap_ept_enter(struct pmap *pmap, vaddr_t va, paddr_t pa, vm_prot_t prot, 6287 u_int flags) 6288 { 6289 pt_entry_t *ptes, opte, npte; 6290 pt_entry_t *ptep; 6291 struct vm_page *ptp; 6292 struct vm_page *new_pg, *old_pg; 6293 struct pmap_page *new_pp, *old_pp; 6294 struct pv_entry *old_pve, *new_pve; 6295 bool wired = (flags & PMAP_WIRED) != 0; 6296 bool accessed; 6297 struct pmap_ptparray pt; 6298 int error; 6299 bool getptp, samepage, new_embedded; 6300 rb_tree_t *tree; 6301 6302 KASSERT(pmap_initialized); 6303 KASSERT(va < VM_MAXUSER_ADDRESS); 6304 6305 npte = pa | pmap_ept_prot(prot) | pmap_ept_type(flags); 6306 6307 if (wired) 6308 npte |= EPT_WIRED; 6309 if (flags & VM_PROT_ALL) { 6310 npte |= EPT_A; 6311 if (flags & VM_PROT_WRITE) { 6312 KASSERT((npte & EPT_W) != 0); 6313 npte |= EPT_D; 6314 } 6315 } 6316 6317 new_pg = PHYS_TO_VM_PAGE(pa); 6318 if (new_pg != NULL) { 6319 /* This is a managed page */ 6320 npte |= EPT_PVLIST; 6321 new_pp = VM_PAGE_TO_PP(new_pg); 6322 } else if ((new_pp = pmap_pv_tracked(pa)) != NULL) { 6323 /* This is an unmanaged pv-tracked page */ 6324 npte |= EPT_PVLIST; 6325 } else { 6326 new_pp = NULL; 6327 } 6328 6329 /* Begin by locking the pmap. */ 6330 mutex_enter(&pmap->pm_lock); 6331 6332 /* Look up the PTP. Allocate if none present. */ 6333 ptp = NULL; 6334 getptp = false; 6335 if (pmap != pmap_kernel()) { 6336 ptp = pmap_find_ptp(pmap, va, 1); 6337 if (ptp == NULL) { 6338 getptp = true; 6339 error = pmap_get_ptp(pmap, &pt, va, flags, &ptp); 6340 if (error != 0) { 6341 if (flags & PMAP_CANFAIL) { 6342 mutex_exit(&pmap->pm_lock); 6343 return error; 6344 } 6345 panic("%s: get ptp failed, error=%d", __func__, 6346 error); 6347 } 6348 } 6349 tree = &VM_PAGE_TO_PP(ptp)->pp_rb; 6350 } else { 6351 /* Embedded PV entries rely on this. */ 6352 KASSERT(va != 0); 6353 tree = &pmap_kernel_rb; 6354 } 6355 6356 /* 6357 * Look up the old PV entry at this VA (if any), and insert a new PV 6358 * entry if required for the new mapping. Temporarily track the old 6359 * and new mappings concurrently. Only after the old mapping is 6360 * evicted from the pmap will we remove its PV entry. Otherwise, 6361 * our picture of modified/accessed state for either page could get 6362 * out of sync (we need any P->V operation for either page to stall 6363 * on pmap->pm_lock until done here). 6364 */ 6365 new_pve = NULL; 6366 old_pve = NULL; 6367 samepage = false; 6368 new_embedded = false; 6369 6370 if (new_pp != NULL) { 6371 error = pmap_enter_pv(pmap, new_pp, ptp, va, &new_pve, 6372 &old_pve, &samepage, &new_embedded, tree); 6373 6374 /* 6375 * If a new pv_entry was needed and none was available, we 6376 * can go no further. 6377 */ 6378 if (error != 0) { 6379 if (flags & PMAP_CANFAIL) { 6380 if (getptp) { 6381 pmap_unget_ptp(pmap, &pt); 6382 } 6383 mutex_exit(&pmap->pm_lock); 6384 return error; 6385 } 6386 panic("%s: alloc pve failed", __func__); 6387 } 6388 } else { 6389 old_pve = pmap_treelookup_pv(pmap, ptp, tree, va); 6390 } 6391 6392 /* Map PTEs into address space. */ 6393 kpreempt_disable(); 6394 6395 /* Install any newly allocated PTPs. */ 6396 if (getptp) { 6397 pmap_ept_install_ptp(pmap, &pt, va); 6398 } 6399 6400 /* Check if there is an existing mapping. 
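 * The L1 PTEs are reached through the direct map here, since EPT
 * provides no recursive slot to map them with pmap_map_ptes().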
*/ 6401 ptes = (pt_entry_t *)PMAP_DIRECT_MAP(VM_PAGE_TO_PHYS(ptp)); 6402 ptep = &ptes[pl1_pi(va)]; 6403 opte = *ptep; 6404 bool have_oldpa = pmap_ept_valid_entry(opte); 6405 paddr_t oldpa = pmap_pte2pa(opte); 6406 6407 /* 6408 * Update the pte. 6409 */ 6410 do { 6411 opte = *ptep; 6412 6413 /* 6414 * if the same page, inherit PTE_A and PTE_D. 6415 */ 6416 if (((opte ^ npte) & (PTE_FRAME | EPT_R)) == 0) { 6417 npte |= opte & (EPT_A | EPT_D); 6418 } 6419 } while (pmap_pte_cas(ptep, opte, npte) != opte); 6420 6421 /* 6422 * Done with the PTEs: they can now be unmapped. 6423 */ 6424 kpreempt_enable(); 6425 6426 /* 6427 * Update statistics and PTP's reference count. 6428 */ 6429 pmap_ept_stats_update_bypte(pmap, npte, opte); 6430 if (ptp != NULL) { 6431 if (!have_oldpa) { 6432 ptp->wire_count++; 6433 } 6434 /* Remember minimum VA in PTP. */ 6435 pmap_ptp_range_set(ptp, va); 6436 } 6437 KASSERT(ptp == NULL || ptp->wire_count > 1); 6438 6439 /* 6440 * If the same page, we can skip pv_entry handling. 6441 */ 6442 if (((opte ^ npte) & (PTE_FRAME | EPT_R)) == 0) { 6443 KASSERT(((opte ^ npte) & EPT_PVLIST) == 0); 6444 if ((npte & EPT_PVLIST) != 0) { 6445 KASSERT(samepage); 6446 pmap_check_pv(pmap, ptp, new_pp, va, true); 6447 } 6448 goto same_pa; 6449 } else if ((npte & EPT_PVLIST) != 0) { 6450 KASSERT(!samepage); 6451 } 6452 6453 /* 6454 * If old page is pv-tracked, remove pv_entry from its list. 6455 */ 6456 if ((~opte & (EPT_R | EPT_PVLIST)) == 0) { 6457 if ((old_pg = PHYS_TO_VM_PAGE(oldpa)) != NULL) { 6458 old_pp = VM_PAGE_TO_PP(old_pg); 6459 } else if ((old_pp = pmap_pv_tracked(oldpa)) == NULL) { 6460 panic("%s: EPT_PVLIST with pv-untracked page" 6461 " va = %#"PRIxVADDR 6462 " pa = %#" PRIxPADDR " (%#" PRIxPADDR ")", 6463 __func__, va, oldpa, atop(pa)); 6464 } 6465 6466 pmap_remove_pv(pmap, old_pp, ptp, va, old_pve, 6467 pmap_ept_to_pp_attrs(opte)); 6468 } else { 6469 KASSERT(old_pve == NULL); 6470 KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == NULL); 6471 } 6472 6473 /* 6474 * If new page is dynamically PV tracked, insert to tree. 6475 */ 6476 if (new_pve != NULL) { 6477 KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == NULL); 6478 old_pve = rb_tree_insert_node(tree, new_pve); 6479 KASSERT(old_pve == new_pve); 6480 pmap_check_pv(pmap, ptp, new_pp, va, true); 6481 } 6482 6483 same_pa: 6484 /* 6485 * shootdown tlb if necessary. 6486 */ 6487 6488 if (pmap_ept_has_ad) { 6489 accessed = (~opte & (EPT_R | EPT_A)) == 0; 6490 } else { 6491 accessed = (opte & EPT_R) != 0; 6492 } 6493 if (accessed && ((opte ^ npte) & (PTE_FRAME | EPT_W)) != 0) { 6494 pmap_tlb_shootdown(pmap, va, 0, TLBSHOOT_ENTER); 6495 } 6496 pmap_drain_pv(pmap); 6497 mutex_exit(&pmap->pm_lock); 6498 return 0; 6499 } 6500 6501 /* Pay close attention, this returns L2. 
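 * On success the return value is 0 and *lastpde holds the L2 entry
 * that maps the L1 page for va; otherwise the return value is the
 * level at which an invalid entry was found.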
*/ 6502 static int 6503 pmap_ept_pdes_invalid(struct pmap *pmap, vaddr_t va, pd_entry_t *lastpde) 6504 { 6505 pt_entry_t *pteva; 6506 paddr_t ptepa; 6507 int i, index; 6508 6509 KASSERT(mutex_owned(&pmap->pm_lock)); 6510 6511 ptepa = pmap->pm_pdirpa[0]; 6512 for (i = PTP_LEVELS; i > 1; i--) { 6513 pteva = (pt_entry_t *)PMAP_DIRECT_MAP(ptepa); 6514 index = pl_pi(va, i); 6515 if (!pmap_ept_valid_entry(pteva[index])) 6516 return i; 6517 ptepa = pmap_pte2pa(pteva[index]); 6518 } 6519 if (lastpde != NULL) { 6520 *lastpde = pteva[index]; 6521 } 6522 6523 return 0; 6524 } 6525 6526 static bool 6527 pmap_ept_extract(struct pmap *pmap, vaddr_t va, paddr_t *pap) 6528 { 6529 pt_entry_t *ptes, pte; 6530 pd_entry_t pde; 6531 paddr_t ptppa, pa; 6532 bool rv; 6533 6534 #ifdef __HAVE_DIRECT_MAP 6535 if (va >= PMAP_DIRECT_BASE && va < PMAP_DIRECT_END) { 6536 if (pap != NULL) { 6537 *pap = PMAP_DIRECT_UNMAP(va); 6538 } 6539 return true; 6540 } 6541 #endif 6542 6543 rv = false; 6544 pa = 0; 6545 6546 mutex_enter(&pmap->pm_lock); 6547 kpreempt_disable(); 6548 6549 if (!pmap_ept_pdes_invalid(pmap, va, &pde)) { 6550 ptppa = pmap_pte2pa(pde); 6551 ptes = (pt_entry_t *)PMAP_DIRECT_MAP(ptppa); 6552 pte = ptes[pl1_pi(va)]; 6553 if (__predict_true((pte & EPT_R) != 0)) { 6554 pa = pmap_pte2pa(pte) | (va & (NBPD_L1 - 1)); 6555 rv = true; 6556 } 6557 } 6558 6559 kpreempt_enable(); 6560 mutex_exit(&pmap->pm_lock); 6561 6562 if (pap != NULL) { 6563 *pap = pa; 6564 } 6565 return rv; 6566 } 6567 6568 static bool 6569 pmap_ept_remove_pte(struct pmap *pmap, struct vm_page *ptp, pt_entry_t *pte, 6570 vaddr_t va) 6571 { 6572 struct pv_entry *pve; 6573 struct vm_page *pg; 6574 struct pmap_page *pp; 6575 pt_entry_t opte; 6576 bool accessed; 6577 6578 KASSERT(pmap != pmap_kernel()); 6579 KASSERT(mutex_owned(&pmap->pm_lock)); 6580 KASSERT(kpreempt_disabled()); 6581 6582 if (!pmap_ept_valid_entry(*pte)) { 6583 /* VA not mapped. */ 6584 return false; 6585 } 6586 6587 /* Atomically save the old PTE and zap it. */ 6588 opte = pmap_pte_testset(pte, 0); 6589 if (!pmap_ept_valid_entry(opte)) { 6590 return false; 6591 } 6592 6593 pmap_ept_stats_update_bypte(pmap, 0, opte); 6594 6595 if (ptp) { 6596 /* 6597 * Dropping a PTE. Make sure that the PDE is flushed. 6598 */ 6599 ptp->wire_count--; 6600 if (ptp->wire_count <= 1) { 6601 opte |= EPT_A; 6602 } 6603 } 6604 6605 if (pmap_ept_has_ad) { 6606 accessed = (opte & EPT_A) != 0; 6607 } else { 6608 accessed = true; 6609 } 6610 if (accessed) { 6611 pmap_tlb_shootdown(pmap, va, 0, TLBSHOOT_REMOVE_PTE); 6612 } 6613 6614 /* 6615 * If we are not on a pv list - we are done. 6616 */ 6617 if ((opte & EPT_PVLIST) == 0) { 6618 KASSERTMSG((PHYS_TO_VM_PAGE(pmap_pte2pa(opte)) == NULL), 6619 "managed page without EPT_PVLIST for %#"PRIxVADDR, va); 6620 KASSERTMSG((pmap_pv_tracked(pmap_pte2pa(opte)) == NULL), 6621 "pv-tracked page without EPT_PVLIST for %#"PRIxVADDR, va); 6622 KASSERT(pmap_treelookup_pv(pmap, ptp, (ptp != NULL ? 6623 &VM_PAGE_TO_PP(ptp)->pp_rb : &pmap_kernel_rb), va) == NULL); 6624 return true; 6625 } 6626 6627 if ((pg = PHYS_TO_VM_PAGE(pmap_pte2pa(opte))) != NULL) { 6628 pp = VM_PAGE_TO_PP(pg); 6629 } else if ((pp = pmap_pv_tracked(pmap_pte2pa(opte))) == NULL) { 6630 paddr_t pa = pmap_pte2pa(opte); 6631 panic("%s: EPT_PVLIST with pv-untracked page" 6632 " va = %#"PRIxVADDR"pa = %#"PRIxPADDR" (%#"PRIxPADDR")", 6633 __func__, va, pa, atop(pa)); 6634 } 6635 6636 /* Sync R/M bits. 
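 * The referenced/modified (EPT_A/EPT_D) state of the old mapping is
 * folded into the page's attributes as its pv entry is removed.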
*/ 6637 pve = pmap_lookup_pv(pmap, ptp, pp, va); 6638 pmap_remove_pv(pmap, pp, ptp, va, pve, pmap_ept_to_pp_attrs(opte)); 6639 return true; 6640 } 6641 6642 static void 6643 pmap_ept_remove_ptes(struct pmap *pmap, struct vm_page *ptp, vaddr_t ptpva, 6644 vaddr_t startva, vaddr_t endva) 6645 { 6646 pt_entry_t *pte = (pt_entry_t *)ptpva; 6647 6648 KASSERT(pmap != pmap_kernel()); 6649 KASSERT(mutex_owned(&pmap->pm_lock)); 6650 KASSERT(kpreempt_disabled()); 6651 6652 /* 6653 * mappings are very often sparse, so clip the given range to the 6654 * range of PTEs that are known present in the PTP. 6655 */ 6656 pmap_ptp_range_clip(ptp, &startva, &pte); 6657 6658 /* 6659 * note that ptpva points to the PTE that maps startva. this may 6660 * or may not be the first PTE in the PTP. 6661 * 6662 * we loop through the PTP while there are still PTEs to look at 6663 * and the wire_count is greater than 1 (because we use the wire_count 6664 * to keep track of the number of real PTEs in the PTP). 6665 */ 6666 while (startva < endva && (ptp == NULL || ptp->wire_count > 1)) { 6667 (void)pmap_ept_remove_pte(pmap, ptp, pte, startva); 6668 startva += PAGE_SIZE; 6669 pte++; 6670 } 6671 } 6672 6673 static void 6674 pmap_ept_remove(struct pmap *pmap, vaddr_t sva, vaddr_t eva) 6675 { 6676 pt_entry_t *ptes; 6677 pd_entry_t pde; 6678 paddr_t ptppa; 6679 vaddr_t blkendva, va = sva; 6680 struct vm_page *ptp; 6681 6682 mutex_enter(&pmap->pm_lock); 6683 kpreempt_disable(); 6684 6685 for (/* null */ ; va < eva ; va = blkendva) { 6686 int lvl; 6687 6688 /* determine range of block */ 6689 blkendva = x86_round_pdr(va+1); 6690 if (blkendva > eva) 6691 blkendva = eva; 6692 6693 lvl = pmap_ept_pdes_invalid(pmap, va, &pde); 6694 if (lvl != 0) { 6695 /* Skip a range corresponding to an invalid pde. */ 6696 blkendva = (va & ptp_frames[lvl - 1]) + nbpd[lvl - 1]; 6697 continue; 6698 } 6699 6700 /* PA of the PTP */ 6701 ptppa = pmap_pte2pa(pde); 6702 6703 ptp = pmap_find_ptp(pmap, va, 1); 6704 KASSERTMSG(ptp != NULL, "%s: unmanaged PTP detected", 6705 __func__); 6706 6707 ptes = (pt_entry_t *)PMAP_DIRECT_MAP(ptppa); 6708 6709 pmap_ept_remove_ptes(pmap, ptp, (vaddr_t)&ptes[pl1_pi(va)], va, 6710 blkendva); 6711 6712 /* If PTP is no longer being used, free it. */ 6713 if (ptp && ptp->wire_count <= 1) { 6714 pmap_ept_free_ptp(pmap, ptp, va); 6715 } 6716 } 6717 6718 kpreempt_enable(); 6719 pmap_drain_pv(pmap); 6720 mutex_exit(&pmap->pm_lock); 6721 } 6722 6723 static int 6724 pmap_ept_sync_pv(struct vm_page *ptp, vaddr_t va, paddr_t pa, int clearbits, 6725 uint8_t *oattrs, pt_entry_t *optep) 6726 { 6727 struct pmap *pmap; 6728 pt_entry_t *ptep; 6729 pt_entry_t opte; 6730 pt_entry_t npte; 6731 pt_entry_t expect; 6732 bool need_shootdown; 6733 6734 expect = pmap_pa2pte(pa) | EPT_R; 6735 pmap = ptp_to_pmap(ptp); 6736 6737 if (clearbits != ~0) { 6738 KASSERT((clearbits & ~(PP_ATTRS_D|PP_ATTRS_A|PP_ATTRS_W)) == 0); 6739 clearbits = pmap_pp_attrs_to_ept(clearbits); 6740 } 6741 6742 ptep = pmap_map_pte(pmap, ptp, va); 6743 do { 6744 opte = *ptep; 6745 KASSERT((opte & (EPT_D | EPT_A)) != EPT_D); 6746 KASSERT((opte & (EPT_A | EPT_R)) != EPT_A); 6747 KASSERT(opte == 0 || (opte & EPT_R) != 0); 6748 if ((opte & (PTE_FRAME | EPT_R)) != expect) { 6749 /* 6750 * We lost a race with a V->P operation like 6751 * pmap_remove(). Wait for the competitor 6752 * reflecting pte bits into mp_attrs. 6753 */ 6754 pmap_unmap_pte(); 6755 return EAGAIN; 6756 } 6757 6758 /* 6759 * Check if there's anything to do on this PTE. 
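 * If none of the bits being cleared are set, the mapping is already in
 * the desired state: no PTE update and no shootdown are needed.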
6760 */ 6761 if ((opte & clearbits) == 0) { 6762 need_shootdown = false; 6763 break; 6764 } 6765 6766 /* 6767 * We need a shootdown if the PTE is cached (EPT_A) ... 6768 * ... Unless we are clearing only the EPT_W bit and 6769 * it isn't cached as RW (EPT_D). 6770 */ 6771 if (pmap_ept_has_ad) { 6772 need_shootdown = (opte & EPT_A) != 0 && 6773 !(clearbits == EPT_W && (opte & EPT_D) == 0); 6774 } else { 6775 need_shootdown = true; 6776 } 6777 6778 npte = opte & ~clearbits; 6779 6780 /* 6781 * If we need a shootdown anyway, clear EPT_A and EPT_D. 6782 */ 6783 if (need_shootdown) { 6784 npte &= ~(EPT_A | EPT_D); 6785 } 6786 KASSERT((npte & (EPT_D | EPT_A)) != EPT_D); 6787 KASSERT((npte & (EPT_A | EPT_R)) != EPT_A); 6788 KASSERT(npte == 0 || (opte & EPT_R) != 0); 6789 } while (pmap_pte_cas(ptep, opte, npte) != opte); 6790 6791 if (need_shootdown) { 6792 pmap_tlb_shootdown(pmap, va, 0, TLBSHOOT_SYNC_PV); 6793 } 6794 pmap_unmap_pte(); 6795 6796 *oattrs = pmap_ept_to_pp_attrs(opte); 6797 if (optep != NULL) 6798 *optep = opte; 6799 return 0; 6800 } 6801 6802 static void 6803 pmap_ept_pp_remove_ent(struct pmap *pmap, struct vm_page *ptp, pt_entry_t opte, 6804 vaddr_t va) 6805 { 6806 6807 KASSERT(mutex_owned(&pmap->pm_lock)); 6808 6809 pmap_ept_stats_update_bypte(pmap, 0, opte); 6810 ptp->wire_count--; 6811 if (ptp->wire_count <= 1) { 6812 pmap_ept_free_ptp(pmap, ptp, va); 6813 } 6814 } 6815 6816 static void 6817 pmap_ept_write_protect(struct pmap *pmap, vaddr_t sva, vaddr_t eva, vm_prot_t prot) 6818 { 6819 pt_entry_t bit_rem; 6820 pt_entry_t *ptes, *spte; 6821 pt_entry_t opte, npte; 6822 pd_entry_t pde; 6823 paddr_t ptppa; 6824 vaddr_t va; 6825 bool modified; 6826 6827 bit_rem = 0; 6828 if (!(prot & VM_PROT_WRITE)) 6829 bit_rem = EPT_W; 6830 6831 sva &= PTE_FRAME; 6832 eva &= PTE_FRAME; 6833 6834 /* Acquire pmap. */ 6835 mutex_enter(&pmap->pm_lock); 6836 kpreempt_disable(); 6837 6838 for (va = sva; va < eva; va += PAGE_SIZE) { 6839 if (pmap_ept_pdes_invalid(pmap, va, &pde)) { 6840 continue; 6841 } 6842 6843 ptppa = pmap_pte2pa(pde); 6844 ptes = (pt_entry_t *)PMAP_DIRECT_MAP(ptppa); 6845 spte = &ptes[pl1_pi(va)]; 6846 6847 do { 6848 opte = *spte; 6849 if (!pmap_ept_valid_entry(opte)) { 6850 goto next; 6851 } 6852 npte = (opte & ~bit_rem); 6853 } while (pmap_pte_cas(spte, opte, npte) != opte); 6854 6855 if (pmap_ept_has_ad) { 6856 modified = (opte & EPT_D) != 0; 6857 } else { 6858 modified = true; 6859 } 6860 if (modified) { 6861 vaddr_t tva = x86_ptob(spte - ptes); 6862 pmap_tlb_shootdown(pmap, tva, 0, 6863 TLBSHOOT_WRITE_PROTECT); 6864 } 6865 next:; 6866 } 6867 6868 kpreempt_enable(); 6869 mutex_exit(&pmap->pm_lock); 6870 } 6871 6872 static void 6873 pmap_ept_unwire(struct pmap *pmap, vaddr_t va) 6874 { 6875 pt_entry_t *ptes, *ptep, opte; 6876 pd_entry_t pde; 6877 paddr_t ptppa; 6878 6879 /* Acquire pmap. */ 6880 mutex_enter(&pmap->pm_lock); 6881 kpreempt_disable(); 6882 6883 if (pmap_ept_pdes_invalid(pmap, va, &pde)) { 6884 panic("%s: invalid PDE va=%#" PRIxVADDR, __func__, va); 6885 } 6886 6887 ptppa = pmap_pte2pa(pde); 6888 ptes = (pt_entry_t *)PMAP_DIRECT_MAP(ptppa); 6889 ptep = &ptes[pl1_pi(va)]; 6890 opte = *ptep; 6891 KASSERT(pmap_ept_valid_entry(opte)); 6892 6893 if (opte & EPT_WIRED) { 6894 pt_entry_t npte = opte & ~EPT_WIRED; 6895 6896 opte = pmap_pte_testset(ptep, npte); 6897 pmap_ept_stats_update_bypte(pmap, npte, opte); 6898 } else { 6899 printf("%s: wiring for pmap %p va %#" PRIxVADDR 6900 "did not change!\n", __func__, pmap, va); 6901 } 6902 6903 /* Release pmap. 
*/ 6904 kpreempt_enable(); 6905 mutex_exit(&pmap->pm_lock); 6906 } 6907 6908 /* -------------------------------------------------------------------------- */ 6909 6910 void 6911 pmap_ept_transform(struct pmap *pmap) 6912 { 6913 pmap->pm_enter = pmap_ept_enter; 6914 pmap->pm_extract = pmap_ept_extract; 6915 pmap->pm_remove = pmap_ept_remove; 6916 pmap->pm_sync_pv = pmap_ept_sync_pv; 6917 pmap->pm_pp_remove_ent = pmap_ept_pp_remove_ent; 6918 pmap->pm_write_protect = pmap_ept_write_protect; 6919 pmap->pm_unwire = pmap_ept_unwire; 6920 6921 memset(PAGE_ALIGNED(pmap->pm_pdir), 0, PAGE_SIZE); 6922 } 6923 6924 #endif /* __HAVE_DIRECT_MAP && __x86_64__ && !XENPV */ 6925
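
#if 0
/*
 * Illustrative sketch only (never compiled): the expected flow for a
 * consumer of the EPT hooks, per the comment at the top of the EPT
 * section.  The function name is hypothetical; the real call site
 * lives in NVMM and is not part of this file.
 */
static void
pmap_ept_example(void)
{
	struct pmap *pm;

	/* Create an ordinary pmap, then convert it to EPT format. */
	pm = pmap_create();
	pmap_ept_transform(pm);

	/*
	 * From here on, pmap_enter()/pmap_remove()/pmap_extract() on
	 * pm dispatch to the pmap_ept_* callbacks installed above.
	 */

	pmap_destroy(pm);
}
#endif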