1 /* $NetBSD: pmap.c,v 1.380 2020/03/22 00:16:16 ad Exp $ */ 2 3 /* 4 * Copyright (c) 2008, 2010, 2016, 2017, 2019, 2020 The NetBSD Foundation, Inc. 5 * All rights reserved. 6 * 7 * This code is derived from software contributed to The NetBSD Foundation 8 * by Andrew Doran, and by Maxime Villard. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 29 * POSSIBILITY OF SUCH DAMAGE. 30 */ 31 32 /* 33 * Copyright (c) 2007 Manuel Bouyer. 34 * 35 * Redistribution and use in source and binary forms, with or without 36 * modification, are permitted provided that the following conditions 37 * are met: 38 * 1. Redistributions of source code must retain the above copyright 39 * notice, this list of conditions and the following disclaimer. 40 * 2. Redistributions in binary form must reproduce the above copyright 41 * notice, this list of conditions and the following disclaimer in the 42 * documentation and/or other materials provided with the distribution. 43 * 44 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 45 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 46 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 47 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 48 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 49 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 50 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 51 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 52 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 53 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 54 */ 55 56 /* 57 * Copyright (c) 2006 Mathieu Ropert <mro@adviseo.fr> 58 * 59 * Permission to use, copy, modify, and distribute this software for any 60 * purpose with or without fee is hereby granted, provided that the above 61 * copyright notice and this permission notice appear in all copies. 62 * 63 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 64 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 65 * MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 66 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 67 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 68 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 69 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 70 */ 71 72 /* 73 * Copyright 2001 (c) Wasabi Systems, Inc. 74 * All rights reserved. 75 * 76 * Written by Frank van der Linden for Wasabi Systems, Inc. 77 * 78 * Redistribution and use in source and binary forms, with or without 79 * modification, are permitted provided that the following conditions 80 * are met: 81 * 1. Redistributions of source code must retain the above copyright 82 * notice, this list of conditions and the following disclaimer. 83 * 2. Redistributions in binary form must reproduce the above copyright 84 * notice, this list of conditions and the following disclaimer in the 85 * documentation and/or other materials provided with the distribution. 86 * 3. All advertising materials mentioning features or use of this software 87 * must display the following acknowledgement: 88 * This product includes software developed for the NetBSD Project by 89 * Wasabi Systems, Inc. 90 * 4. The name of Wasabi Systems, Inc. may not be used to endorse 91 * or promote products derived from this software without specific prior 92 * written permission. 93 * 94 * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND 95 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 96 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 97 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL WASABI SYSTEMS, INC 98 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 99 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 100 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 101 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 102 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 103 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 104 * POSSIBILITY OF SUCH DAMAGE. 105 */ 106 107 /* 108 * Copyright (c) 1997 Charles D. Cranor and Washington University. 109 * All rights reserved. 110 * 111 * Redistribution and use in source and binary forms, with or without 112 * modification, are permitted provided that the following conditions 113 * are met: 114 * 1. Redistributions of source code must retain the above copyright 115 * notice, this list of conditions and the following disclaimer. 116 * 2. Redistributions in binary form must reproduce the above copyright 117 * notice, this list of conditions and the following disclaimer in the 118 * documentation and/or other materials provided with the distribution. 119 * 120 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 121 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 122 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
123 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 124 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 125 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 126 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 127 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 128 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 129 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 130 */ 131 132 #include <sys/cdefs.h> 133 __KERNEL_RCSID(0, "$NetBSD: pmap.c,v 1.380 2020/03/22 00:16:16 ad Exp $"); 134 135 #include "opt_user_ldt.h" 136 #include "opt_lockdebug.h" 137 #include "opt_multiprocessor.h" 138 #include "opt_xen.h" 139 #include "opt_svs.h" 140 #include "opt_kaslr.h" 141 142 #define __MUTEX_PRIVATE /* for assertions */ 143 144 #include <sys/param.h> 145 #include <sys/systm.h> 146 #include <sys/proc.h> 147 #include <sys/pool.h> 148 #include <sys/kernel.h> 149 #include <sys/atomic.h> 150 #include <sys/cpu.h> 151 #include <sys/intr.h> 152 #include <sys/xcall.h> 153 #include <sys/kcore.h> 154 #include <sys/asan.h> 155 #include <sys/msan.h> 156 157 #include <uvm/uvm.h> 158 #include <uvm/pmap/pmap_pvt.h> 159 160 #include <dev/isa/isareg.h> 161 162 #include <machine/specialreg.h> 163 #include <machine/gdt.h> 164 #include <machine/isa_machdep.h> 165 #include <machine/cpuvar.h> 166 #include <machine/cputypes.h> 167 #include <machine/cpu_rng.h> 168 169 #include <x86/pmap.h> 170 #include <x86/pmap_pv.h> 171 172 #include <x86/i82489reg.h> 173 #include <x86/i82489var.h> 174 175 #ifdef XEN 176 #include <xen/include/public/xen.h> 177 #include <xen/hypervisor.h> 178 #endif 179 180 /* 181 * general info: 182 * 183 * - for an explanation of how the x86 MMU hardware works see 184 * the comments in <machine/pte.h>. 185 * 186 * - for an explanation of the general memory structure used by 187 * this pmap (including the recursive mapping), see the comments 188 * in <machine/pmap.h>. 189 * 190 * this file contains the code for the "pmap module." the module's 191 * job is to manage the hardware's virtual to physical address mappings. 192 * note that there are two levels of mapping in the VM system: 193 * 194 * [1] the upper layer of the VM system uses vm_map's and vm_map_entry's 195 * to map ranges of virtual address space to objects/files. for 196 * example, the vm_map may say: "map VA 0x1000 to 0x22000 read-only 197 * to the file /bin/ls starting at offset zero." note that 198 * the upper layer mapping is not concerned with how individual 199 * vm_pages are mapped. 200 * 201 * [2] the lower layer of the VM system (the pmap) maintains the mappings 202 * from virtual addresses. it is concerned with which vm_page is 203 * mapped where. for example, when you run /bin/ls and start 204 * at page 0x1000 the fault routine may lookup the correct page 205 * of the /bin/ls file and then ask the pmap layer to establish 206 * a mapping for it. 207 * 208 * note that information in the lower layer of the VM system can be 209 * thrown away since it can easily be reconstructed from the info 210 * in the upper layer. 
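 *
 * as an illustrative sketch only (not the exact uvm_fault() code, and
 * "map", "fault_va" and "pg" are stand-in names), the request that the
 * upper layer ends up making of the pmap layer for the /bin/ls example
 * above boils down to:
 *
 *	pmap_enter(vm_map_pmap(map), trunc_page(fault_va),
 *	    VM_PAGE_TO_PHYS(pg), VM_PROT_READ, PMAP_CANFAIL);
 *	pmap_update(vm_map_pmap(map));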
211 *
212 * data structures we use include:
213 *
214 * - struct pmap: describes the address space of one thread
215 * - struct pmap_page: describes one pv-tracked page, without
216 * necessarily a corresponding vm_page
217 * - struct pv_entry: describes one <PMAP,VA> mapping of a PA
218 * - pmap_page::pp_pvlist: there is one list per pv-tracked page of
219 * physical memory. the pp_pvlist points to a list of pv_entry
220 * structures which describe all the <PMAP,VA> pairs that this
221 * page is mapped in. this is critical for page based operations
222 * such as pmap_page_protect() [change protection on _all_ mappings
223 * of a page]
224 */
225
226 /*
227 * Locking
228 *
229 * We have the following locks that we must deal with, listed in the order
230 * that they are acquired:
231 *
232 * pg->uobject->vmobjlock, pg->uanon->an_lock
233 *
234 * For managed pages, these per-object locks are taken by the VM system
235 * before calling into the pmap module - either a read or write hold.
236 * The lock hold prevents pages from changing identity while the pmap is
237 * operating on them. For example, the same lock is held across a call
238 * to pmap_remove() and the following call to pmap_update(), so that a
239 * page does not gain a new identity while its TLB visibility is stale.
240 *
241 * pmap->pm_lock
242 *
243 * This lock protects the fields in the pmap structure including the
244 * non-kernel PDEs in the PDP, the PTEs, the PTPs, and connected data
245 * structures. For modifying unmanaged kernel PTEs it is not needed as
246 * kernel PDEs are never freed, and the kernel is expected to be self
247 * consistent (and the lock can't be taken for unmanaged kernel PTEs,
248 * because they can be modified from interrupt context).
249 *
250 * pmaps_lock
251 *
252 * This lock protects the list of active pmaps (headed by "pmaps").
253 * It's acquired when adding or removing pmaps or adjusting kernel PDEs.
254 *
255 * pp_lock
256 *
257 * This per-page lock protects PV entry lists and the embedded PV entry
258 * in each vm_page, allowing for concurrent operation on pages by
259 * different pmaps. This is a spin mutex at IPL_VM, because at the
260 * points it is taken context switching is usually not tolerable, and
261 * spin mutexes must block out interrupts that could take kernel_lock.
262 */
263
264 /* uvm_object is abused here to index pmap_pages; make assertions happy. */
265 #ifdef DIAGNOSTIC
266 #define PMAP_DUMMY_LOCK(pm) rw_enter(&(pm)->pm_dummy_lock, RW_WRITER)
267 #define PMAP_DUMMY_UNLOCK(pm) rw_exit(&(pm)->pm_dummy_lock)
268 #else
269 #define PMAP_DUMMY_LOCK(pm)
270 #define PMAP_DUMMY_UNLOCK(pm)
271 #endif
272
273 static const struct uvm_pagerops pmap_pager = {
274 /* nothing */
275 };
276
277 const vaddr_t ptp_masks[] = PTP_MASK_INITIALIZER;
278 const vaddr_t ptp_frames[] = PTP_FRAME_INITIALIZER;
279 const int ptp_shifts[] = PTP_SHIFT_INITIALIZER;
280 const long nkptpmax[] = NKPTPMAX_INITIALIZER;
281 const long nbpd[] = NBPD_INITIALIZER;
282 #ifdef i386
283 pd_entry_t * const normal_pdes[] = PDES_INITIALIZER;
284 #else
285 pd_entry_t *normal_pdes[3];
286 #endif
287
288 long nkptp[] = NKPTP_INITIALIZER;
289
290 struct pmap_head pmaps;
291 kmutex_t pmaps_lock __cacheline_aligned;
292
293 struct pcpu_area *pcpuarea __read_mostly;
294
295 static vaddr_t pmap_maxkvaddr;
296
297 /*
298 * Misc. event counters.
299 */ 300 struct evcnt pmap_iobmp_evcnt; 301 struct evcnt pmap_ldt_evcnt; 302 303 /* 304 * PAT 305 */ 306 static bool cpu_pat_enabled __read_mostly = false; 307 308 /* 309 * Global data structures 310 */ 311 312 static struct pmap kernel_pmap_store __cacheline_aligned; /* kernel's pmap */ 313 struct pmap *const kernel_pmap_ptr = &kernel_pmap_store; 314 static rb_tree_t pmap_kernel_rb __cacheline_aligned; 315 316 struct bootspace bootspace __read_mostly; 317 struct slotspace slotspace __read_mostly; 318 319 /* Set to PTE_NX if supported. */ 320 pd_entry_t pmap_pg_nx __read_mostly = 0; 321 322 /* Set to PTE_G if supported. */ 323 pd_entry_t pmap_pg_g __read_mostly = 0; 324 325 /* Set to true if large pages are supported. */ 326 int pmap_largepages __read_mostly = 0; 327 328 paddr_t lowmem_rsvd __read_mostly; 329 paddr_t avail_start __read_mostly; /* PA of first available physical page */ 330 paddr_t avail_end __read_mostly; /* PA of last available physical page */ 331 332 #ifdef XENPV 333 paddr_t pmap_pa_start; /* PA of first physical page for this domain */ 334 paddr_t pmap_pa_end; /* PA of last physical page for this domain */ 335 #endif 336 337 #define VM_PAGE_TO_PP(pg) (&(pg)->mdpage.mp_pp) 338 #define PMAP_CHECK_PP(pp) \ 339 KASSERTMSG((pp)->pp_lock.mtx_ipl._ipl == IPL_VM, "bad pmap_page %p", pp) 340 341 /* 342 * Other data structures 343 */ 344 345 static pt_entry_t protection_codes[8] __read_mostly; 346 347 static bool pmap_initialized __read_mostly = false; /* pmap_init done yet? */ 348 349 /* 350 * The following two vaddr_t's are used during system startup to keep track of 351 * how much of the kernel's VM space we have used. Once the system is started, 352 * the management of the remaining kernel VM space is turned over to the 353 * kernel_map vm_map. 354 */ 355 static vaddr_t virtual_avail __read_mostly; /* VA of first free KVA */ 356 static vaddr_t virtual_end __read_mostly; /* VA of last free KVA */ 357 358 #ifndef XENPV 359 /* 360 * LAPIC virtual address, and fake physical address. 
361 */ 362 volatile vaddr_t local_apic_va __read_mostly; 363 paddr_t local_apic_pa __read_mostly; 364 #endif 365 366 /* 367 * pool that pmap structures are allocated from 368 */ 369 struct pool_cache pmap_cache; 370 static int pmap_ctor(void *, void *, int); 371 static void pmap_dtor(void *, void *); 372 373 /* 374 * pv_entry cache 375 */ 376 static struct pool_cache pmap_pv_cache; 377 378 #ifdef __HAVE_DIRECT_MAP 379 vaddr_t pmap_direct_base __read_mostly; 380 vaddr_t pmap_direct_end __read_mostly; 381 #endif 382 383 #ifndef __HAVE_DIRECT_MAP 384 /* 385 * Special VAs and the PTEs that map them 386 */ 387 static pt_entry_t *early_zero_pte; 388 static void pmap_vpage_cpualloc(struct cpu_info *); 389 #ifdef XENPV 390 char *early_zerop; /* also referenced from xen_locore() */ 391 #else 392 static char *early_zerop; 393 #endif 394 #endif 395 396 int pmap_enter_default(pmap_t, vaddr_t, paddr_t, vm_prot_t, u_int); 397 398 /* PDP pool and its callbacks */ 399 static struct pool pmap_pdp_pool; 400 static void pmap_pdp_init(pd_entry_t *); 401 static void pmap_pdp_fini(pd_entry_t *); 402 403 #ifdef PAE 404 /* need to allocate items of 4 pages */ 405 static void *pmap_pdp_alloc(struct pool *, int); 406 static void pmap_pdp_free(struct pool *, void *); 407 static struct pool_allocator pmap_pdp_allocator = { 408 .pa_alloc = pmap_pdp_alloc, 409 .pa_free = pmap_pdp_free, 410 .pa_pagesz = PAGE_SIZE * PDP_SIZE, 411 }; 412 #endif 413 414 extern vaddr_t idt_vaddr; 415 extern paddr_t idt_paddr; 416 extern vaddr_t gdt_vaddr; 417 extern paddr_t gdt_paddr; 418 extern vaddr_t ldt_vaddr; 419 extern paddr_t ldt_paddr; 420 421 #ifdef i386 422 /* stuff to fix the pentium f00f bug */ 423 extern vaddr_t pentium_idt_vaddr; 424 #endif 425 426 /* Array of freshly allocated PTPs, for pmap_get_ptp(). 
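 * One slot per page table level: pg[] holds the PTP for that level and
 * alloced[] records whether pmap_get_ptp() had to allocate it freshly
 * (as opposed to finding it already present), so that pmap_unget_ptp()
 * knows which ones to back out again.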
*/
427 struct pmap_ptparray {
428 struct vm_page *pg[PTP_LEVELS + 1];
429 bool alloced[PTP_LEVELS + 1];
430 };
431
432 /*
433 * PV tree prototypes
434 */
435
436 static int pmap_compare_key(void *, const void *, const void *);
437 static int pmap_compare_nodes(void *, const void *, const void *);
438
439 /* Red-black tree */
440 static const rb_tree_ops_t pmap_rbtree_ops = {
441 .rbto_compare_nodes = pmap_compare_nodes,
442 .rbto_compare_key = pmap_compare_key,
443 .rbto_node_offset = offsetof(struct pv_entry, pve_rb),
444 .rbto_context = NULL
445 };
446
447 /*
448 * Local prototypes
449 */
450
451 #ifdef __HAVE_PCPU_AREA
452 static void pmap_init_pcpu(void);
453 #endif
454 #ifdef __HAVE_DIRECT_MAP
455 static void pmap_init_directmap(struct pmap *);
456 #endif
457 #if !defined(XENPV)
458 static void pmap_remap_global(void);
459 #endif
460 #ifndef XENPV
461 static void pmap_init_lapic(void);
462 static void pmap_remap_largepages(void);
463 #endif
464
465 static int pmap_get_ptp(struct pmap *, struct pmap_ptparray *, vaddr_t, int,
466 struct vm_page **);
467 static void pmap_unget_ptp(struct pmap *, struct pmap_ptparray *);
468 static void pmap_install_ptp(struct pmap *, struct pmap_ptparray *, vaddr_t,
469 pd_entry_t * const *);
470 static struct vm_page *pmap_find_ptp(struct pmap *, vaddr_t, int);
471 static void pmap_freepage(struct pmap *, struct vm_page *, int);
472 static void pmap_free_ptp(struct pmap *, struct vm_page *, vaddr_t,
473 pt_entry_t *, pd_entry_t * const *);
474 static bool pmap_remove_pte(struct pmap *, struct vm_page *, pt_entry_t *,
475 vaddr_t, struct pv_entry **);
476 static void pmap_remove_ptes(struct pmap *, struct vm_page *, vaddr_t, vaddr_t,
477 vaddr_t, struct pv_entry **);
478
479 static void pmap_alloc_level(struct pmap *, vaddr_t, long *);
480
481 static void pmap_load1(struct lwp *, struct pmap *, struct pmap *);
482 static void pmap_reactivate(struct pmap *);
483
484 /*
485 * p m a p h e l p e r f u n c t i o n s
486 */
487
488 static inline void
489 pmap_stats_update(struct pmap *pmap, int resid_diff, int wired_diff)
490 {
491
492 KASSERT(cold || mutex_owned(&pmap->pm_lock));
493 pmap->pm_stats.resident_count += resid_diff;
494 pmap->pm_stats.wired_count += wired_diff;
495 }
496
497 static inline void
498 pmap_stats_update_bypte(struct pmap *pmap, pt_entry_t npte, pt_entry_t opte)
499 {
500 int resid_diff = ((npte & PTE_P) ? 1 : 0) - ((opte & PTE_P) ? 1 : 0);
501 int wired_diff = ((npte & PTE_WIRED) ? 1 : 0) - ((opte & PTE_WIRED) ? 1 : 0);
502
503 KASSERT((npte & (PTE_P | PTE_WIRED)) != PTE_WIRED);
504 KASSERT((opte & (PTE_P | PTE_WIRED)) != PTE_WIRED);
505
506 pmap_stats_update(pmap, resid_diff, wired_diff);
507 }
508
509 /*
510 * ptp_to_pmap: lookup pmap by ptp
511 */
512 static inline struct pmap *
513 ptp_to_pmap(struct vm_page *ptp)
514 {
515 struct pmap *pmap;
516
517 if (ptp == NULL) {
518 return pmap_kernel();
519 }
520 pmap = (struct pmap *)ptp->uobject;
521 KASSERT(pmap != NULL);
522 KASSERT(&pmap->pm_obj[0] == ptp->uobject);
523 return pmap;
524 }
525
526 static inline struct pv_pte *
527 pve_to_pvpte(struct pv_entry *pve)
528 {
529
530 if (pve == NULL)
531 return NULL;
532 KASSERT((void *)&pve->pve_pte == (void *)pve);
533 return &pve->pve_pte;
534 }
535
536 static inline struct pv_entry *
537 pvpte_to_pve(struct pv_pte *pvpte)
538 {
539 struct pv_entry *pve = (void *)pvpte;
540
541 KASSERT(pve_to_pvpte(pve) == pvpte);
542 return pve;
543 }
544
545 /*
546 * Return true if the pmap page has an embedded PV entry.
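 * (the embedded entry is the pp_pte field of struct pmap_page itself;
 * using it where possible avoids allocating a separate pv_entry for the
 * common case of a page with only one mapping)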
547 */ 548 static inline bool 549 pv_pte_embedded(struct pmap_page *pp) 550 { 551 552 KASSERT(mutex_owned(&pp->pp_lock)); 553 return (bool)((vaddr_t)pp->pp_pte.pte_ptp | pp->pp_pte.pte_va); 554 } 555 556 /* 557 * pv_pte_first, pv_pte_next: PV list iterator. 558 */ 559 static inline struct pv_pte * 560 pv_pte_first(struct pmap_page *pp) 561 { 562 563 KASSERT(mutex_owned(&pp->pp_lock)); 564 if (pv_pte_embedded(pp)) { 565 return &pp->pp_pte; 566 } 567 return pve_to_pvpte(LIST_FIRST(&pp->pp_pvlist)); 568 } 569 570 static inline struct pv_pte * 571 pv_pte_next(struct pmap_page *pp, struct pv_pte *pvpte) 572 { 573 574 KASSERT(mutex_owned(&pp->pp_lock)); 575 KASSERT(pvpte != NULL); 576 if (pvpte == &pp->pp_pte) { 577 return pve_to_pvpte(LIST_FIRST(&pp->pp_pvlist)); 578 } 579 return pve_to_pvpte(LIST_NEXT(pvpte_to_pve(pvpte), pve_list)); 580 } 581 582 static inline uint8_t 583 pmap_pte_to_pp_attrs(pt_entry_t pte) 584 { 585 uint8_t ret = 0; 586 if (pte & PTE_D) 587 ret |= PP_ATTRS_D; 588 if (pte & PTE_A) 589 ret |= PP_ATTRS_A; 590 if (pte & PTE_W) 591 ret |= PP_ATTRS_W; 592 return ret; 593 } 594 595 static inline pt_entry_t 596 pmap_pp_attrs_to_pte(uint8_t attrs) 597 { 598 pt_entry_t pte = 0; 599 if (attrs & PP_ATTRS_D) 600 pte |= PTE_D; 601 if (attrs & PP_ATTRS_A) 602 pte |= PTE_A; 603 if (attrs & PP_ATTRS_W) 604 pte |= PTE_W; 605 return pte; 606 } 607 608 /* 609 * pmap_is_curpmap: is this pmap the one currently loaded [in %cr3]? 610 * of course the kernel is always loaded 611 */ 612 bool 613 pmap_is_curpmap(struct pmap *pmap) 614 { 615 return ((pmap == pmap_kernel()) || (pmap == curcpu()->ci_pmap)); 616 } 617 618 inline void 619 pmap_reference(struct pmap *pmap) 620 { 621 622 atomic_inc_uint(&pmap->pm_obj[0].uo_refs); 623 } 624 625 /* 626 * rbtree: compare two nodes. 627 */ 628 static int 629 pmap_compare_nodes(void *context, const void *n1, const void *n2) 630 { 631 const struct pv_entry *pve1 = n1; 632 const struct pv_entry *pve2 = n2; 633 634 KASSERT(pve1->pve_pte.pte_ptp == pve2->pve_pte.pte_ptp); 635 636 if (pve1->pve_pte.pte_va < pve2->pve_pte.pte_va) { 637 return -1; 638 } 639 if (pve1->pve_pte.pte_va > pve2->pve_pte.pte_va) { 640 return 1; 641 } 642 return 0; 643 } 644 645 /* 646 * rbtree: compare a node and a key. 647 */ 648 static int 649 pmap_compare_key(void *context, const void *n, const void *k) 650 { 651 const struct pv_entry *pve = n; 652 const vaddr_t key = (vaddr_t)k; 653 654 if (pve->pve_pte.pte_va < key) { 655 return -1; 656 } 657 if (pve->pve_pte.pte_va > key) { 658 return 1; 659 } 660 return 0; 661 } 662 663 /* 664 * pmap_ptp_range_set: abuse ptp->uanon to record minimum VA of PTE 665 */ 666 static inline void 667 pmap_ptp_range_set(struct vm_page *ptp, vaddr_t va) 668 { 669 vaddr_t *min = (vaddr_t *)&ptp->uanon; 670 671 if (va < *min) { 672 *min = va; 673 } 674 } 675 676 /* 677 * pmap_ptp_range_clip: abuse ptp->uanon to clip range of PTEs to remove 678 */ 679 static inline void 680 pmap_ptp_range_clip(struct vm_page *ptp, vaddr_t *startva, pt_entry_t **pte) 681 { 682 vaddr_t sclip; 683 684 if (ptp == NULL) { 685 return; 686 } 687 688 sclip = (vaddr_t)ptp->uanon; 689 sclip = (*startva < sclip ? sclip : *startva); 690 *pte += (sclip - *startva) / PAGE_SIZE; 691 *startva = sclip; 692 } 693 694 /* 695 * pmap_map_ptes: map a pmap's PTEs into KVM and lock them in 696 * 697 * there are several pmaps involved. some or all of them might be same. 698 * 699 * - the pmap given by the first argument 700 * our caller wants to access this pmap's PTEs. 
701 *
702 * - pmap_kernel()
703 * the kernel pmap. note that it only contains the kernel part
704 * of the address space which is shared by any pmap. ie. any
705 * pmap can be used instead of pmap_kernel() for our purpose.
706 *
707 * - ci->ci_pmap
708 * pmap currently loaded on the cpu.
709 *
710 * - vm_map_pmap(&curproc->p_vmspace->vm_map)
711 * current process' pmap.
712 *
713 * => caller must lock pmap first (if not the kernel pmap)
714 * => must be undone with pmap_unmap_ptes before returning
715 * => disables kernel preemption
716 */
717 void
718 pmap_map_ptes(struct pmap *pmap, struct pmap **pmap2, pd_entry_t **ptepp,
719 pd_entry_t * const **pdeppp)
720 {
721 struct pmap *curpmap;
722 struct cpu_info *ci;
723 lwp_t *l;
724
725 kpreempt_disable();
726
727 /* The kernel's pmap is always accessible. */
728 if (pmap == pmap_kernel()) {
729 *pmap2 = NULL;
730 *ptepp = PTE_BASE;
731 *pdeppp = normal_pdes;
732 return;
733 }
734
735 KASSERT(mutex_owned(&pmap->pm_lock));
736
737 l = curlwp;
738 ci = l->l_cpu;
739 curpmap = ci->ci_pmap;
740 if (pmap == curpmap) {
741 /*
742 * Already on the CPU: make it valid. This is very
743 * often the case during exit(), when we have switched
744 * to the kernel pmap in order to destroy a user pmap.
745 */
746 if (__predict_false(ci->ci_tlbstate != TLBSTATE_VALID)) {
747 pmap_reactivate(pmap);
748 }
749 *pmap2 = NULL;
750 } else {
751 /*
752 * Toss current pmap from CPU and install new pmap, but keep
753 * a reference to the old one. Dropping the reference
754 * can block as it needs to take locks, so defer that to
755 * pmap_unmap_ptes().
756 */
757 pmap_reference(pmap);
758 pmap_load1(l, pmap, curpmap);
759 *pmap2 = curpmap;
760 }
761 KASSERT(ci->ci_tlbstate == TLBSTATE_VALID);
762 #ifdef DIAGNOSTIC
763 pmap->pm_ncsw = lwp_pctr();
764 #endif
765 *ptepp = PTE_BASE;
766
767 #if defined(XENPV) && defined(__x86_64__)
768 KASSERT(ci->ci_normal_pdes[PTP_LEVELS - 2] == L4_BASE);
769 ci->ci_normal_pdes[PTP_LEVELS - 2] = pmap->pm_pdir;
770 *pdeppp = ci->ci_normal_pdes;
771 #else
772 *pdeppp = normal_pdes;
773 #endif
774 }
775
776 /*
777 * pmap_unmap_ptes: unlock the PTE mapping of "pmap"
778 *
779 * => we cannot tolerate context switches while mapped in: assert this.
780 * => reenables kernel preemption.
781 * => does not unlock pmap.
782 */
783 void
784 pmap_unmap_ptes(struct pmap *pmap, struct pmap * pmap2)
785 {
786 struct cpu_info *ci;
787 struct pmap *mypmap;
788 struct lwp *l;
789
790 KASSERT(kpreempt_disabled());
791
792 /* The kernel's pmap is always accessible. */
793 if (pmap == pmap_kernel()) {
794 kpreempt_enable();
795 return;
796 }
797
798 l = curlwp;
799 ci = l->l_cpu;
800
801 KASSERT(mutex_owned(&pmap->pm_lock));
802 KASSERT(pmap->pm_ncsw == lwp_pctr());
803
804 #if defined(XENPV) && defined(__x86_64__)
805 KASSERT(ci->ci_normal_pdes[PTP_LEVELS - 2] != L4_BASE);
806 ci->ci_normal_pdes[PTP_LEVELS - 2] = L4_BASE;
807 #endif
808
809 /* If not our own pmap, mark whatever's on the CPU now as lazy. */
810 KASSERT(ci->ci_tlbstate == TLBSTATE_VALID);
811 mypmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map);
812 if (ci->ci_pmap == vm_map_pmap(&l->l_proc->p_vmspace->vm_map)) {
813 ci->ci_want_pmapload = 0;
814 } else {
815 ci->ci_want_pmapload = (mypmap != pmap_kernel());
816 ci->ci_tlbstate = TLBSTATE_LAZY;
817 }
818
819 /* Now safe to re-enable preemption. */
820 kpreempt_enable();
821
822 /* Toss reference to other pmap taken earlier.
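 * The reference was taken in pmap_map_ptes() when a different pmap was
 * installed on the CPU. Dropping it can block, as it may need to take
 * locks, which is why it is deferred until after preemption has been
 * re-enabled above.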
*/ 823 if (pmap2 != NULL) { 824 pmap_destroy(pmap2); 825 } 826 } 827 828 inline static void 829 pmap_exec_account(struct pmap *pm, vaddr_t va, pt_entry_t opte, pt_entry_t npte) 830 { 831 832 #if !defined(__x86_64__) 833 if (curproc == NULL || curproc->p_vmspace == NULL || 834 pm != vm_map_pmap(&curproc->p_vmspace->vm_map)) 835 return; 836 837 if ((opte ^ npte) & PTE_X) 838 pmap_update_pg(va); 839 840 /* 841 * Executability was removed on the last executable change. 842 * Reset the code segment to something conservative and 843 * let the trap handler deal with setting the right limit. 844 * We can't do that because of locking constraints on the vm map. 845 */ 846 847 if ((opte & PTE_X) && (npte & PTE_X) == 0 && va == pm->pm_hiexec) { 848 struct trapframe *tf = curlwp->l_md.md_regs; 849 850 tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL); 851 pm->pm_hiexec = I386_MAX_EXE_ADDR; 852 } 853 #endif /* !defined(__x86_64__) */ 854 } 855 856 #if !defined(__x86_64__) 857 /* 858 * Fixup the code segment to cover all potential executable mappings. 859 * returns 0 if no changes to the code segment were made. 860 */ 861 int 862 pmap_exec_fixup(struct vm_map *map, struct trapframe *tf, struct pcb *pcb) 863 { 864 struct vm_map_entry *ent; 865 struct pmap *pm = vm_map_pmap(map); 866 vaddr_t va = 0; 867 868 vm_map_lock_read(map); 869 for (ent = (&map->header)->next; ent != &map->header; ent = ent->next) { 870 /* 871 * This entry has greater va than the entries before. 872 * We need to make it point to the last page, not past it. 873 */ 874 if (ent->protection & VM_PROT_EXECUTE) 875 va = trunc_page(ent->end) - PAGE_SIZE; 876 } 877 vm_map_unlock_read(map); 878 if (va == pm->pm_hiexec && tf->tf_cs == GSEL(GUCODEBIG_SEL, SEL_UPL)) 879 return 0; 880 881 pm->pm_hiexec = va; 882 if (pm->pm_hiexec > I386_MAX_EXE_ADDR) { 883 tf->tf_cs = GSEL(GUCODEBIG_SEL, SEL_UPL); 884 } else { 885 tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL); 886 return 0; 887 } 888 return 1; 889 } 890 #endif /* !defined(__x86_64__) */ 891 892 void 893 pat_init(struct cpu_info *ci) 894 { 895 uint64_t pat; 896 897 if (!(ci->ci_feat_val[0] & CPUID_PAT)) 898 return; 899 900 /* We change WT to WC. Leave all other entries the default values. */ 901 pat = PATENTRY(0, PAT_WB) | PATENTRY(1, PAT_WC) | 902 PATENTRY(2, PAT_UCMINUS) | PATENTRY(3, PAT_UC) | 903 PATENTRY(4, PAT_WB) | PATENTRY(5, PAT_WC) | 904 PATENTRY(6, PAT_UCMINUS) | PATENTRY(7, PAT_UC); 905 906 wrmsr(MSR_CR_PAT, pat); 907 cpu_pat_enabled = true; 908 aprint_debug_dev(ci->ci_dev, "PAT enabled\n"); 909 } 910 911 static pt_entry_t 912 pmap_pat_flags(u_int flags) 913 { 914 u_int cacheflags = (flags & PMAP_CACHE_MASK); 915 916 if (!cpu_pat_enabled) { 917 switch (cacheflags) { 918 case PMAP_NOCACHE: 919 case PMAP_NOCACHE_OVR: 920 /* results in PGC_UCMINUS on cpus which have 921 * the cpuid PAT but PAT "disabled" 922 */ 923 return PTE_PCD; 924 default: 925 return 0; 926 } 927 } 928 929 switch (cacheflags) { 930 case PMAP_NOCACHE: 931 return PGC_UC; 932 case PMAP_WRITE_COMBINE: 933 return PGC_WC; 934 case PMAP_WRITE_BACK: 935 return PGC_WB; 936 case PMAP_NOCACHE_OVR: 937 return PGC_UCMINUS; 938 } 939 940 return 0; 941 } 942 943 /* 944 * p m a p k e n t e r f u n c t i o n s 945 * 946 * functions to quickly enter/remove pages from the kernel address 947 * space. pmap_kremove is exported to MI kernel. we make use of 948 * the recursive PTE mappings. 
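 *
 * as a purely illustrative sketch (va, pa and the protection used here
 * are stand-ins, not taken from this file), a typical unmanaged kernel
 * mapping is made and torn down like so:
 *
 *	pmap_kenter_pa(va, pa, VM_PROT_READ | VM_PROT_WRITE, 0);
 *	... use the mapping ...
 *	pmap_kremove(va, PAGE_SIZE);
 *	pmap_update(pmap_kernel());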
949 */ 950 951 /* 952 * pmap_kenter_pa: enter a kernel mapping without R/M (pv_entry) tracking 953 * 954 * => no need to lock anything, assume va is already allocated 955 * => should be faster than normal pmap enter function 956 */ 957 void 958 pmap_kenter_pa(vaddr_t va, paddr_t pa, vm_prot_t prot, u_int flags) 959 { 960 pt_entry_t *pte, opte, npte; 961 962 KASSERT(!(prot & ~VM_PROT_ALL)); 963 964 if (va < VM_MIN_KERNEL_ADDRESS) 965 pte = vtopte(va); 966 else 967 pte = kvtopte(va); 968 #ifdef DOM0OPS 969 if (pa < pmap_pa_start || pa >= pmap_pa_end) { 970 #ifdef DEBUG 971 printf_nolog("%s: pa %#" PRIxPADDR " for va %#" PRIxVADDR 972 " outside range\n", __func__, pa, va); 973 #endif /* DEBUG */ 974 npte = pa; 975 } else 976 #endif /* DOM0OPS */ 977 npte = pmap_pa2pte(pa); 978 npte |= protection_codes[prot] | PTE_P | pmap_pg_g; 979 npte |= pmap_pat_flags(flags); 980 opte = pmap_pte_testset(pte, npte); /* zap! */ 981 982 /* 983 * XXX: make sure we are not dealing with a large page, since the only 984 * large pages created are for the kernel image, and they should never 985 * be kentered. 986 */ 987 KASSERTMSG(!(opte & PTE_PS), "PTE_PS va=%#"PRIxVADDR, va); 988 989 if ((opte & (PTE_P | PTE_A)) == (PTE_P | PTE_A)) { 990 /* This should not happen. */ 991 printf_nolog("%s: mapping already present\n", __func__); 992 kpreempt_disable(); 993 pmap_tlb_shootdown(pmap_kernel(), va, opte, TLBSHOOT_KENTER); 994 kpreempt_enable(); 995 } 996 } 997 998 __strict_weak_alias(pmap_kenter_ma, pmap_kenter_pa); 999 1000 #if defined(__x86_64__) 1001 /* 1002 * Change protection for a virtual address. Local for a CPU only, don't 1003 * care about TLB shootdowns. 1004 * 1005 * => must be called with preemption disabled 1006 */ 1007 void 1008 pmap_changeprot_local(vaddr_t va, vm_prot_t prot) 1009 { 1010 pt_entry_t *pte, opte, npte; 1011 1012 KASSERT(kpreempt_disabled()); 1013 1014 if (va < VM_MIN_KERNEL_ADDRESS) 1015 pte = vtopte(va); 1016 else 1017 pte = kvtopte(va); 1018 1019 npte = opte = *pte; 1020 1021 if ((prot & VM_PROT_WRITE) != 0) 1022 npte |= PTE_W; 1023 else 1024 npte &= ~(PTE_W|PTE_D); 1025 1026 if (opte != npte) { 1027 pmap_pte_set(pte, npte); 1028 pmap_pte_flush(); 1029 invlpg(va); 1030 } 1031 } 1032 #endif /* defined(__x86_64__) */ 1033 1034 /* 1035 * pmap_kremove: remove a kernel mapping(s) without R/M (pv_entry) tracking 1036 * 1037 * => no need to lock anything 1038 * => caller must dispose of any vm_page mapped in the va range 1039 * => note: not an inline function 1040 * => we assume the va is page aligned and the len is a multiple of PAGE_SIZE 1041 * => we assume kernel only unmaps valid addresses and thus don't bother 1042 * checking the valid bit before doing TLB flushing 1043 * => must be followed by call to pmap_update() before reuse of page 1044 */ 1045 static void 1046 pmap_kremove1(vaddr_t sva, vsize_t len, bool localonly) 1047 { 1048 pt_entry_t *pte, opte; 1049 vaddr_t va, eva; 1050 1051 eva = sva + len; 1052 1053 kpreempt_disable(); 1054 for (va = sva; va < eva; va += PAGE_SIZE) { 1055 pte = kvtopte(va); 1056 opte = pmap_pte_testset(pte, 0); /* zap! 
*/ 1057 if ((opte & (PTE_P | PTE_A)) == (PTE_P | PTE_A) && !localonly) { 1058 pmap_tlb_shootdown(pmap_kernel(), va, opte, 1059 TLBSHOOT_KREMOVE); 1060 } 1061 KASSERTMSG((opte & PTE_PS) == 0, 1062 "va %#" PRIxVADDR " is a large page", va); 1063 KASSERTMSG((opte & PTE_PVLIST) == 0, 1064 "va %#" PRIxVADDR " is a pv tracked page", va); 1065 } 1066 if (localonly) { 1067 tlbflushg(); 1068 } 1069 kpreempt_enable(); 1070 } 1071 1072 void 1073 pmap_kremove(vaddr_t sva, vsize_t len) 1074 { 1075 1076 pmap_kremove1(sva, len, false); 1077 } 1078 1079 /* 1080 * pmap_kremove_local: like pmap_kremove(), but only worry about 1081 * TLB invalidations on the current CPU. this is only intended 1082 * for use while writing kernel crash dumps, either after panic 1083 * or via reboot -d. 1084 */ 1085 void 1086 pmap_kremove_local(vaddr_t sva, vsize_t len) 1087 { 1088 1089 pmap_kremove1(sva, len, true); 1090 } 1091 1092 /* 1093 * p m a p i n i t f u n c t i o n s 1094 * 1095 * pmap_bootstrap and pmap_init are called during system startup 1096 * to init the pmap module. pmap_bootstrap() does a low level 1097 * init just to get things rolling. pmap_init() finishes the job. 1098 */ 1099 1100 /* 1101 * pmap_bootstrap_valloc: allocate a virtual address in the bootstrap area. 1102 * This function is to be used before any VM system has been set up. 1103 * 1104 * The va is taken from virtual_avail. 1105 */ 1106 static vaddr_t 1107 pmap_bootstrap_valloc(size_t npages) 1108 { 1109 vaddr_t va = virtual_avail; 1110 virtual_avail += npages * PAGE_SIZE; 1111 return va; 1112 } 1113 1114 /* 1115 * pmap_bootstrap_palloc: allocate a physical address in the bootstrap area. 1116 * This function is to be used before any VM system has been set up. 1117 * 1118 * The pa is taken from avail_start. 1119 */ 1120 static paddr_t 1121 pmap_bootstrap_palloc(size_t npages) 1122 { 1123 paddr_t pa = avail_start; 1124 avail_start += npages * PAGE_SIZE; 1125 return pa; 1126 } 1127 1128 /* 1129 * pmap_bootstrap: get the system in a state where it can run with VM properly 1130 * enabled (called before main()). The VM system is fully init'd later. 1131 * 1132 * => on i386, locore.S has already enabled the MMU by allocating a PDP for the 1133 * kernel, and nkpde PTP's for the kernel. 1134 * => kva_start is the first free virtual address in kernel space. 1135 */ 1136 void 1137 pmap_bootstrap(vaddr_t kva_start) 1138 { 1139 struct pmap *kpm; 1140 int i; 1141 vaddr_t kva; 1142 1143 pmap_pg_nx = (cpu_feature[2] & CPUID_NOX ? PTE_NX : 0); 1144 1145 /* 1146 * Set up our local static global vars that keep track of the usage of 1147 * KVM before kernel_map is set up. 1148 */ 1149 virtual_avail = kva_start; /* first free KVA */ 1150 virtual_end = VM_MAX_KERNEL_ADDRESS; /* last KVA */ 1151 1152 /* 1153 * Set up protection_codes: we need to be able to convert from a MI 1154 * protection code (some combo of VM_PROT...) to something we can jam 1155 * into a x86 PTE. 1156 */ 1157 protection_codes[VM_PROT_NONE] = pmap_pg_nx; 1158 protection_codes[VM_PROT_EXECUTE] = PTE_X; 1159 protection_codes[VM_PROT_READ] = pmap_pg_nx; 1160 protection_codes[VM_PROT_READ|VM_PROT_EXECUTE] = PTE_X; 1161 protection_codes[VM_PROT_WRITE] = PTE_W | pmap_pg_nx; 1162 protection_codes[VM_PROT_WRITE|VM_PROT_EXECUTE] = PTE_W | PTE_X; 1163 protection_codes[VM_PROT_WRITE|VM_PROT_READ] = PTE_W | pmap_pg_nx; 1164 protection_codes[VM_PROT_ALL] = PTE_W | PTE_X; 1165 1166 /* 1167 * Now we init the kernel's pmap. 1168 * 1169 * The kernel pmap's pm_obj is not used for much. 
However, in user pmaps 1170 * the pm_obj contains the list of active PTPs. 1171 */ 1172 kpm = pmap_kernel(); 1173 mutex_init(&kpm->pm_lock, MUTEX_DEFAULT, IPL_NONE); 1174 rw_init(&kpm->pm_dummy_lock); 1175 for (i = 0; i < PTP_LEVELS - 1; i++) { 1176 uvm_obj_init(&kpm->pm_obj[i], &pmap_pager, false, 1); 1177 uvm_obj_setlock(&kpm->pm_obj[i], &kpm->pm_dummy_lock); 1178 kpm->pm_ptphint[i] = NULL; 1179 } 1180 memset(&kpm->pm_list, 0, sizeof(kpm->pm_list)); /* pm_list not used */ 1181 1182 kpm->pm_pdir = (pd_entry_t *)bootspace.pdir; 1183 for (i = 0; i < PDP_SIZE; i++) 1184 kpm->pm_pdirpa[i] = PDPpaddr + PAGE_SIZE * i; 1185 1186 kpm->pm_stats.wired_count = kpm->pm_stats.resident_count = 1187 x86_btop(kva_start - VM_MIN_KERNEL_ADDRESS); 1188 1189 kcpuset_create(&kpm->pm_cpus, true); 1190 kcpuset_create(&kpm->pm_kernel_cpus, true); 1191 1192 kpm->pm_ldt = NULL; 1193 kpm->pm_ldt_len = 0; 1194 kpm->pm_ldt_sel = GSYSSEL(GLDT_SEL, SEL_KPL); 1195 1196 /* 1197 * the above is just a rough estimate and not critical to the proper 1198 * operation of the system. 1199 */ 1200 1201 #if !defined(XENPV) 1202 /* 1203 * Begin to enable global TLB entries if they are supported: add PTE_G 1204 * attribute to already mapped kernel pages. Do that only if SVS is 1205 * disabled. 1206 * 1207 * The G bit has no effect until the CR4_PGE bit is set in CR4, which 1208 * happens later in cpu_init(). 1209 */ 1210 #ifdef SVS 1211 if (!svs_enabled && (cpu_feature[0] & CPUID_PGE)) { 1212 #else 1213 if (cpu_feature[0] & CPUID_PGE) { 1214 #endif 1215 pmap_pg_g = PTE_G; 1216 pmap_remap_global(); 1217 } 1218 #endif 1219 1220 #ifndef XENPV 1221 /* 1222 * Enable large pages if they are supported. 1223 */ 1224 if (cpu_feature[0] & CPUID_PSE) { 1225 lcr4(rcr4() | CR4_PSE); /* enable hardware (via %cr4) */ 1226 pmap_largepages = 1; /* enable software */ 1227 1228 /* 1229 * The TLB must be flushed after enabling large pages on Pentium 1230 * CPUs, according to section 3.6.2.2 of "Intel Architecture 1231 * Software Developer's Manual, Volume 3: System Programming". 1232 */ 1233 tlbflushg(); 1234 1235 /* Remap the kernel. */ 1236 pmap_remap_largepages(); 1237 } 1238 pmap_init_lapic(); 1239 #endif /* !XENPV */ 1240 1241 #ifdef __HAVE_PCPU_AREA 1242 pmap_init_pcpu(); 1243 #endif 1244 1245 #ifdef __HAVE_DIRECT_MAP 1246 pmap_init_directmap(kpm); 1247 #else 1248 pmap_vpage_cpualloc(&cpu_info_primary); 1249 1250 if (VM_MIN_KERNEL_ADDRESS == KERNBASE) { /* i386 */ 1251 early_zerop = (void *)cpu_info_primary.vpage[VPAGE_ZER]; 1252 early_zero_pte = cpu_info_primary.vpage_pte[VPAGE_ZER]; 1253 } else { /* amd64 */ 1254 /* 1255 * zero_pte is stuck at the end of mapped space for the kernel 1256 * image (disjunct from kva space). This is done so that it 1257 * can safely be used in pmap_growkernel (pmap_get_physpage), 1258 * when it's called for the first time. 1259 * XXXfvdl fix this for MULTIPROCESSOR later. 1260 */ 1261 #ifdef XENPV 1262 /* early_zerop initialized in xen_locore() */ 1263 #else 1264 early_zerop = (void *)bootspace.spareva; 1265 #endif 1266 early_zero_pte = PTE_BASE + pl1_i((vaddr_t)early_zerop); 1267 } 1268 #endif 1269 1270 #if defined(XENPV) && defined(__x86_64__) 1271 extern vaddr_t xen_dummy_page; 1272 paddr_t xen_dummy_user_pgd; 1273 1274 /* 1275 * We want a dummy page directory for Xen: when deactivating a pmap, 1276 * Xen will still consider it active. So we set user PGD to this one 1277 * to lift all protection on the now inactive page tables set. 
1278 */ 1279 xen_dummy_user_pgd = xen_dummy_page - KERNBASE; 1280 1281 /* Zero fill it, the less checks in Xen it requires the better */ 1282 memset((void *)(xen_dummy_user_pgd + KERNBASE), 0, PAGE_SIZE); 1283 /* Mark read-only */ 1284 HYPERVISOR_update_va_mapping(xen_dummy_user_pgd + KERNBASE, 1285 pmap_pa2pte(xen_dummy_user_pgd) | PTE_P | pmap_pg_nx, 1286 UVMF_INVLPG); 1287 /* Pin as L4 */ 1288 xpq_queue_pin_l4_table(xpmap_ptom_masked(xen_dummy_user_pgd)); 1289 #endif 1290 1291 /* 1292 * Allocate space for the IDT, GDT and LDT. 1293 */ 1294 #ifdef __HAVE_PCPU_AREA 1295 idt_vaddr = (vaddr_t)&pcpuarea->idt; 1296 #else 1297 idt_vaddr = pmap_bootstrap_valloc(1); 1298 #endif 1299 idt_paddr = pmap_bootstrap_palloc(1); 1300 1301 gdt_vaddr = pmap_bootstrap_valloc(1); 1302 gdt_paddr = pmap_bootstrap_palloc(1); 1303 1304 #ifdef __HAVE_PCPU_AREA 1305 ldt_vaddr = (vaddr_t)&pcpuarea->ldt; 1306 #else 1307 ldt_vaddr = pmap_bootstrap_valloc(1); 1308 #endif 1309 ldt_paddr = pmap_bootstrap_palloc(1); 1310 1311 #if !defined(__x86_64__) 1312 /* pentium f00f bug stuff */ 1313 pentium_idt_vaddr = pmap_bootstrap_valloc(1); 1314 #endif 1315 1316 #if defined(XENPVHVM) 1317 /* XXX: move to hypervisor.c with appropriate API adjustments */ 1318 extern paddr_t HYPERVISOR_shared_info_pa; 1319 extern volatile struct xencons_interface *xencons_interface; /* XXX */ 1320 extern struct xenstore_domain_interface *xenstore_interface; /* XXX */ 1321 1322 HYPERVISOR_shared_info = (void *) pmap_bootstrap_valloc(1); 1323 HYPERVISOR_shared_info_pa = pmap_bootstrap_palloc(1); 1324 xencons_interface = (void *) pmap_bootstrap_valloc(1); 1325 xenstore_interface = (void *) pmap_bootstrap_valloc(1); 1326 #endif 1327 /* 1328 * Now we reserve some VM for mapping pages when doing a crash dump. 1329 */ 1330 virtual_avail = reserve_dumppages(virtual_avail); 1331 1332 /* 1333 * Init the global lock and global list. 1334 */ 1335 mutex_init(&pmaps_lock, MUTEX_DEFAULT, IPL_NONE); 1336 LIST_INIT(&pmaps); 1337 1338 /* 1339 * Ensure the TLB is sync'd with reality by flushing it... 1340 */ 1341 tlbflushg(); 1342 1343 /* 1344 * Calculate pmap_maxkvaddr from nkptp[]. 1345 */ 1346 kva = VM_MIN_KERNEL_ADDRESS; 1347 for (i = PTP_LEVELS - 1; i >= 1; i--) { 1348 kva += nkptp[i] * nbpd[i]; 1349 } 1350 pmap_maxkvaddr = kva; 1351 } 1352 1353 #ifndef XENPV 1354 static void 1355 pmap_init_lapic(void) 1356 { 1357 /* 1358 * On CPUs that have no LAPIC, local_apic_va is never kentered. But our 1359 * x86 implementation relies a lot on this address to be valid; so just 1360 * allocate a fake physical page that will be kentered into 1361 * local_apic_va by machdep. 1362 * 1363 * If the LAPIC is present, the va will be remapped somewhere else 1364 * later in lapic_map. 
1365 */ 1366 local_apic_va = pmap_bootstrap_valloc(1); 1367 local_apic_pa = pmap_bootstrap_palloc(1); 1368 } 1369 #endif 1370 1371 #ifdef __x86_64__ 1372 static size_t 1373 pmap_pagetree_nentries_range(vaddr_t startva, vaddr_t endva, size_t pgsz) 1374 { 1375 size_t npages; 1376 npages = (roundup(endva, pgsz) / pgsz) - 1377 (rounddown(startva, pgsz) / pgsz); 1378 return npages; 1379 } 1380 #endif 1381 1382 #if defined(__HAVE_DIRECT_MAP) || defined(KASAN) || defined(KMSAN) 1383 static inline void 1384 slotspace_copy(int type, pd_entry_t *dst, pd_entry_t *src) 1385 { 1386 size_t sslot = slotspace.area[type].sslot; 1387 size_t nslot = slotspace.area[type].nslot; 1388 1389 memcpy(&dst[sslot], &src[sslot], nslot * sizeof(pd_entry_t)); 1390 } 1391 #endif 1392 1393 #ifdef __x86_64__ 1394 /* 1395 * Randomize the location of an area. We count the holes in the VM space. We 1396 * randomly select one hole, and then randomly select an area within that hole. 1397 * Finally we update the associated entry in the slotspace structure. 1398 */ 1399 vaddr_t __noasan 1400 slotspace_rand(int type, size_t sz, size_t align) 1401 { 1402 struct { 1403 int start; 1404 int end; 1405 } holes[SLSPACE_NAREAS+1]; 1406 size_t i, nholes, hole; 1407 size_t startsl, endsl, nslots, winsize; 1408 vaddr_t startva, va; 1409 1410 sz = roundup(sz, align); 1411 1412 /* 1413 * Take one more slot with +NBPD_L4, because we may end up choosing 1414 * an area that crosses slots: 1415 * +------+------+------+ 1416 * | Slot | Slot | Slot | 1417 * +------+------+------+ 1418 * [Chosen Area] 1419 * And in that case we must take into account the additional slot 1420 * consumed. 1421 */ 1422 nslots = roundup(sz+NBPD_L4, NBPD_L4) / NBPD_L4; 1423 1424 /* Get the holes. */ 1425 nholes = 0; 1426 size_t curslot = 0 + 256; /* end of SLAREA_USER */ 1427 while (1) { 1428 /* 1429 * Find the first occupied slot after the current one. 1430 * The area between the two is a hole. 1431 */ 1432 size_t minsslot = 512; 1433 size_t minnslot = 0; 1434 for (i = 0; i < SLSPACE_NAREAS; i++) { 1435 if (!slotspace.area[i].active) 1436 continue; 1437 if (slotspace.area[i].sslot >= curslot && 1438 slotspace.area[i].sslot < minsslot) { 1439 minsslot = slotspace.area[i].sslot; 1440 minnslot = slotspace.area[i].nslot; 1441 } 1442 } 1443 1444 /* No hole anymore, stop here. */ 1445 if (minsslot == 512) { 1446 break; 1447 } 1448 1449 /* Register the hole. */ 1450 if (minsslot - curslot >= nslots) { 1451 holes[nholes].start = curslot; 1452 holes[nholes].end = minsslot; 1453 nholes++; 1454 } 1455 1456 /* Skip that hole, and iterate again. */ 1457 curslot = minsslot + minnslot; 1458 } 1459 1460 if (nholes == 0) { 1461 panic("%s: impossible", __func__); 1462 } 1463 1464 /* Select a hole. */ 1465 cpu_earlyrng(&hole, sizeof(hole)); 1466 #ifdef NO_X86_ASLR 1467 hole = 0; 1468 #endif 1469 hole %= nholes; 1470 startsl = holes[hole].start; 1471 endsl = holes[hole].end; 1472 startva = VA_SIGN_NEG(startsl * NBPD_L4); 1473 1474 /* Select an area within the hole. */ 1475 cpu_earlyrng(&va, sizeof(va)); 1476 #ifdef NO_X86_ASLR 1477 va = 0; 1478 #endif 1479 winsize = ((endsl - startsl) * NBPD_L4) - sz; 1480 va %= winsize; 1481 va = rounddown(va, align); 1482 va += startva; 1483 1484 /* Update the entry. 
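 * Record the L4 slots now covered by the chosen area and mark it
 * active, so that later calls treat this range as occupied when they
 * scan for holes.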
*/ 1485 slotspace.area[type].sslot = pl4_i(va); 1486 slotspace.area[type].nslot = 1487 pmap_pagetree_nentries_range(va, va+sz, NBPD_L4); 1488 slotspace.area[type].active = true; 1489 1490 return va; 1491 } 1492 #endif 1493 1494 #ifdef __HAVE_PCPU_AREA 1495 static void 1496 pmap_init_pcpu(void) 1497 { 1498 const vaddr_t startva = PMAP_PCPU_BASE; 1499 size_t nL4e, nL3e, nL2e, nL1e; 1500 size_t L4e_idx, L3e_idx, L2e_idx, L1e_idx __diagused; 1501 paddr_t pa; 1502 vaddr_t endva; 1503 vaddr_t tmpva; 1504 pt_entry_t *pte; 1505 size_t size; 1506 int i; 1507 1508 const pd_entry_t pteflags = PTE_P | PTE_W | pmap_pg_nx; 1509 1510 size = sizeof(struct pcpu_area); 1511 1512 endva = startva + size; 1513 1514 /* We will use this temporary va. */ 1515 tmpva = bootspace.spareva; 1516 pte = PTE_BASE + pl1_i(tmpva); 1517 1518 /* Build L4 */ 1519 L4e_idx = pl4_i(startva); 1520 nL4e = pmap_pagetree_nentries_range(startva, endva, NBPD_L4); 1521 KASSERT(nL4e == 1); 1522 for (i = 0; i < nL4e; i++) { 1523 KASSERT(L4_BASE[L4e_idx+i] == 0); 1524 1525 pa = pmap_bootstrap_palloc(1); 1526 *pte = (pa & PTE_FRAME) | pteflags; 1527 pmap_update_pg(tmpva); 1528 memset((void *)tmpva, 0, PAGE_SIZE); 1529 1530 L4_BASE[L4e_idx+i] = pa | pteflags | PTE_A; 1531 } 1532 1533 /* Build L3 */ 1534 L3e_idx = pl3_i(startva); 1535 nL3e = pmap_pagetree_nentries_range(startva, endva, NBPD_L3); 1536 for (i = 0; i < nL3e; i++) { 1537 KASSERT(L3_BASE[L3e_idx+i] == 0); 1538 1539 pa = pmap_bootstrap_palloc(1); 1540 *pte = (pa & PTE_FRAME) | pteflags; 1541 pmap_update_pg(tmpva); 1542 memset((void *)tmpva, 0, PAGE_SIZE); 1543 1544 L3_BASE[L3e_idx+i] = pa | pteflags | PTE_A; 1545 } 1546 1547 /* Build L2 */ 1548 L2e_idx = pl2_i(startva); 1549 nL2e = pmap_pagetree_nentries_range(startva, endva, NBPD_L2); 1550 for (i = 0; i < nL2e; i++) { 1551 1552 KASSERT(L2_BASE[L2e_idx+i] == 0); 1553 1554 pa = pmap_bootstrap_palloc(1); 1555 *pte = (pa & PTE_FRAME) | pteflags; 1556 pmap_update_pg(tmpva); 1557 memset((void *)tmpva, 0, PAGE_SIZE); 1558 1559 L2_BASE[L2e_idx+i] = pa | pteflags | PTE_A; 1560 } 1561 1562 /* Build L1 */ 1563 L1e_idx = pl1_i(startva); 1564 nL1e = pmap_pagetree_nentries_range(startva, endva, NBPD_L1); 1565 for (i = 0; i < nL1e; i++) { 1566 /* 1567 * Nothing to do, the PTEs will be entered via 1568 * pmap_kenter_pa. 1569 */ 1570 KASSERT(L1_BASE[L1e_idx+i] == 0); 1571 } 1572 1573 *pte = 0; 1574 pmap_update_pg(tmpva); 1575 1576 pcpuarea = (struct pcpu_area *)startva; 1577 1578 tlbflush(); 1579 } 1580 #endif 1581 1582 #ifdef __HAVE_DIRECT_MAP 1583 /* 1584 * Create the amd64 direct map. Called only once at boot time. We map all of 1585 * the physical memory contiguously using 2MB large pages, with RW permissions. 1586 * However there is a hole: the kernel is mapped with RO permissions. 
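 * Once built, physical address pa is reachable at virtual address
 * pmap_direct_base + pa, for any pa below (pmap_direct_end -
 * pmap_direct_base).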
1587 */ 1588 static void 1589 pmap_init_directmap(struct pmap *kpm) 1590 { 1591 extern phys_ram_seg_t mem_clusters[]; 1592 extern int mem_cluster_cnt; 1593 1594 vaddr_t startva; 1595 size_t nL4e, nL3e, nL2e; 1596 size_t L4e_idx, L3e_idx, L2e_idx; 1597 size_t spahole, epahole; 1598 paddr_t lastpa, pa; 1599 vaddr_t endva; 1600 vaddr_t tmpva; 1601 pt_entry_t *pte; 1602 phys_ram_seg_t *mc; 1603 int i; 1604 1605 const pd_entry_t pteflags = PTE_P | PTE_W | pmap_pg_nx; 1606 const pd_entry_t holepteflags = PTE_P | pmap_pg_nx; 1607 1608 CTASSERT(NL4_SLOT_DIRECT * NBPD_L4 == MAXPHYSMEM); 1609 1610 spahole = roundup(bootspace.head.pa, NBPD_L2); 1611 epahole = rounddown(bootspace.boot.pa, NBPD_L2); 1612 1613 /* Get the last physical address available */ 1614 lastpa = 0; 1615 for (i = 0; i < mem_cluster_cnt; i++) { 1616 mc = &mem_clusters[i]; 1617 lastpa = MAX(lastpa, mc->start + mc->size); 1618 } 1619 1620 /* 1621 * x86_add_cluster should have truncated the memory to MAXPHYSMEM. 1622 */ 1623 if (lastpa > MAXPHYSMEM) { 1624 panic("pmap_init_directmap: lastpa incorrect"); 1625 } 1626 1627 startva = slotspace_rand(SLAREA_DMAP, lastpa, NBPD_L2); 1628 endva = startva + lastpa; 1629 1630 /* We will use this temporary va. */ 1631 tmpva = bootspace.spareva; 1632 pte = PTE_BASE + pl1_i(tmpva); 1633 1634 /* Build L4 */ 1635 L4e_idx = pl4_i(startva); 1636 nL4e = pmap_pagetree_nentries_range(startva, endva, NBPD_L4); 1637 KASSERT(nL4e <= NL4_SLOT_DIRECT); 1638 for (i = 0; i < nL4e; i++) { 1639 KASSERT(L4_BASE[L4e_idx+i] == 0); 1640 1641 pa = pmap_bootstrap_palloc(1); 1642 *pte = (pa & PTE_FRAME) | pteflags; 1643 pmap_update_pg(tmpva); 1644 memset((void *)tmpva, 0, PAGE_SIZE); 1645 1646 L4_BASE[L4e_idx+i] = pa | pteflags | PTE_A; 1647 } 1648 1649 /* Build L3 */ 1650 L3e_idx = pl3_i(startva); 1651 nL3e = pmap_pagetree_nentries_range(startva, endva, NBPD_L3); 1652 for (i = 0; i < nL3e; i++) { 1653 KASSERT(L3_BASE[L3e_idx+i] == 0); 1654 1655 pa = pmap_bootstrap_palloc(1); 1656 *pte = (pa & PTE_FRAME) | pteflags; 1657 pmap_update_pg(tmpva); 1658 memset((void *)tmpva, 0, PAGE_SIZE); 1659 1660 L3_BASE[L3e_idx+i] = pa | pteflags | PTE_A; 1661 } 1662 1663 /* Build L2 */ 1664 L2e_idx = pl2_i(startva); 1665 nL2e = pmap_pagetree_nentries_range(startva, endva, NBPD_L2); 1666 for (i = 0; i < nL2e; i++) { 1667 KASSERT(L2_BASE[L2e_idx+i] == 0); 1668 1669 pa = (paddr_t)(i * NBPD_L2); 1670 1671 if (spahole <= pa && pa < epahole) { 1672 L2_BASE[L2e_idx+i] = pa | holepteflags | PTE_A | 1673 PTE_PS | pmap_pg_g; 1674 } else { 1675 L2_BASE[L2e_idx+i] = pa | pteflags | PTE_A | 1676 PTE_PS | pmap_pg_g; 1677 } 1678 } 1679 1680 *pte = 0; 1681 pmap_update_pg(tmpva); 1682 1683 pmap_direct_base = startva; 1684 pmap_direct_end = endva; 1685 1686 tlbflush(); 1687 } 1688 #endif /* __HAVE_DIRECT_MAP */ 1689 1690 #if !defined(XENPV) 1691 /* 1692 * Remap all of the virtual pages created so far with the PTE_G bit. 
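 * Global mappings are not flushed when %cr3 is reloaded, so this avoids
 * needless TLB misses on kernel text and data across address space
 * switches. As noted in pmap_bootstrap(), the bit only takes effect
 * once CR4_PGE is set, which happens later in cpu_init().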
1693 */ 1694 static void 1695 pmap_remap_global(void) 1696 { 1697 vaddr_t kva, kva_end; 1698 unsigned long p1i; 1699 size_t i; 1700 1701 /* head */ 1702 kva = bootspace.head.va; 1703 kva_end = kva + bootspace.head.sz; 1704 for ( ; kva < kva_end; kva += PAGE_SIZE) { 1705 p1i = pl1_i(kva); 1706 if (pmap_valid_entry(PTE_BASE[p1i])) 1707 PTE_BASE[p1i] |= pmap_pg_g; 1708 } 1709 1710 /* kernel segments */ 1711 for (i = 0; i < BTSPACE_NSEGS; i++) { 1712 if (bootspace.segs[i].type == BTSEG_NONE) { 1713 continue; 1714 } 1715 kva = bootspace.segs[i].va; 1716 kva_end = kva + bootspace.segs[i].sz; 1717 for ( ; kva < kva_end; kva += PAGE_SIZE) { 1718 p1i = pl1_i(kva); 1719 if (pmap_valid_entry(PTE_BASE[p1i])) 1720 PTE_BASE[p1i] |= pmap_pg_g; 1721 } 1722 } 1723 1724 /* boot space */ 1725 kva = bootspace.boot.va; 1726 kva_end = kva + bootspace.boot.sz; 1727 for ( ; kva < kva_end; kva += PAGE_SIZE) { 1728 p1i = pl1_i(kva); 1729 if (pmap_valid_entry(PTE_BASE[p1i])) 1730 PTE_BASE[p1i] |= pmap_pg_g; 1731 } 1732 } 1733 #endif 1734 1735 #ifndef XENPV 1736 /* 1737 * Remap several kernel segments with large pages. We cover as many pages as we 1738 * can. Called only once at boot time, if the CPU supports large pages. 1739 */ 1740 static void 1741 pmap_remap_largepages(void) 1742 { 1743 pd_entry_t *pde; 1744 vaddr_t kva, kva_end; 1745 paddr_t pa; 1746 size_t i; 1747 1748 /* Remap the kernel text using large pages. */ 1749 for (i = 0; i < BTSPACE_NSEGS; i++) { 1750 if (bootspace.segs[i].type != BTSEG_TEXT) { 1751 continue; 1752 } 1753 kva = roundup(bootspace.segs[i].va, NBPD_L2); 1754 if (kva < bootspace.segs[i].va) { 1755 continue; 1756 } 1757 kva_end = rounddown(bootspace.segs[i].va + 1758 bootspace.segs[i].sz, NBPD_L2); 1759 pa = roundup(bootspace.segs[i].pa, NBPD_L2); 1760 for (/* */; kva < kva_end; kva += NBPD_L2, pa += NBPD_L2) { 1761 pde = &L2_BASE[pl2_i(kva)]; 1762 *pde = pa | pmap_pg_g | PTE_PS | PTE_P; 1763 tlbflushg(); 1764 } 1765 } 1766 1767 /* Remap the kernel rodata using large pages. */ 1768 for (i = 0; i < BTSPACE_NSEGS; i++) { 1769 if (bootspace.segs[i].type != BTSEG_RODATA) { 1770 continue; 1771 } 1772 kva = roundup(bootspace.segs[i].va, NBPD_L2); 1773 if (kva < bootspace.segs[i].va) { 1774 continue; 1775 } 1776 kva_end = rounddown(bootspace.segs[i].va + 1777 bootspace.segs[i].sz, NBPD_L2); 1778 pa = roundup(bootspace.segs[i].pa, NBPD_L2); 1779 for (/* */; kva < kva_end; kva += NBPD_L2, pa += NBPD_L2) { 1780 pde = &L2_BASE[pl2_i(kva)]; 1781 *pde = pa | pmap_pg_g | PTE_PS | pmap_pg_nx | PTE_P; 1782 tlbflushg(); 1783 } 1784 } 1785 1786 /* Remap the kernel data+bss using large pages. */ 1787 for (i = 0; i < BTSPACE_NSEGS; i++) { 1788 if (bootspace.segs[i].type != BTSEG_DATA) { 1789 continue; 1790 } 1791 kva = roundup(bootspace.segs[i].va, NBPD_L2); 1792 if (kva < bootspace.segs[i].va) { 1793 continue; 1794 } 1795 kva_end = rounddown(bootspace.segs[i].va + 1796 bootspace.segs[i].sz, NBPD_L2); 1797 pa = roundup(bootspace.segs[i].pa, NBPD_L2); 1798 for (/* */; kva < kva_end; kva += NBPD_L2, pa += NBPD_L2) { 1799 pde = &L2_BASE[pl2_i(kva)]; 1800 *pde = pa | pmap_pg_g | PTE_PS | pmap_pg_nx | PTE_W | PTE_P; 1801 tlbflushg(); 1802 } 1803 } 1804 } 1805 #endif /* !XENPV */ 1806 1807 /* 1808 * pmap_init: called from uvm_init, our job is to get the pmap system ready 1809 * to manage mappings. 1810 */ 1811 void 1812 pmap_init(void) 1813 { 1814 int flags; 1815 1816 /* 1817 * initialize caches. 
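 * (the pmap pool cache, the PDP pool, and the pv_entry pool cache)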
1818 */ 1819 1820 pool_cache_bootstrap(&pmap_cache, sizeof(struct pmap), COHERENCY_UNIT, 1821 0, 0, "pmappl", NULL, IPL_NONE, pmap_ctor, pmap_dtor, NULL); 1822 1823 #ifdef XENPV 1824 /* 1825 * pool_cache(9) should not touch cached objects, since they 1826 * are pinned on xen and R/O for the domU 1827 */ 1828 flags = PR_NOTOUCH; 1829 #else 1830 flags = 0; 1831 #endif 1832 1833 #ifdef PAE 1834 pool_init(&pmap_pdp_pool, PAGE_SIZE * PDP_SIZE, 0, 0, flags, 1835 "pdppl", &pmap_pdp_allocator, IPL_NONE); 1836 #else 1837 pool_init(&pmap_pdp_pool, PAGE_SIZE, 0, 0, flags, 1838 "pdppl", NULL, IPL_NONE); 1839 #endif 1840 pool_cache_bootstrap(&pmap_pv_cache, sizeof(struct pv_entry), 1841 #ifdef _LP64 1842 coherency_unit, 1843 #else 1844 coherency_unit / 2, 1845 #endif 1846 0, PR_LARGECACHE, "pvpl", &pool_allocator_kmem, 1847 IPL_NONE, NULL, NULL, NULL); 1848 1849 pmap_tlb_init(); 1850 1851 /* XXX: Since cpu_hatch() is only for secondary CPUs. */ 1852 pmap_tlb_cpu_init(curcpu()); 1853 1854 evcnt_attach_dynamic(&pmap_iobmp_evcnt, EVCNT_TYPE_MISC, 1855 NULL, "x86", "io bitmap copy"); 1856 evcnt_attach_dynamic(&pmap_ldt_evcnt, EVCNT_TYPE_MISC, 1857 NULL, "x86", "ldt sync"); 1858 1859 /* 1860 * The kernel doesn't keep track of PTPs, so there's nowhere handy 1861 * to hang a tree of pv_entry records. Dynamically allocated 1862 * pv_entry lists are not heavily used in the kernel's pmap (the 1863 * usual case is embedded), so cop out and use a single RB tree 1864 * to cover them. 1865 */ 1866 rb_tree_init(&pmap_kernel_rb, &pmap_rbtree_ops); 1867 1868 /* 1869 * done: pmap module is up (and ready for business) 1870 */ 1871 1872 pmap_initialized = true; 1873 } 1874 1875 #ifndef XENPV 1876 /* 1877 * pmap_cpu_init_late: perform late per-CPU initialization. 1878 */ 1879 void 1880 pmap_cpu_init_late(struct cpu_info *ci) 1881 { 1882 /* 1883 * The BP has already its own PD page allocated during early 1884 * MD startup. 
1885 */ 1886 if (ci == &cpu_info_primary) 1887 return; 1888 #ifdef PAE 1889 cpu_alloc_l3_page(ci); 1890 #endif 1891 } 1892 #endif 1893 1894 #ifndef __HAVE_DIRECT_MAP 1895 CTASSERT(CACHE_LINE_SIZE > sizeof(pt_entry_t)); 1896 CTASSERT(CACHE_LINE_SIZE % sizeof(pt_entry_t) == 0); 1897 1898 static void 1899 pmap_vpage_cpualloc(struct cpu_info *ci) 1900 { 1901 bool primary = (ci == &cpu_info_primary); 1902 size_t i, npages; 1903 vaddr_t vabase; 1904 vsize_t vrange; 1905 1906 npages = (CACHE_LINE_SIZE / sizeof(pt_entry_t)); 1907 KASSERT(npages >= VPAGE_MAX); 1908 vrange = npages * PAGE_SIZE; 1909 1910 if (primary) { 1911 while ((vabase = pmap_bootstrap_valloc(1)) % vrange != 0) { 1912 /* Waste some pages to align properly */ 1913 } 1914 /* The base is aligned, allocate the rest (contiguous) */ 1915 pmap_bootstrap_valloc(npages - 1); 1916 } else { 1917 vabase = uvm_km_alloc(kernel_map, vrange, vrange, 1918 UVM_KMF_VAONLY); 1919 if (vabase == 0) { 1920 panic("%s: failed to allocate tmp VA for CPU %d\n", 1921 __func__, cpu_index(ci)); 1922 } 1923 } 1924 1925 KASSERT((vaddr_t)&PTE_BASE[pl1_i(vabase)] % CACHE_LINE_SIZE == 0); 1926 1927 for (i = 0; i < VPAGE_MAX; i++) { 1928 ci->vpage[i] = vabase + i * PAGE_SIZE; 1929 ci->vpage_pte[i] = PTE_BASE + pl1_i(ci->vpage[i]); 1930 } 1931 } 1932 1933 void 1934 pmap_vpage_cpu_init(struct cpu_info *ci) 1935 { 1936 if (ci == &cpu_info_primary) { 1937 /* cpu0 already taken care of in pmap_bootstrap */ 1938 return; 1939 } 1940 1941 pmap_vpage_cpualloc(ci); 1942 } 1943 #endif 1944 1945 /* 1946 * p v _ e n t r y f u n c t i o n s 1947 */ 1948 1949 /* 1950 * pmap_free_pvs: free a linked list of pv entries. the pv entries have 1951 * been removed from their respective pages, but are still entered into the 1952 * map and we must undo that. 1953 * 1954 * => must be called with pmap locked. 1955 */ 1956 static void 1957 pmap_free_pvs(struct pmap *pmap, struct pv_entry *pve) 1958 { 1959 struct pv_entry *next; 1960 1961 KASSERT(mutex_owned(&pmap->pm_lock)); 1962 1963 for ( /* null */ ; pve != NULL ; pve = next) { 1964 next = pve->pve_next; 1965 pool_cache_put(&pmap_pv_cache, pve); 1966 } 1967 } 1968 1969 /* 1970 * pmap_check_pv: verify {VA, PTP} pair is either tracked/untracked by page 1971 */ 1972 static void 1973 pmap_check_pv(struct pmap *pmap, struct vm_page *ptp, struct pmap_page *pp, 1974 vaddr_t va, bool tracked) 1975 { 1976 #ifdef DEBUG 1977 struct pv_pte *pvpte; 1978 1979 PMAP_CHECK_PP(pp); 1980 1981 mutex_spin_enter(&pp->pp_lock); 1982 for (pvpte = pv_pte_first(pp); pvpte; pvpte = pv_pte_next(pp, pvpte)) { 1983 if (pvpte->pte_ptp == ptp && pvpte->pte_va == va) { 1984 break; 1985 } 1986 } 1987 mutex_spin_exit(&pp->pp_lock); 1988 1989 if (pvpte && !tracked) { 1990 panic("pmap_check_pv: %p/%lx found on pp %p", ptp, va, pp); 1991 } else if (!pvpte && tracked) { 1992 panic("pmap_check_pv: %p/%lx missing on pp %p", ptp, va, pp); 1993 } 1994 #endif 1995 } 1996 1997 /* 1998 * pmap_treelookup_pv: search the PV tree for a dynamic entry 1999 * 2000 * => pmap must be locked 2001 */ 2002 static struct pv_entry * 2003 pmap_treelookup_pv(const struct pmap *pmap, const struct vm_page *ptp, 2004 const rb_tree_t *tree, const vaddr_t va) 2005 { 2006 struct pv_entry *pve; 2007 rb_node_t *node; 2008 2009 /* 2010 * Inlined lookup tailored for exactly what's needed here that is 2011 * quite a bit faster than using rb_tree_find_node(). 
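 * The descent compares only pte_va: return on an exact match, bail
 * out on a sentinel, otherwise follow rb_nodes[node VA < wanted VA]
 * down one level and repeat.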
2012 */ 2013 for (node = tree->rbt_root;;) { 2014 if (__predict_false(RB_SENTINEL_P(node))) { 2015 return NULL; 2016 } 2017 pve = (struct pv_entry *) 2018 ((uintptr_t)node - offsetof(struct pv_entry, pve_rb)); 2019 if (pve->pve_pte.pte_va == va) { 2020 KASSERT(pve->pve_pte.pte_ptp == ptp); 2021 return pve; 2022 } 2023 node = node->rb_nodes[pve->pve_pte.pte_va < va]; 2024 } 2025 } 2026 2027 /* 2028 * pmap_lookup_pv: look up a non-embedded pv entry for the given pmap 2029 * 2030 * => a PV entry must be known present (doesn't check for existence) 2031 * => pmap must be locked 2032 */ 2033 static struct pv_entry * 2034 pmap_lookup_pv(const struct pmap *pmap, const struct vm_page *ptp, 2035 const struct pmap_page * const old_pp, const vaddr_t va) 2036 { 2037 struct pv_entry *pve; 2038 const rb_tree_t *tree; 2039 2040 KASSERT(mutex_owned(&pmap->pm_lock)); 2041 KASSERT(ptp != NULL || pmap == pmap_kernel()); 2042 2043 /* 2044 * [This mostly deals with the case of process-private pages, i.e. 2045 * anonymous memory allocations or COW.] 2046 * 2047 * If the page is tracked with an embedded entry then the tree 2048 * lookup can be avoided. It's safe to check for this specific 2049 * set of values without pp_lock because both will only ever be 2050 * set together for this pmap. 2051 * 2052 */ 2053 if (atomic_load_relaxed(&old_pp->pp_pte.pte_ptp) == ptp && 2054 atomic_load_relaxed(&old_pp->pp_pte.pte_va) == va) { 2055 return NULL; 2056 } 2057 2058 /* 2059 * [This mostly deals with shared mappings, for example shared libs 2060 * and executables.] 2061 * 2062 * Optimise for pmap_remove_ptes() which works by ascending scan: 2063 * look at the lowest numbered node in the tree first. The tree is 2064 * known non-empty because of the check above. For short lived 2065 * processes where pmap_remove() isn't used much this gets close to 2066 * a 100% hit rate. 2067 */ 2068 tree = (ptp != NULL ? &VM_PAGE_TO_PP(ptp)->pp_rb : &pmap_kernel_rb); 2069 KASSERT(!RB_SENTINEL_P(tree->rbt_root)); 2070 pve = (struct pv_entry *) 2071 ((uintptr_t)tree->rbt_minmax[RB_DIR_LEFT] - 2072 offsetof(struct pv_entry, pve_rb)); 2073 if (__predict_true(pve->pve_pte.pte_va == va)) { 2074 KASSERT(pve->pve_pte.pte_ptp == ptp); 2075 return pve; 2076 } 2077 2078 /* Search the RB tree for the key (uncommon). */ 2079 return pmap_treelookup_pv(pmap, ptp, tree, va); 2080 } 2081 2082 /* 2083 * pmap_enter_pv: enter a mapping onto a pmap_page lst 2084 * 2085 * => pmap must be locked 2086 * => does NOT insert dynamic entries to tree (pmap_enter() does later) 2087 */ 2088 static int 2089 pmap_enter_pv(struct pmap *pmap, struct pmap_page *pp, struct vm_page *ptp, 2090 vaddr_t va, struct pv_entry **new_pve, struct pv_entry **old_pve, 2091 bool *samepage, bool *new_embedded, rb_tree_t *tree) 2092 { 2093 struct pv_entry *pve; 2094 int error; 2095 2096 KASSERT(mutex_owned(&pmap->pm_lock)); 2097 KASSERT(ptp_to_pmap(ptp) == pmap); 2098 KASSERT(ptp == NULL || ptp->uobject != NULL); 2099 KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset); 2100 PMAP_CHECK_PP(pp); 2101 2102 /* 2103 * If entering the same page and it's already tracked with an 2104 * embedded entry, we can avoid the expense below. It's safe 2105 * to check for this very specific set of values without a lock 2106 * because both will only ever be set together for this pmap. 
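 * (A pmap_page tracks one mapping "embedded" in pp_pte; any further
 * mappings of the same page go onto pp_pvlist as dynamic pv_entries.)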
2107 */ 2108 if (atomic_load_relaxed(&pp->pp_pte.pte_ptp) == ptp && 2109 atomic_load_relaxed(&pp->pp_pte.pte_va) == va) { 2110 *samepage = true; 2111 pmap_check_pv(pmap, ptp, pp, va, true); 2112 return 0; 2113 } 2114 2115 /* 2116 * Check for an existing dynamic mapping at this address. If it's 2117 * for the same page, then it will be reused and nothing needs to be 2118 * changed. 2119 */ 2120 *old_pve = pmap_treelookup_pv(pmap, ptp, tree, va); 2121 if (*old_pve != NULL && (*old_pve)->pve_pp == pp) { 2122 *samepage = true; 2123 pmap_check_pv(pmap, ptp, pp, va, true); 2124 return 0; 2125 } 2126 2127 /* 2128 * Need to put a new mapping in place. Grab a spare pv_entry in 2129 * case it's needed; won't know for sure until the lock is taken. 2130 */ 2131 if (pmap->pm_pve == NULL) { 2132 pmap->pm_pve = pool_cache_get(&pmap_pv_cache, PR_NOWAIT); 2133 } 2134 2135 error = 0; 2136 pmap_check_pv(pmap, ptp, pp, va, false); 2137 mutex_spin_enter(&pp->pp_lock); 2138 if (!pv_pte_embedded(pp)) { 2139 /* 2140 * Embedded PV tracking available - easy. 2141 */ 2142 pp->pp_pte.pte_ptp = ptp; 2143 pp->pp_pte.pte_va = va; 2144 *new_embedded = true; 2145 } else if (__predict_false(pmap->pm_pve == NULL)) { 2146 /* 2147 * No memory. 2148 */ 2149 error = ENOMEM; 2150 } else { 2151 /* 2152 * Install new pv_entry on the page. 2153 */ 2154 pve = pmap->pm_pve; 2155 pmap->pm_pve = NULL; 2156 *new_pve = pve; 2157 pve->pve_pte.pte_ptp = ptp; 2158 pve->pve_pte.pte_va = va; 2159 pve->pve_pp = pp; 2160 LIST_INSERT_HEAD(&pp->pp_pvlist, pve, pve_list); 2161 } 2162 mutex_spin_exit(&pp->pp_lock); 2163 if (error == 0) { 2164 pmap_check_pv(pmap, ptp, pp, va, true); 2165 } 2166 2167 return error; 2168 } 2169 2170 /* 2171 * pmap_remove_pv: try to remove a mapping from a pv_list 2172 * 2173 * => pmap must be locked 2174 * => removes dynamic entries from tree 2175 * => caller should adjust ptp's wire_count and free PTP if needed 2176 */ 2177 static void 2178 pmap_remove_pv(struct pmap *pmap, struct pmap_page *pp, struct vm_page *ptp, 2179 vaddr_t va, struct pv_entry *pve, uint8_t oattrs) 2180 { 2181 rb_tree_t *tree = (ptp != NULL ? 
2182 &VM_PAGE_TO_PP(ptp)->pp_rb : &pmap_kernel_rb); 2183 2184 KASSERT(mutex_owned(&pmap->pm_lock)); 2185 KASSERT(ptp_to_pmap(ptp) == pmap); 2186 KASSERT(ptp == NULL || ptp->uobject != NULL); 2187 KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset); 2188 KASSERT(ptp != NULL || pmap == pmap_kernel()); 2189 2190 pmap_check_pv(pmap, ptp, pp, va, true); 2191 2192 if (pve == NULL) { 2193 mutex_spin_enter(&pp->pp_lock); 2194 KASSERT(pp->pp_pte.pte_ptp == ptp); 2195 KASSERT(pp->pp_pte.pte_va == va); 2196 pp->pp_attrs |= oattrs; 2197 pp->pp_pte.pte_ptp = NULL; 2198 pp->pp_pte.pte_va = 0; 2199 mutex_spin_exit(&pp->pp_lock); 2200 } else { 2201 mutex_spin_enter(&pp->pp_lock); 2202 KASSERT(pp->pp_pte.pte_ptp != ptp || 2203 pp->pp_pte.pte_va != va); 2204 KASSERT(pve->pve_pte.pte_ptp == ptp); 2205 KASSERT(pve->pve_pte.pte_va == va); 2206 KASSERT(pve->pve_pp == pp); 2207 pp->pp_attrs |= oattrs; 2208 LIST_REMOVE(pve, pve_list); 2209 mutex_spin_exit(&pp->pp_lock); 2210 2211 KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == pve); 2212 rb_tree_remove_node(tree, pve); 2213 #ifdef DIAGNOSTIC 2214 memset(pve, 0, sizeof(*pve)); 2215 #endif 2216 } 2217 2218 KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == NULL); 2219 pmap_check_pv(pmap, ptp, pp, va, false); 2220 } 2221 2222 /* 2223 * p t p f u n c t i o n s 2224 */ 2225 2226 static struct vm_page * 2227 pmap_find_ptp(struct pmap *pmap, vaddr_t va, int level) 2228 { 2229 int lidx = level - 1; 2230 off_t off = ptp_va2o(va, level); 2231 struct vm_page *pg; 2232 2233 KASSERT(mutex_owned(&pmap->pm_lock)); 2234 2235 if (pmap->pm_ptphint[lidx] && off == pmap->pm_ptphint[lidx]->offset) { 2236 KASSERT(pmap->pm_ptphint[lidx]->wire_count > 0); 2237 pg = pmap->pm_ptphint[lidx]; 2238 PMAP_CHECK_PP(VM_PAGE_TO_PP(pg)); 2239 return pg; 2240 } 2241 PMAP_DUMMY_LOCK(pmap); 2242 pg = uvm_pagelookup(&pmap->pm_obj[lidx], off); 2243 PMAP_DUMMY_UNLOCK(pmap); 2244 if (pg != NULL && __predict_false(pg->wire_count == 0)) { 2245 /* This page is queued to be freed - ignore. */ 2246 pg = NULL; 2247 } 2248 if (pg != NULL) { 2249 PMAP_CHECK_PP(VM_PAGE_TO_PP(pg)); 2250 } 2251 pmap->pm_ptphint[lidx] = pg; 2252 return pg; 2253 } 2254 2255 static inline void 2256 pmap_freepage(struct pmap *pmap, struct vm_page *ptp, int level) 2257 { 2258 int lidx; 2259 2260 KASSERT(ptp->wire_count <= 1); 2261 PMAP_CHECK_PP(VM_PAGE_TO_PP(ptp)); 2262 2263 lidx = level - 1; 2264 pmap_stats_update(pmap, -ptp->wire_count, 0); 2265 if (pmap->pm_ptphint[lidx] == ptp) 2266 pmap->pm_ptphint[lidx] = NULL; 2267 ptp->wire_count = 0; 2268 ptp->uanon = NULL; 2269 KASSERT(RB_TREE_MIN(&VM_PAGE_TO_PP(ptp)->pp_rb) == NULL); 2270 2271 /* 2272 * Enqueue the PTP to be freed by pmap_update(). We can't remove 2273 * the page from the uvm_object, as that can take further locks 2274 * (intolerable right now because the PTEs are likely mapped in). 2275 * Instead mark the PTP as free and if we bump into it again, we'll 2276 * either ignore or reuse (depending on what's useful at the time). 
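 * The "reuse" case is pmap_get_ptp(), which takes a page found with
 * wire_count == 0 back off pm_gc_ptp instead of allocating a fresh
 * one; pmap_find_ptp() is the "ignore" case.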
2277 */ 2278 LIST_INSERT_HEAD(&pmap->pm_gc_ptp, ptp, mdpage.mp_pp.pp_link); 2279 } 2280 2281 static void 2282 pmap_free_ptp(struct pmap *pmap, struct vm_page *ptp, vaddr_t va, 2283 pt_entry_t *ptes, pd_entry_t * const *pdes) 2284 { 2285 unsigned long index; 2286 int level; 2287 vaddr_t invaladdr; 2288 pd_entry_t opde; 2289 2290 KASSERT(pmap != pmap_kernel()); 2291 KASSERT(mutex_owned(&pmap->pm_lock)); 2292 KASSERT(kpreempt_disabled()); 2293 2294 level = 1; 2295 do { 2296 index = pl_i(va, level + 1); 2297 opde = pmap_pte_testset(&pdes[level - 1][index], 0); 2298 2299 /* 2300 * On Xen-amd64 or SVS, we need to sync the top level page 2301 * directory on each CPU. 2302 */ 2303 #if defined(XENPV) && defined(__x86_64__) 2304 if (level == PTP_LEVELS - 1) { 2305 xen_kpm_sync(pmap, index); 2306 } 2307 #elif defined(SVS) 2308 if (svs_enabled && level == PTP_LEVELS - 1) { 2309 svs_pmap_sync(pmap, index); 2310 } 2311 #endif 2312 2313 invaladdr = level == 1 ? (vaddr_t)ptes : 2314 (vaddr_t)pdes[level - 2]; 2315 pmap_tlb_shootdown(pmap, invaladdr + index * PAGE_SIZE, 2316 opde, TLBSHOOT_FREE_PTP); 2317 2318 #if defined(XENPV) 2319 pmap_tlb_shootnow(); 2320 #endif 2321 2322 pmap_freepage(pmap, ptp, level); 2323 if (level < PTP_LEVELS - 1) { 2324 ptp = pmap_find_ptp(pmap, va, level + 1); 2325 ptp->wire_count--; 2326 if (ptp->wire_count > 1) 2327 break; 2328 } 2329 } while (++level < PTP_LEVELS); 2330 pmap_pte_flush(); 2331 } 2332 2333 /* 2334 * pmap_get_ptp: get a PTP (if there isn't one, allocate a new one) 2335 * 2336 * => pmap should NOT be pmap_kernel() 2337 * => pmap should be locked 2338 * => we are not touching any PTEs yet, so they need not be mapped in 2339 */ 2340 static int 2341 pmap_get_ptp(struct pmap *pmap, struct pmap_ptparray *pt, vaddr_t va, 2342 int flags, struct vm_page **resultp) 2343 { 2344 struct vm_page *ptp; 2345 int i, aflags; 2346 struct uvm_object *obj; 2347 voff_t off; 2348 2349 KASSERT(pmap != pmap_kernel()); 2350 KASSERT(mutex_owned(&pmap->pm_lock)); 2351 2352 /* 2353 * Loop through all page table levels allocating a page 2354 * for any level where we don't already have one. 2355 */ 2356 memset(pt, 0, sizeof(*pt)); 2357 aflags = ((flags & PMAP_CANFAIL) ? 0 : UVM_PGA_USERESERVE) | 2358 UVM_PGA_ZERO; 2359 for (i = PTP_LEVELS; i > 1; i--) { 2360 obj = &pmap->pm_obj[i - 2]; 2361 off = ptp_va2o(va, i - 1); 2362 2363 PMAP_DUMMY_LOCK(pmap); 2364 pt->pg[i] = uvm_pagelookup(obj, off); 2365 2366 if (pt->pg[i] == NULL) { 2367 pt->pg[i] = uvm_pagealloc(obj, off, NULL, aflags); 2368 pt->alloced[i] = (pt->pg[i] != NULL); 2369 } else if (pt->pg[i]->wire_count == 0) { 2370 /* This page was queued to be freed; dequeue it. 
*/ 2371 LIST_REMOVE(pt->pg[i], mdpage.mp_pp.pp_link); 2372 pt->alloced[i] = true; 2373 } 2374 PMAP_DUMMY_UNLOCK(pmap); 2375 if (pt->pg[i] == NULL) { 2376 pmap_unget_ptp(pmap, pt); 2377 return ENOMEM; 2378 } else if (pt->alloced[i]) { 2379 pt->pg[i]->uanon = (struct vm_anon *)(vaddr_t)~0L; 2380 rb_tree_init(&VM_PAGE_TO_PP(pt->pg[i])->pp_rb, 2381 &pmap_rbtree_ops); 2382 PMAP_CHECK_PP(VM_PAGE_TO_PP(pt->pg[i])); 2383 } 2384 } 2385 ptp = pt->pg[2]; 2386 KASSERT(ptp != NULL); 2387 *resultp = ptp; 2388 pmap->pm_ptphint[0] = ptp; 2389 return 0; 2390 } 2391 2392 /* 2393 * pmap_install_ptp: install any freshly allocated PTPs 2394 * 2395 * => pmap should NOT be pmap_kernel() 2396 * => pmap should be locked 2397 * => PTEs must be mapped 2398 * => preemption must be disabled 2399 */ 2400 static void 2401 pmap_install_ptp(struct pmap *pmap, struct pmap_ptparray *pt, vaddr_t va, 2402 pd_entry_t * const *pdes) 2403 { 2404 struct vm_page *ptp; 2405 unsigned long index; 2406 pd_entry_t *pva; 2407 paddr_t pa; 2408 int i; 2409 2410 KASSERT(pmap != pmap_kernel()); 2411 KASSERT(mutex_owned(&pmap->pm_lock)); 2412 KASSERT(kpreempt_disabled()); 2413 2414 /* 2415 * Now that we have all the pages looked up or allocated, 2416 * loop through again installing any new ones into the tree. 2417 */ 2418 for (i = PTP_LEVELS; i > 1; i--) { 2419 index = pl_i(va, i); 2420 pva = pdes[i - 2]; 2421 2422 if (pmap_valid_entry(pva[index])) { 2423 KASSERT(!pt->alloced[i]); 2424 continue; 2425 } 2426 2427 ptp = pt->pg[i]; 2428 ptp->flags &= ~PG_BUSY; /* never busy */ 2429 ptp->wire_count = 1; 2430 pmap->pm_ptphint[i - 2] = ptp; 2431 pa = VM_PAGE_TO_PHYS(ptp); 2432 pmap_pte_set(&pva[index], (pd_entry_t) 2433 (pmap_pa2pte(pa) | PTE_U | PTE_W | PTE_P)); 2434 2435 /* 2436 * On Xen-amd64 or SVS, we need to sync the top level page 2437 * directory on each CPU. 2438 */ 2439 #if defined(XENPV) && defined(__x86_64__) 2440 if (i == PTP_LEVELS) { 2441 xen_kpm_sync(pmap, index); 2442 } 2443 #elif defined(SVS) 2444 if (svs_enabled && i == PTP_LEVELS) { 2445 svs_pmap_sync(pmap, index); 2446 } 2447 #endif 2448 2449 pmap_pte_flush(); 2450 pmap_stats_update(pmap, 1, 0); 2451 2452 /* 2453 * If we're not in the top level, increase the 2454 * wire count of the parent page. 2455 */ 2456 if (i < PTP_LEVELS) { 2457 pt->pg[i + 1]->wire_count++; 2458 } 2459 } 2460 } 2461 2462 /* 2463 * pmap_unget_ptp: free unusued PTPs 2464 * 2465 * => pmap should NOT be pmap_kernel() 2466 * => pmap should be locked 2467 */ 2468 static void 2469 pmap_unget_ptp(struct pmap *pmap, struct pmap_ptparray *pt) 2470 { 2471 int i; 2472 2473 KASSERT(pmap != pmap_kernel()); 2474 KASSERT(mutex_owned(&pmap->pm_lock)); 2475 2476 for (i = PTP_LEVELS; i > 1; i--) { 2477 if (!pt->alloced[i]) { 2478 continue; 2479 } 2480 KASSERT(pt->pg[i]->wire_count == 0); 2481 PMAP_CHECK_PP(VM_PAGE_TO_PP(pt->pg[i])); 2482 pmap_freepage(pmap, pt->pg[i], i - 1); 2483 } 2484 } 2485 2486 /* 2487 * p m a p l i f e c y c l e f u n c t i o n s 2488 */ 2489 2490 /* 2491 * pmap_pdp_init: constructor a new PDP. 2492 */ 2493 static void 2494 pmap_pdp_init(pd_entry_t *pdir) 2495 { 2496 paddr_t pdirpa = 0; 2497 vaddr_t object; 2498 int i; 2499 2500 #if !defined(XENPV) || !defined(__x86_64__) 2501 int npde; 2502 #endif 2503 #ifdef XENPV 2504 int s; 2505 #endif 2506 2507 memset(pdir, 0, PDP_SIZE * PAGE_SIZE); 2508 2509 /* 2510 * NOTE: This is all done unlocked, but we will check afterwards 2511 * if we have raced with pmap_growkernel(). 
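 * (The check is done in pmap_ctor(): after taking pmaps_lock it looks
 * at the topmost kernel PDE slot and calls this routine again if that
 * slot is still empty, presumably because nkptp grew under our feet.)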
2512 */ 2513 2514 #if defined(XENPV) && defined(__x86_64__) 2515 /* Fetch the physical address of the page directory */ 2516 (void)pmap_extract(pmap_kernel(), (vaddr_t)pdir, &pdirpa); 2517 2518 /* 2519 * This pdir will NEVER be active in kernel mode, so mark 2520 * recursive entry invalid. 2521 */ 2522 pdir[PDIR_SLOT_PTE] = pmap_pa2pte(pdirpa); 2523 2524 /* 2525 * PDP constructed this way won't be for the kernel, hence we 2526 * don't put kernel mappings on Xen. 2527 * 2528 * But we need to make pmap_create() happy, so put a dummy 2529 * (without PTE_P) value at the right place. 2530 */ 2531 pdir[PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1] - 1] = 2532 (pd_entry_t)-1 & PTE_FRAME; 2533 #else /* XENPV && __x86_64__*/ 2534 object = (vaddr_t)pdir; 2535 for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) { 2536 /* Fetch the physical address of the page directory */ 2537 (void)pmap_extract(pmap_kernel(), object, &pdirpa); 2538 2539 /* Put in recursive PDE to map the PTEs */ 2540 pdir[PDIR_SLOT_PTE + i] = pmap_pa2pte(pdirpa) | PTE_P | 2541 pmap_pg_nx; 2542 #ifndef XENPV 2543 pdir[PDIR_SLOT_PTE + i] |= PTE_W; 2544 #endif 2545 } 2546 2547 /* Copy the kernel's top level PDE */ 2548 npde = nkptp[PTP_LEVELS - 1]; 2549 2550 memcpy(&pdir[PDIR_SLOT_KERN], &PDP_BASE[PDIR_SLOT_KERN], 2551 npde * sizeof(pd_entry_t)); 2552 2553 if (VM_MIN_KERNEL_ADDRESS != KERNBASE) { 2554 int idx = pl_i(KERNBASE, PTP_LEVELS); 2555 pdir[idx] = PDP_BASE[idx]; 2556 } 2557 2558 #ifdef __HAVE_PCPU_AREA 2559 pdir[PDIR_SLOT_PCPU] = PDP_BASE[PDIR_SLOT_PCPU]; 2560 #endif 2561 #ifdef __HAVE_DIRECT_MAP 2562 slotspace_copy(SLAREA_DMAP, pdir, PDP_BASE); 2563 #endif 2564 #ifdef KASAN 2565 slotspace_copy(SLAREA_ASAN, pdir, PDP_BASE); 2566 #endif 2567 #ifdef KMSAN 2568 slotspace_copy(SLAREA_MSAN, pdir, PDP_BASE); 2569 #endif 2570 #endif /* XENPV && __x86_64__*/ 2571 2572 #ifdef XENPV 2573 s = splvm(); 2574 object = (vaddr_t)pdir; 2575 pmap_protect(pmap_kernel(), object, object + (PAGE_SIZE * PDP_SIZE), 2576 VM_PROT_READ); 2577 pmap_update(pmap_kernel()); 2578 for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) { 2579 /* 2580 * pin as L2/L4 page, we have to do the page with the 2581 * PDIR_SLOT_PTE entries last 2582 */ 2583 #ifdef PAE 2584 if (i == l2tol3(PDIR_SLOT_PTE)) 2585 continue; 2586 #endif 2587 2588 (void) pmap_extract(pmap_kernel(), object, &pdirpa); 2589 #ifdef __x86_64__ 2590 xpq_queue_pin_l4_table(xpmap_ptom_masked(pdirpa)); 2591 #else 2592 xpq_queue_pin_l2_table(xpmap_ptom_masked(pdirpa)); 2593 #endif 2594 } 2595 #ifdef PAE 2596 object = ((vaddr_t)pdir) + PAGE_SIZE * l2tol3(PDIR_SLOT_PTE); 2597 (void)pmap_extract(pmap_kernel(), object, &pdirpa); 2598 xpq_queue_pin_l2_table(xpmap_ptom_masked(pdirpa)); 2599 #endif 2600 splx(s); 2601 #endif /* XENPV */ 2602 } 2603 2604 /* 2605 * pmap_pdp_fini: destructor for the PDPs. 2606 */ 2607 static void 2608 pmap_pdp_fini(pd_entry_t *pdir) 2609 { 2610 #ifdef XENPV 2611 paddr_t pdirpa = 0; /* XXX: GCC */ 2612 vaddr_t object = (vaddr_t)pdir; 2613 int i; 2614 int s = splvm(); 2615 pt_entry_t *pte; 2616 2617 for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) { 2618 /* fetch the physical address of the page directory. 
*/ 2619 (void) pmap_extract(pmap_kernel(), object, &pdirpa); 2620 /* unpin page table */ 2621 xpq_queue_unpin_table(xpmap_ptom_masked(pdirpa)); 2622 } 2623 object = (vaddr_t)pdir; 2624 for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) { 2625 /* Set page RW again */ 2626 pte = kvtopte(object); 2627 pmap_pte_set(pte, *pte | PTE_W); 2628 xen_bcast_invlpg((vaddr_t)object); 2629 } 2630 splx(s); 2631 #endif /* XENPV */ 2632 } 2633 2634 #ifdef PAE 2635 static void * 2636 pmap_pdp_alloc(struct pool *pp, int flags) 2637 { 2638 return (void *)uvm_km_alloc(kernel_map, 2639 PAGE_SIZE * PDP_SIZE, PAGE_SIZE * PDP_SIZE, 2640 ((flags & PR_WAITOK) ? 0 : UVM_KMF_NOWAIT | UVM_KMF_TRYLOCK) | 2641 UVM_KMF_WIRED); 2642 } 2643 2644 static void 2645 pmap_pdp_free(struct pool *pp, void *v) 2646 { 2647 uvm_km_free(kernel_map, (vaddr_t)v, PAGE_SIZE * PDP_SIZE, 2648 UVM_KMF_WIRED); 2649 } 2650 #endif /* PAE */ 2651 2652 /* 2653 * pmap_ctor: constructor for the pmap cache. 2654 */ 2655 static int 2656 pmap_ctor(void *arg, void *obj, int flags) 2657 { 2658 struct pmap *pmap = obj; 2659 pt_entry_t p; 2660 int i; 2661 2662 KASSERT((flags & PR_WAITOK) != 0); 2663 2664 mutex_init(&pmap->pm_lock, MUTEX_DEFAULT, IPL_NONE); 2665 rw_init(&pmap->pm_dummy_lock); 2666 kcpuset_create(&pmap->pm_cpus, true); 2667 kcpuset_create(&pmap->pm_kernel_cpus, true); 2668 #ifdef XENPV 2669 kcpuset_create(&pmap->pm_xen_ptp_cpus, true); 2670 #endif 2671 LIST_INIT(&pmap->pm_gc_ptp); 2672 pmap->pm_pve = NULL; 2673 2674 /* allocate and init PDP */ 2675 pmap->pm_pdir = pool_get(&pmap_pdp_pool, PR_WAITOK); 2676 2677 for (;;) { 2678 pmap_pdp_init(pmap->pm_pdir); 2679 mutex_enter(&pmaps_lock); 2680 p = pmap->pm_pdir[PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1] - 1]; 2681 if (__predict_true(p != 0)) { 2682 break; 2683 } 2684 mutex_exit(&pmaps_lock); 2685 } 2686 2687 for (i = 0; i < PDP_SIZE; i++) 2688 pmap->pm_pdirpa[i] = 2689 pmap_pte2pa(pmap->pm_pdir[PDIR_SLOT_PTE + i]); 2690 2691 LIST_INSERT_HEAD(&pmaps, pmap, pm_list); 2692 mutex_exit(&pmaps_lock); 2693 2694 return 0; 2695 } 2696 2697 /* 2698 * pmap_ctor: destructor for the pmap cache. 2699 */ 2700 static void 2701 pmap_dtor(void *arg, void *obj) 2702 { 2703 struct pmap *pmap = obj; 2704 2705 if (pmap->pm_pve != NULL) { 2706 pool_cache_put(&pmap_pv_cache, pmap->pm_pve); 2707 } 2708 2709 mutex_enter(&pmaps_lock); 2710 LIST_REMOVE(pmap, pm_list); 2711 mutex_exit(&pmaps_lock); 2712 2713 pmap_pdp_fini(pmap->pm_pdir); 2714 pool_put(&pmap_pdp_pool, pmap->pm_pdir); 2715 mutex_destroy(&pmap->pm_lock); 2716 rw_destroy(&pmap->pm_dummy_lock); 2717 kcpuset_destroy(pmap->pm_cpus); 2718 kcpuset_destroy(pmap->pm_kernel_cpus); 2719 #ifdef XENPV 2720 kcpuset_destroy(pmap->pm_xen_ptp_cpus); 2721 #endif 2722 } 2723 2724 /* 2725 * pmap_create: create a pmap object. 2726 */ 2727 struct pmap * 2728 pmap_create(void) 2729 { 2730 struct pmap *pmap; 2731 int i; 2732 2733 pmap = pool_cache_get(&pmap_cache, PR_WAITOK); 2734 2735 /* init uvm_object */ 2736 for (i = 0; i < PTP_LEVELS - 1; i++) { 2737 uvm_obj_init(&pmap->pm_obj[i], &pmap_pager, false, 1); 2738 uvm_obj_setlock(&pmap->pm_obj[i], &pmap->pm_dummy_lock); 2739 pmap->pm_ptphint[i] = NULL; 2740 } 2741 pmap->pm_stats.wired_count = 0; 2742 /* count the PDP allocd below */ 2743 pmap->pm_stats.resident_count = PDP_SIZE; 2744 #if !defined(__x86_64__) 2745 pmap->pm_hiexec = 0; 2746 #endif 2747 2748 /* Used by NVMM. 
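 * These optional hooks stay NULL for an ordinary pmap; when set, the
 * generic pmap routines check for them and defer to the hook instead
 * (see the pm_remove/pm_extract/pm_sync_pv tests further down).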
*/ 2749 pmap->pm_enter = NULL; 2750 pmap->pm_extract = NULL; 2751 pmap->pm_remove = NULL; 2752 pmap->pm_sync_pv = NULL; 2753 pmap->pm_pp_remove_ent = NULL; 2754 pmap->pm_write_protect = NULL; 2755 pmap->pm_unwire = NULL; 2756 pmap->pm_tlb_flush = NULL; 2757 pmap->pm_data = NULL; 2758 2759 /* init the LDT */ 2760 pmap->pm_ldt = NULL; 2761 pmap->pm_ldt_len = 0; 2762 pmap->pm_ldt_sel = GSYSSEL(GLDT_SEL, SEL_KPL); 2763 2764 return (pmap); 2765 } 2766 2767 /* 2768 * pmap_check_ptps: verify that none of the pmap's page table objects 2769 * have any pages allocated to them. 2770 */ 2771 static void 2772 pmap_check_ptps(struct pmap *pmap) 2773 { 2774 int i; 2775 2776 for (i = 0; i < PTP_LEVELS - 1; i++) { 2777 KASSERTMSG(pmap->pm_obj[i].uo_npages == 0, 2778 "pmap %p level %d still has %d pages", 2779 pmap, i, (int)pmap->pm_obj[i].uo_npages); 2780 } 2781 } 2782 2783 static void 2784 pmap_check_inuse(struct pmap *pmap) 2785 { 2786 #ifdef DEBUG 2787 CPU_INFO_ITERATOR cii; 2788 struct cpu_info *ci; 2789 2790 for (CPU_INFO_FOREACH(cii, ci)) { 2791 if (ci->ci_pmap == pmap) 2792 panic("destroying pmap being used"); 2793 #if defined(XENPV) && defined(__x86_64__) 2794 for (int i = 0; i < PDIR_SLOT_USERLIM; i++) { 2795 if (pmap->pm_pdir[i] != 0 && 2796 ci->ci_kpm_pdir[i] == pmap->pm_pdir[i]) { 2797 printf("pmap_destroy(%p) pmap_kernel %p " 2798 "curcpu %d cpu %d ci_pmap %p " 2799 "ci->ci_kpm_pdir[%d]=%" PRIx64 2800 " pmap->pm_pdir[%d]=%" PRIx64 "\n", 2801 pmap, pmap_kernel(), curcpu()->ci_index, 2802 ci->ci_index, ci->ci_pmap, 2803 i, ci->ci_kpm_pdir[i], 2804 i, pmap->pm_pdir[i]); 2805 panic("%s: used pmap", __func__); 2806 } 2807 } 2808 #endif 2809 } 2810 #endif /* DEBUG */ 2811 } 2812 2813 /* 2814 * pmap_destroy: drop reference count on pmap. free pmap if reference 2815 * count goes to zero. 2816 * 2817 * => we can be called from pmap_unmap_ptes() with a different, unrelated 2818 * pmap's lock held. be careful! 2819 */ 2820 void 2821 pmap_destroy(struct pmap *pmap) 2822 { 2823 int i; 2824 2825 /* 2826 * drop reference count and verify not in use. 2827 */ 2828 2829 if (atomic_dec_uint_nv(&pmap->pm_obj[0].uo_refs) > 0) { 2830 return; 2831 } 2832 pmap_check_inuse(pmap); 2833 2834 /* 2835 * XXX handle deferred PTP page free for EPT. ordinarily this is 2836 * taken care of by pmap_remove_all(). once shared with EPT this 2837 * can go away. 2838 */ 2839 if (__predict_false(!LIST_EMPTY(&pmap->pm_gc_ptp))) { 2840 pmap_update(pmap); 2841 } 2842 2843 /* 2844 * Reference count is zero, free pmap resources and then free pmap. 2845 */ 2846 2847 pmap_check_ptps(pmap); 2848 KASSERT(LIST_EMPTY(&pmap->pm_gc_ptp)); 2849 2850 #ifdef USER_LDT 2851 if (pmap->pm_ldt != NULL) { 2852 /* 2853 * no need to switch the LDT; this address space is gone, 2854 * nothing is using it. 2855 * 2856 * No need to lock the pmap for ldt_free (or anything else), 2857 * we're the last one to use it. 2858 */ 2859 /* XXXAD can't take cpu_lock here - fix soon. 
*/ 2860 mutex_enter(&cpu_lock); 2861 ldt_free(pmap->pm_ldt_sel); 2862 mutex_exit(&cpu_lock); 2863 uvm_km_free(kernel_map, (vaddr_t)pmap->pm_ldt, 2864 pmap->pm_ldt_len, UVM_KMF_WIRED); 2865 } 2866 #endif 2867 2868 for (i = 0; i < PTP_LEVELS - 1; i++) { 2869 uvm_obj_destroy(&pmap->pm_obj[i], false); 2870 } 2871 kcpuset_zero(pmap->pm_cpus); 2872 kcpuset_zero(pmap->pm_kernel_cpus); 2873 #ifdef XENPV 2874 kcpuset_zero(pmap->pm_xen_ptp_cpus); 2875 #endif 2876 2877 pmap_check_ptps(pmap); 2878 if (__predict_false(pmap->pm_enter != NULL)) { 2879 /* XXX make this a different cache */ 2880 pool_cache_destruct_object(&pmap_cache, pmap); 2881 } else { 2882 pool_cache_put(&pmap_cache, pmap); 2883 } 2884 } 2885 2886 /* 2887 * pmap_zap_ptp: clear out an entire PTP without modifying PTEs 2888 * 2889 * => caller must hold pmap's lock 2890 * => PTP must be mapped into KVA 2891 * => must be called with kernel preemption disabled 2892 * => does as little work as possible 2893 */ 2894 static void 2895 pmap_zap_ptp(struct pmap *pmap, struct vm_page *ptp, pt_entry_t *pte, 2896 vaddr_t startva, vaddr_t blkendva, struct pv_entry **pv_tofree) 2897 { 2898 #ifndef XEN 2899 struct pv_entry *pve; 2900 struct vm_page *pg; 2901 struct pmap_page *pp; 2902 pt_entry_t opte; 2903 rb_tree_t *tree; 2904 vaddr_t va; 2905 int wired; 2906 uint8_t oattrs; 2907 u_int cnt; 2908 2909 KASSERT(mutex_owned(&pmap->pm_lock)); 2910 KASSERT(kpreempt_disabled()); 2911 KASSERT(pmap != pmap_kernel()); 2912 KASSERT(ptp->wire_count > 1); 2913 KASSERT(ptp->wire_count - 1 <= PAGE_SIZE / sizeof(pt_entry_t)); 2914 2915 /* 2916 * Start at the lowest entered VA, and scan until there are no more 2917 * PTEs in the PTPs. The goal is to disconnect PV entries and patch 2918 * up the pmap's stats. No PTEs will be modified. 2919 */ 2920 tree = &VM_PAGE_TO_PP(ptp)->pp_rb; 2921 pve = RB_TREE_MIN(tree); 2922 wired = 0; 2923 va = (vaddr_t)ptp->uanon; 2924 pte += ((va - startva) >> PAGE_SHIFT); 2925 2926 for (cnt = ptp->wire_count; cnt > 1; pte++, va += PAGE_SIZE) { 2927 opte = *pte; 2928 if (!pmap_valid_entry(opte)) { 2929 continue; 2930 } 2931 2932 /* 2933 * Count the PTE. If it's not for a managed mapping 2934 * there's noting more to do. 2935 */ 2936 cnt--; 2937 wired -= (opte & PTE_WIRED); 2938 if ((opte & PTE_PVLIST) == 0) { 2939 #ifndef DOM0OPS 2940 KASSERTMSG((PHYS_TO_VM_PAGE(pmap_pte2pa(opte)) == NULL), 2941 "managed page without PTE_PVLIST for %#" 2942 PRIxVADDR, va); 2943 KASSERTMSG((pmap_pv_tracked(pmap_pte2pa(opte)) == NULL), 2944 "pv-tracked page without PTE_PVLIST for %#" 2945 PRIxVADDR, va); 2946 #endif 2947 KASSERT(pmap_treelookup_pv(pmap, ptp, (ptp != NULL ? 2948 &VM_PAGE_TO_PP(ptp)->pp_rb : &pmap_kernel_rb), 2949 va) == NULL); 2950 continue; 2951 } 2952 2953 /* 2954 * "pve" now points to the lowest (by VA) dynamic PV entry 2955 * in the PTP. If it's for this VA, take advantage of it to 2956 * avoid calling PHYS_TO_VM_PAGE(). Avoid modifying the RB 2957 * tree by skipping to the next VA in the tree whenever 2958 * there is a match here. The tree will be cleared out in 2959 * one pass before return to pmap_remove_all(). 
2960 */ 2961 oattrs = pmap_pte_to_pp_attrs(opte); 2962 if (pve != NULL && pve->pve_pte.pte_va == va) { 2963 pp = pve->pve_pp; 2964 KASSERT(pve->pve_pte.pte_ptp == ptp); 2965 KASSERT(pp->pp_pte.pte_ptp != ptp || 2966 pp->pp_pte.pte_va != va); 2967 mutex_spin_enter(&pp->pp_lock); 2968 pp->pp_attrs |= oattrs; 2969 LIST_REMOVE(pve, pve_list); 2970 mutex_spin_exit(&pp->pp_lock); 2971 pve->pve_next = *pv_tofree; 2972 *pv_tofree = pve; 2973 pve = RB_TREE_NEXT(tree, pve); 2974 continue; 2975 } 2976 2977 /* 2978 * No entry in the tree so it must be embedded. Look up the 2979 * page and cancel the embedded entry. 2980 */ 2981 if ((pg = PHYS_TO_VM_PAGE(pmap_pte2pa(opte))) != NULL) { 2982 pp = VM_PAGE_TO_PP(pg); 2983 } else if ((pp = pmap_pv_tracked(pmap_pte2pa(opte))) == NULL) { 2984 paddr_t pa = pmap_pte2pa(opte); 2985 panic("%s: PTE_PVLIST with pv-untracked page" 2986 " va = %#"PRIxVADDR"pa = %#"PRIxPADDR 2987 "(%#"PRIxPADDR")", __func__, va, pa, atop(pa)); 2988 } 2989 mutex_spin_enter(&pp->pp_lock); 2990 KASSERT(pp->pp_pte.pte_ptp == ptp); 2991 KASSERT(pp->pp_pte.pte_va == va); 2992 pp->pp_attrs |= oattrs; 2993 pp->pp_pte.pte_ptp = NULL; 2994 pp->pp_pte.pte_va = 0; 2995 mutex_spin_exit(&pp->pp_lock); 2996 } 2997 2998 /* PTP now empty - adjust the tree & stats to match. */ 2999 pmap_stats_update(pmap, -(ptp->wire_count - 1), wired / PTE_WIRED); 3000 ptp->wire_count = 1; 3001 #ifdef DIAGNOSTIC 3002 rb_tree_init(tree, &pmap_rbtree_ops); 3003 #endif 3004 #else /* !XEN */ 3005 /* 3006 * XXXAD For XEN, it's not clear to me that we can do this, because 3007 * I guess the hypervisor keeps track of PTEs too. 3008 */ 3009 pmap_remove_ptes(pmap, ptp, (vaddr_t)pte, startva, blkendva, 3010 pv_tofree); 3011 #endif /* !XEN */ 3012 } 3013 3014 /* 3015 * pmap_remove_all: remove all mappings from pmap in bulk. 3016 * 3017 * Ordinarily when removing mappings it's important to hold the UVM object's 3018 * lock, so that pages do not gain a new identity while retaining stale TLB 3019 * entries (the same lock hold covers both pmap_remove() and pmap_update()). 3020 * Here it's known that the address space is no longer visible to any user 3021 * process, so we don't need to worry about that. 3022 */ 3023 bool 3024 pmap_remove_all(struct pmap *pmap) 3025 { 3026 struct vm_page *ptps[32]; 3027 vaddr_t va, blkendva; 3028 struct pmap *pmap2; 3029 pt_entry_t *ptes; 3030 pd_entry_t pde __diagused; 3031 pd_entry_t * const *pdes; 3032 struct pv_entry *pv_tofree; 3033 int lvl __diagused, i, n; 3034 3035 /* XXX Can't handle EPT just yet. */ 3036 if (pmap->pm_remove != NULL) { 3037 return false; 3038 } 3039 3040 for (;;) { 3041 /* Fetch a block of PTPs from tree. */ 3042 mutex_enter(&pmap->pm_lock); 3043 n = radix_tree_gang_lookup_node(&pmap->pm_obj[0].uo_pages, 0, 3044 (void **)ptps, __arraycount(ptps), false); 3045 if (n == 0) { 3046 mutex_exit(&pmap->pm_lock); 3047 break; 3048 } 3049 3050 /* Remove all mappings in the set of PTPs. */ 3051 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); 3052 pv_tofree = NULL; 3053 for (i = 0; i < n; i++) { 3054 if (ptps[i]->wire_count == 0) { 3055 /* It's dead: pmap_update() will expunge. */ 3056 continue; 3057 } 3058 3059 /* Determine range of block. */ 3060 va = ptps[i]->offset * PAGE_SIZE / sizeof(pt_entry_t); 3061 blkendva = x86_round_pdr(va + 1); 3062 3063 /* Make sure everything squares up... */ 3064 KASSERT(pmap_pdes_valid(va, pdes, &pde, &lvl)); 3065 KASSERT(lvl == 1); 3066 KASSERT(pmap_find_ptp(pmap, va, 1) == ptps[i]); 3067 3068 /* Zap! 
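pmap_zap_ptp() disconnects the PV entries and fixes up the stats for every PTE in this PTP without writing the PTEs themselves.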
*/ 3069 pmap_zap_ptp(pmap, ptps[i], &ptes[pl1_i(va)], va, 3070 blkendva, &pv_tofree); 3071 3072 /* PTP should now be unused - free it. */ 3073 KASSERT(ptps[i]->wire_count == 1); 3074 pmap_free_ptp(pmap, ptps[i], va, ptes, pdes); 3075 } 3076 pmap_unmap_ptes(pmap, pmap2); 3077 pmap_free_pvs(pmap, pv_tofree); 3078 pmap_tlb_shootdown(pmap, -1L, 0, TLBSHOOT_REMOVE_ALL); 3079 mutex_exit(&pmap->pm_lock); 3080 3081 /* Process deferred frees. */ 3082 pmap_update(pmap); 3083 3084 /* A breathing point. */ 3085 preempt_point(); 3086 } 3087 3088 /* Verify that the pmap is now completely empty. */ 3089 pmap_check_ptps(pmap); 3090 KASSERTMSG(pmap->pm_stats.resident_count == PDP_SIZE, 3091 "pmap %p not empty", pmap); 3092 3093 return true; 3094 } 3095 3096 #if defined(PMAP_FORK) 3097 /* 3098 * pmap_fork: perform any necessary data structure manipulation when 3099 * a VM space is forked. 3100 */ 3101 void 3102 pmap_fork(struct pmap *pmap1, struct pmap *pmap2) 3103 { 3104 #ifdef USER_LDT 3105 union descriptor *new_ldt; 3106 size_t len; 3107 int sel; 3108 3109 if (__predict_true(pmap1->pm_ldt == NULL)) { 3110 return; 3111 } 3112 3113 /* 3114 * Copy the LDT into the new process. 3115 * 3116 * Read pmap1's ldt pointer and length unlocked; if it changes 3117 * behind our back we'll retry. This will starve if there's a 3118 * stream of LDT changes in another thread but that should not 3119 * happen. 3120 */ 3121 3122 retry: 3123 if (pmap1->pm_ldt != NULL) { 3124 len = pmap1->pm_ldt_len; 3125 /* Allocate space for the new process's LDT */ 3126 new_ldt = (union descriptor *)uvm_km_alloc(kernel_map, len, 0, 3127 UVM_KMF_WIRED); 3128 if (new_ldt == NULL) { 3129 printf("WARNING: %s: unable to allocate LDT space\n", 3130 __func__); 3131 return; 3132 } 3133 mutex_enter(&cpu_lock); 3134 /* Get a GDT slot for it */ 3135 sel = ldt_alloc(new_ldt, len); 3136 if (sel == -1) { 3137 mutex_exit(&cpu_lock); 3138 uvm_km_free(kernel_map, (vaddr_t)new_ldt, len, 3139 UVM_KMF_WIRED); 3140 printf("WARNING: %s: unable to allocate LDT selector\n", 3141 __func__); 3142 return; 3143 } 3144 } else { 3145 /* Wasn't anything there after all. */ 3146 len = -1; 3147 new_ldt = NULL; 3148 sel = -1; 3149 mutex_enter(&cpu_lock); 3150 } 3151 3152 /* If there's still something there now that we have cpu_lock... */ 3153 if (pmap1->pm_ldt != NULL) { 3154 if (len != pmap1->pm_ldt_len) { 3155 /* Oops, it changed. Drop what we did and try again */ 3156 if (len != -1) { 3157 ldt_free(sel); 3158 uvm_km_free(kernel_map, (vaddr_t)new_ldt, 3159 len, UVM_KMF_WIRED); 3160 } 3161 mutex_exit(&cpu_lock); 3162 goto retry; 3163 } 3164 3165 /* Copy the LDT data and install it in pmap2 */ 3166 memcpy(new_ldt, pmap1->pm_ldt, len); 3167 pmap2->pm_ldt = new_ldt; 3168 pmap2->pm_ldt_len = pmap1->pm_ldt_len; 3169 pmap2->pm_ldt_sel = sel; 3170 len = -1; 3171 } 3172 3173 if (len != -1) { 3174 /* There wasn't still something there, so mop up */ 3175 ldt_free(sel); 3176 mutex_exit(&cpu_lock); 3177 uvm_km_free(kernel_map, (vaddr_t)new_ldt, len, 3178 UVM_KMF_WIRED); 3179 } else { 3180 mutex_exit(&cpu_lock); 3181 } 3182 #endif /* USER_LDT */ 3183 } 3184 #endif /* PMAP_FORK */ 3185 3186 #ifdef USER_LDT 3187 3188 /* 3189 * pmap_ldt_xcall: cross call used by pmap_ldt_sync. if the named pmap 3190 * is active, reload LDTR. 
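 * With SVS enabled the reload goes through svs_ldt_sync() rather than
 * a direct lldt(); see the check below.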
3191 */ 3192 static void 3193 pmap_ldt_xcall(void *arg1, void *arg2) 3194 { 3195 struct pmap *pm; 3196 3197 kpreempt_disable(); 3198 pm = arg1; 3199 if (curcpu()->ci_pmap == pm) { 3200 #if defined(SVS) && defined(USER_LDT) 3201 if (svs_enabled) { 3202 svs_ldt_sync(pm); 3203 } else 3204 #endif 3205 lldt(pm->pm_ldt_sel); 3206 } 3207 kpreempt_enable(); 3208 } 3209 3210 /* 3211 * pmap_ldt_sync: LDT selector for the named pmap is changing. swap 3212 * in the new selector on all CPUs. 3213 */ 3214 void 3215 pmap_ldt_sync(struct pmap *pm) 3216 { 3217 uint64_t where; 3218 3219 KASSERT(mutex_owned(&cpu_lock)); 3220 3221 pmap_ldt_evcnt.ev_count++; 3222 where = xc_broadcast(0, pmap_ldt_xcall, pm, NULL); 3223 xc_wait(where); 3224 } 3225 3226 /* 3227 * pmap_ldt_cleanup: if the pmap has a local LDT, deallocate it, and 3228 * restore the default. 3229 */ 3230 void 3231 pmap_ldt_cleanup(struct lwp *l) 3232 { 3233 pmap_t pmap = l->l_proc->p_vmspace->vm_map.pmap; 3234 union descriptor *dp = NULL; 3235 size_t len = 0; 3236 int sel = -1; 3237 3238 if (__predict_true(pmap->pm_ldt == NULL)) { 3239 return; 3240 } 3241 3242 mutex_enter(&cpu_lock); 3243 if (pmap->pm_ldt != NULL) { 3244 sel = pmap->pm_ldt_sel; 3245 dp = pmap->pm_ldt; 3246 len = pmap->pm_ldt_len; 3247 pmap->pm_ldt_sel = GSYSSEL(GLDT_SEL, SEL_KPL); 3248 pmap->pm_ldt = NULL; 3249 pmap->pm_ldt_len = 0; 3250 pmap_ldt_sync(pmap); 3251 ldt_free(sel); 3252 uvm_km_free(kernel_map, (vaddr_t)dp, len, UVM_KMF_WIRED); 3253 } 3254 mutex_exit(&cpu_lock); 3255 } 3256 #endif /* USER_LDT */ 3257 3258 /* 3259 * pmap_activate: activate a process' pmap 3260 * 3261 * => must be called with kernel preemption disabled 3262 * => if lwp is the curlwp, then set ci_want_pmapload so that 3263 * actual MMU context switch will be done by pmap_load() later 3264 */ 3265 void 3266 pmap_activate(struct lwp *l) 3267 { 3268 struct cpu_info *ci; 3269 struct pmap *pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map); 3270 3271 KASSERT(kpreempt_disabled()); 3272 3273 ci = curcpu(); 3274 3275 if (l != ci->ci_curlwp) 3276 return; 3277 3278 KASSERT(ci->ci_want_pmapload == 0); 3279 KASSERT(ci->ci_tlbstate != TLBSTATE_VALID); 3280 3281 /* 3282 * no need to switch to kernel vmspace because 3283 * it's a subset of any vmspace. 3284 */ 3285 3286 if (pmap == pmap_kernel()) { 3287 ci->ci_want_pmapload = 0; 3288 return; 3289 } 3290 3291 ci->ci_want_pmapload = 1; 3292 } 3293 3294 #if defined(XENPV) && defined(__x86_64__) 3295 #define KASSERT_PDIRPA(pmap) \ 3296 KASSERT(pmap_pdirpa(pmap, 0) == ci->ci_xen_current_user_pgd || \ 3297 pmap == pmap_kernel()) 3298 #elif defined(PAE) 3299 #define KASSERT_PDIRPA(pmap) \ 3300 KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(ci->ci_pae_l3_pdir[0])) 3301 #elif !defined(XENPV) 3302 #define KASSERT_PDIRPA(pmap) \ 3303 KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(rcr3())) 3304 #else 3305 #define KASSERT_PDIRPA(pmap) KASSERT(true) /* nothing to do */ 3306 #endif 3307 3308 /* 3309 * pmap_reactivate: try to regain reference to the pmap. 3310 * 3311 * => Must be called with kernel preemption disabled. 3312 */ 3313 static void 3314 pmap_reactivate(struct pmap *pmap) 3315 { 3316 struct cpu_info * const ci = curcpu(); 3317 const cpuid_t cid = cpu_index(ci); 3318 3319 KASSERT(kpreempt_disabled()); 3320 KASSERT_PDIRPA(pmap); 3321 3322 /* 3323 * If we still have a lazy reference to this pmap, we can assume 3324 * that there was no TLB shootdown for this pmap in the meantime. 
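 * ("Still have a reference" here means our CPU's bit is still set in
 * pm_cpus, which is what the kcpuset_isset() check below tests.)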
3325 * 3326 * The order of events here is important as we must synchronize 3327 * with TLB shootdown interrupts. Declare interest in invalidations 3328 * (TLBSTATE_VALID) and then check the CPU set, which the IPIs can 3329 * change only when the state is TLBSTATE_LAZY. 3330 */ 3331 3332 ci->ci_tlbstate = TLBSTATE_VALID; 3333 KASSERT(kcpuset_isset(pmap->pm_kernel_cpus, cid)); 3334 3335 if (__predict_true(kcpuset_isset(pmap->pm_cpus, cid))) { 3336 /* We have the reference, state is valid. */ 3337 } else { 3338 /* 3339 * Must reload the TLB, pmap has been changed during 3340 * deactivated. 3341 */ 3342 kcpuset_atomic_set(pmap->pm_cpus, cid); 3343 3344 tlbflush(); 3345 } 3346 } 3347 3348 /* 3349 * pmap_load: perform the actual pmap switch, i.e. fill in %cr3 register 3350 * and relevant LDT info. 3351 * 3352 * Ensures that the current process' pmap is loaded on the current CPU's 3353 * MMU and that there are no stale TLB entries. 3354 * 3355 * => The caller should disable kernel preemption or do check-and-retry 3356 * to prevent a preemption from undoing our efforts. 3357 * => This function may block. 3358 */ 3359 void 3360 pmap_load(void) 3361 { 3362 struct cpu_info *ci; 3363 struct pmap *pmap, *oldpmap; 3364 struct lwp *l; 3365 uint64_t ncsw; 3366 3367 kpreempt_disable(); 3368 retry: 3369 ci = curcpu(); 3370 if (!ci->ci_want_pmapload) { 3371 kpreempt_enable(); 3372 return; 3373 } 3374 l = ci->ci_curlwp; 3375 ncsw = l->l_ncsw; 3376 __insn_barrier(); 3377 3378 /* should be able to take ipis. */ 3379 KASSERT(ci->ci_ilevel < IPL_HIGH); 3380 #ifdef XENPV 3381 /* Check to see if interrupts are enabled (ie; no events are masked) */ 3382 KASSERT(x86_read_psl() == 0); 3383 #else 3384 KASSERT((x86_read_psl() & PSL_I) != 0); 3385 #endif 3386 3387 KASSERT(l != NULL); 3388 pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map); 3389 KASSERT(pmap != pmap_kernel()); 3390 oldpmap = ci->ci_pmap; 3391 3392 if (pmap == oldpmap) { 3393 pmap_reactivate(pmap); 3394 ci->ci_want_pmapload = 0; 3395 kpreempt_enable(); 3396 return; 3397 } 3398 3399 /* 3400 * Acquire a reference to the new pmap and perform the switch. 3401 */ 3402 3403 pmap_reference(pmap); 3404 pmap_load1(l, pmap, oldpmap); 3405 ci->ci_want_pmapload = 0; 3406 3407 /* 3408 * we're now running with the new pmap. drop the reference 3409 * to the old pmap. if we block, we need to go around again. 3410 */ 3411 3412 pmap_destroy(oldpmap); 3413 __insn_barrier(); 3414 if (l->l_ncsw != ncsw) { 3415 goto retry; 3416 } 3417 3418 kpreempt_enable(); 3419 } 3420 3421 /* 3422 * pmap_load1: the guts of pmap load, shared by pmap_map_ptes() and 3423 * pmap_load(). It's critically important that this function does not 3424 * block. 3425 */ 3426 static void 3427 pmap_load1(struct lwp *l, struct pmap *pmap, struct pmap *oldpmap) 3428 { 3429 struct cpu_info *ci; 3430 struct pcb *pcb; 3431 cpuid_t cid; 3432 3433 KASSERT(kpreempt_disabled()); 3434 3435 pcb = lwp_getpcb(l); 3436 ci = l->l_cpu; 3437 cid = cpu_index(ci); 3438 3439 kcpuset_atomic_clear(oldpmap->pm_cpus, cid); 3440 kcpuset_atomic_clear(oldpmap->pm_kernel_cpus, cid); 3441 3442 KASSERT_PDIRPA(oldpmap); 3443 KASSERT(!kcpuset_isset(pmap->pm_cpus, cid)); 3444 KASSERT(!kcpuset_isset(pmap->pm_kernel_cpus, cid)); 3445 3446 /* 3447 * Mark the pmap in use by this CPU. Again, we must synchronize 3448 * with TLB shootdown interrupts, so set the state VALID first, 3449 * then register us for shootdown events on this pmap. 
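 * (Same ordering rule as pmap_reactivate(): write ci_tlbstate before
 * adding the CPU to pm_cpus and pm_kernel_cpus.)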
3450 */ 3451 ci->ci_tlbstate = TLBSTATE_VALID; 3452 kcpuset_atomic_set(pmap->pm_cpus, cid); 3453 kcpuset_atomic_set(pmap->pm_kernel_cpus, cid); 3454 ci->ci_pmap = pmap; 3455 3456 /* 3457 * update tss. now that we have registered for invalidations 3458 * from other CPUs, we're good to load the page tables. 3459 */ 3460 #ifdef PAE 3461 pcb->pcb_cr3 = ci->ci_pae_l3_pdirpa; 3462 #else 3463 pcb->pcb_cr3 = pmap_pdirpa(pmap, 0); 3464 #endif 3465 3466 #ifdef i386 3467 #ifndef XENPV 3468 ci->ci_tss->tss.tss_ldt = pmap->pm_ldt_sel; 3469 ci->ci_tss->tss.tss_cr3 = pcb->pcb_cr3; 3470 #endif 3471 #endif 3472 3473 #if defined(SVS) && defined(USER_LDT) 3474 if (svs_enabled) { 3475 svs_ldt_sync(pmap); 3476 } else 3477 #endif 3478 lldt(pmap->pm_ldt_sel); 3479 3480 cpu_load_pmap(pmap, oldpmap); 3481 } 3482 3483 /* 3484 * pmap_deactivate: deactivate a process' pmap. 3485 * 3486 * => Must be called with kernel preemption disabled (high IPL is enough). 3487 */ 3488 void 3489 pmap_deactivate(struct lwp *l) 3490 { 3491 struct pmap *pmap; 3492 struct cpu_info *ci; 3493 3494 KASSERT(kpreempt_disabled()); 3495 3496 if (l != curlwp) { 3497 return; 3498 } 3499 3500 /* 3501 * Wait for pending TLB shootdowns to complete. Necessary because 3502 * TLB shootdown state is per-CPU, and the LWP may be coming off 3503 * the CPU before it has a chance to call pmap_update(), e.g. due 3504 * to kernel preemption or blocking routine in between. 3505 */ 3506 pmap_tlb_shootnow(); 3507 3508 ci = curcpu(); 3509 3510 if (ci->ci_want_pmapload) { 3511 /* 3512 * ci_want_pmapload means that our pmap is not loaded on 3513 * the CPU or TLB might be stale. note that pmap_kernel() 3514 * is always considered loaded. 3515 */ 3516 KASSERT(vm_map_pmap(&l->l_proc->p_vmspace->vm_map) 3517 != pmap_kernel()); 3518 KASSERT(vm_map_pmap(&l->l_proc->p_vmspace->vm_map) 3519 != ci->ci_pmap || ci->ci_tlbstate != TLBSTATE_VALID); 3520 3521 /* 3522 * userspace has not been touched. 3523 * nothing to do here. 3524 */ 3525 3526 ci->ci_want_pmapload = 0; 3527 return; 3528 } 3529 3530 pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map); 3531 3532 if (pmap == pmap_kernel()) { 3533 return; 3534 } 3535 3536 KASSERT_PDIRPA(pmap); 3537 KASSERT(ci->ci_pmap == pmap); 3538 3539 /* 3540 * we aren't interested in TLB invalidations for this pmap, 3541 * at least for the time being. 3542 */ 3543 3544 KASSERT(ci->ci_tlbstate == TLBSTATE_VALID); 3545 ci->ci_tlbstate = TLBSTATE_LAZY; 3546 } 3547 3548 /* 3549 * some misc. 
functions 3550 */ 3551 3552 bool 3553 pmap_pdes_valid(vaddr_t va, pd_entry_t * const *pdes, pd_entry_t *lastpde, 3554 int *lastlvl) 3555 { 3556 unsigned long index; 3557 pd_entry_t pde; 3558 int i; 3559 3560 for (i = PTP_LEVELS; i > 1; i--) { 3561 index = pl_i(va, i); 3562 pde = pdes[i - 2][index]; 3563 if ((pde & PTE_P) == 0) { 3564 *lastlvl = i; 3565 return false; 3566 } 3567 if (pde & PTE_PS) 3568 break; 3569 } 3570 if (lastpde != NULL) 3571 *lastpde = pde; 3572 *lastlvl = i; 3573 return true; 3574 } 3575 3576 /* 3577 * pmap_extract: extract a PA for the given VA 3578 */ 3579 bool 3580 pmap_extract(struct pmap *pmap, vaddr_t va, paddr_t *pap) 3581 { 3582 pt_entry_t *ptes, pte; 3583 pd_entry_t pde; 3584 pd_entry_t * const *pdes; 3585 struct pmap *pmap2; 3586 paddr_t pa; 3587 bool rv; 3588 int lvl; 3589 3590 if (__predict_false(pmap->pm_extract != NULL)) { 3591 return (*pmap->pm_extract)(pmap, va, pap); 3592 } 3593 3594 #ifdef __HAVE_DIRECT_MAP 3595 if (va >= PMAP_DIRECT_BASE && va < PMAP_DIRECT_END) { 3596 if (pap != NULL) { 3597 *pap = PMAP_DIRECT_UNMAP(va); 3598 } 3599 return true; 3600 } 3601 #endif 3602 3603 rv = false; 3604 pa = 0; 3605 3606 if (pmap != pmap_kernel()) { 3607 mutex_enter(&pmap->pm_lock); 3608 } 3609 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); 3610 if (pmap_pdes_valid(va, pdes, &pde, &lvl)) { 3611 if (lvl == 2) { 3612 pa = (pde & PTE_LGFRAME) | (va & (NBPD_L2 - 1)); 3613 rv = true; 3614 } else { 3615 KASSERT(lvl == 1); 3616 pte = ptes[pl1_i(va)]; 3617 if (__predict_true((pte & PTE_P) != 0)) { 3618 pa = pmap_pte2pa(pte) | (va & (NBPD_L1 - 1)); 3619 rv = true; 3620 } 3621 } 3622 } 3623 pmap_unmap_ptes(pmap, pmap2); 3624 if (pmap != pmap_kernel()) { 3625 mutex_exit(&pmap->pm_lock); 3626 } 3627 if (pap != NULL) { 3628 *pap = pa; 3629 } 3630 3631 return rv; 3632 } 3633 3634 /* 3635 * vtophys: virtual address to physical address. For use by 3636 * machine-dependent code only. 3637 */ 3638 paddr_t 3639 vtophys(vaddr_t va) 3640 { 3641 paddr_t pa; 3642 3643 if (pmap_extract(pmap_kernel(), va, &pa) == true) 3644 return pa; 3645 return 0; 3646 } 3647 3648 __strict_weak_alias(pmap_extract_ma, pmap_extract); 3649 3650 #ifdef XENPV 3651 /* 3652 * vtomach: virtual address to machine address. For use by 3653 * machine-dependent code only. 3654 */ 3655 paddr_t 3656 vtomach(vaddr_t va) 3657 { 3658 paddr_t pa; 3659 3660 if (pmap_extract_ma(pmap_kernel(), va, &pa) == true) 3661 return pa; 3662 return 0; 3663 } 3664 #endif 3665 3666 /* 3667 * pmap_virtual_space: used during bootup [pmap_steal_memory] to 3668 * determine the bounds of the kernel virtual addess space. 
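 * A caller might use it roughly like this:
 *
 *	vaddr_t start, end;
 *
 *	pmap_virtual_space(&start, &end);
 *
 * and then carve its early allocations out of [start, end).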
3669 */ 3670 void 3671 pmap_virtual_space(vaddr_t *startp, vaddr_t *endp) 3672 { 3673 *startp = virtual_avail; 3674 *endp = virtual_end; 3675 } 3676 3677 void 3678 pmap_zero_page(paddr_t pa) 3679 { 3680 #if defined(__HAVE_DIRECT_MAP) 3681 pagezero(PMAP_DIRECT_MAP(pa)); 3682 #else 3683 #if defined(XENPV) 3684 if (XEN_VERSION_SUPPORTED(3, 4)) 3685 xen_pagezero(pa); 3686 #endif 3687 struct cpu_info *ci; 3688 pt_entry_t *zpte; 3689 vaddr_t zerova; 3690 3691 const pd_entry_t pteflags = PTE_P | PTE_W | pmap_pg_nx | PTE_D | PTE_A; 3692 3693 kpreempt_disable(); 3694 3695 ci = curcpu(); 3696 zerova = ci->vpage[VPAGE_ZER]; 3697 zpte = ci->vpage_pte[VPAGE_ZER]; 3698 3699 KASSERTMSG(!*zpte, "pmap_zero_page: lock botch"); 3700 3701 pmap_pte_set(zpte, pmap_pa2pte(pa) | pteflags); 3702 pmap_pte_flush(); 3703 pmap_update_pg(zerova); /* flush TLB */ 3704 3705 memset((void *)zerova, 0, PAGE_SIZE); 3706 3707 #if defined(DIAGNOSTIC) || defined(XENPV) 3708 pmap_pte_set(zpte, 0); /* zap ! */ 3709 pmap_pte_flush(); 3710 #endif 3711 3712 kpreempt_enable(); 3713 #endif /* defined(__HAVE_DIRECT_MAP) */ 3714 } 3715 3716 /* 3717 * pmap_pagezeroidle: the same, for the idle loop page zero'er. 3718 * Returns true if the page was zero'd, false if we aborted for 3719 * some reason. 3720 */ 3721 bool 3722 pmap_pageidlezero(paddr_t pa) 3723 { 3724 #ifdef __HAVE_DIRECT_MAP 3725 KASSERT(cpu_feature[0] & CPUID_SSE2); 3726 return sse2_idlezero_page((void *)PMAP_DIRECT_MAP(pa)); 3727 #else 3728 struct cpu_info *ci; 3729 pt_entry_t *zpte; 3730 vaddr_t zerova; 3731 bool rv; 3732 3733 const pd_entry_t pteflags = PTE_P | PTE_W | pmap_pg_nx | PTE_D | PTE_A; 3734 3735 ci = curcpu(); 3736 zerova = ci->vpage[VPAGE_ZER]; 3737 zpte = ci->vpage_pte[VPAGE_ZER]; 3738 3739 KASSERT(cpu_feature[0] & CPUID_SSE2); 3740 KASSERT(*zpte == 0); 3741 3742 pmap_pte_set(zpte, pmap_pa2pte(pa) | pteflags); 3743 pmap_pte_flush(); 3744 pmap_update_pg(zerova); /* flush TLB */ 3745 3746 rv = sse2_idlezero_page((void *)zerova); 3747 3748 #if defined(DIAGNOSTIC) || defined(XENPV) 3749 pmap_pte_set(zpte, 0); /* zap ! 
*/ 3750 pmap_pte_flush(); 3751 #endif 3752 3753 return rv; 3754 #endif 3755 } 3756 3757 void 3758 pmap_copy_page(paddr_t srcpa, paddr_t dstpa) 3759 { 3760 #if defined(__HAVE_DIRECT_MAP) 3761 vaddr_t srcva = PMAP_DIRECT_MAP(srcpa); 3762 vaddr_t dstva = PMAP_DIRECT_MAP(dstpa); 3763 3764 memcpy((void *)dstva, (void *)srcva, PAGE_SIZE); 3765 #else 3766 #if defined(XENPV) 3767 if (XEN_VERSION_SUPPORTED(3, 4)) { 3768 xen_copy_page(srcpa, dstpa); 3769 return; 3770 } 3771 #endif 3772 struct cpu_info *ci; 3773 pt_entry_t *srcpte, *dstpte; 3774 vaddr_t srcva, dstva; 3775 3776 const pd_entry_t pteflags = PTE_P | PTE_W | pmap_pg_nx | PTE_A; 3777 3778 kpreempt_disable(); 3779 3780 ci = curcpu(); 3781 srcva = ci->vpage[VPAGE_SRC]; 3782 dstva = ci->vpage[VPAGE_DST]; 3783 srcpte = ci->vpage_pte[VPAGE_SRC]; 3784 dstpte = ci->vpage_pte[VPAGE_DST]; 3785 3786 KASSERT(*srcpte == 0 && *dstpte == 0); 3787 3788 pmap_pte_set(srcpte, pmap_pa2pte(srcpa) | pteflags); 3789 pmap_pte_set(dstpte, pmap_pa2pte(dstpa) | pteflags | PTE_D); 3790 pmap_pte_flush(); 3791 pmap_update_pg(srcva); 3792 pmap_update_pg(dstva); 3793 3794 memcpy((void *)dstva, (void *)srcva, PAGE_SIZE); 3795 3796 #if defined(DIAGNOSTIC) || defined(XENPV) 3797 pmap_pte_set(srcpte, 0); 3798 pmap_pte_set(dstpte, 0); 3799 pmap_pte_flush(); 3800 #endif 3801 3802 kpreempt_enable(); 3803 #endif /* defined(__HAVE_DIRECT_MAP) */ 3804 } 3805 3806 static pt_entry_t * 3807 pmap_map_ptp(struct vm_page *ptp) 3808 { 3809 #ifdef __HAVE_DIRECT_MAP 3810 return (void *)PMAP_DIRECT_MAP(VM_PAGE_TO_PHYS(ptp)); 3811 #else 3812 struct cpu_info *ci; 3813 pt_entry_t *ptppte; 3814 vaddr_t ptpva; 3815 3816 KASSERT(kpreempt_disabled()); 3817 3818 #ifndef XENPV 3819 const pd_entry_t pteflags = PTE_P | PTE_W | pmap_pg_nx | PTE_A | PTE_D; 3820 #else 3821 const pd_entry_t pteflags = PTE_P | pmap_pg_nx | PTE_A | PTE_D; 3822 #endif 3823 3824 ci = curcpu(); 3825 ptpva = ci->vpage[VPAGE_PTP]; 3826 ptppte = ci->vpage_pte[VPAGE_PTP]; 3827 3828 pmap_pte_set(ptppte, pmap_pa2pte(VM_PAGE_TO_PHYS(ptp)) | pteflags); 3829 3830 pmap_pte_flush(); 3831 pmap_update_pg(ptpva); 3832 3833 return (pt_entry_t *)ptpva; 3834 #endif 3835 } 3836 3837 static void 3838 pmap_unmap_ptp(void) 3839 { 3840 #ifndef __HAVE_DIRECT_MAP 3841 #if defined(DIAGNOSTIC) || defined(XENPV) 3842 struct cpu_info *ci; 3843 pt_entry_t *pte; 3844 3845 KASSERT(kpreempt_disabled()); 3846 3847 ci = curcpu(); 3848 pte = ci->vpage_pte[VPAGE_PTP]; 3849 3850 if (*pte != 0) { 3851 pmap_pte_set(pte, 0); 3852 pmap_pte_flush(); 3853 } 3854 #endif 3855 #endif 3856 } 3857 3858 static pt_entry_t * 3859 pmap_map_pte(struct pmap *pmap, struct vm_page *ptp, vaddr_t va) 3860 { 3861 3862 KASSERT(kpreempt_disabled()); 3863 if (pmap_is_curpmap(pmap)) { 3864 return &PTE_BASE[pl1_i(va)]; /* (k)vtopte */ 3865 } 3866 KASSERT(ptp != NULL); 3867 return pmap_map_ptp(ptp) + pl1_pi(va); 3868 } 3869 3870 static void 3871 pmap_unmap_pte(void) 3872 { 3873 3874 KASSERT(kpreempt_disabled()); 3875 3876 pmap_unmap_ptp(); 3877 } 3878 3879 /* 3880 * p m a p r e m o v e f u n c t i o n s 3881 * 3882 * functions that remove mappings 3883 */ 3884 3885 /* 3886 * pmap_remove_ptes: remove PTEs from a PTP 3887 * 3888 * => caller must hold pmap's lock 3889 * => PTP must be mapped into KVA 3890 * => PTP should be null if pmap == pmap_kernel() 3891 * => must be called with kernel preemption disabled 3892 * => returns composite pte if at least one page should be shot down 3893 */ 3894 static void 3895 pmap_remove_ptes(struct pmap *pmap, struct vm_page *ptp, vaddr_t ptpva, 3896 
vaddr_t startva, vaddr_t endva, struct pv_entry **pv_tofree) 3897 { 3898 pt_entry_t *pte = (pt_entry_t *)ptpva; 3899 3900 KASSERT(mutex_owned(&pmap->pm_lock)); 3901 KASSERT(kpreempt_disabled()); 3902 3903 /* 3904 * mappings are very often sparse, so clip the given range to the 3905 * range of PTEs that are known present in the PTP. 3906 */ 3907 pmap_ptp_range_clip(ptp, &startva, &pte); 3908 3909 /* 3910 * note that ptpva points to the PTE that maps startva. this may 3911 * or may not be the first PTE in the PTP. 3912 * 3913 * we loop through the PTP while there are still PTEs to look at 3914 * and the wire_count is greater than 1 (because we use the wire_count 3915 * to keep track of the number of real PTEs in the PTP). 3916 */ 3917 while (startva < endva && (ptp == NULL || ptp->wire_count > 1)) { 3918 (void)pmap_remove_pte(pmap, ptp, pte, startva, pv_tofree); 3919 startva += PAGE_SIZE; 3920 pte++; 3921 } 3922 } 3923 3924 /* 3925 * pmap_remove_pte: remove a single PTE from a PTP. 3926 * 3927 * => caller must hold pmap's lock 3928 * => PTP must be mapped into KVA 3929 * => PTP should be null if pmap == pmap_kernel() 3930 * => returns true if we removed a mapping 3931 * => must be called with kernel preemption disabled 3932 */ 3933 static bool 3934 pmap_remove_pte(struct pmap *pmap, struct vm_page *ptp, pt_entry_t *pte, 3935 vaddr_t va, struct pv_entry **pv_tofree) 3936 { 3937 struct pv_entry *pve; 3938 struct vm_page *pg; 3939 struct pmap_page *pp; 3940 pt_entry_t opte; 3941 3942 KASSERT(mutex_owned(&pmap->pm_lock)); 3943 KASSERT(kpreempt_disabled()); 3944 3945 if (!pmap_valid_entry(*pte)) { 3946 /* VA not mapped. */ 3947 return false; 3948 } 3949 3950 /* Atomically save the old PTE and zap it. */ 3951 opte = pmap_pte_testset(pte, 0); 3952 if (!pmap_valid_entry(opte)) { 3953 return false; 3954 } 3955 3956 pmap_exec_account(pmap, va, opte, 0); 3957 pmap_stats_update_bypte(pmap, 0, opte); 3958 3959 if (ptp) { 3960 /* 3961 * Dropping a PTE. Make sure that the PDE is flushed. 3962 */ 3963 ptp->wire_count--; 3964 if (ptp->wire_count <= 1) { 3965 opte |= PTE_A; 3966 } 3967 } 3968 3969 if ((opte & PTE_A) != 0) { 3970 pmap_tlb_shootdown(pmap, va, opte, TLBSHOOT_REMOVE_PTE); 3971 } 3972 3973 /* 3974 * If we are not on a pv list - we are done. 3975 */ 3976 if ((opte & PTE_PVLIST) == 0) { 3977 #ifndef DOM0OPS 3978 KASSERTMSG((PHYS_TO_VM_PAGE(pmap_pte2pa(opte)) == NULL), 3979 "managed page without PTE_PVLIST for %#"PRIxVADDR, va); 3980 KASSERTMSG((pmap_pv_tracked(pmap_pte2pa(opte)) == NULL), 3981 "pv-tracked page without PTE_PVLIST for %#"PRIxVADDR, va); 3982 #endif 3983 KASSERT(pmap_treelookup_pv(pmap, ptp, (ptp != NULL ? 3984 &VM_PAGE_TO_PP(ptp)->pp_rb : &pmap_kernel_rb), va) == NULL); 3985 return true; 3986 } 3987 3988 if ((pg = PHYS_TO_VM_PAGE(pmap_pte2pa(opte))) != NULL) { 3989 pp = VM_PAGE_TO_PP(pg); 3990 } else if ((pp = pmap_pv_tracked(pmap_pte2pa(opte))) == NULL) { 3991 paddr_t pa = pmap_pte2pa(opte); 3992 panic("%s: PTE_PVLIST with pv-untracked page" 3993 " va = %#"PRIxVADDR"pa = %#"PRIxPADDR" (%#"PRIxPADDR")", 3994 __func__, va, pa, atop(pa)); 3995 } 3996 3997 /* Sync R/M bits. */ 3998 pve = pmap_lookup_pv(pmap, ptp, pp, va); 3999 pmap_remove_pv(pmap, pp, ptp, va, pve, pmap_pte_to_pp_attrs(opte)); 4000 4001 if (pve) { 4002 pve->pve_next = *pv_tofree; 4003 *pv_tofree = pve; 4004 } 4005 return true; 4006 } 4007 4008 /* 4009 * pmap_remove: mapping removal function. 
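 * Unmaps the range [sva, eva) in the given pmap and frees any PTPs
 * that end up empty as a result.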
4010 * 4011 * => caller should not be holding any pmap locks 4012 */ 4013 void 4014 pmap_remove(struct pmap *pmap, vaddr_t sva, vaddr_t eva) 4015 { 4016 pt_entry_t *ptes; 4017 pd_entry_t pde; 4018 pd_entry_t * const *pdes; 4019 struct pv_entry *pv_tofree = NULL; 4020 bool result; 4021 vaddr_t blkendva, va = sva; 4022 struct vm_page *ptp; 4023 struct pmap *pmap2; 4024 int lvl; 4025 4026 if (__predict_false(pmap->pm_remove != NULL)) { 4027 (*pmap->pm_remove)(pmap, sva, eva); 4028 return; 4029 } 4030 4031 mutex_enter(&pmap->pm_lock); 4032 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); 4033 4034 /* 4035 * removing one page? take shortcut function. 4036 */ 4037 4038 if (va + PAGE_SIZE == eva) { 4039 if (pmap_pdes_valid(va, pdes, &pde, &lvl)) { 4040 KASSERT(lvl == 1); 4041 4042 /* Get PTP if non-kernel mapping. */ 4043 if (pmap != pmap_kernel()) { 4044 ptp = pmap_find_ptp(pmap, va, 1); 4045 KASSERTMSG(ptp != NULL, 4046 "%s: unmanaged PTP detected", __func__); 4047 } else { 4048 /* Never free kernel PTPs. */ 4049 ptp = NULL; 4050 } 4051 4052 result = pmap_remove_pte(pmap, ptp, 4053 &ptes[pl1_i(va)], va, &pv_tofree); 4054 4055 /* 4056 * if mapping removed and the PTP is no longer 4057 * being used, free it! 4058 */ 4059 4060 if (result && ptp && ptp->wire_count <= 1) 4061 pmap_free_ptp(pmap, ptp, va, ptes, pdes); 4062 } 4063 } else for (/* null */ ; va < eva ; va = blkendva) { 4064 /* determine range of block */ 4065 blkendva = x86_round_pdr(va+1); 4066 if (blkendva > eva) 4067 blkendva = eva; 4068 4069 if (!pmap_pdes_valid(va, pdes, &pde, &lvl)) { 4070 /* Skip a range corresponding to an invalid pde. */ 4071 blkendva = (va & ptp_frames[lvl - 1]) + nbpd[lvl - 1]; 4072 continue; 4073 } 4074 KASSERT(lvl == 1); 4075 4076 /* Get PTP if non-kernel mapping. */ 4077 if (pmap != pmap_kernel()) { 4078 ptp = pmap_find_ptp(pmap, va, 1); 4079 KASSERTMSG(ptp != NULL, "%s: unmanaged PTP detected", 4080 __func__); 4081 } else { 4082 /* Never free kernel PTPs. */ 4083 ptp = NULL; 4084 } 4085 4086 pmap_remove_ptes(pmap, ptp, (vaddr_t)&ptes[pl1_i(va)], va, 4087 blkendva, &pv_tofree); 4088 4089 /* If PTP is no longer being used, free it. */ 4090 if (ptp && ptp->wire_count <= 1) { 4091 pmap_free_ptp(pmap, ptp, va, ptes, pdes); 4092 } 4093 } 4094 pmap_unmap_ptes(pmap, pmap2); 4095 /* 4096 * Now safe to free, as we no longer have the PTEs mapped and can 4097 * block again. 4098 */ 4099 if (pv_tofree != NULL) { 4100 pmap_free_pvs(pmap, pv_tofree); 4101 } 4102 mutex_exit(&pmap->pm_lock); 4103 } 4104 4105 /* 4106 * pmap_sync_pv: clear pte bits and return the old value of the pp_attrs. 4107 * 4108 * => The 'clearbits' parameter is either ~0 or PP_ATTRS_... 4109 * => Caller should disable kernel preemption. 4110 * => issues tlb shootdowns if necessary. 
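 *
 * For example, pmap_pp_clear_attrs() arrives here with a PP_ATTRS_* mask
 * (translated to the matching PTE_* bits below), while pmap_pp_remove()
 * passes ~0 to capture the final attributes of a mapping being torn down.
 * A sketch of the shootdown decision made in the CAS loop, for a mapping
 * that has been referenced but never written:
 *
 *	opte      = pmap_pa2pte(pa) | PTE_P | PTE_W | PTE_A
 *	clearbits = PTE_W
 *	need_shootdown = (opte & PTE_A) != 0 &&
 *	    !(clearbits == PTE_W && (opte & PTE_D) == 0)   -> false
 *
 * i.e. revoking write permission from a mapping that was never dirtied
 * does not need a TLB shootdown here.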
4111 */ 4112 static int 4113 pmap_sync_pv(struct pv_pte *pvpte, paddr_t pa, int clearbits, uint8_t *oattrs, 4114 pt_entry_t *optep) 4115 { 4116 struct pmap *pmap; 4117 struct vm_page *ptp; 4118 vaddr_t va; 4119 pt_entry_t *ptep; 4120 pt_entry_t opte; 4121 pt_entry_t npte; 4122 pt_entry_t expect; 4123 bool need_shootdown; 4124 4125 ptp = pvpte->pte_ptp; 4126 va = pvpte->pte_va; 4127 KASSERT(ptp == NULL || ptp->uobject != NULL); 4128 KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset); 4129 pmap = ptp_to_pmap(ptp); 4130 KASSERT(kpreempt_disabled()); 4131 4132 if (__predict_false(pmap->pm_sync_pv != NULL)) { 4133 return (*pmap->pm_sync_pv)(ptp, va, pa, clearbits, oattrs, 4134 optep); 4135 } 4136 4137 expect = pmap_pa2pte(pa) | PTE_P; 4138 4139 if (clearbits != ~0) { 4140 KASSERT((clearbits & ~(PP_ATTRS_D|PP_ATTRS_A|PP_ATTRS_W)) == 0); 4141 clearbits = pmap_pp_attrs_to_pte(clearbits); 4142 } 4143 4144 ptep = pmap_map_pte(pmap, ptp, va); 4145 do { 4146 opte = *ptep; 4147 KASSERT((opte & (PTE_D | PTE_A)) != PTE_D); 4148 KASSERT((opte & (PTE_A | PTE_P)) != PTE_A); 4149 KASSERT(opte == 0 || (opte & PTE_P) != 0); 4150 if ((opte & (PTE_FRAME | PTE_P)) != expect) { 4151 /* 4152 * We lost a race with a V->P operation like 4153 * pmap_remove(). Wait for the competitor 4154 * reflecting pte bits into mp_attrs. 4155 */ 4156 pmap_unmap_pte(); 4157 return EAGAIN; 4158 } 4159 4160 /* 4161 * Check if there's anything to do on this PTE. 4162 */ 4163 if ((opte & clearbits) == 0) { 4164 need_shootdown = false; 4165 break; 4166 } 4167 4168 /* 4169 * We need a shootdown if the PTE is cached (PTE_A) ... 4170 * ... Unless we are clearing only the PTE_W bit and 4171 * it isn't cached as RW (PTE_D). 4172 */ 4173 need_shootdown = (opte & PTE_A) != 0 && 4174 !(clearbits == PTE_W && (opte & PTE_D) == 0); 4175 4176 npte = opte & ~clearbits; 4177 4178 /* 4179 * If we need a shootdown anyway, clear PTE_A and PTE_D. 4180 */ 4181 if (need_shootdown) { 4182 npte &= ~(PTE_A | PTE_D); 4183 } 4184 KASSERT((npte & (PTE_D | PTE_A)) != PTE_D); 4185 KASSERT((npte & (PTE_A | PTE_P)) != PTE_A); 4186 KASSERT(npte == 0 || (opte & PTE_P) != 0); 4187 } while (pmap_pte_cas(ptep, opte, npte) != opte); 4188 4189 if (need_shootdown) { 4190 pmap_tlb_shootdown(pmap, va, opte, TLBSHOOT_SYNC_PV); 4191 } 4192 pmap_unmap_pte(); 4193 4194 *oattrs = pmap_pte_to_pp_attrs(opte); 4195 if (optep != NULL) 4196 *optep = opte; 4197 return 0; 4198 } 4199 4200 static void 4201 pmap_pp_remove_ent(struct pmap *pmap, struct vm_page *ptp, pt_entry_t opte, 4202 vaddr_t va) 4203 { 4204 struct pmap *pmap2; 4205 pt_entry_t *ptes; 4206 pd_entry_t * const *pdes; 4207 4208 KASSERT(mutex_owned(&pmap->pm_lock)); 4209 4210 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); 4211 pmap_stats_update_bypte(pmap, 0, opte); 4212 ptp->wire_count--; 4213 if (ptp->wire_count <= 1) { 4214 pmap_free_ptp(pmap, ptp, va, ptes, pdes); 4215 } 4216 pmap_unmap_ptes(pmap, pmap2); 4217 } 4218 4219 static void 4220 pmap_pp_remove(struct pmap_page *pp, paddr_t pa) 4221 { 4222 struct pv_pte *pvpte; 4223 struct vm_page *ptp; 4224 uintptr_t sum; 4225 uint8_t oattrs; 4226 bool locked; 4227 4228 /* 4229 * Do an unlocked check to see if the page has no mappings, eg when 4230 * pmap_remove_all() was called before amap_wipeout() for a process 4231 * private amap - common. The page being removed must be on the way 4232 * out, so we don't have to worry about concurrent attempts to enter 4233 * it (otherwise the caller either doesn't care or has screwed up). 
4234 */ 4235 sum = (uintptr_t)atomic_load_relaxed(&pp->pp_pte.pte_va); 4236 sum |= (uintptr_t)atomic_load_relaxed(&pp->pp_pte.pte_ptp); 4237 sum |= (uintptr_t)atomic_load_relaxed(&pp->pp_pvlist.lh_first); 4238 if (sum == 0) { 4239 return; 4240 } 4241 4242 kpreempt_disable(); 4243 for (;;) { 4244 struct pmap *pmap; 4245 struct pv_entry *pve; 4246 pt_entry_t opte; 4247 vaddr_t va; 4248 4249 mutex_spin_enter(&pp->pp_lock); 4250 if ((pvpte = pv_pte_first(pp)) == NULL) { 4251 mutex_spin_exit(&pp->pp_lock); 4252 break; 4253 } 4254 4255 /* 4256 * Add a reference to the pmap before clearing the pte. 4257 * Otherwise the pmap can disappear behind us. 4258 */ 4259 ptp = pvpte->pte_ptp; 4260 pmap = ptp_to_pmap(ptp); 4261 KASSERT(pmap->pm_obj[0].uo_refs > 0); 4262 if (ptp != NULL) { 4263 pmap_reference(pmap); 4264 } 4265 4266 /* 4267 * Now try to lock it. We need a direct handoff between 4268 * pp_lock and pm_lock to know the pv_entry is kept intact 4269 * and kept associated with this pmap. If that can't be 4270 * had, wait for the pmap's lock to become free and then 4271 * retry. 4272 */ 4273 locked = mutex_tryenter(&pmap->pm_lock); 4274 mutex_spin_exit(&pp->pp_lock); 4275 if (!locked) { 4276 mutex_enter(&pmap->pm_lock); 4277 /* nothing, just wait for it */ 4278 mutex_exit(&pmap->pm_lock); 4279 if (ptp != NULL) { 4280 pmap_destroy(pmap); 4281 } 4282 continue; 4283 } 4284 va = pvpte->pte_va; 4285 4286 KASSERTMSG(pmap->pm_stats.resident_count > PDP_SIZE, 4287 "va %lx pmap %p ptp %p is empty", va, pmap, ptp); 4288 KASSERTMSG(ptp == NULL || (ptp->flags & PG_FREE) == 0, 4289 "va %lx pmap %p ptp %p is free", va, pmap, ptp); 4290 KASSERTMSG(ptp == NULL || ptp->wire_count > 1, 4291 "va %lx pmap %p ptp %p is empty", va, pmap, ptp); 4292 4293 #ifdef DEBUG 4294 pmap_check_pv(pmap, ptp, pp, pvpte->pte_va, true); 4295 rb_tree_t *tree = (ptp != NULL ? 4296 &VM_PAGE_TO_PP(ptp)->pp_rb : &pmap_kernel_rb); 4297 pve = pmap_treelookup_pv(pmap, ptp, tree, va); 4298 if (pve == NULL) { 4299 KASSERTMSG(&pp->pp_pte == pvpte, 4300 "va %lx pmap %p ptp %p pvpte %p pve %p oops 1", 4301 va, pmap, ptp, pvpte, pve); 4302 } else { 4303 KASSERTMSG(&pve->pve_pte == pvpte, 4304 "va %lx pmap %p ptp %p pvpte %p pve %p oops 2", 4305 va, pmap, ptp, pvpte, pve); 4306 } 4307 #endif 4308 4309 if (pmap_sync_pv(pvpte, pa, ~0, &oattrs, &opte)) { 4310 panic("pmap_pp_remove: mapping not present"); 4311 } 4312 4313 pve = pmap_lookup_pv(pmap, ptp, pp, va); 4314 pmap_remove_pv(pmap, pp, ptp, va, pve, oattrs); 4315 4316 /* Update the PTP reference count. Free if last reference. 
*/ 4317 if (ptp != NULL) { 4318 KASSERT(pmap != pmap_kernel()); 4319 pmap_tlb_shootnow(); 4320 if (__predict_false(pmap->pm_pp_remove_ent != NULL)) { 4321 (*pmap->pm_pp_remove_ent)(pmap, ptp, opte, va); 4322 } else { 4323 pmap_pp_remove_ent(pmap, ptp, opte, va); 4324 } 4325 } else { 4326 KASSERT(pmap == pmap_kernel()); 4327 pmap_stats_update_bypte(pmap, 0, opte); 4328 } 4329 if (pve != NULL) { 4330 pve->pve_next = NULL; 4331 pmap_free_pvs(pmap, pve); 4332 } 4333 pmap_tlb_shootnow(); 4334 mutex_exit(&pmap->pm_lock); 4335 if (ptp != NULL) { 4336 pmap_destroy(pmap); 4337 } 4338 } 4339 kpreempt_enable(); 4340 } 4341 4342 /* 4343 * pmap_page_remove: remove a managed vm_page from all pmaps that map it 4344 * 4345 * => R/M bits are sync'd back to attrs 4346 */ 4347 void 4348 pmap_page_remove(struct vm_page *pg) 4349 { 4350 struct pmap_page *pp; 4351 paddr_t pa; 4352 4353 pp = VM_PAGE_TO_PP(pg); 4354 pa = VM_PAGE_TO_PHYS(pg); 4355 pmap_pp_remove(pp, pa); 4356 } 4357 4358 /* 4359 * pmap_pv_remove: remove an unmanaged pv-tracked page from all pmaps 4360 * that map it 4361 */ 4362 void 4363 pmap_pv_remove(paddr_t pa) 4364 { 4365 struct pmap_page *pp; 4366 4367 pp = pmap_pv_tracked(pa); 4368 if (pp == NULL) 4369 panic("%s: page not pv-tracked: %#"PRIxPADDR, __func__, pa); 4370 pmap_pp_remove(pp, pa); 4371 } 4372 4373 /* 4374 * p m a p a t t r i b u t e f u n c t i o n s 4375 * functions that test/change a managed page's attributes. 4376 * since a page can be mapped multiple times, we must check each PTE that 4377 * maps it by going down the pv lists. 4378 */ 4379 4380 /* 4381 * pmap_test_attrs: test a page's attributes 4382 */ 4383 bool 4384 pmap_test_attrs(struct vm_page *pg, unsigned testbits) 4385 { 4386 struct pmap_page *pp; 4387 struct pv_pte *pvpte; 4388 struct pmap *pmap; 4389 uint8_t oattrs; 4390 u_int result; 4391 paddr_t pa; 4392 4393 pp = VM_PAGE_TO_PP(pg); 4394 if ((pp->pp_attrs & testbits) != 0) { 4395 return true; 4396 } 4397 pa = VM_PAGE_TO_PHYS(pg); 4398 startover: 4399 mutex_spin_enter(&pp->pp_lock); 4400 for (pvpte = pv_pte_first(pp); pvpte; pvpte = pv_pte_next(pp, pvpte)) { 4401 if ((pp->pp_attrs & testbits) != 0) { 4402 break; 4403 } 4404 if (pmap_sync_pv(pvpte, pa, 0, &oattrs, NULL)) { 4405 /* 4406 * raced with a V->P operation. wait for the other 4407 * side to finish by acquiring pmap's lock. if no 4408 * wait, updates to pp_attrs by the other side may 4409 * go unseen. 4410 */ 4411 pmap = ptp_to_pmap(pvpte->pte_ptp); 4412 pmap_reference(pmap); 4413 mutex_spin_exit(&pp->pp_lock); 4414 mutex_enter(&pmap->pm_lock); 4415 /* nothing. */ 4416 mutex_exit(&pmap->pm_lock); 4417 pmap_destroy(pmap); 4418 goto startover; 4419 } 4420 pp->pp_attrs |= oattrs; 4421 } 4422 result = pp->pp_attrs & testbits; 4423 mutex_spin_exit(&pp->pp_lock); 4424 4425 /* 4426 * note that we will exit the for loop early, with a non-NULL pvpte, 4427 * if we have found the bits we are testing for. 4428 */ 4429 4430 return result != 0; 4431 } 4432 4433 static bool 4434 pmap_pp_clear_attrs(struct pmap_page *pp, paddr_t pa, unsigned clearbits) 4435 { 4436 struct pv_pte *pvpte; 4437 struct pmap *pmap; 4438 uint8_t oattrs; 4439 u_int result; 4440 4441 startover: 4442 mutex_spin_enter(&pp->pp_lock); 4443 for (pvpte = pv_pte_first(pp); pvpte; pvpte = pv_pte_next(pp, pvpte)) { 4444 if (pmap_sync_pv(pvpte, pa, clearbits, &oattrs, NULL)) { 4445 /* 4446 * raced with a V->P operation. wait for the other 4447 * side to finish by acquiring pmap's lock.
it is 4448 * probably unmapping the page, and it will be gone 4449 * when the loop is restarted. 4450 */ 4451 pmap = ptp_to_pmap(pvpte->pte_ptp); 4452 pmap_reference(pmap); 4453 mutex_spin_exit(&pp->pp_lock); 4454 mutex_enter(&pmap->pm_lock); 4455 /* nothing. */ 4456 mutex_exit(&pmap->pm_lock); 4457 pmap_destroy(pmap); 4458 goto startover; 4459 } 4460 pp->pp_attrs |= oattrs; 4461 } 4462 result = pp->pp_attrs & clearbits; 4463 pp->pp_attrs &= ~clearbits; 4464 pmap_tlb_shootnow(); 4465 mutex_spin_exit(&pp->pp_lock); 4466 4467 return result != 0; 4468 } 4469 4470 /* 4471 * pmap_clear_attrs: clear the specified attribute for a page. 4472 * 4473 * => we return true if we cleared one of the bits we were asked to 4474 */ 4475 bool 4476 pmap_clear_attrs(struct vm_page *pg, unsigned clearbits) 4477 { 4478 struct pmap_page *pp; 4479 paddr_t pa; 4480 4481 pp = VM_PAGE_TO_PP(pg); 4482 pa = VM_PAGE_TO_PHYS(pg); 4483 4484 return pmap_pp_clear_attrs(pp, pa, clearbits); 4485 } 4486 4487 /* 4488 * pmap_pv_clear_attrs: clear the specified attributes for an unmanaged 4489 * pv-tracked page. 4490 */ 4491 bool 4492 pmap_pv_clear_attrs(paddr_t pa, unsigned clearbits) 4493 { 4494 struct pmap_page *pp; 4495 4496 pp = pmap_pv_tracked(pa); 4497 if (pp == NULL) 4498 panic("%s: page not pv-tracked: %#"PRIxPADDR, __func__, pa); 4499 4500 return pmap_pp_clear_attrs(pp, pa, clearbits); 4501 } 4502 4503 /* 4504 * p m a p p r o t e c t i o n f u n c t i o n s 4505 */ 4506 4507 /* 4508 * pmap_page_protect: change the protection of all recorded mappings 4509 * of a managed page 4510 * 4511 * => NOTE: this is an inline function in pmap.h 4512 */ 4513 4514 /* see pmap.h */ 4515 4516 /* 4517 * pmap_pv_protect: change the protection of all recorded mappings 4518 * of an unmanaged pv-tracked page 4519 * 4520 * => NOTE: this is an inline function in pmap.h 4521 */ 4522 4523 /* see pmap.h */ 4524 4525 /* 4526 * pmap_protect: set the protection of the pages in a pmap 4527 * 4528 * => NOTE: this is an inline function in pmap.h 4529 */ 4530 4531 /* see pmap.h */ 4532 4533 /* 4534 * pmap_write_protect: write-protect pages in a pmap. 4535 * 4536 * Note for Xen-amd64. Xen automatically adds PTE_U to the kernel pages, but we 4537 * don't need to remove this bit when re-entering the PTEs here: Xen tracks the 4538 * kernel pages with a reserved bit (_PAGE_GUEST_KERNEL), so even if PTE_U is 4539 * present the page will still be considered as a kernel page, and the privilege 4540 * separation will be enforced correctly. 4541 */ 4542 void 4543 pmap_write_protect(struct pmap *pmap, vaddr_t sva, vaddr_t eva, vm_prot_t prot) 4544 { 4545 pt_entry_t bit_rem, bit_put; 4546 pt_entry_t *ptes; 4547 pd_entry_t * const *pdes; 4548 struct pmap *pmap2; 4549 vaddr_t blockend, va; 4550 int lvl, i; 4551 4552 if (__predict_false(pmap->pm_write_protect != NULL)) { 4553 (*pmap->pm_write_protect)(pmap, sva, eva, prot); 4554 return; 4555 } 4556 4557 bit_rem = 0; 4558 if (!(prot & VM_PROT_WRITE)) 4559 bit_rem = PTE_W; 4560 4561 bit_put = 0; 4562 if (!(prot & VM_PROT_EXECUTE)) 4563 bit_put = pmap_pg_nx; 4564 4565 sva &= ~PAGE_MASK; 4566 eva &= ~PAGE_MASK; 4567 4568 /* 4569 * Acquire pmap. No need to lock the kernel pmap as we won't 4570 * be touching PV entries nor stats and kernel PDEs aren't 4571 * freed.
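 *
 * For example, revoking write access (prot == VM_PROT_READ|VM_PROT_EXECUTE)
 * gives bit_rem == PTE_W and bit_put == 0, so the loop below rewrites each
 * valid PTE as (opte & ~PTE_W); revoking execute as well additionally ORs
 * in pmap_pg_nx on CPUs that support the NX bit.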
4572 */ 4573 if (pmap != pmap_kernel()) { 4574 mutex_enter(&pmap->pm_lock); 4575 } 4576 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); 4577 4578 for (va = sva ; va < eva; va = blockend) { 4579 pt_entry_t *spte, *epte; 4580 4581 blockend = x86_round_pdr(va + 1); 4582 if (blockend > eva) 4583 blockend = eva; 4584 4585 /* Is it a valid block? */ 4586 if (!pmap_pdes_valid(va, pdes, NULL, &lvl)) { 4587 continue; 4588 } 4589 KASSERT(va < VM_MAXUSER_ADDRESS || va >= VM_MAX_ADDRESS); 4590 KASSERT(lvl == 1); 4591 4592 spte = &ptes[pl1_i(va)]; 4593 epte = &ptes[pl1_i(blockend)]; 4594 4595 for (i = 0; spte < epte; spte++, i++) { 4596 pt_entry_t opte, npte; 4597 4598 do { 4599 opte = *spte; 4600 if (!pmap_valid_entry(opte)) { 4601 goto next; 4602 } 4603 npte = (opte & ~bit_rem) | bit_put; 4604 } while (pmap_pte_cas(spte, opte, npte) != opte); 4605 4606 if ((opte & PTE_D) != 0) { 4607 vaddr_t tva = va + x86_ptob(i); 4608 pmap_tlb_shootdown(pmap, tva, opte, 4609 TLBSHOOT_WRITE_PROTECT); 4610 } 4611 next:; 4612 } 4613 } 4614 4615 /* Release pmap. */ 4616 pmap_unmap_ptes(pmap, pmap2); 4617 if (pmap != pmap_kernel()) { 4618 mutex_exit(&pmap->pm_lock); 4619 } 4620 } 4621 4622 /* 4623 * pmap_unwire: clear the wired bit in the PTE. 4624 * 4625 * => Mapping should already be present. 4626 */ 4627 void 4628 pmap_unwire(struct pmap *pmap, vaddr_t va) 4629 { 4630 pt_entry_t *ptes, *ptep, opte; 4631 pd_entry_t * const *pdes; 4632 struct pmap *pmap2; 4633 int lvl; 4634 4635 if (__predict_false(pmap->pm_unwire != NULL)) { 4636 (*pmap->pm_unwire)(pmap, va); 4637 return; 4638 } 4639 4640 /* 4641 * Acquire pmap. Need to lock the kernel pmap only to protect the 4642 * statistics. 4643 */ 4644 mutex_enter(&pmap->pm_lock); 4645 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); 4646 4647 if (!pmap_pdes_valid(va, pdes, NULL, &lvl)) { 4648 panic("%s: invalid PDE va=%#" PRIxVADDR, __func__, va); 4649 } 4650 KASSERT(lvl == 1); 4651 4652 ptep = &ptes[pl1_i(va)]; 4653 opte = *ptep; 4654 KASSERT(pmap_valid_entry(opte)); 4655 4656 if (opte & PTE_WIRED) { 4657 pt_entry_t npte = opte & ~PTE_WIRED; 4658 4659 opte = pmap_pte_testset(ptep, npte); 4660 pmap_stats_update_bypte(pmap, npte, opte); 4661 } else { 4662 printf("%s: wiring for pmap %p va %#" PRIxVADDR 4663 " did not change!\n", __func__, pmap, va); 4664 } 4665 4666 /* Release pmap. */ 4667 pmap_unmap_ptes(pmap, pmap2); 4668 mutex_exit(&pmap->pm_lock); 4669 } 4670 4671 /* 4672 * pmap_copy: copy mappings from one pmap to another 4673 * 4674 * => optional function 4675 * void pmap_copy(dst_pmap, src_pmap, dst_addr, len, src_addr) 4676 */ 4677 4678 /* 4679 * defined as macro in pmap.h 4680 */ 4681 4682 __strict_weak_alias(pmap_enter, pmap_enter_default); 4683 4684 int 4685 pmap_enter_default(pmap_t pmap, vaddr_t va, paddr_t pa, vm_prot_t prot, 4686 u_int flags) 4687 { 4688 if (__predict_false(pmap->pm_enter != NULL)) { 4689 return (*pmap->pm_enter)(pmap, va, pa, prot, flags); 4690 } 4691 4692 return pmap_enter_ma(pmap, va, pa, pa, prot, flags, 0); 4693 } 4694 4695 /* 4696 * pmap_enter: enter a mapping into a pmap 4697 * 4698 * => must be done "now" ... 
no lazy-evaluation 4699 */ 4700 int 4701 pmap_enter_ma(struct pmap *pmap, vaddr_t va, paddr_t ma, paddr_t pa, 4702 vm_prot_t prot, u_int flags, int domid) 4703 { 4704 pt_entry_t *ptes, opte, npte; 4705 pt_entry_t *ptep; 4706 pd_entry_t * const *pdes; 4707 struct vm_page *ptp; 4708 struct vm_page *new_pg, *old_pg; 4709 struct pmap_page *new_pp, *old_pp; 4710 struct pv_entry *old_pve, *new_pve; 4711 bool wired = (flags & PMAP_WIRED) != 0; 4712 struct pmap *pmap2; 4713 struct pmap_ptparray pt; 4714 int error; 4715 bool getptp, samepage, new_embedded; 4716 rb_tree_t *tree; 4717 4718 KASSERT(pmap_initialized); 4719 KASSERT(va < VM_MAX_KERNEL_ADDRESS); 4720 KASSERTMSG(va != (vaddr_t)PDP_BASE, "%s: trying to map va=%#" 4721 PRIxVADDR " over PDP!", __func__, va); 4722 KASSERTMSG(va < VM_MIN_KERNEL_ADDRESS || 4723 pmap_valid_entry(pmap->pm_pdir[pl_i(va, PTP_LEVELS)]), 4724 "%s: missing kernel PTP for va=%#" PRIxVADDR, __func__, va); 4725 4726 #ifdef XENPV 4727 KASSERT(domid == DOMID_SELF || pa == 0); 4728 #endif 4729 4730 npte = ma | protection_codes[prot] | PTE_P; 4731 npte |= pmap_pat_flags(flags); 4732 if (wired) 4733 npte |= PTE_WIRED; 4734 if (va < VM_MAXUSER_ADDRESS) 4735 npte |= PTE_U; 4736 4737 if (pmap == pmap_kernel()) 4738 npte |= pmap_pg_g; 4739 if (flags & VM_PROT_ALL) { 4740 npte |= PTE_A; 4741 if (flags & VM_PROT_WRITE) { 4742 KASSERT((npte & PTE_W) != 0); 4743 npte |= PTE_D; 4744 } 4745 } 4746 4747 #ifdef XENPV 4748 if (domid != DOMID_SELF) 4749 new_pg = NULL; 4750 else 4751 #endif 4752 new_pg = PHYS_TO_VM_PAGE(pa); 4753 4754 if (new_pg != NULL) { 4755 /* This is a managed page */ 4756 npte |= PTE_PVLIST; 4757 new_pp = VM_PAGE_TO_PP(new_pg); 4758 PMAP_CHECK_PP(new_pp); 4759 } else if ((new_pp = pmap_pv_tracked(pa)) != NULL) { 4760 /* This is an unmanaged pv-tracked page */ 4761 npte |= PTE_PVLIST; 4762 PMAP_CHECK_PP(new_pp); 4763 } else { 4764 new_pp = NULL; 4765 } 4766 4767 /* Begin by locking the pmap. */ 4768 mutex_enter(&pmap->pm_lock); 4769 4770 /* Look up the PTP. Allocate if none present. */ 4771 ptp = NULL; 4772 getptp = false; 4773 if (pmap != pmap_kernel()) { 4774 ptp = pmap_find_ptp(pmap, va, 1); 4775 if (ptp == NULL) { 4776 getptp = true; 4777 error = pmap_get_ptp(pmap, &pt, va, flags, &ptp); 4778 if (error != 0) { 4779 if (flags & PMAP_CANFAIL) { 4780 mutex_exit(&pmap->pm_lock); 4781 return error; 4782 } 4783 panic("%s: get ptp failed, error=%d", __func__, 4784 error); 4785 } 4786 } 4787 tree = &VM_PAGE_TO_PP(ptp)->pp_rb; 4788 } else { 4789 /* Embedded PV entries rely on this. */ 4790 KASSERT(va != 0); 4791 tree = &pmap_kernel_rb; 4792 } 4793 4794 /* 4795 * Look up the old PV entry at this VA (if any), and insert a new PV 4796 * entry if required for the new mapping. Temporarily track the old 4797 * and new mappings concurrently. Only after the old mapping is 4798 * evicted from the pmap will we remove its PV entry. Otherwise, 4799 * our picture of modified/accessed state for either page could get 4800 * out of sync (we need any P->V operation for either page to stall 4801 * on pmap->pm_lock until done here). 4802 */ 4803 new_pve = NULL; 4804 old_pve = NULL; 4805 samepage = false; 4806 new_embedded = false; 4807 4808 if (new_pp != NULL) { 4809 error = pmap_enter_pv(pmap, new_pp, ptp, va, &new_pve, 4810 &old_pve, &samepage, &new_embedded, tree); 4811 4812 /* 4813 * If a new pv_entry was needed and none was available, we 4814 * can go no further. 
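 * Callers that are prepared to handle failure, such as the uvm fault
 * path, pass PMAP_CANFAIL and simply retry later; all other callers end
 * up in the panic below.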
4815 */ 4816 if (error != 0) { 4817 if (flags & PMAP_CANFAIL) { 4818 if (getptp) { 4819 pmap_unget_ptp(pmap, &pt); 4820 } 4821 mutex_exit(&pmap->pm_lock); 4822 return error; 4823 } 4824 panic("%s: alloc pve failed", __func__); 4825 } 4826 } else { 4827 old_pve = pmap_treelookup_pv(pmap, ptp, tree, va); 4828 } 4829 4830 /* Map PTEs into address space. */ 4831 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); 4832 4833 /* Install any newly allocated PTPs. */ 4834 if (getptp) { 4835 pmap_install_ptp(pmap, &pt, va, pdes); 4836 } 4837 4838 /* Check if there is an existing mapping. */ 4839 ptep = &ptes[pl1_i(va)]; 4840 opte = *ptep; 4841 bool have_oldpa = pmap_valid_entry(opte); 4842 paddr_t oldpa = pmap_pte2pa(opte); 4843 4844 /* 4845 * Update the pte. 4846 */ 4847 do { 4848 opte = *ptep; 4849 4850 /* 4851 * if the same page, inherit PTE_A and PTE_D. 4852 */ 4853 if (((opte ^ npte) & (PTE_FRAME | PTE_P)) == 0) { 4854 npte |= opte & (PTE_A | PTE_D); 4855 } 4856 #if defined(XENPV) 4857 if (domid != DOMID_SELF) { 4858 /* pmap_pte_cas with error handling */ 4859 int s = splvm(); 4860 if (opte != *ptep) { 4861 splx(s); 4862 continue; 4863 } 4864 error = xpq_update_foreign( 4865 vtomach((vaddr_t)ptep), npte, domid); 4866 splx(s); 4867 if (error) { 4868 /* Undo pv_entry tracking - oof. */ 4869 if (new_pp != NULL) { 4870 mutex_spin_enter(&new_pp->pp_lock); 4871 if (new_pve != NULL) { 4872 LIST_REMOVE(new_pve, pve_list); 4873 KASSERT(pmap->pm_pve == NULL); 4874 pmap->pm_pve = new_pve; 4875 } else if (new_embedded) { 4876 new_pp->pp_pte.pte_ptp = NULL; 4877 new_pp->pp_pte.pte_va = 0; 4878 } 4879 mutex_spin_exit(&new_pp->pp_lock); 4880 } 4881 pmap_unmap_ptes(pmap, pmap2); 4882 /* Free new PTP. */ 4883 if (ptp != NULL && ptp->wire_count <= 1) { 4884 pmap_free_ptp(pmap, ptp, va, ptes, 4885 pdes); 4886 } 4887 mutex_exit(&pmap->pm_lock); 4888 return error; 4889 } 4890 break; 4891 } 4892 #endif /* defined(XENPV) */ 4893 } while (pmap_pte_cas(ptep, opte, npte) != opte); 4894 4895 /* 4896 * Done with the PTEs: they can now be unmapped. 4897 */ 4898 pmap_unmap_ptes(pmap, pmap2); 4899 4900 /* 4901 * Update statistics and PTP's reference count. 4902 */ 4903 pmap_stats_update_bypte(pmap, npte, opte); 4904 if (ptp != NULL) { 4905 if (!have_oldpa) { 4906 ptp->wire_count++; 4907 } 4908 /* Remember minimum VA in PTP. */ 4909 pmap_ptp_range_set(ptp, va); 4910 } 4911 KASSERT(ptp == NULL || ptp->wire_count > 1); 4912 4913 /* 4914 * If the same page, we can skip pv_entry handling. 4915 */ 4916 if (((opte ^ npte) & (PTE_FRAME | PTE_P)) == 0) { 4917 KASSERT(((opte ^ npte) & PTE_PVLIST) == 0); 4918 if ((npte & PTE_PVLIST) != 0) { 4919 KASSERT(samepage); 4920 pmap_check_pv(pmap, ptp, new_pp, va, true); 4921 } 4922 goto same_pa; 4923 } else if ((npte & PTE_PVLIST) != 0) { 4924 KASSERT(!samepage); 4925 } 4926 4927 /* 4928 * If old page is pv-tracked, remove pv_entry from its list. 
4929 */ 4930 if ((~opte & (PTE_P | PTE_PVLIST)) == 0) { 4931 if ((old_pg = PHYS_TO_VM_PAGE(oldpa)) != NULL) { 4932 old_pp = VM_PAGE_TO_PP(old_pg); 4933 } else if ((old_pp = pmap_pv_tracked(oldpa)) == NULL) { 4934 panic("%s: PTE_PVLIST with pv-untracked page" 4935 " va = %#"PRIxVADDR 4936 " pa = %#" PRIxPADDR " (%#" PRIxPADDR ")", 4937 __func__, va, oldpa, atop(pa)); 4938 } 4939 4940 pmap_remove_pv(pmap, old_pp, ptp, va, old_pve, 4941 pmap_pte_to_pp_attrs(opte)); 4942 if (old_pve != NULL) { 4943 if (pmap->pm_pve == NULL) { 4944 pmap->pm_pve = old_pve; 4945 } else { 4946 pool_cache_put(&pmap_pv_cache, old_pve); 4947 } 4948 } 4949 } else { 4950 KASSERT(old_pve == NULL); 4951 KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == NULL); 4952 } 4953 4954 /* 4955 * If new page is dynamically PV tracked, insert to tree. 4956 */ 4957 if (new_pve != NULL) { 4958 KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == NULL); 4959 old_pve = rb_tree_insert_node(tree, new_pve); 4960 KASSERT(old_pve == new_pve); 4961 pmap_check_pv(pmap, ptp, new_pp, va, true); 4962 } 4963 4964 same_pa: 4965 /* 4966 * shootdown tlb if necessary. 4967 */ 4968 4969 if ((~opte & (PTE_P | PTE_A)) == 0 && 4970 ((opte ^ npte) & (PTE_FRAME | PTE_W)) != 0) { 4971 pmap_tlb_shootdown(pmap, va, opte, TLBSHOOT_ENTER); 4972 } 4973 mutex_exit(&pmap->pm_lock); 4974 return 0; 4975 } 4976 4977 paddr_t 4978 pmap_get_physpage(void) 4979 { 4980 struct vm_page *ptp; 4981 struct pmap *kpm = pmap_kernel(); 4982 paddr_t pa; 4983 4984 if (!uvm.page_init_done) { 4985 /* 4986 * We're growing the kernel pmap early (from 4987 * uvm_pageboot_alloc()). This case must be 4988 * handled a little differently. 4989 */ 4990 4991 if (!uvm_page_physget(&pa)) 4992 panic("%s: out of memory", __func__); 4993 #if defined(__HAVE_DIRECT_MAP) 4994 pagezero(PMAP_DIRECT_MAP(pa)); 4995 #else 4996 #if defined(XENPV) 4997 if (XEN_VERSION_SUPPORTED(3, 4)) { 4998 xen_pagezero(pa); 4999 return pa; 5000 } 5001 #endif 5002 kpreempt_disable(); 5003 pmap_pte_set(early_zero_pte, pmap_pa2pte(pa) | PTE_P | 5004 PTE_W | pmap_pg_nx); 5005 pmap_pte_flush(); 5006 pmap_update_pg((vaddr_t)early_zerop); 5007 memset(early_zerop, 0, PAGE_SIZE); 5008 #if defined(DIAGNOSTIC) || defined(XENPV) 5009 pmap_pte_set(early_zero_pte, 0); 5010 pmap_pte_flush(); 5011 #endif /* defined(DIAGNOSTIC) */ 5012 kpreempt_enable(); 5013 #endif /* defined(__HAVE_DIRECT_MAP) */ 5014 } else { 5015 /* XXX */ 5016 ptp = uvm_pagealloc(NULL, 0, NULL, 5017 UVM_PGA_USERESERVE|UVM_PGA_ZERO); 5018 if (ptp == NULL) 5019 panic("%s: out of memory", __func__); 5020 ptp->flags &= ~PG_BUSY; 5021 ptp->wire_count = 1; 5022 pa = VM_PAGE_TO_PHYS(ptp); 5023 } 5024 pmap_stats_update(kpm, 1, 0); 5025 5026 return pa; 5027 } 5028 5029 /* 5030 * Expand the page tree with the specified amount of PTPs, mapping virtual 5031 * addresses starting at kva. We populate all the levels but the last one 5032 * (L1). The nodes of the tree are created as RW, but the pages covered 5033 * will be kentered in L1, with proper permissions. 5034 * 5035 * Used only by pmap_growkernel. 
5036 */ 5037 static void 5038 pmap_alloc_level(struct pmap *cpm, vaddr_t kva, long *needed_ptps) 5039 { 5040 unsigned long i; 5041 paddr_t pa; 5042 unsigned long index, endindex; 5043 int level; 5044 pd_entry_t *pdep; 5045 #ifdef XENPV 5046 int s = splvm(); /* protect xpq_* */ 5047 #endif 5048 5049 for (level = PTP_LEVELS; level > 1; level--) { 5050 if (level == PTP_LEVELS) 5051 pdep = cpm->pm_pdir; 5052 else 5053 pdep = normal_pdes[level - 2]; 5054 index = pl_i_roundup(kva, level); 5055 endindex = index + needed_ptps[level - 1] - 1; 5056 5057 for (i = index; i <= endindex; i++) { 5058 pt_entry_t pte; 5059 5060 KASSERT(!pmap_valid_entry(pdep[i])); 5061 pa = pmap_get_physpage(); 5062 pte = pmap_pa2pte(pa) | PTE_P | PTE_W; 5063 #ifdef __x86_64__ 5064 pte |= pmap_pg_nx; 5065 #endif 5066 pmap_pte_set(&pdep[i], pte); 5067 5068 #ifdef XENPV 5069 if (level == PTP_LEVELS && i >= PDIR_SLOT_KERN) { 5070 if (__predict_true( 5071 cpu_info_primary.ci_flags & CPUF_PRESENT)) { 5072 /* update per-cpu PMDs on all cpus */ 5073 xen_kpm_sync(pmap_kernel(), i); 5074 } else { 5075 /* 5076 * too early; update primary CPU 5077 * PMD only (without locks) 5078 */ 5079 #ifdef __x86_64__ 5080 pd_entry_t *cpu_pdep = 5081 &cpu_info_primary.ci_kpm_pdir[i]; 5082 #else 5083 pd_entry_t *cpu_pdep = 5084 &cpu_info_primary.ci_kpm_pdir[l2tol2(i)]; 5085 #endif 5086 pmap_pte_set(cpu_pdep, pte); 5087 } 5088 } 5089 #endif 5090 5091 KASSERT(level != PTP_LEVELS || nkptp[level - 1] + 5092 pl_i(VM_MIN_KERNEL_ADDRESS, level) == i); 5093 nkptp[level - 1]++; 5094 } 5095 pmap_pte_flush(); 5096 } 5097 #ifdef XENPV 5098 splx(s); 5099 #endif 5100 } 5101 5102 /* 5103 * pmap_growkernel: increase usage of KVM space. 5104 * 5105 * => we allocate new PTPs for the kernel and install them in all 5106 * the pmaps on the system. 5107 */ 5108 vaddr_t 5109 pmap_growkernel(vaddr_t maxkvaddr) 5110 { 5111 struct pmap *kpm = pmap_kernel(); 5112 struct pmap *cpm; 5113 #if !defined(XENPV) || !defined(__x86_64__) 5114 struct pmap *pm; 5115 long old; 5116 #endif 5117 int s, i; 5118 long needed_kptp[PTP_LEVELS], target_nptp; 5119 bool invalidate = false; 5120 5121 s = splvm(); /* to be safe */ 5122 mutex_enter(&kpm->pm_lock); 5123 5124 if (maxkvaddr <= pmap_maxkvaddr) { 5125 mutex_exit(&kpm->pm_lock); 5126 splx(s); 5127 return pmap_maxkvaddr; 5128 } 5129 5130 maxkvaddr = x86_round_pdr(maxkvaddr); 5131 #if !defined(XENPV) || !defined(__x86_64__) 5132 old = nkptp[PTP_LEVELS - 1]; 5133 #endif 5134 5135 /* Initialize needed_kptp. */ 5136 for (i = PTP_LEVELS - 1; i >= 1; i--) { 5137 target_nptp = pl_i_roundup(maxkvaddr, i + 1) - 5138 pl_i_roundup(VM_MIN_KERNEL_ADDRESS, i + 1); 5139 5140 if (target_nptp > nkptpmax[i]) 5141 panic("out of KVA space"); 5142 KASSERT(target_nptp >= nkptp[i]); 5143 needed_kptp[i] = target_nptp - nkptp[i]; 5144 } 5145 5146 #ifdef XENPV 5147 /* only pmap_kernel(), or the per-cpu map, has kernel entries */ 5148 cpm = kpm; 5149 #else 5150 /* Get the current pmap */ 5151 if (__predict_true(cpu_info_primary.ci_flags & CPUF_PRESENT)) { 5152 cpm = curcpu()->ci_pmap; 5153 } else { 5154 cpm = kpm; 5155 } 5156 #endif 5157 5158 kasan_shadow_map((void *)pmap_maxkvaddr, 5159 (size_t)(maxkvaddr - pmap_maxkvaddr)); 5160 kmsan_shadow_map((void *)pmap_maxkvaddr, 5161 (size_t)(maxkvaddr - pmap_maxkvaddr)); 5162 5163 pmap_alloc_level(cpm, pmap_maxkvaddr, needed_kptp); 5164 5165 /* 5166 * If the number of top level entries changed, update all pmaps. 
5167 */ 5168 if (needed_kptp[PTP_LEVELS - 1] != 0) { 5169 #ifdef XENPV 5170 #ifdef __x86_64__ 5171 /* nothing, kernel entries are never entered in user pmap */ 5172 #else 5173 int pdkidx; 5174 5175 mutex_enter(&pmaps_lock); 5176 LIST_FOREACH(pm, &pmaps, pm_list) { 5177 for (pdkidx = PDIR_SLOT_KERN + old; 5178 pdkidx < PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1]; 5179 pdkidx++) { 5180 pmap_pte_set(&pm->pm_pdir[pdkidx], 5181 kpm->pm_pdir[pdkidx]); 5182 } 5183 pmap_pte_flush(); 5184 } 5185 mutex_exit(&pmaps_lock); 5186 #endif /* __x86_64__ */ 5187 #else /* XENPV */ 5188 size_t newpdes; 5189 newpdes = nkptp[PTP_LEVELS - 1] - old; 5190 if (cpm != kpm) { 5191 memcpy(&kpm->pm_pdir[PDIR_SLOT_KERN + old], 5192 &cpm->pm_pdir[PDIR_SLOT_KERN + old], 5193 newpdes * sizeof(pd_entry_t)); 5194 } 5195 5196 mutex_enter(&pmaps_lock); 5197 LIST_FOREACH(pm, &pmaps, pm_list) { 5198 if (__predict_false(pm->pm_enter != NULL)) { 5199 /* 5200 * Not a native pmap, the kernel is not mapped, 5201 * so nothing to synchronize. 5202 */ 5203 continue; 5204 } 5205 memcpy(&pm->pm_pdir[PDIR_SLOT_KERN + old], 5206 &kpm->pm_pdir[PDIR_SLOT_KERN + old], 5207 newpdes * sizeof(pd_entry_t)); 5208 } 5209 mutex_exit(&pmaps_lock); 5210 #endif 5211 invalidate = true; 5212 } 5213 pmap_maxkvaddr = maxkvaddr; 5214 mutex_exit(&kpm->pm_lock); 5215 splx(s); 5216 5217 if (invalidate && pmap_initialized) { 5218 /* Invalidate the pmap cache. */ 5219 pool_cache_invalidate(&pmap_cache); 5220 } 5221 5222 return maxkvaddr; 5223 } 5224 5225 #ifdef DEBUG 5226 void pmap_dump(struct pmap *, vaddr_t, vaddr_t); 5227 5228 /* 5229 * pmap_dump: dump all the mappings from a pmap 5230 * 5231 * => caller should not be holding any pmap locks 5232 */ 5233 void 5234 pmap_dump(struct pmap *pmap, vaddr_t sva, vaddr_t eva) 5235 { 5236 pt_entry_t *ptes, *pte; 5237 pd_entry_t * const *pdes; 5238 struct pmap *pmap2; 5239 vaddr_t blkendva; 5240 int lvl; 5241 5242 /* 5243 * if end is out of range truncate. 5244 * if (end == start) update to max. 5245 */ 5246 5247 if (eva > VM_MAXUSER_ADDRESS || eva <= sva) 5248 eva = VM_MAXUSER_ADDRESS; 5249 5250 mutex_enter(&pmap->pm_lock); 5251 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); 5252 5253 /* 5254 * dumping a range of pages: we dump in PTP sized blocks (4MB) 5255 */ 5256 5257 for (/* null */ ; sva < eva ; sva = blkendva) { 5258 5259 /* determine range of block */ 5260 blkendva = x86_round_pdr(sva+1); 5261 if (blkendva > eva) 5262 blkendva = eva; 5263 5264 /* valid block? */ 5265 if (!pmap_pdes_valid(sva, pdes, NULL, &lvl)) 5266 continue; 5267 KASSERT(lvl == 1); 5268 5269 pte = &ptes[pl1_i(sva)]; 5270 for (/* null */; sva < blkendva ; sva += PAGE_SIZE, pte++) { 5271 if (!pmap_valid_entry(*pte)) 5272 continue; 5273 printf("va %#" PRIxVADDR " -> pa %#" PRIxPADDR 5274 " (pte=%#" PRIxPADDR ")\n", 5275 sva, (paddr_t)pmap_pte2pa(*pte), (paddr_t)*pte); 5276 } 5277 } 5278 pmap_unmap_ptes(pmap, pmap2); 5279 mutex_exit(&pmap->pm_lock); 5280 } 5281 #endif 5282 5283 /* 5284 * pmap_update: process deferred invalidations and frees. 5285 */ 5286 void 5287 pmap_update(struct pmap *pmap) 5288 { 5289 struct pmap_page *pp; 5290 struct vm_page *ptp; 5291 5292 /* 5293 * Initiate any pending TLB shootdowns. Wait for them to 5294 * complete before returning control to the caller. 5295 */ 5296 kpreempt_disable(); 5297 pmap_tlb_shootnow(); 5298 kpreempt_enable(); 5299 5300 /* 5301 * Now that shootdowns are complete, process deferred frees. 
This 5302 * is an unlocked check, but is safe as we're only interested in 5303 * work done in this LWP - we won't get a false negative. 5304 */ 5305 if (__predict_false(!LIST_EMPTY(&pmap->pm_gc_ptp))) { 5306 mutex_enter(&pmap->pm_lock); 5307 while ((ptp = LIST_FIRST(&pmap->pm_gc_ptp)) != NULL) { 5308 KASSERT(ptp->wire_count == 0); 5309 KASSERT(ptp->uanon == NULL); 5310 LIST_REMOVE(ptp, mdpage.mp_pp.pp_link); 5311 pp = VM_PAGE_TO_PP(ptp); 5312 LIST_INIT(&pp->pp_pvlist); 5313 pp->pp_attrs = 0; 5314 pp->pp_pte.pte_ptp = NULL; 5315 pp->pp_pte.pte_va = 0; 5316 PMAP_CHECK_PP(VM_PAGE_TO_PP(ptp)); 5317 5318 /* 5319 * XXX Hack to avoid extra locking, and lock 5320 * assertions in uvm_pagefree(). Despite uobject 5321 * being set, this isn't a managed page. 5322 */ 5323 PMAP_DUMMY_LOCK(pmap); 5324 uvm_pagerealloc(ptp, NULL, 0); 5325 PMAP_DUMMY_UNLOCK(pmap); 5326 5327 /* 5328 * XXX for PTPs freed by pmap_remove_ptes() but not 5329 * pmap_zap_ptp(), we could mark them PG_ZERO. 5330 */ 5331 uvm_pagefree(ptp); 5332 } 5333 mutex_exit(&pmap->pm_lock); 5334 } 5335 } 5336 5337 #if PTP_LEVELS > 4 5338 #error "Unsupported number of page table mappings" 5339 #endif 5340 5341 paddr_t 5342 pmap_init_tmp_pgtbl(paddr_t pg) 5343 { 5344 static bool maps_loaded; 5345 static const paddr_t x86_tmp_pml_paddr[] = { 5346 4 * PAGE_SIZE, /* L1 */ 5347 5 * PAGE_SIZE, /* L2 */ 5348 6 * PAGE_SIZE, /* L3 */ 5349 7 * PAGE_SIZE /* L4 */ 5350 }; 5351 static vaddr_t x86_tmp_pml_vaddr[] = { 0, 0, 0, 0 }; 5352 5353 pd_entry_t *tmp_pml, *kernel_pml; 5354 5355 int level; 5356 5357 if (!maps_loaded) { 5358 for (level = 0; level < PTP_LEVELS; ++level) { 5359 x86_tmp_pml_vaddr[level] = 5360 uvm_km_alloc(kernel_map, PAGE_SIZE, 0, 5361 UVM_KMF_VAONLY); 5362 5363 if (x86_tmp_pml_vaddr[level] == 0) 5364 panic("mapping of real mode PML failed\n"); 5365 pmap_kenter_pa(x86_tmp_pml_vaddr[level], 5366 x86_tmp_pml_paddr[level], 5367 VM_PROT_READ | VM_PROT_WRITE, 0); 5368 } 5369 pmap_update(pmap_kernel()); 5370 maps_loaded = true; 5371 } 5372 5373 /* Zero levels 1-3 */ 5374 for (level = 0; level < PTP_LEVELS - 1; ++level) { 5375 tmp_pml = (void *)x86_tmp_pml_vaddr[level]; 5376 memset(tmp_pml, 0, PAGE_SIZE); 5377 } 5378 5379 /* Copy PML4 */ 5380 kernel_pml = pmap_kernel()->pm_pdir; 5381 tmp_pml = (void *)x86_tmp_pml_vaddr[PTP_LEVELS - 1]; 5382 memcpy(tmp_pml, kernel_pml, PAGE_SIZE); 5383 5384 #ifdef PAE 5385 /* 5386 * Use the last 4 entries of the L2 page as L3 PD entries. These 5387 * last entries are unlikely to be used for temporary mappings. 
5388 * 508: maps 0->1GB (userland) 5389 * 509: unused 5390 * 510: unused 5391 * 511: maps 3->4GB (kernel) 5392 */ 5393 tmp_pml[508] = x86_tmp_pml_paddr[PTP_LEVELS - 1] | PTE_P; 5394 tmp_pml[509] = 0; 5395 tmp_pml[510] = 0; 5396 tmp_pml[511] = pmap_pdirpa(pmap_kernel(), PDIR_SLOT_KERN) | PTE_P; 5397 #endif 5398 5399 for (level = PTP_LEVELS - 1; level > 0; --level) { 5400 tmp_pml = (void *)x86_tmp_pml_vaddr[level]; 5401 5402 tmp_pml[pl_i(pg, level + 1)] = 5403 (x86_tmp_pml_paddr[level - 1] & PTE_FRAME) | PTE_W | PTE_P; 5404 } 5405 5406 tmp_pml = (void *)x86_tmp_pml_vaddr[0]; 5407 tmp_pml[pl_i(pg, 1)] = (pg & PTE_FRAME) | PTE_W | PTE_P; 5408 5409 #ifdef PAE 5410 /* Return the PA of the L3 page (entry 508 of the L2 page) */ 5411 return x86_tmp_pml_paddr[PTP_LEVELS - 1] + 508 * sizeof(pd_entry_t); 5412 #endif 5413 5414 return x86_tmp_pml_paddr[PTP_LEVELS - 1]; 5415 } 5416 5417 u_int 5418 x86_mmap_flags(paddr_t mdpgno) 5419 { 5420 u_int nflag = (mdpgno >> X86_MMAP_FLAG_SHIFT) & X86_MMAP_FLAG_MASK; 5421 u_int pflag = 0; 5422 5423 if (nflag & X86_MMAP_FLAG_PREFETCH) 5424 pflag |= PMAP_WRITE_COMBINE; 5425 5426 return pflag; 5427 } 5428 5429 #if defined(__HAVE_DIRECT_MAP) && defined(__x86_64__) && !defined(XEN) 5430 5431 /* 5432 * ----------------------------------------------------------------------------- 5433 * ***************************************************************************** 5434 * ***************************************************************************** 5435 * ***************************************************************************** 5436 * ***************************************************************************** 5437 * **************** HERE BEGINS THE EPT CODE, USED BY INTEL-VMX **************** 5438 * ***************************************************************************** 5439 * ***************************************************************************** 5440 * ***************************************************************************** 5441 * ***************************************************************************** 5442 * ----------------------------------------------------------------------------- 5443 * 5444 * These functions are invoked as callbacks from the code above. Contrary to 5445 * native, EPT does not have a recursive slot; therefore, it is not possible 5446 * to call pmap_map_ptes(). Instead, we use the direct map and walk down the 5447 * tree manually. 5448 * 5449 * Apart from that, the logic is mostly the same as native. Once a pmap has 5450 * been created, NVMM calls pmap_ept_transform() to make it an EPT pmap. 5451 * After that we're good, and the callbacks will handle the translations 5452 * for us. 5453 * 5454 * ----------------------------------------------------------------------------- 5455 */ 5456 5457 /* Hardware bits. */ 5458 #define EPT_R __BIT(0) /* read */ 5459 #define EPT_W __BIT(1) /* write */ 5460 #define EPT_X __BIT(2) /* execute */ 5461 #define EPT_T __BITS(5,3) /* type */ 5462 #define TYPE_UC 0 5463 #define TYPE_WC 1 5464 #define TYPE_WT 4 5465 #define TYPE_WP 5 5466 #define TYPE_WB 6 5467 #define EPT_NOPAT __BIT(6) 5468 #define EPT_L __BIT(7) /* large */ 5469 #define EPT_A __BIT(8) /* accessed */ 5470 #define EPT_D __BIT(9) /* dirty */ 5471 /* Software bits. 
*/ 5472 #define EPT_PVLIST __BIT(60) 5473 #define EPT_WIRED __BIT(61) 5474 5475 #define pmap_ept_valid_entry(pte) (pte & EPT_R) 5476 5477 bool pmap_ept_has_ad __read_mostly; 5478 5479 static inline void 5480 pmap_ept_stats_update_bypte(struct pmap *pmap, pt_entry_t npte, pt_entry_t opte) 5481 { 5482 int resid_diff = ((npte & EPT_R) ? 1 : 0) - ((opte & EPT_R) ? 1 : 0); 5483 int wired_diff = ((npte & EPT_WIRED) ? 1 : 0) - ((opte & EPT_WIRED) ? 1 : 0); 5484 5485 KASSERT((npte & (EPT_R | EPT_WIRED)) != EPT_WIRED); 5486 KASSERT((opte & (EPT_R | EPT_WIRED)) != EPT_WIRED); 5487 5488 pmap_stats_update(pmap, resid_diff, wired_diff); 5489 } 5490 5491 static pt_entry_t 5492 pmap_ept_type(u_int flags) 5493 { 5494 u_int cacheflags = (flags & PMAP_CACHE_MASK); 5495 pt_entry_t ret; 5496 5497 switch (cacheflags) { 5498 case PMAP_NOCACHE: 5499 case PMAP_NOCACHE_OVR: 5500 ret = __SHIFTIN(TYPE_UC, EPT_T); 5501 break; 5502 case PMAP_WRITE_COMBINE: 5503 ret = __SHIFTIN(TYPE_WC, EPT_T); 5504 break; 5505 case PMAP_WRITE_BACK: 5506 default: 5507 ret = __SHIFTIN(TYPE_WB, EPT_T); 5508 break; 5509 } 5510 5511 ret |= EPT_NOPAT; 5512 return ret; 5513 } 5514 5515 static inline pt_entry_t 5516 pmap_ept_prot(vm_prot_t prot) 5517 { 5518 pt_entry_t res = 0; 5519 5520 if (prot & VM_PROT_READ) 5521 res |= EPT_R; 5522 if (prot & VM_PROT_WRITE) 5523 res |= EPT_W; 5524 if (prot & VM_PROT_EXECUTE) 5525 res |= EPT_X; 5526 5527 return res; 5528 } 5529 5530 static inline uint8_t 5531 pmap_ept_to_pp_attrs(pt_entry_t ept) 5532 { 5533 uint8_t ret = 0; 5534 if (pmap_ept_has_ad) { 5535 if (ept & EPT_D) 5536 ret |= PP_ATTRS_D; 5537 if (ept & EPT_A) 5538 ret |= PP_ATTRS_A; 5539 } else { 5540 ret |= (PP_ATTRS_D|PP_ATTRS_A); 5541 } 5542 if (ept & EPT_W) 5543 ret |= PP_ATTRS_W; 5544 return ret; 5545 } 5546 5547 static inline pt_entry_t 5548 pmap_pp_attrs_to_ept(uint8_t attrs) 5549 { 5550 pt_entry_t ept = 0; 5551 if (attrs & PP_ATTRS_D) 5552 ept |= EPT_D; 5553 if (attrs & PP_ATTRS_A) 5554 ept |= EPT_A; 5555 if (attrs & PP_ATTRS_W) 5556 ept |= EPT_W; 5557 return ept; 5558 } 5559 5560 /* 5561 * Helper for pmap_ept_free_ptp. 5562 * tree[0] = &L2[L2idx] 5563 * tree[1] = &L3[L3idx] 5564 * tree[2] = &L4[L4idx] 5565 */ 5566 static void 5567 pmap_ept_get_tree(struct pmap *pmap, vaddr_t va, pd_entry_t **tree) 5568 { 5569 pt_entry_t *pteva; 5570 paddr_t ptepa; 5571 int i, index; 5572 5573 ptepa = pmap->pm_pdirpa[0]; 5574 for (i = PTP_LEVELS; i > 1; i--) { 5575 index = pl_pi(va, i); 5576 pteva = (pt_entry_t *)PMAP_DIRECT_MAP(ptepa); 5577 KASSERT(pmap_ept_valid_entry(pteva[index])); 5578 tree[i - 2] = &pteva[index]; 5579 ptepa = pmap_pte2pa(pteva[index]); 5580 } 5581 } 5582 5583 static void 5584 pmap_ept_free_ptp(struct pmap *pmap, struct vm_page *ptp, vaddr_t va) 5585 { 5586 pd_entry_t *tree[3]; 5587 int level; 5588 5589 KASSERT(pmap != pmap_kernel()); 5590 KASSERT(mutex_owned(&pmap->pm_lock)); 5591 KASSERT(kpreempt_disabled()); 5592 5593 pmap_ept_get_tree(pmap, va, tree); 5594 5595 level = 1; 5596 do { 5597 (void)pmap_pte_testset(tree[level - 1], 0); 5598 5599 pmap_freepage(pmap, ptp, level); 5600 if (level < PTP_LEVELS - 1) { 5601 ptp = pmap_find_ptp(pmap, va, level + 1); 5602 ptp->wire_count--; 5603 if (ptp->wire_count > 1) 5604 break; 5605 } 5606 } while (++level < PTP_LEVELS); 5607 pmap_pte_flush(); 5608 } 5609 5610 /* Allocate L4->L3->L2. Return L2. 
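 * The walk starts from pm_pdirpa[0] and goes through the direct map;
 * intermediate entries are installed as EPT_R | EPT_W | EPT_X, so the
 * effective guest permissions are controlled entirely by the leaf
 * entries written by pmap_ept_enter().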
*/ 5611 static void 5612 pmap_ept_install_ptp(struct pmap *pmap, struct pmap_ptparray *pt, vaddr_t va) 5613 { 5614 struct vm_page *ptp; 5615 unsigned long index; 5616 pd_entry_t *pteva; 5617 paddr_t ptepa; 5618 int i; 5619 5620 KASSERT(pmap != pmap_kernel()); 5621 KASSERT(mutex_owned(&pmap->pm_lock)); 5622 KASSERT(kpreempt_disabled()); 5623 5624 /* 5625 * Now that we have all the pages looked up or allocated, 5626 * loop through again installing any new ones into the tree. 5627 */ 5628 ptepa = pmap->pm_pdirpa[0]; 5629 for (i = PTP_LEVELS; i > 1; i--) { 5630 index = pl_pi(va, i); 5631 pteva = (pt_entry_t *)PMAP_DIRECT_MAP(ptepa); 5632 5633 if (pmap_ept_valid_entry(pteva[index])) { 5634 KASSERT(!pt->alloced[i]); 5635 ptepa = pmap_pte2pa(pteva[index]); 5636 continue; 5637 } 5638 5639 ptp = pt->pg[i]; 5640 ptp->flags &= ~PG_BUSY; /* never busy */ 5641 ptp->wire_count = 1; 5642 pmap->pm_ptphint[i - 2] = ptp; 5643 ptepa = VM_PAGE_TO_PHYS(ptp); 5644 pmap_pte_set(&pteva[index], ptepa | EPT_R | EPT_W | EPT_X); 5645 5646 pmap_pte_flush(); 5647 pmap_stats_update(pmap, 1, 0); 5648 5649 /* 5650 * If we're not in the top level, increase the 5651 * wire count of the parent page. 5652 */ 5653 if (i < PTP_LEVELS) { 5654 pt->pg[i + 1]->wire_count++; 5655 } 5656 } 5657 } 5658 5659 static int 5660 pmap_ept_enter(struct pmap *pmap, vaddr_t va, paddr_t pa, vm_prot_t prot, 5661 u_int flags) 5662 { 5663 pt_entry_t *ptes, opte, npte; 5664 pt_entry_t *ptep; 5665 struct vm_page *ptp; 5666 struct vm_page *new_pg, *old_pg; 5667 struct pmap_page *new_pp, *old_pp; 5668 struct pv_entry *old_pve, *new_pve; 5669 bool wired = (flags & PMAP_WIRED) != 0; 5670 bool accessed; 5671 struct pmap_ptparray pt; 5672 int error; 5673 bool getptp, samepage, new_embedded; 5674 rb_tree_t *tree; 5675 5676 KASSERT(pmap_initialized); 5677 KASSERT(va < VM_MAXUSER_ADDRESS); 5678 5679 npte = pa | pmap_ept_prot(prot) | pmap_ept_type(flags); 5680 5681 if (wired) 5682 npte |= EPT_WIRED; 5683 if (flags & VM_PROT_ALL) { 5684 npte |= EPT_A; 5685 if (flags & VM_PROT_WRITE) { 5686 KASSERT((npte & EPT_W) != 0); 5687 npte |= EPT_D; 5688 } 5689 } 5690 5691 new_pg = PHYS_TO_VM_PAGE(pa); 5692 if (new_pg != NULL) { 5693 /* This is a managed page */ 5694 npte |= EPT_PVLIST; 5695 new_pp = VM_PAGE_TO_PP(new_pg); 5696 } else if ((new_pp = pmap_pv_tracked(pa)) != NULL) { 5697 /* This is an unmanaged pv-tracked page */ 5698 npte |= EPT_PVLIST; 5699 } else { 5700 new_pp = NULL; 5701 } 5702 5703 /* Begin by locking the pmap. */ 5704 mutex_enter(&pmap->pm_lock); 5705 5706 /* Look up the PTP. Allocate if none present. */ 5707 ptp = NULL; 5708 getptp = false; 5709 if (pmap != pmap_kernel()) { 5710 ptp = pmap_find_ptp(pmap, va, 1); 5711 if (ptp == NULL) { 5712 getptp = true; 5713 error = pmap_get_ptp(pmap, &pt, va, flags, &ptp); 5714 if (error != 0) { 5715 if (flags & PMAP_CANFAIL) { 5716 mutex_exit(&pmap->pm_lock); 5717 return error; 5718 } 5719 panic("%s: get ptp failed, error=%d", __func__, 5720 error); 5721 } 5722 } 5723 tree = &VM_PAGE_TO_PP(ptp)->pp_rb; 5724 } else { 5725 /* Embedded PV entries rely on this. */ 5726 KASSERT(va != 0); 5727 tree = &pmap_kernel_rb; 5728 } 5729 5730 /* 5731 * Look up the old PV entry at this VA (if any), and insert a new PV 5732 * entry if required for the new mapping. Temporarily track the old 5733 * and new mappings concurrently. Only after the old mapping is 5734 * evicted from the pmap will we remove its PV entry. 
Otherwise, 5735 * our picture of modified/accessed state for either page could get 5736 * out of sync (we need any P->V operation for either page to stall 5737 * on pmap->pm_lock until done here). 5738 */ 5739 new_pve = NULL; 5740 old_pve = NULL; 5741 samepage = false; 5742 new_embedded = false; 5743 5744 if (new_pp != NULL) { 5745 error = pmap_enter_pv(pmap, new_pp, ptp, va, &new_pve, 5746 &old_pve, &samepage, &new_embedded, tree); 5747 5748 /* 5749 * If a new pv_entry was needed and none was available, we 5750 * can go no further. 5751 */ 5752 if (error != 0) { 5753 if (flags & PMAP_CANFAIL) { 5754 if (getptp) { 5755 pmap_unget_ptp(pmap, &pt); 5756 } 5757 mutex_exit(&pmap->pm_lock); 5758 return error; 5759 } 5760 panic("%s: alloc pve failed", __func__); 5761 } 5762 } else { 5763 old_pve = pmap_treelookup_pv(pmap, ptp, tree, va); 5764 } 5765 5766 /* Map PTEs into address space. */ 5767 kpreempt_disable(); 5768 5769 /* Install any newly allocated PTPs. */ 5770 if (getptp) { 5771 pmap_ept_install_ptp(pmap, &pt, va); 5772 } 5773 5774 /* Check if there is an existing mapping. */ 5775 ptes = (pt_entry_t *)PMAP_DIRECT_MAP(VM_PAGE_TO_PHYS(ptp)); 5776 ptep = &ptes[pl1_pi(va)]; 5777 opte = *ptep; 5778 bool have_oldpa = pmap_ept_valid_entry(opte); 5779 paddr_t oldpa = pmap_pte2pa(opte); 5780 5781 /* 5782 * Update the pte. 5783 */ 5784 do { 5785 opte = *ptep; 5786 5787 /* 5788 * if the same page, inherit PTE_A and PTE_D. 5789 */ 5790 if (((opte ^ npte) & (PTE_FRAME | EPT_R)) == 0) { 5791 npte |= opte & (EPT_A | EPT_D); 5792 } 5793 } while (pmap_pte_cas(ptep, opte, npte) != opte); 5794 5795 /* 5796 * Done with the PTEs: they can now be unmapped. 5797 */ 5798 kpreempt_enable(); 5799 5800 /* 5801 * Update statistics and PTP's reference count. 5802 */ 5803 pmap_ept_stats_update_bypte(pmap, npte, opte); 5804 if (ptp != NULL) { 5805 if (!have_oldpa) { 5806 ptp->wire_count++; 5807 } 5808 /* Remember minimum VA in PTP. */ 5809 pmap_ptp_range_set(ptp, va); 5810 } 5811 KASSERT(ptp == NULL || ptp->wire_count > 1); 5812 5813 /* 5814 * If the same page, we can skip pv_entry handling. 5815 */ 5816 if (((opte ^ npte) & (PTE_FRAME | EPT_R)) == 0) { 5817 KASSERT(((opte ^ npte) & EPT_PVLIST) == 0); 5818 if ((npte & EPT_PVLIST) != 0) { 5819 KASSERT(samepage); 5820 pmap_check_pv(pmap, ptp, new_pp, va, true); 5821 } 5822 goto same_pa; 5823 } else if ((npte & EPT_PVLIST) != 0) { 5824 KASSERT(!samepage); 5825 } 5826 5827 /* 5828 * If old page is pv-tracked, remove pv_entry from its list. 5829 */ 5830 if ((~opte & (EPT_R | EPT_PVLIST)) == 0) { 5831 if ((old_pg = PHYS_TO_VM_PAGE(oldpa)) != NULL) { 5832 old_pp = VM_PAGE_TO_PP(old_pg); 5833 } else if ((old_pp = pmap_pv_tracked(oldpa)) == NULL) { 5834 panic("%s: EPT_PVLIST with pv-untracked page" 5835 " va = %#"PRIxVADDR 5836 " pa = %#" PRIxPADDR " (%#" PRIxPADDR ")", 5837 __func__, va, oldpa, atop(pa)); 5838 } 5839 5840 pmap_remove_pv(pmap, old_pp, ptp, va, old_pve, 5841 pmap_ept_to_pp_attrs(opte)); 5842 if (old_pve != NULL) { 5843 if (pmap->pm_pve == NULL) { 5844 pmap->pm_pve = old_pve; 5845 } else { 5846 pool_cache_put(&pmap_pv_cache, old_pve); 5847 } 5848 } 5849 } else { 5850 KASSERT(old_pve == NULL); 5851 KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == NULL); 5852 } 5853 5854 /* 5855 * If new page is dynamically PV tracked, insert to tree. 
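 * ("dynamically" meaning a separate pv_entry was allocated for this
 * mapping, rather than using the pmap_page's embedded pte slot.)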
5856 */ 5857 if (new_pve != NULL) { 5858 KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == NULL); 5859 old_pve = rb_tree_insert_node(tree, new_pve); 5860 KASSERT(old_pve == new_pve); 5861 pmap_check_pv(pmap, ptp, new_pp, va, true); 5862 } 5863 5864 same_pa: 5865 /* 5866 * shootdown tlb if necessary. 5867 */ 5868 5869 if (pmap_ept_has_ad) { 5870 accessed = (~opte & (EPT_R | EPT_A)) == 0; 5871 } else { 5872 accessed = (opte & EPT_R) != 0; 5873 } 5874 if (accessed && ((opte ^ npte) & (PTE_FRAME | EPT_W)) != 0) { 5875 pmap_tlb_shootdown(pmap, va, 0, TLBSHOOT_ENTER); 5876 } 5877 mutex_exit(&pmap->pm_lock); 5878 return 0; 5879 } 5880 5881 /* Pay close attention, this returns L2. */ 5882 static int 5883 pmap_ept_pdes_invalid(struct pmap *pmap, vaddr_t va, pd_entry_t *lastpde) 5884 { 5885 pt_entry_t *pteva; 5886 paddr_t ptepa; 5887 int i, index; 5888 5889 KASSERT(mutex_owned(&pmap->pm_lock)); 5890 5891 ptepa = pmap->pm_pdirpa[0]; 5892 for (i = PTP_LEVELS; i > 1; i--) { 5893 pteva = (pt_entry_t *)PMAP_DIRECT_MAP(ptepa); 5894 index = pl_pi(va, i); 5895 if (!pmap_ept_valid_entry(pteva[index])) 5896 return i; 5897 ptepa = pmap_pte2pa(pteva[index]); 5898 } 5899 if (lastpde != NULL) { 5900 *lastpde = pteva[index]; 5901 } 5902 5903 return 0; 5904 } 5905 5906 static bool 5907 pmap_ept_extract(struct pmap *pmap, vaddr_t va, paddr_t *pap) 5908 { 5909 pt_entry_t *ptes, pte; 5910 pd_entry_t pde; 5911 paddr_t ptppa, pa; 5912 bool rv; 5913 5914 #ifdef __HAVE_DIRECT_MAP 5915 if (va >= PMAP_DIRECT_BASE && va < PMAP_DIRECT_END) { 5916 if (pap != NULL) { 5917 *pap = PMAP_DIRECT_UNMAP(va); 5918 } 5919 return true; 5920 } 5921 #endif 5922 5923 rv = false; 5924 pa = 0; 5925 5926 mutex_enter(&pmap->pm_lock); 5927 kpreempt_disable(); 5928 5929 if (!pmap_ept_pdes_invalid(pmap, va, &pde)) { 5930 ptppa = pmap_pte2pa(pde); 5931 ptes = (pt_entry_t *)PMAP_DIRECT_MAP(ptppa); 5932 pte = ptes[pl1_pi(va)]; 5933 if (__predict_true((pte & EPT_R) != 0)) { 5934 pa = pmap_pte2pa(pte) | (va & (NBPD_L1 - 1)); 5935 rv = true; 5936 } 5937 } 5938 5939 kpreempt_enable(); 5940 mutex_exit(&pmap->pm_lock); 5941 5942 if (pap != NULL) { 5943 *pap = pa; 5944 } 5945 return rv; 5946 } 5947 5948 static bool 5949 pmap_ept_remove_pte(struct pmap *pmap, struct vm_page *ptp, pt_entry_t *pte, 5950 vaddr_t va, struct pv_entry **pv_tofree) 5951 { 5952 struct pv_entry *pve; 5953 struct vm_page *pg; 5954 struct pmap_page *pp; 5955 pt_entry_t opte; 5956 bool accessed; 5957 5958 KASSERT(pmap != pmap_kernel()); 5959 KASSERT(mutex_owned(&pmap->pm_lock)); 5960 KASSERT(kpreempt_disabled()); 5961 5962 if (!pmap_ept_valid_entry(*pte)) { 5963 /* VA not mapped. */ 5964 return false; 5965 } 5966 5967 /* Atomically save the old PTE and zap it. */ 5968 opte = pmap_pte_testset(pte, 0); 5969 if (!pmap_ept_valid_entry(opte)) { 5970 return false; 5971 } 5972 5973 pmap_ept_stats_update_bypte(pmap, 0, opte); 5974 5975 if (ptp) { 5976 /* 5977 * Dropping a PTE. Make sure that the PDE is flushed. 5978 */ 5979 ptp->wire_count--; 5980 if (ptp->wire_count <= 1) { 5981 opte |= EPT_A; 5982 } 5983 } 5984 5985 if (pmap_ept_has_ad) { 5986 accessed = (opte & EPT_A) != 0; 5987 } else { 5988 accessed = true; 5989 } 5990 if (accessed) { 5991 pmap_tlb_shootdown(pmap, va, 0, TLBSHOOT_REMOVE_PTE); 5992 } 5993 5994 /* 5995 * If we are not on a pv list - we are done. 
5996 */ 5997 if ((opte & EPT_PVLIST) == 0) { 5998 KASSERTMSG((PHYS_TO_VM_PAGE(pmap_pte2pa(opte)) == NULL), 5999 "managed page without EPT_PVLIST for %#"PRIxVADDR, va); 6000 KASSERTMSG((pmap_pv_tracked(pmap_pte2pa(opte)) == NULL), 6001 "pv-tracked page without EPT_PVLIST for %#"PRIxVADDR, va); 6002 KASSERT(pmap_treelookup_pv(pmap, ptp, (ptp != NULL ? 6003 &VM_PAGE_TO_PP(ptp)->pp_rb : &pmap_kernel_rb), va) == NULL); 6004 return true; 6005 } 6006 6007 if ((pg = PHYS_TO_VM_PAGE(pmap_pte2pa(opte))) != NULL) { 6008 pp = VM_PAGE_TO_PP(pg); 6009 } else if ((pp = pmap_pv_tracked(pmap_pte2pa(opte))) == NULL) { 6010 paddr_t pa = pmap_pte2pa(opte); 6011 panic("%s: EPT_PVLIST with pv-untracked page" 6012 " va = %#"PRIxVADDR"pa = %#"PRIxPADDR" (%#"PRIxPADDR")", 6013 __func__, va, pa, atop(pa)); 6014 } 6015 6016 /* Sync R/M bits. */ 6017 pve = pmap_lookup_pv(pmap, ptp, pp, va); 6018 pmap_remove_pv(pmap, pp, ptp, va, pve, pmap_ept_to_pp_attrs(opte)); 6019 6020 if (pve) { 6021 pve->pve_next = *pv_tofree; 6022 *pv_tofree = pve; 6023 } 6024 return true; 6025 } 6026 6027 static void 6028 pmap_ept_remove_ptes(struct pmap *pmap, struct vm_page *ptp, vaddr_t ptpva, 6029 vaddr_t startva, vaddr_t endva, struct pv_entry **pv_tofree) 6030 { 6031 pt_entry_t *pte = (pt_entry_t *)ptpva; 6032 6033 KASSERT(pmap != pmap_kernel()); 6034 KASSERT(mutex_owned(&pmap->pm_lock)); 6035 KASSERT(kpreempt_disabled()); 6036 6037 /* 6038 * mappings are very often sparse, so clip the given range to the 6039 * range of PTEs that are known present in the PTP. 6040 */ 6041 pmap_ptp_range_clip(ptp, &startva, &pte); 6042 6043 /* 6044 * note that ptpva points to the PTE that maps startva. this may 6045 * or may not be the first PTE in the PTP. 6046 * 6047 * we loop through the PTP while there are still PTEs to look at 6048 * and the wire_count is greater than 1 (because we use the wire_count 6049 * to keep track of the number of real PTEs in the PTP). 6050 */ 6051 while (startva < endva && (ptp == NULL || ptp->wire_count > 1)) { 6052 (void)pmap_ept_remove_pte(pmap, ptp, pte, startva, pv_tofree); 6053 startva += PAGE_SIZE; 6054 pte++; 6055 } 6056 } 6057 6058 static void 6059 pmap_ept_remove(struct pmap *pmap, vaddr_t sva, vaddr_t eva) 6060 { 6061 struct pv_entry *pv_tofree = NULL; 6062 pt_entry_t *ptes; 6063 pd_entry_t pde; 6064 paddr_t ptppa; 6065 vaddr_t blkendva, va = sva; 6066 struct vm_page *ptp; 6067 6068 mutex_enter(&pmap->pm_lock); 6069 kpreempt_disable(); 6070 6071 for (/* null */ ; va < eva ; va = blkendva) { 6072 int lvl; 6073 6074 /* determine range of block */ 6075 blkendva = x86_round_pdr(va+1); 6076 if (blkendva > eva) 6077 blkendva = eva; 6078 6079 lvl = pmap_ept_pdes_invalid(pmap, va, &pde); 6080 if (lvl != 0) { 6081 /* Skip a range corresponding to an invalid pde. */ 6082 blkendva = (va & ptp_frames[lvl - 1]) + nbpd[lvl - 1]; 6083 continue; 6084 } 6085 6086 /* PA of the PTP */ 6087 ptppa = pmap_pte2pa(pde); 6088 6089 ptp = pmap_find_ptp(pmap, va, 1); 6090 KASSERTMSG(ptp != NULL, "%s: unmanaged PTP detected", 6091 __func__); 6092 6093 ptes = (pt_entry_t *)PMAP_DIRECT_MAP(ptppa); 6094 6095 pmap_ept_remove_ptes(pmap, ptp, (vaddr_t)&ptes[pl1_pi(va)], va, 6096 blkendva, &pv_tofree); 6097 6098 /* If PTP is no longer being used, free it. 
static void
pmap_ept_remove(struct pmap *pmap, vaddr_t sva, vaddr_t eva)
{
        struct pv_entry *pv_tofree = NULL;
        pt_entry_t *ptes;
        pd_entry_t pde;
        paddr_t ptppa;
        vaddr_t blkendva, va = sva;
        struct vm_page *ptp;

        mutex_enter(&pmap->pm_lock);
        kpreempt_disable();

        for (/* null */ ; va < eva ; va = blkendva) {
                int lvl;

                /* determine range of block */
                blkendva = x86_round_pdr(va+1);
                if (blkendva > eva)
                        blkendva = eva;

                lvl = pmap_ept_pdes_invalid(pmap, va, &pde);
                if (lvl != 0) {
                        /* Skip a range corresponding to an invalid pde. */
                        blkendva = (va & ptp_frames[lvl - 1]) + nbpd[lvl - 1];
                        continue;
                }

                /* PA of the PTP */
                ptppa = pmap_pte2pa(pde);

                ptp = pmap_find_ptp(pmap, va, 1);
                KASSERTMSG(ptp != NULL, "%s: unmanaged PTP detected",
                    __func__);

                ptes = (pt_entry_t *)PMAP_DIRECT_MAP(ptppa);

                pmap_ept_remove_ptes(pmap, ptp, (vaddr_t)&ptes[pl1_pi(va)], va,
                    blkendva, &pv_tofree);

                /* If PTP is no longer being used, free it. */
                if (ptp && ptp->wire_count <= 1) {
                        pmap_ept_free_ptp(pmap, ptp, va);
                }
        }

        kpreempt_enable();
        if (pv_tofree != NULL) {
                pmap_free_pvs(pmap, pv_tofree);
        }
        mutex_exit(&pmap->pm_lock);
}

static int
pmap_ept_sync_pv(struct vm_page *ptp, vaddr_t va, paddr_t pa, int clearbits,
    uint8_t *oattrs, pt_entry_t *optep)
{
        struct pmap *pmap;
        pt_entry_t *ptep;
        pt_entry_t opte;
        pt_entry_t npte;
        pt_entry_t expect;
        bool need_shootdown;

        expect = pmap_pa2pte(pa) | EPT_R;
        pmap = ptp_to_pmap(ptp);

        if (clearbits != ~0) {
                KASSERT((clearbits & ~(PP_ATTRS_D|PP_ATTRS_A|PP_ATTRS_W)) == 0);
                clearbits = pmap_pp_attrs_to_ept(clearbits);
        }

        ptep = pmap_map_pte(pmap, ptp, va);
        do {
                opte = *ptep;
                KASSERT((opte & (EPT_D | EPT_A)) != EPT_D);
                KASSERT((opte & (EPT_A | EPT_R)) != EPT_A);
                KASSERT(opte == 0 || (opte & EPT_R) != 0);
                if ((opte & (PTE_FRAME | EPT_R)) != expect) {
                        /*
                         * We lost a race with a V->P operation like
                         * pmap_remove().  Wait for the competitor
                         * reflecting pte bits into mp_attrs.
                         */
                        pmap_unmap_pte();
                        return EAGAIN;
                }

                /*
                 * Check if there's anything to do on this PTE.
                 */
                if ((opte & clearbits) == 0) {
                        need_shootdown = false;
                        break;
                }

                /*
                 * We need a shootdown if the PTE is cached (EPT_A) ...
                 * ... Unless we are clearing only the EPT_W bit and
                 * it isn't cached as RW (EPT_D).
                 */
                if (pmap_ept_has_ad) {
                        need_shootdown = (opte & EPT_A) != 0 &&
                            !(clearbits == EPT_W && (opte & EPT_D) == 0);
                } else {
                        need_shootdown = true;
                }

                npte = opte & ~clearbits;

                /*
                 * If we need a shootdown anyway, clear EPT_A and EPT_D.
                 */
                if (need_shootdown) {
                        npte &= ~(EPT_A | EPT_D);
                }
                KASSERT((npte & (EPT_D | EPT_A)) != EPT_D);
                KASSERT((npte & (EPT_A | EPT_R)) != EPT_A);
                KASSERT(npte == 0 || (opte & EPT_R) != 0);
        } while (pmap_pte_cas(ptep, opte, npte) != opte);

        if (need_shootdown) {
                pmap_tlb_shootdown(pmap, va, 0, TLBSHOOT_SYNC_PV);
        }
        pmap_unmap_pte();

        *oattrs = pmap_ept_to_pp_attrs(opte);
        if (optep != NULL)
                *optep = opte;
        return 0;
}

static void
pmap_ept_pp_remove_ent(struct pmap *pmap, struct vm_page *ptp, pt_entry_t opte,
    vaddr_t va)
{

        KASSERT(mutex_owned(&pmap->pm_lock));

        pmap_ept_stats_update_bypte(pmap, 0, opte);
        ptp->wire_count--;
        if (ptp->wire_count <= 1) {
                pmap_ept_free_ptp(pmap, ptp, va);
        }
}
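
/*
 * pmap_ept_write_protect: reduce the protection on a range of mappings.
 *
 * => only ever removes the EPT_W (write) bit; it never adds permissions
 */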
static void
pmap_ept_write_protect(struct pmap *pmap, vaddr_t sva, vaddr_t eva,
    vm_prot_t prot)
{
        pt_entry_t bit_rem;
        pt_entry_t *ptes, *spte;
        pt_entry_t opte, npte;
        pd_entry_t pde;
        paddr_t ptppa;
        vaddr_t va;
        bool modified;

        bit_rem = 0;
        if (!(prot & VM_PROT_WRITE))
                bit_rem = EPT_W;

        sva &= PTE_FRAME;
        eva &= PTE_FRAME;

        /* Acquire pmap. */
        mutex_enter(&pmap->pm_lock);
        kpreempt_disable();

        for (va = sva; va < eva; va += PAGE_SIZE) {
                if (pmap_ept_pdes_invalid(pmap, va, &pde)) {
                        continue;
                }

                ptppa = pmap_pte2pa(pde);
                ptes = (pt_entry_t *)PMAP_DIRECT_MAP(ptppa);
                spte = &ptes[pl1_pi(va)];

                do {
                        opte = *spte;
                        if (!pmap_ept_valid_entry(opte)) {
                                goto next;
                        }
                        npte = (opte & ~bit_rem);
                } while (pmap_pte_cas(spte, opte, npte) != opte);

                if (pmap_ept_has_ad) {
                        modified = (opte & EPT_D) != 0;
                } else {
                        modified = true;
                }
                if (modified) {
                        vaddr_t tva = x86_ptob(spte - ptes);
                        pmap_tlb_shootdown(pmap, tva, 0,
                            TLBSHOOT_WRITE_PROTECT);
                }
next:;
        }

        kpreempt_enable();
        mutex_exit(&pmap->pm_lock);
}

static void
pmap_ept_unwire(struct pmap *pmap, vaddr_t va)
{
        pt_entry_t *ptes, *ptep, opte;
        pd_entry_t pde;
        paddr_t ptppa;

        /* Acquire pmap. */
        mutex_enter(&pmap->pm_lock);
        kpreempt_disable();

        if (pmap_ept_pdes_invalid(pmap, va, &pde)) {
                panic("%s: invalid PDE va=%#" PRIxVADDR, __func__, va);
        }

        ptppa = pmap_pte2pa(pde);
        ptes = (pt_entry_t *)PMAP_DIRECT_MAP(ptppa);
        ptep = &ptes[pl1_pi(va)];
        opte = *ptep;
        KASSERT(pmap_ept_valid_entry(opte));

        if (opte & EPT_WIRED) {
                pt_entry_t npte = opte & ~EPT_WIRED;

                opte = pmap_pte_testset(ptep, npte);
                pmap_ept_stats_update_bypte(pmap, npte, opte);
        } else {
                printf("%s: wiring for pmap %p va %#" PRIxVADDR
                    " did not change!\n", __func__, pmap, va);
        }

        /* Release pmap. */
        kpreempt_enable();
        mutex_exit(&pmap->pm_lock);
}

/* -------------------------------------------------------------------------- */

/*
 * pmap_ept_transform: install the EPT entry points in the pmap and clear
 * its top-level page directory.
 */
void
pmap_ept_transform(struct pmap *pmap)
{
        pmap->pm_enter = pmap_ept_enter;
        pmap->pm_extract = pmap_ept_extract;
        pmap->pm_remove = pmap_ept_remove;
        pmap->pm_sync_pv = pmap_ept_sync_pv;
        pmap->pm_pp_remove_ent = pmap_ept_pp_remove_ent;
        pmap->pm_write_protect = pmap_ept_write_protect;
        pmap->pm_unwire = pmap_ept_unwire;

        memset(pmap->pm_pdir, 0, PAGE_SIZE);
}

#endif /* __HAVE_DIRECT_MAP && __x86_64__ && !XEN */