1 /* $NetBSD: pmap.c,v 1.406 2020/09/02 17:37:57 bouyer Exp $ */ 2 3 /* 4 * Copyright (c) 2008, 2010, 2016, 2017, 2019, 2020 The NetBSD Foundation, Inc. 5 * All rights reserved. 6 * 7 * This code is derived from software contributed to The NetBSD Foundation 8 * by Andrew Doran, and by Maxime Villard. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 29 * POSSIBILITY OF SUCH DAMAGE. 30 */ 31 32 /* 33 * Copyright (c) 2007 Manuel Bouyer. 34 * 35 * Redistribution and use in source and binary forms, with or without 36 * modification, are permitted provided that the following conditions 37 * are met: 38 * 1. Redistributions of source code must retain the above copyright 39 * notice, this list of conditions and the following disclaimer. 40 * 2. Redistributions in binary form must reproduce the above copyright 41 * notice, this list of conditions and the following disclaimer in the 42 * documentation and/or other materials provided with the distribution. 43 * 44 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 45 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 46 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 47 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 48 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 49 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 50 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 51 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 52 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 53 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 54 */ 55 56 /* 57 * Copyright (c) 2006 Mathieu Ropert <mro@adviseo.fr> 58 * 59 * Permission to use, copy, modify, and distribute this software for any 60 * purpose with or without fee is hereby granted, provided that the above 61 * copyright notice and this permission notice appear in all copies. 62 * 63 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 64 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 65 * MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 66 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 67 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 68 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 69 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 70 */ 71 72 /* 73 * Copyright 2001 (c) Wasabi Systems, Inc. 74 * All rights reserved. 75 * 76 * Written by Frank van der Linden for Wasabi Systems, Inc. 77 * 78 * Redistribution and use in source and binary forms, with or without 79 * modification, are permitted provided that the following conditions 80 * are met: 81 * 1. Redistributions of source code must retain the above copyright 82 * notice, this list of conditions and the following disclaimer. 83 * 2. Redistributions in binary form must reproduce the above copyright 84 * notice, this list of conditions and the following disclaimer in the 85 * documentation and/or other materials provided with the distribution. 86 * 3. All advertising materials mentioning features or use of this software 87 * must display the following acknowledgement: 88 * This product includes software developed for the NetBSD Project by 89 * Wasabi Systems, Inc. 90 * 4. The name of Wasabi Systems, Inc. may not be used to endorse 91 * or promote products derived from this software without specific prior 92 * written permission. 93 * 94 * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND 95 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 96 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 97 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL WASABI SYSTEMS, INC 98 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 99 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 100 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 101 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 102 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 103 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 104 * POSSIBILITY OF SUCH DAMAGE. 105 */ 106 107 /* 108 * Copyright (c) 1997 Charles D. Cranor and Washington University. 109 * All rights reserved. 110 * 111 * Redistribution and use in source and binary forms, with or without 112 * modification, are permitted provided that the following conditions 113 * are met: 114 * 1. Redistributions of source code must retain the above copyright 115 * notice, this list of conditions and the following disclaimer. 116 * 2. Redistributions in binary form must reproduce the above copyright 117 * notice, this list of conditions and the following disclaimer in the 118 * documentation and/or other materials provided with the distribution. 119 * 120 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 121 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 122 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
123 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 124 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 125 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 126 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 127 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 128 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 129 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 130 */ 131 132 #include <sys/cdefs.h> 133 __KERNEL_RCSID(0, "$NetBSD: pmap.c,v 1.406 2020/09/02 17:37:57 bouyer Exp $"); 134 135 #include "opt_user_ldt.h" 136 #include "opt_lockdebug.h" 137 #include "opt_multiprocessor.h" 138 #include "opt_xen.h" 139 #include "opt_svs.h" 140 #include "opt_kaslr.h" 141 142 #define __MUTEX_PRIVATE /* for assertions */ 143 144 #include <sys/param.h> 145 #include <sys/systm.h> 146 #include <sys/proc.h> 147 #include <sys/pool.h> 148 #include <sys/kernel.h> 149 #include <sys/atomic.h> 150 #include <sys/cpu.h> 151 #include <sys/intr.h> 152 #include <sys/xcall.h> 153 #include <sys/kcore.h> 154 #include <sys/kmem.h> 155 #include <sys/asan.h> 156 #include <sys/msan.h> 157 #include <sys/entropy.h> 158 159 #include <uvm/uvm.h> 160 #include <uvm/pmap/pmap_pvt.h> 161 162 #include <dev/isa/isareg.h> 163 164 #include <machine/specialreg.h> 165 #include <machine/gdt.h> 166 #include <machine/isa_machdep.h> 167 #include <machine/cpuvar.h> 168 #include <machine/cputypes.h> 169 170 #include <x86/pmap.h> 171 #include <x86/pmap_pv.h> 172 173 #include <x86/i82489reg.h> 174 #include <x86/i82489var.h> 175 176 #ifdef XEN 177 #include <xen/include/public/xen.h> 178 #include <xen/hypervisor.h> 179 #endif 180 181 /* 182 * general info: 183 * 184 * - for an explanation of how the x86 MMU hardware works see 185 * the comments in <machine/pte.h>. 186 * 187 * - for an explanation of the general memory structure used by 188 * this pmap (including the recursive mapping), see the comments 189 * in <machine/pmap.h>. 190 * 191 * this file contains the code for the "pmap module." the module's 192 * job is to manage the hardware's virtual to physical address mappings. 193 * note that there are two levels of mapping in the VM system: 194 * 195 * [1] the upper layer of the VM system uses vm_map's and vm_map_entry's 196 * to map ranges of virtual address space to objects/files. for 197 * example, the vm_map may say: "map VA 0x1000 to 0x22000 read-only 198 * to the file /bin/ls starting at offset zero." note that 199 * the upper layer mapping is not concerned with how individual 200 * vm_pages are mapped. 201 * 202 * [2] the lower layer of the VM system (the pmap) maintains the mappings 203 * from virtual addresses. it is concerned with which vm_page is 204 * mapped where. for example, when you run /bin/ls and start 205 * at page 0x1000 the fault routine may lookup the correct page 206 * of the /bin/ls file and then ask the pmap layer to establish 207 * a mapping for it. 208 * 209 * note that information in the lower layer of the VM system can be 210 * thrown away since it can easily be reconstructed from the info 211 * in the upper layer. 
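 *
 * as a rough illustration of that split (a sketch using the standard
 * MI pmap interface, not code from this file): once the upper layer
 * has resolved a fault at "va" in "map" to a vm_page "pg", it asks the
 * pmap layer to install the translation and later flushes any deferred
 * TLB work:
 *
 *        error = pmap_enter(vm_map_pmap(map), va, VM_PAGE_TO_PHYS(pg),
 *            VM_PROT_READ, PMAP_CANFAIL);
 *        ...
 *        pmap_update(vm_map_pmap(map));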
212 * 213 * data structures we use include: 214 * 215 * - struct pmap: describes the address space of one thread 216 * - struct pmap_page: describes one pv-tracked page, without 217 * necessarily a corresponding vm_page 218 * - struct pv_entry: describes one <PMAP,VA> mapping of a PA 219 * - pmap_page::pp_pvlist: there is one list per pv-tracked page of 220 * physical memory. the pp_pvlist points to a list of pv_entry 221 * structures which describe all the <PMAP,VA> pairs that this 222 * page is mapped in. this is critical for page based operations 223 * such as pmap_page_protect() [change protection on _all_ mappings 224 * of a page] 225 */ 226 227 /* 228 * Locking 229 * 230 * We have the following locks that we must deal with, listed in the order 231 * that they are acquired: 232 * 233 * pg->uobject->vmobjlock, pg->uanon->an_lock 234 * 235 * For managed pages, these per-object locks are taken by the VM system 236 * before calling into the pmap module - either a read or write hold. 237 * The lock hold prevent pages from changing identity while the pmap is 238 * operating on them. For example, the same lock is held across a call 239 * to pmap_remove() and the following call to pmap_update(), so that a 240 * page does not gain a new identity while its TLB visibility is stale. 241 * 242 * pmap->pm_lock 243 * 244 * This lock protects the fields in the pmap structure including the 245 * non-kernel PDEs in the PDP, the PTEs, and PTPs and connected data 246 * structures. For modifying unmanaged kernel PTEs it is not needed as 247 * kernel PDEs are never freed, and the kernel is expected to be self 248 * consistent (and the lock can't be taken for unmanaged kernel PTEs, 249 * because they can be modified from interrupt context). 250 * 251 * pmaps_lock 252 * 253 * This lock protects the list of active pmaps (headed by "pmaps"). 254 * It's acquired when adding or removing pmaps or adjusting kernel PDEs. 255 * 256 * pp_lock 257 * 258 * This per-page lock protects PV entry lists and the embedded PV entry 259 * in each vm_page, allowing for concurrent operation on pages by 260 * different pmaps. This is a spin mutex at IPL_VM, because at the 261 * points it is taken context switching is usually not tolerable, and 262 * spin mutexes must block out interrupts that could take kernel_lock. 263 */ 264 265 /* uvm_object is abused here to index pmap_pages; make assertions happy. */ 266 #ifdef DIAGNOSTIC 267 #define PMAP_DUMMY_LOCK(pm) rw_enter(&(pm)->pm_dummy_lock, RW_WRITER) 268 #define PMAP_DUMMY_UNLOCK(pm) rw_exit(&(pm)->pm_dummy_lock) 269 #else 270 #define PMAP_DUMMY_LOCK(pm) 271 #define PMAP_DUMMY_UNLOCK(pm) 272 #endif 273 274 static const struct uvm_pagerops pmap_pager = { 275 /* nothing */ 276 }; 277 278 const vaddr_t ptp_masks[] = PTP_MASK_INITIALIZER; 279 const vaddr_t ptp_frames[] = PTP_FRAME_INITIALIZER; 280 const int ptp_shifts[] = PTP_SHIFT_INITIALIZER; 281 const long nkptpmax[] = NKPTPMAX_INITIALIZER; 282 const long nbpd[] = NBPD_INITIALIZER; 283 #ifdef i386 284 pd_entry_t * const normal_pdes[] = PDES_INITIALIZER; 285 #else 286 pd_entry_t *normal_pdes[3]; 287 #endif 288 289 long nkptp[] = NKPTP_INITIALIZER; 290 291 struct pmap_head pmaps; 292 kmutex_t pmaps_lock __cacheline_aligned; 293 294 struct pcpu_area *pcpuarea __read_mostly; 295 296 static vaddr_t pmap_maxkvaddr; 297 298 /* 299 * Misc. event counters. 
300 */ 301 struct evcnt pmap_iobmp_evcnt; 302 struct evcnt pmap_ldt_evcnt; 303 304 /* 305 * PAT 306 */ 307 static bool cpu_pat_enabled __read_mostly = false; 308 309 /* 310 * Global data structures 311 */ 312 313 static struct pmap kernel_pmap_store __cacheline_aligned; /* kernel's pmap */ 314 struct pmap *const kernel_pmap_ptr = &kernel_pmap_store; 315 static rb_tree_t pmap_kernel_rb __cacheline_aligned; 316 317 struct bootspace bootspace __read_mostly; 318 struct slotspace slotspace __read_mostly; 319 320 /* Set to PTE_NX if supported. */ 321 pd_entry_t pmap_pg_nx __read_mostly = 0; 322 323 /* Set to PTE_G if supported. */ 324 pd_entry_t pmap_pg_g __read_mostly = 0; 325 326 /* Set to true if large pages are supported. */ 327 int pmap_largepages __read_mostly = 0; 328 329 paddr_t lowmem_rsvd __read_mostly; 330 paddr_t avail_start __read_mostly; /* PA of first available physical page */ 331 paddr_t avail_end __read_mostly; /* PA of last available physical page */ 332 333 #ifdef XENPV 334 paddr_t pmap_pa_start; /* PA of first physical page for this domain */ 335 paddr_t pmap_pa_end; /* PA of last physical page for this domain */ 336 #endif 337 338 #define VM_PAGE_TO_PP(pg) (&(pg)->mdpage.mp_pp) 339 #define PMAP_CHECK_PP(pp) \ 340 KASSERTMSG((pp)->pp_lock.mtx_ipl._ipl == IPL_VM, "bad pmap_page %p", pp) 341 342 /* 343 * Other data structures 344 */ 345 346 static pt_entry_t protection_codes[8] __read_mostly; 347 348 static bool pmap_initialized __read_mostly = false; /* pmap_init done yet? */ 349 350 /* 351 * The following two vaddr_t's are used during system startup to keep track of 352 * how much of the kernel's VM space we have used. Once the system is started, 353 * the management of the remaining kernel VM space is turned over to the 354 * kernel_map vm_map. 355 */ 356 static vaddr_t virtual_avail __read_mostly; /* VA of first free KVA */ 357 static vaddr_t virtual_end __read_mostly; /* VA of last free KVA */ 358 359 #ifndef XENPV 360 /* 361 * LAPIC virtual address, and fake physical address. 
362 */ 363 volatile vaddr_t local_apic_va __read_mostly; 364 paddr_t local_apic_pa __read_mostly; 365 #endif 366 367 /* 368 * pool that pmap structures are allocated from 369 */ 370 struct pool_cache pmap_cache; 371 static int pmap_ctor(void *, void *, int); 372 static void pmap_dtor(void *, void *); 373 374 /* 375 * pv_page cache 376 */ 377 static struct pool_cache pmap_pvp_cache; 378 379 #ifdef __HAVE_DIRECT_MAP 380 vaddr_t pmap_direct_base __read_mostly; 381 vaddr_t pmap_direct_end __read_mostly; 382 #endif 383 384 #ifndef __HAVE_DIRECT_MAP 385 /* 386 * Special VAs and the PTEs that map them 387 */ 388 static pt_entry_t *early_zero_pte; 389 static void pmap_vpage_cpualloc(struct cpu_info *); 390 #ifdef XENPV 391 char *early_zerop; /* also referenced from xen_locore() */ 392 #else 393 static char *early_zerop; 394 #endif 395 #endif 396 397 int pmap_enter_default(pmap_t, vaddr_t, paddr_t, vm_prot_t, u_int); 398 399 /* PDP pool and its callbacks */ 400 static struct pool pmap_pdp_pool; 401 static void pmap_pdp_init(pd_entry_t *); 402 static void pmap_pdp_fini(pd_entry_t *); 403 404 #ifdef PAE 405 /* need to allocate items of 4 pages */ 406 static void *pmap_pdp_alloc(struct pool *, int); 407 static void pmap_pdp_free(struct pool *, void *); 408 static struct pool_allocator pmap_pdp_allocator = { 409 .pa_alloc = pmap_pdp_alloc, 410 .pa_free = pmap_pdp_free, 411 .pa_pagesz = PAGE_SIZE * PDP_SIZE, 412 }; 413 #endif 414 415 extern vaddr_t idt_vaddr; 416 extern paddr_t idt_paddr; 417 extern vaddr_t gdt_vaddr; 418 extern paddr_t gdt_paddr; 419 extern vaddr_t ldt_vaddr; 420 extern paddr_t ldt_paddr; 421 422 #ifdef i386 423 /* stuff to fix the pentium f00f bug */ 424 extern vaddr_t pentium_idt_vaddr; 425 #endif 426 427 /* Array of freshly allocated PTPs, for pmap_get_ptp(). */ 428 struct pmap_ptparray { 429 struct vm_page *pg[PTP_LEVELS + 1]; 430 bool alloced[PTP_LEVELS + 1]; 431 }; 432 433 /* 434 * PV entries are allocated in page-sized chunks and cached per-pmap to 435 * avoid intense pressure on memory allocators. 
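 *
 * a sketch of the resulting page layout (assuming, as pmap_pvp_ctor()
 * asserts, that the struct pv_page header fits within one pv_entry
 * slot at the start of the page):
 *
 *        +----------------+----------+----------+-----+----------+
 *        | struct pv_page | pv_entry | pv_entry | ... | pv_entry |
 *        +----------------+----------+----------+-----+----------+
 *          header           PVE_PER_PVP = PAGE_SIZE / sizeof(struct
 *                           pv_entry) - 1 entries, kept on pvp_pves
 *                           until handed out by pmap_alloc_pv()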
436 */ 437 438 struct pv_page { 439 LIST_HEAD(, pv_entry) pvp_pves; 440 LIST_ENTRY(pv_page) pvp_list; 441 long pvp_nfree; 442 struct pmap *pvp_pmap; 443 }; 444 445 #define PVE_PER_PVP ((PAGE_SIZE / sizeof(struct pv_entry)) - 1) 446 447 /* 448 * PV tree prototypes 449 */ 450 451 static int pmap_compare_key(void *, const void *, const void *); 452 static int pmap_compare_nodes(void *, const void *, const void *); 453 454 /* Read-black tree */ 455 static const rb_tree_ops_t pmap_rbtree_ops = { 456 .rbto_compare_nodes = pmap_compare_nodes, 457 .rbto_compare_key = pmap_compare_key, 458 .rbto_node_offset = offsetof(struct pv_entry, pve_rb), 459 .rbto_context = NULL 460 }; 461 462 /* 463 * Local prototypes 464 */ 465 466 #ifdef __HAVE_PCPU_AREA 467 static void pmap_init_pcpu(void); 468 #endif 469 #ifdef __HAVE_DIRECT_MAP 470 static void pmap_init_directmap(struct pmap *); 471 #endif 472 #if !defined(XENPV) 473 static void pmap_remap_global(void); 474 #endif 475 #ifndef XENPV 476 static void pmap_init_lapic(void); 477 static void pmap_remap_largepages(void); 478 #endif 479 480 static int pmap_get_ptp(struct pmap *, struct pmap_ptparray *, vaddr_t, int, 481 struct vm_page **); 482 static void pmap_unget_ptp(struct pmap *, struct pmap_ptparray *); 483 static void pmap_install_ptp(struct pmap *, struct pmap_ptparray *, vaddr_t, 484 pd_entry_t * const *); 485 static struct vm_page *pmap_find_ptp(struct pmap *, vaddr_t, int); 486 static void pmap_freepage(struct pmap *, struct vm_page *, int); 487 static void pmap_free_ptp(struct pmap *, struct vm_page *, vaddr_t, 488 pt_entry_t *, pd_entry_t * const *); 489 static bool pmap_remove_pte(struct pmap *, struct vm_page *, pt_entry_t *, 490 vaddr_t); 491 static void pmap_remove_ptes(struct pmap *, struct vm_page *, vaddr_t, vaddr_t, 492 vaddr_t); 493 static int pmap_pvp_ctor(void *, void *, int); 494 static void pmap_pvp_dtor(void *, void *); 495 static struct pv_entry *pmap_alloc_pv(struct pmap *); 496 static void pmap_free_pv(struct pmap *, struct pv_entry *); 497 static void pmap_drain_pv(struct pmap *); 498 499 static void pmap_alloc_level(struct pmap *, vaddr_t, long *); 500 501 static void pmap_load1(struct lwp *, struct pmap *, struct pmap *); 502 static void pmap_reactivate(struct pmap *); 503 504 /* 505 * p m a p h e l p e r f u n c t i o n s 506 */ 507 508 static inline void 509 pmap_stats_update(struct pmap *pmap, int resid_diff, int wired_diff) 510 { 511 512 KASSERT(cold || mutex_owned(&pmap->pm_lock)); 513 pmap->pm_stats.resident_count += resid_diff; 514 pmap->pm_stats.wired_count += wired_diff; 515 } 516 517 static inline void 518 pmap_stats_update_bypte(struct pmap *pmap, pt_entry_t npte, pt_entry_t opte) 519 { 520 int resid_diff = ((npte & PTE_P) ? 1 : 0) - ((opte & PTE_P) ? 1 : 0); 521 int wired_diff = ((npte & PTE_WIRED) ? 1 : 0) - ((opte & PTE_WIRED) ? 
1 : 0); 522 523 KASSERT((npte & (PTE_P | PTE_WIRED)) != PTE_WIRED); 524 KASSERT((opte & (PTE_P | PTE_WIRED)) != PTE_WIRED); 525 526 pmap_stats_update(pmap, resid_diff, wired_diff); 527 } 528 529 /* 530 * ptp_to_pmap: lookup pmap by ptp 531 */ 532 static inline struct pmap * 533 ptp_to_pmap(struct vm_page *ptp) 534 { 535 struct pmap *pmap; 536 537 if (ptp == NULL) { 538 return pmap_kernel(); 539 } 540 pmap = (struct pmap *)ptp->uobject; 541 KASSERT(pmap != NULL); 542 KASSERT(&pmap->pm_obj[0] == ptp->uobject); 543 return pmap; 544 } 545 546 static inline struct pv_pte * 547 pve_to_pvpte(struct pv_entry *pve) 548 { 549 550 if (pve == NULL) 551 return NULL; 552 KASSERT((void *)&pve->pve_pte == (void *)pve); 553 return &pve->pve_pte; 554 } 555 556 static inline struct pv_entry * 557 pvpte_to_pve(struct pv_pte *pvpte) 558 { 559 struct pv_entry *pve = (void *)pvpte; 560 561 KASSERT(pve_to_pvpte(pve) == pvpte); 562 return pve; 563 } 564 565 /* 566 * Return true if the pmap page has an embedded PV entry. 567 */ 568 static inline bool 569 pv_pte_embedded(struct pmap_page *pp) 570 { 571 572 KASSERT(mutex_owned(&pp->pp_lock)); 573 return (bool)((vaddr_t)pp->pp_pte.pte_ptp | pp->pp_pte.pte_va); 574 } 575 576 /* 577 * pv_pte_first, pv_pte_next: PV list iterator. 578 */ 579 static inline struct pv_pte * 580 pv_pte_first(struct pmap_page *pp) 581 { 582 583 KASSERT(mutex_owned(&pp->pp_lock)); 584 if (pv_pte_embedded(pp)) { 585 return &pp->pp_pte; 586 } 587 return pve_to_pvpte(LIST_FIRST(&pp->pp_pvlist)); 588 } 589 590 static inline struct pv_pte * 591 pv_pte_next(struct pmap_page *pp, struct pv_pte *pvpte) 592 { 593 594 KASSERT(mutex_owned(&pp->pp_lock)); 595 KASSERT(pvpte != NULL); 596 if (pvpte == &pp->pp_pte) { 597 return pve_to_pvpte(LIST_FIRST(&pp->pp_pvlist)); 598 } 599 return pve_to_pvpte(LIST_NEXT(pvpte_to_pve(pvpte), pve_list)); 600 } 601 602 static inline uint8_t 603 pmap_pte_to_pp_attrs(pt_entry_t pte) 604 { 605 uint8_t ret = 0; 606 if (pte & PTE_D) 607 ret |= PP_ATTRS_D; 608 if (pte & PTE_A) 609 ret |= PP_ATTRS_A; 610 if (pte & PTE_W) 611 ret |= PP_ATTRS_W; 612 return ret; 613 } 614 615 static inline pt_entry_t 616 pmap_pp_attrs_to_pte(uint8_t attrs) 617 { 618 pt_entry_t pte = 0; 619 if (attrs & PP_ATTRS_D) 620 pte |= PTE_D; 621 if (attrs & PP_ATTRS_A) 622 pte |= PTE_A; 623 if (attrs & PP_ATTRS_W) 624 pte |= PTE_W; 625 return pte; 626 } 627 628 /* 629 * pmap_is_curpmap: is this pmap the one currently loaded [in %cr3]? 630 * of course the kernel is always loaded 631 */ 632 bool 633 pmap_is_curpmap(struct pmap *pmap) 634 { 635 return ((pmap == pmap_kernel()) || (pmap == curcpu()->ci_pmap)); 636 } 637 638 inline void 639 pmap_reference(struct pmap *pmap) 640 { 641 642 atomic_inc_uint(&pmap->pm_obj[0].uo_refs); 643 } 644 645 /* 646 * rbtree: compare two nodes. 647 */ 648 static int 649 pmap_compare_nodes(void *context, const void *n1, const void *n2) 650 { 651 const struct pv_entry *pve1 = n1; 652 const struct pv_entry *pve2 = n2; 653 654 KASSERT(pve1->pve_pte.pte_ptp == pve2->pve_pte.pte_ptp); 655 656 if (pve1->pve_pte.pte_va < pve2->pve_pte.pte_va) { 657 return -1; 658 } 659 if (pve1->pve_pte.pte_va > pve2->pve_pte.pte_va) { 660 return 1; 661 } 662 return 0; 663 } 664 665 /* 666 * rbtree: compare a node and a key. 
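 *
 * usage sketch (hypothetical): with these ops a dynamic PV entry for a
 * given VA could be located through the generic helper, e.g.
 *
 *        pve = rb_tree_find_node(tree, (const void *)va);
 *
 * where "tree" is the tree the entry was inserted into (pmap_kernel_rb
 * for kernel mappings).  pmap_treelookup_pv() below open-codes the same
 * search because it is quite a bit faster than rb_tree_find_node().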
667 */ 668 static int 669 pmap_compare_key(void *context, const void *n, const void *k) 670 { 671 const struct pv_entry *pve = n; 672 const vaddr_t key = (vaddr_t)k; 673 674 if (pve->pve_pte.pte_va < key) { 675 return -1; 676 } 677 if (pve->pve_pte.pte_va > key) { 678 return 1; 679 } 680 return 0; 681 } 682 683 /* 684 * pmap_ptp_range_set: abuse ptp->uanon to record minimum VA of PTE 685 */ 686 static inline void 687 pmap_ptp_range_set(struct vm_page *ptp, vaddr_t va) 688 { 689 vaddr_t *min = (vaddr_t *)&ptp->uanon; 690 691 if (va < *min) { 692 *min = va; 693 } 694 } 695 696 /* 697 * pmap_ptp_range_clip: abuse ptp->uanon to clip range of PTEs to remove 698 */ 699 static inline void 700 pmap_ptp_range_clip(struct vm_page *ptp, vaddr_t *startva, pt_entry_t **pte) 701 { 702 vaddr_t sclip; 703 704 if (ptp == NULL) { 705 return; 706 } 707 708 sclip = (vaddr_t)ptp->uanon; 709 sclip = (*startva < sclip ? sclip : *startva); 710 *pte += (sclip - *startva) / PAGE_SIZE; 711 *startva = sclip; 712 } 713 714 /* 715 * pmap_map_ptes: map a pmap's PTEs into KVM and lock them in 716 * 717 * there are several pmaps involved. some or all of them might be same. 718 * 719 * - the pmap given by the first argument 720 * our caller wants to access this pmap's PTEs. 721 * 722 * - pmap_kernel() 723 * the kernel pmap. note that it only contains the kernel part 724 * of the address space which is shared by any pmap. ie. any 725 * pmap can be used instead of pmap_kernel() for our purpose. 726 * 727 * - ci->ci_pmap 728 * pmap currently loaded on the cpu. 729 * 730 * - vm_map_pmap(&curproc->p_vmspace->vm_map) 731 * current process' pmap. 732 * 733 * => caller must lock pmap first (if not the kernel pmap) 734 * => must be undone with pmap_unmap_ptes before returning 735 * => disables kernel preemption 736 */ 737 void 738 pmap_map_ptes(struct pmap *pmap, struct pmap **pmap2, pd_entry_t **ptepp, 739 pd_entry_t * const **pdeppp) 740 { 741 struct pmap *curpmap; 742 struct cpu_info *ci; 743 lwp_t *l; 744 745 kpreempt_disable(); 746 747 /* The kernel's pmap is always accessible. */ 748 if (pmap == pmap_kernel()) { 749 *pmap2 = NULL; 750 *ptepp = PTE_BASE; 751 *pdeppp = normal_pdes; 752 return; 753 } 754 755 KASSERT(mutex_owned(&pmap->pm_lock)); 756 757 l = curlwp; 758 ci = l->l_cpu; 759 curpmap = ci->ci_pmap; 760 if (pmap == curpmap) { 761 /* 762 * Already on the CPU: make it valid. This is very 763 * often the case during exit(), when we have switched 764 * to the kernel pmap in order to destroy a user pmap. 765 */ 766 if (__predict_false(ci->ci_tlbstate != TLBSTATE_VALID)) { 767 pmap_reactivate(pmap); 768 } 769 *pmap2 = NULL; 770 } else { 771 /* 772 * Toss current pmap from CPU and install new pmap, but keep 773 * a reference to the old one. Dropping the reference can 774 * can block as it needs to take locks, so defer that to 775 * pmap_unmap_ptes(). 776 */ 777 pmap_reference(pmap); 778 pmap_load1(l, pmap, curpmap); 779 *pmap2 = curpmap; 780 } 781 KASSERT(ci->ci_tlbstate == TLBSTATE_VALID); 782 #ifdef DIAGNOSTIC 783 pmap->pm_ncsw = lwp_pctr(); 784 #endif 785 *ptepp = PTE_BASE; 786 787 #if defined(XENPV) && defined(__x86_64__) 788 KASSERT(ci->ci_normal_pdes[PTP_LEVELS - 2] == L4_BASE); 789 ci->ci_normal_pdes[PTP_LEVELS - 2] = pmap->pm_pdir; 790 *pdeppp = ci->ci_normal_pdes; 791 #else 792 *pdeppp = normal_pdes; 793 #endif 794 } 795 796 /* 797 * pmap_unmap_ptes: unlock the PTE mapping of "pmap" 798 * 799 * => we cannot tolerate context switches while mapped in: assert this. 800 * => reenables kernel preemption. 
801 * => does not unlock pmap. 802 */ 803 void 804 pmap_unmap_ptes(struct pmap *pmap, struct pmap * pmap2) 805 { 806 struct cpu_info *ci; 807 struct pmap *mypmap; 808 struct lwp *l; 809 810 KASSERT(kpreempt_disabled()); 811 812 /* The kernel's pmap is always accessible. */ 813 if (pmap == pmap_kernel()) { 814 kpreempt_enable(); 815 return; 816 } 817 818 l = curlwp; 819 ci = l->l_cpu; 820 821 KASSERT(mutex_owned(&pmap->pm_lock)); 822 KASSERT(pmap->pm_ncsw == lwp_pctr()); 823 824 #if defined(XENPV) && defined(__x86_64__) 825 KASSERT(ci->ci_normal_pdes[PTP_LEVELS - 2] != L4_BASE); 826 ci->ci_normal_pdes[PTP_LEVELS - 2] = L4_BASE; 827 #endif 828 829 /* If not our own pmap, mark whatever's on the CPU now as lazy. */ 830 KASSERT(ci->ci_tlbstate == TLBSTATE_VALID); 831 mypmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map); 832 if (ci->ci_pmap == vm_map_pmap(&l->l_proc->p_vmspace->vm_map)) { 833 ci->ci_want_pmapload = 0; 834 } else { 835 ci->ci_want_pmapload = (mypmap != pmap_kernel()); 836 ci->ci_tlbstate = TLBSTATE_LAZY; 837 } 838 839 /* Now safe to re-enable preemption. */ 840 kpreempt_enable(); 841 842 /* Toss reference to other pmap taken earlier. */ 843 if (pmap2 != NULL) { 844 pmap_destroy(pmap2); 845 } 846 } 847 848 inline static void 849 pmap_exec_account(struct pmap *pm, vaddr_t va, pt_entry_t opte, pt_entry_t npte) 850 { 851 852 #if !defined(__x86_64__) 853 if (curproc == NULL || curproc->p_vmspace == NULL || 854 pm != vm_map_pmap(&curproc->p_vmspace->vm_map)) 855 return; 856 857 if ((opte ^ npte) & PTE_X) 858 pmap_update_pg(va); 859 860 /* 861 * Executability was removed on the last executable change. 862 * Reset the code segment to something conservative and 863 * let the trap handler deal with setting the right limit. 864 * We can't do that because of locking constraints on the vm map. 865 */ 866 867 if ((opte & PTE_X) && (npte & PTE_X) == 0 && va == pm->pm_hiexec) { 868 struct trapframe *tf = curlwp->l_md.md_regs; 869 870 tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL); 871 pm->pm_hiexec = I386_MAX_EXE_ADDR; 872 } 873 #endif /* !defined(__x86_64__) */ 874 } 875 876 #if !defined(__x86_64__) 877 /* 878 * Fixup the code segment to cover all potential executable mappings. 879 * returns 0 if no changes to the code segment were made. 880 */ 881 int 882 pmap_exec_fixup(struct vm_map *map, struct trapframe *tf, struct pcb *pcb) 883 { 884 struct vm_map_entry *ent; 885 struct pmap *pm = vm_map_pmap(map); 886 vaddr_t va = 0; 887 888 vm_map_lock_read(map); 889 for (ent = (&map->header)->next; ent != &map->header; ent = ent->next) { 890 /* 891 * This entry has greater va than the entries before. 892 * We need to make it point to the last page, not past it. 893 */ 894 if (ent->protection & VM_PROT_EXECUTE) 895 va = trunc_page(ent->end) - PAGE_SIZE; 896 } 897 vm_map_unlock_read(map); 898 if (va == pm->pm_hiexec && tf->tf_cs == GSEL(GUCODEBIG_SEL, SEL_UPL)) 899 return 0; 900 901 pm->pm_hiexec = va; 902 if (pm->pm_hiexec > I386_MAX_EXE_ADDR) { 903 tf->tf_cs = GSEL(GUCODEBIG_SEL, SEL_UPL); 904 } else { 905 tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL); 906 return 0; 907 } 908 return 1; 909 } 910 #endif /* !defined(__x86_64__) */ 911 912 void 913 pat_init(struct cpu_info *ci) 914 { 915 uint64_t pat; 916 917 if (!(ci->ci_feat_val[0] & CPUID_PAT)) 918 return; 919 920 /* We change WT to WC. Leave all other entries the default values. 
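	 * The power-on default PAT and the value programmed below
	 * (entries 1 and 5 switched from WT to WC) are, schematically:
	 *
	 *        entry:       0    1    2    3    4    5    6    7
	 *        default:     WB   WT   UC-  UC   WB   WT   UC-  UC
	 *        programmed:  WB   WC   UC-  UC   WB   WC   UC-  UC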
*/ 921 pat = PATENTRY(0, PAT_WB) | PATENTRY(1, PAT_WC) | 922 PATENTRY(2, PAT_UCMINUS) | PATENTRY(3, PAT_UC) | 923 PATENTRY(4, PAT_WB) | PATENTRY(5, PAT_WC) | 924 PATENTRY(6, PAT_UCMINUS) | PATENTRY(7, PAT_UC); 925 926 wrmsr(MSR_CR_PAT, pat); 927 cpu_pat_enabled = true; 928 } 929 930 static pt_entry_t 931 pmap_pat_flags(u_int flags) 932 { 933 u_int cacheflags = (flags & PMAP_CACHE_MASK); 934 935 if (!cpu_pat_enabled) { 936 switch (cacheflags) { 937 case PMAP_NOCACHE: 938 case PMAP_NOCACHE_OVR: 939 /* results in PGC_UCMINUS on cpus which have 940 * the cpuid PAT but PAT "disabled" 941 */ 942 return PTE_PCD; 943 default: 944 return 0; 945 } 946 } 947 948 switch (cacheflags) { 949 case PMAP_NOCACHE: 950 return PGC_UC; 951 case PMAP_WRITE_COMBINE: 952 return PGC_WC; 953 case PMAP_WRITE_BACK: 954 return PGC_WB; 955 case PMAP_NOCACHE_OVR: 956 return PGC_UCMINUS; 957 } 958 959 return 0; 960 } 961 962 /* 963 * p m a p k e n t e r f u n c t i o n s 964 * 965 * functions to quickly enter/remove pages from the kernel address 966 * space. pmap_kremove is exported to MI kernel. we make use of 967 * the recursive PTE mappings. 968 */ 969 970 /* 971 * pmap_kenter_pa: enter a kernel mapping without R/M (pv_entry) tracking 972 * 973 * => no need to lock anything, assume va is already allocated 974 * => should be faster than normal pmap enter function 975 */ 976 void 977 pmap_kenter_pa(vaddr_t va, paddr_t pa, vm_prot_t prot, u_int flags) 978 { 979 pt_entry_t *pte, opte, npte; 980 981 KASSERT(!(prot & ~VM_PROT_ALL)); 982 983 if (va < VM_MIN_KERNEL_ADDRESS) 984 pte = vtopte(va); 985 else 986 pte = kvtopte(va); 987 #if defined(XENPV) && defined(DOM0OPS) 988 if (pa < pmap_pa_start || pa >= pmap_pa_end) { 989 #ifdef DEBUG 990 printf_nolog("%s: pa %#" PRIxPADDR " for va %#" PRIxVADDR 991 " outside range\n", __func__, pa, va); 992 #endif /* DEBUG */ 993 npte = pa; 994 } else 995 #endif /* XENPV && DOM0OPS */ 996 npte = pmap_pa2pte(pa); 997 npte |= protection_codes[prot] | PTE_P | pmap_pg_g; 998 npte |= pmap_pat_flags(flags); 999 opte = pmap_pte_testset(pte, npte); /* zap! */ 1000 1001 /* 1002 * XXX: make sure we are not dealing with a large page, since the only 1003 * large pages created are for the kernel image, and they should never 1004 * be kentered. 1005 */ 1006 KASSERTMSG(!(opte & PTE_PS), "PTE_PS va=%#"PRIxVADDR, va); 1007 1008 if ((opte & (PTE_P | PTE_A)) == (PTE_P | PTE_A)) { 1009 /* This should not happen. */ 1010 printf_nolog("%s: mapping already present\n", __func__); 1011 kpreempt_disable(); 1012 pmap_tlb_shootdown(pmap_kernel(), va, opte, TLBSHOOT_KENTER); 1013 kpreempt_enable(); 1014 } 1015 } 1016 1017 __strict_weak_alias(pmap_kenter_ma, pmap_kenter_pa); 1018 1019 #if defined(__x86_64__) 1020 /* 1021 * Change protection for a virtual address. Local for a CPU only, don't 1022 * care about TLB shootdowns. 
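 *
 * minimal usage sketch (hypothetical caller, shown only to illustrate
 * the calling convention):
 *
 *        kpreempt_disable();
 *        pmap_changeprot_local(va, VM_PROT_READ | VM_PROT_WRITE);
 *        ... modify the page ...
 *        pmap_changeprot_local(va, VM_PROT_READ);
 *        kpreempt_enable();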
1023 * 1024 * => must be called with preemption disabled 1025 */ 1026 void 1027 pmap_changeprot_local(vaddr_t va, vm_prot_t prot) 1028 { 1029 pt_entry_t *pte, opte, npte; 1030 1031 KASSERT(kpreempt_disabled()); 1032 1033 if (va < VM_MIN_KERNEL_ADDRESS) 1034 pte = vtopte(va); 1035 else 1036 pte = kvtopte(va); 1037 1038 npte = opte = *pte; 1039 1040 if ((prot & VM_PROT_WRITE) != 0) 1041 npte |= PTE_W; 1042 else 1043 npte &= ~(PTE_W|PTE_D); 1044 1045 if (opte != npte) { 1046 pmap_pte_set(pte, npte); 1047 pmap_pte_flush(); 1048 invlpg(va); 1049 } 1050 } 1051 #endif /* defined(__x86_64__) */ 1052 1053 /* 1054 * pmap_kremove: remove a kernel mapping(s) without R/M (pv_entry) tracking 1055 * 1056 * => no need to lock anything 1057 * => caller must dispose of any vm_page mapped in the va range 1058 * => note: not an inline function 1059 * => we assume the va is page aligned and the len is a multiple of PAGE_SIZE 1060 * => we assume kernel only unmaps valid addresses and thus don't bother 1061 * checking the valid bit before doing TLB flushing 1062 * => must be followed by call to pmap_update() before reuse of page 1063 */ 1064 static void 1065 pmap_kremove1(vaddr_t sva, vsize_t len, bool localonly) 1066 { 1067 pt_entry_t *pte, opte; 1068 vaddr_t va, eva; 1069 1070 eva = sva + len; 1071 1072 kpreempt_disable(); 1073 for (va = sva; va < eva; va += PAGE_SIZE) { 1074 pte = kvtopte(va); 1075 opte = pmap_pte_testset(pte, 0); /* zap! */ 1076 if ((opte & (PTE_P | PTE_A)) == (PTE_P | PTE_A) && !localonly) { 1077 pmap_tlb_shootdown(pmap_kernel(), va, opte, 1078 TLBSHOOT_KREMOVE); 1079 } 1080 KASSERTMSG((opte & PTE_PS) == 0, 1081 "va %#" PRIxVADDR " is a large page", va); 1082 KASSERTMSG((opte & PTE_PVLIST) == 0, 1083 "va %#" PRIxVADDR " is a pv tracked page", va); 1084 } 1085 if (localonly) { 1086 tlbflushg(); 1087 } 1088 kpreempt_enable(); 1089 } 1090 1091 void 1092 pmap_kremove(vaddr_t sva, vsize_t len) 1093 { 1094 1095 pmap_kremove1(sva, len, false); 1096 } 1097 1098 /* 1099 * pmap_kremove_local: like pmap_kremove(), but only worry about 1100 * TLB invalidations on the current CPU. this is only intended 1101 * for use while writing kernel crash dumps, either after panic 1102 * or via reboot -d. 1103 */ 1104 void 1105 pmap_kremove_local(vaddr_t sva, vsize_t len) 1106 { 1107 1108 pmap_kremove1(sva, len, true); 1109 } 1110 1111 /* 1112 * p m a p i n i t f u n c t i o n s 1113 * 1114 * pmap_bootstrap and pmap_init are called during system startup 1115 * to init the pmap module. pmap_bootstrap() does a low level 1116 * init just to get things rolling. pmap_init() finishes the job. 1117 */ 1118 1119 /* 1120 * pmap_bootstrap_valloc: allocate a virtual address in the bootstrap area. 1121 * This function is to be used before any VM system has been set up. 1122 * 1123 * The va is taken from virtual_avail. 1124 */ 1125 static vaddr_t 1126 pmap_bootstrap_valloc(size_t npages) 1127 { 1128 vaddr_t va = virtual_avail; 1129 virtual_avail += npages * PAGE_SIZE; 1130 return va; 1131 } 1132 1133 /* 1134 * pmap_bootstrap_palloc: allocate a physical address in the bootstrap area. 1135 * This function is to be used before any VM system has been set up. 1136 * 1137 * The pa is taken from avail_start. 1138 */ 1139 static paddr_t 1140 pmap_bootstrap_palloc(size_t npages) 1141 { 1142 paddr_t pa = avail_start; 1143 avail_start += npages * PAGE_SIZE; 1144 return pa; 1145 } 1146 1147 /* 1148 * pmap_bootstrap: get the system in a state where it can run with VM properly 1149 * enabled (called before main()). 
The VM system is fully init'd later. 1150 * 1151 * => on i386, locore.S has already enabled the MMU by allocating a PDP for the 1152 * kernel, and nkpde PTP's for the kernel. 1153 * => kva_start is the first free virtual address in kernel space. 1154 */ 1155 void 1156 pmap_bootstrap(vaddr_t kva_start) 1157 { 1158 struct pmap *kpm; 1159 int i; 1160 vaddr_t kva; 1161 1162 pmap_pg_nx = (cpu_feature[2] & CPUID_NOX ? PTE_NX : 0); 1163 1164 /* 1165 * Set up our local static global vars that keep track of the usage of 1166 * KVM before kernel_map is set up. 1167 */ 1168 virtual_avail = kva_start; /* first free KVA */ 1169 virtual_end = VM_MAX_KERNEL_ADDRESS; /* last KVA */ 1170 1171 /* 1172 * Set up protection_codes: we need to be able to convert from a MI 1173 * protection code (some combo of VM_PROT...) to something we can jam 1174 * into a x86 PTE. 1175 */ 1176 protection_codes[VM_PROT_NONE] = pmap_pg_nx; 1177 protection_codes[VM_PROT_EXECUTE] = PTE_X; 1178 protection_codes[VM_PROT_READ] = pmap_pg_nx; 1179 protection_codes[VM_PROT_READ|VM_PROT_EXECUTE] = PTE_X; 1180 protection_codes[VM_PROT_WRITE] = PTE_W | pmap_pg_nx; 1181 protection_codes[VM_PROT_WRITE|VM_PROT_EXECUTE] = PTE_W | PTE_X; 1182 protection_codes[VM_PROT_WRITE|VM_PROT_READ] = PTE_W | pmap_pg_nx; 1183 protection_codes[VM_PROT_ALL] = PTE_W | PTE_X; 1184 1185 /* 1186 * Now we init the kernel's pmap. 1187 * 1188 * The kernel pmap's pm_obj is not used for much. However, in user pmaps 1189 * the pm_obj contains the list of active PTPs. 1190 */ 1191 kpm = pmap_kernel(); 1192 mutex_init(&kpm->pm_lock, MUTEX_DEFAULT, IPL_NONE); 1193 rw_init(&kpm->pm_dummy_lock); 1194 for (i = 0; i < PTP_LEVELS - 1; i++) { 1195 uvm_obj_init(&kpm->pm_obj[i], &pmap_pager, false, 1); 1196 uvm_obj_setlock(&kpm->pm_obj[i], &kpm->pm_dummy_lock); 1197 kpm->pm_ptphint[i] = NULL; 1198 } 1199 memset(&kpm->pm_list, 0, sizeof(kpm->pm_list)); /* pm_list not used */ 1200 1201 kpm->pm_pdir = (pd_entry_t *)bootspace.pdir; 1202 for (i = 0; i < PDP_SIZE; i++) 1203 kpm->pm_pdirpa[i] = PDPpaddr + PAGE_SIZE * i; 1204 1205 kpm->pm_stats.wired_count = kpm->pm_stats.resident_count = 1206 x86_btop(kva_start - VM_MIN_KERNEL_ADDRESS); 1207 1208 kcpuset_create(&kpm->pm_cpus, true); 1209 kcpuset_create(&kpm->pm_kernel_cpus, true); 1210 1211 kpm->pm_ldt = NULL; 1212 kpm->pm_ldt_sel = GSYSSEL(GLDT_SEL, SEL_KPL); 1213 1214 /* 1215 * the above is just a rough estimate and not critical to the proper 1216 * operation of the system. 1217 */ 1218 1219 #if !defined(XENPV) 1220 /* 1221 * Begin to enable global TLB entries if they are supported: add PTE_G 1222 * attribute to already mapped kernel pages. Do that only if SVS is 1223 * disabled. 1224 * 1225 * The G bit has no effect until the CR4_PGE bit is set in CR4, which 1226 * happens later in cpu_init(). 1227 */ 1228 #ifdef SVS 1229 if (!svs_enabled && (cpu_feature[0] & CPUID_PGE)) { 1230 #else 1231 if (cpu_feature[0] & CPUID_PGE) { 1232 #endif 1233 pmap_pg_g = PTE_G; 1234 pmap_remap_global(); 1235 } 1236 #endif 1237 1238 #ifndef XENPV 1239 /* 1240 * Enable large pages if they are supported. 1241 */ 1242 if (cpu_feature[0] & CPUID_PSE) { 1243 lcr4(rcr4() | CR4_PSE); /* enable hardware (via %cr4) */ 1244 pmap_largepages = 1; /* enable software */ 1245 1246 /* 1247 * The TLB must be flushed after enabling large pages on Pentium 1248 * CPUs, according to section 3.6.2.2 of "Intel Architecture 1249 * Software Developer's Manual, Volume 3: System Programming". 1250 */ 1251 tlbflushg(); 1252 1253 /* Remap the kernel. 
*/ 1254 pmap_remap_largepages(); 1255 } 1256 pmap_init_lapic(); 1257 #endif /* !XENPV */ 1258 1259 #ifdef __HAVE_PCPU_AREA 1260 pmap_init_pcpu(); 1261 #endif 1262 1263 #ifdef __HAVE_DIRECT_MAP 1264 pmap_init_directmap(kpm); 1265 #else 1266 pmap_vpage_cpualloc(&cpu_info_primary); 1267 1268 if (VM_MIN_KERNEL_ADDRESS == KERNBASE) { /* i386 */ 1269 early_zerop = (void *)cpu_info_primary.vpage[VPAGE_ZER]; 1270 early_zero_pte = cpu_info_primary.vpage_pte[VPAGE_ZER]; 1271 } else { /* amd64 */ 1272 /* 1273 * zero_pte is stuck at the end of mapped space for the kernel 1274 * image (disjunct from kva space). This is done so that it 1275 * can safely be used in pmap_growkernel (pmap_get_physpage), 1276 * when it's called for the first time. 1277 * XXXfvdl fix this for MULTIPROCESSOR later. 1278 */ 1279 #ifdef XENPV 1280 /* early_zerop initialized in xen_locore() */ 1281 #else 1282 early_zerop = (void *)bootspace.spareva; 1283 #endif 1284 early_zero_pte = PTE_BASE + pl1_i((vaddr_t)early_zerop); 1285 } 1286 #endif 1287 1288 #if defined(XENPV) && defined(__x86_64__) 1289 extern vaddr_t xen_dummy_page; 1290 paddr_t xen_dummy_user_pgd; 1291 1292 /* 1293 * We want a dummy page directory for Xen: when deactivating a pmap, 1294 * Xen will still consider it active. So we set user PGD to this one 1295 * to lift all protection on the now inactive page tables set. 1296 */ 1297 xen_dummy_user_pgd = xen_dummy_page - KERNBASE; 1298 1299 /* Zero fill it, the less checks in Xen it requires the better */ 1300 memset((void *)(xen_dummy_user_pgd + KERNBASE), 0, PAGE_SIZE); 1301 /* Mark read-only */ 1302 HYPERVISOR_update_va_mapping(xen_dummy_user_pgd + KERNBASE, 1303 pmap_pa2pte(xen_dummy_user_pgd) | PTE_P | pmap_pg_nx, 1304 UVMF_INVLPG); 1305 /* Pin as L4 */ 1306 xpq_queue_pin_l4_table(xpmap_ptom_masked(xen_dummy_user_pgd)); 1307 #endif 1308 1309 /* 1310 * Allocate space for the IDT, GDT and LDT. 1311 */ 1312 idt_vaddr = pmap_bootstrap_valloc(1); 1313 idt_paddr = pmap_bootstrap_palloc(1); 1314 1315 gdt_vaddr = pmap_bootstrap_valloc(1); 1316 gdt_paddr = pmap_bootstrap_palloc(1); 1317 1318 #ifdef __HAVE_PCPU_AREA 1319 ldt_vaddr = (vaddr_t)&pcpuarea->ldt; 1320 #else 1321 ldt_vaddr = pmap_bootstrap_valloc(1); 1322 #endif 1323 ldt_paddr = pmap_bootstrap_palloc(1); 1324 1325 #if !defined(__x86_64__) 1326 /* pentium f00f bug stuff */ 1327 pentium_idt_vaddr = pmap_bootstrap_valloc(1); 1328 #endif 1329 1330 #if defined(XENPVHVM) 1331 /* XXX: move to hypervisor.c with appropriate API adjustments */ 1332 extern paddr_t HYPERVISOR_shared_info_pa; 1333 extern volatile struct xencons_interface *xencons_interface; /* XXX */ 1334 extern struct xenstore_domain_interface *xenstore_interface; /* XXX */ 1335 1336 if (vm_guest != VM_GUEST_XENPVH) { 1337 HYPERVISOR_shared_info = (void *) pmap_bootstrap_valloc(1); 1338 HYPERVISOR_shared_info_pa = pmap_bootstrap_palloc(1); 1339 } 1340 xencons_interface = (void *) pmap_bootstrap_valloc(1); 1341 xenstore_interface = (void *) pmap_bootstrap_valloc(1); 1342 #endif 1343 /* 1344 * Now we reserve some VM for mapping pages when doing a crash dump. 1345 */ 1346 virtual_avail = reserve_dumppages(virtual_avail); 1347 1348 /* 1349 * Init the global lock and global list. 1350 */ 1351 mutex_init(&pmaps_lock, MUTEX_DEFAULT, IPL_NONE); 1352 LIST_INIT(&pmaps); 1353 1354 /* 1355 * Ensure the TLB is sync'd with reality by flushing it... 1356 */ 1357 tlbflushg(); 1358 1359 /* 1360 * Calculate pmap_maxkvaddr from nkptp[]. 
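	 * i.e. pmap_maxkvaddr = VM_MIN_KERNEL_ADDRESS +
	 *     nkptp[1] * nbpd[1] + ... + nkptp[PTP_LEVELS-1] * nbpd[PTP_LEVELS-1],
	 * the KVA currently covered by the kernel's page tables.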
1361 */ 1362 kva = VM_MIN_KERNEL_ADDRESS; 1363 for (i = PTP_LEVELS - 1; i >= 1; i--) { 1364 kva += nkptp[i] * nbpd[i]; 1365 } 1366 pmap_maxkvaddr = kva; 1367 } 1368 1369 #ifndef XENPV 1370 static void 1371 pmap_init_lapic(void) 1372 { 1373 /* 1374 * On CPUs that have no LAPIC, local_apic_va is never kentered. But our 1375 * x86 implementation relies a lot on this address to be valid; so just 1376 * allocate a fake physical page that will be kentered into 1377 * local_apic_va by machdep. 1378 * 1379 * If the LAPIC is present, the va will be remapped somewhere else 1380 * later in lapic_map. 1381 */ 1382 local_apic_va = pmap_bootstrap_valloc(1); 1383 local_apic_pa = pmap_bootstrap_palloc(1); 1384 } 1385 #endif 1386 1387 #ifdef __x86_64__ 1388 static size_t 1389 pmap_pagetree_nentries_range(vaddr_t startva, vaddr_t endva, size_t pgsz) 1390 { 1391 size_t npages; 1392 npages = (roundup(endva, pgsz) / pgsz) - 1393 (rounddown(startva, pgsz) / pgsz); 1394 return npages; 1395 } 1396 #endif 1397 1398 #if defined(__HAVE_DIRECT_MAP) || defined(KASAN) || defined(KMSAN) 1399 static inline void 1400 slotspace_copy(int type, pd_entry_t *dst, pd_entry_t *src) 1401 { 1402 size_t sslot = slotspace.area[type].sslot; 1403 size_t nslot = slotspace.area[type].nslot; 1404 1405 memcpy(&dst[sslot], &src[sslot], nslot * sizeof(pd_entry_t)); 1406 } 1407 #endif 1408 1409 #ifdef __x86_64__ 1410 /* 1411 * Randomize the location of an area. We count the holes in the VM space. We 1412 * randomly select one hole, and then randomly select an area within that hole. 1413 * Finally we update the associated entry in the slotspace structure. 1414 */ 1415 vaddr_t 1416 slotspace_rand(int type, size_t sz, size_t align, size_t randhole, 1417 vaddr_t randva) 1418 { 1419 struct { 1420 int start; 1421 int end; 1422 } holes[SLSPACE_NAREAS+1]; 1423 size_t i, nholes, hole; 1424 size_t startsl, endsl, nslots, winsize; 1425 vaddr_t startva, va; 1426 1427 sz = roundup(sz, align); 1428 1429 /* 1430 * Take one more slot with +NBPD_L4, because we may end up choosing 1431 * an area that crosses slots: 1432 * +------+------+------+ 1433 * | Slot | Slot | Slot | 1434 * +------+------+------+ 1435 * [Chosen Area] 1436 * And in that case we must take into account the additional slot 1437 * consumed. 1438 */ 1439 nslots = roundup(sz+NBPD_L4, NBPD_L4) / NBPD_L4; 1440 1441 /* Get the holes. */ 1442 nholes = 0; 1443 size_t curslot = 0 + 256; /* end of SLAREA_USER */ 1444 while (1) { 1445 /* 1446 * Find the first occupied slot after the current one. 1447 * The area between the two is a hole. 1448 */ 1449 size_t minsslot = 512; 1450 size_t minnslot = 0; 1451 for (i = 0; i < SLSPACE_NAREAS; i++) { 1452 if (!slotspace.area[i].active) 1453 continue; 1454 if (slotspace.area[i].sslot >= curslot && 1455 slotspace.area[i].sslot < minsslot) { 1456 minsslot = slotspace.area[i].sslot; 1457 minnslot = slotspace.area[i].nslot; 1458 } 1459 } 1460 1461 /* No hole anymore, stop here. */ 1462 if (minsslot == 512) { 1463 break; 1464 } 1465 1466 /* Register the hole. */ 1467 if (minsslot - curslot >= nslots) { 1468 holes[nholes].start = curslot; 1469 holes[nholes].end = minsslot; 1470 nholes++; 1471 } 1472 1473 /* Skip that hole, and iterate again. */ 1474 curslot = minsslot + minnslot; 1475 } 1476 1477 if (nholes == 0) { 1478 panic("%s: impossible", __func__); 1479 } 1480 1481 /* Select a hole. 
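	 * Worked example with made-up numbers: with nholes = 3 and
	 * randhole = 14 we pick hole 14 % 3 = 2.  If that hole spans
	 * slots [300, 340) and sz is 2 * NBPD_L4, then
	 * winsize = 40 * NBPD_L4 - sz, randva is reduced modulo winsize,
	 * rounded down to "align", and added to the hole's start VA.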
*/ 1482 hole = randhole; 1483 #ifdef NO_X86_ASLR 1484 hole = 0; 1485 #endif 1486 hole %= nholes; 1487 startsl = holes[hole].start; 1488 endsl = holes[hole].end; 1489 startva = VA_SIGN_NEG(startsl * NBPD_L4); 1490 1491 /* Select an area within the hole. */ 1492 va = randva; 1493 #ifdef NO_X86_ASLR 1494 va = 0; 1495 #endif 1496 winsize = ((endsl - startsl) * NBPD_L4) - sz; 1497 va %= winsize; 1498 va = rounddown(va, align); 1499 va += startva; 1500 1501 /* Update the entry. */ 1502 slotspace.area[type].sslot = pl4_i(va); 1503 slotspace.area[type].nslot = 1504 pmap_pagetree_nentries_range(va, va+sz, NBPD_L4); 1505 slotspace.area[type].active = true; 1506 1507 return va; 1508 } 1509 #endif 1510 1511 #ifdef __HAVE_PCPU_AREA 1512 static void 1513 pmap_init_pcpu(void) 1514 { 1515 const vaddr_t startva = PMAP_PCPU_BASE; 1516 size_t nL4e, nL3e, nL2e, nL1e; 1517 size_t L4e_idx, L3e_idx, L2e_idx, L1e_idx __diagused; 1518 paddr_t pa; 1519 vaddr_t endva; 1520 vaddr_t tmpva; 1521 pt_entry_t *pte; 1522 size_t size; 1523 int i; 1524 1525 const pd_entry_t pteflags = PTE_P | PTE_W | pmap_pg_nx; 1526 1527 size = sizeof(struct pcpu_area); 1528 1529 endva = startva + size; 1530 1531 /* We will use this temporary va. */ 1532 tmpva = bootspace.spareva; 1533 pte = PTE_BASE + pl1_i(tmpva); 1534 1535 /* Build L4 */ 1536 L4e_idx = pl4_i(startva); 1537 nL4e = pmap_pagetree_nentries_range(startva, endva, NBPD_L4); 1538 KASSERT(nL4e == 1); 1539 for (i = 0; i < nL4e; i++) { 1540 KASSERT(L4_BASE[L4e_idx+i] == 0); 1541 1542 pa = pmap_bootstrap_palloc(1); 1543 *pte = (pa & PTE_FRAME) | pteflags; 1544 pmap_update_pg(tmpva); 1545 memset((void *)tmpva, 0, PAGE_SIZE); 1546 1547 L4_BASE[L4e_idx+i] = pa | pteflags | PTE_A; 1548 } 1549 1550 /* Build L3 */ 1551 L3e_idx = pl3_i(startva); 1552 nL3e = pmap_pagetree_nentries_range(startva, endva, NBPD_L3); 1553 for (i = 0; i < nL3e; i++) { 1554 KASSERT(L3_BASE[L3e_idx+i] == 0); 1555 1556 pa = pmap_bootstrap_palloc(1); 1557 *pte = (pa & PTE_FRAME) | pteflags; 1558 pmap_update_pg(tmpva); 1559 memset((void *)tmpva, 0, PAGE_SIZE); 1560 1561 L3_BASE[L3e_idx+i] = pa | pteflags | PTE_A; 1562 } 1563 1564 /* Build L2 */ 1565 L2e_idx = pl2_i(startva); 1566 nL2e = pmap_pagetree_nentries_range(startva, endva, NBPD_L2); 1567 for (i = 0; i < nL2e; i++) { 1568 1569 KASSERT(L2_BASE[L2e_idx+i] == 0); 1570 1571 pa = pmap_bootstrap_palloc(1); 1572 *pte = (pa & PTE_FRAME) | pteflags; 1573 pmap_update_pg(tmpva); 1574 memset((void *)tmpva, 0, PAGE_SIZE); 1575 1576 L2_BASE[L2e_idx+i] = pa | pteflags | PTE_A; 1577 } 1578 1579 /* Build L1 */ 1580 L1e_idx = pl1_i(startva); 1581 nL1e = pmap_pagetree_nentries_range(startva, endva, NBPD_L1); 1582 for (i = 0; i < nL1e; i++) { 1583 /* 1584 * Nothing to do, the PTEs will be entered via 1585 * pmap_kenter_pa. 1586 */ 1587 KASSERT(L1_BASE[L1e_idx+i] == 0); 1588 } 1589 1590 *pte = 0; 1591 pmap_update_pg(tmpva); 1592 1593 pcpuarea = (struct pcpu_area *)startva; 1594 1595 tlbflush(); 1596 } 1597 #endif 1598 1599 #ifdef __HAVE_DIRECT_MAP 1600 /* 1601 * Create the amd64 direct map. Called only once at boot time. We map all of 1602 * the physical memory contiguously using 2MB large pages, with RW permissions. 1603 * However there is a hole: the kernel is mapped with RO permissions. 
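 *
 * Once the direct map is established, converting between a physical
 * address and its direct-map virtual address is plain arithmetic
 * (sketch; the pmap headers wrap this in the usual direct-map macros):
 *
 *        va = pmap_direct_base + pa;
 *        pa = va - pmap_direct_base;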
1604 */ 1605 static void 1606 pmap_init_directmap(struct pmap *kpm) 1607 { 1608 extern phys_ram_seg_t mem_clusters[]; 1609 extern int mem_cluster_cnt; 1610 1611 vaddr_t startva; 1612 size_t nL4e, nL3e, nL2e; 1613 size_t L4e_idx, L3e_idx, L2e_idx; 1614 size_t spahole, epahole; 1615 paddr_t lastpa, pa; 1616 vaddr_t endva; 1617 vaddr_t tmpva; 1618 pt_entry_t *pte; 1619 phys_ram_seg_t *mc; 1620 int i; 1621 size_t randhole; 1622 vaddr_t randva; 1623 1624 const pd_entry_t pteflags = PTE_P | PTE_W | pmap_pg_nx; 1625 const pd_entry_t holepteflags = PTE_P | pmap_pg_nx; 1626 1627 CTASSERT(NL4_SLOT_DIRECT * NBPD_L4 == MAXPHYSMEM); 1628 1629 spahole = roundup(bootspace.head.pa, NBPD_L2); 1630 epahole = rounddown(bootspace.boot.pa, NBPD_L2); 1631 1632 /* Get the last physical address available */ 1633 lastpa = 0; 1634 for (i = 0; i < mem_cluster_cnt; i++) { 1635 mc = &mem_clusters[i]; 1636 lastpa = MAX(lastpa, mc->start + mc->size); 1637 } 1638 1639 /* 1640 * x86_add_cluster should have truncated the memory to MAXPHYSMEM. 1641 */ 1642 if (lastpa > MAXPHYSMEM) { 1643 panic("pmap_init_directmap: lastpa incorrect"); 1644 } 1645 1646 entropy_extract(&randhole, sizeof randhole, 0); 1647 entropy_extract(&randva, sizeof randva, 0); 1648 startva = slotspace_rand(SLAREA_DMAP, lastpa, NBPD_L2, 1649 randhole, randva); 1650 endva = startva + lastpa; 1651 1652 /* We will use this temporary va. */ 1653 tmpva = bootspace.spareva; 1654 pte = PTE_BASE + pl1_i(tmpva); 1655 1656 /* Build L4 */ 1657 L4e_idx = pl4_i(startva); 1658 nL4e = pmap_pagetree_nentries_range(startva, endva, NBPD_L4); 1659 KASSERT(nL4e <= NL4_SLOT_DIRECT); 1660 for (i = 0; i < nL4e; i++) { 1661 KASSERT(L4_BASE[L4e_idx+i] == 0); 1662 1663 pa = pmap_bootstrap_palloc(1); 1664 *pte = (pa & PTE_FRAME) | pteflags; 1665 pmap_update_pg(tmpva); 1666 memset((void *)tmpva, 0, PAGE_SIZE); 1667 1668 L4_BASE[L4e_idx+i] = pa | pteflags | PTE_A; 1669 } 1670 1671 /* Build L3 */ 1672 L3e_idx = pl3_i(startva); 1673 nL3e = pmap_pagetree_nentries_range(startva, endva, NBPD_L3); 1674 for (i = 0; i < nL3e; i++) { 1675 KASSERT(L3_BASE[L3e_idx+i] == 0); 1676 1677 pa = pmap_bootstrap_palloc(1); 1678 *pte = (pa & PTE_FRAME) | pteflags; 1679 pmap_update_pg(tmpva); 1680 memset((void *)tmpva, 0, PAGE_SIZE); 1681 1682 L3_BASE[L3e_idx+i] = pa | pteflags | PTE_A; 1683 } 1684 1685 /* Build L2 */ 1686 L2e_idx = pl2_i(startva); 1687 nL2e = pmap_pagetree_nentries_range(startva, endva, NBPD_L2); 1688 for (i = 0; i < nL2e; i++) { 1689 KASSERT(L2_BASE[L2e_idx+i] == 0); 1690 1691 pa = (paddr_t)(i * NBPD_L2); 1692 1693 if (spahole <= pa && pa < epahole) { 1694 L2_BASE[L2e_idx+i] = pa | holepteflags | PTE_A | 1695 PTE_PS | pmap_pg_g; 1696 } else { 1697 L2_BASE[L2e_idx+i] = pa | pteflags | PTE_A | 1698 PTE_PS | pmap_pg_g; 1699 } 1700 } 1701 1702 *pte = 0; 1703 pmap_update_pg(tmpva); 1704 1705 pmap_direct_base = startva; 1706 pmap_direct_end = endva; 1707 1708 tlbflush(); 1709 } 1710 #endif /* __HAVE_DIRECT_MAP */ 1711 1712 #if !defined(XENPV) 1713 /* 1714 * Remap all of the virtual pages created so far with the PTE_G bit. 
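 *
 * Global mappings survive an ordinary CR3 reload once CR4_PGE is set,
 * so marking the kernel this way avoids refilling those TLB entries on
 * every pmap switch.  Illustrative flush behaviour:
 *
 *        lcr3(rcr3());        flushes only non-global TLB entries
 *        tlbflushg();         toggles CR4_PGE so PTE_G entries go too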
1715 */ 1716 static void 1717 pmap_remap_global(void) 1718 { 1719 vaddr_t kva, kva_end; 1720 unsigned long p1i; 1721 size_t i; 1722 1723 /* head */ 1724 kva = bootspace.head.va; 1725 kva_end = kva + bootspace.head.sz; 1726 for ( ; kva < kva_end; kva += PAGE_SIZE) { 1727 p1i = pl1_i(kva); 1728 if (pmap_valid_entry(PTE_BASE[p1i])) 1729 PTE_BASE[p1i] |= pmap_pg_g; 1730 } 1731 1732 /* kernel segments */ 1733 for (i = 0; i < BTSPACE_NSEGS; i++) { 1734 if (bootspace.segs[i].type == BTSEG_NONE) { 1735 continue; 1736 } 1737 kva = bootspace.segs[i].va; 1738 kva_end = kva + bootspace.segs[i].sz; 1739 for ( ; kva < kva_end; kva += PAGE_SIZE) { 1740 p1i = pl1_i(kva); 1741 if (pmap_valid_entry(PTE_BASE[p1i])) 1742 PTE_BASE[p1i] |= pmap_pg_g; 1743 } 1744 } 1745 1746 /* boot space */ 1747 kva = bootspace.boot.va; 1748 kva_end = kva + bootspace.boot.sz; 1749 for ( ; kva < kva_end; kva += PAGE_SIZE) { 1750 p1i = pl1_i(kva); 1751 if (pmap_valid_entry(PTE_BASE[p1i])) 1752 PTE_BASE[p1i] |= pmap_pg_g; 1753 } 1754 } 1755 #endif 1756 1757 #ifndef XENPV 1758 /* 1759 * Remap several kernel segments with large pages. We cover as many pages as we 1760 * can. Called only once at boot time, if the CPU supports large pages. 1761 */ 1762 static void 1763 pmap_remap_largepages(void) 1764 { 1765 pd_entry_t *pde; 1766 vaddr_t kva, kva_end; 1767 paddr_t pa; 1768 size_t i; 1769 1770 /* Remap the kernel text using large pages. */ 1771 for (i = 0; i < BTSPACE_NSEGS; i++) { 1772 if (bootspace.segs[i].type != BTSEG_TEXT) { 1773 continue; 1774 } 1775 kva = roundup(bootspace.segs[i].va, NBPD_L2); 1776 if (kva < bootspace.segs[i].va) { 1777 continue; 1778 } 1779 kva_end = rounddown(bootspace.segs[i].va + 1780 bootspace.segs[i].sz, NBPD_L2); 1781 pa = roundup(bootspace.segs[i].pa, NBPD_L2); 1782 for (/* */; kva < kva_end; kva += NBPD_L2, pa += NBPD_L2) { 1783 pde = &L2_BASE[pl2_i(kva)]; 1784 *pde = pa | pmap_pg_g | PTE_PS | PTE_P; 1785 tlbflushg(); 1786 } 1787 } 1788 1789 /* Remap the kernel rodata using large pages. */ 1790 for (i = 0; i < BTSPACE_NSEGS; i++) { 1791 if (bootspace.segs[i].type != BTSEG_RODATA) { 1792 continue; 1793 } 1794 kva = roundup(bootspace.segs[i].va, NBPD_L2); 1795 if (kva < bootspace.segs[i].va) { 1796 continue; 1797 } 1798 kva_end = rounddown(bootspace.segs[i].va + 1799 bootspace.segs[i].sz, NBPD_L2); 1800 pa = roundup(bootspace.segs[i].pa, NBPD_L2); 1801 for (/* */; kva < kva_end; kva += NBPD_L2, pa += NBPD_L2) { 1802 pde = &L2_BASE[pl2_i(kva)]; 1803 *pde = pa | pmap_pg_g | PTE_PS | pmap_pg_nx | PTE_P; 1804 tlbflushg(); 1805 } 1806 } 1807 1808 /* Remap the kernel data+bss using large pages. */ 1809 for (i = 0; i < BTSPACE_NSEGS; i++) { 1810 if (bootspace.segs[i].type != BTSEG_DATA) { 1811 continue; 1812 } 1813 kva = roundup(bootspace.segs[i].va, NBPD_L2); 1814 if (kva < bootspace.segs[i].va) { 1815 continue; 1816 } 1817 kva_end = rounddown(bootspace.segs[i].va + 1818 bootspace.segs[i].sz, NBPD_L2); 1819 pa = roundup(bootspace.segs[i].pa, NBPD_L2); 1820 for (/* */; kva < kva_end; kva += NBPD_L2, pa += NBPD_L2) { 1821 pde = &L2_BASE[pl2_i(kva)]; 1822 *pde = pa | pmap_pg_g | PTE_PS | pmap_pg_nx | PTE_W | PTE_P; 1823 tlbflushg(); 1824 } 1825 } 1826 } 1827 #endif /* !XENPV */ 1828 1829 /* 1830 * pmap_init: called from uvm_init, our job is to get the pmap system ready 1831 * to manage mappings. 1832 */ 1833 void 1834 pmap_init(void) 1835 { 1836 int flags; 1837 1838 /* 1839 * initialize caches. 
1840 */ 1841 1842 pool_cache_bootstrap(&pmap_cache, sizeof(struct pmap), COHERENCY_UNIT, 1843 0, 0, "pmappl", NULL, IPL_NONE, pmap_ctor, pmap_dtor, NULL); 1844 1845 #ifdef XENPV 1846 /* 1847 * pool_cache(9) should not touch cached objects, since they 1848 * are pinned on xen and R/O for the domU 1849 */ 1850 flags = PR_NOTOUCH; 1851 #else 1852 flags = 0; 1853 #endif 1854 1855 #ifdef PAE 1856 pool_init(&pmap_pdp_pool, PAGE_SIZE * PDP_SIZE, 0, 0, flags, 1857 "pdppl", &pmap_pdp_allocator, IPL_NONE); 1858 #else 1859 pool_init(&pmap_pdp_pool, PAGE_SIZE, 0, 0, flags, 1860 "pdppl", NULL, IPL_NONE); 1861 #endif 1862 pool_cache_bootstrap(&pmap_pvp_cache, PAGE_SIZE, PAGE_SIZE, 1863 0, 0, "pvpage", &pool_allocator_kmem, 1864 IPL_NONE, pmap_pvp_ctor, pmap_pvp_dtor, NULL); 1865 1866 pmap_tlb_init(); 1867 1868 /* XXX: Since cpu_hatch() is only for secondary CPUs. */ 1869 pmap_tlb_cpu_init(curcpu()); 1870 1871 evcnt_attach_dynamic(&pmap_iobmp_evcnt, EVCNT_TYPE_MISC, 1872 NULL, "x86", "io bitmap copy"); 1873 evcnt_attach_dynamic(&pmap_ldt_evcnt, EVCNT_TYPE_MISC, 1874 NULL, "x86", "ldt sync"); 1875 1876 /* 1877 * The kernel doesn't keep track of PTPs, so there's nowhere handy 1878 * to hang a tree of pv_entry records. Dynamically allocated 1879 * pv_entry lists are not heavily used in the kernel's pmap (the 1880 * usual case is embedded), so cop out and use a single RB tree 1881 * to cover them. 1882 */ 1883 rb_tree_init(&pmap_kernel_rb, &pmap_rbtree_ops); 1884 1885 /* 1886 * done: pmap module is up (and ready for business) 1887 */ 1888 1889 pmap_initialized = true; 1890 } 1891 1892 #ifndef XENPV 1893 /* 1894 * pmap_cpu_init_late: perform late per-CPU initialization. 1895 */ 1896 void 1897 pmap_cpu_init_late(struct cpu_info *ci) 1898 { 1899 /* 1900 * The BP has already its own PD page allocated during early 1901 * MD startup. 1902 */ 1903 if (ci == &cpu_info_primary) 1904 return; 1905 #ifdef PAE 1906 cpu_alloc_l3_page(ci); 1907 #endif 1908 } 1909 #endif 1910 1911 #ifndef __HAVE_DIRECT_MAP 1912 CTASSERT(CACHE_LINE_SIZE > sizeof(pt_entry_t)); 1913 CTASSERT(CACHE_LINE_SIZE % sizeof(pt_entry_t) == 0); 1914 1915 static void 1916 pmap_vpage_cpualloc(struct cpu_info *ci) 1917 { 1918 bool primary = (ci == &cpu_info_primary); 1919 size_t i, npages; 1920 vaddr_t vabase; 1921 vsize_t vrange; 1922 1923 npages = (CACHE_LINE_SIZE / sizeof(pt_entry_t)); 1924 KASSERT(npages >= VPAGE_MAX); 1925 vrange = npages * PAGE_SIZE; 1926 1927 if (primary) { 1928 while ((vabase = pmap_bootstrap_valloc(1)) % vrange != 0) { 1929 /* Waste some pages to align properly */ 1930 } 1931 /* The base is aligned, allocate the rest (contiguous) */ 1932 pmap_bootstrap_valloc(npages - 1); 1933 } else { 1934 vabase = uvm_km_alloc(kernel_map, vrange, vrange, 1935 UVM_KMF_VAONLY); 1936 if (vabase == 0) { 1937 panic("%s: failed to allocate tmp VA for CPU %d\n", 1938 __func__, cpu_index(ci)); 1939 } 1940 } 1941 1942 KASSERT((vaddr_t)&PTE_BASE[pl1_i(vabase)] % CACHE_LINE_SIZE == 0); 1943 1944 for (i = 0; i < VPAGE_MAX; i++) { 1945 ci->vpage[i] = vabase + i * PAGE_SIZE; 1946 ci->vpage_pte[i] = PTE_BASE + pl1_i(ci->vpage[i]); 1947 } 1948 } 1949 1950 void 1951 pmap_vpage_cpu_init(struct cpu_info *ci) 1952 { 1953 if (ci == &cpu_info_primary) { 1954 /* cpu0 already taken care of in pmap_bootstrap */ 1955 return; 1956 } 1957 1958 pmap_vpage_cpualloc(ci); 1959 } 1960 #endif 1961 1962 /* 1963 * p v _ e n t r y f u n c t i o n s 1964 */ 1965 1966 /* 1967 * pmap_pvp_dtor: pool_cache constructor for PV pages. 
1968 */ 1969 static int 1970 pmap_pvp_ctor(void *arg, void *obj, int flags) 1971 { 1972 struct pv_page *pvp = (struct pv_page *)obj; 1973 struct pv_entry *pve = (struct pv_entry *)obj + 1; 1974 struct pv_entry *maxpve = pve + PVE_PER_PVP; 1975 1976 KASSERT(sizeof(struct pv_page) <= sizeof(struct pv_entry)); 1977 KASSERT(trunc_page((vaddr_t)obj) == (vaddr_t)obj); 1978 1979 LIST_INIT(&pvp->pvp_pves); 1980 pvp->pvp_nfree = PVE_PER_PVP; 1981 pvp->pvp_pmap = NULL; 1982 1983 for (; pve < maxpve; pve++) { 1984 LIST_INSERT_HEAD(&pvp->pvp_pves, pve, pve_list); 1985 } 1986 1987 return 0; 1988 } 1989 1990 /* 1991 * pmap_pvp_dtor: pool_cache destructor for PV pages. 1992 */ 1993 static void 1994 pmap_pvp_dtor(void *arg, void *obj) 1995 { 1996 struct pv_page *pvp __diagused = obj; 1997 1998 KASSERT(pvp->pvp_pmap == NULL); 1999 KASSERT(pvp->pvp_nfree == PVE_PER_PVP); 2000 } 2001 2002 /* 2003 * pmap_alloc_pv: allocate a PV entry (likely cached with pmap). 2004 */ 2005 static struct pv_entry * 2006 pmap_alloc_pv(struct pmap *pmap) 2007 { 2008 struct pv_entry *pve; 2009 struct pv_page *pvp; 2010 2011 KASSERT(mutex_owned(&pmap->pm_lock)); 2012 2013 if (__predict_false((pvp = LIST_FIRST(&pmap->pm_pvp_part)) == NULL)) { 2014 if ((pvp = LIST_FIRST(&pmap->pm_pvp_full)) != NULL) { 2015 LIST_REMOVE(pvp, pvp_list); 2016 } else { 2017 pvp = pool_cache_get(&pmap_pvp_cache, PR_NOWAIT); 2018 } 2019 if (__predict_false(pvp == NULL)) { 2020 return NULL; 2021 } 2022 /* full -> part */ 2023 LIST_INSERT_HEAD(&pmap->pm_pvp_part, pvp, pvp_list); 2024 pvp->pvp_pmap = pmap; 2025 } 2026 2027 KASSERT(pvp->pvp_pmap == pmap); 2028 KASSERT(pvp->pvp_nfree > 0); 2029 2030 pve = LIST_FIRST(&pvp->pvp_pves); 2031 LIST_REMOVE(pve, pve_list); 2032 pvp->pvp_nfree--; 2033 2034 if (__predict_false(pvp->pvp_nfree == 0)) { 2035 /* part -> empty */ 2036 KASSERT(LIST_EMPTY(&pvp->pvp_pves)); 2037 LIST_REMOVE(pvp, pvp_list); 2038 LIST_INSERT_HEAD(&pmap->pm_pvp_empty, pvp, pvp_list); 2039 } else { 2040 KASSERT(!LIST_EMPTY(&pvp->pvp_pves)); 2041 } 2042 2043 return pve; 2044 } 2045 2046 /* 2047 * pmap_free_pv: delayed free of a PV entry. 2048 */ 2049 static void 2050 pmap_free_pv(struct pmap *pmap, struct pv_entry *pve) 2051 { 2052 struct pv_page *pvp = (struct pv_page *)trunc_page((vaddr_t)pve); 2053 2054 KASSERT(mutex_owned(&pmap->pm_lock)); 2055 KASSERT(pvp->pvp_pmap == pmap); 2056 KASSERT(pvp->pvp_nfree >= 0); 2057 2058 LIST_INSERT_HEAD(&pvp->pvp_pves, pve, pve_list); 2059 pvp->pvp_nfree++; 2060 2061 if (__predict_false(pvp->pvp_nfree == 1)) { 2062 /* empty -> part */ 2063 LIST_REMOVE(pvp, pvp_list); 2064 LIST_INSERT_HEAD(&pmap->pm_pvp_part, pvp, pvp_list); 2065 } else if (__predict_false(pvp->pvp_nfree == PVE_PER_PVP)) { 2066 /* part -> full */ 2067 LIST_REMOVE(pvp, pvp_list); 2068 LIST_INSERT_HEAD(&pmap->pm_pvp_full, pvp, pvp_list); 2069 } 2070 } 2071 2072 /* 2073 * pmap_drain_pv: free full PV pages. 
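 * Only pages on pm_pvp_full (i.e. pages whose every pv_entry is free
 * again) are handed back to pmap_pvp_cache; partially used pages stay
 * cached with the pmap.  The caller must hold pm_lock.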
2074 */ 2075 static void 2076 pmap_drain_pv(struct pmap *pmap) 2077 { 2078 struct pv_page *pvp; 2079 2080 KASSERT(mutex_owned(&pmap->pm_lock)); 2081 2082 while ((pvp = LIST_FIRST(&pmap->pm_pvp_full)) != NULL) { 2083 LIST_REMOVE(pvp, pvp_list); 2084 KASSERT(pvp->pvp_pmap == pmap); 2085 KASSERT(pvp->pvp_nfree == PVE_PER_PVP); 2086 pvp->pvp_pmap = NULL; 2087 pool_cache_put(&pmap_pvp_cache, pvp); 2088 } 2089 } 2090 2091 /* 2092 * pmap_check_pv: verify {VA, PTP} pair is either tracked/untracked by page 2093 */ 2094 static void 2095 pmap_check_pv(struct pmap *pmap, struct vm_page *ptp, struct pmap_page *pp, 2096 vaddr_t va, bool tracked) 2097 { 2098 #ifdef DEBUG 2099 struct pv_pte *pvpte; 2100 2101 PMAP_CHECK_PP(pp); 2102 2103 mutex_spin_enter(&pp->pp_lock); 2104 for (pvpte = pv_pte_first(pp); pvpte; pvpte = pv_pte_next(pp, pvpte)) { 2105 if (pvpte->pte_ptp == ptp && pvpte->pte_va == va) { 2106 break; 2107 } 2108 } 2109 mutex_spin_exit(&pp->pp_lock); 2110 2111 if (pvpte && !tracked) { 2112 panic("pmap_check_pv: %p/%lx found on pp %p", ptp, va, pp); 2113 } else if (!pvpte && tracked) { 2114 panic("pmap_check_pv: %p/%lx missing on pp %p", ptp, va, pp); 2115 } 2116 #endif 2117 } 2118 2119 /* 2120 * pmap_treelookup_pv: search the PV tree for a dynamic entry 2121 * 2122 * => pmap must be locked 2123 */ 2124 static struct pv_entry * 2125 pmap_treelookup_pv(const struct pmap *pmap, const struct vm_page *ptp, 2126 const rb_tree_t *tree, const vaddr_t va) 2127 { 2128 struct pv_entry *pve; 2129 rb_node_t *node; 2130 2131 /* 2132 * Inlined lookup tailored for exactly what's needed here that is 2133 * quite a bit faster than using rb_tree_find_node(). 2134 */ 2135 for (node = tree->rbt_root;;) { 2136 if (__predict_false(RB_SENTINEL_P(node))) { 2137 return NULL; 2138 } 2139 pve = (struct pv_entry *) 2140 ((uintptr_t)node - offsetof(struct pv_entry, pve_rb)); 2141 if (pve->pve_pte.pte_va == va) { 2142 KASSERT(pve->pve_pte.pte_ptp == ptp); 2143 return pve; 2144 } 2145 node = node->rb_nodes[pve->pve_pte.pte_va < va]; 2146 } 2147 } 2148 2149 /* 2150 * pmap_lookup_pv: look up a non-embedded pv entry for the given pmap 2151 * 2152 * => a PV entry must be known present (doesn't check for existence) 2153 * => pmap must be locked 2154 */ 2155 static struct pv_entry * 2156 pmap_lookup_pv(const struct pmap *pmap, const struct vm_page *ptp, 2157 const struct pmap_page * const old_pp, const vaddr_t va) 2158 { 2159 struct pv_entry *pve; 2160 const rb_tree_t *tree; 2161 2162 KASSERT(mutex_owned(&pmap->pm_lock)); 2163 KASSERT(ptp != NULL || pmap == pmap_kernel()); 2164 2165 /* 2166 * [This mostly deals with the case of process-private pages, i.e. 2167 * anonymous memory allocations or COW.] 2168 * 2169 * If the page is tracked with an embedded entry then the tree 2170 * lookup can be avoided. It's safe to check for this specific 2171 * set of values without pp_lock because both will only ever be 2172 * set together for this pmap. 2173 * 2174 */ 2175 if (atomic_load_relaxed(&old_pp->pp_pte.pte_ptp) == ptp && 2176 atomic_load_relaxed(&old_pp->pp_pte.pte_va) == va) { 2177 return NULL; 2178 } 2179 2180 /* 2181 * [This mostly deals with shared mappings, for example shared libs 2182 * and executables.] 2183 * 2184 * Optimise for pmap_remove_ptes() which works by ascending scan: 2185 * look at the lowest numbered node in the tree first. The tree is 2186 * known non-empty because of the check above. For short lived 2187 * processes where pmap_remove() isn't used much this gets close to 2188 * a 100% hit rate. 
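 * In that common case the lookup is O(1): the tree keeps its minimum
 * cached, so the leftmost node can be read directly.  Otherwise fall
 * back to the full descent in pmap_treelookup_pv().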
2189 */ 2190 tree = (ptp != NULL ? &VM_PAGE_TO_PP(ptp)->pp_rb : &pmap_kernel_rb); 2191 KASSERT(!RB_SENTINEL_P(tree->rbt_root)); 2192 pve = (struct pv_entry *) 2193 ((uintptr_t)tree->rbt_minmax[RB_DIR_LEFT] - 2194 offsetof(struct pv_entry, pve_rb)); 2195 if (__predict_true(pve->pve_pte.pte_va == va)) { 2196 KASSERT(pve->pve_pte.pte_ptp == ptp); 2197 return pve; 2198 } 2199 2200 /* Search the RB tree for the key (uncommon). */ 2201 return pmap_treelookup_pv(pmap, ptp, tree, va); 2202 } 2203 2204 /* 2205 * pmap_enter_pv: enter a mapping onto a pmap_page lst 2206 * 2207 * => pmap must be locked 2208 * => does NOT insert dynamic entries to tree (pmap_enter() does later) 2209 */ 2210 static int 2211 pmap_enter_pv(struct pmap *pmap, struct pmap_page *pp, struct vm_page *ptp, 2212 vaddr_t va, struct pv_entry **new_pve, struct pv_entry **old_pve, 2213 bool *samepage, bool *new_embedded, rb_tree_t *tree) 2214 { 2215 struct pv_entry *pve; 2216 int error; 2217 2218 KASSERT(mutex_owned(&pmap->pm_lock)); 2219 KASSERT(ptp_to_pmap(ptp) == pmap); 2220 KASSERT(ptp == NULL || ptp->uobject != NULL); 2221 KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset); 2222 PMAP_CHECK_PP(pp); 2223 2224 /* 2225 * If entering the same page and it's already tracked with an 2226 * embedded entry, we can avoid the expense below. It's safe 2227 * to check for this very specific set of values without a lock 2228 * because both will only ever be set together for this pmap. 2229 */ 2230 if (atomic_load_relaxed(&pp->pp_pte.pte_ptp) == ptp && 2231 atomic_load_relaxed(&pp->pp_pte.pte_va) == va) { 2232 *samepage = true; 2233 pmap_check_pv(pmap, ptp, pp, va, true); 2234 return 0; 2235 } 2236 2237 /* 2238 * Check for an existing dynamic mapping at this address. If it's 2239 * for the same page, then it will be reused and nothing needs to be 2240 * changed. 2241 */ 2242 *old_pve = pmap_treelookup_pv(pmap, ptp, tree, va); 2243 if (*old_pve != NULL && (*old_pve)->pve_pp == pp) { 2244 *samepage = true; 2245 pmap_check_pv(pmap, ptp, pp, va, true); 2246 return 0; 2247 } 2248 2249 /* 2250 * Need to put a new mapping in place. Grab a spare pv_entry in 2251 * case it's needed; won't know for sure until the lock is taken. 2252 */ 2253 if (pmap->pm_pve == NULL) { 2254 pmap->pm_pve = pmap_alloc_pv(pmap); 2255 } 2256 2257 error = 0; 2258 pmap_check_pv(pmap, ptp, pp, va, false); 2259 mutex_spin_enter(&pp->pp_lock); 2260 if (!pv_pte_embedded(pp)) { 2261 /* 2262 * Embedded PV tracking available - easy. 2263 */ 2264 pp->pp_pte.pte_ptp = ptp; 2265 pp->pp_pte.pte_va = va; 2266 *new_embedded = true; 2267 } else if (__predict_false(pmap->pm_pve == NULL)) { 2268 /* 2269 * No memory. 2270 */ 2271 error = ENOMEM; 2272 } else { 2273 /* 2274 * Install new pv_entry on the page. 
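 * Consume the entry pre-allocated above (pmap->pm_pve) and link it
 * onto the page's pv list under pp_lock; the caller is responsible
 * for inserting it into the PTP's tree afterwards (see the function
 * header above).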
2275 */ 2276 pve = pmap->pm_pve; 2277 pmap->pm_pve = NULL; 2278 *new_pve = pve; 2279 pve->pve_pte.pte_ptp = ptp; 2280 pve->pve_pte.pte_va = va; 2281 pve->pve_pp = pp; 2282 LIST_INSERT_HEAD(&pp->pp_pvlist, pve, pve_list); 2283 } 2284 mutex_spin_exit(&pp->pp_lock); 2285 if (error == 0) { 2286 pmap_check_pv(pmap, ptp, pp, va, true); 2287 } 2288 2289 return error; 2290 } 2291 2292 /* 2293 * pmap_remove_pv: try to remove a mapping from a pv_list 2294 * 2295 * => pmap must be locked 2296 * => removes dynamic entries from tree and frees them 2297 * => caller should adjust ptp's wire_count and free PTP if needed 2298 */ 2299 static void 2300 pmap_remove_pv(struct pmap *pmap, struct pmap_page *pp, struct vm_page *ptp, 2301 vaddr_t va, struct pv_entry *pve, uint8_t oattrs) 2302 { 2303 rb_tree_t *tree = (ptp != NULL ? 2304 &VM_PAGE_TO_PP(ptp)->pp_rb : &pmap_kernel_rb); 2305 2306 KASSERT(mutex_owned(&pmap->pm_lock)); 2307 KASSERT(ptp_to_pmap(ptp) == pmap); 2308 KASSERT(ptp == NULL || ptp->uobject != NULL); 2309 KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset); 2310 KASSERT(ptp != NULL || pmap == pmap_kernel()); 2311 2312 pmap_check_pv(pmap, ptp, pp, va, true); 2313 2314 if (pve == NULL) { 2315 mutex_spin_enter(&pp->pp_lock); 2316 KASSERT(pp->pp_pte.pte_ptp == ptp); 2317 KASSERT(pp->pp_pte.pte_va == va); 2318 pp->pp_attrs |= oattrs; 2319 pp->pp_pte.pte_ptp = NULL; 2320 pp->pp_pte.pte_va = 0; 2321 mutex_spin_exit(&pp->pp_lock); 2322 } else { 2323 mutex_spin_enter(&pp->pp_lock); 2324 KASSERT(pp->pp_pte.pte_ptp != ptp || 2325 pp->pp_pte.pte_va != va); 2326 KASSERT(pve->pve_pte.pte_ptp == ptp); 2327 KASSERT(pve->pve_pte.pte_va == va); 2328 KASSERT(pve->pve_pp == pp); 2329 pp->pp_attrs |= oattrs; 2330 LIST_REMOVE(pve, pve_list); 2331 mutex_spin_exit(&pp->pp_lock); 2332 2333 KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == pve); 2334 rb_tree_remove_node(tree, pve); 2335 #ifdef DIAGNOSTIC 2336 memset(pve, 0, sizeof(*pve)); 2337 #endif 2338 pmap_free_pv(pmap, pve); 2339 } 2340 2341 KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == NULL); 2342 pmap_check_pv(pmap, ptp, pp, va, false); 2343 } 2344 2345 /* 2346 * p t p f u n c t i o n s 2347 */ 2348 2349 static struct vm_page * 2350 pmap_find_ptp(struct pmap *pmap, vaddr_t va, int level) 2351 { 2352 int lidx = level - 1; 2353 off_t off = ptp_va2o(va, level); 2354 struct vm_page *pg; 2355 2356 KASSERT(mutex_owned(&pmap->pm_lock)); 2357 2358 if (pmap->pm_ptphint[lidx] && off == pmap->pm_ptphint[lidx]->offset) { 2359 KASSERT(pmap->pm_ptphint[lidx]->wire_count > 0); 2360 pg = pmap->pm_ptphint[lidx]; 2361 PMAP_CHECK_PP(VM_PAGE_TO_PP(pg)); 2362 return pg; 2363 } 2364 PMAP_DUMMY_LOCK(pmap); 2365 pg = uvm_pagelookup(&pmap->pm_obj[lidx], off); 2366 PMAP_DUMMY_UNLOCK(pmap); 2367 if (pg != NULL && __predict_false(pg->wire_count == 0)) { 2368 /* This page is queued to be freed - ignore. */ 2369 pg = NULL; 2370 } 2371 if (pg != NULL) { 2372 PMAP_CHECK_PP(VM_PAGE_TO_PP(pg)); 2373 } 2374 pmap->pm_ptphint[lidx] = pg; 2375 return pg; 2376 } 2377 2378 static inline void 2379 pmap_freepage(struct pmap *pmap, struct vm_page *ptp, int level) 2380 { 2381 int lidx; 2382 2383 KASSERT(ptp->wire_count <= 1); 2384 PMAP_CHECK_PP(VM_PAGE_TO_PP(ptp)); 2385 2386 lidx = level - 1; 2387 pmap_stats_update(pmap, -ptp->wire_count, 0); 2388 if (pmap->pm_ptphint[lidx] == ptp) 2389 pmap->pm_ptphint[lidx] = NULL; 2390 ptp->wire_count = 0; 2391 ptp->uanon = NULL; 2392 KASSERT(RB_TREE_MIN(&VM_PAGE_TO_PP(ptp)->pp_rb) == NULL); 2393 2394 /* 2395 * Enqueue the PTP to be freed by pmap_update(). 
We can't remove 2396 * the page from the uvm_object, as that can take further locks 2397 * (intolerable right now because the PTEs are likely mapped in). 2398 * Instead mark the PTP as free and if we bump into it again, we'll 2399 * either ignore or reuse (depending on what's useful at the time). 2400 */ 2401 LIST_INSERT_HEAD(&pmap->pm_gc_ptp, ptp, mdpage.mp_pp.pp_link); 2402 } 2403 2404 static void 2405 pmap_free_ptp(struct pmap *pmap, struct vm_page *ptp, vaddr_t va, 2406 pt_entry_t *ptes, pd_entry_t * const *pdes) 2407 { 2408 unsigned long index; 2409 int level; 2410 vaddr_t invaladdr; 2411 pd_entry_t opde; 2412 2413 KASSERT(pmap != pmap_kernel()); 2414 KASSERT(mutex_owned(&pmap->pm_lock)); 2415 KASSERT(kpreempt_disabled()); 2416 2417 level = 1; 2418 do { 2419 index = pl_i(va, level + 1); 2420 opde = pmap_pte_testset(&pdes[level - 1][index], 0); 2421 2422 /* 2423 * On Xen-amd64 or SVS, we need to sync the top level page 2424 * directory on each CPU. 2425 */ 2426 #if defined(XENPV) && defined(__x86_64__) 2427 if (level == PTP_LEVELS - 1) { 2428 xen_kpm_sync(pmap, index); 2429 } 2430 #elif defined(SVS) 2431 if (svs_enabled && level == PTP_LEVELS - 1) { 2432 svs_pmap_sync(pmap, index); 2433 } 2434 #endif 2435 2436 invaladdr = level == 1 ? (vaddr_t)ptes : 2437 (vaddr_t)pdes[level - 2]; 2438 pmap_tlb_shootdown(pmap, invaladdr + index * PAGE_SIZE, 2439 opde, TLBSHOOT_FREE_PTP); 2440 2441 #if defined(XENPV) 2442 pmap_tlb_shootnow(); 2443 #endif 2444 2445 pmap_freepage(pmap, ptp, level); 2446 if (level < PTP_LEVELS - 1) { 2447 ptp = pmap_find_ptp(pmap, va, level + 1); 2448 ptp->wire_count--; 2449 if (ptp->wire_count > 1) 2450 break; 2451 } 2452 } while (++level < PTP_LEVELS); 2453 pmap_pte_flush(); 2454 } 2455 2456 /* 2457 * pmap_get_ptp: get a PTP (if there isn't one, allocate a new one) 2458 * 2459 * => pmap should NOT be pmap_kernel() 2460 * => pmap should be locked 2461 * => we are not touching any PTEs yet, so they need not be mapped in 2462 */ 2463 static int 2464 pmap_get_ptp(struct pmap *pmap, struct pmap_ptparray *pt, vaddr_t va, 2465 int flags, struct vm_page **resultp) 2466 { 2467 struct vm_page *ptp; 2468 int i, aflags; 2469 struct uvm_object *obj; 2470 voff_t off; 2471 2472 KASSERT(pmap != pmap_kernel()); 2473 KASSERT(mutex_owned(&pmap->pm_lock)); 2474 2475 /* 2476 * Loop through all page table levels allocating a page 2477 * for any level where we don't already have one. 2478 */ 2479 memset(pt, 0, sizeof(*pt)); 2480 aflags = ((flags & PMAP_CANFAIL) ? 0 : UVM_PGA_USERESERVE) | 2481 UVM_PGA_ZERO; 2482 for (i = PTP_LEVELS; i > 1; i--) { 2483 obj = &pmap->pm_obj[i - 2]; 2484 off = ptp_va2o(va, i - 1); 2485 2486 PMAP_DUMMY_LOCK(pmap); 2487 pt->pg[i] = uvm_pagelookup(obj, off); 2488 2489 if (pt->pg[i] == NULL) { 2490 pt->pg[i] = uvm_pagealloc(obj, off, NULL, aflags); 2491 pt->alloced[i] = (pt->pg[i] != NULL); 2492 } else if (pt->pg[i]->wire_count == 0) { 2493 /* This page was queued to be freed; dequeue it. 
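 * It is still sitting on pm_gc_ptp waiting for pmap_update(), so
 * pull it back off that list and treat it as a fresh allocation.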
*/ 2494 LIST_REMOVE(pt->pg[i], mdpage.mp_pp.pp_link); 2495 pt->alloced[i] = true; 2496 } 2497 PMAP_DUMMY_UNLOCK(pmap); 2498 if (pt->pg[i] == NULL) { 2499 pmap_unget_ptp(pmap, pt); 2500 return ENOMEM; 2501 } else if (pt->alloced[i]) { 2502 pt->pg[i]->uanon = (struct vm_anon *)(vaddr_t)~0L; 2503 rb_tree_init(&VM_PAGE_TO_PP(pt->pg[i])->pp_rb, 2504 &pmap_rbtree_ops); 2505 PMAP_CHECK_PP(VM_PAGE_TO_PP(pt->pg[i])); 2506 } 2507 } 2508 ptp = pt->pg[2]; 2509 KASSERT(ptp != NULL); 2510 *resultp = ptp; 2511 pmap->pm_ptphint[0] = ptp; 2512 return 0; 2513 } 2514 2515 /* 2516 * pmap_install_ptp: install any freshly allocated PTPs 2517 * 2518 * => pmap should NOT be pmap_kernel() 2519 * => pmap should be locked 2520 * => PTEs must be mapped 2521 * => preemption must be disabled 2522 */ 2523 static void 2524 pmap_install_ptp(struct pmap *pmap, struct pmap_ptparray *pt, vaddr_t va, 2525 pd_entry_t * const *pdes) 2526 { 2527 struct vm_page *ptp; 2528 unsigned long index; 2529 pd_entry_t *pva; 2530 paddr_t pa; 2531 int i; 2532 2533 KASSERT(pmap != pmap_kernel()); 2534 KASSERT(mutex_owned(&pmap->pm_lock)); 2535 KASSERT(kpreempt_disabled()); 2536 2537 /* 2538 * Now that we have all the pages looked up or allocated, 2539 * loop through again installing any new ones into the tree. 2540 */ 2541 for (i = PTP_LEVELS; i > 1; i--) { 2542 index = pl_i(va, i); 2543 pva = pdes[i - 2]; 2544 2545 if (pmap_valid_entry(pva[index])) { 2546 KASSERT(!pt->alloced[i]); 2547 continue; 2548 } 2549 2550 ptp = pt->pg[i]; 2551 ptp->flags &= ~PG_BUSY; /* never busy */ 2552 ptp->wire_count = 1; 2553 pmap->pm_ptphint[i - 2] = ptp; 2554 pa = VM_PAGE_TO_PHYS(ptp); 2555 pmap_pte_set(&pva[index], (pd_entry_t) 2556 (pmap_pa2pte(pa) | PTE_U | PTE_W | PTE_P)); 2557 2558 /* 2559 * On Xen-amd64 or SVS, we need to sync the top level page 2560 * directory on each CPU. 2561 */ 2562 #if defined(XENPV) && defined(__x86_64__) 2563 if (i == PTP_LEVELS) { 2564 xen_kpm_sync(pmap, index); 2565 } 2566 #elif defined(SVS) 2567 if (svs_enabled && i == PTP_LEVELS) { 2568 svs_pmap_sync(pmap, index); 2569 } 2570 #endif 2571 2572 pmap_pte_flush(); 2573 pmap_stats_update(pmap, 1, 0); 2574 2575 /* 2576 * If we're not in the top level, increase the 2577 * wire count of the parent page. 2578 */ 2579 if (i < PTP_LEVELS) { 2580 pt->pg[i + 1]->wire_count++; 2581 } 2582 } 2583 } 2584 2585 /* 2586 * pmap_unget_ptp: free unusued PTPs 2587 * 2588 * => pmap should NOT be pmap_kernel() 2589 * => pmap should be locked 2590 */ 2591 static void 2592 pmap_unget_ptp(struct pmap *pmap, struct pmap_ptparray *pt) 2593 { 2594 int i; 2595 2596 KASSERT(pmap != pmap_kernel()); 2597 KASSERT(mutex_owned(&pmap->pm_lock)); 2598 2599 for (i = PTP_LEVELS; i > 1; i--) { 2600 if (!pt->alloced[i]) { 2601 continue; 2602 } 2603 KASSERT(pt->pg[i]->wire_count == 0); 2604 PMAP_CHECK_PP(VM_PAGE_TO_PP(pt->pg[i])); 2605 pmap_freepage(pmap, pt->pg[i], i - 1); 2606 } 2607 } 2608 2609 /* 2610 * p m a p l i f e c y c l e f u n c t i o n s 2611 */ 2612 2613 /* 2614 * pmap_pdp_init: constructor a new PDP. 2615 */ 2616 static void 2617 pmap_pdp_init(pd_entry_t *pdir) 2618 { 2619 paddr_t pdirpa = 0; 2620 vaddr_t object; 2621 int i; 2622 2623 #if !defined(XENPV) || !defined(__x86_64__) 2624 int npde; 2625 #endif 2626 #ifdef XENPV 2627 int s; 2628 #endif 2629 2630 memset(pdir, 0, PDP_SIZE * PAGE_SIZE); 2631 2632 /* 2633 * NOTE: This is all done unlocked, but we will check afterwards 2634 * if we have raced with pmap_growkernel(). 
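 * That check happens in pmap_ctor(): it re-runs pmap_pdp_init()
 * until the last kernel PDE copied from PDP_BASE is seen non-zero
 * while pmaps_lock is held.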
2635 */ 2636 2637 #if defined(XENPV) && defined(__x86_64__) 2638 /* Fetch the physical address of the page directory */ 2639 (void)pmap_extract(pmap_kernel(), (vaddr_t)pdir, &pdirpa); 2640 2641 /* 2642 * This pdir will NEVER be active in kernel mode, so mark 2643 * recursive entry invalid. 2644 */ 2645 pdir[PDIR_SLOT_PTE] = pmap_pa2pte(pdirpa); 2646 2647 /* 2648 * PDP constructed this way won't be for the kernel, hence we 2649 * don't put kernel mappings on Xen. 2650 * 2651 * But we need to make pmap_create() happy, so put a dummy 2652 * (without PTE_P) value at the right place. 2653 */ 2654 pdir[PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1] - 1] = 2655 (pd_entry_t)-1 & PTE_FRAME; 2656 #else /* XENPV && __x86_64__*/ 2657 object = (vaddr_t)pdir; 2658 for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) { 2659 /* Fetch the physical address of the page directory */ 2660 (void)pmap_extract(pmap_kernel(), object, &pdirpa); 2661 2662 /* Put in recursive PDE to map the PTEs */ 2663 pdir[PDIR_SLOT_PTE + i] = pmap_pa2pte(pdirpa) | PTE_P | 2664 pmap_pg_nx; 2665 #ifndef XENPV 2666 pdir[PDIR_SLOT_PTE + i] |= PTE_W; 2667 #endif 2668 } 2669 2670 /* Copy the kernel's top level PDE */ 2671 npde = nkptp[PTP_LEVELS - 1]; 2672 2673 memcpy(&pdir[PDIR_SLOT_KERN], &PDP_BASE[PDIR_SLOT_KERN], 2674 npde * sizeof(pd_entry_t)); 2675 2676 if (VM_MIN_KERNEL_ADDRESS != KERNBASE) { 2677 int idx = pl_i(KERNBASE, PTP_LEVELS); 2678 pdir[idx] = PDP_BASE[idx]; 2679 } 2680 2681 #ifdef __HAVE_PCPU_AREA 2682 pdir[PDIR_SLOT_PCPU] = PDP_BASE[PDIR_SLOT_PCPU]; 2683 #endif 2684 #ifdef __HAVE_DIRECT_MAP 2685 slotspace_copy(SLAREA_DMAP, pdir, PDP_BASE); 2686 #endif 2687 #ifdef KASAN 2688 slotspace_copy(SLAREA_ASAN, pdir, PDP_BASE); 2689 #endif 2690 #ifdef KMSAN 2691 slotspace_copy(SLAREA_MSAN, pdir, PDP_BASE); 2692 #endif 2693 #endif /* XENPV && __x86_64__*/ 2694 2695 #ifdef XENPV 2696 s = splvm(); 2697 object = (vaddr_t)pdir; 2698 pmap_protect(pmap_kernel(), object, object + (PAGE_SIZE * PDP_SIZE), 2699 VM_PROT_READ); 2700 pmap_update(pmap_kernel()); 2701 for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) { 2702 /* 2703 * pin as L2/L4 page, we have to do the page with the 2704 * PDIR_SLOT_PTE entries last 2705 */ 2706 #ifdef PAE 2707 if (i == l2tol3(PDIR_SLOT_PTE)) 2708 continue; 2709 #endif 2710 2711 (void) pmap_extract(pmap_kernel(), object, &pdirpa); 2712 #ifdef __x86_64__ 2713 xpq_queue_pin_l4_table(xpmap_ptom_masked(pdirpa)); 2714 #else 2715 xpq_queue_pin_l2_table(xpmap_ptom_masked(pdirpa)); 2716 #endif 2717 } 2718 #ifdef PAE 2719 object = ((vaddr_t)pdir) + PAGE_SIZE * l2tol3(PDIR_SLOT_PTE); 2720 (void)pmap_extract(pmap_kernel(), object, &pdirpa); 2721 xpq_queue_pin_l2_table(xpmap_ptom_masked(pdirpa)); 2722 #endif 2723 splx(s); 2724 #endif /* XENPV */ 2725 } 2726 2727 /* 2728 * pmap_pdp_fini: destructor for the PDPs. 2729 */ 2730 static void 2731 pmap_pdp_fini(pd_entry_t *pdir) 2732 { 2733 #ifdef XENPV 2734 paddr_t pdirpa = 0; /* XXX: GCC */ 2735 vaddr_t object = (vaddr_t)pdir; 2736 int i; 2737 int s = splvm(); 2738 pt_entry_t *pte; 2739 2740 for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) { 2741 /* fetch the physical address of the page directory. 
*/
2742 (void) pmap_extract(pmap_kernel(), object, &pdirpa);
2743 /* unpin page table */
2744 xpq_queue_unpin_table(xpmap_ptom_masked(pdirpa));
2745 }
2746 object = (vaddr_t)pdir;
2747 for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) {
2748 /* Set page RW again */
2749 pte = kvtopte(object);
2750 pmap_pte_set(pte, *pte | PTE_W);
2751 xen_bcast_invlpg((vaddr_t)object);
2752 }
2753 splx(s);
2754 #endif /* XENPV */
2755 }
2756
2757 #ifdef PAE
2758 static void *
2759 pmap_pdp_alloc(struct pool *pp, int flags)
2760 {
2761 return (void *)uvm_km_alloc(kernel_map,
2762 PAGE_SIZE * PDP_SIZE, PAGE_SIZE * PDP_SIZE,
2763 ((flags & PR_WAITOK) ? 0 : UVM_KMF_NOWAIT | UVM_KMF_TRYLOCK) |
2764 UVM_KMF_WIRED);
2765 }
2766
2767 static void
2768 pmap_pdp_free(struct pool *pp, void *v)
2769 {
2770 uvm_km_free(kernel_map, (vaddr_t)v, PAGE_SIZE * PDP_SIZE,
2771 UVM_KMF_WIRED);
2772 }
2773 #endif /* PAE */
2774
2775 /*
2776 * pmap_ctor: constructor for the pmap cache.
2777 */
2778 static int
2779 pmap_ctor(void *arg, void *obj, int flags)
2780 {
2781 struct pmap *pmap = obj;
2782 pt_entry_t p;
2783 int i;
2784
2785 KASSERT((flags & PR_WAITOK) != 0);
2786
2787 mutex_init(&pmap->pm_lock, MUTEX_DEFAULT, IPL_NONE);
2788 rw_init(&pmap->pm_dummy_lock);
2789 kcpuset_create(&pmap->pm_cpus, true);
2790 kcpuset_create(&pmap->pm_kernel_cpus, true);
2791 #ifdef XENPV
2792 kcpuset_create(&pmap->pm_xen_ptp_cpus, true);
2793 #endif
2794 LIST_INIT(&pmap->pm_gc_ptp);
2795 pmap->pm_pve = NULL;
2796 LIST_INIT(&pmap->pm_pvp_full);
2797 LIST_INIT(&pmap->pm_pvp_part);
2798 LIST_INIT(&pmap->pm_pvp_empty);
2799
2800 /* allocate and init PDP */
2801 pmap->pm_pdir = pool_get(&pmap_pdp_pool, PR_WAITOK);
2802
2803 for (;;) {
2804 pmap_pdp_init(pmap->pm_pdir);
2805 mutex_enter(&pmaps_lock);
2806 p = pmap->pm_pdir[PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1] - 1];
2807 if (__predict_true(p != 0)) {
2808 break;
2809 }
2810 mutex_exit(&pmaps_lock);
2811 }
2812
2813 for (i = 0; i < PDP_SIZE; i++)
2814 pmap->pm_pdirpa[i] =
2815 pmap_pte2pa(pmap->pm_pdir[PDIR_SLOT_PTE + i]);
2816
2817 LIST_INSERT_HEAD(&pmaps, pmap, pm_list);
2818 mutex_exit(&pmaps_lock);
2819
2820 return 0;
2821 }
2822
2823 /*
2824 * pmap_dtor: destructor for the pmap cache.
2825 */
2826 static void
2827 pmap_dtor(void *arg, void *obj)
2828 {
2829 struct pmap *pmap = obj;
2830
2831 mutex_enter(&pmaps_lock);
2832 LIST_REMOVE(pmap, pm_list);
2833 mutex_exit(&pmaps_lock);
2834
2835 pmap_pdp_fini(pmap->pm_pdir);
2836 pool_put(&pmap_pdp_pool, pmap->pm_pdir);
2837 mutex_destroy(&pmap->pm_lock);
2838 rw_destroy(&pmap->pm_dummy_lock);
2839 kcpuset_destroy(pmap->pm_cpus);
2840 kcpuset_destroy(pmap->pm_kernel_cpus);
2841 #ifdef XENPV
2842 kcpuset_destroy(pmap->pm_xen_ptp_cpus);
2843 #endif
2844 }
2845
2846 /*
2847 * pmap_create: create a pmap object.
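 * Most of the heavy lifting (allocating the PDP, copying the kernel
 * slots, putting the pmap on the global pmaps list) is done by
 * pmap_ctor() when the pool cache constructs the object, so this
 * mainly resets the per-pmap bookkeeping fields.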
2848 */ 2849 struct pmap * 2850 pmap_create(void) 2851 { 2852 struct pmap *pmap; 2853 int i; 2854 2855 pmap = pool_cache_get(&pmap_cache, PR_WAITOK); 2856 2857 /* init uvm_object */ 2858 for (i = 0; i < PTP_LEVELS - 1; i++) { 2859 uvm_obj_init(&pmap->pm_obj[i], &pmap_pager, false, 1); 2860 uvm_obj_setlock(&pmap->pm_obj[i], &pmap->pm_dummy_lock); 2861 pmap->pm_ptphint[i] = NULL; 2862 } 2863 pmap->pm_stats.wired_count = 0; 2864 /* count the PDP allocd below */ 2865 pmap->pm_stats.resident_count = PDP_SIZE; 2866 #if !defined(__x86_64__) 2867 pmap->pm_hiexec = 0; 2868 #endif 2869 2870 /* Used by NVMM and Xen */ 2871 pmap->pm_enter = NULL; 2872 pmap->pm_extract = NULL; 2873 pmap->pm_remove = NULL; 2874 pmap->pm_sync_pv = NULL; 2875 pmap->pm_pp_remove_ent = NULL; 2876 pmap->pm_write_protect = NULL; 2877 pmap->pm_unwire = NULL; 2878 pmap->pm_tlb_flush = NULL; 2879 pmap->pm_data = NULL; 2880 2881 /* init the LDT */ 2882 pmap->pm_ldt = NULL; 2883 pmap->pm_ldt_sel = GSYSSEL(GLDT_SEL, SEL_KPL); 2884 2885 return (pmap); 2886 } 2887 2888 /* 2889 * pmap_check_ptps: verify that none of the pmap's page table objects 2890 * have any pages allocated to them. 2891 */ 2892 static void 2893 pmap_check_ptps(struct pmap *pmap) 2894 { 2895 int i; 2896 2897 for (i = 0; i < PTP_LEVELS - 1; i++) { 2898 KASSERTMSG(pmap->pm_obj[i].uo_npages == 0, 2899 "pmap %p level %d still has %d pages", 2900 pmap, i, (int)pmap->pm_obj[i].uo_npages); 2901 } 2902 } 2903 2904 static void 2905 pmap_check_inuse(struct pmap *pmap) 2906 { 2907 #ifdef DEBUG 2908 CPU_INFO_ITERATOR cii; 2909 struct cpu_info *ci; 2910 2911 for (CPU_INFO_FOREACH(cii, ci)) { 2912 if (ci->ci_pmap == pmap) 2913 panic("destroying pmap being used"); 2914 #if defined(XENPV) && defined(__x86_64__) 2915 for (int i = 0; i < PDIR_SLOT_USERLIM; i++) { 2916 if (pmap->pm_pdir[i] != 0 && 2917 ci->ci_kpm_pdir[i] == pmap->pm_pdir[i]) { 2918 printf("pmap_destroy(%p) pmap_kernel %p " 2919 "curcpu %d cpu %d ci_pmap %p " 2920 "ci->ci_kpm_pdir[%d]=%" PRIx64 2921 " pmap->pm_pdir[%d]=%" PRIx64 "\n", 2922 pmap, pmap_kernel(), curcpu()->ci_index, 2923 ci->ci_index, ci->ci_pmap, 2924 i, ci->ci_kpm_pdir[i], 2925 i, pmap->pm_pdir[i]); 2926 panic("%s: used pmap", __func__); 2927 } 2928 } 2929 #endif 2930 } 2931 #endif /* DEBUG */ 2932 } 2933 2934 /* 2935 * pmap_destroy: drop reference count on pmap. free pmap if reference 2936 * count goes to zero. 2937 * 2938 * => we can be called from pmap_unmap_ptes() with a different, unrelated 2939 * pmap's lock held. be careful! 2940 */ 2941 void 2942 pmap_destroy(struct pmap *pmap) 2943 { 2944 int i; 2945 2946 /* 2947 * drop reference count and verify not in use. 2948 */ 2949 2950 if (atomic_dec_uint_nv(&pmap->pm_obj[0].uo_refs) > 0) { 2951 return; 2952 } 2953 pmap_check_inuse(pmap); 2954 2955 /* 2956 * handle any deferred frees. 2957 */ 2958 2959 mutex_enter(&pmap->pm_lock); 2960 if (pmap->pm_pve != NULL) { 2961 pmap_free_pv(pmap, pmap->pm_pve); 2962 pmap->pm_pve = NULL; 2963 } 2964 pmap_drain_pv(pmap); 2965 mutex_exit(&pmap->pm_lock); 2966 pmap_update(pmap); 2967 2968 /* 2969 * Reference count is zero, free pmap resources and then free pmap. 2970 */ 2971 2972 pmap_check_ptps(pmap); 2973 KASSERT(LIST_EMPTY(&pmap->pm_gc_ptp)); 2974 2975 #ifdef USER_LDT 2976 if (pmap->pm_ldt != NULL) { 2977 /* 2978 * No need to switch the LDT; this address space is gone, 2979 * nothing is using it. 2980 * 2981 * No need to lock the pmap for ldt_free (or anything else), 2982 * we're the last one to use it. 2983 */ 2984 /* XXXAD can't take cpu_lock here - fix soon. 
*/ 2985 mutex_enter(&cpu_lock); 2986 ldt_free(pmap->pm_ldt_sel); 2987 mutex_exit(&cpu_lock); 2988 uvm_km_free(kernel_map, (vaddr_t)pmap->pm_ldt, 2989 MAX_USERLDT_SIZE, UVM_KMF_WIRED); 2990 } 2991 #endif 2992 2993 for (i = 0; i < PTP_LEVELS - 1; i++) { 2994 uvm_obj_destroy(&pmap->pm_obj[i], false); 2995 } 2996 kcpuset_zero(pmap->pm_cpus); 2997 kcpuset_zero(pmap->pm_kernel_cpus); 2998 #ifdef XENPV 2999 kcpuset_zero(pmap->pm_xen_ptp_cpus); 3000 #endif 3001 3002 KASSERT(LIST_EMPTY(&pmap->pm_pvp_full)); 3003 KASSERT(LIST_EMPTY(&pmap->pm_pvp_part)); 3004 KASSERT(LIST_EMPTY(&pmap->pm_pvp_empty)); 3005 3006 pmap_check_ptps(pmap); 3007 if (__predict_false(pmap->pm_enter != NULL)) { 3008 /* XXX make this a different cache */ 3009 pool_cache_destruct_object(&pmap_cache, pmap); 3010 } else { 3011 pool_cache_put(&pmap_cache, pmap); 3012 } 3013 } 3014 3015 /* 3016 * pmap_zap_ptp: clear out an entire PTP without modifying PTEs 3017 * 3018 * => caller must hold pmap's lock 3019 * => PTP must be mapped into KVA 3020 * => must be called with kernel preemption disabled 3021 * => does as little work as possible 3022 */ 3023 static void 3024 pmap_zap_ptp(struct pmap *pmap, struct vm_page *ptp, pt_entry_t *pte, 3025 vaddr_t startva, vaddr_t blkendva) 3026 { 3027 #ifndef XENPV 3028 struct pv_entry *pve; 3029 struct vm_page *pg; 3030 struct pmap_page *pp; 3031 pt_entry_t opte; 3032 rb_tree_t *tree; 3033 vaddr_t va; 3034 int wired; 3035 uint8_t oattrs; 3036 u_int cnt; 3037 3038 KASSERT(mutex_owned(&pmap->pm_lock)); 3039 KASSERT(kpreempt_disabled()); 3040 KASSERT(pmap != pmap_kernel()); 3041 KASSERT(ptp->wire_count > 1); 3042 KASSERT(ptp->wire_count - 1 <= PAGE_SIZE / sizeof(pt_entry_t)); 3043 3044 /* 3045 * Start at the lowest entered VA, and scan until there are no more 3046 * PTEs in the PTPs. 3047 */ 3048 tree = &VM_PAGE_TO_PP(ptp)->pp_rb; 3049 pve = RB_TREE_MIN(tree); 3050 wired = 0; 3051 va = (vaddr_t)ptp->uanon; 3052 pte += ((va - startva) >> PAGE_SHIFT); 3053 3054 for (cnt = ptp->wire_count; cnt > 1; pte++, va += PAGE_SIZE) { 3055 /* 3056 * No need for an atomic to clear the PTE. Nothing else can 3057 * see the address space any more and speculative access (if 3058 * possible) won't modify. Therefore there's no need to 3059 * track the accessed/dirty bits. 3060 */ 3061 opte = *pte; 3062 if (!pmap_valid_entry(opte)) { 3063 continue; 3064 } 3065 3066 /* 3067 * Count the PTE. If it's not for a managed mapping 3068 * there's noting more to do. 3069 */ 3070 cnt--; 3071 wired -= (opte & PTE_WIRED); 3072 if ((opte & PTE_PVLIST) == 0) { 3073 #ifndef DOM0OPS 3074 KASSERTMSG((PHYS_TO_VM_PAGE(pmap_pte2pa(opte)) == NULL), 3075 "managed page without PTE_PVLIST for %#" 3076 PRIxVADDR, va); 3077 KASSERTMSG((pmap_pv_tracked(pmap_pte2pa(opte)) == NULL), 3078 "pv-tracked page without PTE_PVLIST for %#" 3079 PRIxVADDR, va); 3080 #endif 3081 KASSERT(pmap_treelookup_pv(pmap, ptp, (ptp != NULL ? 3082 &VM_PAGE_TO_PP(ptp)->pp_rb : &pmap_kernel_rb), 3083 va) == NULL); 3084 continue; 3085 } 3086 3087 /* 3088 * "pve" now points to the lowest (by VA) dynamic PV entry 3089 * in the PTP. If it's for this VA, take advantage of it to 3090 * avoid calling PHYS_TO_VM_PAGE(). Avoid modifying the RB 3091 * tree by skipping to the next VA in the tree whenever 3092 * there is a match here. The tree will be cleared out in 3093 * one pass before return to pmap_remove_all(). 
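 * The pv_entry itself is only unlinked from the page here; it goes
 * back to its pv_page via pmap_free_pv(), and fully free pv_pages
 * are returned to the pool by pmap_drain_pv(), called later from
 * pmap_remove_all().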
3094 */ 3095 oattrs = pmap_pte_to_pp_attrs(opte); 3096 if (pve != NULL && pve->pve_pte.pte_va == va) { 3097 pp = pve->pve_pp; 3098 KASSERT(pve->pve_pte.pte_ptp == ptp); 3099 KASSERT(pp->pp_pte.pte_ptp != ptp || 3100 pp->pp_pte.pte_va != va); 3101 mutex_spin_enter(&pp->pp_lock); 3102 pp->pp_attrs |= oattrs; 3103 LIST_REMOVE(pve, pve_list); 3104 mutex_spin_exit(&pp->pp_lock); 3105 3106 /* 3107 * pve won't be touched again until pmap_drain_pv(), 3108 * so it's still safe to traverse the tree. 3109 */ 3110 pmap_free_pv(pmap, pve); 3111 pve = RB_TREE_NEXT(tree, pve); 3112 continue; 3113 } 3114 3115 /* 3116 * No entry in the tree so it must be embedded. Look up the 3117 * page and cancel the embedded entry. 3118 */ 3119 if ((pg = PHYS_TO_VM_PAGE(pmap_pte2pa(opte))) != NULL) { 3120 pp = VM_PAGE_TO_PP(pg); 3121 } else if ((pp = pmap_pv_tracked(pmap_pte2pa(opte))) == NULL) { 3122 paddr_t pa = pmap_pte2pa(opte); 3123 panic("%s: PTE_PVLIST with pv-untracked page" 3124 " va = %#"PRIxVADDR"pa = %#"PRIxPADDR 3125 "(%#"PRIxPADDR")", __func__, va, pa, atop(pa)); 3126 } 3127 mutex_spin_enter(&pp->pp_lock); 3128 KASSERT(pp->pp_pte.pte_ptp == ptp); 3129 KASSERT(pp->pp_pte.pte_va == va); 3130 pp->pp_attrs |= oattrs; 3131 pp->pp_pte.pte_ptp = NULL; 3132 pp->pp_pte.pte_va = 0; 3133 mutex_spin_exit(&pp->pp_lock); 3134 } 3135 3136 /* PTP now empty - adjust the tree & stats to match. */ 3137 pmap_stats_update(pmap, -(ptp->wire_count - 1), wired / PTE_WIRED); 3138 ptp->wire_count = 1; 3139 #ifdef DIAGNOSTIC 3140 rb_tree_init(tree, &pmap_rbtree_ops); 3141 #endif 3142 #else /* !XENPV */ 3143 /* 3144 * XXXAD For XEN, it's not clear to me that we can do this, because 3145 * I guess the hypervisor keeps track of PTEs too. 3146 */ 3147 pmap_remove_ptes(pmap, ptp, (vaddr_t)pte, startva, blkendva); 3148 #endif /* !XENPV */ 3149 } 3150 3151 /* 3152 * pmap_remove_all: remove all mappings from pmap in bulk. 3153 * 3154 * Ordinarily when removing mappings it's important to hold the UVM object's 3155 * lock, so that pages do not gain a new identity while retaining stale TLB 3156 * entries (the same lock hold covers both pmap_remove() and pmap_update()). 3157 * Here it's known that the address space is no longer visible to any user 3158 * process, so we don't need to worry about that. 3159 */ 3160 bool 3161 pmap_remove_all(struct pmap *pmap) 3162 { 3163 struct vm_page *ptps[32]; 3164 vaddr_t va, blkendva; 3165 struct pmap *pmap2; 3166 pt_entry_t *ptes; 3167 pd_entry_t pde __diagused; 3168 pd_entry_t * const *pdes; 3169 int lvl __diagused, i, n; 3170 3171 /* XXX Can't handle EPT just yet. */ 3172 if (pmap->pm_remove != NULL) { 3173 return false; 3174 } 3175 3176 for (;;) { 3177 /* Fetch a block of PTPs from tree. */ 3178 mutex_enter(&pmap->pm_lock); 3179 n = radix_tree_gang_lookup_node(&pmap->pm_obj[0].uo_pages, 0, 3180 (void **)ptps, __arraycount(ptps), false); 3181 if (n == 0) { 3182 mutex_exit(&pmap->pm_lock); 3183 break; 3184 } 3185 3186 /* Remove all mappings in the set of PTPs. */ 3187 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); 3188 for (i = 0; i < n; i++) { 3189 if (ptps[i]->wire_count == 0) { 3190 /* It's dead: pmap_update() will expunge. */ 3191 continue; 3192 } 3193 3194 /* Determine range of block. */ 3195 va = ptps[i]->offset * PAGE_SIZE / sizeof(pt_entry_t); 3196 blkendva = x86_round_pdr(va + 1); 3197 3198 /* Make sure everything squares up... */ 3199 KASSERT(pmap_pdes_valid(va, pdes, &pde, &lvl)); 3200 KASSERT(lvl == 1); 3201 KASSERT(pmap_find_ptp(pmap, va, 1) == ptps[i]); 3202 3203 /* Zap! 
*/ 3204 pmap_zap_ptp(pmap, ptps[i], &ptes[pl1_i(va)], va, 3205 blkendva); 3206 3207 /* PTP should now be unused - free it. */ 3208 KASSERT(ptps[i]->wire_count == 1); 3209 pmap_free_ptp(pmap, ptps[i], va, ptes, pdes); 3210 } 3211 pmap_unmap_ptes(pmap, pmap2); 3212 pmap_drain_pv(pmap); 3213 pmap_tlb_shootdown(pmap, -1L, 0, TLBSHOOT_REMOVE_ALL); 3214 mutex_exit(&pmap->pm_lock); 3215 3216 /* Process deferred frees. */ 3217 pmap_update(pmap); 3218 3219 /* A breathing point. */ 3220 preempt_point(); 3221 } 3222 3223 /* Verify that the pmap is now completely empty. */ 3224 pmap_check_ptps(pmap); 3225 KASSERTMSG(pmap->pm_stats.resident_count == PDP_SIZE, 3226 "pmap %p not empty", pmap); 3227 3228 return true; 3229 } 3230 3231 #if defined(PMAP_FORK) 3232 /* 3233 * pmap_fork: perform any necessary data structure manipulation when 3234 * a VM space is forked. 3235 */ 3236 void 3237 pmap_fork(struct pmap *pmap1, struct pmap *pmap2) 3238 { 3239 #ifdef USER_LDT 3240 union descriptor *new_ldt; 3241 int sel; 3242 3243 if (__predict_true(pmap1->pm_ldt == NULL)) { 3244 return; 3245 } 3246 3247 /* 3248 * Copy the LDT into the new process. 3249 * 3250 * Read pmap1's ldt pointer unlocked; if it changes behind our back 3251 * we'll retry. This will starve if there's a stream of LDT changes 3252 * in another thread but that should not happen. 3253 */ 3254 3255 retry: 3256 if (pmap1->pm_ldt != NULL) { 3257 /* Allocate space for the new process's LDT */ 3258 new_ldt = (union descriptor *)uvm_km_alloc(kernel_map, 3259 MAX_USERLDT_SIZE, 0, UVM_KMF_WIRED); 3260 if (new_ldt == NULL) { 3261 printf("WARNING: %s: unable to allocate LDT space\n", 3262 __func__); 3263 return; 3264 } 3265 mutex_enter(&cpu_lock); 3266 /* Get a GDT slot for it */ 3267 sel = ldt_alloc(new_ldt, MAX_USERLDT_SIZE); 3268 if (sel == -1) { 3269 mutex_exit(&cpu_lock); 3270 uvm_km_free(kernel_map, (vaddr_t)new_ldt, 3271 MAX_USERLDT_SIZE, UVM_KMF_WIRED); 3272 printf("WARNING: %s: unable to allocate LDT selector\n", 3273 __func__); 3274 return; 3275 } 3276 } else { 3277 /* Wasn't anything there after all. */ 3278 new_ldt = NULL; 3279 sel = -1; 3280 mutex_enter(&cpu_lock); 3281 } 3282 3283 /* 3284 * Now that we have cpu_lock, ensure the LDT status is the same. 3285 */ 3286 if (pmap1->pm_ldt != NULL) { 3287 if (new_ldt == NULL) { 3288 /* A wild LDT just appeared. */ 3289 mutex_exit(&cpu_lock); 3290 goto retry; 3291 } 3292 3293 /* Copy the LDT data and install it in pmap2 */ 3294 memcpy(new_ldt, pmap1->pm_ldt, MAX_USERLDT_SIZE); 3295 pmap2->pm_ldt = new_ldt; 3296 pmap2->pm_ldt_sel = sel; 3297 mutex_exit(&cpu_lock); 3298 } else { 3299 if (new_ldt != NULL) { 3300 /* The LDT disappeared, drop what we did. */ 3301 ldt_free(sel); 3302 mutex_exit(&cpu_lock); 3303 uvm_km_free(kernel_map, (vaddr_t)new_ldt, 3304 MAX_USERLDT_SIZE, UVM_KMF_WIRED); 3305 return; 3306 } 3307 3308 /* We're good, just leave. */ 3309 mutex_exit(&cpu_lock); 3310 } 3311 #endif /* USER_LDT */ 3312 } 3313 #endif /* PMAP_FORK */ 3314 3315 #ifdef USER_LDT 3316 3317 /* 3318 * pmap_ldt_xcall: cross call used by pmap_ldt_sync. if the named pmap 3319 * is active, reload LDTR. 3320 */ 3321 static void 3322 pmap_ldt_xcall(void *arg1, void *arg2) 3323 { 3324 struct pmap *pm; 3325 3326 kpreempt_disable(); 3327 pm = arg1; 3328 if (curcpu()->ci_pmap == pm) { 3329 #if defined(SVS) 3330 if (svs_enabled) { 3331 svs_ldt_sync(pm); 3332 } else 3333 #endif 3334 lldt(pm->pm_ldt_sel); 3335 } 3336 kpreempt_enable(); 3337 } 3338 3339 /* 3340 * pmap_ldt_sync: LDT selector for the named pmap is changing. 
swap 3341 * in the new selector on all CPUs. 3342 */ 3343 void 3344 pmap_ldt_sync(struct pmap *pm) 3345 { 3346 uint64_t where; 3347 3348 KASSERT(mutex_owned(&cpu_lock)); 3349 3350 pmap_ldt_evcnt.ev_count++; 3351 where = xc_broadcast(0, pmap_ldt_xcall, pm, NULL); 3352 xc_wait(where); 3353 } 3354 3355 /* 3356 * pmap_ldt_cleanup: if the pmap has a local LDT, deallocate it, and 3357 * restore the default. 3358 */ 3359 void 3360 pmap_ldt_cleanup(struct lwp *l) 3361 { 3362 pmap_t pmap = l->l_proc->p_vmspace->vm_map.pmap; 3363 union descriptor *ldt; 3364 int sel; 3365 3366 if (__predict_true(pmap->pm_ldt == NULL)) { 3367 return; 3368 } 3369 3370 mutex_enter(&cpu_lock); 3371 if (pmap->pm_ldt != NULL) { 3372 sel = pmap->pm_ldt_sel; 3373 ldt = pmap->pm_ldt; 3374 pmap->pm_ldt_sel = GSYSSEL(GLDT_SEL, SEL_KPL); 3375 pmap->pm_ldt = NULL; 3376 pmap_ldt_sync(pmap); 3377 ldt_free(sel); 3378 uvm_km_free(kernel_map, (vaddr_t)ldt, MAX_USERLDT_SIZE, 3379 UVM_KMF_WIRED); 3380 } 3381 mutex_exit(&cpu_lock); 3382 } 3383 #endif /* USER_LDT */ 3384 3385 /* 3386 * pmap_activate: activate a process' pmap 3387 * 3388 * => must be called with kernel preemption disabled 3389 * => if lwp is the curlwp, then set ci_want_pmapload so that 3390 * actual MMU context switch will be done by pmap_load() later 3391 */ 3392 void 3393 pmap_activate(struct lwp *l) 3394 { 3395 struct cpu_info *ci; 3396 struct pmap *pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map); 3397 3398 KASSERT(kpreempt_disabled()); 3399 3400 ci = curcpu(); 3401 3402 if (l != ci->ci_curlwp) 3403 return; 3404 3405 KASSERT(ci->ci_want_pmapload == 0); 3406 KASSERT(ci->ci_tlbstate != TLBSTATE_VALID); 3407 3408 /* 3409 * no need to switch to kernel vmspace because 3410 * it's a subset of any vmspace. 3411 */ 3412 3413 if (pmap == pmap_kernel()) { 3414 ci->ci_want_pmapload = 0; 3415 return; 3416 } 3417 3418 ci->ci_want_pmapload = 1; 3419 } 3420 3421 #if defined(XENPV) && defined(__x86_64__) 3422 #define KASSERT_PDIRPA(pmap) \ 3423 KASSERT(pmap_pdirpa(pmap, 0) == ci->ci_xen_current_user_pgd || \ 3424 pmap == pmap_kernel()) 3425 #elif defined(PAE) 3426 #define KASSERT_PDIRPA(pmap) \ 3427 KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(ci->ci_pae_l3_pdir[0])) 3428 #elif !defined(XENPV) 3429 #define KASSERT_PDIRPA(pmap) \ 3430 KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(rcr3())) 3431 #else 3432 #define KASSERT_PDIRPA(pmap) KASSERT(true) /* nothing to do */ 3433 #endif 3434 3435 /* 3436 * pmap_reactivate: try to regain reference to the pmap. 3437 * 3438 * => Must be called with kernel preemption disabled. 3439 */ 3440 static void 3441 pmap_reactivate(struct pmap *pmap) 3442 { 3443 struct cpu_info * const ci = curcpu(); 3444 const cpuid_t cid = cpu_index(ci); 3445 3446 KASSERT(kpreempt_disabled()); 3447 KASSERT_PDIRPA(pmap); 3448 3449 /* 3450 * If we still have a lazy reference to this pmap, we can assume 3451 * that there was no TLB shootdown for this pmap in the meantime. 3452 * 3453 * The order of events here is important as we must synchronize 3454 * with TLB shootdown interrupts. Declare interest in invalidations 3455 * (TLBSTATE_VALID) and then check the CPU set, which the IPIs can 3456 * change only when the state is TLBSTATE_LAZY. 3457 */ 3458 3459 ci->ci_tlbstate = TLBSTATE_VALID; 3460 KASSERT(kcpuset_isset(pmap->pm_kernel_cpus, cid)); 3461 3462 if (__predict_true(kcpuset_isset(pmap->pm_cpus, cid))) { 3463 /* We have the reference, state is valid. */ 3464 } else { 3465 /* 3466 * Must reload the TLB, pmap has been changed during 3467 * deactivated. 
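 * While this CPU sat in TLBSTATE_LAZY the shootdown code may have
 * dropped it from pm_cpus instead of interrupting it, so rejoin
 * pm_cpus and flush the whole TLB before touching userspace again.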
3468 */ 3469 kcpuset_atomic_set(pmap->pm_cpus, cid); 3470 3471 tlbflush(); 3472 } 3473 } 3474 3475 /* 3476 * pmap_load: perform the actual pmap switch, i.e. fill in %cr3 register 3477 * and relevant LDT info. 3478 * 3479 * Ensures that the current process' pmap is loaded on the current CPU's 3480 * MMU and that there are no stale TLB entries. 3481 * 3482 * => The caller should disable kernel preemption or do check-and-retry 3483 * to prevent a preemption from undoing our efforts. 3484 * => This function may block. 3485 */ 3486 void 3487 pmap_load(void) 3488 { 3489 struct cpu_info *ci; 3490 struct pmap *pmap, *oldpmap; 3491 struct lwp *l; 3492 uint64_t ncsw; 3493 3494 kpreempt_disable(); 3495 retry: 3496 ci = curcpu(); 3497 if (!ci->ci_want_pmapload) { 3498 kpreempt_enable(); 3499 return; 3500 } 3501 l = ci->ci_curlwp; 3502 ncsw = l->l_ncsw; 3503 __insn_barrier(); 3504 3505 /* should be able to take ipis. */ 3506 KASSERT(ci->ci_ilevel < IPL_HIGH); 3507 #ifdef XENPV 3508 /* Check to see if interrupts are enabled (ie; no events are masked) */ 3509 KASSERT(x86_read_psl() == 0); 3510 #else 3511 KASSERT((x86_read_psl() & PSL_I) != 0); 3512 #endif 3513 3514 KASSERT(l != NULL); 3515 pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map); 3516 KASSERT(pmap != pmap_kernel()); 3517 oldpmap = ci->ci_pmap; 3518 3519 if (pmap == oldpmap) { 3520 pmap_reactivate(pmap); 3521 ci->ci_want_pmapload = 0; 3522 kpreempt_enable(); 3523 return; 3524 } 3525 3526 /* 3527 * Acquire a reference to the new pmap and perform the switch. 3528 */ 3529 3530 pmap_reference(pmap); 3531 pmap_load1(l, pmap, oldpmap); 3532 ci->ci_want_pmapload = 0; 3533 3534 /* 3535 * we're now running with the new pmap. drop the reference 3536 * to the old pmap. if we block, we need to go around again. 3537 */ 3538 3539 pmap_destroy(oldpmap); 3540 __insn_barrier(); 3541 if (l->l_ncsw != ncsw) { 3542 goto retry; 3543 } 3544 3545 kpreempt_enable(); 3546 } 3547 3548 /* 3549 * pmap_load1: the guts of pmap load, shared by pmap_map_ptes() and 3550 * pmap_load(). It's critically important that this function does not 3551 * block. 3552 */ 3553 static void 3554 pmap_load1(struct lwp *l, struct pmap *pmap, struct pmap *oldpmap) 3555 { 3556 struct cpu_info *ci; 3557 struct pcb *pcb; 3558 cpuid_t cid; 3559 3560 KASSERT(kpreempt_disabled()); 3561 3562 pcb = lwp_getpcb(l); 3563 ci = l->l_cpu; 3564 cid = cpu_index(ci); 3565 3566 kcpuset_atomic_clear(oldpmap->pm_cpus, cid); 3567 kcpuset_atomic_clear(oldpmap->pm_kernel_cpus, cid); 3568 3569 KASSERT_PDIRPA(oldpmap); 3570 KASSERT(!kcpuset_isset(pmap->pm_cpus, cid)); 3571 KASSERT(!kcpuset_isset(pmap->pm_kernel_cpus, cid)); 3572 3573 /* 3574 * Mark the pmap in use by this CPU. Again, we must synchronize 3575 * with TLB shootdown interrupts, so set the state VALID first, 3576 * then register us for shootdown events on this pmap. 3577 */ 3578 ci->ci_tlbstate = TLBSTATE_VALID; 3579 kcpuset_atomic_set(pmap->pm_cpus, cid); 3580 kcpuset_atomic_set(pmap->pm_kernel_cpus, cid); 3581 ci->ci_pmap = pmap; 3582 3583 /* 3584 * update tss. now that we have registered for invalidations 3585 * from other CPUs, we're good to load the page tables. 
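 * The new %cr3 value is only stashed in the PCB here (and, for i386
 * without XENPV, in the TSS); the actual page-table switch is left
 * to cpu_load_pmap() below.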
3586 */ 3587 #ifdef PAE 3588 pcb->pcb_cr3 = ci->ci_pae_l3_pdirpa; 3589 #else 3590 pcb->pcb_cr3 = pmap_pdirpa(pmap, 0); 3591 #endif 3592 3593 #ifdef i386 3594 #ifndef XENPV 3595 ci->ci_tss->tss.tss_ldt = pmap->pm_ldt_sel; 3596 ci->ci_tss->tss.tss_cr3 = pcb->pcb_cr3; 3597 #endif 3598 #endif 3599 3600 #if defined(SVS) && defined(USER_LDT) 3601 if (svs_enabled) { 3602 svs_ldt_sync(pmap); 3603 } else 3604 #endif 3605 lldt(pmap->pm_ldt_sel); 3606 3607 cpu_load_pmap(pmap, oldpmap); 3608 } 3609 3610 /* 3611 * pmap_deactivate: deactivate a process' pmap. 3612 * 3613 * => Must be called with kernel preemption disabled (high IPL is enough). 3614 */ 3615 void 3616 pmap_deactivate(struct lwp *l) 3617 { 3618 struct pmap *pmap; 3619 struct cpu_info *ci; 3620 3621 KASSERT(kpreempt_disabled()); 3622 3623 if (l != curlwp) { 3624 return; 3625 } 3626 3627 /* 3628 * Wait for pending TLB shootdowns to complete. Necessary because 3629 * TLB shootdown state is per-CPU, and the LWP may be coming off 3630 * the CPU before it has a chance to call pmap_update(), e.g. due 3631 * to kernel preemption or blocking routine in between. 3632 */ 3633 pmap_tlb_shootnow(); 3634 3635 ci = curcpu(); 3636 3637 if (ci->ci_want_pmapload) { 3638 /* 3639 * ci_want_pmapload means that our pmap is not loaded on 3640 * the CPU or TLB might be stale. note that pmap_kernel() 3641 * is always considered loaded. 3642 */ 3643 KASSERT(vm_map_pmap(&l->l_proc->p_vmspace->vm_map) 3644 != pmap_kernel()); 3645 KASSERT(vm_map_pmap(&l->l_proc->p_vmspace->vm_map) 3646 != ci->ci_pmap || ci->ci_tlbstate != TLBSTATE_VALID); 3647 3648 /* 3649 * userspace has not been touched. 3650 * nothing to do here. 3651 */ 3652 3653 ci->ci_want_pmapload = 0; 3654 return; 3655 } 3656 3657 pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map); 3658 3659 if (pmap == pmap_kernel()) { 3660 return; 3661 } 3662 3663 KASSERT_PDIRPA(pmap); 3664 KASSERT(ci->ci_pmap == pmap); 3665 3666 /* 3667 * we aren't interested in TLB invalidations for this pmap, 3668 * at least for the time being. 3669 */ 3670 3671 KASSERT(ci->ci_tlbstate == TLBSTATE_VALID); 3672 ci->ci_tlbstate = TLBSTATE_LAZY; 3673 } 3674 3675 /* 3676 * some misc. 
functions 3677 */ 3678 3679 bool 3680 pmap_pdes_valid(vaddr_t va, pd_entry_t * const *pdes, pd_entry_t *lastpde, 3681 int *lastlvl) 3682 { 3683 unsigned long index; 3684 pd_entry_t pde; 3685 int i; 3686 3687 for (i = PTP_LEVELS; i > 1; i--) { 3688 index = pl_i(va, i); 3689 pde = pdes[i - 2][index]; 3690 if ((pde & PTE_P) == 0) { 3691 *lastlvl = i; 3692 return false; 3693 } 3694 if (pde & PTE_PS) 3695 break; 3696 } 3697 if (lastpde != NULL) 3698 *lastpde = pde; 3699 *lastlvl = i; 3700 return true; 3701 } 3702 3703 /* 3704 * pmap_extract: extract a PA for the given VA 3705 */ 3706 bool 3707 pmap_extract(struct pmap *pmap, vaddr_t va, paddr_t *pap) 3708 { 3709 pt_entry_t *ptes, pte; 3710 pd_entry_t pde; 3711 pd_entry_t * const *pdes; 3712 struct pmap *pmap2; 3713 paddr_t pa; 3714 bool rv; 3715 int lvl; 3716 3717 if (__predict_false(pmap->pm_extract != NULL)) { 3718 return (*pmap->pm_extract)(pmap, va, pap); 3719 } 3720 3721 #ifdef __HAVE_DIRECT_MAP 3722 if (va >= PMAP_DIRECT_BASE && va < PMAP_DIRECT_END) { 3723 if (pap != NULL) { 3724 *pap = PMAP_DIRECT_UNMAP(va); 3725 } 3726 return true; 3727 } 3728 #endif 3729 3730 rv = false; 3731 pa = 0; 3732 3733 if (pmap != pmap_kernel()) { 3734 mutex_enter(&pmap->pm_lock); 3735 } 3736 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); 3737 if (pmap_pdes_valid(va, pdes, &pde, &lvl)) { 3738 if (lvl == 2) { 3739 pa = (pde & PTE_LGFRAME) | (va & (NBPD_L2 - 1)); 3740 rv = true; 3741 } else { 3742 KASSERT(lvl == 1); 3743 pte = ptes[pl1_i(va)]; 3744 if (__predict_true((pte & PTE_P) != 0)) { 3745 pa = pmap_pte2pa(pte) | (va & (NBPD_L1 - 1)); 3746 rv = true; 3747 } 3748 } 3749 } 3750 pmap_unmap_ptes(pmap, pmap2); 3751 if (pmap != pmap_kernel()) { 3752 mutex_exit(&pmap->pm_lock); 3753 } 3754 if (pap != NULL) { 3755 *pap = pa; 3756 } 3757 3758 return rv; 3759 } 3760 3761 /* 3762 * vtophys: virtual address to physical address. For use by 3763 * machine-dependent code only. 3764 */ 3765 paddr_t 3766 vtophys(vaddr_t va) 3767 { 3768 paddr_t pa; 3769 3770 if (pmap_extract(pmap_kernel(), va, &pa) == true) 3771 return pa; 3772 return 0; 3773 } 3774 3775 __strict_weak_alias(pmap_extract_ma, pmap_extract); 3776 3777 #ifdef XENPV 3778 /* 3779 * vtomach: virtual address to machine address. For use by 3780 * machine-dependent code only. 3781 */ 3782 paddr_t 3783 vtomach(vaddr_t va) 3784 { 3785 paddr_t pa; 3786 3787 if (pmap_extract_ma(pmap_kernel(), va, &pa) == true) 3788 return pa; 3789 return 0; 3790 } 3791 #endif 3792 3793 /* 3794 * pmap_virtual_space: used during bootup [pmap_steal_memory] to 3795 * determine the bounds of the kernel virtual addess space. 
3796 */ 3797 void 3798 pmap_virtual_space(vaddr_t *startp, vaddr_t *endp) 3799 { 3800 *startp = virtual_avail; 3801 *endp = virtual_end; 3802 } 3803 3804 void 3805 pmap_zero_page(paddr_t pa) 3806 { 3807 #if defined(__HAVE_DIRECT_MAP) 3808 memset((void *)PMAP_DIRECT_MAP(pa), 0, PAGE_SIZE); 3809 #else 3810 #if defined(XENPV) 3811 if (XEN_VERSION_SUPPORTED(3, 4)) 3812 xen_pagezero(pa); 3813 #endif 3814 struct cpu_info *ci; 3815 pt_entry_t *zpte; 3816 vaddr_t zerova; 3817 3818 const pd_entry_t pteflags = PTE_P | PTE_W | pmap_pg_nx | PTE_D | PTE_A; 3819 3820 kpreempt_disable(); 3821 3822 ci = curcpu(); 3823 zerova = ci->vpage[VPAGE_ZER]; 3824 zpte = ci->vpage_pte[VPAGE_ZER]; 3825 3826 KASSERTMSG(!*zpte, "pmap_zero_page: lock botch"); 3827 3828 pmap_pte_set(zpte, pmap_pa2pte(pa) | pteflags); 3829 pmap_pte_flush(); 3830 pmap_update_pg(zerova); /* flush TLB */ 3831 3832 memset((void *)zerova, 0, PAGE_SIZE); 3833 3834 #if defined(DIAGNOSTIC) || defined(XENPV) 3835 pmap_pte_set(zpte, 0); /* zap ! */ 3836 pmap_pte_flush(); 3837 #endif 3838 3839 kpreempt_enable(); 3840 #endif /* defined(__HAVE_DIRECT_MAP) */ 3841 } 3842 3843 void 3844 pmap_copy_page(paddr_t srcpa, paddr_t dstpa) 3845 { 3846 #if defined(__HAVE_DIRECT_MAP) 3847 vaddr_t srcva = PMAP_DIRECT_MAP(srcpa); 3848 vaddr_t dstva = PMAP_DIRECT_MAP(dstpa); 3849 3850 memcpy((void *)dstva, (void *)srcva, PAGE_SIZE); 3851 #else 3852 #if defined(XENPV) 3853 if (XEN_VERSION_SUPPORTED(3, 4)) { 3854 xen_copy_page(srcpa, dstpa); 3855 return; 3856 } 3857 #endif 3858 struct cpu_info *ci; 3859 pt_entry_t *srcpte, *dstpte; 3860 vaddr_t srcva, dstva; 3861 3862 const pd_entry_t pteflags = PTE_P | PTE_W | pmap_pg_nx | PTE_A; 3863 3864 kpreempt_disable(); 3865 3866 ci = curcpu(); 3867 srcva = ci->vpage[VPAGE_SRC]; 3868 dstva = ci->vpage[VPAGE_DST]; 3869 srcpte = ci->vpage_pte[VPAGE_SRC]; 3870 dstpte = ci->vpage_pte[VPAGE_DST]; 3871 3872 KASSERT(*srcpte == 0 && *dstpte == 0); 3873 3874 pmap_pte_set(srcpte, pmap_pa2pte(srcpa) | pteflags); 3875 pmap_pte_set(dstpte, pmap_pa2pte(dstpa) | pteflags | PTE_D); 3876 pmap_pte_flush(); 3877 pmap_update_pg(srcva); 3878 pmap_update_pg(dstva); 3879 3880 memcpy((void *)dstva, (void *)srcva, PAGE_SIZE); 3881 3882 #if defined(DIAGNOSTIC) || defined(XENPV) 3883 pmap_pte_set(srcpte, 0); 3884 pmap_pte_set(dstpte, 0); 3885 pmap_pte_flush(); 3886 #endif 3887 3888 kpreempt_enable(); 3889 #endif /* defined(__HAVE_DIRECT_MAP) */ 3890 } 3891 3892 static pt_entry_t * 3893 pmap_map_ptp(struct vm_page *ptp) 3894 { 3895 #ifdef __HAVE_DIRECT_MAP 3896 return (void *)PMAP_DIRECT_MAP(VM_PAGE_TO_PHYS(ptp)); 3897 #else 3898 struct cpu_info *ci; 3899 pt_entry_t *ptppte; 3900 vaddr_t ptpva; 3901 3902 KASSERT(kpreempt_disabled()); 3903 3904 #ifndef XENPV 3905 const pd_entry_t pteflags = PTE_P | PTE_W | pmap_pg_nx | PTE_A | PTE_D; 3906 #else 3907 const pd_entry_t pteflags = PTE_P | pmap_pg_nx | PTE_A | PTE_D; 3908 #endif 3909 3910 ci = curcpu(); 3911 ptpva = ci->vpage[VPAGE_PTP]; 3912 ptppte = ci->vpage_pte[VPAGE_PTP]; 3913 3914 pmap_pte_set(ptppte, pmap_pa2pte(VM_PAGE_TO_PHYS(ptp)) | pteflags); 3915 3916 pmap_pte_flush(); 3917 pmap_update_pg(ptpva); 3918 3919 return (pt_entry_t *)ptpva; 3920 #endif 3921 } 3922 3923 static void 3924 pmap_unmap_ptp(void) 3925 { 3926 #ifndef __HAVE_DIRECT_MAP 3927 #if defined(DIAGNOSTIC) || defined(XENPV) 3928 struct cpu_info *ci; 3929 pt_entry_t *pte; 3930 3931 KASSERT(kpreempt_disabled()); 3932 3933 ci = curcpu(); 3934 pte = ci->vpage_pte[VPAGE_PTP]; 3935 3936 if (*pte != 0) { 3937 pmap_pte_set(pte, 0); 3938 
pmap_pte_flush(); 3939 } 3940 #endif 3941 #endif 3942 } 3943 3944 static pt_entry_t * 3945 pmap_map_pte(struct pmap *pmap, struct vm_page *ptp, vaddr_t va) 3946 { 3947 3948 KASSERT(kpreempt_disabled()); 3949 if (pmap_is_curpmap(pmap)) { 3950 return &PTE_BASE[pl1_i(va)]; /* (k)vtopte */ 3951 } 3952 KASSERT(ptp != NULL); 3953 return pmap_map_ptp(ptp) + pl1_pi(va); 3954 } 3955 3956 static void 3957 pmap_unmap_pte(void) 3958 { 3959 3960 KASSERT(kpreempt_disabled()); 3961 3962 pmap_unmap_ptp(); 3963 } 3964 3965 /* 3966 * p m a p r e m o v e f u n c t i o n s 3967 * 3968 * functions that remove mappings 3969 */ 3970 3971 /* 3972 * pmap_remove_ptes: remove PTEs from a PTP 3973 * 3974 * => caller must hold pmap's lock 3975 * => PTP must be mapped into KVA 3976 * => PTP should be null if pmap == pmap_kernel() 3977 * => must be called with kernel preemption disabled 3978 * => returns composite pte if at least one page should be shot down 3979 */ 3980 static void 3981 pmap_remove_ptes(struct pmap *pmap, struct vm_page *ptp, vaddr_t ptpva, 3982 vaddr_t startva, vaddr_t endva) 3983 { 3984 pt_entry_t *pte = (pt_entry_t *)ptpva; 3985 3986 KASSERT(mutex_owned(&pmap->pm_lock)); 3987 KASSERT(kpreempt_disabled()); 3988 3989 /* 3990 * mappings are very often sparse, so clip the given range to the 3991 * range of PTEs that are known present in the PTP. 3992 */ 3993 pmap_ptp_range_clip(ptp, &startva, &pte); 3994 3995 /* 3996 * note that ptpva points to the PTE that maps startva. this may 3997 * or may not be the first PTE in the PTP. 3998 * 3999 * we loop through the PTP while there are still PTEs to look at 4000 * and the wire_count is greater than 1 (because we use the wire_count 4001 * to keep track of the number of real PTEs in the PTP). 4002 */ 4003 while (startva < endva && (ptp == NULL || ptp->wire_count > 1)) { 4004 (void)pmap_remove_pte(pmap, ptp, pte, startva); 4005 startva += PAGE_SIZE; 4006 pte++; 4007 } 4008 } 4009 4010 /* 4011 * pmap_remove_pte: remove a single PTE from a PTP. 4012 * 4013 * => caller must hold pmap's lock 4014 * => PTP must be mapped into KVA 4015 * => PTP should be null if pmap == pmap_kernel() 4016 * => returns true if we removed a mapping 4017 * => must be called with kernel preemption disabled 4018 */ 4019 static bool 4020 pmap_remove_pte(struct pmap *pmap, struct vm_page *ptp, pt_entry_t *pte, 4021 vaddr_t va) 4022 { 4023 struct pv_entry *pve; 4024 struct vm_page *pg; 4025 struct pmap_page *pp; 4026 pt_entry_t opte; 4027 4028 KASSERT(mutex_owned(&pmap->pm_lock)); 4029 KASSERT(kpreempt_disabled()); 4030 4031 if (!pmap_valid_entry(*pte)) { 4032 /* VA not mapped. */ 4033 return false; 4034 } 4035 4036 /* Atomically save the old PTE and zap it. */ 4037 opte = pmap_pte_testset(pte, 0); 4038 if (!pmap_valid_entry(opte)) { 4039 return false; 4040 } 4041 4042 pmap_exec_account(pmap, va, opte, 0); 4043 pmap_stats_update_bypte(pmap, 0, opte); 4044 4045 if (ptp) { 4046 /* 4047 * Dropping a PTE. Make sure that the PDE is flushed. 4048 */ 4049 ptp->wire_count--; 4050 if (ptp->wire_count <= 1) { 4051 opte |= PTE_A; 4052 } 4053 } 4054 4055 if ((opte & PTE_A) != 0) { 4056 pmap_tlb_shootdown(pmap, va, opte, TLBSHOOT_REMOVE_PTE); 4057 } 4058 4059 /* 4060 * If we are not on a pv list - we are done. 
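 * Unmanaged mappings carry no pmap_page state, so there is nothing
 * further to sync; the asserts below merely check that the page
 * really is neither managed nor pv-tracked.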
4061 */ 4062 if ((opte & PTE_PVLIST) == 0) { 4063 #ifndef DOM0OPS 4064 KASSERTMSG((PHYS_TO_VM_PAGE(pmap_pte2pa(opte)) == NULL), 4065 "managed page without PTE_PVLIST for %#"PRIxVADDR, va); 4066 KASSERTMSG((pmap_pv_tracked(pmap_pte2pa(opte)) == NULL), 4067 "pv-tracked page without PTE_PVLIST for %#"PRIxVADDR, va); 4068 #endif 4069 KASSERT(pmap_treelookup_pv(pmap, ptp, (ptp != NULL ? 4070 &VM_PAGE_TO_PP(ptp)->pp_rb : &pmap_kernel_rb), va) == NULL); 4071 return true; 4072 } 4073 4074 if ((pg = PHYS_TO_VM_PAGE(pmap_pte2pa(opte))) != NULL) { 4075 pp = VM_PAGE_TO_PP(pg); 4076 } else if ((pp = pmap_pv_tracked(pmap_pte2pa(opte))) == NULL) { 4077 paddr_t pa = pmap_pte2pa(opte); 4078 panic("%s: PTE_PVLIST with pv-untracked page" 4079 " va = %#"PRIxVADDR"pa = %#"PRIxPADDR" (%#"PRIxPADDR")", 4080 __func__, va, pa, atop(pa)); 4081 } 4082 4083 /* Sync R/M bits. */ 4084 pve = pmap_lookup_pv(pmap, ptp, pp, va); 4085 pmap_remove_pv(pmap, pp, ptp, va, pve, pmap_pte_to_pp_attrs(opte)); 4086 return true; 4087 } 4088 4089 static void 4090 pmap_remove_locked(struct pmap *pmap, vaddr_t sva, vaddr_t eva) 4091 { 4092 pt_entry_t *ptes; 4093 pd_entry_t pde; 4094 pd_entry_t * const *pdes; 4095 bool result; 4096 vaddr_t blkendva, va = sva; 4097 struct vm_page *ptp; 4098 struct pmap *pmap2; 4099 int lvl; 4100 4101 KASSERT(mutex_owned(&pmap->pm_lock)); 4102 4103 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); 4104 4105 /* 4106 * removing one page? take shortcut function. 4107 */ 4108 4109 if (va + PAGE_SIZE == eva) { 4110 if (pmap_pdes_valid(va, pdes, &pde, &lvl)) { 4111 KASSERT(lvl == 1); 4112 4113 /* Get PTP if non-kernel mapping. */ 4114 if (pmap != pmap_kernel()) { 4115 ptp = pmap_find_ptp(pmap, va, 1); 4116 KASSERTMSG(ptp != NULL, 4117 "%s: unmanaged PTP detected", __func__); 4118 } else { 4119 /* Never free kernel PTPs. */ 4120 ptp = NULL; 4121 } 4122 4123 result = pmap_remove_pte(pmap, ptp, 4124 &ptes[pl1_i(va)], va); 4125 4126 /* 4127 * if mapping removed and the PTP is no longer 4128 * being used, free it! 4129 */ 4130 4131 if (result && ptp && ptp->wire_count <= 1) 4132 pmap_free_ptp(pmap, ptp, va, ptes, pdes); 4133 } 4134 } else for (/* null */ ; va < eva ; va = blkendva) { 4135 /* determine range of block */ 4136 blkendva = x86_round_pdr(va+1); 4137 if (blkendva > eva) 4138 blkendva = eva; 4139 4140 if (!pmap_pdes_valid(va, pdes, &pde, &lvl)) { 4141 /* Skip a range corresponding to an invalid pde. */ 4142 blkendva = (va & ptp_frames[lvl - 1]) + nbpd[lvl - 1]; 4143 continue; 4144 } 4145 KASSERT(lvl == 1); 4146 4147 /* Get PTP if non-kernel mapping. */ 4148 if (pmap != pmap_kernel()) { 4149 ptp = pmap_find_ptp(pmap, va, 1); 4150 KASSERTMSG(ptp != NULL, "%s: unmanaged PTP detected", 4151 __func__); 4152 } else { 4153 /* Never free kernel PTPs. */ 4154 ptp = NULL; 4155 } 4156 4157 pmap_remove_ptes(pmap, ptp, (vaddr_t)&ptes[pl1_i(va)], va, 4158 blkendva); 4159 4160 /* If PTP is no longer being used, free it. */ 4161 if (ptp && ptp->wire_count <= 1) { 4162 pmap_free_ptp(pmap, ptp, va, ptes, pdes); 4163 } 4164 } 4165 pmap_unmap_ptes(pmap, pmap2); 4166 pmap_drain_pv(pmap); 4167 } 4168 4169 /* 4170 * pmap_remove: mapping removal function. 
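* (removes every mapping in the range [sva, eva); the actual work is done
* by pmap_remove_locked() with the pmap locked, or by a pm_remove override)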
4171 * 4172 * => caller should not be holding any pmap locks 4173 */ 4174 void 4175 pmap_remove(struct pmap *pmap, vaddr_t sva, vaddr_t eva) 4176 { 4177 if (__predict_false(pmap->pm_remove != NULL)) { 4178 (*pmap->pm_remove)(pmap, sva, eva); 4179 return; 4180 } 4181 4182 mutex_enter(&pmap->pm_lock); 4183 pmap_remove_locked(pmap, sva, eva); 4184 mutex_exit(&pmap->pm_lock); 4185 } 4186 4187 /* 4188 * pmap_sync_pv: clear pte bits and return the old value of the pp_attrs. 4189 * 4190 * => The 'clearbits' parameter is either ~0 or PP_ATTRS_... 4191 * => Caller should disable kernel preemption. 4192 * => issues tlb shootdowns if necessary. 4193 */ 4194 static int 4195 pmap_sync_pv(struct pv_pte *pvpte, paddr_t pa, int clearbits, uint8_t *oattrs, 4196 pt_entry_t *optep) 4197 { 4198 struct pmap *pmap; 4199 struct vm_page *ptp; 4200 vaddr_t va; 4201 pt_entry_t *ptep; 4202 pt_entry_t opte; 4203 pt_entry_t npte; 4204 pt_entry_t expect; 4205 bool need_shootdown; 4206 4207 ptp = pvpte->pte_ptp; 4208 va = pvpte->pte_va; 4209 KASSERT(ptp == NULL || ptp->uobject != NULL); 4210 KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset); 4211 pmap = ptp_to_pmap(ptp); 4212 KASSERT(kpreempt_disabled()); 4213 4214 if (__predict_false(pmap->pm_sync_pv != NULL)) { 4215 return (*pmap->pm_sync_pv)(ptp, va, pa, clearbits, oattrs, 4216 optep); 4217 } 4218 4219 expect = pmap_pa2pte(pa) | PTE_P; 4220 4221 if (clearbits != ~0) { 4222 KASSERT((clearbits & ~(PP_ATTRS_D|PP_ATTRS_A|PP_ATTRS_W)) == 0); 4223 clearbits = pmap_pp_attrs_to_pte(clearbits); 4224 } 4225 4226 ptep = pmap_map_pte(pmap, ptp, va); 4227 do { 4228 opte = *ptep; 4229 KASSERT((opte & (PTE_D | PTE_A)) != PTE_D); 4230 KASSERT((opte & (PTE_A | PTE_P)) != PTE_A); 4231 KASSERT(opte == 0 || (opte & PTE_P) != 0); 4232 if ((opte & (PTE_FRAME | PTE_P)) != expect) { 4233 /* 4234 * We lost a race with a V->P operation like 4235 * pmap_remove(). Wait for the competitor 4236 * reflecting pte bits into mp_attrs. 4237 */ 4238 pmap_unmap_pte(); 4239 return EAGAIN; 4240 } 4241 4242 /* 4243 * Check if there's anything to do on this PTE. 4244 */ 4245 if ((opte & clearbits) == 0) { 4246 need_shootdown = false; 4247 break; 4248 } 4249 4250 /* 4251 * We need a shootdown if the PTE is cached (PTE_A) ... 4252 * ... Unless we are clearing only the PTE_W bit and 4253 * it isn't cached as RW (PTE_D). 4254 */ 4255 need_shootdown = (opte & PTE_A) != 0 && 4256 !(clearbits == PTE_W && (opte & PTE_D) == 0); 4257 4258 npte = opte & ~clearbits; 4259 4260 /* 4261 * If we need a shootdown anyway, clear PTE_A and PTE_D. 
4262 */ 4263 if (need_shootdown) { 4264 npte &= ~(PTE_A | PTE_D); 4265 } 4266 KASSERT((npte & (PTE_D | PTE_A)) != PTE_D); 4267 KASSERT((npte & (PTE_A | PTE_P)) != PTE_A); 4268 KASSERT(npte == 0 || (opte & PTE_P) != 0); 4269 } while (pmap_pte_cas(ptep, opte, npte) != opte); 4270 4271 if (need_shootdown) { 4272 pmap_tlb_shootdown(pmap, va, opte, TLBSHOOT_SYNC_PV); 4273 } 4274 pmap_unmap_pte(); 4275 4276 *oattrs = pmap_pte_to_pp_attrs(opte); 4277 if (optep != NULL) 4278 *optep = opte; 4279 return 0; 4280 } 4281 4282 static void 4283 pmap_pp_remove_ent(struct pmap *pmap, struct vm_page *ptp, pt_entry_t opte, 4284 vaddr_t va) 4285 { 4286 struct pmap *pmap2; 4287 pt_entry_t *ptes; 4288 pd_entry_t * const *pdes; 4289 4290 KASSERT(mutex_owned(&pmap->pm_lock)); 4291 4292 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); 4293 pmap_stats_update_bypte(pmap, 0, opte); 4294 ptp->wire_count--; 4295 if (ptp->wire_count <= 1) { 4296 pmap_free_ptp(pmap, ptp, va, ptes, pdes); 4297 } 4298 pmap_unmap_ptes(pmap, pmap2); 4299 } 4300 4301 static void 4302 pmap_pp_remove(struct pmap_page *pp, paddr_t pa) 4303 { 4304 struct pv_pte *pvpte; 4305 struct vm_page *ptp; 4306 uintptr_t sum; 4307 uint8_t oattrs; 4308 bool locked; 4309 4310 /* 4311 * Do an unlocked check to see if the page has no mappings, eg when 4312 * pmap_remove_all() was called before amap_wipeout() for a process 4313 * private amap - common. The page being removed must be on the way 4314 * out, so we don't have to worry about concurrent attempts to enter 4315 * it (otherwise the caller either doesn't care or has screwed up). 4316 */ 4317 sum = (uintptr_t)atomic_load_relaxed(&pp->pp_pte.pte_va); 4318 sum |= (uintptr_t)atomic_load_relaxed(&pp->pp_pte.pte_ptp); 4319 sum |= (uintptr_t)atomic_load_relaxed(&pp->pp_pvlist.lh_first); 4320 if (sum == 0) { 4321 return; 4322 } 4323 4324 kpreempt_disable(); 4325 for (;;) { 4326 struct pmap *pmap; 4327 struct pv_entry *pve; 4328 pt_entry_t opte; 4329 vaddr_t va; 4330 4331 mutex_spin_enter(&pp->pp_lock); 4332 if ((pvpte = pv_pte_first(pp)) == NULL) { 4333 mutex_spin_exit(&pp->pp_lock); 4334 break; 4335 } 4336 4337 /* 4338 * Add a reference to the pmap before clearing the pte. 4339 * Otherwise the pmap can disappear behind us. 4340 */ 4341 ptp = pvpte->pte_ptp; 4342 pmap = ptp_to_pmap(ptp); 4343 KASSERT(pmap->pm_obj[0].uo_refs > 0); 4344 if (ptp != NULL) { 4345 pmap_reference(pmap); 4346 } 4347 4348 /* 4349 * Now try to lock it. We need a direct handoff between 4350 * pp_lock and pm_lock to know the pv_entry is kept intact 4351 * and kept associated with this pmap. If that can't be 4352 * had, wait for the pmap's lock to become free and then 4353 * retry. 4354 */ 4355 locked = mutex_tryenter(&pmap->pm_lock); 4356 mutex_spin_exit(&pp->pp_lock); 4357 if (!locked) { 4358 mutex_enter(&pmap->pm_lock); 4359 /* nothing, just wait for it */ 4360 mutex_exit(&pmap->pm_lock); 4361 if (ptp != NULL) { 4362 pmap_destroy(pmap); 4363 } 4364 continue; 4365 } 4366 va = pvpte->pte_va; 4367 4368 KASSERTMSG(pmap->pm_stats.resident_count > PDP_SIZE, 4369 "va %lx pmap %p ptp %p is empty", va, pmap, ptp); 4370 KASSERTMSG(ptp == NULL || (ptp->flags & PG_FREE) == 0, 4371 "va %lx pmap %p ptp %p is free", va, pmap, ptp); 4372 KASSERTMSG(ptp == NULL || ptp->wire_count > 1, 4373 "va %lx pmap %p ptp %p is empty", va, pmap, ptp); 4374 4375 #ifdef DEBUG 4376 pmap_check_pv(pmap, ptp, pp, pvpte->pte_va, true); 4377 rb_tree_t *tree = (ptp != NULL ? 
4378 &VM_PAGE_TO_PP(ptp)->pp_rb : &pmap_kernel_rb);
4379 pve = pmap_treelookup_pv(pmap, ptp, tree, va);
4380 if (pve == NULL) {
4381 KASSERTMSG(&pp->pp_pte == pvpte,
4382 "va %lx pmap %p ptp %p pvpte %p pve %p oops 1",
4383 va, pmap, ptp, pvpte, pve);
4384 } else {
4385 KASSERTMSG(&pve->pve_pte == pvpte,
4386 "va %lx pmap %p ptp %p pvpte %p pve %p oops 2",
4387 va, pmap, ptp, pvpte, pve);
4388 }
4389 #endif
4390
4391 if (pmap_sync_pv(pvpte, pa, ~0, &oattrs, &opte)) {
4392 panic("pmap_pp_remove: mapping not present");
4393 }
4394
4395 pve = pmap_lookup_pv(pmap, ptp, pp, va);
4396 pmap_remove_pv(pmap, pp, ptp, va, pve, oattrs);
4397
4398 /* Update the PTP reference count. Free if last reference. */
4399 if (ptp != NULL) {
4400 KASSERT(pmap != pmap_kernel());
4401 pmap_tlb_shootnow();
4402 if (__predict_false(pmap->pm_pp_remove_ent != NULL)) {
4403 (*pmap->pm_pp_remove_ent)(pmap, ptp, opte, va);
4404 } else {
4405 pmap_pp_remove_ent(pmap, ptp, opte, va);
4406 }
4407 } else {
4408 KASSERT(pmap == pmap_kernel());
4409 pmap_stats_update_bypte(pmap, 0, opte);
4410 }
4411 pmap_tlb_shootnow();
4412 pmap_drain_pv(pmap);
4413 mutex_exit(&pmap->pm_lock);
4414 if (ptp != NULL) {
4415 pmap_destroy(pmap);
4416 }
4417 }
4418 kpreempt_enable();
4419 }
4420
4421 /*
4422 * pmap_page_remove: remove a managed vm_page from all pmaps that map it
4423 *
4424 * => R/M bits are sync'd back to attrs
4425 */
4426 void
4427 pmap_page_remove(struct vm_page *pg)
4428 {
4429 struct pmap_page *pp;
4430 paddr_t pa;
4431
4432 pp = VM_PAGE_TO_PP(pg);
4433 pa = VM_PAGE_TO_PHYS(pg);
4434 pmap_pp_remove(pp, pa);
4435 }
4436
4437 /*
4438 * pmap_pv_remove: remove an unmanaged pv-tracked page from all pmaps
4439 * that map it
4440 */
4441 void
4442 pmap_pv_remove(paddr_t pa)
4443 {
4444 struct pmap_page *pp;
4445
4446 pp = pmap_pv_tracked(pa);
4447 if (pp == NULL)
4448 panic("%s: page not pv-tracked: %#"PRIxPADDR, __func__, pa);
4449 pmap_pp_remove(pp, pa);
4450 }
4451
4452 /*
4453 * p m a p a t t r i b u t e f u n c t i o n s
4454 * functions that test/change managed page's attributes
4455 * since a page can be mapped multiple times we must check each PTE that
4456 * maps it by going down the pv lists.
4457 */
4458
4459 /*
4460 * pmap_test_attrs: test a page's attributes
4461 */
4462 bool
4463 pmap_test_attrs(struct vm_page *pg, unsigned testbits)
4464 {
4465 struct pmap_page *pp;
4466 struct pv_pte *pvpte;
4467 struct pmap *pmap;
4468 uint8_t oattrs;
4469 u_int result;
4470 paddr_t pa;
4471
4472 pp = VM_PAGE_TO_PP(pg);
4473 if ((pp->pp_attrs & testbits) != 0) {
4474 return true;
4475 }
4476 pa = VM_PAGE_TO_PHYS(pg);
4477 startover:
4478 mutex_spin_enter(&pp->pp_lock);
4479 for (pvpte = pv_pte_first(pp); pvpte; pvpte = pv_pte_next(pp, pvpte)) {
4480 if ((pp->pp_attrs & testbits) != 0) {
4481 break;
4482 }
4483 if (pmap_sync_pv(pvpte, pa, 0, &oattrs, NULL)) {
4484 /*
4485 * raced with a V->P operation. wait for the other
4486 * side to finish by acquiring pmap's lock. if no
4487 * wait, updates to pp_attrs by the other side may
4488 * go unseen.
4489 */
4490 pmap = ptp_to_pmap(pvpte->pte_ptp);
4491 pmap_reference(pmap);
4492 mutex_spin_exit(&pp->pp_lock);
4493 mutex_enter(&pmap->pm_lock);
4494 /* nothing. */
4495 mutex_exit(&pmap->pm_lock);
4496 pmap_destroy(pmap);
4497 goto startover;
4498 }
4499 pp->pp_attrs |= oattrs;
4500 }
4501 result = pp->pp_attrs & testbits;
4502 mutex_spin_exit(&pp->pp_lock);
4503
4504 /*
4505 * note that we will exit the for loop with a non-null pvpte if
4506 * we have found the bits we are testing for.
4507 */
4508
4509 return result != 0;
4510 }
4511
4512 static bool
4513 pmap_pp_clear_attrs(struct pmap_page *pp, paddr_t pa, unsigned clearbits)
4514 {
4515 struct pv_pte *pvpte;
4516 struct pmap *pmap;
4517 uint8_t oattrs;
4518 u_int result;
4519
4520 startover:
4521 mutex_spin_enter(&pp->pp_lock);
4522 for (pvpte = pv_pte_first(pp); pvpte; pvpte = pv_pte_next(pp, pvpte)) {
4523 if (pmap_sync_pv(pvpte, pa, clearbits, &oattrs, NULL)) {
4524 /*
4525 * raced with a V->P operation. wait for the other
4526 * side to finish by acquiring pmap's lock. it is
4527 * probably unmapping the page, and it will be gone
4528 * when the loop is restarted.
4529 */
4530 pmap = ptp_to_pmap(pvpte->pte_ptp);
4531 pmap_reference(pmap);
4532 mutex_spin_exit(&pp->pp_lock);
4533 mutex_enter(&pmap->pm_lock);
4534 /* nothing. */
4535 mutex_exit(&pmap->pm_lock);
4536 pmap_destroy(pmap);
4537 goto startover;
4538 }
4539 pp->pp_attrs |= oattrs;
4540 }
4541 result = pp->pp_attrs & clearbits;
4542 pp->pp_attrs &= ~clearbits;
4543 pmap_tlb_shootnow();
4544 mutex_spin_exit(&pp->pp_lock);
4545
4546 return result != 0;
4547 }
4548
4549 /*
4550 * pmap_clear_attrs: clear the specified attribute for a page.
4551 *
4552 * => we return true if we cleared one of the bits we were asked to
4553 */
4554 bool
4555 pmap_clear_attrs(struct vm_page *pg, unsigned clearbits)
4556 {
4557 struct pmap_page *pp;
4558 paddr_t pa;
4559
4560 pp = VM_PAGE_TO_PP(pg);
4561 pa = VM_PAGE_TO_PHYS(pg);
4562
4563 /*
4564 * If this is a new page, assert it has no mappings and simply zap
4565 * the stored attributes without taking any locks.
4566 */
4567 if ((pg->flags & PG_FAKE) != 0) {
4568 KASSERT(atomic_load_relaxed(&pp->pp_pte.pte_va) == 0);
4569 KASSERT(atomic_load_relaxed(&pp->pp_pte.pte_ptp) == NULL);
4570 KASSERT(atomic_load_relaxed(&pp->pp_pvlist.lh_first) == NULL);
4571 atomic_store_relaxed(&pp->pp_attrs, 0);
4572 return false;
4573 } else {
4574 return pmap_pp_clear_attrs(pp, pa, clearbits);
4575 }
4576 }
4577
4578 /*
4579 * pmap_pv_clear_attrs: clear the specified attributes for an unmanaged
4580 * pv-tracked page.
4581 */
4582 bool
4583 pmap_pv_clear_attrs(paddr_t pa, unsigned clearbits)
4584 {
4585 struct pmap_page *pp;
4586
4587 pp = pmap_pv_tracked(pa);
4588 if (pp == NULL)
4589 panic("%s: page not pv-tracked: %#"PRIxPADDR, __func__, pa);
4590
4591 return pmap_pp_clear_attrs(pp, pa, clearbits);
4592 }
4593
4594 /*
4595 * p m a p p r o t e c t i o n f u n c t i o n s
4596 */
4597
4598 /*
4599 * pmap_page_protect: change the protection of all recorded mappings
4600 * of a managed page
4601 *
4602 * => NOTE: this is an inline function in pmap.h
4603 */
4604
4605 /* see pmap.h */
4606
4607 /*
4608 * pmap_pv_protect: change the protection of all recorded mappings
4609 * of an unmanaged pv-tracked page
4610 *
4611 * => NOTE: this is an inline function in pmap.h
4612 */
4613
4614 /* see pmap.h */
4615
4616 /*
4617 * pmap_protect: set the protection of the pages in a pmap
4618 *
4619 * => NOTE: this is an inline function in pmap.h
4620 */
4621
4622 /* see pmap.h */
4623
4624 /*
4625 * pmap_write_protect: write-protect pages in a pmap.
4626 *
4627 * Note for Xen-amd64. Xen automatically adds PTE_U to the kernel pages, but we
4628 * don't need to remove this bit when re-entering the PTEs here: Xen tracks the
4629 * kernel pages with a reserved bit (_PAGE_GUEST_KERNEL), so even if PTE_U is
4630 * present the page will still be considered as a kernel page, and the privilege
4631 * separation will be enforced correctly.
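* (Concretely, the loop below only ever clears PTE_W and/or sets the NX
* bit on each valid PTE; PTE_U itself is never touched here.)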
4632 */
4633 void
4634 pmap_write_protect(struct pmap *pmap, vaddr_t sva, vaddr_t eva, vm_prot_t prot)
4635 {
4636 pt_entry_t bit_rem, bit_put;
4637 pt_entry_t *ptes;
4638 pd_entry_t * const *pdes;
4639 struct pmap *pmap2;
4640 vaddr_t blockend, va;
4641 int lvl, i;
4642
4643 if (__predict_false(pmap->pm_write_protect != NULL)) {
4644 (*pmap->pm_write_protect)(pmap, sva, eva, prot);
4645 return;
4646 }
4647
4648 bit_rem = 0;
4649 if (!(prot & VM_PROT_WRITE))
4650 bit_rem = PTE_W;
4651
4652 bit_put = 0;
4653 if (!(prot & VM_PROT_EXECUTE))
4654 bit_put = pmap_pg_nx;
4655
4656 sva &= ~PAGE_MASK;
4657 eva &= ~PAGE_MASK;
4658
4659 /*
4660 * Acquire pmap. No need to lock the kernel pmap as we won't
4661 * be touching PV entries nor stats and kernel PDEs aren't
4662 * freed.
4663 */
4664 if (pmap != pmap_kernel()) {
4665 mutex_enter(&pmap->pm_lock);
4666 }
4667 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
4668
4669 for (va = sva ; va < eva; va = blockend) {
4670 pt_entry_t *spte, *epte;
4671
4672 blockend = x86_round_pdr(va + 1);
4673 if (blockend > eva)
4674 blockend = eva;
4675
4676 /* Is it a valid block? */
4677 if (!pmap_pdes_valid(va, pdes, NULL, &lvl)) {
4678 continue;
4679 }
4680 KASSERT(va < VM_MAXUSER_ADDRESS || va >= VM_MAX_ADDRESS);
4681 KASSERT(lvl == 1);
4682
4683 spte = &ptes[pl1_i(va)];
4684 epte = &ptes[pl1_i(blockend)];
4685
4686 for (i = 0; spte < epte; spte++, i++) {
4687 pt_entry_t opte, npte;
4688
4689 do {
4690 opte = *spte;
4691 if (!pmap_valid_entry(opte)) {
4692 goto next;
4693 }
4694 npte = (opte & ~bit_rem) | bit_put;
4695 } while (pmap_pte_cas(spte, opte, npte) != opte);
4696
4697 if ((opte & PTE_D) != 0) {
4698 vaddr_t tva = va + x86_ptob(i);
4699 pmap_tlb_shootdown(pmap, tva, opte,
4700 TLBSHOOT_WRITE_PROTECT);
4701 }
4702 next:;
4703 }
4704 }
4705
4706 /* Release pmap. */
4707 pmap_unmap_ptes(pmap, pmap2);
4708 if (pmap != pmap_kernel()) {
4709 mutex_exit(&pmap->pm_lock);
4710 }
4711 }
4712
4713 /*
4714 * pmap_unwire: clear the wired bit in the PTE.
4715 *
4716 * => Mapping should already be present.
4717 */
4718 void
4719 pmap_unwire(struct pmap *pmap, vaddr_t va)
4720 {
4721 pt_entry_t *ptes, *ptep, opte;
4722 pd_entry_t * const *pdes;
4723 struct pmap *pmap2;
4724 int lvl;
4725
4726 if (__predict_false(pmap->pm_unwire != NULL)) {
4727 (*pmap->pm_unwire)(pmap, va);
4728 return;
4729 }
4730
4731 /*
4732 * Acquire pmap. Need to lock the kernel pmap only to protect the
4733 * statistics.
4734 */
4735 mutex_enter(&pmap->pm_lock);
4736 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
4737
4738 if (!pmap_pdes_valid(va, pdes, NULL, &lvl)) {
4739 panic("%s: invalid PDE va=%#" PRIxVADDR, __func__, va);
4740 }
4741 KASSERT(lvl == 1);
4742
4743 ptep = &ptes[pl1_i(va)];
4744 opte = *ptep;
4745 KASSERT(pmap_valid_entry(opte));
4746
4747 if (opte & PTE_WIRED) {
4748 pt_entry_t npte = opte & ~PTE_WIRED;
4749
4750 opte = pmap_pte_testset(ptep, npte);
4751 pmap_stats_update_bypte(pmap, npte, opte);
4752 } else {
4753 printf("%s: wiring for pmap %p va %#" PRIxVADDR
4754 " did not change!\n", __func__, pmap, va);
4755 }
4756
4757 /* Release pmap.
*/ 4758 pmap_unmap_ptes(pmap, pmap2); 4759 mutex_exit(&pmap->pm_lock); 4760 } 4761 4762 /* 4763 * pmap_copy: copy mappings from one pmap to another 4764 * 4765 * => optional function 4766 * void pmap_copy(dst_pmap, src_pmap, dst_addr, len, src_addr) 4767 */ 4768 4769 /* 4770 * defined as macro in pmap.h 4771 */ 4772 4773 __strict_weak_alias(pmap_enter, pmap_enter_default); 4774 4775 int 4776 pmap_enter_default(pmap_t pmap, vaddr_t va, paddr_t pa, vm_prot_t prot, 4777 u_int flags) 4778 { 4779 if (__predict_false(pmap->pm_enter != NULL)) { 4780 return (*pmap->pm_enter)(pmap, va, pa, prot, flags); 4781 } 4782 4783 return pmap_enter_ma(pmap, va, pa, pa, prot, flags, 0); 4784 } 4785 4786 /* 4787 * pmap_enter: enter a mapping into a pmap 4788 * 4789 * => must be done "now" ... no lazy-evaluation 4790 */ 4791 int 4792 pmap_enter_ma(struct pmap *pmap, vaddr_t va, paddr_t ma, paddr_t pa, 4793 vm_prot_t prot, u_int flags, int domid) 4794 { 4795 pt_entry_t *ptes, opte, npte; 4796 pt_entry_t *ptep; 4797 pd_entry_t * const *pdes; 4798 struct vm_page *ptp; 4799 struct vm_page *new_pg, *old_pg; 4800 struct pmap_page *new_pp, *old_pp; 4801 struct pv_entry *old_pve, *new_pve; 4802 bool wired = (flags & PMAP_WIRED) != 0; 4803 struct pmap *pmap2; 4804 struct pmap_ptparray pt; 4805 int error; 4806 bool getptp, samepage, new_embedded; 4807 rb_tree_t *tree; 4808 4809 KASSERT(pmap_initialized); 4810 KASSERT(va < VM_MAX_KERNEL_ADDRESS); 4811 KASSERTMSG(va != (vaddr_t)PDP_BASE, "%s: trying to map va=%#" 4812 PRIxVADDR " over PDP!", __func__, va); 4813 KASSERTMSG(va < VM_MIN_KERNEL_ADDRESS || 4814 pmap_valid_entry(pmap->pm_pdir[pl_i(va, PTP_LEVELS)]), 4815 "%s: missing kernel PTP for va=%#" PRIxVADDR, __func__, va); 4816 4817 #ifdef XENPV 4818 KASSERT(domid == DOMID_SELF || pa == 0); 4819 #endif 4820 4821 npte = ma | protection_codes[prot] | PTE_P; 4822 npte |= pmap_pat_flags(flags); 4823 if (wired) 4824 npte |= PTE_WIRED; 4825 if (va < VM_MAXUSER_ADDRESS) 4826 npte |= PTE_U; 4827 4828 if (pmap == pmap_kernel()) 4829 npte |= pmap_pg_g; 4830 if (flags & VM_PROT_ALL) { 4831 npte |= PTE_A; 4832 if (flags & VM_PROT_WRITE) { 4833 KASSERT((npte & PTE_W) != 0); 4834 npte |= PTE_D; 4835 } 4836 } 4837 4838 #ifdef XENPV 4839 if (domid != DOMID_SELF) 4840 new_pg = NULL; 4841 else 4842 #endif 4843 new_pg = PHYS_TO_VM_PAGE(pa); 4844 4845 if (new_pg != NULL) { 4846 /* This is a managed page */ 4847 npte |= PTE_PVLIST; 4848 new_pp = VM_PAGE_TO_PP(new_pg); 4849 PMAP_CHECK_PP(new_pp); 4850 } else if ((new_pp = pmap_pv_tracked(pa)) != NULL) { 4851 /* This is an unmanaged pv-tracked page */ 4852 npte |= PTE_PVLIST; 4853 PMAP_CHECK_PP(new_pp); 4854 } else { 4855 new_pp = NULL; 4856 } 4857 4858 /* Begin by locking the pmap. */ 4859 mutex_enter(&pmap->pm_lock); 4860 4861 /* Look up the PTP. Allocate if none present. */ 4862 ptp = NULL; 4863 getptp = false; 4864 if (pmap != pmap_kernel()) { 4865 ptp = pmap_find_ptp(pmap, va, 1); 4866 if (ptp == NULL) { 4867 getptp = true; 4868 error = pmap_get_ptp(pmap, &pt, va, flags, &ptp); 4869 if (error != 0) { 4870 if (flags & PMAP_CANFAIL) { 4871 mutex_exit(&pmap->pm_lock); 4872 return error; 4873 } 4874 panic("%s: get ptp failed, error=%d", __func__, 4875 error); 4876 } 4877 } 4878 tree = &VM_PAGE_TO_PP(ptp)->pp_rb; 4879 } else { 4880 /* Embedded PV entries rely on this. */ 4881 KASSERT(va != 0); 4882 tree = &pmap_kernel_rb; 4883 } 4884 4885 /* 4886 * Look up the old PV entry at this VA (if any), and insert a new PV 4887 * entry if required for the new mapping. 
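* (pmap_enter_pv() below either reuses the pv_entry embedded in the
* pmap_page or allocates a dynamic one; 'new_embedded' and 'new_pve'
* record which case we are in.)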
Temporarily track the old 4888 * and new mappings concurrently. Only after the old mapping is 4889 * evicted from the pmap will we remove its PV entry. Otherwise, 4890 * our picture of modified/accessed state for either page could get 4891 * out of sync (we need any P->V operation for either page to stall 4892 * on pmap->pm_lock until done here). 4893 */ 4894 new_pve = NULL; 4895 old_pve = NULL; 4896 samepage = false; 4897 new_embedded = false; 4898 4899 if (new_pp != NULL) { 4900 error = pmap_enter_pv(pmap, new_pp, ptp, va, &new_pve, 4901 &old_pve, &samepage, &new_embedded, tree); 4902 4903 /* 4904 * If a new pv_entry was needed and none was available, we 4905 * can go no further. 4906 */ 4907 if (error != 0) { 4908 if (flags & PMAP_CANFAIL) { 4909 if (getptp) { 4910 pmap_unget_ptp(pmap, &pt); 4911 } 4912 mutex_exit(&pmap->pm_lock); 4913 return error; 4914 } 4915 panic("%s: alloc pve failed", __func__); 4916 } 4917 } else { 4918 old_pve = pmap_treelookup_pv(pmap, ptp, tree, va); 4919 } 4920 4921 /* Map PTEs into address space. */ 4922 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); 4923 4924 /* Install any newly allocated PTPs. */ 4925 if (getptp) { 4926 pmap_install_ptp(pmap, &pt, va, pdes); 4927 } 4928 4929 /* Check if there is an existing mapping. */ 4930 ptep = &ptes[pl1_i(va)]; 4931 opte = *ptep; 4932 bool have_oldpa = pmap_valid_entry(opte); 4933 paddr_t oldpa = pmap_pte2pa(opte); 4934 4935 /* 4936 * Update the pte. 4937 */ 4938 do { 4939 opte = *ptep; 4940 4941 /* 4942 * if the same page, inherit PTE_A and PTE_D. 4943 */ 4944 if (((opte ^ npte) & (PTE_FRAME | PTE_P)) == 0) { 4945 npte |= opte & (PTE_A | PTE_D); 4946 } 4947 #if defined(XENPV) 4948 if (domid != DOMID_SELF) { 4949 /* pmap_pte_cas with error handling */ 4950 int s = splvm(); 4951 if (opte != *ptep) { 4952 splx(s); 4953 continue; 4954 } 4955 error = xpq_update_foreign( 4956 vtomach((vaddr_t)ptep), npte, domid, flags); 4957 splx(s); 4958 if (error) { 4959 /* Undo pv_entry tracking - oof. */ 4960 if (new_pp != NULL) { 4961 mutex_spin_enter(&new_pp->pp_lock); 4962 if (new_pve != NULL) { 4963 LIST_REMOVE(new_pve, pve_list); 4964 KASSERT(pmap->pm_pve == NULL); 4965 pmap->pm_pve = new_pve; 4966 } else if (new_embedded) { 4967 new_pp->pp_pte.pte_ptp = NULL; 4968 new_pp->pp_pte.pte_va = 0; 4969 } 4970 mutex_spin_exit(&new_pp->pp_lock); 4971 } 4972 pmap_unmap_ptes(pmap, pmap2); 4973 /* Free new PTP. */ 4974 if (ptp != NULL && ptp->wire_count <= 1) { 4975 pmap_free_ptp(pmap, ptp, va, ptes, 4976 pdes); 4977 } 4978 mutex_exit(&pmap->pm_lock); 4979 return error; 4980 } 4981 break; 4982 } 4983 #endif /* defined(XENPV) */ 4984 } while (pmap_pte_cas(ptep, opte, npte) != opte); 4985 4986 /* 4987 * Done with the PTEs: they can now be unmapped. 4988 */ 4989 pmap_unmap_ptes(pmap, pmap2); 4990 4991 /* 4992 * Update statistics and PTP's reference count. 4993 */ 4994 pmap_stats_update_bypte(pmap, npte, opte); 4995 if (ptp != NULL) { 4996 if (!have_oldpa) { 4997 ptp->wire_count++; 4998 } 4999 /* Remember minimum VA in PTP. */ 5000 pmap_ptp_range_set(ptp, va); 5001 } 5002 KASSERT(ptp == NULL || ptp->wire_count > 1); 5003 5004 /* 5005 * If the same page, we can skip pv_entry handling. 
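* ("Same page" means the new PTE maps the same physical frame that the
* old, valid PTE mapped.)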
5006 */ 5007 if (((opte ^ npte) & (PTE_FRAME | PTE_P)) == 0) { 5008 KASSERT(((opte ^ npte) & PTE_PVLIST) == 0); 5009 if ((npte & PTE_PVLIST) != 0) { 5010 KASSERT(samepage); 5011 pmap_check_pv(pmap, ptp, new_pp, va, true); 5012 } 5013 goto same_pa; 5014 } else if ((npte & PTE_PVLIST) != 0) { 5015 KASSERT(!samepage); 5016 } 5017 5018 /* 5019 * If old page is pv-tracked, remove pv_entry from its list. 5020 */ 5021 if ((~opte & (PTE_P | PTE_PVLIST)) == 0) { 5022 if ((old_pg = PHYS_TO_VM_PAGE(oldpa)) != NULL) { 5023 old_pp = VM_PAGE_TO_PP(old_pg); 5024 } else if ((old_pp = pmap_pv_tracked(oldpa)) == NULL) { 5025 panic("%s: PTE_PVLIST with pv-untracked page" 5026 " va = %#"PRIxVADDR 5027 " pa = %#" PRIxPADDR " (%#" PRIxPADDR ")", 5028 __func__, va, oldpa, atop(pa)); 5029 } 5030 5031 pmap_remove_pv(pmap, old_pp, ptp, va, old_pve, 5032 pmap_pte_to_pp_attrs(opte)); 5033 } else { 5034 KASSERT(old_pve == NULL); 5035 KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == NULL); 5036 } 5037 5038 /* 5039 * If new page is dynamically PV tracked, insert to tree. 5040 */ 5041 if (new_pve != NULL) { 5042 KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == NULL); 5043 old_pve = rb_tree_insert_node(tree, new_pve); 5044 KASSERT(old_pve == new_pve); 5045 pmap_check_pv(pmap, ptp, new_pp, va, true); 5046 } 5047 5048 same_pa: 5049 /* 5050 * shootdown tlb if necessary. 5051 */ 5052 5053 if ((~opte & (PTE_P | PTE_A)) == 0 && 5054 ((opte ^ npte) & (PTE_FRAME | PTE_W)) != 0) { 5055 pmap_tlb_shootdown(pmap, va, opte, TLBSHOOT_ENTER); 5056 } 5057 pmap_drain_pv(pmap); 5058 mutex_exit(&pmap->pm_lock); 5059 return 0; 5060 } 5061 5062 #if defined(XEN) && defined(DOM0OPS) 5063 5064 struct pmap_data_gnt { 5065 SLIST_ENTRY(pmap_data_gnt) pd_gnt_list; 5066 vaddr_t pd_gnt_sva; 5067 vaddr_t pd_gnt_eva; /* range covered by this gnt */ 5068 int pd_gnt_refs; /* ref counter */ 5069 struct gnttab_map_grant_ref pd_gnt_ops[1]; /* variable length */ 5070 }; 5071 SLIST_HEAD(pmap_data_gnt_head, pmap_data_gnt); 5072 5073 static void pmap_remove_gnt(struct pmap *, vaddr_t, vaddr_t); 5074 5075 static struct pmap_data_gnt * 5076 pmap_find_gnt(struct pmap *pmap, vaddr_t sva, vaddr_t eva) 5077 { 5078 struct pmap_data_gnt_head *headp; 5079 struct pmap_data_gnt *pgnt; 5080 5081 KASSERT(mutex_owned(&pmap->pm_lock)); 5082 headp = pmap->pm_data; 5083 KASSERT(headp != NULL); 5084 SLIST_FOREACH(pgnt, headp, pd_gnt_list) { 5085 if (pgnt->pd_gnt_sva <= sva && eva <= pgnt->pd_gnt_eva) 5086 return pgnt; 5087 /* check that we're not overlapping part of a region */ 5088 KASSERT(pgnt->pd_gnt_sva >= eva || pgnt->pd_gnt_eva <= sva); 5089 } 5090 return NULL; 5091 } 5092 5093 static void 5094 pmap_alloc_gnt(struct pmap *pmap, vaddr_t sva, int nentries, 5095 const struct gnttab_map_grant_ref *ops) 5096 { 5097 struct pmap_data_gnt_head *headp; 5098 struct pmap_data_gnt *pgnt; 5099 vaddr_t eva = sva + nentries * PAGE_SIZE; 5100 KASSERT(mutex_owned(&pmap->pm_lock)); 5101 KASSERT(nentries >= 1); 5102 if (pmap->pm_remove == NULL) { 5103 pmap->pm_remove = pmap_remove_gnt; 5104 KASSERT(pmap->pm_data == NULL); 5105 headp = kmem_alloc(sizeof(*headp), KM_SLEEP); 5106 SLIST_INIT(headp); 5107 pmap->pm_data = headp; 5108 } else { 5109 KASSERT(pmap->pm_remove == pmap_remove_gnt); 5110 KASSERT(pmap->pm_data != NULL); 5111 headp = pmap->pm_data; 5112 } 5113 5114 pgnt = pmap_find_gnt(pmap, sva, eva); 5115 if (pgnt != NULL) { 5116 KASSERT(pgnt->pd_gnt_sva == sva); 5117 KASSERT(pgnt->pd_gnt_eva == eva); 5118 return; 5119 } 5120 5121 /* new entry */ 5122 pgnt = kmem_alloc(sizeof(*pgnt) + 
5123 (nentries - 1) * sizeof(struct gnttab_map_grant_ref), KM_SLEEP); 5124 pgnt->pd_gnt_sva = sva; 5125 pgnt->pd_gnt_eva = eva; 5126 pgnt->pd_gnt_refs = 0; 5127 memcpy(pgnt->pd_gnt_ops, ops, 5128 sizeof(struct gnttab_map_grant_ref) * nentries); 5129 SLIST_INSERT_HEAD(headp, pgnt, pd_gnt_list); 5130 } 5131 5132 static void 5133 pmap_free_gnt(struct pmap *pmap, struct pmap_data_gnt *pgnt) 5134 { 5135 struct pmap_data_gnt_head *headp = pmap->pm_data; 5136 int nentries = (pgnt->pd_gnt_eva - pgnt->pd_gnt_sva) / PAGE_SIZE; 5137 KASSERT(nentries >= 1); 5138 KASSERT(mutex_owned(&pmap->pm_lock)); 5139 KASSERT(pgnt->pd_gnt_refs == 0); 5140 SLIST_REMOVE(headp, pgnt, pmap_data_gnt, pd_gnt_list); 5141 kmem_free(pgnt, sizeof(*pgnt) + 5142 (nentries - 1) * sizeof(struct gnttab_map_grant_ref)); 5143 if (SLIST_EMPTY(headp)) { 5144 kmem_free(headp, sizeof(*headp)); 5145 pmap->pm_data = NULL; 5146 pmap->pm_remove = NULL; 5147 } 5148 } 5149 5150 /* 5151 * pmap_enter_gnt: enter a grant entry into a pmap 5152 * 5153 * => must be done "now" ... no lazy-evaluation 5154 */ 5155 int 5156 pmap_enter_gnt(struct pmap *pmap, vaddr_t va, vaddr_t sva, int nentries, 5157 const struct gnttab_map_grant_ref *oops) 5158 { 5159 struct pmap_data_gnt *pgnt; 5160 pt_entry_t *ptes, opte; 5161 pt_entry_t *ptep; 5162 pd_entry_t * const *pdes; 5163 struct vm_page *ptp; 5164 struct vm_page *old_pg; 5165 struct pmap_page *old_pp; 5166 struct pv_entry *old_pve; 5167 struct pmap *pmap2; 5168 struct pmap_ptparray pt; 5169 int error; 5170 bool getptp; 5171 rb_tree_t *tree; 5172 struct gnttab_map_grant_ref *op; 5173 int ret; 5174 int idx; 5175 5176 KASSERT(pmap_initialized); 5177 KASSERT(va < VM_MAX_KERNEL_ADDRESS); 5178 KASSERTMSG(va != (vaddr_t)PDP_BASE, "%s: trying to map va=%#" 5179 PRIxVADDR " over PDP!", __func__, va); 5180 KASSERT(pmap != pmap_kernel()); 5181 5182 /* Begin by locking the pmap. */ 5183 mutex_enter(&pmap->pm_lock); 5184 pmap_alloc_gnt(pmap, sva, nentries, oops); 5185 5186 pgnt = pmap_find_gnt(pmap, va, va + PAGE_SIZE); 5187 KASSERT(pgnt != NULL); 5188 5189 /* Look up the PTP. Allocate if none present. */ 5190 ptp = NULL; 5191 getptp = false; 5192 ptp = pmap_find_ptp(pmap, va, 1); 5193 if (ptp == NULL) { 5194 getptp = true; 5195 error = pmap_get_ptp(pmap, &pt, va, PMAP_CANFAIL, &ptp); 5196 if (error != 0) { 5197 mutex_exit(&pmap->pm_lock); 5198 return error; 5199 } 5200 } 5201 tree = &VM_PAGE_TO_PP(ptp)->pp_rb; 5202 5203 /* 5204 * Look up the old PV entry at this VA (if any), and insert a new PV 5205 * entry if required for the new mapping. Temporarily track the old 5206 * and new mappings concurrently. Only after the old mapping is 5207 * evicted from the pmap will we remove its PV entry. Otherwise, 5208 * our picture of modified/accessed state for either page could get 5209 * out of sync (we need any P->V operation for either page to stall 5210 * on pmap->pm_lock until done here). 5211 */ 5212 old_pve = NULL; 5213 5214 old_pve = pmap_treelookup_pv(pmap, ptp, tree, va); 5215 5216 /* Map PTEs into address space. */ 5217 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); 5218 5219 /* Install any newly allocated PTPs. */ 5220 if (getptp) { 5221 pmap_install_ptp(pmap, &pt, va, pdes); 5222 } 5223 5224 /* Check if there is an existing mapping. */ 5225 ptep = &ptes[pl1_i(va)]; 5226 opte = *ptep; 5227 bool have_oldpa = pmap_valid_entry(opte); 5228 paddr_t oldpa = pmap_pte2pa(opte); 5229 5230 /* 5231 * Update the pte. 
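* Unlike pmap_enter_ma(), the PTE is not written directly here: its
* machine address is handed to the hypervisor via GNTTABOP_map_grant_ref
* below, which installs the granted frame for us.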
5232 */ 5233 5234 idx = (va - pgnt->pd_gnt_sva) / PAGE_SIZE; 5235 op = &pgnt->pd_gnt_ops[idx]; 5236 5237 op->host_addr = xpmap_ptetomach(ptep); 5238 op->dev_bus_addr = 0; 5239 op->status = GNTST_general_error; 5240 ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, op, 1); 5241 if (__predict_false(ret)) { 5242 printf("%s: GNTTABOP_map_grant_ref failed: %d\n", 5243 __func__, ret); 5244 op->status = GNTST_general_error; 5245 } 5246 for (int d = 0; d < 256 && op->status == GNTST_eagain; d++) { 5247 kpause("gntmap", false, mstohz(1), NULL); 5248 ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, op, 1); 5249 if (__predict_false(ret)) { 5250 printf("%s: GNTTABOP_map_grant_ref failed: %d\n", 5251 __func__, ret); 5252 op->status = GNTST_general_error; 5253 } 5254 } 5255 if (__predict_false(op->status != GNTST_okay)) { 5256 printf("%s: GNTTABOP_map_grant_ref status: %d\n", 5257 __func__, op->status); 5258 if (have_oldpa) { 5259 ptp->wire_count--; 5260 } 5261 } else { 5262 pgnt->pd_gnt_refs++; 5263 if (!have_oldpa) { 5264 ptp->wire_count++; 5265 } 5266 KASSERT(ptp->wire_count > 1); 5267 /* Remember minimum VA in PTP. */ 5268 pmap_ptp_range_set(ptp, va); 5269 } 5270 if (ptp->wire_count <= 1) 5271 pmap_free_ptp(pmap, ptp, va, ptes, pdes); 5272 5273 /* 5274 * Done with the PTEs: they can now be unmapped. 5275 */ 5276 pmap_unmap_ptes(pmap, pmap2); 5277 5278 /* 5279 * Update statistics and PTP's reference count. 5280 */ 5281 pmap_stats_update_bypte(pmap, 0, opte); 5282 5283 /* 5284 * If old page is pv-tracked, remove pv_entry from its list. 5285 */ 5286 if ((~opte & (PTE_P | PTE_PVLIST)) == 0) { 5287 if ((old_pg = PHYS_TO_VM_PAGE(oldpa)) != NULL) { 5288 old_pp = VM_PAGE_TO_PP(old_pg); 5289 } else if ((old_pp = pmap_pv_tracked(oldpa)) == NULL) { 5290 panic("%s: PTE_PVLIST with pv-untracked page" 5291 " va = %#"PRIxVADDR " pa = %#" PRIxPADDR, 5292 __func__, va, oldpa); 5293 } 5294 5295 pmap_remove_pv(pmap, old_pp, ptp, va, old_pve, 5296 pmap_pte_to_pp_attrs(opte)); 5297 } else { 5298 KASSERT(old_pve == NULL); 5299 KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == NULL); 5300 } 5301 5302 pmap_drain_pv(pmap); 5303 mutex_exit(&pmap->pm_lock); 5304 return op->status; 5305 } 5306 5307 /* 5308 * pmap_remove_gnt: grant mapping removal function. 5309 * 5310 * => caller should not be holding any pmap locks 5311 */ 5312 static void 5313 pmap_remove_gnt(struct pmap *pmap, vaddr_t sva, vaddr_t eva) 5314 { 5315 struct pmap_data_gnt *pgnt; 5316 pt_entry_t *ptes; 5317 pd_entry_t pde; 5318 pd_entry_t * const *pdes; 5319 struct vm_page *ptp; 5320 struct pmap *pmap2; 5321 vaddr_t va; 5322 int lvl; 5323 int idx; 5324 struct gnttab_map_grant_ref *op; 5325 struct gnttab_unmap_grant_ref unmap_op; 5326 int ret; 5327 5328 KASSERT(pmap != pmap_kernel()); 5329 KASSERT(pmap->pm_remove == pmap_remove_gnt); 5330 5331 mutex_enter(&pmap->pm_lock); 5332 for (va = sva; va < eva; va += PAGE_SIZE) { 5333 pgnt = pmap_find_gnt(pmap, va, va + PAGE_SIZE); 5334 if (pgnt == NULL) { 5335 pmap_remove_locked(pmap, sva, eva); 5336 continue; 5337 } 5338 5339 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); 5340 if (!pmap_pdes_valid(va, pdes, &pde, &lvl)) { 5341 panic("pmap_remove_gnt pdes not valid"); 5342 } 5343 5344 idx = (va - pgnt->pd_gnt_sva) / PAGE_SIZE; 5345 op = &pgnt->pd_gnt_ops[idx]; 5346 KASSERT(lvl == 1); 5347 KASSERT(op->status == GNTST_okay); 5348 5349 /* Get PTP if non-kernel mapping. 
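* (pmap_remove_gnt() is never used on the kernel pmap, so a PTP must
* already exist here.)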
*/ 5350 ptp = pmap_find_ptp(pmap, va, 1); 5351 KASSERTMSG(ptp != NULL, 5352 "%s: unmanaged PTP detected", __func__); 5353 5354 if (op->status == GNTST_okay) { 5355 KASSERT(pmap_valid_entry(ptes[pl1_i(va)])); 5356 unmap_op.handle = op->handle; 5357 unmap_op.dev_bus_addr = 0; 5358 unmap_op.host_addr = xpmap_ptetomach(&ptes[pl1_i(va)]); 5359 ret = HYPERVISOR_grant_table_op( 5360 GNTTABOP_unmap_grant_ref, &unmap_op, 1); 5361 if (ret) { 5362 printf("%s: GNTTABOP_unmap_grant_ref " 5363 "failed: %d\n", __func__, ret); 5364 } 5365 5366 ptp->wire_count--; 5367 pgnt->pd_gnt_refs--; 5368 if (pgnt->pd_gnt_refs == 0) { 5369 pmap_free_gnt(pmap, pgnt); 5370 } 5371 } 5372 /* 5373 * if mapping removed and the PTP is no longer 5374 * being used, free it! 5375 */ 5376 5377 if (ptp->wire_count <= 1) 5378 pmap_free_ptp(pmap, ptp, va, ptes, pdes); 5379 pmap_unmap_ptes(pmap, pmap2); 5380 } 5381 mutex_exit(&pmap->pm_lock); 5382 } 5383 #endif /* XEN && DOM0OPS */ 5384 5385 paddr_t 5386 pmap_get_physpage(void) 5387 { 5388 struct vm_page *ptp; 5389 struct pmap *kpm = pmap_kernel(); 5390 paddr_t pa; 5391 5392 if (!uvm.page_init_done) { 5393 /* 5394 * We're growing the kernel pmap early (from 5395 * uvm_pageboot_alloc()). This case must be 5396 * handled a little differently. 5397 */ 5398 5399 if (!uvm_page_physget(&pa)) 5400 panic("%s: out of memory", __func__); 5401 #if defined(__HAVE_DIRECT_MAP) 5402 memset((void *)PMAP_DIRECT_MAP(pa), 0, PAGE_SIZE); 5403 #else 5404 #if defined(XENPV) 5405 if (XEN_VERSION_SUPPORTED(3, 4)) { 5406 xen_pagezero(pa); 5407 return pa; 5408 } 5409 #endif 5410 kpreempt_disable(); 5411 pmap_pte_set(early_zero_pte, pmap_pa2pte(pa) | PTE_P | 5412 PTE_W | pmap_pg_nx); 5413 pmap_pte_flush(); 5414 pmap_update_pg((vaddr_t)early_zerop); 5415 memset(early_zerop, 0, PAGE_SIZE); 5416 #if defined(DIAGNOSTIC) || defined(XENPV) 5417 pmap_pte_set(early_zero_pte, 0); 5418 pmap_pte_flush(); 5419 #endif /* defined(DIAGNOSTIC) */ 5420 kpreempt_enable(); 5421 #endif /* defined(__HAVE_DIRECT_MAP) */ 5422 } else { 5423 /* XXX */ 5424 ptp = uvm_pagealloc(NULL, 0, NULL, 5425 UVM_PGA_USERESERVE|UVM_PGA_ZERO); 5426 if (ptp == NULL) 5427 panic("%s: out of memory", __func__); 5428 ptp->flags &= ~PG_BUSY; 5429 ptp->wire_count = 1; 5430 pa = VM_PAGE_TO_PHYS(ptp); 5431 } 5432 pmap_stats_update(kpm, 1, 0); 5433 5434 return pa; 5435 } 5436 5437 /* 5438 * Expand the page tree with the specified amount of PTPs, mapping virtual 5439 * addresses starting at kva. We populate all the levels but the last one 5440 * (L1). The nodes of the tree are created as RW, but the pages covered 5441 * will be kentered in L1, with proper permissions. 5442 * 5443 * Used only by pmap_growkernel. 
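* A rough sketch of the caller's side (see pmap_growkernel() below),
* with 'lvl' running over the paging levels and illustrative names:
*
*	needed[lvl - 1] = pl_i_roundup(new_maxkva, lvl) -
*	    pl_i_roundup(VM_MIN_KERNEL_ADDRESS, lvl) - nkptp[lvl - 1];
*	pmap_alloc_level(cpm, pmap_maxkvaddr, needed);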
5444 */ 5445 static void 5446 pmap_alloc_level(struct pmap *cpm, vaddr_t kva, long *needed_ptps) 5447 { 5448 unsigned long i; 5449 paddr_t pa; 5450 unsigned long index, endindex; 5451 int level; 5452 pd_entry_t *pdep; 5453 #ifdef XENPV 5454 int s = splvm(); /* protect xpq_* */ 5455 #endif 5456 5457 for (level = PTP_LEVELS; level > 1; level--) { 5458 if (level == PTP_LEVELS) 5459 pdep = cpm->pm_pdir; 5460 else 5461 pdep = normal_pdes[level - 2]; 5462 index = pl_i_roundup(kva, level); 5463 endindex = index + needed_ptps[level - 1] - 1; 5464 5465 for (i = index; i <= endindex; i++) { 5466 pt_entry_t pte; 5467 5468 KASSERT(!pmap_valid_entry(pdep[i])); 5469 pa = pmap_get_physpage(); 5470 pte = pmap_pa2pte(pa) | PTE_P | PTE_W; 5471 #ifdef __x86_64__ 5472 pte |= pmap_pg_nx; 5473 #endif 5474 pmap_pte_set(&pdep[i], pte); 5475 5476 #ifdef XENPV 5477 if (level == PTP_LEVELS && i >= PDIR_SLOT_KERN) { 5478 if (__predict_true( 5479 cpu_info_primary.ci_flags & CPUF_PRESENT)) { 5480 /* update per-cpu PMDs on all cpus */ 5481 xen_kpm_sync(pmap_kernel(), i); 5482 } else { 5483 /* 5484 * too early; update primary CPU 5485 * PMD only (without locks) 5486 */ 5487 #ifdef __x86_64__ 5488 pd_entry_t *cpu_pdep = 5489 &cpu_info_primary.ci_kpm_pdir[i]; 5490 #else 5491 pd_entry_t *cpu_pdep = 5492 &cpu_info_primary.ci_kpm_pdir[l2tol2(i)]; 5493 #endif 5494 pmap_pte_set(cpu_pdep, pte); 5495 } 5496 } 5497 #endif 5498 5499 KASSERT(level != PTP_LEVELS || nkptp[level - 1] + 5500 pl_i(VM_MIN_KERNEL_ADDRESS, level) == i); 5501 nkptp[level - 1]++; 5502 } 5503 pmap_pte_flush(); 5504 } 5505 #ifdef XENPV 5506 splx(s); 5507 #endif 5508 } 5509 5510 /* 5511 * pmap_growkernel: increase usage of KVM space. 5512 * 5513 * => we allocate new PTPs for the kernel and install them in all 5514 * the pmaps on the system. 5515 */ 5516 vaddr_t 5517 pmap_growkernel(vaddr_t maxkvaddr) 5518 { 5519 struct pmap *kpm = pmap_kernel(); 5520 struct pmap *cpm; 5521 #if !defined(XENPV) || !defined(__x86_64__) 5522 struct pmap *pm; 5523 long old; 5524 #endif 5525 int s, i; 5526 long needed_kptp[PTP_LEVELS], target_nptp; 5527 bool invalidate = false; 5528 5529 s = splvm(); /* to be safe */ 5530 mutex_enter(&kpm->pm_lock); 5531 5532 if (maxkvaddr <= pmap_maxkvaddr) { 5533 mutex_exit(&kpm->pm_lock); 5534 splx(s); 5535 return pmap_maxkvaddr; 5536 } 5537 5538 maxkvaddr = x86_round_pdr(maxkvaddr); 5539 #if !defined(XENPV) || !defined(__x86_64__) 5540 old = nkptp[PTP_LEVELS - 1]; 5541 #endif 5542 5543 /* Initialize needed_kptp. */ 5544 for (i = PTP_LEVELS - 1; i >= 1; i--) { 5545 target_nptp = pl_i_roundup(maxkvaddr, i + 1) - 5546 pl_i_roundup(VM_MIN_KERNEL_ADDRESS, i + 1); 5547 5548 if (target_nptp > nkptpmax[i]) 5549 panic("out of KVA space"); 5550 KASSERT(target_nptp >= nkptp[i]); 5551 needed_kptp[i] = target_nptp - nkptp[i]; 5552 } 5553 5554 #ifdef XENPV 5555 /* only pmap_kernel(), or the per-cpu map, has kernel entries */ 5556 cpm = kpm; 5557 #else 5558 /* Get the current pmap */ 5559 if (__predict_true(cpu_info_primary.ci_flags & CPUF_PRESENT)) { 5560 cpm = curcpu()->ci_pmap; 5561 } else { 5562 cpm = kpm; 5563 } 5564 #endif 5565 5566 kasan_shadow_map((void *)pmap_maxkvaddr, 5567 (size_t)(maxkvaddr - pmap_maxkvaddr)); 5568 kmsan_shadow_map((void *)pmap_maxkvaddr, 5569 (size_t)(maxkvaddr - pmap_maxkvaddr)); 5570 5571 pmap_alloc_level(cpm, pmap_maxkvaddr, needed_kptp); 5572 5573 /* 5574 * If the number of top level entries changed, update all pmaps. 
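* On native x86 this copies the new PDIR_SLOT_KERN entries into every
* pmap on the 'pmaps' list; on XENPV/amd64 there is nothing to do since
* kernel entries are never entered in user pmaps.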
5575 */ 5576 if (needed_kptp[PTP_LEVELS - 1] != 0) { 5577 #ifdef XENPV 5578 #ifdef __x86_64__ 5579 /* nothing, kernel entries are never entered in user pmap */ 5580 #else 5581 int pdkidx; 5582 5583 mutex_enter(&pmaps_lock); 5584 LIST_FOREACH(pm, &pmaps, pm_list) { 5585 for (pdkidx = PDIR_SLOT_KERN + old; 5586 pdkidx < PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1]; 5587 pdkidx++) { 5588 pmap_pte_set(&pm->pm_pdir[pdkidx], 5589 kpm->pm_pdir[pdkidx]); 5590 } 5591 pmap_pte_flush(); 5592 } 5593 mutex_exit(&pmaps_lock); 5594 #endif /* __x86_64__ */ 5595 #else /* XENPV */ 5596 size_t newpdes; 5597 newpdes = nkptp[PTP_LEVELS - 1] - old; 5598 if (cpm != kpm) { 5599 memcpy(&kpm->pm_pdir[PDIR_SLOT_KERN + old], 5600 &cpm->pm_pdir[PDIR_SLOT_KERN + old], 5601 newpdes * sizeof(pd_entry_t)); 5602 } 5603 5604 mutex_enter(&pmaps_lock); 5605 LIST_FOREACH(pm, &pmaps, pm_list) { 5606 if (__predict_false(pm->pm_enter != NULL)) { 5607 /* 5608 * Not a native pmap, the kernel is not mapped, 5609 * so nothing to synchronize. 5610 */ 5611 continue; 5612 } 5613 memcpy(&pm->pm_pdir[PDIR_SLOT_KERN + old], 5614 &kpm->pm_pdir[PDIR_SLOT_KERN + old], 5615 newpdes * sizeof(pd_entry_t)); 5616 } 5617 mutex_exit(&pmaps_lock); 5618 #endif 5619 invalidate = true; 5620 } 5621 pmap_maxkvaddr = maxkvaddr; 5622 mutex_exit(&kpm->pm_lock); 5623 splx(s); 5624 5625 if (invalidate && pmap_initialized) { 5626 /* Invalidate the pmap cache. */ 5627 pool_cache_invalidate(&pmap_cache); 5628 } 5629 5630 return maxkvaddr; 5631 } 5632 5633 #ifdef DEBUG 5634 void pmap_dump(struct pmap *, vaddr_t, vaddr_t); 5635 5636 /* 5637 * pmap_dump: dump all the mappings from a pmap 5638 * 5639 * => caller should not be holding any pmap locks 5640 */ 5641 void 5642 pmap_dump(struct pmap *pmap, vaddr_t sva, vaddr_t eva) 5643 { 5644 pt_entry_t *ptes, *pte; 5645 pd_entry_t * const *pdes; 5646 struct pmap *pmap2; 5647 vaddr_t blkendva; 5648 int lvl; 5649 5650 /* 5651 * if end is out of range truncate. 5652 * if (end == start) update to max. 5653 */ 5654 5655 if (eva > VM_MAXUSER_ADDRESS || eva <= sva) 5656 eva = VM_MAXUSER_ADDRESS; 5657 5658 mutex_enter(&pmap->pm_lock); 5659 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); 5660 5661 /* 5662 * dumping a range of pages: we dump in PTP sized blocks (4MB) 5663 */ 5664 5665 for (/* null */ ; sva < eva ; sva = blkendva) { 5666 5667 /* determine range of block */ 5668 blkendva = x86_round_pdr(sva+1); 5669 if (blkendva > eva) 5670 blkendva = eva; 5671 5672 /* valid block? */ 5673 if (!pmap_pdes_valid(sva, pdes, NULL, &lvl)) 5674 continue; 5675 KASSERT(lvl == 1); 5676 5677 pte = &ptes[pl1_i(sva)]; 5678 for (/* null */; sva < blkendva ; sva += PAGE_SIZE, pte++) { 5679 if (!pmap_valid_entry(*pte)) 5680 continue; 5681 printf("va %#" PRIxVADDR " -> pa %#" PRIxPADDR 5682 " (pte=%#" PRIxPADDR ")\n", 5683 sva, (paddr_t)pmap_pte2pa(*pte), (paddr_t)*pte); 5684 } 5685 } 5686 pmap_unmap_ptes(pmap, pmap2); 5687 mutex_exit(&pmap->pm_lock); 5688 } 5689 #endif 5690 5691 /* 5692 * pmap_update: process deferred invalidations and frees. 5693 */ 5694 void 5695 pmap_update(struct pmap *pmap) 5696 { 5697 struct pmap_page *pp; 5698 struct vm_page *ptp; 5699 5700 /* 5701 * Initiate any pending TLB shootdowns. Wait for them to 5702 * complete before returning control to the caller. 5703 */ 5704 kpreempt_disable(); 5705 pmap_tlb_shootnow(); 5706 kpreempt_enable(); 5707 5708 /* 5709 * Now that shootdowns are complete, process deferred frees. 
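* (The pages on pm_gc_ptp are PTPs that were emptied and released
* earlier, while the pmap was still locked.)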
This 5710 * is an unlocked check, but is safe as we're only interested in 5711 * work done in this LWP - we won't get a false negative. 5712 */ 5713 if (atomic_load_relaxed(&pmap->pm_gc_ptp.lh_first) == NULL) { 5714 return; 5715 } 5716 5717 mutex_enter(&pmap->pm_lock); 5718 while ((ptp = LIST_FIRST(&pmap->pm_gc_ptp)) != NULL) { 5719 KASSERT(ptp->wire_count == 0); 5720 KASSERT(ptp->uanon == NULL); 5721 LIST_REMOVE(ptp, mdpage.mp_pp.pp_link); 5722 pp = VM_PAGE_TO_PP(ptp); 5723 LIST_INIT(&pp->pp_pvlist); 5724 pp->pp_attrs = 0; 5725 pp->pp_pte.pte_ptp = NULL; 5726 pp->pp_pte.pte_va = 0; 5727 PMAP_CHECK_PP(VM_PAGE_TO_PP(ptp)); 5728 5729 /* 5730 * XXX Hack to avoid extra locking, and lock 5731 * assertions in uvm_pagefree(). Despite uobject 5732 * being set, this isn't a managed page. 5733 */ 5734 PMAP_DUMMY_LOCK(pmap); 5735 uvm_pagerealloc(ptp, NULL, 0); 5736 PMAP_DUMMY_UNLOCK(pmap); 5737 uvm_pagefree(ptp); 5738 } 5739 mutex_exit(&pmap->pm_lock); 5740 } 5741 5742 #if PTP_LEVELS > 4 5743 #error "Unsupported number of page table mappings" 5744 #endif 5745 5746 paddr_t 5747 pmap_init_tmp_pgtbl(paddr_t pg) 5748 { 5749 static bool maps_loaded; 5750 static const paddr_t x86_tmp_pml_paddr[] = { 5751 4 * PAGE_SIZE, /* L1 */ 5752 5 * PAGE_SIZE, /* L2 */ 5753 6 * PAGE_SIZE, /* L3 */ 5754 7 * PAGE_SIZE /* L4 */ 5755 }; 5756 static vaddr_t x86_tmp_pml_vaddr[] = { 0, 0, 0, 0 }; 5757 5758 pd_entry_t *tmp_pml, *kernel_pml; 5759 5760 int level; 5761 5762 if (!maps_loaded) { 5763 for (level = 0; level < PTP_LEVELS; ++level) { 5764 x86_tmp_pml_vaddr[level] = 5765 uvm_km_alloc(kernel_map, PAGE_SIZE, 0, 5766 UVM_KMF_VAONLY); 5767 5768 if (x86_tmp_pml_vaddr[level] == 0) 5769 panic("mapping of real mode PML failed\n"); 5770 pmap_kenter_pa(x86_tmp_pml_vaddr[level], 5771 x86_tmp_pml_paddr[level], 5772 VM_PROT_READ | VM_PROT_WRITE, 0); 5773 } 5774 pmap_update(pmap_kernel()); 5775 maps_loaded = true; 5776 } 5777 5778 /* Zero levels 1-3 */ 5779 for (level = 0; level < PTP_LEVELS - 1; ++level) { 5780 tmp_pml = (void *)x86_tmp_pml_vaddr[level]; 5781 memset(tmp_pml, 0, PAGE_SIZE); 5782 } 5783 5784 /* Copy PML4 */ 5785 kernel_pml = pmap_kernel()->pm_pdir; 5786 tmp_pml = (void *)x86_tmp_pml_vaddr[PTP_LEVELS - 1]; 5787 memcpy(tmp_pml, kernel_pml, PAGE_SIZE); 5788 5789 #ifdef PAE 5790 /* 5791 * Use the last 4 entries of the L2 page as L3 PD entries. These 5792 * last entries are unlikely to be used for temporary mappings. 
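* The resulting L3 layout is: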
5793 * 508: maps 0->1GB (userland) 5794 * 509: unused 5795 * 510: unused 5796 * 511: maps 3->4GB (kernel) 5797 */ 5798 tmp_pml[508] = x86_tmp_pml_paddr[PTP_LEVELS - 1] | PTE_P; 5799 tmp_pml[509] = 0; 5800 tmp_pml[510] = 0; 5801 tmp_pml[511] = pmap_pdirpa(pmap_kernel(), PDIR_SLOT_KERN) | PTE_P; 5802 #endif 5803 5804 for (level = PTP_LEVELS - 1; level > 0; --level) { 5805 tmp_pml = (void *)x86_tmp_pml_vaddr[level]; 5806 5807 tmp_pml[pl_i(pg, level + 1)] = 5808 (x86_tmp_pml_paddr[level - 1] & PTE_FRAME) | PTE_W | PTE_P; 5809 } 5810 5811 tmp_pml = (void *)x86_tmp_pml_vaddr[0]; 5812 tmp_pml[pl_i(pg, 1)] = (pg & PTE_FRAME) | PTE_W | PTE_P; 5813 5814 #ifdef PAE 5815 /* Return the PA of the L3 page (entry 508 of the L2 page) */ 5816 return x86_tmp_pml_paddr[PTP_LEVELS - 1] + 508 * sizeof(pd_entry_t); 5817 #endif 5818 5819 return x86_tmp_pml_paddr[PTP_LEVELS - 1]; 5820 } 5821 5822 u_int 5823 x86_mmap_flags(paddr_t mdpgno) 5824 { 5825 u_int nflag = (mdpgno >> X86_MMAP_FLAG_SHIFT) & X86_MMAP_FLAG_MASK; 5826 u_int pflag = 0; 5827 5828 if (nflag & X86_MMAP_FLAG_PREFETCH) 5829 pflag |= PMAP_WRITE_COMBINE; 5830 5831 return pflag; 5832 } 5833 5834 #if defined(__HAVE_DIRECT_MAP) && defined(__x86_64__) && !defined(XENPV) 5835 5836 /* 5837 * ----------------------------------------------------------------------------- 5838 * ***************************************************************************** 5839 * ***************************************************************************** 5840 * ***************************************************************************** 5841 * ***************************************************************************** 5842 * **************** HERE BEGINS THE EPT CODE, USED BY INTEL-VMX **************** 5843 * ***************************************************************************** 5844 * ***************************************************************************** 5845 * ***************************************************************************** 5846 * ***************************************************************************** 5847 * ----------------------------------------------------------------------------- 5848 * 5849 * These functions are invoked as callbacks from the code above. Contrary to 5850 * native, EPT does not have a recursive slot; therefore, it is not possible 5851 * to call pmap_map_ptes(). Instead, we use the direct map and walk down the 5852 * tree manually. 5853 * 5854 * Apart from that, the logic is mostly the same as native. Once a pmap has 5855 * been created, NVMM calls pmap_ept_transform() to make it an EPT pmap. 5856 * After that we're good, and the callbacks will handle the translations 5857 * for us. 5858 * 5859 * ----------------------------------------------------------------------------- 5860 */ 5861 5862 /* Hardware bits. */ 5863 #define EPT_R __BIT(0) /* read */ 5864 #define EPT_W __BIT(1) /* write */ 5865 #define EPT_X __BIT(2) /* execute */ 5866 #define EPT_T __BITS(5,3) /* type */ 5867 #define TYPE_UC 0 5868 #define TYPE_WC 1 5869 #define TYPE_WT 4 5870 #define TYPE_WP 5 5871 #define TYPE_WB 6 5872 #define EPT_NOPAT __BIT(6) 5873 #define EPT_L __BIT(7) /* large */ 5874 #define EPT_A __BIT(8) /* accessed */ 5875 #define EPT_D __BIT(9) /* dirty */ 5876 /* Software bits. 
*/ 5877 #define EPT_PVLIST __BIT(60) 5878 #define EPT_WIRED __BIT(61) 5879 5880 #define pmap_ept_valid_entry(pte) (pte & EPT_R) 5881 5882 bool pmap_ept_has_ad __read_mostly; 5883 5884 static inline void 5885 pmap_ept_stats_update_bypte(struct pmap *pmap, pt_entry_t npte, pt_entry_t opte) 5886 { 5887 int resid_diff = ((npte & EPT_R) ? 1 : 0) - ((opte & EPT_R) ? 1 : 0); 5888 int wired_diff = ((npte & EPT_WIRED) ? 1 : 0) - ((opte & EPT_WIRED) ? 1 : 0); 5889 5890 KASSERT((npte & (EPT_R | EPT_WIRED)) != EPT_WIRED); 5891 KASSERT((opte & (EPT_R | EPT_WIRED)) != EPT_WIRED); 5892 5893 pmap_stats_update(pmap, resid_diff, wired_diff); 5894 } 5895 5896 static pt_entry_t 5897 pmap_ept_type(u_int flags) 5898 { 5899 u_int cacheflags = (flags & PMAP_CACHE_MASK); 5900 pt_entry_t ret; 5901 5902 switch (cacheflags) { 5903 case PMAP_NOCACHE: 5904 case PMAP_NOCACHE_OVR: 5905 ret = __SHIFTIN(TYPE_UC, EPT_T); 5906 break; 5907 case PMAP_WRITE_COMBINE: 5908 ret = __SHIFTIN(TYPE_WC, EPT_T); 5909 break; 5910 case PMAP_WRITE_BACK: 5911 default: 5912 ret = __SHIFTIN(TYPE_WB, EPT_T); 5913 break; 5914 } 5915 5916 ret |= EPT_NOPAT; 5917 return ret; 5918 } 5919 5920 static inline pt_entry_t 5921 pmap_ept_prot(vm_prot_t prot) 5922 { 5923 pt_entry_t res = 0; 5924 5925 if (prot & VM_PROT_READ) 5926 res |= EPT_R; 5927 if (prot & VM_PROT_WRITE) 5928 res |= EPT_W; 5929 if (prot & VM_PROT_EXECUTE) 5930 res |= EPT_X; 5931 5932 return res; 5933 } 5934 5935 static inline uint8_t 5936 pmap_ept_to_pp_attrs(pt_entry_t ept) 5937 { 5938 uint8_t ret = 0; 5939 if (pmap_ept_has_ad) { 5940 if (ept & EPT_D) 5941 ret |= PP_ATTRS_D; 5942 if (ept & EPT_A) 5943 ret |= PP_ATTRS_A; 5944 } else { 5945 ret |= (PP_ATTRS_D|PP_ATTRS_A); 5946 } 5947 if (ept & EPT_W) 5948 ret |= PP_ATTRS_W; 5949 return ret; 5950 } 5951 5952 static inline pt_entry_t 5953 pmap_pp_attrs_to_ept(uint8_t attrs) 5954 { 5955 pt_entry_t ept = 0; 5956 if (attrs & PP_ATTRS_D) 5957 ept |= EPT_D; 5958 if (attrs & PP_ATTRS_A) 5959 ept |= EPT_A; 5960 if (attrs & PP_ATTRS_W) 5961 ept |= EPT_W; 5962 return ept; 5963 } 5964 5965 /* 5966 * Helper for pmap_ept_free_ptp. 5967 * tree[0] = &L2[L2idx] 5968 * tree[1] = &L3[L3idx] 5969 * tree[2] = &L4[L4idx] 5970 */ 5971 static void 5972 pmap_ept_get_tree(struct pmap *pmap, vaddr_t va, pd_entry_t **tree) 5973 { 5974 pt_entry_t *pteva; 5975 paddr_t ptepa; 5976 int i, index; 5977 5978 ptepa = pmap->pm_pdirpa[0]; 5979 for (i = PTP_LEVELS; i > 1; i--) { 5980 index = pl_pi(va, i); 5981 pteva = (pt_entry_t *)PMAP_DIRECT_MAP(ptepa); 5982 KASSERT(pmap_ept_valid_entry(pteva[index])); 5983 tree[i - 2] = &pteva[index]; 5984 ptepa = pmap_pte2pa(pteva[index]); 5985 } 5986 } 5987 5988 static void 5989 pmap_ept_free_ptp(struct pmap *pmap, struct vm_page *ptp, vaddr_t va) 5990 { 5991 pd_entry_t *tree[3]; 5992 int level; 5993 5994 KASSERT(pmap != pmap_kernel()); 5995 KASSERT(mutex_owned(&pmap->pm_lock)); 5996 KASSERT(kpreempt_disabled()); 5997 5998 pmap_ept_get_tree(pmap, va, tree); 5999 6000 level = 1; 6001 do { 6002 (void)pmap_pte_testset(tree[level - 1], 0); 6003 6004 pmap_freepage(pmap, ptp, level); 6005 if (level < PTP_LEVELS - 1) { 6006 ptp = pmap_find_ptp(pmap, va, level + 1); 6007 ptp->wire_count--; 6008 if (ptp->wire_count > 1) 6009 break; 6010 } 6011 } while (++level < PTP_LEVELS); 6012 pmap_pte_flush(); 6013 } 6014 6015 /* Allocate L4->L3->L2. Return L2. 
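* (The pages themselves were pre-allocated by pmap_get_ptp(); this
* function only wires them into the tree.)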
*/ 6016 static void 6017 pmap_ept_install_ptp(struct pmap *pmap, struct pmap_ptparray *pt, vaddr_t va) 6018 { 6019 struct vm_page *ptp; 6020 unsigned long index; 6021 pd_entry_t *pteva; 6022 paddr_t ptepa; 6023 int i; 6024 6025 KASSERT(pmap != pmap_kernel()); 6026 KASSERT(mutex_owned(&pmap->pm_lock)); 6027 KASSERT(kpreempt_disabled()); 6028 6029 /* 6030 * Now that we have all the pages looked up or allocated, 6031 * loop through again installing any new ones into the tree. 6032 */ 6033 ptepa = pmap->pm_pdirpa[0]; 6034 for (i = PTP_LEVELS; i > 1; i--) { 6035 index = pl_pi(va, i); 6036 pteva = (pt_entry_t *)PMAP_DIRECT_MAP(ptepa); 6037 6038 if (pmap_ept_valid_entry(pteva[index])) { 6039 KASSERT(!pt->alloced[i]); 6040 ptepa = pmap_pte2pa(pteva[index]); 6041 continue; 6042 } 6043 6044 ptp = pt->pg[i]; 6045 ptp->flags &= ~PG_BUSY; /* never busy */ 6046 ptp->wire_count = 1; 6047 pmap->pm_ptphint[i - 2] = ptp; 6048 ptepa = VM_PAGE_TO_PHYS(ptp); 6049 pmap_pte_set(&pteva[index], ptepa | EPT_R | EPT_W | EPT_X); 6050 6051 pmap_pte_flush(); 6052 pmap_stats_update(pmap, 1, 0); 6053 6054 /* 6055 * If we're not in the top level, increase the 6056 * wire count of the parent page. 6057 */ 6058 if (i < PTP_LEVELS) { 6059 pt->pg[i + 1]->wire_count++; 6060 } 6061 } 6062 } 6063 6064 static int 6065 pmap_ept_enter(struct pmap *pmap, vaddr_t va, paddr_t pa, vm_prot_t prot, 6066 u_int flags) 6067 { 6068 pt_entry_t *ptes, opte, npte; 6069 pt_entry_t *ptep; 6070 struct vm_page *ptp; 6071 struct vm_page *new_pg, *old_pg; 6072 struct pmap_page *new_pp, *old_pp; 6073 struct pv_entry *old_pve, *new_pve; 6074 bool wired = (flags & PMAP_WIRED) != 0; 6075 bool accessed; 6076 struct pmap_ptparray pt; 6077 int error; 6078 bool getptp, samepage, new_embedded; 6079 rb_tree_t *tree; 6080 6081 KASSERT(pmap_initialized); 6082 KASSERT(va < VM_MAXUSER_ADDRESS); 6083 6084 npte = pa | pmap_ept_prot(prot) | pmap_ept_type(flags); 6085 6086 if (wired) 6087 npte |= EPT_WIRED; 6088 if (flags & VM_PROT_ALL) { 6089 npte |= EPT_A; 6090 if (flags & VM_PROT_WRITE) { 6091 KASSERT((npte & EPT_W) != 0); 6092 npte |= EPT_D; 6093 } 6094 } 6095 6096 new_pg = PHYS_TO_VM_PAGE(pa); 6097 if (new_pg != NULL) { 6098 /* This is a managed page */ 6099 npte |= EPT_PVLIST; 6100 new_pp = VM_PAGE_TO_PP(new_pg); 6101 } else if ((new_pp = pmap_pv_tracked(pa)) != NULL) { 6102 /* This is an unmanaged pv-tracked page */ 6103 npte |= EPT_PVLIST; 6104 } else { 6105 new_pp = NULL; 6106 } 6107 6108 /* Begin by locking the pmap. */ 6109 mutex_enter(&pmap->pm_lock); 6110 6111 /* Look up the PTP. Allocate if none present. */ 6112 ptp = NULL; 6113 getptp = false; 6114 if (pmap != pmap_kernel()) { 6115 ptp = pmap_find_ptp(pmap, va, 1); 6116 if (ptp == NULL) { 6117 getptp = true; 6118 error = pmap_get_ptp(pmap, &pt, va, flags, &ptp); 6119 if (error != 0) { 6120 if (flags & PMAP_CANFAIL) { 6121 mutex_exit(&pmap->pm_lock); 6122 return error; 6123 } 6124 panic("%s: get ptp failed, error=%d", __func__, 6125 error); 6126 } 6127 } 6128 tree = &VM_PAGE_TO_PP(ptp)->pp_rb; 6129 } else { 6130 /* Embedded PV entries rely on this. */ 6131 KASSERT(va != 0); 6132 tree = &pmap_kernel_rb; 6133 } 6134 6135 /* 6136 * Look up the old PV entry at this VA (if any), and insert a new PV 6137 * entry if required for the new mapping. Temporarily track the old 6138 * and new mappings concurrently. Only after the old mapping is 6139 * evicted from the pmap will we remove its PV entry. 
Otherwise, 6140 * our picture of modified/accessed state for either page could get 6141 * out of sync (we need any P->V operation for either page to stall 6142 * on pmap->pm_lock until done here). 6143 */ 6144 new_pve = NULL; 6145 old_pve = NULL; 6146 samepage = false; 6147 new_embedded = false; 6148 6149 if (new_pp != NULL) { 6150 error = pmap_enter_pv(pmap, new_pp, ptp, va, &new_pve, 6151 &old_pve, &samepage, &new_embedded, tree); 6152 6153 /* 6154 * If a new pv_entry was needed and none was available, we 6155 * can go no further. 6156 */ 6157 if (error != 0) { 6158 if (flags & PMAP_CANFAIL) { 6159 if (getptp) { 6160 pmap_unget_ptp(pmap, &pt); 6161 } 6162 mutex_exit(&pmap->pm_lock); 6163 return error; 6164 } 6165 panic("%s: alloc pve failed", __func__); 6166 } 6167 } else { 6168 old_pve = pmap_treelookup_pv(pmap, ptp, tree, va); 6169 } 6170 6171 /* Map PTEs into address space. */ 6172 kpreempt_disable(); 6173 6174 /* Install any newly allocated PTPs. */ 6175 if (getptp) { 6176 pmap_ept_install_ptp(pmap, &pt, va); 6177 } 6178 6179 /* Check if there is an existing mapping. */ 6180 ptes = (pt_entry_t *)PMAP_DIRECT_MAP(VM_PAGE_TO_PHYS(ptp)); 6181 ptep = &ptes[pl1_pi(va)]; 6182 opte = *ptep; 6183 bool have_oldpa = pmap_ept_valid_entry(opte); 6184 paddr_t oldpa = pmap_pte2pa(opte); 6185 6186 /* 6187 * Update the pte. 6188 */ 6189 do { 6190 opte = *ptep; 6191 6192 /* 6193 * if the same page, inherit PTE_A and PTE_D. 6194 */ 6195 if (((opte ^ npte) & (PTE_FRAME | EPT_R)) == 0) { 6196 npte |= opte & (EPT_A | EPT_D); 6197 } 6198 } while (pmap_pte_cas(ptep, opte, npte) != opte); 6199 6200 /* 6201 * Done with the PTEs: they can now be unmapped. 6202 */ 6203 kpreempt_enable(); 6204 6205 /* 6206 * Update statistics and PTP's reference count. 6207 */ 6208 pmap_ept_stats_update_bypte(pmap, npte, opte); 6209 if (ptp != NULL) { 6210 if (!have_oldpa) { 6211 ptp->wire_count++; 6212 } 6213 /* Remember minimum VA in PTP. */ 6214 pmap_ptp_range_set(ptp, va); 6215 } 6216 KASSERT(ptp == NULL || ptp->wire_count > 1); 6217 6218 /* 6219 * If the same page, we can skip pv_entry handling. 6220 */ 6221 if (((opte ^ npte) & (PTE_FRAME | EPT_R)) == 0) { 6222 KASSERT(((opte ^ npte) & EPT_PVLIST) == 0); 6223 if ((npte & EPT_PVLIST) != 0) { 6224 KASSERT(samepage); 6225 pmap_check_pv(pmap, ptp, new_pp, va, true); 6226 } 6227 goto same_pa; 6228 } else if ((npte & EPT_PVLIST) != 0) { 6229 KASSERT(!samepage); 6230 } 6231 6232 /* 6233 * If old page is pv-tracked, remove pv_entry from its list. 6234 */ 6235 if ((~opte & (EPT_R | EPT_PVLIST)) == 0) { 6236 if ((old_pg = PHYS_TO_VM_PAGE(oldpa)) != NULL) { 6237 old_pp = VM_PAGE_TO_PP(old_pg); 6238 } else if ((old_pp = pmap_pv_tracked(oldpa)) == NULL) { 6239 panic("%s: EPT_PVLIST with pv-untracked page" 6240 " va = %#"PRIxVADDR 6241 " pa = %#" PRIxPADDR " (%#" PRIxPADDR ")", 6242 __func__, va, oldpa, atop(pa)); 6243 } 6244 6245 pmap_remove_pv(pmap, old_pp, ptp, va, old_pve, 6246 pmap_ept_to_pp_attrs(opte)); 6247 } else { 6248 KASSERT(old_pve == NULL); 6249 KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == NULL); 6250 } 6251 6252 /* 6253 * If new page is dynamically PV tracked, insert to tree. 6254 */ 6255 if (new_pve != NULL) { 6256 KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == NULL); 6257 old_pve = rb_tree_insert_node(tree, new_pve); 6258 KASSERT(old_pve == new_pve); 6259 pmap_check_pv(pmap, ptp, new_pp, va, true); 6260 } 6261 6262 same_pa: 6263 /* 6264 * shootdown tlb if necessary. 
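 *
 * An invalidation is only needed if the old entry may have been cached
 * by the CPU: with EPT A/D support the old entry must have been valid
 * with EPT_A set; without it, any valid (EPT_R) old entry has to be
 * assumed cached.  Even then it is only required when the frame or the
 * write permission changed.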
6265 */ 6266 6267 if (pmap_ept_has_ad) { 6268 accessed = (~opte & (EPT_R | EPT_A)) == 0; 6269 } else { 6270 accessed = (opte & EPT_R) != 0; 6271 } 6272 if (accessed && ((opte ^ npte) & (PTE_FRAME | EPT_W)) != 0) { 6273 pmap_tlb_shootdown(pmap, va, 0, TLBSHOOT_ENTER); 6274 } 6275 pmap_drain_pv(pmap); 6276 mutex_exit(&pmap->pm_lock); 6277 return 0; 6278 } 6279 6280 /* Pay close attention, this returns L2. */ 6281 static int 6282 pmap_ept_pdes_invalid(struct pmap *pmap, vaddr_t va, pd_entry_t *lastpde) 6283 { 6284 pt_entry_t *pteva; 6285 paddr_t ptepa; 6286 int i, index; 6287 6288 KASSERT(mutex_owned(&pmap->pm_lock)); 6289 6290 ptepa = pmap->pm_pdirpa[0]; 6291 for (i = PTP_LEVELS; i > 1; i--) { 6292 pteva = (pt_entry_t *)PMAP_DIRECT_MAP(ptepa); 6293 index = pl_pi(va, i); 6294 if (!pmap_ept_valid_entry(pteva[index])) 6295 return i; 6296 ptepa = pmap_pte2pa(pteva[index]); 6297 } 6298 if (lastpde != NULL) { 6299 *lastpde = pteva[index]; 6300 } 6301 6302 return 0; 6303 } 6304 6305 static bool 6306 pmap_ept_extract(struct pmap *pmap, vaddr_t va, paddr_t *pap) 6307 { 6308 pt_entry_t *ptes, pte; 6309 pd_entry_t pde; 6310 paddr_t ptppa, pa; 6311 bool rv; 6312 6313 #ifdef __HAVE_DIRECT_MAP 6314 if (va >= PMAP_DIRECT_BASE && va < PMAP_DIRECT_END) { 6315 if (pap != NULL) { 6316 *pap = PMAP_DIRECT_UNMAP(va); 6317 } 6318 return true; 6319 } 6320 #endif 6321 6322 rv = false; 6323 pa = 0; 6324 6325 mutex_enter(&pmap->pm_lock); 6326 kpreempt_disable(); 6327 6328 if (!pmap_ept_pdes_invalid(pmap, va, &pde)) { 6329 ptppa = pmap_pte2pa(pde); 6330 ptes = (pt_entry_t *)PMAP_DIRECT_MAP(ptppa); 6331 pte = ptes[pl1_pi(va)]; 6332 if (__predict_true((pte & EPT_R) != 0)) { 6333 pa = pmap_pte2pa(pte) | (va & (NBPD_L1 - 1)); 6334 rv = true; 6335 } 6336 } 6337 6338 kpreempt_enable(); 6339 mutex_exit(&pmap->pm_lock); 6340 6341 if (pap != NULL) { 6342 *pap = pa; 6343 } 6344 return rv; 6345 } 6346 6347 static bool 6348 pmap_ept_remove_pte(struct pmap *pmap, struct vm_page *ptp, pt_entry_t *pte, 6349 vaddr_t va) 6350 { 6351 struct pv_entry *pve; 6352 struct vm_page *pg; 6353 struct pmap_page *pp; 6354 pt_entry_t opte; 6355 bool accessed; 6356 6357 KASSERT(pmap != pmap_kernel()); 6358 KASSERT(mutex_owned(&pmap->pm_lock)); 6359 KASSERT(kpreempt_disabled()); 6360 6361 if (!pmap_ept_valid_entry(*pte)) { 6362 /* VA not mapped. */ 6363 return false; 6364 } 6365 6366 /* Atomically save the old PTE and zap it. */ 6367 opte = pmap_pte_testset(pte, 0); 6368 if (!pmap_ept_valid_entry(opte)) { 6369 return false; 6370 } 6371 6372 pmap_ept_stats_update_bypte(pmap, 0, opte); 6373 6374 if (ptp) { 6375 /* 6376 * Dropping a PTE. Make sure that the PDE is flushed. 6377 */ 6378 ptp->wire_count--; 6379 if (ptp->wire_count <= 1) { 6380 opte |= EPT_A; 6381 } 6382 } 6383 6384 if (pmap_ept_has_ad) { 6385 accessed = (opte & EPT_A) != 0; 6386 } else { 6387 accessed = true; 6388 } 6389 if (accessed) { 6390 pmap_tlb_shootdown(pmap, va, 0, TLBSHOOT_REMOVE_PTE); 6391 } 6392 6393 /* 6394 * If we are not on a pv list - we are done. 6395 */ 6396 if ((opte & EPT_PVLIST) == 0) { 6397 KASSERTMSG((PHYS_TO_VM_PAGE(pmap_pte2pa(opte)) == NULL), 6398 "managed page without EPT_PVLIST for %#"PRIxVADDR, va); 6399 KASSERTMSG((pmap_pv_tracked(pmap_pte2pa(opte)) == NULL), 6400 "pv-tracked page without EPT_PVLIST for %#"PRIxVADDR, va); 6401 KASSERT(pmap_treelookup_pv(pmap, ptp, (ptp != NULL ? 
6402 &VM_PAGE_TO_PP(ptp)->pp_rb : &pmap_kernel_rb), va) == NULL); 6403 return true; 6404 } 6405 6406 if ((pg = PHYS_TO_VM_PAGE(pmap_pte2pa(opte))) != NULL) { 6407 pp = VM_PAGE_TO_PP(pg); 6408 } else if ((pp = pmap_pv_tracked(pmap_pte2pa(opte))) == NULL) { 6409 paddr_t pa = pmap_pte2pa(opte); 6410 panic("%s: EPT_PVLIST with pv-untracked page" 6411 " va = %#"PRIxVADDR"pa = %#"PRIxPADDR" (%#"PRIxPADDR")", 6412 __func__, va, pa, atop(pa)); 6413 } 6414 6415 /* Sync R/M bits. */ 6416 pve = pmap_lookup_pv(pmap, ptp, pp, va); 6417 pmap_remove_pv(pmap, pp, ptp, va, pve, pmap_ept_to_pp_attrs(opte)); 6418 return true; 6419 } 6420 6421 static void 6422 pmap_ept_remove_ptes(struct pmap *pmap, struct vm_page *ptp, vaddr_t ptpva, 6423 vaddr_t startva, vaddr_t endva) 6424 { 6425 pt_entry_t *pte = (pt_entry_t *)ptpva; 6426 6427 KASSERT(pmap != pmap_kernel()); 6428 KASSERT(mutex_owned(&pmap->pm_lock)); 6429 KASSERT(kpreempt_disabled()); 6430 6431 /* 6432 * mappings are very often sparse, so clip the given range to the 6433 * range of PTEs that are known present in the PTP. 6434 */ 6435 pmap_ptp_range_clip(ptp, &startva, &pte); 6436 6437 /* 6438 * note that ptpva points to the PTE that maps startva. this may 6439 * or may not be the first PTE in the PTP. 6440 * 6441 * we loop through the PTP while there are still PTEs to look at 6442 * and the wire_count is greater than 1 (because we use the wire_count 6443 * to keep track of the number of real PTEs in the PTP). 6444 */ 6445 while (startva < endva && (ptp == NULL || ptp->wire_count > 1)) { 6446 (void)pmap_ept_remove_pte(pmap, ptp, pte, startva); 6447 startva += PAGE_SIZE; 6448 pte++; 6449 } 6450 } 6451 6452 static void 6453 pmap_ept_remove(struct pmap *pmap, vaddr_t sva, vaddr_t eva) 6454 { 6455 pt_entry_t *ptes; 6456 pd_entry_t pde; 6457 paddr_t ptppa; 6458 vaddr_t blkendva, va = sva; 6459 struct vm_page *ptp; 6460 6461 mutex_enter(&pmap->pm_lock); 6462 kpreempt_disable(); 6463 6464 for (/* null */ ; va < eva ; va = blkendva) { 6465 int lvl; 6466 6467 /* determine range of block */ 6468 blkendva = x86_round_pdr(va+1); 6469 if (blkendva > eva) 6470 blkendva = eva; 6471 6472 lvl = pmap_ept_pdes_invalid(pmap, va, &pde); 6473 if (lvl != 0) { 6474 /* Skip a range corresponding to an invalid pde. */ 6475 blkendva = (va & ptp_frames[lvl - 1]) + nbpd[lvl - 1]; 6476 continue; 6477 } 6478 6479 /* PA of the PTP */ 6480 ptppa = pmap_pte2pa(pde); 6481 6482 ptp = pmap_find_ptp(pmap, va, 1); 6483 KASSERTMSG(ptp != NULL, "%s: unmanaged PTP detected", 6484 __func__); 6485 6486 ptes = (pt_entry_t *)PMAP_DIRECT_MAP(ptppa); 6487 6488 pmap_ept_remove_ptes(pmap, ptp, (vaddr_t)&ptes[pl1_pi(va)], va, 6489 blkendva); 6490 6491 /* If PTP is no longer being used, free it. 
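 *
 * wire_count counts the valid PTEs held by the PTP plus one for the
 * PTP itself, so a value of 1 or less means the page maps nothing and
 * can be freed along with any now-empty intermediate levels.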
*/ 6492 if (ptp && ptp->wire_count <= 1) { 6493 pmap_ept_free_ptp(pmap, ptp, va); 6494 } 6495 } 6496 6497 kpreempt_enable(); 6498 pmap_drain_pv(pmap); 6499 mutex_exit(&pmap->pm_lock); 6500 } 6501 6502 static int 6503 pmap_ept_sync_pv(struct vm_page *ptp, vaddr_t va, paddr_t pa, int clearbits, 6504 uint8_t *oattrs, pt_entry_t *optep) 6505 { 6506 struct pmap *pmap; 6507 pt_entry_t *ptep; 6508 pt_entry_t opte; 6509 pt_entry_t npte; 6510 pt_entry_t expect; 6511 bool need_shootdown; 6512 6513 expect = pmap_pa2pte(pa) | EPT_R; 6514 pmap = ptp_to_pmap(ptp); 6515 6516 if (clearbits != ~0) { 6517 KASSERT((clearbits & ~(PP_ATTRS_D|PP_ATTRS_A|PP_ATTRS_W)) == 0); 6518 clearbits = pmap_pp_attrs_to_ept(clearbits); 6519 } 6520 6521 ptep = pmap_map_pte(pmap, ptp, va); 6522 do { 6523 opte = *ptep; 6524 KASSERT((opte & (EPT_D | EPT_A)) != EPT_D); 6525 KASSERT((opte & (EPT_A | EPT_R)) != EPT_A); 6526 KASSERT(opte == 0 || (opte & EPT_R) != 0); 6527 if ((opte & (PTE_FRAME | EPT_R)) != expect) { 6528 /* 6529 * We lost a race with a V->P operation like 6530 * pmap_remove(). Wait for the competitor 6531 * reflecting pte bits into mp_attrs. 6532 */ 6533 pmap_unmap_pte(); 6534 return EAGAIN; 6535 } 6536 6537 /* 6538 * Check if there's anything to do on this PTE. 6539 */ 6540 if ((opte & clearbits) == 0) { 6541 need_shootdown = false; 6542 break; 6543 } 6544 6545 /* 6546 * We need a shootdown if the PTE is cached (EPT_A) ... 6547 * ... Unless we are clearing only the EPT_W bit and 6548 * it isn't cached as RW (EPT_D). 6549 */ 6550 if (pmap_ept_has_ad) { 6551 need_shootdown = (opte & EPT_A) != 0 && 6552 !(clearbits == EPT_W && (opte & EPT_D) == 0); 6553 } else { 6554 need_shootdown = true; 6555 } 6556 6557 npte = opte & ~clearbits; 6558 6559 /* 6560 * If we need a shootdown anyway, clear EPT_A and EPT_D. 6561 */ 6562 if (need_shootdown) { 6563 npte &= ~(EPT_A | EPT_D); 6564 } 6565 KASSERT((npte & (EPT_D | EPT_A)) != EPT_D); 6566 KASSERT((npte & (EPT_A | EPT_R)) != EPT_A); 6567 KASSERT(npte == 0 || (opte & EPT_R) != 0); 6568 } while (pmap_pte_cas(ptep, opte, npte) != opte); 6569 6570 if (need_shootdown) { 6571 pmap_tlb_shootdown(pmap, va, 0, TLBSHOOT_SYNC_PV); 6572 } 6573 pmap_unmap_pte(); 6574 6575 *oattrs = pmap_ept_to_pp_attrs(opte); 6576 if (optep != NULL) 6577 *optep = opte; 6578 return 0; 6579 } 6580 6581 static void 6582 pmap_ept_pp_remove_ent(struct pmap *pmap, struct vm_page *ptp, pt_entry_t opte, 6583 vaddr_t va) 6584 { 6585 6586 KASSERT(mutex_owned(&pmap->pm_lock)); 6587 6588 pmap_ept_stats_update_bypte(pmap, 0, opte); 6589 ptp->wire_count--; 6590 if (ptp->wire_count <= 1) { 6591 pmap_ept_free_ptp(pmap, ptp, va); 6592 } 6593 } 6594 6595 static void 6596 pmap_ept_write_protect(struct pmap *pmap, vaddr_t sva, vaddr_t eva, vm_prot_t prot) 6597 { 6598 pt_entry_t bit_rem; 6599 pt_entry_t *ptes, *spte; 6600 pt_entry_t opte, npte; 6601 pd_entry_t pde; 6602 paddr_t ptppa; 6603 vaddr_t va; 6604 bool modified; 6605 6606 bit_rem = 0; 6607 if (!(prot & VM_PROT_WRITE)) 6608 bit_rem = EPT_W; 6609 6610 sva &= PTE_FRAME; 6611 eva &= PTE_FRAME; 6612 6613 /* Acquire pmap. 
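 *
 * Revoking write access amounts to clearing EPT_W on every valid PTE
 * in the range; a shootdown is issued only for entries that may be
 * cached as writable (EPT_D set, or unconditionally when the CPU has
 * no EPT A/D support).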
*/ 6614 mutex_enter(&pmap->pm_lock); 6615 kpreempt_disable(); 6616 6617 for (va = sva; va < eva; va += PAGE_SIZE) { 6618 if (pmap_ept_pdes_invalid(pmap, va, &pde)) { 6619 continue; 6620 } 6621 6622 ptppa = pmap_pte2pa(pde); 6623 ptes = (pt_entry_t *)PMAP_DIRECT_MAP(ptppa); 6624 spte = &ptes[pl1_pi(va)]; 6625 6626 do { 6627 opte = *spte; 6628 if (!pmap_ept_valid_entry(opte)) { 6629 goto next; 6630 } 6631 npte = (opte & ~bit_rem); 6632 } while (pmap_pte_cas(spte, opte, npte) != opte); 6633 6634 if (pmap_ept_has_ad) { 6635 modified = (opte & EPT_D) != 0; 6636 } else { 6637 modified = true; 6638 } 6639 if (modified) { 6640 vaddr_t tva = x86_ptob(spte - ptes); 6641 pmap_tlb_shootdown(pmap, tva, 0, 6642 TLBSHOOT_WRITE_PROTECT); 6643 } 6644 next:; 6645 } 6646 6647 kpreempt_enable(); 6648 mutex_exit(&pmap->pm_lock); 6649 } 6650 6651 static void 6652 pmap_ept_unwire(struct pmap *pmap, vaddr_t va) 6653 { 6654 pt_entry_t *ptes, *ptep, opte; 6655 pd_entry_t pde; 6656 paddr_t ptppa; 6657 6658 /* Acquire pmap. */ 6659 mutex_enter(&pmap->pm_lock); 6660 kpreempt_disable(); 6661 6662 if (pmap_ept_pdes_invalid(pmap, va, &pde)) { 6663 panic("%s: invalid PDE va=%#" PRIxVADDR, __func__, va); 6664 } 6665 6666 ptppa = pmap_pte2pa(pde); 6667 ptes = (pt_entry_t *)PMAP_DIRECT_MAP(ptppa); 6668 ptep = &ptes[pl1_pi(va)]; 6669 opte = *ptep; 6670 KASSERT(pmap_ept_valid_entry(opte)); 6671 6672 if (opte & EPT_WIRED) { 6673 pt_entry_t npte = opte & ~EPT_WIRED; 6674 6675 opte = pmap_pte_testset(ptep, npte); 6676 pmap_ept_stats_update_bypte(pmap, npte, opte); 6677 } else { 6678 printf("%s: wiring for pmap %p va %#" PRIxVADDR 6679 "did not change!\n", __func__, pmap, va); 6680 } 6681 6682 /* Release pmap. */ 6683 kpreempt_enable(); 6684 mutex_exit(&pmap->pm_lock); 6685 } 6686 6687 /* -------------------------------------------------------------------------- */ 6688 6689 void 6690 pmap_ept_transform(struct pmap *pmap) 6691 { 6692 pmap->pm_enter = pmap_ept_enter; 6693 pmap->pm_extract = pmap_ept_extract; 6694 pmap->pm_remove = pmap_ept_remove; 6695 pmap->pm_sync_pv = pmap_ept_sync_pv; 6696 pmap->pm_pp_remove_ent = pmap_ept_pp_remove_ent; 6697 pmap->pm_write_protect = pmap_ept_write_protect; 6698 pmap->pm_unwire = pmap_ept_unwire; 6699 6700 memset(pmap->pm_pdir, 0, PAGE_SIZE); 6701 } 6702 6703 #endif /* __HAVE_DIRECT_MAP && __x86_64__ && !XENPV */ 6704
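
/*
 * Illustrative sketch, not compiled here: how the EPT variants above are
 * expected to be put to use.  The guest pmap is created through the
 * normal MI interface and then switched to EPT format while still empty,
 * since pmap_ept_transform() wipes the top-level directory page.  The
 * caller is presumably a hypervisor backend (NVMM's Intel VMX code); the
 * exact call site is outside this file.
 *
 *	struct pmap *gpmap = pmap_create();
 *	pmap_ept_transform(gpmap);
 *	// pm_enter/pm_remove/... now operate on EPT-format entries.
 */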