1 /* $NetBSD: pmap.c,v 1.409 2021/02/06 21:24:19 jdolecek Exp $ */ 2 3 /* 4 * Copyright (c) 2008, 2010, 2016, 2017, 2019, 2020 The NetBSD Foundation, Inc. 5 * All rights reserved. 6 * 7 * This code is derived from software contributed to The NetBSD Foundation 8 * by Andrew Doran, and by Maxime Villard. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 29 * POSSIBILITY OF SUCH DAMAGE. 30 */ 31 32 /* 33 * Copyright (c) 2007 Manuel Bouyer. 34 * 35 * Redistribution and use in source and binary forms, with or without 36 * modification, are permitted provided that the following conditions 37 * are met: 38 * 1. Redistributions of source code must retain the above copyright 39 * notice, this list of conditions and the following disclaimer. 40 * 2. Redistributions in binary form must reproduce the above copyright 41 * notice, this list of conditions and the following disclaimer in the 42 * documentation and/or other materials provided with the distribution. 43 * 44 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 45 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 46 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 47 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 48 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 49 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 50 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 51 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 52 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 53 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 54 */ 55 56 /* 57 * Copyright (c) 2006 Mathieu Ropert <mro@adviseo.fr> 58 * 59 * Permission to use, copy, modify, and distribute this software for any 60 * purpose with or without fee is hereby granted, provided that the above 61 * copyright notice and this permission notice appear in all copies. 62 * 63 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 64 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 65 * MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 66 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 67 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 68 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 69 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 70 */ 71 72 /* 73 * Copyright 2001 (c) Wasabi Systems, Inc. 74 * All rights reserved. 75 * 76 * Written by Frank van der Linden for Wasabi Systems, Inc. 77 * 78 * Redistribution and use in source and binary forms, with or without 79 * modification, are permitted provided that the following conditions 80 * are met: 81 * 1. Redistributions of source code must retain the above copyright 82 * notice, this list of conditions and the following disclaimer. 83 * 2. Redistributions in binary form must reproduce the above copyright 84 * notice, this list of conditions and the following disclaimer in the 85 * documentation and/or other materials provided with the distribution. 86 * 3. All advertising materials mentioning features or use of this software 87 * must display the following acknowledgement: 88 * This product includes software developed for the NetBSD Project by 89 * Wasabi Systems, Inc. 90 * 4. The name of Wasabi Systems, Inc. may not be used to endorse 91 * or promote products derived from this software without specific prior 92 * written permission. 93 * 94 * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND 95 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 96 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 97 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL WASABI SYSTEMS, INC 98 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 99 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 100 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 101 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 102 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 103 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 104 * POSSIBILITY OF SUCH DAMAGE. 105 */ 106 107 /* 108 * Copyright (c) 1997 Charles D. Cranor and Washington University. 109 * All rights reserved. 110 * 111 * Redistribution and use in source and binary forms, with or without 112 * modification, are permitted provided that the following conditions 113 * are met: 114 * 1. Redistributions of source code must retain the above copyright 115 * notice, this list of conditions and the following disclaimer. 116 * 2. Redistributions in binary form must reproduce the above copyright 117 * notice, this list of conditions and the following disclaimer in the 118 * documentation and/or other materials provided with the distribution. 119 * 120 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 121 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 122 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
123 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 124 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 125 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 126 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 127 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 128 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 129 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 130 */ 131 132 #include <sys/cdefs.h> 133 __KERNEL_RCSID(0, "$NetBSD: pmap.c,v 1.409 2021/02/06 21:24:19 jdolecek Exp $"); 134 135 #include "opt_user_ldt.h" 136 #include "opt_lockdebug.h" 137 #include "opt_multiprocessor.h" 138 #include "opt_xen.h" 139 #include "opt_svs.h" 140 #include "opt_kaslr.h" 141 142 #define __MUTEX_PRIVATE /* for assertions */ 143 144 #include <sys/param.h> 145 #include <sys/systm.h> 146 #include <sys/proc.h> 147 #include <sys/pool.h> 148 #include <sys/kernel.h> 149 #include <sys/atomic.h> 150 #include <sys/cpu.h> 151 #include <sys/intr.h> 152 #include <sys/xcall.h> 153 #include <sys/kcore.h> 154 #include <sys/kmem.h> 155 #include <sys/asan.h> 156 #include <sys/msan.h> 157 #include <sys/entropy.h> 158 159 #include <uvm/uvm.h> 160 #include <uvm/pmap/pmap_pvt.h> 161 162 #include <dev/isa/isareg.h> 163 164 #include <machine/specialreg.h> 165 #include <machine/gdt.h> 166 #include <machine/isa_machdep.h> 167 #include <machine/cpuvar.h> 168 #include <machine/cputypes.h> 169 170 #include <x86/pmap_pv.h> 171 172 #include <x86/i82489reg.h> 173 #include <x86/i82489var.h> 174 175 #ifdef XEN 176 #include <xen/include/public/xen.h> 177 #include <xen/hypervisor.h> 178 #include <xen/xenpmap.h> 179 #endif 180 181 /* 182 * general info: 183 * 184 * - for an explanation of how the x86 MMU hardware works see 185 * the comments in <machine/pte.h>. 186 * 187 * - for an explanation of the general memory structure used by 188 * this pmap (including the recursive mapping), see the comments 189 * in <machine/pmap.h>. 190 * 191 * this file contains the code for the "pmap module." the module's 192 * job is to manage the hardware's virtual to physical address mappings. 193 * note that there are two levels of mapping in the VM system: 194 * 195 * [1] the upper layer of the VM system uses vm_map's and vm_map_entry's 196 * to map ranges of virtual address space to objects/files. for 197 * example, the vm_map may say: "map VA 0x1000 to 0x22000 read-only 198 * to the file /bin/ls starting at offset zero." note that 199 * the upper layer mapping is not concerned with how individual 200 * vm_pages are mapped. 201 * 202 * [2] the lower layer of the VM system (the pmap) maintains the mappings 203 * from virtual addresses. it is concerned with which vm_page is 204 * mapped where. for example, when you run /bin/ls and start 205 * at page 0x1000 the fault routine may lookup the correct page 206 * of the /bin/ls file and then ask the pmap layer to establish 207 * a mapping for it. 208 * 209 * note that information in the lower layer of the VM system can be 210 * thrown away since it can easily be reconstructed from the info 211 * in the upper layer. 
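 *
 * as a rough illustration of that division of labor, the fault on the
 * /bin/ls page above would end (simplified sketch, not code from this
 * file; "map" and "pg" are made-up locals) with something like:
 *
 *	pmap_enter(vm_map_pmap(map), va, VM_PAGE_TO_PHYS(pg),
 *	    VM_PROT_READ, VM_PROT_READ | PMAP_CANFAIL);
 *	pmap_update(vm_map_pmap(map));
 *
 * and if the pmap layer later throws that mapping away, the upper
 * layer simply re-establishes it on the next fault.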
 *
 * data structures we use include:
 *
 *  - struct pmap: describes the address space of one thread
 *  - struct pmap_page: describes one pv-tracked page, without
 *    necessarily a corresponding vm_page
 *  - struct pv_entry: describes one <PMAP,VA> mapping of a PA
 *  - pmap_page::pp_pvlist: there is one list per pv-tracked page of
 *    physical memory.  the pp_pvlist points to a list of pv_entry
 *    structures which describe all the <PMAP,VA> pairs that this
 *    page is mapped in.  this is critical for page based operations
 *    such as pmap_page_protect() [change protection on _all_ mappings
 *    of a page]
 */

/*
 * Locking
 *
 * We have the following locks that we must deal with, listed in the order
 * that they are acquired:
 *
 * pg->uobject->vmobjlock, pg->uanon->an_lock
 *
 *	For managed pages, these per-object locks are taken by the VM system
 *	before calling into the pmap module - either a read or write hold.
 *	The lock hold prevents pages from changing identity while the pmap is
 *	operating on them.  For example, the same lock is held across a call
 *	to pmap_remove() and the following call to pmap_update(), so that a
 *	page does not gain a new identity while its TLB visibility is stale.
 *
 * pmap->pm_lock
 *
 *	This lock protects the fields in the pmap structure including the
 *	non-kernel PDEs in the PDP, the PTEs, and PTPs and connected data
 *	structures.  For modifying unmanaged kernel PTEs it is not needed as
 *	kernel PDEs are never freed, and the kernel is expected to be self
 *	consistent (and the lock can't be taken for unmanaged kernel PTEs,
 *	because they can be modified from interrupt context).
 *
 * pmaps_lock
 *
 *	This lock protects the list of active pmaps (headed by "pmaps").
 *	It's acquired when adding or removing pmaps or adjusting kernel PDEs.
 *
 * pp_lock
 *
 *	This per-page lock protects PV entry lists and the embedded PV entry
 *	in each vm_page, allowing for concurrent operation on pages by
 *	different pmaps.  This is a spin mutex at IPL_VM, because at the
 *	points it is taken context switching is usually not tolerable, and
 *	spin mutexes must block out interrupts that could take kernel_lock.
 */

/* uvm_object is abused here to index pmap_pages; make assertions happy. */
#ifdef DIAGNOSTIC
#define	PMAP_DUMMY_LOCK(pm)	rw_enter(&(pm)->pm_dummy_lock, RW_WRITER)
#define	PMAP_DUMMY_UNLOCK(pm)	rw_exit(&(pm)->pm_dummy_lock)
#else
#define	PMAP_DUMMY_LOCK(pm)
#define	PMAP_DUMMY_UNLOCK(pm)
#endif

static const struct uvm_pagerops pmap_pager = {
	/* nothing */
};

const vaddr_t ptp_masks[] = PTP_MASK_INITIALIZER;
const vaddr_t ptp_frames[] = PTP_FRAME_INITIALIZER;
const int ptp_shifts[] = PTP_SHIFT_INITIALIZER;
const long nkptpmax[] = NKPTPMAX_INITIALIZER;
const long nbpd[] = NBPD_INITIALIZER;
#ifdef i386
pd_entry_t * const normal_pdes[] = PDES_INITIALIZER;
#else
pd_entry_t *normal_pdes[3];
#endif

long nkptp[] = NKPTP_INITIALIZER;

struct pmap_head pmaps;
kmutex_t pmaps_lock __cacheline_aligned;

struct pcpu_area *pcpuarea __read_mostly;

static vaddr_t pmap_maxkvaddr;

/*
 * Misc. event counters.
300 */ 301 struct evcnt pmap_iobmp_evcnt; 302 struct evcnt pmap_ldt_evcnt; 303 304 /* 305 * PAT 306 */ 307 static bool cpu_pat_enabled __read_mostly = false; 308 309 /* 310 * Global data structures 311 */ 312 313 static struct pmap kernel_pmap_store __cacheline_aligned; /* kernel's pmap */ 314 struct pmap *const kernel_pmap_ptr = &kernel_pmap_store; 315 static rb_tree_t pmap_kernel_rb __cacheline_aligned; 316 317 struct bootspace bootspace __read_mostly; 318 struct slotspace slotspace __read_mostly; 319 320 /* Set to PTE_NX if supported. */ 321 pd_entry_t pmap_pg_nx __read_mostly = 0; 322 323 /* Set to PTE_G if supported. */ 324 pd_entry_t pmap_pg_g __read_mostly = 0; 325 326 /* Set to true if large pages are supported. */ 327 int pmap_largepages __read_mostly = 0; 328 329 paddr_t lowmem_rsvd __read_mostly; 330 paddr_t avail_start __read_mostly; /* PA of first available physical page */ 331 paddr_t avail_end __read_mostly; /* PA of last available physical page */ 332 333 #ifdef XENPV 334 paddr_t pmap_pa_start; /* PA of first physical page for this domain */ 335 paddr_t pmap_pa_end; /* PA of last physical page for this domain */ 336 #endif 337 338 #define VM_PAGE_TO_PP(pg) (&(pg)->mdpage.mp_pp) 339 #define PMAP_CHECK_PP(pp) \ 340 KASSERTMSG((pp)->pp_lock.mtx_ipl._ipl == IPL_VM, "bad pmap_page %p", pp) 341 342 #define PAGE_ALIGNED(pp) \ 343 __builtin_assume_aligned((void *)(pp), PAGE_SIZE) 344 345 /* 346 * Other data structures 347 */ 348 349 static pt_entry_t protection_codes[8] __read_mostly; 350 351 static bool pmap_initialized __read_mostly = false; /* pmap_init done yet? */ 352 353 /* 354 * The following two vaddr_t's are used during system startup to keep track of 355 * how much of the kernel's VM space we have used. Once the system is started, 356 * the management of the remaining kernel VM space is turned over to the 357 * kernel_map vm_map. 358 */ 359 static vaddr_t virtual_avail __read_mostly; /* VA of first free KVA */ 360 static vaddr_t virtual_end __read_mostly; /* VA of last free KVA */ 361 362 #ifndef XENPV 363 /* 364 * LAPIC virtual address, and fake physical address. 
365 */ 366 volatile vaddr_t local_apic_va __read_mostly; 367 paddr_t local_apic_pa __read_mostly; 368 #endif 369 370 /* 371 * pool that pmap structures are allocated from 372 */ 373 struct pool_cache pmap_cache; 374 static int pmap_ctor(void *, void *, int); 375 static void pmap_dtor(void *, void *); 376 377 /* 378 * pv_page cache 379 */ 380 static struct pool_cache pmap_pvp_cache; 381 382 #ifdef __HAVE_DIRECT_MAP 383 vaddr_t pmap_direct_base __read_mostly; 384 vaddr_t pmap_direct_end __read_mostly; 385 #endif 386 387 #ifndef __HAVE_DIRECT_MAP 388 /* 389 * Special VAs and the PTEs that map them 390 */ 391 static pt_entry_t *early_zero_pte; 392 static void pmap_vpage_cpualloc(struct cpu_info *); 393 #ifdef XENPV 394 char *early_zerop; /* also referenced from xen_locore() */ 395 #else 396 static char *early_zerop; 397 #endif 398 #endif 399 400 int pmap_enter_default(pmap_t, vaddr_t, paddr_t, vm_prot_t, u_int); 401 402 /* PDP pool and its callbacks */ 403 static struct pool pmap_pdp_pool; 404 static void pmap_pdp_init(pd_entry_t *); 405 static void pmap_pdp_fini(pd_entry_t *); 406 407 #ifdef PAE 408 /* need to allocate items of 4 pages */ 409 static void *pmap_pdp_alloc(struct pool *, int); 410 static void pmap_pdp_free(struct pool *, void *); 411 static struct pool_allocator pmap_pdp_allocator = { 412 .pa_alloc = pmap_pdp_alloc, 413 .pa_free = pmap_pdp_free, 414 .pa_pagesz = PAGE_SIZE * PDP_SIZE, 415 }; 416 #endif 417 418 extern vaddr_t idt_vaddr; 419 extern paddr_t idt_paddr; 420 extern vaddr_t gdt_vaddr; 421 extern paddr_t gdt_paddr; 422 extern vaddr_t ldt_vaddr; 423 extern paddr_t ldt_paddr; 424 425 #ifdef i386 426 /* stuff to fix the pentium f00f bug */ 427 extern vaddr_t pentium_idt_vaddr; 428 #endif 429 430 /* Array of freshly allocated PTPs, for pmap_get_ptp(). */ 431 struct pmap_ptparray { 432 struct vm_page *pg[PTP_LEVELS + 1]; 433 bool alloced[PTP_LEVELS + 1]; 434 }; 435 436 /* 437 * PV entries are allocated in page-sized chunks and cached per-pmap to 438 * avoid intense pressure on memory allocators. 
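 * Each chunk is a single page laid out roughly as follows, with the
 * struct pv_page header overlaying the first pv_entry slot (see
 * pmap_pvp_ctor() below):
 *
 *	+----------------+-----------+-----------+-----+--------------------+
 *	| struct pv_page | pv_entry  | pv_entry  | ... | pv_entry           |
 *	| (header)       | [0]       | [1]       |     | [PVE_PER_PVP - 1]  |
 *	+----------------+-----------+-----------+-----+--------------------+
 *
 * and hangs off its owning pmap on one of three lists (pm_pvp_full,
 * pm_pvp_part, pm_pvp_empty) according to how many entries remain free.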
 */

struct pv_page {
	LIST_HEAD(, pv_entry) pvp_pves;
	LIST_ENTRY(pv_page) pvp_list;
	long pvp_nfree;
	struct pmap *pvp_pmap;
};

#define	PVE_PER_PVP	((PAGE_SIZE / sizeof(struct pv_entry)) - 1)

/*
 * PV tree prototypes
 */

static int pmap_compare_key(void *, const void *, const void *);
static int pmap_compare_nodes(void *, const void *, const void *);

/* Red-black tree */
static const rb_tree_ops_t pmap_rbtree_ops = {
	.rbto_compare_nodes = pmap_compare_nodes,
	.rbto_compare_key = pmap_compare_key,
	.rbto_node_offset = offsetof(struct pv_entry, pve_rb),
	.rbto_context = NULL
};

/*
 * Local prototypes
 */

#ifdef __HAVE_PCPU_AREA
static void pmap_init_pcpu(void);
#endif
#ifdef __HAVE_DIRECT_MAP
static void pmap_init_directmap(struct pmap *);
#endif
#if !defined(XENPV)
static void pmap_remap_global(void);
#endif
#ifndef XENPV
static void pmap_init_lapic(void);
static void pmap_remap_largepages(void);
#endif

static int pmap_get_ptp(struct pmap *, struct pmap_ptparray *, vaddr_t, int,
    struct vm_page **);
static void pmap_unget_ptp(struct pmap *, struct pmap_ptparray *);
static void pmap_install_ptp(struct pmap *, struct pmap_ptparray *, vaddr_t,
    pd_entry_t * const *);
static struct vm_page *pmap_find_ptp(struct pmap *, vaddr_t, int);
static void pmap_freepage(struct pmap *, struct vm_page *, int);
static void pmap_free_ptp(struct pmap *, struct vm_page *, vaddr_t,
    pt_entry_t *, pd_entry_t * const *);
static bool pmap_remove_pte(struct pmap *, struct vm_page *, pt_entry_t *,
    vaddr_t);
static void pmap_remove_ptes(struct pmap *, struct vm_page *, vaddr_t, vaddr_t,
    vaddr_t);
static int pmap_pvp_ctor(void *, void *, int);
static void pmap_pvp_dtor(void *, void *);
static struct pv_entry *pmap_alloc_pv(struct pmap *);
static void pmap_free_pv(struct pmap *, struct pv_entry *);
static void pmap_drain_pv(struct pmap *);

static void pmap_alloc_level(struct pmap *, vaddr_t, long *);

static void pmap_load1(struct lwp *, struct pmap *, struct pmap *);
static void pmap_reactivate(struct pmap *);

/*
 * p m a p   h e l p e r   f u n c t i o n s
 */

static inline void
pmap_stats_update(struct pmap *pmap, int resid_diff, int wired_diff)
{

	KASSERT(cold || mutex_owned(&pmap->pm_lock));
	pmap->pm_stats.resident_count += resid_diff;
	pmap->pm_stats.wired_count += wired_diff;
}

static inline void
pmap_stats_update_bypte(struct pmap *pmap, pt_entry_t npte, pt_entry_t opte)
{
	int resid_diff = ((npte & PTE_P) ? 1 : 0) - ((opte & PTE_P) ? 1 : 0);
	int wired_diff = ((npte & PTE_WIRED) ? 1 : 0) - ((opte & PTE_WIRED) ?
1 : 0); 525 526 KASSERT((npte & (PTE_P | PTE_WIRED)) != PTE_WIRED); 527 KASSERT((opte & (PTE_P | PTE_WIRED)) != PTE_WIRED); 528 529 pmap_stats_update(pmap, resid_diff, wired_diff); 530 } 531 532 /* 533 * ptp_to_pmap: lookup pmap by ptp 534 */ 535 static inline struct pmap * 536 ptp_to_pmap(struct vm_page *ptp) 537 { 538 struct pmap *pmap; 539 540 if (ptp == NULL) { 541 return pmap_kernel(); 542 } 543 pmap = (struct pmap *)ptp->uobject; 544 KASSERT(pmap != NULL); 545 KASSERT(&pmap->pm_obj[0] == ptp->uobject); 546 return pmap; 547 } 548 549 static inline struct pv_pte * 550 pve_to_pvpte(struct pv_entry *pve) 551 { 552 553 if (pve == NULL) 554 return NULL; 555 KASSERT((void *)&pve->pve_pte == (void *)pve); 556 return &pve->pve_pte; 557 } 558 559 static inline struct pv_entry * 560 pvpte_to_pve(struct pv_pte *pvpte) 561 { 562 struct pv_entry *pve = (void *)pvpte; 563 564 KASSERT(pve_to_pvpte(pve) == pvpte); 565 return pve; 566 } 567 568 /* 569 * Return true if the pmap page has an embedded PV entry. 570 */ 571 static inline bool 572 pv_pte_embedded(struct pmap_page *pp) 573 { 574 575 KASSERT(mutex_owned(&pp->pp_lock)); 576 return (bool)((vaddr_t)pp->pp_pte.pte_ptp | pp->pp_pte.pte_va); 577 } 578 579 /* 580 * pv_pte_first, pv_pte_next: PV list iterator. 581 */ 582 static inline struct pv_pte * 583 pv_pte_first(struct pmap_page *pp) 584 { 585 586 KASSERT(mutex_owned(&pp->pp_lock)); 587 if (pv_pte_embedded(pp)) { 588 return &pp->pp_pte; 589 } 590 return pve_to_pvpte(LIST_FIRST(&pp->pp_pvlist)); 591 } 592 593 static inline struct pv_pte * 594 pv_pte_next(struct pmap_page *pp, struct pv_pte *pvpte) 595 { 596 597 KASSERT(mutex_owned(&pp->pp_lock)); 598 KASSERT(pvpte != NULL); 599 if (pvpte == &pp->pp_pte) { 600 return pve_to_pvpte(LIST_FIRST(&pp->pp_pvlist)); 601 } 602 return pve_to_pvpte(LIST_NEXT(pvpte_to_pve(pvpte), pve_list)); 603 } 604 605 static inline uint8_t 606 pmap_pte_to_pp_attrs(pt_entry_t pte) 607 { 608 uint8_t ret = 0; 609 if (pte & PTE_D) 610 ret |= PP_ATTRS_D; 611 if (pte & PTE_A) 612 ret |= PP_ATTRS_A; 613 if (pte & PTE_W) 614 ret |= PP_ATTRS_W; 615 return ret; 616 } 617 618 static inline pt_entry_t 619 pmap_pp_attrs_to_pte(uint8_t attrs) 620 { 621 pt_entry_t pte = 0; 622 if (attrs & PP_ATTRS_D) 623 pte |= PTE_D; 624 if (attrs & PP_ATTRS_A) 625 pte |= PTE_A; 626 if (attrs & PP_ATTRS_W) 627 pte |= PTE_W; 628 return pte; 629 } 630 631 /* 632 * pmap_is_curpmap: is this pmap the one currently loaded [in %cr3]? 633 * of course the kernel is always loaded 634 */ 635 bool 636 pmap_is_curpmap(struct pmap *pmap) 637 { 638 return ((pmap == pmap_kernel()) || (pmap == curcpu()->ci_pmap)); 639 } 640 641 inline void 642 pmap_reference(struct pmap *pmap) 643 { 644 645 atomic_inc_uint(&pmap->pm_obj[0].uo_refs); 646 } 647 648 /* 649 * rbtree: compare two nodes. 650 */ 651 static int 652 pmap_compare_nodes(void *context, const void *n1, const void *n2) 653 { 654 const struct pv_entry *pve1 = n1; 655 const struct pv_entry *pve2 = n2; 656 657 KASSERT(pve1->pve_pte.pte_ptp == pve2->pve_pte.pte_ptp); 658 659 if (pve1->pve_pte.pte_va < pve2->pve_pte.pte_va) { 660 return -1; 661 } 662 if (pve1->pve_pte.pte_va > pve2->pve_pte.pte_va) { 663 return 1; 664 } 665 return 0; 666 } 667 668 /* 669 * rbtree: compare a node and a key. 
 */
static int
pmap_compare_key(void *context, const void *n, const void *k)
{
	const struct pv_entry *pve = n;
	const vaddr_t key = (vaddr_t)k;

	if (pve->pve_pte.pte_va < key) {
		return -1;
	}
	if (pve->pve_pte.pte_va > key) {
		return 1;
	}
	return 0;
}

/*
 * pmap_ptp_range_set: abuse ptp->uanon to record minimum VA of PTE
 */
static inline void
pmap_ptp_range_set(struct vm_page *ptp, vaddr_t va)
{
	vaddr_t *min = (vaddr_t *)&ptp->uanon;

	if (va < *min) {
		*min = va;
	}
}

/*
 * pmap_ptp_range_clip: abuse ptp->uanon to clip range of PTEs to remove
 */
static inline void
pmap_ptp_range_clip(struct vm_page *ptp, vaddr_t *startva, pt_entry_t **pte)
{
	vaddr_t sclip;

	if (ptp == NULL) {
		return;
	}

	sclip = (vaddr_t)ptp->uanon;
	sclip = (*startva < sclip ? sclip : *startva);
	*pte += (sclip - *startva) / PAGE_SIZE;
	*startva = sclip;
}

/*
 * pmap_map_ptes: map a pmap's PTEs into KVM and lock them in
 *
 * there are several pmaps involved.  some or all of them might be the same.
 *
 * - the pmap given by the first argument
 *   our caller wants to access this pmap's PTEs.
 *
 * - pmap_kernel()
 *   the kernel pmap.  note that it only contains the kernel part
 *   of the address space which is shared by any pmap.  ie. any
 *   pmap can be used instead of pmap_kernel() for our purpose.
 *
 * - ci->ci_pmap
 *   pmap currently loaded on the cpu.
 *
 * - vm_map_pmap(&curproc->p_vmspace->vm_map)
 *   current process' pmap.
 *
 * => caller must lock pmap first (if not the kernel pmap)
 * => must be undone with pmap_unmap_ptes before returning
 * => disables kernel preemption
 */
void
pmap_map_ptes(struct pmap *pmap, struct pmap **pmap2, pd_entry_t **ptepp,
    pd_entry_t * const **pdeppp)
{
	struct pmap *curpmap;
	struct cpu_info *ci;
	lwp_t *l;

	kpreempt_disable();

	/* The kernel's pmap is always accessible. */
	if (pmap == pmap_kernel()) {
		*pmap2 = NULL;
		*ptepp = PTE_BASE;
		*pdeppp = normal_pdes;
		return;
	}

	KASSERT(mutex_owned(&pmap->pm_lock));

	l = curlwp;
	ci = l->l_cpu;
	curpmap = ci->ci_pmap;
	if (pmap == curpmap) {
		/*
		 * Already on the CPU: make it valid.  This is very
		 * often the case during exit(), when we have switched
		 * to the kernel pmap in order to destroy a user pmap.
		 */
		if (__predict_false(ci->ci_tlbstate != TLBSTATE_VALID)) {
			pmap_reactivate(pmap);
		}
		*pmap2 = NULL;
	} else {
		/*
		 * Toss current pmap from CPU and install new pmap, but keep
		 * a reference to the old one.  Dropping the reference can
		 * block as it needs to take locks, so defer that to
		 * pmap_unmap_ptes().
		 */
		pmap_reference(pmap);
		pmap_load1(l, pmap, curpmap);
		*pmap2 = curpmap;
	}
	KASSERT(ci->ci_tlbstate == TLBSTATE_VALID);
#ifdef DIAGNOSTIC
	pmap->pm_ncsw = lwp_pctr();
#endif
	*ptepp = PTE_BASE;

#if defined(XENPV) && defined(__x86_64__)
	KASSERT(ci->ci_normal_pdes[PTP_LEVELS - 2] == L4_BASE);
	ci->ci_normal_pdes[PTP_LEVELS - 2] = pmap->pm_pdir;
	*pdeppp = ci->ci_normal_pdes;
#else
	*pdeppp = normal_pdes;
#endif
}

/*
 * pmap_unmap_ptes: unlock the PTE mapping of "pmap"
 *
 * => we cannot tolerate context switches while mapped in: assert this.
 * => reenables kernel preemption.
804 * => does not unlock pmap. 805 */ 806 void 807 pmap_unmap_ptes(struct pmap *pmap, struct pmap * pmap2) 808 { 809 struct cpu_info *ci; 810 struct pmap *mypmap; 811 struct lwp *l; 812 813 KASSERT(kpreempt_disabled()); 814 815 /* The kernel's pmap is always accessible. */ 816 if (pmap == pmap_kernel()) { 817 kpreempt_enable(); 818 return; 819 } 820 821 l = curlwp; 822 ci = l->l_cpu; 823 824 KASSERT(mutex_owned(&pmap->pm_lock)); 825 KASSERT(pmap->pm_ncsw == lwp_pctr()); 826 827 #if defined(XENPV) && defined(__x86_64__) 828 KASSERT(ci->ci_normal_pdes[PTP_LEVELS - 2] != L4_BASE); 829 ci->ci_normal_pdes[PTP_LEVELS - 2] = L4_BASE; 830 #endif 831 832 /* If not our own pmap, mark whatever's on the CPU now as lazy. */ 833 KASSERT(ci->ci_tlbstate == TLBSTATE_VALID); 834 mypmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map); 835 if (ci->ci_pmap == vm_map_pmap(&l->l_proc->p_vmspace->vm_map)) { 836 ci->ci_want_pmapload = 0; 837 } else { 838 ci->ci_want_pmapload = (mypmap != pmap_kernel()); 839 ci->ci_tlbstate = TLBSTATE_LAZY; 840 } 841 842 /* Now safe to re-enable preemption. */ 843 kpreempt_enable(); 844 845 /* Toss reference to other pmap taken earlier. */ 846 if (pmap2 != NULL) { 847 pmap_destroy(pmap2); 848 } 849 } 850 851 inline static void 852 pmap_exec_account(struct pmap *pm, vaddr_t va, pt_entry_t opte, pt_entry_t npte) 853 { 854 855 #if !defined(__x86_64__) 856 if (curproc == NULL || curproc->p_vmspace == NULL || 857 pm != vm_map_pmap(&curproc->p_vmspace->vm_map)) 858 return; 859 860 if ((opte ^ npte) & PTE_X) 861 pmap_update_pg(va); 862 863 /* 864 * Executability was removed on the last executable change. 865 * Reset the code segment to something conservative and 866 * let the trap handler deal with setting the right limit. 867 * We can't do that because of locking constraints on the vm map. 868 */ 869 870 if ((opte & PTE_X) && (npte & PTE_X) == 0 && va == pm->pm_hiexec) { 871 struct trapframe *tf = curlwp->l_md.md_regs; 872 873 tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL); 874 pm->pm_hiexec = I386_MAX_EXE_ADDR; 875 } 876 #endif /* !defined(__x86_64__) */ 877 } 878 879 #if !defined(__x86_64__) 880 /* 881 * Fixup the code segment to cover all potential executable mappings. 882 * returns 0 if no changes to the code segment were made. 883 */ 884 int 885 pmap_exec_fixup(struct vm_map *map, struct trapframe *tf, struct pcb *pcb) 886 { 887 struct vm_map_entry *ent; 888 struct pmap *pm = vm_map_pmap(map); 889 vaddr_t va = 0; 890 891 vm_map_lock_read(map); 892 for (ent = (&map->header)->next; ent != &map->header; ent = ent->next) { 893 /* 894 * This entry has greater va than the entries before. 895 * We need to make it point to the last page, not past it. 896 */ 897 if (ent->protection & VM_PROT_EXECUTE) 898 va = trunc_page(ent->end) - PAGE_SIZE; 899 } 900 vm_map_unlock_read(map); 901 if (va == pm->pm_hiexec && tf->tf_cs == GSEL(GUCODEBIG_SEL, SEL_UPL)) 902 return 0; 903 904 pm->pm_hiexec = va; 905 if (pm->pm_hiexec > I386_MAX_EXE_ADDR) { 906 tf->tf_cs = GSEL(GUCODEBIG_SEL, SEL_UPL); 907 } else { 908 tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL); 909 return 0; 910 } 911 return 1; 912 } 913 #endif /* !defined(__x86_64__) */ 914 915 void 916 pat_init(struct cpu_info *ci) 917 { 918 uint64_t pat; 919 920 if (!(ci->ci_feat_val[0] & CPUID_PAT)) 921 return; 922 923 /* We change WT to WC. Leave all other entries the default values. 
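	 * The PAT MSR programmed below therefore ends up as (entries 1 and
	 * 5, which power on as WT, become WC; the rest keep their default
	 * meaning):
	 *
	 *	PAT0 = WB    PAT1 = WC    PAT2 = UC-   PAT3 = UC
	 *	PAT4 = WB    PAT5 = WC    PAT6 = UC-   PAT7 = UC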
*/ 924 pat = PATENTRY(0, PAT_WB) | PATENTRY(1, PAT_WC) | 925 PATENTRY(2, PAT_UCMINUS) | PATENTRY(3, PAT_UC) | 926 PATENTRY(4, PAT_WB) | PATENTRY(5, PAT_WC) | 927 PATENTRY(6, PAT_UCMINUS) | PATENTRY(7, PAT_UC); 928 929 wrmsr(MSR_CR_PAT, pat); 930 cpu_pat_enabled = true; 931 } 932 933 static pt_entry_t 934 pmap_pat_flags(u_int flags) 935 { 936 u_int cacheflags = (flags & PMAP_CACHE_MASK); 937 938 if (!cpu_pat_enabled) { 939 switch (cacheflags) { 940 case PMAP_NOCACHE: 941 case PMAP_NOCACHE_OVR: 942 /* results in PGC_UCMINUS on cpus which have 943 * the cpuid PAT but PAT "disabled" 944 */ 945 return PTE_PCD; 946 default: 947 return 0; 948 } 949 } 950 951 switch (cacheflags) { 952 case PMAP_NOCACHE: 953 return PGC_UC; 954 case PMAP_WRITE_COMBINE: 955 return PGC_WC; 956 case PMAP_WRITE_BACK: 957 return PGC_WB; 958 case PMAP_NOCACHE_OVR: 959 return PGC_UCMINUS; 960 } 961 962 return 0; 963 } 964 965 /* 966 * p m a p k e n t e r f u n c t i o n s 967 * 968 * functions to quickly enter/remove pages from the kernel address 969 * space. pmap_kremove is exported to MI kernel. we make use of 970 * the recursive PTE mappings. 971 */ 972 973 /* 974 * pmap_kenter_pa: enter a kernel mapping without R/M (pv_entry) tracking 975 * 976 * => no need to lock anything, assume va is already allocated 977 * => should be faster than normal pmap enter function 978 */ 979 void 980 pmap_kenter_pa(vaddr_t va, paddr_t pa, vm_prot_t prot, u_int flags) 981 { 982 pt_entry_t *pte, opte, npte; 983 984 KASSERT(!(prot & ~VM_PROT_ALL)); 985 986 if (va < VM_MIN_KERNEL_ADDRESS) 987 pte = vtopte(va); 988 else 989 pte = kvtopte(va); 990 #if defined(XENPV) && defined(DOM0OPS) 991 if (pa < pmap_pa_start || pa >= pmap_pa_end) { 992 #ifdef DEBUG 993 printf_nolog("%s: pa %#" PRIxPADDR " for va %#" PRIxVADDR 994 " outside range\n", __func__, pa, va); 995 #endif /* DEBUG */ 996 npte = pa; 997 } else 998 #endif /* XENPV && DOM0OPS */ 999 npte = pmap_pa2pte(pa); 1000 npte |= protection_codes[prot] | PTE_P | pmap_pg_g; 1001 npte |= pmap_pat_flags(flags); 1002 opte = pmap_pte_testset(pte, npte); /* zap! */ 1003 1004 /* 1005 * XXX: make sure we are not dealing with a large page, since the only 1006 * large pages created are for the kernel image, and they should never 1007 * be kentered. 1008 */ 1009 KASSERTMSG(!(opte & PTE_PS), "PTE_PS va=%#"PRIxVADDR, va); 1010 1011 if ((opte & (PTE_P | PTE_A)) == (PTE_P | PTE_A)) { 1012 /* This should not happen. */ 1013 printf_nolog("%s: mapping already present\n", __func__); 1014 kpreempt_disable(); 1015 pmap_tlb_shootdown(pmap_kernel(), va, opte, TLBSHOOT_KENTER); 1016 kpreempt_enable(); 1017 } 1018 } 1019 1020 __strict_weak_alias(pmap_kenter_ma, pmap_kenter_pa); 1021 1022 #if defined(__x86_64__) 1023 /* 1024 * Change protection for a virtual address. Local for a CPU only, don't 1025 * care about TLB shootdowns. 
1026 * 1027 * => must be called with preemption disabled 1028 */ 1029 void 1030 pmap_changeprot_local(vaddr_t va, vm_prot_t prot) 1031 { 1032 pt_entry_t *pte, opte, npte; 1033 1034 KASSERT(kpreempt_disabled()); 1035 1036 if (va < VM_MIN_KERNEL_ADDRESS) 1037 pte = vtopte(va); 1038 else 1039 pte = kvtopte(va); 1040 1041 npte = opte = *pte; 1042 1043 if ((prot & VM_PROT_WRITE) != 0) 1044 npte |= PTE_W; 1045 else 1046 npte &= ~(PTE_W|PTE_D); 1047 1048 if (opte != npte) { 1049 pmap_pte_set(pte, npte); 1050 pmap_pte_flush(); 1051 invlpg(va); 1052 } 1053 } 1054 #endif /* defined(__x86_64__) */ 1055 1056 /* 1057 * pmap_kremove: remove a kernel mapping(s) without R/M (pv_entry) tracking 1058 * 1059 * => no need to lock anything 1060 * => caller must dispose of any vm_page mapped in the va range 1061 * => note: not an inline function 1062 * => we assume the va is page aligned and the len is a multiple of PAGE_SIZE 1063 * => we assume kernel only unmaps valid addresses and thus don't bother 1064 * checking the valid bit before doing TLB flushing 1065 * => must be followed by call to pmap_update() before reuse of page 1066 */ 1067 static void 1068 pmap_kremove1(vaddr_t sva, vsize_t len, bool localonly) 1069 { 1070 pt_entry_t *pte, opte; 1071 vaddr_t va, eva; 1072 1073 eva = sva + len; 1074 1075 kpreempt_disable(); 1076 for (va = sva; va < eva; va += PAGE_SIZE) { 1077 pte = kvtopte(va); 1078 opte = pmap_pte_testset(pte, 0); /* zap! */ 1079 if ((opte & (PTE_P | PTE_A)) == (PTE_P | PTE_A) && !localonly) { 1080 pmap_tlb_shootdown(pmap_kernel(), va, opte, 1081 TLBSHOOT_KREMOVE); 1082 } 1083 KASSERTMSG((opte & PTE_PS) == 0, 1084 "va %#" PRIxVADDR " is a large page", va); 1085 KASSERTMSG((opte & PTE_PVLIST) == 0, 1086 "va %#" PRIxVADDR " is a pv tracked page", va); 1087 } 1088 if (localonly) { 1089 tlbflushg(); 1090 } 1091 kpreempt_enable(); 1092 } 1093 1094 void 1095 pmap_kremove(vaddr_t sva, vsize_t len) 1096 { 1097 1098 pmap_kremove1(sva, len, false); 1099 } 1100 1101 /* 1102 * pmap_kremove_local: like pmap_kremove(), but only worry about 1103 * TLB invalidations on the current CPU. this is only intended 1104 * for use while writing kernel crash dumps, either after panic 1105 * or via reboot -d. 1106 */ 1107 void 1108 pmap_kremove_local(vaddr_t sva, vsize_t len) 1109 { 1110 1111 pmap_kremove1(sva, len, true); 1112 } 1113 1114 /* 1115 * p m a p i n i t f u n c t i o n s 1116 * 1117 * pmap_bootstrap and pmap_init are called during system startup 1118 * to init the pmap module. pmap_bootstrap() does a low level 1119 * init just to get things rolling. pmap_init() finishes the job. 1120 */ 1121 1122 /* 1123 * pmap_bootstrap_valloc: allocate a virtual address in the bootstrap area. 1124 * This function is to be used before any VM system has been set up. 1125 * 1126 * The va is taken from virtual_avail. 1127 */ 1128 static vaddr_t 1129 pmap_bootstrap_valloc(size_t npages) 1130 { 1131 vaddr_t va = virtual_avail; 1132 virtual_avail += npages * PAGE_SIZE; 1133 return va; 1134 } 1135 1136 /* 1137 * pmap_bootstrap_palloc: allocate a physical address in the bootstrap area. 1138 * This function is to be used before any VM system has been set up. 1139 * 1140 * The pa is taken from avail_start. 1141 */ 1142 static paddr_t 1143 pmap_bootstrap_palloc(size_t npages) 1144 { 1145 paddr_t pa = avail_start; 1146 avail_start += npages * PAGE_SIZE; 1147 return pa; 1148 } 1149 1150 /* 1151 * pmap_bootstrap: get the system in a state where it can run with VM properly 1152 * enabled (called before main()). 
The VM system is fully init'd later. 1153 * 1154 * => on i386, locore.S has already enabled the MMU by allocating a PDP for the 1155 * kernel, and nkpde PTP's for the kernel. 1156 * => kva_start is the first free virtual address in kernel space. 1157 */ 1158 void 1159 pmap_bootstrap(vaddr_t kva_start) 1160 { 1161 struct pmap *kpm; 1162 int i; 1163 vaddr_t kva; 1164 1165 pmap_pg_nx = (cpu_feature[2] & CPUID_NOX ? PTE_NX : 0); 1166 1167 /* 1168 * Set up our local static global vars that keep track of the usage of 1169 * KVM before kernel_map is set up. 1170 */ 1171 virtual_avail = kva_start; /* first free KVA */ 1172 virtual_end = VM_MAX_KERNEL_ADDRESS; /* last KVA */ 1173 1174 /* 1175 * Set up protection_codes: we need to be able to convert from a MI 1176 * protection code (some combo of VM_PROT...) to something we can jam 1177 * into a x86 PTE. 1178 */ 1179 protection_codes[VM_PROT_NONE] = pmap_pg_nx; 1180 protection_codes[VM_PROT_EXECUTE] = PTE_X; 1181 protection_codes[VM_PROT_READ] = pmap_pg_nx; 1182 protection_codes[VM_PROT_READ|VM_PROT_EXECUTE] = PTE_X; 1183 protection_codes[VM_PROT_WRITE] = PTE_W | pmap_pg_nx; 1184 protection_codes[VM_PROT_WRITE|VM_PROT_EXECUTE] = PTE_W | PTE_X; 1185 protection_codes[VM_PROT_WRITE|VM_PROT_READ] = PTE_W | pmap_pg_nx; 1186 protection_codes[VM_PROT_ALL] = PTE_W | PTE_X; 1187 1188 /* 1189 * Now we init the kernel's pmap. 1190 * 1191 * The kernel pmap's pm_obj is not used for much. However, in user pmaps 1192 * the pm_obj contains the list of active PTPs. 1193 */ 1194 kpm = pmap_kernel(); 1195 mutex_init(&kpm->pm_lock, MUTEX_DEFAULT, IPL_NONE); 1196 rw_init(&kpm->pm_dummy_lock); 1197 for (i = 0; i < PTP_LEVELS - 1; i++) { 1198 uvm_obj_init(&kpm->pm_obj[i], &pmap_pager, false, 1); 1199 uvm_obj_setlock(&kpm->pm_obj[i], &kpm->pm_dummy_lock); 1200 kpm->pm_ptphint[i] = NULL; 1201 } 1202 memset(&kpm->pm_list, 0, sizeof(kpm->pm_list)); /* pm_list not used */ 1203 1204 kpm->pm_pdir = (pd_entry_t *)bootspace.pdir; 1205 for (i = 0; i < PDP_SIZE; i++) 1206 kpm->pm_pdirpa[i] = PDPpaddr + PAGE_SIZE * i; 1207 1208 kpm->pm_stats.wired_count = kpm->pm_stats.resident_count = 1209 x86_btop(kva_start - VM_MIN_KERNEL_ADDRESS); 1210 1211 kcpuset_create(&kpm->pm_cpus, true); 1212 kcpuset_create(&kpm->pm_kernel_cpus, true); 1213 1214 kpm->pm_ldt = NULL; 1215 kpm->pm_ldt_sel = GSYSSEL(GLDT_SEL, SEL_KPL); 1216 1217 /* 1218 * the above is just a rough estimate and not critical to the proper 1219 * operation of the system. 1220 */ 1221 1222 #if !defined(XENPV) 1223 /* 1224 * Begin to enable global TLB entries if they are supported: add PTE_G 1225 * attribute to already mapped kernel pages. Do that only if SVS is 1226 * disabled. 1227 * 1228 * The G bit has no effect until the CR4_PGE bit is set in CR4, which 1229 * happens later in cpu_init(). 1230 */ 1231 #ifdef SVS 1232 if (!svs_enabled && (cpu_feature[0] & CPUID_PGE)) { 1233 #else 1234 if (cpu_feature[0] & CPUID_PGE) { 1235 #endif 1236 pmap_pg_g = PTE_G; 1237 pmap_remap_global(); 1238 } 1239 #endif 1240 1241 #ifndef XENPV 1242 /* 1243 * Enable large pages if they are supported. 1244 */ 1245 if (cpu_feature[0] & CPUID_PSE) { 1246 lcr4(rcr4() | CR4_PSE); /* enable hardware (via %cr4) */ 1247 pmap_largepages = 1; /* enable software */ 1248 1249 /* 1250 * The TLB must be flushed after enabling large pages on Pentium 1251 * CPUs, according to section 3.6.2.2 of "Intel Architecture 1252 * Software Developer's Manual, Volume 3: System Programming". 1253 */ 1254 tlbflushg(); 1255 1256 /* Remap the kernel. 
*/ 1257 pmap_remap_largepages(); 1258 } 1259 pmap_init_lapic(); 1260 #endif /* !XENPV */ 1261 1262 #ifdef __HAVE_PCPU_AREA 1263 pmap_init_pcpu(); 1264 #endif 1265 1266 #ifdef __HAVE_DIRECT_MAP 1267 pmap_init_directmap(kpm); 1268 #else 1269 pmap_vpage_cpualloc(&cpu_info_primary); 1270 1271 if (VM_MIN_KERNEL_ADDRESS == KERNBASE) { /* i386 */ 1272 early_zerop = (void *)cpu_info_primary.vpage[VPAGE_ZER]; 1273 early_zero_pte = cpu_info_primary.vpage_pte[VPAGE_ZER]; 1274 } else { /* amd64 */ 1275 /* 1276 * zero_pte is stuck at the end of mapped space for the kernel 1277 * image (disjunct from kva space). This is done so that it 1278 * can safely be used in pmap_growkernel (pmap_get_physpage), 1279 * when it's called for the first time. 1280 * XXXfvdl fix this for MULTIPROCESSOR later. 1281 */ 1282 #ifdef XENPV 1283 /* early_zerop initialized in xen_locore() */ 1284 #else 1285 early_zerop = (void *)bootspace.spareva; 1286 #endif 1287 early_zero_pte = PTE_BASE + pl1_i((vaddr_t)early_zerop); 1288 } 1289 #endif 1290 1291 #if defined(XENPV) && defined(__x86_64__) 1292 extern vaddr_t xen_dummy_page; 1293 paddr_t xen_dummy_user_pgd; 1294 1295 /* 1296 * We want a dummy page directory for Xen: when deactivating a pmap, 1297 * Xen will still consider it active. So we set user PGD to this one 1298 * to lift all protection on the now inactive page tables set. 1299 */ 1300 xen_dummy_user_pgd = xen_dummy_page - KERNBASE; 1301 1302 /* Zero fill it, the less checks in Xen it requires the better */ 1303 memset(PAGE_ALIGNED(xen_dummy_user_pgd + KERNBASE), 0, PAGE_SIZE); 1304 /* Mark read-only */ 1305 HYPERVISOR_update_va_mapping(xen_dummy_user_pgd + KERNBASE, 1306 pmap_pa2pte(xen_dummy_user_pgd) | PTE_P | pmap_pg_nx, 1307 UVMF_INVLPG); 1308 /* Pin as L4 */ 1309 xpq_queue_pin_l4_table(xpmap_ptom_masked(xen_dummy_user_pgd)); 1310 #endif 1311 1312 /* 1313 * Allocate space for the IDT, GDT and LDT. 1314 */ 1315 idt_vaddr = pmap_bootstrap_valloc(1); 1316 idt_paddr = pmap_bootstrap_palloc(1); 1317 1318 gdt_vaddr = pmap_bootstrap_valloc(1); 1319 gdt_paddr = pmap_bootstrap_palloc(1); 1320 1321 #ifdef __HAVE_PCPU_AREA 1322 ldt_vaddr = (vaddr_t)&pcpuarea->ldt; 1323 #else 1324 ldt_vaddr = pmap_bootstrap_valloc(1); 1325 #endif 1326 ldt_paddr = pmap_bootstrap_palloc(1); 1327 1328 #if !defined(__x86_64__) 1329 /* pentium f00f bug stuff */ 1330 pentium_idt_vaddr = pmap_bootstrap_valloc(1); 1331 #endif 1332 1333 #if defined(XENPVHVM) 1334 /* XXX: move to hypervisor.c with appropriate API adjustments */ 1335 extern paddr_t HYPERVISOR_shared_info_pa; 1336 extern volatile struct xencons_interface *xencons_interface; /* XXX */ 1337 extern struct xenstore_domain_interface *xenstore_interface; /* XXX */ 1338 1339 if (vm_guest != VM_GUEST_XENPVH) { 1340 HYPERVISOR_shared_info = (void *) pmap_bootstrap_valloc(1); 1341 HYPERVISOR_shared_info_pa = pmap_bootstrap_palloc(1); 1342 } 1343 xencons_interface = (void *) pmap_bootstrap_valloc(1); 1344 xenstore_interface = (void *) pmap_bootstrap_valloc(1); 1345 #endif 1346 /* 1347 * Now we reserve some VM for mapping pages when doing a crash dump. 1348 */ 1349 virtual_avail = reserve_dumppages(virtual_avail); 1350 1351 /* 1352 * Init the global lock and global list. 1353 */ 1354 mutex_init(&pmaps_lock, MUTEX_DEFAULT, IPL_NONE); 1355 LIST_INIT(&pmaps); 1356 1357 /* 1358 * Ensure the TLB is sync'd with reality by flushing it... 1359 */ 1360 tlbflushg(); 1361 1362 /* 1363 * Calculate pmap_maxkvaddr from nkptp[]. 
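	 * Equivalently, this computes
	 *
	 *	pmap_maxkvaddr = VM_MIN_KERNEL_ADDRESS +
	 *	    nkptp[1] * nbpd[1] + ... +
	 *	    nkptp[PTP_LEVELS - 1] * nbpd[PTP_LEVELS - 1]
	 *
	 * i.e. the amount of KVA that the PTPs currently allocated at each
	 * level are able to map.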
1364 */ 1365 kva = VM_MIN_KERNEL_ADDRESS; 1366 for (i = PTP_LEVELS - 1; i >= 1; i--) { 1367 kva += nkptp[i] * nbpd[i]; 1368 } 1369 pmap_maxkvaddr = kva; 1370 } 1371 1372 #ifndef XENPV 1373 static void 1374 pmap_init_lapic(void) 1375 { 1376 /* 1377 * On CPUs that have no LAPIC, local_apic_va is never kentered. But our 1378 * x86 implementation relies a lot on this address to be valid; so just 1379 * allocate a fake physical page that will be kentered into 1380 * local_apic_va by machdep. 1381 * 1382 * If the LAPIC is present, the va will be remapped somewhere else 1383 * later in lapic_map. 1384 */ 1385 local_apic_va = pmap_bootstrap_valloc(1); 1386 local_apic_pa = pmap_bootstrap_palloc(1); 1387 } 1388 #endif 1389 1390 #ifdef __x86_64__ 1391 static size_t 1392 pmap_pagetree_nentries_range(vaddr_t startva, vaddr_t endva, size_t pgsz) 1393 { 1394 size_t npages; 1395 npages = (roundup(endva, pgsz) / pgsz) - 1396 (rounddown(startva, pgsz) / pgsz); 1397 return npages; 1398 } 1399 #endif 1400 1401 #if defined(__HAVE_DIRECT_MAP) || defined(KASAN) || defined(KMSAN) 1402 static inline void 1403 slotspace_copy(int type, pd_entry_t *dst, pd_entry_t *src) 1404 { 1405 size_t sslot = slotspace.area[type].sslot; 1406 size_t nslot = slotspace.area[type].nslot; 1407 1408 memcpy(&dst[sslot], &src[sslot], nslot * sizeof(pd_entry_t)); 1409 } 1410 #endif 1411 1412 #ifdef __x86_64__ 1413 /* 1414 * Randomize the location of an area. We count the holes in the VM space. We 1415 * randomly select one hole, and then randomly select an area within that hole. 1416 * Finally we update the associated entry in the slotspace structure. 1417 */ 1418 vaddr_t 1419 slotspace_rand(int type, size_t sz, size_t align, size_t randhole, 1420 vaddr_t randva) 1421 { 1422 struct { 1423 int start; 1424 int end; 1425 } holes[SLSPACE_NAREAS+1]; 1426 size_t i, nholes, hole; 1427 size_t startsl, endsl, nslots, winsize; 1428 vaddr_t startva, va; 1429 1430 sz = roundup(sz, align); 1431 1432 /* 1433 * Take one more slot with +NBPD_L4, because we may end up choosing 1434 * an area that crosses slots: 1435 * +------+------+------+ 1436 * | Slot | Slot | Slot | 1437 * +------+------+------+ 1438 * [Chosen Area] 1439 * And in that case we must take into account the additional slot 1440 * consumed. 1441 */ 1442 nslots = roundup(sz+NBPD_L4, NBPD_L4) / NBPD_L4; 1443 1444 /* Get the holes. */ 1445 nholes = 0; 1446 size_t curslot = 0 + 256; /* end of SLAREA_USER */ 1447 while (1) { 1448 /* 1449 * Find the first occupied slot after the current one. 1450 * The area between the two is a hole. 1451 */ 1452 size_t minsslot = 512; 1453 size_t minnslot = 0; 1454 for (i = 0; i < SLSPACE_NAREAS; i++) { 1455 if (!slotspace.area[i].active) 1456 continue; 1457 if (slotspace.area[i].sslot >= curslot && 1458 slotspace.area[i].sslot < minsslot) { 1459 minsslot = slotspace.area[i].sslot; 1460 minnslot = slotspace.area[i].nslot; 1461 } 1462 } 1463 1464 /* No hole anymore, stop here. */ 1465 if (minsslot == 512) { 1466 break; 1467 } 1468 1469 /* Register the hole. */ 1470 if (minsslot - curslot >= nslots) { 1471 holes[nholes].start = curslot; 1472 holes[nholes].end = minsslot; 1473 nholes++; 1474 } 1475 1476 /* Skip that hole, and iterate again. */ 1477 curslot = minsslot + minnslot; 1478 } 1479 1480 if (nholes == 0) { 1481 panic("%s: impossible", __func__); 1482 } 1483 1484 /* Select a hole. 
*/ 1485 hole = randhole; 1486 #ifdef NO_X86_ASLR 1487 hole = 0; 1488 #endif 1489 hole %= nholes; 1490 startsl = holes[hole].start; 1491 endsl = holes[hole].end; 1492 startva = VA_SIGN_NEG(startsl * NBPD_L4); 1493 1494 /* Select an area within the hole. */ 1495 va = randva; 1496 #ifdef NO_X86_ASLR 1497 va = 0; 1498 #endif 1499 winsize = ((endsl - startsl) * NBPD_L4) - sz; 1500 va %= winsize; 1501 va = rounddown(va, align); 1502 va += startva; 1503 1504 /* Update the entry. */ 1505 slotspace.area[type].sslot = pl4_i(va); 1506 slotspace.area[type].nslot = 1507 pmap_pagetree_nentries_range(va, va+sz, NBPD_L4); 1508 slotspace.area[type].active = true; 1509 1510 return va; 1511 } 1512 #endif 1513 1514 #ifdef __HAVE_PCPU_AREA 1515 static void 1516 pmap_init_pcpu(void) 1517 { 1518 const vaddr_t startva = PMAP_PCPU_BASE; 1519 size_t nL4e, nL3e, nL2e, nL1e; 1520 size_t L4e_idx, L3e_idx, L2e_idx, L1e_idx __diagused; 1521 paddr_t pa; 1522 vaddr_t endva; 1523 vaddr_t tmpva; 1524 pt_entry_t *pte; 1525 size_t size; 1526 int i; 1527 1528 const pd_entry_t pteflags = PTE_P | PTE_W | pmap_pg_nx; 1529 1530 size = sizeof(struct pcpu_area); 1531 1532 endva = startva + size; 1533 1534 /* We will use this temporary va. */ 1535 tmpva = bootspace.spareva; 1536 pte = PTE_BASE + pl1_i(tmpva); 1537 1538 /* Build L4 */ 1539 L4e_idx = pl4_i(startva); 1540 nL4e = pmap_pagetree_nentries_range(startva, endva, NBPD_L4); 1541 KASSERT(nL4e == 1); 1542 for (i = 0; i < nL4e; i++) { 1543 KASSERT(L4_BASE[L4e_idx+i] == 0); 1544 1545 pa = pmap_bootstrap_palloc(1); 1546 *pte = (pa & PTE_FRAME) | pteflags; 1547 pmap_update_pg(tmpva); 1548 memset(PAGE_ALIGNED(tmpva), 0, PAGE_SIZE); 1549 1550 L4_BASE[L4e_idx+i] = pa | pteflags | PTE_A; 1551 } 1552 1553 /* Build L3 */ 1554 L3e_idx = pl3_i(startva); 1555 nL3e = pmap_pagetree_nentries_range(startva, endva, NBPD_L3); 1556 for (i = 0; i < nL3e; i++) { 1557 KASSERT(L3_BASE[L3e_idx+i] == 0); 1558 1559 pa = pmap_bootstrap_palloc(1); 1560 *pte = (pa & PTE_FRAME) | pteflags; 1561 pmap_update_pg(tmpva); 1562 memset(PAGE_ALIGNED(tmpva), 0, PAGE_SIZE); 1563 1564 L3_BASE[L3e_idx+i] = pa | pteflags | PTE_A; 1565 } 1566 1567 /* Build L2 */ 1568 L2e_idx = pl2_i(startva); 1569 nL2e = pmap_pagetree_nentries_range(startva, endva, NBPD_L2); 1570 for (i = 0; i < nL2e; i++) { 1571 1572 KASSERT(L2_BASE[L2e_idx+i] == 0); 1573 1574 pa = pmap_bootstrap_palloc(1); 1575 *pte = (pa & PTE_FRAME) | pteflags; 1576 pmap_update_pg(tmpva); 1577 memset(PAGE_ALIGNED(tmpva), 0, PAGE_SIZE); 1578 1579 L2_BASE[L2e_idx+i] = pa | pteflags | PTE_A; 1580 } 1581 1582 /* Build L1 */ 1583 L1e_idx = pl1_i(startva); 1584 nL1e = pmap_pagetree_nentries_range(startva, endva, NBPD_L1); 1585 for (i = 0; i < nL1e; i++) { 1586 /* 1587 * Nothing to do, the PTEs will be entered via 1588 * pmap_kenter_pa. 1589 */ 1590 KASSERT(L1_BASE[L1e_idx+i] == 0); 1591 } 1592 1593 *pte = 0; 1594 pmap_update_pg(tmpva); 1595 1596 pcpuarea = (struct pcpu_area *)startva; 1597 1598 tlbflush(); 1599 } 1600 #endif 1601 1602 #ifdef __HAVE_DIRECT_MAP 1603 /* 1604 * Create the amd64 direct map. Called only once at boot time. We map all of 1605 * the physical memory contiguously using 2MB large pages, with RW permissions. 1606 * However there is a hole: the kernel is mapped with RO permissions. 
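 * Once the direct map is built, a physical address pa below the last
 * mapped cluster can be accessed through this window simply as
 *
 *	va = pmap_direct_base + pa;
 *
 * with pmap_direct_base/pmap_direct_end delimiting the window (they are
 * set at the end of this function).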
1607 */ 1608 static void 1609 pmap_init_directmap(struct pmap *kpm) 1610 { 1611 extern phys_ram_seg_t mem_clusters[]; 1612 extern int mem_cluster_cnt; 1613 1614 vaddr_t startva; 1615 size_t nL4e, nL3e, nL2e; 1616 size_t L4e_idx, L3e_idx, L2e_idx; 1617 size_t spahole, epahole; 1618 paddr_t lastpa, pa; 1619 vaddr_t endva; 1620 vaddr_t tmpva; 1621 pt_entry_t *pte; 1622 phys_ram_seg_t *mc; 1623 int i; 1624 size_t randhole; 1625 vaddr_t randva; 1626 1627 const pd_entry_t pteflags = PTE_P | PTE_W | pmap_pg_nx; 1628 const pd_entry_t holepteflags = PTE_P | pmap_pg_nx; 1629 1630 CTASSERT(NL4_SLOT_DIRECT * NBPD_L4 == MAXPHYSMEM); 1631 1632 spahole = roundup(bootspace.head.pa, NBPD_L2); 1633 epahole = rounddown(bootspace.boot.pa, NBPD_L2); 1634 1635 /* Get the last physical address available */ 1636 lastpa = 0; 1637 for (i = 0; i < mem_cluster_cnt; i++) { 1638 mc = &mem_clusters[i]; 1639 lastpa = MAX(lastpa, mc->start + mc->size); 1640 } 1641 1642 /* 1643 * x86_add_cluster should have truncated the memory to MAXPHYSMEM. 1644 */ 1645 if (lastpa > MAXPHYSMEM) { 1646 panic("pmap_init_directmap: lastpa incorrect"); 1647 } 1648 1649 entropy_extract(&randhole, sizeof randhole, 0); 1650 entropy_extract(&randva, sizeof randva, 0); 1651 startva = slotspace_rand(SLAREA_DMAP, lastpa, NBPD_L2, 1652 randhole, randva); 1653 endva = startva + lastpa; 1654 1655 /* We will use this temporary va. */ 1656 tmpva = bootspace.spareva; 1657 pte = PTE_BASE + pl1_i(tmpva); 1658 1659 /* Build L4 */ 1660 L4e_idx = pl4_i(startva); 1661 nL4e = pmap_pagetree_nentries_range(startva, endva, NBPD_L4); 1662 KASSERT(nL4e <= NL4_SLOT_DIRECT); 1663 for (i = 0; i < nL4e; i++) { 1664 KASSERT(L4_BASE[L4e_idx+i] == 0); 1665 1666 pa = pmap_bootstrap_palloc(1); 1667 *pte = (pa & PTE_FRAME) | pteflags; 1668 pmap_update_pg(tmpva); 1669 memset(PAGE_ALIGNED(tmpva), 0, PAGE_SIZE); 1670 1671 L4_BASE[L4e_idx+i] = pa | pteflags | PTE_A; 1672 } 1673 1674 /* Build L3 */ 1675 L3e_idx = pl3_i(startva); 1676 nL3e = pmap_pagetree_nentries_range(startva, endva, NBPD_L3); 1677 for (i = 0; i < nL3e; i++) { 1678 KASSERT(L3_BASE[L3e_idx+i] == 0); 1679 1680 pa = pmap_bootstrap_palloc(1); 1681 *pte = (pa & PTE_FRAME) | pteflags; 1682 pmap_update_pg(tmpva); 1683 memset(PAGE_ALIGNED(tmpva), 0, PAGE_SIZE); 1684 1685 L3_BASE[L3e_idx+i] = pa | pteflags | PTE_A; 1686 } 1687 1688 /* Build L2 */ 1689 L2e_idx = pl2_i(startva); 1690 nL2e = pmap_pagetree_nentries_range(startva, endva, NBPD_L2); 1691 for (i = 0; i < nL2e; i++) { 1692 KASSERT(L2_BASE[L2e_idx+i] == 0); 1693 1694 pa = (paddr_t)(i * NBPD_L2); 1695 1696 if (spahole <= pa && pa < epahole) { 1697 L2_BASE[L2e_idx+i] = pa | holepteflags | PTE_A | 1698 PTE_PS | pmap_pg_g; 1699 } else { 1700 L2_BASE[L2e_idx+i] = pa | pteflags | PTE_A | 1701 PTE_PS | pmap_pg_g; 1702 } 1703 } 1704 1705 *pte = 0; 1706 pmap_update_pg(tmpva); 1707 1708 pmap_direct_base = startva; 1709 pmap_direct_end = endva; 1710 1711 tlbflush(); 1712 } 1713 #endif /* __HAVE_DIRECT_MAP */ 1714 1715 #if !defined(XENPV) 1716 /* 1717 * Remap all of the virtual pages created so far with the PTE_G bit. 
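 * (PTE_G marks a translation as global, so it survives a %cr3 reload;
 * as noted in pmap_bootstrap() above, the bit only takes effect once
 * CR4_PGE is set, later in cpu_init().)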
1718 */ 1719 static void 1720 pmap_remap_global(void) 1721 { 1722 vaddr_t kva, kva_end; 1723 unsigned long p1i; 1724 size_t i; 1725 1726 /* head */ 1727 kva = bootspace.head.va; 1728 kva_end = kva + bootspace.head.sz; 1729 for ( ; kva < kva_end; kva += PAGE_SIZE) { 1730 p1i = pl1_i(kva); 1731 if (pmap_valid_entry(PTE_BASE[p1i])) 1732 PTE_BASE[p1i] |= pmap_pg_g; 1733 } 1734 1735 /* kernel segments */ 1736 for (i = 0; i < BTSPACE_NSEGS; i++) { 1737 if (bootspace.segs[i].type == BTSEG_NONE) { 1738 continue; 1739 } 1740 kva = bootspace.segs[i].va; 1741 kva_end = kva + bootspace.segs[i].sz; 1742 for ( ; kva < kva_end; kva += PAGE_SIZE) { 1743 p1i = pl1_i(kva); 1744 if (pmap_valid_entry(PTE_BASE[p1i])) 1745 PTE_BASE[p1i] |= pmap_pg_g; 1746 } 1747 } 1748 1749 /* boot space */ 1750 kva = bootspace.boot.va; 1751 kva_end = kva + bootspace.boot.sz; 1752 for ( ; kva < kva_end; kva += PAGE_SIZE) { 1753 p1i = pl1_i(kva); 1754 if (pmap_valid_entry(PTE_BASE[p1i])) 1755 PTE_BASE[p1i] |= pmap_pg_g; 1756 } 1757 } 1758 #endif 1759 1760 #ifndef XENPV 1761 /* 1762 * Remap several kernel segments with large pages. We cover as many pages as we 1763 * can. Called only once at boot time, if the CPU supports large pages. 1764 */ 1765 static void 1766 pmap_remap_largepages(void) 1767 { 1768 pd_entry_t *pde; 1769 vaddr_t kva, kva_end; 1770 paddr_t pa; 1771 size_t i; 1772 1773 /* Remap the kernel text using large pages. */ 1774 for (i = 0; i < BTSPACE_NSEGS; i++) { 1775 if (bootspace.segs[i].type != BTSEG_TEXT) { 1776 continue; 1777 } 1778 kva = roundup(bootspace.segs[i].va, NBPD_L2); 1779 if (kva < bootspace.segs[i].va) { 1780 continue; 1781 } 1782 kva_end = rounddown(bootspace.segs[i].va + 1783 bootspace.segs[i].sz, NBPD_L2); 1784 pa = roundup(bootspace.segs[i].pa, NBPD_L2); 1785 for (/* */; kva < kva_end; kva += NBPD_L2, pa += NBPD_L2) { 1786 pde = &L2_BASE[pl2_i(kva)]; 1787 *pde = pa | pmap_pg_g | PTE_PS | PTE_P; 1788 tlbflushg(); 1789 } 1790 } 1791 1792 /* Remap the kernel rodata using large pages. */ 1793 for (i = 0; i < BTSPACE_NSEGS; i++) { 1794 if (bootspace.segs[i].type != BTSEG_RODATA) { 1795 continue; 1796 } 1797 kva = roundup(bootspace.segs[i].va, NBPD_L2); 1798 if (kva < bootspace.segs[i].va) { 1799 continue; 1800 } 1801 kva_end = rounddown(bootspace.segs[i].va + 1802 bootspace.segs[i].sz, NBPD_L2); 1803 pa = roundup(bootspace.segs[i].pa, NBPD_L2); 1804 for (/* */; kva < kva_end; kva += NBPD_L2, pa += NBPD_L2) { 1805 pde = &L2_BASE[pl2_i(kva)]; 1806 *pde = pa | pmap_pg_g | PTE_PS | pmap_pg_nx | PTE_P; 1807 tlbflushg(); 1808 } 1809 } 1810 1811 /* Remap the kernel data+bss using large pages. */ 1812 for (i = 0; i < BTSPACE_NSEGS; i++) { 1813 if (bootspace.segs[i].type != BTSEG_DATA) { 1814 continue; 1815 } 1816 kva = roundup(bootspace.segs[i].va, NBPD_L2); 1817 if (kva < bootspace.segs[i].va) { 1818 continue; 1819 } 1820 kva_end = rounddown(bootspace.segs[i].va + 1821 bootspace.segs[i].sz, NBPD_L2); 1822 pa = roundup(bootspace.segs[i].pa, NBPD_L2); 1823 for (/* */; kva < kva_end; kva += NBPD_L2, pa += NBPD_L2) { 1824 pde = &L2_BASE[pl2_i(kva)]; 1825 *pde = pa | pmap_pg_g | PTE_PS | pmap_pg_nx | PTE_W | PTE_P; 1826 tlbflushg(); 1827 } 1828 } 1829 } 1830 #endif /* !XENPV */ 1831 1832 /* 1833 * pmap_init: called from uvm_init, our job is to get the pmap system ready 1834 * to manage mappings. 1835 */ 1836 void 1837 pmap_init(void) 1838 { 1839 int flags; 1840 1841 /* 1842 * initialize caches. 
	 */

	pool_cache_bootstrap(&pmap_cache, sizeof(struct pmap), COHERENCY_UNIT,
	    0, 0, "pmappl", NULL, IPL_NONE, pmap_ctor, pmap_dtor, NULL);

#ifdef XENPV
	/*
	 * pool_cache(9) should not touch cached objects, since they
	 * are pinned on xen and R/O for the domU
	 */
	flags = PR_NOTOUCH;
#else
	flags = 0;
#endif

#ifdef PAE
	pool_init(&pmap_pdp_pool, PAGE_SIZE * PDP_SIZE, 0, 0, flags,
	    "pdppl", &pmap_pdp_allocator, IPL_NONE);
#else
	pool_init(&pmap_pdp_pool, PAGE_SIZE, 0, 0, flags,
	    "pdppl", NULL, IPL_NONE);
#endif
	pool_cache_bootstrap(&pmap_pvp_cache, PAGE_SIZE, PAGE_SIZE,
	    0, 0, "pvpage", &pool_allocator_kmem,
	    IPL_NONE, pmap_pvp_ctor, pmap_pvp_dtor, NULL);

	pmap_tlb_init();

	/* XXX: Since cpu_hatch() is only for secondary CPUs. */
	pmap_tlb_cpu_init(curcpu());

	evcnt_attach_dynamic(&pmap_iobmp_evcnt, EVCNT_TYPE_MISC,
	    NULL, "x86", "io bitmap copy");
	evcnt_attach_dynamic(&pmap_ldt_evcnt, EVCNT_TYPE_MISC,
	    NULL, "x86", "ldt sync");

	/*
	 * The kernel doesn't keep track of PTPs, so there's nowhere handy
	 * to hang a tree of pv_entry records.  Dynamically allocated
	 * pv_entry lists are not heavily used in the kernel's pmap (the
	 * usual case is embedded), so cop out and use a single RB tree
	 * to cover them.
	 */
	rb_tree_init(&pmap_kernel_rb, &pmap_rbtree_ops);

	/*
	 * done: pmap module is up (and ready for business)
	 */

	pmap_initialized = true;
}

#ifndef XENPV
/*
 * pmap_cpu_init_late: perform late per-CPU initialization.
 */
void
pmap_cpu_init_late(struct cpu_info *ci)
{
	/*
	 * The BP already has its own PD page allocated during early
	 * MD startup.
	 */
	if (ci == &cpu_info_primary)
		return;
#ifdef PAE
	cpu_alloc_l3_page(ci);
#endif
}
#endif

#ifndef __HAVE_DIRECT_MAP
CTASSERT(CACHE_LINE_SIZE > sizeof(pt_entry_t));
CTASSERT(CACHE_LINE_SIZE % sizeof(pt_entry_t) == 0);

static void
pmap_vpage_cpualloc(struct cpu_info *ci)
{
	bool primary = (ci == &cpu_info_primary);
	size_t i, npages;
	vaddr_t vabase;
	vsize_t vrange;

	npages = (CACHE_LINE_SIZE / sizeof(pt_entry_t));
	KASSERT(npages >= VPAGE_MAX);
	vrange = npages * PAGE_SIZE;

	if (primary) {
		while ((vabase = pmap_bootstrap_valloc(1)) % vrange != 0) {
			/* Waste some pages to align properly */
		}
		/* The base is aligned, allocate the rest (contiguous) */
		pmap_bootstrap_valloc(npages - 1);
	} else {
		vabase = uvm_km_alloc(kernel_map, vrange, vrange,
		    UVM_KMF_VAONLY);
		if (vabase == 0) {
			panic("%s: failed to allocate tmp VA for CPU %d\n",
			    __func__, cpu_index(ci));
		}
	}

	KASSERT((vaddr_t)&PTE_BASE[pl1_i(vabase)] % CACHE_LINE_SIZE == 0);

	for (i = 0; i < VPAGE_MAX; i++) {
		ci->vpage[i] = vabase + i * PAGE_SIZE;
		ci->vpage_pte[i] = PTE_BASE + pl1_i(ci->vpage[i]);
	}
}

void
pmap_vpage_cpu_init(struct cpu_info *ci)
{
	if (ci == &cpu_info_primary) {
		/* cpu0 already taken care of in pmap_bootstrap */
		return;
	}

	pmap_vpage_cpualloc(ci);
}
#endif

/*
 * p v _ e n t r y   f u n c t i o n s
 */

/*
 * pmap_pvp_ctor: pool_cache constructor for PV pages.
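 * The constructor lays the struct pv_page header over the first entry
 * slot of the page, which is why a chunk yields PVE_PER_PVP usable
 * entries, i.e. one fewer than PAGE_SIZE / sizeof(struct pv_entry).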
1971 */ 1972 static int 1973 pmap_pvp_ctor(void *arg, void *obj, int flags) 1974 { 1975 struct pv_page *pvp = (struct pv_page *)obj; 1976 struct pv_entry *pve = (struct pv_entry *)obj + 1; 1977 struct pv_entry *maxpve = pve + PVE_PER_PVP; 1978 1979 KASSERT(sizeof(struct pv_page) <= sizeof(struct pv_entry)); 1980 KASSERT(trunc_page((vaddr_t)obj) == (vaddr_t)obj); 1981 1982 LIST_INIT(&pvp->pvp_pves); 1983 pvp->pvp_nfree = PVE_PER_PVP; 1984 pvp->pvp_pmap = NULL; 1985 1986 for (; pve < maxpve; pve++) { 1987 LIST_INSERT_HEAD(&pvp->pvp_pves, pve, pve_list); 1988 } 1989 1990 return 0; 1991 } 1992 1993 /* 1994 * pmap_pvp_dtor: pool_cache destructor for PV pages. 1995 */ 1996 static void 1997 pmap_pvp_dtor(void *arg, void *obj) 1998 { 1999 struct pv_page *pvp __diagused = obj; 2000 2001 KASSERT(pvp->pvp_pmap == NULL); 2002 KASSERT(pvp->pvp_nfree == PVE_PER_PVP); 2003 } 2004 2005 /* 2006 * pmap_alloc_pv: allocate a PV entry (likely cached with pmap). 2007 */ 2008 static struct pv_entry * 2009 pmap_alloc_pv(struct pmap *pmap) 2010 { 2011 struct pv_entry *pve; 2012 struct pv_page *pvp; 2013 2014 KASSERT(mutex_owned(&pmap->pm_lock)); 2015 2016 if (__predict_false((pvp = LIST_FIRST(&pmap->pm_pvp_part)) == NULL)) { 2017 if ((pvp = LIST_FIRST(&pmap->pm_pvp_full)) != NULL) { 2018 LIST_REMOVE(pvp, pvp_list); 2019 } else { 2020 pvp = pool_cache_get(&pmap_pvp_cache, PR_NOWAIT); 2021 } 2022 if (__predict_false(pvp == NULL)) { 2023 return NULL; 2024 } 2025 /* full -> part */ 2026 LIST_INSERT_HEAD(&pmap->pm_pvp_part, pvp, pvp_list); 2027 pvp->pvp_pmap = pmap; 2028 } 2029 2030 KASSERT(pvp->pvp_pmap == pmap); 2031 KASSERT(pvp->pvp_nfree > 0); 2032 2033 pve = LIST_FIRST(&pvp->pvp_pves); 2034 LIST_REMOVE(pve, pve_list); 2035 pvp->pvp_nfree--; 2036 2037 if (__predict_false(pvp->pvp_nfree == 0)) { 2038 /* part -> empty */ 2039 KASSERT(LIST_EMPTY(&pvp->pvp_pves)); 2040 LIST_REMOVE(pvp, pvp_list); 2041 LIST_INSERT_HEAD(&pmap->pm_pvp_empty, pvp, pvp_list); 2042 } else { 2043 KASSERT(!LIST_EMPTY(&pvp->pvp_pves)); 2044 } 2045 2046 return pve; 2047 } 2048 2049 /* 2050 * pmap_free_pv: delayed free of a PV entry. 2051 */ 2052 static void 2053 pmap_free_pv(struct pmap *pmap, struct pv_entry *pve) 2054 { 2055 struct pv_page *pvp = (struct pv_page *)trunc_page((vaddr_t)pve); 2056 2057 KASSERT(mutex_owned(&pmap->pm_lock)); 2058 KASSERT(pvp->pvp_pmap == pmap); 2059 KASSERT(pvp->pvp_nfree >= 0); 2060 2061 LIST_INSERT_HEAD(&pvp->pvp_pves, pve, pve_list); 2062 pvp->pvp_nfree++; 2063 2064 if (__predict_false(pvp->pvp_nfree == 1)) { 2065 /* empty -> part */ 2066 LIST_REMOVE(pvp, pvp_list); 2067 LIST_INSERT_HEAD(&pmap->pm_pvp_part, pvp, pvp_list); 2068 } else if (__predict_false(pvp->pvp_nfree == PVE_PER_PVP)) { 2069 /* part -> full */ 2070 LIST_REMOVE(pvp, pvp_list); 2071 LIST_INSERT_HEAD(&pmap->pm_pvp_full, pvp, pvp_list); 2072 } 2073 } 2074 2075 /* 2076 * pmap_drain_pv: free full PV pages. 
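 *
 * A pmap's PV pages live on one of three lists depending on how many
 * free slots they have left, and pmap_alloc_pv()/pmap_free_pv() move
 * them between the lists as slots are handed out and returned:
 *
 *	pm_pvp_full  - all PVE_PER_PVP slots free (nothing in use)
 *	pm_pvp_part  - some slots in use, some free
 *	pm_pvp_empty - every slot in use
 *
 * Only pages sitting on pm_pvp_full are eligible to be returned to
 * pmap_pvp_cache here.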
2077 */ 2078 static void 2079 pmap_drain_pv(struct pmap *pmap) 2080 { 2081 struct pv_page *pvp; 2082 2083 KASSERT(mutex_owned(&pmap->pm_lock)); 2084 2085 while ((pvp = LIST_FIRST(&pmap->pm_pvp_full)) != NULL) { 2086 LIST_REMOVE(pvp, pvp_list); 2087 KASSERT(pvp->pvp_pmap == pmap); 2088 KASSERT(pvp->pvp_nfree == PVE_PER_PVP); 2089 pvp->pvp_pmap = NULL; 2090 pool_cache_put(&pmap_pvp_cache, pvp); 2091 } 2092 } 2093 2094 /* 2095 * pmap_check_pv: verify {VA, PTP} pair is either tracked/untracked by page 2096 */ 2097 static void 2098 pmap_check_pv(struct pmap *pmap, struct vm_page *ptp, struct pmap_page *pp, 2099 vaddr_t va, bool tracked) 2100 { 2101 #ifdef DEBUG 2102 struct pv_pte *pvpte; 2103 2104 PMAP_CHECK_PP(pp); 2105 2106 mutex_spin_enter(&pp->pp_lock); 2107 for (pvpte = pv_pte_first(pp); pvpte; pvpte = pv_pte_next(pp, pvpte)) { 2108 if (pvpte->pte_ptp == ptp && pvpte->pte_va == va) { 2109 break; 2110 } 2111 } 2112 mutex_spin_exit(&pp->pp_lock); 2113 2114 if (pvpte && !tracked) { 2115 panic("pmap_check_pv: %p/%lx found on pp %p", ptp, va, pp); 2116 } else if (!pvpte && tracked) { 2117 panic("pmap_check_pv: %p/%lx missing on pp %p", ptp, va, pp); 2118 } 2119 #endif 2120 } 2121 2122 /* 2123 * pmap_treelookup_pv: search the PV tree for a dynamic entry 2124 * 2125 * => pmap must be locked 2126 */ 2127 static struct pv_entry * 2128 pmap_treelookup_pv(const struct pmap *pmap, const struct vm_page *ptp, 2129 const rb_tree_t *tree, const vaddr_t va) 2130 { 2131 struct pv_entry *pve; 2132 rb_node_t *node; 2133 2134 /* 2135 * Inlined lookup tailored for exactly what's needed here that is 2136 * quite a bit faster than using rb_tree_find_node(). 2137 */ 2138 for (node = tree->rbt_root;;) { 2139 if (__predict_false(RB_SENTINEL_P(node))) { 2140 return NULL; 2141 } 2142 pve = (struct pv_entry *) 2143 ((uintptr_t)node - offsetof(struct pv_entry, pve_rb)); 2144 if (pve->pve_pte.pte_va == va) { 2145 KASSERT(pve->pve_pte.pte_ptp == ptp); 2146 return pve; 2147 } 2148 node = node->rb_nodes[pve->pve_pte.pte_va < va]; 2149 } 2150 } 2151 2152 /* 2153 * pmap_lookup_pv: look up a non-embedded pv entry for the given pmap 2154 * 2155 * => a PV entry must be known present (doesn't check for existence) 2156 * => pmap must be locked 2157 */ 2158 static struct pv_entry * 2159 pmap_lookup_pv(const struct pmap *pmap, const struct vm_page *ptp, 2160 const struct pmap_page * const old_pp, const vaddr_t va) 2161 { 2162 struct pv_entry *pve; 2163 const rb_tree_t *tree; 2164 2165 KASSERT(mutex_owned(&pmap->pm_lock)); 2166 KASSERT(ptp != NULL || pmap == pmap_kernel()); 2167 2168 /* 2169 * [This mostly deals with the case of process-private pages, i.e. 2170 * anonymous memory allocations or COW.] 2171 * 2172 * If the page is tracked with an embedded entry then the tree 2173 * lookup can be avoided. It's safe to check for this specific 2174 * set of values without pp_lock because both will only ever be 2175 * set together for this pmap. 2176 * 2177 */ 2178 if (atomic_load_relaxed(&old_pp->pp_pte.pte_ptp) == ptp && 2179 atomic_load_relaxed(&old_pp->pp_pte.pte_va) == va) { 2180 return NULL; 2181 } 2182 2183 /* 2184 * [This mostly deals with shared mappings, for example shared libs 2185 * and executables.] 2186 * 2187 * Optimise for pmap_remove_ptes() which works by ascending scan: 2188 * look at the lowest numbered node in the tree first. The tree is 2189 * known non-empty because of the check above. For short lived 2190 * processes where pmap_remove() isn't used much this gets close to 2191 * a 100% hit rate. 
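 *
 * The leftmost node is available directly from the rb tree's internal
 * minimum pointer, and converting it back to a pv_entry is the usual
 * offsetof() adjustment, e.g. (illustrative only, mirroring
 * pmap_treelookup_pv() above):
 *
 *	pve = (struct pv_entry *)((uintptr_t)node -
 *	    offsetof(struct pv_entry, pve_rb));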
2192 */ 2193 tree = (ptp != NULL ? &VM_PAGE_TO_PP(ptp)->pp_rb : &pmap_kernel_rb); 2194 KASSERT(!RB_SENTINEL_P(tree->rbt_root)); 2195 pve = (struct pv_entry *) 2196 ((uintptr_t)tree->rbt_minmax[RB_DIR_LEFT] - 2197 offsetof(struct pv_entry, pve_rb)); 2198 if (__predict_true(pve->pve_pte.pte_va == va)) { 2199 KASSERT(pve->pve_pte.pte_ptp == ptp); 2200 return pve; 2201 } 2202 2203 /* Search the RB tree for the key (uncommon). */ 2204 return pmap_treelookup_pv(pmap, ptp, tree, va); 2205 } 2206 2207 /* 2208 * pmap_enter_pv: enter a mapping onto a pmap_page lst 2209 * 2210 * => pmap must be locked 2211 * => does NOT insert dynamic entries to tree (pmap_enter() does later) 2212 */ 2213 static int 2214 pmap_enter_pv(struct pmap *pmap, struct pmap_page *pp, struct vm_page *ptp, 2215 vaddr_t va, struct pv_entry **new_pve, struct pv_entry **old_pve, 2216 bool *samepage, bool *new_embedded, rb_tree_t *tree) 2217 { 2218 struct pv_entry *pve; 2219 int error; 2220 2221 KASSERT(mutex_owned(&pmap->pm_lock)); 2222 KASSERT(ptp_to_pmap(ptp) == pmap); 2223 KASSERT(ptp == NULL || ptp->uobject != NULL); 2224 KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset); 2225 PMAP_CHECK_PP(pp); 2226 2227 /* 2228 * If entering the same page and it's already tracked with an 2229 * embedded entry, we can avoid the expense below. It's safe 2230 * to check for this very specific set of values without a lock 2231 * because both will only ever be set together for this pmap. 2232 */ 2233 if (atomic_load_relaxed(&pp->pp_pte.pte_ptp) == ptp && 2234 atomic_load_relaxed(&pp->pp_pte.pte_va) == va) { 2235 *samepage = true; 2236 pmap_check_pv(pmap, ptp, pp, va, true); 2237 return 0; 2238 } 2239 2240 /* 2241 * Check for an existing dynamic mapping at this address. If it's 2242 * for the same page, then it will be reused and nothing needs to be 2243 * changed. 2244 */ 2245 *old_pve = pmap_treelookup_pv(pmap, ptp, tree, va); 2246 if (*old_pve != NULL && (*old_pve)->pve_pp == pp) { 2247 *samepage = true; 2248 pmap_check_pv(pmap, ptp, pp, va, true); 2249 return 0; 2250 } 2251 2252 /* 2253 * Need to put a new mapping in place. Grab a spare pv_entry in 2254 * case it's needed; won't know for sure until the lock is taken. 2255 */ 2256 if (pmap->pm_pve == NULL) { 2257 pmap->pm_pve = pmap_alloc_pv(pmap); 2258 } 2259 2260 error = 0; 2261 pmap_check_pv(pmap, ptp, pp, va, false); 2262 mutex_spin_enter(&pp->pp_lock); 2263 if (!pv_pte_embedded(pp)) { 2264 /* 2265 * Embedded PV tracking available - easy. 2266 */ 2267 pp->pp_pte.pte_ptp = ptp; 2268 pp->pp_pte.pte_va = va; 2269 *new_embedded = true; 2270 } else if (__predict_false(pmap->pm_pve == NULL)) { 2271 /* 2272 * No memory. 2273 */ 2274 error = ENOMEM; 2275 } else { 2276 /* 2277 * Install new pv_entry on the page. 
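 * The spare entry stashed in pm_pve (allocated before pp_lock was
 * taken) is consumed here.  It is only linked onto pp_pvlist at this
 * point; insertion into the per-PTP rb tree is left to the caller,
 * once the mapping has actually been entered.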
2278 */ 2279 pve = pmap->pm_pve; 2280 pmap->pm_pve = NULL; 2281 *new_pve = pve; 2282 pve->pve_pte.pte_ptp = ptp; 2283 pve->pve_pte.pte_va = va; 2284 pve->pve_pp = pp; 2285 LIST_INSERT_HEAD(&pp->pp_pvlist, pve, pve_list); 2286 } 2287 mutex_spin_exit(&pp->pp_lock); 2288 if (error == 0) { 2289 pmap_check_pv(pmap, ptp, pp, va, true); 2290 } 2291 2292 return error; 2293 } 2294 2295 /* 2296 * pmap_remove_pv: try to remove a mapping from a pv_list 2297 * 2298 * => pmap must be locked 2299 * => removes dynamic entries from tree and frees them 2300 * => caller should adjust ptp's wire_count and free PTP if needed 2301 */ 2302 static void 2303 pmap_remove_pv(struct pmap *pmap, struct pmap_page *pp, struct vm_page *ptp, 2304 vaddr_t va, struct pv_entry *pve, uint8_t oattrs) 2305 { 2306 rb_tree_t *tree = (ptp != NULL ? 2307 &VM_PAGE_TO_PP(ptp)->pp_rb : &pmap_kernel_rb); 2308 2309 KASSERT(mutex_owned(&pmap->pm_lock)); 2310 KASSERT(ptp_to_pmap(ptp) == pmap); 2311 KASSERT(ptp == NULL || ptp->uobject != NULL); 2312 KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset); 2313 KASSERT(ptp != NULL || pmap == pmap_kernel()); 2314 2315 pmap_check_pv(pmap, ptp, pp, va, true); 2316 2317 if (pve == NULL) { 2318 mutex_spin_enter(&pp->pp_lock); 2319 KASSERT(pp->pp_pte.pte_ptp == ptp); 2320 KASSERT(pp->pp_pte.pte_va == va); 2321 pp->pp_attrs |= oattrs; 2322 pp->pp_pte.pte_ptp = NULL; 2323 pp->pp_pte.pte_va = 0; 2324 mutex_spin_exit(&pp->pp_lock); 2325 } else { 2326 mutex_spin_enter(&pp->pp_lock); 2327 KASSERT(pp->pp_pte.pte_ptp != ptp || 2328 pp->pp_pte.pte_va != va); 2329 KASSERT(pve->pve_pte.pte_ptp == ptp); 2330 KASSERT(pve->pve_pte.pte_va == va); 2331 KASSERT(pve->pve_pp == pp); 2332 pp->pp_attrs |= oattrs; 2333 LIST_REMOVE(pve, pve_list); 2334 mutex_spin_exit(&pp->pp_lock); 2335 2336 KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == pve); 2337 rb_tree_remove_node(tree, pve); 2338 #ifdef DIAGNOSTIC 2339 memset(pve, 0, sizeof(*pve)); 2340 #endif 2341 pmap_free_pv(pmap, pve); 2342 } 2343 2344 KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == NULL); 2345 pmap_check_pv(pmap, ptp, pp, va, false); 2346 } 2347 2348 /* 2349 * p t p f u n c t i o n s 2350 */ 2351 2352 static struct vm_page * 2353 pmap_find_ptp(struct pmap *pmap, vaddr_t va, int level) 2354 { 2355 int lidx = level - 1; 2356 off_t off = ptp_va2o(va, level); 2357 struct vm_page *pg; 2358 2359 KASSERT(mutex_owned(&pmap->pm_lock)); 2360 2361 if (pmap->pm_ptphint[lidx] && off == pmap->pm_ptphint[lidx]->offset) { 2362 KASSERT(pmap->pm_ptphint[lidx]->wire_count > 0); 2363 pg = pmap->pm_ptphint[lidx]; 2364 PMAP_CHECK_PP(VM_PAGE_TO_PP(pg)); 2365 return pg; 2366 } 2367 PMAP_DUMMY_LOCK(pmap); 2368 pg = uvm_pagelookup(&pmap->pm_obj[lidx], off); 2369 PMAP_DUMMY_UNLOCK(pmap); 2370 if (pg != NULL && __predict_false(pg->wire_count == 0)) { 2371 /* This page is queued to be freed - ignore. */ 2372 pg = NULL; 2373 } 2374 if (pg != NULL) { 2375 PMAP_CHECK_PP(VM_PAGE_TO_PP(pg)); 2376 } 2377 pmap->pm_ptphint[lidx] = pg; 2378 return pg; 2379 } 2380 2381 static inline void 2382 pmap_freepage(struct pmap *pmap, struct vm_page *ptp, int level) 2383 { 2384 int lidx; 2385 2386 KASSERT(ptp->wire_count <= 1); 2387 PMAP_CHECK_PP(VM_PAGE_TO_PP(ptp)); 2388 2389 lidx = level - 1; 2390 pmap_stats_update(pmap, -ptp->wire_count, 0); 2391 if (pmap->pm_ptphint[lidx] == ptp) 2392 pmap->pm_ptphint[lidx] = NULL; 2393 ptp->wire_count = 0; 2394 ptp->uanon = NULL; 2395 KASSERT(RB_TREE_MIN(&VM_PAGE_TO_PP(ptp)->pp_rb) == NULL); 2396 2397 /* 2398 * Enqueue the PTP to be freed by pmap_update(). 
We can't remove 2399 * the page from the uvm_object, as that can take further locks 2400 * (intolerable right now because the PTEs are likely mapped in). 2401 * Instead mark the PTP as free and if we bump into it again, we'll 2402 * either ignore or reuse (depending on what's useful at the time). 2403 */ 2404 LIST_INSERT_HEAD(&pmap->pm_gc_ptp, ptp, mdpage.mp_pp.pp_link); 2405 } 2406 2407 static void 2408 pmap_free_ptp(struct pmap *pmap, struct vm_page *ptp, vaddr_t va, 2409 pt_entry_t *ptes, pd_entry_t * const *pdes) 2410 { 2411 unsigned long index; 2412 int level; 2413 vaddr_t invaladdr; 2414 pd_entry_t opde; 2415 2416 KASSERT(pmap != pmap_kernel()); 2417 KASSERT(mutex_owned(&pmap->pm_lock)); 2418 KASSERT(kpreempt_disabled()); 2419 2420 level = 1; 2421 do { 2422 index = pl_i(va, level + 1); 2423 opde = pmap_pte_testset(&pdes[level - 1][index], 0); 2424 2425 /* 2426 * On Xen-amd64 or SVS, we need to sync the top level page 2427 * directory on each CPU. 2428 */ 2429 #if defined(XENPV) && defined(__x86_64__) 2430 if (level == PTP_LEVELS - 1) { 2431 xen_kpm_sync(pmap, index); 2432 } 2433 #elif defined(SVS) 2434 if (svs_enabled && level == PTP_LEVELS - 1) { 2435 svs_pmap_sync(pmap, index); 2436 } 2437 #endif 2438 2439 invaladdr = level == 1 ? (vaddr_t)ptes : 2440 (vaddr_t)pdes[level - 2]; 2441 pmap_tlb_shootdown(pmap, invaladdr + index * PAGE_SIZE, 2442 opde, TLBSHOOT_FREE_PTP); 2443 2444 #if defined(XENPV) 2445 pmap_tlb_shootnow(); 2446 #endif 2447 2448 pmap_freepage(pmap, ptp, level); 2449 if (level < PTP_LEVELS - 1) { 2450 ptp = pmap_find_ptp(pmap, va, level + 1); 2451 ptp->wire_count--; 2452 if (ptp->wire_count > 1) 2453 break; 2454 } 2455 } while (++level < PTP_LEVELS); 2456 pmap_pte_flush(); 2457 } 2458 2459 /* 2460 * pmap_get_ptp: get a PTP (if there isn't one, allocate a new one) 2461 * 2462 * => pmap should NOT be pmap_kernel() 2463 * => pmap should be locked 2464 * => we are not touching any PTEs yet, so they need not be mapped in 2465 */ 2466 static int 2467 pmap_get_ptp(struct pmap *pmap, struct pmap_ptparray *pt, vaddr_t va, 2468 int flags, struct vm_page **resultp) 2469 { 2470 struct vm_page *ptp; 2471 int i, aflags; 2472 struct uvm_object *obj; 2473 voff_t off; 2474 2475 KASSERT(pmap != pmap_kernel()); 2476 KASSERT(mutex_owned(&pmap->pm_lock)); 2477 2478 /* 2479 * Loop through all page table levels allocating a page 2480 * for any level where we don't already have one. 2481 */ 2482 memset(pt, 0, sizeof(*pt)); 2483 aflags = ((flags & PMAP_CANFAIL) ? 0 : UVM_PGA_USERESERVE) | 2484 UVM_PGA_ZERO; 2485 for (i = PTP_LEVELS; i > 1; i--) { 2486 obj = &pmap->pm_obj[i - 2]; 2487 off = ptp_va2o(va, i - 1); 2488 2489 PMAP_DUMMY_LOCK(pmap); 2490 pt->pg[i] = uvm_pagelookup(obj, off); 2491 2492 if (pt->pg[i] == NULL) { 2493 pt->pg[i] = uvm_pagealloc(obj, off, NULL, aflags); 2494 pt->alloced[i] = (pt->pg[i] != NULL); 2495 } else if (pt->pg[i]->wire_count == 0) { 2496 /* This page was queued to be freed; dequeue it. 
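 * (A wire_count of zero means pmap_freepage() already put this PTP
 * on pm_gc_ptp; rather than allocating a fresh page we pull it back
 * off that list and reuse it in place.)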
*/ 2497 LIST_REMOVE(pt->pg[i], mdpage.mp_pp.pp_link); 2498 pt->alloced[i] = true; 2499 } 2500 PMAP_DUMMY_UNLOCK(pmap); 2501 if (pt->pg[i] == NULL) { 2502 pmap_unget_ptp(pmap, pt); 2503 return ENOMEM; 2504 } else if (pt->alloced[i]) { 2505 pt->pg[i]->uanon = (struct vm_anon *)(vaddr_t)~0L; 2506 rb_tree_init(&VM_PAGE_TO_PP(pt->pg[i])->pp_rb, 2507 &pmap_rbtree_ops); 2508 PMAP_CHECK_PP(VM_PAGE_TO_PP(pt->pg[i])); 2509 } 2510 } 2511 ptp = pt->pg[2]; 2512 KASSERT(ptp != NULL); 2513 *resultp = ptp; 2514 pmap->pm_ptphint[0] = ptp; 2515 return 0; 2516 } 2517 2518 /* 2519 * pmap_install_ptp: install any freshly allocated PTPs 2520 * 2521 * => pmap should NOT be pmap_kernel() 2522 * => pmap should be locked 2523 * => PTEs must be mapped 2524 * => preemption must be disabled 2525 */ 2526 static void 2527 pmap_install_ptp(struct pmap *pmap, struct pmap_ptparray *pt, vaddr_t va, 2528 pd_entry_t * const *pdes) 2529 { 2530 struct vm_page *ptp; 2531 unsigned long index; 2532 pd_entry_t *pva; 2533 paddr_t pa; 2534 int i; 2535 2536 KASSERT(pmap != pmap_kernel()); 2537 KASSERT(mutex_owned(&pmap->pm_lock)); 2538 KASSERT(kpreempt_disabled()); 2539 2540 /* 2541 * Now that we have all the pages looked up or allocated, 2542 * loop through again installing any new ones into the tree. 2543 */ 2544 for (i = PTP_LEVELS; i > 1; i--) { 2545 index = pl_i(va, i); 2546 pva = pdes[i - 2]; 2547 2548 if (pmap_valid_entry(pva[index])) { 2549 KASSERT(!pt->alloced[i]); 2550 continue; 2551 } 2552 2553 ptp = pt->pg[i]; 2554 ptp->flags &= ~PG_BUSY; /* never busy */ 2555 ptp->wire_count = 1; 2556 pmap->pm_ptphint[i - 2] = ptp; 2557 pa = VM_PAGE_TO_PHYS(ptp); 2558 pmap_pte_set(&pva[index], (pd_entry_t) 2559 (pmap_pa2pte(pa) | PTE_U | PTE_W | PTE_P)); 2560 2561 /* 2562 * On Xen-amd64 or SVS, we need to sync the top level page 2563 * directory on each CPU. 2564 */ 2565 #if defined(XENPV) && defined(__x86_64__) 2566 if (i == PTP_LEVELS) { 2567 xen_kpm_sync(pmap, index); 2568 } 2569 #elif defined(SVS) 2570 if (svs_enabled && i == PTP_LEVELS) { 2571 svs_pmap_sync(pmap, index); 2572 } 2573 #endif 2574 2575 pmap_pte_flush(); 2576 pmap_stats_update(pmap, 1, 0); 2577 2578 /* 2579 * If we're not in the top level, increase the 2580 * wire count of the parent page. 2581 */ 2582 if (i < PTP_LEVELS) { 2583 pt->pg[i + 1]->wire_count++; 2584 } 2585 } 2586 } 2587 2588 /* 2589 * pmap_unget_ptp: free unusued PTPs 2590 * 2591 * => pmap should NOT be pmap_kernel() 2592 * => pmap should be locked 2593 */ 2594 static void 2595 pmap_unget_ptp(struct pmap *pmap, struct pmap_ptparray *pt) 2596 { 2597 int i; 2598 2599 KASSERT(pmap != pmap_kernel()); 2600 KASSERT(mutex_owned(&pmap->pm_lock)); 2601 2602 for (i = PTP_LEVELS; i > 1; i--) { 2603 if (!pt->alloced[i]) { 2604 continue; 2605 } 2606 KASSERT(pt->pg[i]->wire_count == 0); 2607 PMAP_CHECK_PP(VM_PAGE_TO_PP(pt->pg[i])); 2608 pmap_freepage(pmap, pt->pg[i], i - 1); 2609 } 2610 } 2611 2612 /* 2613 * p m a p l i f e c y c l e f u n c t i o n s 2614 */ 2615 2616 /* 2617 * pmap_pdp_init: constructor a new PDP. 2618 */ 2619 static void 2620 pmap_pdp_init(pd_entry_t *pdir) 2621 { 2622 paddr_t pdirpa = 0; 2623 vaddr_t object; 2624 int i; 2625 2626 #if !defined(XENPV) || !defined(__x86_64__) 2627 int npde; 2628 #endif 2629 #ifdef XENPV 2630 int s; 2631 #endif 2632 2633 memset(PAGE_ALIGNED(pdir), 0, PDP_SIZE * PAGE_SIZE); 2634 2635 /* 2636 * NOTE: This is all done unlocked, but we will check afterwards 2637 * if we have raced with pmap_growkernel(). 
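 * (The check lives in pmap_ctor(): after this routine returns it
 * takes pmaps_lock and re-reads the slot that should now be the last
 * kernel PDE; if pmap_growkernel() ran in the meantime that slot was
 * never copied and still reads as zero, in which case the whole
 * initialization is simply redone.)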
2638 */ 2639 2640 #if defined(XENPV) && defined(__x86_64__) 2641 /* Fetch the physical address of the page directory */ 2642 (void)pmap_extract(pmap_kernel(), (vaddr_t)pdir, &pdirpa); 2643 2644 /* 2645 * This pdir will NEVER be active in kernel mode, so mark 2646 * recursive entry invalid. 2647 */ 2648 pdir[PDIR_SLOT_PTE] = pmap_pa2pte(pdirpa); 2649 2650 /* 2651 * PDP constructed this way won't be for the kernel, hence we 2652 * don't put kernel mappings on Xen. 2653 * 2654 * But we need to make pmap_create() happy, so put a dummy 2655 * (without PTE_P) value at the right place. 2656 */ 2657 pdir[PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1] - 1] = 2658 (pd_entry_t)-1 & PTE_FRAME; 2659 #else /* XENPV && __x86_64__*/ 2660 object = (vaddr_t)pdir; 2661 for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) { 2662 /* Fetch the physical address of the page directory */ 2663 (void)pmap_extract(pmap_kernel(), object, &pdirpa); 2664 2665 /* Put in recursive PDE to map the PTEs */ 2666 pdir[PDIR_SLOT_PTE + i] = pmap_pa2pte(pdirpa) | PTE_P | 2667 pmap_pg_nx; 2668 #ifndef XENPV 2669 pdir[PDIR_SLOT_PTE + i] |= PTE_W; 2670 #endif 2671 } 2672 2673 /* Copy the kernel's top level PDE */ 2674 npde = nkptp[PTP_LEVELS - 1]; 2675 2676 memcpy(&pdir[PDIR_SLOT_KERN], &PDP_BASE[PDIR_SLOT_KERN], 2677 npde * sizeof(pd_entry_t)); 2678 2679 if (VM_MIN_KERNEL_ADDRESS != KERNBASE) { 2680 int idx = pl_i(KERNBASE, PTP_LEVELS); 2681 pdir[idx] = PDP_BASE[idx]; 2682 } 2683 2684 #ifdef __HAVE_PCPU_AREA 2685 pdir[PDIR_SLOT_PCPU] = PDP_BASE[PDIR_SLOT_PCPU]; 2686 #endif 2687 #ifdef __HAVE_DIRECT_MAP 2688 slotspace_copy(SLAREA_DMAP, pdir, PDP_BASE); 2689 #endif 2690 #ifdef KASAN 2691 slotspace_copy(SLAREA_ASAN, pdir, PDP_BASE); 2692 #endif 2693 #ifdef KMSAN 2694 slotspace_copy(SLAREA_MSAN, pdir, PDP_BASE); 2695 #endif 2696 #endif /* XENPV && __x86_64__*/ 2697 2698 #ifdef XENPV 2699 s = splvm(); 2700 object = (vaddr_t)pdir; 2701 pmap_protect(pmap_kernel(), object, object + (PAGE_SIZE * PDP_SIZE), 2702 VM_PROT_READ); 2703 pmap_update(pmap_kernel()); 2704 for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) { 2705 /* 2706 * pin as L2/L4 page, we have to do the page with the 2707 * PDIR_SLOT_PTE entries last 2708 */ 2709 #ifdef PAE 2710 if (i == l2tol3(PDIR_SLOT_PTE)) 2711 continue; 2712 #endif 2713 2714 (void) pmap_extract(pmap_kernel(), object, &pdirpa); 2715 #ifdef __x86_64__ 2716 xpq_queue_pin_l4_table(xpmap_ptom_masked(pdirpa)); 2717 #else 2718 xpq_queue_pin_l2_table(xpmap_ptom_masked(pdirpa)); 2719 #endif 2720 } 2721 #ifdef PAE 2722 object = ((vaddr_t)pdir) + PAGE_SIZE * l2tol3(PDIR_SLOT_PTE); 2723 (void)pmap_extract(pmap_kernel(), object, &pdirpa); 2724 xpq_queue_pin_l2_table(xpmap_ptom_masked(pdirpa)); 2725 #endif 2726 splx(s); 2727 #endif /* XENPV */ 2728 } 2729 2730 /* 2731 * pmap_pdp_fini: destructor for the PDPs. 2732 */ 2733 static void 2734 pmap_pdp_fini(pd_entry_t *pdir) 2735 { 2736 #ifdef XENPV 2737 paddr_t pdirpa = 0; /* XXX: GCC */ 2738 vaddr_t object = (vaddr_t)pdir; 2739 int i; 2740 int s = splvm(); 2741 pt_entry_t *pte; 2742 2743 for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) { 2744 /* fetch the physical address of the page directory. 
*/
2745 (void) pmap_extract(pmap_kernel(), object, &pdirpa);
2746 /* unpin page table */
2747 xpq_queue_unpin_table(xpmap_ptom_masked(pdirpa));
2748 }
2749 object = (vaddr_t)pdir;
2750 for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) {
2751 /* Set page RW again */
2752 pte = kvtopte(object);
2753 pmap_pte_set(pte, *pte | PTE_W);
2754 xen_bcast_invlpg((vaddr_t)object);
2755 }
2756 splx(s);
2757 #endif /* XENPV */
2758 }
2759
2760 #ifdef PAE
2761 static void *
2762 pmap_pdp_alloc(struct pool *pp, int flags)
2763 {
2764 return (void *)uvm_km_alloc(kernel_map,
2765 PAGE_SIZE * PDP_SIZE, PAGE_SIZE * PDP_SIZE,
2766 ((flags & PR_WAITOK) ? 0 : UVM_KMF_NOWAIT | UVM_KMF_TRYLOCK) |
2767 UVM_KMF_WIRED);
2768 }
2769
2770 static void
2771 pmap_pdp_free(struct pool *pp, void *v)
2772 {
2773 uvm_km_free(kernel_map, (vaddr_t)v, PAGE_SIZE * PDP_SIZE,
2774 UVM_KMF_WIRED);
2775 }
2776 #endif /* PAE */
2777
2778 /*
2779 * pmap_ctor: constructor for the pmap cache.
2780 */
2781 static int
2782 pmap_ctor(void *arg, void *obj, int flags)
2783 {
2784 struct pmap *pmap = obj;
2785 pt_entry_t p;
2786 int i;
2787
2788 KASSERT((flags & PR_WAITOK) != 0);
2789
2790 mutex_init(&pmap->pm_lock, MUTEX_DEFAULT, IPL_NONE);
2791 rw_init(&pmap->pm_dummy_lock);
2792 kcpuset_create(&pmap->pm_cpus, true);
2793 kcpuset_create(&pmap->pm_kernel_cpus, true);
2794 #ifdef XENPV
2795 kcpuset_create(&pmap->pm_xen_ptp_cpus, true);
2796 #endif
2797 LIST_INIT(&pmap->pm_gc_ptp);
2798 pmap->pm_pve = NULL;
2799 LIST_INIT(&pmap->pm_pvp_full);
2800 LIST_INIT(&pmap->pm_pvp_part);
2801 LIST_INIT(&pmap->pm_pvp_empty);
2802
2803 /* allocate and init PDP */
2804 pmap->pm_pdir = pool_get(&pmap_pdp_pool, PR_WAITOK);
2805
2806 for (;;) {
2807 pmap_pdp_init(pmap->pm_pdir);
2808 mutex_enter(&pmaps_lock);
2809 p = pmap->pm_pdir[PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1] - 1];
2810 if (__predict_true(p != 0)) {
2811 break;
2812 }
2813 mutex_exit(&pmaps_lock);
2814 }
2815
2816 for (i = 0; i < PDP_SIZE; i++)
2817 pmap->pm_pdirpa[i] =
2818 pmap_pte2pa(pmap->pm_pdir[PDIR_SLOT_PTE + i]);
2819
2820 LIST_INSERT_HEAD(&pmaps, pmap, pm_list);
2821 mutex_exit(&pmaps_lock);
2822
2823 return 0;
2824 }
2825
2826 /*
2827 * pmap_dtor: destructor for the pmap cache.
2828 */
2829 static void
2830 pmap_dtor(void *arg, void *obj)
2831 {
2832 struct pmap *pmap = obj;
2833
2834 mutex_enter(&pmaps_lock);
2835 LIST_REMOVE(pmap, pm_list);
2836 mutex_exit(&pmaps_lock);
2837
2838 pmap_pdp_fini(pmap->pm_pdir);
2839 pool_put(&pmap_pdp_pool, pmap->pm_pdir);
2840 mutex_destroy(&pmap->pm_lock);
2841 rw_destroy(&pmap->pm_dummy_lock);
2842 kcpuset_destroy(pmap->pm_cpus);
2843 kcpuset_destroy(pmap->pm_kernel_cpus);
2844 #ifdef XENPV
2845 kcpuset_destroy(pmap->pm_xen_ptp_cpus);
2846 #endif
2847 }
2848
2849 /*
2850 * pmap_create: create a pmap object.
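 *
 * Most of the heavyweight setup (PDP allocation and initialization,
 * kcpuset and lock creation) happens in pmap_ctor()/pmap_dtor() and is
 * therefore cached across create/destroy cycles by pool_cache(9); only
 * the per-use state is reset here.  Callers simply pair this with
 * pmap_destroy(), e.g. (illustrative only):
 *
 *	struct pmap *pm = pmap_create();
 *	...use pm, e.g. via pmap_enter()/pmap_remove()...
 *	pmap_destroy(pm);	(drops the reference taken here)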
2851 */ 2852 struct pmap * 2853 pmap_create(void) 2854 { 2855 struct pmap *pmap; 2856 int i; 2857 2858 pmap = pool_cache_get(&pmap_cache, PR_WAITOK); 2859 2860 /* init uvm_object */ 2861 for (i = 0; i < PTP_LEVELS - 1; i++) { 2862 uvm_obj_init(&pmap->pm_obj[i], &pmap_pager, false, 1); 2863 uvm_obj_setlock(&pmap->pm_obj[i], &pmap->pm_dummy_lock); 2864 pmap->pm_ptphint[i] = NULL; 2865 } 2866 pmap->pm_stats.wired_count = 0; 2867 /* count the PDP allocd below */ 2868 pmap->pm_stats.resident_count = PDP_SIZE; 2869 #if !defined(__x86_64__) 2870 pmap->pm_hiexec = 0; 2871 #endif 2872 2873 /* Used by NVMM and Xen */ 2874 pmap->pm_enter = NULL; 2875 pmap->pm_extract = NULL; 2876 pmap->pm_remove = NULL; 2877 pmap->pm_sync_pv = NULL; 2878 pmap->pm_pp_remove_ent = NULL; 2879 pmap->pm_write_protect = NULL; 2880 pmap->pm_unwire = NULL; 2881 pmap->pm_tlb_flush = NULL; 2882 pmap->pm_data = NULL; 2883 2884 /* init the LDT */ 2885 pmap->pm_ldt = NULL; 2886 pmap->pm_ldt_sel = GSYSSEL(GLDT_SEL, SEL_KPL); 2887 2888 return (pmap); 2889 } 2890 2891 /* 2892 * pmap_check_ptps: verify that none of the pmap's page table objects 2893 * have any pages allocated to them. 2894 */ 2895 static void 2896 pmap_check_ptps(struct pmap *pmap) 2897 { 2898 int i; 2899 2900 for (i = 0; i < PTP_LEVELS - 1; i++) { 2901 KASSERTMSG(pmap->pm_obj[i].uo_npages == 0, 2902 "pmap %p level %d still has %d pages", 2903 pmap, i, (int)pmap->pm_obj[i].uo_npages); 2904 } 2905 } 2906 2907 static void 2908 pmap_check_inuse(struct pmap *pmap) 2909 { 2910 #ifdef DEBUG 2911 CPU_INFO_ITERATOR cii; 2912 struct cpu_info *ci; 2913 2914 for (CPU_INFO_FOREACH(cii, ci)) { 2915 if (ci->ci_pmap == pmap) 2916 panic("destroying pmap being used"); 2917 #if defined(XENPV) && defined(__x86_64__) 2918 for (int i = 0; i < PDIR_SLOT_USERLIM; i++) { 2919 if (pmap->pm_pdir[i] != 0 && 2920 ci->ci_kpm_pdir[i] == pmap->pm_pdir[i]) { 2921 printf("pmap_destroy(%p) pmap_kernel %p " 2922 "curcpu %d cpu %d ci_pmap %p " 2923 "ci->ci_kpm_pdir[%d]=%" PRIx64 2924 " pmap->pm_pdir[%d]=%" PRIx64 "\n", 2925 pmap, pmap_kernel(), curcpu()->ci_index, 2926 ci->ci_index, ci->ci_pmap, 2927 i, ci->ci_kpm_pdir[i], 2928 i, pmap->pm_pdir[i]); 2929 panic("%s: used pmap", __func__); 2930 } 2931 } 2932 #endif 2933 } 2934 #endif /* DEBUG */ 2935 } 2936 2937 /* 2938 * pmap_destroy: drop reference count on pmap. free pmap if reference 2939 * count goes to zero. 2940 * 2941 * => we can be called from pmap_unmap_ptes() with a different, unrelated 2942 * pmap's lock held. be careful! 2943 */ 2944 void 2945 pmap_destroy(struct pmap *pmap) 2946 { 2947 int i; 2948 2949 /* 2950 * drop reference count and verify not in use. 2951 */ 2952 2953 if (atomic_dec_uint_nv(&pmap->pm_obj[0].uo_refs) > 0) { 2954 return; 2955 } 2956 pmap_check_inuse(pmap); 2957 2958 /* 2959 * handle any deferred frees. 2960 */ 2961 2962 mutex_enter(&pmap->pm_lock); 2963 if (pmap->pm_pve != NULL) { 2964 pmap_free_pv(pmap, pmap->pm_pve); 2965 pmap->pm_pve = NULL; 2966 } 2967 pmap_drain_pv(pmap); 2968 mutex_exit(&pmap->pm_lock); 2969 pmap_update(pmap); 2970 2971 /* 2972 * Reference count is zero, free pmap resources and then free pmap. 2973 */ 2974 2975 pmap_check_ptps(pmap); 2976 KASSERT(LIST_EMPTY(&pmap->pm_gc_ptp)); 2977 2978 #ifdef USER_LDT 2979 if (pmap->pm_ldt != NULL) { 2980 /* 2981 * No need to switch the LDT; this address space is gone, 2982 * nothing is using it. 2983 * 2984 * No need to lock the pmap for ldt_free (or anything else), 2985 * we're the last one to use it. 2986 */ 2987 /* XXXAD can't take cpu_lock here - fix soon. 
*/ 2988 mutex_enter(&cpu_lock); 2989 ldt_free(pmap->pm_ldt_sel); 2990 mutex_exit(&cpu_lock); 2991 uvm_km_free(kernel_map, (vaddr_t)pmap->pm_ldt, 2992 MAX_USERLDT_SIZE, UVM_KMF_WIRED); 2993 } 2994 #endif 2995 2996 for (i = 0; i < PTP_LEVELS - 1; i++) { 2997 uvm_obj_destroy(&pmap->pm_obj[i], false); 2998 } 2999 kcpuset_zero(pmap->pm_cpus); 3000 kcpuset_zero(pmap->pm_kernel_cpus); 3001 #ifdef XENPV 3002 kcpuset_zero(pmap->pm_xen_ptp_cpus); 3003 #endif 3004 3005 KASSERT(LIST_EMPTY(&pmap->pm_pvp_full)); 3006 KASSERT(LIST_EMPTY(&pmap->pm_pvp_part)); 3007 KASSERT(LIST_EMPTY(&pmap->pm_pvp_empty)); 3008 3009 pmap_check_ptps(pmap); 3010 if (__predict_false(pmap->pm_enter != NULL)) { 3011 /* XXX make this a different cache */ 3012 pool_cache_destruct_object(&pmap_cache, pmap); 3013 } else { 3014 pool_cache_put(&pmap_cache, pmap); 3015 } 3016 } 3017 3018 /* 3019 * pmap_zap_ptp: clear out an entire PTP without modifying PTEs 3020 * 3021 * => caller must hold pmap's lock 3022 * => PTP must be mapped into KVA 3023 * => must be called with kernel preemption disabled 3024 * => does as little work as possible 3025 */ 3026 static void 3027 pmap_zap_ptp(struct pmap *pmap, struct vm_page *ptp, pt_entry_t *pte, 3028 vaddr_t startva, vaddr_t blkendva) 3029 { 3030 #ifndef XENPV 3031 struct pv_entry *pve; 3032 struct vm_page *pg; 3033 struct pmap_page *pp; 3034 pt_entry_t opte; 3035 rb_tree_t *tree; 3036 vaddr_t va; 3037 int wired; 3038 uint8_t oattrs; 3039 u_int cnt; 3040 3041 KASSERT(mutex_owned(&pmap->pm_lock)); 3042 KASSERT(kpreempt_disabled()); 3043 KASSERT(pmap != pmap_kernel()); 3044 KASSERT(ptp->wire_count > 1); 3045 KASSERT(ptp->wire_count - 1 <= PAGE_SIZE / sizeof(pt_entry_t)); 3046 3047 /* 3048 * Start at the lowest entered VA, and scan until there are no more 3049 * PTEs in the PTPs. 3050 */ 3051 tree = &VM_PAGE_TO_PP(ptp)->pp_rb; 3052 pve = RB_TREE_MIN(tree); 3053 wired = 0; 3054 va = (vaddr_t)ptp->uanon; 3055 pte += ((va - startva) >> PAGE_SHIFT); 3056 3057 for (cnt = ptp->wire_count; cnt > 1; pte++, va += PAGE_SIZE) { 3058 /* 3059 * No need for an atomic to clear the PTE. Nothing else can 3060 * see the address space any more and speculative access (if 3061 * possible) won't modify. Therefore there's no need to 3062 * track the accessed/dirty bits. 3063 */ 3064 opte = *pte; 3065 if (!pmap_valid_entry(opte)) { 3066 continue; 3067 } 3068 3069 /* 3070 * Count the PTE. If it's not for a managed mapping 3071 * there's noting more to do. 3072 */ 3073 cnt--; 3074 wired -= (opte & PTE_WIRED); 3075 if ((opte & PTE_PVLIST) == 0) { 3076 #ifndef DOM0OPS 3077 KASSERTMSG((PHYS_TO_VM_PAGE(pmap_pte2pa(opte)) == NULL), 3078 "managed page without PTE_PVLIST for %#" 3079 PRIxVADDR, va); 3080 KASSERTMSG((pmap_pv_tracked(pmap_pte2pa(opte)) == NULL), 3081 "pv-tracked page without PTE_PVLIST for %#" 3082 PRIxVADDR, va); 3083 #endif 3084 KASSERT(pmap_treelookup_pv(pmap, ptp, (ptp != NULL ? 3085 &VM_PAGE_TO_PP(ptp)->pp_rb : &pmap_kernel_rb), 3086 va) == NULL); 3087 continue; 3088 } 3089 3090 /* 3091 * "pve" now points to the lowest (by VA) dynamic PV entry 3092 * in the PTP. If it's for this VA, take advantage of it to 3093 * avoid calling PHYS_TO_VM_PAGE(). Avoid modifying the RB 3094 * tree by skipping to the next VA in the tree whenever 3095 * there is a match here. The tree will be cleared out in 3096 * one pass before return to pmap_remove_all(). 
3097 */ 3098 oattrs = pmap_pte_to_pp_attrs(opte); 3099 if (pve != NULL && pve->pve_pte.pte_va == va) { 3100 pp = pve->pve_pp; 3101 KASSERT(pve->pve_pte.pte_ptp == ptp); 3102 KASSERT(pp->pp_pte.pte_ptp != ptp || 3103 pp->pp_pte.pte_va != va); 3104 mutex_spin_enter(&pp->pp_lock); 3105 pp->pp_attrs |= oattrs; 3106 LIST_REMOVE(pve, pve_list); 3107 mutex_spin_exit(&pp->pp_lock); 3108 3109 /* 3110 * pve won't be touched again until pmap_drain_pv(), 3111 * so it's still safe to traverse the tree. 3112 */ 3113 pmap_free_pv(pmap, pve); 3114 pve = RB_TREE_NEXT(tree, pve); 3115 continue; 3116 } 3117 3118 /* 3119 * No entry in the tree so it must be embedded. Look up the 3120 * page and cancel the embedded entry. 3121 */ 3122 if ((pg = PHYS_TO_VM_PAGE(pmap_pte2pa(opte))) != NULL) { 3123 pp = VM_PAGE_TO_PP(pg); 3124 } else if ((pp = pmap_pv_tracked(pmap_pte2pa(opte))) == NULL) { 3125 paddr_t pa = pmap_pte2pa(opte); 3126 panic("%s: PTE_PVLIST with pv-untracked page" 3127 " va = %#"PRIxVADDR"pa = %#"PRIxPADDR 3128 "(%#"PRIxPADDR")", __func__, va, pa, atop(pa)); 3129 } 3130 mutex_spin_enter(&pp->pp_lock); 3131 KASSERT(pp->pp_pte.pte_ptp == ptp); 3132 KASSERT(pp->pp_pte.pte_va == va); 3133 pp->pp_attrs |= oattrs; 3134 pp->pp_pte.pte_ptp = NULL; 3135 pp->pp_pte.pte_va = 0; 3136 mutex_spin_exit(&pp->pp_lock); 3137 } 3138 3139 /* PTP now empty - adjust the tree & stats to match. */ 3140 pmap_stats_update(pmap, -(ptp->wire_count - 1), wired / PTE_WIRED); 3141 ptp->wire_count = 1; 3142 #ifdef DIAGNOSTIC 3143 rb_tree_init(tree, &pmap_rbtree_ops); 3144 #endif 3145 #else /* !XENPV */ 3146 /* 3147 * XXXAD For XEN, it's not clear to me that we can do this, because 3148 * I guess the hypervisor keeps track of PTEs too. 3149 */ 3150 pmap_remove_ptes(pmap, ptp, (vaddr_t)pte, startva, blkendva); 3151 #endif /* !XENPV */ 3152 } 3153 3154 /* 3155 * pmap_remove_all: remove all mappings from pmap in bulk. 3156 * 3157 * Ordinarily when removing mappings it's important to hold the UVM object's 3158 * lock, so that pages do not gain a new identity while retaining stale TLB 3159 * entries (the same lock hold covers both pmap_remove() and pmap_update()). 3160 * Here it's known that the address space is no longer visible to any user 3161 * process, so we don't need to worry about that. 3162 */ 3163 bool 3164 pmap_remove_all(struct pmap *pmap) 3165 { 3166 struct vm_page *ptps[32]; 3167 vaddr_t va, blkendva; 3168 struct pmap *pmap2; 3169 pt_entry_t *ptes; 3170 pd_entry_t pde __diagused; 3171 pd_entry_t * const *pdes; 3172 int lvl __diagused, i, n; 3173 3174 /* XXX Can't handle EPT just yet. */ 3175 if (pmap->pm_remove != NULL) { 3176 return false; 3177 } 3178 3179 for (;;) { 3180 /* Fetch a block of PTPs from tree. */ 3181 mutex_enter(&pmap->pm_lock); 3182 n = radix_tree_gang_lookup_node(&pmap->pm_obj[0].uo_pages, 0, 3183 (void **)ptps, __arraycount(ptps), false); 3184 if (n == 0) { 3185 mutex_exit(&pmap->pm_lock); 3186 break; 3187 } 3188 3189 /* Remove all mappings in the set of PTPs. */ 3190 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); 3191 for (i = 0; i < n; i++) { 3192 if (ptps[i]->wire_count == 0) { 3193 /* It's dead: pmap_update() will expunge. */ 3194 continue; 3195 } 3196 3197 /* Determine range of block. */ 3198 va = ptps[i]->offset * PAGE_SIZE / sizeof(pt_entry_t); 3199 blkendva = x86_round_pdr(va + 1); 3200 3201 /* Make sure everything squares up... */ 3202 KASSERT(pmap_pdes_valid(va, pdes, &pde, &lvl)); 3203 KASSERT(lvl == 1); 3204 KASSERT(pmap_find_ptp(pmap, va, 1) == ptps[i]); 3205 3206 /* Zap! 
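 *  Strip every remaining mapping out of this PTP in a single
 * pass; pmap_zap_ptp() folds each mapping's referenced/modified
 * state into pp_attrs and unhooks its PV entry, leaving the PTP
 * ready to be freed just below.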
*/ 3207 pmap_zap_ptp(pmap, ptps[i], &ptes[pl1_i(va)], va, 3208 blkendva); 3209 3210 /* PTP should now be unused - free it. */ 3211 KASSERT(ptps[i]->wire_count == 1); 3212 pmap_free_ptp(pmap, ptps[i], va, ptes, pdes); 3213 } 3214 pmap_unmap_ptes(pmap, pmap2); 3215 pmap_drain_pv(pmap); 3216 pmap_tlb_shootdown(pmap, -1L, 0, TLBSHOOT_REMOVE_ALL); 3217 mutex_exit(&pmap->pm_lock); 3218 3219 /* Process deferred frees. */ 3220 pmap_update(pmap); 3221 3222 /* A breathing point. */ 3223 preempt_point(); 3224 } 3225 3226 /* Verify that the pmap is now completely empty. */ 3227 pmap_check_ptps(pmap); 3228 KASSERTMSG(pmap->pm_stats.resident_count == PDP_SIZE, 3229 "pmap %p not empty", pmap); 3230 3231 return true; 3232 } 3233 3234 #if defined(PMAP_FORK) 3235 /* 3236 * pmap_fork: perform any necessary data structure manipulation when 3237 * a VM space is forked. 3238 */ 3239 void 3240 pmap_fork(struct pmap *pmap1, struct pmap *pmap2) 3241 { 3242 #ifdef USER_LDT 3243 union descriptor *new_ldt; 3244 int sel; 3245 3246 if (__predict_true(pmap1->pm_ldt == NULL)) { 3247 return; 3248 } 3249 3250 /* 3251 * Copy the LDT into the new process. 3252 * 3253 * Read pmap1's ldt pointer unlocked; if it changes behind our back 3254 * we'll retry. This will starve if there's a stream of LDT changes 3255 * in another thread but that should not happen. 3256 */ 3257 3258 retry: 3259 if (pmap1->pm_ldt != NULL) { 3260 /* Allocate space for the new process's LDT */ 3261 new_ldt = (union descriptor *)uvm_km_alloc(kernel_map, 3262 MAX_USERLDT_SIZE, 0, UVM_KMF_WIRED); 3263 if (new_ldt == NULL) { 3264 printf("WARNING: %s: unable to allocate LDT space\n", 3265 __func__); 3266 return; 3267 } 3268 mutex_enter(&cpu_lock); 3269 /* Get a GDT slot for it */ 3270 sel = ldt_alloc(new_ldt, MAX_USERLDT_SIZE); 3271 if (sel == -1) { 3272 mutex_exit(&cpu_lock); 3273 uvm_km_free(kernel_map, (vaddr_t)new_ldt, 3274 MAX_USERLDT_SIZE, UVM_KMF_WIRED); 3275 printf("WARNING: %s: unable to allocate LDT selector\n", 3276 __func__); 3277 return; 3278 } 3279 } else { 3280 /* Wasn't anything there after all. */ 3281 new_ldt = NULL; 3282 sel = -1; 3283 mutex_enter(&cpu_lock); 3284 } 3285 3286 /* 3287 * Now that we have cpu_lock, ensure the LDT status is the same. 3288 */ 3289 if (pmap1->pm_ldt != NULL) { 3290 if (new_ldt == NULL) { 3291 /* A wild LDT just appeared. */ 3292 mutex_exit(&cpu_lock); 3293 goto retry; 3294 } 3295 3296 /* Copy the LDT data and install it in pmap2 */ 3297 memcpy(new_ldt, pmap1->pm_ldt, MAX_USERLDT_SIZE); 3298 pmap2->pm_ldt = new_ldt; 3299 pmap2->pm_ldt_sel = sel; 3300 mutex_exit(&cpu_lock); 3301 } else { 3302 if (new_ldt != NULL) { 3303 /* The LDT disappeared, drop what we did. */ 3304 ldt_free(sel); 3305 mutex_exit(&cpu_lock); 3306 uvm_km_free(kernel_map, (vaddr_t)new_ldt, 3307 MAX_USERLDT_SIZE, UVM_KMF_WIRED); 3308 return; 3309 } 3310 3311 /* We're good, just leave. */ 3312 mutex_exit(&cpu_lock); 3313 } 3314 #endif /* USER_LDT */ 3315 } 3316 #endif /* PMAP_FORK */ 3317 3318 #ifdef USER_LDT 3319 3320 /* 3321 * pmap_ldt_xcall: cross call used by pmap_ldt_sync. if the named pmap 3322 * is active, reload LDTR. 3323 */ 3324 static void 3325 pmap_ldt_xcall(void *arg1, void *arg2) 3326 { 3327 struct pmap *pm; 3328 3329 kpreempt_disable(); 3330 pm = arg1; 3331 if (curcpu()->ci_pmap == pm) { 3332 #if defined(SVS) 3333 if (svs_enabled) { 3334 svs_ldt_sync(pm); 3335 } else 3336 #endif 3337 lldt(pm->pm_ldt_sel); 3338 } 3339 kpreempt_enable(); 3340 } 3341 3342 /* 3343 * pmap_ldt_sync: LDT selector for the named pmap is changing. 
swap 3344 * in the new selector on all CPUs. 3345 */ 3346 void 3347 pmap_ldt_sync(struct pmap *pm) 3348 { 3349 uint64_t where; 3350 3351 KASSERT(mutex_owned(&cpu_lock)); 3352 3353 pmap_ldt_evcnt.ev_count++; 3354 where = xc_broadcast(0, pmap_ldt_xcall, pm, NULL); 3355 xc_wait(where); 3356 } 3357 3358 /* 3359 * pmap_ldt_cleanup: if the pmap has a local LDT, deallocate it, and 3360 * restore the default. 3361 */ 3362 void 3363 pmap_ldt_cleanup(struct lwp *l) 3364 { 3365 pmap_t pmap = l->l_proc->p_vmspace->vm_map.pmap; 3366 union descriptor *ldt; 3367 int sel; 3368 3369 if (__predict_true(pmap->pm_ldt == NULL)) { 3370 return; 3371 } 3372 3373 mutex_enter(&cpu_lock); 3374 if (pmap->pm_ldt != NULL) { 3375 sel = pmap->pm_ldt_sel; 3376 ldt = pmap->pm_ldt; 3377 pmap->pm_ldt_sel = GSYSSEL(GLDT_SEL, SEL_KPL); 3378 pmap->pm_ldt = NULL; 3379 pmap_ldt_sync(pmap); 3380 ldt_free(sel); 3381 uvm_km_free(kernel_map, (vaddr_t)ldt, MAX_USERLDT_SIZE, 3382 UVM_KMF_WIRED); 3383 } 3384 mutex_exit(&cpu_lock); 3385 } 3386 #endif /* USER_LDT */ 3387 3388 /* 3389 * pmap_activate: activate a process' pmap 3390 * 3391 * => must be called with kernel preemption disabled 3392 * => if lwp is the curlwp, then set ci_want_pmapload so that 3393 * actual MMU context switch will be done by pmap_load() later 3394 */ 3395 void 3396 pmap_activate(struct lwp *l) 3397 { 3398 struct cpu_info *ci; 3399 struct pmap *pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map); 3400 3401 KASSERT(kpreempt_disabled()); 3402 3403 ci = curcpu(); 3404 3405 if (l != ci->ci_curlwp) 3406 return; 3407 3408 KASSERT(ci->ci_want_pmapload == 0); 3409 KASSERT(ci->ci_tlbstate != TLBSTATE_VALID); 3410 3411 /* 3412 * no need to switch to kernel vmspace because 3413 * it's a subset of any vmspace. 3414 */ 3415 3416 if (pmap == pmap_kernel()) { 3417 ci->ci_want_pmapload = 0; 3418 return; 3419 } 3420 3421 ci->ci_want_pmapload = 1; 3422 } 3423 3424 #if defined(XENPV) && defined(__x86_64__) 3425 #define KASSERT_PDIRPA(pmap) \ 3426 KASSERT(pmap_pdirpa(pmap, 0) == ci->ci_xen_current_user_pgd || \ 3427 pmap == pmap_kernel()) 3428 #elif defined(PAE) 3429 #define KASSERT_PDIRPA(pmap) \ 3430 KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(ci->ci_pae_l3_pdir[0])) 3431 #elif !defined(XENPV) 3432 #define KASSERT_PDIRPA(pmap) \ 3433 KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(rcr3())) 3434 #else 3435 #define KASSERT_PDIRPA(pmap) KASSERT(true) /* nothing to do */ 3436 #endif 3437 3438 /* 3439 * pmap_reactivate: try to regain reference to the pmap. 3440 * 3441 * => Must be called with kernel preemption disabled. 3442 */ 3443 static void 3444 pmap_reactivate(struct pmap *pmap) 3445 { 3446 struct cpu_info * const ci = curcpu(); 3447 const cpuid_t cid = cpu_index(ci); 3448 3449 KASSERT(kpreempt_disabled()); 3450 KASSERT_PDIRPA(pmap); 3451 3452 /* 3453 * If we still have a lazy reference to this pmap, we can assume 3454 * that there was no TLB shootdown for this pmap in the meantime. 3455 * 3456 * The order of events here is important as we must synchronize 3457 * with TLB shootdown interrupts. Declare interest in invalidations 3458 * (TLBSTATE_VALID) and then check the CPU set, which the IPIs can 3459 * change only when the state is TLBSTATE_LAZY. 3460 */ 3461 3462 ci->ci_tlbstate = TLBSTATE_VALID; 3463 KASSERT(kcpuset_isset(pmap->pm_kernel_cpus, cid)); 3464 3465 if (__predict_true(kcpuset_isset(pmap->pm_cpus, cid))) { 3466 /* We have the reference, state is valid. */ 3467 } else { 3468 /* 3469 * Must reload the TLB, pmap has been changed during 3470 * deactivated. 
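 *
 * In other words: pmap_deactivate() left this pmap loaded but marked
 * the CPU TLBSTATE_LAZY, so a shootdown arriving in the meantime may
 * simply have dropped this CPU from pm_cpus instead of invalidating
 * anything here.  If our bit is gone, stale translations may still be
 * cached and the only safe recovery is the full tlbflush() below.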
3471 */ 3472 kcpuset_atomic_set(pmap->pm_cpus, cid); 3473 3474 tlbflush(); 3475 } 3476 } 3477 3478 /* 3479 * pmap_load: perform the actual pmap switch, i.e. fill in %cr3 register 3480 * and relevant LDT info. 3481 * 3482 * Ensures that the current process' pmap is loaded on the current CPU's 3483 * MMU and that there are no stale TLB entries. 3484 * 3485 * => The caller should disable kernel preemption or do check-and-retry 3486 * to prevent a preemption from undoing our efforts. 3487 * => This function may block. 3488 */ 3489 void 3490 pmap_load(void) 3491 { 3492 struct cpu_info *ci; 3493 struct pmap *pmap, *oldpmap; 3494 struct lwp *l; 3495 uint64_t ncsw; 3496 3497 kpreempt_disable(); 3498 retry: 3499 ci = curcpu(); 3500 if (!ci->ci_want_pmapload) { 3501 kpreempt_enable(); 3502 return; 3503 } 3504 l = ci->ci_curlwp; 3505 ncsw = l->l_ncsw; 3506 __insn_barrier(); 3507 3508 /* should be able to take ipis. */ 3509 KASSERT(ci->ci_ilevel < IPL_HIGH); 3510 #ifdef XENPV 3511 /* Check to see if interrupts are enabled (ie; no events are masked) */ 3512 KASSERT(x86_read_psl() == 0); 3513 #else 3514 KASSERT((x86_read_psl() & PSL_I) != 0); 3515 #endif 3516 3517 KASSERT(l != NULL); 3518 pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map); 3519 KASSERT(pmap != pmap_kernel()); 3520 oldpmap = ci->ci_pmap; 3521 3522 if (pmap == oldpmap) { 3523 pmap_reactivate(pmap); 3524 ci->ci_want_pmapload = 0; 3525 kpreempt_enable(); 3526 return; 3527 } 3528 3529 /* 3530 * Acquire a reference to the new pmap and perform the switch. 3531 */ 3532 3533 pmap_reference(pmap); 3534 pmap_load1(l, pmap, oldpmap); 3535 ci->ci_want_pmapload = 0; 3536 3537 /* 3538 * we're now running with the new pmap. drop the reference 3539 * to the old pmap. if we block, we need to go around again. 3540 */ 3541 3542 pmap_destroy(oldpmap); 3543 __insn_barrier(); 3544 if (l->l_ncsw != ncsw) { 3545 goto retry; 3546 } 3547 3548 kpreempt_enable(); 3549 } 3550 3551 /* 3552 * pmap_load1: the guts of pmap load, shared by pmap_map_ptes() and 3553 * pmap_load(). It's critically important that this function does not 3554 * block. 3555 */ 3556 static void 3557 pmap_load1(struct lwp *l, struct pmap *pmap, struct pmap *oldpmap) 3558 { 3559 struct cpu_info *ci; 3560 struct pcb *pcb; 3561 cpuid_t cid; 3562 3563 KASSERT(kpreempt_disabled()); 3564 3565 pcb = lwp_getpcb(l); 3566 ci = l->l_cpu; 3567 cid = cpu_index(ci); 3568 3569 kcpuset_atomic_clear(oldpmap->pm_cpus, cid); 3570 kcpuset_atomic_clear(oldpmap->pm_kernel_cpus, cid); 3571 3572 KASSERT_PDIRPA(oldpmap); 3573 KASSERT(!kcpuset_isset(pmap->pm_cpus, cid)); 3574 KASSERT(!kcpuset_isset(pmap->pm_kernel_cpus, cid)); 3575 3576 /* 3577 * Mark the pmap in use by this CPU. Again, we must synchronize 3578 * with TLB shootdown interrupts, so set the state VALID first, 3579 * then register us for shootdown events on this pmap. 3580 */ 3581 ci->ci_tlbstate = TLBSTATE_VALID; 3582 kcpuset_atomic_set(pmap->pm_cpus, cid); 3583 kcpuset_atomic_set(pmap->pm_kernel_cpus, cid); 3584 ci->ci_pmap = pmap; 3585 3586 /* 3587 * update tss. now that we have registered for invalidations 3588 * from other CPUs, we're good to load the page tables. 
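 *
 * Note that with PAE the value loaded into %cr3 is the per-CPU L3
 * page (ci_pae_l3_pdirpa) rather than the pmap's own page directory;
 * the pmap's top-level entries are expected to be installed into that
 * page by cpu_load_pmap() below.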
3589 */ 3590 #ifdef PAE 3591 pcb->pcb_cr3 = ci->ci_pae_l3_pdirpa; 3592 #else 3593 pcb->pcb_cr3 = pmap_pdirpa(pmap, 0); 3594 #endif 3595 3596 #ifdef i386 3597 #ifndef XENPV 3598 ci->ci_tss->tss.tss_ldt = pmap->pm_ldt_sel; 3599 ci->ci_tss->tss.tss_cr3 = pcb->pcb_cr3; 3600 #endif 3601 #endif 3602 3603 #if defined(SVS) && defined(USER_LDT) 3604 if (svs_enabled) { 3605 svs_ldt_sync(pmap); 3606 } else 3607 #endif 3608 lldt(pmap->pm_ldt_sel); 3609 3610 cpu_load_pmap(pmap, oldpmap); 3611 } 3612 3613 /* 3614 * pmap_deactivate: deactivate a process' pmap. 3615 * 3616 * => Must be called with kernel preemption disabled (high IPL is enough). 3617 */ 3618 void 3619 pmap_deactivate(struct lwp *l) 3620 { 3621 struct pmap *pmap; 3622 struct cpu_info *ci; 3623 3624 KASSERT(kpreempt_disabled()); 3625 3626 if (l != curlwp) { 3627 return; 3628 } 3629 3630 /* 3631 * Wait for pending TLB shootdowns to complete. Necessary because 3632 * TLB shootdown state is per-CPU, and the LWP may be coming off 3633 * the CPU before it has a chance to call pmap_update(), e.g. due 3634 * to kernel preemption or blocking routine in between. 3635 */ 3636 pmap_tlb_shootnow(); 3637 3638 ci = curcpu(); 3639 3640 if (ci->ci_want_pmapload) { 3641 /* 3642 * ci_want_pmapload means that our pmap is not loaded on 3643 * the CPU or TLB might be stale. note that pmap_kernel() 3644 * is always considered loaded. 3645 */ 3646 KASSERT(vm_map_pmap(&l->l_proc->p_vmspace->vm_map) 3647 != pmap_kernel()); 3648 KASSERT(vm_map_pmap(&l->l_proc->p_vmspace->vm_map) 3649 != ci->ci_pmap || ci->ci_tlbstate != TLBSTATE_VALID); 3650 3651 /* 3652 * userspace has not been touched. 3653 * nothing to do here. 3654 */ 3655 3656 ci->ci_want_pmapload = 0; 3657 return; 3658 } 3659 3660 pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map); 3661 3662 if (pmap == pmap_kernel()) { 3663 return; 3664 } 3665 3666 KASSERT_PDIRPA(pmap); 3667 KASSERT(ci->ci_pmap == pmap); 3668 3669 /* 3670 * we aren't interested in TLB invalidations for this pmap, 3671 * at least for the time being. 3672 */ 3673 3674 KASSERT(ci->ci_tlbstate == TLBSTATE_VALID); 3675 ci->ci_tlbstate = TLBSTATE_LAZY; 3676 } 3677 3678 /* 3679 * some misc. 
functions 3680 */ 3681 3682 bool 3683 pmap_pdes_valid(vaddr_t va, pd_entry_t * const *pdes, pd_entry_t *lastpde, 3684 int *lastlvl) 3685 { 3686 unsigned long index; 3687 pd_entry_t pde; 3688 int i; 3689 3690 for (i = PTP_LEVELS; i > 1; i--) { 3691 index = pl_i(va, i); 3692 pde = pdes[i - 2][index]; 3693 if ((pde & PTE_P) == 0) { 3694 *lastlvl = i; 3695 return false; 3696 } 3697 if (pde & PTE_PS) 3698 break; 3699 } 3700 if (lastpde != NULL) 3701 *lastpde = pde; 3702 *lastlvl = i; 3703 return true; 3704 } 3705 3706 /* 3707 * pmap_extract: extract a PA for the given VA 3708 */ 3709 bool 3710 pmap_extract(struct pmap *pmap, vaddr_t va, paddr_t *pap) 3711 { 3712 pt_entry_t *ptes, pte; 3713 pd_entry_t pde; 3714 pd_entry_t * const *pdes; 3715 struct pmap *pmap2; 3716 paddr_t pa; 3717 bool rv; 3718 int lvl; 3719 3720 if (__predict_false(pmap->pm_extract != NULL)) { 3721 return (*pmap->pm_extract)(pmap, va, pap); 3722 } 3723 3724 #ifdef __HAVE_DIRECT_MAP 3725 if (va >= PMAP_DIRECT_BASE && va < PMAP_DIRECT_END) { 3726 if (pap != NULL) { 3727 *pap = PMAP_DIRECT_UNMAP(va); 3728 } 3729 return true; 3730 } 3731 #endif 3732 3733 rv = false; 3734 pa = 0; 3735 3736 if (pmap != pmap_kernel()) { 3737 mutex_enter(&pmap->pm_lock); 3738 } 3739 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); 3740 if (pmap_pdes_valid(va, pdes, &pde, &lvl)) { 3741 if (lvl == 2) { 3742 pa = (pde & PTE_LGFRAME) | (va & (NBPD_L2 - 1)); 3743 rv = true; 3744 } else { 3745 KASSERT(lvl == 1); 3746 pte = ptes[pl1_i(va)]; 3747 if (__predict_true((pte & PTE_P) != 0)) { 3748 pa = pmap_pte2pa(pte) | (va & (NBPD_L1 - 1)); 3749 rv = true; 3750 } 3751 } 3752 } 3753 pmap_unmap_ptes(pmap, pmap2); 3754 if (pmap != pmap_kernel()) { 3755 mutex_exit(&pmap->pm_lock); 3756 } 3757 if (pap != NULL) { 3758 *pap = pa; 3759 } 3760 3761 return rv; 3762 } 3763 3764 /* 3765 * vtophys: virtual address to physical address. For use by 3766 * machine-dependent code only. 3767 */ 3768 paddr_t 3769 vtophys(vaddr_t va) 3770 { 3771 paddr_t pa; 3772 3773 if (pmap_extract(pmap_kernel(), va, &pa) == true) 3774 return pa; 3775 return 0; 3776 } 3777 3778 __strict_weak_alias(pmap_extract_ma, pmap_extract); 3779 3780 #ifdef XENPV 3781 /* 3782 * vtomach: virtual address to machine address. For use by 3783 * machine-dependent code only. 3784 */ 3785 paddr_t 3786 vtomach(vaddr_t va) 3787 { 3788 paddr_t pa; 3789 3790 if (pmap_extract_ma(pmap_kernel(), va, &pa) == true) 3791 return pa; 3792 return 0; 3793 } 3794 #endif 3795 3796 /* 3797 * pmap_virtual_space: used during bootup [pmap_steal_memory] to 3798 * determine the bounds of the kernel virtual addess space. 
3799 */ 3800 void 3801 pmap_virtual_space(vaddr_t *startp, vaddr_t *endp) 3802 { 3803 *startp = virtual_avail; 3804 *endp = virtual_end; 3805 } 3806 3807 void 3808 pmap_zero_page(paddr_t pa) 3809 { 3810 #if defined(__HAVE_DIRECT_MAP) 3811 memset(PAGE_ALIGNED(PMAP_DIRECT_MAP(pa)), 0, PAGE_SIZE); 3812 #else 3813 #if defined(XENPV) 3814 if (XEN_VERSION_SUPPORTED(3, 4)) 3815 xen_pagezero(pa); 3816 #endif 3817 struct cpu_info *ci; 3818 pt_entry_t *zpte; 3819 vaddr_t zerova; 3820 3821 const pd_entry_t pteflags = PTE_P | PTE_W | pmap_pg_nx | PTE_D | PTE_A; 3822 3823 kpreempt_disable(); 3824 3825 ci = curcpu(); 3826 zerova = ci->vpage[VPAGE_ZER]; 3827 zpte = ci->vpage_pte[VPAGE_ZER]; 3828 3829 KASSERTMSG(!*zpte, "pmap_zero_page: lock botch"); 3830 3831 pmap_pte_set(zpte, pmap_pa2pte(pa) | pteflags); 3832 pmap_pte_flush(); 3833 pmap_update_pg(zerova); /* flush TLB */ 3834 3835 memset(PAGE_ALIGNED(zerova), 0, PAGE_SIZE); 3836 3837 #if defined(DIAGNOSTIC) || defined(XENPV) 3838 pmap_pte_set(zpte, 0); /* zap ! */ 3839 pmap_pte_flush(); 3840 #endif 3841 3842 kpreempt_enable(); 3843 #endif /* defined(__HAVE_DIRECT_MAP) */ 3844 } 3845 3846 void 3847 pmap_copy_page(paddr_t srcpa, paddr_t dstpa) 3848 { 3849 #if defined(__HAVE_DIRECT_MAP) 3850 vaddr_t srcva = PMAP_DIRECT_MAP(srcpa); 3851 vaddr_t dstva = PMAP_DIRECT_MAP(dstpa); 3852 3853 memcpy(PAGE_ALIGNED(dstva), PAGE_ALIGNED(srcva), PAGE_SIZE); 3854 #else 3855 #if defined(XENPV) 3856 if (XEN_VERSION_SUPPORTED(3, 4)) { 3857 xen_copy_page(srcpa, dstpa); 3858 return; 3859 } 3860 #endif 3861 struct cpu_info *ci; 3862 pt_entry_t *srcpte, *dstpte; 3863 vaddr_t srcva, dstva; 3864 3865 const pd_entry_t pteflags = PTE_P | PTE_W | pmap_pg_nx | PTE_A; 3866 3867 kpreempt_disable(); 3868 3869 ci = curcpu(); 3870 srcva = ci->vpage[VPAGE_SRC]; 3871 dstva = ci->vpage[VPAGE_DST]; 3872 srcpte = ci->vpage_pte[VPAGE_SRC]; 3873 dstpte = ci->vpage_pte[VPAGE_DST]; 3874 3875 KASSERT(*srcpte == 0 && *dstpte == 0); 3876 3877 pmap_pte_set(srcpte, pmap_pa2pte(srcpa) | pteflags); 3878 pmap_pte_set(dstpte, pmap_pa2pte(dstpa) | pteflags | PTE_D); 3879 pmap_pte_flush(); 3880 pmap_update_pg(srcva); 3881 pmap_update_pg(dstva); 3882 3883 memcpy(PAGE_ALIGNED(dstva), PAGE_ALIGNED(srcva), PAGE_SIZE); 3884 3885 #if defined(DIAGNOSTIC) || defined(XENPV) 3886 pmap_pte_set(srcpte, 0); 3887 pmap_pte_set(dstpte, 0); 3888 pmap_pte_flush(); 3889 #endif 3890 3891 kpreempt_enable(); 3892 #endif /* defined(__HAVE_DIRECT_MAP) */ 3893 } 3894 3895 static pt_entry_t * 3896 pmap_map_ptp(struct vm_page *ptp) 3897 { 3898 #ifdef __HAVE_DIRECT_MAP 3899 return (void *)PMAP_DIRECT_MAP(VM_PAGE_TO_PHYS(ptp)); 3900 #else 3901 struct cpu_info *ci; 3902 pt_entry_t *ptppte; 3903 vaddr_t ptpva; 3904 3905 KASSERT(kpreempt_disabled()); 3906 3907 #ifndef XENPV 3908 const pd_entry_t pteflags = PTE_P | PTE_W | pmap_pg_nx | PTE_A | PTE_D; 3909 #else 3910 const pd_entry_t pteflags = PTE_P | pmap_pg_nx | PTE_A | PTE_D; 3911 #endif 3912 3913 ci = curcpu(); 3914 ptpva = ci->vpage[VPAGE_PTP]; 3915 ptppte = ci->vpage_pte[VPAGE_PTP]; 3916 3917 pmap_pte_set(ptppte, pmap_pa2pte(VM_PAGE_TO_PHYS(ptp)) | pteflags); 3918 3919 pmap_pte_flush(); 3920 pmap_update_pg(ptpva); 3921 3922 return (pt_entry_t *)ptpva; 3923 #endif 3924 } 3925 3926 static void 3927 pmap_unmap_ptp(void) 3928 { 3929 #ifndef __HAVE_DIRECT_MAP 3930 #if defined(DIAGNOSTIC) || defined(XENPV) 3931 struct cpu_info *ci; 3932 pt_entry_t *pte; 3933 3934 KASSERT(kpreempt_disabled()); 3935 3936 ci = curcpu(); 3937 pte = ci->vpage_pte[VPAGE_PTP]; 3938 3939 if (*pte != 0) { 3940 
pmap_pte_set(pte, 0); 3941 pmap_pte_flush(); 3942 } 3943 #endif 3944 #endif 3945 } 3946 3947 static pt_entry_t * 3948 pmap_map_pte(struct pmap *pmap, struct vm_page *ptp, vaddr_t va) 3949 { 3950 3951 KASSERT(kpreempt_disabled()); 3952 if (pmap_is_curpmap(pmap)) { 3953 return &PTE_BASE[pl1_i(va)]; /* (k)vtopte */ 3954 } 3955 KASSERT(ptp != NULL); 3956 return pmap_map_ptp(ptp) + pl1_pi(va); 3957 } 3958 3959 static void 3960 pmap_unmap_pte(void) 3961 { 3962 3963 KASSERT(kpreempt_disabled()); 3964 3965 pmap_unmap_ptp(); 3966 } 3967 3968 /* 3969 * p m a p r e m o v e f u n c t i o n s 3970 * 3971 * functions that remove mappings 3972 */ 3973 3974 /* 3975 * pmap_remove_ptes: remove PTEs from a PTP 3976 * 3977 * => caller must hold pmap's lock 3978 * => PTP must be mapped into KVA 3979 * => PTP should be null if pmap == pmap_kernel() 3980 * => must be called with kernel preemption disabled 3981 * => returns composite pte if at least one page should be shot down 3982 */ 3983 static void 3984 pmap_remove_ptes(struct pmap *pmap, struct vm_page *ptp, vaddr_t ptpva, 3985 vaddr_t startva, vaddr_t endva) 3986 { 3987 pt_entry_t *pte = (pt_entry_t *)ptpva; 3988 3989 KASSERT(mutex_owned(&pmap->pm_lock)); 3990 KASSERT(kpreempt_disabled()); 3991 3992 /* 3993 * mappings are very often sparse, so clip the given range to the 3994 * range of PTEs that are known present in the PTP. 3995 */ 3996 pmap_ptp_range_clip(ptp, &startva, &pte); 3997 3998 /* 3999 * note that ptpva points to the PTE that maps startva. this may 4000 * or may not be the first PTE in the PTP. 4001 * 4002 * we loop through the PTP while there are still PTEs to look at 4003 * and the wire_count is greater than 1 (because we use the wire_count 4004 * to keep track of the number of real PTEs in the PTP). 4005 */ 4006 while (startva < endva && (ptp == NULL || ptp->wire_count > 1)) { 4007 (void)pmap_remove_pte(pmap, ptp, pte, startva); 4008 startva += PAGE_SIZE; 4009 pte++; 4010 } 4011 } 4012 4013 /* 4014 * pmap_remove_pte: remove a single PTE from a PTP. 4015 * 4016 * => caller must hold pmap's lock 4017 * => PTP must be mapped into KVA 4018 * => PTP should be null if pmap == pmap_kernel() 4019 * => returns true if we removed a mapping 4020 * => must be called with kernel preemption disabled 4021 */ 4022 static bool 4023 pmap_remove_pte(struct pmap *pmap, struct vm_page *ptp, pt_entry_t *pte, 4024 vaddr_t va) 4025 { 4026 struct pv_entry *pve; 4027 struct vm_page *pg; 4028 struct pmap_page *pp; 4029 pt_entry_t opte; 4030 4031 KASSERT(mutex_owned(&pmap->pm_lock)); 4032 KASSERT(kpreempt_disabled()); 4033 4034 if (!pmap_valid_entry(*pte)) { 4035 /* VA not mapped. */ 4036 return false; 4037 } 4038 4039 /* Atomically save the old PTE and zap it. */ 4040 opte = pmap_pte_testset(pte, 0); 4041 if (!pmap_valid_entry(opte)) { 4042 return false; 4043 } 4044 4045 pmap_exec_account(pmap, va, opte, 0); 4046 pmap_stats_update_bypte(pmap, 0, opte); 4047 4048 if (ptp) { 4049 /* 4050 * Dropping a PTE. Make sure that the PDE is flushed. 4051 */ 4052 ptp->wire_count--; 4053 if (ptp->wire_count <= 1) { 4054 opte |= PTE_A; 4055 } 4056 } 4057 4058 if ((opte & PTE_A) != 0) { 4059 pmap_tlb_shootdown(pmap, va, opte, TLBSHOOT_REMOVE_PTE); 4060 } 4061 4062 /* 4063 * If we are not on a pv list - we are done. 
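 *
 * PTE_PVLIST is a software-only PTE bit set when the mapping was
 * entered on a pv list; without it there is no tracked pmap_page to
 * update, so the referenced/modified state in the old PTE can simply
 * be discarded.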
4064 */ 4065 if ((opte & PTE_PVLIST) == 0) { 4066 #ifndef DOM0OPS 4067 KASSERTMSG((PHYS_TO_VM_PAGE(pmap_pte2pa(opte)) == NULL), 4068 "managed page without PTE_PVLIST for %#"PRIxVADDR, va); 4069 KASSERTMSG((pmap_pv_tracked(pmap_pte2pa(opte)) == NULL), 4070 "pv-tracked page without PTE_PVLIST for %#"PRIxVADDR, va); 4071 #endif 4072 KASSERT(pmap_treelookup_pv(pmap, ptp, (ptp != NULL ? 4073 &VM_PAGE_TO_PP(ptp)->pp_rb : &pmap_kernel_rb), va) == NULL); 4074 return true; 4075 } 4076 4077 if ((pg = PHYS_TO_VM_PAGE(pmap_pte2pa(opte))) != NULL) { 4078 pp = VM_PAGE_TO_PP(pg); 4079 } else if ((pp = pmap_pv_tracked(pmap_pte2pa(opte))) == NULL) { 4080 paddr_t pa = pmap_pte2pa(opte); 4081 panic("%s: PTE_PVLIST with pv-untracked page" 4082 " va = %#"PRIxVADDR"pa = %#"PRIxPADDR" (%#"PRIxPADDR")", 4083 __func__, va, pa, atop(pa)); 4084 } 4085 4086 /* Sync R/M bits. */ 4087 pve = pmap_lookup_pv(pmap, ptp, pp, va); 4088 pmap_remove_pv(pmap, pp, ptp, va, pve, pmap_pte_to_pp_attrs(opte)); 4089 return true; 4090 } 4091 4092 static void 4093 pmap_remove_locked(struct pmap *pmap, vaddr_t sva, vaddr_t eva) 4094 { 4095 pt_entry_t *ptes; 4096 pd_entry_t pde; 4097 pd_entry_t * const *pdes; 4098 bool result; 4099 vaddr_t blkendva, va = sva; 4100 struct vm_page *ptp; 4101 struct pmap *pmap2; 4102 int lvl; 4103 4104 KASSERT(mutex_owned(&pmap->pm_lock)); 4105 4106 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); 4107 4108 /* 4109 * removing one page? take shortcut function. 4110 */ 4111 4112 if (va + PAGE_SIZE == eva) { 4113 if (pmap_pdes_valid(va, pdes, &pde, &lvl)) { 4114 KASSERT(lvl == 1); 4115 4116 /* Get PTP if non-kernel mapping. */ 4117 if (pmap != pmap_kernel()) { 4118 ptp = pmap_find_ptp(pmap, va, 1); 4119 KASSERTMSG(ptp != NULL, 4120 "%s: unmanaged PTP detected", __func__); 4121 } else { 4122 /* Never free kernel PTPs. */ 4123 ptp = NULL; 4124 } 4125 4126 result = pmap_remove_pte(pmap, ptp, 4127 &ptes[pl1_i(va)], va); 4128 4129 /* 4130 * if mapping removed and the PTP is no longer 4131 * being used, free it! 4132 */ 4133 4134 if (result && ptp && ptp->wire_count <= 1) 4135 pmap_free_ptp(pmap, ptp, va, ptes, pdes); 4136 } 4137 } else for (/* null */ ; va < eva ; va = blkendva) { 4138 /* determine range of block */ 4139 blkendva = x86_round_pdr(va+1); 4140 if (blkendva > eva) 4141 blkendva = eva; 4142 4143 if (!pmap_pdes_valid(va, pdes, &pde, &lvl)) { 4144 /* Skip a range corresponding to an invalid pde. */ 4145 blkendva = (va & ptp_frames[lvl - 1]) + nbpd[lvl - 1]; 4146 continue; 4147 } 4148 KASSERT(lvl == 1); 4149 4150 /* Get PTP if non-kernel mapping. */ 4151 if (pmap != pmap_kernel()) { 4152 ptp = pmap_find_ptp(pmap, va, 1); 4153 KASSERTMSG(ptp != NULL, "%s: unmanaged PTP detected", 4154 __func__); 4155 } else { 4156 /* Never free kernel PTPs. */ 4157 ptp = NULL; 4158 } 4159 4160 pmap_remove_ptes(pmap, ptp, (vaddr_t)&ptes[pl1_i(va)], va, 4161 blkendva); 4162 4163 /* If PTP is no longer being used, free it. */ 4164 if (ptp && ptp->wire_count <= 1) { 4165 pmap_free_ptp(pmap, ptp, va, ptes, pdes); 4166 } 4167 } 4168 pmap_unmap_ptes(pmap, pmap2); 4169 pmap_drain_pv(pmap); 4170 } 4171 4172 /* 4173 * pmap_remove: mapping removal function. 
4174 * 4175 * => caller should not be holding any pmap locks 4176 */ 4177 void 4178 pmap_remove(struct pmap *pmap, vaddr_t sva, vaddr_t eva) 4179 { 4180 if (__predict_false(pmap->pm_remove != NULL)) { 4181 (*pmap->pm_remove)(pmap, sva, eva); 4182 return; 4183 } 4184 4185 mutex_enter(&pmap->pm_lock); 4186 pmap_remove_locked(pmap, sva, eva); 4187 mutex_exit(&pmap->pm_lock); 4188 } 4189 4190 /* 4191 * pmap_sync_pv: clear pte bits and return the old value of the pp_attrs. 4192 * 4193 * => The 'clearbits' parameter is either ~0 or PP_ATTRS_... 4194 * => Caller should disable kernel preemption. 4195 * => issues tlb shootdowns if necessary. 4196 */ 4197 static int 4198 pmap_sync_pv(struct pv_pte *pvpte, paddr_t pa, int clearbits, uint8_t *oattrs, 4199 pt_entry_t *optep) 4200 { 4201 struct pmap *pmap; 4202 struct vm_page *ptp; 4203 vaddr_t va; 4204 pt_entry_t *ptep; 4205 pt_entry_t opte; 4206 pt_entry_t npte; 4207 pt_entry_t expect; 4208 bool need_shootdown; 4209 4210 ptp = pvpte->pte_ptp; 4211 va = pvpte->pte_va; 4212 KASSERT(ptp == NULL || ptp->uobject != NULL); 4213 KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset); 4214 pmap = ptp_to_pmap(ptp); 4215 KASSERT(kpreempt_disabled()); 4216 4217 if (__predict_false(pmap->pm_sync_pv != NULL)) { 4218 return (*pmap->pm_sync_pv)(ptp, va, pa, clearbits, oattrs, 4219 optep); 4220 } 4221 4222 expect = pmap_pa2pte(pa) | PTE_P; 4223 4224 if (clearbits != ~0) { 4225 KASSERT((clearbits & ~(PP_ATTRS_D|PP_ATTRS_A|PP_ATTRS_W)) == 0); 4226 clearbits = pmap_pp_attrs_to_pte(clearbits); 4227 } 4228 4229 ptep = pmap_map_pte(pmap, ptp, va); 4230 do { 4231 opte = *ptep; 4232 KASSERT((opte & (PTE_D | PTE_A)) != PTE_D); 4233 KASSERT((opte & (PTE_A | PTE_P)) != PTE_A); 4234 KASSERT(opte == 0 || (opte & PTE_P) != 0); 4235 if ((opte & (PTE_FRAME | PTE_P)) != expect) { 4236 /* 4237 * We lost a race with a V->P operation like 4238 * pmap_remove(). Wait for the competitor 4239 * reflecting pte bits into mp_attrs. 4240 */ 4241 pmap_unmap_pte(); 4242 return EAGAIN; 4243 } 4244 4245 /* 4246 * Check if there's anything to do on this PTE. 4247 */ 4248 if ((opte & clearbits) == 0) { 4249 need_shootdown = false; 4250 break; 4251 } 4252 4253 /* 4254 * We need a shootdown if the PTE is cached (PTE_A) ... 4255 * ... Unless we are clearing only the PTE_W bit and 4256 * it isn't cached as RW (PTE_D). 4257 */ 4258 need_shootdown = (opte & PTE_A) != 0 && 4259 !(clearbits == PTE_W && (opte & PTE_D) == 0); 4260 4261 npte = opte & ~clearbits; 4262 4263 /* 4264 * If we need a shootdown anyway, clear PTE_A and PTE_D. 
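 * (the TLB entry is being invalidated anyway, and with PTE_A/PTE_D cleared
 * a later pmap_sync_pv() can tell whether the page has been referenced or
 * modified again)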
4265 */ 4266 if (need_shootdown) { 4267 npte &= ~(PTE_A | PTE_D); 4268 } 4269 KASSERT((npte & (PTE_D | PTE_A)) != PTE_D); 4270 KASSERT((npte & (PTE_A | PTE_P)) != PTE_A); 4271 KASSERT(npte == 0 || (opte & PTE_P) != 0); 4272 } while (pmap_pte_cas(ptep, opte, npte) != opte); 4273 4274 if (need_shootdown) { 4275 pmap_tlb_shootdown(pmap, va, opte, TLBSHOOT_SYNC_PV); 4276 } 4277 pmap_unmap_pte(); 4278 4279 *oattrs = pmap_pte_to_pp_attrs(opte); 4280 if (optep != NULL) 4281 *optep = opte; 4282 return 0; 4283 } 4284 4285 static void 4286 pmap_pp_remove_ent(struct pmap *pmap, struct vm_page *ptp, pt_entry_t opte, 4287 vaddr_t va) 4288 { 4289 struct pmap *pmap2; 4290 pt_entry_t *ptes; 4291 pd_entry_t * const *pdes; 4292 4293 KASSERT(mutex_owned(&pmap->pm_lock)); 4294 4295 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); 4296 pmap_stats_update_bypte(pmap, 0, opte); 4297 ptp->wire_count--; 4298 if (ptp->wire_count <= 1) { 4299 pmap_free_ptp(pmap, ptp, va, ptes, pdes); 4300 } 4301 pmap_unmap_ptes(pmap, pmap2); 4302 } 4303 4304 static void 4305 pmap_pp_remove(struct pmap_page *pp, paddr_t pa) 4306 { 4307 struct pv_pte *pvpte; 4308 struct vm_page *ptp; 4309 uintptr_t sum; 4310 uint8_t oattrs; 4311 bool locked; 4312 4313 /* 4314 * Do an unlocked check to see if the page has no mappings, eg when 4315 * pmap_remove_all() was called before amap_wipeout() for a process 4316 * private amap - common. The page being removed must be on the way 4317 * out, so we don't have to worry about concurrent attempts to enter 4318 * it (otherwise the caller either doesn't care or has screwed up). 4319 */ 4320 sum = (uintptr_t)atomic_load_relaxed(&pp->pp_pte.pte_va); 4321 sum |= (uintptr_t)atomic_load_relaxed(&pp->pp_pte.pte_ptp); 4322 sum |= (uintptr_t)atomic_load_relaxed(&pp->pp_pvlist.lh_first); 4323 if (sum == 0) { 4324 return; 4325 } 4326 4327 kpreempt_disable(); 4328 for (;;) { 4329 struct pmap *pmap; 4330 struct pv_entry *pve; 4331 pt_entry_t opte; 4332 vaddr_t va; 4333 4334 mutex_spin_enter(&pp->pp_lock); 4335 if ((pvpte = pv_pte_first(pp)) == NULL) { 4336 mutex_spin_exit(&pp->pp_lock); 4337 break; 4338 } 4339 4340 /* 4341 * Add a reference to the pmap before clearing the pte. 4342 * Otherwise the pmap can disappear behind us. 4343 */ 4344 ptp = pvpte->pte_ptp; 4345 pmap = ptp_to_pmap(ptp); 4346 KASSERT(pmap->pm_obj[0].uo_refs > 0); 4347 if (ptp != NULL) { 4348 pmap_reference(pmap); 4349 } 4350 4351 /* 4352 * Now try to lock it. We need a direct handoff between 4353 * pp_lock and pm_lock to know the pv_entry is kept intact 4354 * and kept associated with this pmap. If that can't be 4355 * had, wait for the pmap's lock to become free and then 4356 * retry. 4357 */ 4358 locked = mutex_tryenter(&pmap->pm_lock); 4359 mutex_spin_exit(&pp->pp_lock); 4360 if (!locked) { 4361 mutex_enter(&pmap->pm_lock); 4362 /* nothing, just wait for it */ 4363 mutex_exit(&pmap->pm_lock); 4364 if (ptp != NULL) { 4365 pmap_destroy(pmap); 4366 } 4367 continue; 4368 } 4369 va = pvpte->pte_va; 4370 4371 KASSERTMSG(pmap->pm_stats.resident_count > PDP_SIZE, 4372 "va %lx pmap %p ptp %p is empty", va, pmap, ptp); 4373 KASSERTMSG(ptp == NULL || (ptp->flags & PG_FREE) == 0, 4374 "va %lx pmap %p ptp %p is free", va, pmap, ptp); 4375 KASSERTMSG(ptp == NULL || ptp->wire_count > 1, 4376 "va %lx pmap %p ptp %p is empty", va, pmap, ptp); 4377 4378 #ifdef DEBUG 4379 pmap_check_pv(pmap, ptp, pp, pvpte->pte_va, true); 4380 rb_tree_t *tree = (ptp != NULL ? 
4381 &VM_PAGE_TO_PP(ptp)->pp_rb : &pmap_kernel_rb); 4382 pve = pmap_treelookup_pv(pmap, ptp, tree, va); 4383 if (pve == NULL) { 4384 KASSERTMSG(&pp->pp_pte == pvpte, 4385 "va %lx pmap %p ptp %p pvpte %p pve %p oops 1", 4386 va, pmap, ptp, pvpte, pve); 4387 } else { 4388 KASSERTMSG(&pve->pve_pte == pvpte, 4389 "va %lx pmap %p ptp %p pvpte %p pve %p oops 2", 4390 va, pmap, ptp, pvpte, pve); 4391 } 4392 #endif 4393 4394 if (pmap_sync_pv(pvpte, pa, ~0, &oattrs, &opte)) { 4395 panic("pmap_pp_remove: mapping not present"); 4396 } 4397 4398 pve = pmap_lookup_pv(pmap, ptp, pp, va); 4399 pmap_remove_pv(pmap, pp, ptp, va, pve, oattrs); 4400 4401 /* Update the PTP reference count. Free if last reference. */ 4402 if (ptp != NULL) { 4403 KASSERT(pmap != pmap_kernel()); 4404 pmap_tlb_shootnow(); 4405 if (__predict_false(pmap->pm_pp_remove_ent != NULL)) { 4406 (*pmap->pm_pp_remove_ent)(pmap, ptp, opte, va); 4407 } else { 4408 pmap_pp_remove_ent(pmap, ptp, opte, va); 4409 } 4410 } else { 4411 KASSERT(pmap == pmap_kernel()); 4412 pmap_stats_update_bypte(pmap, 0, opte); 4413 } 4414 pmap_tlb_shootnow(); 4415 pmap_drain_pv(pmap); 4416 mutex_exit(&pmap->pm_lock); 4417 if (ptp != NULL) { 4418 pmap_destroy(pmap); 4419 } 4420 } 4421 kpreempt_enable(); 4422 } 4423 4424 /* 4425 * pmap_page_remove: remove a managed vm_page from all pmaps that map it 4426 * 4427 * => R/M bits are sync'd back to attrs 4428 */ 4429 void 4430 pmap_page_remove(struct vm_page *pg) 4431 { 4432 struct pmap_page *pp; 4433 paddr_t pa; 4434 4435 pp = VM_PAGE_TO_PP(pg); 4436 pa = VM_PAGE_TO_PHYS(pg); 4437 pmap_pp_remove(pp, pa); 4438 } 4439 4440 /* 4441 * pmap_pv_remove: remove an unmanaged pv-tracked page from all pmaps 4442 * that map it 4443 */ 4444 void 4445 pmap_pv_remove(paddr_t pa) 4446 { 4447 struct pmap_page *pp; 4448 4449 pp = pmap_pv_tracked(pa); 4450 if (pp == NULL) 4451 panic("%s: page not pv-tracked: %#"PRIxPADDR, __func__, pa); 4452 pmap_pp_remove(pp, pa); 4453 } 4454 4455 /* 4456 * p m a p a t t r i b u t e f u n c t i o n s 4457 * functions that test/change managed page's attributes 4458 * since a page can be mapped multiple times we must check each PTE that 4459 * maps it by going down the pv lists. 4460 */ 4461 4462 /* 4463 * pmap_test_attrs: test a page's attributes 4464 */ 4465 bool 4466 pmap_test_attrs(struct vm_page *pg, unsigned testbits) 4467 { 4468 struct pmap_page *pp; 4469 struct pv_pte *pvpte; 4470 struct pmap *pmap; 4471 uint8_t oattrs; 4472 u_int result; 4473 paddr_t pa; 4474 4475 pp = VM_PAGE_TO_PP(pg); 4476 if ((pp->pp_attrs & testbits) != 0) { 4477 return true; 4478 } 4479 pa = VM_PAGE_TO_PHYS(pg); 4480 startover: 4481 mutex_spin_enter(&pp->pp_lock); 4482 for (pvpte = pv_pte_first(pp); pvpte; pvpte = pv_pte_next(pp, pvpte)) { 4483 if ((pp->pp_attrs & testbits) != 0) { 4484 break; 4485 } 4486 if (pmap_sync_pv(pvpte, pa, 0, &oattrs, NULL)) { 4487 /* 4488 * raced with a V->P operation. wait for the other 4489 * side to finish by acquiring pmap's lock. if no 4490 * wait, updates to pp_attrs by the other side may 4491 * go unseen. 4492 */ 4493 pmap = ptp_to_pmap(pvpte->pte_ptp); 4494 pmap_reference(pmap); 4495 mutex_spin_exit(&pp->pp_lock); 4496 mutex_enter(&pmap->pm_lock); 4497 /* nothing. */ 4498 mutex_exit(&pmap->pm_lock); 4499 pmap_destroy(pmap); 4500 goto startover; 4501 } 4502 pp->pp_attrs |= oattrs; 4503 } 4504 result = pp->pp_attrs & testbits; 4505 mutex_spin_exit(&pp->pp_lock); 4506 4507 /* 4508 * note that we will exit the for loop with a non-null pve if 4509 * we have found the bits we are testing for.
4510 */ 4511 4512 return result != 0; 4513 } 4514 4515 static bool 4516 pmap_pp_clear_attrs(struct pmap_page *pp, paddr_t pa, unsigned clearbits) 4517 { 4518 struct pv_pte *pvpte; 4519 struct pmap *pmap; 4520 uint8_t oattrs; 4521 u_int result; 4522 4523 startover: 4524 mutex_spin_enter(&pp->pp_lock); 4525 for (pvpte = pv_pte_first(pp); pvpte; pvpte = pv_pte_next(pp, pvpte)) { 4526 if (pmap_sync_pv(pvpte, pa, clearbits, &oattrs, NULL)) { 4527 /* 4528 * raced with a V->P operation. wait for the other 4529 * side to finish by acquiring pmap's lock. it is 4530 * probably unmapping the page, and it will be gone 4531 * when the loop is restarted. 4532 */ 4533 pmap = ptp_to_pmap(pvpte->pte_ptp); 4534 pmap_reference(pmap); 4535 mutex_spin_exit(&pp->pp_lock); 4536 mutex_enter(&pmap->pm_lock); 4537 /* nothing. */ 4538 mutex_exit(&pmap->pm_lock); 4539 pmap_destroy(pmap); 4540 goto startover; 4541 } 4542 pp->pp_attrs |= oattrs; 4543 } 4544 result = pp->pp_attrs & clearbits; 4545 pp->pp_attrs &= ~clearbits; 4546 pmap_tlb_shootnow(); 4547 mutex_spin_exit(&pp->pp_lock); 4548 4549 return result != 0; 4550 } 4551 4552 /* 4553 * pmap_clear_attrs: clear the specified attribute for a page. 4554 * 4555 * => we return true if we cleared one of the bits we were asked to 4556 */ 4557 bool 4558 pmap_clear_attrs(struct vm_page *pg, unsigned clearbits) 4559 { 4560 struct pmap_page *pp; 4561 paddr_t pa; 4562 4563 pp = VM_PAGE_TO_PP(pg); 4564 pa = VM_PAGE_TO_PHYS(pg); 4565 4566 /* 4567 * If this is a new page, assert it has no mappings and simply zap 4568 * the stored attributes without taking any locks. 4569 */ 4570 if ((pg->flags & PG_FAKE) != 0) { 4571 KASSERT(atomic_load_relaxed(&pp->pp_pte.pte_va) == 0); 4572 KASSERT(atomic_load_relaxed(&pp->pp_pte.pte_ptp) == NULL); 4573 KASSERT(atomic_load_relaxed(&pp->pp_pvlist.lh_first) == NULL); 4574 atomic_store_relaxed(&pp->pp_attrs, 0); 4575 return false; 4576 } else { 4577 return pmap_pp_clear_attrs(pp, pa, clearbits); 4578 } 4579 } 4580 4581 /* 4582 * pmap_pv_clear_attrs: clear the specified attributes for an unmanaged 4583 * pv-tracked page. 4584 */ 4585 bool 4586 pmap_pv_clear_attrs(paddr_t pa, unsigned clearbits) 4587 { 4588 struct pmap_page *pp; 4589 4590 pp = pmap_pv_tracked(pa); 4591 if (pp == NULL) 4592 panic("%s: page not pv-tracked: %#"PRIxPADDR, __func__, pa); 4593 4594 return pmap_pp_clear_attrs(pp, pa, clearbits); 4595 } 4596 4597 /* 4598 * p m a p p r o t e c t i o n f u n c t i o n s 4599 */ 4600 4601 /* 4602 * pmap_page_protect: change the protection of all recorded mappings 4603 * of a managed page 4604 * 4605 * => NOTE: this is an inline function in pmap.h 4606 */ 4607 4608 /* see pmap.h */ 4609 4610 /* 4611 * pmap_pv_protect: change the protection of all recorded mappings 4612 * of an unmanaged pv-tracked page 4613 * 4614 * => NOTE: this is an inline function in pmap.h 4615 */ 4616 4617 /* see pmap.h */ 4618 4619 /* 4620 * pmap_protect: set the protection of the pages in a pmap 4621 * 4622 * => NOTE: this is an inline function in pmap.h 4623 */ 4624 4625 /* see pmap.h */ 4626 4627 /* 4628 * pmap_write_protect: write-protect pages in a pmap. 4629 * 4630 * Note for Xen-amd64. Xen automatically adds PTE_U to the kernel pages, but we 4631 * don't need to remove this bit when re-entering the PTEs here: Xen tracks the 4632 * kernel pages with a reserved bit (_PAGE_GUEST_KERNEL), so even if PTE_U is 4633 * present the page will still be considered as a kernel page, and the privilege 4634 * separation will be enforced correctly.
4635 */ 4636 void 4637 pmap_write_protect(struct pmap *pmap, vaddr_t sva, vaddr_t eva, vm_prot_t prot) 4638 { 4639 pt_entry_t bit_rem, bit_put; 4640 pt_entry_t *ptes; 4641 pd_entry_t * const *pdes; 4642 struct pmap *pmap2; 4643 vaddr_t blockend, va; 4644 int lvl, i; 4645 4646 if (__predict_false(pmap->pm_write_protect != NULL)) { 4647 (*pmap->pm_write_protect)(pmap, sva, eva, prot); 4648 return; 4649 } 4650 4651 bit_rem = 0; 4652 if (!(prot & VM_PROT_WRITE)) 4653 bit_rem = PTE_W; 4654 4655 bit_put = 0; 4656 if (!(prot & VM_PROT_EXECUTE)) 4657 bit_put = pmap_pg_nx; 4658 4659 sva &= ~PAGE_MASK; 4660 eva &= ~PAGE_MASK; 4661 4662 /* 4663 * Acquire pmap. No need to lock the kernel pmap as we won't 4664 * be touching PV entries nor stats and kernel PDEs aren't 4665 * freed. 4666 */ 4667 if (pmap != pmap_kernel()) { 4668 mutex_enter(&pmap->pm_lock); 4669 } 4670 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); 4671 4672 for (va = sva ; va < eva; va = blockend) { 4673 pt_entry_t *spte, *epte; 4674 4675 blockend = x86_round_pdr(va + 1); 4676 if (blockend > eva) 4677 blockend = eva; 4678 4679 /* Is it a valid block? */ 4680 if (!pmap_pdes_valid(va, pdes, NULL, &lvl)) { 4681 continue; 4682 } 4683 KASSERT(va < VM_MAXUSER_ADDRESS || va >= VM_MAX_ADDRESS); 4684 KASSERT(lvl == 1); 4685 4686 spte = &ptes[pl1_i(va)]; 4687 epte = &ptes[pl1_i(blockend)]; 4688 4689 for (i = 0; spte < epte; spte++, i++) { 4690 pt_entry_t opte, npte; 4691 4692 do { 4693 opte = *spte; 4694 if (!pmap_valid_entry(opte)) { 4695 goto next; 4696 } 4697 npte = (opte & ~bit_rem) | bit_put; 4698 } while (pmap_pte_cas(spte, opte, npte) != opte); 4699 4700 if ((opte & PTE_D) != 0) { 4701 vaddr_t tva = va + x86_ptob(i); 4702 pmap_tlb_shootdown(pmap, tva, opte, 4703 TLBSHOOT_WRITE_PROTECT); 4704 } 4705 next:; 4706 } 4707 } 4708 4709 /* Release pmap. */ 4710 pmap_unmap_ptes(pmap, pmap2); 4711 if (pmap != pmap_kernel()) { 4712 mutex_exit(&pmap->pm_lock); 4713 } 4714 } 4715 4716 /* 4717 * pmap_unwire: clear the wired bit in the PTE. 4718 * 4719 * => Mapping should already be present. 4720 */ 4721 void 4722 pmap_unwire(struct pmap *pmap, vaddr_t va) 4723 { 4724 pt_entry_t *ptes, *ptep, opte; 4725 pd_entry_t * const *pdes; 4726 struct pmap *pmap2; 4727 int lvl; 4728 4729 if (__predict_false(pmap->pm_unwire != NULL)) { 4730 (*pmap->pm_unwire)(pmap, va); 4731 return; 4732 } 4733 4734 /* 4735 * Acquire pmap. Need to lock the kernel pmap only to protect the 4736 * statistics. 4737 */ 4738 mutex_enter(&pmap->pm_lock); 4739 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); 4740 4741 if (!pmap_pdes_valid(va, pdes, NULL, &lvl)) { 4742 panic("%s: invalid PDE va=%#" PRIxVADDR, __func__, va); 4743 } 4744 KASSERT(lvl == 1); 4745 4746 ptep = &ptes[pl1_i(va)]; 4747 opte = *ptep; 4748 KASSERT(pmap_valid_entry(opte)); 4749 4750 if (opte & PTE_WIRED) { 4751 pt_entry_t npte = opte & ~PTE_WIRED; 4752 4753 opte = pmap_pte_testset(ptep, npte); 4754 pmap_stats_update_bypte(pmap, npte, opte); 4755 } else { 4756 printf("%s: wiring for pmap %p va %#" PRIxVADDR 4757 " did not change!\n", __func__, pmap, va); 4758 } 4759 4760 /* Release pmap.
*/ 4761 pmap_unmap_ptes(pmap, pmap2); 4762 mutex_exit(&pmap->pm_lock); 4763 } 4764 4765 /* 4766 * pmap_copy: copy mappings from one pmap to another 4767 * 4768 * => optional function 4769 * void pmap_copy(dst_pmap, src_pmap, dst_addr, len, src_addr) 4770 */ 4771 4772 /* 4773 * defined as macro in pmap.h 4774 */ 4775 4776 __strict_weak_alias(pmap_enter, pmap_enter_default); 4777 4778 int 4779 pmap_enter_default(pmap_t pmap, vaddr_t va, paddr_t pa, vm_prot_t prot, 4780 u_int flags) 4781 { 4782 if (__predict_false(pmap->pm_enter != NULL)) { 4783 return (*pmap->pm_enter)(pmap, va, pa, prot, flags); 4784 } 4785 4786 return pmap_enter_ma(pmap, va, pa, pa, prot, flags, 0); 4787 } 4788 4789 /* 4790 * pmap_enter: enter a mapping into a pmap 4791 * 4792 * => must be done "now" ... no lazy-evaluation 4793 */ 4794 int 4795 pmap_enter_ma(struct pmap *pmap, vaddr_t va, paddr_t ma, paddr_t pa, 4796 vm_prot_t prot, u_int flags, int domid) 4797 { 4798 pt_entry_t *ptes, opte, npte; 4799 pt_entry_t *ptep; 4800 pd_entry_t * const *pdes; 4801 struct vm_page *ptp; 4802 struct vm_page *new_pg, *old_pg; 4803 struct pmap_page *new_pp, *old_pp; 4804 struct pv_entry *old_pve, *new_pve; 4805 bool wired = (flags & PMAP_WIRED) != 0; 4806 struct pmap *pmap2; 4807 struct pmap_ptparray pt; 4808 int error; 4809 bool getptp, samepage, new_embedded; 4810 rb_tree_t *tree; 4811 4812 KASSERT(pmap_initialized); 4813 KASSERT(va < VM_MAX_KERNEL_ADDRESS); 4814 KASSERTMSG(va != (vaddr_t)PDP_BASE, "%s: trying to map va=%#" 4815 PRIxVADDR " over PDP!", __func__, va); 4816 KASSERTMSG(va < VM_MIN_KERNEL_ADDRESS || 4817 pmap_valid_entry(pmap->pm_pdir[pl_i(va, PTP_LEVELS)]), 4818 "%s: missing kernel PTP for va=%#" PRIxVADDR, __func__, va); 4819 4820 #ifdef XENPV 4821 KASSERT(domid == DOMID_SELF || pa == 0); 4822 #endif 4823 4824 npte = ma | protection_codes[prot] | PTE_P; 4825 npte |= pmap_pat_flags(flags); 4826 if (wired) 4827 npte |= PTE_WIRED; 4828 if (va < VM_MAXUSER_ADDRESS) 4829 npte |= PTE_U; 4830 4831 if (pmap == pmap_kernel()) 4832 npte |= pmap_pg_g; 4833 if (flags & VM_PROT_ALL) { 4834 npte |= PTE_A; 4835 if (flags & VM_PROT_WRITE) { 4836 KASSERT((npte & PTE_W) != 0); 4837 npte |= PTE_D; 4838 } 4839 } 4840 4841 #ifdef XENPV 4842 if (domid != DOMID_SELF) 4843 new_pg = NULL; 4844 else 4845 #endif 4846 new_pg = PHYS_TO_VM_PAGE(pa); 4847 4848 if (new_pg != NULL) { 4849 /* This is a managed page */ 4850 npte |= PTE_PVLIST; 4851 new_pp = VM_PAGE_TO_PP(new_pg); 4852 PMAP_CHECK_PP(new_pp); 4853 } else if ((new_pp = pmap_pv_tracked(pa)) != NULL) { 4854 /* This is an unmanaged pv-tracked page */ 4855 npte |= PTE_PVLIST; 4856 PMAP_CHECK_PP(new_pp); 4857 } else { 4858 new_pp = NULL; 4859 } 4860 4861 /* Begin by locking the pmap. */ 4862 mutex_enter(&pmap->pm_lock); 4863 4864 /* Look up the PTP. Allocate if none present. */ 4865 ptp = NULL; 4866 getptp = false; 4867 if (pmap != pmap_kernel()) { 4868 ptp = pmap_find_ptp(pmap, va, 1); 4869 if (ptp == NULL) { 4870 getptp = true; 4871 error = pmap_get_ptp(pmap, &pt, va, flags, &ptp); 4872 if (error != 0) { 4873 if (flags & PMAP_CANFAIL) { 4874 mutex_exit(&pmap->pm_lock); 4875 return error; 4876 } 4877 panic("%s: get ptp failed, error=%d", __func__, 4878 error); 4879 } 4880 } 4881 tree = &VM_PAGE_TO_PP(ptp)->pp_rb; 4882 } else { 4883 /* Embedded PV entries rely on this. */ 4884 KASSERT(va != 0); 4885 tree = &pmap_kernel_rb; 4886 } 4887 4888 /* 4889 * Look up the old PV entry at this VA (if any), and insert a new PV 4890 * entry if required for the new mapping. 
Temporarily track the old 4891 * and new mappings concurrently. Only after the old mapping is 4892 * evicted from the pmap will we remove its PV entry. Otherwise, 4893 * our picture of modified/accessed state for either page could get 4894 * out of sync (we need any P->V operation for either page to stall 4895 * on pmap->pm_lock until done here). 4896 */ 4897 new_pve = NULL; 4898 old_pve = NULL; 4899 samepage = false; 4900 new_embedded = false; 4901 4902 if (new_pp != NULL) { 4903 error = pmap_enter_pv(pmap, new_pp, ptp, va, &new_pve, 4904 &old_pve, &samepage, &new_embedded, tree); 4905 4906 /* 4907 * If a new pv_entry was needed and none was available, we 4908 * can go no further. 4909 */ 4910 if (error != 0) { 4911 if (flags & PMAP_CANFAIL) { 4912 if (getptp) { 4913 pmap_unget_ptp(pmap, &pt); 4914 } 4915 mutex_exit(&pmap->pm_lock); 4916 return error; 4917 } 4918 panic("%s: alloc pve failed", __func__); 4919 } 4920 } else { 4921 old_pve = pmap_treelookup_pv(pmap, ptp, tree, va); 4922 } 4923 4924 /* Map PTEs into address space. */ 4925 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); 4926 4927 /* Install any newly allocated PTPs. */ 4928 if (getptp) { 4929 pmap_install_ptp(pmap, &pt, va, pdes); 4930 } 4931 4932 /* Check if there is an existing mapping. */ 4933 ptep = &ptes[pl1_i(va)]; 4934 opte = *ptep; 4935 bool have_oldpa = pmap_valid_entry(opte); 4936 paddr_t oldpa = pmap_pte2pa(opte); 4937 4938 /* 4939 * Update the pte. 4940 */ 4941 do { 4942 opte = *ptep; 4943 4944 /* 4945 * if the same page, inherit PTE_A and PTE_D. 4946 */ 4947 if (((opte ^ npte) & (PTE_FRAME | PTE_P)) == 0) { 4948 npte |= opte & (PTE_A | PTE_D); 4949 } 4950 #if defined(XENPV) 4951 if (domid != DOMID_SELF) { 4952 /* pmap_pte_cas with error handling */ 4953 int s = splvm(); 4954 if (opte != *ptep) { 4955 splx(s); 4956 continue; 4957 } 4958 error = xpq_update_foreign( 4959 vtomach((vaddr_t)ptep), npte, domid, flags); 4960 splx(s); 4961 if (error) { 4962 /* Undo pv_entry tracking - oof. */ 4963 if (new_pp != NULL) { 4964 mutex_spin_enter(&new_pp->pp_lock); 4965 if (new_pve != NULL) { 4966 LIST_REMOVE(new_pve, pve_list); 4967 KASSERT(pmap->pm_pve == NULL); 4968 pmap->pm_pve = new_pve; 4969 } else if (new_embedded) { 4970 new_pp->pp_pte.pte_ptp = NULL; 4971 new_pp->pp_pte.pte_va = 0; 4972 } 4973 mutex_spin_exit(&new_pp->pp_lock); 4974 } 4975 pmap_unmap_ptes(pmap, pmap2); 4976 /* Free new PTP. */ 4977 if (ptp != NULL && ptp->wire_count <= 1) { 4978 pmap_free_ptp(pmap, ptp, va, ptes, 4979 pdes); 4980 } 4981 mutex_exit(&pmap->pm_lock); 4982 return error; 4983 } 4984 break; 4985 } 4986 #endif /* defined(XENPV) */ 4987 } while (pmap_pte_cas(ptep, opte, npte) != opte); 4988 4989 /* 4990 * Done with the PTEs: they can now be unmapped. 4991 */ 4992 pmap_unmap_ptes(pmap, pmap2); 4993 4994 /* 4995 * Update statistics and PTP's reference count. 4996 */ 4997 pmap_stats_update_bypte(pmap, npte, opte); 4998 if (ptp != NULL) { 4999 if (!have_oldpa) { 5000 ptp->wire_count++; 5001 } 5002 /* Remember minimum VA in PTP. */ 5003 pmap_ptp_range_set(ptp, va); 5004 } 5005 KASSERT(ptp == NULL || ptp->wire_count > 1); 5006 5007 /* 5008 * If the same page, we can skip pv_entry handling. 
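 * (the PTE_FRAME comparison below means the old and new PTEs reference the
 * same physical page, so any PV entry set up above is already the right one)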
5009 */ 5010 if (((opte ^ npte) & (PTE_FRAME | PTE_P)) == 0) { 5011 KASSERT(((opte ^ npte) & PTE_PVLIST) == 0); 5012 if ((npte & PTE_PVLIST) != 0) { 5013 KASSERT(samepage); 5014 pmap_check_pv(pmap, ptp, new_pp, va, true); 5015 } 5016 goto same_pa; 5017 } else if ((npte & PTE_PVLIST) != 0) { 5018 KASSERT(!samepage); 5019 } 5020 5021 /* 5022 * If old page is pv-tracked, remove pv_entry from its list. 5023 */ 5024 if ((~opte & (PTE_P | PTE_PVLIST)) == 0) { 5025 if ((old_pg = PHYS_TO_VM_PAGE(oldpa)) != NULL) { 5026 old_pp = VM_PAGE_TO_PP(old_pg); 5027 } else if ((old_pp = pmap_pv_tracked(oldpa)) == NULL) { 5028 panic("%s: PTE_PVLIST with pv-untracked page" 5029 " va = %#"PRIxVADDR 5030 " pa = %#" PRIxPADDR " (%#" PRIxPADDR ")", 5031 __func__, va, oldpa, atop(pa)); 5032 } 5033 5034 pmap_remove_pv(pmap, old_pp, ptp, va, old_pve, 5035 pmap_pte_to_pp_attrs(opte)); 5036 } else { 5037 KASSERT(old_pve == NULL); 5038 KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == NULL); 5039 } 5040 5041 /* 5042 * If new page is dynamically PV tracked, insert to tree. 5043 */ 5044 if (new_pve != NULL) { 5045 KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == NULL); 5046 old_pve = rb_tree_insert_node(tree, new_pve); 5047 KASSERT(old_pve == new_pve); 5048 pmap_check_pv(pmap, ptp, new_pp, va, true); 5049 } 5050 5051 same_pa: 5052 /* 5053 * shootdown tlb if necessary. 5054 */ 5055 5056 if ((~opte & (PTE_P | PTE_A)) == 0 && 5057 ((opte ^ npte) & (PTE_FRAME | PTE_W)) != 0) { 5058 pmap_tlb_shootdown(pmap, va, opte, TLBSHOOT_ENTER); 5059 } 5060 pmap_drain_pv(pmap); 5061 mutex_exit(&pmap->pm_lock); 5062 return 0; 5063 } 5064 5065 #if defined(XEN) && defined(DOM0OPS) 5066 5067 struct pmap_data_gnt { 5068 SLIST_ENTRY(pmap_data_gnt) pd_gnt_list; 5069 vaddr_t pd_gnt_sva; 5070 vaddr_t pd_gnt_eva; /* range covered by this gnt */ 5071 int pd_gnt_refs; /* ref counter */ 5072 struct gnttab_map_grant_ref pd_gnt_ops[1]; /* variable length */ 5073 }; 5074 SLIST_HEAD(pmap_data_gnt_head, pmap_data_gnt); 5075 5076 static void pmap_remove_gnt(struct pmap *, vaddr_t, vaddr_t); 5077 5078 static struct pmap_data_gnt * 5079 pmap_find_gnt(struct pmap *pmap, vaddr_t sva, vaddr_t eva) 5080 { 5081 struct pmap_data_gnt_head *headp; 5082 struct pmap_data_gnt *pgnt; 5083 5084 KASSERT(mutex_owned(&pmap->pm_lock)); 5085 headp = pmap->pm_data; 5086 KASSERT(headp != NULL); 5087 SLIST_FOREACH(pgnt, headp, pd_gnt_list) { 5088 if (pgnt->pd_gnt_sva <= sva && eva <= pgnt->pd_gnt_eva) 5089 return pgnt; 5090 /* check that we're not overlapping part of a region */ 5091 KASSERT(pgnt->pd_gnt_sva >= eva || pgnt->pd_gnt_eva <= sva); 5092 } 5093 return NULL; 5094 } 5095 5096 static void 5097 pmap_alloc_gnt(struct pmap *pmap, vaddr_t sva, int nentries, 5098 const struct gnttab_map_grant_ref *ops) 5099 { 5100 struct pmap_data_gnt_head *headp; 5101 struct pmap_data_gnt *pgnt; 5102 vaddr_t eva = sva + nentries * PAGE_SIZE; 5103 KASSERT(mutex_owned(&pmap->pm_lock)); 5104 KASSERT(nentries >= 1); 5105 if (pmap->pm_remove == NULL) { 5106 pmap->pm_remove = pmap_remove_gnt; 5107 KASSERT(pmap->pm_data == NULL); 5108 headp = kmem_alloc(sizeof(*headp), KM_SLEEP); 5109 SLIST_INIT(headp); 5110 pmap->pm_data = headp; 5111 } else { 5112 KASSERT(pmap->pm_remove == pmap_remove_gnt); 5113 KASSERT(pmap->pm_data != NULL); 5114 headp = pmap->pm_data; 5115 } 5116 5117 pgnt = pmap_find_gnt(pmap, sva, eva); 5118 if (pgnt != NULL) { 5119 KASSERT(pgnt->pd_gnt_sva == sva); 5120 KASSERT(pgnt->pd_gnt_eva == eva); 5121 return; 5122 } 5123 5124 /* new entry */ 5125 pgnt = kmem_alloc(sizeof(*pgnt) + 
5126 (nentries - 1) * sizeof(struct gnttab_map_grant_ref), KM_SLEEP); 5127 pgnt->pd_gnt_sva = sva; 5128 pgnt->pd_gnt_eva = eva; 5129 pgnt->pd_gnt_refs = 0; 5130 memcpy(pgnt->pd_gnt_ops, ops, 5131 sizeof(struct gnttab_map_grant_ref) * nentries); 5132 SLIST_INSERT_HEAD(headp, pgnt, pd_gnt_list); 5133 } 5134 5135 static void 5136 pmap_free_gnt(struct pmap *pmap, struct pmap_data_gnt *pgnt) 5137 { 5138 struct pmap_data_gnt_head *headp = pmap->pm_data; 5139 int nentries = (pgnt->pd_gnt_eva - pgnt->pd_gnt_sva) / PAGE_SIZE; 5140 KASSERT(nentries >= 1); 5141 KASSERT(mutex_owned(&pmap->pm_lock)); 5142 KASSERT(pgnt->pd_gnt_refs == 0); 5143 SLIST_REMOVE(headp, pgnt, pmap_data_gnt, pd_gnt_list); 5144 kmem_free(pgnt, sizeof(*pgnt) + 5145 (nentries - 1) * sizeof(struct gnttab_map_grant_ref)); 5146 if (SLIST_EMPTY(headp)) { 5147 kmem_free(headp, sizeof(*headp)); 5148 pmap->pm_data = NULL; 5149 pmap->pm_remove = NULL; 5150 } 5151 } 5152 5153 /* 5154 * pmap_enter_gnt: enter a grant entry into a pmap 5155 * 5156 * => must be done "now" ... no lazy-evaluation 5157 */ 5158 int 5159 pmap_enter_gnt(struct pmap *pmap, vaddr_t va, vaddr_t sva, int nentries, 5160 const struct gnttab_map_grant_ref *oops) 5161 { 5162 struct pmap_data_gnt *pgnt; 5163 pt_entry_t *ptes, opte; 5164 pt_entry_t *ptep; 5165 pd_entry_t * const *pdes; 5166 struct vm_page *ptp; 5167 struct vm_page *old_pg; 5168 struct pmap_page *old_pp; 5169 struct pv_entry *old_pve; 5170 struct pmap *pmap2; 5171 struct pmap_ptparray pt; 5172 int error; 5173 bool getptp; 5174 rb_tree_t *tree; 5175 struct gnttab_map_grant_ref *op; 5176 int ret; 5177 int idx; 5178 5179 KASSERT(pmap_initialized); 5180 KASSERT(va < VM_MAX_KERNEL_ADDRESS); 5181 KASSERTMSG(va != (vaddr_t)PDP_BASE, "%s: trying to map va=%#" 5182 PRIxVADDR " over PDP!", __func__, va); 5183 KASSERT(pmap != pmap_kernel()); 5184 5185 /* Begin by locking the pmap. */ 5186 mutex_enter(&pmap->pm_lock); 5187 pmap_alloc_gnt(pmap, sva, nentries, oops); 5188 5189 pgnt = pmap_find_gnt(pmap, va, va + PAGE_SIZE); 5190 KASSERT(pgnt != NULL); 5191 5192 /* Look up the PTP. Allocate if none present. */ 5193 ptp = NULL; 5194 getptp = false; 5195 ptp = pmap_find_ptp(pmap, va, 1); 5196 if (ptp == NULL) { 5197 getptp = true; 5198 error = pmap_get_ptp(pmap, &pt, va, PMAP_CANFAIL, &ptp); 5199 if (error != 0) { 5200 mutex_exit(&pmap->pm_lock); 5201 return error; 5202 } 5203 } 5204 tree = &VM_PAGE_TO_PP(ptp)->pp_rb; 5205 5206 /* 5207 * Look up the old PV entry at this VA (if any), and insert a new PV 5208 * entry if required for the new mapping. Temporarily track the old 5209 * and new mappings concurrently. Only after the old mapping is 5210 * evicted from the pmap will we remove its PV entry. Otherwise, 5211 * our picture of modified/accessed state for either page could get 5212 * out of sync (we need any P->V operation for either page to stall 5213 * on pmap->pm_lock until done here). 5214 */ 5215 old_pve = NULL; 5216 5217 old_pve = pmap_treelookup_pv(pmap, ptp, tree, va); 5218 5219 /* Map PTEs into address space. */ 5220 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); 5221 5222 /* Install any newly allocated PTPs. */ 5223 if (getptp) { 5224 pmap_install_ptp(pmap, &pt, va, pdes); 5225 } 5226 5227 /* Check if there is an existing mapping. */ 5228 ptep = &ptes[pl1_i(va)]; 5229 opte = *ptep; 5230 bool have_oldpa = pmap_valid_entry(opte); 5231 paddr_t oldpa = pmap_pte2pa(opte); 5232 5233 /* 5234 * Update the pte. 
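 * (for a grant mapping the PTE is written by the hypervisor via
 * GNTTABOP_map_grant_ref below, not with a local CAS loop as in
 * pmap_enter_ma())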
5235 */ 5236 5237 idx = (va - pgnt->pd_gnt_sva) / PAGE_SIZE; 5238 op = &pgnt->pd_gnt_ops[idx]; 5239 5240 #ifdef XENPV /* XXX */ 5241 op->host_addr = xpmap_ptetomach(ptep); 5242 #endif 5243 op->dev_bus_addr = 0; 5244 op->status = GNTST_general_error; 5245 ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, op, 1); 5246 if (__predict_false(ret)) { 5247 printf("%s: GNTTABOP_map_grant_ref failed: %d\n", 5248 __func__, ret); 5249 op->status = GNTST_general_error; 5250 } 5251 for (int d = 0; d < 256 && op->status == GNTST_eagain; d++) { 5252 kpause("gntmap", false, mstohz(1), NULL); 5253 ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, op, 1); 5254 if (__predict_false(ret)) { 5255 printf("%s: GNTTABOP_map_grant_ref failed: %d\n", 5256 __func__, ret); 5257 op->status = GNTST_general_error; 5258 } 5259 } 5260 if (__predict_false(op->status != GNTST_okay)) { 5261 printf("%s: GNTTABOP_map_grant_ref status: %d\n", 5262 __func__, op->status); 5263 if (have_oldpa) { 5264 ptp->wire_count--; 5265 } 5266 } else { 5267 pgnt->pd_gnt_refs++; 5268 if (!have_oldpa) { 5269 ptp->wire_count++; 5270 } 5271 KASSERT(ptp->wire_count > 1); 5272 /* Remember minimum VA in PTP. */ 5273 pmap_ptp_range_set(ptp, va); 5274 } 5275 if (ptp->wire_count <= 1) 5276 pmap_free_ptp(pmap, ptp, va, ptes, pdes); 5277 5278 /* 5279 * Done with the PTEs: they can now be unmapped. 5280 */ 5281 pmap_unmap_ptes(pmap, pmap2); 5282 5283 /* 5284 * Update statistics and PTP's reference count. 5285 */ 5286 pmap_stats_update_bypte(pmap, 0, opte); 5287 5288 /* 5289 * If old page is pv-tracked, remove pv_entry from its list. 5290 */ 5291 if ((~opte & (PTE_P | PTE_PVLIST)) == 0) { 5292 if ((old_pg = PHYS_TO_VM_PAGE(oldpa)) != NULL) { 5293 old_pp = VM_PAGE_TO_PP(old_pg); 5294 } else if ((old_pp = pmap_pv_tracked(oldpa)) == NULL) { 5295 panic("%s: PTE_PVLIST with pv-untracked page" 5296 " va = %#"PRIxVADDR " pa = %#" PRIxPADDR, 5297 __func__, va, oldpa); 5298 } 5299 5300 pmap_remove_pv(pmap, old_pp, ptp, va, old_pve, 5301 pmap_pte_to_pp_attrs(opte)); 5302 } else { 5303 KASSERT(old_pve == NULL); 5304 KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == NULL); 5305 } 5306 5307 pmap_drain_pv(pmap); 5308 mutex_exit(&pmap->pm_lock); 5309 return op->status; 5310 } 5311 5312 /* 5313 * pmap_remove_gnt: grant mapping removal function. 5314 * 5315 * => caller should not be holding any pmap locks 5316 */ 5317 static void 5318 pmap_remove_gnt(struct pmap *pmap, vaddr_t sva, vaddr_t eva) 5319 { 5320 struct pmap_data_gnt *pgnt; 5321 pt_entry_t *ptes; 5322 pd_entry_t pde; 5323 pd_entry_t * const *pdes; 5324 struct vm_page *ptp; 5325 struct pmap *pmap2; 5326 vaddr_t va; 5327 int lvl; 5328 int idx; 5329 struct gnttab_map_grant_ref *op; 5330 struct gnttab_unmap_grant_ref unmap_op; 5331 int ret; 5332 5333 KASSERT(pmap != pmap_kernel()); 5334 KASSERT(pmap->pm_remove == pmap_remove_gnt); 5335 5336 mutex_enter(&pmap->pm_lock); 5337 for (va = sva; va < eva; va += PAGE_SIZE) { 5338 pgnt = pmap_find_gnt(pmap, va, va + PAGE_SIZE); 5339 if (pgnt == NULL) { 5340 pmap_remove_locked(pmap, sva, eva); 5341 continue; 5342 } 5343 5344 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); 5345 if (!pmap_pdes_valid(va, pdes, &pde, &lvl)) { 5346 panic("pmap_remove_gnt pdes not valid"); 5347 } 5348 5349 idx = (va - pgnt->pd_gnt_sva) / PAGE_SIZE; 5350 op = &pgnt->pd_gnt_ops[idx]; 5351 KASSERT(lvl == 1); 5352 KASSERT(op->status == GNTST_okay); 5353 5354 /* Get PTP if non-kernel mapping. 
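 * (unlike pmap_remove_locked(), pmap is never the kernel pmap here,
 * so a PTP must already exist)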
*/ 5355 ptp = pmap_find_ptp(pmap, va, 1); 5356 KASSERTMSG(ptp != NULL, 5357 "%s: unmanaged PTP detected", __func__); 5358 5359 if (op->status == GNTST_okay) { 5360 KASSERT(pmap_valid_entry(ptes[pl1_i(va)])); 5361 unmap_op.handle = op->handle; 5362 unmap_op.dev_bus_addr = 0; 5363 #ifdef XENPV /* XXX */ 5364 unmap_op.host_addr = xpmap_ptetomach(&ptes[pl1_i(va)]); 5365 #endif 5366 ret = HYPERVISOR_grant_table_op( 5367 GNTTABOP_unmap_grant_ref, &unmap_op, 1); 5368 if (ret) { 5369 printf("%s: GNTTABOP_unmap_grant_ref " 5370 "failed: %d\n", __func__, ret); 5371 } 5372 5373 ptp->wire_count--; 5374 pgnt->pd_gnt_refs--; 5375 if (pgnt->pd_gnt_refs == 0) { 5376 pmap_free_gnt(pmap, pgnt); 5377 } 5378 } 5379 /* 5380 * if mapping removed and the PTP is no longer 5381 * being used, free it! 5382 */ 5383 5384 if (ptp->wire_count <= 1) 5385 pmap_free_ptp(pmap, ptp, va, ptes, pdes); 5386 pmap_unmap_ptes(pmap, pmap2); 5387 } 5388 mutex_exit(&pmap->pm_lock); 5389 } 5390 #endif /* XEN && DOM0OPS */ 5391 5392 paddr_t 5393 pmap_get_physpage(void) 5394 { 5395 struct vm_page *ptp; 5396 struct pmap *kpm = pmap_kernel(); 5397 paddr_t pa; 5398 5399 if (!uvm.page_init_done) { 5400 /* 5401 * We're growing the kernel pmap early (from 5402 * uvm_pageboot_alloc()). This case must be 5403 * handled a little differently. 5404 */ 5405 5406 if (!uvm_page_physget(&pa)) 5407 panic("%s: out of memory", __func__); 5408 #if defined(__HAVE_DIRECT_MAP) 5409 memset(PAGE_ALIGNED(PMAP_DIRECT_MAP(pa)), 0, PAGE_SIZE); 5410 #else 5411 #if defined(XENPV) 5412 if (XEN_VERSION_SUPPORTED(3, 4)) { 5413 xen_pagezero(pa); 5414 return pa; 5415 } 5416 #endif 5417 kpreempt_disable(); 5418 pmap_pte_set(early_zero_pte, pmap_pa2pte(pa) | PTE_P | 5419 PTE_W | pmap_pg_nx); 5420 pmap_pte_flush(); 5421 pmap_update_pg((vaddr_t)early_zerop); 5422 memset(PAGE_ALIGNED(early_zerop), 0, PAGE_SIZE); 5423 #if defined(DIAGNOSTIC) || defined(XENPV) 5424 pmap_pte_set(early_zero_pte, 0); 5425 pmap_pte_flush(); 5426 #endif /* defined(DIAGNOSTIC) */ 5427 kpreempt_enable(); 5428 #endif /* defined(__HAVE_DIRECT_MAP) */ 5429 } else { 5430 /* XXX */ 5431 ptp = uvm_pagealloc(NULL, 0, NULL, 5432 UVM_PGA_USERESERVE|UVM_PGA_ZERO); 5433 if (ptp == NULL) 5434 panic("%s: out of memory", __func__); 5435 ptp->flags &= ~PG_BUSY; 5436 ptp->wire_count = 1; 5437 pa = VM_PAGE_TO_PHYS(ptp); 5438 } 5439 pmap_stats_update(kpm, 1, 0); 5440 5441 return pa; 5442 } 5443 5444 /* 5445 * Expand the page tree with the specified amount of PTPs, mapping virtual 5446 * addresses starting at kva. We populate all the levels but the last one 5447 * (L1). The nodes of the tree are created as RW, but the pages covered 5448 * will be kentered in L1, with proper permissions. 5449 * 5450 * Used only by pmap_growkernel. 
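 * For example (amd64, 4-level paging): covering one new 2MB block needs a
 * new L1 page hooked into the L2 directory, plus a new L2 or L3 page only
 * when a 1GB or 512GB boundary is crossed.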
5451 */ 5452 static void 5453 pmap_alloc_level(struct pmap *cpm, vaddr_t kva, long *needed_ptps) 5454 { 5455 unsigned long i; 5456 paddr_t pa; 5457 unsigned long index, endindex; 5458 int level; 5459 pd_entry_t *pdep; 5460 #ifdef XENPV 5461 int s = splvm(); /* protect xpq_* */ 5462 #endif 5463 5464 for (level = PTP_LEVELS; level > 1; level--) { 5465 if (level == PTP_LEVELS) 5466 pdep = cpm->pm_pdir; 5467 else 5468 pdep = normal_pdes[level - 2]; 5469 index = pl_i_roundup(kva, level); 5470 endindex = index + needed_ptps[level - 1] - 1; 5471 5472 for (i = index; i <= endindex; i++) { 5473 pt_entry_t pte; 5474 5475 KASSERT(!pmap_valid_entry(pdep[i])); 5476 pa = pmap_get_physpage(); 5477 pte = pmap_pa2pte(pa) | PTE_P | PTE_W; 5478 #ifdef __x86_64__ 5479 pte |= pmap_pg_nx; 5480 #endif 5481 pmap_pte_set(&pdep[i], pte); 5482 5483 #ifdef XENPV 5484 if (level == PTP_LEVELS && i >= PDIR_SLOT_KERN) { 5485 if (__predict_true( 5486 cpu_info_primary.ci_flags & CPUF_PRESENT)) { 5487 /* update per-cpu PMDs on all cpus */ 5488 xen_kpm_sync(pmap_kernel(), i); 5489 } else { 5490 /* 5491 * too early; update primary CPU 5492 * PMD only (without locks) 5493 */ 5494 #ifdef __x86_64__ 5495 pd_entry_t *cpu_pdep = 5496 &cpu_info_primary.ci_kpm_pdir[i]; 5497 #else 5498 pd_entry_t *cpu_pdep = 5499 &cpu_info_primary.ci_kpm_pdir[l2tol2(i)]; 5500 #endif 5501 pmap_pte_set(cpu_pdep, pte); 5502 } 5503 } 5504 #endif 5505 5506 KASSERT(level != PTP_LEVELS || nkptp[level - 1] + 5507 pl_i(VM_MIN_KERNEL_ADDRESS, level) == i); 5508 nkptp[level - 1]++; 5509 } 5510 pmap_pte_flush(); 5511 } 5512 #ifdef XENPV 5513 splx(s); 5514 #endif 5515 } 5516 5517 /* 5518 * pmap_growkernel: increase usage of KVM space. 5519 * 5520 * => we allocate new PTPs for the kernel and install them in all 5521 * the pmaps on the system. 5522 */ 5523 vaddr_t 5524 pmap_growkernel(vaddr_t maxkvaddr) 5525 { 5526 struct pmap *kpm = pmap_kernel(); 5527 struct pmap *cpm; 5528 #if !defined(XENPV) || !defined(__x86_64__) 5529 struct pmap *pm; 5530 long old; 5531 #endif 5532 int s, i; 5533 long needed_kptp[PTP_LEVELS], target_nptp; 5534 bool invalidate = false; 5535 5536 s = splvm(); /* to be safe */ 5537 mutex_enter(&kpm->pm_lock); 5538 5539 if (maxkvaddr <= pmap_maxkvaddr) { 5540 mutex_exit(&kpm->pm_lock); 5541 splx(s); 5542 return pmap_maxkvaddr; 5543 } 5544 5545 maxkvaddr = x86_round_pdr(maxkvaddr); 5546 #if !defined(XENPV) || !defined(__x86_64__) 5547 old = nkptp[PTP_LEVELS - 1]; 5548 #endif 5549 5550 /* Initialize needed_kptp. */ 5551 for (i = PTP_LEVELS - 1; i >= 1; i--) { 5552 target_nptp = pl_i_roundup(maxkvaddr, i + 1) - 5553 pl_i_roundup(VM_MIN_KERNEL_ADDRESS, i + 1); 5554 5555 if (target_nptp > nkptpmax[i]) 5556 panic("out of KVA space"); 5557 KASSERT(target_nptp >= nkptp[i]); 5558 needed_kptp[i] = target_nptp - nkptp[i]; 5559 } 5560 5561 #ifdef XENPV 5562 /* only pmap_kernel(), or the per-cpu map, has kernel entries */ 5563 cpm = kpm; 5564 #else 5565 /* Get the current pmap */ 5566 if (__predict_true(cpu_info_primary.ci_flags & CPUF_PRESENT)) { 5567 cpm = curcpu()->ci_pmap; 5568 } else { 5569 cpm = kpm; 5570 } 5571 #endif 5572 5573 kasan_shadow_map((void *)pmap_maxkvaddr, 5574 (size_t)(maxkvaddr - pmap_maxkvaddr)); 5575 kmsan_shadow_map((void *)pmap_maxkvaddr, 5576 (size_t)(maxkvaddr - pmap_maxkvaddr)); 5577 5578 pmap_alloc_level(cpm, pmap_maxkvaddr, needed_kptp); 5579 5580 /* 5581 * If the number of top level entries changed, update all pmaps. 
5582 */ 5583 if (needed_kptp[PTP_LEVELS - 1] != 0) { 5584 #ifdef XENPV 5585 #ifdef __x86_64__ 5586 /* nothing, kernel entries are never entered in user pmap */ 5587 #else 5588 int pdkidx; 5589 5590 mutex_enter(&pmaps_lock); 5591 LIST_FOREACH(pm, &pmaps, pm_list) { 5592 for (pdkidx = PDIR_SLOT_KERN + old; 5593 pdkidx < PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1]; 5594 pdkidx++) { 5595 pmap_pte_set(&pm->pm_pdir[pdkidx], 5596 kpm->pm_pdir[pdkidx]); 5597 } 5598 pmap_pte_flush(); 5599 } 5600 mutex_exit(&pmaps_lock); 5601 #endif /* __x86_64__ */ 5602 #else /* XENPV */ 5603 size_t newpdes; 5604 newpdes = nkptp[PTP_LEVELS - 1] - old; 5605 if (cpm != kpm) { 5606 memcpy(&kpm->pm_pdir[PDIR_SLOT_KERN + old], 5607 &cpm->pm_pdir[PDIR_SLOT_KERN + old], 5608 newpdes * sizeof(pd_entry_t)); 5609 } 5610 5611 mutex_enter(&pmaps_lock); 5612 LIST_FOREACH(pm, &pmaps, pm_list) { 5613 if (__predict_false(pm->pm_enter != NULL)) { 5614 /* 5615 * Not a native pmap, the kernel is not mapped, 5616 * so nothing to synchronize. 5617 */ 5618 continue; 5619 } 5620 memcpy(&pm->pm_pdir[PDIR_SLOT_KERN + old], 5621 &kpm->pm_pdir[PDIR_SLOT_KERN + old], 5622 newpdes * sizeof(pd_entry_t)); 5623 } 5624 mutex_exit(&pmaps_lock); 5625 #endif 5626 invalidate = true; 5627 } 5628 pmap_maxkvaddr = maxkvaddr; 5629 mutex_exit(&kpm->pm_lock); 5630 splx(s); 5631 5632 if (invalidate && pmap_initialized) { 5633 /* Invalidate the pmap cache. */ 5634 pool_cache_invalidate(&pmap_cache); 5635 } 5636 5637 return maxkvaddr; 5638 } 5639 5640 #ifdef DEBUG 5641 void pmap_dump(struct pmap *, vaddr_t, vaddr_t); 5642 5643 /* 5644 * pmap_dump: dump all the mappings from a pmap 5645 * 5646 * => caller should not be holding any pmap locks 5647 */ 5648 void 5649 pmap_dump(struct pmap *pmap, vaddr_t sva, vaddr_t eva) 5650 { 5651 pt_entry_t *ptes, *pte; 5652 pd_entry_t * const *pdes; 5653 struct pmap *pmap2; 5654 vaddr_t blkendva; 5655 int lvl; 5656 5657 /* 5658 * if end is out of range truncate. 5659 * if (end == start) update to max. 5660 */ 5661 5662 if (eva > VM_MAXUSER_ADDRESS || eva <= sva) 5663 eva = VM_MAXUSER_ADDRESS; 5664 5665 mutex_enter(&pmap->pm_lock); 5666 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); 5667 5668 /* 5669 * dumping a range of pages: we dump in PTP sized blocks (4MB) 5670 */ 5671 5672 for (/* null */ ; sva < eva ; sva = blkendva) { 5673 5674 /* determine range of block */ 5675 blkendva = x86_round_pdr(sva+1); 5676 if (blkendva > eva) 5677 blkendva = eva; 5678 5679 /* valid block? */ 5680 if (!pmap_pdes_valid(sva, pdes, NULL, &lvl)) 5681 continue; 5682 KASSERT(lvl == 1); 5683 5684 pte = &ptes[pl1_i(sva)]; 5685 for (/* null */; sva < blkendva ; sva += PAGE_SIZE, pte++) { 5686 if (!pmap_valid_entry(*pte)) 5687 continue; 5688 printf("va %#" PRIxVADDR " -> pa %#" PRIxPADDR 5689 " (pte=%#" PRIxPADDR ")\n", 5690 sva, (paddr_t)pmap_pte2pa(*pte), (paddr_t)*pte); 5691 } 5692 } 5693 pmap_unmap_ptes(pmap, pmap2); 5694 mutex_exit(&pmap->pm_lock); 5695 } 5696 #endif 5697 5698 /* 5699 * pmap_update: process deferred invalidations and frees. 5700 */ 5701 void 5702 pmap_update(struct pmap *pmap) 5703 { 5704 struct pmap_page *pp; 5705 struct vm_page *ptp; 5706 5707 /* 5708 * Initiate any pending TLB shootdowns. Wait for them to 5709 * complete before returning control to the caller. 5710 */ 5711 kpreempt_disable(); 5712 pmap_tlb_shootnow(); 5713 kpreempt_enable(); 5714 5715 /* 5716 * Now that shootdowns are complete, process deferred frees. 
This 5717 * is an unlocked check, but is safe as we're only interested in 5718 * work done in this LWP - we won't get a false negative. 5719 */ 5720 if (atomic_load_relaxed(&pmap->pm_gc_ptp.lh_first) == NULL) { 5721 return; 5722 } 5723 5724 mutex_enter(&pmap->pm_lock); 5725 while ((ptp = LIST_FIRST(&pmap->pm_gc_ptp)) != NULL) { 5726 KASSERT(ptp->wire_count == 0); 5727 KASSERT(ptp->uanon == NULL); 5728 LIST_REMOVE(ptp, mdpage.mp_pp.pp_link); 5729 pp = VM_PAGE_TO_PP(ptp); 5730 LIST_INIT(&pp->pp_pvlist); 5731 pp->pp_attrs = 0; 5732 pp->pp_pte.pte_ptp = NULL; 5733 pp->pp_pte.pte_va = 0; 5734 PMAP_CHECK_PP(VM_PAGE_TO_PP(ptp)); 5735 5736 /* 5737 * XXX Hack to avoid extra locking, and lock 5738 * assertions in uvm_pagefree(). Despite uobject 5739 * being set, this isn't a managed page. 5740 */ 5741 PMAP_DUMMY_LOCK(pmap); 5742 uvm_pagerealloc(ptp, NULL, 0); 5743 PMAP_DUMMY_UNLOCK(pmap); 5744 uvm_pagefree(ptp); 5745 } 5746 mutex_exit(&pmap->pm_lock); 5747 } 5748 5749 #if PTP_LEVELS > 4 5750 #error "Unsupported number of page table mappings" 5751 #endif 5752 5753 paddr_t 5754 pmap_init_tmp_pgtbl(paddr_t pg) 5755 { 5756 static bool maps_loaded; 5757 static const paddr_t x86_tmp_pml_paddr[] = { 5758 4 * PAGE_SIZE, /* L1 */ 5759 5 * PAGE_SIZE, /* L2 */ 5760 6 * PAGE_SIZE, /* L3 */ 5761 7 * PAGE_SIZE /* L4 */ 5762 }; 5763 static vaddr_t x86_tmp_pml_vaddr[] = { 0, 0, 0, 0 }; 5764 5765 pd_entry_t *tmp_pml, *kernel_pml; 5766 5767 int level; 5768 5769 if (!maps_loaded) { 5770 for (level = 0; level < PTP_LEVELS; ++level) { 5771 x86_tmp_pml_vaddr[level] = 5772 uvm_km_alloc(kernel_map, PAGE_SIZE, 0, 5773 UVM_KMF_VAONLY); 5774 5775 if (x86_tmp_pml_vaddr[level] == 0) 5776 panic("mapping of real mode PML failed\n"); 5777 pmap_kenter_pa(x86_tmp_pml_vaddr[level], 5778 x86_tmp_pml_paddr[level], 5779 VM_PROT_READ | VM_PROT_WRITE, 0); 5780 } 5781 pmap_update(pmap_kernel()); 5782 maps_loaded = true; 5783 } 5784 5785 /* Zero levels 1-3 */ 5786 for (level = 0; level < PTP_LEVELS - 1; ++level) { 5787 tmp_pml = (void *)x86_tmp_pml_vaddr[level]; 5788 memset(PAGE_ALIGNED(tmp_pml), 0, PAGE_SIZE); 5789 } 5790 5791 /* Copy PML4 */ 5792 kernel_pml = pmap_kernel()->pm_pdir; 5793 tmp_pml = (void *)x86_tmp_pml_vaddr[PTP_LEVELS - 1]; 5794 memcpy(PAGE_ALIGNED(tmp_pml), PAGE_ALIGNED(kernel_pml), PAGE_SIZE); 5795 5796 #ifdef PAE 5797 /* 5798 * Use the last 4 entries of the L2 page as L3 PD entries. These 5799 * last entries are unlikely to be used for temporary mappings. 
5800 * 508: maps 0->1GB (userland) 5801 * 509: unused 5802 * 510: unused 5803 * 511: maps 3->4GB (kernel) 5804 */ 5805 tmp_pml[508] = x86_tmp_pml_paddr[PTP_LEVELS - 1] | PTE_P; 5806 tmp_pml[509] = 0; 5807 tmp_pml[510] = 0; 5808 tmp_pml[511] = pmap_pdirpa(pmap_kernel(), PDIR_SLOT_KERN) | PTE_P; 5809 #endif 5810 5811 for (level = PTP_LEVELS - 1; level > 0; --level) { 5812 tmp_pml = (void *)x86_tmp_pml_vaddr[level]; 5813 5814 tmp_pml[pl_i(pg, level + 1)] = 5815 (x86_tmp_pml_paddr[level - 1] & PTE_FRAME) | PTE_W | PTE_P; 5816 } 5817 5818 tmp_pml = (void *)x86_tmp_pml_vaddr[0]; 5819 tmp_pml[pl_i(pg, 1)] = (pg & PTE_FRAME) | PTE_W | PTE_P; 5820 5821 #ifdef PAE 5822 /* Return the PA of the L3 page (entry 508 of the L2 page) */ 5823 return x86_tmp_pml_paddr[PTP_LEVELS - 1] + 508 * sizeof(pd_entry_t); 5824 #endif 5825 5826 return x86_tmp_pml_paddr[PTP_LEVELS - 1]; 5827 } 5828 5829 u_int 5830 x86_mmap_flags(paddr_t mdpgno) 5831 { 5832 u_int nflag = (mdpgno >> X86_MMAP_FLAG_SHIFT) & X86_MMAP_FLAG_MASK; 5833 u_int pflag = 0; 5834 5835 if (nflag & X86_MMAP_FLAG_PREFETCH) 5836 pflag |= PMAP_WRITE_COMBINE; 5837 5838 return pflag; 5839 } 5840 5841 #if defined(__HAVE_DIRECT_MAP) && defined(__x86_64__) && !defined(XENPV) 5842 5843 /* 5844 * ----------------------------------------------------------------------------- 5845 * ***************************************************************************** 5846 * ***************************************************************************** 5847 * ***************************************************************************** 5848 * ***************************************************************************** 5849 * **************** HERE BEGINS THE EPT CODE, USED BY INTEL-VMX **************** 5850 * ***************************************************************************** 5851 * ***************************************************************************** 5852 * ***************************************************************************** 5853 * ***************************************************************************** 5854 * ----------------------------------------------------------------------------- 5855 * 5856 * These functions are invoked as callbacks from the code above. Contrary to 5857 * native, EPT does not have a recursive slot; therefore, it is not possible 5858 * to call pmap_map_ptes(). Instead, we use the direct map and walk down the 5859 * tree manually. 5860 * 5861 * Apart from that, the logic is mostly the same as native. Once a pmap has 5862 * been created, NVMM calls pmap_ept_transform() to make it an EPT pmap. 5863 * After that we're good, and the callbacks will handle the translations 5864 * for us. 5865 * 5866 * ----------------------------------------------------------------------------- 5867 */ 5868 5869 /* Hardware bits. */ 5870 #define EPT_R __BIT(0) /* read */ 5871 #define EPT_W __BIT(1) /* write */ 5872 #define EPT_X __BIT(2) /* execute */ 5873 #define EPT_T __BITS(5,3) /* type */ 5874 #define TYPE_UC 0 5875 #define TYPE_WC 1 5876 #define TYPE_WT 4 5877 #define TYPE_WP 5 5878 #define TYPE_WB 6 5879 #define EPT_NOPAT __BIT(6) 5880 #define EPT_L __BIT(7) /* large */ 5881 #define EPT_A __BIT(8) /* accessed */ 5882 #define EPT_D __BIT(9) /* dirty */ 5883 /* Software bits. 
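 * (stored in high bits that the EPT hardware ignores, so the pmap is free
 * to use them for its own bookkeeping)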
*/ 5884 #define EPT_PVLIST __BIT(60) 5885 #define EPT_WIRED __BIT(61) 5886 5887 #define pmap_ept_valid_entry(pte) (pte & EPT_R) 5888 5889 bool pmap_ept_has_ad __read_mostly; 5890 5891 static inline void 5892 pmap_ept_stats_update_bypte(struct pmap *pmap, pt_entry_t npte, pt_entry_t opte) 5893 { 5894 int resid_diff = ((npte & EPT_R) ? 1 : 0) - ((opte & EPT_R) ? 1 : 0); 5895 int wired_diff = ((npte & EPT_WIRED) ? 1 : 0) - ((opte & EPT_WIRED) ? 1 : 0); 5896 5897 KASSERT((npte & (EPT_R | EPT_WIRED)) != EPT_WIRED); 5898 KASSERT((opte & (EPT_R | EPT_WIRED)) != EPT_WIRED); 5899 5900 pmap_stats_update(pmap, resid_diff, wired_diff); 5901 } 5902 5903 static pt_entry_t 5904 pmap_ept_type(u_int flags) 5905 { 5906 u_int cacheflags = (flags & PMAP_CACHE_MASK); 5907 pt_entry_t ret; 5908 5909 switch (cacheflags) { 5910 case PMAP_NOCACHE: 5911 case PMAP_NOCACHE_OVR: 5912 ret = __SHIFTIN(TYPE_UC, EPT_T); 5913 break; 5914 case PMAP_WRITE_COMBINE: 5915 ret = __SHIFTIN(TYPE_WC, EPT_T); 5916 break; 5917 case PMAP_WRITE_BACK: 5918 default: 5919 ret = __SHIFTIN(TYPE_WB, EPT_T); 5920 break; 5921 } 5922 5923 ret |= EPT_NOPAT; 5924 return ret; 5925 } 5926 5927 static inline pt_entry_t 5928 pmap_ept_prot(vm_prot_t prot) 5929 { 5930 pt_entry_t res = 0; 5931 5932 if (prot & VM_PROT_READ) 5933 res |= EPT_R; 5934 if (prot & VM_PROT_WRITE) 5935 res |= EPT_W; 5936 if (prot & VM_PROT_EXECUTE) 5937 res |= EPT_X; 5938 5939 return res; 5940 } 5941 5942 static inline uint8_t 5943 pmap_ept_to_pp_attrs(pt_entry_t ept) 5944 { 5945 uint8_t ret = 0; 5946 if (pmap_ept_has_ad) { 5947 if (ept & EPT_D) 5948 ret |= PP_ATTRS_D; 5949 if (ept & EPT_A) 5950 ret |= PP_ATTRS_A; 5951 } else { 5952 ret |= (PP_ATTRS_D|PP_ATTRS_A); 5953 } 5954 if (ept & EPT_W) 5955 ret |= PP_ATTRS_W; 5956 return ret; 5957 } 5958 5959 static inline pt_entry_t 5960 pmap_pp_attrs_to_ept(uint8_t attrs) 5961 { 5962 pt_entry_t ept = 0; 5963 if (attrs & PP_ATTRS_D) 5964 ept |= EPT_D; 5965 if (attrs & PP_ATTRS_A) 5966 ept |= EPT_A; 5967 if (attrs & PP_ATTRS_W) 5968 ept |= EPT_W; 5969 return ept; 5970 } 5971 5972 /* 5973 * Helper for pmap_ept_free_ptp. 5974 * tree[0] = &L2[L2idx] 5975 * tree[1] = &L3[L3idx] 5976 * tree[2] = &L4[L4idx] 5977 */ 5978 static void 5979 pmap_ept_get_tree(struct pmap *pmap, vaddr_t va, pd_entry_t **tree) 5980 { 5981 pt_entry_t *pteva; 5982 paddr_t ptepa; 5983 int i, index; 5984 5985 ptepa = pmap->pm_pdirpa[0]; 5986 for (i = PTP_LEVELS; i > 1; i--) { 5987 index = pl_pi(va, i); 5988 pteva = (pt_entry_t *)PMAP_DIRECT_MAP(ptepa); 5989 KASSERT(pmap_ept_valid_entry(pteva[index])); 5990 tree[i - 2] = &pteva[index]; 5991 ptepa = pmap_pte2pa(pteva[index]); 5992 } 5993 } 5994 5995 static void 5996 pmap_ept_free_ptp(struct pmap *pmap, struct vm_page *ptp, vaddr_t va) 5997 { 5998 pd_entry_t *tree[3]; 5999 int level; 6000 6001 KASSERT(pmap != pmap_kernel()); 6002 KASSERT(mutex_owned(&pmap->pm_lock)); 6003 KASSERT(kpreempt_disabled()); 6004 6005 pmap_ept_get_tree(pmap, va, tree); 6006 6007 level = 1; 6008 do { 6009 (void)pmap_pte_testset(tree[level - 1], 0); 6010 6011 pmap_freepage(pmap, ptp, level); 6012 if (level < PTP_LEVELS - 1) { 6013 ptp = pmap_find_ptp(pmap, va, level + 1); 6014 ptp->wire_count--; 6015 if (ptp->wire_count > 1) 6016 break; 6017 } 6018 } while (++level < PTP_LEVELS); 6019 pmap_pte_flush(); 6020 } 6021 6022 /* Allocate L4->L3->L2. Return L2. 
 */
*/ 6023 static void 6024 pmap_ept_install_ptp(struct pmap *pmap, struct pmap_ptparray *pt, vaddr_t va) 6025 { 6026 struct vm_page *ptp; 6027 unsigned long index; 6028 pd_entry_t *pteva; 6029 paddr_t ptepa; 6030 int i; 6031 6032 KASSERT(pmap != pmap_kernel()); 6033 KASSERT(mutex_owned(&pmap->pm_lock)); 6034 KASSERT(kpreempt_disabled()); 6035 6036 /* 6037 * Now that we have all the pages looked up or allocated, 6038 * loop through again installing any new ones into the tree. 6039 */ 6040 ptepa = pmap->pm_pdirpa[0]; 6041 for (i = PTP_LEVELS; i > 1; i--) { 6042 index = pl_pi(va, i); 6043 pteva = (pt_entry_t *)PMAP_DIRECT_MAP(ptepa); 6044 6045 if (pmap_ept_valid_entry(pteva[index])) { 6046 KASSERT(!pt->alloced[i]); 6047 ptepa = pmap_pte2pa(pteva[index]); 6048 continue; 6049 } 6050 6051 ptp = pt->pg[i]; 6052 ptp->flags &= ~PG_BUSY; /* never busy */ 6053 ptp->wire_count = 1; 6054 pmap->pm_ptphint[i - 2] = ptp; 6055 ptepa = VM_PAGE_TO_PHYS(ptp); 6056 pmap_pte_set(&pteva[index], ptepa | EPT_R | EPT_W | EPT_X); 6057 6058 pmap_pte_flush(); 6059 pmap_stats_update(pmap, 1, 0); 6060 6061 /* 6062 * If we're not in the top level, increase the 6063 * wire count of the parent page. 6064 */ 6065 if (i < PTP_LEVELS) { 6066 pt->pg[i + 1]->wire_count++; 6067 } 6068 } 6069 } 6070 6071 static int 6072 pmap_ept_enter(struct pmap *pmap, vaddr_t va, paddr_t pa, vm_prot_t prot, 6073 u_int flags) 6074 { 6075 pt_entry_t *ptes, opte, npte; 6076 pt_entry_t *ptep; 6077 struct vm_page *ptp; 6078 struct vm_page *new_pg, *old_pg; 6079 struct pmap_page *new_pp, *old_pp; 6080 struct pv_entry *old_pve, *new_pve; 6081 bool wired = (flags & PMAP_WIRED) != 0; 6082 bool accessed; 6083 struct pmap_ptparray pt; 6084 int error; 6085 bool getptp, samepage, new_embedded; 6086 rb_tree_t *tree; 6087 6088 KASSERT(pmap_initialized); 6089 KASSERT(va < VM_MAXUSER_ADDRESS); 6090 6091 npte = pa | pmap_ept_prot(prot) | pmap_ept_type(flags); 6092 6093 if (wired) 6094 npte |= EPT_WIRED; 6095 if (flags & VM_PROT_ALL) { 6096 npte |= EPT_A; 6097 if (flags & VM_PROT_WRITE) { 6098 KASSERT((npte & EPT_W) != 0); 6099 npte |= EPT_D; 6100 } 6101 } 6102 6103 new_pg = PHYS_TO_VM_PAGE(pa); 6104 if (new_pg != NULL) { 6105 /* This is a managed page */ 6106 npte |= EPT_PVLIST; 6107 new_pp = VM_PAGE_TO_PP(new_pg); 6108 } else if ((new_pp = pmap_pv_tracked(pa)) != NULL) { 6109 /* This is an unmanaged pv-tracked page */ 6110 npte |= EPT_PVLIST; 6111 } else { 6112 new_pp = NULL; 6113 } 6114 6115 /* Begin by locking the pmap. */ 6116 mutex_enter(&pmap->pm_lock); 6117 6118 /* Look up the PTP. Allocate if none present. */ 6119 ptp = NULL; 6120 getptp = false; 6121 if (pmap != pmap_kernel()) { 6122 ptp = pmap_find_ptp(pmap, va, 1); 6123 if (ptp == NULL) { 6124 getptp = true; 6125 error = pmap_get_ptp(pmap, &pt, va, flags, &ptp); 6126 if (error != 0) { 6127 if (flags & PMAP_CANFAIL) { 6128 mutex_exit(&pmap->pm_lock); 6129 return error; 6130 } 6131 panic("%s: get ptp failed, error=%d", __func__, 6132 error); 6133 } 6134 } 6135 tree = &VM_PAGE_TO_PP(ptp)->pp_rb; 6136 } else { 6137 /* Embedded PV entries rely on this. */ 6138 KASSERT(va != 0); 6139 tree = &pmap_kernel_rb; 6140 } 6141 6142 /* 6143 * Look up the old PV entry at this VA (if any), and insert a new PV 6144 * entry if required for the new mapping. Temporarily track the old 6145 * and new mappings concurrently. Only after the old mapping is 6146 * evicted from the pmap will we remove its PV entry. 
	 * Otherwise, our picture of modified/accessed state for either page
	 * could get out of sync (we need any P->V operation for either page
	 * to stall on pmap->pm_lock until done here).
	 */
	new_pve = NULL;
	old_pve = NULL;
	samepage = false;
	new_embedded = false;

	if (new_pp != NULL) {
		error = pmap_enter_pv(pmap, new_pp, ptp, va, &new_pve,
		    &old_pve, &samepage, &new_embedded, tree);

		/*
		 * If a new pv_entry was needed and none was available, we
		 * can go no further.
		 */
		if (error != 0) {
			if (flags & PMAP_CANFAIL) {
				if (getptp) {
					pmap_unget_ptp(pmap, &pt);
				}
				mutex_exit(&pmap->pm_lock);
				return error;
			}
			panic("%s: alloc pve failed", __func__);
		}
	} else {
		old_pve = pmap_treelookup_pv(pmap, ptp, tree, va);
	}

	/* Map PTEs into address space. */
	kpreempt_disable();

	/* Install any newly allocated PTPs. */
	if (getptp) {
		pmap_ept_install_ptp(pmap, &pt, va);
	}

	/* Check if there is an existing mapping. */
	ptes = (pt_entry_t *)PMAP_DIRECT_MAP(VM_PAGE_TO_PHYS(ptp));
	ptep = &ptes[pl1_pi(va)];
	opte = *ptep;
	bool have_oldpa = pmap_ept_valid_entry(opte);
	paddr_t oldpa = pmap_pte2pa(opte);

	/*
	 * Update the pte.
	 */
	do {
		opte = *ptep;

		/*
		 * if the same page, inherit EPT_A and EPT_D.
		 */
		if (((opte ^ npte) & (PTE_FRAME | EPT_R)) == 0) {
			npte |= opte & (EPT_A | EPT_D);
		}
	} while (pmap_pte_cas(ptep, opte, npte) != opte);

	/*
	 * Done with the PTEs: they can now be unmapped.
	 */
	kpreempt_enable();

	/*
	 * Update statistics and PTP's reference count.
	 */
	pmap_ept_stats_update_bypte(pmap, npte, opte);
	if (ptp != NULL) {
		if (!have_oldpa) {
			ptp->wire_count++;
		}
		/* Remember minimum VA in PTP. */
		pmap_ptp_range_set(ptp, va);
	}
	KASSERT(ptp == NULL || ptp->wire_count > 1);

	/*
	 * If the same page, we can skip pv_entry handling.
	 */
	if (((opte ^ npte) & (PTE_FRAME | EPT_R)) == 0) {
		KASSERT(((opte ^ npte) & EPT_PVLIST) == 0);
		if ((npte & EPT_PVLIST) != 0) {
			KASSERT(samepage);
			pmap_check_pv(pmap, ptp, new_pp, va, true);
		}
		goto same_pa;
	} else if ((npte & EPT_PVLIST) != 0) {
		KASSERT(!samepage);
	}

	/*
	 * If old page is pv-tracked, remove pv_entry from its list.
	 */
	if ((~opte & (EPT_R | EPT_PVLIST)) == 0) {
		if ((old_pg = PHYS_TO_VM_PAGE(oldpa)) != NULL) {
			old_pp = VM_PAGE_TO_PP(old_pg);
		} else if ((old_pp = pmap_pv_tracked(oldpa)) == NULL) {
			panic("%s: EPT_PVLIST with pv-untracked page"
			    " va = %#"PRIxVADDR
			    " pa = %#" PRIxPADDR " (%#" PRIxPADDR ")",
			    __func__, va, oldpa, atop(oldpa));
		}

		pmap_remove_pv(pmap, old_pp, ptp, va, old_pve,
		    pmap_ept_to_pp_attrs(opte));
	} else {
		KASSERT(old_pve == NULL);
		KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == NULL);
	}

	/*
	 * If new page is dynamically PV tracked, insert to tree.
	 */
	if (new_pve != NULL) {
		KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == NULL);
		old_pve = rb_tree_insert_node(tree, new_pve);
		KASSERT(old_pve == new_pve);
		pmap_check_pv(pmap, ptp, new_pp, va, true);
	}

same_pa:
	/*
	 * shootdown tlb if necessary.
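	 *
	 * A remote invalidation is only needed if the old entry may be
	 * cached in the TLB: with A/D support that means EPT_A was set,
	 * otherwise any valid (EPT_R) old entry is assumed cached. Even
	 * then it is skipped unless the frame or EPT_W changed.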
	 */
	if (pmap_ept_has_ad) {
		accessed = (~opte & (EPT_R | EPT_A)) == 0;
	} else {
		accessed = (opte & EPT_R) != 0;
	}
	if (accessed && ((opte ^ npte) & (PTE_FRAME | EPT_W)) != 0) {
		pmap_tlb_shootdown(pmap, va, 0, TLBSHOOT_ENTER);
	}
	pmap_drain_pv(pmap);
	mutex_exit(&pmap->pm_lock);
	return 0;
}

/* Pay close attention, this returns the L2 entry through *lastpde. */
static int
pmap_ept_pdes_invalid(struct pmap *pmap, vaddr_t va, pd_entry_t *lastpde)
{
	pt_entry_t *pteva;
	paddr_t ptepa;
	int i, index;

	KASSERT(mutex_owned(&pmap->pm_lock));

	ptepa = pmap->pm_pdirpa[0];
	for (i = PTP_LEVELS; i > 1; i--) {
		pteva = (pt_entry_t *)PMAP_DIRECT_MAP(ptepa);
		index = pl_pi(va, i);
		if (!pmap_ept_valid_entry(pteva[index]))
			return i;
		ptepa = pmap_pte2pa(pteva[index]);
	}
	if (lastpde != NULL) {
		*lastpde = pteva[index];
	}

	return 0;
}

static bool
pmap_ept_extract(struct pmap *pmap, vaddr_t va, paddr_t *pap)
{
	pt_entry_t *ptes, pte;
	pd_entry_t pde;
	paddr_t ptppa, pa;
	bool rv;

#ifdef __HAVE_DIRECT_MAP
	if (va >= PMAP_DIRECT_BASE && va < PMAP_DIRECT_END) {
		if (pap != NULL) {
			*pap = PMAP_DIRECT_UNMAP(va);
		}
		return true;
	}
#endif

	rv = false;
	pa = 0;

	mutex_enter(&pmap->pm_lock);
	kpreempt_disable();

	if (!pmap_ept_pdes_invalid(pmap, va, &pde)) {
		ptppa = pmap_pte2pa(pde);
		ptes = (pt_entry_t *)PMAP_DIRECT_MAP(ptppa);
		pte = ptes[pl1_pi(va)];
		if (__predict_true((pte & EPT_R) != 0)) {
			pa = pmap_pte2pa(pte) | (va & (NBPD_L1 - 1));
			rv = true;
		}
	}

	kpreempt_enable();
	mutex_exit(&pmap->pm_lock);

	if (pap != NULL) {
		*pap = pa;
	}
	return rv;
}

static bool
pmap_ept_remove_pte(struct pmap *pmap, struct vm_page *ptp, pt_entry_t *pte,
    vaddr_t va)
{
	struct pv_entry *pve;
	struct vm_page *pg;
	struct pmap_page *pp;
	pt_entry_t opte;
	bool accessed;

	KASSERT(pmap != pmap_kernel());
	KASSERT(mutex_owned(&pmap->pm_lock));
	KASSERT(kpreempt_disabled());

	if (!pmap_ept_valid_entry(*pte)) {
		/* VA not mapped. */
		return false;
	}

	/* Atomically save the old PTE and zap it. */
	opte = pmap_pte_testset(pte, 0);
	if (!pmap_ept_valid_entry(opte)) {
		return false;
	}

	pmap_ept_stats_update_bypte(pmap, 0, opte);

	if (ptp) {
		/*
		 * Dropping a PTE. Make sure that the PDE is flushed.
		 */
		ptp->wire_count--;
		if (ptp->wire_count <= 1) {
			opte |= EPT_A;
		}
	}

	if (pmap_ept_has_ad) {
		accessed = (opte & EPT_A) != 0;
	} else {
		accessed = true;
	}
	if (accessed) {
		pmap_tlb_shootdown(pmap, va, 0, TLBSHOOT_REMOVE_PTE);
	}

	/*
	 * If we are not on a pv list - we are done.
	 */
	if ((opte & EPT_PVLIST) == 0) {
		KASSERTMSG((PHYS_TO_VM_PAGE(pmap_pte2pa(opte)) == NULL),
		    "managed page without EPT_PVLIST for %#"PRIxVADDR, va);
		KASSERTMSG((pmap_pv_tracked(pmap_pte2pa(opte)) == NULL),
		    "pv-tracked page without EPT_PVLIST for %#"PRIxVADDR, va);
		KASSERT(pmap_treelookup_pv(pmap, ptp, (ptp != NULL ?
		    &VM_PAGE_TO_PP(ptp)->pp_rb : &pmap_kernel_rb), va) == NULL);
		return true;
	}

	if ((pg = PHYS_TO_VM_PAGE(pmap_pte2pa(opte))) != NULL) {
		pp = VM_PAGE_TO_PP(pg);
	} else if ((pp = pmap_pv_tracked(pmap_pte2pa(opte))) == NULL) {
		paddr_t pa = pmap_pte2pa(opte);
		panic("%s: EPT_PVLIST with pv-untracked page"
		    " va = %#"PRIxVADDR" pa = %#"PRIxPADDR" (%#"PRIxPADDR")",
		    __func__, va, pa, atop(pa));
	}

	/* Sync R/M bits. */
	pve = pmap_lookup_pv(pmap, ptp, pp, va);
	pmap_remove_pv(pmap, pp, ptp, va, pve, pmap_ept_to_pp_attrs(opte));
	return true;
}

static void
pmap_ept_remove_ptes(struct pmap *pmap, struct vm_page *ptp, vaddr_t ptpva,
    vaddr_t startva, vaddr_t endva)
{
	pt_entry_t *pte = (pt_entry_t *)ptpva;

	KASSERT(pmap != pmap_kernel());
	KASSERT(mutex_owned(&pmap->pm_lock));
	KASSERT(kpreempt_disabled());

	/*
	 * mappings are very often sparse, so clip the given range to the
	 * range of PTEs that are known present in the PTP.
	 */
	pmap_ptp_range_clip(ptp, &startva, &pte);

	/*
	 * note that ptpva points to the PTE that maps startva. this may
	 * or may not be the first PTE in the PTP.
	 *
	 * we loop through the PTP while there are still PTEs to look at
	 * and the wire_count is greater than 1 (because we use the wire_count
	 * to keep track of the number of real PTEs in the PTP).
	 */
	while (startva < endva && (ptp == NULL || ptp->wire_count > 1)) {
		(void)pmap_ept_remove_pte(pmap, ptp, pte, startva);
		startva += PAGE_SIZE;
		pte++;
	}
}

static void
pmap_ept_remove(struct pmap *pmap, vaddr_t sva, vaddr_t eva)
{
	pt_entry_t *ptes;
	pd_entry_t pde;
	paddr_t ptppa;
	vaddr_t blkendva, va = sva;
	struct vm_page *ptp;

	mutex_enter(&pmap->pm_lock);
	kpreempt_disable();

	for (/* null */ ; va < eva ; va = blkendva) {
		int lvl;

		/* determine range of block */
		blkendva = x86_round_pdr(va+1);
		if (blkendva > eva)
			blkendva = eva;

		lvl = pmap_ept_pdes_invalid(pmap, va, &pde);
		if (lvl != 0) {
			/* Skip a range corresponding to an invalid pde. */
			blkendva = (va & ptp_frames[lvl - 1]) + nbpd[lvl - 1];
			continue;
		}

		/* PA of the PTP */
		ptppa = pmap_pte2pa(pde);

		ptp = pmap_find_ptp(pmap, va, 1);
		KASSERTMSG(ptp != NULL, "%s: unmanaged PTP detected",
		    __func__);

		ptes = (pt_entry_t *)PMAP_DIRECT_MAP(ptppa);

		pmap_ept_remove_ptes(pmap, ptp, (vaddr_t)&ptes[pl1_pi(va)], va,
		    blkendva);

		/*
		 * If PTP is no longer being used, free it.
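		 * The wire count holds one reference for the PTP itself
		 * plus one per valid PTE it contains, so a count of 1 or
		 * less means no mappings remain in this block.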
		 */
		if (ptp && ptp->wire_count <= 1) {
			pmap_ept_free_ptp(pmap, ptp, va);
		}
	}

	kpreempt_enable();
	pmap_drain_pv(pmap);
	mutex_exit(&pmap->pm_lock);
}

static int
pmap_ept_sync_pv(struct vm_page *ptp, vaddr_t va, paddr_t pa, int clearbits,
    uint8_t *oattrs, pt_entry_t *optep)
{
	struct pmap *pmap;
	pt_entry_t *ptep;
	pt_entry_t opte;
	pt_entry_t npte;
	pt_entry_t expect;
	bool need_shootdown;

	expect = pmap_pa2pte(pa) | EPT_R;
	pmap = ptp_to_pmap(ptp);

	if (clearbits != ~0) {
		KASSERT((clearbits & ~(PP_ATTRS_D|PP_ATTRS_A|PP_ATTRS_W)) == 0);
		clearbits = pmap_pp_attrs_to_ept(clearbits);
	}

	ptep = pmap_map_pte(pmap, ptp, va);
	do {
		opte = *ptep;
		KASSERT((opte & (EPT_D | EPT_A)) != EPT_D);
		KASSERT((opte & (EPT_A | EPT_R)) != EPT_A);
		KASSERT(opte == 0 || (opte & EPT_R) != 0);
		if ((opte & (PTE_FRAME | EPT_R)) != expect) {
			/*
			 * We lost a race with a V->P operation like
			 * pmap_remove(). Wait for the competitor
			 * reflecting pte bits into mp_attrs.
			 */
			pmap_unmap_pte();
			return EAGAIN;
		}

		/*
		 * Check if there's anything to do on this PTE.
		 */
		if ((opte & clearbits) == 0) {
			need_shootdown = false;
			break;
		}

		/*
		 * We need a shootdown if the PTE is cached (EPT_A) ...
		 * ... Unless we are clearing only the EPT_W bit and
		 * it isn't cached as RW (EPT_D).
		 */
		if (pmap_ept_has_ad) {
			need_shootdown = (opte & EPT_A) != 0 &&
			    !(clearbits == EPT_W && (opte & EPT_D) == 0);
		} else {
			need_shootdown = true;
		}

		npte = opte & ~clearbits;

		/*
		 * If we need a shootdown anyway, clear EPT_A and EPT_D.
		 */
		if (need_shootdown) {
			npte &= ~(EPT_A | EPT_D);
		}
		KASSERT((npte & (EPT_D | EPT_A)) != EPT_D);
		KASSERT((npte & (EPT_A | EPT_R)) != EPT_A);
		KASSERT(npte == 0 || (opte & EPT_R) != 0);
	} while (pmap_pte_cas(ptep, opte, npte) != opte);

	if (need_shootdown) {
		pmap_tlb_shootdown(pmap, va, 0, TLBSHOOT_SYNC_PV);
	}
	pmap_unmap_pte();

	*oattrs = pmap_ept_to_pp_attrs(opte);
	if (optep != NULL)
		*optep = opte;
	return 0;
}

static void
pmap_ept_pp_remove_ent(struct pmap *pmap, struct vm_page *ptp, pt_entry_t opte,
    vaddr_t va)
{

	KASSERT(mutex_owned(&pmap->pm_lock));

	pmap_ept_stats_update_bypte(pmap, 0, opte);
	ptp->wire_count--;
	if (ptp->wire_count <= 1) {
		pmap_ept_free_ptp(pmap, ptp, va);
	}
}

static void
pmap_ept_write_protect(struct pmap *pmap, vaddr_t sva, vaddr_t eva, vm_prot_t prot)
{
	pt_entry_t bit_rem;
	pt_entry_t *ptes, *spte;
	pt_entry_t opte, npte;
	pd_entry_t pde;
	paddr_t ptppa;
	vaddr_t va;
	bool modified;

	bit_rem = 0;
	if (!(prot & VM_PROT_WRITE))
		bit_rem = EPT_W;

	sva &= PTE_FRAME;
	eva &= PTE_FRAME;

	/*
	 * Acquire pmap.
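	 * The pmap lock keeps the PTP tree and PV state stable while PTEs
	 * are edited through the direct map; each PTE is rewritten with a
	 * compare-and-swap loop so hardware A/D updates racing with us are
	 * not lost.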
	 */
	mutex_enter(&pmap->pm_lock);
	kpreempt_disable();

	for (va = sva; va < eva; va += PAGE_SIZE) {
		if (pmap_ept_pdes_invalid(pmap, va, &pde)) {
			continue;
		}

		ptppa = pmap_pte2pa(pde);
		ptes = (pt_entry_t *)PMAP_DIRECT_MAP(ptppa);
		spte = &ptes[pl1_pi(va)];

		do {
			opte = *spte;
			if (!pmap_ept_valid_entry(opte)) {
				goto next;
			}
			npte = (opte & ~bit_rem);
		} while (pmap_pte_cas(spte, opte, npte) != opte);

		if (pmap_ept_has_ad) {
			modified = (opte & EPT_D) != 0;
		} else {
			modified = true;
		}
		if (modified) {
			vaddr_t tva = x86_ptob(spte - ptes);
			pmap_tlb_shootdown(pmap, tva, 0,
			    TLBSHOOT_WRITE_PROTECT);
		}
next:;
	}

	kpreempt_enable();
	mutex_exit(&pmap->pm_lock);
}

static void
pmap_ept_unwire(struct pmap *pmap, vaddr_t va)
{
	pt_entry_t *ptes, *ptep, opte;
	pd_entry_t pde;
	paddr_t ptppa;

	/* Acquire pmap. */
	mutex_enter(&pmap->pm_lock);
	kpreempt_disable();

	if (pmap_ept_pdes_invalid(pmap, va, &pde)) {
		panic("%s: invalid PDE va=%#" PRIxVADDR, __func__, va);
	}

	ptppa = pmap_pte2pa(pde);
	ptes = (pt_entry_t *)PMAP_DIRECT_MAP(ptppa);
	ptep = &ptes[pl1_pi(va)];
	opte = *ptep;
	KASSERT(pmap_ept_valid_entry(opte));

	if (opte & EPT_WIRED) {
		pt_entry_t npte = opte & ~EPT_WIRED;

		opte = pmap_pte_testset(ptep, npte);
		pmap_ept_stats_update_bypte(pmap, npte, opte);
	} else {
		printf("%s: wiring for pmap %p va %#" PRIxVADDR
		    " did not change!\n", __func__, pmap, va);
	}

	/* Release pmap. */
	kpreempt_enable();
	mutex_exit(&pmap->pm_lock);
}

/* -------------------------------------------------------------------------- */

void
pmap_ept_transform(struct pmap *pmap)
{
	pmap->pm_enter = pmap_ept_enter;
	pmap->pm_extract = pmap_ept_extract;
	pmap->pm_remove = pmap_ept_remove;
	pmap->pm_sync_pv = pmap_ept_sync_pv;
	pmap->pm_pp_remove_ent = pmap_ept_pp_remove_ent;
	pmap->pm_write_protect = pmap_ept_write_protect;
	pmap->pm_unwire = pmap_ept_unwire;

	memset(PAGE_ALIGNED(pmap->pm_pdir), 0, PAGE_SIZE);
}

#endif /* __HAVE_DIRECT_MAP && __x86_64__ && !XENPV */
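/*
 * Usage sketch (an illustrative assumption, not part of this file): a
 * hypervisor backend that manages guest physical memory through UVM, such
 * as NVMM's Intel VMX code, could switch a freshly created guest pmap over
 * to the EPT format along these lines:
 *
 *	struct pmap *pmap = mach->vm->vm_map.pmap;	// guest physical map
 *
 *	pmap_ept_has_ad = has_ept_ad_bits;	// hypothetical flag derived
 *						// from the EPT capability MSR
 *	pmap_ept_transform(pmap);		// install the pmap_ept_* hooks
 *
 * From then on pmap_enter(), pmap_remove() and related operations on that
 * pmap are dispatched to the pmap_ept_* routines above instead of the
 * native x86 PTE code.
 */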