1 /* $NetBSD: pmap.c,v 1.410 2021/04/17 18:03:21 bouyer Exp $ */ 2 3 /* 4 * Copyright (c) 2008, 2010, 2016, 2017, 2019, 2020 The NetBSD Foundation, Inc. 5 * All rights reserved. 6 * 7 * This code is derived from software contributed to The NetBSD Foundation 8 * by Andrew Doran, and by Maxime Villard. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 29 * POSSIBILITY OF SUCH DAMAGE. 30 */ 31 32 /* 33 * Copyright (c) 2007 Manuel Bouyer. 34 * 35 * Redistribution and use in source and binary forms, with or without 36 * modification, are permitted provided that the following conditions 37 * are met: 38 * 1. Redistributions of source code must retain the above copyright 39 * notice, this list of conditions and the following disclaimer. 40 * 2. Redistributions in binary form must reproduce the above copyright 41 * notice, this list of conditions and the following disclaimer in the 42 * documentation and/or other materials provided with the distribution. 43 * 44 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 45 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 46 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 47 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 48 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 49 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 50 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 51 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 52 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 53 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 54 */ 55 56 /* 57 * Copyright (c) 2006 Mathieu Ropert <mro@adviseo.fr> 58 * 59 * Permission to use, copy, modify, and distribute this software for any 60 * purpose with or without fee is hereby granted, provided that the above 61 * copyright notice and this permission notice appear in all copies. 62 * 63 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 64 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 65 * MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 66 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 67 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 68 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 69 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 70 */ 71 72 /* 73 * Copyright 2001 (c) Wasabi Systems, Inc. 74 * All rights reserved. 75 * 76 * Written by Frank van der Linden for Wasabi Systems, Inc. 77 * 78 * Redistribution and use in source and binary forms, with or without 79 * modification, are permitted provided that the following conditions 80 * are met: 81 * 1. Redistributions of source code must retain the above copyright 82 * notice, this list of conditions and the following disclaimer. 83 * 2. Redistributions in binary form must reproduce the above copyright 84 * notice, this list of conditions and the following disclaimer in the 85 * documentation and/or other materials provided with the distribution. 86 * 3. All advertising materials mentioning features or use of this software 87 * must display the following acknowledgement: 88 * This product includes software developed for the NetBSD Project by 89 * Wasabi Systems, Inc. 90 * 4. The name of Wasabi Systems, Inc. may not be used to endorse 91 * or promote products derived from this software without specific prior 92 * written permission. 93 * 94 * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND 95 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 96 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 97 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL WASABI SYSTEMS, INC 98 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 99 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 100 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 101 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 102 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 103 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 104 * POSSIBILITY OF SUCH DAMAGE. 105 */ 106 107 /* 108 * Copyright (c) 1997 Charles D. Cranor and Washington University. 109 * All rights reserved. 110 * 111 * Redistribution and use in source and binary forms, with or without 112 * modification, are permitted provided that the following conditions 113 * are met: 114 * 1. Redistributions of source code must retain the above copyright 115 * notice, this list of conditions and the following disclaimer. 116 * 2. Redistributions in binary form must reproduce the above copyright 117 * notice, this list of conditions and the following disclaimer in the 118 * documentation and/or other materials provided with the distribution. 119 * 120 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 121 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 122 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
123 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 124 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 125 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 126 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 127 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 128 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 129 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 130 */ 131 132 #include <sys/cdefs.h> 133 __KERNEL_RCSID(0, "$NetBSD: pmap.c,v 1.410 2021/04/17 18:03:21 bouyer Exp $"); 134 135 #include "opt_user_ldt.h" 136 #include "opt_lockdebug.h" 137 #include "opt_multiprocessor.h" 138 #include "opt_xen.h" 139 #include "opt_svs.h" 140 #include "opt_kaslr.h" 141 142 #define __MUTEX_PRIVATE /* for assertions */ 143 144 #include <sys/param.h> 145 #include <sys/systm.h> 146 #include <sys/proc.h> 147 #include <sys/pool.h> 148 #include <sys/kernel.h> 149 #include <sys/atomic.h> 150 #include <sys/cpu.h> 151 #include <sys/intr.h> 152 #include <sys/xcall.h> 153 #include <sys/kcore.h> 154 #include <sys/kmem.h> 155 #include <sys/asan.h> 156 #include <sys/msan.h> 157 #include <sys/entropy.h> 158 159 #include <uvm/uvm.h> 160 #include <uvm/pmap/pmap_pvt.h> 161 162 #include <dev/isa/isareg.h> 163 164 #include <machine/specialreg.h> 165 #include <machine/gdt.h> 166 #include <machine/isa_machdep.h> 167 #include <machine/cpuvar.h> 168 #include <machine/cputypes.h> 169 170 #include <x86/pmap_pv.h> 171 172 #include <x86/i82489reg.h> 173 #include <x86/i82489var.h> 174 175 #ifdef XEN 176 #include <xen/include/public/xen.h> 177 #include <xen/hypervisor.h> 178 #include <xen/xenpmap.h> 179 #endif 180 181 /* 182 * general info: 183 * 184 * - for an explanation of how the x86 MMU hardware works see 185 * the comments in <machine/pte.h>. 186 * 187 * - for an explanation of the general memory structure used by 188 * this pmap (including the recursive mapping), see the comments 189 * in <machine/pmap.h>. 190 * 191 * this file contains the code for the "pmap module." the module's 192 * job is to manage the hardware's virtual to physical address mappings. 193 * note that there are two levels of mapping in the VM system: 194 * 195 * [1] the upper layer of the VM system uses vm_map's and vm_map_entry's 196 * to map ranges of virtual address space to objects/files. for 197 * example, the vm_map may say: "map VA 0x1000 to 0x22000 read-only 198 * to the file /bin/ls starting at offset zero." note that 199 * the upper layer mapping is not concerned with how individual 200 * vm_pages are mapped. 201 * 202 * [2] the lower layer of the VM system (the pmap) maintains the mappings 203 * from virtual addresses. it is concerned with which vm_page is 204 * mapped where. for example, when you run /bin/ls and start 205 * at page 0x1000 the fault routine may lookup the correct page 206 * of the /bin/ls file and then ask the pmap layer to establish 207 * a mapping for it. 208 * 209 * note that information in the lower layer of the VM system can be 210 * thrown away since it can easily be reconstructed from the info 211 * in the upper layer. 
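 *
 * as a rough illustration of the split (not the literal call chain, and
 * map/va/pg are placeholders here): the fault handler resolves the
 * vm_page at layer [1] and then asks this module to install the
 * hardware mapping at layer [2], e.g.:
 *
 *	error = pmap_enter(vm_map_pmap(map), va, VM_PAGE_TO_PHYS(pg),
 *	    VM_PROT_READ, PMAP_CANFAIL);
 *	...
 *	pmap_update(vm_map_pmap(map));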
212 *
213 * data structures we use include:
214 *
215 * - struct pmap: describes the address space of one thread
216 * - struct pmap_page: describes one pv-tracked page, without
217 * necessarily a corresponding vm_page
218 * - struct pv_entry: describes one <PMAP,VA> mapping of a PA
219 * - pmap_page::pp_pvlist: there is one list per pv-tracked page of
220 * physical memory. the pp_pvlist points to a list of pv_entry
221 * structures which describe all the <PMAP,VA> pairs that this
222 * page is mapped in. this is critical for page based operations
223 * such as pmap_page_protect() [change protection on _all_ mappings
224 * of a page]
225 */
226
227 /*
228 * Locking
229 *
230 * We have the following locks that we must deal with, listed in the order
231 * that they are acquired:
232 *
233 * pg->uobject->vmobjlock, pg->uanon->an_lock
234 *
235 * For managed pages, these per-object locks are taken by the VM system
236 * before calling into the pmap module - either a read or write hold.
237 * The lock hold prevents pages from changing identity while the pmap is
238 * operating on them. For example, the same lock is held across a call
239 * to pmap_remove() and the following call to pmap_update(), so that a
240 * page does not gain a new identity while its TLB visibility is stale.
241 *
242 * pmap->pm_lock
243 *
244 * This lock protects the fields in the pmap structure including the
245 * non-kernel PDEs in the PDP, the PTEs, and PTPs and connected data
246 * structures. For modifying unmanaged kernel PTEs it is not needed as
247 * kernel PDEs are never freed, and the kernel is expected to be self
248 * consistent (and the lock can't be taken for unmanaged kernel PTEs,
249 * because they can be modified from interrupt context).
250 *
251 * pmaps_lock
252 *
253 * This lock protects the list of active pmaps (headed by "pmaps").
254 * It's acquired when adding or removing pmaps or adjusting kernel PDEs.
255 *
256 * pp_lock
257 *
258 * This per-page lock protects PV entry lists and the embedded PV entry
259 * in each vm_page, allowing for concurrent operation on pages by
260 * different pmaps. This is a spin mutex at IPL_VM, because at the
261 * points it is taken context switching is usually not tolerable, and
262 * spin mutexes must block out interrupts that could take kernel_lock.
263 */
264
265 /* uvm_object is abused here to index pmap_pages; make assertions happy. */
266 #ifdef DIAGNOSTIC
267 #define PMAP_DUMMY_LOCK(pm) rw_enter(&(pm)->pm_dummy_lock, RW_WRITER)
268 #define PMAP_DUMMY_UNLOCK(pm) rw_exit(&(pm)->pm_dummy_lock)
269 #else
270 #define PMAP_DUMMY_LOCK(pm)
271 #define PMAP_DUMMY_UNLOCK(pm)
272 #endif
273
274 static const struct uvm_pagerops pmap_pager = {
275 /* nothing */
276 };
277
278 const vaddr_t ptp_masks[] = PTP_MASK_INITIALIZER;
279 const vaddr_t ptp_frames[] = PTP_FRAME_INITIALIZER;
280 const int ptp_shifts[] = PTP_SHIFT_INITIALIZER;
281 const long nkptpmax[] = NKPTPMAX_INITIALIZER;
282 const long nbpd[] = NBPD_INITIALIZER;
283 #ifdef i386
284 pd_entry_t * const normal_pdes[] = PDES_INITIALIZER;
285 #else
286 pd_entry_t *normal_pdes[3];
287 #endif
288
289 long nkptp[] = NKPTP_INITIALIZER;
290
291 struct pmap_head pmaps;
292 kmutex_t pmaps_lock __cacheline_aligned;
293
294 struct pcpu_area *pcpuarea __read_mostly;
295
296 static vaddr_t pmap_maxkvaddr;
297
298 /*
299 * Misc. event counters.
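 * pmap_iobmp_evcnt counts I/O bitmap copies and pmap_ldt_evcnt counts
 * LDT syncs; both are attached in pmap_init().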
300 */ 301 struct evcnt pmap_iobmp_evcnt; 302 struct evcnt pmap_ldt_evcnt; 303 304 /* 305 * PAT 306 */ 307 static bool cpu_pat_enabled __read_mostly = false; 308 309 /* 310 * Global data structures 311 */ 312 313 static struct pmap kernel_pmap_store __cacheline_aligned; /* kernel's pmap */ 314 struct pmap *const kernel_pmap_ptr = &kernel_pmap_store; 315 static rb_tree_t pmap_kernel_rb __cacheline_aligned; 316 317 struct bootspace bootspace __read_mostly; 318 struct slotspace slotspace __read_mostly; 319 320 /* Set to PTE_NX if supported. */ 321 pd_entry_t pmap_pg_nx __read_mostly = 0; 322 323 /* Set to PTE_G if supported. */ 324 pd_entry_t pmap_pg_g __read_mostly = 0; 325 326 /* Set to true if large pages are supported. */ 327 int pmap_largepages __read_mostly = 0; 328 329 paddr_t lowmem_rsvd __read_mostly; 330 paddr_t avail_start __read_mostly; /* PA of first available physical page */ 331 paddr_t avail_end __read_mostly; /* PA of last available physical page */ 332 333 #ifdef XENPV 334 paddr_t pmap_pa_start; /* PA of first physical page for this domain */ 335 paddr_t pmap_pa_end; /* PA of last physical page for this domain */ 336 #endif 337 338 #define VM_PAGE_TO_PP(pg) (&(pg)->mdpage.mp_pp) 339 #define PMAP_CHECK_PP(pp) \ 340 KASSERTMSG((pp)->pp_lock.mtx_ipl._ipl == IPL_VM, "bad pmap_page %p", pp) 341 342 #define PAGE_ALIGNED(pp) \ 343 __builtin_assume_aligned((void *)(pp), PAGE_SIZE) 344 345 /* 346 * Other data structures 347 */ 348 349 static pt_entry_t protection_codes[8] __read_mostly; 350 351 static bool pmap_initialized __read_mostly = false; /* pmap_init done yet? */ 352 353 /* 354 * The following two vaddr_t's are used during system startup to keep track of 355 * how much of the kernel's VM space we have used. Once the system is started, 356 * the management of the remaining kernel VM space is turned over to the 357 * kernel_map vm_map. 358 */ 359 static vaddr_t virtual_avail __read_mostly; /* VA of first free KVA */ 360 static vaddr_t virtual_end __read_mostly; /* VA of last free KVA */ 361 362 #ifndef XENPV 363 /* 364 * LAPIC virtual address, and fake physical address. 
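 * (a dummy physical page is allocated in pmap_init_lapic() so that
 * local_apic_va is always backed, even on CPUs with no LAPIC; if a
 * LAPIC is present the VA is remapped later in lapic_map.)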
365 */ 366 volatile vaddr_t local_apic_va __read_mostly; 367 paddr_t local_apic_pa __read_mostly; 368 #endif 369 370 /* 371 * pool that pmap structures are allocated from 372 */ 373 struct pool_cache pmap_cache; 374 static int pmap_ctor(void *, void *, int); 375 static void pmap_dtor(void *, void *); 376 377 /* 378 * pv_page cache 379 */ 380 static struct pool_cache pmap_pvp_cache; 381 382 #ifdef __HAVE_DIRECT_MAP 383 vaddr_t pmap_direct_base __read_mostly; 384 vaddr_t pmap_direct_end __read_mostly; 385 #endif 386 387 #ifndef __HAVE_DIRECT_MAP 388 /* 389 * Special VAs and the PTEs that map them 390 */ 391 static pt_entry_t *early_zero_pte; 392 static void pmap_vpage_cpualloc(struct cpu_info *); 393 #ifdef XENPV 394 char *early_zerop; /* also referenced from xen_locore() */ 395 #else 396 static char *early_zerop; 397 #endif 398 #endif 399 400 int pmap_enter_default(pmap_t, vaddr_t, paddr_t, vm_prot_t, u_int); 401 402 /* PDP pool and its callbacks */ 403 static struct pool pmap_pdp_pool; 404 static void pmap_pdp_init(pd_entry_t *); 405 static void pmap_pdp_fini(pd_entry_t *); 406 407 #ifdef PAE 408 /* need to allocate items of 4 pages */ 409 static void *pmap_pdp_alloc(struct pool *, int); 410 static void pmap_pdp_free(struct pool *, void *); 411 static struct pool_allocator pmap_pdp_allocator = { 412 .pa_alloc = pmap_pdp_alloc, 413 .pa_free = pmap_pdp_free, 414 .pa_pagesz = PAGE_SIZE * PDP_SIZE, 415 }; 416 #endif 417 418 extern vaddr_t idt_vaddr; 419 extern paddr_t idt_paddr; 420 extern vaddr_t gdt_vaddr; 421 extern paddr_t gdt_paddr; 422 extern vaddr_t ldt_vaddr; 423 extern paddr_t ldt_paddr; 424 425 #ifdef i386 426 /* stuff to fix the pentium f00f bug */ 427 extern vaddr_t pentium_idt_vaddr; 428 #endif 429 430 /* Array of freshly allocated PTPs, for pmap_get_ptp(). */ 431 struct pmap_ptparray { 432 struct vm_page *pg[PTP_LEVELS + 1]; 433 bool alloced[PTP_LEVELS + 1]; 434 }; 435 436 /* 437 * PV entries are allocated in page-sized chunks and cached per-pmap to 438 * avoid intense pressure on memory allocators. 
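 * each chunk is a struct pv_page occupying exactly one page: the header
 * overlays the first pv_entry-sized slot (pmap_pvp_ctor() asserts that
 * sizeof(struct pv_page) <= sizeof(struct pv_entry)), which is why
 * PVE_PER_PVP below is PAGE_SIZE / sizeof(struct pv_entry) - 1.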
439 */
440
441 struct pv_page {
442 LIST_HEAD(, pv_entry) pvp_pves;
443 LIST_ENTRY(pv_page) pvp_list;
444 long pvp_nfree;
445 struct pmap *pvp_pmap;
446 };
447
448 #define PVE_PER_PVP ((PAGE_SIZE / sizeof(struct pv_entry)) - 1)
449
450 /*
451 * PV tree prototypes
452 */
453
454 static int pmap_compare_key(void *, const void *, const void *);
455 static int pmap_compare_nodes(void *, const void *, const void *);
456
457 /* Red-black tree */
458 static const rb_tree_ops_t pmap_rbtree_ops = {
459 .rbto_compare_nodes = pmap_compare_nodes,
460 .rbto_compare_key = pmap_compare_key,
461 .rbto_node_offset = offsetof(struct pv_entry, pve_rb),
462 .rbto_context = NULL
463 };
464
465 /*
466 * Local prototypes
467 */
468
469 #ifdef __HAVE_PCPU_AREA
470 static void pmap_init_pcpu(void);
471 #endif
472 #ifdef __HAVE_DIRECT_MAP
473 static void pmap_init_directmap(struct pmap *);
474 #endif
475 #if !defined(XENPV)
476 static void pmap_remap_global(void);
477 #endif
478 #ifndef XENPV
479 static void pmap_init_lapic(void);
480 static void pmap_remap_largepages(void);
481 #endif
482
483 static int pmap_get_ptp(struct pmap *, struct pmap_ptparray *, vaddr_t, int,
484 struct vm_page **);
485 static void pmap_unget_ptp(struct pmap *, struct pmap_ptparray *);
486 static void pmap_install_ptp(struct pmap *, struct pmap_ptparray *, vaddr_t,
487 pd_entry_t * const *);
488 static struct vm_page *pmap_find_ptp(struct pmap *, vaddr_t, int);
489 static void pmap_freepage(struct pmap *, struct vm_page *, int);
490 static void pmap_free_ptp(struct pmap *, struct vm_page *, vaddr_t,
491 pt_entry_t *, pd_entry_t * const *);
492 static bool pmap_remove_pte(struct pmap *, struct vm_page *, pt_entry_t *,
493 vaddr_t);
494 static void pmap_remove_ptes(struct pmap *, struct vm_page *, vaddr_t, vaddr_t,
495 vaddr_t);
496 static int pmap_pvp_ctor(void *, void *, int);
497 static void pmap_pvp_dtor(void *, void *);
498 static struct pv_entry *pmap_alloc_pv(struct pmap *);
499 static void pmap_free_pv(struct pmap *, struct pv_entry *);
500 static void pmap_drain_pv(struct pmap *);
501
502 static void pmap_alloc_level(struct pmap *, vaddr_t, long *);
503
504 static void pmap_load1(struct lwp *, struct pmap *, struct pmap *);
505 static void pmap_reactivate(struct pmap *);
506
507 /*
508 * p m a p h e l p e r f u n c t i o n s
509 */
510
511 static inline void
512 pmap_stats_update(struct pmap *pmap, int resid_diff, int wired_diff)
513 {
514
515 KASSERT(cold || mutex_owned(&pmap->pm_lock));
516 pmap->pm_stats.resident_count += resid_diff;
517 pmap->pm_stats.wired_count += wired_diff;
518 }
519
520 static inline void
521 pmap_stats_update_bypte(struct pmap *pmap, pt_entry_t npte, pt_entry_t opte)
522 {
523 int resid_diff = ((npte & PTE_P) ? 1 : 0) - ((opte & PTE_P) ? 1 : 0);
524 int wired_diff = ((npte & PTE_WIRED) ? 1 : 0) - ((opte & PTE_WIRED) ?
1 : 0); 525 526 KASSERT((npte & (PTE_P | PTE_WIRED)) != PTE_WIRED); 527 KASSERT((opte & (PTE_P | PTE_WIRED)) != PTE_WIRED); 528 529 pmap_stats_update(pmap, resid_diff, wired_diff); 530 } 531 532 /* 533 * ptp_to_pmap: lookup pmap by ptp 534 */ 535 static inline struct pmap * 536 ptp_to_pmap(struct vm_page *ptp) 537 { 538 struct pmap *pmap; 539 540 if (ptp == NULL) { 541 return pmap_kernel(); 542 } 543 pmap = (struct pmap *)ptp->uobject; 544 KASSERT(pmap != NULL); 545 KASSERT(&pmap->pm_obj[0] == ptp->uobject); 546 return pmap; 547 } 548 549 static inline struct pv_pte * 550 pve_to_pvpte(struct pv_entry *pve) 551 { 552 553 if (pve == NULL) 554 return NULL; 555 KASSERT((void *)&pve->pve_pte == (void *)pve); 556 return &pve->pve_pte; 557 } 558 559 static inline struct pv_entry * 560 pvpte_to_pve(struct pv_pte *pvpte) 561 { 562 struct pv_entry *pve = (void *)pvpte; 563 564 KASSERT(pve_to_pvpte(pve) == pvpte); 565 return pve; 566 } 567 568 /* 569 * Return true if the pmap page has an embedded PV entry. 570 */ 571 static inline bool 572 pv_pte_embedded(struct pmap_page *pp) 573 { 574 575 KASSERT(mutex_owned(&pp->pp_lock)); 576 return (bool)((vaddr_t)pp->pp_pte.pte_ptp | pp->pp_pte.pte_va); 577 } 578 579 /* 580 * pv_pte_first, pv_pte_next: PV list iterator. 581 */ 582 static inline struct pv_pte * 583 pv_pte_first(struct pmap_page *pp) 584 { 585 586 KASSERT(mutex_owned(&pp->pp_lock)); 587 if (pv_pte_embedded(pp)) { 588 return &pp->pp_pte; 589 } 590 return pve_to_pvpte(LIST_FIRST(&pp->pp_pvlist)); 591 } 592 593 static inline struct pv_pte * 594 pv_pte_next(struct pmap_page *pp, struct pv_pte *pvpte) 595 { 596 597 KASSERT(mutex_owned(&pp->pp_lock)); 598 KASSERT(pvpte != NULL); 599 if (pvpte == &pp->pp_pte) { 600 return pve_to_pvpte(LIST_FIRST(&pp->pp_pvlist)); 601 } 602 return pve_to_pvpte(LIST_NEXT(pvpte_to_pve(pvpte), pve_list)); 603 } 604 605 static inline uint8_t 606 pmap_pte_to_pp_attrs(pt_entry_t pte) 607 { 608 uint8_t ret = 0; 609 if (pte & PTE_D) 610 ret |= PP_ATTRS_D; 611 if (pte & PTE_A) 612 ret |= PP_ATTRS_A; 613 if (pte & PTE_W) 614 ret |= PP_ATTRS_W; 615 return ret; 616 } 617 618 static inline pt_entry_t 619 pmap_pp_attrs_to_pte(uint8_t attrs) 620 { 621 pt_entry_t pte = 0; 622 if (attrs & PP_ATTRS_D) 623 pte |= PTE_D; 624 if (attrs & PP_ATTRS_A) 625 pte |= PTE_A; 626 if (attrs & PP_ATTRS_W) 627 pte |= PTE_W; 628 return pte; 629 } 630 631 /* 632 * pmap_is_curpmap: is this pmap the one currently loaded [in %cr3]? 633 * of course the kernel is always loaded 634 */ 635 bool 636 pmap_is_curpmap(struct pmap *pmap) 637 { 638 return ((pmap == pmap_kernel()) || (pmap == curcpu()->ci_pmap)); 639 } 640 641 inline void 642 pmap_reference(struct pmap *pmap) 643 { 644 645 atomic_inc_uint(&pmap->pm_obj[0].uo_refs); 646 } 647 648 /* 649 * rbtree: compare two nodes. 650 */ 651 static int 652 pmap_compare_nodes(void *context, const void *n1, const void *n2) 653 { 654 const struct pv_entry *pve1 = n1; 655 const struct pv_entry *pve2 = n2; 656 657 KASSERT(pve1->pve_pte.pte_ptp == pve2->pve_pte.pte_ptp); 658 659 if (pve1->pve_pte.pte_va < pve2->pve_pte.pte_va) { 660 return -1; 661 } 662 if (pve1->pve_pte.pte_va > pve2->pve_pte.pte_va) { 663 return 1; 664 } 665 return 0; 666 } 667 668 /* 669 * rbtree: compare a node and a key. 
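 * the key is a virtual address: entries in a given tree are ordered
 * purely by pve_pte.pte_va (pmap_compare_nodes() above asserts that
 * they all belong to the same PTP).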
670 */
671 static int
672 pmap_compare_key(void *context, const void *n, const void *k)
673 {
674 const struct pv_entry *pve = n;
675 const vaddr_t key = (vaddr_t)k;
676
677 if (pve->pve_pte.pte_va < key) {
678 return -1;
679 }
680 if (pve->pve_pte.pte_va > key) {
681 return 1;
682 }
683 return 0;
684 }
685
686 /*
687 * pmap_ptp_range_set: abuse ptp->uanon to record minimum VA of PTE
688 */
689 static inline void
690 pmap_ptp_range_set(struct vm_page *ptp, vaddr_t va)
691 {
692 vaddr_t *min = (vaddr_t *)&ptp->uanon;
693
694 if (va < *min) {
695 *min = va;
696 }
697 }
698
699 /*
700 * pmap_ptp_range_clip: abuse ptp->uanon to clip range of PTEs to remove
701 */
702 static inline void
703 pmap_ptp_range_clip(struct vm_page *ptp, vaddr_t *startva, pt_entry_t **pte)
704 {
705 vaddr_t sclip;
706
707 if (ptp == NULL) {
708 return;
709 }
710
711 sclip = (vaddr_t)ptp->uanon;
712 sclip = (*startva < sclip ? sclip : *startva);
713 *pte += (sclip - *startva) / PAGE_SIZE;
714 *startva = sclip;
715 }
716
717 /*
718 * pmap_map_ptes: map a pmap's PTEs into KVM and lock them in
719 *
720 * there are several pmaps involved. some or all of them might be the same.
721 *
722 * - the pmap given by the first argument
723 * our caller wants to access this pmap's PTEs.
724 *
725 * - pmap_kernel()
726 * the kernel pmap. note that it only contains the kernel part
727 * of the address space which is shared by any pmap. i.e. any
728 * pmap can be used instead of pmap_kernel() for our purpose.
729 *
730 * - ci->ci_pmap
731 * pmap currently loaded on the cpu.
732 *
733 * - vm_map_pmap(&curproc->p_vmspace->vm_map)
734 * current process' pmap.
735 *
736 * => caller must lock pmap first (if not the kernel pmap)
737 * => must be undone with pmap_unmap_ptes before returning
738 * => disables kernel preemption
739 */
740 void
741 pmap_map_ptes(struct pmap *pmap, struct pmap **pmap2, pd_entry_t **ptepp,
742 pd_entry_t * const **pdeppp)
743 {
744 struct pmap *curpmap;
745 struct cpu_info *ci;
746 lwp_t *l;
747
748 kpreempt_disable();
749
750 /* The kernel's pmap is always accessible. */
751 if (pmap == pmap_kernel()) {
752 *pmap2 = NULL;
753 *ptepp = PTE_BASE;
754 *pdeppp = normal_pdes;
755 return;
756 }
757
758 KASSERT(mutex_owned(&pmap->pm_lock));
759
760 l = curlwp;
761 ci = l->l_cpu;
762 curpmap = ci->ci_pmap;
763 if (pmap == curpmap) {
764 /*
765 * Already on the CPU: make it valid. This is very
766 * often the case during exit(), when we have switched
767 * to the kernel pmap in order to destroy a user pmap.
768 */
769 if (__predict_false(ci->ci_tlbstate != TLBSTATE_VALID)) {
770 pmap_reactivate(pmap);
771 }
772 *pmap2 = NULL;
773 } else {
774 /*
775 * Toss current pmap from CPU and install new pmap, but keep
776 * a reference to the old one. Dropping the reference can
777 * block as it needs to take locks, so defer that to
778 * pmap_unmap_ptes().
779 */
780 pmap_reference(pmap);
781 pmap_load1(l, pmap, curpmap);
782 *pmap2 = curpmap;
783 }
784 KASSERT(ci->ci_tlbstate == TLBSTATE_VALID);
785 #ifdef DIAGNOSTIC
786 pmap->pm_ncsw = lwp_pctr();
787 #endif
788 *ptepp = PTE_BASE;
789
790 #if defined(XENPV) && defined(__x86_64__)
791 KASSERT(ci->ci_normal_pdes[PTP_LEVELS - 2] == L4_BASE);
792 ci->ci_normal_pdes[PTP_LEVELS - 2] = pmap->pm_pdir;
793 *pdeppp = ci->ci_normal_pdes;
794 #else
795 *pdeppp = normal_pdes;
796 #endif
797 }
798
799 /*
800 * pmap_unmap_ptes: unlock the PTE mapping of "pmap"
801 *
802 * => we cannot tolerate context switches while mapped in: assert this.
803 * => reenables kernel preemption.
804 * => does not unlock pmap. 805 */ 806 void 807 pmap_unmap_ptes(struct pmap *pmap, struct pmap * pmap2) 808 { 809 struct cpu_info *ci; 810 struct pmap *mypmap; 811 struct lwp *l; 812 813 KASSERT(kpreempt_disabled()); 814 815 /* The kernel's pmap is always accessible. */ 816 if (pmap == pmap_kernel()) { 817 kpreempt_enable(); 818 return; 819 } 820 821 l = curlwp; 822 ci = l->l_cpu; 823 824 KASSERT(mutex_owned(&pmap->pm_lock)); 825 KASSERT(pmap->pm_ncsw == lwp_pctr()); 826 827 #if defined(XENPV) && defined(__x86_64__) 828 KASSERT(ci->ci_normal_pdes[PTP_LEVELS - 2] != L4_BASE); 829 ci->ci_normal_pdes[PTP_LEVELS - 2] = L4_BASE; 830 #endif 831 832 /* If not our own pmap, mark whatever's on the CPU now as lazy. */ 833 KASSERT(ci->ci_tlbstate == TLBSTATE_VALID); 834 mypmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map); 835 if (ci->ci_pmap == vm_map_pmap(&l->l_proc->p_vmspace->vm_map)) { 836 ci->ci_want_pmapload = 0; 837 } else { 838 ci->ci_want_pmapload = (mypmap != pmap_kernel()); 839 ci->ci_tlbstate = TLBSTATE_LAZY; 840 } 841 842 /* Now safe to re-enable preemption. */ 843 kpreempt_enable(); 844 845 /* Toss reference to other pmap taken earlier. */ 846 if (pmap2 != NULL) { 847 pmap_destroy(pmap2); 848 } 849 } 850 851 inline static void 852 pmap_exec_account(struct pmap *pm, vaddr_t va, pt_entry_t opte, pt_entry_t npte) 853 { 854 855 #if !defined(__x86_64__) 856 if (curproc == NULL || curproc->p_vmspace == NULL || 857 pm != vm_map_pmap(&curproc->p_vmspace->vm_map)) 858 return; 859 860 if ((opte ^ npte) & PTE_X) 861 pmap_update_pg(va); 862 863 /* 864 * Executability was removed on the last executable change. 865 * Reset the code segment to something conservative and 866 * let the trap handler deal with setting the right limit. 867 * We can't do that because of locking constraints on the vm map. 868 */ 869 870 if ((opte & PTE_X) && (npte & PTE_X) == 0 && va == pm->pm_hiexec) { 871 struct trapframe *tf = curlwp->l_md.md_regs; 872 873 tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL); 874 pm->pm_hiexec = I386_MAX_EXE_ADDR; 875 } 876 #endif /* !defined(__x86_64__) */ 877 } 878 879 #if !defined(__x86_64__) 880 /* 881 * Fixup the code segment to cover all potential executable mappings. 882 * returns 0 if no changes to the code segment were made. 883 */ 884 int 885 pmap_exec_fixup(struct vm_map *map, struct trapframe *tf, struct pcb *pcb) 886 { 887 struct vm_map_entry *ent; 888 struct pmap *pm = vm_map_pmap(map); 889 vaddr_t va = 0; 890 891 vm_map_lock_read(map); 892 for (ent = (&map->header)->next; ent != &map->header; ent = ent->next) { 893 /* 894 * This entry has greater va than the entries before. 895 * We need to make it point to the last page, not past it. 896 */ 897 if (ent->protection & VM_PROT_EXECUTE) 898 va = trunc_page(ent->end) - PAGE_SIZE; 899 } 900 vm_map_unlock_read(map); 901 if (va == pm->pm_hiexec && tf->tf_cs == GSEL(GUCODEBIG_SEL, SEL_UPL)) 902 return 0; 903 904 pm->pm_hiexec = va; 905 if (pm->pm_hiexec > I386_MAX_EXE_ADDR) { 906 tf->tf_cs = GSEL(GUCODEBIG_SEL, SEL_UPL); 907 } else { 908 tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL); 909 return 0; 910 } 911 return 1; 912 } 913 #endif /* !defined(__x86_64__) */ 914 915 void 916 pat_init(struct cpu_info *ci) 917 { 918 #ifndef XENPV 919 uint64_t pat; 920 921 if (!(ci->ci_feat_val[0] & CPUID_PAT)) 922 return; 923 924 /* We change WT to WC. Leave all other entries the default values. 
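 * Each PAT MSR entry is selected by a PTE's PAT/PCD/PWT bits, and the
 * power-on default is WB/WT/UC-/UC repeated twice; rewriting the two
 * WT slots as WC is what lets pmap_pat_flags() hand out PGC_WC.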
*/ 925 pat = PATENTRY(0, PAT_WB) | PATENTRY(1, PAT_WC) | 926 PATENTRY(2, PAT_UCMINUS) | PATENTRY(3, PAT_UC) | 927 PATENTRY(4, PAT_WB) | PATENTRY(5, PAT_WC) | 928 PATENTRY(6, PAT_UCMINUS) | PATENTRY(7, PAT_UC); 929 930 wrmsr(MSR_CR_PAT, pat); 931 cpu_pat_enabled = true; 932 #endif 933 } 934 935 static pt_entry_t 936 pmap_pat_flags(u_int flags) 937 { 938 u_int cacheflags = (flags & PMAP_CACHE_MASK); 939 940 if (!cpu_pat_enabled) { 941 switch (cacheflags) { 942 case PMAP_NOCACHE: 943 case PMAP_NOCACHE_OVR: 944 /* results in PGC_UCMINUS on cpus which have 945 * the cpuid PAT but PAT "disabled" 946 */ 947 return PTE_PCD; 948 default: 949 return 0; 950 } 951 } 952 953 switch (cacheflags) { 954 case PMAP_NOCACHE: 955 return PGC_UC; 956 case PMAP_WRITE_COMBINE: 957 return PGC_WC; 958 case PMAP_WRITE_BACK: 959 return PGC_WB; 960 case PMAP_NOCACHE_OVR: 961 return PGC_UCMINUS; 962 } 963 964 return 0; 965 } 966 967 /* 968 * p m a p k e n t e r f u n c t i o n s 969 * 970 * functions to quickly enter/remove pages from the kernel address 971 * space. pmap_kremove is exported to MI kernel. we make use of 972 * the recursive PTE mappings. 973 */ 974 975 /* 976 * pmap_kenter_pa: enter a kernel mapping without R/M (pv_entry) tracking 977 * 978 * => no need to lock anything, assume va is already allocated 979 * => should be faster than normal pmap enter function 980 */ 981 void 982 pmap_kenter_pa(vaddr_t va, paddr_t pa, vm_prot_t prot, u_int flags) 983 { 984 pt_entry_t *pte, opte, npte; 985 986 KASSERT(!(prot & ~VM_PROT_ALL)); 987 988 if (va < VM_MIN_KERNEL_ADDRESS) 989 pte = vtopte(va); 990 else 991 pte = kvtopte(va); 992 #if defined(XENPV) && defined(DOM0OPS) 993 if (pa < pmap_pa_start || pa >= pmap_pa_end) { 994 #ifdef DEBUG 995 printf_nolog("%s: pa %#" PRIxPADDR " for va %#" PRIxVADDR 996 " outside range\n", __func__, pa, va); 997 #endif /* DEBUG */ 998 npte = pa; 999 } else 1000 #endif /* XENPV && DOM0OPS */ 1001 npte = pmap_pa2pte(pa); 1002 npte |= protection_codes[prot] | PTE_P | pmap_pg_g; 1003 npte |= pmap_pat_flags(flags); 1004 opte = pmap_pte_testset(pte, npte); /* zap! */ 1005 1006 /* 1007 * XXX: make sure we are not dealing with a large page, since the only 1008 * large pages created are for the kernel image, and they should never 1009 * be kentered. 1010 */ 1011 KASSERTMSG(!(opte & PTE_PS), "PTE_PS va=%#"PRIxVADDR, va); 1012 1013 if ((opte & (PTE_P | PTE_A)) == (PTE_P | PTE_A)) { 1014 /* This should not happen. */ 1015 printf_nolog("%s: mapping already present\n", __func__); 1016 kpreempt_disable(); 1017 pmap_tlb_shootdown(pmap_kernel(), va, opte, TLBSHOOT_KENTER); 1018 kpreempt_enable(); 1019 } 1020 } 1021 1022 __strict_weak_alias(pmap_kenter_ma, pmap_kenter_pa); 1023 1024 #if defined(__x86_64__) 1025 /* 1026 * Change protection for a virtual address. Local for a CPU only, don't 1027 * care about TLB shootdowns. 
1028 * 1029 * => must be called with preemption disabled 1030 */ 1031 void 1032 pmap_changeprot_local(vaddr_t va, vm_prot_t prot) 1033 { 1034 pt_entry_t *pte, opte, npte; 1035 1036 KASSERT(kpreempt_disabled()); 1037 1038 if (va < VM_MIN_KERNEL_ADDRESS) 1039 pte = vtopte(va); 1040 else 1041 pte = kvtopte(va); 1042 1043 npte = opte = *pte; 1044 1045 if ((prot & VM_PROT_WRITE) != 0) 1046 npte |= PTE_W; 1047 else 1048 npte &= ~(PTE_W|PTE_D); 1049 1050 if (opte != npte) { 1051 pmap_pte_set(pte, npte); 1052 pmap_pte_flush(); 1053 invlpg(va); 1054 } 1055 } 1056 #endif /* defined(__x86_64__) */ 1057 1058 /* 1059 * pmap_kremove: remove a kernel mapping(s) without R/M (pv_entry) tracking 1060 * 1061 * => no need to lock anything 1062 * => caller must dispose of any vm_page mapped in the va range 1063 * => note: not an inline function 1064 * => we assume the va is page aligned and the len is a multiple of PAGE_SIZE 1065 * => we assume kernel only unmaps valid addresses and thus don't bother 1066 * checking the valid bit before doing TLB flushing 1067 * => must be followed by call to pmap_update() before reuse of page 1068 */ 1069 static void 1070 pmap_kremove1(vaddr_t sva, vsize_t len, bool localonly) 1071 { 1072 pt_entry_t *pte, opte; 1073 vaddr_t va, eva; 1074 1075 eva = sva + len; 1076 1077 kpreempt_disable(); 1078 for (va = sva; va < eva; va += PAGE_SIZE) { 1079 pte = kvtopte(va); 1080 opte = pmap_pte_testset(pte, 0); /* zap! */ 1081 if ((opte & (PTE_P | PTE_A)) == (PTE_P | PTE_A) && !localonly) { 1082 pmap_tlb_shootdown(pmap_kernel(), va, opte, 1083 TLBSHOOT_KREMOVE); 1084 } 1085 KASSERTMSG((opte & PTE_PS) == 0, 1086 "va %#" PRIxVADDR " is a large page", va); 1087 KASSERTMSG((opte & PTE_PVLIST) == 0, 1088 "va %#" PRIxVADDR " is a pv tracked page", va); 1089 } 1090 if (localonly) { 1091 tlbflushg(); 1092 } 1093 kpreempt_enable(); 1094 } 1095 1096 void 1097 pmap_kremove(vaddr_t sva, vsize_t len) 1098 { 1099 1100 pmap_kremove1(sva, len, false); 1101 } 1102 1103 /* 1104 * pmap_kremove_local: like pmap_kremove(), but only worry about 1105 * TLB invalidations on the current CPU. this is only intended 1106 * for use while writing kernel crash dumps, either after panic 1107 * or via reboot -d. 1108 */ 1109 void 1110 pmap_kremove_local(vaddr_t sva, vsize_t len) 1111 { 1112 1113 pmap_kremove1(sva, len, true); 1114 } 1115 1116 /* 1117 * p m a p i n i t f u n c t i o n s 1118 * 1119 * pmap_bootstrap and pmap_init are called during system startup 1120 * to init the pmap module. pmap_bootstrap() does a low level 1121 * init just to get things rolling. pmap_init() finishes the job. 1122 */ 1123 1124 /* 1125 * pmap_bootstrap_valloc: allocate a virtual address in the bootstrap area. 1126 * This function is to be used before any VM system has been set up. 1127 * 1128 * The va is taken from virtual_avail. 1129 */ 1130 static vaddr_t 1131 pmap_bootstrap_valloc(size_t npages) 1132 { 1133 vaddr_t va = virtual_avail; 1134 virtual_avail += npages * PAGE_SIZE; 1135 return va; 1136 } 1137 1138 /* 1139 * pmap_bootstrap_palloc: allocate a physical address in the bootstrap area. 1140 * This function is to be used before any VM system has been set up. 1141 * 1142 * The pa is taken from avail_start. 1143 */ 1144 static paddr_t 1145 pmap_bootstrap_palloc(size_t npages) 1146 { 1147 paddr_t pa = avail_start; 1148 avail_start += npages * PAGE_SIZE; 1149 return pa; 1150 } 1151 1152 /* 1153 * pmap_bootstrap: get the system in a state where it can run with VM properly 1154 * enabled (called before main()). 
The VM system is fully init'd later. 1155 * 1156 * => on i386, locore.S has already enabled the MMU by allocating a PDP for the 1157 * kernel, and nkpde PTP's for the kernel. 1158 * => kva_start is the first free virtual address in kernel space. 1159 */ 1160 void 1161 pmap_bootstrap(vaddr_t kva_start) 1162 { 1163 struct pmap *kpm; 1164 int i; 1165 vaddr_t kva; 1166 1167 pmap_pg_nx = (cpu_feature[2] & CPUID_NOX ? PTE_NX : 0); 1168 1169 /* 1170 * Set up our local static global vars that keep track of the usage of 1171 * KVM before kernel_map is set up. 1172 */ 1173 virtual_avail = kva_start; /* first free KVA */ 1174 virtual_end = VM_MAX_KERNEL_ADDRESS; /* last KVA */ 1175 1176 /* 1177 * Set up protection_codes: we need to be able to convert from a MI 1178 * protection code (some combo of VM_PROT...) to something we can jam 1179 * into a x86 PTE. 1180 */ 1181 protection_codes[VM_PROT_NONE] = pmap_pg_nx; 1182 protection_codes[VM_PROT_EXECUTE] = PTE_X; 1183 protection_codes[VM_PROT_READ] = pmap_pg_nx; 1184 protection_codes[VM_PROT_READ|VM_PROT_EXECUTE] = PTE_X; 1185 protection_codes[VM_PROT_WRITE] = PTE_W | pmap_pg_nx; 1186 protection_codes[VM_PROT_WRITE|VM_PROT_EXECUTE] = PTE_W | PTE_X; 1187 protection_codes[VM_PROT_WRITE|VM_PROT_READ] = PTE_W | pmap_pg_nx; 1188 protection_codes[VM_PROT_ALL] = PTE_W | PTE_X; 1189 1190 /* 1191 * Now we init the kernel's pmap. 1192 * 1193 * The kernel pmap's pm_obj is not used for much. However, in user pmaps 1194 * the pm_obj contains the list of active PTPs. 1195 */ 1196 kpm = pmap_kernel(); 1197 mutex_init(&kpm->pm_lock, MUTEX_DEFAULT, IPL_NONE); 1198 rw_init(&kpm->pm_dummy_lock); 1199 for (i = 0; i < PTP_LEVELS - 1; i++) { 1200 uvm_obj_init(&kpm->pm_obj[i], &pmap_pager, false, 1); 1201 uvm_obj_setlock(&kpm->pm_obj[i], &kpm->pm_dummy_lock); 1202 kpm->pm_ptphint[i] = NULL; 1203 } 1204 memset(&kpm->pm_list, 0, sizeof(kpm->pm_list)); /* pm_list not used */ 1205 1206 kpm->pm_pdir = (pd_entry_t *)bootspace.pdir; 1207 for (i = 0; i < PDP_SIZE; i++) 1208 kpm->pm_pdirpa[i] = PDPpaddr + PAGE_SIZE * i; 1209 1210 kpm->pm_stats.wired_count = kpm->pm_stats.resident_count = 1211 x86_btop(kva_start - VM_MIN_KERNEL_ADDRESS); 1212 1213 kcpuset_create(&kpm->pm_cpus, true); 1214 kcpuset_create(&kpm->pm_kernel_cpus, true); 1215 1216 kpm->pm_ldt = NULL; 1217 kpm->pm_ldt_sel = GSYSSEL(GLDT_SEL, SEL_KPL); 1218 1219 /* 1220 * the above is just a rough estimate and not critical to the proper 1221 * operation of the system. 1222 */ 1223 1224 #if !defined(XENPV) 1225 /* 1226 * Begin to enable global TLB entries if they are supported: add PTE_G 1227 * attribute to already mapped kernel pages. Do that only if SVS is 1228 * disabled. 1229 * 1230 * The G bit has no effect until the CR4_PGE bit is set in CR4, which 1231 * happens later in cpu_init(). 1232 */ 1233 #ifdef SVS 1234 if (!svs_enabled && (cpu_feature[0] & CPUID_PGE)) { 1235 #else 1236 if (cpu_feature[0] & CPUID_PGE) { 1237 #endif 1238 pmap_pg_g = PTE_G; 1239 pmap_remap_global(); 1240 } 1241 #endif 1242 1243 #ifndef XENPV 1244 /* 1245 * Enable large pages if they are supported. 1246 */ 1247 if (cpu_feature[0] & CPUID_PSE) { 1248 lcr4(rcr4() | CR4_PSE); /* enable hardware (via %cr4) */ 1249 pmap_largepages = 1; /* enable software */ 1250 1251 /* 1252 * The TLB must be flushed after enabling large pages on Pentium 1253 * CPUs, according to section 3.6.2.2 of "Intel Architecture 1254 * Software Developer's Manual, Volume 3: System Programming". 1255 */ 1256 tlbflushg(); 1257 1258 /* Remap the kernel. 
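Covers the kernel text, rodata and data segments, using 2MB pages.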
*/ 1259 pmap_remap_largepages(); 1260 } 1261 pmap_init_lapic(); 1262 #endif /* !XENPV */ 1263 1264 #ifdef __HAVE_PCPU_AREA 1265 pmap_init_pcpu(); 1266 #endif 1267 1268 #ifdef __HAVE_DIRECT_MAP 1269 pmap_init_directmap(kpm); 1270 #else 1271 pmap_vpage_cpualloc(&cpu_info_primary); 1272 1273 if (VM_MIN_KERNEL_ADDRESS == KERNBASE) { /* i386 */ 1274 early_zerop = (void *)cpu_info_primary.vpage[VPAGE_ZER]; 1275 early_zero_pte = cpu_info_primary.vpage_pte[VPAGE_ZER]; 1276 } else { /* amd64 */ 1277 /* 1278 * zero_pte is stuck at the end of mapped space for the kernel 1279 * image (disjunct from kva space). This is done so that it 1280 * can safely be used in pmap_growkernel (pmap_get_physpage), 1281 * when it's called for the first time. 1282 * XXXfvdl fix this for MULTIPROCESSOR later. 1283 */ 1284 #ifdef XENPV 1285 /* early_zerop initialized in xen_locore() */ 1286 #else 1287 early_zerop = (void *)bootspace.spareva; 1288 #endif 1289 early_zero_pte = PTE_BASE + pl1_i((vaddr_t)early_zerop); 1290 } 1291 #endif 1292 1293 #if defined(XENPV) && defined(__x86_64__) 1294 extern vaddr_t xen_dummy_page; 1295 paddr_t xen_dummy_user_pgd; 1296 1297 /* 1298 * We want a dummy page directory for Xen: when deactivating a pmap, 1299 * Xen will still consider it active. So we set user PGD to this one 1300 * to lift all protection on the now inactive page tables set. 1301 */ 1302 xen_dummy_user_pgd = xen_dummy_page - KERNBASE; 1303 1304 /* Zero fill it, the less checks in Xen it requires the better */ 1305 memset(PAGE_ALIGNED(xen_dummy_user_pgd + KERNBASE), 0, PAGE_SIZE); 1306 /* Mark read-only */ 1307 HYPERVISOR_update_va_mapping(xen_dummy_user_pgd + KERNBASE, 1308 pmap_pa2pte(xen_dummy_user_pgd) | PTE_P | pmap_pg_nx, 1309 UVMF_INVLPG); 1310 /* Pin as L4 */ 1311 xpq_queue_pin_l4_table(xpmap_ptom_masked(xen_dummy_user_pgd)); 1312 #endif 1313 1314 /* 1315 * Allocate space for the IDT, GDT and LDT. 1316 */ 1317 idt_vaddr = pmap_bootstrap_valloc(1); 1318 idt_paddr = pmap_bootstrap_palloc(1); 1319 1320 gdt_vaddr = pmap_bootstrap_valloc(1); 1321 gdt_paddr = pmap_bootstrap_palloc(1); 1322 1323 #ifdef __HAVE_PCPU_AREA 1324 ldt_vaddr = (vaddr_t)&pcpuarea->ldt; 1325 #else 1326 ldt_vaddr = pmap_bootstrap_valloc(1); 1327 #endif 1328 ldt_paddr = pmap_bootstrap_palloc(1); 1329 1330 #if !defined(__x86_64__) 1331 /* pentium f00f bug stuff */ 1332 pentium_idt_vaddr = pmap_bootstrap_valloc(1); 1333 #endif 1334 1335 #if defined(XENPVHVM) 1336 /* XXX: move to hypervisor.c with appropriate API adjustments */ 1337 extern paddr_t HYPERVISOR_shared_info_pa; 1338 extern volatile struct xencons_interface *xencons_interface; /* XXX */ 1339 extern struct xenstore_domain_interface *xenstore_interface; /* XXX */ 1340 1341 if (vm_guest != VM_GUEST_XENPVH) { 1342 HYPERVISOR_shared_info = (void *) pmap_bootstrap_valloc(1); 1343 HYPERVISOR_shared_info_pa = pmap_bootstrap_palloc(1); 1344 } 1345 xencons_interface = (void *) pmap_bootstrap_valloc(1); 1346 xenstore_interface = (void *) pmap_bootstrap_valloc(1); 1347 #endif 1348 /* 1349 * Now we reserve some VM for mapping pages when doing a crash dump. 1350 */ 1351 virtual_avail = reserve_dumppages(virtual_avail); 1352 1353 /* 1354 * Init the global lock and global list. 1355 */ 1356 mutex_init(&pmaps_lock, MUTEX_DEFAULT, IPL_NONE); 1357 LIST_INIT(&pmaps); 1358 1359 /* 1360 * Ensure the TLB is sync'd with reality by flushing it... 1361 */ 1362 tlbflushg(); 1363 1364 /* 1365 * Calculate pmap_maxkvaddr from nkptp[]. 
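 * i.e. pmap_maxkvaddr = VM_MIN_KERNEL_ADDRESS +
 *      sum(nkptp[i] * nbpd[i]) for i = 1 .. PTP_LEVELS - 1,
 * which is what the loop below computes.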
1366 */ 1367 kva = VM_MIN_KERNEL_ADDRESS; 1368 for (i = PTP_LEVELS - 1; i >= 1; i--) { 1369 kva += nkptp[i] * nbpd[i]; 1370 } 1371 pmap_maxkvaddr = kva; 1372 } 1373 1374 #ifndef XENPV 1375 static void 1376 pmap_init_lapic(void) 1377 { 1378 /* 1379 * On CPUs that have no LAPIC, local_apic_va is never kentered. But our 1380 * x86 implementation relies a lot on this address to be valid; so just 1381 * allocate a fake physical page that will be kentered into 1382 * local_apic_va by machdep. 1383 * 1384 * If the LAPIC is present, the va will be remapped somewhere else 1385 * later in lapic_map. 1386 */ 1387 local_apic_va = pmap_bootstrap_valloc(1); 1388 local_apic_pa = pmap_bootstrap_palloc(1); 1389 } 1390 #endif 1391 1392 #ifdef __x86_64__ 1393 static size_t 1394 pmap_pagetree_nentries_range(vaddr_t startva, vaddr_t endva, size_t pgsz) 1395 { 1396 size_t npages; 1397 npages = (roundup(endva, pgsz) / pgsz) - 1398 (rounddown(startva, pgsz) / pgsz); 1399 return npages; 1400 } 1401 #endif 1402 1403 #if defined(__HAVE_DIRECT_MAP) || defined(KASAN) || defined(KMSAN) 1404 static inline void 1405 slotspace_copy(int type, pd_entry_t *dst, pd_entry_t *src) 1406 { 1407 size_t sslot = slotspace.area[type].sslot; 1408 size_t nslot = slotspace.area[type].nslot; 1409 1410 memcpy(&dst[sslot], &src[sslot], nslot * sizeof(pd_entry_t)); 1411 } 1412 #endif 1413 1414 #ifdef __x86_64__ 1415 /* 1416 * Randomize the location of an area. We count the holes in the VM space. We 1417 * randomly select one hole, and then randomly select an area within that hole. 1418 * Finally we update the associated entry in the slotspace structure. 1419 */ 1420 vaddr_t 1421 slotspace_rand(int type, size_t sz, size_t align, size_t randhole, 1422 vaddr_t randva) 1423 { 1424 struct { 1425 int start; 1426 int end; 1427 } holes[SLSPACE_NAREAS+1]; 1428 size_t i, nholes, hole; 1429 size_t startsl, endsl, nslots, winsize; 1430 vaddr_t startva, va; 1431 1432 sz = roundup(sz, align); 1433 1434 /* 1435 * Take one more slot with +NBPD_L4, because we may end up choosing 1436 * an area that crosses slots: 1437 * +------+------+------+ 1438 * | Slot | Slot | Slot | 1439 * +------+------+------+ 1440 * [Chosen Area] 1441 * And in that case we must take into account the additional slot 1442 * consumed. 1443 */ 1444 nslots = roundup(sz+NBPD_L4, NBPD_L4) / NBPD_L4; 1445 1446 /* Get the holes. */ 1447 nholes = 0; 1448 size_t curslot = 0 + 256; /* end of SLAREA_USER */ 1449 while (1) { 1450 /* 1451 * Find the first occupied slot after the current one. 1452 * The area between the two is a hole. 1453 */ 1454 size_t minsslot = 512; 1455 size_t minnslot = 0; 1456 for (i = 0; i < SLSPACE_NAREAS; i++) { 1457 if (!slotspace.area[i].active) 1458 continue; 1459 if (slotspace.area[i].sslot >= curslot && 1460 slotspace.area[i].sslot < minsslot) { 1461 minsslot = slotspace.area[i].sslot; 1462 minnslot = slotspace.area[i].nslot; 1463 } 1464 } 1465 1466 /* No hole anymore, stop here. */ 1467 if (minsslot == 512) { 1468 break; 1469 } 1470 1471 /* Register the hole. */ 1472 if (minsslot - curslot >= nslots) { 1473 holes[nholes].start = curslot; 1474 holes[nholes].end = minsslot; 1475 nholes++; 1476 } 1477 1478 /* Skip that hole, and iterate again. */ 1479 curslot = minsslot + minnslot; 1480 } 1481 1482 if (nholes == 0) { 1483 panic("%s: impossible", __func__); 1484 } 1485 1486 /* Select a hole. 
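The hole index comes from the caller-supplied randhole, reduced modulo the number of holes; NO_X86_ASLR forces hole 0.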
*/ 1487 hole = randhole; 1488 #ifdef NO_X86_ASLR 1489 hole = 0; 1490 #endif 1491 hole %= nholes; 1492 startsl = holes[hole].start; 1493 endsl = holes[hole].end; 1494 startva = VA_SIGN_NEG(startsl * NBPD_L4); 1495 1496 /* Select an area within the hole. */ 1497 va = randva; 1498 #ifdef NO_X86_ASLR 1499 va = 0; 1500 #endif 1501 winsize = ((endsl - startsl) * NBPD_L4) - sz; 1502 va %= winsize; 1503 va = rounddown(va, align); 1504 va += startva; 1505 1506 /* Update the entry. */ 1507 slotspace.area[type].sslot = pl4_i(va); 1508 slotspace.area[type].nslot = 1509 pmap_pagetree_nentries_range(va, va+sz, NBPD_L4); 1510 slotspace.area[type].active = true; 1511 1512 return va; 1513 } 1514 #endif 1515 1516 #ifdef __HAVE_PCPU_AREA 1517 static void 1518 pmap_init_pcpu(void) 1519 { 1520 const vaddr_t startva = PMAP_PCPU_BASE; 1521 size_t nL4e, nL3e, nL2e, nL1e; 1522 size_t L4e_idx, L3e_idx, L2e_idx, L1e_idx __diagused; 1523 paddr_t pa; 1524 vaddr_t endva; 1525 vaddr_t tmpva; 1526 pt_entry_t *pte; 1527 size_t size; 1528 int i; 1529 1530 const pd_entry_t pteflags = PTE_P | PTE_W | pmap_pg_nx; 1531 1532 size = sizeof(struct pcpu_area); 1533 1534 endva = startva + size; 1535 1536 /* We will use this temporary va. */ 1537 tmpva = bootspace.spareva; 1538 pte = PTE_BASE + pl1_i(tmpva); 1539 1540 /* Build L4 */ 1541 L4e_idx = pl4_i(startva); 1542 nL4e = pmap_pagetree_nentries_range(startva, endva, NBPD_L4); 1543 KASSERT(nL4e == 1); 1544 for (i = 0; i < nL4e; i++) { 1545 KASSERT(L4_BASE[L4e_idx+i] == 0); 1546 1547 pa = pmap_bootstrap_palloc(1); 1548 *pte = (pa & PTE_FRAME) | pteflags; 1549 pmap_update_pg(tmpva); 1550 memset(PAGE_ALIGNED(tmpva), 0, PAGE_SIZE); 1551 1552 L4_BASE[L4e_idx+i] = pa | pteflags | PTE_A; 1553 } 1554 1555 /* Build L3 */ 1556 L3e_idx = pl3_i(startva); 1557 nL3e = pmap_pagetree_nentries_range(startva, endva, NBPD_L3); 1558 for (i = 0; i < nL3e; i++) { 1559 KASSERT(L3_BASE[L3e_idx+i] == 0); 1560 1561 pa = pmap_bootstrap_palloc(1); 1562 *pte = (pa & PTE_FRAME) | pteflags; 1563 pmap_update_pg(tmpva); 1564 memset(PAGE_ALIGNED(tmpva), 0, PAGE_SIZE); 1565 1566 L3_BASE[L3e_idx+i] = pa | pteflags | PTE_A; 1567 } 1568 1569 /* Build L2 */ 1570 L2e_idx = pl2_i(startva); 1571 nL2e = pmap_pagetree_nentries_range(startva, endva, NBPD_L2); 1572 for (i = 0; i < nL2e; i++) { 1573 1574 KASSERT(L2_BASE[L2e_idx+i] == 0); 1575 1576 pa = pmap_bootstrap_palloc(1); 1577 *pte = (pa & PTE_FRAME) | pteflags; 1578 pmap_update_pg(tmpva); 1579 memset(PAGE_ALIGNED(tmpva), 0, PAGE_SIZE); 1580 1581 L2_BASE[L2e_idx+i] = pa | pteflags | PTE_A; 1582 } 1583 1584 /* Build L1 */ 1585 L1e_idx = pl1_i(startva); 1586 nL1e = pmap_pagetree_nentries_range(startva, endva, NBPD_L1); 1587 for (i = 0; i < nL1e; i++) { 1588 /* 1589 * Nothing to do, the PTEs will be entered via 1590 * pmap_kenter_pa. 1591 */ 1592 KASSERT(L1_BASE[L1e_idx+i] == 0); 1593 } 1594 1595 *pte = 0; 1596 pmap_update_pg(tmpva); 1597 1598 pcpuarea = (struct pcpu_area *)startva; 1599 1600 tlbflush(); 1601 } 1602 #endif 1603 1604 #ifdef __HAVE_DIRECT_MAP 1605 /* 1606 * Create the amd64 direct map. Called only once at boot time. We map all of 1607 * the physical memory contiguously using 2MB large pages, with RW permissions. 1608 * However there is a hole: the kernel is mapped with RO permissions. 
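 * The RO hole is the range [roundup(bootspace.head.pa, NBPD_L2),
 * rounddown(bootspace.boot.pa, NBPD_L2)), i.e. the spahole/epahole
 * bounds computed below, and the base VA of the whole map is
 * randomized with slotspace_rand().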
1609 */ 1610 static void 1611 pmap_init_directmap(struct pmap *kpm) 1612 { 1613 extern phys_ram_seg_t mem_clusters[]; 1614 extern int mem_cluster_cnt; 1615 1616 vaddr_t startva; 1617 size_t nL4e, nL3e, nL2e; 1618 size_t L4e_idx, L3e_idx, L2e_idx; 1619 size_t spahole, epahole; 1620 paddr_t lastpa, pa; 1621 vaddr_t endva; 1622 vaddr_t tmpva; 1623 pt_entry_t *pte; 1624 phys_ram_seg_t *mc; 1625 int i; 1626 size_t randhole; 1627 vaddr_t randva; 1628 1629 const pd_entry_t pteflags = PTE_P | PTE_W | pmap_pg_nx; 1630 const pd_entry_t holepteflags = PTE_P | pmap_pg_nx; 1631 1632 CTASSERT(NL4_SLOT_DIRECT * NBPD_L4 == MAXPHYSMEM); 1633 1634 spahole = roundup(bootspace.head.pa, NBPD_L2); 1635 epahole = rounddown(bootspace.boot.pa, NBPD_L2); 1636 1637 /* Get the last physical address available */ 1638 lastpa = 0; 1639 for (i = 0; i < mem_cluster_cnt; i++) { 1640 mc = &mem_clusters[i]; 1641 lastpa = MAX(lastpa, mc->start + mc->size); 1642 } 1643 1644 /* 1645 * x86_add_cluster should have truncated the memory to MAXPHYSMEM. 1646 */ 1647 if (lastpa > MAXPHYSMEM) { 1648 panic("pmap_init_directmap: lastpa incorrect"); 1649 } 1650 1651 entropy_extract(&randhole, sizeof randhole, 0); 1652 entropy_extract(&randva, sizeof randva, 0); 1653 startva = slotspace_rand(SLAREA_DMAP, lastpa, NBPD_L2, 1654 randhole, randva); 1655 endva = startva + lastpa; 1656 1657 /* We will use this temporary va. */ 1658 tmpva = bootspace.spareva; 1659 pte = PTE_BASE + pl1_i(tmpva); 1660 1661 /* Build L4 */ 1662 L4e_idx = pl4_i(startva); 1663 nL4e = pmap_pagetree_nentries_range(startva, endva, NBPD_L4); 1664 KASSERT(nL4e <= NL4_SLOT_DIRECT); 1665 for (i = 0; i < nL4e; i++) { 1666 KASSERT(L4_BASE[L4e_idx+i] == 0); 1667 1668 pa = pmap_bootstrap_palloc(1); 1669 *pte = (pa & PTE_FRAME) | pteflags; 1670 pmap_update_pg(tmpva); 1671 memset(PAGE_ALIGNED(tmpva), 0, PAGE_SIZE); 1672 1673 L4_BASE[L4e_idx+i] = pa | pteflags | PTE_A; 1674 } 1675 1676 /* Build L3 */ 1677 L3e_idx = pl3_i(startva); 1678 nL3e = pmap_pagetree_nentries_range(startva, endva, NBPD_L3); 1679 for (i = 0; i < nL3e; i++) { 1680 KASSERT(L3_BASE[L3e_idx+i] == 0); 1681 1682 pa = pmap_bootstrap_palloc(1); 1683 *pte = (pa & PTE_FRAME) | pteflags; 1684 pmap_update_pg(tmpva); 1685 memset(PAGE_ALIGNED(tmpva), 0, PAGE_SIZE); 1686 1687 L3_BASE[L3e_idx+i] = pa | pteflags | PTE_A; 1688 } 1689 1690 /* Build L2 */ 1691 L2e_idx = pl2_i(startva); 1692 nL2e = pmap_pagetree_nentries_range(startva, endva, NBPD_L2); 1693 for (i = 0; i < nL2e; i++) { 1694 KASSERT(L2_BASE[L2e_idx+i] == 0); 1695 1696 pa = (paddr_t)(i * NBPD_L2); 1697 1698 if (spahole <= pa && pa < epahole) { 1699 L2_BASE[L2e_idx+i] = pa | holepteflags | PTE_A | 1700 PTE_PS | pmap_pg_g; 1701 } else { 1702 L2_BASE[L2e_idx+i] = pa | pteflags | PTE_A | 1703 PTE_PS | pmap_pg_g; 1704 } 1705 } 1706 1707 *pte = 0; 1708 pmap_update_pg(tmpva); 1709 1710 pmap_direct_base = startva; 1711 pmap_direct_end = endva; 1712 1713 tlbflush(); 1714 } 1715 #endif /* __HAVE_DIRECT_MAP */ 1716 1717 #if !defined(XENPV) 1718 /* 1719 * Remap all of the virtual pages created so far with the PTE_G bit. 
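 * PTE_G makes these translations global, so they survive %cr3 reloads;
 * the bit has no effect until CR4_PGE is enabled later in cpu_init(),
 * and pmap_bootstrap() skips this step entirely when SVS is enabled.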
1720 */ 1721 static void 1722 pmap_remap_global(void) 1723 { 1724 vaddr_t kva, kva_end; 1725 unsigned long p1i; 1726 size_t i; 1727 1728 /* head */ 1729 kva = bootspace.head.va; 1730 kva_end = kva + bootspace.head.sz; 1731 for ( ; kva < kva_end; kva += PAGE_SIZE) { 1732 p1i = pl1_i(kva); 1733 if (pmap_valid_entry(PTE_BASE[p1i])) 1734 PTE_BASE[p1i] |= pmap_pg_g; 1735 } 1736 1737 /* kernel segments */ 1738 for (i = 0; i < BTSPACE_NSEGS; i++) { 1739 if (bootspace.segs[i].type == BTSEG_NONE) { 1740 continue; 1741 } 1742 kva = bootspace.segs[i].va; 1743 kva_end = kva + bootspace.segs[i].sz; 1744 for ( ; kva < kva_end; kva += PAGE_SIZE) { 1745 p1i = pl1_i(kva); 1746 if (pmap_valid_entry(PTE_BASE[p1i])) 1747 PTE_BASE[p1i] |= pmap_pg_g; 1748 } 1749 } 1750 1751 /* boot space */ 1752 kva = bootspace.boot.va; 1753 kva_end = kva + bootspace.boot.sz; 1754 for ( ; kva < kva_end; kva += PAGE_SIZE) { 1755 p1i = pl1_i(kva); 1756 if (pmap_valid_entry(PTE_BASE[p1i])) 1757 PTE_BASE[p1i] |= pmap_pg_g; 1758 } 1759 } 1760 #endif 1761 1762 #ifndef XENPV 1763 /* 1764 * Remap several kernel segments with large pages. We cover as many pages as we 1765 * can. Called only once at boot time, if the CPU supports large pages. 1766 */ 1767 static void 1768 pmap_remap_largepages(void) 1769 { 1770 pd_entry_t *pde; 1771 vaddr_t kva, kva_end; 1772 paddr_t pa; 1773 size_t i; 1774 1775 /* Remap the kernel text using large pages. */ 1776 for (i = 0; i < BTSPACE_NSEGS; i++) { 1777 if (bootspace.segs[i].type != BTSEG_TEXT) { 1778 continue; 1779 } 1780 kva = roundup(bootspace.segs[i].va, NBPD_L2); 1781 if (kva < bootspace.segs[i].va) { 1782 continue; 1783 } 1784 kva_end = rounddown(bootspace.segs[i].va + 1785 bootspace.segs[i].sz, NBPD_L2); 1786 pa = roundup(bootspace.segs[i].pa, NBPD_L2); 1787 for (/* */; kva < kva_end; kva += NBPD_L2, pa += NBPD_L2) { 1788 pde = &L2_BASE[pl2_i(kva)]; 1789 *pde = pa | pmap_pg_g | PTE_PS | PTE_P; 1790 tlbflushg(); 1791 } 1792 } 1793 1794 /* Remap the kernel rodata using large pages. */ 1795 for (i = 0; i < BTSPACE_NSEGS; i++) { 1796 if (bootspace.segs[i].type != BTSEG_RODATA) { 1797 continue; 1798 } 1799 kva = roundup(bootspace.segs[i].va, NBPD_L2); 1800 if (kva < bootspace.segs[i].va) { 1801 continue; 1802 } 1803 kva_end = rounddown(bootspace.segs[i].va + 1804 bootspace.segs[i].sz, NBPD_L2); 1805 pa = roundup(bootspace.segs[i].pa, NBPD_L2); 1806 for (/* */; kva < kva_end; kva += NBPD_L2, pa += NBPD_L2) { 1807 pde = &L2_BASE[pl2_i(kva)]; 1808 *pde = pa | pmap_pg_g | PTE_PS | pmap_pg_nx | PTE_P; 1809 tlbflushg(); 1810 } 1811 } 1812 1813 /* Remap the kernel data+bss using large pages. */ 1814 for (i = 0; i < BTSPACE_NSEGS; i++) { 1815 if (bootspace.segs[i].type != BTSEG_DATA) { 1816 continue; 1817 } 1818 kva = roundup(bootspace.segs[i].va, NBPD_L2); 1819 if (kva < bootspace.segs[i].va) { 1820 continue; 1821 } 1822 kva_end = rounddown(bootspace.segs[i].va + 1823 bootspace.segs[i].sz, NBPD_L2); 1824 pa = roundup(bootspace.segs[i].pa, NBPD_L2); 1825 for (/* */; kva < kva_end; kva += NBPD_L2, pa += NBPD_L2) { 1826 pde = &L2_BASE[pl2_i(kva)]; 1827 *pde = pa | pmap_pg_g | PTE_PS | pmap_pg_nx | PTE_W | PTE_P; 1828 tlbflushg(); 1829 } 1830 } 1831 } 1832 #endif /* !XENPV */ 1833 1834 /* 1835 * pmap_init: called from uvm_init, our job is to get the pmap system ready 1836 * to manage mappings. 1837 */ 1838 void 1839 pmap_init(void) 1840 { 1841 int flags; 1842 1843 /* 1844 * initialize caches. 
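 * three allocators are set up below: pmap_cache for struct pmap,
 * pmap_pdp_pool for PDPs (backed by a 4-page allocator under PAE),
 * and pmap_pvp_cache for page-sized blocks of pv_entries.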
1845 */
1846
1847 pool_cache_bootstrap(&pmap_cache, sizeof(struct pmap), COHERENCY_UNIT,
1848 0, 0, "pmappl", NULL, IPL_NONE, pmap_ctor, pmap_dtor, NULL);
1849
1850 #ifdef XENPV
1851 /*
1852 * pool_cache(9) should not touch cached objects, since they
1853 * are pinned on xen and R/O for the domU
1854 */
1855 flags = PR_NOTOUCH;
1856 #else
1857 flags = 0;
1858 #endif
1859
1860 #ifdef PAE
1861 pool_init(&pmap_pdp_pool, PAGE_SIZE * PDP_SIZE, 0, 0, flags,
1862 "pdppl", &pmap_pdp_allocator, IPL_NONE);
1863 #else
1864 pool_init(&pmap_pdp_pool, PAGE_SIZE, 0, 0, flags,
1865 "pdppl", NULL, IPL_NONE);
1866 #endif
1867 pool_cache_bootstrap(&pmap_pvp_cache, PAGE_SIZE, PAGE_SIZE,
1868 0, 0, "pvpage", &pool_allocator_kmem,
1869 IPL_NONE, pmap_pvp_ctor, pmap_pvp_dtor, NULL);
1870
1871 pmap_tlb_init();
1872
1873 /* XXX: Since cpu_hatch() is only for secondary CPUs. */
1874 pmap_tlb_cpu_init(curcpu());
1875
1876 evcnt_attach_dynamic(&pmap_iobmp_evcnt, EVCNT_TYPE_MISC,
1877 NULL, "x86", "io bitmap copy");
1878 evcnt_attach_dynamic(&pmap_ldt_evcnt, EVCNT_TYPE_MISC,
1879 NULL, "x86", "ldt sync");
1880
1881 /*
1882 * The kernel doesn't keep track of PTPs, so there's nowhere handy
1883 * to hang a tree of pv_entry records. Dynamically allocated
1884 * pv_entry lists are not heavily used in the kernel's pmap (the
1885 * usual case is embedded), so cop out and use a single RB tree
1886 * to cover them.
1887 */
1888 rb_tree_init(&pmap_kernel_rb, &pmap_rbtree_ops);
1889
1890 /*
1891 * done: pmap module is up (and ready for business)
1892 */
1893
1894 pmap_initialized = true;
1895 }
1896
1897 #ifndef XENPV
1898 /*
1899 * pmap_cpu_init_late: perform late per-CPU initialization.
1900 */
1901 void
1902 pmap_cpu_init_late(struct cpu_info *ci)
1903 {
1904 /*
1905 * The BP already has its own PD page allocated during early
1906 * MD startup.
1907 */
1908 if (ci == &cpu_info_primary)
1909 return;
1910 #ifdef PAE
1911 cpu_alloc_l3_page(ci);
1912 #endif
1913 }
1914 #endif
1915
1916 #ifndef __HAVE_DIRECT_MAP
1917 CTASSERT(CACHE_LINE_SIZE > sizeof(pt_entry_t));
1918 CTASSERT(CACHE_LINE_SIZE % sizeof(pt_entry_t) == 0);
1919
1920 static void
1921 pmap_vpage_cpualloc(struct cpu_info *ci)
1922 {
1923 bool primary = (ci == &cpu_info_primary);
1924 size_t i, npages;
1925 vaddr_t vabase;
1926 vsize_t vrange;
1927
1928 npages = (CACHE_LINE_SIZE / sizeof(pt_entry_t));
1929 KASSERT(npages >= VPAGE_MAX);
1930 vrange = npages * PAGE_SIZE;
1931
1932 if (primary) {
1933 while ((vabase = pmap_bootstrap_valloc(1)) % vrange != 0) {
1934 /* Waste some pages to align properly */
1935 }
1936 /* The base is aligned, allocate the rest (contiguous) */
1937 pmap_bootstrap_valloc(npages - 1);
1938 } else {
1939 vabase = uvm_km_alloc(kernel_map, vrange, vrange,
1940 UVM_KMF_VAONLY);
1941 if (vabase == 0) {
1942 panic("%s: failed to allocate tmp VA for CPU %d\n",
1943 __func__, cpu_index(ci));
1944 }
1945 }
1946
1947 KASSERT((vaddr_t)&PTE_BASE[pl1_i(vabase)] % CACHE_LINE_SIZE == 0);
1948
1949 for (i = 0; i < VPAGE_MAX; i++) {
1950 ci->vpage[i] = vabase + i * PAGE_SIZE;
1951 ci->vpage_pte[i] = PTE_BASE + pl1_i(ci->vpage[i]);
1952 }
1953 }
1954
1955 void
1956 pmap_vpage_cpu_init(struct cpu_info *ci)
1957 {
1958 if (ci == &cpu_info_primary) {
1959 /* cpu0 already taken care of in pmap_bootstrap */
1960 return;
1961 }
1962
1963 pmap_vpage_cpualloc(ci);
1964 }
1965 #endif
1966
1967 /*
1968 * p v _ e n t r y f u n c t i o n s
1969 */
1970
1971 /*
1972 * pmap_pvp_ctor: pool_cache constructor for PV pages.
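 * Carves the fresh page into PVE_PER_PVP free entries: the pv_page
 * header takes the first pv_entry-sized slot and the remaining slots
 * go onto pvp_pves.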
1973 */ 1974 static int 1975 pmap_pvp_ctor(void *arg, void *obj, int flags) 1976 { 1977 struct pv_page *pvp = (struct pv_page *)obj; 1978 struct pv_entry *pve = (struct pv_entry *)obj + 1; 1979 struct pv_entry *maxpve = pve + PVE_PER_PVP; 1980 1981 KASSERT(sizeof(struct pv_page) <= sizeof(struct pv_entry)); 1982 KASSERT(trunc_page((vaddr_t)obj) == (vaddr_t)obj); 1983 1984 LIST_INIT(&pvp->pvp_pves); 1985 pvp->pvp_nfree = PVE_PER_PVP; 1986 pvp->pvp_pmap = NULL; 1987 1988 for (; pve < maxpve; pve++) { 1989 LIST_INSERT_HEAD(&pvp->pvp_pves, pve, pve_list); 1990 } 1991 1992 return 0; 1993 } 1994 1995 /* 1996 * pmap_pvp_dtor: pool_cache destructor for PV pages. 1997 */ 1998 static void 1999 pmap_pvp_dtor(void *arg, void *obj) 2000 { 2001 struct pv_page *pvp __diagused = obj; 2002 2003 KASSERT(pvp->pvp_pmap == NULL); 2004 KASSERT(pvp->pvp_nfree == PVE_PER_PVP); 2005 } 2006 2007 /* 2008 * pmap_alloc_pv: allocate a PV entry (likely cached with pmap). 2009 */ 2010 static struct pv_entry * 2011 pmap_alloc_pv(struct pmap *pmap) 2012 { 2013 struct pv_entry *pve; 2014 struct pv_page *pvp; 2015 2016 KASSERT(mutex_owned(&pmap->pm_lock)); 2017 2018 if (__predict_false((pvp = LIST_FIRST(&pmap->pm_pvp_part)) == NULL)) { 2019 if ((pvp = LIST_FIRST(&pmap->pm_pvp_full)) != NULL) { 2020 LIST_REMOVE(pvp, pvp_list); 2021 } else { 2022 pvp = pool_cache_get(&pmap_pvp_cache, PR_NOWAIT); 2023 } 2024 if (__predict_false(pvp == NULL)) { 2025 return NULL; 2026 } 2027 /* full -> part */ 2028 LIST_INSERT_HEAD(&pmap->pm_pvp_part, pvp, pvp_list); 2029 pvp->pvp_pmap = pmap; 2030 } 2031 2032 KASSERT(pvp->pvp_pmap == pmap); 2033 KASSERT(pvp->pvp_nfree > 0); 2034 2035 pve = LIST_FIRST(&pvp->pvp_pves); 2036 LIST_REMOVE(pve, pve_list); 2037 pvp->pvp_nfree--; 2038 2039 if (__predict_false(pvp->pvp_nfree == 0)) { 2040 /* part -> empty */ 2041 KASSERT(LIST_EMPTY(&pvp->pvp_pves)); 2042 LIST_REMOVE(pvp, pvp_list); 2043 LIST_INSERT_HEAD(&pmap->pm_pvp_empty, pvp, pvp_list); 2044 } else { 2045 KASSERT(!LIST_EMPTY(&pvp->pvp_pves)); 2046 } 2047 2048 return pve; 2049 } 2050 2051 /* 2052 * pmap_free_pv: delayed free of a PV entry. 2053 */ 2054 static void 2055 pmap_free_pv(struct pmap *pmap, struct pv_entry *pve) 2056 { 2057 struct pv_page *pvp = (struct pv_page *)trunc_page((vaddr_t)pve); 2058 2059 KASSERT(mutex_owned(&pmap->pm_lock)); 2060 KASSERT(pvp->pvp_pmap == pmap); 2061 KASSERT(pvp->pvp_nfree >= 0); 2062 2063 LIST_INSERT_HEAD(&pvp->pvp_pves, pve, pve_list); 2064 pvp->pvp_nfree++; 2065 2066 if (__predict_false(pvp->pvp_nfree == 1)) { 2067 /* empty -> part */ 2068 LIST_REMOVE(pvp, pvp_list); 2069 LIST_INSERT_HEAD(&pmap->pm_pvp_part, pvp, pvp_list); 2070 } else if (__predict_false(pvp->pvp_nfree == PVE_PER_PVP)) { 2071 /* part -> full */ 2072 LIST_REMOVE(pvp, pvp_list); 2073 LIST_INSERT_HEAD(&pmap->pm_pvp_full, pvp, pvp_list); 2074 } 2075 } 2076 2077 /* 2078 * pmap_drain_pv: free full PV pages. 
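 *
 * A pmap keeps its PV pages on three lists according to how many of
 * their entries are free: pm_pvp_full (all entries free), pm_pvp_part
 * (some free) and pm_pvp_empty (none free).  Only the pages on the
 * "full" list can be returned to pmap_pvp_cache here; the others still
 * have entries in use.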
2079 */ 2080 static void 2081 pmap_drain_pv(struct pmap *pmap) 2082 { 2083 struct pv_page *pvp; 2084 2085 KASSERT(mutex_owned(&pmap->pm_lock)); 2086 2087 while ((pvp = LIST_FIRST(&pmap->pm_pvp_full)) != NULL) { 2088 LIST_REMOVE(pvp, pvp_list); 2089 KASSERT(pvp->pvp_pmap == pmap); 2090 KASSERT(pvp->pvp_nfree == PVE_PER_PVP); 2091 pvp->pvp_pmap = NULL; 2092 pool_cache_put(&pmap_pvp_cache, pvp); 2093 } 2094 } 2095 2096 /* 2097 * pmap_check_pv: verify {VA, PTP} pair is either tracked/untracked by page 2098 */ 2099 static void 2100 pmap_check_pv(struct pmap *pmap, struct vm_page *ptp, struct pmap_page *pp, 2101 vaddr_t va, bool tracked) 2102 { 2103 #ifdef DEBUG 2104 struct pv_pte *pvpte; 2105 2106 PMAP_CHECK_PP(pp); 2107 2108 mutex_spin_enter(&pp->pp_lock); 2109 for (pvpte = pv_pte_first(pp); pvpte; pvpte = pv_pte_next(pp, pvpte)) { 2110 if (pvpte->pte_ptp == ptp && pvpte->pte_va == va) { 2111 break; 2112 } 2113 } 2114 mutex_spin_exit(&pp->pp_lock); 2115 2116 if (pvpte && !tracked) { 2117 panic("pmap_check_pv: %p/%lx found on pp %p", ptp, va, pp); 2118 } else if (!pvpte && tracked) { 2119 panic("pmap_check_pv: %p/%lx missing on pp %p", ptp, va, pp); 2120 } 2121 #endif 2122 } 2123 2124 /* 2125 * pmap_treelookup_pv: search the PV tree for a dynamic entry 2126 * 2127 * => pmap must be locked 2128 */ 2129 static struct pv_entry * 2130 pmap_treelookup_pv(const struct pmap *pmap, const struct vm_page *ptp, 2131 const rb_tree_t *tree, const vaddr_t va) 2132 { 2133 struct pv_entry *pve; 2134 rb_node_t *node; 2135 2136 /* 2137 * Inlined lookup tailored for exactly what's needed here that is 2138 * quite a bit faster than using rb_tree_find_node(). 2139 */ 2140 for (node = tree->rbt_root;;) { 2141 if (__predict_false(RB_SENTINEL_P(node))) { 2142 return NULL; 2143 } 2144 pve = (struct pv_entry *) 2145 ((uintptr_t)node - offsetof(struct pv_entry, pve_rb)); 2146 if (pve->pve_pte.pte_va == va) { 2147 KASSERT(pve->pve_pte.pte_ptp == ptp); 2148 return pve; 2149 } 2150 node = node->rb_nodes[pve->pve_pte.pte_va < va]; 2151 } 2152 } 2153 2154 /* 2155 * pmap_lookup_pv: look up a non-embedded pv entry for the given pmap 2156 * 2157 * => a PV entry must be known present (doesn't check for existence) 2158 * => pmap must be locked 2159 */ 2160 static struct pv_entry * 2161 pmap_lookup_pv(const struct pmap *pmap, const struct vm_page *ptp, 2162 const struct pmap_page * const old_pp, const vaddr_t va) 2163 { 2164 struct pv_entry *pve; 2165 const rb_tree_t *tree; 2166 2167 KASSERT(mutex_owned(&pmap->pm_lock)); 2168 KASSERT(ptp != NULL || pmap == pmap_kernel()); 2169 2170 /* 2171 * [This mostly deals with the case of process-private pages, i.e. 2172 * anonymous memory allocations or COW.] 2173 * 2174 * If the page is tracked with an embedded entry then the tree 2175 * lookup can be avoided. It's safe to check for this specific 2176 * set of values without pp_lock because both will only ever be 2177 * set together for this pmap. 2178 * 2179 */ 2180 if (atomic_load_relaxed(&old_pp->pp_pte.pte_ptp) == ptp && 2181 atomic_load_relaxed(&old_pp->pp_pte.pte_va) == va) { 2182 return NULL; 2183 } 2184 2185 /* 2186 * [This mostly deals with shared mappings, for example shared libs 2187 * and executables.] 2188 * 2189 * Optimise for pmap_remove_ptes() which works by ascending scan: 2190 * look at the lowest numbered node in the tree first. The tree is 2191 * known non-empty because of the check above. For short lived 2192 * processes where pmap_remove() isn't used much this gets close to 2193 * a 100% hit rate. 
2194 */ 2195 tree = (ptp != NULL ? &VM_PAGE_TO_PP(ptp)->pp_rb : &pmap_kernel_rb); 2196 KASSERT(!RB_SENTINEL_P(tree->rbt_root)); 2197 pve = (struct pv_entry *) 2198 ((uintptr_t)tree->rbt_minmax[RB_DIR_LEFT] - 2199 offsetof(struct pv_entry, pve_rb)); 2200 if (__predict_true(pve->pve_pte.pte_va == va)) { 2201 KASSERT(pve->pve_pte.pte_ptp == ptp); 2202 return pve; 2203 } 2204 2205 /* Search the RB tree for the key (uncommon). */ 2206 return pmap_treelookup_pv(pmap, ptp, tree, va); 2207 } 2208 2209 /* 2210 * pmap_enter_pv: enter a mapping onto a pmap_page lst 2211 * 2212 * => pmap must be locked 2213 * => does NOT insert dynamic entries to tree (pmap_enter() does later) 2214 */ 2215 static int 2216 pmap_enter_pv(struct pmap *pmap, struct pmap_page *pp, struct vm_page *ptp, 2217 vaddr_t va, struct pv_entry **new_pve, struct pv_entry **old_pve, 2218 bool *samepage, bool *new_embedded, rb_tree_t *tree) 2219 { 2220 struct pv_entry *pve; 2221 int error; 2222 2223 KASSERT(mutex_owned(&pmap->pm_lock)); 2224 KASSERT(ptp_to_pmap(ptp) == pmap); 2225 KASSERT(ptp == NULL || ptp->uobject != NULL); 2226 KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset); 2227 PMAP_CHECK_PP(pp); 2228 2229 /* 2230 * If entering the same page and it's already tracked with an 2231 * embedded entry, we can avoid the expense below. It's safe 2232 * to check for this very specific set of values without a lock 2233 * because both will only ever be set together for this pmap. 2234 */ 2235 if (atomic_load_relaxed(&pp->pp_pte.pte_ptp) == ptp && 2236 atomic_load_relaxed(&pp->pp_pte.pte_va) == va) { 2237 *samepage = true; 2238 pmap_check_pv(pmap, ptp, pp, va, true); 2239 return 0; 2240 } 2241 2242 /* 2243 * Check for an existing dynamic mapping at this address. If it's 2244 * for the same page, then it will be reused and nothing needs to be 2245 * changed. 2246 */ 2247 *old_pve = pmap_treelookup_pv(pmap, ptp, tree, va); 2248 if (*old_pve != NULL && (*old_pve)->pve_pp == pp) { 2249 *samepage = true; 2250 pmap_check_pv(pmap, ptp, pp, va, true); 2251 return 0; 2252 } 2253 2254 /* 2255 * Need to put a new mapping in place. Grab a spare pv_entry in 2256 * case it's needed; won't know for sure until the lock is taken. 2257 */ 2258 if (pmap->pm_pve == NULL) { 2259 pmap->pm_pve = pmap_alloc_pv(pmap); 2260 } 2261 2262 error = 0; 2263 pmap_check_pv(pmap, ptp, pp, va, false); 2264 mutex_spin_enter(&pp->pp_lock); 2265 if (!pv_pte_embedded(pp)) { 2266 /* 2267 * Embedded PV tracking available - easy. 2268 */ 2269 pp->pp_pte.pte_ptp = ptp; 2270 pp->pp_pte.pte_va = va; 2271 *new_embedded = true; 2272 } else if (__predict_false(pmap->pm_pve == NULL)) { 2273 /* 2274 * No memory. 2275 */ 2276 error = ENOMEM; 2277 } else { 2278 /* 2279 * Install new pv_entry on the page. 
2280 */ 2281 pve = pmap->pm_pve; 2282 pmap->pm_pve = NULL; 2283 *new_pve = pve; 2284 pve->pve_pte.pte_ptp = ptp; 2285 pve->pve_pte.pte_va = va; 2286 pve->pve_pp = pp; 2287 LIST_INSERT_HEAD(&pp->pp_pvlist, pve, pve_list); 2288 } 2289 mutex_spin_exit(&pp->pp_lock); 2290 if (error == 0) { 2291 pmap_check_pv(pmap, ptp, pp, va, true); 2292 } 2293 2294 return error; 2295 } 2296 2297 /* 2298 * pmap_remove_pv: try to remove a mapping from a pv_list 2299 * 2300 * => pmap must be locked 2301 * => removes dynamic entries from tree and frees them 2302 * => caller should adjust ptp's wire_count and free PTP if needed 2303 */ 2304 static void 2305 pmap_remove_pv(struct pmap *pmap, struct pmap_page *pp, struct vm_page *ptp, 2306 vaddr_t va, struct pv_entry *pve, uint8_t oattrs) 2307 { 2308 rb_tree_t *tree = (ptp != NULL ? 2309 &VM_PAGE_TO_PP(ptp)->pp_rb : &pmap_kernel_rb); 2310 2311 KASSERT(mutex_owned(&pmap->pm_lock)); 2312 KASSERT(ptp_to_pmap(ptp) == pmap); 2313 KASSERT(ptp == NULL || ptp->uobject != NULL); 2314 KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset); 2315 KASSERT(ptp != NULL || pmap == pmap_kernel()); 2316 2317 pmap_check_pv(pmap, ptp, pp, va, true); 2318 2319 if (pve == NULL) { 2320 mutex_spin_enter(&pp->pp_lock); 2321 KASSERT(pp->pp_pte.pte_ptp == ptp); 2322 KASSERT(pp->pp_pte.pte_va == va); 2323 pp->pp_attrs |= oattrs; 2324 pp->pp_pte.pte_ptp = NULL; 2325 pp->pp_pte.pte_va = 0; 2326 mutex_spin_exit(&pp->pp_lock); 2327 } else { 2328 mutex_spin_enter(&pp->pp_lock); 2329 KASSERT(pp->pp_pte.pte_ptp != ptp || 2330 pp->pp_pte.pte_va != va); 2331 KASSERT(pve->pve_pte.pte_ptp == ptp); 2332 KASSERT(pve->pve_pte.pte_va == va); 2333 KASSERT(pve->pve_pp == pp); 2334 pp->pp_attrs |= oattrs; 2335 LIST_REMOVE(pve, pve_list); 2336 mutex_spin_exit(&pp->pp_lock); 2337 2338 KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == pve); 2339 rb_tree_remove_node(tree, pve); 2340 #ifdef DIAGNOSTIC 2341 memset(pve, 0, sizeof(*pve)); 2342 #endif 2343 pmap_free_pv(pmap, pve); 2344 } 2345 2346 KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == NULL); 2347 pmap_check_pv(pmap, ptp, pp, va, false); 2348 } 2349 2350 /* 2351 * p t p f u n c t i o n s 2352 */ 2353 2354 static struct vm_page * 2355 pmap_find_ptp(struct pmap *pmap, vaddr_t va, int level) 2356 { 2357 int lidx = level - 1; 2358 off_t off = ptp_va2o(va, level); 2359 struct vm_page *pg; 2360 2361 KASSERT(mutex_owned(&pmap->pm_lock)); 2362 2363 if (pmap->pm_ptphint[lidx] && off == pmap->pm_ptphint[lidx]->offset) { 2364 KASSERT(pmap->pm_ptphint[lidx]->wire_count > 0); 2365 pg = pmap->pm_ptphint[lidx]; 2366 PMAP_CHECK_PP(VM_PAGE_TO_PP(pg)); 2367 return pg; 2368 } 2369 PMAP_DUMMY_LOCK(pmap); 2370 pg = uvm_pagelookup(&pmap->pm_obj[lidx], off); 2371 PMAP_DUMMY_UNLOCK(pmap); 2372 if (pg != NULL && __predict_false(pg->wire_count == 0)) { 2373 /* This page is queued to be freed - ignore. */ 2374 pg = NULL; 2375 } 2376 if (pg != NULL) { 2377 PMAP_CHECK_PP(VM_PAGE_TO_PP(pg)); 2378 } 2379 pmap->pm_ptphint[lidx] = pg; 2380 return pg; 2381 } 2382 2383 static inline void 2384 pmap_freepage(struct pmap *pmap, struct vm_page *ptp, int level) 2385 { 2386 int lidx; 2387 2388 KASSERT(ptp->wire_count <= 1); 2389 PMAP_CHECK_PP(VM_PAGE_TO_PP(ptp)); 2390 2391 lidx = level - 1; 2392 pmap_stats_update(pmap, -ptp->wire_count, 0); 2393 if (pmap->pm_ptphint[lidx] == ptp) 2394 pmap->pm_ptphint[lidx] = NULL; 2395 ptp->wire_count = 0; 2396 ptp->uanon = NULL; 2397 KASSERT(RB_TREE_MIN(&VM_PAGE_TO_PP(ptp)->pp_rb) == NULL); 2398 2399 /* 2400 * Enqueue the PTP to be freed by pmap_update(). 
We can't remove 2401 * the page from the uvm_object, as that can take further locks 2402 * (intolerable right now because the PTEs are likely mapped in). 2403 * Instead mark the PTP as free and if we bump into it again, we'll 2404 * either ignore or reuse (depending on what's useful at the time). 2405 */ 2406 LIST_INSERT_HEAD(&pmap->pm_gc_ptp, ptp, mdpage.mp_pp.pp_link); 2407 } 2408 2409 static void 2410 pmap_free_ptp(struct pmap *pmap, struct vm_page *ptp, vaddr_t va, 2411 pt_entry_t *ptes, pd_entry_t * const *pdes) 2412 { 2413 unsigned long index; 2414 int level; 2415 vaddr_t invaladdr; 2416 pd_entry_t opde; 2417 2418 KASSERT(pmap != pmap_kernel()); 2419 KASSERT(mutex_owned(&pmap->pm_lock)); 2420 KASSERT(kpreempt_disabled()); 2421 2422 level = 1; 2423 do { 2424 index = pl_i(va, level + 1); 2425 opde = pmap_pte_testset(&pdes[level - 1][index], 0); 2426 2427 /* 2428 * On Xen-amd64 or SVS, we need to sync the top level page 2429 * directory on each CPU. 2430 */ 2431 #if defined(XENPV) && defined(__x86_64__) 2432 if (level == PTP_LEVELS - 1) { 2433 xen_kpm_sync(pmap, index); 2434 } 2435 #elif defined(SVS) 2436 if (svs_enabled && level == PTP_LEVELS - 1) { 2437 svs_pmap_sync(pmap, index); 2438 } 2439 #endif 2440 2441 invaladdr = level == 1 ? (vaddr_t)ptes : 2442 (vaddr_t)pdes[level - 2]; 2443 pmap_tlb_shootdown(pmap, invaladdr + index * PAGE_SIZE, 2444 opde, TLBSHOOT_FREE_PTP); 2445 2446 #if defined(XENPV) 2447 pmap_tlb_shootnow(); 2448 #endif 2449 2450 pmap_freepage(pmap, ptp, level); 2451 if (level < PTP_LEVELS - 1) { 2452 ptp = pmap_find_ptp(pmap, va, level + 1); 2453 ptp->wire_count--; 2454 if (ptp->wire_count > 1) 2455 break; 2456 } 2457 } while (++level < PTP_LEVELS); 2458 pmap_pte_flush(); 2459 } 2460 2461 /* 2462 * pmap_get_ptp: get a PTP (if there isn't one, allocate a new one) 2463 * 2464 * => pmap should NOT be pmap_kernel() 2465 * => pmap should be locked 2466 * => we are not touching any PTEs yet, so they need not be mapped in 2467 */ 2468 static int 2469 pmap_get_ptp(struct pmap *pmap, struct pmap_ptparray *pt, vaddr_t va, 2470 int flags, struct vm_page **resultp) 2471 { 2472 struct vm_page *ptp; 2473 int i, aflags; 2474 struct uvm_object *obj; 2475 voff_t off; 2476 2477 KASSERT(pmap != pmap_kernel()); 2478 KASSERT(mutex_owned(&pmap->pm_lock)); 2479 2480 /* 2481 * Loop through all page table levels allocating a page 2482 * for any level where we don't already have one. 2483 */ 2484 memset(pt, 0, sizeof(*pt)); 2485 aflags = ((flags & PMAP_CANFAIL) ? 0 : UVM_PGA_USERESERVE) | 2486 UVM_PGA_ZERO; 2487 for (i = PTP_LEVELS; i > 1; i--) { 2488 obj = &pmap->pm_obj[i - 2]; 2489 off = ptp_va2o(va, i - 1); 2490 2491 PMAP_DUMMY_LOCK(pmap); 2492 pt->pg[i] = uvm_pagelookup(obj, off); 2493 2494 if (pt->pg[i] == NULL) { 2495 pt->pg[i] = uvm_pagealloc(obj, off, NULL, aflags); 2496 pt->alloced[i] = (pt->pg[i] != NULL); 2497 } else if (pt->pg[i]->wire_count == 0) { 2498 /* This page was queued to be freed; dequeue it. 
*/ 2499 LIST_REMOVE(pt->pg[i], mdpage.mp_pp.pp_link); 2500 pt->alloced[i] = true; 2501 } 2502 PMAP_DUMMY_UNLOCK(pmap); 2503 if (pt->pg[i] == NULL) { 2504 pmap_unget_ptp(pmap, pt); 2505 return ENOMEM; 2506 } else if (pt->alloced[i]) { 2507 pt->pg[i]->uanon = (struct vm_anon *)(vaddr_t)~0L; 2508 rb_tree_init(&VM_PAGE_TO_PP(pt->pg[i])->pp_rb, 2509 &pmap_rbtree_ops); 2510 PMAP_CHECK_PP(VM_PAGE_TO_PP(pt->pg[i])); 2511 } 2512 } 2513 ptp = pt->pg[2]; 2514 KASSERT(ptp != NULL); 2515 *resultp = ptp; 2516 pmap->pm_ptphint[0] = ptp; 2517 return 0; 2518 } 2519 2520 /* 2521 * pmap_install_ptp: install any freshly allocated PTPs 2522 * 2523 * => pmap should NOT be pmap_kernel() 2524 * => pmap should be locked 2525 * => PTEs must be mapped 2526 * => preemption must be disabled 2527 */ 2528 static void 2529 pmap_install_ptp(struct pmap *pmap, struct pmap_ptparray *pt, vaddr_t va, 2530 pd_entry_t * const *pdes) 2531 { 2532 struct vm_page *ptp; 2533 unsigned long index; 2534 pd_entry_t *pva; 2535 paddr_t pa; 2536 int i; 2537 2538 KASSERT(pmap != pmap_kernel()); 2539 KASSERT(mutex_owned(&pmap->pm_lock)); 2540 KASSERT(kpreempt_disabled()); 2541 2542 /* 2543 * Now that we have all the pages looked up or allocated, 2544 * loop through again installing any new ones into the tree. 2545 */ 2546 for (i = PTP_LEVELS; i > 1; i--) { 2547 index = pl_i(va, i); 2548 pva = pdes[i - 2]; 2549 2550 if (pmap_valid_entry(pva[index])) { 2551 KASSERT(!pt->alloced[i]); 2552 continue; 2553 } 2554 2555 ptp = pt->pg[i]; 2556 ptp->flags &= ~PG_BUSY; /* never busy */ 2557 ptp->wire_count = 1; 2558 pmap->pm_ptphint[i - 2] = ptp; 2559 pa = VM_PAGE_TO_PHYS(ptp); 2560 pmap_pte_set(&pva[index], (pd_entry_t) 2561 (pmap_pa2pte(pa) | PTE_U | PTE_W | PTE_P)); 2562 2563 /* 2564 * On Xen-amd64 or SVS, we need to sync the top level page 2565 * directory on each CPU. 2566 */ 2567 #if defined(XENPV) && defined(__x86_64__) 2568 if (i == PTP_LEVELS) { 2569 xen_kpm_sync(pmap, index); 2570 } 2571 #elif defined(SVS) 2572 if (svs_enabled && i == PTP_LEVELS) { 2573 svs_pmap_sync(pmap, index); 2574 } 2575 #endif 2576 2577 pmap_pte_flush(); 2578 pmap_stats_update(pmap, 1, 0); 2579 2580 /* 2581 * If we're not in the top level, increase the 2582 * wire count of the parent page. 2583 */ 2584 if (i < PTP_LEVELS) { 2585 pt->pg[i + 1]->wire_count++; 2586 } 2587 } 2588 } 2589 2590 /* 2591 * pmap_unget_ptp: free unusued PTPs 2592 * 2593 * => pmap should NOT be pmap_kernel() 2594 * => pmap should be locked 2595 */ 2596 static void 2597 pmap_unget_ptp(struct pmap *pmap, struct pmap_ptparray *pt) 2598 { 2599 int i; 2600 2601 KASSERT(pmap != pmap_kernel()); 2602 KASSERT(mutex_owned(&pmap->pm_lock)); 2603 2604 for (i = PTP_LEVELS; i > 1; i--) { 2605 if (!pt->alloced[i]) { 2606 continue; 2607 } 2608 KASSERT(pt->pg[i]->wire_count == 0); 2609 PMAP_CHECK_PP(VM_PAGE_TO_PP(pt->pg[i])); 2610 pmap_freepage(pmap, pt->pg[i], i - 1); 2611 } 2612 } 2613 2614 /* 2615 * p m a p l i f e c y c l e f u n c t i o n s 2616 */ 2617 2618 /* 2619 * pmap_pdp_init: constructor a new PDP. 2620 */ 2621 static void 2622 pmap_pdp_init(pd_entry_t *pdir) 2623 { 2624 paddr_t pdirpa = 0; 2625 vaddr_t object; 2626 int i; 2627 2628 #if !defined(XENPV) || !defined(__x86_64__) 2629 int npde; 2630 #endif 2631 #ifdef XENPV 2632 int s; 2633 #endif 2634 2635 memset(PAGE_ALIGNED(pdir), 0, PDP_SIZE * PAGE_SIZE); 2636 2637 /* 2638 * NOTE: This is all done unlocked, but we will check afterwards 2639 * if we have raced with pmap_growkernel(). 
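 * (pmap_ctor() re-runs this constructor until the last kernel PDE slot
 * it copied is observed non-zero while pmaps_lock is held.)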
2640 */ 2641 2642 #if defined(XENPV) && defined(__x86_64__) 2643 /* Fetch the physical address of the page directory */ 2644 (void)pmap_extract(pmap_kernel(), (vaddr_t)pdir, &pdirpa); 2645 2646 /* 2647 * This pdir will NEVER be active in kernel mode, so mark 2648 * recursive entry invalid. 2649 */ 2650 pdir[PDIR_SLOT_PTE] = pmap_pa2pte(pdirpa); 2651 2652 /* 2653 * PDP constructed this way won't be for the kernel, hence we 2654 * don't put kernel mappings on Xen. 2655 * 2656 * But we need to make pmap_create() happy, so put a dummy 2657 * (without PTE_P) value at the right place. 2658 */ 2659 pdir[PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1] - 1] = 2660 (pd_entry_t)-1 & PTE_FRAME; 2661 #else /* XENPV && __x86_64__*/ 2662 object = (vaddr_t)pdir; 2663 for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) { 2664 /* Fetch the physical address of the page directory */ 2665 (void)pmap_extract(pmap_kernel(), object, &pdirpa); 2666 2667 /* Put in recursive PDE to map the PTEs */ 2668 pdir[PDIR_SLOT_PTE + i] = pmap_pa2pte(pdirpa) | PTE_P | 2669 pmap_pg_nx; 2670 #ifndef XENPV 2671 pdir[PDIR_SLOT_PTE + i] |= PTE_W; 2672 #endif 2673 } 2674 2675 /* Copy the kernel's top level PDE */ 2676 npde = nkptp[PTP_LEVELS - 1]; 2677 2678 memcpy(&pdir[PDIR_SLOT_KERN], &PDP_BASE[PDIR_SLOT_KERN], 2679 npde * sizeof(pd_entry_t)); 2680 2681 if (VM_MIN_KERNEL_ADDRESS != KERNBASE) { 2682 int idx = pl_i(KERNBASE, PTP_LEVELS); 2683 pdir[idx] = PDP_BASE[idx]; 2684 } 2685 2686 #ifdef __HAVE_PCPU_AREA 2687 pdir[PDIR_SLOT_PCPU] = PDP_BASE[PDIR_SLOT_PCPU]; 2688 #endif 2689 #ifdef __HAVE_DIRECT_MAP 2690 slotspace_copy(SLAREA_DMAP, pdir, PDP_BASE); 2691 #endif 2692 #ifdef KASAN 2693 slotspace_copy(SLAREA_ASAN, pdir, PDP_BASE); 2694 #endif 2695 #ifdef KMSAN 2696 slotspace_copy(SLAREA_MSAN, pdir, PDP_BASE); 2697 #endif 2698 #endif /* XENPV && __x86_64__*/ 2699 2700 #ifdef XENPV 2701 s = splvm(); 2702 object = (vaddr_t)pdir; 2703 pmap_protect(pmap_kernel(), object, object + (PAGE_SIZE * PDP_SIZE), 2704 VM_PROT_READ); 2705 pmap_update(pmap_kernel()); 2706 for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) { 2707 /* 2708 * pin as L2/L4 page, we have to do the page with the 2709 * PDIR_SLOT_PTE entries last 2710 */ 2711 #ifdef PAE 2712 if (i == l2tol3(PDIR_SLOT_PTE)) 2713 continue; 2714 #endif 2715 2716 (void) pmap_extract(pmap_kernel(), object, &pdirpa); 2717 #ifdef __x86_64__ 2718 xpq_queue_pin_l4_table(xpmap_ptom_masked(pdirpa)); 2719 #else 2720 xpq_queue_pin_l2_table(xpmap_ptom_masked(pdirpa)); 2721 #endif 2722 } 2723 #ifdef PAE 2724 object = ((vaddr_t)pdir) + PAGE_SIZE * l2tol3(PDIR_SLOT_PTE); 2725 (void)pmap_extract(pmap_kernel(), object, &pdirpa); 2726 xpq_queue_pin_l2_table(xpmap_ptom_masked(pdirpa)); 2727 #endif 2728 splx(s); 2729 #endif /* XENPV */ 2730 } 2731 2732 /* 2733 * pmap_pdp_fini: destructor for the PDPs. 2734 */ 2735 static void 2736 pmap_pdp_fini(pd_entry_t *pdir) 2737 { 2738 #ifdef XENPV 2739 paddr_t pdirpa = 0; /* XXX: GCC */ 2740 vaddr_t object = (vaddr_t)pdir; 2741 int i; 2742 int s = splvm(); 2743 pt_entry_t *pte; 2744 2745 for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) { 2746 /* fetch the physical address of the page directory. 
*/ 2747 (void) pmap_extract(pmap_kernel(), object, &pdirpa); 2748 /* unpin page table */ 2749 xpq_queue_unpin_table(xpmap_ptom_masked(pdirpa)); 2750 } 2751 object = (vaddr_t)pdir; 2752 for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) { 2753 /* Set page RW again */ 2754 pte = kvtopte(object); 2755 pmap_pte_set(pte, *pte | PTE_W); 2756 xen_bcast_invlpg((vaddr_t)object); 2757 } 2758 splx(s); 2759 #endif /* XENPV */ 2760 } 2761 2762 #ifdef PAE 2763 static void * 2764 pmap_pdp_alloc(struct pool *pp, int flags) 2765 { 2766 return (void *)uvm_km_alloc(kernel_map, 2767 PAGE_SIZE * PDP_SIZE, PAGE_SIZE * PDP_SIZE, 2768 ((flags & PR_WAITOK) ? 0 : UVM_KMF_NOWAIT | UVM_KMF_TRYLOCK) | 2769 UVM_KMF_WIRED); 2770 } 2771 2772 static void 2773 pmap_pdp_free(struct pool *pp, void *v) 2774 { 2775 uvm_km_free(kernel_map, (vaddr_t)v, PAGE_SIZE * PDP_SIZE, 2776 UVM_KMF_WIRED); 2777 } 2778 #endif /* PAE */ 2779 2780 /* 2781 * pmap_ctor: constructor for the pmap cache. 2782 */ 2783 static int 2784 pmap_ctor(void *arg, void *obj, int flags) 2785 { 2786 struct pmap *pmap = obj; 2787 pt_entry_t p; 2788 int i; 2789 2790 KASSERT((flags & PR_WAITOK) != 0); 2791 2792 mutex_init(&pmap->pm_lock, MUTEX_DEFAULT, IPL_NONE); 2793 rw_init(&pmap->pm_dummy_lock); 2794 kcpuset_create(&pmap->pm_cpus, true); 2795 kcpuset_create(&pmap->pm_kernel_cpus, true); 2796 #ifdef XENPV 2797 kcpuset_create(&pmap->pm_xen_ptp_cpus, true); 2798 #endif 2799 LIST_INIT(&pmap->pm_gc_ptp); 2800 pmap->pm_pve = NULL; 2801 LIST_INIT(&pmap->pm_pvp_full); 2802 LIST_INIT(&pmap->pm_pvp_part); 2803 LIST_INIT(&pmap->pm_pvp_empty); 2804 2805 /* allocate and init PDP */ 2806 pmap->pm_pdir = pool_get(&pmap_pdp_pool, PR_WAITOK); 2807 2808 for (;;) { 2809 pmap_pdp_init(pmap->pm_pdir); 2810 mutex_enter(&pmaps_lock); 2811 p = pmap->pm_pdir[PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1] - 1]; 2812 if (__predict_true(p != 0)) { 2813 break; 2814 } 2815 mutex_exit(&pmaps_lock); 2816 } 2817 2818 for (i = 0; i < PDP_SIZE; i++) 2819 pmap->pm_pdirpa[i] = 2820 pmap_pte2pa(pmap->pm_pdir[PDIR_SLOT_PTE + i]); 2821 2822 LIST_INSERT_HEAD(&pmaps, pmap, pm_list); 2823 mutex_exit(&pmaps_lock); 2824 2825 return 0; 2826 } 2827 2828 /* 2829 * pmap_dtor: destructor for the pmap cache. 2830 */ 2831 static void 2832 pmap_dtor(void *arg, void *obj) 2833 { 2834 struct pmap *pmap = obj; 2835 2836 mutex_enter(&pmaps_lock); 2837 LIST_REMOVE(pmap, pm_list); 2838 mutex_exit(&pmaps_lock); 2839 2840 pmap_pdp_fini(pmap->pm_pdir); 2841 pool_put(&pmap_pdp_pool, pmap->pm_pdir); 2842 mutex_destroy(&pmap->pm_lock); 2843 rw_destroy(&pmap->pm_dummy_lock); 2844 kcpuset_destroy(pmap->pm_cpus); 2845 kcpuset_destroy(pmap->pm_kernel_cpus); 2846 #ifdef XENPV 2847 kcpuset_destroy(pmap->pm_xen_ptp_cpus); 2848 #endif 2849 } 2850 2851 /* 2852 * pmap_create: create a pmap object.
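 *
 * A rough lifecycle sketch (the caller shown is hypothetical; in
 * practice these calls are driven by UVM):
 *
 *	struct pmap *pm = pmap_create();
 *	... mappings are entered and removed on pm ...
 *	pmap_remove_all(pm);	(optional bulk teardown)
 *	pmap_destroy(pm);	(drops the final reference)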
2853 */ 2854 struct pmap * 2855 pmap_create(void) 2856 { 2857 struct pmap *pmap; 2858 int i; 2859 2860 pmap = pool_cache_get(&pmap_cache, PR_WAITOK); 2861 2862 /* init uvm_object */ 2863 for (i = 0; i < PTP_LEVELS - 1; i++) { 2864 uvm_obj_init(&pmap->pm_obj[i], &pmap_pager, false, 1); 2865 uvm_obj_setlock(&pmap->pm_obj[i], &pmap->pm_dummy_lock); 2866 pmap->pm_ptphint[i] = NULL; 2867 } 2868 pmap->pm_stats.wired_count = 0; 2869 /* count the PDP allocd below */ 2870 pmap->pm_stats.resident_count = PDP_SIZE; 2871 #if !defined(__x86_64__) 2872 pmap->pm_hiexec = 0; 2873 #endif 2874 2875 /* Used by NVMM and Xen */ 2876 pmap->pm_enter = NULL; 2877 pmap->pm_extract = NULL; 2878 pmap->pm_remove = NULL; 2879 pmap->pm_sync_pv = NULL; 2880 pmap->pm_pp_remove_ent = NULL; 2881 pmap->pm_write_protect = NULL; 2882 pmap->pm_unwire = NULL; 2883 pmap->pm_tlb_flush = NULL; 2884 pmap->pm_data = NULL; 2885 2886 /* init the LDT */ 2887 pmap->pm_ldt = NULL; 2888 pmap->pm_ldt_sel = GSYSSEL(GLDT_SEL, SEL_KPL); 2889 2890 return (pmap); 2891 } 2892 2893 /* 2894 * pmap_check_ptps: verify that none of the pmap's page table objects 2895 * have any pages allocated to them. 2896 */ 2897 static void 2898 pmap_check_ptps(struct pmap *pmap) 2899 { 2900 int i; 2901 2902 for (i = 0; i < PTP_LEVELS - 1; i++) { 2903 KASSERTMSG(pmap->pm_obj[i].uo_npages == 0, 2904 "pmap %p level %d still has %d pages", 2905 pmap, i, (int)pmap->pm_obj[i].uo_npages); 2906 } 2907 } 2908 2909 static void 2910 pmap_check_inuse(struct pmap *pmap) 2911 { 2912 #ifdef DEBUG 2913 CPU_INFO_ITERATOR cii; 2914 struct cpu_info *ci; 2915 2916 for (CPU_INFO_FOREACH(cii, ci)) { 2917 if (ci->ci_pmap == pmap) 2918 panic("destroying pmap being used"); 2919 #if defined(XENPV) && defined(__x86_64__) 2920 for (int i = 0; i < PDIR_SLOT_USERLIM; i++) { 2921 if (pmap->pm_pdir[i] != 0 && 2922 ci->ci_kpm_pdir[i] == pmap->pm_pdir[i]) { 2923 printf("pmap_destroy(%p) pmap_kernel %p " 2924 "curcpu %d cpu %d ci_pmap %p " 2925 "ci->ci_kpm_pdir[%d]=%" PRIx64 2926 " pmap->pm_pdir[%d]=%" PRIx64 "\n", 2927 pmap, pmap_kernel(), curcpu()->ci_index, 2928 ci->ci_index, ci->ci_pmap, 2929 i, ci->ci_kpm_pdir[i], 2930 i, pmap->pm_pdir[i]); 2931 panic("%s: used pmap", __func__); 2932 } 2933 } 2934 #endif 2935 } 2936 #endif /* DEBUG */ 2937 } 2938 2939 /* 2940 * pmap_destroy: drop reference count on pmap. free pmap if reference 2941 * count goes to zero. 2942 * 2943 * => we can be called from pmap_unmap_ptes() with a different, unrelated 2944 * pmap's lock held. be careful! 2945 */ 2946 void 2947 pmap_destroy(struct pmap *pmap) 2948 { 2949 int i; 2950 2951 /* 2952 * drop reference count and verify not in use. 2953 */ 2954 2955 if (atomic_dec_uint_nv(&pmap->pm_obj[0].uo_refs) > 0) { 2956 return; 2957 } 2958 pmap_check_inuse(pmap); 2959 2960 /* 2961 * handle any deferred frees. 2962 */ 2963 2964 mutex_enter(&pmap->pm_lock); 2965 if (pmap->pm_pve != NULL) { 2966 pmap_free_pv(pmap, pmap->pm_pve); 2967 pmap->pm_pve = NULL; 2968 } 2969 pmap_drain_pv(pmap); 2970 mutex_exit(&pmap->pm_lock); 2971 pmap_update(pmap); 2972 2973 /* 2974 * Reference count is zero, free pmap resources and then free pmap. 2975 */ 2976 2977 pmap_check_ptps(pmap); 2978 KASSERT(LIST_EMPTY(&pmap->pm_gc_ptp)); 2979 2980 #ifdef USER_LDT 2981 if (pmap->pm_ldt != NULL) { 2982 /* 2983 * No need to switch the LDT; this address space is gone, 2984 * nothing is using it. 2985 * 2986 * No need to lock the pmap for ldt_free (or anything else), 2987 * we're the last one to use it. 2988 */ 2989 /* XXXAD can't take cpu_lock here - fix soon. 
*/ 2990 mutex_enter(&cpu_lock); 2991 ldt_free(pmap->pm_ldt_sel); 2992 mutex_exit(&cpu_lock); 2993 uvm_km_free(kernel_map, (vaddr_t)pmap->pm_ldt, 2994 MAX_USERLDT_SIZE, UVM_KMF_WIRED); 2995 } 2996 #endif 2997 2998 for (i = 0; i < PTP_LEVELS - 1; i++) { 2999 uvm_obj_destroy(&pmap->pm_obj[i], false); 3000 } 3001 kcpuset_zero(pmap->pm_cpus); 3002 kcpuset_zero(pmap->pm_kernel_cpus); 3003 #ifdef XENPV 3004 kcpuset_zero(pmap->pm_xen_ptp_cpus); 3005 #endif 3006 3007 KASSERT(LIST_EMPTY(&pmap->pm_pvp_full)); 3008 KASSERT(LIST_EMPTY(&pmap->pm_pvp_part)); 3009 KASSERT(LIST_EMPTY(&pmap->pm_pvp_empty)); 3010 3011 pmap_check_ptps(pmap); 3012 if (__predict_false(pmap->pm_enter != NULL)) { 3013 /* XXX make this a different cache */ 3014 pool_cache_destruct_object(&pmap_cache, pmap); 3015 } else { 3016 pool_cache_put(&pmap_cache, pmap); 3017 } 3018 } 3019 3020 /* 3021 * pmap_zap_ptp: clear out an entire PTP without modifying PTEs 3022 * 3023 * => caller must hold pmap's lock 3024 * => PTP must be mapped into KVA 3025 * => must be called with kernel preemption disabled 3026 * => does as little work as possible 3027 */ 3028 static void 3029 pmap_zap_ptp(struct pmap *pmap, struct vm_page *ptp, pt_entry_t *pte, 3030 vaddr_t startva, vaddr_t blkendva) 3031 { 3032 #ifndef XENPV 3033 struct pv_entry *pve; 3034 struct vm_page *pg; 3035 struct pmap_page *pp; 3036 pt_entry_t opte; 3037 rb_tree_t *tree; 3038 vaddr_t va; 3039 int wired; 3040 uint8_t oattrs; 3041 u_int cnt; 3042 3043 KASSERT(mutex_owned(&pmap->pm_lock)); 3044 KASSERT(kpreempt_disabled()); 3045 KASSERT(pmap != pmap_kernel()); 3046 KASSERT(ptp->wire_count > 1); 3047 KASSERT(ptp->wire_count - 1 <= PAGE_SIZE / sizeof(pt_entry_t)); 3048 3049 /* 3050 * Start at the lowest entered VA, and scan until there are no more 3051 * PTEs in the PTPs. 3052 */ 3053 tree = &VM_PAGE_TO_PP(ptp)->pp_rb; 3054 pve = RB_TREE_MIN(tree); 3055 wired = 0; 3056 va = (vaddr_t)ptp->uanon; 3057 pte += ((va - startva) >> PAGE_SHIFT); 3058 3059 for (cnt = ptp->wire_count; cnt > 1; pte++, va += PAGE_SIZE) { 3060 /* 3061 * No need for an atomic to clear the PTE. Nothing else can 3062 * see the address space any more and speculative access (if 3063 * possible) won't modify. Therefore there's no need to 3064 * track the accessed/dirty bits. 3065 */ 3066 opte = *pte; 3067 if (!pmap_valid_entry(opte)) { 3068 continue; 3069 } 3070 3071 /* 3072 * Count the PTE. If it's not for a managed mapping 3073 * there's noting more to do. 3074 */ 3075 cnt--; 3076 wired -= (opte & PTE_WIRED); 3077 if ((opte & PTE_PVLIST) == 0) { 3078 #ifndef DOM0OPS 3079 KASSERTMSG((PHYS_TO_VM_PAGE(pmap_pte2pa(opte)) == NULL), 3080 "managed page without PTE_PVLIST for %#" 3081 PRIxVADDR, va); 3082 KASSERTMSG((pmap_pv_tracked(pmap_pte2pa(opte)) == NULL), 3083 "pv-tracked page without PTE_PVLIST for %#" 3084 PRIxVADDR, va); 3085 #endif 3086 KASSERT(pmap_treelookup_pv(pmap, ptp, (ptp != NULL ? 3087 &VM_PAGE_TO_PP(ptp)->pp_rb : &pmap_kernel_rb), 3088 va) == NULL); 3089 continue; 3090 } 3091 3092 /* 3093 * "pve" now points to the lowest (by VA) dynamic PV entry 3094 * in the PTP. If it's for this VA, take advantage of it to 3095 * avoid calling PHYS_TO_VM_PAGE(). Avoid modifying the RB 3096 * tree by skipping to the next VA in the tree whenever 3097 * there is a match here. The tree will be cleared out in 3098 * one pass before return to pmap_remove_all(). 
3099 */ 3100 oattrs = pmap_pte_to_pp_attrs(opte); 3101 if (pve != NULL && pve->pve_pte.pte_va == va) { 3102 pp = pve->pve_pp; 3103 KASSERT(pve->pve_pte.pte_ptp == ptp); 3104 KASSERT(pp->pp_pte.pte_ptp != ptp || 3105 pp->pp_pte.pte_va != va); 3106 mutex_spin_enter(&pp->pp_lock); 3107 pp->pp_attrs |= oattrs; 3108 LIST_REMOVE(pve, pve_list); 3109 mutex_spin_exit(&pp->pp_lock); 3110 3111 /* 3112 * pve won't be touched again until pmap_drain_pv(), 3113 * so it's still safe to traverse the tree. 3114 */ 3115 pmap_free_pv(pmap, pve); 3116 pve = RB_TREE_NEXT(tree, pve); 3117 continue; 3118 } 3119 3120 /* 3121 * No entry in the tree so it must be embedded. Look up the 3122 * page and cancel the embedded entry. 3123 */ 3124 if ((pg = PHYS_TO_VM_PAGE(pmap_pte2pa(opte))) != NULL) { 3125 pp = VM_PAGE_TO_PP(pg); 3126 } else if ((pp = pmap_pv_tracked(pmap_pte2pa(opte))) == NULL) { 3127 paddr_t pa = pmap_pte2pa(opte); 3128 panic("%s: PTE_PVLIST with pv-untracked page" 3129 " va = %#"PRIxVADDR"pa = %#"PRIxPADDR 3130 "(%#"PRIxPADDR")", __func__, va, pa, atop(pa)); 3131 } 3132 mutex_spin_enter(&pp->pp_lock); 3133 KASSERT(pp->pp_pte.pte_ptp == ptp); 3134 KASSERT(pp->pp_pte.pte_va == va); 3135 pp->pp_attrs |= oattrs; 3136 pp->pp_pte.pte_ptp = NULL; 3137 pp->pp_pte.pte_va = 0; 3138 mutex_spin_exit(&pp->pp_lock); 3139 } 3140 3141 /* PTP now empty - adjust the tree & stats to match. */ 3142 pmap_stats_update(pmap, -(ptp->wire_count - 1), wired / PTE_WIRED); 3143 ptp->wire_count = 1; 3144 #ifdef DIAGNOSTIC 3145 rb_tree_init(tree, &pmap_rbtree_ops); 3146 #endif 3147 #else /* !XENPV */ 3148 /* 3149 * XXXAD For XEN, it's not clear to me that we can do this, because 3150 * I guess the hypervisor keeps track of PTEs too. 3151 */ 3152 pmap_remove_ptes(pmap, ptp, (vaddr_t)pte, startva, blkendva); 3153 #endif /* !XENPV */ 3154 } 3155 3156 /* 3157 * pmap_remove_all: remove all mappings from pmap in bulk. 3158 * 3159 * Ordinarily when removing mappings it's important to hold the UVM object's 3160 * lock, so that pages do not gain a new identity while retaining stale TLB 3161 * entries (the same lock hold covers both pmap_remove() and pmap_update()). 3162 * Here it's known that the address space is no longer visible to any user 3163 * process, so we don't need to worry about that. 3164 */ 3165 bool 3166 pmap_remove_all(struct pmap *pmap) 3167 { 3168 struct vm_page *ptps[32]; 3169 vaddr_t va, blkendva; 3170 struct pmap *pmap2; 3171 pt_entry_t *ptes; 3172 pd_entry_t pde __diagused; 3173 pd_entry_t * const *pdes; 3174 int lvl __diagused, i, n; 3175 3176 /* XXX Can't handle EPT just yet. */ 3177 if (pmap->pm_remove != NULL) { 3178 return false; 3179 } 3180 3181 for (;;) { 3182 /* Fetch a block of PTPs from tree. */ 3183 mutex_enter(&pmap->pm_lock); 3184 n = radix_tree_gang_lookup_node(&pmap->pm_obj[0].uo_pages, 0, 3185 (void **)ptps, __arraycount(ptps), false); 3186 if (n == 0) { 3187 mutex_exit(&pmap->pm_lock); 3188 break; 3189 } 3190 3191 /* Remove all mappings in the set of PTPs. */ 3192 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); 3193 for (i = 0; i < n; i++) { 3194 if (ptps[i]->wire_count == 0) { 3195 /* It's dead: pmap_update() will expunge. */ 3196 continue; 3197 } 3198 3199 /* Determine range of block. */ 3200 va = ptps[i]->offset * PAGE_SIZE / sizeof(pt_entry_t); 3201 blkendva = x86_round_pdr(va + 1); 3202 3203 /* Make sure everything squares up... */ 3204 KASSERT(pmap_pdes_valid(va, pdes, &pde, &lvl)); 3205 KASSERT(lvl == 1); 3206 KASSERT(pmap_find_ptp(pmap, va, 1) == ptps[i]); 3207 3208 /* Zap! 
*/ 3209 pmap_zap_ptp(pmap, ptps[i], &ptes[pl1_i(va)], va, 3210 blkendva); 3211 3212 /* PTP should now be unused - free it. */ 3213 KASSERT(ptps[i]->wire_count == 1); 3214 pmap_free_ptp(pmap, ptps[i], va, ptes, pdes); 3215 } 3216 pmap_unmap_ptes(pmap, pmap2); 3217 pmap_drain_pv(pmap); 3218 pmap_tlb_shootdown(pmap, -1L, 0, TLBSHOOT_REMOVE_ALL); 3219 mutex_exit(&pmap->pm_lock); 3220 3221 /* Process deferred frees. */ 3222 pmap_update(pmap); 3223 3224 /* A breathing point. */ 3225 preempt_point(); 3226 } 3227 3228 /* Verify that the pmap is now completely empty. */ 3229 pmap_check_ptps(pmap); 3230 KASSERTMSG(pmap->pm_stats.resident_count == PDP_SIZE, 3231 "pmap %p not empty", pmap); 3232 3233 return true; 3234 } 3235 3236 #if defined(PMAP_FORK) 3237 /* 3238 * pmap_fork: perform any necessary data structure manipulation when 3239 * a VM space is forked. 3240 */ 3241 void 3242 pmap_fork(struct pmap *pmap1, struct pmap *pmap2) 3243 { 3244 #ifdef USER_LDT 3245 union descriptor *new_ldt; 3246 int sel; 3247 3248 if (__predict_true(pmap1->pm_ldt == NULL)) { 3249 return; 3250 } 3251 3252 /* 3253 * Copy the LDT into the new process. 3254 * 3255 * Read pmap1's ldt pointer unlocked; if it changes behind our back 3256 * we'll retry. This will starve if there's a stream of LDT changes 3257 * in another thread but that should not happen. 3258 */ 3259 3260 retry: 3261 if (pmap1->pm_ldt != NULL) { 3262 /* Allocate space for the new process's LDT */ 3263 new_ldt = (union descriptor *)uvm_km_alloc(kernel_map, 3264 MAX_USERLDT_SIZE, 0, UVM_KMF_WIRED); 3265 if (new_ldt == NULL) { 3266 printf("WARNING: %s: unable to allocate LDT space\n", 3267 __func__); 3268 return; 3269 } 3270 mutex_enter(&cpu_lock); 3271 /* Get a GDT slot for it */ 3272 sel = ldt_alloc(new_ldt, MAX_USERLDT_SIZE); 3273 if (sel == -1) { 3274 mutex_exit(&cpu_lock); 3275 uvm_km_free(kernel_map, (vaddr_t)new_ldt, 3276 MAX_USERLDT_SIZE, UVM_KMF_WIRED); 3277 printf("WARNING: %s: unable to allocate LDT selector\n", 3278 __func__); 3279 return; 3280 } 3281 } else { 3282 /* Wasn't anything there after all. */ 3283 new_ldt = NULL; 3284 sel = -1; 3285 mutex_enter(&cpu_lock); 3286 } 3287 3288 /* 3289 * Now that we have cpu_lock, ensure the LDT status is the same. 3290 */ 3291 if (pmap1->pm_ldt != NULL) { 3292 if (new_ldt == NULL) { 3293 /* A wild LDT just appeared. */ 3294 mutex_exit(&cpu_lock); 3295 goto retry; 3296 } 3297 3298 /* Copy the LDT data and install it in pmap2 */ 3299 memcpy(new_ldt, pmap1->pm_ldt, MAX_USERLDT_SIZE); 3300 pmap2->pm_ldt = new_ldt; 3301 pmap2->pm_ldt_sel = sel; 3302 mutex_exit(&cpu_lock); 3303 } else { 3304 if (new_ldt != NULL) { 3305 /* The LDT disappeared, drop what we did. */ 3306 ldt_free(sel); 3307 mutex_exit(&cpu_lock); 3308 uvm_km_free(kernel_map, (vaddr_t)new_ldt, 3309 MAX_USERLDT_SIZE, UVM_KMF_WIRED); 3310 return; 3311 } 3312 3313 /* We're good, just leave. */ 3314 mutex_exit(&cpu_lock); 3315 } 3316 #endif /* USER_LDT */ 3317 } 3318 #endif /* PMAP_FORK */ 3319 3320 #ifdef USER_LDT 3321 3322 /* 3323 * pmap_ldt_xcall: cross call used by pmap_ldt_sync. if the named pmap 3324 * is active, reload LDTR. 3325 */ 3326 static void 3327 pmap_ldt_xcall(void *arg1, void *arg2) 3328 { 3329 struct pmap *pm; 3330 3331 kpreempt_disable(); 3332 pm = arg1; 3333 if (curcpu()->ci_pmap == pm) { 3334 #if defined(SVS) 3335 if (svs_enabled) { 3336 svs_ldt_sync(pm); 3337 } else 3338 #endif 3339 lldt(pm->pm_ldt_sel); 3340 } 3341 kpreempt_enable(); 3342 } 3343 3344 /* 3345 * pmap_ldt_sync: LDT selector for the named pmap is changing. 
swap 3346 * in the new selector on all CPUs. 3347 */ 3348 void 3349 pmap_ldt_sync(struct pmap *pm) 3350 { 3351 uint64_t where; 3352 3353 KASSERT(mutex_owned(&cpu_lock)); 3354 3355 pmap_ldt_evcnt.ev_count++; 3356 where = xc_broadcast(0, pmap_ldt_xcall, pm, NULL); 3357 xc_wait(where); 3358 } 3359 3360 /* 3361 * pmap_ldt_cleanup: if the pmap has a local LDT, deallocate it, and 3362 * restore the default. 3363 */ 3364 void 3365 pmap_ldt_cleanup(struct lwp *l) 3366 { 3367 pmap_t pmap = l->l_proc->p_vmspace->vm_map.pmap; 3368 union descriptor *ldt; 3369 int sel; 3370 3371 if (__predict_true(pmap->pm_ldt == NULL)) { 3372 return; 3373 } 3374 3375 mutex_enter(&cpu_lock); 3376 if (pmap->pm_ldt != NULL) { 3377 sel = pmap->pm_ldt_sel; 3378 ldt = pmap->pm_ldt; 3379 pmap->pm_ldt_sel = GSYSSEL(GLDT_SEL, SEL_KPL); 3380 pmap->pm_ldt = NULL; 3381 pmap_ldt_sync(pmap); 3382 ldt_free(sel); 3383 uvm_km_free(kernel_map, (vaddr_t)ldt, MAX_USERLDT_SIZE, 3384 UVM_KMF_WIRED); 3385 } 3386 mutex_exit(&cpu_lock); 3387 } 3388 #endif /* USER_LDT */ 3389 3390 /* 3391 * pmap_activate: activate a process' pmap 3392 * 3393 * => must be called with kernel preemption disabled 3394 * => if lwp is the curlwp, then set ci_want_pmapload so that 3395 * actual MMU context switch will be done by pmap_load() later 3396 */ 3397 void 3398 pmap_activate(struct lwp *l) 3399 { 3400 struct cpu_info *ci; 3401 struct pmap *pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map); 3402 3403 KASSERT(kpreempt_disabled()); 3404 3405 ci = curcpu(); 3406 3407 if (l != ci->ci_curlwp) 3408 return; 3409 3410 KASSERT(ci->ci_want_pmapload == 0); 3411 KASSERT(ci->ci_tlbstate != TLBSTATE_VALID); 3412 3413 /* 3414 * no need to switch to kernel vmspace because 3415 * it's a subset of any vmspace. 3416 */ 3417 3418 if (pmap == pmap_kernel()) { 3419 ci->ci_want_pmapload = 0; 3420 return; 3421 } 3422 3423 ci->ci_want_pmapload = 1; 3424 } 3425 3426 #if defined(XENPV) && defined(__x86_64__) 3427 #define KASSERT_PDIRPA(pmap) \ 3428 KASSERT(pmap_pdirpa(pmap, 0) == ci->ci_xen_current_user_pgd || \ 3429 pmap == pmap_kernel()) 3430 #elif defined(PAE) 3431 #define KASSERT_PDIRPA(pmap) \ 3432 KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(ci->ci_pae_l3_pdir[0])) 3433 #elif !defined(XENPV) 3434 #define KASSERT_PDIRPA(pmap) \ 3435 KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(rcr3())) 3436 #else 3437 #define KASSERT_PDIRPA(pmap) KASSERT(true) /* nothing to do */ 3438 #endif 3439 3440 /* 3441 * pmap_reactivate: try to regain reference to the pmap. 3442 * 3443 * => Must be called with kernel preemption disabled. 3444 */ 3445 static void 3446 pmap_reactivate(struct pmap *pmap) 3447 { 3448 struct cpu_info * const ci = curcpu(); 3449 const cpuid_t cid = cpu_index(ci); 3450 3451 KASSERT(kpreempt_disabled()); 3452 KASSERT_PDIRPA(pmap); 3453 3454 /* 3455 * If we still have a lazy reference to this pmap, we can assume 3456 * that there was no TLB shootdown for this pmap in the meantime. 3457 * 3458 * The order of events here is important as we must synchronize 3459 * with TLB shootdown interrupts. Declare interest in invalidations 3460 * (TLBSTATE_VALID) and then check the CPU set, which the IPIs can 3461 * change only when the state is TLBSTATE_LAZY. 3462 */ 3463 3464 ci->ci_tlbstate = TLBSTATE_VALID; 3465 KASSERT(kcpuset_isset(pmap->pm_kernel_cpus, cid)); 3466 3467 if (__predict_true(kcpuset_isset(pmap->pm_cpus, cid))) { 3468 /* We have the reference, state is valid. */ 3469 } else { 3470 /* 3471 * Must reload the TLB, pmap has been changed during 3472 * deactivated. 
3473 */ 3474 kcpuset_atomic_set(pmap->pm_cpus, cid); 3475 3476 tlbflush(); 3477 } 3478 } 3479 3480 /* 3481 * pmap_load: perform the actual pmap switch, i.e. fill in %cr3 register 3482 * and relevant LDT info. 3483 * 3484 * Ensures that the current process' pmap is loaded on the current CPU's 3485 * MMU and that there are no stale TLB entries. 3486 * 3487 * => The caller should disable kernel preemption or do check-and-retry 3488 * to prevent a preemption from undoing our efforts. 3489 * => This function may block. 3490 */ 3491 void 3492 pmap_load(void) 3493 { 3494 struct cpu_info *ci; 3495 struct pmap *pmap, *oldpmap; 3496 struct lwp *l; 3497 uint64_t ncsw; 3498 3499 kpreempt_disable(); 3500 retry: 3501 ci = curcpu(); 3502 if (!ci->ci_want_pmapload) { 3503 kpreempt_enable(); 3504 return; 3505 } 3506 l = ci->ci_curlwp; 3507 ncsw = l->l_ncsw; 3508 __insn_barrier(); 3509 3510 /* should be able to take ipis. */ 3511 KASSERT(ci->ci_ilevel < IPL_HIGH); 3512 #ifdef XENPV 3513 /* Check to see if interrupts are enabled (ie; no events are masked) */ 3514 KASSERT(x86_read_psl() == 0); 3515 #else 3516 KASSERT((x86_read_psl() & PSL_I) != 0); 3517 #endif 3518 3519 KASSERT(l != NULL); 3520 pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map); 3521 KASSERT(pmap != pmap_kernel()); 3522 oldpmap = ci->ci_pmap; 3523 3524 if (pmap == oldpmap) { 3525 pmap_reactivate(pmap); 3526 ci->ci_want_pmapload = 0; 3527 kpreempt_enable(); 3528 return; 3529 } 3530 3531 /* 3532 * Acquire a reference to the new pmap and perform the switch. 3533 */ 3534 3535 pmap_reference(pmap); 3536 pmap_load1(l, pmap, oldpmap); 3537 ci->ci_want_pmapload = 0; 3538 3539 /* 3540 * we're now running with the new pmap. drop the reference 3541 * to the old pmap. if we block, we need to go around again. 3542 */ 3543 3544 pmap_destroy(oldpmap); 3545 __insn_barrier(); 3546 if (l->l_ncsw != ncsw) { 3547 goto retry; 3548 } 3549 3550 kpreempt_enable(); 3551 } 3552 3553 /* 3554 * pmap_load1: the guts of pmap load, shared by pmap_map_ptes() and 3555 * pmap_load(). It's critically important that this function does not 3556 * block. 3557 */ 3558 static void 3559 pmap_load1(struct lwp *l, struct pmap *pmap, struct pmap *oldpmap) 3560 { 3561 struct cpu_info *ci; 3562 struct pcb *pcb; 3563 cpuid_t cid; 3564 3565 KASSERT(kpreempt_disabled()); 3566 3567 pcb = lwp_getpcb(l); 3568 ci = l->l_cpu; 3569 cid = cpu_index(ci); 3570 3571 kcpuset_atomic_clear(oldpmap->pm_cpus, cid); 3572 kcpuset_atomic_clear(oldpmap->pm_kernel_cpus, cid); 3573 3574 KASSERT_PDIRPA(oldpmap); 3575 KASSERT(!kcpuset_isset(pmap->pm_cpus, cid)); 3576 KASSERT(!kcpuset_isset(pmap->pm_kernel_cpus, cid)); 3577 3578 /* 3579 * Mark the pmap in use by this CPU. Again, we must synchronize 3580 * with TLB shootdown interrupts, so set the state VALID first, 3581 * then register us for shootdown events on this pmap. 3582 */ 3583 ci->ci_tlbstate = TLBSTATE_VALID; 3584 kcpuset_atomic_set(pmap->pm_cpus, cid); 3585 kcpuset_atomic_set(pmap->pm_kernel_cpus, cid); 3586 ci->ci_pmap = pmap; 3587 3588 /* 3589 * update tss. now that we have registered for invalidations 3590 * from other CPUs, we're good to load the page tables. 
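 *
 * The steps below are: record the new %cr3 value in the PCB (the L3
 * directory on PAE), refresh the i386 TSS copies, switch the LDT
 * (via SVS when enabled), and finally let cpu_load_pmap() perform the
 * actual address space switch.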
3591 */ 3592 #ifdef PAE 3593 pcb->pcb_cr3 = ci->ci_pae_l3_pdirpa; 3594 #else 3595 pcb->pcb_cr3 = pmap_pdirpa(pmap, 0); 3596 #endif 3597 3598 #ifdef i386 3599 #ifndef XENPV 3600 ci->ci_tss->tss.tss_ldt = pmap->pm_ldt_sel; 3601 ci->ci_tss->tss.tss_cr3 = pcb->pcb_cr3; 3602 #endif 3603 #endif 3604 3605 #if defined(SVS) && defined(USER_LDT) 3606 if (svs_enabled) { 3607 svs_ldt_sync(pmap); 3608 } else 3609 #endif 3610 lldt(pmap->pm_ldt_sel); 3611 3612 cpu_load_pmap(pmap, oldpmap); 3613 } 3614 3615 /* 3616 * pmap_deactivate: deactivate a process' pmap. 3617 * 3618 * => Must be called with kernel preemption disabled (high IPL is enough). 3619 */ 3620 void 3621 pmap_deactivate(struct lwp *l) 3622 { 3623 struct pmap *pmap; 3624 struct cpu_info *ci; 3625 3626 KASSERT(kpreempt_disabled()); 3627 3628 if (l != curlwp) { 3629 return; 3630 } 3631 3632 /* 3633 * Wait for pending TLB shootdowns to complete. Necessary because 3634 * TLB shootdown state is per-CPU, and the LWP may be coming off 3635 * the CPU before it has a chance to call pmap_update(), e.g. due 3636 * to kernel preemption or blocking routine in between. 3637 */ 3638 pmap_tlb_shootnow(); 3639 3640 ci = curcpu(); 3641 3642 if (ci->ci_want_pmapload) { 3643 /* 3644 * ci_want_pmapload means that our pmap is not loaded on 3645 * the CPU or TLB might be stale. note that pmap_kernel() 3646 * is always considered loaded. 3647 */ 3648 KASSERT(vm_map_pmap(&l->l_proc->p_vmspace->vm_map) 3649 != pmap_kernel()); 3650 KASSERT(vm_map_pmap(&l->l_proc->p_vmspace->vm_map) 3651 != ci->ci_pmap || ci->ci_tlbstate != TLBSTATE_VALID); 3652 3653 /* 3654 * userspace has not been touched. 3655 * nothing to do here. 3656 */ 3657 3658 ci->ci_want_pmapload = 0; 3659 return; 3660 } 3661 3662 pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map); 3663 3664 if (pmap == pmap_kernel()) { 3665 return; 3666 } 3667 3668 KASSERT_PDIRPA(pmap); 3669 KASSERT(ci->ci_pmap == pmap); 3670 3671 /* 3672 * we aren't interested in TLB invalidations for this pmap, 3673 * at least for the time being. 3674 */ 3675 3676 KASSERT(ci->ci_tlbstate == TLBSTATE_VALID); 3677 ci->ci_tlbstate = TLBSTATE_LAZY; 3678 } 3679 3680 /* 3681 * some misc. 
functions 3682 */ 3683 3684 bool 3685 pmap_pdes_valid(vaddr_t va, pd_entry_t * const *pdes, pd_entry_t *lastpde, 3686 int *lastlvl) 3687 { 3688 unsigned long index; 3689 pd_entry_t pde; 3690 int i; 3691 3692 for (i = PTP_LEVELS; i > 1; i--) { 3693 index = pl_i(va, i); 3694 pde = pdes[i - 2][index]; 3695 if ((pde & PTE_P) == 0) { 3696 *lastlvl = i; 3697 return false; 3698 } 3699 if (pde & PTE_PS) 3700 break; 3701 } 3702 if (lastpde != NULL) 3703 *lastpde = pde; 3704 *lastlvl = i; 3705 return true; 3706 } 3707 3708 /* 3709 * pmap_extract: extract a PA for the given VA 3710 */ 3711 bool 3712 pmap_extract(struct pmap *pmap, vaddr_t va, paddr_t *pap) 3713 { 3714 pt_entry_t *ptes, pte; 3715 pd_entry_t pde; 3716 pd_entry_t * const *pdes; 3717 struct pmap *pmap2; 3718 paddr_t pa; 3719 bool rv; 3720 int lvl; 3721 3722 if (__predict_false(pmap->pm_extract != NULL)) { 3723 return (*pmap->pm_extract)(pmap, va, pap); 3724 } 3725 3726 #ifdef __HAVE_DIRECT_MAP 3727 if (va >= PMAP_DIRECT_BASE && va < PMAP_DIRECT_END) { 3728 if (pap != NULL) { 3729 *pap = PMAP_DIRECT_UNMAP(va); 3730 } 3731 return true; 3732 } 3733 #endif 3734 3735 rv = false; 3736 pa = 0; 3737 3738 if (pmap != pmap_kernel()) { 3739 mutex_enter(&pmap->pm_lock); 3740 } 3741 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); 3742 if (pmap_pdes_valid(va, pdes, &pde, &lvl)) { 3743 if (lvl == 2) { 3744 pa = (pde & PTE_LGFRAME) | (va & (NBPD_L2 - 1)); 3745 rv = true; 3746 } else { 3747 KASSERT(lvl == 1); 3748 pte = ptes[pl1_i(va)]; 3749 if (__predict_true((pte & PTE_P) != 0)) { 3750 pa = pmap_pte2pa(pte) | (va & (NBPD_L1 - 1)); 3751 rv = true; 3752 } 3753 } 3754 } 3755 pmap_unmap_ptes(pmap, pmap2); 3756 if (pmap != pmap_kernel()) { 3757 mutex_exit(&pmap->pm_lock); 3758 } 3759 if (pap != NULL) { 3760 *pap = pa; 3761 } 3762 3763 return rv; 3764 } 3765 3766 /* 3767 * vtophys: virtual address to physical address. For use by 3768 * machine-dependent code only. 3769 */ 3770 paddr_t 3771 vtophys(vaddr_t va) 3772 { 3773 paddr_t pa; 3774 3775 if (pmap_extract(pmap_kernel(), va, &pa) == true) 3776 return pa; 3777 return 0; 3778 } 3779 3780 __strict_weak_alias(pmap_extract_ma, pmap_extract); 3781 3782 #ifdef XENPV 3783 /* 3784 * vtomach: virtual address to machine address. For use by 3785 * machine-dependent code only. 3786 */ 3787 paddr_t 3788 vtomach(vaddr_t va) 3789 { 3790 paddr_t pa; 3791 3792 if (pmap_extract_ma(pmap_kernel(), va, &pa) == true) 3793 return pa; 3794 return 0; 3795 } 3796 #endif 3797 3798 /* 3799 * pmap_virtual_space: used during bootup [pmap_steal_memory] to 3800 * determine the bounds of the kernel virtual addess space. 
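 *
 * A minimal usage sketch (hypothetical caller; the comment above names
 * pmap_steal_memory as the real user):
 *
 *	vaddr_t vstart, vend;
 *	pmap_virtual_space(&vstart, &vend);
 *	(vstart and vend now hold virtual_avail and virtual_end)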
3801 */ 3802 void 3803 pmap_virtual_space(vaddr_t *startp, vaddr_t *endp) 3804 { 3805 *startp = virtual_avail; 3806 *endp = virtual_end; 3807 } 3808 3809 void 3810 pmap_zero_page(paddr_t pa) 3811 { 3812 #if defined(__HAVE_DIRECT_MAP) 3813 memset(PAGE_ALIGNED(PMAP_DIRECT_MAP(pa)), 0, PAGE_SIZE); 3814 #else 3815 #if defined(XENPV) 3816 if (XEN_VERSION_SUPPORTED(3, 4)) 3817 xen_pagezero(pa); 3818 #endif 3819 struct cpu_info *ci; 3820 pt_entry_t *zpte; 3821 vaddr_t zerova; 3822 3823 const pd_entry_t pteflags = PTE_P | PTE_W | pmap_pg_nx | PTE_D | PTE_A; 3824 3825 kpreempt_disable(); 3826 3827 ci = curcpu(); 3828 zerova = ci->vpage[VPAGE_ZER]; 3829 zpte = ci->vpage_pte[VPAGE_ZER]; 3830 3831 KASSERTMSG(!*zpte, "pmap_zero_page: lock botch"); 3832 3833 pmap_pte_set(zpte, pmap_pa2pte(pa) | pteflags); 3834 pmap_pte_flush(); 3835 pmap_update_pg(zerova); /* flush TLB */ 3836 3837 memset(PAGE_ALIGNED(zerova), 0, PAGE_SIZE); 3838 3839 #if defined(DIAGNOSTIC) || defined(XENPV) 3840 pmap_pte_set(zpte, 0); /* zap ! */ 3841 pmap_pte_flush(); 3842 #endif 3843 3844 kpreempt_enable(); 3845 #endif /* defined(__HAVE_DIRECT_MAP) */ 3846 } 3847 3848 void 3849 pmap_copy_page(paddr_t srcpa, paddr_t dstpa) 3850 { 3851 #if defined(__HAVE_DIRECT_MAP) 3852 vaddr_t srcva = PMAP_DIRECT_MAP(srcpa); 3853 vaddr_t dstva = PMAP_DIRECT_MAP(dstpa); 3854 3855 memcpy(PAGE_ALIGNED(dstva), PAGE_ALIGNED(srcva), PAGE_SIZE); 3856 #else 3857 #if defined(XENPV) 3858 if (XEN_VERSION_SUPPORTED(3, 4)) { 3859 xen_copy_page(srcpa, dstpa); 3860 return; 3861 } 3862 #endif 3863 struct cpu_info *ci; 3864 pt_entry_t *srcpte, *dstpte; 3865 vaddr_t srcva, dstva; 3866 3867 const pd_entry_t pteflags = PTE_P | PTE_W | pmap_pg_nx | PTE_A; 3868 3869 kpreempt_disable(); 3870 3871 ci = curcpu(); 3872 srcva = ci->vpage[VPAGE_SRC]; 3873 dstva = ci->vpage[VPAGE_DST]; 3874 srcpte = ci->vpage_pte[VPAGE_SRC]; 3875 dstpte = ci->vpage_pte[VPAGE_DST]; 3876 3877 KASSERT(*srcpte == 0 && *dstpte == 0); 3878 3879 pmap_pte_set(srcpte, pmap_pa2pte(srcpa) | pteflags); 3880 pmap_pte_set(dstpte, pmap_pa2pte(dstpa) | pteflags | PTE_D); 3881 pmap_pte_flush(); 3882 pmap_update_pg(srcva); 3883 pmap_update_pg(dstva); 3884 3885 memcpy(PAGE_ALIGNED(dstva), PAGE_ALIGNED(srcva), PAGE_SIZE); 3886 3887 #if defined(DIAGNOSTIC) || defined(XENPV) 3888 pmap_pte_set(srcpte, 0); 3889 pmap_pte_set(dstpte, 0); 3890 pmap_pte_flush(); 3891 #endif 3892 3893 kpreempt_enable(); 3894 #endif /* defined(__HAVE_DIRECT_MAP) */ 3895 } 3896 3897 static pt_entry_t * 3898 pmap_map_ptp(struct vm_page *ptp) 3899 { 3900 #ifdef __HAVE_DIRECT_MAP 3901 return (void *)PMAP_DIRECT_MAP(VM_PAGE_TO_PHYS(ptp)); 3902 #else 3903 struct cpu_info *ci; 3904 pt_entry_t *ptppte; 3905 vaddr_t ptpva; 3906 3907 KASSERT(kpreempt_disabled()); 3908 3909 #ifndef XENPV 3910 const pd_entry_t pteflags = PTE_P | PTE_W | pmap_pg_nx | PTE_A | PTE_D; 3911 #else 3912 const pd_entry_t pteflags = PTE_P | pmap_pg_nx | PTE_A | PTE_D; 3913 #endif 3914 3915 ci = curcpu(); 3916 ptpva = ci->vpage[VPAGE_PTP]; 3917 ptppte = ci->vpage_pte[VPAGE_PTP]; 3918 3919 pmap_pte_set(ptppte, pmap_pa2pte(VM_PAGE_TO_PHYS(ptp)) | pteflags); 3920 3921 pmap_pte_flush(); 3922 pmap_update_pg(ptpva); 3923 3924 return (pt_entry_t *)ptpva; 3925 #endif 3926 } 3927 3928 static void 3929 pmap_unmap_ptp(void) 3930 { 3931 #ifndef __HAVE_DIRECT_MAP 3932 #if defined(DIAGNOSTIC) || defined(XENPV) 3933 struct cpu_info *ci; 3934 pt_entry_t *pte; 3935 3936 KASSERT(kpreempt_disabled()); 3937 3938 ci = curcpu(); 3939 pte = ci->vpage_pte[VPAGE_PTP]; 3940 3941 if (*pte != 0) { 3942 
pmap_pte_set(pte, 0); 3943 pmap_pte_flush(); 3944 } 3945 #endif 3946 #endif 3947 } 3948 3949 static pt_entry_t * 3950 pmap_map_pte(struct pmap *pmap, struct vm_page *ptp, vaddr_t va) 3951 { 3952 3953 KASSERT(kpreempt_disabled()); 3954 if (pmap_is_curpmap(pmap)) { 3955 return &PTE_BASE[pl1_i(va)]; /* (k)vtopte */ 3956 } 3957 KASSERT(ptp != NULL); 3958 return pmap_map_ptp(ptp) + pl1_pi(va); 3959 } 3960 3961 static void 3962 pmap_unmap_pte(void) 3963 { 3964 3965 KASSERT(kpreempt_disabled()); 3966 3967 pmap_unmap_ptp(); 3968 } 3969 3970 /* 3971 * p m a p r e m o v e f u n c t i o n s 3972 * 3973 * functions that remove mappings 3974 */ 3975 3976 /* 3977 * pmap_remove_ptes: remove PTEs from a PTP 3978 * 3979 * => caller must hold pmap's lock 3980 * => PTP must be mapped into KVA 3981 * => PTP should be null if pmap == pmap_kernel() 3982 * => must be called with kernel preemption disabled 3983 * => returns composite pte if at least one page should be shot down 3984 */ 3985 static void 3986 pmap_remove_ptes(struct pmap *pmap, struct vm_page *ptp, vaddr_t ptpva, 3987 vaddr_t startva, vaddr_t endva) 3988 { 3989 pt_entry_t *pte = (pt_entry_t *)ptpva; 3990 3991 KASSERT(mutex_owned(&pmap->pm_lock)); 3992 KASSERT(kpreempt_disabled()); 3993 3994 /* 3995 * mappings are very often sparse, so clip the given range to the 3996 * range of PTEs that are known present in the PTP. 3997 */ 3998 pmap_ptp_range_clip(ptp, &startva, &pte); 3999 4000 /* 4001 * note that ptpva points to the PTE that maps startva. this may 4002 * or may not be the first PTE in the PTP. 4003 * 4004 * we loop through the PTP while there are still PTEs to look at 4005 * and the wire_count is greater than 1 (because we use the wire_count 4006 * to keep track of the number of real PTEs in the PTP). 4007 */ 4008 while (startva < endva && (ptp == NULL || ptp->wire_count > 1)) { 4009 (void)pmap_remove_pte(pmap, ptp, pte, startva); 4010 startva += PAGE_SIZE; 4011 pte++; 4012 } 4013 } 4014 4015 /* 4016 * pmap_remove_pte: remove a single PTE from a PTP. 4017 * 4018 * => caller must hold pmap's lock 4019 * => PTP must be mapped into KVA 4020 * => PTP should be null if pmap == pmap_kernel() 4021 * => returns true if we removed a mapping 4022 * => must be called with kernel preemption disabled 4023 */ 4024 static bool 4025 pmap_remove_pte(struct pmap *pmap, struct vm_page *ptp, pt_entry_t *pte, 4026 vaddr_t va) 4027 { 4028 struct pv_entry *pve; 4029 struct vm_page *pg; 4030 struct pmap_page *pp; 4031 pt_entry_t opte; 4032 4033 KASSERT(mutex_owned(&pmap->pm_lock)); 4034 KASSERT(kpreempt_disabled()); 4035 4036 if (!pmap_valid_entry(*pte)) { 4037 /* VA not mapped. */ 4038 return false; 4039 } 4040 4041 /* Atomically save the old PTE and zap it. */ 4042 opte = pmap_pte_testset(pte, 0); 4043 if (!pmap_valid_entry(opte)) { 4044 return false; 4045 } 4046 4047 pmap_exec_account(pmap, va, opte, 0); 4048 pmap_stats_update_bypte(pmap, 0, opte); 4049 4050 if (ptp) { 4051 /* 4052 * Dropping a PTE. Make sure that the PDE is flushed. 4053 */ 4054 ptp->wire_count--; 4055 if (ptp->wire_count <= 1) { 4056 opte |= PTE_A; 4057 } 4058 } 4059 4060 if ((opte & PTE_A) != 0) { 4061 pmap_tlb_shootdown(pmap, va, opte, TLBSHOOT_REMOVE_PTE); 4062 } 4063 4064 /* 4065 * If we are not on a pv list - we are done. 
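 * PTE_PVLIST is only set on mappings that were entered with pv
 * tracking, so if it is clear there is no pmap_page state to update.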
4066 */ 4067 if ((opte & PTE_PVLIST) == 0) { 4068 #ifndef DOM0OPS 4069 KASSERTMSG((PHYS_TO_VM_PAGE(pmap_pte2pa(opte)) == NULL), 4070 "managed page without PTE_PVLIST for %#"PRIxVADDR, va); 4071 KASSERTMSG((pmap_pv_tracked(pmap_pte2pa(opte)) == NULL), 4072 "pv-tracked page without PTE_PVLIST for %#"PRIxVADDR, va); 4073 #endif 4074 KASSERT(pmap_treelookup_pv(pmap, ptp, (ptp != NULL ? 4075 &VM_PAGE_TO_PP(ptp)->pp_rb : &pmap_kernel_rb), va) == NULL); 4076 return true; 4077 } 4078 4079 if ((pg = PHYS_TO_VM_PAGE(pmap_pte2pa(opte))) != NULL) { 4080 pp = VM_PAGE_TO_PP(pg); 4081 } else if ((pp = pmap_pv_tracked(pmap_pte2pa(opte))) == NULL) { 4082 paddr_t pa = pmap_pte2pa(opte); 4083 panic("%s: PTE_PVLIST with pv-untracked page" 4084 " va = %#"PRIxVADDR"pa = %#"PRIxPADDR" (%#"PRIxPADDR")", 4085 __func__, va, pa, atop(pa)); 4086 } 4087 4088 /* Sync R/M bits. */ 4089 pve = pmap_lookup_pv(pmap, ptp, pp, va); 4090 pmap_remove_pv(pmap, pp, ptp, va, pve, pmap_pte_to_pp_attrs(opte)); 4091 return true; 4092 } 4093 4094 static void 4095 pmap_remove_locked(struct pmap *pmap, vaddr_t sva, vaddr_t eva) 4096 { 4097 pt_entry_t *ptes; 4098 pd_entry_t pde; 4099 pd_entry_t * const *pdes; 4100 bool result; 4101 vaddr_t blkendva, va = sva; 4102 struct vm_page *ptp; 4103 struct pmap *pmap2; 4104 int lvl; 4105 4106 KASSERT(mutex_owned(&pmap->pm_lock)); 4107 4108 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); 4109 4110 /* 4111 * removing one page? take shortcut function. 4112 */ 4113 4114 if (va + PAGE_SIZE == eva) { 4115 if (pmap_pdes_valid(va, pdes, &pde, &lvl)) { 4116 KASSERT(lvl == 1); 4117 4118 /* Get PTP if non-kernel mapping. */ 4119 if (pmap != pmap_kernel()) { 4120 ptp = pmap_find_ptp(pmap, va, 1); 4121 KASSERTMSG(ptp != NULL, 4122 "%s: unmanaged PTP detected", __func__); 4123 } else { 4124 /* Never free kernel PTPs. */ 4125 ptp = NULL; 4126 } 4127 4128 result = pmap_remove_pte(pmap, ptp, 4129 &ptes[pl1_i(va)], va); 4130 4131 /* 4132 * if mapping removed and the PTP is no longer 4133 * being used, free it! 4134 */ 4135 4136 if (result && ptp && ptp->wire_count <= 1) 4137 pmap_free_ptp(pmap, ptp, va, ptes, pdes); 4138 } 4139 } else for (/* null */ ; va < eva ; va = blkendva) { 4140 /* determine range of block */ 4141 blkendva = x86_round_pdr(va+1); 4142 if (blkendva > eva) 4143 blkendva = eva; 4144 4145 if (!pmap_pdes_valid(va, pdes, &pde, &lvl)) { 4146 /* Skip a range corresponding to an invalid pde. */ 4147 blkendva = (va & ptp_frames[lvl - 1]) + nbpd[lvl - 1]; 4148 continue; 4149 } 4150 KASSERT(lvl == 1); 4151 4152 /* Get PTP if non-kernel mapping. */ 4153 if (pmap != pmap_kernel()) { 4154 ptp = pmap_find_ptp(pmap, va, 1); 4155 KASSERTMSG(ptp != NULL, "%s: unmanaged PTP detected", 4156 __func__); 4157 } else { 4158 /* Never free kernel PTPs. */ 4159 ptp = NULL; 4160 } 4161 4162 pmap_remove_ptes(pmap, ptp, (vaddr_t)&ptes[pl1_i(va)], va, 4163 blkendva); 4164 4165 /* If PTP is no longer being used, free it. */ 4166 if (ptp && ptp->wire_count <= 1) { 4167 pmap_free_ptp(pmap, ptp, va, ptes, pdes); 4168 } 4169 } 4170 pmap_unmap_ptes(pmap, pmap2); 4171 pmap_drain_pv(pmap); 4172 } 4173 4174 /* 4175 * pmap_remove: mapping removal function. 
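 *
 * Minimal usage sketch (generic names, not from this file): remove the
 * mappings, then commit the deferred shootdowns and frees with
 * pmap_update(), as pmap(9) requires of callers:
 *
 *	pmap_remove(map, sva, eva);
 *	pmap_update(map);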
4176 * 4177 * => caller should not be holding any pmap locks 4178 */ 4179 void 4180 pmap_remove(struct pmap *pmap, vaddr_t sva, vaddr_t eva) 4181 { 4182 if (__predict_false(pmap->pm_remove != NULL)) { 4183 (*pmap->pm_remove)(pmap, sva, eva); 4184 return; 4185 } 4186 4187 mutex_enter(&pmap->pm_lock); 4188 pmap_remove_locked(pmap, sva, eva); 4189 mutex_exit(&pmap->pm_lock); 4190 } 4191 4192 /* 4193 * pmap_sync_pv: clear pte bits and return the old value of the pp_attrs. 4194 * 4195 * => The 'clearbits' parameter is either ~0 or PP_ATTRS_... 4196 * => Caller should disable kernel preemption. 4197 * => issues tlb shootdowns if necessary. 4198 */ 4199 static int 4200 pmap_sync_pv(struct pv_pte *pvpte, paddr_t pa, int clearbits, uint8_t *oattrs, 4201 pt_entry_t *optep) 4202 { 4203 struct pmap *pmap; 4204 struct vm_page *ptp; 4205 vaddr_t va; 4206 pt_entry_t *ptep; 4207 pt_entry_t opte; 4208 pt_entry_t npte; 4209 pt_entry_t expect; 4210 bool need_shootdown; 4211 4212 ptp = pvpte->pte_ptp; 4213 va = pvpte->pte_va; 4214 KASSERT(ptp == NULL || ptp->uobject != NULL); 4215 KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset); 4216 pmap = ptp_to_pmap(ptp); 4217 KASSERT(kpreempt_disabled()); 4218 4219 if (__predict_false(pmap->pm_sync_pv != NULL)) { 4220 return (*pmap->pm_sync_pv)(ptp, va, pa, clearbits, oattrs, 4221 optep); 4222 } 4223 4224 expect = pmap_pa2pte(pa) | PTE_P; 4225 4226 if (clearbits != ~0) { 4227 KASSERT((clearbits & ~(PP_ATTRS_D|PP_ATTRS_A|PP_ATTRS_W)) == 0); 4228 clearbits = pmap_pp_attrs_to_pte(clearbits); 4229 } 4230 4231 ptep = pmap_map_pte(pmap, ptp, va); 4232 do { 4233 opte = *ptep; 4234 KASSERT((opte & (PTE_D | PTE_A)) != PTE_D); 4235 KASSERT((opte & (PTE_A | PTE_P)) != PTE_A); 4236 KASSERT(opte == 0 || (opte & PTE_P) != 0); 4237 if ((opte & (PTE_FRAME | PTE_P)) != expect) { 4238 /* 4239 * We lost a race with a V->P operation like 4240 * pmap_remove(). Wait for the competitor 4241 * reflecting pte bits into mp_attrs. 4242 */ 4243 pmap_unmap_pte(); 4244 return EAGAIN; 4245 } 4246 4247 /* 4248 * Check if there's anything to do on this PTE. 4249 */ 4250 if ((opte & clearbits) == 0) { 4251 need_shootdown = false; 4252 break; 4253 } 4254 4255 /* 4256 * We need a shootdown if the PTE is cached (PTE_A) ... 4257 * ... Unless we are clearing only the PTE_W bit and 4258 * it isn't cached as RW (PTE_D). 4259 */ 4260 need_shootdown = (opte & PTE_A) != 0 && 4261 !(clearbits == PTE_W && (opte & PTE_D) == 0); 4262 4263 npte = opte & ~clearbits; 4264 4265 /* 4266 * If we need a shootdown anyway, clear PTE_A and PTE_D. 
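		 * The state just captured in opte is folded into *oattrs
		 * below; a later access will simply set the bits again, so
		 * nothing is lost by clearing them while the TLB entry is
		 * being shot down anyway.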
4267 */ 4268 if (need_shootdown) { 4269 npte &= ~(PTE_A | PTE_D); 4270 } 4271 KASSERT((npte & (PTE_D | PTE_A)) != PTE_D); 4272 KASSERT((npte & (PTE_A | PTE_P)) != PTE_A); 4273 KASSERT(npte == 0 || (opte & PTE_P) != 0); 4274 } while (pmap_pte_cas(ptep, opte, npte) != opte); 4275 4276 if (need_shootdown) { 4277 pmap_tlb_shootdown(pmap, va, opte, TLBSHOOT_SYNC_PV); 4278 } 4279 pmap_unmap_pte(); 4280 4281 *oattrs = pmap_pte_to_pp_attrs(opte); 4282 if (optep != NULL) 4283 *optep = opte; 4284 return 0; 4285 } 4286 4287 static void 4288 pmap_pp_remove_ent(struct pmap *pmap, struct vm_page *ptp, pt_entry_t opte, 4289 vaddr_t va) 4290 { 4291 struct pmap *pmap2; 4292 pt_entry_t *ptes; 4293 pd_entry_t * const *pdes; 4294 4295 KASSERT(mutex_owned(&pmap->pm_lock)); 4296 4297 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); 4298 pmap_stats_update_bypte(pmap, 0, opte); 4299 ptp->wire_count--; 4300 if (ptp->wire_count <= 1) { 4301 pmap_free_ptp(pmap, ptp, va, ptes, pdes); 4302 } 4303 pmap_unmap_ptes(pmap, pmap2); 4304 } 4305 4306 static void 4307 pmap_pp_remove(struct pmap_page *pp, paddr_t pa) 4308 { 4309 struct pv_pte *pvpte; 4310 struct vm_page *ptp; 4311 uintptr_t sum; 4312 uint8_t oattrs; 4313 bool locked; 4314 4315 /* 4316 * Do an unlocked check to see if the page has no mappings, eg when 4317 * pmap_remove_all() was called before amap_wipeout() for a process 4318 * private amap - common. The page being removed must be on the way 4319 * out, so we don't have to worry about concurrent attempts to enter 4320 * it (otherwise the caller either doesn't care or has screwed up). 4321 */ 4322 sum = (uintptr_t)atomic_load_relaxed(&pp->pp_pte.pte_va); 4323 sum |= (uintptr_t)atomic_load_relaxed(&pp->pp_pte.pte_ptp); 4324 sum |= (uintptr_t)atomic_load_relaxed(&pp->pp_pvlist.lh_first); 4325 if (sum == 0) { 4326 return; 4327 } 4328 4329 kpreempt_disable(); 4330 for (;;) { 4331 struct pmap *pmap; 4332 struct pv_entry *pve; 4333 pt_entry_t opte; 4334 vaddr_t va; 4335 4336 mutex_spin_enter(&pp->pp_lock); 4337 if ((pvpte = pv_pte_first(pp)) == NULL) { 4338 mutex_spin_exit(&pp->pp_lock); 4339 break; 4340 } 4341 4342 /* 4343 * Add a reference to the pmap before clearing the pte. 4344 * Otherwise the pmap can disappear behind us. 4345 */ 4346 ptp = pvpte->pte_ptp; 4347 pmap = ptp_to_pmap(ptp); 4348 KASSERT(pmap->pm_obj[0].uo_refs > 0); 4349 if (ptp != NULL) { 4350 pmap_reference(pmap); 4351 } 4352 4353 /* 4354 * Now try to lock it. We need a direct handoff between 4355 * pp_lock and pm_lock to know the pv_entry is kept intact 4356 * and kept associated with this pmap. If that can't be 4357 * had, wait for the pmap's lock to become free and then 4358 * retry. 4359 */ 4360 locked = mutex_tryenter(&pmap->pm_lock); 4361 mutex_spin_exit(&pp->pp_lock); 4362 if (!locked) { 4363 mutex_enter(&pmap->pm_lock); 4364 /* nothing, just wait for it */ 4365 mutex_exit(&pmap->pm_lock); 4366 if (ptp != NULL) { 4367 pmap_destroy(pmap); 4368 } 4369 continue; 4370 } 4371 va = pvpte->pte_va; 4372 4373 KASSERTMSG(pmap->pm_stats.resident_count > PDP_SIZE, 4374 "va %lx pmap %p ptp %p is empty", va, pmap, ptp); 4375 KASSERTMSG(ptp == NULL || (ptp->flags & PG_FREE) == 0, 4376 "va %lx pmap %p ptp %p is free", va, pmap, ptp); 4377 KASSERTMSG(ptp == NULL || ptp->wire_count > 1, 4378 "va %lx pmap %p ptp %p is empty", va, pmap, ptp); 4379 4380 #ifdef DEBUG 4381 pmap_check_pv(pmap, ptp, pp, pvpte->pte_va, true); 4382 rb_tree_t *tree = (ptp != NULL ? 
4383 &VM_PAGE_TO_PP(ptp)->pp_rb : &pmap_kernel_rb); 4384 pve = pmap_treelookup_pv(pmap, ptp, tree, va); 4385 if (pve == NULL) { 4386 KASSERTMSG(&pp->pp_pte == pvpte, 4387 "va %lx pmap %p ptp %p pvpte %p pve %p oops 1", 4388 va, pmap, ptp, pvpte, pve); 4389 } else { 4390 KASSERTMSG(&pve->pve_pte == pvpte, 4391 "va %lx pmap %p ptp %p pvpte %p pve %p oops 2", 4392 va, pmap, ptp, pvpte, pve); 4393 } 4394 #endif 4395 4396 if (pmap_sync_pv(pvpte, pa, ~0, &oattrs, &opte)) { 4397 panic("pmap_pp_remove: mapping not present"); 4398 } 4399 4400 pve = pmap_lookup_pv(pmap, ptp, pp, va); 4401 pmap_remove_pv(pmap, pp, ptp, va, pve, oattrs); 4402 4403 /* Update the PTP reference count. Free if last reference. */ 4404 if (ptp != NULL) { 4405 KASSERT(pmap != pmap_kernel()); 4406 pmap_tlb_shootnow(); 4407 if (__predict_false(pmap->pm_pp_remove_ent != NULL)) { 4408 (*pmap->pm_pp_remove_ent)(pmap, ptp, opte, va); 4409 } else { 4410 pmap_pp_remove_ent(pmap, ptp, opte, va); 4411 } 4412 } else { 4413 KASSERT(pmap == pmap_kernel()); 4414 pmap_stats_update_bypte(pmap, 0, opte); 4415 } 4416 pmap_tlb_shootnow(); 4417 pmap_drain_pv(pmap); 4418 mutex_exit(&pmap->pm_lock); 4419 if (ptp != NULL) { 4420 pmap_destroy(pmap); 4421 } 4422 } 4423 kpreempt_enable(); 4424 } 4425 4426 /* 4427 * pmap_page_remove: remove a managed vm_page from all pmaps that map it 4428 * 4429 * => R/M bits are sync'd back to attrs 4430 */ 4431 void 4432 pmap_page_remove(struct vm_page *pg) 4433 { 4434 struct pmap_page *pp; 4435 paddr_t pa; 4436 4437 pp = VM_PAGE_TO_PP(pg); 4438 pa = VM_PAGE_TO_PHYS(pg); 4439 pmap_pp_remove(pp, pa); 4440 } 4441 4442 /* 4443 * pmap_pv_remove: remove an unmanaged pv-tracked page from all pmaps 4444 * that map it 4445 */ 4446 void 4447 pmap_pv_remove(paddr_t pa) 4448 { 4449 struct pmap_page *pp; 4450 4451 pp = pmap_pv_tracked(pa); 4452 if (pp == NULL) 4453 panic("%s: page not pv-tracked: %#"PRIxPADDR, __func__, pa); 4454 pmap_pp_remove(pp, pa); 4455 } 4456 4457 /* 4458 * p m a p a t t r i b u t e f u n c t i o n s 4459 * functions that test/change managed page's attributes 4460 * since a page can be mapped multiple times we must check each PTE that 4461 * maps it by going down the pv lists. 4462 */ 4463 4464 /* 4465 * pmap_test_attrs: test a page's attributes 4466 */ 4467 bool 4468 pmap_test_attrs(struct vm_page *pg, unsigned testbits) 4469 { 4470 struct pmap_page *pp; 4471 struct pv_pte *pvpte; 4472 struct pmap *pmap; 4473 uint8_t oattrs; 4474 u_int result; 4475 paddr_t pa; 4476 4477 pp = VM_PAGE_TO_PP(pg); 4478 if ((pp->pp_attrs & testbits) != 0) { 4479 return true; 4480 } 4481 pa = VM_PAGE_TO_PHYS(pg); 4482 startover: 4483 mutex_spin_enter(&pp->pp_lock); 4484 for (pvpte = pv_pte_first(pp); pvpte; pvpte = pv_pte_next(pp, pvpte)) { 4485 if ((pp->pp_attrs & testbits) != 0) { 4486 break; 4487 } 4488 if (pmap_sync_pv(pvpte, pa, 0, &oattrs, NULL)) { 4489 /* 4490 * raced with a V->P operation. wait for the other 4491 * side to finish by acquring pmap's lock. if no 4492 * wait, updates to pp_attrs by the other side may 4493 * go unseen. 4494 */ 4495 pmap = ptp_to_pmap(pvpte->pte_ptp); 4496 pmap_reference(pmap); 4497 mutex_spin_exit(&pp->pp_lock); 4498 mutex_enter(&pmap->pm_lock); 4499 /* nothing. */ 4500 mutex_exit(&pmap->pm_lock); 4501 pmap_destroy(pmap); 4502 goto startover; 4503 } 4504 pp->pp_attrs |= oattrs; 4505 } 4506 result = pp->pp_attrs & testbits; 4507 mutex_spin_exit(&pp->pp_lock); 4508 4509 /* 4510 * note that we will exit the for loop with a non-null pve if 4511 * we have found the bits we are testing for. 
4512 */ 4513 4514 return result != 0; 4515 } 4516 4517 static bool 4518 pmap_pp_clear_attrs(struct pmap_page *pp, paddr_t pa, unsigned clearbits) 4519 { 4520 struct pv_pte *pvpte; 4521 struct pmap *pmap; 4522 uint8_t oattrs; 4523 u_int result; 4524 4525 startover: 4526 mutex_spin_enter(&pp->pp_lock); 4527 for (pvpte = pv_pte_first(pp); pvpte; pvpte = pv_pte_next(pp, pvpte)) { 4528 if (pmap_sync_pv(pvpte, pa, clearbits, &oattrs, NULL)) { 4529 /* 4530 * raced with a V->P operation. wait for the other 4531 * side to finish by acquring pmap's lock. it is 4532 * probably unmapping the page, and it will be gone 4533 * when the loop is restarted. 4534 */ 4535 pmap = ptp_to_pmap(pvpte->pte_ptp); 4536 pmap_reference(pmap); 4537 mutex_spin_exit(&pp->pp_lock); 4538 mutex_enter(&pmap->pm_lock); 4539 /* nothing. */ 4540 mutex_exit(&pmap->pm_lock); 4541 pmap_destroy(pmap); 4542 goto startover; 4543 } 4544 pp->pp_attrs |= oattrs; 4545 } 4546 result = pp->pp_attrs & clearbits; 4547 pp->pp_attrs &= ~clearbits; 4548 pmap_tlb_shootnow(); 4549 mutex_spin_exit(&pp->pp_lock); 4550 4551 return result != 0; 4552 } 4553 4554 /* 4555 * pmap_clear_attrs: clear the specified attribute for a page. 4556 * 4557 * => we return true if we cleared one of the bits we were asked to 4558 */ 4559 bool 4560 pmap_clear_attrs(struct vm_page *pg, unsigned clearbits) 4561 { 4562 struct pmap_page *pp; 4563 paddr_t pa; 4564 4565 pp = VM_PAGE_TO_PP(pg); 4566 pa = VM_PAGE_TO_PHYS(pg); 4567 4568 /* 4569 * If this is a new page, assert it has no mappings and simply zap 4570 * the stored attributes without taking any locks. 4571 */ 4572 if ((pg->flags & PG_FAKE) != 0) { 4573 KASSERT(atomic_load_relaxed(&pp->pp_pte.pte_va) == 0); 4574 KASSERT(atomic_load_relaxed(&pp->pp_pte.pte_ptp) == NULL); 4575 KASSERT(atomic_load_relaxed(&pp->pp_pvlist.lh_first) == NULL); 4576 atomic_store_relaxed(&pp->pp_attrs, 0); 4577 return false; 4578 } else { 4579 return pmap_pp_clear_attrs(pp, pa, clearbits); 4580 } 4581 } 4582 4583 /* 4584 * pmap_pv_clear_attrs: clear the specified attributes for an unmanaged 4585 * pv-tracked page. 4586 */ 4587 bool 4588 pmap_pv_clear_attrs(paddr_t pa, unsigned clearbits) 4589 { 4590 struct pmap_page *pp; 4591 4592 pp = pmap_pv_tracked(pa); 4593 if (pp == NULL) 4594 panic("%s: page not pv-tracked: %#"PRIxPADDR, __func__, pa); 4595 4596 return pmap_pp_clear_attrs(pp, pa, clearbits); 4597 } 4598 4599 /* 4600 * p m a p p r o t e c t i o n f u n c t i o n s 4601 */ 4602 4603 /* 4604 * pmap_page_protect: change the protection of all recorded mappings 4605 * of a managed page 4606 * 4607 * => NOTE: this is an inline function in pmap.h 4608 */ 4609 4610 /* see pmap.h */ 4611 4612 /* 4613 * pmap_pv_protect: change the protection of all recorded mappings 4614 * of an unmanaged pv-tracked page 4615 * 4616 * => NOTE: this is an inline function in pmap.h 4617 */ 4618 4619 /* see pmap.h */ 4620 4621 /* 4622 * pmap_protect: set the protection in of the pages in a pmap 4623 * 4624 * => NOTE: this is an inline function in pmap.h 4625 */ 4626 4627 /* see pmap.h */ 4628 4629 /* 4630 * pmap_write_protect: write-protect pages in a pmap. 4631 * 4632 * Note for Xen-amd64. Xen automatically adds PTE_U to the kernel pages, but we 4633 * don't need to remove this bit when re-entering the PTEs here: Xen tracks the 4634 * kernel pages with a reserved bit (_PAGE_GUEST_KERNEL), so even if PTE_U is 4635 * present the page will still be considered as a kernel page, and the privilege 4636 * separation will be enforced correctly. 
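 *
 * Illustrative call (generic names): the pmap_protect() inline in pmap.h
 * reduces to something like the following, which clears PTE_W and/or sets
 * the NX bit on every present PTE in the page-aligned range:
 *
 *	pmap_write_protect(map, sva, eva, VM_PROT_READ);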
 */
void
pmap_write_protect(struct pmap *pmap, vaddr_t sva, vaddr_t eva, vm_prot_t prot)
{
	pt_entry_t bit_rem, bit_put;
	pt_entry_t *ptes;
	pd_entry_t * const *pdes;
	struct pmap *pmap2;
	vaddr_t blockend, va;
	int lvl, i;

	if (__predict_false(pmap->pm_write_protect != NULL)) {
		(*pmap->pm_write_protect)(pmap, sva, eva, prot);
		return;
	}

	bit_rem = 0;
	if (!(prot & VM_PROT_WRITE))
		bit_rem = PTE_W;

	bit_put = 0;
	if (!(prot & VM_PROT_EXECUTE))
		bit_put = pmap_pg_nx;

	sva &= ~PAGE_MASK;
	eva &= ~PAGE_MASK;

	/*
	 * Acquire pmap.  No need to lock the kernel pmap as we won't
	 * be touching PV entries nor stats and kernel PDEs aren't
	 * freed.
	 */
	if (pmap != pmap_kernel()) {
		mutex_enter(&pmap->pm_lock);
	}
	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);

	for (va = sva ; va < eva; va = blockend) {
		pt_entry_t *spte, *epte;

		blockend = x86_round_pdr(va + 1);
		if (blockend > eva)
			blockend = eva;

		/* Is it a valid block? */
		if (!pmap_pdes_valid(va, pdes, NULL, &lvl)) {
			continue;
		}
		KASSERT(va < VM_MAXUSER_ADDRESS || va >= VM_MAX_ADDRESS);
		KASSERT(lvl == 1);

		spte = &ptes[pl1_i(va)];
		epte = &ptes[pl1_i(blockend)];

		for (i = 0; spte < epte; spte++, i++) {
			pt_entry_t opte, npte;

			do {
				opte = *spte;
				if (!pmap_valid_entry(opte)) {
					goto next;
				}
				npte = (opte & ~bit_rem) | bit_put;
			} while (pmap_pte_cas(spte, opte, npte) != opte);

			if ((opte & PTE_D) != 0) {
				vaddr_t tva = va + x86_ptob(i);
				pmap_tlb_shootdown(pmap, tva, opte,
				    TLBSHOOT_WRITE_PROTECT);
			}
next:;
		}
	}

	/* Release pmap. */
	pmap_unmap_ptes(pmap, pmap2);
	if (pmap != pmap_kernel()) {
		mutex_exit(&pmap->pm_lock);
	}
}

/*
 * pmap_unwire: clear the wired bit in the PTE.
 *
 * => Mapping should already be present.
 */
void
pmap_unwire(struct pmap *pmap, vaddr_t va)
{
	pt_entry_t *ptes, *ptep, opte;
	pd_entry_t * const *pdes;
	struct pmap *pmap2;
	int lvl;

	if (__predict_false(pmap->pm_unwire != NULL)) {
		(*pmap->pm_unwire)(pmap, va);
		return;
	}

	/*
	 * Acquire pmap.  Need to lock the kernel pmap only to protect the
	 * statistics.
	 */
	mutex_enter(&pmap->pm_lock);
	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);

	if (!pmap_pdes_valid(va, pdes, NULL, &lvl)) {
		panic("%s: invalid PDE va=%#" PRIxVADDR, __func__, va);
	}
	KASSERT(lvl == 1);

	ptep = &ptes[pl1_i(va)];
	opte = *ptep;
	KASSERT(pmap_valid_entry(opte));

	if (opte & PTE_WIRED) {
		pt_entry_t npte = opte & ~PTE_WIRED;

		opte = pmap_pte_testset(ptep, npte);
		pmap_stats_update_bypte(pmap, npte, opte);
	} else {
		printf("%s: wiring for pmap %p va %#" PRIxVADDR
		    " did not change!\n", __func__, pmap, va);
	}

	/* Release pmap.
*/ 4763 pmap_unmap_ptes(pmap, pmap2); 4764 mutex_exit(&pmap->pm_lock); 4765 } 4766 4767 /* 4768 * pmap_copy: copy mappings from one pmap to another 4769 * 4770 * => optional function 4771 * void pmap_copy(dst_pmap, src_pmap, dst_addr, len, src_addr) 4772 */ 4773 4774 /* 4775 * defined as macro in pmap.h 4776 */ 4777 4778 __strict_weak_alias(pmap_enter, pmap_enter_default); 4779 4780 int 4781 pmap_enter_default(pmap_t pmap, vaddr_t va, paddr_t pa, vm_prot_t prot, 4782 u_int flags) 4783 { 4784 if (__predict_false(pmap->pm_enter != NULL)) { 4785 return (*pmap->pm_enter)(pmap, va, pa, prot, flags); 4786 } 4787 4788 return pmap_enter_ma(pmap, va, pa, pa, prot, flags, 0); 4789 } 4790 4791 /* 4792 * pmap_enter: enter a mapping into a pmap 4793 * 4794 * => must be done "now" ... no lazy-evaluation 4795 */ 4796 int 4797 pmap_enter_ma(struct pmap *pmap, vaddr_t va, paddr_t ma, paddr_t pa, 4798 vm_prot_t prot, u_int flags, int domid) 4799 { 4800 pt_entry_t *ptes, opte, npte; 4801 pt_entry_t *ptep; 4802 pd_entry_t * const *pdes; 4803 struct vm_page *ptp; 4804 struct vm_page *new_pg, *old_pg; 4805 struct pmap_page *new_pp, *old_pp; 4806 struct pv_entry *old_pve, *new_pve; 4807 bool wired = (flags & PMAP_WIRED) != 0; 4808 struct pmap *pmap2; 4809 struct pmap_ptparray pt; 4810 int error; 4811 bool getptp, samepage, new_embedded; 4812 rb_tree_t *tree; 4813 4814 KASSERT(pmap_initialized); 4815 KASSERT(va < VM_MAX_KERNEL_ADDRESS); 4816 KASSERTMSG(va != (vaddr_t)PDP_BASE, "%s: trying to map va=%#" 4817 PRIxVADDR " over PDP!", __func__, va); 4818 KASSERTMSG(va < VM_MIN_KERNEL_ADDRESS || 4819 pmap_valid_entry(pmap->pm_pdir[pl_i(va, PTP_LEVELS)]), 4820 "%s: missing kernel PTP for va=%#" PRIxVADDR, __func__, va); 4821 4822 #ifdef XENPV 4823 KASSERT(domid == DOMID_SELF || pa == 0); 4824 #endif 4825 4826 npte = ma | protection_codes[prot] | PTE_P; 4827 npte |= pmap_pat_flags(flags); 4828 if (wired) 4829 npte |= PTE_WIRED; 4830 if (va < VM_MAXUSER_ADDRESS) 4831 npte |= PTE_U; 4832 4833 if (pmap == pmap_kernel()) 4834 npte |= pmap_pg_g; 4835 if (flags & VM_PROT_ALL) { 4836 npte |= PTE_A; 4837 if (flags & VM_PROT_WRITE) { 4838 KASSERT((npte & PTE_W) != 0); 4839 npte |= PTE_D; 4840 } 4841 } 4842 4843 #ifdef XENPV 4844 if (domid != DOMID_SELF) 4845 new_pg = NULL; 4846 else 4847 #endif 4848 new_pg = PHYS_TO_VM_PAGE(pa); 4849 4850 if (new_pg != NULL) { 4851 /* This is a managed page */ 4852 npte |= PTE_PVLIST; 4853 new_pp = VM_PAGE_TO_PP(new_pg); 4854 PMAP_CHECK_PP(new_pp); 4855 } else if ((new_pp = pmap_pv_tracked(pa)) != NULL) { 4856 /* This is an unmanaged pv-tracked page */ 4857 npte |= PTE_PVLIST; 4858 PMAP_CHECK_PP(new_pp); 4859 } else { 4860 new_pp = NULL; 4861 } 4862 4863 /* Begin by locking the pmap. */ 4864 mutex_enter(&pmap->pm_lock); 4865 4866 /* Look up the PTP. Allocate if none present. */ 4867 ptp = NULL; 4868 getptp = false; 4869 if (pmap != pmap_kernel()) { 4870 ptp = pmap_find_ptp(pmap, va, 1); 4871 if (ptp == NULL) { 4872 getptp = true; 4873 error = pmap_get_ptp(pmap, &pt, va, flags, &ptp); 4874 if (error != 0) { 4875 if (flags & PMAP_CANFAIL) { 4876 mutex_exit(&pmap->pm_lock); 4877 return error; 4878 } 4879 panic("%s: get ptp failed, error=%d", __func__, 4880 error); 4881 } 4882 } 4883 tree = &VM_PAGE_TO_PP(ptp)->pp_rb; 4884 } else { 4885 /* Embedded PV entries rely on this. */ 4886 KASSERT(va != 0); 4887 tree = &pmap_kernel_rb; 4888 } 4889 4890 /* 4891 * Look up the old PV entry at this VA (if any), and insert a new PV 4892 * entry if required for the new mapping. 
Temporarily track the old 4893 * and new mappings concurrently. Only after the old mapping is 4894 * evicted from the pmap will we remove its PV entry. Otherwise, 4895 * our picture of modified/accessed state for either page could get 4896 * out of sync (we need any P->V operation for either page to stall 4897 * on pmap->pm_lock until done here). 4898 */ 4899 new_pve = NULL; 4900 old_pve = NULL; 4901 samepage = false; 4902 new_embedded = false; 4903 4904 if (new_pp != NULL) { 4905 error = pmap_enter_pv(pmap, new_pp, ptp, va, &new_pve, 4906 &old_pve, &samepage, &new_embedded, tree); 4907 4908 /* 4909 * If a new pv_entry was needed and none was available, we 4910 * can go no further. 4911 */ 4912 if (error != 0) { 4913 if (flags & PMAP_CANFAIL) { 4914 if (getptp) { 4915 pmap_unget_ptp(pmap, &pt); 4916 } 4917 mutex_exit(&pmap->pm_lock); 4918 return error; 4919 } 4920 panic("%s: alloc pve failed", __func__); 4921 } 4922 } else { 4923 old_pve = pmap_treelookup_pv(pmap, ptp, tree, va); 4924 } 4925 4926 /* Map PTEs into address space. */ 4927 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); 4928 4929 /* Install any newly allocated PTPs. */ 4930 if (getptp) { 4931 pmap_install_ptp(pmap, &pt, va, pdes); 4932 } 4933 4934 /* Check if there is an existing mapping. */ 4935 ptep = &ptes[pl1_i(va)]; 4936 opte = *ptep; 4937 bool have_oldpa = pmap_valid_entry(opte); 4938 paddr_t oldpa = pmap_pte2pa(opte); 4939 4940 /* 4941 * Update the pte. 4942 */ 4943 do { 4944 opte = *ptep; 4945 4946 /* 4947 * if the same page, inherit PTE_A and PTE_D. 4948 */ 4949 if (((opte ^ npte) & (PTE_FRAME | PTE_P)) == 0) { 4950 npte |= opte & (PTE_A | PTE_D); 4951 } 4952 #if defined(XENPV) 4953 if (domid != DOMID_SELF) { 4954 /* pmap_pte_cas with error handling */ 4955 int s = splvm(); 4956 if (opte != *ptep) { 4957 splx(s); 4958 continue; 4959 } 4960 error = xpq_update_foreign( 4961 vtomach((vaddr_t)ptep), npte, domid, flags); 4962 splx(s); 4963 if (error) { 4964 /* Undo pv_entry tracking - oof. */ 4965 if (new_pp != NULL) { 4966 mutex_spin_enter(&new_pp->pp_lock); 4967 if (new_pve != NULL) { 4968 LIST_REMOVE(new_pve, pve_list); 4969 KASSERT(pmap->pm_pve == NULL); 4970 pmap->pm_pve = new_pve; 4971 } else if (new_embedded) { 4972 new_pp->pp_pte.pte_ptp = NULL; 4973 new_pp->pp_pte.pte_va = 0; 4974 } 4975 mutex_spin_exit(&new_pp->pp_lock); 4976 } 4977 pmap_unmap_ptes(pmap, pmap2); 4978 /* Free new PTP. */ 4979 if (ptp != NULL && ptp->wire_count <= 1) { 4980 pmap_free_ptp(pmap, ptp, va, ptes, 4981 pdes); 4982 } 4983 mutex_exit(&pmap->pm_lock); 4984 return error; 4985 } 4986 break; 4987 } 4988 #endif /* defined(XENPV) */ 4989 } while (pmap_pte_cas(ptep, opte, npte) != opte); 4990 4991 /* 4992 * Done with the PTEs: they can now be unmapped. 4993 */ 4994 pmap_unmap_ptes(pmap, pmap2); 4995 4996 /* 4997 * Update statistics and PTP's reference count. 4998 */ 4999 pmap_stats_update_bypte(pmap, npte, opte); 5000 if (ptp != NULL) { 5001 if (!have_oldpa) { 5002 ptp->wire_count++; 5003 } 5004 /* Remember minimum VA in PTP. */ 5005 pmap_ptp_range_set(ptp, va); 5006 } 5007 KASSERT(ptp == NULL || ptp->wire_count > 1); 5008 5009 /* 5010 * If the same page, we can skip pv_entry handling. 
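	 * ("Same page" means the physical frame and the P bit are
	 * unchanged, i.e. (opte ^ npte) has no bits set in
	 * PTE_FRAME | PTE_P.)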
5011 */ 5012 if (((opte ^ npte) & (PTE_FRAME | PTE_P)) == 0) { 5013 KASSERT(((opte ^ npte) & PTE_PVLIST) == 0); 5014 if ((npte & PTE_PVLIST) != 0) { 5015 KASSERT(samepage); 5016 pmap_check_pv(pmap, ptp, new_pp, va, true); 5017 } 5018 goto same_pa; 5019 } else if ((npte & PTE_PVLIST) != 0) { 5020 KASSERT(!samepage); 5021 } 5022 5023 /* 5024 * If old page is pv-tracked, remove pv_entry from its list. 5025 */ 5026 if ((~opte & (PTE_P | PTE_PVLIST)) == 0) { 5027 if ((old_pg = PHYS_TO_VM_PAGE(oldpa)) != NULL) { 5028 old_pp = VM_PAGE_TO_PP(old_pg); 5029 } else if ((old_pp = pmap_pv_tracked(oldpa)) == NULL) { 5030 panic("%s: PTE_PVLIST with pv-untracked page" 5031 " va = %#"PRIxVADDR 5032 " pa = %#" PRIxPADDR " (%#" PRIxPADDR ")", 5033 __func__, va, oldpa, atop(pa)); 5034 } 5035 5036 pmap_remove_pv(pmap, old_pp, ptp, va, old_pve, 5037 pmap_pte_to_pp_attrs(opte)); 5038 } else { 5039 KASSERT(old_pve == NULL); 5040 KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == NULL); 5041 } 5042 5043 /* 5044 * If new page is dynamically PV tracked, insert to tree. 5045 */ 5046 if (new_pve != NULL) { 5047 KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == NULL); 5048 old_pve = rb_tree_insert_node(tree, new_pve); 5049 KASSERT(old_pve == new_pve); 5050 pmap_check_pv(pmap, ptp, new_pp, va, true); 5051 } 5052 5053 same_pa: 5054 /* 5055 * shootdown tlb if necessary. 5056 */ 5057 5058 if ((~opte & (PTE_P | PTE_A)) == 0 && 5059 ((opte ^ npte) & (PTE_FRAME | PTE_W)) != 0) { 5060 pmap_tlb_shootdown(pmap, va, opte, TLBSHOOT_ENTER); 5061 } 5062 pmap_drain_pv(pmap); 5063 mutex_exit(&pmap->pm_lock); 5064 return 0; 5065 } 5066 5067 #if defined(XEN) && defined(DOM0OPS) 5068 5069 struct pmap_data_gnt { 5070 SLIST_ENTRY(pmap_data_gnt) pd_gnt_list; 5071 vaddr_t pd_gnt_sva; 5072 vaddr_t pd_gnt_eva; /* range covered by this gnt */ 5073 int pd_gnt_refs; /* ref counter */ 5074 struct gnttab_map_grant_ref pd_gnt_ops[1]; /* variable length */ 5075 }; 5076 SLIST_HEAD(pmap_data_gnt_head, pmap_data_gnt); 5077 5078 static void pmap_remove_gnt(struct pmap *, vaddr_t, vaddr_t); 5079 5080 static struct pmap_data_gnt * 5081 pmap_find_gnt(struct pmap *pmap, vaddr_t sva, vaddr_t eva) 5082 { 5083 struct pmap_data_gnt_head *headp; 5084 struct pmap_data_gnt *pgnt; 5085 5086 KASSERT(mutex_owned(&pmap->pm_lock)); 5087 headp = pmap->pm_data; 5088 KASSERT(headp != NULL); 5089 SLIST_FOREACH(pgnt, headp, pd_gnt_list) { 5090 if (pgnt->pd_gnt_sva <= sva && eva <= pgnt->pd_gnt_eva) 5091 return pgnt; 5092 /* check that we're not overlapping part of a region */ 5093 KASSERT(pgnt->pd_gnt_sva >= eva || pgnt->pd_gnt_eva <= sva); 5094 } 5095 return NULL; 5096 } 5097 5098 static void 5099 pmap_alloc_gnt(struct pmap *pmap, vaddr_t sva, int nentries, 5100 const struct gnttab_map_grant_ref *ops) 5101 { 5102 struct pmap_data_gnt_head *headp; 5103 struct pmap_data_gnt *pgnt; 5104 vaddr_t eva = sva + nentries * PAGE_SIZE; 5105 KASSERT(mutex_owned(&pmap->pm_lock)); 5106 KASSERT(nentries >= 1); 5107 if (pmap->pm_remove == NULL) { 5108 pmap->pm_remove = pmap_remove_gnt; 5109 KASSERT(pmap->pm_data == NULL); 5110 headp = kmem_alloc(sizeof(*headp), KM_SLEEP); 5111 SLIST_INIT(headp); 5112 pmap->pm_data = headp; 5113 } else { 5114 KASSERT(pmap->pm_remove == pmap_remove_gnt); 5115 KASSERT(pmap->pm_data != NULL); 5116 headp = pmap->pm_data; 5117 } 5118 5119 pgnt = pmap_find_gnt(pmap, sva, eva); 5120 if (pgnt != NULL) { 5121 KASSERT(pgnt->pd_gnt_sva == sva); 5122 KASSERT(pgnt->pd_gnt_eva == eva); 5123 return; 5124 } 5125 5126 /* new entry */ 5127 pgnt = kmem_alloc(sizeof(*pgnt) + 
5128 (nentries - 1) * sizeof(struct gnttab_map_grant_ref), KM_SLEEP); 5129 pgnt->pd_gnt_sva = sva; 5130 pgnt->pd_gnt_eva = eva; 5131 pgnt->pd_gnt_refs = 0; 5132 memcpy(pgnt->pd_gnt_ops, ops, 5133 sizeof(struct gnttab_map_grant_ref) * nentries); 5134 SLIST_INSERT_HEAD(headp, pgnt, pd_gnt_list); 5135 } 5136 5137 static void 5138 pmap_free_gnt(struct pmap *pmap, struct pmap_data_gnt *pgnt) 5139 { 5140 struct pmap_data_gnt_head *headp = pmap->pm_data; 5141 int nentries = (pgnt->pd_gnt_eva - pgnt->pd_gnt_sva) / PAGE_SIZE; 5142 KASSERT(nentries >= 1); 5143 KASSERT(mutex_owned(&pmap->pm_lock)); 5144 KASSERT(pgnt->pd_gnt_refs == 0); 5145 SLIST_REMOVE(headp, pgnt, pmap_data_gnt, pd_gnt_list); 5146 kmem_free(pgnt, sizeof(*pgnt) + 5147 (nentries - 1) * sizeof(struct gnttab_map_grant_ref)); 5148 if (SLIST_EMPTY(headp)) { 5149 kmem_free(headp, sizeof(*headp)); 5150 pmap->pm_data = NULL; 5151 pmap->pm_remove = NULL; 5152 } 5153 } 5154 5155 /* 5156 * pmap_enter_gnt: enter a grant entry into a pmap 5157 * 5158 * => must be done "now" ... no lazy-evaluation 5159 */ 5160 int 5161 pmap_enter_gnt(struct pmap *pmap, vaddr_t va, vaddr_t sva, int nentries, 5162 const struct gnttab_map_grant_ref *oops) 5163 { 5164 struct pmap_data_gnt *pgnt; 5165 pt_entry_t *ptes, opte; 5166 pt_entry_t *ptep; 5167 pd_entry_t * const *pdes; 5168 struct vm_page *ptp; 5169 struct vm_page *old_pg; 5170 struct pmap_page *old_pp; 5171 struct pv_entry *old_pve; 5172 struct pmap *pmap2; 5173 struct pmap_ptparray pt; 5174 int error; 5175 bool getptp; 5176 rb_tree_t *tree; 5177 struct gnttab_map_grant_ref *op; 5178 int ret; 5179 int idx; 5180 5181 KASSERT(pmap_initialized); 5182 KASSERT(va < VM_MAX_KERNEL_ADDRESS); 5183 KASSERTMSG(va != (vaddr_t)PDP_BASE, "%s: trying to map va=%#" 5184 PRIxVADDR " over PDP!", __func__, va); 5185 KASSERT(pmap != pmap_kernel()); 5186 5187 /* Begin by locking the pmap. */ 5188 mutex_enter(&pmap->pm_lock); 5189 pmap_alloc_gnt(pmap, sva, nentries, oops); 5190 5191 pgnt = pmap_find_gnt(pmap, va, va + PAGE_SIZE); 5192 KASSERT(pgnt != NULL); 5193 5194 /* Look up the PTP. Allocate if none present. */ 5195 ptp = NULL; 5196 getptp = false; 5197 ptp = pmap_find_ptp(pmap, va, 1); 5198 if (ptp == NULL) { 5199 getptp = true; 5200 error = pmap_get_ptp(pmap, &pt, va, PMAP_CANFAIL, &ptp); 5201 if (error != 0) { 5202 mutex_exit(&pmap->pm_lock); 5203 return error; 5204 } 5205 } 5206 tree = &VM_PAGE_TO_PP(ptp)->pp_rb; 5207 5208 /* 5209 * Look up the old PV entry at this VA (if any), and insert a new PV 5210 * entry if required for the new mapping. Temporarily track the old 5211 * and new mappings concurrently. Only after the old mapping is 5212 * evicted from the pmap will we remove its PV entry. Otherwise, 5213 * our picture of modified/accessed state for either page could get 5214 * out of sync (we need any P->V operation for either page to stall 5215 * on pmap->pm_lock until done here). 5216 */ 5217 old_pve = NULL; 5218 5219 old_pve = pmap_treelookup_pv(pmap, ptp, tree, va); 5220 5221 /* Map PTEs into address space. */ 5222 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); 5223 5224 /* Install any newly allocated PTPs. */ 5225 if (getptp) { 5226 pmap_install_ptp(pmap, &pt, va, pdes); 5227 } 5228 5229 /* Check if there is an existing mapping. */ 5230 ptep = &ptes[pl1_i(va)]; 5231 opte = *ptep; 5232 bool have_oldpa = pmap_valid_entry(opte); 5233 paddr_t oldpa = pmap_pte2pa(opte); 5234 5235 /* 5236 * Update the pte. 
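	 * Unlike pmap_enter_ma(), the PTE is not written with a local CAS:
	 * the GNTTABOP_map_grant_ref hypercall below installs the mapping
	 * at the machine address stored in op->host_addr.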
5237 */ 5238 5239 idx = (va - pgnt->pd_gnt_sva) / PAGE_SIZE; 5240 op = &pgnt->pd_gnt_ops[idx]; 5241 5242 #ifdef XENPV /* XXX */ 5243 op->host_addr = xpmap_ptetomach(ptep); 5244 #endif 5245 op->dev_bus_addr = 0; 5246 op->status = GNTST_general_error; 5247 ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, op, 1); 5248 if (__predict_false(ret)) { 5249 printf("%s: GNTTABOP_map_grant_ref failed: %d\n", 5250 __func__, ret); 5251 op->status = GNTST_general_error; 5252 } 5253 for (int d = 0; d < 256 && op->status == GNTST_eagain; d++) { 5254 kpause("gntmap", false, mstohz(1), NULL); 5255 ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, op, 1); 5256 if (__predict_false(ret)) { 5257 printf("%s: GNTTABOP_map_grant_ref failed: %d\n", 5258 __func__, ret); 5259 op->status = GNTST_general_error; 5260 } 5261 } 5262 if (__predict_false(op->status != GNTST_okay)) { 5263 printf("%s: GNTTABOP_map_grant_ref status: %d\n", 5264 __func__, op->status); 5265 if (have_oldpa) { 5266 ptp->wire_count--; 5267 } 5268 } else { 5269 pgnt->pd_gnt_refs++; 5270 if (!have_oldpa) { 5271 ptp->wire_count++; 5272 } 5273 KASSERT(ptp->wire_count > 1); 5274 /* Remember minimum VA in PTP. */ 5275 pmap_ptp_range_set(ptp, va); 5276 } 5277 if (ptp->wire_count <= 1) 5278 pmap_free_ptp(pmap, ptp, va, ptes, pdes); 5279 5280 /* 5281 * Done with the PTEs: they can now be unmapped. 5282 */ 5283 pmap_unmap_ptes(pmap, pmap2); 5284 5285 /* 5286 * Update statistics and PTP's reference count. 5287 */ 5288 pmap_stats_update_bypte(pmap, 0, opte); 5289 5290 /* 5291 * If old page is pv-tracked, remove pv_entry from its list. 5292 */ 5293 if ((~opte & (PTE_P | PTE_PVLIST)) == 0) { 5294 if ((old_pg = PHYS_TO_VM_PAGE(oldpa)) != NULL) { 5295 old_pp = VM_PAGE_TO_PP(old_pg); 5296 } else if ((old_pp = pmap_pv_tracked(oldpa)) == NULL) { 5297 panic("%s: PTE_PVLIST with pv-untracked page" 5298 " va = %#"PRIxVADDR " pa = %#" PRIxPADDR, 5299 __func__, va, oldpa); 5300 } 5301 5302 pmap_remove_pv(pmap, old_pp, ptp, va, old_pve, 5303 pmap_pte_to_pp_attrs(opte)); 5304 } else { 5305 KASSERT(old_pve == NULL); 5306 KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == NULL); 5307 } 5308 5309 pmap_drain_pv(pmap); 5310 mutex_exit(&pmap->pm_lock); 5311 return op->status; 5312 } 5313 5314 /* 5315 * pmap_remove_gnt: grant mapping removal function. 5316 * 5317 * => caller should not be holding any pmap locks 5318 */ 5319 static void 5320 pmap_remove_gnt(struct pmap *pmap, vaddr_t sva, vaddr_t eva) 5321 { 5322 struct pmap_data_gnt *pgnt; 5323 pt_entry_t *ptes; 5324 pd_entry_t pde; 5325 pd_entry_t * const *pdes; 5326 struct vm_page *ptp; 5327 struct pmap *pmap2; 5328 vaddr_t va; 5329 int lvl; 5330 int idx; 5331 struct gnttab_map_grant_ref *op; 5332 struct gnttab_unmap_grant_ref unmap_op; 5333 int ret; 5334 5335 KASSERT(pmap != pmap_kernel()); 5336 KASSERT(pmap->pm_remove == pmap_remove_gnt); 5337 5338 mutex_enter(&pmap->pm_lock); 5339 for (va = sva; va < eva; va += PAGE_SIZE) { 5340 pgnt = pmap_find_gnt(pmap, va, va + PAGE_SIZE); 5341 if (pgnt == NULL) { 5342 pmap_remove_locked(pmap, sva, eva); 5343 continue; 5344 } 5345 5346 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); 5347 if (!pmap_pdes_valid(va, pdes, &pde, &lvl)) { 5348 panic("pmap_remove_gnt pdes not valid"); 5349 } 5350 5351 idx = (va - pgnt->pd_gnt_sva) / PAGE_SIZE; 5352 op = &pgnt->pd_gnt_ops[idx]; 5353 KASSERT(lvl == 1); 5354 KASSERT(op->status == GNTST_okay); 5355 5356 /* Get PTP if non-kernel mapping. 
*/ 5357 ptp = pmap_find_ptp(pmap, va, 1); 5358 KASSERTMSG(ptp != NULL, 5359 "%s: unmanaged PTP detected", __func__); 5360 5361 if (op->status == GNTST_okay) { 5362 KASSERT(pmap_valid_entry(ptes[pl1_i(va)])); 5363 unmap_op.handle = op->handle; 5364 unmap_op.dev_bus_addr = 0; 5365 #ifdef XENPV /* XXX */ 5366 unmap_op.host_addr = xpmap_ptetomach(&ptes[pl1_i(va)]); 5367 #endif 5368 ret = HYPERVISOR_grant_table_op( 5369 GNTTABOP_unmap_grant_ref, &unmap_op, 1); 5370 if (ret) { 5371 printf("%s: GNTTABOP_unmap_grant_ref " 5372 "failed: %d\n", __func__, ret); 5373 } 5374 5375 ptp->wire_count--; 5376 pgnt->pd_gnt_refs--; 5377 if (pgnt->pd_gnt_refs == 0) { 5378 pmap_free_gnt(pmap, pgnt); 5379 } 5380 } 5381 /* 5382 * if mapping removed and the PTP is no longer 5383 * being used, free it! 5384 */ 5385 5386 if (ptp->wire_count <= 1) 5387 pmap_free_ptp(pmap, ptp, va, ptes, pdes); 5388 pmap_unmap_ptes(pmap, pmap2); 5389 } 5390 mutex_exit(&pmap->pm_lock); 5391 } 5392 #endif /* XEN && DOM0OPS */ 5393 5394 paddr_t 5395 pmap_get_physpage(void) 5396 { 5397 struct vm_page *ptp; 5398 struct pmap *kpm = pmap_kernel(); 5399 paddr_t pa; 5400 5401 if (!uvm.page_init_done) { 5402 /* 5403 * We're growing the kernel pmap early (from 5404 * uvm_pageboot_alloc()). This case must be 5405 * handled a little differently. 5406 */ 5407 5408 if (!uvm_page_physget(&pa)) 5409 panic("%s: out of memory", __func__); 5410 #if defined(__HAVE_DIRECT_MAP) 5411 memset(PAGE_ALIGNED(PMAP_DIRECT_MAP(pa)), 0, PAGE_SIZE); 5412 #else 5413 #if defined(XENPV) 5414 if (XEN_VERSION_SUPPORTED(3, 4)) { 5415 xen_pagezero(pa); 5416 return pa; 5417 } 5418 #endif 5419 kpreempt_disable(); 5420 pmap_pte_set(early_zero_pte, pmap_pa2pte(pa) | PTE_P | 5421 PTE_W | pmap_pg_nx); 5422 pmap_pte_flush(); 5423 pmap_update_pg((vaddr_t)early_zerop); 5424 memset(PAGE_ALIGNED(early_zerop), 0, PAGE_SIZE); 5425 #if defined(DIAGNOSTIC) || defined(XENPV) 5426 pmap_pte_set(early_zero_pte, 0); 5427 pmap_pte_flush(); 5428 #endif /* defined(DIAGNOSTIC) */ 5429 kpreempt_enable(); 5430 #endif /* defined(__HAVE_DIRECT_MAP) */ 5431 } else { 5432 /* XXX */ 5433 ptp = uvm_pagealloc(NULL, 0, NULL, 5434 UVM_PGA_USERESERVE|UVM_PGA_ZERO); 5435 if (ptp == NULL) 5436 panic("%s: out of memory", __func__); 5437 ptp->flags &= ~PG_BUSY; 5438 ptp->wire_count = 1; 5439 pa = VM_PAGE_TO_PHYS(ptp); 5440 } 5441 pmap_stats_update(kpm, 1, 0); 5442 5443 return pa; 5444 } 5445 5446 /* 5447 * Expand the page tree with the specified amount of PTPs, mapping virtual 5448 * addresses starting at kva. We populate all the levels but the last one 5449 * (L1). The nodes of the tree are created as RW, but the pages covered 5450 * will be kentered in L1, with proper permissions. 5451 * 5452 * Used only by pmap_growkernel. 
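 *
 * Illustrative example (hypothetical numbers): on amd64, growing the
 * kernel map by one 2MB block typically needs a single new L1 PTP, so
 * the caller passes needed_ptps[1] == 1 and zero for the higher levels
 * (assuming the covering L3/L4 entries already exist), and the only
 * write made here is one new L2 entry pointing at a freshly allocated,
 * zeroed page.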
5453 */ 5454 static void 5455 pmap_alloc_level(struct pmap *cpm, vaddr_t kva, long *needed_ptps) 5456 { 5457 unsigned long i; 5458 paddr_t pa; 5459 unsigned long index, endindex; 5460 int level; 5461 pd_entry_t *pdep; 5462 #ifdef XENPV 5463 int s = splvm(); /* protect xpq_* */ 5464 #endif 5465 5466 for (level = PTP_LEVELS; level > 1; level--) { 5467 if (level == PTP_LEVELS) 5468 pdep = cpm->pm_pdir; 5469 else 5470 pdep = normal_pdes[level - 2]; 5471 index = pl_i_roundup(kva, level); 5472 endindex = index + needed_ptps[level - 1] - 1; 5473 5474 for (i = index; i <= endindex; i++) { 5475 pt_entry_t pte; 5476 5477 KASSERT(!pmap_valid_entry(pdep[i])); 5478 pa = pmap_get_physpage(); 5479 pte = pmap_pa2pte(pa) | PTE_P | PTE_W; 5480 #ifdef __x86_64__ 5481 pte |= pmap_pg_nx; 5482 #endif 5483 pmap_pte_set(&pdep[i], pte); 5484 5485 #ifdef XENPV 5486 if (level == PTP_LEVELS && i >= PDIR_SLOT_KERN) { 5487 if (__predict_true( 5488 cpu_info_primary.ci_flags & CPUF_PRESENT)) { 5489 /* update per-cpu PMDs on all cpus */ 5490 xen_kpm_sync(pmap_kernel(), i); 5491 } else { 5492 /* 5493 * too early; update primary CPU 5494 * PMD only (without locks) 5495 */ 5496 #ifdef __x86_64__ 5497 pd_entry_t *cpu_pdep = 5498 &cpu_info_primary.ci_kpm_pdir[i]; 5499 #else 5500 pd_entry_t *cpu_pdep = 5501 &cpu_info_primary.ci_kpm_pdir[l2tol2(i)]; 5502 #endif 5503 pmap_pte_set(cpu_pdep, pte); 5504 } 5505 } 5506 #endif 5507 5508 KASSERT(level != PTP_LEVELS || nkptp[level - 1] + 5509 pl_i(VM_MIN_KERNEL_ADDRESS, level) == i); 5510 nkptp[level - 1]++; 5511 } 5512 pmap_pte_flush(); 5513 } 5514 #ifdef XENPV 5515 splx(s); 5516 #endif 5517 } 5518 5519 /* 5520 * pmap_growkernel: increase usage of KVM space. 5521 * 5522 * => we allocate new PTPs for the kernel and install them in all 5523 * the pmaps on the system. 5524 */ 5525 vaddr_t 5526 pmap_growkernel(vaddr_t maxkvaddr) 5527 { 5528 struct pmap *kpm = pmap_kernel(); 5529 struct pmap *cpm; 5530 #if !defined(XENPV) || !defined(__x86_64__) 5531 struct pmap *pm; 5532 long old; 5533 #endif 5534 int s, i; 5535 long needed_kptp[PTP_LEVELS], target_nptp; 5536 bool invalidate = false; 5537 5538 s = splvm(); /* to be safe */ 5539 mutex_enter(&kpm->pm_lock); 5540 5541 if (maxkvaddr <= pmap_maxkvaddr) { 5542 mutex_exit(&kpm->pm_lock); 5543 splx(s); 5544 return pmap_maxkvaddr; 5545 } 5546 5547 maxkvaddr = x86_round_pdr(maxkvaddr); 5548 #if !defined(XENPV) || !defined(__x86_64__) 5549 old = nkptp[PTP_LEVELS - 1]; 5550 #endif 5551 5552 /* Initialize needed_kptp. */ 5553 for (i = PTP_LEVELS - 1; i >= 1; i--) { 5554 target_nptp = pl_i_roundup(maxkvaddr, i + 1) - 5555 pl_i_roundup(VM_MIN_KERNEL_ADDRESS, i + 1); 5556 5557 if (target_nptp > nkptpmax[i]) 5558 panic("out of KVA space"); 5559 KASSERT(target_nptp >= nkptp[i]); 5560 needed_kptp[i] = target_nptp - nkptp[i]; 5561 } 5562 5563 #ifdef XENPV 5564 /* only pmap_kernel(), or the per-cpu map, has kernel entries */ 5565 cpm = kpm; 5566 #else 5567 /* Get the current pmap */ 5568 if (__predict_true(cpu_info_primary.ci_flags & CPUF_PRESENT)) { 5569 cpm = curcpu()->ci_pmap; 5570 } else { 5571 cpm = kpm; 5572 } 5573 #endif 5574 5575 kasan_shadow_map((void *)pmap_maxkvaddr, 5576 (size_t)(maxkvaddr - pmap_maxkvaddr)); 5577 kmsan_shadow_map((void *)pmap_maxkvaddr, 5578 (size_t)(maxkvaddr - pmap_maxkvaddr)); 5579 5580 pmap_alloc_level(cpm, pmap_maxkvaddr, needed_kptp); 5581 5582 /* 5583 * If the number of top level entries changed, update all pmaps. 
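	 * On native kernels this means copying the new PDIR_SLOT_KERN
	 * entries into every pmap; on XENPV/amd64 kernel entries are never
	 * present in user pmaps, so there is nothing to copy there.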
5584 */ 5585 if (needed_kptp[PTP_LEVELS - 1] != 0) { 5586 #ifdef XENPV 5587 #ifdef __x86_64__ 5588 /* nothing, kernel entries are never entered in user pmap */ 5589 #else 5590 int pdkidx; 5591 5592 mutex_enter(&pmaps_lock); 5593 LIST_FOREACH(pm, &pmaps, pm_list) { 5594 for (pdkidx = PDIR_SLOT_KERN + old; 5595 pdkidx < PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1]; 5596 pdkidx++) { 5597 pmap_pte_set(&pm->pm_pdir[pdkidx], 5598 kpm->pm_pdir[pdkidx]); 5599 } 5600 pmap_pte_flush(); 5601 } 5602 mutex_exit(&pmaps_lock); 5603 #endif /* __x86_64__ */ 5604 #else /* XENPV */ 5605 size_t newpdes; 5606 newpdes = nkptp[PTP_LEVELS - 1] - old; 5607 if (cpm != kpm) { 5608 memcpy(&kpm->pm_pdir[PDIR_SLOT_KERN + old], 5609 &cpm->pm_pdir[PDIR_SLOT_KERN + old], 5610 newpdes * sizeof(pd_entry_t)); 5611 } 5612 5613 mutex_enter(&pmaps_lock); 5614 LIST_FOREACH(pm, &pmaps, pm_list) { 5615 if (__predict_false(pm->pm_enter != NULL)) { 5616 /* 5617 * Not a native pmap, the kernel is not mapped, 5618 * so nothing to synchronize. 5619 */ 5620 continue; 5621 } 5622 memcpy(&pm->pm_pdir[PDIR_SLOT_KERN + old], 5623 &kpm->pm_pdir[PDIR_SLOT_KERN + old], 5624 newpdes * sizeof(pd_entry_t)); 5625 } 5626 mutex_exit(&pmaps_lock); 5627 #endif 5628 invalidate = true; 5629 } 5630 pmap_maxkvaddr = maxkvaddr; 5631 mutex_exit(&kpm->pm_lock); 5632 splx(s); 5633 5634 if (invalidate && pmap_initialized) { 5635 /* Invalidate the pmap cache. */ 5636 pool_cache_invalidate(&pmap_cache); 5637 } 5638 5639 return maxkvaddr; 5640 } 5641 5642 #ifdef DEBUG 5643 void pmap_dump(struct pmap *, vaddr_t, vaddr_t); 5644 5645 /* 5646 * pmap_dump: dump all the mappings from a pmap 5647 * 5648 * => caller should not be holding any pmap locks 5649 */ 5650 void 5651 pmap_dump(struct pmap *pmap, vaddr_t sva, vaddr_t eva) 5652 { 5653 pt_entry_t *ptes, *pte; 5654 pd_entry_t * const *pdes; 5655 struct pmap *pmap2; 5656 vaddr_t blkendva; 5657 int lvl; 5658 5659 /* 5660 * if end is out of range truncate. 5661 * if (end == start) update to max. 5662 */ 5663 5664 if (eva > VM_MAXUSER_ADDRESS || eva <= sva) 5665 eva = VM_MAXUSER_ADDRESS; 5666 5667 mutex_enter(&pmap->pm_lock); 5668 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); 5669 5670 /* 5671 * dumping a range of pages: we dump in PTP sized blocks (4MB) 5672 */ 5673 5674 for (/* null */ ; sva < eva ; sva = blkendva) { 5675 5676 /* determine range of block */ 5677 blkendva = x86_round_pdr(sva+1); 5678 if (blkendva > eva) 5679 blkendva = eva; 5680 5681 /* valid block? */ 5682 if (!pmap_pdes_valid(sva, pdes, NULL, &lvl)) 5683 continue; 5684 KASSERT(lvl == 1); 5685 5686 pte = &ptes[pl1_i(sva)]; 5687 for (/* null */; sva < blkendva ; sva += PAGE_SIZE, pte++) { 5688 if (!pmap_valid_entry(*pte)) 5689 continue; 5690 printf("va %#" PRIxVADDR " -> pa %#" PRIxPADDR 5691 " (pte=%#" PRIxPADDR ")\n", 5692 sva, (paddr_t)pmap_pte2pa(*pte), (paddr_t)*pte); 5693 } 5694 } 5695 pmap_unmap_ptes(pmap, pmap2); 5696 mutex_exit(&pmap->pm_lock); 5697 } 5698 #endif 5699 5700 /* 5701 * pmap_update: process deferred invalidations and frees. 5702 */ 5703 void 5704 pmap_update(struct pmap *pmap) 5705 { 5706 struct pmap_page *pp; 5707 struct vm_page *ptp; 5708 5709 /* 5710 * Initiate any pending TLB shootdowns. Wait for them to 5711 * complete before returning control to the caller. 5712 */ 5713 kpreempt_disable(); 5714 pmap_tlb_shootnow(); 5715 kpreempt_enable(); 5716 5717 /* 5718 * Now that shootdowns are complete, process deferred frees. 
This 5719 * is an unlocked check, but is safe as we're only interested in 5720 * work done in this LWP - we won't get a false negative. 5721 */ 5722 if (atomic_load_relaxed(&pmap->pm_gc_ptp.lh_first) == NULL) { 5723 return; 5724 } 5725 5726 mutex_enter(&pmap->pm_lock); 5727 while ((ptp = LIST_FIRST(&pmap->pm_gc_ptp)) != NULL) { 5728 KASSERT(ptp->wire_count == 0); 5729 KASSERT(ptp->uanon == NULL); 5730 LIST_REMOVE(ptp, mdpage.mp_pp.pp_link); 5731 pp = VM_PAGE_TO_PP(ptp); 5732 LIST_INIT(&pp->pp_pvlist); 5733 pp->pp_attrs = 0; 5734 pp->pp_pte.pte_ptp = NULL; 5735 pp->pp_pte.pte_va = 0; 5736 PMAP_CHECK_PP(VM_PAGE_TO_PP(ptp)); 5737 5738 /* 5739 * XXX Hack to avoid extra locking, and lock 5740 * assertions in uvm_pagefree(). Despite uobject 5741 * being set, this isn't a managed page. 5742 */ 5743 PMAP_DUMMY_LOCK(pmap); 5744 uvm_pagerealloc(ptp, NULL, 0); 5745 PMAP_DUMMY_UNLOCK(pmap); 5746 uvm_pagefree(ptp); 5747 } 5748 mutex_exit(&pmap->pm_lock); 5749 } 5750 5751 #if PTP_LEVELS > 4 5752 #error "Unsupported number of page table mappings" 5753 #endif 5754 5755 paddr_t 5756 pmap_init_tmp_pgtbl(paddr_t pg) 5757 { 5758 static bool maps_loaded; 5759 static const paddr_t x86_tmp_pml_paddr[] = { 5760 4 * PAGE_SIZE, /* L1 */ 5761 5 * PAGE_SIZE, /* L2 */ 5762 6 * PAGE_SIZE, /* L3 */ 5763 7 * PAGE_SIZE /* L4 */ 5764 }; 5765 static vaddr_t x86_tmp_pml_vaddr[] = { 0, 0, 0, 0 }; 5766 5767 pd_entry_t *tmp_pml, *kernel_pml; 5768 5769 int level; 5770 5771 if (!maps_loaded) { 5772 for (level = 0; level < PTP_LEVELS; ++level) { 5773 x86_tmp_pml_vaddr[level] = 5774 uvm_km_alloc(kernel_map, PAGE_SIZE, 0, 5775 UVM_KMF_VAONLY); 5776 5777 if (x86_tmp_pml_vaddr[level] == 0) 5778 panic("mapping of real mode PML failed\n"); 5779 pmap_kenter_pa(x86_tmp_pml_vaddr[level], 5780 x86_tmp_pml_paddr[level], 5781 VM_PROT_READ | VM_PROT_WRITE, 0); 5782 } 5783 pmap_update(pmap_kernel()); 5784 maps_loaded = true; 5785 } 5786 5787 /* Zero levels 1-3 */ 5788 for (level = 0; level < PTP_LEVELS - 1; ++level) { 5789 tmp_pml = (void *)x86_tmp_pml_vaddr[level]; 5790 memset(PAGE_ALIGNED(tmp_pml), 0, PAGE_SIZE); 5791 } 5792 5793 /* Copy PML4 */ 5794 kernel_pml = pmap_kernel()->pm_pdir; 5795 tmp_pml = (void *)x86_tmp_pml_vaddr[PTP_LEVELS - 1]; 5796 memcpy(PAGE_ALIGNED(tmp_pml), PAGE_ALIGNED(kernel_pml), PAGE_SIZE); 5797 5798 #ifdef PAE 5799 /* 5800 * Use the last 4 entries of the L2 page as L3 PD entries. These 5801 * last entries are unlikely to be used for temporary mappings. 
5802 * 508: maps 0->1GB (userland) 5803 * 509: unused 5804 * 510: unused 5805 * 511: maps 3->4GB (kernel) 5806 */ 5807 tmp_pml[508] = x86_tmp_pml_paddr[PTP_LEVELS - 1] | PTE_P; 5808 tmp_pml[509] = 0; 5809 tmp_pml[510] = 0; 5810 tmp_pml[511] = pmap_pdirpa(pmap_kernel(), PDIR_SLOT_KERN) | PTE_P; 5811 #endif 5812 5813 for (level = PTP_LEVELS - 1; level > 0; --level) { 5814 tmp_pml = (void *)x86_tmp_pml_vaddr[level]; 5815 5816 tmp_pml[pl_i(pg, level + 1)] = 5817 (x86_tmp_pml_paddr[level - 1] & PTE_FRAME) | PTE_W | PTE_P; 5818 } 5819 5820 tmp_pml = (void *)x86_tmp_pml_vaddr[0]; 5821 tmp_pml[pl_i(pg, 1)] = (pg & PTE_FRAME) | PTE_W | PTE_P; 5822 5823 #ifdef PAE 5824 /* Return the PA of the L3 page (entry 508 of the L2 page) */ 5825 return x86_tmp_pml_paddr[PTP_LEVELS - 1] + 508 * sizeof(pd_entry_t); 5826 #endif 5827 5828 return x86_tmp_pml_paddr[PTP_LEVELS - 1]; 5829 } 5830 5831 u_int 5832 x86_mmap_flags(paddr_t mdpgno) 5833 { 5834 u_int nflag = (mdpgno >> X86_MMAP_FLAG_SHIFT) & X86_MMAP_FLAG_MASK; 5835 u_int pflag = 0; 5836 5837 if (nflag & X86_MMAP_FLAG_PREFETCH) 5838 pflag |= PMAP_WRITE_COMBINE; 5839 5840 return pflag; 5841 } 5842 5843 #if defined(__HAVE_DIRECT_MAP) && defined(__x86_64__) && !defined(XENPV) 5844 5845 /* 5846 * ----------------------------------------------------------------------------- 5847 * ***************************************************************************** 5848 * ***************************************************************************** 5849 * ***************************************************************************** 5850 * ***************************************************************************** 5851 * **************** HERE BEGINS THE EPT CODE, USED BY INTEL-VMX **************** 5852 * ***************************************************************************** 5853 * ***************************************************************************** 5854 * ***************************************************************************** 5855 * ***************************************************************************** 5856 * ----------------------------------------------------------------------------- 5857 * 5858 * These functions are invoked as callbacks from the code above. Contrary to 5859 * native, EPT does not have a recursive slot; therefore, it is not possible 5860 * to call pmap_map_ptes(). Instead, we use the direct map and walk down the 5861 * tree manually. 5862 * 5863 * Apart from that, the logic is mostly the same as native. Once a pmap has 5864 * been created, NVMM calls pmap_ept_transform() to make it an EPT pmap. 5865 * After that we're good, and the callbacks will handle the translations 5866 * for us. 5867 * 5868 * ----------------------------------------------------------------------------- 5869 */ 5870 5871 /* Hardware bits. */ 5872 #define EPT_R __BIT(0) /* read */ 5873 #define EPT_W __BIT(1) /* write */ 5874 #define EPT_X __BIT(2) /* execute */ 5875 #define EPT_T __BITS(5,3) /* type */ 5876 #define TYPE_UC 0 5877 #define TYPE_WC 1 5878 #define TYPE_WT 4 5879 #define TYPE_WP 5 5880 #define TYPE_WB 6 5881 #define EPT_NOPAT __BIT(6) 5882 #define EPT_L __BIT(7) /* large */ 5883 #define EPT_A __BIT(8) /* accessed */ 5884 #define EPT_D __BIT(9) /* dirty */ 5885 /* Software bits. 
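 * (These occupy bits the hardware ignores in an EPT entry; they are used
 * only by the pmap itself, for pv-list membership and wiring.)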
*/ 5886 #define EPT_PVLIST __BIT(60) 5887 #define EPT_WIRED __BIT(61) 5888 5889 #define pmap_ept_valid_entry(pte) (pte & EPT_R) 5890 5891 bool pmap_ept_has_ad __read_mostly; 5892 5893 static inline void 5894 pmap_ept_stats_update_bypte(struct pmap *pmap, pt_entry_t npte, pt_entry_t opte) 5895 { 5896 int resid_diff = ((npte & EPT_R) ? 1 : 0) - ((opte & EPT_R) ? 1 : 0); 5897 int wired_diff = ((npte & EPT_WIRED) ? 1 : 0) - ((opte & EPT_WIRED) ? 1 : 0); 5898 5899 KASSERT((npte & (EPT_R | EPT_WIRED)) != EPT_WIRED); 5900 KASSERT((opte & (EPT_R | EPT_WIRED)) != EPT_WIRED); 5901 5902 pmap_stats_update(pmap, resid_diff, wired_diff); 5903 } 5904 5905 static pt_entry_t 5906 pmap_ept_type(u_int flags) 5907 { 5908 u_int cacheflags = (flags & PMAP_CACHE_MASK); 5909 pt_entry_t ret; 5910 5911 switch (cacheflags) { 5912 case PMAP_NOCACHE: 5913 case PMAP_NOCACHE_OVR: 5914 ret = __SHIFTIN(TYPE_UC, EPT_T); 5915 break; 5916 case PMAP_WRITE_COMBINE: 5917 ret = __SHIFTIN(TYPE_WC, EPT_T); 5918 break; 5919 case PMAP_WRITE_BACK: 5920 default: 5921 ret = __SHIFTIN(TYPE_WB, EPT_T); 5922 break; 5923 } 5924 5925 ret |= EPT_NOPAT; 5926 return ret; 5927 } 5928 5929 static inline pt_entry_t 5930 pmap_ept_prot(vm_prot_t prot) 5931 { 5932 pt_entry_t res = 0; 5933 5934 if (prot & VM_PROT_READ) 5935 res |= EPT_R; 5936 if (prot & VM_PROT_WRITE) 5937 res |= EPT_W; 5938 if (prot & VM_PROT_EXECUTE) 5939 res |= EPT_X; 5940 5941 return res; 5942 } 5943 5944 static inline uint8_t 5945 pmap_ept_to_pp_attrs(pt_entry_t ept) 5946 { 5947 uint8_t ret = 0; 5948 if (pmap_ept_has_ad) { 5949 if (ept & EPT_D) 5950 ret |= PP_ATTRS_D; 5951 if (ept & EPT_A) 5952 ret |= PP_ATTRS_A; 5953 } else { 5954 ret |= (PP_ATTRS_D|PP_ATTRS_A); 5955 } 5956 if (ept & EPT_W) 5957 ret |= PP_ATTRS_W; 5958 return ret; 5959 } 5960 5961 static inline pt_entry_t 5962 pmap_pp_attrs_to_ept(uint8_t attrs) 5963 { 5964 pt_entry_t ept = 0; 5965 if (attrs & PP_ATTRS_D) 5966 ept |= EPT_D; 5967 if (attrs & PP_ATTRS_A) 5968 ept |= EPT_A; 5969 if (attrs & PP_ATTRS_W) 5970 ept |= EPT_W; 5971 return ept; 5972 } 5973 5974 /* 5975 * Helper for pmap_ept_free_ptp. 5976 * tree[0] = &L2[L2idx] 5977 * tree[1] = &L3[L3idx] 5978 * tree[2] = &L4[L4idx] 5979 */ 5980 static void 5981 pmap_ept_get_tree(struct pmap *pmap, vaddr_t va, pd_entry_t **tree) 5982 { 5983 pt_entry_t *pteva; 5984 paddr_t ptepa; 5985 int i, index; 5986 5987 ptepa = pmap->pm_pdirpa[0]; 5988 for (i = PTP_LEVELS; i > 1; i--) { 5989 index = pl_pi(va, i); 5990 pteva = (pt_entry_t *)PMAP_DIRECT_MAP(ptepa); 5991 KASSERT(pmap_ept_valid_entry(pteva[index])); 5992 tree[i - 2] = &pteva[index]; 5993 ptepa = pmap_pte2pa(pteva[index]); 5994 } 5995 } 5996 5997 static void 5998 pmap_ept_free_ptp(struct pmap *pmap, struct vm_page *ptp, vaddr_t va) 5999 { 6000 pd_entry_t *tree[3]; 6001 int level; 6002 6003 KASSERT(pmap != pmap_kernel()); 6004 KASSERT(mutex_owned(&pmap->pm_lock)); 6005 KASSERT(kpreempt_disabled()); 6006 6007 pmap_ept_get_tree(pmap, va, tree); 6008 6009 level = 1; 6010 do { 6011 (void)pmap_pte_testset(tree[level - 1], 0); 6012 6013 pmap_freepage(pmap, ptp, level); 6014 if (level < PTP_LEVELS - 1) { 6015 ptp = pmap_find_ptp(pmap, va, level + 1); 6016 ptp->wire_count--; 6017 if (ptp->wire_count > 1) 6018 break; 6019 } 6020 } while (++level < PTP_LEVELS); 6021 pmap_pte_flush(); 6022 } 6023 6024 /* Allocate L4->L3->L2. Return L2. 
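 * (The walk is done through the direct map at each level, since EPT
 * pmaps have no recursive slot to map their own PTPs.)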
*/ 6025 static void 6026 pmap_ept_install_ptp(struct pmap *pmap, struct pmap_ptparray *pt, vaddr_t va) 6027 { 6028 struct vm_page *ptp; 6029 unsigned long index; 6030 pd_entry_t *pteva; 6031 paddr_t ptepa; 6032 int i; 6033 6034 KASSERT(pmap != pmap_kernel()); 6035 KASSERT(mutex_owned(&pmap->pm_lock)); 6036 KASSERT(kpreempt_disabled()); 6037 6038 /* 6039 * Now that we have all the pages looked up or allocated, 6040 * loop through again installing any new ones into the tree. 6041 */ 6042 ptepa = pmap->pm_pdirpa[0]; 6043 for (i = PTP_LEVELS; i > 1; i--) { 6044 index = pl_pi(va, i); 6045 pteva = (pt_entry_t *)PMAP_DIRECT_MAP(ptepa); 6046 6047 if (pmap_ept_valid_entry(pteva[index])) { 6048 KASSERT(!pt->alloced[i]); 6049 ptepa = pmap_pte2pa(pteva[index]); 6050 continue; 6051 } 6052 6053 ptp = pt->pg[i]; 6054 ptp->flags &= ~PG_BUSY; /* never busy */ 6055 ptp->wire_count = 1; 6056 pmap->pm_ptphint[i - 2] = ptp; 6057 ptepa = VM_PAGE_TO_PHYS(ptp); 6058 pmap_pte_set(&pteva[index], ptepa | EPT_R | EPT_W | EPT_X); 6059 6060 pmap_pte_flush(); 6061 pmap_stats_update(pmap, 1, 0); 6062 6063 /* 6064 * If we're not in the top level, increase the 6065 * wire count of the parent page. 6066 */ 6067 if (i < PTP_LEVELS) { 6068 pt->pg[i + 1]->wire_count++; 6069 } 6070 } 6071 } 6072 6073 static int 6074 pmap_ept_enter(struct pmap *pmap, vaddr_t va, paddr_t pa, vm_prot_t prot, 6075 u_int flags) 6076 { 6077 pt_entry_t *ptes, opte, npte; 6078 pt_entry_t *ptep; 6079 struct vm_page *ptp; 6080 struct vm_page *new_pg, *old_pg; 6081 struct pmap_page *new_pp, *old_pp; 6082 struct pv_entry *old_pve, *new_pve; 6083 bool wired = (flags & PMAP_WIRED) != 0; 6084 bool accessed; 6085 struct pmap_ptparray pt; 6086 int error; 6087 bool getptp, samepage, new_embedded; 6088 rb_tree_t *tree; 6089 6090 KASSERT(pmap_initialized); 6091 KASSERT(va < VM_MAXUSER_ADDRESS); 6092 6093 npte = pa | pmap_ept_prot(prot) | pmap_ept_type(flags); 6094 6095 if (wired) 6096 npte |= EPT_WIRED; 6097 if (flags & VM_PROT_ALL) { 6098 npte |= EPT_A; 6099 if (flags & VM_PROT_WRITE) { 6100 KASSERT((npte & EPT_W) != 0); 6101 npte |= EPT_D; 6102 } 6103 } 6104 6105 new_pg = PHYS_TO_VM_PAGE(pa); 6106 if (new_pg != NULL) { 6107 /* This is a managed page */ 6108 npte |= EPT_PVLIST; 6109 new_pp = VM_PAGE_TO_PP(new_pg); 6110 } else if ((new_pp = pmap_pv_tracked(pa)) != NULL) { 6111 /* This is an unmanaged pv-tracked page */ 6112 npte |= EPT_PVLIST; 6113 } else { 6114 new_pp = NULL; 6115 } 6116 6117 /* Begin by locking the pmap. */ 6118 mutex_enter(&pmap->pm_lock); 6119 6120 /* Look up the PTP. Allocate if none present. */ 6121 ptp = NULL; 6122 getptp = false; 6123 if (pmap != pmap_kernel()) { 6124 ptp = pmap_find_ptp(pmap, va, 1); 6125 if (ptp == NULL) { 6126 getptp = true; 6127 error = pmap_get_ptp(pmap, &pt, va, flags, &ptp); 6128 if (error != 0) { 6129 if (flags & PMAP_CANFAIL) { 6130 mutex_exit(&pmap->pm_lock); 6131 return error; 6132 } 6133 panic("%s: get ptp failed, error=%d", __func__, 6134 error); 6135 } 6136 } 6137 tree = &VM_PAGE_TO_PP(ptp)->pp_rb; 6138 } else { 6139 /* Embedded PV entries rely on this. */ 6140 KASSERT(va != 0); 6141 tree = &pmap_kernel_rb; 6142 } 6143 6144 /* 6145 * Look up the old PV entry at this VA (if any), and insert a new PV 6146 * entry if required for the new mapping. Temporarily track the old 6147 * and new mappings concurrently. Only after the old mapping is 6148 * evicted from the pmap will we remove its PV entry. 
static int
pmap_ept_enter(struct pmap *pmap, vaddr_t va, paddr_t pa, vm_prot_t prot,
    u_int flags)
{
        pt_entry_t *ptes, opte, npte;
        pt_entry_t *ptep;
        struct vm_page *ptp;
        struct vm_page *new_pg, *old_pg;
        struct pmap_page *new_pp, *old_pp;
        struct pv_entry *old_pve, *new_pve;
        bool wired = (flags & PMAP_WIRED) != 0;
        bool accessed;
        struct pmap_ptparray pt;
        int error;
        bool getptp, samepage, new_embedded;
        rb_tree_t *tree;

        KASSERT(pmap_initialized);
        KASSERT(va < VM_MAXUSER_ADDRESS);

        npte = pa | pmap_ept_prot(prot) | pmap_ept_type(flags);

        if (wired)
                npte |= EPT_WIRED;
        if (flags & VM_PROT_ALL) {
                npte |= EPT_A;
                if (flags & VM_PROT_WRITE) {
                        KASSERT((npte & EPT_W) != 0);
                        npte |= EPT_D;
                }
        }

        new_pg = PHYS_TO_VM_PAGE(pa);
        if (new_pg != NULL) {
                /* This is a managed page */
                npte |= EPT_PVLIST;
                new_pp = VM_PAGE_TO_PP(new_pg);
        } else if ((new_pp = pmap_pv_tracked(pa)) != NULL) {
                /* This is an unmanaged pv-tracked page */
                npte |= EPT_PVLIST;
        } else {
                new_pp = NULL;
        }

        /* Begin by locking the pmap. */
        mutex_enter(&pmap->pm_lock);

        /* Look up the PTP.  Allocate if none present. */
        ptp = NULL;
        getptp = false;
        if (pmap != pmap_kernel()) {
                ptp = pmap_find_ptp(pmap, va, 1);
                if (ptp == NULL) {
                        getptp = true;
                        error = pmap_get_ptp(pmap, &pt, va, flags, &ptp);
                        if (error != 0) {
                                if (flags & PMAP_CANFAIL) {
                                        mutex_exit(&pmap->pm_lock);
                                        return error;
                                }
                                panic("%s: get ptp failed, error=%d", __func__,
                                    error);
                        }
                }
                tree = &VM_PAGE_TO_PP(ptp)->pp_rb;
        } else {
                /* Embedded PV entries rely on this. */
                KASSERT(va != 0);
                tree = &pmap_kernel_rb;
        }

        /*
         * Look up the old PV entry at this VA (if any), and insert a new PV
         * entry if required for the new mapping.  Temporarily track the old
         * and new mappings concurrently.  Only after the old mapping is
         * evicted from the pmap will we remove its PV entry.  Otherwise,
         * our picture of modified/accessed state for either page could get
         * out of sync (we need any P->V operation for either page to stall
         * on pmap->pm_lock until done here).
         */
        new_pve = NULL;
        old_pve = NULL;
        samepage = false;
        new_embedded = false;

        if (new_pp != NULL) {
                error = pmap_enter_pv(pmap, new_pp, ptp, va, &new_pve,
                    &old_pve, &samepage, &new_embedded, tree);

                /*
                 * If a new pv_entry was needed and none was available, we
                 * can go no further.
                 */
                if (error != 0) {
                        if (flags & PMAP_CANFAIL) {
                                if (getptp) {
                                        pmap_unget_ptp(pmap, &pt);
                                }
                                mutex_exit(&pmap->pm_lock);
                                return error;
                        }
                        panic("%s: alloc pve failed", __func__);
                }
        } else {
                old_pve = pmap_treelookup_pv(pmap, ptp, tree, va);
        }

        /* Map PTEs into address space. */
        kpreempt_disable();

        /* Install any newly allocated PTPs. */
        if (getptp) {
                pmap_ept_install_ptp(pmap, &pt, va);
        }

        /* Check if there is an existing mapping. */
        ptes = (pt_entry_t *)PMAP_DIRECT_MAP(VM_PAGE_TO_PHYS(ptp));
        ptep = &ptes[pl1_pi(va)];
        opte = *ptep;
        bool have_oldpa = pmap_ept_valid_entry(opte);
        paddr_t oldpa = pmap_pte2pa(opte);

        /*
         * Update the pte.
         */
        do {
                opte = *ptep;

                /*
                 * if the same page, inherit EPT_A and EPT_D.
                 */
                if (((opte ^ npte) & (PTE_FRAME | EPT_R)) == 0) {
                        npte |= opte & (EPT_A | EPT_D);
                }
        } while (pmap_pte_cas(ptep, opte, npte) != opte);

        /*
         * Done with the PTEs: they can now be unmapped.
         */
        kpreempt_enable();

        /*
         * Update statistics and PTP's reference count.
         */
        pmap_ept_stats_update_bypte(pmap, npte, opte);
        if (ptp != NULL) {
                if (!have_oldpa) {
                        ptp->wire_count++;
                }
                /* Remember minimum VA in PTP. */
                pmap_ptp_range_set(ptp, va);
        }
        KASSERT(ptp == NULL || ptp->wire_count > 1);

        /*
         * If the same page, we can skip pv_entry handling.
         */
        if (((opte ^ npte) & (PTE_FRAME | EPT_R)) == 0) {
                KASSERT(((opte ^ npte) & EPT_PVLIST) == 0);
                if ((npte & EPT_PVLIST) != 0) {
                        KASSERT(samepage);
                        pmap_check_pv(pmap, ptp, new_pp, va, true);
                }
                goto same_pa;
        } else if ((npte & EPT_PVLIST) != 0) {
                KASSERT(!samepage);
        }

        /*
         * If old page is pv-tracked, remove pv_entry from its list.
         */
        if ((~opte & (EPT_R | EPT_PVLIST)) == 0) {
                if ((old_pg = PHYS_TO_VM_PAGE(oldpa)) != NULL) {
                        old_pp = VM_PAGE_TO_PP(old_pg);
                } else if ((old_pp = pmap_pv_tracked(oldpa)) == NULL) {
                        panic("%s: EPT_PVLIST with pv-untracked page"
                            " va = %#"PRIxVADDR
                            " pa = %#" PRIxPADDR " (%#" PRIxPADDR ")",
                            __func__, va, oldpa, atop(pa));
                }

                pmap_remove_pv(pmap, old_pp, ptp, va, old_pve,
                    pmap_ept_to_pp_attrs(opte));
        } else {
                KASSERT(old_pve == NULL);
                KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == NULL);
        }

        /*
         * If new page is dynamically PV tracked, insert to tree.
         */
        if (new_pve != NULL) {
                KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == NULL);
                old_pve = rb_tree_insert_node(tree, new_pve);
                KASSERT(old_pve == new_pve);
                pmap_check_pv(pmap, ptp, new_pp, va, true);
        }

same_pa:
        /*
         * shootdown tlb if necessary.
         */

        if (pmap_ept_has_ad) {
                accessed = (~opte & (EPT_R | EPT_A)) == 0;
        } else {
                accessed = (opte & EPT_R) != 0;
        }
        if (accessed && ((opte ^ npte) & (PTE_FRAME | EPT_W)) != 0) {
                pmap_tlb_shootdown(pmap, va, 0, TLBSHOOT_ENTER);
        }
        pmap_drain_pv(pmap);
        mutex_exit(&pmap->pm_lock);
        return 0;
}

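/*
 * pmap_ept_pdes_invalid: walk the EPT directories for va from the top
 * level down.  Returns 0 when every intermediate entry is valid, in which
 * case *lastpde is set to the L2 entry mapping the PTP for va; otherwise
 * it returns the level at which the walk hit an invalid entry (e.g. 3
 * means the L3 entry covering va is not valid, so no L2 exists below it).
 */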
/* Pay close attention, this returns L2. */
static int
pmap_ept_pdes_invalid(struct pmap *pmap, vaddr_t va, pd_entry_t *lastpde)
{
        pt_entry_t *pteva;
        paddr_t ptepa;
        int i, index;

        KASSERT(mutex_owned(&pmap->pm_lock));

        ptepa = pmap->pm_pdirpa[0];
        for (i = PTP_LEVELS; i > 1; i--) {
                pteva = (pt_entry_t *)PMAP_DIRECT_MAP(ptepa);
                index = pl_pi(va, i);
                if (!pmap_ept_valid_entry(pteva[index]))
                        return i;
                ptepa = pmap_pte2pa(pteva[index]);
        }
        if (lastpde != NULL) {
                *lastpde = pteva[index];
        }

        return 0;
}

static bool
pmap_ept_extract(struct pmap *pmap, vaddr_t va, paddr_t *pap)
{
        pt_entry_t *ptes, pte;
        pd_entry_t pde;
        paddr_t ptppa, pa;
        bool rv;

#ifdef __HAVE_DIRECT_MAP
        if (va >= PMAP_DIRECT_BASE && va < PMAP_DIRECT_END) {
                if (pap != NULL) {
                        *pap = PMAP_DIRECT_UNMAP(va);
                }
                return true;
        }
#endif

        rv = false;
        pa = 0;

        mutex_enter(&pmap->pm_lock);
        kpreempt_disable();

        if (!pmap_ept_pdes_invalid(pmap, va, &pde)) {
                ptppa = pmap_pte2pa(pde);
                ptes = (pt_entry_t *)PMAP_DIRECT_MAP(ptppa);
                pte = ptes[pl1_pi(va)];
                if (__predict_true((pte & EPT_R) != 0)) {
                        pa = pmap_pte2pa(pte) | (va & (NBPD_L1 - 1));
                        rv = true;
                }
        }

        kpreempt_enable();
        mutex_exit(&pmap->pm_lock);

        if (pap != NULL) {
                *pap = pa;
        }
        return rv;
}

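/*
 * pmap_ept_remove_pte: zap a single PTE.  Returns true if a valid mapping
 * was removed.  The referenced/modified state of the old mapping is synced
 * into its pmap_page via the PV list before the entry is forgotten, and a
 * TLB shootdown is queued if the mapping may still be live in the TLB.
 * Called with the pmap locked and preemption disabled.
 */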
static bool
pmap_ept_remove_pte(struct pmap *pmap, struct vm_page *ptp, pt_entry_t *pte,
    vaddr_t va)
{
        struct pv_entry *pve;
        struct vm_page *pg;
        struct pmap_page *pp;
        pt_entry_t opte;
        bool accessed;

        KASSERT(pmap != pmap_kernel());
        KASSERT(mutex_owned(&pmap->pm_lock));
        KASSERT(kpreempt_disabled());

        if (!pmap_ept_valid_entry(*pte)) {
                /* VA not mapped. */
                return false;
        }

        /* Atomically save the old PTE and zap it. */
        opte = pmap_pte_testset(pte, 0);
        if (!pmap_ept_valid_entry(opte)) {
                return false;
        }

        pmap_ept_stats_update_bypte(pmap, 0, opte);

        if (ptp) {
                /*
                 * Dropping a PTE.  Make sure that the PDE is flushed.
                 */
                ptp->wire_count--;
                if (ptp->wire_count <= 1) {
                        opte |= EPT_A;
                }
        }

        if (pmap_ept_has_ad) {
                accessed = (opte & EPT_A) != 0;
        } else {
                accessed = true;
        }
        if (accessed) {
                pmap_tlb_shootdown(pmap, va, 0, TLBSHOOT_REMOVE_PTE);
        }

        /*
         * If we are not on a pv list - we are done.
         */
        if ((opte & EPT_PVLIST) == 0) {
                KASSERTMSG((PHYS_TO_VM_PAGE(pmap_pte2pa(opte)) == NULL),
                    "managed page without EPT_PVLIST for %#"PRIxVADDR, va);
                KASSERTMSG((pmap_pv_tracked(pmap_pte2pa(opte)) == NULL),
                    "pv-tracked page without EPT_PVLIST for %#"PRIxVADDR, va);
                KASSERT(pmap_treelookup_pv(pmap, ptp, (ptp != NULL ?
                    &VM_PAGE_TO_PP(ptp)->pp_rb : &pmap_kernel_rb), va) == NULL);
                return true;
        }

        if ((pg = PHYS_TO_VM_PAGE(pmap_pte2pa(opte))) != NULL) {
                pp = VM_PAGE_TO_PP(pg);
        } else if ((pp = pmap_pv_tracked(pmap_pte2pa(opte))) == NULL) {
                paddr_t pa = pmap_pte2pa(opte);
                panic("%s: EPT_PVLIST with pv-untracked page"
                    " va = %#"PRIxVADDR" pa = %#"PRIxPADDR" (%#"PRIxPADDR")",
                    __func__, va, pa, atop(pa));
        }

        /* Sync R/M bits. */
        pve = pmap_lookup_pv(pmap, ptp, pp, va);
        pmap_remove_pv(pmap, pp, ptp, va, pve, pmap_ept_to_pp_attrs(opte));
        return true;
}

static void
pmap_ept_remove_ptes(struct pmap *pmap, struct vm_page *ptp, vaddr_t ptpva,
    vaddr_t startva, vaddr_t endva)
{
        pt_entry_t *pte = (pt_entry_t *)ptpva;

        KASSERT(pmap != pmap_kernel());
        KASSERT(mutex_owned(&pmap->pm_lock));
        KASSERT(kpreempt_disabled());

        /*
         * mappings are very often sparse, so clip the given range to the
         * range of PTEs that are known present in the PTP.
         */
        pmap_ptp_range_clip(ptp, &startva, &pte);

        /*
         * note that ptpva points to the PTE that maps startva.  this may
         * or may not be the first PTE in the PTP.
         *
         * we loop through the PTP while there are still PTEs to look at
         * and the wire_count is greater than 1 (because we use the wire_count
         * to keep track of the number of real PTEs in the PTP).
         */
        while (startva < endva && (ptp == NULL || ptp->wire_count > 1)) {
                (void)pmap_ept_remove_pte(pmap, ptp, pte, startva);
                startva += PAGE_SIZE;
                pte++;
        }
}

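/*
 * pmap_ept_remove: remove all mappings in the range [sva, eva).  The range
 * is processed one PTP-sized block at a time; blocks whose intermediate
 * PDEs are not valid are skipped wholesale, and a PTP whose last mapping
 * has been removed is freed back to the system.
 */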
static void
pmap_ept_remove(struct pmap *pmap, vaddr_t sva, vaddr_t eva)
{
        pt_entry_t *ptes;
        pd_entry_t pde;
        paddr_t ptppa;
        vaddr_t blkendva, va = sva;
        struct vm_page *ptp;

        mutex_enter(&pmap->pm_lock);
        kpreempt_disable();

        for (/* null */ ; va < eva ; va = blkendva) {
                int lvl;

                /* determine range of block */
                blkendva = x86_round_pdr(va+1);
                if (blkendva > eva)
                        blkendva = eva;

                lvl = pmap_ept_pdes_invalid(pmap, va, &pde);
                if (lvl != 0) {
                        /* Skip a range corresponding to an invalid pde. */
                        blkendva = (va & ptp_frames[lvl - 1]) + nbpd[lvl - 1];
                        continue;
                }

                /* PA of the PTP */
                ptppa = pmap_pte2pa(pde);

                ptp = pmap_find_ptp(pmap, va, 1);
                KASSERTMSG(ptp != NULL, "%s: unmanaged PTP detected",
                    __func__);

                ptes = (pt_entry_t *)PMAP_DIRECT_MAP(ptppa);

                pmap_ept_remove_ptes(pmap, ptp, (vaddr_t)&ptes[pl1_pi(va)], va,
                    blkendva);

                /* If PTP is no longer being used, free it. */
                if (ptp && ptp->wire_count <= 1) {
                        pmap_ept_free_ptp(pmap, ptp, va);
                }
        }

        kpreempt_enable();
        pmap_drain_pv(pmap);
        mutex_exit(&pmap->pm_lock);
}

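/*
 * pmap_ept_sync_pv: read and optionally clear A/D/W attributes in the PTE
 * behind a PV entry.  Installed as pmap->pm_sync_pv by pmap_ept_transform()
 * and called from the P->V side with the expected PA; if the PTE no longer
 * maps that PA we lost a race with a V->P operation and return EAGAIN so
 * the caller can retry once the competitor has published its attribute
 * updates.  A TLB shootdown is queued when the cleared bits may still be
 * held in the TLB.
 */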
static int
pmap_ept_sync_pv(struct vm_page *ptp, vaddr_t va, paddr_t pa, int clearbits,
    uint8_t *oattrs, pt_entry_t *optep)
{
        struct pmap *pmap;
        pt_entry_t *ptep;
        pt_entry_t opte;
        pt_entry_t npte;
        pt_entry_t expect;
        bool need_shootdown;

        expect = pmap_pa2pte(pa) | EPT_R;
        pmap = ptp_to_pmap(ptp);

        if (clearbits != ~0) {
                KASSERT((clearbits & ~(PP_ATTRS_D|PP_ATTRS_A|PP_ATTRS_W)) == 0);
                clearbits = pmap_pp_attrs_to_ept(clearbits);
        }

        ptep = pmap_map_pte(pmap, ptp, va);
        do {
                opte = *ptep;
                KASSERT((opte & (EPT_D | EPT_A)) != EPT_D);
                KASSERT((opte & (EPT_A | EPT_R)) != EPT_A);
                KASSERT(opte == 0 || (opte & EPT_R) != 0);
                if ((opte & (PTE_FRAME | EPT_R)) != expect) {
                        /*
                         * We lost a race with a V->P operation like
                         * pmap_remove().  Wait for the competitor
                         * reflecting pte bits into mp_attrs.
                         */
                        pmap_unmap_pte();
                        return EAGAIN;
                }

                /*
                 * Check if there's anything to do on this PTE.
                 */
                if ((opte & clearbits) == 0) {
                        need_shootdown = false;
                        break;
                }

                /*
                 * We need a shootdown if the PTE is cached (EPT_A) ...
                 * ... Unless we are clearing only the EPT_W bit and
                 * it isn't cached as RW (EPT_D).
                 */
                if (pmap_ept_has_ad) {
                        need_shootdown = (opte & EPT_A) != 0 &&
                            !(clearbits == EPT_W && (opte & EPT_D) == 0);
                } else {
                        need_shootdown = true;
                }

                npte = opte & ~clearbits;

                /*
                 * If we need a shootdown anyway, clear EPT_A and EPT_D.
                 */
                if (need_shootdown) {
                        npte &= ~(EPT_A | EPT_D);
                }
                KASSERT((npte & (EPT_D | EPT_A)) != EPT_D);
                KASSERT((npte & (EPT_A | EPT_R)) != EPT_A);
                KASSERT(npte == 0 || (opte & EPT_R) != 0);
        } while (pmap_pte_cas(ptep, opte, npte) != opte);

        if (need_shootdown) {
                pmap_tlb_shootdown(pmap, va, 0, TLBSHOOT_SYNC_PV);
        }
        pmap_unmap_pte();

        *oattrs = pmap_ept_to_pp_attrs(opte);
        if (optep != NULL)
                *optep = opte;
        return 0;
}

static void
pmap_ept_pp_remove_ent(struct pmap *pmap, struct vm_page *ptp, pt_entry_t opte,
    vaddr_t va)
{

        KASSERT(mutex_owned(&pmap->pm_lock));

        pmap_ept_stats_update_bypte(pmap, 0, opte);
        ptp->wire_count--;
        if (ptp->wire_count <= 1) {
                pmap_ept_free_ptp(pmap, ptp, va);
        }
}

static void
pmap_ept_write_protect(struct pmap *pmap, vaddr_t sva, vaddr_t eva, vm_prot_t prot)
{
        pt_entry_t bit_rem;
        pt_entry_t *ptes, *spte;
        pt_entry_t opte, npte;
        pd_entry_t pde;
        paddr_t ptppa;
        vaddr_t va;
        bool modified;

        bit_rem = 0;
        if (!(prot & VM_PROT_WRITE))
                bit_rem = EPT_W;

        sva &= PTE_FRAME;
        eva &= PTE_FRAME;

        /* Acquire pmap. */
        mutex_enter(&pmap->pm_lock);
        kpreempt_disable();

        for (va = sva; va < eva; va += PAGE_SIZE) {
                if (pmap_ept_pdes_invalid(pmap, va, &pde)) {
                        continue;
                }

                ptppa = pmap_pte2pa(pde);
                ptes = (pt_entry_t *)PMAP_DIRECT_MAP(ptppa);
                spte = &ptes[pl1_pi(va)];

                do {
                        opte = *spte;
                        if (!pmap_ept_valid_entry(opte)) {
                                goto next;
                        }
                        npte = (opte & ~bit_rem);
                } while (pmap_pte_cas(spte, opte, npte) != opte);

                if (pmap_ept_has_ad) {
                        modified = (opte & EPT_D) != 0;
                } else {
                        modified = true;
                }
                if (modified) {
                        vaddr_t tva = x86_ptob(spte - ptes);
                        pmap_tlb_shootdown(pmap, tva, 0,
                            TLBSHOOT_WRITE_PROTECT);
                }
next:;
        }

        kpreempt_enable();
        mutex_exit(&pmap->pm_lock);
}

static void
pmap_ept_unwire(struct pmap *pmap, vaddr_t va)
{
        pt_entry_t *ptes, *ptep, opte;
        pd_entry_t pde;
        paddr_t ptppa;

        /* Acquire pmap. */
        mutex_enter(&pmap->pm_lock);
        kpreempt_disable();

        if (pmap_ept_pdes_invalid(pmap, va, &pde)) {
                panic("%s: invalid PDE va=%#" PRIxVADDR, __func__, va);
        }

        ptppa = pmap_pte2pa(pde);
        ptes = (pt_entry_t *)PMAP_DIRECT_MAP(ptppa);
        ptep = &ptes[pl1_pi(va)];
        opte = *ptep;
        KASSERT(pmap_ept_valid_entry(opte));

        if (opte & EPT_WIRED) {
                pt_entry_t npte = opte & ~EPT_WIRED;

                opte = pmap_pte_testset(ptep, npte);
                pmap_ept_stats_update_bypte(pmap, npte, opte);
        } else {
                printf("%s: wiring for pmap %p va %#" PRIxVADDR
                    " did not change!\n", __func__, pmap, va);
        }

        /* Release pmap. */
        kpreempt_enable();
        mutex_exit(&pmap->pm_lock);
}

/* -------------------------------------------------------------------------- */

void
pmap_ept_transform(struct pmap *pmap)
{
        pmap->pm_enter = pmap_ept_enter;
        pmap->pm_extract = pmap_ept_extract;
        pmap->pm_remove = pmap_ept_remove;
        pmap->pm_sync_pv = pmap_ept_sync_pv;
        pmap->pm_pp_remove_ent = pmap_ept_pp_remove_ent;
        pmap->pm_write_protect = pmap_ept_write_protect;
        pmap->pm_unwire = pmap_ept_unwire;

        memset(PAGE_ALIGNED(pmap->pm_pdir), 0, PAGE_SIZE);
}

#endif /* __HAVE_DIRECT_MAP && __x86_64__ && !XENPV */