1 /* $NetBSD: pmap.c,v 1.415 2022/05/13 09:39:40 riastradh Exp $ */ 2 3 /* 4 * Copyright (c) 2008, 2010, 2016, 2017, 2019, 2020 The NetBSD Foundation, Inc. 5 * All rights reserved. 6 * 7 * This code is derived from software contributed to The NetBSD Foundation 8 * by Andrew Doran, and by Maxime Villard. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 29 * POSSIBILITY OF SUCH DAMAGE. 30 */ 31 32 /* 33 * Copyright (c) 2007 Manuel Bouyer. 34 * 35 * Redistribution and use in source and binary forms, with or without 36 * modification, are permitted provided that the following conditions 37 * are met: 38 * 1. Redistributions of source code must retain the above copyright 39 * notice, this list of conditions and the following disclaimer. 40 * 2. Redistributions in binary form must reproduce the above copyright 41 * notice, this list of conditions and the following disclaimer in the 42 * documentation and/or other materials provided with the distribution. 43 * 44 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 45 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 46 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 47 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 48 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 49 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 50 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 51 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 52 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 53 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 54 */ 55 56 /* 57 * Copyright (c) 2006 Mathieu Ropert <mro@adviseo.fr> 58 * 59 * Permission to use, copy, modify, and distribute this software for any 60 * purpose with or without fee is hereby granted, provided that the above 61 * copyright notice and this permission notice appear in all copies. 62 * 63 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 64 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 65 * MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 66 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 67 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 68 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 69 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 70 */ 71 72 /* 73 * Copyright 2001 (c) Wasabi Systems, Inc. 74 * All rights reserved. 75 * 76 * Written by Frank van der Linden for Wasabi Systems, Inc. 77 * 78 * Redistribution and use in source and binary forms, with or without 79 * modification, are permitted provided that the following conditions 80 * are met: 81 * 1. Redistributions of source code must retain the above copyright 82 * notice, this list of conditions and the following disclaimer. 83 * 2. Redistributions in binary form must reproduce the above copyright 84 * notice, this list of conditions and the following disclaimer in the 85 * documentation and/or other materials provided with the distribution. 86 * 3. All advertising materials mentioning features or use of this software 87 * must display the following acknowledgement: 88 * This product includes software developed for the NetBSD Project by 89 * Wasabi Systems, Inc. 90 * 4. The name of Wasabi Systems, Inc. may not be used to endorse 91 * or promote products derived from this software without specific prior 92 * written permission. 93 * 94 * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND 95 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 96 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 97 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL WASABI SYSTEMS, INC 98 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 99 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 100 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 101 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 102 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 103 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 104 * POSSIBILITY OF SUCH DAMAGE. 105 */ 106 107 /* 108 * Copyright (c) 1997 Charles D. Cranor and Washington University. 109 * All rights reserved. 110 * 111 * Redistribution and use in source and binary forms, with or without 112 * modification, are permitted provided that the following conditions 113 * are met: 114 * 1. Redistributions of source code must retain the above copyright 115 * notice, this list of conditions and the following disclaimer. 116 * 2. Redistributions in binary form must reproduce the above copyright 117 * notice, this list of conditions and the following disclaimer in the 118 * documentation and/or other materials provided with the distribution. 119 * 120 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 121 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 122 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
123 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 124 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 125 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 126 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 127 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 128 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 129 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 130 */ 131 132 #include <sys/cdefs.h> 133 __KERNEL_RCSID(0, "$NetBSD: pmap.c,v 1.415 2022/05/13 09:39:40 riastradh Exp $"); 134 135 #include "opt_user_ldt.h" 136 #include "opt_lockdebug.h" 137 #include "opt_multiprocessor.h" 138 #include "opt_xen.h" 139 #include "opt_svs.h" 140 #include "opt_kaslr.h" 141 142 #define __MUTEX_PRIVATE /* for assertions */ 143 144 #include <sys/param.h> 145 #include <sys/systm.h> 146 #include <sys/proc.h> 147 #include <sys/pool.h> 148 #include <sys/kernel.h> 149 #include <sys/atomic.h> 150 #include <sys/cpu.h> 151 #include <sys/intr.h> 152 #include <sys/xcall.h> 153 #include <sys/kcore.h> 154 #include <sys/kmem.h> 155 #include <sys/asan.h> 156 #include <sys/msan.h> 157 #include <sys/entropy.h> 158 159 #include <uvm/uvm.h> 160 #include <uvm/pmap/pmap_pvt.h> 161 162 #include <dev/isa/isareg.h> 163 164 #include <machine/specialreg.h> 165 #include <machine/gdt.h> 166 #include <machine/isa_machdep.h> 167 #include <machine/cpuvar.h> 168 #include <machine/cputypes.h> 169 170 #include <x86/pmap_pv.h> 171 172 #include <x86/i82489reg.h> 173 #include <x86/i82489var.h> 174 175 #ifdef XEN 176 #include <xen/include/public/xen.h> 177 #include <xen/hypervisor.h> 178 #include <xen/xenpmap.h> 179 #endif 180 181 #ifdef __HAVE_DIRECT_MAP 182 #include <crypto/nist_hash_drbg/nist_hash_drbg.h> 183 #endif 184 185 /* 186 * general info: 187 * 188 * - for an explanation of how the x86 MMU hardware works see 189 * the comments in <machine/pte.h>. 190 * 191 * - for an explanation of the general memory structure used by 192 * this pmap (including the recursive mapping), see the comments 193 * in <machine/pmap.h>. 194 * 195 * this file contains the code for the "pmap module." the module's 196 * job is to manage the hardware's virtual to physical address mappings. 197 * note that there are two levels of mapping in the VM system: 198 * 199 * [1] the upper layer of the VM system uses vm_map's and vm_map_entry's 200 * to map ranges of virtual address space to objects/files. for 201 * example, the vm_map may say: "map VA 0x1000 to 0x22000 read-only 202 * to the file /bin/ls starting at offset zero." note that 203 * the upper layer mapping is not concerned with how individual 204 * vm_pages are mapped. 205 * 206 * [2] the lower layer of the VM system (the pmap) maintains the mappings 207 * from virtual addresses. it is concerned with which vm_page is 208 * mapped where. for example, when you run /bin/ls and start 209 * at page 0x1000 the fault routine may lookup the correct page 210 * of the /bin/ls file and then ask the pmap layer to establish 211 * a mapping for it. 212 * 213 * note that information in the lower layer of the VM system can be 214 * thrown away since it can easily be reconstructed from the info 215 * in the upper layer. 
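 *
 * as an illustrative sketch (not a literal call chain), a fault on a
 * page of /bin/ls typically ends with the upper layer asking the pmap
 * to establish the translation and later flushing stale TLB state:
 *
 *	error = pmap_enter(vm_map_pmap(map), va, VM_PAGE_TO_PHYS(pg),
 *	    VM_PROT_READ, PMAP_CANFAIL);
 *	...
 *	pmap_update(vm_map_pmap(map));
 *
 * i.e. the upper layer decides what should be mapped; the pmap layer
 * records how it is mapped in the hardware page tables.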
216 * 217 * data structures we use include: 218 * 219 * - struct pmap: describes the address space of one thread 220 * - struct pmap_page: describes one pv-tracked page, without 221 * necessarily a corresponding vm_page 222 * - struct pv_entry: describes one <PMAP,VA> mapping of a PA 223 * - pmap_page::pp_pvlist: there is one list per pv-tracked page of 224 * physical memory. the pp_pvlist points to a list of pv_entry 225 * structures which describe all the <PMAP,VA> pairs that this 226 * page is mapped in. this is critical for page based operations 227 * such as pmap_page_protect() [change protection on _all_ mappings 228 * of a page] 229 */ 230 231 /* 232 * Locking 233 * 234 * We have the following locks that we must deal with, listed in the order 235 * that they are acquired: 236 * 237 * pg->uobject->vmobjlock, pg->uanon->an_lock 238 * 239 * For managed pages, these per-object locks are taken by the VM system 240 * before calling into the pmap module - either a read or write hold. 241 * The lock hold prevents pages from changing identity while the pmap is 242 * operating on them. For example, the same lock is held across a call 243 * to pmap_remove() and the following call to pmap_update(), so that a 244 * page does not gain a new identity while its TLB visibility is stale. 245 * 246 * pmap->pm_lock 247 * 248 * This lock protects the fields in the pmap structure including the 249 * non-kernel PDEs in the PDP, the PTEs, and PTPs and connected data 250 * structures. For modifying unmanaged kernel PTEs it is not needed as 251 * kernel PDEs are never freed, and the kernel is expected to be self 252 * consistent (and the lock can't be taken for unmanaged kernel PTEs, 253 * because they can be modified from interrupt context). 254 * 255 * pmaps_lock 256 * 257 * This lock protects the list of active pmaps (headed by "pmaps"). 258 * It's acquired when adding or removing pmaps or adjusting kernel PDEs. 259 * 260 * pp_lock 261 * 262 * This per-page lock protects PV entry lists and the embedded PV entry 263 * in each vm_page, allowing for concurrent operation on pages by 264 * different pmaps. This is a spin mutex at IPL_VM, because at the 265 * points it is taken context switching is usually not tolerable, and 266 * spin mutexes must block out interrupts that could take kernel_lock. 267 */ 268 269 /* uvm_object is abused here to index pmap_pages; make assertions happy. */ 270 #ifdef DIAGNOSTIC 271 #define PMAP_DUMMY_LOCK(pm) rw_enter(&(pm)->pm_dummy_lock, RW_WRITER) 272 #define PMAP_DUMMY_UNLOCK(pm) rw_exit(&(pm)->pm_dummy_lock) 273 #else 274 #define PMAP_DUMMY_LOCK(pm) 275 #define PMAP_DUMMY_UNLOCK(pm) 276 #endif 277 278 static const struct uvm_pagerops pmap_pager = { 279 /* nothing */ 280 }; 281 282 const vaddr_t ptp_masks[] = PTP_MASK_INITIALIZER; 283 const vaddr_t ptp_frames[] = PTP_FRAME_INITIALIZER; 284 const int ptp_shifts[] = PTP_SHIFT_INITIALIZER; 285 const long nkptpmax[] = NKPTPMAX_INITIALIZER; 286 const long nbpd[] = NBPD_INITIALIZER; 287 #ifdef i386 288 pd_entry_t * const normal_pdes[] = PDES_INITIALIZER; 289 #else 290 pd_entry_t *normal_pdes[3]; 291 #endif 292 293 long nkptp[] = NKPTP_INITIALIZER; 294 295 struct pmap_head pmaps; 296 kmutex_t pmaps_lock __cacheline_aligned; 297 298 struct pcpu_area *pcpuarea __read_mostly; 299 300 static vaddr_t pmap_maxkvaddr; 301 302 /* 303 * Misc. event counters.
304 */ 305 struct evcnt pmap_iobmp_evcnt; 306 struct evcnt pmap_ldt_evcnt; 307 308 /* 309 * PAT 310 */ 311 static bool cpu_pat_enabled __read_mostly = false; 312 313 /* 314 * Global data structures 315 */ 316 317 static struct pmap kernel_pmap_store __cacheline_aligned; /* kernel's pmap */ 318 struct pmap *const kernel_pmap_ptr = &kernel_pmap_store; 319 static rb_tree_t pmap_kernel_rb __cacheline_aligned; 320 321 struct bootspace bootspace __read_mostly; 322 struct slotspace slotspace __read_mostly; 323 324 /* Set to PTE_NX if supported. */ 325 pd_entry_t pmap_pg_nx __read_mostly = 0; 326 327 /* Set to PTE_G if supported. */ 328 pd_entry_t pmap_pg_g __read_mostly = 0; 329 330 /* Set to true if large pages are supported. */ 331 int pmap_largepages __read_mostly = 0; 332 333 paddr_t lowmem_rsvd __read_mostly; 334 paddr_t avail_start __read_mostly; /* PA of first available physical page */ 335 paddr_t avail_end __read_mostly; /* PA of last available physical page */ 336 337 #ifdef XENPV 338 paddr_t pmap_pa_start; /* PA of first physical page for this domain */ 339 paddr_t pmap_pa_end; /* PA of last physical page for this domain */ 340 #endif 341 342 #define VM_PAGE_TO_PP(pg) (&(pg)->mdpage.mp_pp) 343 #define PMAP_CHECK_PP(pp) \ 344 KASSERTMSG((pp)->pp_lock.mtx_ipl._ipl == IPL_VM, "bad pmap_page %p", pp) 345 346 #define PAGE_ALIGNED(pp) \ 347 __builtin_assume_aligned((void *)(pp), PAGE_SIZE) 348 349 /* 350 * Other data structures 351 */ 352 353 static pt_entry_t protection_codes[8] __read_mostly; 354 355 static bool pmap_initialized __read_mostly = false; /* pmap_init done yet? */ 356 357 /* 358 * The following two vaddr_t's are used during system startup to keep track of 359 * how much of the kernel's VM space we have used. Once the system is started, 360 * the management of the remaining kernel VM space is turned over to the 361 * kernel_map vm_map. 362 */ 363 static vaddr_t virtual_avail __read_mostly; /* VA of first free KVA */ 364 static vaddr_t virtual_end __read_mostly; /* VA of last free KVA */ 365 366 #ifndef XENPV 367 /* 368 * LAPIC virtual address, and fake physical address. 
369 */ 370 volatile vaddr_t local_apic_va __read_mostly; 371 paddr_t local_apic_pa __read_mostly; 372 #endif 373 374 /* 375 * pool that pmap structures are allocated from 376 */ 377 struct pool_cache pmap_cache; 378 static int pmap_ctor(void *, void *, int); 379 static void pmap_dtor(void *, void *); 380 381 /* 382 * pv_page cache 383 */ 384 static struct pool_cache pmap_pvp_cache; 385 386 #ifdef __HAVE_DIRECT_MAP 387 vaddr_t pmap_direct_base __read_mostly; 388 vaddr_t pmap_direct_end __read_mostly; 389 #endif 390 391 #ifndef __HAVE_DIRECT_MAP 392 /* 393 * Special VAs and the PTEs that map them 394 */ 395 static pt_entry_t *early_zero_pte; 396 static void pmap_vpage_cpualloc(struct cpu_info *); 397 #ifdef XENPV 398 char *early_zerop; /* also referenced from xen_locore() */ 399 #else 400 static char *early_zerop; 401 #endif 402 #endif 403 404 int pmap_enter_default(pmap_t, vaddr_t, paddr_t, vm_prot_t, u_int); 405 406 /* PDP pool and its callbacks */ 407 static struct pool pmap_pdp_pool; 408 static void pmap_pdp_init(pd_entry_t *); 409 static void pmap_pdp_fini(pd_entry_t *); 410 411 #ifdef PAE 412 /* need to allocate items of 4 pages */ 413 static void *pmap_pdp_alloc(struct pool *, int); 414 static void pmap_pdp_free(struct pool *, void *); 415 static struct pool_allocator pmap_pdp_allocator = { 416 .pa_alloc = pmap_pdp_alloc, 417 .pa_free = pmap_pdp_free, 418 .pa_pagesz = PAGE_SIZE * PDP_SIZE, 419 }; 420 #endif 421 422 extern vaddr_t idt_vaddr; 423 extern paddr_t idt_paddr; 424 extern vaddr_t gdt_vaddr; 425 extern paddr_t gdt_paddr; 426 extern vaddr_t ldt_vaddr; 427 extern paddr_t ldt_paddr; 428 429 #ifdef i386 430 /* stuff to fix the pentium f00f bug */ 431 extern vaddr_t pentium_idt_vaddr; 432 #endif 433 434 /* Array of freshly allocated PTPs, for pmap_get_ptp(). */ 435 struct pmap_ptparray { 436 struct vm_page *pg[PTP_LEVELS + 1]; 437 bool alloced[PTP_LEVELS + 1]; 438 }; 439 440 /* 441 * PV entries are allocated in page-sized chunks and cached per-pmap to 442 * avoid intense pressure on memory allocators. 
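 *
 * illustrative arithmetic (sizes are MD and not guaranteed): with a
 * 4096-byte page and a pv_entry of, say, 64 bytes, one chunk holds
 * (4096 / 64) - 1 = 63 entries; one slot is given up to the
 * struct pv_page header at the start of the page, which is why
 * PVE_PER_PVP below subtracts one.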
443 */ 444 445 struct pv_page { 446 LIST_HEAD(, pv_entry) pvp_pves; 447 LIST_ENTRY(pv_page) pvp_list; 448 long pvp_nfree; 449 struct pmap *pvp_pmap; 450 }; 451 452 #define PVE_PER_PVP ((PAGE_SIZE / sizeof(struct pv_entry)) - 1) 453 454 /* 455 * PV tree prototypes 456 */ 457 458 static int pmap_compare_key(void *, const void *, const void *); 459 static int pmap_compare_nodes(void *, const void *, const void *); 460 461 /* Red-black tree */ 462 static const rb_tree_ops_t pmap_rbtree_ops = { 463 .rbto_compare_nodes = pmap_compare_nodes, 464 .rbto_compare_key = pmap_compare_key, 465 .rbto_node_offset = offsetof(struct pv_entry, pve_rb), 466 .rbto_context = NULL 467 }; 468 469 /* 470 * Local prototypes 471 */ 472 473 #ifdef __HAVE_PCPU_AREA 474 static void pmap_init_pcpu(void); 475 #endif 476 #ifdef __HAVE_DIRECT_MAP 477 static void pmap_init_directmap(struct pmap *); 478 #endif 479 #if !defined(XENPV) 480 static void pmap_remap_global(void); 481 #endif 482 #ifndef XENPV 483 static void pmap_init_lapic(void); 484 static void pmap_remap_largepages(void); 485 #endif 486 487 static int pmap_get_ptp(struct pmap *, struct pmap_ptparray *, vaddr_t, int, 488 struct vm_page **); 489 static void pmap_unget_ptp(struct pmap *, struct pmap_ptparray *); 490 static void pmap_install_ptp(struct pmap *, struct pmap_ptparray *, vaddr_t, 491 pd_entry_t * const *); 492 static struct vm_page *pmap_find_ptp(struct pmap *, vaddr_t, int); 493 static void pmap_freepage(struct pmap *, struct vm_page *, int); 494 static void pmap_free_ptp(struct pmap *, struct vm_page *, vaddr_t, 495 pt_entry_t *, pd_entry_t * const *); 496 static bool pmap_remove_pte(struct pmap *, struct vm_page *, pt_entry_t *, 497 vaddr_t); 498 static void pmap_remove_ptes(struct pmap *, struct vm_page *, vaddr_t, vaddr_t, 499 vaddr_t); 500 static int pmap_pvp_ctor(void *, void *, int); 501 static void pmap_pvp_dtor(void *, void *); 502 static struct pv_entry *pmap_alloc_pv(struct pmap *); 503 static void pmap_free_pv(struct pmap *, struct pv_entry *); 504 static void pmap_drain_pv(struct pmap *); 505 506 static void pmap_alloc_level(struct pmap *, vaddr_t, long *); 507 508 static void pmap_load1(struct lwp *, struct pmap *, struct pmap *); 509 static void pmap_reactivate(struct pmap *); 510 511 /* 512 * p m a p h e l p e r f u n c t i o n s 513 */ 514 515 static inline void 516 pmap_stats_update(struct pmap *pmap, int resid_diff, int wired_diff) 517 { 518 519 KASSERT(cold || mutex_owned(&pmap->pm_lock)); 520 pmap->pm_stats.resident_count += resid_diff; 521 pmap->pm_stats.wired_count += wired_diff; 522 } 523 524 static inline void 525 pmap_stats_update_bypte(struct pmap *pmap, pt_entry_t npte, pt_entry_t opte) 526 { 527 int resid_diff = ((npte & PTE_P) ? 1 : 0) - ((opte & PTE_P) ? 1 : 0); 528 int wired_diff = ((npte & PTE_WIRED) ? 1 : 0) - ((opte & PTE_WIRED) ?
1 : 0); 529 530 KASSERT((npte & (PTE_P | PTE_WIRED)) != PTE_WIRED); 531 KASSERT((opte & (PTE_P | PTE_WIRED)) != PTE_WIRED); 532 533 pmap_stats_update(pmap, resid_diff, wired_diff); 534 } 535 536 /* 537 * ptp_to_pmap: lookup pmap by ptp 538 */ 539 static inline struct pmap * 540 ptp_to_pmap(struct vm_page *ptp) 541 { 542 struct pmap *pmap; 543 544 if (ptp == NULL) { 545 return pmap_kernel(); 546 } 547 pmap = (struct pmap *)ptp->uobject; 548 KASSERT(pmap != NULL); 549 KASSERT(&pmap->pm_obj[0] == ptp->uobject); 550 return pmap; 551 } 552 553 static inline struct pv_pte * 554 pve_to_pvpte(struct pv_entry *pve) 555 { 556 557 if (pve == NULL) 558 return NULL; 559 KASSERT((void *)&pve->pve_pte == (void *)pve); 560 return &pve->pve_pte; 561 } 562 563 static inline struct pv_entry * 564 pvpte_to_pve(struct pv_pte *pvpte) 565 { 566 struct pv_entry *pve = (void *)pvpte; 567 568 KASSERT(pve_to_pvpte(pve) == pvpte); 569 return pve; 570 } 571 572 /* 573 * Return true if the pmap page has an embedded PV entry. 574 */ 575 static inline bool 576 pv_pte_embedded(struct pmap_page *pp) 577 { 578 579 KASSERT(mutex_owned(&pp->pp_lock)); 580 return (bool)((vaddr_t)pp->pp_pte.pte_ptp | pp->pp_pte.pte_va); 581 } 582 583 /* 584 * pv_pte_first, pv_pte_next: PV list iterator. 585 */ 586 static inline struct pv_pte * 587 pv_pte_first(struct pmap_page *pp) 588 { 589 590 KASSERT(mutex_owned(&pp->pp_lock)); 591 if (pv_pte_embedded(pp)) { 592 return &pp->pp_pte; 593 } 594 return pve_to_pvpte(LIST_FIRST(&pp->pp_pvlist)); 595 } 596 597 static inline struct pv_pte * 598 pv_pte_next(struct pmap_page *pp, struct pv_pte *pvpte) 599 { 600 601 KASSERT(mutex_owned(&pp->pp_lock)); 602 KASSERT(pvpte != NULL); 603 if (pvpte == &pp->pp_pte) { 604 return pve_to_pvpte(LIST_FIRST(&pp->pp_pvlist)); 605 } 606 return pve_to_pvpte(LIST_NEXT(pvpte_to_pve(pvpte), pve_list)); 607 } 608 609 static inline uint8_t 610 pmap_pte_to_pp_attrs(pt_entry_t pte) 611 { 612 uint8_t ret = 0; 613 if (pte & PTE_D) 614 ret |= PP_ATTRS_D; 615 if (pte & PTE_A) 616 ret |= PP_ATTRS_A; 617 if (pte & PTE_W) 618 ret |= PP_ATTRS_W; 619 return ret; 620 } 621 622 static inline pt_entry_t 623 pmap_pp_attrs_to_pte(uint8_t attrs) 624 { 625 pt_entry_t pte = 0; 626 if (attrs & PP_ATTRS_D) 627 pte |= PTE_D; 628 if (attrs & PP_ATTRS_A) 629 pte |= PTE_A; 630 if (attrs & PP_ATTRS_W) 631 pte |= PTE_W; 632 return pte; 633 } 634 635 /* 636 * pmap_is_curpmap: is this pmap the one currently loaded [in %cr3]? 637 * of course the kernel is always loaded 638 */ 639 bool 640 pmap_is_curpmap(struct pmap *pmap) 641 { 642 return ((pmap == pmap_kernel()) || (pmap == curcpu()->ci_pmap)); 643 } 644 645 inline void 646 pmap_reference(struct pmap *pmap) 647 { 648 649 atomic_inc_uint(&pmap->pm_obj[0].uo_refs); 650 } 651 652 /* 653 * rbtree: compare two nodes. 654 */ 655 static int 656 pmap_compare_nodes(void *context, const void *n1, const void *n2) 657 { 658 const struct pv_entry *pve1 = n1; 659 const struct pv_entry *pve2 = n2; 660 661 KASSERT(pve1->pve_pte.pte_ptp == pve2->pve_pte.pte_ptp); 662 663 if (pve1->pve_pte.pte_va < pve2->pve_pte.pte_va) { 664 return -1; 665 } 666 if (pve1->pve_pte.pte_va > pve2->pve_pte.pte_va) { 667 return 1; 668 } 669 return 0; 670 } 671 672 /* 673 * rbtree: compare a node and a key. 
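 *
 * (illustrative) this is the hook consulted when the tree is searched
 * with a plain VA as the key, along the lines of:
 *
 *	pve = rb_tree_find_node(&pmap_kernel_rb, (void *)va);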
674 */ 675 static int 676 pmap_compare_key(void *context, const void *n, const void *k) 677 { 678 const struct pv_entry *pve = n; 679 const vaddr_t key = (vaddr_t)k; 680 681 if (pve->pve_pte.pte_va < key) { 682 return -1; 683 } 684 if (pve->pve_pte.pte_va > key) { 685 return 1; 686 } 687 return 0; 688 } 689 690 /* 691 * pmap_ptp_range_set: abuse ptp->uanon to record minimum VA of PTE 692 */ 693 static inline void 694 pmap_ptp_range_set(struct vm_page *ptp, vaddr_t va) 695 { 696 vaddr_t *min = (vaddr_t *)&ptp->uanon; 697 698 if (va < *min) { 699 *min = va; 700 } 701 } 702 703 /* 704 * pmap_ptp_range_clip: abuse ptp->uanon to clip range of PTEs to remove 705 */ 706 static inline void 707 pmap_ptp_range_clip(struct vm_page *ptp, vaddr_t *startva, pt_entry_t **pte) 708 { 709 vaddr_t sclip; 710 711 if (ptp == NULL) { 712 return; 713 } 714 715 sclip = (vaddr_t)ptp->uanon; 716 sclip = (*startva < sclip ? sclip : *startva); 717 *pte += (sclip - *startva) / PAGE_SIZE; 718 *startva = sclip; 719 } 720 721 /* 722 * pmap_map_ptes: map a pmap's PTEs into KVM and lock them in 723 * 724 * there are several pmaps involved. some or all of them might be the same. 725 * 726 * - the pmap given by the first argument 727 * our caller wants to access this pmap's PTEs. 728 * 729 * - pmap_kernel() 730 * the kernel pmap. note that it only contains the kernel part 731 * of the address space which is shared by any pmap. ie. any 732 * pmap can be used instead of pmap_kernel() for our purpose. 733 * 734 * - ci->ci_pmap 735 * pmap currently loaded on the cpu. 736 * 737 * - vm_map_pmap(&curproc->p_vmspace->vm_map) 738 * current process' pmap. 739 * 740 * => caller must lock pmap first (if not the kernel pmap) 741 * => must be undone with pmap_unmap_ptes before returning 742 * => disables kernel preemption 743 */ 744 void 745 pmap_map_ptes(struct pmap *pmap, struct pmap **pmap2, pd_entry_t **ptepp, 746 pd_entry_t * const **pdeppp) 747 { 748 struct pmap *curpmap; 749 struct cpu_info *ci; 750 lwp_t *l; 751 752 kpreempt_disable(); 753 754 /* The kernel's pmap is always accessible. */ 755 if (pmap == pmap_kernel()) { 756 *pmap2 = NULL; 757 *ptepp = PTE_BASE; 758 *pdeppp = normal_pdes; 759 return; 760 } 761 762 KASSERT(mutex_owned(&pmap->pm_lock)); 763 764 l = curlwp; 765 ci = l->l_cpu; 766 curpmap = ci->ci_pmap; 767 if (pmap == curpmap) { 768 /* 769 * Already on the CPU: make it valid. This is very 770 * often the case during exit(), when we have switched 771 * to the kernel pmap in order to destroy a user pmap. 772 */ 773 if (__predict_false(ci->ci_tlbstate != TLBSTATE_VALID)) { 774 pmap_reactivate(pmap); 775 } 776 *pmap2 = NULL; 777 } else { 778 /* 779 * Toss current pmap from CPU and install new pmap, but keep 780 * a reference to the old one. Dropping the reference 781 * can block as it needs to take locks, so defer that to 782 * pmap_unmap_ptes(). 783 */ 784 pmap_reference(pmap); 785 pmap_load1(l, pmap, curpmap); 786 *pmap2 = curpmap; 787 } 788 KASSERT(ci->ci_tlbstate == TLBSTATE_VALID); 789 #ifdef DIAGNOSTIC 790 pmap->pm_ncsw = lwp_pctr(); 791 #endif 792 *ptepp = PTE_BASE; 793 794 #if defined(XENPV) && defined(__x86_64__) 795 KASSERT(ci->ci_normal_pdes[PTP_LEVELS - 2] == L4_BASE); 796 ci->ci_normal_pdes[PTP_LEVELS - 2] = pmap->pm_pdir; 797 *pdeppp = ci->ci_normal_pdes; 798 #else 799 *pdeppp = normal_pdes; 800 #endif 801 } 802 803 /* 804 * pmap_unmap_ptes: unlock the PTE mapping of "pmap" 805 * 806 * => we cannot tolerate context switches while mapped in: assert this. 807 * => reenables kernel preemption.
808 * => does not unlock pmap. 809 */ 810 void 811 pmap_unmap_ptes(struct pmap *pmap, struct pmap * pmap2) 812 { 813 struct cpu_info *ci; 814 struct pmap *mypmap; 815 struct lwp *l; 816 817 KASSERT(kpreempt_disabled()); 818 819 /* The kernel's pmap is always accessible. */ 820 if (pmap == pmap_kernel()) { 821 kpreempt_enable(); 822 return; 823 } 824 825 l = curlwp; 826 ci = l->l_cpu; 827 828 KASSERT(mutex_owned(&pmap->pm_lock)); 829 KASSERT(pmap->pm_ncsw == lwp_pctr()); 830 831 #if defined(XENPV) && defined(__x86_64__) 832 KASSERT(ci->ci_normal_pdes[PTP_LEVELS - 2] != L4_BASE); 833 ci->ci_normal_pdes[PTP_LEVELS - 2] = L4_BASE; 834 #endif 835 836 /* If not our own pmap, mark whatever's on the CPU now as lazy. */ 837 KASSERT(ci->ci_tlbstate == TLBSTATE_VALID); 838 mypmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map); 839 if (ci->ci_pmap == vm_map_pmap(&l->l_proc->p_vmspace->vm_map)) { 840 ci->ci_want_pmapload = 0; 841 } else { 842 ci->ci_want_pmapload = (mypmap != pmap_kernel()); 843 ci->ci_tlbstate = TLBSTATE_LAZY; 844 } 845 846 /* Now safe to re-enable preemption. */ 847 kpreempt_enable(); 848 849 /* Toss reference to other pmap taken earlier. */ 850 if (pmap2 != NULL) { 851 pmap_destroy(pmap2); 852 } 853 } 854 855 inline static void 856 pmap_exec_account(struct pmap *pm, vaddr_t va, pt_entry_t opte, pt_entry_t npte) 857 { 858 859 #if !defined(__x86_64__) 860 if (curproc == NULL || curproc->p_vmspace == NULL || 861 pm != vm_map_pmap(&curproc->p_vmspace->vm_map)) 862 return; 863 864 if ((opte ^ npte) & PTE_X) 865 pmap_update_pg(va); 866 867 /* 868 * Executability was removed on the last executable change. 869 * Reset the code segment to something conservative and 870 * let the trap handler deal with setting the right limit. 871 * We can't do that because of locking constraints on the vm map. 872 */ 873 874 if ((opte & PTE_X) && (npte & PTE_X) == 0 && va == pm->pm_hiexec) { 875 struct trapframe *tf = curlwp->l_md.md_regs; 876 877 tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL); 878 pm->pm_hiexec = I386_MAX_EXE_ADDR; 879 } 880 #endif /* !defined(__x86_64__) */ 881 } 882 883 #if !defined(__x86_64__) 884 /* 885 * Fixup the code segment to cover all potential executable mappings. 886 * returns 0 if no changes to the code segment were made. 887 */ 888 int 889 pmap_exec_fixup(struct vm_map *map, struct trapframe *tf, struct pcb *pcb) 890 { 891 struct vm_map_entry *ent; 892 struct pmap *pm = vm_map_pmap(map); 893 vaddr_t va = 0; 894 895 vm_map_lock_read(map); 896 for (ent = (&map->header)->next; ent != &map->header; ent = ent->next) { 897 /* 898 * This entry has greater va than the entries before. 899 * We need to make it point to the last page, not past it. 900 */ 901 if (ent->protection & VM_PROT_EXECUTE) 902 va = trunc_page(ent->end) - PAGE_SIZE; 903 } 904 vm_map_unlock_read(map); 905 if (va == pm->pm_hiexec && tf->tf_cs == GSEL(GUCODEBIG_SEL, SEL_UPL)) 906 return 0; 907 908 pm->pm_hiexec = va; 909 if (pm->pm_hiexec > I386_MAX_EXE_ADDR) { 910 tf->tf_cs = GSEL(GUCODEBIG_SEL, SEL_UPL); 911 } else { 912 tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL); 913 return 0; 914 } 915 return 1; 916 } 917 #endif /* !defined(__x86_64__) */ 918 919 void 920 pat_init(struct cpu_info *ci) 921 { 922 #ifndef XENPV 923 uint64_t pat; 924 925 if (!(ci->ci_feat_val[0] & CPUID_PAT)) 926 return; 927 928 /* We change WT to WC. Leave all other entries the default values. 
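	 * (Illustrative, not the authoritative macro definition:) each PAT
	 * entry occupies one byte of the IA32_PAT MSR, so PATENTRY(n, type)
	 * amounts to ((uint64_t)(type) << ((n) * 8)); entry 1, normally WT,
	 * is reprogrammed to WC below.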
*/ 929 pat = PATENTRY(0, PAT_WB) | PATENTRY(1, PAT_WC) | 930 PATENTRY(2, PAT_UCMINUS) | PATENTRY(3, PAT_UC) | 931 PATENTRY(4, PAT_WB) | PATENTRY(5, PAT_WC) | 932 PATENTRY(6, PAT_UCMINUS) | PATENTRY(7, PAT_UC); 933 934 wrmsr(MSR_CR_PAT, pat); 935 cpu_pat_enabled = true; 936 #endif 937 } 938 939 static pt_entry_t 940 pmap_pat_flags(u_int flags) 941 { 942 u_int cacheflags = (flags & PMAP_CACHE_MASK); 943 944 if (!cpu_pat_enabled) { 945 switch (cacheflags) { 946 case PMAP_NOCACHE: 947 case PMAP_NOCACHE_OVR: 948 /* results in PGC_UCMINUS on cpus which have 949 * the cpuid PAT but PAT "disabled" 950 */ 951 return PTE_PCD; 952 default: 953 return 0; 954 } 955 } 956 957 switch (cacheflags) { 958 case PMAP_NOCACHE: 959 return PGC_UC; 960 case PMAP_WRITE_COMBINE: 961 return PGC_WC; 962 case PMAP_WRITE_BACK: 963 return PGC_WB; 964 case PMAP_NOCACHE_OVR: 965 return PGC_UCMINUS; 966 } 967 968 return 0; 969 } 970 971 /* 972 * p m a p k e n t e r f u n c t i o n s 973 * 974 * functions to quickly enter/remove pages from the kernel address 975 * space. pmap_kremove is exported to MI kernel. we make use of 976 * the recursive PTE mappings. 977 */ 978 979 /* 980 * pmap_kenter_pa: enter a kernel mapping without R/M (pv_entry) tracking 981 * 982 * => no need to lock anything, assume va is already allocated 983 * => should be faster than normal pmap enter function 984 */ 985 void 986 pmap_kenter_pa(vaddr_t va, paddr_t pa, vm_prot_t prot, u_int flags) 987 { 988 pt_entry_t *pte, opte, npte; 989 990 KASSERT(!(prot & ~VM_PROT_ALL)); 991 992 if (va < VM_MIN_KERNEL_ADDRESS) 993 pte = vtopte(va); 994 else 995 pte = kvtopte(va); 996 #if defined(XENPV) && defined(DOM0OPS) 997 if (pa < pmap_pa_start || pa >= pmap_pa_end) { 998 #ifdef DEBUG 999 printf_nolog("%s: pa %#" PRIxPADDR " for va %#" PRIxVADDR 1000 " outside range\n", __func__, pa, va); 1001 #endif /* DEBUG */ 1002 npte = pa; 1003 } else 1004 #endif /* XENPV && DOM0OPS */ 1005 npte = pmap_pa2pte(pa); 1006 npte |= protection_codes[prot] | PTE_P | pmap_pg_g; 1007 npte |= pmap_pat_flags(flags); 1008 opte = pmap_pte_testset(pte, npte); /* zap! */ 1009 1010 /* 1011 * XXX: make sure we are not dealing with a large page, since the only 1012 * large pages created are for the kernel image, and they should never 1013 * be kentered. 1014 */ 1015 KASSERTMSG(!(opte & PTE_PS), "PTE_PS va=%#"PRIxVADDR, va); 1016 1017 if ((opte & (PTE_P | PTE_A)) == (PTE_P | PTE_A)) { 1018 /* This should not happen. */ 1019 printf_nolog("%s: mapping already present\n", __func__); 1020 kpreempt_disable(); 1021 pmap_tlb_shootdown(pmap_kernel(), va, opte, TLBSHOOT_KENTER); 1022 kpreempt_enable(); 1023 } 1024 } 1025 1026 __strict_weak_alias(pmap_kenter_ma, pmap_kenter_pa); 1027 1028 #if defined(__x86_64__) 1029 /* 1030 * Change protection for a virtual address. Local for a CPU only, don't 1031 * care about TLB shootdowns. 
1032 * 1033 * => must be called with preemption disabled 1034 */ 1035 void 1036 pmap_changeprot_local(vaddr_t va, vm_prot_t prot) 1037 { 1038 pt_entry_t *pte, opte, npte; 1039 1040 KASSERT(kpreempt_disabled()); 1041 1042 if (va < VM_MIN_KERNEL_ADDRESS) 1043 pte = vtopte(va); 1044 else 1045 pte = kvtopte(va); 1046 1047 npte = opte = *pte; 1048 1049 if ((prot & VM_PROT_WRITE) != 0) 1050 npte |= PTE_W; 1051 else 1052 npte &= ~(PTE_W|PTE_D); 1053 1054 if (opte != npte) { 1055 pmap_pte_set(pte, npte); 1056 pmap_pte_flush(); 1057 invlpg(va); 1058 } 1059 } 1060 #endif /* defined(__x86_64__) */ 1061 1062 /* 1063 * pmap_kremove: remove a kernel mapping(s) without R/M (pv_entry) tracking 1064 * 1065 * => no need to lock anything 1066 * => caller must dispose of any vm_page mapped in the va range 1067 * => note: not an inline function 1068 * => we assume the va is page aligned and the len is a multiple of PAGE_SIZE 1069 * => we assume kernel only unmaps valid addresses and thus don't bother 1070 * checking the valid bit before doing TLB flushing 1071 * => must be followed by call to pmap_update() before reuse of page 1072 */ 1073 static void 1074 pmap_kremove1(vaddr_t sva, vsize_t len, bool localonly) 1075 { 1076 pt_entry_t *pte, opte; 1077 vaddr_t va, eva; 1078 1079 eva = sva + len; 1080 1081 kpreempt_disable(); 1082 for (va = sva; va < eva; va += PAGE_SIZE) { 1083 pte = kvtopte(va); 1084 opte = pmap_pte_testset(pte, 0); /* zap! */ 1085 if ((opte & (PTE_P | PTE_A)) == (PTE_P | PTE_A) && !localonly) { 1086 pmap_tlb_shootdown(pmap_kernel(), va, opte, 1087 TLBSHOOT_KREMOVE); 1088 } 1089 KASSERTMSG((opte & PTE_PS) == 0, 1090 "va %#" PRIxVADDR " is a large page", va); 1091 KASSERTMSG((opte & PTE_PVLIST) == 0, 1092 "va %#" PRIxVADDR " is a pv tracked page", va); 1093 } 1094 if (localonly) { 1095 tlbflushg(); 1096 } 1097 kpreempt_enable(); 1098 } 1099 1100 void 1101 pmap_kremove(vaddr_t sva, vsize_t len) 1102 { 1103 1104 pmap_kremove1(sva, len, false); 1105 } 1106 1107 /* 1108 * pmap_kremove_local: like pmap_kremove(), but only worry about 1109 * TLB invalidations on the current CPU. this is only intended 1110 * for use while writing kernel crash dumps, either after panic 1111 * or via reboot -d. 1112 */ 1113 void 1114 pmap_kremove_local(vaddr_t sva, vsize_t len) 1115 { 1116 1117 pmap_kremove1(sva, len, true); 1118 } 1119 1120 /* 1121 * p m a p i n i t f u n c t i o n s 1122 * 1123 * pmap_bootstrap and pmap_init are called during system startup 1124 * to init the pmap module. pmap_bootstrap() does a low level 1125 * init just to get things rolling. pmap_init() finishes the job. 1126 */ 1127 1128 /* 1129 * pmap_bootstrap_valloc: allocate a virtual address in the bootstrap area. 1130 * This function is to be used before any VM system has been set up. 1131 * 1132 * The va is taken from virtual_avail. 1133 */ 1134 static vaddr_t 1135 pmap_bootstrap_valloc(size_t npages) 1136 { 1137 vaddr_t va = virtual_avail; 1138 virtual_avail += npages * PAGE_SIZE; 1139 return va; 1140 } 1141 1142 /* 1143 * pmap_bootstrap_palloc: allocate a physical address in the bootstrap area. 1144 * This function is to be used before any VM system has been set up. 1145 * 1146 * The pa is taken from avail_start. 1147 */ 1148 static paddr_t 1149 pmap_bootstrap_palloc(size_t npages) 1150 { 1151 paddr_t pa = avail_start; 1152 avail_start += npages * PAGE_SIZE; 1153 return pa; 1154 } 1155 1156 /* 1157 * pmap_bootstrap: get the system in a state where it can run with VM properly 1158 * enabled (called before main()). 
The VM system is fully init'd later. 1159 * 1160 * => on i386, locore.S has already enabled the MMU by allocating a PDP for the 1161 * kernel, and nkpde PTP's for the kernel. 1162 * => kva_start is the first free virtual address in kernel space. 1163 */ 1164 void 1165 pmap_bootstrap(vaddr_t kva_start) 1166 { 1167 struct pmap *kpm; 1168 int i; 1169 vaddr_t kva; 1170 1171 pmap_pg_nx = (cpu_feature[2] & CPUID_NOX ? PTE_NX : 0); 1172 1173 /* 1174 * Set up our local static global vars that keep track of the usage of 1175 * KVM before kernel_map is set up. 1176 */ 1177 virtual_avail = kva_start; /* first free KVA */ 1178 virtual_end = VM_MAX_KERNEL_ADDRESS; /* last KVA */ 1179 1180 /* 1181 * Set up protection_codes: we need to be able to convert from a MI 1182 * protection code (some combo of VM_PROT...) to something we can jam 1183 * into a x86 PTE. 1184 */ 1185 protection_codes[VM_PROT_NONE] = pmap_pg_nx; 1186 protection_codes[VM_PROT_EXECUTE] = PTE_X; 1187 protection_codes[VM_PROT_READ] = pmap_pg_nx; 1188 protection_codes[VM_PROT_READ|VM_PROT_EXECUTE] = PTE_X; 1189 protection_codes[VM_PROT_WRITE] = PTE_W | pmap_pg_nx; 1190 protection_codes[VM_PROT_WRITE|VM_PROT_EXECUTE] = PTE_W | PTE_X; 1191 protection_codes[VM_PROT_WRITE|VM_PROT_READ] = PTE_W | pmap_pg_nx; 1192 protection_codes[VM_PROT_ALL] = PTE_W | PTE_X; 1193 1194 /* 1195 * Now we init the kernel's pmap. 1196 * 1197 * The kernel pmap's pm_obj is not used for much. However, in user pmaps 1198 * the pm_obj contains the list of active PTPs. 1199 */ 1200 kpm = pmap_kernel(); 1201 mutex_init(&kpm->pm_lock, MUTEX_DEFAULT, IPL_NONE); 1202 rw_init(&kpm->pm_dummy_lock); 1203 for (i = 0; i < PTP_LEVELS - 1; i++) { 1204 uvm_obj_init(&kpm->pm_obj[i], &pmap_pager, false, 1); 1205 uvm_obj_setlock(&kpm->pm_obj[i], &kpm->pm_dummy_lock); 1206 kpm->pm_ptphint[i] = NULL; 1207 } 1208 memset(&kpm->pm_list, 0, sizeof(kpm->pm_list)); /* pm_list not used */ 1209 1210 kpm->pm_pdir = (pd_entry_t *)bootspace.pdir; 1211 for (i = 0; i < PDP_SIZE; i++) 1212 kpm->pm_pdirpa[i] = PDPpaddr + PAGE_SIZE * i; 1213 1214 kpm->pm_stats.wired_count = kpm->pm_stats.resident_count = 1215 x86_btop(kva_start - VM_MIN_KERNEL_ADDRESS); 1216 1217 kcpuset_create(&kpm->pm_cpus, true); 1218 kcpuset_create(&kpm->pm_kernel_cpus, true); 1219 1220 kpm->pm_ldt = NULL; 1221 kpm->pm_ldt_sel = GSYSSEL(GLDT_SEL, SEL_KPL); 1222 1223 /* 1224 * the above is just a rough estimate and not critical to the proper 1225 * operation of the system. 1226 */ 1227 1228 #if !defined(XENPV) 1229 /* 1230 * Begin to enable global TLB entries if they are supported: add PTE_G 1231 * attribute to already mapped kernel pages. Do that only if SVS is 1232 * disabled. 1233 * 1234 * The G bit has no effect until the CR4_PGE bit is set in CR4, which 1235 * happens later in cpu_init(). 1236 */ 1237 #ifdef SVS 1238 if (!svs_enabled && (cpu_feature[0] & CPUID_PGE)) { 1239 #else 1240 if (cpu_feature[0] & CPUID_PGE) { 1241 #endif 1242 pmap_pg_g = PTE_G; 1243 pmap_remap_global(); 1244 } 1245 #endif 1246 1247 #ifndef XENPV 1248 /* 1249 * Enable large pages if they are supported. 1250 */ 1251 if (cpu_feature[0] & CPUID_PSE) { 1252 lcr4(rcr4() | CR4_PSE); /* enable hardware (via %cr4) */ 1253 pmap_largepages = 1; /* enable software */ 1254 1255 /* 1256 * The TLB must be flushed after enabling large pages on Pentium 1257 * CPUs, according to section 3.6.2.2 of "Intel Architecture 1258 * Software Developer's Manual, Volume 3: System Programming". 1259 */ 1260 tlbflushg(); 1261 1262 /* Remap the kernel. 
*/ 1263 pmap_remap_largepages(); 1264 } 1265 pmap_init_lapic(); 1266 #endif /* !XENPV */ 1267 1268 #ifdef __HAVE_PCPU_AREA 1269 pmap_init_pcpu(); 1270 #endif 1271 1272 #ifdef __HAVE_DIRECT_MAP 1273 pmap_init_directmap(kpm); 1274 #else 1275 pmap_vpage_cpualloc(&cpu_info_primary); 1276 1277 if (VM_MIN_KERNEL_ADDRESS == KERNBASE) { /* i386 */ 1278 early_zerop = (void *)cpu_info_primary.vpage[VPAGE_ZER]; 1279 early_zero_pte = cpu_info_primary.vpage_pte[VPAGE_ZER]; 1280 } else { /* amd64 */ 1281 /* 1282 * zero_pte is stuck at the end of mapped space for the kernel 1283 * image (disjunct from kva space). This is done so that it 1284 * can safely be used in pmap_growkernel (pmap_get_physpage), 1285 * when it's called for the first time. 1286 * XXXfvdl fix this for MULTIPROCESSOR later. 1287 */ 1288 #ifdef XENPV 1289 /* early_zerop initialized in xen_locore() */ 1290 #else 1291 early_zerop = (void *)bootspace.spareva; 1292 #endif 1293 early_zero_pte = PTE_BASE + pl1_i((vaddr_t)early_zerop); 1294 } 1295 #endif 1296 1297 #if defined(XENPV) && defined(__x86_64__) 1298 extern vaddr_t xen_dummy_page; 1299 paddr_t xen_dummy_user_pgd; 1300 1301 /* 1302 * We want a dummy page directory for Xen: when deactivating a pmap, 1303 * Xen will still consider it active. So we set user PGD to this one 1304 * to lift all protection on the now inactive page tables set. 1305 */ 1306 xen_dummy_user_pgd = xen_dummy_page - KERNBASE; 1307 1308 /* Zero fill it, the less checks in Xen it requires the better */ 1309 memset(PAGE_ALIGNED(xen_dummy_user_pgd + KERNBASE), 0, PAGE_SIZE); 1310 /* Mark read-only */ 1311 HYPERVISOR_update_va_mapping(xen_dummy_user_pgd + KERNBASE, 1312 pmap_pa2pte(xen_dummy_user_pgd) | PTE_P | pmap_pg_nx, 1313 UVMF_INVLPG); 1314 /* Pin as L4 */ 1315 xpq_queue_pin_l4_table(xpmap_ptom_masked(xen_dummy_user_pgd)); 1316 #endif 1317 1318 /* 1319 * Allocate space for the IDT, GDT and LDT. 1320 */ 1321 idt_vaddr = pmap_bootstrap_valloc(1); 1322 idt_paddr = pmap_bootstrap_palloc(1); 1323 1324 gdt_vaddr = pmap_bootstrap_valloc(1); 1325 gdt_paddr = pmap_bootstrap_palloc(1); 1326 1327 #ifdef __HAVE_PCPU_AREA 1328 ldt_vaddr = (vaddr_t)&pcpuarea->ldt; 1329 #else 1330 ldt_vaddr = pmap_bootstrap_valloc(1); 1331 #endif 1332 ldt_paddr = pmap_bootstrap_palloc(1); 1333 1334 #if !defined(__x86_64__) 1335 /* pentium f00f bug stuff */ 1336 pentium_idt_vaddr = pmap_bootstrap_valloc(1); 1337 #endif 1338 1339 #if defined(XENPVHVM) 1340 /* XXX: move to hypervisor.c with appropriate API adjustments */ 1341 extern paddr_t HYPERVISOR_shared_info_pa; 1342 extern volatile struct xencons_interface *xencons_interface; /* XXX */ 1343 extern struct xenstore_domain_interface *xenstore_interface; /* XXX */ 1344 1345 if (vm_guest != VM_GUEST_XENPVH) { 1346 HYPERVISOR_shared_info = (void *) pmap_bootstrap_valloc(1); 1347 HYPERVISOR_shared_info_pa = pmap_bootstrap_palloc(1); 1348 } 1349 xencons_interface = (void *) pmap_bootstrap_valloc(1); 1350 xenstore_interface = (void *) pmap_bootstrap_valloc(1); 1351 #endif 1352 /* 1353 * Now we reserve some VM for mapping pages when doing a crash dump. 1354 */ 1355 virtual_avail = reserve_dumppages(virtual_avail); 1356 1357 /* 1358 * Init the global lock and global list. 1359 */ 1360 mutex_init(&pmaps_lock, MUTEX_DEFAULT, IPL_NONE); 1361 LIST_INIT(&pmaps); 1362 1363 /* 1364 * Ensure the TLB is sync'd with reality by flushing it... 1365 */ 1366 tlbflushg(); 1367 1368 /* 1369 * Calculate pmap_maxkvaddr from nkptp[]. 
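	 * Equivalently (illustrative, all values are MD constants):
	 *
	 *	pmap_maxkvaddr = VM_MIN_KERNEL_ADDRESS
	 *	    + nkptp[PTP_LEVELS - 1] * nbpd[PTP_LEVELS - 1]
	 *	    + ... + nkptp[1] * nbpd[1];
	 *
	 * i.e. the KVA currently covered by the preallocated kernel PTPs.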
1370 */ 1371 kva = VM_MIN_KERNEL_ADDRESS; 1372 for (i = PTP_LEVELS - 1; i >= 1; i--) { 1373 kva += nkptp[i] * nbpd[i]; 1374 } 1375 pmap_maxkvaddr = kva; 1376 } 1377 1378 #ifndef XENPV 1379 static void 1380 pmap_init_lapic(void) 1381 { 1382 /* 1383 * On CPUs that have no LAPIC, local_apic_va is never kentered. But our 1384 * x86 implementation relies a lot on this address to be valid; so just 1385 * allocate a fake physical page that will be kentered into 1386 * local_apic_va by machdep. 1387 * 1388 * If the LAPIC is present, the va will be remapped somewhere else 1389 * later in lapic_map. 1390 */ 1391 local_apic_va = pmap_bootstrap_valloc(1); 1392 local_apic_pa = pmap_bootstrap_palloc(1); 1393 } 1394 #endif 1395 1396 #ifdef __x86_64__ 1397 static size_t 1398 pmap_pagetree_nentries_range(vaddr_t startva, vaddr_t endva, size_t pgsz) 1399 { 1400 size_t npages; 1401 npages = (roundup(endva, pgsz) / pgsz) - 1402 (rounddown(startva, pgsz) / pgsz); 1403 return npages; 1404 } 1405 #endif 1406 1407 #if defined(__HAVE_DIRECT_MAP) || defined(KASAN) || defined(KMSAN) 1408 static inline void 1409 slotspace_copy(int type, pd_entry_t *dst, pd_entry_t *src) 1410 { 1411 size_t sslot = slotspace.area[type].sslot; 1412 size_t nslot = slotspace.area[type].nslot; 1413 1414 memcpy(&dst[sslot], &src[sslot], nslot * sizeof(pd_entry_t)); 1415 } 1416 #endif 1417 1418 #ifdef __x86_64__ 1419 /* 1420 * Randomize the location of an area. We count the holes in the VM space. We 1421 * randomly select one hole, and then randomly select an area within that hole. 1422 * Finally we update the associated entry in the slotspace structure. 1423 */ 1424 vaddr_t 1425 slotspace_rand(int type, size_t sz, size_t align, size_t randhole, 1426 vaddr_t randva) 1427 { 1428 struct { 1429 int start; 1430 int end; 1431 } holes[SLSPACE_NAREAS+1]; 1432 size_t i, nholes, hole; 1433 size_t startsl, endsl, nslots, winsize; 1434 vaddr_t startva, va; 1435 1436 sz = roundup(sz, align); 1437 1438 /* 1439 * Take one more slot with +NBPD_L4, because we may end up choosing 1440 * an area that crosses slots: 1441 * +------+------+------+ 1442 * | Slot | Slot | Slot | 1443 * +------+------+------+ 1444 * [Chosen Area] 1445 * And in that case we must take into account the additional slot 1446 * consumed. 1447 */ 1448 nslots = roundup(sz+NBPD_L4, NBPD_L4) / NBPD_L4; 1449 1450 /* Get the holes. */ 1451 nholes = 0; 1452 size_t curslot = 0 + 256; /* end of SLAREA_USER */ 1453 while (1) { 1454 /* 1455 * Find the first occupied slot after the current one. 1456 * The area between the two is a hole. 1457 */ 1458 size_t minsslot = 512; 1459 size_t minnslot = 0; 1460 for (i = 0; i < SLSPACE_NAREAS; i++) { 1461 if (!slotspace.area[i].active) 1462 continue; 1463 if (slotspace.area[i].sslot >= curslot && 1464 slotspace.area[i].sslot < minsslot) { 1465 minsslot = slotspace.area[i].sslot; 1466 minnslot = slotspace.area[i].nslot; 1467 } 1468 } 1469 1470 /* No hole anymore, stop here. */ 1471 if (minsslot == 512) { 1472 break; 1473 } 1474 1475 /* Register the hole. */ 1476 if (minsslot - curslot >= nslots) { 1477 holes[nholes].start = curslot; 1478 holes[nholes].end = minsslot; 1479 nholes++; 1480 } 1481 1482 /* Skip that hole, and iterate again. */ 1483 curslot = minsslot + minnslot; 1484 } 1485 1486 if (nholes == 0) { 1487 panic("%s: impossible", __func__); 1488 } 1489 1490 /* Select a hole. 
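	 * (Illustrative) with, say, nholes == 3 and randhole == 14, the
	 * code below picks hole 14 % 3 == 2; with NO_X86_ASLR the choice
	 * is pinned to hole 0.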
*/ 1491 hole = randhole; 1492 #ifdef NO_X86_ASLR 1493 hole = 0; 1494 #endif 1495 hole %= nholes; 1496 startsl = holes[hole].start; 1497 endsl = holes[hole].end; 1498 startva = VA_SIGN_NEG(startsl * NBPD_L4); 1499 1500 /* Select an area within the hole. */ 1501 va = randva; 1502 #ifdef NO_X86_ASLR 1503 va = 0; 1504 #endif 1505 winsize = ((endsl - startsl) * NBPD_L4) - sz; 1506 va %= winsize; 1507 va = rounddown(va, align); 1508 va += startva; 1509 1510 /* Update the entry. */ 1511 slotspace.area[type].sslot = pl4_i(va); 1512 slotspace.area[type].nslot = 1513 pmap_pagetree_nentries_range(va, va+sz, NBPD_L4); 1514 slotspace.area[type].active = true; 1515 1516 return va; 1517 } 1518 #endif 1519 1520 #ifdef __HAVE_PCPU_AREA 1521 static void 1522 pmap_init_pcpu(void) 1523 { 1524 const vaddr_t startva = PMAP_PCPU_BASE; 1525 size_t nL4e, nL3e, nL2e, nL1e; 1526 size_t L4e_idx, L3e_idx, L2e_idx, L1e_idx __diagused; 1527 paddr_t pa; 1528 vaddr_t endva; 1529 vaddr_t tmpva; 1530 pt_entry_t *pte; 1531 size_t size; 1532 int i; 1533 1534 const pd_entry_t pteflags = PTE_P | PTE_W | pmap_pg_nx; 1535 1536 size = sizeof(struct pcpu_area); 1537 1538 endva = startva + size; 1539 1540 /* We will use this temporary va. */ 1541 tmpva = bootspace.spareva; 1542 pte = PTE_BASE + pl1_i(tmpva); 1543 1544 /* Build L4 */ 1545 L4e_idx = pl4_i(startva); 1546 nL4e = pmap_pagetree_nentries_range(startva, endva, NBPD_L4); 1547 KASSERT(nL4e == 1); 1548 for (i = 0; i < nL4e; i++) { 1549 KASSERT(L4_BASE[L4e_idx+i] == 0); 1550 1551 pa = pmap_bootstrap_palloc(1); 1552 *pte = (pa & PTE_FRAME) | pteflags; 1553 pmap_update_pg(tmpva); 1554 memset(PAGE_ALIGNED(tmpva), 0, PAGE_SIZE); 1555 1556 L4_BASE[L4e_idx+i] = pa | pteflags | PTE_A; 1557 } 1558 1559 /* Build L3 */ 1560 L3e_idx = pl3_i(startva); 1561 nL3e = pmap_pagetree_nentries_range(startva, endva, NBPD_L3); 1562 for (i = 0; i < nL3e; i++) { 1563 KASSERT(L3_BASE[L3e_idx+i] == 0); 1564 1565 pa = pmap_bootstrap_palloc(1); 1566 *pte = (pa & PTE_FRAME) | pteflags; 1567 pmap_update_pg(tmpva); 1568 memset(PAGE_ALIGNED(tmpva), 0, PAGE_SIZE); 1569 1570 L3_BASE[L3e_idx+i] = pa | pteflags | PTE_A; 1571 } 1572 1573 /* Build L2 */ 1574 L2e_idx = pl2_i(startva); 1575 nL2e = pmap_pagetree_nentries_range(startva, endva, NBPD_L2); 1576 for (i = 0; i < nL2e; i++) { 1577 1578 KASSERT(L2_BASE[L2e_idx+i] == 0); 1579 1580 pa = pmap_bootstrap_palloc(1); 1581 *pte = (pa & PTE_FRAME) | pteflags; 1582 pmap_update_pg(tmpva); 1583 memset(PAGE_ALIGNED(tmpva), 0, PAGE_SIZE); 1584 1585 L2_BASE[L2e_idx+i] = pa | pteflags | PTE_A; 1586 } 1587 1588 /* Build L1 */ 1589 L1e_idx = pl1_i(startva); 1590 nL1e = pmap_pagetree_nentries_range(startva, endva, NBPD_L1); 1591 for (i = 0; i < nL1e; i++) { 1592 /* 1593 * Nothing to do, the PTEs will be entered via 1594 * pmap_kenter_pa. 
1595 */ 1596 KASSERT(L1_BASE[L1e_idx+i] == 0); 1597 } 1598 1599 *pte = 0; 1600 pmap_update_pg(tmpva); 1601 1602 pcpuarea = (struct pcpu_area *)startva; 1603 1604 tlbflush(); 1605 } 1606 #endif 1607 1608 #ifdef __HAVE_DIRECT_MAP 1609 static void 1610 randomize_hole(size_t *randholep, vaddr_t *randvap) 1611 { 1612 struct nist_hash_drbg drbg; 1613 uint8_t seed[NIST_HASH_DRBG_SEEDLEN_BYTES]; 1614 const char p[] = "x86/directmap"; 1615 int error; 1616 1617 entropy_extract(seed, sizeof(seed), 0); 1618 1619 error = nist_hash_drbg_instantiate(&drbg, seed, sizeof(seed), 1620 /*nonce*/NULL, 0, 1621 /*personalization*/p, strlen(p)); 1622 KASSERTMSG(error == 0, "error=%d", error); 1623 1624 error = nist_hash_drbg_generate(&drbg, randholep, sizeof(*randholep), 1625 /*additional*/NULL, 0); 1626 KASSERTMSG(error == 0, "error=%d", error); 1627 1628 error = nist_hash_drbg_generate(&drbg, randvap, sizeof(*randvap), 1629 /*additional*/NULL, 0); 1630 KASSERTMSG(error == 0, "error=%d", error); 1631 1632 explicit_memset(seed, 0, sizeof(seed)); 1633 explicit_memset(&drbg, 0, sizeof(drbg)); 1634 } 1635 1636 /* 1637 * Create the amd64 direct map. Called only once at boot time. We map all of 1638 * the physical memory contiguously using 2MB large pages, with RW permissions. 1639 * However there is a hole: the kernel is mapped with RO permissions. 1640 */ 1641 static void 1642 pmap_init_directmap(struct pmap *kpm) 1643 { 1644 extern phys_ram_seg_t mem_clusters[]; 1645 extern int mem_cluster_cnt; 1646 1647 vaddr_t startva; 1648 size_t nL4e, nL3e, nL2e; 1649 size_t L4e_idx, L3e_idx, L2e_idx; 1650 size_t spahole, epahole; 1651 paddr_t lastpa, pa; 1652 vaddr_t endva; 1653 vaddr_t tmpva; 1654 pt_entry_t *pte; 1655 phys_ram_seg_t *mc; 1656 int i; 1657 size_t randhole; 1658 vaddr_t randva; 1659 1660 const pd_entry_t pteflags = PTE_P | PTE_W | pmap_pg_nx; 1661 const pd_entry_t holepteflags = PTE_P | pmap_pg_nx; 1662 1663 CTASSERT(NL4_SLOT_DIRECT * NBPD_L4 == MAXPHYSMEM); 1664 1665 spahole = roundup(bootspace.head.pa, NBPD_L2); 1666 epahole = rounddown(bootspace.boot.pa, NBPD_L2); 1667 1668 /* Get the last physical address available */ 1669 lastpa = 0; 1670 for (i = 0; i < mem_cluster_cnt; i++) { 1671 mc = &mem_clusters[i]; 1672 lastpa = MAX(lastpa, mc->start + mc->size); 1673 } 1674 1675 /* 1676 * x86_add_cluster should have truncated the memory to MAXPHYSMEM. 1677 */ 1678 if (lastpa > MAXPHYSMEM) { 1679 panic("pmap_init_directmap: lastpa incorrect"); 1680 } 1681 1682 randomize_hole(&randhole, &randva); 1683 startva = slotspace_rand(SLAREA_DMAP, lastpa, NBPD_L2, 1684 randhole, randva); 1685 endva = startva + lastpa; 1686 1687 /* We will use this temporary va. 
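	 * (Illustrative) each freshly allocated page-table page is zeroed
	 * through this scratch mapping before being hooked into the tree,
	 * exactly as the loops below do:
	 *
	 *	*pte = (pa & PTE_FRAME) | pteflags;
	 *	pmap_update_pg(tmpva);
	 *	memset(PAGE_ALIGNED(tmpva), 0, PAGE_SIZE);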
*/ 1688 tmpva = bootspace.spareva; 1689 pte = PTE_BASE + pl1_i(tmpva); 1690 1691 /* Build L4 */ 1692 L4e_idx = pl4_i(startva); 1693 nL4e = pmap_pagetree_nentries_range(startva, endva, NBPD_L4); 1694 KASSERT(nL4e <= NL4_SLOT_DIRECT); 1695 for (i = 0; i < nL4e; i++) { 1696 KASSERT(L4_BASE[L4e_idx+i] == 0); 1697 1698 pa = pmap_bootstrap_palloc(1); 1699 *pte = (pa & PTE_FRAME) | pteflags; 1700 pmap_update_pg(tmpva); 1701 memset(PAGE_ALIGNED(tmpva), 0, PAGE_SIZE); 1702 1703 L4_BASE[L4e_idx+i] = pa | pteflags | PTE_A; 1704 } 1705 1706 /* Build L3 */ 1707 L3e_idx = pl3_i(startva); 1708 nL3e = pmap_pagetree_nentries_range(startva, endva, NBPD_L3); 1709 for (i = 0; i < nL3e; i++) { 1710 KASSERT(L3_BASE[L3e_idx+i] == 0); 1711 1712 pa = pmap_bootstrap_palloc(1); 1713 *pte = (pa & PTE_FRAME) | pteflags; 1714 pmap_update_pg(tmpva); 1715 memset(PAGE_ALIGNED(tmpva), 0, PAGE_SIZE); 1716 1717 L3_BASE[L3e_idx+i] = pa | pteflags | PTE_A; 1718 } 1719 1720 /* Build L2 */ 1721 L2e_idx = pl2_i(startva); 1722 nL2e = pmap_pagetree_nentries_range(startva, endva, NBPD_L2); 1723 for (i = 0; i < nL2e; i++) { 1724 KASSERT(L2_BASE[L2e_idx+i] == 0); 1725 1726 pa = (paddr_t)(i * NBPD_L2); 1727 1728 if (spahole <= pa && pa < epahole) { 1729 L2_BASE[L2e_idx+i] = pa | holepteflags | PTE_A | 1730 PTE_PS | pmap_pg_g; 1731 } else { 1732 L2_BASE[L2e_idx+i] = pa | pteflags | PTE_A | 1733 PTE_PS | pmap_pg_g; 1734 } 1735 } 1736 1737 *pte = 0; 1738 pmap_update_pg(tmpva); 1739 1740 pmap_direct_base = startva; 1741 pmap_direct_end = endva; 1742 1743 tlbflush(); 1744 } 1745 #endif /* __HAVE_DIRECT_MAP */ 1746 1747 #if !defined(XENPV) 1748 /* 1749 * Remap all of the virtual pages created so far with the PTE_G bit. 1750 */ 1751 static void 1752 pmap_remap_global(void) 1753 { 1754 vaddr_t kva, kva_end; 1755 unsigned long p1i; 1756 size_t i; 1757 1758 /* head */ 1759 kva = bootspace.head.va; 1760 kva_end = kva + bootspace.head.sz; 1761 for ( ; kva < kva_end; kva += PAGE_SIZE) { 1762 p1i = pl1_i(kva); 1763 if (pmap_valid_entry(PTE_BASE[p1i])) 1764 PTE_BASE[p1i] |= pmap_pg_g; 1765 } 1766 1767 /* kernel segments */ 1768 for (i = 0; i < BTSPACE_NSEGS; i++) { 1769 if (bootspace.segs[i].type == BTSEG_NONE) { 1770 continue; 1771 } 1772 kva = bootspace.segs[i].va; 1773 kva_end = kva + bootspace.segs[i].sz; 1774 for ( ; kva < kva_end; kva += PAGE_SIZE) { 1775 p1i = pl1_i(kva); 1776 if (pmap_valid_entry(PTE_BASE[p1i])) 1777 PTE_BASE[p1i] |= pmap_pg_g; 1778 } 1779 } 1780 1781 /* boot space */ 1782 kva = bootspace.boot.va; 1783 kva_end = kva + bootspace.boot.sz; 1784 for ( ; kva < kva_end; kva += PAGE_SIZE) { 1785 p1i = pl1_i(kva); 1786 if (pmap_valid_entry(PTE_BASE[p1i])) 1787 PTE_BASE[p1i] |= pmap_pg_g; 1788 } 1789 } 1790 #endif 1791 1792 #ifndef XENPV 1793 /* 1794 * Remap several kernel segments with large pages. We cover as many pages as we 1795 * can. Called only once at boot time, if the CPU supports large pages. 1796 */ 1797 static void 1798 pmap_remap_largepages(void) 1799 { 1800 pd_entry_t *pde; 1801 vaddr_t kva, kva_end; 1802 paddr_t pa; 1803 size_t i; 1804 1805 /* Remap the kernel text using large pages. 
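	 * Only the NBPD_L2-aligned middle of each segment is remapped: the
	 * start is rounded up and the end rounded down, so (illustrative
	 * numbers) a text segment that begins 1MB into a 2MB frame keeps
	 * its unaligned head and tail as 4KB mappings.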
*/ 1806 for (i = 0; i < BTSPACE_NSEGS; i++) { 1807 if (bootspace.segs[i].type != BTSEG_TEXT) { 1808 continue; 1809 } 1810 kva = roundup(bootspace.segs[i].va, NBPD_L2); 1811 if (kva < bootspace.segs[i].va) { 1812 continue; 1813 } 1814 kva_end = rounddown(bootspace.segs[i].va + 1815 bootspace.segs[i].sz, NBPD_L2); 1816 pa = roundup(bootspace.segs[i].pa, NBPD_L2); 1817 for (/* */; kva < kva_end; kva += NBPD_L2, pa += NBPD_L2) { 1818 pde = &L2_BASE[pl2_i(kva)]; 1819 *pde = pa | pmap_pg_g | PTE_PS | PTE_P; 1820 tlbflushg(); 1821 } 1822 } 1823 1824 /* Remap the kernel rodata using large pages. */ 1825 for (i = 0; i < BTSPACE_NSEGS; i++) { 1826 if (bootspace.segs[i].type != BTSEG_RODATA) { 1827 continue; 1828 } 1829 kva = roundup(bootspace.segs[i].va, NBPD_L2); 1830 if (kva < bootspace.segs[i].va) { 1831 continue; 1832 } 1833 kva_end = rounddown(bootspace.segs[i].va + 1834 bootspace.segs[i].sz, NBPD_L2); 1835 pa = roundup(bootspace.segs[i].pa, NBPD_L2); 1836 for (/* */; kva < kva_end; kva += NBPD_L2, pa += NBPD_L2) { 1837 pde = &L2_BASE[pl2_i(kva)]; 1838 *pde = pa | pmap_pg_g | PTE_PS | pmap_pg_nx | PTE_P; 1839 tlbflushg(); 1840 } 1841 } 1842 1843 /* Remap the kernel data+bss using large pages. */ 1844 for (i = 0; i < BTSPACE_NSEGS; i++) { 1845 if (bootspace.segs[i].type != BTSEG_DATA) { 1846 continue; 1847 } 1848 kva = roundup(bootspace.segs[i].va, NBPD_L2); 1849 if (kva < bootspace.segs[i].va) { 1850 continue; 1851 } 1852 kva_end = rounddown(bootspace.segs[i].va + 1853 bootspace.segs[i].sz, NBPD_L2); 1854 pa = roundup(bootspace.segs[i].pa, NBPD_L2); 1855 for (/* */; kva < kva_end; kva += NBPD_L2, pa += NBPD_L2) { 1856 pde = &L2_BASE[pl2_i(kva)]; 1857 *pde = pa | pmap_pg_g | PTE_PS | pmap_pg_nx | PTE_W | PTE_P; 1858 tlbflushg(); 1859 } 1860 } 1861 } 1862 #endif /* !XENPV */ 1863 1864 /* 1865 * pmap_init: called from uvm_init, our job is to get the pmap system ready 1866 * to manage mappings. 1867 */ 1868 void 1869 pmap_init(void) 1870 { 1871 int flags; 1872 1873 /* 1874 * initialize caches. 1875 */ 1876 1877 pool_cache_bootstrap(&pmap_cache, sizeof(struct pmap), COHERENCY_UNIT, 1878 0, 0, "pmappl", NULL, IPL_NONE, pmap_ctor, pmap_dtor, NULL); 1879 1880 #ifdef XENPV 1881 /* 1882 * pool_cache(9) should not touch cached objects, since they 1883 * are pinned on xen and R/O for the domU 1884 */ 1885 flags = PR_NOTOUCH; 1886 #else 1887 flags = 0; 1888 #endif 1889 1890 #ifdef PAE 1891 pool_init(&pmap_pdp_pool, PAGE_SIZE * PDP_SIZE, 0, 0, flags, 1892 "pdppl", &pmap_pdp_allocator, IPL_NONE); 1893 #else 1894 pool_init(&pmap_pdp_pool, PAGE_SIZE, 0, 0, flags, 1895 "pdppl", NULL, IPL_NONE); 1896 #endif 1897 pool_cache_bootstrap(&pmap_pvp_cache, PAGE_SIZE, PAGE_SIZE, 1898 0, 0, "pvpage", &pool_allocator_kmem, 1899 IPL_NONE, pmap_pvp_ctor, pmap_pvp_dtor, NULL); 1900 1901 pmap_tlb_init(); 1902 1903 /* XXX: Since cpu_hatch() is only for secondary CPUs. */ 1904 pmap_tlb_cpu_init(curcpu()); 1905 1906 evcnt_attach_dynamic(&pmap_iobmp_evcnt, EVCNT_TYPE_MISC, 1907 NULL, "x86", "io bitmap copy"); 1908 evcnt_attach_dynamic(&pmap_ldt_evcnt, EVCNT_TYPE_MISC, 1909 NULL, "x86", "ldt sync"); 1910 1911 /* 1912 * The kernel doesn't keep track of PTPs, so there's nowhere handy 1913 * to hang a tree of pv_entry records. Dynamically allocated 1914 * pv_entry lists are not heavily used in the kernel's pmap (the 1915 * usual case is embedded), so cop out and use a single RB tree 1916 * to cover them. 
1917 */ 1918 rb_tree_init(&pmap_kernel_rb, &pmap_rbtree_ops); 1919 1920 /* 1921 * done: pmap module is up (and ready for business) 1922 */ 1923 1924 pmap_initialized = true; 1925 } 1926 1927 #ifndef XENPV 1928 /* 1929 * pmap_cpu_init_late: perform late per-CPU initialization. 1930 */ 1931 void 1932 pmap_cpu_init_late(struct cpu_info *ci) 1933 { 1934 /* 1935 * The BP already has its own PD page allocated during early 1936 * MD startup. 1937 */ 1938 if (ci == &cpu_info_primary) 1939 return; 1940 #ifdef PAE 1941 cpu_alloc_l3_page(ci); 1942 #endif 1943 } 1944 #endif 1945 1946 #ifndef __HAVE_DIRECT_MAP 1947 CTASSERT(CACHE_LINE_SIZE > sizeof(pt_entry_t)); 1948 CTASSERT(CACHE_LINE_SIZE % sizeof(pt_entry_t) == 0); 1949 1950 static void 1951 pmap_vpage_cpualloc(struct cpu_info *ci) 1952 { 1953 bool primary = (ci == &cpu_info_primary); 1954 size_t i, npages; 1955 vaddr_t vabase; 1956 vsize_t vrange; 1957 1958 npages = (CACHE_LINE_SIZE / sizeof(pt_entry_t)); 1959 KASSERT(npages >= VPAGE_MAX); 1960 vrange = npages * PAGE_SIZE; 1961 1962 if (primary) { 1963 while ((vabase = pmap_bootstrap_valloc(1)) % vrange != 0) { 1964 /* Waste some pages to align properly */ 1965 } 1966 /* The base is aligned, allocate the rest (contiguous) */ 1967 pmap_bootstrap_valloc(npages - 1); 1968 } else { 1969 vabase = uvm_km_alloc(kernel_map, vrange, vrange, 1970 UVM_KMF_VAONLY); 1971 if (vabase == 0) { 1972 panic("%s: failed to allocate tmp VA for CPU %d\n", 1973 __func__, cpu_index(ci)); 1974 } 1975 } 1976 1977 KASSERT((vaddr_t)&PTE_BASE[pl1_i(vabase)] % CACHE_LINE_SIZE == 0); 1978 1979 for (i = 0; i < VPAGE_MAX; i++) { 1980 ci->vpage[i] = vabase + i * PAGE_SIZE; 1981 ci->vpage_pte[i] = PTE_BASE + pl1_i(ci->vpage[i]); 1982 } 1983 } 1984 1985 void 1986 pmap_vpage_cpu_init(struct cpu_info *ci) 1987 { 1988 if (ci == &cpu_info_primary) { 1989 /* cpu0 already taken care of in pmap_bootstrap */ 1990 return; 1991 } 1992 1993 pmap_vpage_cpualloc(ci); 1994 } 1995 #endif 1996 1997 /* 1998 * p v _ e n t r y f u n c t i o n s 1999 */ 2000 2001 /* 2002 * pmap_pvp_ctor: pool_cache constructor for PV pages. 2003 */ 2004 static int 2005 pmap_pvp_ctor(void *arg, void *obj, int flags) 2006 { 2007 struct pv_page *pvp = (struct pv_page *)obj; 2008 struct pv_entry *pve = (struct pv_entry *)obj + 1; 2009 struct pv_entry *maxpve = pve + PVE_PER_PVP; 2010 2011 KASSERT(sizeof(struct pv_page) <= sizeof(struct pv_entry)); 2012 KASSERT(trunc_page((vaddr_t)obj) == (vaddr_t)obj); 2013 2014 LIST_INIT(&pvp->pvp_pves); 2015 pvp->pvp_nfree = PVE_PER_PVP; 2016 pvp->pvp_pmap = NULL; 2017 2018 for (; pve < maxpve; pve++) { 2019 LIST_INSERT_HEAD(&pvp->pvp_pves, pve, pve_list); 2020 } 2021 2022 return 0; 2023 } 2024 2025 /* 2026 * pmap_pvp_dtor: pool_cache destructor for PV pages. 2027 */ 2028 static void 2029 pmap_pvp_dtor(void *arg, void *obj) 2030 { 2031 struct pv_page *pvp __diagused = obj; 2032 2033 KASSERT(pvp->pvp_pmap == NULL); 2034 KASSERT(pvp->pvp_nfree == PVE_PER_PVP); 2035 } 2036 2037 /* 2038 * pmap_alloc_pv: allocate a PV entry (likely cached with pmap).
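 * PV entries are carved out of page-sized pv_page blocks owned by the
 * pmap.  Each block sits on one of three lists according to how many
 * of its entries are free, and allocation prefers a partially used
 * block before breaking into a completely free one or asking the
 * global pool_cache for another page.  Roughly (list names as used
 * below):
 *
 *	pm_pvp_full:  every entry in the block is free (idle block)
 *	pm_pvp_part:  some entries free (preferred source)
 *	pm_pvp_empty: no entries free
 *
 * pmap_free_pv() moves blocks back in the other direction, and
 * pmap_drain_pv() returns fully idle blocks to the pool_cache.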
2039 */ 2040 static struct pv_entry * 2041 pmap_alloc_pv(struct pmap *pmap) 2042 { 2043 struct pv_entry *pve; 2044 struct pv_page *pvp; 2045 2046 KASSERT(mutex_owned(&pmap->pm_lock)); 2047 2048 if (__predict_false((pvp = LIST_FIRST(&pmap->pm_pvp_part)) == NULL)) { 2049 if ((pvp = LIST_FIRST(&pmap->pm_pvp_full)) != NULL) { 2050 LIST_REMOVE(pvp, pvp_list); 2051 } else { 2052 pvp = pool_cache_get(&pmap_pvp_cache, PR_NOWAIT); 2053 } 2054 if (__predict_false(pvp == NULL)) { 2055 return NULL; 2056 } 2057 /* full -> part */ 2058 LIST_INSERT_HEAD(&pmap->pm_pvp_part, pvp, pvp_list); 2059 pvp->pvp_pmap = pmap; 2060 } 2061 2062 KASSERT(pvp->pvp_pmap == pmap); 2063 KASSERT(pvp->pvp_nfree > 0); 2064 2065 pve = LIST_FIRST(&pvp->pvp_pves); 2066 LIST_REMOVE(pve, pve_list); 2067 pvp->pvp_nfree--; 2068 2069 if (__predict_false(pvp->pvp_nfree == 0)) { 2070 /* part -> empty */ 2071 KASSERT(LIST_EMPTY(&pvp->pvp_pves)); 2072 LIST_REMOVE(pvp, pvp_list); 2073 LIST_INSERT_HEAD(&pmap->pm_pvp_empty, pvp, pvp_list); 2074 } else { 2075 KASSERT(!LIST_EMPTY(&pvp->pvp_pves)); 2076 } 2077 2078 return pve; 2079 } 2080 2081 /* 2082 * pmap_free_pv: delayed free of a PV entry. 2083 */ 2084 static void 2085 pmap_free_pv(struct pmap *pmap, struct pv_entry *pve) 2086 { 2087 struct pv_page *pvp = (struct pv_page *)trunc_page((vaddr_t)pve); 2088 2089 KASSERT(mutex_owned(&pmap->pm_lock)); 2090 KASSERT(pvp->pvp_pmap == pmap); 2091 KASSERT(pvp->pvp_nfree >= 0); 2092 2093 LIST_INSERT_HEAD(&pvp->pvp_pves, pve, pve_list); 2094 pvp->pvp_nfree++; 2095 2096 if (__predict_false(pvp->pvp_nfree == 1)) { 2097 /* empty -> part */ 2098 LIST_REMOVE(pvp, pvp_list); 2099 LIST_INSERT_HEAD(&pmap->pm_pvp_part, pvp, pvp_list); 2100 } else if (__predict_false(pvp->pvp_nfree == PVE_PER_PVP)) { 2101 /* part -> full */ 2102 LIST_REMOVE(pvp, pvp_list); 2103 LIST_INSERT_HEAD(&pmap->pm_pvp_full, pvp, pvp_list); 2104 } 2105 } 2106 2107 /* 2108 * pmap_drain_pv: free full PV pages. 
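 * "Full" here means full of free entries: every pv_entry in the page
 * is unused, so the whole page can be handed back to the pool_cache.
 * Called with the pmap locked, typically after a bulk removal.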
2109 */ 2110 static void 2111 pmap_drain_pv(struct pmap *pmap) 2112 { 2113 struct pv_page *pvp; 2114 2115 KASSERT(mutex_owned(&pmap->pm_lock)); 2116 2117 while ((pvp = LIST_FIRST(&pmap->pm_pvp_full)) != NULL) { 2118 LIST_REMOVE(pvp, pvp_list); 2119 KASSERT(pvp->pvp_pmap == pmap); 2120 KASSERT(pvp->pvp_nfree == PVE_PER_PVP); 2121 pvp->pvp_pmap = NULL; 2122 pool_cache_put(&pmap_pvp_cache, pvp); 2123 } 2124 } 2125 2126 /* 2127 * pmap_check_pv: verify {VA, PTP} pair is either tracked/untracked by page 2128 */ 2129 static void 2130 pmap_check_pv(struct pmap *pmap, struct vm_page *ptp, struct pmap_page *pp, 2131 vaddr_t va, bool tracked) 2132 { 2133 #ifdef DEBUG 2134 struct pv_pte *pvpte; 2135 2136 PMAP_CHECK_PP(pp); 2137 2138 mutex_spin_enter(&pp->pp_lock); 2139 for (pvpte = pv_pte_first(pp); pvpte; pvpte = pv_pte_next(pp, pvpte)) { 2140 if (pvpte->pte_ptp == ptp && pvpte->pte_va == va) { 2141 break; 2142 } 2143 } 2144 mutex_spin_exit(&pp->pp_lock); 2145 2146 if (pvpte && !tracked) { 2147 panic("pmap_check_pv: %p/%lx found on pp %p", ptp, va, pp); 2148 } else if (!pvpte && tracked) { 2149 panic("pmap_check_pv: %p/%lx missing on pp %p", ptp, va, pp); 2150 } 2151 #endif 2152 } 2153 2154 /* 2155 * pmap_treelookup_pv: search the PV tree for a dynamic entry 2156 * 2157 * => pmap must be locked 2158 */ 2159 static struct pv_entry * 2160 pmap_treelookup_pv(const struct pmap *pmap, const struct vm_page *ptp, 2161 const rb_tree_t *tree, const vaddr_t va) 2162 { 2163 struct pv_entry *pve; 2164 rb_node_t *node; 2165 2166 /* 2167 * Inlined lookup tailored for exactly what's needed here that is 2168 * quite a bit faster than using rb_tree_find_node(). 2169 */ 2170 for (node = tree->rbt_root;;) { 2171 if (__predict_false(RB_SENTINEL_P(node))) { 2172 return NULL; 2173 } 2174 pve = (struct pv_entry *) 2175 ((uintptr_t)node - offsetof(struct pv_entry, pve_rb)); 2176 if (pve->pve_pte.pte_va == va) { 2177 KASSERT(pve->pve_pte.pte_ptp == ptp); 2178 return pve; 2179 } 2180 node = node->rb_nodes[pve->pve_pte.pte_va < va]; 2181 } 2182 } 2183 2184 /* 2185 * pmap_lookup_pv: look up a non-embedded pv entry for the given pmap 2186 * 2187 * => a PV entry must be known present (doesn't check for existence) 2188 * => pmap must be locked 2189 */ 2190 static struct pv_entry * 2191 pmap_lookup_pv(const struct pmap *pmap, const struct vm_page *ptp, 2192 const struct pmap_page * const old_pp, const vaddr_t va) 2193 { 2194 struct pv_entry *pve; 2195 const rb_tree_t *tree; 2196 2197 KASSERT(mutex_owned(&pmap->pm_lock)); 2198 KASSERT(ptp != NULL || pmap == pmap_kernel()); 2199 2200 /* 2201 * [This mostly deals with the case of process-private pages, i.e. 2202 * anonymous memory allocations or COW.] 2203 * 2204 * If the page is tracked with an embedded entry then the tree 2205 * lookup can be avoided. It's safe to check for this specific 2206 * set of values without pp_lock because both will only ever be 2207 * set together for this pmap. 2208 * 2209 */ 2210 if (atomic_load_relaxed(&old_pp->pp_pte.pte_ptp) == ptp && 2211 atomic_load_relaxed(&old_pp->pp_pte.pte_va) == va) { 2212 return NULL; 2213 } 2214 2215 /* 2216 * [This mostly deals with shared mappings, for example shared libs 2217 * and executables.] 2218 * 2219 * Optimise for pmap_remove_ptes() which works by ascending scan: 2220 * look at the lowest numbered node in the tree first. The tree is 2221 * known non-empty because of the check above. For short lived 2222 * processes where pmap_remove() isn't used much this gets close to 2223 * a 100% hit rate. 
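 * (rbt_minmax[RB_DIR_LEFT] is the leftmost, i.e. lowest-VA, node and
 * is kept up to date by the rbtree(3) code, so this peek costs no
 * tree descent.)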
2224 */ 2225 tree = (ptp != NULL ? &VM_PAGE_TO_PP(ptp)->pp_rb : &pmap_kernel_rb); 2226 KASSERT(!RB_SENTINEL_P(tree->rbt_root)); 2227 pve = (struct pv_entry *) 2228 ((uintptr_t)tree->rbt_minmax[RB_DIR_LEFT] - 2229 offsetof(struct pv_entry, pve_rb)); 2230 if (__predict_true(pve->pve_pte.pte_va == va)) { 2231 KASSERT(pve->pve_pte.pte_ptp == ptp); 2232 return pve; 2233 } 2234 2235 /* Search the RB tree for the key (uncommon). */ 2236 return pmap_treelookup_pv(pmap, ptp, tree, va); 2237 } 2238 2239 /* 2240 * pmap_enter_pv: enter a mapping onto a pmap_page lst 2241 * 2242 * => pmap must be locked 2243 * => does NOT insert dynamic entries to tree (pmap_enter() does later) 2244 */ 2245 static int 2246 pmap_enter_pv(struct pmap *pmap, struct pmap_page *pp, struct vm_page *ptp, 2247 vaddr_t va, struct pv_entry **new_pve, struct pv_entry **old_pve, 2248 bool *samepage, bool *new_embedded, rb_tree_t *tree) 2249 { 2250 struct pv_entry *pve; 2251 int error; 2252 2253 KASSERT(mutex_owned(&pmap->pm_lock)); 2254 KASSERT(ptp_to_pmap(ptp) == pmap); 2255 KASSERT(ptp == NULL || ptp->uobject != NULL); 2256 KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset); 2257 PMAP_CHECK_PP(pp); 2258 2259 /* 2260 * If entering the same page and it's already tracked with an 2261 * embedded entry, we can avoid the expense below. It's safe 2262 * to check for this very specific set of values without a lock 2263 * because both will only ever be set together for this pmap. 2264 */ 2265 if (atomic_load_relaxed(&pp->pp_pte.pte_ptp) == ptp && 2266 atomic_load_relaxed(&pp->pp_pte.pte_va) == va) { 2267 *samepage = true; 2268 pmap_check_pv(pmap, ptp, pp, va, true); 2269 return 0; 2270 } 2271 2272 /* 2273 * Check for an existing dynamic mapping at this address. If it's 2274 * for the same page, then it will be reused and nothing needs to be 2275 * changed. 2276 */ 2277 *old_pve = pmap_treelookup_pv(pmap, ptp, tree, va); 2278 if (*old_pve != NULL && (*old_pve)->pve_pp == pp) { 2279 *samepage = true; 2280 pmap_check_pv(pmap, ptp, pp, va, true); 2281 return 0; 2282 } 2283 2284 /* 2285 * Need to put a new mapping in place. Grab a spare pv_entry in 2286 * case it's needed; won't know for sure until the lock is taken. 2287 */ 2288 if (pmap->pm_pve == NULL) { 2289 pmap->pm_pve = pmap_alloc_pv(pmap); 2290 } 2291 2292 error = 0; 2293 pmap_check_pv(pmap, ptp, pp, va, false); 2294 mutex_spin_enter(&pp->pp_lock); 2295 if (!pv_pte_embedded(pp)) { 2296 /* 2297 * Embedded PV tracking available - easy. 2298 */ 2299 pp->pp_pte.pte_ptp = ptp; 2300 pp->pp_pte.pte_va = va; 2301 *new_embedded = true; 2302 } else if (__predict_false(pmap->pm_pve == NULL)) { 2303 /* 2304 * No memory. 2305 */ 2306 error = ENOMEM; 2307 } else { 2308 /* 2309 * Install new pv_entry on the page. 
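 * The entry was preallocated above, before pp_lock was taken; it is
 * consumed here and handed back via *new_pve, and pmap_enter() later
 * inserts it into the pmap's rb tree once the new PTE is actually in
 * place.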
2310 */ 2311 pve = pmap->pm_pve; 2312 pmap->pm_pve = NULL; 2313 *new_pve = pve; 2314 pve->pve_pte.pte_ptp = ptp; 2315 pve->pve_pte.pte_va = va; 2316 pve->pve_pp = pp; 2317 LIST_INSERT_HEAD(&pp->pp_pvlist, pve, pve_list); 2318 } 2319 mutex_spin_exit(&pp->pp_lock); 2320 if (error == 0) { 2321 pmap_check_pv(pmap, ptp, pp, va, true); 2322 } 2323 2324 return error; 2325 } 2326 2327 /* 2328 * pmap_remove_pv: try to remove a mapping from a pv_list 2329 * 2330 * => pmap must be locked 2331 * => removes dynamic entries from tree and frees them 2332 * => caller should adjust ptp's wire_count and free PTP if needed 2333 */ 2334 static void 2335 pmap_remove_pv(struct pmap *pmap, struct pmap_page *pp, struct vm_page *ptp, 2336 vaddr_t va, struct pv_entry *pve, uint8_t oattrs) 2337 { 2338 rb_tree_t *tree = (ptp != NULL ? 2339 &VM_PAGE_TO_PP(ptp)->pp_rb : &pmap_kernel_rb); 2340 2341 KASSERT(mutex_owned(&pmap->pm_lock)); 2342 KASSERT(ptp_to_pmap(ptp) == pmap); 2343 KASSERT(ptp == NULL || ptp->uobject != NULL); 2344 KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset); 2345 KASSERT(ptp != NULL || pmap == pmap_kernel()); 2346 2347 pmap_check_pv(pmap, ptp, pp, va, true); 2348 2349 if (pve == NULL) { 2350 mutex_spin_enter(&pp->pp_lock); 2351 KASSERT(pp->pp_pte.pte_ptp == ptp); 2352 KASSERT(pp->pp_pte.pte_va == va); 2353 pp->pp_attrs |= oattrs; 2354 pp->pp_pte.pte_ptp = NULL; 2355 pp->pp_pte.pte_va = 0; 2356 mutex_spin_exit(&pp->pp_lock); 2357 } else { 2358 mutex_spin_enter(&pp->pp_lock); 2359 KASSERT(pp->pp_pte.pte_ptp != ptp || 2360 pp->pp_pte.pte_va != va); 2361 KASSERT(pve->pve_pte.pte_ptp == ptp); 2362 KASSERT(pve->pve_pte.pte_va == va); 2363 KASSERT(pve->pve_pp == pp); 2364 pp->pp_attrs |= oattrs; 2365 LIST_REMOVE(pve, pve_list); 2366 mutex_spin_exit(&pp->pp_lock); 2367 2368 KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == pve); 2369 rb_tree_remove_node(tree, pve); 2370 #ifdef DIAGNOSTIC 2371 memset(pve, 0, sizeof(*pve)); 2372 #endif 2373 pmap_free_pv(pmap, pve); 2374 } 2375 2376 KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == NULL); 2377 pmap_check_pv(pmap, ptp, pp, va, false); 2378 } 2379 2380 /* 2381 * p t p f u n c t i o n s 2382 */ 2383 2384 static struct vm_page * 2385 pmap_find_ptp(struct pmap *pmap, vaddr_t va, int level) 2386 { 2387 int lidx = level - 1; 2388 off_t off = ptp_va2o(va, level); 2389 struct vm_page *pg; 2390 2391 KASSERT(mutex_owned(&pmap->pm_lock)); 2392 2393 if (pmap->pm_ptphint[lidx] && off == pmap->pm_ptphint[lidx]->offset) { 2394 KASSERT(pmap->pm_ptphint[lidx]->wire_count > 0); 2395 pg = pmap->pm_ptphint[lidx]; 2396 PMAP_CHECK_PP(VM_PAGE_TO_PP(pg)); 2397 return pg; 2398 } 2399 PMAP_DUMMY_LOCK(pmap); 2400 pg = uvm_pagelookup(&pmap->pm_obj[lidx], off); 2401 PMAP_DUMMY_UNLOCK(pmap); 2402 if (pg != NULL && __predict_false(pg->wire_count == 0)) { 2403 /* This page is queued to be freed - ignore. */ 2404 pg = NULL; 2405 } 2406 if (pg != NULL) { 2407 PMAP_CHECK_PP(VM_PAGE_TO_PP(pg)); 2408 } 2409 pmap->pm_ptphint[lidx] = pg; 2410 return pg; 2411 } 2412 2413 static inline void 2414 pmap_freepage(struct pmap *pmap, struct vm_page *ptp, int level) 2415 { 2416 int lidx; 2417 2418 KASSERT(ptp->wire_count <= 1); 2419 PMAP_CHECK_PP(VM_PAGE_TO_PP(ptp)); 2420 2421 lidx = level - 1; 2422 pmap_stats_update(pmap, -ptp->wire_count, 0); 2423 if (pmap->pm_ptphint[lidx] == ptp) 2424 pmap->pm_ptphint[lidx] = NULL; 2425 ptp->wire_count = 0; 2426 ptp->uanon = NULL; 2427 KASSERT(RB_TREE_MIN(&VM_PAGE_TO_PP(ptp)->pp_rb) == NULL); 2428 2429 /* 2430 * Enqueue the PTP to be freed by pmap_update(). 
We can't remove 2431 * the page from the uvm_object, as that can take further locks 2432 * (intolerable right now because the PTEs are likely mapped in). 2433 * Instead mark the PTP as free and if we bump into it again, we'll 2434 * either ignore or reuse (depending on what's useful at the time). 2435 */ 2436 LIST_INSERT_HEAD(&pmap->pm_gc_ptp, ptp, mdpage.mp_pp.pp_link); 2437 } 2438 2439 static void 2440 pmap_free_ptp(struct pmap *pmap, struct vm_page *ptp, vaddr_t va, 2441 pt_entry_t *ptes, pd_entry_t * const *pdes) 2442 { 2443 unsigned long index; 2444 int level; 2445 vaddr_t invaladdr; 2446 pd_entry_t opde; 2447 2448 KASSERT(pmap != pmap_kernel()); 2449 KASSERT(mutex_owned(&pmap->pm_lock)); 2450 KASSERT(kpreempt_disabled()); 2451 2452 level = 1; 2453 do { 2454 index = pl_i(va, level + 1); 2455 opde = pmap_pte_testset(&pdes[level - 1][index], 0); 2456 2457 /* 2458 * On Xen-amd64 or SVS, we need to sync the top level page 2459 * directory on each CPU. 2460 */ 2461 #if defined(XENPV) && defined(__x86_64__) 2462 if (level == PTP_LEVELS - 1) { 2463 xen_kpm_sync(pmap, index); 2464 } 2465 #elif defined(SVS) 2466 if (svs_enabled && level == PTP_LEVELS - 1) { 2467 svs_pmap_sync(pmap, index); 2468 } 2469 #endif 2470 2471 invaladdr = level == 1 ? (vaddr_t)ptes : 2472 (vaddr_t)pdes[level - 2]; 2473 pmap_tlb_shootdown(pmap, invaladdr + index * PAGE_SIZE, 2474 opde, TLBSHOOT_FREE_PTP); 2475 2476 #if defined(XENPV) 2477 pmap_tlb_shootnow(); 2478 #endif 2479 2480 pmap_freepage(pmap, ptp, level); 2481 if (level < PTP_LEVELS - 1) { 2482 ptp = pmap_find_ptp(pmap, va, level + 1); 2483 ptp->wire_count--; 2484 if (ptp->wire_count > 1) 2485 break; 2486 } 2487 } while (++level < PTP_LEVELS); 2488 pmap_pte_flush(); 2489 } 2490 2491 /* 2492 * pmap_get_ptp: get a PTP (if there isn't one, allocate a new one) 2493 * 2494 * => pmap should NOT be pmap_kernel() 2495 * => pmap should be locked 2496 * => we are not touching any PTEs yet, so they need not be mapped in 2497 */ 2498 static int 2499 pmap_get_ptp(struct pmap *pmap, struct pmap_ptparray *pt, vaddr_t va, 2500 int flags, struct vm_page **resultp) 2501 { 2502 struct vm_page *ptp; 2503 int i, aflags; 2504 struct uvm_object *obj; 2505 voff_t off; 2506 2507 KASSERT(pmap != pmap_kernel()); 2508 KASSERT(mutex_owned(&pmap->pm_lock)); 2509 2510 /* 2511 * Loop through all page table levels allocating a page 2512 * for any level where we don't already have one. 2513 */ 2514 memset(pt, 0, sizeof(*pt)); 2515 aflags = ((flags & PMAP_CANFAIL) ? 0 : UVM_PGA_USERESERVE) | 2516 UVM_PGA_ZERO; 2517 for (i = PTP_LEVELS; i > 1; i--) { 2518 obj = &pmap->pm_obj[i - 2]; 2519 off = ptp_va2o(va, i - 1); 2520 2521 PMAP_DUMMY_LOCK(pmap); 2522 pt->pg[i] = uvm_pagelookup(obj, off); 2523 2524 if (pt->pg[i] == NULL) { 2525 pt->pg[i] = uvm_pagealloc(obj, off, NULL, aflags); 2526 pt->alloced[i] = (pt->pg[i] != NULL); 2527 } else if (pt->pg[i]->wire_count == 0) { 2528 /* This page was queued to be freed; dequeue it. 
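 * It is still linked on pm_gc_ptp with a wire count of zero, so take
 * it back off that list and treat it like a freshly allocated PTP
 * (its rb tree and uanon are reinitialized below).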
*/ 2529 LIST_REMOVE(pt->pg[i], mdpage.mp_pp.pp_link); 2530 pt->alloced[i] = true; 2531 } 2532 PMAP_DUMMY_UNLOCK(pmap); 2533 if (pt->pg[i] == NULL) { 2534 pmap_unget_ptp(pmap, pt); 2535 return ENOMEM; 2536 } else if (pt->alloced[i]) { 2537 pt->pg[i]->uanon = (struct vm_anon *)(vaddr_t)~0L; 2538 rb_tree_init(&VM_PAGE_TO_PP(pt->pg[i])->pp_rb, 2539 &pmap_rbtree_ops); 2540 PMAP_CHECK_PP(VM_PAGE_TO_PP(pt->pg[i])); 2541 } 2542 } 2543 ptp = pt->pg[2]; 2544 KASSERT(ptp != NULL); 2545 *resultp = ptp; 2546 pmap->pm_ptphint[0] = ptp; 2547 return 0; 2548 } 2549 2550 /* 2551 * pmap_install_ptp: install any freshly allocated PTPs 2552 * 2553 * => pmap should NOT be pmap_kernel() 2554 * => pmap should be locked 2555 * => PTEs must be mapped 2556 * => preemption must be disabled 2557 */ 2558 static void 2559 pmap_install_ptp(struct pmap *pmap, struct pmap_ptparray *pt, vaddr_t va, 2560 pd_entry_t * const *pdes) 2561 { 2562 struct vm_page *ptp; 2563 unsigned long index; 2564 pd_entry_t *pva; 2565 paddr_t pa; 2566 int i; 2567 2568 KASSERT(pmap != pmap_kernel()); 2569 KASSERT(mutex_owned(&pmap->pm_lock)); 2570 KASSERT(kpreempt_disabled()); 2571 2572 /* 2573 * Now that we have all the pages looked up or allocated, 2574 * loop through again installing any new ones into the tree. 2575 */ 2576 for (i = PTP_LEVELS; i > 1; i--) { 2577 index = pl_i(va, i); 2578 pva = pdes[i - 2]; 2579 2580 if (pmap_valid_entry(pva[index])) { 2581 KASSERT(!pt->alloced[i]); 2582 continue; 2583 } 2584 2585 ptp = pt->pg[i]; 2586 ptp->flags &= ~PG_BUSY; /* never busy */ 2587 ptp->wire_count = 1; 2588 pmap->pm_ptphint[i - 2] = ptp; 2589 pa = VM_PAGE_TO_PHYS(ptp); 2590 pmap_pte_set(&pva[index], (pd_entry_t) 2591 (pmap_pa2pte(pa) | PTE_U | PTE_W | PTE_P)); 2592 2593 /* 2594 * On Xen-amd64 or SVS, we need to sync the top level page 2595 * directory on each CPU. 2596 */ 2597 #if defined(XENPV) && defined(__x86_64__) 2598 if (i == PTP_LEVELS) { 2599 xen_kpm_sync(pmap, index); 2600 } 2601 #elif defined(SVS) 2602 if (svs_enabled && i == PTP_LEVELS) { 2603 svs_pmap_sync(pmap, index); 2604 } 2605 #endif 2606 2607 pmap_pte_flush(); 2608 pmap_stats_update(pmap, 1, 0); 2609 2610 /* 2611 * If we're not in the top level, increase the 2612 * wire count of the parent page. 2613 */ 2614 if (i < PTP_LEVELS) { 2615 pt->pg[i + 1]->wire_count++; 2616 } 2617 } 2618 } 2619 2620 /* 2621 * pmap_unget_ptp: free unusued PTPs 2622 * 2623 * => pmap should NOT be pmap_kernel() 2624 * => pmap should be locked 2625 */ 2626 static void 2627 pmap_unget_ptp(struct pmap *pmap, struct pmap_ptparray *pt) 2628 { 2629 int i; 2630 2631 KASSERT(pmap != pmap_kernel()); 2632 KASSERT(mutex_owned(&pmap->pm_lock)); 2633 2634 for (i = PTP_LEVELS; i > 1; i--) { 2635 if (!pt->alloced[i]) { 2636 continue; 2637 } 2638 KASSERT(pt->pg[i]->wire_count == 0); 2639 PMAP_CHECK_PP(VM_PAGE_TO_PP(pt->pg[i])); 2640 pmap_freepage(pmap, pt->pg[i], i - 1); 2641 } 2642 } 2643 2644 /* 2645 * p m a p l i f e c y c l e f u n c t i o n s 2646 */ 2647 2648 /* 2649 * pmap_pdp_init: constructor a new PDP. 2650 */ 2651 static void 2652 pmap_pdp_init(pd_entry_t *pdir) 2653 { 2654 paddr_t pdirpa = 0; 2655 vaddr_t object; 2656 int i; 2657 2658 #if !defined(XENPV) || !defined(__x86_64__) 2659 int npde; 2660 #endif 2661 #ifdef XENPV 2662 int s; 2663 #endif 2664 2665 memset(PAGE_ALIGNED(pdir), 0, PDP_SIZE * PAGE_SIZE); 2666 2667 /* 2668 * NOTE: This is all done unlocked, but we will check afterwards 2669 * if we have raced with pmap_growkernel(). 
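 * The check is made by the caller, pmap_ctor(), below: it re-runs
 * pmap_pdp_init() until the last kernel slot copied from PDP_BASE is
 * seen non-zero while pmaps_lock is held, so a PDP built from a stale
 * snapshot of the kernel PDEs is simply rebuilt.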
2670 */ 2671 2672 #if defined(XENPV) && defined(__x86_64__) 2673 /* Fetch the physical address of the page directory */ 2674 (void)pmap_extract(pmap_kernel(), (vaddr_t)pdir, &pdirpa); 2675 2676 /* 2677 * This pdir will NEVER be active in kernel mode, so mark 2678 * recursive entry invalid. 2679 */ 2680 pdir[PDIR_SLOT_PTE] = pmap_pa2pte(pdirpa); 2681 2682 /* 2683 * PDP constructed this way won't be for the kernel, hence we 2684 * don't put kernel mappings on Xen. 2685 * 2686 * But we need to make pmap_create() happy, so put a dummy 2687 * (without PTE_P) value at the right place. 2688 */ 2689 pdir[PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1] - 1] = 2690 (pd_entry_t)-1 & PTE_FRAME; 2691 #else /* XENPV && __x86_64__*/ 2692 object = (vaddr_t)pdir; 2693 for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) { 2694 /* Fetch the physical address of the page directory */ 2695 (void)pmap_extract(pmap_kernel(), object, &pdirpa); 2696 2697 /* Put in recursive PDE to map the PTEs */ 2698 pdir[PDIR_SLOT_PTE + i] = pmap_pa2pte(pdirpa) | PTE_P | 2699 pmap_pg_nx; 2700 #ifndef XENPV 2701 pdir[PDIR_SLOT_PTE + i] |= PTE_W; 2702 #endif 2703 } 2704 2705 /* Copy the kernel's top level PDE */ 2706 npde = nkptp[PTP_LEVELS - 1]; 2707 2708 memcpy(&pdir[PDIR_SLOT_KERN], &PDP_BASE[PDIR_SLOT_KERN], 2709 npde * sizeof(pd_entry_t)); 2710 2711 if (VM_MIN_KERNEL_ADDRESS != KERNBASE) { 2712 int idx = pl_i(KERNBASE, PTP_LEVELS); 2713 pdir[idx] = PDP_BASE[idx]; 2714 } 2715 2716 #ifdef __HAVE_PCPU_AREA 2717 pdir[PDIR_SLOT_PCPU] = PDP_BASE[PDIR_SLOT_PCPU]; 2718 #endif 2719 #ifdef __HAVE_DIRECT_MAP 2720 slotspace_copy(SLAREA_DMAP, pdir, PDP_BASE); 2721 #endif 2722 #ifdef KASAN 2723 slotspace_copy(SLAREA_ASAN, pdir, PDP_BASE); 2724 #endif 2725 #ifdef KMSAN 2726 slotspace_copy(SLAREA_MSAN, pdir, PDP_BASE); 2727 #endif 2728 #endif /* XENPV && __x86_64__*/ 2729 2730 #ifdef XENPV 2731 s = splvm(); 2732 object = (vaddr_t)pdir; 2733 pmap_protect(pmap_kernel(), object, object + (PAGE_SIZE * PDP_SIZE), 2734 VM_PROT_READ); 2735 pmap_update(pmap_kernel()); 2736 for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) { 2737 /* 2738 * pin as L2/L4 page, we have to do the page with the 2739 * PDIR_SLOT_PTE entries last 2740 */ 2741 #ifdef PAE 2742 if (i == l2tol3(PDIR_SLOT_PTE)) 2743 continue; 2744 #endif 2745 2746 (void) pmap_extract(pmap_kernel(), object, &pdirpa); 2747 #ifdef __x86_64__ 2748 xpq_queue_pin_l4_table(xpmap_ptom_masked(pdirpa)); 2749 #else 2750 xpq_queue_pin_l2_table(xpmap_ptom_masked(pdirpa)); 2751 #endif 2752 } 2753 #ifdef PAE 2754 object = ((vaddr_t)pdir) + PAGE_SIZE * l2tol3(PDIR_SLOT_PTE); 2755 (void)pmap_extract(pmap_kernel(), object, &pdirpa); 2756 xpq_queue_pin_l2_table(xpmap_ptom_masked(pdirpa)); 2757 #endif 2758 splx(s); 2759 #endif /* XENPV */ 2760 } 2761 2762 /* 2763 * pmap_pdp_fini: destructor for the PDPs. 2764 */ 2765 static void 2766 pmap_pdp_fini(pd_entry_t *pdir) 2767 { 2768 #ifdef XENPV 2769 paddr_t pdirpa = 0; /* XXX: GCC */ 2770 vaddr_t object = (vaddr_t)pdir; 2771 int i; 2772 int s = splvm(); 2773 pt_entry_t *pte; 2774 2775 for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) { 2776 /* fetch the physical address of the page directory. 
*/ 2777 (void) pmap_extract(pmap_kernel(), object, &pdirpa); 2778 /* unpin page table */ 2779 xpq_queue_unpin_table(xpmap_ptom_masked(pdirpa)); 2780 } 2781 object = (vaddr_t)pdir; 2782 for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) { 2783 /* Set page RW again */ 2784 pte = kvtopte(object); 2785 pmap_pte_set(pte, *pte | PTE_W); 2786 xen_bcast_invlpg((vaddr_t)object); 2787 } 2788 splx(s); 2789 #endif /* XENPV */ 2790 } 2791 2792 #ifdef PAE 2793 static void * 2794 pmap_pdp_alloc(struct pool *pp, int flags) 2795 { 2796 return (void *)uvm_km_alloc(kernel_map, 2797 PAGE_SIZE * PDP_SIZE, PAGE_SIZE * PDP_SIZE, 2798 ((flags & PR_WAITOK) ? 0 : UVM_KMF_NOWAIT | UVM_KMF_TRYLOCK) | 2799 UVM_KMF_WIRED); 2800 } 2801 2802 static void 2803 pmap_pdp_free(struct pool *pp, void *v) 2804 { 2805 uvm_km_free(kernel_map, (vaddr_t)v, PAGE_SIZE * PDP_SIZE, 2806 UVM_KMF_WIRED); 2807 } 2808 #endif /* PAE */ 2809 2810 /* 2811 * pmap_ctor: constructor for the pmap cache. 2812 */ 2813 static int 2814 pmap_ctor(void *arg, void *obj, int flags) 2815 { 2816 struct pmap *pmap = obj; 2817 pt_entry_t p; 2818 int i; 2819 2820 KASSERT((flags & PR_WAITOK) != 0); 2821 2822 mutex_init(&pmap->pm_lock, MUTEX_DEFAULT, IPL_NONE); 2823 rw_init(&pmap->pm_dummy_lock); 2824 kcpuset_create(&pmap->pm_cpus, true); 2825 kcpuset_create(&pmap->pm_kernel_cpus, true); 2826 #ifdef XENPV 2827 kcpuset_create(&pmap->pm_xen_ptp_cpus, true); 2828 #endif 2829 LIST_INIT(&pmap->pm_gc_ptp); 2830 pmap->pm_pve = NULL; 2831 LIST_INIT(&pmap->pm_pvp_full); 2832 LIST_INIT(&pmap->pm_pvp_part); 2833 LIST_INIT(&pmap->pm_pvp_empty); 2834 2835 /* allocate and init PDP */ 2836 pmap->pm_pdir = pool_get(&pmap_pdp_pool, PR_WAITOK); 2837 2838 for (;;) { 2839 pmap_pdp_init(pmap->pm_pdir); 2840 mutex_enter(&pmaps_lock); 2841 p = pmap->pm_pdir[PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1] - 1]; 2842 if (__predict_true(p != 0)) { 2843 break; 2844 } 2845 mutex_exit(&pmaps_lock); 2846 } 2847 2848 for (i = 0; i < PDP_SIZE; i++) 2849 pmap->pm_pdirpa[i] = 2850 pmap_pte2pa(pmap->pm_pdir[PDIR_SLOT_PTE + i]); 2851 2852 LIST_INSERT_HEAD(&pmaps, pmap, pm_list); 2853 mutex_exit(&pmaps_lock); 2854 2855 return 0; 2856 } 2857 2858 /* 2859 * pmap_dtor: destructor for the pmap cache. 2860 */ 2861 static void 2862 pmap_dtor(void *arg, void *obj) 2863 { 2864 struct pmap *pmap = obj; 2865 2866 mutex_enter(&pmaps_lock); 2867 LIST_REMOVE(pmap, pm_list); 2868 mutex_exit(&pmaps_lock); 2869 2870 pmap_pdp_fini(pmap->pm_pdir); 2871 pool_put(&pmap_pdp_pool, pmap->pm_pdir); 2872 mutex_destroy(&pmap->pm_lock); 2873 rw_destroy(&pmap->pm_dummy_lock); 2874 kcpuset_destroy(pmap->pm_cpus); 2875 kcpuset_destroy(pmap->pm_kernel_cpus); 2876 #ifdef XENPV 2877 kcpuset_destroy(pmap->pm_xen_ptp_cpus); 2878 #endif 2879 } 2880 2881 /* 2882 * pmap_create: create a pmap object.
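 * The new pmap starts with a single reference; pmap_reference() gains
 * more and pmap_destroy() drops them.  A rough, illustrative lifecycle
 * sketch, not literal caller code (va, pa, prot and flags stand for
 * whatever the caller maps):
 *
 *	struct pmap *pm = pmap_create();
 *	pmap_enter(pm, va, pa, prot, flags);
 *	...
 *	pmap_remove_all(pm);
 *	pmap_destroy(pm);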
2883 */ 2884 struct pmap * 2885 pmap_create(void) 2886 { 2887 struct pmap *pmap; 2888 int i; 2889 2890 pmap = pool_cache_get(&pmap_cache, PR_WAITOK); 2891 2892 /* init uvm_object */ 2893 for (i = 0; i < PTP_LEVELS - 1; i++) { 2894 uvm_obj_init(&pmap->pm_obj[i], &pmap_pager, false, 1); 2895 uvm_obj_setlock(&pmap->pm_obj[i], &pmap->pm_dummy_lock); 2896 pmap->pm_ptphint[i] = NULL; 2897 } 2898 pmap->pm_stats.wired_count = 0; 2899 /* count the PDP allocd below */ 2900 pmap->pm_stats.resident_count = PDP_SIZE; 2901 #if !defined(__x86_64__) 2902 pmap->pm_hiexec = 0; 2903 #endif 2904 2905 /* Used by NVMM and Xen */ 2906 pmap->pm_enter = NULL; 2907 pmap->pm_extract = NULL; 2908 pmap->pm_remove = NULL; 2909 pmap->pm_sync_pv = NULL; 2910 pmap->pm_pp_remove_ent = NULL; 2911 pmap->pm_write_protect = NULL; 2912 pmap->pm_unwire = NULL; 2913 pmap->pm_tlb_flush = NULL; 2914 pmap->pm_data = NULL; 2915 2916 /* init the LDT */ 2917 pmap->pm_ldt = NULL; 2918 pmap->pm_ldt_sel = GSYSSEL(GLDT_SEL, SEL_KPL); 2919 2920 return pmap; 2921 } 2922 2923 /* 2924 * pmap_check_ptps: verify that none of the pmap's page table objects 2925 * have any pages allocated to them. 2926 */ 2927 static void 2928 pmap_check_ptps(struct pmap *pmap) 2929 { 2930 int i; 2931 2932 for (i = 0; i < PTP_LEVELS - 1; i++) { 2933 KASSERTMSG(pmap->pm_obj[i].uo_npages == 0, 2934 "pmap %p level %d still has %d pages", 2935 pmap, i, (int)pmap->pm_obj[i].uo_npages); 2936 } 2937 } 2938 2939 static void 2940 pmap_check_inuse(struct pmap *pmap) 2941 { 2942 #ifdef DEBUG 2943 CPU_INFO_ITERATOR cii; 2944 struct cpu_info *ci; 2945 2946 for (CPU_INFO_FOREACH(cii, ci)) { 2947 if (ci->ci_pmap == pmap) 2948 panic("destroying pmap being used"); 2949 #if defined(XENPV) && defined(__x86_64__) 2950 for (int i = 0; i < PDIR_SLOT_USERLIM; i++) { 2951 if (pmap->pm_pdir[i] != 0 && 2952 ci->ci_kpm_pdir[i] == pmap->pm_pdir[i]) { 2953 printf("pmap_destroy(%p) pmap_kernel %p " 2954 "curcpu %d cpu %d ci_pmap %p " 2955 "ci->ci_kpm_pdir[%d]=%" PRIx64 2956 " pmap->pm_pdir[%d]=%" PRIx64 "\n", 2957 pmap, pmap_kernel(), curcpu()->ci_index, 2958 ci->ci_index, ci->ci_pmap, 2959 i, ci->ci_kpm_pdir[i], 2960 i, pmap->pm_pdir[i]); 2961 panic("%s: used pmap", __func__); 2962 } 2963 } 2964 #endif 2965 } 2966 #endif /* DEBUG */ 2967 } 2968 2969 /* 2970 * pmap_destroy: drop reference count on pmap. free pmap if reference 2971 * count goes to zero. 2972 * 2973 * => we can be called from pmap_unmap_ptes() with a different, unrelated 2974 * pmap's lock held. be careful! 2975 */ 2976 void 2977 pmap_destroy(struct pmap *pmap) 2978 { 2979 int i; 2980 2981 /* 2982 * drop reference count and verify not in use. 2983 */ 2984 2985 if (atomic_dec_uint_nv(&pmap->pm_obj[0].uo_refs) > 0) { 2986 return; 2987 } 2988 pmap_check_inuse(pmap); 2989 2990 /* 2991 * handle any deferred frees. 2992 */ 2993 2994 mutex_enter(&pmap->pm_lock); 2995 if (pmap->pm_pve != NULL) { 2996 pmap_free_pv(pmap, pmap->pm_pve); 2997 pmap->pm_pve = NULL; 2998 } 2999 pmap_drain_pv(pmap); 3000 mutex_exit(&pmap->pm_lock); 3001 pmap_update(pmap); 3002 3003 /* 3004 * Reference count is zero, free pmap resources and then free pmap. 3005 */ 3006 3007 pmap_check_ptps(pmap); 3008 KASSERT(LIST_EMPTY(&pmap->pm_gc_ptp)); 3009 3010 #ifdef USER_LDT 3011 if (pmap->pm_ldt != NULL) { 3012 /* 3013 * No need to switch the LDT; this address space is gone, 3014 * nothing is using it. 3015 * 3016 * No need to lock the pmap for ldt_free (or anything else), 3017 * we're the last one to use it. 3018 */ 3019 /* XXXAD can't take cpu_lock here - fix soon. 
*/ 3020 mutex_enter(&cpu_lock); 3021 ldt_free(pmap->pm_ldt_sel); 3022 mutex_exit(&cpu_lock); 3023 uvm_km_free(kernel_map, (vaddr_t)pmap->pm_ldt, 3024 MAX_USERLDT_SIZE, UVM_KMF_WIRED); 3025 } 3026 #endif 3027 3028 for (i = 0; i < PTP_LEVELS - 1; i++) { 3029 uvm_obj_destroy(&pmap->pm_obj[i], false); 3030 } 3031 kcpuset_zero(pmap->pm_cpus); 3032 kcpuset_zero(pmap->pm_kernel_cpus); 3033 #ifdef XENPV 3034 kcpuset_zero(pmap->pm_xen_ptp_cpus); 3035 #endif 3036 3037 KASSERT(LIST_EMPTY(&pmap->pm_pvp_full)); 3038 KASSERT(LIST_EMPTY(&pmap->pm_pvp_part)); 3039 KASSERT(LIST_EMPTY(&pmap->pm_pvp_empty)); 3040 3041 pmap_check_ptps(pmap); 3042 if (__predict_false(pmap->pm_enter != NULL)) { 3043 /* XXX make this a different cache */ 3044 pool_cache_destruct_object(&pmap_cache, pmap); 3045 } else { 3046 pool_cache_put(&pmap_cache, pmap); 3047 } 3048 } 3049 3050 /* 3051 * pmap_zap_ptp: clear out an entire PTP without modifying PTEs 3052 * 3053 * => caller must hold pmap's lock 3054 * => PTP must be mapped into KVA 3055 * => must be called with kernel preemption disabled 3056 * => does as little work as possible 3057 */ 3058 static void 3059 pmap_zap_ptp(struct pmap *pmap, struct vm_page *ptp, pt_entry_t *pte, 3060 vaddr_t startva, vaddr_t blkendva) 3061 { 3062 #ifndef XENPV 3063 struct pv_entry *pve; 3064 struct vm_page *pg; 3065 struct pmap_page *pp; 3066 pt_entry_t opte; 3067 rb_tree_t *tree; 3068 vaddr_t va; 3069 int wired; 3070 uint8_t oattrs; 3071 u_int cnt; 3072 3073 KASSERT(mutex_owned(&pmap->pm_lock)); 3074 KASSERT(kpreempt_disabled()); 3075 KASSERT(pmap != pmap_kernel()); 3076 KASSERT(ptp->wire_count > 1); 3077 KASSERT(ptp->wire_count - 1 <= PAGE_SIZE / sizeof(pt_entry_t)); 3078 3079 /* 3080 * Start at the lowest entered VA, and scan until there are no more 3081 * PTEs in the PTPs. 3082 */ 3083 tree = &VM_PAGE_TO_PP(ptp)->pp_rb; 3084 pve = RB_TREE_MIN(tree); 3085 wired = 0; 3086 va = (vaddr_t)ptp->uanon; 3087 pte += ((va - startva) >> PAGE_SHIFT); 3088 3089 for (cnt = ptp->wire_count; cnt > 1; pte++, va += PAGE_SIZE) { 3090 /* 3091 * No need for an atomic to clear the PTE. Nothing else can 3092 * see the address space any more and speculative access (if 3093 * possible) won't modify. Therefore there's no need to 3094 * track the accessed/dirty bits. 3095 */ 3096 opte = *pte; 3097 if (!pmap_valid_entry(opte)) { 3098 continue; 3099 } 3100 3101 /* 3102 * Count the PTE. If it's not for a managed mapping 3103 * there's noting more to do. 3104 */ 3105 cnt--; 3106 wired -= (opte & PTE_WIRED); 3107 if ((opte & PTE_PVLIST) == 0) { 3108 #ifndef DOM0OPS 3109 KASSERTMSG((PHYS_TO_VM_PAGE(pmap_pte2pa(opte)) == NULL), 3110 "managed page without PTE_PVLIST for %#" 3111 PRIxVADDR, va); 3112 KASSERTMSG((pmap_pv_tracked(pmap_pte2pa(opte)) == NULL), 3113 "pv-tracked page without PTE_PVLIST for %#" 3114 PRIxVADDR, va); 3115 #endif 3116 KASSERT(pmap_treelookup_pv(pmap, ptp, (ptp != NULL ? 3117 &VM_PAGE_TO_PP(ptp)->pp_rb : &pmap_kernel_rb), 3118 va) == NULL); 3119 continue; 3120 } 3121 3122 /* 3123 * "pve" now points to the lowest (by VA) dynamic PV entry 3124 * in the PTP. If it's for this VA, take advantage of it to 3125 * avoid calling PHYS_TO_VM_PAGE(). Avoid modifying the RB 3126 * tree by skipping to the next VA in the tree whenever 3127 * there is a match here. The tree will be cleared out in 3128 * one pass before return to pmap_remove_all(). 
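 * (The matching entries are only put back on their pv_page free lists
 * here, via pmap_free_pv(); the pages themselves are handed back to
 * the global pool by pmap_drain_pv(), which pmap_remove_all() runs
 * once per batch of PTPs.)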
3129 */ 3130 oattrs = pmap_pte_to_pp_attrs(opte); 3131 if (pve != NULL && pve->pve_pte.pte_va == va) { 3132 pp = pve->pve_pp; 3133 KASSERT(pve->pve_pte.pte_ptp == ptp); 3134 KASSERT(pp->pp_pte.pte_ptp != ptp || 3135 pp->pp_pte.pte_va != va); 3136 mutex_spin_enter(&pp->pp_lock); 3137 pp->pp_attrs |= oattrs; 3138 LIST_REMOVE(pve, pve_list); 3139 mutex_spin_exit(&pp->pp_lock); 3140 3141 /* 3142 * pve won't be touched again until pmap_drain_pv(), 3143 * so it's still safe to traverse the tree. 3144 */ 3145 pmap_free_pv(pmap, pve); 3146 pve = RB_TREE_NEXT(tree, pve); 3147 continue; 3148 } 3149 3150 /* 3151 * No entry in the tree so it must be embedded. Look up the 3152 * page and cancel the embedded entry. 3153 */ 3154 if ((pg = PHYS_TO_VM_PAGE(pmap_pte2pa(opte))) != NULL) { 3155 pp = VM_PAGE_TO_PP(pg); 3156 } else if ((pp = pmap_pv_tracked(pmap_pte2pa(opte))) == NULL) { 3157 paddr_t pa = pmap_pte2pa(opte); 3158 panic("%s: PTE_PVLIST with pv-untracked page" 3159 " va = %#"PRIxVADDR"pa = %#"PRIxPADDR 3160 "(%#"PRIxPADDR")", __func__, va, pa, atop(pa)); 3161 } 3162 mutex_spin_enter(&pp->pp_lock); 3163 KASSERT(pp->pp_pte.pte_ptp == ptp); 3164 KASSERT(pp->pp_pte.pte_va == va); 3165 pp->pp_attrs |= oattrs; 3166 pp->pp_pte.pte_ptp = NULL; 3167 pp->pp_pte.pte_va = 0; 3168 mutex_spin_exit(&pp->pp_lock); 3169 } 3170 3171 /* PTP now empty - adjust the tree & stats to match. */ 3172 pmap_stats_update(pmap, -(ptp->wire_count - 1), wired / PTE_WIRED); 3173 ptp->wire_count = 1; 3174 #ifdef DIAGNOSTIC 3175 rb_tree_init(tree, &pmap_rbtree_ops); 3176 #endif 3177 #else /* !XENPV */ 3178 /* 3179 * XXXAD For XEN, it's not clear to me that we can do this, because 3180 * I guess the hypervisor keeps track of PTEs too. 3181 */ 3182 pmap_remove_ptes(pmap, ptp, (vaddr_t)pte, startva, blkendva); 3183 #endif /* !XENPV */ 3184 } 3185 3186 /* 3187 * pmap_remove_all: remove all mappings from pmap in bulk. 3188 * 3189 * Ordinarily when removing mappings it's important to hold the UVM object's 3190 * lock, so that pages do not gain a new identity while retaining stale TLB 3191 * entries (the same lock hold covers both pmap_remove() and pmap_update()). 3192 * Here it's known that the address space is no longer visible to any user 3193 * process, so we don't need to worry about that. 3194 */ 3195 bool 3196 pmap_remove_all(struct pmap *pmap) 3197 { 3198 struct vm_page *ptps[32]; 3199 vaddr_t va, blkendva; 3200 struct pmap *pmap2; 3201 pt_entry_t *ptes; 3202 pd_entry_t pde __diagused; 3203 pd_entry_t * const *pdes; 3204 int lvl __diagused, i, n; 3205 3206 /* XXX Can't handle EPT just yet. */ 3207 if (pmap->pm_remove != NULL) { 3208 return false; 3209 } 3210 3211 for (;;) { 3212 /* Fetch a block of PTPs from tree. */ 3213 mutex_enter(&pmap->pm_lock); 3214 n = radix_tree_gang_lookup_node(&pmap->pm_obj[0].uo_pages, 0, 3215 (void **)ptps, __arraycount(ptps), false); 3216 if (n == 0) { 3217 mutex_exit(&pmap->pm_lock); 3218 break; 3219 } 3220 3221 /* Remove all mappings in the set of PTPs. */ 3222 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); 3223 for (i = 0; i < n; i++) { 3224 if (ptps[i]->wire_count == 0) { 3225 /* It's dead: pmap_update() will expunge. */ 3226 continue; 3227 } 3228 3229 /* Determine range of block. */ 3230 va = ptps[i]->offset * PAGE_SIZE / sizeof(pt_entry_t); 3231 blkendva = x86_round_pdr(va + 1); 3232 3233 /* Make sure everything squares up... */ 3234 KASSERT(pmap_pdes_valid(va, pdes, &pde, &lvl)); 3235 KASSERT(lvl == 1); 3236 KASSERT(pmap_find_ptp(pmap, va, 1) == ptps[i]); 3237 3238 /* Zap! 
*/ 3239 pmap_zap_ptp(pmap, ptps[i], &ptes[pl1_i(va)], va, 3240 blkendva); 3241 3242 /* PTP should now be unused - free it. */ 3243 KASSERT(ptps[i]->wire_count == 1); 3244 pmap_free_ptp(pmap, ptps[i], va, ptes, pdes); 3245 } 3246 pmap_unmap_ptes(pmap, pmap2); 3247 pmap_drain_pv(pmap); 3248 pmap_tlb_shootdown(pmap, -1L, 0, TLBSHOOT_REMOVE_ALL); 3249 mutex_exit(&pmap->pm_lock); 3250 3251 /* Process deferred frees. */ 3252 pmap_update(pmap); 3253 3254 /* A breathing point. */ 3255 preempt_point(); 3256 } 3257 3258 /* Verify that the pmap is now completely empty. */ 3259 pmap_check_ptps(pmap); 3260 KASSERTMSG(pmap->pm_stats.resident_count == PDP_SIZE, 3261 "pmap %p not empty", pmap); 3262 3263 return true; 3264 } 3265 3266 #if defined(PMAP_FORK) 3267 /* 3268 * pmap_fork: perform any necessary data structure manipulation when 3269 * a VM space is forked. 3270 */ 3271 void 3272 pmap_fork(struct pmap *pmap1, struct pmap *pmap2) 3273 { 3274 #ifdef USER_LDT 3275 union descriptor *new_ldt; 3276 int sel; 3277 3278 if (__predict_true(pmap1->pm_ldt == NULL)) { 3279 return; 3280 } 3281 3282 /* 3283 * Copy the LDT into the new process. 3284 * 3285 * Read pmap1's ldt pointer unlocked; if it changes behind our back 3286 * we'll retry. This will starve if there's a stream of LDT changes 3287 * in another thread but that should not happen. 3288 */ 3289 3290 retry: 3291 if (pmap1->pm_ldt != NULL) { 3292 /* Allocate space for the new process's LDT */ 3293 new_ldt = (union descriptor *)uvm_km_alloc(kernel_map, 3294 MAX_USERLDT_SIZE, 0, UVM_KMF_WIRED); 3295 if (new_ldt == NULL) { 3296 printf("WARNING: %s: unable to allocate LDT space\n", 3297 __func__); 3298 return; 3299 } 3300 mutex_enter(&cpu_lock); 3301 /* Get a GDT slot for it */ 3302 sel = ldt_alloc(new_ldt, MAX_USERLDT_SIZE); 3303 if (sel == -1) { 3304 mutex_exit(&cpu_lock); 3305 uvm_km_free(kernel_map, (vaddr_t)new_ldt, 3306 MAX_USERLDT_SIZE, UVM_KMF_WIRED); 3307 printf("WARNING: %s: unable to allocate LDT selector\n", 3308 __func__); 3309 return; 3310 } 3311 } else { 3312 /* Wasn't anything there after all. */ 3313 new_ldt = NULL; 3314 sel = -1; 3315 mutex_enter(&cpu_lock); 3316 } 3317 3318 /* 3319 * Now that we have cpu_lock, ensure the LDT status is the same. 3320 */ 3321 if (pmap1->pm_ldt != NULL) { 3322 if (new_ldt == NULL) { 3323 /* A wild LDT just appeared. */ 3324 mutex_exit(&cpu_lock); 3325 goto retry; 3326 } 3327 3328 /* Copy the LDT data and install it in pmap2 */ 3329 memcpy(new_ldt, pmap1->pm_ldt, MAX_USERLDT_SIZE); 3330 pmap2->pm_ldt = new_ldt; 3331 pmap2->pm_ldt_sel = sel; 3332 mutex_exit(&cpu_lock); 3333 } else { 3334 if (new_ldt != NULL) { 3335 /* The LDT disappeared, drop what we did. */ 3336 ldt_free(sel); 3337 mutex_exit(&cpu_lock); 3338 uvm_km_free(kernel_map, (vaddr_t)new_ldt, 3339 MAX_USERLDT_SIZE, UVM_KMF_WIRED); 3340 return; 3341 } 3342 3343 /* We're good, just leave. */ 3344 mutex_exit(&cpu_lock); 3345 } 3346 #endif /* USER_LDT */ 3347 } 3348 #endif /* PMAP_FORK */ 3349 3350 #ifdef USER_LDT 3351 3352 /* 3353 * pmap_ldt_xcall: cross call used by pmap_ldt_sync. if the named pmap 3354 * is active, reload LDTR. 3355 */ 3356 static void 3357 pmap_ldt_xcall(void *arg1, void *arg2) 3358 { 3359 struct pmap *pm; 3360 3361 kpreempt_disable(); 3362 pm = arg1; 3363 if (curcpu()->ci_pmap == pm) { 3364 #if defined(SVS) 3365 if (svs_enabled) { 3366 svs_ldt_sync(pm); 3367 } else 3368 #endif 3369 lldt(pm->pm_ldt_sel); 3370 } 3371 kpreempt_enable(); 3372 } 3373 3374 /* 3375 * pmap_ldt_sync: LDT selector for the named pmap is changing. 
swap 3376 * in the new selector on all CPUs. 3377 */ 3378 void 3379 pmap_ldt_sync(struct pmap *pm) 3380 { 3381 uint64_t where; 3382 3383 KASSERT(mutex_owned(&cpu_lock)); 3384 3385 pmap_ldt_evcnt.ev_count++; 3386 where = xc_broadcast(0, pmap_ldt_xcall, pm, NULL); 3387 xc_wait(where); 3388 } 3389 3390 /* 3391 * pmap_ldt_cleanup: if the pmap has a local LDT, deallocate it, and 3392 * restore the default. 3393 */ 3394 void 3395 pmap_ldt_cleanup(struct lwp *l) 3396 { 3397 pmap_t pmap = l->l_proc->p_vmspace->vm_map.pmap; 3398 union descriptor *ldt; 3399 int sel; 3400 3401 if (__predict_true(pmap->pm_ldt == NULL)) { 3402 return; 3403 } 3404 3405 mutex_enter(&cpu_lock); 3406 if (pmap->pm_ldt != NULL) { 3407 sel = pmap->pm_ldt_sel; 3408 ldt = pmap->pm_ldt; 3409 pmap->pm_ldt_sel = GSYSSEL(GLDT_SEL, SEL_KPL); 3410 pmap->pm_ldt = NULL; 3411 pmap_ldt_sync(pmap); 3412 ldt_free(sel); 3413 uvm_km_free(kernel_map, (vaddr_t)ldt, MAX_USERLDT_SIZE, 3414 UVM_KMF_WIRED); 3415 } 3416 mutex_exit(&cpu_lock); 3417 } 3418 #endif /* USER_LDT */ 3419 3420 /* 3421 * pmap_activate: activate a process' pmap 3422 * 3423 * => must be called with kernel preemption disabled 3424 * => if lwp is the curlwp, then set ci_want_pmapload so that 3425 * actual MMU context switch will be done by pmap_load() later 3426 */ 3427 void 3428 pmap_activate(struct lwp *l) 3429 { 3430 struct cpu_info *ci; 3431 struct pmap *pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map); 3432 3433 KASSERT(kpreempt_disabled()); 3434 3435 ci = curcpu(); 3436 3437 if (l != ci->ci_curlwp) 3438 return; 3439 3440 KASSERT(ci->ci_want_pmapload == 0); 3441 KASSERT(ci->ci_tlbstate != TLBSTATE_VALID); 3442 3443 /* 3444 * no need to switch to kernel vmspace because 3445 * it's a subset of any vmspace. 3446 */ 3447 3448 if (pmap == pmap_kernel()) { 3449 ci->ci_want_pmapload = 0; 3450 return; 3451 } 3452 3453 ci->ci_want_pmapload = 1; 3454 } 3455 3456 #if defined(XENPV) && defined(__x86_64__) 3457 #define KASSERT_PDIRPA(pmap) \ 3458 KASSERT(pmap_pdirpa(pmap, 0) == ci->ci_xen_current_user_pgd || \ 3459 pmap == pmap_kernel()) 3460 #elif defined(PAE) 3461 #define KASSERT_PDIRPA(pmap) \ 3462 KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(ci->ci_pae_l3_pdir[0])) 3463 #elif !defined(XENPV) 3464 #define KASSERT_PDIRPA(pmap) \ 3465 KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(rcr3())) 3466 #else 3467 #define KASSERT_PDIRPA(pmap) KASSERT(true) /* nothing to do */ 3468 #endif 3469 3470 /* 3471 * pmap_reactivate: try to regain reference to the pmap. 3472 * 3473 * => Must be called with kernel preemption disabled. 3474 */ 3475 static void 3476 pmap_reactivate(struct pmap *pmap) 3477 { 3478 struct cpu_info * const ci = curcpu(); 3479 const cpuid_t cid = cpu_index(ci); 3480 3481 KASSERT(kpreempt_disabled()); 3482 KASSERT_PDIRPA(pmap); 3483 3484 /* 3485 * If we still have a lazy reference to this pmap, we can assume 3486 * that there was no TLB shootdown for this pmap in the meantime. 3487 * 3488 * The order of events here is important as we must synchronize 3489 * with TLB shootdown interrupts. Declare interest in invalidations 3490 * (TLBSTATE_VALID) and then check the CPU set, which the IPIs can 3491 * change only when the state is TLBSTATE_LAZY. 3492 */ 3493 3494 ci->ci_tlbstate = TLBSTATE_VALID; 3495 KASSERT(kcpuset_isset(pmap->pm_kernel_cpus, cid)); 3496 3497 if (__predict_true(kcpuset_isset(pmap->pm_cpus, cid))) { 3498 /* We have the reference, state is valid. */ 3499 } else { 3500 /* 3501 * Must reload the TLB, pmap has been changed during 3502 * deactivated. 
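 * (i.e. a TLB shootdown removed this CPU from pm_cpus while the
 * pmap was in the TLBSTATE_LAZY state, so cached entries may be
 * stale: re-register in pm_cpus and flush the whole TLB.)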
3503 */ 3504 kcpuset_atomic_set(pmap->pm_cpus, cid); 3505 3506 tlbflush(); 3507 } 3508 } 3509 3510 /* 3511 * pmap_load: perform the actual pmap switch, i.e. fill in %cr3 register 3512 * and relevant LDT info. 3513 * 3514 * Ensures that the current process' pmap is loaded on the current CPU's 3515 * MMU and that there are no stale TLB entries. 3516 * 3517 * => The caller should disable kernel preemption or do check-and-retry 3518 * to prevent a preemption from undoing our efforts. 3519 * => This function may block. 3520 */ 3521 void 3522 pmap_load(void) 3523 { 3524 struct cpu_info *ci; 3525 struct pmap *pmap, *oldpmap; 3526 struct lwp *l; 3527 uint64_t ncsw; 3528 3529 kpreempt_disable(); 3530 retry: 3531 ci = curcpu(); 3532 if (!ci->ci_want_pmapload) { 3533 kpreempt_enable(); 3534 return; 3535 } 3536 l = ci->ci_curlwp; 3537 ncsw = l->l_ncsw; 3538 __insn_barrier(); 3539 3540 /* should be able to take ipis. */ 3541 KASSERT(ci->ci_ilevel < IPL_HIGH); 3542 #ifdef XENPV 3543 /* Check to see if interrupts are enabled (ie; no events are masked) */ 3544 KASSERT(x86_read_psl() == 0); 3545 #else 3546 KASSERT((x86_read_psl() & PSL_I) != 0); 3547 #endif 3548 3549 KASSERT(l != NULL); 3550 pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map); 3551 KASSERT(pmap != pmap_kernel()); 3552 oldpmap = ci->ci_pmap; 3553 3554 if (pmap == oldpmap) { 3555 pmap_reactivate(pmap); 3556 ci->ci_want_pmapload = 0; 3557 kpreempt_enable(); 3558 return; 3559 } 3560 3561 /* 3562 * Acquire a reference to the new pmap and perform the switch. 3563 */ 3564 3565 pmap_reference(pmap); 3566 pmap_load1(l, pmap, oldpmap); 3567 ci->ci_want_pmapload = 0; 3568 3569 /* 3570 * we're now running with the new pmap. drop the reference 3571 * to the old pmap. if we block, we need to go around again. 3572 */ 3573 3574 pmap_destroy(oldpmap); 3575 __insn_barrier(); 3576 if (l->l_ncsw != ncsw) { 3577 goto retry; 3578 } 3579 3580 kpreempt_enable(); 3581 } 3582 3583 /* 3584 * pmap_load1: the guts of pmap load, shared by pmap_map_ptes() and 3585 * pmap_load(). It's critically important that this function does not 3586 * block. 3587 */ 3588 static void 3589 pmap_load1(struct lwp *l, struct pmap *pmap, struct pmap *oldpmap) 3590 { 3591 struct cpu_info *ci; 3592 struct pcb *pcb; 3593 cpuid_t cid; 3594 3595 KASSERT(kpreempt_disabled()); 3596 3597 pcb = lwp_getpcb(l); 3598 ci = l->l_cpu; 3599 cid = cpu_index(ci); 3600 3601 kcpuset_atomic_clear(oldpmap->pm_cpus, cid); 3602 kcpuset_atomic_clear(oldpmap->pm_kernel_cpus, cid); 3603 3604 KASSERT_PDIRPA(oldpmap); 3605 KASSERT(!kcpuset_isset(pmap->pm_cpus, cid)); 3606 KASSERT(!kcpuset_isset(pmap->pm_kernel_cpus, cid)); 3607 3608 /* 3609 * Mark the pmap in use by this CPU. Again, we must synchronize 3610 * with TLB shootdown interrupts, so set the state VALID first, 3611 * then register us for shootdown events on this pmap. 3612 */ 3613 ci->ci_tlbstate = TLBSTATE_VALID; 3614 kcpuset_atomic_set(pmap->pm_cpus, cid); 3615 kcpuset_atomic_set(pmap->pm_kernel_cpus, cid); 3616 ci->ci_pmap = pmap; 3617 3618 /* 3619 * update tss. now that we have registered for invalidations 3620 * from other CPUs, we're good to load the page tables. 
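 * (Roughly: pcb_cr3 is the value the low-level switch code will load
 * into %cr3 for this lwp; under PAE it points at the per-CPU L3 page,
 * ci_pae_l3_pdirpa, rather than directly at this pmap's own page
 * directory.)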
3621 */ 3622 #ifdef PAE 3623 pcb->pcb_cr3 = ci->ci_pae_l3_pdirpa; 3624 #else 3625 pcb->pcb_cr3 = pmap_pdirpa(pmap, 0); 3626 #endif 3627 3628 #ifdef i386 3629 #ifndef XENPV 3630 ci->ci_tss->tss.tss_ldt = pmap->pm_ldt_sel; 3631 ci->ci_tss->tss.tss_cr3 = pcb->pcb_cr3; 3632 #endif 3633 #endif 3634 3635 #if defined(SVS) && defined(USER_LDT) 3636 if (svs_enabled) { 3637 svs_ldt_sync(pmap); 3638 } else 3639 #endif 3640 lldt(pmap->pm_ldt_sel); 3641 3642 cpu_load_pmap(pmap, oldpmap); 3643 } 3644 3645 /* 3646 * pmap_deactivate: deactivate a process' pmap. 3647 * 3648 * => Must be called with kernel preemption disabled (high IPL is enough). 3649 */ 3650 void 3651 pmap_deactivate(struct lwp *l) 3652 { 3653 struct pmap *pmap; 3654 struct cpu_info *ci; 3655 3656 KASSERT(kpreempt_disabled()); 3657 3658 if (l != curlwp) { 3659 return; 3660 } 3661 3662 /* 3663 * Wait for pending TLB shootdowns to complete. Necessary because 3664 * TLB shootdown state is per-CPU, and the LWP may be coming off 3665 * the CPU before it has a chance to call pmap_update(), e.g. due 3666 * to kernel preemption or blocking routine in between. 3667 */ 3668 pmap_tlb_shootnow(); 3669 3670 ci = curcpu(); 3671 3672 if (ci->ci_want_pmapload) { 3673 /* 3674 * ci_want_pmapload means that our pmap is not loaded on 3675 * the CPU or TLB might be stale. note that pmap_kernel() 3676 * is always considered loaded. 3677 */ 3678 KASSERT(vm_map_pmap(&l->l_proc->p_vmspace->vm_map) 3679 != pmap_kernel()); 3680 KASSERT(vm_map_pmap(&l->l_proc->p_vmspace->vm_map) 3681 != ci->ci_pmap || ci->ci_tlbstate != TLBSTATE_VALID); 3682 3683 /* 3684 * userspace has not been touched. 3685 * nothing to do here. 3686 */ 3687 3688 ci->ci_want_pmapload = 0; 3689 return; 3690 } 3691 3692 pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map); 3693 3694 if (pmap == pmap_kernel()) { 3695 return; 3696 } 3697 3698 KASSERT_PDIRPA(pmap); 3699 KASSERT(ci->ci_pmap == pmap); 3700 3701 /* 3702 * we aren't interested in TLB invalidations for this pmap, 3703 * at least for the time being. 3704 */ 3705 3706 KASSERT(ci->ci_tlbstate == TLBSTATE_VALID); 3707 ci->ci_tlbstate = TLBSTATE_LAZY; 3708 } 3709 3710 /* 3711 * some misc. 
functions 3712 */ 3713 3714 bool 3715 pmap_pdes_valid(vaddr_t va, pd_entry_t * const *pdes, pd_entry_t *lastpde, 3716 int *lastlvl) 3717 { 3718 unsigned long index; 3719 pd_entry_t pde; 3720 int i; 3721 3722 for (i = PTP_LEVELS; i > 1; i--) { 3723 index = pl_i(va, i); 3724 pde = pdes[i - 2][index]; 3725 if ((pde & PTE_P) == 0) { 3726 *lastlvl = i; 3727 return false; 3728 } 3729 if (pde & PTE_PS) 3730 break; 3731 } 3732 if (lastpde != NULL) 3733 *lastpde = pde; 3734 *lastlvl = i; 3735 return true; 3736 } 3737 3738 /* 3739 * pmap_extract: extract a PA for the given VA 3740 */ 3741 bool 3742 pmap_extract(struct pmap *pmap, vaddr_t va, paddr_t *pap) 3743 { 3744 pt_entry_t *ptes, pte; 3745 pd_entry_t pde; 3746 pd_entry_t * const *pdes; 3747 struct pmap *pmap2; 3748 paddr_t pa; 3749 bool rv; 3750 int lvl; 3751 3752 if (__predict_false(pmap->pm_extract != NULL)) { 3753 return (*pmap->pm_extract)(pmap, va, pap); 3754 } 3755 3756 #ifdef __HAVE_DIRECT_MAP 3757 if (va >= PMAP_DIRECT_BASE && va < PMAP_DIRECT_END) { 3758 if (pap != NULL) { 3759 *pap = PMAP_DIRECT_UNMAP(va); 3760 } 3761 return true; 3762 } 3763 #endif 3764 3765 rv = false; 3766 pa = 0; 3767 3768 if (pmap != pmap_kernel()) { 3769 mutex_enter(&pmap->pm_lock); 3770 } 3771 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); 3772 if (pmap_pdes_valid(va, pdes, &pde, &lvl)) { 3773 if (lvl == 2) { 3774 pa = (pde & PTE_LGFRAME) | (va & (NBPD_L2 - 1)); 3775 rv = true; 3776 } else { 3777 KASSERT(lvl == 1); 3778 pte = ptes[pl1_i(va)]; 3779 if (__predict_true((pte & PTE_P) != 0)) { 3780 pa = pmap_pte2pa(pte) | (va & (NBPD_L1 - 1)); 3781 rv = true; 3782 } 3783 } 3784 } 3785 pmap_unmap_ptes(pmap, pmap2); 3786 if (pmap != pmap_kernel()) { 3787 mutex_exit(&pmap->pm_lock); 3788 } 3789 if (pap != NULL) { 3790 *pap = pa; 3791 } 3792 3793 return rv; 3794 } 3795 3796 /* 3797 * vtophys: virtual address to physical address. For use by 3798 * machine-dependent code only. 3799 */ 3800 paddr_t 3801 vtophys(vaddr_t va) 3802 { 3803 paddr_t pa; 3804 3805 if (pmap_extract(pmap_kernel(), va, &pa) == true) 3806 return pa; 3807 return 0; 3808 } 3809 3810 __strict_weak_alias(pmap_extract_ma, pmap_extract); 3811 3812 #ifdef XENPV 3813 /* 3814 * vtomach: virtual address to machine address. For use by 3815 * machine-dependent code only. 3816 */ 3817 paddr_t 3818 vtomach(vaddr_t va) 3819 { 3820 paddr_t pa; 3821 3822 if (pmap_extract_ma(pmap_kernel(), va, &pa) == true) 3823 return pa; 3824 return 0; 3825 } 3826 #endif 3827 3828 /* 3829 * pmap_virtual_space: used during bootup [pmap_steal_memory] to 3830 * determine the bounds of the kernel virtual address space. 
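 * Both bounds were fixed earlier during bootstrap; everything in
 * [*startp, *endp) is the kernel VA range handed to the VM system to
 * manage.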
3831 */ 3832 void 3833 pmap_virtual_space(vaddr_t *startp, vaddr_t *endp) 3834 { 3835 *startp = virtual_avail; 3836 *endp = virtual_end; 3837 } 3838 3839 void 3840 pmap_zero_page(paddr_t pa) 3841 { 3842 #if defined(__HAVE_DIRECT_MAP) 3843 memset(PAGE_ALIGNED(PMAP_DIRECT_MAP(pa)), 0, PAGE_SIZE); 3844 #else 3845 #if defined(XENPV) 3846 if (XEN_VERSION_SUPPORTED(3, 4)) { 3847 xen_pagezero(pa); 3848 return; 3849 } 3850 #endif 3851 struct cpu_info *ci; 3852 pt_entry_t *zpte; 3853 vaddr_t zerova; 3854 3855 const pd_entry_t pteflags = PTE_P | PTE_W | pmap_pg_nx | PTE_D | PTE_A; 3856 3857 kpreempt_disable(); 3858 3859 ci = curcpu(); 3860 zerova = ci->vpage[VPAGE_ZER]; 3861 zpte = ci->vpage_pte[VPAGE_ZER]; 3862 3863 KASSERTMSG(!*zpte, "pmap_zero_page: lock botch"); 3864 3865 pmap_pte_set(zpte, pmap_pa2pte(pa) | pteflags); 3866 pmap_pte_flush(); 3867 pmap_update_pg(zerova); /* flush TLB */ 3868 3869 memset(PAGE_ALIGNED(zerova), 0, PAGE_SIZE); 3870 3871 #if defined(DIAGNOSTIC) || defined(XENPV) 3872 pmap_pte_set(zpte, 0); /* zap ! */ 3873 pmap_pte_flush(); 3874 #endif 3875 3876 kpreempt_enable(); 3877 #endif /* defined(__HAVE_DIRECT_MAP) */ 3878 } 3879 3880 void 3881 pmap_copy_page(paddr_t srcpa, paddr_t dstpa) 3882 { 3883 #if defined(__HAVE_DIRECT_MAP) 3884 vaddr_t srcva = PMAP_DIRECT_MAP(srcpa); 3885 vaddr_t dstva = PMAP_DIRECT_MAP(dstpa); 3886 3887 memcpy(PAGE_ALIGNED(dstva), PAGE_ALIGNED(srcva), PAGE_SIZE); 3888 #else 3889 #if defined(XENPV) 3890 if (XEN_VERSION_SUPPORTED(3, 4)) { 3891 xen_copy_page(srcpa, dstpa); 3892 return; 3893 } 3894 #endif 3895 struct cpu_info *ci; 3896 pt_entry_t *srcpte, *dstpte; 3897 vaddr_t srcva, dstva; 3898 3899 const pd_entry_t pteflags = PTE_P | PTE_W | pmap_pg_nx | PTE_A; 3900 3901 kpreempt_disable(); 3902 3903 ci = curcpu(); 3904 srcva = ci->vpage[VPAGE_SRC]; 3905 dstva = ci->vpage[VPAGE_DST]; 3906 srcpte = ci->vpage_pte[VPAGE_SRC]; 3907 dstpte = ci->vpage_pte[VPAGE_DST]; 3908 3909 KASSERT(*srcpte == 0 && *dstpte == 0); 3910 3911 pmap_pte_set(srcpte, pmap_pa2pte(srcpa) | pteflags); 3912 pmap_pte_set(dstpte, pmap_pa2pte(dstpa) | pteflags | PTE_D); 3913 pmap_pte_flush(); 3914 pmap_update_pg(srcva); 3915 pmap_update_pg(dstva); 3916 3917 memcpy(PAGE_ALIGNED(dstva), PAGE_ALIGNED(srcva), PAGE_SIZE); 3918 3919 #if defined(DIAGNOSTIC) || defined(XENPV) 3920 pmap_pte_set(srcpte, 0); 3921 pmap_pte_set(dstpte, 0); 3922 pmap_pte_flush(); 3923 #endif 3924 3925 kpreempt_enable(); 3926 #endif /* defined(__HAVE_DIRECT_MAP) */ 3927 } 3928 3929 static pt_entry_t * 3930 pmap_map_ptp(struct vm_page *ptp) 3931 { 3932 #ifdef __HAVE_DIRECT_MAP 3933 return (void *)PMAP_DIRECT_MAP(VM_PAGE_TO_PHYS(ptp)); 3934 #else 3935 struct cpu_info *ci; 3936 pt_entry_t *ptppte; 3937 vaddr_t ptpva; 3938 3939 KASSERT(kpreempt_disabled()); 3940 3941 #ifndef XENPV 3942 const pd_entry_t pteflags = PTE_P | PTE_W | pmap_pg_nx | PTE_A | PTE_D; 3943 #else 3944 const pd_entry_t pteflags = PTE_P | pmap_pg_nx | PTE_A | PTE_D; 3945 #endif 3946 3947 ci = curcpu(); 3948 ptpva = ci->vpage[VPAGE_PTP]; 3949 ptppte = ci->vpage_pte[VPAGE_PTP]; 3950 3951 pmap_pte_set(ptppte, pmap_pa2pte(VM_PAGE_TO_PHYS(ptp)) | pteflags); 3952 3953 pmap_pte_flush(); 3954 pmap_update_pg(ptpva); 3955 3956 return (pt_entry_t *)ptpva; 3957 #endif 3958 } 3959 3960 static void 3961 pmap_unmap_ptp(void) 3962 { 3963 #ifndef __HAVE_DIRECT_MAP 3964 #if defined(DIAGNOSTIC) || defined(XENPV) 3965 struct cpu_info *ci; 3966 pt_entry_t *pte; 3967 3968 KASSERT(kpreempt_disabled()); 3969 3970 ci = curcpu(); 3971 pte = ci->vpage_pte[VPAGE_PTP]; 3972 3973 if 
(*pte != 0) { 3974 pmap_pte_set(pte, 0); 3975 pmap_pte_flush(); 3976 } 3977 #endif 3978 #endif 3979 } 3980 3981 static pt_entry_t * 3982 pmap_map_pte(struct pmap *pmap, struct vm_page *ptp, vaddr_t va) 3983 { 3984 3985 KASSERT(kpreempt_disabled()); 3986 if (pmap_is_curpmap(pmap)) { 3987 return &PTE_BASE[pl1_i(va)]; /* (k)vtopte */ 3988 } 3989 KASSERT(ptp != NULL); 3990 return pmap_map_ptp(ptp) + pl1_pi(va); 3991 } 3992 3993 static void 3994 pmap_unmap_pte(void) 3995 { 3996 3997 KASSERT(kpreempt_disabled()); 3998 3999 pmap_unmap_ptp(); 4000 } 4001 4002 /* 4003 * p m a p r e m o v e f u n c t i o n s 4004 * 4005 * functions that remove mappings 4006 */ 4007 4008 /* 4009 * pmap_remove_ptes: remove PTEs from a PTP 4010 * 4011 * => caller must hold pmap's lock 4012 * => PTP must be mapped into KVA 4013 * => PTP should be null if pmap == pmap_kernel() 4014 * => must be called with kernel preemption disabled 4015 * => returns composite pte if at least one page should be shot down 4016 */ 4017 static void 4018 pmap_remove_ptes(struct pmap *pmap, struct vm_page *ptp, vaddr_t ptpva, 4019 vaddr_t startva, vaddr_t endva) 4020 { 4021 pt_entry_t *pte = (pt_entry_t *)ptpva; 4022 4023 KASSERT(mutex_owned(&pmap->pm_lock)); 4024 KASSERT(kpreempt_disabled()); 4025 4026 /* 4027 * mappings are very often sparse, so clip the given range to the 4028 * range of PTEs that are known present in the PTP. 4029 */ 4030 pmap_ptp_range_clip(ptp, &startva, &pte); 4031 4032 /* 4033 * note that ptpva points to the PTE that maps startva. this may 4034 * or may not be the first PTE in the PTP. 4035 * 4036 * we loop through the PTP while there are still PTEs to look at 4037 * and the wire_count is greater than 1 (because we use the wire_count 4038 * to keep track of the number of real PTEs in the PTP). 4039 */ 4040 while (startva < endva && (ptp == NULL || ptp->wire_count > 1)) { 4041 (void)pmap_remove_pte(pmap, ptp, pte, startva); 4042 startva += PAGE_SIZE; 4043 pte++; 4044 } 4045 } 4046 4047 /* 4048 * pmap_remove_pte: remove a single PTE from a PTP. 4049 * 4050 * => caller must hold pmap's lock 4051 * => PTP must be mapped into KVA 4052 * => PTP should be null if pmap == pmap_kernel() 4053 * => returns true if we removed a mapping 4054 * => must be called with kernel preemption disabled 4055 */ 4056 static bool 4057 pmap_remove_pte(struct pmap *pmap, struct vm_page *ptp, pt_entry_t *pte, 4058 vaddr_t va) 4059 { 4060 struct pv_entry *pve; 4061 struct vm_page *pg; 4062 struct pmap_page *pp; 4063 pt_entry_t opte; 4064 4065 KASSERT(mutex_owned(&pmap->pm_lock)); 4066 KASSERT(kpreempt_disabled()); 4067 4068 if (!pmap_valid_entry(*pte)) { 4069 /* VA not mapped. */ 4070 return false; 4071 } 4072 4073 /* Atomically save the old PTE and zap it. */ 4074 opte = pmap_pte_testset(pte, 0); 4075 if (!pmap_valid_entry(opte)) { 4076 return false; 4077 } 4078 4079 pmap_exec_account(pmap, va, opte, 0); 4080 pmap_stats_update_bypte(pmap, 0, opte); 4081 4082 if (ptp) { 4083 /* 4084 * Dropping a PTE. Make sure that the PDE is flushed. 4085 */ 4086 ptp->wire_count--; 4087 if (ptp->wire_count <= 1) { 4088 opte |= PTE_A; 4089 } 4090 } 4091 4092 if ((opte & PTE_A) != 0) { 4093 pmap_tlb_shootdown(pmap, va, opte, TLBSHOOT_REMOVE_PTE); 4094 } 4095 4096 /* 4097 * If we are not on a pv list - we are done. 
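 * (PTE_PVLIST is set only on mappings of managed or pv-tracked pages,
 * so when it is clear there is no pmap_page state to update; the
 * KASSERTs below merely cross-check that.)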
4098 */ 4099 if ((opte & PTE_PVLIST) == 0) { 4100 #ifndef DOM0OPS 4101 KASSERTMSG((PHYS_TO_VM_PAGE(pmap_pte2pa(opte)) == NULL), 4102 "managed page without PTE_PVLIST for %#"PRIxVADDR, va); 4103 KASSERTMSG((pmap_pv_tracked(pmap_pte2pa(opte)) == NULL), 4104 "pv-tracked page without PTE_PVLIST for %#"PRIxVADDR, va); 4105 #endif 4106 KASSERT(pmap_treelookup_pv(pmap, ptp, (ptp != NULL ? 4107 &VM_PAGE_TO_PP(ptp)->pp_rb : &pmap_kernel_rb), va) == NULL); 4108 return true; 4109 } 4110 4111 if ((pg = PHYS_TO_VM_PAGE(pmap_pte2pa(opte))) != NULL) { 4112 pp = VM_PAGE_TO_PP(pg); 4113 } else if ((pp = pmap_pv_tracked(pmap_pte2pa(opte))) == NULL) { 4114 paddr_t pa = pmap_pte2pa(opte); 4115 panic("%s: PTE_PVLIST with pv-untracked page" 4116 " va = %#"PRIxVADDR"pa = %#"PRIxPADDR" (%#"PRIxPADDR")", 4117 __func__, va, pa, atop(pa)); 4118 } 4119 4120 /* Sync R/M bits. */ 4121 pve = pmap_lookup_pv(pmap, ptp, pp, va); 4122 pmap_remove_pv(pmap, pp, ptp, va, pve, pmap_pte_to_pp_attrs(opte)); 4123 return true; 4124 } 4125 4126 static void 4127 pmap_remove_locked(struct pmap *pmap, vaddr_t sva, vaddr_t eva) 4128 { 4129 pt_entry_t *ptes; 4130 pd_entry_t pde; 4131 pd_entry_t * const *pdes; 4132 bool result; 4133 vaddr_t blkendva, va = sva; 4134 struct vm_page *ptp; 4135 struct pmap *pmap2; 4136 int lvl; 4137 4138 KASSERT(mutex_owned(&pmap->pm_lock)); 4139 4140 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); 4141 4142 /* 4143 * removing one page? take shortcut function. 4144 */ 4145 4146 if (va + PAGE_SIZE == eva) { 4147 if (pmap_pdes_valid(va, pdes, &pde, &lvl)) { 4148 KASSERT(lvl == 1); 4149 4150 /* Get PTP if non-kernel mapping. */ 4151 if (pmap != pmap_kernel()) { 4152 ptp = pmap_find_ptp(pmap, va, 1); 4153 KASSERTMSG(ptp != NULL, 4154 "%s: unmanaged PTP detected", __func__); 4155 } else { 4156 /* Never free kernel PTPs. */ 4157 ptp = NULL; 4158 } 4159 4160 result = pmap_remove_pte(pmap, ptp, 4161 &ptes[pl1_i(va)], va); 4162 4163 /* 4164 * if mapping removed and the PTP is no longer 4165 * being used, free it! 4166 */ 4167 4168 if (result && ptp && ptp->wire_count <= 1) 4169 pmap_free_ptp(pmap, ptp, va, ptes, pdes); 4170 } 4171 } else for (/* null */ ; va < eva ; va = blkendva) { 4172 /* determine range of block */ 4173 blkendva = x86_round_pdr(va+1); 4174 if (blkendva > eva) 4175 blkendva = eva; 4176 4177 if (!pmap_pdes_valid(va, pdes, &pde, &lvl)) { 4178 /* Skip a range corresponding to an invalid pde. */ 4179 blkendva = (va & ptp_frames[lvl - 1]) + nbpd[lvl - 1]; 4180 continue; 4181 } 4182 KASSERT(lvl == 1); 4183 4184 /* Get PTP if non-kernel mapping. */ 4185 if (pmap != pmap_kernel()) { 4186 ptp = pmap_find_ptp(pmap, va, 1); 4187 KASSERTMSG(ptp != NULL, "%s: unmanaged PTP detected", 4188 __func__); 4189 } else { 4190 /* Never free kernel PTPs. */ 4191 ptp = NULL; 4192 } 4193 4194 pmap_remove_ptes(pmap, ptp, (vaddr_t)&ptes[pl1_i(va)], va, 4195 blkendva); 4196 4197 /* If PTP is no longer being used, free it. */ 4198 if (ptp && ptp->wire_count <= 1) { 4199 pmap_free_ptp(pmap, ptp, va, ptes, pdes); 4200 } 4201 } 4202 pmap_unmap_ptes(pmap, pmap2); 4203 pmap_drain_pv(pmap); 4204 } 4205 4206 /* 4207 * pmap_remove: mapping removal function. 
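 * Takes pmap->pm_lock and calls pmap_remove_locked(), unless the pmap
 * has installed its own pm_remove callback (e.g. pmap_remove_gnt() for
 * Xen grant mappings).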
4208 * 4209 * => caller should not be holding any pmap locks 4210 */ 4211 void 4212 pmap_remove(struct pmap *pmap, vaddr_t sva, vaddr_t eva) 4213 { 4214 if (__predict_false(pmap->pm_remove != NULL)) { 4215 (*pmap->pm_remove)(pmap, sva, eva); 4216 return; 4217 } 4218 4219 mutex_enter(&pmap->pm_lock); 4220 pmap_remove_locked(pmap, sva, eva); 4221 mutex_exit(&pmap->pm_lock); 4222 } 4223 4224 /* 4225 * pmap_sync_pv: clear pte bits and return the old value of the pp_attrs. 4226 * 4227 * => The 'clearbits' parameter is either ~0 or PP_ATTRS_... 4228 * => Caller should disable kernel preemption. 4229 * => issues tlb shootdowns if necessary. 4230 */ 4231 static int 4232 pmap_sync_pv(struct pv_pte *pvpte, paddr_t pa, int clearbits, uint8_t *oattrs, 4233 pt_entry_t *optep) 4234 { 4235 struct pmap *pmap; 4236 struct vm_page *ptp; 4237 vaddr_t va; 4238 pt_entry_t *ptep; 4239 pt_entry_t opte; 4240 pt_entry_t npte; 4241 pt_entry_t expect; 4242 bool need_shootdown; 4243 4244 ptp = pvpte->pte_ptp; 4245 va = pvpte->pte_va; 4246 KASSERT(ptp == NULL || ptp->uobject != NULL); 4247 KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset); 4248 pmap = ptp_to_pmap(ptp); 4249 KASSERT(kpreempt_disabled()); 4250 4251 if (__predict_false(pmap->pm_sync_pv != NULL)) { 4252 return (*pmap->pm_sync_pv)(ptp, va, pa, clearbits, oattrs, 4253 optep); 4254 } 4255 4256 expect = pmap_pa2pte(pa) | PTE_P; 4257 4258 if (clearbits != ~0) { 4259 KASSERT((clearbits & ~(PP_ATTRS_D|PP_ATTRS_A|PP_ATTRS_W)) == 0); 4260 clearbits = pmap_pp_attrs_to_pte(clearbits); 4261 } 4262 4263 ptep = pmap_map_pte(pmap, ptp, va); 4264 do { 4265 opte = *ptep; 4266 KASSERT((opte & (PTE_D | PTE_A)) != PTE_D); 4267 KASSERT((opte & (PTE_A | PTE_P)) != PTE_A); 4268 KASSERT(opte == 0 || (opte & PTE_P) != 0); 4269 if ((opte & (PTE_FRAME | PTE_P)) != expect) { 4270 /* 4271 * We lost a race with a V->P operation like 4272 * pmap_remove(). Wait for the competitor 4273 * reflecting pte bits into mp_attrs. 4274 */ 4275 pmap_unmap_pte(); 4276 return EAGAIN; 4277 } 4278 4279 /* 4280 * Check if there's anything to do on this PTE. 4281 */ 4282 if ((opte & clearbits) == 0) { 4283 need_shootdown = false; 4284 break; 4285 } 4286 4287 /* 4288 * We need a shootdown if the PTE is cached (PTE_A) ... 4289 * ... Unless we are clearing only the PTE_W bit and 4290 * it isn't cached as RW (PTE_D). 4291 */ 4292 need_shootdown = (opte & PTE_A) != 0 && 4293 !(clearbits == PTE_W && (opte & PTE_D) == 0); 4294 4295 npte = opte & ~clearbits; 4296 4297 /* 4298 * If we need a shootdown anyway, clear PTE_A and PTE_D. 
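 * Their old values are not lost: once the CAS below succeeds they are
 * folded from opte into *oattrs, which the callers merge into pp_attrs.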
4299 */ 4300 if (need_shootdown) { 4301 npte &= ~(PTE_A | PTE_D); 4302 } 4303 KASSERT((npte & (PTE_D | PTE_A)) != PTE_D); 4304 KASSERT((npte & (PTE_A | PTE_P)) != PTE_A); 4305 KASSERT(npte == 0 || (opte & PTE_P) != 0); 4306 } while (pmap_pte_cas(ptep, opte, npte) != opte); 4307 4308 if (need_shootdown) { 4309 pmap_tlb_shootdown(pmap, va, opte, TLBSHOOT_SYNC_PV); 4310 } 4311 pmap_unmap_pte(); 4312 4313 *oattrs = pmap_pte_to_pp_attrs(opte); 4314 if (optep != NULL) 4315 *optep = opte; 4316 return 0; 4317 } 4318 4319 static void 4320 pmap_pp_remove_ent(struct pmap *pmap, struct vm_page *ptp, pt_entry_t opte, 4321 vaddr_t va) 4322 { 4323 struct pmap *pmap2; 4324 pt_entry_t *ptes; 4325 pd_entry_t * const *pdes; 4326 4327 KASSERT(mutex_owned(&pmap->pm_lock)); 4328 4329 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); 4330 pmap_stats_update_bypte(pmap, 0, opte); 4331 ptp->wire_count--; 4332 if (ptp->wire_count <= 1) { 4333 pmap_free_ptp(pmap, ptp, va, ptes, pdes); 4334 } 4335 pmap_unmap_ptes(pmap, pmap2); 4336 } 4337 4338 static void 4339 pmap_pp_remove(struct pmap_page *pp, paddr_t pa) 4340 { 4341 struct pv_pte *pvpte; 4342 struct vm_page *ptp; 4343 uintptr_t sum; 4344 uint8_t oattrs; 4345 bool locked; 4346 4347 /* 4348 * Do an unlocked check to see if the page has no mappings, eg when 4349 * pmap_remove_all() was called before amap_wipeout() for a process 4350 * private amap - common. The page being removed must be on the way 4351 * out, so we don't have to worry about concurrent attempts to enter 4352 * it (otherwise the caller either doesn't care or has screwed up). 4353 */ 4354 sum = (uintptr_t)atomic_load_relaxed(&pp->pp_pte.pte_va); 4355 sum |= (uintptr_t)atomic_load_relaxed(&pp->pp_pte.pte_ptp); 4356 sum |= (uintptr_t)atomic_load_relaxed(&pp->pp_pvlist.lh_first); 4357 if (sum == 0) { 4358 return; 4359 } 4360 4361 kpreempt_disable(); 4362 for (;;) { 4363 struct pmap *pmap; 4364 struct pv_entry *pve; 4365 pt_entry_t opte; 4366 vaddr_t va; 4367 4368 mutex_spin_enter(&pp->pp_lock); 4369 if ((pvpte = pv_pte_first(pp)) == NULL) { 4370 mutex_spin_exit(&pp->pp_lock); 4371 break; 4372 } 4373 4374 /* 4375 * Add a reference to the pmap before clearing the pte. 4376 * Otherwise the pmap can disappear behind us. 4377 */ 4378 ptp = pvpte->pte_ptp; 4379 pmap = ptp_to_pmap(ptp); 4380 KASSERT(pmap->pm_obj[0].uo_refs > 0); 4381 if (ptp != NULL) { 4382 pmap_reference(pmap); 4383 } 4384 4385 /* 4386 * Now try to lock it. We need a direct handoff between 4387 * pp_lock and pm_lock to know the pv_entry is kept intact 4388 * and kept associated with this pmap. If that can't be 4389 * had, wait for the pmap's lock to become free and then 4390 * retry. 4391 */ 4392 locked = mutex_tryenter(&pmap->pm_lock); 4393 mutex_spin_exit(&pp->pp_lock); 4394 if (!locked) { 4395 mutex_enter(&pmap->pm_lock); 4396 /* nothing, just wait for it */ 4397 mutex_exit(&pmap->pm_lock); 4398 if (ptp != NULL) { 4399 pmap_destroy(pmap); 4400 } 4401 continue; 4402 } 4403 va = pvpte->pte_va; 4404 4405 KASSERTMSG(pmap->pm_stats.resident_count > PDP_SIZE, 4406 "va %lx pmap %p ptp %p is empty", va, pmap, ptp); 4407 KASSERTMSG(ptp == NULL || (ptp->flags & PG_FREE) == 0, 4408 "va %lx pmap %p ptp %p is free", va, pmap, ptp); 4409 KASSERTMSG(ptp == NULL || ptp->wire_count > 1, 4410 "va %lx pmap %p ptp %p is empty", va, pmap, ptp); 4411 4412 #ifdef DEBUG 4413 pmap_check_pv(pmap, ptp, pp, pvpte->pte_va, true); 4414 rb_tree_t *tree = (ptp != NULL ? 
4415 &VM_PAGE_TO_PP(ptp)->pp_rb : &pmap_kernel_rb); 4416 pve = pmap_treelookup_pv(pmap, ptp, tree, va); 4417 if (pve == NULL) { 4418 KASSERTMSG(&pp->pp_pte == pvpte, 4419 "va %lx pmap %p ptp %p pvpte %p pve %p oops 1", 4420 va, pmap, ptp, pvpte, pve); 4421 } else { 4422 KASSERTMSG(&pve->pve_pte == pvpte, 4423 "va %lx pmap %p ptp %p pvpte %p pve %p oops 2", 4424 va, pmap, ptp, pvpte, pve); 4425 } 4426 #endif 4427 4428 if (pmap_sync_pv(pvpte, pa, ~0, &oattrs, &opte)) { 4429 panic("pmap_pp_remove: mapping not present"); 4430 } 4431 4432 pve = pmap_lookup_pv(pmap, ptp, pp, va); 4433 pmap_remove_pv(pmap, pp, ptp, va, pve, oattrs); 4434 4435 /* Update the PTP reference count. Free if last reference. */ 4436 if (ptp != NULL) { 4437 KASSERT(pmap != pmap_kernel()); 4438 pmap_tlb_shootnow(); 4439 if (__predict_false(pmap->pm_pp_remove_ent != NULL)) { 4440 (*pmap->pm_pp_remove_ent)(pmap, ptp, opte, va); 4441 } else { 4442 pmap_pp_remove_ent(pmap, ptp, opte, va); 4443 } 4444 } else { 4445 KASSERT(pmap == pmap_kernel()); 4446 pmap_stats_update_bypte(pmap, 0, opte); 4447 } 4448 pmap_tlb_shootnow(); 4449 pmap_drain_pv(pmap); 4450 mutex_exit(&pmap->pm_lock); 4451 if (ptp != NULL) { 4452 pmap_destroy(pmap); 4453 } 4454 } 4455 kpreempt_enable(); 4456 } 4457 4458 /* 4459 * pmap_page_remove: remove a managed vm_page from all pmaps that map it 4460 * 4461 * => R/M bits are sync'd back to attrs 4462 */ 4463 void 4464 pmap_page_remove(struct vm_page *pg) 4465 { 4466 struct pmap_page *pp; 4467 paddr_t pa; 4468 4469 pp = VM_PAGE_TO_PP(pg); 4470 pa = VM_PAGE_TO_PHYS(pg); 4471 pmap_pp_remove(pp, pa); 4472 } 4473 4474 /* 4475 * pmap_pv_remove: remove an unmanaged pv-tracked page from all pmaps 4476 * that map it 4477 */ 4478 void 4479 pmap_pv_remove(paddr_t pa) 4480 { 4481 struct pmap_page *pp; 4482 4483 pp = pmap_pv_tracked(pa); 4484 if (pp == NULL) 4485 panic("%s: page not pv-tracked: %#"PRIxPADDR, __func__, pa); 4486 pmap_pp_remove(pp, pa); 4487 } 4488 4489 /* 4490 * p m a p a t t r i b u t e f u n c t i o n s 4491 * functions that test/change managed page's attributes 4492 * since a page can be mapped multiple times we must check each PTE that 4493 * maps it by going down the pv lists. 4494 */ 4495 4496 /* 4497 * pmap_test_attrs: test a page's attributes 4498 */ 4499 bool 4500 pmap_test_attrs(struct vm_page *pg, unsigned testbits) 4501 { 4502 struct pmap_page *pp; 4503 struct pv_pte *pvpte; 4504 struct pmap *pmap; 4505 uint8_t oattrs; 4506 u_int result; 4507 paddr_t pa; 4508 4509 pp = VM_PAGE_TO_PP(pg); 4510 if ((pp->pp_attrs & testbits) != 0) { 4511 return true; 4512 } 4513 pa = VM_PAGE_TO_PHYS(pg); 4514 startover: 4515 mutex_spin_enter(&pp->pp_lock); 4516 for (pvpte = pv_pte_first(pp); pvpte; pvpte = pv_pte_next(pp, pvpte)) { 4517 if ((pp->pp_attrs & testbits) != 0) { 4518 break; 4519 } 4520 if (pmap_sync_pv(pvpte, pa, 0, &oattrs, NULL)) { 4521 /* 4522 * raced with a V->P operation. wait for the other 4523 * side to finish by acquiring pmap's lock. if no 4524 * wait, updates to pp_attrs by the other side may 4525 * go unseen. 4526 */ 4527 pmap = ptp_to_pmap(pvpte->pte_ptp); 4528 pmap_reference(pmap); 4529 mutex_spin_exit(&pp->pp_lock); 4530 mutex_enter(&pmap->pm_lock); 4531 /* nothing. */ 4532 mutex_exit(&pmap->pm_lock); 4533 pmap_destroy(pmap); 4534 goto startover; 4535 } 4536 pp->pp_attrs |= oattrs; 4537 } 4538 result = pp->pp_attrs & testbits; 4539 mutex_spin_exit(&pp->pp_lock); 4540 4541 /* 4542 * note that we will exit the for loop with a non-null pve if 4543 * we have found the bits we are testing for. 
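 * (The loop cursor in the current code is named pvpte, not pve.)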
4544 */ 4545 4546 return result != 0; 4547 } 4548 4549 static bool 4550 pmap_pp_clear_attrs(struct pmap_page *pp, paddr_t pa, unsigned clearbits) 4551 { 4552 struct pv_pte *pvpte; 4553 struct pmap *pmap; 4554 uint8_t oattrs; 4555 u_int result; 4556 4557 startover: 4558 mutex_spin_enter(&pp->pp_lock); 4559 for (pvpte = pv_pte_first(pp); pvpte; pvpte = pv_pte_next(pp, pvpte)) { 4560 if (pmap_sync_pv(pvpte, pa, clearbits, &oattrs, NULL)) { 4561 /* 4562 * raced with a V->P operation. wait for the other 4563 * side to finish by acquiring pmap's lock. it is 4564 * probably unmapping the page, and it will be gone 4565 * when the loop is restarted. 4566 */ 4567 pmap = ptp_to_pmap(pvpte->pte_ptp); 4568 pmap_reference(pmap); 4569 mutex_spin_exit(&pp->pp_lock); 4570 mutex_enter(&pmap->pm_lock); 4571 /* nothing. */ 4572 mutex_exit(&pmap->pm_lock); 4573 pmap_destroy(pmap); 4574 goto startover; 4575 } 4576 pp->pp_attrs |= oattrs; 4577 } 4578 result = pp->pp_attrs & clearbits; 4579 pp->pp_attrs &= ~clearbits; 4580 pmap_tlb_shootnow(); 4581 mutex_spin_exit(&pp->pp_lock); 4582 4583 return result != 0; 4584 } 4585 4586 /* 4587 * pmap_clear_attrs: clear the specified attribute for a page. 4588 * 4589 * => we return true if we cleared one of the bits we were asked to 4590 */ 4591 bool 4592 pmap_clear_attrs(struct vm_page *pg, unsigned clearbits) 4593 { 4594 struct pmap_page *pp; 4595 paddr_t pa; 4596 4597 pp = VM_PAGE_TO_PP(pg); 4598 pa = VM_PAGE_TO_PHYS(pg); 4599 4600 /* 4601 * If this is a new page, assert it has no mappings and simply zap 4602 * the stored attributes without taking any locks. 4603 */ 4604 if ((pg->flags & PG_FAKE) != 0) { 4605 KASSERT(atomic_load_relaxed(&pp->pp_pte.pte_va) == 0); 4606 KASSERT(atomic_load_relaxed(&pp->pp_pte.pte_ptp) == NULL); 4607 KASSERT(atomic_load_relaxed(&pp->pp_pvlist.lh_first) == NULL); 4608 atomic_store_relaxed(&pp->pp_attrs, 0); 4609 return false; 4610 } else { 4611 return pmap_pp_clear_attrs(pp, pa, clearbits); 4612 } 4613 } 4614 4615 /* 4616 * pmap_pv_clear_attrs: clear the specified attributes for an unmanaged 4617 * pv-tracked page. 4618 */ 4619 bool 4620 pmap_pv_clear_attrs(paddr_t pa, unsigned clearbits) 4621 { 4622 struct pmap_page *pp; 4623 4624 pp = pmap_pv_tracked(pa); 4625 if (pp == NULL) 4626 panic("%s: page not pv-tracked: %#"PRIxPADDR, __func__, pa); 4627 4628 return pmap_pp_clear_attrs(pp, pa, clearbits); 4629 } 4630 4631 /* 4632 * p m a p p r o t e c t i o n f u n c t i o n s 4633 */ 4634 4635 /* 4636 * pmap_page_protect: change the protection of all recorded mappings 4637 * of a managed page 4638 * 4639 * => NOTE: this is an inline function in pmap.h 4640 */ 4641 4642 /* see pmap.h */ 4643 4644 /* 4645 * pmap_pv_protect: change the protection of all recorded mappings 4646 * of an unmanaged pv-tracked page 4647 * 4648 * => NOTE: this is an inline function in pmap.h 4649 */ 4650 4651 /* see pmap.h */ 4652 4653 /* 4654 * pmap_protect: set the protection in of the pages in a pmap 4655 * 4656 * => NOTE: this is an inline function in pmap.h 4657 */ 4658 4659 /* see pmap.h */ 4660 4661 /* 4662 * pmap_write_protect: write-protect pages in a pmap. 4663 * 4664 * Note for Xen-amd64. Xen automatically adds PTE_U to the kernel pages, but we 4665 * don't need to remove this bit when re-entering the PTEs here: Xen tracks the 4666 * kernel pages with a reserved bit (_PAGE_GUEST_KERNEL), so even if PTE_U is 4667 * present the page will still be considered as a kernel page, and the privilege 4668 * separation will be enforced correctly. 
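 *
 * Illustrative use (not taken from this file): to make a range
 * read/execute-only while leaving it mapped, a caller could do
 *
 *	pmap_write_protect(pmap_kernel(), sva, eva,
 *	    VM_PROT_READ | VM_PROT_EXECUTE);
 *
 * which strips PTE_W from every valid PTE in the range and leaves the
 * NX bit untouched.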
4669 */ 4670 void 4671 pmap_write_protect(struct pmap *pmap, vaddr_t sva, vaddr_t eva, vm_prot_t prot) 4672 { 4673 pt_entry_t bit_rem, bit_put; 4674 pt_entry_t *ptes; 4675 pt_entry_t * const *pdes; 4676 struct pmap *pmap2; 4677 vaddr_t blockend, va; 4678 int lvl, i; 4679 4680 if (__predict_false(pmap->pm_write_protect != NULL)) { 4681 (*pmap->pm_write_protect)(pmap, sva, eva, prot); 4682 return; 4683 } 4684 4685 bit_rem = 0; 4686 if (!(prot & VM_PROT_WRITE)) 4687 bit_rem = PTE_W; 4688 4689 bit_put = 0; 4690 if (!(prot & VM_PROT_EXECUTE)) 4691 bit_put = pmap_pg_nx; 4692 4693 sva &= ~PAGE_MASK; 4694 eva &= ~PAGE_MASK; 4695 4696 /* 4697 * Acquire pmap. No need to lock the kernel pmap as we won't 4698 * be touching PV entries nor stats and kernel PDEs aren't 4699 * freed. 4700 */ 4701 if (pmap != pmap_kernel()) { 4702 mutex_enter(&pmap->pm_lock); 4703 } 4704 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); 4705 4706 for (va = sva ; va < eva; va = blockend) { 4707 pt_entry_t *spte, *epte; 4708 4709 blockend = x86_round_pdr(va + 1); 4710 if (blockend > eva) 4711 blockend = eva; 4712 4713 /* Is it a valid block? */ 4714 if (!pmap_pdes_valid(va, pdes, NULL, &lvl)) { 4715 continue; 4716 } 4717 KASSERT(va < VM_MAXUSER_ADDRESS || va >= VM_MAX_ADDRESS); 4718 KASSERT(lvl == 1); 4719 4720 spte = &ptes[pl1_i(va)]; 4721 epte = &ptes[pl1_i(blockend)]; 4722 4723 for (i = 0; spte < epte; spte++, i++) { 4724 pt_entry_t opte, npte; 4725 4726 do { 4727 opte = *spte; 4728 if (!pmap_valid_entry(opte)) { 4729 goto next; 4730 } 4731 npte = (opte & ~bit_rem) | bit_put; 4732 } while (pmap_pte_cas(spte, opte, npte) != opte); 4733 4734 if ((opte & PTE_D) != 0) { 4735 vaddr_t tva = va + x86_ptob(i); 4736 pmap_tlb_shootdown(pmap, tva, opte, 4737 TLBSHOOT_WRITE_PROTECT); 4738 } 4739 next:; 4740 } 4741 } 4742 4743 /* Release pmap. */ 4744 pmap_unmap_ptes(pmap, pmap2); 4745 if (pmap != pmap_kernel()) { 4746 mutex_exit(&pmap->pm_lock); 4747 } 4748 } 4749 4750 /* 4751 * pmap_unwire: clear the wired bit in the PTE. 4752 * 4753 * => Mapping should already be present. 4754 */ 4755 void 4756 pmap_unwire(struct pmap *pmap, vaddr_t va) 4757 { 4758 pt_entry_t *ptes, *ptep, opte; 4759 pd_entry_t * const *pdes; 4760 struct pmap *pmap2; 4761 int lvl; 4762 4763 if (__predict_false(pmap->pm_unwire != NULL)) { 4764 (*pmap->pm_unwire)(pmap, va); 4765 return; 4766 } 4767 4768 /* 4769 * Acquire pmap. Need to lock the kernel pmap only to protect the 4770 * statistics. 4771 */ 4772 mutex_enter(&pmap->pm_lock); 4773 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); 4774 4775 if (!pmap_pdes_valid(va, pdes, NULL, &lvl)) { 4776 panic("%s: invalid PDE va=%#" PRIxVADDR, __func__, va); 4777 } 4778 KASSERT(lvl == 1); 4779 4780 ptep = &ptes[pl1_i(va)]; 4781 opte = *ptep; 4782 KASSERT(pmap_valid_entry(opte)); 4783 4784 if (opte & PTE_WIRED) { 4785 pt_entry_t npte = opte & ~PTE_WIRED; 4786 4787 opte = pmap_pte_testset(ptep, npte); 4788 pmap_stats_update_bypte(pmap, npte, opte); 4789 } else { 4790 printf("%s: wiring for pmap %p va %#" PRIxVADDR 4791 " did not change!\n", __func__, pmap, va); 4792 } 4793 4794 /* Release pmap. 
*/ 4795 pmap_unmap_ptes(pmap, pmap2); 4796 mutex_exit(&pmap->pm_lock); 4797 } 4798 4799 /* 4800 * pmap_copy: copy mappings from one pmap to another 4801 * 4802 * => optional function 4803 * void pmap_copy(dst_pmap, src_pmap, dst_addr, len, src_addr) 4804 */ 4805 4806 /* 4807 * defined as macro in pmap.h 4808 */ 4809 4810 __strict_weak_alias(pmap_enter, pmap_enter_default); 4811 4812 int 4813 pmap_enter_default(pmap_t pmap, vaddr_t va, paddr_t pa, vm_prot_t prot, 4814 u_int flags) 4815 { 4816 if (__predict_false(pmap->pm_enter != NULL)) { 4817 return (*pmap->pm_enter)(pmap, va, pa, prot, flags); 4818 } 4819 4820 return pmap_enter_ma(pmap, va, pa, pa, prot, flags, 0); 4821 } 4822 4823 /* 4824 * pmap_enter: enter a mapping into a pmap 4825 * 4826 * => must be done "now" ... no lazy-evaluation 4827 */ 4828 int 4829 pmap_enter_ma(struct pmap *pmap, vaddr_t va, paddr_t ma, paddr_t pa, 4830 vm_prot_t prot, u_int flags, int domid) 4831 { 4832 pt_entry_t *ptes, opte, npte; 4833 pt_entry_t *ptep; 4834 pd_entry_t * const *pdes; 4835 struct vm_page *ptp; 4836 struct vm_page *new_pg, *old_pg; 4837 struct pmap_page *new_pp, *old_pp; 4838 struct pv_entry *old_pve, *new_pve; 4839 bool wired = (flags & PMAP_WIRED) != 0; 4840 struct pmap *pmap2; 4841 struct pmap_ptparray pt; 4842 int error; 4843 bool getptp, samepage, new_embedded; 4844 rb_tree_t *tree; 4845 4846 KASSERT(pmap_initialized); 4847 KASSERT(va < VM_MAX_KERNEL_ADDRESS); 4848 KASSERTMSG(va != (vaddr_t)PDP_BASE, "%s: trying to map va=%#" 4849 PRIxVADDR " over PDP!", __func__, va); 4850 KASSERTMSG(va < VM_MIN_KERNEL_ADDRESS || 4851 pmap_valid_entry(pmap->pm_pdir[pl_i(va, PTP_LEVELS)]), 4852 "%s: missing kernel PTP for va=%#" PRIxVADDR, __func__, va); 4853 4854 #ifdef XENPV 4855 KASSERT(domid == DOMID_SELF || pa == 0); 4856 #endif 4857 4858 npte = ma | protection_codes[prot] | PTE_P; 4859 npte |= pmap_pat_flags(flags); 4860 if (wired) 4861 npte |= PTE_WIRED; 4862 if (va < VM_MAXUSER_ADDRESS) 4863 npte |= PTE_U; 4864 4865 if (pmap == pmap_kernel()) 4866 npte |= pmap_pg_g; 4867 if (flags & VM_PROT_ALL) { 4868 npte |= PTE_A; 4869 if (flags & VM_PROT_WRITE) { 4870 KASSERT((npte & PTE_W) != 0); 4871 npte |= PTE_D; 4872 } 4873 } 4874 4875 #ifdef XENPV 4876 if (domid != DOMID_SELF) 4877 new_pg = NULL; 4878 else 4879 #endif 4880 new_pg = PHYS_TO_VM_PAGE(pa); 4881 4882 if (new_pg != NULL) { 4883 /* This is a managed page */ 4884 npte |= PTE_PVLIST; 4885 new_pp = VM_PAGE_TO_PP(new_pg); 4886 PMAP_CHECK_PP(new_pp); 4887 } else if ((new_pp = pmap_pv_tracked(pa)) != NULL) { 4888 /* This is an unmanaged pv-tracked page */ 4889 npte |= PTE_PVLIST; 4890 PMAP_CHECK_PP(new_pp); 4891 } else { 4892 new_pp = NULL; 4893 } 4894 4895 /* Begin by locking the pmap. */ 4896 mutex_enter(&pmap->pm_lock); 4897 4898 /* Look up the PTP. Allocate if none present. */ 4899 ptp = NULL; 4900 getptp = false; 4901 if (pmap != pmap_kernel()) { 4902 ptp = pmap_find_ptp(pmap, va, 1); 4903 if (ptp == NULL) { 4904 getptp = true; 4905 error = pmap_get_ptp(pmap, &pt, va, flags, &ptp); 4906 if (error != 0) { 4907 if (flags & PMAP_CANFAIL) { 4908 mutex_exit(&pmap->pm_lock); 4909 return error; 4910 } 4911 panic("%s: get ptp failed, error=%d", __func__, 4912 error); 4913 } 4914 } 4915 tree = &VM_PAGE_TO_PP(ptp)->pp_rb; 4916 } else { 4917 /* Embedded PV entries rely on this. */ 4918 KASSERT(va != 0); 4919 tree = &pmap_kernel_rb; 4920 } 4921 4922 /* 4923 * Look up the old PV entry at this VA (if any), and insert a new PV 4924 * entry if required for the new mapping. 
Temporarily track the old 4925 * and new mappings concurrently. Only after the old mapping is 4926 * evicted from the pmap will we remove its PV entry. Otherwise, 4927 * our picture of modified/accessed state for either page could get 4928 * out of sync (we need any P->V operation for either page to stall 4929 * on pmap->pm_lock until done here). 4930 */ 4931 new_pve = NULL; 4932 old_pve = NULL; 4933 samepage = false; 4934 new_embedded = false; 4935 4936 if (new_pp != NULL) { 4937 error = pmap_enter_pv(pmap, new_pp, ptp, va, &new_pve, 4938 &old_pve, &samepage, &new_embedded, tree); 4939 4940 /* 4941 * If a new pv_entry was needed and none was available, we 4942 * can go no further. 4943 */ 4944 if (error != 0) { 4945 if (flags & PMAP_CANFAIL) { 4946 if (getptp) { 4947 pmap_unget_ptp(pmap, &pt); 4948 } 4949 mutex_exit(&pmap->pm_lock); 4950 return error; 4951 } 4952 panic("%s: alloc pve failed", __func__); 4953 } 4954 } else { 4955 old_pve = pmap_treelookup_pv(pmap, ptp, tree, va); 4956 } 4957 4958 /* Map PTEs into address space. */ 4959 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); 4960 4961 /* Install any newly allocated PTPs. */ 4962 if (getptp) { 4963 pmap_install_ptp(pmap, &pt, va, pdes); 4964 } 4965 4966 /* Check if there is an existing mapping. */ 4967 ptep = &ptes[pl1_i(va)]; 4968 opte = *ptep; 4969 bool have_oldpa = pmap_valid_entry(opte); 4970 paddr_t oldpa = pmap_pte2pa(opte); 4971 4972 /* 4973 * Update the pte. 4974 */ 4975 do { 4976 opte = *ptep; 4977 4978 /* 4979 * if the same page, inherit PTE_A and PTE_D. 4980 */ 4981 if (((opte ^ npte) & (PTE_FRAME | PTE_P)) == 0) { 4982 npte |= opte & (PTE_A | PTE_D); 4983 } 4984 #if defined(XENPV) 4985 if (domid != DOMID_SELF) { 4986 /* pmap_pte_cas with error handling */ 4987 int s = splvm(); 4988 if (opte != *ptep) { 4989 splx(s); 4990 continue; 4991 } 4992 error = xpq_update_foreign( 4993 vtomach((vaddr_t)ptep), npte, domid, flags); 4994 splx(s); 4995 if (error) { 4996 /* Undo pv_entry tracking - oof. */ 4997 if (new_pp != NULL) { 4998 mutex_spin_enter(&new_pp->pp_lock); 4999 if (new_pve != NULL) { 5000 LIST_REMOVE(new_pve, pve_list); 5001 KASSERT(pmap->pm_pve == NULL); 5002 pmap->pm_pve = new_pve; 5003 } else if (new_embedded) { 5004 new_pp->pp_pte.pte_ptp = NULL; 5005 new_pp->pp_pte.pte_va = 0; 5006 } 5007 mutex_spin_exit(&new_pp->pp_lock); 5008 } 5009 pmap_unmap_ptes(pmap, pmap2); 5010 /* Free new PTP. */ 5011 if (ptp != NULL && ptp->wire_count <= 1) { 5012 pmap_free_ptp(pmap, ptp, va, ptes, 5013 pdes); 5014 } 5015 mutex_exit(&pmap->pm_lock); 5016 return error; 5017 } 5018 break; 5019 } 5020 #endif /* defined(XENPV) */ 5021 } while (pmap_pte_cas(ptep, opte, npte) != opte); 5022 5023 /* 5024 * Done with the PTEs: they can now be unmapped. 5025 */ 5026 pmap_unmap_ptes(pmap, pmap2); 5027 5028 /* 5029 * Update statistics and PTP's reference count. 5030 */ 5031 pmap_stats_update_bypte(pmap, npte, opte); 5032 if (ptp != NULL) { 5033 if (!have_oldpa) { 5034 ptp->wire_count++; 5035 } 5036 /* Remember minimum VA in PTP. */ 5037 pmap_ptp_range_set(ptp, va); 5038 } 5039 KASSERT(ptp == NULL || ptp->wire_count > 1); 5040 5041 /* 5042 * If the same page, we can skip pv_entry handling. 
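 * "Same page" means neither the frame nor the present bit changed,
 * i.e. ((opte ^ npte) & (PTE_FRAME | PTE_P)) == 0, the same test used
 * when inheriting PTE_A/PTE_D above.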
5043 */ 5044 if (((opte ^ npte) & (PTE_FRAME | PTE_P)) == 0) { 5045 KASSERT(((opte ^ npte) & PTE_PVLIST) == 0); 5046 if ((npte & PTE_PVLIST) != 0) { 5047 KASSERT(samepage); 5048 pmap_check_pv(pmap, ptp, new_pp, va, true); 5049 } 5050 goto same_pa; 5051 } else if ((npte & PTE_PVLIST) != 0) { 5052 KASSERT(!samepage); 5053 } 5054 5055 /* 5056 * If old page is pv-tracked, remove pv_entry from its list. 5057 */ 5058 if ((~opte & (PTE_P | PTE_PVLIST)) == 0) { 5059 if ((old_pg = PHYS_TO_VM_PAGE(oldpa)) != NULL) { 5060 old_pp = VM_PAGE_TO_PP(old_pg); 5061 } else if ((old_pp = pmap_pv_tracked(oldpa)) == NULL) { 5062 panic("%s: PTE_PVLIST with pv-untracked page" 5063 " va = %#"PRIxVADDR 5064 " pa = %#" PRIxPADDR " (%#" PRIxPADDR ")", 5065 __func__, va, oldpa, atop(pa)); 5066 } 5067 5068 pmap_remove_pv(pmap, old_pp, ptp, va, old_pve, 5069 pmap_pte_to_pp_attrs(opte)); 5070 } else { 5071 KASSERT(old_pve == NULL); 5072 KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == NULL); 5073 } 5074 5075 /* 5076 * If new page is dynamically PV tracked, insert to tree. 5077 */ 5078 if (new_pve != NULL) { 5079 KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == NULL); 5080 old_pve = rb_tree_insert_node(tree, new_pve); 5081 KASSERT(old_pve == new_pve); 5082 pmap_check_pv(pmap, ptp, new_pp, va, true); 5083 } 5084 5085 same_pa: 5086 /* 5087 * shootdown tlb if necessary. 5088 */ 5089 5090 if ((~opte & (PTE_P | PTE_A)) == 0 && 5091 ((opte ^ npte) & (PTE_FRAME | PTE_W)) != 0) { 5092 pmap_tlb_shootdown(pmap, va, opte, TLBSHOOT_ENTER); 5093 } 5094 pmap_drain_pv(pmap); 5095 mutex_exit(&pmap->pm_lock); 5096 return 0; 5097 } 5098 5099 #if defined(XEN) && defined(DOM0OPS) 5100 5101 struct pmap_data_gnt { 5102 SLIST_ENTRY(pmap_data_gnt) pd_gnt_list; 5103 vaddr_t pd_gnt_sva; 5104 vaddr_t pd_gnt_eva; /* range covered by this gnt */ 5105 int pd_gnt_refs; /* ref counter */ 5106 struct gnttab_map_grant_ref pd_gnt_ops[1]; /* variable length */ 5107 }; 5108 SLIST_HEAD(pmap_data_gnt_head, pmap_data_gnt); 5109 5110 static void pmap_remove_gnt(struct pmap *, vaddr_t, vaddr_t); 5111 5112 static struct pmap_data_gnt * 5113 pmap_find_gnt(struct pmap *pmap, vaddr_t sva, vaddr_t eva) 5114 { 5115 struct pmap_data_gnt_head *headp; 5116 struct pmap_data_gnt *pgnt; 5117 5118 KASSERT(mutex_owned(&pmap->pm_lock)); 5119 headp = pmap->pm_data; 5120 KASSERT(headp != NULL); 5121 SLIST_FOREACH(pgnt, headp, pd_gnt_list) { 5122 if (pgnt->pd_gnt_sva <= sva && eva <= pgnt->pd_gnt_eva) 5123 return pgnt; 5124 /* check that we're not overlapping part of a region */ 5125 KASSERT(pgnt->pd_gnt_sva >= eva || pgnt->pd_gnt_eva <= sva); 5126 } 5127 return NULL; 5128 } 5129 5130 static void 5131 pmap_alloc_gnt(struct pmap *pmap, vaddr_t sva, int nentries, 5132 const struct gnttab_map_grant_ref *ops) 5133 { 5134 struct pmap_data_gnt_head *headp; 5135 struct pmap_data_gnt *pgnt; 5136 vaddr_t eva = sva + nentries * PAGE_SIZE; 5137 KASSERT(mutex_owned(&pmap->pm_lock)); 5138 KASSERT(nentries >= 1); 5139 if (pmap->pm_remove == NULL) { 5140 pmap->pm_remove = pmap_remove_gnt; 5141 KASSERT(pmap->pm_data == NULL); 5142 headp = kmem_alloc(sizeof(*headp), KM_SLEEP); 5143 SLIST_INIT(headp); 5144 pmap->pm_data = headp; 5145 } else { 5146 KASSERT(pmap->pm_remove == pmap_remove_gnt); 5147 KASSERT(pmap->pm_data != NULL); 5148 headp = pmap->pm_data; 5149 } 5150 5151 pgnt = pmap_find_gnt(pmap, sva, eva); 5152 if (pgnt != NULL) { 5153 KASSERT(pgnt->pd_gnt_sva == sva); 5154 KASSERT(pgnt->pd_gnt_eva == eva); 5155 return; 5156 } 5157 5158 /* new entry */ 5159 pgnt = kmem_alloc(sizeof(*pgnt) + 
5160 (nentries - 1) * sizeof(struct gnttab_map_grant_ref), KM_SLEEP); 5161 pgnt->pd_gnt_sva = sva; 5162 pgnt->pd_gnt_eva = eva; 5163 pgnt->pd_gnt_refs = 0; 5164 memcpy(pgnt->pd_gnt_ops, ops, 5165 sizeof(struct gnttab_map_grant_ref) * nentries); 5166 SLIST_INSERT_HEAD(headp, pgnt, pd_gnt_list); 5167 } 5168 5169 static void 5170 pmap_free_gnt(struct pmap *pmap, struct pmap_data_gnt *pgnt) 5171 { 5172 struct pmap_data_gnt_head *headp = pmap->pm_data; 5173 int nentries = (pgnt->pd_gnt_eva - pgnt->pd_gnt_sva) / PAGE_SIZE; 5174 KASSERT(nentries >= 1); 5175 KASSERT(mutex_owned(&pmap->pm_lock)); 5176 KASSERT(pgnt->pd_gnt_refs == 0); 5177 SLIST_REMOVE(headp, pgnt, pmap_data_gnt, pd_gnt_list); 5178 kmem_free(pgnt, sizeof(*pgnt) + 5179 (nentries - 1) * sizeof(struct gnttab_map_grant_ref)); 5180 if (SLIST_EMPTY(headp)) { 5181 kmem_free(headp, sizeof(*headp)); 5182 pmap->pm_data = NULL; 5183 pmap->pm_remove = NULL; 5184 } 5185 } 5186 5187 /* 5188 * pmap_enter_gnt: enter a grant entry into a pmap 5189 * 5190 * => must be done "now" ... no lazy-evaluation 5191 */ 5192 int 5193 pmap_enter_gnt(struct pmap *pmap, vaddr_t va, vaddr_t sva, int nentries, 5194 const struct gnttab_map_grant_ref *oops) 5195 { 5196 struct pmap_data_gnt *pgnt; 5197 pt_entry_t *ptes, opte; 5198 pt_entry_t *ptep; 5199 pd_entry_t * const *pdes; 5200 struct vm_page *ptp; 5201 struct vm_page *old_pg; 5202 struct pmap_page *old_pp; 5203 struct pv_entry *old_pve; 5204 struct pmap *pmap2; 5205 struct pmap_ptparray pt; 5206 int error; 5207 bool getptp; 5208 rb_tree_t *tree; 5209 struct gnttab_map_grant_ref *op; 5210 int ret; 5211 int idx; 5212 5213 KASSERT(pmap_initialized); 5214 KASSERT(va < VM_MAX_KERNEL_ADDRESS); 5215 KASSERTMSG(va != (vaddr_t)PDP_BASE, "%s: trying to map va=%#" 5216 PRIxVADDR " over PDP!", __func__, va); 5217 KASSERT(pmap != pmap_kernel()); 5218 5219 /* Begin by locking the pmap. */ 5220 mutex_enter(&pmap->pm_lock); 5221 pmap_alloc_gnt(pmap, sva, nentries, oops); 5222 5223 pgnt = pmap_find_gnt(pmap, va, va + PAGE_SIZE); 5224 KASSERT(pgnt != NULL); 5225 5226 /* Look up the PTP. Allocate if none present. */ 5227 ptp = NULL; 5228 getptp = false; 5229 ptp = pmap_find_ptp(pmap, va, 1); 5230 if (ptp == NULL) { 5231 getptp = true; 5232 error = pmap_get_ptp(pmap, &pt, va, PMAP_CANFAIL, &ptp); 5233 if (error != 0) { 5234 mutex_exit(&pmap->pm_lock); 5235 return error; 5236 } 5237 } 5238 tree = &VM_PAGE_TO_PP(ptp)->pp_rb; 5239 5240 /* 5241 * Look up the old PV entry at this VA (if any), and insert a new PV 5242 * entry if required for the new mapping. Temporarily track the old 5243 * and new mappings concurrently. Only after the old mapping is 5244 * evicted from the pmap will we remove its PV entry. Otherwise, 5245 * our picture of modified/accessed state for either page could get 5246 * out of sync (we need any P->V operation for either page to stall 5247 * on pmap->pm_lock until done here). 5248 */ 5249 old_pve = NULL; 5250 5251 old_pve = pmap_treelookup_pv(pmap, ptp, tree, va); 5252 5253 /* Map PTEs into address space. */ 5254 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); 5255 5256 /* Install any newly allocated PTPs. */ 5257 if (getptp) { 5258 pmap_install_ptp(pmap, &pt, va, pdes); 5259 } 5260 5261 /* Check if there is an existing mapping. */ 5262 ptep = &ptes[pl1_i(va)]; 5263 opte = *ptep; 5264 bool have_oldpa = pmap_valid_entry(opte); 5265 paddr_t oldpa = pmap_pte2pa(opte); 5266 5267 /* 5268 * Update the pte. 
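 * For a grant mapping the PTE is not written locally: the saved
 * GNTTABOP_map_grant_ref op for this page is handed to the hypervisor,
 * which installs the entry at op->host_addr on our behalf.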
5269 */ 5270 5271 idx = (va - pgnt->pd_gnt_sva) / PAGE_SIZE; 5272 op = &pgnt->pd_gnt_ops[idx]; 5273 5274 #ifdef XENPV /* XXX */ 5275 op->host_addr = xpmap_ptetomach(ptep); 5276 #endif 5277 op->dev_bus_addr = 0; 5278 op->status = GNTST_general_error; 5279 ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, op, 1); 5280 if (__predict_false(ret)) { 5281 printf("%s: GNTTABOP_map_grant_ref failed: %d\n", 5282 __func__, ret); 5283 op->status = GNTST_general_error; 5284 } 5285 for (int d = 0; d < 256 && op->status == GNTST_eagain; d++) { 5286 kpause("gntmap", false, mstohz(1), NULL); 5287 ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, op, 1); 5288 if (__predict_false(ret)) { 5289 printf("%s: GNTTABOP_map_grant_ref failed: %d\n", 5290 __func__, ret); 5291 op->status = GNTST_general_error; 5292 } 5293 } 5294 if (__predict_false(op->status != GNTST_okay)) { 5295 printf("%s: GNTTABOP_map_grant_ref status: %d\n", 5296 __func__, op->status); 5297 if (have_oldpa) { 5298 ptp->wire_count--; 5299 } 5300 } else { 5301 pgnt->pd_gnt_refs++; 5302 if (!have_oldpa) { 5303 ptp->wire_count++; 5304 } 5305 KASSERT(ptp->wire_count > 1); 5306 /* Remember minimum VA in PTP. */ 5307 pmap_ptp_range_set(ptp, va); 5308 } 5309 if (ptp->wire_count <= 1) 5310 pmap_free_ptp(pmap, ptp, va, ptes, pdes); 5311 5312 /* 5313 * Done with the PTEs: they can now be unmapped. 5314 */ 5315 pmap_unmap_ptes(pmap, pmap2); 5316 5317 /* 5318 * Update statistics and PTP's reference count. 5319 */ 5320 pmap_stats_update_bypte(pmap, 0, opte); 5321 5322 /* 5323 * If old page is pv-tracked, remove pv_entry from its list. 5324 */ 5325 if ((~opte & (PTE_P | PTE_PVLIST)) == 0) { 5326 if ((old_pg = PHYS_TO_VM_PAGE(oldpa)) != NULL) { 5327 old_pp = VM_PAGE_TO_PP(old_pg); 5328 } else if ((old_pp = pmap_pv_tracked(oldpa)) == NULL) { 5329 panic("%s: PTE_PVLIST with pv-untracked page" 5330 " va = %#"PRIxVADDR " pa = %#" PRIxPADDR, 5331 __func__, va, oldpa); 5332 } 5333 5334 pmap_remove_pv(pmap, old_pp, ptp, va, old_pve, 5335 pmap_pte_to_pp_attrs(opte)); 5336 } else { 5337 KASSERT(old_pve == NULL); 5338 KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == NULL); 5339 } 5340 5341 pmap_drain_pv(pmap); 5342 mutex_exit(&pmap->pm_lock); 5343 return op->status; 5344 } 5345 5346 /* 5347 * pmap_remove_gnt: grant mapping removal function. 5348 * 5349 * => caller should not be holding any pmap locks 5350 */ 5351 static void 5352 pmap_remove_gnt(struct pmap *pmap, vaddr_t sva, vaddr_t eva) 5353 { 5354 struct pmap_data_gnt *pgnt; 5355 pt_entry_t *ptes; 5356 pd_entry_t pde; 5357 pd_entry_t * const *pdes; 5358 struct vm_page *ptp; 5359 struct pmap *pmap2; 5360 vaddr_t va; 5361 int lvl; 5362 int idx; 5363 struct gnttab_map_grant_ref *op; 5364 struct gnttab_unmap_grant_ref unmap_op; 5365 int ret; 5366 5367 KASSERT(pmap != pmap_kernel()); 5368 KASSERT(pmap->pm_remove == pmap_remove_gnt); 5369 5370 mutex_enter(&pmap->pm_lock); 5371 for (va = sva; va < eva; va += PAGE_SIZE) { 5372 pgnt = pmap_find_gnt(pmap, va, va + PAGE_SIZE); 5373 if (pgnt == NULL) { 5374 pmap_remove_locked(pmap, sva, eva); 5375 continue; 5376 } 5377 5378 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); 5379 if (!pmap_pdes_valid(va, pdes, &pde, &lvl)) { 5380 panic("pmap_remove_gnt pdes not valid"); 5381 } 5382 5383 idx = (va - pgnt->pd_gnt_sva) / PAGE_SIZE; 5384 op = &pgnt->pd_gnt_ops[idx]; 5385 KASSERT(lvl == 1); 5386 KASSERT(op->status == GNTST_okay); 5387 5388 /* Get PTP if non-kernel mapping. 
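 * (pmap_remove_gnt() never runs on the kernel pmap, see the KASSERT
 * above, so a PTP must exist here.)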
*/ 5389 ptp = pmap_find_ptp(pmap, va, 1); 5390 KASSERTMSG(ptp != NULL, 5391 "%s: unmanaged PTP detected", __func__); 5392 5393 if (op->status == GNTST_okay) { 5394 KASSERT(pmap_valid_entry(ptes[pl1_i(va)])); 5395 unmap_op.handle = op->handle; 5396 unmap_op.dev_bus_addr = 0; 5397 #ifdef XENPV /* XXX */ 5398 unmap_op.host_addr = xpmap_ptetomach(&ptes[pl1_i(va)]); 5399 #endif 5400 ret = HYPERVISOR_grant_table_op( 5401 GNTTABOP_unmap_grant_ref, &unmap_op, 1); 5402 if (ret) { 5403 printf("%s: GNTTABOP_unmap_grant_ref " 5404 "failed: %d\n", __func__, ret); 5405 } 5406 5407 ptp->wire_count--; 5408 pgnt->pd_gnt_refs--; 5409 if (pgnt->pd_gnt_refs == 0) { 5410 pmap_free_gnt(pmap, pgnt); 5411 } 5412 } 5413 /* 5414 * if mapping removed and the PTP is no longer 5415 * being used, free it! 5416 */ 5417 5418 if (ptp->wire_count <= 1) 5419 pmap_free_ptp(pmap, ptp, va, ptes, pdes); 5420 pmap_unmap_ptes(pmap, pmap2); 5421 } 5422 mutex_exit(&pmap->pm_lock); 5423 } 5424 #endif /* XEN && DOM0OPS */ 5425 5426 paddr_t 5427 pmap_get_physpage(void) 5428 { 5429 struct vm_page *ptp; 5430 struct pmap *kpm = pmap_kernel(); 5431 paddr_t pa; 5432 5433 if (!uvm.page_init_done) { 5434 /* 5435 * We're growing the kernel pmap early (from 5436 * uvm_pageboot_alloc()). This case must be 5437 * handled a little differently. 5438 */ 5439 5440 if (!uvm_page_physget(&pa)) 5441 panic("%s: out of memory", __func__); 5442 #if defined(__HAVE_DIRECT_MAP) 5443 memset(PAGE_ALIGNED(PMAP_DIRECT_MAP(pa)), 0, PAGE_SIZE); 5444 #else 5445 #if defined(XENPV) 5446 if (XEN_VERSION_SUPPORTED(3, 4)) { 5447 xen_pagezero(pa); 5448 return pa; 5449 } 5450 #endif 5451 kpreempt_disable(); 5452 pmap_pte_set(early_zero_pte, pmap_pa2pte(pa) | PTE_P | 5453 PTE_W | pmap_pg_nx); 5454 pmap_pte_flush(); 5455 pmap_update_pg((vaddr_t)early_zerop); 5456 memset(PAGE_ALIGNED(early_zerop), 0, PAGE_SIZE); 5457 #if defined(DIAGNOSTIC) || defined(XENPV) 5458 pmap_pte_set(early_zero_pte, 0); 5459 pmap_pte_flush(); 5460 #endif /* defined(DIAGNOSTIC) */ 5461 kpreempt_enable(); 5462 #endif /* defined(__HAVE_DIRECT_MAP) */ 5463 } else { 5464 /* XXX */ 5465 ptp = uvm_pagealloc(NULL, 0, NULL, 5466 UVM_PGA_USERESERVE|UVM_PGA_ZERO); 5467 if (ptp == NULL) 5468 panic("%s: out of memory", __func__); 5469 ptp->flags &= ~PG_BUSY; 5470 ptp->wire_count = 1; 5471 pa = VM_PAGE_TO_PHYS(ptp); 5472 } 5473 pmap_stats_update(kpm, 1, 0); 5474 5475 return pa; 5476 } 5477 5478 /* 5479 * Expand the page tree with the specified amount of PTPs, mapping virtual 5480 * addresses starting at kva. We populate all the levels but the last one 5481 * (L1). The nodes of the tree are created as RW, but the pages covered 5482 * will be kentered in L1, with proper permissions. 5483 * 5484 * Used only by pmap_growkernel. 
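 *
 * For example, on amd64 (PTP_LEVELS == 4) this fills in any missing
 * L4/L3/L2 entries covering the new range; the L1 PTEs themselves are
 * installed later when the pages are kentered.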
5485 */ 5486 static void 5487 pmap_alloc_level(struct pmap *cpm, vaddr_t kva, long *needed_ptps) 5488 { 5489 unsigned long i; 5490 paddr_t pa; 5491 unsigned long index, endindex; 5492 int level; 5493 pd_entry_t *pdep; 5494 #ifdef XENPV 5495 int s = splvm(); /* protect xpq_* */ 5496 #endif 5497 5498 for (level = PTP_LEVELS; level > 1; level--) { 5499 if (level == PTP_LEVELS) 5500 pdep = cpm->pm_pdir; 5501 else 5502 pdep = normal_pdes[level - 2]; 5503 index = pl_i_roundup(kva, level); 5504 endindex = index + needed_ptps[level - 1] - 1; 5505 5506 for (i = index; i <= endindex; i++) { 5507 pt_entry_t pte; 5508 5509 KASSERT(!pmap_valid_entry(pdep[i])); 5510 pa = pmap_get_physpage(); 5511 pte = pmap_pa2pte(pa) | PTE_P | PTE_W; 5512 #ifdef __x86_64__ 5513 pte |= pmap_pg_nx; 5514 #endif 5515 pmap_pte_set(&pdep[i], pte); 5516 5517 #ifdef XENPV 5518 if (level == PTP_LEVELS && i >= PDIR_SLOT_KERN) { 5519 if (__predict_true( 5520 cpu_info_primary.ci_flags & CPUF_PRESENT)) { 5521 /* update per-cpu PMDs on all cpus */ 5522 xen_kpm_sync(pmap_kernel(), i); 5523 } else { 5524 /* 5525 * too early; update primary CPU 5526 * PMD only (without locks) 5527 */ 5528 #ifdef __x86_64__ 5529 pd_entry_t *cpu_pdep = 5530 &cpu_info_primary.ci_kpm_pdir[i]; 5531 #else 5532 pd_entry_t *cpu_pdep = 5533 &cpu_info_primary.ci_kpm_pdir[l2tol2(i)]; 5534 #endif 5535 pmap_pte_set(cpu_pdep, pte); 5536 } 5537 } 5538 #endif 5539 5540 KASSERT(level != PTP_LEVELS || nkptp[level - 1] + 5541 pl_i(VM_MIN_KERNEL_ADDRESS, level) == i); 5542 nkptp[level - 1]++; 5543 } 5544 pmap_pte_flush(); 5545 } 5546 #ifdef XENPV 5547 splx(s); 5548 #endif 5549 } 5550 5551 /* 5552 * pmap_growkernel: increase usage of KVM space. 5553 * 5554 * => we allocate new PTPs for the kernel and install them in all 5555 * the pmaps on the system. 5556 */ 5557 vaddr_t 5558 pmap_growkernel(vaddr_t maxkvaddr) 5559 { 5560 struct pmap *kpm = pmap_kernel(); 5561 struct pmap *cpm; 5562 #if !defined(XENPV) || !defined(__x86_64__) 5563 struct pmap *pm; 5564 long old; 5565 #endif 5566 int s, i; 5567 long needed_kptp[PTP_LEVELS], target_nptp; 5568 bool invalidate = false; 5569 5570 s = splvm(); /* to be safe */ 5571 mutex_enter(&kpm->pm_lock); 5572 5573 if (maxkvaddr <= pmap_maxkvaddr) { 5574 mutex_exit(&kpm->pm_lock); 5575 splx(s); 5576 return pmap_maxkvaddr; 5577 } 5578 5579 maxkvaddr = x86_round_pdr(maxkvaddr); 5580 #if !defined(XENPV) || !defined(__x86_64__) 5581 old = nkptp[PTP_LEVELS - 1]; 5582 #endif 5583 5584 /* Initialize needed_kptp. */ 5585 for (i = PTP_LEVELS - 1; i >= 1; i--) { 5586 target_nptp = pl_i_roundup(maxkvaddr, i + 1) - 5587 pl_i_roundup(VM_MIN_KERNEL_ADDRESS, i + 1); 5588 5589 if (target_nptp > nkptpmax[i]) 5590 panic("out of KVA space"); 5591 KASSERT(target_nptp >= nkptp[i]); 5592 needed_kptp[i] = target_nptp - nkptp[i]; 5593 } 5594 5595 #ifdef XENPV 5596 /* only pmap_kernel(), or the per-cpu map, has kernel entries */ 5597 cpm = kpm; 5598 #else 5599 /* Get the current pmap */ 5600 if (__predict_true(cpu_info_primary.ci_flags & CPUF_PRESENT)) { 5601 cpm = curcpu()->ci_pmap; 5602 } else { 5603 cpm = kpm; 5604 } 5605 #endif 5606 5607 kasan_shadow_map((void *)pmap_maxkvaddr, 5608 (size_t)(maxkvaddr - pmap_maxkvaddr)); 5609 kmsan_shadow_map((void *)pmap_maxkvaddr, 5610 (size_t)(maxkvaddr - pmap_maxkvaddr)); 5611 5612 pmap_alloc_level(cpm, pmap_maxkvaddr, needed_kptp); 5613 5614 /* 5615 * If the number of top level entries changed, update all pmaps. 
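 * On native this means copying the new kernel PDEs into the
 * PDIR_SLOT_KERN range of every pmap on the pmaps list; on XENPV/amd64
 * nothing is needed because kernel entries are never entered in user
 * pmaps.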
5616 */ 5617 if (needed_kptp[PTP_LEVELS - 1] != 0) { 5618 #ifdef XENPV 5619 #ifdef __x86_64__ 5620 /* nothing, kernel entries are never entered in user pmap */ 5621 #else 5622 int pdkidx; 5623 5624 mutex_enter(&pmaps_lock); 5625 LIST_FOREACH(pm, &pmaps, pm_list) { 5626 for (pdkidx = PDIR_SLOT_KERN + old; 5627 pdkidx < PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1]; 5628 pdkidx++) { 5629 pmap_pte_set(&pm->pm_pdir[pdkidx], 5630 kpm->pm_pdir[pdkidx]); 5631 } 5632 pmap_pte_flush(); 5633 } 5634 mutex_exit(&pmaps_lock); 5635 #endif /* __x86_64__ */ 5636 #else /* XENPV */ 5637 size_t newpdes; 5638 newpdes = nkptp[PTP_LEVELS - 1] - old; 5639 if (cpm != kpm) { 5640 memcpy(&kpm->pm_pdir[PDIR_SLOT_KERN + old], 5641 &cpm->pm_pdir[PDIR_SLOT_KERN + old], 5642 newpdes * sizeof(pd_entry_t)); 5643 } 5644 5645 mutex_enter(&pmaps_lock); 5646 LIST_FOREACH(pm, &pmaps, pm_list) { 5647 if (__predict_false(pm->pm_enter != NULL)) { 5648 /* 5649 * Not a native pmap, the kernel is not mapped, 5650 * so nothing to synchronize. 5651 */ 5652 continue; 5653 } 5654 memcpy(&pm->pm_pdir[PDIR_SLOT_KERN + old], 5655 &kpm->pm_pdir[PDIR_SLOT_KERN + old], 5656 newpdes * sizeof(pd_entry_t)); 5657 } 5658 mutex_exit(&pmaps_lock); 5659 #endif 5660 invalidate = true; 5661 } 5662 pmap_maxkvaddr = maxkvaddr; 5663 mutex_exit(&kpm->pm_lock); 5664 splx(s); 5665 5666 if (invalidate && pmap_initialized) { 5667 /* Invalidate the pmap cache. */ 5668 pool_cache_invalidate(&pmap_cache); 5669 } 5670 5671 return maxkvaddr; 5672 } 5673 5674 #ifdef DEBUG 5675 void pmap_dump(struct pmap *, vaddr_t, vaddr_t); 5676 5677 /* 5678 * pmap_dump: dump all the mappings from a pmap 5679 * 5680 * => caller should not be holding any pmap locks 5681 */ 5682 void 5683 pmap_dump(struct pmap *pmap, vaddr_t sva, vaddr_t eva) 5684 { 5685 pt_entry_t *ptes, *pte; 5686 pd_entry_t * const *pdes; 5687 struct pmap *pmap2; 5688 vaddr_t blkendva; 5689 int lvl; 5690 5691 /* 5692 * if end is out of range truncate. 5693 * if (end == start) update to max. 5694 */ 5695 5696 if (eva > VM_MAXUSER_ADDRESS || eva <= sva) 5697 eva = VM_MAXUSER_ADDRESS; 5698 5699 mutex_enter(&pmap->pm_lock); 5700 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); 5701 5702 /* 5703 * dumping a range of pages: we dump in PTP sized blocks (4MB) 5704 */ 5705 5706 for (/* null */ ; sva < eva ; sva = blkendva) { 5707 5708 /* determine range of block */ 5709 blkendva = x86_round_pdr(sva+1); 5710 if (blkendva > eva) 5711 blkendva = eva; 5712 5713 /* valid block? */ 5714 if (!pmap_pdes_valid(sva, pdes, NULL, &lvl)) 5715 continue; 5716 KASSERT(lvl == 1); 5717 5718 pte = &ptes[pl1_i(sva)]; 5719 for (/* null */; sva < blkendva ; sva += PAGE_SIZE, pte++) { 5720 if (!pmap_valid_entry(*pte)) 5721 continue; 5722 printf("va %#" PRIxVADDR " -> pa %#" PRIxPADDR 5723 " (pte=%#" PRIxPADDR ")\n", 5724 sva, (paddr_t)pmap_pte2pa(*pte), (paddr_t)*pte); 5725 } 5726 } 5727 pmap_unmap_ptes(pmap, pmap2); 5728 mutex_exit(&pmap->pm_lock); 5729 } 5730 #endif 5731 5732 /* 5733 * pmap_update: process deferred invalidations and frees. 5734 */ 5735 void 5736 pmap_update(struct pmap *pmap) 5737 { 5738 struct pmap_page *pp; 5739 struct vm_page *ptp; 5740 5741 /* 5742 * Initiate any pending TLB shootdowns. Wait for them to 5743 * complete before returning control to the caller. 5744 */ 5745 kpreempt_disable(); 5746 pmap_tlb_shootnow(); 5747 kpreempt_enable(); 5748 5749 /* 5750 * Now that shootdowns are complete, process deferred frees. 
This 5751 * is an unlocked check, but is safe as we're only interested in 5752 * work done in this LWP - we won't get a false negative. 5753 */ 5754 if (atomic_load_relaxed(&pmap->pm_gc_ptp.lh_first) == NULL) { 5755 return; 5756 } 5757 5758 mutex_enter(&pmap->pm_lock); 5759 while ((ptp = LIST_FIRST(&pmap->pm_gc_ptp)) != NULL) { 5760 KASSERT(ptp->wire_count == 0); 5761 KASSERT(ptp->uanon == NULL); 5762 LIST_REMOVE(ptp, mdpage.mp_pp.pp_link); 5763 pp = VM_PAGE_TO_PP(ptp); 5764 LIST_INIT(&pp->pp_pvlist); 5765 pp->pp_attrs = 0; 5766 pp->pp_pte.pte_ptp = NULL; 5767 pp->pp_pte.pte_va = 0; 5768 PMAP_CHECK_PP(VM_PAGE_TO_PP(ptp)); 5769 5770 /* 5771 * XXX Hack to avoid extra locking, and lock 5772 * assertions in uvm_pagefree(). Despite uobject 5773 * being set, this isn't a managed page. 5774 */ 5775 PMAP_DUMMY_LOCK(pmap); 5776 uvm_pagerealloc(ptp, NULL, 0); 5777 PMAP_DUMMY_UNLOCK(pmap); 5778 uvm_pagefree(ptp); 5779 } 5780 mutex_exit(&pmap->pm_lock); 5781 } 5782 5783 #if PTP_LEVELS > 4 5784 #error "Unsupported number of page table mappings" 5785 #endif 5786 5787 paddr_t 5788 pmap_init_tmp_pgtbl(paddr_t pg) 5789 { 5790 static bool maps_loaded; 5791 static const paddr_t x86_tmp_pml_paddr[] = { 5792 4 * PAGE_SIZE, /* L1 */ 5793 5 * PAGE_SIZE, /* L2 */ 5794 6 * PAGE_SIZE, /* L3 */ 5795 7 * PAGE_SIZE /* L4 */ 5796 }; 5797 static vaddr_t x86_tmp_pml_vaddr[] = { 0, 0, 0, 0 }; 5798 5799 pd_entry_t *tmp_pml, *kernel_pml; 5800 5801 int level; 5802 5803 if (!maps_loaded) { 5804 for (level = 0; level < PTP_LEVELS; ++level) { 5805 x86_tmp_pml_vaddr[level] = 5806 uvm_km_alloc(kernel_map, PAGE_SIZE, 0, 5807 UVM_KMF_VAONLY); 5808 5809 if (x86_tmp_pml_vaddr[level] == 0) 5810 panic("mapping of real mode PML failed\n"); 5811 pmap_kenter_pa(x86_tmp_pml_vaddr[level], 5812 x86_tmp_pml_paddr[level], 5813 VM_PROT_READ | VM_PROT_WRITE, 0); 5814 } 5815 pmap_update(pmap_kernel()); 5816 maps_loaded = true; 5817 } 5818 5819 /* Zero levels 1-3 */ 5820 for (level = 0; level < PTP_LEVELS - 1; ++level) { 5821 tmp_pml = (void *)x86_tmp_pml_vaddr[level]; 5822 memset(PAGE_ALIGNED(tmp_pml), 0, PAGE_SIZE); 5823 } 5824 5825 /* Copy PML4 */ 5826 kernel_pml = pmap_kernel()->pm_pdir; 5827 tmp_pml = (void *)x86_tmp_pml_vaddr[PTP_LEVELS - 1]; 5828 memcpy(PAGE_ALIGNED(tmp_pml), PAGE_ALIGNED(kernel_pml), PAGE_SIZE); 5829 5830 #ifdef PAE 5831 /* 5832 * Use the last 4 entries of the L2 page as L3 PD entries. These 5833 * last entries are unlikely to be used for temporary mappings. 
5834 * 508: maps 0->1GB (userland) 5835 * 509: unused 5836 * 510: unused 5837 * 511: maps 3->4GB (kernel) 5838 */ 5839 tmp_pml[508] = x86_tmp_pml_paddr[PTP_LEVELS - 1] | PTE_P; 5840 tmp_pml[509] = 0; 5841 tmp_pml[510] = 0; 5842 tmp_pml[511] = pmap_pdirpa(pmap_kernel(), PDIR_SLOT_KERN) | PTE_P; 5843 #endif 5844 5845 for (level = PTP_LEVELS - 1; level > 0; --level) { 5846 tmp_pml = (void *)x86_tmp_pml_vaddr[level]; 5847 5848 tmp_pml[pl_i(pg, level + 1)] = 5849 (x86_tmp_pml_paddr[level - 1] & PTE_FRAME) | PTE_W | PTE_P; 5850 } 5851 5852 tmp_pml = (void *)x86_tmp_pml_vaddr[0]; 5853 tmp_pml[pl_i(pg, 1)] = (pg & PTE_FRAME) | PTE_W | PTE_P; 5854 5855 #ifdef PAE 5856 /* Return the PA of the L3 page (entry 508 of the L2 page) */ 5857 return x86_tmp_pml_paddr[PTP_LEVELS - 1] + 508 * sizeof(pd_entry_t); 5858 #endif 5859 5860 return x86_tmp_pml_paddr[PTP_LEVELS - 1]; 5861 } 5862 5863 u_int 5864 x86_mmap_flags(paddr_t mdpgno) 5865 { 5866 u_int nflag = (mdpgno >> X86_MMAP_FLAG_SHIFT) & X86_MMAP_FLAG_MASK; 5867 u_int pflag = 0; 5868 5869 if (nflag & X86_MMAP_FLAG_PREFETCH) 5870 pflag |= PMAP_WRITE_COMBINE; 5871 5872 return pflag; 5873 } 5874 5875 #if defined(__HAVE_DIRECT_MAP) && defined(__x86_64__) && !defined(XENPV) 5876 5877 /* 5878 * ----------------------------------------------------------------------------- 5879 * ***************************************************************************** 5880 * ***************************************************************************** 5881 * ***************************************************************************** 5882 * ***************************************************************************** 5883 * **************** HERE BEGINS THE EPT CODE, USED BY INTEL-VMX **************** 5884 * ***************************************************************************** 5885 * ***************************************************************************** 5886 * ***************************************************************************** 5887 * ***************************************************************************** 5888 * ----------------------------------------------------------------------------- 5889 * 5890 * These functions are invoked as callbacks from the code above. Contrary to 5891 * native, EPT does not have a recursive slot; therefore, it is not possible 5892 * to call pmap_map_ptes(). Instead, we use the direct map and walk down the 5893 * tree manually. 5894 * 5895 * Apart from that, the logic is mostly the same as native. Once a pmap has 5896 * been created, NVMM calls pmap_ept_transform() to make it an EPT pmap. 5897 * After that we're good, and the callbacks will handle the translations 5898 * for us. 5899 * 5900 * ----------------------------------------------------------------------------- 5901 */ 5902 5903 /* Hardware bits. */ 5904 #define EPT_R __BIT(0) /* read */ 5905 #define EPT_W __BIT(1) /* write */ 5906 #define EPT_X __BIT(2) /* execute */ 5907 #define EPT_T __BITS(5,3) /* type */ 5908 #define TYPE_UC 0 5909 #define TYPE_WC 1 5910 #define TYPE_WT 4 5911 #define TYPE_WP 5 5912 #define TYPE_WB 6 5913 #define EPT_NOPAT __BIT(6) 5914 #define EPT_L __BIT(7) /* large */ 5915 #define EPT_A __BIT(8) /* accessed */ 5916 #define EPT_D __BIT(9) /* dirty */ 5917 /* Software bits. 
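 * These occupy bit positions the EPT hardware walk ignores and play
 * the same role as the native PTE_PVLIST / PTE_WIRED bits.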
*/ 5918 #define EPT_PVLIST __BIT(60) 5919 #define EPT_WIRED __BIT(61) 5920 5921 #define pmap_ept_valid_entry(pte) (pte & EPT_R) 5922 5923 bool pmap_ept_has_ad __read_mostly; 5924 5925 static inline void 5926 pmap_ept_stats_update_bypte(struct pmap *pmap, pt_entry_t npte, pt_entry_t opte) 5927 { 5928 int resid_diff = ((npte & EPT_R) ? 1 : 0) - ((opte & EPT_R) ? 1 : 0); 5929 int wired_diff = ((npte & EPT_WIRED) ? 1 : 0) - ((opte & EPT_WIRED) ? 1 : 0); 5930 5931 KASSERT((npte & (EPT_R | EPT_WIRED)) != EPT_WIRED); 5932 KASSERT((opte & (EPT_R | EPT_WIRED)) != EPT_WIRED); 5933 5934 pmap_stats_update(pmap, resid_diff, wired_diff); 5935 } 5936 5937 static pt_entry_t 5938 pmap_ept_type(u_int flags) 5939 { 5940 u_int cacheflags = (flags & PMAP_CACHE_MASK); 5941 pt_entry_t ret; 5942 5943 switch (cacheflags) { 5944 case PMAP_NOCACHE: 5945 case PMAP_NOCACHE_OVR: 5946 ret = __SHIFTIN(TYPE_UC, EPT_T); 5947 break; 5948 case PMAP_WRITE_COMBINE: 5949 ret = __SHIFTIN(TYPE_WC, EPT_T); 5950 break; 5951 case PMAP_WRITE_BACK: 5952 default: 5953 ret = __SHIFTIN(TYPE_WB, EPT_T); 5954 break; 5955 } 5956 5957 ret |= EPT_NOPAT; 5958 return ret; 5959 } 5960 5961 static inline pt_entry_t 5962 pmap_ept_prot(vm_prot_t prot) 5963 { 5964 pt_entry_t res = 0; 5965 5966 if (prot & VM_PROT_READ) 5967 res |= EPT_R; 5968 if (prot & VM_PROT_WRITE) 5969 res |= EPT_W; 5970 if (prot & VM_PROT_EXECUTE) 5971 res |= EPT_X; 5972 5973 return res; 5974 } 5975 5976 static inline uint8_t 5977 pmap_ept_to_pp_attrs(pt_entry_t ept) 5978 { 5979 uint8_t ret = 0; 5980 if (pmap_ept_has_ad) { 5981 if (ept & EPT_D) 5982 ret |= PP_ATTRS_D; 5983 if (ept & EPT_A) 5984 ret |= PP_ATTRS_A; 5985 } else { 5986 ret |= (PP_ATTRS_D|PP_ATTRS_A); 5987 } 5988 if (ept & EPT_W) 5989 ret |= PP_ATTRS_W; 5990 return ret; 5991 } 5992 5993 static inline pt_entry_t 5994 pmap_pp_attrs_to_ept(uint8_t attrs) 5995 { 5996 pt_entry_t ept = 0; 5997 if (attrs & PP_ATTRS_D) 5998 ept |= EPT_D; 5999 if (attrs & PP_ATTRS_A) 6000 ept |= EPT_A; 6001 if (attrs & PP_ATTRS_W) 6002 ept |= EPT_W; 6003 return ept; 6004 } 6005 6006 /* 6007 * Helper for pmap_ept_free_ptp. 6008 * tree[0] = &L2[L2idx] 6009 * tree[1] = &L3[L3idx] 6010 * tree[2] = &L4[L4idx] 6011 */ 6012 static void 6013 pmap_ept_get_tree(struct pmap *pmap, vaddr_t va, pd_entry_t **tree) 6014 { 6015 pt_entry_t *pteva; 6016 paddr_t ptepa; 6017 int i, index; 6018 6019 ptepa = pmap->pm_pdirpa[0]; 6020 for (i = PTP_LEVELS; i > 1; i--) { 6021 index = pl_pi(va, i); 6022 pteva = (pt_entry_t *)PMAP_DIRECT_MAP(ptepa); 6023 KASSERT(pmap_ept_valid_entry(pteva[index])); 6024 tree[i - 2] = &pteva[index]; 6025 ptepa = pmap_pte2pa(pteva[index]); 6026 } 6027 } 6028 6029 static void 6030 pmap_ept_free_ptp(struct pmap *pmap, struct vm_page *ptp, vaddr_t va) 6031 { 6032 pd_entry_t *tree[3]; 6033 int level; 6034 6035 KASSERT(pmap != pmap_kernel()); 6036 KASSERT(mutex_owned(&pmap->pm_lock)); 6037 KASSERT(kpreempt_disabled()); 6038 6039 pmap_ept_get_tree(pmap, va, tree); 6040 6041 level = 1; 6042 do { 6043 (void)pmap_pte_testset(tree[level - 1], 0); 6044 6045 pmap_freepage(pmap, ptp, level); 6046 if (level < PTP_LEVELS - 1) { 6047 ptp = pmap_find_ptp(pmap, va, level + 1); 6048 ptp->wire_count--; 6049 if (ptp->wire_count > 1) 6050 break; 6051 } 6052 } while (++level < PTP_LEVELS); 6053 pmap_pte_flush(); 6054 } 6055 6056 /* Allocate L4->L3->L2. Return L2. 
*/ 6057 static void 6058 pmap_ept_install_ptp(struct pmap *pmap, struct pmap_ptparray *pt, vaddr_t va) 6059 { 6060 struct vm_page *ptp; 6061 unsigned long index; 6062 pd_entry_t *pteva; 6063 paddr_t ptepa; 6064 int i; 6065 6066 KASSERT(pmap != pmap_kernel()); 6067 KASSERT(mutex_owned(&pmap->pm_lock)); 6068 KASSERT(kpreempt_disabled()); 6069 6070 /* 6071 * Now that we have all the pages looked up or allocated, 6072 * loop through again installing any new ones into the tree. 6073 */ 6074 ptepa = pmap->pm_pdirpa[0]; 6075 for (i = PTP_LEVELS; i > 1; i--) { 6076 index = pl_pi(va, i); 6077 pteva = (pt_entry_t *)PMAP_DIRECT_MAP(ptepa); 6078 6079 if (pmap_ept_valid_entry(pteva[index])) { 6080 KASSERT(!pt->alloced[i]); 6081 ptepa = pmap_pte2pa(pteva[index]); 6082 continue; 6083 } 6084 6085 ptp = pt->pg[i]; 6086 ptp->flags &= ~PG_BUSY; /* never busy */ 6087 ptp->wire_count = 1; 6088 pmap->pm_ptphint[i - 2] = ptp; 6089 ptepa = VM_PAGE_TO_PHYS(ptp); 6090 pmap_pte_set(&pteva[index], ptepa | EPT_R | EPT_W | EPT_X); 6091 6092 pmap_pte_flush(); 6093 pmap_stats_update(pmap, 1, 0); 6094 6095 /* 6096 * If we're not in the top level, increase the 6097 * wire count of the parent page. 6098 */ 6099 if (i < PTP_LEVELS) { 6100 pt->pg[i + 1]->wire_count++; 6101 } 6102 } 6103 } 6104 6105 static int 6106 pmap_ept_enter(struct pmap *pmap, vaddr_t va, paddr_t pa, vm_prot_t prot, 6107 u_int flags) 6108 { 6109 pt_entry_t *ptes, opte, npte; 6110 pt_entry_t *ptep; 6111 struct vm_page *ptp; 6112 struct vm_page *new_pg, *old_pg; 6113 struct pmap_page *new_pp, *old_pp; 6114 struct pv_entry *old_pve, *new_pve; 6115 bool wired = (flags & PMAP_WIRED) != 0; 6116 bool accessed; 6117 struct pmap_ptparray pt; 6118 int error; 6119 bool getptp, samepage, new_embedded; 6120 rb_tree_t *tree; 6121 6122 KASSERT(pmap_initialized); 6123 KASSERT(va < VM_MAXUSER_ADDRESS); 6124 6125 npte = pa | pmap_ept_prot(prot) | pmap_ept_type(flags); 6126 6127 if (wired) 6128 npte |= EPT_WIRED; 6129 if (flags & VM_PROT_ALL) { 6130 npte |= EPT_A; 6131 if (flags & VM_PROT_WRITE) { 6132 KASSERT((npte & EPT_W) != 0); 6133 npte |= EPT_D; 6134 } 6135 } 6136 6137 new_pg = PHYS_TO_VM_PAGE(pa); 6138 if (new_pg != NULL) { 6139 /* This is a managed page */ 6140 npte |= EPT_PVLIST; 6141 new_pp = VM_PAGE_TO_PP(new_pg); 6142 } else if ((new_pp = pmap_pv_tracked(pa)) != NULL) { 6143 /* This is an unmanaged pv-tracked page */ 6144 npte |= EPT_PVLIST; 6145 } else { 6146 new_pp = NULL; 6147 } 6148 6149 /* Begin by locking the pmap. */ 6150 mutex_enter(&pmap->pm_lock); 6151 6152 /* Look up the PTP. Allocate if none present. */ 6153 ptp = NULL; 6154 getptp = false; 6155 if (pmap != pmap_kernel()) { 6156 ptp = pmap_find_ptp(pmap, va, 1); 6157 if (ptp == NULL) { 6158 getptp = true; 6159 error = pmap_get_ptp(pmap, &pt, va, flags, &ptp); 6160 if (error != 0) { 6161 if (flags & PMAP_CANFAIL) { 6162 mutex_exit(&pmap->pm_lock); 6163 return error; 6164 } 6165 panic("%s: get ptp failed, error=%d", __func__, 6166 error); 6167 } 6168 } 6169 tree = &VM_PAGE_TO_PP(ptp)->pp_rb; 6170 } else { 6171 /* Embedded PV entries rely on this. */ 6172 KASSERT(va != 0); 6173 tree = &pmap_kernel_rb; 6174 } 6175 6176 /* 6177 * Look up the old PV entry at this VA (if any), and insert a new PV 6178 * entry if required for the new mapping. Temporarily track the old 6179 * and new mappings concurrently. Only after the old mapping is 6180 * evicted from the pmap will we remove its PV entry. 
Otherwise, 6181 * our picture of modified/accessed state for either page could get 6182 * out of sync (we need any P->V operation for either page to stall 6183 * on pmap->pm_lock until done here). 6184 */ 6185 new_pve = NULL; 6186 old_pve = NULL; 6187 samepage = false; 6188 new_embedded = false; 6189 6190 if (new_pp != NULL) { 6191 error = pmap_enter_pv(pmap, new_pp, ptp, va, &new_pve, 6192 &old_pve, &samepage, &new_embedded, tree); 6193 6194 /* 6195 * If a new pv_entry was needed and none was available, we 6196 * can go no further. 6197 */ 6198 if (error != 0) { 6199 if (flags & PMAP_CANFAIL) { 6200 if (getptp) { 6201 pmap_unget_ptp(pmap, &pt); 6202 } 6203 mutex_exit(&pmap->pm_lock); 6204 return error; 6205 } 6206 panic("%s: alloc pve failed", __func__); 6207 } 6208 } else { 6209 old_pve = pmap_treelookup_pv(pmap, ptp, tree, va); 6210 } 6211 6212 /* Map PTEs into address space. */ 6213 kpreempt_disable(); 6214 6215 /* Install any newly allocated PTPs. */ 6216 if (getptp) { 6217 pmap_ept_install_ptp(pmap, &pt, va); 6218 } 6219 6220 /* Check if there is an existing mapping. */ 6221 ptes = (pt_entry_t *)PMAP_DIRECT_MAP(VM_PAGE_TO_PHYS(ptp)); 6222 ptep = &ptes[pl1_pi(va)]; 6223 opte = *ptep; 6224 bool have_oldpa = pmap_ept_valid_entry(opte); 6225 paddr_t oldpa = pmap_pte2pa(opte); 6226 6227 /* 6228 * Update the pte. 6229 */ 6230 do { 6231 opte = *ptep; 6232 6233 /* 6234 * if the same page, inherit PTE_A and PTE_D. 6235 */ 6236 if (((opte ^ npte) & (PTE_FRAME | EPT_R)) == 0) { 6237 npte |= opte & (EPT_A | EPT_D); 6238 } 6239 } while (pmap_pte_cas(ptep, opte, npte) != opte); 6240 6241 /* 6242 * Done with the PTEs: they can now be unmapped. 6243 */ 6244 kpreempt_enable(); 6245 6246 /* 6247 * Update statistics and PTP's reference count. 6248 */ 6249 pmap_ept_stats_update_bypte(pmap, npte, opte); 6250 if (ptp != NULL) { 6251 if (!have_oldpa) { 6252 ptp->wire_count++; 6253 } 6254 /* Remember minimum VA in PTP. */ 6255 pmap_ptp_range_set(ptp, va); 6256 } 6257 KASSERT(ptp == NULL || ptp->wire_count > 1); 6258 6259 /* 6260 * If the same page, we can skip pv_entry handling. 6261 */ 6262 if (((opte ^ npte) & (PTE_FRAME | EPT_R)) == 0) { 6263 KASSERT(((opte ^ npte) & EPT_PVLIST) == 0); 6264 if ((npte & EPT_PVLIST) != 0) { 6265 KASSERT(samepage); 6266 pmap_check_pv(pmap, ptp, new_pp, va, true); 6267 } 6268 goto same_pa; 6269 } else if ((npte & EPT_PVLIST) != 0) { 6270 KASSERT(!samepage); 6271 } 6272 6273 /* 6274 * If old page is pv-tracked, remove pv_entry from its list. 6275 */ 6276 if ((~opte & (EPT_R | EPT_PVLIST)) == 0) { 6277 if ((old_pg = PHYS_TO_VM_PAGE(oldpa)) != NULL) { 6278 old_pp = VM_PAGE_TO_PP(old_pg); 6279 } else if ((old_pp = pmap_pv_tracked(oldpa)) == NULL) { 6280 panic("%s: EPT_PVLIST with pv-untracked page" 6281 " va = %#"PRIxVADDR 6282 " pa = %#" PRIxPADDR " (%#" PRIxPADDR ")", 6283 __func__, va, oldpa, atop(pa)); 6284 } 6285 6286 pmap_remove_pv(pmap, old_pp, ptp, va, old_pve, 6287 pmap_ept_to_pp_attrs(opte)); 6288 } else { 6289 KASSERT(old_pve == NULL); 6290 KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == NULL); 6291 } 6292 6293 /* 6294 * If new page is dynamically PV tracked, insert to tree. 6295 */ 6296 if (new_pve != NULL) { 6297 KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == NULL); 6298 old_pve = rb_tree_insert_node(tree, new_pve); 6299 KASSERT(old_pve == new_pve); 6300 pmap_check_pv(pmap, ptp, new_pp, va, true); 6301 } 6302 6303 same_pa: 6304 /* 6305 * shootdown tlb if necessary. 
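 *
 * The old translation can only be cached in the TLB if the hardware
 * actually used it: with EPT A/D support that means EPT_R and EPT_A
 * are both set in opte (the "~opte & (EPT_R | EPT_A)" test below),
 * while without A/D support any readable PTE has to be assumed
 * cached.  Even then a flush is only required when the physical
 * frame or the write permission changed, hence the
 * "(opte ^ npte) & (PTE_FRAME | EPT_W)" check.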
6306 */ 6307 6308 if (pmap_ept_has_ad) { 6309 accessed = (~opte & (EPT_R | EPT_A)) == 0; 6310 } else { 6311 accessed = (opte & EPT_R) != 0; 6312 } 6313 if (accessed && ((opte ^ npte) & (PTE_FRAME | EPT_W)) != 0) { 6314 pmap_tlb_shootdown(pmap, va, 0, TLBSHOOT_ENTER); 6315 } 6316 pmap_drain_pv(pmap); 6317 mutex_exit(&pmap->pm_lock); 6318 return 0; 6319 } 6320 6321 /* Pay close attention, this returns L2. */ 6322 static int 6323 pmap_ept_pdes_invalid(struct pmap *pmap, vaddr_t va, pd_entry_t *lastpde) 6324 { 6325 pt_entry_t *pteva; 6326 paddr_t ptepa; 6327 int i, index; 6328 6329 KASSERT(mutex_owned(&pmap->pm_lock)); 6330 6331 ptepa = pmap->pm_pdirpa[0]; 6332 for (i = PTP_LEVELS; i > 1; i--) { 6333 pteva = (pt_entry_t *)PMAP_DIRECT_MAP(ptepa); 6334 index = pl_pi(va, i); 6335 if (!pmap_ept_valid_entry(pteva[index])) 6336 return i; 6337 ptepa = pmap_pte2pa(pteva[index]); 6338 } 6339 if (lastpde != NULL) { 6340 *lastpde = pteva[index]; 6341 } 6342 6343 return 0; 6344 } 6345 6346 static bool 6347 pmap_ept_extract(struct pmap *pmap, vaddr_t va, paddr_t *pap) 6348 { 6349 pt_entry_t *ptes, pte; 6350 pd_entry_t pde; 6351 paddr_t ptppa, pa; 6352 bool rv; 6353 6354 #ifdef __HAVE_DIRECT_MAP 6355 if (va >= PMAP_DIRECT_BASE && va < PMAP_DIRECT_END) { 6356 if (pap != NULL) { 6357 *pap = PMAP_DIRECT_UNMAP(va); 6358 } 6359 return true; 6360 } 6361 #endif 6362 6363 rv = false; 6364 pa = 0; 6365 6366 mutex_enter(&pmap->pm_lock); 6367 kpreempt_disable(); 6368 6369 if (!pmap_ept_pdes_invalid(pmap, va, &pde)) { 6370 ptppa = pmap_pte2pa(pde); 6371 ptes = (pt_entry_t *)PMAP_DIRECT_MAP(ptppa); 6372 pte = ptes[pl1_pi(va)]; 6373 if (__predict_true((pte & EPT_R) != 0)) { 6374 pa = pmap_pte2pa(pte) | (va & (NBPD_L1 - 1)); 6375 rv = true; 6376 } 6377 } 6378 6379 kpreempt_enable(); 6380 mutex_exit(&pmap->pm_lock); 6381 6382 if (pap != NULL) { 6383 *pap = pa; 6384 } 6385 return rv; 6386 } 6387 6388 static bool 6389 pmap_ept_remove_pte(struct pmap *pmap, struct vm_page *ptp, pt_entry_t *pte, 6390 vaddr_t va) 6391 { 6392 struct pv_entry *pve; 6393 struct vm_page *pg; 6394 struct pmap_page *pp; 6395 pt_entry_t opte; 6396 bool accessed; 6397 6398 KASSERT(pmap != pmap_kernel()); 6399 KASSERT(mutex_owned(&pmap->pm_lock)); 6400 KASSERT(kpreempt_disabled()); 6401 6402 if (!pmap_ept_valid_entry(*pte)) { 6403 /* VA not mapped. */ 6404 return false; 6405 } 6406 6407 /* Atomically save the old PTE and zap it. */ 6408 opte = pmap_pte_testset(pte, 0); 6409 if (!pmap_ept_valid_entry(opte)) { 6410 return false; 6411 } 6412 6413 pmap_ept_stats_update_bypte(pmap, 0, opte); 6414 6415 if (ptp) { 6416 /* 6417 * Dropping a PTE. Make sure that the PDE is flushed. 6418 */ 6419 ptp->wire_count--; 6420 if (ptp->wire_count <= 1) { 6421 opte |= EPT_A; 6422 } 6423 } 6424 6425 if (pmap_ept_has_ad) { 6426 accessed = (opte & EPT_A) != 0; 6427 } else { 6428 accessed = true; 6429 } 6430 if (accessed) { 6431 pmap_tlb_shootdown(pmap, va, 0, TLBSHOOT_REMOVE_PTE); 6432 } 6433 6434 /* 6435 * If we are not on a pv list - we are done. 6436 */ 6437 if ((opte & EPT_PVLIST) == 0) { 6438 KASSERTMSG((PHYS_TO_VM_PAGE(pmap_pte2pa(opte)) == NULL), 6439 "managed page without EPT_PVLIST for %#"PRIxVADDR, va); 6440 KASSERTMSG((pmap_pv_tracked(pmap_pte2pa(opte)) == NULL), 6441 "pv-tracked page without EPT_PVLIST for %#"PRIxVADDR, va); 6442 KASSERT(pmap_treelookup_pv(pmap, ptp, (ptp != NULL ? 
6443 &VM_PAGE_TO_PP(ptp)->pp_rb : &pmap_kernel_rb), va) == NULL); 6444 return true; 6445 } 6446 6447 if ((pg = PHYS_TO_VM_PAGE(pmap_pte2pa(opte))) != NULL) { 6448 pp = VM_PAGE_TO_PP(pg); 6449 } else if ((pp = pmap_pv_tracked(pmap_pte2pa(opte))) == NULL) { 6450 paddr_t pa = pmap_pte2pa(opte); 6451 panic("%s: EPT_PVLIST with pv-untracked page" 6452 " va = %#"PRIxVADDR"pa = %#"PRIxPADDR" (%#"PRIxPADDR")", 6453 __func__, va, pa, atop(pa)); 6454 } 6455 6456 /* Sync R/M bits. */ 6457 pve = pmap_lookup_pv(pmap, ptp, pp, va); 6458 pmap_remove_pv(pmap, pp, ptp, va, pve, pmap_ept_to_pp_attrs(opte)); 6459 return true; 6460 } 6461 6462 static void 6463 pmap_ept_remove_ptes(struct pmap *pmap, struct vm_page *ptp, vaddr_t ptpva, 6464 vaddr_t startva, vaddr_t endva) 6465 { 6466 pt_entry_t *pte = (pt_entry_t *)ptpva; 6467 6468 KASSERT(pmap != pmap_kernel()); 6469 KASSERT(mutex_owned(&pmap->pm_lock)); 6470 KASSERT(kpreempt_disabled()); 6471 6472 /* 6473 * mappings are very often sparse, so clip the given range to the 6474 * range of PTEs that are known present in the PTP. 6475 */ 6476 pmap_ptp_range_clip(ptp, &startva, &pte); 6477 6478 /* 6479 * note that ptpva points to the PTE that maps startva. this may 6480 * or may not be the first PTE in the PTP. 6481 * 6482 * we loop through the PTP while there are still PTEs to look at 6483 * and the wire_count is greater than 1 (because we use the wire_count 6484 * to keep track of the number of real PTEs in the PTP). 6485 */ 6486 while (startva < endva && (ptp == NULL || ptp->wire_count > 1)) { 6487 (void)pmap_ept_remove_pte(pmap, ptp, pte, startva); 6488 startva += PAGE_SIZE; 6489 pte++; 6490 } 6491 } 6492 6493 static void 6494 pmap_ept_remove(struct pmap *pmap, vaddr_t sva, vaddr_t eva) 6495 { 6496 pt_entry_t *ptes; 6497 pd_entry_t pde; 6498 paddr_t ptppa; 6499 vaddr_t blkendva, va = sva; 6500 struct vm_page *ptp; 6501 6502 mutex_enter(&pmap->pm_lock); 6503 kpreempt_disable(); 6504 6505 for (/* null */ ; va < eva ; va = blkendva) { 6506 int lvl; 6507 6508 /* determine range of block */ 6509 blkendva = x86_round_pdr(va+1); 6510 if (blkendva > eva) 6511 blkendva = eva; 6512 6513 lvl = pmap_ept_pdes_invalid(pmap, va, &pde); 6514 if (lvl != 0) { 6515 /* Skip a range corresponding to an invalid pde. */ 6516 blkendva = (va & ptp_frames[lvl - 1]) + nbpd[lvl - 1]; 6517 continue; 6518 } 6519 6520 /* PA of the PTP */ 6521 ptppa = pmap_pte2pa(pde); 6522 6523 ptp = pmap_find_ptp(pmap, va, 1); 6524 KASSERTMSG(ptp != NULL, "%s: unmanaged PTP detected", 6525 __func__); 6526 6527 ptes = (pt_entry_t *)PMAP_DIRECT_MAP(ptppa); 6528 6529 pmap_ept_remove_ptes(pmap, ptp, (vaddr_t)&ptes[pl1_pi(va)], va, 6530 blkendva); 6531 6532 /* If PTP is no longer being used, free it. 
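 *
 * wire_count bookkeeping (see pmap_ept_install_ptp()/pmap_ept_enter()):
 *	wire_count == 1      the PTP exists but holds no valid entries
 *	wire_count == 1 + N  the PTP holds N valid PTEs (or child PTPs)
 * pmap_ept_remove_pte() dropped one reference for every PTE it zapped,
 * so a count of 1 here means this L1 PTP no longer maps anything;
 * pmap_ept_free_ptp() also unlinks any parent PTPs that become empty
 * as a result.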
*/ 6533 if (ptp && ptp->wire_count <= 1) { 6534 pmap_ept_free_ptp(pmap, ptp, va); 6535 } 6536 } 6537 6538 kpreempt_enable(); 6539 pmap_drain_pv(pmap); 6540 mutex_exit(&pmap->pm_lock); 6541 } 6542 6543 static int 6544 pmap_ept_sync_pv(struct vm_page *ptp, vaddr_t va, paddr_t pa, int clearbits, 6545 uint8_t *oattrs, pt_entry_t *optep) 6546 { 6547 struct pmap *pmap; 6548 pt_entry_t *ptep; 6549 pt_entry_t opte; 6550 pt_entry_t npte; 6551 pt_entry_t expect; 6552 bool need_shootdown; 6553 6554 expect = pmap_pa2pte(pa) | EPT_R; 6555 pmap = ptp_to_pmap(ptp); 6556 6557 if (clearbits != ~0) { 6558 KASSERT((clearbits & ~(PP_ATTRS_D|PP_ATTRS_A|PP_ATTRS_W)) == 0); 6559 clearbits = pmap_pp_attrs_to_ept(clearbits); 6560 } 6561 6562 ptep = pmap_map_pte(pmap, ptp, va); 6563 do { 6564 opte = *ptep; 6565 KASSERT((opte & (EPT_D | EPT_A)) != EPT_D); 6566 KASSERT((opte & (EPT_A | EPT_R)) != EPT_A); 6567 KASSERT(opte == 0 || (opte & EPT_R) != 0); 6568 if ((opte & (PTE_FRAME | EPT_R)) != expect) { 6569 /* 6570 * We lost a race with a V->P operation like 6571 * pmap_remove(). Wait for the competitor 6572 * reflecting pte bits into mp_attrs. 6573 */ 6574 pmap_unmap_pte(); 6575 return EAGAIN; 6576 } 6577 6578 /* 6579 * Check if there's anything to do on this PTE. 6580 */ 6581 if ((opte & clearbits) == 0) { 6582 need_shootdown = false; 6583 break; 6584 } 6585 6586 /* 6587 * We need a shootdown if the PTE is cached (EPT_A) ... 6588 * ... Unless we are clearing only the EPT_W bit and 6589 * it isn't cached as RW (EPT_D). 6590 */ 6591 if (pmap_ept_has_ad) { 6592 need_shootdown = (opte & EPT_A) != 0 && 6593 !(clearbits == EPT_W && (opte & EPT_D) == 0); 6594 } else { 6595 need_shootdown = true; 6596 } 6597 6598 npte = opte & ~clearbits; 6599 6600 /* 6601 * If we need a shootdown anyway, clear EPT_A and EPT_D. 6602 */ 6603 if (need_shootdown) { 6604 npte &= ~(EPT_A | EPT_D); 6605 } 6606 KASSERT((npte & (EPT_D | EPT_A)) != EPT_D); 6607 KASSERT((npte & (EPT_A | EPT_R)) != EPT_A); 6608 KASSERT(npte == 0 || (opte & EPT_R) != 0); 6609 } while (pmap_pte_cas(ptep, opte, npte) != opte); 6610 6611 if (need_shootdown) { 6612 pmap_tlb_shootdown(pmap, va, 0, TLBSHOOT_SYNC_PV); 6613 } 6614 pmap_unmap_pte(); 6615 6616 *oattrs = pmap_ept_to_pp_attrs(opte); 6617 if (optep != NULL) 6618 *optep = opte; 6619 return 0; 6620 } 6621 6622 static void 6623 pmap_ept_pp_remove_ent(struct pmap *pmap, struct vm_page *ptp, pt_entry_t opte, 6624 vaddr_t va) 6625 { 6626 6627 KASSERT(mutex_owned(&pmap->pm_lock)); 6628 6629 pmap_ept_stats_update_bypte(pmap, 0, opte); 6630 ptp->wire_count--; 6631 if (ptp->wire_count <= 1) { 6632 pmap_ept_free_ptp(pmap, ptp, va); 6633 } 6634 } 6635 6636 static void 6637 pmap_ept_write_protect(struct pmap *pmap, vaddr_t sva, vaddr_t eva, vm_prot_t prot) 6638 { 6639 pt_entry_t bit_rem; 6640 pt_entry_t *ptes, *spte; 6641 pt_entry_t opte, npte; 6642 pd_entry_t pde; 6643 paddr_t ptppa; 6644 vaddr_t va; 6645 bool modified; 6646 6647 bit_rem = 0; 6648 if (!(prot & VM_PROT_WRITE)) 6649 bit_rem = EPT_W; 6650 6651 sva &= PTE_FRAME; 6652 eva &= PTE_FRAME; 6653 6654 /* Acquire pmap. 
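 *
 * pm_lock keeps the PTP hierarchy stable while the loop below strips
 * EPT_W from each live PTE in the range (when write permission is
 * being revoked) using a compare-and-swap; a TLB shootdown is issued
 * per page only when the mapping may be cached as RW (EPT_D set, or
 * assumed so when the hardware lacks EPT A/D bits).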
*/ 6655 mutex_enter(&pmap->pm_lock); 6656 kpreempt_disable(); 6657 6658 for (va = sva; va < eva; va += PAGE_SIZE) { 6659 if (pmap_ept_pdes_invalid(pmap, va, &pde)) { 6660 continue; 6661 } 6662 6663 ptppa = pmap_pte2pa(pde); 6664 ptes = (pt_entry_t *)PMAP_DIRECT_MAP(ptppa); 6665 spte = &ptes[pl1_pi(va)]; 6666 6667 do { 6668 opte = *spte; 6669 if (!pmap_ept_valid_entry(opte)) { 6670 goto next; 6671 } 6672 npte = (opte & ~bit_rem); 6673 } while (pmap_pte_cas(spte, opte, npte) != opte); 6674 6675 if (pmap_ept_has_ad) { 6676 modified = (opte & EPT_D) != 0; 6677 } else { 6678 modified = true; 6679 } 6680 if (modified) { 6681 vaddr_t tva = x86_ptob(spte - ptes); 6682 pmap_tlb_shootdown(pmap, tva, 0, 6683 TLBSHOOT_WRITE_PROTECT); 6684 } 6685 next:; 6686 } 6687 6688 kpreempt_enable(); 6689 mutex_exit(&pmap->pm_lock); 6690 } 6691 6692 static void 6693 pmap_ept_unwire(struct pmap *pmap, vaddr_t va) 6694 { 6695 pt_entry_t *ptes, *ptep, opte; 6696 pd_entry_t pde; 6697 paddr_t ptppa; 6698 6699 /* Acquire pmap. */ 6700 mutex_enter(&pmap->pm_lock); 6701 kpreempt_disable(); 6702 6703 if (pmap_ept_pdes_invalid(pmap, va, &pde)) { 6704 panic("%s: invalid PDE va=%#" PRIxVADDR, __func__, va); 6705 } 6706 6707 ptppa = pmap_pte2pa(pde); 6708 ptes = (pt_entry_t *)PMAP_DIRECT_MAP(ptppa); 6709 ptep = &ptes[pl1_pi(va)]; 6710 opte = *ptep; 6711 KASSERT(pmap_ept_valid_entry(opte)); 6712 6713 if (opte & EPT_WIRED) { 6714 pt_entry_t npte = opte & ~EPT_WIRED; 6715 6716 opte = pmap_pte_testset(ptep, npte); 6717 pmap_ept_stats_update_bypte(pmap, npte, opte); 6718 } else { 6719 printf("%s: wiring for pmap %p va %#" PRIxVADDR 6720 "did not change!\n", __func__, pmap, va); 6721 } 6722 6723 /* Release pmap. */ 6724 kpreempt_enable(); 6725 mutex_exit(&pmap->pm_lock); 6726 } 6727 6728 /* -------------------------------------------------------------------------- */ 6729 6730 void 6731 pmap_ept_transform(struct pmap *pmap) 6732 { 6733 pmap->pm_enter = pmap_ept_enter; 6734 pmap->pm_extract = pmap_ept_extract; 6735 pmap->pm_remove = pmap_ept_remove; 6736 pmap->pm_sync_pv = pmap_ept_sync_pv; 6737 pmap->pm_pp_remove_ent = pmap_ept_pp_remove_ent; 6738 pmap->pm_write_protect = pmap_ept_write_protect; 6739 pmap->pm_unwire = pmap_ept_unwire; 6740 6741 memset(PAGE_ALIGNED(pmap->pm_pdir), 0, PAGE_SIZE); 6742 } 6743 6744 #endif /* __HAVE_DIRECT_MAP && __x86_64__ && !XENPV */ 6745
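
/*
 * Usage sketch (illustrative only): a hypervisor backend such as NVMM's
 * VMX driver converts a freshly created guest pmap to the EPT format by
 * calling pmap_ept_transform() once, before any mappings are entered.
 * Assuming the guest vmspace has already been set up, roughly:
 *
 *	struct vmspace *vm = ...;		// guest address space
 *	struct pmap *pmap = vm->vm_map.pmap;
 *
 *	pmap_ept_transform(pmap);
 *	// From here on, pmap_enter(), pmap_remove(), etc. on this pmap
 *	// dispatch to the pmap_ept_* operations installed above, and
 *	// pm_pdir is interpreted as an EPT PML4.
 */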