1 /* $NetBSD: pmap.c,v 1.291 2018/06/20 11:57:22 maxv Exp $ */ 2 3 /* 4 * Copyright (c) 2008, 2010, 2016, 2017 The NetBSD Foundation, Inc. 5 * All rights reserved. 6 * 7 * This code is derived from software contributed to The NetBSD Foundation 8 * by Andrew Doran, and by Maxime Villard. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 29 * POSSIBILITY OF SUCH DAMAGE. 30 */ 31 32 /* 33 * Copyright (c) 2007 Manuel Bouyer. 34 * 35 * Redistribution and use in source and binary forms, with or without 36 * modification, are permitted provided that the following conditions 37 * are met: 38 * 1. Redistributions of source code must retain the above copyright 39 * notice, this list of conditions and the following disclaimer. 40 * 2. Redistributions in binary form must reproduce the above copyright 41 * notice, this list of conditions and the following disclaimer in the 42 * documentation and/or other materials provided with the distribution. 43 * 44 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 45 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 46 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 47 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 48 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 49 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 50 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 51 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 52 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 53 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 54 */ 55 56 /* 57 * Copyright (c) 2006 Mathieu Ropert <mro@adviseo.fr> 58 * 59 * Permission to use, copy, modify, and distribute this software for any 60 * purpose with or without fee is hereby granted, provided that the above 61 * copyright notice and this permission notice appear in all copies. 62 * 63 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 64 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 65 * MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 66 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 67 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 68 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 69 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 70 */ 71 72 /* 73 * Copyright (c) 1997 Charles D. Cranor and Washington University. 74 * All rights reserved. 75 * 76 * Redistribution and use in source and binary forms, with or without 77 * modification, are permitted provided that the following conditions 78 * are met: 79 * 1. Redistributions of source code must retain the above copyright 80 * notice, this list of conditions and the following disclaimer. 81 * 2. Redistributions in binary form must reproduce the above copyright 82 * notice, this list of conditions and the following disclaimer in the 83 * documentation and/or other materials provided with the distribution. 84 * 85 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 86 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 87 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 88 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 89 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 90 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 91 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 92 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 93 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 94 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 95 */ 96 97 /* 98 * Copyright 2001 (c) Wasabi Systems, Inc. 99 * All rights reserved. 100 * 101 * Written by Frank van der Linden for Wasabi Systems, Inc. 102 * 103 * Redistribution and use in source and binary forms, with or without 104 * modification, are permitted provided that the following conditions 105 * are met: 106 * 1. Redistributions of source code must retain the above copyright 107 * notice, this list of conditions and the following disclaimer. 108 * 2. Redistributions in binary form must reproduce the above copyright 109 * notice, this list of conditions and the following disclaimer in the 110 * documentation and/or other materials provided with the distribution. 111 * 3. All advertising materials mentioning features or use of this software 112 * must display the following acknowledgement: 113 * This product includes software developed for the NetBSD Project by 114 * Wasabi Systems, Inc. 115 * 4. The name of Wasabi Systems, Inc. may not be used to endorse 116 * or promote products derived from this software without specific prior 117 * written permission. 118 * 119 * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND 120 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 121 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 122 * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL WASABI SYSTEMS, INC 123 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 124 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 125 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 126 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 127 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 128 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 129 * POSSIBILITY OF SUCH DAMAGE. 130 */ 131 132 /* 133 * This is the i386 pmap modified and generalized to support x86-64 134 * as well. The idea is to hide the upper N levels of the page tables 135 * inside pmap_get_ptp, pmap_free_ptp and pmap_growkernel. The rest 136 * is mostly untouched, except that it uses some more generalized 137 * macros and interfaces. 138 * 139 * This pmap has been tested on the i386 as well, and it can be easily 140 * adapted to PAE. 141 * 142 * fvdl@wasabisystems.com 18-Jun-2001 143 */ 144 145 /* 146 * pmap.c: i386 pmap module rewrite 147 * Chuck Cranor <chuck@netbsd> 148 * 11-Aug-97 149 * 150 * history of this pmap module: in addition to my own input, i used 151 * the following references for this rewrite of the i386 pmap: 152 * 153 * [1] the NetBSD i386 pmap. this pmap appears to be based on the 154 * BSD hp300 pmap done by Mike Hibler at University of Utah. 155 * it was then ported to the i386 by William Jolitz of UUNET 156 * Technologies, Inc. Then Charles M. Hannum of the NetBSD 157 * project fixed some bugs and provided some speed ups. 158 * 159 * [2] the FreeBSD i386 pmap. this pmap seems to be the 160 * Hibler/Jolitz pmap, as modified for FreeBSD by John S. Dyson 161 * and David Greenman. 162 * 163 * [3] the Mach pmap. this pmap, from CMU, seems to have migrated 164 * between several processors. the VAX version was done by 165 * Avadis Tevanian, Jr., and Michael Wayne Young. the i386 166 * version was done by Lance Berc, Mike Kupfer, Bob Baron, 167 * David Golub, and Richard Draves. the alpha version was 168 * done by Alessandro Forin (CMU/Mach) and Chris Demetriou 169 * (NetBSD/alpha). 170 */ 171 172 #include <sys/cdefs.h> 173 __KERNEL_RCSID(0, "$NetBSD: pmap.c,v 1.291 2018/06/20 11:57:22 maxv Exp $"); 174 175 #include "opt_user_ldt.h" 176 #include "opt_lockdebug.h" 177 #include "opt_multiprocessor.h" 178 #include "opt_xen.h" 179 #include "opt_svs.h" 180 181 #include <sys/param.h> 182 #include <sys/systm.h> 183 #include <sys/proc.h> 184 #include <sys/pool.h> 185 #include <sys/kernel.h> 186 #include <sys/atomic.h> 187 #include <sys/cpu.h> 188 #include <sys/intr.h> 189 #include <sys/xcall.h> 190 #include <sys/kcore.h> 191 192 #include <uvm/uvm.h> 193 #include <uvm/pmap/pmap_pvt.h> 194 195 #include <dev/isa/isareg.h> 196 197 #include <machine/specialreg.h> 198 #include <machine/gdt.h> 199 #include <machine/isa_machdep.h> 200 #include <machine/cpuvar.h> 201 #include <machine/cputypes.h> 202 203 #include <x86/pmap.h> 204 #include <x86/pmap_pv.h> 205 206 #include <x86/i82489reg.h> 207 #include <x86/i82489var.h> 208 209 #ifdef XEN 210 #include <xen/xen-public/xen.h> 211 #include <xen/hypervisor.h> 212 #endif 213 214 /* 215 * general info: 216 * 217 * - for an explanation of how the i386 MMU hardware works see 218 * the comments in <machine/pte.h>. 219 * 220 * - for an explanation of the general memory structure used by 221 * this pmap (including the recursive mapping), see the comments 222 * in <machine/pmap.h>. 
223 * 224 * this file contains the code for the "pmap module." the module's 225 * job is to manage the hardware's virtual to physical address mappings. 226 * note that there are two levels of mapping in the VM system: 227 * 228 * [1] the upper layer of the VM system uses vm_map's and vm_map_entry's 229 * to map ranges of virtual address space to objects/files. for 230 * example, the vm_map may say: "map VA 0x1000 to 0x22000 read-only 231 * to the file /bin/ls starting at offset zero." note that 232 * the upper layer mapping is not concerned with how individual 233 * vm_pages are mapped. 234 * 235 * [2] the lower layer of the VM system (the pmap) maintains the mappings 236 * from virtual addresses. it is concerned with which vm_page is 237 * mapped where. for example, when you run /bin/ls and start 238 * at page 0x1000 the fault routine may lookup the correct page 239 * of the /bin/ls file and then ask the pmap layer to establish 240 * a mapping for it. 241 * 242 * note that information in the lower layer of the VM system can be 243 * thrown away since it can easily be reconstructed from the info 244 * in the upper layer. 245 * 246 * data structures we use include: 247 * 248 * - struct pmap: describes the address space of one thread 249 * - struct pmap_page: describes one pv-tracked page, without 250 * necessarily a corresponding vm_page 251 * - struct pv_entry: describes one <PMAP,VA> mapping of a PA 252 * - struct pv_head: there is one pv_head per pv-tracked page of 253 * physical memory. the pv_head points to a list of pv_entry 254 * structures which describe all the <PMAP,VA> pairs that this 255 * page is mapped in. this is critical for page based operations 256 * such as pmap_page_protect() [change protection on _all_ mappings 257 * of a page] 258 */ 259 260 /* 261 * memory allocation 262 * 263 * - there are three data structures that we must dynamically allocate: 264 * 265 * [A] new process' page directory page (PDP) 266 * - plan 1: done at pmap_create() we use 267 * uvm_km_alloc(kernel_map, PAGE_SIZE) [fka kmem_alloc] to do this 268 * allocation. 269 * 270 * if we are low in free physical memory then we sleep in 271 * uvm_km_alloc -- in this case this is ok since we are creating 272 * a new pmap and should not be holding any locks. 273 * 274 * if the kernel is totally out of virtual space 275 * (i.e. uvm_km_alloc returns NULL), then we panic. 276 * 277 * [B] new page tables pages (PTP) 278 * - call uvm_pagealloc() 279 * => success: zero page, add to pm_pdir 280 * => failure: we are out of free vm_pages, let pmap_enter() 281 * tell UVM about it. 282 * 283 * note: for kernel PTPs, we start with NKPTP of them. as we map 284 * kernel memory (at uvm_map time) we check to see if we've grown 285 * the kernel pmap. if so, we call the optional function 286 * pmap_growkernel() to grow the kernel PTPs in advance. 287 * 288 * [C] pv_entry structures 289 */ 290 291 /* 292 * locking 293 * 294 * we have the following locks that we must contend with: 295 * 296 * mutexes: 297 * 298 * - pmap lock (per pmap, part of uvm_object) 299 * this lock protects the fields in the pmap structure including 300 * the non-kernel PDEs in the PDP, and the PTEs. it also locks 301 * in the alternate PTE space (since that is determined by the 302 * entry in the PDP). 303 * 304 * - pvh_lock (per pv_head) 305 * this lock protects the pv_entry list which is chained off the 306 * pv_head structure for a specific pv-tracked PA. it is locked 307 * when traversing the list (e.g. 
adding/removing mappings, 308 * syncing R/M bits, etc.) 309 * 310 * - pmaps_lock 311 * this lock protects the list of active pmaps (headed by "pmaps"). 312 * we lock it when adding or removing pmaps from this list. 313 */ 314 315 const vaddr_t ptp_masks[] = PTP_MASK_INITIALIZER; 316 const int ptp_shifts[] = PTP_SHIFT_INITIALIZER; 317 const long nkptpmax[] = NKPTPMAX_INITIALIZER; 318 const long nbpd[] = NBPD_INITIALIZER; 319 pd_entry_t * const normal_pdes[] = PDES_INITIALIZER; 320 321 long nkptp[] = NKPTP_INITIALIZER; 322 323 struct pmap_head pmaps; 324 kmutex_t pmaps_lock; 325 326 struct pcpu_area *pcpuarea __read_mostly; 327 328 static vaddr_t pmap_maxkvaddr; 329 330 /* 331 * XXX kludge: dummy locking to make KASSERTs in uvm_page.c comfortable. 332 * actual locking is done by pm_lock. 333 */ 334 #if defined(DIAGNOSTIC) 335 #define PMAP_SUBOBJ_LOCK(pm, idx) \ 336 KASSERT(mutex_owned((pm)->pm_lock)); \ 337 if ((idx) != 0) \ 338 mutex_enter((pm)->pm_obj[(idx)].vmobjlock) 339 #define PMAP_SUBOBJ_UNLOCK(pm, idx) \ 340 KASSERT(mutex_owned((pm)->pm_lock)); \ 341 if ((idx) != 0) \ 342 mutex_exit((pm)->pm_obj[(idx)].vmobjlock) 343 #else /* defined(DIAGNOSTIC) */ 344 #define PMAP_SUBOBJ_LOCK(pm, idx) /* nothing */ 345 #define PMAP_SUBOBJ_UNLOCK(pm, idx) /* nothing */ 346 #endif /* defined(DIAGNOSTIC) */ 347 348 /* 349 * Misc. event counters. 350 */ 351 struct evcnt pmap_iobmp_evcnt; 352 struct evcnt pmap_ldt_evcnt; 353 354 /* 355 * PAT 356 */ 357 #define PATENTRY(n, type) (type << ((n) * 8)) 358 #define PAT_UC 0x0ULL 359 #define PAT_WC 0x1ULL 360 #define PAT_WT 0x4ULL 361 #define PAT_WP 0x5ULL 362 #define PAT_WB 0x6ULL 363 #define PAT_UCMINUS 0x7ULL 364 365 static bool cpu_pat_enabled __read_mostly = false; 366 367 /* 368 * Global data structures 369 */ 370 371 static struct pmap kernel_pmap_store; /* the kernel's pmap (proc0) */ 372 struct pmap *const kernel_pmap_ptr = &kernel_pmap_store; 373 374 struct bootspace bootspace __read_mostly; 375 376 /* 377 * pmap_pg_nx: if our processor supports PG_NX in the PTE then we 378 * set pmap_pg_nx to PG_NX (otherwise it is zero). 379 */ 380 pd_entry_t pmap_pg_nx __read_mostly = 0; 381 382 /* 383 * pmap_pg_g: if our processor supports PG_G in the PTE then we 384 * set pmap_pg_g to PG_G (otherwise it is zero). 385 */ 386 pd_entry_t pmap_pg_g __read_mostly = 0; 387 388 /* 389 * pmap_largepages: if our processor supports PG_PS and we are 390 * using it, this is set to true. 391 */ 392 int pmap_largepages __read_mostly = 0; 393 394 /* 395 * i386 physical memory comes in a big contig chunk with a small 396 * hole toward the front of it... the following two paddr_t's 397 * (shared with machdep.c) describe the physical address space 398 * of this machine. 
399 */ 400 paddr_t lowmem_rsvd __read_mostly; 401 paddr_t avail_start __read_mostly; /* PA of first available physical page */ 402 paddr_t avail_end __read_mostly; /* PA of last available physical page */ 403 404 #ifdef XEN 405 paddr_t pmap_pa_start; /* PA of first physical page for this domain */ 406 paddr_t pmap_pa_end; /* PA of last physical page for this domain */ 407 #endif 408 409 #define VM_PAGE_TO_PP(pg) (&(pg)->mdpage.mp_pp) 410 411 #define PV_HASH_SIZE 32768 412 #define PV_HASH_LOCK_CNT 32 413 414 struct pv_hash_lock { 415 kmutex_t lock; 416 } __aligned(CACHE_LINE_SIZE) pv_hash_locks[PV_HASH_LOCK_CNT] 417 __aligned(CACHE_LINE_SIZE); 418 419 struct pv_hash_head { 420 SLIST_HEAD(, pv_entry) hh_list; 421 } pv_hash_heads[PV_HASH_SIZE]; 422 423 static u_int 424 pvhash_hash(struct vm_page *ptp, vaddr_t va) 425 { 426 427 return (uintptr_t)ptp / sizeof(*ptp) + (va >> PAGE_SHIFT); 428 } 429 430 static struct pv_hash_head * 431 pvhash_head(u_int hash) 432 { 433 434 return &pv_hash_heads[hash % PV_HASH_SIZE]; 435 } 436 437 static kmutex_t * 438 pvhash_lock(u_int hash) 439 { 440 441 return &pv_hash_locks[hash % PV_HASH_LOCK_CNT].lock; 442 } 443 444 static struct pv_entry * 445 pvhash_remove(struct pv_hash_head *hh, struct vm_page *ptp, vaddr_t va) 446 { 447 struct pv_entry *pve; 448 struct pv_entry *prev; 449 450 prev = NULL; 451 SLIST_FOREACH(pve, &hh->hh_list, pve_hash) { 452 if (pve->pve_pte.pte_ptp == ptp && 453 pve->pve_pte.pte_va == va) { 454 if (prev != NULL) { 455 SLIST_REMOVE_AFTER(prev, pve_hash); 456 } else { 457 SLIST_REMOVE_HEAD(&hh->hh_list, pve_hash); 458 } 459 break; 460 } 461 prev = pve; 462 } 463 return pve; 464 } 465 466 /* 467 * Other data structures 468 */ 469 470 static pt_entry_t protection_codes[8] __read_mostly; 471 472 static bool pmap_initialized __read_mostly = false; /* pmap_init done yet? */ 473 474 /* 475 * The following two vaddr_t's are used during system startup to keep track of 476 * how much of the kernel's VM space we have used. Once the system is started, 477 * the management of the remaining kernel VM space is turned over to the 478 * kernel_map vm_map. 479 */ 480 static vaddr_t virtual_avail __read_mostly; /* VA of first free KVA */ 481 static vaddr_t virtual_end __read_mostly; /* VA of last free KVA */ 482 483 #ifndef XEN 484 /* 485 * LAPIC virtual address, and fake physical address. 
486 */ 487 volatile vaddr_t local_apic_va __read_mostly; 488 paddr_t local_apic_pa __read_mostly; 489 #endif 490 491 /* 492 * pool that pmap structures are allocated from 493 */ 494 static struct pool_cache pmap_cache; 495 496 /* 497 * pv_entry cache 498 */ 499 static struct pool_cache pmap_pv_cache; 500 501 #ifdef __HAVE_DIRECT_MAP 502 vaddr_t pmap_direct_base __read_mostly; 503 vaddr_t pmap_direct_end __read_mostly; 504 size_t pmap_direct_pdpe __read_mostly; 505 size_t pmap_direct_npdp __read_mostly; 506 #endif 507 508 #ifndef __HAVE_DIRECT_MAP 509 /* 510 * Special VAs and the PTEs that map them 511 */ 512 static pt_entry_t *early_zero_pte; 513 static void pmap_vpage_cpualloc(struct cpu_info *); 514 #ifdef XEN 515 char *early_zerop; /* also referenced from xen_locore() */ 516 #else 517 static char *early_zerop; 518 #endif 519 #endif 520 521 int pmap_enter_default(pmap_t, vaddr_t, paddr_t, vm_prot_t, u_int); 522 523 /* PDP pool_cache(9) and its callbacks */ 524 struct pool_cache pmap_pdp_cache; 525 static int pmap_pdp_ctor(void *, void *, int); 526 static void pmap_pdp_dtor(void *, void *); 527 #ifdef PAE 528 /* need to allocate items of 4 pages */ 529 static void *pmap_pdp_alloc(struct pool *, int); 530 static void pmap_pdp_free(struct pool *, void *); 531 static struct pool_allocator pmap_pdp_allocator = { 532 .pa_alloc = pmap_pdp_alloc, 533 .pa_free = pmap_pdp_free, 534 .pa_pagesz = PAGE_SIZE * PDP_SIZE, 535 }; 536 #endif /* PAE */ 537 538 extern vaddr_t idt_vaddr; 539 extern paddr_t idt_paddr; 540 extern vaddr_t gdt_vaddr; 541 extern paddr_t gdt_paddr; 542 extern vaddr_t ldt_vaddr; 543 extern paddr_t ldt_paddr; 544 545 extern int end; 546 547 #ifdef i386 548 /* stuff to fix the pentium f00f bug */ 549 extern vaddr_t pentium_idt_vaddr; 550 #endif 551 552 /* 553 * Local prototypes 554 */ 555 556 #ifdef __HAVE_PCPU_AREA 557 static void pmap_init_pcpu(void); 558 #endif 559 #ifdef __HAVE_DIRECT_MAP 560 static void pmap_init_directmap(struct pmap *); 561 #endif 562 #if !defined(XEN) 563 static void pmap_remap_global(void); 564 #endif 565 #ifndef XEN 566 static void pmap_init_lapic(void); 567 static void pmap_remap_largepages(void); 568 #endif 569 570 static struct vm_page *pmap_get_ptp(struct pmap *, vaddr_t, 571 pd_entry_t * const *, int); 572 static struct vm_page *pmap_find_ptp(struct pmap *, vaddr_t, paddr_t, int); 573 static void pmap_freepage(struct pmap *, struct vm_page *, int); 574 static void pmap_free_ptp(struct pmap *, struct vm_page *, vaddr_t, 575 pt_entry_t *, pd_entry_t * const *); 576 static bool pmap_remove_pte(struct pmap *, struct vm_page *, pt_entry_t *, 577 vaddr_t, struct pv_entry **); 578 static void pmap_remove_ptes(struct pmap *, struct vm_page *, vaddr_t, vaddr_t, 579 vaddr_t, struct pv_entry **); 580 581 static paddr_t pmap_get_physpage(void); 582 static void pmap_alloc_level(struct pmap *, vaddr_t, long *); 583 584 static void pmap_reactivate(struct pmap *); 585 586 /* 587 * p m a p h e l p e r f u n c t i o n s 588 */ 589 590 static inline void 591 pmap_stats_update(struct pmap *pmap, int resid_diff, int wired_diff) 592 { 593 594 if (pmap == pmap_kernel()) { 595 atomic_add_long(&pmap->pm_stats.resident_count, resid_diff); 596 atomic_add_long(&pmap->pm_stats.wired_count, wired_diff); 597 } else { 598 KASSERT(mutex_owned(pmap->pm_lock)); 599 pmap->pm_stats.resident_count += resid_diff; 600 pmap->pm_stats.wired_count += wired_diff; 601 } 602 } 603 604 static inline void 605 pmap_stats_update_bypte(struct pmap *pmap, pt_entry_t npte, pt_entry_t opte) 606 { 607 int 
resid_diff = ((npte & PG_V) ? 1 : 0) - ((opte & PG_V) ? 1 : 0); 608 int wired_diff = ((npte & PG_W) ? 1 : 0) - ((opte & PG_W) ? 1 : 0); 609 610 KASSERT((npte & (PG_V | PG_W)) != PG_W); 611 KASSERT((opte & (PG_V | PG_W)) != PG_W); 612 613 pmap_stats_update(pmap, resid_diff, wired_diff); 614 } 615 616 /* 617 * ptp_to_pmap: lookup pmap by ptp 618 */ 619 620 static struct pmap * 621 ptp_to_pmap(struct vm_page *ptp) 622 { 623 struct pmap *pmap; 624 625 if (ptp == NULL) { 626 return pmap_kernel(); 627 } 628 pmap = (struct pmap *)ptp->uobject; 629 KASSERT(pmap != NULL); 630 KASSERT(&pmap->pm_obj[0] == ptp->uobject); 631 return pmap; 632 } 633 634 static inline struct pv_pte * 635 pve_to_pvpte(struct pv_entry *pve) 636 { 637 638 KASSERT((void *)&pve->pve_pte == (void *)pve); 639 return &pve->pve_pte; 640 } 641 642 static inline struct pv_entry * 643 pvpte_to_pve(struct pv_pte *pvpte) 644 { 645 struct pv_entry *pve = (void *)pvpte; 646 647 KASSERT(pve_to_pvpte(pve) == pvpte); 648 return pve; 649 } 650 651 /* 652 * pv_pte_first, pv_pte_next: PV list iterator. 653 */ 654 655 static struct pv_pte * 656 pv_pte_first(struct pmap_page *pp) 657 { 658 659 if ((pp->pp_flags & PP_EMBEDDED) != 0) { 660 return &pp->pp_pte; 661 } 662 return pve_to_pvpte(LIST_FIRST(&pp->pp_head.pvh_list)); 663 } 664 665 static struct pv_pte * 666 pv_pte_next(struct pmap_page *pp, struct pv_pte *pvpte) 667 { 668 669 KASSERT(pvpte != NULL); 670 if (pvpte == &pp->pp_pte) { 671 KASSERT((pp->pp_flags & PP_EMBEDDED) != 0); 672 return NULL; 673 } 674 KASSERT((pp->pp_flags & PP_EMBEDDED) == 0); 675 return pve_to_pvpte(LIST_NEXT(pvpte_to_pve(pvpte), pve_list)); 676 } 677 678 /* 679 * pmap_is_curpmap: is this pmap the one currently loaded [in %cr3]? 680 * of course the kernel is always loaded 681 */ 682 683 bool 684 pmap_is_curpmap(struct pmap *pmap) 685 { 686 return((pmap == pmap_kernel()) || 687 (pmap == curcpu()->ci_pmap)); 688 } 689 690 /* 691 * Add a reference to the specified pmap. 692 */ 693 694 void 695 pmap_reference(struct pmap *pmap) 696 { 697 698 atomic_inc_uint(&pmap->pm_obj[0].uo_refs); 699 } 700 701 /* 702 * pmap_map_ptes: map a pmap's PTEs into KVM and lock them in 703 * 704 * there are several pmaps involved. some or all of them might be same. 705 * 706 * - the pmap given by the first argument 707 * our caller wants to access this pmap's PTEs. 708 * 709 * - pmap_kernel() 710 * the kernel pmap. note that it only contains the kernel part 711 * of the address space which is shared by any pmap. ie. any 712 * pmap can be used instead of pmap_kernel() for our purpose. 713 * 714 * - ci->ci_pmap 715 * pmap currently loaded on the cpu. 716 * 717 * - vm_map_pmap(&curproc->p_vmspace->vm_map) 718 * current process' pmap. 719 * 720 * => we lock enough pmaps to keep things locked in 721 * => must be undone with pmap_unmap_ptes before returning 722 */ 723 724 void 725 pmap_map_ptes(struct pmap *pmap, struct pmap **pmap2, 726 pd_entry_t **ptepp, pd_entry_t * const **pdeppp) 727 { 728 struct pmap *curpmap; 729 struct cpu_info *ci; 730 lwp_t *l; 731 732 /* The kernel's pmap is always accessible. */ 733 if (pmap == pmap_kernel()) { 734 *pmap2 = NULL; 735 *ptepp = PTE_BASE; 736 *pdeppp = normal_pdes; 737 return; 738 } 739 KASSERT(kpreempt_disabled()); 740 741 l = curlwp; 742 retry: 743 mutex_enter(pmap->pm_lock); 744 ci = curcpu(); 745 curpmap = ci->ci_pmap; 746 if (vm_map_pmap(&l->l_proc->p_vmspace->vm_map) == pmap) { 747 /* Our own pmap so just load it: easy. 
*/ 748 if (__predict_false(ci->ci_want_pmapload)) { 749 mutex_exit(pmap->pm_lock); 750 pmap_load(); 751 goto retry; 752 } 753 KASSERT(pmap == curpmap); 754 } else if (pmap == curpmap) { 755 /* 756 * Already on the CPU: make it valid. This is very 757 * often the case during exit(), when we have switched 758 * to the kernel pmap in order to destroy a user pmap. 759 */ 760 pmap_reactivate(pmap); 761 } else { 762 /* 763 * Toss current pmap from CPU, but keep a reference to it. 764 * The reference will be dropped by pmap_unmap_ptes(). 765 * Can happen if we block during exit(). 766 */ 767 const cpuid_t cid = cpu_index(ci); 768 769 kcpuset_atomic_clear(curpmap->pm_cpus, cid); 770 kcpuset_atomic_clear(curpmap->pm_kernel_cpus, cid); 771 ci->ci_pmap = pmap; 772 ci->ci_tlbstate = TLBSTATE_VALID; 773 kcpuset_atomic_set(pmap->pm_cpus, cid); 774 kcpuset_atomic_set(pmap->pm_kernel_cpus, cid); 775 cpu_load_pmap(pmap, curpmap); 776 } 777 pmap->pm_ncsw = l->l_ncsw; 778 *pmap2 = curpmap; 779 *ptepp = PTE_BASE; 780 781 #if defined(XEN) && defined(__x86_64__) 782 KASSERT(ci->ci_normal_pdes[PTP_LEVELS - 2] == L4_BASE); 783 ci->ci_normal_pdes[PTP_LEVELS - 2] = pmap->pm_pdir; 784 *pdeppp = ci->ci_normal_pdes; 785 #else 786 *pdeppp = normal_pdes; 787 #endif 788 } 789 790 /* 791 * pmap_unmap_ptes: unlock the PTE mapping of "pmap" 792 */ 793 794 void 795 pmap_unmap_ptes(struct pmap *pmap, struct pmap *pmap2) 796 { 797 struct cpu_info *ci; 798 struct pmap *mypmap; 799 800 KASSERT(kpreempt_disabled()); 801 802 /* The kernel's pmap is always accessible. */ 803 if (pmap == pmap_kernel()) { 804 return; 805 } 806 807 ci = curcpu(); 808 809 #if defined(XEN) && defined(__x86_64__) 810 KASSERT(ci->ci_normal_pdes[PTP_LEVELS - 2] != L4_BASE); 811 ci->ci_normal_pdes[PTP_LEVELS - 2] = L4_BASE; 812 #endif 813 814 /* 815 * We cannot tolerate context switches while mapped in. 816 * If it is our own pmap all we have to do is unlock. 817 */ 818 KASSERT(pmap->pm_ncsw == curlwp->l_ncsw); 819 mypmap = vm_map_pmap(&curproc->p_vmspace->vm_map); 820 if (pmap == mypmap) { 821 mutex_exit(pmap->pm_lock); 822 return; 823 } 824 825 /* 826 * Mark whatever's on the CPU now as lazy and unlock. 827 * If the pmap was already installed, we are done. 828 */ 829 ci->ci_tlbstate = TLBSTATE_LAZY; 830 ci->ci_want_pmapload = (mypmap != pmap_kernel()); 831 mutex_exit(pmap->pm_lock); 832 if (pmap == pmap2) { 833 return; 834 } 835 836 /* 837 * We installed another pmap on the CPU. Grab a reference to 838 * it and leave in place. Toss the evicted pmap (can block). 839 */ 840 pmap_reference(pmap); 841 pmap_destroy(pmap2); 842 } 843 844 845 inline static void 846 pmap_exec_account(struct pmap *pm, vaddr_t va, pt_entry_t opte, pt_entry_t npte) 847 { 848 849 #if !defined(__x86_64__) 850 if (curproc == NULL || curproc->p_vmspace == NULL || 851 pm != vm_map_pmap(&curproc->p_vmspace->vm_map)) 852 return; 853 854 if ((opte ^ npte) & PG_X) 855 pmap_update_pg(va); 856 857 /* 858 * Executability was removed on the last executable change. 859 * Reset the code segment to something conservative and 860 * let the trap handler deal with setting the right limit. 861 * We can't do that because of locking constraints on the vm map. 
862 */ 863 864 if ((opte & PG_X) && (npte & PG_X) == 0 && va == pm->pm_hiexec) { 865 struct trapframe *tf = curlwp->l_md.md_regs; 866 867 tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL); 868 pm->pm_hiexec = I386_MAX_EXE_ADDR; 869 } 870 #endif /* !defined(__x86_64__) */ 871 } 872 873 #if !defined(__x86_64__) 874 /* 875 * Fixup the code segment to cover all potential executable mappings. 876 * returns 0 if no changes to the code segment were made. 877 */ 878 879 int 880 pmap_exec_fixup(struct vm_map *map, struct trapframe *tf, struct pcb *pcb) 881 { 882 struct vm_map_entry *ent; 883 struct pmap *pm = vm_map_pmap(map); 884 vaddr_t va = 0; 885 886 vm_map_lock_read(map); 887 for (ent = (&map->header)->next; ent != &map->header; ent = ent->next) { 888 889 /* 890 * This entry has greater va than the entries before. 891 * We need to make it point to the last page, not past it. 892 */ 893 894 if (ent->protection & VM_PROT_EXECUTE) 895 va = trunc_page(ent->end) - PAGE_SIZE; 896 } 897 vm_map_unlock_read(map); 898 if (va == pm->pm_hiexec && tf->tf_cs == GSEL(GUCODEBIG_SEL, SEL_UPL)) 899 return (0); 900 901 pm->pm_hiexec = va; 902 if (pm->pm_hiexec > I386_MAX_EXE_ADDR) { 903 tf->tf_cs = GSEL(GUCODEBIG_SEL, SEL_UPL); 904 } else { 905 tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL); 906 return (0); 907 } 908 return (1); 909 } 910 #endif /* !defined(__x86_64__) */ 911 912 void 913 pat_init(struct cpu_info *ci) 914 { 915 uint64_t pat; 916 917 if (!(ci->ci_feat_val[0] & CPUID_PAT)) 918 return; 919 920 /* We change WT to WC. Leave all other entries the default values. */ 921 pat = PATENTRY(0, PAT_WB) | PATENTRY(1, PAT_WC) | 922 PATENTRY(2, PAT_UCMINUS) | PATENTRY(3, PAT_UC) | 923 PATENTRY(4, PAT_WB) | PATENTRY(5, PAT_WC) | 924 PATENTRY(6, PAT_UCMINUS) | PATENTRY(7, PAT_UC); 925 926 wrmsr(MSR_CR_PAT, pat); 927 cpu_pat_enabled = true; 928 aprint_debug_dev(ci->ci_dev, "PAT enabled\n"); 929 } 930 931 static pt_entry_t 932 pmap_pat_flags(u_int flags) 933 { 934 u_int cacheflags = (flags & PMAP_CACHE_MASK); 935 936 if (!cpu_pat_enabled) { 937 switch (cacheflags) { 938 case PMAP_NOCACHE: 939 case PMAP_NOCACHE_OVR: 940 /* results in PGC_UCMINUS on cpus which have 941 * the cpuid PAT but PAT "disabled" 942 */ 943 return PG_N; 944 default: 945 return 0; 946 } 947 } 948 949 switch (cacheflags) { 950 case PMAP_NOCACHE: 951 return PGC_UC; 952 case PMAP_WRITE_COMBINE: 953 return PGC_WC; 954 case PMAP_WRITE_BACK: 955 return PGC_WB; 956 case PMAP_NOCACHE_OVR: 957 return PGC_UCMINUS; 958 } 959 960 return 0; 961 } 962 963 /* 964 * p m a p k e n t e r f u n c t i o n s 965 * 966 * functions to quickly enter/remove pages from the kernel address 967 * space. pmap_kremove is exported to MI kernel. we make use of 968 * the recursive PTE mappings. 
969 */ 970 971 /* 972 * pmap_kenter_pa: enter a kernel mapping without R/M (pv_entry) tracking 973 * 974 * => no need to lock anything, assume va is already allocated 975 * => should be faster than normal pmap enter function 976 */ 977 978 void 979 pmap_kenter_pa(vaddr_t va, paddr_t pa, vm_prot_t prot, u_int flags) 980 { 981 pt_entry_t *pte, opte, npte; 982 983 KASSERT(!(prot & ~VM_PROT_ALL)); 984 985 if (va < VM_MIN_KERNEL_ADDRESS) 986 pte = vtopte(va); 987 else 988 pte = kvtopte(va); 989 #ifdef DOM0OPS 990 if (pa < pmap_pa_start || pa >= pmap_pa_end) { 991 #ifdef DEBUG 992 printf_nolog("%s: pa %#" PRIxPADDR " for va %#" PRIxVADDR 993 " outside range\n", __func__, pa, va); 994 #endif /* DEBUG */ 995 npte = pa; 996 } else 997 #endif /* DOM0OPS */ 998 npte = pmap_pa2pte(pa); 999 npte |= protection_codes[prot] | PG_V | pmap_pg_g; 1000 npte |= pmap_pat_flags(flags); 1001 opte = pmap_pte_testset(pte, npte); /* zap! */ 1002 1003 /* 1004 * XXX: make sure we are not dealing with a large page, since the only 1005 * large pages created are for the kernel image, and they should never 1006 * be kentered. 1007 */ 1008 KASSERTMSG(!(opte & PG_PS), "PG_PS va=%#"PRIxVADDR, va); 1009 1010 if ((opte & (PG_V | PG_U)) == (PG_V | PG_U)) { 1011 /* This should not happen. */ 1012 printf_nolog("%s: mapping already present\n", __func__); 1013 kpreempt_disable(); 1014 pmap_tlb_shootdown(pmap_kernel(), va, opte, TLBSHOOT_KENTER); 1015 kpreempt_enable(); 1016 } 1017 } 1018 1019 __strict_weak_alias(pmap_kenter_ma, pmap_kenter_pa); 1020 1021 #if defined(__x86_64__) 1022 /* 1023 * Change protection for a virtual address. Local for a CPU only, don't 1024 * care about TLB shootdowns. 1025 * 1026 * => must be called with preemption disabled 1027 */ 1028 void 1029 pmap_changeprot_local(vaddr_t va, vm_prot_t prot) 1030 { 1031 pt_entry_t *pte, opte, npte; 1032 1033 KASSERT(kpreempt_disabled()); 1034 1035 if (va < VM_MIN_KERNEL_ADDRESS) 1036 pte = vtopte(va); 1037 else 1038 pte = kvtopte(va); 1039 1040 npte = opte = *pte; 1041 1042 if ((prot & VM_PROT_WRITE) != 0) 1043 npte |= PG_RW; 1044 else 1045 npte &= ~PG_RW; 1046 1047 if (opte != npte) { 1048 pmap_pte_set(pte, npte); 1049 pmap_pte_flush(); 1050 invlpg(va); 1051 } 1052 } 1053 #endif /* defined(__x86_64__) */ 1054 1055 /* 1056 * pmap_kremove: remove a kernel mapping(s) without R/M (pv_entry) tracking 1057 * 1058 * => no need to lock anything 1059 * => caller must dispose of any vm_page mapped in the va range 1060 * => note: not an inline function 1061 * => we assume the va is page aligned and the len is a multiple of PAGE_SIZE 1062 * => we assume kernel only unmaps valid addresses and thus don't bother 1063 * checking the valid bit before doing TLB flushing 1064 * => must be followed by call to pmap_update() before reuse of page 1065 */ 1066 1067 static inline void 1068 pmap_kremove1(vaddr_t sva, vsize_t len, bool localonly) 1069 { 1070 pt_entry_t *pte, opte; 1071 vaddr_t va, eva; 1072 1073 eva = sva + len; 1074 1075 kpreempt_disable(); 1076 for (va = sva; va < eva; va += PAGE_SIZE) { 1077 pte = kvtopte(va); 1078 opte = pmap_pte_testset(pte, 0); /* zap! 
*/ 1079 if ((opte & (PG_V | PG_U)) == (PG_V | PG_U) && !localonly) { 1080 pmap_tlb_shootdown(pmap_kernel(), va, opte, 1081 TLBSHOOT_KREMOVE); 1082 } 1083 KASSERTMSG((opte & PG_PS) == 0, 1084 "va %#" PRIxVADDR " is a large page", va); 1085 KASSERTMSG((opte & PG_PVLIST) == 0, 1086 "va %#" PRIxVADDR " is a pv tracked page", va); 1087 } 1088 if (localonly) { 1089 tlbflushg(); 1090 } 1091 kpreempt_enable(); 1092 } 1093 1094 void 1095 pmap_kremove(vaddr_t sva, vsize_t len) 1096 { 1097 1098 pmap_kremove1(sva, len, false); 1099 } 1100 1101 /* 1102 * pmap_kremove_local: like pmap_kremove(), but only worry about 1103 * TLB invalidations on the current CPU. this is only intended 1104 * for use while writing kernel crash dumps, either after panic 1105 * or via reboot -d. 1106 */ 1107 1108 void 1109 pmap_kremove_local(vaddr_t sva, vsize_t len) 1110 { 1111 1112 pmap_kremove1(sva, len, true); 1113 } 1114 1115 /* 1116 * p m a p i n i t f u n c t i o n s 1117 * 1118 * pmap_bootstrap and pmap_init are called during system startup 1119 * to init the pmap module. pmap_bootstrap() does a low level 1120 * init just to get things rolling. pmap_init() finishes the job. 1121 */ 1122 1123 /* 1124 * pmap_bootstrap_valloc: allocate a virtual address in the bootstrap area. 1125 * This function is to be used before any VM system has been set up. 1126 * 1127 * The va is taken from virtual_avail. 1128 */ 1129 static vaddr_t 1130 pmap_bootstrap_valloc(size_t npages) 1131 { 1132 vaddr_t va = virtual_avail; 1133 virtual_avail += npages * PAGE_SIZE; 1134 return va; 1135 } 1136 1137 /* 1138 * pmap_bootstrap_palloc: allocate a physical address in the bootstrap area. 1139 * This function is to be used before any VM system has been set up. 1140 * 1141 * The pa is taken from avail_start. 1142 */ 1143 static paddr_t 1144 pmap_bootstrap_palloc(size_t npages) 1145 { 1146 paddr_t pa = avail_start; 1147 avail_start += npages * PAGE_SIZE; 1148 return pa; 1149 } 1150 1151 /* 1152 * pmap_bootstrap: get the system in a state where it can run with VM properly 1153 * enabled (called before main()). The VM system is fully init'd later. 1154 * 1155 * => on i386, locore.S has already enabled the MMU by allocating a PDP for the 1156 * kernel, and nkpde PTP's for the kernel. 1157 * => kva_start is the first free virtual address in kernel space. 1158 */ 1159 void 1160 pmap_bootstrap(vaddr_t kva_start) 1161 { 1162 struct pmap *kpm; 1163 int i; 1164 vaddr_t kva; 1165 1166 pmap_pg_nx = (cpu_feature[2] & CPUID_NOX ? PG_NX : 0); 1167 1168 /* 1169 * Set up our local static global vars that keep track of the usage of 1170 * KVM before kernel_map is set up. 1171 */ 1172 virtual_avail = kva_start; /* first free KVA */ 1173 virtual_end = VM_MAX_KERNEL_ADDRESS; /* last KVA */ 1174 1175 /* 1176 * Set up protection_codes: we need to be able to convert from a MI 1177 * protection code (some combo of VM_PROT...) to something we can jam 1178 * into a x86 PTE. 1179 */ 1180 protection_codes[VM_PROT_NONE] = pmap_pg_nx; 1181 protection_codes[VM_PROT_EXECUTE] = PG_RO | PG_X; 1182 protection_codes[VM_PROT_READ] = PG_RO | pmap_pg_nx; 1183 protection_codes[VM_PROT_READ|VM_PROT_EXECUTE] = PG_RO | PG_X; 1184 protection_codes[VM_PROT_WRITE] = PG_RW | pmap_pg_nx; 1185 protection_codes[VM_PROT_WRITE|VM_PROT_EXECUTE] = PG_RW | PG_X; 1186 protection_codes[VM_PROT_WRITE|VM_PROT_READ] = PG_RW | pmap_pg_nx; 1187 protection_codes[VM_PROT_ALL] = PG_RW | PG_X; 1188 1189 /* 1190 * Now we init the kernel's pmap. 1191 * 1192 * The kernel pmap's pm_obj is not used for much. 
However, in user pmaps 1193 * the pm_obj contains the list of active PTPs. 1194 * 1195 * The pm_obj currently does not have a pager. It might be possible to 1196 * add a pager that would allow a process to read-only mmap its own page 1197 * tables (fast user-level vtophys?). This may or may not be useful. 1198 */ 1199 kpm = pmap_kernel(); 1200 for (i = 0; i < PTP_LEVELS - 1; i++) { 1201 mutex_init(&kpm->pm_obj_lock[i], MUTEX_DEFAULT, IPL_NONE); 1202 uvm_obj_init(&kpm->pm_obj[i], NULL, false, 1); 1203 uvm_obj_setlock(&kpm->pm_obj[i], &kpm->pm_obj_lock[i]); 1204 kpm->pm_ptphint[i] = NULL; 1205 } 1206 memset(&kpm->pm_list, 0, sizeof(kpm->pm_list)); /* pm_list not used */ 1207 1208 kpm->pm_pdir = (pd_entry_t *)bootspace.pdir; 1209 for (i = 0; i < PDP_SIZE; i++) 1210 kpm->pm_pdirpa[i] = PDPpaddr + PAGE_SIZE * i; 1211 1212 kpm->pm_stats.wired_count = kpm->pm_stats.resident_count = 1213 x86_btop(kva_start - VM_MIN_KERNEL_ADDRESS); 1214 1215 kcpuset_create(&kpm->pm_cpus, true); 1216 kcpuset_create(&kpm->pm_kernel_cpus, true); 1217 1218 kpm->pm_ldt = NULL; 1219 kpm->pm_ldt_len = 0; 1220 kpm->pm_ldt_sel = GSYSSEL(GLDT_SEL, SEL_KPL); 1221 1222 /* 1223 * the above is just a rough estimate and not critical to the proper 1224 * operation of the system. 1225 */ 1226 1227 #if !defined(XEN) 1228 /* 1229 * Begin to enable global TLB entries if they are supported. 1230 * The G bit has no effect until the CR4_PGE bit is set in CR4, 1231 * which happens in cpu_init(), which is run on each cpu 1232 * (and happens later) 1233 */ 1234 if (cpu_feature[0] & CPUID_PGE) { 1235 pmap_pg_g = PG_G; /* enable software */ 1236 1237 /* add PG_G attribute to already mapped kernel pages */ 1238 pmap_remap_global(); 1239 } 1240 #endif 1241 1242 #ifndef XEN 1243 /* 1244 * Enable large pages if they are supported. 1245 */ 1246 if (cpu_feature[0] & CPUID_PSE) { 1247 lcr4(rcr4() | CR4_PSE); /* enable hardware (via %cr4) */ 1248 pmap_largepages = 1; /* enable software */ 1249 1250 /* 1251 * The TLB must be flushed after enabling large pages on Pentium 1252 * CPUs, according to section 3.6.2.2 of "Intel Architecture 1253 * Software Developer's Manual, Volume 3: System Programming". 1254 */ 1255 tlbflushg(); 1256 1257 /* Remap the kernel. */ 1258 pmap_remap_largepages(); 1259 } 1260 pmap_init_lapic(); 1261 #endif /* !XEN */ 1262 1263 #ifdef __HAVE_PCPU_AREA 1264 pmap_init_pcpu(); 1265 #endif 1266 1267 #ifdef __HAVE_DIRECT_MAP 1268 pmap_init_directmap(kpm); 1269 #else 1270 pmap_vpage_cpualloc(&cpu_info_primary); 1271 1272 if (VM_MIN_KERNEL_ADDRESS == KERNBASE) { /* i386 */ 1273 early_zerop = (void *)cpu_info_primary.vpage[VPAGE_ZER]; 1274 early_zero_pte = cpu_info_primary.vpage_pte[VPAGE_ZER]; 1275 } else { /* amd64 */ 1276 /* 1277 * zero_pte is stuck at the end of mapped space for the kernel 1278 * image (disjunct from kva space). This is done so that it 1279 * can safely be used in pmap_growkernel (pmap_get_physpage), 1280 * when it's called for the first time. 1281 * XXXfvdl fix this for MULTIPROCESSOR later. 1282 */ 1283 #ifdef XEN 1284 /* early_zerop initialized in xen_locore() */ 1285 #else 1286 early_zerop = (void *)bootspace.spareva; 1287 #endif 1288 early_zero_pte = PTE_BASE + pl1_i((vaddr_t)early_zerop); 1289 } 1290 #endif 1291 1292 #if defined(XEN) && defined(__x86_64__) 1293 extern vaddr_t xen_dummy_page; 1294 paddr_t xen_dummy_user_pgd; 1295 1296 /* 1297 * We want a dummy page directory for Xen: when deactivating a pmap, 1298 * Xen will still consider it active. 
So we set user PGD to this one 1299 * to lift all protection on the now inactive page tables set. 1300 */ 1301 xen_dummy_user_pgd = xen_dummy_page - KERNBASE; 1302 1303 /* Zero fill it, the less checks in Xen it requires the better */ 1304 memset((void *)(xen_dummy_user_pgd + KERNBASE), 0, PAGE_SIZE); 1305 /* Mark read-only */ 1306 HYPERVISOR_update_va_mapping(xen_dummy_user_pgd + KERNBASE, 1307 pmap_pa2pte(xen_dummy_user_pgd) | PG_V | pmap_pg_nx, 1308 UVMF_INVLPG); 1309 /* Pin as L4 */ 1310 xpq_queue_pin_l4_table(xpmap_ptom_masked(xen_dummy_user_pgd)); 1311 #endif 1312 1313 /* 1314 * Allocate space for the IDT, GDT and LDT. 1315 */ 1316 #ifdef __HAVE_PCPU_AREA 1317 idt_vaddr = (vaddr_t)&pcpuarea->idt; 1318 #else 1319 idt_vaddr = pmap_bootstrap_valloc(1); 1320 #endif 1321 idt_paddr = pmap_bootstrap_palloc(1); 1322 1323 gdt_vaddr = pmap_bootstrap_valloc(1); 1324 gdt_paddr = pmap_bootstrap_palloc(1); 1325 1326 #ifdef __HAVE_PCPU_AREA 1327 ldt_vaddr = (vaddr_t)&pcpuarea->ldt; 1328 #else 1329 ldt_vaddr = pmap_bootstrap_valloc(1); 1330 #endif 1331 ldt_paddr = pmap_bootstrap_palloc(1); 1332 1333 #if !defined(__x86_64__) && !defined(XEN) 1334 /* pentium f00f bug stuff */ 1335 pentium_idt_vaddr = pmap_bootstrap_valloc(1); 1336 #endif 1337 1338 /* 1339 * Now we reserve some VM for mapping pages when doing a crash dump. 1340 */ 1341 virtual_avail = reserve_dumppages(virtual_avail); 1342 1343 /* 1344 * Init the static-global locks and global lists. 1345 * 1346 * => pventry::pvh_lock (initialized elsewhere) must also be 1347 * a spin lock, again at IPL_VM to prevent deadlock, and 1348 * again is never taken from interrupt context. 1349 */ 1350 mutex_init(&pmaps_lock, MUTEX_DEFAULT, IPL_NONE); 1351 LIST_INIT(&pmaps); 1352 1353 /* 1354 * Ensure the TLB is sync'd with reality by flushing it... 1355 */ 1356 tlbflushg(); 1357 1358 /* 1359 * Calculate pmap_maxkvaddr from nkptp[]. 1360 */ 1361 kva = VM_MIN_KERNEL_ADDRESS; 1362 for (i = PTP_LEVELS - 1; i >= 1; i--) { 1363 kva += nkptp[i] * nbpd[i]; 1364 } 1365 pmap_maxkvaddr = kva; 1366 } 1367 1368 #ifndef XEN 1369 static void 1370 pmap_init_lapic(void) 1371 { 1372 /* 1373 * On CPUs that have no LAPIC, local_apic_va is never kentered. But our 1374 * x86 implementation relies a lot on this address to be valid; so just 1375 * allocate a fake physical page that will be kentered into 1376 * local_apic_va by machdep. 1377 * 1378 * If the LAPIC is present, the va will be remapped somewhere else 1379 * later in lapic_map. 1380 */ 1381 local_apic_va = pmap_bootstrap_valloc(1); 1382 local_apic_pa = pmap_bootstrap_palloc(1); 1383 } 1384 #endif 1385 1386 #if defined(__HAVE_PCPU_AREA) || defined(__HAVE_DIRECT_MAP) 1387 static size_t 1388 pmap_pagetree_nentries_range(vaddr_t startva, vaddr_t endva, size_t pgsz) 1389 { 1390 size_t npages; 1391 npages = (roundup(endva, pgsz) / pgsz) - 1392 (rounddown(startva, pgsz) / pgsz); 1393 return npages; 1394 } 1395 #endif 1396 1397 #ifdef __HAVE_PCPU_AREA 1398 static void 1399 pmap_init_pcpu(void) 1400 { 1401 const vaddr_t startva = PMAP_PCPU_BASE; 1402 size_t nL4e, nL3e, nL2e, nL1e; 1403 size_t L4e_idx, L3e_idx, L2e_idx, L1e_idx __diagused; 1404 paddr_t pa; 1405 vaddr_t endva; 1406 vaddr_t tmpva; 1407 pt_entry_t *pte; 1408 size_t size; 1409 int i; 1410 1411 const pd_entry_t pteflags = PG_V | PG_KW | pmap_pg_nx; 1412 1413 size = sizeof(struct pcpu_area); 1414 1415 endva = startva + size; 1416 1417 /* We will use this temporary va. 
*/ 1418 tmpva = bootspace.spareva; 1419 pte = PTE_BASE + pl1_i(tmpva); 1420 1421 /* Build L4 */ 1422 L4e_idx = pl4_i(startva); 1423 nL4e = pmap_pagetree_nentries_range(startva, endva, NBPD_L4); 1424 KASSERT(nL4e == 1); 1425 for (i = 0; i < nL4e; i++) { 1426 KASSERT(L4_BASE[L4e_idx+i] == 0); 1427 1428 pa = pmap_bootstrap_palloc(1); 1429 *pte = (pa & PG_FRAME) | pteflags; 1430 pmap_update_pg(tmpva); 1431 memset((void *)tmpva, 0, PAGE_SIZE); 1432 1433 L4_BASE[L4e_idx+i] = pa | pteflags | PG_U; 1434 } 1435 1436 /* Build L3 */ 1437 L3e_idx = pl3_i(startva); 1438 nL3e = pmap_pagetree_nentries_range(startva, endva, NBPD_L3); 1439 for (i = 0; i < nL3e; i++) { 1440 KASSERT(L3_BASE[L3e_idx+i] == 0); 1441 1442 pa = pmap_bootstrap_palloc(1); 1443 *pte = (pa & PG_FRAME) | pteflags; 1444 pmap_update_pg(tmpva); 1445 memset((void *)tmpva, 0, PAGE_SIZE); 1446 1447 L3_BASE[L3e_idx+i] = pa | pteflags | PG_U; 1448 } 1449 1450 /* Build L2 */ 1451 L2e_idx = pl2_i(startva); 1452 nL2e = pmap_pagetree_nentries_range(startva, endva, NBPD_L2); 1453 for (i = 0; i < nL2e; i++) { 1454 1455 KASSERT(L2_BASE[L2e_idx+i] == 0); 1456 1457 pa = pmap_bootstrap_palloc(1); 1458 *pte = (pa & PG_FRAME) | pteflags; 1459 pmap_update_pg(tmpva); 1460 memset((void *)tmpva, 0, PAGE_SIZE); 1461 1462 L2_BASE[L2e_idx+i] = pa | pteflags | PG_U; 1463 } 1464 1465 /* Build L1 */ 1466 L1e_idx = pl1_i(startva); 1467 nL1e = pmap_pagetree_nentries_range(startva, endva, NBPD_L1); 1468 for (i = 0; i < nL1e; i++) { 1469 /* 1470 * Nothing to do, the PTEs will be entered via 1471 * pmap_kenter_pa. 1472 */ 1473 KASSERT(L1_BASE[L1e_idx+i] == 0); 1474 } 1475 1476 *pte = 0; 1477 pmap_update_pg(tmpva); 1478 1479 pcpuarea = (struct pcpu_area *)startva; 1480 1481 tlbflush(); 1482 } 1483 #endif 1484 1485 #ifdef __HAVE_DIRECT_MAP 1486 /* 1487 * Create the amd64 direct map. Called only once at boot time. We map all of 1488 * the physical memory contiguously using 2MB large pages, with RW permissions. 1489 * However there is a hole: the kernel is mapped with RO permissions. 1490 */ 1491 static void 1492 pmap_init_directmap(struct pmap *kpm) 1493 { 1494 extern phys_ram_seg_t mem_clusters[]; 1495 extern int mem_cluster_cnt; 1496 1497 const vaddr_t startva = PMAP_DIRECT_DEFAULT_BASE; 1498 size_t nL4e, nL3e, nL2e; 1499 size_t L4e_idx, L3e_idx, L2e_idx; 1500 size_t spahole, epahole; 1501 paddr_t lastpa, pa; 1502 vaddr_t endva; 1503 vaddr_t tmpva; 1504 pt_entry_t *pte; 1505 phys_ram_seg_t *mc; 1506 int i; 1507 1508 const pd_entry_t pteflags = PG_V | PG_KW | pmap_pg_nx; 1509 const pd_entry_t holepteflags = PG_V | pmap_pg_nx; 1510 1511 CTASSERT(NL4_SLOT_DIRECT * NBPD_L4 == MAXPHYSMEM); 1512 1513 spahole = roundup(bootspace.head.pa, NBPD_L2); 1514 epahole = rounddown(bootspace.boot.pa, NBPD_L2); 1515 1516 /* Get the last physical address available */ 1517 lastpa = 0; 1518 for (i = 0; i < mem_cluster_cnt; i++) { 1519 mc = &mem_clusters[i]; 1520 lastpa = MAX(lastpa, mc->start + mc->size); 1521 } 1522 1523 /* 1524 * x86_add_cluster should have truncated the memory to MAXPHYSMEM. 1525 */ 1526 if (lastpa > MAXPHYSMEM) { 1527 panic("pmap_init_directmap: lastpa incorrect"); 1528 } 1529 endva = startva + lastpa; 1530 1531 /* We will use this temporary va. 
*/ 1532 tmpva = bootspace.spareva; 1533 pte = PTE_BASE + pl1_i(tmpva); 1534 1535 /* Build L4 */ 1536 L4e_idx = pl4_i(startva); 1537 nL4e = pmap_pagetree_nentries_range(startva, endva, NBPD_L4); 1538 KASSERT(nL4e <= NL4_SLOT_DIRECT); 1539 for (i = 0; i < nL4e; i++) { 1540 KASSERT(L4_BASE[L4e_idx+i] == 0); 1541 1542 pa = pmap_bootstrap_palloc(1); 1543 *pte = (pa & PG_FRAME) | pteflags; 1544 pmap_update_pg(tmpva); 1545 memset((void *)tmpva, 0, PAGE_SIZE); 1546 1547 L4_BASE[L4e_idx+i] = pa | pteflags | PG_U; 1548 } 1549 1550 /* Build L3 */ 1551 L3e_idx = pl3_i(startva); 1552 nL3e = pmap_pagetree_nentries_range(startva, endva, NBPD_L3); 1553 for (i = 0; i < nL3e; i++) { 1554 KASSERT(L3_BASE[L3e_idx+i] == 0); 1555 1556 pa = pmap_bootstrap_palloc(1); 1557 *pte = (pa & PG_FRAME) | pteflags; 1558 pmap_update_pg(tmpva); 1559 memset((void *)tmpva, 0, PAGE_SIZE); 1560 1561 L3_BASE[L3e_idx+i] = pa | pteflags | PG_U; 1562 } 1563 1564 /* Build L2 */ 1565 L2e_idx = pl2_i(startva); 1566 nL2e = pmap_pagetree_nentries_range(startva, endva, NBPD_L2); 1567 for (i = 0; i < nL2e; i++) { 1568 KASSERT(L2_BASE[L2e_idx+i] == 0); 1569 1570 pa = (paddr_t)(i * NBPD_L2); 1571 1572 if (spahole <= pa && pa < epahole) { 1573 L2_BASE[L2e_idx+i] = pa | holepteflags | PG_U | 1574 PG_PS | pmap_pg_g; 1575 } else { 1576 L2_BASE[L2e_idx+i] = pa | pteflags | PG_U | 1577 PG_PS | pmap_pg_g; 1578 } 1579 } 1580 1581 *pte = 0; 1582 pmap_update_pg(tmpva); 1583 1584 pmap_direct_base = startva; 1585 pmap_direct_end = endva; 1586 pmap_direct_pdpe = L4e_idx; 1587 pmap_direct_npdp = nL4e; 1588 1589 tlbflush(); 1590 } 1591 #endif /* __HAVE_DIRECT_MAP */ 1592 1593 #if !defined(XEN) 1594 /* 1595 * Remap all of the virtual pages created so far with the PG_G bit. 1596 */ 1597 static void 1598 pmap_remap_global(void) 1599 { 1600 vaddr_t kva, kva_end; 1601 unsigned long p1i; 1602 size_t i; 1603 1604 /* head */ 1605 kva = bootspace.head.va; 1606 kva_end = kva + bootspace.head.sz; 1607 for ( ; kva < kva_end; kva += PAGE_SIZE) { 1608 p1i = pl1_i(kva); 1609 if (pmap_valid_entry(PTE_BASE[p1i])) 1610 PTE_BASE[p1i] |= pmap_pg_g; 1611 } 1612 1613 /* kernel segments */ 1614 for (i = 0; i < BTSPACE_NSEGS; i++) { 1615 if (bootspace.segs[i].type == BTSEG_NONE) { 1616 continue; 1617 } 1618 kva = bootspace.segs[i].va; 1619 kva_end = kva + bootspace.segs[i].sz; 1620 for ( ; kva < kva_end; kva += PAGE_SIZE) { 1621 p1i = pl1_i(kva); 1622 if (pmap_valid_entry(PTE_BASE[p1i])) 1623 PTE_BASE[p1i] |= pmap_pg_g; 1624 } 1625 } 1626 1627 /* boot space */ 1628 kva = bootspace.boot.va; 1629 kva_end = kva + bootspace.boot.sz; 1630 for ( ; kva < kva_end; kva += PAGE_SIZE) { 1631 p1i = pl1_i(kva); 1632 if (pmap_valid_entry(PTE_BASE[p1i])) 1633 PTE_BASE[p1i] |= pmap_pg_g; 1634 } 1635 } 1636 #endif 1637 1638 #ifndef XEN 1639 /* 1640 * Remap several kernel segments with large pages. We cover as many pages as we 1641 * can. Called only once at boot time, if the CPU supports large pages. 1642 */ 1643 static void 1644 pmap_remap_largepages(void) 1645 { 1646 pd_entry_t *pde; 1647 vaddr_t kva, kva_end; 1648 paddr_t pa; 1649 size_t i; 1650 1651 /* Remap the kernel text using large pages. 
*/ 1652 for (i = 0; i < BTSPACE_NSEGS; i++) { 1653 if (bootspace.segs[i].type != BTSEG_TEXT) { 1654 continue; 1655 } 1656 kva = roundup(bootspace.segs[i].va, NBPD_L2); 1657 if (kva < bootspace.segs[i].va) { 1658 continue; 1659 } 1660 kva_end = rounddown(bootspace.segs[i].va + 1661 bootspace.segs[i].sz, NBPD_L2); 1662 pa = roundup(bootspace.segs[i].pa, NBPD_L2); 1663 for (/* */; kva < kva_end; kva += NBPD_L2, pa += NBPD_L2) { 1664 pde = &L2_BASE[pl2_i(kva)]; 1665 *pde = pa | pmap_pg_g | PG_PS | PG_KR | PG_V; 1666 tlbflushg(); 1667 } 1668 } 1669 1670 /* Remap the kernel rodata using large pages. */ 1671 for (i = 0; i < BTSPACE_NSEGS; i++) { 1672 if (bootspace.segs[i].type != BTSEG_RODATA) { 1673 continue; 1674 } 1675 kva = roundup(bootspace.segs[i].va, NBPD_L2); 1676 if (kva < bootspace.segs[i].va) { 1677 continue; 1678 } 1679 kva_end = rounddown(bootspace.segs[i].va + 1680 bootspace.segs[i].sz, NBPD_L2); 1681 pa = roundup(bootspace.segs[i].pa, NBPD_L2); 1682 for (/* */; kva < kva_end; kva += NBPD_L2, pa += NBPD_L2) { 1683 pde = &L2_BASE[pl2_i(kva)]; 1684 *pde = pa | pmap_pg_g | PG_PS | pmap_pg_nx | PG_KR | PG_V; 1685 tlbflushg(); 1686 } 1687 } 1688 1689 /* Remap the kernel data+bss using large pages. */ 1690 for (i = 0; i < BTSPACE_NSEGS; i++) { 1691 if (bootspace.segs[i].type != BTSEG_DATA) { 1692 continue; 1693 } 1694 kva = roundup(bootspace.segs[i].va, NBPD_L2); 1695 if (kva < bootspace.segs[i].va) { 1696 continue; 1697 } 1698 kva_end = rounddown(bootspace.segs[i].va + 1699 bootspace.segs[i].sz, NBPD_L2); 1700 pa = roundup(bootspace.segs[i].pa, NBPD_L2); 1701 for (/* */; kva < kva_end; kva += NBPD_L2, pa += NBPD_L2) { 1702 pde = &L2_BASE[pl2_i(kva)]; 1703 *pde = pa | pmap_pg_g | PG_PS | pmap_pg_nx | PG_KW | PG_V; 1704 tlbflushg(); 1705 } 1706 } 1707 } 1708 #endif /* !XEN */ 1709 1710 /* 1711 * pmap_init: called from uvm_init, our job is to get the pmap 1712 * system ready to manage mappings... 1713 */ 1714 1715 void 1716 pmap_init(void) 1717 { 1718 int i, flags; 1719 1720 for (i = 0; i < PV_HASH_SIZE; i++) { 1721 SLIST_INIT(&pv_hash_heads[i].hh_list); 1722 } 1723 for (i = 0; i < PV_HASH_LOCK_CNT; i++) { 1724 mutex_init(&pv_hash_locks[i].lock, MUTEX_NODEBUG, IPL_VM); 1725 } 1726 1727 /* 1728 * initialize caches. 1729 */ 1730 1731 pool_cache_bootstrap(&pmap_cache, sizeof(struct pmap), 0, 0, 0, 1732 "pmappl", NULL, IPL_NONE, NULL, NULL, NULL); 1733 1734 #ifdef XEN 1735 /* 1736 * pool_cache(9) should not touch cached objects, since they 1737 * are pinned on xen and R/O for the domU 1738 */ 1739 flags = PR_NOTOUCH; 1740 #else /* XEN */ 1741 flags = 0; 1742 #endif /* XEN */ 1743 #ifdef PAE 1744 pool_cache_bootstrap(&pmap_pdp_cache, PAGE_SIZE * PDP_SIZE, 0, 0, flags, 1745 "pdppl", &pmap_pdp_allocator, IPL_NONE, 1746 pmap_pdp_ctor, pmap_pdp_dtor, NULL); 1747 #else /* PAE */ 1748 pool_cache_bootstrap(&pmap_pdp_cache, PAGE_SIZE, 0, 0, flags, 1749 "pdppl", NULL, IPL_NONE, pmap_pdp_ctor, pmap_pdp_dtor, NULL); 1750 #endif /* PAE */ 1751 pool_cache_bootstrap(&pmap_pv_cache, sizeof(struct pv_entry), 0, 0, 1752 PR_LARGECACHE, "pvpl", &pool_allocator_kmem, IPL_NONE, NULL, 1753 NULL, NULL); 1754 1755 pmap_tlb_init(); 1756 1757 /* XXX: Since cpu_hatch() is only for secondary CPUs. 
*/ 1758 pmap_tlb_cpu_init(curcpu()); 1759 1760 evcnt_attach_dynamic(&pmap_iobmp_evcnt, EVCNT_TYPE_MISC, 1761 NULL, "x86", "io bitmap copy"); 1762 evcnt_attach_dynamic(&pmap_ldt_evcnt, EVCNT_TYPE_MISC, 1763 NULL, "x86", "ldt sync"); 1764 1765 /* 1766 * done: pmap module is up (and ready for business) 1767 */ 1768 1769 pmap_initialized = true; 1770 } 1771 1772 /* 1773 * pmap_cpu_init_late: perform late per-CPU initialization. 1774 */ 1775 1776 #ifndef XEN 1777 void 1778 pmap_cpu_init_late(struct cpu_info *ci) 1779 { 1780 /* 1781 * The BP has already its own PD page allocated during early 1782 * MD startup. 1783 */ 1784 if (ci == &cpu_info_primary) 1785 return; 1786 1787 #ifdef PAE 1788 cpu_alloc_l3_page(ci); 1789 #endif 1790 } 1791 #endif 1792 1793 #ifndef __HAVE_DIRECT_MAP 1794 CTASSERT(CACHE_LINE_SIZE > sizeof(pt_entry_t)); 1795 CTASSERT(CACHE_LINE_SIZE % sizeof(pt_entry_t) == 0); 1796 1797 static void 1798 pmap_vpage_cpualloc(struct cpu_info *ci) 1799 { 1800 bool primary = (ci == &cpu_info_primary); 1801 size_t i, npages; 1802 vaddr_t vabase; 1803 vsize_t vrange; 1804 1805 npages = (CACHE_LINE_SIZE / sizeof(pt_entry_t)); 1806 KASSERT(npages >= VPAGE_MAX); 1807 vrange = npages * PAGE_SIZE; 1808 1809 if (primary) { 1810 while ((vabase = pmap_bootstrap_valloc(1)) % vrange != 0) { 1811 /* Waste some pages to align properly */ 1812 } 1813 /* The base is aligned, allocate the rest (contiguous) */ 1814 pmap_bootstrap_valloc(npages - 1); 1815 } else { 1816 vabase = uvm_km_alloc(kernel_map, vrange, vrange, 1817 UVM_KMF_VAONLY); 1818 if (vabase == 0) { 1819 panic("%s: failed to allocate tmp VA for CPU %d\n", 1820 __func__, cpu_index(ci)); 1821 } 1822 } 1823 1824 KASSERT((vaddr_t)&PTE_BASE[pl1_i(vabase)] % CACHE_LINE_SIZE == 0); 1825 1826 for (i = 0; i < VPAGE_MAX; i++) { 1827 ci->vpage[i] = vabase + i * PAGE_SIZE; 1828 ci->vpage_pte[i] = PTE_BASE + pl1_i(ci->vpage[i]); 1829 } 1830 } 1831 1832 void 1833 pmap_vpage_cpu_init(struct cpu_info *ci) 1834 { 1835 if (ci == &cpu_info_primary) { 1836 /* cpu0 already taken care of in pmap_bootstrap */ 1837 return; 1838 } 1839 1840 pmap_vpage_cpualloc(ci); 1841 } 1842 #endif 1843 1844 /* 1845 * p v _ e n t r y f u n c t i o n s 1846 */ 1847 1848 static bool 1849 pmap_pp_needs_pve(struct pmap_page *pp) 1850 { 1851 1852 /* 1853 * Adding a pv entry for this page only needs to allocate a pv_entry 1854 * structure if the page already has at least one pv entry, 1855 * since the first pv entry is stored in the pmap_page. 
1856 */ 1857 1858 return pp && ((pp->pp_flags & PP_EMBEDDED) != 0 || 1859 !LIST_EMPTY(&pp->pp_head.pvh_list)); 1860 } 1861 1862 /* 1863 * pmap_free_pvs: free a list of pv_entrys 1864 */ 1865 1866 static void 1867 pmap_free_pvs(struct pv_entry *pve) 1868 { 1869 struct pv_entry *next; 1870 1871 for ( /* null */ ; pve != NULL ; pve = next) { 1872 next = pve->pve_next; 1873 pool_cache_put(&pmap_pv_cache, pve); 1874 } 1875 } 1876 1877 /* 1878 * main pv_entry manipulation functions: 1879 * pmap_enter_pv: enter a mapping onto a pv_head list 1880 * pmap_remove_pv: remove a mapping from a pv_head list 1881 * 1882 * NOTE: Both pmap_enter_pv and pmap_remove_pv expect the caller to lock 1883 * the pvh before calling 1884 */ 1885 1886 /* 1887 * insert_pv: a helper of pmap_enter_pv 1888 */ 1889 1890 static void 1891 insert_pv(struct pmap_page *pp, struct pv_entry *pve) 1892 { 1893 struct pv_hash_head *hh; 1894 kmutex_t *lock; 1895 u_int hash; 1896 1897 hash = pvhash_hash(pve->pve_pte.pte_ptp, pve->pve_pte.pte_va); 1898 lock = pvhash_lock(hash); 1899 hh = pvhash_head(hash); 1900 mutex_spin_enter(lock); 1901 SLIST_INSERT_HEAD(&hh->hh_list, pve, pve_hash); 1902 mutex_spin_exit(lock); 1903 1904 LIST_INSERT_HEAD(&pp->pp_head.pvh_list, pve, pve_list); 1905 } 1906 1907 /* 1908 * pmap_enter_pv: enter a mapping onto a pv_head lst 1909 * 1910 * => caller should adjust ptp's wire_count before calling 1911 * => caller has preallocated pve and *sparepve for us 1912 */ 1913 1914 static struct pv_entry * 1915 pmap_enter_pv(struct pmap_page *pp, struct pv_entry *pve, 1916 struct pv_entry **sparepve, struct vm_page *ptp, vaddr_t va) 1917 { 1918 1919 KASSERT(ptp == NULL || ptp->wire_count >= 2); 1920 KASSERT(ptp == NULL || ptp->uobject != NULL); 1921 KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset); 1922 1923 if ((pp->pp_flags & PP_EMBEDDED) == 0) { 1924 if (LIST_EMPTY(&pp->pp_head.pvh_list)) { 1925 pp->pp_flags |= PP_EMBEDDED; 1926 pp->pp_pte.pte_ptp = ptp; 1927 pp->pp_pte.pte_va = va; 1928 1929 return pve; 1930 } 1931 } else { 1932 struct pv_entry *pve2; 1933 1934 pve2 = *sparepve; 1935 *sparepve = NULL; 1936 1937 pve2->pve_pte = pp->pp_pte; 1938 pp->pp_flags &= ~PP_EMBEDDED; 1939 LIST_INIT(&pp->pp_head.pvh_list); 1940 insert_pv(pp, pve2); 1941 } 1942 1943 pve->pve_pte.pte_ptp = ptp; 1944 pve->pve_pte.pte_va = va; 1945 insert_pv(pp, pve); 1946 1947 return NULL; 1948 } 1949 1950 /* 1951 * pmap_remove_pv: try to remove a mapping from a pv_list 1952 * 1953 * => caller should adjust ptp's wire_count and free PTP if needed 1954 * => we return the removed pve 1955 */ 1956 1957 static struct pv_entry * 1958 pmap_remove_pv(struct pmap_page *pp, struct vm_page *ptp, vaddr_t va) 1959 { 1960 struct pv_hash_head *hh; 1961 struct pv_entry *pve; 1962 kmutex_t *lock; 1963 u_int hash; 1964 1965 KASSERT(ptp == NULL || ptp->uobject != NULL); 1966 KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset); 1967 1968 if ((pp->pp_flags & PP_EMBEDDED) != 0) { 1969 KASSERT(pp->pp_pte.pte_ptp == ptp); 1970 KASSERT(pp->pp_pte.pte_va == va); 1971 1972 pp->pp_flags &= ~PP_EMBEDDED; 1973 LIST_INIT(&pp->pp_head.pvh_list); 1974 1975 return NULL; 1976 } 1977 1978 hash = pvhash_hash(ptp, va); 1979 lock = pvhash_lock(hash); 1980 hh = pvhash_head(hash); 1981 mutex_spin_enter(lock); 1982 pve = pvhash_remove(hh, ptp, va); 1983 mutex_spin_exit(lock); 1984 1985 LIST_REMOVE(pve, pve_list); 1986 1987 return pve; 1988 } 1989 1990 /* 1991 * p t p f u n c t i o n s 1992 */ 1993 1994 static inline struct vm_page * 1995 pmap_find_ptp(struct pmap *pmap, vaddr_t 
va, paddr_t pa, int level) 1996 { 1997 int lidx = level - 1; 1998 struct vm_page *pg; 1999 2000 KASSERT(mutex_owned(pmap->pm_lock)); 2001 2002 if (pa != (paddr_t)-1 && pmap->pm_ptphint[lidx] && 2003 pa == VM_PAGE_TO_PHYS(pmap->pm_ptphint[lidx])) { 2004 return (pmap->pm_ptphint[lidx]); 2005 } 2006 PMAP_SUBOBJ_LOCK(pmap, lidx); 2007 pg = uvm_pagelookup(&pmap->pm_obj[lidx], ptp_va2o(va, level)); 2008 PMAP_SUBOBJ_UNLOCK(pmap, lidx); 2009 2010 KASSERT(pg == NULL || pg->wire_count >= 1); 2011 return pg; 2012 } 2013 2014 static inline void 2015 pmap_freepage(struct pmap *pmap, struct vm_page *ptp, int level) 2016 { 2017 lwp_t *l; 2018 int lidx; 2019 struct uvm_object *obj; 2020 2021 KASSERT(ptp->wire_count == 1); 2022 2023 lidx = level - 1; 2024 2025 obj = &pmap->pm_obj[lidx]; 2026 pmap_stats_update(pmap, -1, 0); 2027 if (lidx != 0) 2028 mutex_enter(obj->vmobjlock); 2029 if (pmap->pm_ptphint[lidx] == ptp) 2030 pmap->pm_ptphint[lidx] = TAILQ_FIRST(&obj->memq); 2031 ptp->wire_count = 0; 2032 uvm_pagerealloc(ptp, NULL, 0); 2033 l = curlwp; 2034 KASSERT((l->l_pflag & LP_INTR) == 0); 2035 VM_PAGE_TO_PP(ptp)->pp_link = l->l_md.md_gc_ptp; 2036 l->l_md.md_gc_ptp = ptp; 2037 if (lidx != 0) 2038 mutex_exit(obj->vmobjlock); 2039 } 2040 2041 static void 2042 pmap_free_ptp(struct pmap *pmap, struct vm_page *ptp, vaddr_t va, 2043 pt_entry_t *ptes, pd_entry_t * const *pdes) 2044 { 2045 unsigned long index; 2046 int level; 2047 vaddr_t invaladdr; 2048 pd_entry_t opde; 2049 2050 KASSERT(pmap != pmap_kernel()); 2051 KASSERT(mutex_owned(pmap->pm_lock)); 2052 KASSERT(kpreempt_disabled()); 2053 2054 level = 1; 2055 do { 2056 index = pl_i(va, level + 1); 2057 opde = pmap_pte_testset(&pdes[level - 1][index], 0); 2058 2059 /* 2060 * On Xen-amd64 or SVS, we need to sync the top level page 2061 * directory on each CPU. 2062 */ 2063 #if defined(XEN) && defined(__x86_64__) 2064 if (level == PTP_LEVELS - 1) { 2065 xen_kpm_sync(pmap, index); 2066 } 2067 #elif defined(SVS) 2068 if (svs_enabled && level == PTP_LEVELS - 1) { 2069 svs_pmap_sync(pmap, index); 2070 } 2071 #endif 2072 2073 invaladdr = level == 1 ? (vaddr_t)ptes : 2074 (vaddr_t)pdes[level - 2]; 2075 pmap_tlb_shootdown(pmap, invaladdr + index * PAGE_SIZE, 2076 opde, TLBSHOOT_FREE_PTP1); 2077 2078 #if defined(XEN) 2079 pmap_tlb_shootnow(); 2080 #endif 2081 2082 pmap_freepage(pmap, ptp, level); 2083 if (level < PTP_LEVELS - 1) { 2084 ptp = pmap_find_ptp(pmap, va, (paddr_t)-1, level + 1); 2085 ptp->wire_count--; 2086 if (ptp->wire_count > 1) 2087 break; 2088 } 2089 } while (++level < PTP_LEVELS); 2090 pmap_pte_flush(); 2091 } 2092 2093 /* 2094 * pmap_get_ptp: get a PTP (if there isn't one, allocate a new one) 2095 * 2096 * => pmap should NOT be pmap_kernel() 2097 * => pmap should be locked 2098 * => preemption should be disabled 2099 */ 2100 2101 static struct vm_page * 2102 pmap_get_ptp(struct pmap *pmap, vaddr_t va, pd_entry_t * const *pdes, int flags) 2103 { 2104 struct vm_page *ptp; 2105 struct { 2106 struct vm_page *pg; 2107 bool new; 2108 } pt[PTP_LEVELS + 1]; 2109 int i, aflags; 2110 unsigned long index; 2111 pd_entry_t *pva; 2112 paddr_t pa; 2113 struct uvm_object *obj; 2114 voff_t off; 2115 2116 KASSERT(pmap != pmap_kernel()); 2117 KASSERT(mutex_owned(pmap->pm_lock)); 2118 KASSERT(kpreempt_disabled()); 2119 2120 /* 2121 * Loop through all page table levels allocating a page 2122 * for any level where we don't already have one. 2123 */ 2124 memset(pt, 0, sizeof(pt)); 2125 aflags = ((flags & PMAP_CANFAIL) ? 
0 : UVM_PGA_USERESERVE) | 2126 UVM_PGA_ZERO; 2127 for (i = PTP_LEVELS; i > 1; i--) { 2128 obj = &pmap->pm_obj[i - 2]; 2129 off = ptp_va2o(va, i - 1); 2130 2131 PMAP_SUBOBJ_LOCK(pmap, i - 2); 2132 pt[i].pg = uvm_pagelookup(obj, off); 2133 if (pt[i].pg == NULL) { 2134 pt[i].pg = uvm_pagealloc(obj, off, NULL, aflags); 2135 pt[i].new = true; 2136 } 2137 PMAP_SUBOBJ_UNLOCK(pmap, i - 2); 2138 2139 if (pt[i].pg == NULL) 2140 goto fail; 2141 } 2142 2143 /* 2144 * Now that we have all the pages looked up or allocated, 2145 * loop through again installing any new ones into the tree. 2146 */ 2147 for (i = PTP_LEVELS; i > 1; i--) { 2148 index = pl_i(va, i); 2149 pva = pdes[i - 2]; 2150 2151 if (pmap_valid_entry(pva[index])) { 2152 KASSERT(!pt[i].new); 2153 continue; 2154 } 2155 2156 ptp = pt[i].pg; 2157 ptp->flags &= ~PG_BUSY; /* never busy */ 2158 ptp->wire_count = 1; 2159 pmap->pm_ptphint[i - 2] = ptp; 2160 pa = VM_PAGE_TO_PHYS(ptp); 2161 pmap_pte_set(&pva[index], (pd_entry_t) 2162 (pmap_pa2pte(pa) | PG_u | PG_RW | PG_V)); 2163 2164 /* 2165 * On Xen-amd64 or SVS, we need to sync the top level page 2166 * directory on each CPU. 2167 */ 2168 #if defined(XEN) && defined(__x86_64__) 2169 if (i == PTP_LEVELS) { 2170 xen_kpm_sync(pmap, index); 2171 } 2172 #elif defined(SVS) 2173 if (svs_enabled && i == PTP_LEVELS) { 2174 svs_pmap_sync(pmap, index); 2175 } 2176 #endif 2177 2178 pmap_pte_flush(); 2179 pmap_stats_update(pmap, 1, 0); 2180 2181 /* 2182 * If we're not in the top level, increase the 2183 * wire count of the parent page. 2184 */ 2185 if (i < PTP_LEVELS) { 2186 pt[i + 1].pg->wire_count++; 2187 } 2188 } 2189 ptp = pt[2].pg; 2190 KASSERT(ptp != NULL); 2191 pmap->pm_ptphint[0] = ptp; 2192 return ptp; 2193 2194 /* 2195 * Allocation of a ptp failed, free any others that we just allocated. 2196 */ 2197 fail: 2198 for (i = PTP_LEVELS; i > 1; i--) { 2199 if (pt[i].pg == NULL) { 2200 break; 2201 } 2202 if (!pt[i].new) { 2203 continue; 2204 } 2205 obj = &pmap->pm_obj[i - 2]; 2206 PMAP_SUBOBJ_LOCK(pmap, i - 2); 2207 uvm_pagefree(pt[i].pg); 2208 PMAP_SUBOBJ_UNLOCK(pmap, i - 2); 2209 } 2210 return NULL; 2211 } 2212 2213 /* 2214 * p m a p l i f e c y c l e f u n c t i o n s 2215 */ 2216 2217 /* 2218 * pmap_pdp_ctor: constructor for the PDP cache. 2219 */ 2220 static int 2221 pmap_pdp_ctor(void *arg, void *v, int flags) 2222 { 2223 pd_entry_t *pdir = v; 2224 paddr_t pdirpa = 0; 2225 vaddr_t object; 2226 int i; 2227 2228 #if !defined(XEN) || !defined(__x86_64__) 2229 int npde; 2230 #endif 2231 #ifdef XEN 2232 int s; 2233 #endif 2234 2235 /* 2236 * NOTE: The `pmaps_lock' is held when the PDP is allocated. 2237 */ 2238 2239 #if defined(XEN) && defined(__x86_64__) 2240 /* Fetch the physical address of the page directory */ 2241 (void)pmap_extract(pmap_kernel(), (vaddr_t)pdir, &pdirpa); 2242 2243 /* Zero the area */ 2244 memset(pdir, 0, PAGE_SIZE); /* Xen wants a clean page */ 2245 2246 /* 2247 * This pdir will NEVER be active in kernel mode, so mark 2248 * recursive entry invalid. 2249 */ 2250 pdir[PDIR_SLOT_PTE] = pmap_pa2pte(pdirpa); 2251 2252 /* 2253 * PDP constructed this way won't be for the kernel, hence we 2254 * don't put kernel mappings on Xen. 2255 * 2256 * But we need to make pmap_create() happy, so put a dummy 2257 * (without PG_V) value at the right place. 
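 *
 * pmap_create() below checks exactly this slot and reconstructs the
 * PDP if it reads back as zero, roughly:
 *
 *	if (pmap->pm_pdir[PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1] - 1] == 0)
 *		pool_cache_destruct_object(&pmap_pdp_cache, pmap->pm_pdir);
 *
 * (presumably to catch a cached PDP whose copy of the kernel PDEs has
 * gone stale), so the dummy value must be non-zero even though it is
 * never a valid PDE (PG_V is clear).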
2258 */ 2259 pdir[PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1] - 1] = 2260 (pd_entry_t)-1 & PG_FRAME; 2261 #else /* XEN && __x86_64__*/ 2262 /* Zero the area */ 2263 memset(pdir, 0, PDIR_SLOT_PTE * sizeof(pd_entry_t)); 2264 2265 object = (vaddr_t)v; 2266 for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) { 2267 /* Fetch the physical address of the page directory */ 2268 (void)pmap_extract(pmap_kernel(), object, &pdirpa); 2269 2270 /* Put in recursive PDE to map the PTEs */ 2271 pdir[PDIR_SLOT_PTE + i] = pmap_pa2pte(pdirpa) | PG_V | 2272 pmap_pg_nx; 2273 #ifndef XEN 2274 pdir[PDIR_SLOT_PTE + i] |= PG_KW; 2275 #endif 2276 } 2277 2278 /* Copy the kernel's top level PDE */ 2279 npde = nkptp[PTP_LEVELS - 1]; 2280 2281 memcpy(&pdir[PDIR_SLOT_KERN], &PDP_BASE[PDIR_SLOT_KERN], 2282 npde * sizeof(pd_entry_t)); 2283 2284 /* Zero the rest */ 2285 memset(&pdir[PDIR_SLOT_KERN + npde], 0, (PAGE_SIZE * PDP_SIZE) - 2286 (PDIR_SLOT_KERN + npde) * sizeof(pd_entry_t)); 2287 2288 if (VM_MIN_KERNEL_ADDRESS != KERNBASE) { 2289 int idx = pl_i(KERNBASE, PTP_LEVELS); 2290 pdir[idx] = PDP_BASE[idx]; 2291 } 2292 2293 #ifdef __HAVE_PCPU_AREA 2294 pdir[PDIR_SLOT_PCPU] = PDP_BASE[PDIR_SLOT_PCPU]; 2295 #endif 2296 #ifdef __HAVE_DIRECT_MAP 2297 memcpy(&pdir[pmap_direct_pdpe], &PDP_BASE[pmap_direct_pdpe], 2298 pmap_direct_npdp * sizeof(pd_entry_t)); 2299 #endif 2300 #endif /* XEN && __x86_64__*/ 2301 2302 #ifdef XEN 2303 s = splvm(); 2304 object = (vaddr_t)v; 2305 pmap_protect(pmap_kernel(), object, object + (PAGE_SIZE * PDP_SIZE), 2306 VM_PROT_READ); 2307 pmap_update(pmap_kernel()); 2308 for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) { 2309 /* 2310 * pin as L2/L4 page, we have to do the page with the 2311 * PDIR_SLOT_PTE entries last 2312 */ 2313 #ifdef PAE 2314 if (i == l2tol3(PDIR_SLOT_PTE)) 2315 continue; 2316 #endif 2317 2318 (void) pmap_extract(pmap_kernel(), object, &pdirpa); 2319 #ifdef __x86_64__ 2320 xpq_queue_pin_l4_table(xpmap_ptom_masked(pdirpa)); 2321 #else 2322 xpq_queue_pin_l2_table(xpmap_ptom_masked(pdirpa)); 2323 #endif 2324 } 2325 #ifdef PAE 2326 object = ((vaddr_t)pdir) + PAGE_SIZE * l2tol3(PDIR_SLOT_PTE); 2327 (void)pmap_extract(pmap_kernel(), object, &pdirpa); 2328 xpq_queue_pin_l2_table(xpmap_ptom_masked(pdirpa)); 2329 #endif 2330 splx(s); 2331 #endif /* XEN */ 2332 2333 return (0); 2334 } 2335 2336 /* 2337 * pmap_pdp_dtor: destructor for the PDP cache. 2338 */ 2339 2340 static void 2341 pmap_pdp_dtor(void *arg, void *v) 2342 { 2343 #ifdef XEN 2344 paddr_t pdirpa = 0; /* XXX: GCC */ 2345 vaddr_t object = (vaddr_t)v; 2346 int i; 2347 int s = splvm(); 2348 pt_entry_t *pte; 2349 2350 for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) { 2351 /* fetch the physical address of the page directory. */ 2352 (void) pmap_extract(pmap_kernel(), object, &pdirpa); 2353 /* unpin page table */ 2354 xpq_queue_unpin_table(xpmap_ptom_masked(pdirpa)); 2355 } 2356 object = (vaddr_t)v; 2357 for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) { 2358 /* Set page RW again */ 2359 pte = kvtopte(object); 2360 pmap_pte_set(pte, *pte | PG_RW); 2361 xen_bcast_invlpg((vaddr_t)object); 2362 } 2363 splx(s); 2364 #endif /* XEN */ 2365 } 2366 2367 #ifdef PAE 2368 2369 /* pmap_pdp_alloc: Allocate a page for the pdp memory pool. */ 2370 2371 static void * 2372 pmap_pdp_alloc(struct pool *pp, int flags) 2373 { 2374 return (void *)uvm_km_alloc(kernel_map, 2375 PAGE_SIZE * PDP_SIZE, PAGE_SIZE * PDP_SIZE, 2376 ((flags & PR_WAITOK) ? 
0 : UVM_KMF_NOWAIT | UVM_KMF_TRYLOCK) 2377 | UVM_KMF_WIRED); 2378 } 2379 2380 /* 2381 * pmap_pdp_free: free a PDP 2382 */ 2383 2384 static void 2385 pmap_pdp_free(struct pool *pp, void *v) 2386 { 2387 uvm_km_free(kernel_map, (vaddr_t)v, PAGE_SIZE * PDP_SIZE, 2388 UVM_KMF_WIRED); 2389 } 2390 #endif /* PAE */ 2391 2392 /* 2393 * pmap_create: create a pmap object. 2394 */ 2395 struct pmap * 2396 pmap_create(void) 2397 { 2398 struct pmap *pmap; 2399 int i; 2400 2401 pmap = pool_cache_get(&pmap_cache, PR_WAITOK); 2402 2403 /* init uvm_object */ 2404 for (i = 0; i < PTP_LEVELS - 1; i++) { 2405 mutex_init(&pmap->pm_obj_lock[i], MUTEX_DEFAULT, IPL_NONE); 2406 uvm_obj_init(&pmap->pm_obj[i], NULL, false, 1); 2407 uvm_obj_setlock(&pmap->pm_obj[i], &pmap->pm_obj_lock[i]); 2408 pmap->pm_ptphint[i] = NULL; 2409 } 2410 pmap->pm_stats.wired_count = 0; 2411 /* count the PDP allocd below */ 2412 pmap->pm_stats.resident_count = PDP_SIZE; 2413 #if !defined(__x86_64__) 2414 pmap->pm_hiexec = 0; 2415 #endif /* !defined(__x86_64__) */ 2416 pmap->pm_flags = 0; 2417 pmap->pm_gc_ptp = NULL; 2418 2419 kcpuset_create(&pmap->pm_cpus, true); 2420 kcpuset_create(&pmap->pm_kernel_cpus, true); 2421 #ifdef XEN 2422 kcpuset_create(&pmap->pm_xen_ptp_cpus, true); 2423 #endif 2424 /* init the LDT */ 2425 pmap->pm_ldt = NULL; 2426 pmap->pm_ldt_len = 0; 2427 pmap->pm_ldt_sel = GSYSSEL(GLDT_SEL, SEL_KPL); 2428 2429 /* allocate PDP */ 2430 try_again: 2431 pmap->pm_pdir = pool_cache_get(&pmap_pdp_cache, PR_WAITOK); 2432 2433 mutex_enter(&pmaps_lock); 2434 2435 if (pmap->pm_pdir[PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1] - 1] == 0) { 2436 mutex_exit(&pmaps_lock); 2437 pool_cache_destruct_object(&pmap_pdp_cache, pmap->pm_pdir); 2438 goto try_again; 2439 } 2440 2441 for (i = 0; i < PDP_SIZE; i++) 2442 pmap->pm_pdirpa[i] = 2443 pmap_pte2pa(pmap->pm_pdir[PDIR_SLOT_PTE + i]); 2444 2445 LIST_INSERT_HEAD(&pmaps, pmap, pm_list); 2446 2447 mutex_exit(&pmaps_lock); 2448 2449 return (pmap); 2450 } 2451 2452 /* 2453 * pmap_free_ptps: put a list of ptps back to the freelist. 2454 */ 2455 2456 void 2457 pmap_free_ptps(struct vm_page *empty_ptps) 2458 { 2459 struct vm_page *ptp; 2460 struct pmap_page *pp; 2461 2462 while ((ptp = empty_ptps) != NULL) { 2463 pp = VM_PAGE_TO_PP(ptp); 2464 empty_ptps = pp->pp_link; 2465 LIST_INIT(&pp->pp_head.pvh_list); 2466 uvm_pagefree(ptp); 2467 } 2468 } 2469 2470 /* 2471 * pmap_check_ptps: verify that none of the pmap's page table objects 2472 * have any pages allocated to them. 
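 *
 * (An aside on the lifecycle functions in this section: a pmap is
 * reference counted through pm_obj[0].uo_refs, so a caller that needs
 * to keep one alive across a blocking operation simply brackets it
 * with
 *
 *	pmap_reference(pmap);
 *	...
 *	pmap_destroy(pmap);
 *
 * as pmap_load() and pmap_pp_remove() do below.)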
2473 */ 2474 2475 static inline void 2476 pmap_check_ptps(struct pmap *pmap) 2477 { 2478 int i; 2479 2480 for (i = 0; i < PTP_LEVELS - 1; i++) { 2481 KASSERT(pmap->pm_obj[i].uo_npages == 0); 2482 KASSERT(TAILQ_EMPTY(&pmap->pm_obj[i].memq)); 2483 } 2484 } 2485 2486 static inline void 2487 pmap_check_inuse(struct pmap *pmap) 2488 { 2489 #ifdef DIAGNOSTIC 2490 CPU_INFO_ITERATOR cii; 2491 struct cpu_info *ci; 2492 2493 for (CPU_INFO_FOREACH(cii, ci)) { 2494 if (ci->ci_pmap == pmap) 2495 panic("destroying pmap being used"); 2496 #if defined(XEN) && defined(__x86_64__) 2497 for (int i = 0; i < PDIR_SLOT_PTE; i++) { 2498 if (pmap->pm_pdir[i] != 0 && 2499 ci->ci_kpm_pdir[i] == pmap->pm_pdir[i]) { 2500 printf("pmap_destroy(%p) pmap_kernel %p " 2501 "curcpu %d cpu %d ci_pmap %p " 2502 "ci->ci_kpm_pdir[%d]=%" PRIx64 2503 " pmap->pm_pdir[%d]=%" PRIx64 "\n", 2504 pmap, pmap_kernel(), curcpu()->ci_index, 2505 ci->ci_index, ci->ci_pmap, 2506 i, ci->ci_kpm_pdir[i], 2507 i, pmap->pm_pdir[i]); 2508 panic("%s: used pmap", __func__); 2509 } 2510 } 2511 #endif 2512 } 2513 #endif /* DIAGNOSTIC */ 2514 } 2515 2516 /* 2517 * pmap_destroy: drop reference count on pmap. free pmap if 2518 * reference count goes to zero. 2519 */ 2520 2521 void 2522 pmap_destroy(struct pmap *pmap) 2523 { 2524 lwp_t *l; 2525 int i; 2526 2527 /* 2528 * If we have torn down this pmap, process deferred frees and 2529 * invalidations. Free now if the system is low on memory. 2530 * Otherwise, free when the pmap is destroyed thus avoiding a 2531 * TLB shootdown. 2532 */ 2533 l = curlwp; 2534 if (__predict_false(l->l_md.md_gc_pmap == pmap)) { 2535 pmap_check_ptps(pmap); 2536 if (uvmexp.free < uvmexp.freetarg) { 2537 pmap_update(pmap); 2538 } else { 2539 KASSERT(pmap->pm_gc_ptp == NULL); 2540 pmap->pm_gc_ptp = l->l_md.md_gc_ptp; 2541 l->l_md.md_gc_ptp = NULL; 2542 l->l_md.md_gc_pmap = NULL; 2543 } 2544 } 2545 2546 /* 2547 * drop reference count 2548 */ 2549 2550 if (atomic_dec_uint_nv(&pmap->pm_obj[0].uo_refs) > 0) { 2551 return; 2552 } 2553 2554 pmap_check_inuse(pmap); 2555 2556 /* 2557 * Reference count is zero, free pmap resources and then free pmap. 2558 * First, remove it from global list of pmaps. 2559 */ 2560 2561 mutex_enter(&pmaps_lock); 2562 LIST_REMOVE(pmap, pm_list); 2563 mutex_exit(&pmaps_lock); 2564 2565 /* 2566 * Process deferred PTP frees. No TLB shootdown required, as the 2567 * PTP pages are no longer visible to any CPU. 2568 */ 2569 2570 pmap_free_ptps(pmap->pm_gc_ptp); 2571 2572 pool_cache_put(&pmap_pdp_cache, pmap->pm_pdir); 2573 2574 #ifdef USER_LDT 2575 if (pmap->pm_ldt != NULL) { 2576 /* 2577 * no need to switch the LDT; this address space is gone, 2578 * nothing is using it. 2579 * 2580 * No need to lock the pmap for ldt_free (or anything else), 2581 * we're the last one to use it. 2582 */ 2583 mutex_enter(&cpu_lock); 2584 ldt_free(pmap->pm_ldt_sel); 2585 mutex_exit(&cpu_lock); 2586 uvm_km_free(kernel_map, (vaddr_t)pmap->pm_ldt, 2587 pmap->pm_ldt_len, UVM_KMF_WIRED); 2588 } 2589 #endif 2590 2591 for (i = 0; i < PTP_LEVELS - 1; i++) { 2592 uvm_obj_destroy(&pmap->pm_obj[i], false); 2593 mutex_destroy(&pmap->pm_obj_lock[i]); 2594 } 2595 kcpuset_destroy(pmap->pm_cpus); 2596 kcpuset_destroy(pmap->pm_kernel_cpus); 2597 #ifdef XEN 2598 kcpuset_destroy(pmap->pm_xen_ptp_cpus); 2599 #endif 2600 2601 pmap_check_ptps(pmap); 2602 pool_cache_put(&pmap_cache, pmap); 2603 } 2604 2605 /* 2606 * pmap_remove_all: pmap is being torn down by the current thread. 2607 * avoid unnecessary invalidations. 
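 *
 * This only records the dying pmap in curlwp->l_md.md_gc_pmap; the
 * deferred-free logic in pmap_destroy() above keys off that field.
 * The intended teardown sequence is, roughly,
 *
 *	pmap_remove_all(pmap);
 *	pmap_remove(pmap, sva, eva);	(repeated for each map entry)
 *	pmap_update(pmap);
 *	pmap_destroy(pmap);
 *
 * so the PTPs can be freed in one batch rather than with a TLB
 * shootdown per removal.  (A sketch pieced together from this file;
 * the actual callers live in the MI VM code.)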
2608 */ 2609 2610 void 2611 pmap_remove_all(struct pmap *pmap) 2612 { 2613 lwp_t *l = curlwp; 2614 2615 KASSERT(l->l_md.md_gc_pmap == NULL); 2616 2617 l->l_md.md_gc_pmap = pmap; 2618 } 2619 2620 #if defined(PMAP_FORK) 2621 /* 2622 * pmap_fork: perform any necessary data structure manipulation when 2623 * a VM space is forked. 2624 */ 2625 2626 void 2627 pmap_fork(struct pmap *pmap1, struct pmap *pmap2) 2628 { 2629 #ifdef USER_LDT 2630 union descriptor *new_ldt; 2631 size_t len; 2632 int sel; 2633 2634 if (__predict_true(pmap1->pm_ldt == NULL)) { 2635 return; 2636 } 2637 2638 /* 2639 * Copy the LDT into the new process. 2640 * 2641 * Read pmap1's ldt pointer and length unlocked; if it changes 2642 * behind our back we'll retry. This will starve if there's a 2643 * stream of LDT changes in another thread but that should not 2644 * happen. 2645 */ 2646 2647 retry: 2648 if (pmap1->pm_ldt != NULL) { 2649 len = pmap1->pm_ldt_len; 2650 /* Allocate space for the new process's LDT */ 2651 new_ldt = (union descriptor *)uvm_km_alloc(kernel_map, len, 0, 2652 UVM_KMF_WIRED); 2653 if (new_ldt == NULL) { 2654 printf("WARNING: %s: unable to allocate LDT space\n", 2655 __func__); 2656 return; 2657 } 2658 mutex_enter(&cpu_lock); 2659 /* Get a GDT slot for it */ 2660 sel = ldt_alloc(new_ldt, len); 2661 if (sel == -1) { 2662 mutex_exit(&cpu_lock); 2663 uvm_km_free(kernel_map, (vaddr_t)new_ldt, len, 2664 UVM_KMF_WIRED); 2665 printf("WARNING: %s: unable to allocate LDT selector\n", 2666 __func__); 2667 return; 2668 } 2669 } else { 2670 /* Wasn't anything there after all. */ 2671 len = -1; 2672 new_ldt = NULL; 2673 sel = -1; 2674 mutex_enter(&cpu_lock); 2675 } 2676 2677 /* If there's still something there now that we have cpu_lock... */ 2678 if (pmap1->pm_ldt != NULL) { 2679 if (len != pmap1->pm_ldt_len) { 2680 /* Oops, it changed. Drop what we did and try again */ 2681 if (len != -1) { 2682 ldt_free(sel); 2683 uvm_km_free(kernel_map, (vaddr_t)new_ldt, 2684 len, UVM_KMF_WIRED); 2685 } 2686 mutex_exit(&cpu_lock); 2687 goto retry; 2688 } 2689 2690 /* Copy the LDT data and install it in pmap2 */ 2691 memcpy(new_ldt, pmap1->pm_ldt, len); 2692 pmap2->pm_ldt = new_ldt; 2693 pmap2->pm_ldt_len = pmap1->pm_ldt_len; 2694 pmap2->pm_ldt_sel = sel; 2695 len = -1; 2696 } 2697 2698 if (len != -1) { 2699 /* There wasn't still something there, so mop up */ 2700 ldt_free(sel); 2701 mutex_exit(&cpu_lock); 2702 uvm_km_free(kernel_map, (vaddr_t)new_ldt, len, 2703 UVM_KMF_WIRED); 2704 } else { 2705 mutex_exit(&cpu_lock); 2706 } 2707 #endif /* USER_LDT */ 2708 } 2709 #endif /* PMAP_FORK */ 2710 2711 #ifdef USER_LDT 2712 2713 /* 2714 * pmap_ldt_xcall: cross call used by pmap_ldt_sync. if the named pmap 2715 * is active, reload LDTR. 2716 */ 2717 static void 2718 pmap_ldt_xcall(void *arg1, void *arg2) 2719 { 2720 struct pmap *pm; 2721 2722 kpreempt_disable(); 2723 pm = arg1; 2724 if (curcpu()->ci_pmap == pm) { 2725 lldt(pm->pm_ldt_sel); 2726 } 2727 kpreempt_enable(); 2728 } 2729 2730 /* 2731 * pmap_ldt_sync: LDT selector for the named pmap is changing. swap 2732 * in the new selector on all CPUs. 2733 */ 2734 void 2735 pmap_ldt_sync(struct pmap *pm) 2736 { 2737 uint64_t where; 2738 2739 KASSERT(mutex_owned(&cpu_lock)); 2740 2741 pmap_ldt_evcnt.ev_count++; 2742 where = xc_broadcast(0, pmap_ldt_xcall, pm, NULL); 2743 xc_wait(where); 2744 } 2745 2746 /* 2747 * pmap_ldt_cleanup: if the pmap has a local LDT, deallocate it, and 2748 * restore the default. 
2749 */ 2750 2751 void 2752 pmap_ldt_cleanup(struct lwp *l) 2753 { 2754 pmap_t pmap = l->l_proc->p_vmspace->vm_map.pmap; 2755 union descriptor *dp = NULL; 2756 size_t len = 0; 2757 int sel = -1; 2758 2759 if (__predict_true(pmap->pm_ldt == NULL)) { 2760 return; 2761 } 2762 2763 mutex_enter(&cpu_lock); 2764 if (pmap->pm_ldt != NULL) { 2765 sel = pmap->pm_ldt_sel; 2766 dp = pmap->pm_ldt; 2767 len = pmap->pm_ldt_len; 2768 pmap->pm_ldt_sel = GSYSSEL(GLDT_SEL, SEL_KPL); 2769 pmap->pm_ldt = NULL; 2770 pmap->pm_ldt_len = 0; 2771 pmap_ldt_sync(pmap); 2772 ldt_free(sel); 2773 uvm_km_free(kernel_map, (vaddr_t)dp, len, UVM_KMF_WIRED); 2774 } 2775 mutex_exit(&cpu_lock); 2776 } 2777 #endif /* USER_LDT */ 2778 2779 /* 2780 * pmap_activate: activate a process' pmap 2781 * 2782 * => must be called with kernel preemption disabled 2783 * => if lwp is the curlwp, then set ci_want_pmapload so that 2784 * actual MMU context switch will be done by pmap_load() later 2785 */ 2786 2787 void 2788 pmap_activate(struct lwp *l) 2789 { 2790 struct cpu_info *ci; 2791 struct pmap *pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map); 2792 2793 KASSERT(kpreempt_disabled()); 2794 2795 ci = curcpu(); 2796 2797 if (l != ci->ci_curlwp) 2798 return; 2799 2800 KASSERT(ci->ci_want_pmapload == 0); 2801 KASSERT(ci->ci_tlbstate != TLBSTATE_VALID); 2802 2803 /* 2804 * no need to switch to kernel vmspace because 2805 * it's a subset of any vmspace. 2806 */ 2807 2808 if (pmap == pmap_kernel()) { 2809 ci->ci_want_pmapload = 0; 2810 return; 2811 } 2812 2813 ci->ci_want_pmapload = 1; 2814 } 2815 2816 #if defined(XEN) && defined(__x86_64__) 2817 #define KASSERT_PDIRPA(pmap) \ 2818 KASSERT(pmap_pdirpa(pmap, 0) == ci->ci_xen_current_user_pgd || \ 2819 pmap == pmap_kernel()) 2820 #elif defined(PAE) 2821 #define KASSERT_PDIRPA(pmap) \ 2822 KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(ci->ci_pae_l3_pdir[0])) 2823 #elif !defined(XEN) 2824 #define KASSERT_PDIRPA(pmap) \ 2825 KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(rcr3())) 2826 #else 2827 #define KASSERT_PDIRPA(pmap) KASSERT(true) /* nothing to do */ 2828 #endif 2829 2830 /* 2831 * pmap_reactivate: try to regain reference to the pmap. 2832 * 2833 * => Must be called with kernel preemption disabled. 2834 */ 2835 2836 static void 2837 pmap_reactivate(struct pmap *pmap) 2838 { 2839 struct cpu_info * const ci = curcpu(); 2840 const cpuid_t cid = cpu_index(ci); 2841 2842 KASSERT(kpreempt_disabled()); 2843 KASSERT_PDIRPA(pmap); 2844 2845 /* 2846 * If we still have a lazy reference to this pmap, we can assume 2847 * that there was no TLB shootdown for this pmap in the meantime. 2848 * 2849 * The order of events here is important as we must synchronize 2850 * with TLB shootdown interrupts. Declare interest in invalidations 2851 * (TLBSTATE_VALID) and then check the CPU set, which the IPIs can 2852 * change only when the state is TLBSTATE_LAZY. 2853 */ 2854 2855 ci->ci_tlbstate = TLBSTATE_VALID; 2856 KASSERT(kcpuset_isset(pmap->pm_kernel_cpus, cid)); 2857 2858 if (kcpuset_isset(pmap->pm_cpus, cid)) { 2859 /* We have the reference, state is valid. */ 2860 } else { 2861 /* 2862 * Must reload the TLB, pmap has been changed during 2863 * deactivated. 2864 */ 2865 kcpuset_atomic_set(pmap->pm_cpus, cid); 2866 2867 tlbflush(); 2868 } 2869 } 2870 2871 /* 2872 * pmap_load: perform the actual pmap switch, i.e. fill in %cr3 register 2873 * and relevant LDT info. 2874 * 2875 * Ensures that the current process' pmap is loaded on the current CPU's 2876 * MMU and that there are no stale TLB entries. 
2877 * 2878 * => The caller should disable kernel preemption or do check-and-retry 2879 * to prevent a preemption from undoing our efforts. 2880 * => This function may block. 2881 */ 2882 void 2883 pmap_load(void) 2884 { 2885 struct cpu_info *ci; 2886 struct pmap *pmap, *oldpmap; 2887 struct lwp *l; 2888 struct pcb *pcb; 2889 cpuid_t cid; 2890 uint64_t ncsw; 2891 2892 kpreempt_disable(); 2893 retry: 2894 ci = curcpu(); 2895 if (!ci->ci_want_pmapload) { 2896 kpreempt_enable(); 2897 return; 2898 } 2899 l = ci->ci_curlwp; 2900 ncsw = l->l_ncsw; 2901 2902 /* should be able to take ipis. */ 2903 KASSERT(ci->ci_ilevel < IPL_HIGH); 2904 #ifdef XEN 2905 /* Check to see if interrupts are enabled (ie; no events are masked) */ 2906 KASSERT(x86_read_psl() == 0); 2907 #else 2908 KASSERT((x86_read_psl() & PSL_I) != 0); 2909 #endif 2910 2911 KASSERT(l != NULL); 2912 pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map); 2913 KASSERT(pmap != pmap_kernel()); 2914 oldpmap = ci->ci_pmap; 2915 pcb = lwp_getpcb(l); 2916 2917 if (pmap == oldpmap) { 2918 pmap_reactivate(pmap); 2919 ci->ci_want_pmapload = 0; 2920 kpreempt_enable(); 2921 return; 2922 } 2923 2924 /* 2925 * Acquire a reference to the new pmap and perform the switch. 2926 */ 2927 2928 pmap_reference(pmap); 2929 2930 cid = cpu_index(ci); 2931 kcpuset_atomic_clear(oldpmap->pm_cpus, cid); 2932 kcpuset_atomic_clear(oldpmap->pm_kernel_cpus, cid); 2933 2934 KASSERT_PDIRPA(oldpmap); 2935 KASSERT(!kcpuset_isset(pmap->pm_cpus, cid)); 2936 KASSERT(!kcpuset_isset(pmap->pm_kernel_cpus, cid)); 2937 2938 /* 2939 * Mark the pmap in use by this CPU. Again, we must synchronize 2940 * with TLB shootdown interrupts, so set the state VALID first, 2941 * then register us for shootdown events on this pmap. 2942 */ 2943 ci->ci_tlbstate = TLBSTATE_VALID; 2944 kcpuset_atomic_set(pmap->pm_cpus, cid); 2945 kcpuset_atomic_set(pmap->pm_kernel_cpus, cid); 2946 ci->ci_pmap = pmap; 2947 2948 /* 2949 * update tss. now that we have registered for invalidations 2950 * from other CPUs, we're good to load the page tables. 2951 */ 2952 #ifdef PAE 2953 pcb->pcb_cr3 = ci->ci_pae_l3_pdirpa; 2954 #else 2955 pcb->pcb_cr3 = pmap_pdirpa(pmap, 0); 2956 #endif 2957 2958 #ifdef i386 2959 #ifndef XEN 2960 ci->ci_tss->tss.tss_ldt = pmap->pm_ldt_sel; 2961 ci->ci_tss->tss.tss_cr3 = pcb->pcb_cr3; 2962 #endif /* !XEN */ 2963 #endif /* i386 */ 2964 2965 lldt(pmap->pm_ldt_sel); 2966 2967 cpu_load_pmap(pmap, oldpmap); 2968 2969 ci->ci_want_pmapload = 0; 2970 2971 /* 2972 * we're now running with the new pmap. drop the reference 2973 * to the old pmap. if we block, we need to go around again. 2974 */ 2975 2976 pmap_destroy(oldpmap); 2977 if (l->l_ncsw != ncsw) { 2978 goto retry; 2979 } 2980 2981 kpreempt_enable(); 2982 } 2983 2984 /* 2985 * pmap_deactivate: deactivate a process' pmap. 2986 * 2987 * => Must be called with kernel preemption disabled (high IPL is enough). 2988 */ 2989 void 2990 pmap_deactivate(struct lwp *l) 2991 { 2992 struct pmap *pmap; 2993 struct cpu_info *ci; 2994 2995 KASSERT(kpreempt_disabled()); 2996 2997 if (l != curlwp) { 2998 return; 2999 } 3000 3001 /* 3002 * Wait for pending TLB shootdowns to complete. Necessary because 3003 * TLB shootdown state is per-CPU, and the LWP may be coming off 3004 * the CPU before it has a chance to call pmap_update(), e.g. due 3005 * to kernel preemption or blocking routine in between. 
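 *
 * (For reference, the per-CPU state machine used here and by
 * pmap_reactivate()/pmap_load() above is, in short:
 *
 *	TLBSTATE_VALID	pmap loaded, CPU set in pm_cpus, shootdowns
 *			are serviced normally
 *	TLBSTATE_LAZY	pmap still loaded but not in use; a shootdown
 *			IPI may simply drop the CPU from pm_cpus
 *
 * pmap_reactivate() flips LAZY back to VALID and only needs a full
 * tlbflush() if the CPU was dropped from pm_cpus in the meantime.)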
3006 */ 3007 pmap_tlb_shootnow(); 3008 3009 ci = curcpu(); 3010 3011 if (ci->ci_want_pmapload) { 3012 /* 3013 * ci_want_pmapload means that our pmap is not loaded on 3014 * the CPU or TLB might be stale. note that pmap_kernel() 3015 * is always considered loaded. 3016 */ 3017 KASSERT(vm_map_pmap(&l->l_proc->p_vmspace->vm_map) 3018 != pmap_kernel()); 3019 KASSERT(vm_map_pmap(&l->l_proc->p_vmspace->vm_map) 3020 != ci->ci_pmap || ci->ci_tlbstate != TLBSTATE_VALID); 3021 3022 /* 3023 * userspace has not been touched. 3024 * nothing to do here. 3025 */ 3026 3027 ci->ci_want_pmapload = 0; 3028 return; 3029 } 3030 3031 pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map); 3032 3033 if (pmap == pmap_kernel()) { 3034 return; 3035 } 3036 3037 KASSERT_PDIRPA(pmap); 3038 KASSERT(ci->ci_pmap == pmap); 3039 3040 /* 3041 * we aren't interested in TLB invalidations for this pmap, 3042 * at least for the time being. 3043 */ 3044 3045 KASSERT(ci->ci_tlbstate == TLBSTATE_VALID); 3046 ci->ci_tlbstate = TLBSTATE_LAZY; 3047 } 3048 3049 /* 3050 * end of lifecycle functions 3051 */ 3052 3053 /* 3054 * some misc. functions 3055 */ 3056 3057 int 3058 pmap_pdes_invalid(vaddr_t va, pd_entry_t * const *pdes, pd_entry_t *lastpde) 3059 { 3060 int i; 3061 unsigned long index; 3062 pd_entry_t pde; 3063 3064 for (i = PTP_LEVELS; i > 1; i--) { 3065 index = pl_i(va, i); 3066 pde = pdes[i - 2][index]; 3067 if ((pde & PG_V) == 0) 3068 return i; 3069 } 3070 if (lastpde != NULL) 3071 *lastpde = pde; 3072 return 0; 3073 } 3074 3075 /* 3076 * pmap_extract: extract a PA for the given VA 3077 */ 3078 3079 bool 3080 pmap_extract(struct pmap *pmap, vaddr_t va, paddr_t *pap) 3081 { 3082 pt_entry_t *ptes, pte; 3083 pd_entry_t pde; 3084 pd_entry_t * const *pdes; 3085 struct pmap *pmap2; 3086 struct cpu_info *ci; 3087 paddr_t pa; 3088 lwp_t *l; 3089 bool hard, rv; 3090 3091 #ifdef __HAVE_DIRECT_MAP 3092 if (va >= PMAP_DIRECT_BASE && va < PMAP_DIRECT_END) { 3093 if (pap != NULL) { 3094 *pap = PMAP_DIRECT_UNMAP(va); 3095 } 3096 return true; 3097 } 3098 #endif 3099 3100 rv = false; 3101 pa = 0; 3102 l = curlwp; 3103 3104 kpreempt_disable(); 3105 ci = l->l_cpu; 3106 if (__predict_true(!ci->ci_want_pmapload && ci->ci_pmap == pmap) || 3107 pmap == pmap_kernel()) { 3108 /* 3109 * no need to lock, because it's pmap_kernel() or our 3110 * own pmap and is active. if a user pmap, the caller 3111 * will hold the vm_map write/read locked and so prevent 3112 * entries from disappearing while we are here. ptps 3113 * can disappear via pmap_remove() and pmap_protect(), 3114 * but they are called with the vm_map write locked. 3115 */ 3116 hard = false; 3117 ptes = PTE_BASE; 3118 pdes = normal_pdes; 3119 } else { 3120 /* we lose, do it the hard way. */ 3121 hard = true; 3122 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); 3123 } 3124 if (pmap_pdes_valid(va, pdes, &pde)) { 3125 pte = ptes[pl1_i(va)]; 3126 if (pde & PG_PS) { 3127 pa = (pde & PG_LGFRAME) | (va & (NBPD_L2 - 1)); 3128 rv = true; 3129 } else if (__predict_true((pte & PG_V) != 0)) { 3130 pa = pmap_pte2pa(pte) | (va & (NBPD_L1 - 1)); 3131 rv = true; 3132 } 3133 } 3134 if (__predict_false(hard)) { 3135 pmap_unmap_ptes(pmap, pmap2); 3136 } 3137 kpreempt_enable(); 3138 if (pap != NULL) { 3139 *pap = pa; 3140 } 3141 return rv; 3142 } 3143 3144 3145 /* 3146 * vtophys: virtual address to physical address. For use by 3147 * machine-dependent code only. 
3148 */ 3149 3150 paddr_t 3151 vtophys(vaddr_t va) 3152 { 3153 paddr_t pa; 3154 3155 if (pmap_extract(pmap_kernel(), va, &pa) == true) 3156 return (pa); 3157 return (0); 3158 } 3159 3160 __strict_weak_alias(pmap_extract_ma, pmap_extract); 3161 3162 #ifdef XEN 3163 3164 /* 3165 * vtomach: virtual address to machine address. For use by 3166 * machine-dependent code only. 3167 */ 3168 3169 paddr_t 3170 vtomach(vaddr_t va) 3171 { 3172 paddr_t pa; 3173 3174 if (pmap_extract_ma(pmap_kernel(), va, &pa) == true) 3175 return (pa); 3176 return (0); 3177 } 3178 3179 #endif /* XEN */ 3180 3181 /* 3182 * pmap_virtual_space: used during bootup [pmap_steal_memory] to 3183 * determine the bounds of the kernel virtual addess space. 3184 */ 3185 3186 void 3187 pmap_virtual_space(vaddr_t *startp, vaddr_t *endp) 3188 { 3189 *startp = virtual_avail; 3190 *endp = virtual_end; 3191 } 3192 3193 /* 3194 * pmap_zero_page: zero a page 3195 */ 3196 3197 void 3198 pmap_zero_page(paddr_t pa) 3199 { 3200 #if defined(__HAVE_DIRECT_MAP) 3201 pagezero(PMAP_DIRECT_MAP(pa)); 3202 #else 3203 #if defined(XEN) 3204 if (XEN_VERSION_SUPPORTED(3, 4)) 3205 xen_pagezero(pa); 3206 #endif 3207 struct cpu_info *ci; 3208 pt_entry_t *zpte; 3209 vaddr_t zerova; 3210 3211 const pd_entry_t pteflags = PG_V | PG_RW | pmap_pg_nx | PG_M | PG_U; 3212 3213 kpreempt_disable(); 3214 3215 ci = curcpu(); 3216 zerova = ci->vpage[VPAGE_ZER]; 3217 zpte = ci->vpage_pte[VPAGE_ZER]; 3218 3219 KASSERTMSG(!*zpte, "pmap_zero_page: lock botch"); 3220 3221 pmap_pte_set(zpte, pmap_pa2pte(pa) | pteflags); 3222 pmap_pte_flush(); 3223 pmap_update_pg(zerova); /* flush TLB */ 3224 3225 memset((void *)zerova, 0, PAGE_SIZE); 3226 3227 #if defined(DIAGNOSTIC) || defined(XEN) 3228 pmap_pte_set(zpte, 0); /* zap ! */ 3229 pmap_pte_flush(); 3230 #endif 3231 3232 kpreempt_enable(); 3233 #endif /* defined(__HAVE_DIRECT_MAP) */ 3234 } 3235 3236 /* 3237 * pmap_pagezeroidle: the same, for the idle loop page zero'er. 3238 * Returns true if the page was zero'd, false if we aborted for 3239 * some reason. 3240 */ 3241 3242 bool 3243 pmap_pageidlezero(paddr_t pa) 3244 { 3245 #ifdef __HAVE_DIRECT_MAP 3246 KASSERT(cpu_feature[0] & CPUID_SSE2); 3247 return sse2_idlezero_page((void *)PMAP_DIRECT_MAP(pa)); 3248 #else 3249 struct cpu_info *ci; 3250 pt_entry_t *zpte; 3251 vaddr_t zerova; 3252 bool rv; 3253 3254 const pd_entry_t pteflags = PG_V | PG_RW | pmap_pg_nx | PG_M | PG_U; 3255 3256 ci = curcpu(); 3257 zerova = ci->vpage[VPAGE_ZER]; 3258 zpte = ci->vpage_pte[VPAGE_ZER]; 3259 3260 KASSERT(cpu_feature[0] & CPUID_SSE2); 3261 KASSERT(*zpte == 0); 3262 3263 pmap_pte_set(zpte, pmap_pa2pte(pa) | pteflags); 3264 pmap_pte_flush(); 3265 pmap_update_pg(zerova); /* flush TLB */ 3266 3267 rv = sse2_idlezero_page((void *)zerova); 3268 3269 #if defined(DIAGNOSTIC) || defined(XEN) 3270 pmap_pte_set(zpte, 0); /* zap ! 
*/ 3271 pmap_pte_flush(); 3272 #endif 3273 3274 return rv; 3275 #endif 3276 } 3277 3278 /* 3279 * pmap_copy_page: copy a page 3280 */ 3281 3282 void 3283 pmap_copy_page(paddr_t srcpa, paddr_t dstpa) 3284 { 3285 #if defined(__HAVE_DIRECT_MAP) 3286 vaddr_t srcva = PMAP_DIRECT_MAP(srcpa); 3287 vaddr_t dstva = PMAP_DIRECT_MAP(dstpa); 3288 3289 memcpy((void *)dstva, (void *)srcva, PAGE_SIZE); 3290 #else 3291 #if defined(XEN) 3292 if (XEN_VERSION_SUPPORTED(3, 4)) { 3293 xen_copy_page(srcpa, dstpa); 3294 return; 3295 } 3296 #endif 3297 struct cpu_info *ci; 3298 pt_entry_t *srcpte, *dstpte; 3299 vaddr_t srcva, dstva; 3300 3301 const pd_entry_t pteflags = PG_V | PG_RW | pmap_pg_nx | PG_U; 3302 3303 kpreempt_disable(); 3304 3305 ci = curcpu(); 3306 srcva = ci->vpage[VPAGE_SRC]; 3307 dstva = ci->vpage[VPAGE_DST]; 3308 srcpte = ci->vpage_pte[VPAGE_SRC]; 3309 dstpte = ci->vpage_pte[VPAGE_DST]; 3310 3311 KASSERT(*srcpte == 0 && *dstpte == 0); 3312 3313 pmap_pte_set(srcpte, pmap_pa2pte(srcpa) | pteflags); 3314 pmap_pte_set(dstpte, pmap_pa2pte(dstpa) | pteflags | PG_M); 3315 pmap_pte_flush(); 3316 pmap_update_pg(srcva); 3317 pmap_update_pg(dstva); 3318 3319 memcpy((void *)dstva, (void *)srcva, PAGE_SIZE); 3320 3321 #if defined(DIAGNOSTIC) || defined(XEN) 3322 pmap_pte_set(srcpte, 0); 3323 pmap_pte_set(dstpte, 0); 3324 pmap_pte_flush(); 3325 #endif 3326 3327 kpreempt_enable(); 3328 #endif /* defined(__HAVE_DIRECT_MAP) */ 3329 } 3330 3331 static pt_entry_t * 3332 pmap_map_ptp(struct vm_page *ptp) 3333 { 3334 #ifdef __HAVE_DIRECT_MAP 3335 return (void *)PMAP_DIRECT_MAP(VM_PAGE_TO_PHYS(ptp)); 3336 #else 3337 struct cpu_info *ci; 3338 pt_entry_t *ptppte; 3339 vaddr_t ptpva; 3340 3341 KASSERT(kpreempt_disabled()); 3342 3343 #ifndef XEN 3344 const pd_entry_t pteflags = PG_V | PG_RW | pmap_pg_nx | PG_U | PG_M; 3345 #else 3346 const pd_entry_t pteflags = PG_V | pmap_pg_nx | PG_U | PG_M; 3347 #endif 3348 3349 ci = curcpu(); 3350 ptpva = ci->vpage[VPAGE_PTP]; 3351 ptppte = ci->vpage_pte[VPAGE_PTP]; 3352 3353 pmap_pte_set(ptppte, pmap_pa2pte(VM_PAGE_TO_PHYS(ptp)) | pteflags); 3354 3355 pmap_pte_flush(); 3356 pmap_update_pg(ptpva); 3357 3358 return (pt_entry_t *)ptpva; 3359 #endif 3360 } 3361 3362 static void 3363 pmap_unmap_ptp(void) 3364 { 3365 #ifndef __HAVE_DIRECT_MAP 3366 #if defined(DIAGNOSTIC) || defined(XEN) 3367 struct cpu_info *ci; 3368 pt_entry_t *pte; 3369 3370 KASSERT(kpreempt_disabled()); 3371 3372 ci = curcpu(); 3373 pte = ci->vpage_pte[VPAGE_PTP]; 3374 3375 if (*pte != 0) { 3376 pmap_pte_set(pte, 0); 3377 pmap_pte_flush(); 3378 } 3379 #endif 3380 #endif 3381 } 3382 3383 static pt_entry_t * 3384 pmap_map_pte(struct pmap *pmap, struct vm_page *ptp, vaddr_t va) 3385 { 3386 3387 KASSERT(kpreempt_disabled()); 3388 if (pmap_is_curpmap(pmap)) { 3389 return &PTE_BASE[pl1_i(va)]; /* (k)vtopte */ 3390 } 3391 KASSERT(ptp != NULL); 3392 return pmap_map_ptp(ptp) + pl1_pi(va); 3393 } 3394 3395 static void 3396 pmap_unmap_pte(void) 3397 { 3398 3399 KASSERT(kpreempt_disabled()); 3400 3401 pmap_unmap_ptp(); 3402 } 3403 3404 /* 3405 * p m a p r e m o v e f u n c t i o n s 3406 * 3407 * functions that remove mappings 3408 */ 3409 3410 /* 3411 * pmap_remove_ptes: remove PTEs from a PTP 3412 * 3413 * => caller must hold pmap's lock 3414 * => PTP must be mapped into KVA 3415 * => PTP should be null if pmap == pmap_kernel() 3416 * => must be called with kernel preemption disabled 3417 * => returns composite pte if at least one page should be shot down 3418 */ 3419 3420 static void 3421 pmap_remove_ptes(struct pmap *pmap, 
struct vm_page *ptp, vaddr_t ptpva, 3422 vaddr_t startva, vaddr_t endva, struct pv_entry **pv_tofree) 3423 { 3424 pt_entry_t *pte = (pt_entry_t *)ptpva; 3425 3426 KASSERT(pmap == pmap_kernel() || mutex_owned(pmap->pm_lock)); 3427 KASSERT(kpreempt_disabled()); 3428 3429 /* 3430 * note that ptpva points to the PTE that maps startva. this may 3431 * or may not be the first PTE in the PTP. 3432 * 3433 * we loop through the PTP while there are still PTEs to look at 3434 * and the wire_count is greater than 1 (because we use the wire_count 3435 * to keep track of the number of real PTEs in the PTP). 3436 */ 3437 while (startva < endva && (ptp == NULL || ptp->wire_count > 1)) { 3438 (void)pmap_remove_pte(pmap, ptp, pte, startva, pv_tofree); 3439 startva += PAGE_SIZE; 3440 pte++; 3441 } 3442 } 3443 3444 3445 /* 3446 * pmap_remove_pte: remove a single PTE from a PTP. 3447 * 3448 * => caller must hold pmap's lock 3449 * => PTP must be mapped into KVA 3450 * => PTP should be null if pmap == pmap_kernel() 3451 * => returns true if we removed a mapping 3452 * => must be called with kernel preemption disabled 3453 */ 3454 static bool 3455 pmap_remove_pte(struct pmap *pmap, struct vm_page *ptp, pt_entry_t *pte, 3456 vaddr_t va, struct pv_entry **pv_tofree) 3457 { 3458 struct pv_entry *pve; 3459 struct vm_page *pg; 3460 struct pmap_page *pp; 3461 pt_entry_t opte; 3462 3463 KASSERT(pmap == pmap_kernel() || mutex_owned(pmap->pm_lock)); 3464 KASSERT(kpreempt_disabled()); 3465 3466 if (!pmap_valid_entry(*pte)) { 3467 /* VA not mapped. */ 3468 return false; 3469 } 3470 3471 /* Atomically save the old PTE and zap it. */ 3472 opte = pmap_pte_testset(pte, 0); 3473 if (!pmap_valid_entry(opte)) { 3474 return false; 3475 } 3476 3477 pmap_exec_account(pmap, va, opte, 0); 3478 pmap_stats_update_bypte(pmap, 0, opte); 3479 3480 if (ptp) { 3481 /* 3482 * Dropping a PTE. Make sure that the PDE is flushed. 3483 */ 3484 ptp->wire_count--; 3485 if (ptp->wire_count <= 1) { 3486 opte |= PG_U; 3487 } 3488 } 3489 3490 if ((opte & PG_U) != 0) { 3491 pmap_tlb_shootdown(pmap, va, opte, TLBSHOOT_REMOVE_PTE); 3492 } 3493 3494 /* 3495 * If we are not on a pv_head list - we are done. 3496 */ 3497 if ((opte & PG_PVLIST) == 0) { 3498 #ifndef DOM0OPS 3499 KASSERTMSG((PHYS_TO_VM_PAGE(pmap_pte2pa(opte)) == NULL), 3500 "managed page without PG_PVLIST for %#"PRIxVADDR, va); 3501 KASSERTMSG((pmap_pv_tracked(pmap_pte2pa(opte)) == NULL), 3502 "pv-tracked page without PG_PVLIST for %#"PRIxVADDR, va); 3503 #endif 3504 return true; 3505 } 3506 3507 if ((pg = PHYS_TO_VM_PAGE(pmap_pte2pa(opte))) != NULL) { 3508 KASSERT(uvm_page_locked_p(pg)); 3509 pp = VM_PAGE_TO_PP(pg); 3510 } else if ((pp = pmap_pv_tracked(pmap_pte2pa(opte))) == NULL) { 3511 paddr_t pa = pmap_pte2pa(opte); 3512 panic("%s: PG_PVLIST with pv-untracked page" 3513 " va = %#"PRIxVADDR"pa = %#"PRIxPADDR" (%#"PRIxPADDR")", 3514 __func__, va, pa, atop(pa)); 3515 } 3516 3517 /* Sync R/M bits. */ 3518 pp->pp_attrs |= opte; 3519 pve = pmap_remove_pv(pp, ptp, va); 3520 3521 if (pve) { 3522 pve->pve_next = *pv_tofree; 3523 *pv_tofree = pve; 3524 } 3525 return true; 3526 } 3527 3528 /* 3529 * pmap_remove: mapping removal function. 
3530 * 3531 * => caller should not be holding any pmap locks 3532 */ 3533 3534 void 3535 pmap_remove(struct pmap *pmap, vaddr_t sva, vaddr_t eva) 3536 { 3537 pt_entry_t *ptes; 3538 pd_entry_t pde; 3539 pd_entry_t * const *pdes; 3540 struct pv_entry *pv_tofree = NULL; 3541 bool result; 3542 int i; 3543 paddr_t ptppa; 3544 vaddr_t blkendva, va = sva; 3545 struct vm_page *ptp; 3546 struct pmap *pmap2; 3547 3548 kpreempt_disable(); 3549 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); /* locks pmap */ 3550 3551 /* 3552 * removing one page? take shortcut function. 3553 */ 3554 3555 if (va + PAGE_SIZE == eva) { 3556 if (pmap_pdes_valid(va, pdes, &pde)) { 3557 3558 /* PA of the PTP */ 3559 ptppa = pmap_pte2pa(pde); 3560 3561 /* Get PTP if non-kernel mapping. */ 3562 if (pmap != pmap_kernel()) { 3563 ptp = pmap_find_ptp(pmap, va, ptppa, 1); 3564 KASSERTMSG(ptp != NULL, 3565 "%s: unmanaged PTP detected", __func__); 3566 } else { 3567 /* Never free kernel PTPs. */ 3568 ptp = NULL; 3569 } 3570 3571 result = pmap_remove_pte(pmap, ptp, 3572 &ptes[pl1_i(va)], va, &pv_tofree); 3573 3574 /* 3575 * if mapping removed and the PTP is no longer 3576 * being used, free it! 3577 */ 3578 3579 if (result && ptp && ptp->wire_count <= 1) 3580 pmap_free_ptp(pmap, ptp, va, ptes, pdes); 3581 } 3582 } else for (/* null */ ; va < eva ; va = blkendva) { 3583 int lvl; 3584 3585 /* determine range of block */ 3586 blkendva = x86_round_pdr(va+1); 3587 if (blkendva > eva) 3588 blkendva = eva; 3589 3590 /* 3591 * Our PTE mappings should never be removed with pmap_remove. 3592 * 3593 * XXXmaxv: still needed? 3594 * 3595 * A long term solution is to move the PTEs out of user address 3596 * space, and into kernel address space. Then we can set 3597 * VM_MAXUSER_ADDRESS to be VM_MAX_ADDRESS. 3598 */ 3599 for (i = 0; i < PDP_SIZE; i++) { 3600 if (pl_i(va, PTP_LEVELS) == PDIR_SLOT_PTE+i) 3601 panic("PTE space accessed"); 3602 } 3603 3604 lvl = pmap_pdes_invalid(va, pdes, &pde); 3605 if (lvl != 0) { 3606 /* 3607 * skip a range corresponding to an invalid pde. 3608 */ 3609 blkendva = (va & ptp_masks[lvl - 1]) + nbpd[lvl - 1]; 3610 continue; 3611 } 3612 3613 /* PA of the PTP */ 3614 ptppa = pmap_pte2pa(pde); 3615 3616 /* Get PTP if non-kernel mapping. */ 3617 if (pmap != pmap_kernel()) { 3618 ptp = pmap_find_ptp(pmap, va, ptppa, 1); 3619 KASSERTMSG(ptp != NULL, "%s: unmanaged PTP detected", 3620 __func__); 3621 } else { 3622 /* Never free kernel PTPs. */ 3623 ptp = NULL; 3624 } 3625 3626 pmap_remove_ptes(pmap, ptp, (vaddr_t)&ptes[pl1_i(va)], va, 3627 blkendva, &pv_tofree); 3628 3629 /* if PTP is no longer being used, free it! */ 3630 if (ptp && ptp->wire_count <= 1) { 3631 pmap_free_ptp(pmap, ptp, va, ptes, pdes); 3632 } 3633 } 3634 pmap_unmap_ptes(pmap, pmap2); /* unlock pmap */ 3635 kpreempt_enable(); 3636 3637 /* Now we free unused PVs */ 3638 if (pv_tofree) 3639 pmap_free_pvs(pv_tofree); 3640 } 3641 3642 /* 3643 * pmap_sync_pv: clear pte bits and return the old value of the pte. 3644 * 3645 * => Caller should disable kernel preemption. 3646 * => issues tlb shootdowns if necessary. 
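 *
 * The PTE update below follows the lockless pattern used throughout
 * this file:
 *
 *	do {
 *		opte = *ptep;
 *		... compute npte from opte ...
 *	} while (pmap_pte_cas(ptep, opte, npte) != opte);
 *
 * i.e. re-read and retry if the hardware or another CPU changed the
 * PTE underneath us (for example by setting PG_U or PG_M).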
3647 */ 3648 3649 static int 3650 pmap_sync_pv(struct pv_pte *pvpte, pt_entry_t expect, int clearbits, 3651 pt_entry_t *optep) 3652 { 3653 struct pmap *pmap; 3654 struct vm_page *ptp; 3655 vaddr_t va; 3656 pt_entry_t *ptep; 3657 pt_entry_t opte; 3658 pt_entry_t npte; 3659 bool need_shootdown; 3660 3661 ptp = pvpte->pte_ptp; 3662 va = pvpte->pte_va; 3663 KASSERT(ptp == NULL || ptp->uobject != NULL); 3664 KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset); 3665 pmap = ptp_to_pmap(ptp); 3666 3667 KASSERT((expect & ~(PG_FRAME | PG_V)) == 0); 3668 KASSERT((expect & PG_V) != 0); 3669 KASSERT(clearbits == ~0 || (clearbits & ~(PG_M | PG_U | PG_RW)) == 0); 3670 KASSERT(kpreempt_disabled()); 3671 3672 ptep = pmap_map_pte(pmap, ptp, va); 3673 do { 3674 opte = *ptep; 3675 KASSERT((opte & (PG_M | PG_U)) != PG_M); 3676 KASSERT((opte & (PG_U | PG_V)) != PG_U); 3677 KASSERT(opte == 0 || (opte & PG_V) != 0); 3678 if ((opte & (PG_FRAME | PG_V)) != expect) { 3679 3680 /* 3681 * we lost a race with a V->P operation like 3682 * pmap_remove(). wait for the competitor 3683 * reflecting pte bits into mp_attrs. 3684 * 3685 * issue a redundant TLB shootdown so that 3686 * we can wait for its completion. 3687 */ 3688 3689 pmap_unmap_pte(); 3690 if (clearbits != 0) { 3691 pmap_tlb_shootdown(pmap, va, 3692 (pmap == pmap_kernel() ? PG_G : 0), 3693 TLBSHOOT_SYNC_PV1); 3694 } 3695 return EAGAIN; 3696 } 3697 3698 /* 3699 * check if there's anything to do on this pte. 3700 */ 3701 3702 if ((opte & clearbits) == 0) { 3703 need_shootdown = false; 3704 break; 3705 } 3706 3707 /* 3708 * we need a shootdown if the pte is cached. (PG_U) 3709 * 3710 * ...unless we are clearing only the PG_RW bit and 3711 * it isn't cached as RW. (PG_M) 3712 */ 3713 3714 need_shootdown = (opte & PG_U) != 0 && 3715 !(clearbits == PG_RW && (opte & PG_M) == 0); 3716 3717 npte = opte & ~clearbits; 3718 3719 /* 3720 * if we need a shootdown anyway, clear PG_U and PG_M. 3721 */ 3722 3723 if (need_shootdown) { 3724 npte &= ~(PG_U | PG_M); 3725 } 3726 KASSERT((npte & (PG_M | PG_U)) != PG_M); 3727 KASSERT((npte & (PG_U | PG_V)) != PG_U); 3728 KASSERT(npte == 0 || (opte & PG_V) != 0); 3729 } while (pmap_pte_cas(ptep, opte, npte) != opte); 3730 3731 if (need_shootdown) { 3732 pmap_tlb_shootdown(pmap, va, opte, TLBSHOOT_SYNC_PV2); 3733 } 3734 pmap_unmap_pte(); 3735 3736 *optep = opte; 3737 return 0; 3738 } 3739 3740 static void 3741 pmap_pp_remove(struct pmap_page *pp, paddr_t pa) 3742 { 3743 struct pv_pte *pvpte; 3744 struct pv_entry *killlist = NULL; 3745 struct vm_page *ptp; 3746 pt_entry_t expect; 3747 int count; 3748 3749 expect = pmap_pa2pte(pa) | PG_V; 3750 count = SPINLOCK_BACKOFF_MIN; 3751 kpreempt_disable(); 3752 startover: 3753 while ((pvpte = pv_pte_first(pp)) != NULL) { 3754 struct pmap *pmap; 3755 struct pv_entry *pve; 3756 pt_entry_t opte; 3757 vaddr_t va; 3758 int error; 3759 3760 /* 3761 * add a reference to the pmap before clearing the pte. 3762 * otherwise the pmap can disappear behind us. 
3763 */ 3764 3765 ptp = pvpte->pte_ptp; 3766 pmap = ptp_to_pmap(ptp); 3767 if (ptp != NULL) { 3768 pmap_reference(pmap); 3769 } 3770 3771 error = pmap_sync_pv(pvpte, expect, ~0, &opte); 3772 if (error == EAGAIN) { 3773 int hold_count; 3774 KERNEL_UNLOCK_ALL(curlwp, &hold_count); 3775 if (ptp != NULL) { 3776 pmap_destroy(pmap); 3777 } 3778 SPINLOCK_BACKOFF(count); 3779 KERNEL_LOCK(hold_count, curlwp); 3780 goto startover; 3781 } 3782 3783 pp->pp_attrs |= opte; 3784 va = pvpte->pte_va; 3785 pve = pmap_remove_pv(pp, ptp, va); 3786 3787 /* update the PTP reference count. free if last reference. */ 3788 if (ptp != NULL) { 3789 struct pmap *pmap2; 3790 pt_entry_t *ptes; 3791 pd_entry_t * const *pdes; 3792 3793 KASSERT(pmap != pmap_kernel()); 3794 3795 pmap_tlb_shootnow(); 3796 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); 3797 pmap_stats_update_bypte(pmap, 0, opte); 3798 ptp->wire_count--; 3799 if (ptp->wire_count <= 1) { 3800 pmap_free_ptp(pmap, ptp, va, ptes, pdes); 3801 } 3802 pmap_unmap_ptes(pmap, pmap2); 3803 pmap_destroy(pmap); 3804 } else { 3805 KASSERT(pmap == pmap_kernel()); 3806 pmap_stats_update_bypte(pmap, 0, opte); 3807 } 3808 3809 if (pve != NULL) { 3810 pve->pve_next = killlist; /* mark it for death */ 3811 killlist = pve; 3812 } 3813 } 3814 pmap_tlb_shootnow(); 3815 kpreempt_enable(); 3816 3817 /* Now free unused pvs. */ 3818 pmap_free_pvs(killlist); 3819 } 3820 3821 /* 3822 * pmap_page_remove: remove a managed vm_page from all pmaps that map it 3823 * 3824 * => R/M bits are sync'd back to attrs 3825 */ 3826 3827 void 3828 pmap_page_remove(struct vm_page *pg) 3829 { 3830 struct pmap_page *pp; 3831 paddr_t pa; 3832 3833 KASSERT(uvm_page_locked_p(pg)); 3834 3835 pp = VM_PAGE_TO_PP(pg); 3836 pa = VM_PAGE_TO_PHYS(pg); 3837 pmap_pp_remove(pp, pa); 3838 } 3839 3840 /* 3841 * pmap_pv_remove: remove an unmanaged pv-tracked page from all pmaps 3842 * that map it 3843 */ 3844 3845 void 3846 pmap_pv_remove(paddr_t pa) 3847 { 3848 struct pmap_page *pp; 3849 3850 pp = pmap_pv_tracked(pa); 3851 if (pp == NULL) 3852 panic("%s: page not pv-tracked: %#"PRIxPADDR, __func__, pa); 3853 pmap_pp_remove(pp, pa); 3854 } 3855 3856 /* 3857 * p m a p a t t r i b u t e f u n c t i o n s 3858 * functions that test/change managed page's attributes 3859 * since a page can be mapped multiple times we must check each PTE that 3860 * maps it by going down the pv lists. 3861 */ 3862 3863 /* 3864 * pmap_test_attrs: test a page's attributes 3865 */ 3866 3867 bool 3868 pmap_test_attrs(struct vm_page *pg, unsigned testbits) 3869 { 3870 struct pmap_page *pp; 3871 struct pv_pte *pvpte; 3872 pt_entry_t expect; 3873 u_int result; 3874 3875 KASSERT(uvm_page_locked_p(pg)); 3876 3877 pp = VM_PAGE_TO_PP(pg); 3878 if ((pp->pp_attrs & testbits) != 0) { 3879 return true; 3880 } 3881 expect = pmap_pa2pte(VM_PAGE_TO_PHYS(pg)) | PG_V; 3882 kpreempt_disable(); 3883 for (pvpte = pv_pte_first(pp); pvpte; pvpte = pv_pte_next(pp, pvpte)) { 3884 pt_entry_t opte; 3885 int error; 3886 3887 if ((pp->pp_attrs & testbits) != 0) { 3888 break; 3889 } 3890 error = pmap_sync_pv(pvpte, expect, 0, &opte); 3891 if (error == 0) { 3892 pp->pp_attrs |= opte; 3893 } 3894 } 3895 result = pp->pp_attrs & testbits; 3896 kpreempt_enable(); 3897 3898 /* 3899 * note that we will exit the for loop with a non-null pve if 3900 * we have found the bits we are testing for. 
3901 */ 3902 3903 return result != 0; 3904 } 3905 3906 static bool 3907 pmap_pp_clear_attrs(struct pmap_page *pp, paddr_t pa, unsigned clearbits) 3908 { 3909 struct pv_pte *pvpte; 3910 u_int result; 3911 pt_entry_t expect; 3912 int count; 3913 3914 expect = pmap_pa2pte(pa) | PG_V; 3915 count = SPINLOCK_BACKOFF_MIN; 3916 kpreempt_disable(); 3917 startover: 3918 for (pvpte = pv_pte_first(pp); pvpte; pvpte = pv_pte_next(pp, pvpte)) { 3919 pt_entry_t opte; 3920 int error; 3921 3922 error = pmap_sync_pv(pvpte, expect, clearbits, &opte); 3923 if (error == EAGAIN) { 3924 int hold_count; 3925 KERNEL_UNLOCK_ALL(curlwp, &hold_count); 3926 SPINLOCK_BACKOFF(count); 3927 KERNEL_LOCK(hold_count, curlwp); 3928 goto startover; 3929 } 3930 pp->pp_attrs |= opte; 3931 } 3932 result = pp->pp_attrs & clearbits; 3933 pp->pp_attrs &= ~clearbits; 3934 pmap_tlb_shootnow(); 3935 kpreempt_enable(); 3936 3937 return result != 0; 3938 } 3939 3940 /* 3941 * pmap_clear_attrs: clear the specified attribute for a page. 3942 * 3943 * => we return true if we cleared one of the bits we were asked to 3944 */ 3945 3946 bool 3947 pmap_clear_attrs(struct vm_page *pg, unsigned clearbits) 3948 { 3949 struct pmap_page *pp; 3950 paddr_t pa; 3951 3952 KASSERT(uvm_page_locked_p(pg)); 3953 3954 pp = VM_PAGE_TO_PP(pg); 3955 pa = VM_PAGE_TO_PHYS(pg); 3956 3957 return pmap_pp_clear_attrs(pp, pa, clearbits); 3958 } 3959 3960 /* 3961 * pmap_pv_clear_attrs: clear the specified attributes for an unmanaged 3962 * pv-tracked page. 3963 */ 3964 3965 bool 3966 pmap_pv_clear_attrs(paddr_t pa, unsigned clearbits) 3967 { 3968 struct pmap_page *pp; 3969 3970 pp = pmap_pv_tracked(pa); 3971 if (pp == NULL) 3972 panic("%s: page not pv-tracked: %#"PRIxPADDR, __func__, pa); 3973 3974 return pmap_pp_clear_attrs(pp, pa, clearbits); 3975 } 3976 3977 /* 3978 * p m a p p r o t e c t i o n f u n c t i o n s 3979 */ 3980 3981 /* 3982 * pmap_page_protect: change the protection of all recorded mappings 3983 * of a managed page 3984 * 3985 * => NOTE: this is an inline function in pmap.h 3986 */ 3987 3988 /* see pmap.h */ 3989 3990 /* 3991 * pmap_pv_protect: change the protection of all recorded mappings 3992 * of an unmanaged pv-tracked page 3993 * 3994 * => NOTE: this is an inline function in pmap.h 3995 */ 3996 3997 /* see pmap.h */ 3998 3999 /* 4000 * pmap_protect: set the protection in of the pages in a pmap 4001 * 4002 * => NOTE: this is an inline function in pmap.h 4003 */ 4004 4005 /* see pmap.h */ 4006 4007 /* 4008 * pmap_write_protect: write-protect pages in a pmap. 4009 * 4010 * Note for Xen-amd64. Xen automatically adds PG_u to the kernel pages, but we 4011 * don't need to remove this bit when re-entering the PTEs here: Xen tracks the 4012 * kernel pages with a reserved bit (_PAGE_GUEST_KERNEL), so even if PG_u is 4013 * present the page will still be considered as a kernel page, and the privilege 4014 * separation will be enforced correctly. 4015 */ 4016 void 4017 pmap_write_protect(struct pmap *pmap, vaddr_t sva, vaddr_t eva, vm_prot_t prot) 4018 { 4019 pt_entry_t bit_rem, bit_put; 4020 pt_entry_t *ptes; 4021 pt_entry_t * const *pdes; 4022 struct pmap *pmap2; 4023 vaddr_t blockend, va; 4024 4025 KASSERT(curlwp->l_md.md_gc_pmap != pmap); 4026 4027 bit_rem = 0; 4028 if (!(prot & VM_PROT_WRITE)) 4029 bit_rem = PG_RW; 4030 4031 bit_put = 0; 4032 if (!(prot & VM_PROT_EXECUTE)) 4033 bit_put = pmap_pg_nx; 4034 4035 sva &= PG_FRAME; 4036 eva &= PG_FRAME; 4037 4038 /* Acquire pmap. 
*/ 4039 kpreempt_disable(); 4040 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); 4041 4042 for (va = sva ; va < eva; va = blockend) { 4043 pt_entry_t *spte, *epte; 4044 int i; 4045 4046 blockend = x86_round_pdr(va + 1); 4047 if (blockend > eva) 4048 blockend = eva; 4049 4050 /* 4051 * Our PTE mappings should never be write-protected. 4052 * 4053 * XXXmaxv: still needed? 4054 * 4055 * A long term solution is to move the PTEs out of user address 4056 * space, and into kernel address space. Then we can set 4057 * VM_MAXUSER_ADDRESS to be VM_MAX_ADDRESS. 4058 */ 4059 for (i = 0; i < PDP_SIZE; i++) { 4060 if (pl_i(va, PTP_LEVELS) == PDIR_SLOT_PTE+i) 4061 panic("PTE space accessed"); 4062 } 4063 4064 /* Is it a valid block? */ 4065 if (!pmap_pdes_valid(va, pdes, NULL)) { 4066 continue; 4067 } 4068 KASSERT(va < VM_MAXUSER_ADDRESS || va >= VM_MAX_ADDRESS); 4069 4070 spte = &ptes[pl1_i(va)]; 4071 epte = &ptes[pl1_i(blockend)]; 4072 4073 for (/* */; spte < epte; spte++) { 4074 pt_entry_t opte, npte; 4075 4076 do { 4077 opte = *spte; 4078 if (!pmap_valid_entry(opte)) { 4079 goto next; 4080 } 4081 npte = (opte & ~bit_rem) | bit_put; 4082 } while (pmap_pte_cas(spte, opte, npte) != opte); 4083 4084 if ((opte & PG_M) != 0) { 4085 vaddr_t tva = x86_ptob(spte - ptes); 4086 pmap_tlb_shootdown(pmap, tva, opte, 4087 TLBSHOOT_WRITE_PROTECT); 4088 } 4089 next:; 4090 } 4091 } 4092 4093 /* Release pmap. */ 4094 pmap_unmap_ptes(pmap, pmap2); 4095 kpreempt_enable(); 4096 } 4097 4098 /* 4099 * pmap_unwire: clear the wired bit in the PTE. 4100 * 4101 * => Mapping should already be present. 4102 */ 4103 void 4104 pmap_unwire(struct pmap *pmap, vaddr_t va) 4105 { 4106 pt_entry_t *ptes, *ptep, opte; 4107 pd_entry_t * const *pdes; 4108 struct pmap *pmap2; 4109 4110 /* Acquire pmap. */ 4111 kpreempt_disable(); 4112 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); 4113 4114 if (!pmap_pdes_valid(va, pdes, NULL)) { 4115 panic("%s: invalid PDE va=%#" PRIxVADDR, __func__, va); 4116 } 4117 4118 ptep = &ptes[pl1_i(va)]; 4119 opte = *ptep; 4120 KASSERT(pmap_valid_entry(opte)); 4121 4122 if (opte & PG_W) { 4123 pt_entry_t npte = opte & ~PG_W; 4124 4125 opte = pmap_pte_testset(ptep, npte); 4126 pmap_stats_update_bypte(pmap, npte, opte); 4127 } else { 4128 printf("%s: wiring for pmap %p va %#" PRIxVADDR 4129 "did not change!\n", __func__, pmap, va); 4130 } 4131 4132 /* Release pmap. */ 4133 pmap_unmap_ptes(pmap, pmap2); 4134 kpreempt_enable(); 4135 } 4136 4137 /* 4138 * pmap_copy: copy mappings from one pmap to another 4139 * 4140 * => optional function 4141 * void pmap_copy(dst_pmap, src_pmap, dst_addr, len, src_addr) 4142 */ 4143 4144 /* 4145 * defined as macro in pmap.h 4146 */ 4147 4148 __strict_weak_alias(pmap_enter, pmap_enter_default); 4149 4150 int 4151 pmap_enter_default(pmap_t pmap, vaddr_t va, paddr_t pa, vm_prot_t prot, 4152 u_int flags) 4153 { 4154 return pmap_enter_ma(pmap, va, pa, pa, prot, flags, 0); 4155 } 4156 4157 /* 4158 * pmap_enter: enter a mapping into a pmap 4159 * 4160 * => must be done "now" ... 
no lazy-evaluation 4161 * => we set pmap => pv_head locking 4162 */ 4163 int 4164 pmap_enter_ma(struct pmap *pmap, vaddr_t va, paddr_t ma, paddr_t pa, 4165 vm_prot_t prot, u_int flags, int domid) 4166 { 4167 pt_entry_t *ptes, opte, npte; 4168 pt_entry_t *ptep; 4169 pd_entry_t * const *pdes; 4170 struct vm_page *ptp; 4171 struct vm_page *new_pg, *old_pg; 4172 struct pmap_page *new_pp, *old_pp; 4173 struct pv_entry *old_pve = NULL; 4174 struct pv_entry *new_pve; 4175 struct pv_entry *new_sparepve; 4176 int error; 4177 bool wired = (flags & PMAP_WIRED) != 0; 4178 struct pmap *pmap2; 4179 4180 KASSERT(pmap_initialized); 4181 KASSERT(curlwp->l_md.md_gc_pmap != pmap); 4182 KASSERT(va < VM_MAX_KERNEL_ADDRESS); 4183 KASSERTMSG(va != (vaddr_t)PDP_BASE, "%s: trying to map va=%#" 4184 PRIxVADDR " over PDP!", __func__, va); 4185 KASSERTMSG(va < VM_MIN_KERNEL_ADDRESS || 4186 pmap_valid_entry(pmap->pm_pdir[pl_i(va, PTP_LEVELS)]), 4187 "%s: missing kernel PTP for va=%#" PRIxVADDR, __func__, va); 4188 4189 #ifdef XEN 4190 KASSERT(domid == DOMID_SELF || pa == 0); 4191 #endif /* XEN */ 4192 4193 npte = ma | protection_codes[prot] | PG_V; 4194 npte |= pmap_pat_flags(flags); 4195 if (wired) 4196 npte |= PG_W; 4197 if (va < VM_MAXUSER_ADDRESS) 4198 npte |= PG_u; 4199 else if (va < VM_MAX_ADDRESS) 4200 panic("PTE space accessed"); /* XXXmaxv: no longer needed? */ 4201 4202 if (pmap == pmap_kernel()) 4203 npte |= pmap_pg_g; 4204 if (flags & VM_PROT_ALL) { 4205 npte |= PG_U; 4206 if (flags & VM_PROT_WRITE) { 4207 KASSERT((npte & PG_RW) != 0); 4208 npte |= PG_M; 4209 } 4210 } 4211 4212 #ifdef XEN 4213 if (domid != DOMID_SELF) 4214 new_pg = NULL; 4215 else 4216 #endif 4217 new_pg = PHYS_TO_VM_PAGE(pa); 4218 if (new_pg != NULL) { 4219 /* This is a managed page */ 4220 npte |= PG_PVLIST; 4221 new_pp = VM_PAGE_TO_PP(new_pg); 4222 } else if ((new_pp = pmap_pv_tracked(pa)) != NULL) { 4223 /* This is an unmanaged pv-tracked page */ 4224 npte |= PG_PVLIST; 4225 } else { 4226 new_pp = NULL; 4227 } 4228 4229 /* 4230 * Try to get pves now if we might need them. 4231 * Keep going even if we fail, since we will not actually need them 4232 * if we are just changing the permissions on an existing mapping, 4233 * but we won't know if that's the case until later. 4234 */ 4235 4236 bool needpves = pmap_pp_needs_pve(new_pp); 4237 if (needpves) { 4238 new_pve = pool_cache_get(&pmap_pv_cache, PR_NOWAIT); 4239 new_sparepve = pool_cache_get(&pmap_pv_cache, PR_NOWAIT); 4240 } else { 4241 new_pve = NULL; 4242 new_sparepve = NULL; 4243 } 4244 4245 kpreempt_disable(); 4246 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); /* locks pmap */ 4247 if (pmap == pmap_kernel()) { 4248 ptp = NULL; 4249 } else { 4250 ptp = pmap_get_ptp(pmap, va, pdes, flags); 4251 if (ptp == NULL) { 4252 pmap_unmap_ptes(pmap, pmap2); 4253 if (flags & PMAP_CANFAIL) { 4254 error = ENOMEM; 4255 goto out; 4256 } 4257 panic("%s: get ptp failed", __func__); 4258 } 4259 } 4260 4261 /* 4262 * Check if there is an existing mapping. If we are now sure that 4263 * we need pves and we failed to allocate them earlier, handle that. 4264 * Caching the value of oldpa here is safe because only the mod/ref bits 4265 * can change while the pmap is locked. 
4266 */ 4267 4268 ptep = &ptes[pl1_i(va)]; 4269 opte = *ptep; 4270 bool have_oldpa = pmap_valid_entry(opte); 4271 paddr_t oldpa = pmap_pte2pa(opte); 4272 4273 if (needpves && (!have_oldpa || oldpa != pa) && 4274 (new_pve == NULL || new_sparepve == NULL)) { 4275 pmap_unmap_ptes(pmap, pmap2); 4276 if (flags & PMAP_CANFAIL) { 4277 error = ENOMEM; 4278 goto out; 4279 } 4280 panic("%s: pve allocation failed", __func__); 4281 } 4282 4283 /* 4284 * update the pte. 4285 */ 4286 4287 do { 4288 opte = *ptep; 4289 4290 /* 4291 * if the same page, inherit PG_U and PG_M. 4292 */ 4293 if (((opte ^ npte) & (PG_FRAME | PG_V)) == 0) { 4294 npte |= opte & (PG_U | PG_M); 4295 } 4296 #if defined(XEN) 4297 if (domid != DOMID_SELF) { 4298 /* pmap_pte_cas with error handling */ 4299 int s = splvm(); 4300 if (opte != *ptep) { 4301 splx(s); 4302 continue; 4303 } 4304 error = xpq_update_foreign( 4305 vtomach((vaddr_t)ptep), npte, domid); 4306 splx(s); 4307 if (error) { 4308 if (ptp != NULL && ptp->wire_count <= 1) { 4309 pmap_free_ptp(pmap, ptp, va, ptes, pdes); 4310 } 4311 pmap_unmap_ptes(pmap, pmap2); 4312 goto out; 4313 } 4314 break; 4315 } 4316 #endif /* defined(XEN) */ 4317 } while (pmap_pte_cas(ptep, opte, npte) != opte); 4318 4319 /* 4320 * update statistics and PTP's reference count. 4321 */ 4322 4323 pmap_stats_update_bypte(pmap, npte, opte); 4324 if (ptp != NULL && !have_oldpa) { 4325 ptp->wire_count++; 4326 } 4327 KASSERT(ptp == NULL || ptp->wire_count > 1); 4328 4329 /* 4330 * if the same page, we can skip pv_entry handling. 4331 */ 4332 4333 if (((opte ^ npte) & (PG_FRAME | PG_V)) == 0) { 4334 KASSERT(((opte ^ npte) & PG_PVLIST) == 0); 4335 goto same_pa; 4336 } 4337 4338 /* 4339 * if old page is pv-tracked, remove pv_entry from its list. 4340 */ 4341 4342 if ((~opte & (PG_V | PG_PVLIST)) == 0) { 4343 if ((old_pg = PHYS_TO_VM_PAGE(oldpa)) != NULL) { 4344 KASSERT(uvm_page_locked_p(old_pg)); 4345 old_pp = VM_PAGE_TO_PP(old_pg); 4346 } else if ((old_pp = pmap_pv_tracked(oldpa)) == NULL) { 4347 panic("%s: PG_PVLIST with pv-untracked page" 4348 " va = %#"PRIxVADDR 4349 " pa = %#" PRIxPADDR " (%#" PRIxPADDR ")", 4350 __func__, va, oldpa, atop(pa)); 4351 } 4352 4353 old_pve = pmap_remove_pv(old_pp, ptp, va); 4354 old_pp->pp_attrs |= opte; 4355 } 4356 4357 /* 4358 * if new page is pv-tracked, insert pv_entry into its list. 4359 */ 4360 4361 if (new_pp) { 4362 new_pve = pmap_enter_pv(new_pp, new_pve, &new_sparepve, ptp, va); 4363 } 4364 4365 same_pa: 4366 pmap_unmap_ptes(pmap, pmap2); 4367 4368 /* 4369 * shootdown tlb if necessary. 4370 */ 4371 4372 if ((~opte & (PG_V | PG_U)) == 0 && 4373 ((opte ^ npte) & (PG_FRAME | PG_RW)) != 0) { 4374 pmap_tlb_shootdown(pmap, va, opte, TLBSHOOT_ENTER); 4375 } 4376 4377 error = 0; 4378 out: 4379 kpreempt_enable(); 4380 if (old_pve != NULL) { 4381 pool_cache_put(&pmap_pv_cache, old_pve); 4382 } 4383 if (new_pve != NULL) { 4384 pool_cache_put(&pmap_pv_cache, new_pve); 4385 } 4386 if (new_sparepve != NULL) { 4387 pool_cache_put(&pmap_pv_cache, new_sparepve); 4388 } 4389 4390 return error; 4391 } 4392 4393 static paddr_t 4394 pmap_get_physpage(void) 4395 { 4396 struct vm_page *ptp; 4397 struct pmap *kpm = pmap_kernel(); 4398 paddr_t pa; 4399 4400 if (!uvm.page_init_done) { 4401 /* 4402 * We're growing the kernel pmap early (from 4403 * uvm_pageboot_alloc()). This case must be 4404 * handled a little differently. 
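 * The page allocator is not up yet (uvm.page_init_done is false), so we
 * cannot use uvm_pagealloc(). Instead we steal a physical page with
 * uvm_page_physget() and zero it either through the direct map, through
 * the hypervisor on newer Xen, or through the early_zero_pte scratch
 * mapping.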
4405 */ 4406 4407 if (!uvm_page_physget(&pa)) 4408 panic("%s: out of memory", __func__); 4409 #if defined(__HAVE_DIRECT_MAP) 4410 pagezero(PMAP_DIRECT_MAP(pa)); 4411 #else 4412 #if defined(XEN) 4413 if (XEN_VERSION_SUPPORTED(3, 4)) { 4414 xen_pagezero(pa); 4415 return pa; 4416 } 4417 #endif 4418 kpreempt_disable(); 4419 pmap_pte_set(early_zero_pte, pmap_pa2pte(pa) | PG_V | 4420 PG_RW | pmap_pg_nx); 4421 pmap_pte_flush(); 4422 pmap_update_pg((vaddr_t)early_zerop); 4423 memset(early_zerop, 0, PAGE_SIZE); 4424 #if defined(DIAGNOSTIC) || defined(XEN) 4425 pmap_pte_set(early_zero_pte, 0); 4426 pmap_pte_flush(); 4427 #endif /* defined(DIAGNOSTIC) */ 4428 kpreempt_enable(); 4429 #endif /* defined(__HAVE_DIRECT_MAP) */ 4430 } else { 4431 /* XXX */ 4432 ptp = uvm_pagealloc(NULL, 0, NULL, 4433 UVM_PGA_USERESERVE|UVM_PGA_ZERO); 4434 if (ptp == NULL) 4435 panic("%s: out of memory", __func__); 4436 ptp->flags &= ~PG_BUSY; 4437 ptp->wire_count = 1; 4438 pa = VM_PAGE_TO_PHYS(ptp); 4439 } 4440 pmap_stats_update(kpm, 1, 0); 4441 4442 return pa; 4443 } 4444 4445 /* 4446 * Expand the page tree with the specified amount of PTPs, mapping virtual 4447 * addresses starting at kva. We populate all the levels but the last one 4448 * (L1). The nodes of the tree are created as RWX, but the pages covered 4449 * will be kentered in L1, with proper permissions. 4450 * 4451 * Used only by pmap_growkernel. 4452 */ 4453 static void 4454 pmap_alloc_level(struct pmap *cpm, vaddr_t kva, long *needed_ptps) 4455 { 4456 unsigned long i; 4457 paddr_t pa; 4458 unsigned long index, endindex; 4459 int level; 4460 pd_entry_t *pdep; 4461 #ifdef XEN 4462 int s = splvm(); /* protect xpq_* */ 4463 #endif 4464 4465 for (level = PTP_LEVELS; level > 1; level--) { 4466 if (level == PTP_LEVELS) 4467 pdep = cpm->pm_pdir; 4468 else 4469 pdep = normal_pdes[level - 2]; 4470 index = pl_i_roundup(kva, level); 4471 endindex = index + needed_ptps[level - 1] - 1; 4472 4473 for (i = index; i <= endindex; i++) { 4474 pt_entry_t pte; 4475 4476 KASSERT(!pmap_valid_entry(pdep[i])); 4477 pa = pmap_get_physpage(); 4478 pte = pmap_pa2pte(pa) | PG_V | PG_RW; 4479 pmap_pte_set(&pdep[i], pte); 4480 4481 #if defined(XEN) && (defined(PAE) || defined(__x86_64__)) 4482 if (level == PTP_LEVELS && i >= PDIR_SLOT_KERN) { 4483 if (__predict_true( 4484 cpu_info_primary.ci_flags & CPUF_PRESENT)) { 4485 /* update per-cpu PMDs on all cpus */ 4486 xen_kpm_sync(pmap_kernel(), i); 4487 } else { 4488 /* 4489 * too early; update primary CPU 4490 * PMD only (without locks) 4491 */ 4492 #ifdef PAE 4493 pd_entry_t *cpu_pdep = 4494 &cpu_info_primary.ci_kpm_pdir[l2tol2(i)]; 4495 #endif 4496 #ifdef __x86_64__ 4497 pd_entry_t *cpu_pdep = 4498 &cpu_info_primary.ci_kpm_pdir[i]; 4499 #endif 4500 pmap_pte_set(cpu_pdep, pte); 4501 } 4502 } 4503 #endif /* XEN && (PAE || __x86_64__) */ 4504 4505 KASSERT(level != PTP_LEVELS || nkptp[level - 1] + 4506 pl_i(VM_MIN_KERNEL_ADDRESS, level) == i); 4507 nkptp[level - 1]++; 4508 } 4509 pmap_pte_flush(); 4510 } 4511 #ifdef XEN 4512 splx(s); 4513 #endif 4514 } 4515 4516 /* 4517 * pmap_growkernel: increase usage of KVM space. 4518 * 4519 * => we allocate new PTPs for the kernel and install them in all 4520 * the pmaps on the system. 
4521 */ 4522 4523 vaddr_t 4524 pmap_growkernel(vaddr_t maxkvaddr) 4525 { 4526 struct pmap *kpm = pmap_kernel(); 4527 struct pmap *cpm; 4528 #if !defined(XEN) || !defined(__x86_64__) 4529 struct pmap *pm; 4530 long old; 4531 #endif 4532 int s, i; 4533 long needed_kptp[PTP_LEVELS], target_nptp; 4534 bool invalidate = false; 4535 4536 s = splvm(); /* to be safe */ 4537 mutex_enter(kpm->pm_lock); 4538 4539 if (maxkvaddr <= pmap_maxkvaddr) { 4540 mutex_exit(kpm->pm_lock); 4541 splx(s); 4542 return pmap_maxkvaddr; 4543 } 4544 4545 maxkvaddr = x86_round_pdr(maxkvaddr); 4546 #if !defined(XEN) || !defined(__x86_64__) 4547 old = nkptp[PTP_LEVELS - 1]; 4548 #endif 4549 4550 /* Initialize needed_kptp. */ 4551 for (i = PTP_LEVELS - 1; i >= 1; i--) { 4552 target_nptp = pl_i_roundup(maxkvaddr, i + 1) - 4553 pl_i_roundup(VM_MIN_KERNEL_ADDRESS, i + 1); 4554 4555 if (target_nptp > nkptpmax[i]) 4556 panic("out of KVA space"); 4557 KASSERT(target_nptp >= nkptp[i]); 4558 needed_kptp[i] = target_nptp - nkptp[i]; 4559 } 4560 4561 #if defined(XEN) && (defined(__x86_64__) || defined(PAE)) 4562 /* only pmap_kernel(), or the per-cpu map, has kernel entries */ 4563 cpm = kpm; 4564 #else 4565 /* Get the current pmap */ 4566 if (__predict_true(cpu_info_primary.ci_flags & CPUF_PRESENT)) { 4567 cpm = curcpu()->ci_pmap; 4568 } else { 4569 cpm = kpm; 4570 } 4571 #endif 4572 4573 pmap_alloc_level(cpm, pmap_maxkvaddr, needed_kptp); 4574 4575 /* 4576 * If the number of top level entries changed, update all pmaps. 4577 */ 4578 if (needed_kptp[PTP_LEVELS - 1] != 0) { 4579 #ifdef XEN 4580 #ifdef __x86_64__ 4581 /* nothing, kernel entries are never entered in user pmap */ 4582 #else /* __x86_64__ */ 4583 int pdkidx; 4584 #ifndef PAE 4585 /* 4586 * for PAE this is not needed, because pmap_alloc_level() 4587 * already did update the per-CPU tables 4588 */ 4589 if (cpm != kpm) { 4590 for (pdkidx = PDIR_SLOT_KERN + old; 4591 pdkidx < PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1]; 4592 pdkidx++) { 4593 pmap_pte_set(&kpm->pm_pdir[pdkidx], 4594 cpm->pm_pdir[pdkidx]); 4595 } 4596 pmap_pte_flush(); 4597 } 4598 #endif /* !PAE */ 4599 4600 mutex_enter(&pmaps_lock); 4601 LIST_FOREACH(pm, &pmaps, pm_list) { 4602 for (pdkidx = PDIR_SLOT_KERN + old; 4603 pdkidx < PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1]; 4604 pdkidx++) { 4605 pmap_pte_set(&pm->pm_pdir[pdkidx], 4606 kpm->pm_pdir[pdkidx]); 4607 } 4608 pmap_pte_flush(); 4609 } 4610 mutex_exit(&pmaps_lock); 4611 #endif /* __x86_64__ */ 4612 #else /* XEN */ 4613 size_t newpdes; 4614 newpdes = nkptp[PTP_LEVELS - 1] - old; 4615 if (cpm != kpm) { 4616 memcpy(&kpm->pm_pdir[PDIR_SLOT_KERN + old], 4617 &cpm->pm_pdir[PDIR_SLOT_KERN + old], 4618 newpdes * sizeof(pd_entry_t)); 4619 } 4620 4621 mutex_enter(&pmaps_lock); 4622 LIST_FOREACH(pm, &pmaps, pm_list) { 4623 memcpy(&pm->pm_pdir[PDIR_SLOT_KERN + old], 4624 &kpm->pm_pdir[PDIR_SLOT_KERN + old], 4625 newpdes * sizeof (pd_entry_t)); 4626 } 4627 mutex_exit(&pmaps_lock); 4628 #endif 4629 invalidate = true; 4630 } 4631 pmap_maxkvaddr = maxkvaddr; 4632 mutex_exit(kpm->pm_lock); 4633 splx(s); 4634 4635 if (invalidate && pmap_initialized) { 4636 /* Invalidate the PDP cache. 
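 * Preconstructed PDPs sitting in pmap_pdp_cache were built with the old
 * set of kernel entries and would miss the ones just added, so drop
 * them and let the constructor rebuild them on the next allocation.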
*/ 4637 pool_cache_invalidate(&pmap_pdp_cache); 4638 } 4639 4640 return maxkvaddr; 4641 } 4642 4643 #ifdef DEBUG 4644 void pmap_dump(struct pmap *, vaddr_t, vaddr_t); 4645 4646 /* 4647 * pmap_dump: dump all the mappings from a pmap 4648 * 4649 * => caller should not be holding any pmap locks 4650 */ 4651 4652 void 4653 pmap_dump(struct pmap *pmap, vaddr_t sva, vaddr_t eva) 4654 { 4655 pt_entry_t *ptes, *pte; 4656 pd_entry_t * const *pdes; 4657 struct pmap *pmap2; 4658 vaddr_t blkendva; 4659 4660 /* 4661 * if end is out of range truncate. 4662 * if (end == start) update to max. 4663 */ 4664 4665 if (eva > VM_MAXUSER_ADDRESS || eva <= sva) 4666 eva = VM_MAXUSER_ADDRESS; 4667 4668 /* 4669 * we lock in the pmap => pv_head direction 4670 */ 4671 4672 kpreempt_disable(); 4673 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); /* locks pmap */ 4674 4675 /* 4676 * dumping a range of pages: we dump in PTP sized blocks (4MB) 4677 */ 4678 4679 for (/* null */ ; sva < eva ; sva = blkendva) { 4680 4681 /* determine range of block */ 4682 blkendva = x86_round_pdr(sva+1); 4683 if (blkendva > eva) 4684 blkendva = eva; 4685 4686 /* valid block? */ 4687 if (!pmap_pdes_valid(sva, pdes, NULL)) 4688 continue; 4689 4690 pte = &ptes[pl1_i(sva)]; 4691 for (/* null */; sva < blkendva ; sva += PAGE_SIZE, pte++) { 4692 if (!pmap_valid_entry(*pte)) 4693 continue; 4694 printf("va %#" PRIxVADDR " -> pa %#" PRIxPADDR 4695 " (pte=%#" PRIxPADDR ")\n", 4696 sva, (paddr_t)pmap_pte2pa(*pte), (paddr_t)*pte); 4697 } 4698 } 4699 pmap_unmap_ptes(pmap, pmap2); 4700 kpreempt_enable(); 4701 } 4702 #endif 4703 4704 /* 4705 * pmap_update: process deferred invalidations and frees. 4706 */ 4707 4708 void 4709 pmap_update(struct pmap *pmap) 4710 { 4711 struct vm_page *empty_ptps; 4712 lwp_t *l = curlwp; 4713 4714 /* 4715 * If we have torn down this pmap, invalidate non-global TLB 4716 * entries on any processors using it. 4717 */ 4718 kpreempt_disable(); 4719 if (__predict_false(l->l_md.md_gc_pmap == pmap)) { 4720 l->l_md.md_gc_pmap = NULL; 4721 pmap_tlb_shootdown(pmap, (vaddr_t)-1LL, 0, TLBSHOOT_UPDATE); 4722 } 4723 4724 /* 4725 * Initiate any pending TLB shootdowns. Wait for them to 4726 * complete before returning control to the caller. 4727 */ 4728 pmap_tlb_shootnow(); 4729 kpreempt_enable(); 4730 4731 /* 4732 * Now that shootdowns are complete, process deferred frees, 4733 * but not from interrupt context. 
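 * (Freeing the PTPs goes back into UVM, which is not safe from
 * interrupt context; in that case the pages simply stay queued on the
 * lwp and get freed by a later pmap_update() call made from thread
 * context.)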
4734 */ 4735 if (l->l_md.md_gc_ptp != NULL) { 4736 KASSERT((l->l_pflag & LP_INTR) == 0); 4737 if (cpu_intr_p()) { 4738 return; 4739 } 4740 empty_ptps = l->l_md.md_gc_ptp; 4741 l->l_md.md_gc_ptp = NULL; 4742 pmap_free_ptps(empty_ptps); 4743 } 4744 } 4745 4746 #if PTP_LEVELS > 4 4747 #error "Unsupported number of page table mappings" 4748 #endif 4749 4750 paddr_t 4751 pmap_init_tmp_pgtbl(paddr_t pg) 4752 { 4753 static bool maps_loaded; 4754 static const paddr_t x86_tmp_pml_paddr[] = { 4755 4 * PAGE_SIZE, /* L1 */ 4756 5 * PAGE_SIZE, /* L2 */ 4757 6 * PAGE_SIZE, /* L3 */ 4758 7 * PAGE_SIZE /* L4 */ 4759 }; 4760 static vaddr_t x86_tmp_pml_vaddr[] = { 0, 0, 0, 0 }; 4761 4762 pd_entry_t *tmp_pml, *kernel_pml; 4763 4764 int level; 4765 4766 if (!maps_loaded) { 4767 for (level = 0; level < PTP_LEVELS; ++level) { 4768 x86_tmp_pml_vaddr[level] = 4769 uvm_km_alloc(kernel_map, PAGE_SIZE, 0, 4770 UVM_KMF_VAONLY); 4771 4772 if (x86_tmp_pml_vaddr[level] == 0) 4773 panic("mapping of real mode PML failed\n"); 4774 pmap_kenter_pa(x86_tmp_pml_vaddr[level], 4775 x86_tmp_pml_paddr[level], 4776 VM_PROT_READ | VM_PROT_WRITE, 0); 4777 } 4778 pmap_update(pmap_kernel()); 4779 maps_loaded = true; 4780 } 4781 4782 /* Zero levels 1-3 */ 4783 for (level = 0; level < PTP_LEVELS - 1; ++level) { 4784 tmp_pml = (void *)x86_tmp_pml_vaddr[level]; 4785 memset(tmp_pml, 0, PAGE_SIZE); 4786 } 4787 4788 /* Copy PML4 */ 4789 kernel_pml = pmap_kernel()->pm_pdir; 4790 tmp_pml = (void *)x86_tmp_pml_vaddr[PTP_LEVELS - 1]; 4791 memcpy(tmp_pml, kernel_pml, PAGE_SIZE); 4792 4793 #ifdef PAE 4794 /* 4795 * Use the last 4 entries of the L2 page as L3 PD entries. These 4796 * last entries are unlikely to be used for temporary mappings. 4797 * 508: maps 0->1GB (userland) 4798 * 509: unused 4799 * 510: unused 4800 * 511: maps 3->4GB (kernel) 4801 */ 4802 tmp_pml[508] = x86_tmp_pml_paddr[PTP_LEVELS - 1] | PG_V; 4803 tmp_pml[509] = 0; 4804 tmp_pml[510] = 0; 4805 tmp_pml[511] = pmap_pdirpa(pmap_kernel(), PDIR_SLOT_KERN) | PG_V; 4806 #endif 4807 4808 for (level = PTP_LEVELS - 1; level > 0; --level) { 4809 tmp_pml = (void *)x86_tmp_pml_vaddr[level]; 4810 4811 tmp_pml[pl_i(pg, level + 1)] = 4812 (x86_tmp_pml_paddr[level - 1] & PG_FRAME) | PG_RW | PG_V; 4813 } 4814 4815 tmp_pml = (void *)x86_tmp_pml_vaddr[0]; 4816 tmp_pml[pl_i(pg, 1)] = (pg & PG_FRAME) | PG_RW | PG_V; 4817 4818 #ifdef PAE 4819 /* Return the PA of the L3 page (entry 508 of the L2 page) */ 4820 return x86_tmp_pml_paddr[PTP_LEVELS - 1] + 508 * sizeof(pd_entry_t); 4821 #endif 4822 4823 return x86_tmp_pml_paddr[PTP_LEVELS - 1]; 4824 } 4825 4826 u_int 4827 x86_mmap_flags(paddr_t mdpgno) 4828 { 4829 u_int nflag = (mdpgno >> X86_MMAP_FLAG_SHIFT) & X86_MMAP_FLAG_MASK; 4830 u_int pflag = 0; 4831 4832 if (nflag & X86_MMAP_FLAG_PREFETCH) 4833 pflag |= PMAP_WRITE_COMBINE; 4834 4835 return pflag; 4836 } 4837