1 /* $NetBSD: pmap.c,v 1.289 2018/03/04 23:25:35 jdolecek Exp $ */ 2 3 /* 4 * Copyright (c) 2008, 2010, 2016, 2017 The NetBSD Foundation, Inc. 5 * All rights reserved. 6 * 7 * This code is derived from software contributed to The NetBSD Foundation 8 * by Andrew Doran, and by Maxime Villard. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 29 * POSSIBILITY OF SUCH DAMAGE. 30 */ 31 32 /* 33 * Copyright (c) 2007 Manuel Bouyer. 34 * 35 * Redistribution and use in source and binary forms, with or without 36 * modification, are permitted provided that the following conditions 37 * are met: 38 * 1. Redistributions of source code must retain the above copyright 39 * notice, this list of conditions and the following disclaimer. 40 * 2. Redistributions in binary form must reproduce the above copyright 41 * notice, this list of conditions and the following disclaimer in the 42 * documentation and/or other materials provided with the distribution. 43 * 44 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 45 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 46 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 47 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 48 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 49 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 50 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 51 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 52 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 53 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 54 */ 55 56 /* 57 * Copyright (c) 2006 Mathieu Ropert <mro@adviseo.fr> 58 * 59 * Permission to use, copy, modify, and distribute this software for any 60 * purpose with or without fee is hereby granted, provided that the above 61 * copyright notice and this permission notice appear in all copies. 62 * 63 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 64 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 65 * MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 66 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 67 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 68 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 69 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 70 */ 71 72 /* 73 * Copyright (c) 1997 Charles D. Cranor and Washington University. 74 * All rights reserved. 75 * 76 * Redistribution and use in source and binary forms, with or without 77 * modification, are permitted provided that the following conditions 78 * are met: 79 * 1. Redistributions of source code must retain the above copyright 80 * notice, this list of conditions and the following disclaimer. 81 * 2. Redistributions in binary form must reproduce the above copyright 82 * notice, this list of conditions and the following disclaimer in the 83 * documentation and/or other materials provided with the distribution. 84 * 85 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 86 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 87 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 88 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 89 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 90 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 91 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 92 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 93 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 94 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 95 */ 96 97 /* 98 * Copyright 2001 (c) Wasabi Systems, Inc. 99 * All rights reserved. 100 * 101 * Written by Frank van der Linden for Wasabi Systems, Inc. 102 * 103 * Redistribution and use in source and binary forms, with or without 104 * modification, are permitted provided that the following conditions 105 * are met: 106 * 1. Redistributions of source code must retain the above copyright 107 * notice, this list of conditions and the following disclaimer. 108 * 2. Redistributions in binary form must reproduce the above copyright 109 * notice, this list of conditions and the following disclaimer in the 110 * documentation and/or other materials provided with the distribution. 111 * 3. All advertising materials mentioning features or use of this software 112 * must display the following acknowledgement: 113 * This product includes software developed for the NetBSD Project by 114 * Wasabi Systems, Inc. 115 * 4. The name of Wasabi Systems, Inc. may not be used to endorse 116 * or promote products derived from this software without specific prior 117 * written permission. 118 * 119 * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND 120 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 121 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 122 * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL WASABI SYSTEMS, INC 123 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 124 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 125 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 126 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 127 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 128 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 129 * POSSIBILITY OF SUCH DAMAGE. 130 */ 131 132 /* 133 * This is the i386 pmap modified and generalized to support x86-64 134 * as well. The idea is to hide the upper N levels of the page tables 135 * inside pmap_get_ptp, pmap_free_ptp and pmap_growkernel. The rest 136 * is mostly untouched, except that it uses some more generalized 137 * macros and interfaces. 138 * 139 * This pmap has been tested on the i386 as well, and it can be easily 140 * adapted to PAE. 141 * 142 * fvdl@wasabisystems.com 18-Jun-2001 143 */ 144 145 /* 146 * pmap.c: i386 pmap module rewrite 147 * Chuck Cranor <chuck@netbsd> 148 * 11-Aug-97 149 * 150 * history of this pmap module: in addition to my own input, i used 151 * the following references for this rewrite of the i386 pmap: 152 * 153 * [1] the NetBSD i386 pmap. this pmap appears to be based on the 154 * BSD hp300 pmap done by Mike Hibler at University of Utah. 155 * it was then ported to the i386 by William Jolitz of UUNET 156 * Technologies, Inc. Then Charles M. Hannum of the NetBSD 157 * project fixed some bugs and provided some speed ups. 158 * 159 * [2] the FreeBSD i386 pmap. this pmap seems to be the 160 * Hibler/Jolitz pmap, as modified for FreeBSD by John S. Dyson 161 * and David Greenman. 162 * 163 * [3] the Mach pmap. this pmap, from CMU, seems to have migrated 164 * between several processors. the VAX version was done by 165 * Avadis Tevanian, Jr., and Michael Wayne Young. the i386 166 * version was done by Lance Berc, Mike Kupfer, Bob Baron, 167 * David Golub, and Richard Draves. the alpha version was 168 * done by Alessandro Forin (CMU/Mach) and Chris Demetriou 169 * (NetBSD/alpha). 170 */ 171 172 #include <sys/cdefs.h> 173 __KERNEL_RCSID(0, "$NetBSD: pmap.c,v 1.289 2018/03/04 23:25:35 jdolecek Exp $"); 174 175 #include "opt_user_ldt.h" 176 #include "opt_lockdebug.h" 177 #include "opt_multiprocessor.h" 178 #include "opt_xen.h" 179 #include "opt_svs.h" 180 181 #include <sys/param.h> 182 #include <sys/systm.h> 183 #include <sys/proc.h> 184 #include <sys/pool.h> 185 #include <sys/kernel.h> 186 #include <sys/atomic.h> 187 #include <sys/cpu.h> 188 #include <sys/intr.h> 189 #include <sys/xcall.h> 190 #include <sys/kcore.h> 191 192 #include <uvm/uvm.h> 193 #include <uvm/pmap/pmap_pvt.h> 194 195 #include <dev/isa/isareg.h> 196 197 #include <machine/specialreg.h> 198 #include <machine/gdt.h> 199 #include <machine/isa_machdep.h> 200 #include <machine/cpuvar.h> 201 #include <machine/cputypes.h> 202 203 #include <x86/pmap.h> 204 #include <x86/pmap_pv.h> 205 206 #include <x86/i82489reg.h> 207 #include <x86/i82489var.h> 208 209 #ifdef XEN 210 #include <xen/xen-public/xen.h> 211 #include <xen/hypervisor.h> 212 #endif 213 214 /* 215 * general info: 216 * 217 * - for an explanation of how the i386 MMU hardware works see 218 * the comments in <machine/pte.h>. 219 * 220 * - for an explanation of the general memory structure used by 221 * this pmap (including the recursive mapping), see the comments 222 * in <machine/pmap.h>. 
223 * 224 * this file contains the code for the "pmap module." the module's 225 * job is to manage the hardware's virtual to physical address mappings. 226 * note that there are two levels of mapping in the VM system: 227 * 228 * [1] the upper layer of the VM system uses vm_map's and vm_map_entry's 229 * to map ranges of virtual address space to objects/files. for 230 * example, the vm_map may say: "map VA 0x1000 to 0x22000 read-only 231 * to the file /bin/ls starting at offset zero." note that 232 * the upper layer mapping is not concerned with how individual 233 * vm_pages are mapped. 234 * 235 * [2] the lower layer of the VM system (the pmap) maintains the mappings 236 * from virtual addresses. it is concerned with which vm_page is 237 * mapped where. for example, when you run /bin/ls and start 238 * at page 0x1000 the fault routine may lookup the correct page 239 * of the /bin/ls file and then ask the pmap layer to establish 240 * a mapping for it. 241 * 242 * note that information in the lower layer of the VM system can be 243 * thrown away since it can easily be reconstructed from the info 244 * in the upper layer. 245 * 246 * data structures we use include: 247 * 248 * - struct pmap: describes the address space of one thread 249 * - struct pmap_page: describes one pv-tracked page, without 250 * necessarily a corresponding vm_page 251 * - struct pv_entry: describes one <PMAP,VA> mapping of a PA 252 * - struct pv_head: there is one pv_head per pv-tracked page of 253 * physical memory. the pv_head points to a list of pv_entry 254 * structures which describe all the <PMAP,VA> pairs that this 255 * page is mapped in. this is critical for page based operations 256 * such as pmap_page_protect() [change protection on _all_ mappings 257 * of a page] 258 */ 259 260 /* 261 * memory allocation 262 * 263 * - there are three data structures that we must dynamically allocate: 264 * 265 * [A] new process' page directory page (PDP) 266 * - plan 1: done at pmap_create() we use 267 * uvm_km_alloc(kernel_map, PAGE_SIZE) [fka kmem_alloc] to do this 268 * allocation. 269 * 270 * if we are low in free physical memory then we sleep in 271 * uvm_km_alloc -- in this case this is ok since we are creating 272 * a new pmap and should not be holding any locks. 273 * 274 * if the kernel is totally out of virtual space 275 * (i.e. uvm_km_alloc returns NULL), then we panic. 276 * 277 * [B] new page tables pages (PTP) 278 * - call uvm_pagealloc() 279 * => success: zero page, add to pm_pdir 280 * => failure: we are out of free vm_pages, let pmap_enter() 281 * tell UVM about it. 282 * 283 * note: for kernel PTPs, we start with NKPTP of them. as we map 284 * kernel memory (at uvm_map time) we check to see if we've grown 285 * the kernel pmap. if so, we call the optional function 286 * pmap_growkernel() to grow the kernel PTPs in advance. 287 * 288 * [C] pv_entry structures 289 */ 290 291 /* 292 * locking 293 * 294 * we have the following locks that we must contend with: 295 * 296 * mutexes: 297 * 298 * - pmap lock (per pmap, part of uvm_object) 299 * this lock protects the fields in the pmap structure including 300 * the non-kernel PDEs in the PDP, and the PTEs. it also locks 301 * in the alternate PTE space (since that is determined by the 302 * entry in the PDP). 303 * 304 * - pvh_lock (per pv_head) 305 * this lock protects the pv_entry list which is chained off the 306 * pv_head structure for a specific pv-tracked PA. it is locked 307 * when traversing the list (e.g. 
adding/removing mappings, 308 * syncing R/M bits, etc.) 309 * 310 * - pmaps_lock 311 * this lock protects the list of active pmaps (headed by "pmaps"). 312 * we lock it when adding or removing pmaps from this list. 313 */ 314 315 const vaddr_t ptp_masks[] = PTP_MASK_INITIALIZER; 316 const int ptp_shifts[] = PTP_SHIFT_INITIALIZER; 317 const long nkptpmax[] = NKPTPMAX_INITIALIZER; 318 const long nbpd[] = NBPD_INITIALIZER; 319 pd_entry_t * const normal_pdes[] = PDES_INITIALIZER; 320 321 long nkptp[] = NKPTP_INITIALIZER; 322 323 struct pmap_head pmaps; 324 kmutex_t pmaps_lock; 325 326 struct pcpu_area *pcpuarea __read_mostly; 327 328 static vaddr_t pmap_maxkvaddr; 329 330 /* 331 * XXX kludge: dummy locking to make KASSERTs in uvm_page.c comfortable. 332 * actual locking is done by pm_lock. 333 */ 334 #if defined(DIAGNOSTIC) 335 #define PMAP_SUBOBJ_LOCK(pm, idx) \ 336 KASSERT(mutex_owned((pm)->pm_lock)); \ 337 if ((idx) != 0) \ 338 mutex_enter((pm)->pm_obj[(idx)].vmobjlock) 339 #define PMAP_SUBOBJ_UNLOCK(pm, idx) \ 340 KASSERT(mutex_owned((pm)->pm_lock)); \ 341 if ((idx) != 0) \ 342 mutex_exit((pm)->pm_obj[(idx)].vmobjlock) 343 #else /* defined(DIAGNOSTIC) */ 344 #define PMAP_SUBOBJ_LOCK(pm, idx) /* nothing */ 345 #define PMAP_SUBOBJ_UNLOCK(pm, idx) /* nothing */ 346 #endif /* defined(DIAGNOSTIC) */ 347 348 /* 349 * Misc. event counters. 350 */ 351 struct evcnt pmap_iobmp_evcnt; 352 struct evcnt pmap_ldt_evcnt; 353 354 /* 355 * PAT 356 */ 357 #define PATENTRY(n, type) (type << ((n) * 8)) 358 #define PAT_UC 0x0ULL 359 #define PAT_WC 0x1ULL 360 #define PAT_WT 0x4ULL 361 #define PAT_WP 0x5ULL 362 #define PAT_WB 0x6ULL 363 #define PAT_UCMINUS 0x7ULL 364 365 static bool cpu_pat_enabled __read_mostly = false; 366 367 /* 368 * Global data structures 369 */ 370 371 static struct pmap kernel_pmap_store; /* the kernel's pmap (proc0) */ 372 struct pmap *const kernel_pmap_ptr = &kernel_pmap_store; 373 374 struct bootspace bootspace __read_mostly; 375 376 /* 377 * pmap_pg_nx: if our processor supports PG_NX in the PTE then we 378 * set pmap_pg_nx to PG_NX (otherwise it is zero). 379 */ 380 pd_entry_t pmap_pg_nx __read_mostly = 0; 381 382 /* 383 * pmap_pg_g: if our processor supports PG_G in the PTE then we 384 * set pmap_pg_g to PG_G (otherwise it is zero). 385 */ 386 pd_entry_t pmap_pg_g __read_mostly = 0; 387 388 /* 389 * pmap_largepages: if our processor supports PG_PS and we are 390 * using it, this is set to true. 391 */ 392 int pmap_largepages __read_mostly = 0; 393 394 /* 395 * i386 physical memory comes in a big contig chunk with a small 396 * hole toward the front of it... the following two paddr_t's 397 * (shared with machdep.c) describe the physical address space 398 * of this machine. 
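 * avail_start is also advanced by pmap_bootstrap_palloc() below, which
 * hands out physical pages during early boot, before the VM system is up.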
399 */ 400 paddr_t lowmem_rsvd __read_mostly; 401 paddr_t avail_start __read_mostly; /* PA of first available physical page */ 402 paddr_t avail_end __read_mostly; /* PA of last available physical page */ 403 404 #ifdef XEN 405 paddr_t pmap_pa_start; /* PA of first physical page for this domain */ 406 paddr_t pmap_pa_end; /* PA of last physical page for this domain */ 407 #endif 408 409 #define VM_PAGE_TO_PP(pg) (&(pg)->mdpage.mp_pp) 410 411 #define PV_HASH_SIZE 32768 412 #define PV_HASH_LOCK_CNT 32 413 414 struct pv_hash_lock { 415 kmutex_t lock; 416 } __aligned(CACHE_LINE_SIZE) pv_hash_locks[PV_HASH_LOCK_CNT] 417 __aligned(CACHE_LINE_SIZE); 418 419 struct pv_hash_head { 420 SLIST_HEAD(, pv_entry) hh_list; 421 } pv_hash_heads[PV_HASH_SIZE]; 422 423 static u_int 424 pvhash_hash(struct vm_page *ptp, vaddr_t va) 425 { 426 427 return (uintptr_t)ptp / sizeof(*ptp) + (va >> PAGE_SHIFT); 428 } 429 430 static struct pv_hash_head * 431 pvhash_head(u_int hash) 432 { 433 434 return &pv_hash_heads[hash % PV_HASH_SIZE]; 435 } 436 437 static kmutex_t * 438 pvhash_lock(u_int hash) 439 { 440 441 return &pv_hash_locks[hash % PV_HASH_LOCK_CNT].lock; 442 } 443 444 static struct pv_entry * 445 pvhash_remove(struct pv_hash_head *hh, struct vm_page *ptp, vaddr_t va) 446 { 447 struct pv_entry *pve; 448 struct pv_entry *prev; 449 450 prev = NULL; 451 SLIST_FOREACH(pve, &hh->hh_list, pve_hash) { 452 if (pve->pve_pte.pte_ptp == ptp && 453 pve->pve_pte.pte_va == va) { 454 if (prev != NULL) { 455 SLIST_REMOVE_AFTER(prev, pve_hash); 456 } else { 457 SLIST_REMOVE_HEAD(&hh->hh_list, pve_hash); 458 } 459 break; 460 } 461 prev = pve; 462 } 463 return pve; 464 } 465 466 /* 467 * Other data structures 468 */ 469 470 static pt_entry_t protection_codes[8] __read_mostly; 471 472 static bool pmap_initialized __read_mostly = false; /* pmap_init done yet? */ 473 474 /* 475 * The following two vaddr_t's are used during system startup to keep track of 476 * how much of the kernel's VM space we have used. Once the system is started, 477 * the management of the remaining kernel VM space is turned over to the 478 * kernel_map vm_map. 479 */ 480 static vaddr_t virtual_avail __read_mostly; /* VA of first free KVA */ 481 static vaddr_t virtual_end __read_mostly; /* VA of last free KVA */ 482 483 #ifndef XEN 484 /* 485 * LAPIC virtual address, and fake physical address. 
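 * (allocated in pmap_init_lapic() below; if a LAPIC is present, the va is
 *  remapped later in lapic_map.)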
486 */ 487 volatile vaddr_t local_apic_va __read_mostly; 488 paddr_t local_apic_pa __read_mostly; 489 #endif 490 491 /* 492 * pool that pmap structures are allocated from 493 */ 494 static struct pool_cache pmap_cache; 495 496 /* 497 * pv_entry cache 498 */ 499 static struct pool_cache pmap_pv_cache; 500 501 #ifdef __HAVE_DIRECT_MAP 502 vaddr_t pmap_direct_base __read_mostly; 503 vaddr_t pmap_direct_end __read_mostly; 504 size_t pmap_direct_pdpe __read_mostly; 505 size_t pmap_direct_npdp __read_mostly; 506 #endif 507 508 #ifndef __HAVE_DIRECT_MAP 509 /* 510 * Special VAs and the PTEs that map them 511 */ 512 static pt_entry_t *early_zero_pte; 513 static void pmap_vpage_cpualloc(struct cpu_info *); 514 #ifdef XEN 515 char *early_zerop; /* also referenced from xen_locore() */ 516 #else 517 static char *early_zerop; 518 #endif 519 #endif 520 521 int pmap_enter_default(pmap_t, vaddr_t, paddr_t, vm_prot_t, u_int); 522 523 /* PDP pool_cache(9) and its callbacks */ 524 struct pool_cache pmap_pdp_cache; 525 static int pmap_pdp_ctor(void *, void *, int); 526 static void pmap_pdp_dtor(void *, void *); 527 #ifdef PAE 528 /* need to allocate items of 4 pages */ 529 static void *pmap_pdp_alloc(struct pool *, int); 530 static void pmap_pdp_free(struct pool *, void *); 531 static struct pool_allocator pmap_pdp_allocator = { 532 .pa_alloc = pmap_pdp_alloc, 533 .pa_free = pmap_pdp_free, 534 .pa_pagesz = PAGE_SIZE * PDP_SIZE, 535 }; 536 #endif /* PAE */ 537 538 extern vaddr_t idt_vaddr; 539 extern paddr_t idt_paddr; 540 extern vaddr_t gdt_vaddr; 541 extern paddr_t gdt_paddr; 542 extern vaddr_t ldt_vaddr; 543 extern paddr_t ldt_paddr; 544 545 extern int end; 546 547 #ifdef i386 548 /* stuff to fix the pentium f00f bug */ 549 extern vaddr_t pentium_idt_vaddr; 550 #endif 551 552 /* 553 * Local prototypes 554 */ 555 556 #ifdef __HAVE_PCPU_AREA 557 static void pmap_init_pcpu(void); 558 #endif 559 #ifdef __HAVE_DIRECT_MAP 560 static void pmap_init_directmap(struct pmap *); 561 #endif 562 #if !defined(XEN) 563 static void pmap_remap_global(void); 564 #endif 565 #ifndef XEN 566 static void pmap_init_lapic(void); 567 static void pmap_remap_largepages(void); 568 #endif 569 570 static struct vm_page *pmap_get_ptp(struct pmap *, vaddr_t, 571 pd_entry_t * const *, int); 572 static struct vm_page *pmap_find_ptp(struct pmap *, vaddr_t, paddr_t, int); 573 static void pmap_freepage(struct pmap *, struct vm_page *, int); 574 static void pmap_free_ptp(struct pmap *, struct vm_page *, vaddr_t, 575 pt_entry_t *, pd_entry_t * const *); 576 static bool pmap_remove_pte(struct pmap *, struct vm_page *, pt_entry_t *, 577 vaddr_t, struct pv_entry **); 578 static void pmap_remove_ptes(struct pmap *, struct vm_page *, vaddr_t, vaddr_t, 579 vaddr_t, struct pv_entry **); 580 581 static paddr_t pmap_get_physpage(void); 582 static void pmap_alloc_level(struct pmap *, vaddr_t, long *); 583 584 static void pmap_reactivate(struct pmap *); 585 586 /* 587 * p m a p h e l p e r f u n c t i o n s 588 */ 589 590 static inline void 591 pmap_stats_update(struct pmap *pmap, int resid_diff, int wired_diff) 592 { 593 594 if (pmap == pmap_kernel()) { 595 atomic_add_long(&pmap->pm_stats.resident_count, resid_diff); 596 atomic_add_long(&pmap->pm_stats.wired_count, wired_diff); 597 } else { 598 KASSERT(mutex_owned(pmap->pm_lock)); 599 pmap->pm_stats.resident_count += resid_diff; 600 pmap->pm_stats.wired_count += wired_diff; 601 } 602 } 603 604 static inline void 605 pmap_stats_update_bypte(struct pmap *pmap, pt_entry_t npte, pt_entry_t opte) 606 { 607 int 
resid_diff = ((npte & PG_V) ? 1 : 0) - ((opte & PG_V) ? 1 : 0); 608 int wired_diff = ((npte & PG_W) ? 1 : 0) - ((opte & PG_W) ? 1 : 0); 609 610 KASSERT((npte & (PG_V | PG_W)) != PG_W); 611 KASSERT((opte & (PG_V | PG_W)) != PG_W); 612 613 pmap_stats_update(pmap, resid_diff, wired_diff); 614 } 615 616 /* 617 * ptp_to_pmap: lookup pmap by ptp 618 */ 619 620 static struct pmap * 621 ptp_to_pmap(struct vm_page *ptp) 622 { 623 struct pmap *pmap; 624 625 if (ptp == NULL) { 626 return pmap_kernel(); 627 } 628 pmap = (struct pmap *)ptp->uobject; 629 KASSERT(pmap != NULL); 630 KASSERT(&pmap->pm_obj[0] == ptp->uobject); 631 return pmap; 632 } 633 634 static inline struct pv_pte * 635 pve_to_pvpte(struct pv_entry *pve) 636 { 637 638 KASSERT((void *)&pve->pve_pte == (void *)pve); 639 return &pve->pve_pte; 640 } 641 642 static inline struct pv_entry * 643 pvpte_to_pve(struct pv_pte *pvpte) 644 { 645 struct pv_entry *pve = (void *)pvpte; 646 647 KASSERT(pve_to_pvpte(pve) == pvpte); 648 return pve; 649 } 650 651 /* 652 * pv_pte_first, pv_pte_next: PV list iterator. 653 */ 654 655 static struct pv_pte * 656 pv_pte_first(struct pmap_page *pp) 657 { 658 659 if ((pp->pp_flags & PP_EMBEDDED) != 0) { 660 return &pp->pp_pte; 661 } 662 return pve_to_pvpte(LIST_FIRST(&pp->pp_head.pvh_list)); 663 } 664 665 static struct pv_pte * 666 pv_pte_next(struct pmap_page *pp, struct pv_pte *pvpte) 667 { 668 669 KASSERT(pvpte != NULL); 670 if (pvpte == &pp->pp_pte) { 671 KASSERT((pp->pp_flags & PP_EMBEDDED) != 0); 672 return NULL; 673 } 674 KASSERT((pp->pp_flags & PP_EMBEDDED) == 0); 675 return pve_to_pvpte(LIST_NEXT(pvpte_to_pve(pvpte), pve_list)); 676 } 677 678 /* 679 * pmap_is_curpmap: is this pmap the one currently loaded [in %cr3]? 680 * of course the kernel is always loaded 681 */ 682 683 bool 684 pmap_is_curpmap(struct pmap *pmap) 685 { 686 return((pmap == pmap_kernel()) || 687 (pmap == curcpu()->ci_pmap)); 688 } 689 690 /* 691 * Add a reference to the specified pmap. 692 */ 693 694 void 695 pmap_reference(struct pmap *pmap) 696 { 697 698 atomic_inc_uint(&pmap->pm_obj[0].uo_refs); 699 } 700 701 /* 702 * pmap_map_ptes: map a pmap's PTEs into KVM and lock them in 703 * 704 * there are several pmaps involved. some or all of them might be same. 705 * 706 * - the pmap given by the first argument 707 * our caller wants to access this pmap's PTEs. 708 * 709 * - pmap_kernel() 710 * the kernel pmap. note that it only contains the kernel part 711 * of the address space which is shared by any pmap. ie. any 712 * pmap can be used instead of pmap_kernel() for our purpose. 713 * 714 * - ci->ci_pmap 715 * pmap currently loaded on the cpu. 716 * 717 * - vm_map_pmap(&curproc->p_vmspace->vm_map) 718 * current process' pmap. 719 * 720 * => we lock enough pmaps to keep things locked in 721 * => must be undone with pmap_unmap_ptes before returning 722 */ 723 724 void 725 pmap_map_ptes(struct pmap *pmap, struct pmap **pmap2, 726 pd_entry_t **ptepp, pd_entry_t * const **pdeppp) 727 { 728 struct pmap *curpmap; 729 struct cpu_info *ci; 730 lwp_t *l; 731 732 /* The kernel's pmap is always accessible. */ 733 if (pmap == pmap_kernel()) { 734 *pmap2 = NULL; 735 *ptepp = PTE_BASE; 736 *pdeppp = normal_pdes; 737 return; 738 } 739 KASSERT(kpreempt_disabled()); 740 741 l = curlwp; 742 retry: 743 mutex_enter(pmap->pm_lock); 744 ci = curcpu(); 745 curpmap = ci->ci_pmap; 746 if (vm_map_pmap(&l->l_proc->p_vmspace->vm_map) == pmap) { 747 /* Our own pmap so just load it: easy. 
*/ 748 if (__predict_false(ci->ci_want_pmapload)) { 749 mutex_exit(pmap->pm_lock); 750 pmap_load(); 751 goto retry; 752 } 753 KASSERT(pmap == curpmap); 754 } else if (pmap == curpmap) { 755 /* 756 * Already on the CPU: make it valid. This is very 757 * often the case during exit(), when we have switched 758 * to the kernel pmap in order to destroy a user pmap. 759 */ 760 pmap_reactivate(pmap); 761 } else { 762 /* 763 * Toss current pmap from CPU, but keep a reference to it. 764 * The reference will be dropped by pmap_unmap_ptes(). 765 * Can happen if we block during exit(). 766 */ 767 const cpuid_t cid = cpu_index(ci); 768 769 kcpuset_atomic_clear(curpmap->pm_cpus, cid); 770 kcpuset_atomic_clear(curpmap->pm_kernel_cpus, cid); 771 ci->ci_pmap = pmap; 772 ci->ci_tlbstate = TLBSTATE_VALID; 773 kcpuset_atomic_set(pmap->pm_cpus, cid); 774 kcpuset_atomic_set(pmap->pm_kernel_cpus, cid); 775 cpu_load_pmap(pmap, curpmap); 776 } 777 pmap->pm_ncsw = l->l_ncsw; 778 *pmap2 = curpmap; 779 *ptepp = PTE_BASE; 780 781 #if defined(XEN) && defined(__x86_64__) 782 KASSERT(ci->ci_normal_pdes[PTP_LEVELS - 2] == L4_BASE); 783 ci->ci_normal_pdes[PTP_LEVELS - 2] = pmap->pm_pdir; 784 *pdeppp = ci->ci_normal_pdes; 785 #else 786 *pdeppp = normal_pdes; 787 #endif 788 } 789 790 /* 791 * pmap_unmap_ptes: unlock the PTE mapping of "pmap" 792 */ 793 794 void 795 pmap_unmap_ptes(struct pmap *pmap, struct pmap *pmap2) 796 { 797 struct cpu_info *ci; 798 struct pmap *mypmap; 799 800 KASSERT(kpreempt_disabled()); 801 802 /* The kernel's pmap is always accessible. */ 803 if (pmap == pmap_kernel()) { 804 return; 805 } 806 807 ci = curcpu(); 808 809 #if defined(XEN) && defined(__x86_64__) 810 KASSERT(ci->ci_normal_pdes[PTP_LEVELS - 2] != L4_BASE); 811 ci->ci_normal_pdes[PTP_LEVELS - 2] = L4_BASE; 812 #endif 813 814 /* 815 * We cannot tolerate context switches while mapped in. 816 * If it is our own pmap all we have to do is unlock. 817 */ 818 KASSERT(pmap->pm_ncsw == curlwp->l_ncsw); 819 mypmap = vm_map_pmap(&curproc->p_vmspace->vm_map); 820 if (pmap == mypmap) { 821 mutex_exit(pmap->pm_lock); 822 return; 823 } 824 825 /* 826 * Mark whatever's on the CPU now as lazy and unlock. 827 * If the pmap was already installed, we are done. 828 */ 829 ci->ci_tlbstate = TLBSTATE_LAZY; 830 ci->ci_want_pmapload = (mypmap != pmap_kernel()); 831 mutex_exit(pmap->pm_lock); 832 if (pmap == pmap2) { 833 return; 834 } 835 836 /* 837 * We installed another pmap on the CPU. Grab a reference to 838 * it and leave in place. Toss the evicted pmap (can block). 839 */ 840 pmap_reference(pmap); 841 pmap_destroy(pmap2); 842 } 843 844 845 inline static void 846 pmap_exec_account(struct pmap *pm, vaddr_t va, pt_entry_t opte, pt_entry_t npte) 847 { 848 849 #if !defined(__x86_64__) 850 if (curproc == NULL || curproc->p_vmspace == NULL || 851 pm != vm_map_pmap(&curproc->p_vmspace->vm_map)) 852 return; 853 854 if ((opte ^ npte) & PG_X) 855 pmap_update_pg(va); 856 857 /* 858 * Executability was removed on the last executable change. 859 * Reset the code segment to something conservative and 860 * let the trap handler deal with setting the right limit. 861 * We can't do that because of locking constraints on the vm map. 
862 */ 863 864 if ((opte & PG_X) && (npte & PG_X) == 0 && va == pm->pm_hiexec) { 865 struct trapframe *tf = curlwp->l_md.md_regs; 866 867 tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL); 868 pm->pm_hiexec = I386_MAX_EXE_ADDR; 869 } 870 #endif /* !defined(__x86_64__) */ 871 } 872 873 #if !defined(__x86_64__) 874 /* 875 * Fixup the code segment to cover all potential executable mappings. 876 * returns 0 if no changes to the code segment were made. 877 */ 878 879 int 880 pmap_exec_fixup(struct vm_map *map, struct trapframe *tf, struct pcb *pcb) 881 { 882 struct vm_map_entry *ent; 883 struct pmap *pm = vm_map_pmap(map); 884 vaddr_t va = 0; 885 886 vm_map_lock_read(map); 887 for (ent = (&map->header)->next; ent != &map->header; ent = ent->next) { 888 889 /* 890 * This entry has greater va than the entries before. 891 * We need to make it point to the last page, not past it. 892 */ 893 894 if (ent->protection & VM_PROT_EXECUTE) 895 va = trunc_page(ent->end) - PAGE_SIZE; 896 } 897 vm_map_unlock_read(map); 898 if (va == pm->pm_hiexec && tf->tf_cs == GSEL(GUCODEBIG_SEL, SEL_UPL)) 899 return (0); 900 901 pm->pm_hiexec = va; 902 if (pm->pm_hiexec > I386_MAX_EXE_ADDR) { 903 tf->tf_cs = GSEL(GUCODEBIG_SEL, SEL_UPL); 904 } else { 905 tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL); 906 return (0); 907 } 908 return (1); 909 } 910 #endif /* !defined(__x86_64__) */ 911 912 void 913 pat_init(struct cpu_info *ci) 914 { 915 uint64_t pat; 916 917 if (!(ci->ci_feat_val[0] & CPUID_PAT)) 918 return; 919 920 /* We change WT to WC. Leave all other entries the default values. */ 921 pat = PATENTRY(0, PAT_WB) | PATENTRY(1, PAT_WC) | 922 PATENTRY(2, PAT_UCMINUS) | PATENTRY(3, PAT_UC) | 923 PATENTRY(4, PAT_WB) | PATENTRY(5, PAT_WC) | 924 PATENTRY(6, PAT_UCMINUS) | PATENTRY(7, PAT_UC); 925 926 wrmsr(MSR_CR_PAT, pat); 927 cpu_pat_enabled = true; 928 aprint_debug_dev(ci->ci_dev, "PAT enabled\n"); 929 } 930 931 static pt_entry_t 932 pmap_pat_flags(u_int flags) 933 { 934 u_int cacheflags = (flags & PMAP_CACHE_MASK); 935 936 if (!cpu_pat_enabled) { 937 switch (cacheflags) { 938 case PMAP_NOCACHE: 939 case PMAP_NOCACHE_OVR: 940 /* results in PGC_UCMINUS on cpus which have 941 * the cpuid PAT but PAT "disabled" 942 */ 943 return PG_N; 944 default: 945 return 0; 946 } 947 } 948 949 switch (cacheflags) { 950 case PMAP_NOCACHE: 951 return PGC_UC; 952 case PMAP_WRITE_COMBINE: 953 return PGC_WC; 954 case PMAP_WRITE_BACK: 955 return PGC_WB; 956 case PMAP_NOCACHE_OVR: 957 return PGC_UCMINUS; 958 } 959 960 return 0; 961 } 962 963 /* 964 * p m a p k e n t e r f u n c t i o n s 965 * 966 * functions to quickly enter/remove pages from the kernel address 967 * space. pmap_kremove is exported to MI kernel. we make use of 968 * the recursive PTE mappings. 
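 *
 * (with the recursive mapping, the L1 PTE slot for a low va can be reached
 *  directly as vtopte(va), roughly PTE_BASE + pl1_i(va); kernel addresses
 *  go through kvtopte(va), as in pmap_kenter_pa below.)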
969 */ 970 971 /* 972 * pmap_kenter_pa: enter a kernel mapping without R/M (pv_entry) tracking 973 * 974 * => no need to lock anything, assume va is already allocated 975 * => should be faster than normal pmap enter function 976 */ 977 978 void 979 pmap_kenter_pa(vaddr_t va, paddr_t pa, vm_prot_t prot, u_int flags) 980 { 981 pt_entry_t *pte, opte, npte; 982 983 KASSERT(!(prot & ~VM_PROT_ALL)); 984 985 if (va < VM_MIN_KERNEL_ADDRESS) 986 pte = vtopte(va); 987 else 988 pte = kvtopte(va); 989 #ifdef DOM0OPS 990 if (pa < pmap_pa_start || pa >= pmap_pa_end) { 991 #ifdef DEBUG 992 printf_nolog("%s: pa %#" PRIxPADDR " for va %#" PRIxVADDR 993 " outside range\n", __func__, pa, va); 994 #endif /* DEBUG */ 995 npte = pa; 996 } else 997 #endif /* DOM0OPS */ 998 npte = pmap_pa2pte(pa); 999 npte |= protection_codes[prot] | PG_V | pmap_pg_g; 1000 npte |= pmap_pat_flags(flags); 1001 opte = pmap_pte_testset(pte, npte); /* zap! */ 1002 1003 /* 1004 * XXX: make sure we are not dealing with a large page, since the only 1005 * large pages created are for the kernel image, and they should never 1006 * be kentered. 1007 */ 1008 KASSERTMSG(!(opte & PG_PS), "PG_PS va=%#"PRIxVADDR, va); 1009 1010 if ((opte & (PG_V | PG_U)) == (PG_V | PG_U)) { 1011 /* This should not happen. */ 1012 printf_nolog("%s: mapping already present\n", __func__); 1013 kpreempt_disable(); 1014 pmap_tlb_shootdown(pmap_kernel(), va, opte, TLBSHOOT_KENTER); 1015 kpreempt_enable(); 1016 } 1017 } 1018 1019 void 1020 pmap_emap_enter(vaddr_t va, paddr_t pa, vm_prot_t prot) 1021 { 1022 pt_entry_t *pte, npte; 1023 1024 KASSERT((prot & ~VM_PROT_ALL) == 0); 1025 pte = (va < VM_MIN_KERNEL_ADDRESS) ? vtopte(va) : kvtopte(va); 1026 1027 #ifdef DOM0OPS 1028 if (pa < pmap_pa_start || pa >= pmap_pa_end) { 1029 npte = pa; 1030 } else 1031 #endif 1032 npte = pmap_pa2pte(pa); 1033 1034 1035 npte |= protection_codes[prot] | PG_V; 1036 pmap_pte_set(pte, npte); 1037 pmap_pte_flush(); 1038 } 1039 1040 /* 1041 * pmap_emap_sync: perform TLB flush or pmap load, if it was deferred. 1042 */ 1043 void 1044 pmap_emap_sync(bool canload) 1045 { 1046 struct cpu_info *ci = curcpu(); 1047 struct pmap *pmap; 1048 1049 KASSERT(kpreempt_disabled()); 1050 if (__predict_true(ci->ci_want_pmapload && canload)) { 1051 /* 1052 * XXX: Hint for pmap_reactivate(), which might suggest to 1053 * not perform TLB flush, if state has not changed. 1054 */ 1055 pmap = vm_map_pmap(&curlwp->l_proc->p_vmspace->vm_map); 1056 if (__predict_false(pmap == ci->ci_pmap)) { 1057 kcpuset_atomic_clear(pmap->pm_cpus, cpu_index(ci)); 1058 } 1059 pmap_load(); 1060 KASSERT(ci->ci_want_pmapload == 0); 1061 } else { 1062 tlbflush(); 1063 } 1064 } 1065 1066 void 1067 pmap_emap_remove(vaddr_t sva, vsize_t len) 1068 { 1069 pt_entry_t *pte; 1070 vaddr_t va, eva = sva + len; 1071 1072 for (va = sva; va < eva; va += PAGE_SIZE) { 1073 pte = (va < VM_MIN_KERNEL_ADDRESS) ? vtopte(va) : kvtopte(va); 1074 pmap_pte_set(pte, 0); 1075 } 1076 1077 pmap_pte_flush(); 1078 } 1079 1080 __strict_weak_alias(pmap_kenter_ma, pmap_kenter_pa); 1081 1082 #if defined(__x86_64__) 1083 /* 1084 * Change protection for a virtual address. Local for a CPU only, don't 1085 * care about TLB shootdowns.
1086 * 1087 * => must be called with preemption disabled 1088 */ 1089 void 1090 pmap_changeprot_local(vaddr_t va, vm_prot_t prot) 1091 { 1092 pt_entry_t *pte, opte, npte; 1093 1094 KASSERT(kpreempt_disabled()); 1095 1096 if (va < VM_MIN_KERNEL_ADDRESS) 1097 pte = vtopte(va); 1098 else 1099 pte = kvtopte(va); 1100 1101 npte = opte = *pte; 1102 1103 if ((prot & VM_PROT_WRITE) != 0) 1104 npte |= PG_RW; 1105 else 1106 npte &= ~PG_RW; 1107 1108 if (opte != npte) { 1109 pmap_pte_set(pte, npte); 1110 pmap_pte_flush(); 1111 invlpg(va); 1112 } 1113 } 1114 #endif /* defined(__x86_64__) */ 1115 1116 /* 1117 * pmap_kremove: remove a kernel mapping(s) without R/M (pv_entry) tracking 1118 * 1119 * => no need to lock anything 1120 * => caller must dispose of any vm_page mapped in the va range 1121 * => note: not an inline function 1122 * => we assume the va is page aligned and the len is a multiple of PAGE_SIZE 1123 * => we assume kernel only unmaps valid addresses and thus don't bother 1124 * checking the valid bit before doing TLB flushing 1125 * => must be followed by call to pmap_update() before reuse of page 1126 */ 1127 1128 static inline void 1129 pmap_kremove1(vaddr_t sva, vsize_t len, bool localonly) 1130 { 1131 pt_entry_t *pte, opte; 1132 vaddr_t va, eva; 1133 1134 eva = sva + len; 1135 1136 kpreempt_disable(); 1137 for (va = sva; va < eva; va += PAGE_SIZE) { 1138 pte = kvtopte(va); 1139 opte = pmap_pte_testset(pte, 0); /* zap! */ 1140 if ((opte & (PG_V | PG_U)) == (PG_V | PG_U) && !localonly) { 1141 pmap_tlb_shootdown(pmap_kernel(), va, opte, 1142 TLBSHOOT_KREMOVE); 1143 } 1144 KASSERTMSG((opte & PG_PS) == 0, 1145 "va %#" PRIxVADDR " is a large page", va); 1146 KASSERTMSG((opte & PG_PVLIST) == 0, 1147 "va %#" PRIxVADDR " is a pv tracked page", va); 1148 } 1149 if (localonly) { 1150 tlbflushg(); 1151 } 1152 kpreempt_enable(); 1153 } 1154 1155 void 1156 pmap_kremove(vaddr_t sva, vsize_t len) 1157 { 1158 1159 pmap_kremove1(sva, len, false); 1160 } 1161 1162 /* 1163 * pmap_kremove_local: like pmap_kremove(), but only worry about 1164 * TLB invalidations on the current CPU. this is only intended 1165 * for use while writing kernel crash dumps, either after panic 1166 * or via reboot -d. 1167 */ 1168 1169 void 1170 pmap_kremove_local(vaddr_t sva, vsize_t len) 1171 { 1172 1173 pmap_kremove1(sva, len, true); 1174 } 1175 1176 /* 1177 * p m a p i n i t f u n c t i o n s 1178 * 1179 * pmap_bootstrap and pmap_init are called during system startup 1180 * to init the pmap module. pmap_bootstrap() does a low level 1181 * init just to get things rolling. pmap_init() finishes the job. 1182 */ 1183 1184 /* 1185 * pmap_bootstrap_valloc: allocate a virtual address in the bootstrap area. 1186 * This function is to be used before any VM system has been set up. 1187 * 1188 * The va is taken from virtual_avail. 1189 */ 1190 static vaddr_t 1191 pmap_bootstrap_valloc(size_t npages) 1192 { 1193 vaddr_t va = virtual_avail; 1194 virtual_avail += npages * PAGE_SIZE; 1195 return va; 1196 } 1197 1198 /* 1199 * pmap_bootstrap_palloc: allocate a physical address in the bootstrap area. 1200 * This function is to be used before any VM system has been set up. 1201 * 1202 * The pa is taken from avail_start. 1203 */ 1204 static paddr_t 1205 pmap_bootstrap_palloc(size_t npages) 1206 { 1207 paddr_t pa = avail_start; 1208 avail_start += npages * PAGE_SIZE; 1209 return pa; 1210 } 1211 1212 /* 1213 * pmap_bootstrap: get the system in a state where it can run with VM properly 1214 * enabled (called before main()). 
The VM system is fully init'd later. 1215 * 1216 * => on i386, locore.S has already enabled the MMU by allocating a PDP for the 1217 * kernel, and nkpde PTP's for the kernel. 1218 * => kva_start is the first free virtual address in kernel space. 1219 */ 1220 void 1221 pmap_bootstrap(vaddr_t kva_start) 1222 { 1223 struct pmap *kpm; 1224 int i; 1225 vaddr_t kva; 1226 1227 pmap_pg_nx = (cpu_feature[2] & CPUID_NOX ? PG_NX : 0); 1228 1229 /* 1230 * Set up our local static global vars that keep track of the usage of 1231 * KVM before kernel_map is set up. 1232 */ 1233 virtual_avail = kva_start; /* first free KVA */ 1234 virtual_end = VM_MAX_KERNEL_ADDRESS; /* last KVA */ 1235 1236 /* 1237 * Set up protection_codes: we need to be able to convert from a MI 1238 * protection code (some combo of VM_PROT...) to something we can jam 1239 * into a x86 PTE. 1240 */ 1241 protection_codes[VM_PROT_NONE] = pmap_pg_nx; 1242 protection_codes[VM_PROT_EXECUTE] = PG_RO | PG_X; 1243 protection_codes[VM_PROT_READ] = PG_RO | pmap_pg_nx; 1244 protection_codes[VM_PROT_READ|VM_PROT_EXECUTE] = PG_RO | PG_X; 1245 protection_codes[VM_PROT_WRITE] = PG_RW | pmap_pg_nx; 1246 protection_codes[VM_PROT_WRITE|VM_PROT_EXECUTE] = PG_RW | PG_X; 1247 protection_codes[VM_PROT_WRITE|VM_PROT_READ] = PG_RW | pmap_pg_nx; 1248 protection_codes[VM_PROT_ALL] = PG_RW | PG_X; 1249 1250 /* 1251 * Now we init the kernel's pmap. 1252 * 1253 * The kernel pmap's pm_obj is not used for much. However, in user pmaps 1254 * the pm_obj contains the list of active PTPs. 1255 * 1256 * The pm_obj currently does not have a pager. It might be possible to 1257 * add a pager that would allow a process to read-only mmap its own page 1258 * tables (fast user-level vtophys?). This may or may not be useful. 1259 */ 1260 kpm = pmap_kernel(); 1261 for (i = 0; i < PTP_LEVELS - 1; i++) { 1262 mutex_init(&kpm->pm_obj_lock[i], MUTEX_DEFAULT, IPL_NONE); 1263 uvm_obj_init(&kpm->pm_obj[i], NULL, false, 1); 1264 uvm_obj_setlock(&kpm->pm_obj[i], &kpm->pm_obj_lock[i]); 1265 kpm->pm_ptphint[i] = NULL; 1266 } 1267 memset(&kpm->pm_list, 0, sizeof(kpm->pm_list)); /* pm_list not used */ 1268 1269 kpm->pm_pdir = (pd_entry_t *)bootspace.pdir; 1270 for (i = 0; i < PDP_SIZE; i++) 1271 kpm->pm_pdirpa[i] = PDPpaddr + PAGE_SIZE * i; 1272 1273 kpm->pm_stats.wired_count = kpm->pm_stats.resident_count = 1274 x86_btop(kva_start - VM_MIN_KERNEL_ADDRESS); 1275 1276 kcpuset_create(&kpm->pm_cpus, true); 1277 kcpuset_create(&kpm->pm_kernel_cpus, true); 1278 1279 kpm->pm_ldt = NULL; 1280 kpm->pm_ldt_len = 0; 1281 kpm->pm_ldt_sel = GSYSSEL(GLDT_SEL, SEL_KPL); 1282 1283 /* 1284 * the above is just a rough estimate and not critical to the proper 1285 * operation of the system. 1286 */ 1287 1288 #if !defined(XEN) 1289 /* 1290 * Begin to enable global TLB entries if they are supported. 1291 * The G bit has no effect until the CR4_PGE bit is set in CR4, 1292 * which happens in cpu_init(), which is run on each cpu 1293 * (and happens later) 1294 */ 1295 if (cpu_feature[0] & CPUID_PGE) { 1296 pmap_pg_g = PG_G; /* enable software */ 1297 1298 /* add PG_G attribute to already mapped kernel pages */ 1299 pmap_remap_global(); 1300 } 1301 #endif 1302 1303 #ifndef XEN 1304 /* 1305 * Enable large pages if they are supported. 
1306 */ 1307 if (cpu_feature[0] & CPUID_PSE) { 1308 lcr4(rcr4() | CR4_PSE); /* enable hardware (via %cr4) */ 1309 pmap_largepages = 1; /* enable software */ 1310 1311 /* 1312 * The TLB must be flushed after enabling large pages on Pentium 1313 * CPUs, according to section 3.6.2.2 of "Intel Architecture 1314 * Software Developer's Manual, Volume 3: System Programming". 1315 */ 1316 tlbflushg(); 1317 1318 /* Remap the kernel. */ 1319 pmap_remap_largepages(); 1320 } 1321 pmap_init_lapic(); 1322 #endif /* !XEN */ 1323 1324 #ifdef __HAVE_PCPU_AREA 1325 pmap_init_pcpu(); 1326 #endif 1327 1328 #ifdef __HAVE_DIRECT_MAP 1329 pmap_init_directmap(kpm); 1330 #else 1331 pmap_vpage_cpualloc(&cpu_info_primary); 1332 1333 if (VM_MIN_KERNEL_ADDRESS == KERNBASE) { /* i386 */ 1334 early_zerop = (void *)cpu_info_primary.vpage[VPAGE_ZER]; 1335 early_zero_pte = cpu_info_primary.vpage_pte[VPAGE_ZER]; 1336 } else { /* amd64 */ 1337 /* 1338 * zero_pte is stuck at the end of mapped space for the kernel 1339 * image (disjunct from kva space). This is done so that it 1340 * can safely be used in pmap_growkernel (pmap_get_physpage), 1341 * when it's called for the first time. 1342 * XXXfvdl fix this for MULTIPROCESSOR later. 1343 */ 1344 #ifdef XEN 1345 /* early_zerop initialized in xen_locore() */ 1346 #else 1347 early_zerop = (void *)bootspace.spareva; 1348 #endif 1349 early_zero_pte = PTE_BASE + pl1_i((vaddr_t)early_zerop); 1350 } 1351 #endif 1352 1353 #if defined(XEN) && defined(__x86_64__) 1354 extern vaddr_t xen_dummy_page; 1355 paddr_t xen_dummy_user_pgd; 1356 1357 /* 1358 * We want a dummy page directory for Xen: when deactivating a pmap, 1359 * Xen will still consider it active. So we set user PGD to this one 1360 * to lift all protection on the now inactive page tables set. 1361 */ 1362 xen_dummy_user_pgd = xen_dummy_page - KERNBASE; 1363 1364 /* Zero fill it, the less checks in Xen it requires the better */ 1365 memset((void *)(xen_dummy_user_pgd + KERNBASE), 0, PAGE_SIZE); 1366 /* Mark read-only */ 1367 HYPERVISOR_update_va_mapping(xen_dummy_user_pgd + KERNBASE, 1368 pmap_pa2pte(xen_dummy_user_pgd) | PG_V | pmap_pg_nx, 1369 UVMF_INVLPG); 1370 /* Pin as L4 */ 1371 xpq_queue_pin_l4_table(xpmap_ptom_masked(xen_dummy_user_pgd)); 1372 #endif 1373 1374 /* 1375 * Allocate space for the IDT, GDT and LDT. 1376 */ 1377 #ifdef __HAVE_PCPU_AREA 1378 idt_vaddr = (vaddr_t)&pcpuarea->idt; 1379 #else 1380 idt_vaddr = pmap_bootstrap_valloc(1); 1381 #endif 1382 idt_paddr = pmap_bootstrap_palloc(1); 1383 1384 gdt_vaddr = pmap_bootstrap_valloc(1); 1385 gdt_paddr = pmap_bootstrap_palloc(1); 1386 1387 #ifdef __HAVE_PCPU_AREA 1388 ldt_vaddr = (vaddr_t)&pcpuarea->ldt; 1389 #else 1390 ldt_vaddr = pmap_bootstrap_valloc(1); 1391 #endif 1392 ldt_paddr = pmap_bootstrap_palloc(1); 1393 1394 #if !defined(__x86_64__) && !defined(XEN) 1395 /* pentium f00f bug stuff */ 1396 pentium_idt_vaddr = pmap_bootstrap_valloc(1); 1397 #endif 1398 1399 /* 1400 * Now we reserve some VM for mapping pages when doing a crash dump. 1401 */ 1402 virtual_avail = reserve_dumppages(virtual_avail); 1403 1404 /* 1405 * Init the static-global locks and global lists. 1406 * 1407 * => pventry::pvh_lock (initialized elsewhere) must also be 1408 * a spin lock, again at IPL_VM to prevent deadlock, and 1409 * again is never taken from interrupt context. 1410 */ 1411 mutex_init(&pmaps_lock, MUTEX_DEFAULT, IPL_NONE); 1412 LIST_INIT(&pmaps); 1413 1414 /* 1415 * Ensure the TLB is sync'd with reality by flushing it... 
1416 */ 1417 tlbflushg(); 1418 1419 /* 1420 * Calculate pmap_maxkvaddr from nkptp[]. 1421 */ 1422 kva = VM_MIN_KERNEL_ADDRESS; 1423 for (i = PTP_LEVELS - 1; i >= 1; i--) { 1424 kva += nkptp[i] * nbpd[i]; 1425 } 1426 pmap_maxkvaddr = kva; 1427 } 1428 1429 #ifndef XEN 1430 static void 1431 pmap_init_lapic(void) 1432 { 1433 /* 1434 * On CPUs that have no LAPIC, local_apic_va is never kentered. But our 1435 * x86 implementation relies a lot on this address to be valid; so just 1436 * allocate a fake physical page that will be kentered into 1437 * local_apic_va by machdep. 1438 * 1439 * If the LAPIC is present, the va will be remapped somewhere else 1440 * later in lapic_map. 1441 */ 1442 local_apic_va = pmap_bootstrap_valloc(1); 1443 local_apic_pa = pmap_bootstrap_palloc(1); 1444 } 1445 #endif 1446 1447 #if defined(__HAVE_PCPU_AREA) || defined(__HAVE_DIRECT_MAP) 1448 static size_t 1449 pmap_pagetree_nentries_range(vaddr_t startva, vaddr_t endva, size_t pgsz) 1450 { 1451 size_t npages; 1452 npages = (roundup(endva, pgsz) / pgsz) - 1453 (rounddown(startva, pgsz) / pgsz); 1454 return npages; 1455 } 1456 #endif 1457 1458 #ifdef __HAVE_PCPU_AREA 1459 static void 1460 pmap_init_pcpu(void) 1461 { 1462 const vaddr_t startva = PMAP_PCPU_BASE; 1463 size_t nL4e, nL3e, nL2e, nL1e; 1464 size_t L4e_idx, L3e_idx, L2e_idx, L1e_idx __diagused; 1465 paddr_t pa; 1466 vaddr_t endva; 1467 vaddr_t tmpva; 1468 pt_entry_t *pte; 1469 size_t size; 1470 int i; 1471 1472 const pd_entry_t pteflags = PG_V | PG_KW | pmap_pg_nx; 1473 1474 size = sizeof(struct pcpu_area); 1475 1476 endva = startva + size; 1477 1478 /* We will use this temporary va. */ 1479 tmpva = bootspace.spareva; 1480 pte = PTE_BASE + pl1_i(tmpva); 1481 1482 /* Build L4 */ 1483 L4e_idx = pl4_i(startva); 1484 nL4e = pmap_pagetree_nentries_range(startva, endva, NBPD_L4); 1485 KASSERT(nL4e == 1); 1486 for (i = 0; i < nL4e; i++) { 1487 KASSERT(L4_BASE[L4e_idx+i] == 0); 1488 1489 pa = pmap_bootstrap_palloc(1); 1490 *pte = (pa & PG_FRAME) | pteflags; 1491 pmap_update_pg(tmpva); 1492 memset((void *)tmpva, 0, PAGE_SIZE); 1493 1494 L4_BASE[L4e_idx+i] = pa | pteflags | PG_U; 1495 } 1496 1497 /* Build L3 */ 1498 L3e_idx = pl3_i(startva); 1499 nL3e = pmap_pagetree_nentries_range(startva, endva, NBPD_L3); 1500 for (i = 0; i < nL3e; i++) { 1501 KASSERT(L3_BASE[L3e_idx+i] == 0); 1502 1503 pa = pmap_bootstrap_palloc(1); 1504 *pte = (pa & PG_FRAME) | pteflags; 1505 pmap_update_pg(tmpva); 1506 memset((void *)tmpva, 0, PAGE_SIZE); 1507 1508 L3_BASE[L3e_idx+i] = pa | pteflags | PG_U; 1509 } 1510 1511 /* Build L2 */ 1512 L2e_idx = pl2_i(startva); 1513 nL2e = pmap_pagetree_nentries_range(startva, endva, NBPD_L2); 1514 for (i = 0; i < nL2e; i++) { 1515 1516 KASSERT(L2_BASE[L2e_idx+i] == 0); 1517 1518 pa = pmap_bootstrap_palloc(1); 1519 *pte = (pa & PG_FRAME) | pteflags; 1520 pmap_update_pg(tmpva); 1521 memset((void *)tmpva, 0, PAGE_SIZE); 1522 1523 L2_BASE[L2e_idx+i] = pa | pteflags | PG_U; 1524 } 1525 1526 /* Build L1 */ 1527 L1e_idx = pl1_i(startva); 1528 nL1e = pmap_pagetree_nentries_range(startva, endva, NBPD_L1); 1529 for (i = 0; i < nL1e; i++) { 1530 /* 1531 * Nothing to do, the PTEs will be entered via 1532 * pmap_kenter_pa. 1533 */ 1534 KASSERT(L1_BASE[L1e_idx+i] == 0); 1535 } 1536 1537 *pte = 0; 1538 pmap_update_pg(tmpva); 1539 1540 pcpuarea = (struct pcpu_area *)startva; 1541 1542 tlbflush(); 1543 } 1544 #endif 1545 1546 #ifdef __HAVE_DIRECT_MAP 1547 /* 1548 * Create the amd64 direct map. Called only once at boot time. 
We map all of 1549 * the physical memory contiguously using 2MB large pages, with RW permissions. 1550 * However there is a hole: the kernel is mapped with RO permissions. 1551 */ 1552 static void 1553 pmap_init_directmap(struct pmap *kpm) 1554 { 1555 extern phys_ram_seg_t mem_clusters[]; 1556 extern int mem_cluster_cnt; 1557 1558 const vaddr_t startva = PMAP_DIRECT_DEFAULT_BASE; 1559 size_t nL4e, nL3e, nL2e; 1560 size_t L4e_idx, L3e_idx, L2e_idx; 1561 size_t spahole, epahole; 1562 paddr_t lastpa, pa; 1563 vaddr_t endva; 1564 vaddr_t tmpva; 1565 pt_entry_t *pte; 1566 phys_ram_seg_t *mc; 1567 int i; 1568 1569 const pd_entry_t pteflags = PG_V | PG_KW | pmap_pg_nx; 1570 const pd_entry_t holepteflags = PG_V | pmap_pg_nx; 1571 1572 CTASSERT(NL4_SLOT_DIRECT * NBPD_L4 == MAXPHYSMEM); 1573 1574 spahole = roundup(bootspace.head.pa, NBPD_L2); 1575 epahole = rounddown(bootspace.boot.pa, NBPD_L2); 1576 1577 /* Get the last physical address available */ 1578 lastpa = 0; 1579 for (i = 0; i < mem_cluster_cnt; i++) { 1580 mc = &mem_clusters[i]; 1581 lastpa = MAX(lastpa, mc->start + mc->size); 1582 } 1583 1584 /* 1585 * x86_add_cluster should have truncated the memory to MAXPHYSMEM. 1586 */ 1587 if (lastpa > MAXPHYSMEM) { 1588 panic("pmap_init_directmap: lastpa incorrect"); 1589 } 1590 endva = startva + lastpa; 1591 1592 /* We will use this temporary va. */ 1593 tmpva = bootspace.spareva; 1594 pte = PTE_BASE + pl1_i(tmpva); 1595 1596 /* Build L4 */ 1597 L4e_idx = pl4_i(startva); 1598 nL4e = pmap_pagetree_nentries_range(startva, endva, NBPD_L4); 1599 KASSERT(nL4e <= NL4_SLOT_DIRECT); 1600 for (i = 0; i < nL4e; i++) { 1601 KASSERT(L4_BASE[L4e_idx+i] == 0); 1602 1603 pa = pmap_bootstrap_palloc(1); 1604 *pte = (pa & PG_FRAME) | pteflags; 1605 pmap_update_pg(tmpva); 1606 memset((void *)tmpva, 0, PAGE_SIZE); 1607 1608 L4_BASE[L4e_idx+i] = pa | pteflags | PG_U; 1609 } 1610 1611 /* Build L3 */ 1612 L3e_idx = pl3_i(startva); 1613 nL3e = pmap_pagetree_nentries_range(startva, endva, NBPD_L3); 1614 for (i = 0; i < nL3e; i++) { 1615 KASSERT(L3_BASE[L3e_idx+i] == 0); 1616 1617 pa = pmap_bootstrap_palloc(1); 1618 *pte = (pa & PG_FRAME) | pteflags; 1619 pmap_update_pg(tmpva); 1620 memset((void *)tmpva, 0, PAGE_SIZE); 1621 1622 L3_BASE[L3e_idx+i] = pa | pteflags | PG_U; 1623 } 1624 1625 /* Build L2 */ 1626 L2e_idx = pl2_i(startva); 1627 nL2e = pmap_pagetree_nentries_range(startva, endva, NBPD_L2); 1628 for (i = 0; i < nL2e; i++) { 1629 KASSERT(L2_BASE[L2e_idx+i] == 0); 1630 1631 pa = (paddr_t)(i * NBPD_L2); 1632 1633 if (spahole <= pa && pa < epahole) { 1634 L2_BASE[L2e_idx+i] = pa | holepteflags | PG_U | 1635 PG_PS | pmap_pg_g; 1636 } else { 1637 L2_BASE[L2e_idx+i] = pa | pteflags | PG_U | 1638 PG_PS | pmap_pg_g; 1639 } 1640 } 1641 1642 *pte = 0; 1643 pmap_update_pg(tmpva); 1644 1645 pmap_direct_base = startva; 1646 pmap_direct_end = endva; 1647 pmap_direct_pdpe = L4e_idx; 1648 pmap_direct_npdp = nL4e; 1649 1650 tlbflush(); 1651 } 1652 #endif /* __HAVE_DIRECT_MAP */ 1653 1654 #if !defined(XEN) 1655 /* 1656 * Remap all of the virtual pages created so far with the PG_G bit. 
1657 */ 1658 static void 1659 pmap_remap_global(void) 1660 { 1661 vaddr_t kva, kva_end; 1662 unsigned long p1i; 1663 size_t i; 1664 1665 /* head */ 1666 kva = bootspace.head.va; 1667 kva_end = kva + bootspace.head.sz; 1668 for ( ; kva < kva_end; kva += PAGE_SIZE) { 1669 p1i = pl1_i(kva); 1670 if (pmap_valid_entry(PTE_BASE[p1i])) 1671 PTE_BASE[p1i] |= pmap_pg_g; 1672 } 1673 1674 /* kernel segments */ 1675 for (i = 0; i < BTSPACE_NSEGS; i++) { 1676 if (bootspace.segs[i].type == BTSEG_NONE) { 1677 continue; 1678 } 1679 kva = bootspace.segs[i].va; 1680 kva_end = kva + bootspace.segs[i].sz; 1681 for ( ; kva < kva_end; kva += PAGE_SIZE) { 1682 p1i = pl1_i(kva); 1683 if (pmap_valid_entry(PTE_BASE[p1i])) 1684 PTE_BASE[p1i] |= pmap_pg_g; 1685 } 1686 } 1687 1688 /* boot space */ 1689 kva = bootspace.boot.va; 1690 kva_end = kva + bootspace.boot.sz; 1691 for ( ; kva < kva_end; kva += PAGE_SIZE) { 1692 p1i = pl1_i(kva); 1693 if (pmap_valid_entry(PTE_BASE[p1i])) 1694 PTE_BASE[p1i] |= pmap_pg_g; 1695 } 1696 } 1697 #endif 1698 1699 #ifndef XEN 1700 /* 1701 * Remap several kernel segments with large pages. We cover as many pages as we 1702 * can. Called only once at boot time, if the CPU supports large pages. 1703 */ 1704 static void 1705 pmap_remap_largepages(void) 1706 { 1707 pd_entry_t *pde; 1708 vaddr_t kva, kva_end; 1709 paddr_t pa; 1710 size_t i; 1711 1712 /* Remap the kernel text using large pages. */ 1713 for (i = 0; i < BTSPACE_NSEGS; i++) { 1714 if (bootspace.segs[i].type != BTSEG_TEXT) { 1715 continue; 1716 } 1717 kva = roundup(bootspace.segs[i].va, NBPD_L2); 1718 if (kva < bootspace.segs[i].va) { 1719 continue; 1720 } 1721 kva_end = rounddown(bootspace.segs[i].va + 1722 bootspace.segs[i].sz, NBPD_L2); 1723 pa = roundup(bootspace.segs[i].pa, NBPD_L2); 1724 for (/* */; kva < kva_end; kva += NBPD_L2, pa += NBPD_L2) { 1725 pde = &L2_BASE[pl2_i(kva)]; 1726 *pde = pa | pmap_pg_g | PG_PS | PG_KR | PG_V; 1727 tlbflushg(); 1728 } 1729 } 1730 1731 /* Remap the kernel rodata using large pages. */ 1732 for (i = 0; i < BTSPACE_NSEGS; i++) { 1733 if (bootspace.segs[i].type != BTSEG_RODATA) { 1734 continue; 1735 } 1736 kva = roundup(bootspace.segs[i].va, NBPD_L2); 1737 if (kva < bootspace.segs[i].va) { 1738 continue; 1739 } 1740 kva_end = rounddown(bootspace.segs[i].va + 1741 bootspace.segs[i].sz, NBPD_L2); 1742 pa = roundup(bootspace.segs[i].pa, NBPD_L2); 1743 for (/* */; kva < kva_end; kva += NBPD_L2, pa += NBPD_L2) { 1744 pde = &L2_BASE[pl2_i(kva)]; 1745 *pde = pa | pmap_pg_g | PG_PS | pmap_pg_nx | PG_KR | PG_V; 1746 tlbflushg(); 1747 } 1748 } 1749 1750 /* Remap the kernel data+bss using large pages. */ 1751 for (i = 0; i < BTSPACE_NSEGS; i++) { 1752 if (bootspace.segs[i].type != BTSEG_DATA) { 1753 continue; 1754 } 1755 kva = roundup(bootspace.segs[i].va, NBPD_L2); 1756 if (kva < bootspace.segs[i].va) { 1757 continue; 1758 } 1759 kva_end = rounddown(bootspace.segs[i].va + 1760 bootspace.segs[i].sz, NBPD_L2); 1761 pa = roundup(bootspace.segs[i].pa, NBPD_L2); 1762 for (/* */; kva < kva_end; kva += NBPD_L2, pa += NBPD_L2) { 1763 pde = &L2_BASE[pl2_i(kva)]; 1764 *pde = pa | pmap_pg_g | PG_PS | pmap_pg_nx | PG_KW | PG_V; 1765 tlbflushg(); 1766 } 1767 } 1768 } 1769 #endif /* !XEN */ 1770 1771 /* 1772 * pmap_init: called from uvm_init, our job is to get the pmap 1773 * system ready to manage mappings... 
1774 */ 1775 1776 void 1777 pmap_init(void) 1778 { 1779 int i, flags; 1780 1781 for (i = 0; i < PV_HASH_SIZE; i++) { 1782 SLIST_INIT(&pv_hash_heads[i].hh_list); 1783 } 1784 for (i = 0; i < PV_HASH_LOCK_CNT; i++) { 1785 mutex_init(&pv_hash_locks[i].lock, MUTEX_NODEBUG, IPL_VM); 1786 } 1787 1788 /* 1789 * initialize caches. 1790 */ 1791 1792 pool_cache_bootstrap(&pmap_cache, sizeof(struct pmap), 0, 0, 0, 1793 "pmappl", NULL, IPL_NONE, NULL, NULL, NULL); 1794 1795 #ifdef XEN 1796 /* 1797 * pool_cache(9) should not touch cached objects, since they 1798 * are pinned on xen and R/O for the domU 1799 */ 1800 flags = PR_NOTOUCH; 1801 #else /* XEN */ 1802 flags = 0; 1803 #endif /* XEN */ 1804 #ifdef PAE 1805 pool_cache_bootstrap(&pmap_pdp_cache, PAGE_SIZE * PDP_SIZE, 0, 0, flags, 1806 "pdppl", &pmap_pdp_allocator, IPL_NONE, 1807 pmap_pdp_ctor, pmap_pdp_dtor, NULL); 1808 #else /* PAE */ 1809 pool_cache_bootstrap(&pmap_pdp_cache, PAGE_SIZE, 0, 0, flags, 1810 "pdppl", NULL, IPL_NONE, pmap_pdp_ctor, pmap_pdp_dtor, NULL); 1811 #endif /* PAE */ 1812 pool_cache_bootstrap(&pmap_pv_cache, sizeof(struct pv_entry), 0, 0, 1813 PR_LARGECACHE, "pvpl", &pool_allocator_kmem, IPL_NONE, NULL, 1814 NULL, NULL); 1815 1816 pmap_tlb_init(); 1817 1818 /* XXX: Since cpu_hatch() is only for secondary CPUs. */ 1819 pmap_tlb_cpu_init(curcpu()); 1820 1821 evcnt_attach_dynamic(&pmap_iobmp_evcnt, EVCNT_TYPE_MISC, 1822 NULL, "x86", "io bitmap copy"); 1823 evcnt_attach_dynamic(&pmap_ldt_evcnt, EVCNT_TYPE_MISC, 1824 NULL, "x86", "ldt sync"); 1825 1826 /* 1827 * done: pmap module is up (and ready for business) 1828 */ 1829 1830 pmap_initialized = true; 1831 } 1832 1833 /* 1834 * pmap_cpu_init_late: perform late per-CPU initialization. 1835 */ 1836 1837 #ifndef XEN 1838 void 1839 pmap_cpu_init_late(struct cpu_info *ci) 1840 { 1841 /* 1842 * The BP has already its own PD page allocated during early 1843 * MD startup. 
1844 */ 1845 if (ci == &cpu_info_primary) 1846 return; 1847 1848 #ifdef PAE 1849 cpu_alloc_l3_page(ci); 1850 #endif 1851 } 1852 #endif 1853 1854 #ifndef __HAVE_DIRECT_MAP 1855 CTASSERT(CACHE_LINE_SIZE > sizeof(pt_entry_t)); 1856 CTASSERT(CACHE_LINE_SIZE % sizeof(pt_entry_t) == 0); 1857 1858 static void 1859 pmap_vpage_cpualloc(struct cpu_info *ci) 1860 { 1861 bool primary = (ci == &cpu_info_primary); 1862 size_t i, npages; 1863 vaddr_t vabase; 1864 vsize_t vrange; 1865 1866 npages = (CACHE_LINE_SIZE / sizeof(pt_entry_t)); 1867 KASSERT(npages >= VPAGE_MAX); 1868 vrange = npages * PAGE_SIZE; 1869 1870 if (primary) { 1871 while ((vabase = pmap_bootstrap_valloc(1)) % vrange != 0) { 1872 /* Waste some pages to align properly */ 1873 } 1874 /* The base is aligned, allocate the rest (contiguous) */ 1875 pmap_bootstrap_valloc(npages - 1); 1876 } else { 1877 vabase = uvm_km_alloc(kernel_map, vrange, vrange, 1878 UVM_KMF_VAONLY); 1879 if (vabase == 0) { 1880 panic("%s: failed to allocate tmp VA for CPU %d\n", 1881 __func__, cpu_index(ci)); 1882 } 1883 } 1884 1885 KASSERT((vaddr_t)&PTE_BASE[pl1_i(vabase)] % CACHE_LINE_SIZE == 0); 1886 1887 for (i = 0; i < VPAGE_MAX; i++) { 1888 ci->vpage[i] = vabase + i * PAGE_SIZE; 1889 ci->vpage_pte[i] = PTE_BASE + pl1_i(ci->vpage[i]); 1890 } 1891 } 1892 1893 void 1894 pmap_vpage_cpu_init(struct cpu_info *ci) 1895 { 1896 if (ci == &cpu_info_primary) { 1897 /* cpu0 already taken care of in pmap_bootstrap */ 1898 return; 1899 } 1900 1901 pmap_vpage_cpualloc(ci); 1902 } 1903 #endif 1904 1905 /* 1906 * p v _ e n t r y f u n c t i o n s 1907 */ 1908 1909 static bool 1910 pmap_pp_needs_pve(struct pmap_page *pp) 1911 { 1912 1913 /* 1914 * Adding a pv entry for this page only needs to allocate a pv_entry 1915 * structure if the page already has at least one pv entry, 1916 * since the first pv entry is stored in the pmap_page. 
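 *
 * To illustrate: the first mapping of a page goes into the embedded
 * pp_pte slot and needs no allocation.  Once PP_EMBEDDED is set,
 * adding a second mapping needs two pv_entry structures: one for the
 * new mapping and a spare that pmap_enter_pv() uses to move the
 * embedded entry out onto the hash and pv list.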
1917 */ 1918 1919 return pp && ((pp->pp_flags & PP_EMBEDDED) != 0 || 1920 !LIST_EMPTY(&pp->pp_head.pvh_list)); 1921 } 1922 1923 /* 1924 * pmap_free_pvs: free a list of pv_entrys 1925 */ 1926 1927 static void 1928 pmap_free_pvs(struct pv_entry *pve) 1929 { 1930 struct pv_entry *next; 1931 1932 for ( /* null */ ; pve != NULL ; pve = next) { 1933 next = pve->pve_next; 1934 pool_cache_put(&pmap_pv_cache, pve); 1935 } 1936 } 1937 1938 /* 1939 * main pv_entry manipulation functions: 1940 * pmap_enter_pv: enter a mapping onto a pv_head list 1941 * pmap_remove_pv: remove a mapping from a pv_head list 1942 * 1943 * NOTE: Both pmap_enter_pv and pmap_remove_pv expect the caller to lock 1944 * the pvh before calling 1945 */ 1946 1947 /* 1948 * insert_pv: a helper of pmap_enter_pv 1949 */ 1950 1951 static void 1952 insert_pv(struct pmap_page *pp, struct pv_entry *pve) 1953 { 1954 struct pv_hash_head *hh; 1955 kmutex_t *lock; 1956 u_int hash; 1957 1958 hash = pvhash_hash(pve->pve_pte.pte_ptp, pve->pve_pte.pte_va); 1959 lock = pvhash_lock(hash); 1960 hh = pvhash_head(hash); 1961 mutex_spin_enter(lock); 1962 SLIST_INSERT_HEAD(&hh->hh_list, pve, pve_hash); 1963 mutex_spin_exit(lock); 1964 1965 LIST_INSERT_HEAD(&pp->pp_head.pvh_list, pve, pve_list); 1966 } 1967 1968 /* 1969 * pmap_enter_pv: enter a mapping onto a pv_head lst 1970 * 1971 * => caller should adjust ptp's wire_count before calling 1972 * => caller has preallocated pve and *sparepve for us 1973 */ 1974 1975 static struct pv_entry * 1976 pmap_enter_pv(struct pmap_page *pp, struct pv_entry *pve, 1977 struct pv_entry **sparepve, struct vm_page *ptp, vaddr_t va) 1978 { 1979 1980 KASSERT(ptp == NULL || ptp->wire_count >= 2); 1981 KASSERT(ptp == NULL || ptp->uobject != NULL); 1982 KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset); 1983 1984 if ((pp->pp_flags & PP_EMBEDDED) == 0) { 1985 if (LIST_EMPTY(&pp->pp_head.pvh_list)) { 1986 pp->pp_flags |= PP_EMBEDDED; 1987 pp->pp_pte.pte_ptp = ptp; 1988 pp->pp_pte.pte_va = va; 1989 1990 return pve; 1991 } 1992 } else { 1993 struct pv_entry *pve2; 1994 1995 pve2 = *sparepve; 1996 *sparepve = NULL; 1997 1998 pve2->pve_pte = pp->pp_pte; 1999 pp->pp_flags &= ~PP_EMBEDDED; 2000 LIST_INIT(&pp->pp_head.pvh_list); 2001 insert_pv(pp, pve2); 2002 } 2003 2004 pve->pve_pte.pte_ptp = ptp; 2005 pve->pve_pte.pte_va = va; 2006 insert_pv(pp, pve); 2007 2008 return NULL; 2009 } 2010 2011 /* 2012 * pmap_remove_pv: try to remove a mapping from a pv_list 2013 * 2014 * => caller should adjust ptp's wire_count and free PTP if needed 2015 * => we return the removed pve 2016 */ 2017 2018 static struct pv_entry * 2019 pmap_remove_pv(struct pmap_page *pp, struct vm_page *ptp, vaddr_t va) 2020 { 2021 struct pv_hash_head *hh; 2022 struct pv_entry *pve; 2023 kmutex_t *lock; 2024 u_int hash; 2025 2026 KASSERT(ptp == NULL || ptp->uobject != NULL); 2027 KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset); 2028 2029 if ((pp->pp_flags & PP_EMBEDDED) != 0) { 2030 KASSERT(pp->pp_pte.pte_ptp == ptp); 2031 KASSERT(pp->pp_pte.pte_va == va); 2032 2033 pp->pp_flags &= ~PP_EMBEDDED; 2034 LIST_INIT(&pp->pp_head.pvh_list); 2035 2036 return NULL; 2037 } 2038 2039 hash = pvhash_hash(ptp, va); 2040 lock = pvhash_lock(hash); 2041 hh = pvhash_head(hash); 2042 mutex_spin_enter(lock); 2043 pve = pvhash_remove(hh, ptp, va); 2044 mutex_spin_exit(lock); 2045 2046 LIST_REMOVE(pve, pve_list); 2047 2048 return pve; 2049 } 2050 2051 /* 2052 * p t p f u n c t i o n s 2053 */ 2054 2055 static inline struct vm_page * 2056 pmap_find_ptp(struct pmap *pmap, vaddr_t 
va, paddr_t pa, int level) 2057 { 2058 int lidx = level - 1; 2059 struct vm_page *pg; 2060 2061 KASSERT(mutex_owned(pmap->pm_lock)); 2062 2063 if (pa != (paddr_t)-1 && pmap->pm_ptphint[lidx] && 2064 pa == VM_PAGE_TO_PHYS(pmap->pm_ptphint[lidx])) { 2065 return (pmap->pm_ptphint[lidx]); 2066 } 2067 PMAP_SUBOBJ_LOCK(pmap, lidx); 2068 pg = uvm_pagelookup(&pmap->pm_obj[lidx], ptp_va2o(va, level)); 2069 PMAP_SUBOBJ_UNLOCK(pmap, lidx); 2070 2071 KASSERT(pg == NULL || pg->wire_count >= 1); 2072 return pg; 2073 } 2074 2075 static inline void 2076 pmap_freepage(struct pmap *pmap, struct vm_page *ptp, int level) 2077 { 2078 lwp_t *l; 2079 int lidx; 2080 struct uvm_object *obj; 2081 2082 KASSERT(ptp->wire_count == 1); 2083 2084 lidx = level - 1; 2085 2086 obj = &pmap->pm_obj[lidx]; 2087 pmap_stats_update(pmap, -1, 0); 2088 if (lidx != 0) 2089 mutex_enter(obj->vmobjlock); 2090 if (pmap->pm_ptphint[lidx] == ptp) 2091 pmap->pm_ptphint[lidx] = TAILQ_FIRST(&obj->memq); 2092 ptp->wire_count = 0; 2093 uvm_pagerealloc(ptp, NULL, 0); 2094 l = curlwp; 2095 KASSERT((l->l_pflag & LP_INTR) == 0); 2096 VM_PAGE_TO_PP(ptp)->pp_link = l->l_md.md_gc_ptp; 2097 l->l_md.md_gc_ptp = ptp; 2098 if (lidx != 0) 2099 mutex_exit(obj->vmobjlock); 2100 } 2101 2102 static void 2103 pmap_free_ptp(struct pmap *pmap, struct vm_page *ptp, vaddr_t va, 2104 pt_entry_t *ptes, pd_entry_t * const *pdes) 2105 { 2106 unsigned long index; 2107 int level; 2108 vaddr_t invaladdr; 2109 pd_entry_t opde; 2110 2111 KASSERT(pmap != pmap_kernel()); 2112 KASSERT(mutex_owned(pmap->pm_lock)); 2113 KASSERT(kpreempt_disabled()); 2114 2115 level = 1; 2116 do { 2117 index = pl_i(va, level + 1); 2118 opde = pmap_pte_testset(&pdes[level - 1][index], 0); 2119 2120 /* 2121 * On Xen-amd64 or SVS, we need to sync the top level page 2122 * directory on each CPU. 2123 */ 2124 #if defined(XEN) && defined(__x86_64__) 2125 if (level == PTP_LEVELS - 1) { 2126 xen_kpm_sync(pmap, index); 2127 } 2128 #elif defined(SVS) 2129 if (svs_enabled && level == PTP_LEVELS - 1) { 2130 svs_pmap_sync(pmap, index); 2131 } 2132 #endif 2133 2134 invaladdr = level == 1 ? (vaddr_t)ptes : 2135 (vaddr_t)pdes[level - 2]; 2136 pmap_tlb_shootdown(pmap, invaladdr + index * PAGE_SIZE, 2137 opde, TLBSHOOT_FREE_PTP1); 2138 2139 #if defined(XEN) 2140 pmap_tlb_shootnow(); 2141 #endif 2142 2143 pmap_freepage(pmap, ptp, level); 2144 if (level < PTP_LEVELS - 1) { 2145 ptp = pmap_find_ptp(pmap, va, (paddr_t)-1, level + 1); 2146 ptp->wire_count--; 2147 if (ptp->wire_count > 1) 2148 break; 2149 } 2150 } while (++level < PTP_LEVELS); 2151 pmap_pte_flush(); 2152 } 2153 2154 /* 2155 * pmap_get_ptp: get a PTP (if there isn't one, allocate a new one) 2156 * 2157 * => pmap should NOT be pmap_kernel() 2158 * => pmap should be locked 2159 * => preemption should be disabled 2160 */ 2161 2162 static struct vm_page * 2163 pmap_get_ptp(struct pmap *pmap, vaddr_t va, pd_entry_t * const *pdes, int flags) 2164 { 2165 struct vm_page *ptp; 2166 struct { 2167 struct vm_page *pg; 2168 bool new; 2169 } pt[PTP_LEVELS + 1]; 2170 int i, aflags; 2171 unsigned long index; 2172 pd_entry_t *pva; 2173 paddr_t pa; 2174 struct uvm_object *obj; 2175 voff_t off; 2176 2177 KASSERT(pmap != pmap_kernel()); 2178 KASSERT(mutex_owned(pmap->pm_lock)); 2179 KASSERT(kpreempt_disabled()); 2180 2181 /* 2182 * Loop through all page table levels allocating a page 2183 * for any level where we don't already have one. 2184 */ 2185 memset(pt, 0, sizeof(pt)); 2186 aflags = ((flags & PMAP_CANFAIL) ? 
0 : UVM_PGA_USERESERVE) | 2187 UVM_PGA_ZERO; 2188 for (i = PTP_LEVELS; i > 1; i--) { 2189 obj = &pmap->pm_obj[i - 2]; 2190 off = ptp_va2o(va, i - 1); 2191 2192 PMAP_SUBOBJ_LOCK(pmap, i - 2); 2193 pt[i].pg = uvm_pagelookup(obj, off); 2194 if (pt[i].pg == NULL) { 2195 pt[i].pg = uvm_pagealloc(obj, off, NULL, aflags); 2196 pt[i].new = true; 2197 } 2198 PMAP_SUBOBJ_UNLOCK(pmap, i - 2); 2199 2200 if (pt[i].pg == NULL) 2201 goto fail; 2202 } 2203 2204 /* 2205 * Now that we have all the pages looked up or allocated, 2206 * loop through again installing any new ones into the tree. 2207 */ 2208 for (i = PTP_LEVELS; i > 1; i--) { 2209 index = pl_i(va, i); 2210 pva = pdes[i - 2]; 2211 2212 if (pmap_valid_entry(pva[index])) { 2213 KASSERT(!pt[i].new); 2214 continue; 2215 } 2216 2217 ptp = pt[i].pg; 2218 ptp->flags &= ~PG_BUSY; /* never busy */ 2219 ptp->wire_count = 1; 2220 pmap->pm_ptphint[i - 2] = ptp; 2221 pa = VM_PAGE_TO_PHYS(ptp); 2222 pmap_pte_set(&pva[index], (pd_entry_t) 2223 (pmap_pa2pte(pa) | PG_u | PG_RW | PG_V)); 2224 2225 /* 2226 * On Xen-amd64 or SVS, we need to sync the top level page 2227 * directory on each CPU. 2228 */ 2229 #if defined(XEN) && defined(__x86_64__) 2230 if (i == PTP_LEVELS) { 2231 xen_kpm_sync(pmap, index); 2232 } 2233 #elif defined(SVS) 2234 if (svs_enabled && i == PTP_LEVELS) { 2235 svs_pmap_sync(pmap, index); 2236 } 2237 #endif 2238 2239 pmap_pte_flush(); 2240 pmap_stats_update(pmap, 1, 0); 2241 2242 /* 2243 * If we're not in the top level, increase the 2244 * wire count of the parent page. 2245 */ 2246 if (i < PTP_LEVELS) { 2247 pt[i + 1].pg->wire_count++; 2248 } 2249 } 2250 ptp = pt[2].pg; 2251 KASSERT(ptp != NULL); 2252 pmap->pm_ptphint[0] = ptp; 2253 return ptp; 2254 2255 /* 2256 * Allocation of a ptp failed, free any others that we just allocated. 2257 */ 2258 fail: 2259 for (i = PTP_LEVELS; i > 1; i--) { 2260 if (pt[i].pg == NULL) { 2261 break; 2262 } 2263 if (!pt[i].new) { 2264 continue; 2265 } 2266 obj = &pmap->pm_obj[i - 2]; 2267 PMAP_SUBOBJ_LOCK(pmap, i - 2); 2268 uvm_pagefree(pt[i].pg); 2269 PMAP_SUBOBJ_UNLOCK(pmap, i - 2); 2270 } 2271 return NULL; 2272 } 2273 2274 /* 2275 * p m a p l i f e c y c l e f u n c t i o n s 2276 */ 2277 2278 /* 2279 * pmap_pdp_ctor: constructor for the PDP cache. 2280 */ 2281 static int 2282 pmap_pdp_ctor(void *arg, void *v, int flags) 2283 { 2284 pd_entry_t *pdir = v; 2285 paddr_t pdirpa = 0; 2286 vaddr_t object; 2287 int i; 2288 2289 #if !defined(XEN) || !defined(__x86_64__) 2290 int npde; 2291 #endif 2292 #ifdef XEN 2293 int s; 2294 #endif 2295 2296 /* 2297 * NOTE: The `pmaps_lock' is held when the PDP is allocated. 2298 */ 2299 2300 #if defined(XEN) && defined(__x86_64__) 2301 /* Fetch the physical address of the page directory */ 2302 (void)pmap_extract(pmap_kernel(), (vaddr_t)pdir, &pdirpa); 2303 2304 /* Zero the area */ 2305 memset(pdir, 0, PAGE_SIZE); /* Xen wants a clean page */ 2306 2307 /* 2308 * This pdir will NEVER be active in kernel mode, so mark 2309 * recursive entry invalid. 2310 */ 2311 pdir[PDIR_SLOT_PTE] = pmap_pa2pte(pdirpa); 2312 2313 /* 2314 * PDP constructed this way won't be for the kernel, hence we 2315 * don't put kernel mappings on Xen. 2316 * 2317 * But we need to make pmap_create() happy, so put a dummy 2318 * (without PG_V) value at the right place. 
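 *
 * (The check in question: pmap_create() looks at
 * pdir[PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1] - 1] and, if it is
 * still zero, destructs the PDP and retries, presumably so that a
 * cached PDP whose copy of the kernel PDEs predates a kernel page
 * table growth is never handed out.  The dummy entry written below
 * keeps that check from looping forever here, since no kernel PDEs
 * are copied in on Xen/amd64.)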
2319 */ 2320 pdir[PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1] - 1] = 2321 (pd_entry_t)-1 & PG_FRAME; 2322 #else /* XEN && __x86_64__*/ 2323 /* Zero the area */ 2324 memset(pdir, 0, PDIR_SLOT_PTE * sizeof(pd_entry_t)); 2325 2326 object = (vaddr_t)v; 2327 for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) { 2328 /* Fetch the physical address of the page directory */ 2329 (void)pmap_extract(pmap_kernel(), object, &pdirpa); 2330 2331 /* Put in recursive PDE to map the PTEs */ 2332 pdir[PDIR_SLOT_PTE + i] = pmap_pa2pte(pdirpa) | PG_V | 2333 pmap_pg_nx; 2334 #ifndef XEN 2335 pdir[PDIR_SLOT_PTE + i] |= PG_KW; 2336 #endif 2337 } 2338 2339 /* Copy the kernel's top level PDE */ 2340 npde = nkptp[PTP_LEVELS - 1]; 2341 2342 memcpy(&pdir[PDIR_SLOT_KERN], &PDP_BASE[PDIR_SLOT_KERN], 2343 npde * sizeof(pd_entry_t)); 2344 2345 /* Zero the rest */ 2346 memset(&pdir[PDIR_SLOT_KERN + npde], 0, (PAGE_SIZE * PDP_SIZE) - 2347 (PDIR_SLOT_KERN + npde) * sizeof(pd_entry_t)); 2348 2349 if (VM_MIN_KERNEL_ADDRESS != KERNBASE) { 2350 int idx = pl_i(KERNBASE, PTP_LEVELS); 2351 pdir[idx] = PDP_BASE[idx]; 2352 } 2353 2354 #ifdef __HAVE_PCPU_AREA 2355 pdir[PDIR_SLOT_PCPU] = PDP_BASE[PDIR_SLOT_PCPU]; 2356 #endif 2357 #ifdef __HAVE_DIRECT_MAP 2358 memcpy(&pdir[pmap_direct_pdpe], &PDP_BASE[pmap_direct_pdpe], 2359 pmap_direct_npdp * sizeof(pd_entry_t)); 2360 #endif 2361 #endif /* XEN && __x86_64__*/ 2362 2363 #ifdef XEN 2364 s = splvm(); 2365 object = (vaddr_t)v; 2366 pmap_protect(pmap_kernel(), object, object + (PAGE_SIZE * PDP_SIZE), 2367 VM_PROT_READ); 2368 pmap_update(pmap_kernel()); 2369 for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) { 2370 /* 2371 * pin as L2/L4 page, we have to do the page with the 2372 * PDIR_SLOT_PTE entries last 2373 */ 2374 #ifdef PAE 2375 if (i == l2tol3(PDIR_SLOT_PTE)) 2376 continue; 2377 #endif 2378 2379 (void) pmap_extract(pmap_kernel(), object, &pdirpa); 2380 #ifdef __x86_64__ 2381 xpq_queue_pin_l4_table(xpmap_ptom_masked(pdirpa)); 2382 #else 2383 xpq_queue_pin_l2_table(xpmap_ptom_masked(pdirpa)); 2384 #endif 2385 } 2386 #ifdef PAE 2387 object = ((vaddr_t)pdir) + PAGE_SIZE * l2tol3(PDIR_SLOT_PTE); 2388 (void)pmap_extract(pmap_kernel(), object, &pdirpa); 2389 xpq_queue_pin_l2_table(xpmap_ptom_masked(pdirpa)); 2390 #endif 2391 splx(s); 2392 #endif /* XEN */ 2393 2394 return (0); 2395 } 2396 2397 /* 2398 * pmap_pdp_dtor: destructor for the PDP cache. 2399 */ 2400 2401 static void 2402 pmap_pdp_dtor(void *arg, void *v) 2403 { 2404 #ifdef XEN 2405 paddr_t pdirpa = 0; /* XXX: GCC */ 2406 vaddr_t object = (vaddr_t)v; 2407 int i; 2408 int s = splvm(); 2409 pt_entry_t *pte; 2410 2411 for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) { 2412 /* fetch the physical address of the page directory. */ 2413 (void) pmap_extract(pmap_kernel(), object, &pdirpa); 2414 /* unpin page table */ 2415 xpq_queue_unpin_table(xpmap_ptom_masked(pdirpa)); 2416 } 2417 object = (vaddr_t)v; 2418 for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) { 2419 /* Set page RW again */ 2420 pte = kvtopte(object); 2421 pmap_pte_set(pte, *pte | PG_RW); 2422 xen_bcast_invlpg((vaddr_t)object); 2423 } 2424 splx(s); 2425 #endif /* XEN */ 2426 } 2427 2428 #ifdef PAE 2429 2430 /* pmap_pdp_alloc: Allocate a page for the pdp memory pool. */ 2431 2432 static void * 2433 pmap_pdp_alloc(struct pool *pp, int flags) 2434 { 2435 return (void *)uvm_km_alloc(kernel_map, 2436 PAGE_SIZE * PDP_SIZE, PAGE_SIZE * PDP_SIZE, 2437 ((flags & PR_WAITOK) ? 
0 : UVM_KMF_NOWAIT | UVM_KMF_TRYLOCK) 2438 | UVM_KMF_WIRED); 2439 } 2440 2441 /* 2442 * pmap_pdp_free: free a PDP 2443 */ 2444 2445 static void 2446 pmap_pdp_free(struct pool *pp, void *v) 2447 { 2448 uvm_km_free(kernel_map, (vaddr_t)v, PAGE_SIZE * PDP_SIZE, 2449 UVM_KMF_WIRED); 2450 } 2451 #endif /* PAE */ 2452 2453 /* 2454 * pmap_create: create a pmap object. 2455 */ 2456 struct pmap * 2457 pmap_create(void) 2458 { 2459 struct pmap *pmap; 2460 int i; 2461 2462 pmap = pool_cache_get(&pmap_cache, PR_WAITOK); 2463 2464 /* init uvm_object */ 2465 for (i = 0; i < PTP_LEVELS - 1; i++) { 2466 mutex_init(&pmap->pm_obj_lock[i], MUTEX_DEFAULT, IPL_NONE); 2467 uvm_obj_init(&pmap->pm_obj[i], NULL, false, 1); 2468 uvm_obj_setlock(&pmap->pm_obj[i], &pmap->pm_obj_lock[i]); 2469 pmap->pm_ptphint[i] = NULL; 2470 } 2471 pmap->pm_stats.wired_count = 0; 2472 /* count the PDP allocd below */ 2473 pmap->pm_stats.resident_count = PDP_SIZE; 2474 #if !defined(__x86_64__) 2475 pmap->pm_hiexec = 0; 2476 #endif /* !defined(__x86_64__) */ 2477 pmap->pm_flags = 0; 2478 pmap->pm_gc_ptp = NULL; 2479 2480 kcpuset_create(&pmap->pm_cpus, true); 2481 kcpuset_create(&pmap->pm_kernel_cpus, true); 2482 #ifdef XEN 2483 kcpuset_create(&pmap->pm_xen_ptp_cpus, true); 2484 #endif 2485 /* init the LDT */ 2486 pmap->pm_ldt = NULL; 2487 pmap->pm_ldt_len = 0; 2488 pmap->pm_ldt_sel = GSYSSEL(GLDT_SEL, SEL_KPL); 2489 2490 /* allocate PDP */ 2491 try_again: 2492 pmap->pm_pdir = pool_cache_get(&pmap_pdp_cache, PR_WAITOK); 2493 2494 mutex_enter(&pmaps_lock); 2495 2496 if (pmap->pm_pdir[PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1] - 1] == 0) { 2497 mutex_exit(&pmaps_lock); 2498 pool_cache_destruct_object(&pmap_pdp_cache, pmap->pm_pdir); 2499 goto try_again; 2500 } 2501 2502 for (i = 0; i < PDP_SIZE; i++) 2503 pmap->pm_pdirpa[i] = 2504 pmap_pte2pa(pmap->pm_pdir[PDIR_SLOT_PTE + i]); 2505 2506 LIST_INSERT_HEAD(&pmaps, pmap, pm_list); 2507 2508 mutex_exit(&pmaps_lock); 2509 2510 return (pmap); 2511 } 2512 2513 /* 2514 * pmap_free_ptps: put a list of ptps back to the freelist. 2515 */ 2516 2517 void 2518 pmap_free_ptps(struct vm_page *empty_ptps) 2519 { 2520 struct vm_page *ptp; 2521 struct pmap_page *pp; 2522 2523 while ((ptp = empty_ptps) != NULL) { 2524 pp = VM_PAGE_TO_PP(ptp); 2525 empty_ptps = pp->pp_link; 2526 LIST_INIT(&pp->pp_head.pvh_list); 2527 uvm_pagefree(ptp); 2528 } 2529 } 2530 2531 /* 2532 * pmap_check_ptps: verify that none of the pmap's page table objects 2533 * have any pages allocated to them. 
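 *
 * Illustrative sketch of the reference counting this backs up; the
 * caller below is hypothetical and not part of this file:
 *
 *	struct pmap *pm = pmap_create();    reference count starts at 1
 *	pmap_reference(pm);                 e.g. taken by pmap_load()
 *	pmap_destroy(pm);                   drops to 1, pmap stays alive
 *	pmap_destroy(pm);                   drops to 0, pmap is freed
 *
 * The count lives in pm_obj[0].uo_refs; the final pmap_destroy() is
 * the one that returns the PDP to the cache and expects every PTP to
 * be gone already, which is what this function asserts.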
2534 */ 2535 2536 static inline void 2537 pmap_check_ptps(struct pmap *pmap) 2538 { 2539 int i; 2540 2541 for (i = 0; i < PTP_LEVELS - 1; i++) { 2542 KASSERT(pmap->pm_obj[i].uo_npages == 0); 2543 KASSERT(TAILQ_EMPTY(&pmap->pm_obj[i].memq)); 2544 } 2545 } 2546 2547 static inline void 2548 pmap_check_inuse(struct pmap *pmap) 2549 { 2550 #ifdef DIAGNOSTIC 2551 CPU_INFO_ITERATOR cii; 2552 struct cpu_info *ci; 2553 2554 for (CPU_INFO_FOREACH(cii, ci)) { 2555 if (ci->ci_pmap == pmap) 2556 panic("destroying pmap being used"); 2557 #if defined(XEN) && defined(__x86_64__) 2558 for (int i = 0; i < PDIR_SLOT_PTE; i++) { 2559 if (pmap->pm_pdir[i] != 0 && 2560 ci->ci_kpm_pdir[i] == pmap->pm_pdir[i]) { 2561 printf("pmap_destroy(%p) pmap_kernel %p " 2562 "curcpu %d cpu %d ci_pmap %p " 2563 "ci->ci_kpm_pdir[%d]=%" PRIx64 2564 " pmap->pm_pdir[%d]=%" PRIx64 "\n", 2565 pmap, pmap_kernel(), curcpu()->ci_index, 2566 ci->ci_index, ci->ci_pmap, 2567 i, ci->ci_kpm_pdir[i], 2568 i, pmap->pm_pdir[i]); 2569 panic("%s: used pmap", __func__); 2570 } 2571 } 2572 #endif 2573 } 2574 #endif /* DIAGNOSTIC */ 2575 } 2576 2577 /* 2578 * pmap_destroy: drop reference count on pmap. free pmap if 2579 * reference count goes to zero. 2580 */ 2581 2582 void 2583 pmap_destroy(struct pmap *pmap) 2584 { 2585 lwp_t *l; 2586 int i; 2587 2588 /* 2589 * If we have torn down this pmap, process deferred frees and 2590 * invalidations. Free now if the system is low on memory. 2591 * Otherwise, free when the pmap is destroyed thus avoiding a 2592 * TLB shootdown. 2593 */ 2594 l = curlwp; 2595 if (__predict_false(l->l_md.md_gc_pmap == pmap)) { 2596 pmap_check_ptps(pmap); 2597 if (uvmexp.free < uvmexp.freetarg) { 2598 pmap_update(pmap); 2599 } else { 2600 KASSERT(pmap->pm_gc_ptp == NULL); 2601 pmap->pm_gc_ptp = l->l_md.md_gc_ptp; 2602 l->l_md.md_gc_ptp = NULL; 2603 l->l_md.md_gc_pmap = NULL; 2604 } 2605 } 2606 2607 /* 2608 * drop reference count 2609 */ 2610 2611 if (atomic_dec_uint_nv(&pmap->pm_obj[0].uo_refs) > 0) { 2612 return; 2613 } 2614 2615 pmap_check_inuse(pmap); 2616 2617 /* 2618 * Reference count is zero, free pmap resources and then free pmap. 2619 * First, remove it from global list of pmaps. 2620 */ 2621 2622 mutex_enter(&pmaps_lock); 2623 LIST_REMOVE(pmap, pm_list); 2624 mutex_exit(&pmaps_lock); 2625 2626 /* 2627 * Process deferred PTP frees. No TLB shootdown required, as the 2628 * PTP pages are no longer visible to any CPU. 2629 */ 2630 2631 pmap_free_ptps(pmap->pm_gc_ptp); 2632 2633 pool_cache_put(&pmap_pdp_cache, pmap->pm_pdir); 2634 2635 #ifdef USER_LDT 2636 if (pmap->pm_ldt != NULL) { 2637 /* 2638 * no need to switch the LDT; this address space is gone, 2639 * nothing is using it. 2640 * 2641 * No need to lock the pmap for ldt_free (or anything else), 2642 * we're the last one to use it. 2643 */ 2644 mutex_enter(&cpu_lock); 2645 ldt_free(pmap->pm_ldt_sel); 2646 mutex_exit(&cpu_lock); 2647 uvm_km_free(kernel_map, (vaddr_t)pmap->pm_ldt, 2648 pmap->pm_ldt_len, UVM_KMF_WIRED); 2649 } 2650 #endif 2651 2652 for (i = 0; i < PTP_LEVELS - 1; i++) { 2653 uvm_obj_destroy(&pmap->pm_obj[i], false); 2654 mutex_destroy(&pmap->pm_obj_lock[i]); 2655 } 2656 kcpuset_destroy(pmap->pm_cpus); 2657 kcpuset_destroy(pmap->pm_kernel_cpus); 2658 #ifdef XEN 2659 kcpuset_destroy(pmap->pm_xen_ptp_cpus); 2660 #endif 2661 2662 pmap_check_ptps(pmap); 2663 pool_cache_put(&pmap_cache, pmap); 2664 } 2665 2666 /* 2667 * pmap_remove_all: pmap is being torn down by the current thread. 2668 * avoid unnecessary invalidations. 
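 *
 * This only records the pmap in curlwp's l_md.md_gc_pmap.  The PTPs
 * freed afterwards are collected on l_md.md_gc_ptp by pmap_freepage()
 * and are handed over to pmap_destroy() (or released earlier via
 * pmap_update() if free memory is short), which can free them without
 * a TLB shootdown since the dying address space is no longer visible
 * to any CPU, as the comment in pmap_destroy() above explains.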
2669 */ 2670 2671 void 2672 pmap_remove_all(struct pmap *pmap) 2673 { 2674 lwp_t *l = curlwp; 2675 2676 KASSERT(l->l_md.md_gc_pmap == NULL); 2677 2678 l->l_md.md_gc_pmap = pmap; 2679 } 2680 2681 #if defined(PMAP_FORK) 2682 /* 2683 * pmap_fork: perform any necessary data structure manipulation when 2684 * a VM space is forked. 2685 */ 2686 2687 void 2688 pmap_fork(struct pmap *pmap1, struct pmap *pmap2) 2689 { 2690 #ifdef USER_LDT 2691 union descriptor *new_ldt; 2692 size_t len; 2693 int sel; 2694 2695 if (__predict_true(pmap1->pm_ldt == NULL)) { 2696 return; 2697 } 2698 2699 /* 2700 * Copy the LDT into the new process. 2701 * 2702 * Read pmap1's ldt pointer and length unlocked; if it changes 2703 * behind our back we'll retry. This will starve if there's a 2704 * stream of LDT changes in another thread but that should not 2705 * happen. 2706 */ 2707 2708 retry: 2709 if (pmap1->pm_ldt != NULL) { 2710 len = pmap1->pm_ldt_len; 2711 /* Allocate space for the new process's LDT */ 2712 new_ldt = (union descriptor *)uvm_km_alloc(kernel_map, len, 0, 2713 UVM_KMF_WIRED); 2714 if (new_ldt == NULL) { 2715 printf("WARNING: %s: unable to allocate LDT space\n", 2716 __func__); 2717 return; 2718 } 2719 mutex_enter(&cpu_lock); 2720 /* Get a GDT slot for it */ 2721 sel = ldt_alloc(new_ldt, len); 2722 if (sel == -1) { 2723 mutex_exit(&cpu_lock); 2724 uvm_km_free(kernel_map, (vaddr_t)new_ldt, len, 2725 UVM_KMF_WIRED); 2726 printf("WARNING: %s: unable to allocate LDT selector\n", 2727 __func__); 2728 return; 2729 } 2730 } else { 2731 /* Wasn't anything there after all. */ 2732 len = -1; 2733 new_ldt = NULL; 2734 sel = -1; 2735 mutex_enter(&cpu_lock); 2736 } 2737 2738 /* If there's still something there now that we have cpu_lock... */ 2739 if (pmap1->pm_ldt != NULL) { 2740 if (len != pmap1->pm_ldt_len) { 2741 /* Oops, it changed. Drop what we did and try again */ 2742 if (len != -1) { 2743 ldt_free(sel); 2744 uvm_km_free(kernel_map, (vaddr_t)new_ldt, 2745 len, UVM_KMF_WIRED); 2746 } 2747 mutex_exit(&cpu_lock); 2748 goto retry; 2749 } 2750 2751 /* Copy the LDT data and install it in pmap2 */ 2752 memcpy(new_ldt, pmap1->pm_ldt, len); 2753 pmap2->pm_ldt = new_ldt; 2754 pmap2->pm_ldt_len = pmap1->pm_ldt_len; 2755 pmap2->pm_ldt_sel = sel; 2756 len = -1; 2757 } 2758 2759 if (len != -1) { 2760 /* There wasn't still something there, so mop up */ 2761 ldt_free(sel); 2762 mutex_exit(&cpu_lock); 2763 uvm_km_free(kernel_map, (vaddr_t)new_ldt, len, 2764 UVM_KMF_WIRED); 2765 } else { 2766 mutex_exit(&cpu_lock); 2767 } 2768 #endif /* USER_LDT */ 2769 } 2770 #endif /* PMAP_FORK */ 2771 2772 #ifdef USER_LDT 2773 2774 /* 2775 * pmap_ldt_xcall: cross call used by pmap_ldt_sync. if the named pmap 2776 * is active, reload LDTR. 2777 */ 2778 static void 2779 pmap_ldt_xcall(void *arg1, void *arg2) 2780 { 2781 struct pmap *pm; 2782 2783 kpreempt_disable(); 2784 pm = arg1; 2785 if (curcpu()->ci_pmap == pm) { 2786 lldt(pm->pm_ldt_sel); 2787 } 2788 kpreempt_enable(); 2789 } 2790 2791 /* 2792 * pmap_ldt_sync: LDT selector for the named pmap is changing. swap 2793 * in the new selector on all CPUs. 2794 */ 2795 void 2796 pmap_ldt_sync(struct pmap *pm) 2797 { 2798 uint64_t where; 2799 2800 KASSERT(mutex_owned(&cpu_lock)); 2801 2802 pmap_ldt_evcnt.ev_count++; 2803 where = xc_broadcast(0, pmap_ldt_xcall, pm, NULL); 2804 xc_wait(where); 2805 } 2806 2807 /* 2808 * pmap_ldt_cleanup: if the pmap has a local LDT, deallocate it, and 2809 * restore the default. 
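 *
 * Note the ordering below: the default selector is installed in the
 * pmap first, pmap_ldt_sync() then reloads LDTR on every CPU running
 * this pmap, and only after that are the old selector and its memory
 * released, so no CPU is left executing with a freed LDT.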
2810 */ 2811 2812 void 2813 pmap_ldt_cleanup(struct lwp *l) 2814 { 2815 pmap_t pmap = l->l_proc->p_vmspace->vm_map.pmap; 2816 union descriptor *dp = NULL; 2817 size_t len = 0; 2818 int sel = -1; 2819 2820 if (__predict_true(pmap->pm_ldt == NULL)) { 2821 return; 2822 } 2823 2824 mutex_enter(&cpu_lock); 2825 if (pmap->pm_ldt != NULL) { 2826 sel = pmap->pm_ldt_sel; 2827 dp = pmap->pm_ldt; 2828 len = pmap->pm_ldt_len; 2829 pmap->pm_ldt_sel = GSYSSEL(GLDT_SEL, SEL_KPL); 2830 pmap->pm_ldt = NULL; 2831 pmap->pm_ldt_len = 0; 2832 pmap_ldt_sync(pmap); 2833 ldt_free(sel); 2834 uvm_km_free(kernel_map, (vaddr_t)dp, len, UVM_KMF_WIRED); 2835 } 2836 mutex_exit(&cpu_lock); 2837 } 2838 #endif /* USER_LDT */ 2839 2840 /* 2841 * pmap_activate: activate a process' pmap 2842 * 2843 * => must be called with kernel preemption disabled 2844 * => if lwp is the curlwp, then set ci_want_pmapload so that 2845 * actual MMU context switch will be done by pmap_load() later 2846 */ 2847 2848 void 2849 pmap_activate(struct lwp *l) 2850 { 2851 struct cpu_info *ci; 2852 struct pmap *pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map); 2853 2854 KASSERT(kpreempt_disabled()); 2855 2856 ci = curcpu(); 2857 2858 if (l != ci->ci_curlwp) 2859 return; 2860 2861 KASSERT(ci->ci_want_pmapload == 0); 2862 KASSERT(ci->ci_tlbstate != TLBSTATE_VALID); 2863 2864 /* 2865 * no need to switch to kernel vmspace because 2866 * it's a subset of any vmspace. 2867 */ 2868 2869 if (pmap == pmap_kernel()) { 2870 ci->ci_want_pmapload = 0; 2871 return; 2872 } 2873 2874 ci->ci_want_pmapload = 1; 2875 } 2876 2877 #if defined(XEN) && defined(__x86_64__) 2878 #define KASSERT_PDIRPA(pmap) \ 2879 KASSERT(pmap_pdirpa(pmap, 0) == ci->ci_xen_current_user_pgd || \ 2880 pmap == pmap_kernel()) 2881 #elif defined(PAE) 2882 #define KASSERT_PDIRPA(pmap) \ 2883 KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(ci->ci_pae_l3_pdir[0])) 2884 #elif !defined(XEN) 2885 #define KASSERT_PDIRPA(pmap) \ 2886 KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(rcr3())) 2887 #else 2888 #define KASSERT_PDIRPA(pmap) KASSERT(true) /* nothing to do */ 2889 #endif 2890 2891 /* 2892 * pmap_reactivate: try to regain reference to the pmap. 2893 * 2894 * => Must be called with kernel preemption disabled. 2895 */ 2896 2897 static void 2898 pmap_reactivate(struct pmap *pmap) 2899 { 2900 struct cpu_info * const ci = curcpu(); 2901 const cpuid_t cid = cpu_index(ci); 2902 2903 KASSERT(kpreempt_disabled()); 2904 KASSERT_PDIRPA(pmap); 2905 2906 /* 2907 * If we still have a lazy reference to this pmap, we can assume 2908 * that there was no TLB shootdown for this pmap in the meantime. 2909 * 2910 * The order of events here is important as we must synchronize 2911 * with TLB shootdown interrupts. Declare interest in invalidations 2912 * (TLBSTATE_VALID) and then check the CPU set, which the IPIs can 2913 * change only when the state is TLBSTATE_LAZY. 2914 */ 2915 2916 ci->ci_tlbstate = TLBSTATE_VALID; 2917 KASSERT(kcpuset_isset(pmap->pm_kernel_cpus, cid)); 2918 2919 if (kcpuset_isset(pmap->pm_cpus, cid)) { 2920 /* We have the reference, state is valid. */ 2921 } else { 2922 /* 2923 * Must reload the TLB, pmap has been changed during 2924 * deactivated. 2925 */ 2926 kcpuset_atomic_set(pmap->pm_cpus, cid); 2927 2928 u_int gen = uvm_emap_gen_return(); 2929 tlbflush(); 2930 uvm_emap_update(gen); 2931 } 2932 } 2933 2934 /* 2935 * pmap_load: perform the actual pmap switch, i.e. fill in %cr3 register 2936 * and relevant LDT info. 
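 *
 * A summary of the lazy-switch protocol, pieced together from
 * pmap_activate()/pmap_reactivate() above and pmap_deactivate() below:
 * pmap_activate() merely sets ci_want_pmapload and defers the
 * expensive %cr3/LDT reload to this function.  While an LWP is off
 * the CPU, its pmap stays loaded but ci_tlbstate is left at
 * TLBSTATE_LAZY, so TLB shootdowns may simply clear this CPU's bit in
 * pm_cpus instead of interrupting it.  pmap_reactivate() flips the
 * state back to TLBSTATE_VALID and flushes the TLB only if that bit
 * was lost in the meantime.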
2937 * 2938 * Ensures that the current process' pmap is loaded on the current CPU's 2939 * MMU and that there are no stale TLB entries. 2940 * 2941 * => The caller should disable kernel preemption or do check-and-retry 2942 * to prevent a preemption from undoing our efforts. 2943 * => This function may block. 2944 */ 2945 void 2946 pmap_load(void) 2947 { 2948 struct cpu_info *ci; 2949 struct pmap *pmap, *oldpmap; 2950 struct lwp *l; 2951 struct pcb *pcb; 2952 cpuid_t cid; 2953 uint64_t ncsw; 2954 2955 kpreempt_disable(); 2956 retry: 2957 ci = curcpu(); 2958 if (!ci->ci_want_pmapload) { 2959 kpreempt_enable(); 2960 return; 2961 } 2962 l = ci->ci_curlwp; 2963 ncsw = l->l_ncsw; 2964 2965 /* should be able to take ipis. */ 2966 KASSERT(ci->ci_ilevel < IPL_HIGH); 2967 #ifdef XEN 2968 /* Check to see if interrupts are enabled (ie; no events are masked) */ 2969 KASSERT(x86_read_psl() == 0); 2970 #else 2971 KASSERT((x86_read_psl() & PSL_I) != 0); 2972 #endif 2973 2974 KASSERT(l != NULL); 2975 pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map); 2976 KASSERT(pmap != pmap_kernel()); 2977 oldpmap = ci->ci_pmap; 2978 pcb = lwp_getpcb(l); 2979 2980 if (pmap == oldpmap) { 2981 pmap_reactivate(pmap); 2982 ci->ci_want_pmapload = 0; 2983 kpreempt_enable(); 2984 return; 2985 } 2986 2987 /* 2988 * Acquire a reference to the new pmap and perform the switch. 2989 */ 2990 2991 pmap_reference(pmap); 2992 2993 cid = cpu_index(ci); 2994 kcpuset_atomic_clear(oldpmap->pm_cpus, cid); 2995 kcpuset_atomic_clear(oldpmap->pm_kernel_cpus, cid); 2996 2997 KASSERT_PDIRPA(oldpmap); 2998 KASSERT(!kcpuset_isset(pmap->pm_cpus, cid)); 2999 KASSERT(!kcpuset_isset(pmap->pm_kernel_cpus, cid)); 3000 3001 /* 3002 * Mark the pmap in use by this CPU. Again, we must synchronize 3003 * with TLB shootdown interrupts, so set the state VALID first, 3004 * then register us for shootdown events on this pmap. 3005 */ 3006 ci->ci_tlbstate = TLBSTATE_VALID; 3007 kcpuset_atomic_set(pmap->pm_cpus, cid); 3008 kcpuset_atomic_set(pmap->pm_kernel_cpus, cid); 3009 ci->ci_pmap = pmap; 3010 3011 /* 3012 * update tss. now that we have registered for invalidations 3013 * from other CPUs, we're good to load the page tables. 3014 */ 3015 #ifdef PAE 3016 pcb->pcb_cr3 = ci->ci_pae_l3_pdirpa; 3017 #else 3018 pcb->pcb_cr3 = pmap_pdirpa(pmap, 0); 3019 #endif 3020 3021 #ifdef i386 3022 #ifndef XEN 3023 ci->ci_tss->tss.tss_ldt = pmap->pm_ldt_sel; 3024 ci->ci_tss->tss.tss_cr3 = pcb->pcb_cr3; 3025 #endif /* !XEN */ 3026 #endif /* i386 */ 3027 3028 lldt(pmap->pm_ldt_sel); 3029 3030 u_int gen = uvm_emap_gen_return(); 3031 cpu_load_pmap(pmap, oldpmap); 3032 uvm_emap_update(gen); 3033 3034 ci->ci_want_pmapload = 0; 3035 3036 /* 3037 * we're now running with the new pmap. drop the reference 3038 * to the old pmap. if we block, we need to go around again. 3039 */ 3040 3041 pmap_destroy(oldpmap); 3042 if (l->l_ncsw != ncsw) { 3043 goto retry; 3044 } 3045 3046 kpreempt_enable(); 3047 } 3048 3049 /* 3050 * pmap_deactivate: deactivate a process' pmap. 3051 * 3052 * => Must be called with kernel preemption disabled (high IPL is enough). 3053 */ 3054 void 3055 pmap_deactivate(struct lwp *l) 3056 { 3057 struct pmap *pmap; 3058 struct cpu_info *ci; 3059 3060 KASSERT(kpreempt_disabled()); 3061 3062 if (l != curlwp) { 3063 return; 3064 } 3065 3066 /* 3067 * Wait for pending TLB shootdowns to complete. Necessary because 3068 * TLB shootdown state is per-CPU, and the LWP may be coming off 3069 * the CPU before it has a chance to call pmap_update(), e.g. 
due 3070 * to kernel preemption or blocking routine in between. 3071 */ 3072 pmap_tlb_shootnow(); 3073 3074 ci = curcpu(); 3075 3076 if (ci->ci_want_pmapload) { 3077 /* 3078 * ci_want_pmapload means that our pmap is not loaded on 3079 * the CPU or TLB might be stale. note that pmap_kernel() 3080 * is always considered loaded. 3081 */ 3082 KASSERT(vm_map_pmap(&l->l_proc->p_vmspace->vm_map) 3083 != pmap_kernel()); 3084 KASSERT(vm_map_pmap(&l->l_proc->p_vmspace->vm_map) 3085 != ci->ci_pmap || ci->ci_tlbstate != TLBSTATE_VALID); 3086 3087 /* 3088 * userspace has not been touched. 3089 * nothing to do here. 3090 */ 3091 3092 ci->ci_want_pmapload = 0; 3093 return; 3094 } 3095 3096 pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map); 3097 3098 if (pmap == pmap_kernel()) { 3099 return; 3100 } 3101 3102 KASSERT_PDIRPA(pmap); 3103 KASSERT(ci->ci_pmap == pmap); 3104 3105 /* 3106 * we aren't interested in TLB invalidations for this pmap, 3107 * at least for the time being. 3108 */ 3109 3110 KASSERT(ci->ci_tlbstate == TLBSTATE_VALID); 3111 ci->ci_tlbstate = TLBSTATE_LAZY; 3112 } 3113 3114 /* 3115 * end of lifecycle functions 3116 */ 3117 3118 /* 3119 * some misc. functions 3120 */ 3121 3122 int 3123 pmap_pdes_invalid(vaddr_t va, pd_entry_t * const *pdes, pd_entry_t *lastpde) 3124 { 3125 int i; 3126 unsigned long index; 3127 pd_entry_t pde; 3128 3129 for (i = PTP_LEVELS; i > 1; i--) { 3130 index = pl_i(va, i); 3131 pde = pdes[i - 2][index]; 3132 if ((pde & PG_V) == 0) 3133 return i; 3134 } 3135 if (lastpde != NULL) 3136 *lastpde = pde; 3137 return 0; 3138 } 3139 3140 /* 3141 * pmap_extract: extract a PA for the given VA 3142 */ 3143 3144 bool 3145 pmap_extract(struct pmap *pmap, vaddr_t va, paddr_t *pap) 3146 { 3147 pt_entry_t *ptes, pte; 3148 pd_entry_t pde; 3149 pd_entry_t * const *pdes; 3150 struct pmap *pmap2; 3151 struct cpu_info *ci; 3152 paddr_t pa; 3153 lwp_t *l; 3154 bool hard, rv; 3155 3156 #ifdef __HAVE_DIRECT_MAP 3157 if (va >= PMAP_DIRECT_BASE && va < PMAP_DIRECT_END) { 3158 if (pap != NULL) { 3159 *pap = va - PMAP_DIRECT_BASE; 3160 } 3161 return true; 3162 } 3163 #endif 3164 3165 rv = false; 3166 pa = 0; 3167 l = curlwp; 3168 3169 kpreempt_disable(); 3170 ci = l->l_cpu; 3171 if (__predict_true(!ci->ci_want_pmapload && ci->ci_pmap == pmap) || 3172 pmap == pmap_kernel()) { 3173 /* 3174 * no need to lock, because it's pmap_kernel() or our 3175 * own pmap and is active. if a user pmap, the caller 3176 * will hold the vm_map write/read locked and so prevent 3177 * entries from disappearing while we are here. ptps 3178 * can disappear via pmap_remove() and pmap_protect(), 3179 * but they are called with the vm_map write locked. 3180 */ 3181 hard = false; 3182 ptes = PTE_BASE; 3183 pdes = normal_pdes; 3184 } else { 3185 /* we lose, do it the hard way. */ 3186 hard = true; 3187 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); 3188 } 3189 if (pmap_pdes_valid(va, pdes, &pde)) { 3190 pte = ptes[pl1_i(va)]; 3191 if (pde & PG_PS) { 3192 pa = (pde & PG_LGFRAME) | (va & (NBPD_L2 - 1)); 3193 rv = true; 3194 } else if (__predict_true((pte & PG_V) != 0)) { 3195 pa = pmap_pte2pa(pte) | (va & (NBPD_L1 - 1)); 3196 rv = true; 3197 } 3198 } 3199 if (__predict_false(hard)) { 3200 pmap_unmap_ptes(pmap, pmap2); 3201 } 3202 kpreempt_enable(); 3203 if (pap != NULL) { 3204 *pap = pa; 3205 } 3206 return rv; 3207 } 3208 3209 3210 /* 3211 * vtophys: virtual address to physical address. For use by 3212 * machine-dependent code only. 
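 *
 * Minimal usage sketch (hypothetical caller, not from this file), for
 * a kernel virtual address that is known to be mapped:
 *
 *	paddr_t pa;
 *
 *	if (!pmap_extract(pmap_kernel(), va, &pa))
 *		panic("va %#" PRIxVADDR " is not mapped", va);
 *
 * vtophys(va) performs the same lookup but folds the failure case
 * into a return value of 0, which is only acceptable when physical
 * address 0 cannot be a valid answer for the caller.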
3213 */ 3214 3215 paddr_t 3216 vtophys(vaddr_t va) 3217 { 3218 paddr_t pa; 3219 3220 if (pmap_extract(pmap_kernel(), va, &pa) == true) 3221 return (pa); 3222 return (0); 3223 } 3224 3225 __strict_weak_alias(pmap_extract_ma, pmap_extract); 3226 3227 #ifdef XEN 3228 3229 /* 3230 * vtomach: virtual address to machine address. For use by 3231 * machine-dependent code only. 3232 */ 3233 3234 paddr_t 3235 vtomach(vaddr_t va) 3236 { 3237 paddr_t pa; 3238 3239 if (pmap_extract_ma(pmap_kernel(), va, &pa) == true) 3240 return (pa); 3241 return (0); 3242 } 3243 3244 #endif /* XEN */ 3245 3246 /* 3247 * pmap_virtual_space: used during bootup [pmap_steal_memory] to 3248 * determine the bounds of the kernel virtual addess space. 3249 */ 3250 3251 void 3252 pmap_virtual_space(vaddr_t *startp, vaddr_t *endp) 3253 { 3254 *startp = virtual_avail; 3255 *endp = virtual_end; 3256 } 3257 3258 /* 3259 * pmap_zero_page: zero a page 3260 */ 3261 3262 void 3263 pmap_zero_page(paddr_t pa) 3264 { 3265 #if defined(__HAVE_DIRECT_MAP) 3266 pagezero(PMAP_DIRECT_MAP(pa)); 3267 #else 3268 #if defined(XEN) 3269 if (XEN_VERSION_SUPPORTED(3, 4)) 3270 xen_pagezero(pa); 3271 #endif 3272 struct cpu_info *ci; 3273 pt_entry_t *zpte; 3274 vaddr_t zerova; 3275 3276 const pd_entry_t pteflags = PG_V | PG_RW | pmap_pg_nx | PG_M | PG_U; 3277 3278 kpreempt_disable(); 3279 3280 ci = curcpu(); 3281 zerova = ci->vpage[VPAGE_ZER]; 3282 zpte = ci->vpage_pte[VPAGE_ZER]; 3283 3284 KASSERTMSG(!*zpte, "pmap_zero_page: lock botch"); 3285 3286 pmap_pte_set(zpte, pmap_pa2pte(pa) | pteflags); 3287 pmap_pte_flush(); 3288 pmap_update_pg(zerova); /* flush TLB */ 3289 3290 memset((void *)zerova, 0, PAGE_SIZE); 3291 3292 #if defined(DIAGNOSTIC) || defined(XEN) 3293 pmap_pte_set(zpte, 0); /* zap ! */ 3294 pmap_pte_flush(); 3295 #endif 3296 3297 kpreempt_enable(); 3298 #endif /* defined(__HAVE_DIRECT_MAP) */ 3299 } 3300 3301 /* 3302 * pmap_pagezeroidle: the same, for the idle loop page zero'er. 3303 * Returns true if the page was zero'd, false if we aborted for 3304 * some reason. 3305 */ 3306 3307 bool 3308 pmap_pageidlezero(paddr_t pa) 3309 { 3310 #ifdef __HAVE_DIRECT_MAP 3311 KASSERT(cpu_feature[0] & CPUID_SSE2); 3312 return sse2_idlezero_page((void *)PMAP_DIRECT_MAP(pa)); 3313 #else 3314 struct cpu_info *ci; 3315 pt_entry_t *zpte; 3316 vaddr_t zerova; 3317 bool rv; 3318 3319 const pd_entry_t pteflags = PG_V | PG_RW | pmap_pg_nx | PG_M | PG_U; 3320 3321 ci = curcpu(); 3322 zerova = ci->vpage[VPAGE_ZER]; 3323 zpte = ci->vpage_pte[VPAGE_ZER]; 3324 3325 KASSERT(cpu_feature[0] & CPUID_SSE2); 3326 KASSERT(*zpte == 0); 3327 3328 pmap_pte_set(zpte, pmap_pa2pte(pa) | pteflags); 3329 pmap_pte_flush(); 3330 pmap_update_pg(zerova); /* flush TLB */ 3331 3332 rv = sse2_idlezero_page((void *)zerova); 3333 3334 #if defined(DIAGNOSTIC) || defined(XEN) 3335 pmap_pte_set(zpte, 0); /* zap ! 
*/ 3336 pmap_pte_flush(); 3337 #endif 3338 3339 return rv; 3340 #endif 3341 } 3342 3343 /* 3344 * pmap_copy_page: copy a page 3345 */ 3346 3347 void 3348 pmap_copy_page(paddr_t srcpa, paddr_t dstpa) 3349 { 3350 #if defined(__HAVE_DIRECT_MAP) 3351 vaddr_t srcva = PMAP_DIRECT_MAP(srcpa); 3352 vaddr_t dstva = PMAP_DIRECT_MAP(dstpa); 3353 3354 memcpy((void *)dstva, (void *)srcva, PAGE_SIZE); 3355 #else 3356 #if defined(XEN) 3357 if (XEN_VERSION_SUPPORTED(3, 4)) { 3358 xen_copy_page(srcpa, dstpa); 3359 return; 3360 } 3361 #endif 3362 struct cpu_info *ci; 3363 pt_entry_t *srcpte, *dstpte; 3364 vaddr_t srcva, dstva; 3365 3366 const pd_entry_t pteflags = PG_V | PG_RW | pmap_pg_nx | PG_U; 3367 3368 kpreempt_disable(); 3369 3370 ci = curcpu(); 3371 srcva = ci->vpage[VPAGE_SRC]; 3372 dstva = ci->vpage[VPAGE_DST]; 3373 srcpte = ci->vpage_pte[VPAGE_SRC]; 3374 dstpte = ci->vpage_pte[VPAGE_DST]; 3375 3376 KASSERT(*srcpte == 0 && *dstpte == 0); 3377 3378 pmap_pte_set(srcpte, pmap_pa2pte(srcpa) | pteflags); 3379 pmap_pte_set(dstpte, pmap_pa2pte(dstpa) | pteflags | PG_M); 3380 pmap_pte_flush(); 3381 pmap_update_pg(srcva); 3382 pmap_update_pg(dstva); 3383 3384 memcpy((void *)dstva, (void *)srcva, PAGE_SIZE); 3385 3386 #if defined(DIAGNOSTIC) || defined(XEN) 3387 pmap_pte_set(srcpte, 0); 3388 pmap_pte_set(dstpte, 0); 3389 pmap_pte_flush(); 3390 #endif 3391 3392 kpreempt_enable(); 3393 #endif /* defined(__HAVE_DIRECT_MAP) */ 3394 } 3395 3396 static pt_entry_t * 3397 pmap_map_ptp(struct vm_page *ptp) 3398 { 3399 #ifdef __HAVE_DIRECT_MAP 3400 return (void *)PMAP_DIRECT_MAP(VM_PAGE_TO_PHYS(ptp)); 3401 #else 3402 struct cpu_info *ci; 3403 pt_entry_t *ptppte; 3404 vaddr_t ptpva; 3405 3406 KASSERT(kpreempt_disabled()); 3407 3408 #ifndef XEN 3409 const pd_entry_t pteflags = PG_V | PG_RW | pmap_pg_nx | PG_U | PG_M; 3410 #else 3411 const pd_entry_t pteflags = PG_V | pmap_pg_nx | PG_U | PG_M; 3412 #endif 3413 3414 ci = curcpu(); 3415 ptpva = ci->vpage[VPAGE_PTP]; 3416 ptppte = ci->vpage_pte[VPAGE_PTP]; 3417 3418 pmap_pte_set(ptppte, pmap_pa2pte(VM_PAGE_TO_PHYS(ptp)) | pteflags); 3419 3420 pmap_pte_flush(); 3421 pmap_update_pg(ptpva); 3422 3423 return (pt_entry_t *)ptpva; 3424 #endif 3425 } 3426 3427 static void 3428 pmap_unmap_ptp(void) 3429 { 3430 #ifndef __HAVE_DIRECT_MAP 3431 #if defined(DIAGNOSTIC) || defined(XEN) 3432 struct cpu_info *ci; 3433 pt_entry_t *pte; 3434 3435 KASSERT(kpreempt_disabled()); 3436 3437 ci = curcpu(); 3438 pte = ci->vpage_pte[VPAGE_PTP]; 3439 3440 if (*pte != 0) { 3441 pmap_pte_set(pte, 0); 3442 pmap_pte_flush(); 3443 } 3444 #endif 3445 #endif 3446 } 3447 3448 static pt_entry_t * 3449 pmap_map_pte(struct pmap *pmap, struct vm_page *ptp, vaddr_t va) 3450 { 3451 3452 KASSERT(kpreempt_disabled()); 3453 if (pmap_is_curpmap(pmap)) { 3454 return &PTE_BASE[pl1_i(va)]; /* (k)vtopte */ 3455 } 3456 KASSERT(ptp != NULL); 3457 return pmap_map_ptp(ptp) + pl1_pi(va); 3458 } 3459 3460 static void 3461 pmap_unmap_pte(void) 3462 { 3463 3464 KASSERT(kpreempt_disabled()); 3465 3466 pmap_unmap_ptp(); 3467 } 3468 3469 /* 3470 * p m a p r e m o v e f u n c t i o n s 3471 * 3472 * functions that remove mappings 3473 */ 3474 3475 /* 3476 * pmap_remove_ptes: remove PTEs from a PTP 3477 * 3478 * => caller must hold pmap's lock 3479 * => PTP must be mapped into KVA 3480 * => PTP should be null if pmap == pmap_kernel() 3481 * => must be called with kernel preemption disabled 3482 * => returns composite pte if at least one page should be shot down 3483 */ 3484 3485 static void 3486 pmap_remove_ptes(struct pmap *pmap, 
struct vm_page *ptp, vaddr_t ptpva, 3487 vaddr_t startva, vaddr_t endva, struct pv_entry **pv_tofree) 3488 { 3489 pt_entry_t *pte = (pt_entry_t *)ptpva; 3490 3491 KASSERT(pmap == pmap_kernel() || mutex_owned(pmap->pm_lock)); 3492 KASSERT(kpreempt_disabled()); 3493 3494 /* 3495 * note that ptpva points to the PTE that maps startva. this may 3496 * or may not be the first PTE in the PTP. 3497 * 3498 * we loop through the PTP while there are still PTEs to look at 3499 * and the wire_count is greater than 1 (because we use the wire_count 3500 * to keep track of the number of real PTEs in the PTP). 3501 */ 3502 while (startva < endva && (ptp == NULL || ptp->wire_count > 1)) { 3503 (void)pmap_remove_pte(pmap, ptp, pte, startva, pv_tofree); 3504 startva += PAGE_SIZE; 3505 pte++; 3506 } 3507 } 3508 3509 3510 /* 3511 * pmap_remove_pte: remove a single PTE from a PTP. 3512 * 3513 * => caller must hold pmap's lock 3514 * => PTP must be mapped into KVA 3515 * => PTP should be null if pmap == pmap_kernel() 3516 * => returns true if we removed a mapping 3517 * => must be called with kernel preemption disabled 3518 */ 3519 static bool 3520 pmap_remove_pte(struct pmap *pmap, struct vm_page *ptp, pt_entry_t *pte, 3521 vaddr_t va, struct pv_entry **pv_tofree) 3522 { 3523 struct pv_entry *pve; 3524 struct vm_page *pg; 3525 struct pmap_page *pp; 3526 pt_entry_t opte; 3527 3528 KASSERT(pmap == pmap_kernel() || mutex_owned(pmap->pm_lock)); 3529 KASSERT(kpreempt_disabled()); 3530 3531 if (!pmap_valid_entry(*pte)) { 3532 /* VA not mapped. */ 3533 return false; 3534 } 3535 3536 /* Atomically save the old PTE and zap it. */ 3537 opte = pmap_pte_testset(pte, 0); 3538 if (!pmap_valid_entry(opte)) { 3539 return false; 3540 } 3541 3542 pmap_exec_account(pmap, va, opte, 0); 3543 pmap_stats_update_bypte(pmap, 0, opte); 3544 3545 if (ptp) { 3546 /* 3547 * Dropping a PTE. Make sure that the PDE is flushed. 3548 */ 3549 ptp->wire_count--; 3550 if (ptp->wire_count <= 1) { 3551 opte |= PG_U; 3552 } 3553 } 3554 3555 if ((opte & PG_U) != 0) { 3556 pmap_tlb_shootdown(pmap, va, opte, TLBSHOOT_REMOVE_PTE); 3557 } 3558 3559 /* 3560 * If we are not on a pv_head list - we are done. 3561 */ 3562 if ((opte & PG_PVLIST) == 0) { 3563 #ifndef DOM0OPS 3564 KASSERTMSG((PHYS_TO_VM_PAGE(pmap_pte2pa(opte)) == NULL), 3565 "managed page without PG_PVLIST for %#"PRIxVADDR, va); 3566 KASSERTMSG((pmap_pv_tracked(pmap_pte2pa(opte)) == NULL), 3567 "pv-tracked page without PG_PVLIST for %#"PRIxVADDR, va); 3568 #endif 3569 return true; 3570 } 3571 3572 if ((pg = PHYS_TO_VM_PAGE(pmap_pte2pa(opte))) != NULL) { 3573 KASSERT(uvm_page_locked_p(pg)); 3574 pp = VM_PAGE_TO_PP(pg); 3575 } else if ((pp = pmap_pv_tracked(pmap_pte2pa(opte))) == NULL) { 3576 paddr_t pa = pmap_pte2pa(opte); 3577 panic("%s: PG_PVLIST with pv-untracked page" 3578 " va = %#"PRIxVADDR"pa = %#"PRIxPADDR" (%#"PRIxPADDR")", 3579 __func__, va, pa, atop(pa)); 3580 } 3581 3582 /* Sync R/M bits. */ 3583 pp->pp_attrs |= opte; 3584 pve = pmap_remove_pv(pp, ptp, va); 3585 3586 if (pve) { 3587 pve->pve_next = *pv_tofree; 3588 *pv_tofree = pve; 3589 } 3590 return true; 3591 } 3592 3593 /* 3594 * pmap_remove: mapping removal function. 
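 *
 * A single-page request takes a shortcut straight to
 * pmap_remove_pte(); larger ranges are walked one L2 block at a time,
 * skipping blocks whose PDE is invalid, with pmap_remove_ptes() doing
 * the per-PTE work and the PTP freed once its wire_count drops to 1.
 * As with the other pmap(9) calls that tear down mappings, callers
 * are expected to follow up with pmap_update() before relying on the
 * mappings being gone on every CPU.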
3595 * 3596 * => caller should not be holding any pmap locks 3597 */ 3598 3599 void 3600 pmap_remove(struct pmap *pmap, vaddr_t sva, vaddr_t eva) 3601 { 3602 pt_entry_t *ptes; 3603 pd_entry_t pde; 3604 pd_entry_t * const *pdes; 3605 struct pv_entry *pv_tofree = NULL; 3606 bool result; 3607 int i; 3608 paddr_t ptppa; 3609 vaddr_t blkendva, va = sva; 3610 struct vm_page *ptp; 3611 struct pmap *pmap2; 3612 3613 kpreempt_disable(); 3614 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); /* locks pmap */ 3615 3616 /* 3617 * removing one page? take shortcut function. 3618 */ 3619 3620 if (va + PAGE_SIZE == eva) { 3621 if (pmap_pdes_valid(va, pdes, &pde)) { 3622 3623 /* PA of the PTP */ 3624 ptppa = pmap_pte2pa(pde); 3625 3626 /* Get PTP if non-kernel mapping. */ 3627 if (pmap != pmap_kernel()) { 3628 ptp = pmap_find_ptp(pmap, va, ptppa, 1); 3629 KASSERTMSG(ptp != NULL, 3630 "%s: unmanaged PTP detected", __func__); 3631 } else { 3632 /* Never free kernel PTPs. */ 3633 ptp = NULL; 3634 } 3635 3636 result = pmap_remove_pte(pmap, ptp, 3637 &ptes[pl1_i(va)], va, &pv_tofree); 3638 3639 /* 3640 * if mapping removed and the PTP is no longer 3641 * being used, free it! 3642 */ 3643 3644 if (result && ptp && ptp->wire_count <= 1) 3645 pmap_free_ptp(pmap, ptp, va, ptes, pdes); 3646 } 3647 } else for (/* null */ ; va < eva ; va = blkendva) { 3648 int lvl; 3649 3650 /* determine range of block */ 3651 blkendva = x86_round_pdr(va+1); 3652 if (blkendva > eva) 3653 blkendva = eva; 3654 3655 /* 3656 * Our PTE mappings should never be removed with pmap_remove. 3657 * 3658 * XXXmaxv: still needed? 3659 * 3660 * A long term solution is to move the PTEs out of user address 3661 * space, and into kernel address space. Then we can set 3662 * VM_MAXUSER_ADDRESS to be VM_MAX_ADDRESS. 3663 */ 3664 for (i = 0; i < PDP_SIZE; i++) { 3665 if (pl_i(va, PTP_LEVELS) == PDIR_SLOT_PTE+i) 3666 panic("PTE space accessed"); 3667 } 3668 3669 lvl = pmap_pdes_invalid(va, pdes, &pde); 3670 if (lvl != 0) { 3671 /* 3672 * skip a range corresponding to an invalid pde. 3673 */ 3674 blkendva = (va & ptp_masks[lvl - 1]) + nbpd[lvl - 1]; 3675 continue; 3676 } 3677 3678 /* PA of the PTP */ 3679 ptppa = pmap_pte2pa(pde); 3680 3681 /* Get PTP if non-kernel mapping. */ 3682 if (pmap != pmap_kernel()) { 3683 ptp = pmap_find_ptp(pmap, va, ptppa, 1); 3684 KASSERTMSG(ptp != NULL, "%s: unmanaged PTP detected", 3685 __func__); 3686 } else { 3687 /* Never free kernel PTPs. */ 3688 ptp = NULL; 3689 } 3690 3691 pmap_remove_ptes(pmap, ptp, (vaddr_t)&ptes[pl1_i(va)], va, 3692 blkendva, &pv_tofree); 3693 3694 /* if PTP is no longer being used, free it! */ 3695 if (ptp && ptp->wire_count <= 1) { 3696 pmap_free_ptp(pmap, ptp, va, ptes, pdes); 3697 } 3698 } 3699 pmap_unmap_ptes(pmap, pmap2); /* unlock pmap */ 3700 kpreempt_enable(); 3701 3702 /* Now we free unused PVs */ 3703 if (pv_tofree) 3704 pmap_free_pvs(pv_tofree); 3705 } 3706 3707 /* 3708 * pmap_sync_pv: clear pte bits and return the old value of the pte. 3709 * 3710 * => Caller should disable kernel preemption. 3711 * => issues tlb shootdowns if necessary. 
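 * => returns 0 and the old PTE via *optep on success; returns EAGAIN
 *    when a racing V->P operation (e.g. pmap_remove()) changed the
 *    PTE first, in which case callers such as pmap_pp_remove() and
 *    pmap_pp_clear_attrs() back off and retry.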
3712 */ 3713 3714 static int 3715 pmap_sync_pv(struct pv_pte *pvpte, pt_entry_t expect, int clearbits, 3716 pt_entry_t *optep) 3717 { 3718 struct pmap *pmap; 3719 struct vm_page *ptp; 3720 vaddr_t va; 3721 pt_entry_t *ptep; 3722 pt_entry_t opte; 3723 pt_entry_t npte; 3724 bool need_shootdown; 3725 3726 ptp = pvpte->pte_ptp; 3727 va = pvpte->pte_va; 3728 KASSERT(ptp == NULL || ptp->uobject != NULL); 3729 KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset); 3730 pmap = ptp_to_pmap(ptp); 3731 3732 KASSERT((expect & ~(PG_FRAME | PG_V)) == 0); 3733 KASSERT((expect & PG_V) != 0); 3734 KASSERT(clearbits == ~0 || (clearbits & ~(PG_M | PG_U | PG_RW)) == 0); 3735 KASSERT(kpreempt_disabled()); 3736 3737 ptep = pmap_map_pte(pmap, ptp, va); 3738 do { 3739 opte = *ptep; 3740 KASSERT((opte & (PG_M | PG_U)) != PG_M); 3741 KASSERT((opte & (PG_U | PG_V)) != PG_U); 3742 KASSERT(opte == 0 || (opte & PG_V) != 0); 3743 if ((opte & (PG_FRAME | PG_V)) != expect) { 3744 3745 /* 3746 * we lost a race with a V->P operation like 3747 * pmap_remove(). wait for the competitor 3748 * reflecting pte bits into mp_attrs. 3749 * 3750 * issue a redundant TLB shootdown so that 3751 * we can wait for its completion. 3752 */ 3753 3754 pmap_unmap_pte(); 3755 if (clearbits != 0) { 3756 pmap_tlb_shootdown(pmap, va, 3757 (pmap == pmap_kernel() ? PG_G : 0), 3758 TLBSHOOT_SYNC_PV1); 3759 } 3760 return EAGAIN; 3761 } 3762 3763 /* 3764 * check if there's anything to do on this pte. 3765 */ 3766 3767 if ((opte & clearbits) == 0) { 3768 need_shootdown = false; 3769 break; 3770 } 3771 3772 /* 3773 * we need a shootdown if the pte is cached. (PG_U) 3774 * 3775 * ...unless we are clearing only the PG_RW bit and 3776 * it isn't cached as RW. (PG_M) 3777 */ 3778 3779 need_shootdown = (opte & PG_U) != 0 && 3780 !(clearbits == PG_RW && (opte & PG_M) == 0); 3781 3782 npte = opte & ~clearbits; 3783 3784 /* 3785 * if we need a shootdown anyway, clear PG_U and PG_M. 3786 */ 3787 3788 if (need_shootdown) { 3789 npte &= ~(PG_U | PG_M); 3790 } 3791 KASSERT((npte & (PG_M | PG_U)) != PG_M); 3792 KASSERT((npte & (PG_U | PG_V)) != PG_U); 3793 KASSERT(npte == 0 || (opte & PG_V) != 0); 3794 } while (pmap_pte_cas(ptep, opte, npte) != opte); 3795 3796 if (need_shootdown) { 3797 pmap_tlb_shootdown(pmap, va, opte, TLBSHOOT_SYNC_PV2); 3798 } 3799 pmap_unmap_pte(); 3800 3801 *optep = opte; 3802 return 0; 3803 } 3804 3805 static void 3806 pmap_pp_remove(struct pmap_page *pp, paddr_t pa) 3807 { 3808 struct pv_pte *pvpte; 3809 struct pv_entry *killlist = NULL; 3810 struct vm_page *ptp; 3811 pt_entry_t expect; 3812 int count; 3813 3814 expect = pmap_pa2pte(pa) | PG_V; 3815 count = SPINLOCK_BACKOFF_MIN; 3816 kpreempt_disable(); 3817 startover: 3818 while ((pvpte = pv_pte_first(pp)) != NULL) { 3819 struct pmap *pmap; 3820 struct pv_entry *pve; 3821 pt_entry_t opte; 3822 vaddr_t va; 3823 int error; 3824 3825 /* 3826 * add a reference to the pmap before clearing the pte. 3827 * otherwise the pmap can disappear behind us. 
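 *
 * The reference is only taken for user pmaps (ptp != NULL); for a
 * kernel mapping ptp_to_pmap() yields pmap_kernel(), which is never
 * destroyed and so needs no extra hold.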
3828 */ 3829 3830 ptp = pvpte->pte_ptp; 3831 pmap = ptp_to_pmap(ptp); 3832 if (ptp != NULL) { 3833 pmap_reference(pmap); 3834 } 3835 3836 error = pmap_sync_pv(pvpte, expect, ~0, &opte); 3837 if (error == EAGAIN) { 3838 int hold_count; 3839 KERNEL_UNLOCK_ALL(curlwp, &hold_count); 3840 if (ptp != NULL) { 3841 pmap_destroy(pmap); 3842 } 3843 SPINLOCK_BACKOFF(count); 3844 KERNEL_LOCK(hold_count, curlwp); 3845 goto startover; 3846 } 3847 3848 pp->pp_attrs |= opte; 3849 va = pvpte->pte_va; 3850 pve = pmap_remove_pv(pp, ptp, va); 3851 3852 /* update the PTP reference count. free if last reference. */ 3853 if (ptp != NULL) { 3854 struct pmap *pmap2; 3855 pt_entry_t *ptes; 3856 pd_entry_t * const *pdes; 3857 3858 KASSERT(pmap != pmap_kernel()); 3859 3860 pmap_tlb_shootnow(); 3861 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); 3862 pmap_stats_update_bypte(pmap, 0, opte); 3863 ptp->wire_count--; 3864 if (ptp->wire_count <= 1) { 3865 pmap_free_ptp(pmap, ptp, va, ptes, pdes); 3866 } 3867 pmap_unmap_ptes(pmap, pmap2); 3868 pmap_destroy(pmap); 3869 } else { 3870 KASSERT(pmap == pmap_kernel()); 3871 pmap_stats_update_bypte(pmap, 0, opte); 3872 } 3873 3874 if (pve != NULL) { 3875 pve->pve_next = killlist; /* mark it for death */ 3876 killlist = pve; 3877 } 3878 } 3879 pmap_tlb_shootnow(); 3880 kpreempt_enable(); 3881 3882 /* Now free unused pvs. */ 3883 pmap_free_pvs(killlist); 3884 } 3885 3886 /* 3887 * pmap_page_remove: remove a managed vm_page from all pmaps that map it 3888 * 3889 * => R/M bits are sync'd back to attrs 3890 */ 3891 3892 void 3893 pmap_page_remove(struct vm_page *pg) 3894 { 3895 struct pmap_page *pp; 3896 paddr_t pa; 3897 3898 KASSERT(uvm_page_locked_p(pg)); 3899 3900 pp = VM_PAGE_TO_PP(pg); 3901 pa = VM_PAGE_TO_PHYS(pg); 3902 pmap_pp_remove(pp, pa); 3903 } 3904 3905 /* 3906 * pmap_pv_remove: remove an unmanaged pv-tracked page from all pmaps 3907 * that map it 3908 */ 3909 3910 void 3911 pmap_pv_remove(paddr_t pa) 3912 { 3913 struct pmap_page *pp; 3914 3915 pp = pmap_pv_tracked(pa); 3916 if (pp == NULL) 3917 panic("%s: page not pv-tracked: %#"PRIxPADDR, __func__, pa); 3918 pmap_pp_remove(pp, pa); 3919 } 3920 3921 /* 3922 * p m a p a t t r i b u t e f u n c t i o n s 3923 * functions that test/change managed page's attributes 3924 * since a page can be mapped multiple times we must check each PTE that 3925 * maps it by going down the pv lists. 3926 */ 3927 3928 /* 3929 * pmap_test_attrs: test a page's attributes 3930 */ 3931 3932 bool 3933 pmap_test_attrs(struct vm_page *pg, unsigned testbits) 3934 { 3935 struct pmap_page *pp; 3936 struct pv_pte *pvpte; 3937 pt_entry_t expect; 3938 u_int result; 3939 3940 KASSERT(uvm_page_locked_p(pg)); 3941 3942 pp = VM_PAGE_TO_PP(pg); 3943 if ((pp->pp_attrs & testbits) != 0) { 3944 return true; 3945 } 3946 expect = pmap_pa2pte(VM_PAGE_TO_PHYS(pg)) | PG_V; 3947 kpreempt_disable(); 3948 for (pvpte = pv_pte_first(pp); pvpte; pvpte = pv_pte_next(pp, pvpte)) { 3949 pt_entry_t opte; 3950 int error; 3951 3952 if ((pp->pp_attrs & testbits) != 0) { 3953 break; 3954 } 3955 error = pmap_sync_pv(pvpte, expect, 0, &opte); 3956 if (error == 0) { 3957 pp->pp_attrs |= opte; 3958 } 3959 } 3960 result = pp->pp_attrs & testbits; 3961 kpreempt_enable(); 3962 3963 /* 3964 * note that we will exit the for loop with a non-null pve if 3965 * we have found the bits we are testing for. 
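 *
 * (More precisely, with the current loop there is no pve variable:
 * the loop breaks as soon as pp_attrs has accumulated one of the
 * requested bits, and result is then taken from pp_attrs.)  Typical
 * callers are the VM system's reference/modification queries, along
 * the lines of this hypothetical sketch:
 *
 *	if (pmap_test_attrs(pg, PG_M))
 *		... the page is dirty and must be cleaned ...
 *	(void)pmap_clear_attrs(pg, PG_U);	restart reference tracking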
3966 */ 3967 3968 return result != 0; 3969 } 3970 3971 static bool 3972 pmap_pp_clear_attrs(struct pmap_page *pp, paddr_t pa, unsigned clearbits) 3973 { 3974 struct pv_pte *pvpte; 3975 u_int result; 3976 pt_entry_t expect; 3977 int count; 3978 3979 expect = pmap_pa2pte(pa) | PG_V; 3980 count = SPINLOCK_BACKOFF_MIN; 3981 kpreempt_disable(); 3982 startover: 3983 for (pvpte = pv_pte_first(pp); pvpte; pvpte = pv_pte_next(pp, pvpte)) { 3984 pt_entry_t opte; 3985 int error; 3986 3987 error = pmap_sync_pv(pvpte, expect, clearbits, &opte); 3988 if (error == EAGAIN) { 3989 int hold_count; 3990 KERNEL_UNLOCK_ALL(curlwp, &hold_count); 3991 SPINLOCK_BACKOFF(count); 3992 KERNEL_LOCK(hold_count, curlwp); 3993 goto startover; 3994 } 3995 pp->pp_attrs |= opte; 3996 } 3997 result = pp->pp_attrs & clearbits; 3998 pp->pp_attrs &= ~clearbits; 3999 pmap_tlb_shootnow(); 4000 kpreempt_enable(); 4001 4002 return result != 0; 4003 } 4004 4005 /* 4006 * pmap_clear_attrs: clear the specified attribute for a page. 4007 * 4008 * => we return true if we cleared one of the bits we were asked to 4009 */ 4010 4011 bool 4012 pmap_clear_attrs(struct vm_page *pg, unsigned clearbits) 4013 { 4014 struct pmap_page *pp; 4015 paddr_t pa; 4016 4017 KASSERT(uvm_page_locked_p(pg)); 4018 4019 pp = VM_PAGE_TO_PP(pg); 4020 pa = VM_PAGE_TO_PHYS(pg); 4021 4022 return pmap_pp_clear_attrs(pp, pa, clearbits); 4023 } 4024 4025 /* 4026 * pmap_pv_clear_attrs: clear the specified attributes for an unmanaged 4027 * pv-tracked page. 4028 */ 4029 4030 bool 4031 pmap_pv_clear_attrs(paddr_t pa, unsigned clearbits) 4032 { 4033 struct pmap_page *pp; 4034 4035 pp = pmap_pv_tracked(pa); 4036 if (pp == NULL) 4037 panic("%s: page not pv-tracked: %#"PRIxPADDR, __func__, pa); 4038 4039 return pmap_pp_clear_attrs(pp, pa, clearbits); 4040 } 4041 4042 /* 4043 * p m a p p r o t e c t i o n f u n c t i o n s 4044 */ 4045 4046 /* 4047 * pmap_page_protect: change the protection of all recorded mappings 4048 * of a managed page 4049 * 4050 * => NOTE: this is an inline function in pmap.h 4051 */ 4052 4053 /* see pmap.h */ 4054 4055 /* 4056 * pmap_pv_protect: change the protection of all recorded mappings 4057 * of an unmanaged pv-tracked page 4058 * 4059 * => NOTE: this is an inline function in pmap.h 4060 */ 4061 4062 /* see pmap.h */ 4063 4064 /* 4065 * pmap_protect: set the protection in of the pages in a pmap 4066 * 4067 * => NOTE: this is an inline function in pmap.h 4068 */ 4069 4070 /* see pmap.h */ 4071 4072 /* 4073 * pmap_write_protect: write-protect pages in a pmap. 4074 * 4075 * Note for Xen-amd64. Xen automatically adds PG_u to the kernel pages, but we 4076 * don't need to remove this bit when re-entering the PTEs here: Xen tracks the 4077 * kernel pages with a reserved bit (_PAGE_GUEST_KERNEL), so even if PG_u is 4078 * present the page will still be considered as a kernel page, and the privilege 4079 * separation will be enforced correctly. 4080 */ 4081 void 4082 pmap_write_protect(struct pmap *pmap, vaddr_t sva, vaddr_t eva, vm_prot_t prot) 4083 { 4084 pt_entry_t bit_rem, bit_put; 4085 pt_entry_t *ptes; 4086 pt_entry_t * const *pdes; 4087 struct pmap *pmap2; 4088 vaddr_t blockend, va; 4089 4090 KASSERT(curlwp->l_md.md_gc_pmap != pmap); 4091 4092 bit_rem = 0; 4093 if (!(prot & VM_PROT_WRITE)) 4094 bit_rem = PG_RW; 4095 4096 bit_put = 0; 4097 if (!(prot & VM_PROT_EXECUTE)) 4098 bit_put = pmap_pg_nx; 4099 4100 sva &= PG_FRAME; 4101 eva &= PG_FRAME; 4102 4103 /* Acquire pmap. 
	 */
	kpreempt_disable();
	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);

	for (va = sva ; va < eva; va = blockend) {
		pt_entry_t *spte, *epte;
		int i;

		blockend = x86_round_pdr(va + 1);
		if (blockend > eva)
			blockend = eva;

		/*
		 * Our PTE mappings should never be write-protected.
		 *
		 * XXXmaxv: still needed?
		 *
		 * A long term solution is to move the PTEs out of user address
		 * space, and into kernel address space. Then we can set
		 * VM_MAXUSER_ADDRESS to be VM_MAX_ADDRESS.
		 */
		for (i = 0; i < PDP_SIZE; i++) {
			if (pl_i(va, PTP_LEVELS) == PDIR_SLOT_PTE + i)
				panic("PTE space accessed");
		}

		/* Is it a valid block? */
		if (!pmap_pdes_valid(va, pdes, NULL)) {
			continue;
		}
		KASSERT(va < VM_MAXUSER_ADDRESS || va >= VM_MAX_ADDRESS);

		spte = &ptes[pl1_i(va)];
		epte = &ptes[pl1_i(blockend)];

		for (/* */; spte < epte; spte++) {
			pt_entry_t opte, npte;

			do {
				opte = *spte;
				if (!pmap_valid_entry(opte)) {
					goto next;
				}
				npte = (opte & ~bit_rem) | bit_put;
			} while (pmap_pte_cas(spte, opte, npte) != opte);

			if ((opte & PG_M) != 0) {
				vaddr_t tva = x86_ptob(spte - ptes);
				pmap_tlb_shootdown(pmap, tva, opte,
				    TLBSHOOT_WRITE_PROTECT);
			}
next:;
		}
	}

	/* Release pmap. */
	pmap_unmap_ptes(pmap, pmap2);
	kpreempt_enable();
}

/*
 * pmap_unwire: clear the wired bit in the PTE.
 *
 * => Mapping should already be present.
 */
void
pmap_unwire(struct pmap *pmap, vaddr_t va)
{
	pt_entry_t *ptes, *ptep, opte;
	pd_entry_t * const *pdes;
	struct pmap *pmap2;

	/* Acquire pmap. */
	kpreempt_disable();
	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);

	if (!pmap_pdes_valid(va, pdes, NULL)) {
		panic("%s: invalid PDE va=%#" PRIxVADDR, __func__, va);
	}

	ptep = &ptes[pl1_i(va)];
	opte = *ptep;
	KASSERT(pmap_valid_entry(opte));

	if (opte & PG_W) {
		pt_entry_t npte = opte & ~PG_W;

		opte = pmap_pte_testset(ptep, npte);
		pmap_stats_update_bypte(pmap, npte, opte);
	} else {
		printf("%s: wiring for pmap %p va %#" PRIxVADDR
		    " did not change!\n", __func__, pmap, va);
	}

	/* Release pmap. */
	pmap_unmap_ptes(pmap, pmap2);
	kpreempt_enable();
}

/*
 * pmap_copy: copy mappings from one pmap to another
 *
 * => optional function
 *	void pmap_copy(dst_pmap, src_pmap, dst_addr, len, src_addr)
 */

/*
 * defined as macro in pmap.h
 */

__strict_weak_alias(pmap_enter, pmap_enter_default);

int
pmap_enter_default(pmap_t pmap, vaddr_t va, paddr_t pa, vm_prot_t prot,
    u_int flags)
{
	return pmap_enter_ma(pmap, va, pa, pa, prot, flags, 0);
}

/*
 * pmap_enter: enter a mapping into a pmap
 *
 * => must be done "now" ...
no lazy-evaluation 4226 * => we set pmap => pv_head locking 4227 */ 4228 int 4229 pmap_enter_ma(struct pmap *pmap, vaddr_t va, paddr_t ma, paddr_t pa, 4230 vm_prot_t prot, u_int flags, int domid) 4231 { 4232 pt_entry_t *ptes, opte, npte; 4233 pt_entry_t *ptep; 4234 pd_entry_t * const *pdes; 4235 struct vm_page *ptp; 4236 struct vm_page *new_pg, *old_pg; 4237 struct pmap_page *new_pp, *old_pp; 4238 struct pv_entry *old_pve = NULL; 4239 struct pv_entry *new_pve; 4240 struct pv_entry *new_sparepve; 4241 int error; 4242 bool wired = (flags & PMAP_WIRED) != 0; 4243 struct pmap *pmap2; 4244 4245 KASSERT(pmap_initialized); 4246 KASSERT(curlwp->l_md.md_gc_pmap != pmap); 4247 KASSERT(va < VM_MAX_KERNEL_ADDRESS); 4248 KASSERTMSG(va != (vaddr_t)PDP_BASE, "%s: trying to map va=%#" 4249 PRIxVADDR " over PDP!", __func__, va); 4250 KASSERTMSG(va < VM_MIN_KERNEL_ADDRESS || 4251 pmap_valid_entry(pmap->pm_pdir[pl_i(va, PTP_LEVELS)]), 4252 "%s: missing kernel PTP for va=%#" PRIxVADDR, __func__, va); 4253 4254 #ifdef XEN 4255 KASSERT(domid == DOMID_SELF || pa == 0); 4256 #endif /* XEN */ 4257 4258 npte = ma | protection_codes[prot] | PG_V; 4259 npte |= pmap_pat_flags(flags); 4260 if (wired) 4261 npte |= PG_W; 4262 if (va < VM_MAXUSER_ADDRESS) 4263 npte |= PG_u; 4264 else if (va < VM_MAX_ADDRESS) 4265 panic("PTE space accessed"); /* XXXmaxv: no longer needed? */ 4266 4267 if (pmap == pmap_kernel()) 4268 npte |= pmap_pg_g; 4269 if (flags & VM_PROT_ALL) { 4270 npte |= PG_U; 4271 if (flags & VM_PROT_WRITE) { 4272 KASSERT((npte & PG_RW) != 0); 4273 npte |= PG_M; 4274 } 4275 } 4276 4277 #ifdef XEN 4278 if (domid != DOMID_SELF) 4279 new_pg = NULL; 4280 else 4281 #endif 4282 new_pg = PHYS_TO_VM_PAGE(pa); 4283 if (new_pg != NULL) { 4284 /* This is a managed page */ 4285 npte |= PG_PVLIST; 4286 new_pp = VM_PAGE_TO_PP(new_pg); 4287 } else if ((new_pp = pmap_pv_tracked(pa)) != NULL) { 4288 /* This is an unmanaged pv-tracked page */ 4289 npte |= PG_PVLIST; 4290 } else { 4291 new_pp = NULL; 4292 } 4293 4294 /* 4295 * Try to get pves now if we might need them. 4296 * Keep going even if we fail, since we will not actually need them 4297 * if we are just changing the permissions on an existing mapping, 4298 * but we won't know if that's the case until later. 4299 */ 4300 4301 bool needpves = pmap_pp_needs_pve(new_pp); 4302 if (needpves) { 4303 new_pve = pool_cache_get(&pmap_pv_cache, PR_NOWAIT); 4304 new_sparepve = pool_cache_get(&pmap_pv_cache, PR_NOWAIT); 4305 } else { 4306 new_pve = NULL; 4307 new_sparepve = NULL; 4308 } 4309 4310 kpreempt_disable(); 4311 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); /* locks pmap */ 4312 if (pmap == pmap_kernel()) { 4313 ptp = NULL; 4314 } else { 4315 ptp = pmap_get_ptp(pmap, va, pdes, flags); 4316 if (ptp == NULL) { 4317 pmap_unmap_ptes(pmap, pmap2); 4318 if (flags & PMAP_CANFAIL) { 4319 error = ENOMEM; 4320 goto out; 4321 } 4322 panic("%s: get ptp failed", __func__); 4323 } 4324 } 4325 4326 /* 4327 * Check if there is an existing mapping. If we are now sure that 4328 * we need pves and we failed to allocate them earlier, handle that. 4329 * Caching the value of oldpa here is safe because only the mod/ref bits 4330 * can change while the pmap is locked. 
4331 */ 4332 4333 ptep = &ptes[pl1_i(va)]; 4334 opte = *ptep; 4335 bool have_oldpa = pmap_valid_entry(opte); 4336 paddr_t oldpa = pmap_pte2pa(opte); 4337 4338 if (needpves && (!have_oldpa || oldpa != pa) && 4339 (new_pve == NULL || new_sparepve == NULL)) { 4340 pmap_unmap_ptes(pmap, pmap2); 4341 if (flags & PMAP_CANFAIL) { 4342 error = ENOMEM; 4343 goto out; 4344 } 4345 panic("%s: pve allocation failed", __func__); 4346 } 4347 4348 /* 4349 * update the pte. 4350 */ 4351 4352 do { 4353 opte = *ptep; 4354 4355 /* 4356 * if the same page, inherit PG_U and PG_M. 4357 */ 4358 if (((opte ^ npte) & (PG_FRAME | PG_V)) == 0) { 4359 npte |= opte & (PG_U | PG_M); 4360 } 4361 #if defined(XEN) 4362 if (domid != DOMID_SELF) { 4363 /* pmap_pte_cas with error handling */ 4364 int s = splvm(); 4365 if (opte != *ptep) { 4366 splx(s); 4367 continue; 4368 } 4369 error = xpq_update_foreign( 4370 vtomach((vaddr_t)ptep), npte, domid); 4371 splx(s); 4372 if (error) { 4373 if (ptp != NULL && ptp->wire_count <= 1) { 4374 pmap_free_ptp(pmap, ptp, va, ptes, pdes); 4375 } 4376 pmap_unmap_ptes(pmap, pmap2); 4377 goto out; 4378 } 4379 break; 4380 } 4381 #endif /* defined(XEN) */ 4382 } while (pmap_pte_cas(ptep, opte, npte) != opte); 4383 4384 /* 4385 * update statistics and PTP's reference count. 4386 */ 4387 4388 pmap_stats_update_bypte(pmap, npte, opte); 4389 if (ptp != NULL && !have_oldpa) { 4390 ptp->wire_count++; 4391 } 4392 KASSERT(ptp == NULL || ptp->wire_count > 1); 4393 4394 /* 4395 * if the same page, we can skip pv_entry handling. 4396 */ 4397 4398 if (((opte ^ npte) & (PG_FRAME | PG_V)) == 0) { 4399 KASSERT(((opte ^ npte) & PG_PVLIST) == 0); 4400 goto same_pa; 4401 } 4402 4403 /* 4404 * if old page is pv-tracked, remove pv_entry from its list. 4405 */ 4406 4407 if ((~opte & (PG_V | PG_PVLIST)) == 0) { 4408 if ((old_pg = PHYS_TO_VM_PAGE(oldpa)) != NULL) { 4409 KASSERT(uvm_page_locked_p(old_pg)); 4410 old_pp = VM_PAGE_TO_PP(old_pg); 4411 } else if ((old_pp = pmap_pv_tracked(oldpa)) == NULL) { 4412 panic("%s: PG_PVLIST with pv-untracked page" 4413 " va = %#"PRIxVADDR 4414 " pa = %#" PRIxPADDR " (%#" PRIxPADDR ")", 4415 __func__, va, oldpa, atop(pa)); 4416 } 4417 4418 old_pve = pmap_remove_pv(old_pp, ptp, va); 4419 old_pp->pp_attrs |= opte; 4420 } 4421 4422 /* 4423 * if new page is pv-tracked, insert pv_entry into its list. 4424 */ 4425 4426 if (new_pp) { 4427 new_pve = pmap_enter_pv(new_pp, new_pve, &new_sparepve, ptp, va); 4428 } 4429 4430 same_pa: 4431 pmap_unmap_ptes(pmap, pmap2); 4432 4433 /* 4434 * shootdown tlb if necessary. 4435 */ 4436 4437 if ((~opte & (PG_V | PG_U)) == 0 && 4438 ((opte ^ npte) & (PG_FRAME | PG_RW)) != 0) { 4439 pmap_tlb_shootdown(pmap, va, opte, TLBSHOOT_ENTER); 4440 } 4441 4442 error = 0; 4443 out: 4444 kpreempt_enable(); 4445 if (old_pve != NULL) { 4446 pool_cache_put(&pmap_pv_cache, old_pve); 4447 } 4448 if (new_pve != NULL) { 4449 pool_cache_put(&pmap_pv_cache, new_pve); 4450 } 4451 if (new_sparepve != NULL) { 4452 pool_cache_put(&pmap_pv_cache, new_sparepve); 4453 } 4454 4455 return error; 4456 } 4457 4458 static paddr_t 4459 pmap_get_physpage(void) 4460 { 4461 struct vm_page *ptp; 4462 struct pmap *kpm = pmap_kernel(); 4463 paddr_t pa; 4464 4465 if (!uvm.page_init_done) { 4466 /* 4467 * We're growing the kernel pmap early (from 4468 * uvm_pageboot_alloc()). This case must be 4469 * handled a little differently. 
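 * (The uvm page structures are not available yet at this point, so the
 * page is taken with uvm_page_physget() and zeroed through the direct
 * map, xen_pagezero(), or the temporary early_zero_pte mapping below.)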
4470 */ 4471 4472 if (!uvm_page_physget(&pa)) 4473 panic("%s: out of memory", __func__); 4474 #if defined(__HAVE_DIRECT_MAP) 4475 pagezero(PMAP_DIRECT_MAP(pa)); 4476 #else 4477 #if defined(XEN) 4478 if (XEN_VERSION_SUPPORTED(3, 4)) { 4479 xen_pagezero(pa); 4480 return pa; 4481 } 4482 #endif 4483 kpreempt_disable(); 4484 pmap_pte_set(early_zero_pte, pmap_pa2pte(pa) | PG_V | 4485 PG_RW | pmap_pg_nx); 4486 pmap_pte_flush(); 4487 pmap_update_pg((vaddr_t)early_zerop); 4488 memset(early_zerop, 0, PAGE_SIZE); 4489 #if defined(DIAGNOSTIC) || defined(XEN) 4490 pmap_pte_set(early_zero_pte, 0); 4491 pmap_pte_flush(); 4492 #endif /* defined(DIAGNOSTIC) */ 4493 kpreempt_enable(); 4494 #endif /* defined(__HAVE_DIRECT_MAP) */ 4495 } else { 4496 /* XXX */ 4497 ptp = uvm_pagealloc(NULL, 0, NULL, 4498 UVM_PGA_USERESERVE|UVM_PGA_ZERO); 4499 if (ptp == NULL) 4500 panic("%s: out of memory", __func__); 4501 ptp->flags &= ~PG_BUSY; 4502 ptp->wire_count = 1; 4503 pa = VM_PAGE_TO_PHYS(ptp); 4504 } 4505 pmap_stats_update(kpm, 1, 0); 4506 4507 return pa; 4508 } 4509 4510 /* 4511 * Expand the page tree with the specified amount of PTPs, mapping virtual 4512 * addresses starting at kva. We populate all the levels but the last one 4513 * (L1). The nodes of the tree are created as RWX, but the pages covered 4514 * will be kentered in L1, with proper permissions. 4515 * 4516 * Used only by pmap_growkernel. 4517 */ 4518 static void 4519 pmap_alloc_level(struct pmap *cpm, vaddr_t kva, long *needed_ptps) 4520 { 4521 unsigned long i; 4522 paddr_t pa; 4523 unsigned long index, endindex; 4524 int level; 4525 pd_entry_t *pdep; 4526 #ifdef XEN 4527 int s = splvm(); /* protect xpq_* */ 4528 #endif 4529 4530 for (level = PTP_LEVELS; level > 1; level--) { 4531 if (level == PTP_LEVELS) 4532 pdep = cpm->pm_pdir; 4533 else 4534 pdep = normal_pdes[level - 2]; 4535 index = pl_i_roundup(kva, level); 4536 endindex = index + needed_ptps[level - 1] - 1; 4537 4538 for (i = index; i <= endindex; i++) { 4539 pt_entry_t pte; 4540 4541 KASSERT(!pmap_valid_entry(pdep[i])); 4542 pa = pmap_get_physpage(); 4543 pte = pmap_pa2pte(pa) | PG_V | PG_RW; 4544 pmap_pte_set(&pdep[i], pte); 4545 4546 #if defined(XEN) && (defined(PAE) || defined(__x86_64__)) 4547 if (level == PTP_LEVELS && i >= PDIR_SLOT_KERN) { 4548 if (__predict_true( 4549 cpu_info_primary.ci_flags & CPUF_PRESENT)) { 4550 /* update per-cpu PMDs on all cpus */ 4551 xen_kpm_sync(pmap_kernel(), i); 4552 } else { 4553 /* 4554 * too early; update primary CPU 4555 * PMD only (without locks) 4556 */ 4557 #ifdef PAE 4558 pd_entry_t *cpu_pdep = 4559 &cpu_info_primary.ci_kpm_pdir[l2tol2(i)]; 4560 #endif 4561 #ifdef __x86_64__ 4562 pd_entry_t *cpu_pdep = 4563 &cpu_info_primary.ci_kpm_pdir[i]; 4564 #endif 4565 pmap_pte_set(cpu_pdep, pte); 4566 } 4567 } 4568 #endif /* XEN && (PAE || __x86_64__) */ 4569 4570 KASSERT(level != PTP_LEVELS || nkptp[level - 1] + 4571 pl_i(VM_MIN_KERNEL_ADDRESS, level) == i); 4572 nkptp[level - 1]++; 4573 } 4574 pmap_pte_flush(); 4575 } 4576 #ifdef XEN 4577 splx(s); 4578 #endif 4579 } 4580 4581 /* 4582 * pmap_growkernel: increase usage of KVM space. 4583 * 4584 * => we allocate new PTPs for the kernel and install them in all 4585 * the pmaps on the system. 
4586 */ 4587 4588 vaddr_t 4589 pmap_growkernel(vaddr_t maxkvaddr) 4590 { 4591 struct pmap *kpm = pmap_kernel(); 4592 struct pmap *cpm; 4593 #if !defined(XEN) || !defined(__x86_64__) 4594 struct pmap *pm; 4595 long old; 4596 #endif 4597 int s, i; 4598 long needed_kptp[PTP_LEVELS], target_nptp; 4599 bool invalidate = false; 4600 4601 s = splvm(); /* to be safe */ 4602 mutex_enter(kpm->pm_lock); 4603 4604 if (maxkvaddr <= pmap_maxkvaddr) { 4605 mutex_exit(kpm->pm_lock); 4606 splx(s); 4607 return pmap_maxkvaddr; 4608 } 4609 4610 maxkvaddr = x86_round_pdr(maxkvaddr); 4611 #if !defined(XEN) || !defined(__x86_64__) 4612 old = nkptp[PTP_LEVELS - 1]; 4613 #endif 4614 4615 /* Initialize needed_kptp. */ 4616 for (i = PTP_LEVELS - 1; i >= 1; i--) { 4617 target_nptp = pl_i_roundup(maxkvaddr, i + 1) - 4618 pl_i_roundup(VM_MIN_KERNEL_ADDRESS, i + 1); 4619 4620 if (target_nptp > nkptpmax[i]) 4621 panic("out of KVA space"); 4622 KASSERT(target_nptp >= nkptp[i]); 4623 needed_kptp[i] = target_nptp - nkptp[i]; 4624 } 4625 4626 #if defined(XEN) && (defined(__x86_64__) || defined(PAE)) 4627 /* only pmap_kernel(), or the per-cpu map, has kernel entries */ 4628 cpm = kpm; 4629 #else 4630 /* Get the current pmap */ 4631 if (__predict_true(cpu_info_primary.ci_flags & CPUF_PRESENT)) { 4632 cpm = curcpu()->ci_pmap; 4633 } else { 4634 cpm = kpm; 4635 } 4636 #endif 4637 4638 pmap_alloc_level(cpm, pmap_maxkvaddr, needed_kptp); 4639 4640 /* 4641 * If the number of top level entries changed, update all pmaps. 4642 */ 4643 if (needed_kptp[PTP_LEVELS - 1] != 0) { 4644 #ifdef XEN 4645 #ifdef __x86_64__ 4646 /* nothing, kernel entries are never entered in user pmap */ 4647 #else /* __x86_64__ */ 4648 int pdkidx; 4649 #ifndef PAE 4650 /* 4651 * for PAE this is not needed, because pmap_alloc_level() 4652 * already did update the per-CPU tables 4653 */ 4654 if (cpm != kpm) { 4655 for (pdkidx = PDIR_SLOT_KERN + old; 4656 pdkidx < PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1]; 4657 pdkidx++) { 4658 pmap_pte_set(&kpm->pm_pdir[pdkidx], 4659 cpm->pm_pdir[pdkidx]); 4660 } 4661 pmap_pte_flush(); 4662 } 4663 #endif /* !PAE */ 4664 4665 mutex_enter(&pmaps_lock); 4666 LIST_FOREACH(pm, &pmaps, pm_list) { 4667 for (pdkidx = PDIR_SLOT_KERN + old; 4668 pdkidx < PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1]; 4669 pdkidx++) { 4670 pmap_pte_set(&pm->pm_pdir[pdkidx], 4671 kpm->pm_pdir[pdkidx]); 4672 } 4673 pmap_pte_flush(); 4674 } 4675 mutex_exit(&pmaps_lock); 4676 #endif /* __x86_64__ */ 4677 #else /* XEN */ 4678 size_t newpdes; 4679 newpdes = nkptp[PTP_LEVELS - 1] - old; 4680 if (cpm != kpm) { 4681 memcpy(&kpm->pm_pdir[PDIR_SLOT_KERN + old], 4682 &cpm->pm_pdir[PDIR_SLOT_KERN + old], 4683 newpdes * sizeof(pd_entry_t)); 4684 } 4685 4686 mutex_enter(&pmaps_lock); 4687 LIST_FOREACH(pm, &pmaps, pm_list) { 4688 memcpy(&pm->pm_pdir[PDIR_SLOT_KERN + old], 4689 &kpm->pm_pdir[PDIR_SLOT_KERN + old], 4690 newpdes * sizeof (pd_entry_t)); 4691 } 4692 mutex_exit(&pmaps_lock); 4693 #endif 4694 invalidate = true; 4695 } 4696 pmap_maxkvaddr = maxkvaddr; 4697 mutex_exit(kpm->pm_lock); 4698 splx(s); 4699 4700 if (invalidate && pmap_initialized) { 4701 /* Invalidate the PDP cache. 
*/ 4702 pool_cache_invalidate(&pmap_pdp_cache); 4703 } 4704 4705 return maxkvaddr; 4706 } 4707 4708 #ifdef DEBUG 4709 void pmap_dump(struct pmap *, vaddr_t, vaddr_t); 4710 4711 /* 4712 * pmap_dump: dump all the mappings from a pmap 4713 * 4714 * => caller should not be holding any pmap locks 4715 */ 4716 4717 void 4718 pmap_dump(struct pmap *pmap, vaddr_t sva, vaddr_t eva) 4719 { 4720 pt_entry_t *ptes, *pte; 4721 pd_entry_t * const *pdes; 4722 struct pmap *pmap2; 4723 vaddr_t blkendva; 4724 4725 /* 4726 * if end is out of range truncate. 4727 * if (end == start) update to max. 4728 */ 4729 4730 if (eva > VM_MAXUSER_ADDRESS || eva <= sva) 4731 eva = VM_MAXUSER_ADDRESS; 4732 4733 /* 4734 * we lock in the pmap => pv_head direction 4735 */ 4736 4737 kpreempt_disable(); 4738 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); /* locks pmap */ 4739 4740 /* 4741 * dumping a range of pages: we dump in PTP sized blocks (4MB) 4742 */ 4743 4744 for (/* null */ ; sva < eva ; sva = blkendva) { 4745 4746 /* determine range of block */ 4747 blkendva = x86_round_pdr(sva+1); 4748 if (blkendva > eva) 4749 blkendva = eva; 4750 4751 /* valid block? */ 4752 if (!pmap_pdes_valid(sva, pdes, NULL)) 4753 continue; 4754 4755 pte = &ptes[pl1_i(sva)]; 4756 for (/* null */; sva < blkendva ; sva += PAGE_SIZE, pte++) { 4757 if (!pmap_valid_entry(*pte)) 4758 continue; 4759 printf("va %#" PRIxVADDR " -> pa %#" PRIxPADDR 4760 " (pte=%#" PRIxPADDR ")\n", 4761 sva, (paddr_t)pmap_pte2pa(*pte), (paddr_t)*pte); 4762 } 4763 } 4764 pmap_unmap_ptes(pmap, pmap2); 4765 kpreempt_enable(); 4766 } 4767 #endif 4768 4769 /* 4770 * pmap_update: process deferred invalidations and frees. 4771 */ 4772 4773 void 4774 pmap_update(struct pmap *pmap) 4775 { 4776 struct vm_page *empty_ptps; 4777 lwp_t *l = curlwp; 4778 4779 /* 4780 * If we have torn down this pmap, invalidate non-global TLB 4781 * entries on any processors using it. 4782 */ 4783 kpreempt_disable(); 4784 if (__predict_false(l->l_md.md_gc_pmap == pmap)) { 4785 l->l_md.md_gc_pmap = NULL; 4786 pmap_tlb_shootdown(pmap, (vaddr_t)-1LL, 0, TLBSHOOT_UPDATE); 4787 } 4788 4789 /* 4790 * Initiate any pending TLB shootdowns. Wait for them to 4791 * complete before returning control to the caller. 4792 */ 4793 pmap_tlb_shootnow(); 4794 kpreempt_enable(); 4795 4796 /* 4797 * Now that shootdowns are complete, process deferred frees, 4798 * but not from interrupt context. 
4799 */ 4800 if (l->l_md.md_gc_ptp != NULL) { 4801 KASSERT((l->l_pflag & LP_INTR) == 0); 4802 if (cpu_intr_p()) { 4803 return; 4804 } 4805 empty_ptps = l->l_md.md_gc_ptp; 4806 l->l_md.md_gc_ptp = NULL; 4807 pmap_free_ptps(empty_ptps); 4808 } 4809 } 4810 4811 #if PTP_LEVELS > 4 4812 #error "Unsupported number of page table mappings" 4813 #endif 4814 4815 paddr_t 4816 pmap_init_tmp_pgtbl(paddr_t pg) 4817 { 4818 static bool maps_loaded; 4819 static const paddr_t x86_tmp_pml_paddr[] = { 4820 4 * PAGE_SIZE, /* L1 */ 4821 5 * PAGE_SIZE, /* L2 */ 4822 6 * PAGE_SIZE, /* L3 */ 4823 7 * PAGE_SIZE /* L4 */ 4824 }; 4825 static vaddr_t x86_tmp_pml_vaddr[] = { 0, 0, 0, 0 }; 4826 4827 pd_entry_t *tmp_pml, *kernel_pml; 4828 4829 int level; 4830 4831 if (!maps_loaded) { 4832 for (level = 0; level < PTP_LEVELS; ++level) { 4833 x86_tmp_pml_vaddr[level] = 4834 uvm_km_alloc(kernel_map, PAGE_SIZE, 0, 4835 UVM_KMF_VAONLY); 4836 4837 if (x86_tmp_pml_vaddr[level] == 0) 4838 panic("mapping of real mode PML failed\n"); 4839 pmap_kenter_pa(x86_tmp_pml_vaddr[level], 4840 x86_tmp_pml_paddr[level], 4841 VM_PROT_READ | VM_PROT_WRITE, 0); 4842 } 4843 pmap_update(pmap_kernel()); 4844 maps_loaded = true; 4845 } 4846 4847 /* Zero levels 1-3 */ 4848 for (level = 0; level < PTP_LEVELS - 1; ++level) { 4849 tmp_pml = (void *)x86_tmp_pml_vaddr[level]; 4850 memset(tmp_pml, 0, PAGE_SIZE); 4851 } 4852 4853 /* Copy PML4 */ 4854 kernel_pml = pmap_kernel()->pm_pdir; 4855 tmp_pml = (void *)x86_tmp_pml_vaddr[PTP_LEVELS - 1]; 4856 memcpy(tmp_pml, kernel_pml, PAGE_SIZE); 4857 4858 #ifdef PAE 4859 /* 4860 * Use the last 4 entries of the L2 page as L3 PD entries. These 4861 * last entries are unlikely to be used for temporary mappings. 4862 * 508: maps 0->1GB (userland) 4863 * 509: unused 4864 * 510: unused 4865 * 511: maps 3->4GB (kernel) 4866 */ 4867 tmp_pml[508] = x86_tmp_pml_paddr[PTP_LEVELS - 1] | PG_V; 4868 tmp_pml[509] = 0; 4869 tmp_pml[510] = 0; 4870 tmp_pml[511] = pmap_pdirpa(pmap_kernel(), PDIR_SLOT_KERN) | PG_V; 4871 #endif 4872 4873 for (level = PTP_LEVELS - 1; level > 0; --level) { 4874 tmp_pml = (void *)x86_tmp_pml_vaddr[level]; 4875 4876 tmp_pml[pl_i(pg, level + 1)] = 4877 (x86_tmp_pml_paddr[level - 1] & PG_FRAME) | PG_RW | PG_V; 4878 } 4879 4880 tmp_pml = (void *)x86_tmp_pml_vaddr[0]; 4881 tmp_pml[pl_i(pg, 1)] = (pg & PG_FRAME) | PG_RW | PG_V; 4882 4883 #ifdef PAE 4884 /* Return the PA of the L3 page (entry 508 of the L2 page) */ 4885 return x86_tmp_pml_paddr[PTP_LEVELS - 1] + 508 * sizeof(pd_entry_t); 4886 #endif 4887 4888 return x86_tmp_pml_paddr[PTP_LEVELS - 1]; 4889 } 4890 4891 u_int 4892 x86_mmap_flags(paddr_t mdpgno) 4893 { 4894 u_int nflag = (mdpgno >> X86_MMAP_FLAG_SHIFT) & X86_MMAP_FLAG_MASK; 4895 u_int pflag = 0; 4896 4897 if (nflag & X86_MMAP_FLAG_PREFETCH) 4898 pflag |= PMAP_WRITE_COMBINE; 4899 4900 return pflag; 4901 } 4902