1 /* $NetBSD: pmap.c,v 1.267 2017/11/22 21:26:01 christos Exp $ */ 2 3 /* 4 * Copyright (c) 2008, 2010, 2016, 2017 The NetBSD Foundation, Inc. 5 * All rights reserved. 6 * 7 * This code is derived from software contributed to The NetBSD Foundation 8 * by Andrew Doran, and by Maxime Villard. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 29 * POSSIBILITY OF SUCH DAMAGE. 30 */ 31 32 /* 33 * Copyright (c) 2007 Manuel Bouyer. 34 * 35 * Redistribution and use in source and binary forms, with or without 36 * modification, are permitted provided that the following conditions 37 * are met: 38 * 1. Redistributions of source code must retain the above copyright 39 * notice, this list of conditions and the following disclaimer. 40 * 2. Redistributions in binary form must reproduce the above copyright 41 * notice, this list of conditions and the following disclaimer in the 42 * documentation and/or other materials provided with the distribution. 43 * 44 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 45 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 46 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 47 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 48 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 49 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 50 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 51 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 52 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 53 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 54 */ 55 56 /* 57 * Copyright (c) 2006 Mathieu Ropert <mro@adviseo.fr> 58 * 59 * Permission to use, copy, modify, and distribute this software for any 60 * purpose with or without fee is hereby granted, provided that the above 61 * copyright notice and this permission notice appear in all copies. 62 * 63 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 64 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 65 * MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 66 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 67 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 68 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 69 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 70 */ 71 72 /* 73 * Copyright (c) 1997 Charles D. Cranor and Washington University. 74 * All rights reserved. 75 * 76 * Redistribution and use in source and binary forms, with or without 77 * modification, are permitted provided that the following conditions 78 * are met: 79 * 1. Redistributions of source code must retain the above copyright 80 * notice, this list of conditions and the following disclaimer. 81 * 2. Redistributions in binary form must reproduce the above copyright 82 * notice, this list of conditions and the following disclaimer in the 83 * documentation and/or other materials provided with the distribution. 84 * 85 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 86 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 87 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 88 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 89 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 90 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 91 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 92 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 93 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 94 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 95 */ 96 97 /* 98 * Copyright 2001 (c) Wasabi Systems, Inc. 99 * All rights reserved. 100 * 101 * Written by Frank van der Linden for Wasabi Systems, Inc. 102 * 103 * Redistribution and use in source and binary forms, with or without 104 * modification, are permitted provided that the following conditions 105 * are met: 106 * 1. Redistributions of source code must retain the above copyright 107 * notice, this list of conditions and the following disclaimer. 108 * 2. Redistributions in binary form must reproduce the above copyright 109 * notice, this list of conditions and the following disclaimer in the 110 * documentation and/or other materials provided with the distribution. 111 * 3. All advertising materials mentioning features or use of this software 112 * must display the following acknowledgement: 113 * This product includes software developed for the NetBSD Project by 114 * Wasabi Systems, Inc. 115 * 4. The name of Wasabi Systems, Inc. may not be used to endorse 116 * or promote products derived from this software without specific prior 117 * written permission. 118 * 119 * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND 120 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 121 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 122 * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL WASABI SYSTEMS, INC 123 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 124 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 125 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 126 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 127 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 128 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 129 * POSSIBILITY OF SUCH DAMAGE. 130 */ 131 132 /* 133 * This is the i386 pmap modified and generalized to support x86-64 134 * as well. The idea is to hide the upper N levels of the page tables 135 * inside pmap_get_ptp, pmap_free_ptp and pmap_growkernel. The rest 136 * is mostly untouched, except that it uses some more generalized 137 * macros and interfaces. 138 * 139 * This pmap has been tested on the i386 as well, and it can be easily 140 * adapted to PAE. 141 * 142 * fvdl@wasabisystems.com 18-Jun-2001 143 */ 144 145 /* 146 * pmap.c: i386 pmap module rewrite 147 * Chuck Cranor <chuck@netbsd> 148 * 11-Aug-97 149 * 150 * history of this pmap module: in addition to my own input, i used 151 * the following references for this rewrite of the i386 pmap: 152 * 153 * [1] the NetBSD i386 pmap. this pmap appears to be based on the 154 * BSD hp300 pmap done by Mike Hibler at University of Utah. 155 * it was then ported to the i386 by William Jolitz of UUNET 156 * Technologies, Inc. Then Charles M. Hannum of the NetBSD 157 * project fixed some bugs and provided some speed ups. 158 * 159 * [2] the FreeBSD i386 pmap. this pmap seems to be the 160 * Hibler/Jolitz pmap, as modified for FreeBSD by John S. Dyson 161 * and David Greenman. 162 * 163 * [3] the Mach pmap. this pmap, from CMU, seems to have migrated 164 * between several processors. the VAX version was done by 165 * Avadis Tevanian, Jr., and Michael Wayne Young. the i386 166 * version was done by Lance Berc, Mike Kupfer, Bob Baron, 167 * David Golub, and Richard Draves. the alpha version was 168 * done by Alessandro Forin (CMU/Mach) and Chris Demetriou 169 * (NetBSD/alpha). 170 */ 171 172 #include <sys/cdefs.h> 173 __KERNEL_RCSID(0, "$NetBSD: pmap.c,v 1.267 2017/11/22 21:26:01 christos Exp $"); 174 175 #include "opt_user_ldt.h" 176 #include "opt_lockdebug.h" 177 #include "opt_multiprocessor.h" 178 #include "opt_xen.h" 179 180 #include <sys/param.h> 181 #include <sys/systm.h> 182 #include <sys/proc.h> 183 #include <sys/pool.h> 184 #include <sys/kernel.h> 185 #include <sys/atomic.h> 186 #include <sys/cpu.h> 187 #include <sys/intr.h> 188 #include <sys/xcall.h> 189 #include <sys/kcore.h> 190 191 #include <uvm/uvm.h> 192 #include <uvm/pmap/pmap_pvt.h> 193 194 #include <dev/isa/isareg.h> 195 196 #include <machine/specialreg.h> 197 #include <machine/gdt.h> 198 #include <machine/isa_machdep.h> 199 #include <machine/cpuvar.h> 200 #include <machine/cputypes.h> 201 202 #include <x86/pmap.h> 203 #include <x86/pmap_pv.h> 204 205 #include <x86/i82489reg.h> 206 #include <x86/i82489var.h> 207 208 #ifdef XEN 209 #include <xen/xen-public/xen.h> 210 #include <xen/hypervisor.h> 211 #endif 212 213 /* 214 * general info: 215 * 216 * - for an explanation of how the i386 MMU hardware works see 217 * the comments in <machine/pte.h>. 218 * 219 * - for an explanation of the general memory structure used by 220 * this pmap (including the recursive mapping), see the comments 221 * in <machine/pmap.h>. 222 * 223 * this file contains the code for the "pmap module." 
the module's 224 * job is to manage the hardware's virtual to physical address mappings. 225 * note that there are two levels of mapping in the VM system: 226 * 227 * [1] the upper layer of the VM system uses vm_map's and vm_map_entry's 228 * to map ranges of virtual address space to objects/files. for 229 * example, the vm_map may say: "map VA 0x1000 to 0x22000 read-only 230 * to the file /bin/ls starting at offset zero." note that 231 * the upper layer mapping is not concerned with how individual 232 * vm_pages are mapped. 233 * 234 * [2] the lower layer of the VM system (the pmap) maintains the mappings 235 * from virtual addresses. it is concerned with which vm_page is 236 * mapped where. for example, when you run /bin/ls and start 237 * at page 0x1000 the fault routine may lookup the correct page 238 * of the /bin/ls file and then ask the pmap layer to establish 239 * a mapping for it. 240 * 241 * note that information in the lower layer of the VM system can be 242 * thrown away since it can easily be reconstructed from the info 243 * in the upper layer. 244 * 245 * data structures we use include: 246 * 247 * - struct pmap: describes the address space of one thread 248 * - struct pmap_page: describes one pv-tracked page, without 249 * necessarily a corresponding vm_page 250 * - struct pv_entry: describes one <PMAP,VA> mapping of a PA 251 * - struct pv_head: there is one pv_head per pv-tracked page of 252 * physical memory. the pv_head points to a list of pv_entry 253 * structures which describe all the <PMAP,VA> pairs that this 254 * page is mapped in. this is critical for page based operations 255 * such as pmap_page_protect() [change protection on _all_ mappings 256 * of a page] 257 */ 258 259 /* 260 * memory allocation 261 * 262 * - there are three data structures that we must dynamically allocate: 263 * 264 * [A] new process' page directory page (PDP) 265 * - plan 1: done at pmap_create() we use 266 * uvm_km_alloc(kernel_map, PAGE_SIZE) [fka kmem_alloc] to do this 267 * allocation. 268 * 269 * if we are low in free physical memory then we sleep in 270 * uvm_km_alloc -- in this case this is ok since we are creating 271 * a new pmap and should not be holding any locks. 272 * 273 * if the kernel is totally out of virtual space 274 * (i.e. uvm_km_alloc returns NULL), then we panic. 275 * 276 * [B] new page tables pages (PTP) 277 * - call uvm_pagealloc() 278 * => success: zero page, add to pm_pdir 279 * => failure: we are out of free vm_pages, let pmap_enter() 280 * tell UVM about it. 281 * 282 * note: for kernel PTPs, we start with NKPTP of them. as we map 283 * kernel memory (at uvm_map time) we check to see if we've grown 284 * the kernel pmap. if so, we call the optional function 285 * pmap_growkernel() to grow the kernel PTPs in advance. 286 * 287 * [C] pv_entry structures 288 */ 289 290 /* 291 * locking 292 * 293 * we have the following locks that we must contend with: 294 * 295 * mutexes: 296 * 297 * - pmap lock (per pmap, part of uvm_object) 298 * this lock protects the fields in the pmap structure including 299 * the non-kernel PDEs in the PDP, and the PTEs. it also locks 300 * in the alternate PTE space (since that is determined by the 301 * entry in the PDP). 302 * 303 * - pvh_lock (per pv_head) 304 * this lock protects the pv_entry list which is chained off the 305 * pv_head structure for a specific pv-tracked PA. it is locked 306 * when traversing the list (e.g. adding/removing mappings, 307 * syncing R/M bits, etc.) 
308 * 309 * - pmaps_lock 310 * this lock protects the list of active pmaps (headed by "pmaps"). 311 * we lock it when adding or removing pmaps from this list. 312 */ 313 314 const vaddr_t ptp_masks[] = PTP_MASK_INITIALIZER; 315 const int ptp_shifts[] = PTP_SHIFT_INITIALIZER; 316 const long nkptpmax[] = NKPTPMAX_INITIALIZER; 317 const long nbpd[] = NBPD_INITIALIZER; 318 pd_entry_t * const normal_pdes[] = PDES_INITIALIZER; 319 320 long nkptp[] = NKPTP_INITIALIZER; 321 322 struct pmap_head pmaps; 323 kmutex_t pmaps_lock; 324 325 static vaddr_t pmap_maxkvaddr; 326 327 /* 328 * XXX kludge: dummy locking to make KASSERTs in uvm_page.c comfortable. 329 * actual locking is done by pm_lock. 330 */ 331 #if defined(DIAGNOSTIC) 332 #define PMAP_SUBOBJ_LOCK(pm, idx) \ 333 KASSERT(mutex_owned((pm)->pm_lock)); \ 334 if ((idx) != 0) \ 335 mutex_enter((pm)->pm_obj[(idx)].vmobjlock) 336 #define PMAP_SUBOBJ_UNLOCK(pm, idx) \ 337 KASSERT(mutex_owned((pm)->pm_lock)); \ 338 if ((idx) != 0) \ 339 mutex_exit((pm)->pm_obj[(idx)].vmobjlock) 340 #else /* defined(DIAGNOSTIC) */ 341 #define PMAP_SUBOBJ_LOCK(pm, idx) /* nothing */ 342 #define PMAP_SUBOBJ_UNLOCK(pm, idx) /* nothing */ 343 #endif /* defined(DIAGNOSTIC) */ 344 345 /* 346 * Misc. event counters. 347 */ 348 struct evcnt pmap_iobmp_evcnt; 349 struct evcnt pmap_ldt_evcnt; 350 351 /* 352 * PAT 353 */ 354 #define PATENTRY(n, type) (type << ((n) * 8)) 355 #define PAT_UC 0x0ULL 356 #define PAT_WC 0x1ULL 357 #define PAT_WT 0x4ULL 358 #define PAT_WP 0x5ULL 359 #define PAT_WB 0x6ULL 360 #define PAT_UCMINUS 0x7ULL 361 362 static bool cpu_pat_enabled __read_mostly = false; 363 364 /* 365 * Global data structures 366 */ 367 368 static struct pmap kernel_pmap_store; /* the kernel's pmap (proc0) */ 369 struct pmap *const kernel_pmap_ptr = &kernel_pmap_store; 370 371 struct bootspace bootspace __read_mostly; 372 373 /* 374 * pmap_pg_nx: if our processor supports PG_NX in the PTE then we 375 * set pmap_pg_nx to PG_NX (otherwise it is zero). 376 */ 377 pd_entry_t pmap_pg_nx __read_mostly = 0; 378 379 /* 380 * pmap_pg_g: if our processor supports PG_G in the PTE then we 381 * set pmap_pg_g to PG_G (otherwise it is zero). 382 */ 383 pd_entry_t pmap_pg_g __read_mostly = 0; 384 385 /* 386 * pmap_largepages: if our processor supports PG_PS and we are 387 * using it, this is set to true. 388 */ 389 int pmap_largepages __read_mostly = 0; 390 391 /* 392 * i386 physical memory comes in a big contig chunk with a small 393 * hole toward the front of it... the following two paddr_t's 394 * (shared with machdep.c) describe the physical address space 395 * of this machine. 
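 *
 * note: during early bootstrap, pmap_bootstrap_palloc() (defined later
 * in this file) hands out physical pages simply by bumping avail_start,
 * just as pmap_bootstrap_valloc() bumps virtual_avail for early KVA.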
396 */ 397 paddr_t lowmem_rsvd __read_mostly; 398 paddr_t avail_start __read_mostly; /* PA of first available physical page */ 399 paddr_t avail_end __read_mostly; /* PA of last available physical page */ 400 401 #ifdef XEN 402 paddr_t pmap_pa_start; /* PA of first physical page for this domain */ 403 paddr_t pmap_pa_end; /* PA of last physical page for this domain */ 404 #endif 405 406 #define VM_PAGE_TO_PP(pg) (&(pg)->mdpage.mp_pp) 407 408 #define PV_HASH_SIZE 32768 409 #define PV_HASH_LOCK_CNT 32 410 411 struct pv_hash_lock { 412 kmutex_t lock; 413 } __aligned(CACHE_LINE_SIZE) pv_hash_locks[PV_HASH_LOCK_CNT] 414 __aligned(CACHE_LINE_SIZE); 415 416 struct pv_hash_head { 417 SLIST_HEAD(, pv_entry) hh_list; 418 } pv_hash_heads[PV_HASH_SIZE]; 419 420 static u_int 421 pvhash_hash(struct vm_page *ptp, vaddr_t va) 422 { 423 424 return (uintptr_t)ptp / sizeof(*ptp) + (va >> PAGE_SHIFT); 425 } 426 427 static struct pv_hash_head * 428 pvhash_head(u_int hash) 429 { 430 431 return &pv_hash_heads[hash % PV_HASH_SIZE]; 432 } 433 434 static kmutex_t * 435 pvhash_lock(u_int hash) 436 { 437 438 return &pv_hash_locks[hash % PV_HASH_LOCK_CNT].lock; 439 } 440 441 static struct pv_entry * 442 pvhash_remove(struct pv_hash_head *hh, struct vm_page *ptp, vaddr_t va) 443 { 444 struct pv_entry *pve; 445 struct pv_entry *prev; 446 447 prev = NULL; 448 SLIST_FOREACH(pve, &hh->hh_list, pve_hash) { 449 if (pve->pve_pte.pte_ptp == ptp && 450 pve->pve_pte.pte_va == va) { 451 if (prev != NULL) { 452 SLIST_REMOVE_AFTER(prev, pve_hash); 453 } else { 454 SLIST_REMOVE_HEAD(&hh->hh_list, pve_hash); 455 } 456 break; 457 } 458 prev = pve; 459 } 460 return pve; 461 } 462 463 /* 464 * Other data structures 465 */ 466 467 static pt_entry_t protection_codes[8] __read_mostly; 468 469 static bool pmap_initialized __read_mostly = false; /* pmap_init done yet? */ 470 471 /* 472 * The following two vaddr_t's are used during system startup to keep track of 473 * how much of the kernel's VM space we have used. Once the system is started, 474 * the management of the remaining kernel VM space is turned over to the 475 * kernel_map vm_map. 476 */ 477 static vaddr_t virtual_avail __read_mostly; /* VA of first free KVA */ 478 static vaddr_t virtual_end __read_mostly; /* VA of last free KVA */ 479 480 #ifndef XEN 481 /* 482 * LAPIC virtual address, and fake physical address. 
483 */ 484 volatile vaddr_t local_apic_va __read_mostly; 485 paddr_t local_apic_pa __read_mostly; 486 #endif 487 488 /* 489 * pool that pmap structures are allocated from 490 */ 491 static struct pool_cache pmap_cache; 492 493 /* 494 * pv_entry cache 495 */ 496 static struct pool_cache pmap_pv_cache; 497 498 #ifndef __HAVE_DIRECT_MAP 499 /* 500 * Special VAs and the PTEs that map them 501 */ 502 static pt_entry_t *early_zero_pte; 503 static void pmap_vpage_cpualloc(struct cpu_info *); 504 #ifdef XEN 505 char *early_zerop; /* also referenced from xen_locore() */ 506 #else 507 static char *early_zerop; 508 #endif 509 #endif 510 511 int pmap_enter_default(pmap_t, vaddr_t, paddr_t, vm_prot_t, u_int); 512 513 /* PDP pool_cache(9) and its callbacks */ 514 struct pool_cache pmap_pdp_cache; 515 static int pmap_pdp_ctor(void *, void *, int); 516 static void pmap_pdp_dtor(void *, void *); 517 #ifdef PAE 518 /* need to allocate items of 4 pages */ 519 static void *pmap_pdp_alloc(struct pool *, int); 520 static void pmap_pdp_free(struct pool *, void *); 521 static struct pool_allocator pmap_pdp_allocator = { 522 .pa_alloc = pmap_pdp_alloc, 523 .pa_free = pmap_pdp_free, 524 .pa_pagesz = PAGE_SIZE * PDP_SIZE, 525 }; 526 #endif /* PAE */ 527 528 extern vaddr_t idt_vaddr; 529 extern paddr_t idt_paddr; 530 extern vaddr_t gdt_vaddr; 531 extern paddr_t gdt_paddr; 532 extern vaddr_t ldt_vaddr; 533 extern paddr_t ldt_paddr; 534 535 extern int end; 536 537 #ifdef i386 538 /* stuff to fix the pentium f00f bug */ 539 extern vaddr_t pentium_idt_vaddr; 540 #endif 541 542 /* 543 * Local prototypes 544 */ 545 546 #ifdef __HAVE_DIRECT_MAP 547 static void pmap_init_directmap(struct pmap *); 548 #endif 549 #ifndef XEN 550 static void pmap_init_lapic(void); 551 static void pmap_remap_global(void); 552 static void pmap_remap_largepages(void); 553 #endif 554 555 static struct vm_page *pmap_get_ptp(struct pmap *, vaddr_t, 556 pd_entry_t * const *, int); 557 static struct vm_page *pmap_find_ptp(struct pmap *, vaddr_t, paddr_t, int); 558 static void pmap_freepage(struct pmap *, struct vm_page *, int); 559 static void pmap_free_ptp(struct pmap *, struct vm_page *, vaddr_t, 560 pt_entry_t *, pd_entry_t * const *); 561 static bool pmap_remove_pte(struct pmap *, struct vm_page *, pt_entry_t *, 562 vaddr_t, struct pv_entry **); 563 static void pmap_remove_ptes(struct pmap *, struct vm_page *, vaddr_t, vaddr_t, 564 vaddr_t, struct pv_entry **); 565 566 static paddr_t pmap_get_physpage(void); 567 static void pmap_alloc_level(struct pmap *, vaddr_t, long *); 568 569 static bool pmap_reactivate(struct pmap *); 570 571 /* 572 * p m a p h e l p e r f u n c t i o n s 573 */ 574 575 static inline void 576 pmap_stats_update(struct pmap *pmap, int resid_diff, int wired_diff) 577 { 578 579 if (pmap == pmap_kernel()) { 580 atomic_add_long(&pmap->pm_stats.resident_count, resid_diff); 581 atomic_add_long(&pmap->pm_stats.wired_count, wired_diff); 582 } else { 583 KASSERT(mutex_owned(pmap->pm_lock)); 584 pmap->pm_stats.resident_count += resid_diff; 585 pmap->pm_stats.wired_count += wired_diff; 586 } 587 } 588 589 static inline void 590 pmap_stats_update_bypte(struct pmap *pmap, pt_entry_t npte, pt_entry_t opte) 591 { 592 int resid_diff = ((npte & PG_V) ? 1 : 0) - ((opte & PG_V) ? 1 : 0); 593 int wired_diff = ((npte & PG_W) ? 1 : 0) - ((opte & PG_W) ? 
1 : 0); 594 595 KASSERT((npte & (PG_V | PG_W)) != PG_W); 596 KASSERT((opte & (PG_V | PG_W)) != PG_W); 597 598 pmap_stats_update(pmap, resid_diff, wired_diff); 599 } 600 601 /* 602 * ptp_to_pmap: lookup pmap by ptp 603 */ 604 605 static struct pmap * 606 ptp_to_pmap(struct vm_page *ptp) 607 { 608 struct pmap *pmap; 609 610 if (ptp == NULL) { 611 return pmap_kernel(); 612 } 613 pmap = (struct pmap *)ptp->uobject; 614 KASSERT(pmap != NULL); 615 KASSERT(&pmap->pm_obj[0] == ptp->uobject); 616 return pmap; 617 } 618 619 static inline struct pv_pte * 620 pve_to_pvpte(struct pv_entry *pve) 621 { 622 623 KASSERT((void *)&pve->pve_pte == (void *)pve); 624 return &pve->pve_pte; 625 } 626 627 static inline struct pv_entry * 628 pvpte_to_pve(struct pv_pte *pvpte) 629 { 630 struct pv_entry *pve = (void *)pvpte; 631 632 KASSERT(pve_to_pvpte(pve) == pvpte); 633 return pve; 634 } 635 636 /* 637 * pv_pte_first, pv_pte_next: PV list iterator. 638 */ 639 640 static struct pv_pte * 641 pv_pte_first(struct pmap_page *pp) 642 { 643 644 if ((pp->pp_flags & PP_EMBEDDED) != 0) { 645 return &pp->pp_pte; 646 } 647 return pve_to_pvpte(LIST_FIRST(&pp->pp_head.pvh_list)); 648 } 649 650 static struct pv_pte * 651 pv_pte_next(struct pmap_page *pp, struct pv_pte *pvpte) 652 { 653 654 KASSERT(pvpte != NULL); 655 if (pvpte == &pp->pp_pte) { 656 KASSERT((pp->pp_flags & PP_EMBEDDED) != 0); 657 return NULL; 658 } 659 KASSERT((pp->pp_flags & PP_EMBEDDED) == 0); 660 return pve_to_pvpte(LIST_NEXT(pvpte_to_pve(pvpte), pve_list)); 661 } 662 663 /* 664 * pmap_is_curpmap: is this pmap the one currently loaded [in %cr3]? 665 * of course the kernel is always loaded 666 */ 667 668 bool 669 pmap_is_curpmap(struct pmap *pmap) 670 { 671 return((pmap == pmap_kernel()) || 672 (pmap == curcpu()->ci_pmap)); 673 } 674 675 /* 676 * Add a reference to the specified pmap. 677 */ 678 679 void 680 pmap_reference(struct pmap *pmap) 681 { 682 683 atomic_inc_uint(&pmap->pm_obj[0].uo_refs); 684 } 685 686 /* 687 * pmap_map_ptes: map a pmap's PTEs into KVM and lock them in 688 * 689 * there are several pmaps involved. some or all of them might be same. 690 * 691 * - the pmap given by the first argument 692 * our caller wants to access this pmap's PTEs. 693 * 694 * - pmap_kernel() 695 * the kernel pmap. note that it only contains the kernel part 696 * of the address space which is shared by any pmap. ie. any 697 * pmap can be used instead of pmap_kernel() for our purpose. 698 * 699 * - ci->ci_pmap 700 * pmap currently loaded on the cpu. 701 * 702 * - vm_map_pmap(&curproc->p_vmspace->vm_map) 703 * current process' pmap. 704 * 705 * => we lock enough pmaps to keep things locked in 706 * => must be undone with pmap_unmap_ptes before returning 707 */ 708 709 void 710 pmap_map_ptes(struct pmap *pmap, struct pmap **pmap2, 711 pd_entry_t **ptepp, pd_entry_t * const **pdeppp) 712 { 713 struct pmap *curpmap; 714 struct cpu_info *ci; 715 lwp_t *l; 716 717 /* The kernel's pmap is always accessible. */ 718 if (pmap == pmap_kernel()) { 719 *pmap2 = NULL; 720 *ptepp = PTE_BASE; 721 *pdeppp = normal_pdes; 722 return; 723 } 724 KASSERT(kpreempt_disabled()); 725 726 l = curlwp; 727 retry: 728 mutex_enter(pmap->pm_lock); 729 ci = curcpu(); 730 curpmap = ci->ci_pmap; 731 if (vm_map_pmap(&l->l_proc->p_vmspace->vm_map) == pmap) { 732 /* Our own pmap so just load it: easy. 
*/ 733 if (__predict_false(ci->ci_want_pmapload)) { 734 mutex_exit(pmap->pm_lock); 735 pmap_load(); 736 goto retry; 737 } 738 KASSERT(pmap == curpmap); 739 } else if (pmap == curpmap) { 740 /* 741 * Already on the CPU: make it valid. This is very 742 * often the case during exit(), when we have switched 743 * to the kernel pmap in order to destroy a user pmap. 744 */ 745 if (!pmap_reactivate(pmap)) { 746 u_int gen = uvm_emap_gen_return(); 747 tlbflush(); 748 uvm_emap_update(gen); 749 } 750 } else { 751 /* 752 * Toss current pmap from CPU, but keep a reference to it. 753 * The reference will be dropped by pmap_unmap_ptes(). 754 * Can happen if we block during exit(). 755 */ 756 const cpuid_t cid = cpu_index(ci); 757 758 kcpuset_atomic_clear(curpmap->pm_cpus, cid); 759 kcpuset_atomic_clear(curpmap->pm_kernel_cpus, cid); 760 ci->ci_pmap = pmap; 761 ci->ci_tlbstate = TLBSTATE_VALID; 762 kcpuset_atomic_set(pmap->pm_cpus, cid); 763 kcpuset_atomic_set(pmap->pm_kernel_cpus, cid); 764 cpu_load_pmap(pmap, curpmap); 765 } 766 pmap->pm_ncsw = l->l_ncsw; 767 *pmap2 = curpmap; 768 *ptepp = PTE_BASE; 769 #if defined(XEN) && defined(__x86_64__) 770 KASSERT(ci->ci_normal_pdes[PTP_LEVELS - 2] == L4_BASE); 771 ci->ci_normal_pdes[PTP_LEVELS - 2] = pmap->pm_pdir; 772 *pdeppp = ci->ci_normal_pdes; 773 #else /* XEN && __x86_64__ */ 774 *pdeppp = normal_pdes; 775 #endif /* XEN && __x86_64__ */ 776 } 777 778 /* 779 * pmap_unmap_ptes: unlock the PTE mapping of "pmap" 780 */ 781 782 void 783 pmap_unmap_ptes(struct pmap *pmap, struct pmap *pmap2) 784 { 785 struct cpu_info *ci; 786 struct pmap *mypmap; 787 788 KASSERT(kpreempt_disabled()); 789 790 /* The kernel's pmap is always accessible. */ 791 if (pmap == pmap_kernel()) { 792 return; 793 } 794 795 ci = curcpu(); 796 #if defined(XEN) && defined(__x86_64__) 797 /* Reset per-cpu normal_pdes */ 798 KASSERT(ci->ci_normal_pdes[PTP_LEVELS - 2] != L4_BASE); 799 ci->ci_normal_pdes[PTP_LEVELS - 2] = L4_BASE; 800 #endif /* XEN && __x86_64__ */ 801 /* 802 * We cannot tolerate context switches while mapped in. 803 * If it is our own pmap all we have to do is unlock. 804 */ 805 KASSERT(pmap->pm_ncsw == curlwp->l_ncsw); 806 mypmap = vm_map_pmap(&curproc->p_vmspace->vm_map); 807 if (pmap == mypmap) { 808 mutex_exit(pmap->pm_lock); 809 return; 810 } 811 812 /* 813 * Mark whatever's on the CPU now as lazy and unlock. 814 * If the pmap was already installed, we are done. 815 */ 816 ci->ci_tlbstate = TLBSTATE_LAZY; 817 ci->ci_want_pmapload = (mypmap != pmap_kernel()); 818 mutex_exit(pmap->pm_lock); 819 if (pmap == pmap2) { 820 return; 821 } 822 823 /* 824 * We installed another pmap on the CPU. Grab a reference to 825 * it and leave in place. Toss the evicted pmap (can block). 826 */ 827 pmap_reference(pmap); 828 pmap_destroy(pmap2); 829 } 830 831 832 inline static void 833 pmap_exec_account(struct pmap *pm, vaddr_t va, pt_entry_t opte, pt_entry_t npte) 834 { 835 836 #if !defined(__x86_64__) 837 if (curproc == NULL || curproc->p_vmspace == NULL || 838 pm != vm_map_pmap(&curproc->p_vmspace->vm_map)) 839 return; 840 841 if ((opte ^ npte) & PG_X) 842 pmap_update_pg(va); 843 844 /* 845 * Executability was removed on the last executable change. 846 * Reset the code segment to something conservative and 847 * let the trap handler deal with setting the right limit. 848 * We can't do that because of locking constraints on the vm map. 
849 */ 850 851 if ((opte & PG_X) && (npte & PG_X) == 0 && va == pm->pm_hiexec) { 852 struct trapframe *tf = curlwp->l_md.md_regs; 853 854 tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL); 855 pm->pm_hiexec = I386_MAX_EXE_ADDR; 856 } 857 #endif /* !defined(__x86_64__) */ 858 } 859 860 #if !defined(__x86_64__) 861 /* 862 * Fixup the code segment to cover all potential executable mappings. 863 * returns 0 if no changes to the code segment were made. 864 */ 865 866 int 867 pmap_exec_fixup(struct vm_map *map, struct trapframe *tf, struct pcb *pcb) 868 { 869 struct vm_map_entry *ent; 870 struct pmap *pm = vm_map_pmap(map); 871 vaddr_t va = 0; 872 873 vm_map_lock_read(map); 874 for (ent = (&map->header)->next; ent != &map->header; ent = ent->next) { 875 876 /* 877 * This entry has greater va than the entries before. 878 * We need to make it point to the last page, not past it. 879 */ 880 881 if (ent->protection & VM_PROT_EXECUTE) 882 va = trunc_page(ent->end) - PAGE_SIZE; 883 } 884 vm_map_unlock_read(map); 885 if (va == pm->pm_hiexec && tf->tf_cs == GSEL(GUCODEBIG_SEL, SEL_UPL)) 886 return (0); 887 888 pm->pm_hiexec = va; 889 if (pm->pm_hiexec > I386_MAX_EXE_ADDR) { 890 tf->tf_cs = GSEL(GUCODEBIG_SEL, SEL_UPL); 891 } else { 892 tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL); 893 return (0); 894 } 895 return (1); 896 } 897 #endif /* !defined(__x86_64__) */ 898 899 void 900 pat_init(struct cpu_info *ci) 901 { 902 uint64_t pat; 903 904 if (!(ci->ci_feat_val[0] & CPUID_PAT)) 905 return; 906 907 /* We change WT to WC. Leave all other entries the default values. */ 908 pat = PATENTRY(0, PAT_WB) | PATENTRY(1, PAT_WC) | 909 PATENTRY(2, PAT_UCMINUS) | PATENTRY(3, PAT_UC) | 910 PATENTRY(4, PAT_WB) | PATENTRY(5, PAT_WC) | 911 PATENTRY(6, PAT_UCMINUS) | PATENTRY(7, PAT_UC); 912 913 wrmsr(MSR_CR_PAT, pat); 914 cpu_pat_enabled = true; 915 aprint_debug_dev(ci->ci_dev, "PAT enabled\n"); 916 } 917 918 static pt_entry_t 919 pmap_pat_flags(u_int flags) 920 { 921 u_int cacheflags = (flags & PMAP_CACHE_MASK); 922 923 if (!cpu_pat_enabled) { 924 switch (cacheflags) { 925 case PMAP_NOCACHE: 926 case PMAP_NOCACHE_OVR: 927 /* results in PGC_UCMINUS on cpus which have 928 * the cpuid PAT but PAT "disabled" 929 */ 930 return PG_N; 931 default: 932 return 0; 933 } 934 } 935 936 switch (cacheflags) { 937 case PMAP_NOCACHE: 938 return PGC_UC; 939 case PMAP_WRITE_COMBINE: 940 return PGC_WC; 941 case PMAP_WRITE_BACK: 942 return PGC_WB; 943 case PMAP_NOCACHE_OVR: 944 return PGC_UCMINUS; 945 } 946 947 return 0; 948 } 949 950 /* 951 * p m a p k e n t e r f u n c t i o n s 952 * 953 * functions to quickly enter/remove pages from the kernel address 954 * space. pmap_kremove is exported to MI kernel. we make use of 955 * the recursive PTE mappings. 
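 *
 * a minimal usage sketch (illustrative only; "kva" and "pa" stand for a
 * page-aligned kernel VA the caller already owns and the physical page
 * to map there):
 *
 *	pmap_kenter_pa(kva, pa, VM_PROT_READ | VM_PROT_WRITE, 0);
 *	pmap_update(pmap_kernel());
 *	... use the mapping ...
 *	pmap_kremove(kva, PAGE_SIZE);
 *	pmap_update(pmap_kernel());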
956 */ 957 958 /* 959 * pmap_kenter_pa: enter a kernel mapping without R/M (pv_entry) tracking 960 * 961 * => no need to lock anything, assume va is already allocated 962 * => should be faster than normal pmap enter function 963 */ 964 965 void 966 pmap_kenter_pa(vaddr_t va, paddr_t pa, vm_prot_t prot, u_int flags) 967 { 968 pt_entry_t *pte, opte, npte; 969 970 KASSERT(!(prot & ~VM_PROT_ALL)); 971 972 if (va < VM_MIN_KERNEL_ADDRESS) 973 pte = vtopte(va); 974 else 975 pte = kvtopte(va); 976 #ifdef DOM0OPS 977 if (pa < pmap_pa_start || pa >= pmap_pa_end) { 978 #ifdef DEBUG 979 printf_nolog("%s: pa %#" PRIxPADDR " for va %#" PRIxVADDR 980 " outside range\n", __func__, pa, va); 981 #endif /* DEBUG */ 982 npte = pa; 983 } else 984 #endif /* DOM0OPS */ 985 npte = pmap_pa2pte(pa); 986 npte |= protection_codes[prot] | PG_V | pmap_pg_g; 987 npte |= pmap_pat_flags(flags); 988 opte = pmap_pte_testset(pte, npte); /* zap! */ 989 990 /* 991 * XXX: make sure we are not dealing with a large page, since the only 992 * large pages created are for the kernel image, and they should never 993 * be kentered. 994 */ 995 KASSERTMSG(!(opte & PG_PS), "PG_PS va=%#"PRIxVADDR, va); 996 997 if ((opte & (PG_V | PG_U)) == (PG_V | PG_U)) { 998 /* This should not happen. */ 999 printf_nolog("%s: mapping already present\n", __func__); 1000 kpreempt_disable(); 1001 pmap_tlb_shootdown(pmap_kernel(), va, opte, TLBSHOOT_KENTER); 1002 kpreempt_enable(); 1003 } 1004 } 1005 1006 void 1007 pmap_emap_enter(vaddr_t va, paddr_t pa, vm_prot_t prot) 1008 { 1009 pt_entry_t *pte, npte; 1010 1011 KASSERT((prot & ~VM_PROT_ALL) == 0); 1012 pte = (va < VM_MIN_KERNEL_ADDRESS) ? vtopte(va) : kvtopte(va); 1013 1014 #ifdef DOM0OPS 1015 if (pa < pmap_pa_start || pa >= pmap_pa_end) { 1016 npte = pa; 1017 } else 1018 #endif 1019 npte = pmap_pa2pte(pa); 1020 1021 npte = pmap_pa2pte(pa); 1022 npte |= protection_codes[prot] | PG_V; 1023 pmap_pte_set(pte, npte); 1024 pmap_pte_flush(); 1025 } 1026 1027 /* 1028 * pmap_emap_sync: perform TLB flush or pmap load, if it was deferred. 1029 */ 1030 void 1031 pmap_emap_sync(bool canload) 1032 { 1033 struct cpu_info *ci = curcpu(); 1034 struct pmap *pmap; 1035 1036 KASSERT(kpreempt_disabled()); 1037 if (__predict_true(ci->ci_want_pmapload && canload)) { 1038 /* 1039 * XXX: Hint for pmap_reactivate(), which might suggest to 1040 * not perform TLB flush, if state has not changed. 1041 */ 1042 pmap = vm_map_pmap(&curlwp->l_proc->p_vmspace->vm_map); 1043 if (__predict_false(pmap == ci->ci_pmap)) { 1044 kcpuset_atomic_clear(pmap->pm_cpus, cpu_index(ci)); 1045 } 1046 pmap_load(); 1047 KASSERT(ci->ci_want_pmapload == 0); 1048 } else { 1049 tlbflush(); 1050 } 1051 } 1052 1053 void 1054 pmap_emap_remove(vaddr_t sva, vsize_t len) 1055 { 1056 pt_entry_t *pte; 1057 vaddr_t va, eva = sva + len; 1058 1059 for (va = sva; va < eva; va += PAGE_SIZE) { 1060 pte = (va < VM_MIN_KERNEL_ADDRESS) ? vtopte(va) : kvtopte(va); 1061 pmap_pte_set(pte, 0); 1062 } 1063 1064 pmap_pte_flush(); 1065 } 1066 1067 __strict_weak_alias(pmap_kenter_ma, pmap_kenter_pa); 1068 1069 #if defined(__x86_64__) 1070 /* 1071 * Change protection for a virtual address. Local for a CPU only, don't 1072 * care about TLB shootdowns. 
1073 * 1074 * => must be called with preemption disabled 1075 */ 1076 void 1077 pmap_changeprot_local(vaddr_t va, vm_prot_t prot) 1078 { 1079 pt_entry_t *pte, opte, npte; 1080 1081 KASSERT(kpreempt_disabled()); 1082 1083 if (va < VM_MIN_KERNEL_ADDRESS) 1084 pte = vtopte(va); 1085 else 1086 pte = kvtopte(va); 1087 1088 npte = opte = *pte; 1089 1090 if ((prot & VM_PROT_WRITE) != 0) 1091 npte |= PG_RW; 1092 else 1093 npte &= ~PG_RW; 1094 1095 if (opte != npte) { 1096 pmap_pte_set(pte, npte); 1097 pmap_pte_flush(); 1098 invlpg(va); 1099 } 1100 } 1101 #endif /* defined(__x86_64__) */ 1102 1103 /* 1104 * pmap_kremove: remove a kernel mapping(s) without R/M (pv_entry) tracking 1105 * 1106 * => no need to lock anything 1107 * => caller must dispose of any vm_page mapped in the va range 1108 * => note: not an inline function 1109 * => we assume the va is page aligned and the len is a multiple of PAGE_SIZE 1110 * => we assume kernel only unmaps valid addresses and thus don't bother 1111 * checking the valid bit before doing TLB flushing 1112 * => must be followed by call to pmap_update() before reuse of page 1113 */ 1114 1115 static inline void 1116 pmap_kremove1(vaddr_t sva, vsize_t len, bool localonly) 1117 { 1118 pt_entry_t *pte, opte; 1119 vaddr_t va, eva; 1120 1121 eva = sva + len; 1122 1123 kpreempt_disable(); 1124 for (va = sva; va < eva; va += PAGE_SIZE) { 1125 pte = kvtopte(va); 1126 opte = pmap_pte_testset(pte, 0); /* zap! */ 1127 if ((opte & (PG_V | PG_U)) == (PG_V | PG_U) && !localonly) { 1128 pmap_tlb_shootdown(pmap_kernel(), va, opte, 1129 TLBSHOOT_KREMOVE); 1130 } 1131 KASSERTMSG((opte & PG_PS) == 0, 1132 "va %#" PRIxVADDR " is a large page", va); 1133 KASSERTMSG((opte & PG_PVLIST) == 0, 1134 "va %#" PRIxVADDR " is a pv tracked page", va); 1135 } 1136 if (localonly) { 1137 tlbflushg(); 1138 } 1139 kpreempt_enable(); 1140 } 1141 1142 void 1143 pmap_kremove(vaddr_t sva, vsize_t len) 1144 { 1145 1146 pmap_kremove1(sva, len, false); 1147 } 1148 1149 /* 1150 * pmap_kremove_local: like pmap_kremove(), but only worry about 1151 * TLB invalidations on the current CPU. this is only intended 1152 * for use while writing kernel crash dumps, either after panic 1153 * or via reboot -d. 1154 */ 1155 1156 void 1157 pmap_kremove_local(vaddr_t sva, vsize_t len) 1158 { 1159 1160 pmap_kremove1(sva, len, true); 1161 } 1162 1163 /* 1164 * p m a p i n i t f u n c t i o n s 1165 * 1166 * pmap_bootstrap and pmap_init are called during system startup 1167 * to init the pmap module. pmap_bootstrap() does a low level 1168 * init just to get things rolling. pmap_init() finishes the job. 1169 */ 1170 1171 /* 1172 * pmap_bootstrap_valloc: allocate a virtual address in the bootstrap area. 1173 * This function is to be used before any VM system has been set up. 1174 * 1175 * The va is taken from virtual_avail. 1176 */ 1177 static vaddr_t 1178 pmap_bootstrap_valloc(size_t npages) 1179 { 1180 vaddr_t va = virtual_avail; 1181 virtual_avail += npages * PAGE_SIZE; 1182 return va; 1183 } 1184 1185 /* 1186 * pmap_bootstrap_palloc: allocate a physical address in the bootstrap area. 1187 * This function is to be used before any VM system has been set up. 1188 * 1189 * The pa is taken from avail_start. 1190 */ 1191 static paddr_t 1192 pmap_bootstrap_palloc(size_t npages) 1193 { 1194 paddr_t pa = avail_start; 1195 avail_start += npages * PAGE_SIZE; 1196 return pa; 1197 } 1198 1199 /* 1200 * pmap_bootstrap: get the system in a state where it can run with VM properly 1201 * enabled (called before main()). 
The VM system is fully init'd later. 1202 * 1203 * => on i386, locore.S has already enabled the MMU by allocating a PDP for the 1204 * kernel, and nkpde PTP's for the kernel. 1205 * => kva_start is the first free virtual address in kernel space. 1206 */ 1207 void 1208 pmap_bootstrap(vaddr_t kva_start) 1209 { 1210 struct pmap *kpm; 1211 int i; 1212 vaddr_t kva; 1213 1214 pmap_pg_nx = (cpu_feature[2] & CPUID_NOX ? PG_NX : 0); 1215 1216 /* 1217 * Set up our local static global vars that keep track of the usage of 1218 * KVM before kernel_map is set up. 1219 */ 1220 virtual_avail = kva_start; /* first free KVA */ 1221 virtual_end = VM_MAX_KERNEL_ADDRESS; /* last KVA */ 1222 1223 /* 1224 * Set up protection_codes: we need to be able to convert from a MI 1225 * protection code (some combo of VM_PROT...) to something we can jam 1226 * into a x86 PTE. 1227 */ 1228 protection_codes[VM_PROT_NONE] = pmap_pg_nx; 1229 protection_codes[VM_PROT_EXECUTE] = PG_RO | PG_X; 1230 protection_codes[VM_PROT_READ] = PG_RO | pmap_pg_nx; 1231 protection_codes[VM_PROT_READ|VM_PROT_EXECUTE] = PG_RO | PG_X; 1232 protection_codes[VM_PROT_WRITE] = PG_RW | pmap_pg_nx; 1233 protection_codes[VM_PROT_WRITE|VM_PROT_EXECUTE] = PG_RW | PG_X; 1234 protection_codes[VM_PROT_WRITE|VM_PROT_READ] = PG_RW | pmap_pg_nx; 1235 protection_codes[VM_PROT_ALL] = PG_RW | PG_X; 1236 1237 /* 1238 * Now we init the kernel's pmap. 1239 * 1240 * The kernel pmap's pm_obj is not used for much. However, in user pmaps 1241 * the pm_obj contains the list of active PTPs. 1242 * 1243 * The pm_obj currently does not have a pager. It might be possible to 1244 * add a pager that would allow a process to read-only mmap its own page 1245 * tables (fast user-level vtophys?). This may or may not be useful. 1246 */ 1247 kpm = pmap_kernel(); 1248 for (i = 0; i < PTP_LEVELS - 1; i++) { 1249 mutex_init(&kpm->pm_obj_lock[i], MUTEX_DEFAULT, IPL_NONE); 1250 uvm_obj_init(&kpm->pm_obj[i], NULL, false, 1); 1251 uvm_obj_setlock(&kpm->pm_obj[i], &kpm->pm_obj_lock[i]); 1252 kpm->pm_ptphint[i] = NULL; 1253 } 1254 memset(&kpm->pm_list, 0, sizeof(kpm->pm_list)); /* pm_list not used */ 1255 1256 kpm->pm_pdir = (pd_entry_t *)bootspace.pdir; 1257 for (i = 0; i < PDP_SIZE; i++) 1258 kpm->pm_pdirpa[i] = PDPpaddr + PAGE_SIZE * i; 1259 1260 kpm->pm_stats.wired_count = kpm->pm_stats.resident_count = 1261 x86_btop(kva_start - VM_MIN_KERNEL_ADDRESS); 1262 1263 kcpuset_create(&kpm->pm_cpus, true); 1264 kcpuset_create(&kpm->pm_kernel_cpus, true); 1265 1266 kpm->pm_ldt = NULL; 1267 kpm->pm_ldt_len = 0; 1268 kpm->pm_ldt_sel = GSYSSEL(GLDT_SEL, SEL_KPL); 1269 1270 /* 1271 * the above is just a rough estimate and not critical to the proper 1272 * operation of the system. 1273 */ 1274 1275 #ifndef XEN 1276 /* 1277 * Begin to enable global TLB entries if they are supported. 1278 * The G bit has no effect until the CR4_PGE bit is set in CR4, 1279 * which happens in cpu_init(), which is run on each cpu 1280 * (and happens later) 1281 */ 1282 if (cpu_feature[0] & CPUID_PGE) { 1283 pmap_pg_g = PG_G; /* enable software */ 1284 1285 /* add PG_G attribute to already mapped kernel pages */ 1286 pmap_remap_global(); 1287 } 1288 1289 /* 1290 * Enable large pages if they are supported. 
1291 */ 1292 if (cpu_feature[0] & CPUID_PSE) { 1293 lcr4(rcr4() | CR4_PSE); /* enable hardware (via %cr4) */ 1294 pmap_largepages = 1; /* enable software */ 1295 1296 /* 1297 * The TLB must be flushed after enabling large pages on Pentium 1298 * CPUs, according to section 3.6.2.2 of "Intel Architecture 1299 * Software Developer's Manual, Volume 3: System Programming". 1300 */ 1301 tlbflushg(); 1302 1303 /* Remap the kernel. */ 1304 pmap_remap_largepages(); 1305 } 1306 pmap_init_lapic(); 1307 #endif /* !XEN */ 1308 1309 #ifdef __HAVE_DIRECT_MAP 1310 pmap_init_directmap(kpm); 1311 #else 1312 pmap_vpage_cpualloc(&cpu_info_primary); 1313 1314 if (VM_MIN_KERNEL_ADDRESS == KERNBASE) { /* i386 */ 1315 early_zerop = (void *)cpu_info_primary.vpage[VPAGE_ZER]; 1316 early_zero_pte = cpu_info_primary.vpage_pte[VPAGE_ZER]; 1317 } else { /* amd64 */ 1318 /* 1319 * zero_pte is stuck at the end of mapped space for the kernel 1320 * image (disjunct from kva space). This is done so that it 1321 * can safely be used in pmap_growkernel (pmap_get_physpage), 1322 * when it's called for the first time. 1323 * XXXfvdl fix this for MULTIPROCESSOR later. 1324 */ 1325 #ifdef XEN 1326 /* early_zerop initialized in xen_locore() */ 1327 #else 1328 early_zerop = (void *)bootspace.spareva; 1329 #endif 1330 early_zero_pte = PTE_BASE + pl1_i((vaddr_t)early_zerop); 1331 } 1332 #endif 1333 1334 #if defined(XEN) && defined(__x86_64__) 1335 extern vaddr_t xen_dummy_page; 1336 paddr_t xen_dummy_user_pgd; 1337 1338 /* 1339 * We want a dummy page directory for Xen: when deactivating a pmap, 1340 * Xen will still consider it active. So we set user PGD to this one 1341 * to lift all protection on the now inactive page tables set. 1342 */ 1343 xen_dummy_user_pgd = xen_dummy_page - KERNBASE; 1344 1345 /* Zero fill it, the less checks in Xen it requires the better */ 1346 memset((void *)(xen_dummy_user_pgd + KERNBASE), 0, PAGE_SIZE); 1347 /* Mark read-only */ 1348 HYPERVISOR_update_va_mapping(xen_dummy_user_pgd + KERNBASE, 1349 pmap_pa2pte(xen_dummy_user_pgd) | PG_V | pmap_pg_nx, 1350 UVMF_INVLPG); 1351 /* Pin as L4 */ 1352 xpq_queue_pin_l4_table(xpmap_ptom_masked(xen_dummy_user_pgd)); 1353 #endif 1354 1355 /* 1356 * Allocate space for the IDT, GDT and LDT. 1357 */ 1358 idt_vaddr = pmap_bootstrap_valloc(1); 1359 idt_paddr = pmap_bootstrap_palloc(1); 1360 1361 gdt_vaddr = pmap_bootstrap_valloc(1); 1362 gdt_paddr = pmap_bootstrap_palloc(1); 1363 1364 ldt_vaddr = pmap_bootstrap_valloc(1); 1365 ldt_paddr = pmap_bootstrap_palloc(1); 1366 1367 #if !defined(__x86_64__) && !defined(XEN) 1368 /* pentium f00f bug stuff */ 1369 pentium_idt_vaddr = pmap_bootstrap_valloc(1); 1370 #endif 1371 1372 /* 1373 * Now we reserve some VM for mapping pages when doing a crash dump. 1374 */ 1375 virtual_avail = reserve_dumppages(virtual_avail); 1376 1377 /* 1378 * Init the static-global locks and global lists. 1379 * 1380 * => pventry::pvh_lock (initialized elsewhere) must also be 1381 * a spin lock, again at IPL_VM to prevent deadlock, and 1382 * again is never taken from interrupt context. 1383 */ 1384 mutex_init(&pmaps_lock, MUTEX_DEFAULT, IPL_NONE); 1385 LIST_INIT(&pmaps); 1386 1387 /* 1388 * Ensure the TLB is sync'd with reality by flushing it... 1389 */ 1390 tlbflushg(); 1391 1392 /* 1393 * Calculate pmap_maxkvaddr from nkptp[]. 
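	 * in effect (a restatement of the loop below):
	 *
	 *	pmap_maxkvaddr = VM_MIN_KERNEL_ADDRESS +
	 *	    nkptp[1] * nbpd[1] + ... +
	 *	    nkptp[PTP_LEVELS - 1] * nbpd[PTP_LEVELS - 1];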
1394 */ 1395 kva = VM_MIN_KERNEL_ADDRESS; 1396 for (i = PTP_LEVELS - 1; i >= 1; i--) { 1397 kva += nkptp[i] * nbpd[i]; 1398 } 1399 pmap_maxkvaddr = kva; 1400 } 1401 1402 #ifndef XEN 1403 static void 1404 pmap_init_lapic(void) 1405 { 1406 /* 1407 * On CPUs that have no LAPIC, local_apic_va is never kentered. But our 1408 * x86 implementation relies a lot on this address to be valid; so just 1409 * allocate a fake physical page that will be kentered into 1410 * local_apic_va by machdep. 1411 * 1412 * If the LAPIC is present, the va will be remapped somewhere else 1413 * later in lapic_map. 1414 */ 1415 local_apic_va = pmap_bootstrap_valloc(1); 1416 local_apic_pa = pmap_bootstrap_palloc(1); 1417 } 1418 #endif 1419 1420 #ifdef __HAVE_DIRECT_MAP 1421 /* 1422 * Create the amd64 direct map. Called only once at boot time. 1423 */ 1424 static void 1425 pmap_init_directmap(struct pmap *kpm) 1426 { 1427 extern phys_ram_seg_t mem_clusters[]; 1428 extern int mem_cluster_cnt; 1429 1430 paddr_t lastpa, L2page_pa, L3page_pa, pdp; 1431 vaddr_t tmpva; 1432 pt_entry_t *pte; 1433 pd_entry_t *pde; 1434 phys_ram_seg_t *mc; 1435 size_t nL4e, nL3e, nL2e; 1436 size_t pn, npd; 1437 int i, n; 1438 1439 const pd_entry_t pteflags = PG_V | PG_KW | pmap_pg_nx; 1440 1441 CTASSERT(NL4_SLOT_DIRECT * NBPD_L4 == MAXPHYSMEM); 1442 1443 /* Get the last physical address available */ 1444 lastpa = 0; 1445 for (i = 0; i < mem_cluster_cnt; i++) { 1446 mc = &mem_clusters[i]; 1447 lastpa = MAX(lastpa, mc->start + mc->size); 1448 } 1449 1450 /* 1451 * x86_add_cluster should have truncated the memory to MAXPHYSMEM. 1452 */ 1453 if (lastpa > MAXPHYSMEM) { 1454 panic("pmap_init_directmap: lastpa incorrect"); 1455 } 1456 1457 /* We will use this temporary va. */ 1458 tmpva = bootspace.spareva; 1459 pte = PTE_BASE + pl1_i(tmpva); 1460 1461 /* Number of L4 entries. */ 1462 nL4e = (lastpa + NBPD_L4 - 1) >> L4_SHIFT; 1463 KASSERT(nL4e <= NL4_SLOT_DIRECT); 1464 1465 /* Allocate L3, and zero it out. */ 1466 L3page_pa = pmap_bootstrap_palloc(nL4e); 1467 for (i = 0; i < nL4e; i++) { 1468 pdp = L3page_pa + i * PAGE_SIZE; 1469 *pte = (pdp & PG_FRAME) | pteflags; 1470 pmap_update_pg(tmpva); 1471 memset((void *)tmpva, 0, PAGE_SIZE); 1472 } 1473 1474 /* Number of L3 entries. */ 1475 nL3e = (lastpa + NBPD_L3 - 1) >> L3_SHIFT; 1476 1477 /* 1478 * Map the direct map RW. Use super pages (1GB) or large pages (2MB) if 1479 * they are supported. Note: PG_G is not allowed on non-leaf PTPs. 1480 */ 1481 if (cpu_feature[2] & CPUID_P1GB) { 1482 /* Super pages are supported. Just create L3. */ 1483 for (i = 0; i < nL3e; i++) { 1484 pdp = (paddr_t)&(((pd_entry_t *)L3page_pa)[i]); 1485 *pte = (pdp & PG_FRAME) | pteflags; 1486 pmap_update_pg(tmpva); 1487 1488 pde = (pd_entry_t *)(tmpva + (pdp & ~PG_FRAME)); 1489 *pde = ((paddr_t)i << L3_SHIFT) | pteflags | PG_U | 1490 PG_PS | PG_G; 1491 } 1492 } else { 1493 /* Allocate L2. */ 1494 L2page_pa = pmap_bootstrap_palloc(nL3e); 1495 1496 /* Number of L2 entries. */ 1497 nL2e = (lastpa + NBPD_L2 - 1) >> L2_SHIFT; 1498 1499 KASSERT(pmap_largepages != 0); 1500 1501 /* Large pages are supported. Just create L2. */ 1502 for (i = 0; i < nL3e; i++) { 1503 pdp = L2page_pa + i * PAGE_SIZE; 1504 *pte = (pdp & PG_FRAME) | pteflags; 1505 pmap_update_pg(tmpva); 1506 1507 memset((void *)tmpva, 0, PAGE_SIZE); 1508 1509 pde = (pd_entry_t *)tmpva; 1510 npd = ((i == nL3e - 1) && (nL2e % NPDPG != 0)) ? 
1511 (nL2e % NPDPG) : NPDPG; 1512 for (n = 0; n < npd; n++) { 1513 pn = (i * NPDPG) + n; 1514 pde[n] = ((paddr_t)pn << L2_SHIFT) | pteflags | 1515 PG_U | PG_PS | PG_G; 1516 } 1517 } 1518 1519 /* Fill in the L3 entries, linked to L2. */ 1520 for (i = 0; i < nL4e; i++) { 1521 pdp = L3page_pa + i * PAGE_SIZE; 1522 *pte = (pdp & PG_FRAME) | pteflags; 1523 pmap_update_pg(tmpva); 1524 1525 pde = (pd_entry_t *)tmpva; 1526 npd = ((i == nL4e - 1) && (nL3e % NPDPG != 0)) ? 1527 (nL3e % NPDPG) : NPDPG; 1528 for (n = 0; n < npd; n++) { 1529 pn = (i * NPDPG) + n; 1530 pde[n] = (L2page_pa + (pn << PAGE_SHIFT)) | 1531 pteflags | PG_U; 1532 } 1533 } 1534 } 1535 1536 /* Fill in the L4 entries, linked to L3. */ 1537 for (i = 0; i < nL4e; i++) { 1538 kpm->pm_pdir[PDIR_SLOT_DIRECT + i] = 1539 (L3page_pa + (i << PAGE_SHIFT)) | pteflags | PG_U; 1540 } 1541 1542 *pte = 0; 1543 pmap_update_pg(tmpva); 1544 1545 tlbflush(); 1546 } 1547 #endif /* __HAVE_DIRECT_MAP */ 1548 1549 #ifndef XEN 1550 /* 1551 * Remap all of the virtual pages created so far with the PG_G bit. 1552 */ 1553 static void 1554 pmap_remap_global(void) 1555 { 1556 vaddr_t kva, kva_end; 1557 unsigned long p1i; 1558 size_t i; 1559 1560 /* head */ 1561 kva = bootspace.head.va; 1562 kva_end = kva + bootspace.head.sz; 1563 for ( ; kva < kva_end; kva += PAGE_SIZE) { 1564 p1i = pl1_i(kva); 1565 if (pmap_valid_entry(PTE_BASE[p1i])) 1566 PTE_BASE[p1i] |= PG_G; 1567 } 1568 1569 /* kernel segments */ 1570 for (i = 0; i < BTSPACE_NSEGS; i++) { 1571 if (bootspace.segs[i].type == BTSEG_NONE) { 1572 continue; 1573 } 1574 kva = bootspace.segs[i].va; 1575 kva_end = kva + bootspace.segs[i].sz; 1576 for ( ; kva < kva_end; kva += PAGE_SIZE) { 1577 p1i = pl1_i(kva); 1578 if (pmap_valid_entry(PTE_BASE[p1i])) 1579 PTE_BASE[p1i] |= PG_G; 1580 } 1581 } 1582 1583 /* boot space */ 1584 kva = bootspace.boot.va; 1585 kva_end = kva + bootspace.boot.sz; 1586 for ( ; kva < kva_end; kva += PAGE_SIZE) { 1587 p1i = pl1_i(kva); 1588 if (pmap_valid_entry(PTE_BASE[p1i])) 1589 PTE_BASE[p1i] |= PG_G; 1590 } 1591 } 1592 1593 /* 1594 * Remap several kernel segments with large pages. We cover as many pages as we 1595 * can. Called only once at boot time, if the CPU supports large pages. 1596 */ 1597 static void 1598 pmap_remap_largepages(void) 1599 { 1600 pd_entry_t *pde; 1601 vaddr_t kva, kva_end; 1602 paddr_t pa; 1603 size_t i; 1604 1605 /* Remap the kernel text using large pages. */ 1606 for (i = 0; i < BTSPACE_NSEGS; i++) { 1607 if (bootspace.segs[i].type != BTSEG_TEXT) { 1608 continue; 1609 } 1610 kva = roundup(bootspace.segs[i].va, NBPD_L2); 1611 kva_end = rounddown(bootspace.segs[i].va + 1612 bootspace.segs[i].sz, NBPD_L1); 1613 pa = roundup(bootspace.segs[i].pa, NBPD_L2); 1614 for (/* */; kva + NBPD_L2 <= kva_end; kva += NBPD_L2, pa += NBPD_L2) { 1615 pde = &L2_BASE[pl2_i(kva)]; 1616 *pde = pa | pmap_pg_g | PG_PS | PG_KR | PG_V; 1617 tlbflushg(); 1618 } 1619 } 1620 1621 /* Remap the kernel rodata using large pages. */ 1622 for (i = 0; i < BTSPACE_NSEGS; i++) { 1623 if (bootspace.segs[i].type != BTSEG_RODATA) { 1624 continue; 1625 } 1626 kva = roundup(bootspace.segs[i].va, NBPD_L2); 1627 kva_end = rounddown(bootspace.segs[i].va + 1628 bootspace.segs[i].sz, NBPD_L1); 1629 pa = roundup(bootspace.segs[i].pa, NBPD_L2); 1630 for (/* */; kva + NBPD_L2 <= kva_end; kva += NBPD_L2, pa += NBPD_L2) { 1631 pde = &L2_BASE[pl2_i(kva)]; 1632 *pde = pa | pmap_pg_g | PG_PS | pmap_pg_nx | PG_KR | PG_V; 1633 tlbflushg(); 1634 } 1635 } 1636 1637 /* Remap the kernel data+bss using large pages. 
*/ 1638 for (i = 0; i < BTSPACE_NSEGS; i++) { 1639 if (bootspace.segs[i].type != BTSEG_DATA) { 1640 continue; 1641 } 1642 kva = roundup(bootspace.segs[i].va, NBPD_L2); 1643 kva_end = rounddown(bootspace.segs[i].va + 1644 bootspace.segs[i].sz, NBPD_L1); 1645 pa = roundup(bootspace.segs[i].pa, NBPD_L2); 1646 for (/* */; kva + NBPD_L2 <= kva_end; kva += NBPD_L2, pa += NBPD_L2) { 1647 pde = &L2_BASE[pl2_i(kva)]; 1648 *pde = pa | pmap_pg_g | PG_PS | pmap_pg_nx | PG_KW | PG_V; 1649 tlbflushg(); 1650 } 1651 } 1652 } 1653 #endif /* !XEN */ 1654 1655 /* 1656 * pmap_init: called from uvm_init, our job is to get the pmap 1657 * system ready to manage mappings... 1658 */ 1659 1660 void 1661 pmap_init(void) 1662 { 1663 int i, flags; 1664 1665 for (i = 0; i < PV_HASH_SIZE; i++) { 1666 SLIST_INIT(&pv_hash_heads[i].hh_list); 1667 } 1668 for (i = 0; i < PV_HASH_LOCK_CNT; i++) { 1669 mutex_init(&pv_hash_locks[i].lock, MUTEX_NODEBUG, IPL_VM); 1670 } 1671 1672 /* 1673 * initialize caches. 1674 */ 1675 1676 pool_cache_bootstrap(&pmap_cache, sizeof(struct pmap), 0, 0, 0, 1677 "pmappl", NULL, IPL_NONE, NULL, NULL, NULL); 1678 1679 #ifdef XEN 1680 /* 1681 * pool_cache(9) should not touch cached objects, since they 1682 * are pinned on xen and R/O for the domU 1683 */ 1684 flags = PR_NOTOUCH; 1685 #else /* XEN */ 1686 flags = 0; 1687 #endif /* XEN */ 1688 #ifdef PAE 1689 pool_cache_bootstrap(&pmap_pdp_cache, PAGE_SIZE * PDP_SIZE, 0, 0, flags, 1690 "pdppl", &pmap_pdp_allocator, IPL_NONE, 1691 pmap_pdp_ctor, pmap_pdp_dtor, NULL); 1692 #else /* PAE */ 1693 pool_cache_bootstrap(&pmap_pdp_cache, PAGE_SIZE, 0, 0, flags, 1694 "pdppl", NULL, IPL_NONE, pmap_pdp_ctor, pmap_pdp_dtor, NULL); 1695 #endif /* PAE */ 1696 pool_cache_bootstrap(&pmap_pv_cache, sizeof(struct pv_entry), 0, 0, 1697 PR_LARGECACHE, "pvpl", &pool_allocator_kmem, IPL_NONE, NULL, 1698 NULL, NULL); 1699 1700 pmap_tlb_init(); 1701 1702 /* XXX: Since cpu_hatch() is only for secondary CPUs. */ 1703 pmap_tlb_cpu_init(curcpu()); 1704 1705 evcnt_attach_dynamic(&pmap_iobmp_evcnt, EVCNT_TYPE_MISC, 1706 NULL, "x86", "io bitmap copy"); 1707 evcnt_attach_dynamic(&pmap_ldt_evcnt, EVCNT_TYPE_MISC, 1708 NULL, "x86", "ldt sync"); 1709 1710 /* 1711 * done: pmap module is up (and ready for business) 1712 */ 1713 1714 pmap_initialized = true; 1715 } 1716 1717 /* 1718 * pmap_cpu_init_late: perform late per-CPU initialization. 1719 */ 1720 1721 #ifndef XEN 1722 void 1723 pmap_cpu_init_late(struct cpu_info *ci) 1724 { 1725 /* 1726 * The BP has already its own PD page allocated during early 1727 * MD startup. 
1728 */ 1729 if (ci == &cpu_info_primary) 1730 return; 1731 1732 #ifdef PAE 1733 cpu_alloc_l3_page(ci); 1734 #endif 1735 } 1736 #endif 1737 1738 #ifndef __HAVE_DIRECT_MAP 1739 CTASSERT(CACHE_LINE_SIZE > sizeof(pt_entry_t)); 1740 CTASSERT(CACHE_LINE_SIZE % sizeof(pt_entry_t) == 0); 1741 1742 static void 1743 pmap_vpage_cpualloc(struct cpu_info *ci) 1744 { 1745 bool primary = (ci == &cpu_info_primary); 1746 size_t i, npages; 1747 vaddr_t vabase; 1748 vsize_t vrange; 1749 1750 npages = (CACHE_LINE_SIZE / sizeof(pt_entry_t)); 1751 KASSERT(npages >= VPAGE_MAX); 1752 vrange = npages * PAGE_SIZE; 1753 1754 if (primary) { 1755 while ((vabase = pmap_bootstrap_valloc(1)) % vrange != 0) { 1756 /* Waste some pages to align properly */ 1757 } 1758 /* The base is aligned, allocate the rest (contiguous) */ 1759 pmap_bootstrap_valloc(npages - 1); 1760 } else { 1761 vabase = uvm_km_alloc(kernel_map, vrange, vrange, 1762 UVM_KMF_VAONLY); 1763 if (vabase == 0) { 1764 panic("%s: failed to allocate tmp VA for CPU %d\n", 1765 __func__, cpu_index(ci)); 1766 } 1767 } 1768 1769 KASSERT((vaddr_t)&PTE_BASE[pl1_i(vabase)] % CACHE_LINE_SIZE == 0); 1770 1771 for (i = 0; i < VPAGE_MAX; i++) { 1772 ci->vpage[i] = vabase + i * PAGE_SIZE; 1773 ci->vpage_pte[i] = PTE_BASE + pl1_i(ci->vpage[i]); 1774 } 1775 } 1776 1777 void 1778 pmap_vpage_cpu_init(struct cpu_info *ci) 1779 { 1780 if (ci == &cpu_info_primary) { 1781 /* cpu0 already taken care of in pmap_bootstrap */ 1782 return; 1783 } 1784 1785 pmap_vpage_cpualloc(ci); 1786 } 1787 #endif 1788 1789 /* 1790 * p v _ e n t r y f u n c t i o n s 1791 */ 1792 1793 static bool 1794 pmap_pp_needs_pve(struct pmap_page *pp) 1795 { 1796 1797 /* 1798 * Adding a pv entry for this page only needs to allocate a pv_entry 1799 * structure if the page already has at least one pv entry, 1800 * since the first pv entry is stored in the pmap_page. 
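	 *
	 * i.e. (an illustrative restatement of the test below):
	 *
	 *	no mappings yet     -> first one is embedded in pp->pp_pte
	 *	                       (PP_EMBEDDED), no pv_entry needed
	 *	already mapped once -> a pv_entry from pmap_pv_cache is
	 *	                       needed for the new mapping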
1801 */ 1802 1803 return pp && ((pp->pp_flags & PP_EMBEDDED) != 0 || 1804 !LIST_EMPTY(&pp->pp_head.pvh_list)); 1805 } 1806 1807 /* 1808 * pmap_free_pvs: free a list of pv_entrys 1809 */ 1810 1811 static void 1812 pmap_free_pvs(struct pv_entry *pve) 1813 { 1814 struct pv_entry *next; 1815 1816 for ( /* null */ ; pve != NULL ; pve = next) { 1817 next = pve->pve_next; 1818 pool_cache_put(&pmap_pv_cache, pve); 1819 } 1820 } 1821 1822 /* 1823 * main pv_entry manipulation functions: 1824 * pmap_enter_pv: enter a mapping onto a pv_head list 1825 * pmap_remove_pv: remove a mapping from a pv_head list 1826 * 1827 * NOTE: Both pmap_enter_pv and pmap_remove_pv expect the caller to lock 1828 * the pvh before calling 1829 */ 1830 1831 /* 1832 * insert_pv: a helper of pmap_enter_pv 1833 */ 1834 1835 static void 1836 insert_pv(struct pmap_page *pp, struct pv_entry *pve) 1837 { 1838 struct pv_hash_head *hh; 1839 kmutex_t *lock; 1840 u_int hash; 1841 1842 hash = pvhash_hash(pve->pve_pte.pte_ptp, pve->pve_pte.pte_va); 1843 lock = pvhash_lock(hash); 1844 hh = pvhash_head(hash); 1845 mutex_spin_enter(lock); 1846 SLIST_INSERT_HEAD(&hh->hh_list, pve, pve_hash); 1847 mutex_spin_exit(lock); 1848 1849 LIST_INSERT_HEAD(&pp->pp_head.pvh_list, pve, pve_list); 1850 } 1851 1852 /* 1853 * pmap_enter_pv: enter a mapping onto a pv_head lst 1854 * 1855 * => caller should adjust ptp's wire_count before calling 1856 * => caller has preallocated pve and *sparepve for us 1857 */ 1858 1859 static struct pv_entry * 1860 pmap_enter_pv(struct pmap_page *pp, struct pv_entry *pve, 1861 struct pv_entry **sparepve, struct vm_page *ptp, vaddr_t va) 1862 { 1863 1864 KASSERT(ptp == NULL || ptp->wire_count >= 2); 1865 KASSERT(ptp == NULL || ptp->uobject != NULL); 1866 KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset); 1867 1868 if ((pp->pp_flags & PP_EMBEDDED) == 0) { 1869 if (LIST_EMPTY(&pp->pp_head.pvh_list)) { 1870 pp->pp_flags |= PP_EMBEDDED; 1871 pp->pp_pte.pte_ptp = ptp; 1872 pp->pp_pte.pte_va = va; 1873 1874 return pve; 1875 } 1876 } else { 1877 struct pv_entry *pve2; 1878 1879 pve2 = *sparepve; 1880 *sparepve = NULL; 1881 1882 pve2->pve_pte = pp->pp_pte; 1883 pp->pp_flags &= ~PP_EMBEDDED; 1884 LIST_INIT(&pp->pp_head.pvh_list); 1885 insert_pv(pp, pve2); 1886 } 1887 1888 pve->pve_pte.pte_ptp = ptp; 1889 pve->pve_pte.pte_va = va; 1890 insert_pv(pp, pve); 1891 1892 return NULL; 1893 } 1894 1895 /* 1896 * pmap_remove_pv: try to remove a mapping from a pv_list 1897 * 1898 * => caller should adjust ptp's wire_count and free PTP if needed 1899 * => we return the removed pve 1900 */ 1901 1902 static struct pv_entry * 1903 pmap_remove_pv(struct pmap_page *pp, struct vm_page *ptp, vaddr_t va) 1904 { 1905 struct pv_hash_head *hh; 1906 struct pv_entry *pve; 1907 kmutex_t *lock; 1908 u_int hash; 1909 1910 KASSERT(ptp == NULL || ptp->uobject != NULL); 1911 KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset); 1912 1913 if ((pp->pp_flags & PP_EMBEDDED) != 0) { 1914 KASSERT(pp->pp_pte.pte_ptp == ptp); 1915 KASSERT(pp->pp_pte.pte_va == va); 1916 1917 pp->pp_flags &= ~PP_EMBEDDED; 1918 LIST_INIT(&pp->pp_head.pvh_list); 1919 1920 return NULL; 1921 } 1922 1923 hash = pvhash_hash(ptp, va); 1924 lock = pvhash_lock(hash); 1925 hh = pvhash_head(hash); 1926 mutex_spin_enter(lock); 1927 pve = pvhash_remove(hh, ptp, va); 1928 mutex_spin_exit(lock); 1929 1930 LIST_REMOVE(pve, pve_list); 1931 1932 return pve; 1933 } 1934 1935 /* 1936 * p t p f u n c t i o n s 1937 */ 1938 1939 static inline struct vm_page * 1940 pmap_find_ptp(struct pmap *pmap, vaddr_t 
va, paddr_t pa, int level) 1941 { 1942 int lidx = level - 1; 1943 struct vm_page *pg; 1944 1945 KASSERT(mutex_owned(pmap->pm_lock)); 1946 1947 if (pa != (paddr_t)-1 && pmap->pm_ptphint[lidx] && 1948 pa == VM_PAGE_TO_PHYS(pmap->pm_ptphint[lidx])) { 1949 return (pmap->pm_ptphint[lidx]); 1950 } 1951 PMAP_SUBOBJ_LOCK(pmap, lidx); 1952 pg = uvm_pagelookup(&pmap->pm_obj[lidx], ptp_va2o(va, level)); 1953 PMAP_SUBOBJ_UNLOCK(pmap, lidx); 1954 1955 KASSERT(pg == NULL || pg->wire_count >= 1); 1956 return pg; 1957 } 1958 1959 static inline void 1960 pmap_freepage(struct pmap *pmap, struct vm_page *ptp, int level) 1961 { 1962 lwp_t *l; 1963 int lidx; 1964 struct uvm_object *obj; 1965 1966 KASSERT(ptp->wire_count == 1); 1967 1968 lidx = level - 1; 1969 1970 obj = &pmap->pm_obj[lidx]; 1971 pmap_stats_update(pmap, -1, 0); 1972 if (lidx != 0) 1973 mutex_enter(obj->vmobjlock); 1974 if (pmap->pm_ptphint[lidx] == ptp) 1975 pmap->pm_ptphint[lidx] = TAILQ_FIRST(&obj->memq); 1976 ptp->wire_count = 0; 1977 uvm_pagerealloc(ptp, NULL, 0); 1978 l = curlwp; 1979 KASSERT((l->l_pflag & LP_INTR) == 0); 1980 VM_PAGE_TO_PP(ptp)->pp_link = l->l_md.md_gc_ptp; 1981 l->l_md.md_gc_ptp = ptp; 1982 if (lidx != 0) 1983 mutex_exit(obj->vmobjlock); 1984 } 1985 1986 static void 1987 pmap_free_ptp(struct pmap *pmap, struct vm_page *ptp, vaddr_t va, 1988 pt_entry_t *ptes, pd_entry_t * const *pdes) 1989 { 1990 unsigned long index; 1991 int level; 1992 vaddr_t invaladdr; 1993 pd_entry_t opde; 1994 1995 KASSERT(pmap != pmap_kernel()); 1996 KASSERT(mutex_owned(pmap->pm_lock)); 1997 KASSERT(kpreempt_disabled()); 1998 1999 level = 1; 2000 do { 2001 index = pl_i(va, level + 1); 2002 opde = pmap_pte_testset(&pdes[level - 1][index], 0); 2003 #if defined(XEN) 2004 # if defined(__x86_64__) 2005 /* 2006 * If ptp is a L3 currently mapped in kernel space, 2007 * on any cpu, clear it before freeing 2008 */ 2009 if (level == PTP_LEVELS - 1) { 2010 /* 2011 * Update the per-cpu PD on all cpus the current 2012 * pmap is active on 2013 */ 2014 xen_kpm_sync(pmap, index); 2015 } 2016 # endif /*__x86_64__ */ 2017 invaladdr = level == 1 ? (vaddr_t)ptes : 2018 (vaddr_t)pdes[level - 2]; 2019 pmap_tlb_shootdown(pmap, invaladdr + index * PAGE_SIZE, 2020 opde, TLBSHOOT_FREE_PTP1); 2021 pmap_tlb_shootnow(); 2022 #else /* XEN */ 2023 invaladdr = level == 1 ? 
(vaddr_t)ptes : 2024 (vaddr_t)pdes[level - 2]; 2025 pmap_tlb_shootdown(pmap, invaladdr + index * PAGE_SIZE, 2026 opde, TLBSHOOT_FREE_PTP1); 2027 #endif /* XEN */ 2028 pmap_freepage(pmap, ptp, level); 2029 if (level < PTP_LEVELS - 1) { 2030 ptp = pmap_find_ptp(pmap, va, (paddr_t)-1, level + 1); 2031 ptp->wire_count--; 2032 if (ptp->wire_count > 1) 2033 break; 2034 } 2035 } while (++level < PTP_LEVELS); 2036 pmap_pte_flush(); 2037 } 2038 2039 /* 2040 * pmap_get_ptp: get a PTP (if there isn't one, allocate a new one) 2041 * 2042 * => pmap should NOT be pmap_kernel() 2043 * => pmap should be locked 2044 * => preemption should be disabled 2045 */ 2046 2047 static struct vm_page * 2048 pmap_get_ptp(struct pmap *pmap, vaddr_t va, pd_entry_t * const *pdes, int flags) 2049 { 2050 struct vm_page *ptp; 2051 struct { 2052 struct vm_page *pg; 2053 bool new; 2054 } pt[PTP_LEVELS + 1]; 2055 int i, aflags; 2056 unsigned long index; 2057 pd_entry_t *pva; 2058 paddr_t pa; 2059 struct uvm_object *obj; 2060 voff_t off; 2061 2062 KASSERT(pmap != pmap_kernel()); 2063 KASSERT(mutex_owned(pmap->pm_lock)); 2064 KASSERT(kpreempt_disabled()); 2065 2066 /* 2067 * Loop through all page table levels allocating a page 2068 * for any level where we don't already have one. 2069 */ 2070 memset(pt, 0, sizeof(pt)); 2071 aflags = ((flags & PMAP_CANFAIL) ? 0 : UVM_PGA_USERESERVE) | 2072 UVM_PGA_ZERO; 2073 for (i = PTP_LEVELS; i > 1; i--) { 2074 obj = &pmap->pm_obj[i - 2]; 2075 off = ptp_va2o(va, i - 1); 2076 2077 PMAP_SUBOBJ_LOCK(pmap, i - 2); 2078 pt[i].pg = uvm_pagelookup(obj, off); 2079 if (pt[i].pg == NULL) { 2080 pt[i].pg = uvm_pagealloc(obj, off, NULL, aflags); 2081 pt[i].new = true; 2082 } 2083 PMAP_SUBOBJ_UNLOCK(pmap, i - 2); 2084 2085 if (pt[i].pg == NULL) 2086 goto fail; 2087 } 2088 2089 /* 2090 * Now that we have all the pages looked up or allocated, 2091 * loop through again installing any new ones into the tree. 2092 */ 2093 for (i = PTP_LEVELS; i > 1; i--) { 2094 index = pl_i(va, i); 2095 pva = pdes[i - 2]; 2096 2097 if (pmap_valid_entry(pva[index])) { 2098 KASSERT(!pt[i].new); 2099 continue; 2100 } 2101 2102 ptp = pt[i].pg; 2103 ptp->flags &= ~PG_BUSY; /* never busy */ 2104 ptp->wire_count = 1; 2105 pmap->pm_ptphint[i - 2] = ptp; 2106 pa = VM_PAGE_TO_PHYS(ptp); 2107 pmap_pte_set(&pva[index], (pd_entry_t) 2108 (pmap_pa2pte(pa) | PG_u | PG_RW | PG_V)); 2109 #if defined(XEN) && defined(__x86_64__) 2110 if (i == PTP_LEVELS) { 2111 2112 /* 2113 * Update the per-cpu PD on all cpus the current 2114 * pmap is active on 2115 */ 2116 xen_kpm_sync(pmap, index); 2117 } 2118 #endif 2119 pmap_pte_flush(); 2120 pmap_stats_update(pmap, 1, 0); 2121 2122 /* 2123 * If we're not in the top level, increase the 2124 * wire count of the parent page. 2125 */ 2126 if (i < PTP_LEVELS) { 2127 pt[i + 1].pg->wire_count++; 2128 } 2129 } 2130 ptp = pt[2].pg; 2131 KASSERT(ptp != NULL); 2132 pmap->pm_ptphint[0] = ptp; 2133 return ptp; 2134 2135 /* 2136 * Allocation of a ptp failed, free any others that we just allocated. 2137 */ 2138 fail: 2139 for (i = PTP_LEVELS; i > 1; i--) { 2140 if (pt[i].pg == NULL) { 2141 break; 2142 } 2143 if (!pt[i].new) { 2144 continue; 2145 } 2146 obj = &pmap->pm_obj[i - 2]; 2147 PMAP_SUBOBJ_LOCK(pmap, i - 2); 2148 uvm_pagefree(pt[i].pg); 2149 PMAP_SUBOBJ_UNLOCK(pmap, i - 2); 2150 } 2151 return NULL; 2152 } 2153 2154 /* 2155 * p m a p l i f e c y c l e f u n c t i o n s 2156 */ 2157 2158 /* 2159 * pmap_pdp_ctor: constructor for the PDP cache. 
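 *
 * Registered as the constructor of the pmap_pdp_cache pool cache, so it
 * runs whenever the cache must produce a fresh page directory; it always
 * succeeds and returns 0.  A rough sketch of the registration done at
 * bootstrap time (the wchan, flags and allocator arguments here are
 * assumptions for illustration, not quoted from this file):
 *
 *	pool_cache_bootstrap(&pmap_pdp_cache, PAGE_SIZE * PDP_SIZE,
 *	    0, 0, 0, "pdppl", NULL, IPL_NONE,
 *	    pmap_pdp_ctor, pmap_pdp_dtor, NULL);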
2160 */ 2161 static int 2162 pmap_pdp_ctor(void *arg, void *v, int flags) 2163 { 2164 pd_entry_t *pdir = v; 2165 paddr_t pdirpa = 0; 2166 vaddr_t object; 2167 int i; 2168 2169 #if !defined(XEN) || !defined(__x86_64__) 2170 int npde; 2171 #endif 2172 #ifdef XEN 2173 int s; 2174 #endif 2175 2176 /* 2177 * NOTE: The `pmaps_lock' is held when the PDP is allocated. 2178 */ 2179 2180 #if defined(XEN) && defined(__x86_64__) 2181 /* Fetch the physical address of the page directory */ 2182 (void)pmap_extract(pmap_kernel(), (vaddr_t)pdir, &pdirpa); 2183 2184 /* Zero the area */ 2185 memset(pdir, 0, PAGE_SIZE); /* Xen wants a clean page */ 2186 2187 /* 2188 * This pdir will NEVER be active in kernel mode, so mark 2189 * recursive entry invalid. 2190 */ 2191 pdir[PDIR_SLOT_PTE] = pmap_pa2pte(pdirpa); 2192 2193 /* 2194 * PDP constructed this way won't be for the kernel, hence we 2195 * don't put kernel mappings on Xen. 2196 * 2197 * But we need to make pmap_create() happy, so put a dummy 2198 * (without PG_V) value at the right place. 2199 */ 2200 pdir[PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1] - 1] = 2201 (pd_entry_t)-1 & PG_FRAME; 2202 #else /* XEN && __x86_64__*/ 2203 /* Zero the area */ 2204 memset(pdir, 0, PDIR_SLOT_PTE * sizeof(pd_entry_t)); 2205 2206 object = (vaddr_t)v; 2207 for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) { 2208 /* Fetch the physical address of the page directory */ 2209 (void)pmap_extract(pmap_kernel(), object, &pdirpa); 2210 2211 /* Put in recursive PDE to map the PTEs */ 2212 pdir[PDIR_SLOT_PTE + i] = pmap_pa2pte(pdirpa) | PG_V | 2213 pmap_pg_nx; 2214 #ifndef XEN 2215 pdir[PDIR_SLOT_PTE + i] |= PG_KW; 2216 #endif 2217 } 2218 2219 /* Copy the kernel's top level PDE */ 2220 npde = nkptp[PTP_LEVELS - 1]; 2221 2222 memcpy(&pdir[PDIR_SLOT_KERN], &PDP_BASE[PDIR_SLOT_KERN], 2223 npde * sizeof(pd_entry_t)); 2224 2225 /* Zero the rest */ 2226 memset(&pdir[PDIR_SLOT_KERN + npde], 0, (PAGE_SIZE * PDP_SIZE) - 2227 (PDIR_SLOT_KERN + npde) * sizeof(pd_entry_t)); 2228 2229 if (VM_MIN_KERNEL_ADDRESS != KERNBASE) { 2230 int idx = pl_i(KERNBASE, PTP_LEVELS); 2231 pdir[idx] = PDP_BASE[idx]; 2232 } 2233 2234 #ifdef __HAVE_DIRECT_MAP 2235 memcpy(&pdir[PDIR_SLOT_DIRECT], &PDP_BASE[PDIR_SLOT_DIRECT], 2236 NL4_SLOT_DIRECT * sizeof(pd_entry_t)); 2237 #endif 2238 #endif /* XEN && __x86_64__*/ 2239 2240 #ifdef XEN 2241 s = splvm(); 2242 object = (vaddr_t)v; 2243 pmap_protect(pmap_kernel(), object, object + (PAGE_SIZE * PDP_SIZE), 2244 VM_PROT_READ); 2245 pmap_update(pmap_kernel()); 2246 for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) { 2247 /* 2248 * pin as L2/L4 page, we have to do the page with the 2249 * PDIR_SLOT_PTE entries last 2250 */ 2251 #ifdef PAE 2252 if (i == l2tol3(PDIR_SLOT_PTE)) 2253 continue; 2254 #endif 2255 2256 (void) pmap_extract(pmap_kernel(), object, &pdirpa); 2257 #ifdef __x86_64__ 2258 xpq_queue_pin_l4_table(xpmap_ptom_masked(pdirpa)); 2259 #else 2260 xpq_queue_pin_l2_table(xpmap_ptom_masked(pdirpa)); 2261 #endif 2262 } 2263 #ifdef PAE 2264 object = ((vaddr_t)pdir) + PAGE_SIZE * l2tol3(PDIR_SLOT_PTE); 2265 (void)pmap_extract(pmap_kernel(), object, &pdirpa); 2266 xpq_queue_pin_l2_table(xpmap_ptom_masked(pdirpa)); 2267 #endif 2268 splx(s); 2269 #endif /* XEN */ 2270 2271 return (0); 2272 } 2273 2274 /* 2275 * pmap_pdp_dtor: destructor for the PDP cache. 
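 *
 * Undoes the Xen-specific work of the constructor: each page of the
 * directory is unpinned and mapped read/write again so the pool can
 * recycle or free it.  On native hardware there is nothing to undo and
 * the body compiles to nothing.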
2276 */ 2277 2278 static void 2279 pmap_pdp_dtor(void *arg, void *v) 2280 { 2281 #ifdef XEN 2282 paddr_t pdirpa = 0; /* XXX: GCC */ 2283 vaddr_t object = (vaddr_t)v; 2284 int i; 2285 int s = splvm(); 2286 pt_entry_t *pte; 2287 2288 for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) { 2289 /* fetch the physical address of the page directory. */ 2290 (void) pmap_extract(pmap_kernel(), object, &pdirpa); 2291 /* unpin page table */ 2292 xpq_queue_unpin_table(xpmap_ptom_masked(pdirpa)); 2293 } 2294 object = (vaddr_t)v; 2295 for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) { 2296 /* Set page RW again */ 2297 pte = kvtopte(object); 2298 pmap_pte_set(pte, *pte | PG_RW); 2299 xen_bcast_invlpg((vaddr_t)object); 2300 } 2301 splx(s); 2302 #endif /* XEN */ 2303 } 2304 2305 #ifdef PAE 2306 2307 /* pmap_pdp_alloc: Allocate a page for the pdp memory pool. */ 2308 2309 static void * 2310 pmap_pdp_alloc(struct pool *pp, int flags) 2311 { 2312 return (void *)uvm_km_alloc(kernel_map, 2313 PAGE_SIZE * PDP_SIZE, PAGE_SIZE * PDP_SIZE, 2314 ((flags & PR_WAITOK) ? 0 : UVM_KMF_NOWAIT | UVM_KMF_TRYLOCK) 2315 | UVM_KMF_WIRED); 2316 } 2317 2318 /* 2319 * pmap_pdp_free: free a PDP 2320 */ 2321 2322 static void 2323 pmap_pdp_free(struct pool *pp, void *v) 2324 { 2325 uvm_km_free(kernel_map, (vaddr_t)v, PAGE_SIZE * PDP_SIZE, 2326 UVM_KMF_WIRED); 2327 } 2328 #endif /* PAE */ 2329 2330 /* 2331 * pmap_create: create a pmap object. 2332 */ 2333 struct pmap * 2334 pmap_create(void) 2335 { 2336 struct pmap *pmap; 2337 int i; 2338 2339 pmap = pool_cache_get(&pmap_cache, PR_WAITOK); 2340 2341 /* init uvm_object */ 2342 for (i = 0; i < PTP_LEVELS - 1; i++) { 2343 mutex_init(&pmap->pm_obj_lock[i], MUTEX_DEFAULT, IPL_NONE); 2344 uvm_obj_init(&pmap->pm_obj[i], NULL, false, 1); 2345 uvm_obj_setlock(&pmap->pm_obj[i], &pmap->pm_obj_lock[i]); 2346 pmap->pm_ptphint[i] = NULL; 2347 } 2348 pmap->pm_stats.wired_count = 0; 2349 /* count the PDP allocd below */ 2350 pmap->pm_stats.resident_count = PDP_SIZE; 2351 #if !defined(__x86_64__) 2352 pmap->pm_hiexec = 0; 2353 #endif /* !defined(__x86_64__) */ 2354 pmap->pm_flags = 0; 2355 pmap->pm_gc_ptp = NULL; 2356 2357 kcpuset_create(&pmap->pm_cpus, true); 2358 kcpuset_create(&pmap->pm_kernel_cpus, true); 2359 #ifdef XEN 2360 kcpuset_create(&pmap->pm_xen_ptp_cpus, true); 2361 #endif 2362 /* init the LDT */ 2363 pmap->pm_ldt = NULL; 2364 pmap->pm_ldt_len = 0; 2365 pmap->pm_ldt_sel = GSYSSEL(GLDT_SEL, SEL_KPL); 2366 2367 /* allocate PDP */ 2368 try_again: 2369 pmap->pm_pdir = pool_cache_get(&pmap_pdp_cache, PR_WAITOK); 2370 2371 mutex_enter(&pmaps_lock); 2372 2373 if (pmap->pm_pdir[PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1] - 1] == 0) { 2374 mutex_exit(&pmaps_lock); 2375 pool_cache_destruct_object(&pmap_pdp_cache, pmap->pm_pdir); 2376 goto try_again; 2377 } 2378 2379 for (i = 0; i < PDP_SIZE; i++) 2380 pmap->pm_pdirpa[i] = 2381 pmap_pte2pa(pmap->pm_pdir[PDIR_SLOT_PTE + i]); 2382 2383 LIST_INSERT_HEAD(&pmaps, pmap, pm_list); 2384 2385 mutex_exit(&pmaps_lock); 2386 2387 return (pmap); 2388 } 2389 2390 /* 2391 * pmap_free_ptps: put a list of ptps back to the freelist. 2392 */ 2393 2394 void 2395 pmap_free_ptps(struct vm_page *empty_ptps) 2396 { 2397 struct vm_page *ptp; 2398 struct pmap_page *pp; 2399 2400 while ((ptp = empty_ptps) != NULL) { 2401 pp = VM_PAGE_TO_PP(ptp); 2402 empty_ptps = pp->pp_link; 2403 LIST_INIT(&pp->pp_head.pvh_list); 2404 uvm_pagefree(ptp); 2405 } 2406 } 2407 2408 /* 2409 * pmap_check_ptps: verify that none of the pmap's page table objects 2410 * have any pages allocated to them. 
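 *
 * Called from pmap_destroy() as a cheap sanity check that every PTP has
 * been freed before the pmap itself is handed back to the pool.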
2411 */ 2412 2413 static inline void 2414 pmap_check_ptps(struct pmap *pmap) 2415 { 2416 int i; 2417 2418 for (i = 0; i < PTP_LEVELS - 1; i++) { 2419 KASSERT(pmap->pm_obj[i].uo_npages == 0); 2420 KASSERT(TAILQ_EMPTY(&pmap->pm_obj[i].memq)); 2421 } 2422 } 2423 2424 /* 2425 * pmap_destroy: drop reference count on pmap. free pmap if 2426 * reference count goes to zero. 2427 */ 2428 2429 void 2430 pmap_destroy(struct pmap *pmap) 2431 { 2432 lwp_t *l; 2433 int i; 2434 2435 /* 2436 * If we have torn down this pmap, process deferred frees and 2437 * invalidations. Free now if the system is low on memory. 2438 * Otherwise, free when the pmap is destroyed thus avoiding a 2439 * TLB shootdown. 2440 */ 2441 l = curlwp; 2442 if (__predict_false(l->l_md.md_gc_pmap == pmap)) { 2443 pmap_check_ptps(pmap); 2444 if (uvmexp.free < uvmexp.freetarg) { 2445 pmap_update(pmap); 2446 } else { 2447 KASSERT(pmap->pm_gc_ptp == NULL); 2448 pmap->pm_gc_ptp = l->l_md.md_gc_ptp; 2449 l->l_md.md_gc_ptp = NULL; 2450 l->l_md.md_gc_pmap = NULL; 2451 } 2452 } 2453 2454 /* 2455 * drop reference count 2456 */ 2457 2458 if (atomic_dec_uint_nv(&pmap->pm_obj[0].uo_refs) > 0) { 2459 return; 2460 } 2461 2462 #ifdef DIAGNOSTIC 2463 CPU_INFO_ITERATOR cii; 2464 struct cpu_info *ci; 2465 2466 for (CPU_INFO_FOREACH(cii, ci)) { 2467 if (ci->ci_pmap == pmap) 2468 panic("destroying pmap being used"); 2469 #if defined(XEN) && defined(__x86_64__) 2470 for (i = 0; i < PDIR_SLOT_PTE; i++) { 2471 if (pmap->pm_pdir[i] != 0 && 2472 ci->ci_kpm_pdir[i] == pmap->pm_pdir[i]) { 2473 printf("pmap_destroy(%p) pmap_kernel %p " 2474 "curcpu %d cpu %d ci_pmap %p " 2475 "ci->ci_kpm_pdir[%d]=%" PRIx64 2476 " pmap->pm_pdir[%d]=%" PRIx64 "\n", 2477 pmap, pmap_kernel(), curcpu()->ci_index, 2478 ci->ci_index, ci->ci_pmap, 2479 i, ci->ci_kpm_pdir[i], 2480 i, pmap->pm_pdir[i]); 2481 panic("%s: used pmap", __func__); 2482 } 2483 } 2484 #endif 2485 } 2486 #endif /* DIAGNOSTIC */ 2487 2488 /* 2489 * Reference count is zero, free pmap resources and then free pmap. 2490 * First, remove it from global list of pmaps. 2491 */ 2492 2493 mutex_enter(&pmaps_lock); 2494 LIST_REMOVE(pmap, pm_list); 2495 mutex_exit(&pmaps_lock); 2496 2497 /* 2498 * Process deferred PTP frees. No TLB shootdown required, as the 2499 * PTP pages are no longer visible to any CPU. 2500 */ 2501 2502 pmap_free_ptps(pmap->pm_gc_ptp); 2503 2504 pool_cache_put(&pmap_pdp_cache, pmap->pm_pdir); 2505 2506 #ifdef USER_LDT 2507 if (pmap->pm_ldt != NULL) { 2508 /* 2509 * no need to switch the LDT; this address space is gone, 2510 * nothing is using it. 2511 * 2512 * No need to lock the pmap for ldt_free (or anything else), 2513 * we're the last one to use it. 2514 */ 2515 mutex_enter(&cpu_lock); 2516 ldt_free(pmap->pm_ldt_sel); 2517 mutex_exit(&cpu_lock); 2518 uvm_km_free(kernel_map, (vaddr_t)pmap->pm_ldt, 2519 pmap->pm_ldt_len, UVM_KMF_WIRED); 2520 } 2521 #endif 2522 2523 for (i = 0; i < PTP_LEVELS - 1; i++) { 2524 uvm_obj_destroy(&pmap->pm_obj[i], false); 2525 mutex_destroy(&pmap->pm_obj_lock[i]); 2526 } 2527 kcpuset_destroy(pmap->pm_cpus); 2528 kcpuset_destroy(pmap->pm_kernel_cpus); 2529 #ifdef XEN 2530 kcpuset_destroy(pmap->pm_xen_ptp_cpus); 2531 #endif 2532 2533 pmap_check_ptps(pmap); 2534 pool_cache_put(&pmap_cache, pmap); 2535 } 2536 2537 /* 2538 * pmap_remove_all: pmap is being torn down by the current thread. 2539 * avoid unnecessary invalidations. 
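 *
 * Marking the pmap in md_gc_pmap lets the removals that follow treat
 * this as a dying address space: freed PTPs are batched on the LWP and
 * released later, so invalidations that would only benefit this pmap
 * can be skipped.  The expected order of operations at teardown is
 * roughly (a sketch of the caller side, not a literal quote of the uvm
 * code):
 *
 *	pmap_remove_all(pmap);
 *	... pmap_remove(pmap, sva, eva) for each map entry ...
 *	pmap_update(pmap);
 *	pmap_destroy(pmap);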
2540 */ 2541 2542 void 2543 pmap_remove_all(struct pmap *pmap) 2544 { 2545 lwp_t *l = curlwp; 2546 2547 KASSERT(l->l_md.md_gc_pmap == NULL); 2548 2549 l->l_md.md_gc_pmap = pmap; 2550 } 2551 2552 #if defined(PMAP_FORK) 2553 /* 2554 * pmap_fork: perform any necessary data structure manipulation when 2555 * a VM space is forked. 2556 */ 2557 2558 void 2559 pmap_fork(struct pmap *pmap1, struct pmap *pmap2) 2560 { 2561 #ifdef USER_LDT 2562 union descriptor *new_ldt; 2563 size_t len; 2564 int sel; 2565 2566 if (__predict_true(pmap1->pm_ldt == NULL)) { 2567 return; 2568 } 2569 2570 /* 2571 * Copy the LDT into the new process. 2572 * 2573 * Read pmap1's ldt pointer and length unlocked; if it changes 2574 * behind our back we'll retry. This will starve if there's a 2575 * stream of LDT changes in another thread but that should not 2576 * happen. 2577 */ 2578 2579 retry: 2580 if (pmap1->pm_ldt != NULL) { 2581 len = pmap1->pm_ldt_len; 2582 /* Allocate space for the new process's LDT */ 2583 new_ldt = (union descriptor *)uvm_km_alloc(kernel_map, len, 0, 2584 UVM_KMF_WIRED); 2585 if (new_ldt == NULL) { 2586 printf("WARNING: %s: unable to allocate LDT space\n", 2587 __func__); 2588 return; 2589 } 2590 mutex_enter(&cpu_lock); 2591 /* Get a GDT slot for it */ 2592 sel = ldt_alloc(new_ldt, len); 2593 if (sel == -1) { 2594 mutex_exit(&cpu_lock); 2595 uvm_km_free(kernel_map, (vaddr_t)new_ldt, len, 2596 UVM_KMF_WIRED); 2597 printf("WARNING: %s: unable to allocate LDT selector\n", 2598 __func__); 2599 return; 2600 } 2601 } else { 2602 /* Wasn't anything there after all. */ 2603 len = -1; 2604 new_ldt = NULL; 2605 sel = -1; 2606 mutex_enter(&cpu_lock); 2607 } 2608 2609 /* If there's still something there now that we have cpu_lock... */ 2610 if (pmap1->pm_ldt != NULL) { 2611 if (len != pmap1->pm_ldt_len) { 2612 /* Oops, it changed. Drop what we did and try again */ 2613 if (len != -1) { 2614 ldt_free(sel); 2615 uvm_km_free(kernel_map, (vaddr_t)new_ldt, 2616 len, UVM_KMF_WIRED); 2617 } 2618 mutex_exit(&cpu_lock); 2619 goto retry; 2620 } 2621 2622 /* Copy the LDT data and install it in pmap2 */ 2623 memcpy(new_ldt, pmap1->pm_ldt, len); 2624 pmap2->pm_ldt = new_ldt; 2625 pmap2->pm_ldt_len = pmap1->pm_ldt_len; 2626 pmap2->pm_ldt_sel = sel; 2627 len = -1; 2628 } 2629 2630 if (len != -1) { 2631 /* There wasn't still something there, so mop up */ 2632 ldt_free(sel); 2633 mutex_exit(&cpu_lock); 2634 uvm_km_free(kernel_map, (vaddr_t)new_ldt, len, 2635 UVM_KMF_WIRED); 2636 } else { 2637 mutex_exit(&cpu_lock); 2638 } 2639 #endif /* USER_LDT */ 2640 } 2641 #endif /* PMAP_FORK */ 2642 2643 #ifdef USER_LDT 2644 2645 /* 2646 * pmap_ldt_xcall: cross call used by pmap_ldt_sync. if the named pmap 2647 * is active, reload LDTR. 2648 */ 2649 static void 2650 pmap_ldt_xcall(void *arg1, void *arg2) 2651 { 2652 struct pmap *pm; 2653 2654 kpreempt_disable(); 2655 pm = arg1; 2656 if (curcpu()->ci_pmap == pm) { 2657 lldt(pm->pm_ldt_sel); 2658 } 2659 kpreempt_enable(); 2660 } 2661 2662 /* 2663 * pmap_ldt_sync: LDT selector for the named pmap is changing. swap 2664 * in the new selector on all CPUs. 2665 */ 2666 void 2667 pmap_ldt_sync(struct pmap *pm) 2668 { 2669 uint64_t where; 2670 2671 KASSERT(mutex_owned(&cpu_lock)); 2672 2673 pmap_ldt_evcnt.ev_count++; 2674 where = xc_broadcast(0, pmap_ldt_xcall, pm, NULL); 2675 xc_wait(where); 2676 } 2677 2678 /* 2679 * pmap_ldt_cleanup: if the pmap has a local LDT, deallocate it, and 2680 * restore the default. 
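 *
 * Everything happens under cpu_lock: the default GLDT selector is put
 * back first, pmap_ldt_sync() reloads LDTR on every CPU running this
 * pmap, and only then are the old descriptor memory and its slot
 * released.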
2681 */ 2682 2683 void 2684 pmap_ldt_cleanup(struct lwp *l) 2685 { 2686 pmap_t pmap = l->l_proc->p_vmspace->vm_map.pmap; 2687 union descriptor *dp = NULL; 2688 size_t len = 0; 2689 int sel = -1; 2690 2691 if (__predict_true(pmap->pm_ldt == NULL)) { 2692 return; 2693 } 2694 2695 mutex_enter(&cpu_lock); 2696 if (pmap->pm_ldt != NULL) { 2697 sel = pmap->pm_ldt_sel; 2698 dp = pmap->pm_ldt; 2699 len = pmap->pm_ldt_len; 2700 pmap->pm_ldt_sel = GSYSSEL(GLDT_SEL, SEL_KPL); 2701 pmap->pm_ldt = NULL; 2702 pmap->pm_ldt_len = 0; 2703 pmap_ldt_sync(pmap); 2704 ldt_free(sel); 2705 uvm_km_free(kernel_map, (vaddr_t)dp, len, UVM_KMF_WIRED); 2706 } 2707 mutex_exit(&cpu_lock); 2708 } 2709 #endif /* USER_LDT */ 2710 2711 /* 2712 * pmap_activate: activate a process' pmap 2713 * 2714 * => must be called with kernel preemption disabled 2715 * => if lwp is the curlwp, then set ci_want_pmapload so that 2716 * actual MMU context switch will be done by pmap_load() later 2717 */ 2718 2719 void 2720 pmap_activate(struct lwp *l) 2721 { 2722 struct cpu_info *ci; 2723 struct pmap *pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map); 2724 2725 KASSERT(kpreempt_disabled()); 2726 2727 ci = curcpu(); 2728 2729 if (l == ci->ci_curlwp) { 2730 KASSERT(ci->ci_want_pmapload == 0); 2731 KASSERT(ci->ci_tlbstate != TLBSTATE_VALID); 2732 2733 /* 2734 * no need to switch to kernel vmspace because 2735 * it's a subset of any vmspace. 2736 */ 2737 2738 if (pmap == pmap_kernel()) { 2739 ci->ci_want_pmapload = 0; 2740 return; 2741 } 2742 2743 ci->ci_want_pmapload = 1; 2744 } 2745 } 2746 2747 /* 2748 * pmap_reactivate: try to regain reference to the pmap. 2749 * 2750 * => Must be called with kernel preemption disabled. 2751 */ 2752 2753 static bool 2754 pmap_reactivate(struct pmap *pmap) 2755 { 2756 struct cpu_info * const ci = curcpu(); 2757 const cpuid_t cid = cpu_index(ci); 2758 bool result; 2759 2760 KASSERT(kpreempt_disabled()); 2761 #if defined(XEN) && defined(__x86_64__) 2762 KASSERT(pmap_pdirpa(pmap, 0) == ci->ci_xen_current_user_pgd); 2763 #elif defined(PAE) 2764 KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(ci->ci_pae_l3_pdir[0])); 2765 #elif !defined(XEN) 2766 KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(rcr3())); 2767 #endif 2768 2769 /* 2770 * If we still have a lazy reference to this pmap, we can assume 2771 * that there was no TLB shootdown for this pmap in the meantime. 2772 * 2773 * The order of events here is important as we must synchronize 2774 * with TLB shootdown interrupts. Declare interest in invalidations 2775 * (TLBSTATE_VALID) and then check the CPU set, which the IPIs can 2776 * change only when the state is TLBSTATE_LAZY. 2777 */ 2778 2779 ci->ci_tlbstate = TLBSTATE_VALID; 2780 KASSERT(kcpuset_isset(pmap->pm_kernel_cpus, cid)); 2781 2782 if (kcpuset_isset(pmap->pm_cpus, cid)) { 2783 /* We have the reference, state is valid. */ 2784 result = true; 2785 } else { 2786 /* Must reload the TLB. */ 2787 kcpuset_atomic_set(pmap->pm_cpus, cid); 2788 result = false; 2789 } 2790 return result; 2791 } 2792 2793 /* 2794 * pmap_load: perform the actual pmap switch, i.e. fill in %cr3 register 2795 * and relevant LDT info. 2796 * 2797 * Ensures that the current process' pmap is loaded on the current CPU's 2798 * MMU and that there are no stale TLB entries. 2799 * 2800 * => The caller should disable kernel preemption or do check-and-retry 2801 * to prevent a preemption from undoing our efforts. 2802 * => This function may block. 
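 * => A caller-side sketch of the check-and-retry idiom mentioned above
 *    (illustrative only, not a quote of any particular call site):
 *
 *	while (curcpu()->ci_want_pmapload)
 *		pmap_load();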
2803 */ 2804 void 2805 pmap_load(void) 2806 { 2807 struct cpu_info *ci; 2808 struct pmap *pmap, *oldpmap; 2809 struct lwp *l; 2810 struct pcb *pcb; 2811 cpuid_t cid; 2812 uint64_t ncsw; 2813 2814 kpreempt_disable(); 2815 retry: 2816 ci = curcpu(); 2817 if (!ci->ci_want_pmapload) { 2818 kpreempt_enable(); 2819 return; 2820 } 2821 l = ci->ci_curlwp; 2822 ncsw = l->l_ncsw; 2823 2824 /* should be able to take ipis. */ 2825 KASSERT(ci->ci_ilevel < IPL_HIGH); 2826 #ifdef XEN 2827 /* Check to see if interrupts are enabled (ie; no events are masked) */ 2828 KASSERT(x86_read_psl() == 0); 2829 #else 2830 KASSERT((x86_read_psl() & PSL_I) != 0); 2831 #endif 2832 2833 KASSERT(l != NULL); 2834 pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map); 2835 KASSERT(pmap != pmap_kernel()); 2836 oldpmap = ci->ci_pmap; 2837 pcb = lwp_getpcb(l); 2838 2839 if (pmap == oldpmap) { 2840 if (!pmap_reactivate(pmap)) { 2841 u_int gen = uvm_emap_gen_return(); 2842 2843 /* 2844 * pmap has been changed during deactivated. 2845 * our tlb may be stale. 2846 */ 2847 2848 tlbflush(); 2849 uvm_emap_update(gen); 2850 } 2851 2852 ci->ci_want_pmapload = 0; 2853 kpreempt_enable(); 2854 return; 2855 } 2856 2857 /* 2858 * Acquire a reference to the new pmap and perform the switch. 2859 */ 2860 2861 pmap_reference(pmap); 2862 2863 cid = cpu_index(ci); 2864 kcpuset_atomic_clear(oldpmap->pm_cpus, cid); 2865 kcpuset_atomic_clear(oldpmap->pm_kernel_cpus, cid); 2866 2867 #if defined(XEN) && defined(__x86_64__) 2868 KASSERT(pmap_pdirpa(oldpmap, 0) == ci->ci_xen_current_user_pgd || 2869 oldpmap == pmap_kernel()); 2870 #elif defined(PAE) 2871 KASSERT(pmap_pdirpa(oldpmap, 0) == pmap_pte2pa(ci->ci_pae_l3_pdir[0])); 2872 #elif !defined(XEN) 2873 KASSERT(pmap_pdirpa(oldpmap, 0) == pmap_pte2pa(rcr3())); 2874 #endif 2875 KASSERT(!kcpuset_isset(pmap->pm_cpus, cid)); 2876 KASSERT(!kcpuset_isset(pmap->pm_kernel_cpus, cid)); 2877 2878 /* 2879 * Mark the pmap in use by this CPU. Again, we must synchronize 2880 * with TLB shootdown interrupts, so set the state VALID first, 2881 * then register us for shootdown events on this pmap. 2882 */ 2883 ci->ci_tlbstate = TLBSTATE_VALID; 2884 kcpuset_atomic_set(pmap->pm_cpus, cid); 2885 kcpuset_atomic_set(pmap->pm_kernel_cpus, cid); 2886 ci->ci_pmap = pmap; 2887 2888 /* 2889 * update tss. now that we have registered for invalidations 2890 * from other CPUs, we're good to load the page tables. 2891 */ 2892 #ifdef PAE 2893 pcb->pcb_cr3 = ci->ci_pae_l3_pdirpa; 2894 #else 2895 pcb->pcb_cr3 = pmap_pdirpa(pmap, 0); 2896 #endif 2897 2898 #ifdef i386 2899 #ifndef XEN 2900 ci->ci_tss.tss_ldt = pmap->pm_ldt_sel; 2901 ci->ci_tss.tss_cr3 = pcb->pcb_cr3; 2902 #endif /* !XEN */ 2903 #endif /* i386 */ 2904 2905 lldt(pmap->pm_ldt_sel); 2906 2907 u_int gen = uvm_emap_gen_return(); 2908 cpu_load_pmap(pmap, oldpmap); 2909 uvm_emap_update(gen); 2910 2911 ci->ci_want_pmapload = 0; 2912 2913 /* 2914 * we're now running with the new pmap. drop the reference 2915 * to the old pmap. if we block, we need to go around again. 2916 */ 2917 2918 pmap_destroy(oldpmap); 2919 if (l->l_ncsw != ncsw) { 2920 goto retry; 2921 } 2922 2923 kpreempt_enable(); 2924 } 2925 2926 /* 2927 * pmap_deactivate: deactivate a process' pmap. 2928 * 2929 * => Must be called with kernel preemption disabled (high IPL is enough). 
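 * => Pairs with pmap_activate(): the scheduler deactivates the outgoing
 *    LWP's pmap and activates the incoming one, and the incoming pmap is
 *    only loaded lazily by pmap_load() on the way back to user space.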
2930 */ 2931 void 2932 pmap_deactivate(struct lwp *l) 2933 { 2934 struct pmap *pmap; 2935 struct cpu_info *ci; 2936 2937 KASSERT(kpreempt_disabled()); 2938 2939 if (l != curlwp) { 2940 return; 2941 } 2942 2943 /* 2944 * Wait for pending TLB shootdowns to complete. Necessary because 2945 * TLB shootdown state is per-CPU, and the LWP may be coming off 2946 * the CPU before it has a chance to call pmap_update(), e.g. due 2947 * to kernel preemption or blocking routine in between. 2948 */ 2949 pmap_tlb_shootnow(); 2950 2951 ci = curcpu(); 2952 2953 if (ci->ci_want_pmapload) { 2954 /* 2955 * ci_want_pmapload means that our pmap is not loaded on 2956 * the CPU or TLB might be stale. note that pmap_kernel() 2957 * is always considered loaded. 2958 */ 2959 KASSERT(vm_map_pmap(&l->l_proc->p_vmspace->vm_map) 2960 != pmap_kernel()); 2961 KASSERT(vm_map_pmap(&l->l_proc->p_vmspace->vm_map) 2962 != ci->ci_pmap || ci->ci_tlbstate != TLBSTATE_VALID); 2963 2964 /* 2965 * userspace has not been touched. 2966 * nothing to do here. 2967 */ 2968 2969 ci->ci_want_pmapload = 0; 2970 return; 2971 } 2972 2973 pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map); 2974 2975 if (pmap == pmap_kernel()) { 2976 return; 2977 } 2978 2979 #if defined(XEN) && defined(__x86_64__) 2980 KASSERT(pmap_pdirpa(pmap, 0) == ci->ci_xen_current_user_pgd); 2981 #elif defined(PAE) 2982 KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(ci->ci_pae_l3_pdir[0])); 2983 #elif !defined(XEN) 2984 KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(rcr3())); 2985 #endif 2986 KASSERT(ci->ci_pmap == pmap); 2987 2988 /* 2989 * we aren't interested in TLB invalidations for this pmap, 2990 * at least for the time being. 2991 */ 2992 2993 KASSERT(ci->ci_tlbstate == TLBSTATE_VALID); 2994 ci->ci_tlbstate = TLBSTATE_LAZY; 2995 } 2996 2997 /* 2998 * end of lifecycle functions 2999 */ 3000 3001 /* 3002 * some misc. functions 3003 */ 3004 3005 int 3006 pmap_pdes_invalid(vaddr_t va, pd_entry_t * const *pdes, pd_entry_t *lastpde) 3007 { 3008 int i; 3009 unsigned long index; 3010 pd_entry_t pde; 3011 3012 for (i = PTP_LEVELS; i > 1; i--) { 3013 index = pl_i(va, i); 3014 pde = pdes[i - 2][index]; 3015 if ((pde & PG_V) == 0) 3016 return i; 3017 } 3018 if (lastpde != NULL) 3019 *lastpde = pde; 3020 return 0; 3021 } 3022 3023 /* 3024 * pmap_extract: extract a PA for the given VA 3025 */ 3026 3027 bool 3028 pmap_extract(struct pmap *pmap, vaddr_t va, paddr_t *pap) 3029 { 3030 pt_entry_t *ptes, pte; 3031 pd_entry_t pde; 3032 pd_entry_t * const *pdes; 3033 struct pmap *pmap2; 3034 struct cpu_info *ci; 3035 paddr_t pa; 3036 lwp_t *l; 3037 bool hard, rv; 3038 3039 #ifdef __HAVE_DIRECT_MAP 3040 if (va >= PMAP_DIRECT_BASE && va < PMAP_DIRECT_END) { 3041 if (pap != NULL) { 3042 *pap = va - PMAP_DIRECT_BASE; 3043 } 3044 return true; 3045 } 3046 #endif 3047 3048 rv = false; 3049 pa = 0; 3050 l = curlwp; 3051 3052 kpreempt_disable(); 3053 ci = l->l_cpu; 3054 if (__predict_true(!ci->ci_want_pmapload && ci->ci_pmap == pmap) || 3055 pmap == pmap_kernel()) { 3056 /* 3057 * no need to lock, because it's pmap_kernel() or our 3058 * own pmap and is active. if a user pmap, the caller 3059 * will hold the vm_map write/read locked and so prevent 3060 * entries from disappearing while we are here. ptps 3061 * can disappear via pmap_remove() and pmap_protect(), 3062 * but they are called with the vm_map write locked. 3063 */ 3064 hard = false; 3065 ptes = PTE_BASE; 3066 pdes = normal_pdes; 3067 } else { 3068 /* we lose, do it the hard way. 
*/ 3069 hard = true; 3070 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); 3071 } 3072 if (pmap_pdes_valid(va, pdes, &pde)) { 3073 pte = ptes[pl1_i(va)]; 3074 if (pde & PG_PS) { 3075 pa = (pde & PG_LGFRAME) | (va & (NBPD_L2 - 1)); 3076 rv = true; 3077 } else if (__predict_true((pte & PG_V) != 0)) { 3078 pa = pmap_pte2pa(pte) | (va & (NBPD_L1 - 1)); 3079 rv = true; 3080 } 3081 } 3082 if (__predict_false(hard)) { 3083 pmap_unmap_ptes(pmap, pmap2); 3084 } 3085 kpreempt_enable(); 3086 if (pap != NULL) { 3087 *pap = pa; 3088 } 3089 return rv; 3090 } 3091 3092 3093 /* 3094 * vtophys: virtual address to physical address. For use by 3095 * machine-dependent code only. 3096 */ 3097 3098 paddr_t 3099 vtophys(vaddr_t va) 3100 { 3101 paddr_t pa; 3102 3103 if (pmap_extract(pmap_kernel(), va, &pa) == true) 3104 return (pa); 3105 return (0); 3106 } 3107 3108 __strict_weak_alias(pmap_extract_ma, pmap_extract); 3109 3110 #ifdef XEN 3111 3112 /* 3113 * vtomach: virtual address to machine address. For use by 3114 * machine-dependent code only. 3115 */ 3116 3117 paddr_t 3118 vtomach(vaddr_t va) 3119 { 3120 paddr_t pa; 3121 3122 if (pmap_extract_ma(pmap_kernel(), va, &pa) == true) 3123 return (pa); 3124 return (0); 3125 } 3126 3127 #endif /* XEN */ 3128 3129 /* 3130 * pmap_virtual_space: used during bootup [pmap_steal_memory] to 3131 * determine the bounds of the kernel virtual addess space. 3132 */ 3133 3134 void 3135 pmap_virtual_space(vaddr_t *startp, vaddr_t *endp) 3136 { 3137 *startp = virtual_avail; 3138 *endp = virtual_end; 3139 } 3140 3141 /* 3142 * pmap_zero_page: zero a page 3143 */ 3144 3145 void 3146 pmap_zero_page(paddr_t pa) 3147 { 3148 #if defined(__HAVE_DIRECT_MAP) 3149 pagezero(PMAP_DIRECT_MAP(pa)); 3150 #else 3151 #if defined(XEN) 3152 if (XEN_VERSION_SUPPORTED(3, 4)) 3153 xen_pagezero(pa); 3154 #endif 3155 struct cpu_info *ci; 3156 pt_entry_t *zpte; 3157 vaddr_t zerova; 3158 3159 const pd_entry_t pteflags = PG_V | PG_RW | pmap_pg_nx | PG_M | PG_U; 3160 3161 kpreempt_disable(); 3162 3163 ci = curcpu(); 3164 zerova = ci->vpage[VPAGE_ZER]; 3165 zpte = ci->vpage_pte[VPAGE_ZER]; 3166 3167 KASSERTMSG(!*zpte, "pmap_zero_page: lock botch"); 3168 3169 pmap_pte_set(zpte, pmap_pa2pte(pa) | pteflags); 3170 pmap_pte_flush(); 3171 pmap_update_pg(zerova); /* flush TLB */ 3172 3173 memset((void *)zerova, 0, PAGE_SIZE); 3174 3175 #if defined(DIAGNOSTIC) || defined(XEN) 3176 pmap_pte_set(zpte, 0); /* zap ! */ 3177 pmap_pte_flush(); 3178 #endif 3179 3180 kpreempt_enable(); 3181 #endif /* defined(__HAVE_DIRECT_MAP) */ 3182 } 3183 3184 /* 3185 * pmap_pagezeroidle: the same, for the idle loop page zero'er. 3186 * Returns true if the page was zero'd, false if we aborted for 3187 * some reason. 3188 */ 3189 3190 bool 3191 pmap_pageidlezero(paddr_t pa) 3192 { 3193 #ifdef __HAVE_DIRECT_MAP 3194 KASSERT(cpu_feature[0] & CPUID_SSE2); 3195 return sse2_idlezero_page((void *)PMAP_DIRECT_MAP(pa)); 3196 #else 3197 struct cpu_info *ci; 3198 pt_entry_t *zpte; 3199 vaddr_t zerova; 3200 bool rv; 3201 3202 const pd_entry_t pteflags = PG_V | PG_RW | pmap_pg_nx | PG_M | PG_U; 3203 3204 ci = curcpu(); 3205 zerova = ci->vpage[VPAGE_ZER]; 3206 zpte = ci->vpage_pte[VPAGE_ZER]; 3207 3208 KASSERT(cpu_feature[0] & CPUID_SSE2); 3209 KASSERT(*zpte == 0); 3210 3211 pmap_pte_set(zpte, pmap_pa2pte(pa) | pteflags); 3212 pmap_pte_flush(); 3213 pmap_update_pg(zerova); /* flush TLB */ 3214 3215 rv = sse2_idlezero_page((void *)zerova); 3216 3217 #if defined(DIAGNOSTIC) || defined(XEN) 3218 pmap_pte_set(zpte, 0); /* zap ! 
*/ 3219 pmap_pte_flush(); 3220 #endif 3221 3222 return rv; 3223 #endif 3224 } 3225 3226 /* 3227 * pmap_copy_page: copy a page 3228 */ 3229 3230 void 3231 pmap_copy_page(paddr_t srcpa, paddr_t dstpa) 3232 { 3233 #if defined(__HAVE_DIRECT_MAP) 3234 vaddr_t srcva = PMAP_DIRECT_MAP(srcpa); 3235 vaddr_t dstva = PMAP_DIRECT_MAP(dstpa); 3236 3237 memcpy((void *)dstva, (void *)srcva, PAGE_SIZE); 3238 #else 3239 #if defined(XEN) 3240 if (XEN_VERSION_SUPPORTED(3, 4)) { 3241 xen_copy_page(srcpa, dstpa); 3242 return; 3243 } 3244 #endif 3245 struct cpu_info *ci; 3246 pt_entry_t *srcpte, *dstpte; 3247 vaddr_t srcva, dstva; 3248 3249 const pd_entry_t pteflags = PG_V | PG_RW | pmap_pg_nx | PG_U; 3250 3251 kpreempt_disable(); 3252 3253 ci = curcpu(); 3254 srcva = ci->vpage[VPAGE_SRC]; 3255 dstva = ci->vpage[VPAGE_DST]; 3256 srcpte = ci->vpage_pte[VPAGE_SRC]; 3257 dstpte = ci->vpage_pte[VPAGE_DST]; 3258 3259 KASSERT(*srcpte == 0 && *dstpte == 0); 3260 3261 pmap_pte_set(srcpte, pmap_pa2pte(srcpa) | pteflags); 3262 pmap_pte_set(dstpte, pmap_pa2pte(dstpa) | pteflags | PG_M); 3263 pmap_pte_flush(); 3264 pmap_update_2pg(srcva, dstva); 3265 3266 memcpy((void *)dstva, (void *)srcva, PAGE_SIZE); 3267 3268 #if defined(DIAGNOSTIC) || defined(XEN) 3269 pmap_pte_set(srcpte, 0); 3270 pmap_pte_set(dstpte, 0); 3271 pmap_pte_flush(); 3272 #endif 3273 3274 kpreempt_enable(); 3275 #endif /* defined(__HAVE_DIRECT_MAP) */ 3276 } 3277 3278 static pt_entry_t * 3279 pmap_map_ptp(struct vm_page *ptp) 3280 { 3281 #ifdef __HAVE_DIRECT_MAP 3282 return (void *)PMAP_DIRECT_MAP(VM_PAGE_TO_PHYS(ptp)); 3283 #else 3284 struct cpu_info *ci; 3285 pt_entry_t *ptppte; 3286 vaddr_t ptpva; 3287 3288 KASSERT(kpreempt_disabled()); 3289 3290 #ifndef XEN 3291 const pd_entry_t pteflags = PG_V | PG_RW | pmap_pg_nx | PG_U | PG_M; 3292 #else 3293 const pd_entry_t pteflags = PG_V | pmap_pg_nx | PG_U | PG_M; 3294 #endif 3295 3296 ci = curcpu(); 3297 ptpva = ci->vpage[VPAGE_PTP]; 3298 ptppte = ci->vpage_pte[VPAGE_PTP]; 3299 3300 pmap_pte_set(ptppte, pmap_pa2pte(VM_PAGE_TO_PHYS(ptp)) | pteflags); 3301 3302 pmap_pte_flush(); 3303 pmap_update_pg(ptpva); 3304 3305 return (pt_entry_t *)ptpva; 3306 #endif 3307 } 3308 3309 static void 3310 pmap_unmap_ptp(void) 3311 { 3312 #ifndef __HAVE_DIRECT_MAP 3313 #if defined(DIAGNOSTIC) || defined(XEN) 3314 struct cpu_info *ci; 3315 pt_entry_t *pte; 3316 3317 KASSERT(kpreempt_disabled()); 3318 3319 ci = curcpu(); 3320 pte = ci->vpage_pte[VPAGE_PTP]; 3321 3322 if (*pte != 0) { 3323 pmap_pte_set(pte, 0); 3324 pmap_pte_flush(); 3325 } 3326 #endif 3327 #endif 3328 } 3329 3330 static pt_entry_t * 3331 pmap_map_pte(struct pmap *pmap, struct vm_page *ptp, vaddr_t va) 3332 { 3333 3334 KASSERT(kpreempt_disabled()); 3335 if (pmap_is_curpmap(pmap)) { 3336 return &PTE_BASE[pl1_i(va)]; /* (k)vtopte */ 3337 } 3338 KASSERT(ptp != NULL); 3339 return pmap_map_ptp(ptp) + pl1_pi(va); 3340 } 3341 3342 static void 3343 pmap_unmap_pte(void) 3344 { 3345 3346 KASSERT(kpreempt_disabled()); 3347 3348 pmap_unmap_ptp(); 3349 } 3350 3351 /* 3352 * p m a p r e m o v e f u n c t i o n s 3353 * 3354 * functions that remove mappings 3355 */ 3356 3357 /* 3358 * pmap_remove_ptes: remove PTEs from a PTP 3359 * 3360 * => caller must hold pmap's lock 3361 * => PTP must be mapped into KVA 3362 * => PTP should be null if pmap == pmap_kernel() 3363 * => must be called with kernel preemption disabled 3364 * => returns composite pte if at least one page should be shot down 3365 */ 3366 3367 static void 3368 pmap_remove_ptes(struct pmap *pmap, struct vm_page *ptp, 
vaddr_t ptpva, 3369 vaddr_t startva, vaddr_t endva, struct pv_entry **pv_tofree) 3370 { 3371 pt_entry_t *pte = (pt_entry_t *)ptpva; 3372 3373 KASSERT(pmap == pmap_kernel() || mutex_owned(pmap->pm_lock)); 3374 KASSERT(kpreempt_disabled()); 3375 3376 /* 3377 * note that ptpva points to the PTE that maps startva. this may 3378 * or may not be the first PTE in the PTP. 3379 * 3380 * we loop through the PTP while there are still PTEs to look at 3381 * and the wire_count is greater than 1 (because we use the wire_count 3382 * to keep track of the number of real PTEs in the PTP). 3383 */ 3384 while (startva < endva && (ptp == NULL || ptp->wire_count > 1)) { 3385 (void)pmap_remove_pte(pmap, ptp, pte, startva, pv_tofree); 3386 startva += PAGE_SIZE; 3387 pte++; 3388 } 3389 } 3390 3391 3392 /* 3393 * pmap_remove_pte: remove a single PTE from a PTP. 3394 * 3395 * => caller must hold pmap's lock 3396 * => PTP must be mapped into KVA 3397 * => PTP should be null if pmap == pmap_kernel() 3398 * => returns true if we removed a mapping 3399 * => must be called with kernel preemption disabled 3400 */ 3401 static bool 3402 pmap_remove_pte(struct pmap *pmap, struct vm_page *ptp, pt_entry_t *pte, 3403 vaddr_t va, struct pv_entry **pv_tofree) 3404 { 3405 struct pv_entry *pve; 3406 struct vm_page *pg; 3407 struct pmap_page *pp; 3408 pt_entry_t opte; 3409 3410 KASSERT(pmap == pmap_kernel() || mutex_owned(pmap->pm_lock)); 3411 KASSERT(kpreempt_disabled()); 3412 3413 if (!pmap_valid_entry(*pte)) { 3414 /* VA not mapped. */ 3415 return false; 3416 } 3417 3418 /* Atomically save the old PTE and zap it. */ 3419 opte = pmap_pte_testset(pte, 0); 3420 if (!pmap_valid_entry(opte)) { 3421 return false; 3422 } 3423 3424 pmap_exec_account(pmap, va, opte, 0); 3425 pmap_stats_update_bypte(pmap, 0, opte); 3426 3427 if (ptp) { 3428 /* 3429 * Dropping a PTE. Make sure that the PDE is flushed. 3430 */ 3431 ptp->wire_count--; 3432 if (ptp->wire_count <= 1) { 3433 opte |= PG_U; 3434 } 3435 } 3436 3437 if ((opte & PG_U) != 0) { 3438 pmap_tlb_shootdown(pmap, va, opte, TLBSHOOT_REMOVE_PTE); 3439 } 3440 3441 /* 3442 * If we are not on a pv_head list - we are done. 3443 */ 3444 if ((opte & PG_PVLIST) == 0) { 3445 #ifndef DOM0OPS 3446 KASSERTMSG((PHYS_TO_VM_PAGE(pmap_pte2pa(opte)) == NULL), 3447 "managed page without PG_PVLIST for %#"PRIxVADDR, va); 3448 KASSERTMSG((pmap_pv_tracked(pmap_pte2pa(opte)) == NULL), 3449 "pv-tracked page without PG_PVLIST for %#"PRIxVADDR, va); 3450 #endif 3451 return true; 3452 } 3453 3454 if ((pg = PHYS_TO_VM_PAGE(pmap_pte2pa(opte))) != NULL) { 3455 KASSERT(uvm_page_locked_p(pg)); 3456 pp = VM_PAGE_TO_PP(pg); 3457 } else if ((pp = pmap_pv_tracked(pmap_pte2pa(opte))) == NULL) { 3458 paddr_t pa = pmap_pte2pa(opte); 3459 panic("%s: PG_PVLIST with pv-untracked page" 3460 " va = %#"PRIxVADDR"pa = %#"PRIxPADDR" (%#"PRIxPADDR")", 3461 __func__, va, pa, atop(pa)); 3462 } 3463 3464 /* Sync R/M bits. */ 3465 pp->pp_attrs |= opte; 3466 pve = pmap_remove_pv(pp, ptp, va); 3467 3468 if (pve) { 3469 pve->pve_next = *pv_tofree; 3470 *pv_tofree = pve; 3471 } 3472 return true; 3473 } 3474 3475 /* 3476 * pmap_remove: mapping removal function. 
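 *	removes every mapping in the virtual range [sva, eva).  As with
 *	other removals, callers are expected to finish with pmap_update()
 *	so that deferred TLB shootdowns are processed; a hedged sketch of
 *	a typical call:
 *
 *		pmap_remove(pmap_kernel(), va, va + len);
 *		pmap_update(pmap_kernel());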
3477 * 3478 * => caller should not be holding any pmap locks 3479 */ 3480 3481 void 3482 pmap_remove(struct pmap *pmap, vaddr_t sva, vaddr_t eva) 3483 { 3484 pt_entry_t *ptes; 3485 pd_entry_t pde; 3486 pd_entry_t * const *pdes; 3487 struct pv_entry *pv_tofree = NULL; 3488 bool result; 3489 int i; 3490 paddr_t ptppa; 3491 vaddr_t blkendva, va = sva; 3492 struct vm_page *ptp; 3493 struct pmap *pmap2; 3494 3495 kpreempt_disable(); 3496 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); /* locks pmap */ 3497 3498 /* 3499 * removing one page? take shortcut function. 3500 */ 3501 3502 if (va + PAGE_SIZE == eva) { 3503 if (pmap_pdes_valid(va, pdes, &pde)) { 3504 3505 /* PA of the PTP */ 3506 ptppa = pmap_pte2pa(pde); 3507 3508 /* Get PTP if non-kernel mapping. */ 3509 if (pmap != pmap_kernel()) { 3510 ptp = pmap_find_ptp(pmap, va, ptppa, 1); 3511 KASSERTMSG(ptp != NULL, 3512 "%s: unmanaged PTP detected", __func__); 3513 } else { 3514 /* Never free kernel PTPs. */ 3515 ptp = NULL; 3516 } 3517 3518 result = pmap_remove_pte(pmap, ptp, 3519 &ptes[pl1_i(va)], va, &pv_tofree); 3520 3521 /* 3522 * if mapping removed and the PTP is no longer 3523 * being used, free it! 3524 */ 3525 3526 if (result && ptp && ptp->wire_count <= 1) 3527 pmap_free_ptp(pmap, ptp, va, ptes, pdes); 3528 } 3529 } else for (/* null */ ; va < eva ; va = blkendva) { 3530 int lvl; 3531 3532 /* determine range of block */ 3533 blkendva = x86_round_pdr(va+1); 3534 if (blkendva > eva) 3535 blkendva = eva; 3536 3537 /* 3538 * Our PTE mappings should never be removed with pmap_remove. 3539 * 3540 * XXXmaxv: still needed? 3541 * 3542 * A long term solution is to move the PTEs out of user address 3543 * space, and into kernel address space. Then we can set 3544 * VM_MAXUSER_ADDRESS to be VM_MAX_ADDRESS. 3545 */ 3546 for (i = 0; i < PDP_SIZE; i++) { 3547 if (pl_i(va, PTP_LEVELS) == PDIR_SLOT_PTE+i) 3548 panic("PTE space accessed"); 3549 } 3550 3551 lvl = pmap_pdes_invalid(va, pdes, &pde); 3552 if (lvl != 0) { 3553 /* 3554 * skip a range corresponding to an invalid pde. 3555 */ 3556 blkendva = (va & ptp_masks[lvl - 1]) + nbpd[lvl - 1]; 3557 continue; 3558 } 3559 3560 /* PA of the PTP */ 3561 ptppa = pmap_pte2pa(pde); 3562 3563 /* Get PTP if non-kernel mapping. */ 3564 if (pmap != pmap_kernel()) { 3565 ptp = pmap_find_ptp(pmap, va, ptppa, 1); 3566 KASSERTMSG(ptp != NULL, "%s: unmanaged PTP detected", 3567 __func__); 3568 } else { 3569 /* Never free kernel PTPs. */ 3570 ptp = NULL; 3571 } 3572 3573 pmap_remove_ptes(pmap, ptp, (vaddr_t)&ptes[pl1_i(va)], va, 3574 blkendva, &pv_tofree); 3575 3576 /* if PTP is no longer being used, free it! */ 3577 if (ptp && ptp->wire_count <= 1) { 3578 pmap_free_ptp(pmap, ptp, va, ptes, pdes); 3579 } 3580 } 3581 pmap_unmap_ptes(pmap, pmap2); /* unlock pmap */ 3582 kpreempt_enable(); 3583 3584 /* Now we free unused PVs */ 3585 if (pv_tofree) 3586 pmap_free_pvs(pv_tofree); 3587 } 3588 3589 /* 3590 * pmap_sync_pv: clear pte bits and return the old value of the pte. 3591 * 3592 * => Caller should disable kernel preemption. 3593 * => issues tlb shootdowns if necessary. 
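 * => Returns 0 and stores the pre-update PTE through optep on success;
 *    returns EAGAIN when it loses a race with a V->P operation such as
 *    pmap_remove(), in which case the caller should back off and retry.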
3594 */ 3595 3596 static int 3597 pmap_sync_pv(struct pv_pte *pvpte, pt_entry_t expect, int clearbits, 3598 pt_entry_t *optep) 3599 { 3600 struct pmap *pmap; 3601 struct vm_page *ptp; 3602 vaddr_t va; 3603 pt_entry_t *ptep; 3604 pt_entry_t opte; 3605 pt_entry_t npte; 3606 bool need_shootdown; 3607 3608 ptp = pvpte->pte_ptp; 3609 va = pvpte->pte_va; 3610 KASSERT(ptp == NULL || ptp->uobject != NULL); 3611 KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset); 3612 pmap = ptp_to_pmap(ptp); 3613 3614 KASSERT((expect & ~(PG_FRAME | PG_V)) == 0); 3615 KASSERT((expect & PG_V) != 0); 3616 KASSERT(clearbits == ~0 || (clearbits & ~(PG_M | PG_U | PG_RW)) == 0); 3617 KASSERT(kpreempt_disabled()); 3618 3619 ptep = pmap_map_pte(pmap, ptp, va); 3620 do { 3621 opte = *ptep; 3622 KASSERT((opte & (PG_M | PG_U)) != PG_M); 3623 KASSERT((opte & (PG_U | PG_V)) != PG_U); 3624 KASSERT(opte == 0 || (opte & PG_V) != 0); 3625 if ((opte & (PG_FRAME | PG_V)) != expect) { 3626 3627 /* 3628 * we lost a race with a V->P operation like 3629 * pmap_remove(). wait for the competitor 3630 * reflecting pte bits into mp_attrs. 3631 * 3632 * issue a redundant TLB shootdown so that 3633 * we can wait for its completion. 3634 */ 3635 3636 pmap_unmap_pte(); 3637 if (clearbits != 0) { 3638 pmap_tlb_shootdown(pmap, va, 3639 (pmap == pmap_kernel() ? PG_G : 0), 3640 TLBSHOOT_SYNC_PV1); 3641 } 3642 return EAGAIN; 3643 } 3644 3645 /* 3646 * check if there's anything to do on this pte. 3647 */ 3648 3649 if ((opte & clearbits) == 0) { 3650 need_shootdown = false; 3651 break; 3652 } 3653 3654 /* 3655 * we need a shootdown if the pte is cached. (PG_U) 3656 * 3657 * ...unless we are clearing only the PG_RW bit and 3658 * it isn't cached as RW. (PG_M) 3659 */ 3660 3661 need_shootdown = (opte & PG_U) != 0 && 3662 !(clearbits == PG_RW && (opte & PG_M) == 0); 3663 3664 npte = opte & ~clearbits; 3665 3666 /* 3667 * if we need a shootdown anyway, clear PG_U and PG_M. 3668 */ 3669 3670 if (need_shootdown) { 3671 npte &= ~(PG_U | PG_M); 3672 } 3673 KASSERT((npte & (PG_M | PG_U)) != PG_M); 3674 KASSERT((npte & (PG_U | PG_V)) != PG_U); 3675 KASSERT(npte == 0 || (opte & PG_V) != 0); 3676 } while (pmap_pte_cas(ptep, opte, npte) != opte); 3677 3678 if (need_shootdown) { 3679 pmap_tlb_shootdown(pmap, va, opte, TLBSHOOT_SYNC_PV2); 3680 } 3681 pmap_unmap_pte(); 3682 3683 *optep = opte; 3684 return 0; 3685 } 3686 3687 static void 3688 pmap_pp_remove(struct pmap_page *pp, paddr_t pa) 3689 { 3690 struct pv_pte *pvpte; 3691 struct pv_entry *killlist = NULL; 3692 struct vm_page *ptp; 3693 pt_entry_t expect; 3694 int count; 3695 3696 expect = pmap_pa2pte(pa) | PG_V; 3697 count = SPINLOCK_BACKOFF_MIN; 3698 kpreempt_disable(); 3699 startover: 3700 while ((pvpte = pv_pte_first(pp)) != NULL) { 3701 struct pmap *pmap; 3702 struct pv_entry *pve; 3703 pt_entry_t opte; 3704 vaddr_t va; 3705 int error; 3706 3707 /* 3708 * add a reference to the pmap before clearing the pte. 3709 * otherwise the pmap can disappear behind us. 
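 * the reference is dropped again below, either once the PTP
 * accounting has been updated or immediately if pmap_sync_pv()
 * asks us to start over.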
3710 */ 3711 3712 ptp = pvpte->pte_ptp; 3713 pmap = ptp_to_pmap(ptp); 3714 if (ptp != NULL) { 3715 pmap_reference(pmap); 3716 } 3717 3718 error = pmap_sync_pv(pvpte, expect, ~0, &opte); 3719 if (error == EAGAIN) { 3720 int hold_count; 3721 KERNEL_UNLOCK_ALL(curlwp, &hold_count); 3722 if (ptp != NULL) { 3723 pmap_destroy(pmap); 3724 } 3725 SPINLOCK_BACKOFF(count); 3726 KERNEL_LOCK(hold_count, curlwp); 3727 goto startover; 3728 } 3729 3730 pp->pp_attrs |= opte; 3731 va = pvpte->pte_va; 3732 pve = pmap_remove_pv(pp, ptp, va); 3733 3734 /* update the PTP reference count. free if last reference. */ 3735 if (ptp != NULL) { 3736 struct pmap *pmap2; 3737 pt_entry_t *ptes; 3738 pd_entry_t * const *pdes; 3739 3740 KASSERT(pmap != pmap_kernel()); 3741 3742 pmap_tlb_shootnow(); 3743 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); 3744 pmap_stats_update_bypte(pmap, 0, opte); 3745 ptp->wire_count--; 3746 if (ptp->wire_count <= 1) { 3747 pmap_free_ptp(pmap, ptp, va, ptes, pdes); 3748 } 3749 pmap_unmap_ptes(pmap, pmap2); 3750 pmap_destroy(pmap); 3751 } else { 3752 KASSERT(pmap == pmap_kernel()); 3753 pmap_stats_update_bypte(pmap, 0, opte); 3754 } 3755 3756 if (pve != NULL) { 3757 pve->pve_next = killlist; /* mark it for death */ 3758 killlist = pve; 3759 } 3760 } 3761 pmap_tlb_shootnow(); 3762 kpreempt_enable(); 3763 3764 /* Now free unused pvs. */ 3765 pmap_free_pvs(killlist); 3766 } 3767 3768 /* 3769 * pmap_page_remove: remove a managed vm_page from all pmaps that map it 3770 * 3771 * => R/M bits are sync'd back to attrs 3772 */ 3773 3774 void 3775 pmap_page_remove(struct vm_page *pg) 3776 { 3777 struct pmap_page *pp; 3778 paddr_t pa; 3779 3780 KASSERT(uvm_page_locked_p(pg)); 3781 3782 pp = VM_PAGE_TO_PP(pg); 3783 pa = VM_PAGE_TO_PHYS(pg); 3784 pmap_pp_remove(pp, pa); 3785 } 3786 3787 /* 3788 * pmap_pv_remove: remove an unmanaged pv-tracked page from all pmaps 3789 * that map it 3790 */ 3791 3792 void 3793 pmap_pv_remove(paddr_t pa) 3794 { 3795 struct pmap_page *pp; 3796 3797 pp = pmap_pv_tracked(pa); 3798 if (pp == NULL) 3799 panic("%s: page not pv-tracked: %#"PRIxPADDR, __func__, pa); 3800 pmap_pp_remove(pp, pa); 3801 } 3802 3803 /* 3804 * p m a p a t t r i b u t e f u n c t i o n s 3805 * functions that test/change managed page's attributes 3806 * since a page can be mapped multiple times we must check each PTE that 3807 * maps it by going down the pv lists. 3808 */ 3809 3810 /* 3811 * pmap_test_attrs: test a page's attributes 3812 */ 3813 3814 bool 3815 pmap_test_attrs(struct vm_page *pg, unsigned testbits) 3816 { 3817 struct pmap_page *pp; 3818 struct pv_pte *pvpte; 3819 pt_entry_t expect; 3820 u_int result; 3821 3822 KASSERT(uvm_page_locked_p(pg)); 3823 3824 pp = VM_PAGE_TO_PP(pg); 3825 if ((pp->pp_attrs & testbits) != 0) { 3826 return true; 3827 } 3828 expect = pmap_pa2pte(VM_PAGE_TO_PHYS(pg)) | PG_V; 3829 kpreempt_disable(); 3830 for (pvpte = pv_pte_first(pp); pvpte; pvpte = pv_pte_next(pp, pvpte)) { 3831 pt_entry_t opte; 3832 int error; 3833 3834 if ((pp->pp_attrs & testbits) != 0) { 3835 break; 3836 } 3837 error = pmap_sync_pv(pvpte, expect, 0, &opte); 3838 if (error == 0) { 3839 pp->pp_attrs |= opte; 3840 } 3841 } 3842 result = pp->pp_attrs & testbits; 3843 kpreempt_enable(); 3844 3845 /* 3846 * note that we will exit the for loop with a non-null pve if 3847 * we have found the bits we are testing for. 
3848 */ 3849 3850 return result != 0; 3851 } 3852 3853 static bool 3854 pmap_pp_clear_attrs(struct pmap_page *pp, paddr_t pa, unsigned clearbits) 3855 { 3856 struct pv_pte *pvpte; 3857 u_int result; 3858 pt_entry_t expect; 3859 int count; 3860 3861 expect = pmap_pa2pte(pa) | PG_V; 3862 count = SPINLOCK_BACKOFF_MIN; 3863 kpreempt_disable(); 3864 startover: 3865 for (pvpte = pv_pte_first(pp); pvpte; pvpte = pv_pte_next(pp, pvpte)) { 3866 pt_entry_t opte; 3867 int error; 3868 3869 error = pmap_sync_pv(pvpte, expect, clearbits, &opte); 3870 if (error == EAGAIN) { 3871 int hold_count; 3872 KERNEL_UNLOCK_ALL(curlwp, &hold_count); 3873 SPINLOCK_BACKOFF(count); 3874 KERNEL_LOCK(hold_count, curlwp); 3875 goto startover; 3876 } 3877 pp->pp_attrs |= opte; 3878 } 3879 result = pp->pp_attrs & clearbits; 3880 pp->pp_attrs &= ~clearbits; 3881 pmap_tlb_shootnow(); 3882 kpreempt_enable(); 3883 3884 return result != 0; 3885 } 3886 3887 /* 3888 * pmap_clear_attrs: clear the specified attribute for a page. 3889 * 3890 * => we return true if we cleared one of the bits we were asked to 3891 */ 3892 3893 bool 3894 pmap_clear_attrs(struct vm_page *pg, unsigned clearbits) 3895 { 3896 struct pmap_page *pp; 3897 paddr_t pa; 3898 3899 KASSERT(uvm_page_locked_p(pg)); 3900 3901 pp = VM_PAGE_TO_PP(pg); 3902 pa = VM_PAGE_TO_PHYS(pg); 3903 3904 return pmap_pp_clear_attrs(pp, pa, clearbits); 3905 } 3906 3907 /* 3908 * pmap_pv_clear_attrs: clear the specified attributes for an unmanaged 3909 * pv-tracked page. 3910 */ 3911 3912 bool 3913 pmap_pv_clear_attrs(paddr_t pa, unsigned clearbits) 3914 { 3915 struct pmap_page *pp; 3916 3917 pp = pmap_pv_tracked(pa); 3918 if (pp == NULL) 3919 panic("%s: page not pv-tracked: %#"PRIxPADDR, __func__, pa); 3920 3921 return pmap_pp_clear_attrs(pp, pa, clearbits); 3922 } 3923 3924 /* 3925 * p m a p p r o t e c t i o n f u n c t i o n s 3926 */ 3927 3928 /* 3929 * pmap_page_protect: change the protection of all recorded mappings 3930 * of a managed page 3931 * 3932 * => NOTE: this is an inline function in pmap.h 3933 */ 3934 3935 /* see pmap.h */ 3936 3937 /* 3938 * pmap_pv_protect: change the protection of all recorded mappings 3939 * of an unmanaged pv-tracked page 3940 * 3941 * => NOTE: this is an inline function in pmap.h 3942 */ 3943 3944 /* see pmap.h */ 3945 3946 /* 3947 * pmap_protect: set the protection in of the pages in a pmap 3948 * 3949 * => NOTE: this is an inline function in pmap.h 3950 */ 3951 3952 /* see pmap.h */ 3953 3954 /* 3955 * pmap_write_protect: write-protect pages in a pmap. 3956 * 3957 * Note for Xen-amd64. Xen automatically adds PG_u to the kernel pages, but we 3958 * don't need to remove this bit when re-entering the PTEs here: Xen tracks the 3959 * kernel pages with a reserved bit (_PAGE_GUEST_KERNEL), so even if PG_u is 3960 * present the page will still be considered as a kernel page, and the privilege 3961 * separation will be enforced correctly. 3962 */ 3963 void 3964 pmap_write_protect(struct pmap *pmap, vaddr_t sva, vaddr_t eva, vm_prot_t prot) 3965 { 3966 pt_entry_t bit_rem, bit_put; 3967 pt_entry_t *ptes; 3968 pt_entry_t * const *pdes; 3969 struct pmap *pmap2; 3970 vaddr_t blockend, va; 3971 3972 KASSERT(curlwp->l_md.md_gc_pmap != pmap); 3973 3974 bit_rem = 0; 3975 if (!(prot & VM_PROT_WRITE)) 3976 bit_rem = PG_RW; 3977 3978 bit_put = 0; 3979 if (!(prot & VM_PROT_EXECUTE)) 3980 bit_put = pmap_pg_nx; 3981 3982 sva &= PG_FRAME; 3983 eva &= PG_FRAME; 3984 3985 /* Acquire pmap. 
*/ 3986 kpreempt_disable(); 3987 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); 3988 3989 for (va = sva ; va < eva; va = blockend) { 3990 pt_entry_t *spte, *epte; 3991 int i; 3992 3993 blockend = x86_round_pdr(va + 1); 3994 if (blockend > eva) 3995 blockend = eva; 3996 3997 /* 3998 * Our PTE mappings should never be write-protected. 3999 * 4000 * XXXmaxv: still needed? 4001 * 4002 * A long term solution is to move the PTEs out of user address 4003 * space, and into kernel address space. Then we can set 4004 * VM_MAXUSER_ADDRESS to be VM_MAX_ADDRESS. 4005 */ 4006 for (i = 0; i < PDP_SIZE; i++) { 4007 if (pl_i(va, PTP_LEVELS) == PDIR_SLOT_PTE+i) 4008 panic("PTE space accessed"); 4009 } 4010 4011 /* Is it a valid block? */ 4012 if (!pmap_pdes_valid(va, pdes, NULL)) { 4013 continue; 4014 } 4015 KASSERT(va < VM_MAXUSER_ADDRESS || va >= VM_MAX_ADDRESS); 4016 4017 spte = &ptes[pl1_i(va)]; 4018 epte = &ptes[pl1_i(blockend)]; 4019 4020 for (/* */; spte < epte; spte++) { 4021 pt_entry_t opte, npte; 4022 4023 do { 4024 opte = *spte; 4025 if (!pmap_valid_entry(opte)) { 4026 goto next; 4027 } 4028 npte = (opte & ~bit_rem) | bit_put; 4029 } while (pmap_pte_cas(spte, opte, npte) != opte); 4030 4031 if ((opte & PG_M) != 0) { 4032 vaddr_t tva = x86_ptob(spte - ptes); 4033 pmap_tlb_shootdown(pmap, tva, opte, 4034 TLBSHOOT_WRITE_PROTECT); 4035 } 4036 next:; 4037 } 4038 } 4039 4040 /* Release pmap. */ 4041 pmap_unmap_ptes(pmap, pmap2); 4042 kpreempt_enable(); 4043 } 4044 4045 /* 4046 * pmap_unwire: clear the wired bit in the PTE. 4047 * 4048 * => Mapping should already be present. 4049 */ 4050 void 4051 pmap_unwire(struct pmap *pmap, vaddr_t va) 4052 { 4053 pt_entry_t *ptes, *ptep, opte; 4054 pd_entry_t * const *pdes; 4055 struct pmap *pmap2; 4056 4057 /* Acquire pmap. */ 4058 kpreempt_disable(); 4059 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); 4060 4061 if (!pmap_pdes_valid(va, pdes, NULL)) { 4062 panic("%s: invalid PDE va=%#" PRIxVADDR, __func__, va); 4063 } 4064 4065 ptep = &ptes[pl1_i(va)]; 4066 opte = *ptep; 4067 KASSERT(pmap_valid_entry(opte)); 4068 4069 if (opte & PG_W) { 4070 pt_entry_t npte = opte & ~PG_W; 4071 4072 opte = pmap_pte_testset(ptep, npte); 4073 pmap_stats_update_bypte(pmap, npte, opte); 4074 } else { 4075 printf("%s: wiring for pmap %p va %#" PRIxVADDR 4076 "did not change!\n", __func__, pmap, va); 4077 } 4078 4079 /* Release pmap. */ 4080 pmap_unmap_ptes(pmap, pmap2); 4081 kpreempt_enable(); 4082 } 4083 4084 /* 4085 * pmap_copy: copy mappings from one pmap to another 4086 * 4087 * => optional function 4088 * void pmap_copy(dst_pmap, src_pmap, dst_addr, len, src_addr) 4089 */ 4090 4091 /* 4092 * defined as macro in pmap.h 4093 */ 4094 4095 __strict_weak_alias(pmap_enter, pmap_enter_default); 4096 4097 int 4098 pmap_enter_default(pmap_t pmap, vaddr_t va, paddr_t pa, vm_prot_t prot, 4099 u_int flags) 4100 { 4101 return pmap_enter_ma(pmap, va, pa, pa, prot, flags, 0); 4102 } 4103 4104 /* 4105 * pmap_enter: enter a mapping into a pmap 4106 * 4107 * => must be done "now" ... 
no lazy-evaluation 4108 * => we set pmap => pv_head locking 4109 */ 4110 int 4111 pmap_enter_ma(struct pmap *pmap, vaddr_t va, paddr_t ma, paddr_t pa, 4112 vm_prot_t prot, u_int flags, int domid) 4113 { 4114 pt_entry_t *ptes, opte, npte; 4115 pt_entry_t *ptep; 4116 pd_entry_t * const *pdes; 4117 struct vm_page *ptp; 4118 struct vm_page *new_pg, *old_pg; 4119 struct pmap_page *new_pp, *old_pp; 4120 struct pv_entry *old_pve = NULL; 4121 struct pv_entry *new_pve; 4122 struct pv_entry *new_sparepve; 4123 int error; 4124 bool wired = (flags & PMAP_WIRED) != 0; 4125 struct pmap *pmap2; 4126 4127 KASSERT(pmap_initialized); 4128 KASSERT(curlwp->l_md.md_gc_pmap != pmap); 4129 KASSERT(va < VM_MAX_KERNEL_ADDRESS); 4130 KASSERTMSG(va != (vaddr_t)PDP_BASE, "%s: trying to map va=%#" 4131 PRIxVADDR " over PDP!", __func__, va); 4132 KASSERTMSG(va < VM_MIN_KERNEL_ADDRESS || 4133 pmap_valid_entry(pmap->pm_pdir[pl_i(va, PTP_LEVELS)]), 4134 "%s: missing kernel PTP for va=%#" PRIxVADDR, __func__, va); 4135 4136 #ifdef XEN 4137 KASSERT(domid == DOMID_SELF || pa == 0); 4138 #endif /* XEN */ 4139 4140 npte = ma | protection_codes[prot] | PG_V; 4141 npte |= pmap_pat_flags(flags); 4142 if (wired) 4143 npte |= PG_W; 4144 if (va < VM_MAXUSER_ADDRESS) 4145 npte |= PG_u; 4146 else if (va < VM_MAX_ADDRESS) 4147 panic("PTE space accessed"); /* XXXmaxv: no longer needed? */ 4148 4149 if (pmap == pmap_kernel()) 4150 npte |= pmap_pg_g; 4151 if (flags & VM_PROT_ALL) { 4152 npte |= PG_U; 4153 if (flags & VM_PROT_WRITE) { 4154 KASSERT((npte & PG_RW) != 0); 4155 npte |= PG_M; 4156 } 4157 } 4158 4159 #ifdef XEN 4160 if (domid != DOMID_SELF) 4161 new_pg = NULL; 4162 else 4163 #endif 4164 new_pg = PHYS_TO_VM_PAGE(pa); 4165 if (new_pg != NULL) { 4166 /* This is a managed page */ 4167 npte |= PG_PVLIST; 4168 new_pp = VM_PAGE_TO_PP(new_pg); 4169 } else if ((new_pp = pmap_pv_tracked(pa)) != NULL) { 4170 /* This is an unmanaged pv-tracked page */ 4171 npte |= PG_PVLIST; 4172 } else { 4173 new_pp = NULL; 4174 } 4175 4176 /* 4177 * Try to get pves now if we might need them. 4178 * Keep going even if we fail, since we will not actually need them 4179 * if we are just changing the permissions on an existing mapping, 4180 * but we won't know if that's the case until later. 4181 */ 4182 4183 bool needpves = pmap_pp_needs_pve(new_pp); 4184 if (needpves) { 4185 new_pve = pool_cache_get(&pmap_pv_cache, PR_NOWAIT); 4186 new_sparepve = pool_cache_get(&pmap_pv_cache, PR_NOWAIT); 4187 } else { 4188 new_pve = NULL; 4189 new_sparepve = NULL; 4190 } 4191 4192 kpreempt_disable(); 4193 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); /* locks pmap */ 4194 if (pmap == pmap_kernel()) { 4195 ptp = NULL; 4196 } else { 4197 ptp = pmap_get_ptp(pmap, va, pdes, flags); 4198 if (ptp == NULL) { 4199 pmap_unmap_ptes(pmap, pmap2); 4200 if (flags & PMAP_CANFAIL) { 4201 error = ENOMEM; 4202 goto out; 4203 } 4204 panic("%s: get ptp failed", __func__); 4205 } 4206 } 4207 4208 /* 4209 * Check if there is an existing mapping. If we are now sure that 4210 * we need pves and we failed to allocate them earlier, handle that. 4211 * Caching the value of oldpa here is safe because only the mod/ref bits 4212 * can change while the pmap is locked. 
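 * (oldpa is only used to decide whether pv entries are really needed
 * and to look up the old page's pv list; the PTE itself is re-read
 * inside the CAS loop below, so concurrent mod/ref updates are not
 * lost.)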
4213 */ 4214 4215 ptep = &ptes[pl1_i(va)]; 4216 opte = *ptep; 4217 bool have_oldpa = pmap_valid_entry(opte); 4218 paddr_t oldpa = pmap_pte2pa(opte); 4219 4220 if (needpves && (!have_oldpa || oldpa != pa) && 4221 (new_pve == NULL || new_sparepve == NULL)) { 4222 pmap_unmap_ptes(pmap, pmap2); 4223 if (flags & PMAP_CANFAIL) { 4224 error = ENOMEM; 4225 goto out; 4226 } 4227 panic("%s: pve allocation failed", __func__); 4228 } 4229 4230 /* 4231 * update the pte. 4232 */ 4233 4234 do { 4235 opte = *ptep; 4236 4237 /* 4238 * if the same page, inherit PG_U and PG_M. 4239 */ 4240 if (((opte ^ npte) & (PG_FRAME | PG_V)) == 0) { 4241 npte |= opte & (PG_U | PG_M); 4242 } 4243 #if defined(XEN) 4244 if (domid != DOMID_SELF) { 4245 /* pmap_pte_cas with error handling */ 4246 int s = splvm(); 4247 if (opte != *ptep) { 4248 splx(s); 4249 continue; 4250 } 4251 error = xpq_update_foreign( 4252 vtomach((vaddr_t)ptep), npte, domid); 4253 splx(s); 4254 if (error) { 4255 if (ptp != NULL && ptp->wire_count <= 1) { 4256 pmap_free_ptp(pmap, ptp, va, ptes, pdes); 4257 } 4258 pmap_unmap_ptes(pmap, pmap2); 4259 goto out; 4260 } 4261 break; 4262 } 4263 #endif /* defined(XEN) */ 4264 } while (pmap_pte_cas(ptep, opte, npte) != opte); 4265 4266 /* 4267 * update statistics and PTP's reference count. 4268 */ 4269 4270 pmap_stats_update_bypte(pmap, npte, opte); 4271 if (ptp != NULL && !have_oldpa) { 4272 ptp->wire_count++; 4273 } 4274 KASSERT(ptp == NULL || ptp->wire_count > 1); 4275 4276 /* 4277 * if the same page, we can skip pv_entry handling. 4278 */ 4279 4280 if (((opte ^ npte) & (PG_FRAME | PG_V)) == 0) { 4281 KASSERT(((opte ^ npte) & PG_PVLIST) == 0); 4282 goto same_pa; 4283 } 4284 4285 /* 4286 * if old page is pv-tracked, remove pv_entry from its list. 4287 */ 4288 4289 if ((~opte & (PG_V | PG_PVLIST)) == 0) { 4290 if ((old_pg = PHYS_TO_VM_PAGE(oldpa)) != NULL) { 4291 KASSERT(uvm_page_locked_p(old_pg)); 4292 old_pp = VM_PAGE_TO_PP(old_pg); 4293 } else if ((old_pp = pmap_pv_tracked(oldpa)) == NULL) { 4294 panic("%s: PG_PVLIST with pv-untracked page" 4295 " va = %#"PRIxVADDR 4296 " pa = %#" PRIxPADDR " (%#" PRIxPADDR ")", 4297 __func__, va, oldpa, atop(pa)); 4298 } 4299 4300 old_pve = pmap_remove_pv(old_pp, ptp, va); 4301 old_pp->pp_attrs |= opte; 4302 } 4303 4304 /* 4305 * if new page is pv-tracked, insert pv_entry into its list. 4306 */ 4307 4308 if (new_pp) { 4309 new_pve = pmap_enter_pv(new_pp, new_pve, &new_sparepve, ptp, va); 4310 } 4311 4312 same_pa: 4313 pmap_unmap_ptes(pmap, pmap2); 4314 4315 /* 4316 * shootdown tlb if necessary. 4317 */ 4318 4319 if ((~opte & (PG_V | PG_U)) == 0 && 4320 ((opte ^ npte) & (PG_FRAME | PG_RW)) != 0) { 4321 pmap_tlb_shootdown(pmap, va, opte, TLBSHOOT_ENTER); 4322 } 4323 4324 error = 0; 4325 out: 4326 kpreempt_enable(); 4327 if (old_pve != NULL) { 4328 pool_cache_put(&pmap_pv_cache, old_pve); 4329 } 4330 if (new_pve != NULL) { 4331 pool_cache_put(&pmap_pv_cache, new_pve); 4332 } 4333 if (new_sparepve != NULL) { 4334 pool_cache_put(&pmap_pv_cache, new_sparepve); 4335 } 4336 4337 return error; 4338 } 4339 4340 static paddr_t 4341 pmap_get_physpage(void) 4342 { 4343 struct vm_page *ptp; 4344 struct pmap *kpm = pmap_kernel(); 4345 paddr_t pa; 4346 4347 if (!uvm.page_init_done) { 4348 /* 4349 * We're growing the kernel pmap early (from 4350 * uvm_pageboot_alloc()). This case must be 4351 * handled a little differently. 
4352 */ 4353 4354 if (!uvm_page_physget(&pa)) 4355 panic("%s: out of memory", __func__); 4356 #if defined(__HAVE_DIRECT_MAP) 4357 pagezero(PMAP_DIRECT_MAP(pa)); 4358 #else 4359 #if defined(XEN) 4360 if (XEN_VERSION_SUPPORTED(3, 4)) { 4361 xen_pagezero(pa); 4362 return pa; 4363 } 4364 #endif 4365 kpreempt_disable(); 4366 pmap_pte_set(early_zero_pte, pmap_pa2pte(pa) | PG_V | 4367 PG_RW | pmap_pg_nx); 4368 pmap_pte_flush(); 4369 pmap_update_pg((vaddr_t)early_zerop); 4370 memset(early_zerop, 0, PAGE_SIZE); 4371 #if defined(DIAGNOSTIC) || defined(XEN) 4372 pmap_pte_set(early_zero_pte, 0); 4373 pmap_pte_flush(); 4374 #endif /* defined(DIAGNOSTIC) */ 4375 kpreempt_enable(); 4376 #endif /* defined(__HAVE_DIRECT_MAP) */ 4377 } else { 4378 /* XXX */ 4379 ptp = uvm_pagealloc(NULL, 0, NULL, 4380 UVM_PGA_USERESERVE|UVM_PGA_ZERO); 4381 if (ptp == NULL) 4382 panic("%s: out of memory", __func__); 4383 ptp->flags &= ~PG_BUSY; 4384 ptp->wire_count = 1; 4385 pa = VM_PAGE_TO_PHYS(ptp); 4386 } 4387 pmap_stats_update(kpm, 1, 0); 4388 4389 return pa; 4390 } 4391 4392 /* 4393 * Expand the page tree with the specified amount of PTPs, mapping virtual 4394 * addresses starting at kva. We populate all the levels but the last one 4395 * (L1). The nodes of the tree are created as RWX, but the pages covered 4396 * will be kentered in L1, with proper permissions. 4397 * 4398 * Used only by pmap_growkernel. 4399 */ 4400 static void 4401 pmap_alloc_level(struct pmap *cpm, vaddr_t kva, long *needed_ptps) 4402 { 4403 unsigned long i; 4404 paddr_t pa; 4405 unsigned long index, endindex; 4406 int level; 4407 pd_entry_t *pdep; 4408 #ifdef XEN 4409 int s = splvm(); /* protect xpq_* */ 4410 #endif 4411 4412 for (level = PTP_LEVELS; level > 1; level--) { 4413 if (level == PTP_LEVELS) 4414 pdep = cpm->pm_pdir; 4415 else 4416 pdep = normal_pdes[level - 2]; 4417 index = pl_i_roundup(kva, level); 4418 endindex = index + needed_ptps[level - 1] - 1; 4419 4420 for (i = index; i <= endindex; i++) { 4421 pt_entry_t pte; 4422 4423 KASSERT(!pmap_valid_entry(pdep[i])); 4424 pa = pmap_get_physpage(); 4425 pte = pmap_pa2pte(pa) | PG_V | PG_RW; 4426 pmap_pte_set(&pdep[i], pte); 4427 4428 #if defined(XEN) && (defined(PAE) || defined(__x86_64__)) 4429 if (level == PTP_LEVELS && i >= PDIR_SLOT_KERN) { 4430 if (__predict_true( 4431 cpu_info_primary.ci_flags & CPUF_PRESENT)) { 4432 /* update per-cpu PMDs on all cpus */ 4433 xen_kpm_sync(pmap_kernel(), i); 4434 } else { 4435 /* 4436 * too early; update primary CPU 4437 * PMD only (without locks) 4438 */ 4439 #ifdef PAE 4440 pd_entry_t *cpu_pdep = 4441 &cpu_info_primary.ci_kpm_pdir[l2tol2(i)]; 4442 #endif 4443 #ifdef __x86_64__ 4444 pd_entry_t *cpu_pdep = 4445 &cpu_info_primary.ci_kpm_pdir[i]; 4446 #endif 4447 pmap_pte_set(cpu_pdep, pte); 4448 } 4449 } 4450 #endif /* XEN && (PAE || __x86_64__) */ 4451 4452 KASSERT(level != PTP_LEVELS || nkptp[level - 1] + 4453 pl_i(VM_MIN_KERNEL_ADDRESS, level) == i); 4454 nkptp[level - 1]++; 4455 } 4456 pmap_pte_flush(); 4457 } 4458 #ifdef XEN 4459 splx(s); 4460 #endif 4461 } 4462 4463 /* 4464 * pmap_growkernel: increase usage of KVM space. 4465 * 4466 * => we allocate new PTPs for the kernel and install them in all 4467 * the pmaps on the system. 
4468 */ 4469 4470 vaddr_t 4471 pmap_growkernel(vaddr_t maxkvaddr) 4472 { 4473 struct pmap *kpm = pmap_kernel(); 4474 struct pmap *cpm; 4475 #if !defined(XEN) || !defined(__x86_64__) 4476 struct pmap *pm; 4477 long old; 4478 #endif 4479 int s, i; 4480 long needed_kptp[PTP_LEVELS], target_nptp; 4481 bool invalidate = false; 4482 4483 s = splvm(); /* to be safe */ 4484 mutex_enter(kpm->pm_lock); 4485 4486 if (maxkvaddr <= pmap_maxkvaddr) { 4487 mutex_exit(kpm->pm_lock); 4488 splx(s); 4489 return pmap_maxkvaddr; 4490 } 4491 4492 maxkvaddr = x86_round_pdr(maxkvaddr); 4493 #if !defined(XEN) || !defined(__x86_64__) 4494 old = nkptp[PTP_LEVELS - 1]; 4495 #endif 4496 4497 /* Initialize needed_kptp. */ 4498 for (i = PTP_LEVELS - 1; i >= 1; i--) { 4499 target_nptp = pl_i_roundup(maxkvaddr, i + 1) - 4500 pl_i_roundup(VM_MIN_KERNEL_ADDRESS, i + 1); 4501 4502 if (target_nptp > nkptpmax[i]) 4503 panic("out of KVA space"); 4504 KASSERT(target_nptp >= nkptp[i]); 4505 needed_kptp[i] = target_nptp - nkptp[i]; 4506 } 4507 4508 #if defined(XEN) && (defined(__x86_64__) || defined(PAE)) 4509 /* only pmap_kernel(), or the per-cpu map, has kernel entries */ 4510 cpm = kpm; 4511 #else 4512 /* Get the current pmap */ 4513 if (__predict_true(cpu_info_primary.ci_flags & CPUF_PRESENT)) { 4514 cpm = curcpu()->ci_pmap; 4515 } else { 4516 cpm = kpm; 4517 } 4518 #endif 4519 4520 pmap_alloc_level(cpm, pmap_maxkvaddr, needed_kptp); 4521 4522 /* 4523 * If the number of top level entries changed, update all pmaps. 4524 */ 4525 if (needed_kptp[PTP_LEVELS - 1] != 0) { 4526 #ifdef XEN 4527 #ifdef __x86_64__ 4528 /* nothing, kernel entries are never entered in user pmap */ 4529 #else /* __x86_64__ */ 4530 int pdkidx; 4531 #ifndef PAE 4532 /* 4533 * for PAE this is not needed, because pmap_alloc_level() 4534 * already did update the per-CPU tables 4535 */ 4536 if (cpm != kpm) { 4537 for (pdkidx = PDIR_SLOT_KERN + old; 4538 pdkidx < PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1]; 4539 pdkidx++) { 4540 pmap_pte_set(&kpm->pm_pdir[pdkidx], 4541 cpm->pm_pdir[pdkidx]); 4542 } 4543 pmap_pte_flush(); 4544 } 4545 #endif /* !PAE */ 4546 4547 mutex_enter(&pmaps_lock); 4548 LIST_FOREACH(pm, &pmaps, pm_list) { 4549 for (pdkidx = PDIR_SLOT_KERN + old; 4550 pdkidx < PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1]; 4551 pdkidx++) { 4552 pmap_pte_set(&pm->pm_pdir[pdkidx], 4553 kpm->pm_pdir[pdkidx]); 4554 } 4555 pmap_pte_flush(); 4556 } 4557 mutex_exit(&pmaps_lock); 4558 #endif /* __x86_64__ */ 4559 #else /* XEN */ 4560 size_t newpdes; 4561 newpdes = nkptp[PTP_LEVELS - 1] - old; 4562 if (cpm != kpm) { 4563 memcpy(&kpm->pm_pdir[PDIR_SLOT_KERN + old], 4564 &cpm->pm_pdir[PDIR_SLOT_KERN + old], 4565 newpdes * sizeof(pd_entry_t)); 4566 } 4567 4568 mutex_enter(&pmaps_lock); 4569 LIST_FOREACH(pm, &pmaps, pm_list) { 4570 memcpy(&pm->pm_pdir[PDIR_SLOT_KERN + old], 4571 &kpm->pm_pdir[PDIR_SLOT_KERN + old], 4572 newpdes * sizeof (pd_entry_t)); 4573 } 4574 mutex_exit(&pmaps_lock); 4575 #endif 4576 invalidate = true; 4577 } 4578 pmap_maxkvaddr = maxkvaddr; 4579 mutex_exit(kpm->pm_lock); 4580 splx(s); 4581 4582 if (invalidate && pmap_initialized) { 4583 /* Invalidate the PDP cache. 
*/ 4584 pool_cache_invalidate(&pmap_pdp_cache); 4585 } 4586 4587 return maxkvaddr; 4588 } 4589 4590 #ifdef DEBUG 4591 void pmap_dump(struct pmap *, vaddr_t, vaddr_t); 4592 4593 /* 4594 * pmap_dump: dump all the mappings from a pmap 4595 * 4596 * => caller should not be holding any pmap locks 4597 */ 4598 4599 void 4600 pmap_dump(struct pmap *pmap, vaddr_t sva, vaddr_t eva) 4601 { 4602 pt_entry_t *ptes, *pte; 4603 pd_entry_t * const *pdes; 4604 struct pmap *pmap2; 4605 vaddr_t blkendva; 4606 4607 /* 4608 * if end is out of range truncate. 4609 * if (end == start) update to max. 4610 */ 4611 4612 if (eva > VM_MAXUSER_ADDRESS || eva <= sva) 4613 eva = VM_MAXUSER_ADDRESS; 4614 4615 /* 4616 * we lock in the pmap => pv_head direction 4617 */ 4618 4619 kpreempt_disable(); 4620 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); /* locks pmap */ 4621 4622 /* 4623 * dumping a range of pages: we dump in PTP sized blocks (4MB) 4624 */ 4625 4626 for (/* null */ ; sva < eva ; sva = blkendva) { 4627 4628 /* determine range of block */ 4629 blkendva = x86_round_pdr(sva+1); 4630 if (blkendva > eva) 4631 blkendva = eva; 4632 4633 /* valid block? */ 4634 if (!pmap_pdes_valid(sva, pdes, NULL)) 4635 continue; 4636 4637 pte = &ptes[pl1_i(sva)]; 4638 for (/* null */; sva < blkendva ; sva += PAGE_SIZE, pte++) { 4639 if (!pmap_valid_entry(*pte)) 4640 continue; 4641 printf("va %#" PRIxVADDR " -> pa %#" PRIxPADDR 4642 " (pte=%#" PRIxPADDR ")\n", 4643 sva, (paddr_t)pmap_pte2pa(*pte), (paddr_t)*pte); 4644 } 4645 } 4646 pmap_unmap_ptes(pmap, pmap2); 4647 kpreempt_enable(); 4648 } 4649 #endif 4650 4651 /* 4652 * pmap_update: process deferred invalidations and frees. 4653 */ 4654 4655 void 4656 pmap_update(struct pmap *pmap) 4657 { 4658 struct vm_page *empty_ptps; 4659 lwp_t *l = curlwp; 4660 4661 /* 4662 * If we have torn down this pmap, invalidate non-global TLB 4663 * entries on any processors using it. 4664 */ 4665 kpreempt_disable(); 4666 if (__predict_false(l->l_md.md_gc_pmap == pmap)) { 4667 l->l_md.md_gc_pmap = NULL; 4668 pmap_tlb_shootdown(pmap, (vaddr_t)-1LL, 0, TLBSHOOT_UPDATE); 4669 } 4670 4671 /* 4672 * Initiate any pending TLB shootdowns. Wait for them to 4673 * complete before returning control to the caller. 4674 */ 4675 pmap_tlb_shootnow(); 4676 kpreempt_enable(); 4677 4678 /* 4679 * Now that shootdowns are complete, process deferred frees, 4680 * but not from interrupt context. 
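 * The PTPs queued on curlwp's md_gc_ptp list are handed to pmap_free_ptps()
 * below; if we happen to be in interrupt context we simply return and leave
 * the list for a later pmap_update().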
4681 */ 4682 if (l->l_md.md_gc_ptp != NULL) { 4683 KASSERT((l->l_pflag & LP_INTR) == 0); 4684 if (cpu_intr_p()) { 4685 return; 4686 } 4687 empty_ptps = l->l_md.md_gc_ptp; 4688 l->l_md.md_gc_ptp = NULL; 4689 pmap_free_ptps(empty_ptps); 4690 } 4691 } 4692 4693 #if PTP_LEVELS > 4 4694 #error "Unsupported number of page table mappings" 4695 #endif 4696 4697 paddr_t 4698 pmap_init_tmp_pgtbl(paddr_t pg) 4699 { 4700 static bool maps_loaded; 4701 static const paddr_t x86_tmp_pml_paddr[] = { 4702 4 * PAGE_SIZE, /* L1 */ 4703 5 * PAGE_SIZE, /* L2 */ 4704 6 * PAGE_SIZE, /* L3 */ 4705 7 * PAGE_SIZE /* L4 */ 4706 }; 4707 static vaddr_t x86_tmp_pml_vaddr[] = { 0, 0, 0, 0 }; 4708 4709 pd_entry_t *tmp_pml, *kernel_pml; 4710 4711 int level; 4712 4713 if (!maps_loaded) { 4714 for (level = 0; level < PTP_LEVELS; ++level) { 4715 x86_tmp_pml_vaddr[level] = 4716 uvm_km_alloc(kernel_map, PAGE_SIZE, 0, 4717 UVM_KMF_VAONLY); 4718 4719 if (x86_tmp_pml_vaddr[level] == 0) 4720 panic("mapping of real mode PML failed\n"); 4721 pmap_kenter_pa(x86_tmp_pml_vaddr[level], 4722 x86_tmp_pml_paddr[level], 4723 VM_PROT_READ | VM_PROT_WRITE, 0); 4724 } 4725 pmap_update(pmap_kernel()); 4726 maps_loaded = true; 4727 } 4728 4729 /* Zero levels 1-3 */ 4730 for (level = 0; level < PTP_LEVELS - 1; ++level) { 4731 tmp_pml = (void *)x86_tmp_pml_vaddr[level]; 4732 memset(tmp_pml, 0, PAGE_SIZE); 4733 } 4734 4735 /* Copy PML4 */ 4736 kernel_pml = pmap_kernel()->pm_pdir; 4737 tmp_pml = (void *)x86_tmp_pml_vaddr[PTP_LEVELS - 1]; 4738 memcpy(tmp_pml, kernel_pml, PAGE_SIZE); 4739 4740 #ifdef PAE 4741 /* 4742 * Use the last 4 entries of the L2 page as L3 PD entries. These 4743 * last entries are unlikely to be used for temporary mappings. 4744 * 508: maps 0->1GB (userland) 4745 * 509: unused 4746 * 510: unused 4747 * 511: maps 3->4GB (kernel) 4748 */ 4749 tmp_pml[508] = x86_tmp_pml_paddr[PTP_LEVELS - 1] | PG_V; 4750 tmp_pml[509] = 0; 4751 tmp_pml[510] = 0; 4752 tmp_pml[511] = pmap_pdirpa(pmap_kernel(), PDIR_SLOT_KERN) | PG_V; 4753 #endif 4754 4755 for (level = PTP_LEVELS - 1; level > 0; --level) { 4756 tmp_pml = (void *)x86_tmp_pml_vaddr[level]; 4757 4758 tmp_pml[pl_i(pg, level + 1)] = 4759 (x86_tmp_pml_paddr[level - 1] & PG_FRAME) | PG_RW | PG_V; 4760 } 4761 4762 tmp_pml = (void *)x86_tmp_pml_vaddr[0]; 4763 tmp_pml[pl_i(pg, 1)] = (pg & PG_FRAME) | PG_RW | PG_V; 4764 4765 #ifdef PAE 4766 /* Return the PA of the L3 page (entry 508 of the L2 page) */ 4767 return x86_tmp_pml_paddr[PTP_LEVELS - 1] + 508 * sizeof(pd_entry_t); 4768 #endif 4769 4770 return x86_tmp_pml_paddr[PTP_LEVELS - 1]; 4771 } 4772 4773 u_int 4774 x86_mmap_flags(paddr_t mdpgno) 4775 { 4776 u_int nflag = (mdpgno >> X86_MMAP_FLAG_SHIFT) & X86_MMAP_FLAG_MASK; 4777 u_int pflag = 0; 4778 4779 if (nflag & X86_MMAP_FLAG_PREFETCH) 4780 pflag |= PMAP_WRITE_COMBINE; 4781 4782 return pflag; 4783 } 4784