1 /* $NetBSD: pmap.c,v 1.240 2017/02/11 14:11:24 maxv Exp $ */ 2 3 /*- 4 * Copyright (c) 2008, 2010, 2016, 2017 The NetBSD Foundation, Inc. 5 * All rights reserved. 6 * 7 * This code is derived from software contributed to The NetBSD Foundation 8 * by Andrew Doran, and by Maxime Villard. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 29 * POSSIBILITY OF SUCH DAMAGE. 30 */ 31 32 /* 33 * Copyright (c) 2007 Manuel Bouyer. 34 * 35 * Redistribution and use in source and binary forms, with or without 36 * modification, are permitted provided that the following conditions 37 * are met: 38 * 1. Redistributions of source code must retain the above copyright 39 * notice, this list of conditions and the following disclaimer. 40 * 2. Redistributions in binary form must reproduce the above copyright 41 * notice, this list of conditions and the following disclaimer in the 42 * documentation and/or other materials provided with the distribution. 43 * 44 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 45 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 46 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 47 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 48 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 49 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 50 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 51 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 52 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 53 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 54 * 55 */ 56 57 /* 58 * Copyright (c) 2006 Mathieu Ropert <mro@adviseo.fr> 59 * 60 * Permission to use, copy, modify, and distribute this software for any 61 * purpose with or without fee is hereby granted, provided that the above 62 * copyright notice and this permission notice appear in all copies. 63 * 64 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 65 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 66 * MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 67 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 68 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 69 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 70 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 71 */ 72 73 /* 74 * Copyright (c) 1997 Charles D. Cranor and Washington University. 75 * All rights reserved. 76 * 77 * Redistribution and use in source and binary forms, with or without 78 * modification, are permitted provided that the following conditions 79 * are met: 80 * 1. Redistributions of source code must retain the above copyright 81 * notice, this list of conditions and the following disclaimer. 82 * 2. Redistributions in binary form must reproduce the above copyright 83 * notice, this list of conditions and the following disclaimer in the 84 * documentation and/or other materials provided with the distribution. 85 * 86 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 87 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 88 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 89 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 90 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 91 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 92 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 93 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 94 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 95 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 96 */ 97 98 /* 99 * Copyright 2001 (c) Wasabi Systems, Inc. 100 * All rights reserved. 101 * 102 * Written by Frank van der Linden for Wasabi Systems, Inc. 103 * 104 * Redistribution and use in source and binary forms, with or without 105 * modification, are permitted provided that the following conditions 106 * are met: 107 * 1. Redistributions of source code must retain the above copyright 108 * notice, this list of conditions and the following disclaimer. 109 * 2. Redistributions in binary form must reproduce the above copyright 110 * notice, this list of conditions and the following disclaimer in the 111 * documentation and/or other materials provided with the distribution. 112 * 3. All advertising materials mentioning features or use of this software 113 * must display the following acknowledgement: 114 * This product includes software developed for the NetBSD Project by 115 * Wasabi Systems, Inc. 116 * 4. The name of Wasabi Systems, Inc. may not be used to endorse 117 * or promote products derived from this software without specific prior 118 * written permission. 119 * 120 * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND 121 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 122 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 123 * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL WASABI SYSTEMS, INC 124 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 125 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 126 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 127 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 128 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 129 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 130 * POSSIBILITY OF SUCH DAMAGE. 131 */ 132 133 /* 134 * This is the i386 pmap modified and generalized to support x86-64 135 * as well. The idea is to hide the upper N levels of the page tables 136 * inside pmap_get_ptp, pmap_free_ptp and pmap_growkernel. The rest 137 * is mostly untouched, except that it uses some more generalized 138 * macros and interfaces. 139 * 140 * This pmap has been tested on the i386 as well, and it can be easily 141 * adapted to PAE. 142 * 143 * fvdl@wasabisystems.com 18-Jun-2001 144 */ 145 146 /* 147 * pmap.c: i386 pmap module rewrite 148 * Chuck Cranor <chuck@netbsd> 149 * 11-Aug-97 150 * 151 * history of this pmap module: in addition to my own input, i used 152 * the following references for this rewrite of the i386 pmap: 153 * 154 * [1] the NetBSD i386 pmap. this pmap appears to be based on the 155 * BSD hp300 pmap done by Mike Hibler at University of Utah. 156 * it was then ported to the i386 by William Jolitz of UUNET 157 * Technologies, Inc. Then Charles M. Hannum of the NetBSD 158 * project fixed some bugs and provided some speed ups. 159 * 160 * [2] the FreeBSD i386 pmap. this pmap seems to be the 161 * Hibler/Jolitz pmap, as modified for FreeBSD by John S. Dyson 162 * and David Greenman. 163 * 164 * [3] the Mach pmap. this pmap, from CMU, seems to have migrated 165 * between several processors. the VAX version was done by 166 * Avadis Tevanian, Jr., and Michael Wayne Young. the i386 167 * version was done by Lance Berc, Mike Kupfer, Bob Baron, 168 * David Golub, and Richard Draves. the alpha version was 169 * done by Alessandro Forin (CMU/Mach) and Chris Demetriou 170 * (NetBSD/alpha). 171 */ 172 173 #include <sys/cdefs.h> 174 __KERNEL_RCSID(0, "$NetBSD: pmap.c,v 1.240 2017/02/11 14:11:24 maxv Exp $"); 175 176 #include "opt_user_ldt.h" 177 #include "opt_lockdebug.h" 178 #include "opt_multiprocessor.h" 179 #include "opt_xen.h" 180 181 #include <sys/param.h> 182 #include <sys/systm.h> 183 #include <sys/proc.h> 184 #include <sys/pool.h> 185 #include <sys/kernel.h> 186 #include <sys/atomic.h> 187 #include <sys/cpu.h> 188 #include <sys/intr.h> 189 #include <sys/xcall.h> 190 #include <sys/kcore.h> 191 192 #include <uvm/uvm.h> 193 #include <uvm/pmap/pmap_pvt.h> 194 195 #include <dev/isa/isareg.h> 196 197 #include <machine/specialreg.h> 198 #include <machine/gdt.h> 199 #include <machine/isa_machdep.h> 200 #include <machine/cpuvar.h> 201 #include <machine/cputypes.h> 202 203 #include <x86/pmap.h> 204 #include <x86/pmap_pv.h> 205 206 #include <x86/i82489reg.h> 207 #include <x86/i82489var.h> 208 209 #ifdef XEN 210 #include <xen/xen-public/xen.h> 211 #include <xen/hypervisor.h> 212 #endif 213 214 /* 215 * general info: 216 * 217 * - for an explanation of how the i386 MMU hardware works see 218 * the comments in <machine/pte.h>. 219 * 220 * - for an explanation of the general memory structure used by 221 * this pmap (including the recursive mapping), see the comments 222 * in <machine/pmap.h>. 223 * 224 * this file contains the code for the "pmap module." 
the module's 225 * job is to manage the hardware's virtual to physical address mappings. 226 * note that there are two levels of mapping in the VM system: 227 * 228 * [1] the upper layer of the VM system uses vm_map's and vm_map_entry's 229 * to map ranges of virtual address space to objects/files. for 230 * example, the vm_map may say: "map VA 0x1000 to 0x22000 read-only 231 * to the file /bin/ls starting at offset zero." note that 232 * the upper layer mapping is not concerned with how individual 233 * vm_pages are mapped. 234 * 235 * [2] the lower layer of the VM system (the pmap) maintains the mappings 236 * from virtual addresses. it is concerned with which vm_page is 237 * mapped where. for example, when you run /bin/ls and start 238 * at page 0x1000 the fault routine may lookup the correct page 239 * of the /bin/ls file and then ask the pmap layer to establish 240 * a mapping for it. 241 * 242 * note that information in the lower layer of the VM system can be 243 * thrown away since it can easily be reconstructed from the info 244 * in the upper layer. 245 * 246 * data structures we use include: 247 * 248 * - struct pmap: describes the address space of one thread 249 * - struct pmap_page: describes one pv-tracked page, without 250 * necessarily a corresponding vm_page 251 * - struct pv_entry: describes one <PMAP,VA> mapping of a PA 252 * - struct pv_head: there is one pv_head per pv-tracked page of 253 * physical memory. the pv_head points to a list of pv_entry 254 * structures which describe all the <PMAP,VA> pairs that this 255 * page is mapped in. this is critical for page based operations 256 * such as pmap_page_protect() [change protection on _all_ mappings 257 * of a page] 258 */ 259 260 /* 261 * memory allocation 262 * 263 * - there are three data structures that we must dynamically allocate: 264 * 265 * [A] new process' page directory page (PDP) 266 * - plan 1: done at pmap_create() we use 267 * uvm_km_alloc(kernel_map, PAGE_SIZE) [fka kmem_alloc] to do this 268 * allocation. 269 * 270 * if we are low in free physical memory then we sleep in 271 * uvm_km_alloc -- in this case this is ok since we are creating 272 * a new pmap and should not be holding any locks. 273 * 274 * if the kernel is totally out of virtual space 275 * (i.e. uvm_km_alloc returns NULL), then we panic. 276 * 277 * [B] new page tables pages (PTP) 278 * - call uvm_pagealloc() 279 * => success: zero page, add to pm_pdir 280 * => failure: we are out of free vm_pages, let pmap_enter() 281 * tell UVM about it. 282 * 283 * note: for kernel PTPs, we start with NKPTP of them. as we map 284 * kernel memory (at uvm_map time) we check to see if we've grown 285 * the kernel pmap. if so, we call the optional function 286 * pmap_growkernel() to grow the kernel PTPs in advance. 287 * 288 * [C] pv_entry structures 289 */ 290 291 /* 292 * locking 293 * 294 * we have the following locks that we must contend with: 295 * 296 * mutexes: 297 * 298 * - pmap lock (per pmap, part of uvm_object) 299 * this lock protects the fields in the pmap structure including 300 * the non-kernel PDEs in the PDP, and the PTEs. it also locks 301 * in the alternate PTE space (since that is determined by the 302 * entry in the PDP). 303 * 304 * - pvh_lock (per pv_head) 305 * this lock protects the pv_entry list which is chained off the 306 * pv_head structure for a specific pv-tracked PA. it is locked 307 * when traversing the list (e.g. adding/removing mappings, 308 * syncing R/M bits, etc.) 
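 *     an illustrative sketch (not a real code path in this file) of the
 *     usual ordering, assuming a pv-tracked page is manipulated from a
 *     pmap operation (the pv hash spin locks taken by insert_pv() and
 *     pmap_remove_pv() below play the role of pvh_lock):
 *
 *         mutex_enter(pmap->pm_lock);           <- pmap lock first
 *         mutex_spin_enter(pvhash_lock(hash));  <- then the pv-side lock
 *         ... insert or remove the pv_entry ...
 *         mutex_spin_exit(pvhash_lock(hash));
 *         mutex_exit(pmap->pm_lock);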
309 * 310 * - pmaps_lock 311 * this lock protects the list of active pmaps (headed by "pmaps"). 312 * we lock it when adding or removing pmaps from this list. 313 */ 314 315 const vaddr_t ptp_masks[] = PTP_MASK_INITIALIZER; 316 const int ptp_shifts[] = PTP_SHIFT_INITIALIZER; 317 const long nkptpmax[] = NKPTPMAX_INITIALIZER; 318 const long nbpd[] = NBPD_INITIALIZER; 319 pd_entry_t * const normal_pdes[] = PDES_INITIALIZER; 320 321 long nkptp[] = NKPTP_INITIALIZER; 322 323 struct pmap_head pmaps; 324 kmutex_t pmaps_lock; 325 326 static vaddr_t pmap_maxkvaddr; 327 328 /* 329 * XXX kludge: dummy locking to make KASSERTs in uvm_page.c comfortable. 330 * actual locking is done by pm_lock. 331 */ 332 #if defined(DIAGNOSTIC) 333 #define PMAP_SUBOBJ_LOCK(pm, idx) \ 334 KASSERT(mutex_owned((pm)->pm_lock)); \ 335 if ((idx) != 0) \ 336 mutex_enter((pm)->pm_obj[(idx)].vmobjlock) 337 #define PMAP_SUBOBJ_UNLOCK(pm, idx) \ 338 KASSERT(mutex_owned((pm)->pm_lock)); \ 339 if ((idx) != 0) \ 340 mutex_exit((pm)->pm_obj[(idx)].vmobjlock) 341 #else /* defined(DIAGNOSTIC) */ 342 #define PMAP_SUBOBJ_LOCK(pm, idx) /* nothing */ 343 #define PMAP_SUBOBJ_UNLOCK(pm, idx) /* nothing */ 344 #endif /* defined(DIAGNOSTIC) */ 345 346 /* 347 * Misc. event counters. 348 */ 349 struct evcnt pmap_iobmp_evcnt; 350 struct evcnt pmap_ldt_evcnt; 351 352 /* 353 * PAT 354 */ 355 #define PATENTRY(n, type) (type << ((n) * 8)) 356 #define PAT_UC 0x0ULL 357 #define PAT_WC 0x1ULL 358 #define PAT_WT 0x4ULL 359 #define PAT_WP 0x5ULL 360 #define PAT_WB 0x6ULL 361 #define PAT_UCMINUS 0x7ULL 362 363 static bool cpu_pat_enabled __read_mostly = false; 364 365 /* 366 * Global data structures 367 */ 368 369 static struct pmap kernel_pmap_store; /* the kernel's pmap (proc0) */ 370 struct pmap *const kernel_pmap_ptr = &kernel_pmap_store; 371 372 /* 373 * pmap_pg_nx: if our processor supports PG_NX in the PTE then we 374 * set pmap_pg_nx to PG_NX (otherwise it is zero). 375 */ 376 pd_entry_t pmap_pg_nx __read_mostly = 0; 377 378 /* 379 * pmap_pg_g: if our processor supports PG_G in the PTE then we 380 * set pmap_pg_g to PG_G (otherwise it is zero). 381 */ 382 pd_entry_t pmap_pg_g __read_mostly = 0; 383 384 /* 385 * pmap_largepages: if our processor supports PG_PS and we are 386 * using it, this is set to true. 387 */ 388 int pmap_largepages __read_mostly = 0; 389 390 /* 391 * i386 physical memory comes in a big contig chunk with a small 392 * hole toward the front of it... the following two paddr_t's 393 * (shared with machdep.c) describe the physical address space 394 * of this machine. 
395 */ 396 paddr_t lowmem_rsvd __read_mostly; 397 paddr_t avail_start __read_mostly; /* PA of first available physical page */ 398 paddr_t avail_end __read_mostly; /* PA of last available physical page */ 399 400 #ifdef XEN 401 paddr_t pmap_pa_start; /* PA of first physical page for this domain */ 402 paddr_t pmap_pa_end; /* PA of last physical page for this domain */ 403 #endif 404 405 #define VM_PAGE_TO_PP(pg) (&(pg)->mdpage.mp_pp) 406 407 #define PV_HASH_SIZE 32768 408 #define PV_HASH_LOCK_CNT 32 409 410 struct pv_hash_lock { 411 kmutex_t lock; 412 } __aligned(CACHE_LINE_SIZE) pv_hash_locks[PV_HASH_LOCK_CNT] 413 __aligned(CACHE_LINE_SIZE); 414 415 struct pv_hash_head { 416 SLIST_HEAD(, pv_entry) hh_list; 417 } pv_hash_heads[PV_HASH_SIZE]; 418 419 static u_int 420 pvhash_hash(struct vm_page *ptp, vaddr_t va) 421 { 422 423 return (uintptr_t)ptp / sizeof(*ptp) + (va >> PAGE_SHIFT); 424 } 425 426 static struct pv_hash_head * 427 pvhash_head(u_int hash) 428 { 429 430 return &pv_hash_heads[hash % PV_HASH_SIZE]; 431 } 432 433 static kmutex_t * 434 pvhash_lock(u_int hash) 435 { 436 437 return &pv_hash_locks[hash % PV_HASH_LOCK_CNT].lock; 438 } 439 440 static struct pv_entry * 441 pvhash_remove(struct pv_hash_head *hh, struct vm_page *ptp, vaddr_t va) 442 { 443 struct pv_entry *pve; 444 struct pv_entry *prev; 445 446 prev = NULL; 447 SLIST_FOREACH(pve, &hh->hh_list, pve_hash) { 448 if (pve->pve_pte.pte_ptp == ptp && 449 pve->pve_pte.pte_va == va) { 450 if (prev != NULL) { 451 SLIST_REMOVE_AFTER(prev, pve_hash); 452 } else { 453 SLIST_REMOVE_HEAD(&hh->hh_list, pve_hash); 454 } 455 break; 456 } 457 prev = pve; 458 } 459 return pve; 460 } 461 462 /* 463 * Other data structures 464 */ 465 466 static pt_entry_t protection_codes[8] __read_mostly; 467 468 static bool pmap_initialized __read_mostly = false; /* pmap_init done yet? */ 469 470 /* 471 * The following two vaddr_t's are used during system startup to keep track of 472 * how much of the kernel's VM space we have used. Once the system is started, 473 * the management of the remaining kernel VM space is turned over to the 474 * kernel_map vm_map. 475 */ 476 static vaddr_t virtual_avail __read_mostly; /* VA of first free KVA */ 477 static vaddr_t virtual_end __read_mostly; /* VA of last free KVA */ 478 479 #ifndef XEN 480 /* 481 * LAPIC virtual address, and fake physical address. 
482 */ 483 volatile vaddr_t local_apic_va __read_mostly; 484 paddr_t local_apic_pa __read_mostly; 485 #endif 486 487 /* 488 * pool that pmap structures are allocated from 489 */ 490 static struct pool_cache pmap_cache; 491 492 /* 493 * pv_entry cache 494 */ 495 static struct pool_cache pmap_pv_cache; 496 497 #ifndef __HAVE_DIRECT_MAP 498 /* 499 * Special VAs and the PTEs that map them 500 */ 501 static pt_entry_t *early_zero_pte; 502 static void pmap_vpage_cpualloc(struct cpu_info *); 503 #ifdef XEN 504 char *early_zerop; /* also referenced from xen_locore() */ 505 #else 506 static char *early_zerop; 507 #endif 508 #endif 509 510 int pmap_enter_default(pmap_t, vaddr_t, paddr_t, vm_prot_t, u_int); 511 512 /* PDP pool_cache(9) and its callbacks */ 513 struct pool_cache pmap_pdp_cache; 514 static int pmap_pdp_ctor(void *, void *, int); 515 static void pmap_pdp_dtor(void *, void *); 516 #ifdef PAE 517 /* need to allocate items of 4 pages */ 518 static void *pmap_pdp_alloc(struct pool *, int); 519 static void pmap_pdp_free(struct pool *, void *); 520 static struct pool_allocator pmap_pdp_allocator = { 521 .pa_alloc = pmap_pdp_alloc, 522 .pa_free = pmap_pdp_free, 523 .pa_pagesz = PAGE_SIZE * PDP_SIZE, 524 }; 525 #endif /* PAE */ 526 527 extern vaddr_t idt_vaddr; 528 extern paddr_t idt_paddr; 529 extern vaddr_t gdt_vaddr; 530 extern paddr_t gdt_paddr; 531 extern vaddr_t ldt_vaddr; 532 extern paddr_t ldt_paddr; 533 534 extern int end; 535 536 #ifdef i386 537 /* stuff to fix the pentium f00f bug */ 538 extern vaddr_t pentium_idt_vaddr; 539 #endif 540 541 /* 542 * Local prototypes 543 */ 544 545 #ifdef __HAVE_DIRECT_MAP 546 static void pmap_init_directmap(struct pmap *); 547 #endif 548 #ifndef XEN 549 static void pmap_init_lapic(void); 550 static void pmap_remap_largepages(void); 551 #endif 552 553 static struct vm_page *pmap_get_ptp(struct pmap *, vaddr_t, 554 pd_entry_t * const *); 555 static struct vm_page *pmap_find_ptp(struct pmap *, vaddr_t, paddr_t, int); 556 static void pmap_freepage(struct pmap *, struct vm_page *, int); 557 static void pmap_free_ptp(struct pmap *, struct vm_page *, vaddr_t, 558 pt_entry_t *, pd_entry_t * const *); 559 static bool pmap_remove_pte(struct pmap *, struct vm_page *, pt_entry_t *, 560 vaddr_t, struct pv_entry **); 561 static void pmap_remove_ptes(struct pmap *, struct vm_page *, vaddr_t, vaddr_t, 562 vaddr_t, struct pv_entry **); 563 564 static paddr_t pmap_get_physpage(void); 565 static void pmap_alloc_level(vaddr_t, long *); 566 567 static bool pmap_reactivate(struct pmap *); 568 569 /* 570 * p m a p h e l p e r f u n c t i o n s 571 */ 572 573 static inline void 574 pmap_stats_update(struct pmap *pmap, int resid_diff, int wired_diff) 575 { 576 577 if (pmap == pmap_kernel()) { 578 atomic_add_long(&pmap->pm_stats.resident_count, resid_diff); 579 atomic_add_long(&pmap->pm_stats.wired_count, wired_diff); 580 } else { 581 KASSERT(mutex_owned(pmap->pm_lock)); 582 pmap->pm_stats.resident_count += resid_diff; 583 pmap->pm_stats.wired_count += wired_diff; 584 } 585 } 586 587 static inline void 588 pmap_stats_update_bypte(struct pmap *pmap, pt_entry_t npte, pt_entry_t opte) 589 { 590 int resid_diff = ((npte & PG_V) ? 1 : 0) - ((opte & PG_V) ? 1 : 0); 591 int wired_diff = ((npte & PG_W) ? 1 : 0) - ((opte & PG_W) ? 
1 : 0); 592 593 KASSERT((npte & (PG_V | PG_W)) != PG_W); 594 KASSERT((opte & (PG_V | PG_W)) != PG_W); 595 596 pmap_stats_update(pmap, resid_diff, wired_diff); 597 } 598 599 /* 600 * ptp_to_pmap: lookup pmap by ptp 601 */ 602 603 static struct pmap * 604 ptp_to_pmap(struct vm_page *ptp) 605 { 606 struct pmap *pmap; 607 608 if (ptp == NULL) { 609 return pmap_kernel(); 610 } 611 pmap = (struct pmap *)ptp->uobject; 612 KASSERT(pmap != NULL); 613 KASSERT(&pmap->pm_obj[0] == ptp->uobject); 614 return pmap; 615 } 616 617 static inline struct pv_pte * 618 pve_to_pvpte(struct pv_entry *pve) 619 { 620 621 KASSERT((void *)&pve->pve_pte == (void *)pve); 622 return &pve->pve_pte; 623 } 624 625 static inline struct pv_entry * 626 pvpte_to_pve(struct pv_pte *pvpte) 627 { 628 struct pv_entry *pve = (void *)pvpte; 629 630 KASSERT(pve_to_pvpte(pve) == pvpte); 631 return pve; 632 } 633 634 /* 635 * pv_pte_first, pv_pte_next: PV list iterator. 636 */ 637 638 static struct pv_pte * 639 pv_pte_first(struct pmap_page *pp) 640 { 641 642 if ((pp->pp_flags & PP_EMBEDDED) != 0) { 643 return &pp->pp_pte; 644 } 645 return pve_to_pvpte(LIST_FIRST(&pp->pp_head.pvh_list)); 646 } 647 648 static struct pv_pte * 649 pv_pte_next(struct pmap_page *pp, struct pv_pte *pvpte) 650 { 651 652 KASSERT(pvpte != NULL); 653 if (pvpte == &pp->pp_pte) { 654 KASSERT((pp->pp_flags & PP_EMBEDDED) != 0); 655 return NULL; 656 } 657 KASSERT((pp->pp_flags & PP_EMBEDDED) == 0); 658 return pve_to_pvpte(LIST_NEXT(pvpte_to_pve(pvpte), pve_list)); 659 } 660 661 /* 662 * pmap_is_curpmap: is this pmap the one currently loaded [in %cr3]? 663 * of course the kernel is always loaded 664 */ 665 666 bool 667 pmap_is_curpmap(struct pmap *pmap) 668 { 669 return((pmap == pmap_kernel()) || 670 (pmap == curcpu()->ci_pmap)); 671 } 672 673 /* 674 * Add a reference to the specified pmap. 675 */ 676 677 void 678 pmap_reference(struct pmap *pmap) 679 { 680 681 atomic_inc_uint(&pmap->pm_obj[0].uo_refs); 682 } 683 684 /* 685 * pmap_map_ptes: map a pmap's PTEs into KVM and lock them in 686 * 687 * there are several pmaps involved. some or all of them might be same. 688 * 689 * - the pmap given by the first argument 690 * our caller wants to access this pmap's PTEs. 691 * 692 * - pmap_kernel() 693 * the kernel pmap. note that it only contains the kernel part 694 * of the address space which is shared by any pmap. ie. any 695 * pmap can be used instead of pmap_kernel() for our purpose. 696 * 697 * - ci->ci_pmap 698 * pmap currently loaded on the cpu. 699 * 700 * - vm_map_pmap(&curproc->p_vmspace->vm_map) 701 * current process' pmap. 702 * 703 * => we lock enough pmaps to keep things locked in 704 * => must be undone with pmap_unmap_ptes before returning 705 */ 706 707 void 708 pmap_map_ptes(struct pmap *pmap, struct pmap **pmap2, 709 pd_entry_t **ptepp, pd_entry_t * const **pdeppp) 710 { 711 struct pmap *curpmap; 712 struct cpu_info *ci; 713 lwp_t *l; 714 715 /* The kernel's pmap is always accessible. */ 716 if (pmap == pmap_kernel()) { 717 *pmap2 = NULL; 718 *ptepp = PTE_BASE; 719 *pdeppp = normal_pdes; 720 return; 721 } 722 KASSERT(kpreempt_disabled()); 723 724 l = curlwp; 725 retry: 726 mutex_enter(pmap->pm_lock); 727 ci = curcpu(); 728 curpmap = ci->ci_pmap; 729 if (vm_map_pmap(&l->l_proc->p_vmspace->vm_map) == pmap) { 730 /* Our own pmap so just load it: easy. 
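 * If a deferred pmap_load() is still pending for this CPU we perform
 * it first and retry, so that the PTEs mapped below belong to the
 * pmap that is actually loaded in %cr3.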
*/ 731 if (__predict_false(ci->ci_want_pmapload)) { 732 mutex_exit(pmap->pm_lock); 733 pmap_load(); 734 goto retry; 735 } 736 KASSERT(pmap == curpmap); 737 } else if (pmap == curpmap) { 738 /* 739 * Already on the CPU: make it valid. This is very 740 * often the case during exit(), when we have switched 741 * to the kernel pmap in order to destroy a user pmap. 742 */ 743 if (!pmap_reactivate(pmap)) { 744 u_int gen = uvm_emap_gen_return(); 745 tlbflush(); 746 uvm_emap_update(gen); 747 } 748 } else { 749 /* 750 * Toss current pmap from CPU, but keep a reference to it. 751 * The reference will be dropped by pmap_unmap_ptes(). 752 * Can happen if we block during exit(). 753 */ 754 const cpuid_t cid = cpu_index(ci); 755 756 kcpuset_atomic_clear(curpmap->pm_cpus, cid); 757 kcpuset_atomic_clear(curpmap->pm_kernel_cpus, cid); 758 ci->ci_pmap = pmap; 759 ci->ci_tlbstate = TLBSTATE_VALID; 760 kcpuset_atomic_set(pmap->pm_cpus, cid); 761 kcpuset_atomic_set(pmap->pm_kernel_cpus, cid); 762 cpu_load_pmap(pmap, curpmap); 763 } 764 pmap->pm_ncsw = l->l_ncsw; 765 *pmap2 = curpmap; 766 *ptepp = PTE_BASE; 767 #if defined(XEN) && defined(__x86_64__) 768 KASSERT(ci->ci_normal_pdes[PTP_LEVELS - 2] == L4_BASE); 769 ci->ci_normal_pdes[PTP_LEVELS - 2] = pmap->pm_pdir; 770 *pdeppp = ci->ci_normal_pdes; 771 #else /* XEN && __x86_64__ */ 772 *pdeppp = normal_pdes; 773 #endif /* XEN && __x86_64__ */ 774 } 775 776 /* 777 * pmap_unmap_ptes: unlock the PTE mapping of "pmap" 778 */ 779 780 void 781 pmap_unmap_ptes(struct pmap *pmap, struct pmap *pmap2) 782 { 783 struct cpu_info *ci; 784 struct pmap *mypmap; 785 786 KASSERT(kpreempt_disabled()); 787 788 /* The kernel's pmap is always accessible. */ 789 if (pmap == pmap_kernel()) { 790 return; 791 } 792 793 ci = curcpu(); 794 #if defined(XEN) && defined(__x86_64__) 795 /* Reset per-cpu normal_pdes */ 796 KASSERT(ci->ci_normal_pdes[PTP_LEVELS - 2] != L4_BASE); 797 ci->ci_normal_pdes[PTP_LEVELS - 2] = L4_BASE; 798 #endif /* XEN && __x86_64__ */ 799 /* 800 * We cannot tolerate context switches while mapped in. 801 * If it is our own pmap all we have to do is unlock. 802 */ 803 KASSERT(pmap->pm_ncsw == curlwp->l_ncsw); 804 mypmap = vm_map_pmap(&curproc->p_vmspace->vm_map); 805 if (pmap == mypmap) { 806 mutex_exit(pmap->pm_lock); 807 return; 808 } 809 810 /* 811 * Mark whatever's on the CPU now as lazy and unlock. 812 * If the pmap was already installed, we are done. 813 */ 814 ci->ci_tlbstate = TLBSTATE_LAZY; 815 ci->ci_want_pmapload = (mypmap != pmap_kernel()); 816 mutex_exit(pmap->pm_lock); 817 if (pmap == pmap2) { 818 return; 819 } 820 821 /* 822 * We installed another pmap on the CPU. Grab a reference to 823 * it and leave in place. Toss the evicted pmap (can block). 824 */ 825 pmap_reference(pmap); 826 pmap_destroy(pmap2); 827 } 828 829 830 inline static void 831 pmap_exec_account(struct pmap *pm, vaddr_t va, pt_entry_t opte, pt_entry_t npte) 832 { 833 834 #if !defined(__x86_64__) 835 if (curproc == NULL || curproc->p_vmspace == NULL || 836 pm != vm_map_pmap(&curproc->p_vmspace->vm_map)) 837 return; 838 839 if ((opte ^ npte) & PG_X) 840 pmap_update_pg(va); 841 842 /* 843 * Executability was removed on the last executable change. 844 * Reset the code segment to something conservative and 845 * let the trap handler deal with setting the right limit. 846 * We can't do that because of locking constraints on the vm map. 
847 */ 848 849 if ((opte & PG_X) && (npte & PG_X) == 0 && va == pm->pm_hiexec) { 850 struct trapframe *tf = curlwp->l_md.md_regs; 851 852 tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL); 853 pm->pm_hiexec = I386_MAX_EXE_ADDR; 854 } 855 #endif /* !defined(__x86_64__) */ 856 } 857 858 #if !defined(__x86_64__) 859 /* 860 * Fixup the code segment to cover all potential executable mappings. 861 * returns 0 if no changes to the code segment were made. 862 */ 863 864 int 865 pmap_exec_fixup(struct vm_map *map, struct trapframe *tf, struct pcb *pcb) 866 { 867 struct vm_map_entry *ent; 868 struct pmap *pm = vm_map_pmap(map); 869 vaddr_t va = 0; 870 871 vm_map_lock_read(map); 872 for (ent = (&map->header)->next; ent != &map->header; ent = ent->next) { 873 874 /* 875 * This entry has greater va than the entries before. 876 * We need to make it point to the last page, not past it. 877 */ 878 879 if (ent->protection & VM_PROT_EXECUTE) 880 va = trunc_page(ent->end) - PAGE_SIZE; 881 } 882 vm_map_unlock_read(map); 883 if (va == pm->pm_hiexec && tf->tf_cs == GSEL(GUCODEBIG_SEL, SEL_UPL)) 884 return (0); 885 886 pm->pm_hiexec = va; 887 if (pm->pm_hiexec > I386_MAX_EXE_ADDR) { 888 tf->tf_cs = GSEL(GUCODEBIG_SEL, SEL_UPL); 889 } else { 890 tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL); 891 return (0); 892 } 893 return (1); 894 } 895 #endif /* !defined(__x86_64__) */ 896 897 void 898 pat_init(struct cpu_info *ci) 899 { 900 uint64_t pat; 901 902 if (!(ci->ci_feat_val[0] & CPUID_PAT)) 903 return; 904 905 /* We change WT to WC. Leave all other entries the default values. */ 906 pat = PATENTRY(0, PAT_WB) | PATENTRY(1, PAT_WC) | 907 PATENTRY(2, PAT_UCMINUS) | PATENTRY(3, PAT_UC) | 908 PATENTRY(4, PAT_WB) | PATENTRY(5, PAT_WC) | 909 PATENTRY(6, PAT_UCMINUS) | PATENTRY(7, PAT_UC); 910 911 wrmsr(MSR_CR_PAT, pat); 912 cpu_pat_enabled = true; 913 aprint_debug_dev(ci->ci_dev, "PAT enabled\n"); 914 } 915 916 static pt_entry_t 917 pmap_pat_flags(u_int flags) 918 { 919 u_int cacheflags = (flags & PMAP_CACHE_MASK); 920 921 if (!cpu_pat_enabled) { 922 switch (cacheflags) { 923 case PMAP_NOCACHE: 924 case PMAP_NOCACHE_OVR: 925 /* results in PGC_UCMINUS on cpus which have 926 * the cpuid PAT but PAT "disabled" 927 */ 928 return PG_N; 929 default: 930 return 0; 931 } 932 } 933 934 switch (cacheflags) { 935 case PMAP_NOCACHE: 936 return PGC_UC; 937 case PMAP_WRITE_COMBINE: 938 return PGC_WC; 939 case PMAP_WRITE_BACK: 940 return PGC_WB; 941 case PMAP_NOCACHE_OVR: 942 return PGC_UCMINUS; 943 } 944 945 return 0; 946 } 947 948 /* 949 * p m a p k e n t e r f u n c t i o n s 950 * 951 * functions to quickly enter/remove pages from the kernel address 952 * space. pmap_kremove is exported to MI kernel. we make use of 953 * the recursive PTE mappings. 
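 *
 * a typical caller pattern (an illustrative sketch only, not code taken
 * from this file) looks like:
 *
 *     va = uvm_km_alloc(kernel_map, PAGE_SIZE, 0, UVM_KMF_VAONLY);
 *     pmap_kenter_pa(va, pa, VM_PROT_READ | VM_PROT_WRITE, 0);
 *     pmap_update(pmap_kernel());
 *     ... use the mapping at va ...
 *     pmap_kremove(va, PAGE_SIZE);
 *     pmap_update(pmap_kernel());
 *     uvm_km_free(kernel_map, va, PAGE_SIZE, UVM_KMF_VAONLY);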
954 */
955
956 /*
957 * pmap_kenter_pa: enter a kernel mapping without R/M (pv_entry) tracking
958 *
959 * => no need to lock anything, assume va is already allocated
960 * => should be faster than normal pmap enter function
961 */
962
963 void
964 pmap_kenter_pa(vaddr_t va, paddr_t pa, vm_prot_t prot, u_int flags)
965 {
966 pt_entry_t *pte, opte, npte;
967
968 KASSERT(!(prot & ~VM_PROT_ALL));
969
970 if (va < VM_MIN_KERNEL_ADDRESS)
971 pte = vtopte(va);
972 else
973 pte = kvtopte(va);
974 #ifdef DOM0OPS
975 if (pa < pmap_pa_start || pa >= pmap_pa_end) {
976 #ifdef DEBUG
977 printf_nolog("%s: pa 0x%" PRIx64 " for va 0x%" PRIx64
978 " outside range\n", __func__, (int64_t)pa, (int64_t)va);
979 #endif /* DEBUG */
980 npte = pa;
981 } else
982 #endif /* DOM0OPS */
983 npte = pmap_pa2pte(pa);
984 npte |= protection_codes[prot] | PG_k | PG_V | pmap_pg_g;
985 npte |= pmap_pat_flags(flags);
986 opte = pmap_pte_testset(pte, npte); /* zap! */
987 #if defined(DIAGNOSTIC)
988 /*
989 * XXX: make sure we are not dealing with a large page, since the only
990 * large pages created are for the kernel image, and they should never
991 * be kentered.
992 */
993 if (opte & PG_PS)
994 panic("%s: PG_PS", __func__);
995 #endif
996 if ((opte & (PG_V | PG_U)) == (PG_V | PG_U)) {
997 /* This should not happen. */
998 printf_nolog("%s: mapping already present\n", __func__);
999 kpreempt_disable();
1000 pmap_tlb_shootdown(pmap_kernel(), va, opte, TLBSHOOT_KENTER);
1001 kpreempt_enable();
1002 }
1003 }
1004
1005 void
1006 pmap_emap_enter(vaddr_t va, paddr_t pa, vm_prot_t prot)
1007 {
1008 pt_entry_t *pte, npte;
1009
1010 KASSERT((prot & ~VM_PROT_ALL) == 0);
1011 pte = (va < VM_MIN_KERNEL_ADDRESS) ? vtopte(va) : kvtopte(va);
1012
1013 #ifdef DOM0OPS
1014 if (pa < pmap_pa_start || pa >= pmap_pa_end) {
1015 npte = pa;
1016 } else
1017 #endif
1018 npte = pmap_pa2pte(pa);
1019
1020
1021 npte |= protection_codes[prot] | PG_k | PG_V;
1022 pmap_pte_set(pte, npte);
1023 }
1024
1025 /*
1026 * pmap_emap_sync: perform TLB flush or pmap load, if it was deferred.
1027 */
1028 void
1029 pmap_emap_sync(bool canload)
1030 {
1031 struct cpu_info *ci = curcpu();
1032 struct pmap *pmap;
1033
1034 KASSERT(kpreempt_disabled());
1035 if (__predict_true(ci->ci_want_pmapload && canload)) {
1036 /*
1037 * XXX: Hint for pmap_reactivate(), which might suggest to
1038 * not perform TLB flush, if state has not changed.
1039 */
1040 pmap = vm_map_pmap(&curlwp->l_proc->p_vmspace->vm_map);
1041 if (__predict_false(pmap == ci->ci_pmap)) {
1042 kcpuset_atomic_clear(pmap->pm_cpus, cpu_index(ci));
1043 }
1044 pmap_load();
1045 KASSERT(ci->ci_want_pmapload == 0);
1046 } else {
1047 tlbflush();
1048 }
1049 }
1050
1051 void
1052 pmap_emap_remove(vaddr_t sva, vsize_t len)
1053 {
1054 pt_entry_t *pte;
1055 vaddr_t va, eva = sva + len;
1056
1057 for (va = sva; va < eva; va += PAGE_SIZE) {
1058 pte = (va < VM_MIN_KERNEL_ADDRESS) ? vtopte(va) : kvtopte(va);
1059 pmap_pte_set(pte, 0);
1060 }
1061 }
1062
1063 __strict_weak_alias(pmap_kenter_ma, pmap_kenter_pa);
1064
1065 #if defined(__x86_64__)
1066 /*
1067 * Change protection for a virtual address. Local for a CPU only, don't
1068 * care about TLB shootdowns.
1069 * 1070 * => must be called with preemption disabled 1071 */ 1072 void 1073 pmap_changeprot_local(vaddr_t va, vm_prot_t prot) 1074 { 1075 pt_entry_t *pte, opte, npte; 1076 1077 KASSERT(kpreempt_disabled()); 1078 1079 if (va < VM_MIN_KERNEL_ADDRESS) 1080 pte = vtopte(va); 1081 else 1082 pte = kvtopte(va); 1083 1084 npte = opte = *pte; 1085 1086 if ((prot & VM_PROT_WRITE) != 0) 1087 npte |= PG_RW; 1088 else 1089 npte &= ~PG_RW; 1090 1091 if (opte != npte) { 1092 pmap_pte_set(pte, npte); 1093 pmap_pte_flush(); 1094 invlpg(va); 1095 } 1096 } 1097 #endif /* defined(__x86_64__) */ 1098 1099 /* 1100 * pmap_kremove: remove a kernel mapping(s) without R/M (pv_entry) tracking 1101 * 1102 * => no need to lock anything 1103 * => caller must dispose of any vm_page mapped in the va range 1104 * => note: not an inline function 1105 * => we assume the va is page aligned and the len is a multiple of PAGE_SIZE 1106 * => we assume kernel only unmaps valid addresses and thus don't bother 1107 * checking the valid bit before doing TLB flushing 1108 * => must be followed by call to pmap_update() before reuse of page 1109 */ 1110 1111 static inline void 1112 pmap_kremove1(vaddr_t sva, vsize_t len, bool localonly) 1113 { 1114 pt_entry_t *pte, opte; 1115 vaddr_t va, eva; 1116 1117 eva = sva + len; 1118 1119 kpreempt_disable(); 1120 for (va = sva; va < eva; va += PAGE_SIZE) { 1121 pte = kvtopte(va); 1122 opte = pmap_pte_testset(pte, 0); /* zap! */ 1123 if ((opte & (PG_V | PG_U)) == (PG_V | PG_U) && !localonly) { 1124 pmap_tlb_shootdown(pmap_kernel(), va, opte, 1125 TLBSHOOT_KREMOVE); 1126 } 1127 KASSERT((opte & PG_PS) == 0); 1128 KASSERT((opte & PG_PVLIST) == 0); 1129 } 1130 if (localonly) { 1131 tlbflushg(); 1132 } 1133 kpreempt_enable(); 1134 } 1135 1136 void 1137 pmap_kremove(vaddr_t sva, vsize_t len) 1138 { 1139 1140 pmap_kremove1(sva, len, false); 1141 } 1142 1143 /* 1144 * pmap_kremove_local: like pmap_kremove(), but only worry about 1145 * TLB invalidations on the current CPU. this is only intended 1146 * for use while writing kernel crash dumps. 1147 */ 1148 1149 void 1150 pmap_kremove_local(vaddr_t sva, vsize_t len) 1151 { 1152 1153 KASSERT(panicstr != NULL); 1154 pmap_kremove1(sva, len, true); 1155 } 1156 1157 /* 1158 * p m a p i n i t f u n c t i o n s 1159 * 1160 * pmap_bootstrap and pmap_init are called during system startup 1161 * to init the pmap module. pmap_bootstrap() does a low level 1162 * init just to get things rolling. pmap_init() finishes the job. 1163 */ 1164 1165 /* 1166 * pmap_bootstrap_valloc: allocate a virtual address in the bootstrap area. 1167 * This function is to be used before any VM system has been set up. 1168 * 1169 * The va is taken from virtual_avail. 1170 */ 1171 static vaddr_t 1172 pmap_bootstrap_valloc(size_t npages) 1173 { 1174 vaddr_t va = virtual_avail; 1175 virtual_avail += npages * PAGE_SIZE; 1176 return va; 1177 } 1178 1179 /* 1180 * pmap_bootstrap_palloc: allocate a physical address in the bootstrap area. 1181 * This function is to be used before any VM system has been set up. 1182 * 1183 * The pa is taken from avail_start. 1184 */ 1185 static paddr_t 1186 pmap_bootstrap_palloc(size_t npages) 1187 { 1188 paddr_t pa = avail_start; 1189 avail_start += npages * PAGE_SIZE; 1190 return pa; 1191 } 1192 1193 /* 1194 * pmap_bootstrap: get the system in a state where it can run with VM properly 1195 * enabled (called before main()). The VM system is fully init'd later. 
1196 * 1197 * => on i386, locore.S has already enabled the MMU by allocating a PDP for the 1198 * kernel, and nkpde PTP's for the kernel. 1199 * => kva_start is the first free virtual address in kernel space. 1200 */ 1201 void 1202 pmap_bootstrap(vaddr_t kva_start) 1203 { 1204 struct pmap *kpm; 1205 int i; 1206 vaddr_t kva; 1207 #ifndef XEN 1208 unsigned long p1i; 1209 vaddr_t kva_end; 1210 #endif 1211 1212 pmap_pg_nx = (cpu_feature[2] & CPUID_NOX ? PG_NX : 0); 1213 1214 /* 1215 * Set up our local static global vars that keep track of the usage of 1216 * KVM before kernel_map is set up. 1217 */ 1218 virtual_avail = kva_start; /* first free KVA */ 1219 virtual_end = VM_MAX_KERNEL_ADDRESS; /* last KVA */ 1220 1221 /* 1222 * Set up protection_codes: we need to be able to convert from a MI 1223 * protection code (some combo of VM_PROT...) to something we can jam 1224 * into a x86 PTE. 1225 */ 1226 protection_codes[VM_PROT_NONE] = pmap_pg_nx; 1227 protection_codes[VM_PROT_EXECUTE] = PG_RO | PG_X; 1228 protection_codes[VM_PROT_READ] = PG_RO | pmap_pg_nx; 1229 protection_codes[VM_PROT_READ|VM_PROT_EXECUTE] = PG_RO | PG_X; 1230 protection_codes[VM_PROT_WRITE] = PG_RW | pmap_pg_nx; 1231 protection_codes[VM_PROT_WRITE|VM_PROT_EXECUTE] = PG_RW | PG_X; 1232 protection_codes[VM_PROT_WRITE|VM_PROT_READ] = PG_RW | pmap_pg_nx; 1233 protection_codes[VM_PROT_ALL] = PG_RW | PG_X; 1234 1235 /* 1236 * Now we init the kernel's pmap. 1237 * 1238 * The kernel pmap's pm_obj is not used for much. However, in user pmaps 1239 * the pm_obj contains the list of active PTPs. 1240 * 1241 * The pm_obj currently does not have a pager. It might be possible to 1242 * add a pager that would allow a process to read-only mmap its own page 1243 * tables (fast user-level vtophys?). This may or may not be useful. 1244 */ 1245 kpm = pmap_kernel(); 1246 for (i = 0; i < PTP_LEVELS - 1; i++) { 1247 mutex_init(&kpm->pm_obj_lock[i], MUTEX_DEFAULT, IPL_NONE); 1248 uvm_obj_init(&kpm->pm_obj[i], NULL, false, 1); 1249 uvm_obj_setlock(&kpm->pm_obj[i], &kpm->pm_obj_lock[i]); 1250 kpm->pm_ptphint[i] = NULL; 1251 } 1252 memset(&kpm->pm_list, 0, sizeof(kpm->pm_list)); /* pm_list not used */ 1253 1254 kpm->pm_pdir = (pd_entry_t *)(PDPpaddr + KERNBASE); 1255 for (i = 0; i < PDP_SIZE; i++) 1256 kpm->pm_pdirpa[i] = PDPpaddr + PAGE_SIZE * i; 1257 1258 kpm->pm_stats.wired_count = kpm->pm_stats.resident_count = 1259 x86_btop(kva_start - VM_MIN_KERNEL_ADDRESS); 1260 1261 kcpuset_create(&kpm->pm_cpus, true); 1262 kcpuset_create(&kpm->pm_kernel_cpus, true); 1263 1264 /* 1265 * the above is just a rough estimate and not critical to the proper 1266 * operation of the system. 1267 */ 1268 1269 #ifndef XEN 1270 /* 1271 * Begin to enable global TLB entries if they are supported. 1272 * The G bit has no effect until the CR4_PGE bit is set in CR4, 1273 * which happens in cpu_init(), which is run on each cpu 1274 * (and happens later) 1275 */ 1276 if (cpu_feature[0] & CPUID_PGE) { 1277 pmap_pg_g = PG_G; /* enable software */ 1278 1279 /* add PG_G attribute to already mapped kernel pages */ 1280 1281 if (KERNBASE == VM_MIN_KERNEL_ADDRESS) { 1282 /* i386 only */ 1283 kva_end = virtual_avail; 1284 } else { 1285 /* amd64 only */ 1286 extern vaddr_t kern_end; 1287 kva_end = kern_end; 1288 } 1289 1290 for (kva = KERNBASE; kva < kva_end; kva += PAGE_SIZE) { 1291 p1i = pl1_i(kva); 1292 if (pmap_valid_entry(PTE_BASE[p1i])) 1293 PTE_BASE[p1i] |= PG_G; 1294 } 1295 } 1296 1297 /* 1298 * Enable large pages if they are supported. 
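 * (these are the PG_PS mappings: 4MB pages on plain i386, 2MB pages
 *  with PAE or on amd64; the kernel image is then remapped with them
 *  in pmap_remap_largepages() below)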
1299 */ 1300 if (cpu_feature[0] & CPUID_PSE) { 1301 lcr4(rcr4() | CR4_PSE); /* enable hardware (via %cr4) */ 1302 pmap_largepages = 1; /* enable software */ 1303 1304 /* 1305 * The TLB must be flushed after enabling large pages on Pentium 1306 * CPUs, according to section 3.6.2.2 of "Intel Architecture 1307 * Software Developer's Manual, Volume 3: System Programming". 1308 */ 1309 tlbflushg(); 1310 1311 /* Remap the kernel. */ 1312 pmap_remap_largepages(); 1313 } 1314 pmap_init_lapic(); 1315 #endif /* !XEN */ 1316 1317 #ifdef __HAVE_DIRECT_MAP 1318 pmap_init_directmap(kpm); 1319 #else 1320 pmap_vpage_cpualloc(&cpu_info_primary); 1321 1322 if (VM_MIN_KERNEL_ADDRESS == KERNBASE) { /* i386 */ 1323 early_zerop = (void *)cpu_info_primary.vpage[VPAGE_ZER]; 1324 early_zero_pte = cpu_info_primary.vpage_pte[VPAGE_ZER]; 1325 } else { /* amd64 */ 1326 /* 1327 * zero_pte is stuck at the end of mapped space for the kernel 1328 * image (disjunct from kva space). This is done so that it 1329 * can safely be used in pmap_growkernel (pmap_get_physpage), 1330 * when it's called for the first time. 1331 * XXXfvdl fix this for MULTIPROCESSOR later. 1332 */ 1333 #ifdef XEN 1334 /* early_zerop initialized in xen_locore() */ 1335 #else 1336 early_zerop = (void *)(KERNBASE + NKL2_KIMG_ENTRIES * NBPD_L2); 1337 #endif 1338 early_zero_pte = PTE_BASE + pl1_i((vaddr_t)early_zerop); 1339 } 1340 #endif 1341 1342 #if defined(XEN) && defined(__x86_64__) 1343 extern vaddr_t xen_dummy_page; 1344 paddr_t xen_dummy_user_pgd; 1345 1346 /* 1347 * We want a dummy page directory for Xen: when deactivating a pmap, 1348 * Xen will still consider it active. So we set user PGD to this one 1349 * to lift all protection on the now inactive page tables set. 1350 */ 1351 xen_dummy_user_pgd = xen_dummy_page - KERNBASE; 1352 1353 /* Zero fill it, the less checks in Xen it requires the better */ 1354 memset((void *)(xen_dummy_user_pgd + KERNBASE), 0, PAGE_SIZE); 1355 /* Mark read-only */ 1356 HYPERVISOR_update_va_mapping(xen_dummy_user_pgd + KERNBASE, 1357 pmap_pa2pte(xen_dummy_user_pgd) | PG_u | PG_V | pmap_pg_nx, 1358 UVMF_INVLPG); 1359 /* Pin as L4 */ 1360 xpq_queue_pin_l4_table(xpmap_ptom_masked(xen_dummy_user_pgd)); 1361 #endif 1362 1363 /* 1364 * Allocate space for the IDT, GDT and LDT. 1365 */ 1366 idt_vaddr = pmap_bootstrap_valloc(1); 1367 idt_paddr = pmap_bootstrap_palloc(1); 1368 1369 gdt_vaddr = pmap_bootstrap_valloc(1); 1370 gdt_paddr = pmap_bootstrap_palloc(1); 1371 1372 ldt_vaddr = pmap_bootstrap_valloc(1); 1373 ldt_paddr = pmap_bootstrap_palloc(1); 1374 1375 #if !defined(__x86_64__) && !defined(XEN) 1376 /* pentium f00f bug stuff */ 1377 pentium_idt_vaddr = pmap_bootstrap_valloc(1); 1378 #endif 1379 1380 /* 1381 * Now we reserve some VM for mapping pages when doing a crash dump. 1382 */ 1383 virtual_avail = reserve_dumppages(virtual_avail); 1384 1385 /* 1386 * Init the static-global locks and global lists. 1387 * 1388 * => pventry::pvh_lock (initialized elsewhere) must also be 1389 * a spin lock, again at IPL_VM to prevent deadlock, and 1390 * again is never taken from interrupt context. 1391 */ 1392 mutex_init(&pmaps_lock, MUTEX_DEFAULT, IPL_NONE); 1393 LIST_INIT(&pmaps); 1394 1395 /* 1396 * Ensure the TLB is sync'd with reality by flushing it... 1397 */ 1398 tlbflushg(); 1399 1400 /* 1401 * Calculate pmap_maxkvaddr from nkptp[]. 
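 * (i.e. pmap_maxkvaddr = VM_MIN_KERNEL_ADDRESS + the sum of
 *  nkptp[i] * nbpd[i] over levels i = 1 .. PTP_LEVELS - 1, which is
 *  what the loop below computes)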
1402 */ 1403 kva = VM_MIN_KERNEL_ADDRESS; 1404 for (i = PTP_LEVELS - 1; i >= 1; i--) { 1405 kva += nkptp[i] * nbpd[i]; 1406 } 1407 pmap_maxkvaddr = kva; 1408 } 1409 1410 #ifndef XEN 1411 static void 1412 pmap_init_lapic(void) 1413 { 1414 /* 1415 * On CPUs that have no LAPIC, local_apic_va is never kentered. But our 1416 * x86 implementation relies a lot on this address to be valid; so just 1417 * allocate a fake physical page that will be kentered into 1418 * local_apic_va by machdep. 1419 * 1420 * If the LAPIC is present, the va will be remapped somewhere else 1421 * later in lapic_map. 1422 */ 1423 local_apic_va = pmap_bootstrap_valloc(1); 1424 local_apic_pa = pmap_bootstrap_palloc(1); 1425 } 1426 #endif 1427 1428 #ifdef __HAVE_DIRECT_MAP 1429 /* 1430 * Create the amd64 direct map. Called only once at boot time. 1431 */ 1432 static void 1433 pmap_init_directmap(struct pmap *kpm) 1434 { 1435 extern phys_ram_seg_t mem_clusters[]; 1436 extern int mem_cluster_cnt; 1437 1438 paddr_t lastpa, L2page_pa, L3page_pa, pdp; 1439 vaddr_t tmpva; 1440 pt_entry_t *pte; 1441 pd_entry_t *pde; 1442 phys_ram_seg_t *mc; 1443 size_t nL3e; 1444 int i; 1445 1446 const pd_entry_t pteflags = PG_V | PG_KW | pmap_pg_nx; 1447 1448 /* Get the last physical address available */ 1449 lastpa = 0; 1450 for (i = 0; i < mem_cluster_cnt; i++) { 1451 mc = &mem_clusters[i]; 1452 lastpa = MAX(lastpa, mc->start + mc->size); 1453 } 1454 1455 /* 1456 * We allocate only one L4 entry for the direct map (PDIR_SLOT_DIRECT), 1457 * so we cannot map more than 512GB. 1458 */ 1459 if (lastpa > NBPD_L4) { 1460 panic("RAM limit reached: > 512GB not supported"); 1461 } 1462 1463 /* In locore.S, we allocated a tmp va. We will use it now. */ 1464 tmpva = (KERNBASE + NKL2_KIMG_ENTRIES * NBPD_L2); 1465 pte = PTE_BASE + pl1_i(tmpva); 1466 1467 /* Allocate L3, and zero it out. */ 1468 L3page_pa = pmap_bootstrap_palloc(1); 1469 *pte = L3page_pa | pteflags; 1470 pmap_update_pg(tmpva); 1471 memset((void *)tmpva, 0, PAGE_SIZE); 1472 1473 /* Number of L3 entries. */ 1474 nL3e = (lastpa + NBPD_L3 - 1) >> L3_SHIFT; 1475 1476 /* 1477 * Map the direct map RW. Use super pages (1GB) or large pages (2MB) if 1478 * they are supported. Note: PG_G is not allowed on non-leaf PTPs. 1479 */ 1480 if (cpu_feature[2] & CPUID_P1GB) { 1481 /* Super pages are supported. Just create L3. */ 1482 for (i = 0; i < nL3e; i++) { 1483 pdp = (paddr_t)&(((pd_entry_t *)L3page_pa)[i]); 1484 *pte = (pdp & PG_FRAME) | pteflags; 1485 pmap_update_pg(tmpva); 1486 1487 pde = (pd_entry_t *)(tmpva + (pdp & ~PG_FRAME)); 1488 *pde = ((paddr_t)i << L3_SHIFT) | pteflags | PG_U | 1489 PG_PS | PG_G; 1490 } 1491 } else { 1492 /* Allocate L2. */ 1493 L2page_pa = pmap_bootstrap_palloc(nL3e); 1494 1495 /* Zero out the L2 pages. */ 1496 for (i = 0; i < nL3e; i++) { 1497 pdp = L2page_pa + i * PAGE_SIZE; 1498 *pte = (pdp & PG_FRAME) | pteflags; 1499 pmap_update_pg(tmpva); 1500 1501 memset((void *)tmpva, 0, PAGE_SIZE); 1502 } 1503 1504 KASSERT(pmap_largepages != 0); 1505 1506 /* Large pages are supported. Just create L2. */ 1507 for (i = 0; i < NPDPG * nL3e; i++) { 1508 pdp = (paddr_t)&(((pd_entry_t *)L2page_pa)[i]); 1509 *pte = (pdp & PG_FRAME) | pteflags; 1510 pmap_update_pg(tmpva); 1511 1512 pde = (pd_entry_t *)(tmpva + (pdp & ~PG_FRAME)); 1513 *pde = ((paddr_t)i << L2_SHIFT) | pteflags | 1514 PG_U | PG_PS | PG_G; 1515 } 1516 1517 /* Fill in the L3 entries, linked to L2. 
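 * Each L3 slot i is made to point at the i-th L2 page allocated above.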
*/ 1518 for (i = 0; i < nL3e; i++) { 1519 pdp = (paddr_t)&(((pd_entry_t *)L3page_pa)[i]); 1520 *pte = (pdp & PG_FRAME) | pteflags; 1521 pmap_update_pg(tmpva); 1522 1523 pde = (pd_entry_t *)(tmpva + (pdp & ~PG_FRAME)); 1524 *pde = (L2page_pa + (i << PAGE_SHIFT)) | pteflags | PG_U; 1525 } 1526 } 1527 1528 kpm->pm_pdir[PDIR_SLOT_DIRECT] = L3page_pa | pteflags | PG_U; 1529 1530 *pte = 0; 1531 pmap_update_pg(tmpva); 1532 1533 tlbflush(); 1534 } 1535 #endif /* __HAVE_DIRECT_MAP */ 1536 1537 #ifndef XEN 1538 /* 1539 * Remap several kernel segments with large pages. We cover as many pages as we 1540 * can. Called only once at boot time, if the CPU supports large pages. 1541 */ 1542 static void 1543 pmap_remap_largepages(void) 1544 { 1545 extern char __rodata_start; 1546 extern char __data_start; 1547 extern char __kernel_end; 1548 pd_entry_t *pde; 1549 vaddr_t kva, kva_end; 1550 paddr_t pa; 1551 1552 /* Remap the kernel text using large pages. */ 1553 kva = rounddown((vaddr_t)KERNTEXTOFF, NBPD_L2); 1554 kva_end = rounddown((vaddr_t)&__rodata_start, NBPD_L1); 1555 pa = kva - KERNBASE; 1556 for (/* */; kva + NBPD_L2 <= kva_end; kva += NBPD_L2, pa += NBPD_L2) { 1557 pde = &L2_BASE[pl2_i(kva)]; 1558 *pde = pa | pmap_pg_g | PG_PS | PG_KR | PG_V; 1559 tlbflushg(); 1560 } 1561 #if defined(DEBUG) 1562 aprint_normal("kernel text is mapped with %" PRIuPSIZE " large " 1563 "pages and %" PRIuPSIZE " normal pages\n", 1564 howmany(kva - KERNBASE, NBPD_L2), 1565 howmany((vaddr_t)&__rodata_start - kva, NBPD_L1)); 1566 #endif /* defined(DEBUG) */ 1567 1568 /* Remap the kernel rodata using large pages. */ 1569 kva = roundup((vaddr_t)&__rodata_start, NBPD_L2); 1570 kva_end = rounddown((vaddr_t)&__data_start, NBPD_L1); 1571 pa = kva - KERNBASE; 1572 for (/* */; kva + NBPD_L2 <= kva_end; kva += NBPD_L2, pa += NBPD_L2) { 1573 pde = &L2_BASE[pl2_i(kva)]; 1574 *pde = pa | pmap_pg_g | PG_PS | pmap_pg_nx | PG_KR | PG_V; 1575 tlbflushg(); 1576 } 1577 1578 /* Remap the kernel data+bss using large pages. */ 1579 kva = roundup((vaddr_t)&__data_start, NBPD_L2); 1580 kva_end = rounddown((vaddr_t)&__kernel_end, NBPD_L1); 1581 pa = kva - KERNBASE; 1582 for (/* */; kva + NBPD_L2 <= kva_end; kva += NBPD_L2, pa += NBPD_L2) { 1583 pde = &L2_BASE[pl2_i(kva)]; 1584 *pde = pa | pmap_pg_g | PG_PS | pmap_pg_nx | PG_KW | PG_V; 1585 tlbflushg(); 1586 } 1587 } 1588 #endif /* !XEN */ 1589 1590 /* 1591 * pmap_init: called from uvm_init, our job is to get the pmap 1592 * system ready to manage mappings... 1593 */ 1594 1595 void 1596 pmap_init(void) 1597 { 1598 int i, flags; 1599 1600 for (i = 0; i < PV_HASH_SIZE; i++) { 1601 SLIST_INIT(&pv_hash_heads[i].hh_list); 1602 } 1603 for (i = 0; i < PV_HASH_LOCK_CNT; i++) { 1604 mutex_init(&pv_hash_locks[i].lock, MUTEX_NODEBUG, IPL_VM); 1605 } 1606 1607 /* 1608 * initialize caches. 
1609 */ 1610 1611 pool_cache_bootstrap(&pmap_cache, sizeof(struct pmap), 0, 0, 0, 1612 "pmappl", NULL, IPL_NONE, NULL, NULL, NULL); 1613 1614 #ifdef XEN 1615 /* 1616 * pool_cache(9) should not touch cached objects, since they 1617 * are pinned on xen and R/O for the domU 1618 */ 1619 flags = PR_NOTOUCH; 1620 #else /* XEN */ 1621 flags = 0; 1622 #endif /* XEN */ 1623 #ifdef PAE 1624 pool_cache_bootstrap(&pmap_pdp_cache, PAGE_SIZE * PDP_SIZE, 0, 0, flags, 1625 "pdppl", &pmap_pdp_allocator, IPL_NONE, 1626 pmap_pdp_ctor, pmap_pdp_dtor, NULL); 1627 #else /* PAE */ 1628 pool_cache_bootstrap(&pmap_pdp_cache, PAGE_SIZE, 0, 0, flags, 1629 "pdppl", NULL, IPL_NONE, pmap_pdp_ctor, pmap_pdp_dtor, NULL); 1630 #endif /* PAE */ 1631 pool_cache_bootstrap(&pmap_pv_cache, sizeof(struct pv_entry), 0, 0, 1632 PR_LARGECACHE, "pvpl", &pool_allocator_kmem, IPL_NONE, NULL, 1633 NULL, NULL); 1634 1635 pmap_tlb_init(); 1636 1637 /* XXX: Since cpu_hatch() is only for secondary CPUs. */ 1638 pmap_tlb_cpu_init(curcpu()); 1639 1640 evcnt_attach_dynamic(&pmap_iobmp_evcnt, EVCNT_TYPE_MISC, 1641 NULL, "x86", "io bitmap copy"); 1642 evcnt_attach_dynamic(&pmap_ldt_evcnt, EVCNT_TYPE_MISC, 1643 NULL, "x86", "ldt sync"); 1644 1645 /* 1646 * done: pmap module is up (and ready for business) 1647 */ 1648 1649 pmap_initialized = true; 1650 } 1651 1652 /* 1653 * pmap_cpu_init_late: perform late per-CPU initialization. 1654 */ 1655 1656 #ifndef XEN 1657 void 1658 pmap_cpu_init_late(struct cpu_info *ci) 1659 { 1660 /* 1661 * The BP has already its own PD page allocated during early 1662 * MD startup. 1663 */ 1664 if (ci == &cpu_info_primary) 1665 return; 1666 1667 #ifdef PAE 1668 cpu_alloc_l3_page(ci); 1669 #endif 1670 } 1671 #endif 1672 1673 #ifndef __HAVE_DIRECT_MAP 1674 CTASSERT(CACHE_LINE_SIZE > sizeof(pt_entry_t)); 1675 CTASSERT(CACHE_LINE_SIZE % sizeof(pt_entry_t) == 0); 1676 1677 static void 1678 pmap_vpage_cpualloc(struct cpu_info *ci) 1679 { 1680 bool primary = (ci == &cpu_info_primary); 1681 size_t i, npages; 1682 vaddr_t vabase; 1683 vsize_t vrange; 1684 1685 npages = (CACHE_LINE_SIZE / sizeof(pt_entry_t)); 1686 KASSERT(npages >= VPAGE_MAX); 1687 vrange = npages * PAGE_SIZE; 1688 1689 if (primary) { 1690 while ((vabase = pmap_bootstrap_valloc(1)) % vrange != 0) { 1691 /* Waste some pages to align properly */ 1692 } 1693 /* The base is aligned, allocate the rest (contiguous) */ 1694 pmap_bootstrap_valloc(npages - 1); 1695 } else { 1696 vabase = uvm_km_alloc(kernel_map, vrange, vrange, 1697 UVM_KMF_VAONLY); 1698 if (vabase == 0) { 1699 panic("%s: failed to allocate tmp VA for CPU %d\n", 1700 __func__, cpu_index(ci)); 1701 } 1702 } 1703 1704 KASSERT((vaddr_t)&PTE_BASE[pl1_i(vabase)] % CACHE_LINE_SIZE == 0); 1705 1706 for (i = 0; i < VPAGE_MAX; i++) { 1707 ci->vpage[i] = vabase + i * PAGE_SIZE; 1708 ci->vpage_pte[i] = PTE_BASE + pl1_i(ci->vpage[i]); 1709 } 1710 } 1711 1712 void 1713 pmap_vpage_cpu_init(struct cpu_info *ci) 1714 { 1715 if (ci == &cpu_info_primary) { 1716 /* cpu0 already taken care of in pmap_bootstrap */ 1717 return; 1718 } 1719 1720 pmap_vpage_cpualloc(ci); 1721 } 1722 #endif 1723 1724 /* 1725 * p v _ e n t r y f u n c t i o n s 1726 */ 1727 1728 /* 1729 * pmap_free_pvs: free a list of pv_entrys 1730 */ 1731 1732 static void 1733 pmap_free_pvs(struct pv_entry *pve) 1734 { 1735 struct pv_entry *next; 1736 1737 for ( /* null */ ; pve != NULL ; pve = next) { 1738 next = pve->pve_next; 1739 pool_cache_put(&pmap_pv_cache, pve); 1740 } 1741 } 1742 1743 /* 1744 * main pv_entry manipulation functions: 1745 * 
pmap_enter_pv: enter a mapping onto a pv_head list 1746 * pmap_remove_pv: remove a mapping from a pv_head list 1747 * 1748 * NOTE: Both pmap_enter_pv and pmap_remove_pv expect the caller to lock 1749 * the pvh before calling 1750 */ 1751 1752 /* 1753 * insert_pv: a helper of pmap_enter_pv 1754 */ 1755 1756 static void 1757 insert_pv(struct pmap_page *pp, struct pv_entry *pve) 1758 { 1759 struct pv_hash_head *hh; 1760 kmutex_t *lock; 1761 u_int hash; 1762 1763 hash = pvhash_hash(pve->pve_pte.pte_ptp, pve->pve_pte.pte_va); 1764 lock = pvhash_lock(hash); 1765 hh = pvhash_head(hash); 1766 mutex_spin_enter(lock); 1767 SLIST_INSERT_HEAD(&hh->hh_list, pve, pve_hash); 1768 mutex_spin_exit(lock); 1769 1770 LIST_INSERT_HEAD(&pp->pp_head.pvh_list, pve, pve_list); 1771 } 1772 1773 /* 1774 * pmap_enter_pv: enter a mapping onto a pv_head lst 1775 * 1776 * => caller should adjust ptp's wire_count before calling 1777 * => caller has preallocated pve and *sparepve for us 1778 */ 1779 1780 static struct pv_entry * 1781 pmap_enter_pv(struct pmap_page *pp, struct pv_entry *pve, 1782 struct pv_entry **sparepve, struct vm_page *ptp, vaddr_t va) 1783 { 1784 1785 KASSERT(ptp == NULL || ptp->wire_count >= 2); 1786 KASSERT(ptp == NULL || ptp->uobject != NULL); 1787 KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset); 1788 1789 if ((pp->pp_flags & PP_EMBEDDED) == 0) { 1790 if (LIST_EMPTY(&pp->pp_head.pvh_list)) { 1791 pp->pp_flags |= PP_EMBEDDED; 1792 pp->pp_pte.pte_ptp = ptp; 1793 pp->pp_pte.pte_va = va; 1794 1795 return pve; 1796 } 1797 } else { 1798 struct pv_entry *pve2; 1799 1800 pve2 = *sparepve; 1801 *sparepve = NULL; 1802 1803 pve2->pve_pte = pp->pp_pte; 1804 pp->pp_flags &= ~PP_EMBEDDED; 1805 LIST_INIT(&pp->pp_head.pvh_list); 1806 insert_pv(pp, pve2); 1807 } 1808 1809 pve->pve_pte.pte_ptp = ptp; 1810 pve->pve_pte.pte_va = va; 1811 insert_pv(pp, pve); 1812 1813 return NULL; 1814 } 1815 1816 /* 1817 * pmap_remove_pv: try to remove a mapping from a pv_list 1818 * 1819 * => caller should adjust ptp's wire_count and free PTP if needed 1820 * => we return the removed pve 1821 */ 1822 1823 static struct pv_entry * 1824 pmap_remove_pv(struct pmap_page *pp, struct vm_page *ptp, vaddr_t va) 1825 { 1826 struct pv_hash_head *hh; 1827 struct pv_entry *pve; 1828 kmutex_t *lock; 1829 u_int hash; 1830 1831 KASSERT(ptp == NULL || ptp->uobject != NULL); 1832 KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset); 1833 1834 if ((pp->pp_flags & PP_EMBEDDED) != 0) { 1835 KASSERT(pp->pp_pte.pte_ptp == ptp); 1836 KASSERT(pp->pp_pte.pte_va == va); 1837 1838 pp->pp_flags &= ~PP_EMBEDDED; 1839 LIST_INIT(&pp->pp_head.pvh_list); 1840 1841 return NULL; 1842 } 1843 1844 hash = pvhash_hash(ptp, va); 1845 lock = pvhash_lock(hash); 1846 hh = pvhash_head(hash); 1847 mutex_spin_enter(lock); 1848 pve = pvhash_remove(hh, ptp, va); 1849 mutex_spin_exit(lock); 1850 1851 LIST_REMOVE(pve, pve_list); 1852 1853 return pve; 1854 } 1855 1856 /* 1857 * p t p f u n c t i o n s 1858 */ 1859 1860 static inline struct vm_page * 1861 pmap_find_ptp(struct pmap *pmap, vaddr_t va, paddr_t pa, int level) 1862 { 1863 int lidx = level - 1; 1864 struct vm_page *pg; 1865 1866 KASSERT(mutex_owned(pmap->pm_lock)); 1867 1868 if (pa != (paddr_t)-1 && pmap->pm_ptphint[lidx] && 1869 pa == VM_PAGE_TO_PHYS(pmap->pm_ptphint[lidx])) { 1870 return (pmap->pm_ptphint[lidx]); 1871 } 1872 PMAP_SUBOBJ_LOCK(pmap, lidx); 1873 pg = uvm_pagelookup(&pmap->pm_obj[lidx], ptp_va2o(va, level)); 1874 PMAP_SUBOBJ_UNLOCK(pmap, lidx); 1875 1876 KASSERT(pg == NULL || pg->wire_count >= 1); 
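	/* pg is NULL if no PTP covers va at this level. */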
1877 return pg; 1878 } 1879 1880 static inline void 1881 pmap_freepage(struct pmap *pmap, struct vm_page *ptp, int level) 1882 { 1883 lwp_t *l; 1884 int lidx; 1885 struct uvm_object *obj; 1886 1887 KASSERT(ptp->wire_count == 1); 1888 1889 lidx = level - 1; 1890 1891 obj = &pmap->pm_obj[lidx]; 1892 pmap_stats_update(pmap, -1, 0); 1893 if (lidx != 0) 1894 mutex_enter(obj->vmobjlock); 1895 if (pmap->pm_ptphint[lidx] == ptp) 1896 pmap->pm_ptphint[lidx] = TAILQ_FIRST(&obj->memq); 1897 ptp->wire_count = 0; 1898 uvm_pagerealloc(ptp, NULL, 0); 1899 l = curlwp; 1900 KASSERT((l->l_pflag & LP_INTR) == 0); 1901 VM_PAGE_TO_PP(ptp)->pp_link = l->l_md.md_gc_ptp; 1902 l->l_md.md_gc_ptp = ptp; 1903 if (lidx != 0) 1904 mutex_exit(obj->vmobjlock); 1905 } 1906 1907 static void 1908 pmap_free_ptp(struct pmap *pmap, struct vm_page *ptp, vaddr_t va, 1909 pt_entry_t *ptes, pd_entry_t * const *pdes) 1910 { 1911 unsigned long index; 1912 int level; 1913 vaddr_t invaladdr; 1914 pd_entry_t opde; 1915 1916 KASSERT(pmap != pmap_kernel()); 1917 KASSERT(mutex_owned(pmap->pm_lock)); 1918 KASSERT(kpreempt_disabled()); 1919 1920 level = 1; 1921 do { 1922 index = pl_i(va, level + 1); 1923 opde = pmap_pte_testset(&pdes[level - 1][index], 0); 1924 #if defined(XEN) 1925 # if defined(__x86_64__) 1926 /* 1927 * If ptp is a L3 currently mapped in kernel space, 1928 * on any cpu, clear it before freeing 1929 */ 1930 if (level == PTP_LEVELS - 1) { 1931 /* 1932 * Update the per-cpu PD on all cpus the current 1933 * pmap is active on 1934 */ 1935 xen_kpm_sync(pmap, index); 1936 } 1937 # endif /*__x86_64__ */ 1938 invaladdr = level == 1 ? (vaddr_t)ptes : 1939 (vaddr_t)pdes[level - 2]; 1940 pmap_tlb_shootdown(pmap, invaladdr + index * PAGE_SIZE, 1941 opde, TLBSHOOT_FREE_PTP1); 1942 pmap_tlb_shootnow(); 1943 #else /* XEN */ 1944 invaladdr = level == 1 ? (vaddr_t)ptes : 1945 (vaddr_t)pdes[level - 2]; 1946 pmap_tlb_shootdown(pmap, invaladdr + index * PAGE_SIZE, 1947 opde, TLBSHOOT_FREE_PTP1); 1948 #endif /* XEN */ 1949 pmap_freepage(pmap, ptp, level); 1950 if (level < PTP_LEVELS - 1) { 1951 ptp = pmap_find_ptp(pmap, va, (paddr_t)-1, level + 1); 1952 ptp->wire_count--; 1953 if (ptp->wire_count > 1) 1954 break; 1955 } 1956 } while (++level < PTP_LEVELS); 1957 pmap_pte_flush(); 1958 } 1959 1960 /* 1961 * pmap_get_ptp: get a PTP (if there isn't one, allocate a new one) 1962 * 1963 * => pmap should NOT be pmap_kernel() 1964 * => pmap should be locked 1965 * => preemption should be disabled 1966 */ 1967 1968 static struct vm_page * 1969 pmap_get_ptp(struct pmap *pmap, vaddr_t va, pd_entry_t * const *pdes) 1970 { 1971 struct vm_page *ptp, *pptp; 1972 int i; 1973 unsigned long index; 1974 pd_entry_t *pva; 1975 paddr_t ppa, pa; 1976 struct uvm_object *obj; 1977 1978 KASSERT(pmap != pmap_kernel()); 1979 KASSERT(mutex_owned(pmap->pm_lock)); 1980 KASSERT(kpreempt_disabled()); 1981 1982 ptp = NULL; 1983 pa = (paddr_t)-1; 1984 1985 /* 1986 * Loop through all page table levels seeing if we need to 1987 * add a new page to that level. 1988 */ 1989 for (i = PTP_LEVELS; i > 1; i--) { 1990 /* 1991 * Save values from previous round. 
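	 * (pptp/ppa carry the PTP found or allocated at the level above;
	 * when a page is allocated at this level and a parent exists, its
	 * wire_count is bumped further below.)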
1992 */ 1993 pptp = ptp; 1994 ppa = pa; 1995 1996 index = pl_i(va, i); 1997 pva = pdes[i - 2]; 1998 1999 if (pmap_valid_entry(pva[index])) { 2000 ppa = pmap_pte2pa(pva[index]); 2001 ptp = NULL; 2002 continue; 2003 } 2004 2005 obj = &pmap->pm_obj[i-2]; 2006 PMAP_SUBOBJ_LOCK(pmap, i - 2); 2007 ptp = uvm_pagealloc(obj, ptp_va2o(va, i - 1), NULL, 2008 UVM_PGA_USERESERVE|UVM_PGA_ZERO); 2009 PMAP_SUBOBJ_UNLOCK(pmap, i - 2); 2010 2011 if (ptp == NULL) 2012 return NULL; 2013 2014 ptp->flags &= ~PG_BUSY; /* never busy */ 2015 ptp->wire_count = 1; 2016 pmap->pm_ptphint[i - 2] = ptp; 2017 pa = VM_PAGE_TO_PHYS(ptp); 2018 pmap_pte_set(&pva[index], (pd_entry_t) 2019 (pmap_pa2pte(pa) | PG_u | PG_RW | PG_V)); 2020 #if defined(XEN) && defined(__x86_64__) 2021 if (i == PTP_LEVELS) { 2022 /* 2023 * Update the per-cpu PD on all cpus the current 2024 * pmap is active on 2025 */ 2026 xen_kpm_sync(pmap, index); 2027 } 2028 #endif 2029 pmap_pte_flush(); 2030 pmap_stats_update(pmap, 1, 0); 2031 /* 2032 * If we're not in the top level, increase the 2033 * wire count of the parent page. 2034 */ 2035 if (i < PTP_LEVELS) { 2036 if (pptp == NULL) { 2037 pptp = pmap_find_ptp(pmap, va, ppa, i); 2038 KASSERT(pptp != NULL); 2039 } 2040 pptp->wire_count++; 2041 } 2042 } 2043 2044 /* 2045 * PTP is not NULL if we just allocated a new PTP. If it is 2046 * still NULL, we must look up the existing one. 2047 */ 2048 if (ptp == NULL) { 2049 ptp = pmap_find_ptp(pmap, va, ppa, 1); 2050 KASSERTMSG(ptp != NULL, "pmap_get_ptp: va %" PRIxVADDR 2051 "ppa %" PRIxPADDR "\n", va, ppa); 2052 } 2053 2054 pmap->pm_ptphint[0] = ptp; 2055 return ptp; 2056 } 2057 2058 /* 2059 * p m a p l i f e c y c l e f u n c t i o n s 2060 */ 2061 2062 /* 2063 * pmap_pdp_ctor: constructor for the PDP cache. 2064 */ 2065 static int 2066 pmap_pdp_ctor(void *arg, void *v, int flags) 2067 { 2068 pd_entry_t *pdir = v; 2069 paddr_t pdirpa = 0; 2070 vaddr_t object; 2071 int i; 2072 2073 #if !defined(XEN) || !defined(__x86_64__) 2074 int npde; 2075 #endif 2076 #ifdef XEN 2077 int s; 2078 #endif 2079 2080 /* 2081 * NOTE: The `pmaps_lock' is held when the PDP is allocated. 2082 */ 2083 2084 #if defined(XEN) && defined(__x86_64__) 2085 /* Fetch the physical address of the page directory */ 2086 (void)pmap_extract(pmap_kernel(), (vaddr_t)pdir, &pdirpa); 2087 2088 /* Zero the area */ 2089 memset(pdir, 0, PAGE_SIZE); /* Xen wants a clean page */ 2090 2091 /* 2092 * This pdir will NEVER be active in kernel mode, so mark 2093 * recursive entry invalid. 2094 */ 2095 pdir[PDIR_SLOT_PTE] = pmap_pa2pte(pdirpa) | PG_u; 2096 2097 /* 2098 * PDP constructed this way won't be for the kernel, hence we 2099 * don't put kernel mappings on Xen. 2100 * 2101 * But we need to make pmap_create() happy, so put a dummy 2102 * (without PG_V) value at the right place. 
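	 * (pmap_create() checks that this slot is non-zero when it takes a
	 * PDP from the cache; presumably a PDP constructed before a later
	 * pmap_growkernel() would read as zero there and is reconstructed.)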
2103 */ 2104 pdir[PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1] - 1] = 2105 (pd_entry_t)-1 & PG_FRAME; 2106 #else /* XEN && __x86_64__*/ 2107 /* Zero the area */ 2108 memset(pdir, 0, PDIR_SLOT_PTE * sizeof(pd_entry_t)); 2109 2110 object = (vaddr_t)v; 2111 for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) { 2112 /* Fetch the physical address of the page directory */ 2113 (void)pmap_extract(pmap_kernel(), object, &pdirpa); 2114 2115 /* Put in recursive PDE to map the PTEs */ 2116 pdir[PDIR_SLOT_PTE + i] = pmap_pa2pte(pdirpa) | PG_V | 2117 pmap_pg_nx; 2118 #ifndef XEN 2119 pdir[PDIR_SLOT_PTE + i] |= PG_KW; 2120 #endif 2121 } 2122 2123 /* Copy the kernel's top level PDE */ 2124 npde = nkptp[PTP_LEVELS - 1]; 2125 2126 memcpy(&pdir[PDIR_SLOT_KERN], &PDP_BASE[PDIR_SLOT_KERN], 2127 npde * sizeof(pd_entry_t)); 2128 2129 /* Zero the rest */ 2130 memset(&pdir[PDIR_SLOT_KERN + npde], 0, (PAGE_SIZE * PDP_SIZE) - 2131 (PDIR_SLOT_KERN + npde) * sizeof(pd_entry_t)); 2132 2133 if (VM_MIN_KERNEL_ADDRESS != KERNBASE) { 2134 int idx = pl_i(KERNBASE, PTP_LEVELS); 2135 pdir[idx] = PDP_BASE[idx]; 2136 } 2137 2138 #ifdef __HAVE_DIRECT_MAP 2139 pdir[PDIR_SLOT_DIRECT] = PDP_BASE[PDIR_SLOT_DIRECT]; 2140 #endif 2141 #endif /* XEN && __x86_64__*/ 2142 2143 #ifdef XEN 2144 s = splvm(); 2145 object = (vaddr_t)v; 2146 pmap_protect(pmap_kernel(), object, object + (PAGE_SIZE * PDP_SIZE), 2147 VM_PROT_READ); 2148 pmap_update(pmap_kernel()); 2149 for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) { 2150 /* 2151 * pin as L2/L4 page, we have to do the page with the 2152 * PDIR_SLOT_PTE entries last 2153 */ 2154 #ifdef PAE 2155 if (i == l2tol3(PDIR_SLOT_PTE)) 2156 continue; 2157 #endif 2158 2159 (void) pmap_extract(pmap_kernel(), object, &pdirpa); 2160 #ifdef __x86_64__ 2161 xpq_queue_pin_l4_table(xpmap_ptom_masked(pdirpa)); 2162 #else 2163 xpq_queue_pin_l2_table(xpmap_ptom_masked(pdirpa)); 2164 #endif 2165 } 2166 #ifdef PAE 2167 object = ((vaddr_t)pdir) + PAGE_SIZE * l2tol3(PDIR_SLOT_PTE); 2168 (void)pmap_extract(pmap_kernel(), object, &pdirpa); 2169 xpq_queue_pin_l2_table(xpmap_ptom_masked(pdirpa)); 2170 #endif 2171 splx(s); 2172 #endif /* XEN */ 2173 2174 return (0); 2175 } 2176 2177 /* 2178 * pmap_pdp_dtor: destructor for the PDP cache. 2179 */ 2180 2181 static void 2182 pmap_pdp_dtor(void *arg, void *v) 2183 { 2184 #ifdef XEN 2185 paddr_t pdirpa = 0; /* XXX: GCC */ 2186 vaddr_t object = (vaddr_t)v; 2187 int i; 2188 int s = splvm(); 2189 pt_entry_t *pte; 2190 2191 for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) { 2192 /* fetch the physical address of the page directory. */ 2193 (void) pmap_extract(pmap_kernel(), object, &pdirpa); 2194 /* unpin page table */ 2195 xpq_queue_unpin_table(xpmap_ptom_masked(pdirpa)); 2196 } 2197 object = (vaddr_t)v; 2198 for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) { 2199 /* Set page RW again */ 2200 pte = kvtopte(object); 2201 pmap_pte_set(pte, *pte | PG_RW); 2202 xen_bcast_invlpg((vaddr_t)object); 2203 } 2204 splx(s); 2205 #endif /* XEN */ 2206 } 2207 2208 #ifdef PAE 2209 2210 /* pmap_pdp_alloc: Allocate a page for the pdp memory pool. */ 2211 2212 static void * 2213 pmap_pdp_alloc(struct pool *pp, int flags) 2214 { 2215 return (void *)uvm_km_alloc(kernel_map, 2216 PAGE_SIZE * PDP_SIZE, PAGE_SIZE * PDP_SIZE, 2217 ((flags & PR_WAITOK) ? 
0 : UVM_KMF_NOWAIT | UVM_KMF_TRYLOCK) 2218 | UVM_KMF_WIRED); 2219 } 2220 2221 /* 2222 * pmap_pdp_free: free a PDP 2223 */ 2224 2225 static void 2226 pmap_pdp_free(struct pool *pp, void *v) 2227 { 2228 uvm_km_free(kernel_map, (vaddr_t)v, PAGE_SIZE * PDP_SIZE, 2229 UVM_KMF_WIRED); 2230 } 2231 #endif /* PAE */ 2232 2233 /* 2234 * pmap_create: create a pmap object. 2235 */ 2236 struct pmap * 2237 pmap_create(void) 2238 { 2239 struct pmap *pmap; 2240 int i; 2241 2242 pmap = pool_cache_get(&pmap_cache, PR_WAITOK); 2243 2244 /* init uvm_object */ 2245 for (i = 0; i < PTP_LEVELS - 1; i++) { 2246 mutex_init(&pmap->pm_obj_lock[i], MUTEX_DEFAULT, IPL_NONE); 2247 uvm_obj_init(&pmap->pm_obj[i], NULL, false, 1); 2248 uvm_obj_setlock(&pmap->pm_obj[i], &pmap->pm_obj_lock[i]); 2249 pmap->pm_ptphint[i] = NULL; 2250 } 2251 pmap->pm_stats.wired_count = 0; 2252 /* count the PDP allocd below */ 2253 pmap->pm_stats.resident_count = PDP_SIZE; 2254 #if !defined(__x86_64__) 2255 pmap->pm_hiexec = 0; 2256 #endif /* !defined(__x86_64__) */ 2257 pmap->pm_flags = 0; 2258 pmap->pm_gc_ptp = NULL; 2259 2260 kcpuset_create(&pmap->pm_cpus, true); 2261 kcpuset_create(&pmap->pm_kernel_cpus, true); 2262 #ifdef XEN 2263 kcpuset_create(&pmap->pm_xen_ptp_cpus, true); 2264 #endif 2265 /* init the LDT */ 2266 pmap->pm_ldt = NULL; 2267 pmap->pm_ldt_len = 0; 2268 pmap->pm_ldt_sel = GSYSSEL(GLDT_SEL, SEL_KPL); 2269 2270 /* allocate PDP */ 2271 try_again: 2272 pmap->pm_pdir = pool_cache_get(&pmap_pdp_cache, PR_WAITOK); 2273 2274 mutex_enter(&pmaps_lock); 2275 2276 if (pmap->pm_pdir[PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1] - 1] == 0) { 2277 mutex_exit(&pmaps_lock); 2278 pool_cache_destruct_object(&pmap_pdp_cache, pmap->pm_pdir); 2279 goto try_again; 2280 } 2281 2282 for (i = 0; i < PDP_SIZE; i++) 2283 pmap->pm_pdirpa[i] = 2284 pmap_pte2pa(pmap->pm_pdir[PDIR_SLOT_PTE + i]); 2285 2286 LIST_INSERT_HEAD(&pmaps, pmap, pm_list); 2287 2288 mutex_exit(&pmaps_lock); 2289 2290 return (pmap); 2291 } 2292 2293 /* 2294 * pmap_free_ptps: put a list of ptps back to the freelist. 2295 */ 2296 2297 void 2298 pmap_free_ptps(struct vm_page *empty_ptps) 2299 { 2300 struct vm_page *ptp; 2301 struct pmap_page *pp; 2302 2303 while ((ptp = empty_ptps) != NULL) { 2304 pp = VM_PAGE_TO_PP(ptp); 2305 empty_ptps = pp->pp_link; 2306 LIST_INIT(&pp->pp_head.pvh_list); 2307 uvm_pagefree(ptp); 2308 } 2309 } 2310 2311 /* 2312 * pmap_destroy: drop reference count on pmap. free pmap if 2313 * reference count goes to zero. 2314 */ 2315 2316 void 2317 pmap_destroy(struct pmap *pmap) 2318 { 2319 lwp_t *l; 2320 int i; 2321 2322 /* 2323 * If we have torn down this pmap, process deferred frees and 2324 * invalidations. Free now if the system is low on memory. 2325 * Otherwise, free when the pmap is destroyed thus avoiding a 2326 * TLB shootdown. 
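	 * (md_gc_pmap is set by pmap_remove_all() when the current thread
	 * begins tearing this pmap down, and the deferred PTPs are queued
	 * on md_gc_ptp by pmap_freepage().)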
2327 */ 2328 l = curlwp; 2329 if (__predict_false(l->l_md.md_gc_pmap == pmap)) { 2330 if (uvmexp.free < uvmexp.freetarg) { 2331 pmap_update(pmap); 2332 } else { 2333 KASSERT(pmap->pm_gc_ptp == NULL); 2334 pmap->pm_gc_ptp = l->l_md.md_gc_ptp; 2335 l->l_md.md_gc_ptp = NULL; 2336 l->l_md.md_gc_pmap = NULL; 2337 } 2338 } 2339 2340 /* 2341 * drop reference count 2342 */ 2343 2344 if (atomic_dec_uint_nv(&pmap->pm_obj[0].uo_refs) > 0) { 2345 return; 2346 } 2347 2348 #ifdef DIAGNOSTIC 2349 CPU_INFO_ITERATOR cii; 2350 struct cpu_info *ci; 2351 2352 for (CPU_INFO_FOREACH(cii, ci)) { 2353 if (ci->ci_pmap == pmap) 2354 panic("destroying pmap being used"); 2355 #if defined(XEN) && defined(__x86_64__) 2356 for (i = 0; i < PDIR_SLOT_PTE; i++) { 2357 if (pmap->pm_pdir[i] != 0 && 2358 ci->ci_kpm_pdir[i] == pmap->pm_pdir[i]) { 2359 printf("pmap_destroy(%p) pmap_kernel %p " 2360 "curcpu %d cpu %d ci_pmap %p " 2361 "ci->ci_kpm_pdir[%d]=%" PRIx64 2362 " pmap->pm_pdir[%d]=%" PRIx64 "\n", 2363 pmap, pmap_kernel(), curcpu()->ci_index, 2364 ci->ci_index, ci->ci_pmap, 2365 i, ci->ci_kpm_pdir[i], 2366 i, pmap->pm_pdir[i]); 2367 panic("pmap_destroy: used pmap"); 2368 } 2369 } 2370 #endif 2371 } 2372 #endif /* DIAGNOSTIC */ 2373 2374 /* 2375 * Reference count is zero, free pmap resources and then free pmap. 2376 * First, remove it from global list of pmaps. 2377 */ 2378 2379 mutex_enter(&pmaps_lock); 2380 LIST_REMOVE(pmap, pm_list); 2381 mutex_exit(&pmaps_lock); 2382 2383 /* 2384 * Process deferred PTP frees. No TLB shootdown required, as the 2385 * PTP pages are no longer visible to any CPU. 2386 */ 2387 2388 pmap_free_ptps(pmap->pm_gc_ptp); 2389 2390 /* 2391 * destroyed pmap shouldn't have remaining PTPs 2392 */ 2393 2394 for (i = 0; i < PTP_LEVELS - 1; i++) { 2395 KASSERT(pmap->pm_obj[i].uo_npages == 0); 2396 KASSERT(TAILQ_EMPTY(&pmap->pm_obj[i].memq)); 2397 } 2398 2399 pool_cache_put(&pmap_pdp_cache, pmap->pm_pdir); 2400 2401 #ifdef USER_LDT 2402 if (pmap->pm_ldt != NULL) { 2403 /* 2404 * no need to switch the LDT; this address space is gone, 2405 * nothing is using it. 2406 * 2407 * No need to lock the pmap for ldt_free (or anything else), 2408 * we're the last one to use it. 2409 */ 2410 mutex_enter(&cpu_lock); 2411 ldt_free(pmap->pm_ldt_sel); 2412 mutex_exit(&cpu_lock); 2413 uvm_km_free(kernel_map, (vaddr_t)pmap->pm_ldt, 2414 pmap->pm_ldt_len, UVM_KMF_WIRED); 2415 } 2416 #endif 2417 2418 for (i = 0; i < PTP_LEVELS - 1; i++) { 2419 uvm_obj_destroy(&pmap->pm_obj[i], false); 2420 mutex_destroy(&pmap->pm_obj_lock[i]); 2421 } 2422 kcpuset_destroy(pmap->pm_cpus); 2423 kcpuset_destroy(pmap->pm_kernel_cpus); 2424 #ifdef XEN 2425 kcpuset_destroy(pmap->pm_xen_ptp_cpus); 2426 #endif 2427 pool_cache_put(&pmap_cache, pmap); 2428 } 2429 2430 /* 2431 * pmap_remove_all: pmap is being torn down by the current thread. 2432 * avoid unnecessary invalidations. 2433 */ 2434 2435 void 2436 pmap_remove_all(struct pmap *pmap) 2437 { 2438 lwp_t *l = curlwp; 2439 2440 KASSERT(l->l_md.md_gc_pmap == NULL); 2441 2442 l->l_md.md_gc_pmap = pmap; 2443 } 2444 2445 #if defined(PMAP_FORK) 2446 /* 2447 * pmap_fork: perform any necessary data structure manipulation when 2448 * a VM space is forked. 2449 */ 2450 2451 void 2452 pmap_fork(struct pmap *pmap1, struct pmap *pmap2) 2453 { 2454 #ifdef USER_LDT 2455 union descriptor *new_ldt; 2456 size_t len; 2457 int sel; 2458 2459 if (__predict_true(pmap1->pm_ldt == NULL)) { 2460 return; 2461 } 2462 2463 /* 2464 * Copy the LDT into the new process. 
2465 * 2466 * Read pmap1's ldt pointer and length unlocked; if it changes 2467 * behind our back we'll retry. This will starve if there's a 2468 * stream of LDT changes in another thread but that should not 2469 * happen. 2470 */ 2471 2472 retry: 2473 if (pmap1->pm_ldt != NULL) { 2474 len = pmap1->pm_ldt_len; 2475 /* Allocate space for the new process's LDT */ 2476 new_ldt = (union descriptor *)uvm_km_alloc(kernel_map, len, 0, 2477 UVM_KMF_WIRED); 2478 if (new_ldt == NULL) { 2479 printf("WARNING: pmap_fork: " 2480 "unable to allocate LDT space\n"); 2481 return; 2482 } 2483 mutex_enter(&cpu_lock); 2484 /* Get a GDT slot for it */ 2485 sel = ldt_alloc(new_ldt, len); 2486 if (sel == -1) { 2487 mutex_exit(&cpu_lock); 2488 uvm_km_free(kernel_map, (vaddr_t)new_ldt, len, 2489 UVM_KMF_WIRED); 2490 printf("WARNING: pmap_fork: " 2491 "unable to allocate LDT selector\n"); 2492 return; 2493 } 2494 } else { 2495 /* Wasn't anything there after all. */ 2496 len = -1; 2497 new_ldt = NULL; 2498 sel = -1; 2499 mutex_enter(&cpu_lock); 2500 } 2501 2502 /* If there's still something there now that we have cpu_lock... */ 2503 if (pmap1->pm_ldt != NULL) { 2504 if (len != pmap1->pm_ldt_len) { 2505 /* Oops, it changed. Drop what we did and try again */ 2506 if (len != -1) { 2507 ldt_free(sel); 2508 uvm_km_free(kernel_map, (vaddr_t)new_ldt, 2509 len, UVM_KMF_WIRED); 2510 } 2511 mutex_exit(&cpu_lock); 2512 goto retry; 2513 } 2514 2515 /* Copy the LDT data and install it in pmap2 */ 2516 memcpy(new_ldt, pmap1->pm_ldt, len); 2517 pmap2->pm_ldt = new_ldt; 2518 pmap2->pm_ldt_len = pmap1->pm_ldt_len; 2519 pmap2->pm_ldt_sel = sel; 2520 len = -1; 2521 } 2522 2523 if (len != -1) { 2524 /* There wasn't still something there, so mop up */ 2525 ldt_free(sel); 2526 mutex_exit(&cpu_lock); 2527 uvm_km_free(kernel_map, (vaddr_t)new_ldt, len, 2528 UVM_KMF_WIRED); 2529 } else { 2530 mutex_exit(&cpu_lock); 2531 } 2532 #endif /* USER_LDT */ 2533 } 2534 #endif /* PMAP_FORK */ 2535 2536 #ifdef USER_LDT 2537 2538 /* 2539 * pmap_ldt_xcall: cross call used by pmap_ldt_sync. if the named pmap 2540 * is active, reload LDTR. 2541 */ 2542 static void 2543 pmap_ldt_xcall(void *arg1, void *arg2) 2544 { 2545 struct pmap *pm; 2546 2547 kpreempt_disable(); 2548 pm = arg1; 2549 if (curcpu()->ci_pmap == pm) { 2550 lldt(pm->pm_ldt_sel); 2551 } 2552 kpreempt_enable(); 2553 } 2554 2555 /* 2556 * pmap_ldt_sync: LDT selector for the named pmap is changing. swap 2557 * in the new selector on all CPUs. 2558 */ 2559 void 2560 pmap_ldt_sync(struct pmap *pm) 2561 { 2562 uint64_t where; 2563 2564 KASSERT(mutex_owned(&cpu_lock)); 2565 2566 pmap_ldt_evcnt.ev_count++; 2567 where = xc_broadcast(0, pmap_ldt_xcall, pm, NULL); 2568 xc_wait(where); 2569 } 2570 2571 /* 2572 * pmap_ldt_cleanup: if the pmap has a local LDT, deallocate it, and 2573 * restore the default. 
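 *
 * The default selector is pushed to all CPUs via pmap_ldt_sync()
 * before the old LDT memory is unmapped and freed.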
2574 */ 2575 2576 void 2577 pmap_ldt_cleanup(struct lwp *l) 2578 { 2579 pmap_t pmap = l->l_proc->p_vmspace->vm_map.pmap; 2580 union descriptor *dp = NULL; 2581 size_t len = 0; 2582 int sel = -1; 2583 2584 if (__predict_true(pmap->pm_ldt == NULL)) { 2585 return; 2586 } 2587 2588 mutex_enter(&cpu_lock); 2589 if (pmap->pm_ldt != NULL) { 2590 sel = pmap->pm_ldt_sel; 2591 dp = pmap->pm_ldt; 2592 len = pmap->pm_ldt_len; 2593 pmap->pm_ldt_sel = GSYSSEL(GLDT_SEL, SEL_KPL); 2594 pmap->pm_ldt = NULL; 2595 pmap->pm_ldt_len = 0; 2596 pmap_ldt_sync(pmap); 2597 ldt_free(sel); 2598 uvm_km_free(kernel_map, (vaddr_t)dp, len, UVM_KMF_WIRED); 2599 } 2600 mutex_exit(&cpu_lock); 2601 } 2602 #endif /* USER_LDT */ 2603 2604 /* 2605 * pmap_activate: activate a process' pmap 2606 * 2607 * => must be called with kernel preemption disabled 2608 * => if lwp is the curlwp, then set ci_want_pmapload so that 2609 * actual MMU context switch will be done by pmap_load() later 2610 */ 2611 2612 void 2613 pmap_activate(struct lwp *l) 2614 { 2615 struct cpu_info *ci; 2616 struct pmap *pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map); 2617 2618 KASSERT(kpreempt_disabled()); 2619 2620 ci = curcpu(); 2621 2622 if (l == ci->ci_curlwp) { 2623 KASSERT(ci->ci_want_pmapload == 0); 2624 KASSERT(ci->ci_tlbstate != TLBSTATE_VALID); 2625 2626 /* 2627 * no need to switch to kernel vmspace because 2628 * it's a subset of any vmspace. 2629 */ 2630 2631 if (pmap == pmap_kernel()) { 2632 ci->ci_want_pmapload = 0; 2633 return; 2634 } 2635 2636 ci->ci_want_pmapload = 1; 2637 } 2638 } 2639 2640 /* 2641 * pmap_reactivate: try to regain reference to the pmap. 2642 * 2643 * => Must be called with kernel preemption disabled. 2644 */ 2645 2646 static bool 2647 pmap_reactivate(struct pmap *pmap) 2648 { 2649 struct cpu_info * const ci = curcpu(); 2650 const cpuid_t cid = cpu_index(ci); 2651 bool result; 2652 2653 KASSERT(kpreempt_disabled()); 2654 #if defined(XEN) && defined(__x86_64__) 2655 KASSERT(pmap_pdirpa(pmap, 0) == ci->ci_xen_current_user_pgd); 2656 #elif defined(PAE) 2657 KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(ci->ci_pae_l3_pdir[0])); 2658 #elif !defined(XEN) 2659 KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(rcr3())); 2660 #endif 2661 2662 /* 2663 * If we still have a lazy reference to this pmap, we can assume 2664 * that there was no TLB shootdown for this pmap in the meantime. 2665 * 2666 * The order of events here is important as we must synchronize 2667 * with TLB shootdown interrupts. Declare interest in invalidations 2668 * (TLBSTATE_VALID) and then check the CPU set, which the IPIs can 2669 * change only when the state is TLBSTATE_LAZY. 2670 */ 2671 2672 ci->ci_tlbstate = TLBSTATE_VALID; 2673 KASSERT(kcpuset_isset(pmap->pm_kernel_cpus, cid)); 2674 2675 if (kcpuset_isset(pmap->pm_cpus, cid)) { 2676 /* We have the reference, state is valid. */ 2677 result = true; 2678 } else { 2679 /* Must reload the TLB. */ 2680 kcpuset_atomic_set(pmap->pm_cpus, cid); 2681 result = false; 2682 } 2683 return result; 2684 } 2685 2686 /* 2687 * pmap_load: perform the actual pmap switch, i.e. fill in %cr3 register 2688 * and relevant LDT info. 2689 * 2690 * Ensures that the current process' pmap is loaded on the current CPU's 2691 * MMU and that there are no stale TLB entries. 2692 * 2693 * => The caller should disable kernel preemption or do check-and-retry 2694 * to prevent a preemption from undoing our efforts. 2695 * => This function may block. 
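 * => If the LWP is context-switched while the new pmap is being
 *    loaded (detected via the l_ncsw counter), the whole operation
 *    is retried from the top.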
2696 */ 2697 void 2698 pmap_load(void) 2699 { 2700 struct cpu_info *ci; 2701 struct pmap *pmap, *oldpmap; 2702 struct lwp *l; 2703 struct pcb *pcb; 2704 cpuid_t cid; 2705 uint64_t ncsw; 2706 2707 kpreempt_disable(); 2708 retry: 2709 ci = curcpu(); 2710 if (!ci->ci_want_pmapload) { 2711 kpreempt_enable(); 2712 return; 2713 } 2714 l = ci->ci_curlwp; 2715 ncsw = l->l_ncsw; 2716 2717 /* should be able to take ipis. */ 2718 KASSERT(ci->ci_ilevel < IPL_HIGH); 2719 #ifdef XEN 2720 /* Check to see if interrupts are enabled (ie; no events are masked) */ 2721 KASSERT(x86_read_psl() == 0); 2722 #else 2723 KASSERT((x86_read_psl() & PSL_I) != 0); 2724 #endif 2725 2726 KASSERT(l != NULL); 2727 pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map); 2728 KASSERT(pmap != pmap_kernel()); 2729 oldpmap = ci->ci_pmap; 2730 pcb = lwp_getpcb(l); 2731 2732 if (pmap == oldpmap) { 2733 if (!pmap_reactivate(pmap)) { 2734 u_int gen = uvm_emap_gen_return(); 2735 2736 /* 2737 * pmap has been changed during deactivated. 2738 * our tlb may be stale. 2739 */ 2740 2741 tlbflush(); 2742 uvm_emap_update(gen); 2743 } 2744 2745 ci->ci_want_pmapload = 0; 2746 kpreempt_enable(); 2747 return; 2748 } 2749 2750 /* 2751 * Acquire a reference to the new pmap and perform the switch. 2752 */ 2753 2754 pmap_reference(pmap); 2755 2756 cid = cpu_index(ci); 2757 kcpuset_atomic_clear(oldpmap->pm_cpus, cid); 2758 kcpuset_atomic_clear(oldpmap->pm_kernel_cpus, cid); 2759 2760 #if defined(XEN) && defined(__x86_64__) 2761 KASSERT(pmap_pdirpa(oldpmap, 0) == ci->ci_xen_current_user_pgd || 2762 oldpmap == pmap_kernel()); 2763 #elif defined(PAE) 2764 KASSERT(pmap_pdirpa(oldpmap, 0) == pmap_pte2pa(ci->ci_pae_l3_pdir[0])); 2765 #elif !defined(XEN) 2766 KASSERT(pmap_pdirpa(oldpmap, 0) == pmap_pte2pa(rcr3())); 2767 #endif 2768 KASSERT(!kcpuset_isset(pmap->pm_cpus, cid)); 2769 KASSERT(!kcpuset_isset(pmap->pm_kernel_cpus, cid)); 2770 2771 /* 2772 * Mark the pmap in use by this CPU. Again, we must synchronize 2773 * with TLB shootdown interrupts, so set the state VALID first, 2774 * then register us for shootdown events on this pmap. 2775 */ 2776 ci->ci_tlbstate = TLBSTATE_VALID; 2777 kcpuset_atomic_set(pmap->pm_cpus, cid); 2778 kcpuset_atomic_set(pmap->pm_kernel_cpus, cid); 2779 ci->ci_pmap = pmap; 2780 2781 /* 2782 * update tss. now that we have registered for invalidations 2783 * from other CPUs, we're good to load the page tables. 2784 */ 2785 #ifdef PAE 2786 pcb->pcb_cr3 = ci->ci_pae_l3_pdirpa; 2787 #else 2788 pcb->pcb_cr3 = pmap_pdirpa(pmap, 0); 2789 #endif 2790 2791 #ifdef i386 2792 #ifndef XEN 2793 ci->ci_tss.tss_ldt = pmap->pm_ldt_sel; 2794 ci->ci_tss.tss_cr3 = pcb->pcb_cr3; 2795 #endif /* !XEN */ 2796 #endif /* i386 */ 2797 2798 lldt(pmap->pm_ldt_sel); 2799 2800 u_int gen = uvm_emap_gen_return(); 2801 cpu_load_pmap(pmap, oldpmap); 2802 uvm_emap_update(gen); 2803 2804 ci->ci_want_pmapload = 0; 2805 2806 /* 2807 * we're now running with the new pmap. drop the reference 2808 * to the old pmap. if we block, we need to go around again. 2809 */ 2810 2811 pmap_destroy(oldpmap); 2812 if (l->l_ncsw != ncsw) { 2813 goto retry; 2814 } 2815 2816 kpreempt_enable(); 2817 } 2818 2819 /* 2820 * pmap_deactivate: deactivate a process' pmap. 2821 * 2822 * => Must be called with kernel preemption disabled (high IPL is enough). 
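 * => The pmap stays loaded; we merely drop this CPU to TLBSTATE_LAZY
 *    so that pmap_reactivate()/pmap_load() can decide later whether a
 *    full flush is needed.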
2823 */ 2824 void 2825 pmap_deactivate(struct lwp *l) 2826 { 2827 struct pmap *pmap; 2828 struct cpu_info *ci; 2829 2830 KASSERT(kpreempt_disabled()); 2831 2832 if (l != curlwp) { 2833 return; 2834 } 2835 2836 /* 2837 * Wait for pending TLB shootdowns to complete. Necessary because 2838 * TLB shootdown state is per-CPU, and the LWP may be coming off 2839 * the CPU before it has a chance to call pmap_update(), e.g. due 2840 * to kernel preemption or blocking routine in between. 2841 */ 2842 pmap_tlb_shootnow(); 2843 2844 ci = curcpu(); 2845 2846 if (ci->ci_want_pmapload) { 2847 /* 2848 * ci_want_pmapload means that our pmap is not loaded on 2849 * the CPU or TLB might be stale. note that pmap_kernel() 2850 * is always considered loaded. 2851 */ 2852 KASSERT(vm_map_pmap(&l->l_proc->p_vmspace->vm_map) 2853 != pmap_kernel()); 2854 KASSERT(vm_map_pmap(&l->l_proc->p_vmspace->vm_map) 2855 != ci->ci_pmap || ci->ci_tlbstate != TLBSTATE_VALID); 2856 2857 /* 2858 * userspace has not been touched. 2859 * nothing to do here. 2860 */ 2861 2862 ci->ci_want_pmapload = 0; 2863 return; 2864 } 2865 2866 pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map); 2867 2868 if (pmap == pmap_kernel()) { 2869 return; 2870 } 2871 2872 #if defined(XEN) && defined(__x86_64__) 2873 KASSERT(pmap_pdirpa(pmap, 0) == ci->ci_xen_current_user_pgd); 2874 #elif defined(PAE) 2875 KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(ci->ci_pae_l3_pdir[0])); 2876 #elif !defined(XEN) 2877 KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(rcr3())); 2878 #endif 2879 KASSERT(ci->ci_pmap == pmap); 2880 2881 /* 2882 * we aren't interested in TLB invalidations for this pmap, 2883 * at least for the time being. 2884 */ 2885 2886 KASSERT(ci->ci_tlbstate == TLBSTATE_VALID); 2887 ci->ci_tlbstate = TLBSTATE_LAZY; 2888 } 2889 2890 /* 2891 * end of lifecycle functions 2892 */ 2893 2894 /* 2895 * some misc. functions 2896 */ 2897 2898 int 2899 pmap_pdes_invalid(vaddr_t va, pd_entry_t * const *pdes, pd_entry_t *lastpde) 2900 { 2901 int i; 2902 unsigned long index; 2903 pd_entry_t pde; 2904 2905 for (i = PTP_LEVELS; i > 1; i--) { 2906 index = pl_i(va, i); 2907 pde = pdes[i - 2][index]; 2908 if ((pde & PG_V) == 0) 2909 return i; 2910 } 2911 if (lastpde != NULL) 2912 *lastpde = pde; 2913 return 0; 2914 } 2915 2916 /* 2917 * pmap_extract: extract a PA for the given VA 2918 */ 2919 2920 bool 2921 pmap_extract(struct pmap *pmap, vaddr_t va, paddr_t *pap) 2922 { 2923 pt_entry_t *ptes, pte; 2924 pd_entry_t pde; 2925 pd_entry_t * const *pdes; 2926 struct pmap *pmap2; 2927 struct cpu_info *ci; 2928 paddr_t pa; 2929 lwp_t *l; 2930 bool hard, rv; 2931 2932 #ifdef __HAVE_DIRECT_MAP 2933 if (va >= PMAP_DIRECT_BASE && va < PMAP_DIRECT_END) { 2934 if (pap != NULL) { 2935 *pap = va - PMAP_DIRECT_BASE; 2936 } 2937 return true; 2938 } 2939 #endif 2940 2941 rv = false; 2942 pa = 0; 2943 l = curlwp; 2944 2945 kpreempt_disable(); 2946 ci = l->l_cpu; 2947 if (__predict_true(!ci->ci_want_pmapload && ci->ci_pmap == pmap) || 2948 pmap == pmap_kernel()) { 2949 /* 2950 * no need to lock, because it's pmap_kernel() or our 2951 * own pmap and is active. if a user pmap, the caller 2952 * will hold the vm_map write/read locked and so prevent 2953 * entries from disappearing while we are here. ptps 2954 * can disappear via pmap_remove() and pmap_protect(), 2955 * but they are called with the vm_map write locked. 2956 */ 2957 hard = false; 2958 ptes = PTE_BASE; 2959 pdes = normal_pdes; 2960 } else { 2961 /* we lose, do it the hard way. 
*/ 2962 hard = true; 2963 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); 2964 } 2965 if (pmap_pdes_valid(va, pdes, &pde)) { 2966 pte = ptes[pl1_i(va)]; 2967 if (pde & PG_PS) { 2968 pa = (pde & PG_LGFRAME) | (va & (NBPD_L2 - 1)); 2969 rv = true; 2970 } else if (__predict_true((pte & PG_V) != 0)) { 2971 pa = pmap_pte2pa(pte) | (va & (NBPD_L1 - 1)); 2972 rv = true; 2973 } 2974 } 2975 if (__predict_false(hard)) { 2976 pmap_unmap_ptes(pmap, pmap2); 2977 } 2978 kpreempt_enable(); 2979 if (pap != NULL) { 2980 *pap = pa; 2981 } 2982 return rv; 2983 } 2984 2985 2986 /* 2987 * vtophys: virtual address to physical address. For use by 2988 * machine-dependent code only. 2989 */ 2990 2991 paddr_t 2992 vtophys(vaddr_t va) 2993 { 2994 paddr_t pa; 2995 2996 if (pmap_extract(pmap_kernel(), va, &pa) == true) 2997 return (pa); 2998 return (0); 2999 } 3000 3001 __strict_weak_alias(pmap_extract_ma, pmap_extract); 3002 3003 #ifdef XEN 3004 3005 /* 3006 * vtomach: virtual address to machine address. For use by 3007 * machine-dependent code only. 3008 */ 3009 3010 paddr_t 3011 vtomach(vaddr_t va) 3012 { 3013 paddr_t pa; 3014 3015 if (pmap_extract_ma(pmap_kernel(), va, &pa) == true) 3016 return (pa); 3017 return (0); 3018 } 3019 3020 #endif /* XEN */ 3021 3022 /* 3023 * pmap_virtual_space: used during bootup [pmap_steal_memory] to 3024 * determine the bounds of the kernel virtual addess space. 3025 */ 3026 3027 void 3028 pmap_virtual_space(vaddr_t *startp, vaddr_t *endp) 3029 { 3030 *startp = virtual_avail; 3031 *endp = virtual_end; 3032 } 3033 3034 /* 3035 * pmap_zero_page: zero a page 3036 */ 3037 3038 void 3039 pmap_zero_page(paddr_t pa) 3040 { 3041 #if defined(__HAVE_DIRECT_MAP) 3042 pagezero(PMAP_DIRECT_MAP(pa)); 3043 #else 3044 #if defined(XEN) 3045 if (XEN_VERSION_SUPPORTED(3, 4)) 3046 xen_pagezero(pa); 3047 #endif 3048 struct cpu_info *ci; 3049 pt_entry_t *zpte; 3050 vaddr_t zerova; 3051 3052 const pd_entry_t pteflags = PG_V | PG_RW | pmap_pg_nx | PG_M | PG_U | 3053 PG_k; 3054 3055 kpreempt_disable(); 3056 3057 ci = curcpu(); 3058 zerova = ci->vpage[VPAGE_ZER]; 3059 zpte = ci->vpage_pte[VPAGE_ZER]; 3060 3061 #ifdef DIAGNOSTIC 3062 if (*zpte) 3063 panic("pmap_zero_page: lock botch"); 3064 #endif 3065 3066 pmap_pte_set(zpte, pmap_pa2pte(pa) | pteflags); 3067 pmap_pte_flush(); 3068 pmap_update_pg(zerova); /* flush TLB */ 3069 3070 memset((void *)zerova, 0, PAGE_SIZE); 3071 3072 #if defined(DIAGNOSTIC) || defined(XEN) 3073 pmap_pte_set(zpte, 0); /* zap ! */ 3074 pmap_pte_flush(); 3075 #endif 3076 3077 kpreempt_enable(); 3078 #endif /* defined(__HAVE_DIRECT_MAP) */ 3079 } 3080 3081 /* 3082 * pmap_pagezeroidle: the same, for the idle loop page zero'er. 3083 * Returns true if the page was zero'd, false if we aborted for 3084 * some reason. 
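 *
 * Both the direct-map and the temporary-mapping variants go through
 * sse2_idlezero_page(), hence the CPUID_SSE2 assertions below.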
3085 */ 3086 3087 bool 3088 pmap_pageidlezero(paddr_t pa) 3089 { 3090 #ifdef __HAVE_DIRECT_MAP 3091 KASSERT(cpu_feature[0] & CPUID_SSE2); 3092 return sse2_idlezero_page((void *)PMAP_DIRECT_MAP(pa)); 3093 #else 3094 struct cpu_info *ci; 3095 pt_entry_t *zpte; 3096 vaddr_t zerova; 3097 bool rv; 3098 3099 const pd_entry_t pteflags = PG_V | PG_RW | pmap_pg_nx | PG_M | PG_U | 3100 PG_k; 3101 3102 ci = curcpu(); 3103 zerova = ci->vpage[VPAGE_ZER]; 3104 zpte = ci->vpage_pte[VPAGE_ZER]; 3105 3106 KASSERT(cpu_feature[0] & CPUID_SSE2); 3107 KASSERT(*zpte == 0); 3108 3109 pmap_pte_set(zpte, pmap_pa2pte(pa) | pteflags); 3110 pmap_pte_flush(); 3111 pmap_update_pg(zerova); /* flush TLB */ 3112 3113 rv = sse2_idlezero_page((void *)zerova); 3114 3115 #if defined(DIAGNOSTIC) || defined(XEN) 3116 pmap_pte_set(zpte, 0); /* zap ! */ 3117 pmap_pte_flush(); 3118 #endif 3119 3120 return rv; 3121 #endif 3122 } 3123 3124 /* 3125 * pmap_copy_page: copy a page 3126 */ 3127 3128 void 3129 pmap_copy_page(paddr_t srcpa, paddr_t dstpa) 3130 { 3131 #if defined(__HAVE_DIRECT_MAP) 3132 vaddr_t srcva = PMAP_DIRECT_MAP(srcpa); 3133 vaddr_t dstva = PMAP_DIRECT_MAP(dstpa); 3134 3135 memcpy((void *)dstva, (void *)srcva, PAGE_SIZE); 3136 #else 3137 #if defined(XEN) 3138 if (XEN_VERSION_SUPPORTED(3, 4)) { 3139 xen_copy_page(srcpa, dstpa); 3140 return; 3141 } 3142 #endif 3143 struct cpu_info *ci; 3144 pt_entry_t *srcpte, *dstpte; 3145 vaddr_t srcva, dstva; 3146 3147 const pd_entry_t pteflags = PG_V | PG_RW | pmap_pg_nx | PG_U | PG_k; 3148 3149 kpreempt_disable(); 3150 3151 ci = curcpu(); 3152 srcva = ci->vpage[VPAGE_SRC]; 3153 dstva = ci->vpage[VPAGE_DST]; 3154 srcpte = ci->vpage_pte[VPAGE_SRC]; 3155 dstpte = ci->vpage_pte[VPAGE_DST]; 3156 3157 KASSERT(*srcpte == 0 && *dstpte == 0); 3158 3159 pmap_pte_set(srcpte, pmap_pa2pte(srcpa) | pteflags); 3160 pmap_pte_set(dstpte, pmap_pa2pte(dstpa) | pteflags | PG_M); 3161 pmap_pte_flush(); 3162 pmap_update_2pg(srcva, dstva); 3163 3164 memcpy((void *)dstva, (void *)srcva, PAGE_SIZE); 3165 3166 #if defined(DIAGNOSTIC) || defined(XEN) 3167 pmap_pte_set(srcpte, 0); 3168 pmap_pte_set(dstpte, 0); 3169 pmap_pte_flush(); 3170 #endif 3171 3172 kpreempt_enable(); 3173 #endif /* defined(__HAVE_DIRECT_MAP) */ 3174 } 3175 3176 static pt_entry_t * 3177 pmap_map_ptp(struct vm_page *ptp) 3178 { 3179 #ifdef __HAVE_DIRECT_MAP 3180 return (void *)PMAP_DIRECT_MAP(VM_PAGE_TO_PHYS(ptp)); 3181 #else 3182 struct cpu_info *ci; 3183 pt_entry_t *ptppte; 3184 vaddr_t ptpva; 3185 3186 KASSERT(kpreempt_disabled()); 3187 3188 #ifndef XEN 3189 const pd_entry_t pteflags = PG_V | PG_RW | pmap_pg_nx | PG_U | PG_M | 3190 PG_k; 3191 #else 3192 const pd_entry_t pteflags = PG_V | pmap_pg_nx | PG_U | PG_M | PG_k; 3193 #endif 3194 3195 ci = curcpu(); 3196 ptpva = ci->vpage[VPAGE_PTP]; 3197 ptppte = ci->vpage_pte[VPAGE_PTP]; 3198 3199 pmap_pte_set(ptppte, pmap_pa2pte(VM_PAGE_TO_PHYS(ptp)) | pteflags); 3200 3201 pmap_pte_flush(); 3202 pmap_update_pg(ptpva); 3203 3204 return (pt_entry_t *)ptpva; 3205 #endif 3206 } 3207 3208 static void 3209 pmap_unmap_ptp(void) 3210 { 3211 #ifndef __HAVE_DIRECT_MAP 3212 #if defined(DIAGNOSTIC) || defined(XEN) 3213 struct cpu_info *ci; 3214 pt_entry_t *pte; 3215 3216 KASSERT(kpreempt_disabled()); 3217 3218 ci = curcpu(); 3219 pte = ci->vpage_pte[VPAGE_PTP]; 3220 3221 if (*pte != 0) { 3222 pmap_pte_set(pte, 0); 3223 pmap_pte_flush(); 3224 } 3225 #endif 3226 #endif 3227 } 3228 3229 static pt_entry_t * 3230 pmap_map_pte(struct pmap *pmap, struct vm_page *ptp, vaddr_t va) 3231 { 3232 3233 
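	/*
	 * If the pmap is the one currently loaded, its PTEs are already
	 * reachable through the recursive mapping at PTE_BASE; otherwise
	 * the PTP is mapped temporarily through the per-CPU VPAGE_PTP
	 * slot (or via the direct map, where available).
	 */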
KASSERT(kpreempt_disabled()); 3234 if (pmap_is_curpmap(pmap)) { 3235 return &PTE_BASE[pl1_i(va)]; /* (k)vtopte */ 3236 } 3237 KASSERT(ptp != NULL); 3238 return pmap_map_ptp(ptp) + pl1_pi(va); 3239 } 3240 3241 static void 3242 pmap_unmap_pte(void) 3243 { 3244 3245 KASSERT(kpreempt_disabled()); 3246 3247 pmap_unmap_ptp(); 3248 } 3249 3250 /* 3251 * p m a p r e m o v e f u n c t i o n s 3252 * 3253 * functions that remove mappings 3254 */ 3255 3256 /* 3257 * pmap_remove_ptes: remove PTEs from a PTP 3258 * 3259 * => caller must hold pmap's lock 3260 * => PTP must be mapped into KVA 3261 * => PTP should be null if pmap == pmap_kernel() 3262 * => must be called with kernel preemption disabled 3263 * => returns composite pte if at least one page should be shot down 3264 */ 3265 3266 static void 3267 pmap_remove_ptes(struct pmap *pmap, struct vm_page *ptp, vaddr_t ptpva, 3268 vaddr_t startva, vaddr_t endva, struct pv_entry **pv_tofree) 3269 { 3270 pt_entry_t *pte = (pt_entry_t *)ptpva; 3271 3272 KASSERT(pmap == pmap_kernel() || mutex_owned(pmap->pm_lock)); 3273 KASSERT(kpreempt_disabled()); 3274 3275 /* 3276 * note that ptpva points to the PTE that maps startva. this may 3277 * or may not be the first PTE in the PTP. 3278 * 3279 * we loop through the PTP while there are still PTEs to look at 3280 * and the wire_count is greater than 1 (because we use the wire_count 3281 * to keep track of the number of real PTEs in the PTP). 3282 */ 3283 while (startva < endva && (ptp == NULL || ptp->wire_count > 1)) { 3284 (void)pmap_remove_pte(pmap, ptp, pte, startva, pv_tofree); 3285 startva += PAGE_SIZE; 3286 pte++; 3287 } 3288 } 3289 3290 3291 /* 3292 * pmap_remove_pte: remove a single PTE from a PTP. 3293 * 3294 * => caller must hold pmap's lock 3295 * => PTP must be mapped into KVA 3296 * => PTP should be null if pmap == pmap_kernel() 3297 * => returns true if we removed a mapping 3298 * => must be called with kernel preemption disabled 3299 */ 3300 static bool 3301 pmap_remove_pte(struct pmap *pmap, struct vm_page *ptp, pt_entry_t *pte, 3302 vaddr_t va, struct pv_entry **pv_tofree) 3303 { 3304 struct pv_entry *pve; 3305 struct vm_page *pg; 3306 struct pmap_page *pp; 3307 pt_entry_t opte; 3308 3309 KASSERT(pmap == pmap_kernel() || mutex_owned(pmap->pm_lock)); 3310 KASSERT(kpreempt_disabled()); 3311 3312 if (!pmap_valid_entry(*pte)) { 3313 /* VA not mapped. */ 3314 return false; 3315 } 3316 3317 /* Atomically save the old PTE and zap it. */ 3318 opte = pmap_pte_testset(pte, 0); 3319 if (!pmap_valid_entry(opte)) { 3320 return false; 3321 } 3322 3323 pmap_exec_account(pmap, va, opte, 0); 3324 pmap_stats_update_bypte(pmap, 0, opte); 3325 3326 if (ptp) { 3327 /* 3328 * Dropping a PTE. Make sure that the PDE is flushed. 3329 */ 3330 ptp->wire_count--; 3331 if (ptp->wire_count <= 1) { 3332 opte |= PG_U; 3333 } 3334 } 3335 3336 if ((opte & PG_U) != 0) { 3337 pmap_tlb_shootdown(pmap, va, opte, TLBSHOOT_REMOVE_PTE); 3338 } 3339 3340 /* 3341 * If we are not on a pv_head list - we are done. 
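	 * (PG_PVLIST is only set by pmap_enter_ma() for managed or
	 * pv-tracked pages, so its absence means there is no pv_entry
	 * to clean up here.)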
3342 */ 3343 if ((opte & PG_PVLIST) == 0) { 3344 #if defined(DIAGNOSTIC) && !defined(DOM0OPS) 3345 if (PHYS_TO_VM_PAGE(pmap_pte2pa(opte)) != NULL || 3346 pmap_pv_tracked(pmap_pte2pa(opte)) != NULL) 3347 panic("pmap_remove_pte: managed or pv-tracked page" 3348 " without PG_PVLIST for %#"PRIxVADDR, va); 3349 #endif 3350 return true; 3351 } 3352 3353 if ((pg = PHYS_TO_VM_PAGE(pmap_pte2pa(opte))) != NULL) { 3354 KASSERT(uvm_page_locked_p(pg)); 3355 pp = VM_PAGE_TO_PP(pg); 3356 } else if ((pp = pmap_pv_tracked(pmap_pte2pa(opte))) == NULL) { 3357 paddr_t pa = pmap_pte2pa(opte); 3358 panic("pmap_remove_pte: PG_PVLIST with pv-untracked page" 3359 " va = 0x%"PRIxVADDR 3360 " pa = 0x%"PRIxPADDR" (0x%"PRIxPADDR")", 3361 va, pa, atop(pa)); 3362 } 3363 3364 /* Sync R/M bits. */ 3365 pp->pp_attrs |= opte; 3366 pve = pmap_remove_pv(pp, ptp, va); 3367 3368 if (pve) { 3369 pve->pve_next = *pv_tofree; 3370 *pv_tofree = pve; 3371 } 3372 return true; 3373 } 3374 3375 /* 3376 * pmap_remove: mapping removal function. 3377 * 3378 * => caller should not be holding any pmap locks 3379 */ 3380 3381 void 3382 pmap_remove(struct pmap *pmap, vaddr_t sva, vaddr_t eva) 3383 { 3384 pt_entry_t *ptes; 3385 pd_entry_t pde; 3386 pd_entry_t * const *pdes; 3387 struct pv_entry *pv_tofree = NULL; 3388 bool result; 3389 int i; 3390 paddr_t ptppa; 3391 vaddr_t blkendva, va = sva; 3392 struct vm_page *ptp; 3393 struct pmap *pmap2; 3394 3395 kpreempt_disable(); 3396 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); /* locks pmap */ 3397 3398 /* 3399 * removing one page? take shortcut function. 3400 */ 3401 3402 if (va + PAGE_SIZE == eva) { 3403 if (pmap_pdes_valid(va, pdes, &pde)) { 3404 3405 /* PA of the PTP */ 3406 ptppa = pmap_pte2pa(pde); 3407 3408 /* Get PTP if non-kernel mapping. */ 3409 if (pmap != pmap_kernel()) { 3410 ptp = pmap_find_ptp(pmap, va, ptppa, 1); 3411 KASSERTMSG(ptp != NULL, 3412 "pmap_remove: unmanaged PTP detected"); 3413 } else { 3414 /* Never free kernel PTPs. */ 3415 ptp = NULL; 3416 } 3417 3418 result = pmap_remove_pte(pmap, ptp, 3419 &ptes[pl1_i(va)], va, &pv_tofree); 3420 3421 /* 3422 * if mapping removed and the PTP is no longer 3423 * being used, free it! 3424 */ 3425 3426 if (result && ptp && ptp->wire_count <= 1) 3427 pmap_free_ptp(pmap, ptp, va, ptes, pdes); 3428 } 3429 } else for (/* null */ ; va < eva ; va = blkendva) { 3430 int lvl; 3431 3432 /* determine range of block */ 3433 blkendva = x86_round_pdr(va+1); 3434 if (blkendva > eva) 3435 blkendva = eva; 3436 3437 /* 3438 * Our PTE mappings should never be removed with pmap_remove. 3439 * 3440 * XXXmaxv: still needed? 3441 * 3442 * A long term solution is to move the PTEs out of user address 3443 * space, and into kernel address space. Then we can set 3444 * VM_MAXUSER_ADDRESS to be VM_MAX_ADDRESS. 3445 */ 3446 for (i = 0; i < PDP_SIZE; i++) { 3447 if (pl_i(va, PTP_LEVELS) == PDIR_SLOT_PTE+i) 3448 panic("PTE space accessed"); 3449 } 3450 3451 lvl = pmap_pdes_invalid(va, pdes, &pde); 3452 if (lvl != 0) { 3453 /* 3454 * skip a range corresponding to an invalid pde. 3455 */ 3456 blkendva = (va & ptp_masks[lvl - 1]) + nbpd[lvl - 1]; 3457 continue; 3458 } 3459 3460 /* PA of the PTP */ 3461 ptppa = pmap_pte2pa(pde); 3462 3463 /* Get PTP if non-kernel mapping. */ 3464 if (pmap != pmap_kernel()) { 3465 ptp = pmap_find_ptp(pmap, va, ptppa, 1); 3466 KASSERTMSG(ptp != NULL, 3467 "pmap_remove: unmanaged PTP detected"); 3468 } else { 3469 /* Never free kernel PTPs. 
*/ 3470 ptp = NULL; 3471 } 3472 3473 pmap_remove_ptes(pmap, ptp, (vaddr_t)&ptes[pl1_i(va)], va, 3474 blkendva, &pv_tofree); 3475 3476 /* if PTP is no longer being used, free it! */ 3477 if (ptp && ptp->wire_count <= 1) { 3478 pmap_free_ptp(pmap, ptp, va, ptes, pdes); 3479 } 3480 } 3481 pmap_unmap_ptes(pmap, pmap2); /* unlock pmap */ 3482 kpreempt_enable(); 3483 3484 /* Now we free unused PVs */ 3485 if (pv_tofree) 3486 pmap_free_pvs(pv_tofree); 3487 } 3488 3489 /* 3490 * pmap_sync_pv: clear pte bits and return the old value of the pte. 3491 * 3492 * => Caller should disable kernel preemption. 3493 * => issues tlb shootdowns if necessary. 3494 */ 3495 3496 static int 3497 pmap_sync_pv(struct pv_pte *pvpte, pt_entry_t expect, int clearbits, 3498 pt_entry_t *optep) 3499 { 3500 struct pmap *pmap; 3501 struct vm_page *ptp; 3502 vaddr_t va; 3503 pt_entry_t *ptep; 3504 pt_entry_t opte; 3505 pt_entry_t npte; 3506 bool need_shootdown; 3507 3508 ptp = pvpte->pte_ptp; 3509 va = pvpte->pte_va; 3510 KASSERT(ptp == NULL || ptp->uobject != NULL); 3511 KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset); 3512 pmap = ptp_to_pmap(ptp); 3513 3514 KASSERT((expect & ~(PG_FRAME | PG_V)) == 0); 3515 KASSERT((expect & PG_V) != 0); 3516 KASSERT(clearbits == ~0 || (clearbits & ~(PG_M | PG_U | PG_RW)) == 0); 3517 KASSERT(kpreempt_disabled()); 3518 3519 ptep = pmap_map_pte(pmap, ptp, va); 3520 do { 3521 opte = *ptep; 3522 KASSERT((opte & (PG_M | PG_U)) != PG_M); 3523 KASSERT((opte & (PG_U | PG_V)) != PG_U); 3524 KASSERT(opte == 0 || (opte & PG_V) != 0); 3525 if ((opte & (PG_FRAME | PG_V)) != expect) { 3526 3527 /* 3528 * we lost a race with a V->P operation like 3529 * pmap_remove(). wait for the competitor 3530 * reflecting pte bits into mp_attrs. 3531 * 3532 * issue a redundant TLB shootdown so that 3533 * we can wait for its completion. 3534 */ 3535 3536 pmap_unmap_pte(); 3537 if (clearbits != 0) { 3538 pmap_tlb_shootdown(pmap, va, 3539 (pmap == pmap_kernel() ? PG_G : 0), 3540 TLBSHOOT_SYNC_PV1); 3541 } 3542 return EAGAIN; 3543 } 3544 3545 /* 3546 * check if there's anything to do on this pte. 3547 */ 3548 3549 if ((opte & clearbits) == 0) { 3550 need_shootdown = false; 3551 break; 3552 } 3553 3554 /* 3555 * we need a shootdown if the pte is cached. (PG_U) 3556 * 3557 * ...unless we are clearing only the PG_RW bit and 3558 * it isn't cached as RW. (PG_M) 3559 */ 3560 3561 need_shootdown = (opte & PG_U) != 0 && 3562 !(clearbits == PG_RW && (opte & PG_M) == 0); 3563 3564 npte = opte & ~clearbits; 3565 3566 /* 3567 * if we need a shootdown anyway, clear PG_U and PG_M. 
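		 * (the stale translation is being invalidated in any case,
		 * so the hardware will set the accessed/modified bits again
		 * on the next access; the old state is still returned to
		 * the caller via *optep.)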
3568 */ 3569 3570 if (need_shootdown) { 3571 npte &= ~(PG_U | PG_M); 3572 } 3573 KASSERT((npte & (PG_M | PG_U)) != PG_M); 3574 KASSERT((npte & (PG_U | PG_V)) != PG_U); 3575 KASSERT(npte == 0 || (opte & PG_V) != 0); 3576 } while (pmap_pte_cas(ptep, opte, npte) != opte); 3577 3578 if (need_shootdown) { 3579 pmap_tlb_shootdown(pmap, va, opte, TLBSHOOT_SYNC_PV2); 3580 } 3581 pmap_unmap_pte(); 3582 3583 *optep = opte; 3584 return 0; 3585 } 3586 3587 static void 3588 pmap_pp_remove(struct pmap_page *pp, paddr_t pa) 3589 { 3590 struct pv_pte *pvpte; 3591 struct pv_entry *killlist = NULL; 3592 struct vm_page *ptp; 3593 pt_entry_t expect; 3594 int count; 3595 3596 expect = pmap_pa2pte(pa) | PG_V; 3597 count = SPINLOCK_BACKOFF_MIN; 3598 kpreempt_disable(); 3599 startover: 3600 while ((pvpte = pv_pte_first(pp)) != NULL) { 3601 struct pmap *pmap; 3602 struct pv_entry *pve; 3603 pt_entry_t opte; 3604 vaddr_t va; 3605 int error; 3606 3607 /* 3608 * add a reference to the pmap before clearing the pte. 3609 * otherwise the pmap can disappear behind us. 3610 */ 3611 3612 ptp = pvpte->pte_ptp; 3613 pmap = ptp_to_pmap(ptp); 3614 if (ptp != NULL) { 3615 pmap_reference(pmap); 3616 } 3617 3618 error = pmap_sync_pv(pvpte, expect, ~0, &opte); 3619 if (error == EAGAIN) { 3620 int hold_count; 3621 KERNEL_UNLOCK_ALL(curlwp, &hold_count); 3622 if (ptp != NULL) { 3623 pmap_destroy(pmap); 3624 } 3625 SPINLOCK_BACKOFF(count); 3626 KERNEL_LOCK(hold_count, curlwp); 3627 goto startover; 3628 } 3629 3630 pp->pp_attrs |= opte; 3631 va = pvpte->pte_va; 3632 pve = pmap_remove_pv(pp, ptp, va); 3633 3634 /* update the PTP reference count. free if last reference. */ 3635 if (ptp != NULL) { 3636 struct pmap *pmap2; 3637 pt_entry_t *ptes; 3638 pd_entry_t * const *pdes; 3639 3640 KASSERT(pmap != pmap_kernel()); 3641 3642 pmap_tlb_shootnow(); 3643 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); 3644 pmap_stats_update_bypte(pmap, 0, opte); 3645 ptp->wire_count--; 3646 if (ptp->wire_count <= 1) { 3647 pmap_free_ptp(pmap, ptp, va, ptes, pdes); 3648 } 3649 pmap_unmap_ptes(pmap, pmap2); 3650 pmap_destroy(pmap); 3651 } else { 3652 KASSERT(pmap == pmap_kernel()); 3653 pmap_stats_update_bypte(pmap, 0, opte); 3654 } 3655 3656 if (pve != NULL) { 3657 pve->pve_next = killlist; /* mark it for death */ 3658 killlist = pve; 3659 } 3660 } 3661 pmap_tlb_shootnow(); 3662 kpreempt_enable(); 3663 3664 /* Now free unused pvs. */ 3665 pmap_free_pvs(killlist); 3666 } 3667 3668 /* 3669 * pmap_page_remove: remove a managed vm_page from all pmaps that map it 3670 * 3671 * => R/M bits are sync'd back to attrs 3672 */ 3673 3674 void 3675 pmap_page_remove(struct vm_page *pg) 3676 { 3677 struct pmap_page *pp; 3678 paddr_t pa; 3679 3680 KASSERT(uvm_page_locked_p(pg)); 3681 3682 pp = VM_PAGE_TO_PP(pg); 3683 pa = VM_PAGE_TO_PHYS(pg); 3684 pmap_pp_remove(pp, pa); 3685 } 3686 3687 /* 3688 * pmap_pv_remove: remove an unmanaged pv-tracked page from all pmaps 3689 * that map it 3690 */ 3691 3692 void 3693 pmap_pv_remove(paddr_t pa) 3694 { 3695 struct pmap_page *pp; 3696 3697 pp = pmap_pv_tracked(pa); 3698 if (pp == NULL) 3699 panic("pmap_pv_protect: page not pv-tracked: 0x%"PRIxPADDR, 3700 pa); 3701 pmap_pp_remove(pp, pa); 3702 } 3703 3704 /* 3705 * p m a p a t t r i b u t e f u n c t i o n s 3706 * functions that test/change managed page's attributes 3707 * since a page can be mapped multiple times we must check each PTE that 3708 * maps it by going down the pv lists. 
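 * The bits gathered from the PTEs are accumulated in pp_attrs, which
 * lets pmap_test_attrs() answer from that cache without touching any
 * PTE when the attribute has already been seen.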
3709 */ 3710 3711 /* 3712 * pmap_test_attrs: test a page's attributes 3713 */ 3714 3715 bool 3716 pmap_test_attrs(struct vm_page *pg, unsigned testbits) 3717 { 3718 struct pmap_page *pp; 3719 struct pv_pte *pvpte; 3720 pt_entry_t expect; 3721 u_int result; 3722 3723 KASSERT(uvm_page_locked_p(pg)); 3724 3725 pp = VM_PAGE_TO_PP(pg); 3726 if ((pp->pp_attrs & testbits) != 0) { 3727 return true; 3728 } 3729 expect = pmap_pa2pte(VM_PAGE_TO_PHYS(pg)) | PG_V; 3730 kpreempt_disable(); 3731 for (pvpte = pv_pte_first(pp); pvpte; pvpte = pv_pte_next(pp, pvpte)) { 3732 pt_entry_t opte; 3733 int error; 3734 3735 if ((pp->pp_attrs & testbits) != 0) { 3736 break; 3737 } 3738 error = pmap_sync_pv(pvpte, expect, 0, &opte); 3739 if (error == 0) { 3740 pp->pp_attrs |= opte; 3741 } 3742 } 3743 result = pp->pp_attrs & testbits; 3744 kpreempt_enable(); 3745 3746 /* 3747 * note that we will exit the for loop with a non-null pve if 3748 * we have found the bits we are testing for. 3749 */ 3750 3751 return result != 0; 3752 } 3753 3754 static bool 3755 pmap_pp_clear_attrs(struct pmap_page *pp, paddr_t pa, unsigned clearbits) 3756 { 3757 struct pv_pte *pvpte; 3758 u_int result; 3759 pt_entry_t expect; 3760 int count; 3761 3762 expect = pmap_pa2pte(pa) | PG_V; 3763 count = SPINLOCK_BACKOFF_MIN; 3764 kpreempt_disable(); 3765 startover: 3766 for (pvpte = pv_pte_first(pp); pvpte; pvpte = pv_pte_next(pp, pvpte)) { 3767 pt_entry_t opte; 3768 int error; 3769 3770 error = pmap_sync_pv(pvpte, expect, clearbits, &opte); 3771 if (error == EAGAIN) { 3772 int hold_count; 3773 KERNEL_UNLOCK_ALL(curlwp, &hold_count); 3774 SPINLOCK_BACKOFF(count); 3775 KERNEL_LOCK(hold_count, curlwp); 3776 goto startover; 3777 } 3778 pp->pp_attrs |= opte; 3779 } 3780 result = pp->pp_attrs & clearbits; 3781 pp->pp_attrs &= ~clearbits; 3782 pmap_tlb_shootnow(); 3783 kpreempt_enable(); 3784 3785 return result != 0; 3786 } 3787 3788 /* 3789 * pmap_clear_attrs: clear the specified attribute for a page. 3790 * 3791 * => we return true if we cleared one of the bits we were asked to 3792 */ 3793 3794 bool 3795 pmap_clear_attrs(struct vm_page *pg, unsigned clearbits) 3796 { 3797 struct pmap_page *pp; 3798 paddr_t pa; 3799 3800 KASSERT(uvm_page_locked_p(pg)); 3801 3802 pp = VM_PAGE_TO_PP(pg); 3803 pa = VM_PAGE_TO_PHYS(pg); 3804 3805 return pmap_pp_clear_attrs(pp, pa, clearbits); 3806 } 3807 3808 /* 3809 * pmap_pv_clear_attrs: clear the specified attributes for an unmanaged 3810 * pv-tracked page. 3811 */ 3812 3813 bool 3814 pmap_pv_clear_attrs(paddr_t pa, unsigned clearbits) 3815 { 3816 struct pmap_page *pp; 3817 3818 pp = pmap_pv_tracked(pa); 3819 if (pp == NULL) 3820 panic("pmap_pv_protect: page not pv-tracked: 0x%"PRIxPADDR, 3821 pa); 3822 3823 return pmap_pp_clear_attrs(pp, pa, clearbits); 3824 } 3825 3826 /* 3827 * p m a p p r o t e c t i o n f u n c t i o n s 3828 */ 3829 3830 /* 3831 * pmap_page_protect: change the protection of all recorded mappings 3832 * of a managed page 3833 * 3834 * => NOTE: this is an inline function in pmap.h 3835 */ 3836 3837 /* see pmap.h */ 3838 3839 /* 3840 * pmap_pv_protect: change the protection of all recorded mappings 3841 * of an unmanaged pv-tracked page 3842 * 3843 * => NOTE: this is an inline function in pmap.h 3844 */ 3845 3846 /* see pmap.h */ 3847 3848 /* 3849 * pmap_protect: set the protection in of the pages in a pmap 3850 * 3851 * => NOTE: this is an inline function in pmap.h 3852 */ 3853 3854 /* see pmap.h */ 3855 3856 /* 3857 * pmap_write_protect: write-protect pages in a pmap. 
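 *
 * => Despite the name this handles execute permission too: PG_RW is
 *    cleared when VM_PROT_WRITE is revoked and pmap_pg_nx is set when
 *    VM_PROT_EXECUTE is revoked.
 * => Illustrative use (sketch only): revoking write access on a
 *    kernel range while keeping it readable and executable could
 *    look like
 *
 *	pmap_write_protect(pmap_kernel(), sva, eva,
 *	    VM_PROT_READ | VM_PROT_EXECUTE);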
3858 */ 3859 void 3860 pmap_write_protect(struct pmap *pmap, vaddr_t sva, vaddr_t eva, vm_prot_t prot) 3861 { 3862 pt_entry_t bit_rem, bit_put; 3863 pt_entry_t *ptes; 3864 pt_entry_t * const *pdes; 3865 struct pmap *pmap2; 3866 vaddr_t blockend, va; 3867 3868 KASSERT(curlwp->l_md.md_gc_pmap != pmap); 3869 3870 bit_rem = 0; 3871 if (!(prot & VM_PROT_WRITE)) 3872 bit_rem = PG_RW; 3873 3874 bit_put = 0; 3875 if (!(prot & VM_PROT_EXECUTE)) 3876 bit_put = pmap_pg_nx; 3877 3878 sva &= PG_FRAME; 3879 eva &= PG_FRAME; 3880 3881 /* Acquire pmap. */ 3882 kpreempt_disable(); 3883 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); 3884 3885 for (va = sva ; va < eva; va = blockend) { 3886 pt_entry_t *spte, *epte; 3887 int i; 3888 3889 blockend = x86_round_pdr(va + 1); 3890 if (blockend > eva) 3891 blockend = eva; 3892 3893 /* 3894 * Our PTE mappings should never be write-protected. 3895 * 3896 * XXXmaxv: still needed? 3897 * 3898 * A long term solution is to move the PTEs out of user address 3899 * space, and into kernel address space. Then we can set 3900 * VM_MAXUSER_ADDRESS to be VM_MAX_ADDRESS. 3901 */ 3902 for (i = 0; i < PDP_SIZE; i++) { 3903 if (pl_i(va, PTP_LEVELS) == PDIR_SLOT_PTE+i) 3904 panic("PTE space accessed"); 3905 } 3906 3907 /* Is it a valid block? */ 3908 if (!pmap_pdes_valid(va, pdes, NULL)) { 3909 continue; 3910 } 3911 KASSERT(va < VM_MAXUSER_ADDRESS || va >= VM_MAX_ADDRESS); 3912 3913 spte = &ptes[pl1_i(va)]; 3914 epte = &ptes[pl1_i(blockend)]; 3915 3916 for (/* */; spte < epte; spte++) { 3917 pt_entry_t opte, npte; 3918 3919 do { 3920 opte = *spte; 3921 if (!pmap_valid_entry(opte)) { 3922 goto next; 3923 } 3924 npte = (opte & ~bit_rem) | bit_put; 3925 } while (pmap_pte_cas(spte, opte, npte) != opte); 3926 3927 if ((opte & PG_M) != 0) { 3928 vaddr_t tva = x86_ptob(spte - ptes); 3929 pmap_tlb_shootdown(pmap, tva, opte, 3930 TLBSHOOT_WRITE_PROTECT); 3931 } 3932 next:; 3933 } 3934 } 3935 3936 /* Release pmap. */ 3937 pmap_unmap_ptes(pmap, pmap2); 3938 kpreempt_enable(); 3939 } 3940 3941 /* 3942 * pmap_unwire: clear the wired bit in the PTE. 3943 * 3944 * => Mapping should already be present. 3945 */ 3946 void 3947 pmap_unwire(struct pmap *pmap, vaddr_t va) 3948 { 3949 pt_entry_t *ptes, *ptep, opte; 3950 pd_entry_t * const *pdes; 3951 struct pmap *pmap2; 3952 3953 /* Acquire pmap. */ 3954 kpreempt_disable(); 3955 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); 3956 3957 if (!pmap_pdes_valid(va, pdes, NULL)) { 3958 panic("pmap_unwire: invalid PDE"); 3959 } 3960 3961 ptep = &ptes[pl1_i(va)]; 3962 opte = *ptep; 3963 KASSERT(pmap_valid_entry(opte)); 3964 3965 if (opte & PG_W) { 3966 pt_entry_t npte = opte & ~PG_W; 3967 3968 opte = pmap_pte_testset(ptep, npte); 3969 pmap_stats_update_bypte(pmap, npte, opte); 3970 } else { 3971 printf("pmap_unwire: wiring for pmap %p va 0x%lx " 3972 "did not change!\n", pmap, va); 3973 } 3974 3975 /* Release pmap. */ 3976 pmap_unmap_ptes(pmap, pmap2); 3977 kpreempt_enable(); 3978 } 3979 3980 /* 3981 * pmap_copy: copy mappings from one pmap to another 3982 * 3983 * => optional function 3984 * void pmap_copy(dst_pmap, src_pmap, dst_addr, len, src_addr) 3985 */ 3986 3987 /* 3988 * defined as macro in pmap.h 3989 */ 3990 3991 __strict_weak_alias(pmap_enter, pmap_enter_default); 3992 3993 int 3994 pmap_enter_default(pmap_t pmap, vaddr_t va, paddr_t pa, vm_prot_t prot, 3995 u_int flags) 3996 { 3997 return pmap_enter_ma(pmap, va, pa, pa, prot, flags, 0); 3998 } 3999 4000 /* 4001 * pmap_enter: enter a mapping into a pmap 4002 * 4003 * => must be done "now" ... 
no lazy-evaluation 4004 * => we set pmap => pv_head locking 4005 */ 4006 int 4007 pmap_enter_ma(struct pmap *pmap, vaddr_t va, paddr_t ma, paddr_t pa, 4008 vm_prot_t prot, u_int flags, int domid) 4009 { 4010 pt_entry_t *ptes, opte, npte; 4011 pt_entry_t *ptep; 4012 pd_entry_t * const *pdes; 4013 struct vm_page *ptp; 4014 struct vm_page *new_pg, *old_pg; 4015 struct pmap_page *new_pp, *old_pp; 4016 struct pv_entry *old_pve = NULL; 4017 struct pv_entry *new_pve; 4018 struct pv_entry *new_sparepve; 4019 int error; 4020 bool wired = (flags & PMAP_WIRED) != 0; 4021 struct pmap *pmap2; 4022 4023 KASSERT(pmap_initialized); 4024 KASSERT(curlwp->l_md.md_gc_pmap != pmap); 4025 KASSERT(va < VM_MAX_KERNEL_ADDRESS); 4026 KASSERTMSG(va != (vaddr_t)PDP_BASE, 4027 "pmap_enter: trying to map over PDP!"); 4028 KASSERTMSG(va < VM_MIN_KERNEL_ADDRESS || 4029 pmap_valid_entry(pmap->pm_pdir[pl_i(va, PTP_LEVELS)]), 4030 "pmap_enter: missing kernel PTP for VA %lx!", va); 4031 4032 #ifdef XEN 4033 KASSERT(domid == DOMID_SELF || pa == 0); 4034 #endif /* XEN */ 4035 4036 npte = ma | protection_codes[prot] | PG_V; 4037 npte |= pmap_pat_flags(flags); 4038 if (wired) 4039 npte |= PG_W; 4040 if (va < VM_MAXUSER_ADDRESS) 4041 npte |= PG_u; 4042 else if (va < VM_MAX_ADDRESS) 4043 panic("PTE space accessed"); /* XXXmaxv: no longer needed? */ 4044 else 4045 npte |= PG_k; 4046 if (pmap == pmap_kernel()) 4047 npte |= pmap_pg_g; 4048 if (flags & VM_PROT_ALL) { 4049 npte |= PG_U; 4050 if (flags & VM_PROT_WRITE) { 4051 KASSERT((npte & PG_RW) != 0); 4052 npte |= PG_M; 4053 } 4054 } 4055 4056 #ifdef XEN 4057 if (domid != DOMID_SELF) 4058 new_pg = NULL; 4059 else 4060 #endif 4061 new_pg = PHYS_TO_VM_PAGE(pa); 4062 if (new_pg != NULL) { 4063 /* This is a managed page */ 4064 npte |= PG_PVLIST; 4065 new_pp = VM_PAGE_TO_PP(new_pg); 4066 } else if ((new_pp = pmap_pv_tracked(pa)) != NULL) { 4067 /* This is an unmanaged pv-tracked page */ 4068 npte |= PG_PVLIST; 4069 } else { 4070 new_pp = NULL; 4071 } 4072 4073 /* get pves. */ 4074 new_pve = pool_cache_get(&pmap_pv_cache, PR_NOWAIT); 4075 new_sparepve = pool_cache_get(&pmap_pv_cache, PR_NOWAIT); 4076 if (new_pve == NULL || new_sparepve == NULL) { 4077 if (flags & PMAP_CANFAIL) { 4078 error = ENOMEM; 4079 goto out2; 4080 } 4081 panic("pmap_enter: pve allocation failed"); 4082 } 4083 4084 kpreempt_disable(); 4085 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); /* locks pmap */ 4086 if (pmap == pmap_kernel()) { 4087 ptp = NULL; 4088 } else { 4089 ptp = pmap_get_ptp(pmap, va, pdes); 4090 if (ptp == NULL) { 4091 pmap_unmap_ptes(pmap, pmap2); 4092 if (flags & PMAP_CANFAIL) { 4093 error = ENOMEM; 4094 goto out; 4095 } 4096 panic("pmap_enter: get ptp failed"); 4097 } 4098 } 4099 4100 /* 4101 * update the pte. 4102 */ 4103 4104 ptep = &ptes[pl1_i(va)]; 4105 do { 4106 opte = *ptep; 4107 4108 /* 4109 * if the same page, inherit PG_U and PG_M. 
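		 * (when the target page is unchanged we skip the pv handling
		 * further down and never fold opte into pp_attrs, so the old
		 * referenced/modified state has to survive in the new PTE.)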
4110 */ 4111 if (((opte ^ npte) & (PG_FRAME | PG_V)) == 0) { 4112 npte |= opte & (PG_U | PG_M); 4113 } 4114 #if defined(XEN) 4115 if (domid != DOMID_SELF) { 4116 /* pmap_pte_cas with error handling */ 4117 int s = splvm(); 4118 if (opte != *ptep) { 4119 splx(s); 4120 continue; 4121 } 4122 error = xpq_update_foreign( 4123 vtomach((vaddr_t)ptep), npte, domid); 4124 splx(s); 4125 if (error) { 4126 if (ptp != NULL && ptp->wire_count <= 1) { 4127 pmap_free_ptp(pmap, ptp, va, ptes, pdes); 4128 } 4129 pmap_unmap_ptes(pmap, pmap2); 4130 goto out; 4131 } 4132 break; 4133 } 4134 #endif /* defined(XEN) */ 4135 } while (pmap_pte_cas(ptep, opte, npte) != opte); 4136 4137 /* 4138 * update statistics and PTP's reference count. 4139 */ 4140 4141 pmap_stats_update_bypte(pmap, npte, opte); 4142 if (ptp != NULL && !pmap_valid_entry(opte)) { 4143 ptp->wire_count++; 4144 } 4145 KASSERT(ptp == NULL || ptp->wire_count > 1); 4146 4147 /* 4148 * if the same page, we can skip pv_entry handling. 4149 */ 4150 4151 if (((opte ^ npte) & (PG_FRAME | PG_V)) == 0) { 4152 KASSERT(((opte ^ npte) & PG_PVLIST) == 0); 4153 goto same_pa; 4154 } 4155 4156 /* 4157 * if old page is pv-tracked, remove pv_entry from its list. 4158 */ 4159 4160 if ((~opte & (PG_V | PG_PVLIST)) == 0) { 4161 if ((old_pg = PHYS_TO_VM_PAGE(pmap_pte2pa(opte))) != NULL) { 4162 KASSERT(uvm_page_locked_p(old_pg)); 4163 old_pp = VM_PAGE_TO_PP(old_pg); 4164 } else if ((old_pp = pmap_pv_tracked(pmap_pte2pa(opte))) 4165 == NULL) { 4166 pa = pmap_pte2pa(opte); 4167 panic("pmap_enter: PG_PVLIST with pv-untracked page" 4168 " va = 0x%"PRIxVADDR 4169 " pa = 0x%" PRIxPADDR " (0x%" PRIxPADDR ")", 4170 va, pa, atop(pa)); 4171 } 4172 4173 old_pve = pmap_remove_pv(old_pp, ptp, va); 4174 old_pp->pp_attrs |= opte; 4175 } 4176 4177 /* 4178 * if new page is pv-tracked, insert pv_entry into its list. 4179 */ 4180 4181 if (new_pp) { 4182 new_pve = pmap_enter_pv(new_pp, new_pve, &new_sparepve, ptp, va); 4183 } 4184 4185 same_pa: 4186 pmap_unmap_ptes(pmap, pmap2); 4187 4188 /* 4189 * shootdown tlb if necessary. 4190 */ 4191 4192 if ((~opte & (PG_V | PG_U)) == 0 && 4193 ((opte ^ npte) & (PG_FRAME | PG_RW)) != 0) { 4194 pmap_tlb_shootdown(pmap, va, opte, TLBSHOOT_ENTER); 4195 } 4196 4197 error = 0; 4198 out: 4199 kpreempt_enable(); 4200 out2: 4201 if (old_pve != NULL) { 4202 pool_cache_put(&pmap_pv_cache, old_pve); 4203 } 4204 if (new_pve != NULL) { 4205 pool_cache_put(&pmap_pv_cache, new_pve); 4206 } 4207 if (new_sparepve != NULL) { 4208 pool_cache_put(&pmap_pv_cache, new_sparepve); 4209 } 4210 4211 return error; 4212 } 4213 4214 static paddr_t 4215 pmap_get_physpage(void) 4216 { 4217 struct vm_page *ptp; 4218 struct pmap *kpm = pmap_kernel(); 4219 paddr_t pa; 4220 4221 if (!uvm.page_init_done) { 4222 /* 4223 * We're growing the kernel pmap early (from 4224 * uvm_pageboot_alloc()). This case must be 4225 * handled a little differently. 
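		 * Before uvm.page_init_done is set, uvm_pagealloc() cannot
		 * be used, so a raw page is taken with uvm_page_physget()
		 * and zeroed through the direct map, xen_pagezero(), or the
		 * early_zero_pte temporary mapping.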
4226 */ 4227 4228 if (!uvm_page_physget(&pa)) 4229 panic("pmap_get_physpage: out of memory"); 4230 #if defined(__HAVE_DIRECT_MAP) 4231 pagezero(PMAP_DIRECT_MAP(pa)); 4232 #else 4233 #if defined(XEN) 4234 if (XEN_VERSION_SUPPORTED(3, 4)) { 4235 xen_pagezero(pa); 4236 return pa; 4237 } 4238 #endif 4239 kpreempt_disable(); 4240 pmap_pte_set(early_zero_pte, pmap_pa2pte(pa) | PG_V | 4241 PG_RW | pmap_pg_nx | PG_k); 4242 pmap_pte_flush(); 4243 pmap_update_pg((vaddr_t)early_zerop); 4244 memset(early_zerop, 0, PAGE_SIZE); 4245 #if defined(DIAGNOSTIC) || defined(XEN) 4246 pmap_pte_set(early_zero_pte, 0); 4247 pmap_pte_flush(); 4248 #endif /* defined(DIAGNOSTIC) */ 4249 kpreempt_enable(); 4250 #endif /* defined(__HAVE_DIRECT_MAP) */ 4251 } else { 4252 /* XXX */ 4253 ptp = uvm_pagealloc(NULL, 0, NULL, 4254 UVM_PGA_USERESERVE|UVM_PGA_ZERO); 4255 if (ptp == NULL) 4256 panic("pmap_get_physpage: out of memory"); 4257 ptp->flags &= ~PG_BUSY; 4258 ptp->wire_count = 1; 4259 pa = VM_PAGE_TO_PHYS(ptp); 4260 } 4261 pmap_stats_update(kpm, 1, 0); 4262 4263 return pa; 4264 } 4265 4266 /* 4267 * Expand the page tree with the specified amount of PTPs, mapping virtual 4268 * addresses starting at kva. We populate all the levels but the last one 4269 * (L1). The nodes of the tree are created as RWX, but the pages covered 4270 * will be kentered in L1, with proper permissions. 4271 * 4272 * Used only by pmap_growkernel. 4273 */ 4274 static void 4275 pmap_alloc_level(vaddr_t kva, long *needed_ptps) 4276 { 4277 unsigned long i; 4278 paddr_t pa; 4279 unsigned long index, endindex; 4280 int level; 4281 pd_entry_t *pdep; 4282 #ifdef XEN 4283 int s = splvm(); /* protect xpq_* */ 4284 #endif 4285 4286 for (level = PTP_LEVELS; level > 1; level--) { 4287 if (level == PTP_LEVELS) 4288 pdep = pmap_kernel()->pm_pdir; 4289 else 4290 pdep = normal_pdes[level - 2]; 4291 index = pl_i_roundup(kva, level); 4292 endindex = index + needed_ptps[level - 1] - 1; 4293 4294 for (i = index; i <= endindex; i++) { 4295 pt_entry_t pte; 4296 4297 KASSERT(!pmap_valid_entry(pdep[i])); 4298 pa = pmap_get_physpage(); 4299 pte = pmap_pa2pte(pa) | PG_k | PG_V | PG_RW; 4300 pmap_pte_set(&pdep[i], pte); 4301 4302 #if defined(XEN) && (defined(PAE) || defined(__x86_64__)) 4303 if (level == PTP_LEVELS && i >= PDIR_SLOT_KERN) { 4304 if (__predict_true( 4305 cpu_info_primary.ci_flags & CPUF_PRESENT)) { 4306 /* update per-cpu PMDs on all cpus */ 4307 xen_kpm_sync(pmap_kernel(), i); 4308 } else { 4309 /* 4310 * too early; update primary CPU 4311 * PMD only (without locks) 4312 */ 4313 #ifdef PAE 4314 pd_entry_t *cpu_pdep = 4315 &cpu_info_primary.ci_kpm_pdir[l2tol2(i)]; 4316 #endif 4317 #ifdef __x86_64__ 4318 pd_entry_t *cpu_pdep = 4319 &cpu_info_primary.ci_kpm_pdir[i]; 4320 #endif 4321 pmap_pte_set(cpu_pdep, pte); 4322 } 4323 } 4324 #endif /* XEN && (PAE || __x86_64__) */ 4325 4326 KASSERT(level != PTP_LEVELS || nkptp[level - 1] + 4327 pl_i(VM_MIN_KERNEL_ADDRESS, level) == i); 4328 nkptp[level - 1]++; 4329 } 4330 pmap_pte_flush(); 4331 } 4332 #ifdef XEN 4333 splx(s); 4334 #endif 4335 } 4336 4337 /* 4338 * pmap_growkernel: increase usage of KVM space. 4339 * 4340 * => we allocate new PTPs for the kernel and install them in all 4341 * the pmaps on the system. 
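 * => if the number of top-level entries grows, the new kernel PDEs are
 *    copied into (or, on Xen, synced with) every pmap on the pmaps list,
 *    and the PDP pool cache is invalidated.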
4342 */ 4343 4344 vaddr_t 4345 pmap_growkernel(vaddr_t maxkvaddr) 4346 { 4347 struct pmap *kpm = pmap_kernel(); 4348 #if !defined(XEN) || !defined(__x86_64__) 4349 struct pmap *pm; 4350 long old; 4351 #endif 4352 int s, i; 4353 long needed_kptp[PTP_LEVELS], target_nptp; 4354 bool invalidate = false; 4355 4356 s = splvm(); /* to be safe */ 4357 mutex_enter(kpm->pm_lock); 4358 4359 if (maxkvaddr <= pmap_maxkvaddr) { 4360 mutex_exit(kpm->pm_lock); 4361 splx(s); 4362 return pmap_maxkvaddr; 4363 } 4364 4365 maxkvaddr = x86_round_pdr(maxkvaddr); 4366 #if !defined(XEN) || !defined(__x86_64__) 4367 old = nkptp[PTP_LEVELS - 1]; 4368 #endif 4369 4370 /* Initialize needed_kptp. */ 4371 for (i = PTP_LEVELS - 1; i >= 1; i--) { 4372 target_nptp = pl_i_roundup(maxkvaddr, i + 1) - 4373 pl_i_roundup(VM_MIN_KERNEL_ADDRESS, i + 1); 4374 4375 if (target_nptp > nkptpmax[i]) 4376 panic("out of KVA space"); 4377 KASSERT(target_nptp >= nkptp[i]); 4378 needed_kptp[i] = target_nptp - nkptp[i]; 4379 } 4380 4381 pmap_alloc_level(pmap_maxkvaddr, needed_kptp); 4382 4383 /* 4384 * If the number of top level entries changed, update all pmaps. 4385 */ 4386 if (needed_kptp[PTP_LEVELS - 1] != 0) { 4387 #ifdef XEN 4388 #ifdef __x86_64__ 4389 /* nothing, kernel entries are never entered in user pmap */ 4390 #else /* __x86_64__ */ 4391 mutex_enter(&pmaps_lock); 4392 LIST_FOREACH(pm, &pmaps, pm_list) { 4393 int pdkidx; 4394 for (pdkidx = PDIR_SLOT_KERN + old; 4395 pdkidx < PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1]; 4396 pdkidx++) { 4397 pmap_pte_set(&pm->pm_pdir[pdkidx], 4398 kpm->pm_pdir[pdkidx]); 4399 } 4400 pmap_pte_flush(); 4401 } 4402 mutex_exit(&pmaps_lock); 4403 #endif /* __x86_64__ */ 4404 #else /* XEN */ 4405 unsigned newpdes; 4406 newpdes = nkptp[PTP_LEVELS - 1] - old; 4407 mutex_enter(&pmaps_lock); 4408 LIST_FOREACH(pm, &pmaps, pm_list) { 4409 memcpy(&pm->pm_pdir[PDIR_SLOT_KERN + old], 4410 &kpm->pm_pdir[PDIR_SLOT_KERN + old], 4411 newpdes * sizeof (pd_entry_t)); 4412 } 4413 mutex_exit(&pmaps_lock); 4414 #endif 4415 invalidate = true; 4416 } 4417 pmap_maxkvaddr = maxkvaddr; 4418 mutex_exit(kpm->pm_lock); 4419 splx(s); 4420 4421 if (invalidate && pmap_initialized) { 4422 /* Invalidate the PDP cache. */ 4423 pool_cache_invalidate(&pmap_pdp_cache); 4424 } 4425 4426 return maxkvaddr; 4427 } 4428 4429 #ifdef DEBUG 4430 void pmap_dump(struct pmap *, vaddr_t, vaddr_t); 4431 4432 /* 4433 * pmap_dump: dump all the mappings from a pmap 4434 * 4435 * => caller should not be holding any pmap locks 4436 */ 4437 4438 void 4439 pmap_dump(struct pmap *pmap, vaddr_t sva, vaddr_t eva) 4440 { 4441 pt_entry_t *ptes, *pte; 4442 pd_entry_t * const *pdes; 4443 struct pmap *pmap2; 4444 vaddr_t blkendva; 4445 4446 /* 4447 * if end is out of range truncate. 4448 * if (end == start) update to max. 4449 */ 4450 4451 if (eva > VM_MAXUSER_ADDRESS || eva <= sva) 4452 eva = VM_MAXUSER_ADDRESS; 4453 4454 /* 4455 * we lock in the pmap => pv_head direction 4456 */ 4457 4458 kpreempt_disable(); 4459 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); /* locks pmap */ 4460 4461 /* 4462 * dumping a range of pages: we dump in PTP sized blocks (4MB) 4463 */ 4464 4465 for (/* null */ ; sva < eva ; sva = blkendva) { 4466 4467 /* determine range of block */ 4468 blkendva = x86_round_pdr(sva+1); 4469 if (blkendva > eva) 4470 blkendva = eva; 4471 4472 /* valid block? 
*/ 4473 if (!pmap_pdes_valid(sva, pdes, NULL)) 4474 continue; 4475 4476 pte = &ptes[pl1_i(sva)]; 4477 for (/* null */; sva < blkendva ; sva += PAGE_SIZE, pte++) { 4478 if (!pmap_valid_entry(*pte)) 4479 continue; 4480 printf("va %#" PRIxVADDR " -> pa %#" PRIxPADDR 4481 " (pte=%#" PRIxPADDR ")\n", 4482 sva, (paddr_t)pmap_pte2pa(*pte), (paddr_t)*pte); 4483 } 4484 } 4485 pmap_unmap_ptes(pmap, pmap2); 4486 kpreempt_enable(); 4487 } 4488 #endif 4489 4490 /* 4491 * pmap_update: process deferred invalidations and frees. 4492 */ 4493 4494 void 4495 pmap_update(struct pmap *pmap) 4496 { 4497 struct vm_page *empty_ptps; 4498 lwp_t *l = curlwp; 4499 4500 /* 4501 * If we have torn down this pmap, invalidate non-global TLB 4502 * entries on any processors using it. 4503 */ 4504 kpreempt_disable(); 4505 if (__predict_false(l->l_md.md_gc_pmap == pmap)) { 4506 l->l_md.md_gc_pmap = NULL; 4507 pmap_tlb_shootdown(pmap, (vaddr_t)-1LL, 0, TLBSHOOT_UPDATE); 4508 } 4509 /* 4510 * Initiate any pending TLB shootdowns. Wait for them to 4511 * complete before returning control to the caller. 4512 */ 4513 pmap_tlb_shootnow(); 4514 kpreempt_enable(); 4515 4516 /* 4517 * Now that shootdowns are complete, process deferred frees, 4518 * but not from interrupt context. 4519 */ 4520 if (l->l_md.md_gc_ptp != NULL) { 4521 KASSERT((l->l_pflag & LP_INTR) == 0); 4522 if (cpu_intr_p()) { 4523 return; 4524 } 4525 empty_ptps = l->l_md.md_gc_ptp; 4526 l->l_md.md_gc_ptp = NULL; 4527 pmap_free_ptps(empty_ptps); 4528 } 4529 } 4530 4531 #if PTP_LEVELS > 4 4532 #error "Unsupported number of page table mappings" 4533 #endif 4534 4535 paddr_t 4536 pmap_init_tmp_pgtbl(paddr_t pg) 4537 { 4538 static bool maps_loaded; 4539 static const paddr_t x86_tmp_pml_paddr[] = { 4540 4 * PAGE_SIZE, /* L1 */ 4541 5 * PAGE_SIZE, /* L2 */ 4542 6 * PAGE_SIZE, /* L3 */ 4543 7 * PAGE_SIZE /* L4 */ 4544 }; 4545 static vaddr_t x86_tmp_pml_vaddr[] = { 0, 0, 0, 0 }; 4546 4547 pd_entry_t *tmp_pml, *kernel_pml; 4548 4549 int level; 4550 4551 if (!maps_loaded) { 4552 for (level = 0; level < PTP_LEVELS; ++level) { 4553 x86_tmp_pml_vaddr[level] = 4554 uvm_km_alloc(kernel_map, PAGE_SIZE, 0, 4555 UVM_KMF_VAONLY); 4556 4557 if (x86_tmp_pml_vaddr[level] == 0) 4558 panic("mapping of real mode PML failed\n"); 4559 pmap_kenter_pa(x86_tmp_pml_vaddr[level], 4560 x86_tmp_pml_paddr[level], 4561 VM_PROT_READ | VM_PROT_WRITE, 0); 4562 } 4563 pmap_update(pmap_kernel()); 4564 maps_loaded = true; 4565 } 4566 4567 /* Zero levels 1-3 */ 4568 for (level = 0; level < PTP_LEVELS - 1; ++level) { 4569 tmp_pml = (void *)x86_tmp_pml_vaddr[level]; 4570 memset(tmp_pml, 0, PAGE_SIZE); 4571 } 4572 4573 /* Copy PML4 */ 4574 kernel_pml = pmap_kernel()->pm_pdir; 4575 tmp_pml = (void *)x86_tmp_pml_vaddr[PTP_LEVELS - 1]; 4576 memcpy(tmp_pml, kernel_pml, PAGE_SIZE); 4577 4578 #ifdef PAE 4579 /* 4580 * Use the last 4 entries of the L2 page as L3 PD entries. These 4581 * last entries are unlikely to be used for temporary mappings. 
4582 * 508: maps 0->1GB (userland) 4583 * 509: unused 4584 * 510: unused 4585 * 511: maps 3->4GB (kernel) 4586 */ 4587 tmp_pml[508] = x86_tmp_pml_paddr[PTP_LEVELS - 1] | PG_V; 4588 tmp_pml[509] = 0; 4589 tmp_pml[510] = 0; 4590 tmp_pml[511] = pmap_pdirpa(pmap_kernel(), PDIR_SLOT_KERN) | PG_V; 4591 #endif 4592 4593 for (level = PTP_LEVELS - 1; level > 0; --level) { 4594 tmp_pml = (void *)x86_tmp_pml_vaddr[level]; 4595 4596 tmp_pml[pl_i(pg, level + 1)] = 4597 (x86_tmp_pml_paddr[level - 1] & PG_FRAME) | PG_RW | PG_V; 4598 } 4599 4600 tmp_pml = (void *)x86_tmp_pml_vaddr[0]; 4601 tmp_pml[pl_i(pg, 1)] = (pg & PG_FRAME) | PG_RW | PG_V; 4602 4603 #ifdef PAE 4604 /* Return the PA of the L3 page (entry 508 of the L2 page) */ 4605 return x86_tmp_pml_paddr[PTP_LEVELS - 1] + 508 * sizeof(pd_entry_t); 4606 #endif 4607 4608 return x86_tmp_pml_paddr[PTP_LEVELS - 1]; 4609 } 4610 4611 u_int 4612 x86_mmap_flags(paddr_t mdpgno) 4613 { 4614 u_int nflag = (mdpgno >> X86_MMAP_FLAG_SHIFT) & X86_MMAP_FLAG_MASK; 4615 u_int pflag = 0; 4616 4617 if (nflag & X86_MMAP_FLAG_PREFETCH) 4618 pflag |= PMAP_WRITE_COMBINE; 4619 4620 return pflag; 4621 } 4622
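
/*
 * Illustrative sketch (not part of the original source): pmap_update()
 * above defers TLB shootdowns and PTP frees, so callers normally batch
 * their pmap_enter()/pmap_remove() work and flush once at the end.  The
 * helper below is hypothetical and only demonstrates that calling
 * pattern; "example_enter_and_flush" does not exist in NetBSD.
 */
#if 0
static int
example_enter_and_flush(struct pmap *pmap, vaddr_t va, paddr_t pa)
{
	int error;

	/*
	 * Enter a read/write mapping.  PMAP_CANFAIL turns a resource
	 * shortage into ENOMEM instead of a panic.
	 */
	error = pmap_enter(pmap, va, pa, VM_PROT_READ | VM_PROT_WRITE,
	    PMAP_CANFAIL);
	if (error != 0)
		return error;

	/* Process the deferred shootdowns and frees. */
	pmap_update(pmap);
	return 0;
}
#endif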