1 /* $NetBSD: pmap.c,v 1.196 2016/05/21 07:15:56 maxv Exp $ */ 2 3 /*- 4 * Copyright (c) 2008, 2010 The NetBSD Foundation, Inc. 5 * All rights reserved. 6 * 7 * This code is derived from software contributed to The NetBSD Foundation 8 * by Andrew Doran. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 29 * POSSIBILITY OF SUCH DAMAGE. 30 */ 31 32 /* 33 * Copyright (c) 2007 Manuel Bouyer. 34 * 35 * Redistribution and use in source and binary forms, with or without 36 * modification, are permitted provided that the following conditions 37 * are met: 38 * 1. Redistributions of source code must retain the above copyright 39 * notice, this list of conditions and the following disclaimer. 40 * 2. Redistributions in binary form must reproduce the above copyright 41 * notice, this list of conditions and the following disclaimer in the 42 * documentation and/or other materials provided with the distribution. 43 * 44 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 45 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 46 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 47 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 48 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 49 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 50 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 51 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 52 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 53 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 54 * 55 */ 56 57 /* 58 * Copyright (c) 2006 Mathieu Ropert <mro@adviseo.fr> 59 * 60 * Permission to use, copy, modify, and distribute this software for any 61 * purpose with or without fee is hereby granted, provided that the above 62 * copyright notice and this permission notice appear in all copies. 63 * 64 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 65 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 66 * MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 67 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 68 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 69 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 70 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 71 */ 72 73 /* 74 * Copyright (c) 1997 Charles D. Cranor and Washington University. 75 * All rights reserved. 76 * 77 * Redistribution and use in source and binary forms, with or without 78 * modification, are permitted provided that the following conditions 79 * are met: 80 * 1. Redistributions of source code must retain the above copyright 81 * notice, this list of conditions and the following disclaimer. 82 * 2. Redistributions in binary form must reproduce the above copyright 83 * notice, this list of conditions and the following disclaimer in the 84 * documentation and/or other materials provided with the distribution. 85 * 86 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 87 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 88 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 89 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 90 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 91 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 92 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 93 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 94 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 95 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 96 */ 97 98 /* 99 * Copyright 2001 (c) Wasabi Systems, Inc. 100 * All rights reserved. 101 * 102 * Written by Frank van der Linden for Wasabi Systems, Inc. 103 * 104 * Redistribution and use in source and binary forms, with or without 105 * modification, are permitted provided that the following conditions 106 * are met: 107 * 1. Redistributions of source code must retain the above copyright 108 * notice, this list of conditions and the following disclaimer. 109 * 2. Redistributions in binary form must reproduce the above copyright 110 * notice, this list of conditions and the following disclaimer in the 111 * documentation and/or other materials provided with the distribution. 112 * 3. All advertising materials mentioning features or use of this software 113 * must display the following acknowledgement: 114 * This product includes software developed for the NetBSD Project by 115 * Wasabi Systems, Inc. 116 * 4. The name of Wasabi Systems, Inc. may not be used to endorse 117 * or promote products derived from this software without specific prior 118 * written permission. 119 * 120 * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND 121 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 122 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 123 * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL WASABI SYSTEMS, INC 124 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 125 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 126 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 127 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 128 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 129 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 130 * POSSIBILITY OF SUCH DAMAGE. 131 */ 132 133 /* 134 * This is the i386 pmap modified and generalized to support x86-64 135 * as well. The idea is to hide the upper N levels of the page tables 136 * inside pmap_get_ptp, pmap_free_ptp and pmap_growkernel. The rest 137 * is mostly untouched, except that it uses some more generalized 138 * macros and interfaces. 139 * 140 * This pmap has been tested on the i386 as well, and it can be easily 141 * adapted to PAE. 142 * 143 * fvdl@wasabisystems.com 18-Jun-2001 144 */ 145 146 /* 147 * pmap.c: i386 pmap module rewrite 148 * Chuck Cranor <chuck@netbsd> 149 * 11-Aug-97 150 * 151 * history of this pmap module: in addition to my own input, i used 152 * the following references for this rewrite of the i386 pmap: 153 * 154 * [1] the NetBSD i386 pmap. this pmap appears to be based on the 155 * BSD hp300 pmap done by Mike Hibler at University of Utah. 156 * it was then ported to the i386 by William Jolitz of UUNET 157 * Technologies, Inc. Then Charles M. Hannum of the NetBSD 158 * project fixed some bugs and provided some speed ups. 159 * 160 * [2] the FreeBSD i386 pmap. this pmap seems to be the 161 * Hibler/Jolitz pmap, as modified for FreeBSD by John S. Dyson 162 * and David Greenman. 163 * 164 * [3] the Mach pmap. this pmap, from CMU, seems to have migrated 165 * between several processors. the VAX version was done by 166 * Avadis Tevanian, Jr., and Michael Wayne Young. the i386 167 * version was done by Lance Berc, Mike Kupfer, Bob Baron, 168 * David Golub, and Richard Draves. the alpha version was 169 * done by Alessandro Forin (CMU/Mach) and Chris Demetriou 170 * (NetBSD/alpha). 171 */ 172 173 #include <sys/cdefs.h> 174 __KERNEL_RCSID(0, "$NetBSD: pmap.c,v 1.196 2016/05/21 07:15:56 maxv Exp $"); 175 176 #include "opt_user_ldt.h" 177 #include "opt_lockdebug.h" 178 #include "opt_multiprocessor.h" 179 #include "opt_xen.h" 180 #if !defined(__x86_64__) 181 #include "opt_kstack_dr0.h" 182 #endif /* !defined(__x86_64__) */ 183 184 #include <sys/param.h> 185 #include <sys/systm.h> 186 #include <sys/proc.h> 187 #include <sys/pool.h> 188 #include <sys/kernel.h> 189 #include <sys/atomic.h> 190 #include <sys/cpu.h> 191 #include <sys/intr.h> 192 #include <sys/xcall.h> 193 #include <sys/kcore.h> 194 195 #include <uvm/uvm.h> 196 #include <uvm/pmap/pmap_pvt.h> 197 198 #include <dev/isa/isareg.h> 199 200 #include <machine/specialreg.h> 201 #include <machine/gdt.h> 202 #include <machine/isa_machdep.h> 203 #include <machine/cpuvar.h> 204 #include <machine/cputypes.h> 205 206 #include <x86/pmap.h> 207 #include <x86/pmap_pv.h> 208 209 #include <x86/i82489reg.h> 210 #include <x86/i82489var.h> 211 212 #ifdef XEN 213 #include <xen/xen-public/xen.h> 214 #include <xen/hypervisor.h> 215 #endif 216 217 /* 218 * general info: 219 * 220 * - for an explanation of how the i386 MMU hardware works see 221 * the comments in <machine/pte.h>. 
222 * 223 * - for an explanation of the general memory structure used by 224 * this pmap (including the recursive mapping), see the comments 225 * in <machine/pmap.h>. 226 * 227 * this file contains the code for the "pmap module." the module's 228 * job is to manage the hardware's virtual to physical address mappings. 229 * note that there are two levels of mapping in the VM system: 230 * 231 * [1] the upper layer of the VM system uses vm_map's and vm_map_entry's 232 * to map ranges of virtual address space to objects/files. for 233 * example, the vm_map may say: "map VA 0x1000 to 0x22000 read-only 234 * to the file /bin/ls starting at offset zero." note that 235 * the upper layer mapping is not concerned with how individual 236 * vm_pages are mapped. 237 * 238 * [2] the lower layer of the VM system (the pmap) maintains the mappings 239 * from virtual addresses. it is concerned with which vm_page is 240 * mapped where. for example, when you run /bin/ls and start 241 * at page 0x1000 the fault routine may lookup the correct page 242 * of the /bin/ls file and then ask the pmap layer to establish 243 * a mapping for it. 244 * 245 * note that information in the lower layer of the VM system can be 246 * thrown away since it can easily be reconstructed from the info 247 * in the upper layer. 248 * 249 * data structures we use include: 250 * 251 * - struct pmap: describes the address space of one thread 252 * - struct pmap_page: describes one pv-tracked page, without 253 * necessarily a corresponding vm_page 254 * - struct pv_entry: describes one <PMAP,VA> mapping of a PA 255 * - struct pv_head: there is one pv_head per pv-tracked page of 256 * physical memory. the pv_head points to a list of pv_entry 257 * structures which describe all the <PMAP,VA> pairs that this 258 * page is mapped in. this is critical for page based operations 259 * such as pmap_page_protect() [change protection on _all_ mappings 260 * of a page] 261 */ 262 263 /* 264 * memory allocation 265 * 266 * - there are three data structures that we must dynamically allocate: 267 * 268 * [A] new process' page directory page (PDP) 269 * - plan 1: done at pmap_create() we use 270 * uvm_km_alloc(kernel_map, PAGE_SIZE) [fka kmem_alloc] to do this 271 * allocation. 272 * 273 * if we are low in free physical memory then we sleep in 274 * uvm_km_alloc -- in this case this is ok since we are creating 275 * a new pmap and should not be holding any locks. 276 * 277 * if the kernel is totally out of virtual space 278 * (i.e. uvm_km_alloc returns NULL), then we panic. 279 * 280 * [B] new page tables pages (PTP) 281 * - call uvm_pagealloc() 282 * => success: zero page, add to pm_pdir 283 * => failure: we are out of free vm_pages, let pmap_enter() 284 * tell UVM about it. 285 * 286 * note: for kernel PTPs, we start with NKPTP of them. as we map 287 * kernel memory (at uvm_map time) we check to see if we've grown 288 * the kernel pmap. if so, we call the optional function 289 * pmap_growkernel() to grow the kernel PTPs in advance. 290 * 291 * [C] pv_entry structures 292 */ 293 294 /* 295 * locking 296 * 297 * we have the following locks that we must contend with: 298 * 299 * mutexes: 300 * 301 * - pmap lock (per pmap, part of uvm_object) 302 * this lock protects the fields in the pmap structure including 303 * the non-kernel PDEs in the PDP, and the PTEs. it also locks 304 * in the alternate PTE space (since that is determined by the 305 * entry in the PDP). 
306 * 307 * - pvh_lock (per pv_head) 308 * this lock protects the pv_entry list which is chained off the 309 * pv_head structure for a specific pv-tracked PA. it is locked 310 * when traversing the list (e.g. adding/removing mappings, 311 * syncing R/M bits, etc.) 312 * 313 * - pmaps_lock 314 * this lock protects the list of active pmaps (headed by "pmaps"). 315 * we lock it when adding or removing pmaps from this list. 316 */ 317 318 const vaddr_t ptp_masks[] = PTP_MASK_INITIALIZER; 319 const int ptp_shifts[] = PTP_SHIFT_INITIALIZER; 320 const long nkptpmax[] = NKPTPMAX_INITIALIZER; 321 const long nbpd[] = NBPD_INITIALIZER; 322 pd_entry_t * const normal_pdes[] = PDES_INITIALIZER; 323 324 long nkptp[] = NKPTP_INITIALIZER; 325 326 struct pmap_head pmaps; 327 kmutex_t pmaps_lock; 328 329 static vaddr_t pmap_maxkvaddr; 330 331 /* 332 * XXX kludge: dummy locking to make KASSERTs in uvm_page.c comfortable. 333 * actual locking is done by pm_lock. 334 */ 335 #if defined(DIAGNOSTIC) 336 #define PMAP_SUBOBJ_LOCK(pm, idx) \ 337 KASSERT(mutex_owned((pm)->pm_lock)); \ 338 if ((idx) != 0) \ 339 mutex_enter((pm)->pm_obj[(idx)].vmobjlock) 340 #define PMAP_SUBOBJ_UNLOCK(pm, idx) \ 341 KASSERT(mutex_owned((pm)->pm_lock)); \ 342 if ((idx) != 0) \ 343 mutex_exit((pm)->pm_obj[(idx)].vmobjlock) 344 #else /* defined(DIAGNOSTIC) */ 345 #define PMAP_SUBOBJ_LOCK(pm, idx) /* nothing */ 346 #define PMAP_SUBOBJ_UNLOCK(pm, idx) /* nothing */ 347 #endif /* defined(DIAGNOSTIC) */ 348 349 /* 350 * Misc. event counters. 351 */ 352 struct evcnt pmap_iobmp_evcnt; 353 struct evcnt pmap_ldt_evcnt; 354 355 /* 356 * PAT 357 */ 358 #define PATENTRY(n, type) (type << ((n) * 8)) 359 #define PAT_UC 0x0ULL 360 #define PAT_WC 0x1ULL 361 #define PAT_WT 0x4ULL 362 #define PAT_WP 0x5ULL 363 #define PAT_WB 0x6ULL 364 #define PAT_UCMINUS 0x7ULL 365 366 static bool cpu_pat_enabled __read_mostly = false; 367 368 /* 369 * global data structures 370 */ 371 372 static struct pmap kernel_pmap_store; /* the kernel's pmap (proc0) */ 373 struct pmap *const kernel_pmap_ptr = &kernel_pmap_store; 374 375 /* 376 * pmap_pg_g: if our processor supports PG_G in the PTE then we 377 * set pmap_pg_g to PG_G (otherwise it is zero). 378 */ 379 380 int pmap_pg_g __read_mostly = 0; 381 382 /* 383 * pmap_largepages: if our processor supports PG_PS and we are 384 * using it, this is set to true. 385 */ 386 387 int pmap_largepages __read_mostly; 388 389 /* 390 * i386 physical memory comes in a big contig chunk with a small 391 * hole toward the front of it... the following two paddr_t's 392 * (shared with machdep.c) describe the physical address space 393 * of this machine. 
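 *
 * as an illustration of how avail_start is used (a sketch, not the
 * literal boot code): pmap_bootstrap() below "steals" early physical
 * pages simply by advancing avail_start before UVM takes ownership
 * of the remaining range,
 *
 *	paddr_t pa = avail_start;	(claim one physical page)
 *	avail_start += PAGE_SIZE;	(UVM will never see this page)
 *
 * which is how the IDT page, the Xen dummy PGD and the direct-map
 * PDP pages are obtained later in this file.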
394 */ 395 paddr_t avail_start __read_mostly; /* PA of first available physical page */ 396 paddr_t avail_end __read_mostly; /* PA of last available physical page */ 397 398 #ifdef XEN 399 #ifdef __x86_64__ 400 /* Dummy PGD for user cr3, used between pmap_deactivate() and pmap_activate() */ 401 static paddr_t xen_dummy_user_pgd; 402 #endif /* __x86_64__ */ 403 paddr_t pmap_pa_start; /* PA of first physical page for this domain */ 404 paddr_t pmap_pa_end; /* PA of last physical page for this domain */ 405 #endif /* XEN */ 406 407 #define VM_PAGE_TO_PP(pg) (&(pg)->mdpage.mp_pp) 408 409 #define PV_HASH_SIZE 32768 410 #define PV_HASH_LOCK_CNT 32 411 412 struct pv_hash_lock { 413 kmutex_t lock; 414 } __aligned(CACHE_LINE_SIZE) pv_hash_locks[PV_HASH_LOCK_CNT] 415 __aligned(CACHE_LINE_SIZE); 416 417 struct pv_hash_head { 418 SLIST_HEAD(, pv_entry) hh_list; 419 } pv_hash_heads[PV_HASH_SIZE]; 420 421 static u_int 422 pvhash_hash(struct vm_page *ptp, vaddr_t va) 423 { 424 425 return (uintptr_t)ptp / sizeof(*ptp) + (va >> PAGE_SHIFT); 426 } 427 428 static struct pv_hash_head * 429 pvhash_head(u_int hash) 430 { 431 432 return &pv_hash_heads[hash % PV_HASH_SIZE]; 433 } 434 435 static kmutex_t * 436 pvhash_lock(u_int hash) 437 { 438 439 return &pv_hash_locks[hash % PV_HASH_LOCK_CNT].lock; 440 } 441 442 static struct pv_entry * 443 pvhash_remove(struct pv_hash_head *hh, struct vm_page *ptp, vaddr_t va) 444 { 445 struct pv_entry *pve; 446 struct pv_entry *prev; 447 448 prev = NULL; 449 SLIST_FOREACH(pve, &hh->hh_list, pve_hash) { 450 if (pve->pve_pte.pte_ptp == ptp && 451 pve->pve_pte.pte_va == va) { 452 if (prev != NULL) { 453 SLIST_REMOVE_AFTER(prev, pve_hash); 454 } else { 455 SLIST_REMOVE_HEAD(&hh->hh_list, pve_hash); 456 } 457 break; 458 } 459 prev = pve; 460 } 461 return pve; 462 } 463 464 /* 465 * other data structures 466 */ 467 468 static pt_entry_t protection_codes[8] __read_mostly; /* maps MI prot to i386 469 prot code */ 470 static bool pmap_initialized __read_mostly = false; /* pmap_init done yet? */ 471 472 /* 473 * the following two vaddr_t's are used during system startup 474 * to keep track of how much of the kernel's VM space we have used. 475 * once the system is started, the management of the remaining kernel 476 * VM space is turned over to the kernel_map vm_map. 477 */ 478 479 static vaddr_t virtual_avail __read_mostly; /* VA of first free KVA */ 480 static vaddr_t virtual_end __read_mostly; /* VA of last free KVA */ 481 482 /* 483 * pool that pmap structures are allocated from 484 */ 485 486 static struct pool_cache pmap_cache; 487 488 /* 489 * pv_entry cache 490 */ 491 492 static struct pool_cache pmap_pv_cache; 493 494 #ifdef __HAVE_DIRECT_MAP 495 496 extern phys_ram_seg_t mem_clusters[]; 497 extern int mem_cluster_cnt; 498 499 #else 500 501 /* 502 * MULTIPROCESSOR: special VA's/ PTE's are actually allocated inside a 503 * maxcpus*NPTECL array of PTE's, to avoid cache line thrashing 504 * due to false sharing. 
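 *
 * a sketch of the indexing, assuming NPTECL is 8: cpu0 owns PTE
 * slots 0-7 of the shared array (csrc, cdst, zero and ptp in the
 * first four), cpu1 owns slots 8-15, and so on, i.e. roughly
 *
 *	pte = PTESLEW(csrc_pte, id);	(csrc_pte + id * NPTECL)
 *	va  = VASLEW(csrcp, id);	(csrcp + id * NPTECL * PAGE_SIZE)
 *
 * with "id" the CPU's index, so no two CPUs share a cache line of
 * this PTE array.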
505 */ 506 507 #ifdef MULTIPROCESSOR 508 #define PTESLEW(pte, id) ((pte)+(id)*NPTECL) 509 #define VASLEW(va,id) ((va)+(id)*NPTECL*PAGE_SIZE) 510 #else 511 #define PTESLEW(pte, id) ((void)id, pte) 512 #define VASLEW(va,id) ((void)id, va) 513 #endif 514 515 /* 516 * special VAs and the PTEs that map them 517 */ 518 static pt_entry_t *csrc_pte, *cdst_pte, *zero_pte, *ptp_pte, *early_zero_pte; 519 static char *csrcp, *cdstp, *zerop, *ptpp; 520 #ifdef XEN 521 char *early_zerop; /* also referenced from xen_pmap_bootstrap() */ 522 #else 523 static char *early_zerop; 524 #endif 525 526 #endif 527 528 int pmap_enter_default(pmap_t, vaddr_t, paddr_t, vm_prot_t, u_int); 529 530 /* PDP pool_cache(9) and its callbacks */ 531 struct pool_cache pmap_pdp_cache; 532 static int pmap_pdp_ctor(void *, void *, int); 533 static void pmap_pdp_dtor(void *, void *); 534 #ifdef PAE 535 /* need to allocate items of 4 pages */ 536 static void *pmap_pdp_alloc(struct pool *, int); 537 static void pmap_pdp_free(struct pool *, void *); 538 static struct pool_allocator pmap_pdp_allocator = { 539 .pa_alloc = pmap_pdp_alloc, 540 .pa_free = pmap_pdp_free, 541 .pa_pagesz = PAGE_SIZE * PDP_SIZE, 542 }; 543 #endif /* PAE */ 544 545 extern vaddr_t idt_vaddr; /* we allocate IDT early */ 546 extern paddr_t idt_paddr; 547 548 #ifdef _LP64 549 extern vaddr_t lo32_vaddr; 550 extern vaddr_t lo32_paddr; 551 #endif 552 553 extern int end; 554 555 #ifdef i386 556 /* stuff to fix the pentium f00f bug */ 557 extern vaddr_t pentium_idt_vaddr; 558 #endif 559 560 561 /* 562 * local prototypes 563 */ 564 565 static struct vm_page *pmap_get_ptp(struct pmap *, vaddr_t, 566 pd_entry_t * const *); 567 static struct vm_page *pmap_find_ptp(struct pmap *, vaddr_t, paddr_t, int); 568 static void pmap_freepage(struct pmap *, struct vm_page *, int); 569 static void pmap_free_ptp(struct pmap *, struct vm_page *, 570 vaddr_t, pt_entry_t *, 571 pd_entry_t * const *); 572 static bool pmap_remove_pte(struct pmap *, struct vm_page *, 573 pt_entry_t *, vaddr_t, 574 struct pv_entry **); 575 static void pmap_remove_ptes(struct pmap *, struct vm_page *, 576 vaddr_t, vaddr_t, vaddr_t, 577 struct pv_entry **); 578 579 static bool pmap_get_physpage(vaddr_t, int, paddr_t *); 580 static void pmap_alloc_level(pd_entry_t * const *, vaddr_t, int, 581 long *); 582 583 static bool pmap_reactivate(struct pmap *); 584 585 /* 586 * p m a p h e l p e r f u n c t i o n s 587 */ 588 589 static inline void 590 pmap_stats_update(struct pmap *pmap, int resid_diff, int wired_diff) 591 { 592 593 if (pmap == pmap_kernel()) { 594 atomic_add_long(&pmap->pm_stats.resident_count, resid_diff); 595 atomic_add_long(&pmap->pm_stats.wired_count, wired_diff); 596 } else { 597 KASSERT(mutex_owned(pmap->pm_lock)); 598 pmap->pm_stats.resident_count += resid_diff; 599 pmap->pm_stats.wired_count += wired_diff; 600 } 601 } 602 603 static inline void 604 pmap_stats_update_bypte(struct pmap *pmap, pt_entry_t npte, pt_entry_t opte) 605 { 606 int resid_diff = ((npte & PG_V) ? 1 : 0) - ((opte & PG_V) ? 1 : 0); 607 int wired_diff = ((npte & PG_W) ? 1 : 0) - ((opte & PG_W) ? 
1 : 0); 608 609 KASSERT((npte & (PG_V | PG_W)) != PG_W); 610 KASSERT((opte & (PG_V | PG_W)) != PG_W); 611 612 pmap_stats_update(pmap, resid_diff, wired_diff); 613 } 614 615 /* 616 * ptp_to_pmap: lookup pmap by ptp 617 */ 618 619 static struct pmap * 620 ptp_to_pmap(struct vm_page *ptp) 621 { 622 struct pmap *pmap; 623 624 if (ptp == NULL) { 625 return pmap_kernel(); 626 } 627 pmap = (struct pmap *)ptp->uobject; 628 KASSERT(pmap != NULL); 629 KASSERT(&pmap->pm_obj[0] == ptp->uobject); 630 return pmap; 631 } 632 633 static inline struct pv_pte * 634 pve_to_pvpte(struct pv_entry *pve) 635 { 636 637 KASSERT((void *)&pve->pve_pte == (void *)pve); 638 return &pve->pve_pte; 639 } 640 641 static inline struct pv_entry * 642 pvpte_to_pve(struct pv_pte *pvpte) 643 { 644 struct pv_entry *pve = (void *)pvpte; 645 646 KASSERT(pve_to_pvpte(pve) == pvpte); 647 return pve; 648 } 649 650 /* 651 * pv_pte_first, pv_pte_next: PV list iterator. 652 */ 653 654 static struct pv_pte * 655 pv_pte_first(struct pmap_page *pp) 656 { 657 658 if ((pp->pp_flags & PP_EMBEDDED) != 0) { 659 return &pp->pp_pte; 660 } 661 return pve_to_pvpte(LIST_FIRST(&pp->pp_head.pvh_list)); 662 } 663 664 static struct pv_pte * 665 pv_pte_next(struct pmap_page *pp, struct pv_pte *pvpte) 666 { 667 668 KASSERT(pvpte != NULL); 669 if (pvpte == &pp->pp_pte) { 670 KASSERT((pp->pp_flags & PP_EMBEDDED) != 0); 671 return NULL; 672 } 673 KASSERT((pp->pp_flags & PP_EMBEDDED) == 0); 674 return pve_to_pvpte(LIST_NEXT(pvpte_to_pve(pvpte), pve_list)); 675 } 676 677 /* 678 * pmap_is_curpmap: is this pmap the one currently loaded [in %cr3]? 679 * of course the kernel is always loaded 680 */ 681 682 bool 683 pmap_is_curpmap(struct pmap *pmap) 684 { 685 return((pmap == pmap_kernel()) || 686 (pmap == curcpu()->ci_pmap)); 687 } 688 689 /* 690 * Add a reference to the specified pmap. 691 */ 692 693 void 694 pmap_reference(struct pmap *pmap) 695 { 696 697 atomic_inc_uint(&pmap->pm_obj[0].uo_refs); 698 } 699 700 /* 701 * pmap_map_ptes: map a pmap's PTEs into KVM and lock them in 702 * 703 * there are several pmaps involved. some or all of them might be same. 704 * 705 * - the pmap given by the first argument 706 * our caller wants to access this pmap's PTEs. 707 * 708 * - pmap_kernel() 709 * the kernel pmap. note that it only contains the kernel part 710 * of the address space which is shared by any pmap. ie. any 711 * pmap can be used instead of pmap_kernel() for our purpose. 712 * 713 * - ci->ci_pmap 714 * pmap currently loaded on the cpu. 715 * 716 * - vm_map_pmap(&curproc->p_vmspace->vm_map) 717 * current process' pmap. 718 * 719 * => we lock enough pmaps to keep things locked in 720 * => must be undone with pmap_unmap_ptes before returning 721 */ 722 723 void 724 pmap_map_ptes(struct pmap *pmap, struct pmap **pmap2, 725 pd_entry_t **ptepp, pd_entry_t * const **pdeppp) 726 { 727 struct pmap *curpmap; 728 struct cpu_info *ci; 729 lwp_t *l; 730 731 /* The kernel's pmap is always accessible. */ 732 if (pmap == pmap_kernel()) { 733 *pmap2 = NULL; 734 *ptepp = PTE_BASE; 735 *pdeppp = normal_pdes; 736 return; 737 } 738 KASSERT(kpreempt_disabled()); 739 740 l = curlwp; 741 retry: 742 mutex_enter(pmap->pm_lock); 743 ci = curcpu(); 744 curpmap = ci->ci_pmap; 745 if (vm_map_pmap(&l->l_proc->p_vmspace->vm_map) == pmap) { 746 /* Our own pmap so just load it: easy. 
*/ 747 if (__predict_false(ci->ci_want_pmapload)) { 748 mutex_exit(pmap->pm_lock); 749 pmap_load(); 750 goto retry; 751 } 752 KASSERT(pmap == curpmap); 753 } else if (pmap == curpmap) { 754 /* 755 * Already on the CPU: make it valid. This is very 756 * often the case during exit(), when we have switched 757 * to the kernel pmap in order to destroy a user pmap. 758 */ 759 if (!pmap_reactivate(pmap)) { 760 u_int gen = uvm_emap_gen_return(); 761 tlbflush(); 762 uvm_emap_update(gen); 763 } 764 } else { 765 /* 766 * Toss current pmap from CPU, but keep a reference to it. 767 * The reference will be dropped by pmap_unmap_ptes(). 768 * Can happen if we block during exit(). 769 */ 770 const cpuid_t cid = cpu_index(ci); 771 772 kcpuset_atomic_clear(curpmap->pm_cpus, cid); 773 kcpuset_atomic_clear(curpmap->pm_kernel_cpus, cid); 774 ci->ci_pmap = pmap; 775 ci->ci_tlbstate = TLBSTATE_VALID; 776 kcpuset_atomic_set(pmap->pm_cpus, cid); 777 kcpuset_atomic_set(pmap->pm_kernel_cpus, cid); 778 cpu_load_pmap(pmap, curpmap); 779 } 780 pmap->pm_ncsw = l->l_ncsw; 781 *pmap2 = curpmap; 782 *ptepp = PTE_BASE; 783 #if defined(XEN) && defined(__x86_64__) 784 KASSERT(ci->ci_normal_pdes[PTP_LEVELS - 2] == L4_BASE); 785 ci->ci_normal_pdes[PTP_LEVELS - 2] = pmap->pm_pdir; 786 *pdeppp = ci->ci_normal_pdes; 787 #else /* XEN && __x86_64__ */ 788 *pdeppp = normal_pdes; 789 #endif /* XEN && __x86_64__ */ 790 } 791 792 /* 793 * pmap_unmap_ptes: unlock the PTE mapping of "pmap" 794 */ 795 796 void 797 pmap_unmap_ptes(struct pmap *pmap, struct pmap *pmap2) 798 { 799 struct cpu_info *ci; 800 struct pmap *mypmap; 801 802 KASSERT(kpreempt_disabled()); 803 804 /* The kernel's pmap is always accessible. */ 805 if (pmap == pmap_kernel()) { 806 return; 807 } 808 809 ci = curcpu(); 810 #if defined(XEN) && defined(__x86_64__) 811 /* Reset per-cpu normal_pdes */ 812 KASSERT(ci->ci_normal_pdes[PTP_LEVELS - 2] != L4_BASE); 813 ci->ci_normal_pdes[PTP_LEVELS - 2] = L4_BASE; 814 #endif /* XEN && __x86_64__ */ 815 /* 816 * We cannot tolerate context switches while mapped in. 817 * If it is our own pmap all we have to do is unlock. 818 */ 819 KASSERT(pmap->pm_ncsw == curlwp->l_ncsw); 820 mypmap = vm_map_pmap(&curproc->p_vmspace->vm_map); 821 if (pmap == mypmap) { 822 mutex_exit(pmap->pm_lock); 823 return; 824 } 825 826 /* 827 * Mark whatever's on the CPU now as lazy and unlock. 828 * If the pmap was already installed, we are done. 829 */ 830 ci->ci_tlbstate = TLBSTATE_LAZY; 831 ci->ci_want_pmapload = (mypmap != pmap_kernel()); 832 mutex_exit(pmap->pm_lock); 833 if (pmap == pmap2) { 834 return; 835 } 836 837 /* 838 * We installed another pmap on the CPU. Grab a reference to 839 * it and leave in place. Toss the evicted pmap (can block). 840 */ 841 pmap_reference(pmap); 842 pmap_destroy(pmap2); 843 } 844 845 846 inline static void 847 pmap_exec_account(struct pmap *pm, vaddr_t va, pt_entry_t opte, pt_entry_t npte) 848 { 849 850 #if !defined(__x86_64__) 851 if (curproc == NULL || curproc->p_vmspace == NULL || 852 pm != vm_map_pmap(&curproc->p_vmspace->vm_map)) 853 return; 854 855 if ((opte ^ npte) & PG_X) 856 pmap_update_pg(va); 857 858 /* 859 * Executability was removed on the last executable change. 860 * Reset the code segment to something conservative and 861 * let the trap handler deal with setting the right limit. 862 * We can't do that because of locking constraints on the vm map. 
863 */ 864 865 if ((opte & PG_X) && (npte & PG_X) == 0 && va == pm->pm_hiexec) { 866 struct trapframe *tf = curlwp->l_md.md_regs; 867 868 tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL); 869 pm->pm_hiexec = I386_MAX_EXE_ADDR; 870 } 871 #endif /* !defined(__x86_64__) */ 872 } 873 874 #if !defined(__x86_64__) 875 /* 876 * Fixup the code segment to cover all potential executable mappings. 877 * returns 0 if no changes to the code segment were made. 878 */ 879 880 int 881 pmap_exec_fixup(struct vm_map *map, struct trapframe *tf, struct pcb *pcb) 882 { 883 struct vm_map_entry *ent; 884 struct pmap *pm = vm_map_pmap(map); 885 vaddr_t va = 0; 886 887 vm_map_lock_read(map); 888 for (ent = (&map->header)->next; ent != &map->header; ent = ent->next) { 889 890 /* 891 * This entry has greater va than the entries before. 892 * We need to make it point to the last page, not past it. 893 */ 894 895 if (ent->protection & VM_PROT_EXECUTE) 896 va = trunc_page(ent->end) - PAGE_SIZE; 897 } 898 vm_map_unlock_read(map); 899 if (va == pm->pm_hiexec && tf->tf_cs == GSEL(GUCODEBIG_SEL, SEL_UPL)) 900 return (0); 901 902 pm->pm_hiexec = va; 903 if (pm->pm_hiexec > I386_MAX_EXE_ADDR) { 904 tf->tf_cs = GSEL(GUCODEBIG_SEL, SEL_UPL); 905 } else { 906 tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL); 907 return (0); 908 } 909 return (1); 910 } 911 #endif /* !defined(__x86_64__) */ 912 913 void 914 pat_init(struct cpu_info *ci) 915 { 916 uint64_t pat; 917 918 if (!(ci->ci_feat_val[0] & CPUID_PAT)) 919 return; 920 921 /* We change WT to WC. Leave all other entries the default values. */ 922 pat = PATENTRY(0, PAT_WB) | PATENTRY(1, PAT_WC) | 923 PATENTRY(2, PAT_UCMINUS) | PATENTRY(3, PAT_UC) | 924 PATENTRY(4, PAT_WB) | PATENTRY(5, PAT_WC) | 925 PATENTRY(6, PAT_UCMINUS) | PATENTRY(7, PAT_UC); 926 927 wrmsr(MSR_CR_PAT, pat); 928 cpu_pat_enabled = true; 929 aprint_debug_dev(ci->ci_dev, "PAT enabled\n"); 930 } 931 932 static pt_entry_t 933 pmap_pat_flags(u_int flags) 934 { 935 u_int cacheflags = (flags & PMAP_CACHE_MASK); 936 937 if (!cpu_pat_enabled) { 938 switch (cacheflags) { 939 case PMAP_NOCACHE: 940 case PMAP_NOCACHE_OVR: 941 /* results in PGC_UCMINUS on cpus which have 942 * the cpuid PAT but PAT "disabled" 943 */ 944 return PG_N; 945 default: 946 return 0; 947 } 948 } 949 950 switch (cacheflags) { 951 case PMAP_NOCACHE: 952 return PGC_UC; 953 case PMAP_WRITE_COMBINE: 954 return PGC_WC; 955 case PMAP_WRITE_BACK: 956 return PGC_WB; 957 case PMAP_NOCACHE_OVR: 958 return PGC_UCMINUS; 959 } 960 961 return 0; 962 } 963 964 /* 965 * p m a p k e n t e r f u n c t i o n s 966 * 967 * functions to quickly enter/remove pages from the kernel address 968 * space. pmap_kremove is exported to MI kernel. we make use of 969 * the recursive PTE mappings. 
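 *
 * a minimal usage sketch (the caller here is hypothetical; real
 * callers are typically MD or bus code), mapping one page of device
 * memory whose physical address is "pa":
 *
 *	vaddr_t va = uvm_km_alloc(kernel_map, PAGE_SIZE, 0, UVM_KMF_VAONLY);
 *	pmap_kenter_pa(va, pa, VM_PROT_READ | VM_PROT_WRITE, 0);
 *	pmap_update(pmap_kernel());
 *	... use the mapping ...
 *	pmap_kremove(va, PAGE_SIZE);
 *	pmap_update(pmap_kernel());
 *	uvm_km_free(kernel_map, va, PAGE_SIZE, UVM_KMF_VAONLY);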
 */

/*
 * pmap_kenter_pa: enter a kernel mapping without R/M (pv_entry) tracking
 *
 * => no need to lock anything, assume va is already allocated
 * => should be faster than normal pmap enter function
 */

void
pmap_kenter_pa(vaddr_t va, paddr_t pa, vm_prot_t prot, u_int flags)
{
	pt_entry_t *pte, opte, npte;

	KASSERT(!(prot & ~VM_PROT_ALL));

	if (va < VM_MIN_KERNEL_ADDRESS)
		pte = vtopte(va);
	else
		pte = kvtopte(va);
#ifdef DOM0OPS
	if (pa < pmap_pa_start || pa >= pmap_pa_end) {
#ifdef DEBUG
		printf_nolog("%s: pa 0x%" PRIx64 " for va 0x%" PRIx64
		    " outside range\n", __func__, (int64_t)pa, (int64_t)va);
#endif /* DEBUG */
		npte = pa;
	} else
#endif /* DOM0OPS */
		npte = pmap_pa2pte(pa);
	npte |= protection_codes[prot] | PG_k | PG_V | pmap_pg_g;
	npte |= pmap_pat_flags(flags);
	opte = pmap_pte_testset(pte, npte); /* zap! */
#if defined(DIAGNOSTIC)
	/* XXX For now... */
	if (opte & PG_PS)
		panic("%s: PG_PS", __func__);
#endif
	if ((opte & (PG_V | PG_U)) == (PG_V | PG_U)) {
		/* This should not happen. */
		printf_nolog("%s: mapping already present\n", __func__);
		kpreempt_disable();
		pmap_tlb_shootdown(pmap_kernel(), va, opte, TLBSHOOT_KENTER);
		kpreempt_enable();
	}
}

void
pmap_emap_enter(vaddr_t va, paddr_t pa, vm_prot_t prot)
{
	pt_entry_t *pte, npte;

	KASSERT((prot & ~VM_PROT_ALL) == 0);
	pte = (va < VM_MIN_KERNEL_ADDRESS) ? vtopte(va) : kvtopte(va);

#ifdef DOM0OPS
	if (pa < pmap_pa_start || pa >= pmap_pa_end) {
		npte = pa;
	} else
#endif
		npte = pmap_pa2pte(pa);

	npte |= protection_codes[prot] | PG_k | PG_V;
	pmap_pte_set(pte, npte);
}

/*
 * pmap_emap_sync: perform TLB flush or pmap load, if it was deferred.
 */
void
pmap_emap_sync(bool canload)
{
	struct cpu_info *ci = curcpu();
	struct pmap *pmap;

	KASSERT(kpreempt_disabled());
	if (__predict_true(ci->ci_want_pmapload && canload)) {
		/*
		 * XXX: Hint for pmap_reactivate(), which might suggest to
		 * not perform TLB flush, if state has not changed.
		 */
		pmap = vm_map_pmap(&curlwp->l_proc->p_vmspace->vm_map);
		if (__predict_false(pmap == ci->ci_pmap)) {
			kcpuset_atomic_clear(pmap->pm_cpus, cpu_index(ci));
		}
		pmap_load();
		KASSERT(ci->ci_want_pmapload == 0);
	} else {
		tlbflush();
	}
}

void
pmap_emap_remove(vaddr_t sva, vsize_t len)
{
	pt_entry_t *pte;
	vaddr_t va, eva = sva + len;

	for (va = sva; va < eva; va += PAGE_SIZE) {
		pte = (va < VM_MIN_KERNEL_ADDRESS) ? vtopte(va) : kvtopte(va);
		pmap_pte_set(pte, 0);
	}
}

__strict_weak_alias(pmap_kenter_ma, pmap_kenter_pa);

#if defined(__x86_64__)
/*
 * Change protection for a virtual address. Local for a CPU only, don't
 * care about TLB shootdowns.
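 *
 * A hypothetical use (not taken from a real caller): temporarily
 * allowing writes to a read-only kernel page on this CPU only,
 *
 *	kpreempt_disable();
 *	pmap_changeprot_local(va, VM_PROT_READ | VM_PROT_WRITE);
 *	... modify the page ...
 *	pmap_changeprot_local(va, VM_PROT_READ);
 *	kpreempt_enable();
 *
 * No TLB shootdown is sent, so other CPUs keep their old view of
 * the mapping.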
1082 * 1083 * => must be called with preemption disabled 1084 */ 1085 void 1086 pmap_changeprot_local(vaddr_t va, vm_prot_t prot) 1087 { 1088 pt_entry_t *pte, opte, npte; 1089 1090 KASSERT(kpreempt_disabled()); 1091 1092 if (va < VM_MIN_KERNEL_ADDRESS) 1093 pte = vtopte(va); 1094 else 1095 pte = kvtopte(va); 1096 1097 npte = opte = *pte; 1098 1099 if ((prot & VM_PROT_WRITE) != 0) 1100 npte |= PG_RW; 1101 else 1102 npte &= ~PG_RW; 1103 1104 if (opte != npte) { 1105 pmap_pte_set(pte, npte); 1106 pmap_pte_flush(); 1107 invlpg(va); 1108 } 1109 } 1110 #endif /* defined(__x86_64__) */ 1111 1112 /* 1113 * pmap_kremove: remove a kernel mapping(s) without R/M (pv_entry) tracking 1114 * 1115 * => no need to lock anything 1116 * => caller must dispose of any vm_page mapped in the va range 1117 * => note: not an inline function 1118 * => we assume the va is page aligned and the len is a multiple of PAGE_SIZE 1119 * => we assume kernel only unmaps valid addresses and thus don't bother 1120 * checking the valid bit before doing TLB flushing 1121 * => must be followed by call to pmap_update() before reuse of page 1122 */ 1123 1124 static inline void 1125 pmap_kremove1(vaddr_t sva, vsize_t len, bool localonly) 1126 { 1127 pt_entry_t *pte, opte; 1128 vaddr_t va, eva; 1129 1130 eva = sva + len; 1131 1132 kpreempt_disable(); 1133 for (va = sva; va < eva; va += PAGE_SIZE) { 1134 pte = kvtopte(va); 1135 opte = pmap_pte_testset(pte, 0); /* zap! */ 1136 if ((opte & (PG_V | PG_U)) == (PG_V | PG_U) && !localonly) { 1137 pmap_tlb_shootdown(pmap_kernel(), va, opte, 1138 TLBSHOOT_KREMOVE); 1139 } 1140 KASSERT((opte & PG_PS) == 0); 1141 KASSERT((opte & PG_PVLIST) == 0); 1142 } 1143 if (localonly) { 1144 tlbflushg(); 1145 } 1146 kpreempt_enable(); 1147 } 1148 1149 void 1150 pmap_kremove(vaddr_t sva, vsize_t len) 1151 { 1152 1153 pmap_kremove1(sva, len, false); 1154 } 1155 1156 /* 1157 * pmap_kremove_local: like pmap_kremove(), but only worry about 1158 * TLB invalidations on the current CPU. this is only intended 1159 * for use while writing kernel crash dumps. 1160 */ 1161 1162 void 1163 pmap_kremove_local(vaddr_t sva, vsize_t len) 1164 { 1165 1166 KASSERT(panicstr != NULL); 1167 pmap_kremove1(sva, len, true); 1168 } 1169 1170 /* 1171 * p m a p i n i t f u n c t i o n s 1172 * 1173 * pmap_bootstrap and pmap_init are called during system startup 1174 * to init the pmap module. pmap_bootstrap() does a low level 1175 * init just to get things rolling. pmap_init() finishes the job. 1176 */ 1177 1178 /* 1179 * pmap_bootstrap: get the system in a state where it can run with VM 1180 * properly enabled (called before main()). the VM system is 1181 * fully init'd later... 1182 * 1183 * => on i386, locore.s has already enabled the MMU by allocating 1184 * a PDP for the kernel, and nkpde PTP's for the kernel. 1185 * => kva_start is the first free virtual address in kernel space 1186 */ 1187 1188 void 1189 pmap_bootstrap(vaddr_t kva_start) 1190 { 1191 struct pmap *kpm; 1192 pt_entry_t *pte; 1193 int i; 1194 vaddr_t kva; 1195 #ifndef XEN 1196 pd_entry_t *pde; 1197 unsigned long p1i; 1198 vaddr_t kva_end; 1199 #endif 1200 #ifdef __HAVE_DIRECT_MAP 1201 phys_ram_seg_t *mc; 1202 long ndmpdp; 1203 paddr_t lastpa, dmpd, dmpdp, pdp; 1204 vaddr_t tmpva; 1205 #endif 1206 1207 pt_entry_t pg_nx = (cpu_feature[2] & CPUID_NOX ? 
PG_NX : 0); 1208 1209 /* 1210 * set up our local static global vars that keep track of the 1211 * usage of KVM before kernel_map is set up 1212 */ 1213 1214 virtual_avail = kva_start; /* first free KVA */ 1215 virtual_end = VM_MAX_KERNEL_ADDRESS; /* last KVA */ 1216 1217 /* 1218 * set up protection_codes: we need to be able to convert from 1219 * a MI protection code (some combo of VM_PROT...) to something 1220 * we can jam into a i386 PTE. 1221 */ 1222 1223 protection_codes[VM_PROT_NONE] = pg_nx; /* --- */ 1224 protection_codes[VM_PROT_EXECUTE] = PG_RO | PG_X; /* --x */ 1225 protection_codes[VM_PROT_READ] = PG_RO | pg_nx; /* -r- */ 1226 protection_codes[VM_PROT_READ|VM_PROT_EXECUTE] = PG_RO | PG_X;/* -rx */ 1227 protection_codes[VM_PROT_WRITE] = PG_RW | pg_nx; /* w-- */ 1228 protection_codes[VM_PROT_WRITE|VM_PROT_EXECUTE] = PG_RW | PG_X;/* w-x */ 1229 protection_codes[VM_PROT_WRITE|VM_PROT_READ] = PG_RW | pg_nx; 1230 /* wr- */ 1231 protection_codes[VM_PROT_ALL] = PG_RW | PG_X; /* wrx */ 1232 1233 /* 1234 * now we init the kernel's pmap 1235 * 1236 * the kernel pmap's pm_obj is not used for much. however, in 1237 * user pmaps the pm_obj contains the list of active PTPs. 1238 * the pm_obj currently does not have a pager. it might be possible 1239 * to add a pager that would allow a process to read-only mmap its 1240 * own page tables (fast user level vtophys?). this may or may not 1241 * be useful. 1242 */ 1243 1244 kpm = pmap_kernel(); 1245 for (i = 0; i < PTP_LEVELS - 1; i++) { 1246 mutex_init(&kpm->pm_obj_lock[i], MUTEX_DEFAULT, IPL_NONE); 1247 uvm_obj_init(&kpm->pm_obj[i], NULL, false, 1); 1248 uvm_obj_setlock(&kpm->pm_obj[i], &kpm->pm_obj_lock[i]); 1249 kpm->pm_ptphint[i] = NULL; 1250 } 1251 memset(&kpm->pm_list, 0, sizeof(kpm->pm_list)); /* pm_list not used */ 1252 1253 kpm->pm_pdir = (pd_entry_t *)(PDPpaddr + KERNBASE); 1254 for (i = 0; i < PDP_SIZE; i++) 1255 kpm->pm_pdirpa[i] = PDPpaddr + PAGE_SIZE * i; 1256 1257 kpm->pm_stats.wired_count = kpm->pm_stats.resident_count = 1258 x86_btop(kva_start - VM_MIN_KERNEL_ADDRESS); 1259 1260 kcpuset_create(&kpm->pm_cpus, true); 1261 kcpuset_create(&kpm->pm_kernel_cpus, true); 1262 1263 /* 1264 * the above is just a rough estimate and not critical to the proper 1265 * operation of the system. 1266 */ 1267 1268 #ifndef XEN 1269 /* 1270 * Begin to enable global TLB entries if they are supported. 1271 * The G bit has no effect until the CR4_PGE bit is set in CR4, 1272 * which happens in cpu_init(), which is run on each cpu 1273 * (and happens later) 1274 */ 1275 if (cpu_feature[0] & CPUID_PGE) { 1276 pmap_pg_g = PG_G; /* enable software */ 1277 1278 /* add PG_G attribute to already mapped kernel pages */ 1279 if (KERNBASE == VM_MIN_KERNEL_ADDRESS) { 1280 kva_end = virtual_avail; 1281 } else { 1282 extern vaddr_t eblob, esym; 1283 kva_end = (vaddr_t)&end; 1284 if (esym > kva_end) 1285 kva_end = esym; 1286 if (eblob > kva_end) 1287 kva_end = eblob; 1288 kva_end = roundup(kva_end, PAGE_SIZE); 1289 } 1290 for (kva = KERNBASE; kva < kva_end; kva += PAGE_SIZE) { 1291 p1i = pl1_i(kva); 1292 if (pmap_valid_entry(PTE_BASE[p1i])) 1293 PTE_BASE[p1i] |= PG_G; 1294 } 1295 } 1296 1297 /* 1298 * Enable large pages if they are supported. 
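	 * When they are, the kernel text/rodata/data segments below are
	 * remapped with 2MB (NBPD_L2) PDEs wherever a whole 2MB chunk
	 * fits, and the remainder stays on 4KB PTEs. As a worked example
	 * (the sizes are hypothetical): a 9MB text segment would end up
	 * covered by four 2MB large pages plus 4KB mappings for the
	 * trailing 1MB.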
1299 */ 1300 if (cpu_feature[0] & CPUID_PSE) { 1301 paddr_t pa; 1302 extern char __rodata_start; 1303 extern char __data_start; 1304 extern char __kernel_end; 1305 1306 lcr4(rcr4() | CR4_PSE); /* enable hardware (via %cr4) */ 1307 pmap_largepages = 1; /* enable software */ 1308 1309 /* 1310 * The TLB must be flushed after enabling large pages 1311 * on Pentium CPUs, according to section 3.6.2.2 of 1312 * "Intel Architecture Software Developer's Manual, 1313 * Volume 3: System Programming". 1314 */ 1315 tlbflushg(); 1316 1317 /* 1318 * Now, we remap several kernel segments with large pages. We 1319 * cover as many pages as we can. 1320 */ 1321 1322 /* Remap the kernel text using large pages. */ 1323 kva = KERNBASE; 1324 kva_end = rounddown((vaddr_t)&__rodata_start, NBPD_L1); 1325 pa = kva - KERNBASE; 1326 for (/* */; kva + NBPD_L2 <= kva_end; kva += NBPD_L2, 1327 pa += NBPD_L2) { 1328 pde = &L2_BASE[pl2_i(kva)]; 1329 *pde = pa | pmap_pg_g | PG_PS | PG_KR | PG_V; 1330 tlbflushg(); 1331 } 1332 #if defined(DEBUG) 1333 aprint_normal("kernel text is mapped with %" PRIuPSIZE " large " 1334 "pages and %" PRIuPSIZE " normal pages\n", 1335 howmany(kva - KERNBASE, NBPD_L2), 1336 howmany((vaddr_t)&__rodata_start - kva, NBPD_L1)); 1337 #endif /* defined(DEBUG) */ 1338 1339 /* Remap the kernel rodata using large pages. */ 1340 kva = roundup((vaddr_t)&__rodata_start, NBPD_L2); 1341 kva_end = rounddown((vaddr_t)&__data_start, NBPD_L1); 1342 pa = kva - KERNBASE; 1343 for (/* */; kva + NBPD_L2 <= kva_end; kva += NBPD_L2, 1344 pa += NBPD_L2) { 1345 pde = &L2_BASE[pl2_i(kva)]; 1346 *pde = pa | pmap_pg_g | PG_PS | pg_nx | PG_KR | PG_V; 1347 tlbflushg(); 1348 } 1349 1350 /* Remap the kernel data+bss using large pages. */ 1351 kva = roundup((vaddr_t)&__data_start, NBPD_L2); 1352 kva_end = rounddown((vaddr_t)&__kernel_end, NBPD_L1); 1353 pa = kva - KERNBASE; 1354 for (/* */; kva + NBPD_L2 <= kva_end; kva += NBPD_L2, 1355 pa += NBPD_L2) { 1356 pde = &L2_BASE[pl2_i(kva)]; 1357 *pde = pa | pmap_pg_g | PG_PS | pg_nx | PG_KW | PG_V; 1358 tlbflushg(); 1359 } 1360 } 1361 #endif /* !XEN */ 1362 1363 #ifdef __HAVE_DIRECT_MAP 1364 1365 tmpva = (KERNBASE + NKL2_KIMG_ENTRIES * NBPD_L2); 1366 pte = PTE_BASE + pl1_i(tmpva); 1367 1368 /* 1369 * Map the direct map RW. Use 1GB pages if they are available, 1370 * otherwise use 2MB pages. Note that the unused parts of 1371 * PTPs * must be zero outed, as they might be accessed due 1372 * to speculative execution. Also, PG_G is not allowed on 1373 * non-leaf PTPs. 
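	 *
	 * A worked sizing example (hypothetical machine with 16GB of
	 * RAM, so lastpa is 16GB): ndmpdp = howmany(lastpa, NBPD_L3)
	 * = 16 L3 slots. With 1GB pages each slot becomes a single
	 * leaf PDPTE; without them, 16 additional PTP pages are stolen
	 * from avail_start and filled with 2MB PDEs (NPDPG entries per
	 * page).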
1374 */ 1375 1376 lastpa = 0; 1377 for (i = 0; i < mem_cluster_cnt; i++) { 1378 mc = &mem_clusters[i]; 1379 lastpa = MAX(lastpa, mc->start + mc->size); 1380 } 1381 1382 ndmpdp = (lastpa + NBPD_L3 - 1) >> L3_SHIFT; 1383 dmpdp = avail_start; avail_start += PAGE_SIZE; 1384 1385 *pte = dmpdp | PG_V | PG_RW | pg_nx; 1386 pmap_update_pg(tmpva); 1387 memset((void *)tmpva, 0, PAGE_SIZE); 1388 1389 if (cpu_feature[2] & CPUID_P1GB) { 1390 for (i = 0; i < ndmpdp; i++) { 1391 pdp = (paddr_t)&(((pd_entry_t *)dmpdp)[i]); 1392 *pte = (pdp & PG_FRAME) | PG_V | PG_RW | pg_nx; 1393 pmap_update_pg(tmpva); 1394 1395 pde = (pd_entry_t *)(tmpva + (pdp & ~PG_FRAME)); 1396 *pde = ((paddr_t)i << L3_SHIFT) | PG_RW | pg_nx | 1397 PG_V | PG_U | PG_PS | PG_G; 1398 } 1399 } else { 1400 dmpd = avail_start; avail_start += ndmpdp * PAGE_SIZE; 1401 1402 for (i = 0; i < ndmpdp; i++) { 1403 pdp = dmpd + i * PAGE_SIZE; 1404 *pte = (pdp & PG_FRAME) | PG_V | PG_RW | pg_nx; 1405 pmap_update_pg(tmpva); 1406 1407 memset((void *)tmpva, 0, PAGE_SIZE); 1408 } 1409 for (i = 0; i < NPDPG * ndmpdp; i++) { 1410 pdp = (paddr_t)&(((pd_entry_t *)dmpd)[i]); 1411 *pte = (pdp & PG_FRAME) | PG_V | PG_RW | pg_nx; 1412 pmap_update_pg(tmpva); 1413 1414 pde = (pd_entry_t *)(tmpva + (pdp & ~PG_FRAME)); 1415 *pde = ((paddr_t)i << L2_SHIFT) | PG_RW | pg_nx | 1416 PG_V | PG_U | PG_PS | PG_G; 1417 } 1418 for (i = 0; i < ndmpdp; i++) { 1419 pdp = (paddr_t)&(((pd_entry_t *)dmpdp)[i]); 1420 *pte = (pdp & PG_FRAME) | PG_V | PG_RW | pg_nx; 1421 pmap_update_pg((vaddr_t)tmpva); 1422 1423 pde = (pd_entry_t *)(tmpva + (pdp & ~PG_FRAME)); 1424 *pde = (dmpd + (i << PAGE_SHIFT)) | PG_RW | pg_nx | 1425 PG_V | PG_U; 1426 } 1427 } 1428 1429 kpm->pm_pdir[PDIR_SLOT_DIRECT] = dmpdp | PG_KW | pg_nx | PG_V | PG_U; 1430 1431 tlbflush(); 1432 1433 #else 1434 if (VM_MIN_KERNEL_ADDRESS != KERNBASE) { 1435 /* 1436 * zero_pte is stuck at the end of mapped space for the kernel 1437 * image (disjunct from kva space). This is done so that it 1438 * can safely be used in pmap_growkernel (pmap_get_physpage), 1439 * when it's called for the first time. 1440 * XXXfvdl fix this for MULTIPROCESSOR later. 1441 */ 1442 #ifdef XEN 1443 /* early_zerop initialized in xen_pmap_bootstrap() */ 1444 #else 1445 early_zerop = (void *)(KERNBASE + NKL2_KIMG_ENTRIES * NBPD_L2); 1446 #endif 1447 early_zero_pte = PTE_BASE + pl1_i((vaddr_t)early_zerop); 1448 } 1449 1450 /* 1451 * now we allocate the "special" VAs which are used for tmp mappings 1452 * by the pmap (and other modules). we allocate the VAs by advancing 1453 * virtual_avail (note that there are no pages mapped at these VAs). 1454 * we find the PTE that maps the allocated VA via the linear PTE 1455 * mapping. 1456 */ 1457 1458 pte = PTE_BASE + pl1_i(virtual_avail); 1459 1460 #ifdef MULTIPROCESSOR 1461 /* 1462 * Waste some VA space to avoid false sharing of cache lines 1463 * for page table pages: Give each possible CPU a cache line 1464 * of PTE's (8) to play with, though we only need 4. We could 1465 * recycle some of this waste by putting the idle stacks here 1466 * as well; we could waste less space if we knew the largest 1467 * CPU ID beforehand. 
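	 * As a rough example, assuming NPTECL is 8 and maxcpus is 64,
	 * this reserves 64 * 8 = 512 PTEs and 512 pages (2MB) of KVA
	 * for the csrc/cdst/zero/ptp special mappings, of which each
	 * CPU actually uses only the first four slots of its own
	 * cache line.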
 */
	csrcp = (char *) virtual_avail;  csrc_pte = pte;

	cdstp = (char *) virtual_avail+PAGE_SIZE;  cdst_pte = pte+1;

	zerop = (char *) virtual_avail+PAGE_SIZE*2;  zero_pte = pte+2;

	ptpp = (char *) virtual_avail+PAGE_SIZE*3;  ptp_pte = pte+3;

	virtual_avail += PAGE_SIZE * maxcpus * NPTECL;
	pte += maxcpus * NPTECL;
#else
	csrcp = (void *) virtual_avail;  csrc_pte = pte;	/* allocate */
	virtual_avail += PAGE_SIZE; pte++;			/* advance */

	cdstp = (void *) virtual_avail;  cdst_pte = pte;
	virtual_avail += PAGE_SIZE; pte++;

	zerop = (void *) virtual_avail;  zero_pte = pte;
	virtual_avail += PAGE_SIZE; pte++;

	ptpp = (void *) virtual_avail;  ptp_pte = pte;
	virtual_avail += PAGE_SIZE; pte++;
#endif

	if (VM_MIN_KERNEL_ADDRESS == KERNBASE) {
		early_zerop = zerop;
		early_zero_pte = zero_pte;
	}
#endif

	/*
	 * Nothing after this point actually needs pte.
	 */
	pte = (void *)0xdeadbeef;

#ifdef XEN
#ifdef __x86_64__
	/*
	 * We want a dummy page directory for Xen: when a pmap is
	 * deactivated, Xen still considers it active. So we point the
	 * user PGD at this dummy page to lift all protection from the
	 * now-inactive set of page tables.
	 */
	xen_dummy_user_pgd = avail_start;
	avail_start += PAGE_SIZE;

	/* Zero-fill it; the fewer checks Xen has to make, the better. */
	memset((void *) (xen_dummy_user_pgd + KERNBASE), 0, PAGE_SIZE);
	/* Mark read-only */
	HYPERVISOR_update_va_mapping(xen_dummy_user_pgd + KERNBASE,
	    pmap_pa2pte(xen_dummy_user_pgd) | PG_u | PG_V, UVMF_INVLPG);
	/* Pin as L4 */
	xpq_queue_pin_l4_table(xpmap_ptom_masked(xen_dummy_user_pgd));
#endif /* __x86_64__ */
	idt_vaddr = virtual_avail;			/* don't need pte */
	idt_paddr = avail_start;			/* steal a page */
	/*
	 * Xen requires one more page, as we cannot store the GDT and
	 * the LDT on the same page.
	 */
	virtual_avail += 3 * PAGE_SIZE;
	avail_start += 3 * PAGE_SIZE;
#else /* XEN */
	idt_vaddr = virtual_avail;			/* don't need pte */
	idt_paddr = avail_start;			/* steal a page */
#if defined(__x86_64__)
	virtual_avail += 2 * PAGE_SIZE;
	avail_start += 2 * PAGE_SIZE;
#else /* defined(__x86_64__) */
	virtual_avail += PAGE_SIZE;
	avail_start += PAGE_SIZE;
	/* pentium f00f bug stuff */
	pentium_idt_vaddr = virtual_avail;		/* don't need pte */
	virtual_avail += PAGE_SIZE;
#endif /* defined(__x86_64__) */
#endif /* XEN */

#ifdef _LP64
	/*
	 * Grab a page below 4G for things that need it (i.e.
	 * having an initial %cr3 for the MP trampoline).
	 */
	lo32_vaddr = virtual_avail;
	virtual_avail += PAGE_SIZE;
	lo32_paddr = avail_start;
	avail_start += PAGE_SIZE;
#endif

	/*
	 * now we reserve some VM for mapping pages when doing a crash dump
	 */

	virtual_avail = reserve_dumppages(virtual_avail);

	/*
	 * init the static-global locks and global lists.
	 *
	 * => pventry::pvh_lock (initialized elsewhere) must also be
	 *    a spin lock, again at IPL_VM to prevent deadlock, and
	 *    again is never taken from interrupt context.
	 */

	mutex_init(&pmaps_lock, MUTEX_DEFAULT, IPL_NONE);
	LIST_INIT(&pmaps);

	/*
	 * ensure the TLB is sync'd with reality by flushing it...
	 */

	tlbflushg();

	/*
	 * calculate pmap_maxkvaddr from nkptp[].
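	 * i.e. pmap_maxkvaddr = VM_MIN_KERNEL_ADDRESS + the sum over the
	 * levels of nkptp[i] * nbpd[i]. As a worked example with
	 * hypothetical values: nkptp[1] == 16 on a port where nbpd[1]
	 * is 2MB contributes 32MB of already-mapped KVA to that sum.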
1582 */ 1583 1584 kva = VM_MIN_KERNEL_ADDRESS; 1585 for (i = PTP_LEVELS - 1; i >= 1; i--) { 1586 kva += nkptp[i] * nbpd[i]; 1587 } 1588 pmap_maxkvaddr = kva; 1589 } 1590 1591 #if defined(__x86_64__) 1592 /* 1593 * Pre-allocate PTPs for low memory, so that 1:1 mappings for various 1594 * trampoline code can be entered. 1595 */ 1596 void 1597 pmap_prealloc_lowmem_ptps(void) 1598 { 1599 int level; 1600 paddr_t newp; 1601 pd_entry_t *pdes; 1602 1603 const pd_entry_t pteflags = PG_k | PG_V | PG_RW; 1604 1605 pdes = pmap_kernel()->pm_pdir; 1606 level = PTP_LEVELS; 1607 for (;;) { 1608 newp = avail_start; 1609 avail_start += PAGE_SIZE; 1610 #ifdef __HAVE_DIRECT_MAP 1611 memset((void *)PMAP_DIRECT_MAP(newp), 0, PAGE_SIZE); 1612 #else 1613 pmap_pte_set(early_zero_pte, pmap_pa2pte(newp) | pteflags); 1614 pmap_pte_flush(); 1615 pmap_update_pg((vaddr_t)early_zerop); 1616 memset(early_zerop, 0, PAGE_SIZE); 1617 #endif 1618 1619 #ifdef XEN 1620 /* Mark R/O before installing */ 1621 HYPERVISOR_update_va_mapping ((vaddr_t)early_zerop, 1622 xpmap_ptom_masked(newp) | PG_u | PG_V, UVMF_INVLPG); 1623 if (newp < (NKL2_KIMG_ENTRIES * NBPD_L2)) 1624 HYPERVISOR_update_va_mapping (newp + KERNBASE, 1625 xpmap_ptom_masked(newp) | PG_u | PG_V, UVMF_INVLPG); 1626 1627 1628 if (level == PTP_LEVELS) { /* Top level pde is per-cpu */ 1629 pd_entry_t *kpm_pdir; 1630 /* Reach it via recursive mapping */ 1631 kpm_pdir = normal_pdes[PTP_LEVELS - 2]; 1632 1633 /* Set it as usual. We can't defer this 1634 * outside the loop since recursive 1635 * pte entries won't be accessible during 1636 * further iterations at lower levels 1637 * otherwise. 1638 */ 1639 pmap_pte_set(&kpm_pdir[pl_i(0, PTP_LEVELS)], 1640 pmap_pa2pte(newp) | pteflags); 1641 } 1642 1643 #endif /* XEN */ 1644 pmap_pte_set(&pdes[pl_i(0, level)], 1645 pmap_pa2pte(newp) | pteflags); 1646 1647 pmap_pte_flush(); 1648 1649 level--; 1650 if (level <= 1) 1651 break; 1652 pdes = normal_pdes[level - 2]; 1653 } 1654 } 1655 #endif /* defined(__x86_64__) */ 1656 1657 /* 1658 * pmap_init: called from uvm_init, our job is to get the pmap 1659 * system ready to manage mappings... 1660 */ 1661 1662 void 1663 pmap_init(void) 1664 { 1665 int i, flags; 1666 1667 for (i = 0; i < PV_HASH_SIZE; i++) { 1668 SLIST_INIT(&pv_hash_heads[i].hh_list); 1669 } 1670 for (i = 0; i < PV_HASH_LOCK_CNT; i++) { 1671 mutex_init(&pv_hash_locks[i].lock, MUTEX_NODEBUG, IPL_VM); 1672 } 1673 1674 /* 1675 * initialize caches. 1676 */ 1677 1678 pool_cache_bootstrap(&pmap_cache, sizeof(struct pmap), 0, 0, 0, 1679 "pmappl", NULL, IPL_NONE, NULL, NULL, NULL); 1680 1681 #ifdef XEN 1682 /* 1683 * pool_cache(9) should not touch cached objects, since they 1684 * are pinned on xen and R/O for the domU 1685 */ 1686 flags = PR_NOTOUCH; 1687 #else /* XEN */ 1688 flags = 0; 1689 #endif /* XEN */ 1690 #ifdef PAE 1691 pool_cache_bootstrap(&pmap_pdp_cache, PAGE_SIZE * PDP_SIZE, 0, 0, flags, 1692 "pdppl", &pmap_pdp_allocator, IPL_NONE, 1693 pmap_pdp_ctor, pmap_pdp_dtor, NULL); 1694 #else /* PAE */ 1695 pool_cache_bootstrap(&pmap_pdp_cache, PAGE_SIZE, 0, 0, flags, 1696 "pdppl", NULL, IPL_NONE, pmap_pdp_ctor, pmap_pdp_dtor, NULL); 1697 #endif /* PAE */ 1698 pool_cache_bootstrap(&pmap_pv_cache, sizeof(struct pv_entry), 0, 0, 1699 PR_LARGECACHE, "pvpl", &pool_allocator_kmem, IPL_NONE, NULL, 1700 NULL, NULL); 1701 1702 pmap_tlb_init(); 1703 1704 /* XXX: Since cpu_hatch() is only for secondary CPUs. 
*/ 1705 pmap_tlb_cpu_init(curcpu()); 1706 1707 evcnt_attach_dynamic(&pmap_iobmp_evcnt, EVCNT_TYPE_MISC, 1708 NULL, "x86", "io bitmap copy"); 1709 evcnt_attach_dynamic(&pmap_ldt_evcnt, EVCNT_TYPE_MISC, 1710 NULL, "x86", "ldt sync"); 1711 1712 /* 1713 * done: pmap module is up (and ready for business) 1714 */ 1715 1716 pmap_initialized = true; 1717 } 1718 1719 /* 1720 * pmap_cpu_init_late: perform late per-CPU initialization. 1721 */ 1722 1723 #ifndef XEN 1724 void 1725 pmap_cpu_init_late(struct cpu_info *ci) 1726 { 1727 /* 1728 * The BP has already its own PD page allocated during early 1729 * MD startup. 1730 */ 1731 if (ci == &cpu_info_primary) 1732 return; 1733 1734 #ifdef PAE 1735 cpu_alloc_l3_page(ci); 1736 #endif 1737 } 1738 #endif 1739 1740 /* 1741 * p v _ e n t r y f u n c t i o n s 1742 */ 1743 1744 /* 1745 * pmap_free_pvs: free a list of pv_entrys 1746 */ 1747 1748 static void 1749 pmap_free_pvs(struct pv_entry *pve) 1750 { 1751 struct pv_entry *next; 1752 1753 for ( /* null */ ; pve != NULL ; pve = next) { 1754 next = pve->pve_next; 1755 pool_cache_put(&pmap_pv_cache, pve); 1756 } 1757 } 1758 1759 /* 1760 * main pv_entry manipulation functions: 1761 * pmap_enter_pv: enter a mapping onto a pv_head list 1762 * pmap_remove_pv: remove a mapping from a pv_head list 1763 * 1764 * NOTE: Both pmap_enter_pv and pmap_remove_pv expect the caller to lock 1765 * the pvh before calling 1766 */ 1767 1768 /* 1769 * insert_pv: a helper of pmap_enter_pv 1770 */ 1771 1772 static void 1773 insert_pv(struct pmap_page *pp, struct pv_entry *pve) 1774 { 1775 struct pv_hash_head *hh; 1776 kmutex_t *lock; 1777 u_int hash; 1778 1779 hash = pvhash_hash(pve->pve_pte.pte_ptp, pve->pve_pte.pte_va); 1780 lock = pvhash_lock(hash); 1781 hh = pvhash_head(hash); 1782 mutex_spin_enter(lock); 1783 SLIST_INSERT_HEAD(&hh->hh_list, pve, pve_hash); 1784 mutex_spin_exit(lock); 1785 1786 LIST_INSERT_HEAD(&pp->pp_head.pvh_list, pve, pve_list); 1787 } 1788 1789 /* 1790 * pmap_enter_pv: enter a mapping onto a pv_head lst 1791 * 1792 * => caller should adjust ptp's wire_count before calling 1793 */ 1794 1795 static struct pv_entry * 1796 pmap_enter_pv(struct pmap_page *pp, 1797 struct pv_entry *pve, /* preallocated pve for us to use */ 1798 struct pv_entry **sparepve, 1799 struct vm_page *ptp, 1800 vaddr_t va) 1801 { 1802 1803 KASSERT(ptp == NULL || ptp->wire_count >= 2); 1804 KASSERT(ptp == NULL || ptp->uobject != NULL); 1805 KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset); 1806 1807 if ((pp->pp_flags & PP_EMBEDDED) == 0) { 1808 if (LIST_EMPTY(&pp->pp_head.pvh_list)) { 1809 pp->pp_flags |= PP_EMBEDDED; 1810 pp->pp_pte.pte_ptp = ptp; 1811 pp->pp_pte.pte_va = va; 1812 1813 return pve; 1814 } 1815 } else { 1816 struct pv_entry *pve2; 1817 1818 pve2 = *sparepve; 1819 *sparepve = NULL; 1820 1821 pve2->pve_pte = pp->pp_pte; 1822 pp->pp_flags &= ~PP_EMBEDDED; 1823 LIST_INIT(&pp->pp_head.pvh_list); 1824 insert_pv(pp, pve2); 1825 } 1826 1827 pve->pve_pte.pte_ptp = ptp; 1828 pve->pve_pte.pte_va = va; 1829 insert_pv(pp, pve); 1830 1831 return NULL; 1832 } 1833 1834 /* 1835 * pmap_remove_pv: try to remove a mapping from a pv_list 1836 * 1837 * => caller should adjust ptp's wire_count and free PTP if needed 1838 * => we return the removed pve 1839 */ 1840 1841 static struct pv_entry * 1842 pmap_remove_pv(struct pmap_page *pp, struct vm_page *ptp, vaddr_t va) 1843 { 1844 struct pv_hash_head *hh; 1845 struct pv_entry *pve; 1846 kmutex_t *lock; 1847 u_int hash; 1848 1849 KASSERT(ptp == NULL || ptp->uobject != NULL); 1850 KASSERT(ptp 
== NULL || ptp_va2o(va, 1) == ptp->offset); 1851 1852 if ((pp->pp_flags & PP_EMBEDDED) != 0) { 1853 KASSERT(pp->pp_pte.pte_ptp == ptp); 1854 KASSERT(pp->pp_pte.pte_va == va); 1855 1856 pp->pp_flags &= ~PP_EMBEDDED; 1857 LIST_INIT(&pp->pp_head.pvh_list); 1858 1859 return NULL; 1860 } 1861 1862 hash = pvhash_hash(ptp, va); 1863 lock = pvhash_lock(hash); 1864 hh = pvhash_head(hash); 1865 mutex_spin_enter(lock); 1866 pve = pvhash_remove(hh, ptp, va); 1867 mutex_spin_exit(lock); 1868 1869 LIST_REMOVE(pve, pve_list); 1870 1871 return pve; 1872 } 1873 1874 /* 1875 * p t p f u n c t i o n s 1876 */ 1877 1878 static inline struct vm_page * 1879 pmap_find_ptp(struct pmap *pmap, vaddr_t va, paddr_t pa, int level) 1880 { 1881 int lidx = level - 1; 1882 struct vm_page *pg; 1883 1884 KASSERT(mutex_owned(pmap->pm_lock)); 1885 1886 if (pa != (paddr_t)-1 && pmap->pm_ptphint[lidx] && 1887 pa == VM_PAGE_TO_PHYS(pmap->pm_ptphint[lidx])) { 1888 return (pmap->pm_ptphint[lidx]); 1889 } 1890 PMAP_SUBOBJ_LOCK(pmap, lidx); 1891 pg = uvm_pagelookup(&pmap->pm_obj[lidx], ptp_va2o(va, level)); 1892 PMAP_SUBOBJ_UNLOCK(pmap, lidx); 1893 1894 KASSERT(pg == NULL || pg->wire_count >= 1); 1895 return pg; 1896 } 1897 1898 static inline void 1899 pmap_freepage(struct pmap *pmap, struct vm_page *ptp, int level) 1900 { 1901 lwp_t *l; 1902 int lidx; 1903 struct uvm_object *obj; 1904 1905 KASSERT(ptp->wire_count == 1); 1906 1907 lidx = level - 1; 1908 1909 obj = &pmap->pm_obj[lidx]; 1910 pmap_stats_update(pmap, -1, 0); 1911 if (lidx != 0) 1912 mutex_enter(obj->vmobjlock); 1913 if (pmap->pm_ptphint[lidx] == ptp) 1914 pmap->pm_ptphint[lidx] = TAILQ_FIRST(&obj->memq); 1915 ptp->wire_count = 0; 1916 uvm_pagerealloc(ptp, NULL, 0); 1917 l = curlwp; 1918 KASSERT((l->l_pflag & LP_INTR) == 0); 1919 VM_PAGE_TO_PP(ptp)->pp_link = l->l_md.md_gc_ptp; 1920 l->l_md.md_gc_ptp = ptp; 1921 if (lidx != 0) 1922 mutex_exit(obj->vmobjlock); 1923 } 1924 1925 static void 1926 pmap_free_ptp(struct pmap *pmap, struct vm_page *ptp, vaddr_t va, 1927 pt_entry_t *ptes, pd_entry_t * const *pdes) 1928 { 1929 unsigned long index; 1930 int level; 1931 vaddr_t invaladdr; 1932 pd_entry_t opde; 1933 1934 KASSERT(pmap != pmap_kernel()); 1935 KASSERT(mutex_owned(pmap->pm_lock)); 1936 KASSERT(kpreempt_disabled()); 1937 1938 level = 1; 1939 do { 1940 index = pl_i(va, level + 1); 1941 opde = pmap_pte_testset(&pdes[level - 1][index], 0); 1942 #if defined(XEN) 1943 # if defined(__x86_64__) 1944 /* 1945 * If ptp is a L3 currently mapped in kernel space, 1946 * on any cpu, clear it before freeing 1947 */ 1948 if (level == PTP_LEVELS - 1) { 1949 /* 1950 * Update the per-cpu PD on all cpus the current 1951 * pmap is active on 1952 */ 1953 xen_kpm_sync(pmap, index); 1954 } 1955 # endif /*__x86_64__ */ 1956 invaladdr = level == 1 ? (vaddr_t)ptes : 1957 (vaddr_t)pdes[level - 2]; 1958 pmap_tlb_shootdown(pmap, invaladdr + index * PAGE_SIZE, 1959 opde, TLBSHOOT_FREE_PTP1); 1960 pmap_tlb_shootnow(); 1961 #else /* XEN */ 1962 invaladdr = level == 1 ? 
(vaddr_t)ptes : 1963 (vaddr_t)pdes[level - 2]; 1964 pmap_tlb_shootdown(pmap, invaladdr + index * PAGE_SIZE, 1965 opde, TLBSHOOT_FREE_PTP1); 1966 #endif /* XEN */ 1967 pmap_freepage(pmap, ptp, level); 1968 if (level < PTP_LEVELS - 1) { 1969 ptp = pmap_find_ptp(pmap, va, (paddr_t)-1, level + 1); 1970 ptp->wire_count--; 1971 if (ptp->wire_count > 1) 1972 break; 1973 } 1974 } while (++level < PTP_LEVELS); 1975 pmap_pte_flush(); 1976 } 1977 1978 /* 1979 * pmap_get_ptp: get a PTP (if there isn't one, allocate a new one) 1980 * 1981 * => pmap should NOT be pmap_kernel() 1982 * => pmap should be locked 1983 * => preemption should be disabled 1984 */ 1985 1986 static struct vm_page * 1987 pmap_get_ptp(struct pmap *pmap, vaddr_t va, pd_entry_t * const *pdes) 1988 { 1989 struct vm_page *ptp, *pptp; 1990 int i; 1991 unsigned long index; 1992 pd_entry_t *pva; 1993 paddr_t ppa, pa; 1994 struct uvm_object *obj; 1995 1996 KASSERT(pmap != pmap_kernel()); 1997 KASSERT(mutex_owned(pmap->pm_lock)); 1998 KASSERT(kpreempt_disabled()); 1999 2000 ptp = NULL; 2001 pa = (paddr_t)-1; 2002 2003 /* 2004 * Loop through all page table levels seeing if we need to 2005 * add a new page to that level. 2006 */ 2007 for (i = PTP_LEVELS; i > 1; i--) { 2008 /* 2009 * Save values from previous round. 2010 */ 2011 pptp = ptp; 2012 ppa = pa; 2013 2014 index = pl_i(va, i); 2015 pva = pdes[i - 2]; 2016 2017 if (pmap_valid_entry(pva[index])) { 2018 ppa = pmap_pte2pa(pva[index]); 2019 ptp = NULL; 2020 continue; 2021 } 2022 2023 obj = &pmap->pm_obj[i-2]; 2024 PMAP_SUBOBJ_LOCK(pmap, i - 2); 2025 ptp = uvm_pagealloc(obj, ptp_va2o(va, i - 1), NULL, 2026 UVM_PGA_USERESERVE|UVM_PGA_ZERO); 2027 PMAP_SUBOBJ_UNLOCK(pmap, i - 2); 2028 2029 if (ptp == NULL) 2030 return NULL; 2031 2032 ptp->flags &= ~PG_BUSY; /* never busy */ 2033 ptp->wire_count = 1; 2034 pmap->pm_ptphint[i - 2] = ptp; 2035 pa = VM_PAGE_TO_PHYS(ptp); 2036 pmap_pte_set(&pva[index], (pd_entry_t) 2037 (pmap_pa2pte(pa) | PG_u | PG_RW | PG_V)); 2038 #if defined(XEN) && defined(__x86_64__) 2039 if(i == PTP_LEVELS) { 2040 /* 2041 * Update the per-cpu PD on all cpus the current 2042 * pmap is active on 2043 */ 2044 xen_kpm_sync(pmap, index); 2045 } 2046 #endif 2047 pmap_pte_flush(); 2048 pmap_stats_update(pmap, 1, 0); 2049 /* 2050 * If we're not in the top level, increase the 2051 * wire count of the parent page. 2052 */ 2053 if (i < PTP_LEVELS) { 2054 if (pptp == NULL) { 2055 pptp = pmap_find_ptp(pmap, va, ppa, i); 2056 KASSERT(pptp != NULL); 2057 } 2058 pptp->wire_count++; 2059 } 2060 } 2061 2062 /* 2063 * PTP is not NULL if we just allocated a new PTP. If it is 2064 * still NULL, we must look up the existing one. 2065 */ 2066 if (ptp == NULL) { 2067 ptp = pmap_find_ptp(pmap, va, ppa, 1); 2068 KASSERTMSG(ptp != NULL, "pmap_get_ptp: va %" PRIxVADDR 2069 "ppa %" PRIxPADDR "\n", va, ppa); 2070 } 2071 2072 pmap->pm_ptphint[0] = ptp; 2073 return ptp; 2074 } 2075 2076 /* 2077 * p m a p l i f e c y c l e f u n c t i o n s 2078 */ 2079 2080 /* 2081 * pmap_pdp_ctor: constructor for the PDP cache. 2082 */ 2083 static int 2084 pmap_pdp_ctor(void *arg, void *v, int flags) 2085 { 2086 pd_entry_t *pdir = v; 2087 paddr_t pdirpa = 0; /* XXX: GCC */ 2088 vaddr_t object; 2089 int i; 2090 2091 #if !defined(XEN) || !defined(__x86_64__) 2092 int npde; 2093 #endif 2094 #ifdef XEN 2095 int s; 2096 #endif 2097 2098 /* 2099 * NOTE: The `pmaps_lock' is held when the PDP is allocated. 2100 */ 2101 2102 #if defined(XEN) && defined(__x86_64__) 2103 /* fetch the physical address of the page directory. 
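	 * The PDP pages come from pool-allocated kernel VA, so the
	 * backing physical address has to be looked up with pmap_extract().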
*/ 2104 (void) pmap_extract(pmap_kernel(), (vaddr_t) pdir, &pdirpa); 2105 2106 /* zero init area */ 2107 memset (pdir, 0, PAGE_SIZE); /* Xen wants a clean page */ 2108 /* 2109 * this pdir will NEVER be active in kernel mode 2110 * so mark recursive entry invalid 2111 */ 2112 pdir[PDIR_SLOT_PTE] = pmap_pa2pte(pdirpa) | PG_u; 2113 /* 2114 * PDP constructed this way won't be for kernel, 2115 * hence we don't put kernel mappings on Xen. 2116 * But we need to make pmap_create() happy, so put a dummy (without 2117 * PG_V) value at the right place. 2118 */ 2119 pdir[PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1] - 1] = 2120 (pd_entry_t)-1 & PG_FRAME; 2121 #else /* XEN && __x86_64__*/ 2122 /* zero init area */ 2123 memset(pdir, 0, PDIR_SLOT_PTE * sizeof(pd_entry_t)); 2124 2125 object = (vaddr_t)v; 2126 for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) { 2127 /* fetch the physical address of the page directory. */ 2128 (void) pmap_extract(pmap_kernel(), object, &pdirpa); 2129 /* put in recursive PDE to map the PTEs */ 2130 pdir[PDIR_SLOT_PTE + i] = pmap_pa2pte(pdirpa) | PG_V; 2131 #ifndef XEN 2132 pdir[PDIR_SLOT_PTE + i] |= PG_KW; 2133 #endif 2134 } 2135 2136 /* copy kernel's PDE */ 2137 npde = nkptp[PTP_LEVELS - 1]; 2138 2139 memcpy(&pdir[PDIR_SLOT_KERN], &PDP_BASE[PDIR_SLOT_KERN], 2140 npde * sizeof(pd_entry_t)); 2141 2142 /* zero the rest */ 2143 memset(&pdir[PDIR_SLOT_KERN + npde], 0, (PAGE_SIZE * PDP_SIZE) - 2144 (PDIR_SLOT_KERN + npde) * sizeof(pd_entry_t)); 2145 2146 if (VM_MIN_KERNEL_ADDRESS != KERNBASE) { 2147 int idx = pl_i(KERNBASE, PTP_LEVELS); 2148 2149 pdir[idx] = PDP_BASE[idx]; 2150 } 2151 2152 #ifdef __HAVE_DIRECT_MAP 2153 pdir[PDIR_SLOT_DIRECT] = PDP_BASE[PDIR_SLOT_DIRECT]; 2154 #endif 2155 2156 #endif /* XEN && __x86_64__*/ 2157 #ifdef XEN 2158 s = splvm(); 2159 object = (vaddr_t)v; 2160 pmap_protect(pmap_kernel(), object, object + (PAGE_SIZE * PDP_SIZE), 2161 VM_PROT_READ); 2162 pmap_update(pmap_kernel()); 2163 for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) { 2164 /* 2165 * pin as L2/L4 page, we have to do the page with the 2166 * PDIR_SLOT_PTE entries last 2167 */ 2168 #ifdef PAE 2169 if (i == l2tol3(PDIR_SLOT_PTE)) 2170 continue; 2171 #endif 2172 2173 (void) pmap_extract(pmap_kernel(), object, &pdirpa); 2174 #ifdef __x86_64__ 2175 xpq_queue_pin_l4_table(xpmap_ptom_masked(pdirpa)); 2176 #else 2177 xpq_queue_pin_l2_table(xpmap_ptom_masked(pdirpa)); 2178 #endif 2179 } 2180 #ifdef PAE 2181 object = ((vaddr_t)pdir) + PAGE_SIZE * l2tol3(PDIR_SLOT_PTE); 2182 (void)pmap_extract(pmap_kernel(), object, &pdirpa); 2183 xpq_queue_pin_l2_table(xpmap_ptom_masked(pdirpa)); 2184 #endif 2185 splx(s); 2186 #endif /* XEN */ 2187 2188 return (0); 2189 } 2190 2191 /* 2192 * pmap_pdp_dtor: destructor for the PDP cache. 2193 */ 2194 2195 static void 2196 pmap_pdp_dtor(void *arg, void *v) 2197 { 2198 #ifdef XEN 2199 paddr_t pdirpa = 0; /* XXX: GCC */ 2200 vaddr_t object = (vaddr_t)v; 2201 int i; 2202 int s = splvm(); 2203 pt_entry_t *pte; 2204 2205 for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) { 2206 /* fetch the physical address of the page directory. 
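		 * Xen pins page tables by machine frame, so the physical
		 * address of each page is needed to unpin it below.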
*/ 2207 (void) pmap_extract(pmap_kernel(), object, &pdirpa); 2208 /* unpin page table */ 2209 xpq_queue_unpin_table(xpmap_ptom_masked(pdirpa)); 2210 } 2211 object = (vaddr_t)v; 2212 for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) { 2213 /* Set page RW again */ 2214 pte = kvtopte(object); 2215 pmap_pte_set(pte, *pte | PG_RW); 2216 xen_bcast_invlpg((vaddr_t)object); 2217 } 2218 splx(s); 2219 #endif /* XEN */ 2220 } 2221 2222 #ifdef PAE 2223 2224 /* pmap_pdp_alloc: Allocate a page for the pdp memory pool. */ 2225 2226 static void * 2227 pmap_pdp_alloc(struct pool *pp, int flags) 2228 { 2229 return (void *)uvm_km_alloc(kernel_map, 2230 PAGE_SIZE * PDP_SIZE, PAGE_SIZE * PDP_SIZE, 2231 ((flags & PR_WAITOK) ? 0 : UVM_KMF_NOWAIT | UVM_KMF_TRYLOCK) 2232 | UVM_KMF_WIRED); 2233 } 2234 2235 /* 2236 * pmap_pdp_free: free a PDP 2237 */ 2238 2239 static void 2240 pmap_pdp_free(struct pool *pp, void *v) 2241 { 2242 uvm_km_free(kernel_map, (vaddr_t)v, PAGE_SIZE * PDP_SIZE, 2243 UVM_KMF_WIRED); 2244 } 2245 #endif /* PAE */ 2246 2247 /* 2248 * pmap_create: create a pmap object. 2249 */ 2250 struct pmap * 2251 pmap_create(void) 2252 { 2253 struct pmap *pmap; 2254 int i; 2255 2256 pmap = pool_cache_get(&pmap_cache, PR_WAITOK); 2257 2258 /* init uvm_object */ 2259 for (i = 0; i < PTP_LEVELS - 1; i++) { 2260 mutex_init(&pmap->pm_obj_lock[i], MUTEX_DEFAULT, IPL_NONE); 2261 uvm_obj_init(&pmap->pm_obj[i], NULL, false, 1); 2262 uvm_obj_setlock(&pmap->pm_obj[i], &pmap->pm_obj_lock[i]); 2263 pmap->pm_ptphint[i] = NULL; 2264 } 2265 pmap->pm_stats.wired_count = 0; 2266 /* count the PDP allocd below */ 2267 pmap->pm_stats.resident_count = PDP_SIZE; 2268 #if !defined(__x86_64__) 2269 pmap->pm_hiexec = 0; 2270 #endif /* !defined(__x86_64__) */ 2271 pmap->pm_flags = 0; 2272 pmap->pm_gc_ptp = NULL; 2273 2274 kcpuset_create(&pmap->pm_cpus, true); 2275 kcpuset_create(&pmap->pm_kernel_cpus, true); 2276 #ifdef XEN 2277 kcpuset_create(&pmap->pm_xen_ptp_cpus, true); 2278 #endif 2279 /* init the LDT */ 2280 pmap->pm_ldt = NULL; 2281 pmap->pm_ldt_len = 0; 2282 pmap->pm_ldt_sel = GSYSSEL(GLDT_SEL, SEL_KPL); 2283 2284 /* allocate PDP */ 2285 try_again: 2286 pmap->pm_pdir = pool_cache_get(&pmap_pdp_cache, PR_WAITOK); 2287 2288 mutex_enter(&pmaps_lock); 2289 2290 if (pmap->pm_pdir[PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1] - 1] == 0) { 2291 mutex_exit(&pmaps_lock); 2292 pool_cache_destruct_object(&pmap_pdp_cache, pmap->pm_pdir); 2293 goto try_again; 2294 } 2295 2296 for (i = 0; i < PDP_SIZE; i++) 2297 pmap->pm_pdirpa[i] = 2298 pmap_pte2pa(pmap->pm_pdir[PDIR_SLOT_PTE + i]); 2299 2300 LIST_INSERT_HEAD(&pmaps, pmap, pm_list); 2301 2302 mutex_exit(&pmaps_lock); 2303 2304 return (pmap); 2305 } 2306 2307 /* 2308 * pmap_free_ptps: put a list of ptps back to the freelist. 2309 */ 2310 2311 static void 2312 pmap_free_ptps(struct vm_page *empty_ptps) 2313 { 2314 struct vm_page *ptp; 2315 struct pmap_page *pp; 2316 2317 while ((ptp = empty_ptps) != NULL) { 2318 pp = VM_PAGE_TO_PP(ptp); 2319 empty_ptps = pp->pp_link; 2320 LIST_INIT(&pp->pp_head.pvh_list); 2321 uvm_pagefree(ptp); 2322 } 2323 } 2324 2325 /* 2326 * pmap_destroy: drop reference count on pmap. free pmap if 2327 * reference count goes to zero. 2328 */ 2329 2330 void 2331 pmap_destroy(struct pmap *pmap) 2332 { 2333 lwp_t *l; 2334 int i; 2335 2336 /* 2337 * If we have torn down this pmap, process deferred frees and 2338 * invalidations. Free now if the system is low on memory. 2339 * Otherwise, free when the pmap is destroyed thus avoiding a 2340 * TLB shootdown. 
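	 * Deferring the free until the final reference is dropped means
	 * no CPU can still have these PTPs in its TLB, so no shootdown
	 * is needed at that point.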
2341 */ 2342 l = curlwp; 2343 if (__predict_false(l->l_md.md_gc_pmap == pmap)) { 2344 if (uvmexp.free < uvmexp.freetarg) { 2345 pmap_update(pmap); 2346 } else { 2347 KASSERT(pmap->pm_gc_ptp == NULL); 2348 pmap->pm_gc_ptp = l->l_md.md_gc_ptp; 2349 l->l_md.md_gc_ptp = NULL; 2350 l->l_md.md_gc_pmap = NULL; 2351 } 2352 } 2353 2354 /* 2355 * drop reference count 2356 */ 2357 2358 if (atomic_dec_uint_nv(&pmap->pm_obj[0].uo_refs) > 0) { 2359 return; 2360 } 2361 2362 #ifdef DIAGNOSTIC 2363 CPU_INFO_ITERATOR cii; 2364 struct cpu_info *ci; 2365 2366 for (CPU_INFO_FOREACH(cii, ci)) { 2367 if (ci->ci_pmap == pmap) 2368 panic("destroying pmap being used"); 2369 #if defined(XEN) && defined(__x86_64__) 2370 for (i = 0; i < PDIR_SLOT_PTE; i++) { 2371 if (pmap->pm_pdir[i] != 0 && 2372 ci->ci_kpm_pdir[i] == pmap->pm_pdir[i]) { 2373 printf("pmap_destroy(%p) pmap_kernel %p " 2374 "curcpu %d cpu %d ci_pmap %p " 2375 "ci->ci_kpm_pdir[%d]=%" PRIx64 2376 " pmap->pm_pdir[%d]=%" PRIx64 "\n", 2377 pmap, pmap_kernel(), curcpu()->ci_index, 2378 ci->ci_index, ci->ci_pmap, 2379 i, ci->ci_kpm_pdir[i], 2380 i, pmap->pm_pdir[i]); 2381 panic("pmap_destroy: used pmap"); 2382 } 2383 } 2384 #endif 2385 } 2386 #endif /* DIAGNOSTIC */ 2387 2388 /* 2389 * Reference count is zero, free pmap resources and then free pmap. 2390 * First, remove it from global list of pmaps. 2391 */ 2392 2393 mutex_enter(&pmaps_lock); 2394 LIST_REMOVE(pmap, pm_list); 2395 mutex_exit(&pmaps_lock); 2396 2397 /* 2398 * Process deferred PTP frees. No TLB shootdown required, as the 2399 * PTP pages are no longer visible to any CPU. 2400 */ 2401 2402 pmap_free_ptps(pmap->pm_gc_ptp); 2403 2404 /* 2405 * destroyed pmap shouldn't have remaining PTPs 2406 */ 2407 2408 for (i = 0; i < PTP_LEVELS - 1; i++) { 2409 KASSERT(pmap->pm_obj[i].uo_npages == 0); 2410 KASSERT(TAILQ_EMPTY(&pmap->pm_obj[i].memq)); 2411 } 2412 2413 pool_cache_put(&pmap_pdp_cache, pmap->pm_pdir); 2414 2415 #ifdef USER_LDT 2416 if (pmap->pm_ldt != NULL) { 2417 /* 2418 * no need to switch the LDT; this address space is gone, 2419 * nothing is using it. 2420 * 2421 * No need to lock the pmap for ldt_free (or anything else), 2422 * we're the last one to use it. 2423 */ 2424 mutex_enter(&cpu_lock); 2425 ldt_free(pmap->pm_ldt_sel); 2426 mutex_exit(&cpu_lock); 2427 uvm_km_free(kernel_map, (vaddr_t)pmap->pm_ldt, 2428 pmap->pm_ldt_len, UVM_KMF_WIRED); 2429 } 2430 #endif 2431 2432 for (i = 0; i < PTP_LEVELS - 1; i++) { 2433 uvm_obj_destroy(&pmap->pm_obj[i], false); 2434 mutex_destroy(&pmap->pm_obj_lock[i]); 2435 } 2436 kcpuset_destroy(pmap->pm_cpus); 2437 kcpuset_destroy(pmap->pm_kernel_cpus); 2438 #ifdef XEN 2439 kcpuset_destroy(pmap->pm_xen_ptp_cpus); 2440 #endif 2441 pool_cache_put(&pmap_cache, pmap); 2442 } 2443 2444 /* 2445 * pmap_remove_all: pmap is being torn down by the current thread. 2446 * avoid unnecessary invalidations. 2447 */ 2448 2449 void 2450 pmap_remove_all(struct pmap *pmap) 2451 { 2452 lwp_t *l = curlwp; 2453 2454 KASSERT(l->l_md.md_gc_pmap == NULL); 2455 2456 l->l_md.md_gc_pmap = pmap; 2457 } 2458 2459 #if defined(PMAP_FORK) 2460 /* 2461 * pmap_fork: perform any necessary data structure manipulation when 2462 * a VM space is forked. 
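 * On x86 the only per-pmap state that needs copying is the
 * process-private LDT, and only when USER_LDT is configured.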
2463 */ 2464 2465 void 2466 pmap_fork(struct pmap *pmap1, struct pmap *pmap2) 2467 { 2468 #ifdef USER_LDT 2469 union descriptor *new_ldt; 2470 size_t len; 2471 int sel; 2472 2473 if (__predict_true(pmap1->pm_ldt == NULL)) { 2474 return; 2475 } 2476 2477 retry: 2478 if (pmap1->pm_ldt != NULL) { 2479 len = pmap1->pm_ldt_len; 2480 new_ldt = (union descriptor *)uvm_km_alloc(kernel_map, len, 0, 2481 UVM_KMF_WIRED); 2482 mutex_enter(&cpu_lock); 2483 sel = ldt_alloc(new_ldt, len); 2484 if (sel == -1) { 2485 mutex_exit(&cpu_lock); 2486 uvm_km_free(kernel_map, (vaddr_t)new_ldt, len, 2487 UVM_KMF_WIRED); 2488 printf("WARNING: pmap_fork: unable to allocate LDT\n"); 2489 return; 2490 } 2491 } else { 2492 len = -1; 2493 new_ldt = NULL; 2494 sel = -1; 2495 mutex_enter(&cpu_lock); 2496 } 2497 2498 /* Copy the LDT, if necessary. */ 2499 if (pmap1->pm_ldt != NULL) { 2500 if (len != pmap1->pm_ldt_len) { 2501 if (len != -1) { 2502 ldt_free(sel); 2503 uvm_km_free(kernel_map, (vaddr_t)new_ldt, 2504 len, UVM_KMF_WIRED); 2505 } 2506 mutex_exit(&cpu_lock); 2507 goto retry; 2508 } 2509 2510 memcpy(new_ldt, pmap1->pm_ldt, len); 2511 pmap2->pm_ldt = new_ldt; 2512 pmap2->pm_ldt_len = pmap1->pm_ldt_len; 2513 pmap2->pm_ldt_sel = sel; 2514 len = -1; 2515 } 2516 2517 if (len != -1) { 2518 ldt_free(sel); 2519 uvm_km_free(kernel_map, (vaddr_t)new_ldt, len, 2520 UVM_KMF_WIRED); 2521 } 2522 mutex_exit(&cpu_lock); 2523 #endif /* USER_LDT */ 2524 } 2525 #endif /* PMAP_FORK */ 2526 2527 #ifdef USER_LDT 2528 2529 /* 2530 * pmap_ldt_xcall: cross call used by pmap_ldt_sync. if the named pmap 2531 * is active, reload LDTR. 2532 */ 2533 static void 2534 pmap_ldt_xcall(void *arg1, void *arg2) 2535 { 2536 struct pmap *pm; 2537 2538 kpreempt_disable(); 2539 pm = arg1; 2540 if (curcpu()->ci_pmap == pm) { 2541 lldt(pm->pm_ldt_sel); 2542 } 2543 kpreempt_enable(); 2544 } 2545 2546 /* 2547 * pmap_ldt_sync: LDT selector for the named pmap is changing. swap 2548 * in the new selector on all CPUs. 2549 */ 2550 void 2551 pmap_ldt_sync(struct pmap *pm) 2552 { 2553 uint64_t where; 2554 2555 KASSERT(mutex_owned(&cpu_lock)); 2556 2557 pmap_ldt_evcnt.ev_count++; 2558 where = xc_broadcast(0, pmap_ldt_xcall, pm, NULL); 2559 xc_wait(where); 2560 } 2561 2562 /* 2563 * pmap_ldt_cleanup: if the pmap has a local LDT, deallocate it, and 2564 * restore the default. 
2565 */ 2566 2567 void 2568 pmap_ldt_cleanup(struct lwp *l) 2569 { 2570 pmap_t pmap = l->l_proc->p_vmspace->vm_map.pmap; 2571 union descriptor *dp = NULL; 2572 size_t len = 0; 2573 int sel = -1; 2574 2575 if (__predict_true(pmap->pm_ldt == NULL)) { 2576 return; 2577 } 2578 2579 mutex_enter(&cpu_lock); 2580 if (pmap->pm_ldt != NULL) { 2581 sel = pmap->pm_ldt_sel; 2582 dp = pmap->pm_ldt; 2583 len = pmap->pm_ldt_len; 2584 pmap->pm_ldt_sel = GSYSSEL(GLDT_SEL, SEL_KPL); 2585 pmap->pm_ldt = NULL; 2586 pmap->pm_ldt_len = 0; 2587 pmap_ldt_sync(pmap); 2588 ldt_free(sel); 2589 uvm_km_free(kernel_map, (vaddr_t)dp, len, UVM_KMF_WIRED); 2590 } 2591 mutex_exit(&cpu_lock); 2592 } 2593 #endif /* USER_LDT */ 2594 2595 /* 2596 * pmap_activate: activate a process' pmap 2597 * 2598 * => must be called with kernel preemption disabled 2599 * => if lwp is the curlwp, then set ci_want_pmapload so that 2600 * actual MMU context switch will be done by pmap_load() later 2601 */ 2602 2603 void 2604 pmap_activate(struct lwp *l) 2605 { 2606 struct cpu_info *ci; 2607 struct pmap *pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map); 2608 2609 KASSERT(kpreempt_disabled()); 2610 2611 ci = curcpu(); 2612 2613 if (l == ci->ci_curlwp) { 2614 KASSERT(ci->ci_want_pmapload == 0); 2615 KASSERT(ci->ci_tlbstate != TLBSTATE_VALID); 2616 #ifdef KSTACK_CHECK_DR0 2617 /* 2618 * setup breakpoint on the top of stack 2619 */ 2620 if (l == &lwp0) 2621 dr0(0, 0, 0, 0); 2622 else 2623 dr0(KSTACK_LOWEST_ADDR(l), 1, 3, 1); 2624 #endif 2625 2626 /* 2627 * no need to switch to kernel vmspace because 2628 * it's a subset of any vmspace. 2629 */ 2630 2631 if (pmap == pmap_kernel()) { 2632 ci->ci_want_pmapload = 0; 2633 return; 2634 } 2635 2636 ci->ci_want_pmapload = 1; 2637 } 2638 } 2639 2640 /* 2641 * pmap_reactivate: try to regain reference to the pmap. 2642 * 2643 * => Must be called with kernel preemption disabled. 2644 */ 2645 2646 static bool 2647 pmap_reactivate(struct pmap *pmap) 2648 { 2649 struct cpu_info * const ci = curcpu(); 2650 const cpuid_t cid = cpu_index(ci); 2651 bool result; 2652 2653 KASSERT(kpreempt_disabled()); 2654 #if defined(XEN) && defined(__x86_64__) 2655 KASSERT(pmap_pdirpa(pmap, 0) == ci->ci_xen_current_user_pgd); 2656 #elif defined(PAE) 2657 KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(ci->ci_pae_l3_pdir[0])); 2658 #elif !defined(XEN) 2659 KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(rcr3())); 2660 #endif 2661 2662 /* 2663 * If we still have a lazy reference to this pmap, we can assume 2664 * that there was no TLB shootdown for this pmap in the meantime. 2665 * 2666 * The order of events here is important as we must synchronize 2667 * with TLB shootdown interrupts. Declare interest in invalidations 2668 * (TLBSTATE_VALID) and then check the CPU set, which the IPIs can 2669 * change only when the state is TLBSTATE_LAZY. 2670 */ 2671 2672 ci->ci_tlbstate = TLBSTATE_VALID; 2673 KASSERT(kcpuset_isset(pmap->pm_kernel_cpus, cid)); 2674 2675 if (kcpuset_isset(pmap->pm_cpus, cid)) { 2676 /* We have the reference, state is valid. */ 2677 result = true; 2678 } else { 2679 /* Must reload the TLB. */ 2680 kcpuset_atomic_set(pmap->pm_cpus, cid); 2681 result = false; 2682 } 2683 return result; 2684 } 2685 2686 /* 2687 * pmap_load: perform the actual pmap switch, i.e. fill in %cr3 register 2688 * and relevant LDT info. 2689 * 2690 * Ensures that the current process' pmap is loaded on the current CPU's 2691 * MMU and that there are no stale TLB entries. 
2692 * 2693 * => The caller should disable kernel preemption or do check-and-retry 2694 * to prevent a preemption from undoing our efforts. 2695 * => This function may block. 2696 */ 2697 void 2698 pmap_load(void) 2699 { 2700 struct cpu_info *ci; 2701 struct pmap *pmap, *oldpmap; 2702 struct lwp *l; 2703 struct pcb *pcb; 2704 cpuid_t cid; 2705 uint64_t ncsw; 2706 2707 kpreempt_disable(); 2708 retry: 2709 ci = curcpu(); 2710 if (!ci->ci_want_pmapload) { 2711 kpreempt_enable(); 2712 return; 2713 } 2714 l = ci->ci_curlwp; 2715 ncsw = l->l_ncsw; 2716 2717 /* should be able to take ipis. */ 2718 KASSERT(ci->ci_ilevel < IPL_HIGH); 2719 #ifdef XEN 2720 /* Check to see if interrupts are enabled (ie; no events are masked) */ 2721 KASSERT(x86_read_psl() == 0); 2722 #else 2723 KASSERT((x86_read_psl() & PSL_I) != 0); 2724 #endif 2725 2726 KASSERT(l != NULL); 2727 pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map); 2728 KASSERT(pmap != pmap_kernel()); 2729 oldpmap = ci->ci_pmap; 2730 pcb = lwp_getpcb(l); 2731 2732 if (pmap == oldpmap) { 2733 if (!pmap_reactivate(pmap)) { 2734 u_int gen = uvm_emap_gen_return(); 2735 2736 /* 2737 * pmap has been changed during deactivated. 2738 * our tlb may be stale. 2739 */ 2740 2741 tlbflush(); 2742 uvm_emap_update(gen); 2743 } 2744 2745 ci->ci_want_pmapload = 0; 2746 kpreempt_enable(); 2747 return; 2748 } 2749 2750 /* 2751 * Acquire a reference to the new pmap and perform the switch. 2752 */ 2753 2754 pmap_reference(pmap); 2755 2756 cid = cpu_index(ci); 2757 kcpuset_atomic_clear(oldpmap->pm_cpus, cid); 2758 kcpuset_atomic_clear(oldpmap->pm_kernel_cpus, cid); 2759 2760 #if defined(XEN) && defined(__x86_64__) 2761 KASSERT(pmap_pdirpa(oldpmap, 0) == ci->ci_xen_current_user_pgd || 2762 oldpmap == pmap_kernel()); 2763 #elif defined(PAE) 2764 KASSERT(pmap_pdirpa(oldpmap, 0) == pmap_pte2pa(ci->ci_pae_l3_pdir[0])); 2765 #elif !defined(XEN) 2766 KASSERT(pmap_pdirpa(oldpmap, 0) == pmap_pte2pa(rcr3())); 2767 #endif 2768 KASSERT(!kcpuset_isset(pmap->pm_cpus, cid)); 2769 KASSERT(!kcpuset_isset(pmap->pm_kernel_cpus, cid)); 2770 2771 /* 2772 * Mark the pmap in use by this CPU. Again, we must synchronize 2773 * with TLB shootdown interrupts, so set the state VALID first, 2774 * then register us for shootdown events on this pmap. 2775 */ 2776 ci->ci_tlbstate = TLBSTATE_VALID; 2777 kcpuset_atomic_set(pmap->pm_cpus, cid); 2778 kcpuset_atomic_set(pmap->pm_kernel_cpus, cid); 2779 ci->ci_pmap = pmap; 2780 2781 /* 2782 * update tss. now that we have registered for invalidations 2783 * from other CPUs, we're good to load the page tables. 2784 */ 2785 #ifdef PAE 2786 pcb->pcb_cr3 = ci->ci_pae_l3_pdirpa; 2787 #else 2788 pcb->pcb_cr3 = pmap_pdirpa(pmap, 0); 2789 #endif 2790 2791 #ifdef i386 2792 #ifndef XEN 2793 ci->ci_tss.tss_ldt = pmap->pm_ldt_sel; 2794 ci->ci_tss.tss_cr3 = pcb->pcb_cr3; 2795 #endif /* !XEN */ 2796 #endif /* i386 */ 2797 2798 lldt(pmap->pm_ldt_sel); 2799 2800 u_int gen = uvm_emap_gen_return(); 2801 cpu_load_pmap(pmap, oldpmap); 2802 uvm_emap_update(gen); 2803 2804 ci->ci_want_pmapload = 0; 2805 2806 /* 2807 * we're now running with the new pmap. drop the reference 2808 * to the old pmap. if we block, we need to go around again. 2809 */ 2810 2811 pmap_destroy(oldpmap); 2812 if (l->l_ncsw != ncsw) { 2813 goto retry; 2814 } 2815 2816 kpreempt_enable(); 2817 } 2818 2819 /* 2820 * pmap_deactivate: deactivate a process' pmap. 2821 * 2822 * => Must be called with kernel preemption disabled (high IPL is enough). 
2823 */ 2824 void 2825 pmap_deactivate(struct lwp *l) 2826 { 2827 struct pmap *pmap; 2828 struct cpu_info *ci; 2829 2830 KASSERT(kpreempt_disabled()); 2831 2832 if (l != curlwp) { 2833 return; 2834 } 2835 2836 /* 2837 * Wait for pending TLB shootdowns to complete. Necessary because 2838 * TLB shootdown state is per-CPU, and the LWP may be coming off 2839 * the CPU before it has a chance to call pmap_update(), e.g. due 2840 * to kernel preemption or blocking routine in between. 2841 */ 2842 pmap_tlb_shootnow(); 2843 2844 ci = curcpu(); 2845 2846 if (ci->ci_want_pmapload) { 2847 /* 2848 * ci_want_pmapload means that our pmap is not loaded on 2849 * the CPU or TLB might be stale. note that pmap_kernel() 2850 * is always considered loaded. 2851 */ 2852 KASSERT(vm_map_pmap(&l->l_proc->p_vmspace->vm_map) 2853 != pmap_kernel()); 2854 KASSERT(vm_map_pmap(&l->l_proc->p_vmspace->vm_map) 2855 != ci->ci_pmap || ci->ci_tlbstate != TLBSTATE_VALID); 2856 2857 /* 2858 * userspace has not been touched. 2859 * nothing to do here. 2860 */ 2861 2862 ci->ci_want_pmapload = 0; 2863 return; 2864 } 2865 2866 pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map); 2867 2868 if (pmap == pmap_kernel()) { 2869 return; 2870 } 2871 2872 #if defined(XEN) && defined(__x86_64__) 2873 KASSERT(pmap_pdirpa(pmap, 0) == ci->ci_xen_current_user_pgd); 2874 #elif defined(PAE) 2875 KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(ci->ci_pae_l3_pdir[0])); 2876 #elif !defined(XEN) 2877 KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(rcr3())); 2878 #endif 2879 KASSERT(ci->ci_pmap == pmap); 2880 2881 /* 2882 * we aren't interested in TLB invalidations for this pmap, 2883 * at least for the time being. 2884 */ 2885 2886 KASSERT(ci->ci_tlbstate == TLBSTATE_VALID); 2887 ci->ci_tlbstate = TLBSTATE_LAZY; 2888 } 2889 2890 /* 2891 * end of lifecycle functions 2892 */ 2893 2894 /* 2895 * some misc. functions 2896 */ 2897 2898 int 2899 pmap_pdes_invalid(vaddr_t va, pd_entry_t * const *pdes, pd_entry_t *lastpde) 2900 { 2901 int i; 2902 unsigned long index; 2903 pd_entry_t pde; 2904 2905 for (i = PTP_LEVELS; i > 1; i--) { 2906 index = pl_i(va, i); 2907 pde = pdes[i - 2][index]; 2908 if ((pde & PG_V) == 0) 2909 return i; 2910 } 2911 if (lastpde != NULL) 2912 *lastpde = pde; 2913 return 0; 2914 } 2915 2916 /* 2917 * pmap_extract: extract a PA for the given VA 2918 */ 2919 2920 bool 2921 pmap_extract(struct pmap *pmap, vaddr_t va, paddr_t *pap) 2922 { 2923 pt_entry_t *ptes, pte; 2924 pd_entry_t pde; 2925 pd_entry_t * const *pdes; 2926 struct pmap *pmap2; 2927 struct cpu_info *ci; 2928 paddr_t pa; 2929 lwp_t *l; 2930 bool hard, rv; 2931 2932 #ifdef __HAVE_DIRECT_MAP 2933 if (va >= PMAP_DIRECT_BASE && va < PMAP_DIRECT_END) { 2934 if (pap != NULL) { 2935 *pap = va - PMAP_DIRECT_BASE; 2936 } 2937 return true; 2938 } 2939 #endif 2940 2941 rv = false; 2942 pa = 0; 2943 l = curlwp; 2944 2945 kpreempt_disable(); 2946 ci = l->l_cpu; 2947 if (__predict_true(!ci->ci_want_pmapload && ci->ci_pmap == pmap) || 2948 pmap == pmap_kernel()) { 2949 /* 2950 * no need to lock, because it's pmap_kernel() or our 2951 * own pmap and is active. if a user pmap, the caller 2952 * will hold the vm_map write/read locked and so prevent 2953 * entries from disappearing while we are here. ptps 2954 * can disappear via pmap_remove() and pmap_protect(), 2955 * but they are called with the vm_map write locked. 2956 */ 2957 hard = false; 2958 ptes = PTE_BASE; 2959 pdes = normal_pdes; 2960 } else { 2961 /* we lose, do it the hard way. 
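		 * i.e. temporarily map the target pmap's page tables
		 * with pmap_map_ptes() and unmap them again when done.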
*/ 2962 hard = true; 2963 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); 2964 } 2965 if (pmap_pdes_valid(va, pdes, &pde)) { 2966 pte = ptes[pl1_i(va)]; 2967 if (pde & PG_PS) { 2968 pa = (pde & PG_LGFRAME) | (va & (NBPD_L2 - 1)); 2969 rv = true; 2970 } else if (__predict_true((pte & PG_V) != 0)) { 2971 pa = pmap_pte2pa(pte) | (va & (NBPD_L1 - 1)); 2972 rv = true; 2973 } 2974 } 2975 if (__predict_false(hard)) { 2976 pmap_unmap_ptes(pmap, pmap2); 2977 } 2978 kpreempt_enable(); 2979 if (pap != NULL) { 2980 *pap = pa; 2981 } 2982 return rv; 2983 } 2984 2985 2986 /* 2987 * vtophys: virtual address to physical address. For use by 2988 * machine-dependent code only. 2989 */ 2990 2991 paddr_t 2992 vtophys(vaddr_t va) 2993 { 2994 paddr_t pa; 2995 2996 if (pmap_extract(pmap_kernel(), va, &pa) == true) 2997 return (pa); 2998 return (0); 2999 } 3000 3001 __strict_weak_alias(pmap_extract_ma, pmap_extract); 3002 3003 #ifdef XEN 3004 3005 /* 3006 * vtomach: virtual address to machine address. For use by 3007 * machine-dependent code only. 3008 */ 3009 3010 paddr_t 3011 vtomach(vaddr_t va) 3012 { 3013 paddr_t pa; 3014 3015 if (pmap_extract_ma(pmap_kernel(), va, &pa) == true) 3016 return (pa); 3017 return (0); 3018 } 3019 3020 #endif /* XEN */ 3021 3022 /* 3023 * pmap_virtual_space: used during bootup [pmap_steal_memory] to 3024 * determine the bounds of the kernel virtual addess space. 3025 */ 3026 3027 void 3028 pmap_virtual_space(vaddr_t *startp, vaddr_t *endp) 3029 { 3030 *startp = virtual_avail; 3031 *endp = virtual_end; 3032 } 3033 3034 /* 3035 * pmap_zero_page: zero a page 3036 */ 3037 3038 void 3039 pmap_zero_page(paddr_t pa) 3040 { 3041 #if defined(__HAVE_DIRECT_MAP) 3042 pagezero(PMAP_DIRECT_MAP(pa)); 3043 #else 3044 #if defined(XEN) 3045 if (XEN_VERSION_SUPPORTED(3, 4)) 3046 xen_pagezero(pa); 3047 #endif 3048 pt_entry_t *zpte; 3049 void *zerova; 3050 int id; 3051 3052 kpreempt_disable(); 3053 id = cpu_number(); 3054 zpte = PTESLEW(zero_pte, id); 3055 zerova = VASLEW(zerop, id); 3056 3057 #ifdef DIAGNOSTIC 3058 if (*zpte) 3059 panic("pmap_zero_page: lock botch"); 3060 #endif 3061 3062 pmap_pte_set(zpte, pmap_pa2pte(pa) | PG_V | PG_RW | PG_M | PG_U | PG_k); 3063 pmap_pte_flush(); 3064 pmap_update_pg((vaddr_t)zerova); /* flush TLB */ 3065 3066 memset(zerova, 0, PAGE_SIZE); 3067 3068 #if defined(DIAGNOSTIC) || defined(XEN) 3069 pmap_pte_set(zpte, 0); /* zap ! */ 3070 pmap_pte_flush(); 3071 #endif 3072 kpreempt_enable(); 3073 #endif /* defined(__HAVE_DIRECT_MAP) */ 3074 } 3075 3076 /* 3077 * pmap_pagezeroidle: the same, for the idle loop page zero'er. 3078 * Returns true if the page was zero'd, false if we aborted for 3079 * some reason. 3080 */ 3081 3082 bool 3083 pmap_pageidlezero(paddr_t pa) 3084 { 3085 #ifdef __HAVE_DIRECT_MAP 3086 KASSERT(cpu_feature[0] & CPUID_SSE2); 3087 return sse2_idlezero_page((void *)PMAP_DIRECT_MAP(pa)); 3088 #else 3089 pt_entry_t *zpte; 3090 void *zerova; 3091 bool rv; 3092 int id; 3093 3094 id = cpu_number(); 3095 zpte = PTESLEW(zero_pte, id); 3096 zerova = VASLEW(zerop, id); 3097 3098 KASSERT(cpu_feature[0] & CPUID_SSE2); 3099 KASSERT(*zpte == 0); 3100 3101 pmap_pte_set(zpte, pmap_pa2pte(pa) | PG_V | PG_RW | PG_M | PG_U | PG_k); 3102 pmap_pte_flush(); 3103 pmap_update_pg((vaddr_t)zerova); /* flush TLB */ 3104 3105 rv = sse2_idlezero_page(zerova); 3106 3107 #if defined(DIAGNOSTIC) || defined(XEN) 3108 pmap_pte_set(zpte, 0); /* zap ! 
*/ 3109 pmap_pte_flush(); 3110 #endif 3111 3112 return rv; 3113 #endif 3114 } 3115 3116 /* 3117 * pmap_copy_page: copy a page 3118 */ 3119 3120 void 3121 pmap_copy_page(paddr_t srcpa, paddr_t dstpa) 3122 { 3123 #if defined(__HAVE_DIRECT_MAP) 3124 vaddr_t srcva = PMAP_DIRECT_MAP(srcpa); 3125 vaddr_t dstva = PMAP_DIRECT_MAP(dstpa); 3126 3127 memcpy((void *)dstva, (void *)srcva, PAGE_SIZE); 3128 #else 3129 #if defined(XEN) 3130 if (XEN_VERSION_SUPPORTED(3, 4)) { 3131 xen_copy_page(srcpa, dstpa); 3132 return; 3133 } 3134 #endif 3135 pt_entry_t *spte; 3136 pt_entry_t *dpte; 3137 void *csrcva; 3138 void *cdstva; 3139 int id; 3140 3141 kpreempt_disable(); 3142 id = cpu_number(); 3143 spte = PTESLEW(csrc_pte,id); 3144 dpte = PTESLEW(cdst_pte,id); 3145 csrcva = VASLEW(csrcp, id); 3146 cdstva = VASLEW(cdstp, id); 3147 3148 KASSERT(*spte == 0 && *dpte == 0); 3149 3150 pmap_pte_set(spte, pmap_pa2pte(srcpa) | PG_V | PG_RW | PG_U | PG_k); 3151 pmap_pte_set(dpte, 3152 pmap_pa2pte(dstpa) | PG_V | PG_RW | PG_M | PG_U | PG_k); 3153 pmap_pte_flush(); 3154 pmap_update_2pg((vaddr_t)csrcva, (vaddr_t)cdstva); 3155 3156 memcpy(cdstva, csrcva, PAGE_SIZE); 3157 3158 #if defined(DIAGNOSTIC) || defined(XEN) 3159 pmap_pte_set(spte, 0); 3160 pmap_pte_set(dpte, 0); 3161 pmap_pte_flush(); 3162 #endif 3163 kpreempt_enable(); 3164 #endif /* defined(__HAVE_DIRECT_MAP) */ 3165 } 3166 3167 static pt_entry_t * 3168 pmap_map_ptp(struct vm_page *ptp) 3169 { 3170 #ifdef __HAVE_DIRECT_MAP 3171 return (void *)PMAP_DIRECT_MAP(VM_PAGE_TO_PHYS(ptp)); 3172 #else 3173 pt_entry_t *ptppte; 3174 void *ptpva; 3175 int id; 3176 3177 KASSERT(kpreempt_disabled()); 3178 3179 id = cpu_number(); 3180 ptppte = PTESLEW(ptp_pte, id); 3181 ptpva = VASLEW(ptpp, id); 3182 #if !defined(XEN) 3183 pmap_pte_set(ptppte, pmap_pa2pte(VM_PAGE_TO_PHYS(ptp)) | PG_V | PG_M | 3184 PG_RW | PG_U | PG_k); 3185 #else 3186 pmap_pte_set(ptppte, pmap_pa2pte(VM_PAGE_TO_PHYS(ptp)) | PG_V | PG_M | 3187 PG_U | PG_k); 3188 #endif 3189 pmap_pte_flush(); 3190 pmap_update_pg((vaddr_t)ptpva); 3191 3192 return (pt_entry_t *)ptpva; 3193 #endif 3194 } 3195 3196 static void 3197 pmap_unmap_ptp(void) 3198 { 3199 #ifndef __HAVE_DIRECT_MAP 3200 #if defined(DIAGNOSTIC) || defined(XEN) 3201 pt_entry_t *pte; 3202 3203 KASSERT(kpreempt_disabled()); 3204 3205 pte = PTESLEW(ptp_pte, cpu_number()); 3206 if (*pte != 0) { 3207 pmap_pte_set(pte, 0); 3208 pmap_pte_flush(); 3209 } 3210 #endif 3211 #endif 3212 } 3213 3214 static pt_entry_t * 3215 pmap_map_pte(struct pmap *pmap, struct vm_page *ptp, vaddr_t va) 3216 { 3217 3218 KASSERT(kpreempt_disabled()); 3219 if (pmap_is_curpmap(pmap)) { 3220 return &PTE_BASE[pl1_i(va)]; /* (k)vtopte */ 3221 } 3222 KASSERT(ptp != NULL); 3223 return pmap_map_ptp(ptp) + pl1_pi(va); 3224 } 3225 3226 static void 3227 pmap_unmap_pte(void) 3228 { 3229 3230 KASSERT(kpreempt_disabled()); 3231 3232 pmap_unmap_ptp(); 3233 } 3234 3235 /* 3236 * p m a p r e m o v e f u n c t i o n s 3237 * 3238 * functions that remove mappings 3239 */ 3240 3241 /* 3242 * pmap_remove_ptes: remove PTEs from a PTP 3243 * 3244 * => caller must hold pmap's lock 3245 * => PTP must be mapped into KVA 3246 * => PTP should be null if pmap == pmap_kernel() 3247 * => must be called with kernel preemption disabled 3248 * => returns composite pte if at least one page should be shot down 3249 */ 3250 3251 static void 3252 pmap_remove_ptes(struct pmap *pmap, struct vm_page *ptp, vaddr_t ptpva, 3253 vaddr_t startva, vaddr_t endva, struct pv_entry **pv_tofree) 3254 { 3255 pt_entry_t *pte = (pt_entry_t 
*)ptpva; 3256 3257 KASSERT(pmap == pmap_kernel() || mutex_owned(pmap->pm_lock)); 3258 KASSERT(kpreempt_disabled()); 3259 3260 /* 3261 * note that ptpva points to the PTE that maps startva. this may 3262 * or may not be the first PTE in the PTP. 3263 * 3264 * we loop through the PTP while there are still PTEs to look at 3265 * and the wire_count is greater than 1 (because we use the wire_count 3266 * to keep track of the number of real PTEs in the PTP). 3267 */ 3268 while (startva < endva && (ptp == NULL || ptp->wire_count > 1)) { 3269 (void)pmap_remove_pte(pmap, ptp, pte, startva, pv_tofree); 3270 startva += PAGE_SIZE; 3271 pte++; 3272 } 3273 } 3274 3275 3276 /* 3277 * pmap_remove_pte: remove a single PTE from a PTP. 3278 * 3279 * => caller must hold pmap's lock 3280 * => PTP must be mapped into KVA 3281 * => PTP should be null if pmap == pmap_kernel() 3282 * => returns true if we removed a mapping 3283 * => must be called with kernel preemption disabled 3284 */ 3285 static bool 3286 pmap_remove_pte(struct pmap *pmap, struct vm_page *ptp, pt_entry_t *pte, 3287 vaddr_t va, struct pv_entry **pv_tofree) 3288 { 3289 struct pv_entry *pve; 3290 struct vm_page *pg; 3291 struct pmap_page *pp; 3292 pt_entry_t opte; 3293 3294 KASSERT(pmap == pmap_kernel() || mutex_owned(pmap->pm_lock)); 3295 KASSERT(kpreempt_disabled()); 3296 3297 if (!pmap_valid_entry(*pte)) { 3298 /* VA not mapped. */ 3299 return false; 3300 } 3301 3302 /* Atomically save the old PTE and zap it. */ 3303 opte = pmap_pte_testset(pte, 0); 3304 if (!pmap_valid_entry(opte)) { 3305 return false; 3306 } 3307 3308 pmap_exec_account(pmap, va, opte, 0); 3309 pmap_stats_update_bypte(pmap, 0, opte); 3310 3311 if (ptp) { 3312 /* 3313 * Dropping a PTE. Make sure that the PDE is flushed. 3314 */ 3315 ptp->wire_count--; 3316 if (ptp->wire_count <= 1) { 3317 opte |= PG_U; 3318 } 3319 } 3320 3321 if ((opte & PG_U) != 0) { 3322 pmap_tlb_shootdown(pmap, va, opte, TLBSHOOT_REMOVE_PTE); 3323 } 3324 3325 /* 3326 * If we are not on a pv_head list - we are done. 3327 */ 3328 if ((opte & PG_PVLIST) == 0) { 3329 #if defined(DIAGNOSTIC) && !defined(DOM0OPS) 3330 if (PHYS_TO_VM_PAGE(pmap_pte2pa(opte)) != NULL || 3331 pmap_pv_tracked(pmap_pte2pa(opte)) != NULL) 3332 panic("pmap_remove_pte: managed or pv-tracked page" 3333 " without PG_PVLIST for %#"PRIxVADDR, va); 3334 #endif 3335 return true; 3336 } 3337 3338 if ((pg = PHYS_TO_VM_PAGE(pmap_pte2pa(opte))) != NULL) { 3339 KASSERT(uvm_page_locked_p(pg)); 3340 pp = VM_PAGE_TO_PP(pg); 3341 } else if ((pp = pmap_pv_tracked(pmap_pte2pa(opte))) == NULL) { 3342 paddr_t pa = pmap_pte2pa(opte); 3343 panic("pmap_remove_pte: PG_PVLIST with pv-untracked page" 3344 " va = 0x%"PRIxVADDR 3345 " pa = 0x%"PRIxPADDR" (0x%"PRIxPADDR")", 3346 va, pa, atop(pa)); 3347 } 3348 3349 /* Sync R/M bits. */ 3350 pp->pp_attrs |= opte; 3351 pve = pmap_remove_pv(pp, ptp, va); 3352 3353 if (pve) { 3354 pve->pve_next = *pv_tofree; 3355 *pv_tofree = pve; 3356 } 3357 return true; 3358 } 3359 3360 /* 3361 * pmap_remove: mapping removal function. 
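 * A single-page remove takes a shortcut; larger ranges are walked one
 * PDE-sized block at a time, skipping blocks whose PDE is invalid.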
3362 * 3363 * => caller should not be holding any pmap locks 3364 */ 3365 3366 void 3367 pmap_remove(struct pmap *pmap, vaddr_t sva, vaddr_t eva) 3368 { 3369 pt_entry_t *ptes; 3370 pd_entry_t pde; 3371 pd_entry_t * const *pdes; 3372 struct pv_entry *pv_tofree = NULL; 3373 bool result; 3374 int i; 3375 paddr_t ptppa; 3376 vaddr_t blkendva, va = sva; 3377 struct vm_page *ptp; 3378 struct pmap *pmap2; 3379 3380 kpreempt_disable(); 3381 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); /* locks pmap */ 3382 3383 /* 3384 * removing one page? take shortcut function. 3385 */ 3386 3387 if (va + PAGE_SIZE == eva) { 3388 if (pmap_pdes_valid(va, pdes, &pde)) { 3389 3390 /* PA of the PTP */ 3391 ptppa = pmap_pte2pa(pde); 3392 3393 /* Get PTP if non-kernel mapping. */ 3394 if (pmap != pmap_kernel()) { 3395 ptp = pmap_find_ptp(pmap, va, ptppa, 1); 3396 KASSERTMSG(ptp != NULL, 3397 "pmap_remove: unmanaged PTP detected"); 3398 } else { 3399 /* Never free kernel PTPs. */ 3400 ptp = NULL; 3401 } 3402 3403 result = pmap_remove_pte(pmap, ptp, 3404 &ptes[pl1_i(va)], va, &pv_tofree); 3405 3406 /* 3407 * if mapping removed and the PTP is no longer 3408 * being used, free it! 3409 */ 3410 3411 if (result && ptp && ptp->wire_count <= 1) 3412 pmap_free_ptp(pmap, ptp, va, ptes, pdes); 3413 } 3414 } else for (/* null */ ; va < eva ; va = blkendva) { 3415 int lvl; 3416 3417 /* determine range of block */ 3418 blkendva = x86_round_pdr(va+1); 3419 if (blkendva > eva) 3420 blkendva = eva; 3421 3422 /* 3423 * XXXCDC: our PTE mappings should never be removed 3424 * with pmap_remove! if we allow this (and why would 3425 * we?) then we end up freeing the pmap's page 3426 * directory page (PDP) before we are finished using 3427 * it when we hit in in the recursive mapping. this 3428 * is BAD. 3429 * 3430 * long term solution is to move the PTEs out of user 3431 * address space. and into kernel address space (up 3432 * with APTE). then we can set VM_MAXUSER_ADDRESS to 3433 * be VM_MAX_ADDRESS. 3434 */ 3435 3436 /* XXXCDC: ugly hack to avoid freeing PDP here */ 3437 for (i = 0; i < PDP_SIZE; i++) { 3438 if (pl_i(va, PTP_LEVELS) == PDIR_SLOT_PTE+i) 3439 continue; 3440 } 3441 3442 lvl = pmap_pdes_invalid(va, pdes, &pde); 3443 if (lvl != 0) { 3444 /* 3445 * skip a range corresponding to an invalid pde. 3446 */ 3447 blkendva = (va & ptp_masks[lvl - 1]) + nbpd[lvl - 1]; 3448 continue; 3449 } 3450 3451 /* PA of the PTP */ 3452 ptppa = pmap_pte2pa(pde); 3453 3454 /* Get PTP if non-kernel mapping. */ 3455 if (pmap != pmap_kernel()) { 3456 ptp = pmap_find_ptp(pmap, va, ptppa, 1); 3457 KASSERTMSG(ptp != NULL, 3458 "pmap_remove: unmanaged PTP detected"); 3459 } else { 3460 /* Never free kernel PTPs. */ 3461 ptp = NULL; 3462 } 3463 3464 pmap_remove_ptes(pmap, ptp, (vaddr_t)&ptes[pl1_i(va)], va, 3465 blkendva, &pv_tofree); 3466 3467 /* if PTP is no longer being used, free it! */ 3468 if (ptp && ptp->wire_count <= 1) { 3469 pmap_free_ptp(pmap, ptp, va, ptes, pdes); 3470 } 3471 } 3472 pmap_unmap_ptes(pmap, pmap2); /* unlock pmap */ 3473 kpreempt_enable(); 3474 3475 /* Now we free unused PVs */ 3476 if (pv_tofree) 3477 pmap_free_pvs(pv_tofree); 3478 } 3479 3480 /* 3481 * pmap_sync_pv: clear pte bits and return the old value of the pte. 3482 * 3483 * => Caller should disable kernel preemption. 3484 * => issues tlb shootdowns if necessary. 
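 * => returns EAGAIN if it loses a race with a V->P operation such as
 *    pmap_remove(); the caller is expected to back off and retry.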
3485 */ 3486 3487 static int 3488 pmap_sync_pv(struct pv_pte *pvpte, pt_entry_t expect, int clearbits, 3489 pt_entry_t *optep) 3490 { 3491 struct pmap *pmap; 3492 struct vm_page *ptp; 3493 vaddr_t va; 3494 pt_entry_t *ptep; 3495 pt_entry_t opte; 3496 pt_entry_t npte; 3497 bool need_shootdown; 3498 3499 ptp = pvpte->pte_ptp; 3500 va = pvpte->pte_va; 3501 KASSERT(ptp == NULL || ptp->uobject != NULL); 3502 KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset); 3503 pmap = ptp_to_pmap(ptp); 3504 3505 KASSERT((expect & ~(PG_FRAME | PG_V)) == 0); 3506 KASSERT((expect & PG_V) != 0); 3507 KASSERT(clearbits == ~0 || (clearbits & ~(PG_M | PG_U | PG_RW)) == 0); 3508 KASSERT(kpreempt_disabled()); 3509 3510 ptep = pmap_map_pte(pmap, ptp, va); 3511 do { 3512 opte = *ptep; 3513 KASSERT((opte & (PG_M | PG_U)) != PG_M); 3514 KASSERT((opte & (PG_U | PG_V)) != PG_U); 3515 KASSERT(opte == 0 || (opte & PG_V) != 0); 3516 if ((opte & (PG_FRAME | PG_V)) != expect) { 3517 3518 /* 3519 * we lost a race with a V->P operation like 3520 * pmap_remove(). wait for the competitor 3521 * reflecting pte bits into mp_attrs. 3522 * 3523 * issue a redundant TLB shootdown so that 3524 * we can wait for its completion. 3525 */ 3526 3527 pmap_unmap_pte(); 3528 if (clearbits != 0) { 3529 pmap_tlb_shootdown(pmap, va, 3530 (pmap == pmap_kernel() ? PG_G : 0), 3531 TLBSHOOT_SYNC_PV1); 3532 } 3533 return EAGAIN; 3534 } 3535 3536 /* 3537 * check if there's anything to do on this pte. 3538 */ 3539 3540 if ((opte & clearbits) == 0) { 3541 need_shootdown = false; 3542 break; 3543 } 3544 3545 /* 3546 * we need a shootdown if the pte is cached. (PG_U) 3547 * 3548 * ...unless we are clearing only the PG_RW bit and 3549 * it isn't cached as RW. (PG_M) 3550 */ 3551 3552 need_shootdown = (opte & PG_U) != 0 && 3553 !(clearbits == PG_RW && (opte & PG_M) == 0); 3554 3555 npte = opte & ~clearbits; 3556 3557 /* 3558 * if we need a shootdown anyway, clear PG_U and PG_M. 3559 */ 3560 3561 if (need_shootdown) { 3562 npte &= ~(PG_U | PG_M); 3563 } 3564 KASSERT((npte & (PG_M | PG_U)) != PG_M); 3565 KASSERT((npte & (PG_U | PG_V)) != PG_U); 3566 KASSERT(npte == 0 || (opte & PG_V) != 0); 3567 } while (pmap_pte_cas(ptep, opte, npte) != opte); 3568 3569 if (need_shootdown) { 3570 pmap_tlb_shootdown(pmap, va, opte, TLBSHOOT_SYNC_PV2); 3571 } 3572 pmap_unmap_pte(); 3573 3574 *optep = opte; 3575 return 0; 3576 } 3577 3578 static void 3579 pmap_pp_remove(struct pmap_page *pp, paddr_t pa) 3580 { 3581 struct pv_pte *pvpte; 3582 struct pv_entry *killlist = NULL; 3583 struct vm_page *ptp; 3584 pt_entry_t expect; 3585 int count; 3586 3587 expect = pmap_pa2pte(pa) | PG_V; 3588 count = SPINLOCK_BACKOFF_MIN; 3589 kpreempt_disable(); 3590 startover: 3591 while ((pvpte = pv_pte_first(pp)) != NULL) { 3592 struct pmap *pmap; 3593 struct pv_entry *pve; 3594 pt_entry_t opte; 3595 vaddr_t va; 3596 int error; 3597 3598 /* 3599 * add a reference to the pmap before clearing the pte. 3600 * otherwise the pmap can disappear behind us. 
3601 */ 3602 3603 ptp = pvpte->pte_ptp; 3604 pmap = ptp_to_pmap(ptp); 3605 if (ptp != NULL) { 3606 pmap_reference(pmap); 3607 } 3608 3609 error = pmap_sync_pv(pvpte, expect, ~0, &opte); 3610 if (error == EAGAIN) { 3611 int hold_count; 3612 KERNEL_UNLOCK_ALL(curlwp, &hold_count); 3613 if (ptp != NULL) { 3614 pmap_destroy(pmap); 3615 } 3616 SPINLOCK_BACKOFF(count); 3617 KERNEL_LOCK(hold_count, curlwp); 3618 goto startover; 3619 } 3620 3621 pp->pp_attrs |= opte; 3622 va = pvpte->pte_va; 3623 pve = pmap_remove_pv(pp, ptp, va); 3624 3625 /* update the PTP reference count. free if last reference. */ 3626 if (ptp != NULL) { 3627 struct pmap *pmap2; 3628 pt_entry_t *ptes; 3629 pd_entry_t * const *pdes; 3630 3631 KASSERT(pmap != pmap_kernel()); 3632 3633 pmap_tlb_shootnow(); 3634 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); 3635 pmap_stats_update_bypte(pmap, 0, opte); 3636 ptp->wire_count--; 3637 if (ptp->wire_count <= 1) { 3638 pmap_free_ptp(pmap, ptp, va, ptes, pdes); 3639 } 3640 pmap_unmap_ptes(pmap, pmap2); 3641 pmap_destroy(pmap); 3642 } else { 3643 KASSERT(pmap == pmap_kernel()); 3644 pmap_stats_update_bypte(pmap, 0, opte); 3645 } 3646 3647 if (pve != NULL) { 3648 pve->pve_next = killlist; /* mark it for death */ 3649 killlist = pve; 3650 } 3651 } 3652 pmap_tlb_shootnow(); 3653 kpreempt_enable(); 3654 3655 /* Now free unused pvs. */ 3656 pmap_free_pvs(killlist); 3657 } 3658 3659 /* 3660 * pmap_page_remove: remove a managed vm_page from all pmaps that map it 3661 * 3662 * => R/M bits are sync'd back to attrs 3663 */ 3664 3665 void 3666 pmap_page_remove(struct vm_page *pg) 3667 { 3668 struct pmap_page *pp; 3669 paddr_t pa; 3670 3671 KASSERT(uvm_page_locked_p(pg)); 3672 3673 pp = VM_PAGE_TO_PP(pg); 3674 pa = VM_PAGE_TO_PHYS(pg); 3675 pmap_pp_remove(pp, pa); 3676 } 3677 3678 /* 3679 * pmap_pv_remove: remove an unmanaged pv-tracked page from all pmaps 3680 * that map it 3681 */ 3682 3683 void 3684 pmap_pv_remove(paddr_t pa) 3685 { 3686 struct pmap_page *pp; 3687 3688 pp = pmap_pv_tracked(pa); 3689 if (pp == NULL) 3690 panic("pmap_pv_protect: page not pv-tracked: 0x%"PRIxPADDR, 3691 pa); 3692 pmap_pp_remove(pp, pa); 3693 } 3694 3695 /* 3696 * p m a p a t t r i b u t e f u n c t i o n s 3697 * functions that test/change managed page's attributes 3698 * since a page can be mapped multiple times we must check each PTE that 3699 * maps it by going down the pv lists. 3700 */ 3701 3702 /* 3703 * pmap_test_attrs: test a page's attributes 3704 */ 3705 3706 bool 3707 pmap_test_attrs(struct vm_page *pg, unsigned testbits) 3708 { 3709 struct pmap_page *pp; 3710 struct pv_pte *pvpte; 3711 pt_entry_t expect; 3712 u_int result; 3713 3714 KASSERT(uvm_page_locked_p(pg)); 3715 3716 pp = VM_PAGE_TO_PP(pg); 3717 if ((pp->pp_attrs & testbits) != 0) { 3718 return true; 3719 } 3720 expect = pmap_pa2pte(VM_PAGE_TO_PHYS(pg)) | PG_V; 3721 kpreempt_disable(); 3722 for (pvpte = pv_pte_first(pp); pvpte; pvpte = pv_pte_next(pp, pvpte)) { 3723 pt_entry_t opte; 3724 int error; 3725 3726 if ((pp->pp_attrs & testbits) != 0) { 3727 break; 3728 } 3729 error = pmap_sync_pv(pvpte, expect, 0, &opte); 3730 if (error == 0) { 3731 pp->pp_attrs |= opte; 3732 } 3733 } 3734 result = pp->pp_attrs & testbits; 3735 kpreempt_enable(); 3736 3737 /* 3738 * note that we will exit the for loop with a non-null pve if 3739 * we have found the bits we are testing for. 
3740 */ 3741 3742 return result != 0; 3743 } 3744 3745 static bool 3746 pmap_pp_clear_attrs(struct pmap_page *pp, paddr_t pa, unsigned clearbits) 3747 { 3748 struct pv_pte *pvpte; 3749 u_int result; 3750 pt_entry_t expect; 3751 int count; 3752 3753 expect = pmap_pa2pte(pa) | PG_V; 3754 count = SPINLOCK_BACKOFF_MIN; 3755 kpreempt_disable(); 3756 startover: 3757 for (pvpte = pv_pte_first(pp); pvpte; pvpte = pv_pte_next(pp, pvpte)) { 3758 pt_entry_t opte; 3759 int error; 3760 3761 error = pmap_sync_pv(pvpte, expect, clearbits, &opte); 3762 if (error == EAGAIN) { 3763 int hold_count; 3764 KERNEL_UNLOCK_ALL(curlwp, &hold_count); 3765 SPINLOCK_BACKOFF(count); 3766 KERNEL_LOCK(hold_count, curlwp); 3767 goto startover; 3768 } 3769 pp->pp_attrs |= opte; 3770 } 3771 result = pp->pp_attrs & clearbits; 3772 pp->pp_attrs &= ~clearbits; 3773 pmap_tlb_shootnow(); 3774 kpreempt_enable(); 3775 3776 return result != 0; 3777 } 3778 3779 /* 3780 * pmap_clear_attrs: clear the specified attribute for a page. 3781 * 3782 * => we return true if we cleared one of the bits we were asked to 3783 */ 3784 3785 bool 3786 pmap_clear_attrs(struct vm_page *pg, unsigned clearbits) 3787 { 3788 struct pmap_page *pp; 3789 paddr_t pa; 3790 3791 KASSERT(uvm_page_locked_p(pg)); 3792 3793 pp = VM_PAGE_TO_PP(pg); 3794 pa = VM_PAGE_TO_PHYS(pg); 3795 3796 return pmap_pp_clear_attrs(pp, pa, clearbits); 3797 } 3798 3799 /* 3800 * pmap_pv_clear_attrs: clear the specified attributes for an unmanaged 3801 * pv-tracked page. 3802 */ 3803 3804 bool 3805 pmap_pv_clear_attrs(paddr_t pa, unsigned clearbits) 3806 { 3807 struct pmap_page *pp; 3808 3809 pp = pmap_pv_tracked(pa); 3810 if (pp == NULL) 3811 panic("pmap_pv_protect: page not pv-tracked: 0x%"PRIxPADDR, 3812 pa); 3813 3814 return pmap_pp_clear_attrs(pp, pa, clearbits); 3815 } 3816 3817 /* 3818 * p m a p p r o t e c t i o n f u n c t i o n s 3819 */ 3820 3821 /* 3822 * pmap_page_protect: change the protection of all recorded mappings 3823 * of a managed page 3824 * 3825 * => NOTE: this is an inline function in pmap.h 3826 */ 3827 3828 /* see pmap.h */ 3829 3830 /* 3831 * pmap_pv_protect: change the protection of all recorded mappings 3832 * of an unmanaged pv-tracked page 3833 * 3834 * => NOTE: this is an inline function in pmap.h 3835 */ 3836 3837 /* see pmap.h */ 3838 3839 /* 3840 * pmap_protect: set the protection in of the pages in a pmap 3841 * 3842 * => NOTE: this is an inline function in pmap.h 3843 */ 3844 3845 /* see pmap.h */ 3846 3847 /* 3848 * pmap_write_protect: write-protect pages in a pmap. 3849 */ 3850 void 3851 pmap_write_protect(struct pmap *pmap, vaddr_t sva, vaddr_t eva, vm_prot_t prot) 3852 { 3853 pt_entry_t *ptes; 3854 pt_entry_t * const *pdes; 3855 struct pmap *pmap2; 3856 vaddr_t blockend, va; 3857 3858 KASSERT(curlwp->l_md.md_gc_pmap != pmap); 3859 3860 sva &= PG_FRAME; 3861 eva &= PG_FRAME; 3862 3863 /* Acquire pmap. */ 3864 kpreempt_disable(); 3865 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); 3866 3867 for (va = sva ; va < eva ; va = blockend) { 3868 pt_entry_t *spte, *epte; 3869 int i; 3870 3871 blockend = x86_round_pdr(va + 1); 3872 if (blockend > eva) 3873 blockend = eva; 3874 3875 /* 3876 * XXXCDC: our PTE mappings should never be write-protected! 3877 * 3878 * long term solution is to move the PTEs out of user 3879 * address space. and into kernel address space (up 3880 * with APTE). then we can set VM_MAXUSER_ADDRESS to 3881 * be VM_MAX_ADDRESS. 
3882 */ 3883 3884 /* XXXCDC: ugly hack to avoid freeing PDP here */ 3885 for (i = 0; i < PDP_SIZE; i++) { 3886 if (pl_i(va, PTP_LEVELS) == PDIR_SLOT_PTE+i) 3887 continue; 3888 } 3889 3890 /* Is it a valid block? */ 3891 if (!pmap_pdes_valid(va, pdes, NULL)) { 3892 continue; 3893 } 3894 KASSERT(va < VM_MAXUSER_ADDRESS || va >= VM_MAX_ADDRESS); 3895 3896 spte = &ptes[pl1_i(va)]; 3897 epte = &ptes[pl1_i(blockend)]; 3898 3899 for (/*null */; spte < epte ; spte++) { 3900 pt_entry_t opte, npte; 3901 3902 do { 3903 opte = *spte; 3904 if ((~opte & (PG_RW | PG_V)) != 0) { 3905 goto next; 3906 } 3907 npte = opte & ~PG_RW; 3908 } while (pmap_pte_cas(spte, opte, npte) != opte); 3909 3910 if ((opte & PG_M) != 0) { 3911 vaddr_t tva = x86_ptob(spte - ptes); 3912 pmap_tlb_shootdown(pmap, tva, opte, 3913 TLBSHOOT_WRITE_PROTECT); 3914 } 3915 next:; 3916 } 3917 } 3918 3919 /* Release pmap. */ 3920 pmap_unmap_ptes(pmap, pmap2); 3921 kpreempt_enable(); 3922 } 3923 3924 /* 3925 * pmap_unwire: clear the wired bit in the PTE. 3926 * 3927 * => Mapping should already be present. 3928 */ 3929 void 3930 pmap_unwire(struct pmap *pmap, vaddr_t va) 3931 { 3932 pt_entry_t *ptes, *ptep, opte; 3933 pd_entry_t * const *pdes; 3934 struct pmap *pmap2; 3935 3936 /* Acquire pmap. */ 3937 kpreempt_disable(); 3938 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); 3939 3940 if (!pmap_pdes_valid(va, pdes, NULL)) { 3941 panic("pmap_unwire: invalid PDE"); 3942 } 3943 3944 ptep = &ptes[pl1_i(va)]; 3945 opte = *ptep; 3946 KASSERT(pmap_valid_entry(opte)); 3947 3948 if (opte & PG_W) { 3949 pt_entry_t npte = opte & ~PG_W; 3950 3951 opte = pmap_pte_testset(ptep, npte); 3952 pmap_stats_update_bypte(pmap, npte, opte); 3953 } else { 3954 printf("pmap_unwire: wiring for pmap %p va 0x%lx " 3955 "did not change!\n", pmap, va); 3956 } 3957 3958 /* Release pmap. */ 3959 pmap_unmap_ptes(pmap, pmap2); 3960 kpreempt_enable(); 3961 } 3962 3963 /* 3964 * pmap_copy: copy mappings from one pmap to another 3965 * 3966 * => optional function 3967 * void pmap_copy(dst_pmap, src_pmap, dst_addr, len, src_addr) 3968 */ 3969 3970 /* 3971 * defined as macro in pmap.h 3972 */ 3973 3974 __strict_weak_alias(pmap_enter, pmap_enter_default); 3975 3976 int 3977 pmap_enter_default(pmap_t pmap, vaddr_t va, paddr_t pa, vm_prot_t prot, 3978 u_int flags) 3979 { 3980 return pmap_enter_ma(pmap, va, pa, pa, prot, flags, 0); 3981 } 3982 3983 /* 3984 * pmap_enter: enter a mapping into a pmap 3985 * 3986 * => must be done "now" ... 
no lazy-evaluation 3987 * => we set pmap => pv_head locking 3988 */ 3989 int 3990 pmap_enter_ma(struct pmap *pmap, vaddr_t va, paddr_t ma, paddr_t pa, 3991 vm_prot_t prot, u_int flags, int domid) 3992 { 3993 pt_entry_t *ptes, opte, npte; 3994 pt_entry_t *ptep; 3995 pd_entry_t * const *pdes; 3996 struct vm_page *ptp; 3997 struct vm_page *new_pg, *old_pg; 3998 struct pmap_page *new_pp, *old_pp; 3999 struct pv_entry *old_pve = NULL; 4000 struct pv_entry *new_pve; 4001 struct pv_entry *new_pve2; 4002 int error; 4003 bool wired = (flags & PMAP_WIRED) != 0; 4004 struct pmap *pmap2; 4005 4006 KASSERT(pmap_initialized); 4007 KASSERT(curlwp->l_md.md_gc_pmap != pmap); 4008 KASSERT(va < VM_MAX_KERNEL_ADDRESS); 4009 KASSERTMSG(va != (vaddr_t)PDP_BASE, 4010 "pmap_enter: trying to map over PDP!"); 4011 KASSERTMSG(va < VM_MIN_KERNEL_ADDRESS || 4012 pmap_valid_entry(pmap->pm_pdir[pl_i(va, PTP_LEVELS)]), 4013 "pmap_enter: missing kernel PTP for VA %lx!", va); 4014 4015 #ifdef XEN 4016 KASSERT(domid == DOMID_SELF || pa == 0); 4017 #endif /* XEN */ 4018 4019 npte = ma | protection_codes[prot] | PG_V; 4020 npte |= pmap_pat_flags(flags); 4021 if (wired) 4022 npte |= PG_W; 4023 if (va < VM_MAXUSER_ADDRESS) 4024 npte |= PG_u; 4025 else if (va < VM_MAX_ADDRESS) 4026 npte |= (PG_u | PG_RW); /* XXXCDC: no longer needed? */ 4027 else 4028 npte |= PG_k; 4029 if (pmap == pmap_kernel()) 4030 npte |= pmap_pg_g; 4031 if (flags & VM_PROT_ALL) { 4032 npte |= PG_U; 4033 if (flags & VM_PROT_WRITE) { 4034 KASSERT((npte & PG_RW) != 0); 4035 npte |= PG_M; 4036 } 4037 } 4038 4039 #ifdef XEN 4040 if (domid != DOMID_SELF) 4041 new_pg = NULL; 4042 else 4043 #endif 4044 new_pg = PHYS_TO_VM_PAGE(pa); 4045 if (new_pg != NULL) { 4046 /* This is a managed page */ 4047 npte |= PG_PVLIST; 4048 new_pp = VM_PAGE_TO_PP(new_pg); 4049 } else if ((new_pp = pmap_pv_tracked(pa)) != NULL) { 4050 /* This is an unmanaged pv-tracked page */ 4051 npte |= PG_PVLIST; 4052 } else { 4053 new_pp = NULL; 4054 } 4055 4056 /* get pves. */ 4057 new_pve = pool_cache_get(&pmap_pv_cache, PR_NOWAIT); 4058 new_pve2 = pool_cache_get(&pmap_pv_cache, PR_NOWAIT); 4059 if (new_pve == NULL || new_pve2 == NULL) { 4060 if (flags & PMAP_CANFAIL) { 4061 error = ENOMEM; 4062 goto out2; 4063 } 4064 panic("pmap_enter: pve allocation failed"); 4065 } 4066 4067 kpreempt_disable(); 4068 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); /* locks pmap */ 4069 if (pmap == pmap_kernel()) { 4070 ptp = NULL; 4071 } else { 4072 ptp = pmap_get_ptp(pmap, va, pdes); 4073 if (ptp == NULL) { 4074 pmap_unmap_ptes(pmap, pmap2); 4075 if (flags & PMAP_CANFAIL) { 4076 error = ENOMEM; 4077 goto out; 4078 } 4079 panic("pmap_enter: get ptp failed"); 4080 } 4081 } 4082 4083 /* 4084 * update the pte. 4085 */ 4086 4087 ptep = &ptes[pl1_i(va)]; 4088 do { 4089 opte = *ptep; 4090 4091 /* 4092 * if the same page, inherit PG_U and PG_M. 
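		 * (i.e. keep the accumulated referenced/modified bits when
		 * an existing mapping to the same page is re-entered)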
4093 */ 4094 if (((opte ^ npte) & (PG_FRAME | PG_V)) == 0) { 4095 npte |= opte & (PG_U | PG_M); 4096 } 4097 #if defined(XEN) 4098 if (domid != DOMID_SELF) { 4099 /* pmap_pte_cas with error handling */ 4100 int s = splvm(); 4101 if (opte != *ptep) { 4102 splx(s); 4103 continue; 4104 } 4105 error = xpq_update_foreign( 4106 vtomach((vaddr_t)ptep), npte, domid); 4107 splx(s); 4108 if (error) { 4109 if (ptp != NULL && ptp->wire_count <= 1) { 4110 pmap_free_ptp(pmap, ptp, va, ptes, pdes); 4111 } 4112 pmap_unmap_ptes(pmap, pmap2); 4113 goto out; 4114 } 4115 break; 4116 } 4117 #endif /* defined(XEN) */ 4118 } while (pmap_pte_cas(ptep, opte, npte) != opte); 4119 4120 /* 4121 * update statistics and PTP's reference count. 4122 */ 4123 4124 pmap_stats_update_bypte(pmap, npte, opte); 4125 if (ptp != NULL && !pmap_valid_entry(opte)) { 4126 ptp->wire_count++; 4127 } 4128 KASSERT(ptp == NULL || ptp->wire_count > 1); 4129 4130 /* 4131 * if the same page, we can skip pv_entry handling. 4132 */ 4133 4134 if (((opte ^ npte) & (PG_FRAME | PG_V)) == 0) { 4135 KASSERT(((opte ^ npte) & PG_PVLIST) == 0); 4136 goto same_pa; 4137 } 4138 4139 /* 4140 * if old page is pv-tracked, remove pv_entry from its list. 4141 */ 4142 4143 if ((~opte & (PG_V | PG_PVLIST)) == 0) { 4144 if ((old_pg = PHYS_TO_VM_PAGE(pmap_pte2pa(opte))) != NULL) { 4145 KASSERT(uvm_page_locked_p(old_pg)); 4146 old_pp = VM_PAGE_TO_PP(old_pg); 4147 } else if ((old_pp = pmap_pv_tracked(pmap_pte2pa(opte))) 4148 == NULL) { 4149 pa = pmap_pte2pa(opte); 4150 panic("pmap_enter: PG_PVLIST with pv-untracked page" 4151 " va = 0x%"PRIxVADDR 4152 " pa = 0x%" PRIxPADDR " (0x%" PRIxPADDR ")", 4153 va, pa, atop(pa)); 4154 } 4155 4156 old_pve = pmap_remove_pv(old_pp, ptp, va); 4157 old_pp->pp_attrs |= opte; 4158 } 4159 4160 /* 4161 * if new page is pv-tracked, insert pv_entry into its list. 4162 */ 4163 4164 if (new_pp) { 4165 new_pve = pmap_enter_pv(new_pp, new_pve, &new_pve2, ptp, va); 4166 } 4167 4168 same_pa: 4169 pmap_unmap_ptes(pmap, pmap2); 4170 4171 /* 4172 * shootdown tlb if necessary. 4173 */ 4174 4175 if ((~opte & (PG_V | PG_U)) == 0 && 4176 ((opte ^ npte) & (PG_FRAME | PG_RW)) != 0) { 4177 pmap_tlb_shootdown(pmap, va, opte, TLBSHOOT_ENTER); 4178 } 4179 4180 error = 0; 4181 out: 4182 kpreempt_enable(); 4183 out2: 4184 if (old_pve != NULL) { 4185 pool_cache_put(&pmap_pv_cache, old_pve); 4186 } 4187 if (new_pve != NULL) { 4188 pool_cache_put(&pmap_pv_cache, new_pve); 4189 } 4190 if (new_pve2 != NULL) { 4191 pool_cache_put(&pmap_pv_cache, new_pve2); 4192 } 4193 4194 return error; 4195 } 4196 4197 static bool 4198 pmap_get_physpage(vaddr_t va, int level, paddr_t *paddrp) 4199 { 4200 struct vm_page *ptp; 4201 struct pmap *kpm = pmap_kernel(); 4202 4203 if (!uvm.page_init_done) { 4204 4205 /* 4206 * we're growing the kernel pmap early (from 4207 * uvm_pageboot_alloc()). this case must be 4208 * handled a little differently. 
4209 */ 4210 4211 if (!uvm_page_physget(paddrp)) 4212 panic("pmap_get_physpage: out of memory"); 4213 #if defined(__HAVE_DIRECT_MAP) 4214 pagezero(PMAP_DIRECT_MAP(*paddrp)); 4215 #else 4216 #if defined(XEN) 4217 if (XEN_VERSION_SUPPORTED(3, 4)) { 4218 xen_pagezero(*paddrp); 4219 return true; 4220 } 4221 #endif 4222 kpreempt_disable(); 4223 pmap_pte_set(early_zero_pte, 4224 pmap_pa2pte(*paddrp) | PG_V | PG_RW | PG_k); 4225 pmap_pte_flush(); 4226 pmap_update_pg((vaddr_t)early_zerop); 4227 memset(early_zerop, 0, PAGE_SIZE); 4228 #if defined(DIAGNOSTIC) || defined(XEN) 4229 pmap_pte_set(early_zero_pte, 0); 4230 pmap_pte_flush(); 4231 #endif /* defined(DIAGNOSTIC) */ 4232 kpreempt_enable(); 4233 #endif /* defined(__HAVE_DIRECT_MAP) */ 4234 } else { 4235 /* XXX */ 4236 ptp = uvm_pagealloc(NULL, 0, NULL, 4237 UVM_PGA_USERESERVE|UVM_PGA_ZERO); 4238 if (ptp == NULL) 4239 panic("pmap_get_physpage: out of memory"); 4240 ptp->flags &= ~PG_BUSY; 4241 ptp->wire_count = 1; 4242 *paddrp = VM_PAGE_TO_PHYS(ptp); 4243 } 4244 pmap_stats_update(kpm, 1, 0); 4245 return true; 4246 } 4247 4248 /* 4249 * Allocate the amount of specified ptps for a ptp level, and populate 4250 * all levels below accordingly, mapping virtual addresses starting at 4251 * kva. 4252 * 4253 * Used by pmap_growkernel. 4254 */ 4255 static void 4256 pmap_alloc_level(pd_entry_t * const *pdes, vaddr_t kva, int lvl, 4257 long *needed_ptps) 4258 { 4259 unsigned long i; 4260 vaddr_t va; 4261 paddr_t pa; 4262 unsigned long index, endindex; 4263 int level; 4264 pd_entry_t *pdep; 4265 #ifdef XEN 4266 int s = splvm(); /* protect xpq_* */ 4267 #endif 4268 4269 for (level = lvl; level > 1; level--) { 4270 if (level == PTP_LEVELS) 4271 pdep = pmap_kernel()->pm_pdir; 4272 else 4273 pdep = pdes[level - 2]; 4274 va = kva; 4275 index = pl_i_roundup(kva, level); 4276 endindex = index + needed_ptps[level - 1] - 1; 4277 4278 4279 for (i = index; i <= endindex; i++) { 4280 pt_entry_t pte; 4281 4282 KASSERT(!pmap_valid_entry(pdep[i])); 4283 pmap_get_physpage(va, level - 1, &pa); 4284 pte = pmap_pa2pte(pa) | PG_k | PG_V | PG_RW; 4285 #ifdef XEN 4286 pmap_pte_set(&pdep[i], pte); 4287 #if defined(PAE) || defined(__x86_64__) 4288 if (level == PTP_LEVELS && i >= PDIR_SLOT_KERN) { 4289 if (__predict_true( 4290 cpu_info_primary.ci_flags & CPUF_PRESENT)) { 4291 /* update per-cpu PMDs on all cpus */ 4292 xen_kpm_sync(pmap_kernel(), i); 4293 } else { 4294 /* 4295 * too early; update primary CPU 4296 * PMD only (without locks) 4297 */ 4298 #ifdef PAE 4299 pd_entry_t *cpu_pdep = 4300 &cpu_info_primary.ci_kpm_pdir[l2tol2(i)]; 4301 #endif 4302 #ifdef __x86_64__ 4303 pd_entry_t *cpu_pdep = 4304 &cpu_info_primary.ci_kpm_pdir[i]; 4305 #endif 4306 pmap_pte_set(cpu_pdep, pte); 4307 } 4308 } 4309 #endif /* PAE || __x86_64__ */ 4310 #else /* XEN */ 4311 pdep[i] = pte; 4312 #endif /* XEN */ 4313 KASSERT(level != PTP_LEVELS || nkptp[level - 1] + 4314 pl_i(VM_MIN_KERNEL_ADDRESS, level) == i); 4315 nkptp[level - 1]++; 4316 va += nbpd[level - 1]; 4317 } 4318 pmap_pte_flush(); 4319 } 4320 #ifdef XEN 4321 splx(s); 4322 #endif 4323 } 4324 4325 /* 4326 * pmap_growkernel: increase usage of KVM space 4327 * 4328 * => we allocate new PTPs for the kernel and install them in all 4329 * the pmaps on the system. 
4330 */ 4331 4332 vaddr_t 4333 pmap_growkernel(vaddr_t maxkvaddr) 4334 { 4335 struct pmap *kpm = pmap_kernel(); 4336 #if !defined(XEN) || !defined(__x86_64__) 4337 struct pmap *pm; 4338 long old; 4339 #endif 4340 int s, i; 4341 long needed_kptp[PTP_LEVELS], target_nptp; 4342 bool invalidate = false; 4343 4344 s = splvm(); /* to be safe */ 4345 mutex_enter(kpm->pm_lock); 4346 4347 if (maxkvaddr <= pmap_maxkvaddr) { 4348 mutex_exit(kpm->pm_lock); 4349 splx(s); 4350 return pmap_maxkvaddr; 4351 } 4352 4353 maxkvaddr = x86_round_pdr(maxkvaddr); 4354 #if !defined(XEN) || !defined(__x86_64__) 4355 old = nkptp[PTP_LEVELS - 1]; 4356 #endif 4357 4358 /* 4359 * This loop could be optimized more, but pmap_growkernel() 4360 * is called infrequently. 4361 */ 4362 for (i = PTP_LEVELS - 1; i >= 1; i--) { 4363 target_nptp = pl_i_roundup(maxkvaddr, i + 1) - 4364 pl_i_roundup(VM_MIN_KERNEL_ADDRESS, i + 1); 4365 /* 4366 * XXX only need to check toplevel. 4367 */ 4368 if (target_nptp > nkptpmax[i]) 4369 panic("out of KVA space"); 4370 KASSERT(target_nptp >= nkptp[i]); 4371 needed_kptp[i] = target_nptp - nkptp[i]; 4372 } 4373 4374 pmap_alloc_level(normal_pdes, pmap_maxkvaddr, PTP_LEVELS, needed_kptp); 4375 4376 /* 4377 * If the number of top level entries changed, update all 4378 * pmaps. 4379 */ 4380 if (needed_kptp[PTP_LEVELS - 1] != 0) { 4381 #ifdef XEN 4382 #ifdef __x86_64__ 4383 /* nothing, kernel entries are never entered in user pmap */ 4384 #else /* __x86_64__ */ 4385 mutex_enter(&pmaps_lock); 4386 LIST_FOREACH(pm, &pmaps, pm_list) { 4387 int pdkidx; 4388 for (pdkidx = PDIR_SLOT_KERN + old; 4389 pdkidx < PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1]; 4390 pdkidx++) { 4391 pmap_pte_set(&pm->pm_pdir[pdkidx], 4392 kpm->pm_pdir[pdkidx]); 4393 } 4394 pmap_pte_flush(); 4395 } 4396 mutex_exit(&pmaps_lock); 4397 #endif /* __x86_64__ */ 4398 #else /* XEN */ 4399 unsigned newpdes; 4400 newpdes = nkptp[PTP_LEVELS - 1] - old; 4401 mutex_enter(&pmaps_lock); 4402 LIST_FOREACH(pm, &pmaps, pm_list) { 4403 memcpy(&pm->pm_pdir[PDIR_SLOT_KERN + old], 4404 &kpm->pm_pdir[PDIR_SLOT_KERN + old], 4405 newpdes * sizeof (pd_entry_t)); 4406 } 4407 mutex_exit(&pmaps_lock); 4408 #endif 4409 invalidate = true; 4410 } 4411 pmap_maxkvaddr = maxkvaddr; 4412 mutex_exit(kpm->pm_lock); 4413 splx(s); 4414 4415 if (invalidate && pmap_initialized) { 4416 /* Invalidate the PDP cache. */ 4417 pool_cache_invalidate(&pmap_pdp_cache); 4418 } 4419 4420 return maxkvaddr; 4421 } 4422 4423 #ifdef DEBUG 4424 void pmap_dump(struct pmap *, vaddr_t, vaddr_t); 4425 4426 /* 4427 * pmap_dump: dump all the mappings from a pmap 4428 * 4429 * => caller should not be holding any pmap locks 4430 */ 4431 4432 void 4433 pmap_dump(struct pmap *pmap, vaddr_t sva, vaddr_t eva) 4434 { 4435 pt_entry_t *ptes, *pte; 4436 pd_entry_t * const *pdes; 4437 struct pmap *pmap2; 4438 vaddr_t blkendva; 4439 4440 /* 4441 * if end is out of range truncate. 4442 * if (end == start) update to max. 4443 */ 4444 4445 if (eva > VM_MAXUSER_ADDRESS || eva <= sva) 4446 eva = VM_MAXUSER_ADDRESS; 4447 4448 /* 4449 * we lock in the pmap => pv_head direction 4450 */ 4451 4452 kpreempt_disable(); 4453 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); /* locks pmap */ 4454 4455 /* 4456 * dumping a range of pages: we dump in PTP sized blocks (4MB) 4457 */ 4458 4459 for (/* null */ ; sva < eva ; sva = blkendva) { 4460 4461 /* determine range of block */ 4462 blkendva = x86_round_pdr(sva+1); 4463 if (blkendva > eva) 4464 blkendva = eva; 4465 4466 /* valid block? 
*/ 4467 if (!pmap_pdes_valid(sva, pdes, NULL)) 4468 continue; 4469 4470 pte = &ptes[pl1_i(sva)]; 4471 for (/* null */; sva < blkendva ; sva += PAGE_SIZE, pte++) { 4472 if (!pmap_valid_entry(*pte)) 4473 continue; 4474 printf("va %#" PRIxVADDR " -> pa %#" PRIxPADDR 4475 " (pte=%#" PRIxPADDR ")\n", 4476 sva, (paddr_t)pmap_pte2pa(*pte), (paddr_t)*pte); 4477 } 4478 } 4479 pmap_unmap_ptes(pmap, pmap2); 4480 kpreempt_enable(); 4481 } 4482 #endif 4483 4484 /* 4485 * pmap_update: process deferred invalidations and frees. 4486 */ 4487 4488 void 4489 pmap_update(struct pmap *pmap) 4490 { 4491 struct vm_page *empty_ptps; 4492 lwp_t *l = curlwp; 4493 4494 /* 4495 * If we have torn down this pmap, invalidate non-global TLB 4496 * entries on any processors using it. 4497 */ 4498 kpreempt_disable(); 4499 if (__predict_false(l->l_md.md_gc_pmap == pmap)) { 4500 l->l_md.md_gc_pmap = NULL; 4501 pmap_tlb_shootdown(pmap, (vaddr_t)-1LL, 0, TLBSHOOT_UPDATE); 4502 } 4503 /* 4504 * Initiate any pending TLB shootdowns. Wait for them to 4505 * complete before returning control to the caller. 4506 */ 4507 pmap_tlb_shootnow(); 4508 kpreempt_enable(); 4509 4510 /* 4511 * Now that shootdowns are complete, process deferred frees, 4512 * but not from interrupt context. 4513 */ 4514 if (l->l_md.md_gc_ptp != NULL) { 4515 KASSERT((l->l_pflag & LP_INTR) == 0); 4516 if (cpu_intr_p()) { 4517 return; 4518 } 4519 empty_ptps = l->l_md.md_gc_ptp; 4520 l->l_md.md_gc_ptp = NULL; 4521 pmap_free_ptps(empty_ptps); 4522 } 4523 } 4524 4525 #if PTP_LEVELS > 4 4526 #error "Unsupported number of page table mappings" 4527 #endif 4528 4529 paddr_t 4530 pmap_init_tmp_pgtbl(paddr_t pg) 4531 { 4532 static bool maps_loaded; 4533 static const paddr_t x86_tmp_pml_paddr[] = { 4534 4 * PAGE_SIZE, /* L1 */ 4535 5 * PAGE_SIZE, /* L2 */ 4536 6 * PAGE_SIZE, /* L3 */ 4537 7 * PAGE_SIZE /* L4 */ 4538 }; 4539 static vaddr_t x86_tmp_pml_vaddr[] = { 0, 0, 0, 0 }; 4540 4541 pd_entry_t *tmp_pml, *kernel_pml; 4542 4543 int level; 4544 4545 if (!maps_loaded) { 4546 for (level = 0; level < PTP_LEVELS; ++level) { 4547 x86_tmp_pml_vaddr[level] = 4548 uvm_km_alloc(kernel_map, PAGE_SIZE, 0, 4549 UVM_KMF_VAONLY); 4550 4551 if (x86_tmp_pml_vaddr[level] == 0) 4552 panic("mapping of real mode PML failed\n"); 4553 pmap_kenter_pa(x86_tmp_pml_vaddr[level], 4554 x86_tmp_pml_paddr[level], 4555 VM_PROT_READ | VM_PROT_WRITE, 0); 4556 pmap_update(pmap_kernel()); 4557 } 4558 maps_loaded = true; 4559 } 4560 4561 /* Zero levels 1-3 */ 4562 for (level = 0; level < PTP_LEVELS - 1; ++level) { 4563 tmp_pml = (void *)x86_tmp_pml_vaddr[level]; 4564 memset(tmp_pml, 0, PAGE_SIZE); 4565 } 4566 4567 /* Copy PML4 */ 4568 kernel_pml = pmap_kernel()->pm_pdir; 4569 tmp_pml = (void *)x86_tmp_pml_vaddr[PTP_LEVELS - 1]; 4570 memcpy(tmp_pml, kernel_pml, PAGE_SIZE); 4571 4572 #ifdef PAE 4573 /* 4574 * Use the last 4 entries of the L2 page as L3 PD entries. These 4575 * last entries are unlikely to be used for temporary mappings. 
4576 * 508: maps 0->1GB (userland) 4577 * 509: unused 4578 * 510: unused 4579 * 511: maps 3->4GB (kernel) 4580 */ 4581 tmp_pml[508] = x86_tmp_pml_paddr[PTP_LEVELS - 1] | PG_V; 4582 tmp_pml[509] = 0; 4583 tmp_pml[510] = 0; 4584 tmp_pml[511] = pmap_pdirpa(pmap_kernel(), PDIR_SLOT_KERN) | PG_V; 4585 #endif 4586 4587 for (level = PTP_LEVELS - 1; level > 0; --level) { 4588 tmp_pml = (void *)x86_tmp_pml_vaddr[level]; 4589 4590 tmp_pml[pl_i(pg, level + 1)] = 4591 (x86_tmp_pml_paddr[level - 1] & PG_FRAME) | PG_RW | PG_V; 4592 } 4593 4594 tmp_pml = (void *)x86_tmp_pml_vaddr[0]; 4595 tmp_pml[pl_i(pg, 1)] = (pg & PG_FRAME) | PG_RW | PG_V; 4596 4597 #ifdef PAE 4598 /* Return the PA of the L3 page (entry 508 of the L2 page) */ 4599 return x86_tmp_pml_paddr[PTP_LEVELS - 1] + 508 * sizeof(pd_entry_t); 4600 #endif 4601 4602 return x86_tmp_pml_paddr[PTP_LEVELS - 1]; 4603 } 4604 4605 u_int 4606 x86_mmap_flags(paddr_t mdpgno) 4607 { 4608 u_int nflag = (mdpgno >> X86_MMAP_FLAG_SHIFT) & X86_MMAP_FLAG_MASK; 4609 u_int pflag = 0; 4610 4611 if (nflag & X86_MMAP_FLAG_PREFETCH) 4612 pflag |= PMAP_WRITE_COMBINE; 4613 4614 return pflag; 4615 } 4616
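/*
 * Minimal sketch of the lock-free PTE update pattern used by
 * pmap_write_protect() and pmap_enter_ma() above: read the PTE, derive the
 * new value, and retry the compare-and-swap until no other CPU (or the MMU
 * setting PG_U/PG_M) has changed the entry in between.  "example_clear_rw"
 * is a hypothetical helper, not part of the pmap interface, and unlike the
 * real code it omits the statistics update and the TLB shootdown of
 * modified entries; it is compiled out with #if 0.
 */
#if 0
static void
example_clear_rw(pt_entry_t *ptep)
{
	pt_entry_t opte, npte;

	do {
		opte = *ptep;
		if ((~opte & (PG_RW | PG_V)) != 0)
			return;			/* invalid or already read-only */
		npte = opte & ~PG_RW;		/* drop write permission */
	} while (pmap_pte_cas(ptep, opte, npte) != opte);
}
#endif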
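/*
 * Minimal caller-side sketch of the interface implemented above: enter a
 * wired, writable mapping with PMAP_CANFAIL set, then call pmap_update()
 * so that deferred TLB shootdowns and frees are processed before the
 * mapping is relied upon.  "example_enter_wired" is a hypothetical helper,
 * not part of this file, and is compiled out with #if 0.
 */
#if 0
static int
example_enter_wired(struct pmap *pmap, vaddr_t va, paddr_t pa)
{
	int error;

	/*
	 * "prot" selects the protection code; "flags" carries the access
	 * type plus PMAP_WIRED/PMAP_CANFAIL, as pmap_enter_ma() expects.
	 */
	error = pmap_enter(pmap, va, pa, VM_PROT_READ | VM_PROT_WRITE,
	    VM_PROT_READ | VM_PROT_WRITE | PMAP_WIRED | PMAP_CANFAIL);
	if (error != 0)
		return error;	/* ENOMEM: no PTP or pv entry available */

	pmap_update(pmap);	/* process deferred shootdowns/frees */
	return 0;
}
#endif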