1 /* $NetBSD: pmap.c,v 1.220 2016/08/19 18:24:57 maxv Exp $ */ 2 3 /*- 4 * Copyright (c) 2008, 2010, 2016 The NetBSD Foundation, Inc. 5 * All rights reserved. 6 * 7 * This code is derived from software contributed to The NetBSD Foundation 8 * by Andrew Doran, and by Maxime Villard. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 29 * POSSIBILITY OF SUCH DAMAGE. 30 */ 31 32 /* 33 * Copyright (c) 2007 Manuel Bouyer. 34 * 35 * Redistribution and use in source and binary forms, with or without 36 * modification, are permitted provided that the following conditions 37 * are met: 38 * 1. Redistributions of source code must retain the above copyright 39 * notice, this list of conditions and the following disclaimer. 40 * 2. Redistributions in binary form must reproduce the above copyright 41 * notice, this list of conditions and the following disclaimer in the 42 * documentation and/or other materials provided with the distribution. 43 * 44 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 45 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 46 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 47 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 48 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 49 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 50 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 51 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 52 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 53 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 54 * 55 */ 56 57 /* 58 * Copyright (c) 2006 Mathieu Ropert <mro@adviseo.fr> 59 * 60 * Permission to use, copy, modify, and distribute this software for any 61 * purpose with or without fee is hereby granted, provided that the above 62 * copyright notice and this permission notice appear in all copies. 63 * 64 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 65 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 66 * MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 67 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 68 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 69 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 70 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 71 */ 72 73 /* 74 * Copyright (c) 1997 Charles D. Cranor and Washington University. 75 * All rights reserved. 76 * 77 * Redistribution and use in source and binary forms, with or without 78 * modification, are permitted provided that the following conditions 79 * are met: 80 * 1. Redistributions of source code must retain the above copyright 81 * notice, this list of conditions and the following disclaimer. 82 * 2. Redistributions in binary form must reproduce the above copyright 83 * notice, this list of conditions and the following disclaimer in the 84 * documentation and/or other materials provided with the distribution. 85 * 86 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 87 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 88 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 89 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 90 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 91 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 92 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 93 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 94 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 95 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 96 */ 97 98 /* 99 * Copyright 2001 (c) Wasabi Systems, Inc. 100 * All rights reserved. 101 * 102 * Written by Frank van der Linden for Wasabi Systems, Inc. 103 * 104 * Redistribution and use in source and binary forms, with or without 105 * modification, are permitted provided that the following conditions 106 * are met: 107 * 1. Redistributions of source code must retain the above copyright 108 * notice, this list of conditions and the following disclaimer. 109 * 2. Redistributions in binary form must reproduce the above copyright 110 * notice, this list of conditions and the following disclaimer in the 111 * documentation and/or other materials provided with the distribution. 112 * 3. All advertising materials mentioning features or use of this software 113 * must display the following acknowledgement: 114 * This product includes software developed for the NetBSD Project by 115 * Wasabi Systems, Inc. 116 * 4. The name of Wasabi Systems, Inc. may not be used to endorse 117 * or promote products derived from this software without specific prior 118 * written permission. 119 * 120 * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND 121 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 122 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 123 * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL WASABI SYSTEMS, INC 124 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 125 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 126 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 127 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 128 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 129 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 130 * POSSIBILITY OF SUCH DAMAGE. 131 */ 132 133 /* 134 * This is the i386 pmap modified and generalized to support x86-64 135 * as well. The idea is to hide the upper N levels of the page tables 136 * inside pmap_get_ptp, pmap_free_ptp and pmap_growkernel. The rest 137 * is mostly untouched, except that it uses some more generalized 138 * macros and interfaces. 139 * 140 * This pmap has been tested on the i386 as well, and it can be easily 141 * adapted to PAE. 142 * 143 * fvdl@wasabisystems.com 18-Jun-2001 144 */ 145 146 /* 147 * pmap.c: i386 pmap module rewrite 148 * Chuck Cranor <chuck@netbsd> 149 * 11-Aug-97 150 * 151 * history of this pmap module: in addition to my own input, i used 152 * the following references for this rewrite of the i386 pmap: 153 * 154 * [1] the NetBSD i386 pmap. this pmap appears to be based on the 155 * BSD hp300 pmap done by Mike Hibler at University of Utah. 156 * it was then ported to the i386 by William Jolitz of UUNET 157 * Technologies, Inc. Then Charles M. Hannum of the NetBSD 158 * project fixed some bugs and provided some speed ups. 159 * 160 * [2] the FreeBSD i386 pmap. this pmap seems to be the 161 * Hibler/Jolitz pmap, as modified for FreeBSD by John S. Dyson 162 * and David Greenman. 163 * 164 * [3] the Mach pmap. this pmap, from CMU, seems to have migrated 165 * between several processors. the VAX version was done by 166 * Avadis Tevanian, Jr., and Michael Wayne Young. the i386 167 * version was done by Lance Berc, Mike Kupfer, Bob Baron, 168 * David Golub, and Richard Draves. the alpha version was 169 * done by Alessandro Forin (CMU/Mach) and Chris Demetriou 170 * (NetBSD/alpha). 171 */ 172 173 #include <sys/cdefs.h> 174 __KERNEL_RCSID(0, "$NetBSD: pmap.c,v 1.220 2016/08/19 18:24:57 maxv Exp $"); 175 176 #include "opt_user_ldt.h" 177 #include "opt_lockdebug.h" 178 #include "opt_multiprocessor.h" 179 #include "opt_xen.h" 180 #if !defined(__x86_64__) 181 #include "opt_kstack_dr0.h" 182 #endif /* !defined(__x86_64__) */ 183 184 #include <sys/param.h> 185 #include <sys/systm.h> 186 #include <sys/proc.h> 187 #include <sys/pool.h> 188 #include <sys/kernel.h> 189 #include <sys/atomic.h> 190 #include <sys/cpu.h> 191 #include <sys/intr.h> 192 #include <sys/xcall.h> 193 #include <sys/kcore.h> 194 195 #include <uvm/uvm.h> 196 #include <uvm/pmap/pmap_pvt.h> 197 198 #include <dev/isa/isareg.h> 199 200 #include <machine/specialreg.h> 201 #include <machine/gdt.h> 202 #include <machine/isa_machdep.h> 203 #include <machine/cpuvar.h> 204 #include <machine/cputypes.h> 205 206 #include <x86/pmap.h> 207 #include <x86/pmap_pv.h> 208 209 #include <x86/i82489reg.h> 210 #include <x86/i82489var.h> 211 212 #ifdef XEN 213 #include <xen/xen-public/xen.h> 214 #include <xen/hypervisor.h> 215 #endif 216 217 /* 218 * general info: 219 * 220 * - for an explanation of how the i386 MMU hardware works see 221 * the comments in <machine/pte.h>. 
222 * 223 * - for an explanation of the general memory structure used by 224 * this pmap (including the recursive mapping), see the comments 225 * in <machine/pmap.h>. 226 * 227 * this file contains the code for the "pmap module." the module's 228 * job is to manage the hardware's virtual to physical address mappings. 229 * note that there are two levels of mapping in the VM system: 230 * 231 * [1] the upper layer of the VM system uses vm_map's and vm_map_entry's 232 * to map ranges of virtual address space to objects/files. for 233 * example, the vm_map may say: "map VA 0x1000 to 0x22000 read-only 234 * to the file /bin/ls starting at offset zero." note that 235 * the upper layer mapping is not concerned with how individual 236 * vm_pages are mapped. 237 * 238 * [2] the lower layer of the VM system (the pmap) maintains the mappings 239 * from virtual addresses. it is concerned with which vm_page is 240 * mapped where. for example, when you run /bin/ls and start 241 * at page 0x1000 the fault routine may lookup the correct page 242 * of the /bin/ls file and then ask the pmap layer to establish 243 * a mapping for it. 244 * 245 * note that information in the lower layer of the VM system can be 246 * thrown away since it can easily be reconstructed from the info 247 * in the upper layer. 248 * 249 * data structures we use include: 250 * 251 * - struct pmap: describes the address space of one thread 252 * - struct pmap_page: describes one pv-tracked page, without 253 * necessarily a corresponding vm_page 254 * - struct pv_entry: describes one <PMAP,VA> mapping of a PA 255 * - struct pv_head: there is one pv_head per pv-tracked page of 256 * physical memory. the pv_head points to a list of pv_entry 257 * structures which describe all the <PMAP,VA> pairs that this 258 * page is mapped in. this is critical for page based operations 259 * such as pmap_page_protect() [change protection on _all_ mappings 260 * of a page] 261 */ 262 263 /* 264 * memory allocation 265 * 266 * - there are three data structures that we must dynamically allocate: 267 * 268 * [A] new process' page directory page (PDP) 269 * - plan 1: done at pmap_create() we use 270 * uvm_km_alloc(kernel_map, PAGE_SIZE) [fka kmem_alloc] to do this 271 * allocation. 272 * 273 * if we are low in free physical memory then we sleep in 274 * uvm_km_alloc -- in this case this is ok since we are creating 275 * a new pmap and should not be holding any locks. 276 * 277 * if the kernel is totally out of virtual space 278 * (i.e. uvm_km_alloc returns NULL), then we panic. 279 * 280 * [B] new page tables pages (PTP) 281 * - call uvm_pagealloc() 282 * => success: zero page, add to pm_pdir 283 * => failure: we are out of free vm_pages, let pmap_enter() 284 * tell UVM about it. 285 * 286 * note: for kernel PTPs, we start with NKPTP of them. as we map 287 * kernel memory (at uvm_map time) we check to see if we've grown 288 * the kernel pmap. if so, we call the optional function 289 * pmap_growkernel() to grow the kernel PTPs in advance. 290 * 291 * [C] pv_entry structures 292 */ 293 294 /* 295 * locking 296 * 297 * we have the following locks that we must contend with: 298 * 299 * mutexes: 300 * 301 * - pmap lock (per pmap, part of uvm_object) 302 * this lock protects the fields in the pmap structure including 303 * the non-kernel PDEs in the PDP, and the PTEs. it also locks 304 * in the alternate PTE space (since that is determined by the 305 * entry in the PDP). 
306 * 307 * - pvh_lock (per pv_head) 308 * this lock protects the pv_entry list which is chained off the 309 * pv_head structure for a specific pv-tracked PA. it is locked 310 * when traversing the list (e.g. adding/removing mappings, 311 * syncing R/M bits, etc.) 312 * 313 * - pmaps_lock 314 * this lock protects the list of active pmaps (headed by "pmaps"). 315 * we lock it when adding or removing pmaps from this list. 316 */ 317 318 const vaddr_t ptp_masks[] = PTP_MASK_INITIALIZER; 319 const int ptp_shifts[] = PTP_SHIFT_INITIALIZER; 320 const long nkptpmax[] = NKPTPMAX_INITIALIZER; 321 const long nbpd[] = NBPD_INITIALIZER; 322 pd_entry_t * const normal_pdes[] = PDES_INITIALIZER; 323 324 long nkptp[] = NKPTP_INITIALIZER; 325 326 struct pmap_head pmaps; 327 kmutex_t pmaps_lock; 328 329 static vaddr_t pmap_maxkvaddr; 330 331 /* 332 * XXX kludge: dummy locking to make KASSERTs in uvm_page.c comfortable. 333 * actual locking is done by pm_lock. 334 */ 335 #if defined(DIAGNOSTIC) 336 #define PMAP_SUBOBJ_LOCK(pm, idx) \ 337 KASSERT(mutex_owned((pm)->pm_lock)); \ 338 if ((idx) != 0) \ 339 mutex_enter((pm)->pm_obj[(idx)].vmobjlock) 340 #define PMAP_SUBOBJ_UNLOCK(pm, idx) \ 341 KASSERT(mutex_owned((pm)->pm_lock)); \ 342 if ((idx) != 0) \ 343 mutex_exit((pm)->pm_obj[(idx)].vmobjlock) 344 #else /* defined(DIAGNOSTIC) */ 345 #define PMAP_SUBOBJ_LOCK(pm, idx) /* nothing */ 346 #define PMAP_SUBOBJ_UNLOCK(pm, idx) /* nothing */ 347 #endif /* defined(DIAGNOSTIC) */ 348 349 /* 350 * Misc. event counters. 351 */ 352 struct evcnt pmap_iobmp_evcnt; 353 struct evcnt pmap_ldt_evcnt; 354 355 /* 356 * PAT 357 */ 358 #define PATENTRY(n, type) (type << ((n) * 8)) 359 #define PAT_UC 0x0ULL 360 #define PAT_WC 0x1ULL 361 #define PAT_WT 0x4ULL 362 #define PAT_WP 0x5ULL 363 #define PAT_WB 0x6ULL 364 #define PAT_UCMINUS 0x7ULL 365 366 static bool cpu_pat_enabled __read_mostly = false; 367 368 /* 369 * Global data structures 370 */ 371 372 static struct pmap kernel_pmap_store; /* the kernel's pmap (proc0) */ 373 struct pmap *const kernel_pmap_ptr = &kernel_pmap_store; 374 375 /* 376 * pmap_pg_nx: if our processor supports PG_NX in the PTE then we 377 * set pmap_pg_nx to PG_NX (otherwise it is zero). 378 */ 379 pd_entry_t pmap_pg_nx __read_mostly = 0; 380 381 /* 382 * pmap_pg_g: if our processor supports PG_G in the PTE then we 383 * set pmap_pg_g to PG_G (otherwise it is zero). 384 */ 385 pd_entry_t pmap_pg_g __read_mostly = 0; 386 387 /* 388 * pmap_largepages: if our processor supports PG_PS and we are 389 * using it, this is set to true. 390 */ 391 int pmap_largepages __read_mostly = 0; 392 393 /* 394 * i386 physical memory comes in a big contig chunk with a small 395 * hole toward the front of it... the following two paddr_t's 396 * (shared with machdep.c) describe the physical address space 397 * of this machine. 
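 *
 * as a rough sketch of how this range is consumed (the real code lives
 * in machdep.c, and the exact call here is an assumption, not copied
 * from it), each usable segment inside [avail_start, avail_end] is
 * handed to UVM at boot roughly like:
 *
 *	uvm_page_physload(atop(seg_start), atop(seg_end),
 *	    atop(seg_avail_start), atop(seg_avail_end),
 *	    VM_FREELIST_DEFAULT);
 *
 * where seg_start, seg_end, seg_avail_start and seg_avail_end are
 * hypothetical per-segment bounds and atop() converts byte addresses
 * to page frame numbers.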
398 */ 399 paddr_t avail_start __read_mostly; /* PA of first available physical page */ 400 paddr_t avail_end __read_mostly; /* PA of last available physical page */ 401 402 #ifdef XEN 403 #ifdef __x86_64__ 404 /* Dummy PGD for user cr3, used between pmap_deactivate() and pmap_activate() */ 405 static paddr_t xen_dummy_user_pgd; 406 #endif /* __x86_64__ */ 407 paddr_t pmap_pa_start; /* PA of first physical page for this domain */ 408 paddr_t pmap_pa_end; /* PA of last physical page for this domain */ 409 #endif /* XEN */ 410 411 #define VM_PAGE_TO_PP(pg) (&(pg)->mdpage.mp_pp) 412 413 #define PV_HASH_SIZE 32768 414 #define PV_HASH_LOCK_CNT 32 415 416 struct pv_hash_lock { 417 kmutex_t lock; 418 } __aligned(CACHE_LINE_SIZE) pv_hash_locks[PV_HASH_LOCK_CNT] 419 __aligned(CACHE_LINE_SIZE); 420 421 struct pv_hash_head { 422 SLIST_HEAD(, pv_entry) hh_list; 423 } pv_hash_heads[PV_HASH_SIZE]; 424 425 static u_int 426 pvhash_hash(struct vm_page *ptp, vaddr_t va) 427 { 428 429 return (uintptr_t)ptp / sizeof(*ptp) + (va >> PAGE_SHIFT); 430 } 431 432 static struct pv_hash_head * 433 pvhash_head(u_int hash) 434 { 435 436 return &pv_hash_heads[hash % PV_HASH_SIZE]; 437 } 438 439 static kmutex_t * 440 pvhash_lock(u_int hash) 441 { 442 443 return &pv_hash_locks[hash % PV_HASH_LOCK_CNT].lock; 444 } 445 446 static struct pv_entry * 447 pvhash_remove(struct pv_hash_head *hh, struct vm_page *ptp, vaddr_t va) 448 { 449 struct pv_entry *pve; 450 struct pv_entry *prev; 451 452 prev = NULL; 453 SLIST_FOREACH(pve, &hh->hh_list, pve_hash) { 454 if (pve->pve_pte.pte_ptp == ptp && 455 pve->pve_pte.pte_va == va) { 456 if (prev != NULL) { 457 SLIST_REMOVE_AFTER(prev, pve_hash); 458 } else { 459 SLIST_REMOVE_HEAD(&hh->hh_list, pve_hash); 460 } 461 break; 462 } 463 prev = pve; 464 } 465 return pve; 466 } 467 468 /* 469 * Other data structures 470 */ 471 472 static pt_entry_t protection_codes[8] __read_mostly; 473 474 static bool pmap_initialized __read_mostly = false; /* pmap_init done yet? */ 475 476 /* 477 * The following two vaddr_t's are used during system startup to keep track of 478 * how much of the kernel's VM space we have used. Once the system is started, 479 * the management of the remaining kernel VM space is turned over to the 480 * kernel_map vm_map. 481 */ 482 static vaddr_t virtual_avail __read_mostly; /* VA of first free KVA */ 483 static vaddr_t virtual_end __read_mostly; /* VA of last free KVA */ 484 485 /* 486 * pool that pmap structures are allocated from 487 */ 488 static struct pool_cache pmap_cache; 489 490 /* 491 * pv_entry cache 492 */ 493 static struct pool_cache pmap_pv_cache; 494 495 #ifndef __HAVE_DIRECT_MAP 496 /* 497 * MULTIPROCESSOR: special VAs and PTEs are actually allocated inside a 498 * (maxcpus * NPTECL) array of PTE, to avoid cache line thrashing due to 499 * false sharing. 
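 *
 * illustrative arithmetic only: the PTESLEW()/VASLEW() macros defined
 * below pick the cpu-private slot out of those arrays, e.g. for the
 * copy-source window of cpu N:
 *
 *	spte = PTESLEW(csrc_pte, N);	== csrc_pte + N * NPTECL
 *	sva  = VASLEW(csrcp, N);	== csrcp + N * NPTECL * PAGE_SIZE
 *
 * so each cpu touches a distinct cache line of PTEs and a distinct
 * block of VAs.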
500 */ 501 #ifdef MULTIPROCESSOR 502 #define PTESLEW(pte, id) ((pte)+(id)*NPTECL) 503 #define VASLEW(va,id) ((va)+(id)*NPTECL*PAGE_SIZE) 504 #else 505 #define PTESLEW(pte, id) ((void)id, pte) 506 #define VASLEW(va,id) ((void)id, va) 507 #endif 508 509 /* 510 * Special VAs and the PTEs that map them 511 */ 512 static pt_entry_t *csrc_pte, *cdst_pte, *zero_pte, *ptp_pte, *early_zero_pte; 513 static char *csrcp, *cdstp, *zerop, *ptpp; 514 #ifdef XEN 515 char *early_zerop; /* also referenced from xen_pmap_bootstrap() */ 516 #else 517 static char *early_zerop; 518 #endif 519 520 #endif 521 522 int pmap_enter_default(pmap_t, vaddr_t, paddr_t, vm_prot_t, u_int); 523 524 /* PDP pool_cache(9) and its callbacks */ 525 struct pool_cache pmap_pdp_cache; 526 static int pmap_pdp_ctor(void *, void *, int); 527 static void pmap_pdp_dtor(void *, void *); 528 #ifdef PAE 529 /* need to allocate items of 4 pages */ 530 static void *pmap_pdp_alloc(struct pool *, int); 531 static void pmap_pdp_free(struct pool *, void *); 532 static struct pool_allocator pmap_pdp_allocator = { 533 .pa_alloc = pmap_pdp_alloc, 534 .pa_free = pmap_pdp_free, 535 .pa_pagesz = PAGE_SIZE * PDP_SIZE, 536 }; 537 #endif /* PAE */ 538 539 extern vaddr_t idt_vaddr; 540 extern paddr_t idt_paddr; 541 542 extern int end; 543 544 #ifdef i386 545 /* stuff to fix the pentium f00f bug */ 546 extern vaddr_t pentium_idt_vaddr; 547 #endif 548 549 /* 550 * Local prototypes 551 */ 552 553 #ifdef __HAVE_DIRECT_MAP 554 static void pmap_init_directmap(struct pmap *); 555 #endif 556 #ifndef XEN 557 static void pmap_remap_largepages(void); 558 #endif 559 560 static struct vm_page *pmap_get_ptp(struct pmap *, vaddr_t, 561 pd_entry_t * const *); 562 static struct vm_page *pmap_find_ptp(struct pmap *, vaddr_t, paddr_t, int); 563 static void pmap_freepage(struct pmap *, struct vm_page *, int); 564 static void pmap_free_ptp(struct pmap *, struct vm_page *, vaddr_t, 565 pt_entry_t *, pd_entry_t * const *); 566 static bool pmap_remove_pte(struct pmap *, struct vm_page *, pt_entry_t *, 567 vaddr_t, struct pv_entry **); 568 static void pmap_remove_ptes(struct pmap *, struct vm_page *, vaddr_t, vaddr_t, 569 vaddr_t, struct pv_entry **); 570 571 static paddr_t pmap_get_physpage(void); 572 static void pmap_alloc_level(vaddr_t, long *); 573 574 static bool pmap_reactivate(struct pmap *); 575 576 /* 577 * p m a p h e l p e r f u n c t i o n s 578 */ 579 580 static inline void 581 pmap_stats_update(struct pmap *pmap, int resid_diff, int wired_diff) 582 { 583 584 if (pmap == pmap_kernel()) { 585 atomic_add_long(&pmap->pm_stats.resident_count, resid_diff); 586 atomic_add_long(&pmap->pm_stats.wired_count, wired_diff); 587 } else { 588 KASSERT(mutex_owned(pmap->pm_lock)); 589 pmap->pm_stats.resident_count += resid_diff; 590 pmap->pm_stats.wired_count += wired_diff; 591 } 592 } 593 594 static inline void 595 pmap_stats_update_bypte(struct pmap *pmap, pt_entry_t npte, pt_entry_t opte) 596 { 597 int resid_diff = ((npte & PG_V) ? 1 : 0) - ((opte & PG_V) ? 1 : 0); 598 int wired_diff = ((npte & PG_W) ? 1 : 0) - ((opte & PG_W) ? 
1 : 0); 599 600 KASSERT((npte & (PG_V | PG_W)) != PG_W); 601 KASSERT((opte & (PG_V | PG_W)) != PG_W); 602 603 pmap_stats_update(pmap, resid_diff, wired_diff); 604 } 605 606 /* 607 * ptp_to_pmap: lookup pmap by ptp 608 */ 609 610 static struct pmap * 611 ptp_to_pmap(struct vm_page *ptp) 612 { 613 struct pmap *pmap; 614 615 if (ptp == NULL) { 616 return pmap_kernel(); 617 } 618 pmap = (struct pmap *)ptp->uobject; 619 KASSERT(pmap != NULL); 620 KASSERT(&pmap->pm_obj[0] == ptp->uobject); 621 return pmap; 622 } 623 624 static inline struct pv_pte * 625 pve_to_pvpte(struct pv_entry *pve) 626 { 627 628 KASSERT((void *)&pve->pve_pte == (void *)pve); 629 return &pve->pve_pte; 630 } 631 632 static inline struct pv_entry * 633 pvpte_to_pve(struct pv_pte *pvpte) 634 { 635 struct pv_entry *pve = (void *)pvpte; 636 637 KASSERT(pve_to_pvpte(pve) == pvpte); 638 return pve; 639 } 640 641 /* 642 * pv_pte_first, pv_pte_next: PV list iterator. 643 */ 644 645 static struct pv_pte * 646 pv_pte_first(struct pmap_page *pp) 647 { 648 649 if ((pp->pp_flags & PP_EMBEDDED) != 0) { 650 return &pp->pp_pte; 651 } 652 return pve_to_pvpte(LIST_FIRST(&pp->pp_head.pvh_list)); 653 } 654 655 static struct pv_pte * 656 pv_pte_next(struct pmap_page *pp, struct pv_pte *pvpte) 657 { 658 659 KASSERT(pvpte != NULL); 660 if (pvpte == &pp->pp_pte) { 661 KASSERT((pp->pp_flags & PP_EMBEDDED) != 0); 662 return NULL; 663 } 664 KASSERT((pp->pp_flags & PP_EMBEDDED) == 0); 665 return pve_to_pvpte(LIST_NEXT(pvpte_to_pve(pvpte), pve_list)); 666 } 667 668 /* 669 * pmap_is_curpmap: is this pmap the one currently loaded [in %cr3]? 670 * of course the kernel is always loaded 671 */ 672 673 bool 674 pmap_is_curpmap(struct pmap *pmap) 675 { 676 return((pmap == pmap_kernel()) || 677 (pmap == curcpu()->ci_pmap)); 678 } 679 680 /* 681 * Add a reference to the specified pmap. 682 */ 683 684 void 685 pmap_reference(struct pmap *pmap) 686 { 687 688 atomic_inc_uint(&pmap->pm_obj[0].uo_refs); 689 } 690 691 /* 692 * pmap_map_ptes: map a pmap's PTEs into KVM and lock them in 693 * 694 * there are several pmaps involved. some or all of them might be same. 695 * 696 * - the pmap given by the first argument 697 * our caller wants to access this pmap's PTEs. 698 * 699 * - pmap_kernel() 700 * the kernel pmap. note that it only contains the kernel part 701 * of the address space which is shared by any pmap. ie. any 702 * pmap can be used instead of pmap_kernel() for our purpose. 703 * 704 * - ci->ci_pmap 705 * pmap currently loaded on the cpu. 706 * 707 * - vm_map_pmap(&curproc->p_vmspace->vm_map) 708 * current process' pmap. 709 * 710 * => we lock enough pmaps to keep things locked in 711 * => must be undone with pmap_unmap_ptes before returning 712 */ 713 714 void 715 pmap_map_ptes(struct pmap *pmap, struct pmap **pmap2, 716 pd_entry_t **ptepp, pd_entry_t * const **pdeppp) 717 { 718 struct pmap *curpmap; 719 struct cpu_info *ci; 720 lwp_t *l; 721 722 /* The kernel's pmap is always accessible. */ 723 if (pmap == pmap_kernel()) { 724 *pmap2 = NULL; 725 *ptepp = PTE_BASE; 726 *pdeppp = normal_pdes; 727 return; 728 } 729 KASSERT(kpreempt_disabled()); 730 731 l = curlwp; 732 retry: 733 mutex_enter(pmap->pm_lock); 734 ci = curcpu(); 735 curpmap = ci->ci_pmap; 736 if (vm_map_pmap(&l->l_proc->p_vmspace->vm_map) == pmap) { 737 /* Our own pmap so just load it: easy. 
*/ 738 if (__predict_false(ci->ci_want_pmapload)) { 739 mutex_exit(pmap->pm_lock); 740 pmap_load(); 741 goto retry; 742 } 743 KASSERT(pmap == curpmap); 744 } else if (pmap == curpmap) { 745 /* 746 * Already on the CPU: make it valid. This is very 747 * often the case during exit(), when we have switched 748 * to the kernel pmap in order to destroy a user pmap. 749 */ 750 if (!pmap_reactivate(pmap)) { 751 u_int gen = uvm_emap_gen_return(); 752 tlbflush(); 753 uvm_emap_update(gen); 754 } 755 } else { 756 /* 757 * Toss current pmap from CPU, but keep a reference to it. 758 * The reference will be dropped by pmap_unmap_ptes(). 759 * Can happen if we block during exit(). 760 */ 761 const cpuid_t cid = cpu_index(ci); 762 763 kcpuset_atomic_clear(curpmap->pm_cpus, cid); 764 kcpuset_atomic_clear(curpmap->pm_kernel_cpus, cid); 765 ci->ci_pmap = pmap; 766 ci->ci_tlbstate = TLBSTATE_VALID; 767 kcpuset_atomic_set(pmap->pm_cpus, cid); 768 kcpuset_atomic_set(pmap->pm_kernel_cpus, cid); 769 cpu_load_pmap(pmap, curpmap); 770 } 771 pmap->pm_ncsw = l->l_ncsw; 772 *pmap2 = curpmap; 773 *ptepp = PTE_BASE; 774 #if defined(XEN) && defined(__x86_64__) 775 KASSERT(ci->ci_normal_pdes[PTP_LEVELS - 2] == L4_BASE); 776 ci->ci_normal_pdes[PTP_LEVELS - 2] = pmap->pm_pdir; 777 *pdeppp = ci->ci_normal_pdes; 778 #else /* XEN && __x86_64__ */ 779 *pdeppp = normal_pdes; 780 #endif /* XEN && __x86_64__ */ 781 } 782 783 /* 784 * pmap_unmap_ptes: unlock the PTE mapping of "pmap" 785 */ 786 787 void 788 pmap_unmap_ptes(struct pmap *pmap, struct pmap *pmap2) 789 { 790 struct cpu_info *ci; 791 struct pmap *mypmap; 792 793 KASSERT(kpreempt_disabled()); 794 795 /* The kernel's pmap is always accessible. */ 796 if (pmap == pmap_kernel()) { 797 return; 798 } 799 800 ci = curcpu(); 801 #if defined(XEN) && defined(__x86_64__) 802 /* Reset per-cpu normal_pdes */ 803 KASSERT(ci->ci_normal_pdes[PTP_LEVELS - 2] != L4_BASE); 804 ci->ci_normal_pdes[PTP_LEVELS - 2] = L4_BASE; 805 #endif /* XEN && __x86_64__ */ 806 /* 807 * We cannot tolerate context switches while mapped in. 808 * If it is our own pmap all we have to do is unlock. 809 */ 810 KASSERT(pmap->pm_ncsw == curlwp->l_ncsw); 811 mypmap = vm_map_pmap(&curproc->p_vmspace->vm_map); 812 if (pmap == mypmap) { 813 mutex_exit(pmap->pm_lock); 814 return; 815 } 816 817 /* 818 * Mark whatever's on the CPU now as lazy and unlock. 819 * If the pmap was already installed, we are done. 820 */ 821 ci->ci_tlbstate = TLBSTATE_LAZY; 822 ci->ci_want_pmapload = (mypmap != pmap_kernel()); 823 mutex_exit(pmap->pm_lock); 824 if (pmap == pmap2) { 825 return; 826 } 827 828 /* 829 * We installed another pmap on the CPU. Grab a reference to 830 * it and leave in place. Toss the evicted pmap (can block). 831 */ 832 pmap_reference(pmap); 833 pmap_destroy(pmap2); 834 } 835 836 837 inline static void 838 pmap_exec_account(struct pmap *pm, vaddr_t va, pt_entry_t opte, pt_entry_t npte) 839 { 840 841 #if !defined(__x86_64__) 842 if (curproc == NULL || curproc->p_vmspace == NULL || 843 pm != vm_map_pmap(&curproc->p_vmspace->vm_map)) 844 return; 845 846 if ((opte ^ npte) & PG_X) 847 pmap_update_pg(va); 848 849 /* 850 * Executability was removed on the last executable change. 851 * Reset the code segment to something conservative and 852 * let the trap handler deal with setting the right limit. 853 * We can't do that because of locking constraints on the vm map. 
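 *
 * a hedged sketch of the consumer side (the real caller is the i386
 * trap handler, which is not in this file; p, tf and pcb stand for the
 * faulting process, trap frame and pcb): on a protection fault the
 * handler can widen %cs again with pmap_exec_fixup(), defined below,
 * and retry the instruction when it returns non-zero:
 *
 *	if (pmap_exec_fixup(&p->p_vmspace->vm_map, tf, pcb))
 *		goto out;	-- %cs now covers the faulting %eip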
854 */ 855 856 if ((opte & PG_X) && (npte & PG_X) == 0 && va == pm->pm_hiexec) { 857 struct trapframe *tf = curlwp->l_md.md_regs; 858 859 tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL); 860 pm->pm_hiexec = I386_MAX_EXE_ADDR; 861 } 862 #endif /* !defined(__x86_64__) */ 863 } 864 865 #if !defined(__x86_64__) 866 /* 867 * Fixup the code segment to cover all potential executable mappings. 868 * returns 0 if no changes to the code segment were made. 869 */ 870 871 int 872 pmap_exec_fixup(struct vm_map *map, struct trapframe *tf, struct pcb *pcb) 873 { 874 struct vm_map_entry *ent; 875 struct pmap *pm = vm_map_pmap(map); 876 vaddr_t va = 0; 877 878 vm_map_lock_read(map); 879 for (ent = (&map->header)->next; ent != &map->header; ent = ent->next) { 880 881 /* 882 * This entry has greater va than the entries before. 883 * We need to make it point to the last page, not past it. 884 */ 885 886 if (ent->protection & VM_PROT_EXECUTE) 887 va = trunc_page(ent->end) - PAGE_SIZE; 888 } 889 vm_map_unlock_read(map); 890 if (va == pm->pm_hiexec && tf->tf_cs == GSEL(GUCODEBIG_SEL, SEL_UPL)) 891 return (0); 892 893 pm->pm_hiexec = va; 894 if (pm->pm_hiexec > I386_MAX_EXE_ADDR) { 895 tf->tf_cs = GSEL(GUCODEBIG_SEL, SEL_UPL); 896 } else { 897 tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL); 898 return (0); 899 } 900 return (1); 901 } 902 #endif /* !defined(__x86_64__) */ 903 904 void 905 pat_init(struct cpu_info *ci) 906 { 907 uint64_t pat; 908 909 if (!(ci->ci_feat_val[0] & CPUID_PAT)) 910 return; 911 912 /* We change WT to WC. Leave all other entries the default values. */ 913 pat = PATENTRY(0, PAT_WB) | PATENTRY(1, PAT_WC) | 914 PATENTRY(2, PAT_UCMINUS) | PATENTRY(3, PAT_UC) | 915 PATENTRY(4, PAT_WB) | PATENTRY(5, PAT_WC) | 916 PATENTRY(6, PAT_UCMINUS) | PATENTRY(7, PAT_UC); 917 918 wrmsr(MSR_CR_PAT, pat); 919 cpu_pat_enabled = true; 920 aprint_debug_dev(ci->ci_dev, "PAT enabled\n"); 921 } 922 923 static pt_entry_t 924 pmap_pat_flags(u_int flags) 925 { 926 u_int cacheflags = (flags & PMAP_CACHE_MASK); 927 928 if (!cpu_pat_enabled) { 929 switch (cacheflags) { 930 case PMAP_NOCACHE: 931 case PMAP_NOCACHE_OVR: 932 /* results in PGC_UCMINUS on cpus which have 933 * the cpuid PAT but PAT "disabled" 934 */ 935 return PG_N; 936 default: 937 return 0; 938 } 939 } 940 941 switch (cacheflags) { 942 case PMAP_NOCACHE: 943 return PGC_UC; 944 case PMAP_WRITE_COMBINE: 945 return PGC_WC; 946 case PMAP_WRITE_BACK: 947 return PGC_WB; 948 case PMAP_NOCACHE_OVR: 949 return PGC_UCMINUS; 950 } 951 952 return 0; 953 } 954 955 /* 956 * p m a p k e n t e r f u n c t i o n s 957 * 958 * functions to quickly enter/remove pages from the kernel address 959 * space. pmap_kremove is exported to MI kernel. we make use of 960 * the recursive PTE mappings. 
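 *
 * typical MI usage is a sketch like the following (assuming the caller
 * owns both the KVA and the physical page):
 *
 *	va = uvm_km_alloc(kernel_map, PAGE_SIZE, 0, UVM_KMF_VAONLY);
 *	pmap_kenter_pa(va, pa, VM_PROT_READ | VM_PROT_WRITE, 0);
 *	... use the mapping ...
 *	pmap_kremove(va, PAGE_SIZE);
 *	pmap_update(pmap_kernel());	-- flush before the VA is reused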
961 */ 962 963 /* 964 * pmap_kenter_pa: enter a kernel mapping without R/M (pv_entry) tracking 965 * 966 * => no need to lock anything, assume va is already allocated 967 * => should be faster than normal pmap enter function 968 */ 969 970 void 971 pmap_kenter_pa(vaddr_t va, paddr_t pa, vm_prot_t prot, u_int flags) 972 { 973 pt_entry_t *pte, opte, npte; 974 975 KASSERT(!(prot & ~VM_PROT_ALL)); 976 977 if (va < VM_MIN_KERNEL_ADDRESS) 978 pte = vtopte(va); 979 else 980 pte = kvtopte(va); 981 #ifdef DOM0OPS 982 if (pa < pmap_pa_start || pa >= pmap_pa_end) { 983 #ifdef DEBUG 984 printf_nolog("%s: pa 0x%" PRIx64 " for va 0x%" PRIx64 985 " outside range\n", __func__, (int64_t)pa, (int64_t)va); 986 #endif /* DEBUG */ 987 npte = pa; 988 } else 989 #endif /* DOM0OPS */ 990 npte = pmap_pa2pte(pa); 991 npte |= protection_codes[prot] | PG_k | PG_V | pmap_pg_g; 992 npte |= pmap_pat_flags(flags); 993 opte = pmap_pte_testset(pte, npte); /* zap! */ 994 #if defined(DIAGNOSTIC) 995 /* 996 * XXX: make sure we are not dealing with a large page, since the only 997 * large pages created are for the kernel image, and they should never 998 * be kentered. 999 */ 1000 if (opte & PG_PS) 1001 panic("%s: PG_PS", __func__); 1002 #endif 1003 if ((opte & (PG_V | PG_U)) == (PG_V | PG_U)) { 1004 /* This should not happen. */ 1005 printf_nolog("%s: mapping already present\n", __func__); 1006 kpreempt_disable(); 1007 pmap_tlb_shootdown(pmap_kernel(), va, opte, TLBSHOOT_KENTER); 1008 kpreempt_enable(); 1009 } 1010 } 1011 1012 void 1013 pmap_emap_enter(vaddr_t va, paddr_t pa, vm_prot_t prot) 1014 { 1015 pt_entry_t *pte, npte; 1016 1017 KASSERT((prot & ~VM_PROT_ALL) == 0); 1018 pte = (va < VM_MIN_KERNEL_ADDRESS) ? vtopte(va) : kvtopte(va); 1019 1020 #ifdef DOM0OPS 1021 if (pa < pmap_pa_start || pa >= pmap_pa_end) { 1022 npte = pa; 1023 } else 1024 #endif 1025 npte = pmap_pa2pte(pa); 1026 1027 npte = pmap_pa2pte(pa); 1028 npte |= protection_codes[prot] | PG_k | PG_V; 1029 pmap_pte_set(pte, npte); 1030 } 1031 1032 /* 1033 * pmap_emap_sync: perform TLB flush or pmap load, if it was deferred. 1034 */ 1035 void 1036 pmap_emap_sync(bool canload) 1037 { 1038 struct cpu_info *ci = curcpu(); 1039 struct pmap *pmap; 1040 1041 KASSERT(kpreempt_disabled()); 1042 if (__predict_true(ci->ci_want_pmapload && canload)) { 1043 /* 1044 * XXX: Hint for pmap_reactivate(), which might suggest to 1045 * not perform TLB flush, if state has not changed. 1046 */ 1047 pmap = vm_map_pmap(&curlwp->l_proc->p_vmspace->vm_map); 1048 if (__predict_false(pmap == ci->ci_pmap)) { 1049 kcpuset_atomic_clear(pmap->pm_cpus, cpu_index(ci)); 1050 } 1051 pmap_load(); 1052 KASSERT(ci->ci_want_pmapload == 0); 1053 } else { 1054 tlbflush(); 1055 } 1056 } 1057 1058 void 1059 pmap_emap_remove(vaddr_t sva, vsize_t len) 1060 { 1061 pt_entry_t *pte; 1062 vaddr_t va, eva = sva + len; 1063 1064 for (va = sva; va < eva; va += PAGE_SIZE) { 1065 pte = (va < VM_MIN_KERNEL_ADDRESS) ? vtopte(va) : kvtopte(va); 1066 pmap_pte_set(pte, 0); 1067 } 1068 } 1069 1070 __strict_weak_alias(pmap_kenter_ma, pmap_kenter_pa); 1071 1072 #if defined(__x86_64__) 1073 /* 1074 * Change protection for a virtual address. Local for a CPU only, don't 1075 * care about TLB shootdowns. 
1076 * 1077 * => must be called with preemption disabled 1078 */ 1079 void 1080 pmap_changeprot_local(vaddr_t va, vm_prot_t prot) 1081 { 1082 pt_entry_t *pte, opte, npte; 1083 1084 KASSERT(kpreempt_disabled()); 1085 1086 if (va < VM_MIN_KERNEL_ADDRESS) 1087 pte = vtopte(va); 1088 else 1089 pte = kvtopte(va); 1090 1091 npte = opte = *pte; 1092 1093 if ((prot & VM_PROT_WRITE) != 0) 1094 npte |= PG_RW; 1095 else 1096 npte &= ~PG_RW; 1097 1098 if (opte != npte) { 1099 pmap_pte_set(pte, npte); 1100 pmap_pte_flush(); 1101 invlpg(va); 1102 } 1103 } 1104 #endif /* defined(__x86_64__) */ 1105 1106 /* 1107 * pmap_kremove: remove a kernel mapping(s) without R/M (pv_entry) tracking 1108 * 1109 * => no need to lock anything 1110 * => caller must dispose of any vm_page mapped in the va range 1111 * => note: not an inline function 1112 * => we assume the va is page aligned and the len is a multiple of PAGE_SIZE 1113 * => we assume kernel only unmaps valid addresses and thus don't bother 1114 * checking the valid bit before doing TLB flushing 1115 * => must be followed by call to pmap_update() before reuse of page 1116 */ 1117 1118 static inline void 1119 pmap_kremove1(vaddr_t sva, vsize_t len, bool localonly) 1120 { 1121 pt_entry_t *pte, opte; 1122 vaddr_t va, eva; 1123 1124 eva = sva + len; 1125 1126 kpreempt_disable(); 1127 for (va = sva; va < eva; va += PAGE_SIZE) { 1128 pte = kvtopte(va); 1129 opte = pmap_pte_testset(pte, 0); /* zap! */ 1130 if ((opte & (PG_V | PG_U)) == (PG_V | PG_U) && !localonly) { 1131 pmap_tlb_shootdown(pmap_kernel(), va, opte, 1132 TLBSHOOT_KREMOVE); 1133 } 1134 KASSERT((opte & PG_PS) == 0); 1135 KASSERT((opte & PG_PVLIST) == 0); 1136 } 1137 if (localonly) { 1138 tlbflushg(); 1139 } 1140 kpreempt_enable(); 1141 } 1142 1143 void 1144 pmap_kremove(vaddr_t sva, vsize_t len) 1145 { 1146 1147 pmap_kremove1(sva, len, false); 1148 } 1149 1150 /* 1151 * pmap_kremove_local: like pmap_kremove(), but only worry about 1152 * TLB invalidations on the current CPU. this is only intended 1153 * for use while writing kernel crash dumps. 1154 */ 1155 1156 void 1157 pmap_kremove_local(vaddr_t sva, vsize_t len) 1158 { 1159 1160 KASSERT(panicstr != NULL); 1161 pmap_kremove1(sva, len, true); 1162 } 1163 1164 /* 1165 * p m a p i n i t f u n c t i o n s 1166 * 1167 * pmap_bootstrap and pmap_init are called during system startup 1168 * to init the pmap module. pmap_bootstrap() does a low level 1169 * init just to get things rolling. pmap_init() finishes the job. 1170 */ 1171 1172 /* 1173 * pmap_bootstrap_valloc: allocate a virtual address in the bootstrap area. 1174 * This function is to be used before any VM system has been set up. 1175 * 1176 * The va is taken from virtual_avail. 1177 */ 1178 static vaddr_t 1179 pmap_bootstrap_valloc(size_t npages) 1180 { 1181 vaddr_t va = virtual_avail; 1182 virtual_avail += npages * PAGE_SIZE; 1183 return va; 1184 } 1185 1186 /* 1187 * pmap_bootstrap_palloc: allocate a physical address in the bootstrap area. 1188 * This function is to be used before any VM system has been set up. 1189 * 1190 * The pa is taken from avail_start. 1191 */ 1192 static paddr_t 1193 pmap_bootstrap_palloc(size_t npages) 1194 { 1195 paddr_t pa = avail_start; 1196 avail_start += npages * PAGE_SIZE; 1197 return pa; 1198 } 1199 1200 /* 1201 * pmap_bootstrap: get the system in a state where it can run with VM properly 1202 * enabled (called before main()). The VM system is fully init'd later. 
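 *
 * the two bootstrap allocators above are typically used in pairs; a
 * sketch of the pattern (the real consumers are e.g. the idt_vaddr /
 * idt_paddr setup further down in this function):
 *
 *	va = pmap_bootstrap_valloc(1);	-- carve a page of KVA
 *	pa = pmap_bootstrap_palloc(1);	-- carve a physical page
 *
 * the caller is expected to wire va to pa itself later, e.g. from MD
 * startup code.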
1203 * 1204 * => on i386, locore.S has already enabled the MMU by allocating a PDP for the 1205 * kernel, and nkpde PTP's for the kernel. 1206 * => kva_start is the first free virtual address in kernel space. 1207 */ 1208 void 1209 pmap_bootstrap(vaddr_t kva_start) 1210 { 1211 struct pmap *kpm; 1212 int i; 1213 vaddr_t kva; 1214 #ifndef XEN 1215 unsigned long p1i; 1216 vaddr_t kva_end; 1217 #endif 1218 1219 pmap_pg_nx = (cpu_feature[2] & CPUID_NOX ? PG_NX : 0); 1220 1221 /* 1222 * Set up our local static global vars that keep track of the usage of 1223 * KVM before kernel_map is set up. 1224 */ 1225 virtual_avail = kva_start; /* first free KVA */ 1226 virtual_end = VM_MAX_KERNEL_ADDRESS; /* last KVA */ 1227 1228 /* 1229 * Set up protection_codes: we need to be able to convert from a MI 1230 * protection code (some combo of VM_PROT...) to something we can jam 1231 * into a x86 PTE. 1232 */ 1233 protection_codes[VM_PROT_NONE] = pmap_pg_nx; 1234 protection_codes[VM_PROT_EXECUTE] = PG_RO | PG_X; 1235 protection_codes[VM_PROT_READ] = PG_RO | pmap_pg_nx; 1236 protection_codes[VM_PROT_READ|VM_PROT_EXECUTE] = PG_RO | PG_X; 1237 protection_codes[VM_PROT_WRITE] = PG_RW | pmap_pg_nx; 1238 protection_codes[VM_PROT_WRITE|VM_PROT_EXECUTE] = PG_RW | PG_X; 1239 protection_codes[VM_PROT_WRITE|VM_PROT_READ] = PG_RW | pmap_pg_nx; 1240 protection_codes[VM_PROT_ALL] = PG_RW | PG_X; 1241 1242 /* 1243 * Now we init the kernel's pmap. 1244 * 1245 * The kernel pmap's pm_obj is not used for much. However, in user pmaps 1246 * the pm_obj contains the list of active PTPs. 1247 * 1248 * The pm_obj currently does not have a pager. It might be possible to 1249 * add a pager that would allow a process to read-only mmap its own page 1250 * tables (fast user-level vtophys?). This may or may not be useful. 1251 */ 1252 kpm = pmap_kernel(); 1253 for (i = 0; i < PTP_LEVELS - 1; i++) { 1254 mutex_init(&kpm->pm_obj_lock[i], MUTEX_DEFAULT, IPL_NONE); 1255 uvm_obj_init(&kpm->pm_obj[i], NULL, false, 1); 1256 uvm_obj_setlock(&kpm->pm_obj[i], &kpm->pm_obj_lock[i]); 1257 kpm->pm_ptphint[i] = NULL; 1258 } 1259 memset(&kpm->pm_list, 0, sizeof(kpm->pm_list)); /* pm_list not used */ 1260 1261 kpm->pm_pdir = (pd_entry_t *)(PDPpaddr + KERNBASE); 1262 for (i = 0; i < PDP_SIZE; i++) 1263 kpm->pm_pdirpa[i] = PDPpaddr + PAGE_SIZE * i; 1264 1265 kpm->pm_stats.wired_count = kpm->pm_stats.resident_count = 1266 x86_btop(kva_start - VM_MIN_KERNEL_ADDRESS); 1267 1268 kcpuset_create(&kpm->pm_cpus, true); 1269 kcpuset_create(&kpm->pm_kernel_cpus, true); 1270 1271 /* 1272 * the above is just a rough estimate and not critical to the proper 1273 * operation of the system. 1274 */ 1275 1276 #ifndef XEN 1277 /* 1278 * Begin to enable global TLB entries if they are supported. 
1279 * The G bit has no effect until the CR4_PGE bit is set in CR4, 1280 * which happens in cpu_init(), which is run on each cpu 1281 * (and happens later) 1282 */ 1283 if (cpu_feature[0] & CPUID_PGE) { 1284 pmap_pg_g = PG_G; /* enable software */ 1285 1286 /* add PG_G attribute to already mapped kernel pages */ 1287 if (KERNBASE == VM_MIN_KERNEL_ADDRESS) { 1288 kva_end = virtual_avail; 1289 } else { 1290 extern vaddr_t eblob, esym; 1291 kva_end = (vaddr_t)&end; 1292 if (esym > kva_end) 1293 kva_end = esym; 1294 if (eblob > kva_end) 1295 kva_end = eblob; 1296 kva_end = roundup(kva_end, PAGE_SIZE); 1297 } 1298 for (kva = KERNBASE; kva < kva_end; kva += PAGE_SIZE) { 1299 p1i = pl1_i(kva); 1300 if (pmap_valid_entry(PTE_BASE[p1i])) 1301 PTE_BASE[p1i] |= PG_G; 1302 } 1303 } 1304 1305 /* 1306 * Enable large pages if they are supported. 1307 */ 1308 if (cpu_feature[0] & CPUID_PSE) { 1309 lcr4(rcr4() | CR4_PSE); /* enable hardware (via %cr4) */ 1310 pmap_largepages = 1; /* enable software */ 1311 1312 /* 1313 * The TLB must be flushed after enabling large pages on Pentium 1314 * CPUs, according to section 3.6.2.2 of "Intel Architecture 1315 * Software Developer's Manual, Volume 3: System Programming". 1316 */ 1317 tlbflushg(); 1318 1319 /* Remap the kernel. */ 1320 pmap_remap_largepages(); 1321 } 1322 #endif /* !XEN */ 1323 1324 #ifdef __HAVE_DIRECT_MAP 1325 pmap_init_directmap(kpm); 1326 #else 1327 if (VM_MIN_KERNEL_ADDRESS != KERNBASE) { 1328 /* 1329 * zero_pte is stuck at the end of mapped space for the kernel 1330 * image (disjunct from kva space). This is done so that it 1331 * can safely be used in pmap_growkernel (pmap_get_physpage), 1332 * when it's called for the first time. 1333 * XXXfvdl fix this for MULTIPROCESSOR later. 1334 */ 1335 #ifdef XEN 1336 /* early_zerop initialized in xen_pmap_bootstrap() */ 1337 #else 1338 early_zerop = (void *)(KERNBASE + NKL2_KIMG_ENTRIES * NBPD_L2); 1339 #endif 1340 early_zero_pte = PTE_BASE + pl1_i((vaddr_t)early_zerop); 1341 } 1342 1343 /* 1344 * Now we allocate the "special" VAs which are used for tmp mappings 1345 * by the pmap (and other modules). We allocate the VAs by advancing 1346 * virtual_avail (note that there are no pages mapped at these VAs). 1347 * we find the PTE that maps the allocated VA via the linear PTE 1348 * mapping. 1349 */ 1350 1351 pt_entry_t *pte = PTE_BASE + pl1_i(virtual_avail); 1352 1353 #ifdef MULTIPROCESSOR 1354 /* 1355 * Waste some VA space to avoid false sharing of cache lines 1356 * for page table pages: Give each possible CPU a cache line 1357 * of PTE's (8) to play with, though we only need 4. We could 1358 * recycle some of this waste by putting the idle stacks here 1359 * as well; we could waste less space if we knew the largest 1360 * CPU ID beforehand. 
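 *
 * rough cost of that waste, with illustrative numbers: taking the 8
 * PTE slots mentioned above and assuming maxcpus == 64, the four
 * special windows below consume
 *
 *	64 cpus * 8 slots * 4096 bytes = 2 MB of KVA
 *
 * which is cheap compared to the cross-cpu cache line bouncing it
 * avoids.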
1361 */ 1362 csrcp = (char *) virtual_avail; csrc_pte = pte; 1363 1364 cdstp = (char *) virtual_avail+PAGE_SIZE; cdst_pte = pte+1; 1365 1366 zerop = (char *) virtual_avail+PAGE_SIZE*2; zero_pte = pte+2; 1367 1368 ptpp = (char *) virtual_avail+PAGE_SIZE*3; ptp_pte = pte+3; 1369 1370 virtual_avail += PAGE_SIZE * maxcpus * NPTECL; 1371 pte += maxcpus * NPTECL; 1372 #else 1373 csrcp = (void *) virtual_avail; csrc_pte = pte; /* allocate */ 1374 virtual_avail += PAGE_SIZE; pte++; /* advance */ 1375 1376 cdstp = (void *) virtual_avail; cdst_pte = pte; 1377 virtual_avail += PAGE_SIZE; pte++; 1378 1379 zerop = (void *) virtual_avail; zero_pte = pte; 1380 virtual_avail += PAGE_SIZE; pte++; 1381 1382 ptpp = (void *) virtual_avail; ptp_pte = pte; 1383 virtual_avail += PAGE_SIZE; pte++; 1384 #endif 1385 1386 if (VM_MIN_KERNEL_ADDRESS == KERNBASE) { 1387 early_zerop = zerop; 1388 early_zero_pte = zero_pte; 1389 } 1390 #endif 1391 1392 #ifdef XEN 1393 #ifdef __x86_64__ 1394 /* 1395 * We want a dummy page directory for Xen: when deactivating a pmap, Xen 1396 * will still consider it active. So we set user PGD to this one to lift 1397 * all protection on the now inactive page tables set. 1398 */ 1399 xen_dummy_user_pgd = pmap_bootstrap_palloc(1); 1400 1401 /* Zero fill it, the less checks in Xen it requires the better */ 1402 memset((void *) (xen_dummy_user_pgd + KERNBASE), 0, PAGE_SIZE); 1403 /* Mark read-only */ 1404 HYPERVISOR_update_va_mapping(xen_dummy_user_pgd + KERNBASE, 1405 pmap_pa2pte(xen_dummy_user_pgd) | PG_u | PG_V, UVMF_INVLPG); 1406 /* Pin as L4 */ 1407 xpq_queue_pin_l4_table(xpmap_ptom_masked(xen_dummy_user_pgd)); 1408 #endif /* __x86_64__ */ 1409 /* 1410 * Xen requires one more page as we can't store GDT and LDT on the same 1411 * page. 1412 */ 1413 idt_vaddr = pmap_bootstrap_valloc(3); 1414 idt_paddr = pmap_bootstrap_palloc(3); 1415 #else /* XEN */ 1416 1417 #if defined(__x86_64__) 1418 idt_vaddr = pmap_bootstrap_valloc(2); 1419 idt_paddr = pmap_bootstrap_palloc(2); 1420 #else 1421 idt_vaddr = pmap_bootstrap_valloc(1); 1422 idt_paddr = pmap_bootstrap_palloc(1); 1423 1424 /* pentium f00f bug stuff */ 1425 pentium_idt_vaddr = pmap_bootstrap_valloc(1); 1426 #endif 1427 1428 #endif /* XEN */ 1429 1430 /* 1431 * Now we reserve some VM for mapping pages when doing a crash dump. 1432 */ 1433 virtual_avail = reserve_dumppages(virtual_avail); 1434 1435 /* 1436 * Init the static-global locks and global lists. 1437 * 1438 * => pventry::pvh_lock (initialized elsewhere) must also be 1439 * a spin lock, again at IPL_VM to prevent deadlock, and 1440 * again is never taken from interrupt context. 1441 */ 1442 mutex_init(&pmaps_lock, MUTEX_DEFAULT, IPL_NONE); 1443 LIST_INIT(&pmaps); 1444 1445 /* 1446 * Ensure the TLB is sync'd with reality by flushing it... 1447 */ 1448 tlbflushg(); 1449 1450 /* 1451 * Calculate pmap_maxkvaddr from nkptp[]. 1452 */ 1453 kva = VM_MIN_KERNEL_ADDRESS; 1454 for (i = PTP_LEVELS - 1; i >= 1; i--) { 1455 kva += nkptp[i] * nbpd[i]; 1456 } 1457 pmap_maxkvaddr = kva; 1458 } 1459 1460 #ifdef __HAVE_DIRECT_MAP 1461 /* 1462 * Create the amd64 direct map. Called only once at boot time. 
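 *
 * Once the PDIR_SLOT_DIRECT entry is installed, any physical address
 * below the 512GB limit can be read or written without setting up a
 * temporary mapping; the translation is essentially va = direct-map
 * base + pa (see the PMAP_DIRECT_* macros in <machine/pmap.h>; the
 * exact macro name is an assumption here), e.g. roughly:
 *
 *	memset((void *)PMAP_DIRECT_MAP(pa), 0, PAGE_SIZE);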
1463 */ 1464 static void 1465 pmap_init_directmap(struct pmap *kpm) 1466 { 1467 extern phys_ram_seg_t mem_clusters[]; 1468 extern int mem_cluster_cnt; 1469 1470 paddr_t lastpa, dm_pd, dm_pdp, pdp; 1471 vaddr_t tmpva; 1472 pt_entry_t *pte; 1473 pd_entry_t *pde; 1474 phys_ram_seg_t *mc; 1475 long n_dm_pdp; 1476 int i; 1477 1478 const pd_entry_t pteflags = PG_V | PG_KW | pmap_pg_nx; 1479 1480 /* Get the last physical address available */ 1481 lastpa = 0; 1482 for (i = 0; i < mem_cluster_cnt; i++) { 1483 mc = &mem_clusters[i]; 1484 lastpa = MAX(lastpa, mc->start + mc->size); 1485 } 1486 1487 /* 1488 * We allocate only one L4 entry for the direct map (PDIR_SLOT_DIRECT), 1489 * so we cannot map more than 512GB. 1490 */ 1491 if (lastpa > NBPD_L4) { 1492 panic("RAM limit reached: > 512GB not supported"); 1493 } 1494 1495 /* Allocate L3. */ 1496 dm_pdp = pmap_bootstrap_palloc(1); 1497 1498 /* Number of L3 entries. */ 1499 n_dm_pdp = (lastpa + NBPD_L3 - 1) >> L3_SHIFT; 1500 1501 /* In locore.S, we allocated a tmp va. Use it now. */ 1502 tmpva = (KERNBASE + NKL2_KIMG_ENTRIES * NBPD_L2); 1503 pte = PTE_BASE + pl1_i(tmpva); 1504 *pte = dm_pdp | pteflags; 1505 pmap_update_pg(tmpva); 1506 memset((void *)tmpva, 0, PAGE_SIZE); 1507 1508 /* 1509 * Map the direct map RW. Use super pages (1GB) or large pages (2MB) if 1510 * they are supported. Note: PG_G is not allowed on non-leaf PTPs. 1511 */ 1512 if (cpu_feature[2] & CPUID_P1GB) { 1513 /* Super pages are supported. Just create L3. */ 1514 for (i = 0; i < n_dm_pdp; i++) { 1515 pdp = (paddr_t)&(((pd_entry_t *)dm_pdp)[i]); 1516 *pte = (pdp & PG_FRAME) | pteflags; 1517 pmap_update_pg(tmpva); 1518 1519 pde = (pd_entry_t *)(tmpva + (pdp & ~PG_FRAME)); 1520 *pde = ((paddr_t)i << L3_SHIFT) | pteflags | PG_U | 1521 PG_PS | PG_G; 1522 } 1523 } else { 1524 /* Allocate L2. */ 1525 dm_pd = pmap_bootstrap_palloc(n_dm_pdp); 1526 1527 /* Zero out the L2 pages. */ 1528 for (i = 0; i < n_dm_pdp; i++) { 1529 pdp = dm_pd + i * PAGE_SIZE; 1530 *pte = (pdp & PG_FRAME) | pteflags; 1531 pmap_update_pg(tmpva); 1532 1533 memset((void *)tmpva, 0, PAGE_SIZE); 1534 } 1535 1536 KASSERT(pmap_largepages != 0); 1537 1538 /* Large pages are supported. Just create L2. */ 1539 for (i = 0; i < NPDPG * n_dm_pdp; i++) { 1540 pdp = (paddr_t)&(((pd_entry_t *)dm_pd)[i]); 1541 *pte = (pdp & PG_FRAME) | pteflags; 1542 pmap_update_pg(tmpva); 1543 1544 pde = (pd_entry_t *)(tmpva + (pdp & ~PG_FRAME)); 1545 *pde = ((paddr_t)i << L2_SHIFT) | pteflags | 1546 PG_U | PG_PS | PG_G; 1547 } 1548 1549 /* Fill in the L3 entries, linked to L2. */ 1550 for (i = 0; i < n_dm_pdp; i++) { 1551 pdp = (paddr_t)&(((pd_entry_t *)dm_pdp)[i]); 1552 *pte = (pdp & PG_FRAME) | pteflags; 1553 pmap_update_pg(tmpva); 1554 1555 pde = (pd_entry_t *)(tmpva + (pdp & ~PG_FRAME)); 1556 *pde = (dm_pd + (i << PAGE_SHIFT)) | pteflags | PG_U; 1557 } 1558 } 1559 1560 kpm->pm_pdir[PDIR_SLOT_DIRECT] = dm_pdp | pteflags | PG_U; 1561 1562 tlbflush(); 1563 } 1564 #endif /* __HAVE_DIRECT_MAP */ 1565 1566 #ifndef XEN 1567 /* 1568 * Remap several kernel segments with large pages. We cover as many pages as we 1569 * can. Called only once at boot time, if the CPU supports large pages. 1570 */ 1571 static void 1572 pmap_remap_largepages(void) 1573 { 1574 extern char __rodata_start; 1575 extern char __data_start; 1576 extern char __kernel_end; 1577 pd_entry_t *pde; 1578 vaddr_t kva, kva_end; 1579 paddr_t pa; 1580 1581 /* Remap the kernel text using large pages. 
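	 * Each iteration below swaps 512 4KB PTEs for one 2MB (NBPD_L2)
	 * mapping, so e.g. a text section of 14MB ends up needing only 7
	 * L2 entries (figures are illustrative); any tail that is not
	 * 2MB-aligned simply stays on 4KB pages and is counted as
	 * "normal pages" by the DEBUG printout.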
*/ 1582 kva = KERNBASE; 1583 kva_end = rounddown((vaddr_t)&__rodata_start, NBPD_L1); 1584 pa = kva - KERNBASE; 1585 for (/* */; kva + NBPD_L2 <= kva_end; kva += NBPD_L2, pa += NBPD_L2) { 1586 pde = &L2_BASE[pl2_i(kva)]; 1587 *pde = pa | pmap_pg_g | PG_PS | PG_KR | PG_V; 1588 tlbflushg(); 1589 } 1590 #if defined(DEBUG) 1591 aprint_normal("kernel text is mapped with %" PRIuPSIZE " large " 1592 "pages and %" PRIuPSIZE " normal pages\n", 1593 howmany(kva - KERNBASE, NBPD_L2), 1594 howmany((vaddr_t)&__rodata_start - kva, NBPD_L1)); 1595 #endif /* defined(DEBUG) */ 1596 1597 /* Remap the kernel rodata using large pages. */ 1598 kva = roundup((vaddr_t)&__rodata_start, NBPD_L2); 1599 kva_end = rounddown((vaddr_t)&__data_start, NBPD_L1); 1600 pa = kva - KERNBASE; 1601 for (/* */; kva + NBPD_L2 <= kva_end; kva += NBPD_L2, pa += NBPD_L2) { 1602 pde = &L2_BASE[pl2_i(kva)]; 1603 *pde = pa | pmap_pg_g | PG_PS | pmap_pg_nx | PG_KR | PG_V; 1604 tlbflushg(); 1605 } 1606 1607 /* Remap the kernel data+bss using large pages. */ 1608 /* 1609 * XXX: we need to make sure the first page (PAGE_SIZE) of .data is not 1610 * mapped with a large page. As bizarre as it might seem, this first 1611 * page is used as the VA for the LAPIC page. 1612 */ 1613 kva = roundup((vaddr_t)&__data_start+PAGE_SIZE, NBPD_L2); 1614 kva_end = rounddown((vaddr_t)&__kernel_end, NBPD_L1); 1615 pa = kva - KERNBASE; 1616 for (/* */; kva + NBPD_L2 <= kva_end; kva += NBPD_L2, pa += NBPD_L2) { 1617 pde = &L2_BASE[pl2_i(kva)]; 1618 *pde = pa | pmap_pg_g | PG_PS | pmap_pg_nx | PG_KW | PG_V; 1619 tlbflushg(); 1620 } 1621 } 1622 #endif /* !XEN */ 1623 1624 /* 1625 * pmap_init: called from uvm_init, our job is to get the pmap 1626 * system ready to manage mappings... 1627 */ 1628 1629 void 1630 pmap_init(void) 1631 { 1632 int i, flags; 1633 1634 for (i = 0; i < PV_HASH_SIZE; i++) { 1635 SLIST_INIT(&pv_hash_heads[i].hh_list); 1636 } 1637 for (i = 0; i < PV_HASH_LOCK_CNT; i++) { 1638 mutex_init(&pv_hash_locks[i].lock, MUTEX_NODEBUG, IPL_VM); 1639 } 1640 1641 /* 1642 * initialize caches. 1643 */ 1644 1645 pool_cache_bootstrap(&pmap_cache, sizeof(struct pmap), 0, 0, 0, 1646 "pmappl", NULL, IPL_NONE, NULL, NULL, NULL); 1647 1648 #ifdef XEN 1649 /* 1650 * pool_cache(9) should not touch cached objects, since they 1651 * are pinned on xen and R/O for the domU 1652 */ 1653 flags = PR_NOTOUCH; 1654 #else /* XEN */ 1655 flags = 0; 1656 #endif /* XEN */ 1657 #ifdef PAE 1658 pool_cache_bootstrap(&pmap_pdp_cache, PAGE_SIZE * PDP_SIZE, 0, 0, flags, 1659 "pdppl", &pmap_pdp_allocator, IPL_NONE, 1660 pmap_pdp_ctor, pmap_pdp_dtor, NULL); 1661 #else /* PAE */ 1662 pool_cache_bootstrap(&pmap_pdp_cache, PAGE_SIZE, 0, 0, flags, 1663 "pdppl", NULL, IPL_NONE, pmap_pdp_ctor, pmap_pdp_dtor, NULL); 1664 #endif /* PAE */ 1665 pool_cache_bootstrap(&pmap_pv_cache, sizeof(struct pv_entry), 0, 0, 1666 PR_LARGECACHE, "pvpl", &pool_allocator_kmem, IPL_NONE, NULL, 1667 NULL, NULL); 1668 1669 pmap_tlb_init(); 1670 1671 /* XXX: Since cpu_hatch() is only for secondary CPUs. */ 1672 pmap_tlb_cpu_init(curcpu()); 1673 1674 evcnt_attach_dynamic(&pmap_iobmp_evcnt, EVCNT_TYPE_MISC, 1675 NULL, "x86", "io bitmap copy"); 1676 evcnt_attach_dynamic(&pmap_ldt_evcnt, EVCNT_TYPE_MISC, 1677 NULL, "x86", "ldt sync"); 1678 1679 /* 1680 * done: pmap module is up (and ready for business) 1681 */ 1682 1683 pmap_initialized = true; 1684 } 1685 1686 /* 1687 * pmap_cpu_init_late: perform late per-CPU initialization. 
1688 */ 1689 1690 #ifndef XEN 1691 void 1692 pmap_cpu_init_late(struct cpu_info *ci) 1693 { 1694 /* 1695 * The BP has already its own PD page allocated during early 1696 * MD startup. 1697 */ 1698 if (ci == &cpu_info_primary) 1699 return; 1700 1701 #ifdef PAE 1702 cpu_alloc_l3_page(ci); 1703 #endif 1704 } 1705 #endif 1706 1707 /* 1708 * p v _ e n t r y f u n c t i o n s 1709 */ 1710 1711 /* 1712 * pmap_free_pvs: free a list of pv_entrys 1713 */ 1714 1715 static void 1716 pmap_free_pvs(struct pv_entry *pve) 1717 { 1718 struct pv_entry *next; 1719 1720 for ( /* null */ ; pve != NULL ; pve = next) { 1721 next = pve->pve_next; 1722 pool_cache_put(&pmap_pv_cache, pve); 1723 } 1724 } 1725 1726 /* 1727 * main pv_entry manipulation functions: 1728 * pmap_enter_pv: enter a mapping onto a pv_head list 1729 * pmap_remove_pv: remove a mapping from a pv_head list 1730 * 1731 * NOTE: Both pmap_enter_pv and pmap_remove_pv expect the caller to lock 1732 * the pvh before calling 1733 */ 1734 1735 /* 1736 * insert_pv: a helper of pmap_enter_pv 1737 */ 1738 1739 static void 1740 insert_pv(struct pmap_page *pp, struct pv_entry *pve) 1741 { 1742 struct pv_hash_head *hh; 1743 kmutex_t *lock; 1744 u_int hash; 1745 1746 hash = pvhash_hash(pve->pve_pte.pte_ptp, pve->pve_pte.pte_va); 1747 lock = pvhash_lock(hash); 1748 hh = pvhash_head(hash); 1749 mutex_spin_enter(lock); 1750 SLIST_INSERT_HEAD(&hh->hh_list, pve, pve_hash); 1751 mutex_spin_exit(lock); 1752 1753 LIST_INSERT_HEAD(&pp->pp_head.pvh_list, pve, pve_list); 1754 } 1755 1756 /* 1757 * pmap_enter_pv: enter a mapping onto a pv_head lst 1758 * 1759 * => caller should adjust ptp's wire_count before calling 1760 * => caller has preallocated pve and *sparepve for us 1761 */ 1762 1763 static struct pv_entry * 1764 pmap_enter_pv(struct pmap_page *pp, struct pv_entry *pve, 1765 struct pv_entry **sparepve, struct vm_page *ptp, vaddr_t va) 1766 { 1767 1768 KASSERT(ptp == NULL || ptp->wire_count >= 2); 1769 KASSERT(ptp == NULL || ptp->uobject != NULL); 1770 KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset); 1771 1772 if ((pp->pp_flags & PP_EMBEDDED) == 0) { 1773 if (LIST_EMPTY(&pp->pp_head.pvh_list)) { 1774 pp->pp_flags |= PP_EMBEDDED; 1775 pp->pp_pte.pte_ptp = ptp; 1776 pp->pp_pte.pte_va = va; 1777 1778 return pve; 1779 } 1780 } else { 1781 struct pv_entry *pve2; 1782 1783 pve2 = *sparepve; 1784 *sparepve = NULL; 1785 1786 pve2->pve_pte = pp->pp_pte; 1787 pp->pp_flags &= ~PP_EMBEDDED; 1788 LIST_INIT(&pp->pp_head.pvh_list); 1789 insert_pv(pp, pve2); 1790 } 1791 1792 pve->pve_pte.pte_ptp = ptp; 1793 pve->pve_pte.pte_va = va; 1794 insert_pv(pp, pve); 1795 1796 return NULL; 1797 } 1798 1799 /* 1800 * pmap_remove_pv: try to remove a mapping from a pv_list 1801 * 1802 * => caller should adjust ptp's wire_count and free PTP if needed 1803 * => we return the removed pve 1804 */ 1805 1806 static struct pv_entry * 1807 pmap_remove_pv(struct pmap_page *pp, struct vm_page *ptp, vaddr_t va) 1808 { 1809 struct pv_hash_head *hh; 1810 struct pv_entry *pve; 1811 kmutex_t *lock; 1812 u_int hash; 1813 1814 KASSERT(ptp == NULL || ptp->uobject != NULL); 1815 KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset); 1816 1817 if ((pp->pp_flags & PP_EMBEDDED) != 0) { 1818 KASSERT(pp->pp_pte.pte_ptp == ptp); 1819 KASSERT(pp->pp_pte.pte_va == va); 1820 1821 pp->pp_flags &= ~PP_EMBEDDED; 1822 LIST_INIT(&pp->pp_head.pvh_list); 1823 1824 return NULL; 1825 } 1826 1827 hash = pvhash_hash(ptp, va); 1828 lock = pvhash_lock(hash); 1829 hh = pvhash_head(hash); 1830 mutex_spin_enter(lock); 1831 pve = 
pvhash_remove(hh, ptp, va); 1832 mutex_spin_exit(lock); 1833 1834 LIST_REMOVE(pve, pve_list); 1835 1836 return pve; 1837 } 1838 1839 /* 1840 * p t p f u n c t i o n s 1841 */ 1842 1843 static inline struct vm_page * 1844 pmap_find_ptp(struct pmap *pmap, vaddr_t va, paddr_t pa, int level) 1845 { 1846 int lidx = level - 1; 1847 struct vm_page *pg; 1848 1849 KASSERT(mutex_owned(pmap->pm_lock)); 1850 1851 if (pa != (paddr_t)-1 && pmap->pm_ptphint[lidx] && 1852 pa == VM_PAGE_TO_PHYS(pmap->pm_ptphint[lidx])) { 1853 return (pmap->pm_ptphint[lidx]); 1854 } 1855 PMAP_SUBOBJ_LOCK(pmap, lidx); 1856 pg = uvm_pagelookup(&pmap->pm_obj[lidx], ptp_va2o(va, level)); 1857 PMAP_SUBOBJ_UNLOCK(pmap, lidx); 1858 1859 KASSERT(pg == NULL || pg->wire_count >= 1); 1860 return pg; 1861 } 1862 1863 static inline void 1864 pmap_freepage(struct pmap *pmap, struct vm_page *ptp, int level) 1865 { 1866 lwp_t *l; 1867 int lidx; 1868 struct uvm_object *obj; 1869 1870 KASSERT(ptp->wire_count == 1); 1871 1872 lidx = level - 1; 1873 1874 obj = &pmap->pm_obj[lidx]; 1875 pmap_stats_update(pmap, -1, 0); 1876 if (lidx != 0) 1877 mutex_enter(obj->vmobjlock); 1878 if (pmap->pm_ptphint[lidx] == ptp) 1879 pmap->pm_ptphint[lidx] = TAILQ_FIRST(&obj->memq); 1880 ptp->wire_count = 0; 1881 uvm_pagerealloc(ptp, NULL, 0); 1882 l = curlwp; 1883 KASSERT((l->l_pflag & LP_INTR) == 0); 1884 VM_PAGE_TO_PP(ptp)->pp_link = l->l_md.md_gc_ptp; 1885 l->l_md.md_gc_ptp = ptp; 1886 if (lidx != 0) 1887 mutex_exit(obj->vmobjlock); 1888 } 1889 1890 static void 1891 pmap_free_ptp(struct pmap *pmap, struct vm_page *ptp, vaddr_t va, 1892 pt_entry_t *ptes, pd_entry_t * const *pdes) 1893 { 1894 unsigned long index; 1895 int level; 1896 vaddr_t invaladdr; 1897 pd_entry_t opde; 1898 1899 KASSERT(pmap != pmap_kernel()); 1900 KASSERT(mutex_owned(pmap->pm_lock)); 1901 KASSERT(kpreempt_disabled()); 1902 1903 level = 1; 1904 do { 1905 index = pl_i(va, level + 1); 1906 opde = pmap_pte_testset(&pdes[level - 1][index], 0); 1907 #if defined(XEN) 1908 # if defined(__x86_64__) 1909 /* 1910 * If ptp is a L3 currently mapped in kernel space, 1911 * on any cpu, clear it before freeing 1912 */ 1913 if (level == PTP_LEVELS - 1) { 1914 /* 1915 * Update the per-cpu PD on all cpus the current 1916 * pmap is active on 1917 */ 1918 xen_kpm_sync(pmap, index); 1919 } 1920 # endif /*__x86_64__ */ 1921 invaladdr = level == 1 ? (vaddr_t)ptes : 1922 (vaddr_t)pdes[level - 2]; 1923 pmap_tlb_shootdown(pmap, invaladdr + index * PAGE_SIZE, 1924 opde, TLBSHOOT_FREE_PTP1); 1925 pmap_tlb_shootnow(); 1926 #else /* XEN */ 1927 invaladdr = level == 1 ? 
(vaddr_t)ptes : 1928 (vaddr_t)pdes[level - 2]; 1929 pmap_tlb_shootdown(pmap, invaladdr + index * PAGE_SIZE, 1930 opde, TLBSHOOT_FREE_PTP1); 1931 #endif /* XEN */ 1932 pmap_freepage(pmap, ptp, level); 1933 if (level < PTP_LEVELS - 1) { 1934 ptp = pmap_find_ptp(pmap, va, (paddr_t)-1, level + 1); 1935 ptp->wire_count--; 1936 if (ptp->wire_count > 1) 1937 break; 1938 } 1939 } while (++level < PTP_LEVELS); 1940 pmap_pte_flush(); 1941 } 1942 1943 /* 1944 * pmap_get_ptp: get a PTP (if there isn't one, allocate a new one) 1945 * 1946 * => pmap should NOT be pmap_kernel() 1947 * => pmap should be locked 1948 * => preemption should be disabled 1949 */ 1950 1951 static struct vm_page * 1952 pmap_get_ptp(struct pmap *pmap, vaddr_t va, pd_entry_t * const *pdes) 1953 { 1954 struct vm_page *ptp, *pptp; 1955 int i; 1956 unsigned long index; 1957 pd_entry_t *pva; 1958 paddr_t ppa, pa; 1959 struct uvm_object *obj; 1960 1961 KASSERT(pmap != pmap_kernel()); 1962 KASSERT(mutex_owned(pmap->pm_lock)); 1963 KASSERT(kpreempt_disabled()); 1964 1965 ptp = NULL; 1966 pa = (paddr_t)-1; 1967 1968 /* 1969 * Loop through all page table levels seeing if we need to 1970 * add a new page to that level. 1971 */ 1972 for (i = PTP_LEVELS; i > 1; i--) { 1973 /* 1974 * Save values from previous round. 1975 */ 1976 pptp = ptp; 1977 ppa = pa; 1978 1979 index = pl_i(va, i); 1980 pva = pdes[i - 2]; 1981 1982 if (pmap_valid_entry(pva[index])) { 1983 ppa = pmap_pte2pa(pva[index]); 1984 ptp = NULL; 1985 continue; 1986 } 1987 1988 obj = &pmap->pm_obj[i-2]; 1989 PMAP_SUBOBJ_LOCK(pmap, i - 2); 1990 ptp = uvm_pagealloc(obj, ptp_va2o(va, i - 1), NULL, 1991 UVM_PGA_USERESERVE|UVM_PGA_ZERO); 1992 PMAP_SUBOBJ_UNLOCK(pmap, i - 2); 1993 1994 if (ptp == NULL) 1995 return NULL; 1996 1997 ptp->flags &= ~PG_BUSY; /* never busy */ 1998 ptp->wire_count = 1; 1999 pmap->pm_ptphint[i - 2] = ptp; 2000 pa = VM_PAGE_TO_PHYS(ptp); 2001 pmap_pte_set(&pva[index], (pd_entry_t) 2002 (pmap_pa2pte(pa) | PG_u | PG_RW | PG_V)); 2003 #if defined(XEN) && defined(__x86_64__) 2004 if(i == PTP_LEVELS) { 2005 /* 2006 * Update the per-cpu PD on all cpus the current 2007 * pmap is active on 2008 */ 2009 xen_kpm_sync(pmap, index); 2010 } 2011 #endif 2012 pmap_pte_flush(); 2013 pmap_stats_update(pmap, 1, 0); 2014 /* 2015 * If we're not in the top level, increase the 2016 * wire count of the parent page. 2017 */ 2018 if (i < PTP_LEVELS) { 2019 if (pptp == NULL) { 2020 pptp = pmap_find_ptp(pmap, va, ppa, i); 2021 KASSERT(pptp != NULL); 2022 } 2023 pptp->wire_count++; 2024 } 2025 } 2026 2027 /* 2028 * PTP is not NULL if we just allocated a new PTP. If it is 2029 * still NULL, we must look up the existing one. 2030 */ 2031 if (ptp == NULL) { 2032 ptp = pmap_find_ptp(pmap, va, ppa, 1); 2033 KASSERTMSG(ptp != NULL, "pmap_get_ptp: va %" PRIxVADDR 2034 "ppa %" PRIxPADDR "\n", va, ppa); 2035 } 2036 2037 pmap->pm_ptphint[0] = ptp; 2038 return ptp; 2039 } 2040 2041 /* 2042 * p m a p l i f e c y c l e f u n c t i o n s 2043 */ 2044 2045 /* 2046 * pmap_pdp_ctor: constructor for the PDP cache. 2047 */ 2048 static int 2049 pmap_pdp_ctor(void *arg, void *v, int flags) 2050 { 2051 pd_entry_t *pdir = v; 2052 paddr_t pdirpa = 0; 2053 vaddr_t object; 2054 int i; 2055 2056 #if !defined(XEN) || !defined(__x86_64__) 2057 int npde; 2058 #endif 2059 #ifdef XEN 2060 int s; 2061 #endif 2062 2063 /* 2064 * NOTE: The `pmaps_lock' is held when the PDP is allocated. 
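 *
 * As a rough sketch (non-Xen case, slot names symbolic; the code below
 * is authoritative), the constructor leaves the new page directory
 * looking like this:
 *
 *	pdir[0 .. PDIR_SLOT_PTE-1]		zeroed (user VA, filled lazily)
 *	pdir[PDIR_SLOT_PTE .. +PDP_SIZE-1]	recursive PDEs -> this pdir
 *	pdir[PDIR_SLOT_KERN .. +npde-1]		copied from PDP_BASE (kernel)
 *	remaining slots				zeroed
 *
 * plus the KERNBASE and PDIR_SLOT_DIRECT entries where applicable.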
2065 */ 2066 2067 #if defined(XEN) && defined(__x86_64__) 2068 /* Fetch the physical address of the page directory */ 2069 (void)pmap_extract(pmap_kernel(), (vaddr_t)pdir, &pdirpa); 2070 2071 /* Zero the area */ 2072 memset(pdir, 0, PAGE_SIZE); /* Xen wants a clean page */ 2073 2074 /* 2075 * This pdir will NEVER be active in kernel mode, so mark 2076 * recursive entry invalid. 2077 */ 2078 pdir[PDIR_SLOT_PTE] = pmap_pa2pte(pdirpa) | PG_u; 2079 2080 /* 2081 * PDP constructed this way won't be for the kernel, hence we 2082 * don't put kernel mappings on Xen. 2083 * 2084 * But we need to make pmap_create() happy, so put a dummy 2085 * (without PG_V) value at the right place. 2086 */ 2087 pdir[PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1] - 1] = 2088 (pd_entry_t)-1 & PG_FRAME; 2089 #else /* XEN && __x86_64__*/ 2090 /* Zero the area */ 2091 memset(pdir, 0, PDIR_SLOT_PTE * sizeof(pd_entry_t)); 2092 2093 object = (vaddr_t)v; 2094 for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) { 2095 /* Fetch the physical address of the page directory */ 2096 (void)pmap_extract(pmap_kernel(), object, &pdirpa); 2097 2098 /* Put in recursive PDE to map the PTEs */ 2099 pdir[PDIR_SLOT_PTE + i] = pmap_pa2pte(pdirpa) | PG_V | 2100 pmap_pg_nx; 2101 #ifndef XEN 2102 pdir[PDIR_SLOT_PTE + i] |= PG_KW; 2103 #endif 2104 } 2105 2106 /* Copy the kernel's top level PDE */ 2107 npde = nkptp[PTP_LEVELS - 1]; 2108 2109 memcpy(&pdir[PDIR_SLOT_KERN], &PDP_BASE[PDIR_SLOT_KERN], 2110 npde * sizeof(pd_entry_t)); 2111 2112 /* Zero the rest */ 2113 memset(&pdir[PDIR_SLOT_KERN + npde], 0, (PAGE_SIZE * PDP_SIZE) - 2114 (PDIR_SLOT_KERN + npde) * sizeof(pd_entry_t)); 2115 2116 if (VM_MIN_KERNEL_ADDRESS != KERNBASE) { 2117 int idx = pl_i(KERNBASE, PTP_LEVELS); 2118 pdir[idx] = PDP_BASE[idx]; 2119 } 2120 2121 #ifdef __HAVE_DIRECT_MAP 2122 pdir[PDIR_SLOT_DIRECT] = PDP_BASE[PDIR_SLOT_DIRECT]; 2123 #endif 2124 #endif /* XEN && __x86_64__*/ 2125 2126 #ifdef XEN 2127 s = splvm(); 2128 object = (vaddr_t)v; 2129 pmap_protect(pmap_kernel(), object, object + (PAGE_SIZE * PDP_SIZE), 2130 VM_PROT_READ); 2131 pmap_update(pmap_kernel()); 2132 for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) { 2133 /* 2134 * pin as L2/L4 page, we have to do the page with the 2135 * PDIR_SLOT_PTE entries last 2136 */ 2137 #ifdef PAE 2138 if (i == l2tol3(PDIR_SLOT_PTE)) 2139 continue; 2140 #endif 2141 2142 (void) pmap_extract(pmap_kernel(), object, &pdirpa); 2143 #ifdef __x86_64__ 2144 xpq_queue_pin_l4_table(xpmap_ptom_masked(pdirpa)); 2145 #else 2146 xpq_queue_pin_l2_table(xpmap_ptom_masked(pdirpa)); 2147 #endif 2148 } 2149 #ifdef PAE 2150 object = ((vaddr_t)pdir) + PAGE_SIZE * l2tol3(PDIR_SLOT_PTE); 2151 (void)pmap_extract(pmap_kernel(), object, &pdirpa); 2152 xpq_queue_pin_l2_table(xpmap_ptom_masked(pdirpa)); 2153 #endif 2154 splx(s); 2155 #endif /* XEN */ 2156 2157 return (0); 2158 } 2159 2160 /* 2161 * pmap_pdp_dtor: destructor for the PDP cache. 2162 */ 2163 2164 static void 2165 pmap_pdp_dtor(void *arg, void *v) 2166 { 2167 #ifdef XEN 2168 paddr_t pdirpa = 0; /* XXX: GCC */ 2169 vaddr_t object = (vaddr_t)v; 2170 int i; 2171 int s = splvm(); 2172 pt_entry_t *pte; 2173 2174 for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) { 2175 /* fetch the physical address of the page directory. 
*/ 2176 (void) pmap_extract(pmap_kernel(), object, &pdirpa); 2177 /* unpin page table */ 2178 xpq_queue_unpin_table(xpmap_ptom_masked(pdirpa)); 2179 } 2180 object = (vaddr_t)v; 2181 for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) { 2182 /* Set page RW again */ 2183 pte = kvtopte(object); 2184 pmap_pte_set(pte, *pte | PG_RW); 2185 xen_bcast_invlpg((vaddr_t)object); 2186 } 2187 splx(s); 2188 #endif /* XEN */ 2189 } 2190 2191 #ifdef PAE 2192 2193 /* pmap_pdp_alloc: Allocate a page for the pdp memory pool. */ 2194 2195 static void * 2196 pmap_pdp_alloc(struct pool *pp, int flags) 2197 { 2198 return (void *)uvm_km_alloc(kernel_map, 2199 PAGE_SIZE * PDP_SIZE, PAGE_SIZE * PDP_SIZE, 2200 ((flags & PR_WAITOK) ? 0 : UVM_KMF_NOWAIT | UVM_KMF_TRYLOCK) 2201 | UVM_KMF_WIRED); 2202 } 2203 2204 /* 2205 * pmap_pdp_free: free a PDP 2206 */ 2207 2208 static void 2209 pmap_pdp_free(struct pool *pp, void *v) 2210 { 2211 uvm_km_free(kernel_map, (vaddr_t)v, PAGE_SIZE * PDP_SIZE, 2212 UVM_KMF_WIRED); 2213 } 2214 #endif /* PAE */ 2215 2216 /* 2217 * pmap_create: create a pmap object. 2218 */ 2219 struct pmap * 2220 pmap_create(void) 2221 { 2222 struct pmap *pmap; 2223 int i; 2224 2225 pmap = pool_cache_get(&pmap_cache, PR_WAITOK); 2226 2227 /* init uvm_object */ 2228 for (i = 0; i < PTP_LEVELS - 1; i++) { 2229 mutex_init(&pmap->pm_obj_lock[i], MUTEX_DEFAULT, IPL_NONE); 2230 uvm_obj_init(&pmap->pm_obj[i], NULL, false, 1); 2231 uvm_obj_setlock(&pmap->pm_obj[i], &pmap->pm_obj_lock[i]); 2232 pmap->pm_ptphint[i] = NULL; 2233 } 2234 pmap->pm_stats.wired_count = 0; 2235 /* count the PDP allocd below */ 2236 pmap->pm_stats.resident_count = PDP_SIZE; 2237 #if !defined(__x86_64__) 2238 pmap->pm_hiexec = 0; 2239 #endif /* !defined(__x86_64__) */ 2240 pmap->pm_flags = 0; 2241 pmap->pm_gc_ptp = NULL; 2242 2243 kcpuset_create(&pmap->pm_cpus, true); 2244 kcpuset_create(&pmap->pm_kernel_cpus, true); 2245 #ifdef XEN 2246 kcpuset_create(&pmap->pm_xen_ptp_cpus, true); 2247 #endif 2248 /* init the LDT */ 2249 pmap->pm_ldt = NULL; 2250 pmap->pm_ldt_len = 0; 2251 pmap->pm_ldt_sel = GSYSSEL(GLDT_SEL, SEL_KPL); 2252 2253 /* allocate PDP */ 2254 try_again: 2255 pmap->pm_pdir = pool_cache_get(&pmap_pdp_cache, PR_WAITOK); 2256 2257 mutex_enter(&pmaps_lock); 2258 2259 if (pmap->pm_pdir[PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1] - 1] == 0) { 2260 mutex_exit(&pmaps_lock); 2261 pool_cache_destruct_object(&pmap_pdp_cache, pmap->pm_pdir); 2262 goto try_again; 2263 } 2264 2265 for (i = 0; i < PDP_SIZE; i++) 2266 pmap->pm_pdirpa[i] = 2267 pmap_pte2pa(pmap->pm_pdir[PDIR_SLOT_PTE + i]); 2268 2269 LIST_INSERT_HEAD(&pmaps, pmap, pm_list); 2270 2271 mutex_exit(&pmaps_lock); 2272 2273 return (pmap); 2274 } 2275 2276 /* 2277 * pmap_free_ptps: put a list of ptps back to the freelist. 2278 */ 2279 2280 static void 2281 pmap_free_ptps(struct vm_page *empty_ptps) 2282 { 2283 struct vm_page *ptp; 2284 struct pmap_page *pp; 2285 2286 while ((ptp = empty_ptps) != NULL) { 2287 pp = VM_PAGE_TO_PP(ptp); 2288 empty_ptps = pp->pp_link; 2289 LIST_INIT(&pp->pp_head.pvh_list); 2290 uvm_pagefree(ptp); 2291 } 2292 } 2293 2294 /* 2295 * pmap_destroy: drop reference count on pmap. free pmap if 2296 * reference count goes to zero. 2297 */ 2298 2299 void 2300 pmap_destroy(struct pmap *pmap) 2301 { 2302 lwp_t *l; 2303 int i; 2304 2305 /* 2306 * If we have torn down this pmap, process deferred frees and 2307 * invalidations. Free now if the system is low on memory. 2308 * Otherwise, free when the pmap is destroyed thus avoiding a 2309 * TLB shootdown. 
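 *
 * The deferred PTPs themselves were queued on l->l_md.md_gc_ptp by
 * pmap_freepage(); they are either freed right away via pmap_update(),
 * or handed over to pm_gc_ptp so that the final teardown below can
 * free them with pmap_free_ptps() without any TLB shootdown.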
2310 */ 2311 l = curlwp; 2312 if (__predict_false(l->l_md.md_gc_pmap == pmap)) { 2313 if (uvmexp.free < uvmexp.freetarg) { 2314 pmap_update(pmap); 2315 } else { 2316 KASSERT(pmap->pm_gc_ptp == NULL); 2317 pmap->pm_gc_ptp = l->l_md.md_gc_ptp; 2318 l->l_md.md_gc_ptp = NULL; 2319 l->l_md.md_gc_pmap = NULL; 2320 } 2321 } 2322 2323 /* 2324 * drop reference count 2325 */ 2326 2327 if (atomic_dec_uint_nv(&pmap->pm_obj[0].uo_refs) > 0) { 2328 return; 2329 } 2330 2331 #ifdef DIAGNOSTIC 2332 CPU_INFO_ITERATOR cii; 2333 struct cpu_info *ci; 2334 2335 for (CPU_INFO_FOREACH(cii, ci)) { 2336 if (ci->ci_pmap == pmap) 2337 panic("destroying pmap being used"); 2338 #if defined(XEN) && defined(__x86_64__) 2339 for (i = 0; i < PDIR_SLOT_PTE; i++) { 2340 if (pmap->pm_pdir[i] != 0 && 2341 ci->ci_kpm_pdir[i] == pmap->pm_pdir[i]) { 2342 printf("pmap_destroy(%p) pmap_kernel %p " 2343 "curcpu %d cpu %d ci_pmap %p " 2344 "ci->ci_kpm_pdir[%d]=%" PRIx64 2345 " pmap->pm_pdir[%d]=%" PRIx64 "\n", 2346 pmap, pmap_kernel(), curcpu()->ci_index, 2347 ci->ci_index, ci->ci_pmap, 2348 i, ci->ci_kpm_pdir[i], 2349 i, pmap->pm_pdir[i]); 2350 panic("pmap_destroy: used pmap"); 2351 } 2352 } 2353 #endif 2354 } 2355 #endif /* DIAGNOSTIC */ 2356 2357 /* 2358 * Reference count is zero, free pmap resources and then free pmap. 2359 * First, remove it from global list of pmaps. 2360 */ 2361 2362 mutex_enter(&pmaps_lock); 2363 LIST_REMOVE(pmap, pm_list); 2364 mutex_exit(&pmaps_lock); 2365 2366 /* 2367 * Process deferred PTP frees. No TLB shootdown required, as the 2368 * PTP pages are no longer visible to any CPU. 2369 */ 2370 2371 pmap_free_ptps(pmap->pm_gc_ptp); 2372 2373 /* 2374 * destroyed pmap shouldn't have remaining PTPs 2375 */ 2376 2377 for (i = 0; i < PTP_LEVELS - 1; i++) { 2378 KASSERT(pmap->pm_obj[i].uo_npages == 0); 2379 KASSERT(TAILQ_EMPTY(&pmap->pm_obj[i].memq)); 2380 } 2381 2382 pool_cache_put(&pmap_pdp_cache, pmap->pm_pdir); 2383 2384 #ifdef USER_LDT 2385 if (pmap->pm_ldt != NULL) { 2386 /* 2387 * no need to switch the LDT; this address space is gone, 2388 * nothing is using it. 2389 * 2390 * No need to lock the pmap for ldt_free (or anything else), 2391 * we're the last one to use it. 2392 */ 2393 mutex_enter(&cpu_lock); 2394 ldt_free(pmap->pm_ldt_sel); 2395 mutex_exit(&cpu_lock); 2396 uvm_km_free(kernel_map, (vaddr_t)pmap->pm_ldt, 2397 pmap->pm_ldt_len, UVM_KMF_WIRED); 2398 } 2399 #endif 2400 2401 for (i = 0; i < PTP_LEVELS - 1; i++) { 2402 uvm_obj_destroy(&pmap->pm_obj[i], false); 2403 mutex_destroy(&pmap->pm_obj_lock[i]); 2404 } 2405 kcpuset_destroy(pmap->pm_cpus); 2406 kcpuset_destroy(pmap->pm_kernel_cpus); 2407 #ifdef XEN 2408 kcpuset_destroy(pmap->pm_xen_ptp_cpus); 2409 #endif 2410 pool_cache_put(&pmap_cache, pmap); 2411 } 2412 2413 /* 2414 * pmap_remove_all: pmap is being torn down by the current thread. 2415 * avoid unnecessary invalidations. 2416 */ 2417 2418 void 2419 pmap_remove_all(struct pmap *pmap) 2420 { 2421 lwp_t *l = curlwp; 2422 2423 KASSERT(l->l_md.md_gc_pmap == NULL); 2424 2425 l->l_md.md_gc_pmap = pmap; 2426 } 2427 2428 #if defined(PMAP_FORK) 2429 /* 2430 * pmap_fork: perform any necessary data structure manipulation when 2431 * a VM space is forked. 
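 *
 * With USER_LDT this is where the parent's private LDT, if any, is
 * duplicated for the child.  The new LDT memory is allocated before
 * cpu_lock is taken; once the lock is held, if the parent's LDT has
 * changed size in the meantime, the allocation is undone and the
 * whole operation retried.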
2432 */ 2433 2434 void 2435 pmap_fork(struct pmap *pmap1, struct pmap *pmap2) 2436 { 2437 #ifdef USER_LDT 2438 union descriptor *new_ldt; 2439 size_t len; 2440 int sel; 2441 2442 if (__predict_true(pmap1->pm_ldt == NULL)) { 2443 return; 2444 } 2445 2446 retry: 2447 if (pmap1->pm_ldt != NULL) { 2448 len = pmap1->pm_ldt_len; 2449 new_ldt = (union descriptor *)uvm_km_alloc(kernel_map, len, 0, 2450 UVM_KMF_WIRED); 2451 mutex_enter(&cpu_lock); 2452 sel = ldt_alloc(new_ldt, len); 2453 if (sel == -1) { 2454 mutex_exit(&cpu_lock); 2455 uvm_km_free(kernel_map, (vaddr_t)new_ldt, len, 2456 UVM_KMF_WIRED); 2457 printf("WARNING: pmap_fork: unable to allocate LDT\n"); 2458 return; 2459 } 2460 } else { 2461 len = -1; 2462 new_ldt = NULL; 2463 sel = -1; 2464 mutex_enter(&cpu_lock); 2465 } 2466 2467 /* Copy the LDT, if necessary. */ 2468 if (pmap1->pm_ldt != NULL) { 2469 if (len != pmap1->pm_ldt_len) { 2470 if (len != -1) { 2471 ldt_free(sel); 2472 uvm_km_free(kernel_map, (vaddr_t)new_ldt, 2473 len, UVM_KMF_WIRED); 2474 } 2475 mutex_exit(&cpu_lock); 2476 goto retry; 2477 } 2478 2479 memcpy(new_ldt, pmap1->pm_ldt, len); 2480 pmap2->pm_ldt = new_ldt; 2481 pmap2->pm_ldt_len = pmap1->pm_ldt_len; 2482 pmap2->pm_ldt_sel = sel; 2483 len = -1; 2484 } 2485 2486 if (len != -1) { 2487 ldt_free(sel); 2488 uvm_km_free(kernel_map, (vaddr_t)new_ldt, len, 2489 UVM_KMF_WIRED); 2490 } 2491 mutex_exit(&cpu_lock); 2492 #endif /* USER_LDT */ 2493 } 2494 #endif /* PMAP_FORK */ 2495 2496 #ifdef USER_LDT 2497 2498 /* 2499 * pmap_ldt_xcall: cross call used by pmap_ldt_sync. if the named pmap 2500 * is active, reload LDTR. 2501 */ 2502 static void 2503 pmap_ldt_xcall(void *arg1, void *arg2) 2504 { 2505 struct pmap *pm; 2506 2507 kpreempt_disable(); 2508 pm = arg1; 2509 if (curcpu()->ci_pmap == pm) { 2510 lldt(pm->pm_ldt_sel); 2511 } 2512 kpreempt_enable(); 2513 } 2514 2515 /* 2516 * pmap_ldt_sync: LDT selector for the named pmap is changing. swap 2517 * in the new selector on all CPUs. 2518 */ 2519 void 2520 pmap_ldt_sync(struct pmap *pm) 2521 { 2522 uint64_t where; 2523 2524 KASSERT(mutex_owned(&cpu_lock)); 2525 2526 pmap_ldt_evcnt.ev_count++; 2527 where = xc_broadcast(0, pmap_ldt_xcall, pm, NULL); 2528 xc_wait(where); 2529 } 2530 2531 /* 2532 * pmap_ldt_cleanup: if the pmap has a local LDT, deallocate it, and 2533 * restore the default. 
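 *
 * The ordering matters: the default selector is installed in the pmap
 * and propagated with pmap_ldt_sync() before the old descriptor table
 * is released with ldt_free() and uvm_km_free(), so that no CPU still
 * has the stale LDT loaded when its memory goes away.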
2534 */ 2535 2536 void 2537 pmap_ldt_cleanup(struct lwp *l) 2538 { 2539 pmap_t pmap = l->l_proc->p_vmspace->vm_map.pmap; 2540 union descriptor *dp = NULL; 2541 size_t len = 0; 2542 int sel = -1; 2543 2544 if (__predict_true(pmap->pm_ldt == NULL)) { 2545 return; 2546 } 2547 2548 mutex_enter(&cpu_lock); 2549 if (pmap->pm_ldt != NULL) { 2550 sel = pmap->pm_ldt_sel; 2551 dp = pmap->pm_ldt; 2552 len = pmap->pm_ldt_len; 2553 pmap->pm_ldt_sel = GSYSSEL(GLDT_SEL, SEL_KPL); 2554 pmap->pm_ldt = NULL; 2555 pmap->pm_ldt_len = 0; 2556 pmap_ldt_sync(pmap); 2557 ldt_free(sel); 2558 uvm_km_free(kernel_map, (vaddr_t)dp, len, UVM_KMF_WIRED); 2559 } 2560 mutex_exit(&cpu_lock); 2561 } 2562 #endif /* USER_LDT */ 2563 2564 /* 2565 * pmap_activate: activate a process' pmap 2566 * 2567 * => must be called with kernel preemption disabled 2568 * => if lwp is the curlwp, then set ci_want_pmapload so that 2569 * actual MMU context switch will be done by pmap_load() later 2570 */ 2571 2572 void 2573 pmap_activate(struct lwp *l) 2574 { 2575 struct cpu_info *ci; 2576 struct pmap *pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map); 2577 2578 KASSERT(kpreempt_disabled()); 2579 2580 ci = curcpu(); 2581 2582 if (l == ci->ci_curlwp) { 2583 KASSERT(ci->ci_want_pmapload == 0); 2584 KASSERT(ci->ci_tlbstate != TLBSTATE_VALID); 2585 #ifdef KSTACK_CHECK_DR0 2586 /* 2587 * setup breakpoint on the top of stack 2588 */ 2589 if (l == &lwp0) 2590 dr0(0, 0, 0, 0); 2591 else 2592 dr0(KSTACK_LOWEST_ADDR(l), 1, 3, 1); 2593 #endif 2594 2595 /* 2596 * no need to switch to kernel vmspace because 2597 * it's a subset of any vmspace. 2598 */ 2599 2600 if (pmap == pmap_kernel()) { 2601 ci->ci_want_pmapload = 0; 2602 return; 2603 } 2604 2605 ci->ci_want_pmapload = 1; 2606 } 2607 } 2608 2609 /* 2610 * pmap_reactivate: try to regain reference to the pmap. 2611 * 2612 * => Must be called with kernel preemption disabled. 2613 */ 2614 2615 static bool 2616 pmap_reactivate(struct pmap *pmap) 2617 { 2618 struct cpu_info * const ci = curcpu(); 2619 const cpuid_t cid = cpu_index(ci); 2620 bool result; 2621 2622 KASSERT(kpreempt_disabled()); 2623 #if defined(XEN) && defined(__x86_64__) 2624 KASSERT(pmap_pdirpa(pmap, 0) == ci->ci_xen_current_user_pgd); 2625 #elif defined(PAE) 2626 KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(ci->ci_pae_l3_pdir[0])); 2627 #elif !defined(XEN) 2628 KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(rcr3())); 2629 #endif 2630 2631 /* 2632 * If we still have a lazy reference to this pmap, we can assume 2633 * that there was no TLB shootdown for this pmap in the meantime. 2634 * 2635 * The order of events here is important as we must synchronize 2636 * with TLB shootdown interrupts. Declare interest in invalidations 2637 * (TLBSTATE_VALID) and then check the CPU set, which the IPIs can 2638 * change only when the state is TLBSTATE_LAZY. 2639 */ 2640 2641 ci->ci_tlbstate = TLBSTATE_VALID; 2642 KASSERT(kcpuset_isset(pmap->pm_kernel_cpus, cid)); 2643 2644 if (kcpuset_isset(pmap->pm_cpus, cid)) { 2645 /* We have the reference, state is valid. */ 2646 result = true; 2647 } else { 2648 /* Must reload the TLB. */ 2649 kcpuset_atomic_set(pmap->pm_cpus, cid); 2650 result = false; 2651 } 2652 return result; 2653 } 2654 2655 /* 2656 * pmap_load: perform the actual pmap switch, i.e. fill in %cr3 register 2657 * and relevant LDT info. 2658 * 2659 * Ensures that the current process' pmap is loaded on the current CPU's 2660 * MMU and that there are no stale TLB entries. 
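 * If this CPU already holds the same pmap in the lazy state, a full
 * reload is avoided: pmap_reactivate() revalidates the mapping and the
 * TLB is only flushed if the CPU lost its lazy reference in between.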
2661 * 2662 * => The caller should disable kernel preemption or do check-and-retry 2663 * to prevent a preemption from undoing our efforts. 2664 * => This function may block. 2665 */ 2666 void 2667 pmap_load(void) 2668 { 2669 struct cpu_info *ci; 2670 struct pmap *pmap, *oldpmap; 2671 struct lwp *l; 2672 struct pcb *pcb; 2673 cpuid_t cid; 2674 uint64_t ncsw; 2675 2676 kpreempt_disable(); 2677 retry: 2678 ci = curcpu(); 2679 if (!ci->ci_want_pmapload) { 2680 kpreempt_enable(); 2681 return; 2682 } 2683 l = ci->ci_curlwp; 2684 ncsw = l->l_ncsw; 2685 2686 /* should be able to take ipis. */ 2687 KASSERT(ci->ci_ilevel < IPL_HIGH); 2688 #ifdef XEN 2689 /* Check to see if interrupts are enabled (ie; no events are masked) */ 2690 KASSERT(x86_read_psl() == 0); 2691 #else 2692 KASSERT((x86_read_psl() & PSL_I) != 0); 2693 #endif 2694 2695 KASSERT(l != NULL); 2696 pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map); 2697 KASSERT(pmap != pmap_kernel()); 2698 oldpmap = ci->ci_pmap; 2699 pcb = lwp_getpcb(l); 2700 2701 if (pmap == oldpmap) { 2702 if (!pmap_reactivate(pmap)) { 2703 u_int gen = uvm_emap_gen_return(); 2704 2705 /* 2706 * pmap has been changed during deactivated. 2707 * our tlb may be stale. 2708 */ 2709 2710 tlbflush(); 2711 uvm_emap_update(gen); 2712 } 2713 2714 ci->ci_want_pmapload = 0; 2715 kpreempt_enable(); 2716 return; 2717 } 2718 2719 /* 2720 * Acquire a reference to the new pmap and perform the switch. 2721 */ 2722 2723 pmap_reference(pmap); 2724 2725 cid = cpu_index(ci); 2726 kcpuset_atomic_clear(oldpmap->pm_cpus, cid); 2727 kcpuset_atomic_clear(oldpmap->pm_kernel_cpus, cid); 2728 2729 #if defined(XEN) && defined(__x86_64__) 2730 KASSERT(pmap_pdirpa(oldpmap, 0) == ci->ci_xen_current_user_pgd || 2731 oldpmap == pmap_kernel()); 2732 #elif defined(PAE) 2733 KASSERT(pmap_pdirpa(oldpmap, 0) == pmap_pte2pa(ci->ci_pae_l3_pdir[0])); 2734 #elif !defined(XEN) 2735 KASSERT(pmap_pdirpa(oldpmap, 0) == pmap_pte2pa(rcr3())); 2736 #endif 2737 KASSERT(!kcpuset_isset(pmap->pm_cpus, cid)); 2738 KASSERT(!kcpuset_isset(pmap->pm_kernel_cpus, cid)); 2739 2740 /* 2741 * Mark the pmap in use by this CPU. Again, we must synchronize 2742 * with TLB shootdown interrupts, so set the state VALID first, 2743 * then register us for shootdown events on this pmap. 2744 */ 2745 ci->ci_tlbstate = TLBSTATE_VALID; 2746 kcpuset_atomic_set(pmap->pm_cpus, cid); 2747 kcpuset_atomic_set(pmap->pm_kernel_cpus, cid); 2748 ci->ci_pmap = pmap; 2749 2750 /* 2751 * update tss. now that we have registered for invalidations 2752 * from other CPUs, we're good to load the page tables. 2753 */ 2754 #ifdef PAE 2755 pcb->pcb_cr3 = ci->ci_pae_l3_pdirpa; 2756 #else 2757 pcb->pcb_cr3 = pmap_pdirpa(pmap, 0); 2758 #endif 2759 2760 #ifdef i386 2761 #ifndef XEN 2762 ci->ci_tss.tss_ldt = pmap->pm_ldt_sel; 2763 ci->ci_tss.tss_cr3 = pcb->pcb_cr3; 2764 #endif /* !XEN */ 2765 #endif /* i386 */ 2766 2767 lldt(pmap->pm_ldt_sel); 2768 2769 u_int gen = uvm_emap_gen_return(); 2770 cpu_load_pmap(pmap, oldpmap); 2771 uvm_emap_update(gen); 2772 2773 ci->ci_want_pmapload = 0; 2774 2775 /* 2776 * we're now running with the new pmap. drop the reference 2777 * to the old pmap. if we block, we need to go around again. 2778 */ 2779 2780 pmap_destroy(oldpmap); 2781 if (l->l_ncsw != ncsw) { 2782 goto retry; 2783 } 2784 2785 kpreempt_enable(); 2786 } 2787 2788 /* 2789 * pmap_deactivate: deactivate a process' pmap. 2790 * 2791 * => Must be called with kernel preemption disabled (high IPL is enough). 
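 *
 * Purely illustrative sketch of how pmap_activate(), pmap_deactivate()
 * and pmap_load() fit together around a context switch (the real calls
 * live in the MD/MI switch and user-return paths, and old_lwp/new_lwp
 * are made-up names):
 *
 *	kpreempt_disable();
 *	pmap_deactivate(old_lwp);	old pmap goes TLBSTATE_LAZY
 *	pmap_activate(new_lwp);		may set ci_want_pmapload
 *	kpreempt_enable();
 *	...
 *	if (curcpu()->ci_want_pmapload)
 *		pmap_load();		the actual %cr3/LDT switch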
2792 */ 2793 void 2794 pmap_deactivate(struct lwp *l) 2795 { 2796 struct pmap *pmap; 2797 struct cpu_info *ci; 2798 2799 KASSERT(kpreempt_disabled()); 2800 2801 if (l != curlwp) { 2802 return; 2803 } 2804 2805 /* 2806 * Wait for pending TLB shootdowns to complete. Necessary because 2807 * TLB shootdown state is per-CPU, and the LWP may be coming off 2808 * the CPU before it has a chance to call pmap_update(), e.g. due 2809 * to kernel preemption or blocking routine in between. 2810 */ 2811 pmap_tlb_shootnow(); 2812 2813 ci = curcpu(); 2814 2815 if (ci->ci_want_pmapload) { 2816 /* 2817 * ci_want_pmapload means that our pmap is not loaded on 2818 * the CPU or TLB might be stale. note that pmap_kernel() 2819 * is always considered loaded. 2820 */ 2821 KASSERT(vm_map_pmap(&l->l_proc->p_vmspace->vm_map) 2822 != pmap_kernel()); 2823 KASSERT(vm_map_pmap(&l->l_proc->p_vmspace->vm_map) 2824 != ci->ci_pmap || ci->ci_tlbstate != TLBSTATE_VALID); 2825 2826 /* 2827 * userspace has not been touched. 2828 * nothing to do here. 2829 */ 2830 2831 ci->ci_want_pmapload = 0; 2832 return; 2833 } 2834 2835 pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map); 2836 2837 if (pmap == pmap_kernel()) { 2838 return; 2839 } 2840 2841 #if defined(XEN) && defined(__x86_64__) 2842 KASSERT(pmap_pdirpa(pmap, 0) == ci->ci_xen_current_user_pgd); 2843 #elif defined(PAE) 2844 KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(ci->ci_pae_l3_pdir[0])); 2845 #elif !defined(XEN) 2846 KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(rcr3())); 2847 #endif 2848 KASSERT(ci->ci_pmap == pmap); 2849 2850 /* 2851 * we aren't interested in TLB invalidations for this pmap, 2852 * at least for the time being. 2853 */ 2854 2855 KASSERT(ci->ci_tlbstate == TLBSTATE_VALID); 2856 ci->ci_tlbstate = TLBSTATE_LAZY; 2857 } 2858 2859 /* 2860 * end of lifecycle functions 2861 */ 2862 2863 /* 2864 * some misc. functions 2865 */ 2866 2867 int 2868 pmap_pdes_invalid(vaddr_t va, pd_entry_t * const *pdes, pd_entry_t *lastpde) 2869 { 2870 int i; 2871 unsigned long index; 2872 pd_entry_t pde; 2873 2874 for (i = PTP_LEVELS; i > 1; i--) { 2875 index = pl_i(va, i); 2876 pde = pdes[i - 2][index]; 2877 if ((pde & PG_V) == 0) 2878 return i; 2879 } 2880 if (lastpde != NULL) 2881 *lastpde = pde; 2882 return 0; 2883 } 2884 2885 /* 2886 * pmap_extract: extract a PA for the given VA 2887 */ 2888 2889 bool 2890 pmap_extract(struct pmap *pmap, vaddr_t va, paddr_t *pap) 2891 { 2892 pt_entry_t *ptes, pte; 2893 pd_entry_t pde; 2894 pd_entry_t * const *pdes; 2895 struct pmap *pmap2; 2896 struct cpu_info *ci; 2897 paddr_t pa; 2898 lwp_t *l; 2899 bool hard, rv; 2900 2901 #ifdef __HAVE_DIRECT_MAP 2902 if (va >= PMAP_DIRECT_BASE && va < PMAP_DIRECT_END) { 2903 if (pap != NULL) { 2904 *pap = va - PMAP_DIRECT_BASE; 2905 } 2906 return true; 2907 } 2908 #endif 2909 2910 rv = false; 2911 pa = 0; 2912 l = curlwp; 2913 2914 kpreempt_disable(); 2915 ci = l->l_cpu; 2916 if (__predict_true(!ci->ci_want_pmapload && ci->ci_pmap == pmap) || 2917 pmap == pmap_kernel()) { 2918 /* 2919 * no need to lock, because it's pmap_kernel() or our 2920 * own pmap and is active. if a user pmap, the caller 2921 * will hold the vm_map write/read locked and so prevent 2922 * entries from disappearing while we are here. ptps 2923 * can disappear via pmap_remove() and pmap_protect(), 2924 * but they are called with the vm_map write locked. 2925 */ 2926 hard = false; 2927 ptes = PTE_BASE; 2928 pdes = normal_pdes; 2929 } else { 2930 /* we lose, do it the hard way. 
*/ 2931 hard = true; 2932 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); 2933 } 2934 if (pmap_pdes_valid(va, pdes, &pde)) { 2935 pte = ptes[pl1_i(va)]; 2936 if (pde & PG_PS) { 2937 pa = (pde & PG_LGFRAME) | (va & (NBPD_L2 - 1)); 2938 rv = true; 2939 } else if (__predict_true((pte & PG_V) != 0)) { 2940 pa = pmap_pte2pa(pte) | (va & (NBPD_L1 - 1)); 2941 rv = true; 2942 } 2943 } 2944 if (__predict_false(hard)) { 2945 pmap_unmap_ptes(pmap, pmap2); 2946 } 2947 kpreempt_enable(); 2948 if (pap != NULL) { 2949 *pap = pa; 2950 } 2951 return rv; 2952 } 2953 2954 2955 /* 2956 * vtophys: virtual address to physical address. For use by 2957 * machine-dependent code only. 2958 */ 2959 2960 paddr_t 2961 vtophys(vaddr_t va) 2962 { 2963 paddr_t pa; 2964 2965 if (pmap_extract(pmap_kernel(), va, &pa) == true) 2966 return (pa); 2967 return (0); 2968 } 2969 2970 __strict_weak_alias(pmap_extract_ma, pmap_extract); 2971 2972 #ifdef XEN 2973 2974 /* 2975 * vtomach: virtual address to machine address. For use by 2976 * machine-dependent code only. 2977 */ 2978 2979 paddr_t 2980 vtomach(vaddr_t va) 2981 { 2982 paddr_t pa; 2983 2984 if (pmap_extract_ma(pmap_kernel(), va, &pa) == true) 2985 return (pa); 2986 return (0); 2987 } 2988 2989 #endif /* XEN */ 2990 2991 /* 2992 * pmap_virtual_space: used during bootup [pmap_steal_memory] to 2993 * determine the bounds of the kernel virtual addess space. 2994 */ 2995 2996 void 2997 pmap_virtual_space(vaddr_t *startp, vaddr_t *endp) 2998 { 2999 *startp = virtual_avail; 3000 *endp = virtual_end; 3001 } 3002 3003 /* 3004 * pmap_zero_page: zero a page 3005 */ 3006 3007 void 3008 pmap_zero_page(paddr_t pa) 3009 { 3010 #if defined(__HAVE_DIRECT_MAP) 3011 pagezero(PMAP_DIRECT_MAP(pa)); 3012 #else 3013 #if defined(XEN) 3014 if (XEN_VERSION_SUPPORTED(3, 4)) 3015 xen_pagezero(pa); 3016 #endif 3017 pt_entry_t *zpte; 3018 void *zerova; 3019 int id; 3020 3021 const pd_entry_t pteflags = PG_V | PG_RW | pmap_pg_nx | PG_M | PG_U | 3022 PG_k; 3023 3024 kpreempt_disable(); 3025 id = cpu_number(); 3026 zpte = PTESLEW(zero_pte, id); 3027 zerova = VASLEW(zerop, id); 3028 3029 #ifdef DIAGNOSTIC 3030 if (*zpte) 3031 panic("pmap_zero_page: lock botch"); 3032 #endif 3033 3034 pmap_pte_set(zpte, pmap_pa2pte(pa) | pteflags); 3035 pmap_pte_flush(); 3036 pmap_update_pg((vaddr_t)zerova); /* flush TLB */ 3037 3038 memset(zerova, 0, PAGE_SIZE); 3039 3040 #if defined(DIAGNOSTIC) || defined(XEN) 3041 pmap_pte_set(zpte, 0); /* zap ! */ 3042 pmap_pte_flush(); 3043 #endif 3044 3045 kpreempt_enable(); 3046 #endif /* defined(__HAVE_DIRECT_MAP) */ 3047 } 3048 3049 /* 3050 * pmap_pagezeroidle: the same, for the idle loop page zero'er. 3051 * Returns true if the page was zero'd, false if we aborted for 3052 * some reason. 3053 */ 3054 3055 bool 3056 pmap_pageidlezero(paddr_t pa) 3057 { 3058 #ifdef __HAVE_DIRECT_MAP 3059 KASSERT(cpu_feature[0] & CPUID_SSE2); 3060 return sse2_idlezero_page((void *)PMAP_DIRECT_MAP(pa)); 3061 #else 3062 pt_entry_t *zpte; 3063 void *zerova; 3064 bool rv; 3065 int id; 3066 3067 const pd_entry_t pteflags = PG_V | PG_RW | pmap_pg_nx | PG_M | PG_U | 3068 PG_k; 3069 3070 id = cpu_number(); 3071 zpte = PTESLEW(zero_pte, id); 3072 zerova = VASLEW(zerop, id); 3073 3074 KASSERT(cpu_feature[0] & CPUID_SSE2); 3075 KASSERT(*zpte == 0); 3076 3077 pmap_pte_set(zpte, pmap_pa2pte(pa) | pteflags); 3078 pmap_pte_flush(); 3079 pmap_update_pg((vaddr_t)zerova); /* flush TLB */ 3080 3081 rv = sse2_idlezero_page(zerova); 3082 3083 #if defined(DIAGNOSTIC) || defined(XEN) 3084 pmap_pte_set(zpte, 0); /* zap ! 
*/ 3085 pmap_pte_flush(); 3086 #endif 3087 3088 return rv; 3089 #endif 3090 } 3091 3092 /* 3093 * pmap_copy_page: copy a page 3094 */ 3095 3096 void 3097 pmap_copy_page(paddr_t srcpa, paddr_t dstpa) 3098 { 3099 #if defined(__HAVE_DIRECT_MAP) 3100 vaddr_t srcva = PMAP_DIRECT_MAP(srcpa); 3101 vaddr_t dstva = PMAP_DIRECT_MAP(dstpa); 3102 3103 memcpy((void *)dstva, (void *)srcva, PAGE_SIZE); 3104 #else 3105 #if defined(XEN) 3106 if (XEN_VERSION_SUPPORTED(3, 4)) { 3107 xen_copy_page(srcpa, dstpa); 3108 return; 3109 } 3110 #endif 3111 pt_entry_t *spte; 3112 pt_entry_t *dpte; 3113 void *csrcva; 3114 void *cdstva; 3115 int id; 3116 3117 const pd_entry_t pteflags = PG_V | PG_RW | pmap_pg_nx | PG_U | PG_k; 3118 3119 kpreempt_disable(); 3120 id = cpu_number(); 3121 spte = PTESLEW(csrc_pte,id); 3122 dpte = PTESLEW(cdst_pte,id); 3123 csrcva = VASLEW(csrcp, id); 3124 cdstva = VASLEW(cdstp, id); 3125 3126 KASSERT(*spte == 0 && *dpte == 0); 3127 3128 pmap_pte_set(spte, pmap_pa2pte(srcpa) | pteflags); 3129 pmap_pte_set(dpte, pmap_pa2pte(dstpa) | pteflags | PG_M); 3130 pmap_pte_flush(); 3131 pmap_update_2pg((vaddr_t)csrcva, (vaddr_t)cdstva); 3132 3133 memcpy(cdstva, csrcva, PAGE_SIZE); 3134 3135 #if defined(DIAGNOSTIC) || defined(XEN) 3136 pmap_pte_set(spte, 0); 3137 pmap_pte_set(dpte, 0); 3138 pmap_pte_flush(); 3139 #endif 3140 3141 kpreempt_enable(); 3142 #endif /* defined(__HAVE_DIRECT_MAP) */ 3143 } 3144 3145 static pt_entry_t * 3146 pmap_map_ptp(struct vm_page *ptp) 3147 { 3148 #ifdef __HAVE_DIRECT_MAP 3149 return (void *)PMAP_DIRECT_MAP(VM_PAGE_TO_PHYS(ptp)); 3150 #else 3151 pt_entry_t *ptppte; 3152 void *ptpva; 3153 int id; 3154 3155 KASSERT(kpreempt_disabled()); 3156 3157 #ifndef XEN 3158 const pd_entry_t pteflags = PG_V | PG_RW | pmap_pg_nx | PG_U | PG_M | 3159 PG_k; 3160 #else 3161 const pd_entry_t pteflags = PG_V | pmap_pg_nx | PG_U | PG_M | PG_k; 3162 #endif 3163 3164 id = cpu_number(); 3165 ptppte = PTESLEW(ptp_pte, id); 3166 ptpva = VASLEW(ptpp, id); 3167 pmap_pte_set(ptppte, pmap_pa2pte(VM_PAGE_TO_PHYS(ptp)) | pteflags); 3168 3169 pmap_pte_flush(); 3170 pmap_update_pg((vaddr_t)ptpva); 3171 3172 return (pt_entry_t *)ptpva; 3173 #endif 3174 } 3175 3176 static void 3177 pmap_unmap_ptp(void) 3178 { 3179 #ifndef __HAVE_DIRECT_MAP 3180 #if defined(DIAGNOSTIC) || defined(XEN) 3181 pt_entry_t *pte; 3182 3183 KASSERT(kpreempt_disabled()); 3184 3185 pte = PTESLEW(ptp_pte, cpu_number()); 3186 if (*pte != 0) { 3187 pmap_pte_set(pte, 0); 3188 pmap_pte_flush(); 3189 } 3190 #endif 3191 #endif 3192 } 3193 3194 static pt_entry_t * 3195 pmap_map_pte(struct pmap *pmap, struct vm_page *ptp, vaddr_t va) 3196 { 3197 3198 KASSERT(kpreempt_disabled()); 3199 if (pmap_is_curpmap(pmap)) { 3200 return &PTE_BASE[pl1_i(va)]; /* (k)vtopte */ 3201 } 3202 KASSERT(ptp != NULL); 3203 return pmap_map_ptp(ptp) + pl1_pi(va); 3204 } 3205 3206 static void 3207 pmap_unmap_pte(void) 3208 { 3209 3210 KASSERT(kpreempt_disabled()); 3211 3212 pmap_unmap_ptp(); 3213 } 3214 3215 /* 3216 * p m a p r e m o v e f u n c t i o n s 3217 * 3218 * functions that remove mappings 3219 */ 3220 3221 /* 3222 * pmap_remove_ptes: remove PTEs from a PTP 3223 * 3224 * => caller must hold pmap's lock 3225 * => PTP must be mapped into KVA 3226 * => PTP should be null if pmap == pmap_kernel() 3227 * => must be called with kernel preemption disabled 3228 * => returns composite pte if at least one page should be shot down 3229 */ 3230 3231 static void 3232 pmap_remove_ptes(struct pmap *pmap, struct vm_page *ptp, vaddr_t ptpva, 3233 vaddr_t startva, vaddr_t 
endva, struct pv_entry **pv_tofree) 3234 { 3235 pt_entry_t *pte = (pt_entry_t *)ptpva; 3236 3237 KASSERT(pmap == pmap_kernel() || mutex_owned(pmap->pm_lock)); 3238 KASSERT(kpreempt_disabled()); 3239 3240 /* 3241 * note that ptpva points to the PTE that maps startva. this may 3242 * or may not be the first PTE in the PTP. 3243 * 3244 * we loop through the PTP while there are still PTEs to look at 3245 * and the wire_count is greater than 1 (because we use the wire_count 3246 * to keep track of the number of real PTEs in the PTP). 3247 */ 3248 while (startva < endva && (ptp == NULL || ptp->wire_count > 1)) { 3249 (void)pmap_remove_pte(pmap, ptp, pte, startva, pv_tofree); 3250 startva += PAGE_SIZE; 3251 pte++; 3252 } 3253 } 3254 3255 3256 /* 3257 * pmap_remove_pte: remove a single PTE from a PTP. 3258 * 3259 * => caller must hold pmap's lock 3260 * => PTP must be mapped into KVA 3261 * => PTP should be null if pmap == pmap_kernel() 3262 * => returns true if we removed a mapping 3263 * => must be called with kernel preemption disabled 3264 */ 3265 static bool 3266 pmap_remove_pte(struct pmap *pmap, struct vm_page *ptp, pt_entry_t *pte, 3267 vaddr_t va, struct pv_entry **pv_tofree) 3268 { 3269 struct pv_entry *pve; 3270 struct vm_page *pg; 3271 struct pmap_page *pp; 3272 pt_entry_t opte; 3273 3274 KASSERT(pmap == pmap_kernel() || mutex_owned(pmap->pm_lock)); 3275 KASSERT(kpreempt_disabled()); 3276 3277 if (!pmap_valid_entry(*pte)) { 3278 /* VA not mapped. */ 3279 return false; 3280 } 3281 3282 /* Atomically save the old PTE and zap it. */ 3283 opte = pmap_pte_testset(pte, 0); 3284 if (!pmap_valid_entry(opte)) { 3285 return false; 3286 } 3287 3288 pmap_exec_account(pmap, va, opte, 0); 3289 pmap_stats_update_bypte(pmap, 0, opte); 3290 3291 if (ptp) { 3292 /* 3293 * Dropping a PTE. Make sure that the PDE is flushed. 3294 */ 3295 ptp->wire_count--; 3296 if (ptp->wire_count <= 1) { 3297 opte |= PG_U; 3298 } 3299 } 3300 3301 if ((opte & PG_U) != 0) { 3302 pmap_tlb_shootdown(pmap, va, opte, TLBSHOOT_REMOVE_PTE); 3303 } 3304 3305 /* 3306 * If we are not on a pv_head list - we are done. 3307 */ 3308 if ((opte & PG_PVLIST) == 0) { 3309 #if defined(DIAGNOSTIC) && !defined(DOM0OPS) 3310 if (PHYS_TO_VM_PAGE(pmap_pte2pa(opte)) != NULL || 3311 pmap_pv_tracked(pmap_pte2pa(opte)) != NULL) 3312 panic("pmap_remove_pte: managed or pv-tracked page" 3313 " without PG_PVLIST for %#"PRIxVADDR, va); 3314 #endif 3315 return true; 3316 } 3317 3318 if ((pg = PHYS_TO_VM_PAGE(pmap_pte2pa(opte))) != NULL) { 3319 KASSERT(uvm_page_locked_p(pg)); 3320 pp = VM_PAGE_TO_PP(pg); 3321 } else if ((pp = pmap_pv_tracked(pmap_pte2pa(opte))) == NULL) { 3322 paddr_t pa = pmap_pte2pa(opte); 3323 panic("pmap_remove_pte: PG_PVLIST with pv-untracked page" 3324 " va = 0x%"PRIxVADDR 3325 " pa = 0x%"PRIxPADDR" (0x%"PRIxPADDR")", 3326 va, pa, atop(pa)); 3327 } 3328 3329 /* Sync R/M bits. */ 3330 pp->pp_attrs |= opte; 3331 pve = pmap_remove_pv(pp, ptp, va); 3332 3333 if (pve) { 3334 pve->pve_next = *pv_tofree; 3335 *pv_tofree = pve; 3336 } 3337 return true; 3338 } 3339 3340 /* 3341 * pmap_remove: mapping removal function. 
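 * Removes all mappings in the range [sva, eva) from the given pmap and
 * frees any page table pages that are left without valid entries.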
3342 * 3343 * => caller should not be holding any pmap locks 3344 */ 3345 3346 void 3347 pmap_remove(struct pmap *pmap, vaddr_t sva, vaddr_t eva) 3348 { 3349 pt_entry_t *ptes; 3350 pd_entry_t pde; 3351 pd_entry_t * const *pdes; 3352 struct pv_entry *pv_tofree = NULL; 3353 bool result; 3354 int i; 3355 paddr_t ptppa; 3356 vaddr_t blkendva, va = sva; 3357 struct vm_page *ptp; 3358 struct pmap *pmap2; 3359 3360 kpreempt_disable(); 3361 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); /* locks pmap */ 3362 3363 /* 3364 * removing one page? take shortcut function. 3365 */ 3366 3367 if (va + PAGE_SIZE == eva) { 3368 if (pmap_pdes_valid(va, pdes, &pde)) { 3369 3370 /* PA of the PTP */ 3371 ptppa = pmap_pte2pa(pde); 3372 3373 /* Get PTP if non-kernel mapping. */ 3374 if (pmap != pmap_kernel()) { 3375 ptp = pmap_find_ptp(pmap, va, ptppa, 1); 3376 KASSERTMSG(ptp != NULL, 3377 "pmap_remove: unmanaged PTP detected"); 3378 } else { 3379 /* Never free kernel PTPs. */ 3380 ptp = NULL; 3381 } 3382 3383 result = pmap_remove_pte(pmap, ptp, 3384 &ptes[pl1_i(va)], va, &pv_tofree); 3385 3386 /* 3387 * if mapping removed and the PTP is no longer 3388 * being used, free it! 3389 */ 3390 3391 if (result && ptp && ptp->wire_count <= 1) 3392 pmap_free_ptp(pmap, ptp, va, ptes, pdes); 3393 } 3394 } else for (/* null */ ; va < eva ; va = blkendva) { 3395 int lvl; 3396 3397 /* determine range of block */ 3398 blkendva = x86_round_pdr(va+1); 3399 if (blkendva > eva) 3400 blkendva = eva; 3401 3402 /* 3403 * Our PTE mappings should never be removed with pmap_remove. 3404 * 3405 * XXXmaxv: still needed? 3406 * 3407 * A long term solution is to move the PTEs out of user address 3408 * space, and into kernel address space. Then we can set 3409 * VM_MAXUSER_ADDRESS to be VM_MAX_ADDRESS. 3410 */ 3411 for (i = 0; i < PDP_SIZE; i++) { 3412 if (pl_i(va, PTP_LEVELS) == PDIR_SLOT_PTE+i) 3413 panic("PTE space accessed"); 3414 } 3415 3416 lvl = pmap_pdes_invalid(va, pdes, &pde); 3417 if (lvl != 0) { 3418 /* 3419 * skip a range corresponding to an invalid pde. 3420 */ 3421 blkendva = (va & ptp_masks[lvl - 1]) + nbpd[lvl - 1]; 3422 continue; 3423 } 3424 3425 /* PA of the PTP */ 3426 ptppa = pmap_pte2pa(pde); 3427 3428 /* Get PTP if non-kernel mapping. */ 3429 if (pmap != pmap_kernel()) { 3430 ptp = pmap_find_ptp(pmap, va, ptppa, 1); 3431 KASSERTMSG(ptp != NULL, 3432 "pmap_remove: unmanaged PTP detected"); 3433 } else { 3434 /* Never free kernel PTPs. */ 3435 ptp = NULL; 3436 } 3437 3438 pmap_remove_ptes(pmap, ptp, (vaddr_t)&ptes[pl1_i(va)], va, 3439 blkendva, &pv_tofree); 3440 3441 /* if PTP is no longer being used, free it! */ 3442 if (ptp && ptp->wire_count <= 1) { 3443 pmap_free_ptp(pmap, ptp, va, ptes, pdes); 3444 } 3445 } 3446 pmap_unmap_ptes(pmap, pmap2); /* unlock pmap */ 3447 kpreempt_enable(); 3448 3449 /* Now we free unused PVs */ 3450 if (pv_tofree) 3451 pmap_free_pvs(pv_tofree); 3452 } 3453 3454 /* 3455 * pmap_sync_pv: clear pte bits and return the old value of the pte. 3456 * 3457 * => Caller should disable kernel preemption. 3458 * => issues tlb shootdowns if necessary. 
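 *
 * => Returns 0 with the old PTE in *optep, or EAGAIN if we raced with
 *    a V->P operation such as pmap_remove(); in that case the caller
 *    must back off and retry, roughly as pmap_pp_remove() and
 *    pmap_pp_clear_attrs() below do (simplified):
 *
 *	error = pmap_sync_pv(pvpte, expect, clearbits, &opte);
 *	if (error == EAGAIN) {
 *		SPINLOCK_BACKOFF(count);
 *		goto startover;
 *	}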
3459 */ 3460 3461 static int 3462 pmap_sync_pv(struct pv_pte *pvpte, pt_entry_t expect, int clearbits, 3463 pt_entry_t *optep) 3464 { 3465 struct pmap *pmap; 3466 struct vm_page *ptp; 3467 vaddr_t va; 3468 pt_entry_t *ptep; 3469 pt_entry_t opte; 3470 pt_entry_t npte; 3471 bool need_shootdown; 3472 3473 ptp = pvpte->pte_ptp; 3474 va = pvpte->pte_va; 3475 KASSERT(ptp == NULL || ptp->uobject != NULL); 3476 KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset); 3477 pmap = ptp_to_pmap(ptp); 3478 3479 KASSERT((expect & ~(PG_FRAME | PG_V)) == 0); 3480 KASSERT((expect & PG_V) != 0); 3481 KASSERT(clearbits == ~0 || (clearbits & ~(PG_M | PG_U | PG_RW)) == 0); 3482 KASSERT(kpreempt_disabled()); 3483 3484 ptep = pmap_map_pte(pmap, ptp, va); 3485 do { 3486 opte = *ptep; 3487 KASSERT((opte & (PG_M | PG_U)) != PG_M); 3488 KASSERT((opte & (PG_U | PG_V)) != PG_U); 3489 KASSERT(opte == 0 || (opte & PG_V) != 0); 3490 if ((opte & (PG_FRAME | PG_V)) != expect) { 3491 3492 /* 3493 * we lost a race with a V->P operation like 3494 * pmap_remove(). wait for the competitor 3495 * reflecting pte bits into mp_attrs. 3496 * 3497 * issue a redundant TLB shootdown so that 3498 * we can wait for its completion. 3499 */ 3500 3501 pmap_unmap_pte(); 3502 if (clearbits != 0) { 3503 pmap_tlb_shootdown(pmap, va, 3504 (pmap == pmap_kernel() ? PG_G : 0), 3505 TLBSHOOT_SYNC_PV1); 3506 } 3507 return EAGAIN; 3508 } 3509 3510 /* 3511 * check if there's anything to do on this pte. 3512 */ 3513 3514 if ((opte & clearbits) == 0) { 3515 need_shootdown = false; 3516 break; 3517 } 3518 3519 /* 3520 * we need a shootdown if the pte is cached. (PG_U) 3521 * 3522 * ...unless we are clearing only the PG_RW bit and 3523 * it isn't cached as RW. (PG_M) 3524 */ 3525 3526 need_shootdown = (opte & PG_U) != 0 && 3527 !(clearbits == PG_RW && (opte & PG_M) == 0); 3528 3529 npte = opte & ~clearbits; 3530 3531 /* 3532 * if we need a shootdown anyway, clear PG_U and PG_M. 3533 */ 3534 3535 if (need_shootdown) { 3536 npte &= ~(PG_U | PG_M); 3537 } 3538 KASSERT((npte & (PG_M | PG_U)) != PG_M); 3539 KASSERT((npte & (PG_U | PG_V)) != PG_U); 3540 KASSERT(npte == 0 || (opte & PG_V) != 0); 3541 } while (pmap_pte_cas(ptep, opte, npte) != opte); 3542 3543 if (need_shootdown) { 3544 pmap_tlb_shootdown(pmap, va, opte, TLBSHOOT_SYNC_PV2); 3545 } 3546 pmap_unmap_pte(); 3547 3548 *optep = opte; 3549 return 0; 3550 } 3551 3552 static void 3553 pmap_pp_remove(struct pmap_page *pp, paddr_t pa) 3554 { 3555 struct pv_pte *pvpte; 3556 struct pv_entry *killlist = NULL; 3557 struct vm_page *ptp; 3558 pt_entry_t expect; 3559 int count; 3560 3561 expect = pmap_pa2pte(pa) | PG_V; 3562 count = SPINLOCK_BACKOFF_MIN; 3563 kpreempt_disable(); 3564 startover: 3565 while ((pvpte = pv_pte_first(pp)) != NULL) { 3566 struct pmap *pmap; 3567 struct pv_entry *pve; 3568 pt_entry_t opte; 3569 vaddr_t va; 3570 int error; 3571 3572 /* 3573 * add a reference to the pmap before clearing the pte. 3574 * otherwise the pmap can disappear behind us. 
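 *
 * the reference is only needed for user pmaps (ptp != NULL); it is
 * dropped again with pmap_destroy() once the PTP bookkeeping below is
 * done.  pmap_kernel() never goes away, so it needs no reference.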
3575 */ 3576 3577 ptp = pvpte->pte_ptp; 3578 pmap = ptp_to_pmap(ptp); 3579 if (ptp != NULL) { 3580 pmap_reference(pmap); 3581 } 3582 3583 error = pmap_sync_pv(pvpte, expect, ~0, &opte); 3584 if (error == EAGAIN) { 3585 int hold_count; 3586 KERNEL_UNLOCK_ALL(curlwp, &hold_count); 3587 if (ptp != NULL) { 3588 pmap_destroy(pmap); 3589 } 3590 SPINLOCK_BACKOFF(count); 3591 KERNEL_LOCK(hold_count, curlwp); 3592 goto startover; 3593 } 3594 3595 pp->pp_attrs |= opte; 3596 va = pvpte->pte_va; 3597 pve = pmap_remove_pv(pp, ptp, va); 3598 3599 /* update the PTP reference count. free if last reference. */ 3600 if (ptp != NULL) { 3601 struct pmap *pmap2; 3602 pt_entry_t *ptes; 3603 pd_entry_t * const *pdes; 3604 3605 KASSERT(pmap != pmap_kernel()); 3606 3607 pmap_tlb_shootnow(); 3608 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); 3609 pmap_stats_update_bypte(pmap, 0, opte); 3610 ptp->wire_count--; 3611 if (ptp->wire_count <= 1) { 3612 pmap_free_ptp(pmap, ptp, va, ptes, pdes); 3613 } 3614 pmap_unmap_ptes(pmap, pmap2); 3615 pmap_destroy(pmap); 3616 } else { 3617 KASSERT(pmap == pmap_kernel()); 3618 pmap_stats_update_bypte(pmap, 0, opte); 3619 } 3620 3621 if (pve != NULL) { 3622 pve->pve_next = killlist; /* mark it for death */ 3623 killlist = pve; 3624 } 3625 } 3626 pmap_tlb_shootnow(); 3627 kpreempt_enable(); 3628 3629 /* Now free unused pvs. */ 3630 pmap_free_pvs(killlist); 3631 } 3632 3633 /* 3634 * pmap_page_remove: remove a managed vm_page from all pmaps that map it 3635 * 3636 * => R/M bits are sync'd back to attrs 3637 */ 3638 3639 void 3640 pmap_page_remove(struct vm_page *pg) 3641 { 3642 struct pmap_page *pp; 3643 paddr_t pa; 3644 3645 KASSERT(uvm_page_locked_p(pg)); 3646 3647 pp = VM_PAGE_TO_PP(pg); 3648 pa = VM_PAGE_TO_PHYS(pg); 3649 pmap_pp_remove(pp, pa); 3650 } 3651 3652 /* 3653 * pmap_pv_remove: remove an unmanaged pv-tracked page from all pmaps 3654 * that map it 3655 */ 3656 3657 void 3658 pmap_pv_remove(paddr_t pa) 3659 { 3660 struct pmap_page *pp; 3661 3662 pp = pmap_pv_tracked(pa); 3663 if (pp == NULL) 3664 panic("pmap_pv_protect: page not pv-tracked: 0x%"PRIxPADDR, 3665 pa); 3666 pmap_pp_remove(pp, pa); 3667 } 3668 3669 /* 3670 * p m a p a t t r i b u t e f u n c t i o n s 3671 * functions that test/change managed page's attributes 3672 * since a page can be mapped multiple times we must check each PTE that 3673 * maps it by going down the pv lists. 3674 */ 3675 3676 /* 3677 * pmap_test_attrs: test a page's attributes 3678 */ 3679 3680 bool 3681 pmap_test_attrs(struct vm_page *pg, unsigned testbits) 3682 { 3683 struct pmap_page *pp; 3684 struct pv_pte *pvpte; 3685 pt_entry_t expect; 3686 u_int result; 3687 3688 KASSERT(uvm_page_locked_p(pg)); 3689 3690 pp = VM_PAGE_TO_PP(pg); 3691 if ((pp->pp_attrs & testbits) != 0) { 3692 return true; 3693 } 3694 expect = pmap_pa2pte(VM_PAGE_TO_PHYS(pg)) | PG_V; 3695 kpreempt_disable(); 3696 for (pvpte = pv_pte_first(pp); pvpte; pvpte = pv_pte_next(pp, pvpte)) { 3697 pt_entry_t opte; 3698 int error; 3699 3700 if ((pp->pp_attrs & testbits) != 0) { 3701 break; 3702 } 3703 error = pmap_sync_pv(pvpte, expect, 0, &opte); 3704 if (error == 0) { 3705 pp->pp_attrs |= opte; 3706 } 3707 } 3708 result = pp->pp_attrs & testbits; 3709 kpreempt_enable(); 3710 3711 /* 3712 * note that we will exit the for loop with a non-null pve if 3713 * we have found the bits we are testing for. 
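 *
 * pp_attrs accumulates the bits synced back from the PTEs by
 * pmap_sync_pv(), so once a requested bit shows up there we can stop
 * scanning the remaining mappings.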
3714 */ 3715 3716 return result != 0; 3717 } 3718 3719 static bool 3720 pmap_pp_clear_attrs(struct pmap_page *pp, paddr_t pa, unsigned clearbits) 3721 { 3722 struct pv_pte *pvpte; 3723 u_int result; 3724 pt_entry_t expect; 3725 int count; 3726 3727 expect = pmap_pa2pte(pa) | PG_V; 3728 count = SPINLOCK_BACKOFF_MIN; 3729 kpreempt_disable(); 3730 startover: 3731 for (pvpte = pv_pte_first(pp); pvpte; pvpte = pv_pte_next(pp, pvpte)) { 3732 pt_entry_t opte; 3733 int error; 3734 3735 error = pmap_sync_pv(pvpte, expect, clearbits, &opte); 3736 if (error == EAGAIN) { 3737 int hold_count; 3738 KERNEL_UNLOCK_ALL(curlwp, &hold_count); 3739 SPINLOCK_BACKOFF(count); 3740 KERNEL_LOCK(hold_count, curlwp); 3741 goto startover; 3742 } 3743 pp->pp_attrs |= opte; 3744 } 3745 result = pp->pp_attrs & clearbits; 3746 pp->pp_attrs &= ~clearbits; 3747 pmap_tlb_shootnow(); 3748 kpreempt_enable(); 3749 3750 return result != 0; 3751 } 3752 3753 /* 3754 * pmap_clear_attrs: clear the specified attribute for a page. 3755 * 3756 * => we return true if we cleared one of the bits we were asked to 3757 */ 3758 3759 bool 3760 pmap_clear_attrs(struct vm_page *pg, unsigned clearbits) 3761 { 3762 struct pmap_page *pp; 3763 paddr_t pa; 3764 3765 KASSERT(uvm_page_locked_p(pg)); 3766 3767 pp = VM_PAGE_TO_PP(pg); 3768 pa = VM_PAGE_TO_PHYS(pg); 3769 3770 return pmap_pp_clear_attrs(pp, pa, clearbits); 3771 } 3772 3773 /* 3774 * pmap_pv_clear_attrs: clear the specified attributes for an unmanaged 3775 * pv-tracked page. 3776 */ 3777 3778 bool 3779 pmap_pv_clear_attrs(paddr_t pa, unsigned clearbits) 3780 { 3781 struct pmap_page *pp; 3782 3783 pp = pmap_pv_tracked(pa); 3784 if (pp == NULL) 3785 panic("pmap_pv_protect: page not pv-tracked: 0x%"PRIxPADDR, 3786 pa); 3787 3788 return pmap_pp_clear_attrs(pp, pa, clearbits); 3789 } 3790 3791 /* 3792 * p m a p p r o t e c t i o n f u n c t i o n s 3793 */ 3794 3795 /* 3796 * pmap_page_protect: change the protection of all recorded mappings 3797 * of a managed page 3798 * 3799 * => NOTE: this is an inline function in pmap.h 3800 */ 3801 3802 /* see pmap.h */ 3803 3804 /* 3805 * pmap_pv_protect: change the protection of all recorded mappings 3806 * of an unmanaged pv-tracked page 3807 * 3808 * => NOTE: this is an inline function in pmap.h 3809 */ 3810 3811 /* see pmap.h */ 3812 3813 /* 3814 * pmap_protect: set the protection in of the pages in a pmap 3815 * 3816 * => NOTE: this is an inline function in pmap.h 3817 */ 3818 3819 /* see pmap.h */ 3820 3821 /* 3822 * pmap_write_protect: write-protect pages in a pmap. 3823 */ 3824 void 3825 pmap_write_protect(struct pmap *pmap, vaddr_t sva, vaddr_t eva, vm_prot_t prot) 3826 { 3827 pt_entry_t bit_rem, bit_put; 3828 pt_entry_t *ptes; 3829 pt_entry_t * const *pdes; 3830 struct pmap *pmap2; 3831 vaddr_t blockend, va; 3832 3833 KASSERT(curlwp->l_md.md_gc_pmap != pmap); 3834 3835 bit_rem = 0; 3836 if (!(prot & VM_PROT_WRITE)) 3837 bit_rem = PG_RW; 3838 3839 bit_put = 0; 3840 if (!(prot & VM_PROT_EXECUTE)) 3841 bit_put = pmap_pg_nx; 3842 3843 sva &= PG_FRAME; 3844 eva &= PG_FRAME; 3845 3846 /* Acquire pmap. */ 3847 kpreempt_disable(); 3848 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); 3849 3850 for (va = sva ; va < eva; va = blockend) { 3851 pt_entry_t *spte, *epte; 3852 int i; 3853 3854 blockend = x86_round_pdr(va + 1); 3855 if (blockend > eva) 3856 blockend = eva; 3857 3858 /* 3859 * Our PTE mappings should never be write-protected. 3860 * 3861 * XXXmaxv: still needed? 
3862 * 3863 * A long term solution is to move the PTEs out of user address 3864 * space, and into kernel address space. Then we can set 3865 * VM_MAXUSER_ADDRESS to be VM_MAX_ADDRESS. 3866 */ 3867 for (i = 0; i < PDP_SIZE; i++) { 3868 if (pl_i(va, PTP_LEVELS) == PDIR_SLOT_PTE+i) 3869 panic("PTE space accessed"); 3870 } 3871 3872 /* Is it a valid block? */ 3873 if (!pmap_pdes_valid(va, pdes, NULL)) { 3874 continue; 3875 } 3876 KASSERT(va < VM_MAXUSER_ADDRESS || va >= VM_MAX_ADDRESS); 3877 3878 spte = &ptes[pl1_i(va)]; 3879 epte = &ptes[pl1_i(blockend)]; 3880 3881 for (/* */; spte < epte; spte++) { 3882 pt_entry_t opte, npte; 3883 3884 do { 3885 opte = *spte; 3886 if (!pmap_valid_entry(opte)) { 3887 goto next; 3888 } 3889 npte = (opte & ~bit_rem) | bit_put; 3890 } while (pmap_pte_cas(spte, opte, npte) != opte); 3891 3892 if ((opte & PG_M) != 0) { 3893 vaddr_t tva = x86_ptob(spte - ptes); 3894 pmap_tlb_shootdown(pmap, tva, opte, 3895 TLBSHOOT_WRITE_PROTECT); 3896 } 3897 next:; 3898 } 3899 } 3900 3901 /* Release pmap. */ 3902 pmap_unmap_ptes(pmap, pmap2); 3903 kpreempt_enable(); 3904 } 3905 3906 /* 3907 * pmap_unwire: clear the wired bit in the PTE. 3908 * 3909 * => Mapping should already be present. 3910 */ 3911 void 3912 pmap_unwire(struct pmap *pmap, vaddr_t va) 3913 { 3914 pt_entry_t *ptes, *ptep, opte; 3915 pd_entry_t * const *pdes; 3916 struct pmap *pmap2; 3917 3918 /* Acquire pmap. */ 3919 kpreempt_disable(); 3920 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); 3921 3922 if (!pmap_pdes_valid(va, pdes, NULL)) { 3923 panic("pmap_unwire: invalid PDE"); 3924 } 3925 3926 ptep = &ptes[pl1_i(va)]; 3927 opte = *ptep; 3928 KASSERT(pmap_valid_entry(opte)); 3929 3930 if (opte & PG_W) { 3931 pt_entry_t npte = opte & ~PG_W; 3932 3933 opte = pmap_pte_testset(ptep, npte); 3934 pmap_stats_update_bypte(pmap, npte, opte); 3935 } else { 3936 printf("pmap_unwire: wiring for pmap %p va 0x%lx " 3937 "did not change!\n", pmap, va); 3938 } 3939 3940 /* Release pmap. */ 3941 pmap_unmap_ptes(pmap, pmap2); 3942 kpreempt_enable(); 3943 } 3944 3945 /* 3946 * pmap_copy: copy mappings from one pmap to another 3947 * 3948 * => optional function 3949 * void pmap_copy(dst_pmap, src_pmap, dst_addr, len, src_addr) 3950 */ 3951 3952 /* 3953 * defined as macro in pmap.h 3954 */ 3955 3956 __strict_weak_alias(pmap_enter, pmap_enter_default); 3957 3958 int 3959 pmap_enter_default(pmap_t pmap, vaddr_t va, paddr_t pa, vm_prot_t prot, 3960 u_int flags) 3961 { 3962 return pmap_enter_ma(pmap, va, pa, pa, prot, flags, 0); 3963 } 3964 3965 /* 3966 * pmap_enter: enter a mapping into a pmap 3967 * 3968 * => must be done "now" ... 
no lazy-evaluation 3969 * => we set pmap => pv_head locking 3970 */ 3971 int 3972 pmap_enter_ma(struct pmap *pmap, vaddr_t va, paddr_t ma, paddr_t pa, 3973 vm_prot_t prot, u_int flags, int domid) 3974 { 3975 pt_entry_t *ptes, opte, npte; 3976 pt_entry_t *ptep; 3977 pd_entry_t * const *pdes; 3978 struct vm_page *ptp; 3979 struct vm_page *new_pg, *old_pg; 3980 struct pmap_page *new_pp, *old_pp; 3981 struct pv_entry *old_pve = NULL; 3982 struct pv_entry *new_pve; 3983 struct pv_entry *new_sparepve; 3984 int error; 3985 bool wired = (flags & PMAP_WIRED) != 0; 3986 struct pmap *pmap2; 3987 3988 KASSERT(pmap_initialized); 3989 KASSERT(curlwp->l_md.md_gc_pmap != pmap); 3990 KASSERT(va < VM_MAX_KERNEL_ADDRESS); 3991 KASSERTMSG(va != (vaddr_t)PDP_BASE, 3992 "pmap_enter: trying to map over PDP!"); 3993 KASSERTMSG(va < VM_MIN_KERNEL_ADDRESS || 3994 pmap_valid_entry(pmap->pm_pdir[pl_i(va, PTP_LEVELS)]), 3995 "pmap_enter: missing kernel PTP for VA %lx!", va); 3996 3997 #ifdef XEN 3998 KASSERT(domid == DOMID_SELF || pa == 0); 3999 #endif /* XEN */ 4000 4001 npte = ma | protection_codes[prot] | PG_V; 4002 npte |= pmap_pat_flags(flags); 4003 if (wired) 4004 npte |= PG_W; 4005 if (va < VM_MAXUSER_ADDRESS) 4006 npte |= PG_u; 4007 else if (va < VM_MAX_ADDRESS) 4008 panic("PTE space accessed"); /* XXXmaxv: no longer needed? */ 4009 else 4010 npte |= PG_k; 4011 if (pmap == pmap_kernel()) 4012 npte |= pmap_pg_g; 4013 if (flags & VM_PROT_ALL) { 4014 npte |= PG_U; 4015 if (flags & VM_PROT_WRITE) { 4016 KASSERT((npte & PG_RW) != 0); 4017 npte |= PG_M; 4018 } 4019 } 4020 4021 #ifdef XEN 4022 if (domid != DOMID_SELF) 4023 new_pg = NULL; 4024 else 4025 #endif 4026 new_pg = PHYS_TO_VM_PAGE(pa); 4027 if (new_pg != NULL) { 4028 /* This is a managed page */ 4029 npte |= PG_PVLIST; 4030 new_pp = VM_PAGE_TO_PP(new_pg); 4031 } else if ((new_pp = pmap_pv_tracked(pa)) != NULL) { 4032 /* This is an unmanaged pv-tracked page */ 4033 npte |= PG_PVLIST; 4034 } else { 4035 new_pp = NULL; 4036 } 4037 4038 /* get pves. */ 4039 new_pve = pool_cache_get(&pmap_pv_cache, PR_NOWAIT); 4040 new_sparepve = pool_cache_get(&pmap_pv_cache, PR_NOWAIT); 4041 if (new_pve == NULL || new_sparepve == NULL) { 4042 if (flags & PMAP_CANFAIL) { 4043 error = ENOMEM; 4044 goto out2; 4045 } 4046 panic("pmap_enter: pve allocation failed"); 4047 } 4048 4049 kpreempt_disable(); 4050 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); /* locks pmap */ 4051 if (pmap == pmap_kernel()) { 4052 ptp = NULL; 4053 } else { 4054 ptp = pmap_get_ptp(pmap, va, pdes); 4055 if (ptp == NULL) { 4056 pmap_unmap_ptes(pmap, pmap2); 4057 if (flags & PMAP_CANFAIL) { 4058 error = ENOMEM; 4059 goto out; 4060 } 4061 panic("pmap_enter: get ptp failed"); 4062 } 4063 } 4064 4065 /* 4066 * update the pte. 4067 */ 4068 4069 ptep = &ptes[pl1_i(va)]; 4070 do { 4071 opte = *ptep; 4072 4073 /* 4074 * if the same page, inherit PG_U and PG_M. 
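 * this way re-entering an existing mapping (e.g. to change its
 * protection) does not lose the referenced/modified state already
 * recorded in the old PTE.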
4075 */ 4076 if (((opte ^ npte) & (PG_FRAME | PG_V)) == 0) { 4077 npte |= opte & (PG_U | PG_M); 4078 } 4079 #if defined(XEN) 4080 if (domid != DOMID_SELF) { 4081 /* pmap_pte_cas with error handling */ 4082 int s = splvm(); 4083 if (opte != *ptep) { 4084 splx(s); 4085 continue; 4086 } 4087 error = xpq_update_foreign( 4088 vtomach((vaddr_t)ptep), npte, domid); 4089 splx(s); 4090 if (error) { 4091 if (ptp != NULL && ptp->wire_count <= 1) { 4092 pmap_free_ptp(pmap, ptp, va, ptes, pdes); 4093 } 4094 pmap_unmap_ptes(pmap, pmap2); 4095 goto out; 4096 } 4097 break; 4098 } 4099 #endif /* defined(XEN) */ 4100 } while (pmap_pte_cas(ptep, opte, npte) != opte); 4101 4102 /* 4103 * update statistics and PTP's reference count. 4104 */ 4105 4106 pmap_stats_update_bypte(pmap, npte, opte); 4107 if (ptp != NULL && !pmap_valid_entry(opte)) { 4108 ptp->wire_count++; 4109 } 4110 KASSERT(ptp == NULL || ptp->wire_count > 1); 4111 4112 /* 4113 * if the same page, we can skip pv_entry handling. 4114 */ 4115 4116 if (((opte ^ npte) & (PG_FRAME | PG_V)) == 0) { 4117 KASSERT(((opte ^ npte) & PG_PVLIST) == 0); 4118 goto same_pa; 4119 } 4120 4121 /* 4122 * if old page is pv-tracked, remove pv_entry from its list. 4123 */ 4124 4125 if ((~opte & (PG_V | PG_PVLIST)) == 0) { 4126 if ((old_pg = PHYS_TO_VM_PAGE(pmap_pte2pa(opte))) != NULL) { 4127 KASSERT(uvm_page_locked_p(old_pg)); 4128 old_pp = VM_PAGE_TO_PP(old_pg); 4129 } else if ((old_pp = pmap_pv_tracked(pmap_pte2pa(opte))) 4130 == NULL) { 4131 pa = pmap_pte2pa(opte); 4132 panic("pmap_enter: PG_PVLIST with pv-untracked page" 4133 " va = 0x%"PRIxVADDR 4134 " pa = 0x%" PRIxPADDR " (0x%" PRIxPADDR ")", 4135 va, pa, atop(pa)); 4136 } 4137 4138 old_pve = pmap_remove_pv(old_pp, ptp, va); 4139 old_pp->pp_attrs |= opte; 4140 } 4141 4142 /* 4143 * if new page is pv-tracked, insert pv_entry into its list. 4144 */ 4145 4146 if (new_pp) { 4147 new_pve = pmap_enter_pv(new_pp, new_pve, &new_sparepve, ptp, va); 4148 } 4149 4150 same_pa: 4151 pmap_unmap_ptes(pmap, pmap2); 4152 4153 /* 4154 * shootdown tlb if necessary. 4155 */ 4156 4157 if ((~opte & (PG_V | PG_U)) == 0 && 4158 ((opte ^ npte) & (PG_FRAME | PG_RW)) != 0) { 4159 pmap_tlb_shootdown(pmap, va, opte, TLBSHOOT_ENTER); 4160 } 4161 4162 error = 0; 4163 out: 4164 kpreempt_enable(); 4165 out2: 4166 if (old_pve != NULL) { 4167 pool_cache_put(&pmap_pv_cache, old_pve); 4168 } 4169 if (new_pve != NULL) { 4170 pool_cache_put(&pmap_pv_cache, new_pve); 4171 } 4172 if (new_sparepve != NULL) { 4173 pool_cache_put(&pmap_pv_cache, new_sparepve); 4174 } 4175 4176 return error; 4177 } 4178 4179 static paddr_t 4180 pmap_get_physpage(void) 4181 { 4182 struct vm_page *ptp; 4183 struct pmap *kpm = pmap_kernel(); 4184 paddr_t pa; 4185 4186 if (!uvm.page_init_done) { 4187 /* 4188 * We're growing the kernel pmap early (from 4189 * uvm_pageboot_alloc()). This case must be 4190 * handled a little differently. 
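		 *
		 * With no VM pages available yet we cannot use
		 * uvm_pagealloc(), so a raw physical page is taken with
		 * uvm_page_physget() and zeroed by hand: through the
		 * direct map when the kernel has one, with xen_pagezero()
		 * on new enough Xen (XEN_VERSION_SUPPORTED(3, 4)), or
		 * else by temporarily mapping it at early_zerop via
		 * early_zero_pte, as the code below does.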
4191 */ 4192 4193 if (!uvm_page_physget(&pa)) 4194 panic("pmap_get_physpage: out of memory"); 4195 #if defined(__HAVE_DIRECT_MAP) 4196 pagezero(PMAP_DIRECT_MAP(pa)); 4197 #else 4198 #if defined(XEN) 4199 if (XEN_VERSION_SUPPORTED(3, 4)) { 4200 xen_pagezero(pa); 4201 return pa; 4202 } 4203 #endif 4204 kpreempt_disable(); 4205 pmap_pte_set(early_zero_pte, pmap_pa2pte(pa) | PG_V | 4206 PG_RW | pmap_pg_nx | PG_k); 4207 pmap_pte_flush(); 4208 pmap_update_pg((vaddr_t)early_zerop); 4209 memset(early_zerop, 0, PAGE_SIZE); 4210 #if defined(DIAGNOSTIC) || defined(XEN) 4211 pmap_pte_set(early_zero_pte, 0); 4212 pmap_pte_flush(); 4213 #endif /* defined(DIAGNOSTIC) */ 4214 kpreempt_enable(); 4215 #endif /* defined(__HAVE_DIRECT_MAP) */ 4216 } else { 4217 /* XXX */ 4218 ptp = uvm_pagealloc(NULL, 0, NULL, 4219 UVM_PGA_USERESERVE|UVM_PGA_ZERO); 4220 if (ptp == NULL) 4221 panic("pmap_get_physpage: out of memory"); 4222 ptp->flags &= ~PG_BUSY; 4223 ptp->wire_count = 1; 4224 pa = VM_PAGE_TO_PHYS(ptp); 4225 } 4226 pmap_stats_update(kpm, 1, 0); 4227 4228 return pa; 4229 } 4230 4231 /* 4232 * Expand the page tree with the specified amount of PTPs, mapping virtual 4233 * addresses starting at kva. We populate all the levels but the last one 4234 * (L1). The nodes of the tree are created as RWX, but the pages covered 4235 * will be kentered in L1, with proper permissions. 4236 * 4237 * Used only by pmap_growkernel. 4238 */ 4239 static void 4240 pmap_alloc_level(vaddr_t kva, long *needed_ptps) 4241 { 4242 unsigned long i; 4243 paddr_t pa; 4244 unsigned long index, endindex; 4245 int level; 4246 pd_entry_t *pdep; 4247 #ifdef XEN 4248 int s = splvm(); /* protect xpq_* */ 4249 #endif 4250 4251 for (level = PTP_LEVELS; level > 1; level--) { 4252 if (level == PTP_LEVELS) 4253 pdep = pmap_kernel()->pm_pdir; 4254 else 4255 pdep = normal_pdes[level - 2]; 4256 index = pl_i_roundup(kva, level); 4257 endindex = index + needed_ptps[level - 1] - 1; 4258 4259 for (i = index; i <= endindex; i++) { 4260 pt_entry_t pte; 4261 4262 KASSERT(!pmap_valid_entry(pdep[i])); 4263 pa = pmap_get_physpage(); 4264 pte = pmap_pa2pte(pa) | PG_k | PG_V | PG_RW; 4265 pmap_pte_set(&pdep[i], pte); 4266 4267 #if defined(XEN) && (defined(PAE) || defined(__x86_64__)) 4268 if (level == PTP_LEVELS && i >= PDIR_SLOT_KERN) { 4269 if (__predict_true( 4270 cpu_info_primary.ci_flags & CPUF_PRESENT)) { 4271 /* update per-cpu PMDs on all cpus */ 4272 xen_kpm_sync(pmap_kernel(), i); 4273 } else { 4274 /* 4275 * too early; update primary CPU 4276 * PMD only (without locks) 4277 */ 4278 #ifdef PAE 4279 pd_entry_t *cpu_pdep = 4280 &cpu_info_primary.ci_kpm_pdir[l2tol2(i)]; 4281 #endif 4282 #ifdef __x86_64__ 4283 pd_entry_t *cpu_pdep = 4284 &cpu_info_primary.ci_kpm_pdir[i]; 4285 #endif 4286 pmap_pte_set(cpu_pdep, pte); 4287 } 4288 } 4289 #endif /* XEN && (PAE || __x86_64__) */ 4290 4291 KASSERT(level != PTP_LEVELS || nkptp[level - 1] + 4292 pl_i(VM_MIN_KERNEL_ADDRESS, level) == i); 4293 nkptp[level - 1]++; 4294 } 4295 pmap_pte_flush(); 4296 } 4297 #ifdef XEN 4298 splx(s); 4299 #endif 4300 } 4301 4302 /* 4303 * pmap_growkernel: increase usage of KVM space. 4304 * 4305 * => we allocate new PTPs for the kernel and install them in all 4306 * the pmaps on the system. 
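 * => maxkvaddr is rounded up to a PDE boundary and the new limit is
 *	returned; a caller that needs kernel VA past pmap_maxkvaddr is
 *	expected to do roughly the following (illustrative sketch only,
 *	not copied from the UVM callers, desired_end is a placeholder
 *	name):
 *
 *		if (desired_end > pmap_maxkvaddr)
 *			desired_end = pmap_growkernel(desired_end);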
4307 */ 4308 4309 vaddr_t 4310 pmap_growkernel(vaddr_t maxkvaddr) 4311 { 4312 struct pmap *kpm = pmap_kernel(); 4313 #if !defined(XEN) || !defined(__x86_64__) 4314 struct pmap *pm; 4315 long old; 4316 #endif 4317 int s, i; 4318 long needed_kptp[PTP_LEVELS], target_nptp; 4319 bool invalidate = false; 4320 4321 s = splvm(); /* to be safe */ 4322 mutex_enter(kpm->pm_lock); 4323 4324 if (maxkvaddr <= pmap_maxkvaddr) { 4325 mutex_exit(kpm->pm_lock); 4326 splx(s); 4327 return pmap_maxkvaddr; 4328 } 4329 4330 maxkvaddr = x86_round_pdr(maxkvaddr); 4331 #if !defined(XEN) || !defined(__x86_64__) 4332 old = nkptp[PTP_LEVELS - 1]; 4333 #endif 4334 4335 /* Initialize needed_kptp. */ 4336 for (i = PTP_LEVELS - 1; i >= 1; i--) { 4337 target_nptp = pl_i_roundup(maxkvaddr, i + 1) - 4338 pl_i_roundup(VM_MIN_KERNEL_ADDRESS, i + 1); 4339 4340 if (target_nptp > nkptpmax[i]) 4341 panic("out of KVA space"); 4342 KASSERT(target_nptp >= nkptp[i]); 4343 needed_kptp[i] = target_nptp - nkptp[i]; 4344 } 4345 4346 pmap_alloc_level(pmap_maxkvaddr, needed_kptp); 4347 4348 /* 4349 * If the number of top level entries changed, update all pmaps. 4350 */ 4351 if (needed_kptp[PTP_LEVELS - 1] != 0) { 4352 #ifdef XEN 4353 #ifdef __x86_64__ 4354 /* nothing, kernel entries are never entered in user pmap */ 4355 #else /* __x86_64__ */ 4356 mutex_enter(&pmaps_lock); 4357 LIST_FOREACH(pm, &pmaps, pm_list) { 4358 int pdkidx; 4359 for (pdkidx = PDIR_SLOT_KERN + old; 4360 pdkidx < PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1]; 4361 pdkidx++) { 4362 pmap_pte_set(&pm->pm_pdir[pdkidx], 4363 kpm->pm_pdir[pdkidx]); 4364 } 4365 pmap_pte_flush(); 4366 } 4367 mutex_exit(&pmaps_lock); 4368 #endif /* __x86_64__ */ 4369 #else /* XEN */ 4370 unsigned newpdes; 4371 newpdes = nkptp[PTP_LEVELS - 1] - old; 4372 mutex_enter(&pmaps_lock); 4373 LIST_FOREACH(pm, &pmaps, pm_list) { 4374 memcpy(&pm->pm_pdir[PDIR_SLOT_KERN + old], 4375 &kpm->pm_pdir[PDIR_SLOT_KERN + old], 4376 newpdes * sizeof (pd_entry_t)); 4377 } 4378 mutex_exit(&pmaps_lock); 4379 #endif 4380 invalidate = true; 4381 } 4382 pmap_maxkvaddr = maxkvaddr; 4383 mutex_exit(kpm->pm_lock); 4384 splx(s); 4385 4386 if (invalidate && pmap_initialized) { 4387 /* Invalidate the PDP cache. */ 4388 pool_cache_invalidate(&pmap_pdp_cache); 4389 } 4390 4391 return maxkvaddr; 4392 } 4393 4394 #ifdef DEBUG 4395 void pmap_dump(struct pmap *, vaddr_t, vaddr_t); 4396 4397 /* 4398 * pmap_dump: dump all the mappings from a pmap 4399 * 4400 * => caller should not be holding any pmap locks 4401 */ 4402 4403 void 4404 pmap_dump(struct pmap *pmap, vaddr_t sva, vaddr_t eva) 4405 { 4406 pt_entry_t *ptes, *pte; 4407 pd_entry_t * const *pdes; 4408 struct pmap *pmap2; 4409 vaddr_t blkendva; 4410 4411 /* 4412 * if end is out of range truncate. 4413 * if (end == start) update to max. 4414 */ 4415 4416 if (eva > VM_MAXUSER_ADDRESS || eva <= sva) 4417 eva = VM_MAXUSER_ADDRESS; 4418 4419 /* 4420 * we lock in the pmap => pv_head direction 4421 */ 4422 4423 kpreempt_disable(); 4424 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); /* locks pmap */ 4425 4426 /* 4427 * dumping a range of pages: we dump in PTP sized blocks (4MB) 4428 */ 4429 4430 for (/* null */ ; sva < eva ; sva = blkendva) { 4431 4432 /* determine range of block */ 4433 blkendva = x86_round_pdr(sva+1); 4434 if (blkendva > eva) 4435 blkendva = eva; 4436 4437 /* valid block? 
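		   (i.e. the PDEs covering sva are present at every level;
		   if not, the whole PTP-sized block is skipped)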
*/ 4438 if (!pmap_pdes_valid(sva, pdes, NULL)) 4439 continue; 4440 4441 pte = &ptes[pl1_i(sva)]; 4442 for (/* null */; sva < blkendva ; sva += PAGE_SIZE, pte++) { 4443 if (!pmap_valid_entry(*pte)) 4444 continue; 4445 printf("va %#" PRIxVADDR " -> pa %#" PRIxPADDR 4446 " (pte=%#" PRIxPADDR ")\n", 4447 sva, (paddr_t)pmap_pte2pa(*pte), (paddr_t)*pte); 4448 } 4449 } 4450 pmap_unmap_ptes(pmap, pmap2); 4451 kpreempt_enable(); 4452 } 4453 #endif 4454 4455 /* 4456 * pmap_update: process deferred invalidations and frees. 4457 */ 4458 4459 void 4460 pmap_update(struct pmap *pmap) 4461 { 4462 struct vm_page *empty_ptps; 4463 lwp_t *l = curlwp; 4464 4465 /* 4466 * If we have torn down this pmap, invalidate non-global TLB 4467 * entries on any processors using it. 4468 */ 4469 kpreempt_disable(); 4470 if (__predict_false(l->l_md.md_gc_pmap == pmap)) { 4471 l->l_md.md_gc_pmap = NULL; 4472 pmap_tlb_shootdown(pmap, (vaddr_t)-1LL, 0, TLBSHOOT_UPDATE); 4473 } 4474 /* 4475 * Initiate any pending TLB shootdowns. Wait for them to 4476 * complete before returning control to the caller. 4477 */ 4478 pmap_tlb_shootnow(); 4479 kpreempt_enable(); 4480 4481 /* 4482 * Now that shootdowns are complete, process deferred frees, 4483 * but not from interrupt context. 4484 */ 4485 if (l->l_md.md_gc_ptp != NULL) { 4486 KASSERT((l->l_pflag & LP_INTR) == 0); 4487 if (cpu_intr_p()) { 4488 return; 4489 } 4490 empty_ptps = l->l_md.md_gc_ptp; 4491 l->l_md.md_gc_ptp = NULL; 4492 pmap_free_ptps(empty_ptps); 4493 } 4494 } 4495 4496 #if PTP_LEVELS > 4 4497 #error "Unsupported number of page table mappings" 4498 #endif 4499 4500 paddr_t 4501 pmap_init_tmp_pgtbl(paddr_t pg) 4502 { 4503 static bool maps_loaded; 4504 static const paddr_t x86_tmp_pml_paddr[] = { 4505 4 * PAGE_SIZE, /* L1 */ 4506 5 * PAGE_SIZE, /* L2 */ 4507 6 * PAGE_SIZE, /* L3 */ 4508 7 * PAGE_SIZE /* L4 */ 4509 }; 4510 static vaddr_t x86_tmp_pml_vaddr[] = { 0, 0, 0, 0 }; 4511 4512 pd_entry_t *tmp_pml, *kernel_pml; 4513 4514 int level; 4515 4516 if (!maps_loaded) { 4517 for (level = 0; level < PTP_LEVELS; ++level) { 4518 x86_tmp_pml_vaddr[level] = 4519 uvm_km_alloc(kernel_map, PAGE_SIZE, 0, 4520 UVM_KMF_VAONLY); 4521 4522 if (x86_tmp_pml_vaddr[level] == 0) 4523 panic("mapping of real mode PML failed\n"); 4524 pmap_kenter_pa(x86_tmp_pml_vaddr[level], 4525 x86_tmp_pml_paddr[level], 4526 VM_PROT_READ | VM_PROT_WRITE, 0); 4527 pmap_update(pmap_kernel()); 4528 } 4529 maps_loaded = true; 4530 } 4531 4532 /* Zero levels 1-3 */ 4533 for (level = 0; level < PTP_LEVELS - 1; ++level) { 4534 tmp_pml = (void *)x86_tmp_pml_vaddr[level]; 4535 memset(tmp_pml, 0, PAGE_SIZE); 4536 } 4537 4538 /* Copy PML4 */ 4539 kernel_pml = pmap_kernel()->pm_pdir; 4540 tmp_pml = (void *)x86_tmp_pml_vaddr[PTP_LEVELS - 1]; 4541 memcpy(tmp_pml, kernel_pml, PAGE_SIZE); 4542 4543 #ifdef PAE 4544 /* 4545 * Use the last 4 entries of the L2 page as L3 PD entries. These 4546 * last entries are unlikely to be used for temporary mappings. 
4547 * 508: maps 0->1GB (userland) 4548 * 509: unused 4549 * 510: unused 4550 * 511: maps 3->4GB (kernel) 4551 */ 4552 tmp_pml[508] = x86_tmp_pml_paddr[PTP_LEVELS - 1] | PG_V; 4553 tmp_pml[509] = 0; 4554 tmp_pml[510] = 0; 4555 tmp_pml[511] = pmap_pdirpa(pmap_kernel(), PDIR_SLOT_KERN) | PG_V; 4556 #endif 4557 4558 for (level = PTP_LEVELS - 1; level > 0; --level) { 4559 tmp_pml = (void *)x86_tmp_pml_vaddr[level]; 4560 4561 tmp_pml[pl_i(pg, level + 1)] = 4562 (x86_tmp_pml_paddr[level - 1] & PG_FRAME) | PG_RW | PG_V; 4563 } 4564 4565 tmp_pml = (void *)x86_tmp_pml_vaddr[0]; 4566 tmp_pml[pl_i(pg, 1)] = (pg & PG_FRAME) | PG_RW | PG_V; 4567 4568 #ifdef PAE 4569 /* Return the PA of the L3 page (entry 508 of the L2 page) */ 4570 return x86_tmp_pml_paddr[PTP_LEVELS - 1] + 508 * sizeof(pd_entry_t); 4571 #endif 4572 4573 return x86_tmp_pml_paddr[PTP_LEVELS - 1]; 4574 } 4575 4576 u_int 4577 x86_mmap_flags(paddr_t mdpgno) 4578 { 4579 u_int nflag = (mdpgno >> X86_MMAP_FLAG_SHIFT) & X86_MMAP_FLAG_MASK; 4580 u_int pflag = 0; 4581 4582 if (nflag & X86_MMAP_FLAG_PREFETCH) 4583 pflag |= PMAP_WRITE_COMBINE; 4584 4585 return pflag; 4586 } 4587
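
/*
 * Illustrative sketch (an assumption, not part of this file's
 * interfaces): the X86_MMAP_FLAG_* bits decoded by x86_mmap_flags()
 * above are expected to be encoded by a machine-dependent device mmap
 * routine into the bits above X86_MMAP_FLAG_SHIFT of the page-frame
 * cookie it returns, e.g. something along the lines of:
 *
 *	return x86_btop(pa) |
 *	    ((paddr_t)X86_MMAP_FLAG_PREFETCH << X86_MMAP_FLAG_SHIFT);
 *
 * so that the fault path can hand PMAP_WRITE_COMBINE to pmap_enter()
 * for a prefetchable (write-combined) user mapping.  The exact caller
 * and the x86_btop()/pa usage shown here are illustrative only.
 */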