1 /* $NetBSD: pmap.c,v 1.183 2014/06/14 02:54:47 pgoyette Exp $ */ 2 3 /*- 4 * Copyright (c) 2008, 2010 The NetBSD Foundation, Inc. 5 * All rights reserved. 6 * 7 * This code is derived from software contributed to The NetBSD Foundation 8 * by Andrew Doran. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 29 * POSSIBILITY OF SUCH DAMAGE. 30 */ 31 32 /* 33 * Copyright (c) 2007 Manuel Bouyer. 34 * 35 * Redistribution and use in source and binary forms, with or without 36 * modification, are permitted provided that the following conditions 37 * are met: 38 * 1. Redistributions of source code must retain the above copyright 39 * notice, this list of conditions and the following disclaimer. 40 * 2. Redistributions in binary form must reproduce the above copyright 41 * notice, this list of conditions and the following disclaimer in the 42 * documentation and/or other materials provided with the distribution. 43 * 44 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 45 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 46 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 47 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 48 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 49 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 50 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 51 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 52 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 53 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 54 * 55 */ 56 57 /* 58 * Copyright (c) 2006 Mathieu Ropert <mro@adviseo.fr> 59 * 60 * Permission to use, copy, modify, and distribute this software for any 61 * purpose with or without fee is hereby granted, provided that the above 62 * copyright notice and this permission notice appear in all copies. 63 * 64 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 65 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 66 * MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 67 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 68 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 69 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 70 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 71 */ 72 73 /* 74 * Copyright (c) 1997 Charles D. Cranor and Washington University. 75 * All rights reserved. 76 * 77 * Redistribution and use in source and binary forms, with or without 78 * modification, are permitted provided that the following conditions 79 * are met: 80 * 1. Redistributions of source code must retain the above copyright 81 * notice, this list of conditions and the following disclaimer. 82 * 2. Redistributions in binary form must reproduce the above copyright 83 * notice, this list of conditions and the following disclaimer in the 84 * documentation and/or other materials provided with the distribution. 85 * 86 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 87 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 88 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 89 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 90 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 91 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 92 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 93 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 94 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 95 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 96 */ 97 98 /* 99 * Copyright 2001 (c) Wasabi Systems, Inc. 100 * All rights reserved. 101 * 102 * Written by Frank van der Linden for Wasabi Systems, Inc. 103 * 104 * Redistribution and use in source and binary forms, with or without 105 * modification, are permitted provided that the following conditions 106 * are met: 107 * 1. Redistributions of source code must retain the above copyright 108 * notice, this list of conditions and the following disclaimer. 109 * 2. Redistributions in binary form must reproduce the above copyright 110 * notice, this list of conditions and the following disclaimer in the 111 * documentation and/or other materials provided with the distribution. 112 * 3. All advertising materials mentioning features or use of this software 113 * must display the following acknowledgement: 114 * This product includes software developed for the NetBSD Project by 115 * Wasabi Systems, Inc. 116 * 4. The name of Wasabi Systems, Inc. may not be used to endorse 117 * or promote products derived from this software without specific prior 118 * written permission. 119 * 120 * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND 121 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 122 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 123 * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL WASABI SYSTEMS, INC 124 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 125 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 126 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 127 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 128 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 129 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 130 * POSSIBILITY OF SUCH DAMAGE. 131 */ 132 133 /* 134 * This is the i386 pmap modified and generalized to support x86-64 135 * as well. The idea is to hide the upper N levels of the page tables 136 * inside pmap_get_ptp, pmap_free_ptp and pmap_growkernel. The rest 137 * is mostly untouched, except that it uses some more generalized 138 * macros and interfaces. 139 * 140 * This pmap has been tested on the i386 as well, and it can be easily 141 * adapted to PAE. 142 * 143 * fvdl@wasabisystems.com 18-Jun-2001 144 */ 145 146 /* 147 * pmap.c: i386 pmap module rewrite 148 * Chuck Cranor <chuck@netbsd> 149 * 11-Aug-97 150 * 151 * history of this pmap module: in addition to my own input, i used 152 * the following references for this rewrite of the i386 pmap: 153 * 154 * [1] the NetBSD i386 pmap. this pmap appears to be based on the 155 * BSD hp300 pmap done by Mike Hibler at University of Utah. 156 * it was then ported to the i386 by William Jolitz of UUNET 157 * Technologies, Inc. Then Charles M. Hannum of the NetBSD 158 * project fixed some bugs and provided some speed ups. 159 * 160 * [2] the FreeBSD i386 pmap. this pmap seems to be the 161 * Hibler/Jolitz pmap, as modified for FreeBSD by John S. Dyson 162 * and David Greenman. 163 * 164 * [3] the Mach pmap. this pmap, from CMU, seems to have migrated 165 * between several processors. the VAX version was done by 166 * Avadis Tevanian, Jr., and Michael Wayne Young. the i386 167 * version was done by Lance Berc, Mike Kupfer, Bob Baron, 168 * David Golub, and Richard Draves. the alpha version was 169 * done by Alessandro Forin (CMU/Mach) and Chris Demetriou 170 * (NetBSD/alpha). 171 */ 172 173 #include <sys/cdefs.h> 174 __KERNEL_RCSID(0, "$NetBSD: pmap.c,v 1.183 2014/06/14 02:54:47 pgoyette Exp $"); 175 176 #include "opt_user_ldt.h" 177 #include "opt_lockdebug.h" 178 #include "opt_multiprocessor.h" 179 #include "opt_xen.h" 180 #if !defined(__x86_64__) 181 #include "opt_kstack_dr0.h" 182 #endif /* !defined(__x86_64__) */ 183 184 #include <sys/param.h> 185 #include <sys/systm.h> 186 #include <sys/proc.h> 187 #include <sys/pool.h> 188 #include <sys/kernel.h> 189 #include <sys/atomic.h> 190 #include <sys/cpu.h> 191 #include <sys/intr.h> 192 #include <sys/xcall.h> 193 #include <sys/kcore.h> 194 195 #include <uvm/uvm.h> 196 197 #include <dev/isa/isareg.h> 198 199 #include <machine/specialreg.h> 200 #include <machine/gdt.h> 201 #include <machine/isa_machdep.h> 202 #include <machine/cpuvar.h> 203 #include <machine/cputypes.h> 204 205 #include <x86/pmap.h> 206 #include <x86/pmap_pv.h> 207 208 #include <x86/i82489reg.h> 209 #include <x86/i82489var.h> 210 211 #ifdef XEN 212 #include <xen/xen-public/xen.h> 213 #include <xen/hypervisor.h> 214 #endif 215 216 /* 217 * general info: 218 * 219 * - for an explanation of how the i386 MMU hardware works see 220 * the comments in <machine/pte.h>. 221 * 222 * - for an explanation of the general memory structure used by 223 * this pmap (including the recursive mapping), see the comments 224 * in <machine/pmap.h>. 
 *
 * this file contains the code for the "pmap module."  the module's
 * job is to manage the hardware's virtual to physical address mappings.
 * note that there are two levels of mapping in the VM system:
 *
 * [1] the upper layer of the VM system uses vm_map's and vm_map_entry's
 *     to map ranges of virtual address space to objects/files.  for
 *     example, the vm_map may say: "map VA 0x1000 to 0x22000 read-only
 *     to the file /bin/ls starting at offset zero."  note that
 *     the upper layer mapping is not concerned with how individual
 *     vm_pages are mapped.
 *
 * [2] the lower layer of the VM system (the pmap) maintains the mappings
 *     from virtual addresses.  it is concerned with which vm_page is
 *     mapped where.  for example, when you run /bin/ls and start
 *     at page 0x1000 the fault routine may lookup the correct page
 *     of the /bin/ls file and then ask the pmap layer to establish
 *     a mapping for it.
 *
 * note that information in the lower layer of the VM system can be
 * thrown away since it can easily be reconstructed from the info
 * in the upper layer.
 *
 * data structures we use include:
 *
 *  - struct pmap: describes the address space of one thread
 *  - struct pv_entry: describes one <PMAP,VA> mapping of a PA
 *  - struct pv_head: there is one pv_head per managed page of
 *	physical memory.  the pv_head points to a list of pv_entry
 *	structures which describe all the <PMAP,VA> pairs that this
 *	page is mapped in.  this is critical for page based operations
 *	such as pmap_page_protect() [change protection on _all_ mappings
 *	of a page]
 */

/*
 * memory allocation
 *
 *  - there are three data structures that we must dynamically allocate:
 *
 * [A] new process' page directory page (PDP)
 *	- plan 1: done at pmap_create() we use
 *	  uvm_km_alloc(kernel_map, PAGE_SIZE)  [fka kmem_alloc] to do this
 *	  allocation.
 *
 *	if we are low in free physical memory then we sleep in
 *	uvm_km_alloc -- in this case this is ok since we are creating
 *	a new pmap and should not be holding any locks.
 *
 *	if the kernel is totally out of virtual space
 *	(i.e. uvm_km_alloc returns NULL), then we panic.
 *
 * [B] new page tables pages (PTP)
 *	- call uvm_pagealloc()
 *		=> success: zero page, add to pm_pdir
 *		=> failure: we are out of free vm_pages, let pmap_enter()
 *		   tell UVM about it.
 *
 *	note: for kernel PTPs, we start with NKPTP of them.  as we map
 *	kernel memory (at uvm_map time) we check to see if we've grown
 *	the kernel pmap.  if so, we call the optional function
 *	pmap_growkernel() to grow the kernel PTPs in advance.
 *
 * [C] pv_entry structures
 */
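
/*
 * Illustrative sketch (not compiled): walking every <PMAP,VA> mapping of
 * a managed page through its pmap_page, which is the pattern the
 * page-based operations in this file build on.  It assumes the
 * VM_PAGE_TO_PP() macro and the pv_pte_first()/pv_pte_next() iterators
 * defined later in this file; pvpte_visit() is a hypothetical stand-in
 * for whatever per-mapping work the caller wants to do.
 */
#if 0
static void
pmap_pv_example_walk(struct vm_page *pg)
{
	struct pmap_page *pp = VM_PAGE_TO_PP(pg);
	struct pv_pte *pvpte;

	/* the caller is expected to hold the appropriate pv locks */
	for (pvpte = pv_pte_first(pp); pvpte != NULL;
	     pvpte = pv_pte_next(pp, pvpte)) {
		/* hypothetical helper: one <ptp,va> pair per mapping */
		pvpte_visit(pvpte->pte_ptp, pvpte->pte_va);
	}
}
#endif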

/*
 * locking
 *
 * we have the following locks that we must contend with:
 *
 * mutexes:
 *
 * - pmap lock (per pmap, part of uvm_object)
 *   this lock protects the fields in the pmap structure including
 *   the non-kernel PDEs in the PDP, and the PTEs.  it also locks
 *   in the alternate PTE space (since that is determined by the
 *   entry in the PDP).
 *
 * - pvh_lock (per pv_head)
 *   this lock protects the pv_entry list which is chained off the
 *   pv_head structure for a specific managed PA.  it is locked
 *   when traversing the list (e.g. adding/removing mappings,
 *   syncing R/M bits, etc.)
 *
 * - pmaps_lock
 *   this lock protects the list of active pmaps (headed by "pmaps").
 *   we lock it when adding or removing pmaps from this list.
 */

const vaddr_t ptp_masks[] = PTP_MASK_INITIALIZER;
const int ptp_shifts[] = PTP_SHIFT_INITIALIZER;
const long nkptpmax[] = NKPTPMAX_INITIALIZER;
const long nbpd[] = NBPD_INITIALIZER;
pd_entry_t * const normal_pdes[] = PDES_INITIALIZER;

long nkptp[] = NKPTP_INITIALIZER;

struct pmap_head pmaps;
kmutex_t pmaps_lock;

static vaddr_t pmap_maxkvaddr;

/*
 * XXX kludge: dummy locking to make KASSERTs in uvm_page.c comfortable.
 * actual locking is done by pm_lock.
 */
#if defined(DIAGNOSTIC)
#define	PMAP_SUBOBJ_LOCK(pm, idx) \
	KASSERT(mutex_owned((pm)->pm_lock)); \
	if ((idx) != 0) \
		mutex_enter((pm)->pm_obj[(idx)].vmobjlock)
#define	PMAP_SUBOBJ_UNLOCK(pm, idx) \
	KASSERT(mutex_owned((pm)->pm_lock)); \
	if ((idx) != 0) \
		mutex_exit((pm)->pm_obj[(idx)].vmobjlock)
#else /* defined(DIAGNOSTIC) */
#define	PMAP_SUBOBJ_LOCK(pm, idx)	/* nothing */
#define	PMAP_SUBOBJ_UNLOCK(pm, idx)	/* nothing */
#endif /* defined(DIAGNOSTIC) */

/*
 * Misc. event counters.
 */
struct evcnt pmap_iobmp_evcnt;
struct evcnt pmap_ldt_evcnt;

/*
 * PAT
 */
#define	PATENTRY(n, type)	(type << ((n) * 8))
#define	PAT_UC		0x0ULL
#define	PAT_WC		0x1ULL
#define	PAT_WT		0x4ULL
#define	PAT_WP		0x5ULL
#define	PAT_WB		0x6ULL
#define	PAT_UCMINUS	0x7ULL

static bool cpu_pat_enabled __read_mostly = false;

/*
 * global data structures
 */

static struct pmap kernel_pmap_store;	/* the kernel's pmap (proc0) */
struct pmap *const kernel_pmap_ptr = &kernel_pmap_store;

/*
 * pmap_pg_g: if our processor supports PG_G in the PTE then we
 * set pmap_pg_g to PG_G (otherwise it is zero).
 */

int pmap_pg_g __read_mostly = 0;

/*
 * pmap_largepages: if our processor supports PG_PS and we are
 * using it, this is set to true.
 */

int pmap_largepages __read_mostly;
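
/*
 * Illustrative sketch (not compiled): how nkptp[] and nbpd[] above combine
 * into the initial pmap_maxkvaddr, mirroring the loop at the end of
 * pmap_bootstrap() below.  Each level i contributes nkptp[i] page-table
 * pages, each of which maps nbpd[i] bytes of kernel VA.
 */
#if 0
static vaddr_t
pmap_example_maxkvaddr(void)
{
	vaddr_t kva = VM_MIN_KERNEL_ADDRESS;
	int i;

	for (i = PTP_LEVELS - 1; i >= 1; i--)
		kva += nkptp[i] * nbpd[i];
	return kva;
}
#endif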

/*
 * i386 physical memory comes in a big contig chunk with a small
 * hole toward the front of it...  the following two paddr_t's
 * (shared with machdep.c) describe the physical address space
 * of this machine.
 */
paddr_t avail_start __read_mostly; /* PA of first available physical page */
paddr_t avail_end __read_mostly;   /* PA of last available physical page */

#ifdef XEN
#ifdef __x86_64__
/* Dummy PGD for user cr3, used between pmap_deactivate() and pmap_activate() */
static paddr_t xen_dummy_user_pgd;
#endif /* __x86_64__ */
paddr_t pmap_pa_start; /* PA of first physical page for this domain */
paddr_t pmap_pa_end;   /* PA of last physical page for this domain */
#endif /* XEN */

#define	VM_PAGE_TO_PP(pg)	(&(pg)->mdpage.mp_pp)

#define	PV_HASH_SIZE		32768
#define	PV_HASH_LOCK_CNT	32

struct pv_hash_lock {
	kmutex_t lock;
} __aligned(CACHE_LINE_SIZE) pv_hash_locks[PV_HASH_LOCK_CNT]
    __aligned(CACHE_LINE_SIZE);

struct pv_hash_head {
	SLIST_HEAD(, pv_entry) hh_list;
} pv_hash_heads[PV_HASH_SIZE];

static u_int
pvhash_hash(struct vm_page *ptp, vaddr_t va)
{

	return (uintptr_t)ptp / sizeof(*ptp) + (va >> PAGE_SHIFT);
}

static struct pv_hash_head *
pvhash_head(u_int hash)
{

	return &pv_hash_heads[hash % PV_HASH_SIZE];
}

static kmutex_t *
pvhash_lock(u_int hash)
{

	return &pv_hash_locks[hash % PV_HASH_LOCK_CNT].lock;
}

static struct pv_entry *
pvhash_remove(struct pv_hash_head *hh, struct vm_page *ptp, vaddr_t va)
{
	struct pv_entry *pve;
	struct pv_entry *prev;

	prev = NULL;
	SLIST_FOREACH(pve, &hh->hh_list, pve_hash) {
		if (pve->pve_pte.pte_ptp == ptp &&
		    pve->pve_pte.pte_va == va) {
			if (prev != NULL) {
				SLIST_REMOVE_AFTER(prev, pve_hash);
			} else {
				SLIST_REMOVE_HEAD(&hh->hh_list, pve_hash);
			}
			break;
		}
		prev = pve;
	}
	return pve;
}

/*
 * other data structures
 */

static pt_entry_t protection_codes[8] __read_mostly; /* maps MI prot to i386
							 prot code */
static bool pmap_initialized __read_mostly = false;   /* pmap_init done yet? */

/*
 * the following two vaddr_t's are used during system startup
 * to keep track of how much of the kernel's VM space we have used.
 * once the system is started, the management of the remaining kernel
 * VM space is turned over to the kernel_map vm_map.
 */

static vaddr_t virtual_avail __read_mostly;	/* VA of first free KVA */
static vaddr_t virtual_end __read_mostly;	/* VA of last free KVA */

/*
 * pool that pmap structures are allocated from
 */

static struct pool_cache pmap_cache;

/*
 * pv_entry cache
 */

static struct pool_cache pmap_pv_cache;
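
/*
 * Illustrative sketch (not compiled): looking up a pv_entry by <ptp,va>
 * in the hash above, following the same hash/lock/walk pattern as
 * pvhash_remove() but without unlinking the entry.  pvhash_lookup() is
 * hypothetical and only shows how pvhash_hash(), pvhash_lock() and
 * pvhash_head() fit together.
 */
#if 0
static struct pv_entry *
pvhash_lookup(struct vm_page *ptp, vaddr_t va)
{
	u_int hash = pvhash_hash(ptp, va);
	kmutex_t *lock = pvhash_lock(hash);
	struct pv_hash_head *hh = pvhash_head(hash);
	struct pv_entry *pve;

	mutex_spin_enter(lock);
	SLIST_FOREACH(pve, &hh->hh_list, pve_hash) {
		if (pve->pve_pte.pte_ptp == ptp && pve->pve_pte.pte_va == va)
			break;
	}
	mutex_spin_exit(lock);
	return pve;	/* NULL if no such mapping is hashed */
}
#endif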

#ifdef __HAVE_DIRECT_MAP

extern phys_ram_seg_t mem_clusters[];
extern int mem_cluster_cnt;

#else

/*
 * MULTIPROCESSOR: special VA's/ PTE's are actually allocated inside a
 * maxcpus*NPTECL array of PTE's, to avoid cache line thrashing
 * due to false sharing.
 */

#ifdef MULTIPROCESSOR
#define	PTESLEW(pte, id) ((pte)+(id)*NPTECL)
#define	VASLEW(va,id) ((va)+(id)*NPTECL*PAGE_SIZE)
#else
#define	PTESLEW(pte, id) ((void)id, pte)
#define	VASLEW(va,id) ((void)id, va)
#endif

/*
 * special VAs and the PTEs that map them
 */
static pt_entry_t *csrc_pte, *cdst_pte, *zero_pte, *ptp_pte, *early_zero_pte;
static char *csrcp, *cdstp, *zerop, *ptpp;
#ifdef XEN
char *early_zerop; /* also referenced from xen_pmap_bootstrap() */
#else
static char *early_zerop;
#endif

#endif

int pmap_enter_default(pmap_t, vaddr_t, paddr_t, vm_prot_t, u_int);

/* PDP pool_cache(9) and its callbacks */
struct pool_cache pmap_pdp_cache;
static int  pmap_pdp_ctor(void *, void *, int);
static void pmap_pdp_dtor(void *, void *);
#ifdef PAE
/* need to allocate items of 4 pages */
static void *pmap_pdp_alloc(struct pool *, int);
static void pmap_pdp_free(struct pool *, void *);
static struct pool_allocator pmap_pdp_allocator = {
	.pa_alloc = pmap_pdp_alloc,
	.pa_free = pmap_pdp_free,
	.pa_pagesz = PAGE_SIZE * PDP_SIZE,
};
#endif /* PAE */

extern vaddr_t idt_vaddr;			/* we allocate IDT early */
extern paddr_t idt_paddr;

#ifdef _LP64
extern vaddr_t lo32_vaddr;
extern vaddr_t lo32_paddr;
#endif

extern int end;

#ifdef i386
/* stuff to fix the pentium f00f bug */
extern vaddr_t pentium_idt_vaddr;
#endif


/*
 * local prototypes
 */

static struct vm_page	*pmap_get_ptp(struct pmap *, vaddr_t,
				      pd_entry_t * const *);
static struct vm_page	*pmap_find_ptp(struct pmap *, vaddr_t, paddr_t, int);
static void		 pmap_freepage(struct pmap *, struct vm_page *, int);
static void		 pmap_free_ptp(struct pmap *, struct vm_page *,
				       vaddr_t, pt_entry_t *,
				       pd_entry_t * const *);
static bool		 pmap_remove_pte(struct pmap *, struct vm_page *,
					 pt_entry_t *, vaddr_t,
					 struct pv_entry **);
static void		 pmap_remove_ptes(struct pmap *, struct vm_page *,
					  vaddr_t, vaddr_t, vaddr_t,
					  struct pv_entry **);

static bool		 pmap_get_physpage(vaddr_t, int, paddr_t *);
static void		 pmap_alloc_level(pd_entry_t * const *, vaddr_t, int,
					  long *);

static bool		 pmap_reactivate(struct pmap *);

/*
 * p m a p   h e l p e r   f u n c t i o n s
 */

static inline void
pmap_stats_update(struct pmap *pmap, int resid_diff, int wired_diff)
{

	if (pmap == pmap_kernel()) {
		atomic_add_long(&pmap->pm_stats.resident_count, resid_diff);
		atomic_add_long(&pmap->pm_stats.wired_count, wired_diff);
	} else {
		KASSERT(mutex_owned(pmap->pm_lock));
		pmap->pm_stats.resident_count += resid_diff;
		pmap->pm_stats.wired_count += wired_diff;
	}
}
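
/*
 * Illustrative sketch (not compiled): how PTE bits translate into the
 * resident/wired deltas fed to pmap_stats_update() above.  Replacing an
 * invalid PTE with a valid, wired one bumps both counters by one;
 * pmap_stats_update_bypte() below derives exactly these deltas from the
 * old and new PTEs.
 */
#if 0
static void
pmap_stats_example(struct pmap *pmap)
{
	pt_entry_t opte = 0;			/* previously invalid */
	pt_entry_t npte = PG_V | PG_W;		/* now valid and wired */
	int resid_diff = ((npte & PG_V) ? 1 : 0) - ((opte & PG_V) ? 1 : 0);
	int wired_diff = ((npte & PG_W) ? 1 : 0) - ((opte & PG_W) ? 1 : 0);

	pmap_stats_update(pmap, resid_diff, wired_diff);	/* +1, +1 */
}
#endif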

static inline void
pmap_stats_update_bypte(struct pmap *pmap, pt_entry_t npte, pt_entry_t opte)
{
	int resid_diff = ((npte & PG_V) ? 1 : 0) - ((opte & PG_V) ? 1 : 0);
	int wired_diff = ((npte & PG_W) ? 1 : 0) - ((opte & PG_W) ? 1 : 0);

	KASSERT((npte & (PG_V | PG_W)) != PG_W);
	KASSERT((opte & (PG_V | PG_W)) != PG_W);

	pmap_stats_update(pmap, resid_diff, wired_diff);
}

/*
 * ptp_to_pmap: lookup pmap by ptp
 */

static struct pmap *
ptp_to_pmap(struct vm_page *ptp)
{
	struct pmap *pmap;

	if (ptp == NULL) {
		return pmap_kernel();
	}
	pmap = (struct pmap *)ptp->uobject;
	KASSERT(pmap != NULL);
	KASSERT(&pmap->pm_obj[0] == ptp->uobject);
	return pmap;
}

static inline struct pv_pte *
pve_to_pvpte(struct pv_entry *pve)
{

	KASSERT((void *)&pve->pve_pte == (void *)pve);
	return &pve->pve_pte;
}

static inline struct pv_entry *
pvpte_to_pve(struct pv_pte *pvpte)
{
	struct pv_entry *pve = (void *)pvpte;

	KASSERT(pve_to_pvpte(pve) == pvpte);
	return pve;
}

/*
 * pv_pte_first, pv_pte_next: PV list iterator.
 */

static struct pv_pte *
pv_pte_first(struct pmap_page *pp)
{

	if ((pp->pp_flags & PP_EMBEDDED) != 0) {
		return &pp->pp_pte;
	}
	return pve_to_pvpte(LIST_FIRST(&pp->pp_head.pvh_list));
}

static struct pv_pte *
pv_pte_next(struct pmap_page *pp, struct pv_pte *pvpte)
{

	KASSERT(pvpte != NULL);
	if (pvpte == &pp->pp_pte) {
		KASSERT((pp->pp_flags & PP_EMBEDDED) != 0);
		return NULL;
	}
	KASSERT((pp->pp_flags & PP_EMBEDDED) == 0);
	return pve_to_pvpte(LIST_NEXT(pvpte_to_pve(pvpte), pve_list));
}

/*
 * pmap_is_curpmap: is this pmap the one currently loaded [in %cr3]?
 *	of course the kernel is always loaded
 */

bool
pmap_is_curpmap(struct pmap *pmap)
{
	return((pmap == pmap_kernel()) ||
	       (pmap == curcpu()->ci_pmap));
}

/*
 * Add a reference to the specified pmap.
 */

void
pmap_reference(struct pmap *pmap)
{

	atomic_inc_uint(&pmap->pm_obj[0].uo_refs);
}

/*
 * pmap_map_ptes: map a pmap's PTEs into KVM and lock them in
 *
 * there are several pmaps involved.  some or all of them might be same.
 *
 *  - the pmap given by the first argument
 *	our caller wants to access this pmap's PTEs.
 *
 *  - pmap_kernel()
 *	the kernel pmap.  note that it only contains the kernel part
 *	of the address space which is shared by any pmap.  ie. any
 *	pmap can be used instead of pmap_kernel() for our purpose.
 *
 *  - ci->ci_pmap
 *	pmap currently loaded on the cpu.
 *
 *  - vm_map_pmap(&curproc->p_vmspace->vm_map)
 *	current process' pmap.
 *
 * => we lock enough pmaps to keep things locked in
 * => must be undone with pmap_unmap_ptes before returning
 */

void
pmap_map_ptes(struct pmap *pmap, struct pmap **pmap2,
	      pd_entry_t **ptepp, pd_entry_t * const **pdeppp)
{
	struct pmap *curpmap;
	struct cpu_info *ci;
	lwp_t *l;

	/* The kernel's pmap is always accessible. */
	if (pmap == pmap_kernel()) {
		*pmap2 = NULL;
		*ptepp = PTE_BASE;
		*pdeppp = normal_pdes;
		return;
	}
	KASSERT(kpreempt_disabled());

	l = curlwp;
 retry:
	mutex_enter(pmap->pm_lock);
	ci = curcpu();
	curpmap = ci->ci_pmap;
	if (vm_map_pmap(&l->l_proc->p_vmspace->vm_map) == pmap) {
		/* Our own pmap so just load it: easy.
*/ 744 if (__predict_false(ci->ci_want_pmapload)) { 745 mutex_exit(pmap->pm_lock); 746 pmap_load(); 747 goto retry; 748 } 749 KASSERT(pmap == curpmap); 750 } else if (pmap == curpmap) { 751 /* 752 * Already on the CPU: make it valid. This is very 753 * often the case during exit(), when we have switched 754 * to the kernel pmap in order to destroy a user pmap. 755 */ 756 if (!pmap_reactivate(pmap)) { 757 u_int gen = uvm_emap_gen_return(); 758 tlbflush(); 759 uvm_emap_update(gen); 760 } 761 } else { 762 /* 763 * Toss current pmap from CPU, but keep a reference to it. 764 * The reference will be dropped by pmap_unmap_ptes(). 765 * Can happen if we block during exit(). 766 */ 767 const cpuid_t cid = cpu_index(ci); 768 769 kcpuset_atomic_clear(curpmap->pm_cpus, cid); 770 kcpuset_atomic_clear(curpmap->pm_kernel_cpus, cid); 771 ci->ci_pmap = pmap; 772 ci->ci_tlbstate = TLBSTATE_VALID; 773 kcpuset_atomic_set(pmap->pm_cpus, cid); 774 kcpuset_atomic_set(pmap->pm_kernel_cpus, cid); 775 cpu_load_pmap(pmap, curpmap); 776 } 777 pmap->pm_ncsw = l->l_ncsw; 778 *pmap2 = curpmap; 779 *ptepp = PTE_BASE; 780 #if defined(XEN) && defined(__x86_64__) 781 KASSERT(ci->ci_normal_pdes[PTP_LEVELS - 2] == L4_BASE); 782 ci->ci_normal_pdes[PTP_LEVELS - 2] = pmap->pm_pdir; 783 *pdeppp = ci->ci_normal_pdes; 784 #else /* XEN && __x86_64__ */ 785 *pdeppp = normal_pdes; 786 #endif /* XEN && __x86_64__ */ 787 } 788 789 /* 790 * pmap_unmap_ptes: unlock the PTE mapping of "pmap" 791 */ 792 793 void 794 pmap_unmap_ptes(struct pmap *pmap, struct pmap *pmap2) 795 { 796 struct cpu_info *ci; 797 struct pmap *mypmap; 798 799 KASSERT(kpreempt_disabled()); 800 801 /* The kernel's pmap is always accessible. */ 802 if (pmap == pmap_kernel()) { 803 return; 804 } 805 806 ci = curcpu(); 807 #if defined(XEN) && defined(__x86_64__) 808 /* Reset per-cpu normal_pdes */ 809 KASSERT(ci->ci_normal_pdes[PTP_LEVELS - 2] != L4_BASE); 810 ci->ci_normal_pdes[PTP_LEVELS - 2] = L4_BASE; 811 #endif /* XEN && __x86_64__ */ 812 /* 813 * We cannot tolerate context switches while mapped in. 814 * If it is our own pmap all we have to do is unlock. 815 */ 816 KASSERT(pmap->pm_ncsw == curlwp->l_ncsw); 817 mypmap = vm_map_pmap(&curproc->p_vmspace->vm_map); 818 if (pmap == mypmap) { 819 mutex_exit(pmap->pm_lock); 820 return; 821 } 822 823 /* 824 * Mark whatever's on the CPU now as lazy and unlock. 825 * If the pmap was already installed, we are done. 826 */ 827 ci->ci_tlbstate = TLBSTATE_LAZY; 828 ci->ci_want_pmapload = (mypmap != pmap_kernel()); 829 mutex_exit(pmap->pm_lock); 830 if (pmap == pmap2) { 831 return; 832 } 833 834 /* 835 * We installed another pmap on the CPU. Grab a reference to 836 * it and leave in place. Toss the evicted pmap (can block). 837 */ 838 pmap_reference(pmap); 839 pmap_destroy(pmap2); 840 } 841 842 843 inline static void 844 pmap_exec_account(struct pmap *pm, vaddr_t va, pt_entry_t opte, pt_entry_t npte) 845 { 846 847 #if !defined(__x86_64__) 848 if (curproc == NULL || curproc->p_vmspace == NULL || 849 pm != vm_map_pmap(&curproc->p_vmspace->vm_map)) 850 return; 851 852 if ((opte ^ npte) & PG_X) 853 pmap_update_pg(va); 854 855 /* 856 * Executability was removed on the last executable change. 857 * Reset the code segment to something conservative and 858 * let the trap handler deal with setting the right limit. 859 * We can't do that because of locking constraints on the vm map. 
	 */

	if ((opte & PG_X) && (npte & PG_X) == 0 && va == pm->pm_hiexec) {
		struct trapframe *tf = curlwp->l_md.md_regs;

		tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL);
		pm->pm_hiexec = I386_MAX_EXE_ADDR;
	}
#endif /* !defined(__x86_64__) */
}

#if !defined(__x86_64__)
/*
 * Fixup the code segment to cover all potential executable mappings.
 * returns 0 if no changes to the code segment were made.
 */

int
pmap_exec_fixup(struct vm_map *map, struct trapframe *tf, struct pcb *pcb)
{
	struct vm_map_entry *ent;
	struct pmap *pm = vm_map_pmap(map);
	vaddr_t va = 0;

	vm_map_lock_read(map);
	for (ent = (&map->header)->next; ent != &map->header; ent = ent->next) {

		/*
		 * This entry has greater va than the entries before.
		 * We need to make it point to the last page, not past it.
		 */

		if (ent->protection & VM_PROT_EXECUTE)
			va = trunc_page(ent->end) - PAGE_SIZE;
	}
	vm_map_unlock_read(map);
	if (va == pm->pm_hiexec && tf->tf_cs == GSEL(GUCODEBIG_SEL, SEL_UPL))
		return (0);

	pm->pm_hiexec = va;
	if (pm->pm_hiexec > I386_MAX_EXE_ADDR) {
		tf->tf_cs = GSEL(GUCODEBIG_SEL, SEL_UPL);
	} else {
		tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL);
		return (0);
	}
	return (1);
}
#endif /* !defined(__x86_64__) */

void
pat_init(struct cpu_info *ci)
{
	uint64_t pat;

	if (!(ci->ci_feat_val[0] & CPUID_PAT))
		return;

	/* We change WT to WC. Leave all other entries the default values. */
	pat = PATENTRY(0, PAT_WB) | PATENTRY(1, PAT_WC) |
	      PATENTRY(2, PAT_UCMINUS) | PATENTRY(3, PAT_UC) |
	      PATENTRY(4, PAT_WB) | PATENTRY(5, PAT_WC) |
	      PATENTRY(6, PAT_UCMINUS) | PATENTRY(7, PAT_UC);

	wrmsr(MSR_CR_PAT, pat);
	cpu_pat_enabled = true;
	aprint_debug_dev(ci->ci_dev, "PAT enabled\n");
}

static pt_entry_t
pmap_pat_flags(u_int flags)
{
	u_int cacheflags = (flags & PMAP_CACHE_MASK);

	if (!cpu_pat_enabled) {
		switch (cacheflags) {
		case PMAP_NOCACHE:
		case PMAP_NOCACHE_OVR:
			/* results in PGC_UCMINUS on cpus which have
			 * the cpuid PAT but PAT "disabled"
			 */
			return PG_N;
		default:
			return 0;
		}
	}

	switch (cacheflags) {
	case PMAP_NOCACHE:
		return PGC_UC;
	case PMAP_WRITE_COMBINE:
		return PGC_WC;
	case PMAP_WRITE_BACK:
		return PGC_WB;
	case PMAP_NOCACHE_OVR:
		return PGC_UCMINUS;
	}

	return 0;
}
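
/*
 * Illustrative sketch (not compiled): mapping a device range (say a
 * framebuffer at fb_pa, a hypothetical address) write-combined.  Once
 * pat_init() has run, pmap_pat_flags(PMAP_WRITE_COMBINE) yields PGC_WC
 * for the PTEs created by pmap_kenter_pa() below; without PAT the
 * request degrades to the default cacheability, as pmap_pat_flags()
 * above shows.
 */
#if 0
static void
pmap_example_map_wc(vaddr_t va, paddr_t fb_pa, vsize_t len)
{
	vsize_t off;

	for (off = 0; off < len; off += PAGE_SIZE)
		pmap_kenter_pa(va + off, fb_pa + off,
		    VM_PROT_READ | VM_PROT_WRITE, PMAP_WRITE_COMBINE);
	pmap_update(pmap_kernel());
}
#endif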

/*
 * p m a p   k e n t e r   f u n c t i o n s
 *
 * functions to quickly enter/remove pages from the kernel address
 * space.  pmap_kremove is exported to MI kernel.  we make use of
 * the recursive PTE mappings.
 */

/*
 * pmap_kenter_pa: enter a kernel mapping without R/M (pv_entry) tracking
 *
 * => no need to lock anything, assume va is already allocated
 * => should be faster than normal pmap enter function
 */

void
pmap_kenter_pa(vaddr_t va, paddr_t pa, vm_prot_t prot, u_int flags)
{
	pt_entry_t *pte, opte, npte;

	KASSERT(!(prot & ~VM_PROT_ALL));

	if (va < VM_MIN_KERNEL_ADDRESS)
		pte = vtopte(va);
	else
		pte = kvtopte(va);
#ifdef DOM0OPS
	if (pa < pmap_pa_start || pa >= pmap_pa_end) {
#ifdef DEBUG
		printf_nolog("%s: pa 0x%" PRIx64 " for va 0x%" PRIx64
		    " outside range\n", __func__, (int64_t)pa, (int64_t)va);
#endif /* DEBUG */
		npte = pa;
	} else
#endif /* DOM0OPS */
		npte = pmap_pa2pte(pa);
	npte |= protection_codes[prot] | PG_k | PG_V | pmap_pg_g;
	npte |= pmap_pat_flags(flags);
	opte = pmap_pte_testset(pte, npte); /* zap! */
#if defined(DIAGNOSTIC)
	/* XXX For now... */
	if (opte & PG_PS)
		panic("%s: PG_PS", __func__);
#endif
	if ((opte & (PG_V | PG_U)) == (PG_V | PG_U)) {
		/* This should not happen. */
		printf_nolog("%s: mapping already present\n", __func__);
		kpreempt_disable();
		pmap_tlb_shootdown(pmap_kernel(), va, opte, TLBSHOOT_KENTER);
		kpreempt_enable();
	}
}

void
pmap_emap_enter(vaddr_t va, paddr_t pa, vm_prot_t prot)
{
	pt_entry_t *pte, npte;

	KASSERT((prot & ~VM_PROT_ALL) == 0);
	pte = (va < VM_MIN_KERNEL_ADDRESS) ? vtopte(va) : kvtopte(va);

#ifdef DOM0OPS
	if (pa < pmap_pa_start || pa >= pmap_pa_end) {
		npte = pa;
	} else
#endif
		npte = pmap_pa2pte(pa);

	npte |= protection_codes[prot] | PG_k | PG_V;
	pmap_pte_set(pte, npte);
}

/*
 * pmap_emap_sync: perform TLB flush or pmap load, if it was deferred.
 */
void
pmap_emap_sync(bool canload)
{
	struct cpu_info *ci = curcpu();
	struct pmap *pmap;

	KASSERT(kpreempt_disabled());
	if (__predict_true(ci->ci_want_pmapload && canload)) {
		/*
		 * XXX: Hint for pmap_reactivate(), which might suggest to
		 * not perform TLB flush, if state has not changed.
		 */
		pmap = vm_map_pmap(&curlwp->l_proc->p_vmspace->vm_map);
		if (__predict_false(pmap == ci->ci_pmap)) {
			kcpuset_atomic_clear(pmap->pm_cpus, cpu_index(ci));
		}
		pmap_load();
		KASSERT(ci->ci_want_pmapload == 0);
	} else {
		tlbflush();
	}
}

void
pmap_emap_remove(vaddr_t sva, vsize_t len)
{
	pt_entry_t *pte;
	vaddr_t va, eva = sva + len;

	for (va = sva; va < eva; va += PAGE_SIZE) {
		pte = (va < VM_MIN_KERNEL_ADDRESS) ? vtopte(va) : kvtopte(va);
		pmap_pte_set(pte, 0);
	}
}

__strict_weak_alias(pmap_kenter_ma, pmap_kenter_pa);

#if defined(__x86_64__)
/*
 * Change protection for a virtual address. Local for a CPU only, don't
 * care about TLB shootdowns.
1079 * 1080 * => must be called with preemption disabled 1081 */ 1082 void 1083 pmap_changeprot_local(vaddr_t va, vm_prot_t prot) 1084 { 1085 pt_entry_t *pte, opte, npte; 1086 1087 KASSERT(kpreempt_disabled()); 1088 1089 if (va < VM_MIN_KERNEL_ADDRESS) 1090 pte = vtopte(va); 1091 else 1092 pte = kvtopte(va); 1093 1094 npte = opte = *pte; 1095 1096 if ((prot & VM_PROT_WRITE) != 0) 1097 npte |= PG_RW; 1098 else 1099 npte &= ~PG_RW; 1100 1101 if (opte != npte) { 1102 pmap_pte_set(pte, npte); 1103 pmap_pte_flush(); 1104 invlpg(va); 1105 } 1106 } 1107 #endif /* defined(__x86_64__) */ 1108 1109 /* 1110 * pmap_kremove: remove a kernel mapping(s) without R/M (pv_entry) tracking 1111 * 1112 * => no need to lock anything 1113 * => caller must dispose of any vm_page mapped in the va range 1114 * => note: not an inline function 1115 * => we assume the va is page aligned and the len is a multiple of PAGE_SIZE 1116 * => we assume kernel only unmaps valid addresses and thus don't bother 1117 * checking the valid bit before doing TLB flushing 1118 * => must be followed by call to pmap_update() before reuse of page 1119 */ 1120 1121 static inline void 1122 pmap_kremove1(vaddr_t sva, vsize_t len, bool localonly) 1123 { 1124 pt_entry_t *pte, opte; 1125 vaddr_t va, eva; 1126 1127 eva = sva + len; 1128 1129 kpreempt_disable(); 1130 for (va = sva; va < eva; va += PAGE_SIZE) { 1131 pte = kvtopte(va); 1132 opte = pmap_pte_testset(pte, 0); /* zap! */ 1133 if ((opte & (PG_V | PG_U)) == (PG_V | PG_U) && !localonly) { 1134 pmap_tlb_shootdown(pmap_kernel(), va, opte, 1135 TLBSHOOT_KREMOVE); 1136 } 1137 KASSERT((opte & PG_PS) == 0); 1138 KASSERT((opte & PG_PVLIST) == 0); 1139 } 1140 if (localonly) { 1141 tlbflushg(); 1142 } 1143 kpreempt_enable(); 1144 } 1145 1146 void 1147 pmap_kremove(vaddr_t sva, vsize_t len) 1148 { 1149 1150 pmap_kremove1(sva, len, false); 1151 } 1152 1153 /* 1154 * pmap_kremove_local: like pmap_kremove(), but only worry about 1155 * TLB invalidations on the current CPU. this is only intended 1156 * for use while writing kernel crash dumps. 1157 */ 1158 1159 void 1160 pmap_kremove_local(vaddr_t sva, vsize_t len) 1161 { 1162 1163 KASSERT(panicstr != NULL); 1164 pmap_kremove1(sva, len, true); 1165 } 1166 1167 /* 1168 * p m a p i n i t f u n c t i o n s 1169 * 1170 * pmap_bootstrap and pmap_init are called during system startup 1171 * to init the pmap module. pmap_bootstrap() does a low level 1172 * init just to get things rolling. pmap_init() finishes the job. 1173 */ 1174 1175 /* 1176 * pmap_bootstrap: get the system in a state where it can run with VM 1177 * properly enabled (called before main()). the VM system is 1178 * fully init'd later... 1179 * 1180 * => on i386, locore.s has already enabled the MMU by allocating 1181 * a PDP for the kernel, and nkpde PTP's for the kernel. 1182 * => kva_start is the first free virtual address in kernel space 1183 */ 1184 1185 void 1186 pmap_bootstrap(vaddr_t kva_start) 1187 { 1188 struct pmap *kpm; 1189 pt_entry_t *pte; 1190 int i; 1191 vaddr_t kva; 1192 #ifndef XEN 1193 pd_entry_t *pde; 1194 unsigned long p1i; 1195 vaddr_t kva_end; 1196 #endif 1197 #ifdef __HAVE_DIRECT_MAP 1198 phys_ram_seg_t *mc; 1199 long ndmpdp; 1200 paddr_t lastpa, dmpd, dmpdp, pdp; 1201 vaddr_t tmpva; 1202 #endif 1203 1204 pt_entry_t pg_nx = (cpu_feature[2] & CPUID_NOX ? 
PG_NX : 0); 1205 1206 /* 1207 * set up our local static global vars that keep track of the 1208 * usage of KVM before kernel_map is set up 1209 */ 1210 1211 virtual_avail = kva_start; /* first free KVA */ 1212 virtual_end = VM_MAX_KERNEL_ADDRESS; /* last KVA */ 1213 1214 /* 1215 * set up protection_codes: we need to be able to convert from 1216 * a MI protection code (some combo of VM_PROT...) to something 1217 * we can jam into a i386 PTE. 1218 */ 1219 1220 protection_codes[VM_PROT_NONE] = pg_nx; /* --- */ 1221 protection_codes[VM_PROT_EXECUTE] = PG_RO | PG_X; /* --x */ 1222 protection_codes[VM_PROT_READ] = PG_RO | pg_nx; /* -r- */ 1223 protection_codes[VM_PROT_READ|VM_PROT_EXECUTE] = PG_RO | PG_X;/* -rx */ 1224 protection_codes[VM_PROT_WRITE] = PG_RW | pg_nx; /* w-- */ 1225 protection_codes[VM_PROT_WRITE|VM_PROT_EXECUTE] = PG_RW | PG_X;/* w-x */ 1226 protection_codes[VM_PROT_WRITE|VM_PROT_READ] = PG_RW | pg_nx; 1227 /* wr- */ 1228 protection_codes[VM_PROT_ALL] = PG_RW | PG_X; /* wrx */ 1229 1230 /* 1231 * now we init the kernel's pmap 1232 * 1233 * the kernel pmap's pm_obj is not used for much. however, in 1234 * user pmaps the pm_obj contains the list of active PTPs. 1235 * the pm_obj currently does not have a pager. it might be possible 1236 * to add a pager that would allow a process to read-only mmap its 1237 * own page tables (fast user level vtophys?). this may or may not 1238 * be useful. 1239 */ 1240 1241 kpm = pmap_kernel(); 1242 for (i = 0; i < PTP_LEVELS - 1; i++) { 1243 mutex_init(&kpm->pm_obj_lock[i], MUTEX_DEFAULT, IPL_NONE); 1244 uvm_obj_init(&kpm->pm_obj[i], NULL, false, 1); 1245 uvm_obj_setlock(&kpm->pm_obj[i], &kpm->pm_obj_lock[i]); 1246 kpm->pm_ptphint[i] = NULL; 1247 } 1248 memset(&kpm->pm_list, 0, sizeof(kpm->pm_list)); /* pm_list not used */ 1249 1250 kpm->pm_pdir = (pd_entry_t *)(PDPpaddr + KERNBASE); 1251 for (i = 0; i < PDP_SIZE; i++) 1252 kpm->pm_pdirpa[i] = PDPpaddr + PAGE_SIZE * i; 1253 1254 kpm->pm_stats.wired_count = kpm->pm_stats.resident_count = 1255 x86_btop(kva_start - VM_MIN_KERNEL_ADDRESS); 1256 1257 kcpuset_create(&kpm->pm_cpus, true); 1258 kcpuset_create(&kpm->pm_kernel_cpus, true); 1259 1260 /* 1261 * the above is just a rough estimate and not critical to the proper 1262 * operation of the system. 1263 */ 1264 1265 #ifndef XEN 1266 /* 1267 * Begin to enable global TLB entries if they are supported. 1268 * The G bit has no effect until the CR4_PGE bit is set in CR4, 1269 * which happens in cpu_init(), which is run on each cpu 1270 * (and happens later) 1271 */ 1272 1273 if (cpu_feature[0] & CPUID_PGE) { 1274 pmap_pg_g = PG_G; /* enable software */ 1275 1276 /* add PG_G attribute to already mapped kernel pages */ 1277 if (KERNBASE == VM_MIN_KERNEL_ADDRESS) { 1278 kva_end = virtual_avail; 1279 } else { 1280 extern vaddr_t eblob, esym; 1281 kva_end = (vaddr_t)&end; 1282 if (esym > kva_end) 1283 kva_end = esym; 1284 if (eblob > kva_end) 1285 kva_end = eblob; 1286 kva_end = roundup(kva_end, PAGE_SIZE); 1287 } 1288 for (kva = KERNBASE; kva < kva_end; kva += PAGE_SIZE) { 1289 p1i = pl1_i(kva); 1290 if (pmap_valid_entry(PTE_BASE[p1i])) 1291 PTE_BASE[p1i] |= PG_G; 1292 } 1293 } 1294 1295 /* 1296 * enable large pages if they are supported. 
1297 */ 1298 1299 if (cpu_feature[0] & CPUID_PSE) { 1300 paddr_t pa; 1301 extern char __data_start; 1302 1303 lcr4(rcr4() | CR4_PSE); /* enable hardware (via %cr4) */ 1304 pmap_largepages = 1; /* enable software */ 1305 1306 /* 1307 * the TLB must be flushed after enabling large pages 1308 * on Pentium CPUs, according to section 3.6.2.2 of 1309 * "Intel Architecture Software Developer's Manual, 1310 * Volume 3: System Programming". 1311 */ 1312 tlbflushg(); 1313 1314 /* 1315 * now, remap the kernel text using large pages. we 1316 * assume that the linker has properly aligned the 1317 * .data segment to a NBPD_L2 boundary. 1318 */ 1319 kva_end = rounddown((vaddr_t)&__data_start, NBPD_L1); 1320 for (pa = 0, kva = KERNBASE; kva + NBPD_L2 <= kva_end; 1321 kva += NBPD_L2, pa += NBPD_L2) { 1322 pde = &L2_BASE[pl2_i(kva)]; 1323 *pde = pa | pmap_pg_g | PG_PS | 1324 PG_KR | PG_V; /* zap! */ 1325 tlbflushg(); 1326 } 1327 #if defined(DEBUG) 1328 aprint_normal("kernel text is mapped with %" PRIuPSIZE " large " 1329 "pages and %" PRIuPSIZE " normal pages\n", 1330 howmany(kva - KERNBASE, NBPD_L2), 1331 howmany((vaddr_t)&__data_start - kva, NBPD_L1)); 1332 #endif /* defined(DEBUG) */ 1333 } 1334 #endif /* !XEN */ 1335 1336 #ifdef __HAVE_DIRECT_MAP 1337 1338 tmpva = (KERNBASE + NKL2_KIMG_ENTRIES * NBPD_L2); 1339 pte = PTE_BASE + pl1_i(tmpva); 1340 1341 /* 1342 * Map the direct map. Use 1GB pages if they are available, 1343 * otherwise use 2MB pages. Note that the unused parts of 1344 * PTPs * must be zero outed, as they might be accessed due 1345 * to speculative execution. Also, PG_G is not allowed on 1346 * non-leaf PTPs. 1347 */ 1348 1349 lastpa = 0; 1350 for (i = 0; i < mem_cluster_cnt; i++) { 1351 mc = &mem_clusters[i]; 1352 lastpa = MAX(lastpa, mc->start + mc->size); 1353 } 1354 1355 ndmpdp = (lastpa + NBPD_L3 - 1) >> L3_SHIFT; 1356 dmpdp = avail_start; avail_start += PAGE_SIZE; 1357 1358 *pte = dmpdp | PG_V | PG_RW; 1359 pmap_update_pg(tmpva); 1360 memset((void *)tmpva, 0, PAGE_SIZE); 1361 1362 if (cpu_feature[2] & CPUID_P1GB) { 1363 for (i = 0; i < ndmpdp; i++) { 1364 pdp = (paddr_t)&(((pd_entry_t *)dmpdp)[i]); 1365 *pte = (pdp & PG_FRAME) | PG_V | PG_RW; 1366 pmap_update_pg(tmpva); 1367 1368 pde = (pd_entry_t *)(tmpva + (pdp & ~PG_FRAME)); 1369 *pde = ((paddr_t)i << L3_SHIFT) | 1370 PG_RW | PG_V | PG_U | PG_PS | PG_G; 1371 } 1372 } else { 1373 dmpd = avail_start; avail_start += ndmpdp * PAGE_SIZE; 1374 1375 for (i = 0; i < ndmpdp; i++) { 1376 pdp = dmpd + i * PAGE_SIZE; 1377 *pte = (pdp & PG_FRAME) | PG_V | PG_RW; 1378 pmap_update_pg(tmpva); 1379 1380 memset((void *)tmpva, 0, PAGE_SIZE); 1381 } 1382 for (i = 0; i < NPDPG * ndmpdp; i++) { 1383 pdp = (paddr_t)&(((pd_entry_t *)dmpd)[i]); 1384 *pte = (pdp & PG_FRAME) | PG_V | PG_RW; 1385 pmap_update_pg(tmpva); 1386 1387 pde = (pd_entry_t *)(tmpva + (pdp & ~PG_FRAME)); 1388 *pde = ((paddr_t)i << L2_SHIFT) | 1389 PG_RW | PG_V | PG_U | PG_PS | PG_G; 1390 } 1391 for (i = 0; i < ndmpdp; i++) { 1392 pdp = (paddr_t)&(((pd_entry_t *)dmpdp)[i]); 1393 *pte = (pdp & PG_FRAME) | PG_V | PG_RW; 1394 pmap_update_pg((vaddr_t)tmpva); 1395 1396 pde = (pd_entry_t *)(tmpva + (pdp & ~PG_FRAME)); 1397 *pde = (dmpd + (i << PAGE_SHIFT)) | 1398 PG_RW | PG_V | PG_U; 1399 } 1400 } 1401 1402 kpm->pm_pdir[PDIR_SLOT_DIRECT] = dmpdp | PG_KW | PG_V | PG_U; 1403 1404 tlbflush(); 1405 1406 #else 1407 if (VM_MIN_KERNEL_ADDRESS != KERNBASE) { 1408 /* 1409 * zero_pte is stuck at the end of mapped space for the kernel 1410 * image (disjunct from kva space). 
This is done so that it 1411 * can safely be used in pmap_growkernel (pmap_get_physpage), 1412 * when it's called for the first time. 1413 * XXXfvdl fix this for MULTIPROCESSOR later. 1414 */ 1415 #ifdef XEN 1416 /* early_zerop initialized in xen_pmap_bootstrap() */ 1417 #else 1418 early_zerop = (void *)(KERNBASE + NKL2_KIMG_ENTRIES * NBPD_L2); 1419 #endif 1420 early_zero_pte = PTE_BASE + pl1_i((vaddr_t)early_zerop); 1421 } 1422 1423 /* 1424 * now we allocate the "special" VAs which are used for tmp mappings 1425 * by the pmap (and other modules). we allocate the VAs by advancing 1426 * virtual_avail (note that there are no pages mapped at these VAs). 1427 * we find the PTE that maps the allocated VA via the linear PTE 1428 * mapping. 1429 */ 1430 1431 pte = PTE_BASE + pl1_i(virtual_avail); 1432 1433 #ifdef MULTIPROCESSOR 1434 /* 1435 * Waste some VA space to avoid false sharing of cache lines 1436 * for page table pages: Give each possible CPU a cache line 1437 * of PTE's (8) to play with, though we only need 4. We could 1438 * recycle some of this waste by putting the idle stacks here 1439 * as well; we could waste less space if we knew the largest 1440 * CPU ID beforehand. 1441 */ 1442 csrcp = (char *) virtual_avail; csrc_pte = pte; 1443 1444 cdstp = (char *) virtual_avail+PAGE_SIZE; cdst_pte = pte+1; 1445 1446 zerop = (char *) virtual_avail+PAGE_SIZE*2; zero_pte = pte+2; 1447 1448 ptpp = (char *) virtual_avail+PAGE_SIZE*3; ptp_pte = pte+3; 1449 1450 virtual_avail += PAGE_SIZE * maxcpus * NPTECL; 1451 pte += maxcpus * NPTECL; 1452 #else 1453 csrcp = (void *) virtual_avail; csrc_pte = pte; /* allocate */ 1454 virtual_avail += PAGE_SIZE; pte++; /* advance */ 1455 1456 cdstp = (void *) virtual_avail; cdst_pte = pte; 1457 virtual_avail += PAGE_SIZE; pte++; 1458 1459 zerop = (void *) virtual_avail; zero_pte = pte; 1460 virtual_avail += PAGE_SIZE; pte++; 1461 1462 ptpp = (void *) virtual_avail; ptp_pte = pte; 1463 virtual_avail += PAGE_SIZE; pte++; 1464 #endif 1465 1466 if (VM_MIN_KERNEL_ADDRESS == KERNBASE) { 1467 early_zerop = zerop; 1468 early_zero_pte = zero_pte; 1469 } 1470 #endif 1471 1472 /* 1473 * Nothing after this point actually needs pte. 1474 */ 1475 pte = (void *)0xdeadbeef; 1476 1477 #ifdef XEN 1478 #ifdef __x86_64__ 1479 /* 1480 * We want a dummy page directory for Xen: 1481 * when deactivate a pmap, Xen will still consider it active. 1482 * So we set user PGD to this one to lift all protection on 1483 * the now inactive page tables set. 
1484 */ 1485 xen_dummy_user_pgd = avail_start; 1486 avail_start += PAGE_SIZE; 1487 1488 /* Zero fill it, the less checks in Xen it requires the better */ 1489 memset((void *) (xen_dummy_user_pgd + KERNBASE), 0, PAGE_SIZE); 1490 /* Mark read-only */ 1491 HYPERVISOR_update_va_mapping(xen_dummy_user_pgd + KERNBASE, 1492 pmap_pa2pte(xen_dummy_user_pgd) | PG_u | PG_V, UVMF_INVLPG); 1493 /* Pin as L4 */ 1494 xpq_queue_pin_l4_table(xpmap_ptom_masked(xen_dummy_user_pgd)); 1495 #endif /* __x86_64__ */ 1496 idt_vaddr = virtual_avail; /* don't need pte */ 1497 idt_paddr = avail_start; /* steal a page */ 1498 /* 1499 * Xen require one more page as we can't store 1500 * GDT and LDT on the same page 1501 */ 1502 virtual_avail += 3 * PAGE_SIZE; 1503 avail_start += 3 * PAGE_SIZE; 1504 #else /* XEN */ 1505 idt_vaddr = virtual_avail; /* don't need pte */ 1506 idt_paddr = avail_start; /* steal a page */ 1507 #if defined(__x86_64__) 1508 virtual_avail += 2 * PAGE_SIZE; 1509 avail_start += 2 * PAGE_SIZE; 1510 #else /* defined(__x86_64__) */ 1511 virtual_avail += PAGE_SIZE; 1512 avail_start += PAGE_SIZE; 1513 /* pentium f00f bug stuff */ 1514 pentium_idt_vaddr = virtual_avail; /* don't need pte */ 1515 virtual_avail += PAGE_SIZE; 1516 #endif /* defined(__x86_64__) */ 1517 #endif /* XEN */ 1518 1519 #ifdef _LP64 1520 /* 1521 * Grab a page below 4G for things that need it (i.e. 1522 * having an initial %cr3 for the MP trampoline). 1523 */ 1524 lo32_vaddr = virtual_avail; 1525 virtual_avail += PAGE_SIZE; 1526 lo32_paddr = avail_start; 1527 avail_start += PAGE_SIZE; 1528 #endif 1529 1530 /* 1531 * now we reserve some VM for mapping pages when doing a crash dump 1532 */ 1533 1534 virtual_avail = reserve_dumppages(virtual_avail); 1535 1536 /* 1537 * init the static-global locks and global lists. 1538 * 1539 * => pventry::pvh_lock (initialized elsewhere) must also be 1540 * a spin lock, again at IPL_VM to prevent deadlock, and 1541 * again is never taken from interrupt context. 1542 */ 1543 1544 mutex_init(&pmaps_lock, MUTEX_DEFAULT, IPL_NONE); 1545 LIST_INIT(&pmaps); 1546 1547 /* 1548 * ensure the TLB is sync'd with reality by flushing it... 1549 */ 1550 1551 tlbflushg(); 1552 1553 /* 1554 * calculate pmap_maxkvaddr from nkptp[]. 1555 */ 1556 1557 kva = VM_MIN_KERNEL_ADDRESS; 1558 for (i = PTP_LEVELS - 1; i >= 1; i--) { 1559 kva += nkptp[i] * nbpd[i]; 1560 } 1561 pmap_maxkvaddr = kva; 1562 } 1563 1564 #if defined(__x86_64__) 1565 /* 1566 * Pre-allocate PTPs for low memory, so that 1:1 mappings for various 1567 * trampoline code can be entered. 
1568 */ 1569 void 1570 pmap_prealloc_lowmem_ptps(void) 1571 { 1572 int level; 1573 paddr_t newp; 1574 pd_entry_t *pdes; 1575 1576 const pd_entry_t pteflags = PG_k | PG_V | PG_RW; 1577 1578 pdes = pmap_kernel()->pm_pdir; 1579 level = PTP_LEVELS; 1580 for (;;) { 1581 newp = avail_start; 1582 avail_start += PAGE_SIZE; 1583 #ifdef __HAVE_DIRECT_MAP 1584 memset((void *)PMAP_DIRECT_MAP(newp), 0, PAGE_SIZE); 1585 #else 1586 pmap_pte_set(early_zero_pte, pmap_pa2pte(newp) | pteflags); 1587 pmap_pte_flush(); 1588 pmap_update_pg((vaddr_t)early_zerop); 1589 memset(early_zerop, 0, PAGE_SIZE); 1590 #endif 1591 1592 #ifdef XEN 1593 /* Mark R/O before installing */ 1594 HYPERVISOR_update_va_mapping ((vaddr_t)early_zerop, 1595 xpmap_ptom_masked(newp) | PG_u | PG_V, UVMF_INVLPG); 1596 if (newp < (NKL2_KIMG_ENTRIES * NBPD_L2)) 1597 HYPERVISOR_update_va_mapping (newp + KERNBASE, 1598 xpmap_ptom_masked(newp) | PG_u | PG_V, UVMF_INVLPG); 1599 1600 1601 if (level == PTP_LEVELS) { /* Top level pde is per-cpu */ 1602 pd_entry_t *kpm_pdir; 1603 /* Reach it via recursive mapping */ 1604 kpm_pdir = normal_pdes[PTP_LEVELS - 2]; 1605 1606 /* Set it as usual. We can't defer this 1607 * outside the loop since recursive 1608 * pte entries won't be accessible during 1609 * further iterations at lower levels 1610 * otherwise. 1611 */ 1612 pmap_pte_set(&kpm_pdir[pl_i(0, PTP_LEVELS)], 1613 pmap_pa2pte(newp) | pteflags); 1614 } 1615 1616 #endif /* XEN */ 1617 pmap_pte_set(&pdes[pl_i(0, level)], 1618 pmap_pa2pte(newp) | pteflags); 1619 1620 pmap_pte_flush(); 1621 1622 level--; 1623 if (level <= 1) 1624 break; 1625 pdes = normal_pdes[level - 2]; 1626 } 1627 } 1628 #endif /* defined(__x86_64__) */ 1629 1630 /* 1631 * pmap_init: called from uvm_init, our job is to get the pmap 1632 * system ready to manage mappings... 1633 */ 1634 1635 void 1636 pmap_init(void) 1637 { 1638 int i, flags; 1639 1640 for (i = 0; i < PV_HASH_SIZE; i++) { 1641 SLIST_INIT(&pv_hash_heads[i].hh_list); 1642 } 1643 for (i = 0; i < PV_HASH_LOCK_CNT; i++) { 1644 mutex_init(&pv_hash_locks[i].lock, MUTEX_NODEBUG, IPL_VM); 1645 } 1646 1647 /* 1648 * initialize caches. 1649 */ 1650 1651 pool_cache_bootstrap(&pmap_cache, sizeof(struct pmap), 0, 0, 0, 1652 "pmappl", NULL, IPL_NONE, NULL, NULL, NULL); 1653 1654 #ifdef XEN 1655 /* 1656 * pool_cache(9) should not touch cached objects, since they 1657 * are pinned on xen and R/O for the domU 1658 */ 1659 flags = PR_NOTOUCH; 1660 #else /* XEN */ 1661 flags = 0; 1662 #endif /* XEN */ 1663 #ifdef PAE 1664 pool_cache_bootstrap(&pmap_pdp_cache, PAGE_SIZE * PDP_SIZE, 0, 0, flags, 1665 "pdppl", &pmap_pdp_allocator, IPL_NONE, 1666 pmap_pdp_ctor, pmap_pdp_dtor, NULL); 1667 #else /* PAE */ 1668 pool_cache_bootstrap(&pmap_pdp_cache, PAGE_SIZE, 0, 0, flags, 1669 "pdppl", NULL, IPL_NONE, pmap_pdp_ctor, pmap_pdp_dtor, NULL); 1670 #endif /* PAE */ 1671 pool_cache_bootstrap(&pmap_pv_cache, sizeof(struct pv_entry), 0, 0, 1672 PR_LARGECACHE, "pvpl", &pool_allocator_kmem, IPL_NONE, NULL, 1673 NULL, NULL); 1674 1675 pmap_tlb_init(); 1676 1677 /* XXX: Since cpu_hatch() is only for secondary CPUs. */ 1678 pmap_tlb_cpu_init(curcpu()); 1679 1680 evcnt_attach_dynamic(&pmap_iobmp_evcnt, EVCNT_TYPE_MISC, 1681 NULL, "x86", "io bitmap copy"); 1682 evcnt_attach_dynamic(&pmap_ldt_evcnt, EVCNT_TYPE_MISC, 1683 NULL, "x86", "ldt sync"); 1684 1685 /* 1686 * done: pmap module is up (and ready for business) 1687 */ 1688 1689 pmap_initialized = true; 1690 } 1691 1692 /* 1693 * pmap_cpu_init_late: perform late per-CPU initialization. 
1694 */ 1695 1696 #ifndef XEN 1697 void 1698 pmap_cpu_init_late(struct cpu_info *ci) 1699 { 1700 /* 1701 * The BP has already its own PD page allocated during early 1702 * MD startup. 1703 */ 1704 if (ci == &cpu_info_primary) 1705 return; 1706 1707 #ifdef PAE 1708 cpu_alloc_l3_page(ci); 1709 #endif 1710 } 1711 #endif 1712 1713 /* 1714 * p v _ e n t r y f u n c t i o n s 1715 */ 1716 1717 /* 1718 * pmap_free_pvs: free a list of pv_entrys 1719 */ 1720 1721 static void 1722 pmap_free_pvs(struct pv_entry *pve) 1723 { 1724 struct pv_entry *next; 1725 1726 for ( /* null */ ; pve != NULL ; pve = next) { 1727 next = pve->pve_next; 1728 pool_cache_put(&pmap_pv_cache, pve); 1729 } 1730 } 1731 1732 /* 1733 * main pv_entry manipulation functions: 1734 * pmap_enter_pv: enter a mapping onto a pv_head list 1735 * pmap_remove_pv: remove a mapping from a pv_head list 1736 * 1737 * NOTE: Both pmap_enter_pv and pmap_remove_pv expect the caller to lock 1738 * the pvh before calling 1739 */ 1740 1741 /* 1742 * insert_pv: a helper of pmap_enter_pv 1743 */ 1744 1745 static void 1746 insert_pv(struct pmap_page *pp, struct pv_entry *pve) 1747 { 1748 struct pv_hash_head *hh; 1749 kmutex_t *lock; 1750 u_int hash; 1751 1752 hash = pvhash_hash(pve->pve_pte.pte_ptp, pve->pve_pte.pte_va); 1753 lock = pvhash_lock(hash); 1754 hh = pvhash_head(hash); 1755 mutex_spin_enter(lock); 1756 SLIST_INSERT_HEAD(&hh->hh_list, pve, pve_hash); 1757 mutex_spin_exit(lock); 1758 1759 LIST_INSERT_HEAD(&pp->pp_head.pvh_list, pve, pve_list); 1760 } 1761 1762 /* 1763 * pmap_enter_pv: enter a mapping onto a pv_head lst 1764 * 1765 * => caller should adjust ptp's wire_count before calling 1766 */ 1767 1768 static struct pv_entry * 1769 pmap_enter_pv(struct pmap_page *pp, 1770 struct pv_entry *pve, /* preallocated pve for us to use */ 1771 struct pv_entry **sparepve, 1772 struct vm_page *ptp, 1773 vaddr_t va) 1774 { 1775 1776 KASSERT(ptp == NULL || ptp->wire_count >= 2); 1777 KASSERT(ptp == NULL || ptp->uobject != NULL); 1778 KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset); 1779 1780 if ((pp->pp_flags & PP_EMBEDDED) == 0) { 1781 if (LIST_EMPTY(&pp->pp_head.pvh_list)) { 1782 pp->pp_flags |= PP_EMBEDDED; 1783 pp->pp_pte.pte_ptp = ptp; 1784 pp->pp_pte.pte_va = va; 1785 1786 return pve; 1787 } 1788 } else { 1789 struct pv_entry *pve2; 1790 1791 pve2 = *sparepve; 1792 *sparepve = NULL; 1793 1794 pve2->pve_pte = pp->pp_pte; 1795 pp->pp_flags &= ~PP_EMBEDDED; 1796 LIST_INIT(&pp->pp_head.pvh_list); 1797 insert_pv(pp, pve2); 1798 } 1799 1800 pve->pve_pte.pte_ptp = ptp; 1801 pve->pve_pte.pte_va = va; 1802 insert_pv(pp, pve); 1803 1804 return NULL; 1805 } 1806 1807 /* 1808 * pmap_remove_pv: try to remove a mapping from a pv_list 1809 * 1810 * => caller should adjust ptp's wire_count and free PTP if needed 1811 * => we return the removed pve 1812 */ 1813 1814 static struct pv_entry * 1815 pmap_remove_pv(struct pmap_page *pp, struct vm_page *ptp, vaddr_t va) 1816 { 1817 struct pv_hash_head *hh; 1818 struct pv_entry *pve; 1819 kmutex_t *lock; 1820 u_int hash; 1821 1822 KASSERT(ptp == NULL || ptp->uobject != NULL); 1823 KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset); 1824 1825 if ((pp->pp_flags & PP_EMBEDDED) != 0) { 1826 KASSERT(pp->pp_pte.pte_ptp == ptp); 1827 KASSERT(pp->pp_pte.pte_va == va); 1828 1829 pp->pp_flags &= ~PP_EMBEDDED; 1830 LIST_INIT(&pp->pp_head.pvh_list); 1831 1832 return NULL; 1833 } 1834 1835 hash = pvhash_hash(ptp, va); 1836 lock = pvhash_lock(hash); 1837 hh = pvhash_head(hash); 1838 mutex_spin_enter(lock); 1839 pve = 
pvhash_remove(hh, ptp, va); 1840 mutex_spin_exit(lock); 1841 1842 LIST_REMOVE(pve, pve_list); 1843 1844 return pve; 1845 } 1846 1847 /* 1848 * p t p f u n c t i o n s 1849 */ 1850 1851 static inline struct vm_page * 1852 pmap_find_ptp(struct pmap *pmap, vaddr_t va, paddr_t pa, int level) 1853 { 1854 int lidx = level - 1; 1855 struct vm_page *pg; 1856 1857 KASSERT(mutex_owned(pmap->pm_lock)); 1858 1859 if (pa != (paddr_t)-1 && pmap->pm_ptphint[lidx] && 1860 pa == VM_PAGE_TO_PHYS(pmap->pm_ptphint[lidx])) { 1861 return (pmap->pm_ptphint[lidx]); 1862 } 1863 PMAP_SUBOBJ_LOCK(pmap, lidx); 1864 pg = uvm_pagelookup(&pmap->pm_obj[lidx], ptp_va2o(va, level)); 1865 PMAP_SUBOBJ_UNLOCK(pmap, lidx); 1866 1867 KASSERT(pg == NULL || pg->wire_count >= 1); 1868 return pg; 1869 } 1870 1871 static inline void 1872 pmap_freepage(struct pmap *pmap, struct vm_page *ptp, int level) 1873 { 1874 lwp_t *l; 1875 int lidx; 1876 struct uvm_object *obj; 1877 1878 KASSERT(ptp->wire_count == 1); 1879 1880 lidx = level - 1; 1881 1882 obj = &pmap->pm_obj[lidx]; 1883 pmap_stats_update(pmap, -1, 0); 1884 if (lidx != 0) 1885 mutex_enter(obj->vmobjlock); 1886 if (pmap->pm_ptphint[lidx] == ptp) 1887 pmap->pm_ptphint[lidx] = TAILQ_FIRST(&obj->memq); 1888 ptp->wire_count = 0; 1889 uvm_pagerealloc(ptp, NULL, 0); 1890 l = curlwp; 1891 KASSERT((l->l_pflag & LP_INTR) == 0); 1892 VM_PAGE_TO_PP(ptp)->pp_link = l->l_md.md_gc_ptp; 1893 l->l_md.md_gc_ptp = ptp; 1894 if (lidx != 0) 1895 mutex_exit(obj->vmobjlock); 1896 } 1897 1898 static void 1899 pmap_free_ptp(struct pmap *pmap, struct vm_page *ptp, vaddr_t va, 1900 pt_entry_t *ptes, pd_entry_t * const *pdes) 1901 { 1902 unsigned long index; 1903 int level; 1904 vaddr_t invaladdr; 1905 pd_entry_t opde; 1906 1907 KASSERT(pmap != pmap_kernel()); 1908 KASSERT(mutex_owned(pmap->pm_lock)); 1909 KASSERT(kpreempt_disabled()); 1910 1911 level = 1; 1912 do { 1913 index = pl_i(va, level + 1); 1914 opde = pmap_pte_testset(&pdes[level - 1][index], 0); 1915 #if defined(XEN) 1916 # if defined(__x86_64__) 1917 /* 1918 * If ptp is a L3 currently mapped in kernel space, 1919 * on any cpu, clear it before freeing 1920 */ 1921 if (level == PTP_LEVELS - 1) { 1922 /* 1923 * Update the per-cpu PD on all cpus the current 1924 * pmap is active on 1925 */ 1926 xen_kpm_sync(pmap, index); 1927 } 1928 # endif /*__x86_64__ */ 1929 invaladdr = level == 1 ? (vaddr_t)ptes : 1930 (vaddr_t)pdes[level - 2]; 1931 pmap_tlb_shootdown(pmap, invaladdr + index * PAGE_SIZE, 1932 opde, TLBSHOOT_FREE_PTP1); 1933 pmap_tlb_shootnow(); 1934 #else /* XEN */ 1935 invaladdr = level == 1 ? 
(vaddr_t)ptes : 1936 (vaddr_t)pdes[level - 2]; 1937 pmap_tlb_shootdown(pmap, invaladdr + index * PAGE_SIZE, 1938 opde, TLBSHOOT_FREE_PTP1); 1939 #endif /* XEN */ 1940 pmap_freepage(pmap, ptp, level); 1941 if (level < PTP_LEVELS - 1) { 1942 ptp = pmap_find_ptp(pmap, va, (paddr_t)-1, level + 1); 1943 ptp->wire_count--; 1944 if (ptp->wire_count > 1) 1945 break; 1946 } 1947 } while (++level < PTP_LEVELS); 1948 pmap_pte_flush(); 1949 } 1950 1951 /* 1952 * pmap_get_ptp: get a PTP (if there isn't one, allocate a new one) 1953 * 1954 * => pmap should NOT be pmap_kernel() 1955 * => pmap should be locked 1956 * => preemption should be disabled 1957 */ 1958 1959 static struct vm_page * 1960 pmap_get_ptp(struct pmap *pmap, vaddr_t va, pd_entry_t * const *pdes) 1961 { 1962 struct vm_page *ptp, *pptp; 1963 int i; 1964 unsigned long index; 1965 pd_entry_t *pva; 1966 paddr_t ppa, pa; 1967 struct uvm_object *obj; 1968 1969 KASSERT(pmap != pmap_kernel()); 1970 KASSERT(mutex_owned(pmap->pm_lock)); 1971 KASSERT(kpreempt_disabled()); 1972 1973 ptp = NULL; 1974 pa = (paddr_t)-1; 1975 1976 /* 1977 * Loop through all page table levels seeing if we need to 1978 * add a new page to that level. 1979 */ 1980 for (i = PTP_LEVELS; i > 1; i--) { 1981 /* 1982 * Save values from previous round. 1983 */ 1984 pptp = ptp; 1985 ppa = pa; 1986 1987 index = pl_i(va, i); 1988 pva = pdes[i - 2]; 1989 1990 if (pmap_valid_entry(pva[index])) { 1991 ppa = pmap_pte2pa(pva[index]); 1992 ptp = NULL; 1993 continue; 1994 } 1995 1996 obj = &pmap->pm_obj[i-2]; 1997 PMAP_SUBOBJ_LOCK(pmap, i - 2); 1998 ptp = uvm_pagealloc(obj, ptp_va2o(va, i - 1), NULL, 1999 UVM_PGA_USERESERVE|UVM_PGA_ZERO); 2000 PMAP_SUBOBJ_UNLOCK(pmap, i - 2); 2001 2002 if (ptp == NULL) 2003 return NULL; 2004 2005 ptp->flags &= ~PG_BUSY; /* never busy */ 2006 ptp->wire_count = 1; 2007 pmap->pm_ptphint[i - 2] = ptp; 2008 pa = VM_PAGE_TO_PHYS(ptp); 2009 pmap_pte_set(&pva[index], (pd_entry_t) 2010 (pmap_pa2pte(pa) | PG_u | PG_RW | PG_V)); 2011 #if defined(XEN) && defined(__x86_64__) 2012 if(i == PTP_LEVELS) { 2013 /* 2014 * Update the per-cpu PD on all cpus the current 2015 * pmap is active on 2016 */ 2017 xen_kpm_sync(pmap, index); 2018 } 2019 #endif 2020 pmap_pte_flush(); 2021 pmap_stats_update(pmap, 1, 0); 2022 /* 2023 * If we're not in the top level, increase the 2024 * wire count of the parent page. 2025 */ 2026 if (i < PTP_LEVELS) { 2027 if (pptp == NULL) { 2028 pptp = pmap_find_ptp(pmap, va, ppa, i); 2029 KASSERT(pptp != NULL); 2030 } 2031 pptp->wire_count++; 2032 } 2033 } 2034 2035 /* 2036 * PTP is not NULL if we just allocated a new PTP. If it is 2037 * still NULL, we must look up the existing one. 2038 */ 2039 if (ptp == NULL) { 2040 ptp = pmap_find_ptp(pmap, va, ppa, 1); 2041 KASSERTMSG(ptp != NULL, "pmap_get_ptp: va %" PRIxVADDR 2042 "ppa %" PRIxPADDR "\n", va, ppa); 2043 } 2044 2045 pmap->pm_ptphint[0] = ptp; 2046 return ptp; 2047 } 2048 2049 /* 2050 * p m a p l i f e c y c l e f u n c t i o n s 2051 */ 2052 2053 /* 2054 * pmap_pdp_ctor: constructor for the PDP cache. 2055 */ 2056 static int 2057 pmap_pdp_ctor(void *arg, void *v, int flags) 2058 { 2059 pd_entry_t *pdir = v; 2060 paddr_t pdirpa = 0; /* XXX: GCC */ 2061 vaddr_t object; 2062 int i; 2063 2064 #if !defined(XEN) || !defined(__x86_64__) 2065 int npde; 2066 #endif 2067 #ifdef XEN 2068 int s; 2069 #endif 2070 2071 /* 2072 * NOTE: The `pmaps_lock' is held when the PDP is allocated. 2073 */ 2074 2075 #if defined(XEN) && defined(__x86_64__) 2076 /* fetch the physical address of the page directory. 
*/ 2077 (void) pmap_extract(pmap_kernel(), (vaddr_t) pdir, &pdirpa); 2078 2079 /* zero init area */ 2080 memset (pdir, 0, PAGE_SIZE); /* Xen wants a clean page */ 2081 /* 2082 * this pdir will NEVER be active in kernel mode 2083 * so mark recursive entry invalid 2084 */ 2085 pdir[PDIR_SLOT_PTE] = pmap_pa2pte(pdirpa) | PG_u; 2086 /* 2087 * PDP constructed this way won't be for kernel, 2088 * hence we don't put kernel mappings on Xen. 2089 * But we need to make pmap_create() happy, so put a dummy (without 2090 * PG_V) value at the right place. 2091 */ 2092 pdir[PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1] - 1] = 2093 (pd_entry_t)-1 & PG_FRAME; 2094 #else /* XEN && __x86_64__*/ 2095 /* zero init area */ 2096 memset(pdir, 0, PDIR_SLOT_PTE * sizeof(pd_entry_t)); 2097 2098 object = (vaddr_t)v; 2099 for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) { 2100 /* fetch the physical address of the page directory. */ 2101 (void) pmap_extract(pmap_kernel(), object, &pdirpa); 2102 /* put in recursive PDE to map the PTEs */ 2103 pdir[PDIR_SLOT_PTE + i] = pmap_pa2pte(pdirpa) | PG_V; 2104 #ifndef XEN 2105 pdir[PDIR_SLOT_PTE + i] |= PG_KW; 2106 #endif 2107 } 2108 2109 /* copy kernel's PDE */ 2110 npde = nkptp[PTP_LEVELS - 1]; 2111 2112 memcpy(&pdir[PDIR_SLOT_KERN], &PDP_BASE[PDIR_SLOT_KERN], 2113 npde * sizeof(pd_entry_t)); 2114 2115 /* zero the rest */ 2116 memset(&pdir[PDIR_SLOT_KERN + npde], 0, (PAGE_SIZE * PDP_SIZE) - 2117 (PDIR_SLOT_KERN + npde) * sizeof(pd_entry_t)); 2118 2119 if (VM_MIN_KERNEL_ADDRESS != KERNBASE) { 2120 int idx = pl_i(KERNBASE, PTP_LEVELS); 2121 2122 pdir[idx] = PDP_BASE[idx]; 2123 } 2124 2125 #ifdef __HAVE_DIRECT_MAP 2126 pdir[PDIR_SLOT_DIRECT] = PDP_BASE[PDIR_SLOT_DIRECT]; 2127 #endif 2128 2129 #endif /* XEN && __x86_64__*/ 2130 #ifdef XEN 2131 s = splvm(); 2132 object = (vaddr_t)v; 2133 pmap_protect(pmap_kernel(), object, object + (PAGE_SIZE * PDP_SIZE), 2134 VM_PROT_READ); 2135 pmap_update(pmap_kernel()); 2136 for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) { 2137 /* 2138 * pin as L2/L4 page, we have to do the page with the 2139 * PDIR_SLOT_PTE entries last 2140 */ 2141 #ifdef PAE 2142 if (i == l2tol3(PDIR_SLOT_PTE)) 2143 continue; 2144 #endif 2145 2146 (void) pmap_extract(pmap_kernel(), object, &pdirpa); 2147 #ifdef __x86_64__ 2148 xpq_queue_pin_l4_table(xpmap_ptom_masked(pdirpa)); 2149 #else 2150 xpq_queue_pin_l2_table(xpmap_ptom_masked(pdirpa)); 2151 #endif 2152 } 2153 #ifdef PAE 2154 object = ((vaddr_t)pdir) + PAGE_SIZE * l2tol3(PDIR_SLOT_PTE); 2155 (void)pmap_extract(pmap_kernel(), object, &pdirpa); 2156 xpq_queue_pin_l2_table(xpmap_ptom_masked(pdirpa)); 2157 #endif 2158 splx(s); 2159 #endif /* XEN */ 2160 2161 return (0); 2162 } 2163 2164 /* 2165 * pmap_pdp_dtor: destructor for the PDP cache. 2166 */ 2167 2168 static void 2169 pmap_pdp_dtor(void *arg, void *v) 2170 { 2171 #ifdef XEN 2172 paddr_t pdirpa = 0; /* XXX: GCC */ 2173 vaddr_t object = (vaddr_t)v; 2174 int i; 2175 int s = splvm(); 2176 pt_entry_t *pte; 2177 2178 for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) { 2179 /* fetch the physical address of the page directory. 
*/ 2180 (void) pmap_extract(pmap_kernel(), object, &pdirpa); 2181 /* unpin page table */ 2182 xpq_queue_unpin_table(xpmap_ptom_masked(pdirpa)); 2183 } 2184 object = (vaddr_t)v; 2185 for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) { 2186 /* Set page RW again */ 2187 pte = kvtopte(object); 2188 pmap_pte_set(pte, *pte | PG_RW); 2189 xen_bcast_invlpg((vaddr_t)object); 2190 } 2191 splx(s); 2192 #endif /* XEN */ 2193 } 2194 2195 #ifdef PAE 2196 2197 /* pmap_pdp_alloc: Allocate a page for the pdp memory pool. */ 2198 2199 static void * 2200 pmap_pdp_alloc(struct pool *pp, int flags) 2201 { 2202 return (void *)uvm_km_alloc(kernel_map, 2203 PAGE_SIZE * PDP_SIZE, PAGE_SIZE * PDP_SIZE, 2204 ((flags & PR_WAITOK) ? 0 : UVM_KMF_NOWAIT | UVM_KMF_TRYLOCK) 2205 | UVM_KMF_WIRED); 2206 } 2207 2208 /* 2209 * pmap_pdp_free: free a PDP 2210 */ 2211 2212 static void 2213 pmap_pdp_free(struct pool *pp, void *v) 2214 { 2215 uvm_km_free(kernel_map, (vaddr_t)v, PAGE_SIZE * PDP_SIZE, 2216 UVM_KMF_WIRED); 2217 } 2218 #endif /* PAE */ 2219 2220 /* 2221 * pmap_create: create a pmap object. 2222 */ 2223 struct pmap * 2224 pmap_create(void) 2225 { 2226 struct pmap *pmap; 2227 int i; 2228 2229 pmap = pool_cache_get(&pmap_cache, PR_WAITOK); 2230 2231 /* init uvm_object */ 2232 for (i = 0; i < PTP_LEVELS - 1; i++) { 2233 mutex_init(&pmap->pm_obj_lock[i], MUTEX_DEFAULT, IPL_NONE); 2234 uvm_obj_init(&pmap->pm_obj[i], NULL, false, 1); 2235 uvm_obj_setlock(&pmap->pm_obj[i], &pmap->pm_obj_lock[i]); 2236 pmap->pm_ptphint[i] = NULL; 2237 } 2238 pmap->pm_stats.wired_count = 0; 2239 /* count the PDP allocd below */ 2240 pmap->pm_stats.resident_count = PDP_SIZE; 2241 #if !defined(__x86_64__) 2242 pmap->pm_hiexec = 0; 2243 #endif /* !defined(__x86_64__) */ 2244 pmap->pm_flags = 0; 2245 pmap->pm_gc_ptp = NULL; 2246 2247 kcpuset_create(&pmap->pm_cpus, true); 2248 kcpuset_create(&pmap->pm_kernel_cpus, true); 2249 #ifdef XEN 2250 kcpuset_create(&pmap->pm_xen_ptp_cpus, true); 2251 #endif 2252 /* init the LDT */ 2253 pmap->pm_ldt = NULL; 2254 pmap->pm_ldt_len = 0; 2255 pmap->pm_ldt_sel = GSYSSEL(GLDT_SEL, SEL_KPL); 2256 2257 /* allocate PDP */ 2258 try_again: 2259 pmap->pm_pdir = pool_cache_get(&pmap_pdp_cache, PR_WAITOK); 2260 2261 mutex_enter(&pmaps_lock); 2262 2263 if (pmap->pm_pdir[PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1] - 1] == 0) { 2264 mutex_exit(&pmaps_lock); 2265 pool_cache_destruct_object(&pmap_pdp_cache, pmap->pm_pdir); 2266 goto try_again; 2267 } 2268 2269 for (i = 0; i < PDP_SIZE; i++) 2270 pmap->pm_pdirpa[i] = 2271 pmap_pte2pa(pmap->pm_pdir[PDIR_SLOT_PTE + i]); 2272 2273 LIST_INSERT_HEAD(&pmaps, pmap, pm_list); 2274 2275 mutex_exit(&pmaps_lock); 2276 2277 return (pmap); 2278 } 2279 2280 /* 2281 * pmap_free_ptps: put a list of ptps back to the freelist. 2282 */ 2283 2284 static void 2285 pmap_free_ptps(struct vm_page *empty_ptps) 2286 { 2287 struct vm_page *ptp; 2288 struct pmap_page *pp; 2289 2290 while ((ptp = empty_ptps) != NULL) { 2291 pp = VM_PAGE_TO_PP(ptp); 2292 empty_ptps = pp->pp_link; 2293 LIST_INIT(&pp->pp_head.pvh_list); 2294 uvm_pagefree(ptp); 2295 } 2296 } 2297 2298 /* 2299 * pmap_destroy: drop reference count on pmap. free pmap if 2300 * reference count goes to zero. 2301 */ 2302 2303 void 2304 pmap_destroy(struct pmap *pmap) 2305 { 2306 lwp_t *l; 2307 int i; 2308 2309 /* 2310 * If we have torn down this pmap, process deferred frees and 2311 * invalidations. Free now if the system is low on memory. 2312 * Otherwise, free when the pmap is destroyed thus avoiding a 2313 * TLB shootdown. 
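	 * (md_gc_pmap and md_gc_ptp are set up by pmap_remove_all() and
	 * pmap_freepage(); handing the queued PTPs over to pm_gc_ptp
	 * defers their release to the final pmap_destroy() below, where
	 * no shootdown is needed because the pages are no longer
	 * visible to any CPU.)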
2314 */ 2315 l = curlwp; 2316 if (__predict_false(l->l_md.md_gc_pmap == pmap)) { 2317 if (uvmexp.free < uvmexp.freetarg) { 2318 pmap_update(pmap); 2319 } else { 2320 KASSERT(pmap->pm_gc_ptp == NULL); 2321 pmap->pm_gc_ptp = l->l_md.md_gc_ptp; 2322 l->l_md.md_gc_ptp = NULL; 2323 l->l_md.md_gc_pmap = NULL; 2324 } 2325 } 2326 2327 /* 2328 * drop reference count 2329 */ 2330 2331 if (atomic_dec_uint_nv(&pmap->pm_obj[0].uo_refs) > 0) { 2332 return; 2333 } 2334 2335 #ifdef DIAGNOSTIC 2336 CPU_INFO_ITERATOR cii; 2337 struct cpu_info *ci; 2338 2339 for (CPU_INFO_FOREACH(cii, ci)) { 2340 if (ci->ci_pmap == pmap) 2341 panic("destroying pmap being used"); 2342 #if defined(XEN) && defined(__x86_64__) 2343 for (i = 0; i < PDIR_SLOT_PTE; i++) { 2344 if (pmap->pm_pdir[i] != 0 && 2345 ci->ci_kpm_pdir[i] == pmap->pm_pdir[i]) { 2346 printf("pmap_destroy(%p) pmap_kernel %p " 2347 "curcpu %d cpu %d ci_pmap %p " 2348 "ci->ci_kpm_pdir[%d]=%" PRIx64 2349 " pmap->pm_pdir[%d]=%" PRIx64 "\n", 2350 pmap, pmap_kernel(), curcpu()->ci_index, 2351 ci->ci_index, ci->ci_pmap, 2352 i, ci->ci_kpm_pdir[i], 2353 i, pmap->pm_pdir[i]); 2354 panic("pmap_destroy: used pmap"); 2355 } 2356 } 2357 #endif 2358 } 2359 #endif /* DIAGNOSTIC */ 2360 2361 /* 2362 * Reference count is zero, free pmap resources and then free pmap. 2363 * First, remove it from global list of pmaps. 2364 */ 2365 2366 mutex_enter(&pmaps_lock); 2367 LIST_REMOVE(pmap, pm_list); 2368 mutex_exit(&pmaps_lock); 2369 2370 /* 2371 * Process deferred PTP frees. No TLB shootdown required, as the 2372 * PTP pages are no longer visible to any CPU. 2373 */ 2374 2375 pmap_free_ptps(pmap->pm_gc_ptp); 2376 2377 /* 2378 * destroyed pmap shouldn't have remaining PTPs 2379 */ 2380 2381 for (i = 0; i < PTP_LEVELS - 1; i++) { 2382 KASSERT(pmap->pm_obj[i].uo_npages == 0); 2383 KASSERT(TAILQ_EMPTY(&pmap->pm_obj[i].memq)); 2384 } 2385 2386 pool_cache_put(&pmap_pdp_cache, pmap->pm_pdir); 2387 2388 #ifdef USER_LDT 2389 if (pmap->pm_ldt != NULL) { 2390 /* 2391 * no need to switch the LDT; this address space is gone, 2392 * nothing is using it. 2393 * 2394 * No need to lock the pmap for ldt_free (or anything else), 2395 * we're the last one to use it. 2396 */ 2397 mutex_enter(&cpu_lock); 2398 ldt_free(pmap->pm_ldt_sel); 2399 mutex_exit(&cpu_lock); 2400 uvm_km_free(kernel_map, (vaddr_t)pmap->pm_ldt, 2401 pmap->pm_ldt_len, UVM_KMF_WIRED); 2402 } 2403 #endif 2404 2405 for (i = 0; i < PTP_LEVELS - 1; i++) { 2406 uvm_obj_destroy(&pmap->pm_obj[i], false); 2407 mutex_destroy(&pmap->pm_obj_lock[i]); 2408 } 2409 kcpuset_destroy(pmap->pm_cpus); 2410 kcpuset_destroy(pmap->pm_kernel_cpus); 2411 #ifdef XEN 2412 kcpuset_destroy(pmap->pm_xen_ptp_cpus); 2413 #endif 2414 pool_cache_put(&pmap_cache, pmap); 2415 } 2416 2417 /* 2418 * pmap_remove_all: pmap is being torn down by the current thread. 2419 * avoid unnecessary invalidations. 2420 */ 2421 2422 void 2423 pmap_remove_all(struct pmap *pmap) 2424 { 2425 lwp_t *l = curlwp; 2426 2427 KASSERT(l->l_md.md_gc_pmap == NULL); 2428 2429 l->l_md.md_gc_pmap = pmap; 2430 } 2431 2432 #if defined(PMAP_FORK) 2433 /* 2434 * pmap_fork: perform any necessary data structure manipulation when 2435 * a VM space is forked. 
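 *
 * => on x86 the only state that needs copying is the per-process LDT
 *    (USER_LDT); if the parent has no private LDT there is nothing to do.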
2436 */ 2437 2438 void 2439 pmap_fork(struct pmap *pmap1, struct pmap *pmap2) 2440 { 2441 #ifdef USER_LDT 2442 union descriptor *new_ldt; 2443 size_t len; 2444 int sel; 2445 2446 if (__predict_true(pmap1->pm_ldt == NULL)) { 2447 return; 2448 } 2449 2450 retry: 2451 if (pmap1->pm_ldt != NULL) { 2452 len = pmap1->pm_ldt_len; 2453 new_ldt = (union descriptor *)uvm_km_alloc(kernel_map, len, 0, 2454 UVM_KMF_WIRED); 2455 mutex_enter(&cpu_lock); 2456 sel = ldt_alloc(new_ldt, len); 2457 if (sel == -1) { 2458 mutex_exit(&cpu_lock); 2459 uvm_km_free(kernel_map, (vaddr_t)new_ldt, len, 2460 UVM_KMF_WIRED); 2461 printf("WARNING: pmap_fork: unable to allocate LDT\n"); 2462 return; 2463 } 2464 } else { 2465 len = -1; 2466 new_ldt = NULL; 2467 sel = -1; 2468 mutex_enter(&cpu_lock); 2469 } 2470 2471 /* Copy the LDT, if necessary. */ 2472 if (pmap1->pm_ldt != NULL) { 2473 if (len != pmap1->pm_ldt_len) { 2474 if (len != -1) { 2475 ldt_free(sel); 2476 uvm_km_free(kernel_map, (vaddr_t)new_ldt, 2477 len, UVM_KMF_WIRED); 2478 } 2479 mutex_exit(&cpu_lock); 2480 goto retry; 2481 } 2482 2483 memcpy(new_ldt, pmap1->pm_ldt, len); 2484 pmap2->pm_ldt = new_ldt; 2485 pmap2->pm_ldt_len = pmap1->pm_ldt_len; 2486 pmap2->pm_ldt_sel = sel; 2487 len = -1; 2488 } 2489 2490 if (len != -1) { 2491 ldt_free(sel); 2492 uvm_km_free(kernel_map, (vaddr_t)new_ldt, len, 2493 UVM_KMF_WIRED); 2494 } 2495 mutex_exit(&cpu_lock); 2496 #endif /* USER_LDT */ 2497 } 2498 #endif /* PMAP_FORK */ 2499 2500 #ifdef USER_LDT 2501 2502 /* 2503 * pmap_ldt_xcall: cross call used by pmap_ldt_sync. if the named pmap 2504 * is active, reload LDTR. 2505 */ 2506 static void 2507 pmap_ldt_xcall(void *arg1, void *arg2) 2508 { 2509 struct pmap *pm; 2510 2511 kpreempt_disable(); 2512 pm = arg1; 2513 if (curcpu()->ci_pmap == pm) { 2514 lldt(pm->pm_ldt_sel); 2515 } 2516 kpreempt_enable(); 2517 } 2518 2519 /* 2520 * pmap_ldt_sync: LDT selector for the named pmap is changing. swap 2521 * in the new selector on all CPUs. 2522 */ 2523 void 2524 pmap_ldt_sync(struct pmap *pm) 2525 { 2526 uint64_t where; 2527 2528 KASSERT(mutex_owned(&cpu_lock)); 2529 2530 pmap_ldt_evcnt.ev_count++; 2531 where = xc_broadcast(0, pmap_ldt_xcall, pm, NULL); 2532 xc_wait(where); 2533 } 2534 2535 /* 2536 * pmap_ldt_cleanup: if the pmap has a local LDT, deallocate it, and 2537 * restore the default. 
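 *
 * => the pmap reverts to the default global LDT selector; the old LDT
 *    memory is released only after pmap_ldt_sync() has reloaded LDTR on
 *    every CPU currently running with this pmap.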
2538 */ 2539 2540 void 2541 pmap_ldt_cleanup(struct lwp *l) 2542 { 2543 pmap_t pmap = l->l_proc->p_vmspace->vm_map.pmap; 2544 union descriptor *dp = NULL; 2545 size_t len = 0; 2546 int sel = -1; 2547 2548 if (__predict_true(pmap->pm_ldt == NULL)) { 2549 return; 2550 } 2551 2552 mutex_enter(&cpu_lock); 2553 if (pmap->pm_ldt != NULL) { 2554 sel = pmap->pm_ldt_sel; 2555 dp = pmap->pm_ldt; 2556 len = pmap->pm_ldt_len; 2557 pmap->pm_ldt_sel = GSYSSEL(GLDT_SEL, SEL_KPL); 2558 pmap->pm_ldt = NULL; 2559 pmap->pm_ldt_len = 0; 2560 pmap_ldt_sync(pmap); 2561 ldt_free(sel); 2562 uvm_km_free(kernel_map, (vaddr_t)dp, len, UVM_KMF_WIRED); 2563 } 2564 mutex_exit(&cpu_lock); 2565 } 2566 #endif /* USER_LDT */ 2567 2568 /* 2569 * pmap_activate: activate a process' pmap 2570 * 2571 * => must be called with kernel preemption disabled 2572 * => if lwp is the curlwp, then set ci_want_pmapload so that 2573 * actual MMU context switch will be done by pmap_load() later 2574 */ 2575 2576 void 2577 pmap_activate(struct lwp *l) 2578 { 2579 struct cpu_info *ci; 2580 struct pmap *pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map); 2581 2582 KASSERT(kpreempt_disabled()); 2583 2584 ci = curcpu(); 2585 2586 if (l == ci->ci_curlwp) { 2587 KASSERT(ci->ci_want_pmapload == 0); 2588 KASSERT(ci->ci_tlbstate != TLBSTATE_VALID); 2589 #ifdef KSTACK_CHECK_DR0 2590 /* 2591 * setup breakpoint on the top of stack 2592 */ 2593 if (l == &lwp0) 2594 dr0(0, 0, 0, 0); 2595 else 2596 dr0(KSTACK_LOWEST_ADDR(l), 1, 3, 1); 2597 #endif 2598 2599 /* 2600 * no need to switch to kernel vmspace because 2601 * it's a subset of any vmspace. 2602 */ 2603 2604 if (pmap == pmap_kernel()) { 2605 ci->ci_want_pmapload = 0; 2606 return; 2607 } 2608 2609 ci->ci_want_pmapload = 1; 2610 } 2611 } 2612 2613 /* 2614 * pmap_reactivate: try to regain reference to the pmap. 2615 * 2616 * => Must be called with kernel preemption disabled. 2617 */ 2618 2619 static bool 2620 pmap_reactivate(struct pmap *pmap) 2621 { 2622 struct cpu_info * const ci = curcpu(); 2623 const cpuid_t cid = cpu_index(ci); 2624 bool result; 2625 2626 KASSERT(kpreempt_disabled()); 2627 #if defined(XEN) && defined(__x86_64__) 2628 KASSERT(pmap_pdirpa(pmap, 0) == ci->ci_xen_current_user_pgd); 2629 #elif defined(PAE) 2630 KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(ci->ci_pae_l3_pdir[0])); 2631 #elif !defined(XEN) 2632 KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(rcr3())); 2633 #endif 2634 2635 /* 2636 * If we still have a lazy reference to this pmap, we can assume 2637 * that there was no TLB shootdown for this pmap in the meantime. 2638 * 2639 * The order of events here is important as we must synchronize 2640 * with TLB shootdown interrupts. Declare interest in invalidations 2641 * (TLBSTATE_VALID) and then check the CPU set, which the IPIs can 2642 * change only when the state is TLBSTATE_LAZY. 2643 */ 2644 2645 ci->ci_tlbstate = TLBSTATE_VALID; 2646 KASSERT(kcpuset_isset(pmap->pm_kernel_cpus, cid)); 2647 2648 if (kcpuset_isset(pmap->pm_cpus, cid)) { 2649 /* We have the reference, state is valid. */ 2650 result = true; 2651 } else { 2652 /* Must reload the TLB. */ 2653 kcpuset_atomic_set(pmap->pm_cpus, cid); 2654 result = false; 2655 } 2656 return result; 2657 } 2658 2659 /* 2660 * pmap_load: perform the actual pmap switch, i.e. fill in %cr3 register 2661 * and relevant LDT info. 2662 * 2663 * Ensures that the current process' pmap is loaded on the current CPU's 2664 * MMU and that there are no stale TLB entries. 
2665 * 2666 * => The caller should disable kernel preemption or do check-and-retry 2667 * to prevent a preemption from undoing our efforts. 2668 * => This function may block. 2669 */ 2670 void 2671 pmap_load(void) 2672 { 2673 struct cpu_info *ci; 2674 struct pmap *pmap, *oldpmap; 2675 struct lwp *l; 2676 struct pcb *pcb; 2677 cpuid_t cid; 2678 uint64_t ncsw; 2679 2680 kpreempt_disable(); 2681 retry: 2682 ci = curcpu(); 2683 if (!ci->ci_want_pmapload) { 2684 kpreempt_enable(); 2685 return; 2686 } 2687 l = ci->ci_curlwp; 2688 ncsw = l->l_ncsw; 2689 2690 /* should be able to take ipis. */ 2691 KASSERT(ci->ci_ilevel < IPL_HIGH); 2692 #ifdef XEN 2693 /* Check to see if interrupts are enabled (ie; no events are masked) */ 2694 KASSERT(x86_read_psl() == 0); 2695 #else 2696 KASSERT((x86_read_psl() & PSL_I) != 0); 2697 #endif 2698 2699 KASSERT(l != NULL); 2700 pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map); 2701 KASSERT(pmap != pmap_kernel()); 2702 oldpmap = ci->ci_pmap; 2703 pcb = lwp_getpcb(l); 2704 2705 if (pmap == oldpmap) { 2706 if (!pmap_reactivate(pmap)) { 2707 u_int gen = uvm_emap_gen_return(); 2708 2709 /* 2710 * pmap has been changed during deactivated. 2711 * our tlb may be stale. 2712 */ 2713 2714 tlbflush(); 2715 uvm_emap_update(gen); 2716 } 2717 2718 ci->ci_want_pmapload = 0; 2719 kpreempt_enable(); 2720 return; 2721 } 2722 2723 /* 2724 * Acquire a reference to the new pmap and perform the switch. 2725 */ 2726 2727 pmap_reference(pmap); 2728 2729 cid = cpu_index(ci); 2730 kcpuset_atomic_clear(oldpmap->pm_cpus, cid); 2731 kcpuset_atomic_clear(oldpmap->pm_kernel_cpus, cid); 2732 2733 #if defined(XEN) && defined(__x86_64__) 2734 KASSERT(pmap_pdirpa(oldpmap, 0) == ci->ci_xen_current_user_pgd || 2735 oldpmap == pmap_kernel()); 2736 #elif defined(PAE) 2737 KASSERT(pmap_pdirpa(oldpmap, 0) == pmap_pte2pa(ci->ci_pae_l3_pdir[0])); 2738 #elif !defined(XEN) 2739 KASSERT(pmap_pdirpa(oldpmap, 0) == pmap_pte2pa(rcr3())); 2740 #endif 2741 KASSERT(!kcpuset_isset(pmap->pm_cpus, cid)); 2742 KASSERT(!kcpuset_isset(pmap->pm_kernel_cpus, cid)); 2743 2744 /* 2745 * Mark the pmap in use by this CPU. Again, we must synchronize 2746 * with TLB shootdown interrupts, so set the state VALID first, 2747 * then register us for shootdown events on this pmap. 2748 */ 2749 ci->ci_tlbstate = TLBSTATE_VALID; 2750 kcpuset_atomic_set(pmap->pm_cpus, cid); 2751 kcpuset_atomic_set(pmap->pm_kernel_cpus, cid); 2752 ci->ci_pmap = pmap; 2753 2754 /* 2755 * update tss. now that we have registered for invalidations 2756 * from other CPUs, we're good to load the page tables. 2757 */ 2758 #ifdef PAE 2759 pcb->pcb_cr3 = ci->ci_pae_l3_pdirpa; 2760 #else 2761 pcb->pcb_cr3 = pmap_pdirpa(pmap, 0); 2762 #endif 2763 2764 #ifdef i386 2765 #ifndef XEN 2766 ci->ci_tss.tss_ldt = pmap->pm_ldt_sel; 2767 ci->ci_tss.tss_cr3 = pcb->pcb_cr3; 2768 #endif /* !XEN */ 2769 #endif /* i386 */ 2770 2771 lldt(pmap->pm_ldt_sel); 2772 2773 u_int gen = uvm_emap_gen_return(); 2774 cpu_load_pmap(pmap, oldpmap); 2775 uvm_emap_update(gen); 2776 2777 ci->ci_want_pmapload = 0; 2778 2779 /* 2780 * we're now running with the new pmap. drop the reference 2781 * to the old pmap. if we block, we need to go around again. 2782 */ 2783 2784 pmap_destroy(oldpmap); 2785 if (l->l_ncsw != ncsw) { 2786 goto retry; 2787 } 2788 2789 kpreempt_enable(); 2790 } 2791 2792 /* 2793 * pmap_deactivate: deactivate a process' pmap. 2794 * 2795 * => Must be called with kernel preemption disabled (high IPL is enough). 
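 * => We do not clear this CPU's bit in pm_cpus here; we only mark the
 *    per-CPU TLB state TLBSTATE_LAZY, so that pmap_reactivate() can later
 *    tell whether a shootdown invalidated our cached translations in the
 *    meantime.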
2796 */ 2797 void 2798 pmap_deactivate(struct lwp *l) 2799 { 2800 struct pmap *pmap; 2801 struct cpu_info *ci; 2802 2803 KASSERT(kpreempt_disabled()); 2804 2805 if (l != curlwp) { 2806 return; 2807 } 2808 2809 /* 2810 * Wait for pending TLB shootdowns to complete. Necessary because 2811 * TLB shootdown state is per-CPU, and the LWP may be coming off 2812 * the CPU before it has a chance to call pmap_update(), e.g. due 2813 * to kernel preemption or blocking routine in between. 2814 */ 2815 pmap_tlb_shootnow(); 2816 2817 ci = curcpu(); 2818 2819 if (ci->ci_want_pmapload) { 2820 /* 2821 * ci_want_pmapload means that our pmap is not loaded on 2822 * the CPU or TLB might be stale. note that pmap_kernel() 2823 * is always considered loaded. 2824 */ 2825 KASSERT(vm_map_pmap(&l->l_proc->p_vmspace->vm_map) 2826 != pmap_kernel()); 2827 KASSERT(vm_map_pmap(&l->l_proc->p_vmspace->vm_map) 2828 != ci->ci_pmap || ci->ci_tlbstate != TLBSTATE_VALID); 2829 2830 /* 2831 * userspace has not been touched. 2832 * nothing to do here. 2833 */ 2834 2835 ci->ci_want_pmapload = 0; 2836 return; 2837 } 2838 2839 pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map); 2840 2841 if (pmap == pmap_kernel()) { 2842 return; 2843 } 2844 2845 #if defined(XEN) && defined(__x86_64__) 2846 KASSERT(pmap_pdirpa(pmap, 0) == ci->ci_xen_current_user_pgd); 2847 #elif defined(PAE) 2848 KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(ci->ci_pae_l3_pdir[0])); 2849 #elif !defined(XEN) 2850 KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(rcr3())); 2851 #endif 2852 KASSERT(ci->ci_pmap == pmap); 2853 2854 /* 2855 * we aren't interested in TLB invalidations for this pmap, 2856 * at least for the time being. 2857 */ 2858 2859 KASSERT(ci->ci_tlbstate == TLBSTATE_VALID); 2860 ci->ci_tlbstate = TLBSTATE_LAZY; 2861 } 2862 2863 /* 2864 * end of lifecycle functions 2865 */ 2866 2867 /* 2868 * some misc. functions 2869 */ 2870 2871 int 2872 pmap_pdes_invalid(vaddr_t va, pd_entry_t * const *pdes, pd_entry_t *lastpde) 2873 { 2874 int i; 2875 unsigned long index; 2876 pd_entry_t pde; 2877 2878 for (i = PTP_LEVELS; i > 1; i--) { 2879 index = pl_i(va, i); 2880 pde = pdes[i - 2][index]; 2881 if ((pde & PG_V) == 0) 2882 return i; 2883 } 2884 if (lastpde != NULL) 2885 *lastpde = pde; 2886 return 0; 2887 } 2888 2889 /* 2890 * pmap_extract: extract a PA for the given VA 2891 */ 2892 2893 bool 2894 pmap_extract(struct pmap *pmap, vaddr_t va, paddr_t *pap) 2895 { 2896 pt_entry_t *ptes, pte; 2897 pd_entry_t pde; 2898 pd_entry_t * const *pdes; 2899 struct pmap *pmap2; 2900 struct cpu_info *ci; 2901 paddr_t pa; 2902 lwp_t *l; 2903 bool hard, rv; 2904 2905 #ifdef __HAVE_DIRECT_MAP 2906 if (va >= PMAP_DIRECT_BASE && va < PMAP_DIRECT_END) { 2907 if (pap != NULL) { 2908 *pap = va - PMAP_DIRECT_BASE; 2909 } 2910 return true; 2911 } 2912 #endif 2913 2914 rv = false; 2915 pa = 0; 2916 l = curlwp; 2917 2918 KPREEMPT_DISABLE(l); 2919 ci = l->l_cpu; 2920 if (__predict_true(!ci->ci_want_pmapload && ci->ci_pmap == pmap) || 2921 pmap == pmap_kernel()) { 2922 /* 2923 * no need to lock, because it's pmap_kernel() or our 2924 * own pmap and is active. if a user pmap, the caller 2925 * will hold the vm_map write/read locked and so prevent 2926 * entries from disappearing while we are here. ptps 2927 * can disappear via pmap_remove() and pmap_protect(), 2928 * but they are called with the vm_map write locked. 2929 */ 2930 hard = false; 2931 ptes = PTE_BASE; 2932 pdes = normal_pdes; 2933 } else { 2934 /* we lose, do it the hard way. 
 */
2935		hard = true;
2936		pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
2937	}
2938	if (pmap_pdes_valid(va, pdes, &pde)) {
2939		pte = ptes[pl1_i(va)];
2940		if (pde & PG_PS) {
2941			pa = (pde & PG_LGFRAME) | (va & (NBPD_L2 - 1));
2942			rv = true;
2943		} else if (__predict_true((pte & PG_V) != 0)) {
2944			pa = pmap_pte2pa(pte) | (va & (NBPD_L1 - 1));
2945			rv = true;
2946		}
2947	}
2948	if (__predict_false(hard)) {
2949		pmap_unmap_ptes(pmap, pmap2);
2950	}
2951	KPREEMPT_ENABLE(l);
2952	if (pap != NULL) {
2953		*pap = pa;
2954	}
2955	return rv;
2956 }
2957
2958
2959 /*
2960  * vtophys: virtual address to physical address.  For use by
2961  * machine-dependent code only.
2962  */
2963
2964 paddr_t
2965 vtophys(vaddr_t va)
2966 {
2967	paddr_t pa;
2968
2969	if (pmap_extract(pmap_kernel(), va, &pa) == true)
2970		return (pa);
2971	return (0);
2972 }
2973
2974 __strict_weak_alias(pmap_extract_ma, pmap_extract);
2975
2976 #ifdef XEN
2977
2978 /*
2979  * vtomach: virtual address to machine address.  For use by
2980  * machine-dependent code only.
2981  */
2982
2983 paddr_t
2984 vtomach(vaddr_t va)
2985 {
2986	paddr_t pa;
2987
2988	if (pmap_extract_ma(pmap_kernel(), va, &pa) == true)
2989		return (pa);
2990	return (0);
2991 }
2992
2993 #endif /* XEN */
2994
2995 /*
2996  * pmap_virtual_space: used during bootup [pmap_steal_memory] to
2997  * determine the bounds of the kernel virtual address space.
2998  */
2999
3000 void
3001 pmap_virtual_space(vaddr_t *startp, vaddr_t *endp)
3002 {
3003	*startp = virtual_avail;
3004	*endp = virtual_end;
3005 }
3006
3007 /*
3008  * pmap_zero_page: zero a page
3009  */
3010
3011 void
3012 pmap_zero_page(paddr_t pa)
3013 {
3014 #if defined(__HAVE_DIRECT_MAP)
3015	pagezero(PMAP_DIRECT_MAP(pa));
3016 #else
3017 #if defined(XEN)
3018	if (XEN_VERSION_SUPPORTED(3, 4))
3019		xen_pagezero(pa);
3020 #endif
3021	pt_entry_t *zpte;
3022	void *zerova;
3023	int id;
3024
3025	kpreempt_disable();
3026	id = cpu_number();
3027	zpte = PTESLEW(zero_pte, id);
3028	zerova = VASLEW(zerop, id);
3029
3030 #ifdef DIAGNOSTIC
3031	if (*zpte)
3032		panic("pmap_zero_page: lock botch");
3033 #endif
3034
3035	pmap_pte_set(zpte, pmap_pa2pte(pa) | PG_V | PG_RW | PG_M | PG_U | PG_k);
3036	pmap_pte_flush();
3037	pmap_update_pg((vaddr_t)zerova);	/* flush TLB */
3038
3039	memset(zerova, 0, PAGE_SIZE);
3040
3041 #if defined(DIAGNOSTIC) || defined(XEN)
3042	pmap_pte_set(zpte, 0);			/* zap ! */
3043	pmap_pte_flush();
3044 #endif
3045	kpreempt_enable();
3046 #endif /* defined(__HAVE_DIRECT_MAP) */
3047 }
3048
3049 /*
3050  * pmap_pageidlezero: the same, for the idle loop page zero'er.
3051  * Returns true if the page was zero'd, false if we aborted for
3052  * some reason.
3053  */
3054
3055 bool
3056 pmap_pageidlezero(paddr_t pa)
3057 {
3058 #ifdef __HAVE_DIRECT_MAP
3059	KASSERT(cpu_feature[0] & CPUID_SSE2);
3060	return sse2_idlezero_page((void *)PMAP_DIRECT_MAP(pa));
3061 #else
3062	pt_entry_t *zpte;
3063	void *zerova;
3064	bool rv;
3065	int id;
3066
3067	id = cpu_number();
3068	zpte = PTESLEW(zero_pte, id);
3069	zerova = VASLEW(zerop, id);
3070
3071	KASSERT(cpu_feature[0] & CPUID_SSE2);
3072	KASSERT(*zpte == 0);
3073
3074	pmap_pte_set(zpte, pmap_pa2pte(pa) | PG_V | PG_RW | PG_M | PG_U | PG_k);
3075	pmap_pte_flush();
3076	pmap_update_pg((vaddr_t)zerova);	/* flush TLB */
3077
3078	rv = sse2_idlezero_page(zerova);
3079
3080 #if defined(DIAGNOSTIC) || defined(XEN)
3081	pmap_pte_set(zpte, 0);			/* zap !
*/ 3082 pmap_pte_flush(); 3083 #endif 3084 3085 return rv; 3086 #endif 3087 } 3088 3089 /* 3090 * pmap_copy_page: copy a page 3091 */ 3092 3093 void 3094 pmap_copy_page(paddr_t srcpa, paddr_t dstpa) 3095 { 3096 #if defined(__HAVE_DIRECT_MAP) 3097 vaddr_t srcva = PMAP_DIRECT_MAP(srcpa); 3098 vaddr_t dstva = PMAP_DIRECT_MAP(dstpa); 3099 3100 memcpy((void *)dstva, (void *)srcva, PAGE_SIZE); 3101 #else 3102 #if defined(XEN) 3103 if (XEN_VERSION_SUPPORTED(3, 4)) { 3104 xen_copy_page(srcpa, dstpa); 3105 return; 3106 } 3107 #endif 3108 pt_entry_t *spte; 3109 pt_entry_t *dpte; 3110 void *csrcva; 3111 void *cdstva; 3112 int id; 3113 3114 kpreempt_disable(); 3115 id = cpu_number(); 3116 spte = PTESLEW(csrc_pte,id); 3117 dpte = PTESLEW(cdst_pte,id); 3118 csrcva = VASLEW(csrcp, id); 3119 cdstva = VASLEW(cdstp, id); 3120 3121 KASSERT(*spte == 0 && *dpte == 0); 3122 3123 pmap_pte_set(spte, pmap_pa2pte(srcpa) | PG_V | PG_RW | PG_U | PG_k); 3124 pmap_pte_set(dpte, 3125 pmap_pa2pte(dstpa) | PG_V | PG_RW | PG_M | PG_U | PG_k); 3126 pmap_pte_flush(); 3127 pmap_update_2pg((vaddr_t)csrcva, (vaddr_t)cdstva); 3128 3129 memcpy(cdstva, csrcva, PAGE_SIZE); 3130 3131 #if defined(DIAGNOSTIC) || defined(XEN) 3132 pmap_pte_set(spte, 0); 3133 pmap_pte_set(dpte, 0); 3134 pmap_pte_flush(); 3135 #endif 3136 kpreempt_enable(); 3137 #endif /* defined(__HAVE_DIRECT_MAP) */ 3138 } 3139 3140 static pt_entry_t * 3141 pmap_map_ptp(struct vm_page *ptp) 3142 { 3143 #ifdef __HAVE_DIRECT_MAP 3144 return (void *)PMAP_DIRECT_MAP(VM_PAGE_TO_PHYS(ptp)); 3145 #else 3146 pt_entry_t *ptppte; 3147 void *ptpva; 3148 int id; 3149 3150 KASSERT(kpreempt_disabled()); 3151 3152 id = cpu_number(); 3153 ptppte = PTESLEW(ptp_pte, id); 3154 ptpva = VASLEW(ptpp, id); 3155 #if !defined(XEN) 3156 pmap_pte_set(ptppte, pmap_pa2pte(VM_PAGE_TO_PHYS(ptp)) | PG_V | PG_M | 3157 PG_RW | PG_U | PG_k); 3158 #else 3159 pmap_pte_set(ptppte, pmap_pa2pte(VM_PAGE_TO_PHYS(ptp)) | PG_V | PG_M | 3160 PG_U | PG_k); 3161 #endif 3162 pmap_pte_flush(); 3163 pmap_update_pg((vaddr_t)ptpva); 3164 3165 return (pt_entry_t *)ptpva; 3166 #endif 3167 } 3168 3169 static void 3170 pmap_unmap_ptp(void) 3171 { 3172 #ifndef __HAVE_DIRECT_MAP 3173 #if defined(DIAGNOSTIC) || defined(XEN) 3174 pt_entry_t *pte; 3175 3176 KASSERT(kpreempt_disabled()); 3177 3178 pte = PTESLEW(ptp_pte, cpu_number()); 3179 if (*pte != 0) { 3180 pmap_pte_set(pte, 0); 3181 pmap_pte_flush(); 3182 } 3183 #endif 3184 #endif 3185 } 3186 3187 static pt_entry_t * 3188 pmap_map_pte(struct pmap *pmap, struct vm_page *ptp, vaddr_t va) 3189 { 3190 3191 KASSERT(kpreempt_disabled()); 3192 if (pmap_is_curpmap(pmap)) { 3193 return &PTE_BASE[pl1_i(va)]; /* (k)vtopte */ 3194 } 3195 KASSERT(ptp != NULL); 3196 return pmap_map_ptp(ptp) + pl1_pi(va); 3197 } 3198 3199 static void 3200 pmap_unmap_pte(void) 3201 { 3202 3203 KASSERT(kpreempt_disabled()); 3204 3205 pmap_unmap_ptp(); 3206 } 3207 3208 /* 3209 * p m a p r e m o v e f u n c t i o n s 3210 * 3211 * functions that remove mappings 3212 */ 3213 3214 /* 3215 * pmap_remove_ptes: remove PTEs from a PTP 3216 * 3217 * => caller must hold pmap's lock 3218 * => PTP must be mapped into KVA 3219 * => PTP should be null if pmap == pmap_kernel() 3220 * => must be called with kernel preemption disabled 3221 * => returns composite pte if at least one page should be shot down 3222 */ 3223 3224 static void 3225 pmap_remove_ptes(struct pmap *pmap, struct vm_page *ptp, vaddr_t ptpva, 3226 vaddr_t startva, vaddr_t endva, struct pv_entry **pv_tofree) 3227 { 3228 pt_entry_t *pte = (pt_entry_t 
*)ptpva; 3229 3230 KASSERT(pmap == pmap_kernel() || mutex_owned(pmap->pm_lock)); 3231 KASSERT(kpreempt_disabled()); 3232 3233 /* 3234 * note that ptpva points to the PTE that maps startva. this may 3235 * or may not be the first PTE in the PTP. 3236 * 3237 * we loop through the PTP while there are still PTEs to look at 3238 * and the wire_count is greater than 1 (because we use the wire_count 3239 * to keep track of the number of real PTEs in the PTP). 3240 */ 3241 while (startva < endva && (ptp == NULL || ptp->wire_count > 1)) { 3242 (void)pmap_remove_pte(pmap, ptp, pte, startva, pv_tofree); 3243 startva += PAGE_SIZE; 3244 pte++; 3245 } 3246 } 3247 3248 3249 /* 3250 * pmap_remove_pte: remove a single PTE from a PTP. 3251 * 3252 * => caller must hold pmap's lock 3253 * => PTP must be mapped into KVA 3254 * => PTP should be null if pmap == pmap_kernel() 3255 * => returns true if we removed a mapping 3256 * => must be called with kernel preemption disabled 3257 */ 3258 static bool 3259 pmap_remove_pte(struct pmap *pmap, struct vm_page *ptp, pt_entry_t *pte, 3260 vaddr_t va, struct pv_entry **pv_tofree) 3261 { 3262 struct pv_entry *pve; 3263 struct vm_page *pg; 3264 struct pmap_page *pp; 3265 pt_entry_t opte; 3266 3267 KASSERT(pmap == pmap_kernel() || mutex_owned(pmap->pm_lock)); 3268 KASSERT(kpreempt_disabled()); 3269 3270 if (!pmap_valid_entry(*pte)) { 3271 /* VA not mapped. */ 3272 return false; 3273 } 3274 3275 /* Atomically save the old PTE and zap it. */ 3276 opte = pmap_pte_testset(pte, 0); 3277 if (!pmap_valid_entry(opte)) { 3278 return false; 3279 } 3280 3281 pmap_exec_account(pmap, va, opte, 0); 3282 pmap_stats_update_bypte(pmap, 0, opte); 3283 3284 if (ptp) { 3285 /* 3286 * Dropping a PTE. Make sure that the PDE is flushed. 3287 */ 3288 ptp->wire_count--; 3289 if (ptp->wire_count <= 1) { 3290 opte |= PG_U; 3291 } 3292 } 3293 3294 if ((opte & PG_U) != 0) { 3295 pmap_tlb_shootdown(pmap, va, opte, TLBSHOOT_REMOVE_PTE); 3296 } 3297 3298 /* 3299 * If we are not on a pv_head list - we are done. 3300 */ 3301 if ((opte & PG_PVLIST) == 0) { 3302 #if defined(DIAGNOSTIC) && !defined(DOM0OPS) 3303 if (PHYS_TO_VM_PAGE(pmap_pte2pa(opte)) != NULL) 3304 panic("pmap_remove_pte: managed page without " 3305 "PG_PVLIST for %#" PRIxVADDR, va); 3306 #endif 3307 return true; 3308 } 3309 3310 pg = PHYS_TO_VM_PAGE(pmap_pte2pa(opte)); 3311 3312 KASSERTMSG(pg != NULL, "pmap_remove_pte: unmanaged page marked " 3313 "PG_PVLIST, va = %#" PRIxVADDR ", pa = %#" PRIxPADDR, 3314 va, (paddr_t)pmap_pte2pa(opte)); 3315 3316 KASSERT(uvm_page_locked_p(pg)); 3317 3318 /* Sync R/M bits. */ 3319 pp = VM_PAGE_TO_PP(pg); 3320 pp->pp_attrs |= opte; 3321 pve = pmap_remove_pv(pp, ptp, va); 3322 3323 if (pve) { 3324 pve->pve_next = *pv_tofree; 3325 *pv_tofree = pve; 3326 } 3327 return true; 3328 } 3329 3330 /* 3331 * pmap_remove: mapping removal function. 3332 * 3333 * => caller should not be holding any pmap locks 3334 */ 3335 3336 void 3337 pmap_remove(struct pmap *pmap, vaddr_t sva, vaddr_t eva) 3338 { 3339 pt_entry_t *ptes; 3340 pd_entry_t pde; 3341 pd_entry_t * const *pdes; 3342 struct pv_entry *pv_tofree = NULL; 3343 bool result; 3344 int i; 3345 paddr_t ptppa; 3346 vaddr_t blkendva, va = sva; 3347 struct vm_page *ptp; 3348 struct pmap *pmap2; 3349 3350 kpreempt_disable(); 3351 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); /* locks pmap */ 3352 3353 /* 3354 * removing one page? take shortcut function. 
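	 * (i.e. eva == sva + PAGE_SIZE: look the PTP up once and call
	 * pmap_remove_pte() directly, skipping the per-block loop below)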
3355	 */
3356
3357	if (va + PAGE_SIZE == eva) {
3358		if (pmap_pdes_valid(va, pdes, &pde)) {
3359
3360			/* PA of the PTP */
3361			ptppa = pmap_pte2pa(pde);
3362
3363			/* Get PTP if non-kernel mapping. */
3364			if (pmap != pmap_kernel()) {
3365				ptp = pmap_find_ptp(pmap, va, ptppa, 1);
3366				KASSERTMSG(ptp != NULL,
3367				    "pmap_remove: unmanaged PTP detected");
3368			} else {
3369				/* Never free kernel PTPs. */
3370				ptp = NULL;
3371			}
3372
3373			result = pmap_remove_pte(pmap, ptp,
3374			    &ptes[pl1_i(va)], va, &pv_tofree);
3375
3376			/*
3377			 * if mapping removed and the PTP is no longer
3378			 * being used, free it!
3379			 */
3380
3381			if (result && ptp && ptp->wire_count <= 1)
3382				pmap_free_ptp(pmap, ptp, va, ptes, pdes);
3383		}
3384	} else for (/* null */ ; va < eva ; va = blkendva) {
3385		int lvl;
3386
3387		/* determine range of block */
3388		blkendva = x86_round_pdr(va+1);
3389		if (blkendva > eva)
3390			blkendva = eva;
3391
3392		/*
3393		 * XXXCDC: our PTE mappings should never be removed
3394		 * with pmap_remove! if we allow this (and why would
3395		 * we?) then we end up freeing the pmap's page
3396		 * directory page (PDP) before we are finished using
3397		 * it when we hit it in the recursive mapping. this
3398		 * is BAD.
3399		 *
3400		 * long term solution is to move the PTEs out of user
3401		 * address space and into kernel address space (up
3402		 * with APTE). then we can set VM_MAXUSER_ADDRESS to
3403		 * be VM_MAX_ADDRESS.
3404		 */
3405
3406		/* XXXCDC: ugly hack to avoid freeing PDP here */
3407		for (i = 0; i < PDP_SIZE; i++) {
3408			if (pl_i(va, PTP_LEVELS) == PDIR_SLOT_PTE+i)
3409				continue;
3410		}
3411
3412		lvl = pmap_pdes_invalid(va, pdes, &pde);
3413		if (lvl != 0) {
3414			/*
3415			 * skip a range corresponding to an invalid pde.
3416			 */
3417			blkendva = (va & ptp_masks[lvl - 1]) + nbpd[lvl - 1];
3418			continue;
3419		}
3420
3421		/* PA of the PTP */
3422		ptppa = pmap_pte2pa(pde);
3423
3424		/* Get PTP if non-kernel mapping. */
3425		if (pmap != pmap_kernel()) {
3426			ptp = pmap_find_ptp(pmap, va, ptppa, 1);
3427			KASSERTMSG(ptp != NULL,
3428			    "pmap_remove: unmanaged PTP detected");
3429		} else {
3430			/* Never free kernel PTPs. */
3431			ptp = NULL;
3432		}
3433
3434		pmap_remove_ptes(pmap, ptp, (vaddr_t)&ptes[pl1_i(va)], va,
3435		    blkendva, &pv_tofree);
3436
3437		/* if PTP is no longer being used, free it! */
3438		if (ptp && ptp->wire_count <= 1) {
3439			pmap_free_ptp(pmap, ptp, va, ptes, pdes);
3440		}
3441	}
3442	pmap_unmap_ptes(pmap, pmap2);	/* unlock pmap */
3443	kpreempt_enable();
3444
3445	/* Now we free unused PVs */
3446	if (pv_tofree)
3447		pmap_free_pvs(pv_tofree);
3448 }
3449
3450 /*
3451  * pmap_sync_pv: clear pte bits and return the old value of the pte.
3452  *
3453  * => Caller should disable kernel preemption.
3454  * => issues tlb shootdowns if necessary.
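 * => Returns 0 on success, or EAGAIN if we lost a race with a V->P
 *    operation such as pmap_remove(); callers back off and retry.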
3455 */ 3456 3457 static int 3458 pmap_sync_pv(struct pv_pte *pvpte, pt_entry_t expect, int clearbits, 3459 pt_entry_t *optep) 3460 { 3461 struct pmap *pmap; 3462 struct vm_page *ptp; 3463 vaddr_t va; 3464 pt_entry_t *ptep; 3465 pt_entry_t opte; 3466 pt_entry_t npte; 3467 bool need_shootdown; 3468 3469 ptp = pvpte->pte_ptp; 3470 va = pvpte->pte_va; 3471 KASSERT(ptp == NULL || ptp->uobject != NULL); 3472 KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset); 3473 pmap = ptp_to_pmap(ptp); 3474 3475 KASSERT((expect & ~(PG_FRAME | PG_V)) == 0); 3476 KASSERT((expect & PG_V) != 0); 3477 KASSERT(clearbits == ~0 || (clearbits & ~(PG_M | PG_U | PG_RW)) == 0); 3478 KASSERT(kpreempt_disabled()); 3479 3480 ptep = pmap_map_pte(pmap, ptp, va); 3481 do { 3482 opte = *ptep; 3483 KASSERT((opte & (PG_M | PG_U)) != PG_M); 3484 KASSERT((opte & (PG_U | PG_V)) != PG_U); 3485 KASSERT(opte == 0 || (opte & PG_V) != 0); 3486 if ((opte & (PG_FRAME | PG_V)) != expect) { 3487 3488 /* 3489 * we lost a race with a V->P operation like 3490 * pmap_remove(). wait for the competitor 3491 * reflecting pte bits into mp_attrs. 3492 * 3493 * issue a redundant TLB shootdown so that 3494 * we can wait for its completion. 3495 */ 3496 3497 pmap_unmap_pte(); 3498 if (clearbits != 0) { 3499 pmap_tlb_shootdown(pmap, va, 3500 (pmap == pmap_kernel() ? PG_G : 0), 3501 TLBSHOOT_SYNC_PV1); 3502 } 3503 return EAGAIN; 3504 } 3505 3506 /* 3507 * check if there's anything to do on this pte. 3508 */ 3509 3510 if ((opte & clearbits) == 0) { 3511 need_shootdown = false; 3512 break; 3513 } 3514 3515 /* 3516 * we need a shootdown if the pte is cached. (PG_U) 3517 * 3518 * ...unless we are clearing only the PG_RW bit and 3519 * it isn't cached as RW. (PG_M) 3520 */ 3521 3522 need_shootdown = (opte & PG_U) != 0 && 3523 !(clearbits == PG_RW && (opte & PG_M) == 0); 3524 3525 npte = opte & ~clearbits; 3526 3527 /* 3528 * if we need a shootdown anyway, clear PG_U and PG_M. 3529 */ 3530 3531 if (need_shootdown) { 3532 npte &= ~(PG_U | PG_M); 3533 } 3534 KASSERT((npte & (PG_M | PG_U)) != PG_M); 3535 KASSERT((npte & (PG_U | PG_V)) != PG_U); 3536 KASSERT(npte == 0 || (opte & PG_V) != 0); 3537 } while (pmap_pte_cas(ptep, opte, npte) != opte); 3538 3539 if (need_shootdown) { 3540 pmap_tlb_shootdown(pmap, va, opte, TLBSHOOT_SYNC_PV2); 3541 } 3542 pmap_unmap_pte(); 3543 3544 *optep = opte; 3545 return 0; 3546 } 3547 3548 /* 3549 * pmap_page_remove: remove a managed vm_page from all pmaps that map it 3550 * 3551 * => R/M bits are sync'd back to attrs 3552 */ 3553 3554 void 3555 pmap_page_remove(struct vm_page *pg) 3556 { 3557 struct pmap_page *pp; 3558 struct pv_pte *pvpte; 3559 struct pv_entry *killlist = NULL; 3560 struct vm_page *ptp; 3561 pt_entry_t expect; 3562 int count; 3563 3564 KASSERT(uvm_page_locked_p(pg)); 3565 3566 pp = VM_PAGE_TO_PP(pg); 3567 expect = pmap_pa2pte(VM_PAGE_TO_PHYS(pg)) | PG_V; 3568 count = SPINLOCK_BACKOFF_MIN; 3569 kpreempt_disable(); 3570 startover: 3571 while ((pvpte = pv_pte_first(pp)) != NULL) { 3572 struct pmap *pmap; 3573 struct pv_entry *pve; 3574 pt_entry_t opte; 3575 vaddr_t va; 3576 int error; 3577 3578 /* 3579 * add a reference to the pmap before clearing the pte. 3580 * otherwise the pmap can disappear behind us. 
3581 */ 3582 3583 ptp = pvpte->pte_ptp; 3584 pmap = ptp_to_pmap(ptp); 3585 if (ptp != NULL) { 3586 pmap_reference(pmap); 3587 } 3588 3589 error = pmap_sync_pv(pvpte, expect, ~0, &opte); 3590 if (error == EAGAIN) { 3591 int hold_count; 3592 KERNEL_UNLOCK_ALL(curlwp, &hold_count); 3593 if (ptp != NULL) { 3594 pmap_destroy(pmap); 3595 } 3596 SPINLOCK_BACKOFF(count); 3597 KERNEL_LOCK(hold_count, curlwp); 3598 goto startover; 3599 } 3600 3601 pp->pp_attrs |= opte; 3602 va = pvpte->pte_va; 3603 pve = pmap_remove_pv(pp, ptp, va); 3604 3605 /* update the PTP reference count. free if last reference. */ 3606 if (ptp != NULL) { 3607 struct pmap *pmap2; 3608 pt_entry_t *ptes; 3609 pd_entry_t * const *pdes; 3610 3611 KASSERT(pmap != pmap_kernel()); 3612 3613 pmap_tlb_shootnow(); 3614 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); 3615 pmap_stats_update_bypte(pmap, 0, opte); 3616 ptp->wire_count--; 3617 if (ptp->wire_count <= 1) { 3618 pmap_free_ptp(pmap, ptp, va, ptes, pdes); 3619 } 3620 pmap_unmap_ptes(pmap, pmap2); 3621 pmap_destroy(pmap); 3622 } else { 3623 KASSERT(pmap == pmap_kernel()); 3624 pmap_stats_update_bypte(pmap, 0, opte); 3625 } 3626 3627 if (pve != NULL) { 3628 pve->pve_next = killlist; /* mark it for death */ 3629 killlist = pve; 3630 } 3631 } 3632 pmap_tlb_shootnow(); 3633 kpreempt_enable(); 3634 3635 /* Now free unused pvs. */ 3636 pmap_free_pvs(killlist); 3637 } 3638 3639 /* 3640 * p m a p a t t r i b u t e f u n c t i o n s 3641 * functions that test/change managed page's attributes 3642 * since a page can be mapped multiple times we must check each PTE that 3643 * maps it by going down the pv lists. 3644 */ 3645 3646 /* 3647 * pmap_test_attrs: test a page's attributes 3648 */ 3649 3650 bool 3651 pmap_test_attrs(struct vm_page *pg, unsigned testbits) 3652 { 3653 struct pmap_page *pp; 3654 struct pv_pte *pvpte; 3655 pt_entry_t expect; 3656 u_int result; 3657 3658 KASSERT(uvm_page_locked_p(pg)); 3659 3660 pp = VM_PAGE_TO_PP(pg); 3661 if ((pp->pp_attrs & testbits) != 0) { 3662 return true; 3663 } 3664 expect = pmap_pa2pte(VM_PAGE_TO_PHYS(pg)) | PG_V; 3665 kpreempt_disable(); 3666 for (pvpte = pv_pte_first(pp); pvpte; pvpte = pv_pte_next(pp, pvpte)) { 3667 pt_entry_t opte; 3668 int error; 3669 3670 if ((pp->pp_attrs & testbits) != 0) { 3671 break; 3672 } 3673 error = pmap_sync_pv(pvpte, expect, 0, &opte); 3674 if (error == 0) { 3675 pp->pp_attrs |= opte; 3676 } 3677 } 3678 result = pp->pp_attrs & testbits; 3679 kpreempt_enable(); 3680 3681 /* 3682 * note that we will exit the for loop with a non-null pve if 3683 * we have found the bits we are testing for. 3684 */ 3685 3686 return result != 0; 3687 } 3688 3689 /* 3690 * pmap_clear_attrs: clear the specified attribute for a page. 
3691 * 3692 * => we return true if we cleared one of the bits we were asked to 3693 */ 3694 3695 bool 3696 pmap_clear_attrs(struct vm_page *pg, unsigned clearbits) 3697 { 3698 struct pmap_page *pp; 3699 struct pv_pte *pvpte; 3700 u_int result; 3701 pt_entry_t expect; 3702 int count; 3703 3704 KASSERT(uvm_page_locked_p(pg)); 3705 3706 pp = VM_PAGE_TO_PP(pg); 3707 expect = pmap_pa2pte(VM_PAGE_TO_PHYS(pg)) | PG_V; 3708 count = SPINLOCK_BACKOFF_MIN; 3709 kpreempt_disable(); 3710 startover: 3711 for (pvpte = pv_pte_first(pp); pvpte; pvpte = pv_pte_next(pp, pvpte)) { 3712 pt_entry_t opte; 3713 int error; 3714 3715 error = pmap_sync_pv(pvpte, expect, clearbits, &opte); 3716 if (error == EAGAIN) { 3717 int hold_count; 3718 KERNEL_UNLOCK_ALL(curlwp, &hold_count); 3719 SPINLOCK_BACKOFF(count); 3720 KERNEL_LOCK(hold_count, curlwp); 3721 goto startover; 3722 } 3723 pp->pp_attrs |= opte; 3724 } 3725 result = pp->pp_attrs & clearbits; 3726 pp->pp_attrs &= ~clearbits; 3727 kpreempt_enable(); 3728 3729 return result != 0; 3730 } 3731 3732 3733 /* 3734 * p m a p p r o t e c t i o n f u n c t i o n s 3735 */ 3736 3737 /* 3738 * pmap_page_protect: change the protection of all recorded mappings 3739 * of a managed page 3740 * 3741 * => NOTE: this is an inline function in pmap.h 3742 */ 3743 3744 /* see pmap.h */ 3745 3746 /* 3747 * pmap_protect: set the protection in of the pages in a pmap 3748 * 3749 * => NOTE: this is an inline function in pmap.h 3750 */ 3751 3752 /* see pmap.h */ 3753 3754 /* 3755 * pmap_write_protect: write-protect pages in a pmap. 3756 */ 3757 void 3758 pmap_write_protect(struct pmap *pmap, vaddr_t sva, vaddr_t eva, vm_prot_t prot) 3759 { 3760 pt_entry_t *ptes; 3761 pt_entry_t * const *pdes; 3762 struct pmap *pmap2; 3763 vaddr_t blockend, va; 3764 3765 KASSERT(curlwp->l_md.md_gc_pmap != pmap); 3766 3767 sva &= PG_FRAME; 3768 eva &= PG_FRAME; 3769 3770 /* Acquire pmap. */ 3771 kpreempt_disable(); 3772 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); 3773 3774 for (va = sva ; va < eva ; va = blockend) { 3775 pt_entry_t *spte, *epte; 3776 int i; 3777 3778 blockend = x86_round_pdr(va + 1); 3779 if (blockend > eva) 3780 blockend = eva; 3781 3782 /* 3783 * XXXCDC: our PTE mappings should never be write-protected! 3784 * 3785 * long term solution is to move the PTEs out of user 3786 * address space. and into kernel address space (up 3787 * with APTE). then we can set VM_MAXUSER_ADDRESS to 3788 * be VM_MAX_ADDRESS. 3789 */ 3790 3791 /* XXXCDC: ugly hack to avoid freeing PDP here */ 3792 for (i = 0; i < PDP_SIZE; i++) { 3793 if (pl_i(va, PTP_LEVELS) == PDIR_SLOT_PTE+i) 3794 continue; 3795 } 3796 3797 /* Is it a valid block? */ 3798 if (!pmap_pdes_valid(va, pdes, NULL)) { 3799 continue; 3800 } 3801 KASSERT(va < VM_MAXUSER_ADDRESS || va >= VM_MAX_ADDRESS); 3802 3803 spte = &ptes[pl1_i(va)]; 3804 epte = &ptes[pl1_i(blockend)]; 3805 3806 for (/*null */; spte < epte ; spte++) { 3807 pt_entry_t opte, npte; 3808 3809 do { 3810 opte = *spte; 3811 if ((~opte & (PG_RW | PG_V)) != 0) { 3812 goto next; 3813 } 3814 npte = opte & ~PG_RW; 3815 } while (pmap_pte_cas(spte, opte, npte) != opte); 3816 3817 if ((opte & PG_M) != 0) { 3818 vaddr_t tva = x86_ptob(spte - ptes); 3819 pmap_tlb_shootdown(pmap, tva, opte, 3820 TLBSHOOT_WRITE_PROTECT); 3821 } 3822 next:; 3823 } 3824 } 3825 3826 /* Release pmap. */ 3827 pmap_unmap_ptes(pmap, pmap2); 3828 kpreempt_enable(); 3829 } 3830 3831 /* 3832 * pmap_unwire: clear the wired bit in the PTE. 3833 * 3834 * => Mapping should already be present. 
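 * => The PG_W bit is cleared and the pmap's wired count is adjusted via
 *    pmap_stats_update_bypte(); an already-unwired mapping only triggers
 *    a diagnostic printf.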
3835 */ 3836 void 3837 pmap_unwire(struct pmap *pmap, vaddr_t va) 3838 { 3839 pt_entry_t *ptes, *ptep, opte; 3840 pd_entry_t * const *pdes; 3841 struct pmap *pmap2; 3842 3843 /* Acquire pmap. */ 3844 kpreempt_disable(); 3845 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); 3846 3847 if (!pmap_pdes_valid(va, pdes, NULL)) { 3848 panic("pmap_unwire: invalid PDE"); 3849 } 3850 3851 ptep = &ptes[pl1_i(va)]; 3852 opte = *ptep; 3853 KASSERT(pmap_valid_entry(opte)); 3854 3855 if (opte & PG_W) { 3856 pt_entry_t npte = opte & ~PG_W; 3857 3858 opte = pmap_pte_testset(ptep, npte); 3859 pmap_stats_update_bypte(pmap, npte, opte); 3860 } else { 3861 printf("pmap_unwire: wiring for pmap %p va 0x%lx " 3862 "did not change!\n", pmap, va); 3863 } 3864 3865 /* Release pmap. */ 3866 pmap_unmap_ptes(pmap, pmap2); 3867 kpreempt_enable(); 3868 } 3869 3870 /* 3871 * pmap_copy: copy mappings from one pmap to another 3872 * 3873 * => optional function 3874 * void pmap_copy(dst_pmap, src_pmap, dst_addr, len, src_addr) 3875 */ 3876 3877 /* 3878 * defined as macro in pmap.h 3879 */ 3880 3881 __strict_weak_alias(pmap_enter, pmap_enter_default); 3882 3883 int 3884 pmap_enter_default(pmap_t pmap, vaddr_t va, paddr_t pa, vm_prot_t prot, 3885 u_int flags) 3886 { 3887 return pmap_enter_ma(pmap, va, pa, pa, prot, flags, 0); 3888 } 3889 3890 /* 3891 * pmap_enter: enter a mapping into a pmap 3892 * 3893 * => must be done "now" ... no lazy-evaluation 3894 * => we set pmap => pv_head locking 3895 */ 3896 int 3897 pmap_enter_ma(struct pmap *pmap, vaddr_t va, paddr_t ma, paddr_t pa, 3898 vm_prot_t prot, u_int flags, int domid) 3899 { 3900 pt_entry_t *ptes, opte, npte; 3901 pt_entry_t *ptep; 3902 pd_entry_t * const *pdes; 3903 struct vm_page *ptp, *pg; 3904 struct pmap_page *new_pp; 3905 struct pmap_page *old_pp; 3906 struct pv_entry *old_pve = NULL; 3907 struct pv_entry *new_pve; 3908 struct pv_entry *new_pve2; 3909 int error; 3910 bool wired = (flags & PMAP_WIRED) != 0; 3911 struct pmap *pmap2; 3912 3913 KASSERT(pmap_initialized); 3914 KASSERT(curlwp->l_md.md_gc_pmap != pmap); 3915 KASSERT(va < VM_MAX_KERNEL_ADDRESS); 3916 KASSERTMSG(va != (vaddr_t)PDP_BASE, 3917 "pmap_enter: trying to map over PDP!"); 3918 KASSERTMSG(va < VM_MIN_KERNEL_ADDRESS || 3919 pmap_valid_entry(pmap->pm_pdir[pl_i(va, PTP_LEVELS)]), 3920 "pmap_enter: missing kernel PTP for VA %lx!", va); 3921 3922 #ifdef XEN 3923 KASSERT(domid == DOMID_SELF || pa == 0); 3924 #endif /* XEN */ 3925 3926 npte = ma | protection_codes[prot] | PG_V; 3927 npte |= pmap_pat_flags(flags); 3928 if (wired) 3929 npte |= PG_W; 3930 if (va < VM_MAXUSER_ADDRESS) 3931 npte |= PG_u; 3932 else if (va < VM_MAX_ADDRESS) 3933 npte |= (PG_u | PG_RW); /* XXXCDC: no longer needed? */ 3934 else 3935 npte |= PG_k; 3936 if (pmap == pmap_kernel()) 3937 npte |= pmap_pg_g; 3938 if (flags & VM_PROT_ALL) { 3939 npte |= PG_U; 3940 if (flags & VM_PROT_WRITE) { 3941 KASSERT((npte & PG_RW) != 0); 3942 npte |= PG_M; 3943 } 3944 } 3945 3946 #ifdef XEN 3947 if (domid != DOMID_SELF) 3948 pg = NULL; 3949 else 3950 #endif 3951 pg = PHYS_TO_VM_PAGE(pa); 3952 if (pg != NULL) { 3953 /* This is a managed page */ 3954 npte |= PG_PVLIST; 3955 new_pp = VM_PAGE_TO_PP(pg); 3956 } else { 3957 new_pp = NULL; 3958 } 3959 3960 /* get pves. 
*/ 3961 new_pve = pool_cache_get(&pmap_pv_cache, PR_NOWAIT); 3962 new_pve2 = pool_cache_get(&pmap_pv_cache, PR_NOWAIT); 3963 if (new_pve == NULL || new_pve2 == NULL) { 3964 if (flags & PMAP_CANFAIL) { 3965 error = ENOMEM; 3966 goto out2; 3967 } 3968 panic("pmap_enter: pve allocation failed"); 3969 } 3970 3971 kpreempt_disable(); 3972 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); /* locks pmap */ 3973 if (pmap == pmap_kernel()) { 3974 ptp = NULL; 3975 } else { 3976 ptp = pmap_get_ptp(pmap, va, pdes); 3977 if (ptp == NULL) { 3978 pmap_unmap_ptes(pmap, pmap2); 3979 if (flags & PMAP_CANFAIL) { 3980 error = ENOMEM; 3981 goto out; 3982 } 3983 panic("pmap_enter: get ptp failed"); 3984 } 3985 } 3986 3987 /* 3988 * update the pte. 3989 */ 3990 3991 ptep = &ptes[pl1_i(va)]; 3992 do { 3993 opte = *ptep; 3994 3995 /* 3996 * if the same page, inherit PG_U and PG_M. 3997 */ 3998 if (((opte ^ npte) & (PG_FRAME | PG_V)) == 0) { 3999 npte |= opte & (PG_U | PG_M); 4000 } 4001 #if defined(XEN) 4002 if (domid != DOMID_SELF) { 4003 /* pmap_pte_cas with error handling */ 4004 int s = splvm(); 4005 if (opte != *ptep) { 4006 splx(s); 4007 continue; 4008 } 4009 error = xpq_update_foreign( 4010 vtomach((vaddr_t)ptep), npte, domid); 4011 splx(s); 4012 if (error) { 4013 if (ptp != NULL && ptp->wire_count <= 1) { 4014 pmap_free_ptp(pmap, ptp, va, ptes, pdes); 4015 } 4016 pmap_unmap_ptes(pmap, pmap2); 4017 goto out; 4018 } 4019 break; 4020 } 4021 #endif /* defined(XEN) */ 4022 } while (pmap_pte_cas(ptep, opte, npte) != opte); 4023 4024 /* 4025 * update statistics and PTP's reference count. 4026 */ 4027 4028 pmap_stats_update_bypte(pmap, npte, opte); 4029 if (ptp != NULL && !pmap_valid_entry(opte)) { 4030 ptp->wire_count++; 4031 } 4032 KASSERT(ptp == NULL || ptp->wire_count > 1); 4033 4034 /* 4035 * if the same page, we can skip pv_entry handling. 4036 */ 4037 4038 if (((opte ^ npte) & (PG_FRAME | PG_V)) == 0) { 4039 KASSERT(((opte ^ npte) & PG_PVLIST) == 0); 4040 goto same_pa; 4041 } 4042 4043 /* 4044 * if old page is managed, remove pv_entry from its list. 4045 */ 4046 4047 if ((~opte & (PG_V | PG_PVLIST)) == 0) { 4048 pg = PHYS_TO_VM_PAGE(pmap_pte2pa(opte)); 4049 4050 KASSERTMSG(pg != NULL, "pmap_enter: PG_PVLIST mapping with " 4051 "unmanaged page pa = 0x%" PRIx64 " (0x%" PRIx64 ")", 4052 (int64_t)pa, (int64_t)atop(pa)); 4053 4054 KASSERT(uvm_page_locked_p(pg)); 4055 4056 old_pp = VM_PAGE_TO_PP(pg); 4057 old_pve = pmap_remove_pv(old_pp, ptp, va); 4058 old_pp->pp_attrs |= opte; 4059 } 4060 4061 /* 4062 * if new page is managed, insert pv_entry into its list. 4063 */ 4064 4065 if (new_pp) { 4066 new_pve = pmap_enter_pv(new_pp, new_pve, &new_pve2, ptp, va); 4067 } 4068 4069 same_pa: 4070 pmap_unmap_ptes(pmap, pmap2); 4071 4072 /* 4073 * shootdown tlb if necessary. 
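	 * (only needed when the old mapping was valid and cached, i.e.
	 * PG_V and PG_U both set, and either the frame or the writability
	 * changed)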
4074 */ 4075 4076 if ((~opte & (PG_V | PG_U)) == 0 && 4077 ((opte ^ npte) & (PG_FRAME | PG_RW)) != 0) { 4078 pmap_tlb_shootdown(pmap, va, opte, TLBSHOOT_ENTER); 4079 } 4080 4081 error = 0; 4082 out: 4083 kpreempt_enable(); 4084 out2: 4085 if (old_pve != NULL) { 4086 pool_cache_put(&pmap_pv_cache, old_pve); 4087 } 4088 if (new_pve != NULL) { 4089 pool_cache_put(&pmap_pv_cache, new_pve); 4090 } 4091 if (new_pve2 != NULL) { 4092 pool_cache_put(&pmap_pv_cache, new_pve2); 4093 } 4094 4095 return error; 4096 } 4097 4098 static bool 4099 pmap_get_physpage(vaddr_t va, int level, paddr_t *paddrp) 4100 { 4101 struct vm_page *ptp; 4102 struct pmap *kpm = pmap_kernel(); 4103 4104 if (!uvm.page_init_done) { 4105 4106 /* 4107 * we're growing the kernel pmap early (from 4108 * uvm_pageboot_alloc()). this case must be 4109 * handled a little differently. 4110 */ 4111 4112 if (!uvm_page_physget(paddrp)) 4113 panic("pmap_get_physpage: out of memory"); 4114 #if defined(__HAVE_DIRECT_MAP) 4115 pagezero(PMAP_DIRECT_MAP(*paddrp)); 4116 #else 4117 #if defined(XEN) 4118 if (XEN_VERSION_SUPPORTED(3, 4)) { 4119 xen_pagezero(*paddrp); 4120 return true; 4121 } 4122 #endif 4123 kpreempt_disable(); 4124 pmap_pte_set(early_zero_pte, 4125 pmap_pa2pte(*paddrp) | PG_V | PG_RW | PG_k); 4126 pmap_pte_flush(); 4127 pmap_update_pg((vaddr_t)early_zerop); 4128 memset(early_zerop, 0, PAGE_SIZE); 4129 #if defined(DIAGNOSTIC) 4130 pmap_pte_set(early_zero_pte, 0); 4131 pmap_pte_flush(); 4132 #endif /* defined(DIAGNOSTIC) */ 4133 kpreempt_enable(); 4134 #endif /* defined(__HAVE_DIRECT_MAP) */ 4135 } else { 4136 /* XXX */ 4137 ptp = uvm_pagealloc(NULL, 0, NULL, 4138 UVM_PGA_USERESERVE|UVM_PGA_ZERO); 4139 if (ptp == NULL) 4140 panic("pmap_get_physpage: out of memory"); 4141 ptp->flags &= ~PG_BUSY; 4142 ptp->wire_count = 1; 4143 *paddrp = VM_PAGE_TO_PHYS(ptp); 4144 } 4145 pmap_stats_update(kpm, 1, 0); 4146 return true; 4147 } 4148 4149 /* 4150 * Allocate the amount of specified ptps for a ptp level, and populate 4151 * all levels below accordingly, mapping virtual addresses starting at 4152 * kva. 4153 * 4154 * Used by pmap_growkernel. 
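 *
 * needed_ptps[] gives, for each level, the number of new page table pages
 * required; the loop walks from the top level down, obtaining zeroed pages
 * from pmap_get_physpage() and entering them PG_k | PG_V | PG_RW.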
 */
static void
pmap_alloc_level(pd_entry_t * const *pdes, vaddr_t kva, int lvl,
    long *needed_ptps)
{
        unsigned long i;
        vaddr_t va;
        paddr_t pa;
        unsigned long index, endindex;
        int level;
        pd_entry_t *pdep;
#ifdef XEN
        int s = splvm();        /* protect xpq_* */
#endif

        for (level = lvl; level > 1; level--) {
                if (level == PTP_LEVELS)
                        pdep = pmap_kernel()->pm_pdir;
                else
                        pdep = pdes[level - 2];
                va = kva;
                index = pl_i_roundup(kva, level);
                endindex = index + needed_ptps[level - 1] - 1;

                for (i = index; i <= endindex; i++) {
                        pt_entry_t pte;

                        KASSERT(!pmap_valid_entry(pdep[i]));
                        pmap_get_physpage(va, level - 1, &pa);
                        pte = pmap_pa2pte(pa) | PG_k | PG_V | PG_RW;
#ifdef XEN
                        pmap_pte_set(&pdep[i], pte);
#if defined(PAE) || defined(__x86_64__)
                        if (level == PTP_LEVELS && i >= PDIR_SLOT_KERN) {
                                if (__predict_true(
                                    cpu_info_primary.ci_flags & CPUF_PRESENT)) {
                                        /* update per-cpu PMDs on all cpus */
                                        xen_kpm_sync(pmap_kernel(), i);
                                } else {
                                        /*
                                         * too early; update primary CPU
                                         * PMD only (without locks)
                                         */
#ifdef PAE
                                        pd_entry_t *cpu_pdep =
                                            &cpu_info_primary.ci_kpm_pdir[l2tol2(i)];
#endif
#ifdef __x86_64__
                                        pd_entry_t *cpu_pdep =
                                            &cpu_info_primary.ci_kpm_pdir[i];
#endif
                                        pmap_pte_set(cpu_pdep, pte);
                                }
                        }
#endif /* PAE || __x86_64__ */
#else /* XEN */
                        pdep[i] = pte;
#endif /* XEN */
                        KASSERT(level != PTP_LEVELS || nkptp[level - 1] +
                            pl_i(VM_MIN_KERNEL_ADDRESS, level) == i);
                        nkptp[level - 1]++;
                        va += nbpd[level - 1];
                }
                pmap_pte_flush();
        }
#ifdef XEN
        splx(s);
#endif
}

/*
 * pmap_growkernel: increase usage of KVM space
 *
 * => we allocate new PTPs for the kernel and install them in all
 *    the pmaps on the system.
 */

vaddr_t
pmap_growkernel(vaddr_t maxkvaddr)
{
        struct pmap *kpm = pmap_kernel();
#if !defined(XEN) || !defined(__x86_64__)
        struct pmap *pm;
        long old;
#endif
        int s, i;
        long needed_kptp[PTP_LEVELS], target_nptp;
        bool invalidate = false;

        s = splvm();    /* to be safe */
        mutex_enter(kpm->pm_lock);

        if (maxkvaddr <= pmap_maxkvaddr) {
                mutex_exit(kpm->pm_lock);
                splx(s);
                return pmap_maxkvaddr;
        }

        maxkvaddr = x86_round_pdr(maxkvaddr);
#if !defined(XEN) || !defined(__x86_64__)
        old = nkptp[PTP_LEVELS - 1];
#endif

        /*
         * This loop could be optimized more, but pmap_growkernel()
         * is called infrequently.
         */
        for (i = PTP_LEVELS - 1; i >= 1; i--) {
                target_nptp = pl_i_roundup(maxkvaddr, i + 1) -
                    pl_i_roundup(VM_MIN_KERNEL_ADDRESS, i + 1);
                /*
                 * XXX only need to check toplevel.
                 */
                if (target_nptp > nkptpmax[i])
                        panic("out of KVA space");
                KASSERT(target_nptp >= nkptp[i]);
                needed_kptp[i] = target_nptp - nkptp[i];
        }

        pmap_alloc_level(normal_pdes, pmap_maxkvaddr, PTP_LEVELS, needed_kptp);

        /*
         * If the number of top level entries changed, update all
         * pmaps.
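         *
         * on native x86 this is done by copying the new kernel PDEs
         * into every pmap's page directory; on Xen/amd64 nothing is
         * copied because kernel entries are never entered in user
         * pmaps.  the PDP pool cache is invalidated further below so
         * that PDPs constructed later pick up the new entries.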
         */
        if (needed_kptp[PTP_LEVELS - 1] != 0) {
#ifdef XEN
#ifdef __x86_64__
                /* nothing, kernel entries are never entered in user pmap */
#else /* __x86_64__ */
                mutex_enter(&pmaps_lock);
                LIST_FOREACH(pm, &pmaps, pm_list) {
                        int pdkidx;
                        for (pdkidx = PDIR_SLOT_KERN + old;
                            pdkidx < PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1];
                            pdkidx++) {
                                pmap_pte_set(&pm->pm_pdir[pdkidx],
                                    kpm->pm_pdir[pdkidx]);
                        }
                        pmap_pte_flush();
                }
                mutex_exit(&pmaps_lock);
#endif /* __x86_64__ */
#else /* XEN */
                unsigned newpdes;
                newpdes = nkptp[PTP_LEVELS - 1] - old;
                mutex_enter(&pmaps_lock);
                LIST_FOREACH(pm, &pmaps, pm_list) {
                        memcpy(&pm->pm_pdir[PDIR_SLOT_KERN + old],
                            &kpm->pm_pdir[PDIR_SLOT_KERN + old],
                            newpdes * sizeof (pd_entry_t));
                }
                mutex_exit(&pmaps_lock);
#endif
                invalidate = true;
        }
        pmap_maxkvaddr = maxkvaddr;
        mutex_exit(kpm->pm_lock);
        splx(s);

        if (invalidate && pmap_initialized) {
                /* Invalidate the PDP cache. */
                pool_cache_invalidate(&pmap_pdp_cache);
        }

        return maxkvaddr;
}

#ifdef DEBUG
void pmap_dump(struct pmap *, vaddr_t, vaddr_t);

/*
 * pmap_dump: dump all the mappings from a pmap
 *
 * => caller should not be holding any pmap locks
 */

void
pmap_dump(struct pmap *pmap, vaddr_t sva, vaddr_t eva)
{
        pt_entry_t *ptes, *pte;
        pd_entry_t * const *pdes;
        struct pmap *pmap2;
        vaddr_t blkendva;

        /*
         * if end is out of range truncate.
         * if (end == start) update to max.
         */

        if (eva > VM_MAXUSER_ADDRESS || eva <= sva)
                eva = VM_MAXUSER_ADDRESS;

        /*
         * we lock in the pmap => pv_head direction
         */

        kpreempt_disable();
        pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);      /* locks pmap */

        /*
         * dumping a range of pages: we dump in PTP sized blocks (4MB)
         */

        for (/* null */ ; sva < eva ; sva = blkendva) {

                /* determine range of block */
                blkendva = x86_round_pdr(sva+1);
                if (blkendva > eva)
                        blkendva = eva;

                /* valid block? */
                if (!pmap_pdes_valid(sva, pdes, NULL))
                        continue;

                pte = &ptes[pl1_i(sva)];
                for (/* null */; sva < blkendva ; sva += PAGE_SIZE, pte++) {
                        if (!pmap_valid_entry(*pte))
                                continue;
                        printf("va %#" PRIxVADDR " -> pa %#" PRIxPADDR
                            " (pte=%#" PRIxPADDR ")\n",
                            sva, (paddr_t)pmap_pte2pa(*pte), (paddr_t)*pte);
                }
        }
        pmap_unmap_ptes(pmap, pmap2);
        kpreempt_enable();
}
#endif

/*
 * pmap_update: process deferred invalidations and frees.
 */

void
pmap_update(struct pmap *pmap)
{
        struct vm_page *empty_ptps;
        lwp_t *l = curlwp;

        /*
         * If we have torn down this pmap, invalidate non-global TLB
         * entries on any processors using it.
         */
        KPREEMPT_DISABLE(l);
        if (__predict_false(l->l_md.md_gc_pmap == pmap)) {
                l->l_md.md_gc_pmap = NULL;
                pmap_tlb_shootdown(pmap, (vaddr_t)-1LL, 0, TLBSHOOT_UPDATE);
        }
        /*
         * Initiate any pending TLB shootdowns.  Wait for them to
         * complete before returning control to the caller.
         */
        pmap_tlb_shootnow();
        KPREEMPT_ENABLE(l);

        /*
         * Now that shootdowns are complete, process deferred frees,
         * but not from interrupt context.
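         *
         * the PTPs queued on l_md.md_gc_ptp were unlinked from their
         * pmap earlier; they may only be freed once the shootdown
         * above guarantees that no CPU can still reach them through
         * a stale TLB entry.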
         */
        if (l->l_md.md_gc_ptp != NULL) {
                KASSERT((l->l_pflag & LP_INTR) == 0);
                if (cpu_intr_p()) {
                        return;
                }
                empty_ptps = l->l_md.md_gc_ptp;
                l->l_md.md_gc_ptp = NULL;
                pmap_free_ptps(empty_ptps);
        }
}

#if PTP_LEVELS > 4
#error "Unsupported number of page table mappings"
#endif

paddr_t
pmap_init_tmp_pgtbl(paddr_t pg)
{
        static bool maps_loaded;
        static const paddr_t x86_tmp_pml_paddr[] = {
                4 * PAGE_SIZE,
                5 * PAGE_SIZE,
                6 * PAGE_SIZE,
                7 * PAGE_SIZE
        };
        static vaddr_t x86_tmp_pml_vaddr[] = { 0, 0, 0, 0 };

        pd_entry_t *tmp_pml, *kernel_pml;

        int level;

        if (!maps_loaded) {
                for (level = 0; level < PTP_LEVELS; ++level) {
                        x86_tmp_pml_vaddr[level] =
                            uvm_km_alloc(kernel_map, PAGE_SIZE, 0,
                            UVM_KMF_VAONLY);

                        if (x86_tmp_pml_vaddr[level] == 0)
                                panic("mapping of real mode PML failed\n");
                        pmap_kenter_pa(x86_tmp_pml_vaddr[level],
                            x86_tmp_pml_paddr[level],
                            VM_PROT_READ | VM_PROT_WRITE, 0);
                        pmap_update(pmap_kernel());
                }
                maps_loaded = true;
        }

        /* Zero levels 1-3 */
        for (level = 0; level < PTP_LEVELS - 1; ++level) {
                tmp_pml = (void *)x86_tmp_pml_vaddr[level];
                memset(tmp_pml, 0, PAGE_SIZE);
        }

        /* Copy PML4 */
        kernel_pml = pmap_kernel()->pm_pdir;
        tmp_pml = (void *)x86_tmp_pml_vaddr[PTP_LEVELS - 1];
        memcpy(tmp_pml, kernel_pml, PAGE_SIZE);

#ifdef PAE
        /*
         * Use the last 4 entries of the L2 page as L3 PD entries. These
         * last entries are unlikely to be used for temporary mappings.
         * 508: maps 0->1GB (userland)
         * 509: unused
         * 510: unused
         * 511: maps 3->4GB (kernel)
         */
        tmp_pml[508] = x86_tmp_pml_paddr[PTP_LEVELS - 1] | PG_V;
        tmp_pml[509] = 0;
        tmp_pml[510] = 0;
        tmp_pml[511] = pmap_pdirpa(pmap_kernel(), PDIR_SLOT_KERN) | PG_V;
#endif

        for (level = PTP_LEVELS - 1; level > 0; --level) {
                tmp_pml = (void *)x86_tmp_pml_vaddr[level];

                tmp_pml[pl_i(pg, level + 1)] =
                    (x86_tmp_pml_paddr[level - 1] & PG_FRAME) | PG_RW | PG_V;
        }

        tmp_pml = (void *)x86_tmp_pml_vaddr[0];
        tmp_pml[pl_i(pg, 1)] = (pg & PG_FRAME) | PG_RW | PG_V;

#ifdef PAE
        /* Return the PA of the L3 page (entry 508 of the L2 page) */
        return x86_tmp_pml_paddr[PTP_LEVELS - 1] + 508 * sizeof(pd_entry_t);
#endif

        return x86_tmp_pml_paddr[PTP_LEVELS - 1];
}

u_int
x86_mmap_flags(paddr_t mdpgno)
{
        u_int nflag = (mdpgno >> X86_MMAP_FLAG_SHIFT) & X86_MMAP_FLAG_MASK;
        u_int pflag = 0;

        if (nflag & X86_MMAP_FLAG_PREFETCH)
                pflag |= PMAP_WRITE_COMBINE;

        return pflag;
}
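
/*
 * Usage sketch for x86_mmap_flags(): a character device's d_mmap
 * routine can ask for a prefetchable (write-combining) mapping by
 * encoding X86_MMAP_FLAG_PREFETCH into the page number it returns;
 * x86_mmap_flags() above then decodes that into PMAP_WRITE_COMBINE.
 * The driver function and MYDRV_FB_PA below are hypothetical and
 * only illustrate the encoding implied by the decode above:
 *
 *      paddr_t
 *      mydrv_mmap(dev_t dev, off_t off, int prot)
 *      {
 *              return x86_btop(MYDRV_FB_PA + off) |
 *                  ((paddr_t)X86_MMAP_FLAG_PREFETCH << X86_MMAP_FLAG_SHIFT);
 *      }
 */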