/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/types.h>
#include <sys/sysmacros.h>
#include <sys/kmem.h>
#include <sys/atomic.h>
#include <sys/bitmap.h>
#include <sys/machparam.h>
#include <sys/machsystm.h>
#include <sys/mman.h>
#include <sys/systm.h>
#include <sys/cpuvar.h>
#include <sys/thread.h>
#include <sys/proc.h>
#include <sys/cpu.h>
#include <sys/kmem.h>
#include <sys/disp.h>
#include <sys/vmem.h>
#include <sys/vmsystm.h>
#include <sys/promif.h>
#include <sys/var.h>
#include <sys/x86_archext.h>
#include <sys/archsystm.h>
#include <sys/bootconf.h>
#include <sys/dumphdr.h>
#include <vm/seg_kmem.h>
#include <vm/seg_kpm.h>
#include <vm/hat.h>
#include <vm/hat_i86.h>
#include <sys/cmn_err.h>
#include <sys/panic.h>

#ifdef __xpv
#include <sys/hypervisor.h>
#include <sys/xpv_panic.h>
#endif

#include <sys/bootinfo.h>
#include <vm/kboot_mmu.h>

static void x86pte_zero(htable_t *dest, uint_t entry, uint_t count);

kmem_cache_t *htable_cache;

/*
 * The variable htable_reserve_amount, rather than HTABLE_RESERVE_AMOUNT,
 * is used in order to facilitate testing of the htable_steal() code.
 * By resetting htable_reserve_amount to a lower value, we can force
 * stealing to occur.
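 * (For example, an administrator could force this by setting
 * htable_reserve_amount to a small value via /etc/system.)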
 * The reserve amount is a guess to get us through boot.
 */
#define HTABLE_RESERVE_AMOUNT	(200)
uint_t htable_reserve_amount = HTABLE_RESERVE_AMOUNT;
kmutex_t htable_reserve_mutex;
uint_t htable_reserve_cnt;
htable_t *htable_reserve_pool;

/*
 * Used to hand test htable_steal().
 */
#ifdef DEBUG
ulong_t force_steal = 0;
ulong_t ptable_cnt = 0;
#endif

/*
 * This variable is so that we can tune this via /etc/system
 * Any value works, but a power of two <= mmu.ptes_per_table is best.
 */
uint_t htable_steal_passes = 8;

/*
 * mutex stuff for access to htable hash
 */
#define NUM_HTABLE_MUTEX 128
kmutex_t htable_mutex[NUM_HTABLE_MUTEX];
#define HTABLE_MUTEX_HASH(h) ((h) & (NUM_HTABLE_MUTEX - 1))

#define HTABLE_ENTER(h)	mutex_enter(&htable_mutex[HTABLE_MUTEX_HASH(h)]);
#define HTABLE_EXIT(h)	mutex_exit(&htable_mutex[HTABLE_MUTEX_HASH(h)]);

/*
 * forward declarations
 */
static void link_ptp(htable_t *higher, htable_t *new, uintptr_t vaddr);
static void unlink_ptp(htable_t *higher, htable_t *old, uintptr_t vaddr);
static void htable_free(htable_t *ht);
static x86pte_t *x86pte_access_pagetable(htable_t *ht, uint_t index);
static void x86pte_release_pagetable(htable_t *ht);
static x86pte_t x86pte_cas(htable_t *ht, uint_t entry, x86pte_t old,
    x86pte_t new);

/*
 * A counter to track if we are stealing or reaping htables. When non-zero
 * htable_free() will directly free htables (either to the reserve or kmem)
 * instead of putting them in a hat's htable cache.
 */
uint32_t htable_dont_cache = 0;

/*
 * Track the number of active pagetables, so we can know how many to reap
 */
static uint32_t active_ptables = 0;

#ifdef __xpv
/*
 * Deal with hypervisor complications.
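 *
 * The xen_*flush* routines below are thin wrappers around
 * HYPERVISOR_mmuext_op() TLB invalidation commands (MMUEXT_INVLPG_LOCAL,
 * MMUEXT_INVLPG_MULTI, MMUEXT_TLB_FLUSH_LOCAL and MMUEXT_TLB_FLUSH_MULTI);
 * when running in the xpv panic code they fall back to flushing the TLB
 * directly instead of issuing hypercalls.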
 */
void
xen_flush_va(caddr_t va)
{
    struct mmuext_op t;
    uint_t count;

    if (IN_XPV_PANIC()) {
        mmu_tlbflush_entry((caddr_t)va);
    } else {
        t.cmd = MMUEXT_INVLPG_LOCAL;
        t.arg1.linear_addr = (uintptr_t)va;
        if (HYPERVISOR_mmuext_op(&t, 1, &count, DOMID_SELF) < 0)
            panic("HYPERVISOR_mmuext_op() failed");
        ASSERT(count == 1);
    }
}

void
xen_gflush_va(caddr_t va, cpuset_t cpus)
{
    struct mmuext_op t;
    uint_t count;

    if (IN_XPV_PANIC()) {
        mmu_tlbflush_entry((caddr_t)va);
        return;
    }

    t.cmd = MMUEXT_INVLPG_MULTI;
    t.arg1.linear_addr = (uintptr_t)va;
    /*LINTED: constant in conditional context*/
    set_xen_guest_handle(t.arg2.vcpumask, &cpus);
    if (HYPERVISOR_mmuext_op(&t, 1, &count, DOMID_SELF) < 0)
        panic("HYPERVISOR_mmuext_op() failed");
    ASSERT(count == 1);
}

void
xen_flush_tlb()
{
    struct mmuext_op t;
    uint_t count;

    if (IN_XPV_PANIC()) {
        xpv_panic_reload_cr3();
    } else {
        t.cmd = MMUEXT_TLB_FLUSH_LOCAL;
        if (HYPERVISOR_mmuext_op(&t, 1, &count, DOMID_SELF) < 0)
            panic("HYPERVISOR_mmuext_op() failed");
        ASSERT(count == 1);
    }
}

void
xen_gflush_tlb(cpuset_t cpus)
{
    struct mmuext_op t;
    uint_t count;

    ASSERT(!IN_XPV_PANIC());
    t.cmd = MMUEXT_TLB_FLUSH_MULTI;
    /*LINTED: constant in conditional context*/
    set_xen_guest_handle(t.arg2.vcpumask, &cpus);
    if (HYPERVISOR_mmuext_op(&t, 1, &count, DOMID_SELF) < 0)
        panic("HYPERVISOR_mmuext_op() failed");
    ASSERT(count == 1);
}

/*
 * Install/Adjust a kpm mapping under the hypervisor.
 * Value of "how" should be:
 *	PT_WRITABLE | PT_VALID - regular kpm mapping
 *	PT_VALID - make mapping read-only
 *	0	- remove mapping
 *
 * returns 0 on success. non-zero for failure.
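 *
 * For example, ptable_free() below calls
 *	xen_kpm_page(pfn, PT_VALID | PT_WRITABLE)
 * to restore a regular writable kpm mapping, while htable_alloc() calls
 *	xen_kpm_page(ht->ht_pfn, PT_VALID)
 * to make the kpm mapping of a pagetable read-only.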
 */
int
xen_kpm_page(pfn_t pfn, uint_t how)
{
    paddr_t pa = mmu_ptob((paddr_t)pfn);
    x86pte_t pte = PT_NOCONSIST | PT_REF | PT_MOD;

    if (kpm_vbase == NULL)
        return (0);

    if (how)
        pte |= pa_to_ma(pa) | how;
    else
        pte = 0;
    return (HYPERVISOR_update_va_mapping((uintptr_t)kpm_vbase + pa,
        pte, UVMF_INVLPG | UVMF_ALL));
}

void
xen_pin(pfn_t pfn, level_t lvl)
{
    struct mmuext_op t;
    uint_t count;

    t.cmd = MMUEXT_PIN_L1_TABLE + lvl;
    t.arg1.mfn = pfn_to_mfn(pfn);
    if (HYPERVISOR_mmuext_op(&t, 1, &count, DOMID_SELF) < 0)
        panic("HYPERVISOR_mmuext_op() failed");
    ASSERT(count == 1);
}

void
xen_unpin(pfn_t pfn)
{
    struct mmuext_op t;
    uint_t count;

    t.cmd = MMUEXT_UNPIN_TABLE;
    t.arg1.mfn = pfn_to_mfn(pfn);
    if (HYPERVISOR_mmuext_op(&t, 1, &count, DOMID_SELF) < 0)
        panic("HYPERVISOR_mmuext_op() failed");
    ASSERT(count == 1);
}

static void
xen_map(uint64_t pte, caddr_t va)
{
    if (HYPERVISOR_update_va_mapping((uintptr_t)va, pte,
        UVMF_INVLPG | UVMF_LOCAL))
        panic("HYPERVISOR_update_va_mapping() failed");
}
#endif /* __xpv */

/*
 * Allocate a memory page for a hardware page table.
 *
 * A wrapper around page_get_physical(), with some extra checks.
 */
static pfn_t
ptable_alloc(void)
{
    pfn_t pfn;
    page_t *pp;

    pfn = PFN_INVALID;
    atomic_add_32(&active_ptables, 1);

    /*
     * The first check is to see if there is memory in the system. If we
     * drop to throttlefree, then fail the ptable_alloc() and let the
     * stealing code kick in. Note that we have to do this test here,
     * since the test in page_create_throttle() would let the NOSLEEP
     * allocation go through and deplete the page reserves.
     *
     * The !NOMEMWAIT() lets pageout, fsflush, etc. skip this check.
     */
    if (!NOMEMWAIT() && freemem <= throttlefree + 1)
        return (PFN_INVALID);

#ifdef DEBUG
    /*
     * This code makes htable_steal() easier to test. By setting
     * force_steal we force pagetable allocations to fall
     * into the stealing code. Roughly 1 in every "force_steal"
     * page table allocations will fail.
     */
    if (proc_pageout != NULL && force_steal > 1 &&
        ++ptable_cnt > force_steal) {
        ptable_cnt = 0;
        return (PFN_INVALID);
    }
#endif /* DEBUG */

    pp = page_get_physical(KM_NOSLEEP);
    if (pp == NULL)
        return (PFN_INVALID);
    ASSERT(PAGE_SHARED(pp));
    pfn = pp->p_pagenum;
    if (pfn == PFN_INVALID)
        panic("ptable_alloc(): Invalid PFN!!");
    HATSTAT_INC(hs_ptable_allocs);
    return (pfn);
}

/*
 * Free an htable's associated page table page. See the comments
 * for ptable_alloc().
 */
static void
ptable_free(pfn_t pfn)
{
    page_t *pp = page_numtopp_nolock(pfn);

    /*
     * need to destroy the page used for the pagetable
     */
    ASSERT(pfn != PFN_INVALID);
    HATSTAT_INC(hs_ptable_frees);
    atomic_add_32(&active_ptables, -1);
    if (pp == NULL)
        panic("ptable_free(): no page for pfn!");
    ASSERT(pfn == pp->p_pagenum);
    ASSERT(!IN_XPV_PANIC());
#ifdef __xpv
    if (kpm_vbase && xen_kpm_page(pfn, PT_VALID | PT_WRITABLE) < 0)
        panic("failure making kpm r/w pfn=0x%lx", pfn);
#endif
    page_free_physical(pp);
}

/*
 * Put one htable on the reserve list.
 */
static void
htable_put_reserve(htable_t *ht)
{
    ht->ht_hat = NULL;		/* no longer tied to a hat */
    ASSERT(ht->ht_pfn == PFN_INVALID);
    HATSTAT_INC(hs_htable_rputs);
    mutex_enter(&htable_reserve_mutex);
    ht->ht_next = htable_reserve_pool;
    htable_reserve_pool = ht;
    ++htable_reserve_cnt;
    mutex_exit(&htable_reserve_mutex);
}

/*
 * Take one htable from the reserve.
 */
static htable_t *
htable_get_reserve(void)
{
    htable_t *ht = NULL;

    mutex_enter(&htable_reserve_mutex);
    if (htable_reserve_cnt != 0) {
        ht = htable_reserve_pool;
        ASSERT(ht != NULL);
        ASSERT(ht->ht_pfn == PFN_INVALID);
        htable_reserve_pool = ht->ht_next;
        --htable_reserve_cnt;
        HATSTAT_INC(hs_htable_rgets);
    }
    mutex_exit(&htable_reserve_mutex);
    return (ht);
}

/*
 * Allocate initial htables and put them on the reserve list
 */
void
htable_initial_reserve(uint_t count)
{
    htable_t *ht;

    count += HTABLE_RESERVE_AMOUNT;
    while (count > 0) {
        ht = kmem_cache_alloc(htable_cache, KM_NOSLEEP);
        ASSERT(ht != NULL);

        ASSERT(use_boot_reserve);
        ht->ht_pfn = PFN_INVALID;
        htable_put_reserve(ht);
        --count;
    }
}

/*
 * Readjust the reserves after a thread finishes using them.
 */
void
htable_adjust_reserve()
{
    htable_t *ht;

    /*
     * Free any excess htables in the reserve list
     */
    while (htable_reserve_cnt > htable_reserve_amount &&
        !USE_HAT_RESERVES()) {
        ht = htable_get_reserve();
        if (ht == NULL)
            return;
        ASSERT(ht->ht_pfn == PFN_INVALID);
        kmem_cache_free(htable_cache, ht);
    }
}


/*
 * This routine steals htables from user processes for htable_alloc() or
 * for htable_reap().
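 *
 * Pass 0 only takes htables from each hat's cached free list; later
 * passes also unload mappings and steal in-use pagetables, accepting
 * victims with progressively more valid entries (up to
 * threshold = pass * mmu.ptes_per_table / htable_steal_passes).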
 */
static htable_t *
htable_steal(uint_t cnt)
{
    hat_t *hat = kas.a_hat;	/* list starts with khat */
    htable_t *list = NULL;
    htable_t *ht;
    htable_t *higher;
    uint_t h;
    uint_t h_start;
    static uint_t h_seed = 0;
    uint_t e;
    uintptr_t va;
    x86pte_t pte;
    uint_t stolen = 0;
    uint_t pass;
    uint_t threshold;

    /*
     * Limit htable_steal_passes to something reasonable
     */
    if (htable_steal_passes == 0)
        htable_steal_passes = 1;
    if (htable_steal_passes > mmu.ptes_per_table)
        htable_steal_passes = mmu.ptes_per_table;

    /*
     * Loop through all user hats. The 1st pass takes cached htables that
     * aren't in use. The later passes steal by removing mappings, too.
     */
    atomic_add_32(&htable_dont_cache, 1);
    for (pass = 0; pass <= htable_steal_passes && stolen < cnt; ++pass) {
        threshold = pass * mmu.ptes_per_table / htable_steal_passes;
        hat = kas.a_hat;
        for (;;) {

            /*
             * Clear the victim flag and move to next hat
             */
            mutex_enter(&hat_list_lock);
            if (hat != kas.a_hat) {
                hat->hat_flags &= ~HAT_VICTIM;
                cv_broadcast(&hat_list_cv);
            }
            hat = hat->hat_next;

            /*
             * Skip any hat that is already being stolen from.
             *
             * We skip SHARED hats, as these are dummy
             * hats that host ISM shared page tables.
             *
             * We also skip if HAT_FREEING because hat_pte_unmap()
             * won't zero out the PTE's. That would lead to hitting
             * stale PTEs either here or under hat_unload() when we
             * steal and unload the same page table in competing
             * threads.
             */
            while (hat != NULL &&
                (hat->hat_flags &
                (HAT_VICTIM | HAT_SHARED | HAT_FREEING)) != 0)
                hat = hat->hat_next;

            if (hat == NULL) {
                mutex_exit(&hat_list_lock);
                break;
            }

            /*
             * Are we finished?
             */
            if (stolen == cnt) {
                /*
                 * Try to spread the pain of stealing,
                 * move victim HAT to the end of the HAT list.
                 */
                if (pass >= 1 && cnt == 1 &&
                    kas.a_hat->hat_prev != hat) {

                    /* unlink victim hat */
                    if (hat->hat_prev)
                        hat->hat_prev->hat_next =
                            hat->hat_next;
                    else
                        kas.a_hat->hat_next =
                            hat->hat_next;
                    if (hat->hat_next)
                        hat->hat_next->hat_prev =
                            hat->hat_prev;
                    else
                        kas.a_hat->hat_prev =
                            hat->hat_prev;


                    /* relink at end of hat list */
                    hat->hat_next = NULL;
                    hat->hat_prev = kas.a_hat->hat_prev;
                    if (hat->hat_prev)
                        hat->hat_prev->hat_next = hat;
                    else
                        kas.a_hat->hat_next = hat;
                    kas.a_hat->hat_prev = hat;

                }

                mutex_exit(&hat_list_lock);
                break;
            }

            /*
             * Mark the HAT as a stealing victim.
             */
            hat->hat_flags |= HAT_VICTIM;
            mutex_exit(&hat_list_lock);

            /*
             * Take any htables from the hat's cached "free" list.
             */
            hat_enter(hat);
            while ((ht = hat->hat_ht_cached) != NULL &&
                stolen < cnt) {
                hat->hat_ht_cached = ht->ht_next;
                ht->ht_next = list;
                list = ht;
                ++stolen;
            }
            hat_exit(hat);

            /*
             * Don't steal on first pass.
             */
            if (pass == 0 || stolen == cnt)
                continue;

            /*
             * Search the active htables for one to steal.
             * Start at a different hash bucket every time to
             * help spread the pain of stealing.
             */
            h = h_start = h_seed++ % hat->hat_num_hash;
            do {
                higher = NULL;
                HTABLE_ENTER(h);
                for (ht = hat->hat_ht_hash[h]; ht;
                    ht = ht->ht_next) {

                    /*
                     * Can we rule out reaping?
                     */
                    if (ht->ht_busy != 0 ||
                        (ht->ht_flags & HTABLE_SHARED_PFN) ||
                        ht->ht_level > 0 ||
                        ht->ht_valid_cnt > threshold ||
                        ht->ht_lock_cnt != 0)
                        continue;

                    /*
                     * Increment busy so the htable can't
                     * disappear. We drop the htable mutex
                     * to avoid deadlocks with
                     * hat_pageunload() and the hment mutex
                     * while we call hat_pte_unmap()
                     */
                    ++ht->ht_busy;
                    HTABLE_EXIT(h);

                    /*
                     * Try stealing.
                     * - unload and invalidate all PTEs
                     */
                    for (e = 0, va = ht->ht_vaddr;
                        e < HTABLE_NUM_PTES(ht) &&
                        ht->ht_valid_cnt > 0 &&
                        ht->ht_busy == 1 &&
                        ht->ht_lock_cnt == 0;
                        ++e, va += MMU_PAGESIZE) {
                        pte = x86pte_get(ht, e);
                        if (!PTE_ISVALID(pte))
                            continue;
                        hat_pte_unmap(ht, e,
                            HAT_UNLOAD, pte, NULL);
                    }

                    /*
                     * Reacquire htable lock. If we didn't
                     * remove all mappings in the table,
                     * or another thread added a new mapping
                     * behind us, give up on this table.
                     */
                    HTABLE_ENTER(h);
                    if (ht->ht_busy != 1 ||
                        ht->ht_valid_cnt != 0 ||
                        ht->ht_lock_cnt != 0) {
                        --ht->ht_busy;
                        continue;
                    }

                    /*
                     * Steal it and unlink the page table.
                     */
                    higher = ht->ht_parent;
                    unlink_ptp(higher, ht, ht->ht_vaddr);

                    /*
                     * remove from the hash list
                     */
                    if (ht->ht_next)
                        ht->ht_next->ht_prev =
                            ht->ht_prev;

                    if (ht->ht_prev) {
                        ht->ht_prev->ht_next =
                            ht->ht_next;
                    } else {
                        ASSERT(hat->hat_ht_hash[h] ==
                            ht);
                        hat->hat_ht_hash[h] =
                            ht->ht_next;
                    }

                    /*
                     * Break to outer loop to release the
                     * higher (ht_parent) pagetable. This
                     * spreads out the pain caused by
                     * pagefaults.
                     */
                    ht->ht_next = list;
                    list = ht;
                    ++stolen;
                    break;
                }
                HTABLE_EXIT(h);
                if (higher != NULL)
                    htable_release(higher);
                if (++h == hat->hat_num_hash)
                    h = 0;
            } while (stolen < cnt && h != h_start);
        }
    }
    atomic_add_32(&htable_dont_cache, -1);
    return (list);
}

/*
 * This is invoked from kmem when the system is low on memory.  We try
 * to free hments, htables, and ptables to improve the memory situation.
 */
/*ARGSUSED*/
static void
htable_reap(void *handle)
{
    uint_t reap_cnt;
    htable_t *list;
    htable_t *ht;

    HATSTAT_INC(hs_reap_attempts);
    if (!can_steal_post_boot)
        return;

    /*
     * Try to reap 5% of the page tables bounded by a maximum of
     * 5% of physmem and a minimum of 10.
     */
    reap_cnt = MAX(MIN(physmem / 20, active_ptables / 20), 10);

    /*
     * Let htable_steal() do the work, we just call htable_free()
     */
    XPV_DISALLOW_MIGRATE();
    list = htable_steal(reap_cnt);
    XPV_ALLOW_MIGRATE();
    while ((ht = list) != NULL) {
        list = ht->ht_next;
        HATSTAT_INC(hs_reaped);
        htable_free(ht);
    }

    /*
     * Free up excess reserves
     */
    htable_adjust_reserve();
    hment_adjust_reserve();
}

/*
 * Allocate an htable, stealing one or using the reserve if necessary
 */
static htable_t *
htable_alloc(
    hat_t *hat,
    uintptr_t vaddr,
    level_t level,
    htable_t *shared)
{
    htable_t *ht = NULL;
    uint_t is_vlp;
    uint_t is_bare = 0;
    uint_t need_to_zero = 1;
    int kmflags = (can_steal_post_boot ? KM_NOSLEEP : KM_SLEEP);

    if (level < 0 || level > TOP_LEVEL(hat))
        panic("htable_alloc(): level %d out of range\n", level);

    is_vlp = (hat->hat_flags & HAT_VLP) && level == VLP_LEVEL;
    if (is_vlp || shared != NULL)
        is_bare = 1;

    /*
     * First reuse a cached htable from the hat_ht_cached field, this
     * avoids unnecessary trips through kmem/page allocators.
     */
    if (hat->hat_ht_cached != NULL && !is_bare) {
        hat_enter(hat);
        ht = hat->hat_ht_cached;
        if (ht != NULL) {
            hat->hat_ht_cached = ht->ht_next;
            need_to_zero = 0;
            /* XX64 ASSERT() they're all zero somehow */
            ASSERT(ht->ht_pfn != PFN_INVALID);
        }
        hat_exit(hat);
    }

    if (ht == NULL) {
        /*
         * Allocate an htable, possibly refilling the reserves.
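         * When USE_HAT_RESERVES() is true we must take from the
         * reserve pool; otherwise we allocate from the kmem cache
         * and donate extra allocations to the reserve until it
         * reaches htable_reserve_amount.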
         */
        if (USE_HAT_RESERVES()) {
            ht = htable_get_reserve();
        } else {
            /*
             * Donate successful htable allocations to the reserve.
             */
            for (;;) {
                ht = kmem_cache_alloc(htable_cache, kmflags);
                if (ht == NULL)
                    break;
                ht->ht_pfn = PFN_INVALID;
                if (USE_HAT_RESERVES() ||
                    htable_reserve_cnt >= htable_reserve_amount)
                    break;
                htable_put_reserve(ht);
            }
        }

        /*
         * allocate a page for the hardware page table if needed
         */
        if (ht != NULL && !is_bare) {
            ht->ht_hat = hat;
            ht->ht_pfn = ptable_alloc();
            if (ht->ht_pfn == PFN_INVALID) {
                if (USE_HAT_RESERVES())
                    htable_put_reserve(ht);
                else
                    kmem_cache_free(htable_cache, ht);
                ht = NULL;
            }
        }
    }

    /*
     * If allocations failed, kick off a kmem_reap() and resort to
     * htable_steal(). We may spin here if the system is very low on
     * memory. If the kernel itself has consumed all memory and kmem_reap()
     * can't free up anything, then we'll really get stuck here.
     * That should only happen in a system where the administrator has
     * misconfigured VM parameters via /etc/system.
     */
    while (ht == NULL && can_steal_post_boot) {
        kmem_reap();
        ht = htable_steal(1);
        HATSTAT_INC(hs_steals);

        /*
         * If we stole for a bare htable, release the pagetable page.
         */
        if (ht != NULL) {
            if (is_bare) {
                ptable_free(ht->ht_pfn);
                ht->ht_pfn = PFN_INVALID;
#if defined(__xpv) && defined(__amd64)
            /*
             * make stolen page table writable again in kpm
             */
            } else if (kpm_vbase && xen_kpm_page(ht->ht_pfn,
                PT_VALID | PT_WRITABLE) < 0) {
                panic("failure making kpm r/w pfn=0x%lx",
                    ht->ht_pfn);
#endif
            }
        }
    }

    /*
     * All attempts to allocate or steal failed. This should only happen
     * if we run out of memory during boot, due perhaps to a huge
     * boot_archive. At this point there's no way to continue.
     */
    if (ht == NULL)
        panic("htable_alloc(): couldn't steal\n");

#if defined(__amd64) && defined(__xpv)
    /*
     * Under the 64-bit hypervisor, we have 2 top level page tables.
     * If this allocation fails, we'll resort to stealing.
     * We use the stolen page indirectly, by freeing the
     * stolen htable first.
     */
    if (level == mmu.max_level) {
        for (;;) {
            htable_t *stolen;

            hat->hat_user_ptable = ptable_alloc();
            if (hat->hat_user_ptable != PFN_INVALID)
                break;
            stolen = htable_steal(1);
            if (stolen == NULL)
                panic("2nd steal ptable failed\n");
            htable_free(stolen);
        }
        block_zero_no_xmm(kpm_vbase + pfn_to_pa(hat->hat_user_ptable),
            MMU_PAGESIZE);
    }
#endif

    /*
     * Shared page tables have all entries locked and entries may not
     * be added or deleted.
     */
    ht->ht_flags = 0;
    if (shared != NULL) {
        ASSERT(shared->ht_valid_cnt > 0);
        ht->ht_flags |= HTABLE_SHARED_PFN;
        ht->ht_pfn = shared->ht_pfn;
        ht->ht_lock_cnt = 0;
        ht->ht_valid_cnt = 0;		/* updated in hat_share() */
        ht->ht_shares = shared;
        need_to_zero = 0;
    } else {
        ht->ht_shares = NULL;
        ht->ht_lock_cnt = 0;
        ht->ht_valid_cnt = 0;
    }

    /*
     * setup flags, etc. for VLP htables
     */
    if (is_vlp) {
        ht->ht_flags |= HTABLE_VLP;
        ASSERT(ht->ht_pfn == PFN_INVALID);
        need_to_zero = 0;
    }

    /*
     * fill in the htable
     */
    ht->ht_hat = hat;
    ht->ht_parent = NULL;
    ht->ht_vaddr = vaddr;
    ht->ht_level = level;
    ht->ht_busy = 1;
    ht->ht_next = NULL;
    ht->ht_prev = NULL;

    /*
     * Zero out any freshly allocated page table
     */
    if (need_to_zero)
        x86pte_zero(ht, 0, mmu.ptes_per_table);

#if defined(__amd64) && defined(__xpv)
    if (!is_bare && kpm_vbase) {
        (void) xen_kpm_page(ht->ht_pfn, PT_VALID);
        if (level == mmu.max_level)
            (void) xen_kpm_page(hat->hat_user_ptable, PT_VALID);
    }
#endif

    return (ht);
}

/*
 * Free up an htable, either to a hat's cached list, the reserves or
 * back to kmem.
 */
static void
htable_free(htable_t *ht)
{
    hat_t *hat = ht->ht_hat;

    /*
     * If the process isn't exiting, cache the free htable in the hat
     * structure.
     * We always do this for the boot time reserve. We don't
     * do this if the hat is exiting or we are stealing/reaping htables.
     */
    if (hat != NULL &&
        !(ht->ht_flags & HTABLE_SHARED_PFN) &&
        (use_boot_reserve ||
        (!(hat->hat_flags & HAT_FREEING) && !htable_dont_cache))) {
        ASSERT((ht->ht_flags & HTABLE_VLP) == 0);
        ASSERT(ht->ht_pfn != PFN_INVALID);
        hat_enter(hat);
        ht->ht_next = hat->hat_ht_cached;
        hat->hat_ht_cached = ht;
        hat_exit(hat);
        return;
    }

    /*
     * If we have a hardware page table, free it.
     * We don't free page tables that are accessed by sharing.
     */
    if (ht->ht_flags & HTABLE_SHARED_PFN) {
        ASSERT(ht->ht_pfn != PFN_INVALID);
    } else if (!(ht->ht_flags & HTABLE_VLP)) {
        ptable_free(ht->ht_pfn);
#if defined(__amd64) && defined(__xpv)
        if (ht->ht_level == mmu.max_level) {
            ptable_free(hat->hat_user_ptable);
            hat->hat_user_ptable = PFN_INVALID;
        }
#endif
    }
    ht->ht_pfn = PFN_INVALID;

    /*
     * Free it or put into reserves.
     */
    if (USE_HAT_RESERVES() || htable_reserve_cnt < htable_reserve_amount) {
        htable_put_reserve(ht);
    } else {
        kmem_cache_free(htable_cache, ht);
        htable_adjust_reserve();
    }
}


/*
 * This is called when a hat is being destroyed or swapped out. We reap all
 * the remaining htables in the hat cache. If destroying, all left over
 * htables are also destroyed.
 *
 * We also don't need to invalidate any of the PTPs nor do any demapping.
 */
void
htable_purge_hat(hat_t *hat)
{
    htable_t *ht;
    int h;

    /*
     * Purge the htable cache if just reaping.
     */
    if (!(hat->hat_flags & HAT_FREEING)) {
        atomic_add_32(&htable_dont_cache, 1);
        for (;;) {
            hat_enter(hat);
            ht = hat->hat_ht_cached;
            if (ht == NULL) {
                hat_exit(hat);
                break;
            }
            hat->hat_ht_cached = ht->ht_next;
            hat_exit(hat);
            htable_free(ht);
        }
        atomic_add_32(&htable_dont_cache, -1);
        return;
    }

    /*
     * if freeing, no locking is needed
     */
    while ((ht = hat->hat_ht_cached) != NULL) {
        hat->hat_ht_cached = ht->ht_next;
        htable_free(ht);
    }

    /*
     * walk thru the htable hash table and free all the htables in it.
     */
    for (h = 0; h < hat->hat_num_hash; ++h) {
        while ((ht = hat->hat_ht_hash[h]) != NULL) {
            if (ht->ht_next)
                ht->ht_next->ht_prev = ht->ht_prev;

            if (ht->ht_prev) {
                ht->ht_prev->ht_next = ht->ht_next;
            } else {
                ASSERT(hat->hat_ht_hash[h] == ht);
                hat->hat_ht_hash[h] = ht->ht_next;
            }
            htable_free(ht);
        }
    }
}

/*
 * Unlink an entry for a table at vaddr and level out of the existing table
 * one level higher. We are always holding the HASH_ENTER() when doing this.
 */
static void
unlink_ptp(htable_t *higher, htable_t *old, uintptr_t vaddr)
{
    uint_t entry = htable_va2entry(vaddr, higher);
    x86pte_t expect = MAKEPTP(old->ht_pfn, old->ht_level);
    x86pte_t found;
    hat_t *hat = old->ht_hat;

    ASSERT(higher->ht_busy > 0);
    ASSERT(higher->ht_valid_cnt > 0);
    ASSERT(old->ht_valid_cnt == 0);
    found = x86pte_cas(higher, entry, expect, 0);
#ifdef __xpv
    /*
     * This is weird, but Xen apparently automatically unlinks empty
     * pagetables from the upper page table. So allow PTP to be 0 already.
     */
    if (found != expect && found != 0)
#else
    if (found != expect)
#endif
        panic("Bad PTP found=" FMT_PTE ", expected=" FMT_PTE,
            found, expect);

    /*
     * When a top level VLP page table entry changes, we must issue
     * a reload of cr3 on all processors.
     *
     * If we don't need to do that, then we still have to INVLPG against
     * an address covered by the inner page table, as the latest processors
     * have TLB-like caches for non-leaf page table entries.
     */
    if (!(hat->hat_flags & HAT_FREEING)) {
        hat_tlb_inval(hat, (higher->ht_flags & HTABLE_VLP) ?
            DEMAP_ALL_ADDR : old->ht_vaddr);
    }

    HTABLE_DEC(higher->ht_valid_cnt);
}

/*
 * Link an entry for a new table at vaddr and level into the existing table
 * one level higher. We are always holding the HASH_ENTER() when doing this.
 */
static void
link_ptp(htable_t *higher, htable_t *new, uintptr_t vaddr)
{
    uint_t entry = htable_va2entry(vaddr, higher);
    x86pte_t newptp = MAKEPTP(new->ht_pfn, new->ht_level);
    x86pte_t found;

    ASSERT(higher->ht_busy > 0);

    ASSERT(new->ht_level != mmu.max_level);

    HTABLE_INC(higher->ht_valid_cnt);

    found = x86pte_cas(higher, entry, 0, newptp);
    if ((found & ~PT_REF) != 0)
        panic("HAT: ptp not 0, found=" FMT_PTE, found);

    /*
     * When any top level VLP page table entry changes, we must issue
     * a reload of cr3 on all processors using it.
     * We also need to do this for the kernel hat on PAE 32 bit kernel.
     */
    if (
#ifdef __i386
        (higher->ht_hat == kas.a_hat && higher->ht_level == VLP_LEVEL) ||
#endif
        (higher->ht_flags & HTABLE_VLP))
        hat_tlb_inval(higher->ht_hat, DEMAP_ALL_ADDR);
}

/*
 * Release of hold on an htable.  If this is the last use and the pagetable
 * is empty we may want to free it, then recursively look at the pagetable
 * above it. The recursion is handled by the outer while() loop.
 *
 * On the metal, during process exit, we don't bother unlinking the tables from
 * upper level pagetables. They are instead handled in bulk by hat_free_end().
 * We can't do this on the hypervisor as we need the page table to be
 * implicitly unpinned before it goes to the free page lists. This can't
 * happen unless we fully unlink it from the page table hierarchy.
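 *
 * The inner for(;;) loop below walks up through ht_parent as empty
 * pagetables are unlinked and freed; the outer while() loop then
 * releases the htable that a just-freed shared htable was sharing
 * from, if there was one.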
 */
void
htable_release(htable_t *ht)
{
    uint_t hashval;
    htable_t *shared;
    htable_t *higher;
    hat_t *hat;
    uintptr_t va;
    level_t level;

    while (ht != NULL) {
        shared = NULL;
        for (;;) {
            hat = ht->ht_hat;
            va = ht->ht_vaddr;
            level = ht->ht_level;
            hashval = HTABLE_HASH(hat, va, level);

            /*
             * The common case is that this isn't the last use of
             * an htable so we don't want to free the htable.
             */
            HTABLE_ENTER(hashval);
            ASSERT(ht->ht_valid_cnt >= 0);
            ASSERT(ht->ht_busy > 0);
            if (ht->ht_valid_cnt > 0)
                break;
            if (ht->ht_busy > 1)
                break;
            ASSERT(ht->ht_lock_cnt == 0);

#if !defined(__xpv)
            /*
             * we always release empty shared htables
             */
            if (!(ht->ht_flags & HTABLE_SHARED_PFN)) {

                /*
                 * don't release if in address space tear down
                 */
                if (hat->hat_flags & HAT_FREEING)
                    break;

                /*
                 * At and above max_page_level, free if it's for
                 * a boot-time kernel mapping below kernelbase.
                 */
                if (level >= mmu.max_page_level &&
                    (hat != kas.a_hat || va >= kernelbase))
                    break;
            }
#endif /* __xpv */

            /*
             * Remember if we destroy an htable that shares its PFN
             * from elsewhere.
             */
            if (ht->ht_flags & HTABLE_SHARED_PFN) {
                ASSERT(shared == NULL);
                shared = ht->ht_shares;
                HATSTAT_INC(hs_htable_unshared);
            }

            /*
             * Handle release of a table and freeing the htable_t.
             * Unlink it from the table higher (ie. ht_parent).
             */
            ASSERT(ht->ht_lock_cnt == 0);
            higher = ht->ht_parent;
            ASSERT(higher != NULL);

            /*
             * Unlink the pagetable.
             */
            unlink_ptp(higher, ht, va);

            /*
             * remove this htable from its hash list
             */
            if (ht->ht_next)
                ht->ht_next->ht_prev = ht->ht_prev;

            if (ht->ht_prev) {
                ht->ht_prev->ht_next = ht->ht_next;
            } else {
                ASSERT(hat->hat_ht_hash[hashval] == ht);
                hat->hat_ht_hash[hashval] = ht->ht_next;
            }
            HTABLE_EXIT(hashval);
            htable_free(ht);
            ht = higher;
        }

        ASSERT(ht->ht_busy >= 1);
        --ht->ht_busy;
        HTABLE_EXIT(hashval);

        /*
         * If we released a shared htable, do a release on the htable
         * from which it shared
         */
        ht = shared;
    }
}

/*
 * Find the htable for the pagetable at the given level for the given address.
 * If found acquires a hold that eventually needs to be htable_release()d
 */
htable_t *
htable_lookup(hat_t *hat, uintptr_t vaddr, level_t level)
{
    uintptr_t base;
    uint_t hashval;
    htable_t *ht = NULL;

    ASSERT(level >= 0);
    ASSERT(level <= TOP_LEVEL(hat));

    if (level == TOP_LEVEL(hat)) {
#if defined(__amd64)
        /*
         * 32 bit address spaces on 64 bit kernels need to check
         * for overflow of the 32 bit address space
         */
        if ((hat->hat_flags & HAT_VLP) && vaddr >= ((uint64_t)1 << 32))
            return (NULL);
#endif
        base = 0;
    } else {
        base = vaddr & LEVEL_MASK(level + 1);
    }

    hashval = HTABLE_HASH(hat, base, level);
    HTABLE_ENTER(hashval);
    for (ht = hat->hat_ht_hash[hashval]; ht; ht = ht->ht_next) {
        if (ht->ht_hat == hat &&
            ht->ht_vaddr == base &&
            ht->ht_level == level)
            break;
    }
    if (ht)
        ++ht->ht_busy;

    HTABLE_EXIT(hashval);
    return (ht);
}

/*
 * Acquires a hold on a known htable (from a locked hment entry).
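 * The caller already has a reference that keeps the htable from going
 * away (e.g. the locked hment), so all we do here is bump ht_busy while
 * holding the hash bucket mutex.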
 */
void
htable_acquire(htable_t *ht)
{
    hat_t *hat = ht->ht_hat;
    level_t level = ht->ht_level;
    uintptr_t base = ht->ht_vaddr;
    uint_t hashval = HTABLE_HASH(hat, base, level);

    HTABLE_ENTER(hashval);
#ifdef DEBUG
    /*
     * make sure the htable is there
     */
    {
        htable_t *h;

        for (h = hat->hat_ht_hash[hashval];
            h && h != ht;
            h = h->ht_next)
            ;
        ASSERT(h == ht);
    }
#endif /* DEBUG */
    ++ht->ht_busy;
    HTABLE_EXIT(hashval);
}

/*
 * Find the htable for the pagetable at the given level for the given address.
 * If found acquires a hold that eventually needs to be htable_release()d
 * If not found the table is created.
 *
 * Since we can't hold a hash table mutex during allocation, we have to
 * drop it and redo the search on a create. Then we may have to free the newly
 * allocated htable if another thread raced in and created it ahead of us.
 */
htable_t *
htable_create(
    hat_t *hat,
    uintptr_t vaddr,
    level_t level,
    htable_t *shared)
{
    uint_t h;
    level_t l;
    uintptr_t base;
    htable_t *ht;
    htable_t *higher = NULL;
    htable_t *new = NULL;

    if (level < 0 || level > TOP_LEVEL(hat))
        panic("htable_create(): level %d out of range\n", level);

    /*
     * Create the page tables in top down order.
     */
    for (l = TOP_LEVEL(hat); l >= level; --l) {
        new = NULL;
        if (l == TOP_LEVEL(hat))
            base = 0;
        else
            base = vaddr & LEVEL_MASK(l + 1);

        h = HTABLE_HASH(hat, base, l);
try_again:
        /*
         * look up the htable at this level
         */
        HTABLE_ENTER(h);
        if (l == TOP_LEVEL(hat)) {
            ht = hat->hat_htable;
        } else {
            for (ht = hat->hat_ht_hash[h]; ht; ht = ht->ht_next) {
                ASSERT(ht->ht_hat == hat);
                if (ht->ht_vaddr == base &&
                    ht->ht_level == l)
                    break;
            }
        }

        /*
         * if we found the htable, increment its busy cnt
         * and if we had allocated a new htable, free it.
         */
        if (ht != NULL) {
            /*
             * If we find a pre-existing shared table, it must
             * share from the same place.
             */
            if (l == level && shared && ht->ht_shares &&
                ht->ht_shares != shared) {
                panic("htable shared from wrong place "
                    "found htable=%p shared=%p",
                    (void *)ht, (void *)shared);
            }
            ++ht->ht_busy;
            HTABLE_EXIT(h);
            if (new)
                htable_free(new);
            if (higher != NULL)
                htable_release(higher);
            higher = ht;

            /*
             * if we didn't find it on the first search
             * allocate a new one and search again
             */
        } else if (new == NULL) {
            HTABLE_EXIT(h);
            new = htable_alloc(hat, base, l,
                l == level ? shared : NULL);
            goto try_again;

            /*
             * 2nd search and still not there, use "new" table
             * Link new table into higher, when not at top level.
             */
        } else {
            ht = new;
            if (higher != NULL) {
                link_ptp(higher, ht, base);
                ht->ht_parent = higher;
            }
            ht->ht_next = hat->hat_ht_hash[h];
            ASSERT(ht->ht_prev == NULL);
            if (hat->hat_ht_hash[h])
                hat->hat_ht_hash[h]->ht_prev = ht;
            hat->hat_ht_hash[h] = ht;
            HTABLE_EXIT(h);

            /*
             * Note we don't do htable_release(higher).
             * That happens recursively when "new" is removed by
             * htable_release() or htable_steal().
13920Sstevel@tonic-gate */ 13930Sstevel@tonic-gate higher = ht; 13940Sstevel@tonic-gate 13950Sstevel@tonic-gate /* 13960Sstevel@tonic-gate * If we just created a new shared page table we 13970Sstevel@tonic-gate * increment the shared htable's busy count, so that 13980Sstevel@tonic-gate * it can't be the victim of a steal even if it's empty. 13990Sstevel@tonic-gate */ 14000Sstevel@tonic-gate if (l == level && shared) { 14010Sstevel@tonic-gate (void) htable_lookup(shared->ht_hat, 14020Sstevel@tonic-gate shared->ht_vaddr, shared->ht_level); 14030Sstevel@tonic-gate HATSTAT_INC(hs_htable_shared); 14040Sstevel@tonic-gate } 14050Sstevel@tonic-gate } 14060Sstevel@tonic-gate } 14070Sstevel@tonic-gate 14080Sstevel@tonic-gate return (ht); 14090Sstevel@tonic-gate } 14100Sstevel@tonic-gate 14110Sstevel@tonic-gate /* 14125084Sjohnlev * Inherit initial pagetables from the boot program. On the 64-bit 14135084Sjohnlev * hypervisor we also temporarily mark the p_index field of page table 14145084Sjohnlev * pages, so we know not to try making them writable in seg_kpm. 14153446Smrj */ 14163446Smrj void 14173446Smrj htable_attach( 14183446Smrj hat_t *hat, 14193446Smrj uintptr_t base, 14203446Smrj level_t level, 14213446Smrj htable_t *parent, 14223446Smrj pfn_t pfn) 14233446Smrj { 14243446Smrj htable_t *ht; 14253446Smrj uint_t h; 14263446Smrj uint_t i; 14273446Smrj x86pte_t pte; 14283446Smrj x86pte_t *ptep; 14293446Smrj page_t *pp; 14303446Smrj extern page_t *boot_claim_page(pfn_t); 14313446Smrj 14323446Smrj ht = htable_get_reserve(); 14333446Smrj if (level == mmu.max_level) 14343446Smrj kas.a_hat->hat_htable = ht; 14353446Smrj ht->ht_hat = hat; 14363446Smrj ht->ht_parent = parent; 14373446Smrj ht->ht_vaddr = base; 14383446Smrj ht->ht_level = level; 14393446Smrj ht->ht_busy = 1; 14403446Smrj ht->ht_next = NULL; 14413446Smrj ht->ht_prev = NULL; 14423446Smrj ht->ht_flags = 0; 14433446Smrj ht->ht_pfn = pfn; 14443446Smrj ht->ht_lock_cnt = 0; 14453446Smrj ht->ht_valid_cnt = 0; 14463446Smrj if (parent != NULL) 14473446Smrj ++parent->ht_busy; 14483446Smrj 14493446Smrj h = HTABLE_HASH(hat, base, level); 14503446Smrj HTABLE_ENTER(h); 14513446Smrj ht->ht_next = hat->hat_ht_hash[h]; 14523446Smrj ASSERT(ht->ht_prev == NULL); 14533446Smrj if (hat->hat_ht_hash[h]) 14543446Smrj hat->hat_ht_hash[h]->ht_prev = ht; 14553446Smrj hat->hat_ht_hash[h] = ht; 14563446Smrj HTABLE_EXIT(h); 14573446Smrj 14583446Smrj /* 14593446Smrj * make sure the page table physical page is not FREE 14603446Smrj */ 14613446Smrj if (page_resv(1, KM_NOSLEEP) == 0) 14623446Smrj panic("page_resv() failed in ptable alloc"); 14633446Smrj 14643446Smrj pp = boot_claim_page(pfn); 14653446Smrj ASSERT(pp != NULL); 14663446Smrj page_downgrade(pp); 14675084Sjohnlev #if defined(__xpv) && defined(__amd64) 14683446Smrj /* 14693446Smrj * Record in the page_t that is a pagetable for segkpm setup. 14703446Smrj */ 14713446Smrj if (kpm_vbase) 14723446Smrj pp->p_index = 1; 14735084Sjohnlev #endif 14743446Smrj 14753446Smrj /* 14763446Smrj * Count valid mappings and recursively attach lower level pagetables. 
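 *
 * Note that the loop below re-establishes the boot remap window with
 * kbm_remap_window() after each recursive htable_attach() call, since
 * the recursion reuses that same single window to examine the lower
 * level pagetable.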
14773446Smrj */ 14783446Smrj ptep = kbm_remap_window(pfn_to_pa(pfn), 0); 14793446Smrj for (i = 0; i < HTABLE_NUM_PTES(ht); ++i) { 14803446Smrj if (mmu.pae_hat) 14813446Smrj pte = ptep[i]; 14823446Smrj else 14833446Smrj pte = ((x86pte32_t *)ptep)[i]; 14843446Smrj if (!IN_HYPERVISOR_VA(base) && PTE_ISVALID(pte)) { 14853446Smrj ++ht->ht_valid_cnt; 14863446Smrj if (!PTE_ISPAGE(pte, level)) { 14873446Smrj htable_attach(hat, base, level - 1, 14883446Smrj ht, PTE2PFN(pte, level)); 14893446Smrj ptep = kbm_remap_window(pfn_to_pa(pfn), 0); 14903446Smrj } 14913446Smrj } 14923446Smrj base += LEVEL_SIZE(level); 14933446Smrj if (base == mmu.hole_start) 14943446Smrj base = (mmu.hole_end + MMU_PAGEOFFSET) & MMU_PAGEMASK; 14953446Smrj } 14963446Smrj 14973446Smrj /* 14983446Smrj * As long as all the mappings we had were below kernel base 14993446Smrj * we can release the htable. 15003446Smrj */ 15013446Smrj if (base < kernelbase) 15023446Smrj htable_release(ht); 15033446Smrj } 15043446Smrj 15053446Smrj /* 15060Sstevel@tonic-gate * Walk through a given htable looking for the first valid entry. This 15070Sstevel@tonic-gate * routine takes both a starting and ending address. The starting address 15080Sstevel@tonic-gate * is required to be within the htable provided by the caller, but there is 15090Sstevel@tonic-gate * no such restriction on the ending address. 15100Sstevel@tonic-gate * 15110Sstevel@tonic-gate * If the routine finds a valid entry in the htable (at or beyond the 15120Sstevel@tonic-gate * starting address), the PTE (and its address) will be returned. 15130Sstevel@tonic-gate * This PTE may correspond to either a page or a pagetable - it is the 15140Sstevel@tonic-gate * caller's responsibility to determine which. If no valid entry is 15150Sstevel@tonic-gate * found, 0 (and invalid PTE) and the next unexamined address will be 15160Sstevel@tonic-gate * returned. 15170Sstevel@tonic-gate * 15180Sstevel@tonic-gate * The loop has been carefully coded for optimization. 15190Sstevel@tonic-gate */ 15200Sstevel@tonic-gate static x86pte_t 15210Sstevel@tonic-gate htable_scan(htable_t *ht, uintptr_t *vap, uintptr_t eaddr) 15220Sstevel@tonic-gate { 15230Sstevel@tonic-gate uint_t e; 15240Sstevel@tonic-gate x86pte_t found_pte = (x86pte_t)0; 15253446Smrj caddr_t pte_ptr; 15263446Smrj caddr_t end_pte_ptr; 15270Sstevel@tonic-gate int l = ht->ht_level; 15280Sstevel@tonic-gate uintptr_t va = *vap & LEVEL_MASK(l); 15290Sstevel@tonic-gate size_t pgsize = LEVEL_SIZE(l); 15300Sstevel@tonic-gate 15310Sstevel@tonic-gate ASSERT(va >= ht->ht_vaddr); 15320Sstevel@tonic-gate ASSERT(va <= HTABLE_LAST_PAGE(ht)); 15330Sstevel@tonic-gate 15340Sstevel@tonic-gate /* 15350Sstevel@tonic-gate * Compute the starting index and ending virtual address 15360Sstevel@tonic-gate */ 15370Sstevel@tonic-gate e = htable_va2entry(va, ht); 15380Sstevel@tonic-gate 15390Sstevel@tonic-gate /* 15400Sstevel@tonic-gate * The following page table scan code knows that the valid 15410Sstevel@tonic-gate * bit of a PTE is in the lowest byte AND that x86 is little endian!! 
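 *
 * For illustration: a valid 64 bit PTE such as 0x8000000012345027 is
 * stored little endian as the byte sequence 27 50 34 12 00 00 00 80,
 * so the byte at the lowest address (0x27) contains the PT_VALID bit
 * (bit 0).  That is what lets the loop test PTE_ISVALID() on a single
 * byte through a caddr_t and just step the pointer by mmu.pte_size,
 * whether PTEs are 4 or 8 bytes wide.  On a big endian machine that
 * first byte would instead hold the PTE's uppermost bits.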
15420Sstevel@tonic-gate */ 15433446Smrj pte_ptr = (caddr_t)x86pte_access_pagetable(ht, 0); 15443446Smrj end_pte_ptr = (caddr_t)PT_INDEX_PTR(pte_ptr, HTABLE_NUM_PTES(ht)); 15453446Smrj pte_ptr = (caddr_t)PT_INDEX_PTR((x86pte_t *)pte_ptr, e); 15462687Skchow while (!PTE_ISVALID(*pte_ptr)) { 15470Sstevel@tonic-gate va += pgsize; 15480Sstevel@tonic-gate if (va >= eaddr) 15490Sstevel@tonic-gate break; 15500Sstevel@tonic-gate pte_ptr += mmu.pte_size; 15510Sstevel@tonic-gate ASSERT(pte_ptr <= end_pte_ptr); 15520Sstevel@tonic-gate if (pte_ptr == end_pte_ptr) 15530Sstevel@tonic-gate break; 15540Sstevel@tonic-gate } 15550Sstevel@tonic-gate 15560Sstevel@tonic-gate /* 15570Sstevel@tonic-gate * if we found a valid PTE, load the entire PTE 15580Sstevel@tonic-gate */ 15593446Smrj if (va < eaddr && pte_ptr != end_pte_ptr) 15603446Smrj found_pte = GET_PTE((x86pte_t *)pte_ptr); 15610Sstevel@tonic-gate x86pte_release_pagetable(ht); 15620Sstevel@tonic-gate 15630Sstevel@tonic-gate #if defined(__amd64) 15640Sstevel@tonic-gate /* 15650Sstevel@tonic-gate * deal with VA hole on amd64 15660Sstevel@tonic-gate */ 15670Sstevel@tonic-gate if (l == mmu.max_level && va >= mmu.hole_start && va <= mmu.hole_end) 15680Sstevel@tonic-gate va = mmu.hole_end + va - mmu.hole_start; 15690Sstevel@tonic-gate #endif /* __amd64 */ 15700Sstevel@tonic-gate 15710Sstevel@tonic-gate *vap = va; 15720Sstevel@tonic-gate return (found_pte); 15730Sstevel@tonic-gate } 15740Sstevel@tonic-gate 15750Sstevel@tonic-gate /* 15760Sstevel@tonic-gate * Find the address and htable for the first populated translation at or 15770Sstevel@tonic-gate * above the given virtual address. The caller may also specify an upper 15780Sstevel@tonic-gate * limit to the address range to search. Uses level information to quickly 15790Sstevel@tonic-gate * skip unpopulated sections of virtual address spaces. 15800Sstevel@tonic-gate * 15810Sstevel@tonic-gate * If not found returns NULL. When found, returns the htable and virt addr 15820Sstevel@tonic-gate * and has a hold on the htable. 15830Sstevel@tonic-gate */ 15840Sstevel@tonic-gate x86pte_t 15850Sstevel@tonic-gate htable_walk( 15860Sstevel@tonic-gate struct hat *hat, 15870Sstevel@tonic-gate htable_t **htp, 15880Sstevel@tonic-gate uintptr_t *vaddr, 15890Sstevel@tonic-gate uintptr_t eaddr) 15900Sstevel@tonic-gate { 15910Sstevel@tonic-gate uintptr_t va = *vaddr; 15920Sstevel@tonic-gate htable_t *ht; 15930Sstevel@tonic-gate htable_t *prev = *htp; 15940Sstevel@tonic-gate level_t l; 15950Sstevel@tonic-gate level_t max_mapped_level; 15960Sstevel@tonic-gate x86pte_t pte; 15970Sstevel@tonic-gate 15980Sstevel@tonic-gate ASSERT(eaddr > va); 15990Sstevel@tonic-gate 16000Sstevel@tonic-gate /* 16010Sstevel@tonic-gate * If this is a user address, then we know we need not look beyond 16020Sstevel@tonic-gate * kernelbase. 16030Sstevel@tonic-gate */ 16040Sstevel@tonic-gate ASSERT(hat == kas.a_hat || eaddr <= kernelbase || 16050Sstevel@tonic-gate eaddr == HTABLE_WALK_TO_END); 16060Sstevel@tonic-gate if (hat != kas.a_hat && eaddr == HTABLE_WALK_TO_END) 16070Sstevel@tonic-gate eaddr = kernelbase; 16080Sstevel@tonic-gate 16090Sstevel@tonic-gate /* 16100Sstevel@tonic-gate * If we're coming in with a previous page table, search it first 16110Sstevel@tonic-gate * without doing an htable_lookup(), this should be frequent. 
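 *
 * Callers typically prime *htp to NULL, feed the returned htable back
 * in on each call and release it once at the end, along these lines
 * (an illustrative sketch only, not code from this file):
 *
 *	ht = NULL;
 *	for (va = start; va < end; va += LEVEL_SIZE(ht->ht_level)) {
 *		pte = htable_walk(hat, &ht, &va, end);
 *		if (ht == NULL)
 *			break;
 *		... examine pte, which maps va at level ht->ht_level ...
 *	}
 *	if (ht != NULL)
 *		htable_release(ht);
 *
 * (The loop increment only runs after an iteration that found a
 * mapping, so ht is non-NULL when LEVEL_SIZE(ht->ht_level) is used.)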
16120Sstevel@tonic-gate */ 16130Sstevel@tonic-gate if (prev) { 16140Sstevel@tonic-gate ASSERT(prev->ht_busy > 0); 16150Sstevel@tonic-gate ASSERT(prev->ht_vaddr <= va); 16160Sstevel@tonic-gate l = prev->ht_level; 16170Sstevel@tonic-gate if (va <= HTABLE_LAST_PAGE(prev)) { 16180Sstevel@tonic-gate pte = htable_scan(prev, &va, eaddr); 16190Sstevel@tonic-gate 16200Sstevel@tonic-gate if (PTE_ISPAGE(pte, l)) { 16210Sstevel@tonic-gate *vaddr = va; 16220Sstevel@tonic-gate *htp = prev; 16230Sstevel@tonic-gate return (pte); 16240Sstevel@tonic-gate } 16250Sstevel@tonic-gate } 16260Sstevel@tonic-gate 16270Sstevel@tonic-gate /* 16280Sstevel@tonic-gate * We found nothing in the htable provided by the caller, 16290Sstevel@tonic-gate * so fall through and do the full search 16300Sstevel@tonic-gate */ 16310Sstevel@tonic-gate htable_release(prev); 16320Sstevel@tonic-gate } 16330Sstevel@tonic-gate 16340Sstevel@tonic-gate /* 16350Sstevel@tonic-gate * Find the level of the largest pagesize used by this HAT. 16360Sstevel@tonic-gate */ 16374654Sjosephb if (hat->hat_ism_pgcnt > 0) { 16385349Skchow max_mapped_level = mmu.umax_page_level; 16394654Sjosephb } else { 16404654Sjosephb max_mapped_level = 0; 16414654Sjosephb for (l = 1; l <= mmu.max_page_level; ++l) 16424654Sjosephb if (hat->hat_pages_mapped[l] != 0) 16434654Sjosephb max_mapped_level = l; 16444654Sjosephb } 16450Sstevel@tonic-gate 16460Sstevel@tonic-gate while (va < eaddr && va >= *vaddr) { 16470Sstevel@tonic-gate ASSERT(!IN_VA_HOLE(va)); 16480Sstevel@tonic-gate 16490Sstevel@tonic-gate /* 16500Sstevel@tonic-gate * Find lowest table with any entry for given address. 16510Sstevel@tonic-gate */ 16520Sstevel@tonic-gate for (l = 0; l <= TOP_LEVEL(hat); ++l) { 16530Sstevel@tonic-gate ht = htable_lookup(hat, va, l); 16540Sstevel@tonic-gate if (ht != NULL) { 16550Sstevel@tonic-gate pte = htable_scan(ht, &va, eaddr); 16560Sstevel@tonic-gate if (PTE_ISPAGE(pte, l)) { 16570Sstevel@tonic-gate *vaddr = va; 16580Sstevel@tonic-gate *htp = ht; 16590Sstevel@tonic-gate return (pte); 16600Sstevel@tonic-gate } 16610Sstevel@tonic-gate htable_release(ht); 16620Sstevel@tonic-gate break; 16630Sstevel@tonic-gate } 16640Sstevel@tonic-gate 16650Sstevel@tonic-gate /* 16664654Sjosephb * No htable at this level for the address. If there 16674654Sjosephb * is no larger page size that could cover it, we can 16684654Sjosephb * skip right to the start of the next page table. 16694575Sdm120769 */ 16704575Sdm120769 ASSERT(l < TOP_LEVEL(hat)); 16714575Sdm120769 if (l >= max_mapped_level) { 16720Sstevel@tonic-gate va = NEXT_ENTRY_VA(va, l + 1); 16734654Sjosephb if (va >= eaddr) 16744654Sjosephb break; 16754575Sdm120769 } 16760Sstevel@tonic-gate } 16770Sstevel@tonic-gate } 16780Sstevel@tonic-gate 16790Sstevel@tonic-gate *vaddr = 0; 16800Sstevel@tonic-gate *htp = NULL; 16810Sstevel@tonic-gate return (0); 16820Sstevel@tonic-gate } 16830Sstevel@tonic-gate 16840Sstevel@tonic-gate /* 16850Sstevel@tonic-gate * Find the htable and page table entry index of the given virtual address 16860Sstevel@tonic-gate * with pagesize at or below given level. 16870Sstevel@tonic-gate * If not found returns NULL. When found, returns the htable, sets 16880Sstevel@tonic-gate * entry, and has a hold on the htable. 
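 *
 * The hold must be dropped with htable_release() when the caller is
 * done, e.g. (hypothetical caller, for illustration):
 *
 *	ht = htable_getpte(hat, va, &entry, &pte, 0);
 *	if (ht != NULL) {
 *		... use pte and entry ...
 *		htable_release(ht);
 *	}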
16890Sstevel@tonic-gate */ 16900Sstevel@tonic-gate htable_t * 16910Sstevel@tonic-gate htable_getpte( 16920Sstevel@tonic-gate struct hat *hat, 16930Sstevel@tonic-gate uintptr_t vaddr, 16940Sstevel@tonic-gate uint_t *entry, 16950Sstevel@tonic-gate x86pte_t *pte, 16960Sstevel@tonic-gate level_t level) 16970Sstevel@tonic-gate { 16980Sstevel@tonic-gate htable_t *ht; 16990Sstevel@tonic-gate level_t l; 17000Sstevel@tonic-gate uint_t e; 17010Sstevel@tonic-gate 17020Sstevel@tonic-gate ASSERT(level <= mmu.max_page_level); 17030Sstevel@tonic-gate 17040Sstevel@tonic-gate for (l = 0; l <= level; ++l) { 17050Sstevel@tonic-gate ht = htable_lookup(hat, vaddr, l); 17060Sstevel@tonic-gate if (ht == NULL) 17070Sstevel@tonic-gate continue; 17080Sstevel@tonic-gate e = htable_va2entry(vaddr, ht); 17090Sstevel@tonic-gate if (entry != NULL) 17100Sstevel@tonic-gate *entry = e; 17110Sstevel@tonic-gate if (pte != NULL) 17120Sstevel@tonic-gate *pte = x86pte_get(ht, e); 17130Sstevel@tonic-gate return (ht); 17140Sstevel@tonic-gate } 17150Sstevel@tonic-gate return (NULL); 17160Sstevel@tonic-gate } 17170Sstevel@tonic-gate 17180Sstevel@tonic-gate /* 17190Sstevel@tonic-gate * Find the htable and page table entry index of the given virtual address. 17200Sstevel@tonic-gate * There must be a valid page mapped at the given address. 17210Sstevel@tonic-gate * If not found returns NULL. When found, returns the htable, sets 17220Sstevel@tonic-gate * entry, and has a hold on the htable. 17230Sstevel@tonic-gate */ 17240Sstevel@tonic-gate htable_t * 17250Sstevel@tonic-gate htable_getpage(struct hat *hat, uintptr_t vaddr, uint_t *entry) 17260Sstevel@tonic-gate { 17270Sstevel@tonic-gate htable_t *ht; 17280Sstevel@tonic-gate uint_t e; 17290Sstevel@tonic-gate x86pte_t pte; 17300Sstevel@tonic-gate 17310Sstevel@tonic-gate ht = htable_getpte(hat, vaddr, &e, &pte, mmu.max_page_level); 17320Sstevel@tonic-gate if (ht == NULL) 17330Sstevel@tonic-gate return (NULL); 17340Sstevel@tonic-gate 17350Sstevel@tonic-gate if (entry) 17360Sstevel@tonic-gate *entry = e; 17370Sstevel@tonic-gate 17380Sstevel@tonic-gate if (PTE_ISPAGE(pte, ht->ht_level)) 17390Sstevel@tonic-gate return (ht); 17400Sstevel@tonic-gate htable_release(ht); 17410Sstevel@tonic-gate return (NULL); 17420Sstevel@tonic-gate } 17430Sstevel@tonic-gate 17440Sstevel@tonic-gate 17450Sstevel@tonic-gate void 17460Sstevel@tonic-gate htable_init() 17470Sstevel@tonic-gate { 17480Sstevel@tonic-gate /* 17490Sstevel@tonic-gate * To save on kernel VA usage, we avoid debug information in 32 bit 17500Sstevel@tonic-gate * kernels. 
17510Sstevel@tonic-gate */ 17520Sstevel@tonic-gate #if defined(__amd64) 17530Sstevel@tonic-gate int kmem_flags = KMC_NOHASH; 17540Sstevel@tonic-gate #elif defined(__i386) 17550Sstevel@tonic-gate int kmem_flags = KMC_NOHASH | KMC_NODEBUG; 17560Sstevel@tonic-gate #endif 17570Sstevel@tonic-gate 17580Sstevel@tonic-gate /* 17590Sstevel@tonic-gate * initialize kmem caches 17600Sstevel@tonic-gate */ 17610Sstevel@tonic-gate htable_cache = kmem_cache_create("htable_t", 17620Sstevel@tonic-gate sizeof (htable_t), 0, NULL, NULL, 17630Sstevel@tonic-gate htable_reap, NULL, hat_memload_arena, kmem_flags); 17640Sstevel@tonic-gate } 17650Sstevel@tonic-gate 17660Sstevel@tonic-gate /* 17670Sstevel@tonic-gate * get the pte index for the virtual address in the given htable's pagetable 17680Sstevel@tonic-gate */ 17690Sstevel@tonic-gate uint_t 17700Sstevel@tonic-gate htable_va2entry(uintptr_t va, htable_t *ht) 17710Sstevel@tonic-gate { 17720Sstevel@tonic-gate level_t l = ht->ht_level; 17730Sstevel@tonic-gate 17740Sstevel@tonic-gate ASSERT(va >= ht->ht_vaddr); 17750Sstevel@tonic-gate ASSERT(va <= HTABLE_LAST_PAGE(ht)); 17763446Smrj return ((va >> LEVEL_SHIFT(l)) & (HTABLE_NUM_PTES(ht) - 1)); 17770Sstevel@tonic-gate } 17780Sstevel@tonic-gate 17790Sstevel@tonic-gate /* 17800Sstevel@tonic-gate * Given an htable and the index of a pte in it, return the virtual address 17810Sstevel@tonic-gate * of the page. 17820Sstevel@tonic-gate */ 17830Sstevel@tonic-gate uintptr_t 17840Sstevel@tonic-gate htable_e2va(htable_t *ht, uint_t entry) 17850Sstevel@tonic-gate { 17860Sstevel@tonic-gate level_t l = ht->ht_level; 17870Sstevel@tonic-gate uintptr_t va; 17880Sstevel@tonic-gate 17893446Smrj ASSERT(entry < HTABLE_NUM_PTES(ht)); 17900Sstevel@tonic-gate va = ht->ht_vaddr + ((uintptr_t)entry << LEVEL_SHIFT(l)); 17910Sstevel@tonic-gate 17920Sstevel@tonic-gate /* 17930Sstevel@tonic-gate * Need to skip over any VA hole in top level table 17940Sstevel@tonic-gate */ 17950Sstevel@tonic-gate #if defined(__amd64) 17960Sstevel@tonic-gate if (ht->ht_level == mmu.max_level && va >= mmu.hole_start) 17970Sstevel@tonic-gate va += ((mmu.hole_end - mmu.hole_start) + 1); 17980Sstevel@tonic-gate #endif 17990Sstevel@tonic-gate 18000Sstevel@tonic-gate return (va); 18010Sstevel@tonic-gate } 18020Sstevel@tonic-gate 18030Sstevel@tonic-gate /* 18040Sstevel@tonic-gate * The code uses compare and swap instructions to read/write PTE's to 18050Sstevel@tonic-gate * avoid atomicity problems, since PTEs can be 8 bytes on 32 bit systems 18060Sstevel@tonic-gate * and a plain load or store of them won't naturally be atomic there. 18070Sstevel@tonic-gate * 18080Sstevel@tonic-gate * The combination of using kpreempt_disable()/_enable() and the hci_mutex 18090Sstevel@tonic-gate * is used to ensure that an interrupt won't overwrite a temporary mapping 18100Sstevel@tonic-gate * while it's in use. If an interrupt thread tries to access a PTE, it will 18110Sstevel@tonic-gate * yield briefly back to the pinned thread which holds the cpu's hci_mutex.
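 *
 * To make the atomicity issue concrete: on a 32 bit PAE kernel a PTE is
 * two 32 bit words.  If it were read with two plain loads while another
 * CPU changed it, the reader could combine the new low word with the
 * old high word and see a value that never existed in the pagetable.
 * Writers avoid publishing such values by using compare and swap, and
 * get_pte64() below re-reads until the low word is observed unchanged.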
18120Sstevel@tonic-gate */ 18130Sstevel@tonic-gate void 18143446Smrj x86pte_cpu_init(cpu_t *cpu) 18150Sstevel@tonic-gate { 18160Sstevel@tonic-gate struct hat_cpu_info *hci; 18170Sstevel@tonic-gate 18183446Smrj hci = kmem_zalloc(sizeof (*hci), KM_SLEEP); 18190Sstevel@tonic-gate mutex_init(&hci->hci_mutex, NULL, MUTEX_DEFAULT, NULL); 18200Sstevel@tonic-gate cpu->cpu_hat_info = hci; 18210Sstevel@tonic-gate } 18220Sstevel@tonic-gate 18233446Smrj void 18243446Smrj x86pte_cpu_fini(cpu_t *cpu) 18253446Smrj { 18263446Smrj struct hat_cpu_info *hci = cpu->cpu_hat_info; 18273446Smrj 18283446Smrj kmem_free(hci, sizeof (*hci)); 18293446Smrj cpu->cpu_hat_info = NULL; 18303446Smrj } 18313446Smrj 18323446Smrj #ifdef __i386 18330Sstevel@tonic-gate /* 18343446Smrj * On 32 bit kernels, loading a 64 bit PTE is a little tricky 18350Sstevel@tonic-gate */ 18363446Smrj x86pte_t 18373446Smrj get_pte64(x86pte_t *ptr) 18383446Smrj { 18393446Smrj volatile uint32_t *p = (uint32_t *)ptr; 18403446Smrj x86pte_t t; 18413446Smrj 18423446Smrj ASSERT(mmu.pae_hat != 0); 18433446Smrj for (;;) { 18443446Smrj t = p[0]; 18453446Smrj t |= (uint64_t)p[1] << 32; 18463446Smrj if ((t & 0xffffffff) == p[0]) 18473446Smrj return (t); 18483446Smrj } 18490Sstevel@tonic-gate } 18503446Smrj #endif /* __i386 */ 18510Sstevel@tonic-gate 18520Sstevel@tonic-gate /* 18530Sstevel@tonic-gate * Disable preemption and establish a mapping to the pagetable with the 18540Sstevel@tonic-gate * given pfn. This is optimized for the case where it's the same 18550Sstevel@tonic-gate * pfn as we last referenced from this CPU. 18560Sstevel@tonic-gate */ 18570Sstevel@tonic-gate static x86pte_t * 18583446Smrj x86pte_access_pagetable(htable_t *ht, uint_t index) 18590Sstevel@tonic-gate { 18600Sstevel@tonic-gate /* 18610Sstevel@tonic-gate * VLP pagetables are contained in the hat_t 18620Sstevel@tonic-gate */ 18630Sstevel@tonic-gate if (ht->ht_flags & HTABLE_VLP) 18643446Smrj return (PT_INDEX_PTR(ht->ht_hat->hat_vlp_ptes, index)); 18653446Smrj return (x86pte_mapin(ht->ht_pfn, index, ht)); 18663446Smrj } 18670Sstevel@tonic-gate 18683446Smrj /* 18693446Smrj * Map the given pfn into the page table window. 18703446Smrj */ 18713446Smrj /*ARGSUSED*/ 18723446Smrj x86pte_t * 18733446Smrj x86pte_mapin(pfn_t pfn, uint_t index, htable_t *ht) 18743446Smrj { 18753446Smrj x86pte_t *pteptr; 18765217Sjosephb x86pte_t pte = 0; 18773446Smrj x86pte_t newpte; 18783446Smrj int x; 18793446Smrj 18800Sstevel@tonic-gate ASSERT(pfn != PFN_INVALID); 18810Sstevel@tonic-gate 18820Sstevel@tonic-gate if (!khat_running) { 18833446Smrj caddr_t va = kbm_remap_window(pfn_to_pa(pfn), 1); 18843446Smrj return (PT_INDEX_PTR(va, index)); 18850Sstevel@tonic-gate } 18860Sstevel@tonic-gate 18870Sstevel@tonic-gate /* 18883446Smrj * If kpm is available, use it.
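 *
 * With segkpm every physical page already has a permanent kernel
 * mapping, so the pagetable can be read through hat_kpm_pfn2va(pfn)
 * directly, with no per-CPU window, no hci_mutex and no TLB flush.
 * (For illustration, assuming PT_INDEX_PTR() simply advances the
 * address by index PTE-sized slots: entry 3 of a PAE pagetable lies
 * 3 * 8 = 24 bytes past the start of the page.)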
18893446Smrj */ 18903446Smrj if (kpm_vbase) 18913446Smrj return (PT_INDEX_PTR(hat_kpm_pfn2va(pfn), index)); 18923446Smrj 18933446Smrj /* 18943446Smrj * Disable preemption and grab the CPU's hci_mutex 18950Sstevel@tonic-gate */ 18960Sstevel@tonic-gate kpreempt_disable(); 18973446Smrj ASSERT(CPU->cpu_hat_info != NULL); 18983446Smrj mutex_enter(&CPU->cpu_hat_info->hci_mutex); 18993446Smrj x = PWIN_TABLE(CPU->cpu_id); 19003446Smrj pteptr = (x86pte_t *)PWIN_PTE_VA(x); 19015217Sjosephb #ifndef __xpv 19023446Smrj if (mmu.pae_hat) 19033446Smrj pte = *pteptr; 19043446Smrj else 19053446Smrj pte = *(x86pte32_t *)pteptr; 19065217Sjosephb #endif 19073446Smrj 19083446Smrj newpte = MAKEPTE(pfn, 0) | mmu.pt_global | mmu.pt_nx; 19095084Sjohnlev 19105084Sjohnlev /* 19115084Sjohnlev * For hardware we can use a writable mapping. 19125084Sjohnlev */ 19135084Sjohnlev #ifdef __xpv 19145084Sjohnlev if (IN_XPV_PANIC()) 19155084Sjohnlev #endif 19165084Sjohnlev newpte |= PT_WRITABLE; 19173446Smrj 19183446Smrj if (!PTE_EQUIV(newpte, pte)) { 19195084Sjohnlev 19205084Sjohnlev #ifdef __xpv 19215084Sjohnlev if (!IN_XPV_PANIC()) { 19225084Sjohnlev xen_map(newpte, PWIN_VA(x)); 19235084Sjohnlev } else 19245084Sjohnlev #endif 19255084Sjohnlev { 19265084Sjohnlev XPV_ALLOW_PAGETABLE_UPDATES(); 19275084Sjohnlev if (mmu.pae_hat) 19285084Sjohnlev *pteptr = newpte; 19295084Sjohnlev else 19305084Sjohnlev *(x86pte32_t *)pteptr = newpte; 19315084Sjohnlev XPV_DISALLOW_PAGETABLE_UPDATES(); 19325084Sjohnlev mmu_tlbflush_entry((caddr_t)(PWIN_VA(x))); 19335084Sjohnlev } 19340Sstevel@tonic-gate } 19353446Smrj return (PT_INDEX_PTR(PWIN_VA(x), index)); 19360Sstevel@tonic-gate } 19370Sstevel@tonic-gate 19380Sstevel@tonic-gate /* 19390Sstevel@tonic-gate * Release access to a page table. 19400Sstevel@tonic-gate */ 19410Sstevel@tonic-gate static void 19420Sstevel@tonic-gate x86pte_release_pagetable(htable_t *ht) 19430Sstevel@tonic-gate { 19440Sstevel@tonic-gate /* 19450Sstevel@tonic-gate * nothing to do for VLP htables 19460Sstevel@tonic-gate */ 19470Sstevel@tonic-gate if (ht->ht_flags & HTABLE_VLP) 19480Sstevel@tonic-gate return; 19490Sstevel@tonic-gate 19503446Smrj x86pte_mapout(); 19513446Smrj } 19523446Smrj 19533446Smrj void 19543446Smrj x86pte_mapout(void) 19553446Smrj { 19565084Sjohnlev if (kpm_vbase != NULL || !khat_running) 19570Sstevel@tonic-gate return; 19580Sstevel@tonic-gate 19590Sstevel@tonic-gate /* 19603446Smrj * Drop the CPU's hci_mutex and restore preemption. 19610Sstevel@tonic-gate */ 19625217Sjosephb #ifdef __xpv 19635217Sjosephb if (!IN_XPV_PANIC()) { 19645217Sjosephb uintptr_t va; 19655217Sjosephb 19665217Sjosephb /* 19675217Sjosephb * We need to always clear the mapping in case a page 19685217Sjosephb * that was once a page table page is ballooned out. 
19695217Sjosephb */ 19705217Sjosephb va = (uintptr_t)PWIN_VA(PWIN_TABLE(CPU->cpu_id)); 19715217Sjosephb (void) HYPERVISOR_update_va_mapping(va, 0, 19725217Sjosephb UVMF_INVLPG | UVMF_LOCAL); 19735217Sjosephb } 19745217Sjosephb #endif 19753446Smrj mutex_exit(&CPU->cpu_hat_info->hci_mutex); 19760Sstevel@tonic-gate kpreempt_enable(); 19770Sstevel@tonic-gate } 19780Sstevel@tonic-gate 19790Sstevel@tonic-gate /* 19800Sstevel@tonic-gate * Atomic retrieval of a pagetable entry 19810Sstevel@tonic-gate */ 19820Sstevel@tonic-gate x86pte_t 19830Sstevel@tonic-gate x86pte_get(htable_t *ht, uint_t entry) 19840Sstevel@tonic-gate { 19850Sstevel@tonic-gate x86pte_t pte; 198647Sjosephb x86pte_t *ptep; 19870Sstevel@tonic-gate 19880Sstevel@tonic-gate /* 198947Sjosephb * Be careful that loading PAE entries in 32 bit kernel is atomic. 19900Sstevel@tonic-gate */ 19913446Smrj ASSERT(entry < mmu.ptes_per_table); 19923446Smrj ptep = x86pte_access_pagetable(ht, entry); 19933446Smrj pte = GET_PTE(ptep); 19940Sstevel@tonic-gate x86pte_release_pagetable(ht); 19950Sstevel@tonic-gate return (pte); 19960Sstevel@tonic-gate } 19970Sstevel@tonic-gate 19980Sstevel@tonic-gate /* 19990Sstevel@tonic-gate * Atomic unconditional set of a page table entry, it returns the previous 20003446Smrj * value. For pre-existing mappings if the PFN changes, then we don't care 20013446Smrj * about the old pte's REF / MOD bits. If the PFN remains the same, we leave 20023446Smrj * the MOD/REF bits unchanged. 20033446Smrj * 20043446Smrj * If asked to overwrite a link to a lower page table with a large page 20053446Smrj * mapping, this routine returns the special value of LPAGE_ERROR. This 20063446Smrj * allows the upper HAT layers to retry with a smaller mapping size. 20070Sstevel@tonic-gate */ 20080Sstevel@tonic-gate x86pte_t 20090Sstevel@tonic-gate x86pte_set(htable_t *ht, uint_t entry, x86pte_t new, void *ptr) 20100Sstevel@tonic-gate { 20110Sstevel@tonic-gate x86pte_t old; 20123446Smrj x86pte_t prev; 20130Sstevel@tonic-gate x86pte_t *ptep; 20143446Smrj level_t l = ht->ht_level; 20153446Smrj x86pte_t pfn_mask = (l != 0) ? PT_PADDR_LGPG : PT_PADDR; 20163446Smrj x86pte_t n; 20173446Smrj uintptr_t addr = htable_e2va(ht, entry); 20183446Smrj hat_t *hat = ht->ht_hat; 20190Sstevel@tonic-gate 20203446Smrj ASSERT(new != 0); /* don't use to invalidate a PTE, see x86pte_update */ 20210Sstevel@tonic-gate ASSERT(!(ht->ht_flags & HTABLE_SHARED_PFN)); 20223446Smrj if (ptr == NULL) 20233446Smrj ptep = x86pte_access_pagetable(ht, entry); 20243446Smrj else 20250Sstevel@tonic-gate ptep = ptr; 20263446Smrj 20273446Smrj /* 20283446Smrj * Install the new PTE. If remapping the same PFN, then 20293446Smrj * copy existing REF/MOD bits to new mapping. 20303446Smrj */ 20313446Smrj do { 20323446Smrj prev = GET_PTE(ptep); 20333446Smrj n = new; 20343446Smrj if (PTE_ISVALID(n) && (prev & pfn_mask) == (new & pfn_mask)) 20353446Smrj n |= prev & (PT_REF | PT_MOD); 20360Sstevel@tonic-gate 20373446Smrj /* 20383446Smrj * Another thread may have installed this mapping already, 20393446Smrj * flush the local TLB and be done. 
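 *
 * For example, two threads can fault on the same address at nearly the
 * same time and both try to install the identical translation.  The
 * loser of that race finds prev == n, so the pagetable itself needs no
 * change; at most this CPU's TLB entry could be stale, hence the purely
 * local flush rather than a cross-call shootdown.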
20403446Smrj */ 20413446Smrj if (prev == n) { 20423446Smrj old = new; 20435084Sjohnlev #ifdef __xpv 20445084Sjohnlev if (!IN_XPV_PANIC()) 20455084Sjohnlev xen_flush_va((caddr_t)addr); 20465084Sjohnlev else 20475084Sjohnlev #endif 20485084Sjohnlev mmu_tlbflush_entry((caddr_t)addr); 20493446Smrj goto done; 20500Sstevel@tonic-gate } 20513446Smrj 20523446Smrj /* 20533446Smrj * Detect if we have a collision of installing a large 20543446Smrj * page mapping where there already is a lower page table. 20553446Smrj */ 20563543Sjosephb if (l > 0 && (prev & PT_VALID) && !(prev & PT_PAGESIZE)) { 20573543Sjosephb old = LPAGE_ERROR; 20583543Sjosephb goto done; 20593543Sjosephb } 20603446Smrj 20615084Sjohnlev XPV_ALLOW_PAGETABLE_UPDATES(); 20623446Smrj old = CAS_PTE(ptep, prev, n); 20635084Sjohnlev XPV_DISALLOW_PAGETABLE_UPDATES(); 20643446Smrj } while (old != prev); 20653446Smrj 20663446Smrj /* 20673446Smrj * Do a TLB demap if needed, ie. the old pte was valid. 20683446Smrj * 20693446Smrj * Note that a stale TLB writeback to the PTE here either can't happen 20703446Smrj * or doesn't matter. The PFN can only change for NOSYNC|NOCONSIST 20713446Smrj * mappings, but they were created with REF and MOD already set, so 20723446Smrj * no stale writeback will happen. 20733446Smrj * 20743446Smrj * Segmap is the only place where remaps happen on the same pfn and for 20753446Smrj * that we want to preserve the stale REF/MOD bits. 20763446Smrj */ 20773446Smrj if (old & PT_REF) 20783446Smrj hat_tlb_inval(hat, addr); 20793446Smrj 20803446Smrj done: 20810Sstevel@tonic-gate if (ptr == NULL) 20820Sstevel@tonic-gate x86pte_release_pagetable(ht); 20830Sstevel@tonic-gate return (old); 20840Sstevel@tonic-gate } 20850Sstevel@tonic-gate 20860Sstevel@tonic-gate /* 20873446Smrj * Atomic compare and swap of a page table entry. No TLB invalidates are done. 20883446Smrj * This is used for links between pagetables of different levels. 20893446Smrj * Note we always create these links with dirty/access set, so they should 20903446Smrj * never change. 20910Sstevel@tonic-gate */ 20923446Smrj x86pte_t 20930Sstevel@tonic-gate x86pte_cas(htable_t *ht, uint_t entry, x86pte_t old, x86pte_t new) 20940Sstevel@tonic-gate { 20950Sstevel@tonic-gate x86pte_t pte; 20960Sstevel@tonic-gate x86pte_t *ptep; 20975084Sjohnlev #ifdef __xpv 20985084Sjohnlev /* 20995084Sjohnlev * We can't use writable pagetables for upper level tables, so fake it. 21005084Sjohnlev */ 21015084Sjohnlev mmu_update_t t[2]; 21025084Sjohnlev int cnt = 1; 21035084Sjohnlev int count; 21045084Sjohnlev maddr_t ma; 21050Sstevel@tonic-gate 21065084Sjohnlev if (!IN_XPV_PANIC()) { 21075084Sjohnlev ASSERT(!(ht->ht_flags & HTABLE_VLP)); /* no VLP yet */ 21085084Sjohnlev ma = pa_to_ma(PT_INDEX_PHYSADDR(pfn_to_pa(ht->ht_pfn), entry)); 21095084Sjohnlev t[0].ptr = ma | MMU_NORMAL_PT_UPDATE; 21105084Sjohnlev t[0].val = new; 21115084Sjohnlev 21125084Sjohnlev #if defined(__amd64) 21135084Sjohnlev /* 21145084Sjohnlev * On the 64-bit hypervisor we need to maintain the user mode 21155084Sjohnlev * top page table too. 
21165084Sjohnlev */ 21175084Sjohnlev if (ht->ht_level == mmu.max_level && ht->ht_hat != kas.a_hat) { 21185084Sjohnlev ma = pa_to_ma(PT_INDEX_PHYSADDR(pfn_to_pa( 21195084Sjohnlev ht->ht_hat->hat_user_ptable), entry)); 21205084Sjohnlev t[1].ptr = ma | MMU_NORMAL_PT_UPDATE; 21215084Sjohnlev t[1].val = new; 21225084Sjohnlev ++cnt; 21235084Sjohnlev } 21245084Sjohnlev #endif /* __amd64 */ 21255084Sjohnlev 21265084Sjohnlev if (HYPERVISOR_mmu_update(t, cnt, &count, DOMID_SELF)) 21275084Sjohnlev panic("HYPERVISOR_mmu_update() failed"); 21285084Sjohnlev ASSERT(count == cnt); 21295084Sjohnlev return (old); 21305084Sjohnlev } 21315084Sjohnlev #endif 21323446Smrj ptep = x86pte_access_pagetable(ht, entry); 21335084Sjohnlev XPV_ALLOW_PAGETABLE_UPDATES(); 21343446Smrj pte = CAS_PTE(ptep, old, new); 21355084Sjohnlev XPV_DISALLOW_PAGETABLE_UPDATES(); 21360Sstevel@tonic-gate x86pte_release_pagetable(ht); 21370Sstevel@tonic-gate return (pte); 21380Sstevel@tonic-gate } 21390Sstevel@tonic-gate 21400Sstevel@tonic-gate /* 21413446Smrj * Invalidate a page table entry as long as it currently maps something that 21423446Smrj * matches the value determined by expect. 21433446Smrj * 21443446Smrj * Also invalidates any TLB entries and returns the previous value of the PTE. 21450Sstevel@tonic-gate */ 21463446Smrj x86pte_t 21473446Smrj x86pte_inval( 21483446Smrj htable_t *ht, 21493446Smrj uint_t entry, 21503446Smrj x86pte_t expect, 21513446Smrj x86pte_t *pte_ptr) 21520Sstevel@tonic-gate { 21533446Smrj x86pte_t *ptep; 21544191Sjosephb x86pte_t oldpte; 21554191Sjosephb x86pte_t found; 21560Sstevel@tonic-gate 21573446Smrj ASSERT(!(ht->ht_flags & HTABLE_SHARED_PFN)); 21585349Skchow ASSERT(ht->ht_level <= mmu.max_page_level); 21593543Sjosephb 21603446Smrj if (pte_ptr != NULL) 21613446Smrj ptep = pte_ptr; 21623446Smrj else 21633446Smrj ptep = x86pte_access_pagetable(ht, entry); 21640Sstevel@tonic-gate 21655084Sjohnlev #if defined(__xpv) 21665084Sjohnlev /* 21675084Sjohnlev * If exit()ing just use HYPERVISOR_mmu_update(), as we can't be racing 21685084Sjohnlev * with anything else. 21695084Sjohnlev */ 21705084Sjohnlev if ((ht->ht_hat->hat_flags & HAT_FREEING) && !IN_XPV_PANIC()) { 21715084Sjohnlev int count; 21725084Sjohnlev mmu_update_t t[1]; 21735084Sjohnlev maddr_t ma; 21745084Sjohnlev 21755084Sjohnlev oldpte = GET_PTE(ptep); 21765084Sjohnlev if (expect != 0 && (oldpte & PT_PADDR) != (expect & PT_PADDR)) 21775084Sjohnlev goto done; 21785084Sjohnlev ma = pa_to_ma(PT_INDEX_PHYSADDR(pfn_to_pa(ht->ht_pfn), entry)); 21795084Sjohnlev t[0].ptr = ma | MMU_NORMAL_PT_UPDATE; 21805084Sjohnlev t[0].val = 0; 21815084Sjohnlev if (HYPERVISOR_mmu_update(t, 1, &count, DOMID_SELF)) 21825084Sjohnlev panic("HYPERVISOR_mmu_update() failed"); 21835084Sjohnlev ASSERT(count == 1); 21845084Sjohnlev goto done; 21855084Sjohnlev } 21865084Sjohnlev #endif /* __xpv */ 21875084Sjohnlev 21880Sstevel@tonic-gate /* 21893543Sjosephb * Note that the loop is needed to handle changes due to h/w updating 21903543Sjosephb * of PT_MOD/PT_REF. 
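 *
 * Concretely: after GET_PTE() samples oldpte, another CPU can write
 * through a still-valid TLB entry and the MMU will set PT_MOD (or
 * PT_REF) in the in-memory PTE.  The CAS then fails because the PTE no
 * longer equals oldpte, and the retry re-samples it, so the value
 * finally returned reflects the bits the hardware set.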
21910Sstevel@tonic-gate */ 21923446Smrj do { 21934191Sjosephb oldpte = GET_PTE(ptep); 21944191Sjosephb if (expect != 0 && (oldpte & PT_PADDR) != (expect & PT_PADDR)) 21954191Sjosephb goto done; 21965084Sjohnlev XPV_ALLOW_PAGETABLE_UPDATES(); 21974191Sjosephb found = CAS_PTE(ptep, oldpte, 0); 21985084Sjohnlev XPV_DISALLOW_PAGETABLE_UPDATES(); 21994191Sjosephb } while (found != oldpte); 22004191Sjosephb if (oldpte & (PT_REF | PT_MOD)) 22014191Sjosephb hat_tlb_inval(ht->ht_hat, htable_e2va(ht, entry)); 22020Sstevel@tonic-gate 22034191Sjosephb done: 22043446Smrj if (pte_ptr == NULL) 22053446Smrj x86pte_release_pagetable(ht); 22064191Sjosephb return (oldpte); 22070Sstevel@tonic-gate } 22080Sstevel@tonic-gate 22090Sstevel@tonic-gate /* 22103446Smrj * Change a page table entry if it currently matches the value in expect. 22110Sstevel@tonic-gate */ 22120Sstevel@tonic-gate x86pte_t 22133446Smrj x86pte_update( 22143446Smrj htable_t *ht, 22153446Smrj uint_t entry, 22163446Smrj x86pte_t expect, 22173446Smrj x86pte_t new) 22180Sstevel@tonic-gate { 22190Sstevel@tonic-gate x86pte_t *ptep; 22203446Smrj x86pte_t found; 22210Sstevel@tonic-gate 22223446Smrj ASSERT(new != 0); 22233446Smrj ASSERT(!(ht->ht_flags & HTABLE_SHARED_PFN)); 22245349Skchow ASSERT(ht->ht_level <= mmu.max_page_level); 22250Sstevel@tonic-gate 22263446Smrj ptep = x86pte_access_pagetable(ht, entry); 22275084Sjohnlev XPV_ALLOW_PAGETABLE_UPDATES(); 22283446Smrj found = CAS_PTE(ptep, expect, new); 22295084Sjohnlev XPV_DISALLOW_PAGETABLE_UPDATES(); 22303446Smrj if (found == expect) { 22313446Smrj hat_tlb_inval(ht->ht_hat, htable_e2va(ht, entry)); 22320Sstevel@tonic-gate 22333446Smrj /* 22343446Smrj * When removing write permission *and* clearing the 22353446Smrj * MOD bit, check if a write happened via a stale 22363446Smrj * TLB entry before the TLB shootdown finished. 22373446Smrj * 22383446Smrj * If it did happen, simply re-enable write permission and 22393446Smrj * act like the original CAS failed. 22403446Smrj */ 22413446Smrj if ((expect & (PT_WRITABLE | PT_MOD)) == PT_WRITABLE && 22423446Smrj (new & (PT_WRITABLE | PT_MOD)) == 0 && 22433446Smrj (GET_PTE(ptep) & PT_MOD) != 0) { 22443446Smrj do { 22453446Smrj found = GET_PTE(ptep); 22465084Sjohnlev XPV_ALLOW_PAGETABLE_UPDATES(); 22473446Smrj found = 22483446Smrj CAS_PTE(ptep, found, found | PT_WRITABLE); 22495084Sjohnlev XPV_DISALLOW_PAGETABLE_UPDATES(); 22503446Smrj } while ((found & PT_WRITABLE) == 0); 22513446Smrj } 22523446Smrj } 22530Sstevel@tonic-gate x86pte_release_pagetable(ht); 22543446Smrj return (found); 22550Sstevel@tonic-gate } 22560Sstevel@tonic-gate 22575084Sjohnlev #ifndef __xpv 22580Sstevel@tonic-gate /* 22590Sstevel@tonic-gate * Copy page tables - this is just a little more complicated than the 22600Sstevel@tonic-gate * previous routines. Note that it's also not atomic! It is never 22610Sstevel@tonic-gate * used for VLP pagetables.
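 *
 * The amount moved is just count PTE slots, i.e.
 * count << mmu.pte_size_shift bytes: for example copying 256 entries
 * of a PAE pagetable (8 byte PTEs) transfers 256 * 8 = 2048 bytes.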
22620Sstevel@tonic-gate */ 22630Sstevel@tonic-gate void 22640Sstevel@tonic-gate x86pte_copy(htable_t *src, htable_t *dest, uint_t entry, uint_t count) 22650Sstevel@tonic-gate { 22660Sstevel@tonic-gate caddr_t src_va; 22670Sstevel@tonic-gate caddr_t dst_va; 22680Sstevel@tonic-gate size_t size; 22693446Smrj x86pte_t *pteptr; 22703446Smrj x86pte_t pte; 22710Sstevel@tonic-gate 22720Sstevel@tonic-gate ASSERT(khat_running); 22730Sstevel@tonic-gate ASSERT(!(dest->ht_flags & HTABLE_VLP)); 22740Sstevel@tonic-gate ASSERT(!(src->ht_flags & HTABLE_VLP)); 22750Sstevel@tonic-gate ASSERT(!(src->ht_flags & HTABLE_SHARED_PFN)); 22760Sstevel@tonic-gate ASSERT(!(dest->ht_flags & HTABLE_SHARED_PFN)); 22770Sstevel@tonic-gate 22780Sstevel@tonic-gate /* 22793446Smrj * Acquire access to the CPU pagetable windows for the dest and source. 22800Sstevel@tonic-gate */ 22813446Smrj dst_va = (caddr_t)x86pte_access_pagetable(dest, entry); 22823446Smrj if (kpm_vbase) { 22833446Smrj src_va = (caddr_t) 22843446Smrj PT_INDEX_PTR(hat_kpm_pfn2va(src->ht_pfn), entry); 22850Sstevel@tonic-gate } else { 22863446Smrj uint_t x = PWIN_SRC(CPU->cpu_id); 22870Sstevel@tonic-gate 22880Sstevel@tonic-gate /* 22890Sstevel@tonic-gate * Finish defining the src pagetable mapping 22900Sstevel@tonic-gate */ 22913446Smrj src_va = (caddr_t)PT_INDEX_PTR(PWIN_VA(x), entry); 22923446Smrj pte = MAKEPTE(src->ht_pfn, 0) | mmu.pt_global | mmu.pt_nx; 22933446Smrj pteptr = (x86pte_t *)PWIN_PTE_VA(x); 22943446Smrj if (mmu.pae_hat) 22953446Smrj *pteptr = pte; 22963446Smrj else 22973446Smrj *(x86pte32_t *)pteptr = pte; 22983446Smrj mmu_tlbflush_entry((caddr_t)(PWIN_VA(x))); 22990Sstevel@tonic-gate } 23000Sstevel@tonic-gate 23010Sstevel@tonic-gate /* 23020Sstevel@tonic-gate * now do the copy 23030Sstevel@tonic-gate */ 23040Sstevel@tonic-gate size = count << mmu.pte_size_shift; 23050Sstevel@tonic-gate bcopy(src_va, dst_va, size); 23060Sstevel@tonic-gate 23070Sstevel@tonic-gate x86pte_release_pagetable(dest); 23080Sstevel@tonic-gate } 23090Sstevel@tonic-gate 23105084Sjohnlev #else /* __xpv */ 23115084Sjohnlev 23125084Sjohnlev /* 23135084Sjohnlev * The hypervisor only supports writable pagetables at level 0, so we have 23145084Sjohnlev * to install these 1 by 1 the slow way. 
23155084Sjohnlev */ 23165084Sjohnlev void 23175084Sjohnlev x86pte_copy(htable_t *src, htable_t *dest, uint_t entry, uint_t count) 23185084Sjohnlev { 23195084Sjohnlev caddr_t src_va; 23205084Sjohnlev x86pte_t pte; 23215084Sjohnlev 23225084Sjohnlev ASSERT(!IN_XPV_PANIC()); 23235084Sjohnlev src_va = (caddr_t)x86pte_access_pagetable(src, entry); 23245084Sjohnlev while (count) { 23255084Sjohnlev if (mmu.pae_hat) 23265084Sjohnlev pte = *(x86pte_t *)src_va; 23275084Sjohnlev else 23285084Sjohnlev pte = *(x86pte32_t *)src_va; 23295084Sjohnlev if (pte != 0) { 23305084Sjohnlev set_pteval(pfn_to_pa(dest->ht_pfn), entry, 23315084Sjohnlev dest->ht_level, pte); 23325084Sjohnlev #ifdef __amd64 23335084Sjohnlev if (dest->ht_level == mmu.max_level && 23345084Sjohnlev htable_e2va(dest, entry) < HYPERVISOR_VIRT_END) 23355084Sjohnlev set_pteval( 23365084Sjohnlev pfn_to_pa(dest->ht_hat->hat_user_ptable), 23375084Sjohnlev entry, dest->ht_level, pte); 23385084Sjohnlev #endif 23395084Sjohnlev } 23405084Sjohnlev --count; 23415084Sjohnlev ++entry; 23425084Sjohnlev src_va += mmu.pte_size; 23435084Sjohnlev } 23445084Sjohnlev x86pte_release_pagetable(src); 23455084Sjohnlev } 23465084Sjohnlev #endif /* __xpv */ 23475084Sjohnlev 23480Sstevel@tonic-gate /* 23490Sstevel@tonic-gate * Zero page table entries - Note this doesn't use atomic stores! 23500Sstevel@tonic-gate */ 23513446Smrj static void 23520Sstevel@tonic-gate x86pte_zero(htable_t *dest, uint_t entry, uint_t count) 23530Sstevel@tonic-gate { 23540Sstevel@tonic-gate caddr_t dst_va; 23550Sstevel@tonic-gate size_t size; 23565084Sjohnlev #ifdef __xpv 23575084Sjohnlev int x; 23585084Sjohnlev x86pte_t newpte; 23595084Sjohnlev #endif 23600Sstevel@tonic-gate 23610Sstevel@tonic-gate /* 23620Sstevel@tonic-gate * Map in the page table to be zeroed. 23630Sstevel@tonic-gate */ 23640Sstevel@tonic-gate ASSERT(!(dest->ht_flags & HTABLE_SHARED_PFN)); 23650Sstevel@tonic-gate ASSERT(!(dest->ht_flags & HTABLE_VLP)); 23663446Smrj 23675084Sjohnlev /* 23685084Sjohnlev * On the hypervisor we don't use x86pte_access_pagetable() since 23695084Sjohnlev * in this case the page is not pinned yet. 
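 *
 * (A pagetable page that hasn't been pinned yet is still an ordinary
 * page to the hypervisor, so the xen_map() below may map it writable;
 * once the table is pinned such a writable mapping would no longer be
 * allowed.)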
23705084Sjohnlev */ 23715084Sjohnlev #ifdef __xpv 23725084Sjohnlev if (kpm_vbase == NULL) { 23735084Sjohnlev kpreempt_disable(); 23745084Sjohnlev ASSERT(CPU->cpu_hat_info != NULL); 23755084Sjohnlev mutex_enter(&CPU->cpu_hat_info->hci_mutex); 23765084Sjohnlev x = PWIN_TABLE(CPU->cpu_id); 23775084Sjohnlev newpte = MAKEPTE(dest->ht_pfn, 0) | PT_WRITABLE; 23785084Sjohnlev xen_map(newpte, PWIN_VA(x)); 23795084Sjohnlev dst_va = (caddr_t)PT_INDEX_PTR(PWIN_VA(x), entry); 23805084Sjohnlev } else 23815084Sjohnlev #endif 23825084Sjohnlev dst_va = (caddr_t)x86pte_access_pagetable(dest, entry); 23833446Smrj 23840Sstevel@tonic-gate size = count << mmu.pte_size_shift; 23853446Smrj ASSERT(size > BLOCKZEROALIGN); 23863446Smrj #ifdef __i386 23873446Smrj if ((x86_feature & X86_SSE2) == 0) 23880Sstevel@tonic-gate bzero(dst_va, size); 23893446Smrj else 23903446Smrj #endif 23913446Smrj block_zero_no_xmm(dst_va, size); 23923446Smrj 23935084Sjohnlev #ifdef __xpv 23945084Sjohnlev if (kpm_vbase == NULL) { 23955084Sjohnlev xen_map(0, PWIN_VA(x)); 23965084Sjohnlev mutex_exit(&CPU->cpu_hat_info->hci_mutex); 23975084Sjohnlev kpreempt_enable(); 23985084Sjohnlev } else 23995084Sjohnlev #endif 24005084Sjohnlev x86pte_release_pagetable(dest); 24010Sstevel@tonic-gate } 24020Sstevel@tonic-gate 24030Sstevel@tonic-gate /* 24040Sstevel@tonic-gate * Called to ensure that all pagetables are in the system dump 24050Sstevel@tonic-gate */ 24060Sstevel@tonic-gate void 24070Sstevel@tonic-gate hat_dump(void) 24080Sstevel@tonic-gate { 24090Sstevel@tonic-gate hat_t *hat; 24100Sstevel@tonic-gate uint_t h; 24110Sstevel@tonic-gate htable_t *ht; 24120Sstevel@tonic-gate 24130Sstevel@tonic-gate /* 24141747Sjosephb * Dump all page tables 24150Sstevel@tonic-gate */ 24161747Sjosephb for (hat = kas.a_hat; hat != NULL; hat = hat->hat_next) { 24170Sstevel@tonic-gate for (h = 0; h < hat->hat_num_hash; ++h) { 24180Sstevel@tonic-gate for (ht = hat->hat_ht_hash[h]; ht; ht = ht->ht_next) { 24191747Sjosephb if ((ht->ht_flags & HTABLE_VLP) == 0) 24200Sstevel@tonic-gate dump_page(ht->ht_pfn); 24210Sstevel@tonic-gate } 24220Sstevel@tonic-gate } 24230Sstevel@tonic-gate } 24240Sstevel@tonic-gate }