/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/types.h>
#include <sys/systm.h>
#include <sys/archsystm.h>
#include <sys/debug.h>
#include <sys/bootconf.h>
#include <sys/bootsvcs.h>
#include <sys/bootinfo.h>
#include <sys/mman.h>
#include <sys/cmn_err.h>
#include <sys/param.h>
#include <sys/machparam.h>
#include <sys/machsystm.h>
#include <sys/promif.h>
#include <sys/kobj.h>
#ifdef __xpv
#include <sys/hypervisor.h>
#endif
#include <vm/kboot_mmu.h>
#include <vm/hat_pte.h>
#include <vm/hat_i86.h>
#include <vm/seg_kmem.h>

#if 0
/*
 * Joe's debug printing
 */
#define	DBG(x) \
	bop_printf(NULL, "%M%: %s is %" PRIx64 "\n", #x, (uint64_t)(x));
#else
#define	DBG(x)	/* naught */
#endif

/*
 * Page table and memory stuff.
 */
static caddr_t window;		/* VA used to temporarily map a physical page */
static caddr_t pte_to_window;	/* VA of the PTE that maps the window itself */

/*
 * These are needed by mmu_init().
 */
int kbm_nx_support = 0;		/* NX bit in PTEs is in use */
int kbm_pae_support = 0;	/* PAE (64 bit page table entries) in use */
int kbm_pge_support = 0;	/* PGE (page table global bit) enabled */
int kbm_largepage_support = 0;
uint_t kbm_nucleus_size = 0;

#define	BOOT_SHIFT(l)	(shift_amt[l])
#define	BOOT_SZ(l)	((size_t)1 << BOOT_SHIFT(l))
#define	BOOT_OFFSET(l)	(BOOT_SZ(l) - 1)
#define	BOOT_MASK(l)	(~BOOT_OFFSET(l))

/*
 * Initialize memory management parameters for boot time page table management
 */
void
kbm_init(struct xboot_info *bi)
{
	/*
	 * configure mmu information
	 */
	kbm_nucleus_size = (uintptr_t)bi->bi_kseg_size;
	kbm_largepage_support = bi->bi_use_largepage;
	kbm_nx_support = bi->bi_use_nx;
	kbm_pae_support = bi->bi_use_pae;
	kbm_pge_support = bi->bi_use_pge;
	window = bi->bi_pt_window;
	DBG(window);
	pte_to_window = bi->bi_pte_to_pt_window;
	DBG(pte_to_window);
	if (kbm_pae_support) {
		shift_amt = shift_amt_pae;
		ptes_per_table = 512;
		pte_size = 8;
		lpagesize = TWO_MEG;
#ifdef __amd64
		top_level = 3;
#else
		top_level = 2;
#endif
	} else {
		shift_amt = shift_amt_nopae;
		ptes_per_table = 1024;
		pte_size = 4;
		lpagesize = FOUR_MEG;
		top_level = 1;
	}

#ifdef __xpv
	xen_info = bi->bi_xen_start_info;
	mfn_list = (mfn_t *)xen_info->mfn_list;
	DBG(mfn_list);
	mfn_count = xen_info->nr_pages;
	DBG(mfn_count);
#endif
	top_page_table = bi->bi_top_page_table;
	DBG(top_page_table);
}

/*
 * Change the addressable page table window to point at a given page.
 */
/*ARGSUSED*/
void *
kbm_remap_window(paddr_t physaddr, int writeable)
{
	x86pte_t pt_bits = PT_NOCONSIST | PT_VALID | PT_WRITABLE;

	DBG(physaddr);

#ifdef __xpv
	if (!writeable)
		pt_bits &= ~PT_WRITABLE;
	if (HYPERVISOR_update_va_mapping((uintptr_t)window,
	    pa_to_ma(physaddr) | pt_bits, UVMF_INVLPG | UVMF_LOCAL) < 0)
		bop_panic("HYPERVISOR_update_va_mapping() failed");
#else
	if (kbm_pae_support)
		*((x86pte_t *)pte_to_window) = physaddr | pt_bits;
	else
		*((x86pte32_t *)pte_to_window) = physaddr | pt_bits;
	mmu_tlbflush_entry(window);
#endif
	DBG(window);
	return (window);
}
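
#if 0
/*
 * Illustrative sketch, not compiled: using the single boot-time PTE window
 * to access an arbitrary physical page.  Only one page is visible at a time;
 * each call to kbm_remap_window() replaces the previous mapping, much as
 * make_ptable() below does when zeroing a newly allocated page table.
 * The function name here is hypothetical.
 */
static void
kbm_window_example(paddr_t pa)
{
	void *p;

	p = kbm_remap_window(pa, 1);	/* map the page writable at "window" */
	bzero(p, MMU_PAGESIZE);		/* stores land in physical page "pa" */
}
#endif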

/*
 * Add a mapping for the physical page at the given virtual address.
 */
void
kbm_map(uintptr_t va, paddr_t pa, uint_t level, uint_t is_kernel)
{
	x86pte_t *ptep;
	paddr_t pte_physaddr;
	x86pte_t pteval;

	if (khat_running)
		panic("kbm_map() called too late");

	pteval = pa_to_ma(pa) | PT_NOCONSIST | PT_VALID | PT_WRITABLE;
	if (level == 1)
		pteval |= PT_PAGESIZE;
	if (kbm_pge_support && is_kernel)
		pteval |= PT_GLOBAL;

#ifdef __xpv
	/*
	 * Try HYPERVISOR_update_va_mapping() first; it fails if a page
	 * table page is missing.
	 */
	if (HYPERVISOR_update_va_mapping(va, pteval,
	    UVMF_INVLPG | UVMF_LOCAL) == 0)
		return;
#endif

	/*
	 * Find the pte that will map this address. This creates any
	 * missing intermediate level page tables.
	 */
	ptep = find_pte(va, &pte_physaddr, level, 0);
	if (ptep == NULL)
		bop_panic("kbm_map: find_pte returned NULL");

#ifdef __xpv
	if (HYPERVISOR_update_va_mapping(va, pteval, UVMF_INVLPG | UVMF_LOCAL))
		bop_panic("HYPERVISOR_update_va_mapping() failed");
#else
	if (kbm_pae_support)
		*ptep = pteval;
	else
		*((x86pte32_t *)ptep) = pteval;
	mmu_tlbflush_entry((caddr_t)va);
#endif
}

#ifdef __xpv

/*
 * Add a mapping for the machine page at the given virtual address.
 */
void
kbm_map_ma(maddr_t ma, uintptr_t va, uint_t level)
{
	paddr_t pte_physaddr;
	x86pte_t pteval;

	pteval = ma | PT_NOCONSIST | PT_VALID | PT_REF | PT_WRITABLE;
	if (level == 1)
		pteval |= PT_PAGESIZE;

	/*
	 * Try HYPERVISOR_update_va_mapping() first; it fails if a page
	 * table page is missing.
	 */
	if (HYPERVISOR_update_va_mapping(va,
	    pteval, UVMF_INVLPG | UVMF_LOCAL) == 0)
		return;

	/*
	 * Find the pte that will map this address. This creates any
	 * missing intermediate level page tables.
	 */
	(void) find_pte(va, &pte_physaddr, level, 0);

	if (HYPERVISOR_update_va_mapping(va,
	    pteval, UVMF_INVLPG | UVMF_LOCAL) != 0)
		bop_panic("HYPERVISOR_update_va_mapping failed");
}

#endif /* __xpv */
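
#if 0
/*
 * Illustrative sketch, not compiled: how early startup code might use
 * kbm_map() before the kernel HAT is running.  A fresh physical page is
 * allocated with do_bop_phys_alloc() (as make_ptable() below does) and
 * mapped as an ordinary 4K kernel page.  The function name and the
 * "new_va" argument are hypothetical.
 */
static void
kbm_map_example(uintptr_t new_va)
{
	paddr_t new_pa;

	new_pa = do_bop_phys_alloc(MMU_PAGESIZE, MMU_PAGESIZE);
	kbm_map(new_va, new_pa, 0, 1);	/* level 0 (4K page), kernel mapping */
}
#endif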

/*
 * Probe the boot time page tables to find the first mapping
 * including va (or higher) and return non-zero if one is found.
 * va is updated to the starting address and len to the pagesize.
 * pfn is set to the page frame number at the start of the mapping.
 *
 * Note that if va is in the middle of a large page, the returned va
 * will be less than what was asked for.
 */
int
kbm_probe(uintptr_t *va, size_t *len, pfn_t *pfn, uint_t *prot)
{
	uintptr_t	probe_va;
	x86pte_t	*ptep;
	paddr_t		pte_physaddr;
	x86pte_t	pte_val;
	level_t		l;

	if (khat_running)
		panic("kbm_probe() called too late");
	*len = 0;
	*pfn = PFN_INVALID;
	*prot = 0;
	probe_va = *va;
restart_new_va:
	l = top_level;
	for (;;) {
		if (IN_VA_HOLE(probe_va))
			probe_va = mmu.hole_end;

		if (IN_HYPERVISOR_VA(probe_va))
#if defined(__amd64) && defined(__xpv)
			probe_va = HYPERVISOR_VIRT_END;
#else
			return (0);
#endif

		/*
		 * If we don't have a valid PTP/PTE at this level
		 * then we can bump VA by this level's pagesize and try again.
		 * When the probe_va wraps around, we are done.
		 */
		ptep = find_pte(probe_va, &pte_physaddr, l, 1);
		if (ptep == NULL)
			bop_panic("kbm_probe: find_pte returned NULL");
		if (kbm_pae_support)
			pte_val = *ptep;
		else
			pte_val = *((x86pte32_t *)ptep);
		if (!PTE_ISVALID(pte_val)) {
			probe_va = (probe_va & BOOT_MASK(l)) + BOOT_SZ(l);
			if (probe_va <= *va)
				return (0);
			goto restart_new_va;
		}

		/*
		 * If this entry is a pointer to a lower level page table,
		 * go down to it.
		 */
		if (!PTE_ISPAGE(pte_val, l)) {
			ASSERT(l > 0);
			--l;
			continue;
		}

		/*
		 * We found a boot level page table entry.
		 */
		*len = BOOT_SZ(l);
		*va = probe_va & ~(*len - 1);
		*pfn = PTE2PFN(pte_val, l);

		*prot = PROT_READ | PROT_EXEC;
		if (PTE_GET(pte_val, PT_WRITABLE))
			*prot |= PROT_WRITE;

		/*
		 * pt_nx is cleared if the processor doesn't support the
		 * NX bit.
		 */
		if (PTE_GET(pte_val, mmu.pt_nx))
			*prot &= ~PROT_EXEC;

		return (1);
	}
}

/*
 * Destroy a boot loader page table 4K mapping.
 */
void
kbm_unmap(uintptr_t va)
{
	if (khat_running)
		panic("kbm_unmap() called too late");
	else {
#ifdef __xpv
		(void) HYPERVISOR_update_va_mapping(va, 0,
		    UVMF_INVLPG | UVMF_LOCAL);
#else
		x86pte_t *ptep;
		level_t	level = 0;
		uint_t	probe_only = 1;

		ptep = find_pte(va, NULL, level, probe_only);
		if (ptep == NULL)
			return;

		if (kbm_pae_support)
			*ptep = 0;
		else
			*((x86pte32_t *)ptep) = 0;
		mmu_tlbflush_entry((caddr_t)va);
#endif
	}
}
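
#if 0
/*
 * Illustrative sketch, not compiled: enumerating the mappings left by the
 * boot loader with kbm_probe().  Starting from a caller-chosen address,
 * each successful probe reports one mapping; advancing by its length finds
 * the next one, and kbm_probe() returns 0 when no further mapping exists.
 * The function name and "start" argument are hypothetical.
 */
static void
kbm_probe_example(uintptr_t start)
{
	uintptr_t va = start;
	size_t len;
	pfn_t pfn;
	uint_t prot;

	while (kbm_probe(&va, &len, &pfn, &prot) != 0) {
		/*
		 * If va fell inside a large page it was rounded down to
		 * the start of that page, so va + len skips the whole
		 * mapping.
		 */
		va += len;
	}
}
#endif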

/*
 * Change a boot loader page table 4K mapping.
 * Returns the pfn of the old mapping.
 */
pfn_t
kbm_remap(uintptr_t va, pfn_t pfn)
{
	x86pte_t *ptep;
	level_t	level = 0;
	uint_t	probe_only = 1;
	x86pte_t pte_val = pa_to_ma(pfn_to_pa(pfn)) | PT_WRITABLE |
	    PT_NOCONSIST | PT_VALID;
	x86pte_t old_pte;

	if (khat_running)
		panic("kbm_remap() called too late");
	ptep = find_pte(va, NULL, level, probe_only);
	if (ptep == NULL)
		bop_panic("kbm_remap: find_pte returned NULL");

	if (kbm_pae_support)
		old_pte = *ptep;
	else
		old_pte = *((x86pte32_t *)ptep);

#ifdef __xpv
	if (HYPERVISOR_update_va_mapping(va, pte_val, UVMF_INVLPG | UVMF_LOCAL))
		bop_panic("HYPERVISOR_update_va_mapping() failed");
#else
	if (kbm_pae_support)
		*((x86pte_t *)ptep) = pte_val;
	else
		*((x86pte32_t *)ptep) = pte_val;
	mmu_tlbflush_entry((caddr_t)va);
#endif

	if (!(old_pte & PT_VALID) || ma_to_pa(old_pte) == -1)
		return (PFN_INVALID);
	return (mmu_btop(ma_to_pa(old_pte)));
}

/*
 * Change a boot loader page table 4K mapping to read only.
 */
void
kbm_read_only(uintptr_t va, paddr_t pa)
{
	x86pte_t pte_val = pa_to_ma(pa) |
	    PT_NOCONSIST | PT_REF | PT_MOD | PT_VALID;

#ifdef __xpv
	if (HYPERVISOR_update_va_mapping(va, pte_val, UVMF_INVLPG | UVMF_LOCAL))
		bop_panic("HYPERVISOR_update_va_mapping() failed");
#else
	x86pte_t *ptep;
	level_t	level = 0;

	ptep = find_pte(va, NULL, level, 0);
	if (ptep == NULL)
		bop_panic("kbm_read_only: find_pte returned NULL");

	if (kbm_pae_support)
		*ptep = pte_val;
	else
		*((x86pte32_t *)ptep) = pte_val;
	mmu_tlbflush_entry((caddr_t)va);
#endif
}

/*
 * Interfaces for the kernel debugger to access physical memory.
 */
static x86pte_t save_pte;

void *
kbm_push(paddr_t pa)
{
	static int first_time = 1;

	if (first_time) {
		first_time = 0;
		return (window);
	}

	if (kbm_pae_support)
		save_pte = *((x86pte_t *)pte_to_window);
	else
		save_pte = *((x86pte32_t *)pte_to_window);
	return (kbm_remap_window(pa, 0));
}

void
kbm_pop(void)
{
#ifdef __xpv
	if (HYPERVISOR_update_va_mapping((uintptr_t)window, save_pte,
	    UVMF_INVLPG | UVMF_LOCAL) < 0)
		bop_panic("HYPERVISOR_update_va_mapping() failed");
#else
	if (kbm_pae_support)
		*((x86pte_t *)pte_to_window) = save_pte;
	else
		*((x86pte32_t *)pte_to_window) = save_pte;
	mmu_tlbflush_entry(window);
#endif
}
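
#if 0
/*
 * Illustrative sketch, not compiled: how a kernel-debugger style caller
 * might read a 32-bit word of physical memory through the shared window.
 * The access is bracketed by kbm_push()/kbm_pop() so the previous window
 * contents are restored afterwards.  This ignores kbm_push()'s special
 * first-call behavior; the function name is hypothetical.
 */
static uint32_t
kbm_peek_example(paddr_t pa)
{
	caddr_t va;
	uint32_t val;

	va = kbm_push(pa & ~(paddr_t)MMU_PAGEOFFSET);
	val = *(uint32_t *)(va + (pa & MMU_PAGEOFFSET));
	kbm_pop();
	return (val);
}
#endif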

x86pte_t
get_pteval(paddr_t table, uint_t index)
{
	void *table_ptr = kbm_remap_window(table, 0);

	if (kbm_pae_support)
		return (((x86pte_t *)table_ptr)[index]);
	return (((x86pte32_t *)table_ptr)[index]);
}

#ifndef __xpv
void
set_pteval(paddr_t table, uint_t index, uint_t level, x86pte_t pteval)
{
	void *table_ptr = kbm_remap_window(table, 0);

	if (kbm_pae_support)
		((x86pte_t *)table_ptr)[index] = pteval;
	else
		((x86pte32_t *)table_ptr)[index] = pteval;
	if (level == top_level && level == 2)
		reload_cr3();
}
#endif

paddr_t
make_ptable(x86pte_t *pteval, uint_t level)
{
	paddr_t new_table;
	void *table_ptr;

	new_table = do_bop_phys_alloc(MMU_PAGESIZE, MMU_PAGESIZE);
	table_ptr = kbm_remap_window(new_table, 1);
	bzero(table_ptr, MMU_PAGESIZE);
#ifdef __xpv
	/* Remove write permission to the new page table. */
	(void) kbm_remap_window(new_table, 0);
#endif

	if (level == top_level && level == 2)
		*pteval = pa_to_ma(new_table) | PT_VALID;
	else
		*pteval = pa_to_ma(new_table) |
		    PT_VALID | PT_REF | PT_USER | PT_WRITABLE;

	return (new_table);
}

x86pte_t *
map_pte(paddr_t table, uint_t index)
{
	void *table_ptr = kbm_remap_window(table, 0);

	return ((x86pte_t *)((caddr_t)table_ptr + index * pte_size));
}