/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
253446Smrj */ 263446Smrj 273446Smrj #pragma ident "%Z%%M% %I% %E% SMI" 283446Smrj 293446Smrj #include <sys/types.h> 303446Smrj #include <sys/machparam.h> 313446Smrj #include <sys/x86_archext.h> 323446Smrj #include <sys/systm.h> 333446Smrj #include <sys/mach_mmu.h> 343446Smrj 353446Smrj #include <sys/multiboot.h> 363446Smrj 373446Smrj extern multiboot_header_t mb_header; 383446Smrj extern int have_cpuid(void); 393446Smrj extern uint32_t get_cpuid_edx(uint32_t *eax); 403446Smrj 413446Smrj #include <sys/inttypes.h> 423446Smrj #include <sys/bootinfo.h> 433446Smrj #include <sys/mach_mmu.h> 443446Smrj #include <sys/boot_console.h> 453446Smrj 463446Smrj #include "dboot_printf.h" 473446Smrj #include "dboot_xboot.h" 483446Smrj #include "dboot_elfload.h" 493446Smrj 503446Smrj /* 513446Smrj * This file contains code that runs to transition us from either a multiboot 523446Smrj * compliant loader (32 bit non-paging) or Xen domain loader to regular kernel 533446Smrj * execution. Its task is to setup the kernel memory image and page tables. 543446Smrj * 553446Smrj * The code executes as: 563446Smrj * - 32 bits under GRUB (for 32 or 64 bit Solaris) 573446Smrj * - 32 bit program for Xen 32 bit 583446Smrj * - 64 bit program for Xen 64 bit (at least that's my assumption for now) 593446Smrj * 603446Smrj * Under Xen, we must create mappings for any memory beyond the initial 613446Smrj * start of day allocation (such as the kernel itself). 623446Smrj * 633446Smrj * When not under Xen, the mapping between maddr_t and paddr_t is 1:1. 643446Smrj * Since we are running in real mode, so all such memory is accessible. 
653446Smrj */ 663446Smrj 673446Smrj /* 683446Smrj * Standard bits used in PTE (page level) and PTP (internal levels) 693446Smrj */ 703446Smrj x86pte_t ptp_bits = PT_VALID | PT_REF | PT_USER | PT_WRITABLE | PT_USER; 713446Smrj x86pte_t pte_bits = PT_VALID | PT_REF | PT_MOD | PT_NOCONSIST | PT_WRITABLE; 723446Smrj 733446Smrj /* 743446Smrj * This is the target addresses (physical) where the kernel text and data 753446Smrj * nucleus pages will be unpacked. On Xen this is actually a virtual address. 763446Smrj */ 773446Smrj paddr_t ktext_phys; 783446Smrj uint32_t ksize = 2 * FOUR_MEG; /* kernel nucleus is 8Meg */ 793446Smrj 803446Smrj static uint64_t target_kernel_text; /* value to use for KERNEL_TEXT */ 813446Smrj 823446Smrj /* 833446Smrj * The stack is setup in assembler before entering startup_kernel() 843446Smrj */ 853446Smrj char stack_space[STACK_SIZE]; 863446Smrj 873446Smrj /* 883446Smrj * Used to track physical memory allocation 893446Smrj */ 903446Smrj static paddr_t next_avail_addr = 0; 913446Smrj 923446Smrj multiboot_info_t *mb_info; 933446Smrj 943446Smrj /* 953446Smrj * This contains information passed to the kernel 963446Smrj */ 973446Smrj struct xboot_info boot_info[2]; /* extra space to fix alignement for amd64 */ 983446Smrj struct xboot_info *bi; 993446Smrj 1003446Smrj /* 1013446Smrj * Page table and memory stuff. 1023446Smrj */ 1033446Smrj static uint64_t max_mem; /* maximum memory address */ 1043446Smrj 1053446Smrj /* 1063446Smrj * Information about processor MMU 1073446Smrj */ 1083446Smrj int amd64_support = 0; 1093446Smrj int largepage_support = 0; 1103446Smrj int pae_support = 0; 1113446Smrj int pge_support = 0; 1123446Smrj int NX_support = 0; 1133446Smrj 1143446Smrj /* 1153446Smrj * Low 32 bits of kernel entry address passed back to assembler. 1163446Smrj * When running a 64 bit kernel, the high 32 bits are 0xffffffff. 1173446Smrj */ 1183446Smrj uint32_t entry_addr_low; 1193446Smrj 1203446Smrj /* 1213446Smrj * Memlists for the kernel. 
We shouldn't need a lot of these. 1223446Smrj */ 123*3489Sjosephb #define MAX_MEMLIST (50) 1243446Smrj struct boot_memlist memlists[MAX_MEMLIST]; 1253446Smrj uint_t memlists_used = 0; 126*3489Sjosephb struct boot_memlist pcimemlists[MAX_MEMLIST]; 127*3489Sjosephb uint_t pcimemlists_used = 0; 1283446Smrj 1293446Smrj #define MAX_MODULES (10) 1303446Smrj struct boot_modules modules[MAX_MODULES]; 1313446Smrj uint_t modules_used = 0; 1323446Smrj 1333446Smrj /* 1343446Smrj * Debugging macros 1353446Smrj */ 1363446Smrj uint_t prom_debug = 0; 1373446Smrj uint_t map_debug = 0; 1383446Smrj 1393446Smrj /* 1403446Smrj * The Xen/Grub specific code builds the initial memlists. This code does 1413446Smrj * sort/merge/link for final use. 1423446Smrj */ 1433446Smrj static void 1443446Smrj sort_physinstall(void) 1453446Smrj { 1463446Smrj int i; 1473446Smrj int j; 1483446Smrj struct boot_memlist tmp; 1493446Smrj 1503446Smrj /* 1513446Smrj * Now sort the memlists, in case they weren't in order. 1523446Smrj * Yeah, this is a bubble sort; small, simple and easy to get right. 1533446Smrj */ 1543446Smrj DBG_MSG("Sorting phys-installed list\n"); 1553446Smrj for (j = memlists_used - 1; j > 0; --j) { 1563446Smrj for (i = 0; i < j; ++i) { 1573446Smrj if (memlists[i].addr < memlists[i + 1].addr) 1583446Smrj continue; 1593446Smrj tmp = memlists[i]; 1603446Smrj memlists[i] = memlists[i + 1]; 1613446Smrj memlists[i + 1] = tmp; 1623446Smrj } 1633446Smrj } 1643446Smrj 1653446Smrj /* 1663446Smrj * Merge any memlists that don't have holes between them. 
1673446Smrj */ 1683446Smrj for (i = 0; i <= memlists_used - 1; ++i) { 1693446Smrj if (memlists[i].addr + memlists[i].size != memlists[i + 1].addr) 1703446Smrj continue; 1713446Smrj 1723446Smrj if (prom_debug) 1733446Smrj dboot_printf( 1743446Smrj "merging mem segs %" PRIx64 "...%" PRIx64 1753446Smrj " w/ %" PRIx64 "...%" PRIx64 "\n", 1763446Smrj memlists[i].addr, 1773446Smrj memlists[i].addr + memlists[i].size, 1783446Smrj memlists[i + 1].addr, 1793446Smrj memlists[i + 1].addr + memlists[i + 1].size); 1803446Smrj 1813446Smrj memlists[i].size += memlists[i + 1].size; 1823446Smrj for (j = i + 1; j < memlists_used - 1; ++j) 1833446Smrj memlists[j] = memlists[j + 1]; 1843446Smrj --memlists_used; 1853446Smrj DBG(memlists_used); 1863446Smrj --i; /* after merging we need to reexamine, so do this */ 1873446Smrj } 1883446Smrj 1893446Smrj if (prom_debug) { 1903446Smrj dboot_printf("\nFinal memlists:\n"); 1913446Smrj for (i = 0; i < memlists_used; ++i) { 1923446Smrj dboot_printf("\t%d: addr=%" PRIx64 " size=%" 1933446Smrj PRIx64 "\n", i, memlists[i].addr, memlists[i].size); 1943446Smrj } 1953446Smrj } 1963446Smrj 1973446Smrj /* 1983446Smrj * link together the memlists with native size pointers 1993446Smrj */ 2003446Smrj memlists[0].next = 0; 2013446Smrj memlists[0].prev = 0; 2023446Smrj for (i = 1; i < memlists_used; ++i) { 2033446Smrj memlists[i].prev = (native_ptr_t)(uintptr_t)(memlists + i - 1); 2043446Smrj memlists[i].next = 0; 2053446Smrj memlists[i - 1].next = (native_ptr_t)(uintptr_t)(memlists + i); 2063446Smrj } 2073446Smrj bi->bi_phys_install = (native_ptr_t)memlists; 2083446Smrj DBG(bi->bi_phys_install); 2093446Smrj } 2103446Smrj 2113446Smrj x86pte_t 2123446Smrj get_pteval(paddr_t table, uint_t index) 2133446Smrj { 2143446Smrj if (pae_support) 2153446Smrj return (((x86pte_t *)(uintptr_t)table)[index]); 2163446Smrj return (((x86pte32_t *)(uintptr_t)table)[index]); 2173446Smrj } 2183446Smrj 2193446Smrj /*ARGSUSED*/ 2203446Smrj void 2213446Smrj set_pteval(paddr_t 
table, uint_t index, uint_t level, x86pte_t pteval) 2223446Smrj { 2233446Smrj uintptr_t tab_addr = (uintptr_t)table; 2243446Smrj 2253446Smrj if (pae_support) 2263446Smrj ((x86pte_t *)tab_addr)[index] = pteval; 2273446Smrj else 2283446Smrj ((x86pte32_t *)tab_addr)[index] = (x86pte32_t)pteval; 2293446Smrj if (level == top_level && level == 2) 2303446Smrj reload_cr3(); 2313446Smrj } 2323446Smrj 2333446Smrj paddr_t 2343446Smrj make_ptable(x86pte_t *pteval, uint_t level) 2353446Smrj { 2363446Smrj paddr_t new_table = (paddr_t)(uintptr_t)mem_alloc(MMU_PAGESIZE); 2373446Smrj 2383446Smrj if (level == top_level && level == 2) 2393446Smrj *pteval = pa_to_ma((uintptr_t)new_table) | PT_VALID; 2403446Smrj else 2413446Smrj *pteval = pa_to_ma((uintptr_t)new_table) | ptp_bits; 2423446Smrj 2433446Smrj if (map_debug) 2443446Smrj dboot_printf("new page table lvl=%d paddr=0x%lx ptp=0x%" 2453446Smrj PRIx64 "\n", level, (ulong_t)new_table, *pteval); 2463446Smrj return (new_table); 2473446Smrj } 2483446Smrj 2493446Smrj x86pte_t * 2503446Smrj map_pte(paddr_t table, uint_t index) 2513446Smrj { 2523446Smrj return ((x86pte_t *)(uintptr_t)(table + index * pte_size)); 2533446Smrj } 2543446Smrj 2553446Smrj #if 0 /* useful if debugging */ 2563446Smrj /* 2573446Smrj * dump out the contents of page tables... 
2583446Smrj */ 2593446Smrj static void 2603446Smrj dump_tables(void) 2613446Smrj { 2623446Smrj uint_t save_index[4]; /* for recursion */ 2633446Smrj char *save_table[4]; /* for recursion */ 2643446Smrj uint_t l; 2653446Smrj uint64_t va; 2663446Smrj uint64_t pgsize; 2673446Smrj int index; 2683446Smrj int i; 2693446Smrj x86pte_t pteval; 2703446Smrj char *table; 2713446Smrj static char *tablist = "\t\t\t"; 2723446Smrj char *tabs = tablist + 3 - top_level; 2733446Smrj uint_t pa, pa1; 2743446Smrj 2753446Smrj dboot_printf("Finished pagetables:\n"); 2763446Smrj table = (char *)top_page_table; 2773446Smrj l = top_level; 2783446Smrj va = 0; 2793446Smrj for (index = 0; index < ptes_per_table; ++index) { 2803446Smrj pgsize = 1ull << shift_amt[l]; 2813446Smrj if (pae_support) 2823446Smrj pteval = ((x86pte_t *)table)[index]; 2833446Smrj else 2843446Smrj pteval = ((x86pte32_t *)table)[index]; 2853446Smrj if (pteval == 0) 2863446Smrj goto next_entry; 2873446Smrj 2883446Smrj dboot_printf("%s %lx[0x%x] = %" PRIx64 ", va=%" PRIx64, 2893446Smrj tabs + l, table, index, (uint64_t)pteval, va); 2903446Smrj pa = ma_to_pa(pteval & MMU_PAGEMASK); 2913446Smrj dboot_printf(" physaddr=%" PRIx64 "\n", pa); 2923446Smrj 2933446Smrj /* 2943446Smrj * Don't try to walk hypervisor private pagetables 2953446Smrj */ 2963446Smrj if ((l > 1 || (l == 1 && (pteval & PT_PAGESIZE) == 0))) { 2973446Smrj save_table[l] = table; 2983446Smrj save_index[l] = index; 2993446Smrj --l; 3003446Smrj index = -1; 3013446Smrj table = (char *)(uintptr_t) 3023446Smrj ma_to_pa(pteval & MMU_PAGEMASK); 3033446Smrj goto recursion; 3043446Smrj } 3053446Smrj 3063446Smrj /* 3073446Smrj * shorten dump for consecutive mappings 3083446Smrj */ 3093446Smrj for (i = 1; index + i < ptes_per_table; ++i) { 3103446Smrj if (pae_support) 3113446Smrj pteval = ((x86pte_t *)table)[index + i]; 3123446Smrj else 3133446Smrj pteval = ((x86pte32_t *)table)[index + i]; 3143446Smrj if (pteval == 0) 3153446Smrj break; 3163446Smrj pa1 = ma_to_pa(pteval & 
MMU_PAGEMASK); 3173446Smrj if (pa1 != pa + i * pgsize) 3183446Smrj break; 3193446Smrj } 3203446Smrj if (i > 2) { 3213446Smrj dboot_printf("%s...\n", tabs + l); 3223446Smrj va += pgsize * (i - 2); 3233446Smrj index += i - 2; 3243446Smrj } 3253446Smrj next_entry: 3263446Smrj va += pgsize; 3273446Smrj if (l == 3 && index == 256) /* VA hole */ 3283446Smrj va = 0xffff800000000000ull; 3293446Smrj recursion: 3303446Smrj ; 3313446Smrj } 3323446Smrj if (l < top_level) { 3333446Smrj ++l; 3343446Smrj index = save_index[l]; 3353446Smrj table = save_table[l]; 3363446Smrj goto recursion; 3373446Smrj } 3383446Smrj } 3393446Smrj #endif 3403446Smrj 3413446Smrj /* 3423446Smrj * Add a mapping for the physical page at the given virtual address. 3433446Smrj */ 3443446Smrj static void 3453446Smrj map_pa_at_va(paddr_t pa, native_ptr_t va, uint_t level) 3463446Smrj { 3473446Smrj x86pte_t *ptep; 3483446Smrj x86pte_t pteval; 3493446Smrj 3503446Smrj pteval = pa_to_ma(pa) | pte_bits; 3513446Smrj if (level > 0) 3523446Smrj pteval |= PT_PAGESIZE; 3533446Smrj if (va >= target_kernel_text && pge_support) 3543446Smrj pteval |= PT_GLOBAL; 3553446Smrj 3563446Smrj if (map_debug && pa != va) 3573446Smrj dboot_printf("mapping pa=0x%" PRIx64 " va=0x%" PRIx64 3583446Smrj " pte=0x%" PRIx64 " l=%d\n", 3593446Smrj (uint64_t)pa, (uint64_t)va, pteval, level); 3603446Smrj 3613446Smrj /* 3623446Smrj * Find the pte that will map this address. This creates any 3633446Smrj * missing intermediate level page tables 3643446Smrj */ 3653446Smrj ptep = find_pte(va, NULL, level, 0); 3663446Smrj 3673446Smrj /* 3683446Smrj * On Xen we must use hypervisor calls to modify the PTE, since 3693446Smrj * paging is active. On real hardware we just write to the pagetables 3703446Smrj * which aren't in use yet. 
3713446Smrj */ 3723446Smrj if (va < 1024 * 1024) 3733446Smrj pteval |= PT_NOCACHE; /* for video RAM */ 3743446Smrj if (pae_support) 3753446Smrj *ptep = pteval; 3763446Smrj else 3773446Smrj *((x86pte32_t *)ptep) = (x86pte32_t)pteval; 3783446Smrj } 3793446Smrj 3803446Smrj /* 3813446Smrj * During memory allocation, find the highest address not used yet. 3823446Smrj */ 3833446Smrj static void 3843446Smrj check_higher(paddr_t a) 3853446Smrj { 3863446Smrj if (a < next_avail_addr) 3873446Smrj return; 3883446Smrj next_avail_addr = RNDUP(a + 1, MMU_PAGESIZE); 3893446Smrj DBG(next_avail_addr); 3903446Smrj } 3913446Smrj 3923446Smrj /* 393*3489Sjosephb * This is called to remove start..end from the 394*3489Sjosephb * possible range of PCI addresses. 395*3489Sjosephb */ 396*3489Sjosephb const uint64_t pci_lo_limit = 0x00100000ul; 397*3489Sjosephb const uint64_t pci_hi_limit = 0xfff00000ul; 398*3489Sjosephb static void 399*3489Sjosephb exclude_from_pci(uint64_t start, uint64_t end) 400*3489Sjosephb { 401*3489Sjosephb int i; 402*3489Sjosephb int j; 403*3489Sjosephb struct boot_memlist *ml; 404*3489Sjosephb 405*3489Sjosephb for (i = 0; i < pcimemlists_used; ++i) { 406*3489Sjosephb ml = &pcimemlists[i]; 407*3489Sjosephb 408*3489Sjosephb /* delete the entire range? */ 409*3489Sjosephb if (start <= ml->addr && ml->addr + ml->size <= end) { 410*3489Sjosephb --pcimemlists_used; 411*3489Sjosephb for (j = i; j < pcimemlists_used; ++j) 412*3489Sjosephb pcimemlists[j] = pcimemlists[j + 1]; 413*3489Sjosephb --i; /* to revisit the new one at this index */ 414*3489Sjosephb } 415*3489Sjosephb 416*3489Sjosephb /* split a range? 
*/ 417*3489Sjosephb else if (ml->addr < start && end < ml->addr + ml->size) { 418*3489Sjosephb 419*3489Sjosephb ++pcimemlists_used; 420*3489Sjosephb if (pcimemlists_used > MAX_MEMLIST) 421*3489Sjosephb dboot_panic("too many pcimemlists"); 422*3489Sjosephb 423*3489Sjosephb for (j = pcimemlists_used - 1; j > i; --j) 424*3489Sjosephb pcimemlists[j] = pcimemlists[j - 1]; 425*3489Sjosephb ml->size = start - ml->addr; 426*3489Sjosephb 427*3489Sjosephb ++ml; 428*3489Sjosephb ml->size = (ml->addr + ml->size) - end; 429*3489Sjosephb ml->addr = end; 430*3489Sjosephb ++i; /* skip on to next one */ 431*3489Sjosephb } 432*3489Sjosephb 433*3489Sjosephb /* cut memory off the start? */ 434*3489Sjosephb else if (ml->addr < end && end < ml->addr + ml->size) { 435*3489Sjosephb ml->size -= end - ml->addr; 436*3489Sjosephb ml->addr = end; 437*3489Sjosephb } 438*3489Sjosephb 439*3489Sjosephb /* cut memory off the end? */ 440*3489Sjosephb else if (ml->addr <= start && start < ml->addr + ml->size) { 441*3489Sjosephb ml->size = start - ml->addr; 442*3489Sjosephb } 443*3489Sjosephb } 444*3489Sjosephb } 445*3489Sjosephb 446*3489Sjosephb /* 4473446Smrj * Walk through the module information finding the last used address. 4483446Smrj * The first available address will become the top level page table. 4493446Smrj * 4503446Smrj * We then build the phys_install memlist from the multiboot information. 
4513446Smrj */ 4523446Smrj static void 4533446Smrj init_mem_alloc(void) 4543446Smrj { 4553446Smrj mb_memory_map_t *mmap; 4563446Smrj mb_module_t *mod; 4573446Smrj uint64_t start; 4583446Smrj uint64_t end; 4593446Smrj uint64_t page_offset = MMU_PAGEOFFSET; /* needs to be 64 bits */ 4603446Smrj extern char _end[]; 4613446Smrj int i; 4623446Smrj 4633446Smrj DBG_MSG("Entered init_mem_alloc()\n"); 4643446Smrj DBG((uintptr_t)mb_info); 4653446Smrj 4663446Smrj /* 4673446Smrj * search the modules to find the last used address 4683446Smrj * we'll build the module list while we're walking through here 4693446Smrj */ 4703446Smrj DBG_MSG("\nFinding Modules\n"); 4713446Smrj check_higher((paddr_t)&_end); 4723446Smrj for (mod = (mb_module_t *)(mb_info->mods_addr), i = 0; 4733446Smrj i < mb_info->mods_count; 4743446Smrj ++mod, ++i) { 4753446Smrj if (prom_debug) { 4763446Smrj dboot_printf("\tmodule #%d: %s at: 0x%lx, len 0x%lx\n", 4773446Smrj i, (char *)(mod->mod_name), 4783446Smrj (ulong_t)mod->mod_start, (ulong_t)mod->mod_end); 4793446Smrj } 4803446Smrj modules[i].bm_addr = mod->mod_start; 4813446Smrj modules[i].bm_size = mod->mod_end; 4823446Smrj 4833446Smrj check_higher(mod->mod_end); 4843446Smrj } 4853446Smrj bi->bi_modules = (native_ptr_t)modules; 4863446Smrj DBG(bi->bi_modules); 4873446Smrj bi->bi_module_cnt = mb_info->mods_count; 4883446Smrj DBG(bi->bi_module_cnt); 4893446Smrj 4903446Smrj /* 491*3489Sjosephb * start out by assuming PCI can use all physical addresses 492*3489Sjosephb */ 493*3489Sjosephb pcimemlists[0].addr = pci_lo_limit; 494*3489Sjosephb pcimemlists[0].size = pci_hi_limit - pci_lo_limit; 495*3489Sjosephb pcimemlists_used = 1; 496*3489Sjosephb 497*3489Sjosephb /* 4983446Smrj * Walk through the memory map from multiboot and build our memlist 4993446Smrj * structures. Note these will have native format pointers. 
5003446Smrj */ 5013446Smrj DBG_MSG("\nFinding Memory Map\n"); 5023446Smrj DBG(mb_info->flags); 5033446Smrj max_mem = 0; 5043446Smrj if (mb_info->flags & 0x40) { 5053446Smrj DBG(mb_info->mmap_addr); 5063446Smrj DBG(mb_info->mmap_length); 5073446Smrj check_higher(mb_info->mmap_addr + mb_info->mmap_length); 5083446Smrj 5093446Smrj for (mmap = (mb_memory_map_t *)mb_info->mmap_addr; 5103446Smrj (uint32_t)mmap < mb_info->mmap_addr + mb_info->mmap_length; 5113446Smrj mmap = (mb_memory_map_t *)((uint32_t)mmap + mmap->size 5123446Smrj + sizeof (mmap->size))) { 5133446Smrj 5143446Smrj start = ((uint64_t)mmap->base_addr_high << 32) + 5153446Smrj mmap->base_addr_low; 5163446Smrj end = start + ((uint64_t)mmap->length_high << 32) + 5173446Smrj mmap->length_low; 5183446Smrj 519*3489Sjosephb if (prom_debug) 5203446Smrj dboot_printf("\ttype: %d %" PRIx64 "..%" 5213446Smrj PRIx64 "\n", mmap->type, start, end); 5223446Smrj 5233446Smrj /* 5243446Smrj * page align start and end 5253446Smrj */ 5263446Smrj start = (start + page_offset) & ~page_offset; 5273446Smrj end &= ~page_offset; 5283446Smrj if (end <= start) 5293446Smrj continue; 5303446Smrj 531*3489Sjosephb exclude_from_pci(start, end); 532*3489Sjosephb 533*3489Sjosephb /* 534*3489Sjosephb * only type 1 is usable RAM 535*3489Sjosephb */ 536*3489Sjosephb if (mmap->type != 1) 537*3489Sjosephb continue; 538*3489Sjosephb 5393446Smrj if (end > max_mem) 5403446Smrj max_mem = end; 5413446Smrj 5423446Smrj memlists[memlists_used].addr = start; 5433446Smrj memlists[memlists_used].size = end - start; 544*3489Sjosephb ++memlists_used; 545*3489Sjosephb if (memlists_used > MAX_MEMLIST) 546*3489Sjosephb dboot_panic("too many memlists"); 5473446Smrj } 5483446Smrj } else if (mb_info->flags & 0x01) { 5493446Smrj DBG(mb_info->mem_lower); 5503446Smrj memlists[memlists_used].addr = 0; 5513446Smrj memlists[memlists_used].size = mb_info->mem_lower * 1024; 5523446Smrj ++memlists_used; 5533446Smrj DBG(mb_info->mem_upper); 5543446Smrj 
memlists[memlists_used].addr = 1024 * 1024; 5553446Smrj memlists[memlists_used].size = mb_info->mem_upper * 1024; 5563446Smrj ++memlists_used; 557*3489Sjosephb exclude_from_pci(memlists[0].addr, 558*3489Sjosephb memlists[0].addr + memlists[memlists_used].size); 559*3489Sjosephb exclude_from_pci(memlists[1].addr, 560*3489Sjosephb memlists[1].addr + memlists[memlists_used].size); 5613446Smrj } else { 5623446Smrj dboot_panic("No memory info from boot loader!!!\n"); 5633446Smrj } 5643446Smrj 5653446Smrj check_higher(bi->bi_cmdline); 5663446Smrj 5673446Smrj /* 5683446Smrj * finish processing the physinstall list 5693446Smrj */ 5703446Smrj sort_physinstall(); 571*3489Sjosephb 572*3489Sjosephb /* 573*3489Sjosephb * Finish off the pcimemlist 574*3489Sjosephb */ 575*3489Sjosephb if (prom_debug) { 576*3489Sjosephb for (i = 0; i < pcimemlists_used; ++i) { 577*3489Sjosephb dboot_printf("pcimemlist entry 0x%" PRIx64 "..0x%" 578*3489Sjosephb PRIx64 "\n", pcimemlists[i].addr, 579*3489Sjosephb pcimemlists[i].addr + pcimemlists[i].size); 580*3489Sjosephb } 581*3489Sjosephb } 582*3489Sjosephb pcimemlists[0].next = 0; 583*3489Sjosephb pcimemlists[0].prev = 0; 584*3489Sjosephb for (i = 1; i < pcimemlists_used; ++i) { 585*3489Sjosephb pcimemlists[i].prev = 586*3489Sjosephb (native_ptr_t)(uintptr_t)(pcimemlists + i - 1); 587*3489Sjosephb pcimemlists[i].next = 0; 588*3489Sjosephb pcimemlists[i - 1].next = 589*3489Sjosephb (native_ptr_t)(uintptr_t)(pcimemlists + i); 590*3489Sjosephb } 591*3489Sjosephb bi->bi_pcimem = (native_ptr_t)pcimemlists; 592*3489Sjosephb DBG(bi->bi_pcimem); 5933446Smrj } 5943446Smrj 5953446Smrj /* 5963446Smrj * Simple memory allocator, allocates aligned physical memory. 5973446Smrj * Note that startup_kernel() only allocates memory, never frees. 5983446Smrj * Memory usage just grows in an upward direction. 
5993446Smrj */ 6003446Smrj static void * 6013446Smrj do_mem_alloc(uint32_t size, uint32_t align) 6023446Smrj { 6033446Smrj uint_t i; 6043446Smrj uint64_t best; 6053446Smrj uint64_t start; 6063446Smrj uint64_t end; 6073446Smrj 6083446Smrj /* 6093446Smrj * make sure size is a multiple of pagesize 6103446Smrj */ 6113446Smrj size = RNDUP(size, MMU_PAGESIZE); 6123446Smrj next_avail_addr = RNDUP(next_avail_addr, align); 6133446Smrj 6143446Smrj /* 6153446Smrj * a really large bootarchive that causes you to run out of memory 6163446Smrj * may cause this to blow up 6173446Smrj */ 6183446Smrj /* LINTED E_UNEXPECTED_UINT_PROMOTION */ 6193446Smrj best = (uint64_t)-size; 6203446Smrj for (i = 0; i < memlists_used; ++i) { 6213446Smrj start = memlists[i].addr; 6223446Smrj end = start + memlists[i].size; 6233446Smrj 6243446Smrj /* 6253446Smrj * did we find the desired address? 6263446Smrj */ 6273446Smrj if (start <= next_avail_addr && next_avail_addr + size <= end) { 6283446Smrj best = next_avail_addr; 6293446Smrj goto done; 6303446Smrj } 6313446Smrj 6323446Smrj /* 6333446Smrj * if not is this address the best so far? 6343446Smrj */ 6353446Smrj if (start > next_avail_addr && start < best && 6363446Smrj RNDUP(start, align) + size <= end) 6373446Smrj best = RNDUP(start, align); 6383446Smrj } 6393446Smrj 6403446Smrj /* 6413446Smrj * We didn't find exactly the address we wanted, due to going off the 6423446Smrj * end of a memory region. Return the best found memory address. 6433446Smrj */ 6443446Smrj done: 6453446Smrj next_avail_addr = best + size; 6463446Smrj (void) memset((void *)(uintptr_t)best, 0, size); 6473446Smrj return ((void *)(uintptr_t)best); 6483446Smrj } 6493446Smrj 6503446Smrj void * 6513446Smrj mem_alloc(uint32_t size) 6523446Smrj { 6533446Smrj return (do_mem_alloc(size, MMU_PAGESIZE)); 6543446Smrj } 6553446Smrj 6563446Smrj 6573446Smrj /* 6583446Smrj * Build page tables to map all of memory used so far as well as the kernel. 
6593446Smrj */ 6603446Smrj static void 6613446Smrj build_page_tables(void) 6623446Smrj { 6633446Smrj uint32_t psize; 6643446Smrj uint32_t level; 6653446Smrj uint32_t off; 6663446Smrj uint32_t i; 6673446Smrj uint64_t start; 6683446Smrj uint64_t end; 6693446Smrj uint64_t next_mapping; 6703446Smrj 6713446Smrj /* 6723446Smrj * If we're not using Xen, we need to create the top level pagetable. 6733446Smrj */ 6743446Smrj top_page_table = (paddr_t)(uintptr_t)mem_alloc(MMU_PAGESIZE); 6753446Smrj DBG((uintptr_t)top_page_table); 6763446Smrj 6773446Smrj /* 6783446Smrj * Determine if we'll use large mappings for kernel, then map it. 6793446Smrj */ 6803446Smrj if (largepage_support) { 6813446Smrj psize = lpagesize; 6823446Smrj level = 1; 6833446Smrj } else { 6843446Smrj psize = MMU_PAGESIZE; 6853446Smrj level = 0; 6863446Smrj } 6873446Smrj 6883446Smrj DBG_MSG("Mapping kernel\n"); 6893446Smrj DBG(ktext_phys); 6903446Smrj DBG(target_kernel_text); 6913446Smrj DBG(ksize); 6923446Smrj DBG(psize); 6933446Smrj for (off = 0; off < ksize; off += psize) 6943446Smrj map_pa_at_va(ktext_phys + off, target_kernel_text + off, level); 6953446Smrj 6963446Smrj /* 6973446Smrj * The kernel will need a 1 page window to work with page tables 6983446Smrj */ 6993446Smrj bi->bi_pt_window = (uintptr_t)mem_alloc(MMU_PAGESIZE); 7003446Smrj DBG(bi->bi_pt_window); 7013446Smrj bi->bi_pte_to_pt_window = 7023446Smrj (uintptr_t)find_pte(bi->bi_pt_window, NULL, 0, 0); 7033446Smrj DBG(bi->bi_pte_to_pt_window); 7043446Smrj 7053446Smrj /* 7063446Smrj * Under multiboot we need 1:1 mappings for all of low memory, which 7073446Smrj * includes our pagetables. The following code works because our 7083446Smrj * simple memory allocator only grows usage in an upwards direction. 7093446Smrj * 7103446Smrj * We map *all* possible addresses below 1 Meg, since things like 7113446Smrj * the video RAM are down there. 
7123446Smrj * 7133446Smrj * Skip memory between 1M and _start, this acts as a reserve 7143446Smrj * of memory usable for DMA. 7153446Smrj */ 7163446Smrj next_mapping = (uintptr_t)_start & MMU_PAGEMASK; 7173446Smrj if (map_debug) 7183446Smrj dboot_printf("1:1 map pa=0..1Meg\n"); 7193446Smrj for (start = 0; start < 1024 * 1024; start += MMU_PAGESIZE) 7203446Smrj map_pa_at_va(start, start, 0); 7213446Smrj 7223446Smrj for (i = 0; i < memlists_used; ++i) { 7233446Smrj start = memlists[i].addr; 7243446Smrj if (start < next_mapping) 7253446Smrj start = next_mapping; 7263446Smrj 7273446Smrj end = start + memlists[i].size; 7283446Smrj 7293446Smrj if (map_debug) 7303446Smrj dboot_printf("1:1 map pa=%" PRIx64 "..%" PRIx64 "\n", 7313446Smrj start, end); 7323446Smrj while (start < end && start < next_avail_addr) { 7333446Smrj map_pa_at_va(start, start, 0); 7343446Smrj start += MMU_PAGESIZE; 7353446Smrj } 7363446Smrj } 7373446Smrj 7383446Smrj DBG_MSG("\nPage tables constructed\n"); 7393446Smrj } 7403446Smrj 7413446Smrj #define NO_MULTIBOOT \ 7423446Smrj "multiboot is no longer used to boot the Solaris Operating System.\n\ 7433446Smrj The grub entry should be changed to:\n\ 7443446Smrj kernel$ /platform/i86pc/kernel/$ISADIR/unix\n\ 7453446Smrj module$ /platform/i86pc/$ISADIR/boot_archive\n\ 7463446Smrj See http://www.sun.com/msg/SUNOS-8000-AK for details.\n" 7473446Smrj 7483446Smrj /* 7493446Smrj * startup_kernel has a pretty simple job. It builds pagetables which reflect 7503446Smrj * 1:1 mappings for all memory in use. It then also adds mappings for 7513446Smrj * the kernel nucleus at virtual address of target_kernel_text using large page 7523446Smrj * mappings. The page table pages are also accessible at 1:1 mapped 7533446Smrj * virtual addresses. 
7543446Smrj */ 7553446Smrj /*ARGSUSED*/ 7563446Smrj void 7573446Smrj startup_kernel(void) 7583446Smrj { 7593446Smrj char *cmdline; 7603446Smrj uintptr_t addr; 7613446Smrj 7623446Smrj /* 7633446Smrj * At this point we are executing in a 32 bit real mode. 7643446Smrj */ 7653446Smrj cmdline = (char *)mb_info->cmdline; 7663446Smrj prom_debug = (strstr(cmdline, "prom_debug") != NULL); 7673446Smrj map_debug = (strstr(cmdline, "map_debug") != NULL); 7683446Smrj bcons_init(cmdline); 7693446Smrj DBG_MSG("\n\nSolaris prekernel set: "); 7703446Smrj DBG_MSG(cmdline); 7713446Smrj DBG_MSG("\n"); 7723446Smrj 7733446Smrj if (strstr(cmdline, "multiboot") != NULL) { 7743446Smrj dboot_panic(NO_MULTIBOOT); 7753446Smrj } 7763446Smrj 7773446Smrj /* 7783446Smrj * boot info must be 16 byte aligned for 64 bit kernel ABI 7793446Smrj */ 7803446Smrj addr = (uintptr_t)boot_info; 7813446Smrj addr = (addr + 0xf) & ~0xf; 7823446Smrj bi = (struct xboot_info *)addr; 7833446Smrj DBG((uintptr_t)bi); 7843446Smrj bi->bi_cmdline = (native_ptr_t)(uintptr_t)cmdline; 7853446Smrj 7863446Smrj /* 7873446Smrj * Need correct target_kernel_text value 7883446Smrj */ 7893446Smrj #if defined(_BOOT_TARGET_amd64) 7903446Smrj target_kernel_text = KERNEL_TEXT_amd64; 7913446Smrj #else 7923446Smrj target_kernel_text = KERNEL_TEXT_i386; 7933446Smrj #endif 7943446Smrj DBG(target_kernel_text); 7953446Smrj 7963446Smrj /* 7973446Smrj * use cpuid to enable MMU features 7983446Smrj */ 7993446Smrj if (have_cpuid()) { 8003446Smrj uint32_t eax, edx; 8013446Smrj 8023446Smrj eax = 1; 8033446Smrj edx = get_cpuid_edx(&eax); 8043446Smrj if (edx & CPUID_INTC_EDX_PSE) 8053446Smrj largepage_support = 1; 8063446Smrj if (edx & CPUID_INTC_EDX_PGE) 8073446Smrj pge_support = 1; 8083446Smrj if (edx & CPUID_INTC_EDX_PAE) 8093446Smrj pae_support = 1; 8103446Smrj 8113446Smrj eax = 0x80000000; 8123446Smrj edx = get_cpuid_edx(&eax); 8133446Smrj if (eax >= 0x80000001) { 8143446Smrj eax = 0x80000001; 8153446Smrj edx = get_cpuid_edx(&eax); 8163446Smrj 
if (edx & CPUID_AMD_EDX_LM) 8173446Smrj amd64_support = 1; 8183446Smrj if (edx & CPUID_AMD_EDX_NX) 8193446Smrj NX_support = 1; 8203446Smrj } 8213446Smrj } else { 8223446Smrj dboot_printf("cpuid not supported\n"); 8233446Smrj } 8243446Smrj 8253446Smrj #if defined(_BOOT_TARGET_amd64) 8263446Smrj if (amd64_support == 0) 8273446Smrj dboot_panic("long mode not supported, rebooting\n"); 8283446Smrj else if (pae_support == 0) 8293446Smrj dboot_panic("long mode, but no PAE; rebooting\n"); 8303446Smrj #endif 8313446Smrj 8323446Smrj /* 8333446Smrj * initialize our memory allocator 8343446Smrj */ 8353446Smrj init_mem_alloc(); 8363446Smrj 8373446Smrj /* 8383446Smrj * configure mmu information 8393446Smrj */ 8403446Smrj #if !defined(_BOOT_TARGET_amd64) 8413446Smrj if (pae_support && (max_mem > FOUR_GIG || NX_support)) { 8423446Smrj #endif 8433446Smrj shift_amt = shift_amt_pae; 8443446Smrj ptes_per_table = 512; 8453446Smrj pte_size = 8; 8463446Smrj lpagesize = TWO_MEG; 8473446Smrj #if defined(_BOOT_TARGET_amd64) 8483446Smrj top_level = 3; 8493446Smrj #else 8503446Smrj top_level = 2; 8513446Smrj #endif 8523446Smrj #if !defined(_BOOT_TARGET_amd64) 8533446Smrj } else { 8543446Smrj pae_support = 0; 8553446Smrj NX_support = 0; 8563446Smrj shift_amt = shift_amt_nopae; 8573446Smrj ptes_per_table = 1024; 8583446Smrj pte_size = 4; 8593446Smrj lpagesize = FOUR_MEG; 8603446Smrj top_level = 1; 8613446Smrj } 8623446Smrj #endif 8633446Smrj 8643446Smrj DBG(pge_support); 8653446Smrj DBG(NX_support); 8663446Smrj DBG(largepage_support); 8673446Smrj DBG(amd64_support); 8683446Smrj DBG(top_level); 8693446Smrj DBG(pte_size); 8703446Smrj DBG(ptes_per_table); 8713446Smrj DBG(lpagesize); 8723446Smrj 8733446Smrj ktext_phys = FOUR_MEG; /* from UNIX Mapfile */ 8743446Smrj 8753446Smrj #if defined(_BOOT_TARGET_amd64) 8763446Smrj /* 8773446Smrj * For grub, copy kernel bits from the ELF64 file to final place. 
8783446Smrj */ 8793446Smrj DBG_MSG("\nAllocating nucleus pages.\n"); 8803446Smrj ktext_phys = (uintptr_t)do_mem_alloc(ksize, FOUR_MEG); 8813446Smrj if (ktext_phys == 0) 8823446Smrj dboot_panic("failed to allocate aligned kernel memory\n"); 8833446Smrj if (dboot_elfload64(mb_header.load_addr) != 0) 8843446Smrj dboot_panic("failed to parse kernel ELF image, rebooting\n"); 8853446Smrj 8863446Smrj #endif 8873446Smrj DBG(ktext_phys); 8883446Smrj 8893446Smrj /* 8903446Smrj * Allocate page tables. 8913446Smrj */ 8923446Smrj build_page_tables(); 8933446Smrj 8943446Smrj /* 8953446Smrj * return to assembly code to switch to running kernel 8963446Smrj */ 8973446Smrj entry_addr_low = (uint32_t)target_kernel_text; 8983446Smrj DBG(entry_addr_low); 8993446Smrj bi->bi_use_largepage = largepage_support; 9003446Smrj bi->bi_use_pae = pae_support; 9013446Smrj bi->bi_use_pge = pge_support; 9023446Smrj bi->bi_use_nx = NX_support; 9033446Smrj bi->bi_next_paddr = next_avail_addr; 9043446Smrj DBG(bi->bi_next_paddr); 9053446Smrj bi->bi_next_vaddr = (uintptr_t)next_avail_addr; 9063446Smrj DBG(bi->bi_next_vaddr); 9073446Smrj bi->bi_mb_info = (uintptr_t)mb_info; 9083446Smrj bi->bi_top_page_table = (uintptr_t)top_page_table; 9093446Smrj 9103446Smrj bi->bi_kseg_size = FOUR_MEG; 9113446Smrj DBG(bi->bi_kseg_size); 9123446Smrj 9133446Smrj #if 0 /* useful if debugging initial page tables */ 9143446Smrj if (prom_debug) 9153446Smrj dump_tables(); 9163446Smrj #endif 9173446Smrj 9183446Smrj DBG_MSG("\n\n*** DBOOT DONE -- back to asm to jump to kernel\n\n"); 9193446Smrj } 920